Python module for converting PDF to text

哪些是将PDF文件转换为文本的最佳Python模块?


Try PDFMiner. It can extract text from PDF files as HTML, SGML or "Tagged PDF" format.

http://www.unixuser.org/~euske/python/pdfminer/index.html

The Tagged PDF format seems to be the cleanest, and stripping out the XML tags leaves just the bare text.

A Python 3 version is available under:

  • https://github.com/pdfminer/pdfminer.six

  • The PDFMiner package has changed since codeape posted.

    EDIT (again):

    PDFMiner has been updated again in version 20100213.

    You can check the version you have installed with the following:

    >>> import pdfminer
    >>> pdfminer.__version__
    '20100213'
    

    Here's the updated version (with comments on what I changed/added):

    def pdf_to_csv(filename):
        """Extract the text of a PDF as semicolon-separated rows (PDFMiner 20100213).

        The text items of each page are grouped into rows by their integer
        y coordinate and ordered inside a row by their x coordinate.  Every
        page is wrapped in "START PAGE n" / "END PAGE n" marker lines, where
        n is the 0-based page index.

        filename -- path of the PDF file to read.

        Returns the whole converted document as a single string.

        NOTE: fields are joined with ';' without any quoting, so text that
        itself contains ';' is not escaped.
        """
        from collections import defaultdict
        from cStringIO import StringIO  #<-- added so you can copy/paste this to try it
        from pdfminer.converter import LTTextItem, TextConverter
        from pdfminer.pdfparser import PDFDocument, PDFParser
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

        class CsvConverter(TextConverter):
            """TextConverter that writes one ';'-joined line per row of text."""

            def end_page(self, i):
                # Bucket the page's text items: row key -> {x: encoded text}.
                lines = defaultdict(dict)
                for child in self.cur_item.objs:
                    if isinstance(child, LTTextItem):
                        (_, _, x, y) = child.bbox                #<-- changed
                        # Negating y makes an ascending sort of the row keys
                        # visit the largest y values first.
                        line = lines[int(-y)]
                        line[x] = child.text.encode(self.codec)  #<-- changed

                for y in sorted(lines):
                    line = lines[y]
                    self.outfp.write(";".join(line[x] for x in sorted(line)))
                    self.outfp.write("\n")

        # ... the following part of the code is a remix of the
        # convert() function in the pdfminer/tools/pdf2text module
        rsrc = PDFResourceManager()
        outfp = StringIO()
        # utf-8 is also the default codec; it is spelled out here because the
        # test documents were utf-8.
        device = CsvConverter(rsrc, outfp, codec="utf-8")  #<-- changed

        doc = PDFDocument()
        fp = open(filename, 'rb')
        try:
            parser = PDFParser(fp)       #<-- changed
            parser.set_document(doc)     #<-- added
            doc.set_parser(parser)       #<-- added
            doc.initialize('')           # '' is presumably the (empty) password

            interpreter = PDFPageInterpreter(rsrc, device)

            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            # Release the device and the file even when parsing fails.
            device.close()
            fp.close()

        return outfp.getvalue()
    

    Edit (yet again):

    Here is an update for the latest version in pypi, 20100619p1. In short, I replaced LTTextItem with LTChar and passed an instance of LAParams to the CsvConverter constructor.

    def pdf_to_csv(filename):
        """Extract the text of a PDF as semicolon-separated rows (PDFMiner 20100619p1).

        The characters of each page are grouped into rows by their integer
        y coordinate and ordered inside a row by their x coordinate.  Every
        page is wrapped in "START PAGE n" / "END PAGE n" marker lines, where
        n is the 0-based page index.

        filename -- path of the PDF file to read.

        Returns the whole converted document as a single string.

        NOTE: fields are joined with ';' without any quoting, so text that
        itself contains ';' is not escaped.
        """
        from collections import defaultdict
        from cStringIO import StringIO
        from pdfminer.converter import LTChar, TextConverter    #<-- changed
        from pdfminer.layout import LAParams
        from pdfminer.pdfparser import PDFDocument, PDFParser
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

        class CsvConverter(TextConverter):
            """TextConverter that writes one ';'-joined line per row of text."""

            def end_page(self, i):
                # Bucket the page's characters: row key -> {x: encoded text}.
                lines = defaultdict(dict)
                for child in self.cur_item.objs:
                    if isinstance(child, LTChar):               #<-- changed
                        (_, _, x, y) = child.bbox
                        # Negating y makes an ascending sort of the row keys
                        # visit the largest y values first.
                        line = lines[int(-y)]
                        line[x] = child.text.encode(self.codec)

                for y in sorted(lines):
                    line = lines[y]
                    self.outfp.write(";".join(line[x] for x in sorted(line)))
                    self.outfp.write("\n")

        # ... the following part of the code is a remix of the
        # convert() function in the pdfminer/tools/pdf2text module
        rsrc = PDFResourceManager()
        outfp = StringIO()
        # utf-8 is also the default codec; it is spelled out here because the
        # test documents were utf-8.
        device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())  #<-- changed

        doc = PDFDocument()
        fp = open(filename, 'rb')
        try:
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')           # '' is presumably the (empty) password

            interpreter = PDFPageInterpreter(rsrc, device)

            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                # A missing page still gets its START/END markers.
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            # Release the device and the file even when parsing fails.
            device.close()
            fp.close()

        return outfp.getvalue()
    

    EDIT (one more time):

    Updated for version 20110515 (thanks to Oeufcoque Penteano!):

    def pdf_to_csv(filename):
        """Extract the text of a PDF as semicolon-separated rows (PDFMiner 20110515).

        The characters of each page are grouped into rows by their integer
        y coordinate and ordered inside a row by their x coordinate.  Every
        page is wrapped in "START PAGE n" / "END PAGE n" marker lines, where
        n is the 0-based page index.

        filename -- path of the PDF file to read.

        Returns the whole converted document as a single string.

        NOTE: fields are joined with ';' without any quoting, so text that
        itself contains ';' is not escaped.  The converter reads the
        underscore-prefixed attributes `_objs` and `_text`, which may change
        between PDFMiner releases.
        """
        from collections import defaultdict
        from cStringIO import StringIO
        from pdfminer.converter import LTChar, TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfparser import PDFDocument, PDFParser
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

        class CsvConverter(TextConverter):
            """TextConverter that writes one ';'-joined line per row of text."""

            def end_page(self, i):
                # Bucket the page's characters: row key -> {x: encoded text}.
                lines = defaultdict(dict)
                for child in self.cur_item._objs:                #<-- changed
                    if isinstance(child, LTChar):
                        (_, _, x, y) = child.bbox
                        # Negating y makes an ascending sort of the row keys
                        # visit the largest y values first.
                        line = lines[int(-y)]
                        line[x] = child._text.encode(self.codec) #<-- changed

                for y in sorted(lines):
                    line = lines[y]
                    self.outfp.write(";".join(line[x] for x in sorted(line)))
                    self.outfp.write("\n")

        # ... the following part of the code is a remix of the
        # convert() function in the pdfminer/tools/pdf2text module
        rsrc = PDFResourceManager()
        outfp = StringIO()
        # utf-8 is also the default codec; it is spelled out here because the
        # test documents were utf-8.
        device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

        doc = PDFDocument()
        fp = open(filename, 'rb')
        try:
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')           # '' is presumably the (empty) password

            interpreter = PDFPageInterpreter(rsrc, device)

            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                # A missing page still gets its START/END markers.
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            # Release the device and the file even when parsing fails.
            device.close()
            fp.close()

        return outfp.getvalue()
    

    Since none of these solutions supports the latest version of PDFMiner, I wrote a simple solution that returns the text of a PDF using PDFMiner. This will work for those who are getting import errors with process_pdf.

    import sys
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.layout import LAParams
    from cStringIO import StringIO
    
    def pdfparser(data):
        """Print the text extracted from the PDF file at path `data`.

        Every page of the document is fed through a PDFMiner TextConverter
        that writes into an in-memory buffer; the buffer's contents are then
        printed to standard output.  Nothing is returned.

        data -- path of the PDF file to read.
        """
        with open(data, 'rb') as fp:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            try:
                # Process each page contained in the document.
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
                # Read the buffer once, after the last page; a document with
                # no pages yields empty text instead of echoing the path.
                text = retstr.getvalue()
            finally:
                device.close()

        print(text)
    
    # Script entry point: convert the PDF named by the first command-line argument.
    if __name__ == '__main__':
        pdfparser(sys.argv[1])  
    

    The code below works for Python 3:

    import sys
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.layout import LAParams
    import io
    
    def pdfparser(data):
        """Print the text extracted from the PDF file at path `data`.

        Every page of the document is fed through a PDFMiner TextConverter
        that writes into an in-memory buffer; the buffer's contents are then
        printed to standard output.  Nothing is returned.

        data -- path of the PDF file to read.
        """
        with open(data, 'rb') as fp:
            rsrcmgr = PDFResourceManager()
            retstr = io.StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            try:
                # Process each page contained in the document.
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
                # Read the buffer once, after the last page; a document with
                # no pages yields empty text instead of echoing the path.
                text = retstr.getvalue()
            finally:
                device.close()

        print(text)
    
    # Script entry point: convert the PDF named by the first command-line argument.
    if __name__ == '__main__':
        pdfparser(sys.argv[1])  
    
    链接地址: http://www.djcxy.com/p/65440.html

    上一篇: 如何调整UITextView的内容大小?

    下一篇: 用于将PDF转换为文本的Python模块