用于将PDF转换为文本的Python模块

哪些是将PDF文件转换为文本的最佳Python模块?


试试PDFMiner。它可以从PDF文件中提取文本,并输出为HTML、SGML或“Tagged PDF”格式。

http://www.unixuser.org/~euske/python/pdfminer/index.html

Tagged PDF格式似乎是最干净的,去掉其中的XML标签后就只剩下纯文本。

Python 3版本可在以下位置获得:

  • https://github.com/pdfminer/pdfminer.six

  • 自codeape发布回答以来,PDFMiner软件包已发生变化。

    编辑(再次):

    PDFMiner已在20100213版中再次更新

    您可以通过以下方式检查您安装的版本:

    >>> import pdfminer
    >>> pdfminer.__version__
    '20100213'
    

    以下是更新后的版本(包含对我更改/添加的内容的评论):

    def pdf_to_csv(filename):
        """Extract the text of the PDF at *filename* as semicolon-separated rows.

        Written against PDFMiner 20100213 and Python 2 (it relies on
        ``cStringIO``).

        Each page's output is framed by ``START PAGE <i>`` and ``END PAGE <i>``
        marker lines, where ``i`` is the zero-based index from ``enumerate``.
        Within a page, every ``LTTextItem`` is grouped under the key
        ``int(-y)`` (``y`` being the last component of its ``bbox``), and the
        items of one group are joined with ``;`` in ascending order of ``x``
        (the third ``bbox`` component). Items that share both keys overwrite
        one another.

        Returns the accumulated output as a single string.
        """
        from collections import defaultdict
        from cStringIO import StringIO  #<-- added so you can copy/paste this to try it
        from pdfminer.converter import LTTextItem, TextConverter
        from pdfminer.pdfparser import PDFDocument, PDFParser
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

        class CsvConverter(TextConverter):
            """TextConverter that writes each page as ``;``-separated rows."""

            def end_page(self, i):
                # Map int(-y) -> {x: encoded text}.
                lines = defaultdict(dict)
                for child in self.cur_item.objs:
                    if isinstance(child, LTTextItem):
                        (_, _, x, y) = child.bbox                        #<-- changed
                        lines[int(-y)][x] = child.text.encode(self.codec)  #<-- changed

                # Keys are negated y values, so ascending order visits the
                # largest y first (presumably the top of the page - confirm
                # against PDFMiner's coordinate system).
                for y in sorted(lines):
                    line = lines[y]
                    self.outfp.write(";".join(line[x] for x in sorted(line)))
                    self.outfp.write("\n")

        # ... the following part of the code is a remix of the
        # convert() function in the pdfminer/tools/pdf2text module
        rsrc = PDFResourceManager()
        outfp = StringIO()
        device = CsvConverter(rsrc, outfp, codec="utf-8")  #<-- changed
            # because my test documents are utf-8 (note: utf-8 is the default codec)

        # The with-block closes the PDF even when parsing raises.
        with open(filename, 'rb') as fp:
            try:
                doc = PDFDocument()
                parser = PDFParser(fp)       #<-- changed
                parser.set_document(doc)     #<-- added
                doc.set_parser(parser)       #<-- added
                doc.initialize('')

                interpreter = PDFPageInterpreter(rsrc, device)

                for i, page in enumerate(doc.get_pages()):
                    outfp.write("START PAGE %d\n" % i)
                    interpreter.process_page(page)
                    outfp.write("END PAGE %d\n" % i)
            finally:
                device.close()

        return outfp.getvalue()
    

    编辑(再次):

    以下是针对pypi上最新版本20100619p1的更新。简而言之,我把LTTextItem替换为LTChar,并向CsvConverter的构造函数传入了一个LAParams实例。

    def pdf_to_csv(filename):
        """Extract the text of the PDF at *filename* as semicolon-separated rows.

        Written against PDFMiner 20100619p1 and Python 2 (it relies on
        ``cStringIO``).

        Each page's output is framed by ``START PAGE <i>`` and ``END PAGE <i>``
        marker lines, where ``i`` is the zero-based index from ``enumerate``.
        Pages yielded as ``None`` still get their markers but are not
        processed. Within a page, every ``LTChar`` is grouped under the key
        ``int(-y)`` (``y`` being the last component of its ``bbox``), and the
        items of one group are joined with ``;`` in ascending order of ``x``
        (the third ``bbox`` component). Items that share both keys overwrite
        one another.

        Returns the accumulated output as a single string.
        """
        from collections import defaultdict
        from cStringIO import StringIO
        from pdfminer.converter import LTChar, TextConverter    #<-- changed
        from pdfminer.layout import LAParams
        from pdfminer.pdfparser import PDFDocument, PDFParser
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

        class CsvConverter(TextConverter):
            """TextConverter that writes each page as ``;``-separated rows."""

            def end_page(self, i):
                # Map int(-y) -> {x: encoded text}.
                lines = defaultdict(dict)
                for child in self.cur_item.objs:
                    if isinstance(child, LTChar):               #<-- changed
                        (_, _, x, y) = child.bbox
                        lines[int(-y)][x] = child.text.encode(self.codec)

                # Keys are negated y values, so ascending order visits the
                # largest y first (presumably the top of the page - confirm
                # against PDFMiner's coordinate system).
                for y in sorted(lines):
                    line = lines[y]
                    self.outfp.write(";".join(line[x] for x in sorted(line)))
                    self.outfp.write("\n")

        # ... the following part of the code is a remix of the
        # convert() function in the pdfminer/tools/pdf2text module
        rsrc = PDFResourceManager()
        outfp = StringIO()
        device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())  #<-- changed
            # because my test documents are utf-8 (note: utf-8 is the default codec)

        # The with-block closes the PDF even when parsing raises.
        with open(filename, 'rb') as fp:
            try:
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize('')

                interpreter = PDFPageInterpreter(rsrc, device)

                for i, page in enumerate(doc.get_pages()):
                    outfp.write("START PAGE %d\n" % i)
                    if page is not None:
                        interpreter.process_page(page)
                    outfp.write("END PAGE %d\n" % i)
            finally:
                device.close()

        return outfp.getvalue()
    

    编辑(又一次):

    更新版本20110515 (感谢Oeufcoque Penteano!):

    def pdf_to_csv(filename):
        """Extract the text of the PDF at *filename* as semicolon-separated rows.

        Written against PDFMiner 20110515 and Python 2 (it relies on
        ``cStringIO``). This version reads the private ``_objs`` and ``_text``
        attributes of the layout objects.

        Each page's output is framed by ``START PAGE <i>`` and ``END PAGE <i>``
        marker lines, where ``i`` is the zero-based index from ``enumerate``.
        Pages yielded as ``None`` still get their markers but are not
        processed. Within a page, every ``LTChar`` is grouped under the key
        ``int(-y)`` (``y`` being the last component of its ``bbox``), and the
        items of one group are joined with ``;`` in ascending order of ``x``
        (the third ``bbox`` component). Items that share both keys overwrite
        one another.

        Returns the accumulated output as a single string.
        """
        from collections import defaultdict
        from cStringIO import StringIO
        from pdfminer.converter import LTChar, TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfparser import PDFDocument, PDFParser
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

        class CsvConverter(TextConverter):
            """TextConverter that writes each page as ``;``-separated rows."""

            def end_page(self, i):
                # Map int(-y) -> {x: encoded text}.
                lines = defaultdict(dict)
                for child in self.cur_item._objs:                #<-- changed
                    if isinstance(child, LTChar):
                        (_, _, x, y) = child.bbox
                        lines[int(-y)][x] = child._text.encode(self.codec)  #<-- changed

                # Keys are negated y values, so ascending order visits the
                # largest y first (presumably the top of the page - confirm
                # against PDFMiner's coordinate system).
                for y in sorted(lines):
                    line = lines[y]
                    self.outfp.write(";".join(line[x] for x in sorted(line)))
                    self.outfp.write("\n")

        # ... the following part of the code is a remix of the
        # convert() function in the pdfminer/tools/pdf2text module
        rsrc = PDFResourceManager()
        outfp = StringIO()
        device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
            # because my test documents are utf-8 (note: utf-8 is the default codec)

        # The with-block closes the PDF even when parsing raises.
        with open(filename, 'rb') as fp:
            try:
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize('')

                interpreter = PDFPageInterpreter(rsrc, device)

                for i, page in enumerate(doc.get_pages()):
                    outfp.write("START PAGE %d\n" % i)
                    if page is not None:
                        interpreter.process_page(page)
                    outfp.write("END PAGE %d\n" % i)
            finally:
                device.close()

        return outfp.getvalue()
    

    由于这些解决方案都不支持最新版本的PDFMiner,我写了一个简单的解决方案,使用PDFMiner返回PDF的文本。它适用于那些在使用process_pdf时遇到导入错误的人。

    import sys
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.layout import LAParams
    from cStringIO import StringIO
    
    def pdfparser(data):
        """Print the text extracted from the PDF file at path *data*.

        Every page returned by ``PDFPage.get_pages`` is run through a
        ``TextConverter`` that writes into an in-memory buffer; the buffer's
        contents are printed once all pages have been processed. A PDF that
        yields no pages prints an empty string.

        Nothing is returned.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # open() replaces the Python 2-only file() builtin, and the with-block
        # closes the PDF even when a page fails to parse.
        with open(data, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)

        # Read the buffer once, after the loop, into its own name so the
        # *data* argument (the path) is never printed by mistake.
        text = retstr.getvalue()

        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print(text)
    
    if __name__ == '__main__':
        # Script entry point: the first command-line argument is the PDF path.
        # Exit with a usage message instead of an IndexError traceback when
        # the argument is missing.
        if len(sys.argv) < 2:
            sys.exit("usage: %s FILE.pdf" % sys.argv[0])
        pdfparser(sys.argv[1])
    

    请参阅以下适用于Python 3的代码:

    import sys
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.layout import LAParams
    import io
    
    def pdfparser(data):
        """Print the text extracted from the PDF file at path *data*.

        Every page returned by ``PDFPage.get_pages`` is run through a
        ``TextConverter`` that writes into an in-memory ``io.StringIO``
        buffer; the buffer's contents are printed once all pages have been
        processed. A PDF that yields no pages prints an empty string.

        Nothing is returned.
        """
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # The with-block closes the PDF even when a page fails to parse.
        with open(data, 'rb') as fp:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)

        # Read the buffer once, after the loop, into its own name so the
        # *data* argument (the path) is never printed by mistake.
        text = retstr.getvalue()

        print(text)
    
    if __name__ == '__main__':
        # Script entry point: the first command-line argument is the PDF path.
        # Exit with a usage message instead of an IndexError traceback when
        # the argument is missing.
        if len(sys.argv) < 2:
            sys.exit("usage: %s FILE.pdf" % sys.argv[0])
        pdfparser(sys.argv[1])
    
    链接地址: http://www.djcxy.com/p/65439.html

    上一篇: Python module for converting PDF to text

    下一篇: How to connect to new WiFi network using adb without rebooting the phone