用于将PDF转换为文本的Python模块
哪些是将PDF文件转换为文本的最佳Python模块?
试试PDFMiner。 它可以从PDF文件中提取HTML,SGML或“Tagged PDF”格式的文本。
http://www.unixuser.org/~euske/python/pdfminer/index.html
标记PDF格式似乎是最干净的,并且剥离出XML标签只剩下裸露的文本。
Python 3版本可在以下位置获得:
自发布codeape后,PDFMiner软件包已更改。
编辑(再次):
PDFMiner已在20100213
版中再次更新
您可以通过以下方式检查您安装的版本:
>>> import pdfminer
>>> pdfminer.__version__
'20100213'
以下是更新后的版本(包含对我更改/添加的内容的评论):
def pdf_to_csv(filename):
from cStringIO import StringIO #<-- added so you can copy/paste this to try it
from pdfminer.converter import LTTextItem, TextConverter
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
class CsvConverter(TextConverter):
def __init__(self, *args, **kwargs):
TextConverter.__init__(self, *args, **kwargs)
def end_page(self, i):
from collections import defaultdict
lines = defaultdict(lambda : {})
for child in self.cur_item.objs:
if isinstance(child, LTTextItem):
(_,_,x,y) = child.bbox #<-- changed
line = lines[int(-y)]
line[x] = child.text.encode(self.codec) #<-- changed
for y in sorted(lines.keys()):
line = lines[y]
self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
self.outfp.write("n")
# ... the following part of the code is a remix of the
# convert() function in the pdfminer/tools/pdf2text module
rsrc = PDFResourceManager()
outfp = StringIO()
device = CsvConverter(rsrc, outfp, codec="utf-8") #<-- changed
# becuase my test documents are utf-8 (note: utf-8 is the default codec)
doc = PDFDocument()
fp = open(filename, 'rb')
parser = PDFParser(fp) #<-- changed
parser.set_document(doc) #<-- added
doc.set_parser(parser) #<-- added
doc.initialize('')
interpreter = PDFPageInterpreter(rsrc, device)
for i, page in enumerate(doc.get_pages()):
outfp.write("START PAGE %dn" % i)
interpreter.process_page(page)
outfp.write("END PAGE %dn" % i)
device.close()
fp.close()
return outfp.getvalue()
编辑(再次):
以下是pypi, 20100619p1
最新版本的更新。 总之我换成LTTextItem
与LTChar
并通过LAParams的实例向CsvConverter构造。
def pdf_to_csv(filename):
from cStringIO import StringIO
from pdfminer.converter import LTChar, TextConverter #<-- changed
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
class CsvConverter(TextConverter):
def __init__(self, *args, **kwargs):
TextConverter.__init__(self, *args, **kwargs)
def end_page(self, i):
from collections import defaultdict
lines = defaultdict(lambda : {})
for child in self.cur_item.objs:
if isinstance(child, LTChar): #<-- changed
(_,_,x,y) = child.bbox
line = lines[int(-y)]
line[x] = child.text.encode(self.codec)
for y in sorted(lines.keys()):
line = lines[y]
self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
self.outfp.write("n")
# ... the following part of the code is a remix of the
# convert() function in the pdfminer/tools/pdf2text module
rsrc = PDFResourceManager()
outfp = StringIO()
device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams()) #<-- changed
# becuase my test documents are utf-8 (note: utf-8 is the default codec)
doc = PDFDocument()
fp = open(filename, 'rb')
parser = PDFParser(fp)
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize('')
interpreter = PDFPageInterpreter(rsrc, device)
for i, page in enumerate(doc.get_pages()):
outfp.write("START PAGE %dn" % i)
if page is not None:
interpreter.process_page(page)
outfp.write("END PAGE %dn" % i)
device.close()
fp.close()
return outfp.getvalue()
编辑(一次):
更新版本20110515
(感谢Oeufcoque Penteano!):
def pdf_to_csv(filename):
from cStringIO import StringIO
from pdfminer.converter import LTChar, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
class CsvConverter(TextConverter):
def __init__(self, *args, **kwargs):
TextConverter.__init__(self, *args, **kwargs)
def end_page(self, i):
from collections import defaultdict
lines = defaultdict(lambda : {})
for child in self.cur_item._objs: #<-- changed
if isinstance(child, LTChar):
(_,_,x,y) = child.bbox
line = lines[int(-y)]
line[x] = child._text.encode(self.codec) #<-- changed
for y in sorted(lines.keys()):
line = lines[y]
self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
self.outfp.write("n")
# ... the following part of the code is a remix of the
# convert() function in the pdfminer/tools/pdf2text module
rsrc = PDFResourceManager()
outfp = StringIO()
device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
# becuase my test documents are utf-8 (note: utf-8 is the default codec)
doc = PDFDocument()
fp = open(filename, 'rb')
parser = PDFParser(fp)
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize('')
interpreter = PDFPageInterpreter(rsrc, device)
for i, page in enumerate(doc.get_pages()):
outfp.write("START PAGE %dn" % i)
if page is not None:
interpreter.process_page(page)
outfp.write("END PAGE %dn" % i)
device.close()
fp.close()
return outfp.getvalue()
由于这些解决方案都不支持最新版本的PDFMiner,我写了一个简单的解决方案,它将使用PDFMiner返回PDF文本。 这将适用于那些正在使用process_pdf
获取导入错误的人
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
def pdfparser(data):
fp = file(data, 'rb')
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
data = retstr.getvalue()
print data
if __name__ == '__main__':
pdfparser(sys.argv[1])
请参阅以下适用于Python 3的代码:
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
def pdfparser(data):
fp = open(data, 'rb')
rsrcmgr = PDFResourceManager()
retstr = io.StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
data = retstr.getvalue()
print(data)
if __name__ == '__main__':
pdfparser(sys.argv[1])
链接地址: http://www.djcxy.com/p/65439.html
上一篇: Python module for converting PDF to text
下一篇: How to connect to new WiFi network using adb without rebooting the phone