python - TypeError:“PSKeyword”类型的对象没有 len()
问题描述
我想将 PDF 转换为文本文件,以便对文档做后续处理。我使用的是 Python 3.x,并尝试用 PDFMiner 将 PDF 转换为文本。某些 PDF 无法解析并会抛出错误(Python 3.x)。下面是将 PDF 转换为文本的代码,其中 onlyfiles[2] 是 PDF 文件名。我看不懂这条错误消息。
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
def extract_text_by_page(pdf_path):
    """Yield the extracted text of a PDF one page at a time.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Yields:
        str: Everything the ``TextConverter`` wrote to its in-memory
        buffer while the current page was processed.

    The per-page converter and buffer are always released, including when
    ``process_page`` raises or when the caller stops iterating before the
    generator is exhausted.
    """
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            # The context manager closes the buffer on every exit path.
            with io.StringIO() as fake_file_handle:
                converter = TextConverter(resource_manager, fake_file_handle)
                try:
                    page_interpreter = PDFPageInterpreter(resource_manager,
                                                          converter)
                    page_interpreter.process_page(page)
                    yield fake_file_handle.getvalue()
                finally:
                    # Cleanup placed after a bare ``yield`` is skipped on an
                    # exception or an early generator close; ``finally``
                    # guarantees it runs.
                    converter.close()
def extract_text(pdf_path):
    """Print the text of every page of ``pdf_path``, each followed by a blank line.

    The function writes to standard output and returns ``None``.
    """
    pages = extract_text_by_page(pdf_path)
    for page_text in pages:
        # One newline ends the page text, the second produces the blank line.
        print(page_text, end="\n\n")
if __name__ == '__main__':
    # extract_text() prints each page itself and returns None, so wrapping
    # the call in print() would only add a stray "None" to the output.
    # NOTE(review): ``onlyfiles`` is not defined in the visible snippet; it is
    # presumably a list of PDF file names built elsewhere - confirm.
    extract_text(onlyfiles[2])
错误信息是:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-19-7fed22ec1779> in <module>
25 print()
26 if __name__ == '__main__':
---> 27 print(extract_text(onlyfiles[2]))
<ipython-input-19-7fed22ec1779> in extract_text(pdf_path)
21 fake_file_handle.close()
22 def extract_text(pdf_path):
---> 23 for page in extract_text_by_page(pdf_path):
24 print(page)
25 print()
<ipython-input-19-7fed22ec1779> in extract_text_by_page(pdf_path)
14 converter = TextConverter(resource_manager, fake_file_handle)
15 page_interpreter = PDFPageInterpreter(resource_manager, converter)
---> 16 page_interpreter.process_page(page)
17 text = fake_file_handle.getvalue()
18 yield text
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in process_page(self, page)
850 ctm = (1, 0, 0, 1, -x0, -y0)
851 self.device.begin_page(page, ctm)
--> 852 self.render_contents(page.resources, page.contents, ctm=ctm)
853 self.device.end_page(page)
854 return
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in render_contents(self, resources, streams, ctm)
862 self.init_resources(resources)
863 self.init_state(ctm)
--> 864 self.execute(list_value(streams))
865 return
866
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in execute(self, streams)
886 log.debug('exec: %s %r', name, args)
887 if len(args) == nargs:
--> 888 func(*args)
889 else:
890 log.debug('exec: %s', name)
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfinterp.py in do_TJ(self, seq)
770 raise PDFInterpreterError('No font specified!')
771 return
--> 772 self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy())
773 return
774
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfdevice.py in render_string(self, textstate, seq, ncs, graphicstate)
85 textstate.linematrix = self.render_string_horizontal(
86 seq, matrix, textstate.linematrix, font, fontsize,
---> 87 scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
88 return
89
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdfdevice.py in render_string_horizontal(self, seq, matrix, pos, font, fontsize, scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate)
98 needcharspace = True
99 else:
--> 100 for cid in font.decode(obj):
101 if needcharspace:
102 x += charspace
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\pdffont.py in decode(self, bytes)
717
718 def decode(self, bytes):
--> 719 return self.cmap.decode(bytes)
720
721 def char_disp(self, cid):
~\AppData\Local\Continuum\anaconda3\envs\py36\lib\site-packages\pdfminer\cmapdb.py in decode(self, code)
125
126 def decode(self, code):
--> 127 n = len(code)//2
128 if n:
129 return struct.unpack('>%dH' % n, code)
TypeError: object of type 'PSKeyword' has no len()
解决方案
推荐阅读
- dataframe - 显示在合并中丢失的观察
- node.js - 网络错误:错误:连接 ETIMEDOUT 149.154.167.220:443
- c++ - 在 Boost 版本 1.65 中使用 boost::math::normal_distribution<>
- database - 传递 Table 和 Column 作为方法参数
- android - 如何使用导航按钮制作 Python Kivy webview 应用程序并在浏览器中打开外部链接?
- python - Python:从csv中逐行提取关键字
- python - 从 python 中的 JSON 文件创建纯 python 列表的更好方法
- javascript - 如何将登录信息发布到 API Gateway 以从 Amazon S3 获取对象
- authorization - 只需检查 ALFA 目标子句中是否存在
- python - 叠合式循环和 Python 语法