首页 > 解决方案 > Python Code PDF to Text,提取表格数据

问题描述

我正在尝试从此 PDF 文件中以逗号分隔(CSV)格式提取表格数据。该文件太大,无法使用 Adobe Pro 处理,因此我研究了使用 Python 的方法。PDF 文件位于 https://www.live-military-mode-s.eu/pdf/Military%20Mode-S%20codes.pdf。下面的代码能够提取数据,但最终得到了大约 360,000 行。我希望只导入表格数据,不包含页眉或页脚信息。

from tika import parser
import os
import glob
from easygui import *
from time import sleep
import random
import string
from PIL import Image

# No file or folder has been selected yet.
path = None

# Desired logo width; the height below is scaled to keep the aspect ratio.
basewidth = 150
img = Image.open('dlogo.jpg')
# Ratio between the desired width and the image's actual width.
wpercent = basewidth / float(img.size[0])
# Proportionally scaled height, truncated to a whole number.
hsize = int(float(img.size[1]) * wpercent)




def converter(filename,savelocation):
    """Extract the text of ``filename + '.pdf'`` into a .txt file.

    The PDF is parsed with tika, its text content is written as UTF-8 to
    ``<savelocation>/<base name of filename>.txt``, and blank lines are then
    stripped from that file via ``remove_empty_lines``.

    Args:
        filename: Path of the source PDF *without* the ``.pdf`` extension.
        savelocation: Directory in which the text file is created.

    Returns:
        ``filename + '.txt'`` -- the source path with a ``.txt`` extension,
        not the path of the file written inside ``savelocation``.
    """
    parsed = parser.from_file(filename + '.pdf')
    # The parser may report the content as None (e.g. a PDF without a text
    # layer); fall back to an empty string so an empty file is written
    # instead of raising TypeError.
    text = parsed.get("content") or ""

    new_name = filename + '.txt'
    # os.path handles the platform's separators instead of assuming '\\'.
    fname = os.path.join(savelocation, os.path.basename(new_name))
    with open(fname, 'w', encoding='utf-8', errors='ignore') as f:
        # text is a single string; write() avoids iterating it char by char.
        f.write(text)

    remove_empty_lines(fname)
    return new_name

def remove_empty_lines(filename, encoding='utf-8'):
    """Rewrite ``filename`` in place without blank or whitespace-only lines.

    Args:
        filename: Path of the text file to clean up.
        encoding: Encoding used for both reading and rewriting the file.
            Defaults to UTF-8 so the result does not depend on the
            platform's locale encoding; undecodable bytes are ignored.

    Returns:
        None. If ``filename`` is not an existing file a message is printed
        and nothing is written.
    """
    if not os.path.isfile(filename):
        print("{} does not exist ".format(filename))
        return
    with open(filename, encoding=encoding, errors='ignore') as filehandle:
        # Keep only lines that still contain something after stripping.
        kept_lines = [line for line in filehandle if line.strip()]

    with open(filename, 'w', encoding=encoding, errors='ignore') as filehandle:
        filehandle.writelines(kept_lines)




# Interactive menu: repeatedly ask the user for a single PDF or a folder of
# PDFs plus a save directory, and convert the selection to text files.
while True:

    msg = "Please Choose a File or Folder"
    title = "PDF Converter"

    choices = ["Exit", "Choose File", "Choose Folder"]

    reply = buttonbox(msg, title=title, choices=choices)
    # Strings are compared with ==; `is` tests object identity and is not
    # reliable for text. A dismissed dialog (None) is treated like "Exit".
    if reply is None or reply == 'Exit':
        break
    elif reply == 'Choose File':
        path = fileopenbox()
        if path is None:
            # No file was picked; go back to the menu.
            continue
        savelocation = buttonbox("Choose a Save location", title="Saving", choices=["Save Location", "Cancel"])
        if savelocation is None or savelocation == 'Cancel':
            continue
        savepath = diropenbox()
        if savepath is None:
            continue
        print(savepath)
        # converter() expects the path without its extension.
        filename, _ = os.path.splitext(path)
        name = converter(filename, savepath)
        print(name)

        msgbox("File Successfully Converted to Text!!")

    elif reply == 'Choose Folder':
        path = diropenbox()
        if path is None:
            continue
        savelocation = buttonbox("Choose a Save location", title="Save Location", choices=["Save Location", "Cancel"])
        if savelocation is None or savelocation == 'Cancel':
            continue
        savepath = diropenbox()
        if savepath is None:
            continue
        # glob.escape keeps characters such as '[' in the folder name from
        # being read as wildcards; glob.glob returns a full list, so the
        # renames below cannot disturb an in-progress directory scan.
        pdf_paths = glob.glob(os.path.join(glob.escape(path), '*.pdf'))
        renamed_paths = []
        # Each PDF in the folder is renamed to "<n>_<5 random chars>.pdf".
        for n, pdf_path in enumerate(pdf_paths, start=1):
            randomname = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5))
            new_path = os.path.join(path, str(n) + '_' + randomname + '.pdf')
            os.rename(pdf_path, new_path)
            renamed_paths.append(new_path)
        for f in renamed_paths:
            filename, _ = os.path.splitext(f)
            converter(filename, savepath)
        msgbox("PDFS Successfully Converted to Text!!")

标签: python, python-3.x

解决方案


推荐阅读