python - 如何使烧瓶搜索应用程序可搜索 PDF?
问题描述
我一直在为一个非常重要的个人项目做研究。我想创建一个 Flask 搜索应用程序,它允许我在 100 Plus PDF 文件中搜索内容。我发现了一些关于 A ElasticSearch Lib 的信息,这些信息适用于烧瓶。
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import json
from flask import Flask, jsonify, request, render_template, json
from datetime import datetime
import pandas as pd
# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)
# create a new PDF object with FPDF
pdf = FPDF()
# use an iterator to create 10 pages
for page in range(10):
pdf.add_page()
pdf.set_font("Arial", size=14)
pdf.cell(150, 12, txt="Object Rocket ROCKS!!", ln=1, align="C")
# output all of the data to a new PDF file
pdf.output("object_rocket.pdf")
'''
read_pdf = PyPDF2.PdfFileReader("object_rocket.pdf")
page = read_pdf.getPage(0)
page_mode = read_pdf.getPageMode()
page_text = page.extractText()
print (type(page_text))
'''
#with open(path, 'rb') as file:
# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)
# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()
# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)
# create a dictionary object for page data
all_pages = {}
# put meta data into a dict key
all_pages["meta"] = {}
# Use 'iteritems()` instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
print (meta, value)
all_pages["meta"][meta] = value
# iterate the page numbers
for page in range(num):
data = read_pdf.getPage(page)
#page_mode = read_pdf.getPageMode()
# extract the page's text
page_text = data.extractText()
# put the text data into the dict
all_pages[page] = page_text
# create a JSON string from the dictionary
json_data = json.dumps(all_pages)
#print ("\nJSON:", json_data)
# convert JSON string to bytes-like obj
bytes_string = bytes(json_data, 'utf-8')
#print ("\nbytes_string:", bytes_string)
# convert bytes to base64 encoded string
encoded_pdf = base64.b64encode(bytes_string)
encoded_pdf = str(encoded_pdf)
#print ("\nbase64:", encoded_pdf)
# put the PDF data into a dictionary body to pass to the API request
body_doc = {"data": encoded_pdf}
# call the index() method to index the data
result = elastic_client.index(index="pdf", doc_type="_doc", id="42", body=body_doc)
# print the returned sresults
#print ("\nindex result:", result['result'])
# make another Elasticsearch API request to get the indexed PDF
result = elastic_client.get(index="pdf", doc_type='_doc', id=42)
# print the data to terminal
result_data = result["_source"]["data"]
#print ("\nresult_data:", result_data, '-- type:', type(result_data))
# decode the base64 data (use to [:] to slice off
# the 'b and ' in the string)
decoded_pdf = base64.b64decode(result_data[2:-1]).decode("utf-8")
#print ("\ndecoded_pdf:", decoded_pdf)
# take decoded string and make into JSON object
json_dict = json.loads(decoded_pdf)
#print ("\njson_str:", json_dict, "\n\ntype:", type(json_dict))
result2 = elastic_client.index(index="pdftext", doc_type="_doc", id="42", body=json_dict)
# create new FPDF object
pdf = FPDF()
# build the new PDF from the Elasticsearch dictionary
# Use 'iteritems()` instead of 'items()' for Python 2
""" for page, value in json_data:
if page != "meta":
# create new page
pdf.add_page()
pdf.set_font("Arial", size=14)
# add content to page
output = value + " -- Page: " + str(int(page)+1)
pdf.cell(150, 12, txt=output, ln=1, align="C")
else:
# create the meta data for the new PDF
for meta, meta_val in json_dict["meta"].items():
if "title" in meta.lower():
pdf.set_title(meta_val)
elif "producer" in meta.lower() or "creator" in meta.lower():
pdf.set_creator(meta_val)
"""
# output the PDF object's data to a PDF file
#pdf.output("object_rocket_from_elaticsearch.pdf" )
@app.route('/', methods=['GET'])
def index():
return jsonify(json_dict)
@app.route('/<id>', methods=['GET'])
def index_by_id(id):
return jsonify(json_dict[id])
""" @app.route('/insert_data', methods=['PUT'])
def insert_data():
slug = request.form['slug']
title = request.form['title']
content = request.form['content']
body = {
'slug': slug,
'title': title,
'content': content,
'timestamp': datetime.now()
}
result = es.index(index='contents', doc_type='title', id=slug, body=body)
return jsonify(result) """
app.run(port=5003, debug=True)
------进度----- 我现在有一个没有前端搜索功能的工作解决方案:
# Load_single_PDF_BY_PAGE_TO_index.py
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
from flask import Flask, jsonify, request, render_template, json
from datetime import datetime
import pandas as pd
# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)
#with open(path, 'rb') as file:
# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)
# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()
# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)
# create a dictionary object for page data
all_pages = {}
# put meta data into a dict key
all_pages["meta"] = {}
# Use 'iteritems()` instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
print (meta, value)
all_pages["meta"][meta] = value
x = 44
# iterate the page numbers
for page in range(num):
data = read_pdf.getPage(page)
#page_mode = read_pdf.getPageMode()
# extract the page's text
page_text = data.extractText()
# put the text data into the dict
all_pages[page] = page_text
body_doc2 = {"data": page_text}
result3 = elastic_client.index(index="pdfclearn", doc_type="_doc", id=x, body=body_doc2)
x += 1
上面的代码按页面将单个 pdf 加载到 elasticsearch 中。
from flask import Flask, jsonify, request,render_template
from elasticsearch import Elasticsearch
from datetime import datetime
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)
@app.route('/pdf', methods=['GET'])
def index():
results = es.get(index='pdfclearn', doc_type='_doc', id='44')
return jsonify(results['_source'])
@app.route('/pdf/<id>', methods=['GET'])
def index_by_id(id):
results = es.get(index='pdfclearn', doc_type='_doc', id=id)
return jsonify(results['_source'])
@app.route('/search/<keyword>', methods=['POST','GET'])
def search(keyword):
keyword = keyword
body = {
"query": {
"multi_match": {
"query": keyword,
"fields": ["data"]
}
}
}
res = es.search(index="pdfclearn", doc_type="_doc", body=body)
return jsonify(res['hits']['hits'])
@app.route("/searhbar")
def searhbar():
return render_template("index.html")
@app.route("/searhbar/<string:box>")
def process(box):
query = request.args.get('query')
if box == 'names':
keyword = box
body = {
"query": {
"multi_match": {
"query": keyword,
"fields": ["data"]
}
}
}
res = es.search(index="pdfclearn", doc_type="_doc", body=body)
return jsonify(res['hits']['hits'])
app.run(port=5003, debug=True)
在上面的代码中,我们可以在所有页面中搜索关键字或短语。
curl http://127.0.0.1:5003/search/test //it works!!
我找到了一篇关于如何在 ElasticSearch 中将 PDF 文件作为 Base64 索引的博客。我已经看到 DocuSign 的 API 为文档模板执行此操作。但是,我不明白如何以可搜索 ElasticSearch 的方式对 Base64 PDF 进行 Jsonify。
curl "http://localhost:9200/pdftext/_doc/42"
curl -X POST "http://localhost:9200/pdf/_search?q=*"
我可以检索 700 页文档的 Base64。但我认为我需要的是索引和检索文档的每一页。
我研究过的博客让我成功了:
- https://kb.objectrocket.com/elasticsearch/how-to-index-a-pdf-file-as-an-elasticsearch-index-267
- https://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-xvi-full-text-search
结束游戏:
我将继续研究 Elastic Search 和 Base64 编码和解码。但我想要一些帮助来实现我的目标。任何详细的例子将不胜感激。
解决方案
------进度----- 我现在有一个没有前端搜索功能的工作解决方案:
# Load_single_PDF_BY_PAGE_TO_index.py
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
from flask import Flask, jsonify, request, render_template, json
from datetime import datetime
import pandas as pd
# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)
#with open(path, 'rb') as file:
# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)
# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()
# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)
# create a dictionary object for page data
all_pages = {}
# put meta data into a dict key
all_pages["meta"] = {}
# Use 'iteritems()` instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
print (meta, value)
all_pages["meta"][meta] = value
x = 44
# iterate the page numbers
for page in range(num):
data = read_pdf.getPage(page)
#page_mode = read_pdf.getPageMode()
# extract the page's text
page_text = data.extractText()
# put the text data into the dict
all_pages[page] = page_text
body_doc2 = {"data": page_text}
result3 = elastic_client.index(index="pdfclearn", doc_type="_doc", id=x, body=body_doc2)
x += 1
上面的代码按页面将单个 pdf 加载到 elasticsearch 中。
from flask import Flask, jsonify, request,render_template
from elasticsearch import Elasticsearch
from datetime import datetime
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)
@app.route('/pdf', methods=['GET'])
def index():
results = es.get(index='pdfclearn', doc_type='_doc', id='44')
return jsonify(results['_source'])
@app.route('/pdf/<id>', methods=['GET'])
def index_by_id(id):
results = es.get(index='pdfclearn', doc_type='_doc', id=id)
return jsonify(results['_source'])
@app.route('/search/<keyword>', methods=['POST','GET'])
def search(keyword):
keyword = keyword
body = {
"query": {
"multi_match": {
"query": keyword,
"fields": ["data"]
}
}
}
res = es.search(index="pdfclearn", doc_type="_doc", body=body)
return jsonify(res['hits']['hits'])
@app.route("/searhbar")
def searhbar():
return render_template("index.html")
@app.route("/searhbar/<string:box>")
def process(box):
query = request.args.get('query')
if box == 'names':
keyword = box
body = {
"query": {
"multi_match": {
"query": keyword,
"fields": ["data"]
}
}
}
res = es.search(index="pdfclearn", doc_type="_doc", body=body)
return jsonify(res['hits']['hits'])
app.run(port=5003, debug=True)
在上面的代码中,我们可以在所有页面中搜索关键字或短语。
curl http://127.0.0.1:5003/search/test //it works!!
推荐阅读
- javascript - 动态地将 aria-hidden 标签添加到主干视图
- python - 根据另一列减去某个熊猫数据框列的最小值
- ssl - 如何避免从 AWS 迁移到 GCP 的停机时间?通常用于负载平衡和 SSL 设置
- react-native - Expo Recording + Google Cloud Storage:文件损坏?错误的 API POST?
- http - How to extract comma separated values from query parameter in Go?
- java - 为什么在 Netbeans 中完成“刷新指数”需要几个小时?刷新时如何清理项目?
- javascript - Javascript:innerWidth 的条件不起作用
- anaconda - 尝试激活我的 conda 环境时,终端中没有任何内容
- typescript - NestJS ClientRedis 发出但未触发 EventPattern 处理程序
- reactjs - React useEffect 钩子中的平滑滚动错误仅在 Chrome / Chromium 上