python - 我的 Hugging Face 模型没有在异步事件循环中生成摘要,是否有原因?
问题描述
我正在尝试从 streamlit 前端处理一个 CSV 文件,该文件有一个 URL 列表,我正在使用 nltk 对其进行预处理,以传递给 Hugging Face transformer 模型进行摘要。我想为此使用 asyncio 和 ProcessPoolExecutor 创建一个后台任务,并将 task id 返回到 UI 以轮询结果;每个 URL 的结果分别存储在后端存储的文本文件中。我无法弄清楚为什么当我使用后台异步任务时我的模型没有被调用,它不输出任何错误或日志。当我以同步方式为文件中的第一个 URL 调用模型时,它返回了输出,不知道为什么它在异步事件循环中时没有被调用。这是我的 main.py,它接收一个 CSV 文件并调用推理函数。
class Job(BaseModel):
    """State record for one background summarization job, polled by the UI."""
    uid: UUID = Field(default_factory=uuid4)
    status: str = "in_progress"
    processed_urls: List[str] = Field(default_factory=list)
    # Declared so that `jobs[uid].result = ...` in generate_remaining_summaries
    # works: pydantic raises on assignment to a field that was never declared.
    result: List[str] = Field(default_factory=list)
# Application singletons: the FastAPI app, the heavyweight summarization
# model wrapper (loads BART at import time — see Inference.py), and an
# in-memory registry of background jobs keyed by job uid.
app = FastAPI()
nlp = NLP()
jobs: Dict[UUID, Job] = {}
@app.get("/")
def read_root():
return {"message": "Welcome from the API"}
@app.post("/{contentType}", status_code=HTTPStatus.ACCEPTED)
async def get_summary(background_tasks: BackgroundTasks, contentType:
str, file: UploadFile = File(...)):
df = pd.read_excel(file.file.read(), index_col=None, header=None)
model_name = config.MODEL_NAMES[contentType]
start = time.time()
name = f"/storage/{str(uuid.uuid4())}.txt"
new_task = Job()
jobs[new_task.uid] = new_task
background_tasks.add_task(generate_remaining_summaries, new_task.uid,
model_name, name, df)
return new_task
async def generate_remaining_summaries(uid: UUID, model_name, name, df):
    """Run the blocking summarization in a worker process, then mark the job done.

    NOTE(review): generate_summary executes in a *child process*, so its
    mutations of the module-level `jobs` dict never reach this process;
    only the value returned through the executor is visible here.
    """
    # get_event_loop() is deprecated inside coroutines; use the running loop.
    event_loop = asyncio.get_running_loop()
    # Context-manage the executor so its worker processes are shut down
    # (the original leaked a new ProcessPoolExecutor on every job).
    with ProcessPoolExecutor() as executor:
        jobs[uid].result = await event_loop.run_in_executor(
            executor, partial(generate_summary, uid, model_name, name, df))
    jobs[uid].status = "complete"
def generate_summary(task_id: UUID, model_name, name, df):
    """Summarize the URL in column 0 of every row of *df*, writing each
    summary to its own text file; returns the list of files written.

    Runs inside a ProcessPoolExecutor worker: mutating the module-level
    `jobs` dict here only changes this child process's copy, so the
    written-file list is also *returned* for the parent to record.
    """
    logger.info("model_name in generate_summary " + model_name)
    written = []
    for ind in range(len(df)):
        # Original read df.iat[ind + 1, 0], which skipped row 0 and raised
        # IndexError on the final iteration; index the current row instead.
        url = df.iat[ind, 0]
        article_text = get_text(url)
        summary = nlp.inference(model_name, article_text)
        # Rebuild the output path as <prefix>_<row>.txt for each row.
        name = name.split(".")[0]
        name = f"{name.split('_')[0]}_{ind}.txt"
        logger.info("name " + name)
        with open(name, 'w+') as file1:
            for listItem in summary:
                file1.write('%s\n' % listItem)
        written.append(name)
        jobs[task_id].processed_urls.append(name)  # child-process copy only
    jobs[task_id].status = "completed"  # child-process copy only
    return written
@app.get("/work/{uid}/status")
async def status_handler(uid: UUID):
return jobs[uid]
if __name__ == "__main__":
uvicorn.run("main:app", host="0.0.0.0", port=8080)
> Inference.py
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration
import nltk
import torch
import logging.config
import os
import config
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
# One-time module setup: fetch the Punkt sentence-tokenizer data used by
# nltk.sent_tokenize in nest_sentences, and configure logging verbosity.
nltk.download('punkt')
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Verbose Hugging Face logs (model download / load progress).
transformers.logging.set_verbosity_info()
def nest_sentences(document):
    """Split *document* into chunks of whole sentences, each chunk totalling
    fewer than 1024 characters, for feeding to the summarizer.

    Returns a list of lists of sentences. A single sentence longer than the
    budget becomes its own (oversized) chunk and is later truncated by the
    tokenizer.
    """
    nested = []
    sent = []
    length = 0
    for sentence in nltk.sent_tokenize(document):
        length += len(sentence)
        if length < 1024:
            sent.append(sentence)
        else:
            # Flush the current chunk and start a new one with this sentence.
            # Guarded: if the very first sentence alone exceeds the budget,
            # the original appended an *empty* chunk here.
            if sent:
                nested.append(sent)
            sent = [sentence]
            length = len(sentence)
    if sent:
        nested.append(sent)
    return nested
class NLP:
    """Thin wrapper around facebook/bart-large-cnn for two-pass summarization."""

    def __init__(self):
        # Cache directory rooted at MODEL_DIR so restarts reuse downloaded weights.
        self.cache_dir = os.environ["MODEL_DIR"] + "facebook/bart-large-cnn/"
        self.tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn",
                                                       cache_dir=self.cache_dir)
        self.model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn",
                                                                  cache_dir=self.cache_dir)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Move the model to the device once here; the original re-issued
        # model.to(device) on every generate() call inside the loop.
        self.model.to(self.device)

    def generate_summary(self, nested_sentences):
        """Summarize each chunk (a list of sentences) and return a flat list
        of decoded summary strings."""
        logger.info("Inside inference before generate summary")
        summaries = []
        for nested in nested_sentences:
            input_tokenized = self.tokenizer.encode(' '.join(nested), truncation=True,
                                                    return_tensors='pt')
            input_tokenized = input_tokenized.to(self.device)
            summary_ids = self.model.generate(input_tokenized,
                                              length_penalty=3.0)
            output = [self.tokenizer.decode(g, skip_special_tokens=True,
                                            clean_up_tokenization_spaces=False)
                      for g in summary_ids]
            summaries.append(output)
        # Flatten the list-of-lists into a single list of summary strings.
        summaries = [sentence for sublist in summaries for sentence in sublist]
        return summaries

    def inference(self, model_name, article_text):
        """Two-pass summarization: summarize each chunk of the article, then
        summarize the concatenated first-pass summaries.

        NOTE(review): *model_name* is currently ignored — BART is hard-coded
        in __init__; confirm whether per-request model selection is intended.
        """
        nested = nest_sentences(article_text)
        logger.info(nested)
        summarized_text = self.generate_summary(nested)
        logger.info("Inside inference summarized text")
        logger.info(summarized_text)
        nested_summ = nest_sentences(' '.join(summarized_text))
        return self.generate_summary(nested_summ)
解决方案
推荐阅读
- sql - 在 T-Sql 中使用用户定义的数据类型识别所有存储过程和表
- wget - 使用 wget 递归获取 .php 文件中的 .txt 文件,但过滤器会破坏命令
- node.js - 找不到模块“install-npm-version”
- c# - C# - 返回 PayDate 最低但 ReconciliationDate 最高的记录
- c++ - 我需要使用哈希表创建 MultiMap,但出现超出时间限制的错误(C++)
- javascript - 需要 CanvasRenderingContext2D 作为父对象才能运行
- java - 如何在 java 中使用 Gmail API 回复邮件?
- ruby - Jekyll:在不使用外部插件的情况下获取图像的宽度/高度
- python - 我对我的 Python 感到困惑。ValueError:对已关闭文件的 I/O 操作
- excel - 使用 PowerShell 在 Excel 中批量添加超链接