Is there a reason my Hugging Face model is not generating summaries in an async event loop?

Problem Description

I am trying to process a CSV file uploaded from a Streamlit front end. The file contains a list of URLs, which I preprocess with nltk before passing them to a Hugging Face transformer for summarization. I want to run this as a background task using asyncio and a ProcessPoolExecutor, returning a task id to the UI so it can poll for the results, which are stored per URL in text files on the backend. I cannot figure out why my model is never invoked when I use the background async task; it produces no errors and no log output. When I call the model synchronously for the first URL in the file, it returns an output, so I do not understand why it is not being called inside the async event loop. Here is my main.py, which receives a CSV file and calls the inference function.

class Job(BaseModel):
    uid: UUID = Field(default_factory=uuid4)
    status: str = "in_progress"
    processed_urls: List[str] = Field(default_factory=list)


app = FastAPI()
nlp = NLP()
jobs: Dict[UUID, Job] = {}


@app.get("/")
def read_root():
    return {"message": "Welcome from the API"}


@app.post("/{contentType}", status_code=HTTPStatus.ACCEPTED)
async def get_summary(background_tasks: BackgroundTasks, contentType: str,
                      file: UploadFile = File(...)):
    df = pd.read_excel(file.file.read(), index_col=None, header=None)
    model_name = config.MODEL_NAMES[contentType]
    start = time.time()
    name = f"/storage/{str(uuid.uuid4())}.txt"
    new_task = Job()
    jobs[new_task.uid] = new_task
    background_tasks.add_task(generate_remaining_summaries, new_task.uid,
                              model_name, name, df)
    return new_task

async def generate_remaining_summaries(uid: UUID, model_name, name, df):
    executor = ProcessPoolExecutor()
    event_loop = asyncio.get_event_loop()
    jobs[uid].result = await event_loop.run_in_executor(
        executor, partial(generate_summary, uid, model_name, name, df))
    jobs[uid].status = "complete"
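
One thing that is easy to miss in generate_remaining_summaries: Job declares no result field, and assigning an undeclared attribute on a pydantic model raises an error by default, so the assignment above can fail before status is ever updated; because this happens inside a background task after the response has been sent, the failure is easy to miss in the server output. A minimal sketch of a wrapper that surfaces worker failures explicitly (the run_in_process name is illustrative, not part of the original code):

import asyncio
import logging
from concurrent.futures import ProcessPoolExecutor
from functools import partial

logger = logging.getLogger(__name__)

async def run_in_process(func, *args):
    # get_running_loop() is the recommended call inside a coroutine
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor() as executor:
        try:
            return await loop.run_in_executor(executor, partial(func, *args))
        except Exception:
            # without this, a failure in the worker can vanish with the
            # background task instead of showing up in the logs
            logger.exception("worker process failed")
            raise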


def generate_summary(task_id: UUID, model_name, name, df):
    logger.info("model_name in generate_summary " + model_name)
    for ind in range(len(df)):
        url = df.iat[ind + 1, 0]

        article_text = get_text(url)
        summary = nlp.inference(model_name, article_text)
        name = name.split(".")[0]
        name = f"{name.split('_')[0]}_{ind}.txt"
        logger.info("name " + name)
        with open(name, 'w+') as file1:
            for listItem in summary:
                file1.write('%s\n' % listItem)
        jobs[task_id].processed_urls.append(name)
    jobs[task_id].status = "completed"
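
Note also that generate_summary runs in a separate process under the ProcessPoolExecutor, so the jobs[task_id] mutations above only touch the child's copy of the jobs dict; the parent process serving /work/{uid}/status never sees them. A hedged sketch of the usual workaround, returning results from the worker and updating shared state in the parent (generate_summary_worker is an illustrative name):

def generate_summary_worker(model_name, name, df):
    # pure worker: build and return the output file names instead of
    # mutating parent-process state
    processed = []
    # ... summarize each URL as above, appending each output filename ...
    return processed  # pickled back across the process boundary

async def generate_remaining_summaries(uid: UUID, model_name, name, df):
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor() as executor:
        processed = await loop.run_in_executor(
            executor, partial(generate_summary_worker, model_name, name, df))
    # apply the worker's results here, in the parent process where jobs lives
    jobs[uid].processed_urls = processed
    jobs[uid].status = "completed"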


@app.get("/work/{uid}/status")
async def status_handler(uid: UUID):
    return jobs[uid]
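
For completeness, the polling flow from a client would look roughly like this (a sketch using requests; the content type and file name are illustrative):

import time
import requests

resp = requests.post("http://localhost:8080/news",
                     files={"file": open("urls.csv", "rb")})
uid = resp.json()["uid"]

# poll until the background task reports completion
while True:
    job = requests.get(f"http://localhost:8080/work/{uid}/status").json()
    if job["status"] != "in_progress":
        break
    time.sleep(2)
print(job["processed_urls"])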


if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8080)
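
One platform-dependent pitfall worth ruling out: on macOS and Windows, ProcessPoolExecutor workers are spawned and re-import main.py, so the module-level nlp = NLP() reloads the full BART model in every worker before the task function even runs, which can look like a silent hang (and on Linux, forking a process that has already initialized CUDA is itself a known source of deadlocks). A sketch of loading the model lazily instead of at import time (the get_nlp helper is hypothetical):

_nlp = None

def get_nlp():
    # load the model on first use inside whichever process needs it,
    # so spawned workers do not block on NLP() during import
    global _nlp
    if _nlp is None:
        _nlp = NLP()
    return _nlp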

> Inference.py

import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

import nltk
import torch
import logging.config
import os
import config
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
transformers.logging.set_verbosity_info()


def nest_sentences(document):
    nested = []
    sent = []
    length = 0
    for sentence in nltk.sent_tokenize(document):
        length += len(sentence)
        if length < 1024:
            sent.append(sentence)
        else:
            nested.append(sent)
            sent = [sentence]
            length = len(sentence)

    if sent:
        nested.append(sent)

    return nested
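
As an aside, nest_sentences budgets chunks by character count, while BART's 1024 limit is measured in tokens; the character budget plus the truncation=True in the encode call below keeps the inputs legal, but if one wanted to budget by tokens directly, a rough sketch (nest_sentences_by_tokens is an illustrative name, reusing the same tokenizer):

def nest_sentences_by_tokens(document, tokenizer, max_tokens=1024):
    # same chunking idea, counting tokenizer tokens instead of characters
    nested, current, length = [], [], 0
    for sentence in nltk.sent_tokenize(document):
        n_tokens = len(tokenizer.tokenize(sentence))
        if length + n_tokens < max_tokens:
            current.append(sentence)
            length += n_tokens
        else:
            nested.append(current)
            current = [sentence]
            length = n_tokens
    if current:
        nested.append(current)
    return nested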


class NLP:
    def __init__(self):
        self.cache_dir = os.environ["MODEL_DIR"] + "facebook/bart-large-cnn/"
        self.tokenizer = BartTokenizer.from_pretrained(
            "facebook/bart-large-cnn", cache_dir=self.cache_dir)
        self.model = BartForConditionalGeneration.from_pretrained(
            "facebook/bart-large-cnn", cache_dir=self.cache_dir)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def generate_summary(self, nested_sentences):
        logger.info("Inside inference before generate summary")

        summaries = []
        for nested in nested_sentences:
            input_tokenized = self.tokenizer.encode(
                ' '.join(nested), truncation=True, return_tensors='pt')
            input_tokenized = input_tokenized.to(self.device)
            summary_ids = self.model.to(self.device).generate(
                input_tokenized, length_penalty=3.0)
            output = [self.tokenizer.decode(g, skip_special_tokens=True,
                                            clean_up_tokenization_spaces=False)
                      for g in summary_ids]
            summaries.append(output)

        summaries = [sentence for sublist in summaries for sentence in sublist]
        return summaries

    def inference(self, model_name, article_text):
        nested = nest_sentences(article_text)
        logger.info(nested)
        summarized_text = self.generate_summary(nested)
        logger.info("Inside inference summarized text")
        logger.info(summarized_text)
        nested_summ = nest_sentences(' '.join(summarized_text))
        return self.generate_summary(nested_summ)
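
For context, the synchronous call that does produce output for the first URL looks roughly like this (illustrative, assuming article_text has already been fetched):

nlp = NLP()
summary = nlp.inference("facebook/bart-large-cnn", article_text)
print(summary)  # list of summary sentences after the second summarization pass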

Tags: python, python-asyncio, fastapi, huggingface-transformers, background-task

Solution

