How to speed up text translation?

Problem Description

Is there a way to speed up the processing?

I have to translate three text fields in each of ~126k samples. The estimated time for this task is over 96 hours:

import pickle
from deep_translator import GoogleTranslator
from tqdm import tqdm

def translate(text):
    # each call issues one blocking network request to the translation service
    return GoogleTranslator(
        source='english',
        target='portuguese').translate(text)

def translate_samples(samples):
    translated_sample = []
    for sample in tqdm(samples):
        translated_sample.append({
            "idx": sample["idx"],
            "qs1": translate(sample["qs1"]),
            "qs2": translate(sample["qs2"]),
            "ans": translate(sample["ans"]),
            "cls":sample["cls"]
            })

    return translated_sample


def perform_tasks():
    with open("resource/dataset/aug.pkl", "rb") as samples_file:
        samples = pickle.load(samples_file)

    translated_sample = translate_samples(samples)

    with open("resource/dataset/aug_pt_br.pkl", "wb") as samples_file:
        pickle.dump(translated_sample, samples_file)


if __name__ == '__main__':
    perform_tasks()

# 0%|                                    | 36/126738 [00:36<96:12:38,  2.14s/it]

Can you give me some pointers?

Tags: python, parallel-processing, google-translate

Solution

You can try one of the other pretrained models available from Hugging Face. With the sample code below, that could cut roughly 75 hours off the runtime for your dataset. You can also combine batching with a GPU for better performance (see the sketch after the example).

from transformers import MarianTokenizer, MarianMTModel
import time

# NOTE: this checkpoint translates French -> English; for the question's
# English -> Portuguese task, substitute a matching Helsinki-NLP en -> pt checkpoint
model_name = 'Helsinki-NLP/opus-mt-fr-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# yield successive n-sized chunks of the data
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i : i + n]

# `data` is assumed to be a pandas DataFrame; `source_cols` / `target_cols`
# hold the names of its source-language and target-language text columns
for i, col in enumerate(source_cols):
    start = time.time()
    translations = []
    batch_no = 0
    for source_text in chunks(data[col].tolist(), 500):
        batch_no += 1
        # encode the source-language text; calling the tokenizer directly returns
        # the input_ids/attention_mask dict that generate() expects
        print('batch %d tokenization started' % batch_no)
        batch = tokenizer(source_text, return_tensors='pt', padding=True)
        # predict the output token ids
        print('batch %d prediction started.' % batch_no)
        outputs = model.generate(**batch)
        # decode every sequence in the batch back to text
        print('batch %d decoding started.' % batch_no)
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        translations.extend(decoded_output)
        print('batch %d completed' % batch_no)
    data[target_cols[i]] = translations
    end = time.time()
    print('%.2f hours taken for column %s' % ((end - start) / 3600, col))
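
The loop above runs on the CPU. To act on the batching-plus-GPU suggestion, here is a minimal sketch of the same inner loop moved onto CUDA, assuming a CUDA-capable machine and reusing the (assumed) data/source_cols setup from above:

import torch

# move the model to the GPU once, before the loop; fall back to CPU if needed
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()

translations = []
for source_text in chunks(data[source_cols[0]].tolist(), 500):
    # tokenize on the CPU, then move the input tensors to the model's device
    batch = tokenizer(source_text, return_tensors='pt', padding=True).to(device)
    with torch.no_grad():  # inference only, so skip gradient bookkeeping
        outputs = model.generate(**batch)
    translations.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Larger batch sizes generally help on a GPU until memory runs out, so 500 is just a starting point to tune.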

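If you would rather keep the deep_translator approach from the question, note that each call is a blocking network request, so the work is I/O-bound and can be overlapped with a thread pool. A rough sketch using the standard library's concurrent.futures (max_workers=8 is an assumed starting point; tune it, and expect the service to rate-limit aggressive values):

from concurrent.futures import ThreadPoolExecutor

from deep_translator import GoogleTranslator
from tqdm import tqdm

def translate(text):
    return GoogleTranslator(
        source='english',
        target='portuguese').translate(text)

def translate_one(sample):
    # translate the three text fields of a single sample
    return {
        "idx": sample["idx"],
        "qs1": translate(sample["qs1"]),
        "qs2": translate(sample["qs2"]),
        "ans": translate(sample["ans"]),
        "cls": sample["cls"],
    }

def translate_samples_concurrent(samples, max_workers=8):
    # executor.map preserves the input order of the samples
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(tqdm(executor.map(translate_one, samples), total=len(samples)))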