首页 > 解决方案 > 在 collat​​e_batch(self, features)->ValueError: 只有一个元素张量可以转换为 Python 标量

问题描述

我正在尝试在 commonsense_qa 数据集上使用转换器。我想使用数据作为

question,option1 question,option2等对于每个选项我也会为每个输入传递问题。对于 input_ids、attention_mask、decoder_input_ids、decoder_attention_mask,我执行以下操作。最大长度 = 128

def convert_to_commonsense_qa_features(example_batch):
    num_examples = len(example_batch["question"])
    num_choices = len(example_batch["choices"][0]["text"])
    features, features2 = {}, {}
    labels2id = {char: i for i, char in enumerate("ABCDE")}
    for example_i in range(num_examples):
        choices_inputs = tokenizer.batch_encode_plus(
            list(zip(
                [example_batch["question"][example_i]] * num_choices,
                example_batch["choices"][example_i]["text"],
            )),
            max_length=max_length, pad_to_max_length=True,
        )
        for k, v in choices_inputs.items():
            if k not in features:
                features[k] = []
            features[k].append(v)
        choices_outputs = tokenizer.batch_encode_plus(
                      str(labels2id[example_batch["answerKey"][example_i]]),
                      max_length=max_length, pad_to_max_length=True,
        )
        for k, v in choices_outputs.items():
            if k not in features2:
                features2[k] = []
            features2[k].append(v)
    labels2id = {char: i for i, char in enumerate("ABCDE")}

    features["decoder_input_ids"] = features2["input_ids"]
    features["decoder_attention_mask"] = features2["attention_mask"]
    # Dummy answers for test
    """
    if example_batch["answerKey"][0]:
        features["labels"] = [labels2id[ans] for ans in example_batch["answerKey"]]
    else:
        features["labels"] = [0] * num_examples
    """ 
    features["labels"] = features2["input_ids"]  
    #print(features)
    return features

convert_func_dict = {
    "commonsense_qa": convert_to_commonsense_qa_features,
}


columns_dict = {
    "commonsense_qa": ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
}

features_dict = {}
for task_name, dataset in dataset_dict.items():
    features_dict[task_name] = {}
    for phase, phase_dataset in dataset.items():
        print(phase, " ",phase_dataset)
        if phase!="test":
          features_dict[task_name][phase] = phase_dataset.map(
              convert_func_dict[task_name],
              batched=True,
              load_from_cache_file=False,
          )
          print(task_name, phase, len(phase_dataset), len(features_dict[task_name][phase]))
          features_dict[task_name][phase].set_format(
              type="torch", 
              columns=columns_dict[task_name],
          )
          print(task_name, phase, len(phase_dataset), len(features_dict[task_name][phase]))

和数据加载器是

class NLPDataCollator(DataCollator):
    """
    Extending the existing DataCollator to work with NLP dataset batches
    """
    def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:
        first = features[0]
        if isinstance(first, dict):
          # NLP data sets current works presents features as lists of dictionary
          # (one per example), so we  will adapt the collate_batch logic for that
          if "labels" in first and first["labels"] is not None:
              if first["labels"].dtype == torch.int64:
                  labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)
              else:
                  labels = torch.tensor([f["labels"] for f in features], dtype=torch.float)
              batch = {"labels": labels}
          for k, v in first.items():
              if k != "labels" and v is not None and not isinstance(v, str):
                  batch[k] = torch.stack([f[k] for f in features])
          return batch
        else:
          # otherwise, revert to using the default collate_batch
          return DefaultDataCollator().collate_batch(features)

教练是

trainer = MultitaskTrainer(
    model=model,
    args=transformers.TrainingArguments(
        output_dir="./models/",
        overwrite_output_dir=True,
        learning_rate=1e-5,
        do_train=True,
        do_eval=True,
        num_train_epochs=3,
        # Adjust batch size if this doesn't fit on the Colab GPU
        per_device_train_batch_size=8,  
        save_steps=3000,
    ),
    data_collator=NLPDataCollator(),
    train_dataset=train_dataset,
    eval_dataset = valid_dataset,

)
trainer.train()

我收到以下错误:

ValueError                                Traceback (most recent call last)

<ipython-input-49-68181c1063c6> in <module>()
     26 
     27 )
---> 28 trainer.train()

8 frames

<ipython-input-39-4359c3725689> in collate_batch(self, features)
     20           if "labels" in first and first["labels"] is not None:
     21               if first["labels"].dtype == torch.int64:
---> 22                   labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)
     23               else:
     24                   labels = torch.tensor([f["labels"] for f in features], dtype=torch.float)

ValueError: only one element tensors can be converted to Python scalars

标签: pytorchhuggingface-transformerspytorch-dataloader

解决方案


推荐阅读