LINUX.ORG.RU

Пытаюсь тренировать модель HuggingFace

 ,


0

1

пытаюсь тренировать модель, получаю ошибку, но не понимаю, как её решить. Гугл ничего путного не дал

ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,labels,output_attentions,output_hidden_states,return_dict,labels,label,label_ids.

Вот, сам код

## imports

from datasets import Dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

import numpy as np
# BUG FIX: `plt` is used in the plotting section below but was never imported.
import matplotlib.pyplot as plt

## data
# Load 20-newsgroup dataset and arrange it into a list of tuples
# data = [("description1", 7), ("description2", 3), ...]
# BUG FIX: labels must be integer class ids, not target-name strings —
# the model's cross-entropy loss cannot consume strings.
newsgroups_train = fetch_20newsgroups(subset="train")
data = [
    (newsgroups_train.data[i], int(newsgroups_train.target[i]))
    for i in range(len(newsgroups_train.data))
]


## Prepare the dataset
descriptions = [item[0] for item in data]
categories = [item[1] for item in data]

# Tokenizer and Model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(set(categories))
)


## Encoding data
def encode(examples):
    """Tokenize a batch of examples; meant to be applied via Dataset.map."""
    return tokenizer(examples["text"], truncation=True, padding="max_length")


## Train/Test split
(
    train_descriptions,
    test_descriptions,
    train_categories,
    test_categories,
) = train_test_split(descriptions, categories, test_size=0.2)
training_args = TrainingArguments("test_trainer")


def hugginface_dataset(text, labels):
    """Wrap parallel text/label lists into a `datasets.Dataset`."""
    return Dataset.from_dict(
        {
            "text": text,
            "labels": labels,
        }
    )


train_dataset = hugginface_dataset(train_descriptions, train_categories)
test_dataset = hugginface_dataset(test_descriptions, test_categories)

# BUG FIX: the original passed `encode(train_dataset)` straight to Trainer.
# That returns a plain BatchEncoding (no "labels" column, not indexable the
# way Trainer expects), which is exactly what produced
# "ValueError: The batch received was empty".  Tokenizing via Dataset.map
# keeps a Dataset carrying input_ids/attention_mask AND the labels column.
train_dataset = train_dataset.map(encode, batched=True)
test_dataset = test_dataset.map(encode, batched=True)


def compute_metrics(eval_pred):
    """Accuracy metric so evaluate() reports "eval_accuracy" for the plot."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


## Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


## Train the model
trainer.train()

# Evaluation (Trainer prefixes every reported metric with "eval_")
train_metrics = trainer.evaluate(train_dataset)
test_metrics = trainer.evaluate(test_dataset)

# Print metrics
print("Training metrics:", train_metrics)
print("Testing metrics:", test_metrics)

# Collect metrics
metrics = {"train": train_metrics, "test": test_metrics}

# Plot metrics
# BUG FIX: evaluate() returns keys "eval_loss"/"eval_accuracy", not bare
# "loss"/"accuracy", so the original lookups would raise KeyError.
plt.figure(figsize=(10, 6))
for metric in ["eval_loss", "eval_accuracy"]:
    plt.plot(
        ["train", "test"],
        [metrics["train"][metric], metrics["test"][metric]],
        label=metric,
    )
plt.legend()
plt.title("Training and testing metrics")
plt.show()
##
##
★★

Последнее исправление: phrm (всего исправлений: 1)

Пример не минимизирован, зато минимизирован трейсбэк, чтобы мы не догадались, где эта ошибка случилась. На тебе ответ, сообразный твоей культуре задавания вопросов:

Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,labels,output_attentions,output_hidden_states,return_dict,labels,label,label_ids.

И как, contains?

t184256 ★★★★★
()

:D

ValueError: полученный пакет был пустым, ваша модель не сможет на нем обучаться. Дважды проверьте, что ваш набор обучающих данных содержит ключи, ожидаемые моделью.
LINUX-ORG-RU ★★★★★
()

Иван, Вы пытаетесь обучить тёплое — мягким, буквально. Формат данных, которым вы пытаетесь накормить бедную BERTу, вообще не совпадает с тем, что она кушала до этого.

Решение:

import torch
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import numpy as np
from datasets import load_dataset

# Model checkpoint and tokenization settings.
model_name = "bert-base-uncased"
max_length = 512

tokenizer = AutoTokenizer.from_pretrained(model_name)


def read_20newsgroups(test_size=0.2):
    """Fetch the full 20-newsgroups corpus (headers/footers/quotes stripped)
    and return ((train_texts, valid_texts, train_labels, valid_labels),
    target_names)."""
    bunch = fetch_20newsgroups(
        subset="all",
        shuffle=True,
        remove=("headers", "footers", "quotes"),
    )
    split = train_test_split(bunch.data, bunch.target, test_size=test_size)
    return split, bunch.target_names


(train_texts, valid_texts, train_labels, valid_labels), target_names = read_20newsgroups()

# Tokenize both splits up front; truncation/padding bound every row at max_length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

class NewsGroupsDataset(torch.utils.data.Dataset):
    """Wrap pre-computed tokenizer encodings plus integer labels for Trainer.

    Each item is a dict of tensors (input_ids, attention_mask, ...) with a
    scalar "labels" tensor — the layout Trainer's default collator expects.
    """

    def __init__(self, encodings, labels):
        # encodings: BatchEncoding/dict of parallel per-example lists.
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # BUG FIX: was torch.tensor([self.labels[idx]]) -> shape (1,);
        # batching then yields (batch, 1) labels, which breaks
        # CrossEntropyLoss for single-label classification.  A scalar
        # (0-dim) tensor collates to the expected (batch,) shape.
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Wrap the tokenized splits so Trainer can index individual examples.
train_dataset = NewsGroupsDataset(train_encodings, train_labels)
eval_dataset = NewsGroupsDataset(valid_encodings, valid_labels)

# One output class per newsgroup category.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))  # better add .to("cuda") here if possible, since on CPU this will take hours
# NOTE(review): this assignment is dead — training_args is rebound below,
# before Trainer is constructed.
training_args = TrainingArguments(output_dir="test_trainer")

def compute_metrics(eval_pred):
    """Return {"accuracy": fraction correct} for a Trainer EvalPrediction.

    BUG FIX: the original called `metric.compute(...)` but `metric` was
    never defined anywhere (e.g. via `evaluate.load("accuracy")`), so
    evaluation would crash with NameError.  Accuracy is computed directly
    with numpy instead — no extra dependency needed.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}

# Final training configuration: run evaluation at the end of every epoch.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune; downloads/caches the base checkpoint on first run.
trainer.train()
Obezyan
()