I wrote a code in Python to familiarize myself with Machine Learning. The program opens a JSON file and then populate the data into a dataset, and then fine tune a hugging face model with the dataset that I provided. However, I ran into this error:
Traceback (most recent call last):
File "C:\Users\chenp\Documents\ML\machineLearning.py", line 45, in <module>
trainer.train()
File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1859, in train
return inner_training_loop(
File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 2165, in _inner_training_loop
for step, inputs in enumerate(epoch_iterator):
File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\accelerate\data_loader.py", line 454, in __iter__
current_batch = next(dataloader_iter)
File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 631, in __next__
data = self._next_data()
File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 675, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch
return self.collate_fn(data)
File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 271, in __call__
batch = pad_without_fast_tokenizer_warning(
File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 66, in pad_without_fast_tokenizer_warning
padded = tokenizer.pad(*pad_args, **pad_kwargs)
File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\tokenization_utils_base.py", line 3274, in pad
raise ValueError(
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['question', 'answer']
from this line:
trainer.train()
This is my full code:
# https:
# https:
# https:
# https:
import transformers as tf
import datasets as ds
import pandas as pd
import numpy as np
import torch
import json
############## Check if CUDA is enabled. ################
hasCUDA=torch.cuda.is_available()
print(f"CUDA Enabled? {hasCUDA}")
device="cuda" if hasCUDA else "cpu"
############## Loading file and populating data ################
fileName="qna.json"
trainDS=ds.load_dataset("json", data_files=fileName, split="train")
evalDS=ds.load_dataset("json", data_files=fileName)
# rawDS=ds.load_dataset('squad')
############## Model ##########################################
modelName="./distilbert-base-cased" #or replace the model name with whatever you feel like.
config=tf.AutoConfig.from_pretrained(modelName+"/config.json")
model=tf.AutoModelForQuestionAnswering.from_pretrained(modelName,config=config)
tokenizer=tf.AutoTokenizer.from_pretrained(modelName)
############## Training #######################################
trnArgs=tf.TrainingArguments(
output_dir="./",
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=3,
remove_unused_columns=False,
fp16=True
)
trainer=tf.Trainer(
model=model,
args=trnArgs,
train_dataset=trainDS,
eval_dataset=evalDS,
tokenizer=tokenizer
)
trainer.train()
What is the way to conform to the format that the trainer is expecting?
What I have tried:
Added input_id column which gave me a different error.
for i in len(trainDS):
input_ids.append(10+i)
trainDS = trainDS.add_column("input_ids", input_ids)