Commit 5f6cd3eb authored by Koen van der Veen's avatar Koen van der Veen

multiple implementations for uploading and downloading models

parent 93eaadbc
%% Cell type:code id:15eaf136 tags:
``` python
from transformers import AutoModelForSequenceClassification, AutoTokenizer
```
%% Cell type:code id:ebb34ad5 tags:
``` python
from transformers import AutoModel
```
%% Cell type:code id:03570a6d tags:
``` python
model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=10)
```
%% Output
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
%% Cell type:code id:79ec1f77 tags:
``` python
# Hyperparameters
model_name = "distilroberta-base"
batch_size = 32
learning_rate = 1e-3
```
%% Cell type:code id:3d4ccd98 tags:
``` python
# # Load model
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=10)
# # To increase training speed, we will freeze all layers except the classifier head.
# for param in model.base_model.parameters():
#     param.requires_grad = False
```
%% Cell type:markdown id:eee2c6b1 tags:
## Huggingface
%% Cell type:code id:0d6d1660 tags:
``` python
model_name = "my-test-model"
```
%% Cell type:code id:36e5d3c6 tags:
``` python
model.push_to_hub(model_name, use_temp_dir=True)
```
%% Output
/opt/anaconda3/envs/finetune-test1/lib/python3.7/site-packages/huggingface_hub/hf_api.py:1004: FutureWarning: `create_repo` now takes `token` as an optional positional argument. Be sure to adapt your code!
FutureWarning,
Cloning https://huggingface.co/koenvdv/my-test-model into local empty directory.
To https://huggingface.co/koenvdv/my-test-model
566c22b..bbd3b52 main -> main
'https://huggingface.co/koenvdv/my-test-model/commit/bbd3b526dc553b2055e90e38bdec3d126e3375b3'
%% Cell type:code id:9f9356b6 tags:
``` python
list(model.pooler.parameters())[0]
```
%% Output
Parameter containing:
tensor([[-1.1055e-02, 7.6790e-03, -1.7858e-02, ..., 4.5453e-02,
1.3963e-03, -3.4204e-03],
[-1.4697e-02, 3.0819e-02, 2.9549e-02, ..., -2.0563e-02,
6.0812e-05, -1.7535e-02],
[-3.9068e-03, 4.4484e-03, -2.5533e-02, ..., -2.3708e-02,
-3.8119e-02, -3.1126e-02],
...,
[ 2.5309e-02, -3.8438e-03, 1.3118e-03, ..., 1.6762e-02,
-2.8897e-04, -1.2778e-02],
[ 4.5472e-03, -3.1136e-02, -3.6995e-03, ..., -8.1584e-03,
-3.2597e-02, 8.8213e-03],
[ 9.7282e-03, 1.5389e-02, -3.4779e-03, ..., 4.2545e-02,
-1.3812e-02, -1.7629e-02]], requires_grad=True)
%% Cell type:code id:f463062b tags:
``` python
model2 = AutoModel.from_pretrained("koenvdv/my-test-model")
```
%% Output
Some weights of the model checkpoint at koenvdv/my-test-model were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at koenvdv/my-test-model and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
%% Cell type:code id:bc4d775b tags:
``` python
list(model2.pooler.parameters())[0]
```
%% Output
Parameter containing:
tensor([[-0.0255, -0.0311, -0.0239, ..., 0.0234, -0.0239, 0.0197],
[-0.0102, -0.0106, 0.0066, ..., 0.0140, -0.0143, 0.0446],
[ 0.0193, 0.0129, -0.0208, ..., -0.0049, -0.0234, -0.0031],
...,
[ 0.0045, 0.0013, -0.0094, ..., 0.0265, 0.0124, 0.0301],
[ 0.0038, 0.0182, -0.0185, ..., -0.0072, 0.0107, -0.0242],
[ 0.0063, -0.0095, 0.0072, ..., 0.0284, -0.0157, 0.0159]],
requires_grad=True)
%% Cell type:code id:fcf5e957 tags:
``` python
import torch
```
%% Cell type:code id:3212640d tags:
``` python
# Import
import wandb
# Save your model.
torch.save(model.state_dict(), 'model.pth')
# Save as artifact for version control.
run = wandb.init(project='test-project')
artifact = wandb.Artifact('model', type='model')
artifact.add_file('model.pth')
run.log_artifact(artifact)
run.finish()
```
%% Output
%% Cell type:code id:e59354b4 tags:
``` python
# !huggingface-cli login
```
%% Output
_| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|
_| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
_|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|
_| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|
_| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|
To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/token.
(Deprecated, will be removed in v0.3.0) To login with username and password instead, interrupt with Ctrl+C.
Token:
%% Cell type:code id:75501228 tags:
``` python
import transformers
import datasets
```
%% Cell type:code id:b6f6ad1c tags:
``` python
training_args = transformers.TrainingArguments(
    "twitter-emoji-trainer",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    logging_steps=100,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)
```
%% Output
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/var/folders/q1/ryq93kwj055dlbpngxv1c7z40000gn/T/ipykernel_14295/673354679.py in <module>
9 model=model,
10 args=training_args,
---> 11 train_dataset=dataset
12 )
NameError: name 'dataset' is not defined
%% Cell type:code id:10b545ba tags:
``` python
AutoModelForSequenceClassification.from_pretrained()
```
%% Cell type:code id:1ac0bc31 tags:
``` python
model.parameters()
```
%% Output
<generator object Module.parameters at 0x7fe3c130ced0>
%% Cell type:code id:6f15a71d tags:
``` python
wandb.finish()
```
%% Cell type:code id:1b1e8b3a tags:
``` python
%env WANDB_PROJECT=twitter_test_wandb
%env WANDB_LOG_MODEL=true
```
%% Output
env: WANDB_PROJECT=twitter_test_wandb
env: WANDB_LOG_MODEL=true
%% Cell type:code id:365d52bd tags:
``` python
%env WANDB_LOG_MODEL=true
```
%% Output
env: WANDB_LOG_MODEL=true
%% Cell type:code id:510ea51e tags:
``` python
import wandb
```
%% Cell type:code id:10a6dcf3 tags:
``` python
wandb.login()
```
%% Output
True
%% Cell type:code id:5da87653 tags:
``` python
with wandb.init(project="twitter_test_wandb") as run:
    my_model_name = "run-bert-base-high-lr:latest"
    my_model_artifact = run.use_artifact(my_model_name)

    # Download model weights to a folder and return the path
    model_dir = my_model_artifact.download()

    # Load your Hugging Face model from that folder
    # using the same model class
    model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=num_labels)
```
%% Cell type:code id:3c859c6f tags:
``` python
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoModel
from fastprogress.fastprogress import progress_bar
from pathlib import Path
import transformers
import torch
import wandb
import requests
import os, sys
from getpass import getpass
from datetime import datetime
from git import Repo
```
%% Cell type:code id:e50d8f8b tags:
``` python
model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=10)
```
%% Output
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
%% Cell type:code id:1301a03b tags:
``` python
MEMRI_PATH = Path.home() / ".memri"
ACCESS_TOKEN_PATH = Path.home() / ".memri/access_token/access_token.txt"
GITLAB_API_BASE_URL = "https://gitlab.memri.io/api/v4"
DEFAULT_PLUGIN_MODEL_PACKAGE_NAME = "plugin-model-package"
DEFAULT_PYTORCH_MODEL_NAME = "pytorch_model.bin"
DEFAULT_HUGGINGFACE_CONFIG_NAME = "config.json"
DEFAULT_PACKAGE_VERSION = "0.0.1"
TIME_FORMAT_GITLAB = '%Y-%m-%dT%H:%M:%S.%fZ'
```
%% Cell type:code id:c9ac7d8e tags:
``` python
def find_git_repo():
    # Walk up the directory tree (at most 10 levels) until a git repo is found
    for i in range(10):
        path = "../" * i or "./"
        try:
            repo = Repo(path)
        except Exception:
            pass
        else:
            break
    if i == 9:
        raise ValueError(f"could not find git repo in {os.path.abspath('')}")
    repo_name = repo.remotes.origin.url.split('.git')[0].split('/')[-1]
    return repo_name
```
%% Cell type:code id:d776ea68 tags:
``` python
def get_registry_api_key():
    ACCESS_TOKEN_PATH.parent.mkdir(parents=True, exist_ok=True)
    if ACCESS_TOKEN_PATH.is_file():
        with open(ACCESS_TOKEN_PATH, "r") as f:
            return f.read()
    else:
        print("""
        The first time you upload a model you need to create an access_token
        at https://gitlab.memri.io/-/profile/personal_access_tokens?name=Model+Access+token&scopes=api
        Click the blue button 'Create personal access token'.
        """)
        access_token = getpass("Then copy your personal access token from 'Your new personal access token', and paste here: ")
        with open(ACCESS_TOKEN_PATH, "w") as f:
            f.write(access_token)
        return access_token
```
%% Cell type:code id:0bbaadcb tags:
``` python
# pb_ = progress_bar(range(1000))
# pb = iter(pb_)
```
%% Cell type:code id:49861048 tags:
``` python
# for i in range(1000):
#     next(pb)
# pb_.update_bar(1000)
```
%% Cell type:code id:d210854e tags:
``` python
class upload_in_chunks(object):
    def __init__(self, filename, chunksize=1 << 13):
        self.filename = filename
        self.chunksize = chunksize
        self.totalsize = os.path.getsize(filename)
        self.readsofar = 0

    def __iter__(self):
        n = 1000
        pb = progress_bar(range(n))
        pb_iter = iter(pb)
        i = 1
        delta = 1 / n
        with open(self.filename, 'rb') as file:
            while True:
                data = file.read(self.chunksize)
                if not data:
                    sys.stderr.write("\n")
                    break
                self.readsofar += len(data)
                percent = self.readsofar * 1e2 / self.totalsize
                while (percent / 100) > i * delta:
                    next(pb_iter, None)
                    i += 1
                yield data
        pb.update_bar(n)

    def __len__(self):
        return self.totalsize


class IterableToFileAdapter(object):
    def __init__(self, iterable):
        self.iterator = iter(iterable)
        self.length = len(iterable)

    def read(self, size=-1):  # TBD: add buffer for `len(data) > size` case
        return next(self.iterator, b'')

    def __len__(self):
        return self.length
```
%% Cell type:code id:369a1a56 tags:
``` python
def write_file_to_package_registry(project_id, file_path, api_key, version=DEFAULT_PACKAGE_VERSION):
    file_path = Path(file_path)
    file_name = file_path.name
    url = f"{GITLAB_API_BASE_URL}/projects/{project_id}/packages/generic/{DEFAULT_PLUGIN_MODEL_PACKAGE_NAME}/{version}/{file_name}"
    print(f"uploading {file_path}")

    it = upload_in_chunks(file_path, 10)
    res = requests.put(url=url, data=IterableToFileAdapter(it),
                       headers={"PRIVATE-TOKEN": api_key})

    if res.status_code not in [200, 201]:
        print(f"Failed to upload {file_path}: {res.content}")
    else:
        print(f"Successfully uploaded {file_path}")
```
%% Cell type:code id:6d0ad383 tags:
``` python
# write_file_to_package_registry(155, "config.json", get_registry_api_key())
```
%% Cell type:code id:4a574c05 tags:
``` python
def project_id_from_name(project_name, api_key):
    res = requests.get(f"{GITLAB_API_BASE_URL}/projects",
                       headers={"PRIVATE-TOKEN": api_key},
                       params={
                           "owned": True,
                           "search": project_name
                       })
    res = [x.get("id") for x in res.json()]
    if len(res) == 0:
        raise ValueError(f"No plugin found with name {project_name}")
    else:
        return res[0]
```
%% Cell type:code id:5919704a tags:
``` python
def write_huggingface_model_to_package_registry(project_name, model):
    api_key = get_registry_api_key()
    project_id = project_id_from_name(project_name, api_key)
    local_save_dir = Path("/tmp")
    torch.save(model.state_dict(), local_save_dir / DEFAULT_PYTORCH_MODEL_NAME)
    model.config.to_json_file(local_save_dir / DEFAULT_HUGGINGFACE_CONFIG_NAME)

    for f in [DEFAULT_HUGGINGFACE_CONFIG_NAME, DEFAULT_PYTORCH_MODEL_NAME]:
        file_path = local_save_dir / f
        print(f"writing {f} to package registry of {project_name} with project id {project_id}")
        write_file_to_package_registry(project_id, file_path, api_key)
```
%% Cell type:code id:0fab27e3 tags:
``` python
def write_model_to_package_registry(model, project_name=None):
    project_name = project_name if project_name is not None else find_git_repo()
    if isinstance(model, transformers.PreTrainedModel):
        write_huggingface_model_to_package_registry(project_name, model)
    else:
        raise ValueError(f"Model type not supported: {type(model)}")
```
%% Cell type:code id:3046cff9 tags:
``` python
# project_name="plugin-test2"
```
%% Cell type:code id:d1681ea5 tags:
``` python
# write_model_to_package_registry(model)
```
%% Cell type:code id:c8338e66 tags:
``` python
def download_package_file(filename, project_name=None, out_dir=None, package_name=DEFAULT_PLUGIN_MODEL_PACKAGE_NAME,
                          package_version=DEFAULT_PACKAGE_VERSION):
    if project_name is None:
        try:
            project_name = find_git_repo()
        except Exception:
            raise ValueError("no project name provided, but could also not find a git repo to infer project name") from None
    out_dir = out_dir if out_dir is not None else MEMRI_PATH / "projects" / project_name
    out_dir.mkdir(parents=True, exist_ok=True)

    api_key = get_registry_api_key()
    project_id = project_id_from_name(project_name, api_key)

    print(f"downloading {filename} from project with id {project_id} from package {package_name}")
    res = requests.get(
        url=f"{GITLAB_API_BASE_URL}/projects/{project_id}/packages/generic/{package_name}/{package_version}/{filename}",
        headers={"PRIVATE-TOKEN": api_key}
    )
    res.raise_for_status()

    with open(out_dir / filename, "wb") as f:
        print(f"writing {filename} to {out_dir}")
        f.write(res.content)
    return out_dir
```
%% Cell type:code id:4f72de38 tags:
``` python
def download_huggingface_model_for_project(files=None):
    if files is None:
        files = ["config.json", "pytorch_model.bin"]
    for f in files:
        out_dir = download_package_file(f)
    return out_dir
```
%% Cell type:code id:0ca16cf8 tags:
``` python
out_dir = download_huggingface_model_for_project()
```
%% Output
downloading config.json from project with id 155 from package plugin-model-package
writing config.json to /Users/koen/.memri/projects/finetuning-example
downloading pytorch_model.bin from project with id 155 from package plugin-model-package
writing pytorch_model.bin to /Users/koen/.memri/projects/finetuning-example
%% Cell type:code id:38fd199e tags:
``` python
filename = "config.json"
download_package_file(filename)
```
%% Output
writing config.json to /Users/koen/.memri/projects/finetuning-example
%% Cell type:code id:a17d5d3b tags:
``` python
```
%% Cell type:code id:2d0312f1 tags:
``` python
```
%% Cell type:code id:49c70d67 tags:
``` python
```
%% Cell type:code id:cd6f2a0c tags:
``` python
```
%% Cell type:code id:76303275 tags:
``` python
# def get_package_id_from_package_name(project_id, package_name):
#     res = requests.get(url=f"{GITLAB_API_BASE_URL}/projects/{project_id}/packages",
#                        params={})
#     res = [x.get("id") for x in res.json()]
#     if len(res) == 0:
#         raise ValueError(f"No package found with name {package_name}")
#     else:
#         return res[0]

# def get_package_file_id(project_id, package_id, filename):
#     res = requests.get(f"{GITLAB_API_BASE_URL}/projects/{project_id}/packages/{package_id}/package_files")
#     if res.status_code in [200, 201]:
#         res = [x for x in res.json() if x.get("file_name", None) == filename]
#         res = sorted(res, key=lambda x: datetime.strptime(x.get("created_at", None), TIME_FORMAT_GITLAB))
#         if len(res) == 0:
#             raise ValueError(f"Could not find package files for package with id {package_id}")
#         else:
#             return res[0]["id"]
#     else:
#         raise ValueError(f"Could not find package files for package with id {package_id}")
```
%% Cell type:code id:f21a3952 tags:
``` python
file_id = get_package_file_id(project_id, package_id, filename)
```
%% Cell type:code id:3660827b tags:
``` python
package_name
```
%% Output
'plugin_model_package'
%% Cell type:code id:2c66004b tags:
``` python
package_id=155
```
%% Cell type:markdown id:5e643a4a tags:
# Fine-tuning a text classifier on your Dataset
Memri data apps often contain machine learning models, which can be trained on labeled data from the Memri app. In this guide, we load a labeled dataset from the Pod and use it to fine-tune a [RoBERTa text classifier](https://medium.com/analytics-vidhya/evolving-with-bert-introduction-to-roberta-5174ec0e7c82). This notebook is meant to be run on your own dataset: label your data, then fill in the corresponding dataset name and pod key in the steps below. The output of this tutorial is a model that can subsequently be used in your data app. For this tutorial we created a synthetic dataset, which can be reproduced with [this notebook](https://gitlab.memri.io/memri/finetuning-example). To keep training fast, we use a smaller version of the model: [distilRoBERTa](https://huggingface.co/distilroberta-base). The code for this tutorial is available as a [notebook](https://gitlab.memri.io/memri/finetuning-example/-/blob/main/finetune_model.ipynb).
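%% Cell type:markdown id:1f2e3d4c tags:
As a rough sketch of that first step (the dataset name, keys, and the exact pymemri calls below are assumptions for illustration, not part of this notebook), connecting to your Pod and fetching the labeled dataset could look like this:
%% Cell type:code id:2a3b4c5d tags:
``` python
# Hedged sketch: connect to the Pod and look up the labeled dataset by name.
# The keys and dataset name are placeholders; PodClient.search is assumed to
# accept a property filter, which may differ between pymemri versions.
from pymemri.pod.client import PodClient

pod_client = PodClient(
    owner_key="YOUR_OWNER_KEY",        # placeholder
    database_key="YOUR_DATABASE_KEY",  # placeholder
)
results = pod_client.search({"type": "Dataset", "name": "my-labeled-dataset"})
dataset = results[0]
```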
%% Cell type:code id:e2cc7d46 tags:
``` python
from pymemri.data.itembase import Item, Edge
from pymemri.data.schema import (
Message,
Dataset,
CategoricalLabel,
DatasetEntry,
)
from pymemri.pod.client import PodClient
import datasets
import random
from IPython.display import clear_output
```
%% Cell type:code id:199c0724 tags:
``` python
# from the dataset page: https://huggingface.co/datasets/tweet_eval#source-data
emoji = '❤😍😂💕🔥😊😎✨💙😘📷🇺🇸☀💜😉💯😁🎄📸😜'
emoji_map = {i: e for i, e in enumerate(emoji)}
emoji_dataset = datasets.load_dataset("tweet_eval", "emoji")
emoji_dataset = emoji_dataset["train"]
```
%% Output
Reusing dataset tweet_eval (/Users/koen/.cache/huggingface/datasets/tweet_eval/emoji/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)
%% Cell type:markdown id:18035047 tags:
Next, we download and inspect the `tweet-eval-emoji` dataset. All entries in the dataset can be found through the `Dataset.entry` edge.
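%% Cell type:markdown id:3c4d5e6f tags:
A minimal sketch of that inspection (assuming `dataset` is a pymemri `Dataset` fetched from the Pod; the attribute access follows the `Dataset.entry` description above and may differ per pymemri version):
%% Cell type:code id:4d5e6f7a tags:
``` python
# Hedged sketch: the `entry` edge yields the DatasetEntry items of the dataset.
entries = dataset.entry
print(f"number of entries: {len(entries)}")
print(entries[0])  # inspect a single DatasetEntry
```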
%% Cell type:markdown id:f3214b11 tags:
The first step in training your model is exporting the dataset to a format we can use in Python. The `Dataset` class in pymemri can export your dataset to various data formats with the `Dataset.to` method; in this notebook, we will use Pandas.
The `columns` argument of `Dataset.to` defines which features are used. A column is either a property of the items in the Dataset (for example, the `content` of a `Message`), or a property of a connected item (the `value` of a `Label` connected to the `Message`).
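%% Cell type:markdown id:5e6f7a8b tags:
As a sketch (mirroring the commented-out cell below; the column names are the ones used elsewhere in this notebook), exporting to a pandas DataFrame and building a label-to-id mapping could look like this:
%% Cell type:code id:6f7a8b9c tags:
``` python
# Hedged sketch: export the Dataset to pandas and map label values to integer ids.
# "data.content" and "annotation.labelValue" are the columns used in this notebook.
data = dataset.to("pandas", columns=["data.content", "annotation.labelValue"])

unique_labels = data["annotation.labelValue"].unique()
label2idx = {label: i for i, label in enumerate(unique_labels)}
idx2label = {i: label for label, i in label2idx.items()}
```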
%% Cell type:code id:c8fafe5c tags:
``` python
# data = dataset.to("pandas", columns=["data.content", "annotation.labelValue"])
# data.head()
```
%% Cell type:markdown id:de7037d2 tags:
# Fine-tuning a Hugging Face model
In this guide, we fine-tune a Hugging Face model on the tweet_eval emoji task. The `transformers` library contains all the code needed to run the training for us; we only need to define a torch `Dataset` that holds our data and handles tokenization.
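%% Cell type:markdown id:7a8b9c0d tags:
Note that when the Hugging Face `tweet_eval` dataset is used directly (as in the cells below), its texts still need to be tokenized before the `Trainer` can consume them. A minimal sketch (assuming `tokenizer` and `emoji_dataset` as defined in this notebook):
%% Cell type:code id:8b9c0d1e tags:
``` python
# Hedged sketch: tokenize the HF dataset so every example carries input_ids / attention_mask.
# Without this step the Trainer fails with "You have to specify either input_ids or inputs_embeds".
def tokenize_batch(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

tokenized_emoji_dataset = emoji_dataset.map(tokenize_batch, batched=True)
```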
%% Cell type:code id:3da52423 tags:
``` python
# Hyperparameters
model_name = "distilroberta-base"
batch_size = 32
learning_rate = 1e-3
```
%% Cell type:code id:f3687ff4 tags:
``` python
emoji_dataset[0]
```
%% Output
{'text': 'Sunday afternoon walking through Venice in the sun with @user ️ ️ ️ @ Abbot Kinney, Venice',
'label': 12}
%% Cell type:code id:b56c58d1 tags:
``` python
# class EmojiDataset(torch.utils.data.Dataset):
#     def __init__(self, data: pd.DataFrame, tokenizer: transformers.PreTrainedTokenizerBase):
#         self.data = data
#         self.label2idx, self.idx2label = self.get_label_map()
#         self.num_labels = len(self.label2idx)
#         self.tokenizer = tokenizer
#
#     def tokenize(self, message, label=None):
#         tokenized = self.tokenizer(message, padding="max_length", truncation=True)
#         if label is not None:
#             tokenized["label"] = self.label2idx[label]
#         return tokenized
#
#     def get_label_map(self):
#         unique_labels = self.data["annotation.labelValue"].unique()
#         return {l: i for i, l in enumerate(unique_labels)}, {i: l for i, l in enumerate(unique_labels)}
#
#     def __len__(self):
#         return len(self.data)
#
#     def __getitem__(self, idx):
#         # Get the row from self.data, and skip the first column (id).
#         return self.tokenize(*self.data.iloc[idx][1:])

tokenizer = AutoTokenizer.from_pretrained(model_name)
# dataset = EmojiDataset(data, tokenizer)
```
%% Cell type:markdown id:9d78f8e0 tags:
## Training
The 🤗 Transformers library provides all the code we need for training a RoBERTa model. We will use their `Trainer` class, which handles all training, monitoring and integration with [Weights & Biases](https://wandb.ai/site) for us. The 🤗 Transformers documentation has a detailed tutorial on fine-tuning models, which can be found [here](https://huggingface.co/docs/transformers/training).
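%% Cell type:markdown id:9c0d1e2f tags:
As a side note (a sketch, not part of the training cell below): besides the `WANDB_LOG_MODEL` environment variable set earlier, Weights & Biases logging can be requested explicitly through `TrainingArguments`:
%% Cell type:code id:0d1e2f3a tags:
``` python
# Hedged sketch: route Trainer logging explicitly to Weights & Biases.
# `report_to` and `run_name` are TrainingArguments parameters; the run name is illustrative.
training_args = transformers.TrainingArguments(
    "twitter-emoji-trainer",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    logging_steps=100,
    report_to="wandb",
    run_name="twitter-emoji-finetune",
)
```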
%% Cell type:code id:da9f8303 tags:
``` python
# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=10)
# To increase training speed, we will freeze all layers except the classifier head.
for param in model.base_model.parameters():
    param.requires_grad = False
```
%% Cell type:code id:13a86a98 tags:
``` python
training_args = transformers.TrainingArguments(
    "twitter-emoji-trainer",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    logging_steps=100,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=emoji_dataset
)
```
%% Cell type:code id:26d534f0 tags:
``` python
trainer.train()
```
%% Output
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
/opt/anaconda3/envs/finetune-test1/lib/python3.7/site-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use thePyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
FutureWarning,
***** Running training *****
Num examples = 45000
Num Epochs = 3
Instantaneous batch size per device = 32
Total train batch size (w. parallel, distributed & accumulation) = 32
Gradient Accumulation steps = 1
Total optimization steps = 4221
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/q1/ryq93kwj055dlbpngxv1c7z40000gn/T/ipykernel_40819/4032920361.py in <module>
----> 1 trainer.train()
/opt/anaconda3/envs/finetune-test1/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1363 tr_loss_step = self.training_step(model, inputs)
1364 else:
-> 1365 tr_loss_step = self.training_step(model, inputs)
1366
1367 if (
/opt/anaconda3/envs/finetune-test1/lib/python3.7/site-packages/transformers/trainer.py in training_step(self, model, inputs)
1938
1939 with self.autocast_smart_context_manager():
-> 1940 loss = self.compute_loss(model, inputs)
1941
1942 if self.args.n_gpu > 1:
/opt/anaconda3/envs/finetune-test1/lib/python3.7/site-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
1970 else:
1971 labels = None
-> 1972 outputs = model(**inputs)
1973 # Save past state if it exists
1974 # TODO: this needs to be fixed and made cleaner later.
/opt/anaconda3/envs/finetune-test1/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
/opt/anaconda3/envs/finetune-test1/lib/python3.7/site-packages/transformers/models/roberta/modeling_roberta.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
1211 output_attentions=output_attentions,
1212 output_hidden_states=output_hidden_states,
-> 1213 return_dict=return_dict,
1214 )
1215 sequence_output = outputs[0]
/opt/anaconda3/envs/finetune-test1/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
/opt/anaconda3/envs/finetune-test1/lib/python3.7/site-packages/transformers/models/roberta/modeling_roberta.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
800 input_shape = inputs_embeds.size()[:-1]
801 else:
--> 802 raise ValueError("You have to specify either input_ids or inputs_embeds")
803
804 batch_size, seq_length = input_shape
ValueError: You have to specify either input_ids or inputs_embeds
%% Cell type:markdown id:2fd063f5 tags:
That's it, we trained our model. Check out the next tutorial to see how you can use this model from within your data app.
%% Cell type:code id:5be36f3a tags:
``` python
```
wandb.ipynb 0 → 100644