In [3]:
# https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling

In [4]:
import os
from itertools import chain

import datasets
import transformers
from datasets import load_dataset
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_CAUSAL_LM_MAPPING,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    is_torch_tpu_available,
    set_seed,
)
from transformers.testing_utils import CaptureLogger
from transformers.trainer_utils import get_last_checkpoint
# from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

In [5]:
# check_min_version("4.23.0.dev0")

In [6]:
# Check that the installed `datasets` package satisfies the ">=1.8.0" requirement.
require_version("datasets>=1.8.0")

In [7]:
# Fix the random seed up front so repeated runs of the notebook are comparable.
set_seed(37)

##### Get all of the Hugging Face objects that we need: tokenizer, GPT-2 model, poetry dataset.

In [8]:
# Load the "merve/poetry" dataset; the result is a DatasetDict keyed by split name.
raw_datasets = load_dataset("merve/poetry")

Using custom data configuration merve--poetry-ca9a13ef5858cc3a
Found cached dataset csv (/Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Tokenizer matching the pretrained "gpt2" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [10]:
# Model configuration for the same "gpt2" checkpoint; passed to the model loader below.
config = AutoConfig.from_pretrained('gpt2')

# max_seq_length

In [11]:
# Load the pretrained "gpt2" causal language model using the config loaded above.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    config=config
)
# NOTE(review): nothing else in the visible notebook reads `max_seq_length`;
# the training chunk length is taken from `block_size` instead. Confirm whether
# this attribute has any effect or can be removed.
model.max_seq_length = 128
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
# As the last expression of the cell, the resulting embedding module is displayed.
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [12]:
# Inspect the train split: shows its column names and row count.
raw_datasets['train']

Dataset({
    features: ['author', 'content', 'poem name', 'age', 'type'],
    num_rows: 573
})

In [13]:
# Peek at the first value of the 'type' column of the train split.
raw_datasets['train']['type'][0]

'Mythology & Folklore'

In [14]:
# Show every split in the DatasetDict (the recorded output lists only 'train').
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['author', 'content', 'poem name', 'age', 'type'],
        num_rows: 573
    })
})

In [15]:
# Logger of the tokenizer base module; `tokenize_function` captures its output
# to detect the "sequence length is longer than" warning.
tok_logger = transformers.utils.logging.get_logger(
    "transformers.tokenization_utils_base"
)

In [16]:
def tokenize_function(examples):
    """Tokenize the `text_column_name` column of a batch of examples.

    Uses the notebook-level `tokenizer`, `tok_logger` and `text_column_name`.
    Tokenizer log output is captured while encoding; if it contains the
    "sequence length is longer than" message, a follow-up warning is logged
    telling the reader that the long input will be chunked later.

    Returns whatever the tokenizer returns for the selected column.
    """
    too_long_marker = "Token indices sequence length is longer than the"

    with CaptureLogger(tok_logger) as captured:
        encoded = tokenizer(examples[text_column_name])

    # clm input could be much much longer than block_size
    saw_length_warning = too_long_marker in captured.out
    if saw_length_warning:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
            " before being passed to the model."
        )

    return encoded

In [17]:
# All columns of the raw train split; they are passed as `remove_columns`
# when the dataset is tokenized, so only the tokenizer outputs remain.
column_names = raw_datasets["train"].column_names
# text_column_name = "text" if "text" in column_names else column_names[0]
# The column to tokenize is chosen explicitly instead of using the heuristic above
# (presumably 'content' holds the poem text — verify against the dataset card).
text_column_name = "content"

In [18]:
# Tokenize every split in batches. The original columns are removed so that the
# result contains only the fields returned by `tokenize_function`.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    # num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    # load_from_cache_file=not data_args.overwrite_cache,
    desc="Running tokenizer on dataset",
)

Loading cached processed dataset at /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-62fd9c772e30c8d3.arrow


In [19]:
# Length (in tokens) of each training chunk produced by `group_texts`,
# taken from the tokenizer's reported maximum model input length.
# NOTE(review): no upper bound is applied here — confirm the value is sensible
# if a checkpoint other than "gpt2" is ever used.
block_size = tokenizer.model_max_length

In [20]:
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized examples and split it into fixed-size chunks.

    Args:
        examples: Mapping from column name (e.g. "input_ids") to a list of
            per-example token lists. Must contain an "input_ids" column.
        chunk_size: Length of every output chunk. Defaults to the notebook-level
            `block_size` when omitted, which keeps the original one-argument
            call used by `Dataset.map` working unchanged.

    Returns:
        A dict with the same columns as `examples`, where each column is a list
        of chunks of exactly `chunk_size` items, plus a "labels" column that is
        a copy of the "input_ids" chunk list.

    The trailing remainder that does not fill a whole chunk is dropped. This
    includes the case where the whole batch is shorter than one chunk: the
    batch then contributes no rows at all. Emitting a single short chunk would
    produce rows of differing lengths, which a non-padding collator such as
    `default_data_collator` cannot stack into a batch.
    """
    size = block_size if chunk_size is None else chunk_size

    # Concatenate all texts, column by column.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    first_key = list(examples.keys())[0]
    total_length = len(concatenated_examples[first_key])

    # Round down to a whole number of chunks (0 if the batch is too short).
    total_length = (total_length // size) * size

    # Split by chunks of `size`.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # For causal LM training the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

In [21]:
# Regroup the tokenized examples into fixed-length chunks with labels.
# `batched=True` lets `group_texts` see many examples at once, so it can
# concatenate them and return a different number of rows than it received.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    # num_proc=data_args.preprocessing_num_workers,
    # load_from_cache_file=not data_args.overwrite_cache,
    desc=f"Grouping texts in chunks of {block_size}",
)

Loading cached processed dataset at /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-88d7c64be469684a.arrow


In [22]:
# The chunked train split is the data handed to the Trainer below.
train_dataset = lm_datasets["train"]

#### Do the fine-tuning

In [25]:
# Fine-tuning configuration: checkpoints go to "gpt2-poetry-model" (overwriting
# any previous contents) and the result is pushed to the Hugging Face Hub.
training_args = TrainingArguments(
    output_dir="gpt2-poetry-model",
    overwrite_output_dir=True,
    per_device_train_batch_size=16,
    push_to_hub=True,
    # SECURITY: never hardcode an access token in a notebook. The token that
    # was previously committed here must be treated as leaked and revoked.
    # Read it from the HF_TOKEN environment variable instead; when the variable
    # is unset this is None (presumably the library then falls back to locally
    # cached login credentials — confirm for the installed version).
    hub_token=os.environ.get("HF_TOKEN"),
)

In [26]:
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=default_data_collator,
    # compute_metrics=compute_metrics
    # if training_args.do_eval and not is_torch_tpu_available()
    # else None,
    # preprocess_logits_for_metrics=preprocess_logits_for_metrics
    # if training_args.do_eval and not is_torch_tpu_available()
    # else None,
)

In [None]:
# Training
# checkpoint = None
# train_result = trainer.train(resume_from_checkpoint=checkpoint)
# trainer.save_model()  # Saves the tokenizer too for easy upload

# metrics = train_result.metrics

# max_train_samples = (len(train_dataset))
# metrics["train_samples"] = min(max_train_samples, len(train_dataset))

# trainer.log_metrics("train", metrics)
# trainer.save_metrics("train", metrics)
# trainer.save_state()
# # Upload to the Hugging Face Hub for easy use in inference.
# trainer.push_to_hub()

***** Running training *****
  Num examples = 171
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 66


  0%|          | 0/66 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to tmp_trainer
Configuration saved in tmp_trainer/config.json


{'train_runtime': 2967.2818, 'train_samples_per_second': 0.173, 'train_steps_per_second': 0.022, 'train_loss': 4.249474265358665, 'epoch': 3.0}


Model weights saved in tmp_trainer/pytorch_model.bin
tokenizer config file saved in tmp_trainer/tokenizer_config.json
Special tokens file saved in tmp_trainer/special_tokens_map.json


***** train metrics *****
  epoch                    =        3.0
  train_loss               =     4.2495
  train_runtime            = 0:49:27.28
  train_samples            =        171
  train_samples_per_second =      0.173
  train_steps_per_second   =      0.022


In [27]:
# Log in to the Hugging Face Hub interactively, then upload the model via the Trainer.
# NOTE(review): the recorded output of this cell ends with
# "AttributeError: 'Trainer' object has no attribute 'repo'". Confirm that the
# Trainer was created with `push_to_hub=True` in its arguments (and after a
# fresh kernel restart) before relying on this cell.
# NOTE(review): this import sits outside the notebook's import cell — consider moving it there.
from huggingface_hub import notebook_login
notebook_login()
trainer.push_to_hub()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

Saving model checkpoint to gpt2-poetry-model
Configuration saved in gpt2-poetry-model/config.json
Model weights saved in gpt2-poetry-model/pytorch_model.bin
tokenizer config file saved in gpt2-poetry-model/tokenizer_config.json
Special tokens file saved in gpt2-poetry-model/special_tokens_map.json


AttributeError: 'Trainer' object has no attribute 'repo'