tudormunteanu committed
Commit a4e6c29 · 0 Parent(s)
first attempt
Browse files
- README.md +3 -0
- main.py +152 -0
- poetry.lock +0 -0
- pyproject.toml +22 -0
README.md
ADDED
@@ -0,0 +1,3 @@
# Tiny Training

This is meant to be a tiny LLM training experiment to understand how Hugging Face works.
main.py
ADDED
@@ -0,0 +1,152 @@
import re
from typing import Dict

import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, get_scheduler


def setup_model():
    # Using a smaller CodeT5 model suitable for the free tier
    model_name = "Salesforce/codet5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return model, tokenizer


def prepare_dataset():
    # Load Python subset of CodeSearchNet
    dataset = load_dataset(
        "code_search_net", "python", split="train[:1000]", trust_remote_code=True
    )  # Limited to 1000 examples for free tier

    def extract_function_info(example: Dict) -> Dict:
        """Extract clean function definitions and docstrings."""
        code = example["whole_func_string"]

        # Basic filtering for API-style functions
        if not code.strip().startswith("def "):
            # Empty strings are easier to filter out downstream.
            return {
                "function": "",
                "documentation": "",
                "input": "",
                "output": "",
            }

        # Remove multiple newlines and standardize spacing
        code = re.sub(r"\n\s*\n", "\n", code)
        docstring = example["func_documentation_string"].strip()

        return {
            "function": code,
            "documentation": docstring,
            "input": f"Write a Python function that: {docstring}",
            "output": code,
        }

    # Process and filter the dataset
    processed_dataset = dataset.map(extract_function_info)
    # Filter out empty entries after mapping
    processed_dataset = processed_dataset.filter(lambda x: x["function"] != "")

    return processed_dataset


def tokenize_data(examples, tokenizer, max_length=512):
    """Tokenize inputs and outputs for training."""
    # Batch tokenization for inputs
    model_inputs = tokenizer(
        examples["input"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    # Batch tokenization for outputs
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["output"],
            max_length=max_length,
            padding="max_length",
            truncation=True,
        ).input_ids

    model_inputs["labels"] = labels
    return model_inputs


def train():
    model, tokenizer = setup_model()
    dataset = prepare_dataset()

    # Training configuration
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)

    # Hyperparameters
    batch_size = 8
    num_epochs = 3
    learning_rate = 5e-5
    max_length = 512

    # Tokenize the dataset in batches, dropping the original text columns
    tokenized_dataset = dataset.map(
        lambda x: tokenize_data(x, tokenizer, max_length),
        batched=True,
        batch_size=16,  # Explicit batch size for processing
        remove_columns=dataset.column_names,
    )

    def collate_fn(examples):
        return {
            "input_ids": torch.stack([torch.tensor(example["input_ids"]) for example in examples]).to(device),
            "attention_mask": torch.stack([torch.tensor(example["attention_mask"]) for example in examples]).to(device),
            "labels": torch.stack([torch.tensor(example["labels"]) for example in examples]).to(device),
        }

    train_dataloader = DataLoader(
        tokenized_dataset,
        shuffle=True,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )

    # Initialize optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # Training loop
    progress_bar = tqdm(range(num_training_steps))
    model.train()

    for epoch in range(num_epochs):
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            progress_bar.set_description(f"Loss: {loss.item():.4f}")

        # Save a checkpoint after each epoch
        model.save_pretrained(f"checkpoint-epoch-{epoch}")
        tokenizer.save_pretrained(f"checkpoint-epoch-{epoch}")

    print("Training completed!")


if __name__ == "__main__":
    train()
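Not part of this commit, but a minimal inference sketch is a quick way to sanity-check a saved checkpoint. The path "checkpoint-epoch-2" and the example docstring below are assumptions: the path mirrors the save_pretrained pattern in train(), and the prompt mirrors the "Write a Python function that: ..." format used during training. codet5-small is small enough that this runs comfortably on CPU.

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hypothetical path: follows the checkpoint naming used in train() for the last epoch.
checkpoint = "checkpoint-epoch-2"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model.eval()

# Mirror the training input format: "Write a Python function that: <docstring>"
prompt = "Write a Python function that: returns the factorial of a number"
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

# Greedy decoding is enough for a quick look at what the fine-tuned model produces.
output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))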
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
pyproject.toml
ADDED
@@ -0,0 +1,22 @@
[tool.poetry]
name = "tiny-training"
version = "0.1.0"
description = ""
authors = ["Your Name <[email protected]>"]
readme = "README.md"
packages = [{include = "tiny_training"}]

[tool.poetry.dependencies]
python = "3.11.8"
transformers = "^4.46.2"
torch = "^2.5.1"
datasets = "^3.1.0"
tqdm = "^4.67.0"

[tool.poetry.group.dev.dependencies]
black = "^24.10.0"
isort = "^5.13.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
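The pins above target recent releases of the Hugging Face stack. As an illustrative follow-up (not part of the commit), a short environment check after poetry install could confirm the resolved versions and whether the Apple MPS backend that main.py prefers is available:

import datasets
import torch
import transformers

# Report resolved library versions and whether the MPS backend
# (used by main.py when available, otherwise CPU) is present.
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("MPS available:", torch.backends.mps.is_available())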