tudormunteanu committed
Commit a4e6c29 · 0 Parent(s)

first attempt

Files changed (4)
  1. README.md +3 -0
  2. main.py +152 -0
  3. poetry.lock +0 -0
  4. pyproject.toml +22 -0
README.md ADDED
@@ -0,0 +1,3 @@
+ # Tiny Training
+
+ This is meant to be a tiny LLM training experiment to understand how Hugging Face works.
main.py ADDED
@@ -0,0 +1,152 @@
+ import re
+ from typing import Dict
+
+ import torch
+ from datasets import load_dataset
+ from torch.optim import AdamW
+ from torch.utils.data import DataLoader
+ from tqdm.auto import tqdm
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, get_scheduler
+
+
+ def setup_model():
+     # Using a smaller CodeT5 model suitable for the free tier
+     model_name = "Salesforce/codet5-small"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+     return model, tokenizer
+
+
+ def prepare_dataset():
+     # Load Python subset of CodeSearchNet
+     dataset = load_dataset(
+         "code_search_net", "python", split="train[:1000]", trust_remote_code=True
+     )  # Limited to 1000 examples for free tier
+
+     def extract_function_info(example: Dict) -> Dict:
+         """Extract clean function definitions and docstrings."""
+         code = example["whole_func_string"]
+
+         # Basic filtering for API-style functions
+         if not code.strip().startswith("def "):
+             # Empty strings are better handled downstream.
+             return {
+                 "function": "",
+                 "documentation": "",
+                 "input": "",
+                 "output": ""
+             }
+
+         # Remove multiple newlines and standardize spacing
+         code = re.sub(r"\n\s*\n", "\n", code)
+         docstring = example["func_documentation_string"].strip()
+
+         return {
+             "function": code,
+             "documentation": docstring,
+             "input": f"Write a Python function that: {docstring}",
+             "output": code,
+         }
+
+     # Process and filter the dataset
+     processed_dataset = dataset.map(extract_function_info)
+     # Filter out empty entries after mapping
+     processed_dataset = processed_dataset.filter(lambda x: x["function"] != "")
+
+     return processed_dataset
+
+
+ def tokenize_data(examples, tokenizer, max_length=512):
+     """Tokenize inputs and outputs for training."""
+     # Batch tokenization for inputs
+     model_inputs = tokenizer(
+         examples["input"],
+         max_length=max_length,
+         padding="max_length",
+         truncation=True,
+     )
+
+     # Batch tokenization for outputs; text_target replaces the deprecated
+     # as_target_tokenizer() context manager
+     labels = tokenizer(
+         text_target=examples["output"],
+         max_length=max_length,
+         padding="max_length",
+         truncation=True,
+     ).input_ids
+
+     # Replace padding token ids with -100 so the loss ignores padded positions
+     labels = [
+         [(token if token != tokenizer.pad_token_id else -100) for token in label]
+         for label in labels
+     ]
+
+     model_inputs["labels"] = labels
+     return model_inputs
+
+
+ def train():
+     model, tokenizer = setup_model()
+     dataset = prepare_dataset()
+
+     # Training configuration
+     device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+     model.to(device)
+
+     # Hyperparameters
+     batch_size = 8
+     num_epochs = 3
+     learning_rate = 5e-5
+     max_length = 512
+
+     # Tokenize the dataset in batches and drop the raw text columns
+     tokenized_dataset = dataset.map(
+         lambda x: tokenize_data(x, tokenizer, max_length),
+         batched=True,
+         batch_size=16,  # Explicit batch size for processing
+         remove_columns=dataset.column_names,
+     )
+
+     def collate_fn(examples):
+         return {
+             "input_ids": torch.stack([torch.tensor(example["input_ids"]) for example in examples]).to(device),
+             "attention_mask": torch.stack([torch.tensor(example["attention_mask"]) for example in examples]).to(device),
+             "labels": torch.stack([torch.tensor(example["labels"]) for example in examples]).to(device),
+         }
+
+     train_dataloader = DataLoader(
+         tokenized_dataset,
+         shuffle=True,
+         batch_size=batch_size,
+         collate_fn=collate_fn,
+     )
+
+     # Initialize optimizer and scheduler
+     optimizer = AdamW(model.parameters(), lr=learning_rate)
+     num_training_steps = num_epochs * len(train_dataloader)
+     lr_scheduler = get_scheduler(
+         name="linear",
+         optimizer=optimizer,
+         num_warmup_steps=0,
+         num_training_steps=num_training_steps,
+     )
+
+     # Training loop
+     progress_bar = tqdm(range(num_training_steps))
+     model.train()
+
+     for epoch in range(num_epochs):
+         for batch in train_dataloader:
+             outputs = model(**batch)
+             loss = outputs.loss
+             loss.backward()
+
+             optimizer.step()
+             lr_scheduler.step()
+             optimizer.zero_grad()
+             progress_bar.update(1)
+             progress_bar.set_description(f"Loss: {loss.item():.4f}")
+
+         # Save checkpoint after each epoch
+         model.save_pretrained(f"checkpoint-epoch-{epoch}")
+         tokenizer.save_pretrained(f"checkpoint-epoch-{epoch}")
+
+     print("Training completed!")
+
+
+ if __name__ == "__main__":
+     train()
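
Once train() has written its checkpoints, a quick way to sanity-check the fine-tuned model is to load one and generate from a docstring-style prompt. This is a minimal sketch, not part of the commit: it assumes the final checkpoint directory checkpoint-epoch-2 (which follows from num_epochs = 3 above), and the example prompt and generation settings are illustrative.

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the last checkpoint saved by train(); the directory name assumes num_epochs = 3
checkpoint_dir = "checkpoint-epoch-2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

# Mirror the training prompt format: "Write a Python function that: <docstring>"
prompt = "Write a Python function that: Return the sum of two numbers."
inputs = tokenizer(prompt, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
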
poetry.lock ADDED
The diff for this file is too large to render.
 
pyproject.toml ADDED
@@ -0,0 +1,22 @@
+ [tool.poetry]
+ name = "tiny-training"
+ version = "0.1.0"
+ description = ""
+ authors = ["Your Name <[email protected]>"]
+ readme = "README.md"
+ packages = [{include = "tiny_training"}]
+
+ [tool.poetry.dependencies]
+ python = "3.11.8"
+ transformers = "^4.46.2"
+ torch = "^2.5.1"
+ datasets = "^3.1.0"
+ tqdm = "^4.67.0"
+
+ [tool.poetry.group.dev.dependencies]
+ black = "^24.10.0"
+ isort = "^5.13.2"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"