starxultra committed on
Commit
13de4f2
·
verified ·
1 Parent(s): a916617

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -0
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import tempfile

import gradio as gr
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
6
+
7
+ # Load base tokenizer and model
8
+ model_name = "arnir0/Tiny-LLM"
9
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
10
+ model = AutoModelForCausalLM.from_pretrained(model_name)
11
+
12
+ def fine_tune_and_generate(uploaded_file, prompt):
13
+ # Save uploaded file temporarily
14
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
15
+ tmp.write(uploaded_file.read())
16
+ tmp_path = tmp.name
17
+
18
+ # Read lines from uploaded text file
19
+ with open(tmp_path, "r", encoding="utf-8") as f:
20
+ lines = [line.strip() for line in f.readlines() if line.strip()]
21
+
22
+ # Clean up temp file
23
+ os.remove(tmp_path)
24
+
25
+ # Create dataset for fine-tuning
26
+ dataset = Dataset.from_dict({"text": lines})
27
+
28
+ # Tokenization function
29
+ def tokenize_function(examples):
30
+ return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
31
+
32
+ tokenized_dataset = dataset.map(tokenize_function, batched=True)
33
+
34
+ # Set training args (very small for demo)
35
+ training_args = TrainingArguments(
36
+ output_dir="./fine_tuned",
37
+ num_train_epochs=1,
38
+ per_device_train_batch_size=2,
39
+ logging_dir="./logs",
40
+ logging_steps=10,
41
+ save_strategy="no",
42
+ learning_rate=5e-5,
43
+ weight_decay=0.01,
44
+ )
45
+
46
+ # Trainer init
47
+ trainer = Trainer(
48
+ model=model,
49
+ args=training_args,
50
+ train_dataset=tokenized_dataset,
51
+ )
52
+
53
+ # Fine-tune the model
54
+ trainer.train()
55
+
56
+ # Generate text from prompt
57
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids
58
+ outputs = model.generate(input_ids, max_length=50, do_sample=True, top_p=0.95, top_k=50)
59
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
60
+
61
+ return generated_text
62
+
63
+ # Gradio interface
64
+ iface = gr.Interface(
65
+ fn=fine_tune_and_generate,
66
+ inputs=[
67
+ gr.File(label="Upload training text (.txt)"),
68
+ gr.Textbox(lines=2, placeholder="Enter prompt for generation", label="Prompt"),
69
+ ],
70
+ outputs="text",
71
+ title="Tiny-LLM Fine-tune & Generate",
72
+ description="Upload your text file to fine-tune Tiny-LLM and generate text from a prompt."
73
+ )
74
+
75
+ iface.launch()