MrUnknown420 committed on
Commit 9476a0f · verified · 1 Parent(s): 2420031
Files changed (1)
app.py +235 -139
app.py CHANGED
@@ -1,8 +1,10 @@
 import os
 import json
 import gradio as gr
-from huggingface_hub import HfApi, snapshot_download
-from datasets import load_dataset
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -10,184 +12,278 @@ from transformers import (
     TrainingArguments,
     DataCollatorForLanguageModeling
 )
-import torch

-# ========== GLOBALS ==========
-API = HfApi()
-MEMORY_DIR = "memory"
-MODEL_DIR = "saved_models"
 os.makedirs(MEMORY_DIR, exist_ok=True)
-os.makedirs(MODEL_DIR, exist_ok=True)

-# ========== MEMORY HANDLING ==========
 def get_memory_file(model_name):
-    return os.path.join(MEMORY_DIR, f"{model_name}_memory.json")

 def load_memory(model_name):
-    file = get_memory_file(model_name)
-    if os.path.exists(file):
-        with open(file, "r") as f:
-            return json.load(f)
     return []

-def save_memory(model_name, chat_log):
-    file = get_memory_file(model_name)
-    with open(file, "w") as f:
-        json.dump(chat_log, f, indent=2)

-# ========== HUGGING FACE HUB HELPERS ==========
-def get_top_models(limit=10):
-    models = API.list_models(sort="downloads", direction=-1, limit=limit)
-    return [m.modelId for m in models]

-def get_top_datasets(limit=10):
-    datasets = API.list_datasets(sort="downloads", direction=-1, limit=limit)
-    return [d.id for d in datasets]

-# ========== TRAINING ==========
-def train_model(model_name, dataset_name, output_name, epochs=1):
     try:
-        dataset = load_dataset(dataset_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(model_name)

-        def tokenize_fn(examples):
-            return tokenizer(examples["text"], truncation=True, padding="max_length")

-        tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=dataset["train"].column_names)
-        collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

         training_args = TrainingArguments(
-            output_dir=os.path.join(MODEL_DIR, output_name),
             overwrite_output_dir=True,
-            num_train_epochs=epochs,
             per_device_train_batch_size=2,
-            save_strategy="epoch",
             logging_dir="./logs",
-            logging_steps=10,
-            push_to_hub=False
         )

         trainer = Trainer(
             model=model,
             args=training_args,
-            train_dataset=tokenized["train"],
-            tokenizer=tokenizer,
-            data_collator=collator
         )

         trainer.train()
-        trainer.save_model(os.path.join(MODEL_DIR, output_name))
-        return f"✅ Training finished. Model saved as {output_name}"
     except Exception as e:
-        return f"❌ Training error: {str(e)}"

-# ========== CHAT ==========
-def chat_with_model(model_name, user_input):
-    try:
-        path = os.path.join(MODEL_DIR, model_name)
-        if os.path.exists(path):
-            model = AutoModelForCausalLM.from_pretrained(path)
-            tokenizer = AutoTokenizer.from_pretrained(path)
-        else:
-            model = AutoModelForCausalLM.from_pretrained(model_name)
-            tokenizer = AutoTokenizer.from_pretrained(model_name)

-        memory = load_memory(model_name)
-        memory_text = " ".join([f"User: {m['user']} AI: {m['ai']}" for m in memory])

-        inputs = tokenizer(memory_text + " User: " + user_input + " AI:", return_tensors="pt")
-        outputs = model.generate(**inputs, max_length=300, pad_token_id=tokenizer.eos_token_id)
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("AI:")[-1].strip()

-        memory.append({"user": user_input, "ai": response})
-        save_memory(model_name, memory)

-        return response
-    except Exception as e:
-        return f"❌ Chat error: {str(e)}"

-# ========== INTERFACE TABS ==========

-# Home Tab
-home_tab = gr.Markdown("# 🤖 My AI Model Builder\nWelcome! Use this tool to search, train, and chat with AI models. All memory & models are stored locally inside this Space.")

-# Models Tab
-with gr.Blocks() as models_tab:
-    gr.Markdown("## 🔍 Search Models")
     with gr.Row():
-        model_input = gr.Textbox(label="Search or enter model name")
-        model_output = gr.Textbox(label="Result")
-    search_button = gr.Button("Search Model")
-
-    def search_model(name):
-        try:
-            info = API.model_info(name)
-            return f"✅ Found: {info.modelId}\nDownloads: {info.downloads}\nTags: {info.tags}"
-        except Exception as e:
-            return f"❌ {str(e)}"
-    search_button.click(search_model, model_input, model_output)
-
-# Datasets Tab
-with gr.Blocks() as datasets_tab:
-    gr.Markdown("## 📊 Search Datasets")
-    with gr.Row():
-        dataset_input = gr.Textbox(label="Search or enter dataset name")
-        dataset_output = gr.Textbox(label="Result")
-    dataset_button = gr.Button("Search Dataset")
-
-    def search_dataset(name):
-        try:
-            info = API.dataset_info(name)
-            return f"✅ Found: {info.id}\nDownloads: {info.downloads}\nTags: {info.tags}"
-        except Exception as e:
-            return f"❌ {str(e)}"
-    dataset_button.click(search_dataset, dataset_input, dataset_output)
-
-# Training Tab
-with gr.Blocks() as training_tab:
-    gr.Markdown("## 🏋️ Train / Fine-tune a Model")
-    model_choice = gr.Dropdown(choices=get_top_models(), label="Pick Base Model", interactive=True)
-    dataset_choice = gr.Dropdown(choices=get_top_datasets(), label="Pick Dataset", interactive=True)
-    output_name = gr.Textbox(label="New Model Name")
-    epochs = gr.Slider(1, 5, step=1, label="Epochs", value=1)
-    train_button = gr.Button("🚀 Train Model")
-    train_output = gr.Textbox(label="Training Status")
-    train_button.click(train_model, [model_choice, dataset_choice, output_name, epochs], train_output)

-# Chat/Test Tab
-with gr.Blocks() as chat_tab:
-    gr.Markdown("## 💬 Chat with Your Model")
-    chat_model = gr.Textbox(label="Enter Model Name")
-    user_input = gr.Textbox(label="Your Message")
-    chat_output = gr.Textbox(label="AI Response")
-    chat_button = gr.Button("Send")
-    chat_button.click(chat_with_model, [chat_model, user_input], chat_output)
-
-# Memory Tab
-with gr.Blocks() as memory_tab:
-    gr.Markdown("## 🧠 Model Memory")
-    memory_model = gr.Textbox(label="Model Name")
-    memory_display = gr.Textbox(label="Memory Log")
-    def show_memory(name): return json.dumps(load_memory(name), indent=2)
-    memory_button = gr.Button("Load Memory")
-    memory_button.click(show_memory, memory_model, memory_display)
-
-# Guide Tab
-guide_tab = gr.Markdown("""
-# 📖 Mini Guide
-1. Use **Models** to explore Hugging Face models.
-2. Use **Datasets** to find training data.
-3. Use **Training** to fine-tune.
-4. Use **Chat** to test models.
-5. All models & memory are saved in this Space.
-""")
-
-# Launch Interface
 demo = gr.TabbedInterface(
-    [home_tab, models_tab, datasets_tab, training_tab, chat_tab, memory_tab, guide_tab],
-    ["Home", "Models", "Datasets", "Training", "Chat/Test", "Memory", "Guide"]
 )

 if __name__ == "__main__":
-    demo.launch()
 
 import os
 import json
+import logging
+from datetime import datetime
 import gradio as gr
+from datasets import list_datasets, load_dataset
+from huggingface_hub import HfApi, HfFolder
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     Trainer,
     TrainingArguments,
     DataCollatorForLanguageModeling
 )
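+# NOTE: `list_datasets`, `HfFolder`, and `datetime` are imported but never used
+# below, and `datasets.list_datasets` has been removed in recent releases of the
+# `datasets` library, so this import may fail there.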
 
+# ===============================
+# Setup directories & logging
+# ===============================
+BASE_DIR = "storage"
+MEMORY_DIR = os.path.join(BASE_DIR, "memory")
+LOG_FILE = os.path.join(BASE_DIR, "logs.txt")
+
 os.makedirs(MEMORY_DIR, exist_ok=True)
+os.makedirs(BASE_DIR, exist_ok=True)
+
+logging.basicConfig(
+    filename=LOG_FILE,
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+def log_event(event: str):
+    """Log both to file and console"""
+    logging.info(event)
+    print(event)
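+# NOTE: os.makedirs(MEMORY_DIR) above already creates BASE_DIR (makedirs creates
+# parent directories), so the second makedirs call is redundant but harmless.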
 
+# ===============================
+# Hugging Face Auto-fetch
+# ===============================
+def fetch_top_models(limit=10):
+    """Fetch top models from Hugging Face Hub"""
+    api = HfApi()
+    models = api.list_models(sort="downloads", limit=limit)
+    return [m.modelId for m in models]
+
+def fetch_top_datasets(limit=10):
+    """Fetch top datasets from Hugging Face Hub"""
+    api = HfApi()
+    datasets = api.list_datasets(sort="downloads", limit=limit)
+    return [d.id for d in datasets]
+
+TOP_MODELS = fetch_top_models()
+TOP_DATASETS = fetch_top_datasets()
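+# NOTE: these Hub queries run at import time, so the app needs network access at
+# startup; if they fail, the interface never launches.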
+
+# ===============================
+# Memory Management
+# ===============================
 def get_memory_file(model_name):
+    return os.path.join(MEMORY_DIR, f"{model_name.replace('/', '_')}_memory.json")

 def load_memory(model_name):
+    f = get_memory_file(model_name)
+    if os.path.exists(f):
+        with open(f, "r") as file:
+            return json.load(file)
     return []

+def save_memory(model_name, messages):
+    f = get_memory_file(model_name)
+    with open(f, "w") as file:
+        json.dump(messages, file, indent=2)
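+# NOTE: replace('/', '_') in get_memory_file keeps Hub IDs like "org/model" from
+# being interpreted as subdirectories in the memory filename.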
 
+# ===============================
+# Chat Functionality
+# ===============================
+def chat_with_model(user_input, model_choice):
+    if not model_choice:
+        return "❌ Please select a model.", ""
+
+    log_event(f"User chatting with {model_choice}: {user_input}")
+    tokenizer = AutoTokenizer.from_pretrained(model_choice)
+    model = AutoModelForCausalLM.from_pretrained(model_choice)
+
+    inputs = tokenizer(user_input, return_tensors="pt")
+    outputs = model.generate(**inputs, max_length=200)
+
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    memory = load_memory(model_choice)
+    memory.append({"user": user_input, "bot": response})
+    save_memory(model_choice, memory)
+
+    return response, json.dumps(memory, indent=2)
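+# NOTE: chat_with_model reloads the tokenizer and model from the Hub on every
+# message, and the saved memory is not fed back into the prompt; caching the
+# loaded model per model ID would make repeated chats much faster.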
+
+# ===============================
+# Training
+# ===============================
+def train_model(model_name, dataset_name, epochs, output_dir):
     try:
+        log_event(f"Starting training: model={model_name}, dataset={dataset_name}, epochs={epochs}")
+        dataset = load_dataset(dataset_name, split="train")
+
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(model_name)

+        def tokenize_function(examples):
+            return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
+
+        tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
+        data_collator = DataCollatorForLanguageModeling(
+            tokenizer=tokenizer,
+            mlm=False
+        )

         training_args = TrainingArguments(
+            output_dir=output_dir,
             overwrite_output_dir=True,
+            num_train_epochs=int(epochs),
             per_device_train_batch_size=2,
+            save_steps=500,
+            save_total_limit=2,
             logging_dir="./logs",
+            logging_steps=50
         )

         trainer = Trainer(
             model=model,
             args=training_args,
+            train_dataset=tokenized_dataset,
+            data_collator=data_collator
         )

         trainer.train()
+        model.save_pretrained(output_dir)
+        tokenizer.save_pretrained(output_dir)
+
+        log_event(f"✅ Training completed. Model saved to {output_dir}")
+        return f"✅ Training completed. Model saved to {output_dir}"
     except Exception as e:
+        log_event(f"❌ Training failed: {e}")
+        return f"❌ Error during training: {str(e)}"
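+# NOTE: tokenize_function assumes the dataset has a "text" column, and
+# padding="max_length" requires tokenizer.pad_token to be set (GPT-2-style
+# tokenizers have none by default; tokenizer.pad_token = tokenizer.eos_token is
+# a common fix). Otherwise the error surfaces through the except branch above.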
 
+# ===============================
+# Gradio UI – Training Tab
+# ===============================
+with gr.Blocks() as training_tab:
+    gr.Markdown("## 📚 Train a Custom Model")
     with gr.Row():
+        model_dropdown = gr.Dropdown(choices=TOP_MODELS, label="Choose Model", interactive=True)
+        dataset_dropdown = gr.Dropdown(choices=TOP_DATASETS, label="Choose Dataset", interactive=True)
+    with gr.Row():
+        model_text = gr.Textbox(label="Or enter custom model ID", placeholder="e.g. gpt2")
+        dataset_text = gr.Textbox(label="Or enter custom dataset ID", placeholder="e.g. wikitext")
+    epochs = gr.Number(value=1, label="Epochs")
+    output_dir = gr.Textbox(value="./trained_model", label="Output Directory")
+    train_btn = gr.Button("🚀 Start Training")
+    train_output = gr.Textbox(label="Training Status")

+    def handle_train(model_d, model_t, dataset_d, dataset_t, epochs, output_dir):
+        model = model_t if model_t else model_d
+        dataset = dataset_t if dataset_t else dataset_d
+        return train_model(model, dataset, epochs, output_dir)

+    train_btn.click(
+        fn=handle_train,
+        inputs=[model_dropdown, model_text, dataset_dropdown, dataset_text, epochs, output_dir],
+        outputs=train_output
+    )
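+# NOTE: in handle_train the free-text boxes, when filled, take precedence over
+# the dropdown selections.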
 
+# ===============================
+# Gradio UI – Chat Tab
+# ===============================
+with gr.Blocks() as chat_tab:
+    gr.Markdown("## 💬 Chat with Model")
+    with gr.Row():
+        chat_model_dropdown = gr.Dropdown(choices=TOP_MODELS, label="Choose Model", interactive=True)
+        chat_model_text = gr.Textbox(label="Or enter custom model ID", placeholder="e.g. gpt2")
+    with gr.Row():
+        chat_input = gr.Textbox(label="Your Message")
+        send_btn = gr.Button("Send")
+    chat_output = gr.Textbox(label="Model Response")
+    memory_display = gr.Textbox(label="Conversation Memory", interactive=False)

+    def handle_chat(user_input, model_d, model_t):
+        model = model_t if model_t else model_d
+        return chat_with_model(user_input, model)

+    send_btn.click(
+        fn=handle_chat,
+        inputs=[chat_input, chat_model_dropdown, chat_model_text],
+        outputs=[chat_output, memory_display]
+    )
 
+# ===============================
+# Gradio UI – Memory Tab
+# ===============================
+with gr.Blocks() as memory_tab:
+    gr.Markdown("## 🧠 Manage Memory")
     with gr.Row():
+        memory_model_dropdown = gr.Dropdown(choices=TOP_MODELS, label="Select Model")
+        memory_model_text = gr.Textbox(label="Or enter custom model ID")

+    memory_output = gr.Textbox(label="Stored Memory", interactive=False)
+    load_btn = gr.Button("📂 Load Memory")
+    clear_btn = gr.Button("🗑️ Clear Memory")
+
+    def handle_load(model_d, model_t):
+        model = model_t if model_t else model_d
+        memory = load_memory(model)
+        return json.dumps(memory, indent=2)
+
+    def handle_clear(model_d, model_t):
+        model = model_t if model_t else model_d
+        f = get_memory_file(model)
+        if os.path.exists(f):
+            os.remove(f)
+            log_event(f"Cleared memory for {model}")
+            return "✅ Memory cleared."
+        return "⚠️ No memory found."
+
+    load_btn.click(
+        fn=handle_load,
+        inputs=[memory_model_dropdown, memory_model_text],
+        outputs=memory_output
+    )
+
+    clear_btn.click(
+        fn=handle_clear,
+        inputs=[memory_model_dropdown, memory_model_text],
+        outputs=memory_output
+    )
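+# NOTE: handle_clear deletes the memory JSON file from disk; a subsequent load
+# then returns an empty list.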
+
+# ===============================
+# Gradio UI – Logs Tab
+# ===============================
+with gr.Blocks() as logs_tab:
+    gr.Markdown("## 📜 Application Logs")
+    log_display = gr.Textbox(value=open(LOG_FILE).read() if os.path.exists(LOG_FILE) else "No logs yet.", lines=20)
+
+    refresh_btn = gr.Button("🔄 Refresh Logs")
+
+    def refresh_logs():
+        return open(LOG_FILE).read() if os.path.exists(LOG_FILE) else "No logs yet."
+
+    refresh_btn.click(
+        fn=refresh_logs,
+        outputs=log_display
+    )
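+# NOTE: the log file is read once when the UI is built and again on each Refresh
+# click; open(...).read() leaves the file handle to the garbage collector, so a
+# `with open(...)` block would be tidier.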
+
+# ===============================
+# Gradio UI – Help Tab
+# ===============================
+with gr.Blocks() as help_tab:
+    gr.Markdown("## 📖 Help & User Manual")
+
+    gr.Markdown("""
+    ### 🔹 Beginner Guide
+    1. Go to **Train a Model** tab → pick a model & dataset or enter custom IDs.
+    2. Choose number of epochs & output directory → click **Start Training**.
+    3. Once training completes, the model is saved and can be used later.
+    4. Go to **Chat with Model** tab → type your message or test the trained model.
+    5. Conversation is auto-saved per model (see **Memory** tab).
+    6. Use **Logs** tab for detailed runtime events.
+
+    ### 🔹 Technical Details
+    - Models/Datasets pulled live from Hugging Face Hub (top 10 auto-fetched).
+    - Memory stored in `/storage/memory/` as JSON files (per model).
+    - Logs stored in `/storage/logs.txt`.
+    - Training uses 🤗 Transformers `Trainer` API with causal LM objective.
+    - Safe checks auto-create missing directories & files.
+
+    ### 🔹 Tips
+    - For large datasets, train on GPU (CPU will be very slow).
+    - Memory files can be manually edited in `/storage/memory/`.
+    - You can load any public Hugging Face dataset/model by entering its ID.
+    """)
+
+# ===============================
+# Final Tabbed Interface
+# ===============================
 demo = gr.TabbedInterface(
+    [training_tab, chat_tab, memory_tab, logs_tab, help_tab],
+    ["Train a Model", "Chat", "Memory", "Logs", "Help & Manual"]
 )

 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)
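+# NOTE: binding to 0.0.0.0 on port 7860 matches what Hugging Face Spaces expects
+# for a Gradio app.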