Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import time | |
| import threading | |
| import random | |
| from datetime import datetime | |
| from datasets import load_dataset | |
| import pandas as pd | |
| import spaces | |
| import io | |
| # Global state | |
class TrainingState:
    """Mutable record of the simulated training session.

    Tracks a status string ("idle", "training" or "fine-tuning"), a 0-100
    progress value, a rolling list of log lines, and a small preview of the
    most recently loaded dataset.
    """

    # Maximum number of log lines kept once add_log() has run.
    MAX_LOG_LINES = 15
    # Number of rows kept in dataset_sample for previewing.
    PREVIEW_ROWS = 5

    def __init__(self):
        self.status = "idle"
        self.progress = 0
        self.logs = ["✅ System initialized"]
        self.start_time = None
        self.model_name = "tasal9/pashto-base-bloom"
        self.active_process = None
        self.dataset_loaded = False
        self.dataset_info = "No dataset loaded"
        self.dataset_sample = pd.DataFrame()

    def load_dataset(self):
        """Load tasal9/ZamAi-Pashto-Datasets-V2 and cache a preview.

        Returns:
            True when the dataset and its preview were built; False when any
            step raised, in which case the error text is stored in
            ``dataset_info`` and appended to ``logs``.
        """
        try:
            self.logs.append("⏳ Loading dataset: tasal9/ZamAi-Pashto-Datasets-V2")
            dataset = load_dataset("tasal9/ZamAi-Pashto-Datasets-V2")
            train_split = dataset['train']
            total = len(train_split)
            # Clamp the preview size so a split shorter than PREVIEW_ROWS
            # does not request out-of-range indices.
            preview = pd.DataFrame(
                train_split.select(range(min(self.PREVIEW_ROWS, total)))
            )
        except Exception as e:
            self.logs.append(f"❌ Error loading dataset: {str(e)}")
            self.dataset_info = f"Error: {str(e)}"
            return False

        # Only publish the new state once every step above has succeeded, so
        # a failure never leaves "loaded" set next to an error message.
        self.dataset_loaded = True
        self.dataset_info = f"✅ Dataset loaded!\nName: ZamAi-Pashto-Datasets-V2\nSize: {total} examples"
        self.dataset_sample = preview
        self.logs.append(f"📊 {total} Pashto examples loaded")
        return True

    def load_local_file(self, file):
        """Load a local .csv, .json or .txt file and cache its first rows.

        Args:
            file: Either a path string or an object exposing the path as
                ``.name`` (for example an upload wrapper).

        Returns:
            True on success; False on any error (missing file, unsupported
            extension, parse failure), with the message stored in
            ``dataset_info`` and appended to ``logs``.
        """
        try:
            if file is None:
                raise ValueError("No file provided")
            path = file if isinstance(file, str) else file.name
            # Compare the extension case-insensitively so "DATA.CSV" works.
            ext = path.rsplit('.', 1)[-1].lower()
            if ext == "csv":
                df = pd.read_csv(path)
            elif ext == "json":
                df = pd.read_json(path)
            elif ext == "txt":
                # Explicit UTF-8 so non-ASCII text does not depend on the
                # platform default; the context manager closes the handle.
                with open(path, encoding="utf-8") as handle:
                    df = pd.DataFrame({"text": handle.read().splitlines()})
            else:
                raise ValueError("Unsupported file format")
            self.dataset_sample = df.head(self.PREVIEW_ROWS)
            self.dataset_info = f"✅ Local file loaded: {path}"
            self.dataset_loaded = True
            self.logs.append(f"📁 Local dataset loaded: {path}")
            return True
        except Exception as e:
            self.dataset_info = f"❌ Error loading file: {str(e)}"
            self.logs.append(self.dataset_info)
            return False

    def _begin(self, status, banner, size):
        """Reset progress and logs and mark a new run as started."""
        self.status = status
        self.progress = 0
        self.logs = [f"{banner} started at {datetime.now().strftime('%H:%M:%S')}"]
        self.logs.append(f"📝 Data size: {size} characters")
        self.start_time = time.time()

    def start_training(self, size):
        """Switch to the "training" status and start a fresh log."""
        self._begin("training", "🏋️ Training", size)

    def start_finetuning(self, size):
        """Switch to the "fine-tuning" status and start a fresh log."""
        self._begin("fine-tuning", "🎯 Fine-tuning", size)

    def update_progress(self, progress):
        """Store *progress* clamped to 0-100; finish the run at 100 or more."""
        self.progress = min(100, max(0, progress))
        if progress >= 100:
            self.complete_process()

    def add_log(self, msg):
        """Append a timestamped line and trim the log to MAX_LOG_LINES."""
        self.logs.append(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
        # Other methods append to self.logs directly, so the list may be more
        # than one entry over the cap; drop everything but the newest lines.
        if len(self.logs) > self.MAX_LOG_LINES:
            del self.logs[:-self.MAX_LOG_LINES]

    def get_status(self):
        """Return the current status string."""
        return self.status

    def complete_process(self):
        """Log completion of the active run and return to "idle".

        Does nothing when no run is active, so a repeated call cannot emit a
        second, misleading "Idle completed" line.
        """
        if self.status == "idle":
            return
        # start_time is None only if the status was set without _begin().
        elapsed = time.time() - self.start_time if self.start_time is not None else 0.0
        self.add_log(f"🏁 {self.status.capitalize()} completed in {elapsed:.1f}s")
        self.status = "idle"
        self.progress = 100
# Single module-level TrainingState instance that the handler functions in
# this file read and update.
state = TrainingState()
# UI label text in English, keyed by the widget or section it belongs to.
_ENGLISH_LABELS = {
    "title": "🌸 Pashto-Base-Bloom Trainer",
    "load_dataset": "Load Dataset",
    "upload_file": "Upload Local File",
    "status": "Status",
    "preview": "Sample Preview",
    "test_input": "Input",
    "test_output": "Output",
    "test": "Test",
    "train_data": "Training Data",
    "train": "Start Training",
    "finetune_data": "Fine-tuning Data",
    "finetune": "Start Fine-tuning",
    "current_status": "Current Status",
    "progress": "Progress",
    "logs": "Logs",
    "refresh": "🔄 Refresh",
    "export_logs": "📥 Export Logs",
    "language": "Language",
}

# The same keys as the English table, with Pashto label text.
_PASHTO_LABELS = {
    "title": "🌸 پښتو-بیس-بلوم روزونکی",
    "load_dataset": "ډیټاسټ لوډ کړئ",
    "upload_file": "محلي فایل اپلوډ کړئ",
    "status": "حالت",
    "preview": "نمونه ښودنه",
    "test_input": "ورودی",
    "test_output": "وتی",
    "test": "ازموینه",
    "train_data": "د روزنې معلومات",
    "train": "روزنه پیل کړئ",
    "finetune_data": "د فاین ټیون معلومات",
    "finetune": "فاین ټیون پیل کړئ",
    "current_status": "اوسنی حالت",
    "progress": "پرمختګ",
    "logs": "لاګونه",
    "refresh": "🔄 تازه کړئ",
    "export_logs": "📥 لاګونه ډاونلوډ کړئ",
    "language": "ژبه",
}

# Lookup table: display-language name -> {label key -> label text}.
translations = {
    "English": _ENGLISH_LABELS,
    "پښتو": _PASHTO_LABELS,
}
| def test_model(text): | |
| if not text.strip(): | |
| return "❗ Enter text to test." | |
| options = [ | |
| f"Processed: '{text}'", | |
| f"Model response to: {text}", | |
| f"Pashto analysis: {len(text)} characters", | |
| f"✅ Got it: {text}", | |
| f"Generated: {text}... [simulated]", | |
| f"🔍 Words: {len(text.split())}" | |
| ] | |
| return random.choice(options) | |
def simulate_process(duration, process_type, data_size):
    """Run a fake training or fine-tuning job against the shared state.

    Progress is advanced in ten equal steps with a sleep before each update
    (including the initial 0% update), and a random metric line is logged on
    every third step.

    Args:
        duration: Nominal run length in seconds; each step sleeps
            ``duration / 10``.
        process_type: "train" starts a training run; any other value starts
            a fine-tuning run.
        data_size: Character count recorded in the start-of-run log.
    """
    if process_type == "train":
        state.start_training(data_size)
    else:
        state.start_finetuning(data_size)

    steps = 10
    pause = duration / steps
    for i in range(steps + 1):
        time.sleep(pause)
        state.update_progress(int((i / steps) * 100))
        if i % 3 == 0:
            state.add_log(random.choice([
                f"Batch {i}/{steps}",
                f"Loss: {random.uniform(0.1, 1.0):.3f}",
                f"LR: {random.uniform(1e-5, 1e-3):.6f}",
                f"GPU: {random.randint(60, 95)}% (sim)",
            ]))

    # update_progress(100) on the last step already completes the run; only
    # finish here if that did not happen, so the completion line is logged
    # once instead of being followed by a spurious "Idle completed" entry.
    if state.get_status() != "idle":
        state.complete_process()
# Serialises the idle check and the status change so two near-simultaneous
# requests cannot both start a background run.
_launch_lock = threading.Lock()


def _launch_process(text, duration, process_type, busy_status, empty_msg, started_msg):
    """Validate the request and start simulate_process on a daemon thread."""
    # Treat None like blank input instead of failing on .strip().
    if text is None or not text.strip():
        return empty_msg, ""
    if not state.dataset_loaded:
        return "❌ Load dataset first.", ""
    with _launch_lock:
        if state.status != "idle":
            return "⏳ Wait for current process.", ""
        # Claim the slot immediately: the worker only sets the status once
        # it starts running, which would leave a window for a second start.
        state.status = busy_status
    threading.Thread(
        target=simulate_process,
        args=(duration, process_type, len(text)),
        daemon=True,
    ).start()
    return started_msg, ""


def train_model(text):
    """Start a simulated 15-second training run in the background.

    Args:
        text: Training data; only its length is passed to the worker.

    Returns:
        A ``(message, "")`` tuple: a rejection reason when the text is blank,
        no dataset is loaded or another run is active, otherwise a
        confirmation that training started.
    """
    return _launch_process(
        text, 15, "train", "training", "❌ Add training data.", "✅ Training started"
    )


def finetune_model(text):
    """Start a simulated 10-second fine-tuning run in the background.

    Args:
        text: Fine-tuning data; only its length is passed to the worker.

    Returns:
        A ``(message, "")`` tuple: a rejection reason when the text is blank,
        no dataset is loaded or another run is active, otherwise a
        confirmation that fine-tuning started.
    """
    return _launch_process(
        text, 10, "fine-tune", "fine-tuning", "❌ Add fine-tuning data.", "✅ Fine-tuning started"
    )
def load_hf_dataset():
    """Load the hosted dataset into the shared state.

    Returns:
        ``(info_text, preview)`` where *preview* is the cached sample frame
        on success and an empty DataFrame on failure.
    """
    if state.load_dataset():
        preview = state.dataset_sample
    else:
        preview = pd.DataFrame()
    return state.dataset_info, preview
def load_local_dataset(file):
    """Load an uploaded file into the shared state.

    Args:
        file: Passed through unchanged to ``state.load_local_file``.

    Returns:
        ``(info_text, preview)`` where *preview* is the cached sample frame
        on success and an empty DataFrame on failure.
    """
    if state.load_local_file(file):
        preview = state.dataset_sample
    else:
        preview = pd.DataFrame()
    return state.dataset_info, preview
def get_current_status():
    """Return ``(status, progress_fraction, log_text)`` from the shared state.

    The progress is scaled from the stored 0-100 value to 0-1, and the log
    lines are joined with newlines.
    """
    status = state.get_status()
    fraction = state.progress / 100
    log_text = "\n".join(state.logs)
    return status, fraction, log_text
def export_logs():
    """Return the current log lines as CSV text with a single "Logs" column."""
    log_frame = pd.DataFrame({"Logs": state.logs})
    # With no target given, to_csv returns the CSV as a string.
    return log_frame.to_csv(index=False)
# Build the Gradio interface.
# NOTE(review): the original indentation was lost in this copy; the nesting
# below (Tab inside Blocks, Row inside Tab, button and textbox inside Row) is
# a reconstruction — confirm against the real file.
with gr.Blocks(title="Pashto Base Bloom Trainer", theme="soft") as demo:
    # NOTE(review): no change handler is attached to this dropdown in the
    # visible code, and its label is a literal rather than labels["language"].
    lang_selector = gr.Dropdown(choices=["English", "پښتو"], value="English", label="Language")
    # Every label below is read from the English table at build time.
    labels = translations["English"]
    gr.Markdown(f"# {labels['title']}")
    with gr.Tab(labels["load_dataset"]):
        gr.Markdown(f"### {labels['load_dataset']}")
        with gr.Row():
            # NOTE(review): no click handler is attached to this button in
            # the visible code — presumably load_hf_dataset was intended;
            # confirm.
            dataset_btn = gr.Button(labels["load_dataset"])
            dataset_status = gr.Textbox(label=labels["status"], lines=2, interactive=False)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()