import gradio as gr import torch from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM import pandas as pd import json import io import csv from typing import List, Dict import threading import time import queue from concurrent.futures import ThreadPoolExecutor, as_completed import asyncio # Global model cache and loading status MODEL_CACHE = {} MODEL_LOADING_STATUS = {} MODEL_LOADING_LOCK = threading.Lock() def check_model_loading_status(model_names: List[str]) -> Dict: """Check loading status of multiple models""" with MODEL_LOADING_LOCK: status = {} for model_name in model_names: if model_name in MODEL_CACHE: status[model_name] = "ready" elif model_name in MODEL_LOADING_STATUS: status[model_name] = MODEL_LOADING_STATUS[model_name] else: status[model_name] = "not_loaded" return status def load_model_with_status_tracking(model_name: str): """Load model with status tracking""" with MODEL_LOADING_LOCK: if model_name in MODEL_CACHE: return MODEL_CACHE[model_name], None if model_name in MODEL_LOADING_STATUS: return None, f"โมเดล {model_name} กำลังโหลดอยู่..." MODEL_LOADING_STATUS[model_name] = "loading" try: print(f"🔄 เริ่มโหลดโมเดล {model_name}...") # Update status with MODEL_LOADING_LOCK: MODEL_LOADING_STATUS[model_name] = "downloading" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) with MODEL_LOADING_LOCK: MODEL_LOADING_STATUS[model_name] = "loading_model" model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True ) with MODEL_LOADING_LOCK: MODEL_LOADING_STATUS[model_name] = "creating_pipeline" generator = pipeline("text-generation", model=model, tokenizer=tokenizer) with MODEL_LOADING_LOCK: MODEL_CACHE[model_name] = generator MODEL_LOADING_STATUS[model_name] = "ready" print(f"✅ โหลดโมเดล {model_name} สำเร็จ") return generator, None except Exception as e: error_msg = f"❌ ไม่สามารถโหลดโมเดล {model_name}: {str(e)}" print(error_msg) with MODEL_LOADING_LOCK: if model_name in MODEL_LOADING_STATUS: del MODEL_LOADING_STATUS[model_name] return None, error_msg def preload_models_async(model_names: List[str], progress_callback=None): """Preload models asynchronously""" def load_single_model(model_name): generator, error = load_model_with_status_tracking(model_name) if progress_callback: progress_callback(model_name, "ready" if generator else "error", error) return model_name, generator, error results = {} with ThreadPoolExecutor(max_workers=2) as executor: # Limit concurrent loading futures = {executor.submit(load_single_model, model): model for model in model_names} for future in as_completed(futures): model_name, generator, error = future.result() results[model_name] = {"generator": generator, "error": error} return results # Predefined task templates with Thai language support TASK_TEMPLATES = { "text_generation": { "name": "การสร้างข้อความ (Text Generation)", "template": "เขียนเรื่องราวสร้างสรรค์เกี่ยวกับ {topic}", "description": "สร้างข้อความสร้างสรรค์ภาษาไทยจากหัวข้อที่กำหนด" }, "question_answering": { "name": "คำถาม-คำตอบ (Question Answering)", "template": "คำถาม: {question}\nคำตอบ:", "description": "สร้างคู่คำถาม-คำตอบภาษาไทย" }, "summarization": { "name": "การสรุปข้อความ (Text Summarization)", "template": "สรุปข้อความต่อไปนี้: {text}", "description": "สร้างตัวอย่างการสรุปข้อความภาษาไทย" }, "translation": { "name": "การแปลภาษา (Translation)", "template": "แปลจาก {source_lang} เป็น {target_lang}: {text}", "description": "สร้างคู่ข้อมูลสำหรับการแปลภาษา" }, "classification": { "name": "การจำแนกข้อความ (Text Classification)", "template": "จำแนกอารมณ์ของข้อความนี้: {text}\nอารมณ์:", "description": "สร้างตัวอย่างการจำแนกอารมณ์หรือหมวดหมู่ของข้อความ" }, "conversation": { "name": "บทสนทนา (Conversation)", "template": "มนุษย์: {input}\nผู้ช่วย:", "description": "สร้างข้อมูลบทสนทนาภาษาไทย" }, "instruction_following": { "name": "การทำตามคำสั่ง (Instruction Following)", "template": "คำสั่ง: {instruction}\nการตอบสนอง:", "description": "สร้างคู่คำสั่ง-การตอบสนองภาษาไทย" }, "thai_poetry": { "name": "กวีนิพนธ์ไทย (Thai Poetry)", "template": "แต่งกวีนิพนธ์เกี่ยวกับ {topic} ในรูปแบบ {style}", "description": "สร้างกวีนิพนธ์ไทยในรูปแบบต่างๆ" }, "thai_news": { "name": "ข่าวภาษาไทย (Thai News)", "template": "เขียนข่าวภาษาไทยเกี่ยวกับ {topic} ในหัวข้อ {category}", "description": "สร้างข้อความข่าวภาษาไทยในหมวดหมู่ต่างๆ" } } # Thai language models from Hugging Face THAI_MODELS = { "typhoon-7b": { "name": "🌪️ Typhoon-7B (SCB10X)", "model_id": "scb10x/typhoon-7b", "description": "โมเดลภาษาไทยขนาด 7B พารามิเตอร์ ประสิทธิภาพสูง" }, "openthaigpt": { "name": "🇹🇭 OpenThaiGPT 1.5-7B", "model_id": "openthaigpt/openthaigpt1.5-7b-instruct", "description": "โมเดลภาษาไทยรองรับคำสั่งและบทสนทนาหลายรอบ" }, "wangchanlion": { "name": "🦁 Gemma2-9B WangchanLION", "model_id": "aisingapore/Gemma2-9b-WangchanLIONv2-instruct", "description": "โมเดลขนาด 9B รองรับไทย-อังกฤษ พัฒนาโดย AI Singapore" }, "sambalingo": { "name": "🌍 SambaLingo-Thai-Base", "model_id": "sambanovasystems/SambaLingo-Thai-Base", "description": "โมเดลภาษาไทยพื้นฐาน รองรับทั้งไทยและอังกฤษ" }, "other": { "name": "🔧 โมเดลอื่นๆ (Custom)", "model_id": "custom", "description": "ระบุชื่อโมเดลที่ต้องการใช้งานเอง" } } def load_file_data(file_path: str) -> List[Dict]: """Load data from uploaded file""" try: if file_path.endswith('.csv'): df = pd.read_csv(file_path) return df.to_dict('records') elif file_path.endswith('.json'): with open(file_path, 'r', encoding='utf-8') as f: return json.load(f) elif file_path.endswith('.txt'): with open(file_path, 'r', encoding='utf-8') as f: lines = f.readlines() return [{'text': line.strip()} for line in lines if line.strip()] else: raise ValueError("Unsupported file format. Use CSV, JSON, or TXT files.") except Exception as e: raise Exception(f"Error reading file: {str(e)}") def generate_from_template(template: str, data_row: Dict) -> str: """Generate prompt from template and data""" try: return template.format(**data_row) except KeyError as e: return f"Template error: Missing field {e}" def load_model(model_name): """Load a Hugging Face model for text generation""" try: tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name) generator = pipeline("text-generation", model=model, tokenizer=tokenizer) return generator, None except Exception as e: return None, str(e) def generate_dataset(model_name, prompt_template, num_samples, max_length, temperature, top_p): """Generate dataset using Hugging Face model""" try: generator, error = load_model(model_name) if error: return None, f"Error loading model: {error}" dataset = [] for i in range(num_samples): # Generate text generated = generator( prompt_template, max_length=max_length, temperature=temperature, top_p=top_p, num_return_sequences=1, do_sample=True ) generated_text = generated[0]['generated_text'] dataset.append({ 'id': i + 1, 'prompt': prompt_template, 'generated_text': generated_text, 'full_text': generated_text }) # Convert to DataFrame for display df = pd.DataFrame(dataset) # Create downloadable files csv_data = df.to_csv(index=False) json_data = json.dumps(dataset, indent=2, ensure_ascii=False) return df, csv_data, json_data, None except Exception as e: return None, None, None, f"Error generating dataset: {str(e)}" def generate_dataset_from_task(model_name, task_type, custom_template, file_data, num_samples, max_length, temperature, top_p): """Generate dataset using task templates or file input""" try: generator, error = load_model(model_name) if error: return None, f"Error loading model: {error}" dataset = [] # Determine the template to use if custom_template and custom_template.strip(): template = custom_template elif task_type in TASK_TEMPLATES: template = TASK_TEMPLATES[task_type]["template"] else: template = "Generate text: {input}" # Generate samples for i in range(num_samples): if file_data and len(file_data) > 0: # Use file data cyclically data_row = file_data[i % len(file_data)] prompt = generate_from_template(template, data_row) else: # Use template with placeholder values prompt = template.replace("{topic}", "artificial intelligence") \ .replace("{question}", "What is machine learning?") \ .replace("{text}", "Sample text for processing") \ .replace("{input}", f"Sample input {i+1}") \ .replace("{instruction}", f"Complete this task {i+1}") # Generate text generated = generator( prompt, max_length=max_length, temperature=temperature, top_p=top_p, num_return_sequences=1, do_sample=True, pad_token_id=generator.tokenizer.eos_token_id ) generated_text = generated[0]['generated_text'] dataset.append({ 'id': i + 1, 'task_type': task_type, 'prompt': prompt, 'generated_text': generated_text, 'original_data': data_row if file_data else None }) # Convert to DataFrame for display df = pd.DataFrame(dataset) # Create downloadable files csv_data = df.to_csv(index=False) json_data = json.dumps(dataset, indent=2, ensure_ascii=False) return df, csv_data, json_data, None except Exception as e: return None, None, None, f"Error generating dataset: {str(e)}" # Multi-model generation status tracking class ModelStatus: def __init__(self): self.models = {} self.record_status = {} # record_id: {"status": "pending/processing/completed", "model": "model_name"} self.completed_records = [] self.lock = threading.Lock() def set_record_processing(self, record_id: int, model_name: str): with self.lock: self.record_status[record_id] = {"status": "processing", "model": model_name} def set_record_completed(self, record_id: int, result: dict): with self.lock: self.record_status[record_id]["status"] = "completed" self.completed_records.append(result) def get_next_available_record(self, total_records: int, model_name: str) -> int: with self.lock: for i in range(total_records): if i not in self.record_status or self.record_status[i]["status"] == "pending": self.record_status[i] = {"status": "pending", "model": model_name} return i return -1 # No available records def get_progress(self, total_records: int) -> dict: with self.lock: completed = len([r for r in self.record_status.values() if r["status"] == "completed"]) processing = len([r for r in self.record_status.values() if r["status"] == "processing"]) return { "completed": completed, "processing": processing, "total": total_records, "percentage": (completed / total_records * 100) if total_records > 0 else 0 } def load_model_with_cache(model_name: str, cache: dict): """Load model with caching and progress feedback""" if model_name in cache: return cache[model_name], None try: print(f"🔄 กำลังโหลดโมเดล {model_name}...") # Use smaller models or quantized versions for faster loading if "typhoon" in model_name.lower(): # Load with optimizations tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16, # Use half precision device_map="auto", trust_remote_code=True ) else: tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16, device_map="auto" ) generator = pipeline("text-generation", model=model, tokenizer=tokenizer) cache[model_name] = generator print(f"✅ โหลดโมเดล {model_name} สำเร็จ") return generator, None except Exception as e: error_msg = f"❌ ไม่สามารถโหลดโมเดล {model_name}: {str(e)}" print(error_msg) return None, error_msg def generate_single_record(generator, prompt: str, record_id: int, model_name: str, max_length: int, temperature: float, top_p: float, task_type: str, original_data: dict, status_tracker: ModelStatus): """Generate a single record with the given model""" try: # Mark record as processing status_tracker.set_record_processing(record_id, model_name) # Generate text generated = generator( prompt, max_length=max_length, temperature=temperature, top_p=top_p, num_return_sequences=1, do_sample=True, pad_token_id=generator.tokenizer.eos_token_id if hasattr(generator.tokenizer, 'eos_token_id') else generator.tokenizer.pad_token_id ) generated_text = generated[0]['generated_text'] result = { 'id': record_id + 1, 'model_used': model_name, 'task_type': task_type, 'prompt': prompt, 'generated_text': generated_text, 'original_data': original_data, 'generation_time': time.time() } # Mark record as completed status_tracker.set_record_completed(record_id, result) return result except Exception as e: # If generation fails, mark as pending again for other models to try with status_tracker.lock: if record_id in status_tracker.record_status: status_tracker.record_status[record_id]["status"] = "pending" return None def model_worker(model_name: str, model_cache: dict, prompts: List[str], task_type: str, original_data_list: List[dict], max_length: int, temperature: float, top_p: float, status_tracker: ModelStatus, progress_callback=None): """Worker function for each model to process available records""" # Load model generator, error = load_model_with_cache(model_name, model_cache) if error: return f"Error loading {model_name}: {error}" total_records = len(prompts) processed_count = 0 while True: # Get next available record record_id = status_tracker.get_next_available_record(total_records, model_name) if record_id == -1: # No more records available break # Generate record prompt = prompts[record_id] original_data = original_data_list[record_id] if original_data_list else None result = generate_single_record( generator, prompt, record_id, model_name, max_length, temperature, top_p, task_type, original_data, status_tracker ) if result: processed_count += 1 # Update progress if progress_callback: progress = status_tracker.get_progress(total_records) progress_callback(progress, model_name, processed_count) return f"{model_name}: Processed {processed_count} records" def generate_dataset_multi_model(selected_models: List[str], task_type: str, custom_template: str, file_data: List[dict], num_samples: int, max_length: int, temperature: float, top_p: float, progress_callback=None): """Generate dataset using multiple models collaboratively""" try: # Prepare prompts prompts = [] original_data_list = [] # Determine template if custom_template and custom_template.strip(): template = custom_template elif task_type in TASK_TEMPLATES: template = TASK_TEMPLATES[task_type]["template"] else: template = "Generate text: {input}" # Generate prompts for all records for i in range(num_samples): if file_data and len(file_data) > 0: data_row = file_data[i % len(file_data)] prompt = generate_from_template(template, data_row) original_data_list.append(data_row) else: # Use template with placeholder values prompt = template.replace("{topic}", f"หัวข้อที่ {i+1}") \ .replace("{question}", f"คำถามที่ {i+1} เกี่ยวกับการเรียนรู้ของเครื่อง") \ .replace("{text}", f"ข้อความตัวอย่างที่ {i+1} สำหรับการประมวลผล") \ .replace("{input}", f"ข้อมูลนำเข้าที่ {i+1}") \ .replace("{instruction}", f"คำสั่งที่ {i+1}: ให้ทำงานนี้") \ .replace("{category}", "เทคโนโลยี") \ .replace("{style}", "โคลงสี่สุภาพ") original_data_list.append(None) prompts.append(prompt) # Initialize status tracker status_tracker = ModelStatus() model_cache = {} # Start worker threads for each model with ThreadPoolExecutor(max_workers=len(selected_models)) as executor: futures = [] for model_name in selected_models: future = executor.submit( model_worker, model_name, model_cache, prompts, task_type, original_data_list, max_length, temperature, top_p, status_tracker, progress_callback ) futures.append((future, model_name)) # Wait for all workers to complete for future, model_name in futures: try: result = future.result(timeout=300) # 5 minute timeout per model print(f"Model {model_name} completed: {result}") except Exception as e: print(f"Model {model_name} failed: {str(e)}") # Collect results dataset = sorted(status_tracker.completed_records, key=lambda x: x['id']) if not dataset: return None, None, None, "ไม่สามารถสร้างข้อมูลได้" # Convert to DataFrame df = pd.DataFrame(dataset) # Create downloadable files csv_data = df.to_csv(index=False) json_data = json.dumps(dataset, indent=2, ensure_ascii=False) return df, csv_data, json_data, None except Exception as e: return None, None, None, f"Error in multi-model generation: {str(e)}" def create_interface(): with gr.Blocks(title="🇹🇭 Thai Dataset Generator", theme=gr.themes.Soft()) as demo: gr.Markdown("# 🤗 เครื่องมือสร้างชุดข้อมูลภาษาไทยคุณภาพสูง") gr.Markdown("⚡ **เคล็ดลับ**: ใช้โมเดลใดก็ได้จาก Hugging Face - เริ่มต้นด้วยโมเดลเล็กๆ เพื่อทดสอบก่อน") with gr.Row(): with gr.Column(): # Flexible model input gr.Markdown("### 🤖 เลือกโมเดลจาก Hugging Face") gr.Markdown("💡 **คำแนะนำ**: ใส่ชื่อโมเดลจาก [Hugging Face](https://huggingface.co/models) เช่น `microsoft/DialoGPT-small`, `gpt2`, `scb10x/typhoon-7b`") model_input_mode = gr.Radio( choices=[ ("📝 ใส่ชื่อโมเดลเอง", "manual"), ("📋 เลือกจากรายการแนะนำ", "suggested"), ("🔀 ใช้หลายโมเดลพร้อมกัน", "multiple") ], value="manual", label="วิธีการเลือกโมเดล" ) # Manual model input manual_model_group = gr.Group(visible=True) with manual_model_group: single_model_name = gr.Textbox( label="ชื่อโมเดลจาก Hugging Face", value="microsoft/DialoGPT-small", placeholder="เช่น gpt2, microsoft/DialoGPT-medium, scb10x/typhoon-7b", info="ใส่ชื่อโมเดลที่ต้องการใช้งาน" ) model_verification = gr.Button("🔍 ตรวจสอบโมเดล", variant="secondary", size="sm") model_download = gr.Button("⬇️ ดาวน์โหลดโมเดล", variant="secondary", size="sm") model_status = gr.Textbox( label="สถานะโมเดล", value="ยังไม่ได้ตรวจสอบ", interactive=False ) # เชื่อมปุ่มตรวจสอบโมเดลกับฟังก์ชันตรวจสอบ def verify_model(model_name): from transformers import AutoTokenizer try: # ลองโหลด tokenizer (เร็วกว่าโหลด model) AutoTokenizer.from_pretrained(model_name) return gr.update(value=f"✅ พบโมเดล {model_name} ใน Hugging Face", interactive=False) except Exception as e: return gr.update(value=f"❌ ไม่พบโมเดลหรือโหลดไม่ได้: {str(e)}", interactive=False) model_verification.click( fn=verify_model, inputs=[single_model_name], outputs=[model_status] ) # ปุ่มดาวน์โหลดโมเดล (preload) def download_model(model_name): import time from transformers import AutoTokenizer, AutoModelForCausalLM try: t0 = time.time() model_status_msg = f"⏳ กำลังดาวน์โหลดและโหลดโมเดล {model_name} ..." yield gr.update(value=model_status_msg, interactive=False) # โหลด tokenizer และ model tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name) t1 = time.time() msg = f"✅ โหลดโมเดล {model_name} สำเร็จใน {t1-t0:.1f} วินาที" yield gr.update(value=msg, interactive=False) except Exception as e: yield gr.update(value=f"❌ ไม่สามารถโหลดโมเดล: {str(e)}", interactive=False) model_download.click( fn=download_model, inputs=[single_model_name], outputs=[model_status] ) # Suggested models suggested_model_group = gr.Group(visible=False) with suggested_model_group: gr.Markdown("#### โมเดลแนะนำ") suggested_models = gr.Dropdown( choices=[ # Small/Fast models ("⚡ DistilGPT2 (เล็ก, เร็ว)", "distilgpt2"), ("⚡ GPT2 (กลาง)", "gpt2"), ("⚡ DialoGPT-small (บทสนทนา)", "microsoft/DialoGPT-small"), ("⚡ DialoGPT-medium (บทสนทนา)", "microsoft/DialoGPT-medium"), # Thai models ("🇹🇭 Typhoon-7B (ไทย, ใหญ่)", "scb10x/typhoon-7b"), ("🇹🇭 OpenThaiGPT-1.5-7B (ไทย)", "openthaigpt/openthaigpt1.5-7b-instruct"), ("🇹🇭 WangchanLION-7B (ไทย)", "aisingapore/llama2-7b-chat-thai"), # Multilingual models ("🌍 mGPT (หลายภาษา)", "ai-forever/mGPT"), ("🌍 Bloom-560m (หลายภาษา, เล็ก)", "bigscience/bloom-560m"), ("🌍 Bloom-1b1 (หลายภาษา)", "bigscience/bloom-1b1"), # Instruction-following ("🎯 Flan-T5-small (คำสั่ง)", "google/flan-t5-small"), ("🎯 Flan-T5-base (คำสั่ง)", "google/flan-t5-base"), # Other popular models ("🔥 OPT-350m (Meta)", "facebook/opt-350m"), ("🔥 OPT-1.3b (Meta)", "facebook/opt-1.3b"), ], value="distilgpt2", label="เลือกโมเดลแนะนำ" ) # Multiple models multiple_model_group = gr.Group(visible=False) with multiple_model_group: multiple_model_names = gr.Textbox( label="ชื่อโมเดลหลายตัว (แยกด้วยเครื่องหมายจุลภาค)", value="distilgpt2, microsoft/DialoGPT-small", placeholder="gpt2, microsoft/DialoGPT-medium, scb10x/typhoon-7b", lines=3, info="ใส่ชื่อโมเดลหลายตัวแยกด้วยเครื่องหมายจุลภาค" ) model_distribution_mode = gr.Radio( choices=[ ("🔄 แบ่งงานกัน (Collaborative)", "collaborative"), ("🎲 สุ่มเลือก (Random)", "random"), ("📊 เท่าๆ กัน (Round-robin)", "round_robin") ], value="collaborative", label="วิธีการใช้โมเดลหลายตัว" ) # Model info display current_models_display = gr.Textbox( label="โมเดลที่จะใช้", value="microsoft/DialoGPT-small", interactive=False ) # Task selection with Thai tasks gr.Markdown("### 📝 เลือกประเภทงาน") task_dropdown = gr.Dropdown( choices=[(v["name"], k) for k, v in TASK_TEMPLATES.items()], value="text_generation", label="ประเภทงานที่ต้องการ" ) task_description = gr.Textbox( label="คำอธิบายงาน", value=TASK_TEMPLATES["text_generation"]["description"], interactive=False ) # File upload section gr.Markdown("### 📁 อัปโหลดข้อมูลต้นฉบับ (ไม่บังคับ)") gr.Markdown("อัปโหลดไฟล์ CSV, JSON หรือ TXT ที่มีข้อมูลต้นฉบับภาษาไทย") file_upload = gr.File( label="อัปโหลดไฟล์ข้อมูล", file_types=[".csv", ".json", ".txt"] ) file_preview = gr.Dataframe( label="ตัวอย่างข้อมูลจากไฟล์ (5 แถวแรก)", visible=False ) # State สำหรับเก็บข้อมูลไฟล์ (ต้องอยู่ก่อนใช้งาน) file_data_state = gr.State() # ฟังก์ชัน handle file upload def handle_file_upload(file): import pandas as pd import json if file is None: return gr.update(visible=False), None try: if file.name.endswith('.csv'): df = pd.read_csv(file.name) elif file.name.endswith('.json'): with open(file.name, 'r', encoding='utf-8') as f: data = json.load(f) df = pd.DataFrame(data) elif file.name.endswith('.txt'): with open(file.name, 'r', encoding='utf-8') as f: lines = f.readlines() df = pd.DataFrame({'text': [line.strip() for line in lines if line.strip()]}) else: return gr.update(visible=True, value="ไม่รองรับไฟล์นี้"), None preview = df.head(5) # คืน preview และข้อมูลทั้งหมด (list of dict) return gr.update(visible=True, value=preview), df.to_dict('records') except Exception as e: return gr.update(visible=True, value=f"❌ อ่านไฟล์ผิดพลาด: {str(e)}"), None file_upload.change( fn=handle_file_upload, inputs=[file_upload], outputs=[file_preview, file_data_state] ) # Template customization with multi-prompt support gr.Markdown("### 🎯 ปรับแต่งเทมเพลตและ Prompt") gr.Markdown("ใช้ {ชื่อฟิลด์} สำหรับตัวแปรในเทมเพลต") prompt_mode = gr.Radio( choices=[ ("📝 Prompt เดียว (Single)", "single"), ("📋 หลาย Prompt (Multiple)", "multiple"), ("🎲 สุ่มจาก Template (Random)", "random") ], value="single", label="โหมดการใส่ Prompt" ) # Single prompt mode single_prompt_group = gr.Group(visible=True) with single_prompt_group: template_display = gr.Textbox( label="เทมเพลตปัจจุบัน", value=TASK_TEMPLATES["text_generation"]["template"], interactive=False ) custom_template = gr.Textbox( label="เทมเพลตกำหนดเอง (ไม่บังคับ)", lines=3, placeholder="สร้างเทมเพลตของคุณเองที่นี่..." ) # Multiple prompts mode multi_prompt_group = gr.Group(visible=False) with multi_prompt_group: gr.Markdown("#### 📋 ใส่หลาย Prompt (แต่ละบรรทัดคือ prompt หนึ่งตัว)") multi_prompts = gr.Textbox( label="Prompts หลายตัว (แยกด้วยการขึ้นบรรทัดใหม่)", lines=10, placeholder="""เขียนเรื่องราวเกี่ยวกับการผจญภัยในป่า สร้างบทสนทนาระหว่างครูกับนักเรียน อธิบายวิธีการทำอาหารไทย เขียนบทกวีเกี่ยวกับธรรมชาติ สร้างเรื่องสั้นเกี่ยวกับมิตรภาพ""" ) prompt_distribution = gr.Radio( choices=[ ("📊 กระจายเท่าๆ กัน", "equal"), ("🎯 ตามสัดส่วนที่กำหนด", "weighted"), ("🎲 สุ่ม", "random") ], value="equal", label="วิธีการกระจาย Prompt" ) prompt_weights = gr.Textbox( label="น้ำหนักของแต่ละ Prompt (เช่น 2,1,3,1,2)", placeholder="2,1,3,1,2", visible=False ) # Random template mode random_prompt_group = gr.Group(visible=False) with random_prompt_group: gr.Markdown("#### 🎲 สุ่ม Prompt จาก Template ที่เลือก") random_templates = gr.CheckboxGroup( choices=[(v["name"], k) for k, v in TASK_TEMPLATES.items()], value=["text_generation", "conversation"], label="เลือก Template ที่จะสุ่ม" ) random_variables = gr.Textbox( label="ตัวแปรสำหรับสุ่ม (JSON format)", lines=5, value="""{ "topic": ["การเดินทาง", "เทคโนโลยี", "อาหาร", "ธรรมชาติ", "ศิลปะ"], "question": ["AI คืออะไร", "โลกร้อนคืออะไร", "การศึกษาสำคัญอย่างไร"], "instruction": ["เขียนบทความ", "สรุปข้อมูล", "วิเคราะห์ปัญหา"] }""", placeholder="ใส่ตัวแปรในรูปแบบ JSON" ) # Prompt preview and count prompt_preview = gr.Textbox( label="ตัวอย่าง Prompt ที่จะใช้", lines=3, interactive=False ) prompt_count = gr.Textbox( label="จำนวน Prompt ที่พร้อมใช้", value="1 prompt", interactive=False ) # State สำหรับเก็บข้อมูลไฟล์ file_data_state = gr.State() # ตัวเลือกจำนวนแถวข้อมูล (row_preset) row_preset = gr.Dropdown( choices=[ ("10 แถว", 10), ("100 แถว", 100), ("500 แถว", 500), ("1000 แถว", 1000) ], value=10, label="จำนวนแถวข้อมูลที่ต้องการสร้าง" ) # กำหนดจำนวนแถวเอง (custom_rows) custom_rows = gr.Textbox( label="จำนวนแถวกำหนดเอง (ถ้าเว้นว่างจะใช้ค่าจากด้านบน)", placeholder="ใส่ตัวเลข เช่น 123" ) # ตัวเลือกการตั้งค่าการสร้างข้อความ max_length = gr.Slider( minimum=16, maximum=2048, value=128, step=1, label="ความยาวสูงสุดของข้อความที่สร้าง (max_length)" ) temperature = gr.Slider( minimum=0.1, maximum=2.0, value=1.0, step=0.05, label="Temperature (ความสุ่ม)" ) top_p = gr.Slider( minimum=0.1, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)" ) batch_size = gr.Slider( minimum=1, maximum=32, value=1, step=1, label="Batch size" ) # ปุ่มสร้างข้อมูล generate_btn = gr.Button("🚀 สร้างข้อมูล", variant="primary") # Data Quality Settings gr.Markdown("### 🧼 การจัดการคุณภาพข้อมูล") enable_cleaning = gr.Checkbox( label="เปิดใช้การทำความสะอาดข้อมูล", value=True ) remove_duplicates = gr.Checkbox( label="ลบข้อมูลซ้ำซ้อน", value=True ) min_quality_score = gr.Slider( minimum=0.0, maximum=1.0, value=0.5, step=0.1, label="คะแนนคุณภาพขั้นต่ำ (0-1)" ) # ตัวเลือกแยกชุดข้อมูล (train/val/test split) create_splits = gr.Checkbox( label="แยกชุดข้อมูลเป็น train/val/test", value=False ) # Export Settings gr.Markdown("### 📦 การส่งออกข้อมูล") export_format = gr.CheckboxGroup( choices=[ ("📊 CSV (Excel, Spreadsheet)", "csv"), ("📋 JSON (Web APIs, General)", "json"), ("📄 JSONL (Fine-tuning, Streaming)", "jsonl"), ("🤗 Hugging Face Dataset (Complete Package)", "huggingface"), ("📝 TXT (Simple Text)", "txt"), ("🗃️ Parquet (Big Data, Analytics)", "parquet"), ("📋 TSV (Tab-separated)", "tsv"), ("🎯 Custom Format", "custom") ], value=["csv", "json"], label="เลือกรูปแบบไฟล์ที่ต้องการ (สามารถเลือกหลายแบบ)" ) # Custom format settings custom_format_group = gr.Group(visible=False) with custom_format_group: gr.Markdown("#### 🎯 การตั้งค่ารูปแบบกำหนดเอง") custom_template_format = gr.Textbox( label="Template สำหรับแต่ละ record", value="Input: {input}\nOutput: {output}\n---", lines=3, placeholder="ใช้ {field_name} สำหรับข้อมูล" ) custom_file_extension = gr.Textbox( label="นามสกุลไฟล์", value="txt", placeholder="เช่น txt, md, xml" ) # Advanced export options with gr.Accordion("⚙️ ตัวเลือกขั้นสูง", open=False): include_metadata = gr.Checkbox( label="รวม Metadata (model_used, timestamp, etc.)", value=True ) include_quality_score = gr.Checkbox( label="รวม Quality Score", value=True ) file_naming_pattern = gr.Textbox( label="รูปแบบชื่อไฟล์", value="thai_dataset_{task}_{timestamp}", placeholder="ใช้ {task}, {timestamp}, {model}, {count}" ) compression = gr.Radio( choices=[ ("ไม่บีบอัด", "none"), ("ZIP", "zip"), ("GZIP", "gzip") ], value="none", label="การบีบอัดไฟล์" ) # ...existing code... with gr.Column(): with gr.Tabs(): with gr.TabItem("📊 ตัวอย่างข้อมูล"): dataset_preview = gr.Dataframe( headers=["id", "task_type", "input", "output", "quality_score"], interactive=False ) status_message = gr.Markdown( value="", visible=True ) # State สำหรับข้อมูลที่สร้าง csv_data_state = gr.State() json_data_state = gr.State() dataset_card_state = gr.State() hf_export_state = gr.State() loading_status = gr.State() with gr.TabItem("📈 รายงานคุณภาพ"): quality_report = gr.JSON( label="รายงานคุณภาพข้อมูล", visible=True ) quality_summary = gr.Markdown( value="สร้างข้อมูลเสร็จแล้วจึงจะแสดงรายงานคุณภาพ" ) with gr.TabItem("💾 ดาวน์โหลด"): gr.Markdown("### 💾 ดาวน์โหลดชุดข้อมูลในรูปแบบต่างๆ") download_status = gr.Markdown("สร้างข้อมูลเสร็จแล้วจึงจะสามารถดาวน์โหลดได้") # Dynamic download buttons based on selected formats download_buttons = {} download_files = {} with gr.Row(): csv_btn = gr.Button("📊 CSV", variant="secondary", visible=False) json_btn = gr.Button("📋 JSON", variant="secondary", visible=False) jsonl_btn = gr.Button("📄 JSONL", variant="secondary", visible=False) txt_btn = gr.Button("📝 TXT", variant="secondary", visible=False) with gr.Row(): parquet_btn = gr.Button("🗃️ Parquet", variant="secondary", visible=False) tsv_btn = gr.Button("📋 TSV", variant="secondary", visible=False) hf_btn = gr.Button("🤗 HF Dataset", variant="secondary", visible=False) custom_btn = gr.Button("🎯 Custom", variant="secondary", visible=False) # Download files csv_download = gr.File(label="CSV File", visible=False) json_download = gr.File(label="JSON File", visible=False) jsonl_download = gr.File(label="JSONL File", visible=False) txt_download = gr.File(label="TXT File", visible=False) parquet_download = gr.File(label="Parquet File", visible=False) tsv_download = gr.File(label="TSV File", visible=False) hf_download = gr.File(label="HF Dataset Package", visible=False) custom_download = gr.File(label="Custom Format", visible=False) # All formats in one package with gr.Row(): package_btn = gr.Button("📦 ดาวน์โหลดทั้งหมด (ZIP)", variant="primary") package_download = gr.File(label="Complete Package", visible=False) # ...existing code for states... def update_export_format_visibility(selected_formats): """Update visibility of download buttons based on selected formats""" return [ gr.update(visible=("csv" in selected_formats)), gr.update(visible=("json" in selected_formats)), gr.update(visible=("jsonl" in selected_formats)), gr.update(visible=("txt" in selected_formats)), gr.update(visible=("parquet" in selected_formats)), gr.update(visible=("tsv" in selected_formats)), gr.update(visible=("huggingface" in selected_formats)), gr.update(visible=("custom" in selected_formats)), gr.update(visible=("custom" in selected_formats)) ] def generate_multiple_formats(data, selected_formats, include_metadata, include_quality_score, file_naming_pattern, custom_template_format, custom_file_extension, task_type, compression): """Generate data in multiple formats""" from datetime import datetime import tempfile import zipfile import gzip import pyarrow as pa import pyarrow.parquet as pq if not data: return {} timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_name = data[0].get('model_used', 'unknown').replace('/', '_') # Prepare data export_data = [] for record in data: export_record = {} export_record['input'] = record.get('prompt', '') export_record['output'] = record.get('generated_text', '') if include_metadata: export_record['metadata'] = { 'model_used': record.get('model_used', ''), 'task_type': record.get('task_type', ''), 'timestamp': record.get('generation_time', '') } if include_quality_score and 'quality_score' in record: export_record['quality_score'] = record['quality_score'] export_data.append(export_record) # Generate filename filename_base = file_naming_pattern.format( task=task_type, timestamp=timestamp, model=model_name, count=len(export_data) ) generated_files = {} # Generate each format if "csv" in selected_formats: df = pd.DataFrame(export_data) csv_content = df.to_csv(index=False) generated_files['csv'] = (f"{filename_base}.csv", csv_content) if "json" in selected_formats: json_content = json.dumps(export_data, indent=2, ensure_ascii=False) generated_files['json'] = (f"{filename_base}.json", json_content) if "jsonl" in selected_formats: jsonl_content = '\n'.join([json.dumps(record, ensure_ascii=False) for record in export_data]) generated_files['jsonl'] = (f"{filename_base}.jsonl", jsonl_content) if "txt" in selected_formats: txt_content = '\n'.join([f"Input: {record['input']}\nOutput: {record['output']}\n---" for record in export_data]) generated_files['txt'] = (f"{filename_base}.txt", txt_content) if "tsv" in selected_formats: df = pd.DataFrame(export_data) tsv_content = df.to_csv(index=False, sep='\t') generated_files['tsv'] = (f"{filename_base}.tsv", tsv_content) if "parquet" in selected_formats: df = pd.DataFrame(export_data) temp_parquet = tempfile.mktemp(suffix='.parquet') df.to_parquet(temp_parquet) with open(temp_parquet, 'rb') as f: parquet_content = f.read() generated_files['parquet'] = (f"{filename_base}.parquet", parquet_content) if "custom" in selected_formats: custom_content = [] for record in export_data: formatted = custom_template_format.format(**record) custom_content.append(formatted) custom_text = '\n'.join(custom_content) generated_files['custom'] = (f"{filename_base}.{custom_file_extension}", custom_text) # Apply compression if selected if compression == "gzip": for format_name, (filename, content) in generated_files.items(): if isinstance(content, str): content = content.encode('utf-8') compressed = gzip.compress(content) generated_files[format_name] = (filename + '.gz', compressed) return generated_files def create_complete_package(generated_files, compression): """Create a complete package with all formats""" import tempfile import zipfile if not generated_files: return None temp_zip = tempfile.mktemp(suffix='.zip') with zipfile.ZipFile(temp_zip, 'w', zipfile.ZIP_DEFLATED) as zipf: for format_name, (filename, content) in generated_files.items(): if isinstance(content, str): content = content.encode('utf-8') zipf.writestr(filename, content) return temp_zip def download_specific_format(format_name, generated_files): """Download specific format""" if format_name in generated_files: filename, content = generated_files[format_name] if isinstance(content, str): return gr.update(visible=True, value=io.StringIO(content)) else: temp_file = tempfile.mktemp() with open(temp_file, 'wb') as f: f.write(content) return gr.update(visible=True, value=temp_file) return gr.update(visible=False) # Event handlers export_format.change( fn=update_export_format_visibility, inputs=[export_format], outputs=[csv_btn, json_btn, jsonl_btn, txt_btn, parquet_btn, tsv_btn, hf_btn, custom_btn, custom_format_group] ) # ...existing code for other event handlers... # Download button handlers csv_btn.click( fn=lambda files: download_specific_format('csv', files), inputs=[gr.State()], # Will be connected to generated files state outputs=[csv_download] ) json_btn.click( fn=lambda files: download_specific_format('json', files), inputs=[gr.State()], outputs=[json_download] ) jsonl_btn.click( fn=lambda files: download_specific_format('jsonl', files), inputs=[gr.State()], outputs=[jsonl_download] ) txt_btn.click( fn=lambda files: download_specific_format('txt', files), inputs=[gr.State()], outputs=[txt_download] ) parquet_btn.click( fn=lambda files: download_specific_format('parquet', files), inputs=[gr.State()], outputs=[parquet_download] ) tsv_btn.click( fn=lambda files: download_specific_format('tsv', files), inputs=[gr.State()], outputs=[tsv_download] ) hf_btn.click( fn=lambda files: download_specific_format('huggingface', files), inputs=[gr.State()], outputs=[hf_download] ) custom_btn.click( fn=lambda files: download_specific_format('custom', files), inputs=[gr.State()], outputs=[custom_download] ) package_btn.click( fn=lambda files, comp: gr.update(visible=True, value=create_complete_package(files, comp)), inputs=[gr.State(), compression], # Will be connected to generated files and compression outputs=[package_download] ) # Update generate button to use correct function generate_btn.click( fn=process_with_flexible_models, inputs=[model_input_mode, single_model_name, suggested_models, multiple_model_names, model_distribution_mode, task_dropdown, prompt_mode, custom_template, multi_prompts, random_templates, random_variables, file_data_state, row_preset, custom_rows, max_length, temperature, top_p, batch_size, enable_cleaning, remove_duplicates, min_quality_score, create_splits, export_format], outputs=[dataset_preview, status_message, quality_report, quality_summary, csv_data_state, json_data_state, dataset_card_state, hf_export_state, loading_status] ) return demo def validate_models_before_generation(*args, **kwargs): # TODO: implement validation logic return None def process_with_flexible_models(input_mode, single_model, suggested_model, multiple_models, model_distribution_mode, task_type, prompt_mode, custom_template, multi_prompts, random_templates, random_variables, file_data, row_preset, custom_rows, max_length, temperature, top_p, batch_size, enable_cleaning, remove_duplicates, min_quality_score, create_splits, export_format): """Process generation with flexible model selection""" # ฟังก์ชันเลือกโมเดลที่ใช้จริง def get_selected_models(input_mode, single_model, suggested_model, multiple_models): if input_mode == "manual": return [single_model.strip()] if single_model and single_model.strip() else [] elif input_mode == "suggested": return [suggested_model] if suggested_model else [] elif input_mode == "multiple": # แยกชื่อโมเดลด้วย , และลบช่องว่าง return [m.strip() for m in multiple_models.split(",") if m.strip()] return [] # ฟังก์ชันนับจำนวนแถวข้อมูลที่ต้องการสร้าง def get_final_row_count(row_preset, custom_rows): try: if custom_rows and str(custom_rows).strip(): return int(custom_rows) return int(row_preset) except Exception: return 10 # Get selected models selected_models = get_selected_models(input_mode, single_model, suggested_model, multiple_models) if not selected_models: yield ( gr.update(visible=False), gr.update(visible=True, value="❌ กรุณาเลือกโมเดลอย่างน้อยหนึ่งตัว"), {}, "ไม่มีโมเดล", None, None, None, None, "❌ ไม่ได้เลือกโมเดล" ) return num_samples = get_final_row_count(row_preset, custom_rows) try: yield ( gr.update(visible=False), gr.update(visible=True, value=f"🔄 กำลังสร้างข้อมูล {num_samples} แถว..."), {}, "กำลังสร้าง...", None, None, None, None, f"🔄 กำลังประมวลผล..." ) # Simple generation for now model_name = selected_models[0] df, csv_data, json_data, error = generate_dataset_from_task( model_name, task_type, custom_template, file_data, num_samples, max_length, temperature, top_p ) if error: yield ( gr.update(visible=False), gr.update(visible=True, value=f"❌ เกิดข้อผิดพลาด: {error}"), {}, "เกิดข้อผิดพลาด", None, None, None, None, f"❌ {error}" ) return # Basic quality processing raw_data = df.to_dict('records') quality_report = { "total_records": len(raw_data), "models_used": selected_models } final_df = pd.DataFrame(raw_data) final_csv = final_df.to_csv(index=False) final_json = json.dumps(raw_data, indent=2, ensure_ascii=False) dataset_card = f"# Dataset generated with {model_name}\n\nRecords: {len(raw_data)}" success_msg = f"✅ สร้างข้อมูลสำเร็จ! ได้ {len(raw_data)} แถว" quality_summary = f"📊 จำนวนข้อมูล: {len(raw_data)} แถว" yield ( gr.update(visible=True, value=final_df), gr.update(visible=True, value=success_msg), quality_report, quality_summary, final_csv, final_json, dataset_card, None, "✅ เสร็จสิ้น!" ) except Exception as e: yield ( gr.update(visible=False), gr.update(visible=True, value=f"❌ ข้อผิดพลาด: {str(e)}"), {}, "เกิดข้อผิดพลาด", None, None, None, None, f"❌ {str(e)}" ) if __name__ == "__main__": demo = create_interface() demo.launch( server_name="0.0.0.0", server_port=7860, share=True )