Spaces:

JonusNattapong
/

DekGenerate

Running

File size: 66,307 Bytes

import gradio as gr
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import json
import io
import csv
from typing import List, Dict
import threading
import time
import queue
from concurrent.futures import ThreadPoolExecutor, as_completed
import asyncio

# Global model cache and loading status.
# MODEL_CACHE maps a model name to its ready text-generation pipeline.
MODEL_CACHE = {}
# MODEL_LOADING_STATUS maps a model name to a progress string; the values
# written in this file are "loading", "downloading", "loading_model",
# "creating_pipeline" and "ready". The entry is removed when a load fails.
MODEL_LOADING_STATUS = {}
# Held around accesses to the two dictionaries above by the status-tracking
# helpers defined below.
MODEL_LOADING_LOCK = threading.Lock()

def check_model_loading_status(model_names: List[str]) -> Dict:
    """Report the current load state of each requested model.

    Args:
        model_names: Model names to look up.

    Returns:
        A dict mapping each name to ``"ready"`` when it is in ``MODEL_CACHE``,
        to its entry in ``MODEL_LOADING_STATUS`` when one exists, and to
        ``"not_loaded"`` otherwise.
    """
    def _state_of(name):
        # A cached pipeline wins over whatever the status table says.
        if name in MODEL_CACHE:
            return "ready"
        return MODEL_LOADING_STATUS.get(name, "not_loaded")

    # Take one snapshot under the lock so all answers are mutually consistent.
    with MODEL_LOADING_LOCK:
        return {name: _state_of(name) for name in model_names}

def load_model_with_status_tracking(model_name: str):
    """Load a text-generation pipeline while publishing progress.

    Progress is written to ``MODEL_LOADING_STATUS`` and the finished pipeline
    is stored in ``MODEL_CACHE``; both are accessed under
    ``MODEL_LOADING_LOCK``.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        ``(generator, None)`` on success or a cache hit, ``(None, message)``
        when the model already has a status entry or when loading fails.
    """
    def _publish(stage):
        # Every status write goes through the shared lock.
        with MODEL_LOADING_LOCK:
            MODEL_LOADING_STATUS[model_name] = stage

    # Decide atomically whether this call is the one that performs the load.
    with MODEL_LOADING_LOCK:
        if model_name in MODEL_CACHE:
            return MODEL_CACHE[model_name], None

        if model_name in MODEL_LOADING_STATUS:
            return None, f"โมเดล {model_name} กำลังโหลดอยู่..."

        MODEL_LOADING_STATUS[model_name] = "loading"

    try:
        print(f"🔄 เริ่มโหลดโมเดล {model_name}...")

        _publish("downloading")
        tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        _publish("loading_model")
        lm = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

        _publish("creating_pipeline")
        text_generator = pipeline("text-generation", model=lm, tokenizer=tok)

        # Publish the pipeline and the final status in one critical section.
        with MODEL_LOADING_LOCK:
            MODEL_CACHE[model_name] = text_generator
            MODEL_LOADING_STATUS[model_name] = "ready"

        print(f"✅ โหลดโมเดล {model_name} สำเร็จ")
        return text_generator, None

    except Exception as e:
        error_msg = f"❌ ไม่สามารถโหลดโมเดล {model_name}: {str(e)}"
        print(error_msg)

        # Drop the status entry so a later call can retry the load.
        with MODEL_LOADING_LOCK:
            MODEL_LOADING_STATUS.pop(model_name, None)

        return None, error_msg

def preload_models_async(model_names: List[str], progress_callback=None):
    """Load several models on a small thread pool and collect the outcomes.

    The call blocks until every load has finished.

    Args:
        model_names: Model names to load.
        progress_callback: Optional callable invoked from the worker thread
            as ``progress_callback(model_name, "ready" | "error", error)``.

    Returns:
        A dict mapping each model name to
        ``{"generator": ..., "error": ...}``.
    """
    def _load_and_report(name):
        pipe, err = load_model_with_status_tracking(name)
        if progress_callback:
            outcome = "ready" if pipe else "error"
            progress_callback(name, outcome, err)
        return name, pipe, err

    outcomes = {}
    # At most two loads run at once.
    with ThreadPoolExecutor(max_workers=2) as pool:
        submitted = [pool.submit(_load_and_report, name) for name in model_names]

        for finished in as_completed(submitted):
            name, pipe, err = finished.result()
            outcomes[name] = {"generator": pipe, "error": err}

    return outcomes

# Predefined task templates with Thai language support.
# Each key is a task identifier; each value holds:
#   "name"        - display label (Thai with an English gloss),
#   "template"    - prompt text with {placeholder} fields that the generation
#                   functions fill from file rows or from built-in sample values,
#   "description" - short Thai description of the task.
TASK_TEMPLATES = {
    "text_generation": {
        "name": "การสร้างข้อความ (Text Generation)",
        "template": "เขียนเรื่องราวสร้างสรรค์เกี่ยวกับ {topic}",
        "description": "สร้างข้อความสร้างสรรค์ภาษาไทยจากหัวข้อที่กำหนด"
    },
    "question_answering": {
        "name": "คำถาม-คำตอบ (Question Answering)",
        "template": "คำถาม: {question}\nคำตอบ:",
        "description": "สร้างคู่คำถาม-คำตอบภาษาไทย"
    },
    "summarization": {
        "name": "การสรุปข้อความ (Text Summarization)",
        "template": "สรุปข้อความต่อไปนี้: {text}",
        "description": "สร้างตัวอย่างการสรุปข้อความภาษาไทย"
    },
    # Needs three fields: {source_lang}, {target_lang} and {text}.
    "translation": {
        "name": "การแปลภาษา (Translation)",
        "template": "แปลจาก {source_lang} เป็น {target_lang}: {text}",
        "description": "สร้างคู่ข้อมูลสำหรับการแปลภาษา"
    },
    "classification": {
        "name": "การจำแนกข้อความ (Text Classification)",
        "template": "จำแนกอารมณ์ของข้อความนี้: {text}\nอารมณ์:",
        "description": "สร้างตัวอย่างการจำแนกอารมณ์หรือหมวดหมู่ของข้อความ"
    },
    "conversation": {
        "name": "บทสนทนา (Conversation)",
        "template": "มนุษย์: {input}\nผู้ช่วย:",
        "description": "สร้างข้อมูลบทสนทนาภาษาไทย"
    },
    "instruction_following": {
        "name": "การทำตามคำสั่ง (Instruction Following)",
        "template": "คำสั่ง: {instruction}\nการตอบสนอง:",
        "description": "สร้างคู่คำสั่ง-การตอบสนองภาษาไทย"
    },
    # Needs {topic} and {style}.
    "thai_poetry": {
        "name": "กวีนิพนธ์ไทย (Thai Poetry)",
        "template": "แต่งกวีนิพนธ์เกี่ยวกับ {topic} ในรูปแบบ {style}",
        "description": "สร้างกวีนิพนธ์ไทยในรูปแบบต่างๆ"
    },
    # Needs {topic} and {category}.
    "thai_news": {
        "name": "ข่าวภาษาไทย (Thai News)",
        "template": "เขียนข่าวภาษาไทยเกี่ยวกับ {topic} ในหัวข้อ {category}",
        "description": "สร้างข้อความข่าวภาษาไทยในหมวดหมู่ต่างๆ"
    }
}

# Thai language models from Hugging Face.
# Each key is a short identifier; each value holds a display "name", the
# "model_id" string and a Thai "description".
# NOTE(review): this table is not referenced in the visible part of the file.
THAI_MODELS = {
    "typhoon-7b": {
        "name": "🌪️ Typhoon-7B (SCB10X)",
        "model_id": "scb10x/typhoon-7b",
        "description": "โมเดลภาษาไทยขนาด 7B พารามิเตอร์ ประสิทธิภาพสูง"
    },
    "openthaigpt": {
        "name": "🇹🇭 OpenThaiGPT 1.5-7B",
        "model_id": "openthaigpt/openthaigpt1.5-7b-instruct",
        "description": "โมเดลภาษาไทยรองรับคำสั่งและบทสนทนาหลายรอบ"
    },
    "wangchanlion": {
        "name": "🦁 Gemma2-9B WangchanLION",
        "model_id": "aisingapore/Gemma2-9b-WangchanLIONv2-instruct",
        "description": "โมเดลขนาด 9B รองรับไทย-อังกฤษ พัฒนาโดย AI Singapore"
    },
    "sambalingo": {
        "name": "🌍 SambaLingo-Thai-Base",
        "model_id": "sambanovasystems/SambaLingo-Thai-Base",
        "description": "โมเดลภาษาไทยพื้นฐาน รองรับทั้งไทยและอังกฤษ"
    },
    # "custom" looks like a placeholder rather than a real model id: the
    # description asks the user to name the model themselves - TODO confirm.
    "other": {
        "name": "🔧 โมเดลอื่นๆ (Custom)",
        "model_id": "custom",
        "description": "ระบุชื่อโมเดลที่ต้องการใช้งานเอง"
    }
}

def load_file_data(file_path: str) -> List[Dict]:
    """Load records from an uploaded CSV, JSON, or TXT file.

    The format is chosen from the file extension, matched
    case-insensitively so that ``DATA.CSV`` is accepted as well.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of records. CSV rows become dicts keyed by column name, each
        non-blank TXT line becomes ``{'text': line}``, and JSON content is
        returned as parsed, except that a single top-level object is wrapped
        in a one-element list so the result can always be indexed by position.

    Raises:
        Exception: With the message ``"Error reading file: ..."`` for any
            failure, including an unsupported extension. The underlying
            error is attached as ``__cause__``.
    """
    try:
        lowered = file_path.lower()

        if lowered.endswith('.csv'):
            return pd.read_csv(file_path).to_dict('records')

        if lowered.endswith('.json'):
            with open(file_path, 'r', encoding='utf-8') as f:
                payload = json.load(f)
            # Callers do file_data[i % len(file_data)]; a bare dict would be
            # indexed by key and fail, so treat it as a single record.
            return [payload] if isinstance(payload, dict) else payload

        if lowered.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                stripped_lines = (line.strip() for line in f)
                return [{'text': text} for text in stripped_lines if text]

        raise ValueError("Unsupported file format. Use CSV, JSON, or TXT files.")
    except Exception as e:
        # Keep the generic exception type callers already catch, but chain
        # the original error so the real cause stays visible in tracebacks.
        raise Exception(f"Error reading file: {str(e)}") from e

def generate_from_template(template: str, data_row: Dict) -> str:
    """Fill ``template`` with the fields of one data row.

    Args:
        template: ``str.format``-style text with ``{field}`` placeholders.
        data_row: Mapping whose entries are passed as keyword arguments.

    Returns:
        The rendered prompt, or ``"Template error: Missing field ..."`` when
        the template names a field that ``data_row`` does not provide.
    """
    try:
        rendered = template.format(**data_row)
    except KeyError as missing:
        # Report the problem in-band instead of raising.
        return f"Template error: Missing field {missing}"
    else:
        return rendered

def load_model(model_name):
    """Build a text-generation pipeline for a Hugging Face model.

    Nothing is cached: every call loads the tokenizer and the model again.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        ``(generator, None)`` on success, ``(None, error_text)`` on failure.
    """
    try:
        # The tokenizer is loaded first, then the model.
        text_generator = pipeline(
            "text-generation",
            tokenizer=AutoTokenizer.from_pretrained(model_name),
            model=AutoModelForCausalLM.from_pretrained(model_name),
        )
    except Exception as exc:
        return None, str(exc)
    return text_generator, None

def generate_dataset(model_name, prompt_template, num_samples, max_length, temperature, top_p):
    """Generate a dataset by sampling one fixed prompt repeatedly.

    Args:
        model_name: Hugging Face model identifier to load.
        prompt_template: Prompt text sent unchanged for every sample.
        num_samples: Number of samples to generate.
        max_length: ``max_length`` forwarded to the pipeline.
        temperature: Sampling temperature forwarded to the pipeline.
        top_p: Nucleus-sampling value forwarded to the pipeline.

    Returns:
        Always a 4-tuple ``(df, csv_data, json_data, error)``. On success
        ``error`` is ``None``; on failure the first three items are ``None``
        and ``error`` holds a message.
    """
    try:
        generator, error = load_model(model_name)
        if error:
            # Same 4-tuple shape as every other return path, so callers that
            # unpack four values do not crash on a model-load failure.
            return None, None, None, f"Error loading model: {error}"

        dataset = []

        for i in range(num_samples):
            generated = generator(
                prompt_template,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p,
                num_return_sequences=1,
                do_sample=True
            )

            generated_text = generated[0]['generated_text']

            dataset.append({
                'id': i + 1,
                'prompt': prompt_template,
                'generated_text': generated_text,
                # Same value as 'generated_text'; kept so the output columns
                # stay unchanged.
                'full_text': generated_text
            })

        # DataFrame for display, plus CSV/JSON text for download.
        df = pd.DataFrame(dataset)
        csv_data = df.to_csv(index=False)
        json_data = json.dumps(dataset, indent=2, ensure_ascii=False)

        return df, csv_data, json_data, None

    except Exception as e:
        return None, None, None, f"Error generating dataset: {str(e)}"

def generate_dataset_from_task(model_name, task_type, custom_template, file_data, num_samples, max_length, temperature, top_p):
    """Generate a dataset from a task template, optionally driven by file rows.

    Template choice: a non-blank ``custom_template`` wins, then the entry for
    ``task_type`` in ``TASK_TEMPLATES``, then a generic fallback.

    Args:
        model_name: Hugging Face model identifier to load.
        task_type: Key into ``TASK_TEMPLATES``; also stored in each record.
        custom_template: Optional template overriding the task template.
        file_data: Optional list of row dicts, reused cyclically when
            ``num_samples`` exceeds its length.
        num_samples: Number of samples to generate.
        max_length: ``max_length`` forwarded to the pipeline.
        temperature: Sampling temperature forwarded to the pipeline.
        top_p: Nucleus-sampling value forwarded to the pipeline.

    Returns:
        Always a 4-tuple ``(df, csv_data, json_data, error)``. On success
        ``error`` is ``None``; on failure the first three items are ``None``.
    """
    try:
        generator, error = load_model(model_name)
        if error:
            # Same 4-tuple shape as every other return path.
            return None, None, None, f"Error loading model: {error}"

        dataset = []

        # Determine the template to use
        if custom_template and custom_template.strip():
            template = custom_template
        elif task_type in TASK_TEMPLATES:
            template = TASK_TEMPLATES[task_type]["template"]
        else:
            template = "Generate text: {input}"

        for i in range(num_samples):
            if file_data and len(file_data) > 0:
                # Use file data cyclically
                data_row = file_data[i % len(file_data)]
                prompt = generate_from_template(template, data_row)
            else:
                data_row = None
                # No source rows: substitute sample values. Every placeholder
                # used by the predefined templates is covered so that no raw
                # "{name}" text reaches the model.
                sample_values = {
                    "{topic}": "artificial intelligence",
                    "{question}": "What is machine learning?",
                    "{text}": "Sample text for processing",
                    "{input}": f"Sample input {i+1}",
                    "{instruction}": f"Complete this task {i+1}",
                    "{category}": "technology",
                    "{style}": "free verse",
                    "{source_lang}": "Thai",
                    "{target_lang}": "English",
                }
                prompt = template
                for placeholder, value in sample_values.items():
                    prompt = prompt.replace(placeholder, value)

            generated = generator(
                prompt,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p,
                num_return_sequences=1,
                do_sample=True,
                pad_token_id=generator.tokenizer.eos_token_id
            )

            generated_text = generated[0]['generated_text']

            dataset.append({
                'id': i + 1,
                'task_type': task_type,
                'prompt': prompt,
                'generated_text': generated_text,
                'original_data': data_row
            })

        # DataFrame for display, plus CSV/JSON text for download.
        df = pd.DataFrame(dataset)
        csv_data = df.to_csv(index=False)
        json_data = json.dumps(dataset, indent=2, ensure_ascii=False)

        return df, csv_data, json_data, None

    except Exception as e:
        return None, None, None, f"Error generating dataset: {str(e)}"

# Multi-model generation status tracking
class ModelStatus:
    """Shared bookkeeping for workers that split one set of records.

    ``record_status`` maps a record id to ``{"status": ..., "model": ...}``.
    A record that is missing from the map, or whose status is ``"pending"``,
    is available to be claimed. All methods take ``self.lock``.
    """

    def __init__(self):
        self.models = {}
        self.record_status = {}  # record_id: {"status": "pending/processing/completed", "model": "model_name"}
        self.completed_records = []
        self.lock = threading.Lock()
        # record_id -> names of the models that have already claimed it.
        # Stops a model from being handed a record it already tried.
        self._attempted = {}

    def set_record_processing(self, record_id: int, model_name: str):
        """Mark ``record_id`` as being processed by ``model_name``."""
        with self.lock:
            self.record_status[record_id] = {"status": "processing", "model": model_name}
            self._attempted.setdefault(record_id, set()).add(model_name)

    def set_record_completed(self, record_id: int, result: dict):
        """Mark ``record_id`` completed and store its result."""
        with self.lock:
            # Tolerate a record that was never claimed through this tracker.
            entry = self.record_status.setdefault(
                record_id, {"status": "pending", "model": None}
            )
            entry["status"] = "completed"
            self.completed_records.append(result)

    def get_next_available_record(self, total_records: int, model_name: str) -> int:
        """Atomically claim the lowest available record for ``model_name``.

        The claim is recorded as ``"processing"`` right away. Recording it as
        ``"pending"`` would leave the record looking available, and a second
        worker could claim it too. A record this model has claimed before is
        skipped, so a record put back to ``"pending"`` after a failure goes
        to a different model rather than to the same one again.

        Returns:
            The claimed record id, or ``-1`` when nothing is left for this
            model.
        """
        with self.lock:
            for i in range(total_records):
                entry = self.record_status.get(i)
                if entry is not None and entry["status"] != "pending":
                    continue
                tried_by = self._attempted.setdefault(i, set())
                if model_name in tried_by:
                    continue
                tried_by.add(model_name)
                self.record_status[i] = {"status": "processing", "model": model_name}
                return i
            return -1  # No available records

    def get_progress(self, total_records: int) -> dict:
        """Return completed/processing counts and the completion percentage."""
        with self.lock:
            states = [entry["status"] for entry in self.record_status.values()]
            completed = states.count("completed")
            processing = states.count("processing")
            return {
                "completed": completed,
                "processing": processing,
                "total": total_records,
                "percentage": (completed / total_records * 100) if total_records > 0 else 0
            }

def load_model_with_cache(model_name: str, cache: dict):
    """Return a text-generation pipeline, reusing ``cache`` when possible.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        cache: Dict mapping model names to pipelines; a newly loaded
            pipeline is stored in it.

    Returns:
        ``(generator, None)`` on success or a cache hit, ``(None, message)``
        when loading fails.
    """
    if model_name in cache:
        return cache[model_name], None

    try:
        print(f"🔄 กำลังโหลดโมเดล {model_name}...")

        # Only model names containing "typhoon" are loaded with
        # trust_remote_code enabled; all other arguments are the same.
        if "typhoon" in model_name.lower():
            remote_code_kwargs = {"trust_remote_code": True}
        else:
            remote_code_kwargs = {}

        tok = AutoTokenizer.from_pretrained(model_name, **remote_code_kwargs)
        lm = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,  # half precision
            device_map="auto",
            **remote_code_kwargs,
        )

        text_generator = pipeline("text-generation", model=lm, tokenizer=tok)
        cache[model_name] = text_generator
        print(f"✅ โหลดโมเดล {model_name} สำเร็จ")
        return text_generator, None

    except Exception as e:
        error_msg = f"❌ ไม่สามารถโหลดโมเดล {model_name}: {str(e)}"
        print(error_msg)
        return None, error_msg

def generate_single_record(generator, prompt: str, record_id: int, model_name: str, 
                          max_length: int, temperature: float, top_p: float, 
                          task_type: str, original_data: dict, status_tracker: ModelStatus):
    """Generate one record with the given pipeline and update the tracker.

    Args:
        generator: Text-generation pipeline; called with the prompt and
            expected to expose a ``tokenizer`` attribute.
        prompt: Prompt text for this record.
        record_id: Zero-based record index; stored as ``id = record_id + 1``.
        model_name: Name recorded in the result as ``model_used``.
        max_length: ``max_length`` forwarded to the pipeline.
        temperature: Sampling temperature forwarded to the pipeline.
        top_p: Nucleus-sampling value forwarded to the pipeline.
        task_type: Task identifier copied into the result.
        original_data: Source row for this record, or ``None``.
        status_tracker: Shared tracker that is updated with the outcome.

    Returns:
        The result dict on success, or ``None`` when generation fails. On
        failure the record is put back to ``"pending"``.
    """
    try:
        # Mark record as processing
        status_tracker.set_record_processing(record_id, model_name)

        # Prefer the EOS id for padding and fall back to the tokenizer's own
        # pad id only when the EOS id is actually unset. Testing for the
        # attribute's existence is not enough, since it can exist and be None.
        tokenizer = generator.tokenizer
        pad_token_id = getattr(tokenizer, 'eos_token_id', None)
        if pad_token_id is None:
            pad_token_id = getattr(tokenizer, 'pad_token_id', None)

        generated = generator(
            prompt,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            num_return_sequences=1,
            do_sample=True,
            pad_token_id=pad_token_id
        )

        generated_text = generated[0]['generated_text']

        result = {
            'id': record_id + 1,
            'model_used': model_name,
            'task_type': task_type,
            'prompt': prompt,
            'generated_text': generated_text,
            'original_data': original_data,
            # Wall-clock timestamp of completion, not a duration.
            'generation_time': time.time()
        }

        # Mark record as completed
        status_tracker.set_record_completed(record_id, result)
        return result

    except Exception as e:
        # Report the failure instead of dropping it silently.
        print(f"⚠️ {model_name} failed on record {record_id + 1}: {str(e)}")
        # If generation fails, mark as pending again for other models to try
        with status_tracker.lock:
            if record_id in status_tracker.record_status:
                status_tracker.record_status[record_id]["status"] = "pending"
        return None

def model_worker(model_name: str, model_cache: dict, prompts: List[str], 
                task_type: str, original_data_list: List[dict], 
                max_length: int, temperature: float, top_p: float,
                status_tracker: ModelStatus, progress_callback=None):
    """Worker loop: load one model, then process records until none remain.

    Args:
        model_name: Model to load and generate with.
        model_cache: Dict used to cache loaded pipelines.
        prompts: One prompt per record, indexed by record id.
        task_type: Task identifier stored in each result.
        original_data_list: Source rows aligned with ``prompts``; may be
            empty or ``None``.
        max_length: ``max_length`` forwarded to the pipeline.
        temperature: Sampling temperature forwarded to the pipeline.
        top_p: Nucleus-sampling value forwarded to the pipeline.
        status_tracker: Shared tracker that hands out record ids.
        progress_callback: Optional callable invoked after each attempt as
            ``progress_callback(progress, model_name, processed_count)``.

    Returns:
        A summary string, or an error string when the model cannot be loaded.
    """
    # Load model
    generator, error = load_model_with_cache(model_name, model_cache)
    if error:
        return f"Error loading {model_name}: {error}"

    # A failed record is put back to "pending" and can be handed straight
    # back to this worker. Without a limit, a record that always fails would
    # keep this loop running forever.
    max_attempts_per_record = 2
    failure_counts = {}

    total_records = len(prompts)
    processed_count = 0

    while True:
        # Get next available record
        record_id = status_tracker.get_next_available_record(total_records, model_name)

        if record_id == -1:  # No more records available
            break

        prompt = prompts[record_id]
        original_data = original_data_list[record_id] if original_data_list else None

        result = generate_single_record(
            generator, prompt, record_id, model_name,
            max_length, temperature, top_p, task_type, 
            original_data, status_tracker
        )

        if result:
            processed_count += 1
        else:
            failure_counts[record_id] = failure_counts.get(record_id, 0) + 1
            if failure_counts[record_id] >= max_attempts_per_record:
                with status_tracker.lock:
                    entry = status_tracker.record_status.get(record_id)
                    # Retire the record only while it is still unclaimed;
                    # another worker may have picked it up in the meantime.
                    if entry is not None and entry.get("status") == "pending":
                        entry["status"] = "failed"

        # Update progress
        if progress_callback:
            progress = status_tracker.get_progress(total_records)
            progress_callback(progress, model_name, processed_count)

    return f"{model_name}: Processed {processed_count} records"

def generate_dataset_multi_model(selected_models: List[str], task_type: str, custom_template: str, 
                                file_data: List[dict], num_samples: int, max_length: int, 
                                temperature: float, top_p: float, progress_callback=None):
    """Generate a dataset with several models sharing the work.

    One worker thread per model takes records from a shared ``ModelStatus``
    tracker until none remain.

    Args:
        selected_models: Model names; one worker thread is started for each.
        task_type: Key into ``TASK_TEMPLATES``; also stored in each record.
        custom_template: Optional template overriding the task template.
        file_data: Optional list of row dicts, reused cyclically.
        num_samples: Number of records to generate.
        max_length: ``max_length`` forwarded to the pipeline.
        temperature: Sampling temperature forwarded to the pipeline.
        top_p: Nucleus-sampling value forwarded to the pipeline.
        progress_callback: Optional callable passed through to the workers.

    Returns:
        Always a 4-tuple ``(df, csv_data, json_data, error)``. On success
        ``error`` is ``None``; on failure the first three items are ``None``.
    """
    try:
        # Fail early with a clear message; otherwise ThreadPoolExecutor
        # rejects max_workers=0 with an unrelated-looking error.
        if not selected_models:
            return None, None, None, "Error in multi-model generation: no models selected"

        prompts = []
        original_data_list = []

        # Determine template
        if custom_template and custom_template.strip():
            template = custom_template
        elif task_type in TASK_TEMPLATES:
            template = TASK_TEMPLATES[task_type]["template"]
        else:
            template = "Generate text: {input}"

        # Generate prompts for all records
        for i in range(num_samples):
            if file_data and len(file_data) > 0:
                data_row = file_data[i % len(file_data)]
                prompt = generate_from_template(template, data_row)
                original_data_list.append(data_row)
            else:
                # No source rows: substitute sample values. The translation
                # placeholders are filled too, so that template does not
                # reach the model with raw "{source_lang}" text in it.
                sample_values = {
                    "{topic}": f"หัวข้อที่ {i+1}",
                    "{question}": f"คำถามที่ {i+1} เกี่ยวกับการเรียนรู้ของเครื่อง",
                    "{text}": f"ข้อความตัวอย่างที่ {i+1} สำหรับการประมวลผล",
                    "{input}": f"ข้อมูลนำเข้าที่ {i+1}",
                    "{instruction}": f"คำสั่งที่ {i+1}: ให้ทำงานนี้",
                    "{category}": "เทคโนโลยี",
                    "{style}": "โคลงสี่สุภาพ",
                    "{source_lang}": "ไทย",
                    "{target_lang}": "อังกฤษ",
                }
                prompt = template
                for placeholder, value in sample_values.items():
                    prompt = prompt.replace(placeholder, value)
                original_data_list.append(None)

            prompts.append(prompt)

        # Initialize status tracker
        status_tracker = ModelStatus()
        model_cache = {}

        # Start worker threads for each model
        with ThreadPoolExecutor(max_workers=len(selected_models)) as executor:
            futures = []

            for model_name in selected_models:
                future = executor.submit(
                    model_worker, model_name, model_cache, prompts,
                    task_type, original_data_list, max_length, 
                    temperature, top_p, status_tracker, progress_callback
                )
                futures.append((future, model_name))

            # NOTE: the timeout only limits how long this loop waits on each
            # future. Leaving the ``with`` block still waits for every worker
            # thread to finish.
            for future, model_name in futures:
                try:
                    result = future.result(timeout=300)
                    print(f"Model {model_name} completed: {result}")
                except Exception as e:
                    print(f"Model {model_name} failed: {str(e)}")

        # Collect results in record order
        dataset = sorted(status_tracker.completed_records, key=lambda x: x['id'])

        if not dataset:
            return None, None, None, "ไม่สามารถสร้างข้อมูลได้"

        # DataFrame for display, plus CSV/JSON text for download.
        df = pd.DataFrame(dataset)
        csv_data = df.to_csv(index=False)
        json_data = json.dumps(dataset, indent=2, ensure_ascii=False)

        return df, csv_data, json_data, None

    except Exception as e:
        return None, None, None, f"Error in multi-model generation: {str(e)}"

def create_interface():
    with gr.Blocks(title="🇹🇭 Thai Dataset Generator", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🤗 เครื่องมือสร้างชุดข้อมูลภาษาไทยคุณภาพสูง")
        gr.Markdown("⚡ **เคล็ดลับ**: ใช้โมเดลใดก็ได้จาก Hugging Face - เริ่มต้นด้วยโมเดลเล็กๆ เพื่อทดสอบก่อน")
        
        with gr.Row():
            with gr.Column():
                # Flexible model input
                gr.Markdown("### 🤖 เลือกโมเดลจาก Hugging Face")
                gr.Markdown("💡 **คำแนะนำ**: ใส่ชื่อโมเดลจาก [Hugging Face](https://huggingface.co/models) เช่น `microsoft/DialoGPT-small`, `gpt2`, `scb10x/typhoon-7b`")
                
                model_input_mode = gr.Radio(
                    choices=[
                        ("📝 ใส่ชื่อโมเดลเอง", "manual"),
                        ("📋 เลือกจากรายการแนะนำ", "suggested"),
                        ("🔀 ใช้หลายโมเดลพร้อมกัน", "multiple")
                    ],
                    value="manual",
                    label="วิธีการเลือกโมเดล"
                )
                
                # Manual model input
                manual_model_group = gr.Group(visible=True)
                with manual_model_group:
                    single_model_name = gr.Textbox(
                        label="ชื่อโมเดลจาก Hugging Face",
                        value="microsoft/DialoGPT-small",
                        placeholder="เช่น gpt2, microsoft/DialoGPT-medium, scb10x/typhoon-7b",
                        info="ใส่ชื่อโมเดลที่ต้องการใช้งาน"
                    )
                    
                    model_verification = gr.Button("🔍 ตรวจสอบโมเดล", variant="secondary", size="sm")
                    model_download = gr.Button("⬇️ ดาวน์โหลดโมเดล", variant="secondary", size="sm")
                    model_status = gr.Textbox(
                        label="สถานะโมเดล",
                        value="ยังไม่ได้ตรวจสอบ",
                        interactive=False
                    )

                    # เชื่อมปุ่มตรวจสอบโมเดลกับฟังก์ชันตรวจสอบ
                    def verify_model(model_name):
                        from transformers import AutoTokenizer
                        try:
                            # ลองโหลด tokenizer (เร็วกว่าโหลด model)
                            AutoTokenizer.from_pretrained(model_name)
                            return gr.update(value=f"✅ พบโมเดล {model_name} ใน Hugging Face", interactive=False)
                        except Exception as e:
                            return gr.update(value=f"❌ ไม่พบโมเดลหรือโหลดไม่ได้: {str(e)}", interactive=False)

                    model_verification.click(
                        fn=verify_model,
                        inputs=[single_model_name],
                        outputs=[model_status]
                    )

                    # ปุ่มดาวน์โหลดโมเดล (preload)
                    def download_model(model_name):
                        import time
                        from transformers import AutoTokenizer, AutoModelForCausalLM
                        try:
                            t0 = time.time()
                            model_status_msg = f"⏳ กำลังดาวน์โหลดและโหลดโมเดล {model_name} ..."
                            yield gr.update(value=model_status_msg, interactive=False)
                            # โหลด tokenizer และ model
                            tokenizer = AutoTokenizer.from_pretrained(model_name)
                            model = AutoModelForCausalLM.from_pretrained(model_name)
                            t1 = time.time()
                            msg = f"✅ โหลดโมเดล {model_name} สำเร็จใน {t1-t0:.1f} วินาที"
                            yield gr.update(value=msg, interactive=False)
                        except Exception as e:
                            yield gr.update(value=f"❌ ไม่สามารถโหลดโมเดล: {str(e)}", interactive=False)

                    model_download.click(
                        fn=download_model,
                        inputs=[single_model_name],
                        outputs=[model_status]
                    )
                
                # Suggested models
                suggested_model_group = gr.Group(visible=False)
                with suggested_model_group:
                    gr.Markdown("#### โมเดลแนะนำ")
                    
                    suggested_models = gr.Dropdown(
                        choices=[
                            # Small/Fast models
                            ("⚡ DistilGPT2 (เล็ก, เร็ว)", "distilgpt2"),
                            ("⚡ GPT2 (กลาง)", "gpt2"),
                            ("⚡ DialoGPT-small (บทสนทนา)", "microsoft/DialoGPT-small"),
                            ("⚡ DialoGPT-medium (บทสนทนา)", "microsoft/DialoGPT-medium"),
                            
                            # Thai models
                            ("🇹🇭 Typhoon-7B (ไทย, ใหญ่)", "scb10x/typhoon-7b"),
                            ("🇹🇭 OpenThaiGPT-1.5-7B (ไทย)", "openthaigpt/openthaigpt1.5-7b-instruct"),
                            ("🇹🇭 WangchanLION-7B (ไทย)", "aisingapore/llama2-7b-chat-thai"),
                            
                            # Multilingual models
                            ("🌍 mGPT (หลายภาษา)", "ai-forever/mGPT"),
                            ("🌍 Bloom-560m (หลายภาษา, เล็ก)", "bigscience/bloom-560m"),
                            ("🌍 Bloom-1b1 (หลายภาษา)", "bigscience/bloom-1b1"),
                            
                            # Instruction-following
                            ("🎯 Flan-T5-small (คำสั่ง)", "google/flan-t5-small"),
                            ("🎯 Flan-T5-base (คำสั่ง)", "google/flan-t5-base"),
                            
                            # Other popular models
                            ("🔥 OPT-350m (Meta)", "facebook/opt-350m"),
                            ("🔥 OPT-1.3b (Meta)", "facebook/opt-1.3b"),
                        ],
                        value="distilgpt2",
                        label="เลือกโมเดลแนะนำ"
                    )
                
                # Multiple models
                multiple_model_group = gr.Group(visible=False)
                with multiple_model_group:
                    multiple_model_names = gr.Textbox(
                        label="ชื่อโมเดลหลายตัว (แยกด้วยเครื่องหมายจุลภาค)",
                        value="distilgpt2, microsoft/DialoGPT-small",
                        placeholder="gpt2, microsoft/DialoGPT-medium, scb10x/typhoon-7b",
                        lines=3,
                        info="ใส่ชื่อโมเดลหลายตัวแยกด้วยเครื่องหมายจุลภาค"
                    )
                    
                    model_distribution_mode = gr.Radio(
                        choices=[
                            ("🔄 แบ่งงานกัน (Collaborative)", "collaborative"),
                            ("🎲 สุ่มเลือก (Random)", "random"),
                            ("📊 เท่าๆ กัน (Round-robin)", "round_robin")
                        ],
                        value="collaborative",
                        label="วิธีการใช้โมเดลหลายตัว"
                    )
                
                # Model info display
                current_models_display = gr.Textbox(
                    label="โมเดลที่จะใช้",
                    value="microsoft/DialoGPT-small",
                    interactive=False
                )
                
                # Task selection with Thai tasks
                gr.Markdown("### 📝 เลือกประเภทงาน")
                task_dropdown = gr.Dropdown(
                    choices=[(v["name"], k) for k, v in TASK_TEMPLATES.items()],
                    value="text_generation",
                    label="ประเภทงานที่ต้องการ"
                )
                
                task_description = gr.Textbox(
                    label="คำอธิบายงาน",
                    value=TASK_TEMPLATES["text_generation"]["description"],
                    interactive=False
                )
                
                # File upload section
                gr.Markdown("### 📁 อัปโหลดข้อมูลต้นฉบับ (ไม่บังคับ)")
                gr.Markdown("อัปโหลดไฟล์ CSV, JSON หรือ TXT ที่มีข้อมูลต้นฉบับภาษาไทย")
                file_upload = gr.File(
                    label="อัปโหลดไฟล์ข้อมูล",
                    file_types=[".csv", ".json", ".txt"]
                )
                
                file_preview = gr.Dataframe(
                    label="ตัวอย่างข้อมูลจากไฟล์ (5 แถวแรก)",
                    visible=False
                )
                # State สำหรับเก็บข้อมูลไฟล์ (ต้องอยู่ก่อนใช้งาน)
                file_data_state = gr.State()

                # ฟังก์ชัน handle file upload
                def handle_file_upload(file):
                    import pandas as pd
                    import json
                    if file is None:
                        return gr.update(visible=False), None
                    try:
                        if file.name.endswith('.csv'):
                            df = pd.read_csv(file.name)
                        elif file.name.endswith('.json'):
                            with open(file.name, 'r', encoding='utf-8') as f:
                                data = json.load(f)
                            df = pd.DataFrame(data)
                        elif file.name.endswith('.txt'):
                            with open(file.name, 'r', encoding='utf-8') as f:
                                lines = f.readlines()
                            df = pd.DataFrame({'text': [line.strip() for line in lines if line.strip()]})
                        else:
                            return gr.update(visible=True, value="ไม่รองรับไฟล์นี้"), None
                        preview = df.head(5)
                        # คืน preview และข้อมูลทั้งหมด (list of dict)
                        return gr.update(visible=True, value=preview), df.to_dict('records')
                    except Exception as e:
                        return gr.update(visible=True, value=f"❌ อ่านไฟล์ผิดพลาด: {str(e)}"), None

                file_upload.change(
                    fn=handle_file_upload,
                    inputs=[file_upload],
                    outputs=[file_preview, file_data_state]
                )
                
                # Template customization with multi-prompt support
                gr.Markdown("### 🎯 ปรับแต่งเทมเพลตและ Prompt")
                gr.Markdown("ใช้ {ชื่อฟิลด์} สำหรับตัวแปรในเทมเพลต")
                
                prompt_mode = gr.Radio(
                    choices=[
                        ("📝 Prompt เดียว (Single)", "single"),
                        ("📋 หลาย Prompt (Multiple)", "multiple"),
                        ("🎲 สุ่มจาก Template (Random)", "random")
                    ],
                    value="single",
                    label="โหมดการใส่ Prompt"
                )
                
                # Single prompt mode
                single_prompt_group = gr.Group(visible=True)
                with single_prompt_group:
                    template_display = gr.Textbox(
                        label="เทมเพลตปัจจุบัน",
                        value=TASK_TEMPLATES["text_generation"]["template"],
                        interactive=False
                    )
                    
                    custom_template = gr.Textbox(
                        label="เทมเพลตกำหนดเอง (ไม่บังคับ)",
                        lines=3,
                        placeholder="สร้างเทมเพลตของคุณเองที่นี่..."
                    )
                
                # Multiple prompts mode
                multi_prompt_group = gr.Group(visible=False)
                with multi_prompt_group:
                    gr.Markdown("#### 📋 ใส่หลาย Prompt (แต่ละบรรทัดคือ prompt หนึ่งตัว)")
                    
                    multi_prompts = gr.Textbox(
                        label="Prompts หลายตัว (แยกด้วยการขึ้นบรรทัดใหม่)",
                        lines=10,
                        placeholder="""เขียนเรื่องราวเกี่ยวกับการผจญภัยในป่า
สร้างบทสนทนาระหว่างครูกับนักเรียน
อธิบายวิธีการทำอาหารไทย
เขียนบทกวีเกี่ยวกับธรรมชาติ
สร้างเรื่องสั้นเกี่ยวกับมิตรภาพ"""
                    )
                    
                    prompt_distribution = gr.Radio(
                        choices=[
                            ("📊 กระจายเท่าๆ กัน", "equal"),
                            ("🎯 ตามสัดส่วนที่กำหนด", "weighted"),
                            ("🎲 สุ่ม", "random")
                        ],
                        value="equal",
                        label="วิธีการกระจาย Prompt"
                    )
                    
                    prompt_weights = gr.Textbox(
                        label="น้ำหนักของแต่ละ Prompt (เช่น 2,1,3,1,2)",
                        placeholder="2,1,3,1,2",
                        visible=False
                    )
                
                # Random template mode
                random_prompt_group = gr.Group(visible=False)
                with random_prompt_group:
                    gr.Markdown("#### 🎲 สุ่ม Prompt จาก Template ที่เลือก")
                    
                    random_templates = gr.CheckboxGroup(
                        choices=[(v["name"], k) for k, v in TASK_TEMPLATES.items()],
                        value=["text_generation", "conversation"],
                        label="เลือก Template ที่จะสุ่ม"
                    )
                    
                    random_variables = gr.Textbox(
                        label="ตัวแปรสำหรับสุ่ม (JSON format)",
                        lines=5,
                        value="""{
    "topic": ["การเดินทาง", "เทคโนโลยี", "อาหาร", "ธรรมชาติ", "ศิลปะ"],
    "question": ["AI คืออะไร", "โลกร้อนคืออะไร", "การศึกษาสำคัญอย่างไร"],
    "instruction": ["เขียนบทความ", "สรุปข้อมูล", "วิเคราะห์ปัญหา"]
}""",
                        placeholder="ใส่ตัวแปรในรูปแบบ JSON"
                    )
                
                # Prompt preview and count
                prompt_preview = gr.Textbox(
                    label="ตัวอย่าง Prompt ที่จะใช้",
                    lines=3,
                    interactive=False
                )
                
                prompt_count = gr.Textbox(
                    label="จำนวน Prompt ที่พร้อมใช้",
                    value="1 prompt",
                    interactive=False
                )

                # State that holds the uploaded file's data.
                # NOTE(review): `file_upload.change(...)` earlier in this function
                # already names `file_data_state` in its outputs, i.e. before this
                # assignment runs — verify the ordering and keep a single State.
                file_data_state = gr.State()

                # ตัวเลือกจำนวนแถวข้อมูล (row_preset)
                row_preset = gr.Dropdown(
                    choices=[
                        ("10 แถว", 10),
                        ("100 แถว", 100),
                        ("500 แถว", 500),
                        ("1000 แถว", 1000)
                    ],
                    value=10,
                    label="จำนวนแถวข้อมูลที่ต้องการสร้าง"
                )

                # กำหนดจำนวนแถวเอง (custom_rows)
                custom_rows = gr.Textbox(
                    label="จำนวนแถวกำหนดเอง (ถ้าเว้นว่างจะใช้ค่าจากด้านบน)",
                    placeholder="ใส่ตัวเลข เช่น 123"
                )

                # ตัวเลือกการตั้งค่าการสร้างข้อความ
                max_length = gr.Slider(
                    minimum=16,
                    maximum=2048,
                    value=128,
                    step=1,
                    label="ความยาวสูงสุดของข้อความที่สร้าง (max_length)"
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=1.0,
                    step=0.05,
                    label="Temperature (ความสุ่ม)"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.01,
                    label="Top-p (nucleus sampling)"
                )
                batch_size = gr.Slider(
                    minimum=1,
                    maximum=32,
                    value=1,
                    step=1,
                    label="Batch size"
                )

                # ปุ่มสร้างข้อมูล
                generate_btn = gr.Button("🚀 สร้างข้อมูล", variant="primary")
        
                # Data Quality Settings
                gr.Markdown("### 🧼 การจัดการคุณภาพข้อมูล")
                
                enable_cleaning = gr.Checkbox(
                    label="เปิดใช้การทำความสะอาดข้อมูล",
                    value=True
                )
                
                remove_duplicates = gr.Checkbox(
                    label="ลบข้อมูลซ้ำซ้อน",
                    value=True
                )
                
                min_quality_score = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.5,
                    step=0.1,
                    label="คะแนนคุณภาพขั้นต่ำ (0-1)"
                )

                # ตัวเลือกแยกชุดข้อมูล (train/val/test split)
                create_splits = gr.Checkbox(
                    label="แยกชุดข้อมูลเป็น train/val/test",
                    value=False
                )
                
                # Export Settings
                gr.Markdown("### 📦 การส่งออกข้อมูล")
                
                export_format = gr.CheckboxGroup(
                    choices=[
                        ("📊 CSV (Excel, Spreadsheet)", "csv"),
                        ("📋 JSON (Web APIs, General)", "json"), 
                        ("📄 JSONL (Fine-tuning, Streaming)", "jsonl"),
                        ("🤗 Hugging Face Dataset (Complete Package)", "huggingface"),
                        ("📝 TXT (Simple Text)", "txt"),
                        ("🗃️ Parquet (Big Data, Analytics)", "parquet"),
                        ("📋 TSV (Tab-separated)", "tsv"),
                        ("🎯 Custom Format", "custom")
                    ],
                    value=["csv", "json"],
                    label="เลือกรูปแบบไฟล์ที่ต้องการ (สามารถเลือกหลายแบบ)"
                )
                
                # Custom format settings
                custom_format_group = gr.Group(visible=False)
                with custom_format_group:
                    gr.Markdown("#### 🎯 การตั้งค่ารูปแบบกำหนดเอง")
                    
                    custom_template_format = gr.Textbox(
                        label="Template สำหรับแต่ละ record",
                        value="Input: {input}\nOutput: {output}\n---",
                        lines=3,
                        placeholder="ใช้ {field_name} สำหรับข้อมูล"
                    )
                    
                    custom_file_extension = gr.Textbox(
                        label="นามสกุลไฟล์",
                        value="txt",
                        placeholder="เช่น txt, md, xml"
                    )
                
                # Advanced export options
                with gr.Accordion("⚙️ ตัวเลือกขั้นสูง", open=False):
                    include_metadata = gr.Checkbox(
                        label="รวม Metadata (model_used, timestamp, etc.)",
                        value=True
                    )
                    
                    include_quality_score = gr.Checkbox(
                        label="รวม Quality Score",
                        value=True
                    )
                    
                    file_naming_pattern = gr.Textbox(
                        label="รูปแบบชื่อไฟล์",
                        value="thai_dataset_{task}_{timestamp}",
                        placeholder="ใช้ {task}, {timestamp}, {model}, {count}"
                    )
                    
                    compression = gr.Radio(
                        choices=[
                            ("ไม่บีบอัด", "none"),
                            ("ZIP", "zip"),
                            ("GZIP", "gzip")
                        ],
                        value="none",
                        label="การบีบอัดไฟล์"
                    )
                
                # ...existing code...
        
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("📊 ตัวอย่างข้อมูล"):
                    dataset_preview = gr.Dataframe(
                        headers=["id", "task_type", "input", "output", "quality_score"],
                        interactive=False
                    )
                    status_message = gr.Markdown(
                        value="",
                        visible=True
                    )
                    # State สำหรับข้อมูลที่สร้าง
                    csv_data_state = gr.State()
                    json_data_state = gr.State()
                    dataset_card_state = gr.State()
                    hf_export_state = gr.State()
                    loading_status = gr.State()
                
                with gr.TabItem("📈 รายงานคุณภาพ"):
                    quality_report = gr.JSON(
                        label="รายงานคุณภาพข้อมูล",
                        visible=True
                    )
                    
                    quality_summary = gr.Markdown(
                        value="สร้างข้อมูลเสร็จแล้วจึงจะแสดงรายงานคุณภาพ"
                    )
                
                with gr.TabItem("💾 ดาวน์โหลด"):
                    gr.Markdown("### 💾 ดาวน์โหลดชุดข้อมูลในรูปแบบต่างๆ")
                    
                    download_status = gr.Markdown("สร้างข้อมูลเสร็จแล้วจึงจะสามารถดาวน์โหลดได้")
                    
                    # Dynamic download buttons based on selected formats
                    download_buttons = {}
                    download_files = {}
                    
                    with gr.Row():
                        csv_btn = gr.Button("📊 CSV", variant="secondary", visible=False)
                        json_btn = gr.Button("📋 JSON", variant="secondary", visible=False)
                        jsonl_btn = gr.Button("📄 JSONL", variant="secondary", visible=False)
                        txt_btn = gr.Button("📝 TXT", variant="secondary", visible=False)
                    
                    with gr.Row():
                        parquet_btn = gr.Button("🗃️ Parquet", variant="secondary", visible=False)
                        tsv_btn = gr.Button("📋 TSV", variant="secondary", visible=False)
                        hf_btn = gr.Button("🤗 HF Dataset", variant="secondary", visible=False)
                        custom_btn = gr.Button("🎯 Custom", variant="secondary", visible=False)
                    
                    # Download files
                    csv_download = gr.File(label="CSV File", visible=False)
                    json_download = gr.File(label="JSON File", visible=False)
                    jsonl_download = gr.File(label="JSONL File", visible=False)
                    txt_download = gr.File(label="TXT File", visible=False)
                    parquet_download = gr.File(label="Parquet File", visible=False)
                    tsv_download = gr.File(label="TSV File", visible=False)
                    hf_download = gr.File(label="HF Dataset Package", visible=False)
                    custom_download = gr.File(label="Custom Format", visible=False)
                    
                    # All formats in one package
                    with gr.Row():
                        package_btn = gr.Button("📦 ดาวน์โหลดทั้งหมด (ZIP)", variant="primary")
                        package_download = gr.File(label="Complete Package", visible=False)
        
        # ...existing code for states...
        
        def update_export_format_visibility(selected_formats):
            """Return one visibility update per download control.

            The result has nine entries, in this order: the csv, json, jsonl, txt,
            parquet, tsv, huggingface and custom buttons, followed by a second
            "custom" entry for the custom-format settings group. Each entry is
            visible only when its format key is in ``selected_formats``.
            """
            # The trailing duplicate "custom" drives the custom-format settings group.
            ordered_keys = (
                "csv", "json", "jsonl", "txt",
                "parquet", "tsv", "huggingface", "custom",
                "custom",
            )
            return [gr.update(visible=(key in selected_formats)) for key in ordered_keys]
        
        def generate_multiple_formats(data, selected_formats, include_metadata, include_quality_score,
                                    file_naming_pattern, custom_template_format, custom_file_extension,
                                    task_type, compression):
            """Render generated records into every requested export format.

            Args:
                data: List of record dicts. ``prompt``, ``generated_text``,
                    ``model_used``, ``task_type``, ``generation_time`` and
                    ``quality_score`` are read when present.
                selected_formats: Iterable of format keys. Handled keys are
                    ``csv``, ``json``, ``jsonl``, ``txt``, ``tsv``, ``parquet``
                    and ``custom``; any other key (including ``huggingface``)
                    produces no entry. ``None`` is treated as empty.
                include_metadata: When true, add a nested ``metadata`` dict
                    (model_used, task_type, timestamp) to every exported record.
                include_quality_score: When true, copy ``quality_score`` for
                    records that have one.
                file_naming_pattern: ``str.format``-style pattern for the base file
                    name; ``{task}``, ``{timestamp}``, ``{model}`` and ``{count}``
                    are substituted, unknown placeholders are left as written.
                custom_template_format: ``str.format``-style template applied to
                    each exported record for the ``custom`` format; unknown
                    placeholders are left as written.
                custom_file_extension: Extension of the ``custom`` file; a leading
                    dot is ignored and an empty value falls back to ``txt``.
                task_type: Value substituted for ``{task}``.
                compression: ``"gzip"`` gzips every generated file and appends
                    ``.gz`` to its name; any other value leaves files untouched.

            Returns:
                Dict mapping format key to ``(filename, content)``. Content is
                ``str`` for text formats and ``bytes`` for parquet or gzipped
                output. Empty dict when ``data`` is empty.
            """
            from datetime import datetime
            import gzip

            if not data:
                return {}

            class _KeepUnknown(dict):
                """Mapping for ``format_map`` that leaves unknown placeholders as-is."""

                def __missing__(self, key):
                    return '{' + key + '}'

            selected_formats = selected_formats or []
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            # `or` also covers an explicit None/'' value, which .get()'s default does not
            model_name = str(data[0].get('model_used') or 'unknown').replace('/', '_')

            # Prepare data
            export_data = []
            for record in data:
                export_record = {
                    'input': record.get('prompt', ''),
                    'output': record.get('generated_text', ''),
                }

                if include_metadata:
                    export_record['metadata'] = {
                        'model_used': record.get('model_used', ''),
                        'task_type': record.get('task_type', ''),
                        'timestamp': record.get('generation_time', '')
                    }

                if include_quality_score and 'quality_score' in record:
                    export_record['quality_score'] = record['quality_score']

                export_data.append(export_record)

            # Generate filename; the pattern is user input, so a stray placeholder
            # must not abort the whole export.
            filename_base = file_naming_pattern.format_map(_KeepUnknown(
                task=task_type,
                timestamp=timestamp,
                model=model_name,
                count=len(export_data)
            ))

            generated_files = {}

            # The tabular formats share one DataFrame, built only if one is requested.
            frame = None
            if any(fmt in selected_formats for fmt in ("csv", "tsv", "parquet")):
                frame = pd.DataFrame(export_data)

            # Generate each format
            if "csv" in selected_formats:
                generated_files['csv'] = (f"{filename_base}.csv", frame.to_csv(index=False))

            if "json" in selected_formats:
                json_content = json.dumps(export_data, indent=2, ensure_ascii=False)
                generated_files['json'] = (f"{filename_base}.json", json_content)

            if "jsonl" in selected_formats:
                jsonl_content = '\n'.join(json.dumps(record, ensure_ascii=False) for record in export_data)
                generated_files['jsonl'] = (f"{filename_base}.jsonl", jsonl_content)

            if "txt" in selected_formats:
                txt_content = '\n'.join(
                    f"Input: {record['input']}\nOutput: {record['output']}\n---" for record in export_data
                )
                generated_files['txt'] = (f"{filename_base}.txt", txt_content)

            if "tsv" in selected_formats:
                generated_files['tsv'] = (f"{filename_base}.tsv", frame.to_csv(index=False, sep='\t'))

            if "parquet" in selected_formats:
                # Serialize in memory: no predictable temp path and nothing left on
                # disk. pandas selects the parquet engine, so the engine is needed
                # only when this format is actually requested.
                buffer = io.BytesIO()
                frame.to_parquet(buffer)
                generated_files['parquet'] = (f"{filename_base}.parquet", buffer.getvalue())

            if "custom" in selected_formats:
                extension = (custom_file_extension or '').strip().lstrip('.') or 'txt'
                custom_text = '\n'.join(
                    custom_template_format.format_map(_KeepUnknown(record)) for record in export_data
                )
                generated_files['custom'] = (f"{filename_base}.{extension}", custom_text)

            # Apply compression if selected (build a new dict rather than rewriting
            # the one being iterated)
            if compression == "gzip":
                generated_files = {
                    format_name: (
                        filename + '.gz',
                        gzip.compress(content.encode('utf-8') if isinstance(content, str) else content),
                    )
                    for format_name, (filename, content) in generated_files.items()
                }

            return generated_files
        
        def create_complete_package(generated_files, compression):
            """Bundle every generated file into one ZIP archive on disk.

            Args:
                generated_files: Mapping of format key to ``(filename, content)``
                    where content is ``str`` (stored UTF-8 encoded) or ``bytes``.
                compression: Accepted for interface compatibility; not used here,
                    the archive is always written with ZIP_DEFLATED.

            Returns:
                Path of the created ``.zip`` file, or ``None`` when there is
                nothing to package. The caller owns the file; it is not deleted.
            """
            import tempfile
            import zipfile

            if not generated_files:
                return None

            # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
            # returns a name and is open to a race before the file exists.
            with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as archive_file:
                with zipfile.ZipFile(archive_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
                    for filename, content in generated_files.values():
                        if isinstance(content, str):
                            content = content.encode('utf-8')
                        zipf.writestr(filename, content)

            return archive_file.name
        
        def download_specific_format(format_name, generated_files):
            """Write one generated export to disk and expose it through a file output.

            Args:
                format_name: Key of the wanted export (for example ``'csv'``).
                generated_files: Mapping of format key to ``(filename, content)``
                    where content is ``str`` or ``bytes``. May be ``None`` or
                    empty when nothing has been generated yet.

            Returns:
                A ``gr.update`` that shows the component with the path of the
                written file, or hides it when the format is not available.
            """
            import os
            import tempfile

            if not generated_files or format_name not in generated_files:
                return gr.update(visible=False)

            filename, content = generated_files[format_name]
            if isinstance(content, str):
                content = content.encode('utf-8')

            # The file component takes a path, not an in-memory stream. Writing
            # into a fresh private directory keeps the generated file name for
            # the download; basename() stops a user-influenced name from
            # escaping that directory.
            safe_name = os.path.basename(filename) or f"{format_name}.dat"
            target_path = os.path.join(tempfile.mkdtemp(prefix="dataset_export_"), safe_name)
            with open(target_path, 'wb') as f:
                f.write(content)
            return gr.update(visible=True, value=target_path)
        
        # Event handlers
        # Toggling export formats shows/hides the matching download buttons.
        # update_export_format_visibility returns nine updates, one per entry in
        # `outputs`; the last one controls the custom-format settings group.
        export_format.change(
            fn=update_export_format_visibility,
            inputs=[export_format],
            outputs=[csv_btn, json_btn, jsonl_btn, txt_btn, parquet_btn, tsv_btn, hf_btn, custom_btn, custom_format_group]
        )
        
        # ...existing code for other event handlers...
        
        # Download button handlers
        # Each button asks download_specific_format for one format key and routes
        # the result to the matching file component.
        # NOTE(review): every handler below passes a brand-new `gr.State()` as its
        # input. No code in this section writes to those State objects, so `files`
        # appears to always be the State's default (None) — the handlers look like
        # they need one shared State that holds the output of
        # generate_multiple_formats. Confirm and wire it up.
        csv_btn.click(
            fn=lambda files: download_specific_format('csv', files),
            inputs=[gr.State()],  # Will be connected to generated files state
            outputs=[csv_download]
        )
        
        json_btn.click(
            fn=lambda files: download_specific_format('json', files),
            inputs=[gr.State()],
            outputs=[json_download]
        )
        
        jsonl_btn.click(
            fn=lambda files: download_specific_format('jsonl', files),
            inputs=[gr.State()],
            outputs=[jsonl_download]
        )
        
        txt_btn.click(
            fn=lambda files: download_specific_format('txt', files),
            inputs=[gr.State()],
            outputs=[txt_download]
        )
        
        parquet_btn.click(
            fn=lambda files: download_specific_format('parquet', files),
            inputs=[gr.State()],
            outputs=[parquet_download]
        )
        
        tsv_btn.click(
            fn=lambda files: download_specific_format('tsv', files),
            inputs=[gr.State()],
            outputs=[tsv_download]
        )
        
        # NOTE(review): generate_multiple_formats (as visible in this section) has
        # no branch that produces a 'huggingface' entry, so this lookup would miss
        # even with a populated state — confirm where that package is built.
        hf_btn.click(
            fn=lambda files: download_specific_format('huggingface', files),
            inputs=[gr.State()],
            outputs=[hf_download]
        )
        
        custom_btn.click(
            fn=lambda files: download_specific_format('custom', files),
            inputs=[gr.State()],
            outputs=[custom_download]
        )
        
        # Bundles all generated files into one ZIP; create_complete_package returns
        # None when `files` is empty, in which case the file value is None.
        package_btn.click(
            fn=lambda files, comp: gr.update(visible=True, value=create_complete_package(files, comp)),
            inputs=[gr.State(), compression],  # Will be connected to generated files and compression
            outputs=[package_download]
        )
        
        # Update generate button to use correct function
        # process_with_flexible_models is a generator: each yielded 9-tuple maps
        # positionally onto the nine `outputs` below, and the 23 `inputs` map
        # positionally onto its 23 parameters — keep both lists in that order.
        generate_btn.click(
            fn=process_with_flexible_models,
            inputs=[model_input_mode, single_model_name, suggested_models, multiple_model_names,
                   model_distribution_mode, task_dropdown, prompt_mode, custom_template,
                   multi_prompts, random_templates, random_variables, file_data_state, 
                   row_preset, custom_rows, max_length, temperature, top_p, batch_size,
                   enable_cleaning, remove_duplicates, min_quality_score, 
                   create_splits, export_format],
            outputs=[dataset_preview, status_message, quality_report, quality_summary,
                    csv_data_state, json_data_state, dataset_card_state, hf_export_state,
                    loading_status]
        )
    
    return demo

def validate_models_before_generation(*args, **kwargs):
    """Placeholder for pre-generation model validation.

    Accepts any positional and keyword arguments, ignores them, and always
    returns ``None``.
    """
    # TODO: implement validation logic
    return None

def process_with_flexible_models(input_mode, single_model, suggested_model, multiple_models, 
                               model_distribution_mode, task_type, prompt_mode, custom_template, 
                               multi_prompts, random_templates, random_variables, file_data, 
                               row_preset, custom_rows, max_length, temperature, top_p, batch_size,
                               enable_cleaning, remove_duplicates, min_quality_score, 
                               create_splits, export_format):
    """Generate a dataset and stream UI updates for the generate button.

    This is a generator. Every yielded value is a 9-tuple: dataset preview
    update, status message update, quality report dict, quality summary text,
    CSV text, JSON text, dataset card text, a slot that is always ``None``
    here, and a short loading-status string.

    Only the first selected model is used for generation. The parameters
    ``model_distribution_mode``, ``prompt_mode``, ``multi_prompts``,
    ``random_templates``, ``random_variables``, ``batch_size``,
    ``enable_cleaning``, ``remove_duplicates``, ``min_quality_score``,
    ``create_splits`` and ``export_format`` are accepted but not read.

    Errors are not raised to the caller: they are reported through a yielded
    status tuple.
    """

    def status_only(message, summary, short_status):
        """Build a tuple that hides the preview and carries only status text."""
        return (
            gr.update(visible=False),
            gr.update(visible=True, value=message),
            {}, summary, None, None, None, None,
            short_status,
        )

    def get_selected_models(input_mode, single_model, suggested_model, multiple_models):
        """Return the list of model names implied by the chosen input mode."""
        if input_mode == "manual":
            return [single_model.strip()] if single_model and single_model.strip() else []
        if input_mode == "suggested":
            return [suggested_model] if suggested_model else []
        if input_mode == "multiple":
            # Comma-separated names; a missing value (None) counts as "no models"
            return [m.strip() for m in (multiple_models or "").split(",") if m.strip()]
        return []

    def get_final_row_count(row_preset, custom_rows):
        """Pick the row count: valid custom value, else the preset, else 10."""
        for candidate in (custom_rows, row_preset):
            if candidate is None or not str(candidate).strip():
                continue
            try:
                count = int(candidate)
            except (TypeError, ValueError):
                continue
            # Zero or negative row counts are meaningless; try the next source
            if count > 0:
                return count
        return 10

    # Get selected models
    selected_models = get_selected_models(input_mode, single_model, suggested_model, multiple_models)

    if not selected_models:
        yield status_only(
            "❌ กรุณาเลือกโมเดลอย่างน้อยหนึ่งตัว",
            "ไม่มีโมเดล",
            "❌ ไม่ได้เลือกโมเดล",
        )
        return

    num_samples = get_final_row_count(row_preset, custom_rows)

    # Top-level UI boundary: any failure is reported to the user, not raised.
    try:
        yield status_only(
            f"🔄 กำลังสร้างข้อมูล {num_samples} แถว...",
            "กำลังสร้าง...",
            "🔄 กำลังประมวลผล...",
        )

        # Simple generation for now: only the first selected model is used
        model_name = selected_models[0]
        df, csv_data, json_data, error = generate_dataset_from_task(
            model_name, task_type, custom_template, file_data,
            num_samples, max_length, temperature, top_p
        )

        if error:
            yield status_only(
                f"❌ เกิดข้อผิดพลาด: {error}",
                "เกิดข้อผิดพลาด",
                f"❌ {error}",
            )
            return

        # Basic quality processing
        raw_data = df.to_dict('records')
        record_count = len(raw_data)

        quality_report = {
            "total_records": record_count,
            "models_used": selected_models
        }

        final_df = pd.DataFrame(raw_data)
        final_csv = final_df.to_csv(index=False)
        final_json = json.dumps(raw_data, indent=2, ensure_ascii=False)

        dataset_card = f"# Dataset generated with {model_name}\n\nRecords: {record_count}"

        success_msg = f"✅ สร้างข้อมูลสำเร็จ! ได้ {record_count} แถว"
        quality_summary = f"📊 จำนวนข้อมูล: {record_count} แถว"

        yield (
            gr.update(visible=True, value=final_df),
            gr.update(visible=True, value=success_msg),
            quality_report,
            quality_summary,
            final_csv,
            final_json,
            dataset_card,
            None,
            "✅ เสร็จสิ้น!"
        )

    except Exception as e:
        yield status_only(
            f"❌ ข้อผิดพลาด: {str(e)}",
            "เกิดข้อผิดพลาด",
            f"❌ {str(e)}",
        )

if __name__ == "__main__":
    # Build the interface, then serve it on all network interfaces at port 7860;
    # share=True is passed through to launch() unchanged.
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)