Spaces:

JonusNattapong
/

DekGenerate

Running

App Files Files Community

Nattapong Tapachoom committed 12 days ago

Commit

11885ff

1 Parent(s): bf8bb9c

Enhance model loading and quality management features with status tracking and progress feedback

Browse files

Files changed (1) hide show

app.py +465 -166

app.py CHANGED Viewed

@@ -11,7 +11,94 @@ import time
 import queue
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import asyncio
-from data_quality import DataQualityManager, export_to_huggingface_format
 # Predefined task templates with Thai language support
 TASK_TEMPLATES = {
@@ -268,18 +355,40 @@ class ModelStatus:
             }
 def load_model_with_cache(model_name: str, cache: dict):
-    """Load model with caching to avoid reloading"""
     if model_name in cache:
         return cache[model_name], None
     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name)
         generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
         cache[model_name] = generator
         return generator, None
     except Exception as e:
-        return None, str(e)
 def generate_single_record(generator, prompt: str, record_id: int, model_name: str,
                           max_length: int, temperature: float, top_p: float,
@@ -443,34 +552,104 @@ def generate_dataset_multi_model(selected_models: List[str], task_type: str, cus
         return None, None, None, f"Error in multi-model generation: {str(e)}"
 def create_interface():
-    with gr.Blocks(title="🇹🇭 Thai Dataset Generator with Hugging Face", theme=gr.themes.Soft()) as demo:
         gr.Markdown("# 🤗 เครื่องมือสร้างชุดข้อมูลภาษาไทยคุณภาพสูง")
-        gr.Markdown("สร้างชุดข้อมูลภาษาไทยคุณภาพสูง สะอาด และเป็นสากลด้วยโมเดลหลายตัว")
         with gr.Row():
             with gr.Column():
-                # Multi-model selection
-                gr.Markdown("### 🤖 เลือกโมเดลภาษาไทย (หลายตัว)")
-                model_checkboxes = gr.CheckboxGroup(
                     choices=[
-                        ("🌪️ Typhoon-7B (SCB10X)", "scb10x/typhoon-7b"),
-                        ("🇹🇭 OpenThaiGPT 1.5-7B", "openthaigpt/openthaigpt1.5-7b-instruct"),
-                        ("🦁 Gemma2-9B WangchanLION", "aisingapore/Gemma2-9b-WangchanLIONv2-instruct"),
-                        ("🌍 SambaLingo-Thai-Base", "sambanovasystems/SambaLingo-Thai-Base")
                     ],
-                    value=["scb10x/typhoon-7b"],
-                    label="เลือกโมเดลที่ต้องการใช้งาน (สามารถเลือกหลายตัว)"
                 )
-                gr.Markdown("### 📊 โหมดการทำงาน")
-                work_mode = gr.Radio(
-                    choices=[
-                        ("🔄 แบ่งงานกัน (Multi-Model Collaboration)", "collaborative"),
-                        ("📝 ใช้โมเดลเดียว (Single Model)", "single")
-                    ],
-                    value="collaborative",
-                    label="เลือกโหมดการทำงาน"
                 )
                 # Task selection with Thai tasks
@@ -500,21 +679,101 @@ def create_interface():
                     visible=False
                 )
-                # Template customization
-                gr.Markdown("### 🎯 ปรับแต่งเทมเพลต")
                 gr.Markdown("ใช้ {ชื่อฟิลด์} สำหรับตัวแปรในเทมเพลต")
-                template_display = gr.Textbox(
-                    label="เทมเพลตปัจจุบัน",
-                    value=TASK_TEMPLATES["text_generation"]["template"],
-                    interactive=False
                 )
-                custom_template = gr.Textbox(
-                    label="เทมเพลตกำหนดเอง (ไม่บังคับ)",
                     lines=3,
-                    placeholder="สร้างเทมเพลตของคุณเองที่นี่..."
                 )
                 # Data Quality Settings
                 gr.Markdown("### 🧼 การจัดการคุณภาพข้อมูล")
@@ -554,43 +813,72 @@ def create_interface():
                     label="รูปแบบการส่งออก"
                 )
-                # Generation parameters
                 gr.Markdown("### ⚙️ ตั้งค่าการสร้างข้อมูล")
                 with gr.Row():
-                    num_samples = gr.Slider(
-                        minimum=1,
-                        maximum=100,
-                        value=10,
-                        step=1,
-                        label="จำนวนข้อมูลที่ต้องการ"
-                    )
                     max_length = gr.Slider(
                         minimum=10,
-                        maximum=1000,
-                        value=200,
                         step=10,
                         label="ความยาวสูงสุด (โทเคน)"
                     )
-                with gr.Row():
-                    temperature = gr.Slider(
-                        minimum=0.1,
-                        maximum=2.0,
-                        value=0.8,
-                        step=0.1,
-                        label="ความคิดสร้างสรรค์ (Temperature)"
-                    )
-                    top_p = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        value=0.9,
-                        step=0.1,
-                        label="ความหลากหลาย (Top-p)"
                     )
-                generate_btn = gr.Button("🚀 สร้างชุดข้อมูลคุณภาพสูง", variant="primary", size="lg")
         with gr.Column():
             with gr.Tabs():
@@ -718,68 +1006,101 @@ def create_interface():
                 return (
                     gr.update(visible=False),
                     gr.update(visible=True, value="❌ กรุณาเลือกโมเดลอย่างน้อยหนึ่งตัว"),
-                    {}, "เกิดข้อผิดพลาดในการสร้างข้อมูล",
-                    None, None, None, None
                 )
-            # Generate raw data first
-            if work_mode == "collaborative" and len(selected_models) > 1:
-                df, csv_data, json_data, error = generate_dataset_multi_model(
-                    selected_models, task_type, custom_template, file_data,
-                    num_samples, max_length, temperature, top_p
-                )
-            else:
-                model_name = selected_models[0] if selected_models else "distilgpt2"
-                df, csv_data, json_data, error = generate_dataset_from_task(
-                    model_name, task_type, custom_template, file_data,
-                    num_samples, max_length, temperature, top_p
                 )
-            if error:
-                return (
                     gr.update(visible=False),
-                    gr.update(visible=True, value=f"❌ เกิดข้อผิดพลาด: {error}"),
-                    {}, "เกิดข้อผิดพลาดในการสร้างข้อมูล",
-                    None, None, None, None
                 )
-            # Apply basic quality management since we don't have the full module
-            raw_data = df.to_dict('records')
-            # Simple cleaning
-            if enable_cleaning:
-                for record in raw_data:
-                    if 'prompt' in record:
-                        record['prompt'] = str(record['prompt']).strip()
-                    if 'generated_text' in record:
-                        record['generated_text'] = str(record['generated_text']).strip()
-            # Remove duplicates (simple version)
-            if remove_duplicates:
-                seen = set()
-                unique_data = []
-                for record in raw_data:
-                    key = str(record.get('prompt', '')) + str(record.get('generated_text', ''))
-                    if key not in seen:
-                        seen.add(key)
-                        unique_data.append(record)
-                raw_data = unique_data
-            # Create quality report
-            quality_report = {
-                "total_records": len(raw_data),
-                "cleaning_enabled": enable_cleaning,
-                "duplicates_removed": remove_duplicates,
-                "models_used": list(set([r.get('model_used', 'unknown') for r in raw_data]))
-            }
-            # Create final DataFrame
-            final_df = pd.DataFrame(raw_data)
-            final_csv = final_df.to_csv(index=False)
-            final_json = json.dumps(raw_data, indent=2, ensure_ascii=False)
-            # Simple dataset card
-            dataset_card = f"""# Thai {task_type.title()} Dataset
 ## Dataset Information
 - Total Records: {len(raw_data)}
@@ -790,53 +1111,29 @@ def create_interface():
 ## Usage
 This dataset can be used for Thai NLP tasks.
 """
-            success_msg = f"✅ สร้างข้อมูลสำเร็จ! ได้ {len(raw_data)} รายการ"
-            quality_summary = f"📊 จำนวนข้อมูล: {len(raw_data)} รายการ"
-            return (
-                gr.update(visible=True, value=final_df),
-                gr.update(visible=True, value=success_msg),
-                quality_report,
-                quality_summary,
-                final_csv,
-                final_json,
-                dataset_card,
-                None  # hf_export_path
-            )
-        def download_csv(csv_data):
-            if csv_data:
-                return gr.update(visible=True, value=io.StringIO(csv_data))
-            return gr.update(visible=False)
-        def download_json(json_data):
-            if json_data:
-                return gr.update(visible=True, value=io.StringIO(json_data))
-            return gr.update(visible=False)
-        def download_dataset_card(card_content):
-            if card_content:
-                return gr.update(visible=True, value=io.StringIO(card_content))
-            return gr.update(visible=False)
-        def download_hf_dataset(hf_path):
-            if hf_path:
-                import zipfile
-                import tempfile
-                import os
-                # Create zip file
-                zip_path = tempfile.mktemp(suffix='.zip')
-                with zipfile.ZipFile(zip_path, 'w') as zipf:
-                    for root, dirs, files in os.walk(hf_path):
-                        for file in files:
-                            file_path = os.path.join(root, file)
-                            arcname = os.path.relpath(file_path, hf_path)
-                            zipf.write(file_path, arcname)
-                return gr.update(visible=True, value=zip_path)
-            return gr.update(visible=False)
         # Event connections
         task_dropdown.change(
@@ -851,14 +1148,16 @@ This dataset can be used for Thai NLP tasks.
             outputs=[file_preview, file_data_state]
         )
         generate_btn.click(
-            fn=process_with_quality_management,
             inputs=[model_checkboxes, work_mode, task_dropdown, custom_template, file_data_state,
                    num_samples, max_length, temperature, top_p,
                    enable_cleaning, remove_duplicates, min_quality_score,
                    create_splits, export_format],
             outputs=[dataset_preview, status_message, quality_report, quality_summary,
-                    csv_data_state, json_data_state, dataset_card_state, hf_export_state]
         )
         csv_btn.click(

 import queue
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import asyncio
+# Global model cache and loading status
+# Maps a model name to the text-generation pipeline built for it.
+MODEL_CACHE = {}
+# Maps a model name to the stage its load has reached ("loading",
+# "downloading", "loading_model", "creating_pipeline" or "ready").
+MODEL_LOADING_STATUS = {}
+# Held while reading or writing either of the two tables above.
+MODEL_LOADING_LOCK = threading.Lock()
+def check_model_loading_status(model_names: List[str]) -> Dict:
+    """Report the load state of each requested model.
+
+    Args:
+        model_names: Model identifiers to look up.
+
+    Returns:
+        A dict keyed by model name. The value is ``"ready"`` when the model
+        is in ``MODEL_CACHE``, the stage recorded in ``MODEL_LOADING_STATUS``
+        when a load has been registered, and ``"not_loaded"`` otherwise.
+    """
+    # Build the whole snapshot under the lock so the cache and the status
+    # table are read together.
+    with MODEL_LOADING_LOCK:
+        return {
+            name: (
+                "ready"
+                if name in MODEL_CACHE
+                else MODEL_LOADING_STATUS.get(name, "not_loaded")
+            )
+            for name in model_names
+        }
+def load_model_with_status_tracking(model_name: str):
+    """Build a text-generation pipeline for a model, publishing each stage.
+
+    The stage reached so far is written to ``MODEL_LOADING_STATUS`` and the
+    finished pipeline is stored in ``MODEL_CACHE``.
+
+    Args:
+        model_name: Model identifier passed to ``from_pretrained``.
+
+    Returns:
+        ``(generator, None)`` when the pipeline is available (freshly built
+        or already cached), or ``(None, message)`` when another load of the
+        same model is already registered or when loading raised.
+    """
+    def _mark(stage: str) -> None:
+        # Every stage write goes through the shared lock.
+        with MODEL_LOADING_LOCK:
+            MODEL_LOADING_STATUS[model_name] = stage
+
+    # Decide under one lock acquisition whether this call owns the load.
+    with MODEL_LOADING_LOCK:
+        if model_name in MODEL_CACHE:
+            return MODEL_CACHE[model_name], None
+        if model_name in MODEL_LOADING_STATUS:
+            return None, f"โมเดล {model_name} กำลังโหลดอยู่..."
+        MODEL_LOADING_STATUS[model_name] = "loading"
+
+    try:
+        print(f"🔄 เริ่มโหลดโมเดล {model_name}...")
+
+        _mark("downloading")
+        # NOTE(review): trust_remote_code=True lets the model repository run
+        # its own Python code at load time; confirm every caller passes a
+        # vetted model name.
+        tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+        _mark("loading_model")
+        lm = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+
+        _mark("creating_pipeline")
+        text_gen = pipeline("text-generation", model=lm, tokenizer=tok)
+
+        # Publish the pipeline and the final stage in the same critical
+        # section so readers never see one without the other.
+        with MODEL_LOADING_LOCK:
+            MODEL_CACHE[model_name] = text_gen
+            MODEL_LOADING_STATUS[model_name] = "ready"
+
+        print(f"✅ โหลดโมเดล {model_name} สำเร็จ")
+        return text_gen, None
+    except Exception as e:
+        error_msg = f"❌ ไม่สามารถโหลดโมเดล {model_name}: {str(e)}"
+        print(error_msg)
+        # Clear the entry so a later call can attempt the load again.
+        with MODEL_LOADING_LOCK:
+            MODEL_LOADING_STATUS.pop(model_name, None)
+        return None, error_msg
+def preload_models_async(model_names: List[str], progress_callback=None):
+    """Load several models in parallel and collect the outcome of each.
+
+    Despite the name, this call blocks until every load has finished; only
+    the individual loads overlap (at most two at a time).
+
+    Args:
+        model_names: Model identifiers to load. Repeated names are loaded
+            once.
+        progress_callback: Optional callable invoked from the worker thread
+            as ``progress_callback(model_name, state, error)``, where
+            ``state`` is ``"ready"`` or ``"error"``. Exceptions it raises are
+            printed and otherwise ignored.
+
+    Returns:
+        A dict mapping each model name to
+        ``{"generator": pipeline_or_None, "error": message_or_None}``.
+    """
+    def load_single_model(model_name):
+        generator, error = load_model_with_status_tracking(model_name)
+        if progress_callback:
+            try:
+                progress_callback(model_name, "ready" if generator else "error", error)
+            except Exception as callback_error:
+                # A faulty progress hook must not discard a finished load or
+                # abort the collection of the remaining results.
+                print(f"⚠️ progress_callback failed for {model_name}: {callback_error}")
+        return model_name, generator, error
+
+    # Drop repeated names, keeping first-seen order: a duplicate submitted
+    # while the first load is still running would only be told "already
+    # loading" and be reported to the callback as an error.
+    unique_names = list(dict.fromkeys(model_names))
+
+    results = {}
+    with ThreadPoolExecutor(max_workers=2) as executor:  # Limit concurrent loading
+        futures = [executor.submit(load_single_model, name) for name in unique_names]
+        for future in as_completed(futures):
+            model_name, generator, error = future.result()
+            results[model_name] = {"generator": generator, "error": error}
+    return results
 # Predefined task templates with Thai language support
 TASK_TEMPLATES = {
             }
 def load_model_with_cache(model_name: str, cache: dict):
+    """Load a text-generation pipeline, reusing ``cache`` when possible.
+
+    Args:
+        model_name: Model identifier passed to ``from_pretrained``.
+        cache: Caller-owned dict mapping model names to pipelines; a newly
+            built pipeline is stored in it.
+
+    Returns:
+        ``(generator, None)`` on success, or ``(None, error_message)`` when
+        loading raised an exception.
+    """
     if model_name in cache:
         return cache[model_name], None
     try:
+        print(f"🔄 กำลังโหลดโมเดล {model_name}...")
+        # Only model names containing "typhoon" opt in to repository-supplied
+        # code; every other model keeps the library default for that flag.
+        extra_kwargs = {"trust_remote_code": True} if "typhoon" in model_name.lower() else {}
+        tokenizer = AutoTokenizer.from_pretrained(model_name, **extra_kwargs)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,  # Use half precision
+            device_map="auto",
+            **extra_kwargs
+        )
         generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
         cache[model_name] = generator
+        print(f"✅ โหลดโมเดล {model_name} สำเร็จ")
         return generator, None
     except Exception as e:
+        error_msg = f"❌ ไม่สามารถโหลดโมเดล {model_name}: {str(e)}"
+        print(error_msg)
+        return None, error_msg
 def generate_single_record(generator, prompt: str, record_id: int, model_name: str,
                           max_length: int, temperature: float, top_p: float,
         return None, None, None, f"Error in multi-model generation: {str(e)}"
 def create_interface():
+    with gr.Blocks(title="🇹🇭 Thai Dataset Generator", theme=gr.themes.Soft()) as demo:
         gr.Markdown("# 🤗 เครื่องมือสร้างชุดข้อมูลภาษาไทยคุณภาพสูง")
+        gr.Markdown("⚡ **เคล็ดลับ**: ใช้โมเดลใดก็ได้จาก Hugging Face - เริ่มต้นด้วยโมเดลเล็กๆ เพื่อทดสอบก่อน")
         with gr.Row():
             with gr.Column():
+                # Flexible model input
+                gr.Markdown("### 🤖 เลือกโมเดลจาก Hugging Face")
+                gr.Markdown("💡 **คำแนะนำ**: ใส่ชื่อโมเดลจาก [Hugging Face](https://huggingface.co/models) เช่น `microsoft/DialoGPT-small`, `gpt2`, `scb10x/typhoon-7b`")
+                model_input_mode = gr.Radio(
                     choices=[
+                        ("📝 ใส่ชื่อโมเดลเอง", "manual"),
+                        ("📋 เลือกจากรายการแนะนำ", "suggested"),
+                        ("🔀 ใช้หลายโมเดลพร้อมกัน", "multiple")
                     ],
+                    value="manual",
+                    label="วิธีการเลือกโมเดล"
                 )
+                # Manual model input
+                manual_model_group = gr.Group(visible=True)
+                with manual_model_group:
+                    single_model_name = gr.Textbox(
+                        label="ชื่อโมเดลจาก Hugging Face",
+                        value="microsoft/DialoGPT-small",
+                        placeholder="เช่น gpt2, microsoft/DialoGPT-medium, scb10x/typhoon-7b",
+                        info="ใส่ชื่อโมเดลที่ต้องการใช้งาน"
+                    )
+                    model_verification = gr.Button("🔍 ตรวจสอบโมเดล", variant="secondary", size="sm")
+                    model_status = gr.Textbox(
+                        label="สถานะโมเดล",
+                        value="ยังไม่ได้ตรวจสอบ",
+                        interactive=False
+                    )
+                # Suggested models
+                suggested_model_group = gr.Group(visible=False)
+                with suggested_model_group:
+                    gr.Markdown("#### โมเดลแนะนำ")
+                    suggested_models = gr.Dropdown(
+                        choices=[
+                            # Small/Fast models
+                            ("⚡ DistilGPT2 (เล็ก, เร็ว)", "distilgpt2"),
+                            ("⚡ GPT2 (กลาง)", "gpt2"),
+                            ("⚡ DialoGPT-small (บทสนทนา)", "microsoft/DialoGPT-small"),
+                            ("⚡ DialoGPT-medium (บทสนทนา)", "microsoft/DialoGPT-medium"),
+                            # Thai models
+                            ("🇹🇭 Typhoon-7B (ไทย, ใหญ่)", "scb10x/typhoon-7b"),
+                            ("🇹🇭 OpenThaiGPT-1.5-7B (ไทย)", "openthaigpt/openthaigpt1.5-7b-instruct"),
+                            ("🇹🇭 WangchanLION-7B (ไทย)", "aisingapore/llama2-7b-chat-thai"),
+                            # Multilingual models
+                            ("🌍 mGPT (หลายภาษา)", "ai-forever/mGPT"),
+                            ("🌍 Bloom-560m (หลายภาษา, เล็ก)", "bigscience/bloom-560m"),
+                            ("🌍 Bloom-1b1 (หลายภาษา)", "bigscience/bloom-1b1"),
+                            # Instruction-following
+                            ("🎯 Flan-T5-small (คำสั่ง)", "google/flan-t5-small"),
+                            ("🎯 Flan-T5-base (คำสั่ง)", "google/flan-t5-base"),
+                            # Other popular models
+                            ("🔥 OPT-350m (Meta)", "facebook/opt-350m"),
+                            ("🔥 OPT-1.3b (Meta)", "facebook/opt-1.3b"),
+                        ],
+                        value="distilgpt2",
+                        label="เลือกโมเดลแนะนำ"
+                    )
+                # Multiple models
+                multiple_model_group = gr.Group(visible=False)
+                with multiple_model_group:
+                    multiple_model_names = gr.Textbox(
+                        label="ชื่อโมเดลหลายตัว (แยกด้วยเครื่องหมายจุลภาค)",
+                        value="distilgpt2, microsoft/DialoGPT-small",
+                        placeholder="gpt2, microsoft/DialoGPT-medium, scb10x/typhoon-7b",
+                        lines=3,
+                        info="ใส่ชื่อโมเดลหลายตัวแยกด้วยเครื่องหมายจุลภาค"
+                    )
+                    model_distribution_mode = gr.Radio(
+                        choices=[
+                            ("🔄 แบ่งงานกัน (Collaborative)", "collaborative"),
+                            ("🎲 สุ่มเลือก (Random)", "random"),
+                            ("📊 เท่าๆ กัน (Round-robin)", "round_robin")
+                        ],
+                        value="collaborative",
+                        label="วิธีการใช้โมเดลหลายตัว"
+                    )
+                # Model info display
+                current_models_display = gr.Textbox(
+                    label="โมเดลที่จะใช้",
+                    value="microsoft/DialoGPT-small",
+                    interactive=False
                 )
                 # Task selection with Thai tasks
                     visible=False
                 )
+                # Template customization with multi-prompt support
+                gr.Markdown("### 🎯 ปรับแต่งเทมเพลตและ Prompt")
                 gr.Markdown("ใช้ {ชื่อฟิลด์} สำหรับตัวแปรในเทมเพลต")
+                prompt_mode = gr.Radio(
+                    choices=[
+                        ("📝 Prompt เดียว (Single)", "single"),
+                        ("📋 หลาย Prompt (Multiple)", "multiple"),
+                        ("🎲 สุ่มจาก Template (Random)", "random")
+                    ],
+                    value="single",
+                    label="โหมดการใส่ Prompt"
                 )
+                # Single prompt mode
+                single_prompt_group = gr.Group(visible=True)
+                with single_prompt_group:
+                    template_display = gr.Textbox(
+                        label="เทมเพลตปัจจุบัน",
+                        value=TASK_TEMPLATES["text_generation"]["template"],
+                        interactive=False
+                    )
+                    custom_template = gr.Textbox(
+                        label="เทมเพลตกำหนดเอง (ไม่บังคับ)",
+                        lines=3,
+                        placeholder="สร้างเทมเพลตของคุณเองที่นี่..."
+                    )
+                # Multiple prompts mode
+                multi_prompt_group = gr.Group(visible=False)
+                with multi_prompt_group:
+                    gr.Markdown("#### 📋 ใส่หลาย Prompt (แต่ละบรรทัดคือ prompt หนึ่งตัว)")
+                    multi_prompts = gr.Textbox(
+                        label="Prompts หลายตัว (แยกด้วยการขึ้นบรรทัดใหม่)",
+                        lines=10,
+                        placeholder="""เขียนเรื่องราวเกี่ยวกับการผจญภัยในป่า
+สร้างบทสนทนาระหว่างครูกับนักเรียน
+อธิบายวิธีการทำอาหารไทย
+เขียนบทกวีเกี่ยวกับธรรมชาติ
+สร้างเรื่องสั้นเกี่ยวกับมิตรภาพ"""
+                    )
+                    prompt_distribution = gr.Radio(
+                        choices=[
+                            ("📊 กระจายเท่าๆ กัน", "equal"),
+                            ("🎯 ตามสัดส่วนที่กำหนด", "weighted"),
+                            ("🎲 สุ่ม", "random")
+                        ],
+                        value="equal",
+                        label="วิธีการกระจาย Prompt"
+                    )
+                    prompt_weights = gr.Textbox(
+                        label="น้ำหนักของแต่ละ Prompt (เช่น 2,1,3,1,2)",
+                        placeholder="2,1,3,1,2",
+                        visible=False
+                    )
+                # Random template mode
+                random_prompt_group = gr.Group(visible=False)
+                with random_prompt_group:
+                    gr.Markdown("#### 🎲 สุ่ม Prompt จาก Template ที่เลือก")
+                    random_templates = gr.CheckboxGroup(
+                        choices=[(v["name"], k) for k, v in TASK_TEMPLATES.items()],
+                        value=["text_generation", "conversation"],
+                        label="เลือก Template ที่จะสุ่ม"
+                    )
+                    random_variables = gr.Textbox(
+                        label="ตัวแปรสำหรับสุ่ม (JSON format)",
+                        lines=5,
+                        value="""{
+    "topic": ["การเดินทาง", "เทคโนโลยี", "อาหาร", "ธรรมชาติ", "ศิลปะ"],
+    "question": ["AI คืออะไร", "โลกร้อนคืออะไร", "การศึกษาสำคัญอย่างไร"],
+    "instruction": ["เขียนบทความ", "สรุปข้อมูล", "วิเคราะห์ปัญหา"]
+}""",
+                        placeholder="ใส่ตัวแปรในรูปแบบ JSON"
+                    )
+                # Prompt preview and count
+                prompt_preview = gr.Textbox(
+                    label="ตัวอย่าง Prompt ที่จะใช้",
                     lines=3,
+                    interactive=False
                 )
+                prompt_count = gr.Textbox(
+                    label="จำนวน Prompt ที่พร้อมใช้",
+                    value="1 prompt",
+                    interactive=False
+                )
                 # Data Quality Settings
                 gr.Markdown("### 🧼 การจัดการคุณภาพข้อมูล")
                     label="รูปแบบการส่งออก"
                 )
+                # Generation parameters with better row selection
                 gr.Markdown("### ⚙️ ตั้งค่าการสร้างข้อมูล")
+                # Row count selection with presets
+                gr.Markdown("#### 📊 จำนวนข้อมูลที่ต้องการสร้าง")
+                row_preset = gr.Radio(
+                    choices=[
+                        ("🚀 ทดสอบ (5 rows)", 5),
+                        ("📝 เล็ก (50 rows)", 50),
+                        ("📋 กลาง (250 rows)", 250),
+                        ("📚 ใหญ่ (1,000 rows)", 1000),
+                        ("🏭 ใหญ่มาก (5,000 rows)", 5000),
+                        ("🏢 Enterprise (10,000 rows)", 10000),
+                        ("🎯 กำหนดเอง", -1)
+                    ],
+                    value=5,
+                    label="เลือกขนาดชุดข้อมูล"
+                )
+                custom_rows = gr.Slider(
+                    minimum=1,
+                    maximum=50000,  # Increased from 2000
+                    value=100,
+                    step=1,
+                    label="จำนวนแถวที่ต้องการ (1-50,000)",
+                    visible=False
+                )
+                # Performance warning for large datasets
+                performance_warning = gr.Markdown(
+                    visible=False,
+                    value="⚠️ **คำเตือน**: ชุดข้อมูลขนาดใหญ่ (>1,000 rows) อาจใช้เวลานานและหน่วยความจำมาก"
+                )
                 with gr.Row():
                     max_length = gr.Slider(
                         minimum=10,
+                        maximum=500,
+                        value=100,
                         step=10,
                         label="ความยาวสูงสุด (โทเคน)"
                     )
+                    batch_size = gr.Slider(
+                        minimum=1,
+                        maximum=50,  # Increased from 10
+                        value=5,     # Increased default
+                        step=1,
+                        label="ขนาด Batch (แนะนำ 5-20 สำหรับ dataset ใหญ่)"
                     )
+                generate_btn = gr.Button(
+                    "🚀 สร้างชุดข้อมูล",
+                    variant="primary",
+                    size="lg",
+                    interactive=False  # Initially disabled
+                )
+                # Add warning for large models
+                gr.Markdown("""
+                ⚠️ **คำเตือน**:
+                - โมเดลใหญ่ (7B+) ใช้เวลาโหลด 2-5 นาที
+                - แนะนำเริ่มต้นด้วย distilgpt2 เพื่อทดสอบ
+                - ถ้าหน่วยความจำไม่พอ ลองลดจำนวนข้อมูลหรือเลือกโมเดลเล็กกว่า
+                """)
         with gr.Column():
             with gr.Tabs():
                 return (
                     gr.update(visible=False),
                     gr.update(visible=True, value="❌ กรุณาเลือกโมเดลอย่างน้อยหนึ่งตัว"),
+                    {}, "กรุณาเลือกโมเดล", None, None, None, None,
+                    "❌ ไม่ได้เลือกโมเดล"
                 )
+            try:
+                # Update loading status
+                yield (
+                    gr.update(visible=False),
+                    gr.update(visible=True, value="🔄 กำลังเริ่มต้นการสร้างข้อมูล..."),
+                    {}, "กำลังเริ่มต้น...", None, None, None, None,
+                    "🔄 กำลังโหลดโมเดลและเตรียมข้อมูล..."
                 )
+                # Generate data
+                if work_mode == "collaborative" and len(selected_models) > 1:
+                    # Multi-model generation with progress
+                    yield (
+                        gr.update(visible=False),
+                        gr.update(visible=True, value="🤖 กำลังใช้โมเดลหลายตัวทำงานร่วมกัน..."),
+                        {}, "กำลังสร้างข้อมูล...", None, None, None, None,
+                        "🔄 โมเดลหลายตัวกำลังทำงาน..."
+                    )
+                    df, csv_data, json_data, error = generate_dataset_multi_model(
+                        selected_models, task_type, custom_template, file_data,
+                        num_samples, max_length, temperature, top_p
+                    )
+                else:
+                    model_name = selected_models[0]
+                    yield (
+                        gr.update(visible=False),
+                        gr.update(visible=True, value=f"🤖 กำลังใช้โมเดล {model_name}..."),
+                        {}, "กำลังสร้างข้อมูล...", None, None, None, None,
+                        f"🔄 กำลังโหลด {model_name}..."
+                    )
+                    df, csv_data, json_data, error = generate_dataset_from_task(
+                        model_name, task_type, custom_template, file_data,
+                        num_samples, max_length, temperature, top_p
+                    )
+                if error:
+                    yield (
+                        gr.update(visible=False),
+                        gr.update(visible=True, value=f"❌ เกิดข้อผิดพลาด: {error}"),
+                        {}, "เกิดข้อผิดพลาด", None, None, None, None,
+                        f"❌ {error}"
+                    )
+                    return
+                # Process quality management
+                yield (
                     gr.update(visible=False),
+                    gr.update(visible=True, value="🧼 กำลังปรับปรุงคุณภาพข้อมูล..."),
+                    {}, "กำลังปรับปรุงคุณภาพ...", None, None, None, None,
+                    "🧼 กำลังทำความสะอาดและตรวจสอบคุณภาพ..."
                 )
+                # Apply basic quality management since we don't have the full module
+                raw_data = df.to_dict('records')
+                # Simple cleaning
+                if enable_cleaning:
+                    for record in raw_data:
+                        if 'prompt' in record:
+                            record['prompt'] = str(record['prompt']).strip()
+                        if 'generated_text' in record:
+                            record['generated_text'] = str(record['generated_text']).strip()
+                # Remove duplicates (simple version)
+                if remove_duplicates:
+                    seen = set()
+                    unique_data = []
+                    for record in raw_data:
+                        key = str(record.get('prompt', '')) + str(record.get('generated_text', ''))
+                        if key not in seen:
+                            seen.add(key)
+                            unique_data.append(record)
+                    raw_data = unique_data
+                # Create quality report
+                quality_report = {
+                    "total_records": len(raw_data),
+                    "cleaning_enabled": enable_cleaning,
+                    "duplicates_removed": remove_duplicates,
+                    "models_used": list(set([r.get('model_used', 'unknown') for r in raw_data]))
+                }
+                # Create final DataFrame
+                final_df = pd.DataFrame(raw_data)
+                final_csv = final_df.to_csv(index=False)
+                final_json = json.dumps(raw_data, indent=2, ensure_ascii=False)
+                # Simple dataset card
+                dataset_card = f"""# Thai {task_type.title()} Dataset
 ## Dataset Information
 - Total Records: {len(raw_data)}
 ## Usage
 This dataset can be used for Thai NLP tasks.
 """
+                success_msg = f"✅ สร้างข้อมูลสำเร็จ! ได้ {len(raw_data)} รายการ"
+                quality_summary = f"📊 จำนวนข้อมูล: {len(raw_data)} รายการ"
+                yield (
+                    gr.update(visible=True, value=final_df),
+                    gr.update(visible=True, value=success_msg),
+                    quality_report,
+                    quality_summary,
+                    final_csv,
+                    final_json,
+                    dataset_card,
+                    None,
+                    "✅ เสร็จสิ้น!"
+                )
+            except Exception as e:
+                yield (
+                    gr.update(visible=False),
+                    gr.update(visible=True, value=f"❌ เกิดข้อผิดพลาดที่ไม่คาดคิด: {str(e)}"),
+                    {}, "เกิดข้อผิดพลาด", None, None, None, None,
+                    f"❌ ข้อผิดพลาด: {str(e)}"
+                )
         # Event connections
         task_dropdown.change(
             outputs=[file_preview, file_data_state]
         )
+        # Update generate button to use new function with progress
         generate_btn.click(
+            fn=process_with_progress_feedback,
             inputs=[model_checkboxes, work_mode, task_dropdown, custom_template, file_data_state,
                    num_samples, max_length, temperature, top_p,
                    enable_cleaning, remove_duplicates, min_quality_score,
                    create_splits, export_format],
             outputs=[dataset_preview, status_message, quality_report, quality_summary,
+                    csv_data_state, json_data_state, dataset_card_state, hf_export_state,
+                    loading_status]
         )
         csv_btn.click(