thliang01 committed
Commit 5fa88f4 · verified · 1 Parent(s): cfc8b04

Update app.py

* Set pad_token
* Use return_dict=True to obtain the attention_mask
* Add the required parameters to generate_kwargs
* Add error handling and memory cleanup
* Increase the streamer timeout
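Together these fixes follow one standard transformers pattern: give the tokenizer a pad token, request a dict from apply_chat_template so the attention_mask comes back alongside input_ids, and pass both the mask and pad_token_id explicitly to generate(). Below is a minimal, non-streaming sketch of that pattern; the prompt and max_new_tokens are placeholders, and the actual app (see the diff) streams through TextIteratorStreamer on a background thread inside try/finally.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Llama-style tokenizers often ship without a pad token; fall back to EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# return_dict=True returns input_ids *and* attention_mask.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "你好"}],  # placeholder prompt
    return_tensors="pt",
    return_dict=True,
    add_generation_prompt=True,
)

# Pass attention_mask and pad_token_id explicitly to avoid the
# "pad_token_id not set" warning and ambiguous padding behavior.
output = model.generate(
    input_ids=inputs["input_ids"].to(model.device),
    attention_mask=inputs["attention_mask"].to(model.device),
    max_new_tokens=32,  # placeholder
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```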

Files changed (1):
  app.py (+57, −44)
app.py CHANGED
@@ -1,18 +1,13 @@
 import gradio as gr
 import spaces
 import os
-import spaces
 import torch
-from transformers import GemmaTokenizer, AutoModelForCausalLM
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 
 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
-zero = torch.Tensor([0]).cuda()
-print(zero.device) # <-- 'cpu' 🤔
-
 DESCRIPTION = '''
 <div>
 <h1 style="text-align: center;">TAIDE/Llama3-TAIDE-LX-8B-Chat-Alpha1</h1>
@@ -41,7 +36,12 @@ h1 {
 
 # Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("taide/Llama3-TAIDE-LX-8B-Chat-Alpha1")
-model = AutoModelForCausalLM.from_pretrained("taide/Llama3-TAIDE-LX-8B-Chat-Alpha1") # to("cuda:0")
+model = AutoModelForCausalLM.from_pretrained("taide/Llama3-TAIDE-LX-8B-Chat-Alpha1")
+
+# Set pad_token_id (key fix)
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
 terminators = [
     tokenizer.eos_token_id,
     tokenizer.convert_tokens_to_ids("<|eot_id|>")
@@ -55,47 +55,60 @@ def chat_taide_8b(message: str,
                   ) -> str:
     """
     Generate a streaming response using the llama3-8b model.
-    Args:
-        message (str): The input message.
-        history (list): The conversation history used by ChatInterface.
-        temperature (float): The temperature for generating the response.
-        max_new_tokens (int): The maximum number of new tokens to generate.
-    Returns:
-        str: The generated response.
     """
-    conversation = []
-    for user, assistant in history:
-        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-    conversation.append({"role": "user", "content": message})
-
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
-
-    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-
-    generate_kwargs = dict(
-        input_ids= input_ids,
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        eos_token_id=terminators,
-    )
-    # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
-    if temperature == 0:
-        generate_kwargs['do_sample'] = False
-
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-
-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        #print(outputs)
-        yield "".join(outputs)
+    try:
+        conversation = []
+        for user, assistant in history:
+            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+        conversation.append({"role": "user", "content": message})
+
+        # Use return_dict=True to obtain the attention_mask (key fix)
+        inputs = tokenizer.apply_chat_template(
+            conversation,
+            return_tensors="pt",
+            return_dict=True,
+            add_generation_prompt=True
+        )
+
+        input_ids = inputs["input_ids"].to(model.device)
+        attention_mask = inputs.get("attention_mask", None)
+        if attention_mask is not None:
+            attention_mask = attention_mask.to(model.device)
+
+        streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
+
+        generate_kwargs = dict(
+            input_ids=input_ids,
+            attention_mask=attention_mask,  # include the attention_mask
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=temperature,
+            eos_token_id=terminators,
+            pad_token_id=tokenizer.pad_token_id,  # explicitly set pad_token_id
+        )
+
+        # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
+        if temperature == 0:
+            generate_kwargs['do_sample'] = False
+
+        t = Thread(target=model.generate, kwargs=generate_kwargs)
+        t.start()
+
+        outputs = []
+        for text in streamer:
+            outputs.append(text)
+            yield "".join(outputs)
+
+    except Exception as e:
+        yield f"生成過程中發生錯誤: {str(e)}"
+    finally:
+        # Clean up GPU memory
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
 # Gradio block
-chatbot=gr.Chatbot(height=450, label='Gradio ChatInterface')
+chatbot = gr.Chatbot(height=450, label='Gradio ChatInterface')
 
 with gr.Blocks(fill_height=True, css=css) as demo:
@@ -118,15 +131,15 @@ with gr.Blocks(fill_height=True, css=css) as demo:
                       step=1,
                       value=512,
                       label="Max new tokens",
-                      render=False ),
-        ],
+                      render=False),
+    ],
     examples=[
         ['請以以下內容為基礎,寫一篇文章:撰寫一篇作文,題目為《一張舊照片》,內容要求為:選擇一張令你印象深刻的照片,說明令你印象深刻的原因,並描述照片中的影像及背後的故事。記錄成長的過程、與他人的情景、環境變遷和美麗的景色。'],
         ['請以品牌經理的身份,給廣告公司的創意總監寫一封信,提出對於新產品廣告宣傳活動的創意建議。'],
         ['以下提供英文內容,請幫我翻譯成中文。Dongshan coffee is famous for its unique position, and the constant refinement of production methods. The flavor is admired by many caffeine afficionados.'],
-        ],
+    ],
     cache_examples=False,
-    )
+)
 
 gr.Markdown(LICENSE)
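A note on the streamer change: TextIteratorStreamer's timeout is how long the consuming iterator waits for the next token before raising queue.Empty, so raising it from 10s to 30s gives the 8B model more headroom before the first token arrives. A self-contained sketch of the consume side follows; "gpt2" is a placeholder model so the snippet runs anywhere, whereas the app uses the TAIDE 8B model.

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder model so the sketch runs without a GPU.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# timeout: seconds the iterator waits for the next token before raising queue.Empty.
streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)

inputs = tokenizer("Hello", return_tensors="pt")
thread = Thread(target=model.generate, kwargs=dict(
    **inputs,
    streamer=streamer,
    max_new_tokens=20,
    pad_token_id=tokenizer.eos_token_id,  # gpt2 also ships without a pad token
))
thread.start()

# The main thread consumes tokens as they arrive; a first token slower than
# the timeout would raise here, which the app's try/except now reports.
for text in streamer:
    print(text, end="", flush=True)
thread.join()
```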