Update app.py
app.py
CHANGED
@@ -28,6 +28,7 @@ print("This might take a few minutes, especially on the first launch...")
 model = None
 tokenizer = None
 load_successful = False
+stop_token_ids_list = []  # Initialize stop_token_ids_list
 
 try:
     start_load_time = time.time()
@@ -35,11 +36,11 @@ try:
         MODEL_ID,
         torch_dtype=torch.float32,
         device_map="cpu",
-        # force_download=True #
+        # force_download=True # Keep commented unless cache issues reappear
     )
     tokenizer = AutoTokenizer.from_pretrained(
         MODEL_ID,
-        # force_download=True #
+        # force_download=True # Keep commented
     )
     model.eval()
     load_time = time.time() - start_load_time
@@ -48,14 +49,14 @@ try:
 
     # --- Stop Token Configuration ---
     stop_token_strings = ["<|endofturn|>", "<|stop|>"]
-
+    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]
 
-    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in
-
+    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
+        temp_stop_ids.append(tokenizer.eos_token_id)
     elif tokenizer.eos_token_id is None:
         print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")
 
-    stop_token_ids_list = [tid for tid in
+    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]  # Assign to the global scope variable
 
     if not stop_token_ids_list:
         print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
@@ -63,7 +64,7 @@ try:
             stop_token_ids_list = [tokenizer.eos_token_id]
         else:
             print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")
-            #
+            # Consider raising an error or setting a default if this is critical
 
     print(f"Using Stop Token IDs: {stop_token_ids_list}")
 
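The two hunks above rework the stop-token setup: the stop strings are first resolved into token IDs, the EOS id is appended if missing, and anything the tokenizer could not resolve is dropped before the list is used anywhere. A minimal standalone sketch of that resolution follows; the repository id is an assumption standing in for the Space's MODEL_ID, and the final filter guards against `convert_tokens_to_ids` returning None (or an unknown-token id) for strings missing from the vocabulary.

# Illustrative sketch only; the repo id below is an assumed stand-in for MODEL_ID.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B")

stop_token_strings = ["<|endofturn|>", "<|stop|>"]
temp_stop_ids = [tokenizer.convert_tokens_to_ids(t) for t in stop_token_strings]

if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
    temp_stop_ids.append(tokenizer.eos_token_id)

# Drop anything the tokenizer could not resolve so generate() never sees a None id.
stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None]
print("Using Stop Token IDs:", stop_token_ids_list)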
@@ -72,7 +73,7 @@ except Exception as e:
     if 'model' in locals() and model is not None: del model
     if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
     gc.collect()
-    #
+    # Raise Gradio error to display in the Space UI if loading fails
     raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")
 
 
@@ -81,7 +82,7 @@ def get_system_prompt():
     current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
     return (
         f"- The name of this AI language model is \"CLOVA X\" and it was created by Naver.\n"
-        # f"- Today is {current_date}.\n" #
+        # f"- Today is {current_date}.\n" # Uncomment if needed
         f"- Answer the user's questions kindly, in detail, and in Korean."
     )
 
@@ -109,16 +110,22 @@ def warmup_model():
             return_tensors="pt"
         ).to("cpu")
 
+        # Check if stop_token_ids_list is empty and handle appropriately
+        gen_kwargs = {
+            "max_new_tokens": 10,
+            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
+            "do_sample": False
+        }
+        if stop_token_ids_list:
+            gen_kwargs["eos_token_id"] = stop_token_ids_list
+        else:
+            print("Warmup Warning: No stop tokens defined for generation.")
+
+
         with torch.no_grad():
-            output_ids = model.generate(
-
-
-                eos_token_id=stop_token_ids_list,
-                pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
-                do_sample=False # No sampling needed during warm-up
-            )
-
-            # Decode result (optional, for verification)
+            output_ids = model.generate(**inputs, **gen_kwargs)
+
+        # Optional: Decode warmup response for verification
         # response = tokenizer.decode(output_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
         # print(f"Warm-up response (decoded): {response}")
 
@@ -130,40 +137,43 @@ def warmup_model():
 
     except Exception as e:
         print(f"!!! Error during model warm-up: {e}")
-        # Make sure a warm-up failure does not block the app from launching
     finally:
-        gc.collect()
-
+        gc.collect()
 
 # --- Inference Function ---
 def predict(message, history):
     """
-    Generates response using HyperCLOVAX
-
-    Assumes 'history' is in the Gradio 'messages' format: List[List[str | None | Tuple]] or List[Dict]
+    Generates response using HyperCLOVAX.
+    Assumes 'history' is in the Gradio 'messages' format: List[Dict].
     """
     if model is None or tokenizer is None:
         return "Error: The model is not loaded."
 
     system_prompt = get_system_prompt()
 
-    #
+    # Start with system prompt
     chat_history_formatted = [
-        {"role": "tool_list", "content": ""},
+        {"role": "tool_list", "content": ""}, # As required by model card
         {"role": "system", "content": system_prompt}
     ]
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # Append history (List of {'role': 'user'/'assistant', 'content': '...'})
+    if isinstance(history, list): # Check if history is a list
+        for turn in history:
+            # Validate turn format
+            if isinstance(turn, dict) and "role" in turn and "content" in turn:
+                chat_history_formatted.append(turn)
+            # Handle potential older tuple format if necessary (though less likely now)
+            elif isinstance(turn, (list, tuple)) and len(turn) == 2:
+                print(f"Warning: Received history item in tuple format: {turn}. Converting to messages format.")
+                chat_history_formatted.append({"role": "user", "content": turn[0]})
+                if turn[1]: # Ensure assistant message exists
+                    chat_history_formatted.append({"role": "assistant", "content": turn[1]})
+            else:
+                print(f"Warning: Skipping unexpected history format item: {turn}")
+
+
+    # Append the latest user message
     chat_history_formatted.append({"role": "user", "content": message})
 
     inputs = None
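The block added to predict() above normalizes whatever shape of history Gradio hands over: "messages"-style dicts (the format this Space now uses) or the older (user, assistant) tuples, depending on the Gradio version. A self-contained sketch of that normalization with hypothetical sample data is below.

# Hypothetical data for illustration; mirrors the history-normalization loop added above.
def normalize_history(history):
    normalized = []
    for turn in history:
        if isinstance(turn, dict) and "role" in turn and "content" in turn:
            normalized.append(turn)  # already in messages format
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            normalized.append({"role": "user", "content": turn[0]})
            if turn[1]:  # assistant reply may be missing for the last turn
                normalized.append({"role": "assistant", "content": turn[1]})
    return normalized

print(normalize_history([("Hi", "Hello! How can I help?")]))
print(normalize_history([{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]))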
@@ -175,41 +185,47 @@ def predict(message, history):
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt"
-        ).to("cpu")
+        ).to("cpu")
         input_length = inputs['input_ids'].shape[1]
         print(f"\nInput tokens: {input_length}")
 
     except Exception as e:
         print(f"!!! Error applying chat template: {e}")
-        # Provide feedback to the user
         return f"Error: A problem occurred while processing the input format. ({e})"
 
     try:
         print("Generating response...")
         generation_start_time = time.time()
+
+        # Prepare generation arguments, handling empty stop_token_ids_list
+        gen_kwargs = {
+            "max_new_tokens": MAX_NEW_TOKENS,
+            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
+            "do_sample": True,
+            "temperature": 0.7,
+            "top_p": 0.9,
+        }
+        if stop_token_ids_list:
+            gen_kwargs["eos_token_id"] = stop_token_ids_list
+        else:
+            print("Generation Warning: No stop tokens defined.")
+
+
         with torch.no_grad():
-            output_ids = model.generate(
-
-                max_new_tokens=MAX_NEW_TOKENS,
-                eos_token_id=stop_token_ids_list,
-                pad_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.9,
-            )
+            output_ids = model.generate(**inputs, **gen_kwargs)
+
         generation_time = time.time() - generation_start_time
         print(f"Generation complete in {generation_time:.2f} seconds.")
 
     except Exception as e:
         print(f"!!! Error during model generation: {e}")
-        # Clean up potentially large tensors in case of error
         if inputs is not None: del inputs
         if output_ids is not None: del output_ids
         gc.collect()
         return f"Error: A problem occurred while generating the response. ({e})"
 
     # Decode the response
-    response = "Error: Failed to generate a response."
+    response = "Error: Failed to generate a response."
     if output_ids is not None:
         try:
             new_tokens = output_ids[0, input_length:]
@@ -220,7 +236,6 @@ def predict(message, history):
             print(f"!!! Error decoding response: {e}")
             response = "Error: A problem occurred while decoding the response."
 
-
     # Clean up memory
     if inputs is not None: del inputs
     if output_ids is not None: del output_ids
@@ -232,13 +247,8 @@ def predict(message, history):
 # --- Gradio Interface Setup ---
 print("--- Setting up Gradio Interface ---")
 
-#
-chatbot_component = gr.Chatbot(
-    label="HyperCLOVA X SEED (0.5B) Chat",
-    bubble_full_width=False,
-    height=600,
-    type='messages' # Specify this explicitly to ensure compatibility with ChatInterface
-)
+# No need to create a separate Chatbot component beforehand
+# chatbot_component = gr.Chatbot(...) # REMOVED
 
 examples = [
     ["What is Naver CLOVA X?"],
@@ -247,34 +257,32 @@ examples = [
     ["I'm planning a trip to Jeju Island. Could you put together a recommended 3-night, 4-day itinerary?"],
 ]
 
-# ChatInterface
+# Let ChatInterface manage its own internal Chatbot component
+# Remove the chatbot=... argument
 demo = gr.ChatInterface(
-    fn=predict, #
-    chatbot=chatbot_component, #
+    fn=predict, # Link the prediction function
+    # chatbot=chatbot_component, # REMOVED
     title="🇰🇷 Naver HyperCLOVA X SEED (0.5B) Demo",
     description=(
         f"**Model:** {MODEL_ID}\n"
         f"**Environment:** Hugging Face free CPU (16GB RAM)\n"
-        f"**Caution:** Runs on CPU, so generating a response may take some time. (Warm-up
+        f"**Caution:** Runs on CPU, so generating a response may take some time. (Warm-up complete)\n"
         f"The maximum number of generated tokens is limited to {MAX_NEW_TOKENS}."
     ),
     examples=examples,
-    cache_examples=False,
+    cache_examples=False,
     theme="soft",
-    # retry_btn, undo_btn, clear_btn, etc. are not directly supported in recent versions
 )
 
 # --- Application Launch ---
 if __name__ == "__main__":
-    # Run warm-up only if the model loaded successfully
     if load_successful:
         warmup_model()
     else:
         print("Skipping warm-up because model loading failed.")
 
     print("--- Launching Gradio App ---")
-    # queue() is useful for handling multiple users and managing long-running tasks
     demo.queue().launch(
-        # share=True #
-        # server_name="0.0.0.0" #
+        # share=True # Uncomment for public link
+        # server_name="0.0.0.0" # Uncomment for local network access
     )
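Both generate() calls now go through a gen_kwargs dict so that eos_token_id is only supplied when some stop-token IDs were actually resolved; recent transformers versions accept either a single id or a list of ids for that argument, which is why the resolved list can be passed through directly. A condensed sketch of that pattern follows; model, tokenizer, inputs, and stop_token_ids_list are assumed to already exist as in app.py, and 512 is only a stand-in for MAX_NEW_TOKENS.

# Condensed sketch of the new generation path; names are assumed to be defined as in app.py.
import torch

gen_kwargs = {
    "max_new_tokens": 512,  # stand-in for MAX_NEW_TOKENS
    "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
}
if stop_token_ids_list:  # only pass stop ids that were actually resolved
    gen_kwargs["eos_token_id"] = stop_token_ids_list

with torch.no_grad():
    output_ids = model.generate(**inputs, **gen_kwargs)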