Spaces:

wiklif
/

my-api

Sleeping

wiklif committed on Jul 24, 2024

Commit

f7fc778

1 Parent(s): b7844b5

Zamiast używać InferenceClient, ładujemy model lokalnie za pomocą AutoModelForCausalLM i AutoTokenizer.

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,29 +1,37 @@
 import spaces
-from huggingface_hub import InferenceClient
 import gradio as gr
-import os
-# Inicjalizacja klienta
-client = InferenceClient(
-    model='meta-llama/Meta-Llama-3.1-8B',
-    token=os.environ.get("MY_API_LLAMA_3_1")
-)
 @spaces.GPU(duration=60)
 def generate_response(chat, kwargs):
-    output = ''
-    stream = client.text_generation(chat, **kwargs, stream=True, details=True, return_full_text=False)
-    for response in stream:
-        output += response.token.text
-    if output.endswith("</s>"):  # Sprawdzamy, czy odpowiedź kończy się tagiem </s>
-        output = output[:-4]  # Usuwamy tag </s> z końca odpowiedzi
     return output
 def function(prompt, history=[]):
     chat = "<s>"
     for user_prompt, bot_response in history:
         chat += f"[INST] {user_prompt} [/INST] {bot_response}</s> <s>"
-    chat += f"[INST] {prompt} [/INST]"  # Zostawiamy tylko tag otwierający <s> na początku i kończymy ciąg zwykłym znacznikiem
     kwargs = dict(
         temperature=0.5,
         max_new_tokens=4096,

+import os
 import spaces
 import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+model_id = "meta-llama/Meta-Llama-3.1-8B"
+@spaces.GPU(duration=60)
+def load_model():
+    """Build the text-generation pipeline for ``model_id``.
+
+    The tokenizer and the causal-LM weights are both requested with the
+    access token read from the ``MY_API_LLAMA_3_1`` environment variable.
+
+    Returns:
+        The object returned by ``pipeline("text-generation", ...)`` for the
+        loaded model and tokenizer.
+    """
+    # Read the token once and reuse it for both downloads.
+    hf_token = os.environ.get("MY_API_LLAMA_3_1")
+    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        token=hf_token,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        low_cpu_mem_usage=True,
+    )
+    # NOTE(review): this loader runs under @spaces.GPU at import time;
+    # confirm that is the intended pattern for this Space's hardware.
+    return pipeline("text-generation", model=model, tokenizer=tokenizer)
+# Built once at import time; generate_response() calls this pipeline.
+pipe = load_model()
 @spaces.GPU(duration=60)
 def generate_response(chat, kwargs):
+    """Generate a completion for ``chat`` with the module-level ``pipe``.
+
+    Args:
+        chat: Fully formatted prompt string passed to the pipeline.
+        kwargs: Mapping of generation options forwarded to ``pipe``. A
+            ``return_full_text`` entry, if present, takes precedence over
+            the default applied here.
+
+    Returns:
+        The generated text, with one trailing ``</s>`` removed if present.
+    """
+    # The text-generation pipeline returns prompt + completion unless told
+    # otherwise; callers want only the newly generated text, so default
+    # return_full_text to False without clobbering an explicit caller value.
+    options = {"return_full_text": False, **kwargs}
+    output = pipe(chat, **options)[0]['generated_text']
+    if output.endswith("</s>"):
+        output = output[:-4]
     return output
 def function(prompt, history=[]):
     chat = "<s>"
     for user_prompt, bot_response in history:
         chat += f"[INST] {user_prompt} [/INST] {bot_response}</s> <s>"
+    chat += f"[INST] {prompt} [/INST]"
     kwargs = dict(
         temperature=0.5,
         max_new_tokens=4096,