ai-tube-model-moondream2

Paused

jbilcke-hf HF staff committed on Apr 22, 2024

Commit

566b8be

verified ·

1 Parent(s): f62c867

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -22,18 +22,31 @@ def readb64(b64):
     return img
-# not sure why
 #import subprocess
 #subprocess.run('pip3 install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 model_id = "vikhyatk/moondream2"
-revision = "2024-04-02"
-tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
 moondream = AutoModelForCausalLM.from_pretrained(
-    model_id, trust_remote_code=True, revision=revision,
-    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
-    attn_implementation="flash_attention_2"
-)
 moondream.eval()
 def answer_question(secret_token, input, prompt):
@@ -60,11 +73,13 @@ def answer_question(secret_token, input, prompt):
     buffer = ""
     for new_text in streamer:
         buffer += new_text
-    buffer.strip()
-    return buffer
 with gr.Blocks() as demo:
     gr.HTML("""

     return img
+#
+# this version works in the official demo but fails when I fork it, and I'm not sure why
+#
 #import subprocess
 #subprocess.run('pip3 install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+#model_id = "vikhyatk/moondream2"
+#revision = "2024-04-02"
+#tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+#moondream = AutoModelForCausalLM.from_pretrained(
+#    model_id, trust_remote_code=True, revision=revision,
+#    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
+#    attn_implementation="flash_attention_2"
+#)
+#moondream.eval()
+# so let's use an older version
+if torch.cuda.is_available():
+    device, dtype = "cuda", torch.float16
+else:
+    device, dtype = "cpu", torch.float32
 model_id = "vikhyatk/moondream2"
+tokenizer = AutoTokenizer.from_pretrained(model_id, revision="2024-03-06")
 moondream = AutoModelForCausalLM.from_pretrained(
+    model_id, trust_remote_code=True, revision="2024-03-06"
+).to(device=device, dtype=dtype)
 moondream.eval()
 def answer_question(secret_token, input, prompt):
     buffer = ""
     for new_text in streamer:
+        # do we really need this? (clean_text is never used: buffer still receives the raw new_text)
+        clean_text = re.sub("<$|<END$", "", new_text)
         buffer += new_text
+    return buffer.strip()
 with gr.Blocks() as demo:
     gr.HTML("""