prithivMLmods committed · verified
Commit 6833cd8 · 1 Parent(s): 4fd729b
Files changed (1)
  1. app.py +7 -7
app.py CHANGED
@@ -56,10 +56,10 @@ model_q = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load R-4B
-MODEL_ID_Y = "YannQi/R-4B"
+# Load Lumian2-VLR-7B-Thinking
+MODEL_ID_Y = "prithivMLmods/Lumian2-VLR-7B-Thinking"
 processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
-model_y = AutoModel.from_pretrained(
+model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_Y,
     trust_remote_code=True,
     torch_dtype=torch.float16
@@ -106,7 +106,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "R-4B":
+    elif model_name == "Lumian2-VLR-7B-Thinking":
         processor = processor_y
         model = model_y
     else:
@@ -163,7 +163,7 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "R-4B":
+    elif model_name == "Lumian2-VLR-7B-Thinking":
         processor = processor_y
         model = model_y
     else:
@@ -280,7 +280,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         markdown_output = gr.Markdown()
 
         model_choice = gr.Radio(
-            choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct", "R-4B", "Qwen2.5-VL-7B-Abliterated-Caption-it"],
+            choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct", "Lumian2-VLR-7B-Thinking", "Qwen2.5-VL-7B-Abliterated-Caption-it"],
             label="Select Model",
             value="Qwen2.5-VL-7B-Instruct"
         )
@@ -294,7 +294,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         """
         )
 
-    gr.Markdown("> [R-4B](https://huggingface.co/YannQi/R-4B): R-4B is a multimodal large language model designed for adaptive auto-thinking, able to intelligently switch between detailed reasoning and direct responses to optimize quality and efficiency. It achieves state-of-the-art performance and efficiency with user-controllable response modes, making it ideal for both simple and complex tasks.")
+    gr.Markdown("> [Lumian2-VLR-7B-Thinking](https://huggingface.co/prithivMLmods/Lumian2-VLR-7B-Thinking): Lumian2-VLR-7B-Thinking is an experimental high-fidelity vision-language reasoning model designed for fine-grained multimodal understanding. Built on Qwen2.5-VL-7B-Instruct, it enhances image captioning, sampled video reasoning, and document comprehension through explicit grounded reasoning, producing structured reasoning traces aligned with visual coordinates for explainable multimodal reasoning.")
 
     gr.Markdown("> ⚠️ Note: the models in this Space are not guaranteed to perform well in video inference use cases.")
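For context, a minimal standalone sketch of the wiring this commit introduces: loading the swapped-in checkpoint with Qwen2_5_VLForConditionalGeneration (replacing AutoModel, since the new model is a Qwen2.5-VL fine-tune) and routing one image-captioning request through it. The prompt, image path, and generation settings below are illustrative assumptions, not part of the commit; the loader and processor calls mirror the ones in app.py above.

# Sketch of the new model wiring from this commit; prompt/image are placeholders.
import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID_Y = "prithivMLmods/Lumian2-VLR-7B-Thinking"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Same loading pattern as app.py: processor + Qwen2.5-VL generation head in fp16.
processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_Y,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

image = Image.open("example.jpg")  # hypothetical input image
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "Describe this image."},  # illustrative prompt
]}]
prompt = processor_y.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
inputs = processor_y(text=[prompt], images=[image], return_tensors="pt").to(device)

with torch.inference_mode():
    output_ids = model_y.generate(**inputs, max_new_tokens=128)

# Strip the prompt tokens and decode only the newly generated text.
generated = output_ids[:, inputs["input_ids"].shape[1]:]
print(processor_y.batch_decode(generated, skip_special_tokens=True)[0])

The same processor_y/model_y pair is what generate_image and generate_video select when the radio choice is "Lumian2-VLR-7B-Thinking", so this one loading change covers both inference paths.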