prithivMLmods committed (verified)
Commit 4db58a4 · Parent(s): 9610d21

update app

Files changed (1): app.py (+1, -22)
app.py CHANGED
@@ -56,15 +56,6 @@ model_q = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load allenai/olmOCR-7B-0825
-MODEL_ID_F = "allenai/olmOCR-7B-0825"
-processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
-model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_F,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-
 # Load R-4B
 MODEL_ID_Y = "YannQi/R-4B"
 processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
@@ -115,9 +106,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "olmOCR-7B-0825":
-        processor = processor_f
-        model = model_f
     elif model_name == "R-4B":
         processor = processor_y
         model = model_y
@@ -175,9 +163,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
         processor = processor_q
         model = model_q
-    elif model_name == "olmOCR-7B-0825":
-        processor = processor_f
-        model = model_f
     elif model_name == "R-4B":
         processor = processor_y
         model = model_y
@@ -295,7 +280,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     markdown_output = gr.Markdown()
 
     model_choice = gr.Radio(
-        choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct", "R-4B", "Qwen2.5-VL-7B-Abliterated-Caption-it", "olmOCR-7B-0825"],
+        choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct", "R-4B", "Qwen2.5-VL-7B-Abliterated-Caption-it"],
         label="Select Model",
         value="Qwen2.5-VL-7B-Instruct"
     )
@@ -305,12 +290,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     """
     > [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): The Qwen2.5-VL-7B-Instruct model is a multimodal AI model developed by Alibaba Cloud that excels at understanding both text and images. It's a Vision-Language Model (VLM) designed to handle various visual understanding tasks, including image understanding, video analysis, and even multilingual support.
     >
-    > [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction.
-    """
-    )
-
-    gr.Markdown(
-    """
     > [Qwen2.5-VL-7B-Abliterated-Caption-it](prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it): Qwen2.5-VL-7B-Abliterated-Caption-it is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Abliterated Captioning / Uncensored Captioning. This model excels at generating detailed, context-rich, and high-fidelity captions across diverse image categories and variational aspect ratios, offering robust visual understanding without filtering or censorship.
     >
     > [olmOCR-7B-0825](https://huggingface.co/allenai/olmOCR-7B-0825): olmOCR-7B-0825 is a 7B parameter open large model designed for OCR tasks with robust text extraction, especially in complex document layouts. Multimodal model emphasizing strong document reading and extraction capabilities combined with vision-language understanding to support detailed document parsing tasks.
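The block deleted in the first hunk follows the same load pattern the app keeps for its remaining checkpoints. For reference, here is that pattern as a minimal standalone sketch, using only the transformers calls already visible in the diff; the checkpoint ID is just one example from the app's radio list:

```python
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# The app pins float16 and moves the model to a single device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Any Qwen2.5-VL-style checkpoint from the radio list works here.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()
```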
 
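Note that the same olmOCR branch had to be removed twice, once from generate_image and once from generate_video, and a third time from the gr.Radio choices. A minimal sketch of one way to centralize that dispatch; MODEL_REGISTRY, register, and resolve_model are hypothetical names, not part of this app, and the registered pair shown is one named in the diff:

```python
import gradio as gr

# Hypothetical registry keyed by the radio-button label. The app would call
# register(...) once per checkpoint it loads at startup, e.g.
#   register("R-4B", processor_y, model_y)
MODEL_REGISTRY: dict[str, tuple] = {}

def register(name: str, processor, model) -> None:
    MODEL_REGISTRY[name] = (processor, model)

def resolve_model(name: str):
    # One lookup replaces the duplicated elif chains in
    # generate_image and generate_video.
    try:
        return MODEL_REGISTRY[name]
    except KeyError:
        raise gr.Error(f"Unknown model: {name}")
```

The radio choices could then be derived as list(MODEL_REGISTRY), so dropping a model becomes a single deletion instead of the four coordinated edits this commit makes.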