feras-vbrl committed on
Commit d847350 · verified · 1 Parent(s): d876e1d

Upload app.py

Files changed (1)
  1. app.py +21 -7
app.py CHANGED
@@ -26,7 +26,7 @@ st.set_page_config(
 # Cache the model loading to avoid reloading on each interaction
 @st.cache_resource
 def load_model():
-    with st.spinner("Loading model with vllm for T4 GPU..."):
+    with st.spinner("Loading model..."):
         # Check if GPU is available
         gpu_available = torch.cuda.is_available()
         st.info(f"GPU available: {gpu_available}")
@@ -86,7 +86,9 @@ def load_model():
         )
 
         # Move model to appropriate device if needed
-        if 'device_map' not in locals() or device_map is None:
+        # Check if the model has a device_map attribute and if it's not None
+        # If it has a device_map, it's already distributed across devices and shouldn't be moved
+        if not hasattr(model, 'device_map') or model.device_map is None:
             model = model.to(device)
 
         tokenizer = AutoTokenizer.from_pretrained("sciphi/triplex", trust_remote_code=True)
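For reference, the new guard can be read in isolation as the pattern below; a minimal sketch, assuming a hypothetical helper name move_if_unmapped and the same plain attribute check as the diff (getattr keeps it safe on models that never set a device_map attribute):

import torch

def move_if_unmapped(model, device):
    # Same guard as the diff: only call .to(device) when the model was not
    # already dispatched across devices via a device_map.
    if getattr(model, "device_map", None) is None:
        model = model.to(device)
    return model

# Usage with an ordinary torch module (no device_map attribute, so it is moved):
device = "cuda" if torch.cuda.is_available() else "cpu"
model = move_if_unmapped(torch.nn.Linear(4, 4), device)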
@@ -123,8 +125,15 @@ def triplextract(model, tokenizer, text, entity_types, predicates, use_vllm=True
     else:
         # Use standard transformers
         messages = [{'role': 'user', 'content': message}]
-        device = next(model.parameters()).device  # Get the device the model is on
-        input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(device)
+
+        # Handle device mapping differently based on model configuration
+        if hasattr(model, 'device_map') and model.device_map is not None:
+            # Model already has device mapping, don't need to specify device for input_ids
+            input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+        else:
+            # Get the device the model is on
+            device = next(model.parameters()).device
+            input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(device)
         output = tokenizer.decode(model.generate(input_ids=input_ids, max_length=2048)[0], skip_special_tokens=True)
 
     processing_time = time.time() - start_time
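The same attribute check drives the input placement added above; a minimal standalone sketch of that branch (build_input_ids is a hypothetical name, and a chat-template-capable tokenizer is assumed, as in the app):

def build_input_ids(tokenizer, model, messages):
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )
    # With a device_map the model is already spread across devices, so the
    # tensor is left as produced; otherwise move it to the parameters' device.
    if getattr(model, "device_map", None) is None:
        input_ids = input_ids.to(next(model.parameters()).device)
    return input_ids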
@@ -278,9 +287,14 @@ def main():
 
     # Add a note about performance
     if torch.cuda.is_available():
-        st.success("""
-        🚀 Running on GPU with vllm for optimal performance!
-        """)
+        if use_vllm:
+            st.success("""
+            🚀 Running on GPU with vllm for optimal performance!
+            """)
+        else:
+            st.success("""
+            🚀 Running on GPU for improved performance!
+            """)
     else:
         st.warning("""
         ⚠️ You are running on CPU which can be very slow for the SciPhi/Triplex model.
 