SamuelJaja committed
Commit 60ae325 · verified · 1 Parent(s): 722a723

Update app.py

Files changed (1): app.py +78 -26
app.py CHANGED
@@ -1,10 +1,15 @@
 import os
+import logging
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from peft import PeftModel
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel

+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
 # Initialize FastAPI
 app = FastAPI()

@@ -17,43 +22,90 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 if HF_TOKEN is None:
     raise ValueError("Hugging Face token not found. Please set the HF_TOKEN environment variable.")

-# Define the offload directory
-OFFLOAD_DIR = "/app/offload"
-os.makedirs(OFFLOAD_DIR, exist_ok=True)
-
-# Load tokenizer with authentication
-tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN, cache_dir=OFFLOAD_DIR)
-
-# Load base model with offloading
-model = AutoModelForCausalLM.from_pretrained(
-    BASE_MODEL,
-    torch_dtype=torch.float16,
-    device_map="auto",
-    token=HF_TOKEN,
-    cache_dir=OFFLOAD_DIR,
-    offload_folder=OFFLOAD_DIR  # Specify the offload directory
-)
-
-# Load fine-tuned weights
-model = PeftModel.from_pretrained(
-    model,
-    FINETUNED_MODEL,
-    token=HF_TOKEN,
-    cache_dir=OFFLOAD_DIR,
-    offload_folder=OFFLOAD_DIR  # Ensure offloading is consistent
-)
+# Define the cache directory
+CACHE_DIR = "/app/cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
+
+try:
+    # Load tokenizer with authentication
+    logger.info(f"Loading tokenizer from {BASE_MODEL}")
+    tokenizer = AutoTokenizer.from_pretrained(
+        BASE_MODEL,
+        token=HF_TOKEN,
+        cache_dir=CACHE_DIR
+    )
+
+    # Load base model with simplified configuration
+    logger.info(f"Loading base model from {BASE_MODEL}")
+    model = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        token=HF_TOKEN,
+        cache_dir=CACHE_DIR,
+        trust_remote_code=True
+    )
+
+    # Load fine-tuned adapter with simplified approach
+    logger.info(f"Loading adapter from {FINETUNED_MODEL}")
+    adapter_model = PeftModel.from_pretrained(
+        model,
+        FINETUNED_MODEL,
+        token=HF_TOKEN,
+        device_map="auto",
+        torch_dtype=torch.float16,
+        is_trainable=False  # Set to False for inference
+    )
+
+    # Merge adapter weights with base model for better performance (optional)
+    logger.info("Merging adapter weights with base model")
+    model = adapter_model.merge_and_unload()
+
+    logger.info("Model loading completed successfully")
+except Exception as e:
+    logger.error(f"Error loading model: {str(e)}")
+    raise

 # Define request body
 class Query(BaseModel):
     text: str
+    max_tokens: int = 200
+    temperature: float = 0.7

 # Define the text generation endpoint
 @app.post("/generate")
 async def generate_text(query: Query):
     try:
+        logger.info(f"Generating text for input: {query.text[:50]}...")
+
         inputs = tokenizer(query.text, return_tensors="pt").to(model.device)
-        output = model.generate(**inputs, max_new_tokens=200)
+
+        with torch.no_grad():
+            output = model.generate(
+                **inputs,
+                max_new_tokens=query.max_tokens,
+                temperature=query.temperature,
+                do_sample=True if query.temperature > 0 else False
+            )
+
         response_text = tokenizer.decode(output[0], skip_special_tokens=True)
+        logger.info("Text generation successful")
+
         return {"response": response_text}
     except Exception as e:
+        logger.error(f"Error in text generation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
+
+# Health check endpoint
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy"}
+
+# Model info endpoint
+@app.get("/info")
+async def model_info():
+    return {
+        "base_model": BASE_MODEL,
+        "adapter_model": FINETUNED_MODEL,
+        "device": str(model.device)
+    }