Spaces:
Paused
Paused
Peter Larnholt
committed on
Commit
·
e48919a
1
Parent(s):
275a99c
Upgrade to vLLM 0.6.3.post1 and remove pyairports workarounds
Browse files
- Upgrade vllm from 0.5.5 to 0.6.3.post1 (stable version with proper dependency management)
- Upgrade torch from 2.4.0 to 2.5.0 (CUDA 12.1 compatible)
- Remove all pyairports/outlines workarounds (handled natively by newer vLLM)
- Delete sitecustomize.py (no longer needed)
- Simplify Dockerfile (remove cache directories and patches)
- Clean up app.py (remove VLLM_USE_OUTLINES environment variable manipulation)
- Dockerfile +1 -11
- app.py +1 -5
- requirements.txt +2 -6
- sitecustomize.py +0 -14
Dockerfile
CHANGED
|
@@ -3,11 +3,7 @@ FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
|
|
| 3 |
ENV DEBIAN_FRONTEND=noninteractive \
|
| 4 |
PYTHONUNBUFFERED=1 \
|
| 5 |
PIP_NO_CACHE_DIR=1 \
|
| 6 |
-
HF_HUB_ENABLE_HF_TRANSFER=1
|
| 7 |
-
# Give numba/outlines a writable cache in Spaces runtime
|
| 8 |
-
NUMBA_CACHE_DIR=/tmp/numba_cache \
|
| 9 |
-
OUTLINES_CACHE_DIR=/tmp/outlines_cache
|
| 10 |
-
# If issues persist, add: NUMBA_DISABLE_FILE_SYSTEM_CACHING=1
|
| 11 |
|
| 12 |
RUN apt-get update && apt-get install -y python3 python3-pip git && rm -rf /var/lib/apt/lists/*
|
| 13 |
WORKDIR /app
|
|
@@ -15,12 +11,6 @@ WORKDIR /app
|
|
| 15 |
COPY requirements.txt /app/
|
| 16 |
RUN python3 -m pip install --upgrade pip && pip3 install -r requirements.txt
|
| 17 |
|
| 18 |
-
# Copy sitecustomize.py to Python's site-packages to patch pyairports import globally
|
| 19 |
-
COPY sitecustomize.py /usr/local/lib/python3.10/dist-packages/
|
| 20 |
-
|
| 21 |
-
# ensure caches exist & are writable in Spaces container
|
| 22 |
-
RUN mkdir -p /tmp/numba_cache /tmp/outlines_cache && chmod -R 777 /tmp/numba_cache /tmp/outlines_cache
|
| 23 |
-
|
| 24 |
COPY app.py /app/
|
| 25 |
|
| 26 |
# Spaces exposes the app on $PORT
|
|
|
|
| 3 |
ENV DEBIAN_FRONTEND=noninteractive \
|
| 4 |
PYTHONUNBUFFERED=1 \
|
| 5 |
PIP_NO_CACHE_DIR=1 \
|
| 6 |
+
HF_HUB_ENABLE_HF_TRANSFER=1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
RUN apt-get update && apt-get install -y python3 python3-pip git && rm -rf /var/lib/apt/lists/*
|
| 9 |
WORKDIR /app
|
|
|
|
| 11 |
COPY requirements.txt /app/
|
| 12 |
RUN python3 -m pip install --upgrade pip && pip3 install -r requirements.txt
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
COPY app.py /app/
|
| 15 |
|
| 16 |
# Spaces exposes the app on $PORT
|
app.py
CHANGED
|
@@ -10,8 +10,6 @@ import os, time, threading, subprocess, requests
|
|
| 10 |
from fastapi import FastAPI, Request, Response
|
| 11 |
import gradio as gr
|
| 12 |
|
| 13 |
-
os.environ["VLLM_USE_OUTLINES"] = "0" # turn off outlines (pyairports patched via sitecustomize.py)
|
| 14 |
-
|
| 15 |
MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
|
| 16 |
API_PORT = int(os.environ.get("API_PORT", "8000")) # vLLM internal port
|
| 17 |
SYSTEM_PROMPT = os.environ.get(
|
|
@@ -34,9 +32,7 @@ if "AWQ" in MODEL_ID.upper():
|
|
| 34 |
|
| 35 |
def launch_vllm():
|
| 36 |
print(f"[vLLM] Launch: {MODEL_ID}")
|
| 37 |
-
|
| 38 |
-
env["VLLM_USE_OUTLINES"] = "0" # disable outlines
|
| 39 |
-
subprocess.Popen(VLLM_ARGS, env=env)
|
| 40 |
|
| 41 |
def wait_vllm_ready(timeout=900, interval=3):
|
| 42 |
url = f"http://127.0.0.1:{API_PORT}/v1/models"
|
|
|
|
| 10 |
from fastapi import FastAPI, Request, Response
|
| 11 |
import gradio as gr
|
| 12 |
|
|
|
|
|
|
|
| 13 |
MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-14B-Instruct-AWQ")
|
| 14 |
API_PORT = int(os.environ.get("API_PORT", "8000")) # vLLM internal port
|
| 15 |
SYSTEM_PROMPT = os.environ.get(
|
|
|
|
| 32 |
|
| 33 |
def launch_vllm():
|
| 34 |
print(f"[vLLM] Launch: {MODEL_ID}")
|
| 35 |
+
subprocess.Popen(VLLM_ARGS)
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def wait_vllm_ready(timeout=900, interval=3):
|
| 38 |
url = f"http://127.0.0.1:{API_PORT}/v1/models"
|
requirements.txt
CHANGED
|
@@ -4,12 +4,8 @@ gradio>=4.38
|
|
| 4 |
requests>=2.31
|
| 5 |
|
| 6 |
# vLLM + CUDA 12.1
|
| 7 |
-
vllm==0.5.5
|
| 8 |
--extra-index-url https://download.pytorch.org/whl/cu121
|
| 9 |
-
torch==2.4.0
|
| 10 |
transformers>=4.44
|
| 11 |
accelerate>=0.30
|
| 12 |
-
|
| 13 |
-
# Structured outputs stack used by vLLM
|
| 14 |
-
# outlines is imported by vLLM 0.5.5 even with VLLM_USE_OUTLINES=0
|
| 15 |
-
# We skip outlines dependencies and will patch the import
|
|
|
|
| 4 |
requests>=2.31
|
| 5 |
|
| 6 |
# vLLM + CUDA 12.1
|
| 7 |
+
vllm==0.6.3.post1
|
| 8 |
--extra-index-url https://download.pytorch.org/whl/cu121
|
| 9 |
+
torch==2.5.0
|
| 10 |
transformers>=4.44
|
| 11 |
accelerate>=0.30
|
|
|
|
|
|
|
|
|
|
|
|
sitecustomize.py
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Sitecustomize to patch pyairports module before any imports.
|
| 3 |
-
This runs automatically for all Python processes.
|
| 4 |
-
"""
|
| 5 |
-
import sys
|
| 6 |
-
from types import ModuleType
|
| 7 |
-
|
| 8 |
-
# Create fake pyairports module to satisfy outlines import
|
| 9 |
-
# vLLM 0.5.5 imports outlines even when VLLM_USE_OUTLINES=0
|
| 10 |
-
pyairports = ModuleType('pyairports')
|
| 11 |
-
pyairports.airports = ModuleType('pyairports.airports')
|
| 12 |
-
pyairports.airports.AIRPORT_LIST = []
|
| 13 |
-
sys.modules['pyairports'] = pyairports
|
| 14 |
-
sys.modules['pyairports.airports'] = pyairports.airports
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|