Running on T4

sparkleman committed · Commit 05b6df6 · Parent(s): cdd6039

CKPT: Space CPU version

- Dockerfile +1 -1
- app.py +6 -5
Dockerfile CHANGED
@@ -26,4 +26,4 @@ COPY --chown=user . $HOME/app
 
 RUN uv sync --frozen --extra cu124
 
-CMD ["uv","run","app.py","--strategy","
+CMD ["uv","run","app.py","--strategy","cuda fp16","--model_title","RWKV-x070-World-0.1B-v2.8-20241210-ctx4096","--download_repo_id","BlinkDL/rwkv-7-world","--host","0.0.0.0","--port","7860"]
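For reference, a minimal sketch of how one might call the server this CMD starts. The route "/v1/chat/completions" and the message shape are assumptions, not taken from this diff; only the port and the request fields (model, messages, max_tokens, temperature, top_p) come from the Dockerfile CMD and the ChatCompletionRequest model in app.py below.

import requests  # assumed available; not part of this repo's diff

resp = requests.post(
    "http://localhost:7860/v1/chat/completions",  # port 7860 from the CMD above; route is a guess
    json={
        "model": "rwkv-latest",  # append ":thinking" to enable reasoning, per the field description
        "messages": [{"role": "user", "content": "Hello"}],  # ChatMessage shape assumed
        "max_tokens": 512,
        "temperature": 1.0,
        "top_p": 0.3,
    },
    timeout=60,
)
print(resp.json())
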
app.py CHANGED
@@ -92,7 +92,7 @@ class ChatCompletionRequest(BaseModel):
         description="Add `:thinking` suffix to the model name to enable reasoning. Example: `rwkv-latest:thinking`",
     )
     messages: List[ChatMessage]
-    prompt:
+    prompt: Optional[str] = Field(default=None)
     max_tokens: int = Field(default=512)
     temperature: float = Field(default=1.0)
     top_p: float = Field(default=0.3)
@@ -114,7 +114,7 @@ app.add_middleware(
 )
 
 
-def runPrefill(ctx: str, model_tokens: List[int], model_state):
+async def runPrefill(ctx: str, model_tokens: List[int], model_state):
     ctx = ctx.replace("\r\n", "\n")
 
     tokens = pipeline.encode(ctx)
@@ -124,6 +124,7 @@ def runPrefill(ctx: str, model_tokens: List[int], model_state):
     while len(tokens) > 0:
         out, model_state = model.forward(tokens[: CONFIG.CHUNK_LEN], model_state)
         tokens = tokens[CONFIG.CHUNK_LEN :]
+        await asyncio.sleep(0)
 
     return out, model_tokens, model_state
 
@@ -220,7 +221,7 @@ async def chatResponse(
         else request.prompt.strip()
     )
 
-    out, model_tokens, model_state = runPrefill(prompt, [], model_state)
+    out, model_tokens, model_state = await runPrefill(prompt, [], model_state)
 
     prefillTime = time.time()
     promptTokenCount = len(model_tokens)
@@ -301,7 +302,7 @@ async def chatResponseStream(
         else request.prompt.strip()
     )
 
-    out, model_tokens, model_state = runPrefill(prompt, [], model_state)
+    out, model_tokens, model_state = await runPrefill(prompt, [], model_state)
 
     prefillTime = time.time()
     promptTokenCount = len(model_tokens)
@@ -530,7 +531,7 @@ async def chat_completions(request: ChatCompletionRequest):
     completionId = str(next(CompletionIdGenerator))
     logger.info(f"[REQ] {completionId} - {request.model_dump()}")
 
-    def chatResponseStreamDisconnect():
+    async def chatResponseStreamDisconnect():
         if "cuda" in CONFIG.STRATEGY:
             gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
             logger.info(
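The substance of the app.py change is cooperative scheduling: runPrefill becomes a coroutine and yields to the event loop with await asyncio.sleep(0) after every CONFIG.CHUNK_LEN-token forward pass, so a long prefill no longer blocks other requests or the disconnect handler. A self-contained sketch of the pattern is below; the placeholder arithmetic stands in for model.forward and pipeline.encode, which are not reproduced here.

import asyncio
from typing import List

CHUNK_LEN = 256  # stand-in for CONFIG.CHUNK_LEN

async def run_prefill_sketch(tokens: List[int]) -> int:
    # Process tokens chunk by chunk, yielding between chunks so other
    # coroutines (e.g. concurrent requests) can run mid-prefill.
    out = 0
    while len(tokens) > 0:
        chunk, tokens = tokens[:CHUNK_LEN], tokens[CHUNK_LEN:]
        out += sum(chunk)          # placeholder for model.forward(chunk, state)
        await asyncio.sleep(0)     # hand control back to the event loop
    return out

async def main():
    # Two "requests" prefilling concurrently; without the sleep(0) the first
    # would run to completion before the second ever started.
    results = await asyncio.gather(
        run_prefill_sketch(list(range(10_000))),
        run_prefill_sketch(list(range(5_000))),
    )
    print(results)

asyncio.run(main())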