sparkleman committed
Commit 05b6df6 · 1 Parent(s): cdd6039

CKPT: Space CPU version

Files changed (2):
  1. Dockerfile +1 -1
  2. app.py +6 -5
Dockerfile CHANGED
@@ -26,4 +26,4 @@ COPY --chown=user . $HOME/app
 
 RUN uv sync --frozen --extra cu124
 
-CMD ["uv","run","app.py","--strategy","cpu i4","--model_title","RWKV-x070-World-0.1B-v2.8-20241210-ctx4096","--download_repo_id","BlinkDL/rwkv-7-world","--host","0.0.0.0","--port","7860"]
+CMD ["uv","run","app.py","--strategy","cuda fp16","--model_title","RWKV-x070-World-0.1B-v2.8-20241210-ctx4096","--download_repo_id","BlinkDL/rwkv-7-world","--host","0.0.0.0","--port","7860"]
app.py CHANGED
@@ -92,7 +92,7 @@ class ChatCompletionRequest(BaseModel):
         description="Add `:thinking` suffix to the model name to enable reasoning. Example: `rwkv-latest:thinking`",
     )
     messages: List[ChatMessage]
-    prompt: Union[str, None] = Field(default=None)
+    prompt: Optional[str] = Field(default=None)
     max_tokens: int = Field(default=512)
     temperature: float = Field(default=1.0)
     top_p: float = Field(default=0.3)
@@ -114,7 +114,7 @@ app.add_middleware(
 )
 
 
-def runPrefill(ctx: str, model_tokens: List[int], model_state):
+async def runPrefill(ctx: str, model_tokens: List[int], model_state):
     ctx = ctx.replace("\r\n", "\n")
 
     tokens = pipeline.encode(ctx)
@@ -124,6 +124,7 @@ def runPrefill(ctx: str, model_tokens: List[int], model_state):
     while len(tokens) > 0:
         out, model_state = model.forward(tokens[: CONFIG.CHUNK_LEN], model_state)
         tokens = tokens[CONFIG.CHUNK_LEN :]
+        await asyncio.sleep(0)
 
     return out, model_tokens, model_state
 
@@ -220,7 +221,7 @@ async def chatResponse(
         else request.prompt.strip()
     )
 
-    out, model_tokens, model_state = runPrefill(prompt, [], model_state)
+    out, model_tokens, model_state = await runPrefill(prompt, [], model_state)
 
     prefillTime = time.time()
     promptTokenCount = len(model_tokens)
@@ -301,7 +302,7 @@ async def chatResponseStream(
         else request.prompt.strip()
    )
 
-    out, model_tokens, model_state = runPrefill(prompt, [], model_state)
+    out, model_tokens, model_state = await runPrefill(prompt, [], model_state)
 
     prefillTime = time.time()
     promptTokenCount = len(model_tokens)
@@ -530,7 +531,7 @@ async def chat_completions(request: ChatCompletionRequest):
     completionId = str(next(CompletionIdGenerator))
     logger.info(f"[REQ] {completionId} - {request.model_dump()}")
 
-    def chatResponseStreamDisconnect():
+    async def chatResponseStreamDisconnect():
         if "cuda" in CONFIG.STRATEGY:
             gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
             logger.info(
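
The app.py hunks above turn runPrefill into a coroutine that yields control back to the asyncio event loop after each chunk via await asyncio.sleep(0), so a long CPU-bound prefill does not starve other coroutines on the same loop; its callers and the nested chatResponseStreamDisconnect are made async to match. Below is a minimal standalone sketch of that yield-per-chunk pattern, not the Space's actual code: forward, CHUNK_LEN, and the integer token list are stand-ins for model.forward, CONFIG.CHUNK_LEN, and the output of pipeline.encode.

import asyncio
from typing import Any, List, Tuple

CHUNK_LEN = 256  # stand-in for CONFIG.CHUNK_LEN


def forward(chunk: List[int], state: Any) -> Tuple[int, Any]:
    # Stand-in for model.forward: consume one chunk of token ids and
    # return a dummy output together with an updated "state".
    return sum(chunk), (state or 0) + len(chunk)


async def run_prefill(tokens: List[int], state: Any = None) -> Tuple[Any, Any]:
    out = None
    while len(tokens) > 0:
        out, state = forward(tokens[:CHUNK_LEN], state)
        tokens = tokens[CHUNK_LEN:]
        # Yield to the event loop so other coroutines (e.g. concurrent
        # requests) can make progress between CPU-bound chunks.
        await asyncio.sleep(0)
    return out, state


if __name__ == "__main__":
    print(asyncio.run(run_prefill(list(range(1000)))))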