Spaces: Running on T4

sparkleman committed · adb6ad5
1 Parent(s): ff3952a

UPDATE: Add frontend

Files changed:
- .gitignore +4 -1
- Dockerfile +53 -2
- README.md +1 -1
- app.py +136 -61
- config.py +82 -0
- openai_test.py +0 -78
.gitignore CHANGED
@@ -13,4 +13,7 @@ wheels/
 
 *pth
 *.pt
-*.st
+*.st
+*local*
+
+dist-frontend/
Dockerfile CHANGED
@@ -9,12 +9,23 @@ apt install --no-install-recommends -y \
 apt clean && rm -rf /var/lib/apt/lists/*
 EOF
 
+# Install Node.js and npm
+RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - && \
+    apt-get install -y nodejs
+
+# Install pnpm
+RUN npm install -g pnpm
+
+# Clone the frontend repository and build it
+RUN git clone https://github.com/SolomonLeon/web-rwkv-realweb.git /frontend
+WORKDIR /frontend
+RUN pnpm install && pnpm run build
+
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 
 COPY . .
 
 RUN useradd -m -u 1000 user
-# Switch to the "user" user
 USER user
 
 ENV HOME=/home/user \
@@ -23,7 +34,47 @@ ENV HOME=/home/user \
 WORKDIR $HOME/app
 
 COPY --chown=user . $HOME/app
+COPY --chown=user /frontend/dist $HOME/app/dist-frontend
+
+RUN cat > $HOME/app/config.local.yaml<<EOF
+HOST: "0.0.0.0"
+PORT: 7860
+STRATEGY: "cuda fp16"
+RWKV_CUDA_ON: False
+CHUNK_LEN: 256
+MODELS:
+  - SERVICE_NAME: "RWKV-x070-World-0.1B-v2.8-20241210-ctx4096"
+    DOWNLOAD_MODEL_FILE_NAME: "RWKV-x070-World-0.1B-v2.8-20241210-ctx4096.pth"
+    DOWNLOAD_MODEL_REPO_ID: "BlinkDL/rwkv-7-world"
+    DOWNLOAD_MODEL_DIR: "./"
+    REASONING: False
+    DEFAULT: True
+    DEFAULT_SAMPLER:
+      max_tokens: 512
+      temperature: 1.0
+      top_p: 0.3
+      presence_penalty: 0.5
+      count_penalty: 0.5
+      penalty_decay: 0.996
+      stop:
+        - "\n\n"
+  - SERVICE_NAME: "RWKV7-G1-0.1B-68%trained-20250303-ctx4k"
+    DOWNLOAD_MODEL_FILE_NAME: "RWKV7-G1-0.1B-68%trained-20250303-ctx4k.pth"
+    DOWNLOAD_MODEL_REPO_ID: "BlinkDL/temp-latest-training-models"
+    DOWNLOAD_MODEL_DIR: "./"
+    REASONING: True
+    DEFAULT: True
+    DEFAULT_SAMPLER:
+      max_tokens: 4096
+      temperature: 1.0
+      top_p: 0.3
+      presence_penalty: 0.5
+      count_penalty: 0.5
+      penalty_decay: 0.996
+      stop:
+        - "\n\n"
+EOF
 
 RUN uv sync --frozen --extra cu124
 
-CMD ["uv","run","app.py",
+CMD ["uv","run","app.py",]
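The heredoc above bakes a `config.local.yaml` into the image, which the new `config.py` reads at startup. A minimal sketch for sanity-checking such a file before a build, assuming PyYAML is installed and the YAML from the heredoc has been saved locally as `config.local.yaml` (a hypothetical local copy, not part of this commit):

```python
# Sketch: sanity-check the generated config.local.yaml before building the image.
# Assumes PyYAML is installed and the heredoc YAML was saved as ./config.local.yaml.
import yaml

with open("config.local.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

assert cfg["PORT"] == 7860, "Spaces expects the app on port 7860"
assert any(m["DEFAULT"] for m in cfg["MODELS"]), "at least one DEFAULT model is required"
for m in cfg["MODELS"]:
    print(m["SERVICE_NAME"], "reasoning:", m.get("REASONING", False))
```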
README.md CHANGED
@@ -25,7 +25,7 @@ python app.py --strategy "cuda fp16" --model_title "RWKV-x070-World-0.1B-v2.8-20
 python app.py --strategy "cuda fp16" --model_title "RWKV7-G1-0.1B-68%trained-20250303-ctx4k" --download_repo_id "BlinkDL/temp-latest-training-models" --download_model_dir ./
 ```
 
-`RWKV7-G1-0.
+`RWKV7-G1-0.4B-68%trained-20250303-ctx4k`
 
 ```shell
 python app.py --strategy "cuda fp16" --model_title "RWKV7-G1-0.4B-32%trained-20250304-ctx4k" --download_repo_id "BlinkDL/temp-latest-training-models" --download_model_dir ./
app.py CHANGED
@@ -1,3 +1,5 @@
+from config import CONFIG, ModelConfig
+
 import os, copy, types, gc, sys, re, time, collections, asyncio
 from huggingface_hub import hf_hub_download
 from loguru import logger
@@ -6,32 +8,11 @@ from snowflake import SnowflakeGenerator
 
 CompletionIdGenerator = SnowflakeGenerator(42, timestamp=1741101491595)
 
-from typing import List, Optional, Union
-from pydantic import BaseModel, Field
+from typing import List, Optional, Union, Any, Dict
+from pydantic import BaseModel, Field, model_validator
 from pydantic_settings import BaseSettings
 
 
-class Config(BaseSettings, cli_parse_args=True, cli_use_class_docs_for_groups=True):
-    HOST: str = Field("127.0.0.1", description="Host")
-    PORT: int = Field(8000, description="Port")
-    DEBUG: bool = Field(False, description="Debug mode")
-    STRATEGY: str = Field("cpu", description="Stratergy")
-    MODEL_TITLE: str = Field("RWKV-x070-World-0.1B-v2.8-20241210-ctx4096")
-    DOWNLOAD_REPO_ID: str = Field("BlinkDL/rwkv-7-world")
-    DOWNLOAD_MODEL_DIR: Union[str, None] = Field(None, description="Model Download Dir")
-    MODEL_FILE_PATH: Union[str, None] = Field(None, description="Model Path")
-    GEN_penalty_decay: float = Field(0.996, description="Default penalty decay")
-    CHUNK_LEN: int = Field(
-        256,
-        description="split input into chunks to save VRAM (shorter -> slower, but saves VRAM)",
-    )
-    VOCAB: str = Field("rwkv_vocab_v20230424", description="Vocab Name")
-    RWKV_CUDA_ON:bool = Field(False, description="`True` to compile CUDA kernel (10x faster), requires c++ compiler & cuda libraries !!!")
-
-
-CONFIG = Config()
-
-
 import numpy as np
 import torch
 
@@ -58,9 +39,10 @@ os.environ["RWKV_CUDA_ON"] = (
 from rwkv.model import RWKV
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 
-from fastapi import FastAPI
+from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
 
 from api_types import (
     ChatMessage,
@@ -74,17 +56,50 @@ from api_types import (
 from utils import cleanMessages, parse_think_response
 
 
+class ModelStorage:
+    MODEL_CONFIG: Optional[ModelConfig] = None
+    model: Optional[RWKV] = None
+    pipeline: Optional[PIPELINE] = None
+
+
+MODEL_STORAGE: Dict[str, ModelStorage] = {}
+
+DEFALUT_MODEL_NAME = None
+DEFAULT_REASONING_MODEL_NAME = None
+
 logger.info(f"STRATEGY - {CONFIG.STRATEGY}")
-
-
-
-
-
+
+for model_config in CONFIG.MODELS:
+    logger.info(f"Load Model - {model_config.SERVICE_NAME}")
+
+    if model_config.MODEL_FILE_PATH == None:
+        model_config.MODEL_FILE_PATH = hf_hub_download(
+            repo_id=model_config.DOWNLOAD_MODEL_REPO_ID,
+            filename=model_config.DOWNLOAD_MODEL_FILE_NAME,
+            local_dir=model_config.DOWNLOAD_MODEL_DIR,
+        )
+    logger.info(f"Load Model - Path - {model_config.MODEL_FILE_PATH}")
+
+    tmp_model = RWKV(
+        model=model_config.DOWNLOAD_MODEL_FILE_NAME.replace(".pth", ""),
+        strategy=CONFIG.STRATEGY,
     )
+    tmp_pipeline = PIPELINE(tmp_model, model_config.VOCAB)
 
-
-
-
+    if model_config.DEFAULT:
+        if model_config.REASONING:
+            DEFAULT_REASONING_MODEL_NAME = model_config.SERVICE_NAME
+        else:
+            DEFALUT_MODEL_NAME = model_config.SERVICE_NAME
+
+    MODEL_STORAGE[model_config.SERVICE_NAME] = ModelStorage()
+    MODEL_STORAGE[model_config.SERVICE_NAME].MODEL_CONFIG = model_config
+    MODEL_STORAGE[model_config.SERVICE_NAME].model = tmp_model
+    MODEL_STORAGE[model_config.SERVICE_NAME].pipeline = tmp_pipeline
+
+
+logger.info(f"DEFALUT_MODEL_NAME is `{DEFALUT_MODEL_NAME}`")
+logger.info(f"DEFAULT_REASONING_MODEL_NAME is `{DEFAULT_REASONING_MODEL_NAME}`")
 
 
 class ChatCompletionRequest(BaseModel):
@@ -92,16 +107,33 @@ class ChatCompletionRequest(BaseModel):
         default="rwkv-latest",
         description="Add `:thinking` suffix to the model name to enable reasoning. Example: `rwkv-latest:thinking`",
     )
-    messages: List[ChatMessage]
+    messages: Optional[List[ChatMessage]] = Field(default=None)
     prompt: Optional[str] = Field(default=None)
-    max_tokens: int = Field(default=
-    temperature: float = Field(default=
-    top_p: float = Field(default=
-
-
-
-
-
+    max_tokens: Optional[int] = Field(default=None)
+    temperature: Optional[float] = Field(default=None)
+    top_p: Optional[float] = Field(default=None)
+    presence_penalty: Optional[float] = Field(default=None)
+    count_penalty: Optional[float] = Field(default=None)
+    penalty_decay: Optional[float] = Field(default=None)
+    stream: Optional[bool] = Field(default=False)
+    state_name: Optional[str] = Field(default=None)
+    include_usage: Optional[bool] = Field(default=False)
+    stop: Optional[list[str]] = Field(["\n\n"])
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_mutual_exclusivity(cls, data: Any) -> Any:
+        if not isinstance(data, dict):
+            return data
+
+        messages_provided = "messages" in data and data["messages"] != None
+        prompt_provided = "prompt" in data and data["prompt"] != None
+
+        if messages_provided and prompt_provided:
+            raise ValueError("messages and prompt cannot coexist. Choose one.")
+        if not messages_provided and not prompt_provided:
+            raise ValueError("Either messages or prompt must be provided.")
+        return data
 
 
 app = FastAPI(title="RWKV OpenAI-Compatible API")
@@ -115,15 +147,19 @@ app.add_middleware(
 )
 
 
-async def runPrefill(
+async def runPrefill(
+    request: ChatCompletionRequest, ctx: str, model_tokens: List[int], model_state
+):
     ctx = ctx.replace("\r\n", "\n")
 
-    tokens = pipeline.encode(ctx)
+    tokens = MODEL_STORAGE[request.model].pipeline.encode(ctx)
     tokens = [int(x) for x in tokens]
     model_tokens += tokens
 
     while len(tokens) > 0:
-        out, model_state = model.forward(
+        out, model_state = MODEL_STORAGE[request.model].model.forward(
+            tokens[: CONFIG.CHUNK_LEN], model_state
+        )
        tokens = tokens[CONFIG.CHUNK_LEN :]
        await asyncio.sleep(0)
 
@@ -141,8 +177,8 @@ def generate(
     args = PIPELINE_ARGS(
         temperature=max(0.2, request.temperature),
         top_p=request.top_p,
-        alpha_frequency=request.
-        alpha_presence=request.
+        alpha_frequency=request.count_penalty,
+        alpha_presence=request.presence_penalty,
         token_ban=[],  # ban the generation of some tokens
         token_stop=[0],
     )  # stop generation whenever you see any token here
@@ -158,20 +194,22 @@ def generate(
             out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
         out[0] -= 1e10  # disable END_OF_TEXT
 
-        token = pipeline.sample_logits(
+        token = MODEL_STORAGE[request.model].pipeline.sample_logits(
            out, temperature=args.temperature, top_p=args.top_p
        )
 
-        out, model_state = model.forward(
+        out, model_state = MODEL_STORAGE[request.model].model.forward(
+            [token], model_state
+        )
         model_tokens += [token]
 
         out_tokens += [token]
 
         for xxx in occurrence:
-            occurrence[xxx] *=
+            occurrence[xxx] *= request.penalty_decay
         occurrence[token] = 1 + (occurrence[token] if token in occurrence else 0)
 
-        tmp: str = pipeline.decode(out_tokens[out_last:])
+        tmp: str = MODEL_STORAGE[request.model].pipeline.decode(out_tokens[out_last:])
 
         if "\ufffd" in tmp:
             continue
@@ -210,19 +248,20 @@ def generate(
 
 
 async def chatResponse(
-    request: ChatCompletionRequest,
+    request: ChatCompletionRequest,
+    model_state: any,
+    completionId: str,
+    enableReasoning: bool,
 ) -> ChatCompletion:
     createTimestamp = time.time()
 
-    enableReasoning = request.model.endswith(":thinking")
-
     prompt = (
         f"{cleanMessages(request.messages)}\n\nAssistant:{' <think' if enableReasoning else ''}"
         if request.prompt == None
         else request.prompt.strip()
     )
 
-    out, model_tokens, model_state = await runPrefill(prompt, [], model_state)
+    out, model_tokens, model_state = await runPrefill(request, prompt, [], model_state)
 
     prefillTime = time.time()
     promptTokenCount = len(model_tokens)
@@ -291,19 +330,20 @@ async def chatResponse(
 
 
 async def chatResponseStream(
-    request: ChatCompletionRequest,
+    request: ChatCompletionRequest,
+    model_state: any,
+    completionId: str,
+    enableReasoning: bool,
 ):
     createTimestamp = int(time.time())
 
-    enableReasoning = request.model.endswith(":thinking")
-
     prompt = (
         f"{cleanMessages(request.messages)}\n\nAssistant:{' <think' if enableReasoning else ''}"
         if request.prompt == None
         else request.prompt.strip()
     )
 
-    out, model_tokens, model_state = await runPrefill(prompt, [], model_state)
+    out, model_tokens, model_state = await runPrefill(request, prompt, [], model_state)
 
     prefillTime = time.time()
     promptTokenCount = len(model_tokens)
@@ -343,7 +383,7 @@ async def chatResponseStream(
     buffer = []
 
     if enableReasoning:
-        buffer.append("
+        buffer.append("<think")
 
     streamConfig = {
         "isChecking": False,
@@ -532,6 +572,32 @@ async def chat_completions(request: ChatCompletionRequest):
     completionId = str(next(CompletionIdGenerator))
     logger.info(f"[REQ] {completionId} - {request.model_dump()}")
 
+    modelName = request.model.split(":")[0]
+    enableReasoning = ":thinking" in request.model
+
+    if "rwkv-latest" in request.model:
+        if enableReasoning:
+            if DEFAULT_REASONING_MODEL_NAME == None:
+                raise HTTPException(404, "DEFAULT_REASONING_MODEL_NAME not set")
+            defaultSamplerConfig = MODEL_STORAGE[
+                DEFAULT_REASONING_MODEL_NAME
+            ].MODEL_CONFIG.DEFAULT_SAMPLER
+            request.model = DEFAULT_REASONING_MODEL_NAME
+
+        else:
+            if DEFALUT_MODEL_NAME == None:
+                raise HTTPException(404, "DEFALUT_MODEL_NAME not set")
+            defaultSamplerConfig = MODEL_STORAGE[
+                DEFALUT_MODEL_NAME
+            ].MODEL_CONFIG.DEFAULT_SAMPLER
+            request.model = DEFALUT_MODEL_NAME
+
+    elif modelName in MODEL_STORAGE:
+        defaultSamplerConfig = MODEL_STORAGE[modelName].MODEL_CONFIG.DEFAULT_SAMPLER
+        request.model = modelName
+    else:
+        raise f"Can not find `{modelName}`"
+
     async def chatResponseStreamDisconnect():
         if "cuda" in CONFIG.STRATEGY:
             gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
@@ -540,18 +606,27 @@ async def chat_completions(request: ChatCompletionRequest):
     )
 
     model_state = None
+    request_dict = request.model_dump()
+
+    for k, v in defaultSamplerConfig.model_dump().items():
+        if request_dict[k] == None:
+            request_dict[k] = v
+    realRequest = ChatCompletionRequest(**request_dict)
+
+    logger.info(f"[REQ] {completionId} - Real - {request.model_dump()}")
 
     if request.stream:
         r = StreamingResponse(
-            chatResponseStream(
+            chatResponseStream(realRequest, model_state, completionId, enableReasoning),
             media_type="text/event-stream",
             background=chatResponseStreamDisconnect,
         )
     else:
-        r = await chatResponse(
+        r = await chatResponse(realRequest, model_state, completionId, enableReasoning)
 
     return r
 
+
+app.mount("/", StaticFiles(directory="dist-frontend", html=True), name="static")
+
 if __name__ == "__main__":
     import uvicorn
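With these changes, `rwkv-latest` and `rwkv-latest:thinking` resolve to the configured default (reasoning) model, and any sampler fields left unset on the request are filled from that model's `DEFAULT_SAMPLER`. A minimal client sketch, assuming the server is reachable at `http://127.0.0.1:7860/api/v1` (the prefix used by the deleted `openai_test.py`; host and port come from `config.local.yaml`) and that a default reasoning model is configured:

```python
# Sketch: exercise the multi-model routing added above with the openai client.
# Assumes `pip install openai` and a running server at the base_url below.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:7860/api/v1", api_key="sk-test")

# "rwkv-latest:thinking" resolves to DEFAULT_REASONING_MODEL_NAME; omitted sampler
# fields (temperature, top_p, ...) are filled from that model's DEFAULT_SAMPLER.
completion = client.chat.completions.create(
    model="rwkv-latest:thinking",
    messages=[
        {"role": "User", "content": "How many planets are there in our solar system?"},
    ],
    max_tokens=256,
)
print(completion.choices[0].message.content)
```

Passing a configured `SERVICE_NAME` as `model` selects that model directly; unknown names fall through to the error branch above.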
config.py ADDED
@@ -0,0 +1,82 @@
+from pydantic import BaseModel, Field
+from typing import List, Optional
+from typing import List, Optional, Union, Any
+
+import sys
+
+
+from pydantic_settings import BaseSettings
+
+
+class CliConfig(BaseSettings, cli_parse_args=True, cli_use_class_docs_for_groups=True):
+    CONFIG_FILE: str = Field("./config.local.yaml", description="Config file path")
+
+
+CLI_CONFIG = CliConfig()
+
+
+class SamplerConfig(BaseModel):
+    """Default sampler configuration for each model."""
+
+    max_tokens: int = Field(512, description="Maximum number of tokens to generate.")
+    temperature: float = Field(1.0, description="Sampling temperature.")
+    top_p: float = Field(0.3, description="Top-p sampling threshold.")
+    presence_penalty: float = Field(0.5, description="Presence penalty.")
+    count_penalty: float = Field(0.5, description="Count penalty.")
+    penalty_decay: float = Field(0.5, description="Penalty decay factor.")
+    stop: List[str] = Field(0.996, description="List of stop sequences.")
+
+
+class ModelConfig(BaseModel):
+    """Configuration for each individual model."""
+
+    SERVICE_NAME: str = Field(..., description="Service name of the model.")
+
+    MODEL_FILE_PATH: Optional[str] = Field(None, description="Model file path.")
+
+    DOWNLOAD_MODEL_FILE_NAME: Optional[str] = Field(
+        None, description="Model name, should end with .pth"
+    )
+    DOWNLOAD_MODEL_REPO_ID: Optional[str] = Field(
+        None, description="Model repository ID on Hugging Face Hub."
+    )
+    DOWNLOAD_MODEL_DIR: Optional[str] = Field(
+        None, description="Directory to download the model to."
+    )
+
+    REASONING: bool = Field(
+        False, description="Whether reasoning is enabled for this model."
+    )
+
+    DEFAULT: bool = Field(False, description="Whether this model is the default model.")
+    DEFAULT_SAMPLER: SamplerConfig = Field(
+        SamplerConfig(), description="Default sampler configuration for this model."
+    )
+    VOCAB: str = Field("rwkv_vocab_v20230424", description="Vocab Name")
+
+
+class RootConfig(BaseModel):
+    """Root configuration for the RWKV service."""
+
+    HOST: Optional[str] = Field(
+        "127.0.0.1", description="Host IP address to bind to."
+    )  # HOST and PORT are optional,
+    PORT: Optional[int] = Field(
+        8000, description="Port number to listen on."
+    )  # since they are commented out in the example YAML
+    STRATEGY: str = Field(
+        "cpu", description="Strategy for model execution (e.g., 'cuda fp16')."
+    )
+    RWKV_CUDA_ON: bool = Field(False, description="Whether to enable RWKV CUDA kernel.")
+    CHUNK_LEN: int = Field(256, description="Chunk length for processing.")
+    MODELS: List[ModelConfig] = Field(..., description="List of model configurations.")
+
+
+import yaml
+
+try:
+    with open(CLI_CONFIG.CONFIG_FILE, "r", encoding="utf-8") as f:
+        CONFIG = RootConfig.model_validate(yaml.safe_load(f.read()))
+except Exception as e:
+    print(f"Pydantic Model Validation Failed: {e}")
+    sys.exit(0)
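`config.py` reads the config path from `CliConfig` (CLI/env, defaulting to `./config.local.yaml`) and validates the YAML at import time, so `from config import CONFIG` is all `app.py` needs; if the file is missing, the module exits. A minimal sketch of reusing these models outside the app, assuming a `config.local.yaml` like the one written in the Dockerfile sits in the working directory (the `example-model` entry below is illustrative only, not part of this commit):

```python
# Sketch: reuse the pydantic models from config.py directly.
# Note: importing config parses CLI args and loads CONFIG_FILE
# (default ./config.local.yaml) at import time, exiting if it is missing.
from config import CONFIG, RootConfig

print(CONFIG.STRATEGY, CONFIG.CHUNK_LEN)
for m in CONFIG.MODELS:
    print(m.SERVICE_NAME, "default:", m.DEFAULT, "reasoning:", m.REASONING)

# RootConfig can also validate a dict built elsewhere (illustrative values only):
extra = RootConfig.model_validate(
    {"STRATEGY": "cpu", "MODELS": [{"SERVICE_NAME": "example-model", "DEFAULT": True}]}
)
print(extra.MODELS[0].DEFAULT_SAMPLER.max_tokens)  # sampler defaults fill in: 512
```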
openai_test.py DELETED
@@ -1,78 +0,0 @@
-"""
-uv pip install openai
-"""
-
-import os
-
-import logging
-
-# logging.basicConfig(
-#     level=logging.DEBUG,
-# )
-
-os.environ["NO_PROXY"] = "127.0.0.1"
-
-from openai import OpenAI
-
-client = OpenAI(base_url="http://127.0.0.1:8000/api/v1", api_key="sk-test")
-
-
-def completionStreamTest():
-    print("[*] Stream completion: ")
-
-    completion = client.chat.completions.create(
-        model="rwkv-latest",
-        messages=[
-            {
-                "role": "User",
-                "content": "Please tell a short story about a grey cat and a little girl.",
-            },
-        ],
-        stream=True,
-        max_tokens=2048,
-    )
-
-    isReasoning = False
-
-    for chunk in completion:
-        if chunk.choices[0].delta.reasoning_content and not isReasoning:
-            print("<- Reasoning ->")
-            isReasoning = True
-        elif chunk.choices[0].delta.content and isReasoning:
-            isReasoning = False
-            print("<- Stop Reasoning ->")
-
-        if chunk.choices[0].delta.reasoning_content:
-            print(chunk.choices[0].delta.reasoning_content, end="", flush=True)
-        if chunk.choices[0].delta.content:
-            print(chunk.choices[0].delta.content, end="", flush=True)
-
-    print("")
-
-
-def completionTest():
-    completion = client.chat.completions.create(
-        model="rwkv-latest:thinking",
-        messages=[
-            {
-                "role": "User",
-                "content": "How many planets are there in our solar system?",
-            },
-        ],
-        max_tokens=2048,
-    )
-
-    print("[*] Completion: ", completion)
-
-
-if __name__ == "__main__":
-    try:
-        # completionTest()
-
-        testRounds = input("Test rounds (Default: 10) :")
-
-        for i in range(int(testRounds) if testRounds != "" else 10):
-            print("\n", "=" * 10, i + 1, "/", testRounds, "=" * 10)
-            completionStreamTest()
-    except KeyboardInterrupt:
-        pass