Morgan Funtowicz committed on
Commit
57ba236
·
1 Parent(s): 36406e7

feat(quant): allow using native precision

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. entrypoint.sh +26 -11
Dockerfile CHANGED
@@ -3,7 +3,7 @@ FROM lmsysorg/sglang:latest
3
  ENV MODEL_ID="/repository"
4
  ENV KV_CACHE_DTYPE="auto"
5
  ENV TP_SIZE="1"
6
- ENV QUANT_METHOD="w8a8_int8"
7
  EXPOSE 80
8
 
9
  COPY entrypoint.sh /usr/local/endpoint/
 
3
  ENV MODEL_ID="/repository"
4
  ENV KV_CACHE_DTYPE="auto"
5
  ENV TP_SIZE="1"
6
+ ENV QUANT_METHOD=""
7
  EXPOSE 80
8
 
9
  COPY entrypoint.sh /usr/local/endpoint/
entrypoint.sh CHANGED
@@ -1,13 +1,28 @@
1
  #!/bin/bash
2
 
3
- python3 -m sglang.launch_server \
4
- --model-path $MODEL_ID \
5
- --kv-cache-dtype $KV_CACHE_DTYPE \
6
- --tensor-parallel-size $TP_SIZE \
7
- --expert-parallel-size $TP_SIZE \
8
- --quantization $QUANT_METHOD \
9
- --enable-torch-compile \
10
- --enable-ep-moe \
11
- --tool-call-parser qwen25 \
12
- --host 0.0.0.0 \
13
- --port 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#
# Container entrypoint: starts the SGLang server via
# `python3 -m sglang.launch_server`, listening on 0.0.0.0:80.
#
# Environment variables read:
#   MODEL_ID        passed to --model-path
#   KV_CACHE_DTYPE  passed to --kv-cache-dtype
#   TP_SIZE         passed to both --tensor-parallel-size and
#                   --expert-parallel-size
#   QUANT_METHOD    passed to --quantization; when empty or unset the
#                   --quantization flag is omitted altogether.

# Build the argument list as an array so each value stays a single word
# even if it contains spaces or glob characters.
server_args=(
  --model-path "$MODEL_ID"
  --kv-cache-dtype "$KV_CACHE_DTYPE"
  --tensor-parallel-size "$TP_SIZE"
  --expert-parallel-size "$TP_SIZE"
)

# NOTE: the `;` before `then` is required; without it bash treats `then`
# as an argument to the test command and the script fails to parse.
if [[ -z "${QUANT_METHOD:-}" ]]; then
  echo "Using native precision"
else
  echo "Using ${QUANT_METHOD} quantization schema"
  server_args+=(--quantization "$QUANT_METHOD")
fi

server_args+=(
  --enable-torch-compile
  --enable-ep-moe
  --tool-call-parser qwen25
  --host 0.0.0.0
  --port 80
)

# exec replaces this shell with the server process, so signals sent to the
# entrypoint reach the server directly and its exit status is the script's.
exec python3 -m sglang.launch_server "${server_args[@]}"