ASUS Ascent GX10 (DGX Spark) 上の vLLM で Qwen3.6 27B の NVFP4 量子化モデルを起動するメモ
git clone https://github.com/eugr/spark-vllm-docker.git
cd spark-vllm-docker
./build-and-copy.sh
使用モデル: sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP
起動スクリプト vllm-qwen3.6-27b.sh の内容:
vllm serve \
sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP \
--language-model-only \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--reasoning-parser qwen3 \
--gpu-memory-utilization 0.6 \
--quantization modelopt \
--kv-cache-dtype fp8 \
--max-num-seqs 4 \
--host 0.0.0.0 \
--port 8000 \
--load-format fastsafetensors \
--attention-backend flashinfer \
--default-chat-template-kwargs '{"enable_thinking":false}' \
--enable-prefix-caching \
--speculative-config '{"method":"qwen3_5_mtp","num_speculative_tokens":2}'
./launch-cluster.sh --solo -d --apply-mod mods/drop-caches --launch-script vllm-qwen3.6-27b.sh
docker logs -f vllm_node
$ curl -s http://localhost:8000/v1/chat/completions --json '{
"messages": [
{"role": "user", "content": "Who are you?"}
]
}' | jq .
{
"id": "chatcmpl-8b03cf13feeb8f24",
"object": "chat.completion",
"created": 1777792776,
"model": "sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "I am Qwen, a large language model independently developed by Alibaba Group's Tongyi Lab. \n\nI'm here to assist you with a wide variety of tasks, such as answering questions, writing, logical reasoning, coding, and creative tasks. How can I help you today?",
"refusal": null,
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [],
"reasoning": null
},
"logprobs": null,
"finish_reason": "stop",
"stop_reason": null,
"token_ids": null
}
],
"service_tier": null,
"system_fingerprint": "vllm-0.20.1rc1.dev131+g7075df79b.d20260501-dc4bfec1",
"usage": {
"prompt_tokens": 16,
"total_tokens": 73,
"completion_tokens": 57,
"prompt_tokens_details": null
},
"prompt_logprobs": null,
"prompt_token_ids": null,
"kv_transfer_params": null
}
docker exec -ti vllm_node bash
vllm bench serve \
--model sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP \
--host localhost --port 8000 \
--random-input-len 1024 --random-output-len 256 \
--num-prompts 32
============ Serving Benchmark Result ============
Successful requests: 32
Failed requests: 0
Benchmark duration (s): 147.82
Total input tokens: 32768
Total generated tokens: 8192
Request throughput (req/s): 0.22
Output token throughput (tok/s): 55.42
Peak output token throughput (tok/s): 40.00
Peak concurrent requests: 32.00
Total token throughput (tok/s): 277.09
---------------Time to First Token----------------
Mean TTFT (ms): 67077.26
Median TTFT (ms): 65509.57
P99 TTFT (ms): 132059.51
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 62.85
Median TPOT (ms): 60.54
P99 TPOT (ms): 86.18
---------------Inter-token Latency----------------
Mean ITL (ms): 107.86
Median ITL (ms): 102.19
P99 ITL (ms): 361.58
---------------Speculative Decoding---------------
Acceptance rate (%): 71.86
Acceptance length: 1.72
Drafts: 4755
Draft tokens: 4755
Accepted tokens: 3417
Per-position acceptance (%):
Position 0: 71.86
==================================================
./launch-cluster.sh stop --solo