ASUS Ascent GX10 (DGX Spark) 上の vLLM で Qwen3.6 27B の NVFP4 量子化モデルを起動するメモ

git clone https://github.com/eugr/spark-vllm-docker.git
cd spark-vllm-docker
./build-and-copy.sh

使用するモデル: sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP

以下の内容で起動スクリプト vllm-qwen3.6-27b.sh を作成する:

vllm serve \
  sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP \
  --language-model-only \
  --enable-auto-tool-choice \
  --tool-call-parser qwen3_coder \
  --reasoning-parser qwen3 \
  --gpu-memory-utilization 0.6 \
  --quantization modelopt \
  --kv-cache-dtype fp8 \
  --max-num-seqs 4 \
  --host 0.0.0.0 \
  --port 8000 \
  --load-format fastsafetensors \
  --attention-backend flashinfer \
  --default-chat-template-kwargs '{"enable_thinking":false}' \
  --enable-prefix-caching \
  --speculative-config '{"method":"qwen3_5_mtp","num_speculative_tokens":2}'

./launch-cluster.sh --solo -d --apply-mod mods/drop-caches --launch-script vllm-qwen3.6-27b.sh

docker logs -f vllm_node

$ curl -s http://localhost:8000/v1/chat/completions --json '{
    "messages": [
      {"role": "user", "content": "Who are you?"}
    ]
  }' | jq .

{
  "id": "chatcmpl-8b03cf13feeb8f24",
  "object": "chat.completion",
  "created": 1777792776,
  "model": "sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "I am Qwen, a large language model independently developed by Alibaba Group's Tongyi Lab. \n\nI'm here to assist you with a wide variety of tasks, such as answering questions, writing, logical reasoning, coding, and creative tasks. How can I help you today?",
        "refusal": null,
        "annotations": null,
        "audio": null,
        "function_call": null,
        "tool_calls": [],
        "reasoning": null
      },
      "logprobs": null,
      "finish_reason": "stop",
      "stop_reason": null,
      "token_ids": null
    }
  ],
  "service_tier": null,
  "system_fingerprint": "vllm-0.20.1rc1.dev131+g7075df79b.d20260501-dc4bfec1",
  "usage": {
    "prompt_tokens": 16,
    "total_tokens": 73,
    "completion_tokens": 57,
    "prompt_tokens_details": null
  },
  "prompt_logprobs": null,
  "prompt_token_ids": null,
  "kv_transfer_params": null
}

docker exec -ti vllm_node bash

vllm bench serve \
  --model sakamakismile/Qwen3.6-27B-Text-NVFP4-MTP \
  --host localhost --port 8000 \
  --random-input-len 1024 --random-output-len 256 \
  --num-prompts 32

============ Serving Benchmark Result ============
Successful requests:                     32        
Failed requests:                         0         
Benchmark duration (s):                  147.82    
Total input tokens:                      32768     
Total generated tokens:                  8192      
Request throughput (req/s):              0.22      
Output token throughput (tok/s):         55.42     
Peak output token throughput (tok/s):    40.00     
Peak concurrent requests:                32.00     
Total token throughput (tok/s):          277.09    
---------------Time to First Token----------------
Mean TTFT (ms):                          67077.26  
Median TTFT (ms):                        65509.57  
P99 TTFT (ms):                           132059.51 
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          62.85     
Median TPOT (ms):                        60.54     
P99 TPOT (ms):                           86.18     
---------------Inter-token Latency----------------
Mean ITL (ms):                           107.86    
Median ITL (ms):                         102.19    
P99 ITL (ms):                            361.58    
---------------Speculative Decoding---------------
Acceptance rate (%):                     71.86     
Acceptance length:                       1.72      
Drafts:                                  4755      
Draft tokens:                            4755      
Accepted tokens:                         3417      
Per-position acceptance (%):
  Position 0:                            71.86     
==================================================

./launch-cluster.sh stop --solo