SGLang: TypeError: gptq_marlin_gemm() got an unexpected keyword argument 'b_bias'
#7
by
boydcheung - opened
When I use SGLang to load this model on 4×A100 GPUs, the scheduler crashes during CUDA graph capture with the following error:
Capturing batches (bs=512 avail_mem=12.16 GB): 0%| | 0/52 [00:01<?, ?it/s]
[2025-12-18 15:01:40 TP0] Scheduler hit an exception: Traceback (most recent call last):
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/managers/scheduler.py", line 2680, in run_scheduler_process
scheduler = Scheduler(
^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/managers/scheduler.py", line 320, in __init__
self.tp_worker = TpModelWorker(
^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/managers/tp_worker.py", line 248, in __init__
self._model_runner = ModelRunner(
^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/model_executor/model_runner.py", line 359, in __init__
self.initialize(min_per_gpu_memory)
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/model_executor/model_runner.py", line 511, in initialize
self.init_device_graphs()
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/model_executor/model_runner.py", line 2448, in init_device_graphs
self.graph_runner = graph_runners[self.device](self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 352, in __init__
self.capture()
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 507, in capture
_capture_one_stream()
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 494, in _capture_one_stream
) = self.capture_one_batch_size(bs, forward, stream_idx)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 698, in capture_one_batch_size
run_once()
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 685, in run_once
logits_output_or_pp_proxy_tensors = forward(
^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/models/qwen3_vl.py", line 735, in forward
hidden_states = general_mm_embed_routine(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/managers/mm_utils.py", line 780, in general_mm_embed_routine
hidden_states = language_model(
^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/models/qwen3_vl_moe.py", line 83, in forward
hidden_states, residual = layer(
^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/models/qwen3_moe.py", line 786, in forward
hidden_states = self.self_attn(
^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/models/qwen3_moe.py", line 674, in forward
s = self.forward_prepare(
^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/models/qwen3_moe.py", line 634, in forward_prepare
return self.forward_prepare_native(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/models/qwen3_moe.py", line 569, in forward_prepare_native
qkv, _ = self.qkv_proj(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/layers/linear.py", line 438, in forward
output_parallel = self.quant_method.apply(self, input_, bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/layers/quantization/fp8.py", line 484, in apply
return apply_fp8_marlin_linear(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/vlm/common/env/vllm/lib/python3.11/site-packages/sglang/srt/layers/quantization/marlin_utils_fp8.py", line 61, in apply_fp8_marlin_linear
output = gptq_marlin_gemm(
^^^^^^^^^^^^^^^^^
TypeError: gptq_marlin_gemm() got an unexpected keyword argument 'b_bias'