SGLang Native APIs#

Apart from the OpenAI compatible APIs, the SGLang Runtime also provides its native server APIs. We introduce these following APIs:

  • /generate (text generation model)

  • /get_model_info

  • /get_server_info

  • /health

  • /health_generate

  • /flush_cache

  • /update_weights

  • /encode(embedding model)

  • /v1/rerank(cross encoder rerank model)

  • /classify(reward model)

  • /start_expert_distribution_record

  • /stop_expert_distribution_record

  • /dump_expert_distribution_record

We mainly use requests to test these APIs in the following examples. You can also use curl.

Launch A Server#

[1]:
import requests
from sglang.test.test_utils import is_in_ci

if is_in_ci():
    from patch import launch_server_cmd
else:
    from sglang.utils import launch_server_cmd

from sglang.utils import wait_for_server, print_highlight, terminate_process


server_process, port = launch_server_cmd(
    "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0"
)
## To run qwen2.5-0.5b-instruct model on the Ascend-Npu, you can execute the following command:
# server_process, port = launch_server_cmd(
#     "python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --device npu --tp 2 --attention-backend torch_native"
# )

wait_for_server(f"http://localhost:{port}")
[2025-06-30 02:28:26] server_args=ServerArgs(model_path='qwen/qwen2.5-0.5b-instruct', tokenizer_path='qwen/qwen2.5-0.5b-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='qwen/qwen2.5-0.5b-instruct', chat_template=None, completion_template=None, is_embedding=False, enable_multimodal=None, revision=None, hybrid_kvcache_ratio=None, impl='auto', host='0.0.0.0', port=32602, mem_fraction_static=0.874, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=232013094, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, bucket_time_to_first_token=None, bucket_e2e_request_latency=None, bucket_inter_token_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, ep_size=1, enable_ep_moe=False, enable_deepep_moe=False, enable_flashinfer_moe=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm='static', init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, cuda_graph_max_bs=None, cuda_graph_bs=None, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, disable_overlap_schedule=False, disable_overlap_cg_plan=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_torch_compile=False, torch_compile_max_bs=32, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, enable_return_hidden_states=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, debug_tensor_dump_prefill_only=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, num_reserved_decode_tokens=512, pdlb_url=None, custom_weight_loader=[], weight_loader_disable_mmap=False)
[2025-06-30 02:28:40] Attention backend not set. Use fa3 backend by default.
[2025-06-30 02:28:40] Init torch distributed begin.
[2025-06-30 02:28:42] Init torch distributed ends. mem usage=0.00 GB
[2025-06-30 02:28:45] Load weight begin. avail mem=77.30 GB
[2025-06-30 02:28:45] Using model weights format ['*.safetensors']
[2025-06-30 02:28:45] No model.safetensors.index.json found in remote.
Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.52it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.52it/s]

[2025-06-30 02:28:46] Load weight end. type=Qwen2ForCausalLM, dtype=torch.bfloat16, avail mem=46.97 GB, mem usage=30.33 GB.
[2025-06-30 02:28:46] KV Cache is allocated. #tokens: 20480, K size: 0.12 GB, V size: 0.12 GB
[2025-06-30 02:28:46] Memory pool end. avail mem=46.56 GB
[2025-06-30 02:28:47] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=32768, available_gpu_mem=46.47 GB
[2025-06-30 02:28:47] INFO:     Started server process [1964001]
[2025-06-30 02:28:47] INFO:     Waiting for application startup.
[2025-06-30 02:28:47] INFO:     Application startup complete.
[2025-06-30 02:28:47] INFO:     Uvicorn running on http://0.0.0.0:32602 (Press CTRL+C to quit)
[2025-06-30 02:28:48] INFO:     127.0.0.1:41630 - "GET /v1/models HTTP/1.1" 200 OK
[2025-06-30 02:28:48] INFO:     127.0.0.1:41646 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-06-30 02:28:48] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:28:50] INFO:     127.0.0.1:41654 - "POST /generate HTTP/1.1" 200 OK
[2025-06-30 02:28:50] The server is fired up and ready to roll!


NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.

Generate (text generation model)#

Generate completions. This is similar to the /v1/completions in OpenAI API. Detailed parameters can be found in the sampling parameters.

[2]:
url = f"http://localhost:{port}/generate"
data = {"text": "What is the capital of France?"}

response = requests.post(url, json=data)
print_highlight(response.json())
[2025-06-30 02:28:53] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:28:53] Decode batch. #running-req: 1, #token: 40, token usage: 0.00, cuda graph: False, gen throughput (token/s): 6.36, #queue-req: 0
[2025-06-30 02:28:53] Decode batch. #running-req: 1, #token: 80, token usage: 0.00, cuda graph: False, gen throughput (token/s): 130.97, #queue-req: 0
[2025-06-30 02:28:53] Decode batch. #running-req: 1, #token: 120, token usage: 0.01, cuda graph: False, gen throughput (token/s): 130.11, #queue-req: 0
[2025-06-30 02:28:54] INFO:     127.0.0.1:41662 - "POST /generate HTTP/1.1" 200 OK
{'text': ' The capital of France is Paris. Paris is the largest and most important city in France, and it serves as the country\'s capital, as well as its largest metropolitan area. Paris is located in the west of the country, on the River Seine and the Îles de la Cité, which were used by Napoleon asвшийandation to act as an island presence. Paris is known as the "Queen of France" and as the "City of Love" for its romantic atmosphere. It is also recognized for its Eiffel Tower, the Louvre Museum, the Basilica of St. Louis and the Champs-Élys', 'meta_info': {'id': '1721bfa81c3d4925a7728dbb1b604ce7', 'finish_reason': {'type': 'length', 'length': 128}, 'prompt_tokens': 7, 'completion_tokens': 128, 'cached_tokens': 0, 'e2e_latency': 1.027165174484253}}

Get Model Info#

Get the information of the model.

  • model_path: The path/name of the model.

  • is_generation: Whether the model is used as generation model or embedding model.

  • tokenizer_path: The path/name of the tokenizer.

[3]:
url = f"http://localhost:{port}/get_model_info"

response = requests.get(url)
response_json = response.json()
print_highlight(response_json)
assert response_json["model_path"] == "qwen/qwen2.5-0.5b-instruct"
assert response_json["is_generation"] is True
assert response_json["tokenizer_path"] == "qwen/qwen2.5-0.5b-instruct"
assert response_json.keys() == {"model_path", "is_generation", "tokenizer_path"}
[2025-06-30 02:28:54] INFO:     127.0.0.1:59206 - "GET /get_model_info HTTP/1.1" 200 OK
{'model_path': 'qwen/qwen2.5-0.5b-instruct', 'tokenizer_path': 'qwen/qwen2.5-0.5b-instruct', 'is_generation': True}

Get Server Info#

Gets the server information including CLI arguments, token limits, and memory pool sizes.

  • Note: get_server_info merges the following deprecated endpoints:

    • get_server_args

    • get_memory_pool_size

    • get_max_total_num_tokens

[4]:
# get_server_info

url = f"http://localhost:{port}/get_server_info"

response = requests.get(url)
print_highlight(response.text)
[2025-06-30 02:28:54] INFO:     127.0.0.1:59214 - "GET /get_server_info HTTP/1.1" 200 OK
{"model_path":"qwen/qwen2.5-0.5b-instruct","tokenizer_path":"qwen/qwen2.5-0.5b-instruct","tokenizer_mode":"auto","skip_tokenizer_init":false,"load_format":"auto","model_loader_extra_config":"{}","trust_remote_code":false,"dtype":"auto","kv_cache_dtype":"auto","quantization":null,"quantization_param_path":null,"context_length":null,"device":"cuda","served_model_name":"qwen/qwen2.5-0.5b-instruct","chat_template":null,"completion_template":null,"is_embedding":false,"enable_multimodal":null,"revision":null,"hybrid_kvcache_ratio":null,"impl":"auto","host":"0.0.0.0","port":32602,"mem_fraction_static":0.874,"max_running_requests":200,"max_total_tokens":20480,"chunked_prefill_size":8192,"max_prefill_tokens":16384,"schedule_policy":"fcfs","schedule_conservativeness":1.0,"cpu_offload_gb":0,"page_size":1,"tp_size":1,"pp_size":1,"max_micro_batch_size":null,"stream_interval":1,"stream_output":false,"random_seed":232013094,"constrained_json_whitespace_pattern":null,"watchdog_timeout":300,"dist_timeout":null,"download_dir":null,"base_gpu_id":0,"gpu_id_step":1,"sleep_on_idle":false,"log_level":"info","log_level_http":null,"log_requests":false,"log_requests_level":0,"show_time_cost":false,"enable_metrics":false,"bucket_time_to_first_token":null,"bucket_e2e_request_latency":null,"bucket_inter_token_latency":null,"collect_tokens_histogram":false,"decode_log_interval":40,"enable_request_time_stats_logging":false,"kv_events_config":null,"api_key":null,"file_storage_path":"sglang_storage","enable_cache_report":false,"reasoning_parser":null,"tool_call_parser":null,"dp_size":1,"load_balance_method":"round_robin","dist_init_addr":null,"nnodes":1,"node_rank":0,"json_model_override_args":"{}","preferred_sampling_params":null,"lora_paths":null,"max_loras_per_batch":8,"lora_backend":"triton","attention_backend":null,"sampling_backend":"flashinfer","grammar_backend":"xgrammar","mm_attention_backend":null,"speculative_algorithm":null,"speculative_draft_model_path":null,"speculative_num_steps":null,"speculative_eagle_topk":null,"speculative_num_draft_tokens":null,"speculative_accept_threshold_single":1.0,"speculative_accept_threshold_acc":1.0,"speculative_token_map":null,"ep_size":1,"enable_ep_moe":false,"enable_deepep_moe":false,"enable_flashinfer_moe":false,"deepep_mode":"auto","ep_num_redundant_experts":0,"ep_dispatch_algorithm":"static","init_expert_location":"trivial","enable_eplb":false,"eplb_algorithm":"auto","eplb_rebalance_num_iterations":1000,"eplb_rebalance_layers_per_chunk":null,"expert_distribution_recorder_mode":null,"expert_distribution_recorder_buffer_size":1000,"enable_expert_distribution_metrics":false,"deepep_config":null,"moe_dense_tp_size":null,"enable_double_sparsity":false,"ds_channel_config_path":null,"ds_heavy_channel_num":32,"ds_heavy_token_num":256,"ds_heavy_channel_type":"qk","ds_sparse_decode_threshold":4096,"disable_radix_cache":false,"cuda_graph_max_bs":null,"cuda_graph_bs":null,"disable_cuda_graph":true,"disable_cuda_graph_padding":false,"enable_profile_cuda_graph":false,"enable_nccl_nvls":false,"enable_tokenizer_batch_encode":false,"disable_outlines_disk_cache":false,"disable_custom_all_reduce":false,"enable_mscclpp":false,"disable_overlap_schedule":false,"disable_overlap_cg_plan":false,"enable_mixed_chunk":false,"enable_dp_attention":false,"enable_dp_lm_head":false,"enable_two_batch_overlap":false,"enable_torch_compile":false,"torch_compile_max_bs":32,"torchao_config":"","enable_nan_detection":false,"enable_p2p_check":false,"triton_attention_reduce_in_fp32":false,"triton_attention_num_kv_splits":8,"num_continuous_decode_steps":1,"delete_ckpt_after_loading":false,"enable_memory_saver":false,"allow_auto_truncate":false,"enable_custom_logit_processor":false,"enable_hierarchical_cache":false,"hicache_ratio":2.0,"hicache_size":0,"hicache_write_policy":"write_through_selective","flashinfer_mla_disable_ragged":false,"disable_shared_experts_fusion":false,"disable_chunked_prefix_cache":false,"disable_fast_image_processor":false,"enable_return_hidden_states":false,"warmups":null,"debug_tensor_dump_output_folder":null,"debug_tensor_dump_input_file":null,"debug_tensor_dump_inject":false,"debug_tensor_dump_prefill_only":false,"disaggregation_mode":"null","disaggregation_transfer_backend":"mooncake","disaggregation_bootstrap_port":8998,"disaggregation_decode_tp":null,"disaggregation_decode_dp":null,"disaggregation_prefill_pp":1,"disaggregation_ib_device":null,"num_reserved_decode_tokens":512,"pdlb_url":null,"custom_weight_loader":[],"weight_loader_disable_mmap":false,"status":"ready","max_total_num_tokens":20480,"max_req_input_len":20474,"internal_states":[{"attention_backend":"fa3","mm_attention_backend":null,"debug_tensor_dump_inject":false,"debug_tensor_dump_output_folder":null,"chunked_prefill_size":8192,"device":"cuda","disable_chunked_prefix_cache":true,"disable_radix_cache":false,"enable_dp_attention":false,"enable_two_batch_overlap":false,"enable_dp_lm_head":false,"enable_deepep_moe":false,"deepep_mode":"auto","enable_ep_moe":false,"enable_flashinfer_moe":false,"moe_dense_tp_size":null,"ep_dispatch_algorithm":"static","deepep_config":null,"ep_num_redundant_experts":0,"enable_nan_detection":false,"flashinfer_mla_disable_ragged":false,"max_micro_batch_size":200,"disable_shared_experts_fusion":false,"sampling_backend":"flashinfer","speculative_accept_threshold_acc":1.0,"speculative_accept_threshold_single":1.0,"torchao_config":"","triton_attention_reduce_in_fp32":false,"num_reserved_decode_tokens":512,"weight_loader_disable_mmap":false,"use_mla_backend":false,"speculative_algorithm":1,"last_gen_throughput":130.10505319363642,"load":0}],"version":"0.4.8.post1"}

Health Check#

  • /health: Check the health of the server.

  • /health_generate: Check the health of the server by generating one token.

[5]:
url = f"http://localhost:{port}/health_generate"

response = requests.get(url)
print_highlight(response.text)
[2025-06-30 02:28:54] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:28:55] INFO:     127.0.0.1:59218 - "GET /health_generate HTTP/1.1" 200 OK
[6]:
url = f"http://localhost:{port}/health"

response = requests.get(url)
print_highlight(response.text)
[2025-06-30 02:28:55] INFO:     127.0.0.1:59220 - "GET /health HTTP/1.1" 200 OK

Flush Cache#

Flush the radix cache. It will be automatically triggered when the model weights are updated by the /update_weights API.

[7]:
# flush cache

url = f"http://localhost:{port}/flush_cache"

response = requests.post(url)
print_highlight(response.text)
[2025-06-30 02:28:55] Cache flushed successfully!
[2025-06-30 02:28:55] INFO:     127.0.0.1:59230 - "POST /flush_cache HTTP/1.1" 200 OK
Cache flushed.
Please check backend logs for more details. (When there are running or waiting requests, the operation will not be performed.)

Update Weights From Disk#

Update model weights from disk without restarting the server. Only applicable for models with the same architecture and parameter size.

SGLang support update_weights_from_disk API for continuous evaluation during training (save checkpoint to disk and update weights from disk).

[8]:
# successful update with same architecture and size

url = f"http://localhost:{port}/update_weights_from_disk"
data = {"model_path": "qwen/qwen2.5-0.5b-instruct"}

response = requests.post(url, json=data)
print_highlight(response.text)
assert response.json()["success"] is True
assert response.json()["message"] == "Succeeded to update model weights."
[2025-06-30 02:28:55] Start update_weights. Load format=auto
[2025-06-30 02:28:55] Update engine weights online from disk begin. avail mem=41.50 GB
[2025-06-30 02:28:55] Using model weights format ['*.safetensors']
[2025-06-30 02:28:55] No model.safetensors.index.json found in remote.
Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.95it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.94it/s]

[2025-06-30 02:28:55] Update weights end.
[2025-06-30 02:28:55] Cache flushed successfully!
[2025-06-30 02:28:55] INFO:     127.0.0.1:59234 - "POST /update_weights_from_disk HTTP/1.1" 200 OK
{"success":true,"message":"Succeeded to update model weights.","num_paused_requests":0}
[9]:
# failed update with different parameter size or wrong name

url = f"http://localhost:{port}/update_weights_from_disk"
data = {"model_path": "qwen/qwen2.5-0.5b-instruct-wrong"}

response = requests.post(url, json=data)
response_json = response.json()
print_highlight(response_json)
assert response_json["success"] is False
assert response_json["message"] == (
    "Failed to get weights iterator: "
    "qwen/qwen2.5-0.5b-instruct-wrong"
    " (repository not found)."
)
[2025-06-30 02:28:55] Start update_weights. Load format=auto
[2025-06-30 02:28:55] Update engine weights online from disk begin. avail mem=41.50 GB
[2025-06-30 02:28:55] Failed to get weights iterator: qwen/qwen2.5-0.5b-instruct-wrong (repository not found).
[2025-06-30 02:28:55] INFO:     127.0.0.1:59244 - "POST /update_weights_from_disk HTTP/1.1" 400 Bad Request
{'success': False, 'message': 'Failed to get weights iterator: qwen/qwen2.5-0.5b-instruct-wrong (repository not found).', 'num_paused_requests': 0}
[10]:
terminate_process(server_process)
[2025-06-30 02:28:55] Child process unexpectedly failed with exitcode=9. pid=1964647
[2025-06-30 02:28:55] Child process unexpectedly failed with exitcode=9. pid=1964447

Encode (embedding model)#

Encode text into embeddings. Note that this API is only available for embedding models and will raise an error for generation models. Therefore, we launch a new server to server an embedding model.

[11]:
embedding_process, port = launch_server_cmd(
    """
python3 -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \
    --host 0.0.0.0 --is-embedding
"""
)

wait_for_server(f"http://localhost:{port}")
[2025-06-30 02:29:02] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-1.5B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-1.5B-instruct', chat_template=None, completion_template=None, is_embedding=True, enable_multimodal=None, revision=None, hybrid_kvcache_ratio=None, impl='auto', host='0.0.0.0', port=38288, mem_fraction_static=0.874, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=936041604, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, bucket_time_to_first_token=None, bucket_e2e_request_latency=None, bucket_inter_token_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, ep_size=1, enable_ep_moe=False, enable_deepep_moe=False, enable_flashinfer_moe=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm='static', init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, cuda_graph_max_bs=None, cuda_graph_bs=None, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, disable_overlap_schedule=False, disable_overlap_cg_plan=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_torch_compile=False, torch_compile_max_bs=32, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, enable_return_hidden_states=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, debug_tensor_dump_prefill_only=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, num_reserved_decode_tokens=512, pdlb_url=None, custom_weight_loader=[], weight_loader_disable_mmap=False)
[2025-06-30 02:29:06] Downcasting torch.float32 to torch.float16.
[2025-06-30 02:29:13] Downcasting torch.float32 to torch.float16.
[2025-06-30 02:29:13] Overlap scheduler is disabled for embedding models.
[2025-06-30 02:29:13] Downcasting torch.float32 to torch.float16.
[2025-06-30 02:29:13] Attention backend not set. Use fa3 backend by default.
[2025-06-30 02:29:13] Init torch distributed begin.
[2025-06-30 02:29:14] Init torch distributed ends. mem usage=0.00 GB
[2025-06-30 02:29:15] Load weight begin. avail mem=62.07 GB
[2025-06-30 02:29:15] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.52s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.04s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:02<00:00,  1.11s/it]

[2025-06-30 02:29:18] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=58.05 GB, mem usage=4.02 GB.
[2025-06-30 02:29:18] KV Cache is allocated. #tokens: 20480, K size: 0.27 GB, V size: 0.27 GB
[2025-06-30 02:29:18] Memory pool end. avail mem=57.23 GB
[2025-06-30 02:29:18] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072, available_gpu_mem=55.99 GB
[2025-06-30 02:29:19] INFO:     Started server process [1967053]
[2025-06-30 02:29:19] INFO:     Waiting for application startup.
[2025-06-30 02:29:19] INFO:     Application startup complete.
[2025-06-30 02:29:19] INFO:     Uvicorn running on http://0.0.0.0:38288 (Press CTRL+C to quit)
[2025-06-30 02:29:19] INFO:     127.0.0.1:43920 - "GET /v1/models HTTP/1.1" 200 OK
[2025-06-30 02:29:20] INFO:     127.0.0.1:43932 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-06-30 02:29:20] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:29:20] INFO:     127.0.0.1:43940 - "POST /encode HTTP/1.1" 200 OK
[2025-06-30 02:29:20] The server is fired up and ready to roll!


NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
[12]:
# successful encode for embedding model

url = f"http://localhost:{port}/encode"
data = {"model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct", "text": "Once upon a time"}

response = requests.post(url, json=data)
response_json = response.json()
print_highlight(f"Text embedding (first 10): {response_json['embedding'][:10]}")
[2025-06-30 02:29:24] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:29:24] INFO:     127.0.0.1:51964 - "POST /encode HTTP/1.1" 200 OK
Text embedding (first 10): [-0.00023102760314941406, -0.04986572265625, -0.0032711029052734375, 0.011077880859375, -0.0140533447265625, 0.0159912109375, -0.01441192626953125, 0.0059051513671875, -0.0228424072265625, 0.0272979736328125]
[13]:
terminate_process(embedding_process)
[2025-06-30 02:29:24] Child process unexpectedly failed with exitcode=9. pid=1967397
[2025-06-30 02:29:24] Child process unexpectedly failed with exitcode=9. pid=1967330

v1/rerank (cross encoder rerank model)#

Rerank a list of documents given a query using a cross-encoder model. Note that this API is only available for cross encoder model like BAAI/bge-reranker-v2-m3 with attention-backend triton and torch_native.

[14]:
reranker_process, port = launch_server_cmd(
    """
python3 -m sglang.launch_server --model-path BAAI/bge-reranker-v2-m3 \
    --host 0.0.0.0 --disable-radix-cache --chunked-prefill-size -1 --attention-backend triton --is-embedding
"""
)

wait_for_server(f"http://localhost:{port}")
[2025-06-30 02:29:32] server_args=ServerArgs(model_path='BAAI/bge-reranker-v2-m3', tokenizer_path='BAAI/bge-reranker-v2-m3', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='BAAI/bge-reranker-v2-m3', chat_template=None, completion_template=None, is_embedding=True, enable_multimodal=None, revision=None, hybrid_kvcache_ratio=None, impl='auto', host='0.0.0.0', port=32813, mem_fraction_static=0.874, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=-1, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=403241683, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, bucket_time_to_first_token=None, bucket_e2e_request_latency=None, bucket_inter_token_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend='triton', sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, ep_size=1, enable_ep_moe=False, enable_deepep_moe=False, enable_flashinfer_moe=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm='static', init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=True, cuda_graph_max_bs=None, cuda_graph_bs=None, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, disable_overlap_schedule=False, disable_overlap_cg_plan=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_torch_compile=False, torch_compile_max_bs=32, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, enable_return_hidden_states=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, debug_tensor_dump_prefill_only=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, num_reserved_decode_tokens=512, pdlb_url=None, custom_weight_loader=[], weight_loader_disable_mmap=False)
[2025-06-30 02:29:35] Downcasting torch.float32 to torch.float16.
[2025-06-30 02:29:43] Downcasting torch.float32 to torch.float16.
[2025-06-30 02:29:43] Overlap scheduler is disabled for embedding models.
[2025-06-30 02:29:44] Downcasting torch.float32 to torch.float16.
[2025-06-30 02:29:44] Init torch distributed begin.
[2025-06-30 02:29:44] Init torch distributed ends. mem usage=0.00 GB
[2025-06-30 02:29:46] Load weight begin. avail mem=61.47 GB
[2025-06-30 02:29:46] Using model weights format ['*.safetensors']
[2025-06-30 02:29:46] No model.safetensors.index.json found in remote.
Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.62it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.62it/s]

[2025-06-30 02:29:47] Load weight end. type=XLMRobertaForSequenceClassification, dtype=torch.float16, avail mem=60.37 GB, mem usage=1.10 GB.
[2025-06-30 02:29:47] KV Cache is allocated. #tokens: 20480, K size: 0.94 GB, V size: 0.94 GB
[2025-06-30 02:29:47] Memory pool end. avail mem=58.33 GB
[2025-06-30 02:29:47] max_total_num_tokens=20480, chunked_prefill_size=-1, max_prefill_tokens=16384, max_running_requests=200, context_len=8194, available_gpu_mem=58.24 GB
[2025-06-30 02:29:48] INFO:     Started server process [1969832]
[2025-06-30 02:29:48] INFO:     Waiting for application startup.
[2025-06-30 02:29:48] INFO:     Application startup complete.
[2025-06-30 02:29:48] INFO:     Uvicorn running on http://0.0.0.0:32813 (Press CTRL+C to quit)
[2025-06-30 02:29:48] INFO:     127.0.0.1:47516 - "GET /v1/models HTTP/1.1" 200 OK
[2025-06-30 02:29:49] INFO:     127.0.0.1:47522 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-06-30 02:29:49] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:29:50] INFO:     127.0.0.1:47528 - "POST /encode HTTP/1.1" 200 OK
[2025-06-30 02:29:50] The server is fired up and ready to roll!


NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
[15]:
# compute rerank scores for query and documents

url = f"http://localhost:{port}/v1/rerank"
data = {
    "model": "BAAI/bge-reranker-v2-m3",
    "query": "what is panda?",
    "documents": [
        "hi",
        "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.",
    ],
}

response = requests.post(url, json=data)
response_json = response.json()
for item in response_json:
    print_highlight(f"Score: {item['score']:.2f} - Document: '{item['document']}'")
[2025-06-30 02:29:53] Prefill batch. #new-seq: 1, #new-token: 10, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:29:53] Prefill batch. #new-seq: 1, #new-token: 43, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:29:53] INFO:     127.0.0.1:46272 - "POST /v1/rerank HTTP/1.1" 200 OK
Score: 5.26 - Document: 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.'
Score: -8.19 - Document: 'hi'
[16]:
terminate_process(reranker_process)
[2025-06-30 02:29:53] Child process unexpectedly failed with exitcode=9. pid=1970322
[2025-06-30 02:29:53] Child process unexpectedly failed with exitcode=9. pid=1970245

Classify (reward model)#

SGLang Runtime also supports reward models. Here we use a reward model to classify the quality of pairwise generations.

[17]:
# Note that SGLang now treats embedding models and reward models as the same type of models.
# This will be updated in the future.

reward_process, port = launch_server_cmd(
    """
python3 -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding
"""
)

wait_for_server(f"http://localhost:{port}")
[2025-06-30 02:30:00] server_args=ServerArgs(model_path='Skywork/Skywork-Reward-Llama-3.1-8B-v0.2', tokenizer_path='Skywork/Skywork-Reward-Llama-3.1-8B-v0.2', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Skywork/Skywork-Reward-Llama-3.1-8B-v0.2', chat_template=None, completion_template=None, is_embedding=True, enable_multimodal=None, revision=None, hybrid_kvcache_ratio=None, impl='auto', host='0.0.0.0', port=32795, mem_fraction_static=0.874, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=853155260, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, bucket_time_to_first_token=None, bucket_e2e_request_latency=None, bucket_inter_token_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, ep_size=1, enable_ep_moe=False, enable_deepep_moe=False, enable_flashinfer_moe=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm='static', init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, cuda_graph_max_bs=None, cuda_graph_bs=None, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, disable_overlap_schedule=False, disable_overlap_cg_plan=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_torch_compile=False, torch_compile_max_bs=32, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, enable_return_hidden_states=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, debug_tensor_dump_prefill_only=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, num_reserved_decode_tokens=512, pdlb_url=None, custom_weight_loader=[], weight_loader_disable_mmap=False)
[2025-06-30 02:30:12] Overlap scheduler is disabled for embedding models.
[2025-06-30 02:30:12] Attention backend not set. Use flashinfer backend by default.
[2025-06-30 02:30:12] Init torch distributed begin.
[2025-06-30 02:30:12] Init torch distributed ends. mem usage=0.00 GB
[2025-06-30 02:30:13] Load weight begin. avail mem=78.50 GB
[2025-06-30 02:30:13] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:02,  1.26it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.15it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.17it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:02<00:00,  1.51it/s]

[2025-06-30 02:30:16] Load weight end. type=LlamaForSequenceClassification, dtype=torch.bfloat16, avail mem=64.46 GB, mem usage=14.04 GB.
[2025-06-30 02:30:16] KV Cache is allocated. #tokens: 20480, K size: 1.25 GB, V size: 1.25 GB
[2025-06-30 02:30:16] Memory pool end. avail mem=61.66 GB
[2025-06-30 02:30:17] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=131072, available_gpu_mem=61.09 GB
[2025-06-30 02:30:18] INFO:     Started server process [1972743]
[2025-06-30 02:30:18] INFO:     Waiting for application startup.
[2025-06-30 02:30:18] INFO:     Application startup complete.
[2025-06-30 02:30:18] INFO:     Uvicorn running on http://0.0.0.0:32795 (Press CTRL+C to quit)
[2025-06-30 02:30:19] INFO:     127.0.0.1:38580 - "GET /v1/models HTTP/1.1" 200 OK
[2025-06-30 02:30:19] INFO:     127.0.0.1:38596 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-06-30 02:30:19] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0


NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
[18]:
from transformers import AutoTokenizer

PROMPT = (
    "What is the range of the numeric output of a sigmoid node in a neural network?"
)

RESPONSE1 = "The output of a sigmoid node is bounded between -1 and 1."
RESPONSE2 = "The output of a sigmoid node is bounded between 0 and 1."

CONVS = [
    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE1}],
    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE2}],
]

tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-Reward-Llama-3.1-8B-v0.2")
prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)

url = f"http://localhost:{port}/classify"
data = {"model": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", "text": prompts}

responses = requests.post(url, json=data).json()
for response in responses:
    print_highlight(f"reward: {response['embedding'][0]}")
[2025-06-30 02:31:23] Prefill batch. #new-seq: 2, #new-token: 136, #cached-token: 2, #token: 1, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:31:23] INFO:     127.0.0.1:38600 - "POST /encode HTTP/1.1" 200 OK
[2025-06-30 02:31:23] The server is fired up and ready to roll!
[2025-06-30 02:31:37] INFO:     127.0.0.1:51498 - "POST /classify HTTP/1.1" 200 OK
reward: -24.125
reward: 1.171875
[19]:
terminate_process(reward_process)
[2025-06-30 02:31:37] Child process unexpectedly failed with exitcode=9. pid=1973312

Capture expert selection distribution in MoE models#

SGLang Runtime supports recording the number of times an expert is selected in a MoE model run for each expert in the model. This is useful when analyzing the throughput of the model and plan for optimization.

Note: We only print out the first 10 lines of the csv below for better readability. Please adjust accordingly if you want to analyze the results more deeply.

[20]:
expert_record_server_process, port = launch_server_cmd(
    "python3 -m sglang.launch_server --model-path Qwen/Qwen1.5-MoE-A2.7B --host 0.0.0.0 --expert-distribution-recorder-mode stat"
)

wait_for_server(f"http://localhost:{port}")
[2025-06-30 02:31:44] server_args=ServerArgs(model_path='Qwen/Qwen1.5-MoE-A2.7B', tokenizer_path='Qwen/Qwen1.5-MoE-A2.7B', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Qwen/Qwen1.5-MoE-A2.7B', chat_template=None, completion_template=None, is_embedding=False, enable_multimodal=None, revision=None, hybrid_kvcache_ratio=None, impl='auto', host='0.0.0.0', port=38371, mem_fraction_static=0.874, max_running_requests=200, max_total_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=149163156, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, log_level='info', log_level_http=None, log_requests=False, log_requests_level=0, show_time_cost=False, enable_metrics=False, bucket_time_to_first_token=None, bucket_e2e_request_latency=None, bucket_inter_token_latency=None, collect_tokens_histogram=False, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, api_key=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, lora_paths=None, max_loras_per_batch=8, lora_backend='triton', attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, speculative_algorithm=None, speculative_draft_model_path=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, ep_size=1, enable_ep_moe=False, enable_deepep_moe=False, enable_flashinfer_moe=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm='static', init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, expert_distribution_recorder_mode='stat', expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, disable_radix_cache=False, cuda_graph_max_bs=None, cuda_graph_bs=None, disable_cuda_graph=True, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_nccl_nvls=False, enable_tokenizer_batch_encode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, disable_overlap_schedule=False, disable_overlap_cg_plan=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_torch_compile=False, torch_compile_max_bs=32, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, allow_auto_truncate=False, enable_custom_logit_processor=False, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through_selective', flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, enable_return_hidden_states=False, warmups=None, debug_tensor_dump_output_folder=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, debug_tensor_dump_prefill_only=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, num_reserved_decode_tokens=512, pdlb_url=None, custom_weight_loader=[], weight_loader_disable_mmap=False)
[2025-06-30 02:31:55] Attention backend not set. Use flashinfer backend by default.
[2025-06-30 02:31:55] Init torch distributed begin.
[2025-06-30 02:31:57] Init torch distributed ends. mem usage=0.00 GB
[2025-06-30 02:31:58] Load weight begin. avail mem=78.48 GB
[2025-06-30 02:31:59] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  12% Completed | 1/8 [00:00<00:04,  1.50it/s]
Loading safetensors checkpoint shards:  25% Completed | 2/8 [00:01<00:04,  1.39it/s]
Loading safetensors checkpoint shards:  38% Completed | 3/8 [00:02<00:03,  1.37it/s]
Loading safetensors checkpoint shards:  50% Completed | 4/8 [00:02<00:02,  1.34it/s]
Loading safetensors checkpoint shards:  62% Completed | 5/8 [00:03<00:02,  1.32it/s]
Loading safetensors checkpoint shards:  75% Completed | 6/8 [00:04<00:01,  1.28it/s]
Loading safetensors checkpoint shards:  88% Completed | 7/8 [00:05<00:00,  1.26it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:05<00:00,  1.66it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:05<00:00,  1.44it/s]

[2025-06-30 02:32:05] Load weight end. type=Qwen2MoeForCausalLM, dtype=torch.bfloat16, avail mem=51.76 GB, mem usage=26.71 GB.
[2025-06-30 02:32:05] KV Cache is allocated. #tokens: 20480, K size: 1.88 GB, V size: 1.88 GB
[2025-06-30 02:32:05] Memory pool end. avail mem=47.85 GB
[2025-06-30 02:32:06] max_total_num_tokens=20480, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=200, context_len=8192, available_gpu_mem=47.28 GB
[2025-06-30 02:32:06] INFO:     Started server process [1979710]
[2025-06-30 02:32:06] INFO:     Waiting for application startup.
[2025-06-30 02:32:06] INFO:     Application startup complete.
[2025-06-30 02:32:06] INFO:     Uvicorn running on http://0.0.0.0:38371 (Press CTRL+C to quit)
[2025-06-30 02:32:07] INFO:     127.0.0.1:48278 - "GET /v1/models HTTP/1.1" 200 OK
[2025-06-30 02:32:07] INFO:     127.0.0.1:48282 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-06-30 02:32:07] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-30 02:32:08] Using default MoE kernel config. Performance might be sub-optimal! Config file not found at /public_sglang_ci/runner-l1a-gpu-23/_work/sglang/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=60,N=1408,device_name=NVIDIA_H100_80GB_HBM3.json, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton


NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
We are running those notebooks in a CI parallel environment, so the throughput is not representative of the actual performance.
[21]:
response = requests.post(f"http://localhost:{port}/start_expert_distribution_record")
print_highlight(response)

url = f"http://localhost:{port}/generate"
data = {"text": "What is the capital of France?"}

response = requests.post(url, json=data)
print_highlight(response.json())

response = requests.post(f"http://localhost:{port}/stop_expert_distribution_record")
print_highlight(response)

response = requests.post(f"http://localhost:{port}/dump_expert_distribution_record")
print_highlight(response)
[2025-06-30 02:32:22] Resetting ExpertDistributionRecorder...
[2025-06-30 02:32:22] INFO:     127.0.0.1:48304 - "POST /start_expert_distribution_record HTTP/1.1" 200 OK
[2025-06-30 02:32:22] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, #token: 8, token usage: 0.00, #running-req: 1, #queue-req: 0
[2025-06-30 02:32:23] INFO:     127.0.0.1:48288 - "POST /generate HTTP/1.1" 200 OK
[2025-06-30 02:32:23] The server is fired up and ready to roll!
[2025-06-30 02:32:23] INFO:     127.0.0.1:52162 - "POST /generate HTTP/1.1" 200 OK
{'text': ' The capital of France is Paris.', 'meta_info': {'id': '2000f4a71f8c4cb88e549c22e4330f74', 'finish_reason': {'type': 'stop', 'matched': 151643}, 'prompt_tokens': 7, 'completion_tokens': 8, 'cached_tokens': 0, 'e2e_latency': 0.8356466293334961}}
[2025-06-30 02:32:23] INFO:     127.0.0.1:57466 - "POST /stop_expert_distribution_record HTTP/1.1" 200 OK
[2025-06-30 02:32:24] Write expert distribution to /tmp/expert_distribution_recorder_1751250744.615675.pt
[2025-06-30 02:32:24] Resetting ExpertDistributionRecorder...
[2025-06-30 02:32:24] INFO:     127.0.0.1:57478 - "POST /dump_expert_distribution_record HTTP/1.1" 200 OK
[22]:
terminate_process(expert_record_server_process)
[2025-06-30 02:32:24] Child process unexpectedly failed with exitcode=9. pid=1980366
[2025-06-30 02:32:24] Child process unexpectedly failed with exitcode=9. pid=1980166