Skip to content

llama server spec decoding script

Benson Wong edited this page May 31, 2025 · 1 revision

This shell script runs a few prompts through the model of your choice to test speculative decoding performance.

# no speculative configuration
$ ./test-spec-decoding.sh http://llama-swap.host/v1/chat/completions gemma 2000
| prompt | n | tok/sec | draft_n | draft_accepted | ratio |
|--------|---|---------|---------|----------------|-------|
| create a one page ht... | 1600 | 38.71 | null | null | N/A |
| write a snake game i... | 1929 | 38.49 | null | null | N/A |
| write a story about ... | 859 | 38.88 | null | null | N/A |

# with speculative decoding
$ ./test-spec-decoding.sh http://llama-swap.host/v1/chat/completions gemma-draft 2000
| prompt | n | tok/sec | draft_n | draft_accepted | ratio |
|--------|---|---------|---------|----------------|-------|
| create a one page ht... | 1542 | 49.07 | 1422 | 956 | 0.67 |
| write a snake game i... | 1904 | 50.67 | 1709 | 1236 | 0.72 |
| write a story about ... | 982 | 33.97 | 1068 | 282 | 0.26 |
#!/bin/bash
#
# Sends a few fixed prompts to a chat completions endpoint and prints a
# markdown table built from the timings reported in each response, so runs
# with and without a draft model can be compared.
#
# Usage:   test-spec-decoding.sh <url> <model> <max_tokens>
# Example: test-spec-decoding.sh http://llama-swap.host:8080/v1/chat/completions gemma 2000
# Requires: curl, jq

# Check arguments; diagnostics go to stderr so stdout only ever carries the table
if [[ $# -ne 3 ]]; then
    echo "Usage: $0 <url> <model> <max_tokens>" >&2
    echo "Example: $0 http://llama-swap.host:8080/v1/chat/completions gemma 2000" >&2
    exit 1
fi

# Both tools are needed for every request; fail early with a clear message
# instead of producing a table full of ERROR rows.
for tool in curl jq; do
    if ! command -v "$tool" >/dev/null 2>&1; then
        echo "Error: required command '$tool' not found in PATH" >&2
        exit 1
    fi
done

URL="$1"
MODEL="$2"
MAX_TOKENS="$3"

# max_tokens is placed in the request body as a bare JSON number, so anything
# that is not an integer would produce an invalid payload.
if [[ ! "$MAX_TOKENS" =~ ^-?[0-9]+$ ]]; then
    echo "Error: <max_tokens> must be an integer, got '$MAX_TOKENS'" >&2
    exit 1
fi

# Prompts to benchmark; one request is made per entry
prompts=()
prompts+=("create a one page html snake game in javascript")
prompts+=("write a snake game in python")
prompts+=("write a story about a dog")

# Extra curl arguments shared by every request
HEADERS=("-H" "Content-Type: application/json")

# jq program for the response: strips an optional "data: " prefix, parses the
# line as JSON (lines that do not parse are skipped), then keeps the timing
# fields and adds draft_accept_ratio, which is null unless draft_n > 0.
JQ_FILTER='sub("^data: "; "") | fromjson? | .timings | {predicted_n, predicted_per_second, draft_n, draft_n_accepted, draft_accept_ratio: (if .draft_n > 0 then .draft_n_accepted / .draft_n else null end)}'

# Emit the markdown table header followed by its separator row
printf '%s\n' \
    "| prompt | n | tok/sec | draft_n | draft_accepted | ratio |" \
    "|--------|---|---------|---------|----------------|-------|"

# Loop through prompts: one request per prompt, one table row per response
for prompt in "${prompts[@]}"; do
    # Truncate prompt for display; used by both the result row and the error
    # row so the first column keeps the same width either way
    short_prompt="${prompt:0:20}"
    [[ ${#prompt} -gt 20 ]] && short_prompt="${short_prompt}..."

    # Build JSON payload with jq so quotes, backslashes or newlines in the
    # model name or prompt are escaped instead of corrupting the body.
    # --argjson makes jq reject a max_tokens value that is not valid JSON,
    # in which case json_data stays empty and the row is reported as ERROR.
    json_data=$(jq -n \
        --arg model "$MODEL" \
        --argjson max_tokens "$MAX_TOKENS" \
        --arg prompt "$prompt" \
        '{
            model: $model,
            max_tokens: $max_tokens,
            timings_per_token: true,
            top_k: 1,
            messages: [{role: "user", content: $prompt}]
        }')

    # Make request and extract values. -S lets curl report transport errors on
    # stderr while -s still hides the progress meter. Objects without a
    # predicted_n (e.g. an error body with no timings) are dropped, and if
    # several objects remain only the last one is kept.
    result=""
    if [[ -n "$json_data" ]]; then
        result=$(curl -sS "$URL" "${HEADERS[@]}" -d "$json_data" \
            | jq -cR "$JQ_FILTER" \
            | jq -c 'select(.predicted_n != null)' \
            | tail -n 1)
    fi

    # Format all five value columns in a single jq call. Numbers are rounded
    # to two decimals; missing draft counters print as "null" and a missing
    # ratio prints as "N/A".
    row=""
    if [[ -n "$result" ]]; then
        row=$(jq -r '
            def round2: (. * 100 | round) / 100;
            [
                .predicted_n,
                (.predicted_per_second | if . then round2 else "N/A" end),
                .draft_n,
                .draft_n_accepted,
                (.draft_accept_ratio | if . then round2 else "N/A" end)
            ] | map(tostring) | join(" | ")
        ' <<<"$result")
    fi

    if [[ -n "$row" ]]; then
        echo "| $short_prompt | $row |"
    else
        echo "| $short_prompt | ERROR | ERROR | ERROR | ERROR | ERROR |"
    fi
done
Clone this wiki locally