# Benchmark: builds the llama.cpp server and runs the k6/Prometheus benchmark
# suite on a self-hosted Azure GPU runner, then publishes results as a commit status.
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC64as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC48ads_A100_v4
          - Standard_ND96asr_A100_v4
          - Standard_NC40ads_H100_v5
          - Standard_NC80adis_H100_v5
  push:
    branches:
      - master
      - hp/server/bench/workflow # FIXME remove
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
  schedule:
    # Nightly at 02:04 UTC (off the hour to avoid scheduler congestion).
    # NOTE: was pasted as ' 04 2 * * *' — the leading space made it an invalid cron expression.
    - cron: '04 2 * * *'

# Cancel an in-flight run when a newer commit lands on the same ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
    # if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }}
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0 # full history, so branch/commit metadata passed to bench.py resolves

      # Debug step: prints the event-context values used by the (commented-out) job-level `if`.
      - name: TMP
        id: tmp
        run: |
          echo IF: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }}
          echo github.event.inputs.gpu-series=${{ github.event.inputs.gpu-series }}
          echo github.event.pull_request=${{ github.event.pull_request }}
          echo github.event.push.ref=${{ github.event.push.ref }}
          echo github.event.schedule=${{ github.event.schedule }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      # Start Prometheus in the background and wait until it accepts connections on 9090.
      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Install k6
        id: k6_installation
        run: |
          cd examples/server/bench
          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
          tar xzf k6*.tar.gz --strip-components=1

      # CUDA build of the server target only (T4 → compute capability 7.5).
      - name: Build
        id: cmake_build
        run: |
          set -eux
          mkdir build
          cd build
          cmake .. \
            -DLLAMA_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DLLAMA_CUBLAS=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release
          cmake --build . --config Release -j $(nproc) --target server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      # Run the benchmark; bench.py writes results.github.env, whose key=value lines
      # are exported into the job environment for the reporting steps below.
      - name: Server bench
        id: server_bench
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          BENCH_K6_BIN_PATH=./k6 python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch ${{ github.ref_name }} \
            --commit ${{ github.sha }} \
            --scenario script.js \
            --duration 30s \
            --hf-repo ggml-org/models \
            --hf-file phi-2/ggml-model-q4_0.gguf \
            --model-path-prefix /models \
            --parallel 8 \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV

      # - name: Comment PR
      #   uses: mshick/add-pr-comment@v2
      #   id: comment_pr
      #   if: ${{ github.event.pull_request != '' }}
      #   with:
      #     message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
      #     message: |
      #       $BENCH_PR_COMMENT

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          # `authToken` is a required input of github-status-action; without it the
          # status POST to the GitHub API cannot authenticate.
          authToken: ${{ secrets.GITHUB_TOKEN }}
          context: ${{ github.job }}
          # `with:` values are not shell-expanded, so a literal $BENCH_RESULTS would be
          # posted verbatim; the env context reads the value exported from results.github.env.
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload results
        if: ${{ github.event.pull_request != '' }}
        # NOTE(review): the pasted source had no `uses:`/`run:` here, which makes the
        # step invalid. The inputs (path/title/annotationLevel) match the image-annotation
        # action below — TODO confirm this is the intended action and pin its version.
        uses: edunad/actions-image@v2.0.0
        with:
          path: '*.png'
          title: |
            llama.cpp server benchmark results for ${{ github.job }} on ${{ env.RUNNER_LABEL }}: ${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s
          annotationLevel: 'success'