24
24
push :
25
25
branches :
26
26
- master
27
- paths : ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
27
+ paths : ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
28
28
pull_request_target :
29
29
types : [opened, synchronize, reopened]
30
- paths : ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
30
+ paths : ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
31
31
schedule :
32
32
- cron : '04 2 * * *'
33
33
42
42
RUNNER_LABEL : Standard_NC4as_T4_v3 # FIXME: found no way to avoid duplicating this value
43
43
N_USERS : 8
44
44
DURATION : 10m
45
+
46
+ strategy :
47
+ matrix :
48
+ model : [phi-2]
49
+ ftype : [q4_0, q8_0, f16]
50
+ include :
51
+ - model : phi-2
52
+ ftype : q4_0
53
+ pr_comment_enabled : "true"
54
+
45
55
if : ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
46
56
steps :
47
57
- name : Clone
@@ -116,7 +126,7 @@ jobs:
116
126
--scenario script.js \
117
127
--duration ${{ github.event.inputs.duration || env.DURATION }} \
118
128
--hf-repo ggml-org/models \
119
- --hf-file phi-2/ggml-model-q4_0.gguf \
129
+ --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
120
130
--model-path-prefix /models \
121
131
--parallel ${{ env.N_USERS }} \
122
132
-ngl 33 \
@@ -134,7 +144,7 @@ jobs:
134
144
135
145
- uses : actions/upload-artifact@v4
136
146
with :
137
- name : benchmark-results
147
+ name : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
138
148
compression-level : 9
139
149
path : |
140
150
examples/server/bench/*.jpg
@@ -146,7 +156,7 @@ jobs:
146
156
with :
147
157
authToken : ${{secrets.GITHUB_TOKEN}}
148
158
sha : ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
149
- context : bench-server-baseline
159
+ context : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
150
160
description : |
151
161
${{ env.BENCH_RESULTS }}
152
162
state : 'success'
@@ -203,21 +213,26 @@ jobs:
203
213
- name : Comment PR
204
214
uses : mshick/add-pr-comment@v2
205
215
id : comment_pr
206
- if : ${{ github.event.pull_request != '' }}
216
+ if : ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
207
217
with :
208
- message-id : bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
218
+ message-id : bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
209
219
message : |
210
- 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
220
+ <p align="center">
221
+
222
+ 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
223
+
224
+ </p>
225
+
226
+ <details>
227
+
228
+ <summary>Expand details for performance related PR only</summary>
211
229
212
230
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
213
231
- HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
214
232
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
215
233
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
216
234
- ${{ env.BENCH_GRAPH_XLABEL }}
217
235
218
- <details>
219
-
220
- <summary>Time series</summary>
221
236
222
237
<p align="center">
223
238
0 commit comments