64 changes: 64 additions & 0 deletions examples/server/bench/README.md
@@ -0,0 +1,64 @@
### Server benchmark tools

The benchmark uses [k6](https://k6.io/).

#### Install k6 on Ubuntu
```shell
snap install k6
```

#### Downloading the ShareGPT dataset

```shell
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
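
The benchmark script reads only the first two turns of each conversation. As a rough guide, this is the record shape it relies on; a hedged sketch, where the `from` values are an assumption about the dataset and only `conversations[*].value` is actually used:

```js
// Simplified sketch of one ShareGPT record as consumed by script.js.
// Only `conversations[i].value` is read; `from` is shown for context only (assumption).
const exampleRecord = {
    "id": "...",
    "conversations": [
        { "from": "human", "value": "first turn, sent as the system message" },
        { "from": "gpt",   "value": "second turn, sent as the user message" },
    ],
}
```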

#### Download a model
Example for PHI-2

```shell
../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
```

#### Start the server
The server must listen on `localhost:8080`, the endpoint hard-coded in `script.js` (a sketch for overriding it follows the example).

Example:
```shell
server --host localhost --port 8080 \
--model ggml-model-q4_0.gguf \
--cont-batching \
--metrics \
--parallel 8 \
--batch-size 512 \
--ctx-size 4096 \
--log-format text \
-ngl 33
```
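
The completion endpoint is hard-coded in `script.js` as `http://localhost:8080/v1/chat/completions`. If the server runs elsewhere, one option is to resolve the URL from an environment variable; a minimal sketch, where the `SERVER_BENCH_URL` name is an assumption and not part of the provided script:

```js
// Hypothetical tweak to script.js: read the endpoint from the environment,
// falling back to the default used in this README.
const server_url = __ENV.SERVER_BENCH_URL || 'http://localhost:8080/v1/chat/completions'

// ...and post the payload to it instead of the hard-coded URL:
// let res = http.post(server_url, JSON.stringify(payload), { headers: { 'Content-Type': 'application/json' } })
```

Environment variables can be passed to k6 with `-e`, e.g. `k6 run -e SERVER_BENCH_URL=http://myhost:8080/v1/chat/completions script.js`.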

#### Run the benchmark
```shell
k6 run script.js
```

#### Change the number of concurrent users
In `script.js`, adjust the `ramping-vus` stages (durations and target VUs) to match the number of server slots, as in the sketch below.
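
For example, if the server was started with `--parallel 16`, the plateau could be raised to 16 virtual users; a sketch of the `stages` entry in `options.scenarios.completions` (durations are illustrative):

```js
// Ramp up to one VU per server slot, hold for the measurement window, ramp down.
stages: [
    {duration: '1m', target: 16},  // ramp-up
    {duration: '3m', target: 16},  // steady state
    {duration: '1m', target: 0},   // ramp-down
],
```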

#### Metrics

The following metrics are available:
- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens`
- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request duration in seconds
- `llamacpp_completions_truncated_rate` Rate of truncated completions, i.e. `finish_reason === 'length'`
- `llamacpp_completions_stop_rate` Rate of completions that stopped naturally, i.e. `finish_reason === 'stop'`

The script will fail if too many completions are truncated; see `llamacpp_completions_truncated_rate` and the sketch below.
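
The abort comes from the k6 threshold on `llamacpp_completions_truncated_rate` defined in `script.js`; to tolerate a different share of truncated completions, the rate can be changed, e.g. (values are illustrative):

```js
// Abort the run if more than 5% of completions are truncated,
// evaluated only after an initial 1 minute delay.
thresholds: {
    llamacpp_completions_truncated_rate: [
        { threshold: 'rate < 0.05', abortOnFail: true, delayAbortEval: '1m' },
    ],
},
```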

k6 metrics can be compared against the [server metrics](../README.md) with:

```shell
curl http://localhost:8080/metrics
```
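
To also capture the server-side counters at the end of a run, one option is a k6 `teardown()` hook; a minimal sketch, not part of the provided `script.js`:

```js
// Hypothetical addition to script.js: dump the Prometheus-format server metrics
// once, after all virtual users have finished.
export function teardown() {
    const res = http.get('http://localhost:8080/metrics')
    console.log(res.body)
}
```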
84 changes: 84 additions & 0 deletions examples/server/bench/script.js
@@ -0,0 +1,84 @@
import http from 'k6/http';
import { check, sleep } from 'k6';
import { SharedArray } from 'k6/data';
import { Counter, Gauge, Rate } from 'k6/metrics';

const data = new SharedArray('conversations', function () {
    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))
        // Filter out the conversations with less than 2 turns.
        .filter(conversation => conversation["conversations"].length >= 2)
        // Only keep the first two turns of each conversation.
        .map(conversation => [conversation["conversations"][0]["value"], conversation["conversations"][1]["value"]]);
});

const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');

const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');

export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // more than 10% truncated completions will abort the test
            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
        ],
    },
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                {duration: '1m', target: 8},
                {duration: '3m', target: 8},
                {duration: '1m', target: 0},
            ],
            gracefulRampDown: '30s',
        },
    },
};

export default function () {
    // NOTE: as written, every iteration replays the first conversation in the dataset.
    const conversation = data[0]
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": conversation[0],
            },
            {
                "role": "user",
                "content": conversation[1],
            }
        ],
        "model": "model",
        "stream": false,
    }
    let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
        headers: { 'Content-Type': 'application/json' },
    })

    check(res, {'success completion': (r) => r.status === 200})

    const completions = res.json()

    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)

    llamacpp_completion_tokens.add(completions.usage.completion_tokens)
    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)

    // res.timings.duration is in milliseconds, hence * 1e3 to get tokens per second.
    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)

    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')

    sleep(0.3)
}