64 changes: 64 additions & 0 deletions examples/server/bench/README.md
@@ -0,0 +1,64 @@
### Server benchmark tools

The benchmark uses [k6](https://k6.io/).

#### Install k6 on Ubuntu
```shell
snap install k6
```

#### Downloading the ShareGPT dataset

```shell
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
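
The benchmark script reads only the first two turns of each conversation. As a rough guide, this is the record shape it relies on; a hedged sketch, where the `from` values are an assumption about the dataset and only `conversations[*].value` is actually used:

```js
// Simplified sketch of one ShareGPT record as consumed by script.js.
// Only `conversations[i].value` is read; `from` is shown for context only (assumption).
const exampleRecord = {
    "id": "...",
    "conversations": [
        { "from": "human", "value": "first turn, sent as the system message" },
        { "from": "gpt",   "value": "second turn, sent as the user message" },
    ],
}
```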

#### Download a model
Example for PHI-2

```shell
../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
```

#### Start the server
The server must listen on `localhost:8080`, the endpoint hard-coded in `script.js` (a sketch for overriding it follows the example).

Example:
```shell
server --host localhost --port 8080 \
--model ggml-model-q4_0.gguf \
--cont-batching \
--metrics \
--parallel 8 \
--batch-size 512 \
--ctx-size 4096 \
--log-format text \
-ngl 33
```
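
The completion endpoint is hard-coded in `script.js` as `http://localhost:8080/v1/chat/completions`. If the server runs elsewhere, one option is to resolve the URL from an environment variable; a minimal sketch, where the `SERVER_BENCH_URL` name is an assumption and not part of the provided script:

```js
// Hypothetical tweak to script.js: read the endpoint from the environment,
// falling back to the default used in this README.
const server_url = __ENV.SERVER_BENCH_URL || 'http://localhost:8080/v1/chat/completions'

// ...and post the payload to it instead of the hard-coded URL:
// let res = http.post(server_url, JSON.stringify(payload), { headers: { 'Content-Type': 'application/json' } })
```

Environment variables can be passed to k6 with `-e`, e.g. `k6 run -e SERVER_BENCH_URL=http://myhost:8080/v1/chat/completions script.js`.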

#### Run the benchmark
```shell
k6 run script.js
```

#### Change the number of concurrent users
In `script.js`, adjust the `ramping-vus` stages (durations and target VUs) to match the number of server slots, as in the sketch below.
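
For example, if the server was started with `--parallel 16`, the plateau could be raised to 16 virtual users; a sketch of the `stages` entry in `options.scenarios.completions` (durations are illustrative):

```js
// Ramp up to one VU per server slot, hold for the measurement window, ramp down.
stages: [
    {duration: '1m', target: 16},  // ramp-up
    {duration: '3m', target: 16},  // steady state
    {duration: '1m', target: 0},   // ramp-down
],
```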

#### Metrics

The following metrics are available:
- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens`
- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request duration in seconds
- `llamacpp_completions_truncated_rate` Rate of truncated completions, i.e. `finish_reason === 'length'`
- `llamacpp_completions_stop_rate` Rate of completions that stopped naturally, i.e. `finish_reason === 'stop'`

The script will fail if too many completions are truncated; see `llamacpp_completions_truncated_rate` and the sketch below.
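
The abort comes from the k6 threshold on `llamacpp_completions_truncated_rate` defined in `script.js`; to tolerate a different share of truncated completions, the rate can be changed, e.g. (values are illustrative):

```js
// Abort the run if more than 5% of completions are truncated,
// evaluated only after an initial 1 minute delay.
thresholds: {
    llamacpp_completions_truncated_rate: [
        { threshold: 'rate < 0.05', abortOnFail: true, delayAbortEval: '1m' },
    ],
},
```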

k6 metrics can be compared against the [server metrics](../README.md) with:

```shell
curl http://localhost:8080/metrics
```
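
To also capture the server-side counters at the end of a run, one option is a k6 `teardown()` hook; a minimal sketch, not part of the provided `script.js`:

```js
// Hypothetical addition to script.js: dump the Prometheus-format server metrics
// once, after all virtual users have finished.
export function teardown() {
    const res = http.get('http://localhost:8080/metrics')
    console.log(res.body)
}
```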
84 changes: 84 additions & 0 deletions examples/server/bench/script.js
@@ -0,0 +1,84 @@
import http from 'k6/http';
import { check, sleep } from 'k6';
import { SharedArray } from 'k6/data';
import { Counter, Gauge, Rate } from 'k6/metrics';

const data = new SharedArray('conversations', function () {
    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))
        // Filter out the conversations with less than 2 turns.
        .filter(conversation => conversation["conversations"].length >= 2)
        // Only keep the first two turns of each conversation.
        .map(conversation => [conversation["conversations"][0]["value"], conversation["conversations"][1]["value"]]);
});

const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');

const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');

export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // more than 10% truncated completions will abort the test
            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
        ],
    },
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                {duration: '1m', target: 8},
                {duration: '3m', target: 8},
                {duration: '1m', target: 0},
            ],
            gracefulRampDown: '30s',
        },
    },
};

export default function () {
    // NOTE: as written, every iteration replays the first conversation in the dataset.
    const conversation = data[0]
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": conversation[0],
            },
            {
                "role": "user",
                "content": conversation[1],
            }
        ],
        "model": "model",
        "stream": false,
    }
    let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
        headers: { 'Content-Type': 'application/json' },
    })

    check(res, {'success completion': (r) => r.status === 200})

    const completions = res.json()

    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)

    llamacpp_completion_tokens.add(completions.usage.completion_tokens)
    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)

    // res.timings.duration is in milliseconds, hence * 1e3 to get tokens per second.
    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)

    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')

    sleep(0.3)
}