
Commit 2711f79

Aisuko authored and mudler committed

feat(conda): Add separate env for vllm (#1148)
**Description**

This PR is related to #1117.

**Notes for Reviewers**

* The gRPC server can be started as normal.
* The test case can be triggered in VSCode.
* As with other PRs of this kind: add `vllm.yml` and a Makefile, add `run.sh` to the main Dockerfile, and add the corresponding command to the main Makefile.

**[Signed commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin)**

- [x] Yes, I signed my commits.

Signed-off-by: GitHub <[email protected]>
Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent c40d924 commit 2711f79
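The notes above say the gRPC server can be started as normal; below is a minimal sketch of exercising it from Python, assuming the generated stubs `backend_pb2`/`backend_pb2_grpc` are importable and the server was started with `--addr localhost:50051` (the address used by the test file in this commit).

```
# Minimal health-check client for the vllm gRPC backend (sketch).
# Assumes backend_pb2 / backend_pb2_grpc were generated from backend.proto
# and the server is listening on localhost:50051.
import grpc

import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    reply = stub.Health(backend_pb2.HealthMessage())
    print(reply.message)  # expected: b'OK'
```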

File tree: 8 files changed (+229, -7 lines changed)


Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 
 ENV BUILD_TYPE=${BUILD_TYPE}
-ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/run.sh,bark:/build/extra/grpc/bark/run.sh,diffusers:/build/extra/grpc/diffusers/run.sh,exllama:/build/extra/grpc/exllama/exllama.py,vall-e-x:/build/extra/grpc/vall-e-x/ttsvalle.py,vllm:/build/extra/grpc/vllm/backend_vllm.py"
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/huggingface.py,autogptq:/build/extra/grpc/autogptq/run.sh,bark:/build/extra/grpc/bark/run.sh,diffusers:/build/extra/grpc/diffusers/run.sh,exllama:/build/extra/grpc/exllama/exllama.py,vall-e-x:/build/extra/grpc/vall-e-x/ttsvalle.py,vllm:/build/extra/grpc/vllm/run.sh"
 ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
 ARG GO_TAGS="stablediffusion tts"
 
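EXTERNAL_GRPC_BACKENDS is a comma-separated list of name:launcher pairs; this change points the vllm entry at run.sh instead of backend_vllm.py so the backend starts inside its own conda env. A hypothetical Python sketch of how such a mapping splits into (name, path) pairs, for illustration only — LocalAI itself consumes this variable from its Go code, so this is not the project's actual parser.

```
import os

# Hypothetical parser for the name:path pairs in EXTERNAL_GRPC_BACKENDS;
# illustrates the format only, not LocalAI's actual (Go) implementation.
raw = os.environ.get("EXTERNAL_GRPC_BACKENDS", "vllm:/build/extra/grpc/vllm/run.sh")
backends = dict(entry.split(":", 1) for entry in raw.split(",") if entry)
print(backends["vllm"])  # /build/extra/grpc/vllm/run.sh
```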

Makefile

Lines changed: 1 addition & 0 deletions
@@ -413,6 +413,7 @@ prepare-extra-conda-environments:
 	$(MAKE) -C extra/grpc/autogptq
 	$(MAKE) -C extra/grpc/bark
 	$(MAKE) -C extra/grpc/diffusers
+	$(MAKE) -C extra/grpc/vllm
 
 backend-assets/grpc:
 	mkdir -p backend-assets/grpc

extra/grpc/vllm/Makefile

Lines changed: 11 additions & 0 deletions
.PHONY: vllm
vllm:
	@echo "Creating virtual environment..."
	@conda env create --name vllm --file vllm.yml
	@echo "Virtual environment created."

.PHONY: run
run:
	@echo "Running vllm..."
	bash run.sh
	@echo "vllm run."

extra/grpc/vllm/README.md

Lines changed: 5 additions & 0 deletions
# Creating a separate environment for the vllm project

```
make vllm
```

extra/grpc/vllm/backend_vllm.py

Lines changed: 61 additions & 6 deletions
@@ -1,15 +1,15 @@
 #!/usr/bin/env python3
-import grpc
 from concurrent import futures
 import time
-import backend_pb2
-import backend_pb2_grpc
 import argparse
 import signal
 import sys
-import os, glob
+import os
+
+import backend_pb2
+import backend_pb2_grpc
 
-from pathlib import Path
+import grpc
 from vllm import LLM, SamplingParams
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -19,7 +19,20 @@
 
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer that implements the Backend service defined in backend.proto.
+    """
     def generate(self,prompt, max_new_tokens):
+        """
+        Generates text based on the given prompt and maximum number of new tokens.
+
+        Args:
+            prompt (str): The prompt to generate text from.
+            max_new_tokens (int): The maximum number of new tokens to generate.
+
+        Returns:
+            str: The generated text.
+        """
         self.generator.end_beam_search()
 
         # Tokenizing the input
@@ -41,9 +54,31 @@ def generate(self,prompt, max_new_tokens):
             if token.item() == self.generator.tokenizer.eos_token_id:
                 break
         return decoded_text
+
     def Health(self, request, context):
+        """
+        Returns a health check message.
+
+        Args:
+            request: The health check request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Reply: The health check reply.
+        """
         return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
     def LoadModel(self, request, context):
+        """
+        Loads a language model.
+
+        Args:
+            request: The load model request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Result: The load model result.
+        """
         try:
             if request.Quantization != "":
                 self.llm = LLM(model=request.Model, quantization=request.Quantization)
@@ -54,6 +89,16 @@ def LoadModel(self, request, context):
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
     def Predict(self, request, context):
+        """
+        Generates text based on the given prompt and sampling parameters.
+
+        Args:
+            request: The predict request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Result: The predict result.
+        """
         if request.TopP == 0:
             request.TopP = 0.9
 
@@ -68,6 +113,16 @@ def Predict(self, request, context):
         return backend_pb2.Result(message=bytes(generated_text, encoding='utf-8'))
 
     def PredictStream(self, request, context):
+        """
+        Generates text based on the given prompt and sampling parameters, and streams the results.
+
+        Args:
+            request: The predict stream request.
+            context: The gRPC context.
+
+        Returns:
+            backend_pb2.Result: The predict stream result.
+        """
         # Implement PredictStream RPC
         #for reply in some_data_generator():
         #    yield reply
@@ -104,4 +159,4 @@ def signal_handler(sig, frame):
     )
     args = parser.parse_args()
 
-    serve(args.addr)
+    serve(args.addr)
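LoadModel wraps vllm's LLM class and Predict falls back to a top_p of 0.9, as visible in the hunks above. A minimal standalone sketch of the underlying vLLM API the backend wraps is shown below; the model name and temperature here are placeholder assumptions, not values LocalAI passes.

```
# Standalone sketch of the vLLM calls the backend wraps.
# Model name and temperature are placeholders; top_p=0.9 mirrors the Predict default above.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")                 # LoadModel: LLM(model=request.Model, ...)
params = SamplingParams(temperature=0.8, top_p=0.9)  # Predict: request.TopP defaults to 0.9
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)
```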

extra/grpc/vllm/run.sh

Lines changed: 10 additions & 0 deletions
##
## A bash script wrapper that runs the vllm server with conda

# Activate conda environment
source activate vllm

# get the directory where the bash script is located
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

python $DIR/backend_vllm.py
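run.sh activates the env with `source activate vllm` and then launches the server. An equivalent way to launch the backend inside the env without sourcing is `conda run`; a hedged Python sketch of that alternative follows — the address is an example value matching the test below, not something run.sh passes.

```
import subprocess

# Alternative to run.sh (sketch): launch the backend inside the vllm conda env
# via `conda run` instead of `source activate`. The --addr value is an example.
subprocess.run(
    ["conda", "run", "-n", "vllm", "python", "backend_vllm.py", "--addr", "localhost:50051"],
    check=True,
)
```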
extra/grpc/vllm/test_backend_vllm.py

Lines changed: 41 additions & 0 deletions
import unittest
import subprocess
import time

import grpc

import backend_pb2
import backend_pb2_grpc


class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service.

    This class contains methods to test the startup and shutdown of the gRPC service.
    """
    def setUp(self):
        self.service = subprocess.Popen(["python", "backend_vllm.py", "--addr", "localhost:50051"])

    def tearDown(self) -> None:
        self.service.terminate()
        self.service.wait()

    def test_server_startup(self):
        time.sleep(2)
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
        finally:
            self.tearDown()
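The committed test only covers Health. A sketch of how a LoadModel check could be added to TestBackendServicer follows; only the `Model`/`Quantization` request fields and the "Model loaded successfully" reply are confirmed by the diff above — the `ModelOptions` message name and the model identifier are assumptions, and running it would require the model weights to be available.

```
# Hypothetical follow-up test method for TestBackendServicer (sketch).
# Assumes the LoadModel request message is backend_pb2.ModelOptions and
# uses a placeholder model name; requires the weights to be downloadable.
def test_load_model(self):
    time.sleep(2)
    try:
        self.setUp()
        with grpc.insecure_channel("localhost:50051") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            result = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
            self.assertTrue(result.success)
            self.assertEqual(result.message, "Model loaded successfully")
    except Exception as err:
        print(err)
        self.fail("LoadModel failed")
    finally:
        self.tearDown()
```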

extra/grpc/vllm/vllm.yml

Lines changed: 99 additions & 0 deletions
name: vllm
channels:
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.08.22=h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - ncurses=6.4=h6a678d5_0
  - openssl=3.0.11=h7f8727e_2
  - pip=23.2.1=py311h06a4308_0
  - python=3.11.5=h955ad1f_0
  - readline=8.2=h5eee18b_0
  - setuptools=68.0.0=py311h06a4308_0
  - sqlite=3.41.2=h5eee18b_0
  - tk=8.6.12=h1ccaba5_0
  - wheel=0.41.2=py311h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zlib=1.2.13=h5eee18b_0
  - pip:
    - aiosignal==1.3.1
    - anyio==3.7.1
    - attrs==23.1.0
    - certifi==2023.7.22
    - charset-normalizer==3.3.0
    - click==8.1.7
    - cmake==3.27.6
    - fastapi==0.103.2
    - filelock==3.12.4
    - frozenlist==1.4.0
    - fsspec==2023.9.2
    - grpcio==1.59.0
    - h11==0.14.0
    - httptools==0.6.0
    - huggingface-hub==0.17.3
    - idna==3.4
    - jinja2==3.1.2
    - jsonschema==4.19.1
    - jsonschema-specifications==2023.7.1
    - lit==17.0.2
    - markupsafe==2.1.3
    - mpmath==1.3.0
    - msgpack==1.0.7
    - networkx==3.1
    - ninja==1.11.1
    - numpy==1.26.0
    - nvidia-cublas-cu11==11.10.3.66
    - nvidia-cuda-cupti-cu11==11.7.101
    - nvidia-cuda-nvrtc-cu11==11.7.99
    - nvidia-cuda-runtime-cu11==11.7.99
    - nvidia-cudnn-cu11==8.5.0.96
    - nvidia-cufft-cu11==10.9.0.58
    - nvidia-curand-cu11==10.2.10.91
    - nvidia-cusolver-cu11==11.4.0.1
    - nvidia-cusparse-cu11==11.7.4.91
    - nvidia-nccl-cu11==2.14.3
    - nvidia-nvtx-cu11==11.7.91
    - packaging==23.2
    - pandas==2.1.1
    - protobuf==4.24.4
    - psutil==5.9.5
    - pyarrow==13.0.0
    - pydantic==1.10.13
    - python-dateutil==2.8.2
    - python-dotenv==1.0.0
    - pytz==2023.3.post1
    - pyyaml==6.0.1
    - ray==2.7.0
    - referencing==0.30.2
    - regex==2023.10.3
    - requests==2.31.0
    - rpds-py==0.10.4
    - safetensors==0.4.0
    - sentencepiece==0.1.99
    - six==1.16.0
    - sniffio==1.3.0
    - starlette==0.27.0
    - sympy==1.12
    - tokenizers==0.14.1
    - torch==2.0.1
    - tqdm==4.66.1
    - transformers==4.34.0
    - triton==2.0.0
    - typing-extensions==4.8.0
    - tzdata==2023.3
    - urllib3==2.0.6
    - uvicorn==0.23.2
    - uvloop==0.17.0
    - vllm==0.2.0
    - watchfiles==0.20.0
    - websockets==11.0.3
    - xformers==0.0.22
prefix: /opt/conda/envs/vllm
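A small hedged sanity check for the created env follows: run it with the `vllm` conda env active to confirm the key pins above are importable and report their versions. The `__version__` attributes used here are standard for these packages.

```
# Smoke test for the vllm conda env (sketch): confirm the key pinned packages
# import and report versions (expected: vllm 0.2.0, torch 2.0.1, grpcio 1.59.0,
# transformers 4.34.0 per vllm.yml above).
import grpc
import torch
import transformers
import vllm

print("vllm        ", vllm.__version__)
print("torch       ", torch.__version__)
print("transformers", transformers.__version__)
print("grpcio      ", grpc.__version__)
```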
