
Commit 2c425e9

feat(loader): enhance single active backend by treating as singleton (#5107)

Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent c59975a commit 2c425e9
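
This commit moves single-active-backend handling out of per-request model options and into the ModelLoader itself: the loader is constructed once with the SingleBackend flag (see core/application/application.go below), and every backend entry point now releases the active backend with a deferred loader.Close(). A minimal sketch of what such a singleton guard could look like (the lock, field names, and stub types are assumptions for illustration, not the actual LocalAI implementation):

package model

import "sync"

// Stub types for the sketch; the real loader manages gRPC backend processes.
type Model struct{}
type Option func(*Model)

type ModelLoader struct {
	ModelPath     string
	singletonMode bool       // assumed: set from ApplicationConfig.SingleBackend
	mu            sync.Mutex // assumed: held while a backend is active in singleton mode
}

func NewModelLoader(modelPath string, singleActiveBackend bool) *ModelLoader {
	return &ModelLoader{ModelPath: modelPath, singletonMode: singleActiveBackend}
}

// Load waits for the singleton slot when running in single-backend mode,
// so at most one backend is active at any time.
func (ml *ModelLoader) Load(opts ...Option) (*Model, error) {
	if ml.singletonMode {
		ml.mu.Lock()
	}
	m := &Model{}
	for _, o := range opts {
		o(m)
	}
	return m, nil
}

// Close releases the singleton slot; callers defer it after a successful Load.
func (ml *ModelLoader) Close() {
	if ml.singletonMode {
		ml.mu.Unlock()
	}
}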

File tree

24 files changed, +92 -71 lines changed


core/application/application.go

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ type Application struct {
 func newApplication(appConfig *config.ApplicationConfig) *Application {
 	return &Application{
 		backendLoader:      config.NewBackendConfigLoader(appConfig.ModelPath),
-		modelLoader:        model.NewModelLoader(appConfig.ModelPath),
+		modelLoader:        model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
 		applicationConfig:  appConfig,
 		templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
 	}

core/application/startup.go

Lines changed: 1 addition & 1 deletion

@@ -143,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}()
 	}

-	if options.LoadToMemory != nil {
+	if options.LoadToMemory != nil && !options.SingleBackend {
 		for _, m := range options.LoadToMemory {
 			cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
 			if err != nil {
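
A note on the change above: with SingleBackend set, preloading a list of models at startup would conflict with the one-backend-at-a-time guarantee, since each preloaded model would need to claim the single active slot; the LoadToMemory loop is therefore skipped in that mode.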

core/backend/embeddings.go

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	var fn func() ([]float32, error)
 	switch model := inferenceModel.(type) {
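
The same pattern repeats across the backend entry points in this commit: Load acquires the (possibly singleton) backend and a deferred Close releases it once the request finishes. A caller-side sketch of the pairing, reusing the hypothetical stub loader from the sketch above:

// runInference is a hypothetical caller following the Load/Close pairing.
func runInference(loader *ModelLoader, opts ...Option) error {
	inferenceModel, err := loader.Load(opts...)
	if err != nil {
		// Close is only deferred after a successful Load, so a real
		// implementation would have to free the singleton slot on
		// Load's own error paths.
		return err
	}
	defer loader.Close() // free the single-backend slot when the request is done

	_ = inferenceModel // ... perform the actual gRPC inference call here ...
	return nil
}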

core/backend/image.go

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	fn := func() error {
 		_, err := inferenceModel.GenerateImage(

core/backend/llm.go

Lines changed: 1 addition & 0 deletions

@@ -53,6 +53,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	var protoMessages []*proto.Message
 	// if we are using the tokenizer template, we need to convert the messages to proto messages

core/backend/options.go

Lines changed: 28 additions & 32 deletions

@@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
 	grpcOpts := grpcModelOpts(c)
 	defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))

-	if so.SingleBackend {
-		defOpts = append(defOpts, model.WithSingleActiveBackend())
-	}
-
 	if so.ParallelBackendRequests {
 		defOpts = append(defOpts, model.EnableParallelRequests)
 	}

@@ -121,7 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	triggers := make([]*pb.GrammarTrigger, 0)
 	for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
 		triggers = append(triggers, &pb.GrammarTrigger{
-			Word: t.Word,
+			Word: t.Word,
 		})

 	}

@@ -161,33 +157,33 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		DisableLogStatus: c.DisableLogStatus,
 		DType:            c.DType,
 		// LimitMMPerPrompt vLLM
-		LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
-		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
-		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
-		MMProj:              c.MMProj,
-		FlashAttention:      c.FlashAttention,
-		CacheTypeKey:        c.CacheTypeK,
-		CacheTypeValue:      c.CacheTypeV,
-		NoKVOffload:         c.NoKVOffloading,
-		YarnExtFactor:       c.YarnExtFactor,
-		YarnAttnFactor:      c.YarnAttnFactor,
-		YarnBetaFast:        c.YarnBetaFast,
-		YarnBetaSlow:        c.YarnBetaSlow,
-		NGQA:                c.NGQA,
-		RMSNormEps:          c.RMSNormEps,
-		MLock:               mmlock,
-		RopeFreqBase:        c.RopeFreqBase,
-		RopeScaling:         c.RopeScaling,
-		Type:                c.ModelType,
-		RopeFreqScale:       c.RopeFreqScale,
-		NUMA:                c.NUMA,
-		Embeddings:          embeddings,
-		LowVRAM:             lowVRAM,
-		NGPULayers:          int32(nGPULayers),
-		MMap:                mmap,
-		MainGPU:             c.MainGPU,
-		Threads:             int32(*c.Threads),
-		TensorSplit:         c.TensorSplit,
+		LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
+		MMProj:              c.MMProj,
+		FlashAttention:      c.FlashAttention,
+		CacheTypeKey:        c.CacheTypeK,
+		CacheTypeValue:      c.CacheTypeV,
+		NoKVOffload:         c.NoKVOffloading,
+		YarnExtFactor:       c.YarnExtFactor,
+		YarnAttnFactor:      c.YarnAttnFactor,
+		YarnBetaFast:        c.YarnBetaFast,
+		YarnBetaSlow:        c.YarnBetaSlow,
+		NGQA:                c.NGQA,
+		RMSNormEps:          c.RMSNormEps,
+		MLock:               mmlock,
+		RopeFreqBase:        c.RopeFreqBase,
+		RopeScaling:         c.RopeScaling,
+		Type:                c.ModelType,
+		RopeFreqScale:       c.RopeFreqScale,
+		NUMA:                c.NUMA,
+		Embeddings:          embeddings,
+		LowVRAM:             lowVRAM,
+		NGPULayers:          int32(nGPULayers),
+		MMap:                mmap,
+		MainGPU:             c.MainGPU,
+		Threads:             int32(*c.Threads),
+		TensorSplit:         c.TensorSplit,
 		// AutoGPTQ
 		ModelBaseName: c.AutoGPTQ.ModelBaseName,
 		Device:        c.AutoGPTQ.Device,
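
The first hunk above drops the per-request model.WithSingleActiveBackend() option: since the flag is now passed once to model.NewModelLoader, the loader itself owns the singleton behavior and ModelOptions no longer needs to forward it.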

core/backend/rerank.go

Lines changed: 1 addition & 1 deletion

@@ -12,10 +12,10 @@ import (
 func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
 	opts := ModelOptions(backendConfig, appConfig)
 	rerankModel, err := loader.Load(opts...)
-
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	if rerankModel == nil {
 		return nil, fmt.Errorf("could not load rerank model")

core/backend/soundgeneration.go

Lines changed: 1 addition & 1 deletion

@@ -26,10 +26,10 @@ func SoundGeneration(

 	opts := ModelOptions(backendConfig, appConfig)
 	soundGenModel, err := loader.Load(opts...)
-
 	if err != nil {
 		return "", nil, err
 	}
+	defer loader.Close()

 	if soundGenModel == nil {
 		return "", nil, fmt.Errorf("could not load sound generation model")

core/backend/token_metrics.go

Lines changed: 1 addition & 0 deletions

@@ -20,6 +20,7 @@ func TokenMetrics(
 	if err != nil {
 		return nil, err
 	}
+	defer loader.Close()

 	if model == nil {
 		return nil, fmt.Errorf("could not loadmodel model")

core/backend/tokenize.go

Lines changed: 1 addition & 1 deletion

@@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac

 	opts := ModelOptions(backendConfig, appConfig)
 	inferenceModel, err = loader.Load(opts...)
-
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
+	defer loader.Close()

 	predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
 	predictOptions.Prompt = s
