@@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
4040 grpcOpts := grpcModelOpts (c )
4141 defOpts = append (defOpts , model .WithLoadGRPCLoadModelOpts (grpcOpts ))
4242
43- if so .SingleBackend {
44- defOpts = append (defOpts , model .WithSingleActiveBackend ())
45- }
46-
4743 if so .ParallelBackendRequests {
4844 defOpts = append (defOpts , model .EnableParallelRequests )
4945 }
@@ -121,7 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
121117 triggers := make ([]* pb.GrammarTrigger , 0 )
122118 for _ , t := range c .FunctionsConfig .GrammarConfig .GrammarTriggers {
123119 triggers = append (triggers , & pb.GrammarTrigger {
124- Word : t .Word ,
120+ Word : t .Word ,
125121 })
126122
127123 }
@@ -161,33 +157,33 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
161157 DisableLogStatus : c .DisableLogStatus ,
162158 DType : c .DType ,
163159 // LimitMMPerPrompt vLLM
164- LimitImagePerPrompt : int32 (c .LimitMMPerPrompt .LimitImagePerPrompt ),
165- LimitVideoPerPrompt : int32 (c .LimitMMPerPrompt .LimitVideoPerPrompt ),
166- LimitAudioPerPrompt : int32 (c .LimitMMPerPrompt .LimitAudioPerPrompt ),
167- MMProj : c .MMProj ,
168- FlashAttention : c .FlashAttention ,
169- CacheTypeKey : c .CacheTypeK ,
170- CacheTypeValue : c .CacheTypeV ,
171- NoKVOffload : c .NoKVOffloading ,
172- YarnExtFactor : c .YarnExtFactor ,
173- YarnAttnFactor : c .YarnAttnFactor ,
174- YarnBetaFast : c .YarnBetaFast ,
175- YarnBetaSlow : c .YarnBetaSlow ,
176- NGQA : c .NGQA ,
177- RMSNormEps : c .RMSNormEps ,
178- MLock : mmlock ,
179- RopeFreqBase : c .RopeFreqBase ,
180- RopeScaling : c .RopeScaling ,
181- Type : c .ModelType ,
182- RopeFreqScale : c .RopeFreqScale ,
183- NUMA : c .NUMA ,
184- Embeddings : embeddings ,
185- LowVRAM : lowVRAM ,
186- NGPULayers : int32 (nGPULayers ),
187- MMap : mmap ,
188- MainGPU : c .MainGPU ,
189- Threads : int32 (* c .Threads ),
190- TensorSplit : c .TensorSplit ,
160+ LimitImagePerPrompt : int32 (c .LimitMMPerPrompt .LimitImagePerPrompt ),
161+ LimitVideoPerPrompt : int32 (c .LimitMMPerPrompt .LimitVideoPerPrompt ),
162+ LimitAudioPerPrompt : int32 (c .LimitMMPerPrompt .LimitAudioPerPrompt ),
163+ MMProj : c .MMProj ,
164+ FlashAttention : c .FlashAttention ,
165+ CacheTypeKey : c .CacheTypeK ,
166+ CacheTypeValue : c .CacheTypeV ,
167+ NoKVOffload : c .NoKVOffloading ,
168+ YarnExtFactor : c .YarnExtFactor ,
169+ YarnAttnFactor : c .YarnAttnFactor ,
170+ YarnBetaFast : c .YarnBetaFast ,
171+ YarnBetaSlow : c .YarnBetaSlow ,
172+ NGQA : c .NGQA ,
173+ RMSNormEps : c .RMSNormEps ,
174+ MLock : mmlock ,
175+ RopeFreqBase : c .RopeFreqBase ,
176+ RopeScaling : c .RopeScaling ,
177+ Type : c .ModelType ,
178+ RopeFreqScale : c .RopeFreqScale ,
179+ NUMA : c .NUMA ,
180+ Embeddings : embeddings ,
181+ LowVRAM : lowVRAM ,
182+ NGPULayers : int32 (nGPULayers ),
183+ MMap : mmap ,
184+ MainGPU : c .MainGPU ,
185+ Threads : int32 (* c .Threads ),
186+ TensorSplit : c .TensorSplit ,
191187 // AutoGPTQ
192188 ModelBaseName : c .AutoGPTQ .ModelBaseName ,
193189 Device : c .AutoGPTQ .Device ,
0 commit comments