@@ -4,6 +4,7 @@ use crate::HubTokenizerConfig;
44use crate :: { ChatRequest , GenerateRequest , GenerateStreamResponse , PrefillToken } ;
55use crate :: { Entry , Queue , Token } ;
66use futures:: future:: try_join_all;
7+ use minijinja:: { Environment , ErrorKind , Template } ;
78use nohash_hasher:: IntMap ;
89use std:: sync:: {
910 atomic:: { AtomicBool , Ordering } ,
@@ -27,12 +28,12 @@ pub struct Infer {
2728 validation : Validation ,
2829 /// Request queue
2930 queue : Queue ,
30- /// Chat formatter
31- tokenizer_config : HubTokenizerConfig ,
3231 /// Shared state
3332 shared : Arc < Shared > ,
3433 /// Inference limit
3534 limit_concurrent_requests : Arc < Semaphore > ,
35+ /// Chat template
36+ template : Option < Template < ' static , ' static > > ,
3637}
3738
3839/// Infer shared state
@@ -78,12 +79,21 @@ impl Infer {
7879 // Inference limit with a semaphore
7980 let semaphore = Arc :: new ( Semaphore :: new ( max_concurrent_requests) ) ;
8081
82+ let template = tokenizer_config. chat_template . map ( |t| {
83+ let env = Box :: new ( Environment :: new ( ) ) ;
84+ let template_str = t. into_boxed_str ( ) ;
85+ // leaking env and template_str as read-only, static resources for performance.
86+ Box :: leak ( env)
87+ . template_from_str ( Box :: leak ( template_str) )
88+ . unwrap ( )
89+ } ) ;
90+
8191 Self {
8292 validation,
8393 queue,
8494 shared,
8595 limit_concurrent_requests : semaphore,
86- tokenizer_config ,
96+ template ,
8797 }
8898 }
8999
@@ -139,9 +149,15 @@ impl Infer {
139149 /// Apply the chat template to the chat request
140150 #[ instrument( skip_all) ]
141151 pub ( crate ) fn apply_chat_template ( & self , chat : ChatRequest ) -> Result < String , InferError > {
142- self . tokenizer_config
143- . apply_chat_template ( chat)
144- . map_err ( InferError :: TemplateError )
152+ self . template
153+ . as_ref ( )
154+ . ok_or_else ( || InferError :: TemplateError ( ErrorKind :: TemplateNotFound . into ( ) ) ) ?
155+ . render ( chat)
156+ . map_err ( |e| {
157+ metrics:: increment_counter!( "tgi_request_failure" , "err" => "template" ) ;
158+ tracing:: error!( "{e}" ) ;
159+ InferError :: TemplateError ( e)
160+ } )
145161 }
146162
147163 /// Add a new request to the queue and return a InferResponse