fix: initialize chat template once, fix defaults, and add seed param

drbh · drbh · commit e7a6fb29b85c · 2024-01-10T13:29:20.000-05:00
diff --git a/router/src/infer.rs b/router/src/infer.rs
@@ -4,6 +4,7 @@ use crate::HubTokenizerConfig;
 use crate::{ChatRequest, GenerateRequest, GenerateStreamResponse, PrefillToken};
 use crate::{Entry, Queue, Token};
 use futures::future::try_join_all;
+use minijinja::{Environment, ErrorKind, Template};
 use nohash_hasher::IntMap;
 use std::sync::{
     atomic::{AtomicBool, Ordering},
@@ -27,12 +28,12 @@ pub struct Infer {
     validation: Validation,
     /// Request queue
     queue: Queue,
-    /// Chat formatter
-    tokenizer_config: HubTokenizerConfig,
     /// Shared state
     shared: Arc<Shared>,
     /// Inference limit
     limit_concurrent_requests: Arc<Semaphore>,
+    /// Chat template
+    template: Option<Template<'static, 'static>>,
 }
 
 /// Infer shared state
@@ -78,12 +79,21 @@ impl Infer {
         // Inference limit with a semaphore
         let semaphore = Arc::new(Semaphore::new(max_concurrent_requests));
 
+        let template = tokenizer_config.chat_template.map(|t| {
+            let env = Box::new(Environment::new());
+            let template_str = t.into_boxed_str();
+            // Deliberately leak env and template_str: they become read-only 'static data so the template is built once.
+            Box::leak(env)
+                .template_from_str(Box::leak(template_str))
+                .unwrap()
+        });
+
         Self {
             validation,
             queue,
             shared,
             limit_concurrent_requests: semaphore,
-            tokenizer_config,
+            template,
         }
     }
 
@@ -139,9 +149,15 @@ impl Infer {
     /// Apply the chat template to the chat request
     #[instrument(skip_all)]
     pub(crate) fn apply_chat_template(&self, chat: ChatRequest) -> Result<String, InferError> {
-        self.tokenizer_config
-            .apply_chat_template(chat)
-            .map_err(InferError::TemplateError)
+        self.template
+            .as_ref()
+            .ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
+            .render(chat)
+            .map_err(|e| {
+                metrics::increment_counter!("tgi_request_failure", "err" => "template");
+                tracing::error!("{e}");
+                InferError::TemplateError(e)
+            })
     }
 
     /// Add a new request to the queue and return a InferResponse
diff --git a/router/src/lib.rs b/router/src/lib.rs
@@ -36,22 +36,6 @@ pub struct HubTokenizerConfig {
     pub chat_template: Option<String>,
 }
 
-impl HubTokenizerConfig {
-    /// Apply the chat template to the chat request
-    pub(crate) fn apply_chat_template(
-        &self,
-        chat: ChatRequest,
-    ) -> Result<String, minijinja::Error> {
-        let mut env = minijinja::Environment::new();
-        let chat_template = self
-            .chat_template
-            .as_ref()
-            .ok_or(minijinja::ErrorKind::TemplateNotFound)?;
-        env.add_template("_", chat_template)?;
-        env.get_template("_")?.render(chat)
-    }
-}
-
 #[derive(Clone, Debug, Serialize, ToSchema)]
 pub struct Info {
     /// Model info
@@ -292,7 +276,7 @@ impl ChatCompletionChunk {
         finish_reason: Option<String>,
     ) -> Self {
         Self {
-            id: "".to_string(),
+            id: String::new(),
             object: "text_completion".to_string(),
             created,
             model,
@@ -312,7 +296,7 @@ impl ChatCompletionChunk {
 
 fn default_request_messages() -> Vec<Message> {
     vec![Message {
-        role: "system".to_string(),
+        role: "user".to_string(),
         content: "My name is David and I".to_string(),
     }]
 }
@@ -371,11 +355,14 @@ pub(crate) struct ChatRequest {
 
     #[serde(default = "bool::default")]
     pub stream: bool,
+
+    #[schema(nullable = true, example = 42)]
+    pub seed: Option<u64>,
 }
 
 #[derive(Clone, Deserialize, ToSchema, Serialize)]
 pub(crate) struct Message {
-    #[schema(example = "system")]
+    #[schema(example = "user")]
     pub role: String,
     #[schema(example = "My name is David and I")]
     pub content: String,
diff --git a/router/src/server.rs b/router/src/server.rs
@@ -564,6 +564,7 @@ async fn chat_completions(
         // rescale frequency_penalty from (-2.0, 2.0) to (0.0, 4.0)
         .map(|x| x + 2.0);
     let logprobs = req.logprobs.unwrap_or(false);
+    let seed = req.seed;
 
     // apply chat template to flatten the request into a single input
     let inputs = match infer.apply_chat_template(req) {
@@ -599,7 +600,7 @@ async fn chat_completions(
             watermark: false,
             details: true,
             decoder_input_details: false,
-            seed: None,
+            seed,
             top_n_tokens: None,
         },
     };