Apply suggestion on GPTQ buffer setup

cyang49 · cyang49 · commit 3ffda19e2495 · 2024-03-17T11:19:21.000-04:00
Signed-off-by: cyang49 <7364402+cyang49@users.noreply.github.com>
diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py
@@ -278,31 +278,29 @@ async def serve_inner(
 
         if quantize == "gptq" and deployment_framework == "tgis_native":
             from text_generation_server.utils.layers import HAS_GPTQ_CUDA, EXLLAMA_VERSION
-            if HAS_GPTQ_CUDA:
-                if EXLLAMA_VERSION is not None:
-                    try:
-                        # When using GPTQ, Exllama kernels need some global kernels
-                        # For which we have the final shapes only after the model has loaded
-                        # This will allocate those buffers.
-
-                        if EXLLAMA_VERSION == "1":
-                            from text_generation_server.utils.gptq.exllama import (
-                                create_exllama_buffers, set_device,
-                            )
-                            set_device(device)
-                            create_exllama_buffers(max_sequence_length)
-                        else:
-                            assert EXLLAMA_VERSION == "2"
-                            from text_generation_server.utils.gptq.exllamav2 import (
-                                set_device, Ex4bitLinearV2,
-                            )
-                            set_device(device)
-                            for _, submodule in model.model.named_modules():
-                                if isinstance(submodule, Ex4bitLinearV2):
-                                    submodule.post_init()  # make q matrix and set scratch space
-
-                    except ImportError:
-                        print("WARN: Error setting up GPTQ exllama buffers")
+            if HAS_GPTQ_CUDA and EXLLAMA_VERSION is not None:
+                try:
+                    # When using GPTQ, the Exllama kernels need some global buffers
+                    # whose final shapes are known only after the model has loaded.
+                    # This allocates those buffers.
+                    if EXLLAMA_VERSION == "1":
+                        from text_generation_server.utils.gptq.exllama import (
+                            create_exllama_buffers, set_device,
+                        )
+                        set_device(device)
+                        create_exllama_buffers(max_sequence_length)
+                    elif EXLLAMA_VERSION == "2":
+                        from text_generation_server.utils.gptq.exllamav2 import (
+                            set_device, Ex4bitLinearV2,
+                        )
+                        set_device(device)
+                        for _, submodule in model.model.named_modules():
+                            if isinstance(submodule, Ex4bitLinearV2):
+                                submodule.post_init()  # make q matrix and set scratch space
+                    else:
+                        raise ValueError(f"Unsupported {EXLLAMA_VERSION=}") 
+                except ImportError:
+                    print("WARN: Error setting up GPTQ exllama buffers")
 
         if local_rank == 0 and device.type == "cuda":
             # Log GPU memory stats at startup