From 9613e56270dad822043b4917d825cdf3d0bedac5 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 13 Mar 2024 17:20:54 -0600 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A5=20Remove=20custom=20exllama=20code?= =?UTF-8?q?,=20use=20auto-gptq=20vendored=20instead?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Travis Johnson --- Dockerfile | 24 - .../exllama_kernels/cuda_buffers.cu | 71 -- .../exllama_kernels/cuda_buffers.cuh | 52 - .../exllama_kernels/cuda_compat.cuh | 58 -- .../exllama_kernels/cuda_func/column_remap.cu | 61 -- .../cuda_func/column_remap.cuh | 19 - .../exllama_kernels/cuda_func/q4_matmul.cu | 252 ----- .../exllama_kernels/cuda_func/q4_matmul.cuh | 37 - .../exllama_kernels/cuda_func/q4_matrix.cu | 217 ---- .../exllama_kernels/cuda_func/q4_matrix.cuh | 53 - .../exllama_kernels/exllama_ext.cpp | 249 ----- .../exllama_kernels/matrix.cuh | 294 ------ .../exllama_kernels/exllama_kernels/tuning.h | 13 - .../exllama_kernels/exllama_kernels/util.cuh | 29 - server/exllama_kernels/setup.py | 19 - .../exllamav2_kernels/config.h | 13 - .../exllamav2_kernels/cpp/quantize_func.cpp | 59 -- .../exllamav2_kernels/cpp/quantize_func.h | 25 - .../exllamav2_kernels/cpp/sampling.cpp | 477 --------- .../exllamav2_kernels/cpp/sampling.h | 84 -- .../exllamav2_kernels/cpp/util.h | 25 - .../exllamav2_kernels/cuda/cache.cu | 161 --- .../exllamav2_kernels/cuda/cache.cuh | 14 - .../exllamav2_kernels/cuda/compat.cuh | 56 - .../exllamav2_kernels/cuda/compat_gemm.cuh | 33 - .../exllamav2_kernels/cuda/h_gemm.cu | 275 ----- .../exllamav2_kernels/cuda/h_gemm.cuh | 26 - .../exllamav2_kernels/cuda/lora.cu | 33 - .../exllamav2_kernels/cuda/lora.cuh | 24 - .../exllamav2_kernels/cuda/matrix_view.cuh | 121 --- .../exllamav2_kernels/cuda/pack_tensor.cu | 268 ----- .../exllamav2_kernels/cuda/pack_tensor.cuh | 35 - .../exllamav2_kernels/cuda/q_attn.cu | 159 --- .../exllamav2_kernels/cuda/q_attn.cuh | 98 -- .../exllamav2_kernels/cuda/q_gemm.cu | 211 ---- .../exllamav2_kernels/cuda/q_gemm.cuh | 33 - .../exllamav2_kernels/cuda/q_gemm_kernel.cuh | 484 --------- .../cuda/q_gemm_kernel_gptq.cuh | 219 ---- .../exllamav2_kernels/cuda/q_matrix.cu | 623 ------------ .../exllamav2_kernels/cuda/q_matrix.cuh | 73 -- .../exllamav2_kernels/cuda/q_mlp.cu | 162 --- .../exllamav2_kernels/cuda/q_mlp.cuh | 65 -- .../exllamav2_kernels/cuda/quant/qdq_2.cuh | 103 -- .../exllamav2_kernels/cuda/quant/qdq_3.cuh | 169 ---- .../exllamav2_kernels/cuda/quant/qdq_4.cuh | 227 ----- .../exllamav2_kernels/cuda/quant/qdq_5.cuh | 207 ---- .../exllamav2_kernels/cuda/quant/qdq_6.cuh | 44 - .../exllamav2_kernels/cuda/quant/qdq_8.cuh | 38 - .../exllamav2_kernels/cuda/quant/qdq_util.cuh | 51 - .../exllamav2_kernels/cuda/quantize.cu | 256 ----- .../exllamav2_kernels/cuda/quantize.cuh | 56 - .../exllamav2_kernels/cuda/rms_norm.cu | 130 --- .../exllamav2_kernels/cuda/rms_norm.cuh | 19 - .../exllamav2_kernels/cuda/rope.cu | 132 --- .../exllamav2_kernels/cuda/rope.cuh | 22 - .../exllamav2_kernels/cuda/util.cuh | 42 - .../exllamav2_kernels/ext.cpp | 956 ------------------ server/exllamav2_kernels/setup.py | 28 - 58 files changed, 7784 deletions(-) delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_buffers.cu delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_buffers.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_compat.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu delete mode 100644 
server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/exllama_ext.cpp delete mode 100644 server/exllama_kernels/exllama_kernels/matrix.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/tuning.h delete mode 100644 server/exllama_kernels/exllama_kernels/util.cuh delete mode 100644 server/exllama_kernels/setup.py delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/config.h delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.cpp delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.h delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.cpp delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.h delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/util.h delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cu delete mode 100644 
server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/ext.cpp delete mode 100644 server/exllamav2_kernels/setup.py diff --git a/Dockerfile b/Dockerfile index 61933909d..462885892 100644 --- a/Dockerfile +++ b/Dockerfile @@ -253,24 +253,6 @@ COPY server/custom_kernels/ /usr/src/. RUN cd /usr/src && python setup.py build_ext && python setup.py install -## Build transformers exllama kernels ########################################## -FROM python-builder as exllama-kernels-builder - -WORKDIR /usr/src - -COPY server/exllama_kernels/ . -RUN python setup.py build - - -## Build transformers exllamav2 kernels ######################################## -FROM python-builder as exllamav2-kernels-builder - -WORKDIR /usr/src - -COPY server/exllamav2_kernels/ . -RUN python setup.py build - - ## Flash attention v2 cached build image ####################################### FROM base as flash-att-v2-cache @@ -301,12 +283,6 @@ ENV PATH=/opt/tgis/bin:$PATH RUN --mount=type=bind,from=flash-att-v2-cache,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir -# Copy build artifacts from exllama kernels builder -COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES} - -# Copy build artifacts from exllamav2 kernels builder -COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES} - # Copy over the auto-gptq wheel and install it RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/usr/src/auto-gptq-wheel \ pip install /usr/src/auto-gptq-wheel/*.whl --no-cache-dir diff --git a/server/exllama_kernels/exllama_kernels/cuda_buffers.cu b/server/exllama_kernels/exllama_kernels/cuda_buffers.cu deleted file mode 100644 index ee2cbee23..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_buffers.cu +++ /dev/null @@ -1,71 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#define _cuda_buffers_cu -#include "cuda_buffers.cuh" - -CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL}; -// __constant__ half2 q4_table[16][256]; -// half2 q4_table_host[16][256]; -// bool q4_table_init = false; - -CudaBuffers::CudaBuffers -( - int _device, - half* _temp_state, - half* _temp_dq -) : - device(_device), - temp_state(_temp_state), - temp_dq(_temp_dq) -{ - cudaSetDevice(_device); - - cudaStreamCreate(&alt_stream_1); - cudaStreamCreate(&alt_stream_2); - cudaStreamCreate(&alt_stream_3); - cudaEventCreate(&alt_stream_1_done); - cudaEventCreate(&alt_stream_2_done); - cudaEventCreate(&alt_stream_3_done); -} - -CudaBuffers::~CudaBuffers() -{ - cudaStreamDestroy(alt_stream_1); - cudaStreamDestroy(alt_stream_2); - cudaStreamDestroy(alt_stream_3); - cudaEventDestroy(alt_stream_1_done); - cudaEventDestroy(alt_stream_2_done); - cudaEventDestroy(alt_stream_3_done); -} - -CudaBuffers* get_buffers(const int device_index) -{ - return g_buffers[device_index]; -} - -void prepare_buffers_cuda -( - int _device, - half* _temp_state, - half* _temp_dq -) -{ - CudaBuffers* buffers = new CudaBuffers - 
( - _device, - _temp_state, - _temp_dq - ); - - g_buffers[_device] = buffers; -} - -void cleanup_buffers_cuda() -{ - for (int i = 0; i < CUDA_MAX_DEVICES; i++) - { - if (!g_buffers[i]) continue; - delete g_buffers[i]; - g_buffers[i] = NULL; - } -} diff --git a/server/exllama_kernels/exllama_kernels/cuda_buffers.cuh b/server/exllama_kernels/exllama_kernels/cuda_buffers.cuh deleted file mode 100644 index afb60a012..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_buffers.cuh +++ /dev/null @@ -1,52 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _cuda_buffers_cuh -#define _cuda_buffers_cuh - -#include -#include -#include -#include - -const int CUDA_MAX_DEVICES = 16; - -// #ifndef _cuda_buffers_cu -// extern __constant__ half2 q4_table[16][256]; -// #endif - -class CudaBuffers -{ -public: - int device; - - half* temp_state; // [max_hidden_rows * intermediate_size] - half* temp_dq; // size of largest quant tensor * 8 - - cudaStream_t alt_stream_1; - cudaStream_t alt_stream_2; - cudaStream_t alt_stream_3; - cudaEvent_t alt_stream_1_done; - cudaEvent_t alt_stream_2_done; - cudaEvent_t alt_stream_3_done; - - CudaBuffers - ( - int _device, - half* _temp_state, - half* _temp_dq - ); - ~CudaBuffers(); -}; - -CudaBuffers* get_buffers(const int device_index); - -void prepare_buffers_cuda -( - int _device, - half* _temp_state, - half* _temp_dq -); - -void cleanup_buffers_cuda(); - -#endif diff --git a/server/exllama_kernels/exllama_kernels/cuda_compat.cuh b/server/exllama_kernels/exllama_kernels/cuda_compat.cuh deleted file mode 100644 index 8dfa25de3..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_compat.cuh +++ /dev/null @@ -1,58 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _cuda_compat_cuh -#define _cuda_compat_cuh - -// atomicAdd for half types, to support CC < 7.x - -__device__ __forceinline__ void atomicAdd_half(half* address, half val) -{ - unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int assumed; - - do - { - assumed = old; - __half_raw hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - half tmpres = __hadd(hsum, val); - hsum = __half_raw(tmpres); - old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; - old = atomicCAS(address_as_ui, assumed, old); - } - while (assumed != old); -} - -// atomicAdd for half2 types - -__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) -{ - unsigned int* address_as_ui = (unsigned int*)address; - unsigned int old = *address_as_ui; - unsigned int assumed; - do - { - assumed = old; - half2 old_val = *((half2*)&old); - half2 new_val = __hadd2(old_val, val); - old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); - } - while (assumed != old); -} - -// - -#if defined(__CUDA_ARCH__) -#if __CUDA_ARCH__ < 700 - -__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } - -#if __CUDA_ARCH__ < 600 -__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } -#endif - -#endif -#endif - -#endif diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu b/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu deleted file mode 100644 index c25b0206b..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu +++ /dev/null @@ -1,61 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#include "column_remap.cuh" -#include "../util.cuh" - -const int SHUF_BLOCKSIZE_X = 256; -const int SHUF_BLOCKSIZE_Y = 16; - -__global__ void column_remap_kernel -( - const half* __restrict__ x, - half* __restrict__ x_new, - const int x_width, - const int x_height, - const uint32_t* x_map -) -{ - int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; - int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y; - - int x_stride = x_width; - int x_idx = x_row * x_stride + x_column; - - int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height); - int x_idx_end = x_row_end * x_stride + x_column; - - int s_column = x_map[x_column]; - int s_idx = x_row * x_stride + s_column; - - while (x_idx < x_idx_end) - { - x_new[x_idx] = x[s_idx]; - x_idx += x_stride; - s_idx += x_stride; - } -} - -// Remap columns in x to correspond to sequential group index before matmul -// -// perform x -> seq_x such that seq_x @ seq_w == x @ w - -void column_remap_cuda -( - const half* x, - half* x_new, - const int x_height, - const int x_width, - const uint32_t* x_map -) -{ - dim3 threads(SHUF_BLOCKSIZE_X, 1, 1); - - dim3 blocks - ( - (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X, - (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y, - 1 - ); - - column_remap_kernel<<>>(x, x_new, x_width, x_height, x_map); -} diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh deleted file mode 100644 index 6571c17d6..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh +++ /dev/null @@ -1,19 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _column_remap_cuh -#define _column_remap_cuh - -#include -#include -#include - -void column_remap_cuda -( - const half* x, - half* x_new, - const int x_height, - const int x_width, - const uint32_t* x_map -); - -#endif \ No newline at end of file diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu deleted file mode 100644 index 60dc4c9db..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu +++ /dev/null @@ -1,252 +0,0 @@ -#include "q4_matmul.cuh" -#include 
"column_remap.cuh" -#include "../util.cuh" -#include "../matrix.cuh" -#include "../cuda_compat.cuh" -#include "../cuda_buffers.cuh" - -const int THREADS_X = 32; // Block size and thread count along columns in w and out -const int THREADS_Y = 1; // Block size and thread count along rows in x and out - -typedef void (*fp_q4_matmul_kernel) -( - const half*, - const uint32_t*, - half*, - const half*, - const uint32_t*, - const int, - const int, - const int, - const int, - const int, - const uint32_t*, - bool -); - -template -__global__ void q4_matmul_kernel -( - const half* __restrict__ x, - const uint32_t* __restrict__ w, - half* __restrict__ out, - const half* __restrict__ w_scales, - const uint32_t* __restrict__ w_zeros, - const int height, - const int dim, - const int width, - const int groupsize, - const int block_size_z, - const uint32_t* __restrict__ x_map, - bool no_zero -) -{ - // Start of block - - int x_column = block_size_z * blockIdx.z; - int x_column_end = min(dim, block_size_z * (blockIdx.z + 1)); - - int w_column = THREADS_X * blockIdx.x + threadIdx.x; - int x_row = THREADS_Y * blockIdx.y + threadIdx.y; - - int iterations = (x_column_end - x_column) / 8; - - // Views - - MatrixView_half x_(x, height, dim); - MatrixView_half w_scales_(w_scales, dim / groupsize, width); - MatrixView_q4_row w_zeros_(w_zeros, dim / groupsize, width); - MatrixView_q4_column w_(w, dim, width); - MatrixView_half_rw out_(out, height, width); - - // Zero output - - if (!no_zero && blockIdx.z == 0 && (threadIdx.x & 1) == 0) - { - *((uint32_t*) out_.item_ptr(x_row, w_column)) = 0; - __syncthreads(); - } - - // Loop over part of x row (and w column) - - half2 acc = {}; - half acc_h = {}; - - if constexpr (use_groupsize) - { - // For quant matrices where groupsize divides BLOCK_SIZE_Z we always start on a group boundary, so this - // could be slightly faster - - for (int k = x_column, group = x_column / groupsize; k < x_column + iterations * 8; group++, k += groupsize) - { - if constexpr (use_half2) - { - half2 w_scale = w_scales_.item_half2half2(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); - else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8); - } - else - { - half w_scale = w_scales_.item(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); - else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8); - } - } - } - else - { - // Otherwise assume groupsize is a multiple of 8, do 8 columns per iteration and trust the cache - - for (int k = x_column; k < x_column + iterations * 8; k += 8) - { - if constexpr (use_half2) - { - int group = k / groupsize; - half2 w_scale = w_scales_.item_half2half2(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map); - else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1); - } - else - { - int group = k / groupsize; - half w_scale = w_scales_.item(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, 
w_scale, w_zero, 1, x_map);
-                else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
-            }
-        }
-    }
-
-    // Add to block result
-
-    if constexpr (use_half2)
-    {
-        half result = __hadd(acc.x, acc.y);
-        atomicAdd(out_.item_ptr(x_row, w_column), result);
-    }
-    else
-    {
-        atomicAdd(out_.item_ptr(x_row, w_column), acc_h);
-    }
-}
-
-fp_q4_matmul_kernel q4_matmul_kernel_pick(ExLlamaTuning* tuningParams, int block_size_z, int groupsize, uint32_t* x_map)
-{
-    // <bool use_half2, bool use_groupsize, bool use_x_map>
-    if (tuningParams->matmul_no_half2) {
-        if (block_size_z % groupsize == 0) {
-            if (x_map) return q4_matmul_kernel<false, true,  true >;
-            else       return q4_matmul_kernel<false, true,  false>;
-        } else {
-            if (x_map) return q4_matmul_kernel<false, false, true >;
-            else       return q4_matmul_kernel<false, false, false>;
-        }
-    } else {
-        if (block_size_z % groupsize == 0)
-        {
-            if (x_map) return q4_matmul_kernel<true,  true,  true >;
-            else       return q4_matmul_kernel<true,  true,  false>;
-        } else {
-            if (x_map) return q4_matmul_kernel<true,  false, true >;
-            else       return q4_matmul_kernel<true,  false, false>;
-        }
-    }
-};
-
-// Compute y = x @ w
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero,
-    cudaStream_t alt_stream
-)
-{
-    int height = x_height;
-    int dim = w->height;
-    int width = w->width;
-
-    cudaSetDevice(w->device);
-
-    uint32_t* x_map = w->cuda_x_map;
-    const half* x_mapped = x;
-    if (x_map && !tuningParams->matmul_fused_remap && !alt_stream)
-    {
-        CudaBuffers* buffers = get_buffers(w->device);
-        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
-        x_mapped = buffers->temp_state;
-        x_map = NULL;
-    }
-
-    int block_size_z;
-    if (w->width == 4096) block_size_z = 384;           // 7B
-    else if (w->width == 11008) block_size_z = 256;
-    else if (w->width == 5120) block_size_z = 384;      // 13B
-    else if (w->width == 13824) block_size_z = 256;
-    else if (w->width == 6656) block_size_z = 256;      // 33B
-    else if (w->width == 17920) block_size_z = 128;
-    else block_size_z = 256;
-
-    //if (!no_zero) cudaMemsetAsync(out, 0, x_height * w->width * sizeof(half));
-
-    dim3 threads(THREADS_X, THREADS_Y, 1);
-
-    dim3 blocks
-    (
-        (width + threads.x - 1) / threads.x,
-        (height + threads.y - 1) / threads.y,
-        (dim + block_size_z - 1) / block_size_z
-    );
-
-    fp_q4_matmul_kernel kernel = q4_matmul_kernel_pick(tuningParams, block_size_z, w->groupsize, x_map);
-
-    kernel<<<blocks, threads, 0, alt_stream>>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero);
-}
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const cublasHandle_t handle,
-    bool no_zero
-)
-{
-    int height = x_height;
-    int dim = w->height;
-    int width = w->width;
-
-    cudaSetDevice(w->device);
-    CudaBuffers* buffers = get_buffers(w->device);
-
-    const half* x_mapped = x;
-    if (w->cuda_x_map)
-    {
-        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
-        x_mapped = buffers->temp_state;
-    }
-
-    w->reconstruct(buffers->temp_dq);
-
-    const half alpha = __float2half(1.0f);
-    const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
-    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
-
-//    const float alpha = 1.0f;
-//    const float beta = no_zero ? 1.0f : 0.0f;
-//    cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
-//                  x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
-}
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh
deleted file mode 100644
index 63611790a..000000000
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh
+++ /dev/null
@@ -1,37 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matmul_cuh
-#define _q4_matmul_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include <ATen/cuda/CUDAContext.h>
-
-#include "q4_matrix.cuh"
-#include "../tuning.h"
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero = false,
-    cudaStream_t alt_stream = NULL
-);
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const cublasHandle_t handle,
-    bool no_zero = false
-);
-
-#endif
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu
deleted file mode 100644
index f3d1564f3..000000000
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu
+++ /dev/null
@@ -1,217 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#include "q4_matrix.cuh"
-#include <vector>
-#include "../util.cuh"
-#include "../matrix.cuh"
-
-using namespace std;
-
-const int UNSHUF_BLOCKSIZE_X = 64;
-
-const int RECONS_THREADS_X = 64;     // Block size and thread count along columns in out, each thread converts 1 column
-const int RECONS_THREADS_Y = 1;      // Block size and thread count along rows in x and out, each thread converts 8 rows
-
-vector<Q4Matrix*> g_q4_matrices;
-
-void g_q4_keep_matrix(Q4Matrix* m)
-{
-    g_q4_matrices.push_back(m);
-}
-
-void g_q4_free_matrices()
-{
-    for (const auto& m : g_q4_matrices) delete m;
-    g_q4_matrices.clear();
-}
-
-Q4Matrix::Q4Matrix
-(
-    const int _height,
-    const int _width,
-    const int _groups,
-
-    uint32_t* _qweight,
-    uint32_t* _qzeros,
-    half* _scales,
-    uint32_t* _g_idx,
-
-    const int _device
-) :
-    height(_height),
-    width(_width),
-    groups(_groups),
-    device(_device)
-{
-    cudaSetDevice(device);
-
-    cuda_qweight = _qweight;
-    cuda_qzeros = _qzeros;
-    cuda_scales = _scales;
-
-    groupsize = height / groups;
-
-    if (_g_idx) make_sequential(_g_idx);
-}
-
-Q4Matrix::~Q4Matrix()
-{
-}
-
-// Make sequential
-
-__global__ void make_sequential_kernel
-(
-    const uint32_t* __restrict__ w,
-    uint32_t* __restrict__ w_new,
-    const uint32_t* __restrict__ x_map,
-    const int w_height,
-    const int w_width
-)
-{
-    const uint64_t* w2 = (uint64_t*) w;
-    uint64_t* w_new2 = (uint64_t*) w_new;
-    int w2_stride = w_width >> 1;
-
-    int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
-    int w_new2_row = blockIdx.y;
-
-    int x_map_idx = w_new2_row << 3;
-
-    uint64_t dst = 0;
-
-    #pragma unroll
-    for (int i = 0; i < 8; i++)
-    {
-        int source_row = x_map[x_map_idx++];
-
-        int w2_row = source_row >> 3;
-        int w2_subrow = source_row & 0x07;
-        int w2_row_shift = w2_subrow << 2;
-        int wnew2_row_shift = i << 2;
-
-        uint64_t src = w2[w2_row * w2_stride + w2_column];
-        src >>= w2_row_shift;
-        src &= 0x0000000f0000000f;
-        src <<= wnew2_row_shift;
-        dst |= src;
-    }
-
-    w_new2[w_new2_row * w2_stride + w2_column] = dst;
-}
-
-void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
-{
-    uint32_t* cuda_new_qweight = NULL;
-    cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
-    cudaMalloc(&cuda_x_map, height * sizeof(uint32_t));  // TODO: Should probably be allocated in PyTorch
-
-    uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
-    uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
-    uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
-
-    // Group histogram
-
-    for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;
-
-    // Group map
-
-    for (int i = 0, acc = 0; i < groups; i++)
-    {
-        short tmp = cpu_g_idx_map[i];
-        cpu_g_idx_map[i] = acc;
-        acc += tmp;
-    }
-
-    // X map (inverse)
-
-    for (int row = 0; row < height; row++)
-    {
-        uint32_t target_group = cpu_g_idx[row];
-        uint32_t target_row = cpu_g_idx_map[target_group];
-        cpu_g_idx_map[target_group]++;
-        cpu_x_map_inv[row] = target_row;
-    }
-
-    // X map
-
-    for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;
-
-    // Move to CUDA
-
-    cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);
-
-    // Rearrange rows in w
-
-    dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
-    dim3 blocks(width / UNSHUF_BLOCKSIZE_X / 2, height / 8, 1);
-
-    make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);
-
-    // Replace qweights
-
-    cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
-
-    // Cleanup
-
-    cudaDeviceSynchronize();
-    cudaFree(cuda_new_qweight);
-    free(cpu_g_idx_map);
-    free(cpu_x_map);
-    free(cpu_x_map_inv);
-}
-
-__global__ void reconstruct_kernel
-(
-    const uint32_t* __restrict__ w,
-    half* __restrict__ out,  // (y)
-    const half* __restrict__ w_scales,
-    const uint32_t* __restrict__ w_zeros,
-    const int height,
-    const int width,
-    const int groupsize
-)
-{
-    // Start of block
-
-    int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
-    int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;
-
-    // Views
-
-    MatrixView_q4_column w_(w, height, width);
-    MatrixView_half_rw out_(out, height, width);
-    MatrixView_half w_scales_(w_scales, height / groupsize, width);
-    MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);
-
-    // Groupsize version
-
-    int group = row / groupsize;
-
-    half w_scale = w_scales_.item(group, column);
-    uint32_t w_zero = w_zeros_.item(group, column) + 1;
-
-    uint32_t w_read = w_.item_uint32_t(row, column);
-    half* out_ptr = out_.item_ptr(row, column);
-
-    #pragma unroll
-    for (int s = 0; s < 32; s += 4)
-    {
-        half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
-        *out_ptr = w_item; out_ptr += out_.width;
-    }
-}
-
-void Q4Matrix::reconstruct(half* out)
-{
-    dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);
-
-    dim3 blocks
-    (
-        (width + threads.x - 1) / threads.x,
-        (height / 8 + threads.y - 1) / threads.y,
-        1
-    );
-
-    reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
-}
\ No newline at end of file
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh
deleted file mode 100644
index 50cb72a41..000000000
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh
+++ /dev/null
@@ -1,53 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matrix_cuh
-#define _q4_matrix_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-
-class Q4Matrix
-{
-public:
-
-    int device;
-
-    int height;
-    int width;
-    int groups;
-    int groupsize;
-
-    uint32_t* cuda_qweight = NULL;
-    uint32_t* cuda_qzeros = NULL;
-    half* cuda_scales = NULL;
-    uint32_t* cuda_x_map = NULL;
-
-    Q4Matrix
-    (
-        const int _height,
-        const int _width,
-        const int _groups,
-
-        uint32_t* _qweight,
-        uint32_t* _qzeros,
-        half* _scales,
-        uint32_t* _g_idx,
-
-        const int _device
-    );
-
-    ~Q4Matrix();
-
-    void reconstruct(half* out);
-
-private:
-
-    void make_sequential(const uint32_t* cpu_g_idx);
-
-};
-
-void g_q4_keep_matrix(Q4Matrix* m);
-void g_q4_free_matrices();
-
-#endif
\ No newline at end of file
diff --git a/server/exllama_kernels/exllama_kernels/exllama_ext.cpp b/server/exllama_kernels/exllama_kernels/exllama_ext.cpp
deleted file mode 100644
index b786988bd..000000000
--- a/server/exllama_kernels/exllama_kernels/exllama_ext.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#include <torch/extension.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include "util.cuh"
-#include "tuning.h"
-#include "cuda_buffers.cuh"
-#include "cuda_func/q4_matrix.cuh"
-#include "cuda_func/q4_matmul.cuh"
-#include "cuda_func/column_remap.cuh"
-
-// Check CUDA return code. We don't want to include Torch headers in the .cu files because parsing them adds almost a
-// minute to the compile time on a 12900K. Also passing exceptions back to Python is super tricky, so in place of
-// exceptions, CUDA functions return with a cudaError_t which we can parse and dump to the console.
-
-void check_cuda(cudaError_t ret)
-{
-    switch (ret)
-    {
-        case cudaSuccess:
-            break;
-
-        case cudaUnspecified:
-            printf(" **** Unspecified error\n");
-            TORCH_CHECK(false, "CUDA error");
-            break;
-
-        default:
-            printf(" **** CUDA error\n"); \
-            printf(" **** %s\n", cudaGetErrorString(ret)); \
-            TORCH_CHECK(false, "CUDA error"); \
-            break;
-    }
-}
-
-// Some decluttering macros
-
-#define STRINGIFY_(__x) #__x
-#define STRINGIFY(__x) STRINGIFY_(__x)
-#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPE_MOD(__x, __dim_x, __mod) TORCH_CHECK((__x).size(__dim_x) % __mod == 0, #__x ".shape[" STRINGIFY(__dim_x) "] must be a multiple of " STRINGIFY(__mod))
-
-#define TORCH_CHECK_DEVICE_INDEX(__index) \
-do { \
-    TORCH_CHECK(__index >= 0, "no device index"); \
-    TORCH_CHECK(__index < CUDA_MAX_DEVICES, "invalid device index"); \
-} while(0)
-
-#define TORCH_CHECK_QUANT(__w, __w_scales, __w_zeros, __seq_g_idx, __x_map) \
-do { \
-    TORCH_CHECK_DTYPE(__w, kInt); \
-    TORCH_CHECK_DTYPE(__w_scales, kHalf); \
-    TORCH_CHECK_DTYPE(__w_zeros, kInt); \
-    TORCH_CHECK_DTYPE_OPT(__seq_g_idx, kShort); \
-    TORCH_CHECK_DTYPE_OPT(__x_map, kInt); \
-    TORCH_CHECK_SHAPES_OPT(__seq_g_idx, 0, __w, 0, 2 * 8); \
-    TORCH_CHECK_SHAPES_OPT(__x_map, 0, __w, 0, 8); \
-} while(0)
-
-int get_groupsize(torch::Tensor w, torch::Tensor w_zeros)
-{
-    int groupsize = w.size(0) * 8
/ w_zeros.size(0); - TORCH_CHECK(groupsize * w_zeros.size(0) == w.size(0) * 8, "w.shape[-2] must be a multiple of zeros.shape[-2]") - return groupsize; -} - - -// Tuning parameters - -ExLlamaTuning tuningParams; - -void set_tuning_params -( - int matmul_recons_thd, - bool matmul_fused_remap, - bool matmul_no_half2 -) -{ - tuningParams.matmul_recons_thd = matmul_recons_thd; - tuningParams.matmul_fused_remap = matmul_fused_remap; - tuningParams.matmul_no_half2 = matmul_no_half2; -} - - -// Release all unmanaged objects allocated by the extension - -void cleanup() -{ - cleanup_buffers_cuda(); - g_q4_free_matrices(); -} - - -// Prepare buffers for forward pass - -void prepare_buffers -( - torch::Device device, - torch::Tensor temp_state, - torch::Tensor temp_dq -) -{ - int device_index = device.index(); - TORCH_CHECK_DEVICE_INDEX(device_index); - const at::cuda::OptionalCUDAGuard device_guard(device); - - prepare_buffers_cuda - ( - device_index, - (half*) temp_state.data_ptr(), - (half*) temp_dq.data_ptr() - ); -} - - -// Create Q4Matrix, return handle - -uintptr_t make_q4 -( - torch::Tensor qweight, - torch::Tensor qzeros, - torch::Tensor scales, - torch::Tensor g_idx, - int device -) -{ - TORCH_CHECK_DTYPE(qweight, kInt); - TORCH_CHECK_DTYPE(qzeros, kInt); - TORCH_CHECK_DTYPE(scales, kHalf); - TORCH_CHECK_DTYPE_OPT(g_idx, kInt); - TORCH_CHECK_SHAPES(qweight, 1, qzeros, 1, 8); - TORCH_CHECK_SHAPES(scales, 1, qweight, 1, 1); - TORCH_CHECK_SHAPES(qzeros, 0, scales, 0, 1); - - int width = qweight.size(1); - int height = qweight.size(0) * 8; - int groups = qzeros.size(0); - - Q4Matrix* m = new Q4Matrix - ( - height, - width, - groups, - - (uint32_t*) qweight.data_ptr(), - (uint32_t*) qzeros.data_ptr(), - (half*) scales.data_ptr(), - g_idx.device().is_meta() ? 
NULL : (uint32_t*) g_idx.data_ptr(),
-
-        device
-    );
-
-    g_q4_keep_matrix(m);
-    return reinterpret_cast<uintptr_t> (m);
-}
-
-
-// Matmul half @ quant -> half
-
-void q4_matmul
-(
-    torch::Tensor x,
-    uintptr_t w,
-    torch::Tensor out
-)
-{
-    Q4Matrix* wm = reinterpret_cast<Q4Matrix*> (w);
-
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(out, kHalf);
-    TORCH_CHECK_SHAPES(x, 0, out, 0, 1);
-    TORCH_CHECK(wm->height == x.size(-1), "x and w have incompatible shapes")
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    int x_height = x.size(0);
-
-    if (tuningParams.matmul_recons_thd == 0 || x_height < tuningParams.matmul_recons_thd)
-    {
-        q4_matmul_cuda
-        (
-            &tuningParams,
-            (half*) x.data_ptr(),
-            x_height,
-            wm,
-            (half*) out.data_ptr()
-        );
-    }
-    else
-    {
-        q4_matmul_recons_cuda
-        (
-            &tuningParams,
-            (half*) x.data_ptr(),
-            x_height,
-            wm,
-            (half*) out.data_ptr(),
-            at::cuda::getCurrentCUDABlasHandle()
-        );
-    }
-}
-
-
-// Remap columns in half tensor
-
-void column_remap
-(
-    torch::Tensor x,
-    torch::Tensor x_new,
-    torch::Tensor x_map
-)
-{
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(x_new, kHalf);
-    TORCH_CHECK_DTYPE(x_map, kInt);
-    TORCH_CHECK_SHAPES(x_map, 0, x, 1, 1);
-
-    int height = x.size(0);
-    int width = x.size(1);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    column_remap_cuda
-    (
-        (half*) x.data_ptr(),
-        (half*) x_new.data_ptr(),
-        height,
-        width,
-        (uint32_t*) x_map.data_ptr()
-    );
-}
-
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("set_tuning_params", &set_tuning_params, "set_tuning_params");
-    m.def("prepare_buffers", &prepare_buffers, "prepare_buffers");
-    m.def("cleanup", &cleanup, "cleanup");
-    m.def("make_q4", &make_q4, "make_q4");
-    m.def("q4_matmul", &q4_matmul, "q4_matmul");
-}
diff --git a/server/exllama_kernels/exllama_kernels/matrix.cuh b/server/exllama_kernels/exllama_kernels/matrix.cuh
deleted file mode 100644
index 2fd5ab0b3..000000000
--- a/server/exllama_kernels/exllama_kernels/matrix.cuh
+++ /dev/null
@@ -1,294 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _matrix_cuh
-#define _matrix_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-
-class MatrixView_half
-{
-public:
-    const half* data;
-    const int height;
-    const int width;
-
-    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-
-    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
-    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
-    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }
-};
-
-class MatrixView_half_rw
-{
-public:
-    half* data;
-    const int height;
-    const int width;
-
-    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-
-    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
-    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
-    __device__ __forceinline__ void set(int row, int column, half value) {
data[row * width + column] = value; } - __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } -}; - -class MatrixView_q4_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x07) * 4; - return (data[row * width / 8 + column / 8] >> shift) & 0x0f; - } -}; - -class MatrixView_q4_column -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (row & 0x07) * 4; - return (data[row / 8 * width + column] >> shift) & 0x0f; - } - - __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; } - __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } -}; - -// TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu - -// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale - -__device__ __forceinline__ half2 dot_product_8 -( - const half2 acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half2 v_scale_2, - const uint32_t v_zero, // + 1 (!!) 
- const int count -) -{ - const half2* h_ptr = (const half2*) h_.item_ptr(h_row, h_column); - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half2 result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half2 v_01 = __halves2half2(v_0, v_1); - half2 v_23 = __halves2half2(v_2, v_3); - half2 v_45 = __halves2half2(v_4, v_5); - half2 v_67 = __halves2half2(v_6, v_7); - -// half2 v_01 = q4_table[v_zero - 1][(v_read ) & 0xff]; // (constant memory is too slow apparently) -// half2 v_23 = q4_table[v_zero - 1][(v_read >> 8) & 0xff]; -// half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff]; -// half2 v_67 = q4_table[v_zero - 1][(v_read >> 24) ]; - - half2 tmp = __hmul2(*h_ptr++, v_01); - tmp = __hfma2(*h_ptr++, v_23, tmp); - tmp = __hfma2(*h_ptr++, v_45, tmp); - tmp = __hfma2(*h_ptr++, v_67, tmp); - result = __hfma2(v_scale_2, tmp, result); - } - - return result; -} - -__device__ __forceinline__ half dot_product_8_h -( - const half acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half v_scale, - const uint32_t v_zero, // + 1 (!!) - const int count -) -{ - const half* h_ptr = h_.item_ptr(h_row, h_column); - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half tmp = __hmul(*h_ptr++, v_0); - tmp = __hfma(*h_ptr++, v_1, tmp); - tmp = __hfma(*h_ptr++, v_2, tmp); - tmp = __hfma(*h_ptr++, v_3, tmp); - tmp = __hfma(*h_ptr++, v_4, tmp); - tmp = __hfma(*h_ptr++, v_5, tmp); - tmp = __hfma(*h_ptr++, v_6, tmp); - tmp = __hfma(*h_ptr++, v_7, tmp); - result = __hfma(v_scale, tmp, result); - } - - return result; -} - -// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map - -__device__ __forceinline__ half2 dot_product_8_x_map -( - const half2 acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half2 v_scale_2, - const uint32_t v_zero, // + 1 (!!) 
- const int count, - const uint32_t* x_map -) -{ - const half* h_ptr = h_.item_ptr(h_row, 0); - const uint32_t* x_map_ptr = x_map + h_column; - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half2 result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half2 v_01 = __halves2half2(v_0, v_1); - half2 v_23 = __halves2half2(v_2, v_3); - half2 v_45 = __halves2half2(v_4, v_5); - half2 v_67 = __halves2half2(v_6, v_7); - - half h_0 = h_ptr[*x_map_ptr++]; - half h_1 = h_ptr[*x_map_ptr++]; - half h_2 = h_ptr[*x_map_ptr++]; - half h_3 = h_ptr[*x_map_ptr++]; - half h_4 = h_ptr[*x_map_ptr++]; - half h_5 = h_ptr[*x_map_ptr++]; - half h_6 = h_ptr[*x_map_ptr++]; - half h_7 = h_ptr[*x_map_ptr++]; - - half2 h_01 = __halves2half2(h_0, h_1); - half2 h_23 = __halves2half2(h_2, h_3); - half2 h_45 = __halves2half2(h_4, h_5); - half2 h_67 = __halves2half2(h_6, h_7); - - half2 tmp = __hmul2(h_01, v_01); - tmp = __hfma2(h_23, v_23, tmp); - tmp = __hfma2(h_45, v_45, tmp); - tmp = __hfma2(h_67, v_67, tmp); - result = __hfma2(v_scale_2, tmp, result); - } - - return result; -} - -__device__ __forceinline__ half dot_product_8_x_map_h -( - const half acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half v_scale, - const uint32_t v_zero, // + 1 (!!) 
- const int count, - const uint32_t* x_map -) -{ - const half* h_ptr = h_.item_ptr(h_row, 0); - const uint32_t* x_map_ptr = x_map + h_column; - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half tmp = __hmul(h_ptr[*x_map_ptr++], v_0); - tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp); - result = __hfma(v_scale, tmp, result); - } - - return result; -} - -#endif diff --git a/server/exllama_kernels/exllama_kernels/tuning.h b/server/exllama_kernels/exllama_kernels/tuning.h deleted file mode 100644 index 770ca46aa..000000000 --- a/server/exllama_kernels/exllama_kernels/tuning.h +++ /dev/null @@ -1,13 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _tuning_h -#define _tuning_h - -struct ExLlamaTuning -{ - int matmul_recons_thd; - bool matmul_fused_remap; - bool matmul_no_half2; -}; - -#endif diff --git a/server/exllama_kernels/exllama_kernels/util.cuh b/server/exllama_kernels/exllama_kernels/util.cuh deleted file mode 100644 index 2839b10fa..000000000 --- a/server/exllama_kernels/exllama_kernels/util.cuh +++ /dev/null @@ -1,29 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _util_cuh -#define _util_cuh - -#include -#include -#include -#include - -#define cudaUnspecified cudaErrorApiFailureBase - -// React to failure on return code != cudaSuccess - -#define _cuda_check(fn) \ -do { \ - {_cuda_err = fn;} \ - if (_cuda_err != cudaSuccess) goto _cuda_fail; \ -} while(false) - -// React to failure on return code == 0 - -#define _alloc_check(fn) \ -do { \ - if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \ - else _cuda_err = cudaSuccess; \ -} while(false) - -#endif diff --git a/server/exllama_kernels/setup.py b/server/exllama_kernels/setup.py deleted file mode 100644 index f06a72bd9..000000000 --- a/server/exllama_kernels/setup.py +++ /dev/null @@ -1,19 +0,0 @@ -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -setup( - name="exllama_kernels", - ext_modules=[ - CUDAExtension( - name="exllama_kernels", - sources=[ - "exllama_kernels/exllama_ext.cpp", - "exllama_kernels/cuda_buffers.cu", - "exllama_kernels/cuda_func/column_remap.cu", - "exllama_kernels/cuda_func/q4_matmul.cu", - "exllama_kernels/cuda_func/q4_matrix.cu" - ], - ) - ], - cmdclass={"build_ext": BuildExtension}, -) diff --git a/server/exllamav2_kernels/exllamav2_kernels/config.h b/server/exllamav2_kernels/exllamav2_kernels/config.h deleted file mode 100644 index 86baaf412..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/config.h +++ /dev/null @@ -1,13 
+0,0 @@ -#ifndef _config_h -#define _config_h - -#define MAX_Q_GEMM_ROWS 50 - -#define QMODE_2BIT 1 -#define QMODE_3BIT 1 -#define QMODE_4BIT 1 -#define QMODE_5BIT 1 -#define QMODE_6BIT 0 -#define QMODE_8BIT 0 - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.cpp b/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.cpp deleted file mode 100644 index f83c93490..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include "quantize_func.h" -#include "../cuda/quantize.cuh" - -void quantize_range -( - torch::Tensor quant, - torch::Tensor scale, - torch::Tensor out_q, - float qzero, - float maxq, - torch::Tensor hessian_inv, - torch::Tensor weights, - torch::Tensor error, - int a, - int b -) -{ - int columns = weights.size(1); - int hcolumns = hessian_inv.size(1); - - for (int c = a; c < b; c++) - { - quantize_cuda - ( - ((const float*) weights.data_ptr()) + c * columns, - ((float*) quant.data_ptr()) + c * columns, - (const float*) scale.data_ptr(), - out_q.device().is_meta() ? NULL : ((uint16_t*) out_q.data_ptr()) + c * columns, - 1, - columns, - qzero, - maxq - ); - - adjust_error_row_cuda - ( - (const float*) hessian_inv.data_ptr(), - (float*) error.data_ptr(), - (const float*) weights.data_ptr(), - (const float*) quant.data_ptr(), - c, - columns, - hcolumns - ); - - vv_mul_sub_cuda - ( - ((const float*) hessian_inv.data_ptr()) + c * hcolumns + c, - ((const float*) error.data_ptr()) + c * columns, - ((float*) weights.data_ptr()) + c * columns, - b - c, - columns - ); - } - - torch::Tensor x = hessian_inv.slice(0, a, b).slice(1, b).transpose(0, 1); - torch::Tensor y = error.slice(0, a, b); - weights.slice(0, b).addmm_(x, y, 1.0f, -1.0f); -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.h b/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.h deleted file mode 100644 index cd111bad3..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _quantize_func_h -#define _quantize_func_h - -#include -#include -#include -#include -#include -#include - -void quantize_range -( - torch::Tensor quant, - torch::Tensor scale, - torch::Tensor out_q, - float qzero, - float maxq, - torch::Tensor hessian_inv, - torch::Tensor weights, - torch::Tensor error, - int a, - int b -); - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.cpp b/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.cpp deleted file mode 100644 index 56d731aab..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.cpp +++ /dev/null @@ -1,477 +0,0 @@ -#include "sampling.h" -#include "util.h" -#include -#include -#include -#include - -const int top_k_heap_threshold = 500; - -bool* g_rep_mask = NULL; -int g_vocab_size = 0; - -void apply_rep_penalty_cpu -( - const int vocab_size, - const uint64_t* sequence, - const float penalty_max, - const int sustain, - const int decay, - const int seq_len, - float* logits -) -{ - if (vocab_size != g_vocab_size) - { - if (g_rep_mask) free(g_rep_mask); - g_vocab_size = vocab_size; - g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool)); - } - - memset(g_rep_mask, 0, g_vocab_size * sizeof(bool)); - - float v = penalty_max; - float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; - - int s = sustain == -1 ? 
seq_len : sustain; - int beg = seq_len - s - decay; - if (beg < 0) beg = 0; - - for (int i = seq_len; i > beg;) - { - uint64_t t = sequence[--i]; - if (!g_rep_mask[t]) - { - if (logits[t] > 0.0) logits[t] /= v; - else logits[t] *= v; - g_rep_mask[t] = true; - } - if (--s < 0) v += dv; - } -} - -void softmax_cpu -( - const int vocab_size, - const float temperature, - const float* logits, - const bool* logits_filter, - float* output -) -{ - float esum = 0.0f; - float itemp = 1.0f / temperature; - float maxl = 0.0f; - - #pragma unroll(32) - for (int i = 0; i < vocab_size; i++) - { - if (!logits_filter[i]) continue; - maxl = fmaxf(logits[i], maxl); - } - maxl *= itemp; - - #pragma unroll(32) - for (int i = 0; i < vocab_size; i++) - { - if (!logits_filter[i]) continue; - float e = expf(logits[i] * itemp - maxl); - output[i] = e; - esum += e; - } - float isum = 1.0f / esum; - - #pragma unroll(32) - for (int i = 0; i < vocab_size; i++) - { - if (logits_filter[i]) - output[i] *= isum; - else - output[i] = 0.0f; - } - -// printf("Softmax:"); -// float summ = 0.0f; -// for (int i = 0; i < vocab_size; i++) -// { -// if (logits_filter[i]) -// { -// printf("%d, %f\n", i, output[i]); -// summ += output[i]; -// } -// } -// printf("sum: %f\n", summ); -} - -void normalize_cpu -( - const int num_candidates, - float* probs -) -{ - float sum = 0.0f; - #pragma unroll(32) - for (int i = 0; i < num_candidates; i++) sum += probs[i]; - float isum = 1.0f / sum; - #pragma unroll(32) - for (int i = 0; i < num_candidates; i++) probs[i] *= isum; -} - -int greedy_sample -( - const int num_candidates, - const float* probs, - const bool* logits_filter -) -{ - int maxidx = -1; - float max = -1e38; - - for(int i = 1; i < num_candidates; i++) - { - if (logits_filter[i] && (maxidx == -1 || probs[i] > max)) - { - max = probs[i]; - maxidx = i; - } - } - return maxidx; -} - -template -inline void swap(T &a, T &b) -{ - T temp = a; - a = b; - b = temp; -} - -inline bool cmp_asc(const float& a, const float& b) -{ - return a > b; -} - -inline bool cmp_desc(const float& a, const float& b) -{ - return a < b; -} - -template -void quicksort_with_idx -( - float* arr, - int* idx, - int low, - int high, - int max_index -) -{ - if (low >= high) return; - - // Bubblesort very short segments - - if (high - low == 1) - { - int i0 = low; - int i1 = low + 1; - - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - return; - } - - if (high - low == 2) - { - int i0 = low; - int i1 = low + 1; - int i2 = low + 2; - - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - return; - } - - if (high - low == 3) - { - int i0 = low; - int i1 = low + 1; - int i2 = low + 2; - int i3 = low + 3; - - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i2], arr[i3])) { swap(arr[i2], arr[i3]); swap(idx[i2], idx[i3]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - return; - } - - if (high - low == 4) - { - int i0 = low; - int i1 = low + 1; - int i2 = low + 2; - int i3 = low + 3; - int 
i4 = low + 4; - - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i2], arr[i3])) { swap(arr[i2], arr[i3]); swap(idx[i2], idx[i3]); } - if (cmp_func(arr[i3], arr[i4])) { swap(arr[i3], arr[i4]); swap(idx[i3], idx[i4]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i2], arr[i3])) { swap(arr[i2], arr[i3]); swap(idx[i2], idx[i3]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - return; - } - - float pivot = arr[high]; - int i = low - 1; - for (int j = low; j < high; j++) - { - if (!cmp_func(arr[j], pivot)) - { - i++; - swap(arr[i], arr[j]); - swap(idx[i], idx[j]); - } - } - - swap(arr[i + 1], arr[high]); - swap(idx[i + 1], idx[high]); - int pos = i + 1; - - if (max_index == 0 || low <= max_index) - quicksort_with_idx(arr, idx, low, pos - 1, max_index); - if (max_index == 0 || pos <= max_index) - quicksort_with_idx(arr, idx, pos + 1, high, max_index); -} - -// Discard tiny probabilities, improves performance when temperature is very low - -int pre_sort_descending -( - const int num_candidates, - float* arr, - int* idx -) -{ - const float eps = 1e-8; - int i = 0; - int j = num_candidates - 1; - - while (i <= j) - { - if (arr[j] < eps) { j--; continue; } - if (arr[i] >= eps) { i++; continue; } - swap(arr[i], arr[j]); - swap(idx[i], idx[j]); - i++; - j--; - } - - return i; -} - -void sort_descending -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - int max_index -) -{ - int pre = pre_sort_descending(num_candidates, temp_probs, temp_indices); - quicksort_with_idx(temp_probs, temp_indices, 0, pre - 1, max_index); - -// int m = (max_index == 0 ? 
num_candidates : max_index); -// for (int i = 0; i < m; i++) printf("%i - %f \n", temp_indices[i], temp_probs[i] * 10000.0); -// for (int i = 0; i < m - 1; i++) if (temp_probs[i] < temp_probs[i + 1] - 2e-8) DBGI(i); -} - -int top_k_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - int top_k -) -{ - //TIME_START; - - // Use min-heap for lower values of K - - if (top_k <= top_k_heap_threshold) - { - std::priority_queue, std::vector>, std::greater>> min_heap; - - for (int i = 0; i < top_k; ++i) min_heap.push({temp_probs[i], temp_indices[i]}); - - for (int i = top_k; i < num_candidates; i++) - { - if (temp_probs[i] > min_heap.top().first) - { - min_heap.pop(); - min_heap.push({temp_probs[i], temp_indices[i]}); - } - } - - int j = top_k; - for (int i = 0; i < top_k; i++) - { - j--; - temp_probs[j] = min_heap.top().first; - temp_indices[j] = min_heap.top().second; - min_heap.pop(); - } - } - - // For larger values, quicksort is still faster - - else - { - sort_descending(num_candidates, temp_probs, temp_indices, top_k); - } - - //TIME_STOP; - - return top_k; -} - -int top_p_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float top_p -) -{ - std::priority_queue, std::vector>, std::greater>> min_heap; - - //TIME_START; - - float min_p = 1e-6; - - float sum = 0.0f; - for (int i = 0; i < num_candidates; i++) - { - if (temp_probs[i] < min_p) continue; - if (sum > top_p && temp_probs[i] < min_heap.top().first) continue; - - min_heap.push({temp_probs[i], temp_indices[i]}); - sum += temp_probs[i]; - - while (sum > top_p && min_heap.size() > 1) - { - sum -= min_heap.top().first; - min_heap.pop(); - } - } - - int j = min_heap.size(); - int k = j; - while (j > 0) - { - j--; - temp_probs[j] = min_heap.top().first; - temp_indices[j] = min_heap.top().second; - min_heap.pop(); - } - - //TIME_STOP; - - return k; -} - -int typical_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float typical -) -{ - //TIME_START; - - const float epsilon = 1e-10; - - float* temp = (float*) malloc(num_candidates * sizeof(float)); - int* entropy_dev_order = (int*) malloc(num_candidates * sizeof(int)); - int* temp_indices_2 = (int*) malloc(num_candidates * sizeof(int)); - - float neg_entropy = 0.0f; - for (int i = 0; i < num_candidates; i++) - { - float x = temp_probs[i]; - float y = x + logf(x + epsilon); - neg_entropy += x * y; - temp[i] = y; // temp = log_probs - } - - for (int i = 0; i < num_candidates; i++) - { - temp[i] = fabs(temp[i] - neg_entropy); // temp = entropy_dev - entropy_dev_order[i] = i; - } - - quicksort_with_idx(temp, entropy_dev_order, 0, num_candidates - 1, num_candidates); - - memcpy(temp, temp_probs, num_candidates * sizeof(float)); // temp = temp_probs - memcpy(temp_indices_2, temp_indices, num_candidates * sizeof(int)); - - float cumprob = 0.0f; - int num = 0; - - while (true) - { - int j = entropy_dev_order[num]; - float p = temp[j]; - temp_probs[num] = p; - temp_indices[num] = temp_indices_2[j]; - - cumprob += p; - if (cumprob >= typical) break; - num++; - if (num >= num_candidates) break; - } - - free(temp); - free(entropy_dev_order); - free(temp_indices_2); - - //TIME_STOP; - - if (num == 0) num = 1; - return num; -} - -int multinomial_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float random -) -{ - int idx = 0; - float accum = temp_probs[idx]; - - while (true) - { - if (accum >= random) break; - if (idx == num_candidates - 1) break; - idx++; - accum += temp_probs[idx]; - } - - 
temp_probs[0] = temp_probs[idx]; - temp_indices[0] = temp_indices[idx]; - - return 1; -} - - - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.h b/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.h deleted file mode 100644 index 85472994a..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef _sampling_h -#define _sampling_h - -#include -#include -#include -#include - -void apply_rep_penalty_cpu -( - const int vocab_size, - const uint64_t* sequence, - const float penalty_max, - const int sustain, - const int decay, - const int seq_len, - float* logits -); - -void softmax_cpu -( - const int vocab_size, - const float temperature, - const float* logits, - const bool* logits_filter, - float* output -); - -void normalize_cpu -( - const int num_candidates, - float* probs -); - -int greedy_sample -( - const int num_candidates, - const float* probs, - const bool* logits_filter -); - -void sort_descending -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - int max_index -); - -int top_k_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - int top_k -); - -int top_p_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float top_p -); - -int typical_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float typical -); - -int multinomial_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float random -); - -#endif - - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/util.h b/server/exllamav2_kernels/exllamav2_kernels/cpp/util.h deleted file mode 100644 index 983d6323a..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/util.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _util_h -#define _util_h - -#include - -#define DBGS(__x) printf("%s\n", __x) -#define DBGI(__x) printf("%s: %i\n", #__x, __x) -#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y) -#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z) -#define DBGF(__x) printf("%s: %f\n", #__x, __x) -#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y) -#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z) -#define DBGIF(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __y) - -#define TIME_START \ - auto start = std::chrono::high_resolution_clock::now() - -#define TIME_STOP \ - do { \ - auto stop = std::chrono::high_resolution_clock::now(); \ - auto duration_us = std::chrono::duration_cast(stop - start); \ - DBGI(duration_us); \ - } while (false) - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cu deleted file mode 100644 index 900a92b53..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cu +++ /dev/null @@ -1,161 +0,0 @@ -#include "cache.cuh" - -// #if defined(CUDART_VERSION) && CUDART_VERSION >= 11080 -// -// #include - -#include "quant/qdq_util.cuh" -#include "util.cuh" - -#define THREADS 32 - -// The upper 8 bits of FP16 are equivalent to FP8 E5M2. -// -// The range of values typically cached seem to be in the range of +/- 16, with an exponent component (with bias) up to -// about 20. Empirically, the MSE over the whole range of observed values in the K/V cache works out the same for E4M3 -// and E5M2. 
However, over 80% of values in the cache tensors fall within the range of -1..1, where E5M2 produces about -// a 25% lower MSE. - -__device__ inline uint32_t compress(uint32_t v) -{ - uint32_t vh = (v & 0xff000000) >> 16; - uint32_t vl = (v & 0x0000ff00) >> 8; - return vh | vl; -} - -__device__ inline uint32_t decompress(uint32_t v) -{ - uint32_t vh = (v & 0xff00) << 16; - uint32_t vl = (v & 0x00ff) << 8; - return vh | vl; -} - -__global__ void nv_fp16_to_fp8 -( - const half* __restrict__ pIn, - unsigned char* __restrict__ pOut, - int stride, - int height, - int min, - int max -) -{ - int x = min + (blockIdx.x * blockDim.x + threadIdx.x) * 8; - int y = blockIdx.y; - if (x >= max) return; - - int4* in_ptr = (int4*) (pIn + y * stride + x); - int2* out_ptr = (int2*) (pOut + y * stride + x); - - int4 in = *in_ptr; - uint32_t c0 = compress(in.x); - uint32_t c1 = compress(in.y); - uint32_t c2 = compress(in.z); - uint32_t c3 = compress(in.w); - int2 out = make_int2(c0 | (c1 << 16), c2 | (c3 << 16)); - *out_ptr = out; -} - -__global__ void nv_fp8_to_fp16 -( - const unsigned char* __restrict__ pIn, - half* __restrict__ pOut, - int stride, - int height, - int min, - int max -) -{ - int x = min + (blockIdx.x * blockDim.x + threadIdx.x) * 8; - int y = blockIdx.y; - if (x >= max) return; - - int2* in_ptr = (int2*) (pIn + y * stride + x); - int4* out_ptr = (int4*) (pOut + y * stride + x); - - int2 in = *in_ptr; - uint32_t c0 = decompress(in.x); - uint32_t c1 = decompress(in.x >> 16); - uint32_t c2 = decompress(in.y); - uint32_t c3 = decompress(in.y >> 16); - int4 out = make_int4(c0, c1, c2, c3); - *out_ptr = out; -} - -// __global__ void nv_fp32_to_fp16(const float* pIn, half* pOut, int size) -// { -// int i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < size) { -// pOut[i] = __float2half(pIn[i]); -// } -// } - -// __global__ void nv_fp16_to_fp8_ref(const half* pIn, unsigned char *pOut, int size) -// { -// int i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < size) { -// pOut[i] = __nv_cvt_halfraw_to_fp8(pIn[i], __NV_SATFINITE, __NV_E4M3); -// } -// } -// -// __global__ void nv_fp8_to_fp16_ref(const unsigned char* pIn, half* pOut, int size) -// { -// int i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < size) { -// pOut[i] = __nv_cvt_fp8_to_halfraw(pIn[i], __NV_E4M3); -// } -// } - -void array_fp16_to_fp8_cuda(const half* pIn, unsigned char *pOut, int stride, int height, int offset, int width) -{ - int min = offset; - int max = offset + width; - min = min / 8 * 8; - max = min + (max - min + 7) / 8 * 8; - - dim3 blockDim, gridDim; - blockDim.x = THREADS; - gridDim.x = DIVIDE((max - min) / 8, THREADS); - gridDim.y = height; - - nv_fp16_to_fp8<<>>(pIn, pOut, stride, height, min, max); - // cuda_check( cudaPeekAtLastError() ); -} - -void array_fp8_to_fp16_cuda(const unsigned char* pIn, half* pOut, int stride, int height, int offset, int width) -{ - int min = offset; - int max = offset + width; - min = min / 8 * 8; - max = min + (max - min + 7) / 8 * 8; - - dim3 blockDim, gridDim; - blockDim.x = THREADS; - gridDim.x = DIVIDE((max - min) / 8, THREADS); - gridDim.y = height; - - nv_fp8_to_fp16<<>>(pIn, pOut, stride, height, min, max); - // cuda_check( cudaPeekAtLastError() ); -} - -// void array_fp16_to_fp8_ref_cuda(const half* pIn, unsigned char *pOut, int size) -// { -// const int threads = 512; -// int blocks = DIVIDE(size / 1, threads); -// nv_fp16_to_fp8_ref<<>>(pIn, pOut, size); -// } -// -// void array_fp8_to_fp16_ref_cuda(const unsigned char* pIn, half* pOut, int size) -// { -// const 
int threads = 512;
-//     int blocks = DIVIDE(size / 1, threads);
-//     nv_fp8_to_fp16_ref<<<blocks, threads>>>(pIn, pOut, size);
-// }

-// #else
-//
-// void array_fp16_to_fp8_cuda(const half* pIn, unsigned char *pOut, int size) { }
-//
-// void array_fp8_to_fp16_cuda(const unsigned char* pIn, half* pOut, int size) { }
-//
-// #endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cuh
deleted file mode 100644
index 4cb291b65..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cuh
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _cache_cuh
-#define _cache_cuh
-
-#include
-#include
-#include
-#include
-
-void array_fp16_to_fp8_cuda(const half* pIn, unsigned char *pOut, int stride, int height, int offset, int width);
-void array_fp8_to_fp16_cuda(const unsigned char* pIn, half* pOut, int stride, int height, int offset, int width);
-// void array_fp16_to_fp8_ref_cuda(const half* pIn, unsigned char *pOut, int size);
-// void array_fp8_to_fp16_ref_cuda(const unsigned char* pIn, half* pOut, int size);
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh
deleted file mode 100644
index 12684ff8b..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _compat_cuh
-#define _compat_cuh
-
-// atomicAdd for half types, to support CC < 7.x
-
-__device__ __forceinline__ void atomicAdd_half(half* address, half val)
-{
-    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
-    unsigned int old = *address_as_ui;
-    unsigned int assumed;
-
-    do
-    {
-        assumed = old;
-        __half_raw hsum;
-        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-        half tmpres = __hadd(hsum, val);
-        hsum = __half_raw(tmpres);
-        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
-        old = atomicCAS(address_as_ui, assumed, old);
-    }
-    while (assumed != old);
-}
-
-// atomicAdd for half2 types
-
-__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
-{
-    unsigned int* address_as_ui = (unsigned int*)address;
-    unsigned int old = *address_as_ui;
-    unsigned int assumed;
-    do
-    {
-        assumed = old;
-        half2 old_val = *((half2*)&old);
-        half2 new_val = __hadd2(old_val, val);
-        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
-    }
-    while (assumed != old);
-}
-
-//
-
-#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
-#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
-
-#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
-#endif
-
-#endif
-#endif
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh
deleted file mode 100644
index 089f28625..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _compat_gemm_cuh
-#define _compat_gemm_cuh
-
-#if defined(USE_ROCM)
-__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle,
-                                                               hipblasOperation_t transA,
-                                                               hipblasOperation_t transB,
-                                                               int m,
-                                                               int n,
-                                                               int k,
-                                                               const half* alpha,
-                                                               const half* AP,
-                                                               int lda,
-                                                               const half* BP,
-                                                               int ldb,
-                                                               const half* beta,
-                                                               half* CP,
-                                                               int ldc) {
-    return hipblasHgemm(handle, transA, transB, m, n, k,
-                        reinterpret_cast<const hipblasHalf *>(alpha),
-                        reinterpret_cast<const hipblasHalf *>(AP), lda,
-                        reinterpret_cast<const hipblasHalf *>(BP), ldb,
-                        reinterpret_cast<const hipblasHalf *>(beta),
-                        reinterpret_cast<hipblasHalf *>(CP), ldc);
-}
-#define hipblasHgemm __compat_hipblasHgemm
-
-// Previous versions of PyTorch converted to rocBLAS instead of hipBLAS.
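
The CAS loop in compat.cuh above is the standard emulation of half-precision atomicAdd for devices older than compute capability 7.0: it locates the aligned 32-bit word that contains the 16-bit operand, splices the updated half into the correct half of that word, and retries the compare-and-swap until no other thread has raced the update. A minimal sketch of a kernel that relies on that overload (a toy example, not part of the deleted tree; it assumes either CC >= 7.0 or the compat header being in scope):

#include <cuda_fp16.h>

__global__ void accumulate_half(const half* in, half* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // On CC >= 7.0 this compiles to the native half atomicAdd; on older
    // devices the atomicAdd(half*, half) overload above supplies the
    // CAS-based fallback.
    if (i < n) atomicAdd(out, in[i]);
}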
-#define rocblas_operation_none HIPBLAS_OP_N -#define rocblas_hgemm __compat_hipblasHgemm -#endif - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cu deleted file mode 100644 index 5d4495ece..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cu +++ /dev/null @@ -1,275 +0,0 @@ -#include "h_gemm.cuh" -#include "util.cuh" -#include "../config.h" -#include "matrix_view.cuh" - -#include "compat_gemm.cuh" - -// union half2_uint32 -// { -// uint32_t as_uint32; -// half2 as_half2; -// __device__ half2_uint32(uint32_t val) : as_uint32(val) {} -// __device__ half2_uint32(half2 val) : as_half2(val) {} -// }; - -// TODO: Improve tall kernel, maybe special cases for size_n = 1, 2, 4, 8, 16 - -const int T_THREADS_M = 1; -const int T_THREADS_N = 8; -const int T_BLOCKSIZE_K = 32; -const int T_MAX_M = 16; -const int T_MAX_N = 64; -const int T_MAX_K = 1024 / T_THREADS_N * T_BLOCKSIZE_K; -const int T_MAX_BLOCKS_K = T_MAX_K / T_BLOCKSIZE_K; - -__global__ void h_gemm_tall_kernel -( - const int size_m, - const int size_n, - const int size_k, - const half* __restrict__ a, - const half* __restrict__ b, - half* __restrict__ c, - bool clear -) -{ - __shared__ half accum[T_MAX_BLOCKS_K][T_THREADS_N]; - - int m = blockIdx.y * T_THREADS_M + threadIdx.z; - int n = blockIdx.x * T_THREADS_N + threadIdx.x; - int k = threadIdx.y * T_BLOCKSIZE_K; - - if (n >= size_n) return; - if (m >= size_m) return; - - MatrixView_half a_(a, size_m, size_k); - MatrixView_half b_(b, size_k, size_n); - MatrixView_half_rw c_(c, size_m, size_n); - - int k_end = min(k + T_BLOCKSIZE_K, size_k); - - const half* a_ptr = a_.item_ptr(m, k); - const half* a_ptr_end = a_.item_ptr(m, k_end); - const half* b_ptr = b_.item_ptr(k, n); - half* c_ptr = c_.item_ptr(m, n); - - half2 r2 = {}; - - while(a_ptr <= a_ptr_end - 8) - { - int4 a_int4 = *((int4*) a_ptr); - half2 a_01 = ((half2_uint32) a_int4.x).as_half2; - half2 a_23 = ((half2_uint32) a_int4.y).as_half2; - half2 a_45 = ((half2_uint32) a_int4.z).as_half2; - half2 a_67 = ((half2_uint32) a_int4.w).as_half2; - a_ptr += 8; - - half b_0 = *b_ptr; b_ptr += size_n; - half b_1 = *b_ptr; b_ptr += size_n; - half b_2 = *b_ptr; b_ptr += size_n; - half b_3 = *b_ptr; b_ptr += size_n; - half b_4 = *b_ptr; b_ptr += size_n; - half b_5 = *b_ptr; b_ptr += size_n; - half b_6 = *b_ptr; b_ptr += size_n; - half b_7 = *b_ptr; b_ptr += size_n; - half2 b_01 = __halves2half2(b_0, b_1); - half2 b_23 = __halves2half2(b_2, b_3); - half2 b_45 = __halves2half2(b_4, b_5); - half2 b_67 = __halves2half2(b_6, b_7); - - r2 = __hfma2(a_01, b_01, r2); - r2 = __hfma2(a_23, b_23, r2); - r2 = __hfma2(a_45, b_45, r2); - r2 = __hfma2(a_67, b_67, r2); - } - - while(a_ptr <= a_ptr_end - 4) - { - int2 a_int2 = *((int2*) a_ptr); - half2 a_01 = ((half2_uint32) a_int2.x).as_half2; - half2 a_23 = ((half2_uint32) a_int2.y).as_half2; - a_ptr += 4; - - half b_0 = *b_ptr; b_ptr += size_n; - half b_1 = *b_ptr; b_ptr += size_n; - half b_2 = *b_ptr; b_ptr += size_n; - half b_3 = *b_ptr; b_ptr += size_n; - half2 b_01 = __halves2half2(b_0, b_1); - half2 b_23 = __halves2half2(b_2, b_3); - - r2 = __hfma2(a_01, b_01, r2); - r2 = __hfma2(a_23, b_23, r2); - } - - half r = __hadd(__low2half(r2), __high2half(r2)); - - while(a_ptr < a_ptr_end) - { - half a_item = *a_ptr++; - half b_item = *b_ptr; b_ptr += size_n; - r = __hfma(a_item, b_item, r); - } - - accum[threadIdx.y][threadIdx.x] = r; - __syncthreads(); - - if (threadIdx.y == 
0) - { - half acc = accum[0][threadIdx.x]; - for (int i = 1; i < blockDim.y; ++i) acc = __hadd(accum[i][threadIdx.x], acc); - if (!clear) acc = __hadd(acc, *c_ptr); - *c_ptr = acc; - } -} - - -const int W_MAX_M = 16; -const int W_MAX_N = 65536; -const int W_MAX_K = 32; -const int W_THREADS_M = 1; -const int W_THREADS_N = 32; - -__global__ void h_gemm_wide_kernel -( - const int size_m, - const int size_n, - const int size_k, - const half* __restrict__ a, - const half* __restrict__ b, - half* __restrict__ c, - bool clear -) -{ - int m = blockIdx.y * W_THREADS_M + threadIdx.y; - int n = blockIdx.x * W_THREADS_N + threadIdx.x; - - if (n >= size_n) return; - if (m >= size_m) return; - - MatrixView_half a_(a, size_m, size_k); - MatrixView_half b_(b, size_k, size_n); - MatrixView_half_rw c_(c, size_m, size_n); - - half* c_ptr = c_.item_ptr(m, n); - - __shared__ half read_a[W_MAX_K]; - int t = threadIdx.x; - - if (t < size_k) - { - read_a[t] = a_.item(m, t); - } - __syncthreads(); - - half r = {}; - - for (int k = 0; k < size_k; ++k) - { - half item_a = read_a[k]; - half item_b = b_.item(k, n); - r = __hfma(item_a, item_b, r); - } - - if (threadIdx.y == 0) - { - if (!clear) r = __hadd(r, *c_ptr); - *c_ptr = r; - } -} - - -// cuBLAS - -void h_gemm_cublas -( - cublasHandle_t cublas_handle, - const int size_m, - const int size_n, - const int size_k, - const half* a, - const half* b, - half* c, - const float alpha, - const float beta -) -{ - half alpha_ = __float2half(alpha); - half beta_ = __float2half(beta); - cublasHgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_n, size_m, size_k, - &alpha_, b, size_n, - a, size_k, - &beta_, c, size_n); -} - - -// alpha * ( a[m,k] @ b[k,n] ) + beta * c[m,n] -> c[m,n] - -void h_gemm_cuda -( - cublasHandle_t cublas_handle, - const int size_m, - const int size_n, - const int size_k, - const half* a, - const half* b, - half* c, - const float alpha, - const float beta -) -{ - if ((beta == 1.0f || beta == 0.0f) && (alpha == 1.0f)) - { - bool clear = (beta == 0.0f); - - //DBGI3(size_m, size_n, size_k); - - if (size_m <= T_MAX_M && size_n <= T_MAX_N && size_k <= T_MAX_K) - { - // Tall - - dim3 blockDim, gridDim; - blockDim.x = T_THREADS_N; - blockDim.y = DIVIDE(size_k, T_BLOCKSIZE_K); - blockDim.z = T_THREADS_M; - gridDim.x = DIVIDE(size_n, T_THREADS_N); - gridDim.y = DIVIDE(size_m, T_THREADS_M); - gridDim.z = 1; - -// DBGI3(blockDim.x, blockDim.y, blockDim.z); -// DBGI3(gridDim.x, gridDim.y, gridDim.z); - - h_gemm_tall_kernel<<>>(size_m, size_n, size_k, a, b, c, clear); - cuda_check( cudaPeekAtLastError() ); - return; - } - - if (size_m <= W_MAX_M && size_n <= W_MAX_N && size_k <= W_MAX_K) - { - // Wide - - dim3 blockDim, gridDim; - blockDim.x = W_THREADS_N; - blockDim.y = W_THREADS_M; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, W_THREADS_N); - gridDim.y = DIVIDE(size_m, W_THREADS_M); - gridDim.z = 1; - -// DBGI3(blockDim.x, blockDim.y, blockDim.z); -// DBGI3(gridDim.x, gridDim.y, gridDim.z); - - h_gemm_wide_kernel<<>>(size_m, size_n, size_k, a, b, c, clear); - cuda_check( cudaPeekAtLastError() ); - return; - } - } - - h_gemm_cublas(cublas_handle, size_m, size_n, size_k, a, b, c, alpha, beta); -// DBGI3(size_m, size_n, size_k); - cuda_check( cudaPeekAtLastError() ); - -} \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cuh deleted file mode 100644 index 5694e2ade..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cuh +++ /dev/null 
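
h_gemm_cublas above passes the operands to cublasHgemm in swapped order because cuBLAS assumes column-major storage while these buffers are row-major: reading a row-major matrix as column-major yields its transpose, so computing C^T = B^T * A^T leaves the result in memory already laid out as row-major C. A minimal sketch of that call pattern, annotated (names mirror the deleted code; alpha_ and beta_ are host-side half scalars):

// Row-major C[m,n] = A[m,k] @ B[k,n] via column-major cuBLAS:
// the A-slot receives b (column-major n x k, leading dim size_n), the
// B-slot receives a (column-major k x m, leading dim size_k), and the
// column-major n x m result is byte-for-byte row-major C.
cublasHgemm(cublas_handle,
            CUBLAS_OP_N, CUBLAS_OP_N,
            size_n, size_m, size_k,
            &alpha_, b, size_n,
            a, size_k,
            &beta_, c, size_n);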
@@ -1,26 +0,0 @@ -#ifndef _h_gemm_cuh -#define _h_gemm_cuh - -#include -#include -#include -#include -#include - -// alpha * ( a[m,k] @ b[k,n] ) + beta * c[m,n] -> c[m,n] - -void h_gemm_cuda -( - cublasHandle_t cublas_handle, - const int size_m, - const int size_n, - const int size_k, - const half* a, - const half* b, - half* c, - const float alpha, - const float beta -); - -#endif - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cu deleted file mode 100644 index 49a19ec0f..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include "lora.cuh" -#include "util.cuh" -#include "h_gemm.cuh" - -void apply_loras_cuda -( - cublasHandle_t cublas_handle, - const std::unordered_map>& adapters, - const std::vector& ids, - QMatrix* base, - const half* input, - half* output, - half* temp, - int rows -) -{ - for (uintptr_t lora_id : ids) - { - auto it = adapters.find(lora_id); - if (it == adapters.end()) continue; - - const std::tuple& lora = it->second; - half* lora_a = std::get<0>(lora); - half* lora_b = std::get<1>(lora); - int rank = std::get<2>(lora); - -// DBGI3(rows, rank, base->height); -// DBGI3(rows, base->width, rank); - - h_gemm_cuda(cublas_handle, rows, rank, base->height, input, lora_a, temp, 1.0f, 0.0f); - h_gemm_cuda(cublas_handle, rows, base->width, rank, temp, lora_b, output, 1.0f, 1.0f); - } -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cuh deleted file mode 100644 index b505a201c..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cuh +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _lora_cuh -#define _lora_cuh - -#include -#include -#include -#include -#include - -#include "q_matrix.cuh" - -void apply_loras_cuda -( - cublasHandle_t cublas_handle, - const std::unordered_map>& adapters, - const std::vector& ids, - QMatrix* base, - const half* input, - half* output, - half* temp, - int rows -); - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh deleted file mode 100644 index 55af84f23..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef _matrix_view_cuh -#define _matrix_view_cuh - -#include -#include - -#include "quant/qdq_util.cuh" - -class MatrixView_half -{ -public: - const half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); } - __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; } - - __device__ __forceinline__ void item4(half (&items)[4], int row, int column) const - { - half2* ptr = (half2*) item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __low2half(i01); - items[1] = __high2half(i01); - items[2] = __low2half(i23); - items[3] = __high2half(i23); - } - __device__ __forceinline__ void item4_f(float 
(&items)[4], int row, int column) const - { - half2* ptr = (half2*)item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __half2float(__low2half(i01)); - items[1] = __half2float(__high2half(i01)); - items[2] = __half2float(__low2half(i23)); - items[3] = __half2float(__high2half(i23)); - } - - __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, int column) const - { - half2* ptr = (half2*)item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __half2half2(__low2half(i01)); - items[1] = __half2half2(__high2half(i01)); - items[2] = __half2half2(__low2half(i23)); - items[3] = __half2half2(__high2half(i23)); - } -}; - -class MatrixView_half_rw -{ -public: - half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; } - __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; } - __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } - - __device__ __forceinline__ void set4(int row, int column, half v0, half v1, half v2, half v3) - { - half2 v01 = __halves2half2(v0, v1); - half2 v23 = __halves2half2(v2, v3); - half2* ptr = (half2*) item_ptr(row, column); - ptr[0] = v01; - ptr[1] = v23; - } -}; - -class MatrixView_q4_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x07) * 4; - return (data[row * width / 8 + column / 8] >> shift) & 0x0f; - } - - __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const - { - int shift = (column & 0x07) * 4; - uint32_t d = data[row * width / 8 + column / 8] >> shift; - items[0] = d & 0x0f; - items[1] = (d >> 4) & 0x0f; - } - - __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const - { - int shift = (column & 0x07) * 4; - uint32_t d = data[row * width / 8 + column / 8] >> shift; - items[0] = d & 0x0f; - items[1] = (d >> 4) & 0x0f; - items[2] = (d >> 8) & 0x0f; - items[3] = (d >> 12) & 0x0f; - } -}; - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cu deleted file mode 100644 index 3a29f2fcd..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cu +++ /dev/null @@ -1,268 +0,0 @@ -#include "pack_tensor.cuh" -#include "util.cuh" - -#define BLOCKSIZE_X 32 -#define BLOCKSIZE_Y 16 - -// Pack rows: -// 0000 0000 0000 aaaa 0000 0000 0000 bbbb 0000 0000 0000 cccc ... 
-> hhhh gggg ffff eeee dddd cccc bbbb aaaa - -__global__ void pack_rows_4_kernel -( - const uint16_t* __restrict__ input, - uint32_t* __restrict__ output, - int rows, - int out_columns -) -{ - int out_column = blockIdx.x * blockDim.x + threadIdx.x; - int row = blockIdx.y * blockDim.y + threadIdx.y; - - if (row >= rows) return; - if (out_column >= out_columns) return; - - uint32_t packed = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - uint16_t x = input[row * out_columns * 8 + out_column * 8 + i]; - x -= 1; - packed |= (((uint32_t)x) << (i * 4)); - } - - output[row * out_columns + out_column] = packed; -} - -void pack_rows_4_cuda -( - const uint16_t* input, - uint32_t* output, - const int rows, - const int columns -) -{ - int out_columns = columns * 4 / 32; - - dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y); - dim3 blocks(DIVIDE(out_columns, BLOCKSIZE_X), DIVIDE(rows, BLOCKSIZE_Y)); - - pack_rows_4_kernel<<>>(input, output, rows, out_columns); -} - -// Pack rows: -// 0000 0000 0000 aaaa 0000 0000 0000 bbbb 0000 0000 0000 cccc ... -> hhhh gggg ffff eeee dddd cccc bbbb aaaa - -__global__ void pack_rows_6_kernel -( - const uint16_t* __restrict__ input, - uint32_t* __restrict__ output, - int rows, - int out_columns -) -{ - int out_column = blockIdx.x * blockDim.x + threadIdx.x; - int row = blockIdx.y * blockDim.y + threadIdx.y; - - if (row >= rows) return; - if (out_column >= out_columns) return; - - uint32_t packed = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - uint16_t x = input[row * out_columns * 8 + out_column * 8 + i]; - x -= 1; - packed |= (((uint32_t)x) << (i * 4)); - } - - output[row * out_columns + out_column] = packed; -} - -void pack_rows_6_cuda -( - const uint16_t* input, - uint32_t* output, - const int rows, - const int columns -) -{ - int out_columns = columns * 6 / 32; - - dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y); - dim3 blocks(DIVIDE(out_columns, BLOCKSIZE_X), DIVIDE(rows, BLOCKSIZE_Y)); - - pack_rows_6_kernel<<>>(input, output, rows, out_columns); -} - -// Pack columns - -__forceinline__ __device__ uint32_t wshift(uint32_t x, int j) -{ - if (j < 0) - { - if (j <= -32) return 0; // Else undefined in CUDA - return x >> (-j); - } - else - { - if (j >= 32) return 0; // Else undefined in CUDA - return x << j; - } -} - -template -__global__ void pack_columns_kernel -( - const uint16_t* __restrict__ input, - uint32_t* __restrict__ output, - int out_rows, - int columns -) -{ - int column = blockIdx.x * blockDim.x + threadIdx.x; - int out_row = blockIdx.y * blockDim.y + threadIdx.y; - - if (column >= columns) return; - if (out_row >= out_rows) return; - - uint32_t x; - - if constexpr (bits == 2) - { - int row = out_row * 32 / 2; - uint32_t packed = 0; - - # pragma unroll - for (int i = 0, j = 0; i < 16; i++, j += 2) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed |= (x << (i * 2)); - } - output[out_row * columns + column] = packed; - } - - if constexpr (bits == 3) - { - if (out_row % 3) return; // Only run for every third row - int row = out_row * 32 / 3; - uint32_t packed0 = 0; - uint32_t packed1 = 0; - uint32_t packed2 = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 32; i++, j += 3) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed0 |= wshift(x, j); - packed1 |= wshift(x, j - 32); - packed2 |= wshift(x, j - 64); - } - - output[(out_row + 0) * columns + column] = packed0; - output[(out_row + 1) * columns + column] = packed1; - output[(out_row + 2) * columns + column] = packed2; - } - - if constexpr (bits == 4) - { - int row = 
out_row * 32 / 4; - uint32_t packed = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 8; i++, j += 4) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed |= (x << j); - } - output[out_row * columns + column] = packed; - } - - if constexpr (bits == 5) - { - if (out_row % 5) return; // Only run for every fifth row - int row = out_row * 32 / 5; - uint32_t packed0 = 0; - uint32_t packed1 = 0; - uint32_t packed2 = 0; - uint32_t packed3 = 0; - uint32_t packed4 = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 32; i++, j += 5) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed0 |= wshift(x, j); - packed1 |= wshift(x, j - 32); - packed2 |= wshift(x, j - 64); - packed3 |= wshift(x, j - 96); - packed4 |= wshift(x, j - 128); - } - - output[(out_row + 0) * columns + column] = packed0; - output[(out_row + 1) * columns + column] = packed1; - output[(out_row + 2) * columns + column] = packed2; - output[(out_row + 3) * columns + column] = packed3; - output[(out_row + 4) * columns + column] = packed4; - } - - if constexpr (bits == 6) - { - if (out_row % 3) return; // Only run for every third row - int row = out_row * 32 / 6; - uint32_t packed0 = 0; - uint32_t packed1 = 0; - uint32_t packed2 = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 16; i++, j += 6) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed0 |= wshift(x, j); - packed1 |= wshift(x, j - 32); - packed2 |= wshift(x, j - 64); - } - - output[(out_row + 0) * columns + column] = packed0; - output[(out_row + 1) * columns + column] = packed1; - output[(out_row + 2) * columns + column] = packed2; - } - - if constexpr (bits == 8) - { - int row = out_row * 32 / 8; - uint32_t packed = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 4; i++, j += 8) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed |= (x << j); - } - - output[out_row * columns + column] = packed; - } -} - -void pack_columns_cuda -( - const uint16_t* input, - uint32_t* output, - const int in_rows, - const int out_rows, - const int columns, - const int bits -) -{ - dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y); - dim3 blocks(DIVIDE(columns, BLOCKSIZE_X), DIVIDE(out_rows, BLOCKSIZE_Y)); - - if (bits == 2) pack_columns_kernel<2><<>>(input, output, out_rows, columns); - if (bits == 3) pack_columns_kernel<3><<>>(input, output, out_rows, columns); - if (bits == 4) pack_columns_kernel<4><<>>(input, output, out_rows, columns); - if (bits == 5) pack_columns_kernel<5><<>>(input, output, out_rows, columns); - if (bits == 6) pack_columns_kernel<6><<>>(input, output, out_rows, columns); - if (bits == 8) pack_columns_kernel<8><<>>(input, output, out_rows, columns); -} - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cuh deleted file mode 100644 index 4231cbb9c..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cuh +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _pack_tensor_cuh -#define _pack_tensor_cuh - -#include -#include -#include -#include - -void pack_rows_4_cuda -( - const uint16_t* input, - uint32_t* output, - const int rows, - const int columns -); - -void pack_rows_6_cuda -( - const uint16_t* input, - uint32_t* output, - const int rows, - const int columns -); - -void pack_columns_cuda -( - const uint16_t* input, - uint32_t* output, - const int in_rows, - const int out_rows, - const int columns, - const int bits -); - -#endif \ No newline at end of file diff --git 
a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cu deleted file mode 100644 index eb183842a..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cu +++ /dev/null @@ -1,159 +0,0 @@ -#include "q_attn.cuh" -#include "q_gemm.cuh" -#include "rms_norm.cuh" -#include "rope.cuh" -#include "util.cuh" -#include "lora.cuh" - -const int THREADS_X = 32; -const int THREADS_Y = 1; -const int THREADS_Z = 4; -const int BLOCKSIZE_X = 2; // 2*half == 1*uint32_t -const int BLOCKSIZE_Z = 4; // num_heads must be divisible by BLOCKSIZE_Z - -__global__ void update_cache_kernel -( - const half* __restrict__ key_states, - const half* __restrict__ value_states, - half* __restrict__ key_cache, - half* __restrict__ value_cache, - const int head_dim, - const int num_kv_heads, - const int q_len, - const int cache_seq_len, - const int past_len -) -{ - //int state_shape[] = { num_kv_heads, q_len, head_dim }; - int state_stride[] = { head_dim, head_dim * num_kv_heads, 1 }; - int state_pos[] = { 0, 0, 0 }; - - //int cache_shape[] = { num_kv_heads, cache_seq_len, head_dim }; - int cache_stride[] = { cache_seq_len * head_dim, head_dim, 1 }; - int cache_pos[] = { 0, past_len, 0 }; - - int size[] = { num_kv_heads, q_len, head_dim }; - - int x = (blockIdx.x * THREADS_X + threadIdx.x) * BLOCKSIZE_X; - int y = blockIdx.y * THREADS_Y + threadIdx.y; - int z = (blockIdx.z * THREADS_Z + threadIdx.z) * BLOCKSIZE_Z; - - if (x >= size[2]) return; - if (y >= size[1]) return; - if (z >= size[0]) return; - - int state_offset = (z + state_pos[0]) * state_stride[0] + (y + state_pos[1]) * state_stride[1] + (x + state_pos[2]) * state_stride[2]; - int cache_offset = (z + cache_pos[0]) * cache_stride[0] + (y + cache_pos[1]) * cache_stride[1] + (x + cache_pos[2]) * cache_stride[2]; - - const uint32_t* key_ptr = (uint32_t*) (key_states + state_offset); - const uint32_t* value_ptr = (uint32_t*) (value_states + state_offset); - uint32_t* key_cache_ptr = (uint32_t*) (key_cache + cache_offset); - uint32_t* value_cache_ptr = (uint32_t*) (value_cache + cache_offset); - - #pragma unroll - for (int k = 0; k < BLOCKSIZE_Z; k++) - { - *key_cache_ptr = *key_ptr; - key_ptr += state_stride[0] / BLOCKSIZE_X; - key_cache_ptr += cache_stride[0] / BLOCKSIZE_X; - } - - #pragma unroll - for (int k = 0; k < BLOCKSIZE_Z; k++) - { - *value_cache_ptr = *value_ptr; - value_ptr += state_stride[0] / BLOCKSIZE_X; - value_cache_ptr += cache_stride[0] / BLOCKSIZE_X; - } -} - -QAttn::QAttn -( - half* _layernorm, - float _norm_epsilon, - QMatrix* _q_proj, - QMatrix* _k_proj, - QMatrix* _v_proj, - QMatrix* _o_proj, - half* _temp_state, -// half* _temp_q, -// half* _temp_k, -// half* _temp_v, - half* _temp_dq, - int _max_rows, - int _hidden_size, - int _num_heads, - int _num_kv_heads, - int _head_dim, - int _max_seq_len -): - layernorm(_layernorm), - norm_epsilon(_norm_epsilon), - q_proj(_q_proj), - k_proj(_k_proj), - v_proj(_v_proj), - o_proj(_o_proj), - temp_state(_temp_state), -// temp_q(_temp_q), -// temp_k(_temp_k), -// temp_v(_temp_v), - temp_dq(_temp_dq), - max_rows(_max_rows), - hidden_size(_hidden_size), - num_heads(_num_heads), - num_kv_heads(_num_kv_heads), - head_dim(_head_dim), - max_seq_len(_max_seq_len) -{ -} - -QAttn::~QAttn() -{ -} - -void QAttn::forward_cuda_1 -( - cublasHandle_t cublas_handle, - half* x, - int batch_size, - int q_len, - int past_len, - const uint32_t* past_lens, - half* temp_q, - half* temp_k, - half* temp_v, - const half* sin, - const half* cos, - const 
std::vector& loras, - half* lora_temp -) -{ - rms_norm_cuda(x, layernorm, temp_state, norm_epsilon, q_len * batch_size, hidden_size); - - gemm_half_q_half_cuda(cublas_handle, temp_state, q_proj, temp_q, q_len * batch_size, q_proj->width, hidden_size, true, temp_dq); - gemm_half_q_half_cuda(cublas_handle, temp_state, k_proj, temp_k, q_len * batch_size, k_proj->width, hidden_size, true, temp_dq); - gemm_half_q_half_cuda(cublas_handle, temp_state, v_proj, temp_v, q_len * batch_size, v_proj->width, hidden_size, true, temp_dq); - - apply_loras_cuda(cublas_handle, q_proj_lora, loras, q_proj, temp_state, temp_q, lora_temp, q_len * batch_size); - apply_loras_cuda(cublas_handle, k_proj_lora, loras, k_proj, temp_state, temp_k, lora_temp, q_len * batch_size); - apply_loras_cuda(cublas_handle, v_proj_lora, loras, v_proj, temp_state, temp_v, lora_temp, q_len * batch_size); - - rope_cuda(temp_q, sin, cos, batch_size, q_len * num_heads, head_dim, num_heads, past_len, past_lens); - rope_cuda(temp_k, sin, cos, batch_size, q_len * num_kv_heads, head_dim, num_kv_heads, past_len, past_lens); -} - -void QAttn::forward_cuda_2 -( - cublasHandle_t cublas_handle, - const half* attn_output, - half* hidden_state, - int q_len, - int batch_size, - const std::vector& loras, - half* lora_temp -) -{ - gemm_half_q_half_cuda(cublas_handle, attn_output, o_proj, hidden_state, q_len * batch_size, o_proj->width, hidden_size, false, temp_dq); - - apply_loras_cuda(cublas_handle, o_proj_lora, loras, o_proj, attn_output, hidden_state, lora_temp, q_len * batch_size); -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cuh deleted file mode 100644 index dced611f9..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cuh +++ /dev/null @@ -1,98 +0,0 @@ -#ifndef _q_attn_cuh -#define _q_attn_cuh - -#include -#include -#include -#include -#include - -#include "q_matrix.cuh" - -class QAttn -{ -public: - - half* layernorm; - float norm_epsilon; - - QMatrix* q_proj; - QMatrix* k_proj; - QMatrix* v_proj; - QMatrix* o_proj; - - half* temp_state; -// half* temp_q; -// half* temp_k; -// half* temp_v; - half* temp_dq; - - int device; - int max_rows; - int hidden_size; - int num_heads; - int num_kv_heads; - int head_dim; - int max_seq_len; - - std::unordered_map> q_proj_lora; - std::unordered_map> k_proj_lora; - std::unordered_map> v_proj_lora; - std::unordered_map> o_proj_lora; - - QAttn - ( - half* _layernorm, - float _norm_epsilon, - QMatrix* _q_proj, - QMatrix* _k_proj, - QMatrix* _v_proj, - QMatrix* _o_proj, - half* _temp_state, -// half* _temp_q, -// half* _temp_k, -// half* _temp_v, - half* _temp_dq, - int _max_rows, - int _hidden_size, - int _num_heads, - int _num_kv_heads, - int _head_dim, - int _max_seq_len - ); - - ~QAttn(); - - void forward_cuda_1 - ( - cublasHandle_t cublas_handle, - half* x, - int batch_size, - int q_len, - int past_len, - const uint32_t* past_lens, - half* temp_q, - half* temp_k, - half* temp_v, - const half* sin, - const half* cos, - const std::vector& loras, - half* lora_temp - ); - - void forward_cuda_2 - ( - cublasHandle_t cublas_handle, - const half* attn_output, - half* hidden_state, - int q_len, - int batch_size, - const std::vector& loras, - half* lora_temp - ); - -private: - -}; - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu deleted file mode 100644 index 351b9cd5b..000000000 --- 
a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu +++ /dev/null @@ -1,211 +0,0 @@ -#include "q_gemm.cuh" -#include "util.cuh" -#include "matrix_view.cuh" -#include "../config.h" - -#include "quant/qdq_2.cuh" -#include "quant/qdq_3.cuh" -#include "quant/qdq_4.cuh" -#include "quant/qdq_5.cuh" -#include "quant/qdq_6.cuh" -#include "quant/qdq_8.cuh" - -#define BLOCK_KN_SIZE 128 -#define BLOCK_M_SIZE_MAX 8 -#define MAX_GROUPS_IN_BLOCK (BLOCK_KN_SIZE / 32) -#define CLEAR_N_SIZE 256 - -#include "q_gemm_kernel.cuh" -#include "q_gemm_kernel_gptq.cuh" - -#include "compat_gemm.cuh" - -void gemm_half_q_half_cuda_part -( - const half* a, - QMatrix* b, - half* c, - int size_m, - int size_n, - int size_k, - int m_count, - bool clear -) -{ - if (!b->is_gptq) - { - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); - gridDim.y = DIVIDE(size_m, m_count); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); - - fp_gemm_half_q_half_kernel kernel = pick_gemm_half_q_half_kernel(true, m_count); - - kernel<<>> - ( - a, - b->cuda_q_weight, - b->cuda_q_scale, - b->cuda_q_scale_max, - c, - size_m, - size_n, - size_k, - b->groups, - b->groupsize, - b->cuda_q_perm, - b->rows_8, - b->rows_6, - b->rows_5, - b->rows_4, - b->rows_3, - b->rows_2, - clear - ); - } - else - { - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); - gridDim.y = DIVIDE(size_m, m_count); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); - - fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count); - -// DBGX((uint64_t) b->cuda_q_perm); -// DBGI(b->rows_4); -// DBGI(b->height); - - kernel<<>> - ( - a, - b->cuda_q_weight, - b->cuda_gptq_qzeros, - b->cuda_gptq_scales, - c, - size_m, - size_n, - size_k, - b->groups, - b->groupsize, - b->cuda_q_perm, - b->rows_4, - clear - ); - } -} - -void gemm_half_q_half_cuda -( - cublasHandle_t cublas_handle, - const half* a, - QMatrix* b, - half* c, - int size_m, - int size_n, - int size_k, - bool clear, - half* temp_dq, - bool force_cuda -) -{ - if (size_m > MAX_Q_GEMM_ROWS && !force_cuda) - { - //printf("cublas\n"); - - // Reconstruct FP16 matrix, then cuBLAS - - if (!temp_dq) temp_dq = b->temp_dq; - b->reconstruct(temp_dq); - - //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH); - - const half alpha = __float2half(1.0f); - const half beta = clear ? __float2half(0.0f) : __float2half(1.0f); - cublasHgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_n, size_m, size_k, - &alpha, temp_dq, size_n, - a, size_k, - &beta, c, size_n); - - //const float alpha = 1.0f; - //const float beta = clear ? 0.0f : 1.0f; - //cublasSgemmEx(cublas_handle, - // CUBLAS_OP_N, - // CUBLAS_OP_N, - // size_n, size_m, size_k, - // &alpha, temp_dq, CUDA_R_16F, size_n, - // a, CUDA_R_16F, size_k, - // &beta, c, CUDA_R_16F, size_n); - - //const float alpha = 1.0f; - //const float beta = clear ? 
0.0f : 1.0f; - //cublasGemmEx(cublas_handle, - // CUBLAS_OP_N, CUBLAS_OP_N, - // size_n, size_m, size_k, - // &alpha, temp_dq, CUDA_R_16F, size_n, - // a, CUDA_R_16F, size_k, - // &beta, c, CUDA_R_16F, size_n, - // CUDA_R_16F, CUBLAS_GEMM_DFALT_TENSOR_OP); - } - else - { - //printf("cuda\n"); - - // Quantized matmul - - //if (clear) clear_tensor_cuda(c, size_m, size_n); - - int max_chunks = size_m / BLOCK_M_SIZE_MAX; - int last_chunk = max_chunks * BLOCK_M_SIZE_MAX; - int last_chunk_size = size_m - last_chunk; - - if (max_chunks) - { - gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, BLOCK_M_SIZE_MAX, clear); - } - - if (last_chunk_size) - { - gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear); - } - } -} - -__global__ void clear_kernel -( - half* __restrict__ c, - const int size_m, - const int size_n -) -{ - int m = blockIdx.y; - int n = (blockIdx.x * CLEAR_N_SIZE + threadIdx.x) * 8; - if (n >= size_n) return; - int4* c_ptr = (int4*)(c + m * size_n + n); - *c_ptr = {}; -} - -void clear_tensor_cuda -( - half* c, - int size_m, - int size_n -) -{ - return; - dim3 blockDim, gridDim; - blockDim.x = CLEAR_N_SIZE; - blockDim.y = 1; - gridDim.x = DIVIDE(size_n / 8, CLEAR_N_SIZE); - gridDim.y = size_m; - clear_kernel<<>>(c, size_m, size_n); -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh deleted file mode 100644 index c69f1a709..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _q_gemm_cuh -#define _q_gemm_cuh - -#include -#include -#include -#include -#include - -#include "q_matrix.cuh" - -void gemm_half_q_half_cuda -( - cublasHandle_t cublas_handle, - const half* a, - QMatrix* b, - half* c, - int size_m, - int size_n, - int size_k, - bool clear = false, - half* reconstruct = NULL, - bool force_cuda = false -); - -void clear_tensor_cuda -( - half* c, - int size_m, - int size_n -); - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh deleted file mode 100644 index 04643f65b..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh +++ /dev/null @@ -1,484 +0,0 @@ -#include "compat.cuh" - -__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); -} - -__forceinline__ __device__ half2 dot22_16(half2(&dq)[8], const half* a_ptr, const half2 g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); -} - -__forceinline__ __device__ half2 dot22_32(half2(&dq)[16], const half* a_ptr, const half2 g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); - return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); -} - -__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr, const float 
g_result, const float qs_f) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); - return fma(result_f, qs_f, g_result); -} - -__forceinline__ __device__ float dot22_16_f(half2(&dq)[8], const half* a_ptr, const float g_result, const float qs_f) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); - float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); - return fma(result_f, qs_f, g_result); -} - -__forceinline__ __device__ float dot22_32_f(half2(&dq)[16], const half* a_ptr, const float g_result, const float qs_f) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); - float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); - return fma(result_f, qs_f, g_result); -} - - - -typedef void (*fp_gemm_half_q_half_kernel) -( - const half*, - const uint32_t*, - const uint32_t*, - const half*, - half*, - const int, - const int, - const int, - const int, - const int, - const uint16_t*, - const int, - const int, - const int, - const int, - const int, - const int, - const bool -); - -template -__global__ void gemm_half_q_half_kernel -( - const half* __restrict__ a, - const uint32_t* __restrict__ b_q_weight, - const uint32_t* __restrict__ b_q_scale, - const half* __restrict__ b_q_scale_max, - half* __restrict__ c, - const int size_m, - const int size_n, - const int size_k, - const int groups, - const int groupsize, - const uint16_t* __restrict__ b_q_perm, - const int rows_8, - const int rows_6, - const int rows_5, - const int rows_4, - const int rows_3, - const int rows_2, - const bool clear -) -{ - MatrixView_half a_(a, size_m, size_k); - MatrixView_half_rw c_(c, size_m, size_n); - MatrixView_q4_row b_q_scale_(b_q_scale, groups, size_n); - - int t = threadIdx.x; - - // Block - - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; - - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - int n = offset_n + t * 4; - - // Preload block_a - - __shared__ half block_a[m_count][BLOCK_KN_SIZE]; - - if (offset_k + t < end_k) - { - for (int m = 0; m < m_count; ++m) - { - const half* a_ptr = a_.item_ptr(offset_m + m, 0); - half* block_a_ptr = block_a[m]; - half a0 = a_ptr[b_q_perm[offset_k + t]]; - block_a_ptr[t] = a0; - } - } - - // Clear - - if (n >= size_n) return; - - if (clear && blockIdx.z == 0) // && (threadIdx.x & 1) == 0) - { - for (int m = 0; m < m_count; m++) - *((uint64_t*) c_.item_ptr(offset_m + m, n)) = 0; - } - - __syncthreads(); - - // Find initial group - - int group = offset_k / groupsize; - - // Preload scales - - float scales[MAX_GROUPS_IN_BLOCK][4]; - - int groups_in_block = DIVIDE((end_k - offset_k), groupsize); - for (int g = 0; g < groups_in_block; g++) - { - int qscales[4]; - b_q_scale_.item4(qscales, group + g, n); - qscales[0]++; - qscales[1]++; - qscales[2]++; - qscales[3]++; - float maxscale = __half2float(b_q_scale_max[group + g]); - scales[g][0] = __int2float_rn(qscales[0] * qscales[0]) * maxscale; - scales[g][1] = 
__int2float_rn(qscales[1] * qscales[1]) * maxscale; - scales[g][2] = __int2float_rn(qscales[2] * qscales[2]) * maxscale; - scales[g][3] = __int2float_rn(qscales[3] * qscales[3]) * maxscale; - } - - // a, b offset - - int pre_rows_8 = min(rows_8, offset_k); - int pre_rows_6 = offset_k > rows_8 ? min(rows_6, offset_k) - rows_8 : 0; - int pre_rows_5 = offset_k > rows_6 ? min(rows_5, offset_k) - rows_6 : 0; - int pre_rows_4 = offset_k > rows_5 ? min(rows_4, offset_k) - rows_5 : 0; - int pre_rows_3 = offset_k > rows_4 ? min(rows_3, offset_k) - rows_4 : 0; - int pre_rows_2 = offset_k > rows_3 ? min(rows_2, offset_k) - rows_3 : 0; - int qk = 0; - qk += pre_rows_8 / 32 * 8; - qk += pre_rows_6 / 32 * 6; - qk += pre_rows_5 / 32 * 5; - qk += pre_rows_4 / 32 * 4; - qk += pre_rows_3 / 32 * 3; - qk += pre_rows_2 / 32 * 2; - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - const half* a_ptr = &block_a[0][0]; - int a_stride = BLOCK_KN_SIZE; - - // Initial group - - int scales_idx = 0; - float qs_f0 = scales[scales_idx][0]; - float qs_f1 = scales[scales_idx][1]; - float qs_f2 = scales[scales_idx][2]; - float qs_f3 = scales[scales_idx][3]; - int nextgroup = offset_k + groupsize; - - // Column result - - float block_c[m_count][4] = {}; - - // Dequantize groups - - int k = offset_k; - - while (k < rows_8 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 4; j++) - { - int4 load_int4[2]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][4]; - dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n); - dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n); - dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n); - dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_8_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_8_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_8_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_8_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 8; - } - k += 32; - } - - while (k < rows_6 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 2; j++) - { - int4 load_int4[3]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][8]; - dequant_6bit_16(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n); - dequant_6bit_16(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n); - dequant_6bit_16(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n); - dequant_6bit_16(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_16_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_16_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_16_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = 
dot22_16_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 16; - } - k += 32; - } - - while (k < rows_5 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 1; j++) - { - int4 load_int4[5]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[3] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[4] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][16]; - dequant_5bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, load_int4[3].x, load_int4[4].x, dq[0], size_n); - dequant_5bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, load_int4[3].y, load_int4[4].y, dq[1], size_n); - dequant_5bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, load_int4[3].z, load_int4[4].z, dq[2], size_n); - dequant_5bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, load_int4[3].w, load_int4[4].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_32_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_32_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_32_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_32_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 32; - } - - k += 32; - } - - while (k < rows_4 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 4; j++) - { - int4 load_int4[1]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][4]; - dequant_4bit_8(load_int4[0].x, dq[0], size_n); - dequant_4bit_8(load_int4[0].y, dq[1], size_n); - dequant_4bit_8(load_int4[0].z, dq[2], size_n); - dequant_4bit_8(load_int4[0].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_8_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_8_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_8_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_8_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 8; - } - k += 32; - } - - while (k < rows_3 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 1; j++) - { - int4 load_int4[3]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][16]; - dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n); - dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n); - dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n); - dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_32_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = 
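Because `blockIdx.z` splits the K dimension into `BLOCK_KN_SIZE` slices, every block produces only a partial sum, and the accumulation further down merges slices through packed fp16 atomics (after the `clear` pass has zeroed the thread's four output halves with one 64-bit store). A minimal sketch of that merge, with a hypothetical helper name:

    #include <cuda_fp16.h>

    // Native atomicAdd on half2 requires sm_60 or newer; two atomics cover
    // the four adjacent output columns owned by one thread.
    __device__ void merge_partial4(half2* out, const float c[4])
    {
        atomicAdd(out,     __halves2half2(__float2half_rn(c[0]), __float2half_rn(c[1])));
        atomicAdd(out + 1, __halves2half2(__float2half_rn(c[2]), __float2half_rn(c[3])));
    }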
dot22_32_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_32_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_32_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 32; - } - k += 32; - } - - while (k < rows_2 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 2; j++) - { - int4 load_int4[1]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][8]; - dequant_2bit_16(load_int4[0].x, dq[0], size_n); - dequant_2bit_16(load_int4[0].y, dq[1], size_n); - dequant_2bit_16(load_int4[0].z, dq[2], size_n); - dequant_2bit_16(load_int4[0].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_16_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_16_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_16_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_16_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - - a_ptr += 16; - } - k += 32; - } - - // Accumulate column sums in c - - for (int m = 0; m < m_count; m++) - { - half2* out = (half2*)c_.item_ptr(offset_m + m, n); - half2 result01 = __halves2half2(__float2half_rn(block_c[m][0]), __float2half_rn(block_c[m][1])); - half2 result23 = __halves2half2(__float2half_rn(block_c[m][2]), __float2half_rn(block_c[m][3])); - atomicAdd(out , result01); - atomicAdd(out + 1, result23); - } -} - -fp_gemm_half_q_half_kernel pick_gemm_half_q_half_kernel(bool first_block, const int m_count) -{ - #if BLOCK_M_SIZE_MAX >= 1 - if (m_count == 1) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 2 - if (m_count == 2) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 3 - if (m_count == 3) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 4 - if (m_count == 4) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 5 - if (m_count == 5) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 6 - if (m_count == 6) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 7 - if (m_count == 7) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 8 - if (m_count == 8) return gemm_half_q_half_kernel; - #endif - return NULL; -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh deleted file mode 100644 index ebaa42d05..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh +++ /dev/null @@ -1,219 +0,0 @@ -#include "compat.cuh" - -__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hadd2(result, g_result); -} - -__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __half2float(__low2half(result)) + __half2float(__high2half(result)); -} - -typedef void (*fp_gemm_half_q_half_gptq_kernel) -( - const half*, - const uint32_t*, - const 
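`pick_gemm_half_q_half_kernel` maps the runtime row count onto one compile-time instantiation of the templated kernel above; a minimal sketch of that dispatch pattern, assuming `m_count` is the kernel's template parameter (stand-in names throughout):

    #include <cuda_fp16.h>

    template <int m_count>
    __global__ void k(const half* a, half* c, int size_n)
    {
        // hypothetical stand-in: each instantiation unrolls over m_count rows
    }

    typedef void (*fp_k)(const half*, half*, int);

    fp_k pick(const int m_count)
    {
        switch (m_count)
        {
            case 1: return k<1>;
            case 2: return k<2>;
            case 3: return k<3>;
            case 4: return k<4>; // ... continue up to BLOCK_M_SIZE_MAX
            default: return NULL;
        }
    }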
uint32_t*, - const half*, - half*, - const int, - const int, - const int, - const int, - const int, - const uint16_t*, - const int, - const bool -); - -template -__global__ void gemm_half_q_half_gptq_kernel -( - const half* __restrict__ a, - const uint32_t* __restrict__ b_q_weight, - const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - half* __restrict__ c, - const int size_m, - const int size_n, - const int size_k, - const int groups, - const int groupsize, - const uint16_t* __restrict__ b_q_perm, - const int rows_4, - const bool clear -) -{ - MatrixView_half a_(a, size_m, size_k); - MatrixView_half_rw c_(c, size_m, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int t = threadIdx.x; - - // Block - - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; - - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - int n = offset_n + t * 4; - - // Preload block_a - - __shared__ half block_a[m_count][BLOCK_KN_SIZE]; - - if (offset_k + t < end_k) - { - for (int m = 0; m < m_count; ++m) - { - const half* a_ptr = a_.item_ptr(offset_m + m, 0); - half* block_a_ptr = block_a[m]; - - half a0; - if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; - else a0 = a_ptr[offset_k + t]; - block_a_ptr[t] = a0; - } - } - - // Zero output - - if (n >= size_n) return; - - if (clear && blockIdx.z == 0) // && (threadIdx.x & 1) == 0) - { - for (int m = 0; m < m_count; m++) - *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; - } - - __syncthreads(); - - // Find initial group - - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // a, b offset - - int qk = offset_k / (32 / 4); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - const half* a_ptr = &block_a[0][0]; - int a_stride = BLOCK_KN_SIZE; - - // Initial group - - int zeros[4]; - float scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_f(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - -// __syncthreads(); - - // Column result - - float block_c[m_count][4] = {}; - - // Dequantize and multiply - - int k = offset_k; - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_f(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - } - - #pragma unroll - for (int j = 0; j < 4; j++) - { - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - half2 dq[4][4]; - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); - - #pragma unroll - for (int m = 0; m < m_count; m++) - { - 
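The `dot22_8_f` calls just below follow the scheme set up at the top of this file: keep all eight products in packed fp16 FMAs, collapse to fp32 once, and apply the per-group scale in fp32 where it is cheap and accurate. An annotated restatement:

    #include <cuda_fp16.h>

    __device__ float dot8_scaled(const half2 (&dq)[4], const half* a, float acc, float scale)
    {
        half2 sum = {};                        // two fp16 lanes of partial sums
        const half2* a2 = (const half2*)a;
        #pragma unroll
        for (int i = 0; i < 4; i++)
            sum = __hfma2(dq[i], a2[i], sum);  // eight multiplies in four packed FMAs
        float s = __half2float(__low2half(sum)) + __half2float(__high2half(sum));
        return fma(s, scale, acc);             // scale and accumulate in fp32
    }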
block_c[m][0] = fma(dot22_8_f(dq[0], a_ptr + m * a_stride), scales[0], block_c[m][0]); - block_c[m][1] = fma(dot22_8_f(dq[1], a_ptr + m * a_stride), scales[1], block_c[m][1]); - block_c[m][2] = fma(dot22_8_f(dq[2], a_ptr + m * a_stride), scales[2], block_c[m][2]); - block_c[m][3] = fma(dot22_8_f(dq[3], a_ptr + m * a_stride), scales[3], block_c[m][3]); - } - - b_ptr += size_n; - a_ptr += 8; - } - - k += 32; - } - - for (int m = 0; m < m_count; m++) - { - half2 *out = (half2*) c_.item_ptr(offset_m + m, n); - half2 result01 = __halves2half2(__float2half_rn(block_c[m][0]), __float2half_rn(block_c[m][1])); - half2 result23 = __halves2half2(__float2half_rn(block_c[m][2]), __float2half_rn(block_c[m][3])); - atomicAdd(out , result01); - atomicAdd(out + 1, result23); - } -} - -fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(bool first_block, const int m_count) -{ - #if BLOCK_M_SIZE_MAX >= 1 - if (m_count == 1) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 2 - if (m_count == 2) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 3 - if (m_count == 3) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 4 - if (m_count == 4) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 5 - if (m_count == 5) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 6 - if (m_count == 6) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 7 - if (m_count == 7) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 8 - if (m_count == 8) return gemm_half_q_half_gptq_kernel; - #endif - return NULL; -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu deleted file mode 100644 index 6aed74705..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu +++ /dev/null @@ -1,623 +0,0 @@ -#include "q_matrix.cuh" -#include "matrix_view.cuh" -#include "util.cuh" - -#include "quant/qdq_2.cuh" -#include "quant/qdq_3.cuh" -#include "quant/qdq_4.cuh" -#include "quant/qdq_5.cuh" -#include "quant/qdq_6.cuh" -#include "quant/qdq_8.cuh" - -#define BLOCK_KN_SIZE 128 - -#define THREADS_X 32 -#define THREADS_Y 32 - -// Shuffle quantized data on load - -__global__ void shuffle_kernel -( - uint32_t* __restrict__ b_q_weight, - const int size_k, - const int size_n, - const int rows_8, - const int rows_6, - const int rows_5, - const int rows_4, - const int rows_3, - const int rows_2 -) -{ - int n = blockIdx.x * THREADS_X + threadIdx.x; - if (n >= size_n) return; - int k = 0; - uint32_t* b_ptr = b_q_weight + n; - while (k < rows_8) { shuffle_8bit_4 (b_ptr, size_n); b_ptr += 1 * size_n; k += 4; } - while (k < rows_6) { shuffle_6bit_16(b_ptr, size_n); b_ptr += 3 * size_n; k += 16; } - while (k < rows_5) { shuffle_5bit_32(b_ptr, size_n); b_ptr += 5 * size_n; k += 32; } - while (k < rows_4) { shuffle_4bit_8 (b_ptr, size_n); b_ptr += 1 * size_n; k += 8; } - while (k < rows_3) { shuffle_3bit_32(b_ptr, size_n); b_ptr += 3 * size_n; k += 32; } - while (k < rows_2) { shuffle_2bit_16(b_ptr, size_n); b_ptr += 1 * size_n; k += 16; } -} - - -// QMatrix constructor - -QMatrix::QMatrix -( - const int _device, - const int _height, - const int _width, - const int _groups, - - uint32_t* _q_weight, - uint16_t* _q_perm, - uint16_t* _q_invperm, - uint32_t* _q_scale, - half* _q_scale_max, - uint16_t* _q_groups, - - uint32_t* _gptq_qzeros, - half* _gptq_scales, - uint32_t* _gptq_g_idx, - - half* _temp_dq -) : - 
device(_device), - height(_height), - width(_width), - groups(_groups), - temp_dq(_temp_dq) -{ - cudaSetDevice(device); - - failed = false; - - cuda_q_weight = _q_weight; - cuda_q_perm = _q_perm; - cuda_q_invperm = _q_invperm; - cuda_q_scale = _q_scale; - cuda_q_scale_max = _q_scale_max; - cuda_q_groups = _q_groups; - cuda_gptq_qzeros = _gptq_qzeros; - cuda_gptq_scales = _gptq_scales; - - is_gptq = (_gptq_qzeros != NULL); - - groupsize = 1; - while (groupsize * groups < height) groupsize *= 2; - - // Create group map - - rows_8 = 0; - rows_6 = 0; - rows_5 = 0; - rows_4 = 0; - rows_3 = 0; - rows_2 = 0; - - if (!is_gptq) - { - uint16_t* cpu_q_groups = (uint16_t*)calloc(groups * 2, sizeof(uint16_t)); - cudaMemcpy(cpu_q_groups, cuda_q_groups, groups * 2 * sizeof(uint16_t), cudaMemcpyDeviceToHost); - - for (int i = 0; i < groups; i++) - { - int bits = cpu_q_groups[i * 2]; - if (bits == 8) rows_8 += groupsize; - if (bits == 6) rows_6 += groupsize; - if (bits == 5) rows_5 += groupsize; - if (bits == 4) rows_4 += groupsize; - if (bits == 3) rows_3 += groupsize; - if (bits == 2) rows_2 += groupsize; - } - - free(cpu_q_groups); - - rows_6 += rows_8; - rows_5 += rows_6; - rows_4 += rows_5; - rows_3 += rows_4; - rows_2 += rows_3; - } - else - { - rows_4 = height; - rows_3 = height; - rows_2 = height; - - if (_gptq_g_idx) - { - if (!make_sequential(_gptq_g_idx)) - { - failed = true; - //printf("FAIL\n"); - return; - } - } - } - - // Shuffle quantized data - - dim3 blockDim, gridDim; - blockDim.x = THREADS_X; - blockDim.y = 1; - gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = 1; - - shuffle_kernel<<>>(cuda_q_weight, height, width, rows_8, rows_6, rows_5, rows_4, rows_3, rows_2); -} - -QMatrix::~QMatrix() -{ -} - -// Reconstruct b[k,n] (GPTQ) - -__global__ void reconstruct_gptq_kernel -( - const uint32_t* __restrict__ b_q_weight, - const uint16_t* __restrict__ b_q_perm, - const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - //const uint16_t* __restrict__ b_q_groups, - const int size_k, - const int size_n, - const int groupsize, - const int groups, - half* __restrict__ b, - const int rows_4 -) -{ - MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - // Preload remapping table - - __shared__ uint16_t perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; - - if (b_q_perm) - { - if (offset_k + t < size_k) - perm[t] = b_q_perm[offset_k + t]; - } - - // Column - - int n = offset_n + t * 4; - if (n >= size_n) return; - - // Find initial group - - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // b offset - - int qk = offset_k / (32 / 4); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - - // Initial zeros/scale - - int zeros[4]; - half2 scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - - __syncthreads(); - - int k = offset_k; - int lk = 0; - - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - 
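Note that the `rows_*` members built in the constructor above are cumulative boundaries, not per-width counts: after the `rows_6 += rows_8; ...` chain, `rows_b` is the first row index past the last b-bit row. For example, with `groupsize = 32` and per-group widths {8, 6, 4, 4}:

    rows_8 = 32, rows_6 = 64, rows_5 = 64, rows_4 = 128, rows_3 = 128, rows_2 = 128

so every `while (k < rows_b && ...)` loop in the kernels simply falls through for bit widths that never occur.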
b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - } - - for (int p = 0; p < 4; p++) - { - half2 dq[4][4]; - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); - - b_ptr += size_n; - //half* dqh = (half*)dq; - if (b_q_perm) - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - else - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - } - k += 32; - } -} - - -// Reconstruct b[k,n] - -__global__ void reconstruct_kernel -( - const uint32_t* __restrict__ b_q_weight, - const uint16_t* __restrict__ b_q_perm, - const uint32_t* __restrict__ b_q_scale, - const half* __restrict__ b_q_scale_max, - //const uint16_t* __restrict__ b_q_groups, - const int size_k, - const int size_n, - const int groupsize, - const int groups, - half* __restrict__ b, - const int rows_8, - const int rows_6, - const int rows_5, - const int rows_4, - const int rows_3, - const int rows_2 -) -{ - MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q4_row b_q_scale_(b_q_scale, groups, size_n); - - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x; - - // Preload remapping table - - int t = threadIdx.x; - __shared__ uint16_t perm[BLOCK_KN_SIZE]; - if (offset_k + t < size_k) - perm[t] = b_q_perm[offset_k + t]; - - // Column - - int n = offset_n + t; - if (n >= size_n) return; - - // Find initial group - - int group = offset_k / groupsize; - - int pre_rows_8 = min(rows_8, offset_k); - int pre_rows_6 = offset_k > rows_8 ? min(rows_6, offset_k) - rows_8 : 0; - int pre_rows_5 = offset_k > rows_6 ? min(rows_5, offset_k) - rows_6 : 0; - int pre_rows_4 = offset_k > rows_5 ? min(rows_4, offset_k) - rows_5 : 0; - int pre_rows_3 = offset_k > rows_4 ? min(rows_3, offset_k) - rows_4 : 0; - int pre_rows_2 = offset_k > rows_3 ? 
min(rows_2, offset_k) - rows_3 : 0; - int qk = 0; - qk += pre_rows_8 / 32 * 8; - qk += pre_rows_6 / 32 * 6; - qk += pre_rows_5 / 32 * 5; - qk += pre_rows_4 / 32 * 4; - qk += pre_rows_3 / 32 * 3; - qk += pre_rows_2 / 32 * 2; - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - - half qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); - half2 qs_h2 = __halves2half2(qs_h, qs_h); - int nextgroup = offset_k + groupsize; - - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - int k = offset_k; - int lk = 0; - - __syncthreads(); - - while (k < rows_8 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 4; p++) - { - half2 dq[4]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - uint32_t q_1 = *b_ptr; b_ptr += size_n; - dequant_8bit_8(q_0, q_1, dq, size_n); - for (int j = 0; j < 4; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 8; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_6 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 2; p++) - { - half2 dq[8]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - uint32_t q_1 = *b_ptr; b_ptr += size_n; - uint32_t q_2 = *b_ptr; b_ptr += size_n; - dequant_6bit_16(q_0, q_1, q_2, dq, size_n); - for (int j = 0; j < 8; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 16; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_5 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 1; p++) - { - half2 dq[16]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - uint32_t q_1 = *b_ptr; b_ptr += size_n; - uint32_t q_2 = *b_ptr; b_ptr += size_n; - uint32_t q_3 = *b_ptr; b_ptr += size_n; - uint32_t q_4 = *b_ptr; b_ptr += size_n; - dequant_5bit_32(q_0, q_1, q_2, q_3, q_4, dq, size_n); - for (int j = 0; j < 16; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 32; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_4 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 4; p++) - { - half2 dq[4]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - dequant_4bit_8(q_0, dq, size_n); - for (int j = 0; j < 4; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 8; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_3 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 1; p++) - { - half2 dq[16]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - uint32_t q_1 = *b_ptr; b_ptr += size_n; - uint32_t q_2 = *b_ptr; b_ptr += size_n; - dequant_3bit_32(q_0, q_1, q_2, dq, size_n); - for (int j = 0; j < 16; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 32; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_2 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = 
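This reconstruction path exists because, past a modest batch size, it is usually cheaper to dequantize the weights once into a scratch fp16 matrix and run a plain GEMM than to redo the dequantization inside a fused kernel. A hedged sketch of the consumer (assuming row-major operands and the usual swap for column-major cuBLAS):

    #include <cublas_v2.h>
    #include <cuda_fp16.h>

    // C[m,n] = A[m,k] * B[k,n], all row-major fp16: compute C^T = B^T * A^T
    // in cuBLAS's column-major view.
    void gemm_fp16(cublasHandle_t h, const half* a, const half* b_dq, half* c,
                   int m, int n, int k)
    {
        const half alpha = __float2half(1.0f);
        const half beta  = __float2half(0.0f);
        cublasHgemm(h, CUBLAS_OP_N, CUBLAS_OP_N,
                    n, m, k,
                    &alpha, b_dq, n, a, k,
                    &beta, c, n);
    }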
dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 2; p++) - { - half2 dq[8]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - dequant_2bit_16(q_0, dq, size_n); - for (int j = 0; j < 8; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 16; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } -} - -void QMatrix::reconstruct(half* out) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - gridDim.y = DIVIDE(height, BLOCK_KN_SIZE); - - if (!is_gptq) - { - gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); - reconstruct_kernel<<>> - ( - cuda_q_weight, - cuda_q_perm, - cuda_q_scale, - cuda_q_scale_max, - //cuda_q_groups, - height, - width, - groupsize, - groups, - out, - rows_8, - rows_6, - rows_5, - rows_4, - rows_3, - rows_2 - ); - } - else - { - gridDim.x = DIVIDE(width, BLOCK_KN_SIZE * 4); - reconstruct_gptq_kernel<<>> - ( - cuda_q_weight, - cuda_q_perm, - cuda_gptq_qzeros, - cuda_gptq_scales, - //const uint16_t* __restrict__ b_q_groups, - height, - width, - groupsize, - groups, - out, - rows_4 - ); - } -} - -__global__ void make_sequential_kernel -( - const uint32_t* __restrict__ w, - uint32_t* __restrict__ w_new, - const uint16_t* __restrict__ q_perm, - const int w_height, - const int w_width -) -{ - const uint64_t* w2 = (uint64_t*) w; - uint64_t* w_new2 = (uint64_t*) w_new; - int w2_stride = w_width >> 1; - - int w2_column = THREADS_X * blockIdx.x + threadIdx.x; - if (w2_column >= w2_stride) return; - - int w_new2_row = blockIdx.y; - - int q_perm_idx = w_new2_row << 3; - - uint64_t dst = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - int source_row = q_perm[q_perm_idx++]; - - int w2_row = source_row >> 3; - int w2_subrow = source_row & 0x07; - int w2_row_shift = w2_subrow << 2; - int wnew2_row_shift = i << 2; - - uint64_t src = w2[w2_row * w2_stride + w2_column]; - src >>= w2_row_shift; - src &= 0x0000000f0000000f; - src <<= wnew2_row_shift; - dst |= src; - } - - w_new2[w_new2_row * w2_stride + w2_column] = dst; -} - -bool QMatrix::make_sequential(const uint32_t* cpu_g_idx) -{ - uint32_t* cuda_new_qweight = NULL; - cudaError_t err = cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t)); - if (err != cudaSuccess) { - cudaError_t cuda_status = cudaGetLastError(); // Clear error - return false; - } - - uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t)); - uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t)); - uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t)); - - // Group histogram - - for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++; - - // Group map - - for (int i = 0, acc = 0; i < groups; i++) - { - short tmp = cpu_g_idx_map[i]; - cpu_g_idx_map[i] = acc; - acc += tmp; - } - - // X map (inverse) - - for (int row = 0; row < height; row++) - { - uint32_t target_group = cpu_g_idx[row]; - uint32_t target_row = cpu_g_idx_map[target_group]; - cpu_g_idx_map[target_group]++; - cpu_x_map_inv[row] = target_row; - } - - // X map - - for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row; - - // Reduce to uint16_t - - uint16_t* cpu_x_map16 = (uint16_t*)cpu_x_map; - uint16_t* cpu_x_map_inv16 = (uint16_t*)cpu_x_map_inv; - for (int row = 0; row < height; row++) cpu_x_map16[row] = (uint16_t) cpu_x_map[row]; - for (int row = 0; row < height; row++) cpu_x_map_inv16[row] = (uint16_t) cpu_x_map_inv[row]; - - // Move to CUDA - - cudaMemcpyAsync(cuda_q_perm, 
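`make_sequential` is a counting sort over the GPTQ `g_idx` array: histogram the group ids, prefix-sum the histogram into starting offsets, then scatter each row to the next free slot of its group. The CPU side in miniature (hypothetical helper):

    #include <cstdint>
    #include <vector>

    // Returns inv with inv[old_row] = new_row, so that rows of each group end
    // up contiguous in ascending group order (cpu_x_map_inv above).
    std::vector<uint32_t> act_order_perm(const std::vector<uint32_t>& g_idx, int groups)
    {
        std::vector<uint32_t> start(groups + 1, 0);
        for (uint32_t g : g_idx) start[g + 1]++;                   // histogram
        for (int g = 0; g < groups; g++) start[g + 1] += start[g]; // prefix sum
        std::vector<uint32_t> inv(g_idx.size());
        for (size_t r = 0; r < g_idx.size(); r++)
            inv[r] = start[g_idx[r]]++;                            // stable scatter
        return inv;
    }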
cpu_x_map16, height * sizeof(uint16_t), cudaMemcpyHostToDevice); - cudaMemcpyAsync(cuda_q_invperm, cpu_x_map_inv16, height * sizeof(uint16_t), cudaMemcpyHostToDevice); - - // Rearrange rows in w - - dim3 blockDim, gridDim; - blockDim.x = THREADS_X; - blockDim.y = 1; - gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = height / 8; - - make_sequential_kernel<<>> - ( - cuda_q_weight, - cuda_new_qweight, - cuda_q_perm, - height / 8, - width - ); - - // Replace qweights - - cudaMemcpyAsync(cuda_q_weight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); - - // Cleanup - - cudaDeviceSynchronize(); - - cudaFree(cuda_new_qweight); - free(cpu_g_idx_map); - free(cpu_x_map); - free(cpu_x_map_inv); - - return true; -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh deleted file mode 100644 index dda83a4f3..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef _q_matrix_cuh -#define _q_matrix_cuh - -#include -#include -#include -#include - -#define MAX_SUPERGROUPS 16 - -class QMatrix -{ -public: - - int device; - bool is_gptq; - - int height; - int width; - int groups; - int groupsize; - - int rows_8; - int rows_6; - int rows_5; - int rows_4; - int rows_3; - int rows_2; - - uint32_t* cuda_q_weight = NULL; - uint16_t* cuda_q_perm = NULL; - uint16_t* cuda_q_invperm = NULL; - uint32_t* cuda_q_scale = NULL; - half* cuda_q_scale_max = NULL; - uint16_t* cuda_q_groups = NULL; - uint32_t* cuda_gptq_qzeros = NULL; - half* cuda_gptq_scales = NULL; - - half* temp_dq; - - bool failed; - - QMatrix - ( - const int _device, - const int _height, - const int _width, - const int _groups, - - uint32_t* _q_weight, - uint16_t* _q_perm, - uint16_t* _q_invperm, - uint32_t* _q_scale, - half* _q_scale_max, - uint16_t* _q_groups, - - uint32_t* _gptq_qzeros, - half* _gptq_scales, - uint32_t* _gptq_g_idx, - - half* _temp_dq - ); - - ~QMatrix(); - - void reconstruct(half* out); - bool make_sequential(const uint32_t* cpu_g_idx); - -private: - -}; - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cu deleted file mode 100644 index 87fe8ef3f..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cu +++ /dev/null @@ -1,162 +0,0 @@ -#include "q_mlp.cuh" -#include "q_gemm.cuh" -#include "rms_norm.cuh" -#include "util.cuh" -#include "matrix_view.cuh" -#include "lora.cuh" - -#if defined(USE_ROCM) -__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { - return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half2_raw>(x).data.x)), - static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half2_raw>(x).data.y))}; -} -#define h2rcp __compat_h2rcp -#endif - -const int THREADS_X = 32; -const int THREADS_Y = 4; -// const int MAX_DIMENSION = 8192; - -__device__ __forceinline__ half silu(half x) -{ - half one = __float2half(1.0f); - half neg_x = __hneg(x); - half e = hexp(neg_x); - half sum = __hadd(one, e); - half r = hrcp(sum); - half result = __hmul(x, r); - return result; -} - -__device__ __forceinline__ half2 silu(half2 x) -{ - half2 one = __float2half2_rn(1.0f); - half2 neg_x = __hneg2(x); - half2 e = h2exp(neg_x); - half2 sum = __hadd2(one, e); - half2 r = h2rcp(sum); - half2 result = __hmul2(x, r); - return result; -} - -typedef void (*fp_silu_mul_kernel) -( - half*, - const half*, - const int, - const int -); - -template 
-__global__ void silu_mul_kernel -( - half* __restrict__ x, - const half* __restrict__ y, - const int height, - const int width -) -{ - MatrixView_half_rw x_(x, height, width); - MatrixView_half y_(y, height, width); - - int column = (THREADS_X * blockIdx.x + threadIdx.x); if constexpr (use_half2) column *= 2; - int row = THREADS_Y * blockIdx.y + threadIdx.y; - if (row >= height) return; - - // silu(x) * y - - if constexpr (use_half2) - { - half2 one = __half2half2(__float2half(1.0f)); - - half2 x_item = x_.item_half2(row, column); - half2 y_item = y_.item_half2(row, column); - - x_item = silu(x_item); - x_item = __hmul2(x_item, y_item); - - x_.set_half2(row, column, x_item); - } - else - { - half one = __float2half(1.0f); - - half x_item = x_.item(row, column); - half y_item = y_.item(row, column); - - x_item = silu(x_item); - x_item = __hmul(x_item, y_item); - - x_.set(row, column, x_item); - } -} - -fp_silu_mul_kernel pick_silu_mul_kernel(bool use_half2) -{ - if (use_half2) return silu_mul_kernel; - else return silu_mul_kernel; -}; - - -QMLP::QMLP -( - half* _layernorm, - float _norm_epsilon, - QMatrix* _gate, - QMatrix* _up, - QMatrix* _down, - half* _temp_state, - half* _temp_a, - half* _temp_b, - half* _temp_dq, - int _max_rows -): - layernorm(_layernorm), - norm_epsilon(_norm_epsilon), - gate(_gate), - up(_up), - down(_down), - temp_state(_temp_state), - temp_a(_temp_a), - temp_b(_temp_b), - temp_dq(_temp_dq), - max_rows(_max_rows) -{ -} - -QMLP::~QMLP() { -} - -void QMLP::forward_ -( - cublasHandle_t cublas_handle, - half* x, - int rows, - int columns, - const std::vector& loras, - half* lora_temp -) -{ - bool use_half2 = true; - int intermediate_size = gate->width; - - rms_norm_cuda(x, layernorm, temp_state, norm_epsilon, rows, columns); - gemm_half_q_half_cuda(cublas_handle, temp_state, gate, temp_a, rows, intermediate_size, columns, true, temp_dq); - gemm_half_q_half_cuda(cublas_handle, temp_state, up, temp_b, rows, intermediate_size, columns, true, temp_dq); - - apply_loras_cuda(cublas_handle, gate_proj_lora, loras, gate, temp_state, temp_a, lora_temp, rows); - apply_loras_cuda(cublas_handle, up_proj_lora, loras, up, temp_state, temp_b, lora_temp, rows); - - dim3 blockDim, gridDim; - blockDim.x = THREADS_X; - blockDim.y = THREADS_Y; - gridDim.x = DIVIDE(up->width, THREADS_X) / (use_half2 ? 
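For reference, `forward_` here is the standard gated MLP of LLaMA-family models; in symbols:

$$ \mathrm{MLP}(x) = W_{\mathrm{down}}\bigl(\operatorname{silu}(W_{\mathrm{gate}}\,\bar x)\odot W_{\mathrm{up}}\,\bar x\bigr), \qquad \bar x = \mathrm{RMSNorm}(x), \qquad \operatorname{silu}(t) = \frac{t}{1 + e^{-t}}, $$

with the element-wise $\operatorname{silu}(\cdot)\odot(\cdot)$ fused into the single `silu_mul_kernel` launch and LoRA deltas added after each quantized GEMM.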
2 : 1); - gridDim.y = DIVIDE(rows, THREADS_Y); - - fp_silu_mul_kernel kernel = pick_silu_mul_kernel(use_half2); - kernel<<>>(temp_a, temp_b, rows, intermediate_size); - - gemm_half_q_half_cuda(cublas_handle, temp_a, down, x, rows, columns, intermediate_size, false, temp_dq); - - apply_loras_cuda(cublas_handle, down_proj_lora, loras, down, temp_a, x, lora_temp, rows); -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cuh deleted file mode 100644 index ed3802559..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cuh +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _q_mlp_cuh -#define _q_mlp_cuh - -#include -#include -#include -#include -#include - -#include "q_matrix.cuh" - -class QMLP -{ -public: - - half* layernorm; - float norm_epsilon; - - QMatrix* gate; - QMatrix* up; - QMatrix* down; - - half* temp_state; - half* temp_a; - half* temp_b; - half* temp_dq; - - int device; - int max_rows; - - std::unordered_map> gate_proj_lora; - std::unordered_map> up_proj_lora; - std::unordered_map> down_proj_lora; - - QMLP - ( - half* _layernorm, - float _norm_epsilon, - QMatrix* _gate, - QMatrix* _up, - QMatrix* _down, - half* _temp_state, - half* _temp_a, - half* _temp_b, - half* _temp_dq, - int _max_rows - ); - - ~QMLP(); - - void forward_ - ( - cublasHandle_t cublas_handle, - half* x, - int rows, - int columns, - const std::vector& loras, - half* lora_temp - ); - -private: - -}; - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh deleted file mode 100644 index 3beaeefa9..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef _qdq_2_cuh -#define _qdq_2_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_2BIT == 1 - -// Permutation: -// -// ffddbb99 77553311 eeccaa88 66442200 - -__forceinline__ __device__ void shuffle_2bit_16 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0]; - uint32_t qb = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - uint32_t qa0 = qa & 0x03; - uint32_t qa1 = (qa & 0x0c) >> 2; - qa >>= 4; - qb |= (qa1 << (i * 2 + 16)); - qb |= (qa0 << (i * 2)); - } - q[0] = qb; -} - -__forceinline__ __device__ void dequant_2bit_16 -( - const uint32_t q_0, - half2 (&dq)[8], - int stride -) -{ - const uint32_t c0 = 0x64006400; - const half y4_ = __float2half_rn(1.0f / 4.0f); - const half y16_ = __float2half_rn(1.0f / 16.0f); - const half y64_ = __float2half_rn(1.0f / 64.0f); - const half2 y4 = __halves2half2(y4_, y4_); - const half2 y16 = __halves2half2(y16_, y16_); - const half2 y64 = __halves2half2(y64_, y64_); - const half z1_ = __float2half_rn(-1024.0f - 2.0f); - const half z4_ = __float2half_rn(-1024.0f / 4.0f - 2.0f); - const half z16_ = __float2half_rn(-1024.0f / 16.0f - 2.0f); - const half z64_ = __float2half_rn(-1024.0f / 64.0f - 2.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z4 = __halves2half2(z4_, z4_); - const half2 z16 = __halves2half2(z16_, z16_); - const half2 z64 = __halves2half2(z64_, z64_); - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) * 4 + 1024 - half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024 - half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024 - qa >>= 8; - half2_uint32 
q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8]) + 1024 - half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) * 4 + 1024 - half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024 - half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024 - - dq[0] = __hadd2(q0.as_half2, z1); - dq[1] = __hfma2(q1.as_half2, y4, z4); - dq[2] = __hfma2(q2.as_half2, y16, z16); - dq[3] = __hfma2(q3.as_half2, y64, z64); - dq[4] = __hadd2(q4.as_half2, z1); - dq[5] = __hfma2(q5.as_half2, y4, z4); - dq[6] = __hfma2(q6.as_half2, y16, z16); - dq[7] = __hfma2(q7.as_half2, y64, z64); -} - -#else - -__forceinline__ __device__ void shuffle_2bit_16 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_2bit_16 -( - const uint32_t q_0, - half2 (&dq)[8], - int stride -) -{ - half dqh[16]; - for (int i = 0; i < 16; i++) dqh[i] = dq_ns(exb(q_0, i * 2, 0x03), 2); - - for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -#endif - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh deleted file mode 100644 index 101173763..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh +++ /dev/null @@ -1,169 +0,0 @@ -#ifndef _qdq_3_cuh -#define _qdq_3_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_3BIT == 1 - -// Permutation: -// -// v9997775 55333111 u8886664 44222000 (u, v lsb) -// vjjjhhhf ffdddbbb uiiiggge eecccaaa -// vtttrrrp ppnnnlll usssqqqo oommmkkk - -__forceinline__ __device__ void shuffle_3bit_32 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0 * stride]; - uint32_t qb = q[1 * stride]; - uint32_t qc = q[2 * stride]; - - // qa: aa999888 77766655 54443332 22111000 - // qb: lkkkjjji iihhhggg fffeeedd dcccbbba - // qc: vvvuuutt tsssrrrq qqpppooo nnnmmmll - - uint32_t qd = qc >> 26; - qc <<= 4; - qc |= qb >> 28; - qb <<= 2; - qb |= qa >> 30; - - // qa: ..999888 77766655 54443332 22111000 - // qb: ..jjjiii hhhgggff feeedddc ccbbbaaa - // qc: ..tttsss rrrqqqpp pooonnnm mmlllkkk - // qd: vvvuuu - - uint32_t za = 0; - uint32_t zb = 0; - uint32_t zc = 0; - - for (int i = 0; i < 5; i++) { uint32_t t0 = qa & 0x07; uint32_t t1 = (qa & 0x38) >> 3; qa >>= 6; za |= (t0 << (i * 3)); za |= (t1 << (i * 3 + 16)); } - for (int i = 0; i < 5; i++) { uint32_t t0 = qb & 0x07; uint32_t t1 = (qb & 0x38) >> 3; qb >>= 6; zb |= (t0 << (i * 3)); zb |= (t1 << (i * 3 + 16)); } - for (int i = 0; i < 5; i++) { uint32_t t0 = qc & 0x07; uint32_t t1 = (qc & 0x38) >> 3; qc >>= 6; zc |= (t0 << (i * 3)); zc |= (t1 << (i * 3 + 16)); } - - // za: 9997775 55333111 8886664 44222000 - // zb: jjjhhhf ffdddbbb iiiggge eecccaaa - // zc: tttrrrp ppnnnlll sssqqqo oommmkkk - // qd: vvvuuu - - za |= ((qd & 0x01) >> 0) << 15; - zb |= ((qd & 0x02) >> 1) << 15; - zc |= ((qd & 0x04) >> 2) << 15; - za |= ((qd & 0x08) >> 3) << 31; - zb |= ((qd & 0x10) >> 4) << 31; - zc |= ((qd & 0x20) >> 5) << 31; - - // za: v9997775 55333111 u8886664 44222000 (u, v lsb) - // zb: vjjjhhhf ffdddbbb uiiiggge eecccaaa - // zc: vtttrrrp ppnnnlll usssqqqo oommmkkk - - q[0 * stride] = za; - q[1 * stride] = zb; - q[2 * stride] = zc; -} - -__forceinline__ __device__ void dequant_3bit_32 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - half2 (&dq)[16], - int stride -) -{ - const uint32_t c0 = 0x64006400; - const half y8_ = __float2half_rn(1.0f / 8.0f); - const half y64_ = __float2half_rn(1.0f / 
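Every decoder in these `qdq_*` headers leans on one fp16 identity: `0x6400` is the bit pattern of 1024.0, whose mantissa ulp is exactly 1, so OR-ing in a field whose value is $q \cdot 2^{j}$ (a b-bit quantity sitting j bits above the low mantissa bit) produces the half value $1024 + q \cdot 2^{j}$ with no conversion instruction. One packed FMA per pair then removes both the bias and the implicit zero-point:

$$ (1024 + 2^{j} q)\cdot 2^{-j} + \Bigl(-\frac{1024}{2^{j}} - z\Bigr) = q - z, \qquad z = 2^{\,b-1}, $$

which is exactly where constants such as `y8_ = 1/8` and `z8_ = -1024/8 - 4` come from; fields already sitting at the low bits (j = 0) skip the multiply and use a plain `__hadd2`.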
64.0f); - const half2 y8 = __halves2half2(y8_, y8_); - const half2 y64 = __halves2half2(y64_, y64_); - const half z1_ = __float2half_rn(-1024.0f - 4.0f); - const half z8_ = __float2half_rn(-1024.0f / 8.0f - 4.0f); - const half z64_ = __float2half_rn(-1024.0f / 64.0f - 4.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z8 = __halves2half2(z8_, z8_); - const half2 z64 = __halves2half2(z64_, z64_); - - uint32_t qa = q_0; - uint32_t qb = q_1; - uint32_t qc = q_2; - - half2_uint32 q0((qa & 0x00070007) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x00380038) | c0); // half2(q[ 2], q[ 3]) * 8 + 1024 - qa >>= 6; - half2_uint32 q2((qa & 0x00070007) | c0); // half2(q[ 4], q[ 5]) + 1024 - half2_uint32 q3((qa & 0x00380038) | c0); // half2(q[ 6], q[ 7]) * 8 + 1024 - half2_uint32 q4((qa & 0x01c001c0) | c0); // half2(q[ 8], q[ 9]) * 64 + 1024 - qa >>= 9; - qa &= 0x00010001; - half2_uint32 q5((qb & 0x00070007) | c0); // half2(q[10], q[11]) + 1024 - half2_uint32 q6((qb & 0x00380038) | c0); // half2(q[12], q[13]) * 8 + 1024 - qb >>= 6; - half2_uint32 q7((qb & 0x00070007) | c0); // half2(q[14], q[15]) + 1024 - half2_uint32 q8((qb & 0x00380038) | c0); // half2(q[16], q[17]) * 8 + 1024 - half2_uint32 q9((qb & 0x01c001c0) | c0); // half2(q[18], q[19]) * 64 + 1024 - qb >>= 8; - qb &= 0x00020002; - half2_uint32 q10((qc & 0x00070007) | c0); // half2(q[20], q[21]) + 1024 - half2_uint32 q11((qc & 0x00380038) | c0); // half2(q[22], q[23]) * 8 + 1024 - qc >>= 6; - half2_uint32 q12((qc & 0x00070007) | c0); // half2(q[24], q[25]) + 1024 - half2_uint32 q13((qc & 0x00380038) | c0); // half2(q[26], q[27]) * 8 + 1024 - half2_uint32 q14((qc & 0x01c001c0) | c0); // half2(q[28], q[29]) * 64 + 1024 - qc >>= 7; - qc &= 0x00040004; - half2_uint32 q15((qa | qb | qc) | c0); - - dq[ 0] = __hadd2( q0.as_half2, z1); - dq[ 1] = __hfma2( q1.as_half2, y8, z8); - dq[ 2] = __hadd2( q2.as_half2, z1); - dq[ 3] = __hfma2( q3.as_half2, y8, z8); - dq[ 4] = __hfma2( q4.as_half2, y64, z64); - dq[ 5] = __hadd2( q5.as_half2, z1); - dq[ 6] = __hfma2( q6.as_half2, y8, z8); - dq[ 7] = __hadd2( q7.as_half2, z1); - dq[ 8] = __hfma2( q8.as_half2, y8, z8); - dq[ 9] = __hfma2( q9.as_half2, y64, z64); - dq[10] = __hadd2(q10.as_half2, z1); - dq[11] = __hfma2(q11.as_half2, y8, z8); - dq[12] = __hadd2(q12.as_half2, z1); - dq[13] = __hfma2(q13.as_half2, y8, z8); - dq[14] = __hfma2(q14.as_half2, y64, z64); - dq[15] = __hadd2(q15.as_half2, z1); -} - -#else - -__forceinline__ __device__ void shuffle_3bit_32 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_3bit_32 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - half2 (&dq)[16], - int stride -) -{ - half dqh[32]; - for (int i = 0; i < 10; i++) dqh[ i] = dq_ns(exb( q_0, i * 3 , 0x07), 4); - dqh[10 ] = dq_ns(exb(q_1, q_0, 30, 0x07), 4); - for (int i = 0; i < 10; i++) dqh[11 + i] = dq_ns(exb( q_1, i * 3 + 1, 0x07), 4); - dqh[21 ] = dq_ns(exb(q_2, q_1, 31, 0x07), 4); - for (int i = 0; i < 10; i++) dqh[22 + i] = dq_ns(exb( q_2, i * 3 + 2, 0x07), 4); - - for (int i = 0; i < 16; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -#endif - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh deleted file mode 100644 index 5fb070d06..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh +++ /dev/null @@ -1,227 +0,0 @@ -#ifndef _qdq_4_cuh -#define _qdq_4_cuh - -#include "qdq_util.cuh" -#include 
"../../config.h" - -#if QMODE_4BIT == 1 - -// Permutation: -// -// 77775555 33331111 66664444 22220000 - -__forceinline__ __device__ void shuffle_4bit_8 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0]; - uint32_t qb = 0; - - #pragma unroll - for (int i = 0; i < 4; i++) - { - uint32_t qa0 = qa & 0x0f; - uint32_t qa1 = (qa & 0xf0) >> 4; - qa >>= 8; - qb |= (qa1 << (i * 4 + 16)); - qb |= (qa0 << (i * 4)); - } - q[0] = qb; -} - -__forceinline__ __device__ void dequant_4bit_8 -( - const uint32_t q_0, - half2 (&dq)[4], - int stride -) -{ - const uint32_t c0 = 0x64006400; - const half y16_ = __float2half_rn(1.0f / 16.0f); - const half2 y16 = __halves2half2(y16_, y16_); - const half z1_ = __float2half_rn(-1024.0f - 8.0f); - const half z16_ = __float2half_rn(-1024.0f / 16.0f - 8.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z16 = __halves2half2(z16_, z16_); - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x00f000f0) | c0); // half2(q[ 2], q[ 3]) * 16 + 1024 - qa >>= 8; - half2_uint32 q2((qa & 0x000f000f) | c0); // half2(q[ 4], q[ 5]) + 1024 - half2_uint32 q3((qa & 0x00f000f0) | c0); // half2(q[ 6], q[ 7]) * 16 + 1024 - - dq[0] = __hadd2(q0.as_half2, z1); - dq[1] = __hfma2(q1.as_half2, y16, z16); - dq[2] = __hadd2(q2.as_half2, z1); - dq[3] = __hfma2(q3.as_half2, y16, z16); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale -( - const uint32_t zero, - const half scale, - half2 (&z1z16)[2], - half2 (&y1y16)[2] -) -{ - half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); - half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - - half2 scale2 = __half2half2(scale); - - z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half)); - z1z16[1] = __hmul2(scale2, __half2half2(z16)); - - const half y1 = __float2half_rn(1.0f); - const half y16 = __float2half_rn(1.0f / 16.0f); - - y1y16[0] = __hmul2(scale2, __half2half2(y1)); - y1y16[1] = __hmul2(scale2, __half2half2(y16)); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero -( - const uint32_t zero, - half2(&z1z16)[2], - half2(&y1y16)[2] -) -{ - half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); - half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - - z1z16[0] = __half2half2(z1.as_half); - z1z16[1] = __half2half2(z16); - - const half y1 = __float2half_rn(1.0f); - const half y16 = __float2half_rn(1.0f / 16.0f); - - y1y16[0] = __half2half2(y1); - y1y16[1] = __half2half2(y16); -} - - -__forceinline__ __device__ void dequant_4bit_8_gptq -( - const uint32_t q_0, - half2 (&dq)[4], - half2 (&z1z16)[2], - half2 (&y1y16)[2], - int stride, - bool scaled -) -{ - const uint32_t c0 = 0x64006400; - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x000f000f) | c0); // half2( q[0] + 1024, q[1] + 1024 ) - half2_uint32 q1((qa & 0x00f000f0) | c0); // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 ) - qa >>= 8; - half2_uint32 q2((qa & 0x000f000f) | c0); // half2( q[4] + 1024, q[5] + 1024 ) - half2_uint32 q3((qa & 0x00f000f0) | c0); // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 ) - - if (scaled) - { - dq[0] = __hfma2(q0.as_half2, y1y16[0], z1z16[0]); // half2( q[0] * s - z * s, q[1] * s - z * s) - dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]); // half2( q[2] * s - z * s, q[3] * s - z * s) - dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]); - dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); - } - else - { - dq[0] = __hadd2(q0.as_half2, z1z16[0]); // half2( q[0] - z, q[1] - z ) - dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]); // half2( q[2] 
- z, q[3] - z ) - dq[2] = __hadd2(q2.as_half2, z1z16[0]); // half2( q[4] - z, q[5] - z ) - dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); // half2( q[6] - z, q[7] - z ) - } -} - -#else - -__forceinline__ __device__ void shuffle_4bit_8 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_4bit_8 -( - const uint32_t q_0, - half2 (&dq)[4], - int stride -) -{ - half dqh[8]; - for (int i = 0; i < 8; i++) dqh[i] = dq_ns(exb(q_0, i * 4, 0x0f), 8); - - for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale -( - const uint32_t zero, - const half scale, - half2 (&z1)[2], - half2 (&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z = __hmul(z, scale); - z1[0] = __half2half2(z); - y1[0] = __half2half2(scale); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero -( - const uint32_t zero, - half2(&z1)[2], - half2(&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z1[0] = __half2half2(z); -} - -__forceinline__ __device__ void dequant_4bit_8_gptq -( - const uint32_t q_0, - half2 (&dq)[4], - half2 (&z1)[2], - half2 (&y1)[2], - int stride, - bool scaled -) -{ - half2 dqh2[8]; - - uint32_t qa = q_0; - for (int i = 0; i < 4; i++) - { - half d0 = __int2half_rn(qa & 0x0f); qa >>= 4; - half d1 = __int2half_rn(qa & 0x0f); qa >>= 4; - dqh2[i] = __halves2half2(d0, d1); - } - - if (scaled) - { - dq[0] = __hfma2(dqh2[0], y1[0], z1[0]); - dq[1] = __hfma2(dqh2[1], y1[0], z1[0]); - dq[2] = __hfma2(dqh2[2], y1[0], z1[0]); - dq[3] = __hfma2(dqh2[3], y1[0], z1[0]); - } - else - { - dq[0] = __hadd2(dqh2[0], z1[0]); - dq[1] = __hadd2(dqh2[1], z1[0]); - dq[2] = __hadd2(dqh2[2], z1[0]); - dq[3] = __hadd2(dqh2[3], z1[0]); - } -} - -#endif - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh deleted file mode 100644 index 454e4b93b..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh +++ /dev/null @@ -1,207 +0,0 @@ -#ifndef _qdq_5_cuh -#define _qdq_5_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_5BIT == 1 - -// Permutation: -// -// v5555533 33311111 u4444422 22200000 (u, v lsb) -// vbbbbb99 99977777 uaaaaa88 88866666 -// vhhhhhff fffddddd ugggggee eeeccccc -// vnnnnnll llljjjjj ummmmmkk kkkiiiii -// vtttttrr rrrppppp usssssqq qqqooooo - -__forceinline__ __device__ void shuffle_5bit_32 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0 * stride]; - uint32_t qb = q[1 * stride]; - uint32_t qc = q[2 * stride]; - uint32_t qd = q[3 * stride]; - uint32_t qe = q[4 * stride]; - - // qa: 66555554 44443333 32222211 11100000 - // qb: ccccbbbb baaaaa99 99988888 77777666 - // qc: jiiiiihh hhhggggg fffffeee eedddddc - // qd: pppooooo nnnnnmmm mmlllllk kkkkjjjj - // qe: vvvvvuuu uuttttts ssssrrrr rqqqqqpp - - uint32_t qf = qe >> 22; - qe <<= 8; - qe |= qd >> 24; - qd <<= 6; - qd |= qc >> 26; - qc <<= 4; - qc |= qb >> 28; - qb <<= 2; - qb |= qa >> 30; - - // qa: 555554 44443333 32222211 11100000 - // qb: bbbbba aaaa9999 98888877 77766666 - // qc: hhhhhg ggggffff feeeeedd dddccccc - // qd: nnnnnm mmmmllll lkkkkkjj jjjiiiii - // qe: ttttts ssssrrrr rqqqqqpp pppooooo - // qf: vv vvvuuuuu - - uint32_t za = 0; - uint32_t zb = 0; - uint32_t zc = 0; - uint32_t zd = 0; - uint32_t ze = 0; - - for (int i = 0; i < 3; i++) { uint32_t t0 = qa & 0x1f; uint32_t t1 = (qa & 0x3e0) >> 5; qa >>= 10; za |= (t0 << (i * 5)); za |= (t1 << 
(i * 5 + 16)); } - for (int i = 0; i < 3; i++) { uint32_t t0 = qb & 0x1f; uint32_t t1 = (qb & 0x3e0) >> 5; qb >>= 10; zb |= (t0 << (i * 5)); zb |= (t1 << (i * 5 + 16)); } - for (int i = 0; i < 3; i++) { uint32_t t0 = qc & 0x1f; uint32_t t1 = (qc & 0x3e0) >> 5; qc >>= 10; zc |= (t0 << (i * 5)); zc |= (t1 << (i * 5 + 16)); } - for (int i = 0; i < 3; i++) { uint32_t t0 = qd & 0x1f; uint32_t t1 = (qd & 0x3e0) >> 5; qd >>= 10; zd |= (t0 << (i * 5)); zd |= (t1 << (i * 5 + 16)); } - for (int i = 0; i < 3; i++) { uint32_t t0 = qe & 0x1f; uint32_t t1 = (qe & 0x3e0) >> 5; qe >>= 10; ze |= (t0 << (i * 5)); ze |= (t1 << (i * 5 + 16)); } - - // za: 5555533 33311111 4444422 22200000 - // zb: bbbbb99 99977777 aaaaa88 88866666 - // zc: hhhhhff fffddddd gggggee eeeccccc - // zd: nnnnnll llljjjjj mmmmmkk kkkiiiii - // ze: tttttrr rrrppppp sssssqq qqqooooo - // qf: vv vvvuuuuu - - za |= ((qf & 0x001) >> 0) << 15; - zb |= ((qf & 0x002) >> 1) << 15; - zc |= ((qf & 0x004) >> 2) << 15; - zd |= ((qf & 0x008) >> 3) << 15; - ze |= ((qf & 0x010) >> 4) << 15; - za |= ((qf & 0x020) >> 5) << 31; - zb |= ((qf & 0x040) >> 6) << 31; - zc |= ((qf & 0x080) >> 7) << 31; - zd |= ((qf & 0x100) >> 8) << 31; - ze |= ((qf & 0x200) >> 9) << 31; - - // za: v5555533 33311111 u4444422 22200000 (u, v lsb) - // zb: vbbbbb99 99977777 uaaaaa88 88866666 - // zc: vhhhhhff fffddddd ugggggee eeeccccc - // zd: vnnnnnll llljjjjj ummmmmkk kkkiiiii - // ze: vtttttrr rrrppppp usssssqq qqqooooo - - q[0 * stride] = za; - q[1 * stride] = zb; - q[2 * stride] = zc; - q[3 * stride] = zd; - q[4 * stride] = ze; -} - -__forceinline__ __device__ void dequant_5bit_32 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - const uint32_t q_3, - const uint32_t q_4, - half2 (&dq)[16], - int stride -) -{ - const uint32_t c0 = 0x64006400; - const half y32_ = __float2half_rn(1.0f / 32.0f); - const half2 y32 = __halves2half2(y32_, y32_); - const half z1_ = __float2half_rn(-1024.0f - 16.0f); - const half z32_ = __float2half_rn(-1024.0f / 32.0f - 16.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z32 = __halves2half2(z32_, z32_); - - uint32_t qa = q_0; - uint32_t qb = q_1; - uint32_t qc = q_2; - uint32_t qd = q_3; - uint32_t qe = q_4; - - half2_uint32 q0 ((qa & 0x001f001f) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1 ((qa & 0x03e003e0) | c0); // half2(q[ 2], q[ 3]) * 32 + 1024 - qa >>= 10; - half2_uint32 q2 ((qa & 0x001f001f) | c0); // half2(q[ 4], q[ 5]) + 1024 - qa >>= 5; - qa &= 0x00010001; - half2_uint32 q3 ((qb & 0x001f001f) | c0); // half2(q[ 6], q[ 7]) + 1024 - half2_uint32 q4 ((qb & 0x03e003e0) | c0); // half2(q[ 8], q[ 9]) * 32 + 1024 - qb >>= 10; - half2_uint32 q5 ((qb & 0x001f001f) | c0); // half2(q[10], q[11]) + 1024 - qb >>= 4; - qb &= 0x00020002; - half2_uint32 q6 ((qc & 0x001f001f) | c0); // half2(q[12], q[13]) + 1024 - half2_uint32 q7 ((qc & 0x03e003e0) | c0); // half2(q[14], q[15]) * 32 + 1024 - qc >>= 10; - half2_uint32 q8 ((qc & 0x001f001f) | c0); // half2(q[16], q[17]) + 1024 - qc >>= 3; - qc &= 0x00040004; - half2_uint32 q9 ((qd & 0x001f001f) | c0); // half2(q[18], q[19]) + 1024 - half2_uint32 q10((qd & 0x03e003e0) | c0); // half2(q[20], q[21]) * 32 + 1024 - qd >>= 10; - half2_uint32 q11((qd & 0x001f001f) | c0); // half2(q[22], q[23]) + 1024 - qd >>= 2; - qd &= 0x00080008; - half2_uint32 q12((qe & 0x001f001f) | c0); // half2(q[24], q[25]) + 1024 - half2_uint32 q13((qe & 0x03e003e0) | c0); // half2(q[26], q[27]) * 32 + 1024 - qe >>= 10; - half2_uint32 q14((qe & 0x001f001f) | c0); // half2(q[28], q[29]) + 
1024 - qe >>= 1; - qe &= 0x00100010; - half2_uint32 q15((qa | qb | qc | qd | qe) | c0); - - dq[ 0] = __hadd2( q0.as_half2, z1); - dq[ 1] = __hfma2( q1.as_half2, y32, z32); - dq[ 2] = __hadd2( q2.as_half2, z1); - dq[ 3] = __hadd2( q3.as_half2, z1); - dq[ 4] = __hfma2( q4.as_half2, y32, z32); - dq[ 5] = __hadd2( q5.as_half2, z1); - dq[ 6] = __hadd2( q6.as_half2, z1); - dq[ 7] = __hfma2( q7.as_half2, y32, z32); - dq[ 8] = __hadd2( q8.as_half2, z1); - dq[ 9] = __hadd2( q9.as_half2, z1); - dq[10] = __hfma2(q10.as_half2, y32, z32); - dq[11] = __hadd2(q11.as_half2, z1); - dq[12] = __hadd2(q12.as_half2, z1); - dq[13] = __hfma2(q13.as_half2, y32, z32); - dq[14] = __hadd2(q14.as_half2, z1); - dq[15] = __hadd2(q15.as_half2, z1); -} - -#else - -__forceinline__ __device__ void shuffle_5bit_32 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_5bit_32 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - const uint32_t q_3, - const uint32_t q_4, - half2 (&dq)[16], - int stride -) -{ - half dqh[32]; - for (int i = 0; i < 6; i++) dqh[ i] = dq_ns(exb( q_0, i * 5 , 0x1f), 16); - dqh[ 6 ] = dq_ns(exb(q_1, q_0, 30, 0x1f), 16); - for (int i = 0; i < 5; i++) dqh[ 7 + i] = dq_ns(exb( q_1, i * 5 + 3, 0x1f), 16); - dqh[12 ] = dq_ns(exb(q_2, q_1, 28, 0x1f), 16); - for (int i = 0; i < 6; i++) dqh[13 + i] = dq_ns(exb( q_2, i * 5 + 1, 0x1f), 16); - dqh[19 ] = dq_ns(exb(q_3, q_2, 31, 0x1f), 16); - for (int i = 0; i < 5; i++) dqh[20 + i] = dq_ns(exb( q_3, i * 5 + 4, 0x1f), 16); - dqh[25 ] = dq_ns(exb(q_4, q_3, 29, 0x1f), 16); - for (int i = 0; i < 6; i++) dqh[26 + i] = dq_ns(exb( q_4, i * 5 + 2, 0x1f), 16); - - for (int i = 0; i < 16; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -#endif - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh deleted file mode 100644 index c2eb8cfbf..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _qdq_6_cuh -#define _qdq_6_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_6BIT == 1 - - // Not implemented - -#else - -__forceinline__ __device__ void shuffle_6bit_16 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_6bit_16 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - half2 (&dq)[8], - int stride -) -{ - half dqh[16]; - for (int i = 0; i < 5; i++) dqh[ i] = dq_ns(exb( q_0, i * 6 , 0x3f), 32); - dqh[ 5 ] = dq_ns(exb(q_1, q_0, 30, 0x3f), 32); - for (int i = 0; i < 4; i++) dqh[ 6 + i] = dq_ns(exb( q_1, i * 6 + 4, 0x3f), 32); - dqh[10 ] = dq_ns(exb(q_2, q_1, 28, 0x3f), 32); - for (int i = 0; i < 5; i++) dqh[11 + i] = dq_ns(exb( q_2, i * 6 + 2, 0x3f), 32); - - for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -#endif - -#endif - - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh deleted file mode 100644 index e2409efac..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _qdq_8_cuh -#define _qdq_8_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_8BIT == 1 - - // Not implemented - -#else - -__forceinline__ __device__ void shuffle_8bit_4 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_8bit_8 -( - const uint32_t 
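The unshuffled fallback decoders (`QMODE_* == 0`) in these headers extract fields with `exb` from `qdq_util.cuh` below; its two-word overload pulls a field that straddles a 32-bit boundary with a single funnel shift. In effect:

    #include <cstdint>

    // __funnelshift_rc(lo, hi, s) yields the low 32 bits of (((uint64_t)hi << 32) | lo) >> s
    // (shift clamped to 32), so one instruction grabs a field starting at bit s of lo
    // that spills over into hi.
    __device__ int field_across_words(uint32_t hi, uint32_t lo, int s, int mask)
    {
        return (int)(__funnelshift_rc(lo, hi, s) & mask);
    }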
-    const uint32_t q_1,
-    half2 (&dq)[4],
-    int stride
-)
-{
-    half dqh[8];
-    for (int i = 0; i < 4; i++) dqh[i    ] = dq_ns(exb(q_0, i * 8, 0xff), 128);
-    for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), 128);
-
-    for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
-}
-
-#endif
-
-#endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh
deleted file mode 100644
index 71657191b..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef _qdq_util_cuh
-#define _qdq_util_cuh
-
-union half2_uint32
-{
-    uint32_t as_uint32;
-    half2 as_half2;
-    __device__ half2_uint32(uint32_t val) : as_uint32(val) {}
-    __device__ half2_uint32(half2 val) : as_half2(val) {}
-};
-
-union half_uint16
-{
-    uint16_t as_uint16;
-    half as_half;
-    __device__ half_uint16(uint16_t val) : as_uint16(val) {}
-    __device__ half_uint16(half val) : as_half(val) {}
-};
-
-// Max_scale premultiplied by 1/256
-
-__forceinline__ __device__ half dq_scale(const int qs, const half max_scale)
-{
-    int qs_i = qs + 1;
-    half qs_h = __int2half_rn(qs_i * qs_i);
-    qs_h = __hmul(qs_h, max_scale);
-    return qs_h;
-}
-
-__forceinline__ __device__ half dq(const int q, const int qzero, const half scale)
-{
-    return __hmul(__int2half_rn(q - qzero), scale);
-}
-
-__forceinline__ __device__ half dq_ns(const int q, const int qzero)
-{
-    //return __hsub(__int2half_rn(q), __int2half_rn(qzero));
-    return __int2half_rn(q - qzero);
-}
-
-__forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask)
-{
-    return (int)((q >> shift) & mask);
-}
-
-__forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask)
-{
-    return (int)(__funnelshift_rc(q0, q1, shift) & mask);
-}
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cu
deleted file mode 100644
index 0e009c6f1..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cu
+++ /dev/null
@@ -1,256 +0,0 @@
-#include "quantize.cuh"
-#include "util.cuh"
-#include <curand_kernel.h>
-#include "compat.cuh"
-
-#define BLOCKSIZE_X 32
-#define BLOCKSIZE_Y 32
-
-__global__ void quantize_kernel
-(
-    const float* __restrict__ input,
-    float* __restrict__ output,
-    const float* __restrict__ scale,
-    uint16_t* __restrict__ out_q,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq
-)
-{
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    if (column >= columns) return;
-    if (row >= rows) return;
-
-    // Quantize
-
-    float x = input[row * columns + column];
-    float s = scale[column];
-    x /= s;
-    x = rintf(x);
-    x += qzero;
-    x = clamp(x, 0.0f, maxq);
-
-    // Optionally save quant
-
-    if (out_q)
-    {
-        uint16_t q = static_cast<uint16_t>(x);
-        out_q[row * columns + column] = q;
-    }
-
-    half h_s = __float2half_rn(s);
-    half h_x = __float2half_rn(x);
-    half h_qzero = __float2half_rn(qzero);
-
-    h_x = __hsub(h_x, h_qzero);
-    h_x = __hmul(h_x, h_s);
-
-    // Dequantize
-
-//     x -= qzero;
-//     x *= s;
-    output[row * columns + column] = __half2float(h_x);
-}
-
-void quantize_cuda
-(
-    const float* input,
-    float* output,
-    const float* scale,
-    uint16_t* out_q,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq
-)
-{
-    dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y);
-    dim3 blocks(DIVIDE(columns, BLOCKSIZE_X), DIVIDE(rows, BLOCKSIZE_Y));
-
-//     DBGI2(rows, columns);
-//     DBGF2(qzero, maxq);
-
-    quantize_kernel<<<blocks, threads>>>
-    (
-        input,
-        output,
-        scale,
-        out_q,
-        rows,
-        columns,
-        qzero,
-        maxq
-    );
-}
-
-__global__ void quantize_err_kernel
-(
-    const float* __restrict__ input,
-    float* __restrict__ output,
-    const float* __restrict__ scale,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq,
-    float err_norm,
-    float min_p,
-    float max_p,
-    int p_grid
-)
-{
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    if (column >= columns) return;
-    if (row >= rows) return;
-
-    float w = input[row * columns + column];
-
-    // Quantize
-
-    for (int i = 0; i <= p_grid; i++)
-    {
-        float pi = __int2float_rn(i) / __int2float_rn(p_grid);
-        float p = min_p * (1.0f - pi) + max_p * pi;
-
-        float x = w;
-        float s = scale[column] * p;
-        x /= s;
-        x = rintf(x);
-        x += qzero;
-        x = clamp(x, 0.0f, maxq);
-
-        // Dequantize
-
-        x -= qzero;
-        x *= s;
-
-        // Quantization error
-
-        x = __powf(fabsf(x - w), err_norm);
-        atomicAdd(&output[i * 128 + column % 128], x);
-    }
-}
-
-void quantize_err_cuda
-(
-    const float* input,
-    float* output,
-    const float* scale,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq,
-    float err_norm,
-    float min_p,
-    float max_p,
-    int p_grid
-)
-{
-    dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y);
-    dim3 blocks(DIVIDE(columns, BLOCKSIZE_X), DIVIDE(rows, BLOCKSIZE_Y));
-
-//     DBGI2(rows, columns);
-//     DBGF2(qzero, maxq);
-
-    quantize_err_kernel<<<blocks, threads>>>
-    (
-        input,
-        output,
-        scale,
-        rows,
-        columns,
-        qzero,
-        maxq,
-        err_norm,
-        min_p,
-        max_p,
-        p_grid
-    );
-}
-
-__global__ void adjust_error_row_kernel
-(
-    const float* __restrict__ hessian_inv,
-    float* __restrict__ error,
-    const float* __restrict__ weights,
-    const float* __restrict__ quant,
-    int c,
-    int columns,
-    int hcolumns
-)
-{
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    if (column >= columns) return;
-
-    float d = hessian_inv[c * hcolumns + c];
-
-    int idx = c * columns + column;
-    float w = weights[idx];
-    float q = quant[idx];
-    error[idx] = (w - q) / d;
-}
-
-void adjust_error_row_cuda
-(
-    const float* hessian_inv,
-    float* error,
-    const float* weights,
-    const float* quant,
-    int c,
-    int columns,
-    int hcolumns
-)
-{
-    dim3 threads(BLOCKSIZE_X, 1);
-    dim3 blocks(DIVIDE(columns, BLOCKSIZE_X), 1);
-
-    adjust_error_row_kernel<<<blocks, threads>>>(hessian_inv, error, weights, quant, c, columns, hcolumns);
-}
-
-
-// Compute z = z - x.T @ y
-
-__global__ void vv_mul_sub_kernel
-(
-    const float* __restrict__ x,
-    const float* __restrict__ y,
-    float* __restrict__ z,
-    int x_size,
-    int y_size
-)
-{
-    int y_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int x_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (y_idx >= y_size) return;
-    if (x_idx >= x_size) return;
-    int z_idx = y_size * x_idx + y_idx;
-
-    float p = x[x_idx] * y[y_idx];
-
-//     curandState state;
-//     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-//     curand_init(1234, tid, clock64(), &state);
-//     float r = curand_uniform(&state);
-//     p *= r;
-
-//     p *- 0.707106478;
-
-    z[z_idx] -= p;
-}
-
-void vv_mul_sub_cuda
-(
-    const float* x,
-    const float* y,
-    float* z,
-    int x_size,
-    int y_size
-)
-{
-    dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y);
-    dim3 blocks(DIVIDE(y_size, BLOCKSIZE_X), DIVIDE(x_size, BLOCKSIZE_Y));
-
-    vv_mul_sub_kernel<<<blocks, threads>>>(x, y, z, x_size, y_size);
-}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cuh
deleted file mode 100644
index 70ef665e6..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cuh
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _quantize_cuh
-#define _quantize_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-
-void quantize_cuda
-(
-    const float* input,
-    float* output,
-    const float* scale,
-    uint16_t* out_q,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq
-);
-
-void quantize_err_cuda
-(
-    const float* input,
-    float* output,
-    const float* scale,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq,
-    float err_norm,
-    float min_p,
-    float max_p,
-    int p_grid
-);
-
-void adjust_error_row_cuda
-(
-    const float* hessian_inv,
-    float* error,
-    const float* weights,
-    const float* quant,
-    int c,
-    int columns,
-    int hcolumns
-);
-
-void vv_mul_sub_cuda
-(
-    const float* x,
-    const float* y,
-    float* z,
-    int x_size,
-    int y_size
-);
-
-#endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cu
deleted file mode 100644
index 29bbf2c39..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-#include "rms_norm.cuh"
-#include "util.cuh"
-
-#if defined(USE_ROCM)
-#define __shfl_xor_sync(mask, var, laneMask) __shfl_xor(var, laneMask)
-#endif
-
-// y = x * w / sqrt(row_mean(x * x) + epsilon)
-
-#define NUM_WARPS 32
-#define WARP_SIZE 32
-#define BLOCK_SIZE WARP_SIZE
-#define NUM_THREADS (NUM_WARPS * WARP_SIZE)
-
-typedef void (*fp_rms_norm_kernel)
-(
-    const half*,
-    const half*,
-    half*,
-    const float,
-    const float,
-    const int,
-    const int
-);
-
-template <int blocks_per_warp>
-__global__ void rms_norm_kernel
-(
-    const half* __restrict__ x,
-    const half* __restrict__ w,
-    half* __restrict__ y,
-    const float epsilon,
-    const float r_dim,
-    const int rows,
-    const int dim
-)
-{
-    int warp_id = threadIdx.x / WARP_SIZE;
-    int lane_id = threadIdx.x % WARP_SIZE;
-    int row = blockIdx.x;
-    const half* x_row = x + row * dim;
-    half* y_row = y + row * dim;
-
-    //int blocks_per_warp = DIVIDE(dim, NUM_THREADS);
-
-    // Compute sum of squares for each block
-
-    float sum = 0.0f;
-    float itemf[blocks_per_warp];
-
-    #pragma unroll
-    for (int i = 0; i < blocks_per_warp; i++)
-    {
-        int column = warp_id * WARP_SIZE + lane_id + NUM_THREADS * i;
-        if (column >= dim) break;
-
-        float f = __half2float(x_row[column]);
-        f = fmaxf(-65504.0f, fminf(f, 65504.0f));
-        itemf[i] = f;
-        sum = fma(f, f, sum);
-    }
-
-    // Shuffle to sum across lanes
-
-    __shared__ float sums[NUM_WARPS];
-
-    for(int offset = warpSize / 2; offset > 0; offset /= 2) sum += __shfl_xor_sync(0xffffffff, sum, offset);
-    if (lane_id == 0) sums[warp_id] = sum;
-    __syncthreads();
-
-    // Load partial sums from across warps, shuffle again across lanes
-
-    sum = sums[lane_id];
-    for(int offset = warpSize / 2; offset > 0; offset /= 2) sum += __shfl_xor_sync(0xffffffff, sum, offset);
-
-    // Get norm
-
-    float rmf = rsqrtf(sum * r_dim + epsilon);
-
-    // Normalize x, scaling by w
-
-    #pragma unroll 4
-    for (int i = 0; i < blocks_per_warp; i++)
-    {
-        int column = warp_id * WARP_SIZE + lane_id + NUM_THREADS * i;
-        if (column >= dim) return;
-
-        float x_itemf = itemf[i];
-        float w_itemf = __half2float(w[column]);
-        float n = x_itemf * w_itemf * rmf;
-        y_row[column] = __float2half_rn(n);
-    }
-}
-
-fp_rms_norm_kernel pick_rms_norm_kernel(const int blocks_per_warp)
-{
-    if (blocks_per_warp == 1) return rms_norm_kernel<1>;
-    if (blocks_per_warp == 2) return rms_norm_kernel<2>;
-    if (blocks_per_warp == 3) return rms_norm_kernel<3>;
-    if (blocks_per_warp == 4) return rms_norm_kernel<4>;
-    if (blocks_per_warp == 5) return rms_norm_kernel<5>;
-    if (blocks_per_warp == 6) return rms_norm_kernel<6>;
-    if (blocks_per_warp == 7) return rms_norm_kernel<7>;
-    if (blocks_per_warp == 8) return rms_norm_kernel<8>;
-    return NULL;
-}
-
-
-void rms_norm_cuda
-(
-    const half* x,
-    const half* w,
-    half* y,
-    const float epsilon,
-    const int rows,
-    const int dim
-)
-{
-    dim3 blockDim, gridDim;
-    blockDim.x = NUM_THREADS;
-    blockDim.y = 1;
-    gridDim.x = rows;
-    gridDim.y = 1;
-
-    float r_dim = 1.0f / (float) dim;
-
-    int blocks_per_warp = DIVIDE(dim, NUM_THREADS);
-    fp_rms_norm_kernel kernel = pick_rms_norm_kernel(blocks_per_warp);
-    kernel<<<gridDim, blockDim>>>(x, w, y, epsilon, r_dim, rows, dim);
-}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cuh
deleted file mode 100644
index 4cb0fea97..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cuh
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _rms_norm_cuh
-#define _rms_norm_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-
-void rms_norm_cuda
-(
-    const half* x,
-    const half* w,
-    half* y,
-    const float epsilon,
-    const int rows,
-    const int dim
-);
-
-#endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cu
deleted file mode 100644
index 7b1cf2b6f..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cu
+++ /dev/null
@@ -1,132 +0,0 @@
-#include "rope.cuh"
-#include "util.cuh"
-#include "matrix_view.cuh"
-
-const int THREADS_X = 32;
-const int THREADS_Y = 4;
-const int MAX_POS_EMBEDDINGS = 32768;   // Actual number doesn't matter
-const int MAX_ROWS = 32768;             // Actual number doesn't matter
-
-typedef void (*fp_rope_cuda_kernel)
-(
-    half*,
-    const half*,
-    const half*,
-    int,
-    int,
-    int,
-    int,
-    const uint32_t*,
-    int
-);
-
-template <bool use_half2>
-__global__ void rope_cuda_kernel
-(
-    half* __restrict__ x,
-    const half* __restrict__ sin,
-    const half* __restrict__ cos,
-    int rows_per_batch,
-    int head_dim,
-    int num_heads,
-    int past_len,
-    const uint32_t* __restrict__ past_lens,
-    int threads_y
-)
-{
-    MatrixView_half_rw x_(x, MAX_ROWS, head_dim);
-    MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim);
-    MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim);
-
-    int column = (blockIdx.x * THREADS_X + threadIdx.x); if constexpr (use_half2) column *= 2;
-    int half_dim = head_dim / 2;
-    if (column >= half_dim) return;
-
-    int row = blockIdx.y * threads_y + threadIdx.y;
-    if (row >= rows_per_batch) return;
-    int batch_offset = blockIdx.z * rows_per_batch;
-    int row_offset = batch_offset + row;
-
-    // Get sin and cos
-
-    if (past_len == -1) past_len = past_lens[blockIdx.z];
-    int sincos_row = past_len + row / num_heads;
-
-    if constexpr (use_half2)
-    {
-        half2 cos2_l = cos_.item_half2(sincos_row, column);
-        half2 cos2_r = cos_.item_half2(sincos_row, column + half_dim);
-        half2 sin2_l = sin_.item_half2(sincos_row, column);
-        half2 sin2_r = sin_.item_half2(sincos_row, column + half_dim);
-        sin2_l = __hneg2(sin2_l);
-
-        // Apply embedding to row
-
-        half2 item2_l = x_.item_half2(row_offset, column);
-        half2 item2_r = x_.item_half2(row_offset, column + half_dim);
-        half2 item2_ls = __hmul2(item2_r, sin2_l);
-        half2 item2_rs = __hmul2(item2_l, sin2_r);
-        item2_l = __hfma2(item2_l, cos2_l, item2_ls);
-        item2_r = __hfma2(item2_r, cos2_r, item2_rs);
-        x_.set_half2(row_offset, column, item2_l);
-        x_.set_half2(row_offset, column + half_dim, item2_r);
-    }
-    else
-    {
-        half cos_l = cos_.item(sincos_row, column);
-        half cos_r = cos_.item(sincos_row, column + half_dim);
-        half sin_l = sin_.item(sincos_row, column);
-        half sin_r = sin_.item(sincos_row, column + half_dim);
-        sin_l = __hneg(sin_l);
-
-        // Apply embedding to row
-
-        half item_l = x_.item(row_offset, column);
-        half item_r = x_.item(row_offset, column + half_dim);
-        half item_ls = __hmul(item_r, sin_l);
-        half item_rs = __hmul(item_l, sin_r);
-        item_l = __hfma(item_l, cos_l, item_ls);
-        item_r = __hfma(item_r, cos_r, item_rs);
-        x_.set(row_offset, column, item_l);
-        x_.set(row_offset, column + half_dim, item_r);
-    }
-}
-
-fp_rope_cuda_kernel pick_rope_cuda_kernel(bool use_half2)
-{
-    if (use_half2) return rope_cuda_kernel<true>;
-    else return rope_cuda_kernel<false>;
-};
-
-void rope_cuda
-(
-    half* x,
-    const half* sin,
-    const half* cos,
-    const int batch_size,
-    const int rows_per_batch,
-    const int head_dim,
-    const int num_heads,
-    const int past_len,
-    const uint32_t* past_lens
-)
-{
-    bool use_half2 = true;
-
-    // For large batch sizes we risk exceeding grid dimension of 65535, so shift to block dimension instead
-
-    int threads_y = THREADS_Y;
-    while (DIVIDE(rows_per_batch, threads_y) > 65535) threads_y *= 2;
-
-    dim3 blockDim, gridDim;
-    blockDim.x = THREADS_X;
-    blockDim.y = threads_y;
-    gridDim.x = DIVIDE(head_dim, THREADS_X) / (use_half2 ? 2 : 1);
-    gridDim.y = DIVIDE(rows_per_batch, threads_y);
-    gridDim.z = batch_size;
-
-    fp_rope_cuda_kernel kernel = pick_rope_cuda_kernel(use_half2);
-    kernel<<<gridDim, blockDim>>>(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len, past_lens, threads_y);
-
-    cuda_check( cudaPeekAtLastError() );
-}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cuh
deleted file mode 100644
index 9fcf7a933..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cuh
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _rope_cuh
-#define _rope_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-
-void rope_cuda
-(
-    half* x,
-    const half* sin,
-    const half* cos,
-    const int batch_size,
-    const int rows_per_batch,
-    const int head_dim,
-    const int num_heads,
-    const int past_len,
-    const uint32_t* past_lens
-);
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh
deleted file mode 100644
index 06a58d184..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh
+++ /dev/null
@@ -1,42 +0,0 @@
-
-#define DIVIDE(x, size) (((x) + (size) - 1) / (size))
-
-#define DBGS(__x) printf("%s\n", __x)
-#define DBGI(__x) printf("%s: %i\n", #__x, __x)
-#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y)
-#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z)
-#define DBGX(__x) printf("%s: %x\n", #__x, __x)
-#define DBGX2(__x, __y) printf("%s, %s: %x, %x\n", #__x, #__y, __x, __y)
-#define DBGX3(__x, __y, __z) printf("%s, %s, %s: %x, %x, %x\n", #__x, #__y, #__z, __x, __y, __z)
-#define DBGF(__x) printf("%s: %f\n", #__x, __x)
-#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y)
-#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z)
-#define DBGH(__x) printf("%s: %f\n", #__x, __half2float(__x))
-#define DBGH2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __half2float(__x), __half2float(__y))
-#define DBGH3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z))
printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z)) - -#define DBGIH(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __half2float(__y)) -#define DBGIH2(__x, __y, __z) printf("%s, %s, %s: %i, %f, %f\n", #__x, #__y, #__z, __x, __half2float(__y), __half2float(__z)) - -__forceinline__ __device__ half dq_scale_(const int qs, const half max_scale) -{ - half qs_h = __hmul(__int2half_rn(qs + 1), __float2half_rn(1.0f / 16.0f)); - qs_h = __hmul(qs_h, qs_h); - qs_h = __hmul(qs_h, max_scale); - return qs_h; -} - -__forceinline__ __device__ float clamp(float x, float a, float b) -{ - return fmaxf(a, fminf(b, x)); -} - -#define cuda_check(ans) { gpu_assert((ans), __FILE__, __LINE__); } -inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp deleted file mode 100644 index 7591e365f..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp +++ /dev/null @@ -1,956 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#include "cuda/pack_tensor.cuh" -#include "cuda/quantize.cuh" -#include "cuda/q_matrix.cuh" -#include "cuda/q_attn.cuh" -#include "cuda/q_mlp.cuh" -#include "cuda/q_gemm.cuh" -#include "cuda/rms_norm.cuh" -#include "cuda/rope.cuh" -#include "cuda/cache.cuh" - -#include "cpp/quantize_func.h" -#include "cpp/sampling.h" - -#include "cpp/util.h" - -// Some decluttering macros - -#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype) -#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype) -#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes") -#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes") - - -// Packing functions - -void pack_rows_4 -( - torch::Tensor input, - torch::Tensor output -) -{ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - - TORCH_CHECK_DTYPE(input, kShort); - TORCH_CHECK_DTYPE(output, kInt); - TORCH_CHECK_SHAPES(input, 0, output, 0, 1); - TORCH_CHECK_SHAPES(input, 1, output, 1, 8); - - int rows = input.size(0); - int columns = input.size(1); - - pack_rows_4_cuda - ( - (uint16_t*) input.data_ptr(), - (uint32_t*) output.data_ptr(), - rows, - columns - ); -} - -void pack_columns -( - torch::Tensor input, - torch::Tensor output, - int bits -) -{ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - - TORCH_CHECK_DTYPE(input, kShort); - TORCH_CHECK_DTYPE(output, kInt); - TORCH_CHECK_SHAPES(input, 1, output, 1, 1); - - int in_rows = input.size(0); - int columns = input.size(1); - int out_rows = output.size(0); - int exp_out_rows = in_rows * bits / 32; - TORCH_CHECK(out_rows == exp_out_rows, "Wrong output shape for input and bitrate") - - pack_columns_cuda - ( - (uint16_t*) input.data_ptr(), - (uint32_t*) output.data_ptr(), - in_rows, - out_rows, 
-        columns,
-        bits
-    );
-}
-
-
-// Quantization functions
-
-void quantize_err
-(
-    torch::Tensor input,
-    torch::Tensor output,
-    torch::Tensor scale,
-    float qzero,
-    float maxq,
-    float err_norm,
-    float min_p,
-    float max_p,
-    int p_grid
-)
-{
-    TORCH_CHECK_DTYPE(input, kFloat);
-    TORCH_CHECK_DTYPE(output, kFloat);
-    // TORCH_CHECK_SHAPES(input, 0, output, 0, 1);
-    // TORCH_CHECK_SHAPES(input, 1, output, 1, 1);
-    TORCH_CHECK_SHAPES(input, 1, scale, 0, 1);
-    TORCH_CHECK(output.size(0) == p_grid + 1, "Output vector shape doesn't match grid")
-
-    int rows = input.size(0);
-    int columns = input.size(1);
-
-    quantize_err_cuda
-    (
-        (float*) input.data_ptr(),
-        (float*) output.data_ptr(),
-        (float*) scale.data_ptr(),
-        rows,
-        columns,
-        qzero,
-        maxq,
-        err_norm,
-        min_p,
-        max_p,
-        p_grid
-    );
-}
-
-void quantize
-(
-    torch::Tensor input,
-    torch::Tensor output,
-    torch::Tensor scale,
-    torch::Tensor out_q,
-    float qzero,
-    float maxq
-)
-{
-    TORCH_CHECK_DTYPE(input, kFloat);
-    TORCH_CHECK_DTYPE(output, kFloat);
-    TORCH_CHECK_SHAPES(input, 0, output, 0, 1);
-    TORCH_CHECK_SHAPES(input, 1, output, 1, 1);
-    TORCH_CHECK_SHAPES(input, 1, scale, 0, 1);
-
-    int rows = input.size(0);
-    int columns = input.size(1);
-
-    quantize_cuda
-    (
-        (float*) input.data_ptr(),
-        (float*) output.data_ptr(),
-        (float*) scale.data_ptr(),
-        out_q.device().is_meta() ? NULL : (uint16_t*) out_q.data_ptr(),
-        rows,
-        columns,
-        qzero,
-        maxq
-    );
-}
-
-
-// Quant matrix
-
-uintptr_t make_q_matrix
-(
-    torch::Tensor q_weight,
-    torch::Tensor q_perm,
-    torch::Tensor q_invperm,
-    torch::Tensor q_scale,
-    torch::Tensor q_scale_max,
-    torch::Tensor q_groups,
-    torch::Tensor gptq_qzeros,
-    torch::Tensor gptq_scales,
-    torch::Tensor gptq_g_idx,
-    torch::Tensor temp_dq
-)
-{
-    TORCH_CHECK_DTYPE(q_weight, kInt);
-    TORCH_CHECK_DTYPE_OPT(q_perm, kShort);
-    TORCH_CHECK_DTYPE_OPT(q_invperm, kShort);
-    TORCH_CHECK_DTYPE_OPT(q_scale, kInt);
-    TORCH_CHECK_DTYPE_OPT(q_scale_max, kHalf);
-    TORCH_CHECK_DTYPE_OPT(q_groups, kShort);
-    TORCH_CHECK_DTYPE_OPT(gptq_qzeros, kInt);
-    TORCH_CHECK_DTYPE_OPT(gptq_scales, kHalf);
-    TORCH_CHECK_DTYPE_OPT(gptq_g_idx, kInt);
-
-    TORCH_CHECK_SHAPES(q_perm, 0, q_invperm, 0, 1);
-
-    int device = q_weight.device().index();
-    int width = q_weight.size(1);
-    int groups;
-    int height;
-
-    if (!q_scale.device().is_meta())
-    {
-        TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
-        TORCH_CHECK_SHAPES(q_scale_max, 0, q_scale, 0, 1);
-        groups = q_scale.size(0);
-        height = q_invperm.size(0);
-    }
-    else
-    {
-        TORCH_CHECK_SHAPES(q_weight, 1, gptq_qzeros, 1, 8);
-        TORCH_CHECK_SHAPES(q_weight, 1, gptq_scales, 1, 1);
-        groups = gptq_qzeros.size(0);
-        height = q_weight.size(0) * 8;
-    }
-
-    TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
-
-    QMatrix* m = new QMatrix
-    (
-        device,
-        height,
-        width,
-        groups,
-        (uint32_t*) q_weight.data_ptr(),
-        q_perm.device().is_meta() ? NULL : (uint16_t*) q_perm.data_ptr(),
-        q_invperm.device().is_meta() ? NULL : (uint16_t*) q_invperm.data_ptr(),
-        q_scale.device().is_meta() ? NULL : (uint32_t*) q_scale.data_ptr(),
-        q_scale_max.device().is_meta() ? NULL : (half*) q_scale_max.data_ptr(),
-        q_groups.device().is_meta() ? NULL : (uint16_t*) q_groups.data_ptr(),
-        gptq_qzeros.device().is_meta() ? NULL : (uint32_t*) gptq_qzeros.data_ptr(),
-        gptq_scales.device().is_meta() ? NULL : (half*) gptq_scales.data_ptr(),
-        gptq_g_idx.device().is_meta() ? NULL : (uint32_t*) gptq_g_idx.data_ptr(),
-        (half*) temp_dq.data_ptr()
-    );
-
-    if (m->failed) throw std::runtime_error("CUDA out of memory");
-
-    return reinterpret_cast<uintptr_t> (m);
-}
-
-void free_q_matrix
-(
-    uintptr_t handle
-)
-{
-    QMatrix* m = reinterpret_cast<QMatrix*> (handle);
-    delete m;
-}
-
-void reconstruct
-(
-    uintptr_t q_handle,
-    torch::Tensor output
-)
-{
-    QMatrix* qm = reinterpret_cast<QMatrix*> (q_handle);
-    TORCH_CHECK(qm->height == output.size(0) && qm->width == output.size(1), "Output tensor doesn't match shape of QMatrix")
-    TORCH_CHECK_DTYPE(output, kHalf);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
-
-    qm->reconstruct((half*) output.data_ptr());
-}
-
-
-// Matmul
-
-void gemm_half_q_half
-(
-    torch::Tensor a,
-    uintptr_t b,
-    torch::Tensor c,
-    bool force_cuda
-)
-{
-    QMatrix* qm = reinterpret_cast<QMatrix*> (b);
-
-    TORCH_CHECK_DTYPE(a, kHalf);
-    TORCH_CHECK_DTYPE(c, kHalf);
-    TORCH_CHECK_SHAPES(a, 0, c, 0, 1);
-    TORCH_CHECK(qm->height == a.size(1), "a and b have incompatible shapes")
-    TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes")
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
-
-    gemm_half_q_half_cuda
-    (
-        at::cuda::getCurrentCUDABlasHandle(),
-        (const half*) a.data_ptr(),
-        qm,
-        (half*) c.data_ptr(),
-        c.size(0), // m
-        c.size(1), // n
-        a.size(1), // k
-        true,
-        NULL,
-        force_cuda
-    );
-}
-
-
-// Quant attention
-
-uintptr_t make_q_attn
-(
-    torch::Tensor layernorm,
-    float norm_epsilon,
-    uintptr_t q_q_proj,
-    uintptr_t q_k_proj,
-    uintptr_t q_v_proj,
-    uintptr_t q_o_proj,
-    torch::Tensor temp_state,
-//    torch::Tensor temp_q,
-//    torch::Tensor temp_k,
-//    torch::Tensor temp_v,
-    torch::Tensor temp_dq,
-    int max_rows,
-    int hidden_size,
-    int num_heads,
-    int num_kv_heads,
-    int head_dim,
-    int max_seq_len
-)
-{
-    QMatrix* qm_q_proj = reinterpret_cast<QMatrix*> (q_q_proj);
-    QMatrix* qm_k_proj = reinterpret_cast<QMatrix*> (q_k_proj);
-    QMatrix* qm_v_proj = reinterpret_cast<QMatrix*> (q_v_proj);
-    QMatrix* qm_o_proj = reinterpret_cast<QMatrix*> (q_o_proj);
-
-    TORCH_CHECK_DTYPE_OPT(layernorm, kHalf);
-
-    if (qm_q_proj && !layernorm.is_meta()) TORCH_CHECK(qm_q_proj->height == layernorm.size(0), "q_proj is wrong shape")
-    if (qm_k_proj && !layernorm.is_meta()) TORCH_CHECK(qm_k_proj->height == layernorm.size(0), "k_proj is wrong shape")
-    if (qm_v_proj && !layernorm.is_meta()) TORCH_CHECK(qm_v_proj->height == layernorm.size(0), "v_proj is wrong shape")
-    if (!layernorm.is_meta()) TORCH_CHECK(qm_o_proj->height == layernorm.size(0), "o_proj is wrong shape")
-
-    QAttn* attn = new QAttn
-    (
-        (half*) layernorm.is_meta() ? NULL : (half*) layernorm.data_ptr(),
-        norm_epsilon,
-        qm_q_proj,
-        qm_k_proj,
-        qm_v_proj,
-        qm_o_proj,
-        (half*) temp_state.data_ptr(),
-//        (half*) temp_q.data_ptr(),
-//        (half*) temp_k.data_ptr(),
-//        (half*) temp_v.data_ptr(),
-        (half*) temp_dq.data_ptr(),
-        max_rows,
-        hidden_size,
-        num_heads,
-        num_kv_heads,
-        head_dim,
-        max_seq_len
-    );
-
-    return reinterpret_cast<uintptr_t> (attn);
-}
-
-void free_q_attn
-(
-    uintptr_t handle
-)
-{
-    QAttn* attn = reinterpret_cast<QAttn*> (handle);
-    delete attn;
-}
-
-void q_attn_forward_1
-(
-    uintptr_t q_attn,
-    torch::Tensor x,
-    int batch_size,
-    int q_len,
-    int past_len,
-    torch::Tensor past_lens,
-    torch::Tensor q_temp,
-    torch::Tensor k_temp,
-    torch::Tensor v_temp,
-    torch::Tensor sin,
-    torch::Tensor cos,
-    const std::vector<uintptr_t>& loras,
-    torch::Tensor loras_temp
-)
-{
-    QAttn* attn = reinterpret_cast<QAttn*> (q_attn);
-    TORCH_CHECK_DTYPE(x, kHalf);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-    cublasHandle_t cublas_handle = at::cuda::getCurrentCUDABlasHandle();
-
-    attn->forward_cuda_1
-    (
-        cublas_handle,
-        (half*) x.data_ptr(),
-        batch_size,
-        q_len,
-        past_len,
-        past_lens.device().is_meta() ? NULL : (uint32_t*) past_lens.data_ptr(),
-        (half*) q_temp.data_ptr(),
-        (half*) k_temp.data_ptr(),
-        (half*) v_temp.data_ptr(),
-        (half*) sin.data_ptr(),
-        (half*) cos.data_ptr(),
-        loras,
-        loras_temp.device().is_meta() ? NULL : (half*) loras_temp.data_ptr()
-    );
-}
-
-void q_attn_forward_2
-(
-    uintptr_t q_attn,
-    torch::Tensor x,
-    torch::Tensor attn_output,
-    int batch_size,
-    int q_len,
-    const std::vector<uintptr_t>& loras,
-    torch::Tensor loras_temp
-)
-{
-    QAttn* attn = reinterpret_cast<QAttn*> (q_attn);
-    TORCH_CHECK_DTYPE(x, kHalf);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-    cublasHandle_t cublas_handle = at::cuda::getCurrentCUDABlasHandle();
-
-    attn->forward_cuda_2
-    (
-        cublas_handle,
-        (const half*) attn_output.data_ptr(),
-        (half*) x.data_ptr(),
-        q_len,
-        batch_size,
-        loras,
-        loras_temp.device().is_meta() ? NULL : (half*) loras_temp.data_ptr()
-    );
-}
-
-int q_attn_set_loras
-(
-    uintptr_t q_attn,
-    std::unordered_map<uintptr_t, torch::Tensor>& q_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& q_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& k_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& k_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& v_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& v_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& o_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& o_proj_lora_b
-)
-{
-    QAttn* attn = reinterpret_cast<QAttn*> (q_attn);
-
-    attn->q_proj_lora.clear();
-    attn->k_proj_lora.clear();
-    attn->v_proj_lora.clear();
-    attn->o_proj_lora.clear();
-
-    int max_rank = 0;
-
-    for (const auto& pair : q_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) q_proj_lora_b[pair.first].data_ptr();
-        attn->q_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : k_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) k_proj_lora_b[pair.first].data_ptr();
-        attn->k_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : v_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) v_proj_lora_b[pair.first].data_ptr();
-        attn->v_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : o_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) o_proj_lora_b[pair.first].data_ptr();
-        attn->o_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    return max_rank;
-}
-
-// Quant MLP
-
-uintptr_t make_q_mlp
-(
-    torch::Tensor layernorm,
-    float norm_epsilon,
-    uintptr_t q_gate,
-    uintptr_t q_up,
-    uintptr_t q_down,
-    torch::Tensor temp_state,
-    torch::Tensor temp_a,
-    torch::Tensor temp_b,
-    torch::Tensor temp_dq,
-    int max_rows
-)
-{
-    QMatrix* qm_gate = reinterpret_cast<QMatrix*> (q_gate);
-    QMatrix* qm_up = reinterpret_cast<QMatrix*> (q_up);
-    QMatrix* qm_down = reinterpret_cast<QMatrix*> (q_down);
-
-    TORCH_CHECK_DTYPE(layernorm, kHalf);
-    TORCH_CHECK(qm_gate->height == layernorm.size(0), "gate_proj is wrong shape")
-    TORCH_CHECK(qm_up->height == layernorm.size(0), "up_proj is wrong shape")
-
-    QMLP* mlp = new QMLP
-    (
-        (half*) layernorm.data_ptr(),
-        norm_epsilon,
-        qm_gate,
-        qm_up,
-        qm_down,
-        (half*) temp_state.data_ptr(),
-        (half*) temp_a.data_ptr(),
-        (half*) temp_b.data_ptr(),
-        (half*) temp_dq.data_ptr(),
-        max_rows
-    );
-
-    return reinterpret_cast<uintptr_t> (mlp);
-}
-
-void free_q_mlp
-(
-    uintptr_t handle
-)
-{
-    QMLP* mlp = reinterpret_cast<QMLP*> (handle);
-    delete mlp;
-}
-
-void q_mlp_forward_
-(
-    uintptr_t q_mlp,
-    torch::Tensor x,
-    const std::vector<uintptr_t>& loras,
-    torch::Tensor loras_temp
-)
-{
-    QMLP* mlp = reinterpret_cast<QMLP*> (q_mlp);
-    TORCH_CHECK_DTYPE(x, kHalf);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    TORCH_CHECK(x.size(1) == mlp->gate->height, "x is wrong shape");
-    TORCH_CHECK(x.size(0) <= mlp->max_rows, "Too many rows in x");
-
-    mlp->forward_
-    (
-        at::cuda::getCurrentCUDABlasHandle(),
-        (half*) x.data_ptr(),
-        x.size(0), // rows
-        x.size(1), // columns == hidden_size
-        loras,
-        loras_temp.device().is_meta() ? NULL : (half*) loras_temp.data_ptr()
-    );
-}
-
-int q_mlp_set_loras
-(
-    uintptr_t q_mlp,
-    std::unordered_map<uintptr_t, torch::Tensor>& gate_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& gate_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& up_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& up_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& down_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& down_proj_lora_b
-)
-{
-    QMLP* mlp = reinterpret_cast<QMLP*> (q_mlp);
-
-    mlp->gate_proj_lora.clear();
-    mlp->up_proj_lora.clear();
-    mlp->down_proj_lora.clear();
-
-    int max_rank = 0;
-
-    for (const auto& pair : gate_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) gate_proj_lora_b[pair.first].data_ptr();
-        mlp->gate_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : up_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) up_proj_lora_b[pair.first].data_ptr();
-        mlp->up_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : down_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) down_proj_lora_b[pair.first].data_ptr();
-        mlp->down_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    return max_rank;
-}
-
-
-// RoPE rotary positional embeddings, in-place
-
-void rope_
-(
-    torch::Tensor x,
-    torch::Tensor sin,
-    torch::Tensor cos,
-    int past_len,
-    int num_heads,
-    int head_dim
-)
-{
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(sin, kHalf);
-    TORCH_CHECK_DTYPE(cos, kHalf);
-    TORCH_CHECK(head_dim == cos.size(-1), "cos table does not match head_dim");
-    TORCH_CHECK(head_dim == sin.size(-1), "sin table does not match head_dim");
-
-    int batch_size = x.size(0);
-    int rows_per_batch = x.numel() / head_dim / batch_size;
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    rope_cuda
-    (
-        (half*) x.data_ptr(),
-        (const half*) sin.data_ptr(),
-        (const half*) cos.data_ptr(),
-        batch_size,
-        rows_per_batch,
-        head_dim,
-        num_heads,
-        past_len,
-        NULL
-    );
-}
-
-
-// RMS layernorm
-
-void rms_norm
-(
-    torch::Tensor x,
-    torch::Tensor w,
-    torch::Tensor y,
-    float epsilon
-)
-{
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(w, kHalf);
-    TORCH_CHECK_DTYPE(y, kHalf);
-    TORCH_CHECK_SHAPES(x, 1, w, 0, 1);
-    TORCH_CHECK_SHAPES(x, 1, w, 0, 1);
-    TORCH_CHECK_SHAPES(x, 0, y, 0, 1);
-    TORCH_CHECK_SHAPES(x, 1, y, 1, 1);
-
-    int rows = x.size(0);
-    int dim = x.size(1);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    rms_norm_cuda
-    (
-        (half*) x.data_ptr(),
-        (half*) w.data_ptr(),
-        (half*) y.data_ptr(),
-        epsilon,
-        rows,
-        dim
-    );
-}
-
-void rms_norm_
-(
-    torch::Tensor x,
-    torch::Tensor w,
-    float epsilon
-)
-{
-    rms_norm(x, w, x, epsilon);
-}
-
-
-// Sampling
-
-void apply_rep_penalty
-(
-    torch::Tensor sequence,
-    float penalty_max,
-    int sustain,
-    int decay,
-    torch::Tensor logits
-)
-{
-    TORCH_CHECK_DTYPE(sequence, kLong);
-    TORCH_CHECK_DTYPE(logits, kFloat);
-    TORCH_CHECK_SHAPES(sequence, 0, logits, 0, 1);
-
-    int vocab_size = logits.size(-1);
-    int bsz = sequence.size(0);
-    int seq_len = sequence.size(-1);
-
-    for (int i = 0; i < bsz; i++)
-    {
-        apply_rep_penalty_cpu
-        (
-            vocab_size,
-            ((uint64_t*) sequence.data_ptr()) + i * seq_len,
-            penalty_max,
-            sustain,
-            decay,
-            seq_len,
-            ((float*) logits.data_ptr()) + i * vocab_size
-        );
-    }
-}
-
-void sample_basic
-(
-    torch::Tensor logits,           // shape [bsz, vocab_size]
-    float temperature,
-    int top_k,
-    float top_p,
-    float typical,
-    float random,
-    torch::Tensor output_tokens,    // shape [bsz, 1]
-    torch::Tensor output_probs,     // shape [bsz, 1]
-    torch::Tensor logit_filter      // shape [bsz, vocab_size]
-)
-{
-    TORCH_CHECK_DTYPE(logits, kFloat);
-    TORCH_CHECK_DTYPE(output_tokens, kLong);
-    TORCH_CHECK_DTYPE(output_probs, kFloat);
-    TORCH_CHECK_DTYPE(logits, kFloat);
-    TORCH_CHECK_DTYPE(logit_filter, kBool);
-
-    TORCH_CHECK_SHAPES(logit_filter, 0, logits, 0, 1);
-    TORCH_CHECK_SHAPES(logit_filter, 1, logits, 1, 1);
-
-    int vocab_size = logits.size(-1);
-    int bsz = logits.size(0);
-
-    float* temp_probs = (float*) malloc(vocab_size * sizeof(float));
-    int* temp_indices = (int*) malloc(vocab_size * sizeof(int));
-
-//    int64_t* output_tokens_ptr = (int64_t*) output_tokens.data_ptr();
-//    float* output_probs_ptr = (float*) output_tokens.data_ptr();
-    float* logits_ptr = (float*) logits.data_ptr();
-
-    bool* logits_filter_ptr = (bool*) logit_filter.data_ptr();
-
-    for (int i = 0; i < bsz; i++)
-    {
-        softmax_cpu
-        (
-            vocab_size,
-            temperature,
-            logits_ptr + i * vocab_size,
-            logits_filter_ptr + i * vocab_size,
-            temp_probs
-        );
-
-        if (top_k == 1)
-        {
-            int index = greedy_sample(vocab_size, logits_ptr + i * vocab_size, logits_filter_ptr + i * vocab_size);
-            output_tokens[i] = index;
-            output_probs[i] = temp_probs[index];
-            continue;
-        }
-
-        for (int j = 0; j < vocab_size; j++) temp_indices[j] = j;
-        int num_candidates = vocab_size;
-
-        if (top_k > 0 && top_k < vocab_size)
-        {
-            num_candidates = top_k_cpu(num_candidates, temp_probs, temp_indices, top_k);
-            normalize_cpu(num_candidates, temp_probs);
-        }
-
-        if (top_p > 0.0f && top_p < 1.0f)
-        {
-            num_candidates = top_p_cpu(num_candidates, temp_probs, temp_indices, top_p);
-            normalize_cpu(num_candidates, temp_probs);
-        }
-
-        if (typical > 0.0f && typical < 1.0f)
-        {
-            num_candidates = typical_cpu(num_candidates, temp_probs, temp_indices, typical);
-            normalize_cpu(num_candidates, temp_probs);
-        }
-
-        num_candidates = multinomial_cpu(num_candidates, temp_probs, temp_indices, random);
-        output_tokens[i] = temp_indices[0];
-        output_probs[i] = temp_probs[0];
-    }
-
-    free(temp_probs);
-    free(temp_indices);
-}
-
-
-// Filtering
-
-void logit_filter_exclusive
-(
-    torch::Tensor filter,                       // shape [bsz, vocab_size]
-    const std::vector<std::vector<int>> &exclusive_lists
-)
-{
-    TORCH_CHECK_DTYPE(filter, kBool);
-    TORCH_CHECK((uint64_t) filter.size(0) == exclusive_lists.size(), "Number of lists does not match batch size")
-
-    bool* filter_ptr = (bool*) filter.data_ptr();
-    unsigned int vocab_size = filter.size(1);
-
-    for(const auto& list : exclusive_lists)
-    {
-        unsigned int id = 0;
-        unsigned int next_id_idx = 0;
-        unsigned int next_id = list[next_id_idx];
-
-        while (id < vocab_size)
-        {
-            while (id < next_id)
-            {
-                filter_ptr[id] = false;
-                id++;
-            }
-            id++;
-            next_id_idx++;
-            if (next_id_idx >= list.size()) next_id = vocab_size;
-            else next_id = list[next_id_idx];
-        }
-
-        filter_ptr += vocab_size;
-    }
-}
-
-// For cache conversion
-
-void fp16_to_fp8(torch::Tensor in_tensor, torch::Tensor out_tensor, int batch_size, int offset, int width)
-{
-    TORCH_CHECK_DTYPE(in_tensor, kHalf);
-    TORCH_CHECK_DTYPE(out_tensor, kUInt8);
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(in_tensor));
-
-    TORCH_CHECK_SHAPES(in_tensor, 0, out_tensor, 0, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 1, out_tensor, 1, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 2, out_tensor, 2, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 3, out_tensor, 3, 1);
-
-    int stride = in_tensor.size(1) * in_tensor.size(2) * in_tensor.size(3);
-    int height = batch_size;
-
-    int tsize = in_tensor.size(2) * in_tensor.size(3);
-    offset *= tsize;
-    width *= tsize;
-
-    array_fp16_to_fp8_cuda((const half*) (in_tensor.data_ptr()), (unsigned char*)(out_tensor.data_ptr()), stride, height, offset, width);
-}
-
-void fp8_to_fp16(torch::Tensor in_tensor, torch::Tensor out_tensor, int batch_size, int offset, int width)
-{
-    TORCH_CHECK_DTYPE(in_tensor, kUInt8);
-    TORCH_CHECK_DTYPE(out_tensor, kHalf);
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(in_tensor));
-
-    TORCH_CHECK_SHAPES(in_tensor, 0, out_tensor, 0, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 1, out_tensor, 1, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 2, out_tensor, 2, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 3, out_tensor, 3, 1);
-
-    int stride = in_tensor.size(1) * in_tensor.size(2) * in_tensor.size(3);
-    int height = batch_size;
-
-    int tsize = in_tensor.size(2) * in_tensor.size(3);
-    offset *= tsize;
-    width *= tsize;
-
-    array_fp8_to_fp16_cuda((const unsigned char*)(in_tensor.data_ptr()), (half*)(out_tensor.data_ptr()), stride, height, offset, width);
-}
-
-//void array_fp16_to_fp8_ref(torch::Tensor in_tensor, torch::Tensor out_tensor, int size)
-//{
-//    TORCH_CHECK_DTYPE(in_tensor, kHalf);
-//    TORCH_CHECK_DTYPE(out_tensor, kUInt8);
-//    array_fp16_to_fp8_ref_cuda((const half*) (in_tensor.data_ptr()), (unsigned char*)(out_tensor.data_ptr()), size);
-//}
-//
-//void array_fp8_to_fp16_ref(torch::Tensor in_tensor, torch::Tensor out_tensor, int size)
-//{
-//    TORCH_CHECK_DTYPE(in_tensor, kUInt8);
-//    TORCH_CHECK_DTYPE(out_tensor, kHalf);
-//    array_fp8_to_fp16_ref_cuda((const unsigned char*)(in_tensor.data_ptr()), (half*)(out_tensor.data_ptr()), size);
-//}
-
-// Bindings
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("pack_rows_4", &pack_rows_4, "pack_rows_4");
-    m.def("pack_columns", &pack_columns, "pack_columns");
-    m.def("quantize_err", &quantize_err, "quantize_err");
-    m.def("quantize", &quantize, "quantize");
-    m.def("make_q_matrix", &make_q_matrix, "make_q_matrix");
-    m.def("free_q_matrix", &free_q_matrix, "free_q_matrix");
-    m.def("reconstruct", &reconstruct, "reconstruct");
-    m.def("make_q_mlp", &make_q_mlp, "make_q_mlp");
-    m.def("free_q_mlp", &free_q_mlp, "free_q_mlp");
-    m.def("q_mlp_forward_", &q_mlp_forward_, "q_mlp_forward_");
-    m.def("q_mlp_set_loras", &q_mlp_set_loras, "q_mlp_set_loras");
-    m.def("make_q_attn", &make_q_attn, "make_q_attn");
-    m.def("free_q_attn", &free_q_attn, "free_q_attn");
-    m.def("q_attn_forward_1", &q_attn_forward_1, "q_attn_forward_1");
-    m.def("q_attn_forward_2", &q_attn_forward_2, "q_attn_forward_2");
-    m.def("q_attn_set_loras", &q_attn_set_loras, "q_attn_set_loras");
-    m.def("quantize_range", &quantize_range, "quantize_range");
-    m.def("gemm_half_q_half", &gemm_half_q_half, "gemm_half_q_half");
-    m.def("rms_norm", &rms_norm, "rms_norm");
-    m.def("rms_norm_", &rms_norm_, "rms_norm_");
-    m.def("rope_", &rope_, "rope_");
-    m.def("apply_rep_penalty", &apply_rep_penalty, "apply_rep_penalty");
-    m.def("sample_basic", &sample_basic, "sample_basic");
-    m.def("logit_filter_exclusive", &logit_filter_exclusive, "logit_filter_exclusive");
-    m.def("fp16_to_fp8", &fp16_to_fp8, "fp16_to_fp8");
-    m.def("fp8_to_fp16", &fp8_to_fp16, "fp8_to_fp16");
-//    m.def("array_fp16_to_fp8_ref", &array_fp16_to_fp8_ref, "array_fp16_to_fp8_ref");
-//    m.def("array_fp8_to_fp16_ref", &array_fp8_to_fp16_ref, "array_fp8_to_fp16_ref");
-}
diff --git a/server/exllamav2_kernels/setup.py b/server/exllamav2_kernels/setup.py
deleted file mode 100644
index 8f25cfb1b..000000000
--- a/server/exllamav2_kernels/setup.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-
-setup(
-    name="exllamav2_kernels",
-    ext_modules=[
-        CUDAExtension(
-            name="exllamav2_kernels",
-            sources=[
-                "exllamav2_kernels/ext.cpp",
-                "exllamav2_kernels/cuda/h_gemm.cu",
-                "exllamav2_kernels/cuda/lora.cu",
-                "exllamav2_kernels/cuda/pack_tensor.cu",
-                "exllamav2_kernels/cuda/quantize.cu",
-                "exllamav2_kernels/cuda/q_matrix.cu",
-                "exllamav2_kernels/cuda/q_attn.cu",
-                "exllamav2_kernels/cuda/q_mlp.cu",
-                "exllamav2_kernels/cuda/q_gemm.cu",
-                "exllamav2_kernels/cuda/rms_norm.cu",
-                "exllamav2_kernels/cuda/rope.cu",
-                "exllamav2_kernels/cuda/cache.cu",
-                "exllamav2_kernels/cpp/quantize_func.cpp",
-                "exllamav2_kernels/cpp/sampling.cpp"
-            ],
-        )
-    ],
-    cmdclass={"build_ext": BuildExtension},
-)