From 9613e56270dad822043b4917d825cdf3d0bedac5 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 13 Mar 2024 17:20:54 -0600 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A5=20Remove=20custom=20exllama=20code?= =?UTF-8?q?,=20use=20auto-gptq=20vendored=20instead?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Travis Johnson --- Dockerfile | 24 - .../exllama_kernels/cuda_buffers.cu | 71 -- .../exllama_kernels/cuda_buffers.cuh | 52 - .../exllama_kernels/cuda_compat.cuh | 58 -- .../exllama_kernels/cuda_func/column_remap.cu | 61 -- .../cuda_func/column_remap.cuh | 19 - .../exllama_kernels/cuda_func/q4_matmul.cu | 252 ----- .../exllama_kernels/cuda_func/q4_matmul.cuh | 37 - .../exllama_kernels/cuda_func/q4_matrix.cu | 217 ---- .../exllama_kernels/cuda_func/q4_matrix.cuh | 53 - .../exllama_kernels/exllama_ext.cpp | 249 ----- .../exllama_kernels/matrix.cuh | 294 ------ .../exllama_kernels/exllama_kernels/tuning.h | 13 - .../exllama_kernels/exllama_kernels/util.cuh | 29 - server/exllama_kernels/setup.py | 19 - .../exllamav2_kernels/config.h | 13 - .../exllamav2_kernels/cpp/quantize_func.cpp | 59 -- .../exllamav2_kernels/cpp/quantize_func.h | 25 - .../exllamav2_kernels/cpp/sampling.cpp | 477 --------- .../exllamav2_kernels/cpp/sampling.h | 84 -- .../exllamav2_kernels/cpp/util.h | 25 - .../exllamav2_kernels/cuda/cache.cu | 161 --- .../exllamav2_kernels/cuda/cache.cuh | 14 - .../exllamav2_kernels/cuda/compat.cuh | 56 - .../exllamav2_kernels/cuda/compat_gemm.cuh | 33 - .../exllamav2_kernels/cuda/h_gemm.cu | 275 ----- .../exllamav2_kernels/cuda/h_gemm.cuh | 26 - .../exllamav2_kernels/cuda/lora.cu | 33 - .../exllamav2_kernels/cuda/lora.cuh | 24 - .../exllamav2_kernels/cuda/matrix_view.cuh | 121 --- .../exllamav2_kernels/cuda/pack_tensor.cu | 268 ----- .../exllamav2_kernels/cuda/pack_tensor.cuh | 35 - .../exllamav2_kernels/cuda/q_attn.cu | 159 --- .../exllamav2_kernels/cuda/q_attn.cuh | 98 -- .../exllamav2_kernels/cuda/q_gemm.cu | 211 ---- .../exllamav2_kernels/cuda/q_gemm.cuh | 33 - .../exllamav2_kernels/cuda/q_gemm_kernel.cuh | 484 --------- .../cuda/q_gemm_kernel_gptq.cuh | 219 ---- .../exllamav2_kernels/cuda/q_matrix.cu | 623 ------------ .../exllamav2_kernels/cuda/q_matrix.cuh | 73 -- .../exllamav2_kernels/cuda/q_mlp.cu | 162 --- .../exllamav2_kernels/cuda/q_mlp.cuh | 65 -- .../exllamav2_kernels/cuda/quant/qdq_2.cuh | 103 -- .../exllamav2_kernels/cuda/quant/qdq_3.cuh | 169 ---- .../exllamav2_kernels/cuda/quant/qdq_4.cuh | 227 ----- .../exllamav2_kernels/cuda/quant/qdq_5.cuh | 207 ---- .../exllamav2_kernels/cuda/quant/qdq_6.cuh | 44 - .../exllamav2_kernels/cuda/quant/qdq_8.cuh | 38 - .../exllamav2_kernels/cuda/quant/qdq_util.cuh | 51 - .../exllamav2_kernels/cuda/quantize.cu | 256 ----- .../exllamav2_kernels/cuda/quantize.cuh | 56 - .../exllamav2_kernels/cuda/rms_norm.cu | 130 --- .../exllamav2_kernels/cuda/rms_norm.cuh | 19 - .../exllamav2_kernels/cuda/rope.cu | 132 --- .../exllamav2_kernels/cuda/rope.cuh | 22 - .../exllamav2_kernels/cuda/util.cuh | 42 - .../exllamav2_kernels/ext.cpp | 956 ------------------ server/exllamav2_kernels/setup.py | 28 - 58 files changed, 7784 deletions(-) delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_buffers.cu delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_buffers.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_compat.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu delete mode 100644 
server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu delete mode 100644 server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/exllama_ext.cpp delete mode 100644 server/exllama_kernels/exllama_kernels/matrix.cuh delete mode 100644 server/exllama_kernels/exllama_kernels/tuning.h delete mode 100644 server/exllama_kernels/exllama_kernels/util.cuh delete mode 100644 server/exllama_kernels/setup.py delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/config.h delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.cpp delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.h delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.cpp delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.h delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cpp/util.h delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cu delete mode 100644 
server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cu delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh delete mode 100644 server/exllamav2_kernels/exllamav2_kernels/ext.cpp delete mode 100644 server/exllamav2_kernels/setup.py diff --git a/Dockerfile b/Dockerfile index 61933909d..462885892 100644 --- a/Dockerfile +++ b/Dockerfile @@ -253,24 +253,6 @@ COPY server/custom_kernels/ /usr/src/. RUN cd /usr/src && python setup.py build_ext && python setup.py install -## Build transformers exllama kernels ########################################## -FROM python-builder as exllama-kernels-builder - -WORKDIR /usr/src - -COPY server/exllama_kernels/ . -RUN python setup.py build - - -## Build transformers exllamav2 kernels ######################################## -FROM python-builder as exllamav2-kernels-builder - -WORKDIR /usr/src - -COPY server/exllamav2_kernels/ . -RUN python setup.py build - - ## Flash attention v2 cached build image ####################################### FROM base as flash-att-v2-cache @@ -301,12 +283,6 @@ ENV PATH=/opt/tgis/bin:$PATH RUN --mount=type=bind,from=flash-att-v2-cache,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir -# Copy build artifacts from exllama kernels builder -COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES} - -# Copy build artifacts from exllamav2 kernels builder -COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES} - # Copy over the auto-gptq wheel and install it RUN --mount=type=bind,from=auto-gptq-cache,src=/usr/src/auto-gptq-wheel,target=/usr/src/auto-gptq-wheel \ pip install /usr/src/auto-gptq-wheel/*.whl --no-cache-dir diff --git a/server/exllama_kernels/exllama_kernels/cuda_buffers.cu b/server/exllama_kernels/exllama_kernels/cuda_buffers.cu deleted file mode 100644 index ee2cbee23..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_buffers.cu +++ /dev/null @@ -1,71 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#define _cuda_buffers_cu -#include "cuda_buffers.cuh" - -CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL}; -// __constant__ half2 q4_table[16][256]; -// half2 q4_table_host[16][256]; -// bool q4_table_init = false; - -CudaBuffers::CudaBuffers -( - int _device, - half* _temp_state, - half* _temp_dq -) : - device(_device), - temp_state(_temp_state), - temp_dq(_temp_dq) -{ - cudaSetDevice(_device); - - cudaStreamCreate(&alt_stream_1); - cudaStreamCreate(&alt_stream_2); - cudaStreamCreate(&alt_stream_3); - cudaEventCreate(&alt_stream_1_done); - cudaEventCreate(&alt_stream_2_done); - cudaEventCreate(&alt_stream_3_done); -} - -CudaBuffers::~CudaBuffers() -{ - cudaStreamDestroy(alt_stream_1); - cudaStreamDestroy(alt_stream_2); - cudaStreamDestroy(alt_stream_3); - cudaEventDestroy(alt_stream_1_done); - cudaEventDestroy(alt_stream_2_done); - cudaEventDestroy(alt_stream_3_done); -} - -CudaBuffers* get_buffers(const int device_index) -{ - return g_buffers[device_index]; -} - -void prepare_buffers_cuda -( - int _device, - half* _temp_state, - half* _temp_dq -) -{ - CudaBuffers* buffers = new CudaBuffers - 
( - _device, - _temp_state, - _temp_dq - ); - - g_buffers[_device] = buffers; -} - -void cleanup_buffers_cuda() -{ - for (int i = 0; i < CUDA_MAX_DEVICES; i++) - { - if (!g_buffers[i]) continue; - delete g_buffers[i]; - g_buffers[i] = NULL; - } -} diff --git a/server/exllama_kernels/exllama_kernels/cuda_buffers.cuh b/server/exllama_kernels/exllama_kernels/cuda_buffers.cuh deleted file mode 100644 index afb60a012..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_buffers.cuh +++ /dev/null @@ -1,52 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _cuda_buffers_cuh -#define _cuda_buffers_cuh - -#include -#include -#include -#include - -const int CUDA_MAX_DEVICES = 16; - -// #ifndef _cuda_buffers_cu -// extern __constant__ half2 q4_table[16][256]; -// #endif - -class CudaBuffers -{ -public: - int device; - - half* temp_state; // [max_hidden_rows * intermediate_size] - half* temp_dq; // size of largest quant tensor * 8 - - cudaStream_t alt_stream_1; - cudaStream_t alt_stream_2; - cudaStream_t alt_stream_3; - cudaEvent_t alt_stream_1_done; - cudaEvent_t alt_stream_2_done; - cudaEvent_t alt_stream_3_done; - - CudaBuffers - ( - int _device, - half* _temp_state, - half* _temp_dq - ); - ~CudaBuffers(); -}; - -CudaBuffers* get_buffers(const int device_index); - -void prepare_buffers_cuda -( - int _device, - half* _temp_state, - half* _temp_dq -); - -void cleanup_buffers_cuda(); - -#endif diff --git a/server/exllama_kernels/exllama_kernels/cuda_compat.cuh b/server/exllama_kernels/exllama_kernels/cuda_compat.cuh deleted file mode 100644 index 8dfa25de3..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_compat.cuh +++ /dev/null @@ -1,58 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _cuda_compat_cuh -#define _cuda_compat_cuh - -// atomicAdd for half types, to support CC < 7.x - -__device__ __forceinline__ void atomicAdd_half(half* address, half val) -{ - unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int assumed; - - do - { - assumed = old; - __half_raw hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - half tmpres = __hadd(hsum, val); - hsum = __half_raw(tmpres); - old = (size_t)address & 2 ? 
(old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; - old = atomicCAS(address_as_ui, assumed, old); - } - while (assumed != old); -} - -// atomicAdd for half2 types - -__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) -{ - unsigned int* address_as_ui = (unsigned int*)address; - unsigned int old = *address_as_ui; - unsigned int assumed; - do - { - assumed = old; - half2 old_val = *((half2*)&old); - half2 new_val = __hadd2(old_val, val); - old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); - } - while (assumed != old); -} - -// - -#if defined(__CUDA_ARCH__) -#if __CUDA_ARCH__ < 700 - -__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } - -#if __CUDA_ARCH__ < 600 -__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } -#endif - -#endif -#endif - -#endif diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu b/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu deleted file mode 100644 index c25b0206b..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cu +++ /dev/null @@ -1,61 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#include "column_remap.cuh" -#include "../util.cuh" - -const int SHUF_BLOCKSIZE_X = 256; -const int SHUF_BLOCKSIZE_Y = 16; - -__global__ void column_remap_kernel -( - const half* __restrict__ x, - half* __restrict__ x_new, - const int x_width, - const int x_height, - const uint32_t* x_map -) -{ - int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; - int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y; - - int x_stride = x_width; - int x_idx = x_row * x_stride + x_column; - - int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height); - int x_idx_end = x_row_end * x_stride + x_column; - - int s_column = x_map[x_column]; - int s_idx = x_row * x_stride + s_column; - - while (x_idx < x_idx_end) - { - x_new[x_idx] = x[s_idx]; - x_idx += x_stride; - s_idx += x_stride; - } -} - -// Remap columns in x to correspond to sequential group index before matmul -// -// perform x -> seq_x such that seq_x @ seq_w == x @ w - -void column_remap_cuda -( - const half* x, - half* x_new, - const int x_height, - const int x_width, - const uint32_t* x_map -) -{ - dim3 threads(SHUF_BLOCKSIZE_X, 1, 1); - - dim3 blocks - ( - (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X, - (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y, - 1 - ); - - column_remap_kernel<<>>(x, x_new, x_width, x_height, x_map); -} diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh deleted file mode 100644 index 6571c17d6..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_func/column_remap.cuh +++ /dev/null @@ -1,19 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _column_remap_cuh -#define _column_remap_cuh - -#include -#include -#include - -void column_remap_cuda -( - const half* x, - half* x_new, - const int x_height, - const int x_width, - const uint32_t* x_map -); - -#endif \ No newline at end of file diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu deleted file mode 100644 index 60dc4c9db..000000000 --- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cu +++ /dev/null @@ -1,252 +0,0 @@ -#include "q4_matmul.cuh" -#include 
"column_remap.cuh" -#include "../util.cuh" -#include "../matrix.cuh" -#include "../cuda_compat.cuh" -#include "../cuda_buffers.cuh" - -const int THREADS_X = 32; // Block size and thread count along columns in w and out -const int THREADS_Y = 1; // Block size and thread count along rows in x and out - -typedef void (*fp_q4_matmul_kernel) -( - const half*, - const uint32_t*, - half*, - const half*, - const uint32_t*, - const int, - const int, - const int, - const int, - const int, - const uint32_t*, - bool -); - -template -__global__ void q4_matmul_kernel -( - const half* __restrict__ x, - const uint32_t* __restrict__ w, - half* __restrict__ out, - const half* __restrict__ w_scales, - const uint32_t* __restrict__ w_zeros, - const int height, - const int dim, - const int width, - const int groupsize, - const int block_size_z, - const uint32_t* __restrict__ x_map, - bool no_zero -) -{ - // Start of block - - int x_column = block_size_z * blockIdx.z; - int x_column_end = min(dim, block_size_z * (blockIdx.z + 1)); - - int w_column = THREADS_X * blockIdx.x + threadIdx.x; - int x_row = THREADS_Y * blockIdx.y + threadIdx.y; - - int iterations = (x_column_end - x_column) / 8; - - // Views - - MatrixView_half x_(x, height, dim); - MatrixView_half w_scales_(w_scales, dim / groupsize, width); - MatrixView_q4_row w_zeros_(w_zeros, dim / groupsize, width); - MatrixView_q4_column w_(w, dim, width); - MatrixView_half_rw out_(out, height, width); - - // Zero output - - if (!no_zero && blockIdx.z == 0 && (threadIdx.x & 1) == 0) - { - *((uint32_t*) out_.item_ptr(x_row, w_column)) = 0; - __syncthreads(); - } - - // Loop over part of x row (and w column) - - half2 acc = {}; - half acc_h = {}; - - if constexpr (use_groupsize) - { - // For quant matrices where groupsize divides BLOCK_SIZE_Z we always start on a group boundary, so this - // could be slightly faster - - for (int k = x_column, group = x_column / groupsize; k < x_column + iterations * 8; group++, k += groupsize) - { - if constexpr (use_half2) - { - half2 w_scale = w_scales_.item_half2half2(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); - else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8); - } - else - { - half w_scale = w_scales_.item(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); - else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8); - } - } - } - else - { - // Otherwise assume groupsize is a multiple of 8, do 8 columns per iteration and trust the cache - - for (int k = x_column; k < x_column + iterations * 8; k += 8) - { - if constexpr (use_half2) - { - int group = k / groupsize; - half2 w_scale = w_scales_.item_half2half2(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map); - else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1); - } - else - { - int group = k / groupsize; - half w_scale = w_scales_.item(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) + 1; - - if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, 
w_scale, w_zero, 1, x_map);
-                else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
-            }
-        }
-    }
-
-    // Add to block result
-
-    if constexpr (use_half2)
-    {
-        half result = __hadd(acc.x, acc.y);
-        atomicAdd(out_.item_ptr(x_row, w_column), result);
-    }
-    else
-    {
-        atomicAdd(out_.item_ptr(x_row, w_column), acc_h);
-    }
-}
-
-fp_q4_matmul_kernel q4_matmul_kernel_pick(ExLlamaTuning* tuningParams, int block_size_z, int groupsize, uint32_t* x_map)
-{
-    // <bool use_half2, bool use_groupsize, bool use_x_map>
-    if (tuningParams->matmul_no_half2) {
-        if (block_size_z % groupsize == 0) {
-            if (x_map) return q4_matmul_kernel<false, true,  true >;
-            else       return q4_matmul_kernel<false, true,  false>;
-        } else {
-            if (x_map) return q4_matmul_kernel<false, false, true >;
-            else       return q4_matmul_kernel<false, false, false>;
-        }
-    } else {
-        if (block_size_z % groupsize == 0)
-        {
-            if (x_map) return q4_matmul_kernel<true,  true,  true >;
-            else       return q4_matmul_kernel<true,  true,  false>;
-        } else {
-            if (x_map) return q4_matmul_kernel<true,  false, true >;
-            else       return q4_matmul_kernel<true,  false, false>;
-        }
-    }
-};
-
-// Compute y = x @ w
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero,
-    cudaStream_t alt_stream
-)
-{
-    int height = x_height;
-    int dim = w->height;
-    int width = w->width;
-
-    cudaSetDevice(w->device);
-
-    uint32_t* x_map = w->cuda_x_map;
-    const half* x_mapped = x;
-    if (x_map && !tuningParams->matmul_fused_remap && !alt_stream)
-    {
-        CudaBuffers* buffers = get_buffers(w->device);
-        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
-        x_mapped = buffers->temp_state;
-        x_map = NULL;
-    }
-
-    int block_size_z;
-    if (w->width == 4096) block_size_z = 384;           // 7B
-    else if (w->width == 11008) block_size_z = 256;
-    else if (w->width == 5120) block_size_z = 384;      // 13B
-    else if (w->width == 13824) block_size_z = 256;
-    else if (w->width == 6656) block_size_z = 256;      // 33B
-    else if (w->width == 17920) block_size_z = 128;
-    else block_size_z = 256;
-
-    //if (!no_zero) cudaMemsetAsync(out, 0, x_height * w->width * sizeof(half));
-
-    dim3 threads(THREADS_X, THREADS_Y, 1);
-
-    dim3 blocks
-    (
-        (width + threads.x - 1) / threads.x,
-        (height + threads.y - 1) / threads.y,
-        (dim + block_size_z - 1) / block_size_z
-    );
-
-    fp_q4_matmul_kernel kernel = q4_matmul_kernel_pick(tuningParams, block_size_z, w->groupsize, x_map);
-
-    kernel<<<blocks, threads, 0, alt_stream>>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero);
-}
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const cublasHandle_t handle,
-    bool no_zero
-)
-{
-    int height = x_height;
-    int dim = w->height;
-    int width = w->width;
-
-    cudaSetDevice(w->device);
-    CudaBuffers* buffers = get_buffers(w->device);
-
-    const half* x_mapped = x;
-    if (w->cuda_x_map)
-    {
-        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
-        x_mapped = buffers->temp_state;
-    }
-
-    w->reconstruct(buffers->temp_dq);
-
-    const half alpha = __float2half(1.0f);
-    const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
-    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
-
-//    const float alpha = 1.0f;
-//    const float beta = no_zero ? 1.0f : 0.0f;
-//    cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
-//                  x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
-}
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh
deleted file mode 100644
index 63611790a..000000000
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matmul.cuh
+++ /dev/null
@@ -1,37 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matmul_cuh
-#define _q4_matmul_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include <ATen/cuda/CUDAContext.h>
-
-#include "q4_matrix.cuh"
-#include "../tuning.h"
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero = false,
-    cudaStream_t alt_stream = NULL
-);
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const cublasHandle_t handle,
-    bool no_zero = false
-);
-
-#endif
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu
deleted file mode 100644
index f3d1564f3..000000000
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu
+++ /dev/null
@@ -1,217 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#include "q4_matrix.cuh"
-#include <vector>
-#include "../util.cuh"
-#include "../matrix.cuh"
-
-using namespace std;
-
-const int UNSHUF_BLOCKSIZE_X = 64;
-
-const int RECONS_THREADS_X = 64;     // Block size and thread count along columns in out, each thread converts 1 column
-const int RECONS_THREADS_Y = 1;      // Block size and thread count along rows in x and out, each thread converts 8 rows
-
-vector<Q4Matrix*> g_q4_matrices;
-
-void g_q4_keep_matrix(Q4Matrix* m)
-{
-    g_q4_matrices.push_back(m);
-}
-
-void g_q4_free_matrices()
-{
-    for (const auto& m : g_q4_matrices) delete m;
-    g_q4_matrices.clear();
-}
-
-Q4Matrix::Q4Matrix
-(
-    const int _height,
-    const int _width,
-    const int _groups,
-
-    uint32_t* _qweight,
-    uint32_t* _qzeros,
-    half* _scales,
-    uint32_t* _g_idx,
-
-    const int _device
-) :
-    height(_height),
-    width(_width),
-    groups(_groups),
-    device(_device)
-{
-    cudaSetDevice(device);
-
-    cuda_qweight = _qweight;
-    cuda_qzeros = _qzeros;
-    cuda_scales = _scales;
-
-    groupsize = height / groups;
-
-    if (_g_idx) make_sequential(_g_idx);
-}
-
-Q4Matrix::~Q4Matrix()
-{
-}
-
-// Make sequential
-
-__global__ void make_sequential_kernel
-(
-    const uint32_t* __restrict__ w,
-    uint32_t* __restrict__ w_new,
-    const uint32_t* __restrict__ x_map,
-    const int w_height,
-    const int w_width
-)
-{
-    const uint64_t* w2 = (uint64_t*) w;
-    uint64_t* w_new2 = (uint64_t*) w_new;
-    int w2_stride = w_width >> 1;
-
-    int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
-    int w_new2_row = blockIdx.y;
-
-    int x_map_idx = w_new2_row << 3;
-
-    uint64_t dst = 0;
-
-    #pragma unroll
-    for (int i = 0; i < 8; i++)
-    {
-        int source_row = x_map[x_map_idx++];
-
-        int w2_row = source_row >> 3;
-        int w2_subrow = source_row & 0x07;
-        int w2_row_shift = w2_subrow << 2;
-        int wnew2_row_shift = i << 2;
-
-        uint64_t src = w2[w2_row * w2_stride + w2_column];
-        src >>= w2_row_shift;
-        src &= 0x0000000f0000000f;
-        src <<= wnew2_row_shift;
-        dst |= src;
-    }
-
-    w_new2[w_new2_row * w2_stride + w2_column] = dst;
-}
-
-void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
-{
-    uint32_t* cuda_new_qweight = NULL;
-    cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
-    cudaMalloc(&cuda_x_map, height * sizeof(uint32_t));  // TODO: Should probably be allocated in PyTorch
-
-    uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
-    uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
-    uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
-
-    // Group histogram
-
-    for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;
-
-    // Group map
-
-    for (int i = 0, acc = 0; i < groups; i++)
-    {
-        short tmp = cpu_g_idx_map[i];
-        cpu_g_idx_map[i] = acc;
-        acc += tmp;
-    }
-
-    // X map (inverse)
-
-    for (int row = 0; row < height; row++)
-    {
-        uint32_t target_group = cpu_g_idx[row];
-        uint32_t target_row = cpu_g_idx_map[target_group];
-        cpu_g_idx_map[target_group]++;
-        cpu_x_map_inv[row] = target_row;
-    }
-
-    // X map
-
-    for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;
-
-    // Move to CUDA
-
-    cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);
-
-    // Rearrange rows in w
-
-    dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
-    dim3 blocks(width / UNSHUF_BLOCKSIZE_X / 2, height / 8, 1);
-
-    make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);
-
-    // Replace qweights
-
-    cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
-
-    // Cleanup
-
-    cudaDeviceSynchronize();
-    cudaFree(cuda_new_qweight);
-    free(cpu_g_idx_map);
-    free(cpu_x_map);
-    free(cpu_x_map_inv);
-}
-
-__global__ void reconstruct_kernel
-(
-    const uint32_t* __restrict__ w,
-    half* __restrict__ out,  // (y)
-    const half* __restrict__ w_scales,
-    const uint32_t* __restrict__ w_zeros,
-    const int height,
-    const int width,
-    const int groupsize
-)
-{
-    // Start of block
-
-    int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
-    int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;
-
-    // Views
-
-    MatrixView_q4_column w_(w, height, width);
-    MatrixView_half_rw out_(out, height, width);
-    MatrixView_half w_scales_(w_scales, height / groupsize, width);
-    MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);
-
-    // Groupsize version
-
-    int group = row / groupsize;
-
-    half w_scale = w_scales_.item(group, column);
-    uint32_t w_zero = w_zeros_.item(group, column) + 1;
-
-    uint32_t w_read = w_.item_uint32_t(row, column);
-    half* out_ptr = out_.item_ptr(row, column);
-
-    #pragma unroll
-    for (int s = 0; s < 32; s += 4)
-    {
-        half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
-        *out_ptr = w_item; out_ptr += out_.width;
-    }
-}
-
-void Q4Matrix::reconstruct(half* out)
-{
-    dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);
-
-    dim3 blocks
-    (
-        (width + threads.x - 1) / threads.x,
-        (height / 8 + threads.y - 1) / threads.y,
-        1
-    );
-
-    reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
-}
\ No newline at end of file
diff --git a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh b/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh
deleted file mode 100644
index 50cb72a41..000000000
--- a/server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cuh
+++ /dev/null
@@ -1,53 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matrix_cuh
-#define _q4_matrix_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-
-class Q4Matrix
-{
-public:
-
-    int device;
-
-    int height;
-    int width;
-    int groups;
-    int groupsize;
-
-    uint32_t* cuda_qweight = NULL;
-    uint32_t* cuda_qzeros = NULL;
-    half* cuda_scales = NULL;
-    uint32_t* cuda_x_map = NULL;
-
-    Q4Matrix
-    (
-        const int _height,
-        const int _width,
-        const int _groups,
-
-        uint32_t* _qweight,
-        uint32_t* _qzeros,
-        half* _scales,
-        uint32_t* _g_idx,
-
-        const int _device
-    );
-
-    ~Q4Matrix();
-
-    void reconstruct(half* out);
-
-private:
-
-    void make_sequential(const uint32_t* cpu_g_idx);
-
-};
-
-void g_q4_keep_matrix(Q4Matrix* m);
-void g_q4_free_matrices();
-
-#endif
\ No newline at end of file
diff --git a/server/exllama_kernels/exllama_kernels/exllama_ext.cpp b/server/exllama_kernels/exllama_kernels/exllama_ext.cpp
deleted file mode 100644
index b786988bd..000000000
--- a/server/exllama_kernels/exllama_kernels/exllama_ext.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#include <torch/extension.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include "util.cuh"
-#include "tuning.h"
-#include "cuda_buffers.cuh"
-#include "cuda_func/q4_matrix.cuh"
-#include "cuda_func/q4_matmul.cuh"
-#include "cuda_func/column_remap.cuh"
-
-// Check CUDA return code. We don't want to include Torch headers in the .cu files because parsing them adds almost a
-// minute to the compile time on a 12900K. Also passing exceptions back to Python is super tricky, so in place of
-// exceptions, CUDA functions return with a cudaError_t which we can parse and dump to the console.
-
-void check_cuda(cudaError_t ret)
-{
-    switch (ret)
-    {
-        case cudaSuccess:
-            break;
-
-        case cudaUnspecified:
-            printf(" **** Unspecified error\n");
-            TORCH_CHECK(false, "CUDA error");
-            break;
-
-        default:
-            printf(" **** CUDA error\n"); \
-            printf(" **** %s\n", cudaGetErrorString(ret)); \
-            TORCH_CHECK(false, "CUDA error"); \
-            break;
-    }
-}
-
-// Some decluttering macros
-
-#define STRINGIFY_(__x) #__x
-#define STRINGIFY(__x) STRINGIFY_(__x)
-#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPE_MOD(__x, __dim_x, __mod) TORCH_CHECK((__x).size(__dim_x) % __mod == 0, #__x ".shape[" STRINGIFY(__dim_x) "] must be a multiple of " STRINGIFY(__mod))
-
-#define TORCH_CHECK_DEVICE_INDEX(__index) \
-do { \
-    TORCH_CHECK(__index >= 0, "no device index"); \
-    TORCH_CHECK(__index < CUDA_MAX_DEVICES, "invalid device index"); \
-} while(0)
-
-#define TORCH_CHECK_QUANT(__w, __w_scales, __w_zeros, __seq_g_idx, __x_map) \
-do { \
-    TORCH_CHECK_DTYPE(__w, kInt); \
-    TORCH_CHECK_DTYPE(__w_scales, kHalf); \
-    TORCH_CHECK_DTYPE(__w_zeros, kInt); \
-    TORCH_CHECK_DTYPE_OPT(__seq_g_idx, kShort); \
-    TORCH_CHECK_DTYPE_OPT(__x_map, kInt); \
-    TORCH_CHECK_SHAPES_OPT(__seq_g_idx, 0, __w, 0, 2 * 8); \
-    TORCH_CHECK_SHAPES_OPT(__x_map, 0, __w, 0, 8); \
-} while(0)
-
-int get_groupsize(torch::Tensor w, torch::Tensor w_zeros)
-{
-    int groupsize = w.size(0) * 8
/ w_zeros.size(0); - TORCH_CHECK(groupsize * w_zeros.size(0) == w.size(0) * 8, "w.shape[-2] must be a multiple of zeros.shape[-2]") - return groupsize; -} - - -// Tuning parameters - -ExLlamaTuning tuningParams; - -void set_tuning_params -( - int matmul_recons_thd, - bool matmul_fused_remap, - bool matmul_no_half2 -) -{ - tuningParams.matmul_recons_thd = matmul_recons_thd; - tuningParams.matmul_fused_remap = matmul_fused_remap; - tuningParams.matmul_no_half2 = matmul_no_half2; -} - - -// Release all unmanaged objects allocated by the extension - -void cleanup() -{ - cleanup_buffers_cuda(); - g_q4_free_matrices(); -} - - -// Prepare buffers for forward pass - -void prepare_buffers -( - torch::Device device, - torch::Tensor temp_state, - torch::Tensor temp_dq -) -{ - int device_index = device.index(); - TORCH_CHECK_DEVICE_INDEX(device_index); - const at::cuda::OptionalCUDAGuard device_guard(device); - - prepare_buffers_cuda - ( - device_index, - (half*) temp_state.data_ptr(), - (half*) temp_dq.data_ptr() - ); -} - - -// Create Q4Matrix, return handle - -uintptr_t make_q4 -( - torch::Tensor qweight, - torch::Tensor qzeros, - torch::Tensor scales, - torch::Tensor g_idx, - int device -) -{ - TORCH_CHECK_DTYPE(qweight, kInt); - TORCH_CHECK_DTYPE(qzeros, kInt); - TORCH_CHECK_DTYPE(scales, kHalf); - TORCH_CHECK_DTYPE_OPT(g_idx, kInt); - TORCH_CHECK_SHAPES(qweight, 1, qzeros, 1, 8); - TORCH_CHECK_SHAPES(scales, 1, qweight, 1, 1); - TORCH_CHECK_SHAPES(qzeros, 0, scales, 0, 1); - - int width = qweight.size(1); - int height = qweight.size(0) * 8; - int groups = qzeros.size(0); - - Q4Matrix* m = new Q4Matrix - ( - height, - width, - groups, - - (uint32_t*) qweight.data_ptr(), - (uint32_t*) qzeros.data_ptr(), - (half*) scales.data_ptr(), - g_idx.device().is_meta() ? 
NULL : (uint32_t*) g_idx.data_ptr(),
-
-        device
-    );
-
-    g_q4_keep_matrix(m);
-    return reinterpret_cast<uintptr_t> (m);
-}
-
-
-// Matmul half @ quant -> half
-
-void q4_matmul
-(
-    torch::Tensor x,
-    uintptr_t w,
-    torch::Tensor out
-)
-{
-    Q4Matrix* wm = reinterpret_cast<Q4Matrix*> (w);
-
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(out, kHalf);
-    TORCH_CHECK_SHAPES(x, 0, out, 0, 1);
-    TORCH_CHECK(wm->height == x.size(-1), "x and w have incompatible shapes")
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    int x_height = x.size(0);
-
-    if (tuningParams.matmul_recons_thd == 0 || x_height < tuningParams.matmul_recons_thd)
-    {
-        q4_matmul_cuda
-        (
-            &tuningParams,
-            (half*) x.data_ptr(),
-            x_height,
-            wm,
-            (half*) out.data_ptr()
-        );
-    }
-    else
-    {
-        q4_matmul_recons_cuda
-        (
-            &tuningParams,
-            (half*) x.data_ptr(),
-            x_height,
-            wm,
-            (half*) out.data_ptr(),
-            at::cuda::getCurrentCUDABlasHandle()
-        );
-    }
-}
-
-
-// Remap columns in half tensor
-
-void column_remap
-(
-    torch::Tensor x,
-    torch::Tensor x_new,
-    torch::Tensor x_map
-)
-{
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(x_new, kHalf);
-    TORCH_CHECK_DTYPE(x_map, kInt);
-    TORCH_CHECK_SHAPES(x_map, 0, x, 1, 1);
-
-    int height = x.size(0);
-    int width = x.size(1);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    column_remap_cuda
-    (
-        (half*) x.data_ptr(),
-        (half*) x_new.data_ptr(),
-        height,
-        width,
-        (uint32_t*) x_map.data_ptr()
-    );
-}
-
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("set_tuning_params", &set_tuning_params, "set_tuning_params");
-    m.def("prepare_buffers", &prepare_buffers, "prepare_buffers");
-    m.def("cleanup", &cleanup, "cleanup");
-    m.def("make_q4", &make_q4, "make_q4");
-    m.def("q4_matmul", &q4_matmul, "q4_matmul");
-}
diff --git a/server/exllama_kernels/exllama_kernels/matrix.cuh b/server/exllama_kernels/exllama_kernels/matrix.cuh
deleted file mode 100644
index 2fd5ab0b3..000000000
--- a/server/exllama_kernels/exllama_kernels/matrix.cuh
+++ /dev/null
@@ -1,294 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _matrix_cuh
-#define _matrix_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-
-class MatrixView_half
-{
-public:
-    const half* data;
-    const int height;
-    const int width;
-
-    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-
-    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
-    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
-    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }
-};
-
-class MatrixView_half_rw
-{
-public:
-    half* data;
-    const int height;
-    const int width;
-
-    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-
-    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
-    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
-    __device__ __forceinline__ void set(int row, int column, half value) {
data[row * width + column] = value; } - __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } -}; - -class MatrixView_q4_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x07) * 4; - return (data[row * width / 8 + column / 8] >> shift) & 0x0f; - } -}; - -class MatrixView_q4_column -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (row & 0x07) * 4; - return (data[row / 8 * width + column] >> shift) & 0x0f; - } - - __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; } - __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } -}; - -// TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu - -// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale - -__device__ __forceinline__ half2 dot_product_8 -( - const half2 acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half2 v_scale_2, - const uint32_t v_zero, // + 1 (!!) 
- const int count -) -{ - const half2* h_ptr = (const half2*) h_.item_ptr(h_row, h_column); - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half2 result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half2 v_01 = __halves2half2(v_0, v_1); - half2 v_23 = __halves2half2(v_2, v_3); - half2 v_45 = __halves2half2(v_4, v_5); - half2 v_67 = __halves2half2(v_6, v_7); - -// half2 v_01 = q4_table[v_zero - 1][(v_read ) & 0xff]; // (constant memory is too slow apparently) -// half2 v_23 = q4_table[v_zero - 1][(v_read >> 8) & 0xff]; -// half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff]; -// half2 v_67 = q4_table[v_zero - 1][(v_read >> 24) ]; - - half2 tmp = __hmul2(*h_ptr++, v_01); - tmp = __hfma2(*h_ptr++, v_23, tmp); - tmp = __hfma2(*h_ptr++, v_45, tmp); - tmp = __hfma2(*h_ptr++, v_67, tmp); - result = __hfma2(v_scale_2, tmp, result); - } - - return result; -} - -__device__ __forceinline__ half dot_product_8_h -( - const half acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half v_scale, - const uint32_t v_zero, // + 1 (!!) - const int count -) -{ - const half* h_ptr = h_.item_ptr(h_row, h_column); - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half tmp = __hmul(*h_ptr++, v_0); - tmp = __hfma(*h_ptr++, v_1, tmp); - tmp = __hfma(*h_ptr++, v_2, tmp); - tmp = __hfma(*h_ptr++, v_3, tmp); - tmp = __hfma(*h_ptr++, v_4, tmp); - tmp = __hfma(*h_ptr++, v_5, tmp); - tmp = __hfma(*h_ptr++, v_6, tmp); - tmp = __hfma(*h_ptr++, v_7, tmp); - result = __hfma(v_scale, tmp, result); - } - - return result; -} - -// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map - -__device__ __forceinline__ half2 dot_product_8_x_map -( - const half2 acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half2 v_scale_2, - const uint32_t v_zero, // + 1 (!!) 
- const int count, - const uint32_t* x_map -) -{ - const half* h_ptr = h_.item_ptr(h_row, 0); - const uint32_t* x_map_ptr = x_map + h_column; - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half2 result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half2 v_01 = __halves2half2(v_0, v_1); - half2 v_23 = __halves2half2(v_2, v_3); - half2 v_45 = __halves2half2(v_4, v_5); - half2 v_67 = __halves2half2(v_6, v_7); - - half h_0 = h_ptr[*x_map_ptr++]; - half h_1 = h_ptr[*x_map_ptr++]; - half h_2 = h_ptr[*x_map_ptr++]; - half h_3 = h_ptr[*x_map_ptr++]; - half h_4 = h_ptr[*x_map_ptr++]; - half h_5 = h_ptr[*x_map_ptr++]; - half h_6 = h_ptr[*x_map_ptr++]; - half h_7 = h_ptr[*x_map_ptr++]; - - half2 h_01 = __halves2half2(h_0, h_1); - half2 h_23 = __halves2half2(h_2, h_3); - half2 h_45 = __halves2half2(h_4, h_5); - half2 h_67 = __halves2half2(h_6, h_7); - - half2 tmp = __hmul2(h_01, v_01); - tmp = __hfma2(h_23, v_23, tmp); - tmp = __hfma2(h_45, v_45, tmp); - tmp = __hfma2(h_67, v_67, tmp); - result = __hfma2(v_scale_2, tmp, result); - } - - return result; -} - -__device__ __forceinline__ half dot_product_8_x_map_h -( - const half acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half v_scale, - const uint32_t v_zero, // + 1 (!!) 
- const int count, - const uint32_t* x_map -) -{ - const half* h_ptr = h_.item_ptr(h_row, 0); - const uint32_t* x_map_ptr = x_map + h_column; - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half tmp = __hmul(h_ptr[*x_map_ptr++], v_0); - tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp); - result = __hfma(v_scale, tmp, result); - } - - return result; -} - -#endif diff --git a/server/exllama_kernels/exllama_kernels/tuning.h b/server/exllama_kernels/exllama_kernels/tuning.h deleted file mode 100644 index 770ca46aa..000000000 --- a/server/exllama_kernels/exllama_kernels/tuning.h +++ /dev/null @@ -1,13 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _tuning_h -#define _tuning_h - -struct ExLlamaTuning -{ - int matmul_recons_thd; - bool matmul_fused_remap; - bool matmul_no_half2; -}; - -#endif diff --git a/server/exllama_kernels/exllama_kernels/util.cuh b/server/exllama_kernels/exllama_kernels/util.cuh deleted file mode 100644 index 2839b10fa..000000000 --- a/server/exllama_kernels/exllama_kernels/util.cuh +++ /dev/null @@ -1,29 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _util_cuh -#define _util_cuh - -#include -#include -#include -#include - -#define cudaUnspecified cudaErrorApiFailureBase - -// React to failure on return code != cudaSuccess - -#define _cuda_check(fn) \ -do { \ - {_cuda_err = fn;} \ - if (_cuda_err != cudaSuccess) goto _cuda_fail; \ -} while(false) - -// React to failure on return code == 0 - -#define _alloc_check(fn) \ -do { \ - if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \ - else _cuda_err = cudaSuccess; \ -} while(false) - -#endif diff --git a/server/exllama_kernels/setup.py b/server/exllama_kernels/setup.py deleted file mode 100644 index f06a72bd9..000000000 --- a/server/exllama_kernels/setup.py +++ /dev/null @@ -1,19 +0,0 @@ -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -setup( - name="exllama_kernels", - ext_modules=[ - CUDAExtension( - name="exllama_kernels", - sources=[ - "exllama_kernels/exllama_ext.cpp", - "exllama_kernels/cuda_buffers.cu", - "exllama_kernels/cuda_func/column_remap.cu", - "exllama_kernels/cuda_func/q4_matmul.cu", - "exllama_kernels/cuda_func/q4_matrix.cu" - ], - ) - ], - cmdclass={"build_ext": BuildExtension}, -) diff --git a/server/exllamav2_kernels/exllamav2_kernels/config.h b/server/exllamav2_kernels/exllamav2_kernels/config.h deleted file mode 100644 index 86baaf412..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/config.h +++ /dev/null @@ -1,13 
+0,0 @@ -#ifndef _config_h -#define _config_h - -#define MAX_Q_GEMM_ROWS 50 - -#define QMODE_2BIT 1 -#define QMODE_3BIT 1 -#define QMODE_4BIT 1 -#define QMODE_5BIT 1 -#define QMODE_6BIT 0 -#define QMODE_8BIT 0 - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.cpp b/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.cpp deleted file mode 100644 index f83c93490..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include "quantize_func.h" -#include "../cuda/quantize.cuh" - -void quantize_range -( - torch::Tensor quant, - torch::Tensor scale, - torch::Tensor out_q, - float qzero, - float maxq, - torch::Tensor hessian_inv, - torch::Tensor weights, - torch::Tensor error, - int a, - int b -) -{ - int columns = weights.size(1); - int hcolumns = hessian_inv.size(1); - - for (int c = a; c < b; c++) - { - quantize_cuda - ( - ((const float*) weights.data_ptr()) + c * columns, - ((float*) quant.data_ptr()) + c * columns, - (const float*) scale.data_ptr(), - out_q.device().is_meta() ? NULL : ((uint16_t*) out_q.data_ptr()) + c * columns, - 1, - columns, - qzero, - maxq - ); - - adjust_error_row_cuda - ( - (const float*) hessian_inv.data_ptr(), - (float*) error.data_ptr(), - (const float*) weights.data_ptr(), - (const float*) quant.data_ptr(), - c, - columns, - hcolumns - ); - - vv_mul_sub_cuda - ( - ((const float*) hessian_inv.data_ptr()) + c * hcolumns + c, - ((const float*) error.data_ptr()) + c * columns, - ((float*) weights.data_ptr()) + c * columns, - b - c, - columns - ); - } - - torch::Tensor x = hessian_inv.slice(0, a, b).slice(1, b).transpose(0, 1); - torch::Tensor y = error.slice(0, a, b); - weights.slice(0, b).addmm_(x, y, 1.0f, -1.0f); -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.h b/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.h deleted file mode 100644 index cd111bad3..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/quantize_func.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _quantize_func_h -#define _quantize_func_h - -#include -#include -#include -#include -#include -#include - -void quantize_range -( - torch::Tensor quant, - torch::Tensor scale, - torch::Tensor out_q, - float qzero, - float maxq, - torch::Tensor hessian_inv, - torch::Tensor weights, - torch::Tensor error, - int a, - int b -); - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.cpp b/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.cpp deleted file mode 100644 index 56d731aab..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.cpp +++ /dev/null @@ -1,477 +0,0 @@ -#include "sampling.h" -#include "util.h" -#include -#include -#include -#include - -const int top_k_heap_threshold = 500; - -bool* g_rep_mask = NULL; -int g_vocab_size = 0; - -void apply_rep_penalty_cpu -( - const int vocab_size, - const uint64_t* sequence, - const float penalty_max, - const int sustain, - const int decay, - const int seq_len, - float* logits -) -{ - if (vocab_size != g_vocab_size) - { - if (g_rep_mask) free(g_rep_mask); - g_vocab_size = vocab_size; - g_rep_mask = (bool*) malloc(g_vocab_size * sizeof(bool)); - } - - memset(g_rep_mask, 0, g_vocab_size * sizeof(bool)); - - float v = penalty_max; - float dv = decay ? (1.0f - penalty_max) / (float) decay : 0.0f; - - int s = sustain == -1 ? 
seq_len : sustain; - int beg = seq_len - s - decay; - if (beg < 0) beg = 0; - - for (int i = seq_len; i > beg;) - { - uint64_t t = sequence[--i]; - if (!g_rep_mask[t]) - { - if (logits[t] > 0.0) logits[t] /= v; - else logits[t] *= v; - g_rep_mask[t] = true; - } - if (--s < 0) v += dv; - } -} - -void softmax_cpu -( - const int vocab_size, - const float temperature, - const float* logits, - const bool* logits_filter, - float* output -) -{ - float esum = 0.0f; - float itemp = 1.0f / temperature; - float maxl = 0.0f; - - #pragma unroll(32) - for (int i = 0; i < vocab_size; i++) - { - if (!logits_filter[i]) continue; - maxl = fmaxf(logits[i], maxl); - } - maxl *= itemp; - - #pragma unroll(32) - for (int i = 0; i < vocab_size; i++) - { - if (!logits_filter[i]) continue; - float e = expf(logits[i] * itemp - maxl); - output[i] = e; - esum += e; - } - float isum = 1.0f / esum; - - #pragma unroll(32) - for (int i = 0; i < vocab_size; i++) - { - if (logits_filter[i]) - output[i] *= isum; - else - output[i] = 0.0f; - } - -// printf("Softmax:"); -// float summ = 0.0f; -// for (int i = 0; i < vocab_size; i++) -// { -// if (logits_filter[i]) -// { -// printf("%d, %f\n", i, output[i]); -// summ += output[i]; -// } -// } -// printf("sum: %f\n", summ); -} - -void normalize_cpu -( - const int num_candidates, - float* probs -) -{ - float sum = 0.0f; - #pragma unroll(32) - for (int i = 0; i < num_candidates; i++) sum += probs[i]; - float isum = 1.0f / sum; - #pragma unroll(32) - for (int i = 0; i < num_candidates; i++) probs[i] *= isum; -} - -int greedy_sample -( - const int num_candidates, - const float* probs, - const bool* logits_filter -) -{ - int maxidx = -1; - float max = -1e38; - - for(int i = 1; i < num_candidates; i++) - { - if (logits_filter[i] && (maxidx == -1 || probs[i] > max)) - { - max = probs[i]; - maxidx = i; - } - } - return maxidx; -} - -template -inline void swap(T &a, T &b) -{ - T temp = a; - a = b; - b = temp; -} - -inline bool cmp_asc(const float& a, const float& b) -{ - return a > b; -} - -inline bool cmp_desc(const float& a, const float& b) -{ - return a < b; -} - -template -void quicksort_with_idx -( - float* arr, - int* idx, - int low, - int high, - int max_index -) -{ - if (low >= high) return; - - // Bubblesort very short segments - - if (high - low == 1) - { - int i0 = low; - int i1 = low + 1; - - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - return; - } - - if (high - low == 2) - { - int i0 = low; - int i1 = low + 1; - int i2 = low + 2; - - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - return; - } - - if (high - low == 3) - { - int i0 = low; - int i1 = low + 1; - int i2 = low + 2; - int i3 = low + 3; - - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i2], arr[i3])) { swap(arr[i2], arr[i3]); swap(idx[i2], idx[i3]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - return; - } - - if (high - low == 4) - { - int i0 = low; - int i1 = low + 1; - int i2 = low + 2; - int i3 = low + 3; - int 
i4 = low + 4; - - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i2], arr[i3])) { swap(arr[i2], arr[i3]); swap(idx[i2], idx[i3]); } - if (cmp_func(arr[i3], arr[i4])) { swap(arr[i3], arr[i4]); swap(idx[i3], idx[i4]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i2], arr[i3])) { swap(arr[i2], arr[i3]); swap(idx[i2], idx[i3]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - if (cmp_func(arr[i1], arr[i2])) { swap(arr[i1], arr[i2]); swap(idx[i1], idx[i2]); } - if (cmp_func(arr[i0], arr[i1])) { swap(arr[i0], arr[i1]); swap(idx[i0], idx[i1]); } - return; - } - - float pivot = arr[high]; - int i = low - 1; - for (int j = low; j < high; j++) - { - if (!cmp_func(arr[j], pivot)) - { - i++; - swap(arr[i], arr[j]); - swap(idx[i], idx[j]); - } - } - - swap(arr[i + 1], arr[high]); - swap(idx[i + 1], idx[high]); - int pos = i + 1; - - if (max_index == 0 || low <= max_index) - quicksort_with_idx(arr, idx, low, pos - 1, max_index); - if (max_index == 0 || pos <= max_index) - quicksort_with_idx(arr, idx, pos + 1, high, max_index); -} - -// Discard tiny probabilities, improves performance when temperature is very low - -int pre_sort_descending -( - const int num_candidates, - float* arr, - int* idx -) -{ - const float eps = 1e-8; - int i = 0; - int j = num_candidates - 1; - - while (i <= j) - { - if (arr[j] < eps) { j--; continue; } - if (arr[i] >= eps) { i++; continue; } - swap(arr[i], arr[j]); - swap(idx[i], idx[j]); - i++; - j--; - } - - return i; -} - -void sort_descending -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - int max_index -) -{ - int pre = pre_sort_descending(num_candidates, temp_probs, temp_indices); - quicksort_with_idx(temp_probs, temp_indices, 0, pre - 1, max_index); - -// int m = (max_index == 0 ? 
num_candidates : max_index); -// for (int i = 0; i < m; i++) printf("%i - %f \n", temp_indices[i], temp_probs[i] * 10000.0); -// for (int i = 0; i < m - 1; i++) if (temp_probs[i] < temp_probs[i + 1] - 2e-8) DBGI(i); -} - -int top_k_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - int top_k -) -{ - //TIME_START; - - // Use min-heap for lower values of K - - if (top_k <= top_k_heap_threshold) - { - std::priority_queue, std::vector>, std::greater>> min_heap; - - for (int i = 0; i < top_k; ++i) min_heap.push({temp_probs[i], temp_indices[i]}); - - for (int i = top_k; i < num_candidates; i++) - { - if (temp_probs[i] > min_heap.top().first) - { - min_heap.pop(); - min_heap.push({temp_probs[i], temp_indices[i]}); - } - } - - int j = top_k; - for (int i = 0; i < top_k; i++) - { - j--; - temp_probs[j] = min_heap.top().first; - temp_indices[j] = min_heap.top().second; - min_heap.pop(); - } - } - - // For larger values, quicksort is still faster - - else - { - sort_descending(num_candidates, temp_probs, temp_indices, top_k); - } - - //TIME_STOP; - - return top_k; -} - -int top_p_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float top_p -) -{ - std::priority_queue, std::vector>, std::greater>> min_heap; - - //TIME_START; - - float min_p = 1e-6; - - float sum = 0.0f; - for (int i = 0; i < num_candidates; i++) - { - if (temp_probs[i] < min_p) continue; - if (sum > top_p && temp_probs[i] < min_heap.top().first) continue; - - min_heap.push({temp_probs[i], temp_indices[i]}); - sum += temp_probs[i]; - - while (sum > top_p && min_heap.size() > 1) - { - sum -= min_heap.top().first; - min_heap.pop(); - } - } - - int j = min_heap.size(); - int k = j; - while (j > 0) - { - j--; - temp_probs[j] = min_heap.top().first; - temp_indices[j] = min_heap.top().second; - min_heap.pop(); - } - - //TIME_STOP; - - return k; -} - -int typical_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float typical -) -{ - //TIME_START; - - const float epsilon = 1e-10; - - float* temp = (float*) malloc(num_candidates * sizeof(float)); - int* entropy_dev_order = (int*) malloc(num_candidates * sizeof(int)); - int* temp_indices_2 = (int*) malloc(num_candidates * sizeof(int)); - - float neg_entropy = 0.0f; - for (int i = 0; i < num_candidates; i++) - { - float x = temp_probs[i]; - float y = x + logf(x + epsilon); - neg_entropy += x * y; - temp[i] = y; // temp = log_probs - } - - for (int i = 0; i < num_candidates; i++) - { - temp[i] = fabs(temp[i] - neg_entropy); // temp = entropy_dev - entropy_dev_order[i] = i; - } - - quicksort_with_idx(temp, entropy_dev_order, 0, num_candidates - 1, num_candidates); - - memcpy(temp, temp_probs, num_candidates * sizeof(float)); // temp = temp_probs - memcpy(temp_indices_2, temp_indices, num_candidates * sizeof(int)); - - float cumprob = 0.0f; - int num = 0; - - while (true) - { - int j = entropy_dev_order[num]; - float p = temp[j]; - temp_probs[num] = p; - temp_indices[num] = temp_indices_2[j]; - - cumprob += p; - if (cumprob >= typical) break; - num++; - if (num >= num_candidates) break; - } - - free(temp); - free(entropy_dev_order); - free(temp_indices_2); - - //TIME_STOP; - - if (num == 0) num = 1; - return num; -} - -int multinomial_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float random -) -{ - int idx = 0; - float accum = temp_probs[idx]; - - while (true) - { - if (accum >= random) break; - if (idx == num_candidates - 1) break; - idx++; - accum += temp_probs[idx]; - } - - 
temp_probs[0] = temp_probs[idx]; - temp_indices[0] = temp_indices[idx]; - - return 1; -} - - - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.h b/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.h deleted file mode 100644 index 85472994a..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/sampling.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef _sampling_h -#define _sampling_h - -#include -#include -#include -#include - -void apply_rep_penalty_cpu -( - const int vocab_size, - const uint64_t* sequence, - const float penalty_max, - const int sustain, - const int decay, - const int seq_len, - float* logits -); - -void softmax_cpu -( - const int vocab_size, - const float temperature, - const float* logits, - const bool* logits_filter, - float* output -); - -void normalize_cpu -( - const int num_candidates, - float* probs -); - -int greedy_sample -( - const int num_candidates, - const float* probs, - const bool* logits_filter -); - -void sort_descending -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - int max_index -); - -int top_k_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - int top_k -); - -int top_p_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float top_p -); - -int typical_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float typical -); - -int multinomial_cpu -( - const int num_candidates, - float* temp_probs, - int* temp_indices, - float random -); - -#endif - - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cpp/util.h b/server/exllamav2_kernels/exllamav2_kernels/cpp/util.h deleted file mode 100644 index 983d6323a..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cpp/util.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _util_h -#define _util_h - -#include - -#define DBGS(__x) printf("%s\n", __x) -#define DBGI(__x) printf("%s: %i\n", #__x, __x) -#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y) -#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z) -#define DBGF(__x) printf("%s: %f\n", #__x, __x) -#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y) -#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z) -#define DBGIF(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __y) - -#define TIME_START \ - auto start = std::chrono::high_resolution_clock::now() - -#define TIME_STOP \ - do { \ - auto stop = std::chrono::high_resolution_clock::now(); \ - auto duration_us = std::chrono::duration_cast(stop - start); \ - DBGI(duration_us); \ - } while (false) - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cu deleted file mode 100644 index 900a92b53..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cu +++ /dev/null @@ -1,161 +0,0 @@ -#include "cache.cuh" - -// #if defined(CUDART_VERSION) && CUDART_VERSION >= 11080 -// -// #include - -#include "quant/qdq_util.cuh" -#include "util.cuh" - -#define THREADS 32 - -// The upper 8 bits of FP16 are equivalent to FP8 E5M2. -// -// The range of values typically cached seem to be in the range of +/- 16, with an exponent component (with bias) up to -// about 20. Empirically, the MSE over the whole range of observed values in the K/V cache works out the same for E4M3 -// and E5M2. 
However, over 80% of values in the cache tensors fall within the range of -1..1, where E5M2 produces about -// a 25% lower MSE. - -__device__ inline uint32_t compress(uint32_t v) -{ - uint32_t vh = (v & 0xff000000) >> 16; - uint32_t vl = (v & 0x0000ff00) >> 8; - return vh | vl; -} - -__device__ inline uint32_t decompress(uint32_t v) -{ - uint32_t vh = (v & 0xff00) << 16; - uint32_t vl = (v & 0x00ff) << 8; - return vh | vl; -} - -__global__ void nv_fp16_to_fp8 -( - const half* __restrict__ pIn, - unsigned char* __restrict__ pOut, - int stride, - int height, - int min, - int max -) -{ - int x = min + (blockIdx.x * blockDim.x + threadIdx.x) * 8; - int y = blockIdx.y; - if (x >= max) return; - - int4* in_ptr = (int4*) (pIn + y * stride + x); - int2* out_ptr = (int2*) (pOut + y * stride + x); - - int4 in = *in_ptr; - uint32_t c0 = compress(in.x); - uint32_t c1 = compress(in.y); - uint32_t c2 = compress(in.z); - uint32_t c3 = compress(in.w); - int2 out = make_int2(c0 | (c1 << 16), c2 | (c3 << 16)); - *out_ptr = out; -} - -__global__ void nv_fp8_to_fp16 -( - const unsigned char* __restrict__ pIn, - half* __restrict__ pOut, - int stride, - int height, - int min, - int max -) -{ - int x = min + (blockIdx.x * blockDim.x + threadIdx.x) * 8; - int y = blockIdx.y; - if (x >= max) return; - - int2* in_ptr = (int2*) (pIn + y * stride + x); - int4* out_ptr = (int4*) (pOut + y * stride + x); - - int2 in = *in_ptr; - uint32_t c0 = decompress(in.x); - uint32_t c1 = decompress(in.x >> 16); - uint32_t c2 = decompress(in.y); - uint32_t c3 = decompress(in.y >> 16); - int4 out = make_int4(c0, c1, c2, c3); - *out_ptr = out; -} - -// __global__ void nv_fp32_to_fp16(const float* pIn, half* pOut, int size) -// { -// int i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < size) { -// pOut[i] = __float2half(pIn[i]); -// } -// } - -// __global__ void nv_fp16_to_fp8_ref(const half* pIn, unsigned char *pOut, int size) -// { -// int i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < size) { -// pOut[i] = __nv_cvt_halfraw_to_fp8(pIn[i], __NV_SATFINITE, __NV_E4M3); -// } -// } -// -// __global__ void nv_fp8_to_fp16_ref(const unsigned char* pIn, half* pOut, int size) -// { -// int i = blockIdx.x * blockDim.x + threadIdx.x; -// if (i < size) { -// pOut[i] = __nv_cvt_fp8_to_halfraw(pIn[i], __NV_E4M3); -// } -// } - -void array_fp16_to_fp8_cuda(const half* pIn, unsigned char *pOut, int stride, int height, int offset, int width) -{ - int min = offset; - int max = offset + width; - min = min / 8 * 8; - max = min + (max - min + 7) / 8 * 8; - - dim3 blockDim, gridDim; - blockDim.x = THREADS; - gridDim.x = DIVIDE((max - min) / 8, THREADS); - gridDim.y = height; - - nv_fp16_to_fp8<<>>(pIn, pOut, stride, height, min, max); - // cuda_check( cudaPeekAtLastError() ); -} - -void array_fp8_to_fp16_cuda(const unsigned char* pIn, half* pOut, int stride, int height, int offset, int width) -{ - int min = offset; - int max = offset + width; - min = min / 8 * 8; - max = min + (max - min + 7) / 8 * 8; - - dim3 blockDim, gridDim; - blockDim.x = THREADS; - gridDim.x = DIVIDE((max - min) / 8, THREADS); - gridDim.y = height; - - nv_fp8_to_fp16<<>>(pIn, pOut, stride, height, min, max); - // cuda_check( cudaPeekAtLastError() ); -} - -// void array_fp16_to_fp8_ref_cuda(const half* pIn, unsigned char *pOut, int size) -// { -// const int threads = 512; -// int blocks = DIVIDE(size / 1, threads); -// nv_fp16_to_fp8_ref<<>>(pIn, pOut, size); -// } -// -// void array_fp8_to_fp16_ref_cuda(const unsigned char* pIn, half* pOut, int size) -// { -// const 
int threads = 512;
-//     int blocks = DIVIDE(size / 1, threads);
-//     nv_fp8_to_fp16_ref<<<blocks, threads>>>(pIn, pOut, size);
-// }

-// #else
-//
-// void array_fp16_to_fp8_cuda(const half* pIn, unsigned char *pOut, int size) { }
-//
-// void array_fp8_to_fp16_cuda(const unsigned char* pIn, half* pOut, int size) { }
-//
-// #endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cuh
deleted file mode 100644
index 4cb291b65..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/cache.cuh
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _cache_cuh
-#define _cache_cuh
-
-#include
-#include
-#include
-#include
-
-void array_fp16_to_fp8_cuda(const half* pIn, unsigned char *pOut, int stride, int height, int offset, int width);
-void array_fp8_to_fp16_cuda(const unsigned char* pIn, half* pOut, int stride, int height, int offset, int width);
-// void array_fp16_to_fp8_ref_cuda(const half* pIn, unsigned char *pOut, int size);
-// void array_fp8_to_fp16_ref_cuda(const unsigned char* pIn, half* pOut, int size);
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh
deleted file mode 100644
index 12684ff8b..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat.cuh
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _compat_cuh
-#define _compat_cuh
-
-// atomicAdd for half types, to support CC < 7.x
-
-__device__ __forceinline__ void atomicAdd_half(half* address, half val)
-{
-    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
-    unsigned int old = *address_as_ui;
-    unsigned int assumed;
-
-    do
-    {
-        assumed = old;
-        __half_raw hsum;
-        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-        half tmpres = __hadd(hsum, val);
-        hsum = __half_raw(tmpres);
-        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
-        old = atomicCAS(address_as_ui, assumed, old);
-    }
-    while (assumed != old);
-}
-
-// atomicAdd for half2 types
-
-__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
-{
-    unsigned int* address_as_ui = (unsigned int*)address;
-    unsigned int old = *address_as_ui;
-    unsigned int assumed;
-    do
-    {
-        assumed = old;
-        half2 old_val = *((half2*)&old);
-        half2 new_val = __hadd2(old_val, val);
-        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
-    }
-    while (assumed != old);
-}
-
-//
-
-#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
-#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
-
-#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
-#endif
-
-#endif
-#endif
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh
deleted file mode 100644
index 089f28625..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/compat_gemm.cuh
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _compat_gemm_cuh
-#define _compat_gemm_cuh
-
-#if defined(USE_ROCM)
-__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle,
-                                                               hipblasOperation_t transA,
-                                                               hipblasOperation_t transB,
-                                                               int m,
-                                                               int n,
-                                                               int k,
-                                                               const half* alpha,
-                                                               const half* AP,
-                                                               int lda,
-                                                               const half* BP,
-                                                               int ldb,
-                                                               const half* beta,
-                                                               half* CP,
-                                                               int ldc) {
-    return hipblasHgemm(handle, transA, transB, m, n, k,
-                        reinterpret_cast<const hipblasHalf *>(alpha),
-                        reinterpret_cast<const hipblasHalf *>(AP), lda,
-                        reinterpret_cast<const hipblasHalf *>(BP), ldb,
-                        reinterpret_cast<const hipblasHalf *>(beta),
-                        reinterpret_cast<hipblasHalf *>(CP), ldc);
-}
-#define hipblasHgemm __compat_hipblasHgemm
-
-// Previous versions of PyTorch converted to rocBLAS instead of hipBLAS.
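
The CAS loop in compat.cuh above is the standard emulation of half-precision atomicAdd for devices older than compute capability 7.0: it locates the aligned 32-bit word that contains the 16-bit operand, splices the updated half into the correct half of that word, and retries the compare-and-swap until no other thread has raced the update. A minimal sketch of a kernel that relies on that overload (a toy example, not part of the deleted tree; it assumes either CC >= 7.0 or the compat header being in scope):

#include <cuda_fp16.h>

__global__ void accumulate_half(const half* in, half* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // On CC >= 7.0 this compiles to the native half atomicAdd; on older
    // devices the atomicAdd(half*, half) overload above supplies the
    // CAS-based fallback.
    if (i < n) atomicAdd(out, in[i]);
}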
-#define rocblas_operation_none HIPBLAS_OP_N -#define rocblas_hgemm __compat_hipblasHgemm -#endif - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cu deleted file mode 100644 index 5d4495ece..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cu +++ /dev/null @@ -1,275 +0,0 @@ -#include "h_gemm.cuh" -#include "util.cuh" -#include "../config.h" -#include "matrix_view.cuh" - -#include "compat_gemm.cuh" - -// union half2_uint32 -// { -// uint32_t as_uint32; -// half2 as_half2; -// __device__ half2_uint32(uint32_t val) : as_uint32(val) {} -// __device__ half2_uint32(half2 val) : as_half2(val) {} -// }; - -// TODO: Improve tall kernel, maybe special cases for size_n = 1, 2, 4, 8, 16 - -const int T_THREADS_M = 1; -const int T_THREADS_N = 8; -const int T_BLOCKSIZE_K = 32; -const int T_MAX_M = 16; -const int T_MAX_N = 64; -const int T_MAX_K = 1024 / T_THREADS_N * T_BLOCKSIZE_K; -const int T_MAX_BLOCKS_K = T_MAX_K / T_BLOCKSIZE_K; - -__global__ void h_gemm_tall_kernel -( - const int size_m, - const int size_n, - const int size_k, - const half* __restrict__ a, - const half* __restrict__ b, - half* __restrict__ c, - bool clear -) -{ - __shared__ half accum[T_MAX_BLOCKS_K][T_THREADS_N]; - - int m = blockIdx.y * T_THREADS_M + threadIdx.z; - int n = blockIdx.x * T_THREADS_N + threadIdx.x; - int k = threadIdx.y * T_BLOCKSIZE_K; - - if (n >= size_n) return; - if (m >= size_m) return; - - MatrixView_half a_(a, size_m, size_k); - MatrixView_half b_(b, size_k, size_n); - MatrixView_half_rw c_(c, size_m, size_n); - - int k_end = min(k + T_BLOCKSIZE_K, size_k); - - const half* a_ptr = a_.item_ptr(m, k); - const half* a_ptr_end = a_.item_ptr(m, k_end); - const half* b_ptr = b_.item_ptr(k, n); - half* c_ptr = c_.item_ptr(m, n); - - half2 r2 = {}; - - while(a_ptr <= a_ptr_end - 8) - { - int4 a_int4 = *((int4*) a_ptr); - half2 a_01 = ((half2_uint32) a_int4.x).as_half2; - half2 a_23 = ((half2_uint32) a_int4.y).as_half2; - half2 a_45 = ((half2_uint32) a_int4.z).as_half2; - half2 a_67 = ((half2_uint32) a_int4.w).as_half2; - a_ptr += 8; - - half b_0 = *b_ptr; b_ptr += size_n; - half b_1 = *b_ptr; b_ptr += size_n; - half b_2 = *b_ptr; b_ptr += size_n; - half b_3 = *b_ptr; b_ptr += size_n; - half b_4 = *b_ptr; b_ptr += size_n; - half b_5 = *b_ptr; b_ptr += size_n; - half b_6 = *b_ptr; b_ptr += size_n; - half b_7 = *b_ptr; b_ptr += size_n; - half2 b_01 = __halves2half2(b_0, b_1); - half2 b_23 = __halves2half2(b_2, b_3); - half2 b_45 = __halves2half2(b_4, b_5); - half2 b_67 = __halves2half2(b_6, b_7); - - r2 = __hfma2(a_01, b_01, r2); - r2 = __hfma2(a_23, b_23, r2); - r2 = __hfma2(a_45, b_45, r2); - r2 = __hfma2(a_67, b_67, r2); - } - - while(a_ptr <= a_ptr_end - 4) - { - int2 a_int2 = *((int2*) a_ptr); - half2 a_01 = ((half2_uint32) a_int2.x).as_half2; - half2 a_23 = ((half2_uint32) a_int2.y).as_half2; - a_ptr += 4; - - half b_0 = *b_ptr; b_ptr += size_n; - half b_1 = *b_ptr; b_ptr += size_n; - half b_2 = *b_ptr; b_ptr += size_n; - half b_3 = *b_ptr; b_ptr += size_n; - half2 b_01 = __halves2half2(b_0, b_1); - half2 b_23 = __halves2half2(b_2, b_3); - - r2 = __hfma2(a_01, b_01, r2); - r2 = __hfma2(a_23, b_23, r2); - } - - half r = __hadd(__low2half(r2), __high2half(r2)); - - while(a_ptr < a_ptr_end) - { - half a_item = *a_ptr++; - half b_item = *b_ptr; b_ptr += size_n; - r = __hfma(a_item, b_item, r); - } - - accum[threadIdx.y][threadIdx.x] = r; - __syncthreads(); - - if (threadIdx.y == 
0) - { - half acc = accum[0][threadIdx.x]; - for (int i = 1; i < blockDim.y; ++i) acc = __hadd(accum[i][threadIdx.x], acc); - if (!clear) acc = __hadd(acc, *c_ptr); - *c_ptr = acc; - } -} - - -const int W_MAX_M = 16; -const int W_MAX_N = 65536; -const int W_MAX_K = 32; -const int W_THREADS_M = 1; -const int W_THREADS_N = 32; - -__global__ void h_gemm_wide_kernel -( - const int size_m, - const int size_n, - const int size_k, - const half* __restrict__ a, - const half* __restrict__ b, - half* __restrict__ c, - bool clear -) -{ - int m = blockIdx.y * W_THREADS_M + threadIdx.y; - int n = blockIdx.x * W_THREADS_N + threadIdx.x; - - if (n >= size_n) return; - if (m >= size_m) return; - - MatrixView_half a_(a, size_m, size_k); - MatrixView_half b_(b, size_k, size_n); - MatrixView_half_rw c_(c, size_m, size_n); - - half* c_ptr = c_.item_ptr(m, n); - - __shared__ half read_a[W_MAX_K]; - int t = threadIdx.x; - - if (t < size_k) - { - read_a[t] = a_.item(m, t); - } - __syncthreads(); - - half r = {}; - - for (int k = 0; k < size_k; ++k) - { - half item_a = read_a[k]; - half item_b = b_.item(k, n); - r = __hfma(item_a, item_b, r); - } - - if (threadIdx.y == 0) - { - if (!clear) r = __hadd(r, *c_ptr); - *c_ptr = r; - } -} - - -// cuBLAS - -void h_gemm_cublas -( - cublasHandle_t cublas_handle, - const int size_m, - const int size_n, - const int size_k, - const half* a, - const half* b, - half* c, - const float alpha, - const float beta -) -{ - half alpha_ = __float2half(alpha); - half beta_ = __float2half(beta); - cublasHgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_n, size_m, size_k, - &alpha_, b, size_n, - a, size_k, - &beta_, c, size_n); -} - - -// alpha * ( a[m,k] @ b[k,n] ) + beta * c[m,n] -> c[m,n] - -void h_gemm_cuda -( - cublasHandle_t cublas_handle, - const int size_m, - const int size_n, - const int size_k, - const half* a, - const half* b, - half* c, - const float alpha, - const float beta -) -{ - if ((beta == 1.0f || beta == 0.0f) && (alpha == 1.0f)) - { - bool clear = (beta == 0.0f); - - //DBGI3(size_m, size_n, size_k); - - if (size_m <= T_MAX_M && size_n <= T_MAX_N && size_k <= T_MAX_K) - { - // Tall - - dim3 blockDim, gridDim; - blockDim.x = T_THREADS_N; - blockDim.y = DIVIDE(size_k, T_BLOCKSIZE_K); - blockDim.z = T_THREADS_M; - gridDim.x = DIVIDE(size_n, T_THREADS_N); - gridDim.y = DIVIDE(size_m, T_THREADS_M); - gridDim.z = 1; - -// DBGI3(blockDim.x, blockDim.y, blockDim.z); -// DBGI3(gridDim.x, gridDim.y, gridDim.z); - - h_gemm_tall_kernel<<>>(size_m, size_n, size_k, a, b, c, clear); - cuda_check( cudaPeekAtLastError() ); - return; - } - - if (size_m <= W_MAX_M && size_n <= W_MAX_N && size_k <= W_MAX_K) - { - // Wide - - dim3 blockDim, gridDim; - blockDim.x = W_THREADS_N; - blockDim.y = W_THREADS_M; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, W_THREADS_N); - gridDim.y = DIVIDE(size_m, W_THREADS_M); - gridDim.z = 1; - -// DBGI3(blockDim.x, blockDim.y, blockDim.z); -// DBGI3(gridDim.x, gridDim.y, gridDim.z); - - h_gemm_wide_kernel<<>>(size_m, size_n, size_k, a, b, c, clear); - cuda_check( cudaPeekAtLastError() ); - return; - } - } - - h_gemm_cublas(cublas_handle, size_m, size_n, size_k, a, b, c, alpha, beta); -// DBGI3(size_m, size_n, size_k); - cuda_check( cudaPeekAtLastError() ); - -} \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cuh deleted file mode 100644 index 5694e2ade..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/h_gemm.cuh +++ /dev/null 
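
h_gemm_cublas above passes the operands to cublasHgemm in swapped order because cuBLAS assumes column-major storage while these buffers are row-major: reading a row-major matrix as column-major yields its transpose, so computing C^T = B^T * A^T leaves the result in memory already laid out as row-major C. A minimal sketch of that call pattern, annotated (names mirror the deleted code; alpha_ and beta_ are host-side half scalars):

// Row-major C[m,n] = A[m,k] @ B[k,n] via column-major cuBLAS:
// the A-slot receives b (column-major n x k, leading dim size_n), the
// B-slot receives a (column-major k x m, leading dim size_k), and the
// column-major n x m result is byte-for-byte row-major C.
cublasHgemm(cublas_handle,
            CUBLAS_OP_N, CUBLAS_OP_N,
            size_n, size_m, size_k,
            &alpha_, b, size_n,
            a, size_k,
            &beta_, c, size_n);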
@@ -1,26 +0,0 @@ -#ifndef _h_gemm_cuh -#define _h_gemm_cuh - -#include -#include -#include -#include -#include - -// alpha * ( a[m,k] @ b[k,n] ) + beta * c[m,n] -> c[m,n] - -void h_gemm_cuda -( - cublasHandle_t cublas_handle, - const int size_m, - const int size_n, - const int size_k, - const half* a, - const half* b, - half* c, - const float alpha, - const float beta -); - -#endif - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cu deleted file mode 100644 index 49a19ec0f..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cu +++ /dev/null @@ -1,33 +0,0 @@ -#include "lora.cuh" -#include "util.cuh" -#include "h_gemm.cuh" - -void apply_loras_cuda -( - cublasHandle_t cublas_handle, - const std::unordered_map>& adapters, - const std::vector& ids, - QMatrix* base, - const half* input, - half* output, - half* temp, - int rows -) -{ - for (uintptr_t lora_id : ids) - { - auto it = adapters.find(lora_id); - if (it == adapters.end()) continue; - - const std::tuple& lora = it->second; - half* lora_a = std::get<0>(lora); - half* lora_b = std::get<1>(lora); - int rank = std::get<2>(lora); - -// DBGI3(rows, rank, base->height); -// DBGI3(rows, base->width, rank); - - h_gemm_cuda(cublas_handle, rows, rank, base->height, input, lora_a, temp, 1.0f, 0.0f); - h_gemm_cuda(cublas_handle, rows, base->width, rank, temp, lora_b, output, 1.0f, 1.0f); - } -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cuh deleted file mode 100644 index b505a201c..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/lora.cuh +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _lora_cuh -#define _lora_cuh - -#include -#include -#include -#include -#include - -#include "q_matrix.cuh" - -void apply_loras_cuda -( - cublasHandle_t cublas_handle, - const std::unordered_map>& adapters, - const std::vector& ids, - QMatrix* base, - const half* input, - half* output, - half* temp, - int rows -); - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh deleted file mode 100644 index 55af84f23..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/matrix_view.cuh +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef _matrix_view_cuh -#define _matrix_view_cuh - -#include -#include - -#include "quant/qdq_util.cuh" - -class MatrixView_half -{ -public: - const half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); } - __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; } - - __device__ __forceinline__ void item4(half (&items)[4], int row, int column) const - { - half2* ptr = (half2*) item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __low2half(i01); - items[1] = __high2half(i01); - items[2] = __low2half(i23); - items[3] = __high2half(i23); - } - __device__ __forceinline__ void item4_f(float 
(&items)[4], int row, int column) const - { - half2* ptr = (half2*)item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __half2float(__low2half(i01)); - items[1] = __half2float(__high2half(i01)); - items[2] = __half2float(__low2half(i23)); - items[3] = __half2float(__high2half(i23)); - } - - __device__ __forceinline__ void item4_h2(half2 (&items)[4], int row, int column) const - { - half2* ptr = (half2*)item_ptr(row, column); - half2 i01 = ptr[0]; - half2 i23 = ptr[1]; - items[0] = __half2half2(__low2half(i01)); - items[1] = __half2half2(__high2half(i01)); - items[2] = __half2half2(__low2half(i23)); - items[3] = __half2half2(__high2half(i23)); - } -}; - -class MatrixView_half_rw -{ -public: - half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; } - __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; } - __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } - - __device__ __forceinline__ void set4(int row, int column, half v0, half v1, half v2, half v3) - { - half2 v01 = __halves2half2(v0, v1); - half2 v23 = __halves2half2(v2, v3); - half2* ptr = (half2*) item_ptr(row, column); - ptr[0] = v01; - ptr[1] = v23; - } -}; - -class MatrixView_q4_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x07) * 4; - return (data[row * width / 8 + column / 8] >> shift) & 0x0f; - } - - __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const - { - int shift = (column & 0x07) * 4; - uint32_t d = data[row * width / 8 + column / 8] >> shift; - items[0] = d & 0x0f; - items[1] = (d >> 4) & 0x0f; - } - - __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const - { - int shift = (column & 0x07) * 4; - uint32_t d = data[row * width / 8 + column / 8] >> shift; - items[0] = d & 0x0f; - items[1] = (d >> 4) & 0x0f; - items[2] = (d >> 8) & 0x0f; - items[3] = (d >> 12) & 0x0f; - } -}; - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cu deleted file mode 100644 index 3a29f2fcd..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cu +++ /dev/null @@ -1,268 +0,0 @@ -#include "pack_tensor.cuh" -#include "util.cuh" - -#define BLOCKSIZE_X 32 -#define BLOCKSIZE_Y 16 - -// Pack rows: -// 0000 0000 0000 aaaa 0000 0000 0000 bbbb 0000 0000 0000 cccc ... 
-> hhhh gggg ffff eeee dddd cccc bbbb aaaa - -__global__ void pack_rows_4_kernel -( - const uint16_t* __restrict__ input, - uint32_t* __restrict__ output, - int rows, - int out_columns -) -{ - int out_column = blockIdx.x * blockDim.x + threadIdx.x; - int row = blockIdx.y * blockDim.y + threadIdx.y; - - if (row >= rows) return; - if (out_column >= out_columns) return; - - uint32_t packed = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - uint16_t x = input[row * out_columns * 8 + out_column * 8 + i]; - x -= 1; - packed |= (((uint32_t)x) << (i * 4)); - } - - output[row * out_columns + out_column] = packed; -} - -void pack_rows_4_cuda -( - const uint16_t* input, - uint32_t* output, - const int rows, - const int columns -) -{ - int out_columns = columns * 4 / 32; - - dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y); - dim3 blocks(DIVIDE(out_columns, BLOCKSIZE_X), DIVIDE(rows, BLOCKSIZE_Y)); - - pack_rows_4_kernel<<>>(input, output, rows, out_columns); -} - -// Pack rows: -// 0000 0000 0000 aaaa 0000 0000 0000 bbbb 0000 0000 0000 cccc ... -> hhhh gggg ffff eeee dddd cccc bbbb aaaa - -__global__ void pack_rows_6_kernel -( - const uint16_t* __restrict__ input, - uint32_t* __restrict__ output, - int rows, - int out_columns -) -{ - int out_column = blockIdx.x * blockDim.x + threadIdx.x; - int row = blockIdx.y * blockDim.y + threadIdx.y; - - if (row >= rows) return; - if (out_column >= out_columns) return; - - uint32_t packed = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - uint16_t x = input[row * out_columns * 8 + out_column * 8 + i]; - x -= 1; - packed |= (((uint32_t)x) << (i * 4)); - } - - output[row * out_columns + out_column] = packed; -} - -void pack_rows_6_cuda -( - const uint16_t* input, - uint32_t* output, - const int rows, - const int columns -) -{ - int out_columns = columns * 6 / 32; - - dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y); - dim3 blocks(DIVIDE(out_columns, BLOCKSIZE_X), DIVIDE(rows, BLOCKSIZE_Y)); - - pack_rows_6_kernel<<>>(input, output, rows, out_columns); -} - -// Pack columns - -__forceinline__ __device__ uint32_t wshift(uint32_t x, int j) -{ - if (j < 0) - { - if (j <= -32) return 0; // Else undefined in CUDA - return x >> (-j); - } - else - { - if (j >= 32) return 0; // Else undefined in CUDA - return x << j; - } -} - -template -__global__ void pack_columns_kernel -( - const uint16_t* __restrict__ input, - uint32_t* __restrict__ output, - int out_rows, - int columns -) -{ - int column = blockIdx.x * blockDim.x + threadIdx.x; - int out_row = blockIdx.y * blockDim.y + threadIdx.y; - - if (column >= columns) return; - if (out_row >= out_rows) return; - - uint32_t x; - - if constexpr (bits == 2) - { - int row = out_row * 32 / 2; - uint32_t packed = 0; - - # pragma unroll - for (int i = 0, j = 0; i < 16; i++, j += 2) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed |= (x << (i * 2)); - } - output[out_row * columns + column] = packed; - } - - if constexpr (bits == 3) - { - if (out_row % 3) return; // Only run for every third row - int row = out_row * 32 / 3; - uint32_t packed0 = 0; - uint32_t packed1 = 0; - uint32_t packed2 = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 32; i++, j += 3) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed0 |= wshift(x, j); - packed1 |= wshift(x, j - 32); - packed2 |= wshift(x, j - 64); - } - - output[(out_row + 0) * columns + column] = packed0; - output[(out_row + 1) * columns + column] = packed1; - output[(out_row + 2) * columns + column] = packed2; - } - - if constexpr (bits == 4) - { - int row = 
out_row * 32 / 4; - uint32_t packed = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 8; i++, j += 4) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed |= (x << j); - } - output[out_row * columns + column] = packed; - } - - if constexpr (bits == 5) - { - if (out_row % 5) return; // Only run for every fifth row - int row = out_row * 32 / 5; - uint32_t packed0 = 0; - uint32_t packed1 = 0; - uint32_t packed2 = 0; - uint32_t packed3 = 0; - uint32_t packed4 = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 32; i++, j += 5) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed0 |= wshift(x, j); - packed1 |= wshift(x, j - 32); - packed2 |= wshift(x, j - 64); - packed3 |= wshift(x, j - 96); - packed4 |= wshift(x, j - 128); - } - - output[(out_row + 0) * columns + column] = packed0; - output[(out_row + 1) * columns + column] = packed1; - output[(out_row + 2) * columns + column] = packed2; - output[(out_row + 3) * columns + column] = packed3; - output[(out_row + 4) * columns + column] = packed4; - } - - if constexpr (bits == 6) - { - if (out_row % 3) return; // Only run for every third row - int row = out_row * 32 / 6; - uint32_t packed0 = 0; - uint32_t packed1 = 0; - uint32_t packed2 = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 16; i++, j += 6) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed0 |= wshift(x, j); - packed1 |= wshift(x, j - 32); - packed2 |= wshift(x, j - 64); - } - - output[(out_row + 0) * columns + column] = packed0; - output[(out_row + 1) * columns + column] = packed1; - output[(out_row + 2) * columns + column] = packed2; - } - - if constexpr (bits == 8) - { - int row = out_row * 32 / 8; - uint32_t packed = 0; - - #pragma unroll - for (int i = 0, j = 0; i < 4; i++, j += 8) - { - x = (uint32_t) input[(row + i) * columns + column]; - packed |= (x << j); - } - - output[out_row * columns + column] = packed; - } -} - -void pack_columns_cuda -( - const uint16_t* input, - uint32_t* output, - const int in_rows, - const int out_rows, - const int columns, - const int bits -) -{ - dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y); - dim3 blocks(DIVIDE(columns, BLOCKSIZE_X), DIVIDE(out_rows, BLOCKSIZE_Y)); - - if (bits == 2) pack_columns_kernel<2><<>>(input, output, out_rows, columns); - if (bits == 3) pack_columns_kernel<3><<>>(input, output, out_rows, columns); - if (bits == 4) pack_columns_kernel<4><<>>(input, output, out_rows, columns); - if (bits == 5) pack_columns_kernel<5><<>>(input, output, out_rows, columns); - if (bits == 6) pack_columns_kernel<6><<>>(input, output, out_rows, columns); - if (bits == 8) pack_columns_kernel<8><<>>(input, output, out_rows, columns); -} - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cuh deleted file mode 100644 index 4231cbb9c..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/pack_tensor.cuh +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _pack_tensor_cuh -#define _pack_tensor_cuh - -#include -#include -#include -#include - -void pack_rows_4_cuda -( - const uint16_t* input, - uint32_t* output, - const int rows, - const int columns -); - -void pack_rows_6_cuda -( - const uint16_t* input, - uint32_t* output, - const int rows, - const int columns -); - -void pack_columns_cuda -( - const uint16_t* input, - uint32_t* output, - const int in_rows, - const int out_rows, - const int columns, - const int bits -); - -#endif \ No newline at end of file diff --git 
a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cu deleted file mode 100644 index eb183842a..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cu +++ /dev/null @@ -1,159 +0,0 @@ -#include "q_attn.cuh" -#include "q_gemm.cuh" -#include "rms_norm.cuh" -#include "rope.cuh" -#include "util.cuh" -#include "lora.cuh" - -const int THREADS_X = 32; -const int THREADS_Y = 1; -const int THREADS_Z = 4; -const int BLOCKSIZE_X = 2; // 2*half == 1*uint32_t -const int BLOCKSIZE_Z = 4; // num_heads must be divisible by BLOCKSIZE_Z - -__global__ void update_cache_kernel -( - const half* __restrict__ key_states, - const half* __restrict__ value_states, - half* __restrict__ key_cache, - half* __restrict__ value_cache, - const int head_dim, - const int num_kv_heads, - const int q_len, - const int cache_seq_len, - const int past_len -) -{ - //int state_shape[] = { num_kv_heads, q_len, head_dim }; - int state_stride[] = { head_dim, head_dim * num_kv_heads, 1 }; - int state_pos[] = { 0, 0, 0 }; - - //int cache_shape[] = { num_kv_heads, cache_seq_len, head_dim }; - int cache_stride[] = { cache_seq_len * head_dim, head_dim, 1 }; - int cache_pos[] = { 0, past_len, 0 }; - - int size[] = { num_kv_heads, q_len, head_dim }; - - int x = (blockIdx.x * THREADS_X + threadIdx.x) * BLOCKSIZE_X; - int y = blockIdx.y * THREADS_Y + threadIdx.y; - int z = (blockIdx.z * THREADS_Z + threadIdx.z) * BLOCKSIZE_Z; - - if (x >= size[2]) return; - if (y >= size[1]) return; - if (z >= size[0]) return; - - int state_offset = (z + state_pos[0]) * state_stride[0] + (y + state_pos[1]) * state_stride[1] + (x + state_pos[2]) * state_stride[2]; - int cache_offset = (z + cache_pos[0]) * cache_stride[0] + (y + cache_pos[1]) * cache_stride[1] + (x + cache_pos[2]) * cache_stride[2]; - - const uint32_t* key_ptr = (uint32_t*) (key_states + state_offset); - const uint32_t* value_ptr = (uint32_t*) (value_states + state_offset); - uint32_t* key_cache_ptr = (uint32_t*) (key_cache + cache_offset); - uint32_t* value_cache_ptr = (uint32_t*) (value_cache + cache_offset); - - #pragma unroll - for (int k = 0; k < BLOCKSIZE_Z; k++) - { - *key_cache_ptr = *key_ptr; - key_ptr += state_stride[0] / BLOCKSIZE_X; - key_cache_ptr += cache_stride[0] / BLOCKSIZE_X; - } - - #pragma unroll - for (int k = 0; k < BLOCKSIZE_Z; k++) - { - *value_cache_ptr = *value_ptr; - value_ptr += state_stride[0] / BLOCKSIZE_X; - value_cache_ptr += cache_stride[0] / BLOCKSIZE_X; - } -} - -QAttn::QAttn -( - half* _layernorm, - float _norm_epsilon, - QMatrix* _q_proj, - QMatrix* _k_proj, - QMatrix* _v_proj, - QMatrix* _o_proj, - half* _temp_state, -// half* _temp_q, -// half* _temp_k, -// half* _temp_v, - half* _temp_dq, - int _max_rows, - int _hidden_size, - int _num_heads, - int _num_kv_heads, - int _head_dim, - int _max_seq_len -): - layernorm(_layernorm), - norm_epsilon(_norm_epsilon), - q_proj(_q_proj), - k_proj(_k_proj), - v_proj(_v_proj), - o_proj(_o_proj), - temp_state(_temp_state), -// temp_q(_temp_q), -// temp_k(_temp_k), -// temp_v(_temp_v), - temp_dq(_temp_dq), - max_rows(_max_rows), - hidden_size(_hidden_size), - num_heads(_num_heads), - num_kv_heads(_num_kv_heads), - head_dim(_head_dim), - max_seq_len(_max_seq_len) -{ -} - -QAttn::~QAttn() -{ -} - -void QAttn::forward_cuda_1 -( - cublasHandle_t cublas_handle, - half* x, - int batch_size, - int q_len, - int past_len, - const uint32_t* past_lens, - half* temp_q, - half* temp_k, - half* temp_v, - const half* sin, - const half* cos, - const 
std::vector& loras, - half* lora_temp -) -{ - rms_norm_cuda(x, layernorm, temp_state, norm_epsilon, q_len * batch_size, hidden_size); - - gemm_half_q_half_cuda(cublas_handle, temp_state, q_proj, temp_q, q_len * batch_size, q_proj->width, hidden_size, true, temp_dq); - gemm_half_q_half_cuda(cublas_handle, temp_state, k_proj, temp_k, q_len * batch_size, k_proj->width, hidden_size, true, temp_dq); - gemm_half_q_half_cuda(cublas_handle, temp_state, v_proj, temp_v, q_len * batch_size, v_proj->width, hidden_size, true, temp_dq); - - apply_loras_cuda(cublas_handle, q_proj_lora, loras, q_proj, temp_state, temp_q, lora_temp, q_len * batch_size); - apply_loras_cuda(cublas_handle, k_proj_lora, loras, k_proj, temp_state, temp_k, lora_temp, q_len * batch_size); - apply_loras_cuda(cublas_handle, v_proj_lora, loras, v_proj, temp_state, temp_v, lora_temp, q_len * batch_size); - - rope_cuda(temp_q, sin, cos, batch_size, q_len * num_heads, head_dim, num_heads, past_len, past_lens); - rope_cuda(temp_k, sin, cos, batch_size, q_len * num_kv_heads, head_dim, num_kv_heads, past_len, past_lens); -} - -void QAttn::forward_cuda_2 -( - cublasHandle_t cublas_handle, - const half* attn_output, - half* hidden_state, - int q_len, - int batch_size, - const std::vector& loras, - half* lora_temp -) -{ - gemm_half_q_half_cuda(cublas_handle, attn_output, o_proj, hidden_state, q_len * batch_size, o_proj->width, hidden_size, false, temp_dq); - - apply_loras_cuda(cublas_handle, o_proj_lora, loras, o_proj, attn_output, hidden_state, lora_temp, q_len * batch_size); -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cuh deleted file mode 100644 index dced611f9..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_attn.cuh +++ /dev/null @@ -1,98 +0,0 @@ -#ifndef _q_attn_cuh -#define _q_attn_cuh - -#include -#include -#include -#include -#include - -#include "q_matrix.cuh" - -class QAttn -{ -public: - - half* layernorm; - float norm_epsilon; - - QMatrix* q_proj; - QMatrix* k_proj; - QMatrix* v_proj; - QMatrix* o_proj; - - half* temp_state; -// half* temp_q; -// half* temp_k; -// half* temp_v; - half* temp_dq; - - int device; - int max_rows; - int hidden_size; - int num_heads; - int num_kv_heads; - int head_dim; - int max_seq_len; - - std::unordered_map> q_proj_lora; - std::unordered_map> k_proj_lora; - std::unordered_map> v_proj_lora; - std::unordered_map> o_proj_lora; - - QAttn - ( - half* _layernorm, - float _norm_epsilon, - QMatrix* _q_proj, - QMatrix* _k_proj, - QMatrix* _v_proj, - QMatrix* _o_proj, - half* _temp_state, -// half* _temp_q, -// half* _temp_k, -// half* _temp_v, - half* _temp_dq, - int _max_rows, - int _hidden_size, - int _num_heads, - int _num_kv_heads, - int _head_dim, - int _max_seq_len - ); - - ~QAttn(); - - void forward_cuda_1 - ( - cublasHandle_t cublas_handle, - half* x, - int batch_size, - int q_len, - int past_len, - const uint32_t* past_lens, - half* temp_q, - half* temp_k, - half* temp_v, - const half* sin, - const half* cos, - const std::vector& loras, - half* lora_temp - ); - - void forward_cuda_2 - ( - cublasHandle_t cublas_handle, - const half* attn_output, - half* hidden_state, - int q_len, - int batch_size, - const std::vector& loras, - half* lora_temp - ); - -private: - -}; - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu deleted file mode 100644 index 351b9cd5b..000000000 --- 
a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cu +++ /dev/null @@ -1,211 +0,0 @@ -#include "q_gemm.cuh" -#include "util.cuh" -#include "matrix_view.cuh" -#include "../config.h" - -#include "quant/qdq_2.cuh" -#include "quant/qdq_3.cuh" -#include "quant/qdq_4.cuh" -#include "quant/qdq_5.cuh" -#include "quant/qdq_6.cuh" -#include "quant/qdq_8.cuh" - -#define BLOCK_KN_SIZE 128 -#define BLOCK_M_SIZE_MAX 8 -#define MAX_GROUPS_IN_BLOCK (BLOCK_KN_SIZE / 32) -#define CLEAR_N_SIZE 256 - -#include "q_gemm_kernel.cuh" -#include "q_gemm_kernel_gptq.cuh" - -#include "compat_gemm.cuh" - -void gemm_half_q_half_cuda_part -( - const half* a, - QMatrix* b, - half* c, - int size_m, - int size_n, - int size_k, - int m_count, - bool clear -) -{ - if (!b->is_gptq) - { - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); - gridDim.y = DIVIDE(size_m, m_count); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); - - fp_gemm_half_q_half_kernel kernel = pick_gemm_half_q_half_kernel(true, m_count); - - kernel<<>> - ( - a, - b->cuda_q_weight, - b->cuda_q_scale, - b->cuda_q_scale_max, - c, - size_m, - size_n, - size_k, - b->groups, - b->groupsize, - b->cuda_q_perm, - b->rows_8, - b->rows_6, - b->rows_5, - b->rows_4, - b->rows_3, - b->rows_2, - clear - ); - } - else - { - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); - gridDim.y = DIVIDE(size_m, m_count); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); - - fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count); - -// DBGX((uint64_t) b->cuda_q_perm); -// DBGI(b->rows_4); -// DBGI(b->height); - - kernel<<>> - ( - a, - b->cuda_q_weight, - b->cuda_gptq_qzeros, - b->cuda_gptq_scales, - c, - size_m, - size_n, - size_k, - b->groups, - b->groupsize, - b->cuda_q_perm, - b->rows_4, - clear - ); - } -} - -void gemm_half_q_half_cuda -( - cublasHandle_t cublas_handle, - const half* a, - QMatrix* b, - half* c, - int size_m, - int size_n, - int size_k, - bool clear, - half* temp_dq, - bool force_cuda -) -{ - if (size_m > MAX_Q_GEMM_ROWS && !force_cuda) - { - //printf("cublas\n"); - - // Reconstruct FP16 matrix, then cuBLAS - - if (!temp_dq) temp_dq = b->temp_dq; - b->reconstruct(temp_dq); - - //cublasSetMathMode(cublas_handle, CUBLAS_TENSOR_OP_MATH); - - const half alpha = __float2half(1.0f); - const half beta = clear ? __float2half(0.0f) : __float2half(1.0f); - cublasHgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_n, size_m, size_k, - &alpha, temp_dq, size_n, - a, size_k, - &beta, c, size_n); - - //const float alpha = 1.0f; - //const float beta = clear ? 0.0f : 1.0f; - //cublasSgemmEx(cublas_handle, - // CUBLAS_OP_N, - // CUBLAS_OP_N, - // size_n, size_m, size_k, - // &alpha, temp_dq, CUDA_R_16F, size_n, - // a, CUDA_R_16F, size_k, - // &beta, c, CUDA_R_16F, size_n); - - //const float alpha = 1.0f; - //const float beta = clear ? 
0.0f : 1.0f; - //cublasGemmEx(cublas_handle, - // CUBLAS_OP_N, CUBLAS_OP_N, - // size_n, size_m, size_k, - // &alpha, temp_dq, CUDA_R_16F, size_n, - // a, CUDA_R_16F, size_k, - // &beta, c, CUDA_R_16F, size_n, - // CUDA_R_16F, CUBLAS_GEMM_DFALT_TENSOR_OP); - } - else - { - //printf("cuda\n"); - - // Quantized matmul - - //if (clear) clear_tensor_cuda(c, size_m, size_n); - - int max_chunks = size_m / BLOCK_M_SIZE_MAX; - int last_chunk = max_chunks * BLOCK_M_SIZE_MAX; - int last_chunk_size = size_m - last_chunk; - - if (max_chunks) - { - gemm_half_q_half_cuda_part(a, b, c, last_chunk, size_n, size_k, BLOCK_M_SIZE_MAX, clear); - } - - if (last_chunk_size) - { - gemm_half_q_half_cuda_part(a + last_chunk * size_k, b, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, clear); - } - } -} - -__global__ void clear_kernel -( - half* __restrict__ c, - const int size_m, - const int size_n -) -{ - int m = blockIdx.y; - int n = (blockIdx.x * CLEAR_N_SIZE + threadIdx.x) * 8; - if (n >= size_n) return; - int4* c_ptr = (int4*)(c + m * size_n + n); - *c_ptr = {}; -} - -void clear_tensor_cuda -( - half* c, - int size_m, - int size_n -) -{ - return; - dim3 blockDim, gridDim; - blockDim.x = CLEAR_N_SIZE; - blockDim.y = 1; - gridDim.x = DIVIDE(size_n / 8, CLEAR_N_SIZE); - gridDim.y = size_m; - clear_kernel<<>>(c, size_m, size_n); -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh deleted file mode 100644 index c69f1a709..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm.cuh +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _q_gemm_cuh -#define _q_gemm_cuh - -#include -#include -#include -#include -#include - -#include "q_matrix.cuh" - -void gemm_half_q_half_cuda -( - cublasHandle_t cublas_handle, - const half* a, - QMatrix* b, - half* c, - int size_m, - int size_n, - int size_k, - bool clear = false, - half* reconstruct = NULL, - bool force_cuda = false -); - -void clear_tensor_cuda -( - half* c, - int size_m, - int size_n -); - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh deleted file mode 100644 index 04643f65b..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel.cuh +++ /dev/null @@ -1,484 +0,0 @@ -#include "compat.cuh" - -__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); -} - -__forceinline__ __device__ half2 dot22_16(half2(&dq)[8], const half* a_ptr, const half2 g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); -} - -__forceinline__ __device__ half2 dot22_32(half2(&dq)[16], const half* a_ptr, const half2 g_result, const half qs_h) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); - return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); -} - -__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr, const float 
g_result, const float qs_f) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); - return fma(result_f, qs_f, g_result); -} - -__forceinline__ __device__ float dot22_16_f(half2(&dq)[8], const half* a_ptr, const float g_result, const float qs_f) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); - float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); - return fma(result_f, qs_f, g_result); -} - -__forceinline__ __device__ float dot22_32_f(half2(&dq)[16], const half* a_ptr, const float g_result, const float qs_f) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); - float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); - return fma(result_f, qs_f, g_result); -} - - - -typedef void (*fp_gemm_half_q_half_kernel) -( - const half*, - const uint32_t*, - const uint32_t*, - const half*, - half*, - const int, - const int, - const int, - const int, - const int, - const uint16_t*, - const int, - const int, - const int, - const int, - const int, - const int, - const bool -); - -template -__global__ void gemm_half_q_half_kernel -( - const half* __restrict__ a, - const uint32_t* __restrict__ b_q_weight, - const uint32_t* __restrict__ b_q_scale, - const half* __restrict__ b_q_scale_max, - half* __restrict__ c, - const int size_m, - const int size_n, - const int size_k, - const int groups, - const int groupsize, - const uint16_t* __restrict__ b_q_perm, - const int rows_8, - const int rows_6, - const int rows_5, - const int rows_4, - const int rows_3, - const int rows_2, - const bool clear -) -{ - MatrixView_half a_(a, size_m, size_k); - MatrixView_half_rw c_(c, size_m, size_n); - MatrixView_q4_row b_q_scale_(b_q_scale, groups, size_n); - - int t = threadIdx.x; - - // Block - - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; - - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - int n = offset_n + t * 4; - - // Preload block_a - - __shared__ half block_a[m_count][BLOCK_KN_SIZE]; - - if (offset_k + t < end_k) - { - for (int m = 0; m < m_count; ++m) - { - const half* a_ptr = a_.item_ptr(offset_m + m, 0); - half* block_a_ptr = block_a[m]; - half a0 = a_ptr[b_q_perm[offset_k + t]]; - block_a_ptr[t] = a0; - } - } - - // Clear - - if (n >= size_n) return; - - if (clear && blockIdx.z == 0) // && (threadIdx.x & 1) == 0) - { - for (int m = 0; m < m_count; m++) - *((uint64_t*) c_.item_ptr(offset_m + m, n)) = 0; - } - - __syncthreads(); - - // Find initial group - - int group = offset_k / groupsize; - - // Preload scales - - float scales[MAX_GROUPS_IN_BLOCK][4]; - - int groups_in_block = DIVIDE((end_k - offset_k), groupsize); - for (int g = 0; g < groups_in_block; g++) - { - int qscales[4]; - b_q_scale_.item4(qscales, group + g, n); - qscales[0]++; - qscales[1]++; - qscales[2]++; - qscales[3]++; - float maxscale = __half2float(b_q_scale_max[group + g]); - scales[g][0] = __int2float_rn(qscales[0] * qscales[0]) * maxscale; - scales[g][1] = 
__int2float_rn(qscales[1] * qscales[1]) * maxscale; - scales[g][2] = __int2float_rn(qscales[2] * qscales[2]) * maxscale; - scales[g][3] = __int2float_rn(qscales[3] * qscales[3]) * maxscale; - } - - // a, b offset - - int pre_rows_8 = min(rows_8, offset_k); - int pre_rows_6 = offset_k > rows_8 ? min(rows_6, offset_k) - rows_8 : 0; - int pre_rows_5 = offset_k > rows_6 ? min(rows_5, offset_k) - rows_6 : 0; - int pre_rows_4 = offset_k > rows_5 ? min(rows_4, offset_k) - rows_5 : 0; - int pre_rows_3 = offset_k > rows_4 ? min(rows_3, offset_k) - rows_4 : 0; - int pre_rows_2 = offset_k > rows_3 ? min(rows_2, offset_k) - rows_3 : 0; - int qk = 0; - qk += pre_rows_8 / 32 * 8; - qk += pre_rows_6 / 32 * 6; - qk += pre_rows_5 / 32 * 5; - qk += pre_rows_4 / 32 * 4; - qk += pre_rows_3 / 32 * 3; - qk += pre_rows_2 / 32 * 2; - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - const half* a_ptr = &block_a[0][0]; - int a_stride = BLOCK_KN_SIZE; - - // Initial group - - int scales_idx = 0; - float qs_f0 = scales[scales_idx][0]; - float qs_f1 = scales[scales_idx][1]; - float qs_f2 = scales[scales_idx][2]; - float qs_f3 = scales[scales_idx][3]; - int nextgroup = offset_k + groupsize; - - // Column result - - float block_c[m_count][4] = {}; - - // Dequantize groups - - int k = offset_k; - - while (k < rows_8 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 4; j++) - { - int4 load_int4[2]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][4]; - dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n); - dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n); - dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n); - dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_8_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_8_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_8_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_8_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 8; - } - k += 32; - } - - while (k < rows_6 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 2; j++) - { - int4 load_int4[3]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][8]; - dequant_6bit_16(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n); - dequant_6bit_16(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n); - dequant_6bit_16(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n); - dequant_6bit_16(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_16_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_16_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_16_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = 
dot22_16_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 16; - } - k += 32; - } - - while (k < rows_5 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 1; j++) - { - int4 load_int4[5]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[3] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[4] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][16]; - dequant_5bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, load_int4[3].x, load_int4[4].x, dq[0], size_n); - dequant_5bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, load_int4[3].y, load_int4[4].y, dq[1], size_n); - dequant_5bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, load_int4[3].z, load_int4[4].z, dq[2], size_n); - dequant_5bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, load_int4[3].w, load_int4[4].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_32_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_32_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_32_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_32_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 32; - } - - k += 32; - } - - while (k < rows_4 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 4; j++) - { - int4 load_int4[1]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][4]; - dequant_4bit_8(load_int4[0].x, dq[0], size_n); - dequant_4bit_8(load_int4[0].y, dq[1], size_n); - dequant_4bit_8(load_int4[0].z, dq[2], size_n); - dequant_4bit_8(load_int4[0].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_8_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_8_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_8_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_8_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 8; - } - k += 32; - } - - while (k < rows_3 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 1; j++) - { - int4 load_int4[3]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; - load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][16]; - dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n); - dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n); - dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n); - dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_32_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = 
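Because `blockIdx.z` splits the K dimension into `BLOCK_KN_SIZE` slices, every block produces only a partial sum, and the accumulation further down merges slices through packed fp16 atomics (after the `clear` pass has zeroed the thread's four output halves with one 64-bit store). A minimal sketch of that merge, with a hypothetical helper name:

    #include <cuda_fp16.h>

    // Native atomicAdd on half2 requires sm_60 or newer; two atomics cover
    // the four adjacent output columns owned by one thread.
    __device__ void merge_partial4(half2* out, const float c[4])
    {
        atomicAdd(out,     __halves2half2(__float2half_rn(c[0]), __float2half_rn(c[1])));
        atomicAdd(out + 1, __halves2half2(__float2half_rn(c[2]), __float2half_rn(c[3])));
    }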
dot22_32_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_32_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_32_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - a_ptr += 32; - } - k += 32; - } - - while (k < rows_2 && k < end_k) - { - if (k == nextgroup) - { - group++; - scales_idx++; - qs_f0 = scales[scales_idx][0]; - qs_f1 = scales[scales_idx][1]; - qs_f2 = scales[scales_idx][2]; - qs_f3 = scales[scales_idx][3]; - nextgroup += groupsize; - } - - #pragma unroll - for (int j = 0; j < 2; j++) - { - int4 load_int4[1]; - load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; - - half2 dq[4][8]; - dequant_2bit_16(load_int4[0].x, dq[0], size_n); - dequant_2bit_16(load_int4[0].y, dq[1], size_n); - dequant_2bit_16(load_int4[0].z, dq[2], size_n); - dequant_2bit_16(load_int4[0].w, dq[3], size_n); - - for (int m = 0; m < m_count; m++) - { - block_c[m][0] = dot22_16_f(dq[0], a_ptr + m * a_stride, block_c[m][0], qs_f0); - block_c[m][1] = dot22_16_f(dq[1], a_ptr + m * a_stride, block_c[m][1], qs_f1); - block_c[m][2] = dot22_16_f(dq[2], a_ptr + m * a_stride, block_c[m][2], qs_f2); - block_c[m][3] = dot22_16_f(dq[3], a_ptr + m * a_stride, block_c[m][3], qs_f3); - } - - a_ptr += 16; - } - k += 32; - } - - // Accumulate column sums in c - - for (int m = 0; m < m_count; m++) - { - half2* out = (half2*)c_.item_ptr(offset_m + m, n); - half2 result01 = __halves2half2(__float2half_rn(block_c[m][0]), __float2half_rn(block_c[m][1])); - half2 result23 = __halves2half2(__float2half_rn(block_c[m][2]), __float2half_rn(block_c[m][3])); - atomicAdd(out , result01); - atomicAdd(out + 1, result23); - } -} - -fp_gemm_half_q_half_kernel pick_gemm_half_q_half_kernel(bool first_block, const int m_count) -{ - #if BLOCK_M_SIZE_MAX >= 1 - if (m_count == 1) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 2 - if (m_count == 2) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 3 - if (m_count == 3) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 4 - if (m_count == 4) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 5 - if (m_count == 5) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 6 - if (m_count == 6) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 7 - if (m_count == 7) return gemm_half_q_half_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 8 - if (m_count == 8) return gemm_half_q_half_kernel; - #endif - return NULL; -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh deleted file mode 100644 index ebaa42d05..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_gemm_kernel_gptq.cuh +++ /dev/null @@ -1,219 +0,0 @@ -#include "compat.cuh" - -__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __hadd2(result, g_result); -} - -__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr) -{ - half2 result = {}; - const half2* a2_ptr = (const half2*)a_ptr; - #pragma unroll - for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); - return __half2float(__low2half(result)) + __half2float(__high2half(result)); -} - -typedef void (*fp_gemm_half_q_half_gptq_kernel) -( - const half*, - const uint32_t*, - const 
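`pick_gemm_half_q_half_kernel` maps the runtime row count onto one compile-time instantiation of the templated kernel above; a minimal sketch of that dispatch pattern, assuming `m_count` is the kernel's template parameter (stand-in names throughout):

    #include <cuda_fp16.h>

    template <int m_count>
    __global__ void k(const half* a, half* c, int size_n)
    {
        // hypothetical stand-in: each instantiation unrolls over m_count rows
    }

    typedef void (*fp_k)(const half*, half*, int);

    fp_k pick(const int m_count)
    {
        switch (m_count)
        {
            case 1: return k<1>;
            case 2: return k<2>;
            case 3: return k<3>;
            case 4: return k<4>; // ... continue up to BLOCK_M_SIZE_MAX
            default: return NULL;
        }
    }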
uint32_t*, - const half*, - half*, - const int, - const int, - const int, - const int, - const int, - const uint16_t*, - const int, - const bool -); - -template -__global__ void gemm_half_q_half_gptq_kernel -( - const half* __restrict__ a, - const uint32_t* __restrict__ b_q_weight, - const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - half* __restrict__ c, - const int size_m, - const int size_n, - const int size_k, - const int groups, - const int groupsize, - const uint16_t* __restrict__ b_q_perm, - const int rows_4, - const bool clear -) -{ - MatrixView_half a_(a, size_m, size_k); - MatrixView_half_rw c_(c, size_m, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int t = threadIdx.x; - - // Block - - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; - - int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); - int end_m = min(offset_m + m_count, size_m); - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - int n = offset_n + t * 4; - - // Preload block_a - - __shared__ half block_a[m_count][BLOCK_KN_SIZE]; - - if (offset_k + t < end_k) - { - for (int m = 0; m < m_count; ++m) - { - const half* a_ptr = a_.item_ptr(offset_m + m, 0); - half* block_a_ptr = block_a[m]; - - half a0; - if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; - else a0 = a_ptr[offset_k + t]; - block_a_ptr[t] = a0; - } - } - - // Zero output - - if (n >= size_n) return; - - if (clear && blockIdx.z == 0) // && (threadIdx.x & 1) == 0) - { - for (int m = 0; m < m_count; m++) - *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; - } - - __syncthreads(); - - // Find initial group - - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // a, b offset - - int qk = offset_k / (32 / 4); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - const half* a_ptr = &block_a[0][0]; - int a_stride = BLOCK_KN_SIZE; - - // Initial group - - int zeros[4]; - float scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_f(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - -// __syncthreads(); - - // Column result - - float block_c[m_count][4] = {}; - - // Dequantize and multiply - - int k = offset_k; - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_f(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - } - - #pragma unroll - for (int j = 0; j < 4; j++) - { - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - half2 dq[4][4]; - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); - - #pragma unroll - for (int m = 0; m < m_count; m++) - { - 
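The `dot22_8_f` calls just below follow the scheme set up at the top of this file: keep all eight products in packed fp16 FMAs, collapse to fp32 once, and apply the per-group scale in fp32 where it is cheap and accurate. An annotated restatement:

    #include <cuda_fp16.h>

    __device__ float dot8_scaled(const half2 (&dq)[4], const half* a, float acc, float scale)
    {
        half2 sum = {};                        // two fp16 lanes of partial sums
        const half2* a2 = (const half2*)a;
        #pragma unroll
        for (int i = 0; i < 4; i++)
            sum = __hfma2(dq[i], a2[i], sum);  // eight multiplies in four packed FMAs
        float s = __half2float(__low2half(sum)) + __half2float(__high2half(sum));
        return fma(s, scale, acc);             // scale and accumulate in fp32
    }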
block_c[m][0] = fma(dot22_8_f(dq[0], a_ptr + m * a_stride), scales[0], block_c[m][0]); - block_c[m][1] = fma(dot22_8_f(dq[1], a_ptr + m * a_stride), scales[1], block_c[m][1]); - block_c[m][2] = fma(dot22_8_f(dq[2], a_ptr + m * a_stride), scales[2], block_c[m][2]); - block_c[m][3] = fma(dot22_8_f(dq[3], a_ptr + m * a_stride), scales[3], block_c[m][3]); - } - - b_ptr += size_n; - a_ptr += 8; - } - - k += 32; - } - - for (int m = 0; m < m_count; m++) - { - half2 *out = (half2*) c_.item_ptr(offset_m + m, n); - half2 result01 = __halves2half2(__float2half_rn(block_c[m][0]), __float2half_rn(block_c[m][1])); - half2 result23 = __halves2half2(__float2half_rn(block_c[m][2]), __float2half_rn(block_c[m][3])); - atomicAdd(out , result01); - atomicAdd(out + 1, result23); - } -} - -fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(bool first_block, const int m_count) -{ - #if BLOCK_M_SIZE_MAX >= 1 - if (m_count == 1) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 2 - if (m_count == 2) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 3 - if (m_count == 3) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 4 - if (m_count == 4) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 5 - if (m_count == 5) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 6 - if (m_count == 6) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 7 - if (m_count == 7) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 8 - if (m_count == 8) return gemm_half_q_half_gptq_kernel; - #endif - return NULL; -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu deleted file mode 100644 index 6aed74705..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cu +++ /dev/null @@ -1,623 +0,0 @@ -#include "q_matrix.cuh" -#include "matrix_view.cuh" -#include "util.cuh" - -#include "quant/qdq_2.cuh" -#include "quant/qdq_3.cuh" -#include "quant/qdq_4.cuh" -#include "quant/qdq_5.cuh" -#include "quant/qdq_6.cuh" -#include "quant/qdq_8.cuh" - -#define BLOCK_KN_SIZE 128 - -#define THREADS_X 32 -#define THREADS_Y 32 - -// Shuffle quantized data on load - -__global__ void shuffle_kernel -( - uint32_t* __restrict__ b_q_weight, - const int size_k, - const int size_n, - const int rows_8, - const int rows_6, - const int rows_5, - const int rows_4, - const int rows_3, - const int rows_2 -) -{ - int n = blockIdx.x * THREADS_X + threadIdx.x; - if (n >= size_n) return; - int k = 0; - uint32_t* b_ptr = b_q_weight + n; - while (k < rows_8) { shuffle_8bit_4 (b_ptr, size_n); b_ptr += 1 * size_n; k += 4; } - while (k < rows_6) { shuffle_6bit_16(b_ptr, size_n); b_ptr += 3 * size_n; k += 16; } - while (k < rows_5) { shuffle_5bit_32(b_ptr, size_n); b_ptr += 5 * size_n; k += 32; } - while (k < rows_4) { shuffle_4bit_8 (b_ptr, size_n); b_ptr += 1 * size_n; k += 8; } - while (k < rows_3) { shuffle_3bit_32(b_ptr, size_n); b_ptr += 3 * size_n; k += 32; } - while (k < rows_2) { shuffle_2bit_16(b_ptr, size_n); b_ptr += 1 * size_n; k += 16; } -} - - -// QMatrix constructor - -QMatrix::QMatrix -( - const int _device, - const int _height, - const int _width, - const int _groups, - - uint32_t* _q_weight, - uint16_t* _q_perm, - uint16_t* _q_invperm, - uint32_t* _q_scale, - half* _q_scale_max, - uint16_t* _q_groups, - - uint32_t* _gptq_qzeros, - half* _gptq_scales, - uint32_t* _gptq_g_idx, - - half* _temp_dq -) : - 
device(_device), - height(_height), - width(_width), - groups(_groups), - temp_dq(_temp_dq) -{ - cudaSetDevice(device); - - failed = false; - - cuda_q_weight = _q_weight; - cuda_q_perm = _q_perm; - cuda_q_invperm = _q_invperm; - cuda_q_scale = _q_scale; - cuda_q_scale_max = _q_scale_max; - cuda_q_groups = _q_groups; - cuda_gptq_qzeros = _gptq_qzeros; - cuda_gptq_scales = _gptq_scales; - - is_gptq = (_gptq_qzeros != NULL); - - groupsize = 1; - while (groupsize * groups < height) groupsize *= 2; - - // Create group map - - rows_8 = 0; - rows_6 = 0; - rows_5 = 0; - rows_4 = 0; - rows_3 = 0; - rows_2 = 0; - - if (!is_gptq) - { - uint16_t* cpu_q_groups = (uint16_t*)calloc(groups * 2, sizeof(uint16_t)); - cudaMemcpy(cpu_q_groups, cuda_q_groups, groups * 2 * sizeof(uint16_t), cudaMemcpyDeviceToHost); - - for (int i = 0; i < groups; i++) - { - int bits = cpu_q_groups[i * 2]; - if (bits == 8) rows_8 += groupsize; - if (bits == 6) rows_6 += groupsize; - if (bits == 5) rows_5 += groupsize; - if (bits == 4) rows_4 += groupsize; - if (bits == 3) rows_3 += groupsize; - if (bits == 2) rows_2 += groupsize; - } - - free(cpu_q_groups); - - rows_6 += rows_8; - rows_5 += rows_6; - rows_4 += rows_5; - rows_3 += rows_4; - rows_2 += rows_3; - } - else - { - rows_4 = height; - rows_3 = height; - rows_2 = height; - - if (_gptq_g_idx) - { - if (!make_sequential(_gptq_g_idx)) - { - failed = true; - //printf("FAIL\n"); - return; - } - } - } - - // Shuffle quantized data - - dim3 blockDim, gridDim; - blockDim.x = THREADS_X; - blockDim.y = 1; - gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = 1; - - shuffle_kernel<<>>(cuda_q_weight, height, width, rows_8, rows_6, rows_5, rows_4, rows_3, rows_2); -} - -QMatrix::~QMatrix() -{ -} - -// Reconstruct b[k,n] (GPTQ) - -__global__ void reconstruct_gptq_kernel -( - const uint32_t* __restrict__ b_q_weight, - const uint16_t* __restrict__ b_q_perm, - const uint32_t* __restrict__ b_gptq_qzeros, - const half* __restrict__ b_gptq_scales, - //const uint16_t* __restrict__ b_q_groups, - const int size_k, - const int size_n, - const int groupsize, - const int groups, - half* __restrict__ b, - const int rows_4 -) -{ - MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); - MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; - - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - - // Preload remapping table - - __shared__ uint16_t perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; - - if (b_q_perm) - { - if (offset_k + t < size_k) - perm[t] = b_q_perm[offset_k + t]; - } - - // Column - - int n = offset_n + t * 4; - if (n >= size_n) return; - - // Find initial group - - int group = offset_k / groupsize; - int nextgroup = offset_k + groupsize; - - // b offset - - int qk = offset_k / (32 / 4); - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - - // Initial zeros/scale - - int zeros[4]; - half2 scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; - b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - - __syncthreads(); - - int k = offset_k; - int lk = 0; - - while (k < end_k) - { - if (k == nextgroup) - { - group++; - nextgroup += groupsize; - 
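Note that the `rows_*` members built in the constructor above are cumulative boundaries, not per-width counts: after the `rows_6 += rows_8; ...` chain, `rows_b` is the first row index past the last b-bit row. For example, with `groupsize = 32` and per-group widths {8, 6, 4, 4}:

    rows_8 = 32, rows_6 = 64, rows_5 = 64, rows_4 = 128, rows_3 = 128, rows_2 = 128

so every `while (k < rows_b && ...)` loop in the kernels simply falls through for bit widths that never occur.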
b_gptq_qzeros_.item4(zeros, group, n); - b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); - } - - for (int p = 0; p < 4; p++) - { - half2 dq[4][4]; - const int4* b_ptr4 = (int4*) b_ptr; - int4 load_int4 = *b_ptr4; - - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); - - b_ptr += size_n; - //half* dqh = (half*)dq; - if (b_q_perm) - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - else - { - for (int j = 0; j < 4; j++) - { - for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); - b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); - b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); - } - } - } - k += 32; - } -} - - -// Reconstruct b[k,n] - -__global__ void reconstruct_kernel -( - const uint32_t* __restrict__ b_q_weight, - const uint16_t* __restrict__ b_q_perm, - const uint32_t* __restrict__ b_q_scale, - const half* __restrict__ b_q_scale_max, - //const uint16_t* __restrict__ b_q_groups, - const int size_k, - const int size_n, - const int groupsize, - const int groups, - half* __restrict__ b, - const int rows_8, - const int rows_6, - const int rows_5, - const int rows_4, - const int rows_3, - const int rows_2 -) -{ - MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q4_row b_q_scale_(b_q_scale, groups, size_n); - - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x; - - // Preload remapping table - - int t = threadIdx.x; - __shared__ uint16_t perm[BLOCK_KN_SIZE]; - if (offset_k + t < size_k) - perm[t] = b_q_perm[offset_k + t]; - - // Column - - int n = offset_n + t; - if (n >= size_n) return; - - // Find initial group - - int group = offset_k / groupsize; - - int pre_rows_8 = min(rows_8, offset_k); - int pre_rows_6 = offset_k > rows_8 ? min(rows_6, offset_k) - rows_8 : 0; - int pre_rows_5 = offset_k > rows_6 ? min(rows_5, offset_k) - rows_6 : 0; - int pre_rows_4 = offset_k > rows_5 ? min(rows_4, offset_k) - rows_5 : 0; - int pre_rows_3 = offset_k > rows_4 ? min(rows_3, offset_k) - rows_4 : 0; - int pre_rows_2 = offset_k > rows_3 ? 
min(rows_2, offset_k) - rows_3 : 0; - int qk = 0; - qk += pre_rows_8 / 32 * 8; - qk += pre_rows_6 / 32 * 6; - qk += pre_rows_5 / 32 * 5; - qk += pre_rows_4 / 32 * 4; - qk += pre_rows_3 / 32 * 3; - qk += pre_rows_2 / 32 * 2; - - const uint32_t* b_ptr = b_q_weight + qk * size_n + n; - - half qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); - half2 qs_h2 = __halves2half2(qs_h, qs_h); - int nextgroup = offset_k + groupsize; - - int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); - int k = offset_k; - int lk = 0; - - __syncthreads(); - - while (k < rows_8 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 4; p++) - { - half2 dq[4]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - uint32_t q_1 = *b_ptr; b_ptr += size_n; - dequant_8bit_8(q_0, q_1, dq, size_n); - for (int j = 0; j < 4; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 8; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_6 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 2; p++) - { - half2 dq[8]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - uint32_t q_1 = *b_ptr; b_ptr += size_n; - uint32_t q_2 = *b_ptr; b_ptr += size_n; - dequant_6bit_16(q_0, q_1, q_2, dq, size_n); - for (int j = 0; j < 8; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 16; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_5 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 1; p++) - { - half2 dq[16]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - uint32_t q_1 = *b_ptr; b_ptr += size_n; - uint32_t q_2 = *b_ptr; b_ptr += size_n; - uint32_t q_3 = *b_ptr; b_ptr += size_n; - uint32_t q_4 = *b_ptr; b_ptr += size_n; - dequant_5bit_32(q_0, q_1, q_2, q_3, q_4, dq, size_n); - for (int j = 0; j < 16; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 32; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_4 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 4; p++) - { - half2 dq[4]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - dequant_4bit_8(q_0, dq, size_n); - for (int j = 0; j < 4; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 8; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_3 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 1; p++) - { - half2 dq[16]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - uint32_t q_1 = *b_ptr; b_ptr += size_n; - uint32_t q_2 = *b_ptr; b_ptr += size_n; - dequant_3bit_32(q_0, q_1, q_2, dq, size_n); - for (int j = 0; j < 16; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 32; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } - - while (k < rows_2 && k < end_k) - { - if (k == nextgroup) { group++; qs_h = 
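This reconstruction path exists because, past a modest batch size, it is usually cheaper to dequantize the weights once into a scratch fp16 matrix and run a plain GEMM than to redo the dequantization inside a fused kernel. A hedged sketch of the consumer (assuming row-major operands and the usual swap for column-major cuBLAS):

    #include <cublas_v2.h>
    #include <cuda_fp16.h>

    // C[m,n] = A[m,k] * B[k,n], all row-major fp16: compute C^T = B^T * A^T
    // in cuBLAS's column-major view.
    void gemm_fp16(cublasHandle_t h, const half* a, const half* b_dq, half* c,
                   int m, int n, int k)
    {
        const half alpha = __float2half(1.0f);
        const half beta  = __float2half(0.0f);
        cublasHgemm(h, CUBLAS_OP_N, CUBLAS_OP_N,
                    n, m, k,
                    &alpha, b_dq, n, a, k,
                    &beta, c, n);
    }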
dq_scale(b_q_scale_.item(group, n), b_q_scale_max[group]); nextgroup += groupsize; qs_h2 = __halves2half2(qs_h, qs_h); } - for (int p = 0; p < 2; p++) - { - half2 dq[8]; - uint32_t q_0 = *b_ptr; b_ptr += size_n; - dequant_2bit_16(q_0, dq, size_n); - for (int j = 0; j < 8; j++) dq[j] = __hmul2(dq[j], qs_h2); - half* dqh = (half*) dq; - for (int j = 0; j < 16; j++) b_.set(perm[lk++], n, dqh[j]); - } - k += 32; - } -} - -void QMatrix::reconstruct(half* out) -{ - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - gridDim.y = DIVIDE(height, BLOCK_KN_SIZE); - - if (!is_gptq) - { - gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); - reconstruct_kernel<<>> - ( - cuda_q_weight, - cuda_q_perm, - cuda_q_scale, - cuda_q_scale_max, - //cuda_q_groups, - height, - width, - groupsize, - groups, - out, - rows_8, - rows_6, - rows_5, - rows_4, - rows_3, - rows_2 - ); - } - else - { - gridDim.x = DIVIDE(width, BLOCK_KN_SIZE * 4); - reconstruct_gptq_kernel<<>> - ( - cuda_q_weight, - cuda_q_perm, - cuda_gptq_qzeros, - cuda_gptq_scales, - //const uint16_t* __restrict__ b_q_groups, - height, - width, - groupsize, - groups, - out, - rows_4 - ); - } -} - -__global__ void make_sequential_kernel -( - const uint32_t* __restrict__ w, - uint32_t* __restrict__ w_new, - const uint16_t* __restrict__ q_perm, - const int w_height, - const int w_width -) -{ - const uint64_t* w2 = (uint64_t*) w; - uint64_t* w_new2 = (uint64_t*) w_new; - int w2_stride = w_width >> 1; - - int w2_column = THREADS_X * blockIdx.x + threadIdx.x; - if (w2_column >= w2_stride) return; - - int w_new2_row = blockIdx.y; - - int q_perm_idx = w_new2_row << 3; - - uint64_t dst = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - int source_row = q_perm[q_perm_idx++]; - - int w2_row = source_row >> 3; - int w2_subrow = source_row & 0x07; - int w2_row_shift = w2_subrow << 2; - int wnew2_row_shift = i << 2; - - uint64_t src = w2[w2_row * w2_stride + w2_column]; - src >>= w2_row_shift; - src &= 0x0000000f0000000f; - src <<= wnew2_row_shift; - dst |= src; - } - - w_new2[w_new2_row * w2_stride + w2_column] = dst; -} - -bool QMatrix::make_sequential(const uint32_t* cpu_g_idx) -{ - uint32_t* cuda_new_qweight = NULL; - cudaError_t err = cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t)); - if (err != cudaSuccess) { - cudaError_t cuda_status = cudaGetLastError(); // Clear error - return false; - } - - uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t)); - uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t)); - uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t)); - - // Group histogram - - for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++; - - // Group map - - for (int i = 0, acc = 0; i < groups; i++) - { - short tmp = cpu_g_idx_map[i]; - cpu_g_idx_map[i] = acc; - acc += tmp; - } - - // X map (inverse) - - for (int row = 0; row < height; row++) - { - uint32_t target_group = cpu_g_idx[row]; - uint32_t target_row = cpu_g_idx_map[target_group]; - cpu_g_idx_map[target_group]++; - cpu_x_map_inv[row] = target_row; - } - - // X map - - for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row; - - // Reduce to uint16_t - - uint16_t* cpu_x_map16 = (uint16_t*)cpu_x_map; - uint16_t* cpu_x_map_inv16 = (uint16_t*)cpu_x_map_inv; - for (int row = 0; row < height; row++) cpu_x_map16[row] = (uint16_t) cpu_x_map[row]; - for (int row = 0; row < height; row++) cpu_x_map_inv16[row] = (uint16_t) cpu_x_map_inv[row]; - - // Move to CUDA - - cudaMemcpyAsync(cuda_q_perm, 
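`make_sequential` is a counting sort over the GPTQ `g_idx` array: histogram the group ids, prefix-sum the histogram into starting offsets, then scatter each row to the next free slot of its group. The CPU side in miniature (hypothetical helper):

    #include <cstdint>
    #include <vector>

    // Returns inv with inv[old_row] = new_row, so that rows of each group end
    // up contiguous in ascending group order (cpu_x_map_inv above).
    std::vector<uint32_t> act_order_perm(const std::vector<uint32_t>& g_idx, int groups)
    {
        std::vector<uint32_t> start(groups + 1, 0);
        for (uint32_t g : g_idx) start[g + 1]++;                   // histogram
        for (int g = 0; g < groups; g++) start[g + 1] += start[g]; // prefix sum
        std::vector<uint32_t> inv(g_idx.size());
        for (size_t r = 0; r < g_idx.size(); r++)
            inv[r] = start[g_idx[r]]++;                            // stable scatter
        return inv;
    }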
cpu_x_map16, height * sizeof(uint16_t), cudaMemcpyHostToDevice); - cudaMemcpyAsync(cuda_q_invperm, cpu_x_map_inv16, height * sizeof(uint16_t), cudaMemcpyHostToDevice); - - // Rearrange rows in w - - dim3 blockDim, gridDim; - blockDim.x = THREADS_X; - blockDim.y = 1; - gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = height / 8; - - make_sequential_kernel<<>> - ( - cuda_q_weight, - cuda_new_qweight, - cuda_q_perm, - height / 8, - width - ); - - // Replace qweights - - cudaMemcpyAsync(cuda_q_weight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); - - // Cleanup - - cudaDeviceSynchronize(); - - cudaFree(cuda_new_qweight); - free(cpu_g_idx_map); - free(cpu_x_map); - free(cpu_x_map_inv); - - return true; -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh deleted file mode 100644 index dda83a4f3..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_matrix.cuh +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef _q_matrix_cuh -#define _q_matrix_cuh - -#include -#include -#include -#include - -#define MAX_SUPERGROUPS 16 - -class QMatrix -{ -public: - - int device; - bool is_gptq; - - int height; - int width; - int groups; - int groupsize; - - int rows_8; - int rows_6; - int rows_5; - int rows_4; - int rows_3; - int rows_2; - - uint32_t* cuda_q_weight = NULL; - uint16_t* cuda_q_perm = NULL; - uint16_t* cuda_q_invperm = NULL; - uint32_t* cuda_q_scale = NULL; - half* cuda_q_scale_max = NULL; - uint16_t* cuda_q_groups = NULL; - uint32_t* cuda_gptq_qzeros = NULL; - half* cuda_gptq_scales = NULL; - - half* temp_dq; - - bool failed; - - QMatrix - ( - const int _device, - const int _height, - const int _width, - const int _groups, - - uint32_t* _q_weight, - uint16_t* _q_perm, - uint16_t* _q_invperm, - uint32_t* _q_scale, - half* _q_scale_max, - uint16_t* _q_groups, - - uint32_t* _gptq_qzeros, - half* _gptq_scales, - uint32_t* _gptq_g_idx, - - half* _temp_dq - ); - - ~QMatrix(); - - void reconstruct(half* out); - bool make_sequential(const uint32_t* cpu_g_idx); - -private: - -}; - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cu deleted file mode 100644 index 87fe8ef3f..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cu +++ /dev/null @@ -1,162 +0,0 @@ -#include "q_mlp.cuh" -#include "q_gemm.cuh" -#include "rms_norm.cuh" -#include "util.cuh" -#include "matrix_view.cuh" -#include "lora.cuh" - -#if defined(USE_ROCM) -__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { - return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half2_raw>(x).data.x)), - static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half2_raw>(x).data.y))}; -} -#define h2rcp __compat_h2rcp -#endif - -const int THREADS_X = 32; -const int THREADS_Y = 4; -// const int MAX_DIMENSION = 8192; - -__device__ __forceinline__ half silu(half x) -{ - half one = __float2half(1.0f); - half neg_x = __hneg(x); - half e = hexp(neg_x); - half sum = __hadd(one, e); - half r = hrcp(sum); - half result = __hmul(x, r); - return result; -} - -__device__ __forceinline__ half2 silu(half2 x) -{ - half2 one = __float2half2_rn(1.0f); - half2 neg_x = __hneg2(x); - half2 e = h2exp(neg_x); - half2 sum = __hadd2(one, e); - half2 r = h2rcp(sum); - half2 result = __hmul2(x, r); - return result; -} - -typedef void (*fp_silu_mul_kernel) -( - half*, - const half*, - const int, - const int -); - -template 
-__global__ void silu_mul_kernel -( - half* __restrict__ x, - const half* __restrict__ y, - const int height, - const int width -) -{ - MatrixView_half_rw x_(x, height, width); - MatrixView_half y_(y, height, width); - - int column = (THREADS_X * blockIdx.x + threadIdx.x); if constexpr (use_half2) column *= 2; - int row = THREADS_Y * blockIdx.y + threadIdx.y; - if (row >= height) return; - - // silu(x) * y - - if constexpr (use_half2) - { - half2 one = __half2half2(__float2half(1.0f)); - - half2 x_item = x_.item_half2(row, column); - half2 y_item = y_.item_half2(row, column); - - x_item = silu(x_item); - x_item = __hmul2(x_item, y_item); - - x_.set_half2(row, column, x_item); - } - else - { - half one = __float2half(1.0f); - - half x_item = x_.item(row, column); - half y_item = y_.item(row, column); - - x_item = silu(x_item); - x_item = __hmul(x_item, y_item); - - x_.set(row, column, x_item); - } -} - -fp_silu_mul_kernel pick_silu_mul_kernel(bool use_half2) -{ - if (use_half2) return silu_mul_kernel; - else return silu_mul_kernel; -}; - - -QMLP::QMLP -( - half* _layernorm, - float _norm_epsilon, - QMatrix* _gate, - QMatrix* _up, - QMatrix* _down, - half* _temp_state, - half* _temp_a, - half* _temp_b, - half* _temp_dq, - int _max_rows -): - layernorm(_layernorm), - norm_epsilon(_norm_epsilon), - gate(_gate), - up(_up), - down(_down), - temp_state(_temp_state), - temp_a(_temp_a), - temp_b(_temp_b), - temp_dq(_temp_dq), - max_rows(_max_rows) -{ -} - -QMLP::~QMLP() { -} - -void QMLP::forward_ -( - cublasHandle_t cublas_handle, - half* x, - int rows, - int columns, - const std::vector& loras, - half* lora_temp -) -{ - bool use_half2 = true; - int intermediate_size = gate->width; - - rms_norm_cuda(x, layernorm, temp_state, norm_epsilon, rows, columns); - gemm_half_q_half_cuda(cublas_handle, temp_state, gate, temp_a, rows, intermediate_size, columns, true, temp_dq); - gemm_half_q_half_cuda(cublas_handle, temp_state, up, temp_b, rows, intermediate_size, columns, true, temp_dq); - - apply_loras_cuda(cublas_handle, gate_proj_lora, loras, gate, temp_state, temp_a, lora_temp, rows); - apply_loras_cuda(cublas_handle, up_proj_lora, loras, up, temp_state, temp_b, lora_temp, rows); - - dim3 blockDim, gridDim; - blockDim.x = THREADS_X; - blockDim.y = THREADS_Y; - gridDim.x = DIVIDE(up->width, THREADS_X) / (use_half2 ? 
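For reference, `forward_` here is the standard gated MLP of LLaMA-family models; in symbols:

$$ \mathrm{MLP}(x) = W_{\mathrm{down}}\bigl(\operatorname{silu}(W_{\mathrm{gate}}\,\bar x)\odot W_{\mathrm{up}}\,\bar x\bigr), \qquad \bar x = \mathrm{RMSNorm}(x), \qquad \operatorname{silu}(t) = \frac{t}{1 + e^{-t}}, $$

with the element-wise $\operatorname{silu}(\cdot)\odot(\cdot)$ fused into the single `silu_mul_kernel` launch and LoRA deltas added after each quantized GEMM.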
2 : 1); - gridDim.y = DIVIDE(rows, THREADS_Y); - - fp_silu_mul_kernel kernel = pick_silu_mul_kernel(use_half2); - kernel<<>>(temp_a, temp_b, rows, intermediate_size); - - gemm_half_q_half_cuda(cublas_handle, temp_a, down, x, rows, columns, intermediate_size, false, temp_dq); - - apply_loras_cuda(cublas_handle, down_proj_lora, loras, down, temp_a, x, lora_temp, rows); -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cuh deleted file mode 100644 index ed3802559..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/q_mlp.cuh +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _q_mlp_cuh -#define _q_mlp_cuh - -#include -#include -#include -#include -#include - -#include "q_matrix.cuh" - -class QMLP -{ -public: - - half* layernorm; - float norm_epsilon; - - QMatrix* gate; - QMatrix* up; - QMatrix* down; - - half* temp_state; - half* temp_a; - half* temp_b; - half* temp_dq; - - int device; - int max_rows; - - std::unordered_map> gate_proj_lora; - std::unordered_map> up_proj_lora; - std::unordered_map> down_proj_lora; - - QMLP - ( - half* _layernorm, - float _norm_epsilon, - QMatrix* _gate, - QMatrix* _up, - QMatrix* _down, - half* _temp_state, - half* _temp_a, - half* _temp_b, - half* _temp_dq, - int _max_rows - ); - - ~QMLP(); - - void forward_ - ( - cublasHandle_t cublas_handle, - half* x, - int rows, - int columns, - const std::vector& loras, - half* lora_temp - ); - -private: - -}; - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh deleted file mode 100644 index 3beaeefa9..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_2.cuh +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef _qdq_2_cuh -#define _qdq_2_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_2BIT == 1 - -// Permutation: -// -// ffddbb99 77553311 eeccaa88 66442200 - -__forceinline__ __device__ void shuffle_2bit_16 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0]; - uint32_t qb = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - uint32_t qa0 = qa & 0x03; - uint32_t qa1 = (qa & 0x0c) >> 2; - qa >>= 4; - qb |= (qa1 << (i * 2 + 16)); - qb |= (qa0 << (i * 2)); - } - q[0] = qb; -} - -__forceinline__ __device__ void dequant_2bit_16 -( - const uint32_t q_0, - half2 (&dq)[8], - int stride -) -{ - const uint32_t c0 = 0x64006400; - const half y4_ = __float2half_rn(1.0f / 4.0f); - const half y16_ = __float2half_rn(1.0f / 16.0f); - const half y64_ = __float2half_rn(1.0f / 64.0f); - const half2 y4 = __halves2half2(y4_, y4_); - const half2 y16 = __halves2half2(y16_, y16_); - const half2 y64 = __halves2half2(y64_, y64_); - const half z1_ = __float2half_rn(-1024.0f - 2.0f); - const half z4_ = __float2half_rn(-1024.0f / 4.0f - 2.0f); - const half z16_ = __float2half_rn(-1024.0f / 16.0f - 2.0f); - const half z64_ = __float2half_rn(-1024.0f / 64.0f - 2.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z4 = __halves2half2(z4_, z4_); - const half2 z16 = __halves2half2(z16_, z16_); - const half2 z64 = __halves2half2(z64_, z64_); - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) * 4 + 1024 - half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024 - half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024 - qa >>= 8; - half2_uint32 
q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8]) + 1024 - half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) * 4 + 1024 - half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024 - half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024 - - dq[0] = __hadd2(q0.as_half2, z1); - dq[1] = __hfma2(q1.as_half2, y4, z4); - dq[2] = __hfma2(q2.as_half2, y16, z16); - dq[3] = __hfma2(q3.as_half2, y64, z64); - dq[4] = __hadd2(q4.as_half2, z1); - dq[5] = __hfma2(q5.as_half2, y4, z4); - dq[6] = __hfma2(q6.as_half2, y16, z16); - dq[7] = __hfma2(q7.as_half2, y64, z64); -} - -#else - -__forceinline__ __device__ void shuffle_2bit_16 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_2bit_16 -( - const uint32_t q_0, - half2 (&dq)[8], - int stride -) -{ - half dqh[16]; - for (int i = 0; i < 16; i++) dqh[i] = dq_ns(exb(q_0, i * 2, 0x03), 2); - - for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -#endif - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh deleted file mode 100644 index 101173763..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_3.cuh +++ /dev/null @@ -1,169 +0,0 @@ -#ifndef _qdq_3_cuh -#define _qdq_3_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_3BIT == 1 - -// Permutation: -// -// v9997775 55333111 u8886664 44222000 (u, v lsb) -// vjjjhhhf ffdddbbb uiiiggge eecccaaa -// vtttrrrp ppnnnlll usssqqqo oommmkkk - -__forceinline__ __device__ void shuffle_3bit_32 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0 * stride]; - uint32_t qb = q[1 * stride]; - uint32_t qc = q[2 * stride]; - - // qa: aa999888 77766655 54443332 22111000 - // qb: lkkkjjji iihhhggg fffeeedd dcccbbba - // qc: vvvuuutt tsssrrrq qqpppooo nnnmmmll - - uint32_t qd = qc >> 26; - qc <<= 4; - qc |= qb >> 28; - qb <<= 2; - qb |= qa >> 30; - - // qa: ..999888 77766655 54443332 22111000 - // qb: ..jjjiii hhhgggff feeedddc ccbbbaaa - // qc: ..tttsss rrrqqqpp pooonnnm mmlllkkk - // qd: vvvuuu - - uint32_t za = 0; - uint32_t zb = 0; - uint32_t zc = 0; - - for (int i = 0; i < 5; i++) { uint32_t t0 = qa & 0x07; uint32_t t1 = (qa & 0x38) >> 3; qa >>= 6; za |= (t0 << (i * 3)); za |= (t1 << (i * 3 + 16)); } - for (int i = 0; i < 5; i++) { uint32_t t0 = qb & 0x07; uint32_t t1 = (qb & 0x38) >> 3; qb >>= 6; zb |= (t0 << (i * 3)); zb |= (t1 << (i * 3 + 16)); } - for (int i = 0; i < 5; i++) { uint32_t t0 = qc & 0x07; uint32_t t1 = (qc & 0x38) >> 3; qc >>= 6; zc |= (t0 << (i * 3)); zc |= (t1 << (i * 3 + 16)); } - - // za: 9997775 55333111 8886664 44222000 - // zb: jjjhhhf ffdddbbb iiiggge eecccaaa - // zc: tttrrrp ppnnnlll sssqqqo oommmkkk - // qd: vvvuuu - - za |= ((qd & 0x01) >> 0) << 15; - zb |= ((qd & 0x02) >> 1) << 15; - zc |= ((qd & 0x04) >> 2) << 15; - za |= ((qd & 0x08) >> 3) << 31; - zb |= ((qd & 0x10) >> 4) << 31; - zc |= ((qd & 0x20) >> 5) << 31; - - // za: v9997775 55333111 u8886664 44222000 (u, v lsb) - // zb: vjjjhhhf ffdddbbb uiiiggge eecccaaa - // zc: vtttrrrp ppnnnlll usssqqqo oommmkkk - - q[0 * stride] = za; - q[1 * stride] = zb; - q[2 * stride] = zc; -} - -__forceinline__ __device__ void dequant_3bit_32 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - half2 (&dq)[16], - int stride -) -{ - const uint32_t c0 = 0x64006400; - const half y8_ = __float2half_rn(1.0f / 8.0f); - const half y64_ = __float2half_rn(1.0f / 
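Every decoder in these `qdq_*` headers leans on one fp16 identity: `0x6400` is the bit pattern of 1024.0, whose mantissa ulp is exactly 1, so OR-ing in a field whose value is $q \cdot 2^{j}$ (a b-bit quantity sitting j bits above the low mantissa bit) produces the half value $1024 + q \cdot 2^{j}$ with no conversion instruction. One packed FMA per pair then removes both the bias and the implicit zero-point:

$$ (1024 + 2^{j} q)\cdot 2^{-j} + \Bigl(-\frac{1024}{2^{j}} - z\Bigr) = q - z, \qquad z = 2^{\,b-1}, $$

which is exactly where constants such as `y8_ = 1/8` and `z8_ = -1024/8 - 4` come from; fields already sitting at the low bits (j = 0) skip the multiply and use a plain `__hadd2`.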
64.0f); - const half2 y8 = __halves2half2(y8_, y8_); - const half2 y64 = __halves2half2(y64_, y64_); - const half z1_ = __float2half_rn(-1024.0f - 4.0f); - const half z8_ = __float2half_rn(-1024.0f / 8.0f - 4.0f); - const half z64_ = __float2half_rn(-1024.0f / 64.0f - 4.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z8 = __halves2half2(z8_, z8_); - const half2 z64 = __halves2half2(z64_, z64_); - - uint32_t qa = q_0; - uint32_t qb = q_1; - uint32_t qc = q_2; - - half2_uint32 q0((qa & 0x00070007) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x00380038) | c0); // half2(q[ 2], q[ 3]) * 8 + 1024 - qa >>= 6; - half2_uint32 q2((qa & 0x00070007) | c0); // half2(q[ 4], q[ 5]) + 1024 - half2_uint32 q3((qa & 0x00380038) | c0); // half2(q[ 6], q[ 7]) * 8 + 1024 - half2_uint32 q4((qa & 0x01c001c0) | c0); // half2(q[ 8], q[ 9]) * 64 + 1024 - qa >>= 9; - qa &= 0x00010001; - half2_uint32 q5((qb & 0x00070007) | c0); // half2(q[10], q[11]) + 1024 - half2_uint32 q6((qb & 0x00380038) | c0); // half2(q[12], q[13]) * 8 + 1024 - qb >>= 6; - half2_uint32 q7((qb & 0x00070007) | c0); // half2(q[14], q[15]) + 1024 - half2_uint32 q8((qb & 0x00380038) | c0); // half2(q[16], q[17]) * 8 + 1024 - half2_uint32 q9((qb & 0x01c001c0) | c0); // half2(q[18], q[19]) * 64 + 1024 - qb >>= 8; - qb &= 0x00020002; - half2_uint32 q10((qc & 0x00070007) | c0); // half2(q[20], q[21]) + 1024 - half2_uint32 q11((qc & 0x00380038) | c0); // half2(q[22], q[23]) * 8 + 1024 - qc >>= 6; - half2_uint32 q12((qc & 0x00070007) | c0); // half2(q[24], q[25]) + 1024 - half2_uint32 q13((qc & 0x00380038) | c0); // half2(q[26], q[27]) * 8 + 1024 - half2_uint32 q14((qc & 0x01c001c0) | c0); // half2(q[28], q[29]) * 64 + 1024 - qc >>= 7; - qc &= 0x00040004; - half2_uint32 q15((qa | qb | qc) | c0); - - dq[ 0] = __hadd2( q0.as_half2, z1); - dq[ 1] = __hfma2( q1.as_half2, y8, z8); - dq[ 2] = __hadd2( q2.as_half2, z1); - dq[ 3] = __hfma2( q3.as_half2, y8, z8); - dq[ 4] = __hfma2( q4.as_half2, y64, z64); - dq[ 5] = __hadd2( q5.as_half2, z1); - dq[ 6] = __hfma2( q6.as_half2, y8, z8); - dq[ 7] = __hadd2( q7.as_half2, z1); - dq[ 8] = __hfma2( q8.as_half2, y8, z8); - dq[ 9] = __hfma2( q9.as_half2, y64, z64); - dq[10] = __hadd2(q10.as_half2, z1); - dq[11] = __hfma2(q11.as_half2, y8, z8); - dq[12] = __hadd2(q12.as_half2, z1); - dq[13] = __hfma2(q13.as_half2, y8, z8); - dq[14] = __hfma2(q14.as_half2, y64, z64); - dq[15] = __hadd2(q15.as_half2, z1); -} - -#else - -__forceinline__ __device__ void shuffle_3bit_32 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_3bit_32 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - half2 (&dq)[16], - int stride -) -{ - half dqh[32]; - for (int i = 0; i < 10; i++) dqh[ i] = dq_ns(exb( q_0, i * 3 , 0x07), 4); - dqh[10 ] = dq_ns(exb(q_1, q_0, 30, 0x07), 4); - for (int i = 0; i < 10; i++) dqh[11 + i] = dq_ns(exb( q_1, i * 3 + 1, 0x07), 4); - dqh[21 ] = dq_ns(exb(q_2, q_1, 31, 0x07), 4); - for (int i = 0; i < 10; i++) dqh[22 + i] = dq_ns(exb( q_2, i * 3 + 2, 0x07), 4); - - for (int i = 0; i < 16; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -#endif - -#endif diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh deleted file mode 100644 index 5fb070d06..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_4.cuh +++ /dev/null @@ -1,227 +0,0 @@ -#ifndef _qdq_4_cuh -#define _qdq_4_cuh - -#include "qdq_util.cuh" -#include 
"../../config.h" - -#if QMODE_4BIT == 1 - -// Permutation: -// -// 77775555 33331111 66664444 22220000 - -__forceinline__ __device__ void shuffle_4bit_8 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0]; - uint32_t qb = 0; - - #pragma unroll - for (int i = 0; i < 4; i++) - { - uint32_t qa0 = qa & 0x0f; - uint32_t qa1 = (qa & 0xf0) >> 4; - qa >>= 8; - qb |= (qa1 << (i * 4 + 16)); - qb |= (qa0 << (i * 4)); - } - q[0] = qb; -} - -__forceinline__ __device__ void dequant_4bit_8 -( - const uint32_t q_0, - half2 (&dq)[4], - int stride -) -{ - const uint32_t c0 = 0x64006400; - const half y16_ = __float2half_rn(1.0f / 16.0f); - const half2 y16 = __halves2half2(y16_, y16_); - const half z1_ = __float2half_rn(-1024.0f - 8.0f); - const half z16_ = __float2half_rn(-1024.0f / 16.0f - 8.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z16 = __halves2half2(z16_, z16_); - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1((qa & 0x00f000f0) | c0); // half2(q[ 2], q[ 3]) * 16 + 1024 - qa >>= 8; - half2_uint32 q2((qa & 0x000f000f) | c0); // half2(q[ 4], q[ 5]) + 1024 - half2_uint32 q3((qa & 0x00f000f0) | c0); // half2(q[ 6], q[ 7]) * 16 + 1024 - - dq[0] = __hadd2(q0.as_half2, z1); - dq[1] = __hfma2(q1.as_half2, y16, z16); - dq[2] = __hadd2(q2.as_half2, z1); - dq[3] = __hfma2(q3.as_half2, y16, z16); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale -( - const uint32_t zero, - const half scale, - half2 (&z1z16)[2], - half2 (&y1y16)[2] -) -{ - half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); - half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - - half2 scale2 = __half2half2(scale); - - z1z16[0] = __hmul2(scale2, __half2half2(z1.as_half)); - z1z16[1] = __hmul2(scale2, __half2half2(z16)); - - const half y1 = __float2half_rn(1.0f); - const half y16 = __float2half_rn(1.0f / 16.0f); - - y1y16[0] = __hmul2(scale2, __half2half2(y1)); - y1y16[1] = __hmul2(scale2, __half2half2(y16)); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero -( - const uint32_t zero, - half2(&z1z16)[2], - half2(&y1y16)[2] -) -{ - half_uint16 z1(0xe400 | zero); // half(-1024.0f - zero); - half z16 = __hsub(__int2half_rn(-64), __int2half_rn(zero)); - - z1z16[0] = __half2half2(z1.as_half); - z1z16[1] = __half2half2(z16); - - const half y1 = __float2half_rn(1.0f); - const half y16 = __float2half_rn(1.0f / 16.0f); - - y1y16[0] = __half2half2(y1); - y1y16[1] = __half2half2(y16); -} - - -__forceinline__ __device__ void dequant_4bit_8_gptq -( - const uint32_t q_0, - half2 (&dq)[4], - half2 (&z1z16)[2], - half2 (&y1y16)[2], - int stride, - bool scaled -) -{ - const uint32_t c0 = 0x64006400; - - uint32_t qa = q_0; - half2_uint32 q0((qa & 0x000f000f) | c0); // half2( q[0] + 1024, q[1] + 1024 ) - half2_uint32 q1((qa & 0x00f000f0) | c0); // half2( q[2] * 16 + 1024, q[3] * 16 + 1024 ) - qa >>= 8; - half2_uint32 q2((qa & 0x000f000f) | c0); // half2( q[4] + 1024, q[5] + 1024 ) - half2_uint32 q3((qa & 0x00f000f0) | c0); // half2( q[6] * 16 + 1024, q[7] * 16 + 1024 ) - - if (scaled) - { - dq[0] = __hfma2(q0.as_half2, y1y16[0], z1z16[0]); // half2( q[0] * s - z * s, q[1] * s - z * s) - dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]); // half2( q[2] * s - z * s, q[3] * s - z * s) - dq[2] = __hfma2(q2.as_half2, y1y16[0], z1z16[0]); - dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); - } - else - { - dq[0] = __hadd2(q0.as_half2, z1z16[0]); // half2( q[0] - z, q[1] - z ) - dq[1] = __hfma2(q1.as_half2, y1y16[1], z1z16[1]); // half2( q[2] 
- z, q[3] - z ) - dq[2] = __hadd2(q2.as_half2, z1z16[0]); // half2( q[4] - z, q[5] - z ) - dq[3] = __hfma2(q3.as_half2, y1y16[1], z1z16[1]); // half2( q[6] - z, q[7] - z ) - } -} - -#else - -__forceinline__ __device__ void shuffle_4bit_8 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_4bit_8 -( - const uint32_t q_0, - half2 (&dq)[4], - int stride -) -{ - half dqh[8]; - for (int i = 0; i < 8; i++) dqh[i] = dq_ns(exb(q_0, i * 4, 0x0f), 8); - - for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale -( - const uint32_t zero, - const half scale, - half2 (&z1)[2], - half2 (&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z = __hmul(z, scale); - z1[0] = __half2half2(z); - y1[0] = __half2half2(scale); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero -( - const uint32_t zero, - half2(&z1)[2], - half2(&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z1[0] = __half2half2(z); -} - -__forceinline__ __device__ void dequant_4bit_8_gptq -( - const uint32_t q_0, - half2 (&dq)[4], - half2 (&z1)[2], - half2 (&y1)[2], - int stride, - bool scaled -) -{ - half2 dqh2[8]; - - uint32_t qa = q_0; - for (int i = 0; i < 4; i++) - { - half d0 = __int2half_rn(qa & 0x0f); qa >>= 4; - half d1 = __int2half_rn(qa & 0x0f); qa >>= 4; - dqh2[i] = __halves2half2(d0, d1); - } - - if (scaled) - { - dq[0] = __hfma2(dqh2[0], y1[0], z1[0]); - dq[1] = __hfma2(dqh2[1], y1[0], z1[0]); - dq[2] = __hfma2(dqh2[2], y1[0], z1[0]); - dq[3] = __hfma2(dqh2[3], y1[0], z1[0]); - } - else - { - dq[0] = __hadd2(dqh2[0], z1[0]); - dq[1] = __hadd2(dqh2[1], z1[0]); - dq[2] = __hadd2(dqh2[2], z1[0]); - dq[3] = __hadd2(dqh2[3], z1[0]); - } -} - -#endif - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh deleted file mode 100644 index 454e4b93b..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_5.cuh +++ /dev/null @@ -1,207 +0,0 @@ -#ifndef _qdq_5_cuh -#define _qdq_5_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_5BIT == 1 - -// Permutation: -// -// v5555533 33311111 u4444422 22200000 (u, v lsb) -// vbbbbb99 99977777 uaaaaa88 88866666 -// vhhhhhff fffddddd ugggggee eeeccccc -// vnnnnnll llljjjjj ummmmmkk kkkiiiii -// vtttttrr rrrppppp usssssqq qqqooooo - -__forceinline__ __device__ void shuffle_5bit_32 -( - uint32_t* q, - int stride -) -{ - uint32_t qa = q[0 * stride]; - uint32_t qb = q[1 * stride]; - uint32_t qc = q[2 * stride]; - uint32_t qd = q[3 * stride]; - uint32_t qe = q[4 * stride]; - - // qa: 66555554 44443333 32222211 11100000 - // qb: ccccbbbb baaaaa99 99988888 77777666 - // qc: jiiiiihh hhhggggg fffffeee eedddddc - // qd: pppooooo nnnnnmmm mmlllllk kkkkjjjj - // qe: vvvvvuuu uuttttts ssssrrrr rqqqqqpp - - uint32_t qf = qe >> 22; - qe <<= 8; - qe |= qd >> 24; - qd <<= 6; - qd |= qc >> 26; - qc <<= 4; - qc |= qb >> 28; - qb <<= 2; - qb |= qa >> 30; - - // qa: 555554 44443333 32222211 11100000 - // qb: bbbbba aaaa9999 98888877 77766666 - // qc: hhhhhg ggggffff feeeeedd dddccccc - // qd: nnnnnm mmmmllll lkkkkkjj jjjiiiii - // qe: ttttts ssssrrrr rqqqqqpp pppooooo - // qf: vv vvvuuuuu - - uint32_t za = 0; - uint32_t zb = 0; - uint32_t zc = 0; - uint32_t zd = 0; - uint32_t ze = 0; - - for (int i = 0; i < 3; i++) { uint32_t t0 = qa & 0x1f; uint32_t t1 = (qa & 0x3e0) >> 5; qa >>= 10; za |= (t0 << (i * 5)); za |= (t1 << 
(i * 5 + 16)); } - for (int i = 0; i < 3; i++) { uint32_t t0 = qb & 0x1f; uint32_t t1 = (qb & 0x3e0) >> 5; qb >>= 10; zb |= (t0 << (i * 5)); zb |= (t1 << (i * 5 + 16)); } - for (int i = 0; i < 3; i++) { uint32_t t0 = qc & 0x1f; uint32_t t1 = (qc & 0x3e0) >> 5; qc >>= 10; zc |= (t0 << (i * 5)); zc |= (t1 << (i * 5 + 16)); } - for (int i = 0; i < 3; i++) { uint32_t t0 = qd & 0x1f; uint32_t t1 = (qd & 0x3e0) >> 5; qd >>= 10; zd |= (t0 << (i * 5)); zd |= (t1 << (i * 5 + 16)); } - for (int i = 0; i < 3; i++) { uint32_t t0 = qe & 0x1f; uint32_t t1 = (qe & 0x3e0) >> 5; qe >>= 10; ze |= (t0 << (i * 5)); ze |= (t1 << (i * 5 + 16)); } - - // za: 5555533 33311111 4444422 22200000 - // zb: bbbbb99 99977777 aaaaa88 88866666 - // zc: hhhhhff fffddddd gggggee eeeccccc - // zd: nnnnnll llljjjjj mmmmmkk kkkiiiii - // ze: tttttrr rrrppppp sssssqq qqqooooo - // qf: vv vvvuuuuu - - za |= ((qf & 0x001) >> 0) << 15; - zb |= ((qf & 0x002) >> 1) << 15; - zc |= ((qf & 0x004) >> 2) << 15; - zd |= ((qf & 0x008) >> 3) << 15; - ze |= ((qf & 0x010) >> 4) << 15; - za |= ((qf & 0x020) >> 5) << 31; - zb |= ((qf & 0x040) >> 6) << 31; - zc |= ((qf & 0x080) >> 7) << 31; - zd |= ((qf & 0x100) >> 8) << 31; - ze |= ((qf & 0x200) >> 9) << 31; - - // za: v5555533 33311111 u4444422 22200000 (u, v lsb) - // zb: vbbbbb99 99977777 uaaaaa88 88866666 - // zc: vhhhhhff fffddddd ugggggee eeeccccc - // zd: vnnnnnll llljjjjj ummmmmkk kkkiiiii - // ze: vtttttrr rrrppppp usssssqq qqqooooo - - q[0 * stride] = za; - q[1 * stride] = zb; - q[2 * stride] = zc; - q[3 * stride] = zd; - q[4 * stride] = ze; -} - -__forceinline__ __device__ void dequant_5bit_32 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - const uint32_t q_3, - const uint32_t q_4, - half2 (&dq)[16], - int stride -) -{ - const uint32_t c0 = 0x64006400; - const half y32_ = __float2half_rn(1.0f / 32.0f); - const half2 y32 = __halves2half2(y32_, y32_); - const half z1_ = __float2half_rn(-1024.0f - 16.0f); - const half z32_ = __float2half_rn(-1024.0f / 32.0f - 16.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z32 = __halves2half2(z32_, z32_); - - uint32_t qa = q_0; - uint32_t qb = q_1; - uint32_t qc = q_2; - uint32_t qd = q_3; - uint32_t qe = q_4; - - half2_uint32 q0 ((qa & 0x001f001f) | c0); // half2(q[ 0], q[ 1]) + 1024 - half2_uint32 q1 ((qa & 0x03e003e0) | c0); // half2(q[ 2], q[ 3]) * 32 + 1024 - qa >>= 10; - half2_uint32 q2 ((qa & 0x001f001f) | c0); // half2(q[ 4], q[ 5]) + 1024 - qa >>= 5; - qa &= 0x00010001; - half2_uint32 q3 ((qb & 0x001f001f) | c0); // half2(q[ 6], q[ 7]) + 1024 - half2_uint32 q4 ((qb & 0x03e003e0) | c0); // half2(q[ 8], q[ 9]) * 32 + 1024 - qb >>= 10; - half2_uint32 q5 ((qb & 0x001f001f) | c0); // half2(q[10], q[11]) + 1024 - qb >>= 4; - qb &= 0x00020002; - half2_uint32 q6 ((qc & 0x001f001f) | c0); // half2(q[12], q[13]) + 1024 - half2_uint32 q7 ((qc & 0x03e003e0) | c0); // half2(q[14], q[15]) * 32 + 1024 - qc >>= 10; - half2_uint32 q8 ((qc & 0x001f001f) | c0); // half2(q[16], q[17]) + 1024 - qc >>= 3; - qc &= 0x00040004; - half2_uint32 q9 ((qd & 0x001f001f) | c0); // half2(q[18], q[19]) + 1024 - half2_uint32 q10((qd & 0x03e003e0) | c0); // half2(q[20], q[21]) * 32 + 1024 - qd >>= 10; - half2_uint32 q11((qd & 0x001f001f) | c0); // half2(q[22], q[23]) + 1024 - qd >>= 2; - qd &= 0x00080008; - half2_uint32 q12((qe & 0x001f001f) | c0); // half2(q[24], q[25]) + 1024 - half2_uint32 q13((qe & 0x03e003e0) | c0); // half2(q[26], q[27]) * 32 + 1024 - qe >>= 10; - half2_uint32 q14((qe & 0x001f001f) | c0); // half2(q[28], q[29]) + 
1024 - qe >>= 1; - qe &= 0x00100010; - half2_uint32 q15((qa | qb | qc | qd | qe) | c0); - - dq[ 0] = __hadd2( q0.as_half2, z1); - dq[ 1] = __hfma2( q1.as_half2, y32, z32); - dq[ 2] = __hadd2( q2.as_half2, z1); - dq[ 3] = __hadd2( q3.as_half2, z1); - dq[ 4] = __hfma2( q4.as_half2, y32, z32); - dq[ 5] = __hadd2( q5.as_half2, z1); - dq[ 6] = __hadd2( q6.as_half2, z1); - dq[ 7] = __hfma2( q7.as_half2, y32, z32); - dq[ 8] = __hadd2( q8.as_half2, z1); - dq[ 9] = __hadd2( q9.as_half2, z1); - dq[10] = __hfma2(q10.as_half2, y32, z32); - dq[11] = __hadd2(q11.as_half2, z1); - dq[12] = __hadd2(q12.as_half2, z1); - dq[13] = __hfma2(q13.as_half2, y32, z32); - dq[14] = __hadd2(q14.as_half2, z1); - dq[15] = __hadd2(q15.as_half2, z1); -} - -#else - -__forceinline__ __device__ void shuffle_5bit_32 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_5bit_32 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - const uint32_t q_3, - const uint32_t q_4, - half2 (&dq)[16], - int stride -) -{ - half dqh[32]; - for (int i = 0; i < 6; i++) dqh[ i] = dq_ns(exb( q_0, i * 5 , 0x1f), 16); - dqh[ 6 ] = dq_ns(exb(q_1, q_0, 30, 0x1f), 16); - for (int i = 0; i < 5; i++) dqh[ 7 + i] = dq_ns(exb( q_1, i * 5 + 3, 0x1f), 16); - dqh[12 ] = dq_ns(exb(q_2, q_1, 28, 0x1f), 16); - for (int i = 0; i < 6; i++) dqh[13 + i] = dq_ns(exb( q_2, i * 5 + 1, 0x1f), 16); - dqh[19 ] = dq_ns(exb(q_3, q_2, 31, 0x1f), 16); - for (int i = 0; i < 5; i++) dqh[20 + i] = dq_ns(exb( q_3, i * 5 + 4, 0x1f), 16); - dqh[25 ] = dq_ns(exb(q_4, q_3, 29, 0x1f), 16); - for (int i = 0; i < 6; i++) dqh[26 + i] = dq_ns(exb( q_4, i * 5 + 2, 0x1f), 16); - - for (int i = 0; i < 16; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -#endif - -#endif \ No newline at end of file diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh deleted file mode 100644 index c2eb8cfbf..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_6.cuh +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _qdq_6_cuh -#define _qdq_6_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_6BIT == 1 - - // Not implemented - -#else - -__forceinline__ __device__ void shuffle_6bit_16 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_6bit_16 -( - const uint32_t q_0, - const uint32_t q_1, - const uint32_t q_2, - half2 (&dq)[8], - int stride -) -{ - half dqh[16]; - for (int i = 0; i < 5; i++) dqh[ i] = dq_ns(exb( q_0, i * 6 , 0x3f), 32); - dqh[ 5 ] = dq_ns(exb(q_1, q_0, 30, 0x3f), 32); - for (int i = 0; i < 4; i++) dqh[ 6 + i] = dq_ns(exb( q_1, i * 6 + 4, 0x3f), 32); - dqh[10 ] = dq_ns(exb(q_2, q_1, 28, 0x3f), 32); - for (int i = 0; i < 5; i++) dqh[11 + i] = dq_ns(exb( q_2, i * 6 + 2, 0x3f), 32); - - for (int i = 0; i < 8; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -#endif - -#endif - - diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh deleted file mode 100644 index e2409efac..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_8.cuh +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _qdq_8_cuh -#define _qdq_8_cuh - -#include "qdq_util.cuh" -#include "../../config.h" - -#if QMODE_8BIT == 1 - - // Not implemented - -#else - -__forceinline__ __device__ void shuffle_8bit_4 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_8bit_8 -( - const uint32_t 
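The unshuffled fallback decoders (`QMODE_* == 0`) in these headers extract fields with `exb` from `qdq_util.cuh` below; its two-word overload pulls a field that straddles a 32-bit boundary with a single funnel shift. In effect:

    #include <cstdint>

    // __funnelshift_rc(lo, hi, s) yields the low 32 bits of (((uint64_t)hi << 32) | lo) >> s
    // (shift clamped to 32), so one instruction grabs a field starting at bit s of lo
    // that spills over into hi.
    __device__ int field_across_words(uint32_t hi, uint32_t lo, int s, int mask)
    {
        return (int)(__funnelshift_rc(lo, hi, s) & mask);
    }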
-    const uint32_t q_1,
-    half2 (&dq)[4],
-    int stride
-)
-{
-    half dqh[8];
-    for (int i = 0; i < 4; i++) dqh[i    ] = dq_ns(exb(q_0, i * 8, 0xff), 128);
-    for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), 128);
-
-    for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]);
-}
-
-#endif
-
-#endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh
deleted file mode 100644
index 71657191b..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quant/qdq_util.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef _qdq_util_cuh
-#define _qdq_util_cuh
-
-union half2_uint32
-{
-    uint32_t as_uint32;
-    half2 as_half2;
-    __device__ half2_uint32(uint32_t val) : as_uint32(val) {}
-    __device__ half2_uint32(half2 val) : as_half2(val) {}
-};
-
-union half_uint16
-{
-    uint16_t as_uint16;
-    half as_half;
-    __device__ half_uint16(uint16_t val) : as_uint16(val) {}
-    __device__ half_uint16(half val) : as_half(val) {}
-};
-
-// Max_scale premultiplied by 1/256
-
-__forceinline__ __device__ half dq_scale(const int qs, const half max_scale)
-{
-    int qs_i = qs + 1;
-    half qs_h = __int2half_rn(qs_i * qs_i);
-    qs_h = __hmul(qs_h, max_scale);
-    return qs_h;
-}
-
-__forceinline__ __device__ half dq(const int q, const int qzero, const half scale)
-{
-    return __hmul(__int2half_rn(q - qzero), scale);
-}
-
-__forceinline__ __device__ half dq_ns(const int q, const int qzero)
-{
-    //return __hsub(__int2half_rn(q), __int2half_rn(qzero));
-    return __int2half_rn(q - qzero);
-}
-
-__forceinline__ __device__ int exb(const uint32_t q, const int shift, const int mask)
-{
-    return (int)((q >> shift) & mask);
-}
-
-__forceinline__ __device__ int exb(const uint32_t q1, const uint32_t q0, const int shift, const int mask)
-{
-    return (int)(__funnelshift_rc(q0, q1, shift) & mask);
-}
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cu
deleted file mode 100644
index 0e009c6f1..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cu
+++ /dev/null
@@ -1,256 +0,0 @@
-#include "quantize.cuh"
-#include "util.cuh"
-#include <curand_kernel.h>
-#include "compat.cuh"
-
-#define BLOCKSIZE_X 32
-#define BLOCKSIZE_Y 32
-
-__global__ void quantize_kernel
-(
-    const float* __restrict__ input,
-    float* __restrict__ output,
-    const float* __restrict__ scale,
-    uint16_t* __restrict__ out_q,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq
-)
-{
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    if (column >= columns) return;
-    if (row >= rows) return;
-
-    // Quantize
-
-    float x = input[row * columns + column];
-    float s = scale[column];
-    x /= s;
-    x = rintf(x);
-    x += qzero;
-    x = clamp(x, 0.0f, maxq);
-
-    // Optionally save quant
-
-    if (out_q)
-    {
-        uint16_t q = static_cast<uint16_t>(x);
-        out_q[row * columns + column] = q;
-    }
-
-    half h_s = __float2half_rn(s);
-    half h_x = __float2half_rn(x);
-    half h_qzero = __float2half_rn(qzero);
-
-    h_x = __hsub(h_x, h_qzero);
-    h_x = __hmul(h_x, h_s);
-
-    // Dequantize
-
-//     x -= qzero;
-//     x *= s;
-    output[row * columns + column] = __half2float(h_x);
-}
-
-void quantize_cuda
-(
-    const float* input,
-    float* output,
-    const float* scale,
-    uint16_t* out_q,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq
-)
-{
-    dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y);
-    dim3 blocks(DIVIDE(columns, BLOCKSIZE_X), DIVIDE(rows, BLOCKSIZE_Y));
-
-//     DBGI2(rows, columns);
-//     DBGF2(qzero, maxq);
-
-    quantize_kernel<<<blocks, threads>>>
-    (
-        input,
-        output,
-        scale,
-        out_q,
-        rows,
-        columns,
-        qzero,
-        maxq
-    );
-}
-
-__global__ void quantize_err_kernel
-(
-    const float* __restrict__ input,
-    float* __restrict__ output,
-    const float* __restrict__ scale,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq,
-    float err_norm,
-    float min_p,
-    float max_p,
-    int p_grid
-)
-{
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    if (column >= columns) return;
-    if (row >= rows) return;
-
-    float w = input[row * columns + column];
-
-    // Quantize
-
-    for (int i = 0; i <= p_grid; i++)
-    {
-        float pi = __int2float_rn(i) / __int2float_rn(p_grid);
-        float p = min_p * (1.0f - pi) + max_p * pi;
-
-        float x = w;
-        float s = scale[column] * p;
-        x /= s;
-        x = rintf(x);
-        x += qzero;
-        x = clamp(x, 0.0f, maxq);
-
-        // Dequantize
-
-        x -= qzero;
-        x *= s;
-
-        // Quantization error
-
-        x = __powf(fabsf(x - w), err_norm);
-        atomicAdd(&output[i * 128 + column % 128], x);
-    }
-}
-
-void quantize_err_cuda
-(
-    const float* input,
-    float* output,
-    const float* scale,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq,
-    float err_norm,
-    float min_p,
-    float max_p,
-    int p_grid
-)
-{
-    dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y);
-    dim3 blocks(DIVIDE(columns, BLOCKSIZE_X), DIVIDE(rows, BLOCKSIZE_Y));
-
-//     DBGI2(rows, columns);
-//     DBGF2(qzero, maxq);
-
-    quantize_err_kernel<<<blocks, threads>>>
-    (
-        input,
-        output,
-        scale,
-        rows,
-        columns,
-        qzero,
-        maxq,
-        err_norm,
-        min_p,
-        max_p,
-        p_grid
-    );
-}
-
-__global__ void adjust_error_row_kernel
-(
-    const float* __restrict__ hessian_inv,
-    float* __restrict__ error,
-    const float* __restrict__ weights,
-    const float* __restrict__ quant,
-    int c,
-    int columns,
-    int hcolumns
-)
-{
-    int column = blockIdx.x * blockDim.x + threadIdx.x;
-    if (column >= columns) return;
-
-    float d = hessian_inv[c * hcolumns + c];
-
-    int idx = c * columns + column;
-    float w = weights[idx];
-    float q = quant[idx];
-    error[idx] = (w - q) / d;
-}
-
-void adjust_error_row_cuda
-(
-    const float* hessian_inv,
-    float* error,
-    const float* weights,
-    const float* quant,
-    int c,
-    int columns,
-    int hcolumns
-)
-{
-    dim3 threads(BLOCKSIZE_X, 1);
-    dim3 blocks(DIVIDE(columns, BLOCKSIZE_X), 1);
-
-    adjust_error_row_kernel<<<blocks, threads>>>(hessian_inv, error, weights, quant, c, columns, hcolumns);
-}
-
-
-// Compute z = z - x.T @ y
-
-__global__ void vv_mul_sub_kernel
-(
-    const float* __restrict__ x,
-    const float* __restrict__ y,
-    float* __restrict__ z,
-    int x_size,
-    int y_size
-)
-{
-    int y_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int x_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (y_idx >= y_size) return;
-    if (x_idx >= x_size) return;
-    int z_idx = y_size * x_idx + y_idx;
-
-    float p = x[x_idx] * y[y_idx];
-
-//     curandState state;
-//     int tid = blockIdx.x * blockDim.x + threadIdx.x;
-//     curand_init(1234, tid, clock64(), &state);
-//     float r = curand_uniform(&state);
-//     p *= r;
-
-//     p *- 0.707106478;
-
-    z[z_idx] -= p;
-}
-
-void vv_mul_sub_cuda
-(
-    const float* x,
-    const float* y,
-    float* z,
-    int x_size,
-    int y_size
-)
-{
-    dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y);
-    dim3 blocks(DIVIDE(y_size, BLOCKSIZE_X), DIVIDE(x_size, BLOCKSIZE_Y));
-
-    vv_mul_sub_kernel<<<blocks, threads>>>(x, y, z, x_size, y_size);
-}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cuh
deleted file mode 100644
index 70ef665e6..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/quantize.cuh
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef _quantize_cuh
-#define _quantize_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-
-void quantize_cuda
-(
-    const float* input,
-    float* output,
-    const float* scale,
-    uint16_t* out_q,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq
-);
-
-void quantize_err_cuda
-(
-    const float* input,
-    float* output,
-    const float* scale,
-    int rows,
-    int columns,
-    float qzero,
-    float maxq,
-    float err_norm,
-    float min_p,
-    float max_p,
-    int p_grid
-);
-
-void adjust_error_row_cuda
-(
-    const float* hessian_inv,
-    float* error,
-    const float* weights,
-    const float* quant,
-    int c,
-    int columns,
-    int hcolumns
-);
-
-void vv_mul_sub_cuda
-(
-    const float* x,
-    const float* y,
-    float* z,
-    int x_size,
-    int y_size
-);
-
-#endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cu
deleted file mode 100644
index 29bbf2c39..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-#include "rms_norm.cuh"
-#include "util.cuh"
-
-#if defined(USE_ROCM)
-#define __shfl_xor_sync(mask, var, laneMask) __shfl_xor(var, laneMask)
-#endif
-
-// y = x * w / sqrt(row_mean(x * x) + epsilon)
-
-#define NUM_WARPS 32
-#define WARP_SIZE 32
-#define BLOCK_SIZE WARP_SIZE
-#define NUM_THREADS (NUM_WARPS * WARP_SIZE)
-
-typedef void (*fp_rms_norm_kernel)
-(
-    const half*,
-    const half*,
-    half*,
-    const float,
-    const float,
-    const int,
-    const int
-);
-
-template <int blocks_per_warp>
-__global__ void rms_norm_kernel
-(
-    const half* __restrict__ x,
-    const half* __restrict__ w,
-    half* __restrict__ y,
-    const float epsilon,
-    const float r_dim,
-    const int rows,
-    const int dim
-)
-{
-    int warp_id = threadIdx.x / WARP_SIZE;
-    int lane_id = threadIdx.x % WARP_SIZE;
-    int row = blockIdx.x;
-    const half* x_row = x + row * dim;
-    half* y_row = y + row * dim;
-
-    //int blocks_per_warp = DIVIDE(dim, NUM_THREADS);
-
-    // Compute sum of squares for each block
-
-    float sum = 0.0f;
-    float itemf[blocks_per_warp];
-
-    #pragma unroll
-    for (int i = 0; i < blocks_per_warp; i++)
-    {
-        int column = warp_id * WARP_SIZE + lane_id + NUM_THREADS * i;
-        if (column >= dim) break;
-
-        float f = __half2float(x_row[column]);
-        f = fmaxf(-65504.0f, fminf(f, 65504.0f));
-        itemf[i] = f;
-        sum = fma(f, f, sum);
-    }
-
-    // Shuffle to sum across lanes
-
-    __shared__ float sums[NUM_WARPS];
-
-    for(int offset = warpSize / 2; offset > 0; offset /= 2) sum += __shfl_xor_sync(0xffffffff, sum, offset);
-    if (lane_id == 0) sums[warp_id] = sum;
-    __syncthreads();
-
-    // Load partial sums from across warps, shuffle again across lanes
-
-    sum = sums[lane_id];
-    for(int offset = warpSize / 2; offset > 0; offset /= 2) sum += __shfl_xor_sync(0xffffffff, sum, offset);
-
-    // Get norm
-
-    float rmf = rsqrtf(sum * r_dim + epsilon);
-
-    // Normalize x, scaling by w
-
-    #pragma unroll 4
-    for (int i = 0; i < blocks_per_warp; i++)
-    {
-        int column = warp_id * WARP_SIZE + lane_id + NUM_THREADS * i;
-        if (column >= dim) return;
-
-        float x_itemf = itemf[i];
-        float w_itemf = __half2float(w[column]);
-        float n = x_itemf * w_itemf * rmf;
-        y_row[column] = __float2half_rn(n);
-    }
-}
-
-fp_rms_norm_kernel pick_rms_norm_kernel(const int blocks_per_warp)
-{
-    if (blocks_per_warp == 1) return rms_norm_kernel<1>;
-    if (blocks_per_warp == 2) return rms_norm_kernel<2>;
-    if (blocks_per_warp == 3) return rms_norm_kernel<3>;
-    if (blocks_per_warp == 4) return rms_norm_kernel<4>;
-    if (blocks_per_warp == 5) return rms_norm_kernel<5>;
-    if (blocks_per_warp == 6) return rms_norm_kernel<6>;
-    if (blocks_per_warp == 7) return rms_norm_kernel<7>;
-    if (blocks_per_warp == 8) return rms_norm_kernel<8>;
-    return NULL;
-}
-
-
-void rms_norm_cuda
-(
-    const half* x,
-    const half* w,
-    half* y,
-    const float epsilon,
-    const int rows,
-    const int dim
-)
-{
-    dim3 blockDim, gridDim;
-    blockDim.x = NUM_THREADS;
-    blockDim.y = 1;
-    gridDim.x = rows;
-    gridDim.y = 1;
-
-    float r_dim = 1.0f / (float) dim;
-
-    int blocks_per_warp = DIVIDE(dim, NUM_THREADS);
-    fp_rms_norm_kernel kernel = pick_rms_norm_kernel(blocks_per_warp);
-    kernel<<<gridDim, blockDim>>>(x, w, y, epsilon, r_dim, rows, dim);
-}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cuh
deleted file mode 100644
index 4cb0fea97..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/rms_norm.cuh
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef _rms_norm_cuh
-#define _rms_norm_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-
-void rms_norm_cuda
-(
-    const half* x,
-    const half* w,
-    half* y,
-    const float epsilon,
-    const int rows,
-    const int dim
-);
-
-#endif
\ No newline at end of file
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cu b/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cu
deleted file mode 100644
index 7b1cf2b6f..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cu
+++ /dev/null
@@ -1,132 +0,0 @@
-#include "rope.cuh"
-#include "util.cuh"
-#include "matrix_view.cuh"
-
-const int THREADS_X = 32;
-const int THREADS_Y = 4;
-const int MAX_POS_EMBEDDINGS = 32768;   // Actual number doesn't matter
-const int MAX_ROWS = 32768;             // Actual number doesn't matter
-
-typedef void (*fp_rope_cuda_kernel)
-(
-    half*,
-    const half*,
-    const half*,
-    int,
-    int,
-    int,
-    int,
-    const uint32_t*,
-    int
-);
-
-template <bool use_half2>
-__global__ void rope_cuda_kernel
-(
-    half* __restrict__ x,
-    const half* __restrict__ sin,
-    const half* __restrict__ cos,
-    int rows_per_batch,
-    int head_dim,
-    int num_heads,
-    int past_len,
-    const uint32_t* __restrict__ past_lens,
-    int threads_y
-)
-{
-    MatrixView_half_rw x_(x, MAX_ROWS, head_dim);
-    MatrixView_half sin_(sin, MAX_POS_EMBEDDINGS, head_dim);
-    MatrixView_half cos_(cos, MAX_POS_EMBEDDINGS, head_dim);
-
-    int column = (blockIdx.x * THREADS_X + threadIdx.x); if constexpr (use_half2) column *= 2;
-    int half_dim = head_dim / 2;
-    if (column >= half_dim) return;
-
-    int row = blockIdx.y * threads_y + threadIdx.y;
-    if (row >= rows_per_batch) return;
-    int batch_offset = blockIdx.z * rows_per_batch;
-    int row_offset = batch_offset + row;
-
-    // Get sin and cos
-
-    if (past_len == -1) past_len = past_lens[blockIdx.z];
-    int sincos_row = past_len + row / num_heads;
-
-    if constexpr (use_half2)
-    {
-        half2 cos2_l = cos_.item_half2(sincos_row, column);
-        half2 cos2_r = cos_.item_half2(sincos_row, column + half_dim);
-        half2 sin2_l = sin_.item_half2(sincos_row, column);
-        half2 sin2_r = sin_.item_half2(sincos_row, column + half_dim);
-        sin2_l = __hneg2(sin2_l);
-
-        // Apply embedding to row
-
-        half2 item2_l = x_.item_half2(row_offset, column);
-        half2 item2_r = x_.item_half2(row_offset, column + half_dim);
-        half2 item2_ls = __hmul2(item2_r, sin2_l);
-        half2 item2_rs = __hmul2(item2_l, sin2_r);
-        item2_l = __hfma2(item2_l, cos2_l, item2_ls);
-        item2_r = __hfma2(item2_r, cos2_r, item2_rs);
-        x_.set_half2(row_offset, column, item2_l);
-        x_.set_half2(row_offset, column + half_dim, item2_r);
-    }
-    else
-    {
-        half cos_l = cos_.item(sincos_row, column);
-        half cos_r = cos_.item(sincos_row, column + half_dim);
-        half sin_l = sin_.item(sincos_row, column);
-        half sin_r = sin_.item(sincos_row, column + half_dim);
-        sin_l = __hneg(sin_l);
-
-        // Apply embedding to row
-
-        half item_l = x_.item(row_offset, column);
-        half item_r = x_.item(row_offset, column + half_dim);
-        half item_ls = __hmul(item_r, sin_l);
-        half item_rs = __hmul(item_l, sin_r);
-        item_l = __hfma(item_l, cos_l, item_ls);
-        item_r = __hfma(item_r, cos_r, item_rs);
-        x_.set(row_offset, column, item_l);
-        x_.set(row_offset, column + half_dim, item_r);
-    }
-}
-
-fp_rope_cuda_kernel pick_rope_cuda_kernel(bool use_half2)
-{
-    if (use_half2) return rope_cuda_kernel<true>;
-    else return rope_cuda_kernel<false>;
-};
-
-void rope_cuda
-(
-    half* x,
-    const half* sin,
-    const half* cos,
-    const int batch_size,
-    const int rows_per_batch,
-    const int head_dim,
-    const int num_heads,
-    const int past_len,
-    const uint32_t* past_lens
-)
-{
-    bool use_half2 = true;
-
-    // For large batch sizes we risk exceeding grid dimension of 65535, so shift to block dimension instead
-
-    int threads_y = THREADS_Y;
-    while (DIVIDE(rows_per_batch, threads_y) > 65535) threads_y *= 2;
-
-    dim3 blockDim, gridDim;
-    blockDim.x = THREADS_X;
-    blockDim.y = threads_y;
-    gridDim.x = DIVIDE(head_dim, THREADS_X) / (use_half2 ? 2 : 1);
-    gridDim.y = DIVIDE(rows_per_batch, threads_y);
-    gridDim.z = batch_size;
-
-    fp_rope_cuda_kernel kernel = pick_rope_cuda_kernel(use_half2);
-    kernel<<<gridDim, blockDim>>>(x, sin, cos, rows_per_batch, head_dim, num_heads, past_len, past_lens, threads_y);
-
-    cuda_check( cudaPeekAtLastError() );
-}
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cuh
deleted file mode 100644
index 9fcf7a933..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/rope.cuh
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _rope_cuh
-#define _rope_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-
-void rope_cuda
-(
-    half* x,
-    const half* sin,
-    const half* cos,
-    const int batch_size,
-    const int rows_per_batch,
-    const int head_dim,
-    const int num_heads,
-    const int past_len,
-    const uint32_t* past_lens
-);
-
-#endif
diff --git a/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh b/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh
deleted file mode 100644
index 06a58d184..000000000
--- a/server/exllamav2_kernels/exllamav2_kernels/cuda/util.cuh
+++ /dev/null
@@ -1,42 +0,0 @@
-
-#define DIVIDE(x, size) (((x) + (size) - 1) / (size))
-
-#define DBGS(__x) printf("%s\n", __x)
-#define DBGI(__x) printf("%s: %i\n", #__x, __x)
-#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y)
-#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z)
-#define DBGX(__x) printf("%s: %x\n", #__x, __x)
-#define DBGX2(__x, __y) printf("%s, %s: %x, %x\n", #__x, #__y, __x, __y)
-#define DBGX3(__x, __y, __z) printf("%s, %s, %s: %x, %x, %x\n", #__x, #__y, #__z, __x, __y, __z)
-#define DBGF(__x) printf("%s: %f\n", #__x, __x)
-#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y)
-#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z)
-#define DBGH(__x) printf("%s: %f\n", #__x, __half2float(__x))
-#define DBGH2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __half2float(__x), __half2float(__y))
-#define DBGH3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z))
printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z)) - -#define DBGIH(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __half2float(__y)) -#define DBGIH2(__x, __y, __z) printf("%s, %s, %s: %i, %f, %f\n", #__x, #__y, #__z, __x, __half2float(__y), __half2float(__z)) - -__forceinline__ __device__ half dq_scale_(const int qs, const half max_scale) -{ - half qs_h = __hmul(__int2half_rn(qs + 1), __float2half_rn(1.0f / 16.0f)); - qs_h = __hmul(qs_h, qs_h); - qs_h = __hmul(qs_h, max_scale); - return qs_h; -} - -__forceinline__ __device__ float clamp(float x, float a, float b) -{ - return fmaxf(a, fminf(b, x)); -} - -#define cuda_check(ans) { gpu_assert((ans), __FILE__, __LINE__); } -inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} diff --git a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp b/server/exllamav2_kernels/exllamav2_kernels/ext.cpp deleted file mode 100644 index 7591e365f..000000000 --- a/server/exllamav2_kernels/exllamav2_kernels/ext.cpp +++ /dev/null @@ -1,956 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#include "cuda/pack_tensor.cuh" -#include "cuda/quantize.cuh" -#include "cuda/q_matrix.cuh" -#include "cuda/q_attn.cuh" -#include "cuda/q_mlp.cuh" -#include "cuda/q_gemm.cuh" -#include "cuda/rms_norm.cuh" -#include "cuda/rope.cuh" -#include "cuda/cache.cuh" - -#include "cpp/quantize_func.h" -#include "cpp/sampling.h" - -#include "cpp/util.h" - -// Some decluttering macros - -#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype) -#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype) -#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes") -#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes") - - -// Packing functions - -void pack_rows_4 -( - torch::Tensor input, - torch::Tensor output -) -{ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - - TORCH_CHECK_DTYPE(input, kShort); - TORCH_CHECK_DTYPE(output, kInt); - TORCH_CHECK_SHAPES(input, 0, output, 0, 1); - TORCH_CHECK_SHAPES(input, 1, output, 1, 8); - - int rows = input.size(0); - int columns = input.size(1); - - pack_rows_4_cuda - ( - (uint16_t*) input.data_ptr(), - (uint32_t*) output.data_ptr(), - rows, - columns - ); -} - -void pack_columns -( - torch::Tensor input, - torch::Tensor output, - int bits -) -{ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - - TORCH_CHECK_DTYPE(input, kShort); - TORCH_CHECK_DTYPE(output, kInt); - TORCH_CHECK_SHAPES(input, 1, output, 1, 1); - - int in_rows = input.size(0); - int columns = input.size(1); - int out_rows = output.size(0); - int exp_out_rows = in_rows * bits / 32; - TORCH_CHECK(out_rows == exp_out_rows, "Wrong output shape for input and bitrate") - - pack_columns_cuda - ( - (uint16_t*) input.data_ptr(), - (uint32_t*) output.data_ptr(), - in_rows, - out_rows, 
-        columns,
-        bits
-    );
-}
-
-
-// Quantization functions
-
-void quantize_err
-(
-    torch::Tensor input,
-    torch::Tensor output,
-    torch::Tensor scale,
-    float qzero,
-    float maxq,
-    float err_norm,
-    float min_p,
-    float max_p,
-    int p_grid
-)
-{
-    TORCH_CHECK_DTYPE(input, kFloat);
-    TORCH_CHECK_DTYPE(output, kFloat);
-    // TORCH_CHECK_SHAPES(input, 0, output, 0, 1);
-    // TORCH_CHECK_SHAPES(input, 1, output, 1, 1);
-    TORCH_CHECK_SHAPES(input, 1, scale, 0, 1);
-    TORCH_CHECK(output.size(0) == p_grid + 1, "Output vector shape doesn't match grid")
-
-    int rows = input.size(0);
-    int columns = input.size(1);
-
-    quantize_err_cuda
-    (
-        (float*) input.data_ptr(),
-        (float*) output.data_ptr(),
-        (float*) scale.data_ptr(),
-        rows,
-        columns,
-        qzero,
-        maxq,
-        err_norm,
-        min_p,
-        max_p,
-        p_grid
-    );
-}
-
-void quantize
-(
-    torch::Tensor input,
-    torch::Tensor output,
-    torch::Tensor scale,
-    torch::Tensor out_q,
-    float qzero,
-    float maxq
-)
-{
-    TORCH_CHECK_DTYPE(input, kFloat);
-    TORCH_CHECK_DTYPE(output, kFloat);
-    TORCH_CHECK_SHAPES(input, 0, output, 0, 1);
-    TORCH_CHECK_SHAPES(input, 1, output, 1, 1);
-    TORCH_CHECK_SHAPES(input, 1, scale, 0, 1);
-
-    int rows = input.size(0);
-    int columns = input.size(1);
-
-    quantize_cuda
-    (
-        (float*) input.data_ptr(),
-        (float*) output.data_ptr(),
-        (float*) scale.data_ptr(),
-        out_q.device().is_meta() ? NULL : (uint16_t*) out_q.data_ptr(),
-        rows,
-        columns,
-        qzero,
-        maxq
-    );
-}
-
-
-// Quant matrix
-
-uintptr_t make_q_matrix
-(
-    torch::Tensor q_weight,
-    torch::Tensor q_perm,
-    torch::Tensor q_invperm,
-    torch::Tensor q_scale,
-    torch::Tensor q_scale_max,
-    torch::Tensor q_groups,
-    torch::Tensor gptq_qzeros,
-    torch::Tensor gptq_scales,
-    torch::Tensor gptq_g_idx,
-    torch::Tensor temp_dq
-)
-{
-    TORCH_CHECK_DTYPE(q_weight, kInt);
-    TORCH_CHECK_DTYPE_OPT(q_perm, kShort);
-    TORCH_CHECK_DTYPE_OPT(q_invperm, kShort);
-    TORCH_CHECK_DTYPE_OPT(q_scale, kInt);
-    TORCH_CHECK_DTYPE_OPT(q_scale_max, kHalf);
-    TORCH_CHECK_DTYPE_OPT(q_groups, kShort);
-    TORCH_CHECK_DTYPE_OPT(gptq_qzeros, kInt);
-    TORCH_CHECK_DTYPE_OPT(gptq_scales, kHalf);
-    TORCH_CHECK_DTYPE_OPT(gptq_g_idx, kInt);
-
-    TORCH_CHECK_SHAPES(q_perm, 0, q_invperm, 0, 1);
-
-    int device = q_weight.device().index();
-    int width = q_weight.size(1);
-    int groups;
-    int height;
-
-    if (!q_scale.device().is_meta())
-    {
-        TORCH_CHECK_SHAPES(q_weight, 1, q_scale, 1, 8);
-        TORCH_CHECK_SHAPES(q_scale_max, 0, q_scale, 0, 1);
-        groups = q_scale.size(0);
-        height = q_invperm.size(0);
-    }
-    else
-    {
-        TORCH_CHECK_SHAPES(q_weight, 1, gptq_qzeros, 1, 8);
-        TORCH_CHECK_SHAPES(q_weight, 1, gptq_scales, 1, 1);
-        groups = gptq_qzeros.size(0);
-        height = q_weight.size(0) * 8;
-    }
-
-    TORCH_CHECK(temp_dq.size(0) >= width * height, "Insufficient size of temp_dq buffer")
-
-    QMatrix* m = new QMatrix
-    (
-        device,
-        height,
-        width,
-        groups,
-        (uint32_t*) q_weight.data_ptr(),
-        q_perm.device().is_meta() ? NULL : (uint16_t*) q_perm.data_ptr(),
-        q_invperm.device().is_meta() ? NULL : (uint16_t*) q_invperm.data_ptr(),
-        q_scale.device().is_meta() ? NULL : (uint32_t*) q_scale.data_ptr(),
-        q_scale_max.device().is_meta() ? NULL : (half*) q_scale_max.data_ptr(),
-        q_groups.device().is_meta() ? NULL : (uint16_t*) q_groups.data_ptr(),
-        gptq_qzeros.device().is_meta() ? NULL : (uint32_t*) gptq_qzeros.data_ptr(),
-        gptq_scales.device().is_meta() ? NULL : (half*) gptq_scales.data_ptr(),
-        gptq_g_idx.device().is_meta() ? NULL : (uint32_t*) gptq_g_idx.data_ptr(),
-        (half*) temp_dq.data_ptr()
-    );
-
-    if (m->failed) throw std::runtime_error("CUDA out of memory");
-
-    return reinterpret_cast<uintptr_t> (m);
-}
-
-void free_q_matrix
-(
-    uintptr_t handle
-)
-{
-    QMatrix* m = reinterpret_cast<QMatrix*> (handle);
-    delete m;
-}
-
-void reconstruct
-(
-    uintptr_t q_handle,
-    torch::Tensor output
-)
-{
-    QMatrix* qm = reinterpret_cast<QMatrix*> (q_handle);
-    TORCH_CHECK(qm->height == output.size(0) && qm->width == output.size(1), "Output tensor doesn't match shape of QMatrix")
-    TORCH_CHECK_DTYPE(output, kHalf);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
-
-    qm->reconstruct((half*) output.data_ptr());
-}
-
-
-// Matmul
-
-void gemm_half_q_half
-(
-    torch::Tensor a,
-    uintptr_t b,
-    torch::Tensor c,
-    bool force_cuda
-)
-{
-    QMatrix* qm = reinterpret_cast<QMatrix*> (b);
-
-    TORCH_CHECK_DTYPE(a, kHalf);
-    TORCH_CHECK_DTYPE(c, kHalf);
-    TORCH_CHECK_SHAPES(a, 0, c, 0, 1);
-    TORCH_CHECK(qm->height == a.size(1), "a and b have incompatible shapes")
-    TORCH_CHECK(qm->width == c.size(1), "b and c have incompatible shapes")
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
-
-    gemm_half_q_half_cuda
-    (
-        at::cuda::getCurrentCUDABlasHandle(),
-        (const half*) a.data_ptr(),
-        qm,
-        (half*) c.data_ptr(),
-        c.size(0), // m
-        c.size(1), // n
-        a.size(1), // k
-        true,
-        NULL,
-        force_cuda
-    );
-}
-
-
-// Quant attention
-
-uintptr_t make_q_attn
-(
-    torch::Tensor layernorm,
-    float norm_epsilon,
-    uintptr_t q_q_proj,
-    uintptr_t q_k_proj,
-    uintptr_t q_v_proj,
-    uintptr_t q_o_proj,
-    torch::Tensor temp_state,
-//    torch::Tensor temp_q,
-//    torch::Tensor temp_k,
-//    torch::Tensor temp_v,
-    torch::Tensor temp_dq,
-    int max_rows,
-    int hidden_size,
-    int num_heads,
-    int num_kv_heads,
-    int head_dim,
-    int max_seq_len
-)
-{
-    QMatrix* qm_q_proj = reinterpret_cast<QMatrix*> (q_q_proj);
-    QMatrix* qm_k_proj = reinterpret_cast<QMatrix*> (q_k_proj);
-    QMatrix* qm_v_proj = reinterpret_cast<QMatrix*> (q_v_proj);
-    QMatrix* qm_o_proj = reinterpret_cast<QMatrix*> (q_o_proj);
-
-    TORCH_CHECK_DTYPE_OPT(layernorm, kHalf);
-
-    if (qm_q_proj && !layernorm.is_meta()) TORCH_CHECK(qm_q_proj->height == layernorm.size(0), "q_proj is wrong shape")
-    if (qm_k_proj && !layernorm.is_meta()) TORCH_CHECK(qm_k_proj->height == layernorm.size(0), "k_proj is wrong shape")
-    if (qm_v_proj && !layernorm.is_meta()) TORCH_CHECK(qm_v_proj->height == layernorm.size(0), "v_proj is wrong shape")
-    if (!layernorm.is_meta()) TORCH_CHECK(qm_o_proj->height == layernorm.size(0), "o_proj is wrong shape")
-
-    QAttn* attn = new QAttn
-    (
-        (half*) layernorm.is_meta() ? NULL : (half*) layernorm.data_ptr(),
-        norm_epsilon,
-        qm_q_proj,
-        qm_k_proj,
-        qm_v_proj,
-        qm_o_proj,
-        (half*) temp_state.data_ptr(),
-//        (half*) temp_q.data_ptr(),
-//        (half*) temp_k.data_ptr(),
-//        (half*) temp_v.data_ptr(),
-        (half*) temp_dq.data_ptr(),
-        max_rows,
-        hidden_size,
-        num_heads,
-        num_kv_heads,
-        head_dim,
-        max_seq_len
-    );
-
-    return reinterpret_cast<uintptr_t> (attn);
-}
-
-void free_q_attn
-(
-    uintptr_t handle
-)
-{
-    QAttn* attn = reinterpret_cast<QAttn*> (handle);
-    delete attn;
-}
-
-void q_attn_forward_1
-(
-    uintptr_t q_attn,
-    torch::Tensor x,
-    int batch_size,
-    int q_len,
-    int past_len,
-    torch::Tensor past_lens,
-    torch::Tensor q_temp,
-    torch::Tensor k_temp,
-    torch::Tensor v_temp,
-    torch::Tensor sin,
-    torch::Tensor cos,
-    const std::vector<uintptr_t>& loras,
-    torch::Tensor loras_temp
-)
-{
-    QAttn* attn = reinterpret_cast<QAttn*> (q_attn);
-    TORCH_CHECK_DTYPE(x, kHalf);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-    cublasHandle_t cublas_handle = at::cuda::getCurrentCUDABlasHandle();
-
-    attn->forward_cuda_1
-    (
-        cublas_handle,
-        (half*) x.data_ptr(),
-        batch_size,
-        q_len,
-        past_len,
-        past_lens.device().is_meta() ? NULL : (uint32_t*) past_lens.data_ptr(),
-        (half*) q_temp.data_ptr(),
-        (half*) k_temp.data_ptr(),
-        (half*) v_temp.data_ptr(),
-        (half*) sin.data_ptr(),
-        (half*) cos.data_ptr(),
-        loras,
-        loras_temp.device().is_meta() ? NULL : (half*) loras_temp.data_ptr()
-    );
-}
-
-void q_attn_forward_2
-(
-    uintptr_t q_attn,
-    torch::Tensor x,
-    torch::Tensor attn_output,
-    int batch_size,
-    int q_len,
-    const std::vector<uintptr_t>& loras,
-    torch::Tensor loras_temp
-)
-{
-    QAttn* attn = reinterpret_cast<QAttn*> (q_attn);
-    TORCH_CHECK_DTYPE(x, kHalf);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-    cublasHandle_t cublas_handle = at::cuda::getCurrentCUDABlasHandle();
-
-    attn->forward_cuda_2
-    (
-        cublas_handle,
-        (const half*) attn_output.data_ptr(),
-        (half*) x.data_ptr(),
-        q_len,
-        batch_size,
-        loras,
-        loras_temp.device().is_meta() ? NULL : (half*) loras_temp.data_ptr()
-    );
-}
-
-int q_attn_set_loras
-(
-    uintptr_t q_attn,
-    std::unordered_map<uintptr_t, torch::Tensor>& q_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& q_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& k_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& k_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& v_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& v_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& o_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& o_proj_lora_b
-)
-{
-    QAttn* attn = reinterpret_cast<QAttn*> (q_attn);
-
-    attn->q_proj_lora.clear();
-    attn->k_proj_lora.clear();
-    attn->v_proj_lora.clear();
-    attn->o_proj_lora.clear();
-
-    int max_rank = 0;
-
-    for (const auto& pair : q_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) q_proj_lora_b[pair.first].data_ptr();
-        attn->q_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : k_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) k_proj_lora_b[pair.first].data_ptr();
-        attn->k_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : v_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) v_proj_lora_b[pair.first].data_ptr();
-        attn->v_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : o_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) o_proj_lora_b[pair.first].data_ptr();
-        attn->o_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    return max_rank;
-}
-
-// Quant MLP
-
-uintptr_t make_q_mlp
-(
-    torch::Tensor layernorm,
-    float norm_epsilon,
-    uintptr_t q_gate,
-    uintptr_t q_up,
-    uintptr_t q_down,
-    torch::Tensor temp_state,
-    torch::Tensor temp_a,
-    torch::Tensor temp_b,
-    torch::Tensor temp_dq,
-    int max_rows
-)
-{
-    QMatrix* qm_gate = reinterpret_cast<QMatrix*> (q_gate);
-    QMatrix* qm_up = reinterpret_cast<QMatrix*> (q_up);
-    QMatrix* qm_down = reinterpret_cast<QMatrix*> (q_down);
-
-    TORCH_CHECK_DTYPE(layernorm, kHalf);
-    TORCH_CHECK(qm_gate->height == layernorm.size(0), "gate_proj is wrong shape")
-    TORCH_CHECK(qm_up->height == layernorm.size(0), "up_proj is wrong shape")
-
-    QMLP* mlp = new QMLP
-    (
-        (half*) layernorm.data_ptr(),
-        norm_epsilon,
-        qm_gate,
-        qm_up,
-        qm_down,
-        (half*) temp_state.data_ptr(),
-        (half*) temp_a.data_ptr(),
-        (half*) temp_b.data_ptr(),
-        (half*) temp_dq.data_ptr(),
-        max_rows
-    );
-
-    return reinterpret_cast<uintptr_t> (mlp);
-}
-
-void free_q_mlp
-(
-    uintptr_t handle
-)
-{
-    QMLP* mlp = reinterpret_cast<QMLP*> (handle);
-    delete mlp;
-}
-
-void q_mlp_forward_
-(
-    uintptr_t q_mlp,
-    torch::Tensor x,
-    const std::vector<uintptr_t>& loras,
-    torch::Tensor loras_temp
-)
-{
-    QMLP* mlp = reinterpret_cast<QMLP*> (q_mlp);
-    TORCH_CHECK_DTYPE(x, kHalf);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    TORCH_CHECK(x.size(1) == mlp->gate->height, "x is wrong shape");
-    TORCH_CHECK(x.size(0) <= mlp->max_rows, "Too many rows in x");
-
-    mlp->forward_
-    (
-        at::cuda::getCurrentCUDABlasHandle(),
-        (half*) x.data_ptr(),
-        x.size(0), // rows
-        x.size(1), // columns == hidden_size
-        loras,
-        loras_temp.device().is_meta() ? NULL : (half*) loras_temp.data_ptr()
-    );
-}
-
-int q_mlp_set_loras
-(
-    uintptr_t q_mlp,
-    std::unordered_map<uintptr_t, torch::Tensor>& gate_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& gate_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& up_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& up_proj_lora_b,
-    std::unordered_map<uintptr_t, torch::Tensor>& down_proj_lora_a,
-    std::unordered_map<uintptr_t, torch::Tensor>& down_proj_lora_b
-)
-{
-    QMLP* mlp = reinterpret_cast<QMLP*> (q_mlp);
-
-    mlp->gate_proj_lora.clear();
-    mlp->up_proj_lora.clear();
-    mlp->down_proj_lora.clear();
-
-    int max_rank = 0;
-
-    for (const auto& pair : gate_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) gate_proj_lora_b[pair.first].data_ptr();
-        mlp->gate_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : up_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) up_proj_lora_b[pair.first].data_ptr();
-        mlp->up_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    for (const auto& pair : down_proj_lora_a)
-    {
-        int rank = pair.second.size(-1);
-        if (rank > max_rank) max_rank = rank;
-        half* a = (half*) pair.second.data_ptr();
-        half* b = (half*) down_proj_lora_b[pair.first].data_ptr();
-        mlp->down_proj_lora[pair.first] = std::make_tuple(a, b, rank);
-    }
-
-    return max_rank;
-}
-
-
-// RoPE rotary positional embeddings, in-place
-
-void rope_
-(
-    torch::Tensor x,
-    torch::Tensor sin,
-    torch::Tensor cos,
-    int past_len,
-    int num_heads,
-    int head_dim
-)
-{
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(sin, kHalf);
-    TORCH_CHECK_DTYPE(cos, kHalf);
-    TORCH_CHECK(head_dim == cos.size(-1), "cos table does not match head_dim");
-    TORCH_CHECK(head_dim == sin.size(-1), "sin table does not match head_dim");
-
-    int batch_size = x.size(0);
-    int rows_per_batch = x.numel() / head_dim / batch_size;
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    rope_cuda
-    (
-        (half*) x.data_ptr(),
-        (const half*) sin.data_ptr(),
-        (const half*) cos.data_ptr(),
-        batch_size,
-        rows_per_batch,
-        head_dim,
-        num_heads,
-        past_len,
-        NULL
-    );
-}
-
-
-// RMS layernorm
-
-void rms_norm
-(
-    torch::Tensor x,
-    torch::Tensor w,
-    torch::Tensor y,
-    float epsilon
-)
-{
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(w, kHalf);
-    TORCH_CHECK_DTYPE(y, kHalf);
-    TORCH_CHECK_SHAPES(x, 1, w, 0, 1);
-    TORCH_CHECK_SHAPES(x, 1, w, 0, 1);
-    TORCH_CHECK_SHAPES(x, 0, y, 0, 1);
-    TORCH_CHECK_SHAPES(x, 1, y, 1, 1);
-
-    int rows = x.size(0);
-    int dim = x.size(1);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    rms_norm_cuda
-    (
-        (half*) x.data_ptr(),
-        (half*) w.data_ptr(),
-        (half*) y.data_ptr(),
-        epsilon,
-        rows,
-        dim
-    );
-}
-
-void rms_norm_
-(
-    torch::Tensor x,
-    torch::Tensor w,
-    float epsilon
-)
-{
-    rms_norm(x, w, x, epsilon);
-}
-
-
-// Sampling
-
-void apply_rep_penalty
-(
-    torch::Tensor sequence,
-    float penalty_max,
-    int sustain,
-    int decay,
-    torch::Tensor logits
-)
-{
-    TORCH_CHECK_DTYPE(sequence, kLong);
-    TORCH_CHECK_DTYPE(logits, kFloat);
-    TORCH_CHECK_SHAPES(sequence, 0, logits, 0, 1);
-
-    int vocab_size = logits.size(-1);
-    int bsz = sequence.size(0);
-    int seq_len = sequence.size(-1);
-
-    for (int i = 0; i < bsz; i++)
-    {
-        apply_rep_penalty_cpu
-        (
-            vocab_size,
-            ((uint64_t*) sequence.data_ptr()) + i * seq_len,
-            penalty_max,
-            sustain,
-            decay,
-            seq_len,
-            ((float*) logits.data_ptr()) + i * vocab_size
-        );
-    }
-}
-
-void sample_basic
-(
-    torch::Tensor logits,           // shape [bsz, vocab_size]
-    float temperature,
-    int top_k,
-    float top_p,
-    float typical,
-    float random,
-    torch::Tensor output_tokens,    // shape [bsz, 1]
-    torch::Tensor output_probs,     // shape [bsz, 1]
-    torch::Tensor logit_filter      // shape [bsz, vocab_size]
-)
-{
-    TORCH_CHECK_DTYPE(logits, kFloat);
-    TORCH_CHECK_DTYPE(output_tokens, kLong);
-    TORCH_CHECK_DTYPE(output_probs, kFloat);
-    TORCH_CHECK_DTYPE(logits, kFloat);
-    TORCH_CHECK_DTYPE(logit_filter, kBool);
-
-    TORCH_CHECK_SHAPES(logit_filter, 0, logits, 0, 1);
-    TORCH_CHECK_SHAPES(logit_filter, 1, logits, 1, 1);
-
-    int vocab_size = logits.size(-1);
-    int bsz = logits.size(0);
-
-    float* temp_probs = (float*) malloc(vocab_size * sizeof(float));
-    int* temp_indices = (int*) malloc(vocab_size * sizeof(int));
-
-//    int64_t* output_tokens_ptr = (int64_t*) output_tokens.data_ptr();
-//    float* output_probs_ptr = (float*) output_tokens.data_ptr();
-    float* logits_ptr = (float*) logits.data_ptr();
-
-    bool* logits_filter_ptr = (bool*) logit_filter.data_ptr();
-
-    for (int i = 0; i < bsz; i++)
-    {
-        softmax_cpu
-        (
-            vocab_size,
-            temperature,
-            logits_ptr + i * vocab_size,
-            logits_filter_ptr + i * vocab_size,
-            temp_probs
-        );
-
-        if (top_k == 1)
-        {
-            int index = greedy_sample(vocab_size, logits_ptr + i * vocab_size, logits_filter_ptr + i * vocab_size);
-            output_tokens[i] = index;
-            output_probs[i] = temp_probs[index];
-            continue;
-        }
-
-        for (int j = 0; j < vocab_size; j++) temp_indices[j] = j;
-        int num_candidates = vocab_size;
-
-        if (top_k > 0 && top_k < vocab_size)
-        {
-            num_candidates = top_k_cpu(num_candidates, temp_probs, temp_indices, top_k);
-            normalize_cpu(num_candidates, temp_probs);
-        }
-
-        if (top_p > 0.0f && top_p < 1.0f)
-        {
-            num_candidates = top_p_cpu(num_candidates, temp_probs, temp_indices, top_p);
-            normalize_cpu(num_candidates, temp_probs);
-        }
-
-        if (typical > 0.0f && typical < 1.0f)
-        {
-            num_candidates = typical_cpu(num_candidates, temp_probs, temp_indices, typical);
-            normalize_cpu(num_candidates, temp_probs);
-        }
-
-        num_candidates = multinomial_cpu(num_candidates, temp_probs, temp_indices, random);
-        output_tokens[i] = temp_indices[0];
-        output_probs[i] = temp_probs[0];
-    }
-
-    free(temp_probs);
-    free(temp_indices);
-}
-
-
-// Filtering
-
-void logit_filter_exclusive
-(
-    torch::Tensor filter,                       // shape [bsz, vocab_size]
-    const std::vector<std::vector<int>> &exclusive_lists
-)
-{
-    TORCH_CHECK_DTYPE(filter, kBool);
-    TORCH_CHECK((uint64_t) filter.size(0) == exclusive_lists.size(), "Number of lists does not match batch size")
-
-    bool* filter_ptr = (bool*) filter.data_ptr();
-    unsigned int vocab_size = filter.size(1);
-
-    for(const auto& list : exclusive_lists)
-    {
-        unsigned int id = 0;
-        unsigned int next_id_idx = 0;
-        unsigned int next_id = list[next_id_idx];
-
-        while (id < vocab_size)
-        {
-            while (id < next_id)
-            {
-                filter_ptr[id] = false;
-                id++;
-            }
-            id++;
-            next_id_idx++;
-            if (next_id_idx >= list.size()) next_id = vocab_size;
-            else next_id = list[next_id_idx];
-        }
-
-        filter_ptr += vocab_size;
-    }
-}
-
-// For cache conversion
-
-void fp16_to_fp8(torch::Tensor in_tensor, torch::Tensor out_tensor, int batch_size, int offset, int width)
-{
-    TORCH_CHECK_DTYPE(in_tensor, kHalf);
-    TORCH_CHECK_DTYPE(out_tensor, kUInt8);
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(in_tensor));
-
-    TORCH_CHECK_SHAPES(in_tensor, 0, out_tensor, 0, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 1, out_tensor, 1, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 2, out_tensor, 2, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 3, out_tensor, 3, 1);
-
-    int stride = in_tensor.size(1) * in_tensor.size(2) * in_tensor.size(3);
-    int height = batch_size;
-
-    int tsize = in_tensor.size(2) * in_tensor.size(3);
-    offset *= tsize;
-    width *= tsize;
-
-    array_fp16_to_fp8_cuda((const half*) (in_tensor.data_ptr()), (unsigned char*)(out_tensor.data_ptr()), stride, height, offset, width);
-}
-
-void fp8_to_fp16(torch::Tensor in_tensor, torch::Tensor out_tensor, int batch_size, int offset, int width)
-{
-    TORCH_CHECK_DTYPE(in_tensor, kUInt8);
-    TORCH_CHECK_DTYPE(out_tensor, kHalf);
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(in_tensor));
-
-    TORCH_CHECK_SHAPES(in_tensor, 0, out_tensor, 0, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 1, out_tensor, 1, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 2, out_tensor, 2, 1);
-    TORCH_CHECK_SHAPES(in_tensor, 3, out_tensor, 3, 1);
-
-    int stride = in_tensor.size(1) * in_tensor.size(2) * in_tensor.size(3);
-    int height = batch_size;
-
-    int tsize = in_tensor.size(2) * in_tensor.size(3);
-    offset *= tsize;
-    width *= tsize;
-
-    array_fp8_to_fp16_cuda((const unsigned char*)(in_tensor.data_ptr()), (half*)(out_tensor.data_ptr()), stride, height, offset, width);
-}
-
-//void array_fp16_to_fp8_ref(torch::Tensor in_tensor, torch::Tensor out_tensor, int size)
-//{
-//    TORCH_CHECK_DTYPE(in_tensor, kHalf);
-//    TORCH_CHECK_DTYPE(out_tensor, kUInt8);
-//    array_fp16_to_fp8_ref_cuda((const half*) (in_tensor.data_ptr()), (unsigned char*)(out_tensor.data_ptr()), size);
-//}
-//
-//void array_fp8_to_fp16_ref(torch::Tensor in_tensor, torch::Tensor out_tensor, int size)
-//{
-//    TORCH_CHECK_DTYPE(in_tensor, kUInt8);
-//    TORCH_CHECK_DTYPE(out_tensor, kHalf);
-//    array_fp8_to_fp16_ref_cuda((const unsigned char*)(in_tensor.data_ptr()), (half*)(out_tensor.data_ptr()), size);
-//}
-
-// Bindings
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("pack_rows_4", &pack_rows_4, "pack_rows_4");
-    m.def("pack_columns", &pack_columns, "pack_columns");
-    m.def("quantize_err", &quantize_err, "quantize_err");
-    m.def("quantize", &quantize, "quantize");
-    m.def("make_q_matrix", &make_q_matrix, "make_q_matrix");
-    m.def("free_q_matrix", &free_q_matrix, "free_q_matrix");
-    m.def("reconstruct", &reconstruct, "reconstruct");
-    m.def("make_q_mlp", &make_q_mlp, "make_q_mlp");
-    m.def("free_q_mlp", &free_q_mlp, "free_q_mlp");
-    m.def("q_mlp_forward_", &q_mlp_forward_, "q_mlp_forward_");
-    m.def("q_mlp_set_loras", &q_mlp_set_loras, "q_mlp_set_loras");
-    m.def("make_q_attn", &make_q_attn, "make_q_attn");
-    m.def("free_q_attn", &free_q_attn, "free_q_attn");
-    m.def("q_attn_forward_1", &q_attn_forward_1, "q_attn_forward_1");
-    m.def("q_attn_forward_2", &q_attn_forward_2, "q_attn_forward_2");
-    m.def("q_attn_set_loras", &q_attn_set_loras, "q_attn_set_loras");
-    m.def("quantize_range", &quantize_range, "quantize_range");
-    m.def("gemm_half_q_half", &gemm_half_q_half, "gemm_half_q_half");
-    m.def("rms_norm", &rms_norm, "rms_norm");
-    m.def("rms_norm_", &rms_norm_, "rms_norm_");
-    m.def("rope_", &rope_, "rope_");
-    m.def("apply_rep_penalty", &apply_rep_penalty, "apply_rep_penalty");
-    m.def("sample_basic", &sample_basic, "sample_basic");
-    m.def("logit_filter_exclusive", &logit_filter_exclusive, "logit_filter_exclusive");
-    m.def("fp16_to_fp8", &fp16_to_fp8, "fp16_to_fp8");
-    m.def("fp8_to_fp16", &fp8_to_fp16, "fp8_to_fp16");
-//    m.def("array_fp16_to_fp8_ref", &array_fp16_to_fp8_ref, "array_fp16_to_fp8_ref");
-//    m.def("array_fp8_to_fp16_ref", &array_fp8_to_fp16_ref, "array_fp8_to_fp16_ref");
-}
diff --git a/server/exllamav2_kernels/setup.py b/server/exllamav2_kernels/setup.py
deleted file mode 100644
index 8f25cfb1b..000000000
--- a/server/exllamav2_kernels/setup.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-
-setup(
-    name="exllamav2_kernels",
-    ext_modules=[
-        CUDAExtension(
-            name="exllamav2_kernels",
-            sources=[
-                "exllamav2_kernels/ext.cpp",
-                "exllamav2_kernels/cuda/h_gemm.cu",
-                "exllamav2_kernels/cuda/lora.cu",
-                "exllamav2_kernels/cuda/pack_tensor.cu",
-                "exllamav2_kernels/cuda/quantize.cu",
-                "exllamav2_kernels/cuda/q_matrix.cu",
-                "exllamav2_kernels/cuda/q_attn.cu",
-                "exllamav2_kernels/cuda/q_mlp.cu",
-                "exllamav2_kernels/cuda/q_gemm.cu",
-                "exllamav2_kernels/cuda/rms_norm.cu",
-                "exllamav2_kernels/cuda/rope.cu",
-                "exllamav2_kernels/cuda/cache.cu",
-                "exllamav2_kernels/cpp/quantize_func.cpp",
-                "exllamav2_kernels/cpp/sampling.cpp"
-            ],
-        )
-    ],
-    cmdclass={"build_ext": BuildExtension},
-)