Fix 131k context ggml assert #3

Closed
wants to merge 4 commits into from
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
@@ -165,6 +165,7 @@ option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copie
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_ALLOW_LARGE_TENSORS "ggml: allow large tensors for CUDA (disable INT_MAX check)" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
"ggml: cuda link binary compression mode; requires cuda 12.8+")
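Usage note: GGML_CUDA_ALLOW_LARGE_TENSORS defaults to OFF, so the chunked copy path added below is strictly opt-in; it has to be enabled at configure time, e.g. by passing -DGGML_CUDA_ALLOW_LARGE_TENSORS=ON alongside -DGGML_CUDA=ON when running cmake.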
4 changes: 4 additions & 0 deletions ggml/src/ggml-cuda/CMakeLists.txt
@@ -99,6 +99,10 @@ if (CUDAToolkit_FOUND)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()

if (GGML_CUDA_ALLOW_LARGE_TENSORS)
add_compile_definitions(GGML_CUDA_ALLOW_LARGE_TENSORS)
endif()

if (GGML_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
132 changes: 109 additions & 23 deletions ggml/src/ggml-cuda/cpy.cu
@@ -1,6 +1,7 @@
#include "cpy.cuh"
#include "dequantize.cuh"
#include "cpy-utils.cuh"
#include <climits> // For INT_MAX
#include <cstdint> // For SIZE_MAX
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
#include "ggml-musa/mudnn.cuh"
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
@@ -141,69 +142,147 @@ void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_des

template<typename src_t, typename dst_t>
static void ggml_cpy_flt_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

#if defined(GGML_CUDA_ALLOW_LARGE_TENSORS)
const int64_t max_chunk = INT_MAX;
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int num_blocks = (chunk + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx + offset * sizeof(src_t), cdst + offset * sizeof(dst_t), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
#else
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
#endif
}
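For reference, the following is a minimal, host-only C++ sketch of the chunking scheme used by ggml_cpy_flt_cuda above. It is an illustration rather than code from this PR: the kernel launch is replaced by a printf, and BLOCK_SIZE merely stands in for CUDA_CPY_BLOCK_SIZE.

#include <climits>
#include <cstdint>
#include <cstdio>

// Standalone sketch: split a 64-bit element count into launches of at most
// INT_MAX elements, mirroring the loop in ggml_cpy_flt_cuda.
int main() {
    const int64_t ne         = 3000000000LL; // more elements than fit in an int
    const int64_t max_chunk  = INT_MAX;
    const int64_t BLOCK_SIZE = 64;           // stand-in for CUDA_CPY_BLOCK_SIZE

    for (int64_t offset = 0; offset < ne; offset += max_chunk) {
        const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
        const int num_blocks = (int)((chunk + BLOCK_SIZE - 1) / BLOCK_SIZE);
        // Real code: cpy_flt<<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>(
        //     cx + offset * sizeof(src_t), cdst + offset * sizeof(dst_t), chunk, ...);
        printf("launch covering [%lld, %lld): %d blocks\n",
               (long long)offset, (long long)(offset + chunk), num_blocks);
    }
    return 0;
}

The quantized wrappers below follow the same pattern; there the chunk size additionally needs to stay a multiple of the quantization block size (QK8_0, QK4_0, QK4_1) so that the offset / QK byte arithmetic on cx and cdst remains exact.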

static void ggml_cpy_f32_q8_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

GGML_ASSERT(ne % QK8_0 == 0);
#if defined(GGML_CUDA_ALLOW_LARGE_TENSORS)
const int64_t max_chunk = (INT_MAX / QK8_0) * QK8_0; // keep chunks a multiple of QK8_0 so the offset/QK8_0 arithmetic stays exact
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk / QK8_0;
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<chunk_blocks, 1, 0, stream>>>
(cx + offset * sizeof(float), cdst + (offset / QK8_0) * sizeof(block_q8_0), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
#else
const int num_blocks = ne / QK8_0;
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
#endif
}

static void ggml_cpy_q8_0_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

#if defined(GGML_CUDA_ALLOW_LARGE_TENSORS)
const int64_t max_chunk = (INT_MAX / QK8_0) * QK8_0; // keep chunks a multiple of QK8_0 so the offset/QK8_0 arithmetic stays exact
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk;
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<chunk_blocks, 1, 0, stream>>>
(cx + (offset / QK8_0) * sizeof(block_q8_0), cdst + offset * sizeof(float), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
#else
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
#endif
}

static void ggml_cpy_f32_q4_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

GGML_ASSERT(ne % QK4_0 == 0);
#if defined(GGML_CUDA_ALLOW_LARGE_TENSORS)
const int64_t max_chunk = (INT_MAX / QK4_0) * QK4_0; // keep chunks a multiple of QK4_0 so the offset/QK4_0 arithmetic stays exact
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk / QK4_0;
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<chunk_blocks, 1, 0, stream>>>
(cx + offset * sizeof(float), cdst + (offset / QK4_0) * sizeof(block_q4_0), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
#else
const int num_blocks = ne / QK4_0;
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
#endif
}

static void ggml_cpy_q4_0_f32_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02,
const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12,
const int nb10, const int nb11, const int nb12, const int nb13,
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12,
const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13,
cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
#if defined(GGML_CUDA_ALLOW_LARGE_TENSORS)
const int64_t max_chunk = (INT_MAX / QK4_0) * QK4_0; // keep chunks a multiple of QK4_0 so the offset/QK4_0 arithmetic stays exact
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk;
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<chunk_blocks, 1, 0, stream>>>(
cx + (offset / QK4_0) * sizeof(block_q4_0), cdst + offset * sizeof(float), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
#else
const int num_blocks = ne;
cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
#endif
}

static void ggml_cpy_f32_q4_1_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
const char * cx, char * cdst, const int64_t ne,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

GGML_ASSERT(ne % QK4_1 == 0);
#if defined(GGML_CUDA_ALLOW_LARGE_TENSORS)
const int64_t max_chunk = (INT_MAX / QK4_1) * QK4_1; // keep chunks a multiple of QK4_1 so the offset/QK4_1 arithmetic stays exact
for (int64_t offset = 0; offset < ne; offset += max_chunk) {
const int64_t chunk = (ne - offset) < max_chunk ? (ne - offset) : max_chunk;
const int64_t chunk_blocks = chunk / QK4_1;
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<chunk_blocks, 1, 0, stream>>>
(cx + offset * sizeof(float), cdst + (offset / QK4_1) * sizeof(block_q4_1), chunk,
ne00, ne01, ne02, nb00, nb01, nb02, nb03,
ne10, ne11, ne12, nb10, nb11, nb12, nb13,
cdst_indirect, graph_cpynode_index++);
}
#else
const int num_blocks = ne / QK4_1;
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
#endif
}

static void ggml_cpy_q4_1_f32_cuda(
@@ -282,8 +361,15 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));

#if defined(GGML_CUDA_ALLOW_LARGE_TENSORS)
// No INT_MAX limit – ggml_nbytes may exceed 2GB on large contexts.
// The underlying cudaMemcpyAsync can handle size_t lengths.
GGML_ASSERT(ggml_nbytes(src0) <= SIZE_MAX / 4); // Reasonable upper bound with safety margin
GGML_ASSERT(ggml_nbytes(src1) <= SIZE_MAX / 4); // Reasonable upper bound with safety margin
#else
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
#endif

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
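As a rough back-of-the-envelope for why the original INT_MAX check trips at a 131k context, the snippet below computes the byte count of a hypothetical contiguous F16 copy of 131072 x 8192 elements; the 8192 is an illustrative per-token width, not a value taken from this PR.

#include <climits>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_ctx  = 131072;         // 131k context
    const int64_t width  = 8192;           // illustrative per-token element count (assumption)
    const int64_t elems  = n_ctx * width;  // 1073741824 elements
    const int64_t nbytes = elems * 2;      // F16 = 2 bytes/element -> 2147483648 bytes
    printf("nbytes = %lld, INT_MAX = %d -> %s\n",
           (long long)nbytes, INT_MAX,
           nbytes > (int64_t) INT_MAX ? "would trip the old assert" : "fits");
    return 0;
}

With GGML_CUDA_ALLOW_LARGE_TENSORS defined, that check is relaxed to the SIZE_MAX / 4 bound above and the copy is carried out through the chunked kernel launches.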