@@ -501,6 +501,42 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
 
 }
 
+typedef union {
+    half f16;
+    uint16_t u16;
+} iq1m_scale_t;
+
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i = blockIdx.x;
+    const block_iq1_m * x = (const block_iq1_m *) vx;
+
+    const int tid = threadIdx.x;
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
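+    // each of the 32 threads writes 8 values: super-block i, 32-value chunk ib, 8-value group il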
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint16_t * sc = (const uint16_t *)x[i].scales;
+    iq1m_scale_t scale;
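+    // the fp16 super-block scale is scattered over the top 4 bits of each 16-bit scale word; gather the nibbles and bit-cast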
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+    const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
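+    // bit 3 of this thread's qh nibble selects the offset added to the grid values: -1 - IQ1M_DELTA or -1 + IQ1M_DELTA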
+    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
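+    // 11-bit grid index: low 8 bits from qs, high 3 bits from the qh nibble; the grid word packs 8 values as nibbles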
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = d * (q[j] + delta);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
 
@@ -658,6 +694,12 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k,
     dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 }
 
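+// launches one 32-thread block per QK_K-value super-block; k must be a multiple of QK_K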
+template<typename dst_t>
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = (k + QK_K - 1) / QK_K;
@@ -724,6 +766,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ1_M:
+            return dequantize_row_iq1_m_cuda;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_IQ4_XS:
@@ -769,6 +813,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq3_xxs_cuda;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_cuda;
+        case GGML_TYPE_IQ1_M:
+            return dequantize_row_iq1_m_cuda;
         case GGML_TYPE_IQ4_NL:
             return dequantize_row_iq4_nl_cuda;
         case GGML_TYPE_IQ4_XS:
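
A minimal host-side sketch of how the new helper slots in (illustrative only, not part of the commit; it assumes the ggml-cuda internals above are in scope, QK_K == 256, a valid cudaStream_t named stream, and a device pointer d_src that already holds the IQ1_M-quantized row data):

const int k = 4096;                                   // values in the row; must be a multiple of QK_K
float * d_dst = nullptr;
cudaMalloc(&d_dst, k*sizeof(float));                  // destination for the dequantized row
dequantize_row_iq1_m_cuda(d_src, d_dst, k, stream);   // launches k/QK_K blocks of 32 threads
cudaStreamSynchronize(stream);
cudaFree(d_dst);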