@@ -1476,10 +1476,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
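The two ratios added here are the heart of the change: src1 may now carry more slices in dims 2 and 3 than src0 (r2 = ne12/ne02, r3 = ne13/ne03), and each src0 slice is broadcast across r2*r3 src1 slices. A minimal sketch of the resulting index mapping, using an illustrative helper name that is not part of the patch (like the loops below, it assumes ne12 and ne13 are exact multiples of ne02 and ne03):

// Hypothetical helper: which src0 slice does src1 slice (i12, i13) use?
// Assumes ne12 % ne02 == 0 and ne13 % ne03 == 0, as the patched loops do.
static inline void bcast_src0_slice(int64_t i12, int64_t i13,
                                    int64_t r2,  int64_t r3,
                                    int64_t * i02, int64_t * i03) {
    *i02 = i12 / r2; // r2 consecutive src1 slices share one src0 slice
    *i03 = i13 / r3; // likewise for the outermost dimension
}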
@@ -1498,13 +1503,22 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
             CL_CHECK(clFinish(queue));
 
@@ -1525,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
         }
     }
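The pi02/pi03 pair caches the coordinates of the src0 slice last uploaded to the device. Because i12 varies fastest and i02 = i12 / r2 only changes every r2 iterations, the weight slice is transferred once per broadcast group instead of once per inner iteration. The same pattern recurs in the f16 and quantized paths below; a condensed sketch of it (illustrative, not code from the patch):

// Condensed sketch of the upload-skipping pattern shared by all three loops:
int64_t pi02 = -1, pi03 = -1;              // nothing resident on device yet
for (int64_t i13 = 0; i13 < ne13; i13++) {
    const int64_t i03 = i13 / r3;
    for (int64_t i12 = 0; i12 < ne12; i12++) {
        const int64_t i02 = i12 / r2;
        if (i02 != pi02 || i03 != pi03) {
            // ... upload src0 slice (i02, i03) to the device ...
            pi02 = i02;
            pi03 = i03;
        }
        // ... multiply the resident slice against src1 slice (i12, i13) ...
    }
}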
@@ -1547,6 +1561,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1572,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1596,41 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
             else {
-                for (int64_t i01 = 0; i01 < ne11; i01++) {
-                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                         // very slow due to no inlining
-                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                     }
                 }
             }
@@ -1631,7 +1659,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
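Note the re-based scratch pointer in the f16 path: tmp now advances by ne11 * ne10 halves per src1 slice, indexed by (i13 * ne12 + i12), so every one of the ne12 * ne13 slices gets its own fp16 staging region. The allocation of wdata is outside this diff, but under that assumption its required size is straightforward to bound:

// Illustrative bound only: wdata must hold one fp16 copy of each src1
// slice (ne10 x ne11 halves per slice, ne12 * ne13 slices in total).
const size_t wdata_min = sizeof(ggml_fp16_t) * ne10 * ne11 * ne12 * ne13;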
@@ -1652,12 +1680,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1690,12 +1723,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1748,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1725,7 +1769,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
                 events.emplace_back();
 
@@ -1749,7 +1793,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);
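For context, the broadcast these loops implement lets a caller multiply one set of weight slices against a larger batch of activation slices without materializing copies. A hypothetical usage sketch against the public ggml API follows; shapes and sizes are illustrative, and it assumes a ggml build where ggml_mul_mat accepts ne12 > ne02, which is what this commit enables for the OpenCL path:

#include "ggml.h"

struct ggml_init_params params = {
    /*.mem_size   =*/ 16*1024*1024,
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ false,
};
struct ggml_context * ctx = ggml_init(params);

// a: 2 slices of a 64x32 matrix; b: 8 slices of 64x16 activations.
// r2 = ne12/ne02 = 4: each slice of a is broadcast over 4 slices of b.
struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 32, 2);
struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 16, 8);
struct ggml_tensor * c = ggml_mul_mat(ctx, a, b); // c->ne = {32, 16, 8, 1}

ggml_free(ctx);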