@@ -88,42 +88,45 @@ static struct ggml_tensor * build_ada_residual_conv(ggml_context * ctx, struct g
 
     gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->norm1_gamma, style), block->norm1_gamma_bias);
     beta = ggml_add(ctx, ggml_mul_mat(ctx, block->norm1_beta, style), block->norm1_beta_bias);
-    cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_norm(ctx, ggml_cont(ctx, ggml_transpose(ctx, cur)), 0.00001)));
+    cur = ggml_norm(ctx, x, 0.00001);
 
     // The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance.
     // An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model.
-    cur = ggml_add(ctx, ggml_add(ctx, cur, ggml_mul(ctx, cur, gamma)), beta);
-    cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
+    cur = ggml_add(ctx, cur, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma)));
+    cur = ggml_add(ctx, cur, ggml_transpose(ctx, beta));
+    cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
 
     if (block->pool) {
-        cur = ggml_conv_transpose_1d(ctx, block->pool, ggml_cont(ctx, ggml_transpose(ctx, cur)), 2, 1, 1, 1, cur->ne[0]);
+        cur = ggml_conv_transpose_1d(ctx, block->pool, cur, 2, 1, 1, 1, cur->ne[1]);
         cur = ggml_add(ctx, cur, block->pool_bias);
-        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
     }
 
-    cur = ggml_conv_1d(ctx, block->conv1, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, 1, 1);
+    cur = ggml_conv_1d(ctx, block->conv1, cur, 1, 1, 1);
 
     cur = ggml_add(ctx, cur, block->conv1_bias);
     gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->norm2_gamma, style), block->norm2_gamma_bias);
     beta = ggml_add(ctx, ggml_mul_mat(ctx, block->norm2_beta, style), block->norm2_beta_bias);
-    cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_norm(ctx, cur, 0.00001)));
+    cur = ggml_norm(ctx, cur, 0.00001);
 
     // The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance.
     // An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model.
-    cur = ggml_add(ctx, ggml_add(ctx, cur, ggml_mul(ctx, cur, gamma)), beta);
-    cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
-    cur = ggml_add(ctx, ggml_conv_1d(ctx, block->conv2, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, 1, 1), block->conv2_bias);
+    cur = ggml_add(ctx, cur, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma)));
+    cur = ggml_add(ctx, cur, ggml_transpose(ctx, beta));
+    cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
+    cur = ggml_add(ctx, ggml_conv_1d(ctx, block->conv2, cur, 1, 1, 1), block->conv2_bias);
 
-    struct ggml_tensor * res = cur;
-    cur = ggml_cont(ctx, ggml_transpose(ctx, x));
+    struct ggml_tensor * res = cur;
+    cur = x;
     if (block->upsample) {
+        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
         if (block->pool) {
-            cur = ggml_upscale_ext(ctx, cur, cur->ne[0]*2, cur->ne[1], cur->ne[2], cur->ne[3]);
+            cur = ggml_upscale_ext(ctx, cur, cur->ne[0], cur->ne[1]*2, cur->ne[2], cur->ne[3]);
         }
-        cur = ggml_conv_1d(ctx, block->upsample, cur, 1, 0, 1);
+        cur = ggml_mul_mat(ctx, block->upsample, cur);
+        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
     }
-
-    return ggml_cont(ctx, ggml_transpose(ctx, ggml_div(ctx, ggml_add(ctx, res, cur), sqrt_tensor)));
+    cur = ggml_div(ctx, ggml_add(ctx, res, cur), sqrt_tensor);
+    return cur;
 }
 
 static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * style, kokoro_generator_residual_block * block) {
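Note on the modulation pattern above: ggml has no scalar "1 +" without materializing a constant tensor, so (1 + gamma) * norm(x) + beta is expanded into norm(x) + norm(x) * gamma + beta. A minimal standalone sketch of the pattern, assuming x is laid out as [time, channels] and gamma/beta are [channels, 1] style projections (the helper name ada_norm and the eps value are illustrative, not part of this patch):

    // Sketch: AdaIN-style modulation without a scalar constant.
    // out = norm(x) + norm(x) * gamma^T + beta^T == (1 + gamma) * norm(x) + beta
    static struct ggml_tensor * ada_norm(struct ggml_context * ctx,
                                         struct ggml_tensor * x,      // [time, channels]
                                         struct ggml_tensor * gamma,  // [channels, 1]
                                         struct ggml_tensor * beta) { // [channels, 1]
        struct ggml_tensor * cur = ggml_norm(ctx, x, 1e-5f);
        // transposing makes gamma/beta [1, channels] so they broadcast over time
        cur = ggml_add(ctx, cur, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma)));
        cur = ggml_add(ctx, cur, ggml_transpose(ctx, beta));
        return cur;
    }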
@@ -158,6 +161,7 @@ static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx,
 }
 
 static struct ggml_tensor * build_noise_block(ggml_context * ctx, kokoro_noise_residual_block * block, struct ggml_tensor * x, struct ggml_tensor * style) {
+    // This conv_1d seems replaceable with a squeezed and transposed ggml_mul_mat, but s0 and p0 are dynamic.
     ggml_tensor * cur = ggml_add(ctx, ggml_conv_1d(ctx, block->input_conv, x, block->input_conv_stride, block->input_conv_padding, 1), block->input_conv_bias);
     return build_kokoro_generator_res_block(ctx, cur, style, block->res_block);
 }
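On the comment above: a conv_1d whose kernel is 1 wide, with stride 1 and no padding, applies the same [in_ch, out_ch] matrix to every frame, which is why the fixed projections elsewhere in this patch become ggml_mul_mat calls. A sketch of the equivalence, assuming the kernel has already been squeezed to 2-D and activations are kept as [time, in_ch] (the helper name is illustrative):

    // 1x1 conv (s0 = 1, p0 = 0) rewritten as a per-frame matrix multiply.
    static struct ggml_tensor * conv1x1_as_mul_mat(struct ggml_context * ctx,
                                                   struct ggml_tensor * kernel, // [in_ch, out_ch]
                                                   struct ggml_tensor * x) {    // [time, in_ch]
        struct ggml_tensor * cur = ggml_cont(ctx, ggml_transpose(ctx, x)); // -> [in_ch, time]
        cur = ggml_mul_mat(ctx, kernel, cur);                              // -> [out_ch, time]
        return ggml_cont(ctx, ggml_transpose(ctx, cur));                   // -> [time, out_ch]
    }

The noise block's input_conv cannot be rewritten this way because its stride and padding are model hyperparameters rather than the fixed s0 = 1, p0 = 0 case.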
@@ -510,6 +514,15 @@ void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block,
     }
 }
 
+/**
+ * Removes the last axis, for cases where it's redundantly of length 1.
+ * assert x.ndim == 3; numpy.squeeze(x, axis=-1)
+ */
+static ggml_tensor * squeeze_3d_2d_e0(ggml_context * ctx, ggml_tensor * x) {
+    TTS_ASSERT(x->ne[0] == 1);
+    return ggml_reshape_2d(ctx, x, x->ne[1], x->ne[2]);
+}
+
 void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::string name, ggml_tensor * tensor) {
     if (name == "norm1_gamma_weight") {
         block->norm1_gamma = ggml_dup_tensor(ctx, tensor);
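Usage sketch for the helper above, assuming ggml's conv_1d kernel layout where ne[0] is the kernel width (the concrete shapes are made up for illustration):

    // A 1-wide conv kernel stored as [1, in_ch, out_ch] becomes a plain matrix.
    struct ggml_tensor * k3d = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, 512, 80);
    struct ggml_tensor * k2d = squeeze_3d_2d_e0(ctx, k3d); // -> [512, 80]
    // k2d can now feed ggml_mul_mat directly instead of going through ggml_conv_1d.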
@@ -554,6 +567,7 @@ void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::st
         block->pool_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
         set_tensor(block->pool_bias, tensor);
     } else if (name == "conv1x1_weight") {
+        tensor = squeeze_3d_2d_e0(ctx, tensor);
         block->upsample = ggml_dup_tensor(ctx, tensor);
         set_tensor(block->upsample, tensor);
     } else if (name == "conv1x1_bias") {
@@ -576,6 +590,7 @@ void kokoro_model::assign_decoder_weight(std::string name, ggml_tensor * tensor)
         decoder->n_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
         set_tensor(decoder->n_conv_bias, tensor);
     } else if (name == "asr_conv_weight") {
+        tensor = squeeze_3d_2d_e0(ctx, tensor);
         decoder->asr_conv = ggml_dup_tensor(ctx, tensor);
         set_tensor(decoder->asr_conv, tensor);
     } else if (name == "asr_conv_bias") {
@@ -607,12 +622,14 @@ void kokoro_model::assign_duration_weight(std::string name, ggml_tensor * tensor
         prosody_pred->duration_proj_bias = ggml_dup_tensor(ctx, tensor);
         set_tensor(prosody_pred->duration_proj_bias, tensor);
     } else if (name == "n_proj_kernel") {
+        tensor = squeeze_3d_2d_e0(ctx, tensor);
         prosody_pred->n_proj_kernel = ggml_dup_tensor(ctx, tensor);
         set_tensor(prosody_pred->n_proj_kernel, tensor);
     } else if (name == "n_proj_bias") {
         prosody_pred->n_proj_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
         set_tensor(prosody_pred->n_proj_bias, tensor);
     } else if (name == "f0_proj_kernel") {
+        tensor = squeeze_3d_2d_e0(ctx, tensor);
         prosody_pred->f0_proj_kernel = ggml_dup_tensor(ctx, tensor);
         set_tensor(prosody_pred->f0_proj_kernel, tensor);
     } else if (name == "f0_proj_bias") {
@@ -1147,20 +1164,27 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
 
     cur = build_lstm(ctx, cur, model->prosody_pred->shared_lstm, cur->ne[1]);
 
-    ggml_build_forward_expand(gf, cur);
 
     struct ggml_tensor * f0_curve = cur;
+    f0_curve = ggml_cont(ctx, ggml_transpose(ctx, f0_curve));
     for (auto block : model->prosody_pred->f0_blocks) {
         f0_curve = build_ada_residual_conv(ctx, f0_curve, block, style_half, model->sqrt_tensor);
     }
-    f0_curve = ggml_add(ctx, ggml_conv_1d(ctx, model->prosody_pred->f0_proj_kernel, ggml_cont(ctx, ggml_transpose(ctx, f0_curve)), 1, 0, 1), model->prosody_pred->f0_proj_bias);
+    f0_curve = ggml_cont(ctx, ggml_transpose(ctx, f0_curve));
+    f0_curve = ggml_mul_mat(ctx, model->prosody_pred->f0_proj_kernel, f0_curve);
+    f0_curve = squeeze_3d_2d_e0(ctx, f0_curve);
+    f0_curve = ggml_add(ctx, f0_curve, model->prosody_pred->f0_proj_bias);
     ggml_set_name(f0_curve, "f0_out");
 
     struct ggml_tensor * n = cur;
+    n = ggml_cont(ctx, ggml_transpose(ctx, n));
     for (auto block : model->prosody_pred->n_blocks) {
         n = build_ada_residual_conv(ctx, n, block, style_half, model->sqrt_tensor);
     }
-    n = ggml_add(ctx, ggml_conv_1d(ctx, model->prosody_pred->n_proj_kernel, ggml_cont(ctx, ggml_transpose(ctx, n)), 1, 0, 1), model->prosody_pred->n_proj_bias);
+    n = ggml_cont(ctx, ggml_transpose(ctx, n));
+    n = ggml_mul_mat(ctx, model->prosody_pred->n_proj_kernel, n);
+    n = squeeze_3d_2d_e0(ctx, n);
+    n = ggml_add(ctx, n, model->prosody_pred->n_proj_bias);
     ggml_set_name(n, "n_out");
     ggml_build_forward_expand(gf, n);
 
@@ -1188,17 +1212,20 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
     struct ggml_tensor * style_half2 = ggml_view_1d(ctx, voice, voice->ne[0]/2, (batch.n_tokens - 3) * voice->nb[1]);
 
     {
-        f0 = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->f0_conv, f0_curve, 2, 1, 1), model->decoder->f0_conv_bias)));
-        n_base = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->n_conv, n, 2, 1, 1), model->decoder->n_conv_bias)));
-        cur = ggml_concat(ctx, ggml_concat(ctx, asr, f0, 0), n_base, 0);
-
+        f0 = ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->f0_conv, f0_curve, 2, 1, 1), model->decoder->f0_conv_bias);
+        n_base = ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->n_conv, n, 2, 1, 1), model->decoder->n_conv_bias);
+        cur = ggml_concat(ctx, ggml_concat(ctx, ggml_cont(ctx, ggml_transpose(ctx, asr)), f0, 1), n_base, 1);
         cur = build_ada_residual_conv(ctx, cur, model->decoder->encoder_block, style_half2, model->sqrt_tensor);
-        asr_res = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->asr_conv, ggml_cont(ctx, ggml_transpose(ctx, asr)), 1, 0, 1), model->decoder->asr_conv_bias)));
 
+        asr_res = ggml_mul_mat(ctx, model->decoder->asr_conv, asr);
+        asr_res = ggml_add(ctx, asr_res, ggml_transpose(ctx, model->decoder->asr_conv_bias));
+
+        asr_res = ggml_cont(ctx, ggml_transpose(ctx, asr_res));
         for (auto l : model->decoder->decoder_blocks) {
-            cur = ggml_concat(ctx, ggml_concat(ctx, ggml_concat(ctx, cur, asr_res, 0), f0, 0), n_base, 0);
+            cur = ggml_concat(ctx, ggml_concat(ctx, ggml_concat(ctx, cur, asr_res, 1), f0, 1), n_base, 1);
             cur = build_ada_residual_conv(ctx, cur, l, style_half2, model->sqrt_tensor);
         }
+        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
     }
 
     kctx->window_sq_sum = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, kctx->total_duration*model->up_sampling_factor);
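The concat axis changes from 0 to 1 here because the decoder activations are now kept in [time, channels] order until the final transpose, so stacking feature maps means stacking along dim 1. A toy shape check (sizes assumed for illustration):

    // ggml_concat(ctx, a, b, dim) stacks along dim; all other dims must match.
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 100, 512); // [time, ch_a]
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 100, 64);  // [time, ch_b]
    struct ggml_tensor * c = ggml_concat(ctx, a, b, 1);                        // [100, 576]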