Skip to content

Commit 8f8659c

Browse files
gjc0824pisceskkk authored
and committed
[Refactor] all gather the accurate context lengths
Co-authored-by: gaojc <[email protected]>
Co-authored-by: QiuChunshuo <[email protected]>
Signed-off-by: gaojc <[email protected]>
Signed-off-by: QiuChunshuo <[email protected]>
1 parent 1f8ffde commit 8f8659c

File tree

1 file changed

+25
-17
lines changed

1 file changed

+25
-17
lines changed

vllm/v1/attention/backends/mla/common.py

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@
195195

196196
import torch
197197
from tqdm import tqdm
198+
import numpy as np
198199

199200
import vllm.envs as envs
200201
from vllm import _custom_ops as ops
@@ -845,15 +846,7 @@ def build(
845846
None,
846847
self.dcp_local_block_size,
847848
)
848-
# Note(qcs): The max local context lengths
849-
# padded to `dcp_local_block_size`.
850-
local_context_lens_cpu = (
851-
cdiv(
852-
context_lens_cpu,
853-
self.dcp_virtual_block_size,
854-
)
855-
* self.dcp_local_block_size
856-
)
849+
local_context_lens_cpu = local_context_lens_allrank[:, self.dcp_rank]
857850
# Note(hc): The above max_context_chunk already enforces
858851
# block_size alignment, DCP just need the block_size can
859852
# be divisible by dcp_world_size, because DCP use
@@ -989,7 +982,7 @@ def reorg_kvcache(
989982
local_context_lens_allrank: list[list[int]],
990983
sum_seq_len: int,
991984
max_seq_len: int,
992-
toks: int,
985+
local_context_lens_sum: list[int],
993986
) -> tuple[torch.Tensor, torch.Tensor]:
994987
"""
995988
reorg kvcache after cp local gather to tp layout for attn kernel.
@@ -1000,31 +993,35 @@ def reorg_kvcache(
1000993
local_context_lens_allrank: local context lengths on each CP rank.
1001994
sum_seq_len: the sum of cp_chunk_seq_lens_lst.
1002995
max_seq_len: the max value of cp_chunk_seq_lens_lst.
1003-
toks: the number of tokens for local gather cache.
996+
local_context_lens_sum: the total context tokens of all requests
997+
on each CP rank.
1004998
"""
1005999
kv_c_segments = []
10061000
k_pe_segments = []
10071001
src_token_idx = 0
10081002
max_seq_len_check = 0
1003+
10091004
for local_chunk_seq_len, local_context_lens in zip(
10101005
local_chunk_seq_lens_lst, local_context_lens_allrank
10111006
):
10121007
cur_seq_len = 0
1008+
context_len_across_rank = 0
10131009
for rank, local_context_len in enumerate(local_context_lens):
10141010
if local_context_len != 0:
10151011
kv_c_segment = allgatered_kv_c_normed[
1016-
rank * toks + src_token_idx : rank * toks
1012+
context_len_across_rank + src_token_idx : context_len_across_rank
10171013
+ src_token_idx
10181014
+ local_context_len
10191015
]
10201016
k_pe_segment = allgatered_k_pe[
1021-
rank * toks + src_token_idx : rank * toks
1017+
context_len_across_rank + src_token_idx : context_len_across_rank
10221018
+ src_token_idx
10231019
+ local_context_len
10241020
]
10251021
kv_c_segments.append(kv_c_segment)
10261022
k_pe_segments.append(k_pe_segment)
10271023
cur_seq_len += local_context_len
1024+
context_len_across_rank += local_context_lens_sum[rank]
10281025
max_seq_len_check = max(max_seq_len_check, cur_seq_len)
10291026
src_token_idx += local_chunk_seq_len
10301027
reorganized_kv_c_normed = torch.cat(kv_c_segments, dim=0)
@@ -1613,11 +1610,21 @@ def _context_parallel_compute_prefill_context(
16131610
cur_allgather_workspace = workspace[
16141611
allgather_offset : allgather_offset * (1 + dcp_world_size)
16151612
]
1613+
local_context_lens_allrank = (
1614+
prefill_metadata.chunked_context.local_context_lens_allrank
1615+
)
1616+
local_context_lens_sum = np.sum(local_context_lens_allrank, axis=0).tolist()
16161617
assert toks * dcp_world_size <= cur_allgather_workspace.shape[0]
1617-
cur_allgather_kvcache = cur_allgather_workspace[: toks * dcp_world_size]
1618+
cur_allgather_kvcache = cur_allgather_workspace[: sum(local_context_lens_sum)]
1619+
16181620
cur_allgather_kvcache.copy_(
1619-
get_dcp_group().all_gather(local_gathered_kvcache, dim=0)
1621+
get_dcp_group().all_gatherv(
1622+
local_gathered_kvcache,
1623+
dim=0,
1624+
sizes=local_context_lens_sum
1625+
)
16201626
)
1627+
16211628
assert (
16221629
cur_allgather_kvcache.shape[-1]
16231630
== self.kv_lora_rank + self.qk_rope_head_dim
@@ -1632,10 +1639,11 @@ def _context_parallel_compute_prefill_context(
16321639
local_chunk_seq_lens_lst=prefill_metadata.chunked_context.local_chunk_seq_lens[
16331640
i
16341641
],
1635-
local_context_lens_allrank=prefill_metadata.chunked_context.local_context_lens_allrank,
1642+
local_context_lens_allrank=
1643+
prefill_metadata.chunked_context.local_context_lens_allrank,
16361644
sum_seq_len=prefill_metadata.chunked_context.cu_seq_lens_lst[i][-1],
16371645
max_seq_len=prefill_metadata.chunked_context.max_seq_lens[i],
1638-
toks=toks,
1646+
local_context_lens_sum=local_context_lens_sum,
16391647
)
16401648

16411649
kv_nope = self.kv_b_proj(kv_c_normed)[0].view(

0 commit comments

Comments
 (0)