Use the tuned block sizes for the paged attention Pallas kernel (NUM_QUERIES_PER_BLOCK=16, NUM_KV_PAGES_PER_BLOCK=256). (#14329)

vanbasten23 · web-flow · commit 1e3598edeb69 · 2025-03-07T13:25:13.000Z
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
@@ -12,8 +12,8 @@
 from vllm.attention.backends.utils import CommonAttentionState
 
 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 32
-NUM_KV_PAGES_PER_BLOCK = 128
+NUM_QUERIES_PER_BLOCK = 16
+NUM_KV_PAGES_PER_BLOCK = 256
 
 
 class PallasAttentionBackend(AttentionBackend):