
Commit 4526e64

June24-Wu and June authored
[New Question] Attention with Linear Biases (Medium) (#78)
Co-authored-by: June <[email protected]>
1 parent 596424f commit 4526e64


6 files changed: +262 −0 lines changed
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
<p>
Implement Attention with Linear Biases (ALiBi), following the method described in
<a href="https://arxiv.org/pdf/2108.12409" target="_blank">
"Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation"
</a>, for a given set of matrices.
Given the query matrix <code>Q</code> of size <code>M×d</code>, the key matrix <code>K</code> of size <code>N×d</code>, and the value matrix
<code>V</code> of size <code>N×d</code>, your program should compute the output matrix using the formula:
</p>

<p>
$$
\text{Attention}_{\text{ALiBi}}(Q, K, V) = \text{softmax}\Bigl( \frac{QK^T}{\sqrt{d}} + \alpha \cdot \Delta \Bigr)V
$$
</p>

<p>
where &alpha; is a scalar slope controlling the strength of the linear bias and &Delta; is the <code>M×N</code> matrix of relative positions with entries <code>&Delta;<sub>ij</sub> = i - j</code> for query index <code>i</code> and key index <code>j</code>.
The softmax function is applied row-wise. <code>Q</code>, <code>K</code>, <code>V</code>, <code>output</code>, and <code>&alpha;</code> are all of data type <code>float32</code>;
<code>M</code>, <code>N</code>, and <code>d</code> are of data type <code>int32</code>.
</p>
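
<p>
For intuition, a minimal PyTorch sketch of the formula above; the helper name <code>alibi_attention</code> is illustrative only (it is not the required <code>solve</code> entry point) and it assumes <code>Q</code>, <code>K</code>, <code>V</code> are <code>float32</code> tensors of shapes <code>M×d</code>, <code>N×d</code>, and <code>N×d</code>:
</p>
<pre><code>import torch

# Builds the M×N relative-position matrix Delta and applies the biased, row-wise softmax.
def alibi_attention(Q, K, V, alpha):
    M, N = Q.shape[0], K.shape[0]
    d = Q.shape[1]
    scores = Q @ K.t() / d ** 0.5                                        # (M, N)
    delta = (torch.arange(M, device=Q.device).unsqueeze(1)
             - torch.arange(N, device=Q.device).unsqueeze(0))            # delta[i, j] = i - j
    weights = torch.softmax(scores + alpha * delta, dim=1)               # row-wise softmax
    return weights @ V                                                   # (M, d)
</code></pre>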

<h2>Implementation Requirements</h2>
<ul>
<li>Use only native features (external libraries are not permitted)</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in the output matrix <code>output</code></li>
</ul>

<h2>Example 1:</h2>
<p>
<strong>Input:</strong><br>
<code>Q</code> (2×4):
\[
\begin{bmatrix}
1.0 & 0.0 & 0.0 & 0.0 \\
0.0 & 1.0 & 0.0 & 0.0
\end{bmatrix}
\]
<code>K</code> (3×4):
\[
\begin{bmatrix}
1.0 & 0.0 & 0.0 & 0.0 \\
0.0 & 1.0 & 0.0 & 0.0 \\
0.0 & 0.0 & 1.0 & 0.0
\end{bmatrix}
\]
<code>V</code> (3×4):
\[
\begin{bmatrix}
1.0 & 2.0 & 3.0 & 4.0 \\
5.0 & 6.0 & 7.0 & 8.0 \\
9.0 & 10.0 & 11.0 & 12.0
\end{bmatrix}
\]
\(\alpha = 0.5\)
</p>

<p>
<strong>Output:</strong><br>
<code>output</code> (2×4):
\[
\begin{bmatrix}
3.05 & 4.05 & 5.05 & 6.05 \\
3.93 & 4.93 & 5.93 & 6.93
\end{bmatrix}
\]
</p>
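
<p>
As a check of the first output row: the scaled scores are \( [1, 0, 0]/\sqrt{4} = [0.5, 0, 0] \), the bias for query row \( i = 0 \) is \( 0.5 \cdot [0, -1, -2] = [0, -0.5, -1] \), so the logits are \( [0.5, -0.5, -1] \), the softmax weights are approximately \( [0.629, 0.231, 0.140] \), and the corresponding weighted sum of the rows of <code>V</code> is approximately \( [3.05, 4.05, 5.05, 6.05] \).
</p>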

<h2>Example 2:</h2>
<p>
<strong>Input:</strong><br>
<code>Q</code> (1×2):
\[
\begin{bmatrix}
1.0 & 2.0
\end{bmatrix}
\]
<code>K</code> (2×2):
\[
\begin{bmatrix}
1.0 & 0.0 \\
0.0 & 1.0
\end{bmatrix}
\]
<code>V</code> (2×2):
\[
\begin{bmatrix}
3.0 & 4.0 \\
5.0 & 6.0
\end{bmatrix}
\]
\(\alpha = 0.8\)
</p>

<p>
<strong>Output:</strong><br>
<code>output</code> (1×2):
\[
\begin{bmatrix}
3.95 & 4.95
\end{bmatrix}
\]
</p>

<h2>Constraints</h2>
<ul>
<li>Matrix <code>Q</code> is of size <code>M×d</code> and matrices <code>K</code> and <code>V</code> are of size <code>N×d</code></li>
<li>1 &le; <code>M</code>, <code>N</code> &le; 2048</li>
<li>1 &le; <code>d</code> &le; 1024</li>
<li>-1.0 &le; <code>&alpha;</code> &le; 1.0</li>
</ul>
Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
import ctypes
from typing import Any, List, Dict
import torch
from core.challenge_base import ChallengeBase

class Challenge(ChallengeBase):
    def __init__(self):
        super().__init__(
            name="Attention with Linear Biases",
            atol=1e-04,
            rtol=1e-04,
            num_gpus=1,
            access_tier="free"
        )

    def reference_impl(self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, output: torch.Tensor, M: int, N: int, d: int, alpha: float):
        assert Q.shape == (M, d)
        assert K.shape == (N, d)
        assert V.shape == (N, d)
        assert output.shape == (M, d)

        # Scaled dot-product scores, shape (M, N)
        scale = d ** 0.5
        attn = torch.matmul(Q, K.t()) / scale

        # Linear bias: pos_bias[i, j] = alpha * (i - j)
        pos_bias = alpha * (torch.arange(M, device=Q.device).unsqueeze(1) - torch.arange(N, device=K.device).unsqueeze(0))
        attn = attn + pos_bias

        attn = torch.softmax(attn, dim=1)  # (M, N), softmax applied row-wise
        torch.matmul(attn, V, out=output)

    def get_solve_signature(self) -> Dict[str, Any]:
        return {
            "Q": ctypes.POINTER(ctypes.c_float),
            "K": ctypes.POINTER(ctypes.c_float),
            "V": ctypes.POINTER(ctypes.c_float),
            "output": ctypes.POINTER(ctypes.c_float),
            "M": ctypes.c_int,
            "N": ctypes.c_int,
            "d": ctypes.c_int,
            "alpha": ctypes.c_float,
        }

    def generate_example_test(self) -> Dict[str, Any]:
        dtype = torch.float32
        Q = torch.tensor([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]], device="cuda", dtype=dtype)
        K = torch.tensor([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]], device="cuda", dtype=dtype)
        V = torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]], device="cuda", dtype=dtype)
        output = torch.empty(2, 4, device="cuda", dtype=dtype)
        return {"Q": Q, "K": K, "V": V, "output": output, "M": 2, "N": 3, "d": 4, "alpha": 0.5}

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        dtype = torch.float32
        tests = []

        # basic_example 1
        tests.append({
            "Q": torch.tensor([[1.0, 2.0]], device="cuda", dtype=dtype),
            "K": torch.tensor([[1.0, 0.0], [0.0, 1.0]], device="cuda", dtype=dtype),
            "V": torch.tensor([[3.0, 4.0], [5.0, 6.0]], device="cuda", dtype=dtype),
            "output": torch.empty(1, 2, device="cuda", dtype=dtype),
            "M": 1, "N": 2, "d": 2, "alpha": 0.8
        })

        # basic_example 2
        tests.append({
            "Q": torch.tensor([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]], device="cuda", dtype=dtype),
            "K": torch.tensor([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]], device="cuda", dtype=dtype),
            "V": torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]], device="cuda", dtype=dtype),
            "output": torch.empty(2, 4, device="cuda", dtype=dtype),
            "M": 2, "N": 3, "d": 4, "alpha": 0.5
        })

        # zero_matrices
        tests.append({
            "Q": torch.zeros((3, 5), device="cuda", dtype=dtype),
            "K": torch.zeros((3, 5), device="cuda", dtype=dtype),
            "V": torch.zeros((3, 5), device="cuda", dtype=dtype),
            "output": torch.empty(3, 5, device="cuda", dtype=dtype),
            "M": 3, "N": 3, "d": 5, "alpha": 0.5
        })

        # mixed_values
        tests.append({
            "Q": torch.tensor([[-1.0, 2.0, -3.0], [4.0, -5.0, 6.0], [-7.0, 8.0, -9.0], [10.0, -11.0, 12.0]], device="cuda", dtype=dtype),
            "K": torch.tensor([[2.0, -1.0, 3.0], [-4.0, 5.0, -6.0], [7.0, -8.0, 9.0], [-10.0, 11.0, -12.0]], device="cuda", dtype=dtype),
            "V": torch.tensor([[1.0, 0.5, -0.5], [-1.0, 2.0, 3.0], [4.0, -2.0, 1.0], [0.0, 1.0, -1.0]], device="cuda", dtype=dtype),
            "output": torch.empty(4, 3, device="cuda", dtype=dtype),
            "M": 4, "N": 4, "d": 3, "alpha": 1.0
        })

        # large_matrices
        tests.append({
            "Q": torch.empty((64, 32), device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
            "K": torch.empty((128, 32), device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
            "V": torch.empty((128, 32), device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
            "output": torch.empty(64, 32, device="cuda", dtype=dtype),
            "M": 64, "N": 128, "d": 32, "alpha": -0.76
        })

        # different alpha
        tests.append({
            "Q": torch.empty((64, 32), device="cuda", dtype=dtype).uniform_(-1, 1),
            "K": torch.empty((128, 32), device="cuda", dtype=dtype).uniform_(-1, 1),
            "V": torch.empty((128, 32), device="cuda", dtype=dtype).uniform_(-1, 1),
            "output": torch.empty(64, 32, device="cuda", dtype=dtype),
            "M": 64, "N": 128, "d": 32, "alpha": -0.3
        })

        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        dtype = torch.float32
        M, N, d = 2048, 2048, 1024
        Q = torch.empty((M, d), device="cuda", dtype=dtype).uniform_(-0.1, 0.1)
        K = torch.empty((N, d), device="cuda", dtype=dtype).uniform_(-0.1, 0.1)
        V = torch.empty((N, d), device="cuda", dtype=dtype).uniform_(-0.1, 0.1)
        output = torch.empty(M, d, device="cuda", dtype=dtype)
        return {"Q": Q, "K": K, "V": V, "output": output, "M": M, "N": N, "d": d, "alpha": 0.5}
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
#include <cuda_runtime.h>

// Q, K, V, output are device pointers
extern "C" void solve(const float* Q, const float* K, const float* V, float* output, int M, int N, int d, float alpha) {

}
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# Q, K, V, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(Q: UnsafePointer[Float32], K: UnsafePointer[Float32], V: UnsafePointer[Float32],
          output: UnsafePointer[Float32], M: Int32, N: Int32, d: Int32, alpha: Float32):
    pass
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import torch

# Q, K, V, output are tensors on the GPU
def solve(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, output: torch.Tensor,
          M: int, N: int, d: int, alpha: float):
    pass
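
One way this template could be completed, mirroring reference_impl in the challenge definition above; a sketch, not necessarily the intended or fastest solution, writing the result into the preallocated output tensor:

import torch

# Pure-PyTorch solve: scaled scores, ALiBi bias alpha * (i - j), row-wise softmax, then V.
def solve(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, output: torch.Tensor,
          M: int, N: int, d: int, alpha: float):
    scores = Q @ K.t() / d ** 0.5                                    # (M, N)
    delta = (torch.arange(M, device=Q.device).unsqueeze(1)
             - torch.arange(N, device=Q.device).unsqueeze(0))        # delta[i, j] = i - j
    weights = torch.softmax(scores + alpha * delta, dim=1)
    torch.matmul(weights, V, out=output)                             # store result in output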
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
import torch
import triton
import triton.language as tl

# Q, K, V, output are tensors on the GPU
def solve(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, output: torch.Tensor, M: int, N: int, d: int, alpha: float):
    pass
