Commit 36152fb

Add qwen 2.5

1 parent d99970b

4 files changed: +128 additions, -8 deletions

examples/models/llama/attention.py
Lines changed: 4 additions & 3 deletions

@@ -175,9 +175,10 @@ def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
         self.max_batch_size = args.max_batch_size
         self.max_context_len = args.max_context_len
         self.dim = args.dim
-        self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False)
-        self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
+        # TODO: parametrize bias for attention and feedforward.
+        self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=True)
+        self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=True)
+        self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=True)
         self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)

         self.layer_id = layer_id
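
For reference, Qwen2/2.5 checkpoints carry bias terms on the query/key/value projections (the output projection stays bias-free), which is why the hard-coded bias=False is flipped to bias=True; the TODO notes that this should eventually be configurable rather than hard-coded. A minimal sketch of what such a parametrization could look like, using a hypothetical qkv_bias flag that is not part of this diff:

import torch.nn as nn

class AttentionProjections(nn.Module):
    """Sketch only: gate the Q/K/V bias on a flag instead of hard-coding it.
    The qkv_bias argument is an assumed name, not a ModelArgs field."""

    def __init__(self, dim: int, n_heads: int, n_kv_heads: int, head_dim: int, qkv_bias: bool = True):
        super().__init__()
        # Qwen2-style: biases on Q/K/V, none on the output projection.
        self.wq = nn.Linear(dim, n_heads * head_dim, bias=qkv_bias)
        self.wk = nn.Linear(dim, n_kv_heads * head_dim, bias=qkv_bias)
        self.wv = nn.Linear(dim, n_kv_heads * head_dim, bias=qkv_bias)
        self.wo = nn.Linear(n_heads * head_dim, dim, bias=False)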

examples/models/llama/model.py
Lines changed: 2 additions & 1 deletion

@@ -150,6 +150,7 @@ def __init__(self, **kwargs):
             input_prune_map=input_prune_map,
             output_prune_map=output_prune_map,
             enable_dynamic_shape=self.enable_dynamic_shape,
+            use_hf_rope=True,
             **params,
         )

@@ -170,7 +171,7 @@ def __init__(self, **kwargs):

         # Within the device="meta" context, tensors that are created do not carry data.
         # They possess all other metadata a tensor carries such as size, stride, requires_grad.
-        with torch.device("meta"):
+        with torch.device("cpu"):
             self.model_ = Transformer(model_args)

         if "int8" in str(checkpoint_path):

examples/models/llama/rope.py
Lines changed: 49 additions & 4 deletions

@@ -114,6 +114,7 @@ def apply_rotary_emb_to_k(
     return xk_out.type_as(xk)


+# Wrap apply_rotary_emb in a module to enable it to be module swapped out.
 class RotaryEmbedding(torch.nn.Module):
     def __init__(self):
         super().__init__()

@@ -209,18 +210,66 @@ def hf_apply_rotary_emb_to_k(k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return k_embed


+# ======================= Qwen2 Implementation ========================
+
+
+def qwen_precompute_freqs_cis(dim: int, end: int, theta: float = 1_000_000.0):
+    """
+    Precompute frequency tensor for Qwen2-style RoPE.
+    """
+    freqs = 1.0 / (
+        theta ** (torch.arange(0, dim, 2, device="cpu")[: (dim // 2)].float() / dim)
+    )
+    t = torch.arange(end, device=freqs.device)
+    freqs = torch.outer(t, freqs).float()
+    freqs_cos = torch.cos(freqs)
+    freqs_sin = torch.sin(freqs)
+    return freqs_cos, freqs_sin
+
+
+def qwen_apply_rotary_emb(
+    q: torch.Tensor, k: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply Qwen2-style RoPE to query and key tensors.
+    """
+    def rotate_half(x):
+        """Rotates half the hidden dims of the input."""
+        x1 = x[..., : x.shape[-1] // 2]
+        x2 = x[..., x.shape[-1] // 2 :]
+        return torch.cat((-x2, x1), dim=-1)
+
+    # Reshape cos and sin for broadcasting
+    cos = freqs_cos.unsqueeze(1)  # [seq_len, 1, head_dim]
+    sin = freqs_sin.unsqueeze(1)  # [seq_len, 1, head_dim]
+
+    # Apply rotation
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
 class Rope(torch.nn.Module):
     def __init__(self, params: ModelArgs):
         super().__init__()
         self.params = params
+
+        # Choose the appropriate RoPE implementation
         if self.params.use_hf_rope:
             self.precompute_freqs_cis = hf_precompute_freqs_cis
+            self.apply_rotary_emb = hf_apply_rotary_emb
+        # elif self.params.use_qwen_rope:
+        #     self.precompute_freqs_cis = qwen_precompute_freqs_cis
+        #     self.apply_rotary_emb = qwen_apply_rotary_emb
         else:
             self.precompute_freqs_cis = partial(
                 precompute_freqs_cis,
                 use_scaled=self.params.use_scaled_rope,
                 scale_factor=self.params.rope_scale_factor,
             )
+            self.apply_rotary_emb = RotaryEmbedding()
+
+        # Precompute frequencies
         freqs_cos, freqs_sin = self.precompute_freqs_cis(
             self.params.head_dim,
             (

@@ -232,10 +281,6 @@ def __init__(self, params: ModelArgs):
         )
         self.register_buffer("freqs_cos", freqs_cos, persistent=False)
         self.register_buffer("freqs_sin", freqs_sin, persistent=False)
-        if self.params.use_hf_rope:
-            self.apply_rotary_emb = hf_apply_rotary_emb
-        else:
-            self.apply_rotary_emb = RotaryEmbedding()

     def forward(
         self,
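
The qwen_* helpers above implement the "rotate-half" RoPE formulation used by Hugging Face's Qwen2 code, though they are only wired in as a commented-out branch since use_hf_rope=True already routes through the HF implementation. Below is a self-contained sketch of that formulation with illustrative shapes; note that the cos/sin tables here are duplicated across both halves of head_dim so they broadcast against q and k (the helper above returns only dim // 2 columns, so it would need the same duplication before use):

import torch

def rotate_half(x):
    # Swap and negate the two halves of the last dimension.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

bsz, seq_len, n_heads, head_dim = 1, 8, 2, 64
q = torch.randn(bsz, seq_len, n_heads, head_dim)
k = torch.randn(bsz, seq_len, n_heads, head_dim)

theta = 1_000_000.0  # Qwen2.5 rope_theta
inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
angles = torch.outer(torch.arange(seq_len).float(), inv_freq)  # [seq_len, head_dim // 2]
# Duplicate across both halves so cos/sin cover the full head_dim.
cos = torch.cat((angles, angles), dim=-1).cos()[:, None, :]  # [seq_len, 1, head_dim]
sin = torch.cat((angles, angles), dim=-1).sin()[:, None, :]

q_rot = q * cos + rotate_half(q) * sin
k_rot = k * cos + rotate_half(k) * sin
print(q_rot.shape, k_rot.shape)  # torch.Size([1, 8, 2, 64]) twice
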
New file (Qwen2.5 weight conversion script)
Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
+from typing import Dict
+
+from torchtune.training import FullModelHFCheckpointer
+# from torchtune.models import convert_weights
+from torchtune.models.convert_weights import get_mapped_key
+import torch
+
+# Standard _FROM_META weight mapping from TorchTune + additional bias weight mappings.
+_QWEN_2_FROM_META = {
+    "tok_embeddings.weight": "tok_embeddings.weight",
+    "norm.weight": "norm.scale",
+    "output.weight": "output.weight",
+    "layers.{}.attention.wk.weight": "layers.{}.attn.k_proj.weight",
+    "layers.{}.attention.wk.bias": "layers.{}.attn.k_proj.bias",
+    "layers.{}.attention.wq.weight": "layers.{}.attn.q_proj.weight",
+    "layers.{}.attention.wq.bias": "layers.{}.attn.q_proj.bias",
+    "layers.{}.attention.wv.weight": "layers.{}.attn.v_proj.weight",
+    "layers.{}.attention.wv.bias": "layers.{}.attn.v_proj.bias",
+    "layers.{}.attention.wo.weight": "layers.{}.attn.output_proj.weight",
+    "layers.{}.attention_norm.weight": "layers.{}.sa_norm.scale",
+    "layers.{}.ffn_norm.weight": "layers.{}.mlp_norm.scale",
+    "layers.{}.feed_forward.w1.weight": "layers.{}.mlp.w1.weight",
+    "layers.{}.feed_forward.w2.weight": "layers.{}.mlp.w2.weight",
+    "layers.{}.feed_forward.w3.weight": "layers.{}.mlp.w3.weight",
+
+}
+
+def qwen_2_tune_to_meta(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    """
+    Convert a state dict from torchtune's format to Meta's format. This function
+    doesn't handle any sharding or splitting of state dicts. It follows the
+    state_dict IN -> state_dict OUT pattern.
+
+    Args:
+        state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format.
+
+    Returns:
+        Dict[str, torch.Tensor]: State dict in Meta's format.
+    """
+    converted_state_dict = {}
+    inverted_mapping_dict = {v: k for k, v in _QWEN_2_FROM_META.items()}
+
+    for key, value in state_dict.items():
+        new_key = get_mapped_key(key, inverted_mapping_dict)
+        converted_state_dict[new_key] = value
+
+    return converted_state_dict
+
+# TODO: no need to use TorchTune checkpointer, can just aggregate checkpoint files by ourselves.
+checkpointer = FullModelHFCheckpointer(
+    checkpoint_dir='/home/jackzhxng/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/',
+    checkpoint_files=['model.safetensors'],
+    output_dir='.',
+    model_type='QWEN2'
+)
+
+print("Loading checkpoint")
+sd = checkpointer.load_checkpoint()
+
+print("HF weights:")
+for weight in sd["model"].keys():
+    print(weight)
+print()
+
+# Convert from TorchTune to Meta (PyTorch native)
+sd = qwen_2_tune_to_meta(sd['model'])
+
+print("Meta weights:")
+for weight in sd.keys():
+    print(weight)
+
+print("Saving checkpoint")
+torch.save(sd, "/home/jackzhxng/models/qwen2_5-1_5b.pth")
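
The conversion loop relies on torchtune's get_mapped_key, which abstracts the layer index out of a key, looks the resulting template up in the mapping, and re-inserts the index. A tiny illustration with a hypothetical layer-7 key, using one entry of the inverted (torchtune -> Meta) mapping built above:

from torchtune.models.convert_weights import get_mapped_key

inverted = {"layers.{}.attn.q_proj.bias": "layers.{}.attention.wq.bias"}
print(get_mapped_key("layers.7.attn.q_proj.bias", inverted))
# Expected: layers.7.attention.wq.bias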
