Kameshr committed on
Commit
c9f3e76
·
verified ·
1 Parent(s): 551090e

Update modeling_qwen2_custom.py

Browse files
Files changed (1) hide show
  1. modeling_qwen2_custom.py +13 -3
modeling_qwen2_custom.py CHANGED
@@ -1,15 +1,24 @@
1
  from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer, Qwen2Model, Qwen2ForCausalLM
2
  from transformers.configuration_utils import PretrainedConfig
 
3
  import torch
4
  import torch.nn as nn
5
 
6
  class CustomQwen2DecoderLayer(Qwen2DecoderLayer):
7
  def __init__(self, config, layer_idx):
8
  super().__init__(config, layer_idx)
9
- # FIX: Reverted to 1D shape [5120] to match the saved checkpoint
 
 
 
 
 
 
 
 
10
  self.register_buffer(
11
  "resid_bias",
12
- torch.zeros(config.hidden_size),
13
  persistent=True
14
  )
15
 
@@ -23,7 +32,8 @@ class CustomQwen2DecoderLayer(Qwen2DecoderLayer):
23
  bias = self.resid_bias.to(hidden.device).to(hidden.dtype)
24
 
25
  if bias.norm() > 0:
26
- # view(1, 1, -1) safely converts [5120] -> [1, 1, 5120] for broadcasting
 
27
  hidden = hidden + bias.view(1, 1, -1)
28
 
29
  if isinstance(outputs, tuple): outputs = (hidden,) + outputs[1:]
 
1
  from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer, Qwen2Model, Qwen2ForCausalLM
2
  from transformers.configuration_utils import PretrainedConfig
3
+ from transformers import AutoConfig
4
  import torch
5
  import torch.nn as nn
6
 
7
  class CustomQwen2DecoderLayer(Qwen2DecoderLayer):
8
def __init__(self, config, layer_idx):
    """Decoder layer carrying a persistent additive residual-bias buffer.

    The saved checkpoint stores ``resid_bias`` with two different shapes:
    layers 28-53 were serialized as ``[1, hidden_size]`` while every other
    layer was serialized as ``[hidden_size]``.  ``load_state_dict`` requires
    the registered buffer to match the stored shape exactly, so the shape is
    selected from ``layer_idx`` before registration.

    Args:
        config: model configuration; only ``hidden_size`` is read here.
        layer_idx: zero-based index of this decoder layer in the stack.
    """
    super().__init__(config, layer_idx)

    # Mirror the per-layer shape the checkpoint was written with.
    saved_as_2d = 28 <= layer_idx <= 53
    buffer_shape = (1, config.hidden_size) if saved_as_2d else (config.hidden_size,)

    # persistent=True so the buffer round-trips through state_dict.
    self.register_buffer("resid_bias", torch.zeros(buffer_shape), persistent=True)
24
 
 
32
  bias = self.resid_bias.to(hidden.device).to(hidden.dtype)
33
 
34
  if bias.norm() > 0:
35
+ # view(1, 1, -1) flattens either saved shape — [hidden_size] or
36
+ # [1, hidden_size] — to [1, 1, hidden_size] so it broadcasts over (batch, seq).
37
  hidden = hidden + bias.view(1, 1, -1)
38
 
39
  if isinstance(outputs, tuple): outputs = (hidden,) + outputs[1:]