Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion transformer_lens/model_bridge/sources/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,13 @@ def boot(
# Ensure all parameters match the requested dtype. Some architectures
# (e.g., MoE models) retain native bfloat16 weights even when
# torch_dtype is specified during from_pretrained().
hf_model = hf_model.to(dtype=dtype)
# Only cast parameters (trainable weights), not buffers. HF
# intentionally keeps some buffers in float32 for precision (e.g.,
# RotaryEmbedding.inv_freq). Casting them to bfloat16 introduces
# rounding that compounds through every attention layer.
for param in hf_model.parameters():
if param.is_floating_point() and param.dtype != dtype:
param.data = param.data.to(dtype=dtype)
adapter.prepare_model(hf_model)
tokenizer = tokenizer
default_padding_side = getattr(adapter.cfg, "default_padding_side", None)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def __init__(self, cfg: Any) -> None:
# native forward pass. The gate (router) is mapped as a submodule
# for hook access.
"mlp": MoEBridge(
name="mlp",
name="block_sparse_moe",
config=self.cfg,
submodules={
"gate": LinearBridge(name="gate"),
Expand Down
165 changes: 90 additions & 75 deletions transformer_lens/tools/model_registry/data/supported_models.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
},
"total_architectures": 35,
"total_models": 5833,
"total_verified": 677,
"total_verified": 686,
"models": [
{
"architecture_id": "Qwen2ForCausalLM",
Expand Down Expand Up @@ -24117,15 +24117,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "DiscoResearch/DiscoLM-mixtral-8x7b-v2",
"status": 0,
"verified_date": null,
"status": 3,
"verified_date": "2026-03-24",
"metadata": null,
"note": null,
"phase1_score": null,
"note": "CORE FAILED: Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed)",
"phase1_score": 0.0,
"phase2_score": null,
"phase3_score": null,
"phase4_score": null,
"phase7_score": null
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "LlamaForCausalLM",
Expand Down Expand Up @@ -28563,15 +28564,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "dphn/dolphin-2.5-mixtral-8x7b",
"status": 0,
"verified_date": null,
"status": 1,
"verified_date": "2026-03-24",
"metadata": null,
"note": null,
"phase1_score": null,
"note": "Core verification completed",
"phase1_score": 100.0,
"phase2_score": null,
"phase3_score": null,
"phase4_score": null,
"phase7_score": null
"phase4_score": 99.4,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "GPT2LMHeadModel",
Expand Down Expand Up @@ -40825,15 +40827,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "BEE-spoke-data/Mixtral-GQA-400m-v2",
"status": 0,
"verified_date": null,
"status": 1,
"verified_date": "2026-03-23",
"metadata": null,
"note": null,
"phase1_score": null,
"phase2_score": null,
"phase3_score": null,
"phase4_score": null,
"phase7_score": null
"note": "Full verification completed",
"phase1_score": 100.0,
"phase2_score": 100.0,
"phase3_score": 100.0,
"phase4_score": 91.0,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "LlamaForCausalLM",
Expand Down Expand Up @@ -42047,15 +42050,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "NeverSleep/Noromaid-v0.1-mixtral-8x7b-Instruct-v3",
"status": 0,
"verified_date": null,
"status": 1,
"verified_date": "2026-03-24",
"metadata": null,
"note": null,
"phase1_score": null,
"note": "Core verification completed",
"phase1_score": 100.0,
"phase2_score": null,
"phase3_score": null,
"phase4_score": null,
"phase7_score": null
"phase4_score": 99.4,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "Qwen2ForCausalLM",
Expand Down Expand Up @@ -53370,15 +53374,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "jondurbin/bagel-8x7b-v0.2",
"status": 0,
"verified_date": null,
"status": 1,
"verified_date": "2026-03-24",
"metadata": null,
"note": null,
"phase1_score": null,
"note": "Core verification completed",
"phase1_score": 100.0,
"phase2_score": null,
"phase3_score": null,
"phase4_score": null,
"phase7_score": null
"phase4_score": 96.9,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "MistralForCausalLM",
Expand Down Expand Up @@ -54787,15 +54792,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "Isotonic/TinyMixtral-4x248M-MoE",
"status": 0,
"verified_date": null,
"status": 1,
"verified_date": "2026-03-23",
"metadata": null,
"note": null,
"phase1_score": null,
"phase2_score": null,
"phase3_score": null,
"phase4_score": null,
"phase7_score": null
"note": "Full verification completed",
"phase1_score": 100.0,
"phase2_score": 100.0,
"phase3_score": 100.0,
"phase4_score": 92.5,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "Qwen2ForCausalLM",
Expand Down Expand Up @@ -55021,15 +55027,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "chargoddard/SmolLlamix-8x101M",
"status": 0,
"verified_date": null,
"status": 1,
"verified_date": "2026-03-23",
"metadata": null,
"note": null,
"phase1_score": null,
"phase2_score": null,
"phase3_score": null,
"phase4_score": null,
"phase7_score": null
"note": "Full verification completed",
"phase1_score": 100.0,
"phase2_score": 100.0,
"phase3_score": 100.0,
"phase4_score": 90.9,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "LlamaForCausalLM",
Expand Down Expand Up @@ -55385,15 +55392,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "chargoddard/SmolLlamix-8x101M-take2",
"status": 0,
"verified_date": null,
"status": 1,
"verified_date": "2026-03-23",
"metadata": null,
"note": null,
"phase1_score": null,
"phase2_score": null,
"phase3_score": null,
"phase4_score": null,
"phase7_score": null
"note": "Full verification completed",
"phase1_score": 100.0,
"phase2_score": 100.0,
"phase3_score": 100.0,
"phase4_score": 97.5,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "LlamaForCausalLM",
Expand Down Expand Up @@ -56464,15 +56472,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "yujiepan/mixtral-8xtiny-random",
"status": 0,
"verified_date": null,
"status": 1,
"verified_date": "2026-03-23",
"metadata": null,
"note": null,
"phase1_score": null,
"phase2_score": null,
"phase3_score": null,
"phase4_score": null,
"phase7_score": null
"note": "Full verification completed with issues, low text quality",
"phase1_score": 100.0,
"phase2_score": 100.0,
"phase3_score": 100.0,
"phase4_score": 47.5,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "MixtralForCausalLM",
Expand Down Expand Up @@ -63956,7 +63965,7 @@
"status": 2,
"verified_date": "2026-03-18",
"metadata": null,
"note": "Estimated 189.6 GB exceeds 121.0 GB limit",
"note": null,
"phase1_score": null,
"phase2_score": null,
"phase3_score": null,
Expand All @@ -63969,7 +63978,7 @@
"status": 2,
"verified_date": "2026-03-18",
"metadata": null,
"note": "Estimated 189.6 GB exceeds 121.0 GB limit",
"note": null,
"phase1_score": null,
"phase2_score": null,
"phase3_score": null,
Expand Down Expand Up @@ -65615,13 +65624,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "llm-jp/llm-jp-3-8x1.8b",
"status": 0,
"verified_date": null,
"status": 2,
"verified_date": "2026-03-23",
"metadata": null,
"note": null,
"note": "Estimated 81.0 GB exceeds 68.4 GB limit",
"phase1_score": null,
"phase2_score": null,
"phase3_score": null
"phase3_score": null,
"phase4_score": null,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "OPTForCausalLM",
Expand Down Expand Up @@ -68936,13 +68948,16 @@
{
"architecture_id": "MixtralForCausalLM",
"model_id": "prometheus-eval/prometheus-8x7b-v2.0",
"status": 0,
"verified_date": null,
"status": 1,
"verified_date": "2026-03-24",
"metadata": null,
"note": null,
"phase1_score": null,
"note": "Core verification completed",
"phase1_score": 100.0,
"phase2_score": null,
"phase3_score": null
"phase3_score": null,
"phase4_score": 99.4,
"phase7_score": null,
"phase8_score": null
},
{
"architecture_id": "MixtralForCausalLM",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"last_updated": "2026-03-19T13:52:40.585159",
"last_updated": "2026-03-23T20:31:54.411804",
"records": [
{
"model_id": "Macropodus/macbert4mdcspell_v1",
Expand Down Expand Up @@ -10430,6 +10430,76 @@
"notes": "Full verification completed",
"invalidated": false,
"invalidation_reason": null
},
{
"model_id": "Isotonic/TinyMixtral-4x248M-MoE",
"architecture_id": "MixtralForCausalLM",
"verified_date": "2026-03-23",
"verified_by": "verify_models",
"transformerlens_version": null,
"notes": "Full verification completed",
"invalidated": false,
"invalidation_reason": null
},
{
"model_id": "chargoddard/SmolLlamix-8x101M",
"architecture_id": "MixtralForCausalLM",
"verified_date": "2026-03-23",
"verified_by": "verify_models",
"transformerlens_version": null,
"notes": "Full verification completed",
"invalidated": false,
"invalidation_reason": null
},
{
"model_id": "Isotonic/TinyMixtral-4x248M-MoE",
"architecture_id": "MixtralForCausalLM",
"verified_date": "2026-03-23",
"verified_by": "verify_models",
"transformerlens_version": null,
"notes": "Full verification completed",
"invalidated": false,
"invalidation_reason": null
},
{
"model_id": "chargoddard/SmolLlamix-8x101M",
"architecture_id": "MixtralForCausalLM",
"verified_date": "2026-03-23",
"verified_by": "verify_models",
"transformerlens_version": null,
"notes": "Full verification completed",
"invalidated": false,
"invalidation_reason": null
},
{
"model_id": "chargoddard/SmolLlamix-8x101M-take2",
"architecture_id": "MixtralForCausalLM",
"verified_date": "2026-03-23",
"verified_by": "verify_models",
"transformerlens_version": null,
"notes": "Full verification completed",
"invalidated": false,
"invalidation_reason": null
},
{
"model_id": "BEE-spoke-data/Mixtral-GQA-400m-v2",
"architecture_id": "MixtralForCausalLM",
"verified_date": "2026-03-23",
"verified_by": "verify_models",
"transformerlens_version": null,
"notes": "Full verification completed",
"invalidated": false,
"invalidation_reason": null
},
{
"model_id": "yujiepan/mixtral-8xtiny-random",
"architecture_id": "MixtralForCausalLM",
"verified_date": "2026-03-23",
"verified_by": "verify_models",
"transformerlens_version": null,
"notes": "Full verification completed with issues, low text quality",
"invalidated": false,
"invalidation_reason": null
}
]
}
Loading
Loading