diff --git a/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py b/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py index 7cd7214443..a868fddc13 100644 --- a/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py +++ b/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py @@ -602,9 +602,9 @@ def __init__(self, linear_layer: nn.Linear, activation_hooks_kwargs: dict): assert self.optimize_for in ["latency", "memory"] self.hidden_size = model_config.hidden_size - self.n_heads_in_group = block_config.attention.n_heads_in_group self.num_q_heads = model_config.num_attention_heads - self.num_kv_heads = self.num_q_heads // self.n_heads_in_group + self.num_kv_heads = block_config.attention.num_key_value_heads + self.n_heads_in_group = self.num_q_heads // self.num_kv_heads self.head_dim = getattr(model_config, "head_dim", self.hidden_size // self.num_q_heads) self.agg_kv_head_contributions = torch.zeros( @@ -1142,61 +1142,39 @@ def __call__( class Qwen3VLRemoveExpertsIndependentHook(RemoveExpertsIndependentHook): - """Expert removal importance hook for Qwen3-VL models. - - TODO: Implement get_router_logits_and_routed_experts based on Qwen3-VL MoE forward pass. - """ + """Expert removal importance hook for Qwen3-VL models.""" def get_router_logits_and_routed_experts( self, hidden_states: torch.Tensor, router_logits: torch.Tensor | None = None ) -> tuple[torch.Tensor, torch.Tensor]: """Extract router logits and expert outputs for Qwen3-VL MoE. - Note: This is a placeholder implementation. Implement based on Qwen3VLMoeSparseMoe forward. + Based on Qwen3VLMoeSparseMoe forward pass. 
""" - batch_size = ( - hidden_states.shape[0] * hidden_states.shape[1] - if hidden_states.ndim > 2 - else hidden_states.shape[0] - ) - router_logits_out = torch.zeros( - batch_size, self.num_local_experts, device=hidden_states.device - ) - routed_experts = hidden_states.view(-1, hidden_states.shape[-1]) - return router_logits_out, routed_experts + orig_shape = hidden_states.shape + # Flatten to (num_tokens, hidden_size) for processing + hidden_states_flat = hidden_states.reshape(-1, self.moe.hidden_size) -class GptOssRemoveExpertsIndependentHook(RemoveExpertsIndependentHook): - """Expert removal importance hook for GPT-OSS models. + if router_logits is None: + router_logits = self.moe.gate(hidden_states_flat) + + routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float) + routing_weights, router_indices = torch.topk(routing_weights, self.moe.top_k, dim=-1) + routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states_flat.dtype) + router_weights = torch.zeros_like(router_logits).scatter_( + 1, router_indices, routing_weights + ) - TODO: Implement get_router_logits_and_routed_experts based on GPT-OSS MoE forward pass. - This is a placeholder implementation that allows the framework to run. - """ + # Reshape hidden_states for moe.experts (expects 3D: batch, seq, hidden) + # router_weights and router_indices remain 2D (num_tokens, num_experts) + batch_size = orig_shape[0] if hidden_states.ndim == 3 else 1 + hidden_states_3d = hidden_states_flat.reshape(batch_size, -1, self.moe.hidden_size) - def get_router_logits_and_routed_experts( - self, hidden_states: torch.Tensor, router_logits: torch.Tensor | None = None - ) -> tuple[torch.Tensor, torch.Tensor]: - """Extract router logits and expert outputs for GPT-OSS MoE. + routed_out = self.moe.experts(hidden_states_3d, router_weights, router_indices) - Note: This is a placeholder implementation. 
For proper expert scoring, - implement based on GptOssSparseMoeBlock forward pass. + # Return in same shape as input + routed_out = routed_out.reshape(*orig_shape) - Args: - hidden_states: Input tensor of shape (batch, seq_len, hidden_dim) - router_logits: Optional pre-computed router logits - - Returns: - tuple of (router_logits, routed_experts): - - router_logits: Shape (num_tokens, num_local_experts) - zeros as placeholder - - routed_experts: Original hidden states (no-op) - """ - batch_size = ( - hidden_states.shape[0] * hidden_states.shape[1] - if hidden_states.ndim > 2 - else hidden_states.shape[0] - ) - router_logits_out = torch.zeros( - batch_size, self.num_local_experts, device=hidden_states.device - ) - routed_experts = hidden_states.view(-1, hidden_states.shape[-1]) - return router_logits_out, routed_experts + return router_logits, routed_out diff --git a/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py b/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py index 1b1485c713..33243c0125 100644 --- a/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py +++ b/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py @@ -19,8 +19,11 @@ from typing import Type +import torch + from modelopt.torch.nas.plugins.megatron_hooks.base_hooks import ForwardHook as ActivationsHook from modelopt.torch.puzzletron.tools.logger import aprint +from modelopt.torch.puzzletron.utils.dummy_modules import DummyBlock, DummyModule def register_activation_hooks( @@ -51,6 +54,16 @@ def register_activation_hooks( module_names_to_hook = pruning_mixin.get_module_names_to_hook(model) activation_hooks = dict() for block_idx, module_name in module_names_to_hook: + try: + module = model.get_submodule(module_name) + except AttributeError: + # Module doesn't exist on this rank's shard (e.g., in distributed setup) + continue + + # Skip dummy modules - they don't have real activations to hook + if isinstance(module, 
(DummyModule, DummyBlock)): + continue + block_config = None if block_idx is not None: block_config = model.config.block_configs[block_idx] @@ -59,13 +72,25 @@ def register_activation_hooks( "block_config": block_config, } - module = model.get_submodule(module_name) hook = hook_class(module, curr_activation_hooks_kwargs) module.register_forward_hook(hook) activation_hooks[module_name] = hook if len(activation_hooks) == 0: - raise ValueError("couldn't find any hooks") + # In distributed mode, it's okay for a rank to have 0 hooks if it doesn't own + # the target modules (e.g., with hybrid patterns like "*-" where different + # ranks own different layer types). However, we still want to catch real bugs + # where no hooks are found at all. + is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized() + if is_distributed: + aprint( + "No hooks registered on this rank. This is expected if this rank " + "doesn't own any layers matching the hook pattern (e.g., in hybrid " + "patterns with distributed model sharding)." 
+ ) + else: + raise ValueError("couldn't find any hooks") - aprint(f"Found the following hooks: {activation_hooks.keys()}") + if len(activation_hooks) > 0: + aprint(f"Found the following hooks: {activation_hooks.keys()}") return activation_hooks diff --git a/modelopt/torch/puzzletron/anymodel/converter/converter.py b/modelopt/torch/puzzletron/anymodel/converter/converter.py index 5fdc92718c..eb2330b515 100644 --- a/modelopt/torch/puzzletron/anymodel/converter/converter.py +++ b/modelopt/torch/puzzletron/anymodel/converter/converter.py @@ -135,9 +135,10 @@ def convert_configs_in_dirs( cls, input_dir: Path, output_dir: Path, + trust_remote_code: bool = False, ): """Convert config and add block_configs.""" - config = load_model_config(input_dir) + config = load_model_config(input_dir, trust_remote_code=trust_remote_code) block_configs = cls.create_block_configs_from_main_config(config) out_config = copy.deepcopy(config) @@ -179,7 +180,10 @@ def convert( output_dir: Path to the output AnyModel checkpoint. 
""" cls.copy_checkpoint_files(input_dir, output_dir) - config = cls.convert_configs_in_dirs(input_dir, output_dir) + trust_remote_code = descriptor.requires_trust_remote_code() + config = cls.convert_configs_in_dirs( + input_dir, output_dir, trust_remote_code=trust_remote_code + ) cls.convert_model_weights( input_dir, output_dir, descriptor=descriptor, num_hidden_layers=config.num_hidden_layers ) diff --git a/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py b/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py index 73d56d2016..4cc4356c8e 100644 --- a/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py +++ b/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py @@ -53,6 +53,18 @@ def block_config_to_layer_overrides(block_config: BlockConfig) -> Dict[str, Any] """ raise NotImplementedError + @staticmethod + def requires_trust_remote_code() -> bool: + """Whether this model descriptor requires trust_remote_code=True for loading. + + Models that use custom code (e.g., via auto_map in config) should override + this to return True. + + Returns: + True if trust_remote_code=True is required, False otherwise. + """ + return False + @staticmethod def mlp_no_op_post_init(decoder_layer: nn.Module): """Post-init callback to alter a decoder layer so that FFN/mlp subblock performs as no-op. 
diff --git a/modelopt/torch/puzzletron/anymodel/models/__init__.py b/modelopt/torch/puzzletron/anymodel/models/__init__.py index f2119059f4..1f3fb477be 100644 --- a/modelopt/torch/puzzletron/anymodel/models/__init__.py +++ b/modelopt/torch/puzzletron/anymodel/models/__init__.py @@ -16,9 +16,9 @@ # Import models to trigger factory registration # from modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b import * from modelopt.torch.puzzletron.anymodel.models.llama import * -# from modelopt.torch.puzzletron.anymodel.models.mistral_small import * -# from modelopt.torch.puzzletron.anymodel.models.nemotron_h import * -# from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2 import * -# from modelopt.torch.puzzletron.anymodel.models.qwen2 import * -# from modelopt.torch.puzzletron.anymodel.models.qwen3_8b import * -# from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct import * +from modelopt.torch.puzzletron.anymodel.models.mistral_small import * +from modelopt.torch.puzzletron.anymodel.models.nemotron_h import * +from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2 import * +from modelopt.torch.puzzletron.anymodel.models.qwen2 import * +from modelopt.torch.puzzletron.anymodel.models.qwen3_8b import * +from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct import * diff --git a/modelopt/torch/puzzletron/anymodel/models/llama/llama_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/llama/llama_model_descriptor.py index fe416e2dd6..082e5da599 100644 --- a/modelopt/torch/puzzletron/anymodel/models/llama/llama_model_descriptor.py +++ b/modelopt/torch/puzzletron/anymodel/models/llama/llama_model_descriptor.py @@ -39,6 +39,7 @@ from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import ( FFNIntermediateLayerDescriptor, ) +from modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin import KVHeadsLayerDescriptor @ModelDescriptorFactory.register_decorator("llama") @@ -129,3 +130,12 @@ class 
LlamaFFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor): linear_weight_names: List[str] = field( default_factory=lambda: ["down_proj", "gate_proj", "up_proj"] ) + + +@dataclass +class LlamaKVHeadsLayerDescriptor(KVHeadsLayerDescriptor): + o_proj_name: str = "self_attn.o_proj" + attn_prefix_name: str = "model.layers.{layer_idx}.self_attn" + qkvo_weight_names: List[str] = field( + default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"] + ) diff --git a/modelopt/torch/puzzletron/anymodel/models/mistral_small/__init__.py b/modelopt/torch/puzzletron/anymodel/models/mistral_small/__init__.py new file mode 100644 index 0000000000..821be47e9d --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/mistral_small/__init__.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from modelopt.torch.puzzletron.anymodel.models.mistral_small.mistral_small_converter import ( + MistralSmallConverter, +) +from modelopt.torch.puzzletron.anymodel.models.mistral_small.mistral_small_model_descriptor import ( + MistralSmallModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_converter.py b/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_converter.py new file mode 100644 index 0000000000..ddc8151dc9 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_converter.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# mypy: ignore-errors + +from typing import List + +from transformers import MistralConfig + +from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( + AttentionConfig, + BlockConfig, + FFNConfig, +) + + +@ConverterFactory.register_decorator("mistral_small") +class MistralSmallConverter(Converter): + @staticmethod + def create_block_configs_from_main_config(config: MistralConfig) -> List[BlockConfig]: + num_hidden_layers = config.num_hidden_layers + + block_config = BlockConfig( + attention=AttentionConfig(no_op=False, num_key_value_heads=config.num_key_value_heads), + ffn=FFNConfig(no_op=False, intermediate_size=config.intermediate_size), + ).to_dict() + + block_configs = [block_config] * num_hidden_layers + return block_configs diff --git a/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_model_descriptor.py new file mode 100644 index 0000000000..1ac2bd7072 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_model_descriptor.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# mypy: ignore-errors + +import re +from dataclasses import dataclass, field +from typing import Dict, List + +from transformers.models.mistral.modeling_mistral import ( + MistralDecoderLayer, + MistralForCausalLM, + MistralRotaryEmbedding, +) + +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) +from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import ( + MatchingZeros, + Same, + return_tuple_of_size, +) +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig +from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import ( + FFNIntermediateLayerDescriptor, +) +from modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin import KVHeadsLayerDescriptor + + +@ModelDescriptorFactory.register_decorator("mistral_small") +class MistralSmallModelDescriptor(ModelDescriptor): + @staticmethod + def decoder_layer_cls(): + return MistralDecoderLayer + + @staticmethod + def block_config_to_layer_overrides(block_config: BlockConfig): + return { + "intermediate_size": block_config.ffn.intermediate_size, + "num_key_value_heads": block_config.attention.num_key_value_heads, + } + + @staticmethod + def attn_no_op_post_init(decoder_layer: MistralDecoderLayer): + decoder_layer.input_layernorm = Same() + decoder_layer.self_attn = return_tuple_of_size(MatchingZeros, size=2)() + + @staticmethod + def mlp_no_op_post_init(decoder_layer: MistralDecoderLayer): + decoder_layer.post_attention_layernorm = Same() + decoder_layer.mlp = MatchingZeros() + + @staticmethod + def init_rotary_embedding(model: MistralForCausalLM, runtime): + model.model.rotary_emb = MistralRotaryEmbedding(model.config, runtime.device) + + @staticmethod + def input_embedding_name(): + return "model.embed_tokens" + + @staticmethod + def output_embedding_name(): + return "lm_head" + + @staticmethod + def final_norm_name(): + return "model.norm" + + @staticmethod + def layer_block_name(index: int): + return 
f"model.layers.{index}" + + @staticmethod + def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]: + layer_name_patterns = { + "embeddings": re.compile(r"^model\.embed_tokens\.weight$"), + "lm_head": re.compile(r"^(model\.norm\.weight|lm_head\.weight)$"), + } + + def build_ffn_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_ffn": re.compile( + rf"^model\.layers\.{layer_idx}\.(post_attention_layernorm\.weight" + r"|mlp\.up_proj\.weight" + r"|mlp\.gate_proj\.weight" + r"|mlp\.down_proj\.weight)$" + ) + for layer_idx in range(num_layers) + } + + def build_attention_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_attention": re.compile( + rf"^model\.layers\.{layer_idx}\.(input_layernorm\.weight" + r"|self_attn\.q_proj\.weight" + r"|self_attn\.k_proj\.weight" + r"|self_attn\.v_proj\.weight" + r"|self_attn\.o_proj\.weight)$" + ) + for layer_idx in range(num_layers) + } + + layer_name_patterns.update(**build_ffn_predicates(), **build_attention_predicates()) + return layer_name_patterns + + +@dataclass +class MistralFFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor): + down_proj_name: str = "mlp.down_proj" + ffn_prefix_name: str = "model.layers.{layer_idx}.mlp" + linear_weight_names: List[str] = field( + default_factory=lambda: ["down_proj", "gate_proj", "up_proj"] + ) + + +@dataclass +class MistralKVHeadsLayerDescriptor(KVHeadsLayerDescriptor): + o_proj_name: str = "self_attn.o_proj" + attn_prefix_name: str = "model.layers.{layer_idx}.self_attn" + qkvo_weight_names: List[str] = field( + default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"] + ) diff --git a/modelopt/torch/puzzletron/anymodel/models/nemotron_h/__init__.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/__init__.py new file mode 100644 index 0000000000..a2140f118e --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/__init__.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from modelopt.torch.puzzletron.anymodel.models.nemotron_h.nemotron_h_converter import ( + NemotronHConverter, +) +from modelopt.torch.puzzletron.anymodel.models.nemotron_h.nemotron_h_model_descriptor import ( + NemotronHModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_converter.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_converter.py new file mode 100644 index 0000000000..16d9e3c73d --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_converter.py @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List + +from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( + AttentionConfig, + BlockConfig, + FFNConfig, + MambaConfig, + MoEConfig, +) + + +@ConverterFactory.register_decorator("nemotron_h") +class NemotronHConverter(Converter): + @staticmethod + def create_block_configs_from_main_config(config) -> List[BlockConfig]: + # Create block configs for each layer based on the hybrid_override_pattern + block_configs = [] + + # Parse the hybrid_override_pattern: "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" + pattern = config.hybrid_override_pattern + print(f"Parsing hybrid pattern: {pattern}") + + for i, char in enumerate(pattern): + if char == "M": + _block_config = BlockConfig( + attention=AttentionConfig( + mamba=MambaConfig( # Those parameters are currently used only for calc_block_stats. + state_dim=config.ssm_state_size, + num_heads=config.mamba_num_heads, + head_dim=config.mamba_head_dim, + num_groups=config.n_groups, + ) + ), + ffn=FFNConfig(no_op=True), + ) + + elif char == "-": + _block_config = BlockConfig( + attention=AttentionConfig(no_op=True), + ffn=FFNConfig(intermediate_size=config.intermediate_size), + ) + + elif char == "*": + _block_config = BlockConfig( + attention=AttentionConfig(num_key_value_heads=config.num_key_value_heads), + ffn=FFNConfig(no_op=True), + ) + + elif char == "E": + _block_config = BlockConfig( + attention=AttentionConfig(no_op=True), + ffn=FFNConfig( + moe=MoEConfig( + num_local_experts=config.n_routed_experts, + expert_intermediate_dim=config.moe_intermediate_size, + num_experts_per_tok=config.num_experts_per_tok, + ) + ), + ) + else: + raise ValueError( + f"Unknown character '{char}' in hybrid_override_pattern at position {i}" + ) + + block_configs.append(_block_config) + + print(f"Created {len(block_configs)} block configs from pattern") + return block_configs diff --git 
a/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_model_descriptor.py new file mode 100644 index 0000000000..55d9ef56ca --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_model_descriptor.py @@ -0,0 +1,256 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# mypy: ignore-errors + +import importlib +import inspect +import pkgutil +import re +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Tuple, Type + +import torch.nn as nn + +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) +from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import MatchingZeros, Same +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig +from modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin import ( + ExpertRemovalLayerDescriptor, + ExpertRemovalPruningMixIn, +) +from modelopt.torch.puzzletron.pruning.pruning_mixin import PruningMixIn + + +def get_dynamic_modules(module_cls_str: str) -> List[Type[nn.Module]]: + import transformers_modules + + matches = [] + for finder, modname, ispkg in pkgutil.walk_packages( + transformers_modules.__path__, transformers_modules.__name__ + "." + ): + module = importlib.import_module(modname) + for _, obj in inspect.getmembers(module, inspect.isclass): + if obj.__name__ == module_cls_str: + matches.append(obj) + + return matches + + +@dataclass +class NemotronHExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor): + target_name: str = "mixer.gate" + moe_prefix_name: str = "backbone.layers.{layer_idx}.mixer" + expert_prefix_name: str = "experts.{expert_idx}" + router_weights: List[str] = field(default_factory=lambda: ["gate.weight"]) + router_biases: List[str] = field(default_factory=lambda: ["gate.e_score_correction_bias"]) + expert_weights: List[str] = field( + default_factory=lambda: ["up_proj.weight", "down_proj.weight"] + ) + + def get_modules_names_to_hook(self, model) -> List[Tuple[int, str]]: + if self.target_name != "mixer": + return super().get_modules_names_to_hook(model) + + # when target is `mixer` we'll target moe layers of class type: `NemotronHMOE`, as NemotronH models use auto-map we'll check for class name 
instead of class type. + target_class_name = "NemotronHMOE" + + module_names_to_hook = [] + for module_name, module in model.named_modules(): + # restrict to attributes called "mixer" and with the desired class name + if ( + module_name.endswith(self.target_name) + and module.__class__.__name__ == target_class_name + ): + module_names_to_hook.append( + (self.block_idx_from_module_name(module_name), module_name) + ) + return module_names_to_hook + + +@ModelDescriptorFactory.register_decorator("nemotron_h") +class NemotronHModelDescriptor(ModelDescriptor): + _DECODER_LAYER_CLS: Type[nn.Module] = None + + @staticmethod + def decoder_layer_cls(): + decoder_cls_list = get_dynamic_modules("NemotronHBlock") + if not decoder_cls_list: + raise AssertionError( + "NemotronH contains dynamic modules that should be cached beforehand, make sure to load your config using `load_model_config` or manually call `force_cache_dynamic_modules(config, checkpoint_dir)`" + ) + return decoder_cls_list + + @staticmethod + def requires_trust_remote_code() -> bool: + return True + + @staticmethod + def block_config_to_layer_overrides(block_config: BlockConfig): + override_kwargs = {} + if block_config.ffn.intermediate_size is not None: + override_kwargs["intermediate_size"] = block_config.ffn.intermediate_size + + if block_config.attention.num_key_value_heads is not None: + override_kwargs["num_key_value_heads"] = block_config.attention.num_key_value_heads + + if block_config.ffn.moe is not None: + override_kwargs["moe_intermediate_size"] = block_config.ffn.moe.expert_intermediate_dim + override_kwargs["n_routed_experts"] = block_config.ffn.moe.num_local_experts + + return override_kwargs + + @staticmethod + def _block_no_op_post_init(decoder_layer): + """ + Due to the subblock structure of NemotronH always one of the subblock is set to no-op, for a real no-op both attention & ffn no-op should be set to True. 
+ """ + block_config = decoder_layer.config.block_configs[decoder_layer.layer_idx] + if block_config.ffn.no_op and block_config.attention.no_op: + decoder_layer.norm = Same() + decoder_layer.mixer = MatchingZeros() + + @staticmethod + def attn_no_op_post_init(decoder_layer): + NemotronHModelDescriptor._block_no_op_post_init(decoder_layer) + + @staticmethod + def mlp_no_op_post_init(decoder_layer): + NemotronHModelDescriptor._block_no_op_post_init(decoder_layer) + + @classmethod + def create_dummy_block(cls, original_layer: nn.Module, block_index: int) -> nn.Module: + dummy_block = super().create_dummy_block(original_layer, block_index) + # Required by `NemotronHModel.forward`. + dummy_block.block_type = original_layer.block_type + # Preserve layer_idx if it exists (used by _block_no_op_post_init) + if hasattr(original_layer, "layer_idx"): + dummy_block.layer_idx = original_layer.layer_idx + # Preserve config if it exists (used by _block_no_op_post_init to access block_configs) + if hasattr(original_layer, "config"): + dummy_block.config = original_layer.config + return dummy_block + + @staticmethod + def init_rotary_embedding(model, runtime): + """ + NemotronH has no positional embeddings + """ + pass + + @staticmethod + def input_embedding_name(): + return "backbone.embeddings" + + @staticmethod + def output_embedding_name(): + return "lm_head" + + @staticmethod + def final_norm_name(): + return "backbone.norm_f" + + @staticmethod + def layer_block_name(index: int): + return f"backbone.layers.{index}" + + @classmethod + def get_weight_groups( + cls, layer_names: Iterable[str], num_hidden_layers: int + ) -> Dict[str, List[str]]: + """ + Problem with NemotronH is that `norm.weight` can be in both block_{i}_ffn and block_{i}_attention. duplicate groups with `norm.weight` should be removed. 
+ """ + weight_groups = defaultdict(list) + for name in layer_names: + is_matched = False + for group, pattern in cls.layer_name_predicates(num_hidden_layers).items(): + if pattern.match(name): + weight_groups[group].append(name) + is_matched = True + if not is_matched: + raise ValueError(f"Couldn't find a match for {name}") + + valid_weight_groups = {} + for group, names in weight_groups.items(): + if len(names) == 1: + only_name = names[0] + if only_name.endswith("norm.weight") and "layers" in only_name: + # Skip and don't append this group to valid_weight_groups + continue + valid_weight_groups[group] = names + + return valid_weight_groups + + @staticmethod + def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]: + layer_name_patterns = { + "embeddings": re.compile( + r"^(model\.embed_tokens\.weight|backbone\.embeddings\.weight)$" + ), + "lm_head": re.compile(r"^(lm_head\.weight|backbone\.norm_f\.weight)$"), + } + + def build_ffn_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_ffn": re.compile( + rf"^backbone\.layers\.{layer_idx}\." + r"(norm\.weight|" # ← INCLUDED IN FFN + r"mixer\.(gate\.e_score_correction_bias" + r"|gate\.weight" + r"|experts\.\d+\.up_proj\.weight" + r"|experts\.\d+\.down_proj\.weight" + r"|shared_experts\.up_proj\.weight" + r"|shared_experts\.down_proj\.weight))$" + ) + for layer_idx in range(num_layers) + } + + def build_attention_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_attention": re.compile( + rf"^backbone\.layers\.{layer_idx}\." 
+ r"(norm\.weight|" # ← INCLUDED IN ATTENTION + r"mixer\.(norm\.weight" + r"|A_log" + r"|D" + r"|conv1d\.weight" + r"|conv1d\.bias" + r"|dt_bias" + r"|in_proj\.weight" + r"|out_proj\.weight" + r"|q_proj\.weight" + r"|k_proj\.weight" + r"|v_proj\.weight" + r"|o_proj\.weight))$" + ) + for layer_idx in range(num_layers) + } + + layer_name_patterns.update( + **build_ffn_predicates(), + **build_attention_predicates(), + ) + + return layer_name_patterns + + @staticmethod + def pruning_mixins() -> Dict[str, PruningMixIn]: + return { + "experts_removal": ExpertRemovalPruningMixIn(NemotronHExpertRemovalLayerDescriptor()), + } diff --git a/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/__init__.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/__init__.py new file mode 100644 index 0000000000..4b17785ace --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/__init__.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2.nemotron_h_v2_converter import ( + NemotronHV2Converter, +) +from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2.nemotron_h_v2_model_descriptor import ( + NemotronHV2ModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_converter.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_converter.py new file mode 100644 index 0000000000..2c54388325 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_converter.py @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import List

from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
    AttentionConfig,
    BlockConfig,
    FFNConfig,
    MambaConfig,
    MoEConfig,
)


@ConverterFactory.register_decorator("nemotron_h_v2")
class NemotronHV2Converter(Converter):
    """Converter that maps a NemotronH-V2 HF config onto per-layer block configs."""

    @staticmethod
    def create_block_configs_from_main_config(config) -> List[BlockConfig]:
        """Build one ``BlockConfig`` per character of ``config.hybrid_override_pattern``.

        Pattern legend: ``M`` = Mamba mixer, ``-`` = dense FFN, ``*`` = attention,
        ``E`` = MoE FFN. Any other character raises ``ValueError``.
        """
        pattern = config.hybrid_override_pattern
        print(f"Parsing hybrid pattern: {pattern}")

        def mamba_block() -> BlockConfig:
            # The Mamba parameters are currently consumed only by calc_block_stats.
            return BlockConfig(
                attention=AttentionConfig(
                    mamba=MambaConfig(
                        state_dim=config.ssm_state_size,
                        num_heads=config.mamba_num_heads,
                        head_dim=config.mamba_head_dim,
                        num_groups=config.n_groups,
                    )
                ),
                ffn=FFNConfig(no_op=True),
            )

        def dense_ffn_block() -> BlockConfig:
            return BlockConfig(
                attention=AttentionConfig(no_op=True),
                ffn=FFNConfig(intermediate_size=config.intermediate_size),
            )

        def attention_block() -> BlockConfig:
            return BlockConfig(
                attention=AttentionConfig(num_key_value_heads=config.num_key_value_heads),
                ffn=FFNConfig(no_op=True),
            )

        def moe_block() -> BlockConfig:
            return BlockConfig(
                attention=AttentionConfig(no_op=True),
                ffn=FFNConfig(
                    moe=MoEConfig(
                        num_local_experts=config.n_routed_experts,
                        expert_intermediate_dim=config.moe_intermediate_size,
                        num_experts_per_tok=config.num_experts_per_tok,
                    )
                ),
            )

        builders = {
            "M": mamba_block,
            "-": dense_ffn_block,
            "*": attention_block,
            "E": moe_block,
        }

        block_configs: List[BlockConfig] = []
        for i, char in enumerate(pattern):
            if char not in builders:
                raise ValueError(
                    f"Unknown character '{char}' in hybrid_override_pattern at position {i}"
                )
            block_configs.append(builders[char]())

        print(f"Created {len(block_configs)} block configs from pattern")
        return block_configs
a/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_model_descriptor.py new file mode 100644 index 0000000000..f50217d4d3 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_model_descriptor.py @@ -0,0 +1,241 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import importlib
import inspect
import pkgutil
import re
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Type

import torch.nn as nn

from modelopt.torch.puzzletron.anymodel.model_descriptor import (
    ModelDescriptor,
    ModelDescriptorFactory,
)
from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import MatchingZeros, Same
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig
from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import (
    FFNIntermediateLayerDescriptor,
    FFNIntermediatePruningMixIn,
)
from modelopt.torch.puzzletron.pruning.pruning_mixin import PruningMixIn


def get_dynamic_modules(module_cls_str: str) -> List[Type[nn.Module]]:
    """Return every class named ``module_cls_str`` found under ``transformers_modules``.

    Remote-code HF models are materialized as dynamic modules in the
    ``transformers_modules`` package; this walks all of them looking for classes
    with the requested name. The import is local because the package only exists
    after a remote-code model has been cached.
    """
    import transformers_modules

    matches = []
    for _finder, modname, _ispkg in pkgutil.walk_packages(
        transformers_modules.__path__, transformers_modules.__name__ + "."
    ):
        module = importlib.import_module(modname)
        for _, obj in inspect.getmembers(module, inspect.isclass):
            if obj.__name__ == module_cls_str:
                matches.append(obj)

    return matches


@dataclass
class NemotronHV2FFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor):
    """Weight-name layout of a NemotronH-V2 FFN sub-block for intermediate pruning."""

    down_proj_name: str = "mixer.down_proj"
    ffn_prefix_name: str = "backbone.layers.{layer_idx}.mixer"
    linear_weight_names: List[str] = field(default_factory=lambda: ["down_proj", "up_proj"])


@ModelDescriptorFactory.register_decorator("nemotron_h_v2")
class NemotronHV2ModelDescriptor(ModelDescriptor):
    """Model descriptor for NemotronH-V2 hybrid (Mamba / attention / FFN) models."""

    # Resolved lazily via decoder_layer_cls(); NemotronH ships as remote code.
    _DECODER_LAYER_CLS: Type[nn.Module] = None

    @staticmethod
    def decoder_layer_cls():
        decoder_cls_list = get_dynamic_modules("NemotronHBlock")
        if not decoder_cls_list:
            raise AssertionError(
                "NemotronH contains dynamic modules that should be cached beforehand, make sure to load your config using `load_model_config` or manually call `force_cache_dynamic_modules(config, checkpoint_dir)`"
            )
        return decoder_cls_list

    @staticmethod
    def requires_trust_remote_code() -> bool:
        return True

    @staticmethod
    def block_config_to_layer_overrides(block_config: BlockConfig):
        """Translate a BlockConfig into NemotronH per-layer config override kwargs."""
        override_kwargs = {}
        if block_config.ffn is not None and block_config.ffn.intermediate_size is not None:
            override_kwargs["intermediate_size"] = block_config.ffn.intermediate_size

        if (
            block_config.attention is not None
            and block_config.attention.num_key_value_heads is not None
        ):
            override_kwargs["num_key_value_heads"] = block_config.attention.num_key_value_heads

        if block_config.ffn is not None and block_config.ffn.moe is not None:
            override_kwargs["moe_intermediate_size"] = block_config.ffn.moe.expert_intermediate_dim
            override_kwargs["n_routed_experts"] = block_config.ffn.moe.num_local_experts

        return override_kwargs

    @staticmethod
    def _block_no_op_post_init(decoder_layer):
        """
        Due to the subblock structure of NemotronH always one of the subblocks is
        set to no-op; for a real no-op both attention & ffn no-op should be set to
        True, in which case the block is replaced by identity norm + zero mixer.
        """
        block_config = decoder_layer.config.block_configs[decoder_layer.layer_idx]
        ffn_no_op = block_config.ffn is not None and block_config.ffn.no_op
        attn_no_op = block_config.attention is not None and block_config.attention.no_op
        if ffn_no_op and attn_no_op:
            decoder_layer.norm = Same()
            decoder_layer.mixer = MatchingZeros()

    @staticmethod
    def attn_no_op_post_init(decoder_layer):
        NemotronHV2ModelDescriptor._block_no_op_post_init(decoder_layer)

    @staticmethod
    def mlp_no_op_post_init(decoder_layer):
        NemotronHV2ModelDescriptor._block_no_op_post_init(decoder_layer)

    @classmethod
    def create_dummy_block(cls, original_layer: nn.Module, block_index: int) -> nn.Module:
        dummy_block = super().create_dummy_block(original_layer, block_index)
        # Required by `NemotronHModel.forward`.
        dummy_block.block_type = original_layer.block_type
        # Preserve layer_idx if it exists (used by _block_no_op_post_init)
        if hasattr(original_layer, "layer_idx"):
            dummy_block.layer_idx = original_layer.layer_idx
        # Preserve config if it exists (used by _block_no_op_post_init to access block_configs)
        if hasattr(original_layer, "config"):
            dummy_block.config = original_layer.config
        return dummy_block

    @staticmethod
    def init_rotary_embedding(model, runtime):
        """
        NemotronH has no positional embeddings
        """
        pass

    @staticmethod
    def input_embedding_name():
        return "backbone.embeddings"

    @staticmethod
    def output_embedding_name():
        return "lm_head"

    @staticmethod
    def final_norm_name():
        return "backbone.norm_f"

    @staticmethod
    def layer_block_name(index: int):
        return f"backbone.layers.{index}"

    @classmethod
    def get_weight_groups(
        cls, layer_names: Iterable[str], num_hidden_layers: int
    ) -> Dict[str, List[str]]:
        """
        Problem with NemotronH is that `norm.weight` can be in both block_{i}_ffn
        and block_{i}_attention; duplicate groups that contain only `norm.weight`
        are removed.
        """
        # Fix: build (and compile) the predicate table ONCE. The original called
        # cls.layer_name_predicates(num_hidden_layers) inside the per-name loop,
        # re-creating and re-compiling O(num_layers) regexes for every weight name.
        predicates = cls.layer_name_predicates(num_hidden_layers)

        weight_groups = defaultdict(list)
        for name in layer_names:
            is_matched = False
            for group, pattern in predicates.items():
                if pattern.match(name):
                    weight_groups[group].append(name)
                    is_matched = True
            if not is_matched:
                raise ValueError(f"Couldn't find a match for {name}")

        valid_weight_groups = {}
        for group, names in weight_groups.items():
            if len(names) == 1:
                only_name = names[0]
                if only_name.endswith("norm.weight") and "layers" in only_name:
                    # Skip and don't append this group to valid_weight_groups
                    continue
            valid_weight_groups[group] = names

        return valid_weight_groups

    @staticmethod
    def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]:
        """Map each weight-group name to a compiled regex over state-dict key names.

        Note: `norm.weight` deliberately appears in BOTH the ffn and the attention
        pattern of a block; get_weight_groups drops the resulting singleton groups.
        """
        layer_name_patterns = {
            "embeddings": re.compile(
                r"^(model\.embed_tokens\.weight|backbone\.embeddings\.weight)$"
            ),
            "lm_head": re.compile(r"^(lm_head\.weight|backbone\.norm_f\.weight)$"),
        }

        def build_ffn_predicates() -> Dict[str, re.Pattern]:
            return {
                f"block_{layer_idx}_ffn": re.compile(
                    rf"^backbone\.layers\.{layer_idx}\."
                    r"(norm\.weight|"  # norm is included in the FFN group as well
                    r"mixer\.(gate\.e_score_correction_bias"
                    r"|gate\.weight"
                    r"|experts\.\d+\.up_proj\.weight"
                    r"|experts\.\d+\.down_proj\.weight"
                    r"|shared_experts\.up_proj\.weight"
                    r"|shared_experts\.down_proj\.weight"
                    r"|up_proj\.weight"  # Simple MLP (non-MoE)
                    r"|down_proj\.weight))$"  # Simple MLP (non-MoE)
                )
                for layer_idx in range(num_layers)
            }

        def build_attention_predicates() -> Dict[str, re.Pattern]:
            return {
                f"block_{layer_idx}_attention": re.compile(
                    rf"^backbone\.layers\.{layer_idx}\."
                    r"(norm\.weight|"  # norm is included in the attention group as well
                    r"mixer\.(norm\.weight"
                    r"|A_log"
                    r"|D"
                    r"|conv1d\.weight"
                    r"|conv1d\.bias"
                    r"|dt_bias"
                    r"|in_proj\.weight"
                    r"|out_proj\.weight"
                    r"|q_proj\.weight"
                    r"|k_proj\.weight"
                    r"|v_proj\.weight"
                    r"|o_proj\.weight))$"
                )
                for layer_idx in range(num_layers)
            }

        layer_name_patterns.update(
            **build_ffn_predicates(),
            **build_attention_predicates(),
        )

        return layer_name_patterns

    @staticmethod
    def pruning_mixins() -> Dict[str, PruningMixIn]:
        return {
            "ffn_intermediate": FFNIntermediatePruningMixIn(
                NemotronHV2FFNIntermediateLayerDescriptor()
            ),
            # TODO: Add expert removal support when ExpertRemovalPruningMixIn is migrated
        }
# mypy: ignore-errors

"""Qwen2 converter for AnyModel compression."""

from typing import List

from transformers import Qwen2Config

from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
    AttentionConfig,
    BlockConfig,
    FFNConfig,
)


@ConverterFactory.register_decorator("qwen2")
class Qwen2Converter(Converter):
    """Converter for Qwen2 models to AnyModel format."""

    @staticmethod
    def create_block_configs_from_main_config(config: Qwen2Config) -> List[BlockConfig]:
        """Create uniform block configs for all Qwen2 layers.

        Qwen2 models have uniform architecture across all layers, so the same
        settings are emitted for every layer.

        NOTE(review): each entry is the ``to_dict()`` form of a ``BlockConfig``,
        which does not match the declared ``List[BlockConfig]`` return type —
        confirm which form the framework expects; the annotation is kept for
        interface compatibility with the other converters.
        """
        num_hidden_layers = config.num_hidden_layers

        # Fix: build an INDEPENDENT dict per layer. The original used
        # `[block_config] * num_hidden_layers`, which aliases one mutable dict
        # across every layer, so a later per-layer edit would silently apply to
        # all layers at once.
        block_configs = [
            BlockConfig(
                attention=AttentionConfig(
                    no_op=False, num_key_value_heads=config.num_key_value_heads
                ),
                ffn=FFNConfig(no_op=False, intermediate_size=config.intermediate_size),
            ).to_dict()
            for _ in range(num_hidden_layers)
        ]
        return block_configs
# mypy: ignore-errors

"""Qwen2 model descriptor for AnyModel compression."""

import re
from dataclasses import dataclass
from typing import Dict

from torch import nn
from transformers.models.qwen2.modeling_qwen2 import (
    Qwen2DecoderLayer,
    Qwen2ForCausalLM,
    Qwen2RotaryEmbedding,
)

from modelopt.torch.puzzletron.anymodel.model_descriptor import (
    ModelDescriptor,
    ModelDescriptorFactory,
)
from modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor import (
    LlamaFFNIntermediateLayerDescriptor,
)
from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import (
    MatchingZeros,
    Same,
    return_tuple_of_size,
)
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig
from modelopt.torch.puzzletron.utils.dummy_modules import DummyBlock


@ModelDescriptorFactory.register_decorator("qwen2")
class Qwen2ModelDescriptor(ModelDescriptor):
    """Model descriptor for Qwen2 models."""

    @staticmethod
    def decoder_layer_cls():
        return Qwen2DecoderLayer

    @classmethod
    def create_dummy_block(cls, original_layer: nn.Module, block_index: int) -> nn.Module:
        """Build a placeholder block, carrying over `attention_type` when present.

        Qwen2's forward pass reads `decoder_layer.attention_type` for attention
        mask selection, so the attribute must survive the swap.
        """
        replacement = DummyBlock(block_index=block_index)
        missing = object()
        attention_type = getattr(original_layer, "attention_type", missing)
        if attention_type is not missing:
            replacement.attention_type = attention_type
        return replacement

    @staticmethod
    def block_config_to_layer_overrides(block_config: BlockConfig):
        overrides = {}
        overrides["intermediate_size"] = block_config.ffn.intermediate_size
        overrides["num_key_value_heads"] = block_config.attention.num_key_value_heads
        return overrides

    @staticmethod
    def attn_no_op_post_init(decoder_layer: Qwen2DecoderLayer):
        # Identity norm + zero attention output, returned as a 2-tuple.
        decoder_layer.input_layernorm = Same()
        decoder_layer.self_attn = return_tuple_of_size(MatchingZeros, size=2)()

    @staticmethod
    def mlp_no_op_post_init(decoder_layer: Qwen2DecoderLayer):
        decoder_layer.post_attention_layernorm = Same()
        decoder_layer.mlp = MatchingZeros()

    @staticmethod
    def init_rotary_embedding(model: Qwen2ForCausalLM, runtime):
        model.model.rotary_emb = Qwen2RotaryEmbedding(config=model.config, device=runtime.device)

    @staticmethod
    def input_embedding_name():
        return "model.embed_tokens"

    @staticmethod
    def output_embedding_name():
        return "lm_head"

    @staticmethod
    def final_norm_name():
        return "model.norm"

    @staticmethod
    def layer_block_name(index: int):
        return f"model.layers.{index}"

    @staticmethod
    def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]:
        """Map each weight-group name to a compiled regex over state-dict key names."""
        predicates: Dict[str, re.Pattern] = {
            "embeddings": re.compile(r"^model\.embed_tokens\.weight$"),
            "lm_head": re.compile(r"^(model\.norm\.weight|lm_head\.weight)$"),
        }
        for layer_idx in range(num_layers):
            predicates[f"block_{layer_idx}_ffn"] = re.compile(
                rf"^model\.layers\.{layer_idx}\.(post_attention_layernorm\.weight"
                r"|mlp\.up_proj\.weight"
                r"|mlp\.gate_proj\.weight"
                r"|mlp\.down_proj\.weight)$"
            )
            # Qwen2 has biases on attention projections.
            predicates[f"block_{layer_idx}_attention"] = re.compile(
                rf"^model\.layers\.{layer_idx}\.(input_layernorm\.weight"
                r"|self_attn\.q_proj\.weight"
                r"|self_attn\.q_proj\.bias"
                r"|self_attn\.k_proj\.weight"
                r"|self_attn\.k_proj\.bias"
                r"|self_attn\.v_proj\.weight"
                r"|self_attn\.v_proj\.bias"
                r"|self_attn\.o_proj\.weight)$"
            )
        return predicates


@dataclass
class Qwen2FFNIntermediateLayerDescriptor(LlamaFFNIntermediateLayerDescriptor):
    """Layer descriptor for Qwen2 FFN intermediate pruning.

    Qwen2 uses the same FFN structure as Llama (gate_proj, up_proj, down_proj),
    so the Llama descriptor is reused unchanged.
    """
+ +from modelopt.torch.puzzletron.anymodel.models.qwen3_8b.qwen3_8b_converter import Qwen3_8BConverter +from modelopt.torch.puzzletron.anymodel.models.qwen3_8b.qwen3_8b_model_descriptor import ( + Qwen3_8BModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/qwen3_8b/qwen3_8b_converter.py b/modelopt/torch/puzzletron/anymodel/models/qwen3_8b/qwen3_8b_converter.py new file mode 100644 index 0000000000..1a389291df --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/qwen3_8b/qwen3_8b_converter.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# mypy: ignore-errors

from typing import List

from transformers import Qwen3Config

from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
    AttentionConfig,
    BlockConfig,
    FFNConfig,
)


@ConverterFactory.register_decorator("qwen3")
class Qwen3_8BConverter(Converter):
    """Converter for dense Qwen3 models to AnyModel format."""

    @staticmethod
    def create_block_configs_from_main_config(config: Qwen3Config) -> List[BlockConfig]:
        """Create uniform block configs for all Qwen3 layers.

        NOTE(review): each entry is the ``to_dict()`` form of a ``BlockConfig``,
        which does not match the declared ``List[BlockConfig]`` return type —
        confirm which form the framework expects; the annotation is kept for
        interface compatibility with the other converters.
        """
        num_hidden_layers = config.num_hidden_layers

        # Fix: build an INDEPENDENT dict per layer. The original used
        # `[block_config] * num_hidden_layers`, which aliases one mutable dict
        # across every layer, so a later per-layer edit would silently apply to
        # all layers at once.
        block_configs = [
            BlockConfig(
                attention=AttentionConfig(
                    no_op=False, num_key_value_heads=config.num_key_value_heads
                ),
                ffn=FFNConfig(no_op=False, intermediate_size=config.intermediate_size),
            ).to_dict()
            for _ in range(num_hidden_layers)
        ]
        return block_configs
# mypy: ignore-errors

import re
from dataclasses import dataclass, field
from typing import Dict, List

from torch import nn
from transformers.models.qwen3.modeling_qwen3 import (
    Qwen3DecoderLayer,
    Qwen3ForCausalLM,
    Qwen3RotaryEmbedding,
)

from modelopt.torch.puzzletron.anymodel.model_descriptor import (
    ModelDescriptor,
    ModelDescriptorFactory,
)
from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import (
    MatchingZeros,
    Same,
    return_tuple_of_size,
)
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig
from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import (
    FFNIntermediateLayerDescriptor,
)
from modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin import KVHeadsLayerDescriptor
from modelopt.torch.puzzletron.utils.dummy_modules import DummyBlock


@ModelDescriptorFactory.register_decorator("qwen3")
class Qwen3_8BModelDescriptor(ModelDescriptor):
    """Model descriptor for dense Qwen3 models (e.g. Qwen3-8B)."""

    @staticmethod
    def decoder_layer_cls():
        return Qwen3DecoderLayer

    @classmethod
    def create_dummy_block(cls, original_layer: nn.Module, block_index: int) -> nn.Module:
        """Build a placeholder block, carrying over `attention_type` when present.

        Qwen3's forward pass reads `decoder_layer.attention_type` for attention
        mask selection, so the attribute must survive the swap.
        """
        replacement = DummyBlock(block_index=block_index)
        missing = object()
        attention_type = getattr(original_layer, "attention_type", missing)
        if attention_type is not missing:
            replacement.attention_type = attention_type
        return replacement

    @staticmethod
    def block_config_to_layer_overrides(block_config: BlockConfig):
        overrides = {}
        overrides["intermediate_size"] = block_config.ffn.intermediate_size
        overrides["num_key_value_heads"] = block_config.attention.num_key_value_heads
        return overrides

    @staticmethod
    def attn_no_op_post_init(decoder_layer: Qwen3DecoderLayer):
        # Identity norm + zero attention output, returned as a 2-tuple.
        decoder_layer.input_layernorm = Same()
        decoder_layer.self_attn = return_tuple_of_size(MatchingZeros, size=2)()

    @staticmethod
    def mlp_no_op_post_init(decoder_layer: Qwen3DecoderLayer):
        decoder_layer.post_attention_layernorm = Same()
        decoder_layer.mlp = MatchingZeros()

    @staticmethod
    def init_rotary_embedding(model: Qwen3ForCausalLM, runtime):
        model.model.rotary_emb = Qwen3RotaryEmbedding(model.config, runtime.device)

    @staticmethod
    def input_embedding_name():
        return "model.embed_tokens"

    @staticmethod
    def output_embedding_name():
        return "lm_head"

    @staticmethod
    def final_norm_name():
        return "model.norm"

    @staticmethod
    def layer_block_name(index: int):
        return f"model.layers.{index}"

    @staticmethod
    def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]:
        """Map each weight-group name to a compiled regex over state-dict key names."""
        predicates: Dict[str, re.Pattern] = {
            "embeddings": re.compile(r"^model\.embed_tokens\.weight$"),
            "lm_head": re.compile(r"^(model\.norm\.weight|lm_head\.weight)$"),
        }
        for layer_idx in range(num_layers):
            predicates[f"block_{layer_idx}_ffn"] = re.compile(
                rf"^model\.layers\.{layer_idx}\.(post_attention_layernorm\.weight"
                r"|mlp\.up_proj\.weight"
                r"|mlp\.gate_proj\.weight"
                r"|mlp\.down_proj\.weight)$"
            )
            # Qwen3 additionally exposes q_norm / k_norm weights in the attention group.
            predicates[f"block_{layer_idx}_attention"] = re.compile(
                rf"^model\.layers\.{layer_idx}\.(input_layernorm\.weight"
                r"|self_attn\.q_proj\.weight"
                r"|self_attn\.k_proj\.weight"
                r"|self_attn\.v_proj\.weight"
                r"|self_attn\.o_proj\.weight"
                r"|self_attn\.q_norm\.weight"
                r"|self_attn\.k_norm\.weight)$"
            )
        return predicates


@dataclass
class Qwen3_8BFFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor):
    """Weight-name layout of a Qwen3 FFN block for intermediate pruning."""

    down_proj_name: str = "mlp.down_proj"
    ffn_prefix_name: str = "model.layers.{layer_idx}.mlp"
    linear_weight_names: List[str] = field(
        default_factory=lambda: ["down_proj", "gate_proj", "up_proj"]
    )


@dataclass
class Qwen3_8BKVHeadsLayerDescriptor(KVHeadsLayerDescriptor):
    """Weight-name layout of a Qwen3 attention block for KV-head pruning."""

    o_proj_name: str = "self_attn.o_proj"
    attn_prefix_name: str = "model.layers.{layer_idx}.self_attn"
    qkvo_weight_names: List[str] = field(
        default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"]
    )
+# See the License for the specific language governing permissions and +# limitations under the License. + +from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct.qwen3_vl_30b_a3b_instruct_converter import ( + Qwen3VL30BA3BInstructConverter, +) +from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct.qwen3_vl_30b_a3b_instruct_model_descriptor import ( + Qwen3VL30BA3BInstructModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_converter.py b/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_converter.py new file mode 100644 index 0000000000..0c50dfeb9e --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_converter.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# mypy: ignore-errors

from typing import List

from transformers import Qwen3VLMoeConfig

from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
    AttentionConfig,
    BlockConfig,
    FFNConfig,
    MoEConfig,
)


@ConverterFactory.register_decorator("qwen3_vl")
class Qwen3VL30BA3BInstructConverter(Converter):
    """Converter for Qwen3-VL MoE models; interleaves MoE and dense FFN layers."""

    @staticmethod
    def create_block_configs_from_main_config(config: Qwen3VLMoeConfig) -> List[BlockConfig]:
        """Build one BlockConfig per text decoder layer.

        A layer gets a MoE FFN when its index is a multiple of
        ``decoder_sparse_step`` and is not listed in ``mlp_only_layers``;
        otherwise it gets a dense FFN.
        """
        # Qwen3-VL MoE nests the language-model settings under `text_config`.
        text_config = getattr(config, "text_config", config)

        num_hidden_layers = text_config.num_hidden_layers
        decoder_sparse_step = getattr(text_config, "decoder_sparse_step", 1)
        mlp_only_layers = getattr(text_config, "mlp_only_layers", [])

        def _attention() -> AttentionConfig:
            return AttentionConfig(
                no_op=False, num_key_value_heads=text_config.num_key_value_heads
            )

        def _moe_ffn() -> FFNConfig:
            return FFNConfig(
                moe=MoEConfig(
                    num_local_experts=text_config.num_experts,
                    expert_intermediate_dim=text_config.moe_intermediate_size,
                    num_experts_per_tok=text_config.num_experts_per_tok,
                )
            )

        def _dense_ffn() -> FFNConfig:
            return FFNConfig(no_op=False, intermediate_size=text_config.intermediate_size)

        block_configs: List[BlockConfig] = []
        for layer_idx in range(num_hidden_layers):
            uses_moe = (
                layer_idx % decoder_sparse_step == 0 and layer_idx not in mlp_only_layers
            )
            ffn = _moe_ffn() if uses_moe else _dense_ffn()
            block_configs.append(BlockConfig(attention=_attention(), ffn=ffn))

        print(
            f"Created {len(block_configs)} block configs for Qwen3-VL MoE (decoder_sparse_step={decoder_sparse_step})"
        )
        return block_configs
b/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_model_descriptor.py new file mode 100644 index 0000000000..7c7665a644 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_model_descriptor.py @@ -0,0 +1,212 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# mypy: ignore-errors + +import re +from dataclasses import dataclass, field +from typing import Dict, List + +import torch.nn as nn +from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + Qwen3VLMoeTextDecoderLayer, + Qwen3VLMoeTextRotaryEmbedding, + Qwen3VLMoeVisionRotaryEmbedding, +) + +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) +from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import ( + MatchingZeros, + Same, + return_tuple_of_size, +) +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig +from modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin import ( + ExpertRemovalLayerDescriptor, +) +from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import ( + FFNIntermediateLayerDescriptor, +) +from modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin import KVHeadsLayerDescriptor + + +@ModelDescriptorFactory.register_decorator("qwen3_vl") +class Qwen3VL30BA3BInstructModelDescriptor(ModelDescriptor): + @staticmethod + def uses_autocast() -> bool: + """ + Qwen3-VL MoE has a dtype bug in HuggingFace transformers under torch.autocast: + scatter() in MoE routing fails with dtype mismatch. Use native bfloat16 instead. 
+ See: https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct (recommended approach) + """ + return False + + @staticmethod + def get_language_model_config(config): + """Qwen3-VL has nested text_config for language model parameters.""" + return config.text_config if hasattr(config, "text_config") else config + + @staticmethod + def decoder_layer_cls(): + return Qwen3VLMoeTextDecoderLayer + + @staticmethod + def block_config_to_layer_overrides(block_config: BlockConfig): + override_kwargs = {"num_key_value_heads": block_config.attention.num_key_value_heads} + + if block_config.ffn.moe: + override_kwargs["moe_intermediate_size"] = block_config.ffn.moe.expert_intermediate_dim + override_kwargs["num_experts"] = block_config.ffn.moe.num_local_experts + else: + override_kwargs["intermediate_size"] = block_config.ffn.intermediate_size + + return override_kwargs + + @staticmethod + def attn_no_op_post_init(decoder_layer: Qwen3VLMoeTextDecoderLayer): + decoder_layer.input_layernorm = Same() + decoder_layer.self_attn = return_tuple_of_size(MatchingZeros, size=2)() + + @staticmethod + def mlp_no_op_post_init(decoder_layer: Qwen3VLMoeTextDecoderLayer): + decoder_layer.post_attention_layernorm = Same() + decoder_layer.mlp = MatchingZeros() + + @staticmethod + def init_rotary_embedding(model, runtime): + # Re-initialize text rotary embedding on correct device and dtype + text_config = Qwen3VL30BA3BInstructModelDescriptor.get_language_model_config(model.config) + model.model.language_model.rotary_emb = Qwen3VLMoeTextRotaryEmbedding( + config=text_config + ).to(device=runtime.device, dtype=runtime.dtype) + # Re-initialize vision rotary embedding on correct device and dtype + vision_config = ( + model.config.vision_config if hasattr(model.config, "vision_config") else None + ) + if vision_config is not None: + head_dim = vision_config.hidden_size // vision_config.num_heads + model.model.visual.rotary_pos_emb = Qwen3VLMoeVisionRotaryEmbedding(head_dim // 2).to( + 
device=runtime.device, dtype=runtime.dtype + ) + + @staticmethod + def input_embedding_name(): + return "model.language_model.embed_tokens" + + @staticmethod + def output_embedding_name(): + return "lm_head" + + @staticmethod + def final_norm_name(): + return "model.language_model.norm" + + @staticmethod + def layer_block_name(index: int): + return f"model.language_model.layers.{index}" + + @staticmethod + def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]: + # Qwen3-VL has text model under model.language_model.* prefix + layer_name_patterns = { + "embeddings": re.compile(r"^model\.language_model\.embed_tokens\.weight$"), + "lm_head": re.compile(r"^(model\.language_model\.norm\.weight|lm_head\.weight)$"), + # Vision encoder (includes merger under model.visual.deepstack_merger_list.*) + "vision_encoding": re.compile(r"^model\.visual\..*"), + } + + def build_ffn_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_ffn": re.compile( + rf"^model\.language_model\.layers\.{layer_idx}\.(post_attention_layernorm\.weight" + # MoE router + r"|mlp\.gate\.weight" + # MoE experts - fused format (gate_up_proj, down_proj without .weight suffix) + r"|mlp\.experts\.gate_up_proj" + r"|mlp\.experts\.down_proj" + # Shared expert (if present) + r"|mlp\.shared_expert\.up_proj\.weight" + r"|mlp\.shared_expert\.gate_proj\.weight" + r"|mlp\.shared_expert\.down_proj\.weight" + r"|mlp\.shared_expert_gate\.weight" + # Dense MLP fallback (for non-MoE layers) + r"|mlp\.up_proj\.weight" + r"|mlp\.gate_proj\.weight" + r"|mlp\.down_proj\.weight)$" + ) + for layer_idx in range(num_layers) + } + + def build_attention_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_attention": re.compile( + rf"^model\.language_model\.layers\.{layer_idx}\.(input_layernorm\.weight" + r"|self_attn\.q_proj\.weight" + r"|self_attn\.k_proj\.weight" + r"|self_attn\.v_proj\.weight" + r"|self_attn\.o_proj\.weight" + r"|self_attn\.q_norm\.weight" + 
r"|self_attn\.k_norm\.weight)$" + ) + for layer_idx in range(num_layers) + } + + layer_name_patterns.update(**build_ffn_predicates(), **build_attention_predicates()) + return layer_name_patterns + + +@dataclass +class Qwen3VL30BA3BInstructFFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor): + down_proj_name: str = "mlp.down_proj" + ffn_prefix_name: str = "model.language_model.layers.{layer_idx}.mlp" + linear_weight_names: List[str] = field( + default_factory=lambda: ["down_proj", "gate_proj", "up_proj"] + ) + + +@dataclass +class Qwen3VL30BA3BInstructKVHeadsLayerDescriptor(KVHeadsLayerDescriptor): + o_proj_name: str = "self_attn.o_proj" + attn_prefix_name: str = "model.language_model.layers.{layer_idx}.self_attn" + qkvo_weight_names: List[str] = field( + default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"] + ) + + +@dataclass +class Qwen3VL30BA3BInstructExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor): + """ + Qwen3-VL MoE layer descriptor. + + Reference: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py + - Qwen3VLMoeTextSparseMoeBlock: MoE block with .gate (router) and .experts + - Qwen3VLMoeTextTopKRouter: Router with .weight (no bias) + - Qwen3VLMoeTextExperts: Fused experts with .gate_up_proj and .down_proj tensors + """ + + target_name: str = "mlp" + moe_prefix_name: str = "model.language_model.layers.{layer_idx}.mlp" + # Router: Qwen3VLMoeTextTopKRouter has self.weight, no bias + router_weights: List[str] = field(default_factory=lambda: ["gate.weight"]) + router_biases: List[str] = field(default_factory=lambda: []) + # Fused expert format: Qwen3VLMoeTextExperts stores all experts in single tensors + # with shape [num_experts, ...] instead of separate tensors per expert. 
+ is_fused_experts: bool = True + fused_expert_weights: List[str] = field( + default_factory=lambda: ["experts.gate_up_proj", "experts.down_proj"] + ) diff --git a/modelopt/torch/puzzletron/anymodel/puzzformer/no_op.py b/modelopt/torch/puzzletron/anymodel/puzzformer/no_op.py index aac57af0a9..9b3a9a2190 100644 --- a/modelopt/torch/puzzletron/anymodel/puzzformer/no_op.py +++ b/modelopt/torch/puzzletron/anymodel/puzzformer/no_op.py @@ -43,7 +43,7 @@ class Wrapped(cls): def forward(self, *args, **kwargs): result = super().forward(*args, **kwargs) outputs = [None] * size - outputs[0] = result[0] + outputs[0] = result if isinstance(result, torch.Tensor) else result[0] return tuple(outputs) def extra_repr(self) -> str: diff --git a/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py index 22d00ea773..24be1b227d 100644 --- a/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py +++ b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py @@ -534,7 +534,7 @@ def __init__( self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) - self.act_fn = ACT2FN[ffn_config.hidden_act] + self.act_fn = ACT2FN[getattr(ffn_config, "hidden_act", "silu")] if ffn_config.sparsify is not None: self.register_full_backward_hook(sparsity_backward_hook) @@ -579,7 +579,7 @@ def __init__( self.intermediate_size = ffn_config.intermediate_size self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) - self.act_fn = ACT2FN[ffn_config.hidden_act] + self.act_fn = ACT2FN[getattr(ffn_config, "hidden_act", "silu")] if ffn_config.sparsify is not None: 
self.register_full_backward_hook(sparsity_backward_hook) @@ -1037,7 +1037,7 @@ def __init__(self, config: DeciLMConfig, layer_idx: int | tuple[int, ...]): self.self_attn = DeciLMLlama4TextAttention(config, layer_idx, self.attention_config) if not (self.ffn_config.no_op or self.attention_config.is_mamba): - if self.ffn_config.hidden_act is None: + if getattr(self.ffn_config, "hidden_act", None) is None: print(f"WARNING: FFN hidden_act is None for layer {layer_idx}") self.post_attention_layernorm = DeciLMRMSNorm( diff --git a/modelopt/torch/puzzletron/mip/run_puzzle.py b/modelopt/torch/puzzletron/mip/run_puzzle.py index da0f90452d..71913db7d3 100644 --- a/modelopt/torch/puzzletron/mip/run_puzzle.py +++ b/modelopt/torch/puzzletron/mip/run_puzzle.py @@ -29,6 +29,10 @@ import yaml from omegaconf import DictConfig, ListConfig, OmegaConf +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, BlockConfig, @@ -558,7 +562,12 @@ def _parse_teacher_block_metrics( ) -> list[dict]: raw_metrics = json.loads((single_block_replacement_validation_dir / "teacher.json").read_text()) teacher_checkpoint_dir = Path(raw_metrics["args"]["teacher_dir"]).resolve() - teacher_model_config = load_model_config(teacher_checkpoint_dir) + descriptor_name = raw_metrics["args"]["descriptor"] + descriptor = ModelDescriptorFactory.get(descriptor_name) + trust_remote_code = descriptor.requires_trust_remote_code() + teacher_model_config = load_model_config( + teacher_checkpoint_dir, trust_remote_code=trust_remote_code + ) teacher_replacements = None replacement_library_path = raw_metrics["args"].get("replacement_library_path") diff --git a/modelopt/torch/puzzletron/pruning/expert_removal_pruning_mixin.py b/modelopt/torch/puzzletron/pruning/expert_removal_pruning_mixin.py index 96d3489f5e..3c00ca212a 100644 --- 
a/modelopt/torch/puzzletron/pruning/expert_removal_pruning_mixin.py +++ b/modelopt/torch/puzzletron/pruning/expert_removal_pruning_mixin.py @@ -21,7 +21,6 @@ from modelopt.torch.nas.plugins.megatron_hooks.base_hooks import ( ForwardHook, - GptOssRemoveExpertsIndependentHook, NemotronHRemoveExpertsIndependentHook, Qwen3VLRemoveExpertsIndependentHook, RankedChoiceVotingHook, @@ -82,7 +81,6 @@ def supported_hooks(self) -> List[Type[ForwardHook]]: RankedChoiceVotingHookNemotronH, NemotronHRemoveExpertsIndependentHook, Qwen3VLRemoveExpertsIndependentHook, - GptOssRemoveExpertsIndependentHook, ] def prune_single_layer( diff --git a/modelopt/torch/puzzletron/pruning/pruning_ckpts.py b/modelopt/torch/puzzletron/pruning/pruning_ckpts.py index 823f42faf8..b9cfd75faf 100644 --- a/modelopt/torch/puzzletron/pruning/pruning_ckpts.py +++ b/modelopt/torch/puzzletron/pruning/pruning_ckpts.py @@ -95,6 +95,12 @@ def launch_ffn_intermediates_prune_ckpt( def launch_attn_groups_prune_ckpt( cfg: DictConfig, max_save_workers: Optional[int] = None, max_layer_workers: Optional[int] = None ): + descriptor = ModelDescriptorFactory.get(cfg.descriptor) + parent_model_config = load_model_config( + cfg.teacher_dir, trust_remote_code=descriptor.requires_trust_remote_code() + ) + num_attention_heads = parent_model_config.num_attention_heads + for n_heads_in_group in cfg.pruning.n_heads_in_group_list: dirname = f"n_heads_in_group{n_heads_in_group}" @@ -105,7 +111,8 @@ def launch_attn_groups_prune_ckpt( mprint("Process n_heads_in_group {}".format(n_heads_in_group)) mprint(f"=== STARTING ATTENTION PRUNING FOR n_heads_in_group={n_heads_in_group} ===") - model_config_overrides_json = {"attention": [{"n_heads_in_group": n_heads_in_group}]} + num_key_value_heads = num_attention_heads // n_heads_in_group + model_config_overrides_json = {"attention": [{"num_key_value_heads": num_key_value_heads}]} mlp_init_config_yaml = cfg.pruning.mlp_init_config_yaml output_dir = os.path.join(cfg.pruning.pruned_ckpts_output_dir, dirname) @@ -151,7
+158,11 @@ def launch_hidden_dim_prune_ckpt(cfg: DictConfig): ) # Load parent model config to get FFN configuration - parent_model_config = load_model_config(cfg.pruning.model_name_or_path) + descriptor = ModelDescriptorFactory.get(cfg.descriptor) + trust_remote_code = descriptor.requires_trust_remote_code() + parent_model_config = load_model_config( + cfg.pruning.model_name_or_path, trust_remote_code=trust_remote_code + ) parent_hidden_size = parent_model_config.hidden_size # Get teacher's FFN configuration diff --git a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py index 0f5ecd2158..cc81f4f887 100644 --- a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py +++ b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py @@ -39,6 +39,10 @@ import pandas as pd from omegaconf import DictConfig +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, BlockConfig, @@ -65,6 +69,7 @@ def build_replacement_library( master_puzzle_dir: Path | str, + descriptor: ModelDescriptor, teacher_checkpoint_dir: Path | str | None = None, add_ffn_no_ops: bool = True, add_attention_no_ops: bool = True, @@ -76,20 +81,22 @@ def build_replacement_library( master_puzzle_dir = Path(master_puzzle_dir) (master_puzzle_dir / "ckpts").mkdir(exist_ok=True) teacher_checkpoint_dir = infer_teacher_dir(master_puzzle_dir, teacher_checkpoint_dir) + trust_remote_code = descriptor.requires_trust_remote_code() subblocks_df = _build_subblocks_df( master_puzzle_dir, teacher_checkpoint_dir, add_ffn_no_ops, add_attention_no_ops, + trust_remote_code=trust_remote_code, ) block_library_df = _build_block_library_from_subblocks(subblocks_df) layer_replacements = _build_layer_replacements( - block_library_df, 
master_puzzle_dir, teacher_checkpoint_dir + block_library_df, master_puzzle_dir, teacher_checkpoint_dir, trust_remote_code ) single_sequence_replacement_solutions = _build_single_sequence_replacement_solutions( - layer_replacements, teacher_checkpoint_dir + layer_replacements, teacher_checkpoint_dir, trust_remote_code ) json_dump(block_library_df.to_dict(orient="records"), master_puzzle_dir / "block_library.json") @@ -112,11 +119,13 @@ def launch_build_replacement_library(cfg: DictConfig) -> None: f"Build replacement library config: {format_global_config(cfg.build_replacement_library, title='Build replacement library')}" ) + descriptor = ModelDescriptorFactory.get(cfg.descriptor) build_replacement_library( master_puzzle_dir=cfg.puzzle_dir, teacher_checkpoint_dir=cfg.teacher_dir, add_ffn_no_ops=cfg.build_replacement_library.add_ffn_no_ops, add_attention_no_ops=cfg.build_replacement_library.add_attention_no_ops, + descriptor=descriptor, ) @@ -191,9 +200,12 @@ def _build_subblocks_df( teacher_checkpoint_dir: Path | str, add_ffn_no_ops: bool, add_attention_no_ops: bool, + trust_remote_code: bool = False, ) -> pd.DataFrame: teacher_checkpoint_dir = Path(teacher_checkpoint_dir) - checkpoint_dirs = _get_last_checkpoint_from_each_experiment(master_puzzle_dir) + checkpoint_dirs = _get_last_checkpoint_from_each_experiment( + master_puzzle_dir, trust_remote_code=trust_remote_code + ) checkpoint_dirs = [teacher_checkpoint_dir] + list(checkpoint_dirs - {teacher_checkpoint_dir}) checkpoints_to_split = [teacher_checkpoint_dir] @@ -203,7 +215,7 @@ def _build_subblocks_df( if len(subblocks_to_extract) > 0: subblock_rows_from_current_checkpoint = ( _construct_subblock_rows_from_current_checkpoint( - checkpoint_dir, subblocks_to_extract + checkpoint_dir, subblocks_to_extract, trust_remote_code=trust_remote_code ) ) subblock_rows.extend(subblock_rows_from_current_checkpoint) @@ -303,10 +315,10 @@ def _drop_duplicates_of_decomp_no_op(subblocks_df: pd.DataFrame) -> pd.DataFrame def 
_construct_subblock_rows_from_current_checkpoint( - checkpoint_dir: Path, subblocks_to_extract: list[str] + checkpoint_dir: Path, subblocks_to_extract: list[str], trust_remote_code: bool = False ) -> list[dict[str, Any]]: subblock_rows_from_current_checkpoint = [] - model_config = load_model_config(checkpoint_dir) + model_config = load_model_config(checkpoint_dir, trust_remote_code=trust_remote_code) for block_idx, block_config in enumerate(model_config.block_configs): for subblock_to_extract in subblocks_to_extract: subblock_row = _init_empty_subblock_row(block_idx) @@ -388,7 +400,9 @@ def _get_rows_with_no_op_subblock( return rows_with_no_op_subblock, subblock_cls -def _get_last_checkpoint_from_each_experiment(master_puzzle_dir: Path | str) -> set[Path]: +def _get_last_checkpoint_from_each_experiment( + master_puzzle_dir: Path | str, trust_remote_code: bool = False +) -> set[Path]: master_puzzle_dir = Path(master_puzzle_dir) master_checkpoints_dir = master_puzzle_dir / CHECKPOINTS_DIR_NAME subdirs_of_master_checkpoints_dir = [ @@ -409,7 +423,11 @@ def _get_last_checkpoint_from_each_experiment(master_puzzle_dir: Path | str) -> ) # Filter out non-DeciLM checkpoints (e.g., unconverted Llama checkpoints) - valid_checkpoint_dirs = [cp for cp in checkpoint_dirs if is_valid_decilm_checkpoint(cp)] + valid_checkpoint_dirs = [ + cp + for cp in checkpoint_dirs + if is_valid_decilm_checkpoint(cp, trust_remote_code=trust_remote_code) + ] experiment_dirs = [ p if (p in subdirs_of_master_checkpoints_dir) else p.parent for p in valid_checkpoint_dirs @@ -465,14 +483,15 @@ def _build_layer_replacements( block_library_df: pd.DataFrame, master_puzzle_dir: Path, teacher_checkpoint_dir: Path, + trust_remote_code: bool = False, ) -> list[dict]: layer_replacements_from_blocks = _build_layer_replacements_from_block_library(block_library_df) layer_replacements_from_checkpoints = _gather_layer_replacements_from_checkpoints( - master_puzzle_dir + master_puzzle_dir, 
trust_remote_code=trust_remote_code ) layer_replacements = layer_replacements_from_blocks + layer_replacements_from_checkpoints layer_replacements = _filter_duplicate_teacher_replacements( - layer_replacements, teacher_checkpoint_dir + layer_replacements, teacher_checkpoint_dir, trust_remote_code ) return layer_replacements @@ -502,9 +521,13 @@ def _build_layer_replacements_from_block_library(block_library_df: pd.DataFrame) return layer_replacements -def _gather_layer_replacements_from_checkpoints(master_puzzle_dir: str | Path) -> list[dict]: +def _gather_layer_replacements_from_checkpoints( + master_puzzle_dir: str | Path, trust_remote_code: bool = False +) -> list[dict]: gathered_layer_replacements = [] - checkpoint_dirs = _get_last_checkpoint_from_each_experiment(master_puzzle_dir) + checkpoint_dirs = _get_last_checkpoint_from_each_experiment( + master_puzzle_dir, trust_remote_code=trust_remote_code + ) for checkpoint_dir in checkpoint_dirs: if (layer_replacements_path := checkpoint_dir / "replacement_library.json").exists(): layer_replacements = json.loads(layer_replacements_path.read_text()) @@ -523,8 +546,11 @@ def _gather_layer_replacements_from_checkpoints(master_puzzle_dir: str | Path) - def _filter_duplicate_teacher_replacements( layer_replacements: list[dict], teacher_checkpoint_dir: Path, + trust_remote_code: bool = False, ) -> list[dict]: - teacher_model_config = load_model_config(teacher_checkpoint_dir) + teacher_model_config = load_model_config( + teacher_checkpoint_dir, trust_remote_code=trust_remote_code + ) filtered_layer_replacements = [] for layer_replacement in layer_replacements: if replacement_is_teacher( @@ -537,8 +563,11 @@ def _filter_duplicate_teacher_replacements( def _build_single_sequence_replacement_solutions( layer_replacements: list[dict], teacher_checkpoint_dir: Path, + trust_remote_code: bool = False, ) -> list[dict]: - teacher_model_config = load_model_config(teacher_checkpoint_dir) + teacher_model_config = load_model_config( + 
teacher_checkpoint_dir, trust_remote_code=trust_remote_code + ) n_layer = teacher_model_config.num_hidden_layers teacher_replacements = dict() diff --git a/modelopt/torch/puzzletron/replacement_library/replacement_library.py b/modelopt/torch/puzzletron/replacement_library/replacement_library.py index 7935fea4a0..8a7c2834fd 100644 --- a/modelopt/torch/puzzletron/replacement_library/replacement_library.py +++ b/modelopt/torch/puzzletron/replacement_library/replacement_library.py @@ -123,10 +123,12 @@ def n_layer(self) -> int: @property def model_config(self) -> DeciLMConfig: if self._model_config is None: + trust_remote_code = self.descriptor.requires_trust_remote_code() self._model_config = load_model_config( self.get_arbitrary_checkpoint_dir(), self.model_config_overrides, ignore_unexpected_config_keys=True, + trust_remote_code=trust_remote_code, ) return self._model_config diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index 2db0bc3916..0b8a3e72fe 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -285,7 +285,8 @@ def calculate_subblock_stats_for_puzzle_dir( teacher_dir = ( Path(teacher_dir) if teacher_dir is not None else master_puzzle_dir / "ckpts" / "teacher" ) - model_config = load_model_config(teacher_dir) + trust_remote_code = descriptor.requires_trust_remote_code() + model_config = load_model_config(teacher_dir, trust_remote_code=trust_remote_code) # Get language model config for LM-specific attributes (VL models have nested config) lm_config = descriptor.get_language_model_config(model_config) subblock_configs = _load_subblock_configs(master_puzzle_dir, ffn_hidden_sizes, model_config) diff --git a/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py b/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py index 
36e41c4b6a..ecfb8b857b 100644 --- a/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py +++ b/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py @@ -86,7 +86,9 @@ def init_child_from_parent( copy_tokenizer(parent_checkpoint_dir, output_checkpoint_dir) - parent_model_config = load_model_config(parent_checkpoint_dir) + parent_model_config = load_model_config( + parent_checkpoint_dir, trust_remote_code=descriptor.requires_trust_remote_code() + ) parent_state_dict = load_state_dict(parent_checkpoint_dir) # Parse JSON if string @@ -108,6 +110,7 @@ def init_child_from_parent( parent_checkpoint_dir, model_config_overrides=global_config_overrides, ignore_unexpected_config_keys=True, + trust_remote_code=descriptor.requires_trust_remote_code(), ) # Apply block-level overrides if any @@ -126,7 +129,10 @@ def init_child_from_parent( model_class = _get_model_class_from_config(child_model_config) # AutoModelForCausalLM uses from_config(); concrete model classes use _from_config() if model_class is AutoModelForCausalLM: - child_model = model_class.from_config(child_model_config, trust_remote_code=True) + trust_remote_code = descriptor.requires_trust_remote_code() + child_model = model_class.from_config( + child_model_config, trust_remote_code=trust_remote_code + ) else: child_model = model_class._from_config(child_model_config) diff --git a/modelopt/torch/puzzletron/tools/checkpoint_utils.py b/modelopt/torch/puzzletron/tools/checkpoint_utils.py index f08b89e449..20c2fbe2ac 100644 --- a/modelopt/torch/puzzletron/tools/checkpoint_utils.py +++ b/modelopt/torch/puzzletron/tools/checkpoint_utils.py @@ -135,17 +135,20 @@ def skip_init(module_cls, *args, **kwargs) -> nn.Module: return module -def is_valid_decilm_checkpoint(checkpoint_dir: Path | str) -> bool: +def is_valid_decilm_checkpoint(checkpoint_dir: Path | str, trust_remote_code: bool = False) -> bool: """Validate that a checkpoint is in DeciLM format (has block_configs). 
Args: checkpoint_dir: Path to checkpoint directory + trust_remote_code: If True, allows execution of custom code from the model repository. + This is a security risk if the model source is untrusted. Only set to True if you + trust the source of the model. Defaults to False for security. Returns: True if checkpoint is valid DeciLM format, False otherwise """ try: - model_config = load_model_config(checkpoint_dir) + model_config = load_model_config(checkpoint_dir, trust_remote_code=trust_remote_code) if model_config.block_configs is None: warnings.warn( f"Skipping checkpoint '{checkpoint_dir}' - not in DeciLM format (missing block_configs)" diff --git a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py index 3c3b54830a..3647de5e25 100644 --- a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py +++ b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py @@ -73,10 +73,19 @@ def load_checkpoint( checkpoint_dir: Path | str, model_config_overrides: dict | None = None, ignore_unexpected_config_keys: bool = False, + trust_remote_code: bool = False, ) -> DeciLMForCausalLM: """ Unlike AutoModelForCausalLM.from_pretrained, the models loaded by this function use your local repo code, not the code inside the checkpoint. + + Args: + checkpoint_dir: Path to checkpoint directory + model_config_overrides: Optional mapping of config overrides. + ignore_unexpected_config_keys: If True, ignore unexpected config keys. + trust_remote_code: If True, allows execution of custom code from the model repository. + This is a security risk if the model source is untrusted. Only set to True if you + trust the source of the model. Defaults to False for security. 
""" from modelopt.torch.puzzletron.tools.checkpoint_utils import ( load_state_dict, # prevent circular import @@ -86,7 +95,10 @@ def load_checkpoint( checkpoint_dir = Path(checkpoint_dir) model_config = load_model_config( - checkpoint_dir, model_config_overrides, ignore_unexpected_config_keys + checkpoint_dir, + model_config_overrides=model_config_overrides, + ignore_unexpected_config_keys=ignore_unexpected_config_keys, + trust_remote_code=trust_remote_code, ) # Without sparsity we could have done: @@ -221,7 +233,17 @@ def _save_checkpoint( ) -def split_checkpoint_to_subblocks(checkpoint_dir: Path | str) -> None: +def split_checkpoint_to_subblocks( + checkpoint_dir: Path | str, trust_remote_code: bool = False +) -> None: + """Split a checkpoint into subblocks. + + Args: + checkpoint_dir: Path to checkpoint directory + trust_remote_code: If True, allows execution of custom code from the model repository. + This is a security risk if the model source is untrusted. Only set to True if you + trust the source of the model. Defaults to False for security. 
+ """ from modelopt.torch.puzzletron.tools.checkpoint_utils import ( load_state_dict, # prevent circular import ) @@ -229,7 +251,7 @@ def split_checkpoint_to_subblocks(checkpoint_dir: Path | str) -> None: if not isinstance(checkpoint_dir, Path): checkpoint_dir = Path(checkpoint_dir) - model_config = load_model_config(checkpoint_dir) + model_config = load_model_config(checkpoint_dir, trust_remote_code=trust_remote_code) state_dict = load_state_dict(checkpoint_dir) save_subblocks(state_dict, checkpoint_dir) diff --git a/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py b/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py index 1cf02dc931..55926eaaea 100644 --- a/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py +++ b/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py @@ -115,7 +115,9 @@ def set_submodule(model: nn.Module, module_name: str, new_submodule: nn.Module) def create_local_shard_(model, owned_block_indexes: set[int], descriptor, runtime): - all_block_indexes = set(range(model.config.num_hidden_layers)) + # Get language model config (handles nested configs like Qwen3-VL's text_config) + lm_config = descriptor.get_language_model_config(model.config) + all_block_indexes = set(range(lm_config.num_hidden_layers)) has_first_block = 0 in owned_block_indexes has_last_block = max(all_block_indexes) in owned_block_indexes @@ -136,13 +138,13 @@ def create_local_shard_(model, owned_block_indexes: set[int], descriptor, runtim set_submodule( model, descriptor.input_embedding_name(), - DummyWTE(model.config.hidden_size, dtype=runtime.dtype), + DummyWTE(lm_config.hidden_size, dtype=runtime.dtype), ) if not has_last_block: set_submodule(model, descriptor.final_norm_name(), nn.Identity()) if not (model.config.tie_word_embeddings and has_first_block): - set_submodule(model, descriptor.output_embedding_name(), DummyLMHead(model.config)) + set_submodule(model, descriptor.output_embedding_name(), DummyLMHead(lm_config)) return model @@ 
-202,11 +204,13 @@ def load_and_shard_model( with runtime.device: if model_config is None: - model_config = load_model_config(checkpoint_path) + trust_remote_code = descriptor.requires_trust_remote_code() + model_config = load_model_config(checkpoint_path, trust_remote_code=trust_remote_code) + num_hidden_layers = descriptor.get_language_model_config(model_config).num_hidden_layers if owned_block_indexes == "auto": owned_block_indexes = set( - np.array_split(np.arange(model_config.num_hidden_layers), runtime.world_size)[ + np.array_split(np.arange(num_hidden_layers), runtime.world_size)[ runtime.global_rank ] ) @@ -250,7 +254,7 @@ def load_and_shard_model( # Re-tie weights after load_state_dict with assign=True, which severs the tie. # Needed on first rank (owns embed_tokens) and last rank (owns lm_head). has_first_block = 0 in owned_block_indexes - has_last_block = (model_config.num_hidden_layers - 1) in owned_block_indexes + has_last_block = (num_hidden_layers - 1) in owned_block_indexes if model_config.tie_word_embeddings and (has_first_block or has_last_block): model_shard.tie_weights() @@ -309,7 +313,8 @@ def create_sharded_model( model_class = _get_model_class_from_config(model_config) # AutoModelForCausalLM uses from_config(); concrete model classes use _from_config() if model_class is AutoModelForCausalLM: - model = model_class.from_config(model_config, trust_remote_code=True) + trust_remote_code = descriptor.requires_trust_remote_code() + model = model_class.from_config(model_config, trust_remote_code=trust_remote_code) else: model = model_class._from_config(model_config) create_local_shard_( diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/ffn_pruning.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/pruning/ffn_pruning.yaml deleted file mode 100644 index f0c852eec9..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/ffn_pruning.yaml +++ /dev/null @@ -1,12 +0,0 @@ -defaults: - - 
pruning_defaults - -activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} - -activation_hooks_kwargs: - method: iterative - target_layer: "mlp.down_proj" - layer_input_descriptors_path: - -intermediate_size_list: [256] # teacher_intermediate_size is 14336 -mlp_init_mode: "PruneByActivationsLog" diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml deleted file mode 100644 index 0a5eafcfff..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml +++ /dev/null @@ -1,32 +0,0 @@ -defaults: - - /validate_model_defaults - -model_name_or_path: ${teacher_dir} -experiment_id: ${pruning.eval_samples}samples_diverse_mini -activations_log_dir: ??? -activation_hooks_kwargs: ??? - -# Data: -eval_samples: 100 -micro_batch_size: 4 -dataset_path: ${dataset_path} -val_dataset_name: train - -# Prune ckpts -pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} - -## FFN pruning -ffn_list: -mlp_init_mode: "Truncate" - -## KV-heads pruning -n_heads_in_group_list: -gqa_init_mode: "AverageKV" - -## Hidden dimension pruning -hidden_size_list: -hidden_size_init_mode: "PruneByChannelRanking" -linear_init_mode: "FromTeacher" - -mlp_init_config_yaml: - activations_log_dir: ${pruning.activations_log_dir} diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/validate_model_defaults.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/validate_model_defaults.yaml deleted file mode 100644 index 1d042d75df..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/configs/validate_model_defaults.yaml +++ /dev/null @@ -1,17 +0,0 @@ -model_dtype: torch.bfloat16 -autocast_dtype: torch.bfloat16 -block_size: 8192 -bos_rate: 0.5 -data_column: conversation -val_dataset_name: train -shuffle_seed: 81436 -seed: 42 -fim_rate: 0 
-fim_spm_rate: 0 -source_datasets_to_discard: -varlen: false -write_results: false -calc_losses_on_cpu: false -activations_log_dir: -model_name_or_path: -load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn} diff --git a/tests/_test_utils/torch/puzzletron/resources/tokenizer/special_tokens_map.json b/tests/_test_utils/torch/puzzletron/resources/tokenizer/special_tokens_map.json deleted file mode 100644 index 02ee80b619..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/tokenizer/special_tokens_map.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer.json b/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer.json deleted file mode 100644 index 83592e2494..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer.json +++ /dev/null @@ -1,212 +0,0 @@ -{ - "version": "1.0", - "truncation": null, - "padding": null, - "added_tokens": [], - "normalizer": null, - "pre_tokenizer": { - "type": "Sequence", - "pretokenizers": [ - { - "type": "Split", - "pattern": { - "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" - }, - "behavior": "Isolated", - "invert": false - }, - { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": true, - "use_regex": false - } - ] - }, - "post_processor": { - "type": "Sequence", - "processors": [ - { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": false, - "use_regex": true - }, - { - "type": "TemplateProcessing", - "single": [ - { - "SpecialToken": { - "id": "<|begin_of_text|>", - "type_id": 0 
- } - }, - { - "Sequence": { - "id": "A", - "type_id": 0 - } - } - ], - "pair": [ - { - "SpecialToken": { - "id": "<|begin_of_text|>", - "type_id": 0 - } - }, - { - "Sequence": { - "id": "A", - "type_id": 0 - } - }, - { - "SpecialToken": { - "id": "<|begin_of_text|>", - "type_id": 1 - } - }, - { - "Sequence": { - "id": "B", - "type_id": 1 - } - } - ], - "special_tokens": { - "<|begin_of_text|>": { - "id": "<|begin_of_text|>", - "ids": [ - 100 - ], - "tokens": [ - "<|begin_of_text|>" - ] - } - } - } - ] - }, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": true, - "use_regex": true - }, - "model": { - "type": "BPE", - "dropout": null, - "unk_token": null, - "continuing_subword_prefix": null, - "end_of_word_suffix": null, - "fuse_unk": false, - "byte_fallback": false, - "ignore_merges": true, - "vocab": { - "!": 0, - "\"": 1, - "#": 2, - "$": 3, - "%": 4, - "&": 5, - "'": 6, - "(": 7, - ")": 8, - "*": 9, - "+": 10, - ",": 11, - "-": 12, - ".": 13, - "/": 14, - "0": 15, - "1": 16, - "2": 17, - "3": 18, - "4": 19, - "5": 20, - "6": 21, - "7": 22, - "8": 23, - "9": 24, - ":": 25, - ";": 26, - "<": 27, - "=": 28, - ">": 29, - "?": 30, - "@": 31, - "A": 32, - "B": 33, - "C": 34, - "D": 35, - "E": 36, - "F": 37, - "G": 38, - "H": 39, - "I": 40, - "J": 41, - "K": 42, - "L": 43, - "M": 44, - "N": 45, - "O": 46, - "P": 47, - "Q": 48, - "R": 49, - "S": 50, - "T": 51, - "U": 52, - "V": 53, - "W": 54, - "X": 55, - "Y": 56, - "Z": 57, - "[": 58, - "\\": 59, - "]": 60, - "^": 61, - "_": 62, - "`": 63, - "a": 64, - "b": 65, - "c": 66, - "d": 67, - "e": 68, - "f": 69, - "g": 70, - "h": 71, - "i": 72, - "j": 73, - "k": 74, - "l": 75, - "m": 76, - "n": 77, - "o": 78, - "p": 79, - "q": 80, - "r": 81, - "s": 82, - "t": 83, - "u": 84, - "v": 85, - "w": 86, - "x": 87, - "y": 88, - "z": 89, - "{": 90, - "|": 91, - "}": 92, - "~": 93, - "¡": 94, - "¢": 95, - "£": 96, - "¤": 97, - "¥": 98, - "¦": 99, - "<|begin_of_text|>": 100, - "<|eot_id|>": 101 - }, - 
"merges": [] - } -} diff --git a/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer_config.json b/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer_config.json deleted file mode 100644 index 754d9e8db5..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "bos_token": "<|begin_of_text|>", - "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - 
"extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 131072, - "tokenizer_class": "PreTrainedTokenizer" -} diff --git a/tests/_test_utils/torch/puzzletron/resources/tokenizer/truncate_tokenizer.py b/tests/_test_utils/torch/puzzletron/resources/tokenizer/truncate_tokenizer.py deleted file mode 100644 index aedcae4ab2..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/tokenizer/truncate_tokenizer.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script was used to truncate the tokenizer.json file from Llama 3.1 8B model -to keep only the top 100 most common tokens. 
-""" - -import json - -# Path to your original and new tokenizer.json -in_path = "./tokenizer.json" -out_path = "./tokenizer_truncated.json" - -# How many top tokens to keep -NUM_TO_KEEP = 100 - -with open(in_path, encoding="utf-8") as f: - tokenizer_data = json.load(f) - -# Get and sort the original vocab by index (frequency proxy) -orig_vocab = tokenizer_data["model"]["vocab"] - -# Sort tokens by their original index (lowest index = assumed most common/important) -sorted_tokens = sorted(orig_vocab.items(), key=lambda item: item[1]) - -# Keep the top N tokens -tokens_to_keep = [tok for tok, idx in sorted_tokens[:NUM_TO_KEEP]] - -# Re-index the selected tokens: 0..N-1 -small_vocab = {tok: i for i, tok in enumerate(tokens_to_keep)} -tokenizer_data["model"]["vocab"] = small_vocab - -# Update vocab size -if "vocab_size" in tokenizer_data["model"]: - tokenizer_data["model"]["vocab_size"] = len(small_vocab) - -# Optionally remove merges if present and unneeded (mostly for BPE/WordPiece) -if "merges" in tokenizer_data["model"]: - tokenizer_data["model"]["merges"] = [] - -# Remove added_tokens if not needed -if "added_tokens" in tokenizer_data: - tokenizer_data["added_tokens"] = [] - -# Write out the truncated tokenizer.json -with open(out_path, "w", encoding="utf-8") as f: - json.dump(tokenizer_data, f, indent=2, ensure_ascii=False) - -print(f"Truncated tokenizer saved to: {out_path}") diff --git a/tests/_test_utils/torch/puzzletron/utils.py b/tests/_test_utils/torch/puzzletron/utils.py index 07d1565f42..7615c5d085 100644 --- a/tests/_test_utils/torch/puzzletron/utils.py +++ b/tests/_test_utils/torch/puzzletron/utils.py @@ -24,18 +24,12 @@ import modelopt.torch.utils.distributed as dist from modelopt.torch.puzzletron.tools.hydra_utils import register_hydra_resolvers -# Path to HF configs relative to this file -# HF configs are in tests/gpu/torch/puzzletron/resources/hf_configs -HF_CONFIGS_DIR = ( - Path(__file__).parent.parent.parent.parent / 
"gpu/torch/puzzletron/resources/hf_configs" -) - def setup_test_model_and_data( project_root_path: Path, tmp_path: Path, rank: int, - hf_config_name: str, + hf_model_name: str, hybrid_override_pattern: str | None = None, ) -> tuple[Path, Path, Path]: """ @@ -45,7 +39,7 @@ def setup_test_model_and_data( project_root_path (Path): the root path of the project tmp_path (Path): the temporary path to use for the test rank (int): the rank of the process - hf_config_name (str): Name of the HF config directory (e.g., "llama_3_1_8b_instruct") + hf_model_name (str): HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct") hybrid_override_pattern (str): For NemotronH models, the layer type pattern Returns: @@ -56,10 +50,8 @@ def setup_test_model_and_data( # Register Hydra custom resolvers (needed for config resolution) register_hydra_resolvers() - # The inputs for the nas.convert() step. - # - puzzle_dir = tmp_path / hf_config_name - hf_checkpoint_path = puzzle_dir / f"hf_models/{hf_config_name}" + puzzle_dir = tmp_path / hf_model_name + hf_checkpoint_path = puzzle_dir / f"hf_models/{hf_model_name}" dataset_path = puzzle_dir / "dummy_dataset" if rank == 0: @@ -73,7 +65,7 @@ def setup_test_model_and_data( output_path=str(hf_checkpoint_path), vocab_size=tokenizer.vocab_size, tokenizer=tokenizer, - hf_config_name=hf_config_name, + hf_model_name=hf_model_name, hybrid_override_pattern=hybrid_override_pattern, ) dist.barrier() @@ -89,7 +81,7 @@ def create_and_save_small_hf_model( output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase, - hf_config_name: str, + hf_model_name: str, hybrid_override_pattern: str | None = None, ): """ @@ -101,23 +93,21 @@ def create_and_save_small_hf_model( output_path: Where to save the model vocab_size: Vocabulary size (should match tokenizer) tokenizer: Tokenizer to save alongside the model - hf_config_name: Name of the config directory under resources/hf_configs/ - e.g., "llama_3_1_8b_instruct", "llama_3_2_3b_instruct", or 
"qwen2_5_7b_instruct" + hf_model_name: HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct") hybrid_override_pattern: For NemotronH models, the layer type pattern (e.g., "*-" for Attention+MLP, "M-" for Mamba+MLP). Must match num_hidden_layers. None for non-NemotronH models. """ os.makedirs(output_path, exist_ok=True) # Load real HuggingFace config (preserves tie_word_embeddings, rope_scaling, etc.) - config_path = HF_CONFIGS_DIR / hf_config_name - config = AutoConfig.from_pretrained(config_path, local_files_only=True, trust_remote_code=True) + config = AutoConfig.from_pretrained(hf_model_name, trust_remote_code=True) # Override size-related params to make it small for testing # Note: intermediate_size must be divisible by 256 per DeciLM config requirements # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility # VL models have nested configs (text_config, vision_config) - if hf_config_name == "qwen3-vl-30b-a3b-instruct": + if hasattr(config, "text_config") and hasattr(config, "vision_config"): config.text_config.vocab_size = vocab_size config.text_config.hidden_size = 256 config.text_config.intermediate_size = 512 @@ -160,14 +150,34 @@ def create_and_save_small_hf_model( torch.manual_seed(42) # Create and save the model + # Force CPU initialization for deterministic behavior (prevents NaN on RTX GPUs) + original_cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") + os.environ["CUDA_VISIBLE_DEVICES"] = "" # TODO: Consider using AutoModel.from_config instead. 
- if hf_config_name == "qwen3-vl-30b-a3b-instruct": + if hasattr(config, "text_config") and hasattr(config, "vision_config"): from transformers import Qwen3VLMoeForConditionalGeneration model = Qwen3VLMoeForConditionalGeneration._from_config(config) else: model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) + # Initialize weights to ensure all parameters are properly initialized + # This prevents NaN values in uninitialized parameters (e.g., backbone.layers.1.mixer.gate.weight + # in nemotron-3-nano-30b-a3b-base-bf16) that can occur with from_config on RTX GPU cards (not on H100) + model.initialize_weights() + + # Fix any remaining NaN/Inf values that initialize_weights() might have missed + for name, param in model.named_parameters(): + if torch.isnan(param).any() or torch.isinf(param).any(): + nan_inf_mask = torch.isnan(param) | torch.isinf(param) + param.data = torch.where(nan_inf_mask, torch.zeros_like(param), param) + + # Restore CUDA_VISIBLE_DEVICES after model creation and initialization + if original_cuda_visible is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = original_cuda_visible + else: + os.environ.pop("CUDA_VISIBLE_DEVICES", None) + model.to(dtype=torch.bfloat16).save_pretrained(output_path) # Save tokenizer diff --git a/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py index e2373676d2..8a5bad0c62 100644 --- a/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py +++ b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py @@ -18,7 +18,6 @@ from functools import partial from pathlib import Path -import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.puzzletron.utils import setup_test_model_and_data @@ -28,7 +27,6 @@ from modelopt.torch.puzzletron.nas.plugins.puzzletron_nas_plugin import PuzzletronModel -@pytest.mark.skip(reason="Temporarily disabled") def 
test_nas_convert_ffn_pruning(project_root_path: Path, tmp_path: Path): spawn_multiprocess_job( size=torch.cuda.device_count(), @@ -43,12 +41,10 @@ def _test_nas_convert_ffn_pruning_multiprocess_job( dist.setup(timeout=timedelta(10)) # Setup the test model and data. puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( - project_root_path, tmp_path, rank, "llama_3_1_8b_instruct" + project_root_path, tmp_path, rank, "meta-llama/Llama-3.1-8B-Instruct" ) - hydra_config_dir = ( - project_root_path / "tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct" - ) - hydra_config_name = "llama_3_1_8b_instruct" + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + hydra_config_name = "meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct" # # Run the mnt.convert() step @@ -87,7 +83,6 @@ def _test_nas_convert_ffn_pruning_multiprocess_job( dist.cleanup() -@pytest.mark.skip(reason="Temporarily disabled") def test_nas_convert_attn_pruning(project_root_path: Path, tmp_path: Path): spawn_multiprocess_job( size=torch.cuda.device_count(), @@ -102,12 +97,10 @@ def _test_nas_convert_attn_pruning_multiprocess_job( dist.setup(timeout=timedelta(10)) # Setup the test model and data. 
puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( - project_root_path, tmp_path, rank, "llama_3_1_8b_instruct" - ) - hydra_config_dir = ( - project_root_path / "tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct" + project_root_path, tmp_path, rank, "meta-llama/Llama-3.1-8B-Instruct" ) - hydra_config_name = "llama_3_1_8b_instruct-attn-pruning" + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + hydra_config_name = "meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct-attn-pruning" # # Run the mnt.convert() step diff --git a/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py index e39f1e1cbc..2af371e5ca 100644 --- a/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py +++ b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py @@ -17,7 +17,6 @@ from functools import partial from pathlib import Path -import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.puzzletron.utils import setup_test_model_and_data @@ -27,7 +26,6 @@ from modelopt.torch.puzzletron.nas.plugins.puzzletron_nas_plugin import PuzzletronModel -@pytest.mark.skip(reason="Temporarily disabled") def test_nas_search(project_root_path: Path, tmp_path: Path): spawn_multiprocess_job( size=torch.cuda.device_count(), @@ -42,12 +40,10 @@ def _test_nas_search_multiprocess_job( dist.setup(timeout=timedelta(10)) # Setup the test model and data. 
puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( - project_root_path, tmp_path, rank, "llama_3_1_8b_instruct" + project_root_path, tmp_path, rank, "meta-llama/Llama-3.1-8B-Instruct" ) - hydra_config_dir = ( - project_root_path / "tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct" - ) - hydra_config_name = "llama_3_1_8b_instruct" + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + hydra_config_name = "meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct" # # Run the mnt.convert() step diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-ffn-pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct.yaml similarity index 76% rename from tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-ffn-pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct.yaml index 8af352660b..2843f0b97a 100644 --- a/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-ffn-pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct.yaml @@ -1,18 +1,19 @@ +# @package _global_ defaults: - - pruning: ffn_pruning - - scoring: ../validate_solutions_defaults - - realize_model: ../validate_solutions_defaults - - bypass: - - override hydra/hydra_logging: disabled + - /Qwen/Qwen2.5-7B-Instruct/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model - _self_ puzzle_dir: ??? teacher_dir: ${puzzle_dir}/ckpts/teacher/ replacement_library_path: ${puzzle_dir}/replacement_library.json -dataset_path: ??? # path to v0.4_mini +dataset_path: ??? 
# path to v0.4_mini skip_realize_model: false +descriptor: qwen2 + build_replacement_library: add_ffn_no_ops: true add_attention_no_ops: true @@ -21,15 +22,17 @@ calc_subblock_stats: batch_sizes: [64, 96, 128] prefill_seq_len: 4096 generation_seq_len: 4096 - num_active_tokens_override: # Optional override for sequence lengths + num_active_tokens_override: # Optional override for sequence lengths prefill_queue_size: 0 allocate_prefill_query: false - benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" scoring: + descriptor: ${descriptor} + solutions_to_validate: skip_existing_solutions: true @@ -54,6 +57,8 @@ mip: # puzzle_profile: objective: metrics.cosine_embedding_loss_hidden_states bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 subblock_stats_args: - batch_size: 96 @@ -77,18 +82,23 @@ mip: target_memory: 780_000 # 78_000 mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true metric_overrides: + constrain_search_func: max_seconds_per_solution: 60 realize_model: + descriptor: ${descriptor} + teacher_dir: ${to_path:${teacher_dir}} tokenizer_name: ${to_path:${teacher_dir}} replacement_library_path: ${replacement_library_path} save_models: true - solutions_path: # Filled dynamically + solutions_path: # Filled dynamically # Validate params - skip_validation: false # To enable validation of the model solution set `skip_validation` as False + skip_validation: false # To enable validation of the model solution set `skip_validation` as False eval_samples: 2 micro_batch_size: 1 dataset_path: ${dataset_path}/valid diff --git a/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/pruning/ffn_pruning.yaml 
b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/pruning/ffn_pruning.yaml new file mode 100644 index 0000000000..cf6201080c --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.qwen2.qwen2_model_descriptor.Qwen2FFNIntermediateLayerDescriptor diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-attn-pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/Qwen3-8B.yaml similarity index 76% rename from tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-attn-pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/Qwen3-8B.yaml index 473a5d418d..cd82a47271 100644 --- a/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-attn-pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/Qwen3-8B.yaml @@ -1,18 +1,19 @@ +# @package _global_ defaults: - - pruning: attn_pruning - - scoring: ../validate_solutions_defaults - - realize_model: ../validate_solutions_defaults - - bypass: - - override hydra/hydra_logging: disabled + - /Qwen/Qwen3-8B/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model - _self_ puzzle_dir: ??? teacher_dir: ${puzzle_dir}/ckpts/teacher/ replacement_library_path: ${puzzle_dir}/replacement_library.json -dataset_path: ??? # path to v0.4_mini +dataset_path: ??? 
# path to v0.4_mini skip_realize_model: false +descriptor: qwen3 + build_replacement_library: add_ffn_no_ops: true add_attention_no_ops: true @@ -21,15 +22,16 @@ calc_subblock_stats: batch_sizes: [64, 96, 128] prefill_seq_len: 4096 generation_seq_len: 4096 - num_active_tokens_override: # Optional override for sequence lengths + num_active_tokens_override: # Optional override for sequence lengths prefill_queue_size: 0 - allocate_prefill_query: false - benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" scoring: + descriptor: ${descriptor} + solutions_to_validate: skip_existing_solutions: true @@ -54,6 +56,8 @@ mip: # puzzle_profile: objective: metrics.cosine_embedding_loss_hidden_states bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 subblock_stats_args: - batch_size: 96 @@ -77,18 +81,23 @@ mip: target_memory: 780_000 # 78_000 mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true metric_overrides: + constrain_search_func: max_seconds_per_solution: 60 realize_model: + descriptor: ${descriptor} + teacher_dir: ${to_path:${teacher_dir}} tokenizer_name: ${to_path:${teacher_dir}} replacement_library_path: ${replacement_library_path} save_models: true - solutions_path: # Filled dynamically + solutions_path: # Filled dynamically # Validate params - skip_validation: false # To enable validation of the model solution set `skip_validation` as False + skip_validation: false # To enable validation of the model solution set `skip_validation` as False eval_samples: 2 micro_batch_size: 1 dataset_path: ${dataset_path}/valid diff --git a/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/pruning/ffn_pruning.yaml new file 
mode 100644 index 0000000000..e6e6ce5bb4 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.qwen3_8b.qwen3_8b_model_descriptor.Qwen3_8BFFNIntermediateLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/Qwen3-VL-30B-A3B-Instruct.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/Qwen3-VL-30B-A3B-Instruct.yaml new file mode 100644 index 0000000000..00b21ea979 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/Qwen3-VL-30B-A3B-Instruct.yaml @@ -0,0 +1,113 @@ +# @package _global_ +defaults: + - /Qwen/Qwen3-VL-30B-A3B-Instruct/pruning@pruning: expert_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to v0.4_mini + +skip_realize_model: false + +descriptor: qwen3_vl + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + descriptor: ${descriptor} + + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + - stats.num_local_experts + + human_constraints: + + mip_constraints: + - stats.num_local_experts: 1472 # 
same constraint as nemotron-3-nano for test consistency + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/pruning/expert_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/pruning/expert_pruning.yaml new file mode 100644 index 0000000000..81c5f35ba5 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/pruning/expert_pruning.yaml @@ -0,0 +1,20 @@ +defaults: + - /pruning/pruning_defaults@_here_ + +eval_samples: 10 +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/expert_removal/${pruning.experiment_id} +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin.ExpertRemovalPruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct.qwen3_vl_30b_a3b_instruct_model_descriptor.Qwen3VL30BA3BInstructExpertRemovalLayerDescriptor + target_name: "mlp" + +hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.Qwen3VLRemoveExpertsIndependentHook} +activation_hooks_kwargs: + +# num_experts_to_keep must be >= num_experts_per_tok (can't route to more experts than exist) 
+num_experts_to_keep_list: [8] # num_experts in test model is 16, num_experts_per_tok is 8 +mlp_init_mode: "ExpertRemoval" +mlp_init_config_yaml: + expert_scores_key: "expert_ranks_mse" + layer_prefix_template: "model.language_model.layers.{layer_idx}.mlp" diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/attn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/attn_pruning.yaml deleted file mode 100644 index 01886607e4..0000000000 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/attn_pruning.yaml +++ /dev/null @@ -1,16 +0,0 @@ -defaults: - - pruning_defaults - -activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} - -activation_hooks_kwargs: - method: independent_kv_head_contribution - optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory - target_layer: "self_attn.o_proj" - layer_input_descriptors_path: - -# n_heads_in_group: 4 -# num_attention_heads: 32 # num query heads -# num_kv_heads: 32 / 4 = 8 # num_query_heads // n_heads_in_group -n_heads_in_group_list: [8, 16, 32] # num_kv_heads = [4, 2, 1] -gqa_init_mode: "PruneKVHeads" diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/hidden_dim_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/hidden_dim_pruning.yaml deleted file mode 100644 index 407c835d8c..0000000000 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/hidden_dim_pruning.yaml +++ /dev/null @@ -1,15 +0,0 @@ -defaults: - - pruning_defaults - -activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} - -activation_hooks_kwargs: - method: layer_norm_contribution - target_layer: "layernorm" - -# Hidden dimension pruning specific settings 
-hidden_size_list: [3072, 2048] # Target hidden sizes to prune to -hidden_size_init_mode: "PruneByChannelRanking" -mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher -gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher -linear_init_mode: "FromTeacher" diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_solutions_defaults.yaml b/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_solutions_defaults.yaml deleted file mode 100644 index ec13902379..0000000000 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_solutions_defaults.yaml +++ /dev/null @@ -1,10 +0,0 @@ -defaults: - - /validate_model_defaults - - _self_ - -solutions_to_validate: -skip_validation: false -save_models: false -bigger_is_better: false -sort_solutions_by: -calculate_full_score_ablations: false diff --git a/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct-attn-pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct-attn-pruning.yaml new file mode 100644 index 0000000000..57051431a1 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct-attn-pruning.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /meta-llama/Llama-3.1-8B-Instruct/pruning@pruning: attn_pruning + - _self_ + +descriptor: llama + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +dataset_path: ??? 
diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct-attn-pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct.yaml similarity index 94% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct-attn-pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct.yaml index 02c73aca69..8e2e0786b3 100644 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct-attn-pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct.yaml @@ -1,9 +1,8 @@ +# @package _global_ defaults: - - pruning: attn_pruning - - scoring: ../validate_solutions_defaults - - realize_model: ../validate_solutions_defaults - - bypass: - - override hydra/hydra_logging: disabled + - /meta-llama/Llama-3.1-8B-Instruct/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model - _self_ descriptor: llama diff --git a/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/attn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/attn_pruning.yaml new file mode 100644 index 0000000000..6e8af1f651 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/attn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/attn_pruning@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaKVHeadsLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/ffn_pruning.yaml new file mode 
100644 index 0000000000..b30f4a17d9 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/Llama-3.2-3B-Instruct.yaml similarity index 94% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct.yaml rename to tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/Llama-3.2-3B-Instruct.yaml index 65ca64ef4e..78cb6bd73c 100644 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/Llama-3.2-3B-Instruct.yaml @@ -1,9 +1,8 @@ +# @package _global_ defaults: - - pruning: ffn_pruning - - scoring: ../validate_solutions_defaults - - realize_model: ../validate_solutions_defaults - - bypass: - - override hydra/hydra_logging: disabled + - /meta-llama/Llama-3.2-3B-Instruct/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model - _self_ descriptor: llama diff --git a/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/pruning/ffn_pruning.yaml new file mode 100644 index 0000000000..b30f4a17d9 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + 
layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/Mistral-Small-24B-Instruct-2501.yaml b/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/Mistral-Small-24B-Instruct-2501.yaml new file mode 100644 index 0000000000..e042c4bb62 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/Mistral-Small-24B-Instruct-2501.yaml @@ -0,0 +1,112 @@ +# @package _global_ +defaults: + - /mistralai/Mistral-Small-24B-Instruct-2501/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? # path to v0.4_mini + +skip_realize_model: false + +descriptor: mistral_small + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + descriptor: ${descriptor} + + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + 
shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + human_constraints: + target_memory: 780_000 # 78_000 + + mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/pruning/ffn_pruning.yaml new file mode 100644 index 
0000000000..37c21fd638 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.mistral_small.mistral_small_model_descriptor.MistralFFNIntermediateLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16.yaml new file mode 100644 index 0000000000..ab2b09e679 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16.yaml @@ -0,0 +1,115 @@ +# @package _global_ +defaults: + - /nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning@pruning: expert_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model + - _self_ + + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to v0.4_mini + +skip_realize_model: false + +descriptor: nemotron_h + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + runtime_stats: + backend: trt_torch + +scoring: + descriptor: ${descriptor} + + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + seed: 42 + shuffle_seed: 444 + dataset_path: ${dataset_path}/valid + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + - stats.num_local_experts + + 
human_constraints: + mip_constraints: + - stats.num_local_experts: 1472 # teacher has: 23 moe-blocks * 128 experts = 2944 total experts use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + seed: 42 + shuffle_seed: 444 + dataset_path: ${dataset_path}/valid + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/expert_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/expert_pruning.yaml new file mode 100644 index 0000000000..4c2335becf --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/expert_pruning.yaml @@ -0,0 +1,18 @@ +defaults: + - /pruning/pruning_defaults@_here_ + +eval_samples: 10 +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/expert_removal/${pruning.experiment_id} +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin.ExpertRemovalPruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.nemotron_h.nemotron_h_model_descriptor.NemotronHExpertRemovalLayerDescriptor + target_name: "mixer" + +hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.NemotronHRemoveExpertsIndependentHook} +activation_hooks_kwargs: # Additional 
kwargs to pass to the hook init + +num_experts_to_keep_list: [96, 64, 32, 16, 8] # num_experts in teacher is 128 +mlp_init_mode: "ExpertRemoval" +mlp_init_config_yaml: + expert_scores_key: "expert_ranks_mse" diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/ffn_pruning.yaml new file mode 100644 index 0000000000..cb1147d86b --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/ffn_pruning.yaml @@ -0,0 +1,14 @@ +defaults: + - /pruning/pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn/${pruning.experiment_id} +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin.FFNIntermediatePruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor + +hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.IterativeChannelContributionHook} +activation_hooks_kwargs: # Additional kwargs to pass to the hook init + +intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 +mlp_init_mode: "PruneByActivationsLog" diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/NVIDIA-Nemotron-Nano-12B-v2.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/NVIDIA-Nemotron-Nano-12B-v2.yaml new file mode 100644 index 0000000000..906b7338d8 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/NVIDIA-Nemotron-Nano-12B-v2.yaml @@ -0,0 +1,113 @@ +# @package _global_ +defaults: + - /nvidia/NVIDIA-Nemotron-Nano-12B-v2/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - 
/validate_solutions_defaults@realize_model + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? # path to v0.4_mini + +skip_realize_model: false + +descriptor: nemotron_h_v2 + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + descriptor: ${descriptor} + + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - 
stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + human_constraints: + target_memory: 780_000 # 78_000 + + mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/pruning/ffn_pruning.yaml new file mode 100644 index 0000000000..f68068c3ac --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/pruning/ffn_pruning.yaml @@ -0,0 +1,12 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2.nemotron_h_v2_model_descriptor.NemotronHV2FFNIntermediateLayerDescriptor + +activation_hooks_kwargs: + method: iterative + target_layer: "mixer.down_proj" + layer_input_descriptors_path: diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml similarity index 67% rename from 
tests/_test_utils/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml index 01886607e4..7306b6e379 100644 --- a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml @@ -1,8 +1,15 @@ defaults: - - pruning_defaults + - /pruning/pruning_defaults@_here_ + - _self_ activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin.KVHeadsPruningMixIn + layer_descriptor: + _target_: ??? + +hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.IndependentKvHeadContributionHook} activation_hooks_kwargs: method: independent_kv_head_contribution optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/pruning/ffn_pruning_base.yaml similarity index 72% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/ffn_pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/pruning/ffn_pruning_base.yaml index cad6fcf3ee..7e19afbbce 100644 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/ffn_pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/pruning/ffn_pruning_base.yaml @@ -1,12 +1,13 @@ defaults: - - pruning_defaults + - /pruning/pruning_defaults@_here_ + - _self_ activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} pruning_mixin: _target_: modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin.FFNIntermediatePruningMixIn layer_descriptor: - _target_: 
modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor + _target_: ??? hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.IterativeChannelContributionHook} activation_hooks_kwargs: @@ -14,5 +15,5 @@ activation_hooks_kwargs: target_layer: "mlp.down_proj" layer_input_descriptors_path: -intermediate_size_list: [256] # teacher_intermediate_size is 14336 +intermediate_size_list: [256] mlp_init_mode: "PruneByActivationsLog" diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml similarity index 93% rename from tests/_test_utils/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml index 407c835d8c..4033fedf3a 100644 --- a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml @@ -1,5 +1,5 @@ defaults: - - pruning_defaults + - /pruning/pruning_defaults@_here_ activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/pruning_defaults.yaml b/tests/gpu/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml similarity index 94% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/pruning_defaults.yaml rename to tests/gpu/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml index b24ea1b7cc..f00a86da66 100644 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/pruning_defaults.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml @@ -1,12 +1,13 @@ defaults: - - /validate_model_defaults + - 
/validate_model_defaults@_here_ -descriptor: ${descriptor} model_name_or_path: ${teacher_dir} experiment_id: ${pruning.eval_samples}samples_diverse_mini activations_log_dir: ??? activation_hooks_kwargs: ??? +descriptor: ${descriptor} + # Data: eval_samples: 100 micro_batch_size: 4 diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_model_defaults.yaml b/tests/gpu/torch/puzzletron/resources/configs/validate_model_defaults.yaml similarity index 100% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_model_defaults.yaml rename to tests/gpu/torch/puzzletron/resources/configs/validate_model_defaults.yaml diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml b/tests/gpu/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml similarity index 100% rename from tests/_test_utils/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml rename to tests/gpu/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml diff --git a/tests/gpu/torch/puzzletron/resources/hf_configs/llama_3_1_8b_instruct/config.json b/tests/gpu/torch/puzzletron/resources/hf_configs/llama_3_1_8b_instruct/config.json deleted file mode 100644 index 0bb6fd75b3..0000000000 --- a/tests/gpu/torch/puzzletron/resources/hf_configs/llama_3_1_8b_instruct/config.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": [ - 128001, - 128008, - 128009 - ], - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 131072, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": { - "factor": 8.0, - "low_freq_factor": 1.0, - 
"high_freq_factor": 4.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - }, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.42.3", - "use_cache": true, - "vocab_size": 128256 -} diff --git a/tests/gpu/torch/puzzletron/test_puzzletron.py b/tests/gpu/torch/puzzletron/test_puzzletron.py index a42a716547..cf600558e5 100644 --- a/tests/gpu/torch/puzzletron/test_puzzletron.py +++ b/tests/gpu/torch/puzzletron/test_puzzletron.py @@ -21,6 +21,7 @@ import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job +from _test_utils.torch.misc import set_seed from _test_utils.torch.puzzletron.utils import setup_test_model_and_data import modelopt.torch.utils.distributed as dist @@ -31,46 +32,30 @@ # using a one-click command. # # Note: Bypass is disabled now in the test. +# + +SEED = 1234 @pytest.mark.parametrize( - ( - "hf_config_name", - "converter", - "hydra_config_subdir", - "hybrid_override_pattern", - "has_moe_layers", - ), + ("hf_model_name", "converter", "hybrid_override_pattern", "has_moe_layers"), [ - ("llama_3_1_8b_instruct", "llama", "llama_3_1_8b_instruct", None, False), - # ("llama_3_2_3b_instruct", "llama", "llama_3_1_8b_instruct", None, False), - # ("qwen2_5_7b_instruct", "qwen2", "qwen2_5_7b_instruct", None, False), - # ( - # "mistral-small-24b-instruct-2501", - # "mistral_small", - # "mistral-small-24b-instruct-2501", - # None, - # False, - # ), - # ("qwen3-8b", "qwen3", "qwen3-8b", None, False), - # ("qwen3-vl-30b-a3b-instruct", "qwen3_vl", "qwen3-vl-30b-a3b-instruct", None, True), - # ("nemotron-nano-12b-v2", "nemotron_h_v2", "nemotron-nano-12b-v2", "*-", False), - # ( - # "nemotron-3-nano-30b-a3b-base-bf16", - # "nemotron_h", - # "nemotron-3-nano-30b-a3b-base-bf16", - # "*E", - # True, - # ), - # ("gpt-oss-20b", "gpt_oss_20b", "gpt-oss-20b", None, True), + ("meta-llama/Llama-3.1-8B-Instruct", "llama", None, False), + 
("meta-llama/Llama-3.2-3B-Instruct", "llama", None, False), + ("mistralai/Mistral-Small-24B-Instruct-2501", "mistral_small", None, False), + ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16", "nemotron_h", "*E", True), + ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", "nemotron_h_v2", "*-", False), + # ("openai/gpt-oss-20b", "gpt_oss", None, True), + ("Qwen/Qwen2.5-7B-Instruct", "qwen2", None, False), + ("Qwen/Qwen3-8B", "qwen3", None, False), + ("Qwen/Qwen3-VL-30B-A3B-Instruct", "qwen3_vl", None, True), ], ) def test_puzzletron( project_root_path: Path, tmp_path: Path, - hf_config_name: str, + hf_model_name: str, converter: str, - hydra_config_subdir: str, hybrid_override_pattern: str, has_moe_layers: bool, ): @@ -80,9 +65,8 @@ def test_puzzletron( _test_puzzletron_multiprocess_job, project_root_path, tmp_path, - hf_config_name, + hf_model_name, converter, - hydra_config_subdir, hybrid_override_pattern, has_moe_layers, ), @@ -93,23 +77,25 @@ def test_puzzletron( def _test_puzzletron_multiprocess_job( project_root_path: Path, tmp_path: Path, - hf_config_name: str, + hf_model_name: str, converter: str, - hydra_config_subdir: str, hybrid_override_pattern: str, has_moe_layers: bool, rank: int, size: int, ): + # Set seed BEFORE dist.setup() to ensure reproducibility across all processes + set_seed(SEED) + dist.setup(timeout=timedelta(10)) # Setup the test model and data. puzzle_dir, hf_checkpoint_path, dataset_path = setup_test_model_and_data( - project_root_path, tmp_path, rank, hf_config_name, hybrid_override_pattern - ) - hydra_config_dir = ( - project_root_path / f"tests/gpu/torch/puzzletron/resources/configs/{hydra_config_subdir}" + project_root_path, tmp_path, rank, hf_model_name, hybrid_override_pattern ) + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + model_basename = hf_model_name.split("/")[1] + hydra_config_name = f"{hf_model_name}/{model_basename}" # Convert the model using AnyModel converter. 
if rank == 0: @@ -122,7 +108,7 @@ def _test_puzzletron_multiprocess_job( # Compress the model using a one-click approach puzzletron.puzzletron( - str(hydra_config_dir), hydra_config_subdir, str(puzzle_dir), str(dataset_path) + str(hydra_config_dir), hydra_config_name, str(puzzle_dir), str(dataset_path) ) # @@ -159,16 +145,16 @@ def _test_puzzletron_multiprocess_job( assert (solution_dir / "solutions.json").exists() # Validate lm_loss - _assert_lm_loss(puzzle_dir, hf_config_name) + _assert_lm_loss(puzzle_dir, hf_model_name, tolerance=0.01) else: # assertions for the score_pruning_activations step 1 (FFN pruning) - _assert_score_pruning_activations(puzzle_dir, hf_config_name) + _assert_score_pruning_activations(puzzle_dir, hf_model_name) # assertions for the pruning_ckpts step 2 assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() # assertions for the mip_and_realize_models step 6 - _assert_mip_solutions(puzzle_dir, hf_config_name) + _assert_mip_solutions(puzzle_dir, hf_model_name) # assertions for the build_library_and_stats step 4 assert (puzzle_dir / "replacement_library.json").is_file() @@ -183,7 +169,7 @@ def _test_puzzletron_multiprocess_job( dist.cleanup() print( - f"PYTEST SUMMARY: test_puzzletron({hf_config_name}) test has finished successfully. " + f"PYTEST SUMMARY: test_puzzletron({hf_model_name}) test has finished successfully. 
" f"Puzzle directory: {puzzle_dir}" ) @@ -191,52 +177,50 @@ def _test_puzzletron_multiprocess_job( # Expected pruning activation values per model # Each model has a list of (score, channels) tuples for each FFN layer EXPECTED_PRUNING_VALUES = { - "llama_3_1_8b_instruct": [ + "meta-llama/Llama-3.1-8B-Instruct": [ {"score": 73, "channels": 95}, {"score": 440, "channels": 174}, ], - "llama_3_2_3b_instruct": [ + "meta-llama/Llama-3.2-3B-Instruct": [ {"score": 79, "channels": 95}, {"score": 428, "channels": 174}, ], - "qwen2_5_7b_instruct": [ - {"score": 96, "channels": 433}, - {"score": 485, "channels": 105}, - ], - # Mistral Small 24B - "mistral-small-24b-instruct-2501": [ + "mistralai/Mistral-Small-24B-Instruct-2501": [ {"score": 73, "channels": 95}, {"score": 431, "channels": 174}, ], - # Qwen3 8B - "qwen3-8b": [ - {"score": 208, "channels": 51}, - {"score": 475, "channels": 266}, - ], # NemotronH with pattern "*-" has only 1 FFN layer (the "-" layer) - "nemotron-nano-12b-v2": [ + "nvidia/NVIDIA-Nemotron-Nano-12B-v2": [ {"score": 70, "channels": 509}, ], - # Note: nemotron-3-nano-30b-a3b-base-bf16 uses MoE expert pruning, not FFN pruning - # so it doesn't have EXPECTED_PRUNING_VALUES + # nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 uses MoE expert pruning, not FFN pruning + "Qwen/Qwen2.5-7B-Instruct": [ + {"score": 96, "channels": 433}, + {"score": 485, "channels": 105}, + ], + "Qwen/Qwen3-8B": [ + {"score": 208, "channels": 51}, + {"score": 475, "channels": 266}, + ], } # Expected lm_loss values per model EXPECTED_LM_LOSS = { - "llama_3_1_8b_instruct": 4.706878662109375, - "llama_3_2_3b_instruct": 4.816886901855469, - "qwen2_5_7b_instruct": 4.778186798095703, - "nemotron-nano-12b-v2": 4.79390811920166, - "mistral-small-24b-instruct-2501": 4.709150314331055, - "qwen3-8b": 4.733874320983887, - "gpt-oss-20b": 4.689250946044922, - "nemotron-3-nano-30b-a3b-base-bf16": 4.741103172302246, - "qwen3-vl-30b-a3b-instruct": 4.65625, + "meta-llama/Llama-3.1-8B-Instruct": 
4.706878662109375, + "meta-llama/Llama-3.2-3B-Instruct": 4.816886901855469, + "mistralai/Mistral-Small-24B-Instruct-2501": 4.709150314331055, + # TODO: not reproducible in CI, skipping for now + # "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16": 4.7737884521484375, + "nvidia/NVIDIA-Nemotron-Nano-12B-v2": 4.79390811920166, + # "openai/gpt-oss-20b": 4.689250946044922, + "Qwen/Qwen2.5-7B-Instruct": 4.778186798095703, + "Qwen/Qwen3-8B": 4.733874320983887, + "Qwen/Qwen3-VL-30B-A3B-Instruct": 4.65625, } -def _assert_score_pruning_activations(puzzle_dir: Path, hf_config_name: str): +def _assert_score_pruning_activations(puzzle_dir: Path, hf_model_name: str): """Assertions for the score_pruning_activations step 1.""" rank = dist.rank() rank_filepath = f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" @@ -245,7 +229,7 @@ def _assert_score_pruning_activations(puzzle_dir: Path, hf_config_name: str): pruning_scores = torch.load(puzzle_dir / rank_filepath) layer_names = list(pruning_scores.keys()) - expected = EXPECTED_PRUNING_VALUES[hf_config_name] + expected = EXPECTED_PRUNING_VALUES[hf_model_name] size = dist.size() if expected is not None: @@ -267,8 +251,8 @@ def _assert_score_pruning_activations(puzzle_dir: Path, hf_config_name: str): ) else: # Print values for new models - update EXPECTED_PRUNING_VALUES with these - print(f"\n=== PRUNING VALUES for {hf_config_name} (num_layers={len(layer_names)}) ===") - print(f'"{hf_config_name}": [') + print(f"\n=== PRUNING VALUES for {hf_model_name} (num_layers={len(layer_names)}) ===") + print(f'"{hf_model_name}": [') for layer_name in layer_names: layer_data = pruning_scores[layer_name] score = layer_data["score"][0].item() @@ -278,7 +262,7 @@ def _assert_score_pruning_activations(puzzle_dir: Path, hf_config_name: str): print("===") -def _assert_lm_loss(puzzle_dir: Path, hf_config_name: str): +def _assert_lm_loss(puzzle_dir: Path, hf_model_name: str, tolerance: float = 0.01): """Validate lm_loss for a 
model solution.""" solution_0_path = ( puzzle_dir / "single_sequence_replacement_solutions--validation/solution_0.json" @@ -287,19 +271,19 @@ def _assert_lm_loss(puzzle_dir: Path, hf_config_name: str): validation = json.load(f) actual_lm_loss = validation["lm_loss"]["avg"] - expected_lm_loss = EXPECTED_LM_LOSS.get(hf_config_name) + expected_lm_loss = EXPECTED_LM_LOSS.get(hf_model_name) if expected_lm_loss is not None: - assert abs(actual_lm_loss - expected_lm_loss) < 0.01, ( + assert abs(actual_lm_loss - expected_lm_loss) < tolerance, ( f"lm_loss mismatch: expected {expected_lm_loss}, got {actual_lm_loss}" ) else: # Print value for new models - update EXPECTED_LM_LOSS with this - print(f"\n=== LM_LOSS for {hf_config_name} ===") - print(f'"{hf_config_name}": {actual_lm_loss},') + print(f"\n=== LM_LOSS for {hf_model_name} ===") + print(f'"{hf_model_name}": {actual_lm_loss},') print("===") -def _assert_mip_solutions(puzzle_dir: Path, hf_config_name: str): +def _assert_mip_solutions(puzzle_dir: Path, hf_model_name: str): """Assertions for the mip_and_realize_models step.""" mip_dir = puzzle_dir / "mip/puzzle_solutions/target_memory_780000MiB" @@ -307,4 +291,4 @@ def _assert_mip_solutions(puzzle_dir: Path, hf_config_name: str): assert (mip_dir / "solutions--checkpoints/solution_0/config.json").exists() # Validate lm_loss - _assert_lm_loss(puzzle_dir, hf_config_name) + _assert_lm_loss(puzzle_dir, hf_model_name) diff --git a/tox.ini b/tox.ini index bcfb41fca3..33700288b8 100644 --- a/tox.ini +++ b/tox.ini @@ -73,6 +73,8 @@ commands = [testenv:cuda13-gpu-puzzletron] commands_pre = # Install deps here so that it gets installed even in --current-env + pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git + pip install --no-build-isolation git+https://github.com/Dao-AILab/causal-conv1d.git pip install -e .[hf,puzzletron,dev-test] commands = # Coverage fails with "Can't combine line data with arc data" error so not using "--cov"