70 commits
e82164f
Add anymodel directories to feature/puzzletron
danielkorzekwa Mar 4, 2026
2099df3
Make any_model conversion working.
danielkorzekwa Mar 5, 2026
eb5cf8a
Update child_init.py with anymodel version
danielkorzekwa Mar 5, 2026
c9de41c
fix attention pruning
danielkorzekwa Mar 5, 2026
3c1bc1f
Add trust_remote_code to load_model_config (default to false)
danielkorzekwa Mar 5, 2026
8357136
Make activation scoring working
danielkorzekwa Mar 5, 2026
6cc2194
Comment all tested models aside of llama_3_1_8b_instruct
danielkorzekwa Mar 5, 2026
ee4e1e3
Delete not needed decilm test
danielkorzekwa Mar 5, 2026
449b523
Fix broken tests
danielkorzekwa Mar 5, 2026
fb27bba
Update puzzletron_nas_pluging to any_model version
danielkorzekwa Mar 5, 2026
b350f82
Correct test resources used by tests.
danielkorzekwa Mar 5, 2026
fafe5a3
Disable puzzletron tests (will be enabled after all any_model logic i…
danielkorzekwa Mar 5, 2026
e988248
Merge branch 'dkorzekwa/anymodel_core' into dkorzekwa/anymodel_activa…
danielkorzekwa Mar 6, 2026
c717852
Comment out not implemented models.
danielkorzekwa Mar 6, 2026
030f126
format python docs
danielkorzekwa Mar 6, 2026
8dcdfbf
Merge branch 'dkorzekwa/anymodel_core' into dkorzekwa/anymodel_activa…
danielkorzekwa Mar 6, 2026
70df0df
Use trust_remote_code in force_cache_dynamic_modules()
danielkorzekwa Mar 6, 2026
bb56662
Merge branch 'dkorzekwa/anymodel_core' into dkorzekwa/anymodel_activa…
danielkorzekwa Mar 6, 2026
ecd953e
Fix anymodel pruning
danielkorzekwa Mar 6, 2026
ee8f538
Fix buid docs issue.
danielkorzekwa Mar 6, 2026
c9b76a1
Merge branch 'dkorzekwa/anymodel_core' into dkorzekwa/anymodel_activa…
danielkorzekwa Mar 6, 2026
6e3af61
Merge branch 'dkorzekwa/anymodel_activation_scoring' into dkorzekwa/a…
danielkorzekwa Mar 6, 2026
0ad6d92
Merging build_library_and_stats
danielkorzekwa Mar 6, 2026
995eb1a
Merging anymodel: calc_one_block_scores
danielkorzekwa Mar 6, 2026
34081c9
Mering any_model: calc_one_block_scores
danielkorzekwa Mar 6, 2026
ed5c00f
merge any_model: mip_and_realize_models
danielkorzekwa Mar 6, 2026
993b5ec
Add all anymodel models but gptoss
danielkorzekwa Mar 6, 2026
6e9f03b
Make nemotron-nano-12b-v2 to work (set trust_remote_code=true)
danielkorzekwa Mar 9, 2026
e8b7a7d
merge anymodel for nemotron-3-nano-30b-a3b-base-bf16
danielkorzekwa Mar 9, 2026
47414d5
Clarify readme and avoid reusing the same reference in llama_converter.
danielkorzekwa Mar 9, 2026
a8305d8
Fix tied-embedding handling before writing the safetensors index.
danielkorzekwa Mar 9, 2026
68421a5
Fix NaN ranking currently selects NaNs as “best” experts by default.
danielkorzekwa Mar 9, 2026
d6b8028
Code clean up.
danielkorzekwa Mar 9, 2026
ecd2341
Code clean up.
danielkorzekwa Mar 10, 2026
f9d845d
code clean up
danielkorzekwa Mar 10, 2026
d171b01
Merge branch 'dkorzekwa/anymodel_core' into dkorzekwa/anymodel_activa…
danielkorzekwa Mar 10, 2026
722da90
Merge branch 'dkorzekwa/anymodel_activation_scoring' into dkorzekwa/a…
danielkorzekwa Mar 10, 2026
934ab2f
code clean up
danielkorzekwa Mar 10, 2026
0f14ec3
Merge branch 'dkorzekwa/anymodel_pruning' into dkorzekwa/anymodel_bui…
danielkorzekwa Mar 10, 2026
dcb9e02
remove not needed comment
danielkorzekwa Mar 10, 2026
0c9ea5d
Merge branch 'dkorzekwa/anymodel_build_library_and_stats' into dkorze…
danielkorzekwa Mar 10, 2026
5b310e2
Merge branch 'dkorzekwa/any_model_calc_one_block_scores' into dkorzek…
danielkorzekwa Mar 10, 2026
4f82b1c
Merge branch 'dkorzekwa/mip_and_realize_models' into dkorzekwa/any_mo…
danielkorzekwa Mar 10, 2026
176a435
Fix a broken test_puzzletron test on 2 gpus.
danielkorzekwa Mar 10, 2026
02e2c9b
Merge branch 'dkorzekwa/anymodel_activation_scoring' into dkorzekwa/a…
danielkorzekwa Mar 10, 2026
92c4419
Merge branch 'dkorzekwa/anymodel_pruning' into dkorzekwa/anymodel_bui…
danielkorzekwa Mar 10, 2026
aa1eb3e
Merge branch 'dkorzekwa/anymodel_build_library_and_stats' into dkorze…
danielkorzekwa Mar 10, 2026
2b84a96
Merge branch 'dkorzekwa/any_model_calc_one_block_scores' into dkorzek…
danielkorzekwa Mar 10, 2026
fb838c0
Merge branch 'dkorzekwa/mip_and_realize_models' into dkorzekwa/any_mo…
danielkorzekwa Mar 10, 2026
cb6b182
Add mamba to puzzletron dependencies.
danielkorzekwa Mar 12, 2026
670bb34
Update mamba-ssm and casual-conv1d dependences (remove pinpoint versi…
danielkorzekwa Mar 13, 2026
0e1b591
Install mamba-ssm and causal-conv1d in testenv:cuda13-gpu-puzzletron
danielkorzekwa Mar 13, 2026
ca845ec
Fix installing dependencies in testenv:cuda13-gpu-puzzletron
danielkorzekwa Mar 13, 2026
be825bc
Fix anymodel for qwen3 8B in 2 gpus
danielkorzekwa Mar 13, 2026
7fd1afa
Fix pipeline parallelism issue for wen3-vl-30b-a3b-instruct-qwen3_vl-…
danielkorzekwa Mar 13, 2026
7d7b609
Fix multi-gpu issue for nemotron-nano-12b-v2
danielkorzekwa Mar 13, 2026
249af9d
Fix no_op in any_model
danielkorzekwa Mar 13, 2026
b80583c
Merge branch 'feature/puzzletron' into dkorzekwa/any_model_other_models
danielkorzekwa Mar 13, 2026
1dd742e
Fix nemotron_h_model_descriptor.
danielkorzekwa Mar 14, 2026
4a6ebbe
Fix tox -e build-docs
danielkorzekwa Mar 14, 2026
585f0ed
pin mamba/casual-conv1d versions to fix failing assertion for test_pu…
danielkorzekwa Mar 14, 2026
7fb5d9a
Fix for installing mamba-ssm
danielkorzekwa Mar 14, 2026
75d3d69
Fix broken test for nemotron-3-nano-30b-a3b-base-bf16
danielkorzekwa Mar 14, 2026
0e5722d
code clean up
danielkorzekwa Mar 14, 2026
2dd9735
Make test_puzzletron test deterministic
danielkorzekwa Mar 15, 2026
3561de5
Comment out all models but nemotron-3-nano-30b-a3b-base-bf16 to check…
danielkorzekwa Mar 15, 2026
27866de
Implement Qwen3VLRemoveExpertsIndependentHook
danielkorzekwa Mar 15, 2026
52922a4
# Initialize weights to ensure all parameters are properly initialized
danielkorzekwa Mar 16, 2026
c234fb4
Fix non-deterministic test_puzzletron test
danielkorzekwa Mar 16, 2026
53dcd10
Fix for unsetting CUDA_VISIBLE_DEVICES
danielkorzekwa Mar 16, 2026
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
@@ -44,6 +44,8 @@ repos:
rev: v1.17.1
hooks:
- id: mypy
# Exclude HF config directories to avoid duplicate module errors (e.g., configuration_nemotron_h.py exists in multiple model configs)
exclude: "tests/gpu/torch/puzzletron/resources/hf_configs/"

- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v21.1.0
68 changes: 23 additions & 45 deletions modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py
@@ -1142,61 +1142,39 @@ def __call__(


class Qwen3VLRemoveExpertsIndependentHook(RemoveExpertsIndependentHook):
"""Expert removal importance hook for Qwen3-VL models.

TODO: Implement get_router_logits_and_routed_experts based on Qwen3-VL MoE forward pass.
"""
"""Expert removal importance hook for Qwen3-VL models."""

def get_router_logits_and_routed_experts(
self, hidden_states: torch.Tensor, router_logits: torch.Tensor | None = None
) -> tuple[torch.Tensor, torch.Tensor]:
"""Extract router logits and expert outputs for Qwen3-VL MoE.

Note: This is a placeholder implementation. Implement based on Qwen3VLMoeSparseMoe forward.
Based on Qwen3VLMoeSparseMoe forward pass.
"""
batch_size = (
hidden_states.shape[0] * hidden_states.shape[1]
if hidden_states.ndim > 2
else hidden_states.shape[0]
)
router_logits_out = torch.zeros(
batch_size, self.num_local_experts, device=hidden_states.device
)
routed_experts = hidden_states.view(-1, hidden_states.shape[-1])
return router_logits_out, routed_experts
orig_shape = hidden_states.shape

# Flatten to (num_tokens, hidden_size) for processing
hidden_states_flat = hidden_states.reshape(-1, self.moe.hidden_size)

class GptOssRemoveExpertsIndependentHook(RemoveExpertsIndependentHook):
"""Expert removal importance hook for GPT-OSS models.
if router_logits is None:
router_logits = self.moe.gate(hidden_states_flat)

routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float)
routing_weights, router_indices = torch.topk(routing_weights, self.moe.top_k, dim=-1)
routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
routing_weights = routing_weights.to(hidden_states_flat.dtype)
router_weights = torch.zeros_like(router_logits).scatter_(
1, router_indices, routing_weights
)

TODO: Implement get_router_logits_and_routed_experts based on GPT-OSS MoE forward pass.
This is a placeholder implementation that allows the framework to run.
"""
# Reshape hidden_states for moe.experts (expects 3D: batch, seq, hidden)
# router_weights and router_indices remain 2D (num_tokens, num_experts)
batch_size = orig_shape[0] if hidden_states.ndim == 3 else 1
hidden_states_3d = hidden_states_flat.reshape(batch_size, -1, self.moe.hidden_size)

def get_router_logits_and_routed_experts(
self, hidden_states: torch.Tensor, router_logits: torch.Tensor | None = None
) -> tuple[torch.Tensor, torch.Tensor]:
"""Extract router logits and expert outputs for GPT-OSS MoE.
routed_out = self.moe.experts(hidden_states_3d, router_weights, router_indices)

Note: This is a placeholder implementation. For proper expert scoring,
implement based on GptOssSparseMoeBlock forward pass.
# Return in same shape as input
routed_out = routed_out.reshape(*orig_shape)

Args:
hidden_states: Input tensor of shape (batch, seq_len, hidden_dim)
router_logits: Optional pre-computed router logits

Returns:
tuple of (router_logits, routed_experts):
- router_logits: Shape (num_tokens, num_local_experts) - zeros as placeholder
- routed_experts: Original hidden states (no-op)
"""
batch_size = (
hidden_states.shape[0] * hidden_states.shape[1]
if hidden_states.ndim > 2
else hidden_states.shape[0]
)
router_logits_out = torch.zeros(
batch_size, self.num_local_experts, device=hidden_states.device
)
routed_experts = hidden_states.view(-1, hidden_states.shape[-1])
return router_logits_out, routed_experts
return router_logits, routed_out
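The routing math in the new Qwen3-VL hook (softmax over the router logits, top-k selection, renormalization of the kept weights, then a scatter back into a dense per-expert weight matrix) can be sketched in isolation. This is an illustrative standalone function, not the hook itself; the shapes and `top_k` value below are assumptions:

```python
import torch

def dense_topk_router_weights(router_logits: torch.Tensor, top_k: int) -> torch.Tensor:
    """Mirror the routing steps from the hook: softmax over all experts,
    keep the top-k weights, renormalize them to sum to 1, and scatter
    them into a dense (num_tokens, num_experts) matrix with zeros for
    unselected experts."""
    probs = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float)
    topk_w, topk_idx = torch.topk(probs, top_k, dim=-1)
    topk_w = topk_w / topk_w.sum(dim=-1, keepdim=True)
    return torch.zeros_like(probs).scatter_(1, topk_idx, topk_w)

# One token routed over 4 experts with top_k=2: exactly two entries
# per row are nonzero and each row sums to 1.
logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
weights = dense_topk_router_weights(logits, top_k=2)
```

The dense scatter is what lets `moe.experts` consume `router_weights` alongside `router_indices` in the hook above.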
modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py
@@ -19,8 +19,11 @@

from typing import Type

import torch

from modelopt.torch.nas.plugins.megatron_hooks.base_hooks import ForwardHook as ActivationsHook
from modelopt.torch.puzzletron.tools.logger import aprint
from modelopt.torch.puzzletron.utils.dummy_modules import DummyBlock, DummyModule


def register_activation_hooks(
@@ -51,6 +54,16 @@ def register_activation_hooks(
module_names_to_hook = pruning_mixin.get_module_names_to_hook(model)
activation_hooks = dict()
for block_idx, module_name in module_names_to_hook:
try:
module = model.get_submodule(module_name)
except AttributeError:
# Module doesn't exist on this rank's shard (e.g., in distributed setup)
continue

# Skip dummy modules - they don't have real activations to hook
if isinstance(module, (DummyModule, DummyBlock)):
continue

block_config = None
if block_idx is not None:
block_config = model.config.block_configs[block_idx]
Comment on lines +67 to +69 (Contributor)
⚠️ Potential issue | 🟠 Major

Guard block_config lookup before indexing.

Line 69 assumes model.config.block_configs exists and that block_idx is in range. That can raise AttributeError/IndexError at runtime for incompatible configs.

Suggested fix
-        block_config = None
-        if block_idx is not None:
-            block_config = model.config.block_configs[block_idx]
+        block_config = None
+        if block_idx is not None:
+            block_configs = getattr(getattr(model, "config", None), "block_configs", None)
+            if block_configs is None or not (0 <= block_idx < len(block_configs)):
+                raise ValueError(
+                    f"Invalid block_idx={block_idx} for model block_configs."
+                )
+            block_config = block_configs[block_idx]
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py`
around lines 67 - 69, The current block_config lookup assumes
model.config.block_configs exists and block_idx is valid; guard this by first
checking hasattr/getattr for model.config.block_configs (or that
getattr(model.config, "block_configs", None) is not None), verify block_idx is
an int and within range (0 <= block_idx < len(model.config.block_configs))
before indexing, and fall back to None if any check fails; update the
block_config assignment logic around block_config/block_idx to perform these
validations to avoid AttributeError/IndexError.
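The guarded lookup the reviewer describes can be condensed into one small helper; `safe_block_config` is a hypothetical name, and returning `None` on any failed check (rather than raising, as the suggested diff does) is one of the two options the comment allows:

```python
def safe_block_config(config, block_idx):
    """Guarded version of `config.block_configs[block_idx]`: returns None
    unless `config` actually has `block_configs` and `block_idx` is a
    valid in-range index, avoiding AttributeError/IndexError."""
    block_configs = getattr(config, "block_configs", None)
    if block_configs is None or block_idx is None:
        return None
    if not isinstance(block_idx, int) or not (0 <= block_idx < len(block_configs)):
        return None
    return block_configs[block_idx]
```

Either behavior (raise or fall back to `None`) satisfies the finding; the key point is validating both the attribute and the index before subscripting.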

@@ -59,13 +72,25 @@ def register_activation_hooks(
"block_config": block_config,
}

module = model.get_submodule(module_name)
hook = hook_class(module, curr_activation_hooks_kwargs)
module.register_forward_hook(hook)
activation_hooks[module_name] = hook

if len(activation_hooks) == 0:
raise ValueError("couldn't find any hooks")
# In distributed mode, it's okay for a rank to have 0 hooks if it doesn't own
# the target modules (e.g., with hybrid patterns like "*-" where different
# ranks own different layer types). However, we still want to catch real bugs
# where no hooks are found at all.
is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized()
if is_distributed:
aprint(
"No hooks registered on this rank. This is expected if this rank "
"doesn't own any layers matching the hook pattern (e.g., in hybrid "
"patterns with distributed model sharding)."
)
else:
raise ValueError("couldn't find any hooks")

Comment on lines +79 to +93 (Contributor)
⚠️ Potential issue | 🟠 Major

Distributed mode can silently proceed with zero hooks globally.

On Lines 85-90, distributed runs only log when local hooks are zero. If all ranks have zero hooks, this returns successfully and downstream scoring can run without instrumentation.

Suggested fix
     if len(activation_hooks) == 0:
@@
-        if is_distributed:
-            aprint(
-                "No hooks registered on this rank. This is expected if this rank "
-                "doesn't own any layers matching the hook pattern (e.g., in hybrid "
-                "patterns with distributed model sharding)."
-            )
+        if is_distributed:
+            local_count = torch.tensor([0], device="cuda" if torch.cuda.is_available() else "cpu")
+            global_count = local_count.clone()
+            torch.distributed.all_reduce(global_count, op=torch.distributed.ReduceOp.SUM)
+            if global_count.item() == 0:
+                raise ValueError("couldn't find any hooks on any distributed rank")
+            aprint(
+                "No hooks registered on this rank. This is expected if this rank "
+                "doesn't own any layers matching the hook pattern (e.g., in hybrid "
+                "patterns with distributed model sharding)."
+            )
         else:
             raise ValueError("couldn't find any hooks")
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py`
around lines 79 - 93, The current check only logs when the local rank has zero
activation_hooks, which lets a distributed job proceed if every rank has zero
hooks; change the logic so that when torch.distributed.is_available() and
torch.distributed.is_initialized() you compute the global sum of
len(activation_hooks) across ranks (e.g., create a tensor from
len(activation_hooks) and use torch.distributed.all_reduce) and then raise
ValueError if the global sum is 0; keep the existing local informational aprint
when local len is 0 but ensure the global check prevents silent success when all
ranks found zero hooks.
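The global-count guard from this suggestion can be factored into a helper that also works outside a process group, which is convenient for unit tests. `global_hook_count` is a hypothetical name, and the single-process fallback is an assumption for illustration:

```python
import torch
import torch.distributed as dist

def global_hook_count(local_count: int) -> int:
    """Sum hook counts across all ranks via all_reduce; when no process
    group is initialized, the local count is already the global count."""
    if dist.is_available() and dist.is_initialized():
        t = torch.tensor([local_count])
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        return int(t.item())
    return local_count

# In register_activation_hooks, one would then raise only when
# global_hook_count(len(activation_hooks)) == 0.
```

This keeps the per-rank informational log while turning "zero hooks everywhere" back into a hard error.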

aprint(f"Found the following hooks: {activation_hooks.keys()}")
if len(activation_hooks) > 0:
aprint(f"Found the following hooks: {activation_hooks.keys()}")
return activation_hooks
8 changes: 6 additions & 2 deletions modelopt/torch/puzzletron/anymodel/converter/converter.py
@@ -135,9 +135,10 @@ def convert_configs_in_dirs(
cls,
input_dir: Path,
output_dir: Path,
trust_remote_code: bool = False,
):
"""Convert config and add block_configs."""
config = load_model_config(input_dir)
config = load_model_config(input_dir, trust_remote_code=trust_remote_code)

block_configs = cls.create_block_configs_from_main_config(config)
out_config = copy.deepcopy(config)
@@ -179,7 +180,10 @@ def convert(
output_dir: Path to the output AnyModel checkpoint.
"""
cls.copy_checkpoint_files(input_dir, output_dir)
config = cls.convert_configs_in_dirs(input_dir, output_dir)
trust_remote_code = descriptor.requires_trust_remote_code()
config = cls.convert_configs_in_dirs(
input_dir, output_dir, trust_remote_code=trust_remote_code
)
cls.convert_model_weights(
input_dir, output_dir, descriptor=descriptor, num_hidden_layers=config.num_hidden_layers
)
@@ -53,6 +53,18 @@ def block_config_to_layer_overrides(block_config: BlockConfig) -> Dict[str, Any]:
"""
raise NotImplementedError

@staticmethod
def requires_trust_remote_code() -> bool:
"""Whether this model descriptor requires trust_remote_code=True for loading.

Models that use custom code (e.g., via auto_map in config) should override
this to return True.

Returns:
True if trust_remote_code=True is required, False otherwise.
"""
return False

@staticmethod
def mlp_no_op_post_init(decoder_layer: nn.Module):
"""Post-init callback to alter a decoder layer so that FFN/mlp subblock performs as no-op.
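The new descriptor hook is a per-model override point: the base returns `False`, and descriptors for models shipping custom HF code flip it to `True` (as the convert path above then forwards to `load_model_config`). A minimal sketch; the base class is re-declared locally for illustration and `NemotronNanoDescriptor` is a hypothetical subclass name:

```python
class ModelDescriptor:
    """Base descriptor: standard HF models load with trust_remote_code=False."""

    @staticmethod
    def requires_trust_remote_code() -> bool:
        return False


class NemotronNanoDescriptor(ModelDescriptor):
    """Descriptor for a model whose config uses auto_map custom code,
    so loading it requires trust_remote_code=True."""

    @staticmethod
    def requires_trust_remote_code() -> bool:
        return True
```

`Converter.convert` can then stay model-agnostic, asking the descriptor rather than hard-coding the flag per checkpoint.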
12 changes: 6 additions & 6 deletions modelopt/torch/puzzletron/anymodel/models/__init__.py
@@ -16,9 +16,9 @@
# Import models to trigger factory registration
# from modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b import *
from modelopt.torch.puzzletron.anymodel.models.llama import *
# from modelopt.torch.puzzletron.anymodel.models.mistral_small import *
# from modelopt.torch.puzzletron.anymodel.models.nemotron_h import *
# from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2 import *
# from modelopt.torch.puzzletron.anymodel.models.qwen2 import *
# from modelopt.torch.puzzletron.anymodel.models.qwen3_8b import *
# from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct import *
from modelopt.torch.puzzletron.anymodel.models.mistral_small import *
from modelopt.torch.puzzletron.anymodel.models.nemotron_h import *
from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2 import *
from modelopt.torch.puzzletron.anymodel.models.qwen2 import *
from modelopt.torch.puzzletron.anymodel.models.qwen3_8b import *
from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct import *
@@ -0,0 +1,21 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from modelopt.torch.puzzletron.anymodel.models.mistral_small.mistral_small_converter import (
MistralSmallConverter,
)
from modelopt.torch.puzzletron.anymodel.models.mistral_small.mistral_small_model_descriptor import (
MistralSmallModelDescriptor,
)
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# mypy: ignore-errors

from typing import List

from transformers import MistralConfig

from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
AttentionConfig,
BlockConfig,
FFNConfig,
)


@ConverterFactory.register_decorator("mistral_small")
class MistralSmallConverter(Converter):
@staticmethod
def create_block_configs_from_main_config(config: MistralConfig) -> List[BlockConfig]:
num_hidden_layers = config.num_hidden_layers

block_config = BlockConfig(
attention=AttentionConfig(no_op=False, num_key_value_heads=config.num_key_value_heads),
ffn=FFNConfig(no_op=False, intermediate_size=config.intermediate_size),
).to_dict()

block_configs = [block_config] * num_hidden_layers
return block_configs
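The converter builds one identical block config per decoder layer. A self-contained sketch with local stand-in dataclasses (the real `AttentionConfig`/`FFNConfig`/`BlockConfig` live under `decilm.deci_lm_hf_code.block_config`, and the layer counts below are illustrative); note it copies the dict per layer, echoing the earlier commit about not reusing the same reference in `llama_converter`:

```python
import copy
from dataclasses import dataclass, asdict

@dataclass
class AttentionConfig:
    no_op: bool
    num_key_value_heads: int

@dataclass
class FFNConfig:
    no_op: bool
    intermediate_size: int

@dataclass
class BlockConfig:
    attention: AttentionConfig
    ffn: FFNConfig

    def to_dict(self):
        return asdict(self)

def uniform_block_configs(num_layers: int, kv_heads: int, intermediate: int):
    """One block config per layer; deep-copied so later per-layer
    mutations cannot leak across layers via a shared dict."""
    block = BlockConfig(
        attention=AttentionConfig(no_op=False, num_key_value_heads=kv_heads),
        ffn=FFNConfig(no_op=False, intermediate_size=intermediate),
    ).to_dict()
    return [copy.deepcopy(block) for _ in range(num_layers)]

configs = uniform_block_configs(num_layers=4, kv_heads=8, intermediate=32768)
```

`[block] * num_layers`, as in the diff above, yields the same content but shares a single dict object across all layers, which matters once block configs are edited per layer during pruning.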