diff --git a/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py b/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py index 7cd7214443..a868fddc13 100644 --- a/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py +++ b/modelopt/torch/nas/plugins/megatron_hooks/base_hooks.py @@ -602,9 +602,9 @@ def __init__(self, linear_layer: nn.Linear, activation_hooks_kwargs: dict): assert self.optimize_for in ["latency", "memory"] self.hidden_size = model_config.hidden_size - self.n_heads_in_group = block_config.attention.n_heads_in_group self.num_q_heads = model_config.num_attention_heads - self.num_kv_heads = self.num_q_heads // self.n_heads_in_group + self.num_kv_heads = block_config.attention.num_key_value_heads + self.n_heads_in_group = self.num_q_heads // self.num_kv_heads self.head_dim = getattr(model_config, "head_dim", self.hidden_size // self.num_q_heads) self.agg_kv_head_contributions = torch.zeros( @@ -1142,61 +1142,39 @@ def __call__( class Qwen3VLRemoveExpertsIndependentHook(RemoveExpertsIndependentHook): - """Expert removal importance hook for Qwen3-VL models. - - TODO: Implement get_router_logits_and_routed_experts based on Qwen3-VL MoE forward pass. - """ + """Expert removal importance hook for Qwen3-VL models.""" def get_router_logits_and_routed_experts( self, hidden_states: torch.Tensor, router_logits: torch.Tensor | None = None ) -> tuple[torch.Tensor, torch.Tensor]: """Extract router logits and expert outputs for Qwen3-VL MoE. - Note: This is a placeholder implementation. Implement based on Qwen3VLMoeSparseMoe forward. + Based on Qwen3VLMoeSparseMoe forward pass. 
""" - batch_size = ( - hidden_states.shape[0] * hidden_states.shape[1] - if hidden_states.ndim > 2 - else hidden_states.shape[0] - ) - router_logits_out = torch.zeros( - batch_size, self.num_local_experts, device=hidden_states.device - ) - routed_experts = hidden_states.view(-1, hidden_states.shape[-1]) - return router_logits_out, routed_experts + orig_shape = hidden_states.shape + # Flatten to (num_tokens, hidden_size) for processing + hidden_states_flat = hidden_states.reshape(-1, self.moe.hidden_size) -class GptOssRemoveExpertsIndependentHook(RemoveExpertsIndependentHook): - """Expert removal importance hook for GPT-OSS models. + if router_logits is None: + router_logits = self.moe.gate(hidden_states_flat) + + routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float) + routing_weights, router_indices = torch.topk(routing_weights, self.moe.top_k, dim=-1) + routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states_flat.dtype) + router_weights = torch.zeros_like(router_logits).scatter_( + 1, router_indices, routing_weights + ) - TODO: Implement get_router_logits_and_routed_experts based on GPT-OSS MoE forward pass. - This is a placeholder implementation that allows the framework to run. - """ + # Reshape hidden_states for moe.experts (expects 3D: batch, seq, hidden) + # router_weights and router_indices remain 2D (num_tokens, num_experts) + batch_size = orig_shape[0] if hidden_states.ndim == 3 else 1 + hidden_states_3d = hidden_states_flat.reshape(batch_size, -1, self.moe.hidden_size) - def get_router_logits_and_routed_experts( - self, hidden_states: torch.Tensor, router_logits: torch.Tensor | None = None - ) -> tuple[torch.Tensor, torch.Tensor]: - """Extract router logits and expert outputs for GPT-OSS MoE. + routed_out = self.moe.experts(hidden_states_3d, router_weights, router_indices) - Note: This is a placeholder implementation. 
For proper expert scoring, - implement based on GptOssSparseMoeBlock forward pass. + # Return in same shape as input + routed_out = routed_out.reshape(*orig_shape) - Args: - hidden_states: Input tensor of shape (batch, seq_len, hidden_dim) - router_logits: Optional pre-computed router logits - - Returns: - tuple of (router_logits, routed_experts): - - router_logits: Shape (num_tokens, num_local_experts) - zeros as placeholder - - routed_experts: Original hidden states (no-op) - """ - batch_size = ( - hidden_states.shape[0] * hidden_states.shape[1] - if hidden_states.ndim > 2 - else hidden_states.shape[0] - ) - router_logits_out = torch.zeros( - batch_size, self.num_local_experts, device=hidden_states.device - ) - routed_experts = hidden_states.view(-1, hidden_states.shape[-1]) - return router_logits_out, routed_experts + return router_logits, routed_out diff --git a/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py b/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py index 1b1485c713..33243c0125 100644 --- a/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py +++ b/modelopt/torch/puzzletron/activation_scoring/activation_hooks/utils.py @@ -19,8 +19,11 @@ from typing import Type +import torch + from modelopt.torch.nas.plugins.megatron_hooks.base_hooks import ForwardHook as ActivationsHook from modelopt.torch.puzzletron.tools.logger import aprint +from modelopt.torch.puzzletron.utils.dummy_modules import DummyBlock, DummyModule def register_activation_hooks( @@ -51,6 +54,16 @@ def register_activation_hooks( module_names_to_hook = pruning_mixin.get_module_names_to_hook(model) activation_hooks = dict() for block_idx, module_name in module_names_to_hook: + try: + module = model.get_submodule(module_name) + except AttributeError: + # Module doesn't exist on this rank's shard (e.g., in distributed setup) + continue + + # Skip dummy modules - they don't have real activations to hook + if isinstance(module, 
(DummyModule, DummyBlock)): + continue + block_config = None if block_idx is not None: block_config = model.config.block_configs[block_idx] @@ -59,13 +72,25 @@ def register_activation_hooks( "block_config": block_config, } - module = model.get_submodule(module_name) hook = hook_class(module, curr_activation_hooks_kwargs) module.register_forward_hook(hook) activation_hooks[module_name] = hook if len(activation_hooks) == 0: - raise ValueError("couldn't find any hooks") + # In distributed mode, it's okay for a rank to have 0 hooks if it doesn't own + # the target modules (e.g., with hybrid patterns like "*-" where different + # ranks own different layer types). However, we still want to catch real bugs + # where no hooks are found at all. + is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized() + if is_distributed: + aprint( + "No hooks registered on this rank. This is expected if this rank " + "doesn't own any layers matching the hook pattern (e.g., in hybrid " + "patterns with distributed model sharding)." 
+ ) + else: + raise ValueError("couldn't find any hooks") - aprint(f"Found the following hooks: {activation_hooks.keys()}") + if len(activation_hooks) > 0: + aprint(f"Found the following hooks: {activation_hooks.keys()}") return activation_hooks diff --git a/modelopt/torch/puzzletron/anymodel/converter/converter.py b/modelopt/torch/puzzletron/anymodel/converter/converter.py index 5fdc92718c..eb2330b515 100644 --- a/modelopt/torch/puzzletron/anymodel/converter/converter.py +++ b/modelopt/torch/puzzletron/anymodel/converter/converter.py @@ -135,9 +135,10 @@ def convert_configs_in_dirs( cls, input_dir: Path, output_dir: Path, + trust_remote_code: bool = False, ): """Convert config and add block_configs.""" - config = load_model_config(input_dir) + config = load_model_config(input_dir, trust_remote_code=trust_remote_code) block_configs = cls.create_block_configs_from_main_config(config) out_config = copy.deepcopy(config) @@ -179,7 +180,10 @@ def convert( output_dir: Path to the output AnyModel checkpoint. 
""" cls.copy_checkpoint_files(input_dir, output_dir) - config = cls.convert_configs_in_dirs(input_dir, output_dir) + trust_remote_code = descriptor.requires_trust_remote_code() + config = cls.convert_configs_in_dirs( + input_dir, output_dir, trust_remote_code=trust_remote_code + ) cls.convert_model_weights( input_dir, output_dir, descriptor=descriptor, num_hidden_layers=config.num_hidden_layers ) diff --git a/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py b/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py index 73d56d2016..4cc4356c8e 100644 --- a/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py +++ b/modelopt/torch/puzzletron/anymodel/model_descriptor/model_descriptor.py @@ -53,6 +53,18 @@ def block_config_to_layer_overrides(block_config: BlockConfig) -> Dict[str, Any] """ raise NotImplementedError + @staticmethod + def requires_trust_remote_code() -> bool: + """Whether this model descriptor requires trust_remote_code=True for loading. + + Models that use custom code (e.g., via auto_map in config) should override + this to return True. + + Returns: + True if trust_remote_code=True is required, False otherwise. + """ + return False + @staticmethod def mlp_no_op_post_init(decoder_layer: nn.Module): """Post-init callback to alter a decoder layer so that FFN/mlp subblock performs as no-op. 
diff --git a/modelopt/torch/puzzletron/anymodel/models/__init__.py b/modelopt/torch/puzzletron/anymodel/models/__init__.py index f2119059f4..1f3fb477be 100644 --- a/modelopt/torch/puzzletron/anymodel/models/__init__.py +++ b/modelopt/torch/puzzletron/anymodel/models/__init__.py @@ -16,9 +16,9 @@ # Import models to trigger factory registration # from modelopt.torch.puzzletron.anymodel.models.gpt_oss_20b import * from modelopt.torch.puzzletron.anymodel.models.llama import * -# from modelopt.torch.puzzletron.anymodel.models.mistral_small import * -# from modelopt.torch.puzzletron.anymodel.models.nemotron_h import * -# from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2 import * -# from modelopt.torch.puzzletron.anymodel.models.qwen2 import * -# from modelopt.torch.puzzletron.anymodel.models.qwen3_8b import * -# from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct import * +from modelopt.torch.puzzletron.anymodel.models.mistral_small import * +from modelopt.torch.puzzletron.anymodel.models.nemotron_h import * +from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2 import * +from modelopt.torch.puzzletron.anymodel.models.qwen2 import * +from modelopt.torch.puzzletron.anymodel.models.qwen3_8b import * +from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct import * diff --git a/modelopt/torch/puzzletron/anymodel/models/llama/llama_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/llama/llama_model_descriptor.py index fe416e2dd6..082e5da599 100644 --- a/modelopt/torch/puzzletron/anymodel/models/llama/llama_model_descriptor.py +++ b/modelopt/torch/puzzletron/anymodel/models/llama/llama_model_descriptor.py @@ -39,6 +39,7 @@ from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import ( FFNIntermediateLayerDescriptor, ) +from modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin import KVHeadsLayerDescriptor @ModelDescriptorFactory.register_decorator("llama") @@ -129,3 +130,12 @@ class 
LlamaFFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor): linear_weight_names: List[str] = field( default_factory=lambda: ["down_proj", "gate_proj", "up_proj"] ) + + +@dataclass +class LlamaKVHeadsLayerDescriptor(KVHeadsLayerDescriptor): + o_proj_name: str = "self_attn.o_proj" + attn_prefix_name: str = "model.layers.{layer_idx}.self_attn" + qkvo_weight_names: List[str] = field( + default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"] + ) diff --git a/modelopt/torch/puzzletron/anymodel/models/mistral_small/__init__.py b/modelopt/torch/puzzletron/anymodel/models/mistral_small/__init__.py new file mode 100644 index 0000000000..821be47e9d --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/mistral_small/__init__.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from modelopt.torch.puzzletron.anymodel.models.mistral_small.mistral_small_converter import ( + MistralSmallConverter, +) +from modelopt.torch.puzzletron.anymodel.models.mistral_small.mistral_small_model_descriptor import ( + MistralSmallModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_converter.py b/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_converter.py new file mode 100644 index 0000000000..ddc8151dc9 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_converter.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# mypy: ignore-errors + +from typing import List + +from transformers import MistralConfig + +from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( + AttentionConfig, + BlockConfig, + FFNConfig, +) + + +@ConverterFactory.register_decorator("mistral_small") +class MistralSmallConverter(Converter): + @staticmethod + def create_block_configs_from_main_config(config: MistralConfig) -> List[BlockConfig]: + num_hidden_layers = config.num_hidden_layers + + block_config = BlockConfig( + attention=AttentionConfig(no_op=False, num_key_value_heads=config.num_key_value_heads), + ffn=FFNConfig(no_op=False, intermediate_size=config.intermediate_size), + ).to_dict() + + block_configs = [block_config] * num_hidden_layers + return block_configs diff --git a/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_model_descriptor.py new file mode 100644 index 0000000000..1ac2bd7072 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/mistral_small/mistral_small_model_descriptor.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# mypy: ignore-errors + +import re +from dataclasses import dataclass, field +from typing import Dict, List + +from transformers.models.mistral.modeling_mistral import ( + MistralDecoderLayer, + MistralForCausalLM, + MistralRotaryEmbedding, +) + +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) +from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import ( + MatchingZeros, + Same, + return_tuple_of_size, +) +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig +from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import ( + FFNIntermediateLayerDescriptor, +) +from modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin import KVHeadsLayerDescriptor + + +@ModelDescriptorFactory.register_decorator("mistral_small") +class MistralSmallModelDescriptor(ModelDescriptor): + @staticmethod + def decoder_layer_cls(): + return MistralDecoderLayer + + @staticmethod + def block_config_to_layer_overrides(block_config: BlockConfig): + return { + "intermediate_size": block_config.ffn.intermediate_size, + "num_key_value_heads": block_config.attention.num_key_value_heads, + } + + @staticmethod + def attn_no_op_post_init(decoder_layer: MistralDecoderLayer): + decoder_layer.input_layernorm = Same() + decoder_layer.self_attn = return_tuple_of_size(MatchingZeros, size=2)() + + @staticmethod + def mlp_no_op_post_init(decoder_layer: MistralDecoderLayer): + decoder_layer.post_attention_layernorm = Same() + decoder_layer.mlp = MatchingZeros() + + @staticmethod + def init_rotary_embedding(model: MistralForCausalLM, runtime): + model.model.rotary_emb = MistralRotaryEmbedding(model.config, runtime.device) + + @staticmethod + def input_embedding_name(): + return "model.embed_tokens" + + @staticmethod + def output_embedding_name(): + return "lm_head" + + @staticmethod + def final_norm_name(): + return "model.norm" + + @staticmethod + def layer_block_name(index: int): + return 
f"model.layers.{index}" + + @staticmethod + def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]: + layer_name_patterns = { + "embeddings": re.compile(r"^model\.embed_tokens\.weight$"), + "lm_head": re.compile(r"^(model\.norm\.weight|lm_head\.weight)$"), + } + + def build_ffn_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_ffn": re.compile( + rf"^model\.layers\.{layer_idx}\.(post_attention_layernorm\.weight" + r"|mlp\.up_proj\.weight" + r"|mlp\.gate_proj\.weight" + r"|mlp\.down_proj\.weight)$" + ) + for layer_idx in range(num_layers) + } + + def build_attention_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_attention": re.compile( + rf"^model\.layers\.{layer_idx}\.(input_layernorm\.weight" + r"|self_attn\.q_proj\.weight" + r"|self_attn\.k_proj\.weight" + r"|self_attn\.v_proj\.weight" + r"|self_attn\.o_proj\.weight)$" + ) + for layer_idx in range(num_layers) + } + + layer_name_patterns.update(**build_ffn_predicates(), **build_attention_predicates()) + return layer_name_patterns + + +@dataclass +class MistralFFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor): + down_proj_name: str = "mlp.down_proj" + ffn_prefix_name: str = "model.layers.{layer_idx}.mlp" + linear_weight_names: List[str] = field( + default_factory=lambda: ["down_proj", "gate_proj", "up_proj"] + ) + + +@dataclass +class MistralKVHeadsLayerDescriptor(KVHeadsLayerDescriptor): + o_proj_name: str = "self_attn.o_proj" + attn_prefix_name: str = "model.layers.{layer_idx}.self_attn" + qkvo_weight_names: List[str] = field( + default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"] + ) diff --git a/modelopt/torch/puzzletron/anymodel/models/nemotron_h/__init__.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/__init__.py new file mode 100644 index 0000000000..a2140f118e --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/__init__.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from modelopt.torch.puzzletron.anymodel.models.nemotron_h.nemotron_h_converter import ( + NemotronHConverter, +) +from modelopt.torch.puzzletron.anymodel.models.nemotron_h.nemotron_h_model_descriptor import ( + NemotronHModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_converter.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_converter.py new file mode 100644 index 0000000000..16d9e3c73d --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_converter.py @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List + +from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( + AttentionConfig, + BlockConfig, + FFNConfig, + MambaConfig, + MoEConfig, +) + + +@ConverterFactory.register_decorator("nemotron_h") +class NemotronHConverter(Converter): + @staticmethod + def create_block_configs_from_main_config(config) -> List[BlockConfig]: + # Create block configs for each layer based on the hybrid_override_pattern + block_configs = [] + + # Parse the hybrid_override_pattern: "M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-" + pattern = config.hybrid_override_pattern + print(f"Parsing hybrid pattern: {pattern}") + + for i, char in enumerate(pattern): + if char == "M": + _block_config = BlockConfig( + attention=AttentionConfig( + mamba=MambaConfig( # Those parameters are currently used only for calc_block_stats. + state_dim=config.ssm_state_size, + num_heads=config.mamba_num_heads, + head_dim=config.mamba_head_dim, + num_groups=config.n_groups, + ) + ), + ffn=FFNConfig(no_op=True), + ) + + elif char == "-": + _block_config = BlockConfig( + attention=AttentionConfig(no_op=True), + ffn=FFNConfig(intermediate_size=config.intermediate_size), + ) + + elif char == "*": + _block_config = BlockConfig( + attention=AttentionConfig(num_key_value_heads=config.num_key_value_heads), + ffn=FFNConfig(no_op=True), + ) + + elif char == "E": + _block_config = BlockConfig( + attention=AttentionConfig(no_op=True), + ffn=FFNConfig( + moe=MoEConfig( + num_local_experts=config.n_routed_experts, + expert_intermediate_dim=config.moe_intermediate_size, + num_experts_per_tok=config.num_experts_per_tok, + ) + ), + ) + else: + raise ValueError( + f"Unknown character '{char}' in hybrid_override_pattern at position {i}" + ) + + block_configs.append(_block_config) + + print(f"Created {len(block_configs)} block configs from pattern") + return block_configs diff --git 
a/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_model_descriptor.py new file mode 100644 index 0000000000..55d9ef56ca --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h/nemotron_h_model_descriptor.py @@ -0,0 +1,256 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# mypy: ignore-errors + +import importlib +import inspect +import pkgutil +import re +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Tuple, Type + +import torch.nn as nn + +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) +from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import MatchingZeros, Same +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig +from modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin import ( + ExpertRemovalLayerDescriptor, + ExpertRemovalPruningMixIn, +) +from modelopt.torch.puzzletron.pruning.pruning_mixin import PruningMixIn + + +def get_dynamic_modules(module_cls_str: str) -> List[Type[nn.Module]]: + import transformers_modules + + matches = [] + for finder, modname, ispkg in pkgutil.walk_packages( + transformers_modules.__path__, transformers_modules.__name__ + "." + ): + module = importlib.import_module(modname) + for _, obj in inspect.getmembers(module, inspect.isclass): + if obj.__name__ == module_cls_str: + matches.append(obj) + + return matches + + +@dataclass +class NemotronHExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor): + target_name: str = "mixer.gate" + moe_prefix_name: str = "backbone.layers.{layer_idx}.mixer" + expert_prefix_name: str = "experts.{expert_idx}" + router_weights: List[str] = field(default_factory=lambda: ["gate.weight"]) + router_biases: List[str] = field(default_factory=lambda: ["gate.e_score_correction_bias"]) + expert_weights: List[str] = field( + default_factory=lambda: ["up_proj.weight", "down_proj.weight"] + ) + + def get_modules_names_to_hook(self, model) -> List[Tuple[int, str]]: + if self.target_name != "mixer": + return super().get_modules_names_to_hook(model) + + # when target is `mixer` we'll target moe layers of class type: `NemotronHMOE`, as NemotronH models use auto-map we'll check for class name 
instead of class type. + target_class_name = "NemotronHMOE" + + module_names_to_hook = [] + for module_name, module in model.named_modules(): + # restrict to attributes called "mixer" and with the desired class name + if ( + module_name.endswith(self.target_name) + and module.__class__.__name__ == target_class_name + ): + module_names_to_hook.append( + (self.block_idx_from_module_name(module_name), module_name) + ) + return module_names_to_hook + + +@ModelDescriptorFactory.register_decorator("nemotron_h") +class NemotronHModelDescriptor(ModelDescriptor): + _DECODER_LAYER_CLS: Type[nn.Module] = None + + @staticmethod + def decoder_layer_cls(): + decoder_cls_list = get_dynamic_modules("NemotronHBlock") + if not decoder_cls_list: + raise AssertionError( + "NemotronH contains dynamic modules that should be cached beforehand, make sure to load your config using `load_model_config` or manually call `force_cache_dynamic_modules(config, checkpoint_dir)`" + ) + return decoder_cls_list + + @staticmethod + def requires_trust_remote_code() -> bool: + return True + + @staticmethod + def block_config_to_layer_overrides(block_config: BlockConfig): + override_kwargs = {} + if block_config.ffn.intermediate_size is not None: + override_kwargs["intermediate_size"] = block_config.ffn.intermediate_size + + if block_config.attention.num_key_value_heads is not None: + override_kwargs["num_key_value_heads"] = block_config.attention.num_key_value_heads + + if block_config.ffn.moe is not None: + override_kwargs["moe_intermediate_size"] = block_config.ffn.moe.expert_intermediate_dim + override_kwargs["n_routed_experts"] = block_config.ffn.moe.num_local_experts + + return override_kwargs + + @staticmethod + def _block_no_op_post_init(decoder_layer): + """ + Due to the subblock structure of NemotronH always one of the subblock is set to no-op, for a real no-op both attention & ffn no-op should be set to True. 
+ """ + block_config = decoder_layer.config.block_configs[decoder_layer.layer_idx] + if block_config.ffn.no_op and block_config.attention.no_op: + decoder_layer.norm = Same() + decoder_layer.mixer = MatchingZeros() + + @staticmethod + def attn_no_op_post_init(decoder_layer): + NemotronHModelDescriptor._block_no_op_post_init(decoder_layer) + + @staticmethod + def mlp_no_op_post_init(decoder_layer): + NemotronHModelDescriptor._block_no_op_post_init(decoder_layer) + + @classmethod + def create_dummy_block(cls, original_layer: nn.Module, block_index: int) -> nn.Module: + dummy_block = super().create_dummy_block(original_layer, block_index) + # Required by `NemotronHModel.forward`. + dummy_block.block_type = original_layer.block_type + # Preserve layer_idx if it exists (used by _block_no_op_post_init) + if hasattr(original_layer, "layer_idx"): + dummy_block.layer_idx = original_layer.layer_idx + # Preserve config if it exists (used by _block_no_op_post_init to access block_configs) + if hasattr(original_layer, "config"): + dummy_block.config = original_layer.config + return dummy_block + + @staticmethod + def init_rotary_embedding(model, runtime): + """ + NemotronH has no positional embeddings + """ + pass + + @staticmethod + def input_embedding_name(): + return "backbone.embeddings" + + @staticmethod + def output_embedding_name(): + return "lm_head" + + @staticmethod + def final_norm_name(): + return "backbone.norm_f" + + @staticmethod + def layer_block_name(index: int): + return f"backbone.layers.{index}" + + @classmethod + def get_weight_groups( + cls, layer_names: Iterable[str], num_hidden_layers: int + ) -> Dict[str, List[str]]: + """ + Problem with NemotronH is that `norm.weight` can be in both block_{i}_ffn and block_{i}_attention. duplicate groups with `norm.weight` should be removed. 
+ """ + weight_groups = defaultdict(list) + for name in layer_names: + is_matched = False + for group, pattern in cls.layer_name_predicates(num_hidden_layers).items(): + if pattern.match(name): + weight_groups[group].append(name) + is_matched = True + if not is_matched: + raise ValueError(f"Couldn't find a match for {name}") + + valid_weight_groups = {} + for group, names in weight_groups.items(): + if len(names) == 1: + only_name = names[0] + if only_name.endswith("norm.weight") and "layers" in only_name: + # Skip and don't append this group to valid_weight_groups + continue + valid_weight_groups[group] = names + + return valid_weight_groups + + @staticmethod + def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]: + layer_name_patterns = { + "embeddings": re.compile( + r"^(model\.embed_tokens\.weight|backbone\.embeddings\.weight)$" + ), + "lm_head": re.compile(r"^(lm_head\.weight|backbone\.norm_f\.weight)$"), + } + + def build_ffn_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_ffn": re.compile( + rf"^backbone\.layers\.{layer_idx}\." + r"(norm\.weight|" # ← INCLUDED IN FFN + r"mixer\.(gate\.e_score_correction_bias" + r"|gate\.weight" + r"|experts\.\d+\.up_proj\.weight" + r"|experts\.\d+\.down_proj\.weight" + r"|shared_experts\.up_proj\.weight" + r"|shared_experts\.down_proj\.weight))$" + ) + for layer_idx in range(num_layers) + } + + def build_attention_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_attention": re.compile( + rf"^backbone\.layers\.{layer_idx}\." 
+ r"(norm\.weight|" # ← INCLUDED IN ATTENTION + r"mixer\.(norm\.weight" + r"|A_log" + r"|D" + r"|conv1d\.weight" + r"|conv1d\.bias" + r"|dt_bias" + r"|in_proj\.weight" + r"|out_proj\.weight" + r"|q_proj\.weight" + r"|k_proj\.weight" + r"|v_proj\.weight" + r"|o_proj\.weight))$" + ) + for layer_idx in range(num_layers) + } + + layer_name_patterns.update( + **build_ffn_predicates(), + **build_attention_predicates(), + ) + + return layer_name_patterns + + @staticmethod + def pruning_mixins() -> Dict[str, PruningMixIn]: + return { + "experts_removal": ExpertRemovalPruningMixIn(NemotronHExpertRemovalLayerDescriptor()), + } diff --git a/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/__init__.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/__init__.py new file mode 100644 index 0000000000..4b17785ace --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/__init__.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2.nemotron_h_v2_converter import ( + NemotronHV2Converter, +) +from modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2.nemotron_h_v2_model_descriptor import ( + NemotronHV2ModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_converter.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_converter.py new file mode 100644 index 0000000000..2c54388325 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_converter.py @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import List

from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
    AttentionConfig,
    BlockConfig,
    FFNConfig,
    MambaConfig,
    MoEConfig,
)


@ConverterFactory.register_decorator("nemotron_h_v2")
class NemotronHV2Converter(Converter):
    """Converter that maps a NemotronH-V2 HF config onto per-layer block configs."""

    @staticmethod
    def create_block_configs_from_main_config(config) -> List[BlockConfig]:
        """Build one ``BlockConfig`` per character of ``config.hybrid_override_pattern``.

        Pattern legend: ``M`` = Mamba mixer, ``-`` = dense FFN, ``*`` = attention,
        ``E`` = MoE FFN. Any other character raises ``ValueError``.
        """
        pattern = config.hybrid_override_pattern
        print(f"Parsing hybrid pattern: {pattern}")

        def mamba_block() -> BlockConfig:
            # The Mamba parameters are currently consumed only by calc_block_stats.
            return BlockConfig(
                attention=AttentionConfig(
                    mamba=MambaConfig(
                        state_dim=config.ssm_state_size,
                        num_heads=config.mamba_num_heads,
                        head_dim=config.mamba_head_dim,
                        num_groups=config.n_groups,
                    )
                ),
                ffn=FFNConfig(no_op=True),
            )

        def dense_ffn_block() -> BlockConfig:
            return BlockConfig(
                attention=AttentionConfig(no_op=True),
                ffn=FFNConfig(intermediate_size=config.intermediate_size),
            )

        def attention_block() -> BlockConfig:
            return BlockConfig(
                attention=AttentionConfig(num_key_value_heads=config.num_key_value_heads),
                ffn=FFNConfig(no_op=True),
            )

        def moe_block() -> BlockConfig:
            return BlockConfig(
                attention=AttentionConfig(no_op=True),
                ffn=FFNConfig(
                    moe=MoEConfig(
                        num_local_experts=config.n_routed_experts,
                        expert_intermediate_dim=config.moe_intermediate_size,
                        num_experts_per_tok=config.num_experts_per_tok,
                    )
                ),
            )

        builders = {
            "M": mamba_block,
            "-": dense_ffn_block,
            "*": attention_block,
            "E": moe_block,
        }

        block_configs: List[BlockConfig] = []
        for i, char in enumerate(pattern):
            if char not in builders:
                raise ValueError(
                    f"Unknown character '{char}' in hybrid_override_pattern at position {i}"
                )
            block_configs.append(builders[char]())

        print(f"Created {len(block_configs)} block configs from pattern")
        return block_configs
a/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_model_descriptor.py b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_model_descriptor.py new file mode 100644 index 0000000000..f50217d4d3 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/nemotron_h_v2/nemotron_h_v2_model_descriptor.py @@ -0,0 +1,241 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import importlib
import inspect
import pkgutil
import re
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Type

import torch.nn as nn

from modelopt.torch.puzzletron.anymodel.model_descriptor import (
    ModelDescriptor,
    ModelDescriptorFactory,
)
from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import MatchingZeros, Same
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig
from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import (
    FFNIntermediateLayerDescriptor,
    FFNIntermediatePruningMixIn,
)
from modelopt.torch.puzzletron.pruning.pruning_mixin import PruningMixIn


def get_dynamic_modules(module_cls_str: str) -> List[Type[nn.Module]]:
    """Return every class named ``module_cls_str`` found under ``transformers_modules``.

    Remote-code HF models are materialized as dynamic modules in the
    ``transformers_modules`` package; this walks all of them looking for classes
    with the requested name. The import is local because the package only exists
    after a remote-code model has been cached.
    """
    import transformers_modules

    matches = []
    for _finder, modname, _ispkg in pkgutil.walk_packages(
        transformers_modules.__path__, transformers_modules.__name__ + "."
    ):
        module = importlib.import_module(modname)
        for _, obj in inspect.getmembers(module, inspect.isclass):
            if obj.__name__ == module_cls_str:
                matches.append(obj)

    return matches


@dataclass
class NemotronHV2FFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor):
    """Weight-name layout of a NemotronH-V2 FFN sub-block for intermediate pruning."""

    down_proj_name: str = "mixer.down_proj"
    ffn_prefix_name: str = "backbone.layers.{layer_idx}.mixer"
    linear_weight_names: List[str] = field(default_factory=lambda: ["down_proj", "up_proj"])


@ModelDescriptorFactory.register_decorator("nemotron_h_v2")
class NemotronHV2ModelDescriptor(ModelDescriptor):
    """Model descriptor for NemotronH-V2 hybrid (Mamba / attention / FFN) models."""

    # Resolved lazily via decoder_layer_cls(); NemotronH ships as remote code.
    _DECODER_LAYER_CLS: Type[nn.Module] = None

    @staticmethod
    def decoder_layer_cls():
        decoder_cls_list = get_dynamic_modules("NemotronHBlock")
        if not decoder_cls_list:
            raise AssertionError(
                "NemotronH contains dynamic modules that should be cached beforehand, make sure to load your config using `load_model_config` or manually call `force_cache_dynamic_modules(config, checkpoint_dir)`"
            )
        return decoder_cls_list

    @staticmethod
    def requires_trust_remote_code() -> bool:
        return True

    @staticmethod
    def block_config_to_layer_overrides(block_config: BlockConfig):
        """Translate a BlockConfig into NemotronH per-layer config override kwargs."""
        override_kwargs = {}
        if block_config.ffn is not None and block_config.ffn.intermediate_size is not None:
            override_kwargs["intermediate_size"] = block_config.ffn.intermediate_size

        if (
            block_config.attention is not None
            and block_config.attention.num_key_value_heads is not None
        ):
            override_kwargs["num_key_value_heads"] = block_config.attention.num_key_value_heads

        if block_config.ffn is not None and block_config.ffn.moe is not None:
            override_kwargs["moe_intermediate_size"] = block_config.ffn.moe.expert_intermediate_dim
            override_kwargs["n_routed_experts"] = block_config.ffn.moe.num_local_experts

        return override_kwargs

    @staticmethod
    def _block_no_op_post_init(decoder_layer):
        """
        Due to the subblock structure of NemotronH always one of the subblocks is
        set to no-op; for a real no-op both attention & ffn no-op should be set to
        True, in which case the block is replaced by identity norm + zero mixer.
        """
        block_config = decoder_layer.config.block_configs[decoder_layer.layer_idx]
        ffn_no_op = block_config.ffn is not None and block_config.ffn.no_op
        attn_no_op = block_config.attention is not None and block_config.attention.no_op
        if ffn_no_op and attn_no_op:
            decoder_layer.norm = Same()
            decoder_layer.mixer = MatchingZeros()

    @staticmethod
    def attn_no_op_post_init(decoder_layer):
        NemotronHV2ModelDescriptor._block_no_op_post_init(decoder_layer)

    @staticmethod
    def mlp_no_op_post_init(decoder_layer):
        NemotronHV2ModelDescriptor._block_no_op_post_init(decoder_layer)

    @classmethod
    def create_dummy_block(cls, original_layer: nn.Module, block_index: int) -> nn.Module:
        dummy_block = super().create_dummy_block(original_layer, block_index)
        # Required by `NemotronHModel.forward`.
        dummy_block.block_type = original_layer.block_type
        # Preserve layer_idx if it exists (used by _block_no_op_post_init)
        if hasattr(original_layer, "layer_idx"):
            dummy_block.layer_idx = original_layer.layer_idx
        # Preserve config if it exists (used by _block_no_op_post_init to access block_configs)
        if hasattr(original_layer, "config"):
            dummy_block.config = original_layer.config
        return dummy_block

    @staticmethod
    def init_rotary_embedding(model, runtime):
        """
        NemotronH has no positional embeddings
        """
        pass

    @staticmethod
    def input_embedding_name():
        return "backbone.embeddings"

    @staticmethod
    def output_embedding_name():
        return "lm_head"

    @staticmethod
    def final_norm_name():
        return "backbone.norm_f"

    @staticmethod
    def layer_block_name(index: int):
        return f"backbone.layers.{index}"

    @classmethod
    def get_weight_groups(
        cls, layer_names: Iterable[str], num_hidden_layers: int
    ) -> Dict[str, List[str]]:
        """
        Problem with NemotronH is that `norm.weight` can be in both block_{i}_ffn
        and block_{i}_attention; duplicate groups that contain only `norm.weight`
        are removed.
        """
        # Fix: build (and compile) the predicate table ONCE. The original called
        # cls.layer_name_predicates(num_hidden_layers) inside the per-name loop,
        # re-creating and re-compiling O(num_layers) regexes for every weight name.
        predicates = cls.layer_name_predicates(num_hidden_layers)

        weight_groups = defaultdict(list)
        for name in layer_names:
            is_matched = False
            for group, pattern in predicates.items():
                if pattern.match(name):
                    weight_groups[group].append(name)
                    is_matched = True
            if not is_matched:
                raise ValueError(f"Couldn't find a match for {name}")

        valid_weight_groups = {}
        for group, names in weight_groups.items():
            if len(names) == 1:
                only_name = names[0]
                if only_name.endswith("norm.weight") and "layers" in only_name:
                    # Skip and don't append this group to valid_weight_groups
                    continue
            valid_weight_groups[group] = names

        return valid_weight_groups

    @staticmethod
    def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]:
        """Map each weight-group name to a compiled regex over state-dict key names.

        Note: `norm.weight` deliberately appears in BOTH the ffn and the attention
        pattern of a block; get_weight_groups drops the resulting singleton groups.
        """
        layer_name_patterns = {
            "embeddings": re.compile(
                r"^(model\.embed_tokens\.weight|backbone\.embeddings\.weight)$"
            ),
            "lm_head": re.compile(r"^(lm_head\.weight|backbone\.norm_f\.weight)$"),
        }

        def build_ffn_predicates() -> Dict[str, re.Pattern]:
            return {
                f"block_{layer_idx}_ffn": re.compile(
                    rf"^backbone\.layers\.{layer_idx}\."
                    r"(norm\.weight|"  # norm is included in the FFN group as well
                    r"mixer\.(gate\.e_score_correction_bias"
                    r"|gate\.weight"
                    r"|experts\.\d+\.up_proj\.weight"
                    r"|experts\.\d+\.down_proj\.weight"
                    r"|shared_experts\.up_proj\.weight"
                    r"|shared_experts\.down_proj\.weight"
                    r"|up_proj\.weight"  # Simple MLP (non-MoE)
                    r"|down_proj\.weight))$"  # Simple MLP (non-MoE)
                )
                for layer_idx in range(num_layers)
            }

        def build_attention_predicates() -> Dict[str, re.Pattern]:
            return {
                f"block_{layer_idx}_attention": re.compile(
                    rf"^backbone\.layers\.{layer_idx}\."
                    r"(norm\.weight|"  # norm is included in the attention group as well
                    r"mixer\.(norm\.weight"
                    r"|A_log"
                    r"|D"
                    r"|conv1d\.weight"
                    r"|conv1d\.bias"
                    r"|dt_bias"
                    r"|in_proj\.weight"
                    r"|out_proj\.weight"
                    r"|q_proj\.weight"
                    r"|k_proj\.weight"
                    r"|v_proj\.weight"
                    r"|o_proj\.weight))$"
                )
                for layer_idx in range(num_layers)
            }

        layer_name_patterns.update(
            **build_ffn_predicates(),
            **build_attention_predicates(),
        )

        return layer_name_patterns

    @staticmethod
    def pruning_mixins() -> Dict[str, PruningMixIn]:
        return {
            "ffn_intermediate": FFNIntermediatePruningMixIn(
                NemotronHV2FFNIntermediateLayerDescriptor()
            ),
            # TODO: Add expert removal support when ExpertRemovalPruningMixIn is migrated
        }
# mypy: ignore-errors

"""Qwen2 converter for AnyModel compression."""

from typing import List

from transformers import Qwen2Config

from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
    AttentionConfig,
    BlockConfig,
    FFNConfig,
)


@ConverterFactory.register_decorator("qwen2")
class Qwen2Converter(Converter):
    """Converter for Qwen2 models to AnyModel format."""

    @staticmethod
    def create_block_configs_from_main_config(config: Qwen2Config) -> List[BlockConfig]:
        """Create uniform block configs for all Qwen2 layers.

        Qwen2 models have uniform architecture across all layers, so the same
        settings are emitted for every layer.

        NOTE(review): each entry is the ``to_dict()`` form of a ``BlockConfig``,
        which does not match the declared ``List[BlockConfig]`` return type —
        confirm which form the framework expects; the annotation is kept for
        interface compatibility with the other converters.
        """
        num_hidden_layers = config.num_hidden_layers

        # Fix: build an INDEPENDENT dict per layer. The original used
        # `[block_config] * num_hidden_layers`, which aliases one mutable dict
        # across every layer, so a later per-layer edit would silently apply to
        # all layers at once.
        block_configs = [
            BlockConfig(
                attention=AttentionConfig(
                    no_op=False, num_key_value_heads=config.num_key_value_heads
                ),
                ffn=FFNConfig(no_op=False, intermediate_size=config.intermediate_size),
            ).to_dict()
            for _ in range(num_hidden_layers)
        ]
        return block_configs
# mypy: ignore-errors

"""Qwen2 model descriptor for AnyModel compression."""

import re
from dataclasses import dataclass
from typing import Dict

from torch import nn
from transformers.models.qwen2.modeling_qwen2 import (
    Qwen2DecoderLayer,
    Qwen2ForCausalLM,
    Qwen2RotaryEmbedding,
)

from modelopt.torch.puzzletron.anymodel.model_descriptor import (
    ModelDescriptor,
    ModelDescriptorFactory,
)
from modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor import (
    LlamaFFNIntermediateLayerDescriptor,
)
from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import (
    MatchingZeros,
    Same,
    return_tuple_of_size,
)
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig
from modelopt.torch.puzzletron.utils.dummy_modules import DummyBlock


@ModelDescriptorFactory.register_decorator("qwen2")
class Qwen2ModelDescriptor(ModelDescriptor):
    """Model descriptor for Qwen2 models."""

    @staticmethod
    def decoder_layer_cls():
        return Qwen2DecoderLayer

    @classmethod
    def create_dummy_block(cls, original_layer: nn.Module, block_index: int) -> nn.Module:
        """Build a placeholder block, carrying over `attention_type` when present.

        Qwen2's forward pass reads `decoder_layer.attention_type` for attention
        mask selection, so the attribute must survive the swap.
        """
        replacement = DummyBlock(block_index=block_index)
        missing = object()
        attention_type = getattr(original_layer, "attention_type", missing)
        if attention_type is not missing:
            replacement.attention_type = attention_type
        return replacement

    @staticmethod
    def block_config_to_layer_overrides(block_config: BlockConfig):
        overrides = {}
        overrides["intermediate_size"] = block_config.ffn.intermediate_size
        overrides["num_key_value_heads"] = block_config.attention.num_key_value_heads
        return overrides

    @staticmethod
    def attn_no_op_post_init(decoder_layer: Qwen2DecoderLayer):
        # Identity norm + zero attention output, returned as a 2-tuple.
        decoder_layer.input_layernorm = Same()
        decoder_layer.self_attn = return_tuple_of_size(MatchingZeros, size=2)()

    @staticmethod
    def mlp_no_op_post_init(decoder_layer: Qwen2DecoderLayer):
        decoder_layer.post_attention_layernorm = Same()
        decoder_layer.mlp = MatchingZeros()

    @staticmethod
    def init_rotary_embedding(model: Qwen2ForCausalLM, runtime):
        model.model.rotary_emb = Qwen2RotaryEmbedding(config=model.config, device=runtime.device)

    @staticmethod
    def input_embedding_name():
        return "model.embed_tokens"

    @staticmethod
    def output_embedding_name():
        return "lm_head"

    @staticmethod
    def final_norm_name():
        return "model.norm"

    @staticmethod
    def layer_block_name(index: int):
        return f"model.layers.{index}"

    @staticmethod
    def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]:
        """Map each weight-group name to a compiled regex over state-dict key names."""
        predicates: Dict[str, re.Pattern] = {
            "embeddings": re.compile(r"^model\.embed_tokens\.weight$"),
            "lm_head": re.compile(r"^(model\.norm\.weight|lm_head\.weight)$"),
        }
        for layer_idx in range(num_layers):
            predicates[f"block_{layer_idx}_ffn"] = re.compile(
                rf"^model\.layers\.{layer_idx}\.(post_attention_layernorm\.weight"
                r"|mlp\.up_proj\.weight"
                r"|mlp\.gate_proj\.weight"
                r"|mlp\.down_proj\.weight)$"
            )
            # Qwen2 has biases on attention projections.
            predicates[f"block_{layer_idx}_attention"] = re.compile(
                rf"^model\.layers\.{layer_idx}\.(input_layernorm\.weight"
                r"|self_attn\.q_proj\.weight"
                r"|self_attn\.q_proj\.bias"
                r"|self_attn\.k_proj\.weight"
                r"|self_attn\.k_proj\.bias"
                r"|self_attn\.v_proj\.weight"
                r"|self_attn\.v_proj\.bias"
                r"|self_attn\.o_proj\.weight)$"
            )
        return predicates


@dataclass
class Qwen2FFNIntermediateLayerDescriptor(LlamaFFNIntermediateLayerDescriptor):
    """Layer descriptor for Qwen2 FFN intermediate pruning.

    Qwen2 uses the same FFN structure as Llama (gate_proj, up_proj, down_proj),
    so the Llama descriptor is reused unchanged.
    """
+ +from modelopt.torch.puzzletron.anymodel.models.qwen3_8b.qwen3_8b_converter import Qwen3_8BConverter +from modelopt.torch.puzzletron.anymodel.models.qwen3_8b.qwen3_8b_model_descriptor import ( + Qwen3_8BModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/qwen3_8b/qwen3_8b_converter.py b/modelopt/torch/puzzletron/anymodel/models/qwen3_8b/qwen3_8b_converter.py new file mode 100644 index 0000000000..1a389291df --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/qwen3_8b/qwen3_8b_converter.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# mypy: ignore-errors

from typing import List

from transformers import Qwen3Config

from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
    AttentionConfig,
    BlockConfig,
    FFNConfig,
)


@ConverterFactory.register_decorator("qwen3")
class Qwen3_8BConverter(Converter):
    """Converter for dense Qwen3 models to AnyModel format."""

    @staticmethod
    def create_block_configs_from_main_config(config: Qwen3Config) -> List[BlockConfig]:
        """Create uniform block configs for all Qwen3 layers.

        NOTE(review): each entry is the ``to_dict()`` form of a ``BlockConfig``,
        which does not match the declared ``List[BlockConfig]`` return type —
        confirm which form the framework expects; the annotation is kept for
        interface compatibility with the other converters.
        """
        num_hidden_layers = config.num_hidden_layers

        # Fix: build an INDEPENDENT dict per layer. The original used
        # `[block_config] * num_hidden_layers`, which aliases one mutable dict
        # across every layer, so a later per-layer edit would silently apply to
        # all layers at once.
        block_configs = [
            BlockConfig(
                attention=AttentionConfig(
                    no_op=False, num_key_value_heads=config.num_key_value_heads
                ),
                ffn=FFNConfig(no_op=False, intermediate_size=config.intermediate_size),
            ).to_dict()
            for _ in range(num_hidden_layers)
        ]
        return block_configs
# mypy: ignore-errors

import re
from dataclasses import dataclass, field
from typing import Dict, List

from torch import nn
from transformers.models.qwen3.modeling_qwen3 import (
    Qwen3DecoderLayer,
    Qwen3ForCausalLM,
    Qwen3RotaryEmbedding,
)

from modelopt.torch.puzzletron.anymodel.model_descriptor import (
    ModelDescriptor,
    ModelDescriptorFactory,
)
from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import (
    MatchingZeros,
    Same,
    return_tuple_of_size,
)
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig
from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import (
    FFNIntermediateLayerDescriptor,
)
from modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin import KVHeadsLayerDescriptor
from modelopt.torch.puzzletron.utils.dummy_modules import DummyBlock


@ModelDescriptorFactory.register_decorator("qwen3")
class Qwen3_8BModelDescriptor(ModelDescriptor):
    """Model descriptor for dense Qwen3 models (e.g. Qwen3-8B)."""

    @staticmethod
    def decoder_layer_cls():
        return Qwen3DecoderLayer

    @classmethod
    def create_dummy_block(cls, original_layer: nn.Module, block_index: int) -> nn.Module:
        """Build a placeholder block, carrying over `attention_type` when present.

        Qwen3's forward pass reads `decoder_layer.attention_type` for attention
        mask selection, so the attribute must survive the swap.
        """
        replacement = DummyBlock(block_index=block_index)
        missing = object()
        attention_type = getattr(original_layer, "attention_type", missing)
        if attention_type is not missing:
            replacement.attention_type = attention_type
        return replacement

    @staticmethod
    def block_config_to_layer_overrides(block_config: BlockConfig):
        overrides = {}
        overrides["intermediate_size"] = block_config.ffn.intermediate_size
        overrides["num_key_value_heads"] = block_config.attention.num_key_value_heads
        return overrides

    @staticmethod
    def attn_no_op_post_init(decoder_layer: Qwen3DecoderLayer):
        # Identity norm + zero attention output, returned as a 2-tuple.
        decoder_layer.input_layernorm = Same()
        decoder_layer.self_attn = return_tuple_of_size(MatchingZeros, size=2)()

    @staticmethod
    def mlp_no_op_post_init(decoder_layer: Qwen3DecoderLayer):
        decoder_layer.post_attention_layernorm = Same()
        decoder_layer.mlp = MatchingZeros()

    @staticmethod
    def init_rotary_embedding(model: Qwen3ForCausalLM, runtime):
        model.model.rotary_emb = Qwen3RotaryEmbedding(model.config, runtime.device)

    @staticmethod
    def input_embedding_name():
        return "model.embed_tokens"

    @staticmethod
    def output_embedding_name():
        return "lm_head"

    @staticmethod
    def final_norm_name():
        return "model.norm"

    @staticmethod
    def layer_block_name(index: int):
        return f"model.layers.{index}"

    @staticmethod
    def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]:
        """Map each weight-group name to a compiled regex over state-dict key names."""
        predicates: Dict[str, re.Pattern] = {
            "embeddings": re.compile(r"^model\.embed_tokens\.weight$"),
            "lm_head": re.compile(r"^(model\.norm\.weight|lm_head\.weight)$"),
        }
        for layer_idx in range(num_layers):
            predicates[f"block_{layer_idx}_ffn"] = re.compile(
                rf"^model\.layers\.{layer_idx}\.(post_attention_layernorm\.weight"
                r"|mlp\.up_proj\.weight"
                r"|mlp\.gate_proj\.weight"
                r"|mlp\.down_proj\.weight)$"
            )
            # Qwen3 additionally exposes q_norm / k_norm weights in the attention group.
            predicates[f"block_{layer_idx}_attention"] = re.compile(
                rf"^model\.layers\.{layer_idx}\.(input_layernorm\.weight"
                r"|self_attn\.q_proj\.weight"
                r"|self_attn\.k_proj\.weight"
                r"|self_attn\.v_proj\.weight"
                r"|self_attn\.o_proj\.weight"
                r"|self_attn\.q_norm\.weight"
                r"|self_attn\.k_norm\.weight)$"
            )
        return predicates


@dataclass
class Qwen3_8BFFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor):
    """Weight-name layout of a Qwen3 FFN block for intermediate pruning."""

    down_proj_name: str = "mlp.down_proj"
    ffn_prefix_name: str = "model.layers.{layer_idx}.mlp"
    linear_weight_names: List[str] = field(
        default_factory=lambda: ["down_proj", "gate_proj", "up_proj"]
    )


@dataclass
class Qwen3_8BKVHeadsLayerDescriptor(KVHeadsLayerDescriptor):
    """Weight-name layout of a Qwen3 attention block for KV-head pruning."""

    o_proj_name: str = "self_attn.o_proj"
    attn_prefix_name: str = "model.layers.{layer_idx}.self_attn"
    qkvo_weight_names: List[str] = field(
        default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"]
    )
+# See the License for the specific language governing permissions and +# limitations under the License. + +from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct.qwen3_vl_30b_a3b_instruct_converter import ( + Qwen3VL30BA3BInstructConverter, +) +from modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct.qwen3_vl_30b_a3b_instruct_model_descriptor import ( + Qwen3VL30BA3BInstructModelDescriptor, +) diff --git a/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_converter.py b/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_converter.py new file mode 100644 index 0000000000..0c50dfeb9e --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_converter.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# mypy: ignore-errors

from typing import List

from transformers import Qwen3VLMoeConfig

from modelopt.torch.puzzletron.anymodel.converter import Converter, ConverterFactory
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import (
    AttentionConfig,
    BlockConfig,
    FFNConfig,
    MoEConfig,
)


@ConverterFactory.register_decorator("qwen3_vl")
class Qwen3VL30BA3BInstructConverter(Converter):
    """Converter for Qwen3-VL MoE models; interleaves MoE and dense FFN layers."""

    @staticmethod
    def create_block_configs_from_main_config(config: Qwen3VLMoeConfig) -> List[BlockConfig]:
        """Build one BlockConfig per text decoder layer.

        A layer gets a MoE FFN when its index is a multiple of
        ``decoder_sparse_step`` and is not listed in ``mlp_only_layers``;
        otherwise it gets a dense FFN.
        """
        # Qwen3-VL MoE nests the language-model settings under `text_config`.
        text_config = getattr(config, "text_config", config)

        num_hidden_layers = text_config.num_hidden_layers
        decoder_sparse_step = getattr(text_config, "decoder_sparse_step", 1)
        mlp_only_layers = getattr(text_config, "mlp_only_layers", [])

        def _attention() -> AttentionConfig:
            return AttentionConfig(
                no_op=False, num_key_value_heads=text_config.num_key_value_heads
            )

        def _moe_ffn() -> FFNConfig:
            return FFNConfig(
                moe=MoEConfig(
                    num_local_experts=text_config.num_experts,
                    expert_intermediate_dim=text_config.moe_intermediate_size,
                    num_experts_per_tok=text_config.num_experts_per_tok,
                )
            )

        def _dense_ffn() -> FFNConfig:
            return FFNConfig(no_op=False, intermediate_size=text_config.intermediate_size)

        block_configs: List[BlockConfig] = []
        for layer_idx in range(num_hidden_layers):
            uses_moe = (
                layer_idx % decoder_sparse_step == 0 and layer_idx not in mlp_only_layers
            )
            ffn = _moe_ffn() if uses_moe else _dense_ffn()
            block_configs.append(BlockConfig(attention=_attention(), ffn=ffn))

        print(
            f"Created {len(block_configs)} block configs for Qwen3-VL MoE (decoder_sparse_step={decoder_sparse_step})"
        )
        return block_configs
b/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_model_descriptor.py new file mode 100644 index 0000000000..7c7665a644 --- /dev/null +++ b/modelopt/torch/puzzletron/anymodel/models/qwen3_vl_30b_a3b_instruct/qwen3_vl_30b_a3b_instruct_model_descriptor.py @@ -0,0 +1,212 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# mypy: ignore-errors + +import re +from dataclasses import dataclass, field +from typing import Dict, List + +import torch.nn as nn +from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + Qwen3VLMoeTextDecoderLayer, + Qwen3VLMoeTextRotaryEmbedding, + Qwen3VLMoeVisionRotaryEmbedding, +) + +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) +from modelopt.torch.puzzletron.anymodel.puzzformer.no_op import ( + MatchingZeros, + Same, + return_tuple_of_size, +) +from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig +from modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin import ( + ExpertRemovalLayerDescriptor, +) +from modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin import ( + FFNIntermediateLayerDescriptor, +) +from modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin import KVHeadsLayerDescriptor + + +@ModelDescriptorFactory.register_decorator("qwen3_vl") +class Qwen3VL30BA3BInstructModelDescriptor(ModelDescriptor): + @staticmethod + def uses_autocast() -> bool: + """ + Qwen3-VL MoE has a dtype bug in HuggingFace transformers under torch.autocast: + scatter() in MoE routing fails with dtype mismatch. Use native bfloat16 instead. 
+ See: https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct (recommended approach) + """ + return False + + @staticmethod + def get_language_model_config(config): + """Qwen3-VL has nested text_config for language model parameters.""" + return config.text_config if hasattr(config, "text_config") else config + + @staticmethod + def decoder_layer_cls(): + return Qwen3VLMoeTextDecoderLayer + + @staticmethod + def block_config_to_layer_overrides(block_config: BlockConfig): + override_kwargs = {"num_key_value_heads": block_config.attention.num_key_value_heads} + + if block_config.ffn.moe: + override_kwargs["moe_intermediate_size"] = block_config.ffn.moe.expert_intermediate_dim + override_kwargs["num_experts"] = block_config.ffn.moe.num_local_experts + else: + override_kwargs["intermediate_size"] = block_config.ffn.intermediate_size + + return override_kwargs + + @staticmethod + def attn_no_op_post_init(decoder_layer: Qwen3VLMoeTextDecoderLayer): + decoder_layer.input_layernorm = Same() + decoder_layer.self_attn = return_tuple_of_size(MatchingZeros, size=2)() + + @staticmethod + def mlp_no_op_post_init(decoder_layer: Qwen3VLMoeTextDecoderLayer): + decoder_layer.post_attention_layernorm = Same() + decoder_layer.mlp = MatchingZeros() + + @staticmethod + def init_rotary_embedding(model, runtime): + # Re-initialize text rotary embedding on correct device and dtype + text_config = Qwen3VL30BA3BInstructModelDescriptor.get_language_model_config(model.config) + model.model.language_model.rotary_emb = Qwen3VLMoeTextRotaryEmbedding( + config=text_config + ).to(device=runtime.device, dtype=runtime.dtype) + # Re-initialize vision rotary embedding on correct device and dtype + vision_config = ( + model.config.vision_config if hasattr(model.config, "vision_config") else None + ) + if vision_config is not None: + head_dim = vision_config.hidden_size // vision_config.num_heads + model.model.visual.rotary_pos_emb = Qwen3VLMoeVisionRotaryEmbedding(head_dim // 2).to( + 
device=runtime.device, dtype=runtime.dtype + ) + + @staticmethod + def input_embedding_name(): + return "model.language_model.embed_tokens" + + @staticmethod + def output_embedding_name(): + return "lm_head" + + @staticmethod + def final_norm_name(): + return "model.language_model.norm" + + @staticmethod + def layer_block_name(index: int): + return f"model.language_model.layers.{index}" + + @staticmethod + def layer_name_predicates(num_layers: int) -> Dict[str, re.Pattern]: + # Qwen3-VL has text model under model.language_model.* prefix + layer_name_patterns = { + "embeddings": re.compile(r"^model\.language_model\.embed_tokens\.weight$"), + "lm_head": re.compile(r"^(model\.language_model\.norm\.weight|lm_head\.weight)$"), + # Vision encoder (includes merger under model.visual.deepstack_merger_list.*) + "vision_encoding": re.compile(r"^model\.visual\..*"), + } + + def build_ffn_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_ffn": re.compile( + rf"^model\.language_model\.layers\.{layer_idx}\.(post_attention_layernorm\.weight" + # MoE router + r"|mlp\.gate\.weight" + # MoE experts - fused format (gate_up_proj, down_proj without .weight suffix) + r"|mlp\.experts\.gate_up_proj" + r"|mlp\.experts\.down_proj" + # Shared expert (if present) + r"|mlp\.shared_expert\.up_proj\.weight" + r"|mlp\.shared_expert\.gate_proj\.weight" + r"|mlp\.shared_expert\.down_proj\.weight" + r"|mlp\.shared_expert_gate\.weight" + # Dense MLP fallback (for non-MoE layers) + r"|mlp\.up_proj\.weight" + r"|mlp\.gate_proj\.weight" + r"|mlp\.down_proj\.weight)$" + ) + for layer_idx in range(num_layers) + } + + def build_attention_predicates() -> Dict[str, re.Pattern]: + return { + f"block_{layer_idx}_attention": re.compile( + rf"^model\.language_model\.layers\.{layer_idx}\.(input_layernorm\.weight" + r"|self_attn\.q_proj\.weight" + r"|self_attn\.k_proj\.weight" + r"|self_attn\.v_proj\.weight" + r"|self_attn\.o_proj\.weight" + r"|self_attn\.q_norm\.weight" + 
r"|self_attn\.k_norm\.weight)$" + ) + for layer_idx in range(num_layers) + } + + layer_name_patterns.update(**build_ffn_predicates(), **build_attention_predicates()) + return layer_name_patterns + + +@dataclass +class Qwen3VL30BA3BInstructFFNIntermediateLayerDescriptor(FFNIntermediateLayerDescriptor): + down_proj_name: str = "mlp.down_proj" + ffn_prefix_name: str = "model.language_model.layers.{layer_idx}.mlp" + linear_weight_names: List[str] = field( + default_factory=lambda: ["down_proj", "gate_proj", "up_proj"] + ) + + +@dataclass +class Qwen3VL30BA3BInstructKVHeadsLayerDescriptor(KVHeadsLayerDescriptor): + o_proj_name: str = "self_attn.o_proj" + attn_prefix_name: str = "model.language_model.layers.{layer_idx}.self_attn" + qkvo_weight_names: List[str] = field( + default_factory=lambda: ["q_proj", "k_proj", "v_proj", "o_proj"] + ) + + +@dataclass +class Qwen3VL30BA3BInstructExpertRemovalLayerDescriptor(ExpertRemovalLayerDescriptor): + """ + Qwen3-VL MoE layer descriptor. + + Reference: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py + - Qwen3VLMoeTextSparseMoeBlock: MoE block with .gate (router) and .experts + - Qwen3VLMoeTextTopKRouter: Router with .weight (no bias) + - Qwen3VLMoeTextExperts: Fused experts with .gate_up_proj and .down_proj tensors + """ + + target_name: str = "mlp" + moe_prefix_name: str = "model.language_model.layers.{layer_idx}.mlp" + # Router: Qwen3VLMoeTextTopKRouter has self.weight, no bias + router_weights: List[str] = field(default_factory=lambda: ["gate.weight"]) + router_biases: List[str] = field(default_factory=lambda: []) + # Fused expert format: Qwen3VLMoeTextExperts stores all experts in single tensors + # with shape [num_experts, ...] instead of separate tensors per expert. 
+ is_fused_experts: bool = True + fused_expert_weights: List[str] = field( + default_factory=lambda: ["experts.gate_up_proj", "experts.down_proj"] + ) diff --git a/modelopt/torch/puzzletron/anymodel/puzzformer/no_op.py b/modelopt/torch/puzzletron/anymodel/puzzformer/no_op.py index aac57af0a9..9b3a9a2190 100644 --- a/modelopt/torch/puzzletron/anymodel/puzzformer/no_op.py +++ b/modelopt/torch/puzzletron/anymodel/puzzformer/no_op.py @@ -43,7 +43,7 @@ class Wrapped(cls): def forward(self, *args, **kwargs): result = super().forward(*args, **kwargs) outputs = [None] * size - outputs[0] = result[0] + outputs[0] = result if isinstance(result, torch.Tensor) else result[0] return tuple(outputs) def extra_repr(self) -> str: diff --git a/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py index 22d00ea773..24be1b227d 100644 --- a/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py +++ b/modelopt/torch/puzzletron/decilm/deci_lm_hf_code/modeling_decilm.py @@ -534,7 +534,7 @@ def __init__( self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) - self.act_fn = ACT2FN[ffn_config.hidden_act] + self.act_fn = ACT2FN[getattr(ffn_config, "hidden_act", "silu")] if ffn_config.sparsify is not None: self.register_full_backward_hook(sparsity_backward_hook) @@ -579,7 +579,7 @@ def __init__( self.intermediate_size = ffn_config.intermediate_size self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) - self.act_fn = ACT2FN[ffn_config.hidden_act] + self.act_fn = ACT2FN[getattr(ffn_config, "hidden_act", "silu")] if ffn_config.sparsify is not None: 
self.register_full_backward_hook(sparsity_backward_hook) @@ -1037,7 +1037,7 @@ def __init__(self, config: DeciLMConfig, layer_idx: int | tuple[int, ...]): self.self_attn = DeciLMLlama4TextAttention(config, layer_idx, self.attention_config) if not (self.ffn_config.no_op or self.attention_config.is_mamba): - if self.ffn_config.hidden_act is None: + if getattr(self.ffn_config, "hidden_act", None) is None: print(f"WARNING: FFN hidden_act is None for layer {layer_idx}") self.post_attention_layernorm = DeciLMRMSNorm( diff --git a/modelopt/torch/puzzletron/mip/run_puzzle.py b/modelopt/torch/puzzletron/mip/run_puzzle.py index da0f90452d..71913db7d3 100644 --- a/modelopt/torch/puzzletron/mip/run_puzzle.py +++ b/modelopt/torch/puzzletron/mip/run_puzzle.py @@ -29,6 +29,10 @@ import yaml from omegaconf import DictConfig, ListConfig, OmegaConf +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, BlockConfig, @@ -558,7 +562,12 @@ def _parse_teacher_block_metrics( ) -> list[dict]: raw_metrics = json.loads((single_block_replacement_validation_dir / "teacher.json").read_text()) teacher_checkpoint_dir = Path(raw_metrics["args"]["teacher_dir"]).resolve() - teacher_model_config = load_model_config(teacher_checkpoint_dir) + descriptor_name = raw_metrics["args"]["descriptor"] + descriptor = ModelDescriptorFactory.get(descriptor_name) + trust_remote_code = descriptor.requires_trust_remote_code() + teacher_model_config = load_model_config( + teacher_checkpoint_dir, trust_remote_code=trust_remote_code + ) teacher_replacements = None replacement_library_path = raw_metrics["args"].get("replacement_library_path") diff --git a/modelopt/torch/puzzletron/pruning/expert_removal_pruning_mixin.py b/modelopt/torch/puzzletron/pruning/expert_removal_pruning_mixin.py index 96d3489f5e..3c00ca212a 100644 --- 
a/modelopt/torch/puzzletron/pruning/expert_removal_pruning_mixin.py +++ b/modelopt/torch/puzzletron/pruning/expert_removal_pruning_mixin.py @@ -21,7 +21,6 @@ from modelopt.torch.nas.plugins.megatron_hooks.base_hooks import ( ForwardHook, - GptOssRemoveExpertsIndependentHook, NemotronHRemoveExpertsIndependentHook, Qwen3VLRemoveExpertsIndependentHook, RankedChoiceVotingHook, @@ -82,7 +81,6 @@ def supported_hooks(self) -> List[Type[ForwardHook]]: RankedChoiceVotingHookNemotronH, NemotronHRemoveExpertsIndependentHook, Qwen3VLRemoveExpertsIndependentHook, - GptOssRemoveExpertsIndependentHook, ] def prune_single_layer( diff --git a/modelopt/torch/puzzletron/pruning/pruning_ckpts.py b/modelopt/torch/puzzletron/pruning/pruning_ckpts.py index 823f42faf8..b9cfd75faf 100644 --- a/modelopt/torch/puzzletron/pruning/pruning_ckpts.py +++ b/modelopt/torch/puzzletron/pruning/pruning_ckpts.py @@ -95,6 +95,12 @@ def launch_ffn_intermediates_prune_ckpt( def launch_attn_groups_prune_ckpt( cfg: DictConfig, max_save_workers: Optional[int] = None, max_layer_workers: Optional[int] = None ): + descriptor = ModelDescriptorFactory.get(cfg.descriptor) + parent_model_config = load_model_config( + cfg.teacher_dir, trust_remote_code=descriptor.requires_trust_remote_code() + ) + num_attention_heads = parent_model_config.num_attention_heads + for n_heads_in_group in cfg.pruning.n_heads_in_group_list: dirname = f"n_heads_in_group{n_heads_in_group}" @@ -105,7 +111,8 @@ def launch_attn_groups_prune_ckpt( mprint("Process n_heads_in_group {}".format(n_heads_in_group)) mprint(f"=== STARTING ATTENTION PRUNING FOR n_heads_in_group={n_heads_in_group} ===") - model_config_overrides_json = {"attention": [{"n_heads_in_group": n_heads_in_group}]} + num_key_value_heads = num_attention_heads // n_heads_in_group + model_config_overrides_json = {"attention": [{"num_key_value_heads": num_key_value_heads}]} mlp_init_config_yaml = cfg.pruning.mlp_init_config_yaml output_dir = os.path.join(cfg.pruning.pruned_ckpts_output_dir, dirname) @@ -151,7
+158,11 @@ def launch_hidden_dim_prune_ckpt(cfg: DictConfig): ) # Load parent model config to get FFN configuration - parent_model_config = load_model_config(cfg.pruning.model_name_or_path) + descriptor = ModelDescriptorFactory.get(cfg.descriptor) + trust_remote_code = descriptor.requires_trust_remote_code() + parent_model_config = load_model_config( + cfg.pruning.model_name_or_path, trust_remote_code=trust_remote_code + ) parent_hidden_size = parent_model_config.hidden_size # Get teacher's FFN configuration diff --git a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py index 0f5ecd2158..cc81f4f887 100644 --- a/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py +++ b/modelopt/torch/puzzletron/replacement_library/build_replacement_library.py @@ -39,6 +39,10 @@ import pandas as pd from omegaconf import DictConfig +from modelopt.torch.puzzletron.anymodel.model_descriptor import ( + ModelDescriptor, + ModelDescriptorFactory, +) from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import ( AttentionConfig, BlockConfig, @@ -65,6 +69,7 @@ def build_replacement_library( master_puzzle_dir: Path | str, + descriptor: ModelDescriptor, teacher_checkpoint_dir: Path | str | None = None, add_ffn_no_ops: bool = True, add_attention_no_ops: bool = True, @@ -76,20 +81,22 @@ def build_replacement_library( master_puzzle_dir = Path(master_puzzle_dir) (master_puzzle_dir / "ckpts").mkdir(exist_ok=True) teacher_checkpoint_dir = infer_teacher_dir(master_puzzle_dir, teacher_checkpoint_dir) + trust_remote_code = descriptor.requires_trust_remote_code() subblocks_df = _build_subblocks_df( master_puzzle_dir, teacher_checkpoint_dir, add_ffn_no_ops, add_attention_no_ops, + trust_remote_code=trust_remote_code, ) block_library_df = _build_block_library_from_subblocks(subblocks_df) layer_replacements = _build_layer_replacements( - block_library_df, 
master_puzzle_dir, teacher_checkpoint_dir + block_library_df, master_puzzle_dir, teacher_checkpoint_dir, trust_remote_code ) single_sequence_replacement_solutions = _build_single_sequence_replacement_solutions( - layer_replacements, teacher_checkpoint_dir + layer_replacements, teacher_checkpoint_dir, trust_remote_code ) json_dump(block_library_df.to_dict(orient="records"), master_puzzle_dir / "block_library.json") @@ -112,11 +119,13 @@ def launch_build_replacement_library(cfg: DictConfig) -> None: f"Build replacement library config: {format_global_config(cfg.build_replacement_library, title='Build replacement library')}" ) + descriptor = ModelDescriptorFactory.get(cfg.descriptor) build_replacement_library( master_puzzle_dir=cfg.puzzle_dir, teacher_checkpoint_dir=cfg.teacher_dir, add_ffn_no_ops=cfg.build_replacement_library.add_ffn_no_ops, add_attention_no_ops=cfg.build_replacement_library.add_attention_no_ops, + descriptor=descriptor, ) @@ -191,9 +200,12 @@ def _build_subblocks_df( teacher_checkpoint_dir: Path | str, add_ffn_no_ops: bool, add_attention_no_ops: bool, + trust_remote_code: bool = False, ) -> pd.DataFrame: teacher_checkpoint_dir = Path(teacher_checkpoint_dir) - checkpoint_dirs = _get_last_checkpoint_from_each_experiment(master_puzzle_dir) + checkpoint_dirs = _get_last_checkpoint_from_each_experiment( + master_puzzle_dir, trust_remote_code=trust_remote_code + ) checkpoint_dirs = [teacher_checkpoint_dir] + list(checkpoint_dirs - {teacher_checkpoint_dir}) checkpoints_to_split = [teacher_checkpoint_dir] @@ -203,7 +215,7 @@ def _build_subblocks_df( if len(subblocks_to_extract) > 0: subblock_rows_from_current_checkpoint = ( _construct_subblock_rows_from_current_checkpoint( - checkpoint_dir, subblocks_to_extract + checkpoint_dir, subblocks_to_extract, trust_remote_code=trust_remote_code ) ) subblock_rows.extend(subblock_rows_from_current_checkpoint) @@ -303,10 +315,10 @@ def _drop_duplicates_of_decomp_no_op(subblocks_df: pd.DataFrame) -> pd.DataFrame def 
_construct_subblock_rows_from_current_checkpoint( - checkpoint_dir: Path, subblocks_to_extract: list[str] + checkpoint_dir: Path, subblocks_to_extract: list[str], trust_remote_code: bool = False ) -> list[dict[str, Any]]: subblock_rows_from_current_checkpoint = [] - model_config = load_model_config(checkpoint_dir) + model_config = load_model_config(checkpoint_dir, trust_remote_code=trust_remote_code) for block_idx, block_config in enumerate(model_config.block_configs): for subblock_to_extract in subblocks_to_extract: subblock_row = _init_empty_subblock_row(block_idx) @@ -388,7 +400,9 @@ def _get_rows_with_no_op_subblock( return rows_with_no_op_subblock, subblock_cls -def _get_last_checkpoint_from_each_experiment(master_puzzle_dir: Path | str) -> set[Path]: +def _get_last_checkpoint_from_each_experiment( + master_puzzle_dir: Path | str, trust_remote_code: bool = False +) -> set[Path]: master_puzzle_dir = Path(master_puzzle_dir) master_checkpoints_dir = master_puzzle_dir / CHECKPOINTS_DIR_NAME subdirs_of_master_checkpoints_dir = [ @@ -409,7 +423,11 @@ def _get_last_checkpoint_from_each_experiment(master_puzzle_dir: Path | str) -> ) # Filter out non-DeciLM checkpoints (e.g., unconverted Llama checkpoints) - valid_checkpoint_dirs = [cp for cp in checkpoint_dirs if is_valid_decilm_checkpoint(cp)] + valid_checkpoint_dirs = [ + cp + for cp in checkpoint_dirs + if is_valid_decilm_checkpoint(cp, trust_remote_code=trust_remote_code) + ] experiment_dirs = [ p if (p in subdirs_of_master_checkpoints_dir) else p.parent for p in valid_checkpoint_dirs @@ -465,14 +483,15 @@ def _build_layer_replacements( block_library_df: pd.DataFrame, master_puzzle_dir: Path, teacher_checkpoint_dir: Path, + trust_remote_code: bool = False, ) -> list[dict]: layer_replacements_from_blocks = _build_layer_replacements_from_block_library(block_library_df) layer_replacements_from_checkpoints = _gather_layer_replacements_from_checkpoints( - master_puzzle_dir + master_puzzle_dir, 
trust_remote_code=trust_remote_code ) layer_replacements = layer_replacements_from_blocks + layer_replacements_from_checkpoints layer_replacements = _filter_duplicate_teacher_replacements( - layer_replacements, teacher_checkpoint_dir + layer_replacements, teacher_checkpoint_dir, trust_remote_code ) return layer_replacements @@ -502,9 +521,13 @@ def _build_layer_replacements_from_block_library(block_library_df: pd.DataFrame) return layer_replacements -def _gather_layer_replacements_from_checkpoints(master_puzzle_dir: str | Path) -> list[dict]: +def _gather_layer_replacements_from_checkpoints( + master_puzzle_dir: str | Path, trust_remote_code: bool = False +) -> list[dict]: gathered_layer_replacements = [] - checkpoint_dirs = _get_last_checkpoint_from_each_experiment(master_puzzle_dir) + checkpoint_dirs = _get_last_checkpoint_from_each_experiment( + master_puzzle_dir, trust_remote_code=trust_remote_code + ) for checkpoint_dir in checkpoint_dirs: if (layer_replacements_path := checkpoint_dir / "replacement_library.json").exists(): layer_replacements = json.loads(layer_replacements_path.read_text()) @@ -523,8 +546,11 @@ def _gather_layer_replacements_from_checkpoints(master_puzzle_dir: str | Path) - def _filter_duplicate_teacher_replacements( layer_replacements: list[dict], teacher_checkpoint_dir: Path, + trust_remote_code: bool = False, ) -> list[dict]: - teacher_model_config = load_model_config(teacher_checkpoint_dir) + teacher_model_config = load_model_config( + teacher_checkpoint_dir, trust_remote_code=trust_remote_code + ) filtered_layer_replacements = [] for layer_replacement in layer_replacements: if replacement_is_teacher( @@ -537,8 +563,11 @@ def _filter_duplicate_teacher_replacements( def _build_single_sequence_replacement_solutions( layer_replacements: list[dict], teacher_checkpoint_dir: Path, + trust_remote_code: bool = False, ) -> list[dict]: - teacher_model_config = load_model_config(teacher_checkpoint_dir) + teacher_model_config = load_model_config( + 
teacher_checkpoint_dir, trust_remote_code=trust_remote_code + ) n_layer = teacher_model_config.num_hidden_layers teacher_replacements = dict() diff --git a/modelopt/torch/puzzletron/replacement_library/replacement_library.py b/modelopt/torch/puzzletron/replacement_library/replacement_library.py index 7935fea4a0..8a7c2834fd 100644 --- a/modelopt/torch/puzzletron/replacement_library/replacement_library.py +++ b/modelopt/torch/puzzletron/replacement_library/replacement_library.py @@ -123,10 +123,12 @@ def n_layer(self) -> int: @property def model_config(self) -> DeciLMConfig: if self._model_config is None: + trust_remote_code = self.descriptor.requires_trust_remote_code() self._model_config = load_model_config( self.get_arbitrary_checkpoint_dir(), self.model_config_overrides, ignore_unexpected_config_keys=True, + trust_remote_code=trust_remote_code, ) return self._model_config diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index 2db0bc3916..0b8a3e72fe 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -285,7 +285,8 @@ def calculate_subblock_stats_for_puzzle_dir( teacher_dir = ( Path(teacher_dir) if teacher_dir is not None else master_puzzle_dir / "ckpts" / "teacher" ) - model_config = load_model_config(teacher_dir) + trust_remote_code = descriptor.requires_trust_remote_code() + model_config = load_model_config(teacher_dir, trust_remote_code=trust_remote_code) # Get language model config for LM-specific attributes (VL models have nested config) lm_config = descriptor.get_language_model_config(model_config) subblock_configs = _load_subblock_configs(master_puzzle_dir, ffn_hidden_sizes, model_config) diff --git a/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py b/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py index 
36e41c4b6a..ecfb8b857b 100644 --- a/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py +++ b/modelopt/torch/puzzletron/tools/bypassed_training/init_child_from_parent.py @@ -86,7 +86,9 @@ def init_child_from_parent( copy_tokenizer(parent_checkpoint_dir, output_checkpoint_dir) - parent_model_config = load_model_config(parent_checkpoint_dir) + parent_model_config = load_model_config( + parent_checkpoint_dir, trust_remote_code=descriptor.requires_trust_remote_code() + ) parent_state_dict = load_state_dict(parent_checkpoint_dir) # Parse JSON if string @@ -108,6 +110,7 @@ def init_child_from_parent( parent_checkpoint_dir, model_config_overrides=global_config_overrides, ignore_unexpected_config_keys=True, + trust_remote_code=descriptor.requires_trust_remote_code(), ) # Apply block-level overrides if any @@ -126,7 +129,10 @@ def init_child_from_parent( model_class = _get_model_class_from_config(child_model_config) # AutoModelForCausalLM uses from_config(); concrete model classes use _from_config() if model_class is AutoModelForCausalLM: - child_model = model_class.from_config(child_model_config, trust_remote_code=True) + trust_remote_code = descriptor.requires_trust_remote_code() + child_model = model_class.from_config( + child_model_config, trust_remote_code=trust_remote_code + ) else: child_model = model_class._from_config(child_model_config) diff --git a/modelopt/torch/puzzletron/tools/checkpoint_utils.py b/modelopt/torch/puzzletron/tools/checkpoint_utils.py index f08b89e449..20c2fbe2ac 100644 --- a/modelopt/torch/puzzletron/tools/checkpoint_utils.py +++ b/modelopt/torch/puzzletron/tools/checkpoint_utils.py @@ -135,17 +135,20 @@ def skip_init(module_cls, *args, **kwargs) -> nn.Module: return module -def is_valid_decilm_checkpoint(checkpoint_dir: Path | str) -> bool: +def is_valid_decilm_checkpoint(checkpoint_dir: Path | str, trust_remote_code: bool = False) -> bool: """Validate that a checkpoint is in DeciLM format (has block_configs). 
Args: checkpoint_dir: Path to checkpoint directory + trust_remote_code: If True, allows execution of custom code from the model repository. + This is a security risk if the model source is untrusted. Only set to True if you + trust the source of the model. Defaults to False for security. Returns: True if checkpoint is valid DeciLM format, False otherwise """ try: - model_config = load_model_config(checkpoint_dir) + model_config = load_model_config(checkpoint_dir, trust_remote_code=trust_remote_code) if model_config.block_configs is None: warnings.warn( f"Skipping checkpoint '{checkpoint_dir}' - not in DeciLM format (missing block_configs)" diff --git a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py index 3c3b54830a..3647de5e25 100644 --- a/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py +++ b/modelopt/torch/puzzletron/tools/checkpoint_utils_hf.py @@ -73,10 +73,19 @@ def load_checkpoint( checkpoint_dir: Path | str, model_config_overrides: dict | None = None, ignore_unexpected_config_keys: bool = False, + trust_remote_code: bool = False, ) -> DeciLMForCausalLM: """ Unlike AutoModelForCausalLM.from_pretrained, the models loaded by this function use your local repo code, not the code inside the checkpoint. + + Args: + checkpoint_dir: Path to checkpoint directory + model_config_overrides: Optional mapping of config overrides. + ignore_unexpected_config_keys: If True, ignore unexpected config keys. + trust_remote_code: If True, allows execution of custom code from the model repository. + This is a security risk if the model source is untrusted. Only set to True if you + trust the source of the model. Defaults to False for security. 
""" from modelopt.torch.puzzletron.tools.checkpoint_utils import ( load_state_dict, # prevent circular import @@ -86,7 +95,10 @@ def load_checkpoint( checkpoint_dir = Path(checkpoint_dir) model_config = load_model_config( - checkpoint_dir, model_config_overrides, ignore_unexpected_config_keys + checkpoint_dir, + model_config_overrides=model_config_overrides, + ignore_unexpected_config_keys=ignore_unexpected_config_keys, + trust_remote_code=trust_remote_code, ) # Without sparsity we could have done: @@ -221,7 +233,17 @@ def _save_checkpoint( ) -def split_checkpoint_to_subblocks(checkpoint_dir: Path | str) -> None: +def split_checkpoint_to_subblocks( + checkpoint_dir: Path | str, trust_remote_code: bool = False +) -> None: + """Split a checkpoint into subblocks. + + Args: + checkpoint_dir: Path to checkpoint directory + trust_remote_code: If True, allows execution of custom code from the model repository. + This is a security risk if the model source is untrusted. Only set to True if you + trust the source of the model. Defaults to False for security. 
+ """ from modelopt.torch.puzzletron.tools.checkpoint_utils import ( load_state_dict, # prevent circular import ) @@ -229,7 +251,7 @@ def split_checkpoint_to_subblocks(checkpoint_dir: Path | str) -> None: if not isinstance(checkpoint_dir, Path): checkpoint_dir = Path(checkpoint_dir) - model_config = load_model_config(checkpoint_dir) + model_config = load_model_config(checkpoint_dir, trust_remote_code=trust_remote_code) state_dict = load_state_dict(checkpoint_dir) save_subblocks(state_dict, checkpoint_dir) diff --git a/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py b/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py index 1cf02dc931..55926eaaea 100644 --- a/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py +++ b/modelopt/torch/puzzletron/tools/sharded_checkpoint_utils.py @@ -115,7 +115,9 @@ def set_submodule(model: nn.Module, module_name: str, new_submodule: nn.Module) def create_local_shard_(model, owned_block_indexes: set[int], descriptor, runtime): - all_block_indexes = set(range(model.config.num_hidden_layers)) + # Get language model config (handles nested configs like Qwen3-VL's text_config) + lm_config = descriptor.get_language_model_config(model.config) + all_block_indexes = set(range(lm_config.num_hidden_layers)) has_first_block = 0 in owned_block_indexes has_last_block = max(all_block_indexes) in owned_block_indexes @@ -136,13 +138,13 @@ def create_local_shard_(model, owned_block_indexes: set[int], descriptor, runtim set_submodule( model, descriptor.input_embedding_name(), - DummyWTE(model.config.hidden_size, dtype=runtime.dtype), + DummyWTE(lm_config.hidden_size, dtype=runtime.dtype), ) if not has_last_block: set_submodule(model, descriptor.final_norm_name(), nn.Identity()) if not (model.config.tie_word_embeddings and has_first_block): - set_submodule(model, descriptor.output_embedding_name(), DummyLMHead(model.config)) + set_submodule(model, descriptor.output_embedding_name(), DummyLMHead(lm_config)) return model @@ 
-202,11 +204,13 @@ def load_and_shard_model( with runtime.device: if model_config is None: - model_config = load_model_config(checkpoint_path) + trust_remote_code = descriptor.requires_trust_remote_code() + model_config = load_model_config(checkpoint_path, trust_remote_code=trust_remote_code) + num_hidden_layers = descriptor.get_language_model_config(model_config).num_hidden_layers if owned_block_indexes == "auto": owned_block_indexes = set( - np.array_split(np.arange(model_config.num_hidden_layers), runtime.world_size)[ + np.array_split(np.arange(num_hidden_layers), runtime.world_size)[ runtime.global_rank ] ) @@ -250,7 +254,7 @@ def load_and_shard_model( # Re-tie weights after load_state_dict with assign=True, which severs the tie. # Needed on first rank (owns embed_tokens) and last rank (owns lm_head). has_first_block = 0 in owned_block_indexes - has_last_block = (model_config.num_hidden_layers - 1) in owned_block_indexes + has_last_block = (num_hidden_layers - 1) in owned_block_indexes if model_config.tie_word_embeddings and (has_first_block or has_last_block): model_shard.tie_weights() @@ -309,7 +313,8 @@ def create_sharded_model( model_class = _get_model_class_from_config(model_config) # AutoModelForCausalLM uses from_config(); concrete model classes use _from_config() if model_class is AutoModelForCausalLM: - model = model_class.from_config(model_config, trust_remote_code=True) + trust_remote_code = descriptor.requires_trust_remote_code() + model = model_class.from_config(model_config, trust_remote_code=trust_remote_code) else: model = model_class._from_config(model_config) create_local_shard_( diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/ffn_pruning.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/pruning/ffn_pruning.yaml deleted file mode 100644 index f0c852eec9..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/ffn_pruning.yaml +++ /dev/null @@ -1,12 +0,0 @@ -defaults: - - 
pruning_defaults - -activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} - -activation_hooks_kwargs: - method: iterative - target_layer: "mlp.down_proj" - layer_input_descriptors_path: - -intermediate_size_list: [256] # teacher_intermediate_size is 14336 -mlp_init_mode: "PruneByActivationsLog" diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml deleted file mode 100644 index 0a5eafcfff..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml +++ /dev/null @@ -1,32 +0,0 @@ -defaults: - - /validate_model_defaults - -model_name_or_path: ${teacher_dir} -experiment_id: ${pruning.eval_samples}samples_diverse_mini -activations_log_dir: ??? -activation_hooks_kwargs: ??? - -# Data: -eval_samples: 100 -micro_batch_size: 4 -dataset_path: ${dataset_path} -val_dataset_name: train - -# Prune ckpts -pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} - -## FFN pruning -ffn_list: -mlp_init_mode: "Truncate" - -## KV-heads pruning -n_heads_in_group_list: -gqa_init_mode: "AverageKV" - -## Hidden dimension pruning -hidden_size_list: -hidden_size_init_mode: "PruneByChannelRanking" -linear_init_mode: "FromTeacher" - -mlp_init_config_yaml: - activations_log_dir: ${pruning.activations_log_dir} diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/validate_model_defaults.yaml b/tests/_test_utils/torch/puzzletron/resources/configs/validate_model_defaults.yaml deleted file mode 100644 index 1d042d75df..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/configs/validate_model_defaults.yaml +++ /dev/null @@ -1,17 +0,0 @@ -model_dtype: torch.bfloat16 -autocast_dtype: torch.bfloat16 -block_size: 8192 -bos_rate: 0.5 -data_column: conversation -val_dataset_name: train -shuffle_seed: 81436 -seed: 42 -fim_rate: 0 
-fim_spm_rate: 0 -source_datasets_to_discard: -varlen: false -write_results: false -calc_losses_on_cpu: false -activations_log_dir: -model_name_or_path: -load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn} diff --git a/tests/_test_utils/torch/puzzletron/resources/tokenizer/special_tokens_map.json b/tests/_test_utils/torch/puzzletron/resources/tokenizer/special_tokens_map.json deleted file mode 100644 index 02ee80b619..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/tokenizer/special_tokens_map.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "bos_token": { - "content": "<|begin_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - }, - "eos_token": { - "content": "<|eot_id|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } -} diff --git a/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer.json b/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer.json deleted file mode 100644 index 83592e2494..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer.json +++ /dev/null @@ -1,212 +0,0 @@ -{ - "version": "1.0", - "truncation": null, - "padding": null, - "added_tokens": [], - "normalizer": null, - "pre_tokenizer": { - "type": "Sequence", - "pretokenizers": [ - { - "type": "Split", - "pattern": { - "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" - }, - "behavior": "Isolated", - "invert": false - }, - { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": true, - "use_regex": false - } - ] - }, - "post_processor": { - "type": "Sequence", - "processors": [ - { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": false, - "use_regex": true - }, - { - "type": "TemplateProcessing", - "single": [ - { - "SpecialToken": { - "id": "<|begin_of_text|>", - "type_id": 0 
- } - }, - { - "Sequence": { - "id": "A", - "type_id": 0 - } - } - ], - "pair": [ - { - "SpecialToken": { - "id": "<|begin_of_text|>", - "type_id": 0 - } - }, - { - "Sequence": { - "id": "A", - "type_id": 0 - } - }, - { - "SpecialToken": { - "id": "<|begin_of_text|>", - "type_id": 1 - } - }, - { - "Sequence": { - "id": "B", - "type_id": 1 - } - } - ], - "special_tokens": { - "<|begin_of_text|>": { - "id": "<|begin_of_text|>", - "ids": [ - 100 - ], - "tokens": [ - "<|begin_of_text|>" - ] - } - } - } - ] - }, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": true, - "use_regex": true - }, - "model": { - "type": "BPE", - "dropout": null, - "unk_token": null, - "continuing_subword_prefix": null, - "end_of_word_suffix": null, - "fuse_unk": false, - "byte_fallback": false, - "ignore_merges": true, - "vocab": { - "!": 0, - "\"": 1, - "#": 2, - "$": 3, - "%": 4, - "&": 5, - "'": 6, - "(": 7, - ")": 8, - "*": 9, - "+": 10, - ",": 11, - "-": 12, - ".": 13, - "/": 14, - "0": 15, - "1": 16, - "2": 17, - "3": 18, - "4": 19, - "5": 20, - "6": 21, - "7": 22, - "8": 23, - "9": 24, - ":": 25, - ";": 26, - "<": 27, - "=": 28, - ">": 29, - "?": 30, - "@": 31, - "A": 32, - "B": 33, - "C": 34, - "D": 35, - "E": 36, - "F": 37, - "G": 38, - "H": 39, - "I": 40, - "J": 41, - "K": 42, - "L": 43, - "M": 44, - "N": 45, - "O": 46, - "P": 47, - "Q": 48, - "R": 49, - "S": 50, - "T": 51, - "U": 52, - "V": 53, - "W": 54, - "X": 55, - "Y": 56, - "Z": 57, - "[": 58, - "\\": 59, - "]": 60, - "^": 61, - "_": 62, - "`": 63, - "a": 64, - "b": 65, - "c": 66, - "d": 67, - "e": 68, - "f": 69, - "g": 70, - "h": 71, - "i": 72, - "j": 73, - "k": 74, - "l": 75, - "m": 76, - "n": 77, - "o": 78, - "p": 79, - "q": 80, - "r": 81, - "s": 82, - "t": 83, - "u": 84, - "v": 85, - "w": 86, - "x": 87, - "y": 88, - "z": 89, - "{": 90, - "|": 91, - "}": 92, - "~": 93, - "¡": 94, - "¢": 95, - "£": 96, - "¤": 97, - "¥": 98, - "¦": 99, - "<|begin_of_text|>": 100, - "<|eot_id|>": 101 - }, - 
"merges": [] - } -} diff --git a/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer_config.json b/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer_config.json deleted file mode 100644 index 754d9e8db5..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/tokenizer/tokenizer_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "bos_token": "<|begin_of_text|>", - "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", - "clean_up_tokenization_spaces": true, - "eos_token": "<|eot_id|>", - 
"extra_special_tokens": {}, - "model_input_names": [ - "input_ids", - "attention_mask" - ], - "model_max_length": 131072, - "tokenizer_class": "PreTrainedTokenizer" -} diff --git a/tests/_test_utils/torch/puzzletron/resources/tokenizer/truncate_tokenizer.py b/tests/_test_utils/torch/puzzletron/resources/tokenizer/truncate_tokenizer.py deleted file mode 100644 index aedcae4ab2..0000000000 --- a/tests/_test_utils/torch/puzzletron/resources/tokenizer/truncate_tokenizer.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script was used to truncate the tokenizer.json file from Llama 3.1 8B model -to keep only the top 100 most common tokens. 
-""" - -import json - -# Path to your original and new tokenizer.json -in_path = "./tokenizer.json" -out_path = "./tokenizer_truncated.json" - -# How many top tokens to keep -NUM_TO_KEEP = 100 - -with open(in_path, encoding="utf-8") as f: - tokenizer_data = json.load(f) - -# Get and sort the original vocab by index (frequency proxy) -orig_vocab = tokenizer_data["model"]["vocab"] - -# Sort tokens by their original index (lowest index = assumed most common/important) -sorted_tokens = sorted(orig_vocab.items(), key=lambda item: item[1]) - -# Keep the top N tokens -tokens_to_keep = [tok for tok, idx in sorted_tokens[:NUM_TO_KEEP]] - -# Re-index the selected tokens: 0..N-1 -small_vocab = {tok: i for i, tok in enumerate(tokens_to_keep)} -tokenizer_data["model"]["vocab"] = small_vocab - -# Update vocab size -if "vocab_size" in tokenizer_data["model"]: - tokenizer_data["model"]["vocab_size"] = len(small_vocab) - -# Optionally remove merges if present and unneeded (mostly for BPE/WordPiece) -if "merges" in tokenizer_data["model"]: - tokenizer_data["model"]["merges"] = [] - -# Remove added_tokens if not needed -if "added_tokens" in tokenizer_data: - tokenizer_data["added_tokens"] = [] - -# Write out the truncated tokenizer.json -with open(out_path, "w", encoding="utf-8") as f: - json.dump(tokenizer_data, f, indent=2, ensure_ascii=False) - -print(f"Truncated tokenizer saved to: {out_path}") diff --git a/tests/_test_utils/torch/puzzletron/utils.py b/tests/_test_utils/torch/puzzletron/utils.py index 07d1565f42..7615c5d085 100644 --- a/tests/_test_utils/torch/puzzletron/utils.py +++ b/tests/_test_utils/torch/puzzletron/utils.py @@ -24,18 +24,12 @@ import modelopt.torch.utils.distributed as dist from modelopt.torch.puzzletron.tools.hydra_utils import register_hydra_resolvers -# Path to HF configs relative to this file -# HF configs are in tests/gpu/torch/puzzletron/resources/hf_configs -HF_CONFIGS_DIR = ( - Path(__file__).parent.parent.parent.parent / 
"gpu/torch/puzzletron/resources/hf_configs" -) - def setup_test_model_and_data( project_root_path: Path, tmp_path: Path, rank: int, - hf_config_name: str, + hf_model_name: str, hybrid_override_pattern: str | None = None, ) -> tuple[Path, Path, Path]: """ @@ -45,7 +39,7 @@ def setup_test_model_and_data( project_root_path (Path): the root path of the project tmp_path (Path): the temporary path to use for the test rank (int): the rank of the process - hf_config_name (str): Name of the HF config directory (e.g., "llama_3_1_8b_instruct") + hf_model_name (str): HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct") hybrid_override_pattern (str): For NemotronH models, the layer type pattern Returns: @@ -56,10 +50,8 @@ def setup_test_model_and_data( # Register Hydra custom resolvers (needed for config resolution) register_hydra_resolvers() - # The inputs for the nas.convert() step. - # - puzzle_dir = tmp_path / hf_config_name - hf_checkpoint_path = puzzle_dir / f"hf_models/{hf_config_name}" + puzzle_dir = tmp_path / hf_model_name + hf_checkpoint_path = puzzle_dir / f"hf_models/{hf_model_name}" dataset_path = puzzle_dir / "dummy_dataset" if rank == 0: @@ -73,7 +65,7 @@ def setup_test_model_and_data( output_path=str(hf_checkpoint_path), vocab_size=tokenizer.vocab_size, tokenizer=tokenizer, - hf_config_name=hf_config_name, + hf_model_name=hf_model_name, hybrid_override_pattern=hybrid_override_pattern, ) dist.barrier() @@ -89,7 +81,7 @@ def create_and_save_small_hf_model( output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase, - hf_config_name: str, + hf_model_name: str, hybrid_override_pattern: str | None = None, ): """ @@ -101,23 +93,21 @@ def create_and_save_small_hf_model( output_path: Where to save the model vocab_size: Vocabulary size (should match tokenizer) tokenizer: Tokenizer to save alongside the model - hf_config_name: Name of the config directory under resources/hf_configs/ - e.g., "llama_3_1_8b_instruct", "llama_3_2_3b_instruct", or 
"qwen2_5_7b_instruct" + hf_model_name: HuggingFace model card name (e.g., "meta-llama/Llama-3.1-8B-Instruct") hybrid_override_pattern: For NemotronH models, the layer type pattern (e.g., "*-" for Attention+MLP, "M-" for Mamba+MLP). Must match num_hidden_layers. None for non-NemotronH models. """ os.makedirs(output_path, exist_ok=True) # Load real HuggingFace config (preserves tie_word_embeddings, rope_scaling, etc.) - config_path = HF_CONFIGS_DIR / hf_config_name - config = AutoConfig.from_pretrained(config_path, local_files_only=True, trust_remote_code=True) + config = AutoConfig.from_pretrained(hf_model_name, trust_remote_code=True) # Override size-related params to make it small for testing # Note: intermediate_size must be divisible by 256 per DeciLM config requirements # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility # VL models have nested configs (text_config, vision_config) - if hf_config_name == "qwen3-vl-30b-a3b-instruct": + if hasattr(config, "text_config") and hasattr(config, "vision_config"): config.text_config.vocab_size = vocab_size config.text_config.hidden_size = 256 config.text_config.intermediate_size = 512 @@ -160,14 +150,34 @@ def create_and_save_small_hf_model( torch.manual_seed(42) # Create and save the model + # Force CPU initialization for deterministic behavior (prevents NaN on RTX GPUs) + original_cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") + os.environ["CUDA_VISIBLE_DEVICES"] = "" # TODO: Consider using AutoModel.from_config instead. 
- if hf_config_name == "qwen3-vl-30b-a3b-instruct": + if hasattr(config, "text_config") and hasattr(config, "vision_config"): from transformers import Qwen3VLMoeForConditionalGeneration model = Qwen3VLMoeForConditionalGeneration._from_config(config) else: model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) + # Initialize weights to ensure all parameters are properly initialized + # This prevents NaN values in uninitialized parameters (e.g., backbone.layers.1.mixer.gate.weight + # in nemotron-3-nano-30b-a3b-base-bf16) that can occur with from_config on RTX GPU cards (not on H100) + model.initialize_weights() + + # Fix any remaining NaN/Inf values that initialize_weights() might have missed + for name, param in model.named_parameters(): + if torch.isnan(param).any() or torch.isinf(param).any(): + nan_inf_mask = torch.isnan(param) | torch.isinf(param) + param.data = torch.where(nan_inf_mask, torch.zeros_like(param), param) + + # Restore CUDA_VISIBLE_DEVICES after model creation and initialization + if original_cuda_visible is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = original_cuda_visible + else: + os.environ.pop("CUDA_VISIBLE_DEVICES", None) + model.to(dtype=torch.bfloat16).save_pretrained(output_path) # Save tokenizer diff --git a/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py index e2373676d2..8a5bad0c62 100644 --- a/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py +++ b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_convert.py @@ -18,7 +18,6 @@ from functools import partial from pathlib import Path -import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.puzzletron.utils import setup_test_model_and_data @@ -28,7 +27,6 @@ from modelopt.torch.puzzletron.nas.plugins.puzzletron_nas_plugin import PuzzletronModel -@pytest.mark.skip(reason="Temporarily disabled") def 
test_nas_convert_ffn_pruning(project_root_path: Path, tmp_path: Path): spawn_multiprocess_job( size=torch.cuda.device_count(), @@ -43,12 +41,10 @@ def _test_nas_convert_ffn_pruning_multiprocess_job( dist.setup(timeout=timedelta(10)) # Setup the test model and data. puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( - project_root_path, tmp_path, rank, "llama_3_1_8b_instruct" + project_root_path, tmp_path, rank, "meta-llama/Llama-3.1-8B-Instruct" ) - hydra_config_dir = ( - project_root_path / "tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct" - ) - hydra_config_name = "llama_3_1_8b_instruct" + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + hydra_config_name = "meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct" # # Run the mnt.convert() step @@ -87,7 +83,6 @@ def _test_nas_convert_ffn_pruning_multiprocess_job( dist.cleanup() -@pytest.mark.skip(reason="Temporarily disabled") def test_nas_convert_attn_pruning(project_root_path: Path, tmp_path: Path): spawn_multiprocess_job( size=torch.cuda.device_count(), @@ -102,12 +97,10 @@ def _test_nas_convert_attn_pruning_multiprocess_job( dist.setup(timeout=timedelta(10)) # Setup the test model and data. 
puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( - project_root_path, tmp_path, rank, "llama_3_1_8b_instruct" - ) - hydra_config_dir = ( - project_root_path / "tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct" + project_root_path, tmp_path, rank, "meta-llama/Llama-3.1-8B-Instruct" ) - hydra_config_name = "llama_3_1_8b_instruct-attn-pruning" + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + hydra_config_name = "meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct-attn-pruning" # # Run the mnt.convert() step diff --git a/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py index e39f1e1cbc..2af371e5ca 100644 --- a/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py +++ b/tests/gpu/torch/puzzletron/nas/plugins/test_nas_search.py @@ -17,7 +17,6 @@ from functools import partial from pathlib import Path -import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.puzzletron.utils import setup_test_model_and_data @@ -27,7 +26,6 @@ from modelopt.torch.puzzletron.nas.plugins.puzzletron_nas_plugin import PuzzletronModel -@pytest.mark.skip(reason="Temporarily disabled") def test_nas_search(project_root_path: Path, tmp_path: Path): spawn_multiprocess_job( size=torch.cuda.device_count(), @@ -42,12 +40,10 @@ def _test_nas_search_multiprocess_job( dist.setup(timeout=timedelta(10)) # Setup the test model and data. 
puzzle_dir, llama_checkpoint_path, dataset_path = setup_test_model_and_data( - project_root_path, tmp_path, rank, "llama_3_1_8b_instruct" + project_root_path, tmp_path, rank, "meta-llama/Llama-3.1-8B-Instruct" ) - hydra_config_dir = ( - project_root_path / "tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct" - ) - hydra_config_name = "llama_3_1_8b_instruct" + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + hydra_config_name = "meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct" # # Run the mnt.convert() step diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-ffn-pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct.yaml similarity index 76% rename from tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-ffn-pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct.yaml index 8af352660b..2843f0b97a 100644 --- a/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-ffn-pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/Qwen2.5-7B-Instruct.yaml @@ -1,18 +1,19 @@ +# @package _global_ defaults: - - pruning: ffn_pruning - - scoring: ../validate_solutions_defaults - - realize_model: ../validate_solutions_defaults - - bypass: - - override hydra/hydra_logging: disabled + - /Qwen/Qwen2.5-7B-Instruct/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model - _self_ puzzle_dir: ??? teacher_dir: ${puzzle_dir}/ckpts/teacher/ replacement_library_path: ${puzzle_dir}/replacement_library.json -dataset_path: ??? # path to v0.4_mini +dataset_path: ??? 
# path to v0.4_mini skip_realize_model: false +descriptor: qwen2 + build_replacement_library: add_ffn_no_ops: true add_attention_no_ops: true @@ -21,15 +22,17 @@ calc_subblock_stats: batch_sizes: [64, 96, 128] prefill_seq_len: 4096 generation_seq_len: 4096 - num_active_tokens_override: # Optional override for sequence lengths + num_active_tokens_override: # Optional override for sequence lengths prefill_queue_size: 0 allocate_prefill_query: false - benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" scoring: + descriptor: ${descriptor} + solutions_to_validate: skip_existing_solutions: true @@ -54,6 +57,8 @@ mip: # puzzle_profile: objective: metrics.cosine_embedding_loss_hidden_states bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 subblock_stats_args: - batch_size: 96 @@ -77,18 +82,23 @@ mip: target_memory: 780_000 # 78_000 mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true metric_overrides: + constrain_search_func: max_seconds_per_solution: 60 realize_model: + descriptor: ${descriptor} + teacher_dir: ${to_path:${teacher_dir}} tokenizer_name: ${to_path:${teacher_dir}} replacement_library_path: ${replacement_library_path} save_models: true - solutions_path: # Filled dynamically + solutions_path: # Filled dynamically # Validate params - skip_validation: false # To enable validation of the model solution set `skip_validation` as False + skip_validation: false # To enable validation of the model solution set `skip_validation` as False eval_samples: 2 micro_batch_size: 1 dataset_path: ${dataset_path}/valid diff --git a/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/pruning/ffn_pruning.yaml 
b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/pruning/ffn_pruning.yaml new file mode 100644 index 0000000000..cf6201080c --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen2.5-7B-Instruct/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.qwen2.qwen2_model_descriptor.Qwen2FFNIntermediateLayerDescriptor diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-attn-pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/Qwen3-8B.yaml similarity index 76% rename from tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-attn-pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/Qwen3-8B.yaml index 473a5d418d..cd82a47271 100644 --- a/tests/_test_utils/torch/puzzletron/resources/configs/Llama-3_1-8B-attn-pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/Qwen3-8B.yaml @@ -1,18 +1,19 @@ +# @package _global_ defaults: - - pruning: attn_pruning - - scoring: ../validate_solutions_defaults - - realize_model: ../validate_solutions_defaults - - bypass: - - override hydra/hydra_logging: disabled + - /Qwen/Qwen3-8B/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model - _self_ puzzle_dir: ??? teacher_dir: ${puzzle_dir}/ckpts/teacher/ replacement_library_path: ${puzzle_dir}/replacement_library.json -dataset_path: ??? # path to v0.4_mini +dataset_path: ??? 
# path to v0.4_mini skip_realize_model: false +descriptor: qwen3 + build_replacement_library: add_ffn_no_ops: true add_attention_no_ops: true @@ -21,15 +22,16 @@ calc_subblock_stats: batch_sizes: [64, 96, 128] prefill_seq_len: 4096 generation_seq_len: 4096 - num_active_tokens_override: # Optional override for sequence lengths + num_active_tokens_override: # Optional override for sequence lengths prefill_queue_size: 0 - allocate_prefill_query: false - benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" scoring: + descriptor: ${descriptor} + solutions_to_validate: skip_existing_solutions: true @@ -54,6 +56,8 @@ mip: # puzzle_profile: objective: metrics.cosine_embedding_loss_hidden_states bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 subblock_stats_args: - batch_size: 96 @@ -77,18 +81,23 @@ mip: target_memory: 780_000 # 78_000 mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true metric_overrides: + constrain_search_func: max_seconds_per_solution: 60 realize_model: + descriptor: ${descriptor} + teacher_dir: ${to_path:${teacher_dir}} tokenizer_name: ${to_path:${teacher_dir}} replacement_library_path: ${replacement_library_path} save_models: true - solutions_path: # Filled dynamically + solutions_path: # Filled dynamically # Validate params - skip_validation: false # To enable validation of the model solution set `skip_validation` as False + skip_validation: false # To enable validation of the model solution set `skip_validation` as False eval_samples: 2 micro_batch_size: 1 dataset_path: ${dataset_path}/valid diff --git a/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/pruning/ffn_pruning.yaml new file 
mode 100644 index 0000000000..e6e6ce5bb4 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-8B/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.qwen3_8b.qwen3_8b_model_descriptor.Qwen3_8BFFNIntermediateLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/Qwen3-VL-30B-A3B-Instruct.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/Qwen3-VL-30B-A3B-Instruct.yaml new file mode 100644 index 0000000000..00b21ea979 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/Qwen3-VL-30B-A3B-Instruct.yaml @@ -0,0 +1,113 @@ +# @package _global_ +defaults: + - /Qwen/Qwen3-VL-30B-A3B-Instruct/pruning@pruning: expert_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to v0.4_mini + +skip_realize_model: false + +descriptor: qwen3_vl + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + descriptor: ${descriptor} + + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + - stats.num_local_experts + + human_constraints: + + mip_constraints: + - stats.num_local_experts: 1472 # 
same constraint as nemotron-3-nano for test consistency + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/pruning/expert_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/pruning/expert_pruning.yaml new file mode 100644 index 0000000000..81c5f35ba5 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/Qwen/Qwen3-VL-30B-A3B-Instruct/pruning/expert_pruning.yaml @@ -0,0 +1,20 @@ +defaults: + - /pruning/pruning_defaults@_here_ + +eval_samples: 10 +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/expert_removal/${pruning.experiment_id} +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin.ExpertRemovalPruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.qwen3_vl_30b_a3b_instruct.qwen3_vl_30b_a3b_instruct_model_descriptor.Qwen3VL30BA3BInstructExpertRemovalLayerDescriptor + target_name: "mlp" + +hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.Qwen3VLRemoveExpertsIndependentHook} +activation_hooks_kwargs: + +# num_experts_to_keep must be >= num_experts_per_tok (can't route to more experts than exist) 
+num_experts_to_keep_list: [8] # num_experts in test model is 16, num_experts_per_tok is 8 +mlp_init_mode: "ExpertRemoval" +mlp_init_config_yaml: + expert_scores_key: "expert_ranks_mse" + layer_prefix_template: "model.language_model.layers.{layer_idx}.mlp" diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/attn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/attn_pruning.yaml deleted file mode 100644 index 01886607e4..0000000000 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/attn_pruning.yaml +++ /dev/null @@ -1,16 +0,0 @@ -defaults: - - pruning_defaults - -activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} - -activation_hooks_kwargs: - method: independent_kv_head_contribution - optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory - target_layer: "self_attn.o_proj" - layer_input_descriptors_path: - -# n_heads_in_group: 4 -# num_attention_heads: 32 # num query heads -# num_kv_heads: 32 / 4 = 8 # num_query_heads // n_heads_in_group -n_heads_in_group_list: [8, 16, 32] # num_kv_heads = [4, 2, 1] -gqa_init_mode: "PruneKVHeads" diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/hidden_dim_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/hidden_dim_pruning.yaml deleted file mode 100644 index 407c835d8c..0000000000 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/hidden_dim_pruning.yaml +++ /dev/null @@ -1,15 +0,0 @@ -defaults: - - pruning_defaults - -activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} - -activation_hooks_kwargs: - method: layer_norm_contribution - target_layer: "layernorm" - -# Hidden dimension pruning specific settings 
-hidden_size_list: [3072, 2048] # Target hidden sizes to prune to -hidden_size_init_mode: "PruneByChannelRanking" -mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher -gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher -linear_init_mode: "FromTeacher" diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_solutions_defaults.yaml b/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_solutions_defaults.yaml deleted file mode 100644 index ec13902379..0000000000 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_solutions_defaults.yaml +++ /dev/null @@ -1,10 +0,0 @@ -defaults: - - /validate_model_defaults - - _self_ - -solutions_to_validate: -skip_validation: false -save_models: false -bigger_is_better: false -sort_solutions_by: -calculate_full_score_ablations: false diff --git a/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct-attn-pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct-attn-pruning.yaml new file mode 100644 index 0000000000..57051431a1 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct-attn-pruning.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /meta-llama/Llama-3.1-8B-Instruct/pruning@pruning: attn_pruning + - _self_ + +descriptor: llama + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +dataset_path: ??? 
diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct-attn-pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct.yaml similarity index 94% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct-attn-pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct.yaml index 02c73aca69..8e2e0786b3 100644 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct-attn-pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/Llama-3.1-8B-Instruct.yaml @@ -1,9 +1,8 @@ +# @package _global_ defaults: - - pruning: attn_pruning - - scoring: ../validate_solutions_defaults - - realize_model: ../validate_solutions_defaults - - bypass: - - override hydra/hydra_logging: disabled + - /meta-llama/Llama-3.1-8B-Instruct/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model - _self_ descriptor: llama diff --git a/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/attn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/attn_pruning.yaml new file mode 100644 index 0000000000..6e8af1f651 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/attn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/attn_pruning@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaKVHeadsLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/ffn_pruning.yaml new file mode 
100644 index 0000000000..b30f4a17d9 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.1-8B-Instruct/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/Llama-3.2-3B-Instruct.yaml similarity index 94% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct.yaml rename to tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/Llama-3.2-3B-Instruct.yaml index 65ca64ef4e..78cb6bd73c 100644 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/llama_3_1_8b_instruct.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/Llama-3.2-3B-Instruct.yaml @@ -1,9 +1,8 @@ +# @package _global_ defaults: - - pruning: ffn_pruning - - scoring: ../validate_solutions_defaults - - realize_model: ../validate_solutions_defaults - - bypass: - - override hydra/hydra_logging: disabled + - /meta-llama/Llama-3.2-3B-Instruct/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model - _self_ descriptor: llama diff --git a/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/pruning/ffn_pruning.yaml new file mode 100644 index 0000000000..b30f4a17d9 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/meta-llama/Llama-3.2-3B-Instruct/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + 
layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/Mistral-Small-24B-Instruct-2501.yaml b/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/Mistral-Small-24B-Instruct-2501.yaml new file mode 100644 index 0000000000..e042c4bb62 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/Mistral-Small-24B-Instruct-2501.yaml @@ -0,0 +1,112 @@ +# @package _global_ +defaults: + - /mistralai/Mistral-Small-24B-Instruct-2501/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? # path to v0.4_mini + +skip_realize_model: false + +descriptor: mistral_small + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + descriptor: ${descriptor} + + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + 
shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + human_constraints: + target_memory: 780_000 # 78_000 + + mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/pruning/ffn_pruning.yaml new file mode 100644 index 
0000000000..37c21fd638 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/mistralai/Mistral-Small-24B-Instruct-2501/pruning/ffn_pruning.yaml @@ -0,0 +1,7 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.mistral_small.mistral_small_model_descriptor.MistralFFNIntermediateLayerDescriptor diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16.yaml new file mode 100644 index 0000000000..ab2b09e679 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16.yaml @@ -0,0 +1,115 @@ +# @package _global_ +defaults: + - /nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning@pruning: expert_pruning + - /validate_solutions_defaults@scoring + - /validate_solutions_defaults@realize_model + - _self_ + + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to v0.4_mini + +skip_realize_model: false + +descriptor: nemotron_h + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + runtime_stats: + backend: trt_torch + +scoring: + descriptor: ${descriptor} + + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + seed: 42 + shuffle_seed: 444 + dataset_path: ${dataset_path}/valid + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + - stats.num_local_experts + + 
human_constraints: + mip_constraints: + - stats.num_local_experts: 1472 # teacher has: 23 moe-blocks * 128 experts = 2944 total experts use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + seed: 42 + shuffle_seed: 444 + dataset_path: ${dataset_path}/valid + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/expert_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/expert_pruning.yaml new file mode 100644 index 0000000000..4c2335becf --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/expert_pruning.yaml @@ -0,0 +1,18 @@ +defaults: + - /pruning/pruning_defaults@_here_ + +eval_samples: 10 +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/expert_removal/${pruning.experiment_id} +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.expert_removal_pruning_mixin.ExpertRemovalPruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.nemotron_h.nemotron_h_model_descriptor.NemotronHExpertRemovalLayerDescriptor + target_name: "mixer" + +hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.NemotronHRemoveExpertsIndependentHook} +activation_hooks_kwargs: # Additional 
kwargs to pass to the hook init + +num_experts_to_keep_list: [96, 64, 32, 16, 8] # num_experts in teacher is 128 +mlp_init_mode: "ExpertRemoval" +mlp_init_config_yaml: + expert_scores_key: "expert_ranks_mse" diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/ffn_pruning.yaml new file mode 100644 index 0000000000..cb1147d86b --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16/pruning/ffn_pruning.yaml @@ -0,0 +1,14 @@ +defaults: + - /pruning/pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn/${pruning.experiment_id} +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin.FFNIntermediatePruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor + +hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.IterativeChannelContributionHook} +activation_hooks_kwargs: # Additional kwargs to pass to the hook init + +intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 +mlp_init_mode: "PruneByActivationsLog" diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/NVIDIA-Nemotron-Nano-12B-v2.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/NVIDIA-Nemotron-Nano-12B-v2.yaml new file mode 100644 index 0000000000..906b7338d8 --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/NVIDIA-Nemotron-Nano-12B-v2.yaml @@ -0,0 +1,113 @@ +# @package _global_ +defaults: + - /nvidia/NVIDIA-Nemotron-Nano-12B-v2/pruning@pruning: ffn_pruning + - /validate_solutions_defaults@scoring + - 
/validate_solutions_defaults@realize_model + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? # path to v0.4_mini + +skip_realize_model: false + +descriptor: nemotron_h_v2 + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + descriptor: ${descriptor} + + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - 
stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + human_constraints: + target_memory: 780_000 # 78_000 + + mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/pruning/ffn_pruning.yaml new file mode 100644 index 0000000000..f68068c3ac --- /dev/null +++ b/tests/gpu/torch/puzzletron/resources/configs/nvidia/NVIDIA-Nemotron-Nano-12B-v2/pruning/ffn_pruning.yaml @@ -0,0 +1,12 @@ +defaults: + - /pruning/ffn_pruning_base@_here_ + - _self_ + +pruning_mixin: + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.nemotron_h_v2.nemotron_h_v2_model_descriptor.NemotronHV2FFNIntermediateLayerDescriptor + +activation_hooks_kwargs: + method: iterative + target_layer: "mixer.down_proj" + layer_input_descriptors_path: diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml similarity index 67% rename from 
tests/_test_utils/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml index 01886607e4..7306b6e379 100644 --- a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/pruning/attn_pruning.yaml @@ -1,8 +1,15 @@ defaults: - - pruning_defaults + - /pruning/pruning_defaults@_here_ + - _self_ activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin.KVHeadsPruningMixIn + layer_descriptor: + _target_: ??? + +hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.IndependentKvHeadContributionHook} activation_hooks_kwargs: method: independent_kv_head_contribution optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/ffn_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/pruning/ffn_pruning_base.yaml similarity index 72% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/ffn_pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/pruning/ffn_pruning_base.yaml index cad6fcf3ee..7e19afbbce 100644 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/ffn_pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/pruning/ffn_pruning_base.yaml @@ -1,12 +1,13 @@ defaults: - - pruning_defaults + - /pruning/pruning_defaults@_here_ + - _self_ activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} pruning_mixin: _target_: modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin.FFNIntermediatePruningMixIn layer_descriptor: - _target_: 
modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor + _target_: ??? hook_class: ${get_object:modelopt.torch.nas.plugins.megatron_hooks.base_hooks.IterativeChannelContributionHook} activation_hooks_kwargs: @@ -14,5 +15,5 @@ activation_hooks_kwargs: target_layer: "mlp.down_proj" layer_input_descriptors_path: -intermediate_size_list: [256] # teacher_intermediate_size is 14336 +intermediate_size_list: [256] mlp_init_mode: "PruneByActivationsLog" diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml b/tests/gpu/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml similarity index 93% rename from tests/_test_utils/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml rename to tests/gpu/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml index 407c835d8c..4033fedf3a 100644 --- a/tests/_test_utils/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/pruning/hidden_dim_pruning.yaml @@ -1,5 +1,5 @@ defaults: - - pruning_defaults + - /pruning/pruning_defaults@_here_ activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/pruning_defaults.yaml b/tests/gpu/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml similarity index 94% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/pruning_defaults.yaml rename to tests/gpu/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml index b24ea1b7cc..f00a86da66 100644 --- a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/pruning/pruning_defaults.yaml +++ b/tests/gpu/torch/puzzletron/resources/configs/pruning/pruning_defaults.yaml @@ -1,12 +1,13 @@ defaults: - - /validate_model_defaults + - 
/validate_model_defaults@_here_ -descriptor: ${descriptor} model_name_or_path: ${teacher_dir} experiment_id: ${pruning.eval_samples}samples_diverse_mini activations_log_dir: ??? activation_hooks_kwargs: ??? +descriptor: ${descriptor} + # Data: eval_samples: 100 micro_batch_size: 4 diff --git a/tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_model_defaults.yaml b/tests/gpu/torch/puzzletron/resources/configs/validate_model_defaults.yaml similarity index 100% rename from tests/gpu/torch/puzzletron/resources/configs/llama_3_1_8b_instruct/validate_model_defaults.yaml rename to tests/gpu/torch/puzzletron/resources/configs/validate_model_defaults.yaml diff --git a/tests/_test_utils/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml b/tests/gpu/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml similarity index 100% rename from tests/_test_utils/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml rename to tests/gpu/torch/puzzletron/resources/configs/validate_solutions_defaults.yaml diff --git a/tests/gpu/torch/puzzletron/resources/hf_configs/llama_3_1_8b_instruct/config.json b/tests/gpu/torch/puzzletron/resources/hf_configs/llama_3_1_8b_instruct/config.json deleted file mode 100644 index 0bb6fd75b3..0000000000 --- a/tests/gpu/torch/puzzletron/resources/hf_configs/llama_3_1_8b_instruct/config.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": [ - 128001, - 128008, - 128009 - ], - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 131072, - "mlp_bias": false, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": { - "factor": 8.0, - "low_freq_factor": 1.0, - 
"high_freq_factor": 4.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - }, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.42.3", - "use_cache": true, - "vocab_size": 128256 -} diff --git a/tests/gpu/torch/puzzletron/test_puzzletron.py b/tests/gpu/torch/puzzletron/test_puzzletron.py index a42a716547..cf600558e5 100644 --- a/tests/gpu/torch/puzzletron/test_puzzletron.py +++ b/tests/gpu/torch/puzzletron/test_puzzletron.py @@ -21,6 +21,7 @@ import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job +from _test_utils.torch.misc import set_seed from _test_utils.torch.puzzletron.utils import setup_test_model_and_data import modelopt.torch.utils.distributed as dist @@ -31,46 +32,30 @@ # using a one-click command. # # Note: Bypass is disabled now in the test. +# + +SEED = 1234 @pytest.mark.parametrize( - ( - "hf_config_name", - "converter", - "hydra_config_subdir", - "hybrid_override_pattern", - "has_moe_layers", - ), + ("hf_model_name", "converter", "hybrid_override_pattern", "has_moe_layers"), [ - ("llama_3_1_8b_instruct", "llama", "llama_3_1_8b_instruct", None, False), - # ("llama_3_2_3b_instruct", "llama", "llama_3_1_8b_instruct", None, False), - # ("qwen2_5_7b_instruct", "qwen2", "qwen2_5_7b_instruct", None, False), - # ( - # "mistral-small-24b-instruct-2501", - # "mistral_small", - # "mistral-small-24b-instruct-2501", - # None, - # False, - # ), - # ("qwen3-8b", "qwen3", "qwen3-8b", None, False), - # ("qwen3-vl-30b-a3b-instruct", "qwen3_vl", "qwen3-vl-30b-a3b-instruct", None, True), - # ("nemotron-nano-12b-v2", "nemotron_h_v2", "nemotron-nano-12b-v2", "*-", False), - # ( - # "nemotron-3-nano-30b-a3b-base-bf16", - # "nemotron_h", - # "nemotron-3-nano-30b-a3b-base-bf16", - # "*E", - # True, - # ), - # ("gpt-oss-20b", "gpt_oss_20b", "gpt-oss-20b", None, True), + ("meta-llama/Llama-3.1-8B-Instruct", "llama", None, False), + 
("meta-llama/Llama-3.2-3B-Instruct", "llama", None, False), + ("mistralai/Mistral-Small-24B-Instruct-2501", "mistral_small", None, False), + ("nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16", "nemotron_h", "*E", True), + ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", "nemotron_h_v2", "*-", False), + # ("openai/gpt-oss-20b", "gpt_oss", None, True), + ("Qwen/Qwen2.5-7B-Instruct", "qwen2", None, False), + ("Qwen/Qwen3-8B", "qwen3", None, False), + ("Qwen/Qwen3-VL-30B-A3B-Instruct", "qwen3_vl", None, True), ], ) def test_puzzletron( project_root_path: Path, tmp_path: Path, - hf_config_name: str, + hf_model_name: str, converter: str, - hydra_config_subdir: str, hybrid_override_pattern: str, has_moe_layers: bool, ): @@ -80,9 +65,8 @@ def test_puzzletron( _test_puzzletron_multiprocess_job, project_root_path, tmp_path, - hf_config_name, + hf_model_name, converter, - hydra_config_subdir, hybrid_override_pattern, has_moe_layers, ), @@ -93,23 +77,25 @@ def test_puzzletron( def _test_puzzletron_multiprocess_job( project_root_path: Path, tmp_path: Path, - hf_config_name: str, + hf_model_name: str, converter: str, - hydra_config_subdir: str, hybrid_override_pattern: str, has_moe_layers: bool, rank: int, size: int, ): + # Set seed BEFORE dist.setup() to ensure reproducibility across all processes + set_seed(SEED) + dist.setup(timeout=timedelta(10)) # Setup the test model and data. puzzle_dir, hf_checkpoint_path, dataset_path = setup_test_model_and_data( - project_root_path, tmp_path, rank, hf_config_name, hybrid_override_pattern - ) - hydra_config_dir = ( - project_root_path / f"tests/gpu/torch/puzzletron/resources/configs/{hydra_config_subdir}" + project_root_path, tmp_path, rank, hf_model_name, hybrid_override_pattern ) + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + model_basename = hf_model_name.split("/")[1] + hydra_config_name = f"{hf_model_name}/{model_basename}" # Convert the model using AnyModel converter. 
if rank == 0: @@ -122,7 +108,7 @@ def _test_puzzletron_multiprocess_job( # Compress the model using a one-click approach puzzletron.puzzletron( - str(hydra_config_dir), hydra_config_subdir, str(puzzle_dir), str(dataset_path) + str(hydra_config_dir), hydra_config_name, str(puzzle_dir), str(dataset_path) ) # @@ -159,16 +145,16 @@ def _test_puzzletron_multiprocess_job( assert (solution_dir / "solutions.json").exists() # Validate lm_loss - _assert_lm_loss(puzzle_dir, hf_config_name) + _assert_lm_loss(puzzle_dir, hf_model_name, tolerance=0.01) else: # assertions for the score_pruning_activations step 1 (FFN pruning) - _assert_score_pruning_activations(puzzle_dir, hf_config_name) + _assert_score_pruning_activations(puzzle_dir, hf_model_name) # assertions for the pruning_ckpts step 2 assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() # assertions for the mip_and_realize_models step 6 - _assert_mip_solutions(puzzle_dir, hf_config_name) + _assert_mip_solutions(puzzle_dir, hf_model_name) # assertions for the build_library_and_stats step 4 assert (puzzle_dir / "replacement_library.json").is_file() @@ -183,7 +169,7 @@ def _test_puzzletron_multiprocess_job( dist.cleanup() print( - f"PYTEST SUMMARY: test_puzzletron({hf_config_name}) test has finished successfully. " + f"PYTEST SUMMARY: test_puzzletron({hf_model_name}) test has finished successfully. 
" f"Puzzle directory: {puzzle_dir}" ) @@ -191,52 +177,50 @@ def _test_puzzletron_multiprocess_job( # Expected pruning activation values per model # Each model has a list of (score, channels) tuples for each FFN layer EXPECTED_PRUNING_VALUES = { - "llama_3_1_8b_instruct": [ + "meta-llama/Llama-3.1-8B-Instruct": [ {"score": 73, "channels": 95}, {"score": 440, "channels": 174}, ], - "llama_3_2_3b_instruct": [ + "meta-llama/Llama-3.2-3B-Instruct": [ {"score": 79, "channels": 95}, {"score": 428, "channels": 174}, ], - "qwen2_5_7b_instruct": [ - {"score": 96, "channels": 433}, - {"score": 485, "channels": 105}, - ], - # Mistral Small 24B - "mistral-small-24b-instruct-2501": [ + "mistralai/Mistral-Small-24B-Instruct-2501": [ {"score": 73, "channels": 95}, {"score": 431, "channels": 174}, ], - # Qwen3 8B - "qwen3-8b": [ - {"score": 208, "channels": 51}, - {"score": 475, "channels": 266}, - ], # NemotronH with pattern "*-" has only 1 FFN layer (the "-" layer) - "nemotron-nano-12b-v2": [ + "nvidia/NVIDIA-Nemotron-Nano-12B-v2": [ {"score": 70, "channels": 509}, ], - # Note: nemotron-3-nano-30b-a3b-base-bf16 uses MoE expert pruning, not FFN pruning - # so it doesn't have EXPECTED_PRUNING_VALUES + # nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 uses MoE expert pruning, not FFN pruning + "Qwen/Qwen2.5-7B-Instruct": [ + {"score": 96, "channels": 433}, + {"score": 485, "channels": 105}, + ], + "Qwen/Qwen3-8B": [ + {"score": 208, "channels": 51}, + {"score": 475, "channels": 266}, + ], } # Expected lm_loss values per model EXPECTED_LM_LOSS = { - "llama_3_1_8b_instruct": 4.706878662109375, - "llama_3_2_3b_instruct": 4.816886901855469, - "qwen2_5_7b_instruct": 4.778186798095703, - "nemotron-nano-12b-v2": 4.79390811920166, - "mistral-small-24b-instruct-2501": 4.709150314331055, - "qwen3-8b": 4.733874320983887, - "gpt-oss-20b": 4.689250946044922, - "nemotron-3-nano-30b-a3b-base-bf16": 4.741103172302246, - "qwen3-vl-30b-a3b-instruct": 4.65625, + "meta-llama/Llama-3.1-8B-Instruct": 
4.706878662109375, + "meta-llama/Llama-3.2-3B-Instruct": 4.816886901855469, + "mistralai/Mistral-Small-24B-Instruct-2501": 4.709150314331055, + # TODO: not reproducible in CI, skipping for now + # "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16": 4.7737884521484375, + "nvidia/NVIDIA-Nemotron-Nano-12B-v2": 4.79390811920166, + # "openai/gpt-oss-20b": 4.689250946044922, + "Qwen/Qwen2.5-7B-Instruct": 4.778186798095703, + "Qwen/Qwen3-8B": 4.733874320983887, + "Qwen/Qwen3-VL-30B-A3B-Instruct": 4.65625, } -def _assert_score_pruning_activations(puzzle_dir: Path, hf_config_name: str): +def _assert_score_pruning_activations(puzzle_dir: Path, hf_model_name: str): """Assertions for the score_pruning_activations step 1.""" rank = dist.rank() rank_filepath = f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" @@ -245,7 +229,7 @@ def _assert_score_pruning_activations(puzzle_dir: Path, hf_config_name: str): pruning_scores = torch.load(puzzle_dir / rank_filepath) layer_names = list(pruning_scores.keys()) - expected = EXPECTED_PRUNING_VALUES[hf_config_name] + expected = EXPECTED_PRUNING_VALUES[hf_model_name] size = dist.size() if expected is not None: @@ -267,8 +251,8 @@ def _assert_score_pruning_activations(puzzle_dir: Path, hf_config_name: str): ) else: # Print values for new models - update EXPECTED_PRUNING_VALUES with these - print(f"\n=== PRUNING VALUES for {hf_config_name} (num_layers={len(layer_names)}) ===") - print(f'"{hf_config_name}": [') + print(f"\n=== PRUNING VALUES for {hf_model_name} (num_layers={len(layer_names)}) ===") + print(f'"{hf_model_name}": [') for layer_name in layer_names: layer_data = pruning_scores[layer_name] score = layer_data["score"][0].item() @@ -278,7 +262,7 @@ def _assert_score_pruning_activations(puzzle_dir: Path, hf_config_name: str): print("===") -def _assert_lm_loss(puzzle_dir: Path, hf_config_name: str): +def _assert_lm_loss(puzzle_dir: Path, hf_model_name: str, tolerance: float = 0.01): """Validate lm_loss for a 
model solution.""" solution_0_path = ( puzzle_dir / "single_sequence_replacement_solutions--validation/solution_0.json" @@ -287,19 +271,19 @@ def _assert_lm_loss(puzzle_dir: Path, hf_config_name: str): validation = json.load(f) actual_lm_loss = validation["lm_loss"]["avg"] - expected_lm_loss = EXPECTED_LM_LOSS.get(hf_config_name) + expected_lm_loss = EXPECTED_LM_LOSS.get(hf_model_name) if expected_lm_loss is not None: - assert abs(actual_lm_loss - expected_lm_loss) < 0.01, ( + assert abs(actual_lm_loss - expected_lm_loss) < tolerance, ( f"lm_loss mismatch: expected {expected_lm_loss}, got {actual_lm_loss}" ) else: # Print value for new models - update EXPECTED_LM_LOSS with this - print(f"\n=== LM_LOSS for {hf_config_name} ===") - print(f'"{hf_config_name}": {actual_lm_loss},') + print(f"\n=== LM_LOSS for {hf_model_name} ===") + print(f'"{hf_model_name}": {actual_lm_loss},') print("===") -def _assert_mip_solutions(puzzle_dir: Path, hf_config_name: str): +def _assert_mip_solutions(puzzle_dir: Path, hf_model_name: str): """Assertions for the mip_and_realize_models step.""" mip_dir = puzzle_dir / "mip/puzzle_solutions/target_memory_780000MiB" @@ -307,4 +291,4 @@ def _assert_mip_solutions(puzzle_dir: Path, hf_config_name: str): assert (mip_dir / "solutions--checkpoints/solution_0/config.json").exists() # Validate lm_loss - _assert_lm_loss(puzzle_dir, hf_config_name) + _assert_lm_loss(puzzle_dir, hf_model_name) diff --git a/tox.ini b/tox.ini index bcfb41fca3..33700288b8 100644 --- a/tox.ini +++ b/tox.ini @@ -73,6 +73,8 @@ commands = [testenv:cuda13-gpu-puzzletron] commands_pre = # Install deps here so that it gets installed even in --current-env + pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git + pip install --no-build-isolation git+https://github.com/Dao-AILab/causal-conv1d.git pip install -e .[hf,puzzletron,dev-test] commands = # Coverage fails with "Can't combine line data with arc data" error so not using "--cov"