diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index f3430012fa..55e63ac7a3 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,7 +1,14 @@
 NVIDIA Model Optimizer Changelog
 ================================
 
-0.43 (2026-03-xx)
+0.44 (2026-05-xx)
+^^^^^^^^^^^^^^^^^
+
+**New Features**
+
+- Support the full Transformer Engine spec for Minitron pruning (``mcore_minitron``), so the custom ModelOpt spec is no longer needed. This does not change the pruning workflow, but it makes pruning slightly faster and may produce a slightly different pruned model due to different kernels and numerics.
+
+0.43 (2026-04-09)
 ^^^^^^^^^^^^^^^^^
 
 **Bug Fixes**
@@ -29,7 +36,7 @@ NVIDIA Model Optimizer Changelog
 
 - Migrated project metadata from ``setup.py`` to a fully declarative ``pyproject.toml``.
 
-0.42 (2026-02-xx)
+0.42 (2026-03-10)
 ^^^^^^^^^^^^^^^^^
 
 **Bug Fixes**
diff --git a/examples/megatron_bridge/README.md b/examples/megatron_bridge/README.md
index db9b60090f..3c2c0034cd 100644
--- a/examples/megatron_bridge/README.md
+++ b/examples/megatron_bridge/README.md
@@ -16,9 +16,9 @@ This directory contains examples of using Model Optimizer with [NeMo Megatron-Br
 
 ## Pre-Requisites
 
-Running these examples requires many additional dependencies to be installed (e.g., Megatron-Bridge, Megatron-core, etc.), hence we strongly recommend directly using the NeMo container (e.g., `nvcr.io/nvidia/nemo:26.02`) which has all the dependencies installed.
+Running these examples requires many additional dependencies (e.g., Megatron-Bridge, Megatron-core), so we strongly recommend using the NeMo container (e.g., `nvcr.io/nvidia/nemo:26.02.01`), which has all the dependencies pre-installed.
 
-To get the latest ModelOpt features and examples scripts, mount your Model-Optimizer repo to the container.
+To get the ModelOpt example scripts, mount your Model-Optimizer repo to the container as follows:
 
 ```bash
 export MODELOPT_DIR=${PWD}/Model-Optimizer # or set to your local Model-Optimizer repository path if you have cloned it
@@ -26,7 +26,7 @@ if [ ! -d "${MODELOPT_DIR}" ]; then
   git clone https://github.com/NVIDIA/Model-Optimizer.git ${MODELOPT_DIR}
 fi
-export DOCKER_IMAGE=nvcr.io/nvidia/nemo:26.02
+export DOCKER_IMAGE=nvcr.io/nvidia/nemo:26.02.01
 docker run \
   --gpus all \
   --shm-size=16GB \
diff --git a/examples/megatron_bridge/prune_minitron.py b/examples/megatron_bridge/prune_minitron.py
index deecf0f8dc..662b9b8b96 100644
--- a/examples/megatron_bridge/prune_minitron.py
+++ b/examples/megatron_bridge/prune_minitron.py
@@ -241,7 +241,7 @@ def main(args: argparse.Namespace):
         },
         init_model_parallel=True,
     )
-    print_rank_0(f"\nPruning {unwrapped_model=}")
+    print_rank_0(f"\nPruning model (showing PP rank 0): {unwrapped_model}")
     print_rank_0(
         f"Original model params: {num2hrb(mtp.mcore_minitron.get_mcore_param_count(unwrapped_model))}"
     )
@@ -317,7 +317,7 @@ def score_func_mmlu(m):
         else "hybrid_layer_pattern"
     )
     setattr(provider, hybrid_key, getattr(unwrapped_model, hybrid_key))
-    print_rank_0(f"\nPruned {unwrapped_model=}")
+    print_rank_0(f"\nPruned model (showing PP rank 0): {unwrapped_model}")
     print_rank_0(
         f"Pruned model params: {num2hrb(mtp.mcore_minitron.get_mcore_param_count(unwrapped_model))}"
     )
diff --git a/modelopt/torch/nas/plugins/megatron.py b/modelopt/torch/nas/plugins/megatron.py
index b408da161c..85e471fea6 100644
--- a/modelopt/torch/nas/plugins/megatron.py
+++ b/modelopt/torch/nas/plugins/megatron.py
@@ -15,15 +15,23 @@
 """Plugin to add NAS/Pruning support for megatron-core Language models like GPT and Mamba."""
 
+import copy
 import types
 from abc import ABC
 from collections.abc import Callable, Sequence
 
 import torch
 import torch.nn as nn
-from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
+import transformer_engine as te
+from megatron.core.extensions.transformer_engine import (
+    TEColumnParallelLinear,
+    TEDotProductAttention,
+    TELayerNormColumnParallelLinear,
+    TERowParallelLinear,
+)
 from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
 from megatron.core.models.gpt import GPTModel
+from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec
 from megatron.core.parallel_state import is_pipeline_first_stage, is_pipeline_last_stage
 from megatron.core.tensor_parallel.layers import (
     ColumnParallelLinear,
@@ -31,13 +39,14 @@
     VocabParallelEmbedding,
 )
 from megatron.core.transformer.attention import SelfAttention
-from megatron.core.transformer.dot_product_attention import DotProductAttention
+from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.mlp import MLP
 from megatron.core.transformer.moe import moe_utils
 from megatron.core.transformer.moe.experts import SequentialMLP
 from megatron.core.transformer.moe.moe_layer import MoELayer
 from megatron.core.transformer.moe.router import TopKRouter
 from megatron.core.transformer.moe.shared_experts import SharedExpertMLP
+from megatron.core.transformer.spec_utils import ModuleSpec
 from megatron.core.transformer.transformer_layer import TransformerLayer
 
 from modelopt.torch.nas.modules import DynamicModuleList
@@ -55,16 +64,12 @@
 
 SUPPORTED_MODELS = {GPTModel: "megatron.core.models.gpt.GPTModel"}
 
-try:
-    from megatron.core.extensions.transformer_engine import TEDotProductAttention
-
-    HAS_TE = True
-except ImportError:
-    HAS_TE = False
-
 try:
     import mamba_ssm  # noqa: F401
     from megatron.core.models.mamba import MambaModel
+    from megatron.core.models.mamba.mamba_layer_specs import (
+        mamba_stack_spec as _te_mamba_stack_spec,
+    )
     from megatron.core.ssm.mamba_layer import MambaLayer
     from megatron.core.ssm.mamba_mixer import ExtendedRMSNorm, MambaMixer
@@ -74,9 +79,26 @@
 except ImportError:
     HAS_MAMBA = False
 
-__all__ = []
+__all__ = ["get_te_mamba_stack_spec"]
+
+
+# TODO: Maybe upstream this to Megatron-LM
+def get_te_mamba_stack_spec(moe_grouped_gemm: bool = False) -> ModuleSpec:
+    """Return the TE Mamba stack spec."""
+    assert HAS_MAMBA
+    if moe_grouped_gemm:
+        return _te_mamba_stack_spec
+
+    # The upstream TE mamba stack spec hardcodes TEGroupedMLP for MoE.
+    # Replace it with SequentialMLP (TE linear layers, no grouped gemm dependency).
+    te_mamba_stack_spec = copy.deepcopy(_te_mamba_stack_spec)
+    te_mamba_stack_spec.submodules.moe_layer.submodules.mlp = get_moe_module_spec(
+        use_te=True, num_experts=8, moe_grouped_gemm=False
+    )
+    return te_mamba_stack_spec
+
 
 # Local Parallel Linear DynamicModules ##########################################################
 class _DynamicParallelLinear(DynamicModule):
     """A parallel linear layer with dynamic hyperparams."""
@@ -127,6 +149,74 @@ def _setup(self, *, input_size: TracedHp | None = None, output_size: TracedHp |
         )
 
 
+# TE Parallel Linear DynamicModules ################################################################
+class _DynamicTEParallelLinear(DynamicModule):
+    """Base for TE parallel linear layers that use in_features/out_features naming."""
+
+    def _setup(self, *, input_size: TracedHp | None = None, output_size: TracedHp | None = None):
+        if input_size is None:
+            input_size = TracedHp(list(range(1, self.in_features + 1)))
+        self._register_hparam("input_size", input_size)
+
+        if output_size is None:
+            output_size = TracedHp(list(range(1, self.out_features + 1)))
+        self._register_hparam("output_size", output_size)
+
+        self._register_dynamic_attribute("weight", self._get_weight)
+        # TE stores a zero-length tensor (not None) when bias=False; only register if non-empty
+        if hasattr(self, "bias") and self.bias is not None and self.bias.numel() > 0:
+            self._register_dynamic_attribute("bias", self._get_bias)
+        self._register_dynamic_attribute("in_features", lambda mod, val: mod.input_size)
+        self._register_dynamic_attribute("out_features", lambda mod, val: mod.output_size)
+
+    @staticmethod
+    def _get_weight(mod: "_DynamicTEParallelLinear", weight: torch.Tensor) -> torch.Tensor:
+        return get_sliced_tensor(mod, weight, "output_size", "input_size")
+
+    @staticmethod
+    def _get_bias(
+        mod: "_DynamicTEParallelLinear", bias: torch.Tensor | None
+    ) -> torch.Tensor | None:
+        return get_sliced_tensor(mod, bias, "output_size")
+
+
+@DMRegistry.register(
+    {TEColumnParallelLinear: "megatron.core.extensions.transformer_engine.TEColumnParallelLinear"}
+)
+class _DynamicTEColumnParallelLinear(_DynamicTEParallelLinear):
+    """A TEColumnParallelLinear layer with dynamic hyperparams."""
+
+
+@DMRegistry.register(
+    {TERowParallelLinear: "megatron.core.extensions.transformer_engine.TERowParallelLinear"}
+)
+class _DynamicTERowParallelLinear(_DynamicTEParallelLinear):
+    """A TERowParallelLinear layer with dynamic hyperparams."""
+
+
+@DMRegistry.register(
+    {
+        TELayerNormColumnParallelLinear: (
+            "megatron.core.extensions.transformer_engine.TELayerNormColumnParallelLinear"
+        )
+    }
+)
+class _DynamicTELayerNormColumnParallelLinear(_DynamicTEParallelLinear):
+    """A TELayerNormColumnParallelLinear with dynamic hyperparams (includes fused layernorm)."""
+
+    def _setup(self, *, input_size: TracedHp | None = None, output_size: TracedHp | None = None):
+        super()._setup(input_size=input_size, output_size=output_size)
+        self._register_dynamic_attribute("layer_norm_weight", self._get_ln_param)
+        if hasattr(self, "layer_norm_bias") and self.layer_norm_bias is not None:
+            self._register_dynamic_attribute("layer_norm_bias", self._get_ln_param)
+
+    @staticmethod
+    def _get_ln_param(
+        mod: "_DynamicTELayerNormColumnParallelLinear", val: torch.Tensor | None
+    ) -> torch.Tensor | None:
+        return get_sliced_tensor(mod, val, "input_size")
+
+
 # Embedding DynamicModule ##########################################################################
 @DMRegistry.register(
     {
@@ -175,19 +265,20 @@ def export(self) -> torch.nn.Module:
         return super().export()
 
 
-# Normalization DynamicModule ######################################################################
-@DMRegistry.register({FusedLayerNorm: "megatron.core.fusions.fused_layer_norm.FusedLayerNorm"})
-class _DynamicFusedLayerNorm(_DynamicLayerNorm):
-    """A FusedLayerNorm layer with dynamic hyperparams."""
+# TE Normalization DynamicModule ###################################################################
+@DMRegistry.register(
+    {te.pytorch.LayerNorm: "te.pytorch.LayerNorm", te.pytorch.RMSNorm: "te.pytorch.RMSNorm"}
+)
+class _DynamicTENorm(_DynamicLayerNorm):
+    """A ``te.pytorch.{Layer/RMS}Norm`` layer with dynamic hyperparams."""
 
     def _setup(self, *, num_features: TracedHp):
-        """Setup the FusedLayerNorm dynamic module with pre-defined num_features hparam."""
+        """Setup the TENorm dynamic module with pre-defined num_features hparam."""
         self._register_hparam("num_features", num_features)
 
-        # register dynamic attributes
         self._register_dynamic_attribute("weight", self._cut_to_active_features)
-        self._register_dynamic_attribute("bias", self._cut_to_active_features)
-        self._register_dynamic_attribute("hidden_size", self._get_normalized_shape)
+        if hasattr(self, "bias"):  # Bias is not present in RMSNorm
+            self._register_dynamic_attribute("bias", self._cut_to_active_features)
 
 
 # MLP DynamicModule ################################################################################
@@ -291,23 +382,24 @@ def active_slice(self) -> torch.LongTensor:
 
 
 # NOTE: We provide a parent class since we do not register to DMRegistry.
-class _DynamicQKVColumnParallelLinear(DynamicModule, ColumnParallelLinear):
-    """An mcore ColumnParallelLinear layer for linear_qkv with dynamic attributes."""
+class _DynamicTEQKVLayerNormColumnParallelLinear(DynamicModule, TELayerNormColumnParallelLinear):
+    """TE's fused LayerNorm+ColumnParallelLinear for QKV projection with dynamic attributes."""
 
     def _setup(self, *, num_attention_heads: NumAttentionHeadsHp, hidden_size: TracedHp):
-        """Setup the _DynamicQKVColumnParallelLinear dynamic module with global hidden_size hparam."""
         self._register_hparam("input_size", hidden_size)
         self._register_hparam("num_attention_heads", num_attention_heads)
         self._register_dynamic_attribute(
-            "output_size",
+            "out_features",
             lambda mod, val: (num_attention_heads.active + 2 * mod.config.num_query_groups)
             * mod.config.kv_channels,
         )
-        self._register_dynamic_attribute(
-            "output_size_per_partition", lambda mod, val: mod.output_size
-        )
         self._register_dynamic_attribute("weight", self._get_weight)
-        self._register_dynamic_attribute("bias", self._get_bias)
+        # TE stores a zero-length tensor (not None) when bias=False; only register if non-empty
+        if hasattr(self, "bias") and self.bias is not None and self.bias.numel() > 0:
+            self._register_dynamic_attribute("bias", self._get_bias)
+        self._register_dynamic_attribute("layer_norm_weight", self._get_ln_param)
+        if hasattr(self, "layer_norm_bias") and self.layer_norm_bias is not None:
+            self._register_dynamic_attribute("layer_norm_bias", self._get_ln_param)
 
     def _get_output_size_indices(self) -> torch.LongTensor:
         """Get the indices of the output size based on sorted + pruned attention heads.
@@ -374,38 +466,42 @@ def _get_output_size_indices(self) -> torch.LongTensor:
         return selected_indices.cpu()
 
     @staticmethod
-    def _get_weight(mod: "_DynamicQKVColumnParallelLinear", weight: torch.Tensor) -> torch.Tensor:
-        """Return the weight tensor of the linear layer."""
+    def _get_weight(
+        mod: "_DynamicTEQKVLayerNormColumnParallelLinear", weight: torch.Tensor
+    ) -> torch.Tensor:
         return get_sliced_tensor_by_slices(
             weight, [mod._get_output_size_indices(), mod.get_hparam("input_size").active_slice]
         )
 
     @staticmethod
     def _get_bias(
-        mod: "_DynamicQKVColumnParallelLinear", bias: torch.Tensor | None
+        mod: "_DynamicTEQKVLayerNormColumnParallelLinear", bias: torch.Tensor | None
     ) -> torch.Tensor | None:
-        """Return the bias tensor of the linear layer."""
        if bias is None:
             return bias
         return get_sliced_tensor_by_slices(bias, [mod._get_output_size_indices()])
 
+    @staticmethod
+    def _get_ln_param(
+        mod: "_DynamicTEQKVLayerNormColumnParallelLinear", val: torch.Tensor | None
+    ) -> torch.Tensor | None:
+        return get_sliced_tensor(mod, val, "input_size")
+
 
 # NOTE: We provide a parent class since we do not register to DMRegistry.
-class _DynamicProjRowParallelLinear(DynamicModule, RowParallelLinear):
-    """An mcore RowParallelLinear layer for linear_qkv with dynamic attributes."""
+class _DynamicTEProjRowParallelLinear(DynamicModule, TERowParallelLinear):
+    """TE's RowParallelLinear for output projection with dynamic attributes."""
 
     def _setup(self, *, num_attention_heads: NumAttentionHeadsHp, hidden_size: TracedHp):
-        """Setup the _DynamicProjRowParallelLinear dynamic module with global hidden_size hparam."""
         self._register_hparam("output_size", hidden_size)
         self._register_hparam("num_attention_heads", num_attention_heads)
         self._register_dynamic_attribute(
-            "input_size", lambda mod, val: num_attention_heads.active * mod.config.kv_channels
-        )
-        self._register_dynamic_attribute(
-            "input_size_per_partition", lambda mod, val: mod.input_size
+            "in_features", lambda mod, val: num_attention_heads.active * mod.config.kv_channels
         )
         self._register_dynamic_attribute("weight", self._get_weight)
-        self._register_dynamic_attribute("bias", self._get_bias)
+        # TE stores a zero-length tensor (not None) when bias=False; only register if non-empty
+        if hasattr(self, "bias") and self.bias is not None and self.bias.numel() > 0:
+            self._register_dynamic_attribute("bias", self._get_bias)
 
     def _get_input_size_indices(self) -> torch.LongTensor:
         """Get the indices of the input size based on sorted + pruned heads and query groups."""
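`_get_output_size_indices` above maps the kept (sorted/pruned) attention heads to rows of the fused QKV weight. A hedged toy version of that index math, assuming mcore's per-query-group layout of `[q heads of the group..., k, v]` and that all query groups are kept (this simplifies ModelOpt's actual `expand_head_indices` helper):

```python
import torch


def toy_qkv_row_indices(kept_heads, num_query_groups, heads_per_group, kv_channels):
    """Toy illustration only: weight rows to keep in a fused QKV projection."""
    rows = []
    for g in range(num_query_groups):
        group_rows = (heads_per_group + 2) * kv_channels  # q heads + one K + one V
        base = g * group_rows
        for h_local in range(heads_per_group):
            h_global = g * heads_per_group + h_local
            if h_global in kept_heads:
                start = base + h_local * kv_channels
                rows.extend(range(start, start + kv_channels))
        # K and V rows of this query group are always kept
        kv_start = base + heads_per_group * kv_channels
        rows.extend(range(kv_start, kv_start + 2 * kv_channels))
    return torch.tensor(rows)


# 4 heads in 2 query groups, 2 channels per head: keep heads 0 and 3
print(toy_qkv_row_indices({0, 3}, num_query_groups=2, heads_per_group=2, kv_channels=2))
```

This also makes the `out_features` formula registered above concrete: `(active_heads + 2 * num_query_groups) * kv_channels` rows survive.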
@@ -420,17 +516,15 @@ def _get_input_size_indices(self) -> torch.LongTensor:
         return selected_indices.cpu()
 
     @staticmethod
-    def _get_weight(mod: "_DynamicProjRowParallelLinear", weight: torch.Tensor) -> torch.Tensor:
-        """Return the weight tensor of the linear layer."""
+    def _get_weight(mod: "_DynamicTEProjRowParallelLinear", weight: torch.Tensor) -> torch.Tensor:
         return get_sliced_tensor_by_slices(
             weight, [mod.get_hparam("output_size").active_slice, mod._get_input_size_indices()]
         )
 
     @staticmethod
     def _get_bias(
-        mod: "_DynamicProjRowParallelLinear", bias: torch.Tensor | None
+        mod: "_DynamicTEProjRowParallelLinear", bias: torch.Tensor | None
     ) -> torch.Tensor | None:
-        """Return the bias tensor of the linear layer."""
         return get_sliced_tensor(mod, bias, "output_size")
 
 
@@ -454,43 +548,28 @@ def _setup(self, *, hidden_size: TracedHp):
             "num_attention_heads_per_partition", lambda mod, val: self.num_attention_heads
         )
 
-        # Convert the Dot Product Attention to dynamic module
-        if isinstance(self.core_attention, DotProductAttention):
-            _DynamicDotProductAttention: DynamicModule = type(  # noqa: N806
-                "_DynamicDotProductAttention",
-                (DynamicModule, DotProductAttention),
-                {"_setup": lambda self: None},
-            )
-
-            _DynamicDotProductAttention.convert(self.core_attention)
-            self.core_attention._register_dynamic_attribute(
-                "hidden_size_per_partition",
-                lambda mod, val: self.config.kv_channels * self.num_attention_heads_per_partition,
-            )
-            self.core_attention._register_dynamic_attribute(
-                "num_attention_heads_per_partition",
-                lambda mod, val: self.num_attention_heads_per_partition,
-            )
-        else:
-            assert HAS_TE and isinstance(self.core_attention, TEDotProductAttention)
-
-            _DynamicTEDotProductAttention: DynamicModule = type(  # noqa: N806
-                "_DynamicTEDotProductAttention",
-                (DynamicModule, TEDotProductAttention),
-                {"_setup": lambda self: None},
-            )
-
-            _DynamicTEDotProductAttention.convert(self.core_attention)
-            self.core_attention._register_dynamic_attribute(
-                "num_attention_heads", lambda mod, val: self.num_attention_heads_per_partition
-            )
+        # Convert the TEDotProductAttention to dynamic module
+        assert isinstance(self.core_attention, TEDotProductAttention)
+        _DynamicTEDotProductAttention: DynamicModule = type(  # noqa: N806
+            "_DynamicTEDotProductAttention",
+            (DynamicModule, TEDotProductAttention),
+            {"_setup": lambda self: None},
+        )
+        _DynamicTEDotProductAttention.convert(self.core_attention)
+        self.core_attention._register_dynamic_attribute(
+            "num_attention_heads", lambda mod, val: self.num_attention_heads_per_partition
+        )
 
         # Convert the fused qkv and output projection linear layer to dynamic module
-        _DynamicQKVColumnParallelLinear.convert(
-            self.linear_qkv, num_attention_heads=num_attention_heads, hidden_size=hidden_size
+        _DynamicTEQKVLayerNormColumnParallelLinear.convert(
+            self.linear_qkv,
+            num_attention_heads=num_attention_heads,
+            hidden_size=hidden_size,
         )
-        _DynamicProjRowParallelLinear.convert(
-            self.linear_proj, num_attention_heads=num_attention_heads, hidden_size=hidden_size
+        _DynamicTEProjRowParallelLinear.convert(
+            self.linear_proj,
+            num_attention_heads=num_attention_heads,
+            hidden_size=hidden_size,
         )
 
     def export(self) -> torch.nn.Module:
@@ -585,10 +664,16 @@ def _setup(self, *, hidden_size: TracedHp):
 
     def forward(self, *args, **kwargs):
         """Forward pass for the MoE layer."""
-        # Dont allow forward if model is sorted / trimmed unless exported (reinitializing token dispatcher correctly)
-        if isinstance(self, DynamicModule) and (
-            self.get_hparam("num_moe_experts")._slice_order is not None
-            or self.get_hparam("num_moe_experts").active != self.get_hparam("num_moe_experts").max
+        # Don't allow forward if the model is sorted / trimmed unless the token dispatcher has been
+        # reinitialized (via _export_reinit_token_dispatcher in _prune or export).
+        if (
+            isinstance(self, DynamicModule)
+            and not getattr(self, "_token_dispatcher_reinitialized", False)
+            and (
+                self.get_hparam("num_moe_experts")._slice_order is not None
+                or self.get_hparam("num_moe_experts").active
+                != self.get_hparam("num_moe_experts").max
+            )
         ):
             raise RuntimeError("Only run forward after exporting the pruned model")
         return super().forward(*args, **kwargs)
@@ -610,10 +695,7 @@ def modify(
 
     def _export_reinit_token_dispatcher(self) -> None:
         """Reinitialize the token dispatcher after pruning."""
-        if hasattr(moe_utils, "get_default_model_comm_pgs"):
-            model_comm_pgs = moe_utils.get_default_model_comm_pgs()
-        else:
-            model_comm_pgs = moe_utils.get_default_pg_collection()
+        model_comm_pgs = moe_utils.get_default_pg_collection()
         # NOTE: Update config.num_moe_experts for correct router initialization.
         self.config.num_moe_experts = self.num_moe_experts
         self.token_dispatcher = type(self.token_dispatcher)(
@@ -623,6 +705,9 @@ def _export_reinit_token_dispatcher(self) -> None:
         if self.use_shared_expert and self.shared_expert_overlap:
             self.token_dispatcher.set_shared_experts(self.shared_experts)
 
+        # Allow forward after token dispatcher reinitialization
+        self._token_dispatcher_reinitialized = True
+
     def export(self) -> torch.nn.Module:
         """Export the dynamic module to a standard MoELayer."""
         self.router.export()
@@ -642,14 +727,16 @@ class _DynamicTransformerLayer(DynamicModule):
 
     def _setup(self, *, hidden_size: TracedHp):
         """Setup the TransformerLayer dynamic module with global hidden_size hparam."""
-        # Convert the layernorms, self-attention, and mlp/moe layers to dynamic modules
+        # Convert the self-attention and mlp/moe layers to dynamic modules
         # NOTE: Mamba stack layers have either Attention or MLP, not both unlike GPT models
         if isinstance(self.self_attention, SelfAttention):
-            DMRegistry.convert(self.input_layernorm, num_features=hidden_size)
             DMRegistry.convert(self.self_attention, hidden_size=hidden_size)
 
         if isinstance(self.mlp, (MLP, MoELayer)):
-            DMRegistry.convert(self.pre_mlp_layernorm, num_features=hidden_size)
+            # pre_mlp_layernorm is IdentityOp for dense MLP (fused into linear_fc1),
+            # but RMSNorm for MoETransformerLayer (separate from MoE experts)
+            if not isinstance(self.pre_mlp_layernorm, IdentityOp):
+                DMRegistry.convert(self.pre_mlp_layernorm, num_features=hidden_size)
             if isinstance(self.mlp, MoELayer):
                 setup_kwargs = {}
             else:
@@ -674,10 +761,10 @@ def modify(
     def export(self):
         """Export the dynamic module to a torch.nn.Module."""
         if isinstance(self.self_attention, SelfAttention):
-            self.input_layernorm.export()
             self.self_attention.export()
         if isinstance(self.mlp, (MLP, MoELayer)):
-            self.pre_mlp_layernorm.export()
+            if not isinstance(self.pre_mlp_layernorm, IdentityOp):
+                self.pre_mlp_layernorm.export()
             self.mlp.export()
         return super().export()
 
@@ -941,8 +1028,6 @@ def _setup(self, *, hidden_size: TracedHp):
         # Convert to dynamic module
         DMRegistry.convert(self.mixer, hidden_size=hidden_size)
 
-        DMRegistry.convert(self.norm, num_features=hidden_size)
-
     def modify(
         self,
         *,
@@ -958,7 +1043,6 @@ def modify(
     def export(self):
         """Export the dynamic module to a torch.nn.Module."""
         self.mixer.export()
-        self.norm.export()
         return super().export()
 
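One behavioral detail of the `_DynamicMoELayer` changes above is worth spelling out: after sorting/trimming, forward is blocked until `_export_reinit_token_dispatcher` runs and sets `_token_dispatcher_reinitialized`. A toy reduction of that guard (illustrative only, not the real MoELayer):

```python
import torch.nn as nn


class ToyMoELayer(nn.Module):
    """Toy guard: block forward on a pruned MoE layer until the (now stale)
    token dispatcher has been rebuilt for the new expert count."""

    def __init__(self, num_experts: int):
        super().__init__()
        self.num_experts = num_experts     # what the dispatcher was built for
        self.active_experts = num_experts  # what pruning selected
        self._token_dispatcher_reinitialized = False

    def prune_experts(self, keep: int) -> None:
        self.active_experts = keep  # dispatcher is now stale

    def reinit_token_dispatcher(self) -> None:
        # ModelOpt rebuilds the real dispatcher from the updated config here.
        self.num_experts = self.active_experts
        self._token_dispatcher_reinitialized = True

    def forward(self, x):
        if self.active_experts != self.num_experts and not self._token_dispatcher_reinitialized:
            raise RuntimeError("Only run forward after exporting the pruned model")
        return x


layer = ToyMoELayer(8)
layer.prune_experts(4)
try:
    layer(None)
except RuntimeError as e:
    print(e)  # Only run forward after exporting the pruned model
layer.reinit_token_dispatcher()
layer(None)  # now allowed
```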
diff --git a/modelopt/torch/nas/plugins/transformer_engine.py b/modelopt/torch/nas/plugins/transformer_engine.py
deleted file mode 100644
index 3392c0858c..0000000000
--- a/modelopt/torch/nas/plugins/transformer_engine.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Plugin to add NAS support for Transformer Engine modules."""
-
-import transformer_engine as te
-
-from ..modules import _DynamicLayerNorm
-from ..registry import DMRegistry
-from ..traced_hp import TracedHp
-
-__all__ = ["_DynamicTENorm"]
-
-
-@DMRegistry.register(
-    {te.pytorch.LayerNorm: "te.pytorch.LayerNorm", te.pytorch.RMSNorm: "te.pytorch.RMSNorm"}
-)
-class _DynamicTENorm(_DynamicLayerNorm):
-    """A ``te.pytorch.{Layer/RMS}Norm`` layer with dynamic hyperparams."""
-
-    def _setup(self, *, num_features: TracedHp):
-        """Setup the TENorm dynamic module with pre-defined num_features hparam."""
-        self._register_hparam("num_features", num_features)
-        # register dynamic attributes
-        self._register_dynamic_attribute("weight", self._cut_to_active_features)
-        if hasattr(self, "bias"):  # Bias is not present in RMSNorm
-            self._register_dynamic_attribute("bias", self._cut_to_active_features)
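The `mcore_minitron` changes below move activation collection from standalone layernorms to TE's fused linear layers, but the importance score itself is unchanged. A plain-PyTorch sketch of that accumulation, following the deleted docstring above (mean of absolute activations over sequence length, squared and summed over the batch, square root at the end):

```python
import torch


def toy_hidden_size_scores(batches):
    """Accumulate Minitron-style hidden_size importance over a forward loop."""
    agg = None
    for acts in batches:  # acts: [seq_len, batch_size, hidden_size]
        acts = acts.to(torch.float32)    # full precision to avoid overflow
        score = acts.abs().mean(dim=0)   # [batch_size, hidden_size]
        score = score.pow(2).sum(dim=0)  # [hidden_size]
        agg = score if agg is None else agg + score
    return agg.sqrt()  # L2 norm over samples


scores = toy_hidden_size_scores([torch.randn(16, 4, 8) for _ in range(3)])
print(scores.shape)  # torch.Size([8]); higher score = more important channel
```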
- -"""Plugin to add NAS support for Transformer Engine modules.""" - -import transformer_engine as te - -from ..modules import _DynamicLayerNorm -from ..registry import DMRegistry -from ..traced_hp import TracedHp - -__all__ = ["_DynamicTENorm"] - - -@DMRegistry.register( - {te.pytorch.LayerNorm: "te.pytorch.LayerNorm", te.pytorch.RMSNorm: "te.pytorch.RMSNorm"} -) -class _DynamicTENorm(_DynamicLayerNorm): - """A ``te.pytorch.{Layer/RMS}Norm`` layer with dynamic hyperparams.""" - - def _setup(self, *, num_features: TracedHp): - """Setup the TENorm dynamic module with pre-defined num_features hparam.""" - self._register_hparam("num_features", num_features) - # register dynamic attributes - self._register_dynamic_attribute("weight", self._cut_to_active_features) - if hasattr(self, "bias"): # Bias is not present in RMSNorm - self._register_dynamic_attribute("bias", self._cut_to_active_features) diff --git a/modelopt/torch/prune/plugins/mcore_minitron.py b/modelopt/torch/prune/plugins/mcore_minitron.py index 9e7f0faeb6..b40f97a2d9 100644 --- a/modelopt/torch/prune/plugins/mcore_minitron.py +++ b/modelopt/torch/prune/plugins/mcore_minitron.py @@ -34,6 +34,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from megatron.core.extensions.transformer_engine import TELayerNormColumnParallelLinear from megatron.core.models.gpt.gpt_model import GPTModel from megatron.core.models.mamba.mamba_model import MambaModel from megatron.core.parallel_state import ( @@ -182,7 +183,7 @@ class MCoreMinitronSearcher(BaseSearcher): - `top_k`: Number of candidates to consider for score_func validation (default: 10). """ - activations_per_rank: list[dict[str, torch.Tensor]] + local_activations: dict[str, torch.Tensor] layer_scores: dict[int, torch.Tensor] sorted_layers: list[int] | None # 1-indexed sorted list of layer numbers # Dict from params constraint to list of tuples (ss_config, params, score) @@ -207,7 +208,7 @@ def default_search_config(self) -> SearchConfig: def default_state_dict(self) -> SearchStateDict: """Return default state dict for importance scores and activations from forward loop.""" return { - "activations_per_rank": [], + "local_activations": {}, "layer_scores": {}, "sorted_layers": None, "top_k_candidates_per_constraint": {}, @@ -273,8 +274,10 @@ def before_search(self) -> None: def run_search(self) -> None: """Run forward loop to collect activations, sort parameters, and prune the model.""" registry = ImportanceEstimatorRegistry(self.model) - if self.layer_scores and self.activations_per_rank: # Available from checkpoint - registry.set_activations_and_layer_scores(self.activations_per_rank, self.layer_scores) + if self.local_activations and self.layer_scores: # Available from per-rank checkpoint + registry.set_local_activations_and_layer_scores( + self.local_activations, self.layer_scores + ) elif not self.config["skip_sorting"]: assert self.forward_loop is not None is_training = self.model.training @@ -284,8 +287,8 @@ def run_search(self) -> None: self.model.train(is_training) # Store activations and layer scores for re-pruning with different export configs - self.activations_per_rank, self.layer_scores = ( - registry.get_activations_and_layer_scores() + self.local_activations, self.layer_scores = ( + registry.get_local_activations_and_layer_scores() ) self.save_search_checkpoint(verbose=True) @@ -384,7 +387,6 @@ def _prune(self, export_config: dict, prune_depth: bool = True) -> None: for m in self.model.modules(): if isinstance(m, _DynamicMoELayer): 
m._export_reinit_token_dispatcher() - break def search_best_arch_by_params(self) -> dict: """Search for the best architecture based on the given parameters constraints. @@ -865,6 +867,11 @@ def cleanup(self) -> None: handle.remove() self._hooks.clear() + # Unpatch return_layernorm_output on fused TELayerNormColumnParallelLinear modules + for m in self.model.modules(): + if isinstance(m, TELayerNormColumnParallelLinear): + m.return_layernorm_output = False + def get_layer_scores(self) -> dict[int, torch.Tensor]: """Get the layer scores (1-indexed) from the model. @@ -893,45 +900,38 @@ def get_layer_scores(self) -> dict[int, torch.Tensor]: return layer_scores - def get_activations_and_layer_scores( + def get_local_activations_and_layer_scores( self, - ) -> tuple[list[dict[str, torch.Tensor]], dict[int, torch.Tensor]]: - """Get the per-rank activations and layer scores from the model.""" - local_activations = {} - for n, m in self.model.named_modules(): - if hasattr(m, "_activations"): - local_activations[n] = m._activations - activations_per_rank = dist.allgather( - local_activations, group=get_pipeline_model_parallel_group() - ) - assert len(activations_per_rank) == get_pipeline_model_parallel_world_size() + ) -> tuple[dict[str, torch.Tensor], dict[int, torch.Tensor]]: + """Get this rank's local activations and global layer scores from the model. + Each rank saves its own activations to its per-rank checkpoint file (no allgather needed). + Layer scores are gathered across all PP ranks to produce a global ranking. + """ + local_activations = { + n: m._activations for n, m in self.model.named_modules() if hasattr(m, "_activations") + } layer_scores = self.get_layer_scores() - return activations_per_rank, layer_scores + return local_activations, layer_scores - def set_activations_and_layer_scores( + def set_local_activations_and_layer_scores( self, - activations_per_rank: list[dict[str, torch.Tensor]], + local_activations: dict[str, torch.Tensor], layer_scores: dict[int, torch.Tensor], ) -> None: - """Set the pre-computed layer_scores and per-rank activations instead of running forward. + """Set the pre-computed layer_scores and local activations instead of running forward. Args: - activations_per_rank: List of dicts from module name to activations. Should match PP size. - layer_scores: Dict from layer_number (1-indexed) to score. + local_activations: Dict from module name to activations for this rank. + layer_scores: Dict from layer_number (1-indexed) to score (global across all PP ranks). """ - print_rank_0("Loading activations and scores per rank from checkpoint...") - rank = get_pipeline_model_parallel_rank() - pp_size = get_pipeline_model_parallel_world_size() - assert len(activations_per_rank) == pp_size, ( - f"Expected same PP size for stored pruning scores ({len(activations_per_rank)}) as current ({pp_size})!" - ) + print_rank_0("Loading activations and scores from per-rank checkpoint...") for layer in self.model.decoder.layers: layer._scores = layer_scores[layer.layer_number] for n, m in self.model.named_modules(): if hasattr(m, "_activations"): - m._activations = activations_per_rank[rank][n] + m._activations = local_activations[n] # Module-specific registration functions @@ -941,25 +941,48 @@ def _register_hidden_size_importance( """Register importance estimators for Language Model (GPT/Mamba) modules.""" module._register_temp_attribute("_activations", {}) - def _emb_layernorm_forward_hook(mod, module_inner, input, output): - """Hook to collect activations for importance estimation. 
+ def _collect_activations(mod, module_id, activations_tensor): + """Accumulate activation importance scores for a given module.""" + activations_tensor = activations_tensor.to(torch.float32) + activations = activations_tensor.abs().mean(dim=0) # [batch_size, hidden_size] + activations = activations.pow(2).sum(dim=0) + if module_id not in mod._activations: + mod._activations[module_id] = activations + else: + mod._activations[module_id] += ( + activations # aggregate sum instead of mean of scores for simplicity + ) - Activations are computed as mean over seq_len and then squared and summed over batch_size. - Later we take the square root of the sum to get the L2 norm. + def _fused_ln_linear_forward_hook(mod, module_inner, input, output): + """Hook on TELayerNormColumnParallelLinear with return_layernorm_output=True. + + Extracts the exact layernorm output from TE's fused kernel and restores + the normal return format so downstream code is not affected. """ + # Output format with return_layernorm_output=True: + # te_return_bias=True: MCore returns (linear_out, bias, ln_out) + # te_return_bias=False: MCore returns ((linear_out, ln_out), None) + if module_inner.te_return_bias: + linear_out, bias, ln_out = output + fixed_output = (linear_out, bias) + else: + (linear_out, ln_out), bias = output + fixed_output = (linear_out, bias) + + # Gather over all TP regions + # NOTE: This is not used at the moment since we restrict to TP=1 + ln_out = gather_from_tensor_model_parallel_region(ln_out).detach() + _collect_activations(mod, id(module_inner), ln_out) + + # Return the normal output format so downstream code (e.g. SelfAttention) is not affected + return fixed_output + + def _layernorm_forward_hook(mod, module_inner, input, output): + """Hook on separate layernorm modules (e.g. TENorm for MoE pre_mlp_layernorm).""" # Gather output [seq_len, batch_size, hidden_size] over all TP regions # NOTE: This is not used at the moment since we restrict to TP=1 output = gather_from_tensor_model_parallel_region(output).detach() - - output = output.to(torch.float32) # use full precision to avoid overflow - activations = output.abs().mean(dim=0) # [batch_size, hidden_size] - activations = activations.pow(2).sum(dim=0) - if id(module_inner) not in mod._activations: - mod._activations[id(module_inner)] = activations - else: - mod._activations[id(module_inner)] += ( - activations # aggregate sum instead of mean of scores for simplicity - ) + _collect_activations(mod, id(module_inner), output) def _estimate_hidden_size_importance(mod): """Return the activation magnitude-based importance of the hidden_size.""" @@ -973,25 +996,44 @@ def _estimate_hidden_size_importance(mod): torch.distributed.all_reduce(activations, op=torch.distributed.ReduceOp.SUM) return activations - # Register hooks for all layers + # Register hooks to collect post-layernorm activations for hidden_size importance. + # Layernorms are fused into TELayerNormColumnParallelLinear. We temporarily + # patch return_layernorm_output=True so TE's fused kernel returns the layernorm output. + # For MoE layers, pre_mlp_layernorm is a separate TENorm — use a regular forward hook. 
+ for m in module.modules(): + if isinstance(m, TELayerNormColumnParallelLinear): + m.return_layernorm_output = True + for layer in module.decoder.layers: if isinstance(layer, _DynamicTransformerLayer): if isinstance(layer.self_attention, _DynamicSelfAttention): + # input_layernorm is fused into self_attention.linear_qkv registry.register_hook( - layer.input_layernorm, - partial(_emb_layernorm_forward_hook, module), + layer.self_attention.linear_qkv, + partial(_fused_ln_linear_forward_hook, module), hook_type="forward", ) - if isinstance(layer.mlp, (_DynamicMLP, _DynamicSequentialMLP)): + if isinstance(layer.mlp, _DynamicMoELayer): + # MoE layers have a separate pre_mlp_layernorm (TENorm, not IdentityOp) registry.register_hook( layer.pre_mlp_layernorm, - partial(_emb_layernorm_forward_hook, module), + partial(_layernorm_forward_hook, module), + hook_type="forward", + ) + elif isinstance(layer.mlp, _DynamicMLP): + # Dense MLP: pre_mlp_layernorm is fused into mlp.linear_fc1 + registry.register_hook( + layer.mlp.linear_fc1, + partial(_fused_ln_linear_forward_hook, module), hook_type="forward", ) elif isinstance(layer, _DynamicMambaLayer): + # Mamba norm is fused into mixer.in_proj registry.register_hook( - layer.norm, partial(_emb_layernorm_forward_hook, module), hook_type="forward" + layer.mixer.in_proj, + partial(_fused_ln_linear_forward_hook, module), + hook_type="forward", ) registry.register_importance( diff --git a/modelopt/torch/utils/plugins/mbridge.py b/modelopt/torch/utils/plugins/mbridge.py index 94cdf87cf5..da8c773da5 100644 --- a/modelopt/torch/utils/plugins/mbridge.py +++ b/modelopt/torch/utils/plugins/mbridge.py @@ -23,13 +23,9 @@ from megatron.bridge.data.builders.hf_dataset import HFDatasetConfig from megatron.bridge.data.loaders import setup_data_iterators from megatron.bridge.data.utils import get_dataset_provider -from megatron.bridge.models.gpt_provider import GPTModelProvider, modelopt_transformer_layer_spec +from megatron.bridge.models.gpt_provider import GPTModelProvider from megatron.bridge.models.hf_pretrained.utils import is_safe_repo -from megatron.bridge.models.mamba.mamba_provider import ( - MambaModelProvider, - modelopt_mamba_stack_spec, -) -from megatron.bridge.models.nemotronh.nemotron_h_provider import NemotronHModelProvider +from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider from megatron.bridge.training.config import ( CheckpointConfig, ConfigContainer, @@ -44,12 +40,14 @@ from megatron.bridge.training.state import GlobalState from megatron.bridge.training.tokenizers.config import TokenizerConfig from megatron.core.models.gpt import GPTModel +from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec from megatron.core.models.mamba import MambaModel from megatron.core.parallel_state import get_data_parallel_group from megatron.core.transformer.module import MegatronModule from megatron.core.utils import unwrap_model from transformers import AutoTokenizer +from modelopt.torch.nas.plugins.megatron import get_te_mamba_stack_spec from modelopt.torch.utils import get_dataset_samples, print_rank_0, warn_rank_0 __all__ = ["get_hf_mbridge_calibration_loop", "load_mbridge_model_from_hf"] @@ -94,12 +92,15 @@ def load_mbridge_model_from_hf( assert hasattr(provider, key), f"{type(provider)} does not have attribute {key}" setattr(provider, key, value) - print_rank_0("Setting ModelOpt spec for model provider") + # disable moe_grouped_gemm in default TE spec until its supported if isinstance(provider, 
MambaModelProvider): - provider.mamba_stack_spec = modelopt_mamba_stack_spec + provider.mamba_stack_spec = get_te_mamba_stack_spec(moe_grouped_gemm=False) else: - provider.transformer_layer_spec = modelopt_transformer_layer_spec - + provider.transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=provider.num_moe_experts, + moe_grouped_gemm=False, + qk_layernorm=provider.qk_layernorm, + ) provider.finalize() if init_model_parallel: provider.initialize_model_parallel(seed=0) @@ -179,9 +180,6 @@ def get_hf_mbridge_calibration_loop( global_batch_size = micro_batch_size num_iters = num_samples // global_batch_size - # NOTE: Issue with NemotronH tokenizer's len() hence using use_fast=True as a WAR - use_fast_tokenizer = isinstance(provider, NemotronHModelProvider) - cfg = ConfigContainer( model=provider, train=TrainingConfig( @@ -203,9 +201,10 @@ def get_hf_mbridge_calibration_loop( tokenizer=TokenizerConfig( tokenizer_type="HuggingFaceTokenizer", tokenizer_model=hf_model_name_or_path, + # NOTE: Issue with Nemotron Nano v2 tokenizer returning bool hence using use_fast=True as a WAR hf_tokenizer_kwargs={ "trust_remote_code": trust_remote_code, - "use_fast": use_fast_tokenizer, + "use_fast": tokenizer.is_fast, }, ), # Unused diff --git a/tests/_test_utils/torch/megatron/models.py b/tests/_test_utils/torch/megatron/models.py index 42d722cd40..2d6fe2a6e7 100644 --- a/tests/_test_utils/torch/megatron/models.py +++ b/tests/_test_utils/torch/megatron/models.py @@ -31,6 +31,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from modelopt.torch.export.unified_export_megatron import import_mcore_gpt_from_hf +from modelopt.torch.nas.plugins.megatron import get_te_mamba_stack_spec try: from megatron.core.extensions.transformer_engine import TENorm @@ -183,8 +184,8 @@ def squared_relu(x): pipeline_dtype=torch.bfloat16 if bf16 else torch.float32, bf16=bf16, # MoE-specific parameters + moe_router_dtype=None, moe_grouped_gemm=moe_grouped_gemm, - moe_router_dtype="fp32", moe_ffn_hidden_size=moe_ffn_hidden_size, moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size, moe_router_enable_expert_bias=True, @@ -207,23 +208,21 @@ def squared_relu(x): assert HAS_APEX, "Apex not installed" transformer_layer_spec = get_gpt_layer_local_spec( num_experts=num_moe_experts, - normalization=normalization, moe_grouped_gemm=moe_grouped_gemm, - # TODO: uncomment this when TEGroupedMLP is enabled in Megatron-LM - # use_te=use_te, + normalization=normalization, ) else: assert HAS_TE, "Transformer Engine not installed" - transformer_layer_spec = ( - get_gpt_modelopt_spec( + if transformer_impl == "modelopt": + transformer_layer_spec = get_gpt_modelopt_spec( config, remap_te_layernorm=True, - # TODO: uncomment this when TEGroupedMLP is enabled in Megatron-LM - # moe_grouped_gemm=moe_grouped_gemm ) - if transformer_impl == "modelopt" - else get_gpt_layer_with_transformer_engine_spec() - ) + else: + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec( + num_experts=num_moe_experts, + moe_grouped_gemm=moe_grouped_gemm, + ) model = GPTModel( config=config, @@ -306,6 +305,7 @@ def get_mcore_mamba_hybrid_model( vocab_size: int = 64, bf16: bool = True, sequence_parallel: bool = False, + transformer_impl: str = "modelopt", # Mamba-specific parameters mamba_state_dim: int = 32, mamba_num_heads: int | None = None, @@ -313,6 +313,7 @@ def get_mcore_mamba_hybrid_model( mamba_num_groups: int = 2, # MoE-specific parameters skip_moe: bool = False, + 
moe_grouped_gemm: bool = False, moe_ffn_hidden_size: int | None = 64, moe_shared_expert_intermediate_size: int | None = 32, num_moe_experts: int | None = 8, @@ -346,6 +347,7 @@ def get_mcore_mamba_hybrid_model( mamba_head_dim=mamba_head_dim, mamba_num_groups=mamba_num_groups, num_moe_experts=num_moe_experts, + moe_grouped_gemm=moe_grouped_gemm, moe_ffn_hidden_size=moe_ffn_hidden_size, moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size, add_bias_linear=False, @@ -383,9 +385,14 @@ def get_mcore_mamba_hybrid_model( assert len(hybrid_override_pattern.replace("|", "")) == num_layers print(f"Using `{hybrid_override_pattern=}` for building MambaModel") + if transformer_impl == "transformer_engine": + mamba_spec = get_te_mamba_stack_spec(moe_grouped_gemm=moe_grouped_gemm) + else: + mamba_spec = get_mamba_stack_modelopt_spec(remap_te_layernorm=True) + model = MambaModel( config=config, - mamba_stack_spec=get_mamba_stack_modelopt_spec(remap_te_layernorm=True), + mamba_stack_spec=mamba_spec, vocab_size=vocab_size, max_sequence_length=max_sequence_length, hybrid_override_pattern=hybrid_override_pattern, diff --git a/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py index 4d905e6cec..158b6cafac 100644 --- a/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py +++ b/tests/gpu_megatron/torch/nas/plugins/test_megatron_gpt_dynamic_modules.py @@ -29,17 +29,17 @@ from modelopt.torch.nas.modules import DynamicModuleList from modelopt.torch.nas.plugins.megatron import ( NumAttentionHeadsHp, - _DynamicColumnParallelLinear, _DynamicEmbedding, _DynamicLanguageModelEmbedding, _DynamicMCoreLanguageModel, _DynamicMLP, _DynamicMoELayer, - _DynamicProjRowParallelLinear, - _DynamicQKVColumnParallelLinear, - _DynamicRowParallelLinear, _DynamicSelfAttention, _DynamicSequentialMLP, + _DynamicTELayerNormColumnParallelLinear, + _DynamicTEProjRowParallelLinear, + _DynamicTEQKVLayerNormColumnParallelLinear, + _DynamicTERowParallelLinear, _DynamicTopKRouter, _DynamicTransformerLayer, expand_head_indices, @@ -76,6 +76,7 @@ def _test_gpt_search_space( vocab_size=vocab_size, activation_func=activation_func, normalization=normalization, + transformer_impl="transformer_engine", ).cuda() mtn.convert( @@ -101,12 +102,12 @@ def _test_gpt_search_space( assert isinstance(m, _DynamicTransformerLayer) elif isinstance(m, MLP): assert isinstance(m, _DynamicMLP) - assert isinstance(m.linear_fc1, _DynamicColumnParallelLinear) - assert isinstance(m.linear_fc2, _DynamicRowParallelLinear) + assert isinstance(m.linear_fc1, _DynamicTELayerNormColumnParallelLinear) + assert isinstance(m.linear_fc2, _DynamicTERowParallelLinear) elif isinstance(m, SelfAttention): assert isinstance(m, _DynamicSelfAttention) - assert isinstance(m.linear_qkv, _DynamicQKVColumnParallelLinear) - assert isinstance(m.linear_proj, _DynamicProjRowParallelLinear) + assert isinstance(m.linear_qkv, _DynamicTEQKVLayerNormColumnParallelLinear) + assert isinstance(m.linear_proj, _DynamicTEProjRowParallelLinear) # NOTE: `search_space_size` does not reduce across TP/PP groups ss_size_per_pp = search_space_size(model) @@ -139,7 +140,6 @@ def _test_gpt_search_space( [ (8, 8, "squared_relu", "LayerNorm"), # MHA (8, 4, "swiglu", "RMSNorm"), # GQA - # (8, 1, "swiglu", "RMSNorm"), # MQA ], ) def test_gpt_search_space( @@ -173,14 +173,15 @@ def test_gpt_self_attention_head_sorting(distributed_setup_size_1): num_query_groups=2, ffn_hidden_size=16, 
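The test helper above now picks the layer spec based on `transformer_impl`. Roughly, the Mamba-side selection reduces to the sketch below (only `get_te_mamba_stack_spec` is a name from this diff; the ModelOpt spec factory is passed in as a parameter since its import path is not shown here):

```python
from modelopt.torch.nas.plugins.megatron import get_te_mamba_stack_spec


def pick_mamba_stack_spec(transformer_impl: str, moe_grouped_gemm: bool, modelopt_spec_fn):
    """Illustrative only; requires a Megatron + TE + mamba_ssm environment."""
    if transformer_impl == "transformer_engine":
        # moe_grouped_gemm=False swaps the hardcoded TEGroupedMLP for SequentialMLP
        return get_te_mamba_stack_spec(moe_grouped_gemm=moe_grouped_gemm)
    # modelopt_spec_fn stands in for get_mamba_stack_modelopt_spec
    return modelopt_spec_fn(remap_te_layernorm=True)
```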
activation_func="squared_relu", + transformer_impl="transformer_engine", ).cuda() model = mtn.convert(model, "mcore_minitron") self_attn = model.decoder.layers[0].self_attention assert isinstance(self_attn, _DynamicSelfAttention) - assert isinstance(self_attn.linear_qkv, _DynamicQKVColumnParallelLinear) - assert isinstance(self_attn.linear_proj, _DynamicProjRowParallelLinear) + assert isinstance(self_attn.linear_qkv, _DynamicTEQKVLayerNormColumnParallelLinear) + assert isinstance(self_attn.linear_proj, _DynamicTEProjRowParallelLinear) hp_num_attention_heads = self_attn.get_hparam("num_attention_heads") assert isinstance(hp_num_attention_heads, NumAttentionHeadsHp) @@ -255,6 +256,7 @@ def _test_gpt_moe_search_space(rank, size): max_sequence_length=max_sequence_length, vocab_size=vocab_size, activation_func="squared_relu", + transformer_impl="transformer_engine", num_moe_experts=num_moe_experts, moe_ffn_hidden_size=moe_ffn_hidden_size, moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size, @@ -291,6 +293,7 @@ def _test_gpt_moe_search_space(rank, size): moe_shared_ffn_choices = moe_shared_expert_intermediate_size // channel_divisor hidden_size_choices = hidden_size // channel_divisor num_layers_per_pp = num_layers // size + # SequentialMLP has per-expert moe_ffn_hidden_size hparams assert ( ss_size_per_pp == ( diff --git a/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py b/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py index 5905a2984e..db8b9e10ba 100644 --- a/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py +++ b/tests/gpu_megatron/torch/nas/plugins/test_megatron_mamba_dynamic_modules.py @@ -31,11 +31,12 @@ _DynamicColumnParallelLinear, _DynamicEmbedding, _DynamicExtendedRMSNorm, - _DynamicLayerNorm, _DynamicMambaLayer, _DynamicMambaMixer, _DynamicMCoreLanguageModel, - _DynamicRowParallelLinear, + _DynamicTELayerNormColumnParallelLinear, + _DynamicTENorm, + _DynamicTERowParallelLinear, ) from modelopt.torch.nas.traced_hp import TracedHp from modelopt.torch.opt.utils import named_dynamic_modules, search_space_size @@ -71,6 +72,8 @@ def _test_mamba_search_space(rank, size): mamba_num_groups=mamba_num_groups, max_sequence_length=max_sequence_length, vocab_size=vocab_size, + transformer_impl="transformer_engine", + bf16=False, ).cuda() mamba_num_heads = model.decoder.layers[0].mixer.nheads @@ -95,13 +98,13 @@ def _test_mamba_search_space(rank, size): for layer in model.decoder.layers: assert isinstance(layer, _DynamicMambaLayer) assert isinstance(layer.mixer, _DynamicMambaMixer) - assert isinstance(layer.mixer.in_proj, _DynamicColumnParallelLinear) - assert isinstance(layer.mixer.out_proj, _DynamicRowParallelLinear) + assert isinstance(layer.mixer.in_proj, _DynamicTELayerNormColumnParallelLinear) + assert isinstance(layer.mixer.out_proj, _DynamicTERowParallelLinear) assert isinstance(layer.mixer.conv1d, _DynamicConvNd) if layer.mixer.rmsnorm: assert isinstance(layer.mixer.norm, _DynamicExtendedRMSNorm) if is_pipeline_last_stage(): - assert isinstance(model.decoder.final_norm, _DynamicLayerNorm) + assert isinstance(model.decoder.final_norm, _DynamicTENorm) assert isinstance(model.output_layer, _DynamicColumnParallelLinear) # NOTE: `search_space_size` does not reduce across TP/PP groups diff --git a/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py index 55583a4300..3dab58f5ba 100644 --- 
a/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py +++ b/tests/gpu_megatron/torch/prune/plugins/test_mcore_gpt_minitron_pruning.py @@ -64,6 +64,7 @@ def _test_mcore_gpt_parameter_sorting(activation_func, rank, size): max_sequence_length=max_sequence_length, vocab_size=vocab_size, activation_func=activation_func, + transformer_impl="transformer_engine", bf16=False, ).cuda() @@ -166,6 +167,7 @@ def _get_model(initialize_megatron=True): position_embedding_type=position_embedding_type, activation_func=activation_func, normalization=normalization, + transformer_impl="transformer_engine", num_layers_in_first_pipeline_stage=num_layers_in_first_pipeline_stage, num_layers_in_last_pipeline_stage=num_layers_in_last_pipeline_stage, ).cuda() @@ -206,7 +208,7 @@ def forward_loop(m): model, pruning_scores = prune_minitron(model, constraints, config, channel_divisor) if not skip_sorting: assert pruning_scores["layer_scores"] - assert pruning_scores["activations_per_rank"] + assert pruning_scores["local_activations"] # Assert weights are pruned correctly for layer in model.decoder.layers: @@ -337,6 +339,7 @@ def _test_mcore_gpt_moe_parameter_sorting(rank, size): max_sequence_length=max_sequence_length, vocab_size=vocab_size, activation_func="squared_relu", + transformer_impl="transformer_engine", num_moe_experts=num_moe_experts, moe_ffn_hidden_size=moe_ffn_hidden_size, moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size, @@ -413,6 +416,7 @@ def _get_model(initialize_megatron=True): max_sequence_length=max_sequence_length, vocab_size=vocab_size, activation_func="squared_relu", + transformer_impl="transformer_engine", num_moe_experts=num_moe_experts, moe_ffn_hidden_size=moe_ffn_hidden_size, moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size, diff --git a/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py b/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py index 785e434a2f..c27cace7bd 100644 --- a/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py +++ b/tests/gpu_megatron/torch/prune/plugins/test_mcore_mamba_minitron_pruning.py @@ -71,6 +71,7 @@ def _test_mcore_mamba_parameter_sorting(rank, size): mamba_num_groups=mamba_num_groups, max_sequence_length=max_sequence_length, vocab_size=vocab_size, + transformer_impl="transformer_engine", bf16=False, ).cuda() @@ -151,6 +152,8 @@ def _get_model(initialize_megatron=True): moe_shared_expert_intermediate_size=ffn_hidden_size, num_moe_experts=num_moe_experts, vocab_size=vocab_size, + transformer_impl="transformer_engine", + bf16=False, ).cuda() return model @@ -202,11 +205,8 @@ def forward_loop(m): bc = 2 * mixer.ngroups * mixer.d_state assert mixer.nheads == pruned_mamba_num_heads assert mixer.headdim == pruned_mamba_head_dim - assert mixer.in_proj.input_size == pruned_hidden_size assert mixer.d_inner == pruned_mamba_num_heads * pruned_mamba_head_dim - assert mixer.in_proj.output_size == 2 * mixer.d_inner + bc + pruned_mamba_num_heads - assert mixer.out_proj.input_size == mixer.d_inner - assert mixer.out_proj.output_size == pruned_hidden_size + assert mixer.out_proj.out_features == pruned_hidden_size assert mixer.conv1d.in_channels == mixer.conv1d.out_channels == mixer.d_inner + bc # Assert model.config is updated for correct save/restoring @@ -271,6 +271,8 @@ def _test_mcore_mamba_hybrid_pruning_nas(ckpt_path, rank, size): moe_shared_expert_intermediate_size=moe_shared_expert_intermediate_size, num_moe_experts=num_moe_experts, 
vocab_size=vocab_size, + transformer_impl="transformer_engine", + bf16=False, ).cuda() param_count = get_mcore_param_count(model)
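Taken together, the end-to-end flow with the default TE spec looks roughly like the sketch below. The `mtn`/`mtp` aliases and helper names follow the example and test usage above; model construction and the calibration forward loop are elided, so treat this as orientation rather than a runnable script:

```python
import modelopt.torch.nas as mtn
import modelopt.torch.prune as mtp  # alias assumed from `mtp.mcore_minitron` usage above


def convert_for_minitron_pruning(model):
    """Wrap an mcore model (built with the default TE layer spec) in DynamicModules."""
    model = mtn.convert(model, "mcore_minitron")
    # A forward loop goes here so the importance hooks can collect
    # post-layernorm activations and layer scores before sorting/pruning.
    return model, mtp.mcore_minitron.get_mcore_param_count(model)
```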