diff --git a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py index 3e5b35946..16ace511f 100644 --- a/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py +++ b/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py @@ -22,7 +22,6 @@ from typing import Any import torch -import yaml from megatron.core import dist_checkpointing, mpu from megatron.core.dist_checkpointing.serialization import get_default_load_sharded_strategy from megatron.core.dist_checkpointing.strategies.common import COMMON_STATE_FNAME @@ -36,21 +35,6 @@ SUPPORTED_WRAPPERS[Float16Module] = "module" -DROP_SUBSTRINGS = [ - "fp4", - "fp8", - "tp_", - "parallel", - "cuda_graph", - "init_", - "cpu", - "recompute", - "inference", - "pipeline", - "comm", - "batch", -] - def remove_per_module_state( modelopt_state: dict[str, Any], @@ -138,29 +122,6 @@ def save_sharded_modelopt_state( sharded_strategy: configures sharded tensors saving behavior and backend prefix: the prefix to add to the modelopt_state keys ("model." for NeMo) """ - - def _parse_transformer_config(transformer_config: dict) -> dict: - config = {} - - for k, v in transformer_config.items(): - if any(substring in k for substring in DROP_SUBSTRINGS): - continue - if isinstance(v, (bool, int, str)): - config[k] = v - else: - config[k] = str(v) - - return config - - # Save own version of run config, if not already saved by the framework. - if dist.is_master() and not os.path.exists(f"{checkpoint_name}/run_config.yaml"): - run_config_name = f"{checkpoint_name}/modelopt_run_config.yaml" - # We avoid deepcopy since some attributes in Megatron-Bridge config cannot be deepcopied. - config_dict = _parse_transformer_config(model[0].config.__dict__) - config_dict["nvidia_modelopt_version"] = modelopt.__version__ - with open(run_config_name, "w") as f: - yaml.dump(config_dict, f, default_flow_style=False) - if not mto.ModeloptStateManager.is_converted(model[0]): return if len(model) > 1: