Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
03ad6a9
ModelOpt Framework, Recipe Lib, converting existing recipes 1/N
shengliangxu Dec 5, 2025
8b48590
Merge remote-tracking branch 'origin/main' into shengliangx/modeopt-r…
shengliangxu Mar 13, 2026
901b948
StrEnum is not available at 3.10
shengliangxu Mar 13, 2026
a6529be
fix for python 3.10
shengliangxu Mar 13, 2026
5e82641
Flatten all the configurations
shengliangxu Mar 16, 2026
5c4d6ce
remove __base__ inheritance logic
shengliangxu Mar 16, 2026
113390c
Merge remote-tracking branch 'origin/main' into shengliangx/modeopt-r…
shengliangxu Mar 16, 2026
4ab4861
using eXmY instead of a list in YAML
shengliangxu Mar 16, 2026
8d8fe0f
remove list parsing for the bits
shengliangxu Mar 16, 2026
518592a
print the recipe path that we actually use
shengliangxu Mar 16, 2026
972ae8b
move tests
shengliangxu Mar 16, 2026
87e1f70
keep minimum set
shengliangxu Mar 16, 2026
a51aed3
Merge remote-tracking branch 'origin/main' into HEAD
shengliangxu Mar 16, 2026
e8e39d3
flatten quant_cfg
shengliangxu Mar 17, 2026
395d274
remove the config building blocks, will add back when inheritance is …
shengliangxu Mar 17, 2026
2259b7c
move assertions to tests
shengliangxu Mar 17, 2026
0e8af88
Merge remote-tracking branch 'origin/main' into shengliangx/modeopt-r…
shengliangxu Mar 17, 2026
b61f288
wrap the quant_cfg and algorithm into ptq_cfg
shengliangxu Mar 17, 2026
9e00bf1
final bring back directory recipes
shengliangxu Mar 17, 2026
e41157a
final cleanup
shengliangxu Mar 17, 2026
e70e84a
typo
shengliangxu Mar 17, 2026
c146218
mypy
shengliangxu Mar 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 3 additions & 23 deletions examples/llm_ptq/example_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
except ImportError:
snapshot_download = None

import modelopt.torch.quantization as mtq
from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -199,22 +198,13 @@ def calibrate_loop(_model):

def build_quant_cfg(
qformat,
kv_cache_qformat,
quant_cfg,
awq_block_size,
model_type,
quant_cfg_choices,
kv_quant_cfg_choices,
moe_calib_experts_ratio: float | None = None,
) -> dict[str, Any]:
quant_cfg = {}
assert qformat in quant_cfg_choices, (
f"Unsupported quantization format: {qformat} with {kv_cache_qformat} KV cache"
)

quant_cfg = quant_cfg_choices[qformat]

if "awq" in qformat:
quant_cfg = copy.deepcopy(quant_cfg_choices[qformat])
quant_cfg = copy.deepcopy(quant_cfg)
if "awq" in str(quant_cfg.get("algorithm")):
weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"]
if isinstance(weight_quantizer, list):
weight_quantizer = weight_quantizer[0]
Expand All @@ -226,16 +216,6 @@ def build_quant_cfg(
if qformat == "w4a8_awq" and model_type in ["gemma", "mpt"]:
quant_cfg["algorithm"] = {"method": "awq_lite", "alpha_step": 1}

enable_quant_kv_cache = kv_cache_qformat != "none"
print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")

# Check if any bmm_quantizer is in the quant_cfg. If so, we need to enable the bmm_quantizer.
if enable_quant_kv_cache:
quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!!

quant_cfg,
getattr(mtq, kv_quant_cfg_choices[kv_cache_qformat])["quant_cfg"],
)

if moe_calib_experts_ratio:
assert 0 < moe_calib_experts_ratio <= 1, "moe_calib_experts_ratio must be between 0 and 1"
if isinstance(quant_cfg["algorithm"], str):
Expand Down
90 changes: 51 additions & 39 deletions examples/llm_ptq/hf_ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
import modelopt.torch.sparsity as mts
from modelopt.recipe import ModelOptPTQRecipe, load_recipe
from modelopt.torch.export import (
export_hf_checkpoint,
export_speculative_decoding,
Expand Down Expand Up @@ -262,7 +263,7 @@ def auto_quantize(
assert qformat_list, "No quantization formats provided"
# Check if all provided quantization formats are supported
assert all(
args.qformat
qformat
in [
"fp8",
"int8_sq",
Expand All @@ -277,7 +278,7 @@ def auto_quantize(
"nvfp4_omlp_only",
"mxfp8",
]
for args.qformat in qformat_list
for qformat in qformat_list
), "One or more quantization formats provided are not supported for unified checkpoint export"

def loss_func(output, data):
Expand Down Expand Up @@ -548,9 +549,6 @@ def mono_quantize(
print("Quantization will only be applied to the decoder (text generation) component")

if not model_is_already_quantized or calibration_only:
if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
print("Applying nvfp4 quantization (MoE only) for gpt-oss")

# quantize the model

use_calibration = need_calibration(quant_cfg)
Expand Down Expand Up @@ -746,8 +744,6 @@ def pre_quantize(
)
else:
generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
print("Applying nvfp4 quantization (MoE only) for gpt-oss")

return preview_input_ids, generated_ids_before_ptq

Expand Down Expand Up @@ -923,38 +919,42 @@ def quantize_main(

else:
# mono quantization
assert len(args.qformat.split(",")) == 1, (
"Plain quantization supports only one quantization format."
)

assert (
args.qformat
in [
"int8_wo",
"int4_awq",
"fp8",
"nvfp4",
"nvfp4_awq",
"nvfp4_mse",
"w4a8_awq",
"fp8_pb_wo",
"w4a8_mxfp4_fp8",
"nvfp4_mlp_only",
"nvfp4_omlp_only",
"mxfp8",
]
or args.kv_cache_qformat in KV_QUANT_CFG_CHOICES
), f"Plain quantization format {args.qformat} not supported for HF export path"

quant_cfg = build_quant_cfg(
args.qformat,
args.kv_cache_qformat,
args.awq_block_size,
model_type,
QUANT_CFG_CHOICES,
KV_QUANT_CFG_CHOICES,
args.moe_calib_experts_ratio,
)
if args.recipe is not None:
print(f"Use recipe {args.recipe} for quantization")
recipe = load_recipe(args.recipe)
assert isinstance(recipe, ModelOptPTQRecipe), (
f"Expected PTQ recipe, but got {type(recipe).__name__} from {args.recipe}"
)
Comment thread
shengliangxu marked this conversation as resolved.
quant_cfg = recipe.ptq_cfg

else:
assert len(args.qformat.split(",")) == 1, (
Comment thread
kevalmorabia97 marked this conversation as resolved.
"Plain quantization supports only one quantization format."
)

assert args.qformat in QUANT_CFG_CHOICES, (
f"Unsupported quantization format: {args.qformat}, choices are: {list(QUANT_CFG_CHOICES.keys())}"
)
quant_cfg = QUANT_CFG_CHOICES[args.qformat]

quant_cfg = build_quant_cfg(
args.qformat,
quant_cfg,
args.awq_block_size,
model_type,
args.moe_calib_experts_ratio,
)

enable_quant_kv_cache = args.kv_cache_qformat != "none"
print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")

# Check if any bmm_quantizer is in the quant_cfg. If so, we need to enable the bmm_quantizer.
if enable_quant_kv_cache:
quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant(
quant_cfg,
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"],
)

# Exclude MTP layers from quantization if detected (e.g., GLM-4.7's layer 92)
# These layers are typically speculative decoding layers that should be exported as-is
Expand Down Expand Up @@ -1013,9 +1013,21 @@ def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--pyt_ckpt_path",
help="Specify where the PyTorch checkpoint path is",
"--model",
help=(
"Model name or path to the PyTorch checkpoint to be quantized. "
"Can be a local path or a Huggingface model name."
),
required=True,
)
parser.add_argument(
"--recipe",
help=(
"PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_kv)."
),
default=None,
)

parser.add_argument("--device", default="cuda")
parser.add_argument(
"--qformat",
Expand Down
17 changes: 13 additions & 4 deletions examples/llm_ptq/multinode_ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,16 +327,25 @@ def main(args):
trust_remote_code=args.trust_remote_code,
)

# Build quantization config
quant_cfg = QUANT_CFG_CHOICES[args.qformat]

quant_cfg = build_quant_cfg(
args.qformat,
args.kv_cache_qformat,
quant_cfg,
args.awq_block_size,
model_type,
QUANT_CFG_CHOICES,
KV_QUANT_CFG_CHOICES,
)

enable_quant_kv_cache = args.kv_cache_qformat != "none"
print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")

# Check if any bmm_quantizer is in the quant_cfg. If so, we need to enable the bmm_quantizer.
if enable_quant_kv_cache:
quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant(
quant_cfg,
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"],
)
Comment thread
shengliangxu marked this conversation as resolved.

# Quantize the model
if accelerator.is_main_process:
print("Starting quantization...")
Expand Down
27 changes: 27 additions & 0 deletions modelopt/recipe/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module for the ModelOpt recipe lib.

``modelopt.recipe`` contains tooling to:

* load and store model optimization recipes
* (TODO) utilities to manipulate the recipes, such as merging multiple recipes together, or
overriding some fields in a recipe with user-provided values.

"""

from .config import *
from .loader import *
113 changes: 113 additions & 0 deletions modelopt/recipe/_config_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""YAML config loading utilities.

This module is intentionally free of ``modelopt.torch`` imports so that
``modelopt.torch.quantization.config`` can import :func:`load_config` without
triggering a circular import through ``modelopt.recipe.loader``.
"""

from importlib.resources import files

try:
from importlib.resources.abc import Traversable
except ImportError: # Python < 3.11
from importlib.abc import Traversable
import re
from pathlib import Path
from typing import Any

import yaml

# Root to all built-in recipes. Users can create own recipes.
BUILTIN_RECIPES_LIB = files("modelopt_recipes")

_EXMY_RE = re.compile(r"^[Ee](\d+)[Mm](\d+)$")
_EXMY_KEYS = frozenset({"num_bits", "scale_bits"})


def _parse_exmy_num_bits(obj: Any) -> Any:
"""Recursively convert ``ExMy`` strings in ``num_bits`` / ``scale_bits`` to ``(x, y)`` tuples."""
if isinstance(obj, dict):
return {
k: (
_parse_exmy(v)
if k in _EXMY_KEYS and isinstance(v, str)
else _parse_exmy_num_bits(v)
)
for k, v in obj.items()
}
if isinstance(obj, list):
return [_parse_exmy_num_bits(item) for item in obj]
return obj


def _parse_exmy(s: str) -> tuple[int, int] | str:
m = _EXMY_RE.match(s)
if m:
return (int(m.group(1)), int(m.group(2)))
return s


def load_config(config_file: str | Path | Traversable) -> dict[str, Any]:
    """Load a config YAML file and return its contents as a dict.

    Args:
        config_file: Path to a config YAML file. The ``.yml`` / ``.yaml``
            suffix may be omitted, and relative paths are also resolved
            against the built-in recipe library (``modelopt_recipes``).

    Returns:
        The parsed YAML mapping, with ``ExMy`` strings under ``num_bits`` /
        ``scale_bits`` keys converted to ``(x, y)`` tuples. An empty file
        yields an empty dict.

    Raises:
        ValueError: If ``config_file`` has an unsupported type, no candidate
            path exists, or the YAML top level is not a mapping.
    """
    paths_to_check: list[Path | Traversable] = []
    if isinstance(config_file, str):
        if not config_file.endswith((".yml", ".yaml")):
            # Suffix omitted: try both suffixes, locally first, then built-ins.
            paths_to_check.append(Path(f"{config_file}.yml"))
            paths_to_check.append(Path(f"{config_file}.yaml"))
            paths_to_check.append(BUILTIN_RECIPES_LIB.joinpath(f"{config_file}.yml"))
            paths_to_check.append(BUILTIN_RECIPES_LIB.joinpath(f"{config_file}.yaml"))
        else:
            paths_to_check.append(Path(config_file))
            paths_to_check.append(BUILTIN_RECIPES_LIB.joinpath(config_file))
    elif isinstance(config_file, Path):
        if config_file.suffix in (".yml", ".yaml"):
            paths_to_check.append(config_file)
            if not config_file.is_absolute():
                # A relative Path may name a built-in recipe.
                paths_to_check.append(BUILTIN_RECIPES_LIB.joinpath(str(config_file)))
        else:
            paths_to_check.append(Path(f"{config_file}.yml"))
            paths_to_check.append(Path(f"{config_file}.yaml"))
            if not config_file.is_absolute():
                paths_to_check.append(BUILTIN_RECIPES_LIB.joinpath(f"{config_file}.yml"))
                paths_to_check.append(BUILTIN_RECIPES_LIB.joinpath(f"{config_file}.yaml"))
    elif isinstance(config_file, Traversable):
        paths_to_check.append(config_file)
    else:
        raise ValueError(f"Invalid config file of {config_file}")

    # First existing candidate wins; local paths take precedence over built-ins.
    config_path = None
    for path in paths_to_check:
        if path.is_file():
            config_path = path
            break
    if not config_path:
        raise ValueError(
            f"Cannot find config file of {config_file}, paths checked: {paths_to_check}"
        )

    _raw = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    if _raw is None:
        # An empty YAML file parses to None; treat it as an empty config.
        return {}
    if not isinstance(_raw, dict):
        raise ValueError(
            f"Config file {config_path} must contain a YAML mapping, got {type(_raw).__name__}"
        )
    return _parse_exmy_num_bits(_raw)
Loading
Loading