NVIDIA · Fridah-nv · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -40,6 +40,7 @@ Changelog
 - Add mixed-precision FP8 + NVFP4 export for Megatron-Core: per-layer ``quant_algo`` recorded under ``quantized_layers`` in ``hf_quant_config.json``, PP-aware ``kv_cache_dtype`` gather, fused-QKV exclude split into per-HF-name ``q/k/v_proj`` entries.
 - Add Nemotron-3-Super-120B-A12B PTQ recipes ``modelopt_recipes/models/Nemotron-3-Super-120B-A12B/super-nvfp4.yaml`` (MSE-mixed) and ``super-nvfp4-max-calib.yaml`` (max-calib mixed): NVFP4 W4A4 routed experts + FP8 per-tensor shared experts / Mamba in/out_proj + FP8 KV cache.
 - Add quantized ``nn.Embedding`` support. ``nn.Embedding`` is now registered in ``QuantModuleRegistry`` and exposes ``weight_quantizer`` (embedding table), ``output_quantizer`` (lookup activations), and a permanently disabled ``input_quantizer`` placeholder — embedding inputs are integer indices and cannot be fake-quantized, so direct ``enable*()`` calls raise. ``export_hf_checkpoint`` packs quantized embedding weights alongside Linear layers. Embedding quantizers are opt-in (``parent_class: nn.Embedding`` disabled by default).
+- Group layerwise calibration options under a nested ``LayerwiseConfig`` and add three knobs: ``get_qdq_activations_from_prev_layer`` (correct GPTQ-Hessian vs max-calib activation semantics — defaults to True for GPTQ, False for max/mse/local_hessian), ``save_every`` (gate per-window ``next_inputs.pt`` activation-cache writes), and ``save_quantizers_only`` (skip the layer-weights blob for amax-only algorithms — whitelisted to ``max``/``mse``/``local_hessian``). Legacy bool ``layerwise`` and flat ``layerwise_checkpoint_dir`` keys still work; the bool form emits a ``DeprecationWarning``.
 
 **Bug Fixes**
 

@@ -850,22 +850,37 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
         print("No custom model files found to copy")
 
 
-def needs_checkpoint_path_update(quant_cfg: dict) -> bool:
-    """Check if quant_cfg has a layerwise_checkpoint_dir that should be auto-resolved to a unique subpath."""
-    algorithm = quant_cfg.get("algorithm")
+def _layerwise_checkpoint_dir_location(algorithm) -> tuple[str, str] | None:
+    """Locate the layerwise checkpoint directory inside an ``algorithm`` config.
+
+    Two spellings are recognised: the legacy flat key ``layerwise_checkpoint_dir``
+    and the nested ``layerwise.checkpoint_dir``. When both carry a value, the flat
+    key takes precedence.
+
+    Args:
+        algorithm: The ``algorithm`` entry of a quant config. Anything that is not
+            a ``dict`` is treated as having no checkpoint directory.
+
+    Returns:
+        ``("flat", checkpoint_dir)`` or ``("nested", checkpoint_dir)`` naming the
+        spelling that holds the directory, or ``None`` when neither is set.
+    """
     if not isinstance(algorithm, dict):
-        return False
-    return algorithm.get("layerwise_checkpoint_dir") is not None
+        return None
+    flat = algorithm.get("layerwise_checkpoint_dir")
+    if flat is not None:
+        return "flat", flat
+    # ``layerwise`` may be missing, None, or a non-dict value such as a bool;
+    # only a dict can carry a nested ``checkpoint_dir``.
+    nested = algorithm.get("layerwise") or {}
+    ckpt = nested.get("checkpoint_dir") if isinstance(nested, dict) else None
+    return ("nested", ckpt) if ckpt is not None else None
+
+
+def needs_checkpoint_path_update(quant_cfg: dict) -> bool:
+    """Report whether ``quant_cfg`` carries a layerwise checkpoint dir to auto-resolve.
+
+    True when the ``algorithm`` entry sets either the flat
+    ``layerwise_checkpoint_dir`` or the nested ``layerwise.checkpoint_dir``; such a
+    directory should be resolved to a unique subpath before use.
+    """
+    algorithm = quant_cfg.get("algorithm")
+    location = _layerwise_checkpoint_dir_location(algorithm)
+    return location is not None
 
 
-def resolve_checkpoint_dir(quant_cfg: dict, model_path: str) -> dict:
-    """Append a unique ``<model_name>_<config_hash>`` subdirectory to layerwise_checkpoint_dir.
+def resolve_checkpoint_dir(quant_cfg: dict, model_path: str) -> tuple[dict, str]:
+    """Append a unique ``<model_name>_<config_hash>`` subdirectory to the layerwise checkpoint_dir.
 
     Allows a single recipe to be reused across models without checkpoint collisions.
+    Supports both the legacy flat ``layerwise_checkpoint_dir`` and the nested
+    ``layerwise.checkpoint_dir`` shape, writing back to whichever the user provided.
     Must only be called when :func:`needs_checkpoint_path_update` returns True.
+
+    Returns ``(updated_quant_cfg, resolved_path)`` so the caller can log or
+    reference the resolved path without re-deriving the dict shape.
     """
-    algorithm = quant_cfg["algorithm"]
-    base_dir = algorithm["layerwise_checkpoint_dir"]
+    location = _layerwise_checkpoint_dir_location(quant_cfg["algorithm"])
+    assert location is not None  # guaranteed by needs_checkpoint_path_update
+    shape, base_dir = location
 
     name = model_path.rstrip("/")
     if "/" in name and not os.path.isabs(name):
@@ -874,9 +889,12 @@ def resolve_checkpoint_dir(quant_cfg: dict, model_path: str) -> dict:
         name = Path(name).name
 
     config_hash = hashlib.sha256(json.dumps(quant_cfg, default=str).encode()).hexdigest()[:8]
+    resolved = os.path.join(base_dir, f"{name}_{config_hash}")
 
     quant_cfg = copy.deepcopy(quant_cfg)
-    quant_cfg["algorithm"]["layerwise_checkpoint_dir"] = os.path.join(
-        base_dir, f"{name}_{config_hash}"
-    )
-    return quant_cfg
+    algo = quant_cfg["algorithm"]
+    if "layerwise_checkpoint_dir" in algo:
+        algo["layerwise_checkpoint_dir"] = resolved
+    if isinstance(algo.get("layerwise"), dict) and "checkpoint_dir" in algo["layerwise"]:
+        algo["layerwise"]["checkpoint_dir"] = resolved
+    return quant_cfg, resolved
@@ -1000,7 +1000,8 @@ def _is_layerwise(obj):
             return _is_layerwise(obj.quantize.algorithm)
         if isinstance(obj, list):
             return any(_is_layerwise(a) for a in obj)
-        return bool(getattr(obj, "layerwise", False))
+        layerwise = getattr(obj, "layerwise", None)
+        return bool(getattr(layerwise, "enable", False))
 
     is_layerwise = _is_layerwise(recipe)
 
@@ -1135,10 +1136,8 @@ def _is_layerwise(obj):
             _set_kv_cache_constant_amax(quant_cfg["quant_cfg"])
 
         if needs_checkpoint_path_update(quant_cfg):
-            quant_cfg = resolve_checkpoint_dir(quant_cfg, args.pyt_ckpt_path)
-            print(
-                f"Auto-resolved layerwise_checkpoint_dir: {quant_cfg['algorithm']['layerwise_checkpoint_dir']}"
-            )
+            quant_cfg, resolved_dir = resolve_checkpoint_dir(quant_cfg, args.pyt_ckpt_path)
+            print(f"Auto-resolved layerwise checkpoint_dir: {resolved_dir}")
 
         if args.cast_mxfp4_to_nvfp4:
             quant_cfg = copy.deepcopy(quant_cfg)

@@ -152,9 +152,9 @@
 
 import warnings
 from collections.abc import Mapping, Sequence
-from typing import Any, Literal
+from typing import Any, ClassVar, Literal
 
-from pydantic import AliasChoices, ValidationInfo, field_validator, model_validator
+from pydantic import AliasChoices, Field, ValidationInfo, field_validator, model_validator
 
 from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField
 from modelopt.torch.opt.config_loader import load_config
@@ -633,9 +633,88 @@ def validate_calibrator(cls, v, info: ValidationInfo):
     )
 
 
+class LayerwiseConfig(ModeloptBaseConfig):
+    """Nested config for layer-by-layer calibration behavior.
+
+    Every option has a default, so an empty ``LayerwiseConfig()`` leaves layerwise
+    calibration disabled.
+
+    Fields:
+        enable: Master switch for layer-by-layer calibration (default ``False``).
+        get_qdq_activations_from_prev_layer: Whether the inputs for the next layer
+            are captured after (``True``) or before (``False``, the default here)
+            the current layer is calibrated.
+        checkpoint_dir: Optional directory for per-layer checkpoints, used to
+            resume calibration after a restart (default ``None``).
+        save_every: Window size in layers for writing the ``next_inputs.pt``
+            activation cache; must be at least 1 (default ``1``).
+        save_quantizers_only: Persist only quantizer state and skip the per-layer
+            weights blob (default ``False``).
+    """
+
+    enable: bool = ModeloptField(
+        default=False,
+        title="Enable layerwise (layer-by-layer) calibration.",
+        description=(
+            "If True, the calibration algorithm is applied layer by layer. "
+            "Each layer's inputs are captured via a forward pass that reflects the "
+            "quantization of all preceding layers, incurring O(N) forward passes for N layers."
+        ),
+    )
+
+    get_qdq_activations_from_prev_layer: bool = ModeloptField(
+        default=False,
+        title="Cache next-layer inputs from QDQ outputs of prior layers.",
+        description=(
+            "If True (GPTQ default), capture each layer's next-layer inputs "
+            "after it is calibrated, so QDQ error and in-place weight updates "
+            "propagate forward. If False (max/mse default), capture before, so "
+            "the next layer sees the same FP activations as a non-layerwise pass."
+        ),
+    )
+
+    checkpoint_dir: str | None = ModeloptField(
+        default=None,
+        title="Per-layer checkpoint directory (resume on restart).",
+        description=(
+            "If set, per-layer checkpoints are saved here during calibration. "
+            "On restart, calibration resumes from the last completed layer."
+        ),
+    )
+
+    # ``ge=1`` rejects zero or negative window sizes at validation time.
+    save_every: int = ModeloptField(
+        default=1,
+        ge=1,
+        title="Flush resume metadata every N layers (final layer always flushes).",
+        description=(
+            "Only the boundary layer of each window writes the large "
+            "``next_inputs.pt`` activation cache; other per-layer files are "
+            "still written for every layer (resume needs them to replay skips). "
+            "Mid-window interrupts re-calibrate the unfinished window on resume."
+        ),
+    )
+
+    save_quantizers_only: bool = ModeloptField(
+        default=False,
+        title="Skip the per-layer weights blob; persist only quantizer state.",
+        description=(
+            "Only accepted by algorithms that update solely ``TensorQuantizer._amax`` "
+            "(max, mse, local_hessian). Rejected for weight-mutating algorithms "
+            "(GPTQ, AWQ, SmoothQuant) where it would silently lose updates on resume."
+        ),
+    )
+
+
+def _coerce_layerwise_input(value):
+    """Turn a raw ``layerwise`` input into the dict form the field validates.
+
+    * ``None`` becomes an empty dict.
+    * A bool is wrapped as ``{"enable": <bool>}`` and emits a ``DeprecationWarning``.
+    * A ``LayerwiseConfig`` is dumped with only its explicitly-set fields.
+    * Anything else is handed back untouched.
+    """
+    if value is None:
+        return {}
+    if isinstance(value, bool):
+        warnings.warn(
+            "Passing the layerwise field as a bool is deprecated; use a dict, "
+            "e.g. `{'enable': True}`.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return {"enable": value}
+    if not isinstance(value, LayerwiseConfig):
+        return value
+    # Dump only what was explicitly set, so ``model_fields_set`` on the
+    # re-validated config still mirrors the user's actual input.
+    return value.model_dump(exclude_unset=True)
+
+
 class QuantizeAlgorithmConfig(ModeloptBaseConfig):
     """Calibration algorithm config base."""
 
+    # Set True only for algorithms that update solely ``TensorQuantizer._amax``
+    # (no ``layer.weight`` mutation). Gates ``layerwise.save_quantizers_only``.
+    _supports_save_quantizers_only: ClassVar[bool] = False
+
     method: Literal[None] = ModeloptField(
         None,
         title="This field specifies the name of the calibration algorithm. If None, no calibration is performed.",
@@ -656,34 +735,72 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):
         ),
     )
 
-    layerwise: bool = ModeloptField(
-        default=False,
+    layerwise: LayerwiseConfig = Field(
+        default_factory=LayerwiseConfig,
         validation_alias=AliasChoices("layerwise", "use_sequential"),
-        title="Enable layerwise (layer-by-layer) calibration.",
+        title="Layerwise calibration configuration.",
         description=(
-            "If True, the calibration algorithm is applied layer by layer. "
-            "Each layer's inputs are captured via a forward pass that reflects the "
-            "quantization of all preceding layers, incurring O(N) forward passes for N layers."
+            "Nested config controlling layer-by-layer calibration. Pass a dict, "
+            "e.g. ``{'enable': True, 'checkpoint_dir': '/path'}``. Bool input is "
+            "accepted for backward compatibility but deprecated."
         ),
     )
 
-    layerwise_checkpoint_dir: str | None = ModeloptField(
-        default=None,
-        title="Checkpoint directory for layerwise calibration.",
-        description=(
-            "If set together with layerwise=True, per-layer checkpoints are saved to this "
-            "directory during calibration. On restart, calibration resumes from the last "
-            "completed layer."
-        ),
-    )
+    @model_validator(mode="before")
+    @classmethod
+    def _migrate_layerwise_checkpoint_dir(cls, data):
+        """Merge the legacy flat ``layerwise_checkpoint_dir`` key into ``layerwise``.
+
+        Input that is not a dict, or that lacks the flat key, is returned
+        untouched. Otherwise a ``DeprecationWarning`` is emitted and a shallow
+        copy of ``data`` is returned with the flat key removed. An explicit
+        ``layerwise_checkpoint_dir=None`` counts as "not set": the key is dropped
+        and the ``layerwise`` input is left exactly as the caller wrote it, so it
+        can never conflict with (or erase) a nested ``checkpoint_dir``.
+
+        Args:
+            data: Raw input handed to the model before field validation.
+
+        Returns:
+            ``data`` itself, or a copy whose ``layerwise`` entry is a dict that
+            carries the migrated ``checkpoint_dir``.
+
+        Raises:
+            ValueError: If ``layerwise`` cannot be normalized to a dict, or if the
+                flat key and a nested ``checkpoint_dir`` are both set to
+                different values.
+        """
+        if not isinstance(data, dict) or "layerwise_checkpoint_dir" not in data:
+            return data
+        warnings.warn(
+            "Passing `layerwise_checkpoint_dir` at the top level is deprecated; "
+            "nest it under `layerwise.checkpoint_dir` instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        data = dict(data)
+        flat_dir = data.pop("layerwise_checkpoint_dir")
+        if flat_dir is None:
+            # Nothing to migrate; leave ``layerwise`` / ``use_sequential`` for the
+            # regular alias handling and field validator.
+            return data
+        # Resolve the legacy ``use_sequential`` alias before writing ``layerwise``,
+        # otherwise the alias value is silently dropped when AliasChoices picks the
+        # newly-written ``layerwise`` key over ``use_sequential``. The alias is
+        # always removed; it is only used when ``layerwise`` itself is absent.
+        alias_value = data.pop("use_sequential", None)
+        raw_layerwise = data.pop("layerwise", alias_value)
+        layerwise = _coerce_layerwise_input(raw_layerwise)
+        if not isinstance(layerwise, dict):
+            # Raise ValueError rather than let ``.get`` fail with AttributeError,
+            # so the problem is reported as a validation error.
+            raise ValueError(
+                "layerwise must be a dict, bool, None or LayerwiseConfig when "
+                f"layerwise_checkpoint_dir is set; got {type(layerwise).__name__}."
+            )
+        existing = layerwise.get("checkpoint_dir")
+        if existing is not None and existing != flat_dir:
+            raise ValueError(
+                f"Conflicting checkpoint_dir: layerwise_checkpoint_dir={flat_dir!r} "
+                f"differs from layerwise.checkpoint_dir={existing!r}. Set only one."
+            )
+        data["layerwise"] = {**layerwise, "checkpoint_dir": flat_dir}
+        return data
+
+    @field_validator("layerwise", mode="before")
+    @classmethod
+    def _coerce_layerwise(cls, value):
+        """Coerce ``layerwise=bool/None`` to dict form; also handles the alias path.
+
+        Runs before the field's own validation and delegates to
+        ``_coerce_layerwise_input``, so a bool (deprecated), ``None`` or
+        ``LayerwiseConfig`` input reaches ``LayerwiseConfig`` validation as a dict.
+        Any other value is passed through unchanged.
+        """
+        return _coerce_layerwise_input(value)
 
     @model_validator(mode="after")
     def validate_layerwise_checkpoint_dir(self):
-        """Raise if layerwise_checkpoint_dir is set but layerwise is False."""
-        if self.layerwise_checkpoint_dir is not None and not self.layerwise:
+        """Raise if layerwise.checkpoint_dir is set but layerwise.enable is False.
+
+        Returns:
+            The validated config itself.
+
+        Raises:
+            ValueError: If a checkpoint directory is configured while layerwise
+                calibration is disabled.
+        """
+        if self.layerwise.checkpoint_dir is not None and not self.layerwise.enable:
             raise ValueError(
-                "layerwise_checkpoint_dir requires layerwise=True. "
-                "Set layerwise=True or remove layerwise_checkpoint_dir."
+                "layerwise.checkpoint_dir requires layerwise.enable=True. "
+                "Set layerwise.enable=True or remove layerwise.checkpoint_dir."
+            )
+        return self
+
+    @model_validator(mode="after")
+    def _validate_save_quantizers_only_supported(self):
+        """Enforce the ``_supports_save_quantizers_only`` whitelist.
+
+        Returns:
+            The validated config itself.
+
+        Raises:
+            ValueError: If ``layerwise.save_quantizers_only`` is True on a config
+                class whose ``_supports_save_quantizers_only`` flag is False.
+        """
+        # The flag is a class-level attribute; this class declares it False.
+        if self.layerwise.save_quantizers_only and not self._supports_save_quantizers_only:
+            raise ValueError(
+                f"Algorithm '{self.method}' mutates layer weights in-place; "
+                "save_quantizers_only=True would lose those updates on resume. "
+                "Only max/mse/local_hessian (amax-only) support this flag."
             )
         return self
 
@@ -696,6 +813,8 @@ class MaxCalibConfig(QuantizeAlgorithmConfig):
     See `Integer Quantization <https://arxiv.org/pdf/2004.09602>`_ for the concepts.
     """
 
+    _supports_save_quantizers_only: ClassVar[bool] = True
+
     method: Literal["max"] = ModeloptField("max")
 
     distributed_sync: bool | None = ModeloptField(
@@ -727,6 +846,8 @@ class MseCalibConfig(QuantizeAlgorithmConfig):
     When fp8_scale_sweep is enabled, step_size is ignored.
     """
 
+    _supports_save_quantizers_only: ClassVar[bool] = True
+
     method: Literal["mse"] = ModeloptField("mse")
 
     step_size: float | None = ModeloptField(
@@ -779,6 +900,8 @@ class LocalHessianCalibConfig(QuantizeAlgorithmConfig):
 
     """
 
+    _supports_save_quantizers_only: ClassVar[bool] = True
+
     method: Literal["local_hessian"] = ModeloptField("local_hessian")
 
     step_size: float | None = ModeloptField(
@@ -996,6 +1119,21 @@ class GPTQCalibConfig(QuantizeAlgorithmConfig):
         per-column error propagation into one launch per GPTQ block.""",
     )
 
+    @model_validator(mode="after")
+    def _gptq_qdq_default(self):
+        """Default ``get_qdq_activations_from_prev_layer`` to True for GPTQ.
+
+        GPTQ's Hessian correctness depends on prior-layer QDQ activations, so the
+        default differs from the base class. An explicit user choice always wins:
+        ``model_fields_set`` on the nested ``layerwise`` config tells whether the
+        field was set, whatever shape the input had (empty constructor, bool,
+        dict).
+
+        Returns:
+            The validated config itself, with ``layerwise`` replaced by an updated
+            copy when the default was injected.
+        """
+        user_set_fields = self.layerwise.model_fields_set
+        if "get_qdq_activations_from_prev_layer" in user_set_fields:
+            return self
+        override = {"get_qdq_activations_from_prev_layer": True}
+        self.layerwise = self.layerwise.model_copy(update=override)
+        return self
+
 
 QuantizeQuantCfgType = list[QuantizerCfgEntry]
 QuantizerCfgListConfig = QuantizeQuantCfgType

@@ -223,8 +223,12 @@ def wrapped_calib_func(
     """
     kwargs = config.model_dump()
     method = kwargs.pop("method")
-    layerwise = kwargs.pop("layerwise", False)
-    checkpoint_dir = kwargs.pop("layerwise_checkpoint_dir", None)
+    layerwise_cfg = kwargs.pop("layerwise", None) or {}
+    layerwise = layerwise_cfg.get("enable", False)
+    checkpoint_dir = layerwise_cfg.get("checkpoint_dir")
+    qdq_from_prev = layerwise_cfg.get("get_qdq_activations_from_prev_layer", False)
+    save_every = layerwise_cfg.get("save_every", 1)
+    save_quantizers_only = layerwise_cfg.get("save_quantizers_only", False)
     if method is not None and "awq" in method:
         # For backward compatibility
         kwargs["algorithm"] = method
@@ -244,8 +248,8 @@ def wrapped_calib_func(
             # future algorithms that need full-model context must add a guard here.
             if not supports_layerwise:
                 raise ValueError(
-                    f"Calibration algorithm '{method}' does not support layerwise=True. "
-                    "Set layerwise=False, or override `_supports_layerwise = True` on the "
+                    f"Calibration algorithm '{method}' does not support layerwise.enable=True. "
+                    "Set layerwise.enable=False, or override `_supports_layerwise = True` on the "
                     "corresponding CalibrateModeDescriptor once the algorithm is made "
                     "compatible with per-layer calibration."
                 )
@@ -257,6 +261,9 @@ def wrapped_calib_func(
                 forward_loop=forward_loop,
                 calib_func=func,
                 checkpoint_dir=checkpoint_dir,
+                get_qdq_activations_from_prev_layer=qdq_from_prev,
+                save_every=save_every,
+                save_quantizers_only=save_quantizers_only,
                 **kwargs,
             )
         else: