Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ Changelog
- Add mixed-precision FP8 + NVFP4 export for Megatron-Core: per-layer ``quant_algo`` recorded under ``quantized_layers`` in ``hf_quant_config.json``, PP-aware ``kv_cache_dtype`` gather, fused-QKV exclude split into per-HF-name ``q/k/v_proj`` entries.
- Add Nemotron-3-Super-120B-A12B PTQ recipes ``modelopt_recipes/models/Nemotron-3-Super-120B-A12B/super-nvfp4.yaml`` (MSE-mixed) and ``super-nvfp4-max-calib.yaml`` (max-calib mixed): NVFP4 W4A4 routed experts + FP8 per-tensor shared experts / Mamba in/out_proj + FP8 KV cache.
- Add quantized ``nn.Embedding`` support. ``nn.Embedding`` is now registered in ``QuantModuleRegistry`` and exposes ``weight_quantizer`` (embedding table), ``output_quantizer`` (lookup activations), and a permanently disabled ``input_quantizer`` placeholder — embedding inputs are integer indices and cannot be fake-quantized, so direct ``enable*()`` calls raise. ``export_hf_checkpoint`` packs quantized embedding weights alongside Linear layers. Embedding quantizers are opt-in (``parent_class: nn.Embedding`` disabled by default).
- Group layerwise calibration options under a nested ``LayerwiseConfig`` and add three knobs: ``get_qdq_activations_from_prev_layer`` (correct GPTQ-Hessian vs max-calib activation semantics — defaults to True for GPTQ, False for max/mse/local_hessian), ``save_every`` (gate per-window ``next_inputs.pt`` activation-cache writes), and ``save_quantizers_only`` (skip the layer-weights blob for amax-only algorithms — whitelisted to ``max``/``mse``/``local_hessian``). Legacy bool ``layerwise`` and flat ``layerwise_checkpoint_dir`` keys still work; the bool form emits a ``DeprecationWarning``.

**Bug Fixes**

Expand Down
44 changes: 31 additions & 13 deletions examples/llm_ptq/example_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,22 +850,37 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
print("No custom model files found to copy")


def needs_checkpoint_path_update(quant_cfg: dict) -> bool:
"""Check if quant_cfg has a layerwise_checkpoint_dir that should be auto-resolved to a unique subpath."""
algorithm = quant_cfg.get("algorithm")
def _layerwise_checkpoint_dir_location(algorithm) -> tuple[str, str] | None:
"""Return ``("flat"/"nested", checkpoint_dir)`` for the layerwise checkpoint dir, or None."""
if not isinstance(algorithm, dict):
return False
return algorithm.get("layerwise_checkpoint_dir") is not None
return None
flat = algorithm.get("layerwise_checkpoint_dir")
if flat is not None:
return "flat", flat
nested = algorithm.get("layerwise") or {}
ckpt = nested.get("checkpoint_dir") if isinstance(nested, dict) else None
return ("nested", ckpt) if ckpt is not None else None


def needs_checkpoint_path_update(quant_cfg: dict) -> bool:
    """Tell whether ``quant_cfg`` carries a layerwise checkpoint dir to auto-resolve.

    True when the ``algorithm`` entry yields a checkpoint-dir location (flat or
    nested form), i.e. when the path should be rewritten to a unique subpath.
    """
    location = _layerwise_checkpoint_dir_location(quant_cfg.get("algorithm"))
    return location is not None


def resolve_checkpoint_dir(quant_cfg: dict, model_path: str) -> dict:
"""Append a unique ``<model_name>_<config_hash>`` subdirectory to layerwise_checkpoint_dir.
def resolve_checkpoint_dir(quant_cfg: dict, model_path: str) -> tuple[dict, str]:
"""Append a unique ``<model_name>_<config_hash>`` subdirectory to the layerwise checkpoint_dir.

Allows a single recipe to be reused across models without checkpoint collisions.
Supports both the legacy flat ``layerwise_checkpoint_dir`` and the nested
``layerwise.checkpoint_dir`` shape, writing back to whichever the user provided.
Must only be called when :func:`needs_checkpoint_path_update` returns True.

Returns ``(updated_quant_cfg, resolved_path)`` so the caller can log or
reference the resolved path without re-deriving the dict shape.
"""
algorithm = quant_cfg["algorithm"]
base_dir = algorithm["layerwise_checkpoint_dir"]
location = _layerwise_checkpoint_dir_location(quant_cfg["algorithm"])
assert location is not None # guaranteed by needs_checkpoint_path_update
shape, base_dir = location

name = model_path.rstrip("/")
if "/" in name and not os.path.isabs(name):
Expand All @@ -874,9 +889,12 @@ def resolve_checkpoint_dir(quant_cfg: dict, model_path: str) -> dict:
name = Path(name).name

config_hash = hashlib.sha256(json.dumps(quant_cfg, default=str).encode()).hexdigest()[:8]
resolved = os.path.join(base_dir, f"{name}_{config_hash}")

quant_cfg = copy.deepcopy(quant_cfg)
quant_cfg["algorithm"]["layerwise_checkpoint_dir"] = os.path.join(
base_dir, f"{name}_{config_hash}"
)
return quant_cfg
algo = quant_cfg["algorithm"]
if "layerwise_checkpoint_dir" in algo:
algo["layerwise_checkpoint_dir"] = resolved
if isinstance(algo.get("layerwise"), dict) and "checkpoint_dir" in algo["layerwise"]:
algo["layerwise"]["checkpoint_dir"] = resolved
return quant_cfg, resolved
9 changes: 4 additions & 5 deletions examples/llm_ptq/hf_ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -1000,7 +1000,8 @@ def _is_layerwise(obj):
return _is_layerwise(obj.quantize.algorithm)
if isinstance(obj, list):
return any(_is_layerwise(a) for a in obj)
return bool(getattr(obj, "layerwise", False))
layerwise = getattr(obj, "layerwise", None)
return bool(getattr(layerwise, "enable", False))

is_layerwise = _is_layerwise(recipe)

Expand Down Expand Up @@ -1135,10 +1136,8 @@ def _is_layerwise(obj):
_set_kv_cache_constant_amax(quant_cfg["quant_cfg"])

if needs_checkpoint_path_update(quant_cfg):
quant_cfg = resolve_checkpoint_dir(quant_cfg, args.pyt_ckpt_path)
print(
f"Auto-resolved layerwise_checkpoint_dir: {quant_cfg['algorithm']['layerwise_checkpoint_dir']}"
)
quant_cfg, resolved_dir = resolve_checkpoint_dir(quant_cfg, args.pyt_ckpt_path)
print(f"Auto-resolved layerwise checkpoint_dir: {resolved_dir}")

if args.cast_mxfp4_to_nvfp4:
quant_cfg = copy.deepcopy(quant_cfg)
Expand Down
180 changes: 159 additions & 21 deletions modelopt/torch/quantization/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,9 @@

import warnings
from collections.abc import Mapping, Sequence
from typing import Any, Literal
from typing import Any, ClassVar, Literal

from pydantic import AliasChoices, ValidationInfo, field_validator, model_validator
from pydantic import AliasChoices, Field, ValidationInfo, field_validator, model_validator

from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField
from modelopt.torch.opt.config_loader import load_config
Comment thread
Fridah-nv marked this conversation as resolved.
Expand Down Expand Up @@ -633,9 +633,88 @@ def validate_calibrator(cls, v, info: ValidationInfo):
)


class LayerwiseConfig(ModeloptBaseConfig):
    """Nested config for layer-by-layer calibration behavior.

    Groups the on/off switch (``enable``) with the options that only matter
    while layerwise calibration runs: where next-layer inputs are captured
    from, where per-layer checkpoints are written, how often the activation
    cache is written, and whether the per-layer weights blob is persisted.
    """

    # Master switch for layerwise calibration; off by default.
    enable: bool = ModeloptField(
        default=False,
        title="Enable layerwise (layer-by-layer) calibration.",
        description=(
            "If True, the calibration algorithm is applied layer by layer. "
            "Each layer's inputs are captured via a forward pass that reflects the "
            "quantization of all preceding layers, incurring O(N) forward passes for N layers."
        ),
    )

    # Chooses whether next-layer inputs are captured after (True) or before
    # (False) the current layer is calibrated. The declared default here is
    # False; per the description, GPTQ is expected to use True.
    get_qdq_activations_from_prev_layer: bool = ModeloptField(
        default=False,
        title="Cache next-layer inputs from QDQ outputs of prior layers.",
        description=(
            "If True (GPTQ default), capture each layer's next-layer inputs "
            "after it is calibrated, so QDQ error and in-place weight updates "
            "propagate forward. If False (max/mse default), capture before, so "
            "the next layer sees the same FP activations as a non-layerwise pass."
        ),
    )

    # Optional directory for per-layer checkpoints; None (the default) means
    # no directory has been configured.
    checkpoint_dir: str | None = ModeloptField(
        default=None,
        title="Per-layer checkpoint directory (resume on restart).",
        description=(
            "If set, per-layer checkpoints are saved here during calibration. "
            "On restart, calibration resumes from the last completed layer."
        ),
    )

    # Window size, in layers, for writing the ``next_inputs.pt`` activation
    # cache. ``ge=1`` is handed to ModeloptField -- presumably enforcing a
    # minimum of 1; confirm against ModeloptField's handling of constraints.
    save_every: int = ModeloptField(
        default=1,
        ge=1,
        title="Flush resume metadata every N layers (final layer always flushes).",
        description=(
            "Only the boundary layer of each window writes the large "
            "``next_inputs.pt`` activation cache; other per-layer files are "
            "still written for every layer (resume needs them to replay skips). "
            "Mid-window interrupts re-calibrate the unfinished window on resume."
        ),
    )

    # Opt-in to persisting quantizer state only. Per the description this is
    # meant for algorithms that touch nothing but ``TensorQuantizer._amax``;
    # this class itself does not enforce that restriction.
    save_quantizers_only: bool = ModeloptField(
        default=False,
        title="Skip the per-layer weights blob; persist only quantizer state.",
        description=(
            "Only accepted by algorithms that update solely ``TensorQuantizer._amax`` "
            "(max, mse, local_hessian). Rejected for weight-mutating algorithms "
            "(GPTQ, AWQ, SmoothQuant) where it would silently lose updates on resume."
        ),
    )


def _coerce_layerwise_input(value):
"""Normalize a raw ``layerwise`` value to a dict; warn on deprecated bool."""
if isinstance(value, bool):
warnings.warn(
"Passing the layerwise field as a bool is deprecated; use a dict, "
"e.g. `{'enable': True}`.",
DeprecationWarning,
stacklevel=2,
)
return {"enable": value}
if value is None:
return {}
if isinstance(value, LayerwiseConfig):
# ``exclude_unset=True`` so downstream ``model_fields_set`` reflects the
# user's actual input
return value.model_dump(exclude_unset=True)
return value
Comment thread
coderabbitai[bot] marked this conversation as resolved.


class QuantizeAlgorithmConfig(ModeloptBaseConfig):
"""Calibration algorithm config base."""

# Set True only for algorithms that update solely ``TensorQuantizer._amax``
# (no ``layer.weight`` mutation). Gates ``layerwise.save_quantizers_only``.
_supports_save_quantizers_only: ClassVar[bool] = False

method: Literal[None] = ModeloptField(
None,
title="This field specifies the name of the calibration algorithm. If None, no calibration is performed.",
Expand All @@ -656,34 +735,72 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):
),
)

layerwise: bool = ModeloptField(
default=False,
layerwise: LayerwiseConfig = Field(
default_factory=LayerwiseConfig,
validation_alias=AliasChoices("layerwise", "use_sequential"),
title="Enable layerwise (layer-by-layer) calibration.",
title="Layerwise calibration configuration.",
description=(
"If True, the calibration algorithm is applied layer by layer. "
"Each layer's inputs are captured via a forward pass that reflects the "
"quantization of all preceding layers, incurring O(N) forward passes for N layers."
"Nested config controlling layer-by-layer calibration. Pass a dict, "
"e.g. ``{'enable': True, 'checkpoint_dir': '/path'}``. Bool input is "
"accepted for backward compatibility but deprecated."
),
)

layerwise_checkpoint_dir: str | None = ModeloptField(
default=None,
title="Checkpoint directory for layerwise calibration.",
description=(
"If set together with layerwise=True, per-layer checkpoints are saved to this "
"directory during calibration. On restart, calibration resumes from the last "
"completed layer."
),
)
@model_validator(mode="before")
@classmethod
def _migrate_layerwise_checkpoint_dir(cls, data):
"""Merge the legacy flat ``layerwise_checkpoint_dir`` key into ``layerwise``.

Raises if both the flat key and a nested ``checkpoint_dir`` are set with conflicting values.
"""
if not isinstance(data, dict) or "layerwise_checkpoint_dir" not in data:
return data
warnings.warn(
"Passing `layerwise_checkpoint_dir` at the top level is deprecated; "
"nest it under `layerwise.checkpoint_dir` instead.",
DeprecationWarning,
stacklevel=2,
)
data = dict(data)
flat_dir = data.pop("layerwise_checkpoint_dir")
# Resolve the legacy ``use_sequential`` alias before writing ``layerwise``,
# otherwise the alias value is silently dropped when AliasChoices picks the
# newly-written ``layerwise`` key over ``use_sequential``.
raw_layerwise = data.pop("layerwise", data.pop("use_sequential", None))
layerwise = _coerce_layerwise_input(raw_layerwise)
existing = layerwise.get("checkpoint_dir")
if existing is not None and existing != flat_dir:
raise ValueError(
f"Conflicting checkpoint_dir: layerwise_checkpoint_dir={flat_dir!r} "
f"differs from layerwise.checkpoint_dir={existing!r}. Set only one."
)
data["layerwise"] = {**layerwise, "checkpoint_dir": flat_dir}
return data
Comment thread
Fridah-nv marked this conversation as resolved.

@field_validator("layerwise", mode="before")
@classmethod
def _coerce_layerwise(cls, value):
    """Normalize a bool/None ``layerwise`` input to dict form (alias path included)."""
    normalized = _coerce_layerwise_input(value)
    return normalized

@model_validator(mode="after")
def validate_layerwise_checkpoint_dir(self):
"""Raise if layerwise_checkpoint_dir is set but layerwise is False."""
if self.layerwise_checkpoint_dir is not None and not self.layerwise:
"""Raise if layerwise.checkpoint_dir is set but layerwise.enable is False."""
if self.layerwise.checkpoint_dir is not None and not self.layerwise.enable:
raise ValueError(
"layerwise_checkpoint_dir requires layerwise=True. "
"Set layerwise=True or remove layerwise_checkpoint_dir."
"layerwise.checkpoint_dir requires layerwise.enable=True. "
"Set layerwise.enable=True or remove layerwise.checkpoint_dir."
)
return self

@model_validator(mode="after")
def _validate_save_quantizers_only_supported(self):
"""Enforce the ``_supports_save_quantizers_only`` whitelist."""
if self.layerwise.save_quantizers_only and not self._supports_save_quantizers_only:
raise ValueError(
f"Algorithm '{self.method}' mutates layer weights in-place; "
"save_quantizers_only=True would lose those updates on resume. "
"Only max/mse/local_hessian (amax-only) support this flag."
)
return self

Expand All @@ -696,6 +813,8 @@ class MaxCalibConfig(QuantizeAlgorithmConfig):
See `Integer Quantization <https://arxiv.org/pdf/2004.09602>`_ for the concepts.
"""

_supports_save_quantizers_only: ClassVar[bool] = True

method: Literal["max"] = ModeloptField("max")

distributed_sync: bool | None = ModeloptField(
Expand Down Expand Up @@ -727,6 +846,8 @@ class MseCalibConfig(QuantizeAlgorithmConfig):
When fp8_scale_sweep is enabled, step_size is ignored.
"""

_supports_save_quantizers_only: ClassVar[bool] = True

method: Literal["mse"] = ModeloptField("mse")

step_size: float | None = ModeloptField(
Expand Down Expand Up @@ -779,6 +900,8 @@ class LocalHessianCalibConfig(QuantizeAlgorithmConfig):

"""

_supports_save_quantizers_only: ClassVar[bool] = True

method: Literal["local_hessian"] = ModeloptField("local_hessian")

step_size: float | None = ModeloptField(
Expand Down Expand Up @@ -996,6 +1119,21 @@ class GPTQCalibConfig(QuantizeAlgorithmConfig):
per-column error propagation into one launch per GPTQ block.""",
)

@model_validator(mode="after")
def _gptq_qdq_default(self):
"""Inject ``get_qdq_activations_from_prev_layer=True`` unless the user set it.

GPTQ's Hessian correctness depends on prior-layer QDQ activations, so the
default differs from the base class. Uses ``model_fields_set`` to detect
whether the user explicitly set the field — covers every input shape
(empty constructor, bool, dict) without a per-shape special case.
"""
if "get_qdq_activations_from_prev_layer" not in self.layerwise.model_fields_set:
self.layerwise = self.layerwise.model_copy(
Comment thread
Fridah-nv marked this conversation as resolved.
update={"get_qdq_activations_from_prev_layer": True}
)
return self


QuantizeQuantCfgType = list[QuantizerCfgEntry]
QuantizerCfgListConfig = QuantizeQuantCfgType
Expand Down
15 changes: 11 additions & 4 deletions modelopt/torch/quantization/mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,12 @@ def wrapped_calib_func(
"""
kwargs = config.model_dump()
method = kwargs.pop("method")
layerwise = kwargs.pop("layerwise", False)
checkpoint_dir = kwargs.pop("layerwise_checkpoint_dir", None)
layerwise_cfg = kwargs.pop("layerwise", None) or {}
layerwise = layerwise_cfg.get("enable", False)
checkpoint_dir = layerwise_cfg.get("checkpoint_dir")
qdq_from_prev = layerwise_cfg.get("get_qdq_activations_from_prev_layer", False)
save_every = layerwise_cfg.get("save_every", 1)
save_quantizers_only = layerwise_cfg.get("save_quantizers_only", False)
if method is not None and "awq" in method:
# For backward compatibility
kwargs["algorithm"] = method
Expand All @@ -244,8 +248,8 @@ def wrapped_calib_func(
# future algorithms that need full-model context must add a guard here.
if not supports_layerwise:
raise ValueError(
f"Calibration algorithm '{method}' does not support layerwise=True. "
"Set layerwise=False, or override `_supports_layerwise = True` on the "
f"Calibration algorithm '{method}' does not support layerwise.enable=True. "
"Set layerwise.enable=False, or override `_supports_layerwise = True` on the "
"corresponding CalibrateModeDescriptor once the algorithm is made "
"compatible with per-layer calibration."
)
Expand All @@ -257,6 +261,9 @@ def wrapped_calib_func(
forward_loop=forward_loop,
calib_func=func,
checkpoint_dir=checkpoint_dir,
get_qdq_activations_from_prev_layer=qdq_from_prev,
save_every=save_every,
save_quantizers_only=save_quantizers_only,
**kwargs,
)
else:
Expand Down
Loading
Loading