diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index f1856c1134..d81d4bed93 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -15,6 +15,7 @@ import argparse import copy +import os import random import time import warnings @@ -61,6 +62,11 @@ ) from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model from modelopt.torch.quantization.config import _default_disabled_quantizer_cfg, need_calibration +from modelopt.torch.quantization.metrics import ( + ActivationMSELogger, + compute_perplexity, + get_wikitext2, +) from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights from modelopt.torch.quantization.utils import is_quantized from modelopt.torch.utils.dataset_utils import ( @@ -97,6 +103,7 @@ def _set_kv_cache_constant_amax(quant_cfg: dict) -> None: "int4_awq": mtq.INT4_AWQ_CFG, "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG, "nvfp4": mtq.NVFP4_DEFAULT_CFG, + "nvfp4_wo": mtq.NVFP4_WEIGHT_ONLY_CFG, "nvfp4_awq": mtq.NVFP4_AWQ_LITE_CFG, "nvfp4_mse": mtq.NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG, "fp8_pb_wo": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG, @@ -106,6 +113,8 @@ def _set_kv_cache_constant_amax(quant_cfg: dict) -> None: "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG, "nvfp4_omlp_only": mtq.NVFP4_OMLP_ONLY_CFG, "nvfp4_svdquant": mtq.NVFP4_SVDQUANT_DEFAULT_CFG, + "nvfp4_wo_gptq": mtq.NVFP4_WEIGHT_ONLY_GPTQ_CFG, + "nvfp4_gptq": mtq.NVFP4_GPTQ_CFG, "mxfp8": mtq.MXFP8_DEFAULT_CFG, "nvfp4_local_hessian": mtq.NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG, } @@ -676,6 +685,12 @@ def export_quantized( "They will be set at deployment time." 
) + if getattr(args, "eval_perplexity", False) and tokenizer is not None: + seq_len = getattr(args, "eval_perplexity_seq_len", 2048) + eval_data = get_wikitext2(tokenizer, seq_len) + ppl = compute_perplexity(full_model, eval_data) + print(f"Wikitext-2 perplexity: {round(ppl, 2):.2f}") + # Load any missing weights from non-standard safetensors (handled in get_model for non-low-memory mode) # Store the MTP layer prefixes on the model for later exclusion from quantization mtp_layer_prefixes, mtp_state_dict = load_mtp_weights(full_model, args.pyt_ckpt_path) @@ -907,6 +922,64 @@ def quantize_main( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) + # Collect original (unquantized) activations before quantization modifies the model + mse_logger = None + if getattr(args, "measure_activation_mse", False): + n_mse = getattr(args, "activation_mse_max_samples", 16) + mse_save_dir = getattr(args, "activation_mse_save_dir", None) + mse_input_path = getattr(args, "activation_mse_input_path", None) + + # Resolve MSE input data: frozen file (raw text or tokenized) or live dataloader + mse_data = None + if mse_input_path is not None: + if mse_input_path.endswith(".json"): + if os.path.isfile(mse_input_path): + print(f"Loading MSE input data from existing .json file: {mse_input_path}") + texts = ActivationMSELogger.load_raw_text(mse_input_path) + mse_data = ActivationMSELogger.tokenize_raw_text( + texts, + tokenizer, + max_length=args.calib_seq, + ) + else: + assert tokenizer is not None, ( + "--activation_mse_input_path with .json requires a tokenizer to decode" + ) + print(f"Creating MSE input data .json file: {mse_input_path}") + texts = ActivationMSELogger.materialize_raw_text( + calib_dataloader, + mse_input_path, + tokenizer=tokenizer, + max_samples=n_mse, + ) + mse_data = ActivationMSELogger.tokenize_raw_text( + texts, + tokenizer, + max_length=args.calib_seq, + ) + elif mse_input_path.endswith(".pt"): + if os.path.isfile(mse_input_path): + 
print(f"Loading MSE input data from existing .pt file: {mse_input_path}") + mse_data = ActivationMSELogger.load_data(mse_input_path) + else: + print(f"Creating MSE input data .pt file: {mse_input_path}") + mse_data = ActivationMSELogger.materialize_data( + calib_dataloader, + mse_input_path, + max_samples=n_mse, + ) + else: + raise ValueError( + f"--activation_mse_input_path must end with .json or .pt, got: {mse_input_path}" + ) + + if mse_data is None: + mse_data = calib_dataloader + + mse_logger = ActivationMSELogger(max_samples=n_mse, save_dir=mse_save_dir) + print(f"Collecting original (unquantized) activations for MSE over {n_mse} samples...") + mse_logger.collect(language_model, mse_data, phase="original") + if args.auto_quantize_bits: assert len(args.qformat.split(",")) > 1, ( "Auto quantization needs multiple quantization format." @@ -999,6 +1072,23 @@ def quantize_main( is_nemotron_vl_model, first_text_speech_dataset, ) + + if mse_logger is not None: + import gc + + print("Collecting quantized activations for MSE...") + mse_logger.collect(language_model, mse_data, phase="quantized") + + mse_logger.compute_mse() + print(mse_logger.summary()) + + if getattr(args, "activation_mse_save_dir", None): + mse_logger.save() + + del mse_logger, mse_data + gc.collect() + torch.cuda.empty_cache() + export_quantized( args, full_model, @@ -1157,6 +1247,7 @@ def parse_args() -> argparse.Namespace: default=False, action="store_true", ) + parser.add_argument( "--low_memory_mode", help=( @@ -1215,6 +1306,48 @@ def parse_args() -> argparse.Namespace: "Does not impact non-MOE models." 
), ) + parser.add_argument( + "--eval_perplexity", + action=argparse.BooleanOptionalAction, + default=False, + help="Evaluate Wikitext-2 perplexity after quantization (before export).", + ) + parser.add_argument( + "--eval_perplexity_seq_len", + type=int, + default=2048, + help="Sequence length for perplexity evaluation (default: 2048).", + ) + parser.add_argument( + "--measure_activation_mse", + action=argparse.BooleanOptionalAction, + default=False, + help="Measure per-layer activation MSE (original vs quantized) after quantization.", + ) + parser.add_argument( + "--activation_mse_max_samples", + type=int, + default=16, + help="Max calibration samples for activation MSE (default: 16).", + ) + parser.add_argument( + "--activation_mse_save_dir", + type=str, + default=None, + help="Directory to save activation MSE results. If not set, results are only printed.", + ) + parser.add_argument( + "--activation_mse_input_path", + type=str, + default=None, + help=( + "Path to frozen MSE input data. Supports two formats:\n" + " .json — raw text (cross-model reuse): if file exists, loads and re-tokenizes " + "with the current model's tokenizer; if not, decodes calibration data to text and saves.\n" + " .pt — tokenized tensors (same-tokenizer reuse): if file exists, loads directly; " + "if not, materializes from calibration data and saves." + ), + ) args = parser.parse_args() if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0): diff --git a/modelopt/torch/quantization/__init__.py b/modelopt/torch/quantization/__init__.py index 87dbf30bb5..d471e55823 100644 --- a/modelopt/torch/quantization/__init__.py +++ b/modelopt/torch/quantization/__init__.py @@ -16,12 +16,18 @@ """Quantization package.""" # Initialize mode and plugins -from . import mode, plugins, utils +from . 
import metrics, mode, plugins, utils # Add methods to mtq namespace from .compress import * from .config import * from .conversion import * +from .metrics import ( + ActivationMSELogger, + compute_perplexity, + get_wikitext2, + measure_per_layer_activation_mse, +) from .model_quant import * from .nn.modules.quant_module import QuantModuleRegistry from .utils import update_quant_cfg_with_kv_cache_quant diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 46a36cb8f3..8320790411 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -427,6 +427,20 @@ "algorithm": "max", } +NVFP4_WEIGHT_ONLY_CFG = { + "quant_cfg": { + "*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + "*input_quantizer": {"enable": False}, + **_default_disabled_quantizer_cfg, + }, + "algorithm": "max", +} + NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG = { "quant_cfg": { "*weight_quantizer": { @@ -459,6 +473,39 @@ }, } +NVFP4_WEIGHT_ONLY_GPTQ_CFG = { + "quant_cfg": { + "*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + "*input_quantizer": {"enable": False}, + **_default_disabled_quantizer_cfg, + }, + "algorithm": {"method": "gptq", "use_sequential": True}, +} + +NVFP4_GPTQ_CFG = { + "quant_cfg": { + "*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + "*input_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + "enable": True, + }, + **_default_disabled_quantizer_cfg, + }, + "algorithm": {"method": "gptq", "use_sequential": True}, +} + MAMBA_MOE_NVFP4_AGGRESSIVE_CFG = { "quant_cfg": { "*weight_quantizer": _nvfp4_quantizer, @@ -679,6 +726,9 @@ 
"NVFP4_AWQ_FULL_CFG", "NVFP4_AWQ_LITE_CFG", "NVFP4_DEFAULT_CFG", + "NVFP4_GPTQ_CFG", + "NVFP4_WEIGHT_ONLY_CFG", + "NVFP4_WEIGHT_ONLY_GPTQ_CFG", "NVFP4_FP8_MHA_CONFIG", "NVFP4_KV_CFG", "NVFP4_KV_ROTATE_CFG", @@ -1392,6 +1442,44 @@ class GPTQLiteConfig(QuantizeAlgorithmConfig): + +class GPTQConfig(QuantizeAlgorithmConfig): + """The config for GPTQ. + + GPTQ is the sequential variant of GPTQ lite (see GPTQLiteConfig). + + GPTQ performs sequential quantization of layers. This means that the updated + activations are used to process the next layer. + + The default values are taken from the official GPTQ implementation: + https://github.com/IST-DASLab/FP-Quant/blob/d2e3092f968262c4de5fb050e1aef568a280dadd/src/quantization/gptq.py#L35 + + Note: This feature is currently experimental and may not translate to improved accuracy as expected. + + + """ + + method: Literal["gptq"] = ModeloptField("gptq") + percdamp: float | None = ModeloptField( + default=0.01, + gt=0.0, + le=1.0, + title="Percentage damping factor.", + description="The percentage of average Hessian diagonal used for damping.", + ) + block_size: int | None = ModeloptField( + default=128, + title="Block size for GPTQ weight update.", + description="""The block size for GPTQ weight update, which must be a multiple of the + group_size used in the quantization.""", + ) + hessian_state_path: str | None = ModeloptField( + default=None, + title="Path to the Hessian state file.", + description="""The path to the Hessian state file. 
If hessian path exists, we load from + hessian file instead of recomputing them.""", + ) + + QuantizeQuantCfgType = dict[ str | Callable, QuantizerAttributeConfig diff --git a/modelopt/torch/quantization/mode.py b/modelopt/torch/quantization/mode.py index e08efece9a..df48c72c29 100644 --- a/modelopt/torch/quantization/mode.py +++ b/modelopt/torch/quantization/mode.py @@ -37,6 +37,7 @@ AWQFullCalibConfig, AWQLiteCalibConfig, CompressConfig, + GPTQConfig, GPTQLiteConfig, LocalHessianCalibConfig, MaxCalibConfig, @@ -59,6 +60,7 @@ ) from .model_calib import ( awq, + gptq, gptq_lite, local_hessian_calibrate, max_calibrate, @@ -240,8 +242,8 @@ def wrapped_calib_func( if sequential: if forward_loop is None: raise ValueError("forward_loop is required for calibration but got None.") - assert method in ["max"], ( - f"Sequential calibration currently only supports max calibration, got {method}" + assert method in ["max", "gptq"], ( + f"Sequential calibration currently only supports max and gptq calibration, got {method}" ) # Wrap with sequential processing sequential_calibrate( @@ -502,3 +504,15 @@ def config_class(self) -> type[QuantizeAlgorithmConfig]: return GPTQLiteConfig _calib_func = gptq_lite + + +@CalibrateModeRegistry.register_mode +class GPTQModeDescriptor(BaseCalibrateModeDescriptor): + """Mode for GPTQ calibration algorithm.""" + + @property + def config_class(self) -> type[QuantizeAlgorithmConfig]: + """Specifies the config class for the mode.""" + return GPTQConfig + + _calib_func = gptq diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index ed57ea3fc7..1e8a94b3db 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -1519,7 +1519,13 @@ def postprocess(module, name): max_calibrate(model, forward_loop) -def _print_relative_mse_error(q: torch.Tensor, w: torch.Tensor, h: torch.Tensor, module_name: str): +def _print_relative_mse_error( + q: torch.Tensor, + w: 
torch.Tensor, + h: torch.Tensor, + module_name: str, + n_samples: int | None = None, +): """Print relative mean squared error between quantized and original weights. Computes the Hessian-weighted relative MSE between quantized and original weights, @@ -1531,13 +1537,15 @@ def _print_relative_mse_error(q: torch.Tensor, w: torch.Tensor, h: torch.Tensor, w (torch.Tensor): Original weight tensor h (torch.Tensor): Hessian matrix used for weighting the error module_name (str): Name of the module for logging purposes + n_samples (int | None): Number of Hessian samples (batches) used for this layer Note: Implementation adapted from the GPTQ repository: https://github.com/IST-DASLab/FP-Quant """ delta = q - w mse = (delta).mm(h).mul(delta).mean() / (w.mm(h).mul(w).mean() + 1e-6) - print(f"[{module_name}] Relative MSE error: {mse.item():.2e}") + suffix = f", n_hessian_samples: {n_samples}" if n_samples is not None else "" + print_rank_0(f"[{module_name}] Relative MSE error: {mse.item():.2e}{suffix}") def update_hessian(input, hessian, n_samples): @@ -1550,15 +1558,15 @@ def update_hessian(input, hessian, n_samples): Returns: Tuple of (updated_hessian, new_sample_count) """ - batch_size = input.shape[0] + # Flatten to 2D (total_tokens, features) first, so batch_size counts tokens + input_flat = input.reshape(-1, input.shape[-1]).t().float() + batch_size = input_flat.shape[1] # Incremental averaging: scale down old hessian hessian *= n_samples / (n_samples + batch_size) n_samples += batch_size # Compute outer product: H += (2/n_samples) * X @ X^T - # where X is the flattened input reshaped to (features, batch*seq) - input_flat = input.reshape(-1, input.shape[-1]).t().float() scaled_input = math.sqrt(2 / n_samples) * input_flat hessian.add_((scaled_input @ scaled_input.t()).to(hessian.device)) @@ -1596,98 +1604,227 @@ def prepare_hessian_inverse(h, weight, percdamp): h = torch.cholesky_inverse(torch.linalg.cholesky(h)) h_inv = torch.linalg.cholesky(h, upper=True) except 
(RuntimeError, torch.linalg.LinAlgError): - print("Warning: Hessian is not positive definite, using identity matrix") + print_rank_0("Warning: Hessian is not positive definite, using identity matrix") h_inv = torch.eye(h.shape[0], device=h.device, dtype=h.dtype) return h_inv -def quantize_block(full_weight, block_start, block_end, h_inv, quantizer): - """Quantize a block of weights group by group (based on quantizer block sizes) with error propagation. +def _build_column_qdq(quantizer, weight_shape): + """Build a fast column-wise quantize-dequantize function for integer quantizers. + + Instead of calling the full TensorQuantizer on the entire weight matrix (which + quantizes all elements) and extracting one column, this returns a closure that + quantizes only a single column using the quantizer's pre-computed amax/scales. + + Since max_calibrate fixes the amax before GPTQ weight updates, quantizing a + single column with the same fixed scale gives bit-identical results to + quantizing the full matrix and extracting that column. Args: - full_weight: The full weight tensor (needed for INT4 quantization) - block_start: Starting column index of the block - block_end: Ending column index of the block - h_inv: Hessian inverse - quantizer: The quantizer to apply + quantizer: The weight TensorQuantizer (already calibrated). + weight_shape: Shape of the weight tensor (out_features, in_features). + Returns: - quantized_block: Quantized weights for this block - losses: Quantization losses per element - errors: Accumulated errors for propagation + Tuple of (column_qdq_fn, supported) where: + - column_qdq_fn(column, col_idx) -> qdq_column (if supported) + - supported: True if column-wise qdq is available, False to fall back. 
""" - # Extract the block we're working on - block_weight = full_weight[:, block_start:block_end] - block_hinv = h_inv[block_start:block_end, block_start:block_end] - block_size = block_end - block_start + # Unsupported: NVFP4 (two-level FP4 scaling), FP quantization (num_bits is a tuple) + if isinstance(quantizer, NVFP4StaticQuantizer): + return None, False + if isinstance(quantizer._num_bits, tuple): + return None, False + + # Unsupported: pre_quant_scale (SmoothQuant) or rotation transforms mix columns + if getattr(quantizer, "pre_quant_scale", None) is not None: + return None, False + if getattr(quantizer, "rotate_is_enabled", False): + return None, False + + # Need calibrated amax + if not hasattr(quantizer, "_amax") or quantizer._amax is None: + return None, False + + num_bits = quantizer._num_bits + unsigned = getattr(quantizer, "_unsigned", False) + narrow_range = getattr(quantizer, "_narrow_range", False) + max_bound = (2 ** (num_bits - 1 + int(unsigned))) - 1 + min_bound = -max_bound + int(narrow_range) + + amax = quantizer._amax.float() + out_features, in_features = weight_shape + + # Determine quantization geometry from block_sizes + block_sizes = quantizer.block_sizes + group_size = None + if block_sizes is not None: + # Skip dynamic block quantization + if block_sizes.get("type", "static") == "dynamic": + return None, False + group_size = block_sizes.get(-1, None) or block_sizes.get(len(weight_shape) - 1, None) + + if group_size is not None and group_size > 0: + # Per-group block quantization along last dim. + # After _setup_for_blockquant, weight is reshaped to (-1, group_size) with axis=(0,). + # amax shape: (out_features * n_groups, 1) where n_groups = in_features // group_size. 
+ if in_features % group_size != 0: + return None, False # Padding case — fall back + + n_groups = in_features // group_size + + try: + # Reshape amax to (out_features, n_groups) for O(1) group lookup + amax_2d = amax.reshape(out_features, n_groups) + except RuntimeError: + return None, False + + def _column_qdq_group( + col, col_idx, _a=amax_2d, _mx=max_bound, _mn=min_bound, _gs=group_size + ): + col_scale = _mx / _a[:, col_idx // _gs].clamp(min=1e-12) + return torch.clamp(torch.round(col * col_scale), _mn, _mx) / col_scale + + return _column_qdq_group, True + + # Per-channel (axis != None) or per-tensor (axis == None) + axis = quantizer.axis + if axis is not None: + # Per-channel: amax has shape (out_features, 1) or similar + col_scale = max_bound / amax.reshape(-1).clamp(min=1e-12) - quantized_block = torch.zeros_like(block_weight) - losses = torch.zeros_like(block_weight) - errors = torch.zeros_like(block_weight) + def _column_qdq_channel(col, col_idx, _s=col_scale, _mx=max_bound, _mn=min_bound): + return torch.clamp(torch.round(col * _s), _mn, _mx) / _s - # We perform column-wise update for GPTQ within the block - group_size = 1 + return _column_qdq_channel, True - for group_start in range(0, block_size, group_size): - group_end = min(group_start + group_size, block_size) - group_cols = slice(group_start, group_end) - # Get current column and its Hessian inverse diagonal - weight_col = block_weight[:, group_cols] - hinv_diag = torch.diag(block_hinv[group_cols, group_cols]) + # Per-tensor: single scalar scale + scalar_scale = max_bound / amax.clamp(min=1e-12).item() - # Quantize using the full weight, then extract the columns we need - quantized_full = quantizer(full_weight) - quantized_cols = quantized_full[:, block_start + group_start : block_start + group_end] - quantized_block[:, group_cols] = quantized_cols + def _column_qdq_tensor(col, col_idx, _s=scalar_scale, _mx=max_bound, _mn=min_bound): + return torch.clamp(torch.round(col * _s), _mn, _mx) / _s - # 
Compute quantization error and loss - error = (weight_col - quantized_cols) / hinv_diag - losses[:, group_cols] = (weight_col - quantized_cols) ** 2 / (hinv_diag**2) / 2 - errors[:, group_cols] = error + return _column_qdq_tensor, True - # Propagate error to remaining columns in block - block_weight[:, group_start:] -= error @ block_hinv[group_start:group_end, group_start:] - full_weight[:, block_start:block_end] = block_weight - return quantized_block, losses, errors +def _can_use_fused_gptq(quantizer) -> bool: + """Check whether the fused Triton GPTQ kernel can be used for *quantizer*.""" + if not isinstance(quantizer, NVFP4StaticQuantizer): + return False + if not hasattr(quantizer, "_amax") or quantizer._amax is None: + return False + from modelopt.torch.quantization.triton import IS_AVAILABLE as _TRITON_OK + return _TRITON_OK -def blockwise_weight_update(module, h, block_size, percdamp): + +def blockwise_weight_update(module, h, block_size, percdamp, n_samples=None): """Update module weights using GPTQ-style blockwise quantization. + Dispatches to one of three internal paths depending on quantizer type: + + 1. **Fused Triton** — for :class:`NVFP4StaticQuantizer` when Triton is + available. Runs the entire column loop in a single GPU kernel per + block (~130x faster than the unfused path on Blackwell GPUs). + 2. **Column-QDQ** — for integer quantizers whose scale geometry allows + single-column fake-quant via :func:`_build_column_qdq`. + 3. **Full-matrix fallback** — calls the quantizer on the full weight matrix + each column (slowest, but always correct). + Args: - module: Neural network module with weight and weight_quantizer - H: Hessian matrix (d x d) - block_size: Size of blocks to process at once - percdamp: Damping percentage for Hessian diagonal + module: Neural network module with ``weight`` and ``weight_quantizer``. + h: Hessian matrix of shape ``(d, d)``. + block_size: Number of columns processed per block. 
+ percdamp: Damping as a fraction of the mean Hessian diagonal. + n_samples: Number of Hessian samples (used only for logging). """ weight = module.weight.data.float().clone() - _, num_cols = weight.shape + num_rows, num_cols = weight.shape - # Preprocess Hessian: handle dead neurons and add damping h_inv = prepare_hessian_inverse(h, weight, percdamp) - # Initialize output tensors - quantized_weight = torch.zeros_like(weight) - losses = torch.zeros_like(weight) + quantizer = module.weight_quantizer + if _can_use_fused_gptq(quantizer): + _blockwise_weight_update_fused(weight, h_inv, quantizer, num_rows, num_cols, block_size) + else: + col_qdq_fn, col_qdq_supported = _build_column_qdq(quantizer, weight.shape) + _blockwise_weight_update_unfused( + weight, h_inv, quantizer, num_cols, block_size, col_qdq_fn, col_qdq_supported + ) + + _print_relative_mse_error(weight, module.weight.float(), h, module.name, n_samples) + module.weight.data = weight.reshape(module.weight.shape).to(module.weight.data.dtype) + + +def _blockwise_weight_update_fused(weight, h_inv, quantizer, num_rows, num_cols, block_size): + """Fused Triton path for NVFP4: one kernel launch per block.""" + from modelopt.torch.quantization.triton.gptq_fused_kernel import gptq_fused_block + + group_size = quantizer.block_sizes.get(-1, None) or quantizer.block_sizes.get(1, None) + num_groups = math.ceil(num_cols / group_size) + amax_grouped = quantizer._amax.float().reshape(num_rows, num_groups).contiguous() + global_amax = quantizer.global_amax.float() - # Process weights in blocks for block_start in range(0, num_cols, block_size): block_end = min(block_start + block_size, num_cols) - - quantized_block, block_losses, block_errors = quantize_block( - weight, block_start, block_end, h_inv, module.weight_quantizer + n_cols_blk = block_end - block_start + + w_block = weight[:, block_start:block_end].clone().contiguous() + h_inv_cho_blk = h_inv[block_start:block_end, block_start:block_end].contiguous() + + qw_block, 
err_block = gptq_fused_block( + w_block, + amax_grouped, + global_amax, + h_inv_cho_blk, + group_size, + block_start, + n_cols_blk, ) - # Store results - quantized_weight[:, block_start:block_end] = quantized_block - losses[:, block_start:block_end] = block_losses - # Propagate errors to remaining weights - weight[:, block_end:] -= block_errors @ h_inv[block_start:block_end, block_end:] + weight[:, block_start:block_end] = qw_block + if block_end < num_cols: + weight[:, block_end:].addmm_( + err_block[:, :n_cols_blk], + h_inv[block_start:block_end, block_end:], + alpha=-1, + ) - # Print relative mse error - _print_relative_mse_error(quantized_weight, module.weight.float(), h, module.name) - # Update module weights - module.weight.data = quantized_weight.reshape(module.weight.shape).to(module.weight.data.dtype) + +def _blockwise_weight_update_unfused( + weight, h_inv, quantizer, num_cols, block_size, col_qdq_fn, col_qdq_supported +): + """Column-QDQ or full-matrix fallback for non-NVFP4 quantizers.""" + for block_start in range(0, num_cols, block_size): + block_end = min(block_start + block_size, num_cols) + n_cols_blk = block_end - block_start + h_inv_cho_blk = h_inv[block_start:block_end, block_start:block_end] + + if col_qdq_supported: + wblk = weight[:, block_start:block_end].clone() + errs = torch.zeros_like(wblk) + + for i in range(n_cols_blk): + w_ci = wblk[:, i] + d = h_inv_cho_blk[i, i] + qdq_col = col_qdq_fn(w_ci, block_start + i) + weight[:, block_start + i] = qdq_col + err = (w_ci - qdq_col) / d + wblk[:, i:].addr_(err, h_inv_cho_blk[i, i:], alpha=-1) + errs[:, i] = err + else: + wblk = weight.clone() + errs = torch.zeros_like(wblk[:, block_start:block_end]) + + for i in range(n_cols_blk): + w_ci = wblk[:, block_start + i] + d = h_inv_cho_blk[i, i] + qdq = quantizer(wblk) + weight[:, block_start + i] = qdq[:, block_start + i] + err = (w_ci - qdq[:, block_start + i]) / d + wblk[:, block_start + i : block_end].addr_(err, h_inv_cho_blk[i, i:], alpha=-1) + 
errs[:, i] = err + + weight[:, block_end:].addmm_(errs, h_inv[block_start:block_end, block_end:], alpha=-1) def gptq_lite( @@ -1883,3 +2020,141 @@ def _layer_forward_loop(m, _inputs=layer_inputs): torch.cuda.empty_cache() finally: input_getter._unpatch_all_layers() + + print_rank_0("Sequential calibration completed") + + +def _promote_nvfp4_static_quantizers(model: nn.Module) -> int: + """Convert eligible TensorQuantizers to NVFP4StaticQuantizer in-place. + + After max calibration sets per-block amax values, NVFP4 static quantizers + need to be promoted so they use the two-level scaling path (global amax + + per-block amax) instead of the generic E4M3 path. + + Returns the number of quantizers converted. + """ + converted = 0 + for _name, module in list(model.named_modules()): + if isinstance(module, TensorQuantizer) and not module._disabled: + if module._calibrator is not None and not module._dynamic and hasattr(module, "_amax"): + is_nvfp4_static = ( + module.is_static_block_quant + and module._num_bits == (2, 1) + and module._block_sizes is not None + and module._block_sizes.get("scale_bits") == (4, 3) + ) + if is_nvfp4_static: + initial_amax = module._amax.clone().detach() + global_amax = reduce_amax(initial_amax, axis=None) + NVFP4StaticQuantizer.from_tensor_quantizer(module, global_amax=global_amax) + converted += 1 + return converted + + +@torch.no_grad() +def gptq( + layer: nn.Module, + forward_loop: ForwardLoop, + percdamp: float = 0.01, + block_size: int = 128, + **kwargs, +): + """GPTQ quantization - a GPTQ variant. + + Args: + layer: A single decoder layer to quantize. + forward_loop: Callable that replays calibration inputs through the layer. + Provided by ``sequential_calibrate`` which captures per-layer activations. + percdamp: Percentage of avg Hessian diagonal for damping (default: 0.01). + block_size: Block size for GPTQ weight update. 
+ """ + import time + + total_start = time.time() + + # Set weight amax and activation amax for the current layer using max_calibrate + max_calibrate(layer, forward_loop=forward_loop) + + # Promote NVFP4 static quantizers so they use the two-level scaling path + n_promoted = _promote_nvfp4_static_quantizers(layer) + if n_promoted: + print_rank_0(f"Promoted {n_promoted} quantizer(s) to NVFP4StaticQuantizer") + + # Dictionary to store hessian matrices for all linear layers in this decoder + hessian_state = {} + + # Phase 1: Build tensor mapping for all quantized linear layers in this decoder layer + tensor_mapping = {} + for name, module in layer.named_modules(): + if is_quantized_linear(module) and module.weight_quantizer.is_enabled: + in_features = module.weight.shape[-1] + tensor_mapping[name] = ((in_features, in_features), module.weight.device) + module.name = name # Attach name for easy access in hooks + + if not tensor_mapping: + print_rank_0("No quantized linear layers found in decoder layer, skipping GPTQ") + return + + # Initialize hessian state with zeros + for name, (shape, device) in tensor_mapping.items(): + hessian_state[name] = { + "hessian": torch.zeros(shape, dtype=torch.float32, device=device), + "n_samples": 0, + } + + # Phase 2: Patch forwards to collect Hessians (similar to local_hessian_calibrate) + def _make_hessian_forward(module_name): + def hessian_forward(self, input, *args, **kwargs): + inp = input.to_local() if hasattr(input, "to_local") else input + state = hessian_state[module_name] + hessian, n_samples = update_hessian(inp, state["hessian"], state["n_samples"]) + hessian_state[module_name] = {"hessian": hessian, "n_samples": n_samples} + + self.weight_quantizer.disable() + out = self._forward_no_gptq_hessian(input, *args, **kwargs) + self.weight_quantizer.enable() + return out + + return hessian_forward + + patched_modules = [] + for name, module in layer.named_modules(): + if is_quantized_linear(module) and 
module.weight_quantizer.is_enabled: + bind_forward_method(module, _make_hessian_forward(name), "_forward_no_gptq_hessian") + patched_modules.append(module) + + # Run forward passes to collect Hessians + hessian_start = time.time() + print_rank_0(f"Computing Hessians for {len(tensor_mapping)} linear layers...") + forward_loop(layer) + + # Unpatch forwards + for module in patched_modules: + unpatch_forward_method(module, "_forward_no_gptq_hessian") + + torch.cuda.synchronize() if torch.cuda.is_available() else None + hessian_time = time.time() - hessian_start + + # Phase 3: Update weights using computed Hessians (same as gptq_lite) + weight_update_start = time.time() + print_rank_0("Updating weights using GPTQ algorithm...") + for name, module in layer.named_modules(): + if is_quantized_linear(module) and module.weight_quantizer.is_enabled: + state = hessian_state[module.name] + hessian = state["hessian"].to(module.weight.device) + blockwise_weight_update( + module, hessian, block_size, percdamp, n_samples=state["n_samples"] + ) + del hessian_state[module.name] + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + torch.cuda.synchronize() if torch.cuda.is_available() else None + weight_update_time = time.time() - weight_update_start + + total_time = time.time() - total_start + print_rank_0( + f"GPTQ timing - Hessian: {hessian_time:.2f}s, " + f"Weight update: {weight_update_time:.2f}s, " + f"Total: {total_time:.2f}s" + ) diff --git a/modelopt/torch/quantization/triton/gptq_fused_kernel.py b/modelopt/torch/quantization/triton/gptq_fused_kernel.py new file mode 100644 index 0000000000..21d84713a1 --- /dev/null +++ b/modelopt/torch/quantization/triton/gptq_fused_kernel.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Fused Triton kernel for the GPTQ blockwise weight-update inner loop. + +The standard GPTQ inner loop launches ~10-15 CUDA kernels per column +(amax lookup, FP4 quantization, error computation, rank-1 update). +For ``block_size=128`` that is ~1 500 kernel launches per block, each with +~5-10 us of launch overhead dominating actual compute. + +This module fuses the entire inner loop into a **single** Triton kernel per +block. Rows are independent and map to Triton programs; columns are processed +sequentially inside each program so the rank-1 error update is carried forward +without synchronisation. + +Supported quantisation format: **NVFP4 static block quantisation** (two-level +scaling with per-group amax and a global amax). +""" + +import torch +import triton +import triton.language as tl + +__all__ = ["gptq_fused_block"] + +# -- NVFP4 constants used by the kernel ------------------------------------ +# Maximum representable FP4-E2M1 value (1 + 1 + 0.5 = 6.0 when decoded via +# the standard E2M1 table: {0, 0.5, 1, 1.5, 2, 3, 4, 6}). +_FP4_MAX = 6.0 +# FP8-E4M3 has max representable value 448. 
+_FP8_E4M3_MAX = 448.0 + + +@triton.jit +def _gptq_fused_block_kernel( + w_ptr, # [num_rows, BLOCK_SIZE] working weight block (in-place) + qw_ptr, # [num_rows, BLOCK_SIZE] output: quantized weights + err_ptr, # [num_rows, BLOCK_SIZE] output: quantization errors + amax_ptr, # [num_rows, num_groups] per-group amax, row-major + global_amax_ptr, # scalar float32 on device + hinv_ptr, # [BLOCK_SIZE, BLOCK_SIZE] upper Cholesky of H^{-1} + num_rows, + num_groups, + group_size: tl.constexpr, + block_start, # column offset of this block in the full weight matrix + n_cols, # actual columns in this block (may be < BLOCK_SIZE) + BLOCK_SIZE: tl.constexpr, +): + """One program per row; sequentially quantizes columns, propagating errors.""" + row = tl.program_id(0) + if row >= num_rows: + return + + # Base pointers for this row + w_base = w_ptr + row * BLOCK_SIZE + qw_base = qw_ptr + row * BLOCK_SIZE + err_base = err_ptr + row * BLOCK_SIZE + amax_row_base = amax_ptr + row * num_groups + + # Pre-compute global FP8 scale factors (constant across columns) + global_amax = tl.load(global_amax_ptr).to(tl.float32) + global_scale = global_amax / 6.0 # _FP4_MAX + fp8_inv_scale = tl.where(global_scale > 0.0, 1.0 / (448.0 / global_scale), 0.0) + + j_range = tl.arange(0, BLOCK_SIZE) + + for i in range(BLOCK_SIZE): + wi = tl.load(w_base + i) + + # -- Compute NVFP4 two-level scale for this column's group ----------- + col_idx = block_start + i + group_idx = col_idx // group_size + raw_amax = tl.load(amax_row_base + group_idx).to(tl.float32) + raw_scale = raw_amax / 6.0 # _FP4_MAX + + # FP8-quantize the block scale: scale * fp8_scale -> cast E4M3 -> back + fp8_scale = tl.where(global_scale > 0.0, 448.0 / global_scale, 1.0) + si = (raw_scale * fp8_scale).to(tl.float8e4nv).to(tl.float32) * fp8_inv_scale + + # Guard: replace zero / nan / inf scale with 1.0 + # NOTE: ``si != si`` is the standard NaN check in Triton (no math.isnan). 
+        si_safe = tl.where(
+            (si == 0.0) | (si != si) | (tl.abs(si) == float("inf")),  # noqa: PLR0124
+            1.0,
+            si,
+        )
+
+        # -- FP4-E2M1 fake quantization (nearest-round to 8 levels) ----------
+        abs_scaled = tl.abs(wi) / si_safe
+        q_val = tl.where(
+            abs_scaled <= 0.25,
+            0.0,
+            tl.where(
+                abs_scaled < 0.75,
+                0.5,
+                tl.where(
+                    abs_scaled <= 1.25,
+                    1.0,
+                    tl.where(
+                        abs_scaled < 1.75,
+                        1.5,
+                        tl.where(
+                            abs_scaled <= 2.5,
+                            2.0,
+                            tl.where(abs_scaled < 3.5, 3.0, tl.where(abs_scaled <= 5.0, 4.0, 6.0)),
+                        ),
+                    ),
+                ),
+            ),
+        )
+
+        qi = q_val * si_safe * tl.where(wi >= 0.0, 1.0, -1.0)
+        tl.store(qw_base + i, qi)
+
+        # -- GPTQ error and rank-1 update ------------------------------------
+        di = tl.load(hinv_ptr + i * BLOCK_SIZE + i)
+        err_i = (wi - qi) / di
+        tl.store(err_base + i, err_i)
+
+        j_mask = (j_range > i) & (j_range < n_cols)
+        hinv_row = tl.load(hinv_ptr + i * BLOCK_SIZE + j_range, mask=j_mask, other=0.0)
+        w_rem = tl.load(w_base + j_range, mask=j_mask, other=0.0)
+        w_rem = w_rem - err_i * hinv_row
+        tl.store(w_base + j_range, w_rem, mask=j_mask)
+
+
+def gptq_fused_block(
+    w_block: torch.Tensor,
+    amax_grouped: torch.Tensor,
+    global_amax: torch.Tensor,
+    h_inv_cho_blk: torch.Tensor,
+    group_size: int,
+    block_start: int,
+    n_cols: int,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Run the GPTQ column loop for one block in a single Triton kernel launch.
+
+    Args:
+        w_block: Working weight block of shape ``[num_rows, block_size]`` (made contiguous and updated in place by the kernel).
+        amax_grouped: Per-group amax of shape ``[num_rows, num_groups]``.
+        global_amax: Scalar tensor with the global amax.
+        h_inv_cho_blk: Upper Cholesky factor of H^{-1}, shape ``[block_size, block_size]``.
+        group_size: NVFP4 quantization group size (typically 16).
+        block_start: Column offset of this block in the full weight matrix.
+        n_cols: Actual number of columns in this block (``<= block_size``).
+
+    Returns:
+        Tuple of ``(qw_block, err_block)`` each of shape ``[num_rows, block_size]``.
+ """ + num_rows, block_size = w_block.shape + num_groups = amax_grouped.shape[1] + + w_block = w_block.contiguous() + amax_grouped = amax_grouped.contiguous() + h_inv_cho_blk = h_inv_cho_blk.contiguous() + + qw_block = torch.empty_like(w_block) + err_block = torch.empty_like(w_block) + + grid = (num_rows,) + with torch.cuda.device(w_block.device): + _gptq_fused_block_kernel[grid]( + w_block, + qw_block, + err_block, + amax_grouped, + global_amax, + h_inv_cho_blk, + num_rows, + num_groups, + group_size, + block_start, + n_cols, + BLOCK_SIZE=block_size, + ) + + return qw_block, err_block diff --git a/tests/gpu/torch/quantization/test_gptq.py b/tests/gpu/torch/quantization/test_gptq.py index 0c60bcd007..23bdf6cbff 100644 --- a/tests/gpu/torch/quantization/test_gptq.py +++ b/tests/gpu/torch/quantization/test_gptq.py @@ -20,7 +20,16 @@ from transformers import AutoModelForCausalLM, AutoTokenizer import modelopt.torch.quantization as mtq -from modelopt.torch.quantization.model_calib import blockwise_weight_update, update_hessian +from modelopt.torch.export.unified_export_hf import _export_quantized_weight +from modelopt.torch.quantization.model_calib import ( + _blockwise_weight_update_fused, + _blockwise_weight_update_unfused, + blockwise_weight_update, + prepare_hessian_inverse, + update_hessian, +) +from modelopt.torch.quantization.nn import NVFP4StaticQuantizer +from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor from modelopt.torch.utils.dataset_utils import create_forward_loop, get_dataset_dataloader RAND_SEED = 42 @@ -156,6 +165,91 @@ def test_gptq_updates(block_size, dim, model_weight, expect_weight_change): assert torch.allclose(model.weight.data, q_dq_weight), "Weight should be equal" +def test_gptq_export_roundtrip(): + """Test that GPTQ export + dequantize produces weights matching in-memory QDQ.""" + torch.manual_seed(RAND_SEED) + dim = 128 + block_size = 4 + + # Step 1: Create a simple linear model and quantize to install NVFP4 
quantizers + model = torch.nn.Linear(dim, dim).to("cuda") + model.name = "linear" + original_weight = model.weight.data.clone() + input_tensor = torch.randn(2, 16, dim).to("cuda") + quant_cfg = mtq.NVFP4_DEFAULT_CFG + + mtq.quantize(model, quant_cfg, forward_loop=lambda m: m(input_tensor)) + + # Restore original weight before GPTQ + model.weight.data = original_weight.clone() + + # Step 2: Perform GPTQ — compute Hessian and update weights + hessian = torch.zeros(dim, dim, dtype=torch.float32) + n_samples = 0 + hessian, n_samples = update_hessian(input_tensor, hessian, n_samples) + hessian = hessian.to("cuda") + + blockwise_weight_update(model, hessian, block_size, percdamp=0.1) + + # Save the QDQ reference from the quantizer applied to GPTQ'd weights + gptq_weight_shape = model.weight.data.shape + gptq_weight_dtype = model.weight.data.dtype + qdq_ref = model.weight.data.clone() + + # Step 3: Export — converts weight to packed NVFP4 and registers scale buffers + _export_quantized_weight(model, torch.bfloat16) + + # Verify export produced the expected buffers + assert hasattr(model, "weight_scale"), "Export should register weight_scale buffer" + assert hasattr(model, "weight_scale_2"), "Export should register weight_scale_2 buffer" + + # Step 4: Dequantize the exported packed weight and compare with QDQ reference + packed_weight = model.weight.data + weight_scale = model.weight_scale + weight_scale_2 = model.weight_scale_2 + + nvfp4_qtensor = NVFP4QTensor(gptq_weight_shape, gptq_weight_dtype, packed_weight) + deq_weight = nvfp4_qtensor.dequantize( + dtype=torch.bfloat16, + scale=weight_scale, + double_scale=weight_scale_2, + block_sizes={-1: 16}, + ) + + assert deq_weight.shape == qdq_ref.shape, ( + f"Shape mismatch: dequantized {deq_weight.shape} vs QDQ ref {qdq_ref.shape}" + ) + diff = (deq_weight - qdq_ref.to(torch.bfloat16)).abs() + max_diff = diff.max().item() + max_diff_idx = diff.argmax().item() + max_diff_row = max_diff_idx // deq_weight.shape[1] + 
max_diff_col = max_diff_idx % deq_weight.shape[1] + num_mismatched = (diff > 1e-3).sum().item() + total_elements = diff.numel() + + print("\n--- Diff Stats ---") + print(f" Max diff: {max_diff}") + print(f" Mean diff: {diff.mean().item()}") + print(f" Median diff: {diff.median().item()}") + print(f" Std diff: {diff.std().item()}") + print( + f" Mismatched (>1e-3): {num_mismatched}/{total_elements} " + f"({100 * num_mismatched / total_elements:.2f}%)" + ) + print( + f" Max diff at [{max_diff_row}, {max_diff_col}]: " + f"deq={deq_weight[max_diff_row, max_diff_col].item()}, " + f"qdq_ref={qdq_ref[max_diff_row, max_diff_col].item()}" + ) + + assert torch.allclose(deq_weight, qdq_ref.to(torch.bfloat16), atol=1e-2), ( + f"Dequantized weight does not match QDQ reference. " + f"Max diff: {max_diff} at [{max_diff_row}, {max_diff_col}] " + f"(deq={deq_weight[max_diff_row, max_diff_col].item()}, " + f"qdq_ref={qdq_ref[max_diff_row, max_diff_col].item()})" + ) + + @pytest.mark.parametrize( "quant_cfg", [mtq.NVFP4_DEFAULT_CFG, mtq.FP8_DEFAULT_CFG, mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG] ) @@ -208,3 +302,87 @@ def test_gptq_e2e_flow(quant_cfg): print( f"Generated ids after quantization: {tokenizer.decode(generated_ids_after_ptq[0], skip_special_tokens=True)}" ) + + +@pytest.mark.parametrize("dim", [256, 512]) +def test_fused_vs_unfused_nvfp4(dim): + """Verify that the fused Triton GPTQ kernel produces equivalent results to the unfused path. + + The fused kernel computes NVFP4 quantisation inline using Triton intrinsics, + which can differ slightly from the PyTorch-level quantiser path (different FP + rounding order). On real models (dim >= 4096) the relative MSE difference is + typically < 0.1%; at the smaller dims used here the tolerance is set to 20%. 
+ """ + from modelopt.torch.quantization.model_calib import _promote_nvfp4_static_quantizers + + torch.manual_seed(RAND_SEED) + block_size = min(128, dim) + + # NVFP4_WEIGHT_ONLY_GPTQ_CFG uses *static* blocks, which get promoted to + # NVFP4StaticQuantizer — the prerequisite for the fused Triton path. + quant_cfg = copy.deepcopy(mtq.NVFP4_WEIGHT_ONLY_GPTQ_CFG) + quant_cfg["algorithm"] = "max" # calibrate only, don't run GPTQ + + model = torch.nn.Linear(dim, dim, bias=False).to("cuda") + model.name = "test_fused" + original_weight = model.weight.data.clone() + inp = torch.randn(4, 32, dim, device="cuda") + + mtq.quantize(model, quant_cfg, forward_loop=lambda m: m(inp)) + + # Promote to NVFP4StaticQuantizer (normally done by gptq / sequential_calibrate) + n_promoted = _promote_nvfp4_static_quantizers(model) + assert n_promoted > 0, "Expected at least one quantizer to be promoted" + + quantizer = model.weight_quantizer + assert isinstance(quantizer, NVFP4StaticQuantizer), ( + f"Expected NVFP4StaticQuantizer, got {type(quantizer).__name__}" + ) + + # Restore original weight and compute Hessian + model.weight.data = original_weight.clone() + hessian = torch.zeros(dim, dim, dtype=torch.float32) + n_samples = 0 + hessian, n_samples = update_hessian(inp, hessian, n_samples) + hessian = hessian.to("cuda") + + # --- Run fused path --- + weight_fused = original_weight.float().clone() + num_rows, num_cols = weight_fused.shape + h_inv = prepare_hessian_inverse(hessian, weight_fused, percdamp=0.01) + _blockwise_weight_update_fused(weight_fused, h_inv, quantizer, num_rows, num_cols, block_size) + + # --- Run unfused path --- + weight_unfused = original_weight.float().clone() + h_inv_unfused = prepare_hessian_inverse(hessian, weight_unfused, percdamp=0.01) + _blockwise_weight_update_unfused( + weight_unfused, h_inv_unfused, quantizer, num_cols, block_size, None, False + ) + + # Both paths must produce non-trivial updates + assert not torch.equal(weight_fused, 
original_weight.float()), ( + "Fused path did not update weights" + ) + assert not torch.equal(weight_unfused, original_weight.float()), ( + "Unfused path did not update weights" + ) + + # Compare Hessian-weighted relative MSE + def _relative_mse(q, w, h): + delta = q - w + return (delta.mm(h).mul(delta).mean() / (w.mm(h).mul(w).mean() + 1e-6)).item() + + orig_f = original_weight.float() + mse_fused = _relative_mse(weight_fused, orig_f, hessian) + mse_unfused = _relative_mse(weight_unfused, orig_f, hessian) + + assert mse_fused > 0, "Fused MSE should be positive" + assert mse_unfused > 0, "Unfused MSE should be positive" + + # At small test dimensions, inline Triton FP4 rounding can diverge up to ~15% + # from the PyTorch path. On production-scale layers this drops below 0.1%. + relative_mse_diff = abs(mse_fused - mse_unfused) / max(mse_fused, mse_unfused) + assert relative_mse_diff < 0.20, ( + f"Fused ({mse_fused:.6e}) and unfused ({mse_unfused:.6e}) MSE differ by " + f"{relative_mse_diff:.2%}, expected < 20%" + )