From 2333727317b91ec414444da21c3dfa41807e3289 Mon Sep 17 00:00:00 2001
From: Siddartha Pothapragada <sidart@meta.com>
Date: Tue, 23 Jun 2026 18:13:50 -0700
Subject: [PATCH 1/2] Qualcomm: cap inf replacement value to fix 16a16w
 accuracy regression

PR #19660 folded ReplaceInfValues into QnnQuantizer._replace_inf and made
the inf stand-in equal to the full quant range. For 16a16w that is 65535
(vs the previous fixed 255), which blows up the attention-mask quant scale
and breaks stories110M decoding in test-llama-runner-qnn-linux. Cap the
magnitude at 255 to restore prior behavior; 8a8w is unaffected.
---
 backends/qualcomm/quantizer/quantizer.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py
index c6fbc51484f..c21f9d90411 100644
--- a/backends/qualcomm/quantizer/quantizer.py
+++ b/backends/qualcomm/quantizer/quantizer.py
@@ -416,7 +416,12 @@ def _get_quant_range(self, node):
                 if quant_info.output_qspec.quant_min is None
                 else quant_info.output_qspec.quant_min
             )
-            return quant_range
+            # Cap the inf stand-in so it does not dominate the tensor's
+            # dynamic range. For >8-bit activations the full range (e.g.
+            # 65535 for uint16) would blow up the attention-mask quant scale
+            # and wreck accuracy; 255 (the previously fixed stand-in) keeps a
+            # reasonable scale for Llama-style attention masks.
+            return min(quant_range, 255)
 
     def _get_candidates_with_infinity_args(self, graph_module: GraphModule):
         binary_op_sources = [

From 67f78b63c8a2b3dc05884595e4a8c2a56df95e24 Mon Sep 17 00:00:00 2001
From: Siddartha Pothapragada <sidart@meta.com>
Date: Thu, 25 Jun 2026 00:04:11 -0700
Subject: [PATCH 2/2] Bump test-llama-runner-qnn-linux to linux.4xlarge.memory

The 16a16w export+compile is OOM-killed on linux.2xlarge. Use a larger
runner to verify whether the accuracy fix works once memory is sufficient.
---
 .github/workflows/pull.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index eb1a414531b..eb65968ab1a 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -856,7 +856,7 @@ jobs:
         mode: [qnn]
       fail-fast: false
     with:
-      runner: linux.2xlarge
+      runner: linux.4xlarge.memory
       docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}