diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index eb1a414531b..eb65968ab1a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -856,7 +856,7 @@ jobs: mode: [qnn] fail-fast: false with: - runner: linux.2xlarge + runner: linux.4xlarge.memory docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index c6fbc51484f..c21f9d90411 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -416,7 +416,12 @@ def _get_quant_range(self, node): if quant_info.output_qspec.quant_min is None else quant_info.output_qspec.quant_min ) - return quant_range + # Cap the inf stand-in so it does not dominate the tensor's + # dynamic range. For >8-bit activations the full range (e.g. + # 65535 for uint16) would blow up the attention-mask quant scale + # and wreck accuracy; 255 keeps a reasonable scale for + # Llama-style attention masks. + return min(quant_range, 255) def _get_candidates_with_infinity_args(self, graph_module: GraphModule): binary_op_sources = [