From 17d1cb5197aa639f6a17d5443f682d7b425d91e9 Mon Sep 17 00:00:00 2001 From: Jonas Behr Date: Tue, 28 Apr 2026 13:55:30 +0200 Subject: [PATCH] handling crash related to generated negative M op from valid CIGAR --- .../vardict/modules/CigarModifier.java | 78 +++++++++++++++++- .../vardict/variations/VariationUtils.java | 6 +- .../vardict/modules/CigarModifierTest.java | 80 +++++++++++++++++++ 3 files changed, 159 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/astrazeneca/vardict/modules/CigarModifier.java b/src/main/java/com/astrazeneca/vardict/modules/CigarModifier.java index d8cffe66..390ec72b 100644 --- a/src/main/java/com/astrazeneca/vardict/modules/CigarModifier.java +++ b/src/main/java/com/astrazeneca/vardict/modules/CigarModifier.java @@ -619,7 +619,22 @@ private boolean threeIndels(jregex.Matcher matcher, boolean flag) { newCigarStr += tslen + "I" + rm + "M"; } } else { - newCigarStr += dlen + "D" + rm + "M"; + if (rm < 0) { + // rn ate into the trailing M after consuming all interior bases; + // absorb the negative rm into RDOFF. RDOFF stays > 0 for any input + // with a leading M >= 1 (algebra: RDOFF + rm = g2 + rn + (g_lastM + tslen) + // where rn >= mid + |rm-floor|, so RDOFF + rm >= g2 + g_lastM > 0). Guard + // against pathological inputs where this invariant is violated. + if (RDOFF + rm <= 0) { + throw new IllegalStateException( + "CigarModifier merge would yield non-positive RDOFF (=" + + (RDOFF + rm) + ") for cigar '" + originalCigar + "'"); + } + RDOFF = RDOFF + rm; + newCigarStr = RDOFF + "M" + dlen + "D"; + } else { + newCigarStr += dlen + "D" + rm + "M"; + } } } else { if (dlen == 0) { @@ -684,11 +699,42 @@ private boolean threeDeletions(jregex.Matcher matcher, boolean flag) { RDOFF += rn; dlen -= rn; tslen -= rn; + // Edge cases mirror threeIndels(): when rn outruns the interior matched bases + // (e.g. inside a homopolymer that extends past the indel cluster), tslen and/or rm + // become negative. Without the rm < 0 branch in the dlen > 0 case below, the merge + // would emit a malformed CIGAR like "...15D12I-1M9D27M". See regression test + // modifyCigarOnHomopolymerThreeDeletionsDoesNotProduceMalformedCigar. String newCigarStr = RDOFF + "M"; if (tslen <= 0) { dlen -= tslen; rm += tslen; - newCigarStr += dlen + "D" + rm + "M"; + if (dlen == 0) { + RDOFF = RDOFF + rm; + newCigarStr = RDOFF + "M"; + } else if (dlen < 0) { + int tslen2 = -dlen; + rm += dlen; + if (rm < 0) { + RDOFF = RDOFF + rm; + newCigarStr = RDOFF + "M" + tslen2 + "I"; + } else { + newCigarStr += tslen2 + "I" + rm + "M"; + } + } else { + if (rm < 0) { + // rn consumed more than the interior 'mid'; the extra bases come + // from the trailing M, so absorb the negative rm into RDOFF. + if (RDOFF + rm <= 0) { + throw new IllegalStateException( + "CigarModifier merge would yield non-positive RDOFF (=" + + (RDOFF + rm) + ") for cigar '" + originalCigar + "'"); + } + RDOFF = RDOFF + rm; + newCigarStr = RDOFF + "M" + dlen + "D"; + } else { + newCigarStr += dlen + "D" + rm + "M"; + } + } } else { newCigarStr += dlen + "D" + tslen + "I" + rm + "M"; } @@ -745,11 +791,37 @@ private boolean twoDeletionsInsertionToComplex(jregex.Matcher matcher, boolean f dlen -= rn; tslen -= rn; + // Edge cases mirror threeIndels()/threeDeletions(); see the matching comment in + // threeDeletions() for the rationale behind the dlen/rm < 0 branches. String newCigarStr = RDOFF + "M"; if (tslen <= 0) { dlen -= tslen; rm += tslen; - newCigarStr += dlen + "D" + rm + "M"; + if (dlen == 0) { + RDOFF = RDOFF + rm; + newCigarStr = RDOFF + "M"; + } else if (dlen < 0) { + int tslen2 = -dlen; + rm += dlen; + if (rm < 0) { + RDOFF = RDOFF + rm; + newCigarStr = RDOFF + "M" + tslen2 + "I"; + } else { + newCigarStr += tslen2 + "I" + rm + "M"; + } + } else { + if (rm < 0) { + if (RDOFF + rm <= 0) { + throw new IllegalStateException( + "CigarModifier merge would yield non-positive RDOFF (=" + + (RDOFF + rm) + ") for cigar '" + originalCigar + "'"); + } + RDOFF = RDOFF + rm; + newCigarStr = RDOFF + "M" + dlen + "D"; + } else { + newCigarStr += dlen + "D" + rm + "M"; + } + } } else { newCigarStr += dlen + "D" + tslen + "I" + rm + "M"; } diff --git a/src/main/java/com/astrazeneca/vardict/variations/VariationUtils.java b/src/main/java/com/astrazeneca/vardict/variations/VariationUtils.java index f2963635..27fd349b 100644 --- a/src/main/java/com/astrazeneca/vardict/variations/VariationUtils.java +++ b/src/main/java/com/astrazeneca/vardict/variations/VariationUtils.java @@ -479,7 +479,8 @@ public static boolean isHasAndEquals(Map ref, int index1, St Character refc = ref.get(index1); if (refc == null) return false; - //if (index2 < 0) index2 = index2 + str.length(); + if (index2 < 0 || index2 >= str.length()) + return false; return refc.equals(str.charAt(index2)); } @@ -494,7 +495,8 @@ public static boolean isHasAndNotEquals(Map ref, int index1, Character refc = ref.get(index1); if (refc == null) return false; - //if (index2 < 0) index2 = index2 + str.length(); + if (index2 < 0 || index2 >= str.length()) + return false; return !refc.equals(str.charAt(index2)); } diff --git a/src/test/java/com/astrazeneca/vardict/modules/CigarModifierTest.java b/src/test/java/com/astrazeneca/vardict/modules/CigarModifierTest.java index 984d23b6..2723dde5 100644 --- a/src/test/java/com/astrazeneca/vardict/modules/CigarModifierTest.java +++ b/src/test/java/com/astrazeneca/vardict/modules/CigarModifierTest.java @@ -8,6 +8,7 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.TextCigarCodec; import org.testng.Assert; import org.testng.annotations.AfterMethod; import org.testng.annotations.DataProvider; @@ -334,4 +335,83 @@ public void threeIndels(final String initialCigar, final String expected, final String actual = cigarModifier.modifyCigar().cigar; Assert.assertEquals(actual, expected); } + + /** + * Regression test for a malformed CIGAR produced by the merge logic when a read aligns + * inside a long homopolymer with three closely-spaced deletions. + * + * Reproducer is taken verbatim from a duplex consensus read observed in production: + * read name : duplex_GTGTC-GTCAC_140434267-140434623_817012 (mate2) + * chr/pos : 7:140434471 (hg19) + * cigar : 104M1I2M2D1M1D5M1D2M9D27M + * + * Without a fix, threeDeletions()/captureMisSoftly3Mismatches() let the matching-base + * scan run past the read end, producing the malformed intermediate CIGAR + * 104M15D12I-1M9D27M + * (note the negative -1M operator) and then throwing + * StringIndexOutOfBoundsException: Index 143 out of bounds for length 142 + * inside VariationUtils.isHasAndNotEquals. + * + * The expected behaviour is: modifyCigar() must NOT throw, and must return a CIGAR + * that htsjdk's TextCigarCodec can decode (i.e. all operator lengths positive). + */ + @Test + public void modifyCigarOnHomopolymerThreeDeletionsDoesNotProduceMalformedCigar() { + // Read sequence and qualities are 142 nt (the bug reports "Index 143 out of bounds for length 142"). + final String querySequence = + "AATCCTCTGTTTGGAAACCAGCCCGATTCAAGGAGGGTTCTGATGCACTGCGGTGAATTTTTGGCAATGAGCGGGCCAGCAGCTCAATAGAGGCGAGAATCTAC" + + "AAAAAAAAAAAAAAAAAAAAGAAAAAAGAAAAAAAAAG"; + final String queryQuality = + "BBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF" + + "=74??44444444FFFFFFFFFFFFFFFFFFFFFFFFFFFFF"; + Assert.assertEquals(querySequence.length(), 142); + Assert.assertEquals(queryQuality.length(), 142); + + // Reference window covering position .. position + cigarRefSpan(154) with a small margin, + // taken from human_g1k_v37.fa at chr7:140434471-140434670. Reference is 1-indexed in the test, + // so reference position 1 corresponds to the read alignment start. + final int position = 1; + final String refSequence = + "AATCCTCTGTTTGGAAACCAGCCCGATTCAAGGAGGGTTCTGATGCACTGCGGTGAATTTTTGGCAATGAGCGGGCCAGCAGCTCAATAGAGGCGAGAATCTAC" + + "AAAAAAAAAAAGAAAAAAAAAAGAAAAAAAAAGAAAAAAGAAAAAAAAAGAAAGAAAGAAAAAGAAAAAACAGAAAGAAGAATGAAAATCTGGGTG"; + + Map reference = new HashMap<>(); + for (int i = 0; i < refSequence.length(); i++) { + reference.put(i + 1, refSequence.charAt(i)); + } + Reference ref = new Reference(); + ref.referenceSequences = reference; + + // GlobalReadOnlyScope is required by some merge-helper paths. + Configuration config = new Configuration(); + config.goodq = 23; + config.vext = 3; + GlobalReadOnlyScope.init(config, null, null, null, "", new HashMap<>(), new HashMap<>()); + + Region region = new Region("7", 1, 700, ""); + + final String initialCigar = "104M1I2M2D1M1D5M1D2M9D27M"; + + CigarModifier cigarModifier = new CigarModifier( + position, initialCigar, querySequence, queryQuality, + ref, /*indel=*/3, region, /*maxReadLength=*/150); + + String modifiedCigar = cigarModifier.modifyCigar().cigar; + + // Hard requirement #1: htsjdk must be able to decode the result. This currently fails + // with "Malformed CIGAR string: 104M15D12I-1M9D27M". + try { + TextCigarCodec.decode(modifiedCigar); + } catch (IllegalArgumentException e) { + Assert.fail("modifyCigar() returned a CIGAR htsjdk cannot decode: '" + + modifiedCigar + "' (" + e.getMessage() + ")"); + } + + // Hard requirement #2: every operator length must be positive. + for (CigarElement el : TextCigarCodec.decode(modifiedCigar).getCigarElements()) { + Assert.assertTrue(el.getLength() > 0, + "modifyCigar() produced non-positive length for operator " + + el.getOperator() + " in CIGAR '" + modifiedCigar + "'"); + } + } } \ No newline at end of file