Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 75 additions & 3 deletions src/main/java/com/astrazeneca/vardict/modules/CigarModifier.java
Original file line number Diff line number Diff line change
Expand Up @@ -619,7 +619,22 @@ private boolean threeIndels(jregex.Matcher matcher, boolean flag) {
newCigarStr += tslen + "I" + rm + "M";
}
} else {
newCigarStr += dlen + "D" + rm + "M";
if (rm < 0) {
// rn ate into the trailing M after consuming all interior bases;
// absorb the negative rm into RDOFF. RDOFF stays > 0 for any input
// with a leading M >= 1 (algebra: RDOFF + rm = g2 + rn + (g_lastM + tslen)
// where rn >= mid + |rm-floor|, so RDOFF + rm >= g2 + g_lastM > 0). Guard
// against pathological inputs where this invariant is violated.
if (RDOFF + rm <= 0) {
throw new IllegalStateException(
"CigarModifier merge would yield non-positive RDOFF (="
+ (RDOFF + rm) + ") for cigar '" + originalCigar + "'");
}
RDOFF = RDOFF + rm;
newCigarStr = RDOFF + "M" + dlen + "D";
} else {
newCigarStr += dlen + "D" + rm + "M";
}
}
} else {
if (dlen == 0) {
Expand Down Expand Up @@ -684,11 +699,42 @@ private boolean threeDeletions(jregex.Matcher matcher, boolean flag) {
RDOFF += rn;
dlen -= rn;
tslen -= rn;
// Edge cases mirror threeIndels(): when rn outruns the interior matched bases
// (e.g. inside a homopolymer that extends past the indel cluster), tslen and/or rm
// become negative. Without the rm < 0 branch in the dlen > 0 case below, the merge
// would emit a malformed CIGAR like "...15D12I-1M9D27M". See regression test
// modifyCigarOnHomopolymerThreeDeletionsDoesNotProduceMalformedCigar.
String newCigarStr = RDOFF + "M";
if (tslen <= 0) {
dlen -= tslen;
rm += tslen;
newCigarStr += dlen + "D" + rm + "M";
if (dlen == 0) {
RDOFF = RDOFF + rm;
newCigarStr = RDOFF + "M";
} else if (dlen < 0) {
int tslen2 = -dlen;
rm += dlen;
if (rm < 0) {
RDOFF = RDOFF + rm;
newCigarStr = RDOFF + "M" + tslen2 + "I";
} else {
newCigarStr += tslen2 + "I" + rm + "M";
}
} else {
if (rm < 0) {
// rn consumed more than the interior 'mid'; the extra bases come
// from the trailing M, so absorb the negative rm into RDOFF.
if (RDOFF + rm <= 0) {
throw new IllegalStateException(
"CigarModifier merge would yield non-positive RDOFF (="
+ (RDOFF + rm) + ") for cigar '" + originalCigar + "'");
}
RDOFF = RDOFF + rm;
newCigarStr = RDOFF + "M" + dlen + "D";
} else {
newCigarStr += dlen + "D" + rm + "M";
}
}
} else {
newCigarStr += dlen + "D" + tslen + "I" + rm + "M";
}
Expand Down Expand Up @@ -745,11 +791,37 @@ private boolean twoDeletionsInsertionToComplex(jregex.Matcher matcher, boolean f
dlen -= rn;
tslen -= rn;

// Edge cases mirror threeIndels()/threeDeletions(); see the matching comment in
// threeDeletions() for the rationale behind the dlen/rm < 0 branches.
String newCigarStr = RDOFF + "M";
if (tslen <= 0) {
dlen -= tslen;
rm += tslen;
newCigarStr += dlen + "D" + rm + "M";
if (dlen == 0) {
RDOFF = RDOFF + rm;
newCigarStr = RDOFF + "M";
} else if (dlen < 0) {
int tslen2 = -dlen;
rm += dlen;
if (rm < 0) {
RDOFF = RDOFF + rm;
newCigarStr = RDOFF + "M" + tslen2 + "I";
} else {
newCigarStr += tslen2 + "I" + rm + "M";
}
} else {
if (rm < 0) {
if (RDOFF + rm <= 0) {
throw new IllegalStateException(
"CigarModifier merge would yield non-positive RDOFF (="
+ (RDOFF + rm) + ") for cigar '" + originalCigar + "'");
}
RDOFF = RDOFF + rm;
newCigarStr = RDOFF + "M" + dlen + "D";
} else {
newCigarStr += dlen + "D" + rm + "M";
}
}
} else {
newCigarStr += dlen + "D" + tslen + "I" + rm + "M";
}
Expand Down
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a defensive change that prevents similar issues to have an operational impact, but it has the risk of hiding issues in future. Happy to drop this from the PR. Let me know what you think.

Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,8 @@ public static boolean isHasAndEquals(Map<Integer, Character> ref, int index1, St
Character refc = ref.get(index1);
if (refc == null)
return false;
//if (index2 < 0) index2 = index2 + str.length();
if (index2 < 0 || index2 >= str.length())
return false;
return refc.equals(str.charAt(index2));
}

Expand All @@ -494,7 +495,8 @@ public static boolean isHasAndNotEquals(Map<Integer, Character> ref, int index1,
Character refc = ref.get(index1);
if (refc == null)
return false;
//if (index2 < 0) index2 = index2 + str.length();
if (index2 < 0 || index2 >= str.length())
return false;
return !refc.equals(str.charAt(index2));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.TextCigarCodec;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.DataProvider;
Expand Down Expand Up @@ -334,4 +335,83 @@ public void threeIndels(final String initialCigar, final String expected, final
String actual = cigarModifier.modifyCigar().cigar;
Assert.assertEquals(actual, expected);
}

/**
* Regression test for a malformed CIGAR produced by the merge logic when a read aligns
* inside a long homopolymer with three closely-spaced deletions.
*
* Reproducer is taken verbatim from a duplex consensus read observed in production:
* read name : duplex_GTGTC-GTCAC_140434267-140434623_817012 (mate2)
* chr/pos : 7:140434471 (hg19)
* cigar : 104M1I2M2D1M1D5M1D2M9D27M
*
* Without a fix, threeDeletions()/captureMisSoftly3Mismatches() let the matching-base
* scan run past the read end, producing the malformed intermediate CIGAR
* 104M15D12I-1M9D27M
* (note the negative -1M operator) and then throwing
* StringIndexOutOfBoundsException: Index 143 out of bounds for length 142
* inside VariationUtils.isHasAndNotEquals.
*
* The expected behaviour is: modifyCigar() must NOT throw, and must return a CIGAR
* that htsjdk's TextCigarCodec can decode (i.e. all operator lengths positive).
*/
@Test
public void modifyCigarOnHomopolymerThreeDeletionsDoesNotProduceMalformedCigar() {
// Read sequence and qualities are 142 nt (the bug reports "Index 143 out of bounds for length 142").
final String querySequence =
"AATCCTCTGTTTGGAAACCAGCCCGATTCAAGGAGGGTTCTGATGCACTGCGGTGAATTTTTGGCAATGAGCGGGCCAGCAGCTCAATAGAGGCGAGAATCTAC"
+ "AAAAAAAAAAAAAAAAAAAAGAAAAAAGAAAAAAAAAG";
final String queryQuality =
"BBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"
+ "=74??44444444FFFFFFFFFFFFFFFFFFFFFFFFFFFFF";
Assert.assertEquals(querySequence.length(), 142);
Assert.assertEquals(queryQuality.length(), 142);

// Reference window covering position .. position + cigarRefSpan(154) with a small margin,
// taken from human_g1k_v37.fa at chr7:140434471-140434670. Reference is 1-indexed in the test,
// so reference position 1 corresponds to the read alignment start.
final int position = 1;
final String refSequence =
"AATCCTCTGTTTGGAAACCAGCCCGATTCAAGGAGGGTTCTGATGCACTGCGGTGAATTTTTGGCAATGAGCGGGCCAGCAGCTCAATAGAGGCGAGAATCTAC"
+ "AAAAAAAAAAAGAAAAAAAAAAGAAAAAAAAAGAAAAAAGAAAAAAAAAGAAAGAAAGAAAAAGAAAAAACAGAAAGAAGAATGAAAATCTGGGTG";

Map<Integer, Character> reference = new HashMap<>();
for (int i = 0; i < refSequence.length(); i++) {
reference.put(i + 1, refSequence.charAt(i));
}
Reference ref = new Reference();
ref.referenceSequences = reference;

// GlobalReadOnlyScope is required by some merge-helper paths.
Configuration config = new Configuration();
config.goodq = 23;
config.vext = 3;
GlobalReadOnlyScope.init(config, null, null, null, "", new HashMap<>(), new HashMap<>());

Region region = new Region("7", 1, 700, "");

final String initialCigar = "104M1I2M2D1M1D5M1D2M9D27M";

CigarModifier cigarModifier = new CigarModifier(
position, initialCigar, querySequence, queryQuality,
ref, /*indel=*/3, region, /*maxReadLength=*/150);

String modifiedCigar = cigarModifier.modifyCigar().cigar;

// Hard requirement #1: htsjdk must be able to decode the result. This currently fails
// with "Malformed CIGAR string: 104M15D12I-1M9D27M".
try {
TextCigarCodec.decode(modifiedCigar);
} catch (IllegalArgumentException e) {
Assert.fail("modifyCigar() returned a CIGAR htsjdk cannot decode: '"
+ modifiedCigar + "' (" + e.getMessage() + ")");
}

// Hard requirement #2: every operator length must be positive.
for (CigarElement el : TextCigarCodec.decode(modifiedCigar).getCigarElements()) {
Assert.assertTrue(el.getLength() > 0,
"modifyCigar() produced non-positive length for operator "
+ el.getOperator() + " in CIGAR '" + modifiedCigar + "'");
}
}
}