Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ import com.fulcrumgenomics.umi.UmiConsensusCaller.{ConsensusKvMetric, ReadType,
import com.fulcrumgenomics.util.NumericTypes.PhredScore
import com.fulcrumgenomics.util.Sequences
import htsjdk.samtools.SAMTag
import htsjdk.samtools.SamPairUtil.PairOrientation

import scala.collection.mutable

/**
* Consensus caller for CODEC sequencing[1]. In CODEC each read-pair has an R1 which is generated from one
Expand Down Expand Up @@ -170,10 +173,25 @@ class CodecConsensusCaller(readNamePrefix: String,
Nil
}
else {
// Extract just the primary alignments and then clip where they extend past the ends of their mates
// TODO: Remove the isFrPair here and handle chimeric reads or reads with one unmapped etc.
val primaries = pairs.filterNot(r => r.secondary || r.supplementary).filter(_.isFrPair)
primaries.groupBy(_.name).foreach { case (_, Seq(rec, mate)) => clipper.clipExtendingPastMateEnds(rec, mate) }
// Group the primary alignments by read name and retain only templates that form a single
// primary FR pair. The FR check is performed at the pair level (via `isPrimaryFrPair`)
// because htsjdk's per-record `SamPairUtil.getPairOrientation` historically disagreed
// between mates on dovetail pairs whose aligned ends coincided, which previously caused a
// `scala.MatchError` on a singleton read-name bucket here. See
// https://github.com/samtools/htsjdk/pull/1771 for the upstream fix (htsjdk 5.0.0+).
// Template order is preserved from BAM-iteration order (not hash order) so that downstream
// `maxBy` tie-breaks on `cigar.lengthOnTarget` are stable across JVMs.
// TODO: handle chimeric reads or reads with one unmapped etc.
val primaryPairs = CodecConsensusCaller.orderedPrimaryPairs(
pairs.filterNot(r => r.secondary || r.supplementary)
)
val (frPairs, nonFrPairs) = primaryPairs.partition {
case (_, Seq(a, b)) => CodecConsensusCaller.isPrimaryFrPair(a, b)
case _ => false
}
rejectRecords(nonFrPairs.flatMap(_._2), RejectionReason.NotPrimaryFrPair)
frPairs.foreach { case (_, Seq(rec, mate)) => clipper.clipExtendingPastMateEnds(rec, mate) }
val primaries = frPairs.flatMap(_._2)

val r1s = filterToMostCommonAlignment(primaries.filter(_.firstOfPair).map(toSourceReadForCodec))
val r2s = filterToMostCommonAlignment(primaries.filter(_.secondOfPair).map(toSourceReadForCodec))
Expand Down Expand Up @@ -375,3 +393,42 @@ class CodecConsensusCaller(readNamePrefix: String,
)
}
}

object CodecConsensusCaller {
/** Groups primary records by read name while preserving BAM-iteration order of templates.
*
* Scala's `Seq.groupBy` returns a hash-keyed `Map`, so calling `.toSeq` on it produces a
* template order that depends on the JVM's `String#hashCode` and the `HashMap` layout.
* Downstream code selects the longest R1 and R2 via `maxBy(_.cigar.lengthOnTarget)`, which
* returns the first element in iteration order on ties; relying on hash order there yields
* non-obvious, JVM-dependent winners. This helper instead orders templates by the first
* occurrence of each read name in the input (i.e. BAM-iteration order), so tie-breaks are
* stable and match the order the records were observed.
*/
private[umi] def orderedPrimaryPairs(records: Seq[SamRecord]): Seq[(String, Seq[SamRecord])] = {
val byName = mutable.LinkedHashMap.empty[String, mutable.ArrayBuffer[SamRecord]]
records.foreach { rec => byName.getOrElseUpdate(rec.name, mutable.ArrayBuffer.empty) += rec }
byName.iterator.map { case (name, recs) => (name, recs.toSeq) }.toSeq
}

/** Returns true if the two records form an FR pair, computing the answer symmetrically in
* its arguments. After validating that both records are mapped to the same contig on
* opposite strands, delegates to htsjdk's `SamPairUtil.getPairOrientation` evaluated on
* the reverse-strand record: that branch derives both 5' positions from CIGARs
* (`mate.alignmentStart` vs `record.alignmentEnd`) and is independent of MC/TLEN, so the
* answer is consistent regardless of which record of the pair this is called with. See
* https://github.com/samtools/htsjdk/pull/1771 (htsjdk 5.0.0+) for the underlying
* per-record asymmetry this avoids; once we no longer need to support older htsjdk
* versions for this caller, this helper can be replaced with `SamRecord.isFrPair`.
*/
private[umi] def isPrimaryFrPair(a: SamRecord, b: SamRecord): Boolean = {
if (!a.mapped || !b.mapped) false
else if (!a.mateMapped || !b.mateMapped) false
else if (a.refIndex != b.refIndex) false
else if (a.positiveStrand == b.positiveStrand) false
else {
val rev = if (a.positiveStrand) b else a
rev.pairOrientation == PairOrientation.FR
}
}
Comment on lines +425 to +433

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you just find the higher coordinate of the two reads, and then call htsjdk's method here? Your main problem was lack of consistency, and doing anything on the pair gets you there.

}
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ object UmiConsensusCaller {
case object IndelErrorBetweenStrands extends _RejectionReason("indel_error_between_strands", "Indel error between top/bottom strands", false, false, true)
case object HighDuplexDisagreement extends _RejectionReason("high_duplex_disagreement", "Too many errors between top/bottoms strands", false, false, true)
case object ClipOverlapFailed extends _RejectionReason("clip_overlap_failed", "See https://github.com/fulcrumgenomics/fgbio/issues/1090", false, false, true)
case object NotPrimaryFrPair extends _RejectionReason("not_primary_fr_pair", "Template did not have a single primary FR pair of reads", false, false, true)
}

/** Metric class for outputting consensus calling statistics. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,38 @@ class CodecConsensusCallerTest extends UnitSpec with OptionValues {
caller.consensusReadsFromSamRecords(rfPair) shouldBe Seq()
}

it should "not throw on an FR dovetail pair whose aligned ends coincide (htsjdk asymmetric isFrPair)" in {
// Reproduces a real template from a HEK293T CODEC dataset in which the aligned end of the
// reverse-strand read coincides with the aligned start of the forward-strand read and
// soft-clipping produces TLEN=+/-1. For this geometry htsjdk's
// `SamPairUtil.getPairOrientation` returns FR for R1 and RF for R2, which previously caused
// `CodecConsensusCaller` to drop R2 via a per-record `isFrPair` filter and then throw
// `scala.MatchError` on the resulting singleton read-name bucket.
val builder = new SamBuilder(readLength=129, baseQuality=35)
val raw = builder.addPair(
contig = 0, start1 = 96, start2 = 49,
cigar1 = "68S53M8S", cigar2 = "28S48M53S",
attrs = Map(("RX", "ACC-TGA"), ("MI", "hi"))
).tapEach(setReadSequence)

val caller = new CodecConsensusCaller(readNamePrefix="codec", minReadsPerStrand=1, minDuplexLength=1)
noException should be thrownBy caller.consensusReadsFromSamRecords(raw)
}

it should "not throw on a template whose primary read-name bucket has only one record" in {
// Exercise the defensive path for a degenerate template that reaches the CODEC caller with
// only one primary record per read name (e.g. an orphan introduced upstream). Previously
// the `Seq(rec, mate)` pattern match at line 176 crashed with MatchError.
val builder = new SamBuilder(readLength=30, baseQuality=35)
val pair = builder.addPair(
contig=0, start1=1, start2=11, attrs=Map(("RX", "ACC-TGA"), ("MI", "hi"))
).tapEach(setReadSequence)
val onlyR1 = pair.filter(_.firstOfPair)

val caller = new CodecConsensusCaller(readNamePrefix="codec", minReadsPerStrand=1, minDuplexLength=1)
caller.consensusReadsFromSamRecords(onlyR1) shouldBe Seq()
}

it should "not emit a consensus when the reads are a cross-chromosomal chimeric pair" in {
val builder = new SamBuilder(readLength=30, baseQuality=35)
val caller = new CodecConsensusCaller(readNamePrefix="codec", minReadsPerStrand=1, minDuplexLength=1)
Expand Down Expand Up @@ -290,6 +322,87 @@ class CodecConsensusCallerTest extends UnitSpec with OptionValues {
.consensusReadsFromSamRecords(raw) should have length 0
}

//////////////////////////////////////////////////////////////////////////////
// Tests for orderedPrimaryPairs
//////////////////////////////////////////////////////////////////////////////

"CodecConsensusCaller.orderedPrimaryPairs" should "order templates by first-occurrence BAM-iteration order" in {
// Names chosen so that Scala's `HashMap` iteration order on `groupBy(_.name).toSeq` is
// extremely unlikely to coincide with insertion order; the helper must return them in the
// order they were observed in the input regardless of hash layout.
val builder = new SamBuilder(readLength=30, baseQuality=35)
val names = Seq("zulu", "alpha", "mike", "bravo", "yankee")
names.foreach { n =>
builder.addPair(name=n, contig=0, start1=1, start2=11).tapEach(setReadSequence)
}
val ordered = CodecConsensusCaller.orderedPrimaryPairs(builder.toSeq)
ordered.map(_._1) shouldBe names
ordered.foreach { case (_, recs) => recs should have size 2 }
}

it should "ignore secondary/supplementary records only when the caller pre-filters them" in {
// The helper itself does not filter; callers pass pre-filtered primaries. This test documents
// that contract by passing records that include a supplementary alignment and verifying it is
// grouped alongside the primary under the same read name.
val builder = new SamBuilder(readLength=30, baseQuality=35)
val pair = builder.addPair(name="t1", contig=0, start1=1, start2=11).tapEach(setReadSequence)
pair.head.supplementary = true
val ordered = CodecConsensusCaller.orderedPrimaryPairs(builder.toSeq)
ordered.map(_._1) shouldBe Seq("t1")
ordered.head._2 should have size 2
}

//////////////////////////////////////////////////////////////////////////////
// Tests for isPrimaryFrPair
//////////////////////////////////////////////////////////////////////////////

"CodecConsensusCaller.isPrimaryFrPair" should "classify a dovetail FR pair as FR regardless of argument order" in {
val builder = new SamBuilder(readLength=129, baseQuality=35)
val pair = builder.addPair(
contig=0, start1=96, start2=49, cigar1="68S53M8S", cigar2="28S48M53S"
).tapEach(setReadSequence)
val r1 = pair.head
val r2 = pair.last
CodecConsensusCaller.isPrimaryFrPair(r1, r2) shouldBe true
CodecConsensusCaller.isPrimaryFrPair(r2, r1) shouldBe true
}

it should "classify a typical FR pair as FR" in {
val builder = new SamBuilder(readLength=30, baseQuality=35)
val Seq(r1, r2) = builder.addPair(contig=0, start1=10, start2=50).toSeq
CodecConsensusCaller.isPrimaryFrPair(r1, r2) shouldBe true
}

it should "classify an RF pair as not FR" in {
val builder = new SamBuilder(readLength=30, baseQuality=35)
val Seq(r1, r2) = builder.addPair(
contig=0, start1=50, start2=100, strand1=Minus, strand2=Plus
).toSeq
CodecConsensusCaller.isPrimaryFrPair(r1, r2) shouldBe false
}

it should "reject cross-chromosomal pairs" in {
val builder = new SamBuilder(readLength=30, baseQuality=35)
val Seq(r1, r2) = builder.addPair(contig=0, start1=10, start2=50).toSeq
r1.refIndex = 0
r2.refIndex = 1
CodecConsensusCaller.isPrimaryFrPair(r1, r2) shouldBe false
}

it should "reject a pair with one mate unmapped" in {
val builder = new SamBuilder(readLength=30, baseQuality=35)
val Seq(r1, r2) = builder.addPair(contig=0, start1=10, start2=10, unmapped2=true).toSeq
CodecConsensusCaller.isPrimaryFrPair(r1, r2) shouldBe false
}

it should "reject a tandem pair" in {
val builder = new SamBuilder(readLength=30, baseQuality=35)
val Seq(r1, r2) = builder.addPair(
contig=0, start1=10, start2=50, strand1=Plus, strand2=Plus
).toSeq
CodecConsensusCaller.isPrimaryFrPair(r1, r2) shouldBe false
}

//////////////////////////////////////////////////////////////////////////////
// Tests for quality masking
//////////////////////////////////////////////////////////////////////////////
Expand Down
Loading