Skip to content

Commit 9a691bc

Browse files
author
Dana Elizabeth Wyman
committed
Set default values for primaryOnly and canonOnly; fixed a bug that led to SEQ and CIGAR having different lengths when correcting junctions, and added extensive test cases for it.
1 parent ccb0264 commit 9a691bc

10 files changed

Lines changed: 416565 additions & 14 deletions

TranscriptClean.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
# TC Classes
1111
from transcript import Transcript
12+
from transcript import check_seq_and_cigar_length
1213
from spliceJunction import *
1314
from intronBound import IntronBound
1415
from optparse import OptionParser
@@ -120,12 +121,13 @@ def getOptions():
120121
parser.add_option("--primaryOnly", dest ="primaryOnly", action='store_true',
121122
help = "If this option is set, TranscriptClean will only \
122123
output primary mappings of transcripts (ie it will filter \
123-
out unmapped and multimapped lines from the SAM input.")
124+
out unmapped and multimapped lines from the SAM input.",
125+
default = False)
124126
parser.add_option("--canonOnly", dest ="canonOnly", action='store_true',
125127
help = ("If this option is set, TranscriptClean will "
126128
"output only canonical transcripts and transcripts "
127129
"containing annotated noncanonical junctions to the "
128-
"clean SAM file at the end of the run."))
130+
"clean SAM file at the end of the run."), default = False)
129131
parser.add_option("--tmpDir", dest ="tmp_path",
130132
help = ("If you would like the tmp files to be written "
131133
"somewhere different than the final output, "
@@ -411,7 +413,7 @@ def correct_transcript(transcript_line, options, refs):
411413
options.maxLenIndel, upd_logInfo)
412414
if ins_TE != "":
413415
TE_entries += ins_TE
414-
416+
415417
# Deletion correction
416418
del_TE = correctDeletions(upd_transcript, refs.genome, refs.deletions,
417419
options.maxLenIndel, upd_logInfo)
@@ -1212,6 +1214,7 @@ def update_post_ncsj_correction(transcript, splice_jn_num, genome, sjAnnot):
12121214
transcript.jM, transcript.jI = transcript.get_jM_jI_tags_from_sjs()
12131215
transcript.isCanonical = transcript.recheckCanonical()
12141216
transcript.allJnsAnnotated = transcript.recheckJnsAnnotated()
1217+
12151218
return
12161219

12171220
def attempt_jn_correction(transcript, splice_jn_num, genome, ref_donors,
@@ -1247,17 +1250,19 @@ def attempt_jn_correction(transcript, splice_jn_num, genome, ref_donors,
12471250
transcript.POS, splice_jn_num,
12481251
donor, ref_donor.dist, genome,
12491252
transcript.SEQ, transcript.CIGAR)
1250-
1253+
12511254
# Attempt to fix the splice acceptor side
12521255
acceptor = junction.get_splice_acceptor()
12531256
transcript.SEQ, transcript.CIGAR = fix_one_side_of_junction(transcript.CHROM,
12541257
transcript.POS, splice_jn_num,
12551258
acceptor, ref_acceptor.dist, genome,
12561259
transcript.SEQ, transcript.CIGAR)
1260+
12571261
# Now, perform updates:
12581262
update_post_ncsj_correction(transcript, splice_jn_num, genome, sjAnnot)
12591263

1260-
except:
1264+
except Exception as e:
1265+
print(e)
12611266
return False, "Other", combined_dist
12621267

12631268
return True, "NA", combined_dist
@@ -1384,6 +1389,9 @@ def fix_one_side_of_junction(chrom, transcript_start, jn_number, intronBound, d,
13841389
newCIGAR = newCIGAR + exonCIGARs[i] + str(intronCIGARs[i]) + "N"
13851390
newCIGAR = newCIGAR + exonCIGARs[-1]
13861391

1392+
if not check_seq_and_cigar_length(newSeq, newCIGAR):
1393+
raise RuntimeError("CIGAR string and sequence are not the same length")
1394+
13871395
return newSeq, newCIGAR
13881396

13891397

@@ -1580,6 +1588,7 @@ def dryRun(sam, options, outfiles):
15801588
write_to_transcript_log(logInfo, tL)
15811589
return
15821590

1591+
15831592
if __name__ == '__main__':
15841593
#pr = cProfile.Profile()
15851594
#pr.enable()
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
@SQ SN:chr2L LN:23513712
2+
@SQ SN:chr2R LN:25286936
3+
@SQ SN:chr3L LN:28110227
4+
@SQ SN:chr3R LN:32079331
5+
@SQ SN:chr4 LN:1348131
6+
@SQ SN:chrX LN:23542271
7+
@SQ SN:chrY LN:3667352
8+
@PG ID:minimap2 PN:minimap2 VN:2.15-r905 CL:minimap2 -t 16 -ax splice -uf --secondary=no -C5 /pub/dwyman/TALON-review/talon_on_other_data/refs/drosophila/dmel_r6.fa /pub/dwyman/TALON-review/talon_on_other_data/krizanovic_et_al/dataset7_real_subreads.fastq
9+
m160713_133433_42182_c101000162550000001823232709161620_s1_p0/121139/11291_13013 0 chr3R 14890311 53 67S116M269N202M10D232M11I151M62N371M61N92M74N284M13D101M6D30M198013N68M36S * 0 0 AACAGGGTATCAACGCAGCGAGAGTACAGGGATGTGTAACACACCGCCCGAGAGCGACACAACAAAATCTTAAATATTTTTTAGTAATAGGCTATTGAACGAATTAATAAAACGATGGCGTTGCGAGTTCTTGGCGTGACGCAGAATATGCGCTCCTTCATGCGGGGCGTTGACATACAAGTCCGATGGCTGCCACCCTACAGAAGACTCTGCTGAACATACCCGCCACCCAGGTGACCAAGCTGGACAACGGTCTCCGCGTGGCTAGCGAGGATTCCGGTGCCTCTACCGCCACCGTGGGACTGTGGATCGATGCTGGGTCGCGCTCCGAGAACGAGAAGAACAACGGAGTGGCCCACTTCCTGGAGCACATGGCCTCTCACTAGCTTGATGCTTTAAATGCGTATAATATGCGTTTCTGCCACTTTTTAGGGTACCGCCAAGCGCTCGCAAACGGATTTGGAGCTGGAGGTTGAGAACCTGGGCGCCCACTTGAACGCCTACACTTCCAGGGAGCAGACCGTCTTCTACGCCAAGTGTTTGTCCAAGGATGTGCCCAAAGCCGTCGAGATCCTGGCCGACATCATCCAGAACTCCAAGCTGGGTGAGGCGCCACTATAACTGAGATCGCCCGCGAGCGTTCGGTGATTCTGCGCGAGATGCAGGAGGTGGAGAGCAACCTGCAGGAGGTGGTCTTCGATCACCTTCACGCCACCGCCTATCAGGGCACTCCTCTGGGCCAGACCATCCTGGGACCCACCAAGAACATTCAGTCCATTGGCAAGGCCGATCTGACCGACTACATCCAGACCCACTACAAGGCTTCGCGCATCGTTCTGGCCGCTGCTGGCGGCGTCAAGCACGATGATCTGGTCAAGCTTGCCTGCAGCAGCTTGGGTGGCCTGGAGGCCAGTGTGCTGCCCGCGGAGGTGACTCCCTGCCGCTTCACCGGCTCCGAGGTGCGTGTGCGCGACGATTCCCTGCCCCTGGCCCATGTCGCCATCGCCGTCGAAGGCTGCGGCTGGACCGACCAGGACAACATCCCCCTGATGGTGGCCAACACCTTGGTGGGTGCTTGGGATCGTTCCCAAGGCGGTGGCGCCAACAACGCCTCCAACCTGGCCCGTGCCAGCGCTGAGGACAACCTTTGCCACAGCTTCCAATCGTTCAACACATGCTACAAGGACACCGGACTCTGGGGCATTTACTTCGTGTGCGACCCTCTGCAGTGCGAGGATATGCTCTTCAACGTGCAGACCGAGTGGATGCGTCTGTGCACCATGGTTACCGAGGCTGAGGTCGAGCGCGCCAAGAACCTTCTGAAGACCAACATGCTGCTGCAGCTCGACGGCACCACACCCATCTGCGAGGACATTGGCCGCCAGATCCTGTGCTACAACCGCCGCATCCCGCTGCACGAGCTGGAGCAGCGCATCGATGCCGTGAGTGTGGGCAATGTGCGCGACGTCGCAATGAAGTACATCTACGATCGGTGCCCAGCCGTCGCTGCCGTGGGTCCCCCGACTACAACAGAATCCGCTCCTCCATGTACTGGTTGAGGGTTTAAGAGGTTCCGCCCCGCTGTTGTTGGACATTGTAGTTTAATTCAAAAGAAAATTTCAGAGGATCAGTAAACCGATAATACTAAAAATTCAAAGCTCCAACTCCGCGCAGCCCCAAAAAAAATATCGAATAAAAGCTAAAAAAAAAAAAAAAGAAAAAAAAAATAAAGTACCCTGCGTTGATACCAGCTT * ms:i:368 AS:i:172 nn:i:0 ts:A:+ tp:A:P cm:i:15 s1:i:81 
s2:i:0 de:f:0.1985 NM:i:1184 MD:Z:109G0A0T0C0A0A0A4A0C0A0A1T1G0G1T0G0C2C0C0C0T2A0G0A0A0G0A0C0T0C0T0G0C0T0G0A0A0C0A0T4G0C0C0A0C0C1A0G0G0T0G0A0C0C1A0G0C0T2A1A0A2G0T0C0T0C1G0C1T1G0C0T1G0C0G0A2A0T0T1C0G0G0T0G1C0T1T0A1C0G0C0C0A0C1G0T0G3C0T1T0G1A1C1A1G0C0T1G0G0T2C1C0T1C0G0A0G1A0C0G0A0G1A1A1C0A0A0C0G0G0A0G0T0G0G2C0A0C0T0T1C0T0G0G0A2A0C0A1G0G1C0T0^TCAAGGTTGG0C0T0C0A0C0T0A4G1T0G0C0T0T0T0A1A0T0G0C0G1A0T0A0A1A0T0G0C0G0T0T0T0C1G0C0C1C0T0T1T0T0A1G0G0T1C3C0A0A0G0C0G0C0T0C1C0A0A0A0C1G0A0T1T1G0A1C2G0A0G0G0T0T0G0A1A0A0C1T0G0G0G0C0G0C0C0C0A0C0T0T0G1A0C0G0C0C0T0A0C0A0C0T0T0C3G0G0A1C0A0G0A1C0G0T0C0T0T0C0T0A0C1C0C0A0A0G0T0G0T0T0T1T0C0C0A0A0G0G0A0T0G0T3C0A0A0A0G0C0C0G0T1G0A1A0T0C0C0T0G0G1C0G0A2T0C1T0C1A0G1A2C0C0A0A1C0T1G1T0G1G0G1T0G0A0G0A0T0C0G0C1C0G0C0G0A1C1T1C0G0G0T0G0A0T0T0C0T0G0C2G0A2T1C0A1G0A0G1T0G0G0A0G0A3A0C0C0T1C0A1G0A0G0G0T1G2T0T1G0A0T0C0A0C1T0T1A0C0G1C1C1G0C0C0T0A0T1A0G0G0G1A0C0T0C1T0C0T0G0G0G0C0C0A0G0A0C0C0A0T2T0G0G0G1C0C0C2C0A0A0G1A0C0A0T0T0C0A1T1C1T0T0G0G0C1A0G0G0C0C0G0A0T0C0T0G2C0G0A1T0A0C0A0T1C0A0G2C0C0A0C0T0A0C0A0A0G2T3C0G1A0T0C0G0T0T3G0C0C2T2T0G0G0C0G2G0T0C1A1C0A1G0A0T0G0A0T0C0T0G0G1C0A0A0G0C0T0T2C0T0G0C0A1C0A0G0C0T0T0G0G1T0G1C1T0G1A1G0C1A1T0G0T2T1C0C0C0G0C0G0G0A0G0G0T0G0A1T0C1C1G0C1G0C0T0T0C0A1C0G1C1C2A1G0T0G0C0G0T0G0T1C0G0C0G0A1G0A0T0T2C0T4C2G1C1A0T0G0T0C0G1C0A0T0C0G0C0C1T0C1A0A1G0C0T1C0G0G0C0T0G0G1C0C0G0A0C0C0A0G0G0A1A0A1A0T0C0C0C0C0C0T1A0T0G0G0T0G0G1C0A0A0C0A0C0C0T2G1G2T0G0C0T0T0G0G0G0A0T0C0G0T0T1C0C0A0A1G0C0G0G0T0G0G0C0G1C0A0A0C0A0A0C0G3C0C0A0A1C2G1C0C0G0T0G0C0C2C0G1T0G0A0G0G0A0C0A0A1C0T0T0T0G0C0C0A2G0C0T0T0C0C0A1T1G0T0T0C0A0A0C3T1C0T0A0C0A0A4A1C2A0C0T0C1G0G0G0G0C0A0T0T1A0C0T0T0C0G0T0G0T0G1G0A1C0C1C0T1C0A1T0G0C0G0A0G0G0A1A0T0G0C0T0C1T2A0C0G0T1C0A0G0A0C0C0G0A0G0T0G0G0A0T0G0C0G0T1T0G2C0A0C0C0A0T1G0T0T0A0C0C0G0A1G1T0G0A0G1T0C0G0A0G0C1C0G2A0A0G0A0A0C0C0T0T1T0G1A0G0A0C1A0A1A0T0G0C0T0G0C0T1C0A1C0T0C0G0A0C0G0G1A1C0A1A0C1C1T0C0T0G0C0G0A2A1A0T0T0G1C0C0G1C0A0G0A0T0C0C0T0G0T0G1T0A1A0A0C0C0G4A0T1C2C0T0G0C0A0C0G0A0G0C0T0G1A0G0C0A1C0G0C0A0T0C0G0A0T1C0C0G0T1A0G0T0G2G2A0A0T0G0T1C0G
0C0G1C0G0T0C0G0C0A1T0G0A0A0G0T0A0C3T0A0C1A0T1G1T0G0C0C1A0G0C0C0G0T0C0G0C0T1C1G0T0G0G1T1C0^CGTTGAGAACCTG0C1C0G0A0C0T1C0A0A1A0G0A0A0T0C1G0C1C0C0T1C0A0T0G1A0C0T4G1G1G0T0T1A0A0G0A0G0G0T0T0C1G0C0C0C0C0G0C0T1T0T0G3G1C0A2G0T0A0G0T0T0T2T0T0C2A0A0G0A1A1T0T0T0^CACCCC0C0A0G0A0G0G1T1A1T0A2C0C0G0A0T2T1C1A3A0T2A2G0C0T1C0A0A1T0C1G1G0C1G0C0C0C0C1A1A0A0A4T0C0G0A0A5G0C0T8A6G1 jM:B:c,21,21,21,21,1 jI:B:i,14890427,14890695,14891291,14891352,14891724,14891784,14891877,14891950,14892385,15090397

0 commit comments

Comments
 (0)