# Find circRNAs using:
# ------------------------------------------------------------------------------------------------
# TOOL          | Main output file
# ------------------------------------------------------------------------------------------------
# circExplorer2 | "results","{sample}","circExplorer","{sample}.circExplorer.counts_table.tsv"
# ciri2         | "results","{sample}","ciri","{sample}.ciri.out"
# CLEAR         | "results","{sample}","CLEAR","quant.txt.annotated" ... not used, as this is just filtered circExplorer output
# DCC           | "results","{sample}","DCC","{sample}.dcc.counts_table.tsv"
# MapSplice     | "results","{sample}","MapSplice","circular_RNAs.txt"
# NCLscan       | "results","{sample}","NCLscan","{sample}.nclscan.counts_table.tsv"
# and annotate them
712
813
914## function
10- def get_dcc_inputs (wildcards ):
11- filelist = []
12- for s in SAMPLES :
13- filelist .append (join (WORKDIR ,"results" ,s ,"STAR1p" ,s + "_p1.Chimeric.out.junction" ))
14- filelist .append (join (WORKDIR ,"results" ,s ,"STAR1p" ,"mate1" ,s + "_mate1.Chimeric.out.junction" ))
15- filelist .append (join (WORKDIR ,"results" ,s ,"STAR1p" ,"mate2" ,s + "_mate2.Chimeric.out.junction" ))
16- return filelist
15+ # def get_dcc_inputs(wildcards):
16+ # filelist=[]
17+ # for s in SAMPLES:
18+ # filelist.append(join(WORKDIR,"results",s,"STAR1p",s+"_p1.Chimeric.out.junction"))
19+ # filelist.append(join(WORKDIR,"results",s,"STAR1p","mate1",s+"_mate1.Chimeric.out.junction"))
20+ # filelist.append(join(WORKDIR,"results",s,"STAR1p","mate2",s+"_mate2.Chimeric.out.junction"))
21+ # return filelist
22+
def get_nclscan_target_files_per_sample(wildcards):
    """Return the named NCLscan inputs for the sample in ``wildcards.sample``.

    For a sample whose ``PEorSE`` entry in ``SAMPLESDF`` is ``"PE"`` the
    returned dict has the keys ``fixed_gtf``, ``ndx``, ``R1`` and ``R2``.
    For any other value the dict is empty.  Per the original author's note,
    single-end samples are already taken care of by
    ``get_nclscan_target_files`` and an empty dict means the rule will not
    run for that sample.

    The dict is meant to be consumed through ``unpack(...)`` in a rule's
    ``input:`` section.
    """
    targetfiles = dict()
    s = wildcards.sample
    # `.loc[[s], ...]` yields a Series indexed by sample name; `.iloc[0]`
    # takes its first row positionally without relying on pandas' deprecated
    # integer-key fallback (`series[0]`) on a non-integer index.
    if SAMPLESDF.loc[[s], "PEorSE"].iloc[0] == "PE":
        targetfiles['fixed_gtf'] = join(REF_DIR, "ref.fixed.gtf")
        targetfiles['ndx'] = join(REF_DIR, "NCLscan_index", "AllRef.ndx")
        targetfiles['R1'] = join(WORKDIR, "results", s, "trim", s + ".R1.trim.fastq.gz")
        targetfiles['R2'] = join(WORKDIR, "results", s, "trim", s + ".R2.trim.fastq.gz")
    return targetfiles  # empty if SE
32+
def get_per_sample_files_to_merge(wildcards):
    """Return the per-tool result tables to merge for ``wildcards.sample``.

    The circExplorer and CIRI tables are always included.  The DCC,
    MapSplice and NCLscan tables are added only when the matching
    ``RUN_DCC`` / ``RUN_MAPSPLICE`` / ``RUN_NCLSCAN`` flag is truthy.
    CLEAR output is not merged (the file header notes it is just filtered
    circExplorer output).

    The dict keys (``circExplorer``, ``CIRI``, ``DCC``, ``MapSplice``,
    ``NCLscan``) become the named inputs of the rule that consumes this
    function through ``unpack(...)``.
    """
    s = wildcards.sample
    filedict = {
        'circExplorer': join(WORKDIR, "results", s, "circExplorer", s + ".circExplorer.counts_table.tsv"),
        'CIRI': join(WORKDIR, "results", s, "ciri", s + ".ciri.out"),
    }
    # The sample name is substituted explicitly for every tool, matching the
    # two entries above, rather than leaving a literal "{sample}" placeholder
    # in the returned path.
    if RUN_DCC:
        filedict['DCC'] = join(WORKDIR, "results", s, "DCC", s + ".dcc.counts_table.tsv")
    if RUN_MAPSPLICE:
        # NOTE(review): "mapslice" (sic) is kept as in the original; confirm
        # it matches the file name the MapSplice rule actually writes.
        filedict['MapSplice'] = join(WORKDIR, "results", s, "MapSplice", s + ".mapslice.counts_table.tsv")
    if RUN_NCLSCAN:
        filedict['NCLscan'] = join(WORKDIR, "results", s, "NCLscan", s + ".nclscan.counts_table.tsv")
    return filedict
47+
1748
1849## rules
1950# rule circExplorer:
@@ -198,6 +229,7 @@ rm -rf {params.sample}.bwa.sam
198229
199230
200231rule create_ciri_count_matrix :
232+ # DEPRECATED
201233 input :
202234 expand (join (WORKDIR ,"results" ,"{sample}" ,"ciri" ,"{sample}.ciri.out" ),sample = SAMPLES )
203235 output :
@@ -215,6 +247,7 @@ python {params.script} {params.lookup} {params.hostID}
215247"""
216248
217249rule create_circexplorer_count_matrix :
250+ # DEPRECATED
218251 input :
219252 expand (join (WORKDIR ,"results" ,"{sample}" ,"circExplorer" ,"{sample}.circularRNA_known.txt" ),sample = SAMPLES )
220253 output :
@@ -666,13 +699,10 @@ rsync -az --progress alignments.bam.bai {output.bai}
666699# | 3 | end | 1223968 |
667700# | 4 | strand | - |
668701# | 5 | read_count | 26 |
669- # | 6 | nclscan_annotation | normal##2.811419 | <--1 for intragenic 0 for intergenic
702+ # | 6 | nclscan_annotation | 1 | <--1+1 for intragenic 0+1 for intergenic
670703rule nclscan :
671704 input :
672- fixed_gtf = rules .create_index .output .fixed_gtf ,
673- ndx = rules .create_index .output .ndx ,
674- R1 = rules .cutadapt .output .of1 ,
675- R2 = rules .cutadapt .output .of2 ,
705+ unpack (get_nclscan_target_files_per_sample )
676706 output :
677707 result = join (WORKDIR ,"results" ,"{sample}" ,"NCLscan" ,"{sample}.result" ),
678708 ct = join (WORKDIR ,"results" ,"{sample}" ,"NCLscan" ,"{sample}.nclscan.counts_table.tsv" ),
@@ -697,60 +727,100 @@ fi
697727if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi
698728outdir=$(dirname {output.result})
699729
730+ if [ "{params.peorse}" == "PE" ];then
700731{params.nclscan_dir}/NCLscan.py -c {params.nclscan_config} -pj {params.sample} -o $outdir --fq1 {input.R1} --fq2 {input.R2}
701732python {params.script} \
702733 --result {output.result} -o {output.ct}
734+ # else
735+ # outdir=$(dirname {output.result})
736+ # if [ ! -d $outdir ];then
737+ # mkdir -p $outdir
738+ # fi
739+ # touch {output.result}
740+ # touch {output.ct}
741+ # This part is redundant as it is already taken care of by get_nclscan_target_files function!
742+ fi
703743"""
704744
705-
706- localrules : merge_per_sample_circRNA_counts
707- # rule merge_per_sample_circRNA_counts:
708- # merges counts from circExplorer2 and CIRI2 for all identified circRNAs.
709- # The output file columns are:
710- # | # | ColName |
711- # |---|---------------------------------------|
712- # | 1 | circRNA_id |
713- # | 2 | strand |
714- # | 3 | <samplename>_circExplorer_read_count |
715- # | 4 | <samplename>_ciri_read_count |
716- # | 5 | <samplename>_circExplorer_known_novel | --> options are known, low_conf, novel
717- # | 6 | <samplename>_circRNA_type | --> options are exon, intron, intergenic_region
718- # | 7 | <samplename>_ntools | --> number of tools calling this BSJ/circRNA
719- rule merge_per_sample_circRNA_counts :
745+ def _boolean2str (x ): # "1" for True and "0" for False
746+ if x == True :
747+ return "1"
748+ else :
749+ return "0"
750+
751+ # rule merge_per_sample:
752+ # The output file looks like this:
753+ # | Col# | ColName | Description |
754+ # |------|--------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
755+ # | 1 | circRNA_id | chrom:start-end |
756+ # | 2 | strand | "+ or -" |
757+ # | 3 | <samplename>_ntools | number of tools which have this circRNA_id detected |
758+ # | 4 | <samplename>_circExplorer_read_count | |
759+ # | 5 | <samplename>_ciri_read_count | |
760+ # | 6 | <samplename>_dcc_read_count | |
761+ # | 7 | <samplename>_mapsplice_read_count | |
762+ # | 8 | <samplename>_nclscan_read_count | |
763+ # | 9 | circExplorer_annotation | options are known, low_conf, novel |
764+ # | 10 | ciri_annotation | options are exon, intron, intergenic_region |
765+ # | 11 | dcc_annotation | JunctionType##Start-End Region from CircCoordinates file; 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT;Start-End Region eg. intron-intergenic, exon-exon, intergenic-intron, etc. |
766+ # | 12 | mapsplice_annotation | "fusion_type"##"entropy"; "fusion_type" is either "normal" or "overlapping" ... higher "entropy" values are better! |
767+ # | 13 | nclscan_annotation | 1+1 for intragenic 0+1 for intergenic |
768+ rule merge_per_sample :
720769 input :
721- circExplorer_table = rules .circExplorer .output .counts_table ,
722- ciri_table = rules .ciri .output .ciriout ,
723- dcc_table = rules .dcc .output .ct , #join(WORKDIR,"results","{sample}","DCC","{sample}.dcc.counts_table.tsv"),
770+ unpack (get_per_sample_files_to_merge )
724771 output :
725- merged_counts = join (WORKDIR ,"results" ,"{sample}" ,"{sample}.circRNA_counts.txt" )
772+ merged_counts = join (WORKDIR ,"results" ,"{sample}" ,"{sample}.circRNA_counts.txt" ),
726773 params :
727774 script = join (SCRIPTS_DIR ,"merge_per_sample_counts_table.py" ),
728- samplename = "{sample}"
775+ samplename = "{sample}" ,
776+ runclear = _boolean2str (RUN_CLEAR ),
777+ rundcc = _boolean2str (RUN_DCC ),
778+ runmapsplice = _boolean2str (RUN_MAPSPLICE ),
779+ runnclscan = _boolean2str (RUN_NCLSCAN ),
780+ minreadcount = config ['minreadcount' ]
781+ envmodules :
782+ TOOLS ["python37" ]["version" ]
729783 shell :"""
730784set -exo pipefail
731- python {params.script} \
732- --circExplorer {input.circExplorer_table} \
733- --ciri {input.ciri_table} \
734- --samplename {params.samplename} \
735- -o {output.merged_counts}
785+ outdir=$(dirname {output.merged_counts})
786+
787+ parameters=" --circExplorer {input.circExplorer}"
788+ parameters="$parameters --ciri {input.CIRI}"
789+ if [[ "{params.rundcc}" == "1" ]]; then
790+ parameters="$parameters --dcc {input.DCC}"
791+ fi
792+ if [[ "{params.runmapsplice}" == "1" ]]; then
793+ parameters="$parameters --mapsplice {input.MapSplice}"
794+ fi
795+ if [[ "{params.runnclscan}" == "1" ]]; then
796+ parameters="$parameters --nclscan {input.NCLscan}"
797+ fi
798+ parameters="$parameters --min_read_count_reqd {params.minreadcount}"
799+ parameters="$parameters --samplename {params.samplename} -o {output.merged_counts}"
800+
801+ echo "python {params.script} $parameters"
802+ python {params.script} $parameters
736803"""
737804
738- localrules : create_counts_matrix
805+
806+ # localrules: create_counts_matrix
739807# rule create_counts_matrix:
740808# merge all per-sample counts tables into a single giant counts matrix and annotate it with known circRNA databases
741809rule create_counts_matrix :
742810 input :
743- expand (join (WORKDIR ,"results" ,"{sample}" ,"circRNA_counts.txt" ),sample = SAMPLES ),
811+ expand (join (WORKDIR ,"results" ,"{sample}" ,"{sample}. circRNA_counts.txt" ),sample = SAMPLES ),
744812 output :
745813 matrix = join (WORKDIR ,"results" ,"circRNA_counts_matrix.tsv" )
746814 params :
747815 script = join (SCRIPTS_DIR ,"merge_counts_tables_2_counts_matrix.py" ),
748816 resultsdir = join (WORKDIR ,"results" ),
749817 lookup_table = ANNOTATION_LOOKUP
818+ envmodules :
819+ TOOLS ['python37' ]['version' ]
750820 shell :"""
751821set -exo pipefail
752822python {params.script} \
753- --results_folder {params.resultsdir } \
823+ --per_sample_tables {input } \
754824 --lookup_table {params.lookup_table} \
755825 -o {output.matrix}
756826"""
0 commit comments