Skip to content

Commit ed60ef4

Browse files
authored
Merge pull request #41 from CCBR/activeDev
Merge from ActiveDev with 5 circRNA callers
2 parents 9631b39 + 6791801 commit ed60ef4

9 files changed

Lines changed: 283 additions & 123 deletions

config/config.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ run_clear: True
1515
run_dcc: True
1616
#
1717
# Should the MapSplice pipeline be run? True or False WITHOUT quotes
18-
run_mapslice: True
18+
run_mapsplice: True
1919
#
2020
# Should the NCLscan pipeline be run? True or False WITHOUT quotes
2121
# This can only be run for PE data
@@ -108,7 +108,8 @@ resourcesdir: "PIPELINE_HOME/resources"
108108
tools: "PIPELINE_HOME/resources/tools.yaml"
109109
cluster: "PIPELINE_HOME/resources/cluster.json"
110110
adapters: "PIPELINE_HOME/resources/TruSeq_and_nextera_adapters.consolidated.fa"
111-
circexplorer_bsj_circRNA_min_reads: 1 # in addition to "known" and "low-conf" circRNAs identified by circexplorer, we also include those found in back_spliced.bed file but not classified as known/low-conf only if the number of reads supporting the BSJ call is greater than this number
111+
circexplorer_bsj_circRNA_min_reads: 2 # besides the "known" and "low-conf" circRNAs identified by circexplorer, circRNAs found in the back_spliced.bed file but not classified as known/low-conf are also included, but only if the number of reads supporting the BSJ call is greater than this number
112+
minreadcount: 2 # this is used to filter circRNAs while creating the per-sample counts table
112113
ciri_perl_script: "/data/Ziegelbauer_lab/tools/CIRI_v2.0.6/CIRI2.pl"
113114
nclscan_dir: "/data/Ziegelbauer_lab/tools/NCLscan-1.7.0"
114115
dcc_strandedness: "-ss" # "-ss" for stranded library and "--nonstrand" for unstranded

resources/tools.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ bedtools:
33
bwa:
44
version: "bwa/0.7.17"
55
circexplorer:
6-
version: "circexplorer2/2.3.5"
6+
version: "circexplorer2/2.3.8"
77
cufflinks:
88
version: "cufflinks/2.2.1"
99
cutadapt:

run_circrna_daq.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ module purge
1515
#######
1616
EXTRA_SINGULARITY_BINDS="/lscratch"
1717
PYTHONVERSION="3.7"
18-
SNAKEMAKEVERSION="7.3.7"
18+
# SNAKEMAKEVERSION="7.3.7"
19+
SNAKEMAKEVERSION="5.24.1"
1920
#######
2021

2122

workflow/Snakefile

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,22 @@ include: "rules/init.smk"
1616

1717
def get_clear_target_files(runclear):
    """Return the CLEAR output files to request for every sample.

    runclear: truthy when the CLEAR pipeline should be run.
    Returns an empty list when CLEAR is disabled; otherwise, for each sample
    in SAMPLES, quant.txt followed by quant.txt.annotated from that sample's
    CLEAR results folder.
    """
    if not runclear:
        return []
    clear_outputs = []
    for sample in SAMPLES:
        clear_dir = join(WORKDIR, "results", sample, "CLEAR")
        clear_outputs.append(join(clear_dir, "quant.txt"))
        clear_outputs.append(join(clear_dir, "quant.txt.annotated"))
    return clear_outputs
2424

2525
def get_dcc_target_files(rundcc):
    """Return the per-sample DCC counts tables to request.

    rundcc: truthy when the DCC pipeline should be run.
    Returns an empty list when DCC is disabled; otherwise one
    <sample>.dcc.counts_table.tsv path per sample in SAMPLES.
    """
    if not rundcc:
        return []
    return [
        join(WORKDIR, "results", sample, "DCC", sample + ".dcc.counts_table.tsv")
        for sample in SAMPLES
    ]
3131

3232
def get_mapsplice_target_files(runmapslice):
3333
targetfiles=[]
34-
if runmapslice==True or runmapslice=="True" or runmapslice=="TRUE":
34+
if runmapslice:
3535
for s in SAMPLES:
3636
targetfiles.append(join(WORKDIR,"results",s,"MapSplice","circular_RNAs.txt"))
3737
targetfiles.append(join(WORKDIR,"results",s,"MapSplice","alignments.bam"))
@@ -45,9 +45,13 @@ def get_nclscan_target_files(runnclscan):
4545
if not os.path.exists(join(WORKDIR,"results",s)): os.mkdir(join(WORKDIR,"results",s))
4646
if not os.path.exists(join(WORKDIR,"results",s,"NCLscan")): os.mkdir(join(WORKDIR,"results",s,"NCLscan"))
4747
if SAMPLESDF.loc[[s],"PEorSE"][0]=="SE":
48-
Path(join(WORKDIR,"results",s,"NCLscan",s+".result")).touch() # nclscan cannot run for se
49-
with open(join(WORKDIR,"results",s,"NCLscan",s+".nclscan.counts_table.tsv"),'w') as f:
50-
f.write("chrom\tend\tstart\tstrand\tread_count\tnclscan_annotation\n") # create empty file
48+
resultfile=join(WORKDIR,"results",s,"NCLscan",s+".result")
49+
ctable=join(WORKDIR,"results",s,"NCLscan",s+".nclscan.counts_table.tsv")
50+
if not os.path.exists(resultfile):
51+
Path(join(WORKDIR,"results",s,"NCLscan",s+".result")).touch() # nclscan cannot run for se
52+
if not os.path.exists(ctable):
53+
with open(join(WORKDIR,"results",s,"NCLscan",s+".nclscan.counts_table.tsv"),'w') as f:
54+
f.write("chrom\tend\tstart\tstrand\tread_count\tnclscan_annotation\n") # create empty file
5155
else:
5256
targetfiles.append(join(WORKDIR,"results",s,"NCLscan",s+".result"))
5357
return targetfiles
@@ -89,20 +93,20 @@ rule all:
8993
## circExplorer --> we run circExplorer2
9094
expand(join(WORKDIR,"results","{sample}","circExplorer","{sample}.circularRNA_known.txt"),sample=SAMPLES), # annotations with "known" GENCODE genes and NOT "known" circRNAs!
9195
## CLEAR quant output --> CLEAR is nothing but circExplorer3
92-
get_clear_target_files(config['run_clear']),
96+
get_clear_target_files(RUN_CLEAR),
9397
## ciri
9498
expand(join(WORKDIR,"results","{sample}","ciri","{sample}.ciri.out"),sample=SAMPLES),
9599
## DCC
96100
# expand(join(WORKDIR,"results","{sample}","DCC","{sample}.dcc.counts_table.tsv"),sample=SAMPLES),
97-
get_dcc_target_files(config['run_dcc']),
101+
get_dcc_target_files(RUN_DCC),
98102
## MapSplice
99-
get_mapsplice_target_files(config['run_mapslice']),
103+
get_mapsplice_target_files(RUN_MAPSPLICE),
100104
## NCLscan
101-
get_nclscan_target_files(config['run_nclscan']),
105+
get_nclscan_target_files(RUN_NCLSCAN),
102106
## merged counts per sample table of all counts/annotations from all circRNA callers
103-
# expand(join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt"),sample=SAMPLES),
107+
expand(join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt"),sample=SAMPLES),
104108
## aggregated counts matrix
105-
# join(WORKDIR,"results","circRNA_counts_matrix.tsv"),
109+
join(WORKDIR,"results","circRNA_counts_matrix.tsv"),
106110
# ## ciri BSJ bam
107111
# expand(join(WORKDIR,"results","{sample}","ciri","{sample}.bwa.BSJ.bam"),sample=SAMPLES),
108112
# ## ciri aggregate count matrix

workflow/rules/findcircrna.smk

Lines changed: 114 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,50 @@
11
# Find circRNAs using:
2-
# circExplorer2
3-
# ciri2
4-
# CLEAR
5-
# DCC
2+
# ------------------------------------------------------------------------------------------------
3+
# TOOL | Main output file
4+
# ------------------------------------------------------------------------------------------------
5+
# circExplorer2 | "results","{sample}","circExplorer","{sample}.circExplorer.counts_table.tsv"
6+
# ciri2 | "results","{sample}","ciri","{sample}.ciri.out"
7+
# CLEAR | "results","{sample}","CLEAR","quant.txt.annotated" ... not used as this is just filtered circExplorer output
8+
# DCC | "results","{sample}","DCC","{sample}.dcc.counts_table.tsv"
9+
# MapSplice | "results","{sample}","MapSplice","circular_RNAs.txt"
10+
# NCLscan | "results","{sample}","NCLscan","{sample}.nclscan.counts_table.tsv"
611
# and annotate them
712

813

914
## function
10-
def get_dcc_inputs(wildcards):
11-
filelist=[]
12-
for s in SAMPLES:
13-
filelist.append(join(WORKDIR,"results",s,"STAR1p",s+"_p1.Chimeric.out.junction"))
14-
filelist.append(join(WORKDIR,"results",s,"STAR1p","mate1",s+"_mate1.Chimeric.out.junction"))
15-
filelist.append(join(WORKDIR,"results",s,"STAR1p","mate2",s+"_mate2.Chimeric.out.junction"))
16-
return filelist
15+
# def get_dcc_inputs(wildcards):
16+
# filelist=[]
17+
# for s in SAMPLES:
18+
# filelist.append(join(WORKDIR,"results",s,"STAR1p",s+"_p1.Chimeric.out.junction"))
19+
# filelist.append(join(WORKDIR,"results",s,"STAR1p","mate1",s+"_mate1.Chimeric.out.junction"))
20+
# filelist.append(join(WORKDIR,"results",s,"STAR1p","mate2",s+"_mate2.Chimeric.out.junction"))
21+
# return filelist
22+
23+
def get_nclscan_target_files_per_sample(wildcards):
    """Return the named input files of rule nclscan for one sample.

    wildcards: Snakemake wildcards object; only ``wildcards.sample`` is read.
    Returns a dict with keys fixed_gtf, ndx, R1 and R2 for a paired-end (PE)
    sample, and an empty dict for any other sample.
    """
    targetfiles = {}
    s = wildcards.sample
    # .loc[[s], ...] yields a Series labelled by sample name, so take the first
    # match by position with .iloc[0]; a bare [0] on such a Series relies on the
    # deprecated positional fallback of Series.__getitem__.
    if SAMPLESDF.loc[[s], "PEorSE"].iloc[0] == "PE":  # SE is already taken care of by function get_nclscan_target_files
        targetfiles['fixed_gtf'] = join(REF_DIR, "ref.fixed.gtf")
        targetfiles['ndx'] = join(REF_DIR, "NCLscan_index", "AllRef.ndx")
        targetfiles['R1'] = join(WORKDIR, "results", s, "trim", s + ".R1.trim.fastq.gz")
        targetfiles['R2'] = join(WORKDIR, "results", s, "trim", s + ".R2.trim.fastq.gz")
    # Empty for SE; per the original author's note the rule is then not
    # expected to run at all for that sample.
    return targetfiles
32+
33+
def get_per_sample_files_to_merge(wildcards):
    """Return the named per-caller result files to merge for one sample.

    wildcards: Snakemake wildcards object; only ``wildcards.sample`` is read.
    Returns a dict that always contains 'circExplorer' and 'CIRI', plus 'DCC',
    'MapSplice' and 'NCLscan' when RUN_DCC, RUN_MAPSPLICE and RUN_NCLSCAN
    respectively are truthy.
    """
    s = wildcards.sample
    sample_dir = join(WORKDIR, "results", s)
    # Every path is built from the resolved sample name rather than a literal
    # "{sample}" placeholder, so all entries are concrete paths and none
    # depends on a later wildcard-substitution pass.
    filedict = {
        'circExplorer': join(sample_dir, "circExplorer", s + ".circExplorer.counts_table.tsv"),
        'CIRI': join(sample_dir, "ciri", s + ".ciri.out"),
    }
    # CLEAR output is deliberately not merged (it is just filtered circExplorer output).
    if RUN_DCC:
        filedict['DCC'] = join(sample_dir, "DCC", s + ".dcc.counts_table.tsv")
    if RUN_MAPSPLICE:
        # NOTE(review): the file name is spelled "mapslice" (not "mapsplice");
        # kept byte-for-byte -- confirm it matches the rule that produces it.
        filedict['MapSplice'] = join(sample_dir, "MapSplice", s + ".mapslice.counts_table.tsv")
    if RUN_NCLSCAN:
        filedict['NCLscan'] = join(sample_dir, "NCLscan", s + ".nclscan.counts_table.tsv")
    return filedict
47+
1748

1849
## rules
1950
# rule circExplorer:
@@ -198,6 +229,7 @@ rm -rf {params.sample}.bwa.sam
198229

199230

200231
rule create_ciri_count_matrix:
232+
# DEPRECATED
201233
input:
202234
expand(join(WORKDIR,"results","{sample}","ciri","{sample}.ciri.out"),sample=SAMPLES)
203235
output:
@@ -215,6 +247,7 @@ python {params.script} {params.lookup} {params.hostID}
215247
"""
216248

217249
rule create_circexplorer_count_matrix:
250+
# DEPRECATED
218251
input:
219252
expand(join(WORKDIR,"results","{sample}","circExplorer","{sample}.circularRNA_known.txt"),sample=SAMPLES)
220253
output:
@@ -666,13 +699,10 @@ rsync -az --progress alignments.bam.bai {output.bai}
666699
# | 3 | end | 1223968 |
667700
# | 4 | strand | - |
668701
# | 5 | read_count | 26 |
669-
# | 6 | nclscan_annotation | normal##2.811419 | <--1 for intragenic 0 for intergenic
702+
# | 6 | nclscan_annotation | 1 | <--1+1 for intragenic 0+1 for intergenic
670703
rule nclscan:
671704
input:
672-
fixed_gtf=rules.create_index.output.fixed_gtf,
673-
ndx=rules.create_index.output.ndx,
674-
R1=rules.cutadapt.output.of1,
675-
R2=rules.cutadapt.output.of2,
705+
unpack(get_nclscan_target_files_per_sample)
676706
output:
677707
result=join(WORKDIR,"results","{sample}","NCLscan","{sample}.result"),
678708
ct=join(WORKDIR,"results","{sample}","NCLscan","{sample}.nclscan.counts_table.tsv"),
@@ -697,60 +727,100 @@ fi
697727
if [ ! -d $TMPDIR ];then mkdir -p $TMPDIR;fi
698728
outdir=$(dirname {output.result})
699729
730+
if [ "{params.peorse}" == "PE" ];then
700731
{params.nclscan_dir}/NCLscan.py -c {params.nclscan_config} -pj {params.sample} -o $outdir --fq1 {input.R1} --fq2 {input.R2}
701732
python {params.script} \
702733
--result {output.result} -o {output.ct}
734+
# else
735+
# outdir=$(dirname {output.result})
736+
# if [ ! -d $outdir ];then
737+
# mkdir -p $outdir
738+
# fi
739+
# touch {output.result}
740+
# touch {output.ct}
741+
# This part is redundant as it is already taken care of by get_nclscan_target_files function!
742+
fi
703743
"""
704744

705-
706-
localrules: merge_per_sample_circRNA_counts
707-
# rule merge_per_sample_circRNA_counts:
708-
# merges counts from circExplorer2 and CIRI2 for all identified circRNAs.
709-
# The output file columns are:
710-
# | # | ColName |
711-
# |---|---------------------------------------|
712-
# | 1 | circRNA_id |
713-
# | 2 | strand |
714-
# | 3 | <samplename>_circExplorer_read_count |
715-
# | 4 | <samplename>_ciri_read_count |
716-
# | 5 | <samplename>_circExplorer_known_novel | --> options are known, low_conf, novel
717-
# | 6 | <samplename>_circRNA_type | --> options are exon, intron, intergenic_region
718-
# | 7 | <samplename>_ntools | --> number of tools calling this BSJ/circRNA
719-
rule merge_per_sample_circRNA_counts:
745+
def _boolean2str(x): # "1" for True and "0" for False
746+
if x==True:
747+
return "1"
748+
else:
749+
return "0"
750+
751+
# rule merge_per_sample:
752+
# The output file looks like this:
753+
# | Col# | ColName | Description |
754+
# |------|--------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
755+
# | 1 | circRNA_id | chrom:start-end |
756+
# | 2 | strand | "+ or -" |
757+
# | 3 | <samplename>_ntools | number of tools which have this circRNA_id detected |
758+
# | 4 | <samplename>_circExplorer_read_count | |
759+
# | 5 | <samplename>_ciri_read_count | |
760+
# | 6 | <samplename>_dcc_read_count | |
761+
# | 7 | <samplename>_mapsplice_read_count | |
762+
# | 8 | <samplename>_nclscan_read_count | |
763+
# | 9 | circExplorer_annotation | options are known, low_conf, novel |
764+
# | 10 | ciri_annotation | options are exon, intron, intergenic_region |
765+
# | 11 | dcc_annotation | JunctionType##Start-End Region from CircCoordinates file; 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT;Start-End Region eg. intron-intergenic, exon-exon, intergenic-intron, etc. |
766+
# | 12 | mapsplice_annotation | "fusion_type"##"entropy"; "fusion_type" is either "normal" or "overlapping" ... higher "entropy" values are better! |
767+
# | 13 | nclscan_annotation | 1+1 for intragenic 0+1 for intergenic |
768+
rule merge_per_sample:
    # Merge the per-caller result tables of one sample into a single
    # <sample>.circRNA_counts.txt table.
    input:
        # named inputs: circExplorer and CIRI always; DCC, MapSplice and NCLscan
        # only when the corresponding RUN_* flag is truthy
        unpack(get_per_sample_files_to_merge)
    output:
        merged_counts=join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt"),
    params:
        script=join(SCRIPTS_DIR,"merge_per_sample_counts_table.py"),
        samplename="{sample}",
        runclear=_boolean2str(RUN_CLEAR),
        rundcc=_boolean2str(RUN_DCC),
        runmapsplice=_boolean2str(RUN_MAPSPLICE),
        runnclscan=_boolean2str(RUN_NCLSCAN),
        minreadcount=config['minreadcount'],
        # The flags for the optional callers are assembled here, from the named
        # inputs that are actually present. The shell template is formatted in
        # full before bash runs, so referring to e.g. {input.DCC} there would
        # fail whenever that caller is disabled, even inside a bash `if`.
        optional_tables=lambda wildcards, input: " ".join(
            "{} {}".format(flag, getattr(input, name))
            for name, flag in (
                ("DCC", "--dcc"),
                ("MapSplice", "--mapsplice"),
                ("NCLscan", "--nclscan"),
            )
            if hasattr(input, name)
        )
    envmodules:
        TOOLS["python37"]["version"]
    shell:"""
set -exo pipefail
python {params.script} \
    --circExplorer {input.circExplorer} \
    --ciri {input.CIRI} \
    {params.optional_tables} \
    --min_read_count_reqd {params.minreadcount} \
    --samplename {params.samplename} \
    -o {output.merged_counts}
"""
737804

738-
localrules: create_counts_matrix
805+
806+
# localrules: create_counts_matrix
739807
# rule create_counts_matrix:
740808
# merge all per-sample counts tables into a single giant counts matrix and annotate it with known circRNA databases
741809
rule create_counts_matrix:
    # Combine the merged per-sample counts tables of all samples into one
    # counts matrix; the annotation lookup table is passed on to the script.
    input:
        # one merged counts table per sample
        expand(
            join(WORKDIR,"results","{sample}","{sample}.circRNA_counts.txt"),
            sample=SAMPLES
        ),
    output:
        matrix=join(WORKDIR,"results","circRNA_counts_matrix.tsv")
    params:
        script=join(SCRIPTS_DIR,"merge_counts_tables_2_counts_matrix.py"),
        resultsdir=join(WORKDIR,"results"),
        lookup_table=ANNOTATION_LOOKUP
    envmodules:
        TOOLS['python37']['version']
    shell:"""
set -exo pipefail
python {params.script} \
    --per_sample_tables {input} \
    --lookup_table {params.lookup_table} \
    -o {output.matrix}
"""

0 commit comments

Comments
 (0)