Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .nf-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ lint:
- .github/workflows/branch.yml
- .github/workflows/linting.yml
- docs/README.md
nf_core_version: 3.2.0
nf_core_version: 3.2.1
repository_type: pipeline
template:
author: Usman Rashid, Jason Shiller
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ Each row represents an input genome and the fields are:
- `fasta:` fasta file for the genome
- `is_masked`: yes or no to denote whether the fasta file is already masked or not



At minimum, a file with proteins as evidence is also required. Now, you can run the pipeline using:

```bash
Expand Down
150 changes: 106 additions & 44 deletions docs/parameters.md

Large diffs are not rendered by default.

60 changes: 53 additions & 7 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,39 @@ include { GENEPAL } from './workflows/genepal'
include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_genepal_pipeline'
include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_genepal_pipeline'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
IMPORT MODULES: seqkit
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

// Include the seqkit module
include { SEQKIT } from './modules/nf-core/seqkit'

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
PROCESS: Filter Genome Assembly by Minimum Contig Length
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

process SEQKIT_GET_LENGTH {
    // Filters a genome assembly by minimum contig length and lists the
    // identifiers of the contigs that survive the filter.
    //
    // Input : [ meta, genome_fasta ] -- meta.id is used to name the outputs.
    // Output: filtered_fasta -> [ meta, filtered_<id>.fasta, <id>_contig_list.txt ]
    //
    // The input/output declarations must use the same names as the script
    // (genome_fasta, meta) and must expose the 'filtered_fasta' channel that
    // the calling workflows read from SEQKIT_GET_LENGTH.out.
    tag "${meta.id}"
    label 'process_medium'
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0'
        : 'quay.io/biocontainers/seqkit:2.4.0--h9ee0642_0'}"

    input:
    tuple val(meta), path(genome_fasta)

    output:
    tuple val(meta), path("filtered_${meta.id}.fasta"), path("${meta.id}_contig_list.txt"), emit: filtered_fasta

    script:
    """
    # Filter contigs based on length and output filtered FASTA
    seqkit seq --min-len ${params.min_contig_length} ${genome_fasta} > filtered_${meta.id}.fasta

    # Generate a list of filtered contigs (first whitespace-delimited field of each header)
    seqkit fx2tab --length --name filtered_${meta.id}.fasta | awk '{print \$1}' > ${meta.id}_contig_list.txt
    """
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
NAMED WORKFLOWS FOR PIPELINE
Expand Down Expand Up @@ -48,10 +81,15 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {

main:
//
// WORKFLOW: Run pipeline
// Filter genome assembly by minimum contig length
//
SEQKIT_GET_LENGTH(ch_target_assembly)

//
// Run GENEPAL main workflow using filtered FASTA
//
GENEPAL(
ch_target_assembly,
SEQKIT_GET_LENGTH.out.filtered_fasta.map { meta, fasta, contig_list -> [ meta, fasta ] }, // Filtered genome FASTA
ch_tar_assm_str,
ch_is_masked,
ch_te_library,
Expand All @@ -68,9 +106,11 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
ch_tsebra_config,
ch_orthofinder_pep
)

emit:
multiqc_report = GENEPAL.out.multiqc_report // channel: /path/to/multiqc_report.html
}

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RUN MAIN WORKFLOW
Expand All @@ -81,9 +121,9 @@ workflow {

main:
//
// SUBWORKFLOW: Run initialisation tasks
// SUBWORKFLOW: Run initialization tasks
//
PIPELINE_INITIALISATION (
PIPELINE_INITIALISATION(
params.version,
params.monochrome_logs,
args,
Expand All @@ -95,10 +135,15 @@ workflow {
)

//
// WORKFLOW: Run main workflow
// Filter genome assembly by minimum contig length
//
SEQKIT_GET_LENGTH(PIPELINE_INITIALISATION.out.target_assembly)

//
// Run main workflow using filtered FASTA
//
PLANTFOODRESEARCHOPEN_GENEPAL(
PIPELINE_INITIALISATION.out.target_assembly,
SEQKIT_GET_LENGTH.out.filtered_fasta,
PIPELINE_INITIALISATION.out.tar_assm_str,
PIPELINE_INITIALISATION.out.is_masked,
PIPELINE_INITIALISATION.out.te_library,
Expand All @@ -115,10 +160,11 @@ workflow {
PIPELINE_INITIALISATION.out.tsebra_config,
PIPELINE_INITIALISATION.out.orthofinder_pep
)

//
// SUBWORKFLOW: Run completion tasks
//
PIPELINE_COMPLETION (
PIPELINE_COMPLETION(
params.email,
params.email_on_fail,
params.plaintext_email,
Expand Down
16 changes: 16 additions & 0 deletions modules/nf-core/seqkit/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 10 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ params {
liftoff_identity = 0.9
eggnogmapper_evalue = 0.00001
eggnogmapper_pident = 35

min_contig_length = 5000
// Post-annotation filtering options
allow_isoforms = true
enforce_full_intron_support = true
Expand Down Expand Up @@ -91,7 +91,15 @@ params {
validate_params = true

}

// Validation for the min_contig_length parameter
process {
    // Abort each task before its script runs when min_contig_length is below
    // the supported floor of 5000 bp. The comparison and the message use the
    // same threshold, and the default value (5000) is accepted.
    // NOTE(review): the floor of 5000 is taken from the error message and the
    // parameter default -- confirm it is the intended minimum.
    beforeScript = """
    if [[ ${params.min_contig_length} -lt 5000 ]]; then
        echo "ERROR: The parameter 'min_contig_length' must be at least 5 kbp (5000 base pairs). Provided value: ${params.min_contig_length}" >&2
        exit 1
    fi
    """
}
// Max resources
process {
resourceLimits = [
Expand Down
19 changes: 19 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@
"default": 1,
"fa_icon": "fas fa-hashtag"
},
"min_contig_length": {
"type": "integer",
"description": "Minimum length for contigs to be included in annotation",
"minimum": 0,
"default": 5000,
"fa_icon": "fas fa-ruler"
},
"rna_evidence": {
"type": "string",
"format": "file-path",
Expand Down Expand Up @@ -302,8 +309,20 @@
"append_genome_prefix_to_feature_ids": {
"type": "boolean",
"fa_icon": "fas fa-question-circle",
"description": "Add genome prefix to all the features in the final Gff/Fasta files",
"default": true
}
}
},
Expand Down
8 changes: 7 additions & 1 deletion subworkflows/local/prepare_assembly.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
include { GUNZIP as GUNZIP_TARGET_ASSEMBLY } from '../../modules/nf-core/gunzip'
include { GUNZIP as GUNZIP_TE_LIBRARY } from '../../modules/nf-core/gunzip'
include { SEQKIT as SEQKIT_FILTER } from '../../modules/nf-core/seqkit/main.nf'
include { SEQKIT_RMDUP } from '../../modules/nf-core/seqkit/rmdup/main.nf'
include { FASTAVALIDATOR } from '../../modules/nf-core/fastavalidator'
include { REPEATMODELER_BUILDDATABASE } from '../../modules/nf-core/repeatmodeler/builddatabase'
Expand Down Expand Up @@ -37,8 +38,13 @@ workflow PREPARE_ASSEMBLY {
)
ch_versions = ch_versions.mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first())

// MODULE: SEQKIT_FILTER
SEQKIT_FILTER ( ch_gunzip_assembly, params.min_contig_length )

ch_filtered_assembly = SEQKIT_FILTER.out.fastx

// MODULE: SEQKIT_RMDUP
SEQKIT_RMDUP ( ch_gunzip_assembly )
SEQKIT_RMDUP ( ch_filtered_assembly )

ch_nondup_fw_assembly = SEQKIT_RMDUP.out.log
| join(SEQKIT_RMDUP.out.fastx)
Expand Down
21 changes: 21 additions & 0 deletions subworkflows/yykaya/seqkit.filter.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
process SEQKIT_GET_LENGTH {
    // Filters a genome assembly by minimum contig length and lists the
    // identifiers of the contigs that survive the filter.
    //
    // Input : [ meta, genome_fasta ] -- meta.id is used to name the outputs.
    // Output: filtered_fasta -> [ meta, filtered_<id>.fasta, <id>_contig_list.txt ]
    tag "${meta.id}"
    label 'process_medium'
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0'
        : 'quay.io/biocontainers/seqkit:2.4.0--h9ee0642_0'}"

    input:
    tuple val(meta), path(genome_fasta)

    output:
    tuple val(meta), path("filtered_${meta.id}.fasta"), path("${meta.id}_contig_list.txt"), emit: filtered_fasta

    script:
    """
    # Filter contigs based on length and output filtered FASTA
    seqkit seq --min-len ${params.min_contig_length} ${genome_fasta} > filtered_${meta.id}.fasta

    # Generate a list of filtered contigs (first whitespace-delimited field of each header)
    seqkit fx2tab --length --name filtered_${meta.id}.fasta | awk '{print \$1}' > ${meta.id}_contig_list.txt
    """
}