Plant-Food-Research-Open · yykaya · Dec 12, 2024 · Dec 12, 2024 · Feb 5, 2025 · Feb 5, 2025
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -20,7 +20,7 @@ lint:
     - .github/workflows/branch.yml
     - .github/workflows/linting.yml
     - docs/README.md
-nf_core_version: 3.2.0
+nf_core_version: 3.2.1
 repository_type: pipeline
 template:
   author: Usman Rashid, Jason Shiller

diff --git a/README.md b/README.md
@@ -72,6 +72,8 @@ Each row represents an input genome and the fields are:
 - `fasta:` fasta file for the genome
 - `is_masked`: yes or no to denote whether the fasta file is already masked or not
 
+
+
 At minimum, a file with proteins as evidence is also required. Now, you can run the pipeline using:
 
 ```bash

diff --git a/docs/parameters.md b/docs/parameters.md
diff --git a/main.nf b/main.nf
@@ -17,6 +17,39 @@ include { GENEPAL                   } from './workflows/genepal'
 include { PIPELINE_INITIALISATION   } from './subworkflows/local/utils_nfcore_genepal_pipeline'
 include { PIPELINE_COMPLETION       } from './subworkflows/local/utils_nfcore_genepal_pipeline'
 
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    IMPORT: seqkit module used for contig-length filtering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+// Include the seqkit module
+include { SEQKIT } from './modules/nf-core/seqkit'
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    PROCESS: Filter Genome Assembly by Minimum Contig Length
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+process SEQKIT_GET_LENGTH {
+    // Keeps contigs of at least params.min_contig_length bp and lists the names of the contigs kept.
+    input:
+    tuple val(meta), path(genome_fasta) // meta.id is used to name both output files
+    // Emitted as `filtered_fasta`, the channel name the workflows in this file read from .out
+    output:
+    tuple val(meta), path("filtered_${meta.id}.fasta"), path("${meta.id}_contig_list.txt"), emit: filtered_fasta
+
+    script:
+    """
+    # Filter contigs based on length and output filtered FASTA
+    seqkit seq --min-len ${params.min_contig_length} ${genome_fasta} > filtered_${meta.id}.fasta
+
+    # Generate a list of filtered contigs
+    seqkit fx2tab --length --name filtered_${meta.id}.fasta | awk '{print \$1}' > ${meta.id}_contig_list.txt
+    """
+}
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     NAMED WORKFLOWS FOR PIPELINE
@@ -48,10 +81,15 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
 
     main:
     //
-    // WORKFLOW: Run pipeline
+    // Filter genome assembly by minimum contig length
+    //
+    SEQKIT_GET_LENGTH(ch_target_assembly)
+
+    //
+    // Run GENEPAL main workflow using filtered FASTA
     //
     GENEPAL(
-        ch_target_assembly,
+        SEQKIT_GET_LENGTH.out.filtered_fasta.map { meta, fasta, contig_list -> [ meta, fasta ] }, // Filtered genome FASTA
         ch_tar_assm_str,
         ch_is_masked,
         ch_te_library,
@@ -68,9 +106,11 @@ workflow PLANTFOODRESEARCHOPEN_GENEPAL {
         ch_tsebra_config,
         ch_orthofinder_pep
     )
+
     emit:
     multiqc_report = GENEPAL.out.multiqc_report // channel: /path/to/multiqc_report.html
 }
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -81,9 +121,9 @@ workflow {
 
     main:
     //
-    // SUBWORKFLOW: Run initialisation tasks
+    // SUBWORKFLOW: Run initialization tasks
     //
-    PIPELINE_INITIALISATION (
+    PIPELINE_INITIALISATION(
         params.version,
         params.monochrome_logs,
         args,
@@ -95,10 +135,15 @@ workflow {
     )
 
     //
-    // WORKFLOW: Run main workflow
+    // Filter genome assembly by minimum contig length
+    //
+    SEQKIT_GET_LENGTH(PIPELINE_INITIALISATION.out.target_assembly)
+
+    //
+    // Run main workflow using filtered FASTA
     //
     PLANTFOODRESEARCHOPEN_GENEPAL(
-        PIPELINE_INITIALISATION.out.target_assembly,
+        SEQKIT_GET_LENGTH.out.filtered_fasta,
         PIPELINE_INITIALISATION.out.tar_assm_str,
         PIPELINE_INITIALISATION.out.is_masked,
         PIPELINE_INITIALISATION.out.te_library,
@@ -115,10 +160,11 @@ workflow {
         PIPELINE_INITIALISATION.out.tsebra_config,
         PIPELINE_INITIALISATION.out.orthofinder_pep
     )
+
     //
     // SUBWORKFLOW: Run completion tasks
     //
-    PIPELINE_COMPLETION (
+    PIPELINE_COMPLETION(
         params.email,
         params.email_on_fail,
         params.plaintext_email,

diff --git a/modules/nf-core/seqkit/main.nf b/modules/nf-core/seqkit/main.nf
diff --git a/nextflow.config b/nextflow.config
@@ -48,7 +48,7 @@ params {
     liftoff_identity                    = 0.9
     eggnogmapper_evalue                 = 0.00001
     eggnogmapper_pident                 = 35
-
+    min_contig_length                   = 5000
     // Post-annotation filtering options
     allow_isoforms                      = true
     enforce_full_intron_support         = true
@@ -91,7 +91,15 @@ params {
     validate_params            = true
 
 }
-
+// Validation for min_contig_length: beforeScript is set for all processes, so a task exits with an error when the value is 1000 bp or less
+process {
+    beforeScript = """
+        if [[ ${params.min_contig_length} -le 1000 ]]; then
+            echo "ERROR: The parameter 'min_contig_length' must be greater than 1 kbp (1000 base pairs). Provided value: ${params.min_contig_length}" >&2
+            exit 1
+        fi
+    """
+}
 // Max resources
 process {
     resourceLimits = [

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -41,6 +41,13 @@
                     "default": 1,
                     "fa_icon": "fas fa-hashtag"
                 },
+                "min_contig_length": {
+                    "type": "integer",
+                    "description": "Minimum length for contigs to be included in annotation",
+                    "minimum": 0,
+                    "default": 5000,
+                    "fa_icon": "fas fa-ruler"
+                },
                 "rna_evidence": {
                     "type": "string",
                     "format": "file-path",
@@ -302,8 +309,20 @@
                 "append_genome_prefix_to_feature_ids": {
                     "type": "boolean",
                     "fa_icon": "fas fa-question-circle",
                     "description": "Add genome prefix to all the features in the final Gff/Fasta files",
                     "default": true
                 }
             }
         },

diff --git a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf
@@ -1,5 +1,6 @@
 include { GUNZIP as GUNZIP_TARGET_ASSEMBLY      } from '../../modules/nf-core/gunzip'
 include { GUNZIP as GUNZIP_TE_LIBRARY           } from '../../modules/nf-core/gunzip'
+include { SEQKIT as SEQKIT_FILTER               } from '../../modules/nf-core/seqkit/main.nf'
 include { SEQKIT_RMDUP                          } from '../../modules/nf-core/seqkit/rmdup/main.nf'
 include { FASTAVALIDATOR                        } from '../../modules/nf-core/fastavalidator'
 include { REPEATMODELER_BUILDDATABASE           } from '../../modules/nf-core/repeatmodeler/builddatabase'
@@ -37,8 +38,13 @@ workflow PREPARE_ASSEMBLY {
                                 )
     ch_versions                 = ch_versions.mix(GUNZIP_TARGET_ASSEMBLY.out.versions.first())
 
+    // MODULE: SEQKIT_FILTER
+    SEQKIT_FILTER ( ch_gunzip_assembly, params.min_contig_length )
+
+    ch_filtered_assembly = SEQKIT_FILTER.out.fastx
+
     // MODULE: SEQKIT_RMDUP
-    SEQKIT_RMDUP ( ch_gunzip_assembly )
+    SEQKIT_RMDUP ( ch_filtered_assembly )
 
     ch_nondup_fw_assembly       = SEQKIT_RMDUP.out.log
                                 | join(SEQKIT_RMDUP.out.fastx)

diff --git a/subworkflows/yykaya/seqkit.filter.nf b/subworkflows/yykaya/seqkit.filter.nf
@@ -0,0 +1,21 @@
+process SEQKIT_GET_LENGTH {
+    // Keeps contigs of at least params.min_contig_length bp and lists the names of the contigs kept.
+    tag "${meta.id}"
+    label 'process_medium'
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://depot.galaxyproject.org/singularity/seqkit:2.4.0--h9ee0642_0'
+        : 'quay.io/biocontainers/seqkit:2.4.0--h9ee0642_0'}"
+    // Input: sample metadata map (meta.id names the outputs) and the genome FASTA to filter
+    input:
+    tuple val(meta), path(genome_fasta)
+    // Output: [ meta, filtered FASTA, list of retained contig names ], emitted as `filtered_fasta`
+    output:
+    tuple val(meta), path("filtered_${meta.id}.fasta"), path("${meta.id}_contig_list.txt"), emit: filtered_fasta
+    script:
+    """
+    # Filter contigs based on length and output filtered FASTA
+    seqkit seq --min-len ${params.min_contig_length} ${genome_fasta} > filtered_${meta.id}.fasta
+    # Generate a list of filtered contigs (first column of the name/length table)
+    seqkit fx2tab --length --name filtered_${meta.id}.fasta | awk '{print \$1}' > ${meta.id}_contig_list.txt
+    """
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -72,6 +72,8 @@ Each row represents an input genome and the fields are: @@
     - `fasta:` fasta file for the genome
     - `is_masked`: yes or no to denote whether the fasta file is already masked or not
     At minimum, a file with proteins as evidence is also required. Now, you can run the pipeline using:
     ```bash
@@ Expand Down @@