From f8a0215c67754089904b6b7d4a0ab0e11fa011ee Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Fri, 28 Oct 2022 13:24:11 +0200
Subject: [PATCH 01/31] Reduce main.nf by moving  "manage flowcell" in the
 subworflow 02_assembly

---
 main.nf                     | 45 +--------------------------------
 subworkflows/02_assembly.nf | 50 +++++++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 52 deletions(-)

diff --git a/main.nf b/main.nf
index cdd2d19..0d26a32 100644
--- a/main.nf
+++ b/main.nf
@@ -244,7 +244,6 @@ workflow {
         def meta = [:] 
         meta.id = item.sample
         if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-        if (item.group !=null) {meta.id = meta.id+"_"+item.group}
         meta.sample = item.sample
         meta.flowcell = item.flowcell
         meta.group = item.group
@@ -265,7 +264,6 @@ workflow {
       def meta = [:] 
       meta.id = item.sample
       if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-      if (item.group !=null) {meta.id = meta.id+"_"+item.group}
       meta.sample = item.sample
       meta.flowcell = item.flowcell
       meta.group = item.group
@@ -362,49 +360,8 @@ workflow {
   }
 
   if ( !params.stop_at_clean ) {
-		if (!has_assembly & has_flowcell ){ 
-			//////////////////
-			// Manage Flowcell 
-			//////////////////
-			ch_reads_tmp = ch_preprocessed_reads
-				.map {
-					meta, fastq ->
-					[ meta.sample, meta, fastq ] 
-					}
-				.groupTuple(by: [0])
-				.branch {
-					id, meta, fastq ->
-						single  : fastq.size() == 1
-							return [[id:meta.sample.unique().join(),
-							sample:meta.sample.unique().join(),
-							flowcell:meta.flowcell.join("_"),
-							group:meta.group.unique().join(),
-							assembly:meta.assembly.unique().join(),
-							type:meta.type.unique().join()], fastq.flatten() ]
-						multiple: fastq.size() > 1
-							return [[id:meta.sample.unique().join(),
-							sample:meta.sample.unique().join(),
-							flowcell:meta.flowcell.join("_"),
-							group:meta.group.unique().join(),
-							assembly:meta.assembly.unique().join(),
-							type:meta.type.unique().join()], fastq.flatten() ]
-				}
-
-
-			MERGE_FASTQ (
-				ch_reads_tmp.multiple
-			)
-				.reads
-				.mix(ch_reads_tmp.single)
-				.set{ch_preprocessed_reads}
-		}
-		
-      /////////////////////
-      //End manage Flowcell
-      /////////////////////
-
-    S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool )
 
+    S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool, has_flowcell )
     ch_assembly = S02_ASSEMBLY.out.assembly
     ch_reads = S02_ASSEMBLY.out.reads
     ch_bam = S02_ASSEMBLY.out.bam
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index fe90d44..d360c9f 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -6,10 +6,11 @@ include { GET_ALIGNMENT_METRICS } from '../modules/read_alignment_manipulation'
 
 workflow STEP_02_ASSEMBLY {
   take: 
-    reads
-    assembly
-    has_assembly
-    assembly_tool
+  reads
+  assembly
+  has_assembly
+  assembly_tool
+  has_flowcell
 
   main:
     ch_assembler_v = Channel.empty() 
@@ -17,11 +18,45 @@ workflow STEP_02_ASSEMBLY {
     ch_bwa_mem_v = Channel.empty()
     ch_minimap2_v = Channel.empty()
     ch_samtools_v = Channel.empty()
-  
+
+
+    if (!has_assembly & has_flowcell ){ 
+			//////////////////
+			// Manage Flowcell 
+			//////////////////
+			ch_reads_flowcell = reads
+				.map { meta, fastq ->
+					   [ meta.sample, meta, fastq ] }
+				.groupTuple(by: [0])
+				.branch {	id, meta, fastq ->
+          single : fastq.size() == 1
+            return [[id:meta.sample.unique().join(),
+            sample:meta.sample.unique().join(),
+            flowcell:meta.flowcell.join("_"),
+            group:meta.group.unique().join(),
+            assembly:meta.assembly.unique().join(),
+            type:meta.type.unique().join()], fastq.flatten() ]
+          multiple: fastq.size() > 1
+            return [[id:meta.sample.unique().join(),
+            sample:meta.sample.unique().join(),
+            flowcell:meta.flowcell.join("_"),
+            group:meta.group.unique().join(),
+            assembly:meta.assembly.unique().join(),
+            type:meta.type.unique().join()], fastq.flatten() ]
+				}
+
+			MERGE_FASTQ (ch_reads_flowcell.multiple)
+				.reads
+				.mix(ch_reads_flowcell.single)
+				.set{ch_reads}
+      
+		} else {
+      ch_reads = reads
+    }
+
     if (has_assembly){
       ch_assembly = assembly   
-    } 
-    else {
+    } else {
       if(assembly_tool == 'metaspades') {
         METASPADES(reads)
         ch_assembly = METASPADES.out.assembly
@@ -58,7 +93,6 @@ workflow STEP_02_ASSEMBLY {
 
     ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true)
     if ( params.type.toUpperCase() == "SR" ) {
-
       READS_DEDUPLICATION ( ch_contigs_and_reads )
       
       ch_reads = READS_DEDUPLICATION.out.dedup_reads
-- 
GitLab


From fac60418083694140540653d03dab61540160457 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Tue, 8 Nov 2022 08:49:11 +0100
Subject: [PATCH 02/31] manage coassembly for step 2

	Adds coassembly parameter
	Adapts assembly modules to take multiple fastq
---
 modules/assembly.nf         | 26 ++++++++---
 nextflow.config             |  1 +
 subworkflows/02_assembly.nf | 90 ++++++++++++++++++++++++++-----------
 3 files changed, 85 insertions(+), 32 deletions(-)

diff --git a/modules/assembly.nf b/modules/assembly.nf
index 9155270..969a918 100644
--- a/modules/assembly.nf
+++ b/modules/assembly.nf
@@ -4,7 +4,7 @@ process METASPADES {
   label 'ASSEMBLY_SR'
 
   input:
-  tuple val(meta), path(reads)
+  tuple val(meta), path(read1), path(read2)
 
   output:
   tuple val(meta), path("metaspades/${meta.id}.contigs.fa"), emit: assembly
@@ -15,7 +15,21 @@ process METASPADES {
     (_,mem,unit) = (task.memory =~ /(\d+) ([A-Z]B)/)[0]
     if ( unit =~ /GB/ ) { 
       """
-      metaspades.py -t ${task.cpus} -m $mem -1 ${reads[0]} -2 ${reads[1]} -o metaspades
+              echo "
+[
+    {
+        orientation: \\"fr\\",
+        type: \\"paired-end\\",
+        right reads: [
+            \\"${read1.join('\\",\n            \\"')}\\"
+        ],
+        left reads: [
+            \\"${read2.join('\\",\n            \\"')}\\"
+        ]
+    }
+]" > input.yaml
+
+      metaspades.py -t ${task.cpus} -m $mem --dataset input.yaml -o metaspades
       mv metaspades/scaffolds.fasta metaspades/${meta.id}.contigs.fa
       mv metaspades/spades.log metaspades/${meta.id}.log
       mv metaspades/params.txt metaspades/${meta.id}.params.txt 
@@ -34,7 +48,7 @@ process MEGAHIT {
   label 'ASSEMBLY_SR'
 
   input:
-  tuple val(meta), path(reads)
+  tuple val(meta), path(read1), path(read2)
 
   output:
   tuple val(meta), path("megahit/${meta.id}.contigs.fa"), emit: assembly
@@ -43,7 +57,7 @@ process MEGAHIT {
 
   script:
     """
-    megahit -t ${task.cpus} -1 ${reads[0]} -2 ${reads[1]} -o megahit --out-prefix "${meta.id}"
+    megahit -t ${task.cpus} -1 ${read1.join(',')} -2 ${read2.join(',')} -o megahit --out-prefix "${meta.id}"
     mv megahit/options.json megahit/${meta.id}.params.txt
     rm -r  megahit/intermediate_contigs
 
@@ -69,7 +83,7 @@ process HIFIASM_META {
   """
   mkdir hifiasm-meta
 
-  hifiasm_meta -t ${task.cpus} -o ${meta.id} $reads 2> hifiasm-meta/${meta.id}.log
+  hifiasm_meta -t ${task.cpus} -o ${meta.id} ${reads.join(' ')} 2> hifiasm-meta/${meta.id}.log
 
   # gfa to fasta format
   awk '/^S/{print ">"\$2"\\n"\$3}' ${meta.id}.p_ctg.gfa | fold > hifiasm-meta/${meta.id}.contigs.fa
@@ -98,7 +112,7 @@ process METAFLYE {
   """
   mkdir metaflye
   
-  flye --pacbio-hifi $reads -o 'metaflye' --meta -t ${task.cpus} 2> metaflye/${meta.id}.log
+  flye --pacbio-hifi ${reads.join(' ')} -o 'metaflye' --meta -t ${task.cpus} 2> metaflye/${meta.id}.log
 
   mv  metaflye/assembly.fasta  metaflye/${meta.id}.contigs.fa
   mv metaflye/params.json  metaflye/${meta.id}.params.json
diff --git a/nextflow.config b/nextflow.config
index 9e86698..296ceac 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -21,6 +21,7 @@ params {
     gtdbtk_bank = ""
     percentage_identity = 0.95
     type = ""
+    coassembly = false
 
     // Stop after step or skip optional step/sub-step.
 
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index d360c9f..3797c8f 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -54,34 +54,58 @@ workflow STEP_02_ASSEMBLY {
       ch_reads = reads
     }
 
+    if (params.coassembly){ 
+      ch_reads.map { meta, fastq ->
+                   [ meta.group, meta, fastq] }
+        .groupTuple(by: [0])
+        .map { group, metas, fastq ->
+              def meta = [:]
+              meta.id = metas.group.unique().join()
+              meta.sample = metas.sample.join("_")
+              meta.flowcell = metas.flowcell.unique().join()
+              meta.group =  metas.group.unique().join()
+              meta.assembly = metas.assembly.unique().join()
+              meta.type = metas.type.unique().join()
+              if (params.type.toUpperCase() == "SR") {
+                return [meta, fastq.collect { it[0] }, fastq.collect { it[1] }]
+              } else {
+                return [meta, fastq ]
+              }}
+        .set { ch_reads_assembly }
+    } else if (params.type.toUpperCase() == "SR") {
+        ch_reads_assembly = ch_reads
+        .map { meta, fastq ->
+            return [meta, fastq[0], fastq[1]] }
+    } else {
+      ch_reads_assembly = ch_reads
+    }
+
+    ch_reads_assembly.view()
+
     if (has_assembly){
       ch_assembly = assembly   
     } else {
-      if(assembly_tool == 'metaspades') {
-        METASPADES(reads)
-        ch_assembly = METASPADES.out.assembly
-        ch_assembler_v = METASPADES.out.v_metaspades
-      } 
-      else if(assembly_tool == 'megahit') {
-        MEGAHIT(reads)
-        ch_assembly = MEGAHIT.out.assembly
-        ch_assembler_v = MEGAHIT.out.v_megahit
-      } 
-      else if(assembly_tool == 'hifiasm-meta') {
-        HIFIASM_META(reads)
-        ch_assembly = HIFIASM_META.out.assembly
-        ch_assembler_v = HIFIASM_META.out.v_hifiasm_meta
-        } 
-      else if(assembly_tool == 'metaflye') {
-          METAFLYE(reads)
+      if (assembly_tool == 'metaspades') {
+          METASPADES(ch_reads_assembly)
+          ch_assembly = METASPADES.out.assembly
+          ch_assembler_v = METASPADES.out.v_metaspades
+      } else if (assembly_tool == 'megahit') {
+          MEGAHIT(ch_reads_assembly)
+          ch_assembly = MEGAHIT.out.assembly
+          ch_assembler_v = MEGAHIT.out.v_megahit
+      } else if (assembly_tool == 'hifiasm-meta') {
+          HIFIASM_META(ch_reads_assembly)
+          ch_assembly = HIFIASM_META.out.assembly
+          ch_assembler_v = HIFIASM_META.out.v_hifiasm_meta
+      } else if (assembly_tool == 'metaflye') {
+          METAFLYE(ch_reads_assembly)
           ch_assembly = METAFLYE.out.assembly
           ch_assembler_v = METAFLYE.out.v_metaflye
-        } 
-      else {
+      } else {
         exit 1,  "Invalid assembly parameter: ${assembly_tool}"
-        }
+      }
     }
-    
+
     ch_assembly_renamed = RENAME_CONTIGS(ch_assembly)
     ASSEMBLY_QUAST( ch_assembly_renamed,"02_assembly/quast_primary")
     ch_assembly_report = ASSEMBLY_QUAST.out.report
@@ -89,10 +113,24 @@ workflow STEP_02_ASSEMBLY {
 
     ch_idxstats = Channel.empty()
     ch_flagstat = Channel.empty()
-    ch_reads = reads
+                                          
+    if (params.coassembly){
+      ch_reads.map { meta, fastq ->
+                   [ meta.group, meta, fastq ]}
+              .set { ch_reads_tmp }
+
+      ch_assembly_renamed.map { meta, assembly ->
+                              [ meta.group, assembly ]} 
+                         .combine(ch_reads_tmp, by: 0)
+                         .map { group, assembly, meta, fastq -> 
+                              [ meta, assembly,fastq ]}
+                         .set { ch_contigs_and_reads }
+    } else {
+      ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true)
+    }
 
-    ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true)
     if ( params.type.toUpperCase() == "SR" ) {
+
       READS_DEDUPLICATION ( ch_contigs_and_reads )
       
       ch_reads = READS_DEDUPLICATION.out.dedup_reads
@@ -101,10 +139,10 @@ workflow STEP_02_ASSEMBLY {
       ch_bwa_mem_v = READS_DEDUPLICATION.out.v_bwa_mem2
       
     } else {
-        MINIMAP2(ch_contigs_and_reads,"02_assembly")
-        ch_bam = MINIMAP2.out.bam
+      MINIMAP2(ch_contigs_and_reads,"02_assembly")
+      ch_bam = MINIMAP2.out.bam
 
-        ch_minimap2_v = MINIMAP2.out.v_minimap2
+      ch_minimap2_v = MINIMAP2.out.v_minimap2
         
     }
 
-- 
GitLab


From 57b243a714e38e2da05bc8286a67906b9b9b100a Mon Sep 17 00:00:00 2001
From: Jean Mainguy <jean.mainguy@inra.fr>
Date: Wed, 2 Nov 2022 14:01:24 +0100
Subject: [PATCH 03/31] add gene line gff file

---
 bin/merge_annotations.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/bin/merge_annotations.py b/bin/merge_annotations.py
index d18ebd0..afa038a 100755
--- a/bin/merge_annotations.py
+++ b/bin/merge_annotations.py
@@ -139,10 +139,10 @@ def merging_cds_and_rna(cds_per_contig, contig2rnas):
             logging.info(f'{contig} has only rna annotation')
             yield contig, rnas
 
-def add_new_ID_tag(gff_attributes, new_id):
+def add_new_ID_tag_and_Parent(gff_attributes, new_id, parent_id):
 
-    gff_attributes_no_id = ';'.join([attr for attr in gff_attributes.split(';') if  attr.split('=')[0] != 'ID'])
-    return f"ID={new_id};{gff_attributes_no_id}"
+    gff_attributes_no_id = ';'.join([attr for attr in gff_attributes.split(';') if  attr.split('=')[0] not in ['ID', 'Parent']])
+    return f"ID={new_id};Parent={parent_id};{gff_attributes_no_id}"
 
 
 def get_tag_value(gff_attribute, tag):
@@ -168,6 +168,7 @@ def writing_features_to_gff_ffn_faa(annotations_per_contig, out_gff, fna_file, o
             for feature in sorted(features, key=lambda x: int(x['start'])):
                 gene_count[feature['feature']] += 1 
                 new_id = f"{feature['seqname']}.{feature['feature']}_{gene_count[feature['feature']]}"
+                parent_id = f"{new_id}_gene"
 
                 start, end = int(feature['start']), int(feature['end'])
                 
@@ -183,11 +184,20 @@ def writing_features_to_gff_ffn_faa(annotations_per_contig, out_gff, fna_file, o
            
                     
                 # Write gff and ffn 
-                feature['attribute'] = add_new_ID_tag(feature['attribute'], new_id)
+                feature['attribute'] = add_new_ID_tag_and_Parent(feature['attribute'], new_id, parent_id)
+
+                # gff line with CDS/tRNA or rRNA in feature column
+                feature_gff_line = '\t'.join(feature.values())
+
+                # gff line with gene in feature column
+                feature['feature'] = 'gene'
+                feature['attribute'] = f'ID={parent_id};locus_tag={new_id}'
+                gene_gff_line = '\t'.join(feature.values())
+
+                fh_gff.write(f'{gene_gff_line}\n{feature_gff_line}\n') 
 
                 tag_name = get_tag_value(feature['attribute'], 'Name')
-                fh_gff.write('\t'.join(feature.values())+"\n")
-                
+
                 if feature['strand'] == "+":
                     feature_seq = contig_fa[contig][start-1:end].seq
                 else:
-- 
GitLab


From 1a968cef04f19c7cb4405a6181b8acf3aba0a06e Mon Sep 17 00:00:00 2001
From: Jean Mainguy <jean.mainguy@inra.fr>
Date: Wed, 2 Nov 2022 14:17:15 +0100
Subject: [PATCH 04/31] add old name as description of renamed sequenced to
 keep it

---
 bin/rename_contigs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/rename_contigs.py b/bin/rename_contigs.py
index 0734c9c..264d9cf 100755
--- a/bin/rename_contigs.py
+++ b/bin/rename_contigs.py
@@ -64,8 +64,8 @@ def main():
 
     logging.info(f'Writting renamed fasta file in {out_fna}')
     with open(out_fna, "w") as fh_fna:
-        for i, (_, seq) in enumerate(pyfastx.Fasta(fna_file, build_index=False)):
-            fh_fna.write(f'>{sample}_c{i+1}\n{seq}\n')
+        for i, (name, seq) in enumerate(pyfastx.Fasta(fna_file, build_index=False)):
+            fh_fna.write(f'>{sample}_c{i+1} {name}\n{seq}\n')
 
 if __name__ == '__main__':
     main()
\ No newline at end of file
-- 
GitLab


From 87061f3b3d32aabaad2956f0c425400cd2f79be2 Mon Sep 17 00:00:00 2001
From: Jean Mainguy <jean.mainguy@inra.fr>
Date: Wed, 2 Nov 2022 14:27:34 +0100
Subject: [PATCH 05/31] add flag skip_alignment to skip step 05

---
 main.nf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index 0d26a32..ad9a9cc 100644
--- a/main.nf
+++ b/main.nf
@@ -77,6 +77,7 @@ include { MERGE_FASTQ } from './modules/merge_fastq.nf'
      S05_ALIGNMENT options:
        --diamond_bank                Path to diamond bank used to align protein sequence of genes: "PATH/bank.dmnd".
                                      This bank must be previously built with diamond makedb.
+       --skip_alignment              Skip this step
        
      S06_FUNC_ANNOT options:
        --skip_func_annot             Skip this step
@@ -412,7 +413,7 @@ workflow {
 
   ch_diamond_result = Channel.empty()
 
-  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot ) {
+  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_alignment) {
     S05_ALIGNMENT (ch_annotation_faa, ch_diamon_db)
 
     ch_diamond_result = S05_ALIGNMENT.out.m8
@@ -422,14 +423,14 @@ workflow {
 
   ch_quant_report = Channel.empty()
   ch_v_eggnogmapper = Channel.empty()
-  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot ) {
+  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot && !params.skip_alignment) {
       S06_FUNC_ANNOT ( ch_annotation_ffn, ch_annotation_faa, ch_annotation_gff, ch_bam, ch_diamond_result, ch_eggnog_db )
       ch_quant_report = S06_FUNC_ANNOT.out.quant_report
 
       ch_software_versions_S06 = S06_FUNC_ANNOT.out.software_versions
   }
 
-	if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi ) {
+	if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi && !params.skip_alignment) {
 		S07_TAXO_AFFI ( ch_taxonomy, ch_diamond_result, ch_sam_coverage)
 
     ch_software_versions_S07 = S07_TAXO_AFFI.out.software_versions
-- 
GitLab


From b8235b46c92729dc94ce7d752f7f3f3cbaeda4b5 Mon Sep 17 00:00:00 2001
From: Jean Mainguy <jean.mainguy@inra.fr>
Date: Fri, 4 Nov 2022 15:58:10 +0100
Subject: [PATCH 06/31] Improve fasta parsing

---
 bin/Bins_per_sample_summarize.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bin/Bins_per_sample_summarize.py b/bin/Bins_per_sample_summarize.py
index fce7b72..89efa2b 100755
--- a/bin/Bins_per_sample_summarize.py
+++ b/bin/Bins_per_sample_summarize.py
@@ -45,13 +45,14 @@ def bins_contigs_compositions(folder):
     contigs_to_bins = dict()
     list_bins = list()
     for fi in os.listdir(folder):
-        with open(folder + "/" + fi,'r') as bin_file:
+        bin_fasta_file = os.path.join(folder, fi)
+        with open(bin_fasta_file) as bin_file:
             bin_name = fi.rsplit('.', maxsplit=1)[0]
             if bin_name not in list_bins:
                 list_bins.append(bin_name)
             for line in bin_file:
                 if line.startswith('>'):
-                    contig_name = line.strip().lstrip('>')
+                    contig_name = line.split()[0].strip().lstrip('>')
                     contigs_to_bins[contig_name] = bin_name
 
     return contigs_to_bins, list_bins
-- 
GitLab


From 2859eb13e91f75f2de2b6364d93cd505206c0ec8 Mon Sep 17 00:00:00 2001
From: Jean Mainguy <jean.mainguy@inra.fr>
Date: Fri, 4 Nov 2022 15:58:41 +0100
Subject: [PATCH 07/31] add meta to quast report because it is needed in
 binning step

---
 main.nf              | 9 +++++++--
 modules/metaquast.nf | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index ad9a9cc..9d29306 100644
--- a/main.nf
+++ b/main.nf
@@ -371,7 +371,9 @@ workflow {
     ch_idxstats = S02_ASSEMBLY.out.idxstats
     ch_unfilter_assembly_flagstat_report = S02_ASSEMBLY.out.flagstat
 
-    ch_quast_before_filter_report = S02_ASSEMBLY.out.assembly_report
+    ch_quast = S02_ASSEMBLY.out.assembly_report
+    ch_quast_before_filter_report = ch_quast.map{ meta, quast_report -> quast_report }
+
     ch_software_versions_S02 = S02_ASSEMBLY.out.software_versions
   }
 
@@ -392,7 +394,10 @@ workflow {
     ch_bam =  S03_FILTERING.out.bam
 
     ch_sam_coverage = S03_FILTERING.out.sam_coverage
-    ch_quast_after_filter_report = S03_FILTERING.out.quast_report
+
+    ch_quast = S03_FILTERING.out.quast_report
+    ch_quast_after_filter_report = ch_quast.map{ meta, quast_report -> quast_report }
+
     ch_final_assembly_flagstat_report = S03_FILTERING.out.sam_flagstat
 
   }
diff --git a/modules/metaquast.nf b/modules/metaquast.nf
index dee90d5..dbaa508 100644
--- a/modules/metaquast.nf
+++ b/modules/metaquast.nf
@@ -9,7 +9,7 @@ process QUAST {
 
     output:
         path "${outdir}/${meta.id}/*", emit: all
-        path "${outdir}/${meta.id}/report.tsv", emit: report
+        tuple val(meta), path ("${outdir}/${meta.id}/report.tsv"), emit: report
         path "v_quast.txt", emit: v_quast
 
     script:
-- 
GitLab


From b76791e04411360a674d1f919632c4ac226cec16 Mon Sep 17 00:00:00 2001
From: Jean Mainguy <jean.mainguy@inra.fr>
Date: Mon, 7 Nov 2022 13:02:04 +0100
Subject: [PATCH 08/31] add skip_alignment in config

---
 nextflow.config | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index 296ceac..22d7fa8 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -44,6 +44,9 @@ params {
     // Step
     stop_at_structural_annot = false
 
+    // Optional step
+    skip_alignment = false
+
     // Optional step
     skip_func_annot = false
 
-- 
GitLab


From bd52ace89dbd324ec04d8fd46f8a534d6f761f57 Mon Sep 17 00:00:00 2001
From: Jean Mainguy <jean.mainguy@inra.fr>
Date: Mon, 7 Nov 2022 15:45:40 +0100
Subject: [PATCH 09/31] remove skip_alignment flag

---
 main.nf                      |  9 ++++-----
 subworkflows/05_alignment.nf | 21 +++++++++++----------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/main.nf b/main.nf
index 9d29306..a6bdc5b 100644
--- a/main.nf
+++ b/main.nf
@@ -77,7 +77,6 @@ include { MERGE_FASTQ } from './modules/merge_fastq.nf'
      S05_ALIGNMENT options:
        --diamond_bank                Path to diamond bank used to align protein sequence of genes: "PATH/bank.dmnd".
                                      This bank must be previously built with diamond makedb.
-       --skip_alignment              Skip this step
        
      S06_FUNC_ANNOT options:
        --skip_func_annot             Skip this step
@@ -418,24 +417,24 @@ workflow {
 
   ch_diamond_result = Channel.empty()
 
-  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_alignment) {
+  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && (!params.skip_func_annot || !params.skip_taxo_affi)) {
     S05_ALIGNMENT (ch_annotation_faa, ch_diamon_db)
 
-    ch_diamond_result = S05_ALIGNMENT.out.m8
+    ch_diamond_result = S05_ALIGNMENT.out.diamond_result
 
     ch_software_versions_S05 = S05_ALIGNMENT.out.software_versions
   }
 
   ch_quant_report = Channel.empty()
   ch_v_eggnogmapper = Channel.empty()
-  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot && !params.skip_alignment) {
+  if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot) {
       S06_FUNC_ANNOT ( ch_annotation_ffn, ch_annotation_faa, ch_annotation_gff, ch_bam, ch_diamond_result, ch_eggnog_db )
       ch_quant_report = S06_FUNC_ANNOT.out.quant_report
 
       ch_software_versions_S06 = S06_FUNC_ANNOT.out.software_versions
   }
 
-	if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi && !params.skip_alignment) {
+	if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi) {
 		S07_TAXO_AFFI ( ch_taxonomy, ch_diamond_result, ch_sam_coverage)
 
     ch_software_versions_S07 = S07_TAXO_AFFI.out.software_versions
diff --git a/subworkflows/05_alignment.nf b/subworkflows/05_alignment.nf
index 9902230..9017d47 100644
--- a/subworkflows/05_alignment.nf
+++ b/subworkflows/05_alignment.nf
@@ -9,16 +9,17 @@ workflow STEP_05_ALIGNMENT {
         ch_diamond_v = Channel.empty()
 
 
-        ch_m8 =Channel.empty()
-        if (!params.skip_func_annot || !params.skip_taxo_affi){
-            DIAMOND (
-                prokka_faa,
-                diamond
-            )
-            ch_m8 = DIAMOND.out.m8
-            ch_diamond_v = DIAMOND.out.v_diamond
-        }
+        ch_diamond_result =Channel.empty()
+        
+        DIAMOND (
+            prokka_faa,
+            diamond,
+        )
+
+        ch_diamond_result = DIAMOND.out.m8
+        ch_diamond_v = DIAMOND.out.v_diamond
+        
     emit:
-        m8 = ch_m8
+        diamond_result = ch_diamond_result
         software_versions = ch_diamond_v.first()
     }
-- 
GitLab


From 2b12c305cd1fe43aeb8b3926b7e9158c8bc4f294 Mon Sep 17 00:00:00 2001
From: VIENNE MAINA <maina.vienne@inrae.fr>
Date: Tue, 8 Nov 2022 10:45:26 +0100
Subject: [PATCH 10/31] refactoring

---
 bin/{aln2taxaffi.py => aln_to_tax_affi.py}    |  2 +-
 ...marize.py => bins_per_sample_summarize.py} |  4 +-
 ...ge_abundance_and_functional_annotations.py | 20 +++-----
 bin/merge_annotations.py                      |  2 +-
 bin/merge_kaiju_results.py                    | 19 +++----
 ...clusters.py => quantification_clusters.py} | 49 +++++++------------
 modules/assign_taxonomy.nf                    |  2 +-
 modules/feature_counts.nf                     |  2 +-
 modules/sum_up_bins_informations.nf           |  4 +-
 9 files changed, 41 insertions(+), 63 deletions(-)
 rename bin/{aln2taxaffi.py => aln_to_tax_affi.py} (99%)
 rename bin/{Bins_per_sample_summarize.py => bins_per_sample_summarize.py} (99%)
 rename bin/{Quantification_clusters.py => quantification_clusters.py} (88%)

diff --git a/bin/aln2taxaffi.py b/bin/aln_to_tax_affi.py
similarity index 99%
rename from bin/aln2taxaffi.py
rename to bin/aln_to_tax_affi.py
index 241d836..6635297 100755
--- a/bin/aln2taxaffi.py
+++ b/bin/aln_to_tax_affi.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 """----------------------------------------------------------------------------
-  Script Name: aln2taxaffi.py
+  Script Name: aln_to_tax_affi.py
   Description:
   Input files: File with correspondence between accession ids and taxon ids, \
   taxonomy directory and diamond output file (.m8)
diff --git a/bin/Bins_per_sample_summarize.py b/bin/bins_per_sample_summarize.py
similarity index 99%
rename from bin/Bins_per_sample_summarize.py
rename to bin/bins_per_sample_summarize.py
index 89efa2b..08e7112 100755
--- a/bin/Bins_per_sample_summarize.py
+++ b/bin/bins_per_sample_summarize.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 
 """----------------------------------------------------------------------------
-  Script Name: Bins_per_sample.py
+  Script Name: bins_per_sample_summarize.py
   Description: Generate abundances table of bins between samples, with also \
                taxonomic and genomes informations from gtdb-tk and Checkm2.
   Input files: Samtools coverages anf flagstats files per sample, .fasta file of bins, \
@@ -307,4 +307,4 @@ def main():
     write_report_file(general_table, args.report_file, args.checkm_file, args.table_file)
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/bin/merge_abundance_and_functional_annotations.py b/bin/merge_abundance_and_functional_annotations.py
index 3885286..614b123 100755
--- a/bin/merge_abundance_and_functional_annotations.py
+++ b/bin/merge_abundance_and_functional_annotations.py
@@ -28,18 +28,14 @@ __status__ = 'dev'
 # Status: dev.
 
 # Modules importation.
-try:
-    import argparse
-    import re
-    import sys
-    import pandas as pd
-    from datetime import datetime
-except ImportError as error:
-    print(error)
-    exit(1)
-
-# Print time.
-print(str(datetime.now()))
+
+import argparse
+import re
+import sys
+import pandas as pd
+from datetime import datetime
+
+
 
 # Manage parameters.
 parser = argparse.ArgumentParser(description = 'Script which join \
diff --git a/bin/merge_annotations.py b/bin/merge_annotations.py
index afa038a..4b35037 100755
--- a/bin/merge_annotations.py
+++ b/bin/merge_annotations.py
@@ -268,4 +268,4 @@ def main():
     
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/bin/merge_kaiju_results.py b/bin/merge_kaiju_results.py
index 9980126..25cdb1c 100755
--- a/bin/merge_kaiju_results.py
+++ b/bin/merge_kaiju_results.py
@@ -21,18 +21,13 @@ __status__ = 'dev'
 # Status: dev.
 
 # Modules importation.
-try:
-    import argparse
-    import re
-    import sys
-    import pandas as pd
-    from datetime import datetime
-except ImportError as error:
-    print(error)
-    exit(1)
-
-# Print time.
-print(str(datetime.now()))
+
+import argparse
+import re
+import sys
+import pandas as pd
+from datetime import datetime
+
 
 # Manage parameters.
 parser = argparse.ArgumentParser(description = 'Script which join \
diff --git a/bin/Quantification_clusters.py b/bin/quantification_clusters.py
similarity index 88%
rename from bin/Quantification_clusters.py
rename to bin/quantification_clusters.py
index 49f4ad4..84c7367 100755
--- a/bin/Quantification_clusters.py
+++ b/bin/quantification_clusters.py
@@ -1,15 +1,14 @@
 #!/usr/bin/env python
 
 """--------------------------------------------------------------------
-  Script Name: Quantification_clusters.py
+  Script Name: quantification_clusters.py
   Description: Create a file which join
                table with global cluster id and intermediate cluster id
                to table with intermediate cluster id and genes id.
                Create a file which contains
                sum of reads aligned
                to each gene of a cluster.
-  Input files:
-               1st input file: table_clstr.txt (table with cluster id
+  Input files: 1st input file: table_clstr.txt (table with cluster id
                and corresponding intermediate cluster ids)
                2nd input file: file containing list of file names
                generated with 1st cd-hit for each sample
@@ -17,7 +16,7 @@
                3rd input file: file containing list of file names
                generated with featureCounts for each sample
                (.featureCounts.count files)
-  Created By:  Joanna Fourquet et Celine Noirot
+  Created By:  Joanna Fourquet and Celine Noirot
   Date:        2019-04-11
 -----------------------------------------------------------------------
 """
@@ -34,14 +33,11 @@ __status__ = 'dev'
 # Status: dev.
 
 # Modules importation.
-try:
-    import argparse
-    import re
-    import sys
-    from datetime import datetime
-except ImportError as error:
-    print(error)
-    exit(1)
+import argparse
+import re
+import sys
+from datetime import datetime
+
 
 # Print time.
 print(str(datetime.now()))
@@ -52,22 +48,22 @@ correspondence table between global cluster id and gene id and \
 a table with number of aligned reads in each sample and for each \
 global cluster id.')
 
-parser.add_argument('-t', '--table_of_corespondences', required = True, \
+parser.add_argument('-t', '--table_of_corespondences', required = True, 
 help = 'Correspondence table between global cluster \
 id and intermediate cluster id.')
 
-parser.add_argument('-l', '--list_of_file_clusters', required = True, \
+parser.add_argument('-l', '--list_of_file_clusters', required = True, 
 help = 'List of files containing correspondence tables between \
 cluster intermediate cluster id and gene id per sample.')
 
-parser.add_argument('-c', '--list_of_file_counts', required = True, \
+parser.add_argument('-c', '--list_of_file_counts', required = True, 
 help = 'List of files storing read counts for each gene per sample.')
 
-parser.add_argument('-oc', '--output_counts', required = True, \
+parser.add_argument('-oc', '--output_counts', required = True, 
 help = 'Name of output file containing counts \
 for each global cluster id and each sample.')
 
-parser.add_argument('-oid', '--output_id', required = True, \
+parser.add_argument('-oid', '--output_id', required = True, 
 help = 'Name of output file containing correspondence table \
 between global cluster id and gene id.')
 
@@ -100,8 +96,6 @@ with open(args.table_of_corespondences) as fp:
 print(d_g_clstr_id_by_int_clstr_id)
 print(d_count_by_g_clstr)
 
-# Print date.
-print(str(datetime.now()))
 
 # Initialization of dictionnary d_g_clstr_id_by_gene_id.
 d_g_clstr_id_by_gene_id = {}
@@ -127,10 +121,7 @@ for int_clstr_gene_path in files_of_int_clstr_id_gene_id:
             print(line_int_clstr_gene)
             int_clstr_id = line_int_clstr_gene[0]
             gene_id_from_clstr_gene_path = line_int_clstr_gene[1]
-            if \
-            'd_g_clstr_id_by_gene_id[gene_id_from_clstr_gene_path]' \
-             not in d_g_clstr_id_by_gene_id:
-                print("if")
+            if 'd_g_clstr_id_by_gene_id[gene_id_from_clstr_gene_path]' not in d_g_clstr_id_by_gene_id:
                 d_g_clstr_id_by_gene_id[gene_id_from_clstr_gene_path] \
                 = d_g_clstr_id_by_int_clstr_id[int_clstr_id]
             else:
@@ -139,8 +130,7 @@ for int_clstr_gene_path in files_of_int_clstr_id_gene_id:
 
 print(d_g_clstr_id_by_gene_id)
 
-# Print date.
-print(str(datetime.now()))
+
 
 # For each count file (output of featureCounts), reading of lines one by one,
 # recovery of name of gene and count number and incrementing of corresponding
@@ -157,8 +147,7 @@ for (count_idx,counts_path) in enumerate(sorted(files_of_counts)):
             d_count_by_g_clstr[d_g_clstr_id_by_gene_id[gene_id]]\
             [count_idx] += gene_count
 
-# Print date.
-print(str(datetime.now()))
+
 
 #######################################
 # Write in the output files.
@@ -178,8 +167,7 @@ with open(args.output_id,"w") as foutput_res_table:
         + gene_id \
         + "\n")
 
-# Print date.
-print(str(datetime.now()))
+
 
 # Write output file containing global cluster id and read count for each sample.
 with open(args.output_counts,"w") as foutput_res_counts:
@@ -192,5 +180,4 @@ with open(args.output_counts,"w") as foutput_res_counts:
         + "\t".join([str(i) for i in count])\
         + "\n")
 
-# Print date.
-print(str(datetime.now()))
+
diff --git a/modules/assign_taxonomy.nf b/modules/assign_taxonomy.nf
index 559c06c..9634f94 100644
--- a/modules/assign_taxonomy.nf
+++ b/modules/assign_taxonomy.nf
@@ -38,7 +38,7 @@ process ASSIGN_TAXONOMY {
     fi
 
 
-    aln2taxaffi.py -a ${accession2taxid} --taxonomy \$new_taxdump_var \
+    aln_to_tax_affi.py -a ${accession2taxid} --taxonomy \$new_taxdump_var \
                   -o ${meta.id} -b ${m8} --keep_only_best_aln  \
                    -v --write_top_taxons 
     merge_contig_quantif_perlineage.py -c ${meta.id}.percontig.tsv -s ${sam_coverage} -o ${meta.id}
diff --git a/modules/feature_counts.nf b/modules/feature_counts.nf
index 97587cc..73ec38c 100644
--- a/modules/feature_counts.nf
+++ b/modules/feature_counts.nf
@@ -37,7 +37,7 @@ process QUANTIFICATION_TABLE {
     """
     ls ${clusters_contigs} | cat > List_of_contigs_files.txt
     ls ${counts_files} | cat > List_of_count_files.txt
-    Quantification_clusters.py -t ${global_clusters_clusters} -l List_of_contigs_files.txt -c List_of_count_files.txt -oc Clusters_Count_table_all_samples.txt -oid Correspondence_global_clstr_genes.txt
+    quantification_clusters.py -t ${global_clusters_clusters} -l List_of_contigs_files.txt -c List_of_count_files.txt -oc Clusters_Count_table_all_samples.txt -oid Correspondence_global_clstr_genes.txt
     """
 }
 
diff --git a/modules/sum_up_bins_informations.nf b/modules/sum_up_bins_informations.nf
index d09a2a1..842b43c 100644
--- a/modules/sum_up_bins_informations.nf
+++ b/modules/sum_up_bins_informations.nf
@@ -18,7 +18,7 @@ process GENOMES_ABUNDANCES_PER_SAMPLE {
    script:
       """
       mkdir -p stats
-      Bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \
+      bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \
       --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions} \
       --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \
       --output_file genomes_abundances.tsv --report_file stats/genomes_abundances_mqc.tsv \
@@ -76,4 +76,4 @@ process BINS_STATS_TO_MUTLIQC_FORMAT {
    cat bin_size_per_quality.tsv >> bin_size_per_quality_mqc.tsv
 
    """
-}
\ No newline at end of file
+}
-- 
GitLab


From 7827694c726844bb60b1766b8aac673f6ff341af Mon Sep 17 00:00:00 2001
From: VIENNE MAINA <maina.vienne@inrae.fr>
Date: Tue, 8 Nov 2022 10:53:03 +0100
Subject: [PATCH 11/31] Fix filter_diamond_hits.py

---
 bin/filter_diamond_hits.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/bin/filter_diamond_hits.py b/bin/filter_diamond_hits.py
index 375542a..a29e514 100755
--- a/bin/filter_diamond_hits.py
+++ b/bin/filter_diamond_hits.py
@@ -68,9 +68,8 @@ def get_all_hits_per_query(blast_result_file):
 def is_identity_and_coverage_ok(hit, min_identity, min_coverage):
 
     qcovhsp = (int(hit["qend"]) - int(hit["qstart"]) + 1) / int(hit['qlen'])
-    if float(hit['pident']) >= min_identity or qcovhsp >= min_coverage:
-        return True
-    return False
+    return float(hit['pident']) >= min_identity and qcovhsp >= min_coverage
+
 
 
 def parse_arguments():
-- 
GitLab


From 35a5f84eced5556e2df392b319e3fc8098240938 Mon Sep 17 00:00:00 2001
From: VIENNE MAINA <maina.vienne@inrae.fr>
Date: Tue, 8 Nov 2022 12:04:11 +0100
Subject: [PATCH 12/31] Update bin/format_bins_stat_to_multiqc.py,
 bin/merge_annotations.py, bin/plot_contigs_taxonomic_affiliation.py,
 bin/plot_kaiju_stat.py, bin/quantification_by_contig_lineage.py,
 bin/quantification_by_functional_annotation.py, bin/db_versions.py

---
 bin/db_versions.py                            | 16 +++++++++-----
 bin/format_bins_stat_to_multiqc.py            |  4 +++-
 bin/merge_annotations.py                      | 12 +++++-----
 bin/plot_contigs_taxonomic_affiliation.py     |  6 ++---
 bin/plot_kaiju_stat.py                        |  8 +++----
 bin/quantification_by_contig_lineage.py       | 22 ++++++++-----------
 ...quantification_by_functional_annotation.py | 14 +++++-------
 7 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/bin/db_versions.py b/bin/db_versions.py
index 3088ea3..bf762bc 100755
--- a/bin/db_versions.py
+++ b/bin/db_versions.py
@@ -5,21 +5,25 @@ import subprocess
 import os.path
 import gzip 
 
-def info_db(file):
-	name=file.split()[0]
-	path =os.path.realpath(file.split()[1])
-	if file.split()[1]=='nodes.dmp':
+def info_db(db_info_file):
+	name=db_info_file.split()[0]
+	path =os.path.realpath(db_info_file.split()[1])
+
+	if db_info_file.split()[1]=='nodes.dmp':
 		path=os.path.dirname(path)
 
+	# get db size 
 	process = subprocess.run(['du', '-sh',path], stdout=subprocess.PIPE)
 	size = process.stdout.split()[0].decode('utf-8')
+
 	if (name == "Host_genome"):
 		size = f"{size} ({get_genome_seq_count(path)} seq)"
 	
+	# get date of last modifaction 
 	process = subprocess.run(['stat', '-c %y', path], stdout=subprocess.PIPE)
-	modif = process.stdout.split()[0].decode('utf-8')
+	modif_date = process.stdout.split()[0].decode('utf-8')
 
-	return name,size,modif,path
+	return name,size,modif_date,path
 
 
 def get_genome_seq_count(genome_path):
diff --git a/bin/format_bins_stat_to_multiqc.py b/bin/format_bins_stat_to_multiqc.py
index e94ee23..05fdeb6 100755
--- a/bin/format_bins_stat_to_multiqc.py
+++ b/bin/format_bins_stat_to_multiqc.py
@@ -22,7 +22,7 @@ import pandas as pd
 
 
 def parse_arguments():
-    """Parse script arguments."""
+    """Parse script arguments."""bins_per_sample_summarize.py
     parser = ArgumentParser(description="...",
                             formatter_class=ArgumentDefaultsHelpFormatter)
 
@@ -65,6 +65,7 @@ def main():
 
         # remove Not_binned category when counting bins in different quality
         df_ech_count = df_ech.loc[~(df_ech['genome'] == "Not_binned")]
+
         df_ech_gr = df_ech_count.groupby(['Quality']).size().reset_index(name='counts')
         df_ech_gr = df_ech_gr.set_index("Quality")
  
@@ -91,6 +92,7 @@ def main():
 
     logging.info(f'Writing {args.out_bins_count}')
     df_count.to_csv(args.out_bins_count, sep='\t')
+    
     logging.info(f'Writing {args.out_bins_size}')
     df_size.to_csv(args.out_bins_size, sep='\t')
 
diff --git a/bin/merge_annotations.py b/bin/merge_annotations.py
index 4b35037..8843414 100755
--- a/bin/merge_annotations.py
+++ b/bin/merge_annotations.py
@@ -12,7 +12,7 @@ merge_annotations.py -c prodigal.gff -r barrnap.gff -t trnascan_se.gff --contig_
 
 # Metadata
 __author__ = 'Mainguy Jean - Plateforme bioinformatique Toulouse'
-__copyright__ = 'Copyright (C) 2020 INRAE'
+__copyright__ = 'Copyright (C) 2022 INRAE'
 __license__ = 'GNU General Public License'
 __version__ = '0.1'
 __email__ = 'support.bioinfo.genotoul@inra.fr'
@@ -25,7 +25,7 @@ import gzip
 import csv
 from collections import defaultdict
 import pyfastx
-from Bio.Seq import Seq
+
 
 
 def parse_arguments():
@@ -244,16 +244,16 @@ def main():
     out_report = args.report_output
 
 
-    logging.info('Parsing rRNA annoations.')
+    logging.info('Parsing rRNA annotations.')
     rrna_annotations = parse_gff_file(rrna_file, 'rRNA')
 
-    logging.info('Parsing tRNA annoations.')
+    logging.info('Parsing tRNA annotations.')
     trna_annotations = parse_gff_file(trna_file, 'tRNA')
 
-    logging.info('Grouping RNA annoations by contig.')
+    logging.info('Grouping RNA annotations by contig.')
     contig2rnas = group_annotations_by_contigs(trna_annotations, rrna_annotations)
     
-    logging.info('Removing CDS annoations overlapping a RNA annotations.')
+    logging.info('Removing CDS annotations overlapping a RNA annotations.')
     unoverlapping_cds_per_contig = remove_overlapping_cds(cds_file, contig2rnas)
 
     logging.info('Merge CDS and RNA annotations by contig.')
diff --git a/bin/plot_contigs_taxonomic_affiliation.py b/bin/plot_contigs_taxonomic_affiliation.py
index 3d51a38..a170c28 100755
--- a/bin/plot_contigs_taxonomic_affiliation.py
+++ b/bin/plot_contigs_taxonomic_affiliation.py
@@ -303,7 +303,7 @@ def parse_arguments():
 
     parser.add_argument('--output_dir', default='plots', help="Name of the output directory")
 
-    parser.add_argument('--top_taxon', default=10, type=int, help="Plot only the top n most abundant taxa.")
+    parser.add_argument('--nb_top_taxon', default=10, type=int, help="Plot only the top n most abundant taxa.")
 
     parser.add_argument("-v", "--verbose", help="increase output verbosity",
                         action="store_true")
@@ -326,7 +326,7 @@ def main():
 
 
     contig_affi_files = args.affi_taxo_quantif
-    top_n_taxon = args.top_taxon
+    top_n_taxon = args.nb_top_taxon
     output_dir = args.output_dir
 
     os.makedirs(output_dir, exist_ok=True)
@@ -385,7 +385,7 @@ def main():
         html_figs = []
         for i, (rank, fig) in enumerate(rank2fig.items()):
             
-            include_plotly = True if i == 0 else False
+            include_plotly = i == 0
 
             html_fig = fig.to_html(full_html=False, include_plotlyjs=include_plotly)
             html_figs.append((rank, html_fig )) 
diff --git a/bin/plot_kaiju_stat.py b/bin/plot_kaiju_stat.py
index 9ea7e0b..24fcde5 100755
--- a/bin/plot_kaiju_stat.py
+++ b/bin/plot_kaiju_stat.py
@@ -2,7 +2,7 @@
 
 """----------------------------------------------------------------------------------------------------------------------------------------------------------
   Script Name: plot_kaiju_stat.py
-  Description: Generates density plot distribution of kiaju match length
+  Description: Generates density plot distribution of kaiju match length
   Input files:
                Verbose files generated by Kaiju for each sample.
   Created By:  Jean Mainguy
@@ -88,7 +88,7 @@ def main():
 
     density_df_samples = []
     max_match_length = 0
-    sample2dfmatchlength = {}
+    sample2df_matchlen = {}
 
     logging.info(f'Parsing kaiju results')
     for kaiju_result in kaiju_results:
@@ -98,14 +98,14 @@ def main():
         df_matchlen =  parse_matchlen_from_kaiju_output(kaiju_result)
         
         max_match_length = max((max_match_length, df_matchlen['match_length'].max()))
-        sample2dfmatchlength[sample_name] = df_matchlen
+        sample2df_matchlen[sample_name] = df_matchlen
 
     x_vals = np.linspace(0,max_match_length,max_match_length) # Specifying the limits of our data
     logging.info(f'Maximum match length is {max_match_length}')
     
     logging.info(f'Computing density for each sample')
 
-    for sample_name, df_matchlen in sample2dfmatchlength.items():
+    for sample_name, df_matchlen in sample2df_matchlen.items():
         density = gaussian_kde(df_matchlen['match_length'], weights = df_matchlen['reads'])
         density.covariance_factor = lambda : smoothing_parameter #Smoothing parameter
         density._compute_covariance()
diff --git a/bin/quantification_by_contig_lineage.py b/bin/quantification_by_contig_lineage.py
index 3092db3..d85fb2f 100755
--- a/bin/quantification_by_contig_lineage.py
+++ b/bin/quantification_by_contig_lineage.py
@@ -22,19 +22,15 @@ __status__ = 'dev'
 # Status: dev.
 
 # Modules importation.
-try:
-    import argparse
-    import re
-    import sys
-    import pandas as pd
-    import os
-    from datetime import datetime
-except ImportError as error:
-    print(error)
-    exit(1)
-
-# Print time.
-print(str(datetime.now()))
+
+import argparse
+import re
+import sys
+import pandas as pd
+import os
+from datetime import datetime
+
+
 
 # Manage parameters.
 parser = argparse.ArgumentParser(description = 'Script which make \
diff --git a/bin/quantification_by_functional_annotation.py b/bin/quantification_by_functional_annotation.py
index a6e0e72..e9e7515 100755
--- a/bin/quantification_by_functional_annotation.py
+++ b/bin/quantification_by_functional_annotation.py
@@ -23,14 +23,12 @@ __status__ = 'dev'
 # Status: dev.
 
 # Modules importation.
-try:
-    import argparse
-    import re
-    import sys
-    import pandas as pd
-except ImportError as error:
-    print(error)
-    exit(1)
+
+import argparse
+import re
+import sys
+import pandas as pd
+
 
 # Manage parameters.
 parser = argparse.ArgumentParser(description = 'Create a file with \
-- 
GitLab


From 2cb9f5af7d7e66ffcd770f9dcaff7821af02b766 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Thu, 17 Nov 2022 16:28:50 +0100
Subject: [PATCH 13/31] remove unnecessary line

---
 modules/cd_hit.nf                   | 1 -
 subworkflows/02_assembly.nf         | 2 --
 subworkflows/04_structural_annot.nf | 2 --
 3 files changed, 5 deletions(-)

diff --git a/modules/cd_hit.nf b/modules/cd_hit.nf
index 217bc92..1bf02ec 100644
--- a/modules/cd_hit.nf
+++ b/modules/cd_hit.nf
@@ -55,7 +55,6 @@ main:
   INDIVIDUAL_CD_HIT( ch_assembly, ch_percentage_identity )
   ch_individual_clusters = INDIVIDUAL_CD_HIT.out.clstr_fasta.collect()
   GLOBAL_CD_HIT(ch_individual_clusters , ch_percentage_identity )
-  ch_ffn = ch_assembly.flatMap{it -> it[1]}.collect()
 
 emit:
   individual_clstr_table = INDIVIDUAL_CD_HIT.out.clstr_table
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index 3797c8f..5630b4e 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -80,8 +80,6 @@ workflow STEP_02_ASSEMBLY {
       ch_reads_assembly = ch_reads
     }
 
-    ch_reads_assembly.view()
-
     if (has_assembly){
       ch_assembly = assembly   
     } else {
diff --git a/subworkflows/04_structural_annot.nf b/subworkflows/04_structural_annot.nf
index 49841ef..505e633 100644
--- a/subworkflows/04_structural_annot.nf
+++ b/subworkflows/04_structural_annot.nf
@@ -17,8 +17,6 @@ workflow STEP_04_STRUCTURAL_ANNOT {
 
     MERGE_ANNOTATIONS(annotations_ch)
 
-    ch_software_versions = PRODIGAL.out.v_prodigal.first()
-
     ch_software_versions = PRODIGAL.out.v_prodigal.first().mix( BARRNAP.out.v_barrnap.first(), 
                                                                 TRNASCAN_SE.out.v_tRNAscan.first())
 
-- 
GitLab


From ae32ab7e48aecf5f29f36fae4efa4295f4262796 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Thu, 17 Nov 2022 16:29:29 +0100
Subject: [PATCH 14/31] Manage coassembly for step 03 (filtering)

---
 subworkflows/03_filtering.nf | 156 +++++++++++++++++++++++------------
 1 file changed, 103 insertions(+), 53 deletions(-)

diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf
index be38cd6..cbb1dcf 100644
--- a/subworkflows/03_filtering.nf
+++ b/subworkflows/03_filtering.nf
@@ -13,10 +13,33 @@ workflow STEP_03_ASSEMBLY_FILTER {
   main:
     ch_chunk_assembly_for_filter = assemblies
                                   .splitFasta(by: 100000, file: true)
-  
 
-    ch_assembly_and_idxstats = ch_chunk_assembly_for_filter
+    if (params.coassembly){ 
+      idxstats.map { meta, idxstats ->
+                   [ meta.group, meta, idxstats] }
+              .groupTuple(by: [0])
+              .map { group, metas, idxstats -> 
+                    def meta = [:]
+                    meta.id = metas.group.unique().join()
+                    meta.sample = metas.sample.join("_")
+                    meta.flowcell = metas.flowcell.unique().join()
+                    meta.group =  metas.group.unique().join()
+                    meta.assembly = metas.assembly.unique().join()
+                    meta.type = metas.type.unique().join()
+                    [ group, meta, idxstats] }
+              .set { ch_idxstats_tmp }
+      ch_chunk_assembly_for_filter.map { meta, assembly ->
+                                       [ meta.group, assembly ]} 
+                                  .combine(ch_idxstats_tmp, by: 0)
+                                  .map { group, assembly, meta, idxstats -> 
+                                       [ meta, assembly, idxstats ]}
+                                  .set { ch_assembly_and_idxstats }
+    } else {
+      ch_assembly_and_idxstats = ch_chunk_assembly_for_filter
                                 .combine(idxstats, by:0)
+    }
+
+    
 
     CHUNK_ASSEMBLY_FILTER (
       ch_assembly_and_idxstats,
@@ -42,57 +65,84 @@ workflow STEP_03_ASSEMBLY_FILTER {
     discarded_contigs = MERGE_ASSEMBLY_FILTER.out.merged_discarded
 
 
-   // Differentiate sample with and without discarded_contigs 
-   // samples with no discarded_contigs are not going to be processed to process 
-   discarded_contigs_category = discarded_contigs.branch{
-                              without: it[1].isEmpty() 
-                              with: !it[1].isEmpty() 
-                              }
-
-
-   samples_without_discarded_contigs = discarded_contigs_category.without.map{ it -> it[0]}
-   ch_bam_of_sample_without_discarded_contigs = samples_without_discarded_contigs.join(bam)
-   
-  ch_discarded_contigs_and_bam = discarded_contigs_category.with.join(bam)
-
-  FILTER_BAM(ch_discarded_contigs_and_bam)
-
-  ch_discarded_bam = FILTER_BAM.out.discarded_contigs_bam
-  ch_selected_bam = FILTER_BAM.out.selected_contigs_bam 
-
-  if ( params.type.toUpperCase() == "SR" ) {
-        
-        ch_discarded_reads = EXTRACT_PAIRED_READS(ch_discarded_bam)
-
-        ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads) 
-
-        BWA_MEM(ch_contigs_and_discarded_reads, "03_filtering")
-        ch_bam_with_discarded_reads = BWA_MEM.out.bam
-
-  } else {
-        ch_discarded_reads = EXTRACT_SINGLE_READS(ch_discarded_bam)
-        ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads) 
-        MINIMAP2(ch_contigs_and_discarded_reads, "03_filtering")
-        ch_bam_with_discarded_reads = MINIMAP2.out.bam
-  
-  }
-
-
-   // remove bai from ch_bam_with_discarded_reads and add bam of selected contigs
-  ch_bam_to_merge = ch_bam_with_discarded_reads.map{ it -> [it[0], it[1]]}.join(ch_selected_bam)
-
-  ch_merged_bam = MERGE_BAM_FILES(ch_bam_to_merge)
-
-   ch_final_bam = ch_bam_of_sample_without_discarded_contigs.mix(ch_merged_bam)
-
-
-  GET_ALIGNMENT_METRICS(ch_final_bam, '03_filtering/')
-
-  ch_flagstat =  GET_ALIGNMENT_METRICS.out.sam_flagstat
-  ch_coverage =  GET_ALIGNMENT_METRICS.out.sam_coverage
-
-  QUAST( ch_merged_selected, "03_filtering/quast_filtered" )
-  ch_quast_report = QUAST.out.report
+    // Differentiate sample with and without discarded_contigs 
+    // samples with no discarded_contigs are not going to be processed to process 
+    discarded_contigs_category = discarded_contigs.branch{
+                                without: it[1].isEmpty() 
+                                with: !it[1].isEmpty() 
+                                }
+    
+
+    samples_without_discarded_contigs = discarded_contigs_category.without.map{ it -> it[0]}
+
+    if (params.coassembly){
+      bam.map { meta, bam, bai ->
+                   [ meta.group, meta, bam, bai ]}
+              .set { ch_bam_tmp }
+
+      samples_without_discarded_contigs.map { meta, assembly ->
+                                            [ meta.group, assembly ]} 
+                                       .combine( ch_bam_tmp, by: 0)
+                                       .map { group, assembly, meta, bam, bai -> 
+                                            [ meta, assembly, bam, bai ]}
+                                       .set { ch_bam_of_sample_without_discarded_contigs }
+      discarded_contigs_category.with.map { meta, discarded_contigs ->
+                                            [ meta.group, discarded_contigs ]} 
+                                       .combine( ch_bam_tmp, by: 0)
+                                       .map { group, discarded_contigs, meta, bam, bai -> 
+                                            [ meta, discarded_contigs, bam, bai ]}
+                                       .set { ch_discarded_contigs_and_bam }
+      // metadata retrieval from bam channel, for the join with discarded reads after "EXTRACT_PAIRED_READS"                                                            
+      ch_merged_selected.map { meta, assembly ->
+                              [ meta.group, assembly ]} 
+                        .combine( ch_bam_tmp, by: 0)
+                        .map { group, assembly, meta, bam, bai -> 
+                             [ meta, assembly ]}
+                        .set { ch_merged_selected_meta }
+    } else {
+      ch_bam_of_sample_without_discarded_contigs = samples_without_discarded_contigs.join(bam)
+      ch_discarded_contigs_and_bam = discarded_contigs_category.with.join(bam)
+      ch_merged_selected_meta = ch_merged_selected
+    }
+
+    FILTER_BAM(ch_discarded_contigs_and_bam)
+
+    ch_discarded_bam = FILTER_BAM.out.discarded_contigs_bam
+    ch_selected_bam = FILTER_BAM.out.selected_contigs_bam 
+
+
+
+    if ( params.type.toUpperCase() == "SR" ) {
+          
+          ch_discarded_reads = EXTRACT_PAIRED_READS(ch_discarded_bam)
+          ch_contigs_and_discarded_reads = ch_merged_selected_meta.join(ch_discarded_reads) 
+
+          BWA_MEM(ch_contigs_and_discarded_reads, "03_filtering")
+          ch_bam_with_discarded_reads = BWA_MEM.out.bam
+
+    } else {
+          ch_discarded_reads = EXTRACT_SINGLE_READS(ch_discarded_bam)
+          ch_contigs_and_discarded_reads = ch_merged_selected_meta.join(ch_discarded_reads) 
+          MINIMAP2(ch_contigs_and_discarded_reads, "03_filtering")
+          ch_bam_with_discarded_reads = MINIMAP2.out.bam
+    
+    }
+
+    // remove bai from ch_bam_with_discarded_reads and add bam of selected contigs
+    ch_bam_to_merge = ch_bam_with_discarded_reads.map{ it -> [it[0], it[1]]}.join(ch_selected_bam)
+
+    ch_merged_bam = MERGE_BAM_FILES(ch_bam_to_merge)
+
+    ch_final_bam = ch_bam_of_sample_without_discarded_contigs.mix(ch_merged_bam)
+
+
+    GET_ALIGNMENT_METRICS(ch_final_bam, '03_filtering/')
+
+    ch_flagstat =  GET_ALIGNMENT_METRICS.out.sam_flagstat
+    ch_coverage =  GET_ALIGNMENT_METRICS.out.sam_coverage
+
+    QUAST( ch_merged_selected, "03_filtering/quast_filtered" )
+    ch_quast_report = QUAST.out.report
 
   emit:
     selected_contigs = ch_merged_selected
-- 
GitLab


From e735885ed9632cad17b4b6d568e9897efa9b0a91 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Fri, 28 Oct 2022 13:24:11 +0200
Subject: [PATCH 15/31] Reduce main.nf by moving  "manage flowcell" in the
 subworflow 02_assembly

---
 main.nf                     | 45 +--------------------------------
 subworkflows/02_assembly.nf | 50 +++++++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 52 deletions(-)

diff --git a/main.nf b/main.nf
index d9bc02b..d364366 100644
--- a/main.nf
+++ b/main.nf
@@ -244,7 +244,6 @@ workflow {
         def meta = [:] 
         meta.id = item.sample
         if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-        if (item.group !=null) {meta.id = meta.id+"_"+item.group}
         meta.sample = item.sample
         meta.flowcell = item.flowcell
         meta.group = item.group
@@ -265,7 +264,6 @@ workflow {
       def meta = [:] 
       meta.id = item.sample
       if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-      if (item.group !=null) {meta.id = meta.id+"_"+item.group}
       meta.sample = item.sample
       meta.flowcell = item.flowcell
       meta.group = item.group
@@ -362,49 +360,8 @@ workflow {
   }
 
   if ( !params.stop_at_clean ) {
-		if (!has_assembly & has_flowcell ){ 
-			//////////////////
-			// Manage Flowcell 
-			//////////////////
-			ch_reads_tmp = ch_preprocessed_reads
-				.map {
-					meta, fastq ->
-					[ meta.sample, meta, fastq ] 
-					}
-				.groupTuple(by: [0])
-				.branch {
-					id, meta, fastq ->
-						single  : fastq.size() == 1
-							return [[id:meta.sample.unique().join(),
-							sample:meta.sample.unique().join(),
-							flowcell:meta.flowcell.join("_"),
-							group:meta.group.unique().join(),
-							assembly:meta.assembly.unique().join(),
-							type:meta.type.unique().join()], fastq.flatten() ]
-						multiple: fastq.size() > 1
-							return [[id:meta.sample.unique().join(),
-							sample:meta.sample.unique().join(),
-							flowcell:meta.flowcell.join("_"),
-							group:meta.group.unique().join(),
-							assembly:meta.assembly.unique().join(),
-							type:meta.type.unique().join()], fastq.flatten() ]
-				}
-
-
-			MERGE_FASTQ (
-				ch_reads_tmp.multiple
-			)
-				.reads
-				.mix(ch_reads_tmp.single)
-				.set{ch_preprocessed_reads}
-		}
-		
-      /////////////////////
-      //End manage Flowcell
-      /////////////////////
-
-    S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool )
 
+    S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool, has_flowcell )
     ch_assembly = S02_ASSEMBLY.out.assembly
     ch_reads = S02_ASSEMBLY.out.reads
     ch_bam = S02_ASSEMBLY.out.bam
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index d202dbd..3f40572 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -6,10 +6,11 @@ include { GET_ALIGNMENT_METRICS } from '../modules/read_alignment_manipulation'
 
 workflow STEP_02_ASSEMBLY {
   take: 
-    reads
-    assembly
-    has_assembly
-    assembly_tool
+  reads
+  assembly
+  has_assembly
+  assembly_tool
+  has_flowcell
 
   main:
     ch_assembler_v = Channel.empty() 
@@ -17,11 +18,45 @@ workflow STEP_02_ASSEMBLY {
     ch_bwa_mem_v = Channel.empty()
     ch_minimap2_v = Channel.empty()
     ch_samtools_v = Channel.empty()
-  
+
+
+    if (!has_assembly & has_flowcell ){ 
+			//////////////////
+			// Manage Flowcell 
+			//////////////////
+			ch_reads_flowcell = reads
+				.map { meta, fastq ->
+					   [ meta.sample, meta, fastq ] }
+				.groupTuple(by: [0])
+				.branch {	id, meta, fastq ->
+          single : fastq.size() == 1
+            return [[id:meta.sample.unique().join(),
+            sample:meta.sample.unique().join(),
+            flowcell:meta.flowcell.join("_"),
+            group:meta.group.unique().join(),
+            assembly:meta.assembly.unique().join(),
+            type:meta.type.unique().join()], fastq.flatten() ]
+          multiple: fastq.size() > 1
+            return [[id:meta.sample.unique().join(),
+            sample:meta.sample.unique().join(),
+            flowcell:meta.flowcell.join("_"),
+            group:meta.group.unique().join(),
+            assembly:meta.assembly.unique().join(),
+            type:meta.type.unique().join()], fastq.flatten() ]
+				}
+
+			MERGE_FASTQ (ch_reads_flowcell.multiple)
+				.reads
+				.mix(ch_reads_flowcell.single)
+				.set{ch_reads}
+      
+		} else {
+      ch_reads = reads
+    }
+
     if (has_assembly){
       ch_assembly = assembly   
-    } 
-    else {
+    } else {
       if(assembly_tool == 'metaspades') {
         METASPADES(reads)
         ch_assembly = METASPADES.out.assembly
@@ -58,7 +93,6 @@ workflow STEP_02_ASSEMBLY {
 
     ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true)
     if ( params.type.toUpperCase() == "SR" ) {
-
       READS_DEDUPLICATION ( ch_contigs_and_reads )
       
       ch_reads = READS_DEDUPLICATION.out.dedup_reads
-- 
GitLab


From 3cf8c518179501558d5c8cee44789b54f351b66a Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Tue, 8 Nov 2022 08:49:11 +0100
Subject: [PATCH 16/31] manage coassembly for step 2

	Adds coassembly parameter
	Adapts assembly modules to take multiple fastq
---
 modules/assembly.nf         | 26 ++++++++---
 nextflow.config             |  1 +
 subworkflows/02_assembly.nf | 86 ++++++++++++++++++++++++++-----------
 3 files changed, 83 insertions(+), 30 deletions(-)

diff --git a/modules/assembly.nf b/modules/assembly.nf
index 728e8b9..a6f82bf 100644
--- a/modules/assembly.nf
+++ b/modules/assembly.nf
@@ -4,7 +4,7 @@ process METASPADES {
   label 'ASSEMBLY_SR'
 
   input:
-  tuple val(meta), path(reads)
+  tuple val(meta), path(read1), path(read2)
 
   output:
   tuple val(meta), path("metaspades_${meta.id}/${meta.id}.contigs.fa"), emit: assembly
@@ -15,7 +15,21 @@ process METASPADES {
     (_,mem,unit) = (task.memory =~ /(\d+) ([A-Z]B)/)[0]
     if ( unit =~ /GB/ ) { 
       """
-      metaspades.py -t ${task.cpus} -m $mem -1 ${reads[0]} -2 ${reads[1]} -o metaspades_${meta.id}
+      echo "
+[
+    {
+        orientation: \\"fr\\",
+        type: \\"paired-end\\",
+        right reads: [
+            \\"${read1.join('\\",\n            \\"')}\\"
+        ],
+        left reads: [
+            \\"${read2.join('\\",\n            \\"')}\\"
+        ]
+    }
+]" > input.yaml
+
+      metaspades.py -t ${task.cpus} -m $mem --dataset input.yaml -o metaspades_${meta.id}
       mv metaspades_${meta.id}/scaffolds.fasta metaspades_${meta.id}/${meta.id}.contigs.fa
       mv metaspades_${meta.id}/spades.log metaspades_${meta.id}/${meta.id}.log
       mv metaspades_${meta.id}/params.txt metaspades_${meta.id}/${meta.id}.params.txt 
@@ -34,7 +48,7 @@ process MEGAHIT {
   label 'ASSEMBLY_SR'
 
   input:
-  tuple val(meta), path(reads)
+  tuple val(meta), path(read1), path(read2)
 
   output:
   tuple val(meta), path("megahit_${meta.id}/${meta.id}.contigs.fa"), emit: assembly
@@ -43,7 +57,7 @@ process MEGAHIT {
 
   script:
     """
-    megahit -t ${task.cpus} -1 ${reads[0]} -2 ${reads[1]} -o megahit_${meta.id} --out-prefix "${meta.id}"
+    megahit -t ${task.cpus} -1 ${read1.join(',')} -2 ${read2.join(',')} -o megahit_${meta.id} --out-prefix "${meta.id}"
     mv megahit_${meta.id}/options.json megahit_${meta.id}/${meta.id}.params.txt
     rm -r  megahit_${meta.id}/intermediate_contigs
 
@@ -69,7 +83,7 @@ process HIFIASM_META {
   """
   mkdir hifiasm_meta_${meta.id}
 
-  hifiasm_meta -t ${task.cpus} -o ${meta.id} $reads 2> hifiasm_meta_${meta.id}/${meta.id}.log
+  hifiasm_meta -t ${task.cpus} -o ${meta.id}  ${reads.join(' ')} 2> hifiasm_meta_${meta.id}/${meta.id}.log
 
   # gfa to fasta format
   awk '/^S/{print ">"\$2"\\n"\$3}' ${meta.id}.p_ctg.gfa | fold > hifiasm_meta_${meta.id}/${meta.id}.contigs.fa
@@ -98,7 +112,7 @@ process METAFLYE {
   """
   mkdir metaflye_${meta.id}
   
-  flye --pacbio-hifi $reads -o 'metaflye' --meta -t ${task.cpus} 2> metaflye_${meta.id}/${meta.id}.log
+  flye --pacbio-hifi ${reads.join(' ')} -o 'metaflye' --meta -t ${task.cpus} 2> metaflye_${meta.id}/${meta.id}.log
 
   mv  metaflye_${meta.id}/assembly.fasta  metaflye_${meta.id}/${meta.id}.contigs.fa
   mv metaflye_${meta.id}/params.json  metaflye_${meta.id}/${meta.id}.params.json
diff --git a/nextflow.config b/nextflow.config
index 1516f4c..22d7fa8 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -21,6 +21,7 @@ params {
     gtdbtk_bank = ""
     percentage_identity = 0.95
     type = ""
+    coassembly = false
 
     // Stop after step or skip optional step/sub-step.
 
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index 3f40572..542e603 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -54,34 +54,58 @@ workflow STEP_02_ASSEMBLY {
       ch_reads = reads
     }
 
+    if (params.coassembly){ 
+      ch_reads.map { meta, fastq ->
+                   [ meta.group, meta, fastq] }
+        .groupTuple(by: [0])
+        .map { group, metas, fastq ->
+              def meta = [:]
+              meta.id = metas.group.unique().join()
+              meta.sample = metas.sample.join("_")
+              meta.flowcell = metas.flowcell.unique().join()
+              meta.group =  metas.group.unique().join()
+              meta.assembly = metas.assembly.unique().join()
+              meta.type = metas.type.unique().join()
+              if (params.type.toUpperCase() == "SR") {
+                return [meta, fastq.collect { it[0] }, fastq.collect { it[1] }]
+              } else {
+                return [meta, fastq ]
+              }}
+        .set { ch_reads_assembly }
+    } else if (params.type.toUpperCase() == "SR") {
+        ch_reads_assembly = ch_reads
+        .map { meta, fastq ->
+            return [meta, fastq[0], fastq[1]] }
+    } else {
+      ch_reads_assembly = ch_reads
+    }
+
+    ch_reads_assembly.view()
+
     if (has_assembly){
       ch_assembly = assembly   
     } else {
-      if(assembly_tool == 'metaspades') {
-        METASPADES(reads)
-        ch_assembly = METASPADES.out.assembly
-        ch_assembler_v = METASPADES.out.v_metaspades
-      } 
-      else if(assembly_tool == 'megahit') {
-        MEGAHIT(reads)
-        ch_assembly = MEGAHIT.out.assembly
-        ch_assembler_v = MEGAHIT.out.v_megahit
-      } 
-      else if(assembly_tool == 'hifiasm-meta') {
-        HIFIASM_META(reads)
-        ch_assembly = HIFIASM_META.out.assembly
-        ch_assembler_v = HIFIASM_META.out.v_hifiasm_meta
-        } 
-      else if(assembly_tool == 'metaflye') {
-          METAFLYE(reads)
+      if (assembly_tool == 'metaspades') {
+          METASPADES(ch_reads_assembly)
+          ch_assembly = METASPADES.out.assembly
+          ch_assembler_v = METASPADES.out.v_metaspades
+      } else if (assembly_tool == 'megahit') {
+          MEGAHIT(ch_reads_assembly)
+          ch_assembly = MEGAHIT.out.assembly
+          ch_assembler_v = MEGAHIT.out.v_megahit
+      } else if (assembly_tool == 'hifiasm-meta') {
+          HIFIASM_META(ch_reads_assembly)
+          ch_assembly = HIFIASM_META.out.assembly
+          ch_assembler_v = HIFIASM_META.out.v_hifiasm_meta
+      } else if (assembly_tool == 'metaflye') {
+          METAFLYE(ch_reads_assembly)
           ch_assembly = METAFLYE.out.assembly
           ch_assembler_v = METAFLYE.out.v_metaflye
-        } 
-      else {
+      } else {
         exit 1,  "Invalid assembly parameter: ${assembly_tool}"
-        }
+      }
     }
-    
+
     ch_assembly_renamed = RENAME_CONTIGS(ch_assembly)
     ASSEMBLY_QUAST( ch_assembly_renamed,"02_assembly/02_1_primary_assembly/assembly_metric/")
     ch_assembly_report = ASSEMBLY_QUAST.out.report
@@ -89,10 +113,24 @@ workflow STEP_02_ASSEMBLY {
 
     ch_idxstats = Channel.empty()
     ch_flagstat = Channel.empty()
-    ch_reads = reads
+                                          
+    if (params.coassembly){
+      ch_reads.map { meta, fastq ->
+                   [ meta.group, meta, fastq ]}
+              .set { ch_reads_tmp }
+
+      ch_assembly_renamed.map { meta, assembly ->
+                              [ meta.group, assembly ]} 
+                         .combine(ch_reads_tmp, by: 0)
+                         .map { group, assembly, meta, fastq -> 
+                              [ meta, assembly,fastq ]}
+                         .set { ch_contigs_and_reads }
+    } else {
+      ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true)
+    }
 
-    ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true)
     if ( params.type.toUpperCase() == "SR" ) {
+
       READS_DEDUPLICATION ( ch_contigs_and_reads )
       
       ch_reads = READS_DEDUPLICATION.out.dedup_reads
@@ -104,7 +142,7 @@ workflow STEP_02_ASSEMBLY {
         MINIMAP2(ch_contigs_and_reads,"02_assembly/02_3_reads_vs_primary_assembly")
         ch_bam = MINIMAP2.out.bam
 
-        ch_minimap2_v = MINIMAP2.out.v_minimap2
+      ch_minimap2_v = MINIMAP2.out.v_minimap2
         
     }
 
-- 
GitLab


From 706e1428dfa3613e461ba47a05fcbe38bb13b6cf Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Thu, 17 Nov 2022 16:28:50 +0100
Subject: [PATCH 17/31] remove unnecessary line

---
 modules/cd_hit.nf                   | 1 -
 subworkflows/02_assembly.nf         | 2 --
 subworkflows/04_structural_annot.nf | 2 --
 3 files changed, 5 deletions(-)

diff --git a/modules/cd_hit.nf b/modules/cd_hit.nf
index 217bc92..1bf02ec 100644
--- a/modules/cd_hit.nf
+++ b/modules/cd_hit.nf
@@ -55,7 +55,6 @@ main:
   INDIVIDUAL_CD_HIT( ch_assembly, ch_percentage_identity )
   ch_individual_clusters = INDIVIDUAL_CD_HIT.out.clstr_fasta.collect()
   GLOBAL_CD_HIT(ch_individual_clusters , ch_percentage_identity )
-  ch_ffn = ch_assembly.flatMap{it -> it[1]}.collect()
 
 emit:
   individual_clstr_table = INDIVIDUAL_CD_HIT.out.clstr_table
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index 542e603..53f6f69 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -80,8 +80,6 @@ workflow STEP_02_ASSEMBLY {
       ch_reads_assembly = ch_reads
     }
 
-    ch_reads_assembly.view()
-
     if (has_assembly){
       ch_assembly = assembly   
     } else {
diff --git a/subworkflows/04_structural_annot.nf b/subworkflows/04_structural_annot.nf
index 49841ef..505e633 100644
--- a/subworkflows/04_structural_annot.nf
+++ b/subworkflows/04_structural_annot.nf
@@ -17,8 +17,6 @@ workflow STEP_04_STRUCTURAL_ANNOT {
 
     MERGE_ANNOTATIONS(annotations_ch)
 
-    ch_software_versions = PRODIGAL.out.v_prodigal.first()
-
     ch_software_versions = PRODIGAL.out.v_prodigal.first().mix( BARRNAP.out.v_barrnap.first(), 
                                                                 TRNASCAN_SE.out.v_tRNAscan.first())
 
-- 
GitLab


From fec33d4f185af9c237a11b6bdefdaa85a0075a8f Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Thu, 17 Nov 2022 16:29:29 +0100
Subject: [PATCH 18/31] Manage coassembly for step 03 (filtering)

---
 subworkflows/03_filtering.nf | 125 ++++++++++++++++++++++++-----------
 1 file changed, 88 insertions(+), 37 deletions(-)

diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf
index 3e7288b..1d4fdbe 100644
--- a/subworkflows/03_filtering.nf
+++ b/subworkflows/03_filtering.nf
@@ -13,10 +13,33 @@ workflow STEP_03_ASSEMBLY_FILTER {
   main:
     ch_chunk_assembly_for_filter = assemblies
                                   .splitFasta(by: 100000, file: true)
-  
 
-    ch_assembly_and_idxstats = ch_chunk_assembly_for_filter
+    if (params.coassembly){ 
+      idxstats.map { meta, idxstats ->
+                   [ meta.group, meta, idxstats] }
+              .groupTuple(by: [0])
+              .map { group, metas, idxstats -> 
+                    def meta = [:]
+                    meta.id = metas.group.unique().join()
+                    meta.sample = metas.sample.join("_")
+                    meta.flowcell = metas.flowcell.unique().join()
+                    meta.group =  metas.group.unique().join()
+                    meta.assembly = metas.assembly.unique().join()
+                    meta.type = metas.type.unique().join()
+                    [ group, meta, idxstats] }
+              .set { ch_idxstats_tmp }
+      ch_chunk_assembly_for_filter.map { meta, assembly ->
+                                       [ meta.group, assembly ]} 
+                                  .combine(ch_idxstats_tmp, by: 0)
+                                  .map { group, assembly, meta, idxstats -> 
+                                       [ meta, assembly, idxstats ]}
+                                  .set { ch_assembly_and_idxstats }
+    } else {
+      ch_assembly_and_idxstats = ch_chunk_assembly_for_filter
                                 .combine(idxstats, by:0)
+    }
+
+    
 
     CHUNK_ASSEMBLY_FILTER (
       ch_assembly_and_idxstats,
@@ -42,57 +65,85 @@ workflow STEP_03_ASSEMBLY_FILTER {
     discarded_contigs = MERGE_ASSEMBLY_FILTER.out.merged_discarded
 
 
-   // Differentiate sample with and without discarded_contigs 
-   // samples with no discarded_contigs are not going to be processed to process 
-   discarded_contigs_category = discarded_contigs.branch{
-                              without: it[1].isEmpty() 
-                              with: !it[1].isEmpty() 
-                              }
+    // Differentiate sample with and without discarded_contigs 
+    // samples with no discarded_contigs are not going to be processed to process 
+    discarded_contigs_category = discarded_contigs.branch{
+                                without: it[1].isEmpty() 
+                                with: !it[1].isEmpty() 
+                                }
+
+
+    samples_without_discarded_contigs = discarded_contigs_category.without.map{ it -> it[0]}
+
+    if (params.coassembly){
+      bam.map { meta, bam, bai ->
+                   [ meta.group, meta, bam, bai ]}
+              .set { ch_bam_tmp }
 
+      samples_without_discarded_contigs.map { meta, assembly ->
+                                            [ meta.group, assembly ]} 
+                                       .combine( ch_bam_tmp, by: 0)
+                                       .map { group, assembly, meta, bam, bai -> 
+                                            [ meta, assembly, bam, bai ]}
+                                       .set { ch_bam_of_sample_without_discarded_contigs }
+      discarded_contigs_category.with.map { meta, discarded_contigs ->
+                                            [ meta.group, discarded_contigs ]} 
+                                       .combine( ch_bam_tmp, by: 0)
+                                       .map { group, discarded_contigs, meta, bam, bai -> 
+                                            [ meta, discarded_contigs, bam, bai ]}
+                                       .set { ch_discarded_contigs_and_bam }
+      // metadata retrieval from bam channel, for the join with discarded reads after "EXTRACT_PAIRED_READS"                                                            
+      ch_merged_selected.map { meta, assembly ->
+                              [ meta.group, assembly ]} 
+                        .combine( ch_bam_tmp, by: 0)
+                        .map { group, assembly, meta, bam, bai -> 
+                             [ meta, assembly ]}
+                        .set { ch_merged_selected_meta }
+    } else {
+      ch_bam_of_sample_without_discarded_contigs = samples_without_discarded_contigs.join(bam)
+      ch_discarded_contigs_and_bam = discarded_contigs_category.with.join(bam)
+      ch_merged_selected_meta = ch_merged_selected
+    }
 
-   samples_without_discarded_contigs = discarded_contigs_category.without.map{ it -> it[0]}
-   ch_bam_of_sample_without_discarded_contigs = samples_without_discarded_contigs.join(bam)
-   
-  ch_discarded_contigs_and_bam = discarded_contigs_category.with.join(bam)
 
-  FILTER_BAM(ch_discarded_contigs_and_bam)
+    FILTER_BAM(ch_discarded_contigs_and_bam)
 
-  ch_discarded_bam = FILTER_BAM.out.discarded_contigs_bam
-  ch_selected_bam = FILTER_BAM.out.selected_contigs_bam 
+    ch_discarded_bam = FILTER_BAM.out.discarded_contigs_bam
+    ch_selected_bam = FILTER_BAM.out.selected_contigs_bam 
 
-  if ( params.type.toUpperCase() == "SR" ) {
-        
-        ch_discarded_reads = EXTRACT_PAIRED_READS(ch_discarded_bam)
+    if ( params.type.toUpperCase() == "SR" ) {
+          
+          ch_discarded_reads = EXTRACT_PAIRED_READS(ch_discarded_bam)
 
-        ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads) 
+          ch_contigs_and_discarded_reads = ch_merged_selected_meta.join(ch_discarded_reads) 
 
-        BWA_MEM(ch_contigs_and_discarded_reads, "03_filtering/03_2_reads_vs_filtered_assembly")
-        ch_bam_with_discarded_reads = BWA_MEM.out.bam
+          BWA_MEM(ch_contigs_and_discarded_reads, "03_filtering/03_2_reads_vs_filtered_assembly")
+          ch_bam_with_discarded_reads = BWA_MEM.out.bam
 
-  } else {
-        ch_discarded_reads = EXTRACT_SINGLE_READS(ch_discarded_bam)
-        ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads) 
-        MINIMAP2(ch_contigs_and_discarded_reads, "03_filtering/03_2_reads_vs_filtered_assembly")
-        ch_bam_with_discarded_reads = MINIMAP2.out.bam
-  
-  }
+    } else {
+          ch_discarded_reads = EXTRACT_SINGLE_READS(ch_discarded_bam)
+          ch_contigs_and_discarded_reads = ch_merged_selected_meta.join(ch_discarded_reads) 
+          MINIMAP2(ch_contigs_and_discarded_reads, "03_filtering/03_2_reads_vs_filtered_assembly")
+          ch_bam_with_discarded_reads = MINIMAP2.out.bam
+    
+    }
 
 
-   // remove bai from ch_bam_with_discarded_reads and add bam of selected contigs
-  ch_bam_to_merge = ch_bam_with_discarded_reads.map{ it -> [it[0], it[1]]}.join(ch_selected_bam)
+    // remove bai from ch_bam_with_discarded_reads and add bam of selected contigs
+    ch_bam_to_merge = ch_bam_with_discarded_reads.map{ it -> [it[0], it[1]]}.join(ch_selected_bam)
 
-  ch_merged_bam = MERGE_BAM_FILES(ch_bam_to_merge)
+    ch_merged_bam = MERGE_BAM_FILES(ch_bam_to_merge)
 
-   ch_final_bam = ch_bam_of_sample_without_discarded_contigs.mix(ch_merged_bam)
+    ch_final_bam = ch_bam_of_sample_without_discarded_contigs.mix(ch_merged_bam)
 
 
-  GET_ALIGNMENT_METRICS(ch_final_bam, '03_filtering/03_2_reads_vs_filtered_assembly/')
+    GET_ALIGNMENT_METRICS(ch_final_bam, '03_filtering/03_2_reads_vs_filtered_assembly/')
 
-  ch_flagstat =  GET_ALIGNMENT_METRICS.out.sam_flagstat
-  ch_coverage =  GET_ALIGNMENT_METRICS.out.sam_coverage
+    ch_flagstat =  GET_ALIGNMENT_METRICS.out.sam_flagstat
+    ch_coverage =  GET_ALIGNMENT_METRICS.out.sam_coverage
 
-  QUAST( ch_merged_selected, "03_filtering/03_1_filtered_assembly/assembly_metric" )
-  ch_quast_report = QUAST.out.report
+    QUAST( ch_merged_selected, "03_filtering/03_1_filtered_assembly/assembly_metric" )
+    ch_quast_report = QUAST.out.report
 
   emit:
     selected_contigs = ch_merged_selected
-- 
GitLab


From e070a4e5f75ae308316821eabe75c5209196d94e Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Wed, 14 Dec 2022 11:28:03 +0100
Subject: [PATCH 19/31] Allows to give co-assembly as input

---
 subworkflows/02_assembly.nf | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index 5bed864..532873a 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -20,6 +20,9 @@ workflow STEP_02_ASSEMBLY {
     ch_samtools_v = Channel.empty()
 
 
+    ch_assembly = assembly 
+    ch_reads = reads
+
     if (!has_assembly & has_flowcell ){ 
 			//////////////////
 			// Manage Flowcell 
@@ -49,9 +52,6 @@ workflow STEP_02_ASSEMBLY {
 				.reads
 				.mix(ch_reads_flowcell.single)
 				.set{ch_reads}
-      
-		} else {
-      ch_reads = reads
     }
 
     if (params.coassembly){ 
@@ -72,17 +72,33 @@ workflow STEP_02_ASSEMBLY {
                 return [meta, fastq ]
               }}
         .set { ch_reads_assembly }
+
+      if (has_assembly){ 
+        ch_assembly = assembly.map { meta, assembly ->
+                                   [ meta.group, meta, assembly] }
+                              .groupTuple(by: [0]) 
+                              .map{ group, metas, assembly ->
+                                    def meta = [:]
+                                    meta.id = metas.group.unique().join()
+                                    meta.sample = metas.sample.join("_")
+                                    meta.flowcell = metas.flowcell.unique().join()
+                                    meta.group =  metas.group.unique().join()
+                                    meta.assembly = metas.assembly.unique().join()
+                                    meta.type = metas.type.unique().join()
+                                    return [meta, assembly[0]] }
+        ch_assembly.view()
+      }
+
     } else if (params.type.toUpperCase() == "SR") {
         ch_reads_assembly = ch_reads
         .map { meta, fastq ->
             return [meta, fastq[0], fastq[1]] }
+
     } else {
       ch_reads_assembly = ch_reads
     }
 
-    if (has_assembly){
-      ch_assembly = assembly   
-    } else {
+    if (!has_assembly){
       if (assembly_tool == 'metaspades') {
           METASPADES(ch_reads_assembly)
           ch_assembly = METASPADES.out.assembly
-- 
GitLab


From 3ebff09ef82e07de6a0e5e797e9143baf4bf5dbe Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Wed, 14 Dec 2022 11:30:55 +0100
Subject: [PATCH 20/31] Remove unused coma

---
 subworkflows/05_protein_alignment.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/subworkflows/05_protein_alignment.nf b/subworkflows/05_protein_alignment.nf
index 56258c4..b178755 100644
--- a/subworkflows/05_protein_alignment.nf
+++ b/subworkflows/05_protein_alignment.nf
@@ -13,7 +13,7 @@ workflow STEP_05_PROTEIN_ALIGNMENT {
         
         DIAMOND (
             prokka_faa,
-            diamond,
+            diamond
         )
 
         ch_diamond_result = DIAMOND.out.m8
-- 
GitLab


From e73afaff781c129b85a2155edb742b25b8d76c81 Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Wed, 14 Dec 2022 14:22:36 +0100
Subject: [PATCH 21/31] Manage coassembly for func annot et taxo affi

---
 modules/feature_counts.nf            |  6 ++----
 subworkflows/06_functionnal_annot.nf | 17 ++++++++++++++++-
 subworkflows/07_taxonomic_affi.nf    | 15 ++++++++++++++-
 3 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/modules/feature_counts.nf b/modules/feature_counts.nf
index 73ec38c..63eb79a 100644
--- a/modules/feature_counts.nf
+++ b/modules/feature_counts.nf
@@ -44,14 +44,12 @@ process QUANTIFICATION_TABLE {
 workflow QUANTIFICATION {
  
   take:
-    ch_gff  // channel: [ val(meta), path(gff) ]
-    ch_bam  // channel: [ val(meta), path(bam), path(bam_index) ]
+    ch_gff_and_bam  // channel: [ val(meta), path(gff), path(bam), path(bam_index) ]
     ch_individual_clstr_table
     ch_global_clstr_table
 
   main:
-    ch_gff_and_bam = ch_gff.join(ch_bam, remainder: false)
-
+  
     FEATURE_COUNTS(ch_gff_and_bam)
     ch_count_table = FEATURE_COUNTS.out.count_table.collect()
     ch_quant_report = FEATURE_COUNTS.out.summary
diff --git a/subworkflows/06_functionnal_annot.nf b/subworkflows/06_functionnal_annot.nf
index 5b86055..ad48129 100644
--- a/subworkflows/06_functionnal_annot.nf
+++ b/subworkflows/06_functionnal_annot.nf
@@ -28,7 +28,22 @@ workflow STEP_06_FUNC_ANNOT {
         ch_global_clstr_table = CD_HIT.out.global_clstr_table
         ch_cdhit_v = CD_HIT.out.v_cdhit
 
-        QUANTIFICATION ( gff, bam, ch_individual_clstr_table, ch_global_clstr_table)
+        if (params.coassembly){
+            bam.map { meta, bam, bai ->
+                        [ meta.group, meta, bam, bai ]}
+                    .set { ch_bam_tmp }
+
+            gff.map { meta, gff ->
+                        [ meta.group, gff]} 
+                    .combine( ch_bam_tmp, by: 0)
+                    .map { group, gff, meta, bam, bai -> 
+                        [ meta, gff, bam, bai ]}
+                    .set { ch_gff_and_bam }
+        } else {
+            ch_gff_and_bam = gff.join(bam, remainder: false)
+        }
+
+        QUANTIFICATION ( ch_gff_and_bam, ch_individual_clstr_table, ch_global_clstr_table)
         ch_quant_table = QUANTIFICATION.out.quantification_table
         ch_quant_report = QUANTIFICATION.out.quant_report
         ch_featurecounts_v = QUANTIFICATION.out.v_featurecounts
diff --git a/subworkflows/07_taxonomic_affi.nf b/subworkflows/07_taxonomic_affi.nf
index 99ead6c..4f1eb9b 100644
--- a/subworkflows/07_taxonomic_affi.nf
+++ b/subworkflows/07_taxonomic_affi.nf
@@ -8,7 +8,20 @@ workflow STEP_07_TAXO_AFFI {
         diamond_result // channel: [ val(meta), path(diamond_file) ]
         sam_coverage // channel: [ val(meta), path(samtools coverage) ]
     main:
-        ch_assign_taxo_input = diamond_result.join(sam_coverage, remainder: true)
+        if (params.coassembly){
+            sam_coverage.map { meta, cov ->
+                             [ meta.group, meta, cov ]}
+                        .set { ch_sam_cov_tmp }
+
+            diamond_result.map { meta, m8 ->
+                               [ meta.group, m8]} 
+                          .combine( ch_sam_cov_tmp, by: 0)
+                          .map { group, m8, meta, cov -> 
+                               [ meta, m8, cov ]}
+                          .set { ch_assign_taxo_input }
+        } else {
+            ch_assign_taxo_input = diamond_result.join(sam_coverage, remainder: true)
+        }
 
         ASSIGN_TAXONOMY ( taxonomy, ch_assign_taxo_input )
 
-- 
GitLab


From f99bf3434080ca66b72d28d8f17686726e5c0c19 Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Wed, 14 Dec 2022 14:30:09 +0100
Subject: [PATCH 22/31] Manage coassembly for the binning

---
 subworkflows/08_binning.nf | 60 +++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf
index 59b9c7b..efa4f9d 100644
--- a/subworkflows/08_binning.nf
+++ b/subworkflows/08_binning.nf
@@ -52,21 +52,21 @@ workflow STEP_08_BINNING {
     if (params.binning_cross_alignment == 'all') {
       // combine assemblies with reads of all samples
       ch_reads_assembly = reads.combine(assembly)
-      .map{ meta_reads, reads, meta_assembly, assembly ->
-        if (meta_reads != meta_assembly){
-          [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads]
-        }
-      }
+                          .map{ meta_reads, reads, meta_assembly, assembly ->
+                            if ((meta_reads.id != meta_assembly.id && !(params.coassembly)) || ((params.coassembly && meta_reads.group == meta_assembly.group))){
+                              [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads]
+                            }
+                          }
     } else if (params.binning_cross_alignment == 'group'){
       // combine assemblies with reads of samples from same group
       ch_assembly_group = assembly.map{ meta, assembly -> [ meta.group, meta, assembly ] }
       ch_reads_assembly = reads.map{ meta, reads -> [ meta.group, meta, reads ] }
-      .combine(ch_assembly_group, by: 0)
-      .map{ group, meta_reads, reads, meta_assembly, assembly ->
-        if (meta_reads != meta_assembly){
-          [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads]
-        }
-      }
+                          .combine(ch_assembly_group, by: 0)
+                          .map{ group, meta_reads, reads, meta_assembly, assembly ->
+                            if (meta_reads != meta_assembly){
+                              [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads]
+                            }
+                          }
     }
     // cross alignment
     if (params.type == 'SR') {
@@ -78,21 +78,39 @@ workflow STEP_08_BINNING {
     } 
     // formatting channel  
     ch_bam = ch_bam_cross_alignment.mix(bam)
-    .map { meta, bam, bai -> [ meta.sample, meta, bam, bai ]  }
-    .groupTuple(by: [0])
-    .map { sample,metas, bam, bai ->
-      [ metas.min { it.id.size() }, bam, bai ]
-    } 
+            .map { meta, bam, bai -> [ meta.sample, meta, bam, bai ]  }
+            .groupTuple(by: [0])
+            .map { sample,metas, bam, bai ->
+              [ metas.min { it.id.size() }, bam, bai ]
+            } 
   } else {
       ch_bam = bam
   }
 
+  if (params.coassembly){
+    ch_bam.map { meta, bam, bai ->
+                    [ meta.group, meta, bam, bai ]}
+          .set { ch_bam_tmp }
+
+    assembly.map { meta, assembly ->
+                  [ meta.group, assembly ]} 
+            .combine( ch_bam_tmp, by: 0)
+            .map { group, assembly, meta, bam, bai -> 
+                  [ meta, assembly, bam, bai]}
+            .tap { ch_assembly_bam }
+            .map { meta, assembly, bam, bai ->
+                  [ meta, assembly ]}
+            .tap { ch_assembly }
+  } else {
+    ch_assembly_bam = assembly.join(ch_bam)
+    ch_assembly = assembly
+  }
   ///////////
   /// BINNING
   ///////////
 
   ch_depth = GENERATE_DEPTH_FILES(ch_bam)
-  ch_assembly_depth = assembly.join(ch_depth)
+  ch_assembly_depth = ch_assembly.join(ch_depth)
 
   METABAT2(ch_assembly_depth)
   ch_metabat_bins = METABAT2.out.bins.filter{ t -> t[1].list().size()}
@@ -102,8 +120,6 @@ workflow STEP_08_BINNING {
   ch_maxbin_bins = MAXBIN2.out.bins.filter{ t -> t[1].list().size()}
   ch_maxbin_v = MAXBIN2.out.v_maxbin
 
-  ch_assembly_bam = assembly.join(ch_bam)
-
   CONCOCT(ch_assembly_bam)
   ch_concoct_bins = CONCOCT.out.bins.filter{ t -> t[1].list().size()}
   ch_concoct_v = CONCOCT.out.v_concoct
@@ -120,7 +136,8 @@ workflow STEP_08_BINNING {
       (it[2] != null && it[3] != null)
       single: true
       }
-
+  
+  //bins set  with bins form at least 2 binners 
   ch_bins_multiple = ch_bins_set.multiple.map{ meta, bins1, bins2, bins3 -> 
     if ( bins1 == null ) {
       return [meta, bins2, bins3, bins1]
@@ -131,6 +148,7 @@ workflow STEP_08_BINNING {
     }
   }
 
+  //bins set  with bins form only 1 binner
   ch_bins_single = ch_bins_set.single.map{ meta, bins1, bins2, bins3 -> 
     if ( bins1 != null ) {
       return [meta, bins1]
@@ -160,7 +178,7 @@ workflow STEP_08_BINNING {
                                             .map { meta, file -> file}
                                             .collect()
   
-  ch_bins_assembly = METAWRAP_REFINMENT.out.bins.join(assembly)
+  ch_bins_assembly = METAWRAP_REFINMENT.out.bins.join(ch_assembly)
 
   UNBINNED_CONTIGS(ch_bins_assembly)
 
-- 
GitLab


From 7fd9ec8d10fea1daf623d00387d7372210baf6c1 Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Tue, 20 Dec 2022 08:49:20 +0100
Subject: [PATCH 23/31] Fix import MERGE_FASTQ for flowcell

---
 main.nf                     | 2 +-
 subworkflows/02_assembly.nf | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 62dc6d4..634a355 100644
--- a/main.nf
+++ b/main.nf
@@ -23,7 +23,7 @@ include { STEP_08_BINNING as S08_BINNING } from './subworkflows/08_binning'
 
 include { GET_SOFTWARE_VERSIONS } from './modules/get_software_versions'
 include { MULTIQC } from './modules/multiqc'
-include { MERGE_FASTQ } from './modules/merge_fastq.nf'
+
 
 
 /*
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index 532873a..01291d9 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -3,6 +3,7 @@ include { QUAST as ASSEMBLY_QUAST} from '../modules/metaquast'
 include { READS_DEDUPLICATION } from '../modules/reads_deduplication'
 include { MINIMAP2 } from '../modules/read_alignment'
 include { GET_ALIGNMENT_METRICS } from '../modules/read_alignment_manipulation'
+include { MERGE_FASTQ } from '../modules/merge_fastq.nf'
 
 workflow STEP_02_ASSEMBLY {
   take: 
-- 
GitLab


From 332a13f167cf215ed217d9444309b51522697068 Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Tue, 20 Dec 2022 16:29:38 +0100
Subject: [PATCH 24/31] Fix channel reads on main.nf

---
 main.nf | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index 634a355..f2455ef 100644
--- a/main.nf
+++ b/main.nf
@@ -244,6 +244,7 @@ workflow {
         def meta = [:] 
         meta.id = item.sample
         if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
+        if (item.group !=null) {meta.id = meta.id+"_"+item.group}
         meta.sample = item.sample
         meta.flowcell = item.flowcell
         meta.group = item.group
@@ -264,6 +265,7 @@ workflow {
       def meta = [:] 
       meta.id = item.sample
       if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
+      if (item.group !=null) {meta.id = meta.id+"_"+item.group}
       meta.sample = item.sample
       meta.flowcell = item.flowcell
       meta.group = item.group
@@ -344,7 +346,7 @@ workflow {
       ch_host_index,
       ch_kaiju_db
     )
-    ch_preprocessed_reads = S01_CLEAN_QC.out.preprocessed_reads
+    ch_reads = S01_CLEAN_QC.out.preprocessed_reads
     ch_cutadapt_report = S01_CLEAN_QC.out.cutadapt_report
     ch_sickle_report = S01_CLEAN_QC.out.sickle_report
 
@@ -355,13 +357,11 @@ workflow {
     ch_kaiju_report = S01_CLEAN_QC.out.kaiju_report
 
     ch_software_versions_S01 = S01_CLEAN_QC.out.software_versions
-  } else {
-    ch_preprocessed_reads = ch_reads
   }
 
   if ( !params.stop_at_clean ) {
 
-    S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool, has_flowcell )
+    S02_ASSEMBLY ( ch_reads, ch_assembly, has_assembly, assembly_tool, has_flowcell )
     ch_assembly = S02_ASSEMBLY.out.assembly
     ch_reads = S02_ASSEMBLY.out.reads
     ch_bam = S02_ASSEMBLY.out.bam
@@ -384,7 +384,7 @@ workflow {
 
     S03_FILTERING (
         ch_assembly,
-        ch_preprocessed_reads,
+        ch_reads,
         ch_idxstats,
         ch_bam,
         ch_min_contigs_cpm,
-- 
GitLab


From eb938df9fdfc57fbd34250d81a74e267dc17d88f Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Tue, 20 Dec 2022 16:36:44 +0100
Subject: [PATCH 25/31] homogenize bam names between SR and HiFi

---
 modules/read_alignment.nf    | 16 ++++++++--------
 subworkflows/03_filtering.nf | 12 ++++++------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/modules/read_alignment.nf b/modules/read_alignment.nf
index 881f6c8..5183c8b 100644
--- a/modules/read_alignment.nf
+++ b/modules/read_alignment.nf
@@ -1,6 +1,6 @@
 process BWA_MEM {
    tag "${meta.id}"
-   publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy', pattern:"*sort*"
+   publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy', pattern:"*bam*"
 
    input:
       tuple val(meta), path(fna), path(reads)
@@ -8,16 +8,16 @@ process BWA_MEM {
 
 
    output:
-      tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam
+      tuple val(meta), path("${meta.id}.bam"), path("${meta.id}.bam.bai"), emit: bam
       path "v_bwa.txt", emit: v_bwa_mem2
       path "v_samtools.txt", emit: v_samtools
 
    script:
       """
       bwa-mem2 index ${fna} -p ${fna}
-      bwa-mem2 mem -t ${task.cpus} ${fna} ${reads[0]} ${reads[1]} | samtools view -@ ${task.cpus} -bS - | samtools sort -@ ${task.cpus} - -o ${meta.id}.sort.bam
+      bwa-mem2 mem -t ${task.cpus} ${fna} ${reads[0]} ${reads[1]} | samtools view -@ ${task.cpus} -bS - | samtools sort -@ ${task.cpus} - -o ${meta.id}.bam
       
-      samtools index -@ ${task.cpus} ${meta.id}.sort.bam
+      samtools index -@ ${task.cpus} ${meta.id}.bam
 
       bwa-mem2 version > v_bwa.txt
       samtools --version &> v_samtools.txt
@@ -27,23 +27,23 @@ process BWA_MEM {
 process MINIMAP2 {
    tag "${meta.id}"
    label 'MINIMAP2'
-   publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy', pattern:"*sort*"
+   publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy', pattern:"*bam*"
 
    input:
       tuple val(meta), path(fna), path(reads)
       val(publishDir_path)
 
    output:
-      tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam
+      tuple val(meta), path("${meta.id}.bam"), path("${meta.id}.bam.bai"), emit: bam
       path "v_minimap2.txt", emit: v_minimap2
       path "v_samtools.txt", emit: v_samtools
 
    script:
       """
       # align reads to contigs, keep only primary aln and sort resulting bam 
-      minimap2 -t ${task.cpus} -ax map-hifi $fna $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${meta.id}.sort.bam  
+      minimap2 -t ${task.cpus} -ax map-hifi $fna $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${meta.id}.bam  
       
-      samtools index ${meta.id}.sort.bam -@ ${task.cpus}
+      samtools index ${meta.id}.bam -@ ${task.cpus}
 
       samtools --version &> v_samtools.txt
       minimap2 --version &> v_minimap2.txt
diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf
index 5fdd04c..c71ea08 100644
--- a/subworkflows/03_filtering.nf
+++ b/subworkflows/03_filtering.nf
@@ -122,12 +122,12 @@ workflow STEP_03_ASSEMBLY_FILTER {
     result_path_dir.mkdirs()
     
     ch_bam_unchanged_by_filtering.map { meta, bam, bai ->
-          {file("${result_path_dir}/${meta.id}/").mkdir()
-            file("${params.outdir}/${unfiltered_assembly_bam_outdir}/${meta.id}/${meta.id}.sort.bam")
-                .mklink("${bam}", overwrite:true)
-            file("${params.outdir}/${unfiltered_assembly_bam_outdir}/${meta.id}/${meta.id}.sort.bam.bai")
-                .mklink("${bai}", overwrite:true)
-          }
+         { file("${result_path_dir}/${meta.id}/").mkdir()
+           file("${params.outdir}/${unfiltered_assembly_bam_outdir}/${meta.id}/${meta.id}.bam")
+                .mklink("${result_path_dir}/${meta.id}/${meta.id}.bam", overwrite:true)
+           file("${params.outdir}/${unfiltered_assembly_bam_outdir}/${meta.id}/${meta.id}.bam.bai")
+                .mklink("${result_path_dir}/${meta.id}/${meta.id}.bam.bai", overwrite:true)
+      }
     }
 
     if ( params.type.toUpperCase() == "SR" ) {
-- 
GitLab


From f132835af2e57f727c8d933819eeb7df4d37afc2 Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Tue, 20 Dec 2022 17:05:51 +0100
Subject: [PATCH 26/31] Add metabat2_seed parameters

---
 docs/usage.md      | 2 ++
 modules/binning.nf | 2 +-
 nextflow.config    | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/usage.md b/docs/usage.md
index ed48bf6..87e6e2b 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -352,6 +352,8 @@ For example check the version of software in the yaml file and the singularity r
 
 * `--gtdbtk_bank`: indicates path to the GTDBTK database. 
 
+* `--metabat2_seed`: Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed))
+
 * `--binning_cross_alignment ["all","group","individual"]`: defines mapping strategy to compute co-abundances for binning. `all` means that each samples will be mapped against every assembly, `group` means that all sample from a group will be mapped against every assembly of the group, `individual` means that each sample will only be mapped against his assembly. Default `individual`
 
 
diff --git a/modules/binning.nf b/modules/binning.nf
index c90306a..f9012d9 100644
--- a/modules/binning.nf
+++ b/modules/binning.nf
@@ -30,7 +30,7 @@ process METABAT2 {
   script:
   """
   mkdir -p metabat2/bins/
-  metabat2 --inFile $fna --abdFile $depth --outFile metabat2/bins/${meta.id}_metabat2 --numThreads ${task.cpus}
+  metabat2 --inFile $fna --abdFile $depth --outFile metabat2/bins/${meta.id}_metabat2 --numThreads ${task.cpus} --seed ${params.metabat2_seed}
   echo \$(metabat2 -h 2>&1) > v_metabat2.txt
   """
 }
diff --git a/nextflow.config b/nextflow.config
index 22d7fa8..b0f4b71 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -59,6 +59,7 @@ params {
     drep_threshold = 0.95
     min_completeness = 50
     max_contamination = 10
+    metabat2_seed = 0
 
     // Ressources.
     kaiju_db_dir = false
-- 
GitLab


From 688d47bc65f36d562a2d198b223d827db4492eef Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Wed, 4 Jan 2023 15:50:47 +0100
Subject: [PATCH 27/31] Fix coassembly on step 03

---
 main.nf                      |  2 --
 subworkflows/03_filtering.nf | 26 ++++++++++----------------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/main.nf b/main.nf
index f2455ef..3b86218 100644
--- a/main.nf
+++ b/main.nf
@@ -244,7 +244,6 @@ workflow {
         def meta = [:] 
         meta.id = item.sample
         if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-        if (item.group !=null) {meta.id = meta.id+"_"+item.group}
         meta.sample = item.sample
         meta.flowcell = item.flowcell
         meta.group = item.group
@@ -265,7 +264,6 @@ workflow {
       def meta = [:] 
       meta.id = item.sample
       if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-      if (item.group !=null) {meta.id = meta.id+"_"+item.group}
       meta.sample = item.sample
       meta.flowcell = item.flowcell
       meta.group = item.group
diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf
index c71ea08..2aa0f79 100644
--- a/subworkflows/03_filtering.nf
+++ b/subworkflows/03_filtering.nf
@@ -88,26 +88,20 @@ workflow STEP_03_ASSEMBLY_FILTER {
 
   
     if (params.coassembly){
-      bam.map { meta, bam, bai ->
-                    [ meta.group, meta, bam, bai ]}
-              .set { ch_bam_tmp }
       discarded_contigs_category.without.map { meta, discarded_empty -> [ meta.group ]} 
-                                        .combine( ch_bam_tmp, by: 0)
+                                        .combine( bam.map { meta, bam, bai ->
+                                                          [ meta.group, meta, bam, bai ]}, by: 0)
                                         .map{ group, meta, bam, bai -> 
-                                              [ meta, bam, bai ]}
+                                            [ meta, bam, bai ]}
                                         .set{ ch_bam_unchanged_by_filtering }
-
                                       
-      reads{ meta, reads ->
-                    [ meta.group, meta, reads ]}
-              .set { ch_reads_tmp }
-
-      ch_selected_contigs_and_reads = discarded_contigs_category.with.map{meta, discarded_contigs -> meta.group}
-                                                                    .join(ch_merged_selected)
-                                                                    .combine(ch_reads_tmp, by: 0)
-                                                                    .map{ group, meta, assembly, reads -> 
-                                                                          [ meta, assembly, reads ]}
-                                                                    .set{ ch_bam_unchanged_by_filtering }
+      ch_selected_contigs_and_reads= discarded_contigs_category.with.map {meta, discarded_contigs -> meta.group}
+                                                                    .join( ch_merged_selected.map { meta, contigs -> 
+                                                                                                  [meta.group, meta, contigs]})
+                                                                    .combine( reads.map{ meta, reads ->
+                                                                                       [ meta.group, meta, reads ]}, by: 0)
+                                                                    .map{ group, meta_contigs, contigs, meta_reads, reads -> 
+                                                                        [ meta_reads, contigs, reads ]}
 
     } else {
       ch_bam_unchanged_by_filtering =  discarded_contigs_category.without.map{ it -> it[0]}
-- 
GitLab


From 83286cb378f5720df276d867b0689612eb5abf78 Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Thu, 5 Jan 2023 16:31:26 +0100
Subject: [PATCH 28/31] Fix binning and cross alignment for coassembly

---
 modules/binning.nf         |  2 +-
 subworkflows/08_binning.nf | 28 ++++++++++++++++++++--------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/modules/binning.nf b/modules/binning.nf
index f9012d9..431fb01 100644
--- a/modules/binning.nf
+++ b/modules/binning.nf
@@ -130,7 +130,7 @@ process METAWRAP_REFINMENT {
     path "v_metawrap.txt", emit: v_metawrap
 
 	script:
-  bins_flag = (bins3 != null) ? "-A $bins1 -B $bins2 -C $bins3" : "-A $bins2 -B $bins2 "
+  bins_flag = (bins3 != null) ? "-A $bins1 -B $bins2 -C $bins3" : "-A $bins1 -B $bins2 "
   """
 
   echo "metawrap 1.3_modified" > v_metawrap.txt
diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf
index efa4f9d..32b9e32 100644
--- a/subworkflows/08_binning.nf
+++ b/subworkflows/08_binning.nf
@@ -53,7 +53,7 @@ workflow STEP_08_BINNING {
       // combine assemblies with reads of all samples
       ch_reads_assembly = reads.combine(assembly)
                           .map{ meta_reads, reads, meta_assembly, assembly ->
-                            if ((meta_reads.id != meta_assembly.id && !(params.coassembly)) || ((params.coassembly && meta_reads.group == meta_assembly.group))){
+                            if (((meta_reads.id != meta_assembly.id) && !(params.coassembly)) || (params.coassembly && (meta_reads.group != meta_assembly.group))){
                               [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads]
                             }
                           }
@@ -78,7 +78,9 @@ workflow STEP_08_BINNING {
     } 
     // formatting channel  
     ch_bam = ch_bam_cross_alignment.mix(bam)
-            .map { meta, bam, bai -> [ meta.sample, meta, bam, bai ]  }
+            .map { meta, bam, bai -> 
+                  if (params.coassembly){[ meta.group, meta, bam, bai ]} 
+                  else  {[ meta.sample, meta, bam, bai ]}}
             .groupTuple(by: [0])
             .map { sample,metas, bam, bai ->
               [ metas.min { it.id.size() }, bam, bai ]
@@ -88,19 +90,29 @@ workflow STEP_08_BINNING {
   }
 
   if (params.coassembly){
-    ch_bam.map { meta, bam, bai ->
-                    [ meta.group, meta, bam, bai ]}
-          .set { ch_bam_tmp }
-
+    if (params.binning_cross_alignment == 'all') {
+      ch_bam.map { meta, bam, bai -> [ meta.group, bam, bai ]  }
+            .set { ch_bam_tmp }
+    } else {
+      ch_bam.map { meta, bam, bai -> [ meta.group, meta, bam, bai ]  }
+            .groupTuple(by: [0])
+            .map { group, metas, bam, bai ->
+                [ group, bam, bai ]} 
+            .set { ch_bam_tmp }
+    }
     assembly.map { meta, assembly ->
-                  [ meta.group, assembly ]} 
+                  [ meta.group, meta, assembly ]} 
             .combine( ch_bam_tmp, by: 0)
-            .map { group, assembly, meta, bam, bai -> 
+            .map { group, meta, assembly, bam, bai -> 
                   [ meta, assembly, bam, bai]}
             .tap { ch_assembly_bam }
             .map { meta, assembly, bam, bai ->
                   [ meta, assembly ]}
             .tap { ch_assembly }
+
+    ch_assembly_bam.map{  meta, assembly, bam, bai ->
+                       [ meta, bam, bai ]}
+                   .set{ ch_bam}
   } else {
     ch_assembly_bam = assembly.join(ch_bam)
     ch_assembly = assembly
-- 
GitLab


From bae314156695d687339e122040c1599affde5204 Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Thu, 5 Jan 2023 16:36:13 +0100
Subject: [PATCH 29/31] Remove channel view

---
 subworkflows/02_assembly.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index 01291d9..46a0a6e 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -87,7 +87,6 @@ workflow STEP_02_ASSEMBLY {
                                     meta.assembly = metas.assembly.unique().join()
                                     meta.type = metas.type.unique().join()
                                     return [meta, assembly[0]] }
-        ch_assembly.view()
       }
 
     } else if (params.type.toUpperCase() == "SR") {
-- 
GitLab


From b9436266d1af0da8502e2d6d0c9ba010966e8a46 Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Thu, 5 Jan 2023 17:16:45 +0100
Subject: [PATCH 30/31] Doc + test param coassembly

---
 docs/usage.md | 3 ++-
 main.nf       | 9 +++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index 87e6e2b..a0907e2 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -118,7 +118,7 @@ For HiFi the fastq2 column is not needed
    > name_sample2,number_flowcell,path_sample2_R1.fastq.gz,path_sample2_R2.fastq.gz
    > ```
 
-   * **If you want to perfom cross alignment for binning on groups** :  
+   * **If you want to perfom coassembly or cross alignment for binning on groups** :  
    > ```
    > sample,group,fastq_1,fastq_2
    > name_sample1,group_name,path_sample1_flowcell1_R1.fastq.gz,path_sample1_flowcell1_R2.fastq.gz
@@ -276,6 +276,7 @@ No parameter available for this substep.
 * `--assembly` allows to indicate the assembly tool. For short reads: `["metaspades" or "megahit"]`:  Default: `metaspades`.
                                                      For HiFi reads:  `["hifiasm-meta", "metaflye"]`. Default: `hifiasm-meta`.
 
+* `--coassembly` allows to assemble together the samples labeled with the same group in the samplesheet. It will generate one assembly for each group. To co-assemble all of your samples together, you must indicate a unique group for each sample in the samplesheet.  **WARNING** With the coassembly, you can't use `--binning_cross_alignment 'group'`  because one binning will be generate for each group co-assembled and automatically mapping with every sample of his group but `--binning_cross_alignment 'all'` can be use to cross align every sample with every group. 
 
 
 **WARNING 4:** For short reads, the user can choose between `metaspades` or `megahit` for `--assembly` parameter. The choice can be based on CPUs and memory availability: `metaspades` needs more CPUs and memory than `megahit` but our tests showed that assembly metrics are better for `metaspades` than `megahit`.For PacBio HiFi reads, the user can choose between `hifiasm-meta` or `metaflye`. 
diff --git a/main.nf b/main.nf
index 3b86218..6b2e1da 100644
--- a/main.nf
+++ b/main.nf
@@ -65,6 +65,7 @@ include { MULTIQC } from './modules/multiqc'
        --stop_at_assembly            Stop the pipeline at this step
        --assembly                    Indicate the assembly tool for short reads ["metaspades" or "megahit" ]. Default: "metaspades".
                                                              or for HiFi reads ["hifiasm-meta", "metaflye"]. Default: "hifiasm-meta".
+       --coassembly                  Assemble together samples labeled with the same group in the samplesheet 
               
      S03_FILTERING options:
        --stop_at_filtering           Stop the pipeline at this step
@@ -213,6 +214,10 @@ workflow {
       exit 1, "You must specify --skip_binning or specify a GTDB-TK bank with --gtdbtk_bank"
   }
 
+  if ( params.coassembly && params.binning_cross_alignment == 'group'){
+    exit 1, "--binning_cross_alignment group must not be use use --coassembly."
+  }
+
 
 
   ////////////
@@ -228,8 +233,8 @@ workflow {
            if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) {
               exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint."
            }
-           if (params.binning_cross_alignment == 'group' && row.group == null){
-            exit 1, "You must specify groups in the samplesheet if you want to use --binning_cross_alignment 'group'"
+           if ((params.binning_cross_alignment == 'group' || params.coassembly) && row.group == null){
+            exit 1, "You must specify groups in the samplesheet if you want to use --binning_cross_alignment 'group' or --coassembly"
            }
            ["sample":row.sample,
             "flowcell":row.flowcell,
-- 
GitLab


From dd9d1a14c9e2a79715fde81a393dda6403347ed8 Mon Sep 17 00:00:00 2001
From: "maina.vienne" <maina.vienne@inrae.fr>
Date: Tue, 10 Jan 2023 16:09:34 +0100
Subject: [PATCH 31/31] Fix cd_hit_produce_table_clstr.py input step06

---
 conf/functional_test.config | 2 +-
 modules/cd_hit.nf           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/functional_test.config b/conf/functional_test.config
index b1fa154..6268237 100644
--- a/conf/functional_test.config
+++ b/conf/functional_test.config
@@ -5,7 +5,7 @@ singularity.enabled = true
 singularity.autoMounts = true
 
 process {
-    container = "/work/project/plateforme/metaG/functional_test/singularity_img/metagwgs.sif"
+    container = "/work/project/plateforme/metaG/functional_test/singularity_img/dev_v2.4/metagwgs.sif"
     withLabel: BINNING {
         container = "/work/project/plateforme/metaG/functional_test/singularity_img/binning.sif"
     }
diff --git a/modules/cd_hit.nf b/modules/cd_hit.nf
index 0a1b1f5..b341142 100644
--- a/modules/cd_hit.nf
+++ b/modules/cd_hit.nf
@@ -15,7 +15,7 @@ process INDIVIDUAL_CD_HIT {
   script:
   """
   cd-hit-est -c ${pct_id} -i ${ffn} -o ${meta.id}.cd-hit-est.${pct_id}.fasta -T ${task.cpus} -M ${task.mem} -d 150
-  cat ${meta.id}.cd-hit-est.${pct_id}.fasta.clstr | cd_hit_produce_table_clstr.py > ${meta.id}.cd-hit-est.${pct_id}.table_cluster_contigs.txt
+  cd_hit_produce_table_clstr.py -i ${meta.id}.cd-hit-est.${pct_id}.fasta.clstr -o ${meta.id}.cd-hit-est.${pct_id}.table_cluster_contigs.txt
   echo \$(cd-hit -h 2>&1) > v_cdhit.txt
   """
 }
-- 
GitLab