From f8a0215c67754089904b6b7d4a0ab0e11fa011ee Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Fri, 28 Oct 2022 13:24:11 +0200
Subject: [PATCH 01/31] Reduce main.nf by moving "manage flowcell" into the
 subworkflow 02_assembly

---
 main.nf                     | 45 +--------------------------
 subworkflows/02_assembly.nf | 50 +++++++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 52 deletions(-)

diff --git a/main.nf b/main.nf
index cdd2d19..0d26a32 100644
--- a/main.nf
+++ b/main.nf
@@ -244,7 +244,6 @@ workflow {
             def meta = [:]
             meta.id = item.sample
             if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-            if (item.group !=null) {meta.id = meta.id+"_"+item.group}
             meta.sample = item.sample
             meta.flowcell = item.flowcell
             meta.group = item.group
@@ -265,7 +264,6 @@ workflow {
             def meta = [:]
             meta.id = item.sample
             if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-            if (item.group !=null) {meta.id = meta.id+"_"+item.group}
             meta.sample = item.sample
             meta.flowcell = item.flowcell
             meta.group = item.group
@@ -362,49 +360,8 @@ workflow {
     }
 
     if ( !params.stop_at_clean ) {
-        if (!has_assembly & has_flowcell ){
-            //////////////////
-            // Manage Flowcell
-            //////////////////
-            ch_reads_tmp = ch_preprocessed_reads
-                .map {
-                    meta, fastq ->
-                    [ meta.sample, meta, fastq ]
-                }
-                .groupTuple(by: [0])
-                .branch {
-                    id, meta, fastq ->
-                    single : fastq.size() == 1
-                        return [[id:meta.sample.unique().join(),
-                                sample:meta.sample.unique().join(),
-                                flowcell:meta.flowcell.join("_"),
-                                group:meta.group.unique().join(),
-                                assembly:meta.assembly.unique().join(),
-                                type:meta.type.unique().join()], fastq.flatten() ]
-                    multiple: fastq.size() > 1
-                        return [[id:meta.sample.unique().join(),
-                                sample:meta.sample.unique().join(),
-                                flowcell:meta.flowcell.join("_"),
-                                group:meta.group.unique().join(),
-                                assembly:meta.assembly.unique().join(),
-                                type:meta.type.unique().join()], fastq.flatten() ]
-                }
-
-
-        MERGE_FASTQ (
-            ch_reads_tmp.multiple
-        )
-        .reads
-        .mix(ch_reads_tmp.single)
-        .set{ch_preprocessed_reads}
-        }
-
-        /////////////////////
-        //End manage Flowcell
-        /////////////////////
-
-        S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool )
+        S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool, has_flowcell )
         ch_assembly = S02_ASSEMBLY.out.assembly
         ch_reads = S02_ASSEMBLY.out.reads
         ch_bam = S02_ASSEMBLY.out.bam
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index fe90d44..d360c9f 100644
--- a/subworkflows/02_assembly.nf
+++ b/subworkflows/02_assembly.nf
@@ -6,10 +6,11 @@ include { GET_ALIGNMENT_METRICS } from '../modules/read_alignment_manipulation'
 
 workflow STEP_02_ASSEMBLY {
     take:
-      reads
-      assembly
-      has_assembly
-      assembly_tool
+        reads
+        assembly
+        has_assembly
+        assembly_tool
+        has_flowcell
 
     main:
     ch_assembler_v = Channel.empty()
@@ -17,11 +18,45 @@ workflow STEP_02_ASSEMBLY {
     ch_bwa_mem_v = Channel.empty()
     ch_minimap2_v = Channel.empty()
     ch_samtools_v = Channel.empty()
-    
+
+
+    if (!has_assembly & has_flowcell ){
+        //////////////////
+        // Manage Flowcell
+        //////////////////
+        ch_reads_flowcell = reads
+            .map { meta, fastq ->
+                [ meta.sample, meta, fastq ] }
+            .groupTuple(by: [0])
+            .branch { id, meta, fastq ->
+                single : fastq.size() == 1
+                    return [[id:meta.sample.unique().join(),
+                            sample:meta.sample.unique().join(),
+                            flowcell:meta.flowcell.join("_"),
+                            group:meta.group.unique().join(),
+                            assembly:meta.assembly.unique().join(),
+                            type:meta.type.unique().join()], fastq.flatten() ]
+                multiple: 
fastq.size() > 1 + return [[id:meta.sample.unique().join(), + sample:meta.sample.unique().join(), + flowcell:meta.flowcell.join("_"), + group:meta.group.unique().join(), + assembly:meta.assembly.unique().join(), + type:meta.type.unique().join()], fastq.flatten() ] + } + + MERGE_FASTQ (ch_reads_flowcell.multiple) + .reads + .mix(ch_reads_flowcell.single) + .set{ch_reads} + + } else { + ch_reads = reads + } + if (has_assembly){ ch_assembly = assembly - } - else { + } else { if(assembly_tool == 'metaspades') { METASPADES(reads) ch_assembly = METASPADES.out.assembly @@ -58,7 +93,6 @@ workflow STEP_02_ASSEMBLY { ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true) if ( params.type.toUpperCase() == "SR" ) { - READS_DEDUPLICATION ( ch_contigs_and_reads ) ch_reads = READS_DEDUPLICATION.out.dedup_reads -- GitLab From fac60418083694140540653d03dab61540160457 Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Tue, 8 Nov 2022 08:49:11 +0100 Subject: [PATCH 02/31] manage coassembly for step 2 Adds coassembly parameter Adapts assembly modules to take multiple fastq --- modules/assembly.nf | 26 ++++++++--- nextflow.config | 1 + subworkflows/02_assembly.nf | 90 ++++++++++++++++++++++++++----------- 3 files changed, 85 insertions(+), 32 deletions(-) diff --git a/modules/assembly.nf b/modules/assembly.nf index 9155270..969a918 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -4,7 +4,7 @@ process METASPADES { label 'ASSEMBLY_SR' input: - tuple val(meta), path(reads) + tuple val(meta), path(read1), path(read2) output: tuple val(meta), path("metaspades/${meta.id}.contigs.fa"), emit: assembly @@ -15,7 +15,21 @@ process METASPADES { (_,mem,unit) = (task.memory =~ /(\d+) ([A-Z]B)/)[0] if ( unit =~ /GB/ ) { """ - metaspades.py -t ${task.cpus} -m $mem -1 ${reads[0]} -2 ${reads[1]} -o metaspades + echo " +[ + { + orientation: \\"fr\\", + type: \\"paired-end\\", + right reads: [ + \\"${read1.join('\\",\n \\"')}\\" + ], + left reads: [ + \\"${read2.join('\\",\n \\"')}\\" + ] + } +]" > input.yaml + + metaspades.py -t ${task.cpus} -m $mem --dataset input.yaml -o metaspades mv metaspades/scaffolds.fasta metaspades/${meta.id}.contigs.fa mv metaspades/spades.log metaspades/${meta.id}.log mv metaspades/params.txt metaspades/${meta.id}.params.txt @@ -34,7 +48,7 @@ process MEGAHIT { label 'ASSEMBLY_SR' input: - tuple val(meta), path(reads) + tuple val(meta), path(read1), path(read2) output: tuple val(meta), path("megahit/${meta.id}.contigs.fa"), emit: assembly @@ -43,7 +57,7 @@ process MEGAHIT { script: """ - megahit -t ${task.cpus} -1 ${reads[0]} -2 ${reads[1]} -o megahit --out-prefix "${meta.id}" + megahit -t ${task.cpus} -1 ${read1.join(',')} -2 ${read2.join(',')} -o megahit --out-prefix "${meta.id}" mv megahit/options.json megahit/${meta.id}.params.txt rm -r megahit/intermediate_contigs @@ -69,7 +83,7 @@ process HIFIASM_META { """ mkdir hifiasm-meta - hifiasm_meta -t ${task.cpus} -o ${meta.id} $reads 2> hifiasm-meta/${meta.id}.log + hifiasm_meta -t ${task.cpus} -o ${meta.id} ${reads.join(' ')} 2> hifiasm-meta/${meta.id}.log # gfa to fasta format awk '/^S/{print ">"\$2"\\n"\$3}' ${meta.id}.p_ctg.gfa | fold > hifiasm-meta/${meta.id}.contigs.fa @@ -98,7 +112,7 @@ process METAFLYE { """ mkdir metaflye - flye --pacbio-hifi $reads -o 'metaflye' --meta -t ${task.cpus} 2> metaflye/${meta.id}.log + flye --pacbio-hifi ${reads.join(' ')} -o 'metaflye' --meta -t ${task.cpus} 2> metaflye/${meta.id}.log mv metaflye/assembly.fasta metaflye/${meta.id}.contigs.fa mv 
metaflye/params.json metaflye/${meta.id}.params.json diff --git a/nextflow.config b/nextflow.config index 9e86698..296ceac 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,6 +21,7 @@ params { gtdbtk_bank = "" percentage_identity = 0.95 type = "" + coassembly = false // Stop after step or skip optional step/sub-step. diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf index d360c9f..3797c8f 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -54,34 +54,58 @@ workflow STEP_02_ASSEMBLY { ch_reads = reads } + if (params.coassembly){ + ch_reads.map { meta, fastq -> + [ meta.group, meta, fastq] } + .groupTuple(by: [0]) + .map { group, metas, fastq -> + def meta = [:] + meta.id = metas.group.unique().join() + meta.sample = metas.sample.join("_") + meta.flowcell = metas.flowcell.unique().join() + meta.group = metas.group.unique().join() + meta.assembly = metas.assembly.unique().join() + meta.type = metas.type.unique().join() + if (params.type.toUpperCase() == "SR") { + return [meta, fastq.collect { it[0] }, fastq.collect { it[1] }] + } else { + return [meta, fastq ] + }} + .set { ch_reads_assembly } + } else if (params.type.toUpperCase() == "SR") { + ch_reads_assembly = ch_reads + .map { meta, fastq -> + return [meta, fastq[0], fastq[1]] } + } else { + ch_reads_assembly = ch_reads + } + + ch_reads_assembly.view() + if (has_assembly){ ch_assembly = assembly } else { - if(assembly_tool == 'metaspades') { - METASPADES(reads) - ch_assembly = METASPADES.out.assembly - ch_assembler_v = METASPADES.out.v_metaspades - } - else if(assembly_tool == 'megahit') { - MEGAHIT(reads) - ch_assembly = MEGAHIT.out.assembly - ch_assembler_v = MEGAHIT.out.v_megahit - } - else if(assembly_tool == 'hifiasm-meta') { - HIFIASM_META(reads) - ch_assembly = HIFIASM_META.out.assembly - ch_assembler_v = HIFIASM_META.out.v_hifiasm_meta - } - else if(assembly_tool == 'metaflye') { - METAFLYE(reads) + if (assembly_tool == 'metaspades') { + METASPADES(ch_reads_assembly) + ch_assembly = METASPADES.out.assembly + ch_assembler_v = METASPADES.out.v_metaspades + } else if (assembly_tool == 'megahit') { + MEGAHIT(ch_reads_assembly) + ch_assembly = MEGAHIT.out.assembly + ch_assembler_v = MEGAHIT.out.v_megahit + } else if (assembly_tool == 'hifiasm-meta') { + HIFIASM_META(ch_reads_assembly) + ch_assembly = HIFIASM_META.out.assembly + ch_assembler_v = HIFIASM_META.out.v_hifiasm_meta + } else if (assembly_tool == 'metaflye') { + METAFLYE(ch_reads_assembly) ch_assembly = METAFLYE.out.assembly ch_assembler_v = METAFLYE.out.v_metaflye - } - else { + } else { exit 1, "Invalid assembly parameter: ${assembly_tool}" - } + } } - + ch_assembly_renamed = RENAME_CONTIGS(ch_assembly) ASSEMBLY_QUAST( ch_assembly_renamed,"02_assembly/quast_primary") ch_assembly_report = ASSEMBLY_QUAST.out.report @@ -89,10 +113,24 @@ workflow STEP_02_ASSEMBLY { ch_idxstats = Channel.empty() ch_flagstat = Channel.empty() - ch_reads = reads + + if (params.coassembly){ + ch_reads.map { meta, fastq -> + [ meta.group, meta, fastq ]} + .set { ch_reads_tmp } + + ch_assembly_renamed.map { meta, assembly -> + [ meta.group, assembly ]} + .combine(ch_reads_tmp, by: 0) + .map { group, assembly, meta, fastq -> + [ meta, assembly,fastq ]} + .set { ch_contigs_and_reads } + } else { + ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true) + } - ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true) if ( params.type.toUpperCase() == "SR" ) { + READS_DEDUPLICATION ( ch_contigs_and_reads ) ch_reads 
= READS_DEDUPLICATION.out.dedup_reads
@@ -101,10 +139,10 @@ workflow STEP_02_ASSEMBLY {
 
         ch_bwa_mem_v = READS_DEDUPLICATION.out.v_bwa_mem2
     } else {
-      MINIMAP2(ch_contigs_and_reads,"02_assembly")
-      ch_bam = MINIMAP2.out.bam
+        MINIMAP2(ch_contigs_and_reads,"02_assembly")
+        ch_bam = MINIMAP2.out.bam
 
-      ch_minimap2_v = MINIMAP2.out.v_minimap2
+        ch_minimap2_v = MINIMAP2.out.v_minimap2
 
     }
--
GitLab

From 57b243a714e38e2da05bc8286a67906b9b9b100a Mon Sep 17 00:00:00 2001
From: Jean Mainguy <jean.mainguy@inra.fr>
Date: Wed, 2 Nov 2022 14:01:24 +0100
Subject: [PATCH 03/31] add gene line to gff file

---
 bin/merge_annotations.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/bin/merge_annotations.py b/bin/merge_annotations.py
index d18ebd0..afa038a 100755
--- a/bin/merge_annotations.py
+++ b/bin/merge_annotations.py
@@ -139,10 +139,10 @@ def merging_cds_and_rna(cds_per_contig, contig2rnas):
             logging.info(f'{contig} has only rna annotation')
             yield contig, rnas
 
-def add_new_ID_tag(gff_attributes, new_id):
+def add_new_ID_tag_and_Parent(gff_attributes, new_id, parent_id):
 
-    gff_attributes_no_id = ';'.join([attr for attr in gff_attributes.split(';') if attr.split('=')[0] != 'ID'])
-    return f"ID={new_id};{gff_attributes_no_id}"
+    gff_attributes_no_id = ';'.join([attr for attr in gff_attributes.split(';') if attr.split('=')[0] not in ['ID', 'Parent']])
+    return f"ID={new_id};Parent={parent_id};{gff_attributes_no_id}"
 
 
 def get_tag_value(gff_attribute, tag):
@@ -168,6 +168,7 @@ def writing_features_to_gff_ffn_faa(annotations_per_contig, out_gff, fna_file, o
         for feature in sorted(features, key=lambda x: int(x['start'])):
             gene_count[feature['feature']] += 1
             new_id = f"{feature['seqname']}.{feature['feature']}_{gene_count[feature['feature']]}"
+            parent_id = f"{new_id}_gene"
 
             start, end = int(feature['start']), int(feature['end'])
 
@@ -183,11 +184,20 @@ def writing_features_to_gff_ffn_faa(annotations_per_contig, out_gff, fna_file, o
 
             # Write gff and ffn
-            feature['attribute'] = add_new_ID_tag(feature['attribute'], new_id)
+            feature['attribute'] = add_new_ID_tag_and_Parent(feature['attribute'], new_id, parent_id)
+
+            # gff line with CDS/tRNA or rRNA in feature column
+            feature_gff_line = '\t'.join(feature.values())
+
+            # gff line with gene in feature column
+            feature['feature'] = 'gene'
+            feature['attribute'] = f'ID={parent_id};locus_tag={new_id}'
+            gene_gff_line = '\t'.join(feature.values())
+
+            fh_gff.write(f'{gene_gff_line}\n{feature_gff_line}\n')
 
             tag_name = get_tag_value(feature['attribute'], 'Name')
-            fh_gff.write('\t'.join(feature.values())+"\n")
-
+
             if feature['strand'] == "+":
                 feature_seq = contig_fa[contig][start-1:end].seq
             else:
--
GitLab

From 1a968cef04f19c7cb4405a6181b8acf3aba0a06e Mon Sep 17 00:00:00 2001
From: Jean Mainguy <jean.mainguy@inra.fr>
Date: Wed, 2 Nov 2022 14:17:15 +0100
Subject: [PATCH 04/31] add old name as description of renamed sequences to
 keep it

---
 bin/rename_contigs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/rename_contigs.py b/bin/rename_contigs.py
index 0734c9c..264d9cf 100755
--- a/bin/rename_contigs.py
+++ b/bin/rename_contigs.py
@@ -64,8 +64,8 @@ def main():
     logging.info(f'Writting renamed fasta file in {out_fna}')
 
     with open(out_fna, "w") as fh_fna:
-        for i, (_, seq) in enumerate(pyfastx.Fasta(fna_file, build_index=False)):
-            fh_fna.write(f'>{sample}_c{i+1}\n{seq}\n')
+        for i, (name, seq) in enumerate(pyfastx.Fasta(fna_file, build_index=False)):
+            fh_fna.write(f'>{sample}_c{i+1} {name}\n{seq}\n')
 
 
 if __name__ == 
'__main__': main() \ No newline at end of file -- GitLab From 87061f3b3d32aabaad2956f0c425400cd2f79be2 Mon Sep 17 00:00:00 2001 From: Jean Mainguy <jean.mainguy@inra.fr> Date: Wed, 2 Nov 2022 14:27:34 +0100 Subject: [PATCH 05/31] add flag skip_alignment to skip step 05 --- main.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 0d26a32..ad9a9cc 100644 --- a/main.nf +++ b/main.nf @@ -77,6 +77,7 @@ include { MERGE_FASTQ } from './modules/merge_fastq.nf' S05_ALIGNMENT options: --diamond_bank Path to diamond bank used to align protein sequence of genes: "PATH/bank.dmnd". This bank must be previously built with diamond makedb. + --skip_alignment Skip this step S06_FUNC_ANNOT options: --skip_func_annot Skip this step @@ -412,7 +413,7 @@ workflow { ch_diamond_result = Channel.empty() - if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot ) { + if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_alignment) { S05_ALIGNMENT (ch_annotation_faa, ch_diamon_db) ch_diamond_result = S05_ALIGNMENT.out.m8 @@ -422,14 +423,14 @@ workflow { ch_quant_report = Channel.empty() ch_v_eggnogmapper = Channel.empty() - if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot ) { + if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot && !params.skip_alignment) { S06_FUNC_ANNOT ( ch_annotation_ffn, ch_annotation_faa, ch_annotation_gff, ch_bam, ch_diamond_result, ch_eggnog_db ) ch_quant_report = S06_FUNC_ANNOT.out.quant_report ch_software_versions_S06 = S06_FUNC_ANNOT.out.software_versions } - if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi ) { + if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi && !params.skip_alignment) { S07_TAXO_AFFI ( ch_taxonomy, ch_diamond_result, ch_sam_coverage) ch_software_versions_S07 = S07_TAXO_AFFI.out.software_versions -- GitLab From b8235b46c92729dc94ce7d752f7f3f3cbaeda4b5 Mon Sep 17 00:00:00 2001 From: Jean Mainguy <jean.mainguy@inra.fr> Date: Fri, 4 Nov 2022 15:58:10 +0100 Subject: [PATCH 06/31] Improve fasta parsing --- bin/Bins_per_sample_summarize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/Bins_per_sample_summarize.py b/bin/Bins_per_sample_summarize.py index fce7b72..89efa2b 100755 --- a/bin/Bins_per_sample_summarize.py +++ b/bin/Bins_per_sample_summarize.py @@ -45,13 +45,14 @@ def bins_contigs_compositions(folder): contigs_to_bins = dict() list_bins = list() for fi in os.listdir(folder): - with open(folder + "/" + fi,'r') as bin_file: + bin_fasta_file = os.path.join(folder, fi) + with open(bin_fasta_file) as bin_file: bin_name = fi.rsplit('.', maxsplit=1)[0] if bin_name not in list_bins: list_bins.append(bin_name) for line in bin_file: if line.startswith('>'): - contig_name = line.strip().lstrip('>') + contig_name = line.split()[0].strip().lstrip('>') contigs_to_bins[contig_name] = bin_name return contigs_to_bins, list_bins -- GitLab From 2859eb13e91f75f2de2b6364d93cd505206c0ec8 Mon Sep 17 00:00:00 2001 From: Jean Mainguy <jean.mainguy@inra.fr> Date: Fri, 4 Nov 2022 15:58:41 +0100 
Subject: [PATCH 07/31] add meta to quast report because it is needed in binning step --- main.nf | 9 +++++++-- modules/metaquast.nf | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index ad9a9cc..9d29306 100644 --- a/main.nf +++ b/main.nf @@ -371,7 +371,9 @@ workflow { ch_idxstats = S02_ASSEMBLY.out.idxstats ch_unfilter_assembly_flagstat_report = S02_ASSEMBLY.out.flagstat - ch_quast_before_filter_report = S02_ASSEMBLY.out.assembly_report + ch_quast = S02_ASSEMBLY.out.assembly_report + ch_quast_before_filter_report = ch_quast.map{ meta, quast_report -> quast_report } + ch_software_versions_S02 = S02_ASSEMBLY.out.software_versions } @@ -392,7 +394,10 @@ workflow { ch_bam = S03_FILTERING.out.bam ch_sam_coverage = S03_FILTERING.out.sam_coverage - ch_quast_after_filter_report = S03_FILTERING.out.quast_report + + ch_quast = S03_FILTERING.out.quast_report + ch_quast_after_filter_report = ch_quast.map{ meta, quast_report -> quast_report } + ch_final_assembly_flagstat_report = S03_FILTERING.out.sam_flagstat } diff --git a/modules/metaquast.nf b/modules/metaquast.nf index dee90d5..dbaa508 100644 --- a/modules/metaquast.nf +++ b/modules/metaquast.nf @@ -9,7 +9,7 @@ process QUAST { output: path "${outdir}/${meta.id}/*", emit: all - path "${outdir}/${meta.id}/report.tsv", emit: report + tuple val(meta), path ("${outdir}/${meta.id}/report.tsv"), emit: report path "v_quast.txt", emit: v_quast script: -- GitLab From b76791e04411360a674d1f919632c4ac226cec16 Mon Sep 17 00:00:00 2001 From: Jean Mainguy <jean.mainguy@inra.fr> Date: Mon, 7 Nov 2022 13:02:04 +0100 Subject: [PATCH 08/31] add skip_alignment in config --- nextflow.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nextflow.config b/nextflow.config index 296ceac..22d7fa8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -44,6 +44,9 @@ params { // Step stop_at_structural_annot = false + // Optional step + skip_alignment = false + // Optional step skip_func_annot = false -- GitLab From bd52ace89dbd324ec04d8fd46f8a534d6f761f57 Mon Sep 17 00:00:00 2001 From: Jean Mainguy <jean.mainguy@inra.fr> Date: Mon, 7 Nov 2022 15:45:40 +0100 Subject: [PATCH 09/31] remove skip_alignment flag --- main.nf | 9 ++++----- subworkflows/05_alignment.nf | 21 +++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/main.nf b/main.nf index 9d29306..a6bdc5b 100644 --- a/main.nf +++ b/main.nf @@ -77,7 +77,6 @@ include { MERGE_FASTQ } from './modules/merge_fastq.nf' S05_ALIGNMENT options: --diamond_bank Path to diamond bank used to align protein sequence of genes: "PATH/bank.dmnd". This bank must be previously built with diamond makedb. 
- --skip_alignment Skip this step S06_FUNC_ANNOT options: --skip_func_annot Skip this step @@ -418,24 +417,24 @@ workflow { ch_diamond_result = Channel.empty() - if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_alignment) { + if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && (!params.skip_func_annot || !params.skip_taxo_affi)) { S05_ALIGNMENT (ch_annotation_faa, ch_diamon_db) - ch_diamond_result = S05_ALIGNMENT.out.m8 + ch_diamond_result = S05_ALIGNMENT.out.diamond_result ch_software_versions_S05 = S05_ALIGNMENT.out.software_versions } ch_quant_report = Channel.empty() ch_v_eggnogmapper = Channel.empty() - if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot && !params.skip_alignment) { + if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_func_annot) { S06_FUNC_ANNOT ( ch_annotation_ffn, ch_annotation_faa, ch_annotation_gff, ch_bam, ch_diamond_result, ch_eggnog_db ) ch_quant_report = S06_FUNC_ANNOT.out.quant_report ch_software_versions_S06 = S06_FUNC_ANNOT.out.software_versions } - if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi && !params.skip_alignment) { + if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot && !params.skip_taxo_affi) { S07_TAXO_AFFI ( ch_taxonomy, ch_diamond_result, ch_sam_coverage) ch_software_versions_S07 = S07_TAXO_AFFI.out.software_versions diff --git a/subworkflows/05_alignment.nf b/subworkflows/05_alignment.nf index 9902230..9017d47 100644 --- a/subworkflows/05_alignment.nf +++ b/subworkflows/05_alignment.nf @@ -9,16 +9,17 @@ workflow STEP_05_ALIGNMENT { ch_diamond_v = Channel.empty() - ch_m8 =Channel.empty() - if (!params.skip_func_annot || !params.skip_taxo_affi){ - DIAMOND ( - prokka_faa, - diamond - ) - ch_m8 = DIAMOND.out.m8 - ch_diamond_v = DIAMOND.out.v_diamond - } + ch_diamond_result =Channel.empty() + + DIAMOND ( + prokka_faa, + diamond, + ) + + ch_diamond_result = DIAMOND.out.m8 + ch_diamond_v = DIAMOND.out.v_diamond + emit: - m8 = ch_m8 + diamond_result = ch_diamond_result software_versions = ch_diamond_v.first() } -- GitLab From 2b12c305cd1fe43aeb8b3926b7e9158c8bc4f294 Mon Sep 17 00:00:00 2001 From: VIENNE MAINA <maina.vienne@inrae.fr> Date: Tue, 8 Nov 2022 10:45:26 +0100 Subject: [PATCH 10/31] refactoring --- bin/{aln2taxaffi.py => aln_to_tax_affi.py} | 2 +- ...marize.py => bins_per_sample_summarize.py} | 4 +- ...ge_abundance_and_functional_annotations.py | 20 +++----- bin/merge_annotations.py | 2 +- bin/merge_kaiju_results.py | 19 +++---- ...clusters.py => quantification_clusters.py} | 49 +++++++------------ modules/assign_taxonomy.nf | 2 +- modules/feature_counts.nf | 2 +- modules/sum_up_bins_informations.nf | 4 +- 9 files changed, 41 insertions(+), 63 deletions(-) rename bin/{aln2taxaffi.py => aln_to_tax_affi.py} (99%) rename bin/{Bins_per_sample_summarize.py => bins_per_sample_summarize.py} (99%) rename bin/{Quantification_clusters.py => quantification_clusters.py} (88%) diff --git a/bin/aln2taxaffi.py b/bin/aln_to_tax_affi.py similarity index 99% rename from bin/aln2taxaffi.py rename to bin/aln_to_tax_affi.py index 241d836..6635297 100755 --- 
a/bin/aln2taxaffi.py +++ b/bin/aln_to_tax_affi.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """---------------------------------------------------------------------------- - Script Name: aln2taxaffi.py + Script Name: aln_to_tax_affi.py Description: Input files: File with correspondence between accession ids and taxon ids, \ taxonomy directory and diamond output file (.m8) diff --git a/bin/Bins_per_sample_summarize.py b/bin/bins_per_sample_summarize.py similarity index 99% rename from bin/Bins_per_sample_summarize.py rename to bin/bins_per_sample_summarize.py index 89efa2b..08e7112 100755 --- a/bin/Bins_per_sample_summarize.py +++ b/bin/bins_per_sample_summarize.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """---------------------------------------------------------------------------- - Script Name: Bins_per_sample.py + Script Name: bins_per_sample_summarize.py Description: Generate abundances table of bins between samples, with also \ taxonomic and genomes informations from gtdb-tk and Checkm2. Input files: Samtools coverages anf flagstats files per sample, .fasta file of bins, \ @@ -307,4 +307,4 @@ def main(): write_report_file(general_table, args.report_file, args.checkm_file, args.table_file) if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/bin/merge_abundance_and_functional_annotations.py b/bin/merge_abundance_and_functional_annotations.py index 3885286..614b123 100755 --- a/bin/merge_abundance_and_functional_annotations.py +++ b/bin/merge_abundance_and_functional_annotations.py @@ -28,18 +28,14 @@ __status__ = 'dev' # Status: dev. # Modules importation. -try: - import argparse - import re - import sys - import pandas as pd - from datetime import datetime -except ImportError as error: - print(error) - exit(1) - -# Print time. -print(str(datetime.now())) + +import argparse +import re +import sys +import pandas as pd +from datetime import datetime + + # Manage parameters. parser = argparse.ArgumentParser(description = 'Script which join \ diff --git a/bin/merge_annotations.py b/bin/merge_annotations.py index afa038a..4b35037 100755 --- a/bin/merge_annotations.py +++ b/bin/merge_annotations.py @@ -268,4 +268,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/bin/merge_kaiju_results.py b/bin/merge_kaiju_results.py index 9980126..25cdb1c 100755 --- a/bin/merge_kaiju_results.py +++ b/bin/merge_kaiju_results.py @@ -21,18 +21,13 @@ __status__ = 'dev' # Status: dev. # Modules importation. -try: - import argparse - import re - import sys - import pandas as pd - from datetime import datetime -except ImportError as error: - print(error) - exit(1) - -# Print time. -print(str(datetime.now())) + +import argparse +import re +import sys +import pandas as pd +from datetime import datetime + # Manage parameters. parser = argparse.ArgumentParser(description = 'Script which join \ diff --git a/bin/Quantification_clusters.py b/bin/quantification_clusters.py similarity index 88% rename from bin/Quantification_clusters.py rename to bin/quantification_clusters.py index 49f4ad4..84c7367 100755 --- a/bin/Quantification_clusters.py +++ b/bin/quantification_clusters.py @@ -1,15 +1,14 @@ #!/usr/bin/env python """-------------------------------------------------------------------- - Script Name: Quantification_clusters.py + Script Name: quantification_clusters.py Description: Create a file which join table with global cluster id and intermediate cluster id to table with intermediate cluster id and genes id. 
Create a file which contains sum of reads aligned to each gene of a cluster. - Input files: - 1st input file: table_clstr.txt (table with cluster id + Input files: 1st input file: table_clstr.txt (table with cluster id and corresponding intermediate cluster ids) 2nd input file: file containing list of file names generated with 1st cd-hit for each sample @@ -17,7 +16,7 @@ 3rd input file: file containing list of file names generated with featureCounts for each sample (.featureCounts.count files) - Created By: Joanna Fourquet et Celine Noirot + Created By: Joanna Fourquet and Celine Noirot Date: 2019-04-11 ----------------------------------------------------------------------- """ @@ -34,14 +33,11 @@ __status__ = 'dev' # Status: dev. # Modules importation. -try: - import argparse - import re - import sys - from datetime import datetime -except ImportError as error: - print(error) - exit(1) +import argparse +import re +import sys +from datetime import datetime + # Print time. print(str(datetime.now())) @@ -52,22 +48,22 @@ correspondence table between global cluster id and gene id and \ a table with number of aligned reads in each sample and for each \ global cluster id.') -parser.add_argument('-t', '--table_of_corespondences', required = True, \ +parser.add_argument('-t', '--table_of_corespondences', required = True, help = 'Correspondence table between global cluster \ id and intermediate cluster id.') -parser.add_argument('-l', '--list_of_file_clusters', required = True, \ +parser.add_argument('-l', '--list_of_file_clusters', required = True, help = 'List of files containing correspondence tables between \ cluster intermediate cluster id and gene id per sample.') -parser.add_argument('-c', '--list_of_file_counts', required = True, \ +parser.add_argument('-c', '--list_of_file_counts', required = True, help = 'List of files storing read counts for each gene per sample.') -parser.add_argument('-oc', '--output_counts', required = True, \ +parser.add_argument('-oc', '--output_counts', required = True, help = 'Name of output file containing counts \ for each global cluster id and each sample.') -parser.add_argument('-oid', '--output_id', required = True, \ +parser.add_argument('-oid', '--output_id', required = True, help = 'Name of output file containing correspondence table \ between global cluster id and gene id.') @@ -100,8 +96,6 @@ with open(args.table_of_corespondences) as fp: print(d_g_clstr_id_by_int_clstr_id) print(d_count_by_g_clstr) -# Print date. -print(str(datetime.now())) # Initialization of dictionnary d_g_clstr_id_by_gene_id. d_g_clstr_id_by_gene_id = {} @@ -127,10 +121,7 @@ for int_clstr_gene_path in files_of_int_clstr_id_gene_id: print(line_int_clstr_gene) int_clstr_id = line_int_clstr_gene[0] gene_id_from_clstr_gene_path = line_int_clstr_gene[1] - if \ - 'd_g_clstr_id_by_gene_id[gene_id_from_clstr_gene_path]' \ - not in d_g_clstr_id_by_gene_id: - print("if") + if 'd_g_clstr_id_by_gene_id[gene_id_from_clstr_gene_path]' not in d_g_clstr_id_by_gene_id: d_g_clstr_id_by_gene_id[gene_id_from_clstr_gene_path] \ = d_g_clstr_id_by_int_clstr_id[int_clstr_id] else: @@ -139,8 +130,7 @@ for int_clstr_gene_path in files_of_int_clstr_id_gene_id: print(d_g_clstr_id_by_gene_id) -# Print date. 
-print(str(datetime.now())) + # For each count file (output of featureCounts), reading of lines one by one, # recovery of name of gene and count number and incrementing of corresponding @@ -157,8 +147,7 @@ for (count_idx,counts_path) in enumerate(sorted(files_of_counts)): d_count_by_g_clstr[d_g_clstr_id_by_gene_id[gene_id]]\ [count_idx] += gene_count -# Print date. -print(str(datetime.now())) + ####################################### # Write in the output files. @@ -178,8 +167,7 @@ with open(args.output_id,"w") as foutput_res_table: + gene_id \ + "\n") -# Print date. -print(str(datetime.now())) + # Write output file containing global cluster id and read count for each sample. with open(args.output_counts,"w") as foutput_res_counts: @@ -192,5 +180,4 @@ with open(args.output_counts,"w") as foutput_res_counts: + "\t".join([str(i) for i in count])\ + "\n") -# Print date. -print(str(datetime.now())) + diff --git a/modules/assign_taxonomy.nf b/modules/assign_taxonomy.nf index 559c06c..9634f94 100644 --- a/modules/assign_taxonomy.nf +++ b/modules/assign_taxonomy.nf @@ -38,7 +38,7 @@ process ASSIGN_TAXONOMY { fi - aln2taxaffi.py -a ${accession2taxid} --taxonomy \$new_taxdump_var \ + aln_to_tax_affi.py -a ${accession2taxid} --taxonomy \$new_taxdump_var \ -o ${meta.id} -b ${m8} --keep_only_best_aln \ -v --write_top_taxons merge_contig_quantif_perlineage.py -c ${meta.id}.percontig.tsv -s ${sam_coverage} -o ${meta.id} diff --git a/modules/feature_counts.nf b/modules/feature_counts.nf index 97587cc..73ec38c 100644 --- a/modules/feature_counts.nf +++ b/modules/feature_counts.nf @@ -37,7 +37,7 @@ process QUANTIFICATION_TABLE { """ ls ${clusters_contigs} | cat > List_of_contigs_files.txt ls ${counts_files} | cat > List_of_count_files.txt - Quantification_clusters.py -t ${global_clusters_clusters} -l List_of_contigs_files.txt -c List_of_count_files.txt -oc Clusters_Count_table_all_samples.txt -oid Correspondence_global_clstr_genes.txt + quantification_clusters.py -t ${global_clusters_clusters} -l List_of_contigs_files.txt -c List_of_count_files.txt -oc Clusters_Count_table_all_samples.txt -oid Correspondence_global_clstr_genes.txt """ } diff --git a/modules/sum_up_bins_informations.nf b/modules/sum_up_bins_informations.nf index d09a2a1..842b43c 100644 --- a/modules/sum_up_bins_informations.nf +++ b/modules/sum_up_bins_informations.nf @@ -18,7 +18,7 @@ process GENOMES_ABUNDANCES_PER_SAMPLE { script: """ mkdir -p stats - Bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \ + bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \ --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions} \ --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \ --output_file genomes_abundances.tsv --report_file stats/genomes_abundances_mqc.tsv \ @@ -76,4 +76,4 @@ process BINS_STATS_TO_MUTLIQC_FORMAT { cat bin_size_per_quality.tsv >> bin_size_per_quality_mqc.tsv """ -} \ No newline at end of file +} -- GitLab From 7827694c726844bb60b1766b8aac673f6ff341af Mon Sep 17 00:00:00 2001 From: VIENNE MAINA <maina.vienne@inrae.fr> Date: Tue, 8 Nov 2022 10:53:03 +0100 Subject: [PATCH 11/31] Fix filter_diamond_hits.py --- bin/filter_diamond_hits.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/filter_diamond_hits.py b/bin/filter_diamond_hits.py index 375542a..a29e514 100755 --- a/bin/filter_diamond_hits.py +++ b/bin/filter_diamond_hits.py @@ -68,9 +68,8 @@ def get_all_hits_per_query(blast_result_file): def 
is_identity_and_coverage_ok(hit, min_identity, min_coverage):
     qcovhsp = (int(hit["qend"]) - int(hit["qstart"]) + 1) / int(hit['qlen'])
-    if float(hit['pident']) >= min_identity or qcovhsp >= min_coverage:
-        return True
-    return False
+    return float(hit['pident']) >= min_identity and qcovhsp >= min_coverage
+
 
 def parse_arguments():
--
GitLab

From 35a5f84eced5556e2df392b319e3fc8098240938 Mon Sep 17 00:00:00 2001
From: VIENNE MAINA <maina.vienne@inrae.fr>
Date: Tue, 8 Nov 2022 12:04:11 +0100
Subject: [PATCH 12/31] Update bin/format_bins_stat_to_multiqc.py,
 bin/merge_annotations.py, bin/plot_contigs_taxonomic_affiliation.py,
 bin/plot_kaiju_stat.py, bin/quantification_by_contig_lineage.py,
 bin/quantification_by_functional_annotation.py, bin/db_versions.py

---
 bin/db_versions.py                            | 16 +++++++++-----
 bin/format_bins_stat_to_multiqc.py            |  4 +++-
 bin/merge_annotations.py                      | 12 +++++-----
 bin/plot_contigs_taxonomic_affiliation.py     |  6 ++---
 bin/plot_kaiju_stat.py                        |  8 +++----
 bin/quantification_by_contig_lineage.py       | 22 ++++++++-----------
 ...quantification_by_functional_annotation.py | 14 +++++-------
 7 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/bin/db_versions.py b/bin/db_versions.py
index 3088ea3..bf762bc 100755
--- a/bin/db_versions.py
+++ b/bin/db_versions.py
@@ -5,21 +5,25 @@
 import subprocess
 import os.path
 import gzip
 
-def info_db(file):
-    name=file.split()[0]
-    path =os.path.realpath(file.split()[1])
-    if file.split()[1]=='nodes.dmp':
+def info_db(db_info_file):
+    name=db_info_file.split()[0]
+    path =os.path.realpath(db_info_file.split()[1])
+
+    if db_info_file.split()[1]=='nodes.dmp':
         path=os.path.dirname(path)
+
+    # get db size
     process = subprocess.run(['du', '-sh',path], stdout=subprocess.PIPE)
     size = process.stdout.split()[0].decode('utf-8')
+
     if (name == "Host_genome"):
         size = f"{size} ({get_genome_seq_count(path)} seq)"
+
+    # get date of last modifaction
     process = subprocess.run(['stat', '-c %y', path], stdout=subprocess.PIPE)
-    modif = process.stdout.split()[0].decode('utf-8')
+    modif_date = process.stdout.split()[0].decode('utf-8')
 
-    return name,size,modif,path
+    return name,size,modif_date,path
 
 
 def get_genome_seq_count(genome_path):
diff --git a/bin/format_bins_stat_to_multiqc.py b/bin/format_bins_stat_to_multiqc.py
index e94ee23..05fdeb6 100755
--- a/bin/format_bins_stat_to_multiqc.py
+++ b/bin/format_bins_stat_to_multiqc.py
@@ -22,7 +22,7 @@ import pandas as pd
 
 
 def parse_arguments():
-    """Parse script arguments."""
+    """Parse script arguments."""
 
     parser = ArgumentParser(description="...",
                             formatter_class=ArgumentDefaultsHelpFormatter)
@@ -65,6 +65,7 @@ def main():
 
     # remove Not_binned category when counting bins in different quality
     df_ech_count = df_ech.loc[~(df_ech['genome'] == "Not_binned")]
+
     df_ech_gr = df_ech_count.groupby(['Quality']).size().reset_index(name='counts')
     df_ech_gr = df_ech_gr.set_index("Quality")
 
@@ -91,6 +92,7 @@ def main():
     logging.info(f'Writing {args.out_bins_count}')
     df_count.to_csv(args.out_bins_count, sep='\t')
 
+
     logging.info(f'Writing {args.out_bins_size}')
     df_size.to_csv(args.out_bins_size, sep='\t')
 
diff --git a/bin/merge_annotations.py b/bin/merge_annotations.py
index 4b35037..8843414 100755
--- a/bin/merge_annotations.py
+++ b/bin/merge_annotations.py
@@ -12,7 +12,7 @@ merge_annotations.py -c prodigal.gff -r barrnap.gff -t trnascan_se.gff --contig_
 
 # Metadata
 __author__ = 'Mainguy Jean - Plateforme bioinformatique Toulouse'
-__copyright__ = 'Copyright (C) 2020 INRAE'
+__copyright__ = 'Copyright (C) 2022 INRAE'
__license__ = 'GNU General Public License' __version__ = '0.1' __email__ = 'support.bioinfo.genotoul@inra.fr' @@ -25,7 +25,7 @@ import gzip import csv from collections import defaultdict import pyfastx -from Bio.Seq import Seq + def parse_arguments(): @@ -244,16 +244,16 @@ def main(): out_report = args.report_output - logging.info('Parsing rRNA annoations.') + logging.info('Parsing rRNA annotations.') rrna_annotations = parse_gff_file(rrna_file, 'rRNA') - logging.info('Parsing tRNA annoations.') + logging.info('Parsing tRNA annotations.') trna_annotations = parse_gff_file(trna_file, 'tRNA') - logging.info('Grouping RNA annoations by contig.') + logging.info('Grouping RNA annotations by contig.') contig2rnas = group_annotations_by_contigs(trna_annotations, rrna_annotations) - logging.info('Removing CDS annoations overlapping a RNA annotations.') + logging.info('Removing CDS annotations overlapping a RNA annotations.') unoverlapping_cds_per_contig = remove_overlapping_cds(cds_file, contig2rnas) logging.info('Merge CDS and RNA annotations by contig.') diff --git a/bin/plot_contigs_taxonomic_affiliation.py b/bin/plot_contigs_taxonomic_affiliation.py index 3d51a38..a170c28 100755 --- a/bin/plot_contigs_taxonomic_affiliation.py +++ b/bin/plot_contigs_taxonomic_affiliation.py @@ -303,7 +303,7 @@ def parse_arguments(): parser.add_argument('--output_dir', default='plots', help="Name of the output directory") - parser.add_argument('--top_taxon', default=10, type=int, help="Plot only the top n most abundant taxa.") + parser.add_argument('--nb_top_taxon', default=10, type=int, help="Plot only the top n most abundant taxa.") parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true") @@ -326,7 +326,7 @@ def main(): contig_affi_files = args.affi_taxo_quantif - top_n_taxon = args.top_taxon + top_n_taxon = args.nb_top_taxon output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) @@ -385,7 +385,7 @@ def main(): html_figs = [] for i, (rank, fig) in enumerate(rank2fig.items()): - include_plotly = True if i == 0 else False + include_plotly = i == 0 html_fig = fig.to_html(full_html=False, include_plotlyjs=include_plotly) html_figs.append((rank, html_fig )) diff --git a/bin/plot_kaiju_stat.py b/bin/plot_kaiju_stat.py index 9ea7e0b..24fcde5 100755 --- a/bin/plot_kaiju_stat.py +++ b/bin/plot_kaiju_stat.py @@ -2,7 +2,7 @@ """---------------------------------------------------------------------------------------------------------------------------------------------------------- Script Name: plot_kaiju_stat.py - Description: Generates density plot distribution of kiaju match length + Description: Generates density plot distribution of kaiju match length Input files: Verbose files generated by Kaiju for each sample. 
Created By: Jean Mainguy @@ -88,7 +88,7 @@ def main(): density_df_samples = [] max_match_length = 0 - sample2dfmatchlength = {} + sample2df_matchlen = {} logging.info(f'Parsing kaiju results') for kaiju_result in kaiju_results: @@ -98,14 +98,14 @@ def main(): df_matchlen = parse_matchlen_from_kaiju_output(kaiju_result) max_match_length = max((max_match_length, df_matchlen['match_length'].max())) - sample2dfmatchlength[sample_name] = df_matchlen + sample2df_matchlen[sample_name] = df_matchlen x_vals = np.linspace(0,max_match_length,max_match_length) # Specifying the limits of our data logging.info(f'Maximum match length is {max_match_length}') logging.info(f'Computing density for each sample') - for sample_name, df_matchlen in sample2dfmatchlength.items(): + for sample_name, df_matchlen in sample2df_matchlen.items(): density = gaussian_kde(df_matchlen['match_length'], weights = df_matchlen['reads']) density.covariance_factor = lambda : smoothing_parameter #Smoothing parameter density._compute_covariance() diff --git a/bin/quantification_by_contig_lineage.py b/bin/quantification_by_contig_lineage.py index 3092db3..d85fb2f 100755 --- a/bin/quantification_by_contig_lineage.py +++ b/bin/quantification_by_contig_lineage.py @@ -22,19 +22,15 @@ __status__ = 'dev' # Status: dev. # Modules importation. -try: - import argparse - import re - import sys - import pandas as pd - import os - from datetime import datetime -except ImportError as error: - print(error) - exit(1) - -# Print time. -print(str(datetime.now())) + +import argparse +import re +import sys +import pandas as pd +import os +from datetime import datetime + + # Manage parameters. parser = argparse.ArgumentParser(description = 'Script which make \ diff --git a/bin/quantification_by_functional_annotation.py b/bin/quantification_by_functional_annotation.py index a6e0e72..e9e7515 100755 --- a/bin/quantification_by_functional_annotation.py +++ b/bin/quantification_by_functional_annotation.py @@ -23,14 +23,12 @@ __status__ = 'dev' # Status: dev. # Modules importation. -try: - import argparse - import re - import sys - import pandas as pd -except ImportError as error: - print(error) - exit(1) + +import argparse +import re +import sys +import pandas as pd + # Manage parameters. 
parser = argparse.ArgumentParser(description = 'Create a file with \ -- GitLab From 2cb9f5af7d7e66ffcd770f9dcaff7821af02b766 Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Thu, 17 Nov 2022 16:28:50 +0100 Subject: [PATCH 13/31] remove unnecessary line --- modules/cd_hit.nf | 1 - subworkflows/02_assembly.nf | 2 -- subworkflows/04_structural_annot.nf | 2 -- 3 files changed, 5 deletions(-) diff --git a/modules/cd_hit.nf b/modules/cd_hit.nf index 217bc92..1bf02ec 100644 --- a/modules/cd_hit.nf +++ b/modules/cd_hit.nf @@ -55,7 +55,6 @@ main: INDIVIDUAL_CD_HIT( ch_assembly, ch_percentage_identity ) ch_individual_clusters = INDIVIDUAL_CD_HIT.out.clstr_fasta.collect() GLOBAL_CD_HIT(ch_individual_clusters , ch_percentage_identity ) - ch_ffn = ch_assembly.flatMap{it -> it[1]}.collect() emit: individual_clstr_table = INDIVIDUAL_CD_HIT.out.clstr_table diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf index 3797c8f..5630b4e 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -80,8 +80,6 @@ workflow STEP_02_ASSEMBLY { ch_reads_assembly = ch_reads } - ch_reads_assembly.view() - if (has_assembly){ ch_assembly = assembly } else { diff --git a/subworkflows/04_structural_annot.nf b/subworkflows/04_structural_annot.nf index 49841ef..505e633 100644 --- a/subworkflows/04_structural_annot.nf +++ b/subworkflows/04_structural_annot.nf @@ -17,8 +17,6 @@ workflow STEP_04_STRUCTURAL_ANNOT { MERGE_ANNOTATIONS(annotations_ch) - ch_software_versions = PRODIGAL.out.v_prodigal.first() - ch_software_versions = PRODIGAL.out.v_prodigal.first().mix( BARRNAP.out.v_barrnap.first(), TRNASCAN_SE.out.v_tRNAscan.first()) -- GitLab From ae32ab7e48aecf5f29f36fae4efa4295f4262796 Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Thu, 17 Nov 2022 16:29:29 +0100 Subject: [PATCH 14/31] Manage coassembly for step 03 (filtering) --- subworkflows/03_filtering.nf | 156 +++++++++++++++++++++++------------ 1 file changed, 103 insertions(+), 53 deletions(-) diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf index be38cd6..cbb1dcf 100644 --- a/subworkflows/03_filtering.nf +++ b/subworkflows/03_filtering.nf @@ -13,10 +13,33 @@ workflow STEP_03_ASSEMBLY_FILTER { main: ch_chunk_assembly_for_filter = assemblies .splitFasta(by: 100000, file: true) - - ch_assembly_and_idxstats = ch_chunk_assembly_for_filter + if (params.coassembly){ + idxstats.map { meta, idxstats -> + [ meta.group, meta, idxstats] } + .groupTuple(by: [0]) + .map { group, metas, idxstats -> + def meta = [:] + meta.id = metas.group.unique().join() + meta.sample = metas.sample.join("_") + meta.flowcell = metas.flowcell.unique().join() + meta.group = metas.group.unique().join() + meta.assembly = metas.assembly.unique().join() + meta.type = metas.type.unique().join() + [ group, meta, idxstats] } + .set { ch_idxstats_tmp } + ch_chunk_assembly_for_filter.map { meta, assembly -> + [ meta.group, assembly ]} + .combine(ch_idxstats_tmp, by: 0) + .map { group, assembly, meta, idxstats -> + [ meta, assembly, idxstats ]} + .set { ch_assembly_and_idxstats } + } else { + ch_assembly_and_idxstats = ch_chunk_assembly_for_filter .combine(idxstats, by:0) + } + + CHUNK_ASSEMBLY_FILTER ( ch_assembly_and_idxstats, @@ -42,57 +65,84 @@ workflow STEP_03_ASSEMBLY_FILTER { discarded_contigs = MERGE_ASSEMBLY_FILTER.out.merged_discarded - // Differentiate sample with and without discarded_contigs - // samples with no discarded_contigs are not going to be processed to process - 
discarded_contigs_category = discarded_contigs.branch{ - without: it[1].isEmpty() - with: !it[1].isEmpty() - } - - - samples_without_discarded_contigs = discarded_contigs_category.without.map{ it -> it[0]} - ch_bam_of_sample_without_discarded_contigs = samples_without_discarded_contigs.join(bam) - - ch_discarded_contigs_and_bam = discarded_contigs_category.with.join(bam) - - FILTER_BAM(ch_discarded_contigs_and_bam) - - ch_discarded_bam = FILTER_BAM.out.discarded_contigs_bam - ch_selected_bam = FILTER_BAM.out.selected_contigs_bam - - if ( params.type.toUpperCase() == "SR" ) { - - ch_discarded_reads = EXTRACT_PAIRED_READS(ch_discarded_bam) - - ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads) - - BWA_MEM(ch_contigs_and_discarded_reads, "03_filtering") - ch_bam_with_discarded_reads = BWA_MEM.out.bam - - } else { - ch_discarded_reads = EXTRACT_SINGLE_READS(ch_discarded_bam) - ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads) - MINIMAP2(ch_contigs_and_discarded_reads, "03_filtering") - ch_bam_with_discarded_reads = MINIMAP2.out.bam - - } - - - // remove bai from ch_bam_with_discarded_reads and add bam of selected contigs - ch_bam_to_merge = ch_bam_with_discarded_reads.map{ it -> [it[0], it[1]]}.join(ch_selected_bam) - - ch_merged_bam = MERGE_BAM_FILES(ch_bam_to_merge) - - ch_final_bam = ch_bam_of_sample_without_discarded_contigs.mix(ch_merged_bam) - - - GET_ALIGNMENT_METRICS(ch_final_bam, '03_filtering/') - - ch_flagstat = GET_ALIGNMENT_METRICS.out.sam_flagstat - ch_coverage = GET_ALIGNMENT_METRICS.out.sam_coverage - - QUAST( ch_merged_selected, "03_filtering/quast_filtered" ) - ch_quast_report = QUAST.out.report + // Differentiate sample with and without discarded_contigs + // samples with no discarded_contigs are not going to be processed to process + discarded_contigs_category = discarded_contigs.branch{ + without: it[1].isEmpty() + with: !it[1].isEmpty() + } + + + samples_without_discarded_contigs = discarded_contigs_category.without.map{ it -> it[0]} + + if (params.coassembly){ + bam.map { meta, bam, bai -> + [ meta.group, meta, bam, bai ]} + .set { ch_bam_tmp } + + samples_without_discarded_contigs.map { meta, assembly -> + [ meta.group, assembly ]} + .combine( ch_bam_tmp, by: 0) + .map { group, assembly, meta, bam, bai -> + [ meta, assembly, bam, bai ]} + .set { ch_bam_of_sample_without_discarded_contigs } + discarded_contigs_category.with.map { meta, discarded_contigs -> + [ meta.group, discarded_contigs ]} + .combine( ch_bam_tmp, by: 0) + .map { group, discarded_contigs, meta, bam, bai -> + [ meta, discarded_contigs, bam, bai ]} + .set { ch_discarded_contigs_and_bam } + // metadata retrieval from bam channel, for the join with discarded reads after "EXTRACT_PAIRED_READS" + ch_merged_selected.map { meta, assembly -> + [ meta.group, assembly ]} + .combine( ch_bam_tmp, by: 0) + .map { group, assembly, meta, bam, bai -> + [ meta, assembly ]} + .set { ch_merged_selected_meta } + } else { + ch_bam_of_sample_without_discarded_contigs = samples_without_discarded_contigs.join(bam) + ch_discarded_contigs_and_bam = discarded_contigs_category.with.join(bam) + ch_merged_selected_meta = ch_merged_selected + } + + FILTER_BAM(ch_discarded_contigs_and_bam) + + ch_discarded_bam = FILTER_BAM.out.discarded_contigs_bam + ch_selected_bam = FILTER_BAM.out.selected_contigs_bam + + + + if ( params.type.toUpperCase() == "SR" ) { + + ch_discarded_reads = EXTRACT_PAIRED_READS(ch_discarded_bam) + ch_contigs_and_discarded_reads = 
ch_merged_selected_meta.join(ch_discarded_reads)
+
+            BWA_MEM(ch_contigs_and_discarded_reads, "03_filtering")
+            ch_bam_with_discarded_reads = BWA_MEM.out.bam
+
+        } else {
+            ch_discarded_reads = EXTRACT_SINGLE_READS(ch_discarded_bam)
+            ch_contigs_and_discarded_reads = ch_merged_selected_meta.join(ch_discarded_reads)
+            MINIMAP2(ch_contigs_and_discarded_reads, "03_filtering")
+            ch_bam_with_discarded_reads = MINIMAP2.out.bam
+
+        }
+
+        // remove bai from ch_bam_with_discarded_reads and add bam of selected contigs
+        ch_bam_to_merge = ch_bam_with_discarded_reads.map{ it -> [it[0], it[1]]}.join(ch_selected_bam)
+
+        ch_merged_bam = MERGE_BAM_FILES(ch_bam_to_merge)
+
+        ch_final_bam = ch_bam_of_sample_without_discarded_contigs.mix(ch_merged_bam)
+
+
+        GET_ALIGNMENT_METRICS(ch_final_bam, '03_filtering/')
+
+        ch_flagstat = GET_ALIGNMENT_METRICS.out.sam_flagstat
+        ch_coverage = GET_ALIGNMENT_METRICS.out.sam_coverage
+
+        QUAST( ch_merged_selected, "03_filtering/quast_filtered" )
+        ch_quast_report = QUAST.out.report
 
     emit:
     selected_contigs = ch_merged_selected
--
GitLab

From e735885ed9632cad17b4b6d568e9897efa9b0a91 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Fri, 28 Oct 2022 13:24:11 +0200
Subject: [PATCH 15/31] Reduce main.nf by moving "manage flowcell" into the
 subworkflow 02_assembly

---
 main.nf                     | 45 +--------------------------
 subworkflows/02_assembly.nf | 50 +++++++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 52 deletions(-)

diff --git a/main.nf b/main.nf
index d9bc02b..d364366 100644
--- a/main.nf
+++ b/main.nf
@@ -244,7 +244,6 @@ workflow {
             def meta = [:]
             meta.id = item.sample
             if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-            if (item.group !=null) {meta.id = meta.id+"_"+item.group}
             meta.sample = item.sample
             meta.flowcell = item.flowcell
             meta.group = item.group
@@ -265,7 +264,6 @@ workflow {
             def meta = [:]
             meta.id = item.sample
             if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell}
-            if (item.group !=null) {meta.id = meta.id+"_"+item.group}
             meta.sample = item.sample
             meta.flowcell = item.flowcell
             meta.group = item.group
@@ -362,49 +360,8 @@ workflow {
     }
 
     if ( !params.stop_at_clean ) {
-        if (!has_assembly & has_flowcell ){
-            //////////////////
-            // Manage Flowcell
-            //////////////////
-            ch_reads_tmp = ch_preprocessed_reads
-                .map {
-                    meta, fastq ->
-                    [ meta.sample, meta, fastq ]
-                }
-                .groupTuple(by: [0])
-                .branch {
-                    id, meta, fastq ->
-                    single : fastq.size() == 1
-                        return [[id:meta.sample.unique().join(),
-                                sample:meta.sample.unique().join(),
-                                flowcell:meta.flowcell.join("_"),
-                                group:meta.group.unique().join(),
-                                assembly:meta.assembly.unique().join(),
-                                type:meta.type.unique().join()], fastq.flatten() ]
-                    multiple: fastq.size() > 1
-                        return [[id:meta.sample.unique().join(),
-                                sample:meta.sample.unique().join(),
-                                flowcell:meta.flowcell.join("_"),
-                                group:meta.group.unique().join(),
-                                assembly:meta.assembly.unique().join(),
-                                type:meta.type.unique().join()], fastq.flatten() ]
-                }
-
-
-        MERGE_FASTQ (
-            ch_reads_tmp.multiple
-        )
-        .reads
-        .mix(ch_reads_tmp.single)
-        .set{ch_preprocessed_reads}
-        }
-
-        /////////////////////
-        //End manage Flowcell
-        /////////////////////
-
-        S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool )
+        S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool, has_flowcell )
         ch_assembly = S02_ASSEMBLY.out.assembly
         ch_reads = S02_ASSEMBLY.out.reads
         ch_bam = S02_ASSEMBLY.out.bam
diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf
index d202dbd..3f40572 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -6,10 +6,11 @@ include { GET_ALIGNMENT_METRICS } from '../modules/read_alignment_manipulation' workflow STEP_02_ASSEMBLY { take: - reads - assembly - has_assembly - assembly_tool + reads + assembly + has_assembly + assembly_tool + has_flowcell main: ch_assembler_v = Channel.empty() @@ -17,11 +18,45 @@ workflow STEP_02_ASSEMBLY { ch_bwa_mem_v = Channel.empty() ch_minimap2_v = Channel.empty() ch_samtools_v = Channel.empty() - + + + if (!has_assembly & has_flowcell ){ + ////////////////// + // Manage Flowcell + ////////////////// + ch_reads_flowcell = reads + .map { meta, fastq -> + [ meta.sample, meta, fastq ] } + .groupTuple(by: [0]) + .branch { id, meta, fastq -> + single : fastq.size() == 1 + return [[id:meta.sample.unique().join(), + sample:meta.sample.unique().join(), + flowcell:meta.flowcell.join("_"), + group:meta.group.unique().join(), + assembly:meta.assembly.unique().join(), + type:meta.type.unique().join()], fastq.flatten() ] + multiple: fastq.size() > 1 + return [[id:meta.sample.unique().join(), + sample:meta.sample.unique().join(), + flowcell:meta.flowcell.join("_"), + group:meta.group.unique().join(), + assembly:meta.assembly.unique().join(), + type:meta.type.unique().join()], fastq.flatten() ] + } + + MERGE_FASTQ (ch_reads_flowcell.multiple) + .reads + .mix(ch_reads_flowcell.single) + .set{ch_reads} + + } else { + ch_reads = reads + } + if (has_assembly){ ch_assembly = assembly - } - else { + } else { if(assembly_tool == 'metaspades') { METASPADES(reads) ch_assembly = METASPADES.out.assembly @@ -58,7 +93,6 @@ workflow STEP_02_ASSEMBLY { ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true) if ( params.type.toUpperCase() == "SR" ) { - READS_DEDUPLICATION ( ch_contigs_and_reads ) ch_reads = READS_DEDUPLICATION.out.dedup_reads -- GitLab From 3cf8c518179501558d5c8cee44789b54f351b66a Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Tue, 8 Nov 2022 08:49:11 +0100 Subject: [PATCH 16/31] manage coassembly for step 2 Adds coassembly parameter Adapts assembly modules to take multiple fastq --- modules/assembly.nf | 26 ++++++++--- nextflow.config | 1 + subworkflows/02_assembly.nf | 86 ++++++++++++++++++++++++++----------- 3 files changed, 83 insertions(+), 30 deletions(-) diff --git a/modules/assembly.nf b/modules/assembly.nf index 728e8b9..a6f82bf 100644 --- a/modules/assembly.nf +++ b/modules/assembly.nf @@ -4,7 +4,7 @@ process METASPADES { label 'ASSEMBLY_SR' input: - tuple val(meta), path(reads) + tuple val(meta), path(read1), path(read2) output: tuple val(meta), path("metaspades_${meta.id}/${meta.id}.contigs.fa"), emit: assembly @@ -15,7 +15,21 @@ process METASPADES { (_,mem,unit) = (task.memory =~ /(\d+) ([A-Z]B)/)[0] if ( unit =~ /GB/ ) { """ - metaspades.py -t ${task.cpus} -m $mem -1 ${reads[0]} -2 ${reads[1]} -o metaspades_${meta.id} + echo " +[ + { + orientation: \\"fr\\", + type: \\"paired-end\\", + right reads: [ + \\"${read1.join('\\",\n \\"')}\\" + ], + left reads: [ + \\"${read2.join('\\",\n \\"')}\\" + ] + } +]" > input.yaml + + metaspades.py -t ${task.cpus} -m $mem --dataset input.yaml -o metaspades_${meta.id} mv metaspades_${meta.id}/scaffolds.fasta metaspades_${meta.id}/${meta.id}.contigs.fa mv metaspades_${meta.id}/spades.log metaspades_${meta.id}/${meta.id}.log mv metaspades_${meta.id}/params.txt metaspades_${meta.id}/${meta.id}.params.txt @@ -34,7 +48,7 @@ process MEGAHIT { label 'ASSEMBLY_SR' input: - tuple val(meta), 
path(reads) + tuple val(meta), path(read1), path(read2) output: tuple val(meta), path("megahit_${meta.id}/${meta.id}.contigs.fa"), emit: assembly @@ -43,7 +57,7 @@ process MEGAHIT { script: """ - megahit -t ${task.cpus} -1 ${reads[0]} -2 ${reads[1]} -o megahit_${meta.id} --out-prefix "${meta.id}" + megahit -t ${task.cpus} -1 ${read1.join(',')} -2 ${read2.join(',')} -o megahit_${meta.id} --out-prefix "${meta.id}" mv megahit_${meta.id}/options.json megahit_${meta.id}/${meta.id}.params.txt rm -r megahit_${meta.id}/intermediate_contigs @@ -69,7 +83,7 @@ process HIFIASM_META { """ mkdir hifiasm_meta_${meta.id} - hifiasm_meta -t ${task.cpus} -o ${meta.id} $reads 2> hifiasm_meta_${meta.id}/${meta.id}.log + hifiasm_meta -t ${task.cpus} -o ${meta.id} ${reads.join(' ')} 2> hifiasm_meta_${meta.id}/${meta.id}.log # gfa to fasta format awk '/^S/{print ">"\$2"\\n"\$3}' ${meta.id}.p_ctg.gfa | fold > hifiasm_meta_${meta.id}/${meta.id}.contigs.fa @@ -98,7 +112,7 @@ process METAFLYE { """ mkdir metaflye_${meta.id} - flye --pacbio-hifi $reads -o 'metaflye' --meta -t ${task.cpus} 2> metaflye_${meta.id}/${meta.id}.log + flye --pacbio-hifi ${reads.join(' ')} -o 'metaflye' --meta -t ${task.cpus} 2> metaflye_${meta.id}/${meta.id}.log mv metaflye_${meta.id}/assembly.fasta metaflye_${meta.id}/${meta.id}.contigs.fa mv metaflye_${meta.id}/params.json metaflye_${meta.id}/${meta.id}.params.json diff --git a/nextflow.config b/nextflow.config index 1516f4c..22d7fa8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,6 +21,7 @@ params { gtdbtk_bank = "" percentage_identity = 0.95 type = "" + coassembly = false // Stop after step or skip optional step/sub-step. diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf index 3f40572..542e603 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -54,34 +54,58 @@ workflow STEP_02_ASSEMBLY { ch_reads = reads } + if (params.coassembly){ + ch_reads.map { meta, fastq -> + [ meta.group, meta, fastq] } + .groupTuple(by: [0]) + .map { group, metas, fastq -> + def meta = [:] + meta.id = metas.group.unique().join() + meta.sample = metas.sample.join("_") + meta.flowcell = metas.flowcell.unique().join() + meta.group = metas.group.unique().join() + meta.assembly = metas.assembly.unique().join() + meta.type = metas.type.unique().join() + if (params.type.toUpperCase() == "SR") { + return [meta, fastq.collect { it[0] }, fastq.collect { it[1] }] + } else { + return [meta, fastq ] + }} + .set { ch_reads_assembly } + } else if (params.type.toUpperCase() == "SR") { + ch_reads_assembly = ch_reads + .map { meta, fastq -> + return [meta, fastq[0], fastq[1]] } + } else { + ch_reads_assembly = ch_reads + } + + ch_reads_assembly.view() + if (has_assembly){ ch_assembly = assembly } else { - if(assembly_tool == 'metaspades') { - METASPADES(reads) - ch_assembly = METASPADES.out.assembly - ch_assembler_v = METASPADES.out.v_metaspades - } - else if(assembly_tool == 'megahit') { - MEGAHIT(reads) - ch_assembly = MEGAHIT.out.assembly - ch_assembler_v = MEGAHIT.out.v_megahit - } - else if(assembly_tool == 'hifiasm-meta') { - HIFIASM_META(reads) - ch_assembly = HIFIASM_META.out.assembly - ch_assembler_v = HIFIASM_META.out.v_hifiasm_meta - } - else if(assembly_tool == 'metaflye') { - METAFLYE(reads) + if (assembly_tool == 'metaspades') { + METASPADES(ch_reads_assembly) + ch_assembly = METASPADES.out.assembly + ch_assembler_v = METASPADES.out.v_metaspades + } else if (assembly_tool == 'megahit') { + MEGAHIT(ch_reads_assembly) + ch_assembly = MEGAHIT.out.assembly + 
ch_assembler_v = MEGAHIT.out.v_megahit + } else if (assembly_tool == 'hifiasm-meta') { + HIFIASM_META(ch_reads_assembly) + ch_assembly = HIFIASM_META.out.assembly + ch_assembler_v = HIFIASM_META.out.v_hifiasm_meta + } else if (assembly_tool == 'metaflye') { + METAFLYE(ch_reads_assembly) ch_assembly = METAFLYE.out.assembly ch_assembler_v = METAFLYE.out.v_metaflye - } - else { + } else { exit 1, "Invalid assembly parameter: ${assembly_tool}" - } + } } - + ch_assembly_renamed = RENAME_CONTIGS(ch_assembly) ASSEMBLY_QUAST( ch_assembly_renamed,"02_assembly/02_1_primary_assembly/assembly_metric/") ch_assembly_report = ASSEMBLY_QUAST.out.report @@ -89,10 +113,24 @@ workflow STEP_02_ASSEMBLY { ch_idxstats = Channel.empty() ch_flagstat = Channel.empty() - ch_reads = reads + + if (params.coassembly){ + ch_reads.map { meta, fastq -> + [ meta.group, meta, fastq ]} + .set { ch_reads_tmp } + + ch_assembly_renamed.map { meta, assembly -> + [ meta.group, assembly ]} + .combine(ch_reads_tmp, by: 0) + .map { group, assembly, meta, fastq -> + [ meta, assembly,fastq ]} + .set { ch_contigs_and_reads } + } else { + ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true) + } - ch_contigs_and_reads = ch_assembly_renamed.join(ch_reads, remainder: true) if ( params.type.toUpperCase() == "SR" ) { + READS_DEDUPLICATION ( ch_contigs_and_reads ) ch_reads = READS_DEDUPLICATION.out.dedup_reads @@ -104,7 +142,7 @@ workflow STEP_02_ASSEMBLY { MINIMAP2(ch_contigs_and_reads,"02_assembly/02_3_reads_vs_primary_assembly") ch_bam = MINIMAP2.out.bam - ch_minimap2_v = MINIMAP2.out.v_minimap2 + ch_minimap2_v = MINIMAP2.out.v_minimap2 } -- GitLab From 706e1428dfa3613e461ba47a05fcbe38bb13b6cf Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Thu, 17 Nov 2022 16:28:50 +0100 Subject: [PATCH 17/31] remove unnecessary line --- modules/cd_hit.nf | 1 - subworkflows/02_assembly.nf | 2 -- subworkflows/04_structural_annot.nf | 2 -- 3 files changed, 5 deletions(-) diff --git a/modules/cd_hit.nf b/modules/cd_hit.nf index 217bc92..1bf02ec 100644 --- a/modules/cd_hit.nf +++ b/modules/cd_hit.nf @@ -55,7 +55,6 @@ main: INDIVIDUAL_CD_HIT( ch_assembly, ch_percentage_identity ) ch_individual_clusters = INDIVIDUAL_CD_HIT.out.clstr_fasta.collect() GLOBAL_CD_HIT(ch_individual_clusters , ch_percentage_identity ) - ch_ffn = ch_assembly.flatMap{it -> it[1]}.collect() emit: individual_clstr_table = INDIVIDUAL_CD_HIT.out.clstr_table diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf index 542e603..53f6f69 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -80,8 +80,6 @@ workflow STEP_02_ASSEMBLY { ch_reads_assembly = ch_reads } - ch_reads_assembly.view() - if (has_assembly){ ch_assembly = assembly } else { diff --git a/subworkflows/04_structural_annot.nf b/subworkflows/04_structural_annot.nf index 49841ef..505e633 100644 --- a/subworkflows/04_structural_annot.nf +++ b/subworkflows/04_structural_annot.nf @@ -17,8 +17,6 @@ workflow STEP_04_STRUCTURAL_ANNOT { MERGE_ANNOTATIONS(annotations_ch) - ch_software_versions = PRODIGAL.out.v_prodigal.first() - ch_software_versions = PRODIGAL.out.v_prodigal.first().mix( BARRNAP.out.v_barrnap.first(), TRNASCAN_SE.out.v_tRNAscan.first()) -- GitLab From fec33d4f185af9c237a11b6bdefdaa85a0075a8f Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Thu, 17 Nov 2022 16:29:29 +0100 Subject: [PATCH 18/31] Manage coassembly for step 03 (filtering) --- subworkflows/03_filtering.nf | 125 ++++++++++++++++++++++++----------- 1 
file changed, 88 insertions(+), 37 deletions(-) diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf index 3e7288b..1d4fdbe 100644 --- a/subworkflows/03_filtering.nf +++ b/subworkflows/03_filtering.nf @@ -13,10 +13,33 @@ workflow STEP_03_ASSEMBLY_FILTER { main: ch_chunk_assembly_for_filter = assemblies .splitFasta(by: 100000, file: true) - - ch_assembly_and_idxstats = ch_chunk_assembly_for_filter + if (params.coassembly){ + idxstats.map { meta, idxstats -> + [ meta.group, meta, idxstats] } + .groupTuple(by: [0]) + .map { group, metas, idxstats -> + def meta = [:] + meta.id = metas.group.unique().join() + meta.sample = metas.sample.join("_") + meta.flowcell = metas.flowcell.unique().join() + meta.group = metas.group.unique().join() + meta.assembly = metas.assembly.unique().join() + meta.type = metas.type.unique().join() + [ group, meta, idxstats] } + .set { ch_idxstats_tmp } + ch_chunk_assembly_for_filter.map { meta, assembly -> + [ meta.group, assembly ]} + .combine(ch_idxstats_tmp, by: 0) + .map { group, assembly, meta, idxstats -> + [ meta, assembly, idxstats ]} + .set { ch_assembly_and_idxstats } + } else { + ch_assembly_and_idxstats = ch_chunk_assembly_for_filter .combine(idxstats, by:0) + } + + CHUNK_ASSEMBLY_FILTER ( ch_assembly_and_idxstats, @@ -42,57 +65,85 @@ workflow STEP_03_ASSEMBLY_FILTER { discarded_contigs = MERGE_ASSEMBLY_FILTER.out.merged_discarded - // Differentiate sample with and without discarded_contigs - // samples with no discarded_contigs are not going to be processed to process - discarded_contigs_category = discarded_contigs.branch{ - without: it[1].isEmpty() - with: !it[1].isEmpty() - } + // Differentiate sample with and without discarded_contigs + // samples with no discarded_contigs are not going to be processed to process + discarded_contigs_category = discarded_contigs.branch{ + without: it[1].isEmpty() + with: !it[1].isEmpty() + } + + + samples_without_discarded_contigs = discarded_contigs_category.without.map{ it -> it[0]} + + if (params.coassembly){ + bam.map { meta, bam, bai -> + [ meta.group, meta, bam, bai ]} + .set { ch_bam_tmp } + samples_without_discarded_contigs.map { meta, assembly -> + [ meta.group, assembly ]} + .combine( ch_bam_tmp, by: 0) + .map { group, assembly, meta, bam, bai -> + [ meta, assembly, bam, bai ]} + .set { ch_bam_of_sample_without_discarded_contigs } + discarded_contigs_category.with.map { meta, discarded_contigs -> + [ meta.group, discarded_contigs ]} + .combine( ch_bam_tmp, by: 0) + .map { group, discarded_contigs, meta, bam, bai -> + [ meta, discarded_contigs, bam, bai ]} + .set { ch_discarded_contigs_and_bam } + // metadata retrieval from bam channel, for the join with discarded reads after "EXTRACT_PAIRED_READS" + ch_merged_selected.map { meta, assembly -> + [ meta.group, assembly ]} + .combine( ch_bam_tmp, by: 0) + .map { group, assembly, meta, bam, bai -> + [ meta, assembly ]} + .set { ch_merged_selected_meta } + } else { + ch_bam_of_sample_without_discarded_contigs = samples_without_discarded_contigs.join(bam) + ch_discarded_contigs_and_bam = discarded_contigs_category.with.join(bam) + ch_merged_selected_meta = ch_merged_selected + } - samples_without_discarded_contigs = discarded_contigs_category.without.map{ it -> it[0]} - ch_bam_of_sample_without_discarded_contigs = samples_without_discarded_contigs.join(bam) - - ch_discarded_contigs_and_bam = discarded_contigs_category.with.join(bam) - FILTER_BAM(ch_discarded_contigs_and_bam) + FILTER_BAM(ch_discarded_contigs_and_bam) - ch_discarded_bam = 
FILTER_BAM.out.discarded_contigs_bam - ch_selected_bam = FILTER_BAM.out.selected_contigs_bam + ch_discarded_bam = FILTER_BAM.out.discarded_contigs_bam + ch_selected_bam = FILTER_BAM.out.selected_contigs_bam - if ( params.type.toUpperCase() == "SR" ) { - - ch_discarded_reads = EXTRACT_PAIRED_READS(ch_discarded_bam) + if ( params.type.toUpperCase() == "SR" ) { + + ch_discarded_reads = EXTRACT_PAIRED_READS(ch_discarded_bam) - ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads) + ch_contigs_and_discarded_reads = ch_merged_selected_meta.join(ch_discarded_reads) - BWA_MEM(ch_contigs_and_discarded_reads, "03_filtering/03_2_reads_vs_filtered_assembly") - ch_bam_with_discarded_reads = BWA_MEM.out.bam + BWA_MEM(ch_contigs_and_discarded_reads, "03_filtering/03_2_reads_vs_filtered_assembly") + ch_bam_with_discarded_reads = BWA_MEM.out.bam - } else { - ch_discarded_reads = EXTRACT_SINGLE_READS(ch_discarded_bam) - ch_contigs_and_discarded_reads = ch_merged_selected.join(ch_discarded_reads) - MINIMAP2(ch_contigs_and_discarded_reads, "03_filtering/03_2_reads_vs_filtered_assembly") - ch_bam_with_discarded_reads = MINIMAP2.out.bam - - } + } else { + ch_discarded_reads = EXTRACT_SINGLE_READS(ch_discarded_bam) + ch_contigs_and_discarded_reads = ch_merged_selected_meta.join(ch_discarded_reads) + MINIMAP2(ch_contigs_and_discarded_reads, "03_filtering/03_2_reads_vs_filtered_assembly") + ch_bam_with_discarded_reads = MINIMAP2.out.bam + + } - // remove bai from ch_bam_with_discarded_reads and add bam of selected contigs - ch_bam_to_merge = ch_bam_with_discarded_reads.map{ it -> [it[0], it[1]]}.join(ch_selected_bam) + // remove bai from ch_bam_with_discarded_reads and add bam of selected contigs + ch_bam_to_merge = ch_bam_with_discarded_reads.map{ it -> [it[0], it[1]]}.join(ch_selected_bam) - ch_merged_bam = MERGE_BAM_FILES(ch_bam_to_merge) + ch_merged_bam = MERGE_BAM_FILES(ch_bam_to_merge) - ch_final_bam = ch_bam_of_sample_without_discarded_contigs.mix(ch_merged_bam) + ch_final_bam = ch_bam_of_sample_without_discarded_contigs.mix(ch_merged_bam) - GET_ALIGNMENT_METRICS(ch_final_bam, '03_filtering/03_2_reads_vs_filtered_assembly/') + GET_ALIGNMENT_METRICS(ch_final_bam, '03_filtering/03_2_reads_vs_filtered_assembly/') - ch_flagstat = GET_ALIGNMENT_METRICS.out.sam_flagstat - ch_coverage = GET_ALIGNMENT_METRICS.out.sam_coverage + ch_flagstat = GET_ALIGNMENT_METRICS.out.sam_flagstat + ch_coverage = GET_ALIGNMENT_METRICS.out.sam_coverage - QUAST( ch_merged_selected, "03_filtering/03_1_filtered_assembly/assembly_metric" ) - ch_quast_report = QUAST.out.report + QUAST( ch_merged_selected, "03_filtering/03_1_filtered_assembly/assembly_metric" ) + ch_quast_report = QUAST.out.report emit: selected_contigs = ch_merged_selected -- GitLab From e070a4e5f75ae308316821eabe75c5209196d94e Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Wed, 14 Dec 2022 11:28:03 +0100 Subject: [PATCH 19/31] Allows to give co-assembly as input --- subworkflows/02_assembly.nf | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf index 5bed864..532873a 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -20,6 +20,9 @@ workflow STEP_02_ASSEMBLY { ch_samtools_v = Channel.empty() + ch_assembly = assembly + ch_reads = reads + if (!has_assembly & has_flowcell ){ ////////////////// // Manage Flowcell @@ -49,9 +52,6 @@ workflow STEP_02_ASSEMBLY { .reads .mix(ch_reads_flowcell.single) 
.set{ch_reads} - - } else { - ch_reads = reads } if (params.coassembly){ @@ -72,17 +72,33 @@ workflow STEP_02_ASSEMBLY { return [meta, fastq ] }} .set { ch_reads_assembly } + + if (has_assembly){ + ch_assembly = assembly.map { meta, assembly -> + [ meta.group, meta, assembly] } + .groupTuple(by: [0]) + .map{ group, metas, assembly -> + def meta = [:] + meta.id = metas.group.unique().join() + meta.sample = metas.sample.join("_") + meta.flowcell = metas.flowcell.unique().join() + meta.group = metas.group.unique().join() + meta.assembly = metas.assembly.unique().join() + meta.type = metas.type.unique().join() + return [meta, assembly[0]] } + ch_assembly.view() + } + } else if (params.type.toUpperCase() == "SR") { ch_reads_assembly = ch_reads .map { meta, fastq -> return [meta, fastq[0], fastq[1]] } + } else { ch_reads_assembly = ch_reads } - if (has_assembly){ - ch_assembly = assembly - } else { + if (!has_assembly){ if (assembly_tool == 'metaspades') { METASPADES(ch_reads_assembly) ch_assembly = METASPADES.out.assembly -- GitLab From 3ebff09ef82e07de6a0e5e797e9143baf4bf5dbe Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Wed, 14 Dec 2022 11:30:55 +0100 Subject: [PATCH 20/31] Remove unused comma --- subworkflows/05_protein_alignment.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/05_protein_alignment.nf b/subworkflows/05_protein_alignment.nf index 56258c4..b178755 100644 --- a/subworkflows/05_protein_alignment.nf +++ b/subworkflows/05_protein_alignment.nf @@ -13,7 +13,7 @@ workflow STEP_05_PROTEIN_ALIGNMENT { DIAMOND ( prokka_faa, - diamond, + diamond ) ch_diamond_result = DIAMOND.out.m8 -- GitLab From e73afaff781c129b85a2155edb742b25b8d76c81 Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Wed, 14 Dec 2022 14:22:36 +0100 Subject: [PATCH 21/31] Manage coassembly for func annot and taxo affi --- modules/feature_counts.nf | 6 ++---- subworkflows/06_functionnal_annot.nf | 17 ++++++++++++++++- subworkflows/07_taxonomic_affi.nf | 15 ++++++++++++++- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/modules/feature_counts.nf b/modules/feature_counts.nf index 73ec38c..63eb79a 100644 --- a/modules/feature_counts.nf +++ b/modules/feature_counts.nf @@ -44,14 +44,12 @@ process QUANTIFICATION_TABLE { workflow QUANTIFICATION { take: - ch_gff // channel: [ val(meta), path(gff) ] - ch_bam // channel: [ val(meta), path(bam), path(bam_index) ] + ch_gff_and_bam // channel: [ val(meta), path(gff), path(bam), path(bam_index) ] ch_individual_clstr_table ch_global_clstr_table main: - ch_gff_and_bam = ch_gff.join(ch_bam, remainder: false) - + FEATURE_COUNTS(ch_gff_and_bam) ch_count_table = FEATURE_COUNTS.out.count_table.collect() ch_quant_report = FEATURE_COUNTS.out.summary diff --git a/subworkflows/06_functionnal_annot.nf b/subworkflows/06_functionnal_annot.nf index 5b86055..ad48129 100644 --- a/subworkflows/06_functionnal_annot.nf +++ b/subworkflows/06_functionnal_annot.nf @@ -28,7 +28,22 @@ workflow STEP_06_FUNC_ANNOT { ch_global_clstr_table = CD_HIT.out.global_clstr_table ch_cdhit_v = CD_HIT.out.v_cdhit - QUANTIFICATION ( gff, bam, ch_individual_clstr_table, ch_global_clstr_table) + if (params.coassembly){ + bam.map { meta, bam, bai -> + [ meta.group, meta, bam, bai ]} + .set { ch_bam_tmp } + + gff.map { meta, gff -> + [ meta.group, gff]} + .combine( ch_bam_tmp, by: 0) + .map { group, gff, meta, bam, bai -> + [ meta, gff, bam, bai ]} + .set { ch_gff_and_bam } + } else { + ch_gff_and_bam = gff.join(bam, remainder: 
false) + } + + QUANTIFICATION ( ch_gff_and_bam, ch_individual_clstr_table, ch_global_clstr_table) ch_quant_table = QUANTIFICATION.out.quantification_table ch_quant_report = QUANTIFICATION.out.quant_report ch_featurecounts_v = QUANTIFICATION.out.v_featurecounts diff --git a/subworkflows/07_taxonomic_affi.nf b/subworkflows/07_taxonomic_affi.nf index 99ead6c..4f1eb9b 100644 --- a/subworkflows/07_taxonomic_affi.nf +++ b/subworkflows/07_taxonomic_affi.nf @@ -8,7 +8,20 @@ workflow STEP_07_TAXO_AFFI { diamond_result // channel: [ val(meta), path(diamond_file) ] sam_coverage // channel: [ val(meta), path(samtools coverage) ] main: - ch_assign_taxo_input = diamond_result.join(sam_coverage, remainder: true) + if (params.coassembly){ + sam_coverage.map { meta, cov -> + [ meta.group, meta, cov ]} + .set { ch_sam_cov_tmp } + + diamond_result.map { meta, m8 -> + [ meta.group, m8]} + .combine( ch_sam_cov_tmp, by: 0) + .map { group, m8, meta, cov -> + [ meta, m8, cov ]} + .set { ch_assign_taxo_input } + } else { + ch_assign_taxo_input = diamond_result.join(sam_coverage, remainder: true) + } ASSIGN_TAXONOMY ( taxonomy, ch_assign_taxo_input ) -- GitLab From f99bf3434080ca66b72d28d8f17686726e5c0c19 Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Wed, 14 Dec 2022 14:30:09 +0100 Subject: [PATCH 22/31] Manage coassembly for the binning --- subworkflows/08_binning.nf | 60 +++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf index 59b9c7b..efa4f9d 100644 --- a/subworkflows/08_binning.nf +++ b/subworkflows/08_binning.nf @@ -52,21 +52,21 @@ workflow STEP_08_BINNING { if (params.binning_cross_alignment == 'all') { // combine assemblies with reads of all samples ch_reads_assembly = reads.combine(assembly) - .map{ meta_reads, reads, meta_assembly, assembly -> - if (meta_reads != meta_assembly){ - [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads] - } - } + .map{ meta_reads, reads, meta_assembly, assembly -> + if ((meta_reads.id != meta_assembly.id && !(params.coassembly)) || ((params.coassembly && meta_reads.group == meta_assembly.group))){ + [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads] + } + } } else if (params.binning_cross_alignment == 'group'){ // combine assemblies with reads of samples from same group ch_assembly_group = assembly.map{ meta, assembly -> [ meta.group, meta, assembly ] } ch_reads_assembly = reads.map{ meta, reads -> [ meta.group, meta, reads ] } - .combine(ch_assembly_group, by: 0) - .map{ group, meta_reads, reads, meta_assembly, assembly -> - if (meta_reads != meta_assembly){ - [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads] - } - } + .combine(ch_assembly_group, by: 0) + .map{ group, meta_reads, reads, meta_assembly, assembly -> + if (meta_reads != meta_assembly){ + [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads] + } + } } // cross alignment if 
(params.type == 'SR') { @@ -78,21 +78,39 @@ workflow STEP_08_BINNING { } // formatting channel ch_bam = ch_bam_cross_alignment.mix(bam) - .map { meta, bam, bai -> [ meta.sample, meta, bam, bai ] } - .groupTuple(by: [0]) - .map { sample,metas, bam, bai -> - [ metas.min { it.id.size() }, bam, bai ] - } + .map { meta, bam, bai -> [ meta.sample, meta, bam, bai ] } + .groupTuple(by: [0]) + .map { sample,metas, bam, bai -> + [ metas.min { it.id.size() }, bam, bai ] + } } else { ch_bam = bam } + if (params.coassembly){ + ch_bam.map { meta, bam, bai -> + [ meta.group, meta, bam, bai ]} + .set { ch_bam_tmp } + + assembly.map { meta, assembly -> + [ meta.group, assembly ]} + .combine( ch_bam_tmp, by: 0) + .map { group, assembly, meta, bam, bai -> + [ meta, assembly, bam, bai]} + .tap { ch_assembly_bam } + .map { meta, assembly, bam, bai -> + [ meta, assembly ]} + .tap { ch_assembly } + } else { + ch_assembly_bam = assembly.join(ch_bam) + ch_assembly = assembly + } /////////// /// BINNING /////////// ch_depth = GENERATE_DEPTH_FILES(ch_bam) - ch_assembly_depth = assembly.join(ch_depth) + ch_assembly_depth = ch_assembly.join(ch_depth) METABAT2(ch_assembly_depth) ch_metabat_bins = METABAT2.out.bins.filter{ t -> t[1].list().size()} @@ -102,8 +120,6 @@ workflow STEP_08_BINNING { ch_maxbin_bins = MAXBIN2.out.bins.filter{ t -> t[1].list().size()} ch_maxbin_v = MAXBIN2.out.v_maxbin - ch_assembly_bam = assembly.join(ch_bam) - CONCOCT(ch_assembly_bam) ch_concoct_bins = CONCOCT.out.bins.filter{ t -> t[1].list().size()} ch_concoct_v = CONCOCT.out.v_concoct @@ -120,7 +136,8 @@ workflow STEP_08_BINNING { (it[2] != null && it[3] != null) single: true } - + + //bins set with bins form at least 2 binners ch_bins_multiple = ch_bins_set.multiple.map{ meta, bins1, bins2, bins3 -> if ( bins1 == null ) { return [meta, bins2, bins3, bins1] @@ -131,6 +148,7 @@ workflow STEP_08_BINNING { } } + //bins set with bins form only 1 binner ch_bins_single = ch_bins_set.single.map{ meta, bins1, bins2, bins3 -> if ( bins1 != null ) { return [meta, bins1] @@ -160,7 +178,7 @@ workflow STEP_08_BINNING { .map { meta, file -> file} .collect() - ch_bins_assembly = METAWRAP_REFINMENT.out.bins.join(assembly) + ch_bins_assembly = METAWRAP_REFINMENT.out.bins.join(ch_assembly) UNBINNED_CONTIGS(ch_bins_assembly) -- GitLab From 7fd9ec8d10fea1daf623d00387d7372210baf6c1 Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Tue, 20 Dec 2022 08:49:20 +0100 Subject: [PATCH 23/31] Fix import MERGE_FASTQ for flowcell --- main.nf | 2 +- subworkflows/02_assembly.nf | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 62dc6d4..634a355 100644 --- a/main.nf +++ b/main.nf @@ -23,7 +23,7 @@ include { STEP_08_BINNING as S08_BINNING } from './subworkflows/08_binning' include { GET_SOFTWARE_VERSIONS } from './modules/get_software_versions' include { MULTIQC } from './modules/multiqc' -include { MERGE_FASTQ } from './modules/merge_fastq.nf' + /* diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf index 532873a..01291d9 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -3,6 +3,7 @@ include { QUAST as ASSEMBLY_QUAST} from '../modules/metaquast' include { READS_DEDUPLICATION } from '../modules/reads_deduplication' include { MINIMAP2 } from '../modules/read_alignment' include { GET_ALIGNMENT_METRICS } from '../modules/read_alignment_manipulation' +include { MERGE_FASTQ } from '../modules/merge_fastq.nf' workflow STEP_02_ASSEMBLY { take: -- GitLab From 
332a13f167cf215ed217d9444309b51522697068 Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Tue, 20 Dec 2022 16:29:38 +0100 Subject: [PATCH 24/31] Fix channel reads on main.nf --- main.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 634a355..f2455ef 100644 --- a/main.nf +++ b/main.nf @@ -244,6 +244,7 @@ workflow { def meta = [:] meta.id = item.sample if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell} + if (item.group !=null) {meta.id = meta.id+"_"+item.group} meta.sample = item.sample meta.flowcell = item.flowcell meta.group = item.group @@ -264,6 +265,7 @@ workflow { def meta = [:] meta.id = item.sample if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell} + if (item.group !=null) {meta.id = meta.id+"_"+item.group} meta.sample = item.sample meta.flowcell = item.flowcell meta.group = item.group @@ -344,7 +346,7 @@ workflow { ch_host_index, ch_kaiju_db ) - ch_preprocessed_reads = S01_CLEAN_QC.out.preprocessed_reads + ch_reads = S01_CLEAN_QC.out.preprocessed_reads ch_cutadapt_report = S01_CLEAN_QC.out.cutadapt_report ch_sickle_report = S01_CLEAN_QC.out.sickle_report @@ -355,13 +357,11 @@ workflow { ch_kaiju_report = S01_CLEAN_QC.out.kaiju_report ch_software_versions_S01 = S01_CLEAN_QC.out.software_versions - } else { - ch_preprocessed_reads = ch_reads } if ( !params.stop_at_clean ) { - S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool, has_flowcell ) + S02_ASSEMBLY ( ch_reads, ch_assembly, has_assembly, assembly_tool, has_flowcell ) ch_assembly = S02_ASSEMBLY.out.assembly ch_reads = S02_ASSEMBLY.out.reads ch_bam = S02_ASSEMBLY.out.bam @@ -384,7 +384,7 @@ workflow { S03_FILTERING ( ch_assembly, - ch_preprocessed_reads, + ch_reads, ch_idxstats, ch_bam, ch_min_contigs_cpm, -- GitLab From eb938df9fdfc57fbd34250d81a74e267dc17d88f Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Tue, 20 Dec 2022 16:36:44 +0100 Subject: [PATCH 25/31] homogenize bam names between SR and HiFi --- modules/read_alignment.nf | 16 ++++++++-------- subworkflows/03_filtering.nf | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/modules/read_alignment.nf b/modules/read_alignment.nf index 881f6c8..5183c8b 100644 --- a/modules/read_alignment.nf +++ b/modules/read_alignment.nf @@ -1,6 +1,6 @@ process BWA_MEM { tag "${meta.id}" - publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy', pattern:"*sort*" + publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy', pattern:"*bam*" input: tuple val(meta), path(fna), path(reads) @@ -8,16 +8,16 @@ process BWA_MEM { output: - tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam + tuple val(meta), path("${meta.id}.bam"), path("${meta.id}.bam.bai"), emit: bam path "v_bwa.txt", emit: v_bwa_mem2 path "v_samtools.txt", emit: v_samtools script: """ bwa-mem2 index ${fna} -p ${fna} - bwa-mem2 mem -t ${task.cpus} ${fna} ${reads[0]} ${reads[1]} | samtools view -@ ${task.cpus} -bS - | samtools sort -@ ${task.cpus} - -o ${meta.id}.sort.bam + bwa-mem2 mem -t ${task.cpus} ${fna} ${reads[0]} ${reads[1]} | samtools view -@ ${task.cpus} -bS - | samtools sort -@ ${task.cpus} - -o ${meta.id}.bam - samtools index -@ ${task.cpus} ${meta.id}.sort.bam + samtools index -@ ${task.cpus} ${meta.id}.bam bwa-mem2 version > v_bwa.txt samtools --version &> v_samtools.txt @@ -27,23 +27,23 @@ process BWA_MEM { process MINIMAP2 { tag "${meta.id}" label 'MINIMAP2' - 
publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy', pattern:"*sort*" + publishDir "${params.outdir}/$publishDir_path/${meta.id}", mode: 'copy', pattern:"*bam*" input: tuple val(meta), path(fna), path(reads) val(publishDir_path) output: - tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam + tuple val(meta), path("${meta.id}.bam"), path("${meta.id}.bam.bai"), emit: bam path "v_minimap2.txt", emit: v_minimap2 path "v_samtools.txt", emit: v_samtools script: """ # align reads to contigs, keep only primary aln and sort resulting bam - minimap2 -t ${task.cpus} -ax map-hifi $fna $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${meta.id}.sort.bam + minimap2 -t ${task.cpus} -ax map-hifi $fna $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${meta.id}.bam - samtools index ${meta.id}.sort.bam -@ ${task.cpus} + samtools index ${meta.id}.bam -@ ${task.cpus} samtools --version &> v_samtools.txt minimap2 --version &> v_minimap2.txt diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf index 5fdd04c..c71ea08 100644 --- a/subworkflows/03_filtering.nf +++ b/subworkflows/03_filtering.nf @@ -122,12 +122,12 @@ workflow STEP_03_ASSEMBLY_FILTER { result_path_dir.mkdirs() ch_bam_unchanged_by_filtering.map { meta, bam, bai -> - {file("${result_path_dir}/${meta.id}/").mkdir() - file("${params.outdir}/${unfiltered_assembly_bam_outdir}/${meta.id}/${meta.id}.sort.bam") - .mklink("${bam}", overwrite:true) - file("${params.outdir}/${unfiltered_assembly_bam_outdir}/${meta.id}/${meta.id}.sort.bam.bai") - .mklink("${bai}", overwrite:true) - } + { file("${result_path_dir}/${meta.id}/").mkdir() + file("${params.outdir}/${unfiltered_assembly_bam_outdir}/${meta.id}/${meta.id}.bam") + .mklink("${result_path_dir}/${meta.id}/${meta.id}.bam", overwrite:true) + file("${params.outdir}/${unfiltered_assembly_bam_outdir}/${meta.id}/${meta.id}.bam.bai") + .mklink("${result_path_dir}/${meta.id}/${meta.id}.bam.bai", overwrite:true) + } } if ( params.type.toUpperCase() == "SR" ) { -- GitLab From f132835af2e57f727c8d933819eeb7df4d37afc2 Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Tue, 20 Dec 2022 17:05:51 +0100 Subject: [PATCH 26/31] Add metabat2_seed parameters --- docs/usage.md | 2 ++ modules/binning.nf | 2 +- nextflow.config | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index ed48bf6..87e6e2b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -352,6 +352,8 @@ For example check the version of software in the yaml file and the singularity r * `--gtdbtk_bank`: indicates path to the GTDBTK database. +* `--metabat2_seed`: Set the seed for metabat2, for exact reproducibility of metabat2 (default: 0 (random seed)) + * `--binning_cross_alignment ["all","group","individual"]`: defines mapping strategy to compute co-abundances for binning. `all` means that each samples will be mapped against every assembly, `group` means that all sample from a group will be mapped against every assembly of the group, `individual` means that each sample will only be mapped against his assembly. 
Default `individual` diff --git a/modules/binning.nf b/modules/binning.nf index c90306a..f9012d9 100644 --- a/modules/binning.nf +++ b/modules/binning.nf @@ -30,7 +30,7 @@ process METABAT2 { script: """ mkdir -p metabat2/bins/ - metabat2 --inFile $fna --abdFile $depth --outFile metabat2/bins/${meta.id}_metabat2 --numThreads ${task.cpus} + metabat2 --inFile $fna --abdFile $depth --outFile metabat2/bins/${meta.id}_metabat2 --numThreads ${task.cpus} --seed ${params.metabat2_seed} echo \$(metabat2 -h 2>&1) > v_metabat2.txt """ } diff --git a/nextflow.config b/nextflow.config index 22d7fa8..b0f4b71 100644 --- a/nextflow.config +++ b/nextflow.config @@ -59,6 +59,7 @@ params { drep_threshold = 0.95 min_completeness = 50 max_contamination = 10 + metabat2_seed = 0 // Ressources. kaiju_db_dir = false -- GitLab From 688d47bc65f36d562a2d198b223d827db4492eef Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Wed, 4 Jan 2023 15:50:47 +0100 Subject: [PATCH 27/31] Fix coassembly on step 03 --- main.nf | 2 -- subworkflows/03_filtering.nf | 26 ++++++++++---------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/main.nf b/main.nf index f2455ef..3b86218 100644 --- a/main.nf +++ b/main.nf @@ -244,7 +244,6 @@ workflow { def meta = [:] meta.id = item.sample if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell} - if (item.group !=null) {meta.id = meta.id+"_"+item.group} meta.sample = item.sample meta.flowcell = item.flowcell meta.group = item.group @@ -265,7 +264,6 @@ workflow { def meta = [:] meta.id = item.sample if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell} - if (item.group !=null) {meta.id = meta.id+"_"+item.group} meta.sample = item.sample meta.flowcell = item.flowcell meta.group = item.group diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf index c71ea08..2aa0f79 100644 --- a/subworkflows/03_filtering.nf +++ b/subworkflows/03_filtering.nf @@ -88,26 +88,20 @@ workflow STEP_03_ASSEMBLY_FILTER { if (params.coassembly){ - bam.map { meta, bam, bai -> - [ meta.group, meta, bam, bai ]} - .set { ch_bam_tmp } discarded_contigs_category.without.map { meta, discarded_empty -> [ meta.group ]} - .combine( ch_bam_tmp, by: 0) + .combine( bam.map { meta, bam, bai -> + [ meta.group, meta, bam, bai ]}, by: 0) .map{ group, meta, bam, bai -> - [ meta, bam, bai ]} + [ meta, bam, bai ]} .set{ ch_bam_unchanged_by_filtering } - - reads{ meta, reads -> - [ meta.group, meta, reads ]} - .set { ch_reads_tmp } - - ch_selected_contigs_and_reads = discarded_contigs_category.with.map{meta, discarded_contigs -> meta.group} - .join(ch_merged_selected) - .combine(ch_reads_tmp, by: 0) - .map{ group, meta, assembly, reads -> - [ meta, assembly, reads ]} - .set{ ch_bam_unchanged_by_filtering } + ch_selected_contigs_and_reads= discarded_contigs_category.with.map {meta, discarded_contigs -> meta.group} + .join( ch_merged_selected.map { meta, contigs -> + [meta.group, meta, contigs]}) + .combine( reads.map{ meta, reads -> + [ meta.group, meta, reads ]}, by: 0) + .map{ group, meta_contigs, contigs, meta_reads, reads -> + [ meta_reads, contigs, reads ]} } else { ch_bam_unchanged_by_filtering = discarded_contigs_category.without.map{ it -> it[0]} -- GitLab From 83286cb378f5720df276d867b0689612eb5abf78 Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Thu, 5 Jan 2023 16:31:26 +0100 Subject: [PATCH 28/31] Fix binning and cross alignment for coassembly --- modules/binning.nf | 2 +- subworkflows/08_binning.nf | 28 
++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/modules/binning.nf b/modules/binning.nf index f9012d9..431fb01 100644 --- a/modules/binning.nf +++ b/modules/binning.nf @@ -130,7 +130,7 @@ process METAWRAP_REFINMENT { path "v_metawrap.txt", emit: v_metawrap script: - bins_flag = (bins3 != null) ? "-A $bins1 -B $bins2 -C $bins3" : "-A $bins2 -B $bins2 " + bins_flag = (bins3 != null) ? "-A $bins1 -B $bins2 -C $bins3" : "-A $bins1 -B $bins2 " """ echo "metawrap 1.3_modified" > v_metawrap.txt diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf index efa4f9d..32b9e32 100644 --- a/subworkflows/08_binning.nf +++ b/subworkflows/08_binning.nf @@ -53,7 +53,7 @@ workflow STEP_08_BINNING { // combine assemblies with reads of all samples ch_reads_assembly = reads.combine(assembly) .map{ meta_reads, reads, meta_assembly, assembly -> - if ((meta_reads.id != meta_assembly.id && !(params.coassembly)) || ((params.coassembly && meta_reads.group == meta_assembly.group))){ + if (((meta_reads.id != meta_assembly.id) && !(params.coassembly)) || (params.coassembly && (meta_reads.group != meta_assembly.group))){ [[id:meta_reads.id+"_"+meta_assembly.id, sample:meta_assembly.sample, flowcell:meta_assembly.flowcell, group:meta_assembly.group, assembly:meta_assembly.assembly, type:meta_assembly.type], assembly, reads] } } @@ -78,7 +78,9 @@ workflow STEP_08_BINNING { } // formatting channel ch_bam = ch_bam_cross_alignment.mix(bam) - .map { meta, bam, bai -> [ meta.sample, meta, bam, bai ] } + .map { meta, bam, bai -> + if (params.coassembly){[ meta.group, meta, bam, bai ]} + else {[ meta.sample, meta, bam, bai ]}} .groupTuple(by: [0]) .map { sample,metas, bam, bai -> [ metas.min { it.id.size() }, bam, bai ] @@ -88,19 +90,29 @@ workflow STEP_08_BINNING { } if (params.coassembly){ - ch_bam.map { meta, bam, bai -> - [ meta.group, meta, bam, bai ]} - .set { ch_bam_tmp } - + if (params.binning_cross_alignment == 'all') { + ch_bam.map { meta, bam, bai -> [ meta.group, bam, bai ] } + .set { ch_bam_tmp } + } else { + ch_bam.map { meta, bam, bai -> [ meta.group, meta, bam, bai ] } + .groupTuple(by: [0]) + .map { group, metas, bam, bai -> + [ group, bam, bai ]} + .set { ch_bam_tmp } + } assembly.map { meta, assembly -> - [ meta.group, assembly ]} + [ meta.group, meta, assembly ]} .combine( ch_bam_tmp, by: 0) - .map { group, assembly, meta, bam, bai -> + .map { group, meta, assembly, bam, bai -> [ meta, assembly, bam, bai]} .tap { ch_assembly_bam } .map { meta, assembly, bam, bai -> [ meta, assembly ]} .tap { ch_assembly } + + ch_assembly_bam.map{ meta, assembly, bam, bai -> + [ meta, bam, bai ]} + .set{ ch_bam} } else { ch_assembly_bam = assembly.join(ch_bam) ch_assembly = assembly -- GitLab From bae314156695d687339e122040c1599affde5204 Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Thu, 5 Jan 2023 16:36:13 +0100 Subject: [PATCH 29/31] Remove channel view --- subworkflows/02_assembly.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf index 01291d9..46a0a6e 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -87,7 +87,6 @@ workflow STEP_02_ASSEMBLY { meta.assembly = metas.assembly.unique().join() meta.type = metas.type.unique().join() return [meta, assembly[0]] } - ch_assembly.view() } } else if (params.type.toUpperCase() == "SR") { -- GitLab From b9436266d1af0da8502e2d6d0c9ba010966e8a46 Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> 
Date: Thu, 5 Jan 2023 17:16:45 +0100 Subject: [PATCH 30/31] Doc + test param coassembly --- docs/usage.md | 3 ++- main.nf | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 87e6e2b..a0907e2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -118,7 +118,7 @@ For HiFi the fastq2 column is not needed > ``` > name_sample2,number_flowcell,path_sample2_R1.fastq.gz,path_sample2_R2.fastq.gz > ``` - * **If you want to perfom cross alignment for binning on groups** : + * **If you want to perform coassembly or cross alignment for binning on groups** : > ``` > sample,group,fastq_1,fastq_2 > name_sample1,group_name,path_sample1_flowcell1_R1.fastq.gz,path_sample1_flowcell1_R2.fastq.gz @@ -276,6 +276,7 @@ No parameter available for this substep. * `--assembly` allows to indicate the assembly tool. For short reads: `["metaspades" or "megahit"]`: Default: `metaspades`. For HiFi reads: `["hifiasm-meta", "metaflye"]`. Default: `hifiasm-meta`. +* `--coassembly` assembles together the samples labeled with the same group in the samplesheet, generating one assembly per group. To co-assemble all of your samples together, give every sample the same group in the samplesheet. **WARNING** With coassembly, you can't use `--binning_cross_alignment 'group'`: one binning is generated for each co-assembled group and every sample of the group is automatically mapped against it. However, `--binning_cross_alignment 'all'` can be used to cross-align every sample with every group. **WARNING 4:** For short reads, the user can choose between `metaspades` or `megahit` for `--assembly` parameter. The choice can be based on CPUs and memory availability: `metaspades` needs more CPUs and memory than `megahit` but our tests showed that assembly metrics are better for `metaspades` than `megahit`.For PacBio HiFi reads, the user can choose between `hifiasm-meta` or `metaflye`. diff --git a/main.nf b/main.nf index 3b86218..6b2e1da 100644 --- a/main.nf +++ b/main.nf @@ -65,6 +65,7 @@ include { MULTIQC } from './modules/multiqc' --stop_at_assembly Stop the pipeline at this step --assembly Indicate the assembly tool for short reads ["metaspades" or "megahit" ]. Default: "metaspades". or for HiFi reads ["hifiasm-meta", "metaflye"]. Default: "hifiasm-meta". + --coassembly Assemble together samples labeled with the same group in the samplesheet S03_FILTERING options: --stop_at_filtering Stop the pipeline at this step @@ -213,6 +214,10 @@ workflow { exit 1, "You must specify --skip_binning or specify a GTDB-TK bank with --gtdbtk_bank" } + if ( params.coassembly && params.binning_cross_alignment == 'group'){ + exit 1, "--binning_cross_alignment 'group' must not be used with --coassembly." + } + //////////// @@ -228,8 +233,8 @@ workflow { if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) { exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint."
} - if (params.binning_cross_alignment == 'group' && row.group == null){ - exit 1, "You must specify groups in the samplesheet if you want to use --binning_cross_alignment 'group'" + if ((params.binning_cross_alignment == 'group' || params.coassembly) && row.group == null){ + exit 1, "You must specify groups in the samplesheet if you want to use --binning_cross_alignment 'group' or --coassembly" } ["sample":row.sample, "flowcell":row.flowcell, -- GitLab From dd9d1a14c9e2a79715fde81a393dda6403347ed8 Mon Sep 17 00:00:00 2001 From: "maina.vienne" <maina.vienne@inrae.fr> Date: Tue, 10 Jan 2023 16:09:34 +0100 Subject: [PATCH 31/31] Fix cd_hit_produce_table_clstr.py input step06 --- conf/functional_test.config | 2 +- modules/cd_hit.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/functional_test.config b/conf/functional_test.config index b1fa154..6268237 100644 --- a/conf/functional_test.config +++ b/conf/functional_test.config @@ -5,7 +5,7 @@ singularity.enabled = true singularity.autoMounts = true process { - container = "/work/project/plateforme/metaG/functional_test/singularity_img/metagwgs.sif" + container = "/work/project/plateforme/metaG/functional_test/singularity_img/dev_v2.4/metagwgs.sif" withLabel: BINNING { container = "/work/project/plateforme/metaG/functional_test/singularity_img/binning.sif" } diff --git a/modules/cd_hit.nf b/modules/cd_hit.nf index 0a1b1f5..b341142 100644 --- a/modules/cd_hit.nf +++ b/modules/cd_hit.nf @@ -15,7 +15,7 @@ process INDIVIDUAL_CD_HIT { script: """ cd-hit-est -c ${pct_id} -i ${ffn} -o ${meta.id}.cd-hit-est.${pct_id}.fasta -T ${task.cpus} -M ${task.mem} -d 150 - cat ${meta.id}.cd-hit-est.${pct_id}.fasta.clstr | cd_hit_produce_table_clstr.py > ${meta.id}.cd-hit-est.${pct_id}.table_cluster_contigs.txt + cd_hit_produce_table_clstr.py -i ${meta.id}.cd-hit-est.${pct_id}.fasta.clstr -o ${meta.id}.cd-hit-est.${pct_id}.table_cluster_contigs.txt echo \$(cd-hit -h 2>&1) > v_cdhit.txt """ } -- GitLab
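A note on the recurring channel pattern in this series: the co-assembly changes in steps 02, 03, 06, 07 and 08 all key a channel by meta.group, gather it with groupTuple, then collapse the per-sample metadata maps into a single group-level meta. The following minimal Nextflow sketch shows that collapse, assuming a ch_reads channel of [meta, fastq] tuples; the channel name and the mergeGroupMeta helper are illustrative and not part of the patches themselves.

// Collapse the per-sample metas of one group into a single meta,
// mirroring the map{} bodies added in subworkflows/02_assembly.nf.
def mergeGroupMeta = { metas ->
    def meta = [:]
    meta.id       = metas.group.unique().join()    // the group name becomes the id
    meta.sample   = metas.sample.join("_")         // concatenated sample names
    meta.flowcell = metas.flowcell.unique().join()
    meta.group    = metas.group.unique().join()
    meta.assembly = metas.assembly.unique().join()
    meta.type     = metas.type.unique().join()
    return meta
}

ch_reads_by_group = ch_reads
    .map { meta, fastq -> [ meta.group, meta, fastq ] }   // key by group
    .groupTuple(by: [0])                                  // one emission per group
    .map { group, metas, fastq -> [ mergeGroupMeta(metas), fastq ] }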
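The complementary pattern, used wherever the single group-level assembly must be re-paired with per-sample items (reads, BAM files, idxstats), is a combine on the group key followed by a map that drops the key. A sketch under the same assumptions, with ch_assembly carrying [meta, fasta] tuples:

// Pair the one co-assembly of a group with every sample of that group,
// mirroring the combine(by: 0) blocks added in steps 02, 03, 06 and 07.
ch_reads_tmp = ch_reads.map { meta, fastq -> [ meta.group, meta, fastq ] }

ch_contigs_and_reads = ch_assembly
    .map { meta, fasta -> [ meta.group, fasta ] }                 // key assembly by group
    .combine( ch_reads_tmp, by: 0 )                               // join on the group key
    .map { group, fasta, meta, fastq -> [ meta, fasta, fastq ] }  // drop the key

Keeping the per-sample meta in the output is what lets downstream steps publish results per sample even though the contigs are shared by the whole group.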
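Lastly, the predicate corrected in [PATCH 28/31] decides which read/assembly pairs need an extra cross-alignment before binning. It is restated below as a standalone closure for clarity; the name needsCrossAlignment is invented for illustration, while the field accesses mirror the patch.

// A pair needs cross-alignment only when it is not already covered by the
// native alignments: without co-assembly each sample is mapped to its own
// assembly, so pairs with identical ids are skipped; with co-assembly each
// sample is already mapped to its group's co-assembly, so only assemblies
// of other groups are aligned.
def needsCrossAlignment = { metaReads, metaAssembly, coassembly ->
    coassembly ? metaReads.group != metaAssembly.group
               : metaReads.id    != metaAssembly.id
}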