diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 4338c5bbb6a2dff99cf9f1825839e43ddf247576..47abb58bbbe6d5a9b89d43f00d6bc833eb32c1d0 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -15,6 +15,24 @@ show_analysis_time: False ## Number formatting thousandsSep_format: " " +## General Statistics table +table_columns_visible: + Duplicats: False + ContaminationSearch - RNA: True + samtools: False + ReadsStats: + percent_duplicates: False + percent_fails: False + AlignmentStat: + avg_gc: False + 1_x_pc: True + 5_x_pc: True + 10_x_pc: True + 50_x_pc: False + mapped_reads: False + total_reads: True + general_error_rate: True + ## Sample name formatting extra_fn_clean_exts: - "_filtered" @@ -33,18 +51,13 @@ report_section_order: order: -1000 summary: order: -1001 - + module_order: - fastqc: name: "ReadsStats" #info: "Analysis performed with FastQC, which is a quality control tool for high throughput sequence data, written by Simon Andrews at the Babraham Institute in Cambridge" href: "http://www.bioinformatics.babraham.ac.uk/projects/fastqc/" target: "FastQC" - - qualimap: - name: "AlignmentStat" - #info: "Analysis performed with QualiMap" - href: "http://qualimap.bioinfo.cipf.es/" - target: "QualiMap" - samtools: - fastp: name: "Duplicats" @@ -54,6 +67,19 @@ module_order: name: "ContaminationSearch" #info: "This section shows the module with different files" target: "FastQ-Screen" + - sortmerna: + name: "ContaminationSearch - rRNA" + target: "SortMeRNA" + - qualimap: + name: "AlignmentStat" + #info: "Analysis performed with QualiMap" + href: "http://qualimap.bioinfo.cipf.es/" + target: "QualiMap" + - salmon: + name: "AlignmentStat" + href: "https://combine-lab.github.io/salmon/" + target: "Salmon" + # Pattern sp: diff --git a/bin/DTM/data_prepare.pl b/bin/DTM/data_prepare.pl new file mode 100644 index 0000000000000000000000000000000000000000..5b5bd0c6e48e336bb72e94bd5a428616a51f7be6 --- /dev/null +++ b/bin/DTM/data_prepare.pl @@ -0,0 +1,310 @@ +#!/usr/bin/perl -w + +binmode STDIN, ':encoding(UTF-8)'; +binmode STDOUT, ':encoding(UTF-8)'; +binmode STDERR, ':encoding(UTF-8)'; + +=pod + +=head1 NAME + + data_prepare.pl + +=head1 DESCRIPTION + + Prend en entrée un dossier de sortie de demultiplexage + Et prepare les donnees pour le lancement de NextFlow + +=head1 SYNOPSIS + + data_prepare.pl -h | -i + +=head1 OPTIONS + + -i|s : Chemin vers le dossier a preparer -> Obligatoire + +=head1 EXEMPLES + + perl extractReads.pl -i /illumina/MiSeq/analyzed/230228_M01945_0423_000000000-GFKVR + +=head1 DEPENDENCIES + + - Web service permettant la recuperation des adresses mails a partir de l'id + http://vm-esitoul.toulouse.inra.fr:8080/jaxws-plaGeDataFileParser/adminWS + +=head1 AUTHOR + + Jules Sabban pour Plateforme genomique Toulouse (get-plage.bioinfo@genotoul.fr) + +=encoding UTF8 + +=cut + +############################################################################################################################# +# # +# LIBRAIRIES # +# # +############################################################################################################################# +use strict; +use File::chdir; +use Getopt::Long; +use SOAP::Lite; +use File::Copy "cp"; +use Switch; +use File::Basename; +use utf8; + +############################################################################################################################# +# # +# INITIALISATION # +# # +############################################################################################################################# +my $input = ''; +my $mail_dev = 'jules.sabban@inrae.fr'; + +GetOptions( + "i=s" => \$input +); + +if ($input eq '') { + print STDERR ("USAGE : data_prepare.pl -i <dir_path>"); + die; +} +################################################################## +# +# MAIN +# +################################################################## +MAIN: +{ + my $checkTest = $input =~ m/.*data_test.*/ ? 1 : 0; + + my $wf_illumina_nf_folder_path = '/save/sbsuser/scripts-ngs/wf-Illumina-nf_Current'; + if ($checkTest){ + $wf_illumina_nf_folder_path = '/home/sbsuser/work/test/jules/VisualStudioSources/wf-illumina-nf'; + } + + my @RunDir = split("/", $input); + my @RunInfo = (); + @RunInfo = split("_", $RunDir[$#RunDir]); + my $barcodeFlowcell; + if ($RunInfo[3] =~ m/000000000-/){ + my @FCBarcode = split('-', $RunInfo[3]); + $barcodeFlowcell = $FCBarcode[$#FCBarcode]; + } else { + $barcodeFlowcell = $RunInfo[3]; + } + my $lane = '1'; + if ($RunInfo[4] =~ m/Lane(\d)/){ + $lane = $1; + } + + chdir $input or (print STDERR ("Impossible de se déplacer dans $input\n") and die); + + print ("Recherche de la jFlow\n"); + my $regexpSampleSheetjFlow = '^[0-9]{8}_.*_jFlow.*\.toSubmit$'; + my $file_jflow = `ls -t | egrep $regexpSampleSheetjFlow | head -1`; $? and (print STDERR("[Erreur]Récup de la derniere jflow\n") and die); + chomp($file_jflow); + print ("\tjFlow trouvée : $file_jflow\n"); + + print ("\tParametrage de la commande Nextflow\n"); + my $submitted_jflow = $file_jflow; + my $nb_jflow_lines = `wc -l < $submitted_jflow`; + if ( $nb_jflow_lines == 1 ) { + # ouverture de la jFlow + print ("Ouverture en lecture de : $submitted_jflow\n"); + open(SJF,"< $submitted_jflow") or (print STDERR("Impossible d'ouvrir le fichier jflow $submitted_jflow\n") and die); + my $line = <SJF>; + my ($dataNature) = $line =~ m/--data-nature \'(\S+)\'/; + if ($dataNature eq 'ReadyToLoad') { + my @jflow_components = split(' ', $line); + my $jflow_pipeline = $jflow_components[2]; + switch($jflow_pipeline) { + case 'illumina_rnaseq' { $dataNature = 'RNA-Stranded'} + case 'illumina_qc' { $dataNature = 'DNA'} + case 'illumina_diversity_qc' { $dataNature = 'DNA'} + case 'methylseq' { $dataNature = 'methylated'} + case 'illumina_10X_qc' { $dataNature = '10X'} + else { $dataNature = 'unknown'} + } + } + print ("Analyses NextFlow lancées sur données de type : $dataNature.\n"); + my ($project) = $line =~ m/--project-name (\S+)/; + my (@samples) = $line =~ m/select-sample-id=(\S+)/g; + my ($isMultiplex) = $#samples == 0 ? 'false' : 'true'; + my ($genomeRef) = $line =~ m/--reference-genome (\S+)/; + my ($transcriptomeRef) = $line =~ m/--reference-transcriptome (\S+)/; + my ($runName) = $line =~ m/--name \'(\S+)\'/; + my ($sequencerLong) = $line =~ m/--sequencer \'(\S+)\'/; + my ($sequencer, $sequencerID) = split('-', $sequencerLong); + my ($date) = $line =~ m/--date \'(\S+)\'/; + my ($fcID, $laneNumber, $demuxUniqueness) = $RunDir[$#RunDir] =~ m/\d{6}_\S{6}_\d{4}_(\S+)_Lane(\d)_(.*)$/; + my ($fcType) = $line =~ m/--type \'(.{0,25}Lane.?\d{1})\'/; + my ($description) = $line =~ m/--description \'(.+)\'/; + my ($species) = $line =~ m/--species \'(\S+)\'/; + my ($minOverlap) = $line =~ m/--min_overlap (\d+)/; + my ($maxOverlap) = $line =~ m/--max_overlap (\d+)/; + + # Qui sera le destinataire principal du mail nextflow ? + my $nf_mailRecipient = ""; + eval { + $nf_mailRecipient = get_mail_by_ident( get_operator_from_sample_sheet( $input."/SampleSheet.csv" ) ); + }; + { + if ($@) { + print ("\t$@\n") ; + $nf_mailRecipient = $mail_dev; + } + } + print ("Le destinataire principal des mails NF est : $nf_mailRecipient\n"); + + # Avoid colision between STAR versions + my $force_indexing = 0; + if ($dataNature eq 'RNA-Stranded' && defined($genomeRef)) { + print "Verification de la version de STAR utilisée lors de l'indexation du génome de ref.\n"; + my $genomeDirName = dirname($genomeRef); + my $index_version = `grep versionGenome "$genomeDirName/genomeParameters.txt" | cut -d\$'\t' -f2`; + chomp($index_version); + print("\tRecherche de la version $index_version de STAR dans la config de Nextflow\n"); + my $nf_star_version = `grep "bioinfo/STAR-$index_version" "$wf_illumina_nf_folder_path/conf/base.config" | wc -l`; + print "nf star version : $nf_star_version"; + if ($nf_star_version == 0) { + print "\tLa version STAR utilsée pour l'indexation du genome est différente de celle utilisée par NextFlow. L'index sera refait.\n"; + $force_indexing = 1; + } else { + print "\tLa version STAR utilsée pour l'indexation du genome est la même que celle utilisée par NextFlow. L'index est conservé.\n"; + } + } + + my $nf_params_file = "${project}_params.yml"; + open (NF_PARAMS,"> $nf_params_file") or (print STDERR("Impossible d'ouvrir le fichier $nf_params_file : $!\n") and die); + + print NF_PARAMS "inputdir: '$input'\n"; + print NF_PARAMS "project: '$project'\n"; + print NF_PARAMS "is_multiplex: $isMultiplex\n"; + print NF_PARAMS "data_nature: '$dataNature'\n"; + print NF_PARAMS "split_reads: true\n"; + print NF_PARAMS "species: '$species'\n"; + print NF_PARAMS "reference_genome: '$genomeRef'\n" if (defined($genomeRef)); # parametre non obligatoire + print NF_PARAMS "make_star_index: true\n" if ($force_indexing); # parametre non obligatoire + print NF_PARAMS "reference_transcriptome: '$transcriptomeRef'\n" if (defined($transcriptomeRef)); # parametre non obligatoire + print NF_PARAMS "run_name: '$runName'\n"; + print NF_PARAMS "sequencer: '$sequencer'\n"; + print NF_PARAMS "machine_id: '$sequencerID'\n"; + print NF_PARAMS "run_date: '$date'\n"; + print NF_PARAMS "fc_id: '$fcID'\n"; + print NF_PARAMS "fc_type: '$fcType'\n"; + print NF_PARAMS "lane: '$laneNumber'\n"; + print NF_PARAMS "demux_uniqueness: $demuxUniqueness\n"; + print NF_PARAMS "min_overlap: $minOverlap\n" if (defined($minOverlap)); # parametre non obligatoire + print NF_PARAMS "max_overlap: $maxOverlap\n" if (defined($maxOverlap)); # parametre non obligatoire + print NF_PARAMS "email: '$nf_mailRecipient'\n" if (defined($nf_mailRecipient)); # parametre non obligatoire + print NF_PARAMS "description: '$description'\n"; + + close NF_PARAMS; + print ("Le fichier de parametres nextflow est ici : $input/$nf_params_file\n"); + + print ("\tCopie du fichier de config FASTQSCREEN dans le répertoire courant\n" ); + unless(-e $input."/fastq_screen.conf"){ + cp($wf_illumina_nf_folder_path."/assets/fastq_screen.conf_example", $input."/fastq_screen.conf") or + print STDERR("Impossible de copier le fichier fastq_screen.conf_n") and die; + } + unless(-e $input."/nextflow"){ + mkdir $input."/nextflow" or (print STDERR("Impossible de créer le répertoire $input/nextflow\n") and die); + } + my $nextflow_profile = $checkTest ? 'dev' : 'prod'; + my $nextflow_time_job = $checkTest ? '3:00:00' : '3-00'; + my $nextflow_cmd_line_sbatch = "sbatch -p wflowq -t $nextflow_time_job --mem 5GB "; + $nextflow_cmd_line_sbatch .= "-J nf-illumina_${barcodeFlowcell}_${lane} --wrap="; + + my $nextflow_cmd_line_wrap = "module load bioinfo/nfcore-Nextflow-v22.12.0-edge; "; + $nextflow_cmd_line_wrap .= "cd $input/nextflow; "; + $nextflow_cmd_line_wrap .= "nextflow run $wf_illumina_nf_folder_path/main.nf -ansi-log false "; + $nextflow_cmd_line_wrap .= "-profile $nextflow_profile "; + $nextflow_cmd_line_wrap .= "-params-file ../${project}_params.yml "; + + my $nextflow_cmd_line = "$nextflow_cmd_line_sbatch\" $nextflow_cmd_line_wrap \" "; + print ("Commande nextflox à lancer : $nextflow_cmd_line\n"); + + chdir './nextflow' or (print STDERR("Impossible de se déplacer dans $input/nextflow_n") and die); + open(NFC,"> $input/nextflow/${project}_${runName}_nextflow.cmd") + and print("La commande nextflow est stockée ici : $input/nextflow/${project}_${runName}_nextflow.cmd\n") + or (print STDERR("Impossible d'ouvrir en écriture le fichier : ${project}_${runName}_nextflow.cmd\n") and die); + print NFC "$nextflow_cmd_line\n"; + close NFC; + } else { + chomp($nb_jflow_lines); + print ("jFlow avec $nb_jflow_lines lignes : cas non pris en charge pour le moment._n"); + } + close(SJF); + +} +############################################################################################################################# +# # +# FONCTIONS # +# # +############################################################################################################################# +=head2 function get_mail_by_ident + + Title : get_mail_by_ident + Usage : $user_mail = get_mail_by_ident( $user_ident ) + Prerequisite : Un web service qui permet d'interroger la base de donnees PlaGe. + Function : Retourne l'identifiant de l'utilisateur. + Returns : String + Args : $user_ident String - Ident de la personne. + Globals : none + +=cut + +sub get_mail_by_ident { + my ( $ident ) = @_ ; + my $result = 0 ; + + my $client = SOAP::Lite->proxy("http://genomique.genotoul.fr:8080/jaxws-plaGeDataFileParser/adminWS")->default_ns('http://genomique.genotoul.fr/')->soapversion('1.1')->envprefix('soapenv')->readable('true'); + #my $client = SOAP::Lite->proxy("http://esitoul-dev.toulouse.inra.fr:8080/jaxws-plaGeDataFileParser/adminWS")->default_ns('esitoul-dev.toulouse.inra.fr')->soapversion('1.1')->envprefix('soapenv')->readable('true'); + + $result = $client->call( 'getPeopleDTO', SOAP::Data->name('peopleIdentifier')->value($ident) ); $? and (print ("WARNING : echec de l'envoi du mail\n")); + if( $result->fault ) { + print ("WARNING : $result->faultstring\n") ; + return "0"; + } + if ($result->body && $result->body->{'getPeopleDTOResponse'}) { + return $result->body->{'getPeopleDTOResponse'}{'return'}{'mail'}; + } +} + +=head2 function get_operator_from_sample_sheet + + Title : get_operator_from_sample_sheet + Usage : $operator_ident = get_operator_from_sample_sheet($sample_sheet_path) + Prerequisite : none + Function : Retourne l'identifiant de l'operateur du run. + Returns : String + Args : $sample_sheet_path String - Chemin de la sample sheet. + Globals : none + +=cut + +sub get_operator_from_sample_sheet { + my ($sample_sheet_path) = @_ ; + my $operator = "" ; + open( my $FH_SAMPLESHEET, $sample_sheet_path ) or (print ("WARNING : Impossible d'ouvrir la sample sheet '".$sample_sheet_path."'\n")); + + ##Parsing Sample Sheet IEM + while(my $ligne = <$FH_SAMPLESHEET>){ + chomp($ligne); + if($ligne =~ /Investigator Name/){ + my @list = split(/,/,$ligne); + + #Récupérer l'opérateur + $operator = $list[1]; + if($operator =~ /\r/){ + $operator = substr($operator,0,(length($operator)-1)); + } + print ("Opérateur : $operator\n"); + last; #we do not need to continue the loop + } + } + close( $FH_SAMPLESHEET ); + return $operator ; +} diff --git a/bin/demuxStatsFromXML.R b/bin/demuxStatsFromXML.R index 1aec58cfff9f19e5546b5b15930388fa56c8f330..75b98b34eb4565b18d0659f517a0c950fc72c578 100755 --- a/bin/demuxStatsFromXML.R +++ b/bin/demuxStatsFromXML.R @@ -107,7 +107,7 @@ cat("Rassemblement des statistiques par échantillons.\n") for (line in 1:dim(indexNumber)[1]){ mySample<-indexNumber[line, "Sample"] mySampleNumber<-indexNumber[line, "NumberOfIndex"] - + cat("\nEtude de l'échantillon : " , mySample, "\n") # Single Index Case if (mySampleNumber == 1) { df.singleLine<-df[which(df$Sample == mySample),] @@ -115,40 +115,43 @@ for (line in 1:dim(indexNumber)[1]){ } # Dual et 4 Index Cases else if (mySampleNumber > 1) { - #sub.df<-df[which(str_detect(df$Sample, mySample)), ] - sub.df<-df[which(df$Sample == mySample), ] + sub.df<-df[which(str_detect(df$Sample, paste0(mySample,'_S.*'))), ] #print(sub.df) - # Parcours du sous-data.frame - for (l in 1:dim(sub.df)[1]) { - sub.df.project<-sub.df[l, "Project"] - sub.df.barcode<-sub.df[l, "Barcode"] - sub.df.bcCount<-as.numeric(sub.df[l, "bcCount"]) - sub.df.bcPerfect<-as.numeric(sub.df[l, "bcPerfect"]) - sub.df.oneMismatch<-as.numeric(sub.df[l, "bcOneMismatch"]) # bcOneMismatch - - #print(paste(mySample, ":: Traitement du barcode :", sub.df.barcode)) - - if (l == 1 ) { - sub.df.project.toAdd<-sub.df.project - sub.df.barcode.toAdd<-sub.df.barcode - sub.df.bcCount.toAdd<-sub.df.bcCount - sub.df.bcPerfect.toAdd<-sub.df.bcPerfect - sub.df.oneMismatch.toAdd<-sub.df.oneMismatch - } else { - sub.df.barcode.toAdd<-paste0(sub.df.barcode.toAdd, "+", sub.df.barcode) - sub.df.bcCount.toAdd<-sub.df.bcCount.toAdd+sub.df.bcCount - sub.df.bcPerfect.toAdd<-sub.df.bcPerfect.toAdd+sub.df.bcPerfect - sub.df.oneMismatch.toAdd<-sub.df.oneMismatch.toAdd+sub.df.oneMismatch + if (nrow(sub.df) == 0) { + cat("Aucun échantillon trouvé !\n") + cat("La recherche de l'échantillon",mySample, "dans le data.table suivant à échouée :\n") + print(df) + } else { + # Parcours du sous-data.frame + for (l in 1:dim(sub.df)[1]) { + sub.df.project<-sub.df[l, "Project"] + sub.df.barcode<-sub.df[l, "Barcode"] + sub.df.bcCount<-as.numeric(sub.df[l, "bcCount"]) + sub.df.bcPerfect<-as.numeric(sub.df[l, "bcPerfect"]) + sub.df.oneMismatch<-as.numeric(sub.df[l, "bcOneMismatch"]) # bcOneMismatch + + # Première iteration + if (l == 1 ) { + sub.df.project.toAdd<-sub.df.project + sub.df.barcode.toAdd<-sub.df.barcode + sub.df.bcCount.toAdd<-sub.df.bcCount + sub.df.bcPerfect.toAdd<-sub.df.bcPerfect + sub.df.oneMismatch.toAdd<-sub.df.oneMismatch + } else { + sub.df.barcode.toAdd<-paste0(sub.df.barcode.toAdd, "+", sub.df.barcode) + sub.df.bcCount.toAdd<-sub.df.bcCount.toAdd+sub.df.bcCount + sub.df.bcPerfect.toAdd<-sub.df.bcPerfect.toAdd+sub.df.bcPerfect + sub.df.oneMismatch.toAdd<-sub.df.oneMismatch.toAdd+sub.df.oneMismatch + } } + # Add to data.frame + df_to_add<-data.frame(sub.df.project,mySample, sub.df.barcode.toAdd, sub.df.bcCount.toAdd, sub.df.bcPerfect.toAdd, sub.df.oneMismatch.toAdd) + df2<-concat_df(df2, df_to_add, vec.names) } - - # Add to data.frame - df_to_add<-data.frame(sub.df.project,mySample, sub.df.barcode.toAdd, sub.df.bcCount.toAdd, sub.df.bcPerfect.toAdd, sub.df.oneMismatch.toAdd) - df2<-concat_df(df2, df_to_add, vec.names) } } -cat("Résumé des informations extraites (nombre d'échantillons par projet) :") +cat("\nRésumé des informations extraites (nombre d'échantillons par projet) :") table(df2$Project) ## Recherche des index indeterminés @@ -166,7 +169,7 @@ tabUndetermined<-tabDemuxSum[which(tabDemuxSum$Count >= bcCount.threshold),] cat("\tRésumé des inforamtions extraites :\n") cat(paste0("\tNombre d'index indéterminés retrouvés :\t", dim(tabUndetermined)[1], "\n")) -head(tabUndetermined) +if(nrow(tabUndetermined) > 0) { head(tabUndetermined) } # Construction du dataFrame pour intégration à df2 diff --git a/bin/extractInfoForDemuxStats.pl b/bin/extractInfoForDemuxStats.pl index 8e879af83d1901c975a878c53a38b2eba492d626..eddd76002844721da55861e7dce013f0fafdc69c 100755 --- a/bin/extractInfoForDemuxStats.pl +++ b/bin/extractInfoForDemuxStats.pl @@ -77,17 +77,32 @@ my $sample_ID_position; my @index_ID_position=(); my %sample_info=(); +my $machineName = 'MISEQ'; # evite comparaison avec chaine vide + +my %regexForDataHeader = (); +$regexForDataHeader{'MISEQ'} = '^Sample_ID'; +$regexForDataHeader{'NOVASEQ'} = '^Lane'; + +my %regexForSampleLine = (); +$regexForSampleLine{'MISEQ'} = '^.+,.+,.*,.*,.+,.+,.+,.*$'; +$regexForSampleLine{'NOVASEQ'} = '^(\d),'; foreach my $line (@lines) { my @cur_line = split(',', $line); - + + # Recherche de la machine + if ($line =~ /^Description/) { + $machineName = $cur_line[1]; + $machineName = $machineName =~ /^NOVASEQ/ ? 'NOVASEQ' : $machineName; + } + # Recherche du nom du projet if ($line =~ /^Infos/) { $projectName = $cur_line[1]; } # Recherche des positions des Sample_ID et des Index_ID - elsif ($line =~ /^Lane/) { + elsif ($line =~ m/${regexForDataHeader{$machineName}}/) { while ( my ( $indice, $valeur ) = each @cur_line ) { if ($valeur eq "Sample_ID") { $sample_ID_position=$indice;} if ($valeur =~ /Index_ID$/) { push(@index_ID_position, $indice);} @@ -95,7 +110,7 @@ foreach my $line (@lines) { } # Association Sample_ID avec sont nombre d'index - elsif ($line =~ m/^(\d),/) { + elsif ($line =~ m/${regexForSampleLine{$machineName}}/) { my $sample_ID = $cur_line[$sample_ID_position]; my $index_number=0; my @cur_index_ID = (); diff --git a/conf/base.config b/conf/base.config index 85910c282e6090115c276d443effc2a2112099fb..2cbab3d39f1beb1b6100f7450387745462b01cd1 100644 --- a/conf/base.config +++ b/conf/base.config @@ -27,6 +27,7 @@ params { // DNA / RNA params reference_genome = "" + make_star_index = false reference_transcriptome = "" // Amplicon / 16S params @@ -73,14 +74,25 @@ process { time='1h' cpus = 1 memory = 2.GB + beforeScript = 'sleep 5s; module purge' errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } maxRetries = 2 maxErrors = '-1' - // ----- WithName - withName: BWA_ALIGNMENT { - module = ['bioinfo/bwa-0.7.17'] + // ----- WithName + // ----- CORE ----- // + withName: illuminaFilter { + publishDir = [ + path: "${params.outdir}/IlluminaFilter", + mode: 'symlink', + pattern: '*.gz'/*, + saveAs: { filename -> "${name}.fastq.gz" }*/ + ] + + module = ['bioinfo/fastq_illumina_filter-0.1'] + cpus = { 3 * task.attempt } + time = { 4.h * task.attempt } } withName: DUPLICATED_READS { @@ -113,7 +125,43 @@ process { maxRetries = 3 module = ['bioinfo/FastQC_v0.11.7'] time = { 1.h * task.attempt } + } + withName: FASTQSCREEN { + ext.args = [ + "--conf ${launchDir}/../fastq_screen.conf" + ].join(' ') + } + + // ----- RNA ----- // + withName: SALMON_INDEX { + module = ['bioinfo/salmon-1.9.0'] + time = { 1.h * task.attempt } + memory = { 3.GB * task.attempt } + cpus = 8 + } + + withName: SALMON_QUANT { + module = ['bioinfo/salmon-1.9.0'] + time = { 1.h * task.attempt } + memory = { 3.GB * task.attempt } + cpus = 8 + + publishDir = [ + path: "${params.outdir}/AlignmentStats", + mode: 'symlink', + pattern: '*' + ] + } + + withName: STAR_INDEX { + memory = { 50.GB * task.attempt } + cpus = 8 + } + + withName: STAR_ALIGN { + memory = { 20.GB * task.attempt } + cpus = 2 } // ----- WithLabel @@ -121,17 +169,19 @@ process { executor = 'local' } - withLabel: samtools { - module = ['bioinfo/samtools-1.14'] - } - withLabel: cigar { module = ['system/Python-3.7.4:bioinfo/samtools-1.14'] } - - withLabel: qualimap { - module = ['system/R-3.4.3:bioinfo/qualimap-31-08-20'] - beforeScript='unset DISPLAY' + + // ----- DNA ----- // + withLabel: bwa { + module = ['/tools/share/Modules/bioinfo/bwa-0.7.17'] + beforeScript = "module list" + } + + // ----- RNA ----- // + withLabel: star { + module = ['bioinfo/STAR-2.7.10a_alpha_220314'] } } @@ -141,6 +191,11 @@ process { params.shared_modules = '/home/sbsuser/work/Nextflow/shared_modules/ExportSources_Jules' process { + withName: SAMTOOLS_FAIDX { + beforeScript = "module purge" + module = ['bioinfo/samtools-1.16.1'] + } + withName: GZIP { ext.args = '-f' publishDir = [ @@ -176,11 +231,12 @@ process { withName: MULTIQC { ext.args = [ "--config ${baseDir}/assets/multiqc_config.yaml", - params.project ? "--title '${params.project}'" : '' + params.project ? "--title '${params.project} - ${params.run_name}'" : '' ].join(' ') - module = '/tools/share/Modules/bioinfo/MultiQC-v1.11' - memory = { 5.GB * task.attempt } + beforeScript = "module purge" + module = 'bioinfo/MultiQC-1.14' + memory = { 10.GB * task.attempt } publishDir = [ path: { "${params.outdir}/MultiQC" }, @@ -189,4 +245,17 @@ process { pattern: "*.html" ] } + + withName: SORTMERNA { + module = 'bioinfo/sortmerna-4.3.2' + memory = '1.GB' + time = { 10.h * task.attempt } + cpus = { 1 * task.attempt } + + publishDir = [ + path: "${params.outdir}/rRNA", + mode: 'copy', + overwrite: false + ] + } } \ No newline at end of file diff --git a/conf/functions.config b/conf/functions.config index 66ea1a9527ba2e46e9f93d7f44742cc53c1b93a5..f16fa59e2ac46f7247bc9df28cacba8aef259b06 100644 --- a/conf/functions.config +++ b/conf/functions.config @@ -64,6 +64,9 @@ def createSummary(formatted_date) { } def customMailSend(body, subject, email_address) { + if (email_address == null) { + email_address = params.email_bioinfo + } if (workflow.profile == 'dev') { email_address = params.email_dev try { @@ -174,7 +177,9 @@ def sendFinalMail(formatted_date, summary) { if (!params.email && params.email_on_fail && !workflow.success) { email_address = params.email_on_fail } - + if (workflow.profile == 'dev') { + email_address = params.email_dev + } // Render the TXT template def engine = new groovy.text.GStringTemplateEngine() def tf = new File("$baseDir/assets/final_email_template.txt") @@ -182,9 +187,12 @@ def sendFinalMail(formatted_date, summary) { def email_txt = txt_template.toString() // Send the e-mail + def mail_sent = false if (email_address) { mail_sent = customMailSend(email_txt, subject, email_address) - } + } else { + log.info "No email adress -> no email sending" + } // Write summary e-mail HTML to a file def output_d = new File( "${params.outdir}/pipeline_info/" ) diff --git a/conf/prod.config b/conf/prod.config index b36b1a7eb73be4305bff6ca4b6567b32651dbd0f..391ae6a5125a77d65ab408f2800c284b8365ad10 100644 --- a/conf/prod.config +++ b/conf/prod.config @@ -10,12 +10,15 @@ process { } withLabel: samtools { + module = ['bioinfo/samtools-1.14'] cpus = { 6 * task.attempt } memory = { 8.GB * task.attempt } time = { 3.h * task.attempt } } withLabel: qualimap { + module = ['system/R-3.4.3:bioinfo/qualimap-31-08-20'] + beforeScript='unset DISPLAY' cpus = { 8 * task.attempt } memory = { 2.GB * task.attempt } time = { 3.h * task.attempt } diff --git a/conf/report.config b/conf/report.config index 2d40a635290928a4c9b2b54e23fc509229e9b641..35d4255683ef7f37cc489eb201a3bde0ca0f9499 100644 --- a/conf/report.config +++ b/conf/report.config @@ -9,7 +9,7 @@ timeline { trace { enabled = true file = "${params.outdir}/pipeline_info/execution_trace.txt" - fields = 'task_id,native_id,name,status,exit,realtime,%cpu,%mem,duration,script,rss' // verifier ajout des champs + fields = 'task_id,native_id,name,status,exit,realtime,%cpu,%mem,rss,duration,workdir,script' // verifier ajout des champs } report { @@ -29,5 +29,5 @@ manifest { description = "Workflow for Illumina data quality control" mainScript = 'main.nf' nextflowVersion = '>=0.32.0' - version = '1.0.0' + version = '1.2.0' } \ No newline at end of file diff --git a/conf/test.config b/conf/test.config index fa614b4a0b71ccf28727b2b929b611a28be185f0..2d6b36e95a0b9397c3fa83d6a1f70edf45e4febd 100644 --- a/conf/test.config +++ b/conf/test.config @@ -9,21 +9,24 @@ process { } withLabel: samtools { + module = ['bioinfo/samtools-1.14'] cpus = { 1 * task.attempt } memory = { 2.GB * task.attempt } time = { 10.m * task.attempt } } withLabel: qualimap { + module = ['system/R-3.4.3:bioinfo/qualimap-31-08-20'] + beforeScript='unset DISPLAY' cpus = { 1 * task.attempt } memory = { 2.GB * task.attempt } time = { 10.m * task.attempt } } withName: BWA_ALIGNMENT { - cpus = { 3 * task.attempt } - memory = { 2.GB * task.attempt } - time = { 1.h * task.attempt } + cpus = { 6 * task.attempt } + memory = { 8.GB * task.attempt } + time = { 3.d * task.attempt } } } diff --git a/modules/local/module_core.nf b/modules/local/module_core.nf index f12fd855a0bf7bfea4949d94a1732ebd24c4ee16..7d9c7c52f6f38c6b05f52c58d333cf7ad4cc7726 100644 --- a/modules/local/module_core.nf +++ b/modules/local/module_core.nf @@ -59,15 +59,6 @@ process FASTQC { process illuminaFilter { - publishDir path: "${params.outdir}/IlluminaFilter" , mode: 'copy', pattern: '*.gz'/*, saveAs: { filename -> "${name}.fastq.gz" }*/ - - module 'bioinfo/fastq_illumina_filter-0.1' - executor 'slurm' - queue 'wflowq' - cpus { 1 * task.attempt } - time { 1.h * task.attempt } - memory '1.GB' - tag " $name" input: @@ -84,26 +75,6 @@ process illuminaFilter { } -process BWA_ALIGNMENT { - publishDir path: "${params.outdir}/ContaminationSearch/tmp" , mode: 'copy' - - tag " $sample" - - input: - tuple val(sample), path(reads) - each genomeRef - - output: - //tuple val(sample), path("*.log"), emit: log - tuple val("${sample}_${genomeName}"), path("${sample}_${genomeName}.sam"), emit: sam - - script: - genomeName=file(genomeRef).simpleName - """ - bwa mem ${genomeRef} ${reads} 1> ${sample}_${genomeName}.sam 2> ${sample}.log - """ -} - process FASTQSCREEN { publishDir path: "${params.outdir}/ContaminationSearch/FastQ-Screen", mode: 'copy' @@ -119,8 +90,9 @@ process FASTQSCREEN { tuple val(sample), path("*.txt"), emit: report script: + def args = task.ext.args ?: '' """ - fastq_screen $reads --conf $launchDir/../fastq_screen.conf + fastq_screen $reads $args """ } diff --git a/modules/local/module_dna.nf b/modules/local/module_dna.nf index 534950c56c001b989853bc40da213bb925f4b7b1..8dc0709098973e3c6281c20c58ed7eac1ed5feed 100644 --- a/modules/local/module_dna.nf +++ b/modules/local/module_dna.nf @@ -2,10 +2,11 @@ * Module pour l'alignement des reads ADN sur génome de référence et des statistiques associées */ -process BWA_ALIGNMENT { BWA_ALIGNMENT +process BWA_ALIGNMENT { publishDir path: "${params.outdir}/alignment/bwa" , mode: 'copy' - tag " $sample" + tag "$sample" + label 'bwa' input: tuple val(sample), path(reads) @@ -15,8 +16,10 @@ process BWA_ALIGNMENT { BWA_ALIGNMENT tuple val(sample), path("*.sam"), emit: sam script: + def reference = params.reference_genome ?: params.reference_transcriptome + def referenceName=file(reference).toString().split('/')[6] """ - bwa mem ${params.reference_genome} ${reads} 1> ${sample}.sam 2> ${sample}.log + bwa mem ${reference} ${reads} 1> ${sample}_${referenceName}.sam 2> ${sample}_${referenceName}.log """ } @@ -81,7 +84,8 @@ process SAMTOOLS_FLAGSTATS { } process QUALIMAP { - publishDir path: "${params.outdir}/alignmentStats/qualimap" , mode: 'copy' + publishDir path: "${params.outdir}/alignmentStats/qualimap" , mode: 'copy', pattern: "*.html" + publishDir path: "${params.outdir}/alignmentStats/qualimap" , mode: 'copy', pattern: "*.txt" tag "$sample" diff --git a/modules/local/module_rna.nf b/modules/local/module_rna.nf new file mode 100644 index 0000000000000000000000000000000000000000..2cf07ecff83b6a52dd24d57121d859001463907c --- /dev/null +++ b/modules/local/module_rna.nf @@ -0,0 +1,108 @@ +/* + * Module pour l'alignement des reads ARN sur génome/trasncriptome de référence et extraction des statistiques associées +*/ + +process SALMON_INDEX { + tag "$params.project" + + output: + path("index/"), emit: index + + script: + """ + salmon index \ + -t ${params.reference_transcriptome} \ + -i ./index \ + --threads ${task.cpus} + """ +} + + +process SALMON_QUANT { + tag "$sample" + + input: + tuple val(sample), path(reads) + path(index) + val(lib_type) + + output: + tuple val(sample), path("$sample/"), emit: results + path("versions.yml"), emit: version + + script: + def R1 = reads.find { it =~ /.*_R1_.*/} + def R2 = reads.find { it =~ /.*_R2_.*/} + """ + salmon quant \\ + --libType ${lib_type} \\ + --validateMappings \\ + -o ./${sample}/ \\ + --index ${index} \\ + -1 ${R1} \\ + -2 ${R2} \\ + 2> /dev/null + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + salmon: \$(echo \$(salmon --version) | sed -e "s/salmon //g") + END_VERSIONS + """ +} + +process STAR_INDEX { + tag "$params.project" + label "star" + + input: + path(fasta) + path(fai) + + output: + path("index/"), emit: index + + script: + // renamme en .fa ?? utile ?? + def args = task.ext.args ?: '' + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + """ + NUM_BASES=`gawk '{sum = sum + \$2}END{if ((log(sum)/log(2))/2 - 1 > 14) {printf "%.0f", 14} else {printf "%.0f", (log(sum)/log(2))/2 - 1}}' ${fai}` + + mkdir index + STAR \\ + --runMode genomeGenerate \\ + --genomeDir index/ \\ + --genomeFastaFiles $fasta \\ + --runThreadN $task.cpus \\ + --genomeSAindexNbases \$NUM_BASES \\ + $args + + """ +} + +process STAR_ALIGN { + tag "$sample" + label "star" + + input: + tuple val(sample), path(reads) + each index + + output: + tuple val(sample), path("${sample}_Log.final.out"), emit: results + tuple val(sample), path("${sample}_Log.out"), emit: log + + script: + def args = task.ext.args ?: '' + def read_files_cmd = reads[0].endsWith('.gz') ? '--readFilesCommand zcat' : '' + """ + STAR \\ + --outFileNamePrefix ${sample}_ \\ + --genomeDir $index/ \\ + --runThreadN $task.cpus \\ + --outSAMunmapped Within \\ + --readFilesIn $reads \\ + $read_files_cmd + + """ +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index c161fa17997fbfe1fcd8a67f426cc3d1c3b53f90..7f098f77d1a9a65911cea9dcb588a128a4dac648 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,15 @@ params { large_indexing_nova_subset_byte = 350000000 // in byte <=> 500 000 reads large_indexing_nova_subset_seq = 500000 // in reads + // RNA QC + sortmerna_db_path = '/usr/local/bioinfo/src/SortMeRNA/sortmerna-2.1b/rRNA_databases' + sortmerna_bac_16s = sortmerna_db_path + '/silva-bac-16s-id90.fasta' + sortmerna_bac_23s = sortmerna_db_path + '/silva-bac-23s-id98.fasta' + sortmerna_arc_16s = sortmerna_db_path + '/silva-arc-16s-id95.fasta' + sortmerna_arc_23s = sortmerna_db_path + '/silva-arc-23s-id98.fasta' + sortmerna_euk_18s = sortmerna_db_path + '/silva-euk-18s-id95.fasta' + sortmerna_euk_28s = sortmerna_db_path + '/silva-euk-28s-id98.fasta' + // OTHERS email="" email_dev="jules.sabban@inrae.fr" diff --git a/sub-workflows/local/core_pipeline.nf b/sub-workflows/local/core_pipeline.nf index f71886cb365630cb57716f27a29cdf21c3189a1c..bec4c8a8b4cf60faa0fcaf1295eaca03e10a2b22 100644 --- a/sub-workflows/local/core_pipeline.nf +++ b/sub-workflows/local/core_pipeline.nf @@ -71,7 +71,7 @@ workflow CORE { demultiplexStats(ch_DemuxStatXML, extractInfoForDemuxStats.out, ch_DemuxSummary) // ----------- Illumina Filter // ou SubsetSeqFiles : dans quel cas on fait l'un ou l'autre ???? - if (params.sequencer == 'NovaSeq' & params.is_multiplex) { + if ("$params.sequencer" =~ /NovaSeq.*/ && params.is_multiplex) { System.out.println "Les données ne nécessite pas de passer par IlluminaFilter" ch_read_good = ch_read } else { // Si MiSeq ou Nova + noIndex diff --git a/sub-workflows/local/dna_qc.nf b/sub-workflows/local/dna_qc.nf index 4be068643d855de2e361dbc540ea480f62028e52..57e1a0845c112de54f45d68c83a702c100c15249 100644 --- a/sub-workflows/local/dna_qc.nf +++ b/sub-workflows/local/dna_qc.nf @@ -26,7 +26,7 @@ workflow DNA_QC { fastq main: - if ( "$params.reference_genome" != '' ) { + if ( "$params.reference_genome" != '' || "$params.reference_transcriptome" != '') { BWA_ALIGNMENT(fastq) SAMTOOLS_VIEW(BWA_ALIGNMENT.out.sam) SAMTOOLS_SORT(SAMTOOLS_VIEW.out.bam) @@ -37,7 +37,7 @@ workflow DNA_QC { flagstats_output_emitted = SAMTOOLS_FLAGSTATS.out.txt } else { - System.out.println "Le paramètre reference_genome est vide : $params.reference_genome, on ne peut pas faire d'alignement" + System.out.println "Pas de référence genomique ou transcriptomique renseignée, on ne peut pas faire d'alignement" // If Qualimap and Samtools were not executed qualimap_report_emitted = Channel.empty() flagstats_output_emitted = Channel.empty() diff --git a/sub-workflows/local/rna_qc.nf b/sub-workflows/local/rna_qc.nf index fe778d2a564d1344a4640e33e0ad547407811d10..c7afbaa2173856ff7f1cc0a8a50fb5b56d34cbba 100644 --- a/sub-workflows/local/rna_qc.nf +++ b/sub-workflows/local/rna_qc.nf @@ -3,4 +3,90 @@ alignementStat insertSizeDistribution -*/ \ No newline at end of file +*/ +// ------------------------------------------------- +// RNA QC +// ------------------------------------------------- +/* + * QC des données ARN : + * - Alignement contre génome de référence avec STAR + * - Pseudo alignement contre transcriptome de référence avec SALMON + * - Rapport d'alignement avec Qualimap ?? +*/ + +// ------------------------------------------------- +// MODULES +// ------------------------------------------------- +include { STAR_INDEX; + STAR_ALIGN; + SALMON_INDEX; + SALMON_QUANT; +} from "$baseDir/modules/local/module_rna.nf" + +include { SAMTOOLS_FAIDX } from "${params.shared_modules}/samtools.nf" +include { SORTMERNA } from "${params.shared_modules}/sortmerna.nf" +// ------------------------------------------------- +// WORKFLOW +// ------------------------------------------------- +workflow RNA_QC { + take: + fastq + sortmerna_db + + main: + fastq = fastq.collect{it[1]}.flatten().map { $it -> [ ($it.simpleName =~ /(.*)_R[1-2]_.*/)[0][1] , $it ] }.groupTuple() + align_results = Channel.empty() + + if ( "$params.reference_genome" != '' ) { + // if indexFiles does not exist + if ( ! file(file(params.reference_genome).getParent() + '/SAindex').exists() || params.make_star_index) { + println "STAR index files does not exists -> Let's start genome indexing..." + reference_genome = Channel.from(params.reference_genome) + genome_index = SAMTOOLS_FAIDX(reference_genome).index + star_index = STAR_INDEX(reference_genome, genome_index).index + } else { + star_index = Channel.from(file(params.reference_genome).getParent()) + } + align_results = STAR_ALIGN(fastq, star_index).results // R1 et R2 en même temps + + } else if ("$params.reference_transcriptome" != '') { + // if indexFiles does not exist + if ( ! file(file(params.reference_transcriptome).getParent() + '/seq.bin').exists()) { + println "SALMON index files does not exists -> Let's start transcriptome indexing..." + salmon_index = SALMON_INDEX().index + } else { + salmon_index = Channel.from(file(params.reference_transcriptome).getParent()) + } + + def lib_type = '' + if (params.data_nature == 'RNA-Stranded') { + lib_type = 'ISR' + log.info "[RNA][SALMON] ${lib_type} library type detected." + } else if (params.data_nature =~ 'RNA-*') { + lib_type = 'IU' + log.info "[RNA][SALMON] ${lib_type} library type detected." + } else { + log.warning "[RNA][SALMON] Unknown library type ! No transcript quantification can be performed." + } + ch_lib_type = Channel.value(lib_type) + + align_results = SALMON_QUANT( + fastq, + salmon_index, + ch_lib_type + ).results + + } else { + // If Qualimap and Samtools were not executed + System.out.println "Pas de référence genomique ou transcriptomique renseignée, on ne peut pas faire d'alignement" + //qualimap_report_emitted = Channel.empty() + //flagstats_output_emitted = Channel.empty() + } + + SORTMERNA(fastq, sortmerna_db.collect()) + + emit: + align_results = align_results + sortmerna_log = SORTMERNA.out.log + //flagstats_output = flagstats_output_emitted +} \ No newline at end of file diff --git a/workflow/illumina_qc.nf b/workflow/illumina_qc.nf index b0f4f1d09d7a60fff6d1f23f7a547e7022dd4b88..fb702cc87555f4501255fbbf1336ce9d05e3733b 100644 --- a/workflow/illumina_qc.nf +++ b/workflow/illumina_qc.nf @@ -33,12 +33,21 @@ ch_DemuxStatXML=Channel.fromPath(params.inputdir+'/Stats/DemultiplexingStats.xml // fastq one by one ch_read=Channel - .fromPath(params.data_location+'/*_R{1,2}_*.fastq.gz') + .fromPath(params.data_location+'/**_R{1,2}_*.fastq.gz') .map{$it -> [$it.simpleName, $it]} // fastq paired //ch_read_merged=Channel.fromFilePairs(params.data_location+'/*_R{1,2}_*.fastq.gz') +// Channel of rRNA databases for sortmerna +ch_sortmerna_db = Channel.from( + params.sortmerna_euk_18s, + params.sortmerna_euk_28s, + params.sortmerna_bac_16s, + params.sortmerna_bac_23s, + params.sortmerna_arc_16s, + params.sortmerna_arc_23s +) mismatchNumber = params.sequencer == 'MiSeq'? 0 : 1 //banksForConta = params.addBankForConta ? params.genomesRefForConta << params.addBankForConta : params.genomesRefForConta @@ -50,6 +59,7 @@ createDir = file(params.outdir).mkdir() // ------------------------------------------------- include { CORE } from "$baseDir/sub-workflows/local/core_pipeline.nf" include { DNA_QC } from "$baseDir/sub-workflows/local/dna_qc.nf" +include { RNA_QC } from "$baseDir/sub-workflows/local/rna_qc.nf" include { MULTIQC } from "${params.shared_modules}/multiqc.nf" include { workflow_summary as WORKFLOW_SUMMARY } from "${params.shared_modules}/workflow_summary.nf" @@ -75,6 +85,12 @@ workflow ILLUMINA_QC { DNA_QC.out.qualimap_report.collect{it[1]}.ifEmpty([]), DNA_QC.out.flagstats_output.collect{it[1]}.ifEmpty([]) ) + } else if (params.data_nature =~ 'RNA') { + RNA_QC(CORE.out.subset_fastq, ch_sortmerna_db) + ch_mqc = ch_mqc.mix( + RNA_QC.out.align_results.collect{it[1]}.ifEmpty([]), + RNA_QC.out.sortmerna_log.collect{it[1]}.ifEmpty([]) + ) } else { System.out.println "Le QC des données non ADN n'est pas prit en charge pour le moment." ch_mqc = ch_mqc.mix( Channel.empty() ) @@ -85,7 +101,7 @@ workflow ILLUMINA_QC { CORE.out.fastqc_report.collect{it[1]}.ifEmpty([]), CORE.out.fastqscreen_report.collect{it[1]}.ifEmpty([]), CORE.out.fastp_report.collect{it[1]}.ifEmpty([]), - ch_mqc.collect{it[1]}.ifEmpty([]) + ch_mqc.collect().ifEmpty([]) ).collect() ) /*