Commit 59bab410 authored by Alexis Mergez

Merge branch 'dev' into 'main'

v1.6

See merge request !11
parents 64663c72 bcbd5705
Pipeline #178844 passed with warnings
@@ -5,40 +5,50 @@ import numpy as np
import gzip
import re
# Getting the list of haplotypes
# Getting the list of haplotypes (SAMPLES)
SAMPLES = np.unique([
os.path.basename(f).split('.fa')[0]
for f in os.listdir("data/haplotypes/")
if re.search(r".fa", f)
])
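# For example (hypothetical filenames), "genomeA.hap1.fa.gz" and "genomeA.hap2.fa.gz"
# yield the sample names "genomeA.hap1" and "genomeA.hap2" via split('.fa')[0].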
SAMPLES_NOREF = [
sample
for sample in SAMPLES
if sample != os.path.basename(config['reference']).split('.fa')[0]
]
nHAP = len(SAMPLES)
# Getting the list of chromosomes
with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
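# Assuming reference headers follow the '<haplotype>#<chromosome>' convention,
# a header such as '>genomeA.hap1#chr01' (hypothetical) yields the chromosome id 'chr01'.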
def which_post_analysis():
## Simple function to configure which parts of the workflow need to be run
post_analysis_inputs = [ # Default post analysis steps
"output/pan1c.pggb."+config['name']+".workflow.stats.tsv",
"output/figures",
"output/panacus.reports"
# Configuring steps to be done
def which_analysis():
# Creating a list with default analysis steps (to prevent the function from returning an empty list)
analysis_inputs = [
"output/pan1c.pggb."+config['name']+".core.stats.tsv", # core stats
expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth
expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), # syri for haplotypes
expand("output/chrGraphs.figs/{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs
"output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv" # chromosomes graph statistics
]
# Optional post-analysis steps
# Optional analysis steps
if config["get_PAV"] == "True":
post_analysis_inputs.append("output/pav.matrices")
analysis_inputs.append("output/pav.matrices")
if config["get_allASM_SyRI"] == "True":
analysis_inputs.append("output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png")
return post_analysis_inputs
return analysis_inputs
rule all:
input:
"output/pan1c.pggb."+config['name']+".gfa",
"output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv",
which_post_analysis()
"output/pan1c.pggb."+config['name']+".gfa", # Final graph (main output)
"output/pan1c.pggb."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
which_analysis()
rule samtools_index:
# Using samtools faidx to index compressed fasta
# Samtools faidx to index compressed fasta
input:
fa="{sample}.fa.gz"
output:
@@ -51,6 +61,11 @@ rule samtools_index:
"apptainer run --app samtools {params.apppath}/PanGeTools.sif "
"faidx {input.fa}"
"""
Pre-processing section
Preparing pggb inputs
"""
rule ragtag_scaffolding:
# Scaffold input haplotype against the reference to infer chromosome scale sequences
input:
@@ -74,8 +89,60 @@ rule ragtag_scaffolding:
-o {output}
"""
rule create_allAsm_syri_fig:
# Create a SyRI figure containing all input assemblies
input:
ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
output:
fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png",
wrkdir=directory('data/asm.syri/all')
threads: 8
params:
apppath=config["app.path"]
shell:
"""
mkdir {output.wrkdir}
bash scripts/getSyriFigs.sh \
-a {params.apppath} \
-t {threads} \
-d {output.wrkdir} \
-o $(basename {output.fig}) \
-r {input.ref} \
-q "{input.qry}"
mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
"""
rule create_sglAsm_syri_fig:
# Create a SyRI figure for a single input assembly
input:
ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
output:
fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png",
wrkdir=directory('data/asm.syri/{haplotype}')
threads: 4
params:
apppath=config["app.path"]
shell:
"""
mkdir -p {output.wrkdir}
bash scripts/getSyriFigs.sh \
-a {params.apppath} \
-t {threads} \
-d {output.wrkdir} \
-o $(basename {output.fig}) \
-r {input.ref} \
-q "{input.qry}"
mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
"""
"""
Core section
"""
rule clustering:
# Read alignment file to create bins for each chromosome
# Read ragtagged fastas and split chromosome sequences into corresponding bins
input:
expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
output:
@@ -93,8 +160,46 @@ rule clustering:
done
"""
rule create_pggb_input_syri_fig:
input:
fasta='data/chrInputs/{chromosome}.fa.gz'
output:
fig="output/chrInput.syri.figs/{chromosome}."+config['name']+".asm.syri.png",
wrkdir=directory('data/chrInput/syri/{chromosome}')
threads: 8
params:
apppath=config["app.path"],
ref=config['reference']
shell:
"""
mkdir {output.wrkdir}
refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1)
## Creating single fasta from multifasta
zcat {input.fasta} | awk -F"#" \
'/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
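# Each record lands in its own file named after the header text past '>', e.g. a
# hypothetical header '>genomeA.hap1#chr01' becomes genomeA.hap1#chr01.fa in the work directory.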
## Getting the list of sequences
asmList=()
for file in {output.wrkdir}/*.fa; do
asm="$(basename $file .fa | cut -f1 -d"#").fa"
mv $file "$(dirname $file)/$asm"
asmList+=("$asm")
done
bash scripts/getSyriFigs.sh \
-a {params.apppath} \
-t {threads} \
-d {output.wrkdir} \
-o $(basename {output.fig}) \
-r {output.wrkdir}/"${{refname}}.fa" \
-q "${{asmList[@]}}"
mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
rm {output.wrkdir}/*.fa
"""
rule pggb_on_chr:
# Runs pggb on a specific chromosome
# Run pggb on a specific chromosome
input:
fa="data/chrInputs/{chromosome}.fa.gz",
gzi="data/chrInputs/{chromosome}.fa.gz.gzi"
@@ -149,46 +254,45 @@ rule graph_squeeze:
rule graph_stats:
# Using odgi to produce stats on every chromosome scale graph
input:
"data/chrGraphs/graphsList.txt"
graph="data/chrGraphs/{chromosome}.gfa"
output:
directory("output/stats")
stats="output/chrGraphs.stats/{chromosome}.stats.tsv"
threads: 4
params:
apppath=config['app.path']
run:
shell("mkdir {output}")
# Getting the list of graphs
with open(input[0]) as handle:
graphList = [graph.rstrip("\n") for graph in handle.readlines()]
# Iterating over graphs
for graph in graphList:
shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif stats -S -t {threads} -P -i {graph} > {output}/$(basename {graph} .gfa).stats.tsv")
shell:
"""
apptainer run --app odgi {params.apppath}/PanGeTools.sif stats \
-S -t {threads} -P -i {input.graph} > {output.stats}
"""
rule graph_figs:
# Creating figures using odgi viz
input:
"data/chrGraphs/graphsList.txt"
graph="data/chrGraphs/{chromosome}.gfa"
output:
directory("output/figures")
oneDviz="output/chrGraphs.figs/{chromosome}.1Dviz.png",
pcov="output/chrGraphs.figs/{chromosome}.pcov.png"
threads: 4
params:
apppath=config['app.path'],
oneDviz=config['odgi.1Dviz.params'],
pcov=config['odgi.pcov.params']
run:
shell("mkdir {output}")
# Getting the list of graphs
with open(input[0]) as handle:
graphList = [graph.rstrip("\n") for graph in handle.readlines()]
# Iterating over graphs
for graph in graphList:
shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).1Dviz.png {params.oneDviz} -t {threads} -P")
shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).pcov.png {params.pcov} -t {threads} -P")
shell:
"""
## 1D viz
apptainer run --app odgi {params.apppath}/PanGeTools.sif \
viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P
rule aggregate_stats:
# Reading and merging all stats files into a final one called aggregatedStats.tsv
## Pcov viz
apptainer run --app odgi {params.apppath}/PanGeTools.sif \
viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P
"""
rule aggregate_graphs_stats:
# Reading and merging all stats files from chromosome graphs into a single TSV.
input:
"output/stats/"
expand("output/chrGraphs.stats/{chromosome}.stats.tsv", chromosome=CHRLIST)
output:
"output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
params:
@@ -196,29 +300,28 @@ rule aggregate_stats:
panname=config['name']
shell:
"""
apptainer run {params.apppath}/pan1c-env.sif python scripts/statsAggregation.py \
--input {input} --output {output} --panname {params.panname}
apptainer run {params.apppath}/pan1c-env.sif python scripts/chrStatsAggregation.py \
--input $(dirname {input[0]}) --output {output} --panname {params.panname}
"""
rule get_pav:
# Create PAV matrix readable by panache for a given chromosome scale graph
rule gfaTagR:
# Add metadata to the final GFA
input:
"data/chrGraphs/graphsList.txt"
graph="output/pan1c.pggb."+config['name']+".gfa",
output:
directory("output/pav.matrices")
threads: 16
"output/pan1c.pggb."+config['name']+".gfa.metadata"
params:
apppath=config['app.path']
run:
shell("mkdir {output}")
# Getting the list of graphs
with open(input[0]) as handle:
graphList = [graph.rstrip("\n") for graph in handle.readlines()]
# Iterating over graphs
for graph in graphList:
shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}")
apppath=config['app.path'],
panname=config['name']
shell:
"""
python scripts/getTags.py \
--appdir {params.apppath} --config-file config.yaml > {output}
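# sed's 'r' command reads {output} and inserts its content after each line matching /^H/,
# i.e. right below the GFA header line (assuming a single H line in the graph).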
sed -i '/^H/r {output}' {input.graph}
"""
rule pggb_log_compression:
# Compress the pggb logs into a tarball (a single file to load in subsequent steps)
input:
flag="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
output:
@@ -231,6 +334,7 @@ rule pggb_log_compression:
"""
rule pggb_input_stats:
# Produces statistics on pggb input sequences
input:
flag="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
output:
@@ -244,33 +348,40 @@ rule pggb_input_stats:
-f data/chrInputs/*.fa.gz -o {output} -p {params.panname}
"""
rule workflow_statistics:
rule core_statistics:
# Combines all statistics from the input of pggb to their respective graphs
input:
tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz",
chrInputStats="data/chrInputs/pan1c.pggb."+config['name']+".chrInput.stats.tsv",
chrGraphStats="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
output:
tsv="output/pan1c.pggb."+config['name']+".workflow.stats.tsv",
tsv="output/pan1c.pggb."+config['name']+".core.stats.tsv",
dir=directory("output/pggb.usage.figs")
params:
apppath=config['app.path']
shell:
"""
mkdir -p {output.dir}
apptainer run {params.apppath}/pan1c-env.sif python scripts/workflowStats.py \
apptainer run {params.apppath}/pan1c-env.sif python scripts/coreStats.py \
-i {input.tar} -c {input.chrInputStats} -g {input.chrGraphStats} -o {output.tsv} -f {output.dir}
"""
rule panacus_stats:
"""
Post-processing section :
At this point, the graphs for each chromosome have been built, along with some basic statistics.
This section produces additional, more specific statistics that require dedicated tools (Panacus, PAV matrices for Panache, ...).
It also contains rules that use the graph itself.
"""
rule get_pav:
# Create PAV matrix readable by panache for a given chromosome scale graph
input:
"data/chrGraphs/graphsList.txt"
output:
directory("output/panacus.reports")
directory("output/pav.matrices")
threads: 16
params:
apppath=config['app.path'],
panname=config['name'],
refname=config['reference']
threads: 8
apppath=config['app.path']
run:
shell("mkdir {output}")
# Getting the list of graphs
@@ -278,4 +389,26 @@ rule panacus_stats:
graphList = [graph.rstrip("\n") for graph in handle.readlines()]
# Iterating over graphs
for graph in graphList:
shell("bash scripts/getPanacusHG.sh -g {graph} -r $(basename {params.refname} .fa.gz) -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).histgrowth.html -a {params.apppath} -t {threads}")
shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}")
rule panacus_stats:
# Produces panacus reports for a chromosome graph
input:
graph="data/chrGraphs/{chromosome}.gfa"
output:
html="output/panacus.reports/{chromosome}.histgrowth.html"
params:
apppath=config['app.path'],
panname=config['name'],
refname=config['reference']
threads: 8
shell:
"""
bash scripts/getPanacusHG.sh \
-g {input.graph} \
-r $(basename {params.refname} .fa.gz) \
-d data/chrGraphs/$(basename {input.graph} .gfa) \
-o {output.html} \
-a {params.apppath} \
-t {threads}
"""
@@ -4,4 +4,9 @@ app.path: '/home/amergez/work/apptainer'
pggb.params: '-X --skip-viz'
odgi.1Dviz.params: '-x 500 -b'
odgi.pcov.params: '-x 500 -O'
## Control over optional parts of the workflow
# get_PAV takes longer as the number of haplotypes grows (all-vs-all comparison)
get_PAV: 'False'
# get_allASM_SyRI controls whether the SyRI figure displaying all haplotypes should be created (slower with more haplotypes)
get_allASM_SyRI: 'True'
@@ -4,4 +4,8 @@ app.path: 'appimgs/'
pggb.params: '-X --skip-viz'
odgi.1Dviz.params: '-x 500 -b'
odgi.pcov.params: '-x 500 -O'
## Control over optional parts of the workflow
# get_PAV takes longer as the number of haplotypes grows (all-vs-all comparison)
get_PAV: 'False'
# get_allASM_SyRI controls if the SyRI figure with all haplotypes diplayed should be created (longer with #haplotypes)
get_allASM_SyRI: 'True'
"""
Statistics figures script for Pan1c workflow
Given one or more workflow statistics files (TSV), the script aggregates them into
a single table and produces regression figures (e.g. peak memory usage versus input sequence length).
@author: alexis.mergez@inrae.fr
@version: 1.0
"""
from Bio import SeqIO
import numpy as np
import pandas as pd
import argparse
import gzip
import os
import seaborn as sns
import matplotlib.pyplot as plt
## Arguments
arg_parser = argparse.ArgumentParser(description='Pan1c workflow statistics figures')
arg_parser.add_argument(
"--statsfiles",
"-i",
nargs="+",
dest = "statsfiles",
required = True,
help = "Workflow statistics file(s)"
)
arg_parser.add_argument(
"--output",
"-o",
dest = "outdir",
required = True,
help = "Output directory"
)
arg_parser.add_argument(
"--debug",
"-d",
action="store_true",
dest = "debug",
help = "Show debug"
)
args = arg_parser.parse_args()
## Toolbox
def getRegEquation(x, y):
(slope, intercept), ssr, _1, _2, _3 = np.polyfit(x, y, 1, full = True)
ymean = np.mean(y)
sst = np.sum((y - ymean)**2)
r2 = (1 - (ssr/sst))[0]
return f'y = {slope:.2f}x + {intercept:.2f} (R2 : {r2:.2f})'
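# Example with hypothetical data: getRegEquation(np.array([1, 2, 3]), np.array([2.0, 4.1, 5.9]))
# returns 'y = 1.95x + 0.10 (R2 : 1.00)'.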
## Main script
dfList = []
for file in args.statsfiles:
dfList.append(
pd.read_csv(
file,
sep='\t',
)
)
df = pd.concat(dfList, ignore_index = True)
if args.debug: print(df)
df.to_csv("Final.tsv", sep='\t', index=False)
# Memory versus mean base count
sns.regplot(x=df["input.mean.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.mean.length"], y=df.mem)
plt.xlabel('Mean input sequences length (#bases)')
plt.ylabel('Peak memory usage (GB)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
plt.savefig("MemoryVSMeanSeqLength.png")
plt.close()
# Memory versus base count
sns.regplot(x=df["input.total.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df.mem)
plt.xlabel('Total input sequences length (#bases)')
plt.ylabel('Peak memory usage (GB)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
plt.savefig("MemoryVSSeqLength.png")
plt.close()
# PCA
if False:
excluded_columns = ["pangenome.name", "chrom.id", "time"]
columns = [
col
for col in df.columns.tolist()
if col not in excluded_columns
]
crdf = df.copy()
crdf[columns] = (df[columns]-df[columns].mean())/df[columns].std()
if args.debug: print(crdf)
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
newX = pca.fit_transform(crdf[columns])
pcaDF = pd.DataFrame(newX)
pcaDF.columns = ["D1", "D2", "D3"]
print(pcaDF)
finalDF = pd.concat([df[excluded_columns], pcaDF], axis = 1)
print(finalDF)
fig = sns.lmplot(x="D1", y="D2", hue="chrom.id", data = finalDF, fit_reg=False)
plt.savefig("Test.png")
\ No newline at end of file
"""
Input statistics script for Pan1c workflow
Given a fasta input made for pggb, computes stats regarding the complexity of the sequences
Given a fasta input made for pggb (data/chrInputs), it computes statistics.
@author: alexis.mergez@inrae.fr
@version: 1.0
@@ -47,48 +47,73 @@ arg_parser.add_argument(
)
args = arg_parser.parse_args()
## Toolbox
## Main script
"""
Sequence dictionary:
- key : (Chromosome name (from filename), sequence id)
- value : sequence
"""
seqDict = {}
# Parsing fasta files
for filename in args.fastafiles:
# Getting chromosome name from fasta filename
chrName = os.path.basename(filename).split(".fa.gz")[0]
# Reading .fa.gz file and adding records to seqDict
# Reading bgzip fasta file and adding records to seqDict
with gzip.open(filename, "rt") as handle:
# Parsing fasta sequences using SeqIO
sequences = SeqIO.parse(handle, "fasta")
# Adding sequence to the sequence dictionary
for record in sequences:
seqDict[(chrName, record.id)] = record.seq
# Reading available chromosomes from seqDict keys
chromList = np.unique([x for x, y in seqDict.keys()])
"""
Data dictionary:
- key : (pangenome name, chromosome id)
- value : statistics dictionary
"""
aggregatedStats = {}
# Iterating over available chromosomes
for chrom in chromList:
"""
Statistics dictionary (temporary):
- key : statistic name
- value : statistic value
"""
_stats = {
"input.#N": 0,
"input.total.length": 0
}
# Storing the length and computed N percentage of each sequence in dedicated lists
_lengths, _Nper = [], []
# Iterating over the sequences of the chrom fasta file in the sequence dictionary
for (chrName, seqid), seq in seqDict.items():
if chrName == chrom:
# Counting number of Ns
# Counting the number of Ns
_stats["input.#N"] += seq.count("N")
# Counting total bases
# Counting the total number of bases
_stats["input.total.length"] += len(seq)
# Saving length and N percentage
# Saving the length and the N percentage
_lengths.append(len(seq))
_Nper.append((seq.count("N")/len(seq)))
# Adding stats
_stats["input.mean.N%"] = np.mean(_Nper)*100
_stats["input.mean.length"] = np.mean(_lengths)
_stats["input.#sequences"] = len(_lengths)
#Computing L50
# Computing the L50 over the sequences of chrom
_lengths = sorted(_lengths, reverse = True)
halfTotal = _stats["input.total.length"]/2
cumulativeLength, L50 = 0, 0
@@ -101,13 +126,17 @@ for chrom in chromList:
_stats["input.L50"] = L50
# Adding stats to the corresponding chromosome in the data dictionary
aggregatedStats[(args.panname,chrom)] = _stats
if args.debug: print(aggregatedStats)
# Converting the data dictionary to a pandas dataframe
df = pd.DataFrame.from_dict(aggregatedStats, orient='index')
df.reset_index(inplace=True)
df.rename(columns={df.columns[0]: 'pangenome.name', df.columns[1]: 'chrom.id'}, inplace = True)
# Saving to TSV
df.to_csv(
args.output,
sep='\t',
......
"""
Statistics aggregator for Pan1c workflow
Chromosome statistics aggregator for Pan1c workflow
Aggregate chromosome level graph statistics into a single TSV.
......
"""
Workflow statistics script for Pan1c workflow.
Core statistics script for Pan1c workflow.
Given one or more tarballs containing time logs of the workflow's pggb rule,
the script returns an aggregated table as well as figures
@@ -114,7 +114,7 @@ for tarball in args.tarballs:
if args.debug: print(f"[timeStats::debug] Pangenome Name: {panname}\tChr: {chrid}\tTime: {time}\tCPU: {cpu}%\t memory: {memory}Gb")
# Adding to aggregatedData
aggregatedData[(panname,chrid)] = {"time": time, "cpu": cpu, "mem": memory}
aggregatedData[(panname,chrid)] = {"pggb.time": time, "pggb.cpu": cpu, "pggb.mem": memory}
## Parsing chromosome input file size (in #bases).
# Iterating over input length files
@@ -169,9 +169,9 @@ df = pd.DataFrame.from_dict(aggregatedData, orient='index')
df.reset_index(inplace=True)
df.rename(columns={'level_0': 'pangenome.name', 'level_1': 'chrom.id'}, inplace=True)
df.rename(columns=graphColDict, inplace = True)
df.time = pd.to_timedelta(df.time)
df.mem = df.mem.astype(float)
df.cpu = df.cpu.astype(int)
df["pggb.time"] = pd.to_timedelta(df["pggb.time"])
df["pggb.mem"] = df["pggb.mem"].astype(float)
df["pggb.cpu"] = df["pggb.cpu"].astype(int)
df["input.total.length"] = df["input.total.length"].astype(int)
if args.debug: print("[timeStats::debug]", df.dtypes)
@@ -181,8 +181,8 @@ df.to_csv(args.output, sep='\t', index=False)
# Creating some figures
# Time versus base count
sns.regplot(x=df["input.total.length"], y=df.time.dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df.time.dt.total_seconds())
sns.regplot(x=df["input.total.length"], y=df["pggb.time"].dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df["pggb.time"].dt.total_seconds())
plt.xlabel('Total input sequences length (#bases)')
plt.ylabel('Graph creation time (s)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
@@ -190,8 +190,8 @@ plt.savefig(os.path.join(args.figdir,"TimeVSSeqLength.png"))
plt.close()
# Memory versus base count
sns.regplot(x=df["input.total.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df.mem)
sns.regplot(x=df["input.total.length"], y=df["pggb.mem"], line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df["pggb.mem"])
plt.xlabel('Total input sequences length (#bases)')
plt.ylabel('Peak memory usage (GB)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
@@ -199,8 +199,8 @@ plt.savefig(os.path.join(args.figdir,"MemoryVSSeqLength.png"))
plt.close()
# CPU versus base count
sns.regplot(x=df["input.total.length"], y=df.cpu, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df.cpu)
sns.regplot(x=df["input.total.length"], y=df["pggb.cpu"], line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df["pggb.cpu"])
plt.xlabel('Total input sequences length (#bases)')
plt.ylabel('CPU usage (%)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
@@ -208,15 +208,10 @@ plt.savefig(os.path.join(args.figdir,"CpuVSSeqLength.png"))
plt.close()
# Memory versus CPU
sns.regplot(x=df.cpu, y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df.cpu, y=df.mem)
sns.regplot(x=df["pggb.cpu"], y=df["pggb.mem"], line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["pggb.cpu"], y=df["pggb.mem"])
plt.xlabel('CPU usage (%)')
plt.ylabel('Peak memory usage (GB)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
plt.savefig(os.path.join(args.figdir,"MemoryVSCpu.png"))
plt.close()
#!/bin/bash
# Takes a GFA and returns a Panacus report
# Create an HTML panacus report from a given GFA
# @author: alexis.mergez@inrae.fr
# Initializing arguments
gfa="" # GFA path
@@ -27,14 +28,11 @@ done
chrname=$(basename ${gfa} .gfa)
ref=$(echo $refname | sed 's/.hap/#/')
# Getting paths in chromosome graph
echo "[getPanacusHG::odgi::paths] Running on ${gfa}"
apptainer run --app odgi "${appdir}/PanGeTools.sif" paths -i ${gfa} -L | grep -ve "$ref" > ${chrdir}/$chrname.paths.noref.txt
echo "[getPanacusHG::paths] Running on ${gfa}"
grep '^P' $gfa | cut -f2 | grep -ve "$ref" > ${chrdir}/$chrname.paths.noref.txt
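# In a GFA, 'P' lines store the path name in column 2; paths matching the reference
# are excluded so the growth curve is computed on non-reference paths only.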
# Running Panacus
echo "[getPanacusHG::panacus] Running on ${gfa}"
apptainer run --app panacus $appdir/PanGeTools.sif histgrowth \
-t $threads -l 1,2,1,1,1 -q 0,0,1,0.5,0.1 -S -a -s ${chrdir}/$chrname.paths.noref.txt -c all -o html $gfa > ${output}
#!/bin/bash
# Create a Syri figure for the given genomes
# @author: alexis.mergez@inrae.fr
# Initializing arguments
ref="" # Reference fasta
qry="" # Queries fasta
appdir="" # Directory containing apptainer images
threads=""
wrkdir="" # Working directory (directory used by pggb to store step files like .paf, etc...)
output="" # Output Syri figure(s)
## Getting arguments
while getopts "r:q:a:t:d:o:" option; do
case "$option" in
r) ref="$OPTARG";;
q) qry="$OPTARG";;
a) appdir="$OPTARG";;
t) threads="$OPTARG";;
d) wrkdir="$OPTARG";;
o) output="$OPTARG";;
\?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output]" >&2
exit 1;;
esac
done
## Main script
# Reading the query argument and creating an array containing the paths to the query fasta files
IFS=' ' read -r -a temp <<< "$qry"
asmList=("$ref")
# Ordering the array so that the reference comes first
for item in "${temp[@]}"; do
if [[ $item != "$ref" ]]; then
asmList+=($item)
fi
done
# Array to store the created syri files
syriFileList=()
# Iterating pairwise, with overlap, over the array of fasta files
for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
# Setting filepaths for later
bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam"
syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out"
# Adding the output syri file to the array
syriFileList+=($syriFile)
# Minimap2 genome vs genome alignment
apptainer run --app minimap2 $appdir/PanGeTools.sif \
-ax asm5 --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads | \
apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads - > $wrkdir/$bamFile
# Syri on previous alignment
apptainer run $appdir/pan1c-env.sif \
syri -c $wrkdir/$bamFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F B \
--nc $threads \
--dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)."
done
# Creating genomes.txt for plotsr; it maps each fasta file to a simple display name in the final figure
# Each line contains two columns: the fasta filepath and its display name
echo -e "#files\tname" > $wrkdir/genomes.txt
for asm in "${asmList[@]}"; do
echo -e "${asm}\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt
done
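# Example content of genomes.txt (hypothetical assemblies):
# #files	name
# data/hap.ragtagged/genomeA.hap1.ragtagged.fa.gz	genomeA
# data/hap.ragtagged/genomeB.hap1.ragtagged.fa.gz	genomeB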
# Generating the plotsr command
command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H 16 -W 9 -d 600 "
# Adding syri files to the command, as each needs to be specified with the "--sr" argument
for file in "${syriFileList[@]}"; do
command+="--sr $wrkdir/$file "
done
# Running plotsr
apptainer run $appdir/pan1c-env.sif plotsr \
$command
"""
Tag list creation script for Pan1c workflow
Returns a list of tags to add at the top of the final gfa file as comment lines.
@author: alexis.mergez@inrae.fr
@version: 1.0
"""
import argparse
import subprocess
import os
import json
## Arguments
arg_parser = argparse.ArgumentParser(description='Tag list creation script')
arg_parser.add_argument(
"--appdir",
"-a",
dest = "appdir",
required = True,
help = "Apptainer images directory"
)
arg_parser.add_argument(
"--config-file",
"-c",
dest = "config",
required = True,
help = "Pan1c config file"
)
args = arg_parser.parse_args()
## Main script
"""
Tags dictionary:
- key : Main tool / apptainer image
- value : dictionnary of tags
"""
tags = {}
### Pan1c-workflow section
tags["Pan1c"] = {}
# Using git to get the version of the Pan1c workflow
_output = subprocess.run(
["git", "describe", "--tags"],
capture_output=True,
text=True,
).stdout[:-1]
# Getting the pggb command used in the workflow from the config file
# ToDo : Get the command used from the pggb command logs !
with open(args.config, 'r') as handle:
pggbCmd = [line[:-1] for line in handle.readlines() if "pggb.params" in line][0].split(': ')[-1]
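# e.g. the config line "pggb.params: '-X --skip-viz'" yields pggbCmd = "'-X --skip-viz'"
# (quotes included, since only the text after ': ' is kept).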
# Adding tags
tags["Pan1c"]["pan1c.version"] = _output
tags["Pan1c"]["pan1c.home"] = "https://forgemia.inra.fr/alexis.mergez/pan1c"
tags["Pan1c"]["pan1c.pggb.args"] = pggbCmd
### PanGeTools section
tags["pangetools"] = {}
# Reading the app versions from the apptainer image labels
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/PanGeTools.sif"],
capture_output=True,
text=True
).stdout
_output = json.loads(_output)
labels = _output['data']['attributes']['labels']
tags["pangetools"]["image.version"] = labels['Version']
tags["pangetools"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionary
for key in labels.keys():
if ".Version" in key:
tags["pangetools"][key.lower()] = labels[key]
### PGGB image section
tags["pggb"] = {}
# Reading the app versions from the apptainer image labels
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/pggb.sif"],
capture_output=True,
text=True
).stdout
_output = json.loads(_output)
labels = _output['data']['attributes']['labels']
tags["pggb"]["image.version"] = labels['Version']
tags["pggb"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionary
for key in labels.keys():
if ".Version" in key:
tags["pggb"][key.lower()] = labels[key]
### Pan1c-Env section
tags["pan1c-env"] = {}
# Reading the app versions from the apptainer image labels
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-env.sif"],
capture_output=True,
text=True
).stdout
_output = json.loads(_output)
labels = _output['data']['attributes']['labels']
tags["pan1c-env"]["image.version"] = labels['Version']
tags["pan1c-env"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionary
for key in labels.keys():
if ".Version" in key:
tags["pan1c-env"][key.lower()] = labels[key]
## Pan1c-Box section
tags["pan1c-box"] = {}
# Reading the app versions from the apptainer image labels
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-box.sif"],
capture_output=True,
text=True
).stdout
_output = json.loads(_output)
labels = _output['data']['attributes']['labels']
tags["pan1c-box"]["image.version"] = labels['Version']
tags["pan1c-box"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionary
for key in labels.keys():
if ".Version" in key:
tags["pan1c-box"][key.lower()] = labels[key]
## Exporting tags to stdout
print("#\tThis graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)\n#")
print("#\tTool versions and commands\n#")
for first_elem in tags.keys():
print(f'#\t-- {first_elem} --')
for label in tags[first_elem].keys():
print(f"#\t{first_elem}\t{label}: {tags[first_elem][label]}")
print('#')
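# The emitted metadata lines look like this (illustrative values):
# #	-- Pan1c --
# #	Pan1c	pan1c.version: v1.6
# #	Pan1c	pan1c.home: https://forgemia.inra.fr/alexis.mergez/pan1c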
@@ -10,6 +10,7 @@ Each output fasta contains sequences related to one chromosome only.
"""
from Bio import SeqIO
from Bio.SeqIO import FastaIO
import numpy as np
import argparse
import gzip
@@ -86,4 +87,5 @@ if args.debug: print(chrSeq.keys())
# Writing chromosome specific fasta file
for chrName in chrSeq.keys():
with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle:
SeqIO.write(chrSeq[chrName], output_handle, "fasta")
\ No newline at end of file
fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
fasta_out.write_file(chrSeq[chrName])
\ No newline at end of file
"""
Input statistics script for Pan1c workflow
This script produces statistics of the input haplotypes, such as kmer counts and lengths.
@author: alexis.mergez@inrae.fr
@version: 1.0
"""
from Bio import SeqIO
import numpy as np
import argparse
import gzip
import os
## Arguments
arg_parser = argparse.ArgumentParser(description='Pan1c input haplotype statistics')
arg_parser.add_argument(
"--fasta",
"-f",
dest = "fastafile",
required = True,
help = "Fasta file"
)
arg_parser.add_argument(
"--output",
"-o",
dest = "outdir",
required = True,
help = "Output directory"
)
arg_parser.add_argument(
"--debug",
"-d",
action="store_true",
dest = "debug",
help = "Show debug"
)
args = arg_parser.parse_args()
## Toolbox
def getChr(name):
"""
Simply returns the chromosome name of a given fasta record
"""
return name.split('#')[-1]
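# e.g. getChr("genomeA.hap1#chr01") returns "chr01" (hypothetical record id).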
## Main script
seqDict = {}
# Parsing fasta files
for filename in args.fastafiles:
# Reading .fa.gz file and adding records to seqDict
with gzip.open(filename, "rt") as handle:
sequences = SeqIO.parse(handle, "fasta")
for record in sequences:
seqDict[record.id] = record
# Inferring the list of available chromosomes from the sequence record ids
chrList = np.unique([
getChr(recordid) for recordid in seqDict.keys()
])
# Clustering records based on the chromosome tag in record ids
chrSeq = {}
for recordid, record in seqDict.items():
# Getting the chromosome id
_chrname = getChr(recordid)
# If not encountered before, create a key for this chromosome in chrSeq
if _chrname not in list(chrSeq.keys()):
chrSeq[_chrname] = []
# Add the record to the chromosome bin in chrSeq
chrSeq[_chrname].append(record)
# Debug purpose print
if args.debug: print(chrSeq.keys())
# Writing chromosome specific fasta file
for chrName in chrSeq.keys():
with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle:
SeqIO.write(chrSeq[chrName], output_handle, "fasta")
\ No newline at end of file