Commit 59bab410 authored by Alexis Mergez

Merge branch 'dev' into 'main'

v1.6

See merge request !11
parents 64663c72 bcbd5705
Pipeline #178844 passed with warnings
@@ -5,40 +5,50 @@ import numpy as np
import gzip
import re
# Getting the list of haplotypes
# Getting the list of haplotypes (SAMPLES)
SAMPLES = np.unique([
os.path.basename(f).split('.fa')[0]
for f in os.listdir("data/haplotypes/")
if re.search(r".fa", f)
])
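# For example (hypothetical filenames), "genomeA.hap1.fa.gz" and "genomeA.hap2.fa.gz"
# yield the sample names "genomeA.hap1" and "genomeA.hap2" via split('.fa')[0].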
SAMPLES_NOREF = [
sample
for sample in SAMPLES
if sample != os.path.basename(config['reference']).split('.fa')[0]
]
nHAP = len(SAMPLES)
# Getting the list of chromosomes
with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
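# Assuming reference headers follow the '<haplotype>#<chromosome>' convention,
# a header such as '>genomeA.hap1#chr01' (hypothetical) yields the chromosome id 'chr01'.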
def which_post_analysis():
## Simple function to configure which parts of the workflow need to be run
post_analysis_inputs = [ # Default post analysis steps
"output/pan1c.pggb."+config['name']+".workflow.stats.tsv",
"output/figures",
"output/panacus.reports"
# Configuring steps to be done
def which_analysis():
# Creating a list with default analysis steps (to prevent the function from returning an empty list)
analysis_inputs = [
"output/pan1c.pggb."+config['name']+".core.stats.tsv", # core stats
expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth
expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), # syri for haplotypes
expand("output/chrGraphs.figs/{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs
"output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv" # chromosomes graph statistics
]
# Optional post-analysis steps
# Optional analysis steps
if config["get_PAV"] == "True":
post_analysis_inputs.append("output/pav.matrices")
analysis_inputs.append("output/pav.matrices")
if config["get_allASM_SyRI"] == "True":
analysis_inputs.append("output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png")
return post_analysis_inputs
return analysis_inputs
rule all:
input:
"output/pan1c.pggb."+config['name']+".gfa",
"output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv",
which_post_analysis()
"output/pan1c.pggb."+config['name']+".gfa", # Final graph (main output)
"output/pan1c.pggb."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
which_analysis()
rule samtools_index:
# Using samtools faidx to index compressed fasta
# Samtools faidx to index compressed fasta
input:
fa="{sample}.fa.gz"
output:
@@ -51,6 +61,11 @@ rule samtools_index:
"apptainer run --app samtools {params.apppath}/PanGeTools.sif "
"faidx {input.fa}"
"""
Pre-processing section
Preparing pggb inputs
"""
rule ragtag_scaffolding:
# Scaffold input haplotype against the reference to infer chromosome scale sequences
input:
@@ -74,8 +89,60 @@ rule ragtag_scaffolding:
-o {output}
"""
rule create_allAsm_syri_fig:
# Create a SyRI figure containing all input assemblies
input:
ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
output:
fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png",
wrkdir=directory('data/asm.syri/all')
threads: 8
params:
apppath=config["app.path"]
shell:
"""
mkdir {output.wrkdir}
bash scripts/getSyriFigs.sh \
-a {params.apppath} \
-t {threads} \
-d {output.wrkdir} \
-o $(basename {output.fig}) \
-r {input.ref} \
-q "{input.qry}"
mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
"""
rule create_sglAsm_syri_fig:
# Create a SyRI figure for a single input assembly
input:
ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
output:
fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png",
wrkdir=directory('data/asm.syri/{haplotype}')
threads: 4
params:
apppath=config["app.path"]
shell:
"""
mkdir -p {output.wrkdir}
bash scripts/getSyriFigs.sh \
-a {params.apppath} \
-t {threads} \
-d {output.wrkdir} \
-o $(basename {output.fig}) \
-r {input.ref} \
-q "{input.qry}"
mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
"""
"""
Core section
"""
rule clustering:
# Read alignment file to create bins for each chromosome
# Read ragtagged fastas and split chromosome sequences into corresponding bins
input:
expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
output:
@@ -93,8 +160,46 @@ rule clustering:
done
"""
rule create_pggb_input_syri_fig:
input:
fasta='data/chrInputs/{chromosome}.fa.gz'
output:
fig="output/chrInput.syri.figs/{chromosome}."+config['name']+".asm.syri.png",
wrkdir=directory('data/chrInput/syri/{chromosome}')
threads: 8
params:
apppath=config["app.path"],
ref=config['reference']
shell:
"""
mkdir {output.wrkdir}
refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1)
## Creating single fasta from multifasta
zcat {input.fasta} | awk -F"#" \
'/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
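# Each record lands in its own file named after the header text past '>', e.g. a
# hypothetical header '>genomeA.hap1#chr01' becomes genomeA.hap1#chr01.fa in the work directory.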
## Getting the list of sequences
asmList=()
for file in {output.wrkdir}/*.fa; do
asm="$(basename $file .fa | cut -f1 -d"#").fa"
mv $file "$(dirname $file)/$asm"
asmList+=("$asm")
done
bash scripts/getSyriFigs.sh \
-a {params.apppath} \
-t {threads} \
-d {output.wrkdir} \
-o $(basename {output.fig}) \
-r {output.wrkdir}/"${{refname}}.fa" \
-q "${{asmList[@]}}"
mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
rm {output.wrkdir}/*.fa
"""
rule pggb_on_chr:
# Runs pggb on a specific chromosome
# Run pggb on a specific chromosome
input:
fa="data/chrInputs/{chromosome}.fa.gz",
gzi="data/chrInputs/{chromosome}.fa.gz.gzi"
@@ -149,46 +254,45 @@ rule graph_squeeze:
rule graph_stats:
# Using odgi to produce stats on every chromosome scale graph
input:
"data/chrGraphs/graphsList.txt"
graph="data/chrGraphs/{chromosome}.gfa"
output:
directory("output/stats")
stats="output/chrGraphs.stats/{chromosome}.stats.tsv"
threads: 4
params:
apppath=config['app.path']
run:
shell("mkdir {output}")
# Getting the list of graphs
with open(input[0]) as handle:
graphList = [graph.rstrip("\n") for graph in handle.readlines()]
# Iterating over graphs
for graph in graphList:
shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif stats -S -t {threads} -P -i {graph} > {output}/$(basename {graph} .gfa).stats.tsv")
shell:
"""
apptainer run --app odgi {params.apppath}/PanGeTools.sif stats \
-S -t {threads} -P -i {input.graph} > {output.stats}
"""
rule graph_figs:
# Creating figures using odgi viz
input:
"data/chrGraphs/graphsList.txt"
graph="data/chrGraphs/{chromosome}.gfa"
output:
directory("output/figures")
oneDviz="output/chrGraphs.figs/{chromosome}.1Dviz.png",
pcov="output/chrGraphs.figs/{chromosome}.pcov.png"
threads: 4
params:
apppath=config['app.path'],
oneDviz=config['odgi.1Dviz.params'],
pcov=config['odgi.pcov.params']
run:
shell("mkdir {output}")
# Getting the list of graphs
with open(input[0]) as handle:
graphList = [graph.rstrip("\n") for graph in handle.readlines()]
# Iterating over graphs
for graph in graphList:
shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).1Dviz.png {params.oneDviz} -t {threads} -P")
shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).pcov.png {params.pcov} -t {threads} -P")
shell:
"""
## 1D viz
apptainer run --app odgi {params.apppath}/PanGeTools.sif \
viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P
rule aggregate_stats:
# Reading and merging all stats files into a final one called aggregatedStats.tsv
## Pcov viz
apptainer run --app odgi {params.apppath}/PanGeTools.sif \
viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P
"""
rule aggregate_graphs_stats:
# Reading and merging all stats files from chromosome graphs into a single TSV.
input:
"output/stats/"
expand("output/chrGraphs.stats/{chromosome}.stats.tsv", chromosome=CHRLIST)
output:
"output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
params:
@@ -196,29 +300,28 @@ rule aggregate_stats:
panname=config['name']
shell:
"""
apptainer run {params.apppath}/pan1c-env.sif python scripts/statsAggregation.py \
--input {input} --output {output} --panname {params.panname}
apptainer run {params.apppath}/pan1c-env.sif python scripts/chrStatsAggregation.py \
--input $(dirname {input[0]}) --output {output} --panname {params.panname}
"""
rule get_pav:
# Create PAV matrix readable by panache for a given chromosome scale graph
rule gfaTagR:
# Add metadata to the final GFA
input:
"data/chrGraphs/graphsList.txt"
graph="output/pan1c.pggb."+config['name']+".gfa",
output:
directory("output/pav.matrices")
threads: 16
"output/pan1c.pggb."+config['name']+".gfa.metadata"
params:
apppath=config['app.path']
run:
shell("mkdir {output}")
# Getting the list of graphs
with open(input[0]) as handle:
graphList = [graph.rstrip("\n") for graph in handle.readlines()]
# Iterating over graphs
for graph in graphList:
shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}")
apppath=config['app.path'],
panname=config['name']
shell:
"""
python scripts/getTags.py \
--appdir {params.apppath} --config-file config.yaml > {output}
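# sed's 'r' command reads {output} and inserts its content after each line matching /^H/,
# i.e. right below the GFA header line (assuming a single H line in the graph).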
sed -i '/^H/r {output}' {input.graph}
"""
rule pggb_log_compression:
# Compress the pggb logs into a tarball (a single file to load in subsequent steps)
input:
flag="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
output:
@@ -231,6 +334,7 @@ rule pggb_log_compression:
"""
rule pggb_input_stats:
# Produces statistics on pggb input sequences
input:
flag="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
output:
@@ -244,33 +348,40 @@ rule pggb_input_stats:
-f data/chrInputs/*.fa.gz -o {output} -p {params.panname}
"""
rule workflow_statistics:
rule core_statistics:
# Combines all statistics from the input of pggb to their respective graphs
input:
tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz",
chrInputStats="data/chrInputs/pan1c.pggb."+config['name']+".chrInput.stats.tsv",
chrGraphStats="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
output:
tsv="output/pan1c.pggb."+config['name']+".workflow.stats.tsv",
tsv="output/pan1c.pggb."+config['name']+".core.stats.tsv",
dir=directory("output/pggb.usage.figs")
params:
apppath=config['app.path']
shell:
"""
mkdir -p {output.dir}
apptainer run {params.apppath}/pan1c-env.sif python scripts/workflowStats.py \
apptainer run {params.apppath}/pan1c-env.sif python scripts/coreStats.py \
-i {input.tar} -c {input.chrInputStats} -g {input.chrGraphStats} -o {output.tsv} -f {output.dir}
"""
rule panacus_stats:
"""
Post-processing section :
At this point, the graphs for each chromosome have been built, along with some basic statistics.
This section produces additional, more specific statistics that require dedicated tools (Panacus, PAV matrices for Panache, ...).
It also contains rules that use the graph itself.
"""
rule get_pav:
# Create PAV matrix readable by panache for a given chromosome scale graph
input:
"data/chrGraphs/graphsList.txt"
output:
directory("output/panacus.reports")
directory("output/pav.matrices")
threads: 16
params:
apppath=config['app.path'],
panname=config['name'],
refname=config['reference']
threads: 8
apppath=config['app.path']
run:
shell("mkdir {output}")
# Getting the list of graphs
@@ -278,4 +389,26 @@ rule panacus_stats:
graphList = [graph.rstrip("\n") for graph in handle.readlines()]
# Iterating over graphs
for graph in graphList:
shell("bash scripts/getPanacusHG.sh -g {graph} -r $(basename {params.refname} .fa.gz) -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).histgrowth.html -a {params.apppath} -t {threads}")
shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}")
rule panacus_stats:
# Produces panacus reports for a chromosome graph
input:
graph="data/chrGraphs/{chromosome}.gfa"
output:
html="output/panacus.reports/{chromosome}.histgrowth.html"
params:
apppath=config['app.path'],
panname=config['name'],
refname=config['reference']
threads: 8
shell:
"""
bash scripts/getPanacusHG.sh \
-g {input.graph} \
-r $(basename {params.refname} .fa.gz) \
-d data/chrGraphs/$(basename {input.graph} .gfa) \
-o {output.html} \
-a {params.apppath} \
-t {threads}
"""
@@ -4,4 +4,9 @@ app.path: '/home/amergez/work/apptainer'
pggb.params: '-X --skip-viz'
odgi.1Dviz.params: '-x 500 -b'
odgi.pcov.params: '-x 500 -O'
## Control over optional parts of the workflow
# get_PAV takes longer as the number of haplotypes grows (all-vs-all comparison)
get_PAV: 'False'
# get_allASM_SyRI controls whether the SyRI figure displaying all haplotypes should be created (slower with more haplotypes)
get_allASM_SyRI: 'True'
@@ -4,4 +4,8 @@ app.path: 'appimgs/'
pggb.params: '-X --skip-viz'
odgi.1Dviz.params: '-x 500 -b'
odgi.pcov.params: '-x 500 -O'
## Control over optional parts of the workflow
# get_PAV takes longer as the number of haplotypes grows (all-vs-all comparison)
get_PAV: 'False'
# get_allASM_SyRI controls if the SyRI figure with all haplotypes diplayed should be created (longer with #haplotypes)
get_allASM_SyRI: 'True'
"""
Statistics figures script for Pan1c workflow
Given one or more workflow statistics files (TSV), the script aggregates them into
a single table and produces regression figures (e.g. peak memory usage versus input sequence length).
@author: alexis.mergez@inrae.fr
@version: 1.0
"""
from Bio import SeqIO
import numpy as np
import pandas as pd
import argparse
import gzip
import os
import seaborn as sns
import matplotlib.pyplot as plt
## Arguments
arg_parser = argparse.ArgumentParser(description='Pan1c workflow statistics figures')
arg_parser.add_argument(
"--statsfiles",
"-i",
nargs="+",
dest = "statsfiles",
required = True,
help = "Workflow statistics file(s)"
)
arg_parser.add_argument(
"--output",
"-o",
dest = "outdir",
required = True,
help = "Output directory"
)
arg_parser.add_argument(
"--debug",
"-d",
action="store_true",
dest = "debug",
help = "Show debug"
)
args = arg_parser.parse_args()
## Toolbox
def getRegEquation(x, y):
(slope, intercept), ssr, _1, _2, _3 = np.polyfit(x, y, 1, full = True)
ymean = np.mean(y)
sst = np.sum((y - ymean)**2)
r2 = (1 - (ssr/sst))[0]
return f'y = {slope:.2f}x + {intercept:.2f} (R2 : {r2:.2f})'
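# Example with hypothetical data: getRegEquation(np.array([1, 2, 3]), np.array([2.0, 4.1, 5.9]))
# returns 'y = 1.95x + 0.10 (R2 : 1.00)'.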
## Main script
dfList = []
for file in args.statsfiles:
dfList.append(
pd.read_csv(
file,
sep='\t',
)
)
df = pd.concat(dfList, ignore_index = True)
if args.debug: print(df)
df.to_csv("Final.tsv", sep='\t', index=False)
# Memory versus mean base count
sns.regplot(x=df["input.mean.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.mean.length"], y=df.mem)
plt.xlabel('Mean input sequences length (#bases)')
plt.ylabel('Peak memory usage (GB)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
plt.savefig("MemoryVSMeanSeqLength.png")
plt.close()
# Memory versus base count
sns.regplot(x=df["input.total.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df.mem)
plt.xlabel('Total input sequences length (#bases)')
plt.ylabel('Peak memory usage (GB)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
plt.savefig("MemoryVSSeqLength.png")
plt.close()
# PCA
if False:
excluded_columns = ["pangenome.name", "chrom.id", "time"]
columns = [
col
for col in df.columns.tolist()
if col not in excluded_columns
]
crdf = df.copy()
crdf[columns] = (df[columns]-df[columns].mean())/df[columns].std()
if args.debug: print(crdf)
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
newX = pca.fit_transform(crdf[columns])
pcaDF = pd.DataFrame(newX)
pcaDF.columns = ["D1", "D2", "D3"]
print(pcaDF)
finalDF = pd.concat([df[excluded_columns], pcaDF], axis = 1)
print(finalDF)
fig = sns.lmplot(x="D1", y="D2", hue="chrom.id", data = finalDF, fit_reg=False)
plt.savefig("Test.png")
\ No newline at end of file
"""
Input statistics script for Pan1c workflow
Given a fasta input made for pggb, computes stats regarding the complexity of the sequences
Given a fasta input made for pggb (data/chrInputs), it computes statistics.
@author: alexis.mergez@inrae.fr
@version: 1.0
@@ -47,48 +47,73 @@ arg_parser.add_argument(
)
args = arg_parser.parse_args()
## Toolbox
## Main script
"""
Sequence dictionary:
- key : (Chromosome name (from filename), sequence id)
- value : sequence
"""
seqDict = {}
# Parsing fasta files
for filename in args.fastafiles:
# Getting chromosome name from fasta filename
chrName = os.path.basename(filename).split(".fa.gz")[0]
# Reading .fa.gz file and adding records to seqDict
# Reading bgzip fasta file and adding records to seqDict
with gzip.open(filename, "rt") as handle:
# Parsing fasta sequences using SeqIO
sequences = SeqIO.parse(handle, "fasta")
# Adding sequence to the sequence dictionary
for record in sequences:
seqDict[(chrName, record.id)] = record.seq
# Reading available chromosomes from seqDict keys
chromList = np.unique([x for x, y in seqDict.keys()])
"""
Data dictionary:
- key : (pangenome name, chromosome id)
- value : statistics dictionary
"""
aggregatedStats = {}
# Iterating over available chromosomes
for chrom in chromList:
"""
Statistics dictionary (temporary):
- key : statistic name
- value : statistic value
"""
_stats = {
"input.#N": 0,
"input.total.length": 0
}
# Storing the length and computed N percentage of each sequence in dedicated lists
_lengths, _Nper = [], []
# Iterating over the sequences of the chrom fasta file in the sequence dictionary
for (chrName, seqid), seq in seqDict.items():
if chrName == chrom:
# Counting number of Ns
# Counting the number of Ns
_stats["input.#N"] += seq.count("N")
# Counting total bases
# Counting the total number of bases
_stats["input.total.length"] += len(seq)
# Saving length and N percentage
# Saving the length and the N percentage
_lengths.append(len(seq))
_Nper.append((seq.count("N")/len(seq)))
# Adding stats
_stats["input.mean.N%"] = np.mean(_Nper)*100
_stats["input.mean.length"] = np.mean(_lengths)
_stats["input.#sequences"] = len(_lengths)
#Computing L50
# Computing the L50 over the sequences of chrom
_lengths = sorted(_lengths, reverse = True)
halfTotal = _stats["input.total.length"]/2
cumulativeLength, L50 = 0, 0
@@ -101,13 +126,17 @@ for chrom in chromList:
_stats["input.L50"] = L50
# Adding stats to the corresponding chromosome in the data dictionary
aggregatedStats[(args.panname,chrom)] = _stats
if args.debug: print(aggregatedStats)
# Converting the data dictionary to a pandas dataframe
df = pd.DataFrame.from_dict(aggregatedStats, orient='index')
df.reset_index(inplace=True)
df.rename(columns={df.columns[0]: 'pangenome.name', df.columns[1]: 'chrom.id'}, inplace = True)
# Saving to TSV
df.to_csv(
args.output,
sep='\t',
......
"""
Statistics aggregator for Pan1c workflow
Chromosome statistics aggregator for Pan1c workflow
Aggregate chromosome level graph statistics into a single TSV.
......
"""
Workflow statistics script for Pan1c workflow.
Core statistics script for Pan1c workflow.
Given one or more tarballs containing time logs of the workflow's pggb rule,
the script returns an aggregated table as well as figures
@@ -114,7 +114,7 @@ for tarball in args.tarballs:
if args.debug: print(f"[timeStats::debug] Pangenome Name: {panname}\tChr: {chrid}\tTime: {time}\tCPU: {cpu}%\t memory: {memory}Gb")
# Adding to aggregatedData
aggregatedData[(panname,chrid)] = {"time": time, "cpu": cpu, "mem": memory}
aggregatedData[(panname,chrid)] = {"pggb.time": time, "pggb.cpu": cpu, "pggb.mem": memory}
## Parsing chromosome input file size (in #bases).
# Iterating over input length files
@@ -169,9 +169,9 @@ df = pd.DataFrame.from_dict(aggregatedData, orient='index')
df.reset_index(inplace=True)
df.rename(columns={'level_0': 'pangenome.name', 'level_1': 'chrom.id'}, inplace=True)
df.rename(columns=graphColDict, inplace = True)
df.time = pd.to_timedelta(df.time)
df.mem = df.mem.astype(float)
df.cpu = df.cpu.astype(int)
df["pggb.time"] = pd.to_timedelta(df["pggb.time"])
df["pggb.mem"] = df["pggb.mem"].astype(float)
df["pggb.cpu"] = df["pggb.cpu"].astype(int)
df["input.total.length"] = df["input.total.length"].astype(int)
if args.debug: print("[timeStats::debug]", df.dtypes)
@@ -181,8 +181,8 @@ df.to_csv(args.output, sep='\t', index=False)
# Creating some figures
# Time versus base count
sns.regplot(x=df["input.total.length"], y=df.time.dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df.time.dt.total_seconds())
sns.regplot(x=df["input.total.length"], y=df["pggb.time"].dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df["pggb.time"].dt.total_seconds())
plt.xlabel('Total input sequences length (#bases)')
plt.ylabel('Graph creation time (s)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
@@ -190,8 +190,8 @@ plt.savefig(os.path.join(args.figdir,"TimeVSSeqLength.png"))
plt.close()
# Memory versus base count
sns.regplot(x=df["input.total.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df.mem)
sns.regplot(x=df["input.total.length"], y=df["pggb.mem"], line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df["pggb.mem"])
plt.xlabel('Total input sequences length (#bases)')
plt.ylabel('Peak memory usage (GB)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
@@ -199,8 +199,8 @@ plt.savefig(os.path.join(args.figdir,"MemoryVSSeqLength.png"))
plt.close()
# CPU versus base count
sns.regplot(x=df["input.total.length"], y=df.cpu, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df.cpu)
sns.regplot(x=df["input.total.length"], y=df["pggb.cpu"], line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["input.total.length"], y=df["pggb.cpu"])
plt.xlabel('Total input sequences length (#bases)')
plt.ylabel('CPU usage (%)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
@@ -208,15 +208,10 @@ plt.savefig(os.path.join(args.figdir,"CpuVSSeqLength.png"))
plt.close()
# Memory versus CPU
sns.regplot(x=df.cpu, y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df.cpu, y=df.mem)
sns.regplot(x=df["pggb.cpu"], y=df["pggb.mem"], line_kws={"color":"r","alpha":0.7,"lw":5})
equation = getRegEquation(x=df["pggb.cpu"], y=df["pggb.mem"])
plt.xlabel('CPU usage (%)')
plt.ylabel('Peak memory usage (GB)')
plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
plt.savefig(os.path.join(args.figdir,"MemoryVSCpu.png"))
plt.close()
#!/bin/bash
# Takes a GFA and returns a Panacus report
# Create an HTML panacus report from a given GFA
# @author: alexis.mergez@inrae.fr
# Initializing arguments
gfa="" # GFA path
@@ -27,14 +28,11 @@ done
chrname=$(basename ${gfa} .gfa)
ref=$(echo $refname | sed 's/.hap/#/')
# Getting paths in chromosome graph
echo "[getPanacusHG::odgi::paths] Running on ${gfa}"
apptainer run --app odgi "${appdir}/PanGeTools.sif" paths -i ${gfa} -L | grep -ve "$ref" > ${chrdir}/$chrname.paths.noref.txt
echo "[getPanacusHG::paths] Running on ${gfa}"
grep '^P' $gfa | cut -f2 | grep -ve "$ref" > ${chrdir}/$chrname.paths.noref.txt
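# In a GFA, 'P' lines store the path name in column 2; paths matching the reference
# are excluded so the growth curve is computed on non-reference paths only.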
# Running Panacus
echo "[getPanacusHG::panacus] Running on ${gfa}"
apptainer run --app panacus $appdir/PanGeTools.sif histgrowth \
-t $threads -l 1,2,1,1,1 -q 0,0,1,0.5,0.1 -S -a -s ${chrdir}/$chrname.paths.noref.txt -c all -o html $gfa > ${output}
#!/bin/bash
# Create a Syri figure for the given genomes
# @author: alexis.mergez@inrae.fr
# Initializing arguments
ref="" # Reference fasta
qry="" # Queries fasta
appdir="" # Directory containing apptainer images
threads=""
wrkdir="" # Working directory (directory used by pggb to store step files like .paf, etc...)
output="" # Output Syri figure(s)
## Getting arguments
while getopts "r:q:a:t:d:o:" option; do
case "$option" in
r) ref="$OPTARG";;
q) qry="$OPTARG";;
a) appdir="$OPTARG";;
t) threads="$OPTARG";;
d) wrkdir="$OPTARG";;
o) output="$OPTARG";;
\?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output]" >&2
exit 1;;
esac
done
## Main script
# Reading the query argument and creating an array containing the paths to the query fasta files
IFS=' ' read -r -a temp <<< "$qry"
asmList=("$ref")
# Ordering the array so that the reference comes first
for item in "${temp[@]}"; do
if [[ $item != "$ref" ]]; then
asmList+=($item)
fi
done
# Array to store the created syri files
syriFileList=()
# Iterating pairwise, with overlap, over the array of fasta files
for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
# Setting filepaths for later
bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam"
syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out"
# Adding the output syri file to the array
syriFileList+=($syriFile)
# Minimap2 genome vs genome alignment
apptainer run --app minimap2 $appdir/PanGeTools.sif \
-ax asm5 --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads | \
apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads - > $wrkdir/$bamFile
# Syri on previous alignment
apptainer run $appdir/pan1c-env.sif \
syri -c $wrkdir/$bamFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F B \
--nc $threads \
--dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)."
done
# Creating genomes.txt for plotsr; it maps each fasta file to a simple display name in the final figure
# Each line contains two columns: the fasta filepath and its display name
echo -e "#files\tname" > $wrkdir/genomes.txt
for asm in "${asmList[@]}"; do
echo -e "${asm}\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt
done
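# Example content of genomes.txt (hypothetical assemblies):
# #files	name
# data/hap.ragtagged/genomeA.hap1.ragtagged.fa.gz	genomeA
# data/hap.ragtagged/genomeB.hap1.ragtagged.fa.gz	genomeB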
# Generating the plotsr command
command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H 16 -W 9 -d 600 "
# Adding syri files to the command, as each needs to be specified with the "--sr" argument
for file in "${syriFileList[@]}"; do
command+="--sr $wrkdir/$file "
done
# Running plotsr
apptainer run $appdir/pan1c-env.sif plotsr \
$command
"""
Tag list creation script for Pan1c workflow
Returns a list of tags to add at the top of the final gfa file as comment lines.
@author: alexis.mergez@inrae.fr
@version: 1.0
"""
import argparse
import subprocess
import os
import json
## Arguments
arg_parser = argparse.ArgumentParser(description='Tag list creation script')
arg_parser.add_argument(
"--appdir",
"-a",
dest = "appdir",
required = True,
help = "Apptainer images directory"
)
arg_parser.add_argument(
"--config-file",
"-c",
dest = "config",
required = True,
help = "Pan1c config file"
)
args = arg_parser.parse_args()
## Main script
"""
Tags dictionary:
- key : Main tool / apptainer image
- value : dictionnary of tags
"""
tags = {}
### Pan1c-workflow section
tags["Pan1c"] = {}
# Using git to get the version of the Pan1c workflow
_output = subprocess.run(
["git", "describe", "--tags"],
capture_output=True,
text=True,
).stdout[:-1]
# Getting the pggb command used in the workflow from the config file
# ToDo : Get the command used from the pggb command logs !
with open(args.config, 'r') as handle:
pggbCmd = [line[:-1] for line in handle.readlines() if "pggb.params" in line][0].split(': ')[-1]
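# e.g. the config line "pggb.params: '-X --skip-viz'" yields pggbCmd = "'-X --skip-viz'"
# (quotes included, since only the text after ': ' is kept).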
# Adding tags
tags["Pan1c"]["pan1c.version"] = _output
tags["Pan1c"]["pan1c.home"] = "https://forgemia.inra.fr/alexis.mergez/pan1c"
tags["Pan1c"]["pan1c.pggb.args"] = pggbCmd
### PanGeTools section
tags["pangetools"] = {}
# Reading the app versions from the apptainer image labels
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/PanGeTools.sif"],
capture_output=True,
text=True
).stdout
_output = json.loads(_output)
labels = _output['data']['attributes']['labels']
tags["pangetools"]["image.version"] = labels['Version']
tags["pangetools"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionary
for key in labels.keys():
if ".Version" in key:
tags["pangetools"][key.lower()] = labels[key]
### PGGB image section
tags["pggb"] = {}
# Reading the app versions from the apptainer image labels
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/pggb.sif"],
capture_output=True,
text=True
).stdout
_output = json.loads(_output)
labels = _output['data']['attributes']['labels']
tags["pggb"]["image.version"] = labels['Version']
tags["pggb"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionary
for key in labels.keys():
if ".Version" in key:
tags["pggb"][key.lower()] = labels[key]
### Pan1c-Env section
tags["pan1c-env"] = {}
# Reading the app versions from the apptainer image labels
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-env.sif"],
capture_output=True,
text=True
).stdout
_output = json.loads(_output)
labels = _output['data']['attributes']['labels']
tags["pan1c-env"]["image.version"] = labels['Version']
tags["pan1c-env"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionary
for key in labels.keys():
if ".Version" in key:
tags["pan1c-env"][key.lower()] = labels[key]
## Pan1c-Box section
tags["pan1c-box"] = {}
# Reading the app versions from the apptainer image labels
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-box.sif"],
capture_output=True,
text=True
).stdout
_output = json.loads(_output)
labels = _output['data']['attributes']['labels']
tags["pan1c-box"]["image.version"] = labels['Version']
tags["pan1c-box"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionary
for key in labels.keys():
if ".Version" in key:
tags["pan1c-box"][key.lower()] = labels[key]
## Exporting tags to stdout
print("#\tThis graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)\n#")
print("#\tTool versions and commands\n#")
for first_elem in tags.keys():
print(f'#\t-- {first_elem} --')
for label in tags[first_elem].keys():
print(f"#\t{first_elem}\t{label}: {tags[first_elem][label]}")
print('#')
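# The emitted metadata lines look like this (illustrative values):
# #	-- Pan1c --
# #	Pan1c	pan1c.version: v1.6
# #	Pan1c	pan1c.home: https://forgemia.inra.fr/alexis.mergez/pan1c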
@@ -10,6 +10,7 @@ Each output fasta contains sequences related to one chromosome only.
"""
from Bio import SeqIO
from Bio.SeqIO import FastaIO
import numpy as np
import argparse
import gzip
@@ -86,4 +87,5 @@ if args.debug: print(chrSeq.keys())
# Writing chromosome specific fasta file
for chrName in chrSeq.keys():
with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle:
SeqIO.write(chrSeq[chrName], output_handle, "fasta")
\ No newline at end of file
fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
fasta_out.write_file(chrSeq[chrName])
\ No newline at end of file
"""
Input statistics script for Pan1c workflow
This script produces statistics of the input haplotypes, such as kmer counts and lengths.
@author: alexis.mergez@inrae.fr
@version: 1.0
"""
from Bio import SeqIO
import numpy as np
import argparse
import gzip
import os
## Arguments
arg_parser = argparse.ArgumentParser(description='Pan1c input haplotype statistics')
arg_parser.add_argument(
"--fasta",
"-f",
dest = "fastafile",
required = True,
help = "Fasta file"
)
arg_parser.add_argument(
"--output",
"-o",
dest = "outdir",
required = True,
help = "Output directory"
)
arg_parser.add_argument(
"--debug",
"-d",
action="store_true",
dest = "debug",
help = "Show debug"
)
args = arg_parser.parse_args()
## Toolbox
def getChr(name):
"""
Simply returns the chromosome name of a given fasta record
"""
return name.split('#')[-1]
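# e.g. getChr("genomeA.hap1#chr01") returns "chr01" (hypothetical record id).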
## Main script
seqDict = {}
# Parsing fasta files
for filename in args.fastafiles:
# Reading .fa.gz file and adding records to seqDict
with gzip.open(filename, "rt") as handle:
sequences = SeqIO.parse(handle, "fasta")
for record in sequences:
seqDict[record.id] = record
# Inferring the list of available chromosomes from the sequence record ids
chrList = np.unique([
getChr(recordid) for recordid in seqDict.keys()
])
# Clustering records based on the chromosome tag in record ids
chrSeq = {}
for recordid, record in seqDict.items():
# Getting the chromosome id
_chrname = getChr(recordid)
# If not encountered before, create a key for this chromosome in chrSeq
if _chrname not in list(chrSeq.keys()):
chrSeq[_chrname] = []
# Add the record to the chromosome bin in chrSeq
chrSeq[_chrname].append(record)
# Debug purpose print
if args.debug: print(chrSeq.keys())
# Writing chromosome specific fasta file
for chrName in chrSeq.keys():
with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle:
SeqIO.write(chrSeq[chrName], output_handle, "fasta")
\ No newline at end of file