#conda activate Prokka
# Annotate each genome FASTA found in path_i with Prokka and collect all
# results in path_o. The file name without its last extension is used both as
# the output prefix and as the --strain label.
path_i="data/genomes"
path_o="data/GET_HOMOLOGUES/Prokka_out"
mkdir -p "$path_o"
for file in "$path_i"/*.f*; do
  # An unmatched glob stays as the literal pattern; skip it rather than
  # handing a non-existent path to prokka.
  [[ -e "$file" ]] || continue
  FILENAME=${file##*/}      # strip the directory part
  FILENAME=${FILENAME%.*}   # strip the last extension
  prokka --prefix "$FILENAME" --outdir "$path_o" \
    --genus 'Corynebacterium' --species 'sp' --strain "$FILENAME" \
    --centre X --compliant --cpus 0 --force "$file"
done
Prokka Annotations
Custom Prokka Annotations
We annotated the genomes with Prokka v1.14.6 (Seemann, 2014) in two different ways for proper compatibility and strain labeling with both GET_HOMOLOGUES and anvi’o.
Prokka annotations for GET_HOMOLOGUES
We annotated the 92 Corynebacterium strain genomes described in Table S2. These genomes are listed in `NovCor_AnnotationProkka_GenomeList_v02.csv`.
This step annotates all the `.fasta` files in the selected input folder (`path_i`) and places all the annotated output files in the output folder (`path_o`). Output file headers are updated with `--genus 'Corynebacterium'`, `--species 'sp'`, and a `--strain` value based on the file name. We used default parameters, including gene recognition and translation initiation site identification with Prodigal (Hyatt et al., 2010).
Prokka annotation for anvi’o
We selected 30 Corynebacterium strains for analysis of KEGG metabolic capabilities using anvi’o. These genomes are listed in `NovCor_AnnotationAnvio_GenomeList_v02.csv` and included in `data/genomes` as `.fasta` files.
More information about importing Prokka annotations into anvi’o can be found here: https://merenlab.org/2017/05/18/working-with-prokka/#note-for-the-pangenomics-workflow
Fasta reformatting
Before the annotation step with Prokka, we need to reformat the `.fasta` files using `anvi-script-reformat-fasta`. This script creates `.fasta` files with simplified deflines and, by using `--seq-type NT`, also prevents downstream errors about “characters that are not any of A, C, T, G, N, a, c, t, g, n.”
#conda activate anvio-dev
# Rewrite every genome FASTA in path_i as path_o/<name>.fa using
# anvi-script-reformat-fasta, where <name> is the input file name without its
# last extension.
path_i="data/genomes"
path_o="data/Anvio8/Reformatted"
mkdir -p "$path_o"
for file in "$path_i"/*.f*; do
  # Skip the literal pattern that is left behind when nothing matches.
  [[ -e "$file" ]] || continue
  FILENAME=${file##*/}      # strip the directory part
  FILENAME=${FILENAME%.*}   # strip the last extension
  anvi-script-reformat-fasta -o "$path_o/$FILENAME.fa" --min-len 0 \
    --simplify-names "$file" --seq-type NT
done
Prokka annotation
This step repeats the Prokka annotation using the anvi’o-reformatted `.fasta` files.
Output file headers are updated with `--genus`, `--species`, and `--strain` based on the information in the genome list `.csv` file.
#conda activate Prokka
# NOTE(review): the accompanying text names
# NovCor_AnnotationAnvio_GenomeList_v02.csv, while this path points at the
# _v01 file — confirm which version is intended.
csv_file="data/genome_lists/NovCor_AnnotationAnvio_GenomeList_v01.csv"
path_i="data/Anvio8/Reformatted"
path_o="data/Anvio8/Prokka_out"
mkdir -p "$path_o"

#######################################
# Run Prokka once for every data row of a genome-list CSV.
# Arguments:
#   $1 - CSV file whose first three columns are name,genus,species; a row
#        whose first field is exactly "name" is treated as the header
#   $2 - directory containing the reformatted <name>.fa files
#   $3 - Prokka output directory
# Outputs:
#   Error message on stderr when the CSV cannot be read
# Returns:
#   1 if the CSV cannot be read
#######################################
prokka_from_genome_list() {
  local list=$1 fasta_dir=$2 out_dir=$3
  local name genus species _extra

  if [[ ! -r "$list" ]]; then
    printf 'error: cannot read genome list: %s\n' "$list" >&2
    return 1
  fi

  # `|| [[ -n "$name" ]]` keeps the final row when the file has no trailing
  # newline; `_extra` absorbs any columns beyond the third so they do not end
  # up inside the species value.
  while IFS=',' read -r name genus species _extra || [[ -n "$name" ]]; do
    # Drop the carriage return left on the last field of CRLF-encoded files.
    name=${name%$'\r'}
    genus=${genus%$'\r'}
    species=${species%$'\r'}

    # Skip blank lines and the header row.
    [[ -n "$name" && "$name" != "name" ]] || continue

    # stdin is detached so that prokka (or any tool it spawns) cannot consume
    # the remaining CSV rows the loop is reading.
    prokka --prefix "$name" --outdir "$out_dir" --genus "$genus" \
      --species "$species" --strain "$name" --cpus 0 --force \
      "$fasta_dir/$name.fa" < /dev/null
  done < "$list"
}

prokka_from_genome_list "$csv_file" "$path_i" "$path_o"
Parsing .gff files
This step parses the Prokka-annotated genomes so that both the external Prodigal gene calls and the functions can be imported independently into anvi’o. The input (`path_i`) is the annotation in GFF3 format, and the outputs (`path_o`) are two tab-delimited text files: one for gene calls (`calls_*.txt`) and one for annotations (`annot_*.txt`).
This is done with the script `gff_parser.py` described in the anvi’o tutorial on working with Prokka (https://merenlab.org/2017/05/18/working-with-prokka/).
#conda activate gffutils
# Run scripts/gff_parser.py on every Prokka .gff file in path_i, writing
# calls_<name>.txt (gene calls) and annot_<name>.txt (annotations) to path_o.
path_i="data/Anvio8/Prokka_out"
path_o="data/Anvio8/Parsed_prokka"
mkdir -p "$path_o"
for file in "$path_i"/*.gff; do
  # Skip the literal pattern that is left behind when nothing matches.
  [[ -e "$file" ]] || continue
  FILENAME=${file##*/}      # strip the directory part
  FILENAME=${FILENAME%.*}   # strip the .gff extension
  python scripts/gff_parser.py "$file" \
    --gene-calls "$path_o/calls_$FILENAME.txt" \
    --annotation "$path_o/annot_$FILENAME.txt"
done
Generating contigs databases
In this step the reformatted .fa
files (path_i
) and the external gene calls (calls_*.txt
) from Prokka (path_e
) get imported to generate anvi’o contig databases (path_o
). Initially we got a lot of early stop codon errors. Therefore, we add the –ignore-internal-stop-codons
flag.
#conda activate anvio-dev
# Build one anvi'o contigs database per reformatted .fa file in path_i, using
# the matching external gene calls (calls_<name>.txt) from path_e. Databases
# are written to path_o/<name>.db and the project name is set to <name>.
path_i="data/Anvio8/Reformatted"
path_e="data/Anvio8/Parsed_prokka"
path_o="data/Anvio8/Contigs_db"
mkdir -p "$path_o"
for file in "$path_i"/*.fa; do
  # Skip the literal pattern that is left behind when nothing matches.
  [[ -e "$file" ]] || continue
  FILENAME=${file##*/}      # strip the directory part
  FILENAME=${FILENAME%.*}   # strip the .fa extension
  # --ignore-internal-stop-codons was added because the imported gene calls
  # otherwise triggered early-stop-codon errors.
  anvi-gen-contigs-database -f "$file" \
    -o "$path_o/$FILENAME.db" \
    --external-gene-calls "$path_e/calls_$FILENAME.txt" \
    --ignore-internal-stop-codons \
    -n "$FILENAME"
done
Importing Prokka functional annotation
Finally, the external functional annotations (`annot_*.txt`) from Prokka (`path_e`) are imported into the anvi’o contigs databases (`path_i`).
#conda activate anvio-dev
# Import the Prokka functional annotations (annot_<name>.txt in path_e) into
# the contigs database of the same name in path_i.
path_i="data/Anvio8/Contigs_db"
path_e="data/Anvio8/Parsed_prokka"
for file in "$path_i"/*.db; do
  # Skip the literal pattern that is left behind when nothing matches.
  [[ -e "$file" ]] || continue
  FILENAME=${file##*/}      # strip the directory part
  FILENAME=${FILENAME%.*}   # strip the .db extension
  anvi-import-functions -c "$file" \
    -i "$path_e/annot_$FILENAME.txt"
done