library(tidyverse)
library(ggtext)
library(viridis)Average Nucleotide Identity (ANI)
Set up paths
#This will open the .Renviron file for the project. You need to add a DATA_PATH environment variable with the path to your data folder.
#usethis::edit_r_environ(scope = "project")
# This will create and open the .bash_env file for editing. You need to add a DATA_PATH environment variable with the path to your data folder.
#file.create(".bash_env")
#file.edit(".bash_env")
FOLDER_PATH <- Sys.getenv("DATA_PATH")
genomes_list_folder <- file.path(FOLDER_PATH, "genome_lists")
# Create a folder for pyANI outputs if it doesn't exist
ani_folder <- file.path(FOLDER_PATH, "pyani-plus")
if (!dir.exists(ani_folder)) {
dir.create(ani_folder)
}Run pyani-plus
We used pyani-plus(Pritchard et al., 2016) to calculate the ANIb (BLAST+(Camacho et al., 2009)) values for the 28 proposed Corynebacterium nasorum genomes and for the 30 genomes in the Corynebacterium tuberculostearicum species complex. We first activate the pyani-plus conda environment and source the .bash_env file to set up the environment variables.
conda deactivate
conda activate pyani-plus
source .bash_envANIb for the 28 proposed C. nasorum genomes
path_i="$DATA_PATH/genomes/fasta_28_Cna"
path_o="$DATA_PATH/pyani-plus"
pyani-plus anib $path_i/ --database $path_o/NovCor_28_Cna.db --name "NovCor_28_Cna ANIb" --create-db
pyani-plus plot-run --database $path_o/NovCor_28_Cna.db --outdir $path_o/NovCor_28_Cna/ ANIb for the 30 genomes in the C. tuberculostearicum species complex
path_i="$DATA_PATH/genomes/fasta_30_CtuComplex"
path_o="$DATA_PATH/pyani-plus"
pyani-plus anib $path_i/ --database $path_o/NovCor_30_CtuComplex.db --name "NovCor_30_CtuComplex ANIb" --create-db
pyani-plus plot-run --database $path_o/NovCor_30_CtuComplex.db --outdir $path_o/NovCor_30_CtuComplex/ Plots
We create plots from the ANIb_identity_heatmap.tsv files generated by by pyani-plus. We first extract from those files the order of the genomes in the heatmap, which we will use in the next function.
# Read the ANIb percentage identity data from the TSV files
ANI_data_28 <- read.table(file.path(ani_folder, "NovCor_28_Cna/ANIb_identity_heatmap.tsv"), header = TRUE, sep = "\t")
ANI_data_30 <- read.table(file.path(ani_folder, "NovCor_30_CtuComplex/ANIb_identity_heatmap.tsv"), header = TRUE, sep = "\t")
# Extract the order of the genomes from the 'X' column of the data frame and save to file
write_csv(data.frame(Genome = ANI_data_28$X ), file.path(genomes_list_folder, "ANI_PlotOrder_28_Cna.csv"))
write_csv(data.frame(Genome = ANI_data_30$X ), file.path(genomes_list_folder, "ANI_PlotOrder_30_CtuComplex.csv"))We manually edit the saved PlotOrder files to add a column with the styled genome names that we want to use in the plot. We then read the _styled version of those files back in and use them to set the order of the genomes in the plot and to add the styled genome names as labels in the plot.
plot_ani <- function(ANI_data, StrainOrder_path) {
# Read strain order
StrainOrder <- read_csv(StrainOrder_path)
# Convert wide → long
ANI_data.long <- ANI_data %>%
rename(Genome1 = X) %>%
pivot_longer(cols = -Genome1, names_to = "Genome2", values_to = "PercentIdentity")
# Join and style Genome1
ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = c("Genome1" = "Genome")) %>%
rename(styled_Genome1 = styled_Genome)
ANI_data.long$styled_Genome1 <- factor(ANI_data.long$styled_Genome1,
levels = StrainOrder$styled_Genome)
ANI_data.long$Genome1 <- factor(ANI_data.long$Genome1, levels = StrainOrder$Genome)
# Join and style Genome2
ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = c("Genome2" = "Genome")) %>%
rename(styled_Genome2 = styled_Genome)
ANI_data.long$styled_Genome2 <- factor(ANI_data.long$styled_Genome2,
levels = StrainOrder$styled_Genome)
ANI_data.long$Genome2 <- factor(ANI_data.long$Genome2, levels = StrainOrder$Genome)
# Plot
pANI <- ggplot(ANI_data.long, aes(x = styled_Genome1, y = styled_Genome2, fill = PercentIdentity)) +
geom_tile(width = 1.0, height = 1.0) +
geom_text(aes(label = sprintf("%.1f", round(PercentIdentity * 100, 1))), size = 3, colour = "black") +
scale_fill_viridis(option = "A", begin = 0.6, labels = function(x) sprintf("%.0f", x * 100)) +
scale_y_discrete(position = "right") +
labs(x = NULL, y = NULL, title = NULL) +
theme_bw() +
theme(axis.text.x = element_markdown(angle = 45, vjust = 1, hjust = 1, colour = "black", face = "bold"),
axis.text.y = element_markdown(colour = "black", face = "bold"),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks = element_blank(),
plot.margin = margin(0, 0.25, 0.25, 1, "cm"),
legend.position = c(1.12, -0.12),
legend.direction = "horizontal",
legend.justification = c(1, 0)) +
guides(fill = guide_colourbar(title="ANIb % identity", title.position="top", title.hjust = 0.5, barwidth = 6))
return(pANI)
}Generate the plots for the two datasets.
pANI_28 <- plot_ani(ANI_data_28, file.path(genomes_list_folder, "ANI_PlotOrder_28_Cna_styled.csv"))
pANI_30 <- plot_ani(ANI_data_30, file.path(genomes_list_folder, "ANI_PlotOrder_30_CtuComplex_styled.csv"))Function to add colored rectangles to the plot, which we will use to highlight the different species groups in the C. tuberculostearicum species complex.
add_rectangles_to_heatmap <- function(plot, levels, top, bottom, color) {
top_pos <- which(levels == top)
bottom_pos <- which(levels == bottom)
p_with_rect <- plot +
geom_rect(
aes(xmin = top_pos + 0.5,
xmax = bottom_pos - 0.5,
ymin = top_pos + 0.5,
ymax = bottom_pos - 0.5),
fill = NA,
color = color,
size = 1
)
return(p_with_rect)
}genome_levels <- levels(pANI_30$data$Genome1)
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cna_KPL2825", "Cna_KPL3804", "#8200E7")
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cyo_c21Ua_68", "Cyo_KPL2619", "#0A6003")
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cmq_c19Ua_121", "Cmq_KPL4083", "#39D9E5")
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Ctu_MSK207", "Ctu_DSM_44922", "#002BFF")
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Ccu_c8Ua_181", "Ccu_CTNIH19", "#FF00FF")
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cha_CTNIH23", "Cha_CTNIH22", "#0ABF00")
pANI_30Outputs
ggsave(file.path(ani_folder, "NovCor_ANI_NovCor_28_Cna.png"), plot = pANI_28, width = 12, height = 9, dpi = 600)
ggsave(file.path(ani_folder, "NovCor_ANI_NovCor_30_CtuComplex.png"), plot = pANI_30, width = 12, height = 9, dpi = 600)
saveRDS(pANI_28, file = file.path(ani_folder, "NovCor_ANI_NovCor_28_Cna.rds"))
saveRDS(pANI_30, file = file.path(ani_folder, "NovCor_ANI_NovCor_30_CtuComplex.rds"))The results based on the identity_heatmap ANI files are reported in the manuscript in the following tables and figures:
- ANI_28: Table S1, for each genome proposed to belong to C. nasorum against the type strain (KPL3804).
- ANI_30: Figure 2A, full pairwise comparisons for all included genomes against each other; together with the dDDH results (Fig 2B) ordered to map the ANI results.