Description of Corynebacterium nasorum sp. nov. and Corynebacterium hallucis sp. nov. isolated from human nasal passages and skin
  1. Methods
  2. Average Nucleotide Identity (ANI)
  • Introduction
  • Methods
    • Average Nucleotide Identity (ANI)
    • Digital DNA:DNA hybridization (dDDH)
    • Prokka Annotations
    • Anvio
    • Figures
  • References
  • R Session Info

Table of contents

  • Set up paths
  • Run pyani-plus
  • Plots
  • Outputs

Average Nucleotide Identity (ANI)

library(tidyverse)
library(ggtext)
library(viridis)

Set up paths

#This will open the .Renviron file for the project. You need to add a DATA_PATH environment variable with the path to your data folder.
#usethis::edit_r_environ(scope = "project") 

# This will create and open the .bash_env file for editing. You need to add a DATA_PATH environment variable with the path to your data folder.
#file.create(".bash_env")
#file.edit(".bash_env") 

FOLDER_PATH <- Sys.getenv("DATA_PATH") 
genomes_list_folder <- file.path(FOLDER_PATH, "genome_lists")

# Create a folder for pyANI outputs if it doesn't exist
ani_folder <- file.path(FOLDER_PATH, "pyani-plus")
if (!dir.exists(ani_folder)) {
  dir.create(ani_folder)
}

Run pyani-plus

We used pyani-plus(Pritchard et al., 2016) to calculate the ANIb (BLAST+(Camacho et al., 2009)) values for the 28 proposed Corynebacterium nasorum genomes and for the 30 genomes in the Corynebacterium tuberculostearicum species complex. We first activate the pyani-plus conda environment and source the .bash_env file to set up the environment variables.

conda deactivate
conda activate pyani-plus

source .bash_env

ANIb for the 28 proposed C. nasorum genomes

path_i="$DATA_PATH/genomes/fasta_28_Cna"
path_o="$DATA_PATH/pyani-plus"

pyani-plus anib $path_i/ --database $path_o/NovCor_28_Cna.db --name "NovCor_28_Cna ANIb" --create-db
pyani-plus plot-run --database $path_o/NovCor_28_Cna.db --outdir $path_o/NovCor_28_Cna/ 

ANIb for the 30 genomes in the C. tuberculostearicum species complex

path_i="$DATA_PATH/genomes/fasta_30_CtuComplex"
path_o="$DATA_PATH/pyani-plus"

pyani-plus anib $path_i/ --database $path_o/NovCor_30_CtuComplex.db --name "NovCor_30_CtuComplex ANIb" --create-db
pyani-plus plot-run --database $path_o/NovCor_30_CtuComplex.db --outdir $path_o/NovCor_30_CtuComplex/ 

Plots

We create plots from the ANIb_identity_heatmap.tsv files generated by by pyani-plus. We first extract from those files the order of the genomes in the heatmap, which we will use in the next function.

# Read the ANIb percentage identity data from the TSV files
ANI_data_28 <- read.table(file.path(ani_folder, "NovCor_28_Cna/ANIb_identity_heatmap.tsv"), header = TRUE, sep = "\t")
ANI_data_30 <- read.table(file.path(ani_folder, "NovCor_30_CtuComplex/ANIb_identity_heatmap.tsv"), header = TRUE, sep = "\t")

# Extract the order of the genomes from the 'X' column of the data frame and save to file
write_csv(data.frame(Genome = ANI_data_28$X ), file.path(genomes_list_folder, "ANI_PlotOrder_28_Cna.csv"))
write_csv(data.frame(Genome = ANI_data_30$X ), file.path(genomes_list_folder, "ANI_PlotOrder_30_CtuComplex.csv"))

We manually edit the saved PlotOrder files to add a column with the styled genome names that we want to use in the plot. We then read the _styled version of those files back in and use them to set the order of the genomes in the plot and to add the styled genome names as labels in the plot.

plot_ani <- function(ANI_data, StrainOrder_path) {

  # Read strain order
  StrainOrder <- read_csv(StrainOrder_path)

  # Convert wide → long
  ANI_data.long <- ANI_data %>%
    rename(Genome1 = X) %>%
    pivot_longer(cols = -Genome1, names_to = "Genome2", values_to = "PercentIdentity")

  # Join and style Genome1
  ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = c("Genome1" = "Genome")) %>%
    rename(styled_Genome1 = styled_Genome)
  ANI_data.long$styled_Genome1 <- factor(ANI_data.long$styled_Genome1,
                                         levels = StrainOrder$styled_Genome)
  ANI_data.long$Genome1 <- factor(ANI_data.long$Genome1, levels = StrainOrder$Genome)
  
  
  # Join and style Genome2
  ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = c("Genome2" = "Genome")) %>%
    rename(styled_Genome2 = styled_Genome)
  ANI_data.long$styled_Genome2 <- factor(ANI_data.long$styled_Genome2,
                                         levels = StrainOrder$styled_Genome)
  ANI_data.long$Genome2 <- factor(ANI_data.long$Genome2, levels = StrainOrder$Genome)

  
  # Plot
  pANI <- ggplot(ANI_data.long, aes(x = styled_Genome1, y = styled_Genome2, fill = PercentIdentity)) + 
    geom_tile(width = 1.0, height = 1.0) +
    geom_text(aes(label = sprintf("%.1f", round(PercentIdentity * 100, 1))), size = 3, colour = "black") +
    scale_fill_viridis(option = "A", begin = 0.6, labels = function(x) sprintf("%.0f", x * 100)) + 
    scale_y_discrete(position = "right") +
    labs(x = NULL, y = NULL, title = NULL) +
    theme_bw() +
    theme(axis.text.x = element_markdown(angle = 45, vjust = 1, hjust = 1, colour = "black", face = "bold"),
          axis.text.y = element_markdown(colour = "black", face = "bold"),
          panel.border = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(), 
          axis.ticks = element_blank(),
          plot.margin = margin(0, 0.25, 0.25, 1, "cm"),
          legend.position = c(1.12, -0.12), 
          legend.direction = "horizontal",
          legend.justification = c(1, 0)) +
    guides(fill = guide_colourbar(title="ANIb % identity", title.position="top", title.hjust = 0.5, barwidth = 6))
  
  return(pANI)
}

Generate the plots for the two datasets.

pANI_28 <- plot_ani(ANI_data_28, file.path(genomes_list_folder, "ANI_PlotOrder_28_Cna_styled.csv"))
pANI_30 <- plot_ani(ANI_data_30, file.path(genomes_list_folder, "ANI_PlotOrder_30_CtuComplex_styled.csv"))

Function to add colored rectangles to the plot, which we will use to highlight the different species groups in the C. tuberculostearicum species complex.

add_rectangles_to_heatmap <- function(plot, levels, top, bottom, color) {
  top_pos <- which(levels == top)
  bottom_pos <- which(levels == bottom)

  p_with_rect <- plot +
    geom_rect(
      aes(xmin = top_pos + 0.5, 
          xmax = bottom_pos - 0.5,
          ymin = top_pos + 0.5, 
          ymax = bottom_pos - 0.5),
      fill = NA,
      color = color,
      size = 1
    )
  
  return(p_with_rect)
}
genome_levels <- levels(pANI_30$data$Genome1)
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cna_KPL2825", "Cna_KPL3804", "#8200E7")  
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cyo_c21Ua_68", "Cyo_KPL2619", "#0A6003")  
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cmq_c19Ua_121", "Cmq_KPL4083", "#39D9E5")
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Ctu_MSK207", "Ctu_DSM_44922", "#002BFF") 
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Ccu_c8Ua_181", "Ccu_CTNIH19", "#FF00FF") 
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cha_CTNIH23", "Cha_CTNIH22", "#0ABF00") 
pANI_30

Outputs

ggsave(file.path(ani_folder, "NovCor_ANI_NovCor_28_Cna.png"), plot = pANI_28, width = 12, height = 9, dpi = 600)
ggsave(file.path(ani_folder, "NovCor_ANI_NovCor_30_CtuComplex.png"), plot = pANI_30, width = 12, height = 9, dpi = 600)

saveRDS(pANI_28, file = file.path(ani_folder, "NovCor_ANI_NovCor_28_Cna.rds"))
saveRDS(pANI_30, file = file.path(ani_folder, "NovCor_ANI_NovCor_30_CtuComplex.rds"))

The results based on the identity_heatmap ANI files are reported in the manuscript in the following tables and figures:

  • ANI_28: Table S1, for each genome proposed to belong to C. nasorum against the type strain (KPL3804).
  • ANI_30: Figure 2A, full pairwise comparisons for all included genomes against each other; together with the dDDH results (Fig 2B) ordered to map the ANI results.
Camacho, C., Coulouris, G., Avagyan, V., Ma, N., Papadopoulos, J., Bealer, K. and Madden, T. L. (2009). BLAST+: architecture and applications. BMC Bioinformatics 10,.
Pritchard, L., Glover, R. H., Humphris, S., Elphinstone, J. G. and Toth, I. K. (2016). Genomics and taxonomy in diagnostics for food security: soft-rotting enterobacterial plant pathogens. Analytical Methods 8, 12–24.
Introduction
Digital DNA:DNA hybridization (dDDH)
Source Code
---
execute:
  eval: TRUE
  message: FALSE
  warning: FALSE
---

# Average Nucleotide Identity (ANI) {.unnumbered}

```{r}
library(tidyverse)
library(ggtext)
library(viridis)
```

## Set up paths

```{r}
#This will open the .Renviron file for the project. You need to add a DATA_PATH environment variable with the path to your data folder.
#usethis::edit_r_environ(scope = "project") 

# This will create and open the .bash_env file for editing. You need to add a DATA_PATH environment variable with the path to your data folder.
#file.create(".bash_env")
#file.edit(".bash_env") 

FOLDER_PATH <- Sys.getenv("DATA_PATH") 
genomes_list_folder <- file.path(FOLDER_PATH, "genome_lists")

# Create a folder for pyANI outputs if it doesn't exist
ani_folder <- file.path(FOLDER_PATH, "pyani-plus")
if (!dir.exists(ani_folder)) {
  dir.create(ani_folder)
}
```

## Run pyani-plus

We used [pyani-plus](https://pyani-plus.github.io/pyani-plus-docs/)[@pritchard2016] to calculate the ANIb (BLAST+[@camacho2009]) values for the 28 proposed *Corynebacterium nasorum* genomes and for the 30 genomes in the *Corynebacterium tuberculostearicum* species complex. We first activate the pyani-plus conda environment and source the `.bash_env` file to set up the environment variables.

```{bash, eval=FALSE}
conda deactivate
conda activate pyani-plus

source .bash_env
```

#### ANIb for the 28 proposed *C. nasorum* genomes

```{bash, eval=FALSE}
path_i="$DATA_PATH/genomes/fasta_28_Cna"
path_o="$DATA_PATH/pyani-plus"

pyani-plus anib $path_i/ --database $path_o/NovCor_28_Cna.db --name "NovCor_28_Cna ANIb" --create-db
pyani-plus plot-run --database $path_o/NovCor_28_Cna.db --outdir $path_o/NovCor_28_Cna/ 
```

#### ANIb for the 30 genomes in the *C. tuberculostearicum* species complex

```{bash, eval=FALSE}
path_i="$DATA_PATH/genomes/fasta_30_CtuComplex"
path_o="$DATA_PATH/pyani-plus"

pyani-plus anib $path_i/ --database $path_o/NovCor_30_CtuComplex.db --name "NovCor_30_CtuComplex ANIb" --create-db
pyani-plus plot-run --database $path_o/NovCor_30_CtuComplex.db --outdir $path_o/NovCor_30_CtuComplex/ 
```

## Plots

We create plots from the `ANIb_identity_heatmap.tsv` files generated by by pyani-plus. We first extract from those files the order of the genomes in the heatmap, which we will use in the next function.

```{r}
# Read the ANIb percentage identity data from the TSV files
ANI_data_28 <- read.table(file.path(ani_folder, "NovCor_28_Cna/ANIb_identity_heatmap.tsv"), header = TRUE, sep = "\t")
ANI_data_30 <- read.table(file.path(ani_folder, "NovCor_30_CtuComplex/ANIb_identity_heatmap.tsv"), header = TRUE, sep = "\t")

# Extract the order of the genomes from the 'X' column of the data frame and save to file
write_csv(data.frame(Genome = ANI_data_28$X ), file.path(genomes_list_folder, "ANI_PlotOrder_28_Cna.csv"))
write_csv(data.frame(Genome = ANI_data_30$X ), file.path(genomes_list_folder, "ANI_PlotOrder_30_CtuComplex.csv"))
```

We manually edit the saved `PlotOrder` files to add a column with the styled genome names that we want to use in the plot. We then read the `_styled` version of those files back in and use them to set the order of the genomes in the plot and to add the styled genome names as labels in the plot.

```{r}
plot_ani <- function(ANI_data, StrainOrder_path) {

  # Read strain order
  StrainOrder <- read_csv(StrainOrder_path)

  # Convert wide → long
  ANI_data.long <- ANI_data %>%
    rename(Genome1 = X) %>%
    pivot_longer(cols = -Genome1, names_to = "Genome2", values_to = "PercentIdentity")

  # Join and style Genome1
  ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = c("Genome1" = "Genome")) %>%
    rename(styled_Genome1 = styled_Genome)
  ANI_data.long$styled_Genome1 <- factor(ANI_data.long$styled_Genome1,
                                         levels = StrainOrder$styled_Genome)
  ANI_data.long$Genome1 <- factor(ANI_data.long$Genome1, levels = StrainOrder$Genome)
  
  
  # Join and style Genome2
  ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = c("Genome2" = "Genome")) %>%
    rename(styled_Genome2 = styled_Genome)
  ANI_data.long$styled_Genome2 <- factor(ANI_data.long$styled_Genome2,
                                         levels = StrainOrder$styled_Genome)
  ANI_data.long$Genome2 <- factor(ANI_data.long$Genome2, levels = StrainOrder$Genome)

  
  # Plot
  pANI <- ggplot(ANI_data.long, aes(x = styled_Genome1, y = styled_Genome2, fill = PercentIdentity)) + 
    geom_tile(width = 1.0, height = 1.0) +
    geom_text(aes(label = sprintf("%.1f", round(PercentIdentity * 100, 1))), size = 3, colour = "black") +
    scale_fill_viridis(option = "A", begin = 0.6, labels = function(x) sprintf("%.0f", x * 100)) + 
    scale_y_discrete(position = "right") +
    labs(x = NULL, y = NULL, title = NULL) +
    theme_bw() +
    theme(axis.text.x = element_markdown(angle = 45, vjust = 1, hjust = 1, colour = "black", face = "bold"),
          axis.text.y = element_markdown(colour = "black", face = "bold"),
          panel.border = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(), 
          axis.ticks = element_blank(),
          plot.margin = margin(0, 0.25, 0.25, 1, "cm"),
          legend.position = c(1.12, -0.12), 
          legend.direction = "horizontal",
          legend.justification = c(1, 0)) +
    guides(fill = guide_colourbar(title="ANIb % identity", title.position="top", title.hjust = 0.5, barwidth = 6))
  
  return(pANI)
}
```

Generate the plots for the two datasets.

```{r}
pANI_28 <- plot_ani(ANI_data_28, file.path(genomes_list_folder, "ANI_PlotOrder_28_Cna_styled.csv"))
pANI_30 <- plot_ani(ANI_data_30, file.path(genomes_list_folder, "ANI_PlotOrder_30_CtuComplex_styled.csv"))
```

Function to add colored rectangles to the plot, which we will use to highlight the different species groups in the *C. tuberculostearicum* species complex.

```{r}
add_rectangles_to_heatmap <- function(plot, levels, top, bottom, color) {
  top_pos <- which(levels == top)
  bottom_pos <- which(levels == bottom)

  p_with_rect <- plot +
    geom_rect(
      aes(xmin = top_pos + 0.5, 
          xmax = bottom_pos - 0.5,
          ymin = top_pos + 0.5, 
          ymax = bottom_pos - 0.5),
      fill = NA,
      color = color,
      size = 1
    )
  
  return(p_with_rect)
}
```

```{r, fig.align="center",fig.width=12, fig.height=9}
genome_levels <- levels(pANI_30$data$Genome1)
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cna_KPL2825", "Cna_KPL3804", "#8200E7")  
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cyo_c21Ua_68", "Cyo_KPL2619", "#0A6003")  
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cmq_c19Ua_121", "Cmq_KPL4083", "#39D9E5")
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Ctu_MSK207", "Ctu_DSM_44922", "#002BFF") 
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Ccu_c8Ua_181", "Ccu_CTNIH19", "#FF00FF") 
pANI_30 <- add_rectangles_to_heatmap(pANI_30, genome_levels, "Cha_CTNIH23", "Cha_CTNIH22", "#0ABF00") 
pANI_30
```

## Outputs

```{r}
ggsave(file.path(ani_folder, "NovCor_ANI_NovCor_28_Cna.png"), plot = pANI_28, width = 12, height = 9, dpi = 600)
ggsave(file.path(ani_folder, "NovCor_ANI_NovCor_30_CtuComplex.png"), plot = pANI_30, width = 12, height = 9, dpi = 600)

saveRDS(pANI_28, file = file.path(ani_folder, "NovCor_ANI_NovCor_28_Cna.rds"))
saveRDS(pANI_30, file = file.path(ani_folder, "NovCor_ANI_NovCor_30_CtuComplex.rds"))
```

The results based on the identity_heatmap ANI files are reported in the manuscript in the following tables and figures:

-   **ANI_28:** Table S1, for each genome proposed to belong to *C. nasorum* against the type strain (KPL3804).
-   **ANI_30:** Figure 2A, full pairwise comparisons for all included genomes against each other; together with the dDDH results (Fig 2B) ordered to map the ANI results.