Description of Corynebacterium nasorum sp. nov. and Corynebacterium hallucis sp. nov. isolated from human nasal passages and skin
  1. Methods
  2. Digital DNA:DNA hybridization (dDDH)
  • Introduction
  • Methods
    • Genomes Summary
    • Prokka Annotations
    • Average Nucleotide Identity (ANI)
    • Digital DNA:DNA hybridization (dDDH)
    • Phylogenies
    • Anvio
  • References

Table of contents

  • GGDC
  • Excel
  • Plot

Digital DNA:DNA hybridization (dDDH)

library(tidyverse)
library(reshape2)
library(ggtext)
library(viridis)
library(openxlsx2)

GGDC

Digital DNA:DNA hybridization was conducted for the same set of 30 Corynebacterium genomes included in the ANI analysis (listed in NovCor_ANI_GenomeList_v01.csv). Pairwise comparisons were performed using the online Genome-to-Genome Distance Calculator (GGDC)(Meier-Kolthoff et al., 2013; Meier-Kolthoff et al., 2021) with default parameters. The results based on the recommended GGDC Formula 2 are included in Table S1.

# Set the folder path
path_i="data/DDH/ggdc_results"

# List all matching files
file_list <- list.files(path = path_i, pattern = "^ggdc_results_.*\\.csv$", full.names = TRUE)

# Function to read each file
read_ggdc <- function(file) {
  # Read the data skipping the second header row
  df <- read_csv(file, skip = 1, show_col_types = FALSE)
  
  # Make column names unique
  names(df) <- make.names(names(df), unique = TRUE)
  
  return(df)
}

# Read and bind all files
all_ggdc_results <- map_dfr(file_list, read_ggdc)

# Select desired columns for Formula 2
ggdc_formula2 <- all_ggdc_results %>%
  select(
    Query.genome,
    Reference.genome,
    `DDH...7`,
    `Model.C.I....8`,
    `Distance...9`,
    `Prob..DDH....70....10`
  )

# Rename them to cleaner names
ggdc_formula2 <- ggdc_formula2 %>%
  rename(
    Genome1 = Query.genome,
    Genome2 = Reference.genome,
    DDH = `DDH...7`,
    CI = `Model.C.I....8`,
    Distance = `Distance...9`,
    Prob_DDH_70 = `Prob..DDH....70....10`
  )
# Read the strain order from a CSV file and specify the order of the levels
StrainOrder <- read_csv("data/genome_lists/NovCor_pyANI_PlotOrder_v02.txt")
StrainOrder$Genome <- factor(StrainOrder$Genome, levels=rev(StrainOrder$Genome))

# Join the ggdc_formula2 and StrainOrder data frames based on Genome1
ggdc_formula2 <- left_join(ggdc_formula2, StrainOrder, by = c("Genome1" = "Genome")) %>% 
  rename(styled_Genome1 = styled_Genome) 

# Convert the 'styled_Genome1' and 'Genome1' columns to a factor and specify the order of the levels
ggdc_formula2$styled_Genome1 <- factor(ggdc_formula2$styled_Genome1, levels=rev(StrainOrder$styled_Genome))
ggdc_formula2$Genome1 <- factor(ggdc_formula2$Genome1, levels=rev(StrainOrder$Genome))

# Join the ggdc_formula2 and StrainOrder data frames based on Genome2
ggdc_formula2 <- left_join(ggdc_formula2, StrainOrder, by = c("Genome2" = "Genome")) %>% 
  rename(styled_Genome2 = styled_Genome) 

# Convert the 'styled_Genome2' and 'Genome2' columns to a factor and specify the order of the levels
ggdc_formula2$styled_Genome2 <- factor(ggdc_formula2$styled_Genome2, levels=rev(StrainOrder$styled_Genome))
ggdc_formula2$Genome2 <- factor(ggdc_formula2$Genome2, levels=StrainOrder$Genome)

# Save as a single csv file
write_csv(ggdc_formula2, "data/DDH/ggdc_formula2.csv")

Excel

Save file to include DDH formula 2 as Supplemental Material

# Prepare genome levels
genome1_levels <- levels(ggdc_formula2$Genome1)

# Create workbook
wb <- wb_workbook()

for (sheet_name in genome1_levels) {
  sheet_data <- ggdc_formula2 %>%
    filter(Genome1 == sheet_name) %>%
    select(-starts_with("styled_")) %>%
    arrange(Genome2)

  wb <- wb %>%
    wb_add_worksheet(sheet_name) %>%
    wb_add_data(sheet = sheet_name, x = paste0("GGDC Formula 2 Results for Genome ", sheet_name), start_col = 1, start_row = 1) %>%
    wb_add_data(sheet = sheet_name, x = sheet_data, start_col = 1, start_row = 2) %>%
    wb_set_col_widths(sheet = sheet_name, cols = 1:ncol(sheet_data), widths = 20) %>%
    wb_set_base_font(font_size = 12, font_name = "Arial") %>%
    wb_add_font(dims = "A1:AF2", bold = TRUE, size = 14)
}

# Save workbook
wb$save("data/DDH/ggdc_formula2_by_Genome.xlsx")

Plot

We match the ordering in ANIb_percentage_identity.svg by creating an ordering file NovCor_pyANI_PlotOrder.txt

# Create a ggplot object with styled_Genome1 on the x-axis, styled_Genome2 on the y-axis, and fill color representing DDH
pDDH <- ggplot(ggdc_formula2, aes(x = styled_Genome1, y = styled_Genome2, fill = DDH)) + 
# Add a tile layer to the plot with specified width and height
  geom_tile(width = 1.0, height = 1.0) +
# Add text to each tile showing the rounded DDH value
  geom_text(aes(label = DDH), size = 3, colour = "black") +
# Use the viridis color palette for the fill color and scale the labels to percent
  scale_fill_viridis(option = "A", begin = 0.6) + 
# Set the position of the y-axis labels to the right
  scale_y_discrete(position = "right") +
# Set the plot title and remove the x and y axis labels
  labs(x = NULL, y = NULL, title = NULL) +
# Use a white background for the plot
  theme_bw() +
# Customize the plot theme
  theme(axis.text.x = element_markdown(angle = 45, vjust = 1, hjust = 1, colour = "black", face = "bold"),
        axis.text.y = element_markdown(colour = "black", face = "bold"),
        panel.border = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        axis.ticks = element_blank(),
        plot.margin = margin(0, 0.25, 0.25, 1, "cm"),
        legend.position = c(1.12, -0.12), 
        legend.direction = "horizontal",
        legend.justification = c(1, 0)) +
# Add a colorbar legend for the fill color with a specified title and position
  guides(fill = guide_colourbar(title="dDDH", title.position="top", title.hjust=0.5, barwidth=6))
pDDH

# Function to add colored rectangles to the plot
add_rectangles_to_heatmap <- function(plot, levels, top, bottom, color) {
  top_pos <- which(levels == top)
  bottom_pos <- which(levels == bottom)

  p_with_rect <- plot +
    geom_rect(
      aes(xmin = top_pos + 0.5, 
          xmax = bottom_pos - 0.5,
          ymin = top_pos + 0.5, 
          ymax = bottom_pos - 0.5),
      fill = NA,
      color = color,
      size = 1
    )
  
  return(p_with_rect)
}
genome_levels <- levels(StrainOrder$Genome)
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Ctu_MSK207", "Ctu_DSM_44922", "#002BFF") 
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cmq_MSK144", "Cau_620_CAUR", "#39D9E5")  
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cyo_MSK136", "Cyo_KPL2619", "#0A6003")  
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cha_CTNIH23", "Cha_CTNIH22", "#0ABF00") 
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Ccu_CTNIH19", "Ccu_c8Ua_181", "#FF00FF") 
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cna_MSK071", "Cna_KPL4072", "#8200E7")  
pDDH

# Save the plot as PNG
ggsave("data/DDH/Figure/NovCor_dDDH.png", plot = pDDH, width = 12, height = 9, dpi = 600)
Meier-Kolthoff, J. P., Auch, A. F., Klenk, H.-P. and Göker, M. (2013). Genome sequence-based species delimitation with confidence intervals and improved distance functions. BMC Bioinformatics 14,.
Meier-Kolthoff, J. P., Carbasse, J. S., Peinado-Olarte, R. L. and Göker, M. (2021). TYGS and LPSN: a database tandem for fast and reliable genome-based classification and nomenclature of prokaryotes. Nucleic Acids Research 50, D801–D807.
Average Nucleotide Identity (ANI)
Phylogenies
Source Code
---
execute:
  eval: TRUE
  message: FALSE
  warning: FALSE
---

# Digital DNA:DNA hybridization (dDDH) {.unnumbered}

```{r}
library(tidyverse)
library(reshape2)
library(ggtext)
library(viridis)
library(openxlsx2)
```

## GGDC

Digital DNA:DNA hybridization was conducted for the same set of 30 *Corynebacterium* genomes included in the ANI analysis (listed in `NovCor_ANI_GenomeList_v01.csv`). Pairwise comparisons were performed using the online Genome-to-Genome Distance Calculator ([GGDC](https://ggdc.dsmz.de/ggdc.php))[@meier-kolthoff2021; @meier-kolthoff2013] with default parameters. The results based on the recommended GGDC Formula 2 are included in Table S1.

```{r}
# Set the folder path
path_i="data/DDH/ggdc_results"

# List all matching files
file_list <- list.files(path = path_i, pattern = "^ggdc_results_.*\\.csv$", full.names = TRUE)

# Function to read each file
read_ggdc <- function(file) {
  # Read the data skipping the second header row
  df <- read_csv(file, skip = 1, show_col_types = FALSE)
  
  # Make column names unique
  names(df) <- make.names(names(df), unique = TRUE)
  
  return(df)
}

# Read and bind all files
all_ggdc_results <- map_dfr(file_list, read_ggdc)

# Select desired columns for Formula 2
ggdc_formula2 <- all_ggdc_results %>%
  select(
    Query.genome,
    Reference.genome,
    `DDH...7`,
    `Model.C.I....8`,
    `Distance...9`,
    `Prob..DDH....70....10`
  )

# Rename them to cleaner names
ggdc_formula2 <- ggdc_formula2 %>%
  rename(
    Genome1 = Query.genome,
    Genome2 = Reference.genome,
    DDH = `DDH...7`,
    CI = `Model.C.I....8`,
    Distance = `Distance...9`,
    Prob_DDH_70 = `Prob..DDH....70....10`
  )
```

```{r}
# Read the strain order from a CSV file and specify the order of the levels
StrainOrder <- read_csv("data/genome_lists/NovCor_pyANI_PlotOrder_v02.txt")
StrainOrder$Genome <- factor(StrainOrder$Genome, levels=rev(StrainOrder$Genome))

# Join the ggdc_formula2 and StrainOrder data frames based on Genome1
ggdc_formula2 <- left_join(ggdc_formula2, StrainOrder, by = c("Genome1" = "Genome")) %>% 
  rename(styled_Genome1 = styled_Genome) 

# Convert the 'styled_Genome1' and 'Genome1' columns to a factor and specify the order of the levels
ggdc_formula2$styled_Genome1 <- factor(ggdc_formula2$styled_Genome1, levels=rev(StrainOrder$styled_Genome))
ggdc_formula2$Genome1 <- factor(ggdc_formula2$Genome1, levels=rev(StrainOrder$Genome))

# Join the ggdc_formula2 and StrainOrder data frames based on Genome2
ggdc_formula2 <- left_join(ggdc_formula2, StrainOrder, by = c("Genome2" = "Genome")) %>% 
  rename(styled_Genome2 = styled_Genome) 

# Convert the 'styled_Genome2' and 'Genome2' columns to a factor and specify the order of the levels
ggdc_formula2$styled_Genome2 <- factor(ggdc_formula2$styled_Genome2, levels=rev(StrainOrder$styled_Genome))
ggdc_formula2$Genome2 <- factor(ggdc_formula2$Genome2, levels=StrainOrder$Genome)

# Save as a single csv file
write_csv(ggdc_formula2, "data/DDH/ggdc_formula2.csv")
```

## Excel

Save file to include DDH formula 2 as Supplemental Material

```{r}
# Prepare genome levels
genome1_levels <- levels(ggdc_formula2$Genome1)

# Create workbook
wb <- wb_workbook()

for (sheet_name in genome1_levels) {
  sheet_data <- ggdc_formula2 %>%
    filter(Genome1 == sheet_name) %>%
    select(-starts_with("styled_")) %>%
    arrange(Genome2)

  wb <- wb %>%
    wb_add_worksheet(sheet_name) %>%
    wb_add_data(sheet = sheet_name, x = paste0("GGDC Formula 2 Results for Genome ", sheet_name), start_col = 1, start_row = 1) %>%
    wb_add_data(sheet = sheet_name, x = sheet_data, start_col = 1, start_row = 2) %>%
    wb_set_col_widths(sheet = sheet_name, cols = 1:ncol(sheet_data), widths = 20) %>%
    wb_set_base_font(font_size = 12, font_name = "Arial") %>%
    wb_add_font(dims = "A1:AF2", bold = TRUE, size = 14)
}

# Save workbook
wb$save("data/DDH/ggdc_formula2_by_Genome.xlsx")
```

## Plot

We match the ordering in `ANIb_percentage_identity.svg` by creating an ordering file `NovCor_pyANI_PlotOrder.txt`
```{r}
# Create a ggplot object with styled_Genome1 on the x-axis, styled_Genome2 on the y-axis, and fill color representing DDH
pDDH <- ggplot(ggdc_formula2, aes(x = styled_Genome1, y = styled_Genome2, fill = DDH)) + 
# Add a tile layer to the plot with specified width and height
  geom_tile(width = 1.0, height = 1.0) +
# Add text to each tile showing the rounded DDH value
  geom_text(aes(label = DDH), size = 3, colour = "black") +
# Use the viridis color palette for the fill color and scale the labels to percent
  scale_fill_viridis(option = "A", begin = 0.6) + 
# Set the position of the y-axis labels to the right
  scale_y_discrete(position = "right") +
# Set the plot title and remove the x and y axis labels
  labs(x = NULL, y = NULL, title = NULL) +
# Use a white background for the plot
  theme_bw() +
# Customize the plot theme
  theme(axis.text.x = element_markdown(angle = 45, vjust = 1, hjust = 1, colour = "black", face = "bold"),
        axis.text.y = element_markdown(colour = "black", face = "bold"),
        panel.border = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        axis.ticks = element_blank(),
        plot.margin = margin(0, 0.25, 0.25, 1, "cm"),
        legend.position = c(1.12, -0.12), 
        legend.direction = "horizontal",
        legend.justification = c(1, 0)) +
# Add a colorbar legend for the fill color with a specified title and position
  guides(fill = guide_colourbar(title="dDDH", title.position="top", title.hjust=0.5, barwidth=6))
pDDH
```

```{r}
# Function to add colored rectangles to the plot
add_rectangles_to_heatmap <- function(plot, levels, top, bottom, color) {
  top_pos <- which(levels == top)
  bottom_pos <- which(levels == bottom)

  p_with_rect <- plot +
    geom_rect(
      aes(xmin = top_pos + 0.5, 
          xmax = bottom_pos - 0.5,
          ymin = top_pos + 0.5, 
          ymax = bottom_pos - 0.5),
      fill = NA,
      color = color,
      size = 1
    )
  
  return(p_with_rect)
}
```

```{r}
genome_levels <- levels(StrainOrder$Genome)
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Ctu_MSK207", "Ctu_DSM_44922", "#002BFF") 
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cmq_MSK144", "Cau_620_CAUR", "#39D9E5")  
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cyo_MSK136", "Cyo_KPL2619", "#0A6003")  
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cha_CTNIH23", "Cha_CTNIH22", "#0ABF00") 
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Ccu_CTNIH19", "Ccu_c8Ua_181", "#FF00FF") 
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cna_MSK071", "Cna_KPL4072", "#8200E7")  
pDDH

# Save the plot as PNG
ggsave("data/DDH/Figure/NovCor_dDDH.png", plot = pDDH, width = 12, height = 9, dpi = 600)
```