Digital DNA:DNA hybridization (dDDH)

library(tidyverse)
library(reshape2)
library(ggtext)
library(viridis)
library(openxlsx2)

GGDC

Digital DNA:DNA hybridization was conducted for the same set of 30 Corynebacterium genomes included in the ANI analysis (listed in NovCor_ANI_GenomeList_v01.csv). Pairwise comparisons were performed using the online Genome-to-Genome Distance Calculator (GGDC)(Meier-Kolthoff et al., 2013; Meier-Kolthoff et al., 2021) with default parameters. The results based on the recommended GGDC Formula 2 are included in Table S1.

# Set the folder path
path_i="data/DDH/ggdc_results"

# List all matching files
file_list <- list.files(path = path_i, pattern = "^ggdc_results_.*\\.csv$", full.names = TRUE)

# Function to read each file
read_ggdc <- function(file) {
  # Read the data skipping the second header row
  df <- read_csv(file, skip = 1, show_col_types = FALSE)
  
  # Make column names unique
  names(df) <- make.names(names(df), unique = TRUE)
  
  return(df)
}

# Read and bind all files
all_ggdc_results <- map_dfr(file_list, read_ggdc)

# Select desired columns for Formula 2
ggdc_formula2 <- all_ggdc_results %>%
  select(
    Query.genome,
    Reference.genome,
    `DDH...7`,
    `Model.C.I....8`,
    `Distance...9`,
    `Prob..DDH....70....10`
  )

# Rename them to cleaner names
ggdc_formula2 <- ggdc_formula2 %>%
  rename(
    Genome1 = Query.genome,
    Genome2 = Reference.genome,
    DDH = `DDH...7`,
    CI = `Model.C.I....8`,
    Distance = `Distance...9`,
    Prob_DDH_70 = `Prob..DDH....70....10`
  )

# Read the strain order from a CSV file and specify the order of the levels
StrainOrder <- read_csv("data/genome_lists/NovCor_pyANI_PlotOrder_v02.txt")
StrainOrder$Genome <- factor(StrainOrder$Genome, levels=rev(StrainOrder$Genome))

# Join the ggdc_formula2 and StrainOrder data frames based on Genome1
ggdc_formula2 <- left_join(ggdc_formula2, StrainOrder, by = c("Genome1" = "Genome")) %>% 
  rename(styled_Genome1 = styled_Genome) 

# Convert the 'styled_Genome1' and 'Genome1' columns to a factor and specify the order of the levels
ggdc_formula2$styled_Genome1 <- factor(ggdc_formula2$styled_Genome1, levels=rev(StrainOrder$styled_Genome))
ggdc_formula2$Genome1 <- factor(ggdc_formula2$Genome1, levels=rev(StrainOrder$Genome))

# Join the ggdc_formula2 and StrainOrder data frames based on Genome2
ggdc_formula2 <- left_join(ggdc_formula2, StrainOrder, by = c("Genome2" = "Genome")) %>% 
  rename(styled_Genome2 = styled_Genome) 

# Convert the 'styled_Genome2' and 'Genome2' columns to a factor and specify the order of the levels
ggdc_formula2$styled_Genome2 <- factor(ggdc_formula2$styled_Genome2, levels=rev(StrainOrder$styled_Genome))
ggdc_formula2$Genome2 <- factor(ggdc_formula2$Genome2, levels=StrainOrder$Genome)

# Save as a single csv file
write_csv(ggdc_formula2, "data/DDH/ggdc_formula2.csv")

Excel

Save file to include DDH formula 2 as Supplemental Material

# Prepare genome levels
genome1_levels <- levels(ggdc_formula2$Genome1)

# Create workbook
wb <- wb_workbook()

for (sheet_name in genome1_levels) {
  sheet_data <- ggdc_formula2 %>%
    filter(Genome1 == sheet_name) %>%
    select(-starts_with("styled_")) %>%
    arrange(Genome2)

  wb <- wb %>%
    wb_add_worksheet(sheet_name) %>%
    wb_add_data(sheet = sheet_name, x = paste0("GGDC Formula 2 Results for Genome ", sheet_name), start_col = 1, start_row = 1) %>%
    wb_add_data(sheet = sheet_name, x = sheet_data, start_col = 1, start_row = 2) %>%
    wb_set_col_widths(sheet = sheet_name, cols = 1:ncol(sheet_data), widths = 20) %>%
    wb_set_base_font(font_size = 12, font_name = "Arial") %>%
    wb_add_font(dims = "A1:AF2", bold = TRUE, size = 14)
}

# Save workbook
wb$save("data/DDH/ggdc_formula2_by_Genome.xlsx")

Plot

We match the ordering in ANIb_percentage_identity.svg by creating an ordering file NovCor_pyANI_PlotOrder.txt

# Create a ggplot object with styled_Genome1 on the x-axis, styled_Genome2 on the y-axis, and fill color representing DDH
pDDH <- ggplot(ggdc_formula2, aes(x = styled_Genome1, y = styled_Genome2, fill = DDH)) + 
# Add a tile layer to the plot with specified width and height
  geom_tile(width = 1.0, height = 1.0) +
# Add text to each tile showing the rounded DDH value
  geom_text(aes(label = DDH), size = 3, colour = "black") +
# Use the viridis color palette for the fill color and scale the labels to percent
  scale_fill_viridis(option = "A", begin = 0.6) + 
# Set the position of the y-axis labels to the right
  scale_y_discrete(position = "right") +
# Set the plot title and remove the x and y axis labels
  labs(x = NULL, y = NULL, title = NULL) +
# Use a white background for the plot
  theme_bw() +
# Customize the plot theme
  theme(axis.text.x = element_markdown(angle = 45, vjust = 1, hjust = 1, colour = "black", face = "bold"),
        axis.text.y = element_markdown(colour = "black", face = "bold"),
        panel.border = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        axis.ticks = element_blank(),
        plot.margin = margin(0, 0.25, 0.25, 1, "cm"),
        legend.position = c(1.12, -0.12), 
        legend.direction = "horizontal",
        legend.justification = c(1, 0)) +
# Add a colorbar legend for the fill color with a specified title and position
  guides(fill = guide_colourbar(title="dDDH", title.position="top", title.hjust=0.5, barwidth=6))
pDDH

# Function to add colored rectangles to the plot
add_rectangles_to_heatmap <- function(plot, levels, top, bottom, color) {
  top_pos <- which(levels == top)
  bottom_pos <- which(levels == bottom)

  p_with_rect <- plot +
    geom_rect(
      aes(xmin = top_pos + 0.5, 
          xmax = bottom_pos - 0.5,
          ymin = top_pos + 0.5, 
          ymax = bottom_pos - 0.5),
      fill = NA,
      color = color,
      size = 1
    )
  
  return(p_with_rect)
}

genome_levels <- levels(StrainOrder$Genome)
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Ctu_MSK207", "Ctu_DSM_44922", "#002BFF") 
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cmq_MSK144", "Cau_620_CAUR", "#39D9E5")  
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cyo_MSK136", "Cyo_KPL2619", "#0A6003")  
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cha_CTNIH23", "Cha_CTNIH22", "#0ABF00") 
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Ccu_CTNIH19", "Ccu_c8Ua_181", "#FF00FF") 
pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cna_MSK071", "Cna_KPL4072", "#8200E7")  
pDDH

# Save the plot as PNG
ggsave("data/DDH/Figure/NovCor_dDDH.png", plot = pDDH, width = 12, height = 9, dpi = 600)

--- execute: eval: TRUE message: FALSE warning: FALSE --- # Digital DNA:DNA hybridization (dDDH) {.unnumbered} ```{r} library(tidyverse) library(reshape2) library(ggtext) library(viridis) library(openxlsx2) ``` ## GGDC Digital DNA:DNA hybridization was conducted for the same set of 30 *Corynebacterium* genomes included in the ANI analysis (listed in `NovCor_ANI_GenomeList_v01.csv`). Pairwise comparisons were performed using the online Genome-to-Genome Distance Calculator ([GGDC](https://ggdc.dsmz.de/ggdc.php))[@meier-kolthoff2021; @meier-kolthoff2013] with default parameters. The results based on the recommended GGDC Formula 2 are included in Table S1. ```{r} # Set the folder path path_i="data/DDH/ggdc_results" # List all matching files file_list <- list.files(path = path_i, pattern = "^ggdc_results_.*\\.csv$", full.names = TRUE) # Function to read each file read_ggdc <- function(file) { # Read the data skipping the second header row df <- read_csv(file, skip = 1, show_col_types = FALSE) # Make column names unique names(df) <- make.names(names(df), unique = TRUE) return(df) } # Read and bind all files all_ggdc_results <- map_dfr(file_list, read_ggdc) # Select desired columns for Formula 2 ggdc_formula2 <- all_ggdc_results %>% select( Query.genome, Reference.genome, `DDH...7`, `Model.C.I....8`, `Distance...9`, `Prob..DDH....70....10` ) # Rename them to cleaner names ggdc_formula2 <- ggdc_formula2 %>% rename( Genome1 = Query.genome, Genome2 = Reference.genome, DDH = `DDH...7`, CI = `Model.C.I....8`, Distance = `Distance...9`, Prob_DDH_70 = `Prob..DDH....70....10` ) ``` ```{r} # Read the strain order from a CSV file and specify the order of the levels StrainOrder <- read_csv("data/genome_lists/NovCor_pyANI_PlotOrder_v02.txt") StrainOrder$Genome <- factor(StrainOrder$Genome, levels=rev(StrainOrder$Genome)) # Join the ggdc_formula2 and StrainOrder data frames based on Genome1 ggdc_formula2 <- left_join(ggdc_formula2, StrainOrder, by = c("Genome1" = "Genome")) %>% rename(styled_Genome1 = styled_Genome) # Convert the 'styled_Genome1' and 'Genome1' columns to a factor and specify the order of the levels ggdc_formula2$styled_Genome1 <- factor(ggdc_formula2$styled_Genome1, levels=rev(StrainOrder$styled_Genome)) ggdc_formula2$Genome1 <- factor(ggdc_formula2$Genome1, levels=rev(StrainOrder$Genome)) # Join the ggdc_formula2 and StrainOrder data frames based on Genome2 ggdc_formula2 <- left_join(ggdc_formula2, StrainOrder, by = c("Genome2" = "Genome")) %>% rename(styled_Genome2 = styled_Genome) # Convert the 'styled_Genome2' and 'Genome2' columns to a factor and specify the order of the levels ggdc_formula2$styled_Genome2 <- factor(ggdc_formula2$styled_Genome2, levels=rev(StrainOrder$styled_Genome)) ggdc_formula2$Genome2 <- factor(ggdc_formula2$Genome2, levels=StrainOrder$Genome) # Save as a single csv file write_csv(ggdc_formula2, "data/DDH/ggdc_formula2.csv") ``` ## Excel Save file to include DDH formula 2 as Supplemental Material ```{r} # Prepare genome levels genome1_levels <- levels(ggdc_formula2$Genome1) # Create workbook wb <- wb_workbook() for (sheet_name in genome1_levels) { sheet_data <- ggdc_formula2 %>% filter(Genome1 == sheet_name) %>% select(-starts_with("styled_")) %>% arrange(Genome2) wb <- wb %>% wb_add_worksheet(sheet_name) %>% wb_add_data(sheet = sheet_name, x = paste0("GGDC Formula 2 Results for Genome ", sheet_name), start_col = 1, start_row = 1) %>% wb_add_data(sheet = sheet_name, x = sheet_data, start_col = 1, start_row = 2) %>% wb_set_col_widths(sheet = sheet_name, cols = 1:ncol(sheet_data), widths = 20) %>% wb_set_base_font(font_size = 12, font_name = "Arial") %>% wb_add_font(dims = "A1:AF2", bold = TRUE, size = 14) } # Save workbook wb$save("data/DDH/ggdc_formula2_by_Genome.xlsx") ``` ## Plot We match the ordering in `ANIb_percentage_identity.svg` by creating an ordering file `NovCor_pyANI_PlotOrder.txt` ```{r} # Create a ggplot object with styled_Genome1 on the x-axis, styled_Genome2 on the y-axis, and fill color representing DDH pDDH <- ggplot(ggdc_formula2, aes(x = styled_Genome1, y = styled_Genome2, fill = DDH)) + # Add a tile layer to the plot with specified width and height geom_tile(width = 1.0, height = 1.0) + # Add text to each tile showing the rounded DDH value geom_text(aes(label = DDH), size = 3, colour = "black") + # Use the viridis color palette for the fill color and scale the labels to percent scale_fill_viridis(option = "A", begin = 0.6) + # Set the position of the y-axis labels to the right scale_y_discrete(position = "right") + # Set the plot title and remove the x and y axis labels labs(x = NULL, y = NULL, title = NULL) + # Use a white background for the plot theme_bw() + # Customize the plot theme theme(axis.text.x = element_markdown(angle = 45, vjust = 1, hjust = 1, colour = "black", face = "bold"), axis.text.y = element_markdown(colour = "black", face = "bold"), panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.ticks = element_blank(), plot.margin = margin(0, 0.25, 0.25, 1, "cm"), legend.position = c(1.12, -0.12), legend.direction = "horizontal", legend.justification = c(1, 0)) + # Add a colorbar legend for the fill color with a specified title and position guides(fill = guide_colourbar(title="dDDH", title.position="top", title.hjust=0.5, barwidth=6)) pDDH ``` ```{r} # Function to add colored rectangles to the plot add_rectangles_to_heatmap <- function(plot, levels, top, bottom, color) { top_pos <- which(levels == top) bottom_pos <- which(levels == bottom) p_with_rect <- plot + geom_rect( aes(xmin = top_pos + 0.5, xmax = bottom_pos - 0.5, ymin = top_pos + 0.5, ymax = bottom_pos - 0.5), fill = NA, color = color, size = 1 ) return(p_with_rect) } ``` ```{r} genome_levels <- levels(StrainOrder$Genome) pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Ctu_MSK207", "Ctu_DSM_44922", "#002BFF") pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cmq_MSK144", "Cau_620_CAUR", "#39D9E5") pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cyo_MSK136", "Cyo_KPL2619", "#0A6003") pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cha_CTNIH23", "Cha_CTNIH22", "#0ABF00") pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Ccu_CTNIH19", "Ccu_c8Ua_181", "#FF00FF") pDDH <- add_rectangles_to_heatmap(pDDH, genome_levels, "Cna_MSK071", "Cna_KPL4072", "#8200E7") pDDH # Save the plot as PNG ggsave("data/DDH/Figure/NovCor_dDDH.png", plot = pDDH, width = 12, height = 9, dpi = 600) ```