Anvio

library(tidyverse)
library(ggh4x)
library(RColorBrewer)
library(viridis)
library(ggtext)
library(kableExtra)

This notebook contains analyses using anvi’o version 8 (Delmont and Eren, 2018; Eren et al., 2020) (development branch). For more in-depth information about anvi’o go to their official online site here: https://anvio.org/

Functional annotations

We use as input (path_i) the anvi’o contigs database files (.db) that were generated in Methods_Prokka_Annotations.

KEGG annotation

First we downloaded and set up the KEGG KOfam(Kanehisa, 2000; Kanehisa et al., 2022) database using anvi-setup-kegg-data. We used the snapshot of the KEGG database dated 2023-09-22 (hash a2b5bde358bb). It contains 479 modules with 15116 entries.

The program anvi-run-kegg-kofams annotates genes in a given anvi’o contigs database with KEGG Orthology (KO) numbers via hits to the KEGG KOfam database. We used -E lower than 1e-15.

#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"

for file in $path_i/*.db; do
    anvi-run-kegg-kofams -T 8 --log-bitscores -E 1e-15 --just-do-it -c $file;
done

The option --log-bitscores records the bit scores values. They get saved to the main folder for the repo, so we moved them to a new subfolder


path_i <- "."
path_o <- "data/Anvio8/KEGG_bitscores"
dir.create(file.path("data/Anvio8/KEGG_bitscores"), recursive = TRUE, showWarnings = FALSE)
files_to_move <- list.files(path_i, pattern = "_bitscores.txt$", full.names = TRUE)

for (file in files_to_move) {
  new_path <- file.path(path_o, basename(file))
  print(new_path)
  file.rename(file, new_path)
}

Metabolic Analysis

Estimation of KEGG module completeness

Using anvi-estimate-metabolism we predicted KEGG module completeness as described in (Veseli et al., 2023) for the the organisms listed in NovCor_MetabolicAnalysis_GenomeList_v01.txt. This list includes the 30 Corynebacterium genomes selected for metabolic analysis.

#conda activate anvio-dev

txt_file="data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v03.txt"
prefix=NovCor
path_o="data/Anvio8/Metabolic_Analysis"
mkdir -p "$path_o"

anvi-estimate-metabolism -e $txt_file \
                         -O $path_o/$prefix --add-copy-number --output-modes modules

The generated NovCor_modules.txt file is included as Table S3 in this publication.

Code for Table 1

# Read modules data and make appropriate labeling columns
Modules <- read_delim("data/Anvio8/Metabolic_Analysis/NovCor_modules.txt", show_col_types = FALSE, col_types = cols(unique_enzymes_hit_counts = col_character(), gene_caller_ids_in_module = col_character())) %>% 
  mutate(module_class_short = paste0("(", substr(module_class, 1, 1), ")")) %>%
  unite(modules_label, c(module_class_short, module_category, module_subcategory), sep = "_", remove = FALSE) %>% 
  mutate_at(c("genome_name", "module", "module_name", "module_category", "module_subcategory", "modules_label"), factor)

# Summary tally table of fully stepwise complete modules by species. Filter to modules present in at least 25 genomes
ModulesTable <- Modules %>%
  filter(stepwise_module_completeness == 1) %>%
  group_by(module_class, module_category, module_subcategory, module, module_name) %>%
  tally() %>%
  ungroup() %>%
  select(module, module_name, n, -module_class) %>%  
  arrange(desc(n)) %>%
  filter(n > 25)

write_csv(ModulesTable, "data/Anvio8/Metabolic_Analysis/Fig_Tables/NovCor_Table1.csv")
kable(ModulesTable)

Code for Figure 3

StrainOrder <- read_csv("data/genome_lists/NovCor_MetabolicAnalysis_PlotOrder.txt")

# When using stepwise_module_completeness some modules can be = 0 for all genomes but still have pathwise_module_completeness > 0. We are removing those modules.
Modules_cleaned <- Modules %>%
  group_by(module) %>%
  filter(!all(stepwise_module_completeness == 0)) %>%
  ungroup() %>%
  mutate(across(where(is.factor), droplevels))

all_zero_modules <- setdiff(Modules$module, Modules_cleaned$module)
all_zero_subcategory <- setdiff(Modules$module_subcategory, Modules_cleaned$module_subcategory)

# Calculating mean_stepwise_module_completeness and preparing plot labels 
Modules_Means = Modules_cleaned %>% 
  group_by(modules_label, genome_name, .drop = FALSE) %>%
  summarise(mean_stepwise_module_completeness = mean(stepwise_module_completeness)) %>%
  mutate(mean_stepwise_module_completeness = coalesce(mean_stepwise_module_completeness, 0)) %>%
  mutate(species_name = str_extract(genome_name, "^[^_]+")) %>%
  mutate(module_category = str_extract(modules_label, "^[^_]+_[^_]+")) %>%
  mutate(module_category = str_replace_all(module_category, "_", " ")) %>%
  mutate(modules_final_label = str_replace(modules_label, "^[^_]+_[^_]+_", "")) %>%
  mutate_at(c("module_category", "modules_final_label"), factor) 

Modules_Means <- left_join(Modules_Means, StrainOrder, by = "genome_name")

Modules_Means$species_name <- sub("Cke", "Cna", Modules_Means$species_name)
Modules_Means$species_name <- sub("Cau", "Cmq", Modules_Means$species_name)
Modules_Means$species_name <- factor(Modules_Means$species_name, levels = c("Cna","Ccu","Cha","Cyo","Cmq","Ctu"))
Modules_Means$genome_name <- factor(Modules_Means$genome_name, levels = StrainOrder$genome_name)
Modules_Means$styled_genome_name <- factor(Modules_Means$styled_genome_name, levels = rev(StrainOrder$styled_genome_name))
Modules_Means$modules_final_label  <- factor(Modules_Means$modules_final_label, rev(levels(Modules_Means$modules_final_label)))

# Colors for plot
colorsSpecies <- c("#8200E7", "#FF00FF", "#0ABF00", "#0A6003", "#39D9E5", "#002BFF")

# Heatmap plot
plot <- ggplot(Modules_Means, aes(x = styled_genome_name, y = modules_final_label, fill = mean_stepwise_module_completeness)) + 
  geom_tile(color = "white") +
  facet_grid2(module_category ~ species_name, scales = "free", space = "free", switch = "y", labeller = label_wrap_gen(width = 31),
              #strip = strip_themed(background_x = elem_list_rect(fill = colorsSpecies))) +
              strip = strip_themed(text_x = elem_list_text(colour = colorsSpecies))) +
  labs(x = "", y = "") +
  scale_y_discrete(expand = c(0,0), position = "right") +
  scale_x_discrete(expand = c(0,0)) +
  scale_fill_viridis(direction = -1, option = "A", begin = 0.2) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_rect(colour = "black", fill = NA),
        strip.text.y.left = element_text(angle = 0, hjust = 0.5, size = 11),
        strip.text.x = element_text(size = 12, face = "bold"),
        axis.text.y = element_text(size = 11, color = "black"),
        axis.text.x = element_markdown(angle = 45, hjust = 1, size = 11, color = "black"),
        panel.spacing = unit(0, 'pt'),
        legend.position = c(1.30, -0.11), 
        legend.direction = "horizontal",
        legend.justification = c(1, 0)) +
  guides(fill = guide_colourbar(title = "Average stepwise\nmodule completeness", title.position = "top", title.hjust = 0.5, barwidth = 8))


ggsave(plot = plot, filename = "data/Anvio8/Metabolic_Analysis/Fig_Tables/NovCor_Figure3.jpg", height = 13, width = 12, device = 'jpg', dpi = 600)

plot

--- execute: message: FALSE warning: FALSE --- # Anvio {.unnumbered} ```{r} library(tidyverse) library(ggh4x) library(RColorBrewer) library(viridis) library(ggtext) library(kableExtra) ``` This notebook contains analyses using anvi'o version 8 [@eren2020; @delmont2018] (development branch). For more in-depth information about anvi'o go to their official online site here: <https://anvio.org/> ## Functional annotations We use as input (`path_i`) the anvi'o contigs database files (`.db`) that were generated in [[Methods_Prokka_Annotations]{.underline}](Methods_Prokka_Annotations.html). ::: {.content-hidden when-profile="manuscript"} This command runs all the annotations available on anvi'o 8 in a loop, for a detailed step by step process, see below. ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Contigs_db" for file in $path_i/*.db; do anvi-run-ncbi-cogs -T 8 --search-with diamond -c $file anvi-run-kegg-kofams -T 8 --log-bitscores -E 1e-15 --just-do-it -c $file anvi-run-pfams -T 8 -c $file anvi-run-cazymes -T 8 -c $file anvi-run-hmms -T 8 -c $file --just-do-it anvi-reaction-network -c $file; done ``` ### COG annotation First we downloaded and set up the NCBI's Clusters of Orthologous Groups database[@Tatusov2000, @Galperin2021] using `anvi-setup-ncbi-cogs`. The default is the latest version, which is COG20, meaning that anvi'o will use the NCBI's 2020 release of COGs to setup the database. The `anvi-run-ncbi-cogs` command was used to annotate the contig databases. We used DIAMOND as searching method[@buchfink2014] with the recommended "sensitive" option. ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Contigs_db" for file in $path_i/*.db; do anvi-run-ncbi-cogs -T 8 --search-with diamond -c $file; done ``` ::: ### KEGG annotation First we downloaded and set up the KEGG KOfam[@Kanehisa2000; @Kanehisa2023] database using `anvi-setup-kegg-data`. We used the snapshot of the KEGG database dated 2023-09-22 (hash a2b5bde358bb). It contains 479 modules with 15116 entries. The program `anvi-run-kegg-kofams` annotates genes in a given anvi'o contigs database with KEGG Orthology (KO) numbers via hits to the KEGG KOfam database. We used `-E` lower than 1e-15. ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Contigs_db" for file in $path_i/*.db; do anvi-run-kegg-kofams -T 8 --log-bitscores -E 1e-15 --just-do-it -c $file; done ``` The option `--log-bitscores` records the bit scores values. They get saved to the main folder for the repo, so we moved them to a new subfolder ```{bash, eval=FALSE} path_i <- "." path_o <- "data/Anvio8/KEGG_bitscores" dir.create(file.path("data/Anvio8/KEGG_bitscores"), recursive = TRUE, showWarnings = FALSE) files_to_move <- list.files(path_i, pattern = "_bitscores.txt$", full.names = TRUE) for (file in files_to_move) { new_path <- file.path(path_o, basename(file)) print(new_path) file.rename(file, new_path) } ``` ::: {.content-hidden when-profile="manuscript"} ### PFAM annotation First you need to download and set up the PFAM database[@Bateman2000; @Mistry2021] using `anvi-setup-pfams`: this is something you will do only once. We used Pfam version 36.0 (2023-07). The program `anvi-run-kegg-pfam` annotates genes in a given anvi'o contigs database with the PFAM database. ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Contigs_db" for file in $path_i/*.db; do anvi-run-pfams -T 8 -c $file; done ``` ### CAZyme Annotation First you need to download and set up the CAZyme database [@drula2021] using `anvi-setup-cazymes`: this is something you will do only once. We used Pfam version 11. The program `anvi-run-cazymes` annotates genes in your contigs-db with the dbCAN CAZyme database. ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Contigs_db" for file in $path_i/*.db; do anvi-run-cazymes -T 8 -c $file; done ``` ### Getting HMMs When you run the `anvi-run-hmms` command the occurrence of bacterial single-copy genes across your contigs is added to your contigs database [@Eddy2011]. ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Contigs_db" for file in $path_i/*.db; do anvi-run-hmms -T 8 -c $file --just-do-it; done ``` ### Getting Metabolic Networks The program `anvi-reaction-network` is used to store a metabolic reaction-network in each contigs-db. The network consists of data on biochemical reactions predicted to be encoded by each genome, referencing the KEGG Orthology (KO) and ModelSEED Biochemistry databases. ```{bash, eval=FALSE} #conda activate anvio-dev for file in $path_i/*.db; do anvi-reaction-network -c $file; done ``` ### Annotation Summaries This generates basic stats on the annotations: ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Contigs_db" path_o="data/Anvio8/Annotation_summaries" mkdir -p "$path_o" for file in $path_i/*.db; do FILENAME=`basename ${file%.*}` anvi-display-contigs-stats $file --report-as-text -o $path_o/stats_$FILENAME.txt; done ``` This provides basic info on the contigs databases: ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Contigs_db" path_o="data/Anvio8/Annotation_summaries" for file in $path_i/*.db; do anvi-db-info $file &>> $path_o/anvi-db-info_log.txt; done ``` ::: ## Metabolic Analysis ### Estimation of KEGG module completeness Using `anvi-estimate-metabolism` we predicted KEGG module completeness as described in [@veseli2023] for the the organisms listed in `NovCor_MetabolicAnalysis_GenomeList_v01.txt`. This list includes the 30 *Corynebacterium* genomes selected for metabolic analysis. ```{bash, eval=FALSE} #conda activate anvio-dev txt_file="data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v03.txt" prefix=NovCor path_o="data/Anvio8/Metabolic_Analysis" mkdir -p "$path_o" anvi-estimate-metabolism -e $txt_file \ -O $path_o/$prefix --add-copy-number --output-modes modules ``` The generated `NovCor_modules.txt` file is included as **Table S3** in this publication. #### Code for Table 1 ```{r} # Read modules data and make appropriate labeling columns Modules <- read_delim("data/Anvio8/Metabolic_Analysis/NovCor_modules.txt", show_col_types = FALSE, col_types = cols(unique_enzymes_hit_counts = col_character(), gene_caller_ids_in_module = col_character())) %>% mutate(module_class_short = paste0("(", substr(module_class, 1, 1), ")")) %>% unite(modules_label, c(module_class_short, module_category, module_subcategory), sep = "_", remove = FALSE) %>% mutate_at(c("genome_name", "module", "module_name", "module_category", "module_subcategory", "modules_label"), factor) # Summary tally table of fully stepwise complete modules by species. Filter to modules present in at least 25 genomes ModulesTable <- Modules %>% filter(stepwise_module_completeness == 1) %>% group_by(module_class, module_category, module_subcategory, module, module_name) %>% tally() %>% ungroup() %>% select(module, module_name, n, -module_class) %>% arrange(desc(n)) %>% filter(n > 25) write_csv(ModulesTable, "data/Anvio8/Metabolic_Analysis/Fig_Tables/NovCor_Table1.csv") kable(ModulesTable) ``` #### Code for Figure 3 ```{r} StrainOrder <- read_csv("data/genome_lists/NovCor_MetabolicAnalysis_PlotOrder.txt") # When using stepwise_module_completeness some modules can be = 0 for all genomes but still have pathwise_module_completeness > 0. We are removing those modules. Modules_cleaned <- Modules %>% group_by(module) %>% filter(!all(stepwise_module_completeness == 0)) %>% ungroup() %>% mutate(across(where(is.factor), droplevels)) all_zero_modules <- setdiff(Modules$module, Modules_cleaned$module) all_zero_subcategory <- setdiff(Modules$module_subcategory, Modules_cleaned$module_subcategory) # Calculating mean_stepwise_module_completeness and preparing plot labels Modules_Means = Modules_cleaned %>% group_by(modules_label, genome_name, .drop = FALSE) %>% summarise(mean_stepwise_module_completeness = mean(stepwise_module_completeness)) %>% mutate(mean_stepwise_module_completeness = coalesce(mean_stepwise_module_completeness, 0)) %>% mutate(species_name = str_extract(genome_name, "^[^_]+")) %>% mutate(module_category = str_extract(modules_label, "^[^_]+_[^_]+")) %>% mutate(module_category = str_replace_all(module_category, "_", " ")) %>% mutate(modules_final_label = str_replace(modules_label, "^[^_]+_[^_]+_", "")) %>% mutate_at(c("module_category", "modules_final_label"), factor) Modules_Means <- left_join(Modules_Means, StrainOrder, by = "genome_name") Modules_Means$species_name <- sub("Cke", "Cna", Modules_Means$species_name) Modules_Means$species_name <- sub("Cau", "Cmq", Modules_Means$species_name) Modules_Means$species_name <- factor(Modules_Means$species_name, levels = c("Cna","Ccu","Cha","Cyo","Cmq","Ctu")) Modules_Means$genome_name <- factor(Modules_Means$genome_name, levels = StrainOrder$genome_name) Modules_Means$styled_genome_name <- factor(Modules_Means$styled_genome_name, levels = rev(StrainOrder$styled_genome_name)) Modules_Means$modules_final_label <- factor(Modules_Means$modules_final_label, rev(levels(Modules_Means$modules_final_label))) ``` ```{r} #| fig-width: 12 #| fig-height: 13 # Colors for plot colorsSpecies <- c("#8200E7", "#FF00FF", "#0ABF00", "#0A6003", "#39D9E5", "#002BFF") # Heatmap plot plot <- ggplot(Modules_Means, aes(x = styled_genome_name, y = modules_final_label, fill = mean_stepwise_module_completeness)) + geom_tile(color = "white") + facet_grid2(module_category ~ species_name, scales = "free", space = "free", switch = "y", labeller = label_wrap_gen(width = 31), #strip = strip_themed(background_x = elem_list_rect(fill = colorsSpecies))) + strip = strip_themed(text_x = elem_list_text(colour = colorsSpecies))) + labs(x = "", y = "") + scale_y_discrete(expand = c(0,0), position = "right") + scale_x_discrete(expand = c(0,0)) + scale_fill_viridis(direction = -1, option = "A", begin = 0.2) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), strip.background = element_rect(colour = "black", fill = NA), strip.text.y.left = element_text(angle = 0, hjust = 0.5, size = 11), strip.text.x = element_text(size = 12, face = "bold"), axis.text.y = element_text(size = 11, color = "black"), axis.text.x = element_markdown(angle = 45, hjust = 1, size = 11, color = "black"), panel.spacing = unit(0, 'pt'), legend.position = c(1.30, -0.11), legend.direction = "horizontal", legend.justification = c(1, 0)) + guides(fill = guide_colourbar(title = "Average stepwise\nmodule completeness", title.position = "top", title.hjust = 0.5, barwidth = 8)) ggsave(plot = plot, filename = "data/Anvio8/Metabolic_Analysis/Fig_Tables/NovCor_Figure3.jpg", height = 13, width = 12, device = 'jpg', dpi = 600) plot ``` ::: {.content-hidden when-profile="manuscript"} ### Module Enrichment Analysis The `NovCor_modules.txt` file generated in the previous step is used by `anvi-compute-metabolic-enrichment` to calculate KEGG module enrichment[@shaiber2020] across different groups of genomes. We used the `-G` flag to indicate the grouping options defined in file `NovCor_MetabolicAnalysis_Groups_CtuComplex.txt`that includes the 30 *Corynebacterium* genomes grouped by their final species assignment. ```{bash, eval=FALSE} #conda activate anvio-dev csv_file_modules="data/Anvio8/Metabolic_Analysis/NovCor_modules.txt" path_o="data/Anvio8/Metabolic_Analysis/Enrichment_Modules" mkdir -p "$path_o" txt_file_groups="data/genome_lists/NovCor_MetabolicAnalysis_Groups_CtuComplex_v03.txt" output_file="$path_o/NovCor_enriched_modules_stepwise_1_CtuComplex.txt" anvi-compute-metabolic-enrichment -M "$csv_file_modules" \ -G "$txt_file_groups" \ -o "$output_file" \ --use-stepwise-completeness \ --module-completion-threshold 1.0 ``` ### KO Enrichment Analysis We use `anvi-display-functions`. It runs the enrichment for the desired annotations (in this case KOfam). It saves the output to a temp file that can be recovered, renamed and saved into the same `path_o` as the database file. ```{bash, eval=FALSE} #conda activate anvio-dev txt_file="data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v01.txt" path_o="data/Anvio8/Metabolic_Analysis/Enrichment_KOs" mkdir -p "$path_o" txt_file_groups="data/genome_lists/NovCor_MetabolicAnalysis_Groups_CtuComplex.txt" anvi-display-functions -e $txt_file \ --groups-txt $txt_file_groups \ --annotation-source KOfam \ --profile-db $path_o/KOfam_CtuComplex.db txt_file_groups="data/genome_lists/NovCor_MetabolicAnalysis_Groups_References.txt" anvi-display-functions -e $txt_file \ --groups-txt $txt_file_groups \ --annotation-source KOfam \ --profile-db $path_o/KOfam_References.db ``` Many of the enriched KOs belong to the already identified enriched modules. This function filters out those KOs that are part of the definition of modules listed in the enriched modules identified with `anvi-compute-metabolic-enrichment`: ```{bash, eval=FALSE} get_KOs_not_in_enrich_modules <- function(modules, hits, enrich_modules, enrich_KOs, q_value_threshold) { # Get list of KOs that correspond with the modules on the enrichment file modules_subset <- modules %>% filter(module %in% enrich_modules$accession) module_definition_subset <- paste(modules_subset$module_definition, collapse = " ") module_definition_subset <- str_extract_all(module_definition_subset, "\\bK\\d+\\b") module_definition_subset <- unique(unlist(module_definition_subset)) # Add a column to enrich_KOs indicating if the KO is the previous list enrich_KOs$inModule <- sapply(strsplit(as.character(enrich_KOs$accession), ","), function(x) all((x %in% module_definition_subset))) # Filter Enrich_KO based on inModule and q_value_threshold enrich_KOs_filtered <- enrich_KOs %>% filter(inModule == FALSE) %>% filter(adjusted_q_value < q_value_threshold) return(enrich_KOs_filtered) } Modules <- read_delim("data/Anvio8/Metabolic_Analysis/NovCor_modules.txt", show_col_types = FALSE, col_types = cols(unique_enzymes_hit_counts = col_character(), gene_caller_ids_in_module = col_character())) KOHits <- read_delim("data/Anvio8/Metabolic_Analysis/NovCor_hits.txt", show_col_types = FALSE) Enrich_Mod_CtuComplex <- read_delim("data/Anvio8/Metabolic_Analysis/Enrichment_Modules/NovCor_enriched_modules_stepwise_1_CtuComplex.txt", show_col_types = FALSE) Enrich_KO_CtuComplex <- read_delim("data/Anvio8/Metabolic_Analysis/Enrichment_KOs/NovCor_enriched_KOs_CtuComplex.txt", show_col_types = FALSE) Enrich_KO_CtuComplex_Filtered <- get_KOs_not_in_enrich_modules(Modules, KOHits, Enrich_Mod_CtuComplex, Enrich_KO_CtuComplex, 0.1) write_csv(Enrich_KO_CtuComplex_Filtered, "data/Anvio8/Metabolic_Analysis/Enrichment_KOs/NovCor_enriched_KOs_CtuComplex_Filtered.csv") Enrich_Mod_References <- read_delim("data/Anvio8/Metabolic_Analysis/Enrichment_Modules/NovCor_enriched_modules_stepwise_1_References.txt", show_col_types = FALSE) Enrich_KO_References <- read_delim("data/Anvio8/Metabolic_Analysis/Enrichment_KOs/NovCor_enriched_KOs_References.txt", show_col_types = FALSE) Enrich_KO_References_Filtered <- get_KOs_not_in_enrich_modules(Modules, KOHits, Enrich_Mod_References, Enrich_KO_References, 0.1) write_csv(Enrich_KO_References_Filtered, "data/Anvio8/Metabolic_Analysis/Enrichment_KOs/NovCor_enriched_KOs_References_Filtered.csv") ``` ### Pangenomic level analysis This analysis using `NovCor_MetabolicAnalysis_GenomeList_v02.txt` includes the 30 *Corynebacterium* genomes selected for metabolic analysis. ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Pangenomic_Analysis" mkdir -p "$path_i" anvi-gen-genomes-storage -e "data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v02.txt" -o $path_i/NovCor-ANI-GENOMES.db --gene-caller Prodigal anvi-db-info $path_i/NovCor-ANI-GENOMES.db anvi-pan-genome -g $path_i/NovCor-ANI-GENOMES.db \ -o $path_i/ANI -n PAN_ANI \ --use-ncbi-blast --mcl-inflation 10 --num-threads 8 #Create a bin collection named ANI with all Core GCs for the species using the visual interface: anvi-display-pan -p $path_i/ANI/PAN_ANI-PAN.db -g $path_i/NovCor-ANI-GENOMES.db anvi-summarize -p $path_i/ANI/PAN_ANI-PAN.db \ -g $path_i/NovCor-ANI-GENOMES.db \ -o $path_i/ANI/Summaries \ -C ANI ``` ## ANI ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/Anvio8/Contigs_dbv23" #Contigs_dbv22 has the original files anvi-migrate $path_i/*.db --migrate-quickly ``` Anvi'o uses 'PyANI' [@pritchard2016] ```{bash, eval=FALSE} #conda activate anvio-dev path_i="data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v02.txt" path_o="data/Anvio8/ANI" anvi-compute-genome-similarity -e $path_i \ -o $path_o \ --program pyANI \ --method ANIb ``` ### Code for ANI figure ```{r} # Define the path of the input file input = "./data/Anvio8/ANI/ANIb_percentage_identity.txt" ANI_data <- read.table(input, header = TRUE, sep = "\t") # Read the tab-separated data from the file into a data frame called ANI_data ANI_data.long <- ANI_data %>% # Convert the data from wide format to long format pivot_longer(cols = -key, names_to = "Genome2", values_to = "PercentIdentity") %>% # Rename the 'key' column to 'Genome1' rename(Genome1 = key) %>% # Filter out rows where Genome1 or Genome2 is 'Cma_CCUG_32361' filter(Genome1 != "Cma_CCUG_32361" , Genome2 != "Cma_CCUG_32361") %>% # Filter out rows where Genome1 or Genome2 is 'Cac_ATCC_49725' filter(Genome1 != "Cac_ATCC_49725" , Genome2 != "Cac_ATCC_49725") # Read the strain order from a CSV file StrainOrder <- read_csv("data/genome_lists/NovCor_pyANI_PlotOrder.txt") # Join the ANI_data.long and StrainOrder data frames based on Genome1 ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = join_by(Genome1 == Genome)) %>% # Rename the 'styled_Genome' column to 'styled_Genome1' rename(styled_Genome1 = styled_Genome) # Convert the 'styled_Genome1' column to a factor and specify the order of the levels ANI_data.long$styled_Genome1 <- factor(ANI_data.long$styled_Genome1, levels = rev(StrainOrder$styled_Genome)) # Join the ANI_data.long and StrainOrder data frames based on Genome1 ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = join_by(Genome2 == Genome)) %>% # Rename the 'styled_Genome' column to 'styled_Genome2' rename(styled_Genome2 = styled_Genome) # Convert the 'styled_Genome2' column to a factor and specify the order of the levels ANI_data.long$styled_Genome2 <- factor(ANI_data.long$styled_Genome2, levels = rev(StrainOrder$styled_Genome)) ``` ```{r} # Create a ggplot object with styled_Genome1 on the x-axis, styled_Genome2 on the y-axis, and fill color representing PercentIdentity pANI <- ggplot(ANI_data.long, aes(x = styled_Genome1, y = styled_Genome2, fill = PercentIdentity)) + # Add a tile layer to the plot with specified width and height geom_tile(width = 1.0, height = 1.0) + # Add text to each tile showing the rounded PercentIdentity value geom_text(aes(label = sprintf("%.1f", round(PercentIdentity * 100, 1))), size = 3, colour = "black") + # Use the viridis color palette for the fill color and scale the labels to percent scale_fill_viridis(option = "A", begin = 0.4, labels = scales::percent) + # Set the position of the x-axis labels to the bottom scale_x_discrete(position = "bottom") + # Set the position of the y-axis labels to the right scale_y_discrete(position = "right") + # Set the plot title and remove the x and y axis labels labs(x = NULL, y = NULL, title = "Whole-genome Average Nucleotide Identity") + # Use a white background for the plot theme_bw() + # Customize the plot theme theme(axis.text.x = element_markdown(angle = 45, vjust = 1, hjust = 1, color = "black"), axis.text.y = element_markdown(angle = 25, color = "black"), panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.ticks = element_blank(), plot.margin = margin(1, 0, 0, 1.5, "cm"), legend.position = c(1.1, -0.15), legend.direction = "horizontal", legend.justification = c(1, 0)) + # Add a colorbar legend for the fill color with a specified title and position guides(fill = guide_colourbar(title = "% identity", title.position = "top", title.hjust = 0.5, barwidth = 6)) ``` ```{r} # Function to add colored rectangles to the plot add_rectangles_to_heatmap <- function(plot, levels, top, bottom, color) { top_pos <- which(levels == top) bottom_pos <- which(levels == bottom) p_with_rect <- plot + geom_rect( aes(xmin = top_pos + 0.5, xmax = bottom_pos - 0.5, ymin = top_pos + 0.5, ymax = bottom_pos - 0.5), fill = NA, color = color, size = 1 ) return(p_with_rect) } ``` ```{r} # Apply the function to the ANI plot genome_levels <- levels(ANI_data.long$styled_Genome1) pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Cna_KPL4072", "Cna_MSK071", "blue") pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Cyo_KPL2619", "Cyo_MSK136", "red") pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Ccu_CTNIH19", "<b>Ccu_c8Ua_181</b>", "green") pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "<b>Cha_CTNIH22</b>", "Cha_CTNIH23", "purple") pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Ctu_KPL3807", "Ctu_SK141", "orange") pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Cmq_MSK144", "Cau_620_CAUR", "yellow") pANI # Save the plot as PNG ggsave("data/Anvio8/ANI/NovCor_pyani_anvio.png", plot = pANI, width = 12, height = 9, dpi = 300) ``` :::