Description of two novel Corynebacterium species isolated from human nasal passages and skin
  1. Methods
  2. Anvio
  • Introduction
  • Methods
    • Prokka Annotations
    • Average Nucleotide Identity (ANI)
    • Phylogenies
    • Anvio
  • References

Table of contents

  • Functional annotations
    • KEGG annotation
  • Metabolic Analysis
    • Estimation of KEGG module completeness

Anvio

library(tidyverse)
library(ggh4x)
library(RColorBrewer)
library(viridis)
library(ggtext)
library(kableExtra)

This notebook contains analyses using anvi’o version 8 (Delmont and Eren, 2018; Eren et al., 2020) (development branch). For more in-depth information about anvi’o go to their official online site here: https://anvio.org/

Functional annotations

We use as input (path_i) the anvi’o contigs database files (.db) that were generated in Methods_Prokka_Annotations.

KEGG annotation

First we downloaded and set up the KEGG KOfam(Kanehisa, 2000; Kanehisa et al., 2022) database using anvi-setup-kegg-data. We used the snapshot of the KEGG database dated 2023-09-22 (hash a2b5bde358bb). It contains 479 modules with 15116 entries.

The program anvi-run-kegg-kofams annotates genes in a given anvi’o contigs database with KEGG Orthology (KO) numbers via hits to the KEGG KOfam database. We used -E lower than 1e-15.

#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"

for file in $path_i/*.db; do
    anvi-run-kegg-kofams -T 8 --log-bitscores -E 1e-15 --just-do-it -c $file;
done

The option --log-bitscores records the bit scores values. They get saved to the main folder for the repo, so we moved them to a new subfolder


path_i <- "."
path_o <- "data/Anvio8/KEGG_bitscores"
dir.create(file.path("data/Anvio8/KEGG_bitscores"), recursive = TRUE, showWarnings = FALSE)
files_to_move <- list.files(path_i, pattern = "_bitscores.txt$", full.names = TRUE)

for (file in files_to_move) {
  new_path <- file.path(path_o, basename(file))
  print(new_path)
  file.rename(file, new_path)
}

Metabolic Analysis

Estimation of KEGG module completeness

Using anvi-estimate-metabolism we predicted KEGG module completeness as described in (Veseli et al., 2023) for the the organisms listed in NovCor_MetabolicAnalysis_GenomeList_v01.txt. This list includes the 30 Corynebacterium genomes selected for metabolic analysis.

#conda activate anvio-dev

txt_file="data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v03.txt"
prefix=NovCor
path_o="data/Anvio8/Metabolic_Analysis"
mkdir -p "$path_o"

anvi-estimate-metabolism -e $txt_file \
                         -O $path_o/$prefix --add-copy-number --output-modes modules

The generated NovCor_modules.txt file is included as Table S3 in this publication.

Code for Table 1

# Read modules data and make appropriate labeling columns
Modules <- read_delim("data/Anvio8/Metabolic_Analysis/NovCor_modules.txt", show_col_types = FALSE, col_types = cols(unique_enzymes_hit_counts = col_character(), gene_caller_ids_in_module = col_character())) %>% 
  mutate(module_class_short = paste0("(", substr(module_class, 1, 1), ")")) %>%
  unite(modules_label, c(module_class_short, module_category, module_subcategory), sep = "_", remove = FALSE) %>% 
  mutate_at(c("genome_name", "module", "module_name", "module_category", "module_subcategory", "modules_label"), factor)

# Summary tally table of fully stepwise complete modules by species. Filter to modules present in at least 25 genomes
ModulesTable <- Modules %>%
  filter(stepwise_module_completeness == 1) %>%
  group_by(module_class, module_category, module_subcategory, module, module_name) %>%
  tally() %>%
  ungroup() %>%
  select(module, module_name, n, -module_class) %>%  
  arrange(desc(n)) %>%
  filter(n > 25)

write_csv(ModulesTable, "data/Anvio8/Metabolic_Analysis/Fig_Tables/NovCor_Table1.csv")
kable(ModulesTable)

Code for Figure 3

StrainOrder <- read_csv("data/genome_lists/NovCor_MetabolicAnalysis_PlotOrder.txt")

# When using stepwise_module_completeness some modules can be = 0 for all genomes but still have pathwise_module_completeness > 0. We are removing those modules.
Modules_cleaned <- Modules %>%
  group_by(module) %>%
  filter(!all(stepwise_module_completeness == 0)) %>%
  ungroup() %>%
  mutate(across(where(is.factor), droplevels))

all_zero_modules <- setdiff(Modules$module, Modules_cleaned$module)
all_zero_subcategory <- setdiff(Modules$module_subcategory, Modules_cleaned$module_subcategory)

# Calculating mean_stepwise_module_completeness and preparing plot labels 
Modules_Means = Modules_cleaned %>% 
  group_by(modules_label, genome_name, .drop = FALSE) %>%
  summarise(mean_stepwise_module_completeness = mean(stepwise_module_completeness)) %>%
  mutate(mean_stepwise_module_completeness = coalesce(mean_stepwise_module_completeness, 0)) %>%
  mutate(species_name = str_extract(genome_name, "^[^_]+")) %>%
  mutate(module_category = str_extract(modules_label, "^[^_]+_[^_]+")) %>%
  mutate(module_category = str_replace_all(module_category, "_", " ")) %>%
  mutate(modules_final_label = str_replace(modules_label, "^[^_]+_[^_]+_", "")) %>%
  mutate_at(c("module_category", "modules_final_label"), factor) 

Modules_Means <- left_join(Modules_Means, StrainOrder, by = "genome_name")

Modules_Means$species_name <- sub("Cke", "Cna", Modules_Means$species_name)
Modules_Means$species_name <- sub("Cau", "Cmq", Modules_Means$species_name)
Modules_Means$species_name <- factor(Modules_Means$species_name, levels = c("Cna","Ccu","Cha","Cyo","Cmq","Ctu"))
Modules_Means$genome_name <- factor(Modules_Means$genome_name, levels = StrainOrder$genome_name)
Modules_Means$styled_genome_name <- factor(Modules_Means$styled_genome_name, levels = rev(StrainOrder$styled_genome_name))
Modules_Means$modules_final_label  <- factor(Modules_Means$modules_final_label, rev(levels(Modules_Means$modules_final_label)))
# Colors for plot
colorsSpecies <- c("#8200E7", "#FF00FF", "#0ABF00", "#0A6003", "#39D9E5", "#002BFF")

# Heatmap plot
plot <- ggplot(Modules_Means, aes(x = styled_genome_name, y = modules_final_label, fill = mean_stepwise_module_completeness)) + 
  geom_tile(color = "white") +
  facet_grid2(module_category ~ species_name, scales = "free", space = "free", switch = "y", labeller = label_wrap_gen(width = 31),
              #strip = strip_themed(background_x = elem_list_rect(fill = colorsSpecies))) +
              strip = strip_themed(text_x = elem_list_text(colour = colorsSpecies))) +
  labs(x = "", y = "") +
  scale_y_discrete(expand = c(0,0), position = "right") +
  scale_x_discrete(expand = c(0,0)) +
  scale_fill_viridis(direction = -1, option = "A", begin = 0.2) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_rect(colour = "black", fill = NA),
        strip.text.y.left = element_text(angle = 0, hjust = 0.5, size = 11),
        strip.text.x = element_text(size = 12, face = "bold"),
        axis.text.y = element_text(size = 11, color = "black"),
        axis.text.x = element_markdown(angle = 45, hjust = 1, size = 11, color = "black"),
        panel.spacing = unit(0, 'pt'),
        legend.position = c(1.30, -0.11), 
        legend.direction = "horizontal",
        legend.justification = c(1, 0)) +
  guides(fill = guide_colourbar(title = "Average stepwise\nmodule completeness", title.position = "top", title.hjust = 0.5, barwidth = 8))


ggsave(plot = plot, filename = "data/Anvio8/Metabolic_Analysis/Fig_Tables/NovCor_Figure3.jpg", height = 13, width = 12, device = 'jpg', dpi = 600)

plot
Delmont, T. O. and Eren, A. M. (2018). Linking pangenomes and metagenomes: the Prochlorococcus metapangenome. PeerJ 6, e4320.
Eren, A. M., Kiefl, E., Shaiber, A., Veseli, I., Miller, S. E., Schechter, M. S., Fink, I., Pan, J. N., Yousef, M., Fogarty, E. C., et al. (2020). Community-led, integrated, reproducible multi-omics with anvi’o. Nature Microbiology 6, 3–6.
Kanehisa, M. (2000). KEGG: Kyoto encyclopedia of genes and genomes. Nucleic Acids Research 28, 27–30.
Kanehisa, M., Furumichi, M., Sato, Y., Kawashima, M. and Ishiguro-Watanabe, M. (2022). KEGG for taxonomy-based analysis of pathways and genomes. Nucleic Acids Research 51, D587–D592.
Veseli, I., Chen, Y. T., Schechter, M. S., Vanni, C., Fogarty, E. C., Watson, A. R., Jabri, B., Blekhman, R., Willis, A. D., Yu, M. K., et al. (2023). Microbes with higher metabolic independence are enriched in human gut microbiomes under stress. biorxiv.
Phylogenies
References
Source Code
---
execute:
  message: FALSE
  warning: FALSE
---

# Anvio {.unnumbered}

```{r}
library(tidyverse)
library(ggh4x)
library(RColorBrewer)
library(viridis)
library(ggtext)
library(kableExtra)
```

This notebook contains analyses using anvi'o version 8 [@eren2020; @delmont2018] (development branch). For more in-depth information about anvi'o go to their official online site here: <https://anvio.org/>

## Functional annotations

We use as input (`path_i`) the anvi'o contigs database files (`.db`) that were generated in [[Methods_Prokka_Annotations]{.underline}](Methods_Prokka_Annotations.html).

::: {.content-hidden when-profile="manuscript"}
This command runs all the annotations available on anvi'o 8 in a loop, for a detailed step by step process, see below.

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"

for file in $path_i/*.db; do
    anvi-run-ncbi-cogs -T 8 --search-with diamond -c $file
    anvi-run-kegg-kofams -T 8 --log-bitscores -E 1e-15 --just-do-it -c $file
    anvi-run-pfams -T 8 -c $file
    anvi-run-cazymes -T 8 -c $file
    anvi-run-hmms -T 8 -c $file --just-do-it
    anvi-reaction-network -c $file;
done
```

### COG annotation

First we downloaded and set up the NCBI's Clusters of Orthologous Groups database[@Tatusov2000, @Galperin2021] using `anvi-setup-ncbi-cogs`. The default is the latest version, which is COG20, meaning that anvi'o will use the NCBI's 2020 release of COGs to setup the database.

The `anvi-run-ncbi-cogs` command was used to annotate the contig databases. We used DIAMOND as searching method[@buchfink2014] with the recommended "sensitive" option.

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"

for file in $path_i/*.db; do
    anvi-run-ncbi-cogs -T 8 --search-with diamond -c $file;
done
```
:::

### KEGG annotation

First we downloaded and set up the KEGG KOfam[@Kanehisa2000; @Kanehisa2023] database using `anvi-setup-kegg-data`. We used the snapshot of the KEGG database dated 2023-09-22 (hash a2b5bde358bb). It contains 479 modules with 15116 entries.

The program `anvi-run-kegg-kofams` annotates genes in a given anvi'o contigs database with KEGG Orthology (KO) numbers via hits to the KEGG KOfam database. We used `-E` lower than 1e-15.

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"

for file in $path_i/*.db; do
    anvi-run-kegg-kofams -T 8 --log-bitscores -E 1e-15 --just-do-it -c $file;
done
```

The option `--log-bitscores` records the bit scores values. They get saved to the main folder for the repo, so we moved them to a new subfolder

```{bash, eval=FALSE}

path_i <- "."
path_o <- "data/Anvio8/KEGG_bitscores"
dir.create(file.path("data/Anvio8/KEGG_bitscores"), recursive = TRUE, showWarnings = FALSE)
files_to_move <- list.files(path_i, pattern = "_bitscores.txt$", full.names = TRUE)

for (file in files_to_move) {
  new_path <- file.path(path_o, basename(file))
  print(new_path)
  file.rename(file, new_path)
}
```

::: {.content-hidden when-profile="manuscript"}
### PFAM annotation

First you need to download and set up the PFAM database[@Bateman2000; @Mistry2021] using `anvi-setup-pfams`: this is something you will do only once. We used Pfam version 36.0 (2023-07).

The program `anvi-run-kegg-pfam` annotates genes in a given anvi'o contigs database with the PFAM database.

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"

for file in $path_i/*.db; do
    anvi-run-pfams -T 8 -c $file;
done
```

### CAZyme Annotation

First you need to download and set up the CAZyme database [@drula2021] using `anvi-setup-cazymes`: this is something you will do only once. We used Pfam version 11.

The program `anvi-run-cazymes` annotates genes in your contigs-db with the dbCAN CAZyme database.

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"

for file in $path_i/*.db; do
    anvi-run-cazymes -T 8 -c $file;
done
```

### Getting HMMs

When you run the `anvi-run-hmms` command the occurrence of bacterial single-copy genes across your contigs is added to your contigs database [@Eddy2011].

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"

for file in $path_i/*.db; do
    anvi-run-hmms -T 8 -c $file --just-do-it;
done
```

### Getting Metabolic Networks

The program `anvi-reaction-network` is used to store a metabolic reaction-network in each contigs-db. The network consists of data on biochemical reactions predicted to be encoded by each genome, referencing the KEGG Orthology (KO) and ModelSEED Biochemistry databases.

```{bash, eval=FALSE}
#conda activate anvio-dev

for file in $path_i/*.db; do
    anvi-reaction-network -c $file;
done
```

### Annotation Summaries

This generates basic stats on the annotations:

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"
path_o="data/Anvio8/Annotation_summaries"
mkdir -p "$path_o"

for file in $path_i/*.db; do
    FILENAME=`basename ${file%.*}`
    anvi-display-contigs-stats $file --report-as-text -o $path_o/stats_$FILENAME.txt;
done
```

This provides basic info on the contigs databases:

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Contigs_db"
path_o="data/Anvio8/Annotation_summaries"

for file in $path_i/*.db; do
    anvi-db-info $file &>> $path_o/anvi-db-info_log.txt;
done
```
:::

## Metabolic Analysis

### Estimation of KEGG module completeness

Using `anvi-estimate-metabolism` we predicted KEGG module completeness as described in [@veseli2023] for the the organisms listed in `NovCor_MetabolicAnalysis_GenomeList_v01.txt`. This list includes the 30 *Corynebacterium* genomes selected for metabolic analysis.

```{bash, eval=FALSE}
#conda activate anvio-dev

txt_file="data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v03.txt"
prefix=NovCor
path_o="data/Anvio8/Metabolic_Analysis"
mkdir -p "$path_o"

anvi-estimate-metabolism -e $txt_file \
                         -O $path_o/$prefix --add-copy-number --output-modes modules
```

The generated `NovCor_modules.txt` file is included as **Table S3** in this publication.

#### Code for Table 1

```{r}
# Read modules data and make appropriate labeling columns
Modules <- read_delim("data/Anvio8/Metabolic_Analysis/NovCor_modules.txt", show_col_types = FALSE, col_types = cols(unique_enzymes_hit_counts = col_character(), gene_caller_ids_in_module = col_character())) %>% 
  mutate(module_class_short = paste0("(", substr(module_class, 1, 1), ")")) %>%
  unite(modules_label, c(module_class_short, module_category, module_subcategory), sep = "_", remove = FALSE) %>% 
  mutate_at(c("genome_name", "module", "module_name", "module_category", "module_subcategory", "modules_label"), factor)

# Summary tally table of fully stepwise complete modules by species. Filter to modules present in at least 25 genomes
ModulesTable <- Modules %>%
  filter(stepwise_module_completeness == 1) %>%
  group_by(module_class, module_category, module_subcategory, module, module_name) %>%
  tally() %>%
  ungroup() %>%
  select(module, module_name, n, -module_class) %>%  
  arrange(desc(n)) %>%
  filter(n > 25)

write_csv(ModulesTable, "data/Anvio8/Metabolic_Analysis/Fig_Tables/NovCor_Table1.csv")
kable(ModulesTable)
```

#### Code for Figure 3

```{r}
StrainOrder <- read_csv("data/genome_lists/NovCor_MetabolicAnalysis_PlotOrder.txt")

# When using stepwise_module_completeness some modules can be = 0 for all genomes but still have pathwise_module_completeness > 0. We are removing those modules.
Modules_cleaned <- Modules %>%
  group_by(module) %>%
  filter(!all(stepwise_module_completeness == 0)) %>%
  ungroup() %>%
  mutate(across(where(is.factor), droplevels))

all_zero_modules <- setdiff(Modules$module, Modules_cleaned$module)
all_zero_subcategory <- setdiff(Modules$module_subcategory, Modules_cleaned$module_subcategory)

# Calculating mean_stepwise_module_completeness and preparing plot labels 
Modules_Means = Modules_cleaned %>% 
  group_by(modules_label, genome_name, .drop = FALSE) %>%
  summarise(mean_stepwise_module_completeness = mean(stepwise_module_completeness)) %>%
  mutate(mean_stepwise_module_completeness = coalesce(mean_stepwise_module_completeness, 0)) %>%
  mutate(species_name = str_extract(genome_name, "^[^_]+")) %>%
  mutate(module_category = str_extract(modules_label, "^[^_]+_[^_]+")) %>%
  mutate(module_category = str_replace_all(module_category, "_", " ")) %>%
  mutate(modules_final_label = str_replace(modules_label, "^[^_]+_[^_]+_", "")) %>%
  mutate_at(c("module_category", "modules_final_label"), factor) 

Modules_Means <- left_join(Modules_Means, StrainOrder, by = "genome_name")

Modules_Means$species_name <- sub("Cke", "Cna", Modules_Means$species_name)
Modules_Means$species_name <- sub("Cau", "Cmq", Modules_Means$species_name)
Modules_Means$species_name <- factor(Modules_Means$species_name, levels = c("Cna","Ccu","Cha","Cyo","Cmq","Ctu"))
Modules_Means$genome_name <- factor(Modules_Means$genome_name, levels = StrainOrder$genome_name)
Modules_Means$styled_genome_name <- factor(Modules_Means$styled_genome_name, levels = rev(StrainOrder$styled_genome_name))
Modules_Means$modules_final_label  <- factor(Modules_Means$modules_final_label, rev(levels(Modules_Means$modules_final_label)))
```

```{r}
#| fig-width: 12
#| fig-height: 13

# Colors for plot
colorsSpecies <- c("#8200E7", "#FF00FF", "#0ABF00", "#0A6003", "#39D9E5", "#002BFF")

# Heatmap plot
plot <- ggplot(Modules_Means, aes(x = styled_genome_name, y = modules_final_label, fill = mean_stepwise_module_completeness)) + 
  geom_tile(color = "white") +
  facet_grid2(module_category ~ species_name, scales = "free", space = "free", switch = "y", labeller = label_wrap_gen(width = 31),
              #strip = strip_themed(background_x = elem_list_rect(fill = colorsSpecies))) +
              strip = strip_themed(text_x = elem_list_text(colour = colorsSpecies))) +
  labs(x = "", y = "") +
  scale_y_discrete(expand = c(0,0), position = "right") +
  scale_x_discrete(expand = c(0,0)) +
  scale_fill_viridis(direction = -1, option = "A", begin = 0.2) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_rect(colour = "black", fill = NA),
        strip.text.y.left = element_text(angle = 0, hjust = 0.5, size = 11),
        strip.text.x = element_text(size = 12, face = "bold"),
        axis.text.y = element_text(size = 11, color = "black"),
        axis.text.x = element_markdown(angle = 45, hjust = 1, size = 11, color = "black"),
        panel.spacing = unit(0, 'pt'),
        legend.position = c(1.30, -0.11), 
        legend.direction = "horizontal",
        legend.justification = c(1, 0)) +
  guides(fill = guide_colourbar(title = "Average stepwise\nmodule completeness", title.position = "top", title.hjust = 0.5, barwidth = 8))


ggsave(plot = plot, filename = "data/Anvio8/Metabolic_Analysis/Fig_Tables/NovCor_Figure3.jpg", height = 13, width = 12, device = 'jpg', dpi = 600)

plot
```


::: {.content-hidden when-profile="manuscript"}
### Module Enrichment Analysis

The `NovCor_modules.txt` file generated in the previous step is used by `anvi-compute-metabolic-enrichment` to calculate KEGG module enrichment[@shaiber2020] across different groups of genomes. We used the `-G` flag to indicate the grouping options defined in file `NovCor_MetabolicAnalysis_Groups_CtuComplex.txt`that includes the 30 *Corynebacterium* genomes grouped by their final species assignment.

```{bash, eval=FALSE}
#conda activate anvio-dev

csv_file_modules="data/Anvio8/Metabolic_Analysis/NovCor_modules.txt"
path_o="data/Anvio8/Metabolic_Analysis/Enrichment_Modules"
mkdir -p "$path_o"

txt_file_groups="data/genome_lists/NovCor_MetabolicAnalysis_Groups_CtuComplex_v03.txt"
output_file="$path_o/NovCor_enriched_modules_stepwise_1_CtuComplex.txt"
anvi-compute-metabolic-enrichment -M "$csv_file_modules" \
                                  -G "$txt_file_groups" \
                                  -o "$output_file" \
                                  --use-stepwise-completeness \
                                  --module-completion-threshold 1.0

```

### KO Enrichment Analysis

We use `anvi-display-functions`. It runs the enrichment for the desired annotations (in this case KOfam). It saves the output to a temp file that can be recovered, renamed and saved into the same `path_o` as the database file.

```{bash, eval=FALSE}
#conda activate anvio-dev

txt_file="data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v01.txt"
path_o="data/Anvio8/Metabolic_Analysis/Enrichment_KOs"
mkdir -p "$path_o"

txt_file_groups="data/genome_lists/NovCor_MetabolicAnalysis_Groups_CtuComplex.txt"
anvi-display-functions -e $txt_file \
                       --groups-txt $txt_file_groups \
                       --annotation-source KOfam \
                       --profile-db $path_o/KOfam_CtuComplex.db

txt_file_groups="data/genome_lists/NovCor_MetabolicAnalysis_Groups_References.txt"
anvi-display-functions -e $txt_file \
                       --groups-txt $txt_file_groups \
                       --annotation-source KOfam \
                       --profile-db $path_o/KOfam_References.db  
```

Many of the enriched KOs belong to the already identified enriched modules. This function filters out those KOs that are part of the definition of modules listed in the enriched modules identified with `anvi-compute-metabolic-enrichment`:

```{bash, eval=FALSE}
get_KOs_not_in_enrich_modules <- function(modules, hits, enrich_modules, enrich_KOs, q_value_threshold) {

  # Get list of KOs that correspond with the modules on the enrichment file
  modules_subset <- modules %>% filter(module %in% enrich_modules$accession)
  module_definition_subset <- paste(modules_subset$module_definition, collapse = " ")
  module_definition_subset <- str_extract_all(module_definition_subset, "\\bK\\d+\\b")
  module_definition_subset <- unique(unlist(module_definition_subset))

  # Add a column to enrich_KOs indicating if the KO is the previous list
  enrich_KOs$inModule <- sapply(strsplit(as.character(enrich_KOs$accession), ","), function(x) all((x %in% module_definition_subset)))

  # Filter Enrich_KO based on inModule and q_value_threshold
  enrich_KOs_filtered <- enrich_KOs %>% 
    filter(inModule == FALSE) %>% 
    filter(adjusted_q_value < q_value_threshold)

  return(enrich_KOs_filtered)
}

Modules <- read_delim("data/Anvio8/Metabolic_Analysis/NovCor_modules.txt", show_col_types = FALSE, col_types = cols(unique_enzymes_hit_counts = col_character(), gene_caller_ids_in_module = col_character()))
KOHits <- read_delim("data/Anvio8/Metabolic_Analysis/NovCor_hits.txt", show_col_types = FALSE)

Enrich_Mod_CtuComplex <- read_delim("data/Anvio8/Metabolic_Analysis/Enrichment_Modules/NovCor_enriched_modules_stepwise_1_CtuComplex.txt", show_col_types = FALSE)
Enrich_KO_CtuComplex <- read_delim("data/Anvio8/Metabolic_Analysis/Enrichment_KOs/NovCor_enriched_KOs_CtuComplex.txt", show_col_types = FALSE)
Enrich_KO_CtuComplex_Filtered <- get_KOs_not_in_enrich_modules(Modules, KOHits, Enrich_Mod_CtuComplex, Enrich_KO_CtuComplex, 0.1)
write_csv(Enrich_KO_CtuComplex_Filtered, "data/Anvio8/Metabolic_Analysis/Enrichment_KOs/NovCor_enriched_KOs_CtuComplex_Filtered.csv")

Enrich_Mod_References <- read_delim("data/Anvio8/Metabolic_Analysis/Enrichment_Modules/NovCor_enriched_modules_stepwise_1_References.txt", show_col_types = FALSE)
Enrich_KO_References <- read_delim("data/Anvio8/Metabolic_Analysis/Enrichment_KOs/NovCor_enriched_KOs_References.txt", show_col_types = FALSE)
Enrich_KO_References_Filtered <- get_KOs_not_in_enrich_modules(Modules, KOHits, Enrich_Mod_References, Enrich_KO_References, 0.1)
write_csv(Enrich_KO_References_Filtered, "data/Anvio8/Metabolic_Analysis/Enrichment_KOs/NovCor_enriched_KOs_References_Filtered.csv")
```

### Pangenomic level analysis

This analysis using `NovCor_MetabolicAnalysis_GenomeList_v02.txt` includes the 30 *Corynebacterium* genomes selected for metabolic analysis.

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Pangenomic_Analysis"
mkdir -p "$path_i"

anvi-gen-genomes-storage -e "data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v02.txt" -o $path_i/NovCor-ANI-GENOMES.db --gene-caller Prodigal

anvi-db-info $path_i/NovCor-ANI-GENOMES.db

anvi-pan-genome -g $path_i/NovCor-ANI-GENOMES.db \
                -o $path_i/ANI -n PAN_ANI \
                --use-ncbi-blast --mcl-inflation 10  --num-threads 8

#Create a bin collection named ANI with all Core GCs for the species using the visual interface:       

anvi-display-pan -p $path_i/ANI/PAN_ANI-PAN.db -g $path_i/NovCor-ANI-GENOMES.db         

anvi-summarize -p $path_i/ANI/PAN_ANI-PAN.db \
               -g $path_i/NovCor-ANI-GENOMES.db \
               -o $path_i/ANI/Summaries \
               -C ANI
```

## ANI

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/Anvio8/Contigs_dbv23" #Contigs_dbv22 has the original files
  
anvi-migrate $path_i/*.db --migrate-quickly 
```

Anvi'o uses 'PyANI' [@pritchard2016]

```{bash, eval=FALSE}
#conda activate anvio-dev

path_i="data/genome_lists/NovCor_MetabolicAnalysis_GenomeList_v02.txt"
path_o="data/Anvio8/ANI"
  
anvi-compute-genome-similarity -e $path_i \
                               -o $path_o \
                               --program pyANI \
                               --method ANIb
```

### Code for ANI figure

```{r}
# Define the path of the input file
input = "./data/Anvio8/ANI/ANIb_percentage_identity.txt"
ANI_data <- read.table(input, header = TRUE, sep = "\t")

# Read the tab-separated data from the file into a data frame called ANI_data
ANI_data.long <- ANI_data %>%
# Convert the data from wide format to long format
  pivot_longer(cols = -key, names_to = "Genome2", values_to = "PercentIdentity") %>%
# Rename the 'key' column to 'Genome1'
  rename(Genome1 = key) %>%
# Filter out rows where Genome1 or Genome2 is 'Cma_CCUG_32361'
  filter(Genome1 != "Cma_CCUG_32361" , Genome2 != "Cma_CCUG_32361") %>%
# Filter out rows where Genome1 or Genome2 is 'Cac_ATCC_49725'
  filter(Genome1 != "Cac_ATCC_49725" , Genome2 != "Cac_ATCC_49725")

# Read the strain order from a CSV file
StrainOrder <- read_csv("data/genome_lists/NovCor_pyANI_PlotOrder.txt")
# Join the ANI_data.long and StrainOrder data frames based on Genome1
ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = join_by(Genome1 == Genome)) %>% 
# Rename the 'styled_Genome' column to 'styled_Genome1'
  rename(styled_Genome1 = styled_Genome) 
# Convert the 'styled_Genome1' column to a factor and specify the order of the levels
ANI_data.long$styled_Genome1 <- factor(ANI_data.long$styled_Genome1, levels = rev(StrainOrder$styled_Genome))

# Join the ANI_data.long and StrainOrder data frames based on Genome1
ANI_data.long <- left_join(ANI_data.long, StrainOrder, by = join_by(Genome2 == Genome)) %>% 
# Rename the 'styled_Genome' column to 'styled_Genome2'
  rename(styled_Genome2 = styled_Genome) 
# Convert the 'styled_Genome2' column to a factor and specify the order of the levels
ANI_data.long$styled_Genome2 <- factor(ANI_data.long$styled_Genome2, levels = rev(StrainOrder$styled_Genome))
```

```{r}
# Create a ggplot object with styled_Genome1 on the x-axis, styled_Genome2 on the y-axis, and fill color representing PercentIdentity
pANI <- ggplot(ANI_data.long, aes(x = styled_Genome1, y = styled_Genome2, fill = PercentIdentity)) + 
# Add a tile layer to the plot with specified width and height
  geom_tile(width = 1.0, height = 1.0) +
# Add text to each tile showing the rounded PercentIdentity value
  geom_text(aes(label = sprintf("%.1f", round(PercentIdentity * 100, 1))), size = 3, colour = "black") +
# Use the viridis color palette for the fill color and scale the labels to percent
  scale_fill_viridis(option = "A", begin = 0.4, labels = scales::percent) + 
# Set the position of the x-axis labels to the bottom
  scale_x_discrete(position = "bottom") +
# Set the position of the y-axis labels to the right
  scale_y_discrete(position = "right") +
# Set the plot title and remove the x and y axis labels
  labs(x = NULL, y = NULL, title = "Whole-genome Average Nucleotide Identity") +
# Use a white background for the plot
  theme_bw() +
# Customize the plot theme
  theme(axis.text.x = element_markdown(angle = 45, vjust = 1, hjust = 1, color = "black"),
        axis.text.y = element_markdown(angle = 25, color = "black"),
        panel.border = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        axis.ticks = element_blank(),
        plot.margin = margin(1, 0, 0, 1.5, "cm"),
        legend.position = c(1.1, -0.15), 
        legend.direction = "horizontal",
        legend.justification = c(1, 0)) +
# Add a colorbar legend for the fill color with a specified title and position
  guides(fill = guide_colourbar(title = "% identity", title.position = "top", title.hjust = 0.5, barwidth = 6))
```

```{r}
# Function to add colored rectangles to the plot
add_rectangles_to_heatmap <- function(plot, levels, top, bottom, color) {
  top_pos <- which(levels == top)
  bottom_pos <- which(levels == bottom)
  
  p_with_rect <- plot +
    geom_rect(
      aes(xmin = top_pos + 0.5, 
          xmax = bottom_pos - 0.5,
          ymin = top_pos + 0.5, 
          ymax = bottom_pos - 0.5),
      fill = NA,
      color = color,
      size = 1
    )
  
  return(p_with_rect)
}
```

```{r}
# Apply the function to the ANI plot
genome_levels <- levels(ANI_data.long$styled_Genome1)
pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Cna_KPL4072", "Cna_MSK071", "blue")
pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Cyo_KPL2619", "Cyo_MSK136", "red")
pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Ccu_CTNIH19", "<b>Ccu_c8Ua_181</b>", "green")
pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "<b>Cha_CTNIH22</b>", "Cha_CTNIH23", "purple")
pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Ctu_KPL3807", "Ctu_SK141", "orange")
pANI <- add_rectangles_to_heatmap(pANI, genome_levels, "Cmq_MSK144", "Cau_620_CAUR", "yellow")
pANI
# Save the plot as PNG
ggsave("data/Anvio8/ANI/NovCor_pyani_anvio.png", plot = pANI, width = 12, height = 9, dpi = 300)
```
:::