Supplemental Methods: COGs Functional Analysis for Cac

1 Data Import

This Notebook uses the anvi-summarize output files saved in the “analysis_Anvio7” folder. To run this Notebook automatically for each of the four Corynebacterium species, source the script run_COGS.R.

We select the most relevant variables for the functional analysis:

# Drop the homogeneity indices and the amino-acid sequence column; every
# other column of the pangenome table is kept.
Pangenome <- select(
  Pangenome,
  -c(
    functional_homogeneity_index,
    geometric_homogeneity_index,
    combined_homogeneity_index,
    aa_sequence
  )
)

In the new variable “bins_PPanGGOLiN” we define “Persistent” and “Accessory” (the latter as “shell” + “cloud”):

# Two-level partition label: any PPanGGOLiN value containing "persistent"
# becomes "Persistent"; everything else is pooled as "Accessory".
Pangenome <- mutate(
  Pangenome,
  bins_PPanGGOLiN = ifelse(
    grepl("persistent", PPanGGOLiN),
    "Persistent",
    "Accessory"
  )
)

Number and percentage of Persistent vs Accessory:

# Number of distinct gene clusters in each partition.
# which() discards NA comparisons, as filter() does.
vPersistent <- n_distinct(
  Pangenome$gene_cluster_id[which(Pangenome$bins_PPanGGOLiN == "Persistent")]
)
vAccesory <- n_distinct(
  Pangenome$gene_cluster_id[which(Pangenome$bins_PPanGGOLiN == "Accessory")]
)

# Share of each partition among all gene clusters, as a percentage with one
# decimal place.
vPersistent.p <- round(100 * vPersistent / (vPersistent + vAccesory), 1)
vAccesory.p <- round(100 * vAccesory / (vPersistent + vAccesory), 1)

There are 1416 gene clusters (GC) (41.3%) in the “Accessory” vs. 2011 (58.7%) in the “Persistent” at the pangenome level

2 COG Analysis at the Gene Level

We define a new variable COGs to use in the plots. This variable is based on COG20_CATEGORY but with a cleaner definition of unclassified, uninformative, or mixed assignments:

  • COG categories “Function unknown” and “General function prediction only” were considered as “Uninformative”.
  • If the COG category is mixed (e.g., G|S|M), it gets labeled as “Ambiguous”.
  • Missing values (NA) are labeled as “Unclassified”.
# Build the plotting variable COGs from the COG20 category accessions.
# Coerce to character so the label assignments below also work if the
# accession column was imported as a factor.
Pangenome$COGs <- as.character(Pangenome$COG20_CATEGORY_ACC)

# "S" (Function unknown) and "R" (General function prediction only) are
# pooled as "Uninformative". %in% never returns NA, so missing values are
# left untouched here and handled in the last step.
Pangenome$COGs[Pangenome$COGs %in% c("S", "R")] <- "Uninformative"

# Any assignment holding several categories separated by "|" (e.g. G|S|M)
# is "Ambiguous"; fixed = TRUE matches the pipe literally, not as regex OR.
Pangenome$COGs[grepl("|", Pangenome$COGs, fixed = TRUE)] <- "Ambiguous"

# Genes without a COG assignment are "Unclassified". Both encodings of
# "missing" are covered: NA (as described in the text) and the empty string.
Pangenome$COGs[is.na(Pangenome$COGs) | Pangenome$COGs == ""] <- "Unclassified"

Summary of COG annotated genes:

Genes Count Percentage
Total in Pangenome 78261 100.0
COG Category Uninformative = Function Unknown 2533 3.2
COG Category Uninformative = General function prediction only 3090 3.9
COG Category Ambiguous (Mixed COG Category) 7665 9.8
COG Category Unclassified (Non-assigned) 16280 20.8
Informative COGs (Total - Uninformative, Ambiguous & Unclassified) 48693 62.2

3 COG Analysis at the Gene Cluster Level

This analysis was done at the pangenomic gene cluster level (GC). Since many gene clusters had mixed COG category assignments a solution is to assign each individual gene call to their corresponding Genome/bins_PPanGGOLiN/COG grouping weighting their contribution by dividing their count by the number of genes in their GC.

3.1 GCs by COG Category and Genome

The table “GCsbyCOG_Genome” groups the genes by genome, then within each genome by “Accessory” vs. “Persistent” status, and within each status by COG category. Instead of counting the genes in each group, we calculate the sum of 1/num_genes_in_gene_cluster, so that each gene cluster contributes a total of 1 across all groups.

# COGs must be a factor so that .drop = FALSE below keeps every COG level in
# every genome/partition combination, including combinations with no genes.
Pangenome[["COGs"]] <- as.factor(Pangenome[["COGs"]])

# Each gene contributes 1 / (size of its gene cluster), so the contributions
# of one gene cluster add up to 1 across all groups.
GCsbyCOG_Genome <- summarise(
  group_by(Pangenome, genome_name, bins_PPanGGOLiN, COGs, .drop = FALSE),
  num_corrected_genes = sum(1 / num_genes_in_gene_cluster)
)

The total sum of all values in the num_corrected_genes variable should add up to the number of GCs:

# Sanity check: the weighted gene counts must total the number of distinct
# gene clusters in the pangenome.
sum(GCsbyCOG_Genome[["num_corrected_genes"]])
## [1] 3427
n_distinct(Pangenome[["gene_cluster_id"]])
## [1] 3427

Adding extra column to label the gray scale portion of the plots:

# Assignment keeps the three non-informative labels as they are and
# collapses every other COG category into "Informative".
GCsbyCOG_Genome <- mutate(
  GCsbyCOG_Genome,
  Assignment = ifelse(
    COGs == "Uninformative" | COGs == "Ambiguous" | COGs == "Unclassified",
    as.character(COGs),
    "Informative"
  )
)

3.1.1 Summary of COG annotated GCs in the Accessory vs. Persistent

# Weighted gene-cluster counts per partition and assignment class.
TableGC <- summarize(
  group_by(GCsbyCOG_Genome, bins_PPanGGOLiN, Assignment),
  corrected_genes = sum(num_corrected_genes)
)

# Percentages are relative to the whole table (both partitions together).
TableGC[["Percentages"]] <- round(
  100 * TableGC[["corrected_genes"]] / sum(TableGC[["corrected_genes"]]),
  1
)

kable(TableGC)
bins_PPanGGOLiN Assignment corrected_genes Percentages
Accessory Ambiguous 67.06670 2.0
Accessory Informative 363.50120 10.6
Accessory Unclassified 957.07039 27.9
Accessory Uninformative 28.36171 0.8
Persistent Ambiguous 203.39698 5.9
Persistent Informative 1300.33953 37.9
Persistent Unclassified 350.60064 10.2
Persistent Uninformative 156.66285 4.6

3.1.2 Summary of COG annotated GCs in the Accessory

# Same summary restricted to the accessory partition.
TableGCAccessory <- summarize(
  group_by(
    filter(GCsbyCOG_Genome, bins_PPanGGOLiN == "Accessory"),
    bins_PPanGGOLiN, Assignment
  ),
  corrected_genes = sum(num_corrected_genes)
)

# Percentages are relative to the accessory total only.
TableGCAccessory[["Percentages"]] <- round(
  100 * TableGCAccessory[["corrected_genes"]] /
    sum(TableGCAccessory[["corrected_genes"]]),
  1
)

kable(TableGCAccessory)
bins_PPanGGOLiN Assignment corrected_genes Percentages
Accessory Ambiguous 67.06670 4.7
Accessory Informative 363.50120 25.7
Accessory Unclassified 957.07039 67.6
Accessory Uninformative 28.36171 2.0

3.1.3 Summary of COG annotated GCs in the Persistent

# Same summary restricted to the persistent partition.
TableGCPersistent <- summarize(
  group_by(
    filter(GCsbyCOG_Genome, bins_PPanGGOLiN == "Persistent"),
    bins_PPanGGOLiN, Assignment
  ),
  corrected_genes = sum(num_corrected_genes)
)

# Percentages are relative to the persistent total only.
TableGCPersistent[["Percentages"]] <- round(
  100 * TableGCPersistent[["corrected_genes"]] /
    sum(TableGCPersistent[["corrected_genes"]]),
  1
)

kable(TableGCPersistent)
bins_PPanGGOLiN Assignment corrected_genes Percentages
Persistent Ambiguous 203.3970 10.1
Persistent Informative 1300.3395 64.7
Persistent Unclassified 350.6006 17.4
Persistent Uninformative 156.6629 7.8

3.1.4 Summary of COG annotated GCs by Genome in the Accessory vs. Persistent

# Weighted gene-cluster counts per genome, split by partition.
TableGenomes <- summarize(
  group_by(GCsbyCOG_Genome, genome_name, bins_PPanGGOLiN),
  corrected_genes = sum(num_corrected_genes)
)

kable(TableGenomes)
genome_name bins_PPanGGOLiN corrected_genes
Cac_ATCC_49725 Accessory 29.60302
Cac_ATCC_49725 Persistent 58.99323
Cac_ATCC_49726 Accessory 139.24175
Cac_ATCC_49726 Persistent 58.56769
Cac_BWA070 Accessory 18.22612
Cac_BWA070 Persistent 59.19701
Cac_BWA096 Accessory 16.20340
Cac_BWA096 Persistent 59.21302
Cac_BWA109 Accessory 33.66284
Cac_BWA109 Persistent 59.15409
Cac_BWA121 Accessory 55.39508
Cac_BWA121 Persistent 58.80654
Cac_BWA161 Accessory 40.24589
Cac_BWA161 Persistent 59.02810
Cac_BWA270 Accessory 55.45114
Cac_BWA270 Persistent 58.99514
Cac_BWA273 Accessory 24.54642
Cac_BWA273 Persistent 59.06474
Cac_DU075 Accessory 74.21173
Cac_DU075 Persistent 58.98644
Cac_DU076 Accessory 46.45043
Cac_DU076 Persistent 59.12481
Cac_DU079 Accessory 22.62237
Cac_DU079 Persistent 59.45713
Cac_KPL_1818 Accessory 72.05661
Cac_KPL_1818 Persistent 59.37915
Cac_KPL_1824 Accessory 51.05168
Cac_KPL_1824 Persistent 59.67452
Cac_KPL_1996 Accessory 45.36679
Cac_KPL_1996 Persistent 59.30254
Cac_KPL_2617 Accessory 93.15755
Cac_KPL_2617 Persistent 59.28030
Cac_KPL_2618 Accessory 47.48306
Cac_KPL_2618 Persistent 58.98104
Cac_KPL_2641 Accessory 23.35536
Cac_KPL_2641 Persistent 59.29569
Cac_KPL_2652 Accessory 48.89516
Cac_KPL_2652 Persistent 58.98813
Cac_KPL_2660 Accessory 37.20511
Cac_KPL_2660 Persistent 59.09316
Cac_KPL_2783 Accessory 35.34869
Cac_KPL_2783 Persistent 59.16703
Cac_KPL_2859 Accessory 27.41650
Cac_KPL_2859 Persistent 58.91783
Cac_KPL_3647 Accessory 45.41046
Cac_KPL_3647 Persistent 59.14408
Cac_KPL_3674 Accessory 21.53857
Cac_KPL_3674 Persistent 59.21365
Cac_KPL_3703 Accessory 26.81848
Cac_KPL_3703 Persistent 58.70233
Cac_KPL_3774 Accessory 28.92555
Cac_KPL_3774 Persistent 59.55393
Cac_KPL_3802 Accessory 69.96062
Cac_KPL_3802 Persistent 59.36714
Cac_KPL_3832 Accessory 23.37103
Cac_KPL_3832 Persistent 58.93962
Cac_KPL_3921 Accessory 24.87543
Cac_KPL_3921 Persistent 59.39748
Cac_KPL_3926 Accessory 53.36095
Cac_KPL_3926 Persistent 59.22556
Cac_KPL_3970 Accessory 15.45448
Cac_KPL_3970 Persistent 59.14831
Cac_KPL_4034 Accessory 23.72253
Cac_KPL_4034 Persistent 59.23188
Cac_KPL_4065 Accessory 27.84234
Cac_KPL_4065 Persistent 59.27321
Cac_KPL_4075 Accessory 17.52288
Cac_KPL_4075 Persistent 59.13549

3.1.5 Renaming and ordering variables factor levels for plotting:

# Put "Persistent" before "Accessory" so it is drawn first on the x axis.
GCsbyCOG_Genome[["bins_PPanGGOLiN"]] <- factor(
  GCsbyCOG_Genome[["bins_PPanGGOLiN"]],
  levels = c("Persistent", "Accessory")
)

# Replace the one-letter COG codes with their descriptive names. The order
# of the arguments defines the order of the resulting ordered factor.
GCsbyCOG_Genome[["COGs"]] <- recode_factor(
  GCsbyCOG_Genome[["COGs"]],
  "Q" = "Secondary metabolites biosynthesis, transport, and catabolism",
  "P" = "Inorganic ion transport and metabolism",
  "I" = "Lipid transport and metabolism",
  "H" = "Coenzyme transport and metabolism",
  "G" = "Carbohydrate transport and metabolism",
  "F" = "Nucleotide transport and metabolism",
  "E" = "Amino acid transport and metabolism",
  "C" = "Energy production and conversion",
  "X" = "Mobilome: prophages, transposons",
  "L" = "Replication, recombination and repair",
  "K" = "Transcription",
  "J" = "Translation, ribosomal structure and biogenesis",
  "V" = "Defense mechanisms",
  "U" = "Intracellular trafficking, secretion, and vesicular transport",
  "T" = "Signal transduction mechanisms",
  "O" = "Post-translational modification, protein turnover, and chaperones",
  "N" = "Cell Motility",
  "M" = "Cell wall/membrane/envelope biogenesis",
  "D" = "Cell cycle control, cell division, chromosome partitioning",
  "A" = "RNA processing and modification",
  "W" = "Extracellular structures",
  "Uninformative" = "Uninformative",
  "Ambiguous" = "Ambiguous",
  "Unclassified" = "Unclassified",
  .ordered = TRUE
)

# "Informative" is relabelled as a single blank so that it carries no
# visible label; the other three classes keep their names.
GCsbyCOG_Genome[["Assignment"]] <- recode_factor(
  GCsbyCOG_Genome[["Assignment"]],
  "Informative" = " ",
  "Uninformative" = "Uninformative",
  "Ambiguous" = "Ambiguous",
  "Unclassified" = "Unclassified",
  .ordered = TRUE
)

3.2 GCs by COG Category

The table “GCsbyCOG” groups the genes by “Accessory” vs. “Persistent” status and, within each status, by COG category.

# Weighted gene-cluster counts per partition and COG category, pooled over
# all genomes. Each gene contributes 1 / (size of its gene cluster).
GCsbyCOG <- summarise(
  group_by(Pangenome, bins_PPanGGOLiN, COGs),
  num_corrected_genes = sum(1 / num_genes_in_gene_cluster)
)

3.2.1 Renaming and ordering variables factor levels for plotting:

# Replace the one-letter COG codes with their descriptive names. The named
# vector is spliced into recode_factor() with !!!; its element order defines
# the order of the resulting ordered factor.
GCsbyCOG[["COGs"]] <- recode_factor(
  GCsbyCOG[["COGs"]],
  !!!c(
    Q = "Secondary metabolites biosynthesis, transport, and catabolism",
    P = "Inorganic ion transport and metabolism",
    I = "Lipid transport and metabolism",
    H = "Coenzyme transport and metabolism",
    G = "Carbohydrate transport and metabolism",
    F = "Nucleotide transport and metabolism",
    E = "Amino acid transport and metabolism",
    C = "Energy production and conversion",
    X = "Mobilome: prophages, transposons",
    L = "Replication, recombination and repair",
    K = "Transcription",
    J = "Translation, ribosomal structure and biogenesis",
    V = "Defense mechanisms",
    U = "Intracellular trafficking, secretion, and vesicular transport",
    T = "Signal transduction mechanisms",
    O = "Post-translational modification, protein turnover, and chaperones",
    N = "Cell Motility",
    M = "Cell wall/membrane/envelope biogenesis",
    D = "Cell cycle control, cell division, chromosome partitioning",
    A = "RNA processing and modification",
    W = "Extracellular structures",
    Uninformative = "Uninformative",
    Ambiguous = "Ambiguous",
    Unclassified = "Unclassified"
  ),
  .ordered = TRUE
)

3.2.2 Summary of COG annotated GCs by COG Category:

New table “GCsbyCOG_PervsAcc” in wide format. The % of each category relative to the “Accessory” or “Persistent” total was calculated (pTotal. variables). The total GCs for each COG category and the % of GCs in the “Accessory” and “Persistent” relative to each category (p. variables) were calculated as well. The ratio between the number of GCs in the “Accessory” vs. the “Persistent” is calculated for each COG:

# Wide table with one row per COG category. Built inside local() so the
# working copy does not leak into the global environment.
GCsbyCOG_PervsAcc <- local({
  # One column per partition; categories absent from a partition get 0.
  wide <- spread(GCsbyCOG, bins_PPanGGOLiN, num_corrected_genes, fill = 0)

  # Share of each category within its own partition (column percentages).
  wide[["pTotal.Accessory"]] <- round(
    100 * wide[["Accessory"]] / sum(wide[["Accessory"]]), 1
  )
  wide[["pTotal.Persistent"]] <- round(
    100 * wide[["Persistent"]] / sum(wide[["Persistent"]]), 1
  )

  # Category totals over both partitions and their share of all GCs.
  wide[["total"]] <- wide[["Accessory"]] + wide[["Persistent"]]
  wide[["pTotal.total"]] <- round(
    100 * wide[["total"]] / sum(wide[["total"]]), 1
  )

  # Split of each category between the two partitions (row percentages).
  wide[["p.accessory"]] <- round(
    100 * (wide[["Accessory"]] / wide[["total"]]), 1
  )
  wide[["p.Persistent"]] <- round(
    100 * (wide[["Persistent"]] / wide[["total"]]), 1
  )

  # Accessory-to-persistent ratio per category.
  wide[["ratio"]] <- round(wide[["Accessory"]] / wide[["Persistent"]], 2)

  wide
})

kable(GCsbyCOG_PervsAcc)
COGs Accessory Persistent pTotal.Accessory pTotal.Persistent total pTotal.total p.accessory p.Persistent ratio
Secondary metabolites biosynthesis, transport, and catabolism 4.000000 17.588235 0.3 0.9 21.58824 0.6 18.5 81.5 0.23
Inorganic ion transport and metabolism 16.932217 103.775358 1.2 5.2 120.70757 3.5 14.0 86.0 0.16
Lipid transport and metabolism 13.000000 65.537101 0.9 3.3 78.53710 2.3 16.6 83.4 0.20
Coenzyme transport and metabolism 6.000000 99.724599 0.4 5.0 105.72460 3.1 5.7 94.3 0.06
Carbohydrate transport and metabolism 29.900000 99.616963 2.1 5.0 129.51696 3.8 23.1 76.9 0.30
Nucleotide transport and metabolism 4.944444 64.935484 0.3 3.2 69.87993 2.0 7.1 92.9 0.08
Amino acid transport and metabolism 20.000000 137.913445 1.4 6.9 157.91345 4.6 12.7 87.3 0.15
Energy production and conversion 9.633333 72.280445 0.7 3.6 81.91378 2.4 11.8 88.2 0.13
Mobilome: prophages, transposons 70.977571 8.526291 5.0 0.4 79.50386 2.3 89.3 10.7 8.32
Replication, recombination and repair 49.726496 81.792484 3.5 4.1 131.51898 3.8 37.8 62.2 0.61
Transcription 27.361050 90.596429 1.9 4.5 117.95748 3.4 23.2 76.8 0.30
Translation, ribosomal structure and biogenesis 7.000000 175.000000 0.5 8.7 182.00000 5.3 3.8 96.2 0.04
Defense mechanisms 54.127950 43.632277 3.8 2.2 97.76023 2.9 55.4 44.6 1.24
Intracellular trafficking, secretion, and vesicular transport 0.000000 14.058824 0.0 0.7 14.05882 0.4 0.0 100.0 0.00
Signal transduction mechanisms 10.650000 36.382353 0.8 1.8 47.03235 1.4 22.6 77.4 0.29
Post-translational modification, protein turnover, and chaperones 11.000000 74.473109 0.8 3.7 85.47311 2.5 12.9 87.1 0.15
Cell Motility 0.000000 1.000000 0.0 0.0 1.00000 0.0 0.0 100.0 0.00
Cell wall/membrane/envelope biogenesis 25.248140 91.319388 1.8 4.5 116.56753 3.4 21.7 78.3 0.28
Cell cycle control, cell division, chromosome partitioning 3.000000 20.186745 0.2 1.0 23.18674 0.7 12.9 87.1 0.15
RNA processing and modification 0.000000 1.000000 0.0 0.0 1.00000 0.0 0.0 100.0 0.00
Extracellular structures 0.000000 1.000000 0.0 0.0 1.00000 0.0 0.0 100.0 0.00
Uninformative 28.361709 156.662851 2.0 7.8 185.02456 5.4 15.3 84.7 0.18
Ambiguous 67.066696 203.396979 4.7 10.1 270.46368 7.9 24.8 75.2 0.33
Unclassified 957.070393 350.600641 67.6 17.4 1307.67103 38.2 73.2 26.8 2.73

4 Plots

Color Palettes

# Interpolating palette based on the 8-colour "Set1" Brewer scheme.
getPalette <- colorRampPalette(brewer.pal(8, "Set1"))
# Number of distinct COG labels, including the three non-informative ones.
CountTotalCOGs <- n_distinct(GCsbyCOG_Genome[["COGs"]])

# Colors: one per COG category after setting the three grey classes aside.
palette2 <- getPalette(CountTotalCOGs - 3)
# Colors + Grays: three greys first, followed by the same colours.
palette1 <- c("grey60", "grey40", "grey20", palette2)
# White + Grays
palette3 <- c("grey60", "grey40", "grey20", "white")

4.1 Plots Accessory vs. Persistent

Panel A in main figure:

# Stacked bars of the summed weighted gene-cluster counts per partition,
# filled by COG category (levels reversed for stacking); no legend.
pA <- GCsbyCOG_Genome %>%
  ggplot(aes(
    x = bins_PPanGGOLiN,
    y = num_corrected_genes,
    fill = fct_rev(COGs)
  )) +
  stat_summary(fun = sum, geom = "bar", position = "stack") +
  scale_fill_manual(values = palette1) +
  scale_x_discrete(labels = c("Persistent", "Accessory")) +
  scale_y_continuous(breaks = seq(0, 2250, by = 250), expand = c(0, 0)) +
  labs(x = " ", y = "Number of Gene Clusters", fill = "COG Categories") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 9),
    axis.text = element_text(size = 7),
    legend.position = "none",
    plot.margin = unit(c(10, 0, 10, 20), "pt")
  )
pA

4.2 Plots by Genome (Accessory)

Panel A in supplemental figure:

# Accessory partition only: one stacked bar per genome with every COG label,
# including the three grey non-informative classes; no legend.
pAS <- GCsbyCOG_Genome %>%
  filter(bins_PPanGGOLiN == "Accessory") %>%
  ggplot(aes(
    x = genome_name,
    y = num_corrected_genes,
    fill = fct_rev(COGs)
  )) +
  stat_summary(fun = sum, geom = "bar", position = "stack") +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_manual(values = palette1) +
  labs(x = "", y = "Number of Gene Clusters", fill = "COG Assignment") +
  theme_classic() +
  theme(
    axis.text.y = element_text(size = 7),
    axis.text.x = element_text(size = 8, angle = 75, hjust = 1),
    legend.position = "none",
    plot.margin = unit(c(15, 15, -10, 20), "pt")
  )
pAS

Panel B in supplemental figure:

# Accessory partition only, restricted to informative COG categories: one
# stacked bar per genome, with a two-column legend below the plot.
pBS <- GCsbyCOG_Genome %>%
  filter(
    COGs != "Uninformative",
    COGs != "Ambiguous",
    COGs != "Unclassified"
  ) %>%
  filter(bins_PPanGGOLiN == "Accessory") %>%
  ggplot(aes(
    x = genome_name,
    y = num_corrected_genes,
    fill = fct_rev(COGs)
  )) +
  stat_summary(fun = sum, geom = "bar", position = "stack") +
  scale_fill_manual(values = palette2) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    x = "",
    y = "Number of Informative Gene Clusters",
    fill = "COG Categories"
  ) +
  theme_classic() +
  theme(
    axis.text.y = element_text(size = 7),
    axis.text.x = element_text(size = 8, angle = 75, hjust = 1),
    legend.position = "bottom",
    legend.key.size = unit(0.7, "line"),
    legend.text = element_text(size = 8),
    plot.margin = unit(c(0, 15, 0, 20), "pt")
  ) +
  guides(fill = guide_legend(
    ncol = 2,
    title.position = "top",
    title.hjust = 0.5
  ))
pBS

4.3 Plots by COG Category

In order to represent the Persistent on the left of the plot with absolute values per COG category we create per.neg; a negative version of the persistent variable in GCsbyCOG_PervsAcc. Table converted to the long format for plotting.

# Mirror the persistent counts to negative values so they can be drawn to
# the left of zero in the diverging bar chart.
GCsbyCOG_PervsAcc[["per.neg"]] <- -GCsbyCOG_PervsAcc[["Persistent"]]

# Long format: the per.neg and Accessory columns are stacked into "plotting",
# with their source column name recorded in bins_PPanGGOLiN.
GCsbyCOG_PervsAccLong <- GCsbyCOG_PervsAcc %>%
  gather(key = "bins_PPanGGOLiN", value = "plotting", per.neg, Accessory)
kable(GCsbyCOG_PervsAccLong)
COGs Persistent pTotal.Accessory pTotal.Persistent total pTotal.total p.accessory p.Persistent ratio bins_PPanGGOLiN plotting
Secondary metabolites biosynthesis, transport, and catabolism 17.588235 0.3 0.9 21.58824 0.6 18.5 81.5 0.23 per.neg -17.588235
Inorganic ion transport and metabolism 103.775358 1.2 5.2 120.70757 3.5 14.0 86.0 0.16 per.neg -103.775358
Lipid transport and metabolism 65.537101 0.9 3.3 78.53710 2.3 16.6 83.4 0.20 per.neg -65.537101
Coenzyme transport and metabolism 99.724599 0.4 5.0 105.72460 3.1 5.7 94.3 0.06 per.neg -99.724599
Carbohydrate transport and metabolism 99.616963 2.1 5.0 129.51696 3.8 23.1 76.9 0.30 per.neg -99.616963
Nucleotide transport and metabolism 64.935484 0.3 3.2 69.87993 2.0 7.1 92.9 0.08 per.neg -64.935484
Amino acid transport and metabolism 137.913445 1.4 6.9 157.91345 4.6 12.7 87.3 0.15 per.neg -137.913445
Energy production and conversion 72.280445 0.7 3.6 81.91378 2.4 11.8 88.2 0.13 per.neg -72.280445
Mobilome: prophages, transposons 8.526291 5.0 0.4 79.50386 2.3 89.3 10.7 8.32 per.neg -8.526291
Replication, recombination and repair 81.792484 3.5 4.1 131.51898 3.8 37.8 62.2 0.61 per.neg -81.792484
Transcription 90.596429 1.9 4.5 117.95748 3.4 23.2 76.8 0.30 per.neg -90.596429
Translation, ribosomal structure and biogenesis 175.000000 0.5 8.7 182.00000 5.3 3.8 96.2 0.04 per.neg -175.000000
Defense mechanisms 43.632277 3.8 2.2 97.76023 2.9 55.4 44.6 1.24 per.neg -43.632277
Intracellular trafficking, secretion, and vesicular transport 14.058824 0.0 0.7 14.05882 0.4 0.0 100.0 0.00 per.neg -14.058824
Signal transduction mechanisms 36.382353 0.8 1.8 47.03235 1.4 22.6 77.4 0.29 per.neg -36.382353
Post-translational modification, protein turnover, and chaperones 74.473109 0.8 3.7 85.47311 2.5 12.9 87.1 0.15 per.neg -74.473109
Cell Motility 1.000000 0.0 0.0 1.00000 0.0 0.0 100.0 0.00 per.neg -1.000000
Cell wall/membrane/envelope biogenesis 91.319388 1.8 4.5 116.56753 3.4 21.7 78.3 0.28 per.neg -91.319388
Cell cycle control, cell division, chromosome partitioning 20.186745 0.2 1.0 23.18674 0.7 12.9 87.1 0.15 per.neg -20.186745
RNA processing and modification 1.000000 0.0 0.0 1.00000 0.0 0.0 100.0 0.00 per.neg -1.000000
Extracellular structures 1.000000 0.0 0.0 1.00000 0.0 0.0 100.0 0.00 per.neg -1.000000
Uninformative 156.662851 2.0 7.8 185.02456 5.4 15.3 84.7 0.18 per.neg -156.662851
Ambiguous 203.396979 4.7 10.1 270.46368 7.9 24.8 75.2 0.33 per.neg -203.396979
Unclassified 350.600641 67.6 17.4 1307.67103 38.2 73.2 26.8 2.73 per.neg -350.600641
Secondary metabolites biosynthesis, transport, and catabolism 17.588235 0.3 0.9 21.58824 0.6 18.5 81.5 0.23 Accessory 4.000000
Inorganic ion transport and metabolism 103.775358 1.2 5.2 120.70757 3.5 14.0 86.0 0.16 Accessory 16.932217
Lipid transport and metabolism 65.537101 0.9 3.3 78.53710 2.3 16.6 83.4 0.20 Accessory 13.000000
Coenzyme transport and metabolism 99.724599 0.4 5.0 105.72460 3.1 5.7 94.3 0.06 Accessory 6.000000
Carbohydrate transport and metabolism 99.616963 2.1 5.0 129.51696 3.8 23.1 76.9 0.30 Accessory 29.900000
Nucleotide transport and metabolism 64.935484 0.3 3.2 69.87993 2.0 7.1 92.9 0.08 Accessory 4.944444
Amino acid transport and metabolism 137.913445 1.4 6.9 157.91345 4.6 12.7 87.3 0.15 Accessory 20.000000
Energy production and conversion 72.280445 0.7 3.6 81.91378 2.4 11.8 88.2 0.13 Accessory 9.633333
Mobilome: prophages, transposons 8.526291 5.0 0.4 79.50386 2.3 89.3 10.7 8.32 Accessory 70.977571
Replication, recombination and repair 81.792484 3.5 4.1 131.51898 3.8 37.8 62.2 0.61 Accessory 49.726496
Transcription 90.596429 1.9 4.5 117.95748 3.4 23.2 76.8 0.30 Accessory 27.361050
Translation, ribosomal structure and biogenesis 175.000000 0.5 8.7 182.00000 5.3 3.8 96.2 0.04 Accessory 7.000000
Defense mechanisms 43.632277 3.8 2.2 97.76023 2.9 55.4 44.6 1.24 Accessory 54.127950
Intracellular trafficking, secretion, and vesicular transport 14.058824 0.0 0.7 14.05882 0.4 0.0 100.0 0.00 Accessory 0.000000
Signal transduction mechanisms 36.382353 0.8 1.8 47.03235 1.4 22.6 77.4 0.29 Accessory 10.650000
Post-translational modification, protein turnover, and chaperones 74.473109 0.8 3.7 85.47311 2.5 12.9 87.1 0.15 Accessory 11.000000
Cell Motility 1.000000 0.0 0.0 1.00000 0.0 0.0 100.0 0.00 Accessory 0.000000
Cell wall/membrane/envelope biogenesis 91.319388 1.8 4.5 116.56753 3.4 21.7 78.3 0.28 Accessory 25.248140
Cell cycle control, cell division, chromosome partitioning 20.186745 0.2 1.0 23.18674 0.7 12.9 87.1 0.15 Accessory 3.000000
RNA processing and modification 1.000000 0.0 0.0 1.00000 0.0 0.0 100.0 0.00 Accessory 0.000000
Extracellular structures 1.000000 0.0 0.0 1.00000 0.0 0.0 100.0 0.00 Accessory 0.000000
Uninformative 156.662851 2.0 7.8 185.02456 5.4 15.3 84.7 0.18 Accessory 28.361709
Ambiguous 203.396979 4.7 10.1 270.46368 7.9 24.8 75.2 0.33 Accessory 67.066696
Unclassified 350.600641 67.6 17.4 1307.67103 38.2 73.2 26.8 2.73 Accessory 957.070393

Panel B in main figure:

# Panel B: diverging horizontal bars per informative COG category. The
# persistent counts are stored as negative values (per.neg) and so extend to
# the left of zero; the accessory counts extend to the right.
pB <- ggplot(
  filter(
    GCsbyCOG_PervsAccLong,
    COGs != "Uninformative", COGs != "Ambiguous", COGs != "Unclassified"
  ),
  aes(x = COGs, y = plotting, fill = COGs)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = rev(palette2)) +
  scale_x_discrete(position = "top") +
  labs(x = "", y = "Number of Gene Clusters") +
  coord_flip() +
  # Tick labels show absolute values on both sides of zero. The argument is
  # spelled out as `labels` instead of relying on partial matching of `label`.
  scale_y_continuous(
    limits = c(-200, 200),
    breaks = c(-150, -100, -50, 0, 50, 100, 150),
    labels = c(150, 100, 50, 0, 50, 100, 150)
  ) +
  # The guide line and the two header labels are single fixed annotations.
  # annotate() draws each of them once; constant values inside aes() would
  # redraw them once per data row, stacking identical copies on each other.
  annotate(
    "segment",
    x = 0, xend = 19.5, y = 0, yend = 0,
    linetype = 3, size = 0.1
  ) +
  annotate(
    "label",
    x = 22.5, y = -95, label = "      Persistent       ",
    fontface = "bold", size = 3, fill = "grey90", label.size = NA,
    label.padding = unit(0.3, "lines")
  ) +
  annotate(
    "label",
    x = 22.5, y = 95, label = "     Accessory      ",
    fontface = "bold", size = 3, fill = "grey90", label.size = NA,
    label.padding = unit(0.3, "lines")
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(size = 9),
    axis.text.x = element_text(size = 7),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    legend.position = "none",
    plot.margin = unit(c(5, 10, 10, 25), "pt"),
    plot.title = element_text(face = "bold", hjust = 3, vjust = -3.9)
  )

# Turn off panel clipping on the grob before arranging the final panel.
gpB <- ggplotGrob(pB)
gpB$layout$clip[gpB$layout$name == "panel"] <- "off"
ggarrange(gpB, labels = "COG Categories", label.x = 0.5, vjust = 1.1)