Supplemental Methods: COGs Functional Analysis for Cps

1 Data Import

This Notebook uses the anvi-summarize output files saved in the “analysis_Anvio7” folder. To run this Notebook automatically for each of the four Corynebacterium species, you can source the script run_COGS.R.

We select the most relevant variables for the functional analysis:

# Drop the three homogeneity-index columns and the aa_sequence column;
# every other column of the summary table is kept.
Pangenome <- select(
  Pangenome,
  -c(
    functional_homogeneity_index,
    geometric_homogeneity_index,
    combined_homogeneity_index,
    aa_sequence
  )
)

In the new variable “bins_PPanGGOLiN” we define “Persistent” and “Accessory” (as “shell” + “cloud”):

# Two-level partition label: rows whose PPanGGOLiN value contains
# "persistent" become "Persistent"; every other row becomes "Accessory".
Pangenome <- mutate(
  Pangenome,
  bins_PPanGGOLiN = ifelse(
    grepl("persistent", PPanGGOLiN),
    "Persistent",
    "Accessory"
  )
)

Number and percentage of Persistent vs Accessory:

# Number of distinct gene clusters that have at least one gene in each bin.
# which() drops any NA comparison results, as dplyr::filter() would.
vPersistent <- n_distinct(
  Pangenome$gene_cluster_id[which(Pangenome$bins_PPanGGOLiN == "Persistent")]
)
vAccesory <- n_distinct(
  Pangenome$gene_cluster_id[which(Pangenome$bins_PPanGGOLiN == "Accessory")]
)

# Each bin as a percentage of the two counts combined, to one decimal place.
vPersistent.p <- round(vPersistent * 100 / (vAccesory + vPersistent), 1)
vAccesory.p <- round(vAccesory * 100 / (vAccesory + vPersistent), 1)

There are 1819 gene clusters (GC) (50.7%) in the “Accessory” vs. 1771 (49.3%) in the “Persistent” at the pangenome level

2 COG Analysis at the Gene Level

We define a new variable COGs to use in the plots. This variable is based on COG20_CATEGORY but with a cleaner definition of unclassified, uninformative, or mixed assignments:

  • COG categories “Function Unknown” and “General function predictions only” were considered as “Uninformative”.
  • If the COG category is mixed (e.g., G|S|M) it gets labeled as “Ambiguous”.
  • Missing values (NA) are labeled as “Unclassified”.
# Build the plotting variable COGs from the COG20 category accessions.
# Coerce to character first so the relabelling below cannot produce
# invalid-factor-level NAs if the column was read in as a factor.
Pangenome$COGs <- as.character(Pangenome$COG20_CATEGORY_ACC)

# "S" and "R" are both collapsed into "Uninformative".
Pangenome$COGs[Pangenome$COGs %in% c("S", "R")] <- "Uninformative"

# Any accession containing a literal "|" is a mixed assignment.
Pangenome$COGs[grepl("|", Pangenome$COGs, fixed = TRUE)] <- "Ambiguous"

# Genes without an assignment: match both the empty string and NA, because
# how missing values arrive depends on how the table was read in.
Pangenome$COGs[is.na(Pangenome$COGs) | Pangenome$COGs == ""] <- "Unclassified"

Summary of COG annotated genes:

Genes Count Percentage
Total in Pangenome 90461 100.0
COG Category Uninformative = Function Unknown 2518 2.8
COG Category Uninformative = General function prediction only 3367 3.7
COG Category Ambiguous (Mixed COG Category) 8688 9.6
COG Category Unclassified (Non-assigned) 19763 21.8
Informative COGs (Total - Uninformative, Ambiguous & Unclassified) 56125 62.0

3 COG Analysis at the Gene Cluster Level

This analysis was done at the pangenomic gene cluster level (GC). Since many gene clusters had mixed COG category assignments a solution is to assign each individual gene call to their corresponding Genome/bins_PPanGGOLiN/COG grouping weighting their contribution by dividing their count by the number of genes in their GC.

3.1 GCs by COG Category and Genome

The table “GCsbyCOG_Genome” groups the genes by genome, then within each genome by “Accessory” vs. “Persistent” status, and within each status by COG category. Instead of counting the genes in each group, we calculate the sum of 1/num_genes_in_gene_cluster, so that each gene cluster contributes a total of 1 across all groups.

# COGs must be a factor so that .drop = FALSE can keep the COG levels that
# have no genes in a given genome/bin combination.
Pangenome$COGs <- as.factor(Pangenome$COGs)

# Each gene contributes 1 / (number of genes in its gene cluster) to its
# genome / bin / COG group.
GCsbyCOG_Genome <- summarise(
  group_by(Pangenome, genome_name, bins_PPanGGOLiN, COGs, .drop = FALSE),
  num_corrected_genes = sum(1 / num_genes_in_gene_cluster)
)

The total sum of all values in the num_corrected_genes variable should add up to the number of GCs:

# Sanity check: the weighted gene counts should total the number of distinct
# gene clusters in the pangenome.
sum(GCsbyCOG_Genome[["num_corrected_genes"]])
## [1] 3590
n_distinct(Pangenome$gene_cluster_id)
## [1] 3590

Adding extra column to label the gray scale portion of the plots:

# Assignment keeps the three grey-scale labels as they are and collapses
# every other COG category into "Informative".
GCsbyCOG_Genome <- mutate(
  GCsbyCOG_Genome,
  Assignment = ifelse(
    COGs == "Uninformative" | COGs == "Ambiguous" | COGs == "Unclassified",
    as.character(COGs),
    "Informative"
  )
)

3.1.1 Summary of COG-annotated GCs in the Accessory vs. Persistent

# Weighted gene-cluster counts per bin and assignment class.
TableGC <- summarize(
  group_by(GCsbyCOG_Genome, bins_PPanGGOLiN, Assignment),
  corrected_genes = sum(num_corrected_genes)
)

# Percentages are relative to the grand total over both bins.
TableGC$Percentages <- with(
  TableGC,
  round(100 * corrected_genes / sum(corrected_genes), 1)
)

kable(TableGC)
bins_PPanGGOLiN Assignment corrected_genes Percentages
Accessory Ambiguous 80.38391 2.2
Accessory Informative 543.70924 15.1
Accessory Unclassified 1142.44959 31.8
Accessory Uninformative 52.45726 1.5
Persistent Ambiguous 184.11053 5.1
Persistent Informative 1156.64885 32.2
Persistent Unclassified 307.84530 8.6
Persistent Uninformative 122.39532 3.4

3.1.2 Summary of COG-annotated GCs in the Accessory

# Same summary as TableGC, restricted to the "Accessory" bin.
TableGCAccessory <- summarize(
  group_by(
    filter(GCsbyCOG_Genome, bins_PPanGGOLiN == "Accessory"),
    bins_PPanGGOLiN, Assignment
  ),
  corrected_genes = sum(num_corrected_genes)
)

# Percentages are relative to the Accessory total only.
TableGCAccessory$Percentages <- with(
  TableGCAccessory,
  round(100 * corrected_genes / sum(corrected_genes), 1)
)

kable(TableGCAccessory)
bins_PPanGGOLiN Assignment corrected_genes Percentages
Accessory Ambiguous 80.38391 4.4
Accessory Informative 543.70924 29.9
Accessory Unclassified 1142.44959 62.8
Accessory Uninformative 52.45726 2.9

3.1.3 Summary of COG-annotated GCs in the Persistent

# Same summary as TableGC, restricted to the "Persistent" bin.
TableGCPersistent <- summarize(
  group_by(
    filter(GCsbyCOG_Genome, bins_PPanGGOLiN == "Persistent"),
    bins_PPanGGOLiN, Assignment
  ),
  corrected_genes = sum(num_corrected_genes)
)

# Percentages are relative to the Persistent total only.
TableGCPersistent$Percentages <- with(
  TableGCPersistent,
  round(100 * corrected_genes / sum(corrected_genes), 1)
)

kable(TableGCPersistent)
bins_PPanGGOLiN Assignment corrected_genes Percentages
Persistent Ambiguous 184.1105 10.4
Persistent Informative 1156.6489 65.3
Persistent Unclassified 307.8453 17.4
Persistent Uninformative 122.3953 6.9

3.1.4 Summary of COG-annotated GCs by Genome in the Accessory vs. Persistent

# Weighted gene-cluster total per genome and bin, summed over all COG groups.
TableGenomes <- summarize(
  group_by(GCsbyCOG_Genome, genome_name, bins_PPanGGOLiN),
  corrected_genes = sum(num_corrected_genes)
)

kable(TableGenomes)
genome_name bins_PPanGGOLiN corrected_genes
Cps_90104 Accessory 72.51181
Cps_90104 Persistent 43.73011
Cps_BWA092 Accessory 29.49148
Cps_BWA092 Persistent 41.07585
Cps_BWA110 Accessory 42.90065
Cps_BWA110 Persistent 41.15082
Cps_BWA118 Accessory 30.22329
Cps_BWA118 Persistent 41.05280
Cps_BWA127 Accessory 63.52215
Cps_BWA127 Persistent 41.05936
Cps_BWA141 Accessory 35.29439
Cps_BWA141 Persistent 41.09723
Cps_BWA164 Accessory 52.52830
Cps_BWA164 Persistent 41.02304
Cps_BWA166 Accessory 57.23220
Cps_BWA166 Persistent 41.10363
Cps_BWA172 Accessory 62.50068
Cps_BWA172 Persistent 41.01290
Cps_BWA184 Accessory 57.03582
Cps_BWA184 Persistent 41.17115
Cps_BWA188 Accessory 107.59937
Cps_BWA188 Persistent 41.12069
Cps_BWA198 Accessory 42.51485
Cps_BWA198 Persistent 41.02077
Cps_BWA223 Accessory 32.02394
Cps_BWA223 Persistent 41.09447
Cps_BWA238 Accessory 44.39401
Cps_BWA238 Persistent 40.96878
Cps_BWA283 Accessory 45.38969
Cps_BWA283 Persistent 41.08820
Cps_BWA300 Accessory 97.59197
Cps_BWA300 Persistent 41.00141
Cps_BWA305 Accessory 30.59784
Cps_BWA305 Persistent 41.24247
Cps_BWA311 Accessory 31.27443
Cps_BWA311 Persistent 41.05230
Cps_DSM_44287 Accessory 44.23515
Cps_DSM_44287 Persistent 41.16971
Cps_DU080 Accessory 15.52420
Cps_DU080 Persistent 40.97233
Cps_HSID17575 Accessory 61.30105
Cps_HSID17575 Persistent 41.14971
Cps_HSID17576 Accessory 28.30659
Cps_HSID17576 Persistent 41.21913
Cps_KPL_1989 Accessory 17.46316
Cps_KPL_1989 Persistent 41.20477
Cps_KPL_2621 Accessory 30.72339
Cps_KPL_2621 Persistent 41.16018
Cps_KPL_2640 Accessory 107.66539
Cps_KPL_2640 Persistent 41.17798
Cps_KPL_2667 Accessory 45.32912
Cps_KPL_2667 Persistent 41.11554
Cps_KPL_2733 Accessory 17.71984
Cps_KPL_2733 Persistent 41.14355
Cps_KPL_2773 Accessory 36.82944
Cps_KPL_2773 Persistent 41.22281
Cps_KPL_2795 Accessory 20.97932
Cps_KPL_2795 Persistent 41.23837
Cps_KPL_2826 Accessory 21.43115
Cps_KPL_2826 Persistent 40.95227
Cps_KPL_2834 Accessory 44.27853
Cps_KPL_2834 Persistent 41.36393
Cps_KPL_2865 Accessory 19.20562
Cps_KPL_2865 Persistent 41.21689
Cps_KPL_2915 Accessory 33.01606
Cps_KPL_2915 Persistent 41.18450
Cps_KPL_3671 Accessory 20.15255
Cps_KPL_3671 Persistent 41.06914
Cps_KPL_3702 Accessory 32.10179
Cps_KPL_3702 Persistent 41.21586
Cps_KPL_3770 Accessory 17.02193
Cps_KPL_3770 Persistent 41.11065
Cps_KPL_3772 Accessory 39.99019
Cps_KPL_3772 Persistent 41.22392
Cps_KPL_3833 Accessory 35.57909
Cps_KPL_3833 Persistent 41.29288
Cps_KPL_3966 Accessory 66.99397
Cps_KPL_3966 Persistent 41.08352
Cps_KPL_4010 Accessory 21.81175
Cps_KPL_4010 Persistent 41.12072
Cps_KPL_4025 Accessory 17.68086
Cps_KPL_4025 Persistent 41.08501
Cps_KPL_4041 Accessory 49.68279
Cps_KPL_4041 Persistent 41.12589
Cps_KPL_4066 Accessory 39.35019
Cps_KPL_4066 Persistent 41.11475

3.1.5 Renaming and ordering variables factor levels for plotting:

# Level order for the bins: "Persistent" first, then "Accessory".
GCsbyCOG_Genome$bins_PPanGGOLiN <- factor(
  GCsbyCOG_Genome$bins_PPanGGOLiN,
  levels = c("Persistent", "Accessory")
)

# Replace the one-letter COG codes with their full category names. The order
# of the pairs below becomes the order of the (ordered) factor levels, with
# the three grey-scale labels last.
GCsbyCOG_Genome$COGs <- recode_factor(
  GCsbyCOG_Genome$COGs,
  "Q" = "Secondary metabolites biosynthesis, transport, and catabolism",
  "P" = "Inorganic ion transport and metabolism",
  "I" = "Lipid transport and metabolism",
  "H" = "Coenzyme transport and metabolism",
  "G" = "Carbohydrate transport and metabolism",
  "F" = "Nucleotide transport and metabolism",
  "E" = "Amino acid transport and metabolism",
  "C" = "Energy production and conversion",
  "X" = "Mobilome: prophages, transposons",
  "L" = "Replication, recombination and repair",
  "K" = "Transcription",
  "J" = "Translation, ribosomal structure and biogenesis",
  "V" = "Defense mechanisms",
  "U" = "Intracellular trafficking, secretion, and vesicular transport",
  "T" = "Signal transduction mechanisms",
  "O" = "Post-translational modification, protein turnover, and chaperones",
  "N" = "Cell Motility",
  "M" = "Cell wall/membrane/envelope biogenesis",
  "D" = "Cell cycle control, cell division, chromosome partitioning",
  "A" = "RNA processing and modification",
  "W" = "Extracellular structures",
  "Uninformative" = "Uninformative",
  "Ambiguous" = "Ambiguous",
  "Unclassified" = "Unclassified",
  .ordered = TRUE
)

# "Informative" is relabelled to a single space; the other three labels are
# kept unchanged.
GCsbyCOG_Genome$Assignment <- recode_factor(
  GCsbyCOG_Genome$Assignment,
  "Informative" = " ",
  "Uninformative" = "Uninformative",
  "Ambiguous" = "Ambiguous",
  "Unclassified" = "Unclassified",
  .ordered = TRUE
)

3.2 GCs by COG Category

The table “GCsbyCOG” groups the genes by “Accessory” vs. “Persistent” status, and nested inside as the COG category.

# Weighted gene-cluster count per bin and COG group, pooled over genomes:
# each gene adds 1 / (number of genes in its gene cluster).
GCsbyCOG <- summarise(
  group_by(Pangenome, bins_PPanGGOLiN, COGs),
  num_corrected_genes = sum(1 / num_genes_in_gene_cluster)
)

3.2.1 Renaming and ordering variables factor levels for plotting:

# Replace the one-letter COG codes with their full category names. The
# mapping is held in a named vector and spliced into recode_factor(); its
# order becomes the order of the (ordered) factor levels. local() keeps the
# helper vector out of the global environment.
GCsbyCOG$COGs <- local({
  cog_labels <- c(
    Q = "Secondary metabolites biosynthesis, transport, and catabolism",
    P = "Inorganic ion transport and metabolism",
    I = "Lipid transport and metabolism",
    H = "Coenzyme transport and metabolism",
    G = "Carbohydrate transport and metabolism",
    F = "Nucleotide transport and metabolism",
    E = "Amino acid transport and metabolism",
    C = "Energy production and conversion",
    X = "Mobilome: prophages, transposons",
    L = "Replication, recombination and repair",
    K = "Transcription",
    J = "Translation, ribosomal structure and biogenesis",
    V = "Defense mechanisms",
    U = "Intracellular trafficking, secretion, and vesicular transport",
    T = "Signal transduction mechanisms",
    O = "Post-translational modification, protein turnover, and chaperones",
    N = "Cell Motility",
    M = "Cell wall/membrane/envelope biogenesis",
    D = "Cell cycle control, cell division, chromosome partitioning",
    A = "RNA processing and modification",
    W = "Extracellular structures",
    Uninformative = "Uninformative",
    Ambiguous = "Ambiguous",
    Unclassified = "Unclassified"
  )
  recode_factor(GCsbyCOG$COGs, !!!cog_labels, .ordered = TRUE)
})

3.2.2 Summary of COG-annotated GCs by COG Category:

New table “GCsbyCOG_PervsAcc” in wide format. % of each category relative to the “Accessory” or “Persistent” was calculated (pTotal. variables). Total GCs for each COG category calculated, and % of GCs in the “Accessory” and “Persistent” relative to each category (p. values) were calculated as well. The ratio between the number of GC in the “Accessory” vs. the “Persistent” is calculated for each COG:

# Wide table: one row per COG group, one column per bin (0 where a bin has
# no gene clusters for that group).
GCsbyCOG_PervsAcc <- spread(
  GCsbyCOG,
  key = bins_PPanGGOLiN,
  value = num_corrected_genes,
  fill = 0
)

# pTotal.*: share of each COG group within a bin (column percentages).
GCsbyCOG_PervsAcc$pTotal.Accessory <- with(
  GCsbyCOG_PervsAcc, round(100 * Accessory / sum(Accessory), 1)
)
GCsbyCOG_PervsAcc$pTotal.Persistent <- with(
  GCsbyCOG_PervsAcc, round(100 * Persistent / sum(Persistent), 1)
)

# Total per COG group over both bins, and its share of the grand total.
GCsbyCOG_PervsAcc$total <- with(GCsbyCOG_PervsAcc, Accessory + Persistent)
GCsbyCOG_PervsAcc$pTotal.total <- with(
  GCsbyCOG_PervsAcc, round(100 * total / sum(total), 1)
)

# p.*: share of each bin within a COG group (row percentages).
GCsbyCOG_PervsAcc$p.accessory <- with(
  GCsbyCOG_PervsAcc, round(100 * (Accessory / total), 1)
)
GCsbyCOG_PervsAcc$p.Persistent <- with(
  GCsbyCOG_PervsAcc, round(100 * (Persistent / total), 1)
)

# Accessory-to-Persistent ratio per COG group.
GCsbyCOG_PervsAcc$ratio <- with(
  GCsbyCOG_PervsAcc, round(Accessory / Persistent, 2)
)

kable(GCsbyCOG_PervsAcc)
COGs Accessory Persistent pTotal.Accessory pTotal.Persistent total pTotal.total p.accessory p.Persistent ratio
Secondary metabolites biosynthesis, transport, and catabolism 15.000000 12.690476 0.8 0.7 27.69048 0.8 54.2 45.8 1.18
Inorganic ion transport and metabolism 42.326210 89.287350 2.3 5.0 131.61356 3.7 32.2 67.8 0.47
Lipid transport and metabolism 15.000000 66.279070 0.8 3.7 81.27907 2.3 18.5 81.5 0.23
Coenzyme transport and metabolism 11.000000 86.006037 0.6 4.9 97.00604 2.7 11.3 88.7 0.13
Carbohydrate transport and metabolism 15.803922 76.712727 0.9 4.3 92.51665 2.6 17.1 82.9 0.21
Nucleotide transport and metabolism 8.636364 52.511628 0.5 3.0 61.14799 1.7 14.1 85.9 0.16
Amino acid transport and metabolism 20.914286 125.886317 1.1 7.1 146.80060 4.1 14.2 85.8 0.17
Energy production and conversion 4.500000 68.225000 0.2 3.9 72.72500 2.0 6.2 93.8 0.07
Mobilome: prophages, transposons 107.972960 2.987879 5.9 0.2 110.96084 3.1 97.3 2.7 36.14
Replication, recombination and repair 87.200758 75.227273 4.8 4.2 162.42803 4.5 53.7 46.3 1.16
Transcription 32.144444 71.815747 1.8 4.1 103.96019 2.9 30.9 69.1 0.45
Translation, ribosomal structure and biogenesis 15.954546 168.955556 0.9 9.5 184.91010 5.2 8.6 91.4 0.09
Defense mechanisms 84.471718 29.679939 4.6 1.7 114.15166 3.2 74.0 26.0 2.85
Intracellular trafficking, secretion, and vesicular transport 1.000000 14.139535 0.1 0.8 15.13953 0.4 6.6 93.4 0.07
Signal transduction mechanisms 13.875000 35.093023 0.8 2.0 48.96802 1.4 28.3 71.7 0.40
Post-translational modification, protein turnover, and chaperones 23.434286 79.721987 1.3 4.5 103.15627 2.9 22.7 77.3 0.29
Cell Motility 0.000000 2.000000 0.0 0.1 2.00000 0.1 0.0 100.0 0.00
Cell wall/membrane/envelope biogenesis 43.474748 80.496987 2.4 4.5 123.97174 3.5 35.1 64.9 0.54
Cell cycle control, cell division, chromosome partitioning 1.000000 16.932323 0.1 1.0 17.93232 0.5 5.6 94.4 0.06
RNA processing and modification 0.000000 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00
Extracellular structures 0.000000 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00
Uninformative 52.457258 122.395321 2.9 6.9 174.85258 4.9 30.0 70.0 0.43
Ambiguous 80.383908 184.110527 4.4 10.4 264.49443 7.4 30.4 69.6 0.44
Unclassified 1142.449594 307.845298 62.8 17.4 1450.29489 40.4 78.8 21.2 3.71

4 Plots

Color Palettes

# Colour ramp interpolated from 8 colours of the "Set1" Brewer palette.
getPalette <- colorRampPalette(brewer.pal(8, "Set1"))

# Number of distinct COG groups, including the three grey-scale labels.
CountTotalCOGs <- n_distinct(GCsbyCOG_Genome$COGs)

# palette2: one ramp colour per group other than the three grey-scale ones.
palette2 <- getPalette(CountTotalCOGs - 3) # Colors
# palette1: three greys followed by the same ramp colours.
palette1 <- c("grey60", "grey40", "grey20", palette2) # Colors + Grays
# palette3: three greys followed by white.
palette3 <- c("grey60", "grey40", "grey20", "white") # White + Grays

4.1 Plots Accessory vs. Persistent

Panel A in main figure:

# Stacked bars: summed weighted gene-cluster counts per bin, coloured by COG
# group (reversed level order, so the first palette1 colours go to the last
# levels). The legend is hidden.
pA <- ggplot(
  GCsbyCOG_Genome,
  aes(x = bins_PPanGGOLiN, y = num_corrected_genes, fill = fct_rev(COGs))
) +
  stat_summary(geom = "bar", fun = sum, position = "stack") +
  scale_fill_manual(values = palette1) +
  scale_x_discrete(labels = c("Persistent", "Accessory")) +
  scale_y_continuous(breaks = seq(0, 2250, by = 250), expand = c(0, 0)) +
  labs(x = " ", y = "Number of Gene Clusters", fill = "COG Categories") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 9),
    axis.text = element_text(size = 7),
    legend.position = "none",
    plot.margin = unit(c(10, 0, 10, 20), "pt")
  )
pA

4.2 Plots by Genome (Accessory)

Panel A in supplemental figure:

# Accessory rows only: one stacked bar per genome, coloured by COG group
# (all groups, greys included). The legend is hidden.
pAS <- GCsbyCOG_Genome %>%
  filter(bins_PPanGGOLiN == "Accessory") %>%
  ggplot(aes(x = genome_name, y = num_corrected_genes, fill = fct_rev(COGs))) +
  stat_summary(geom = "bar", fun = sum, position = "stack") +
  scale_fill_manual(values = palette1) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "", y = "Number of Gene Clusters", fill = "COG Assignment") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, angle = 75, hjust = 1),
    axis.text.y = element_text(size = 7),
    legend.position = "none",
    plot.margin = unit(c(15, 15, -10, 20), "pt")
  )
pAS

Panel B in supplemental figure:

# Accessory rows without the three grey-scale groups: one stacked bar per
# genome, with a two-column legend below the plot.
pBS <- GCsbyCOG_Genome %>%
  filter(
    COGs != "Uninformative",
    COGs != "Ambiguous",
    COGs != "Unclassified",
    bins_PPanGGOLiN == "Accessory"
  ) %>%
  ggplot(aes(x = genome_name, y = num_corrected_genes, fill = fct_rev(COGs))) +
  stat_summary(geom = "bar", fun = sum, position = "stack") +
  scale_fill_manual(values = palette2) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    x = "",
    y = "Number of Informative Gene Clusters",
    fill = "COG Categories"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, angle = 75, hjust = 1),
    axis.text.y = element_text(size = 7),
    legend.position = "bottom",
    legend.key.size = unit(0.7, "line"),
    legend.text = element_text(size = 8),
    plot.margin = unit(c(0, 15, 0, 20), "pt")
  ) +
  guides(fill = guide_legend(ncol = 2, title.position = "top", title.hjust = 0.5))
pBS

4.3 Plots by COG Category

In order to represent the Persistent on the left of the plot with absolute values per COG category we create per.neg; a negative version of the persistent variable in GCsbyCOG_PervsAcc. Table converted to the long format for plotting.

# Negated Persistent counts, so Persistent bars extend to the left of zero.
GCsbyCOG_PervsAcc$per.neg <- -GCsbyCOG_PervsAcc[["Persistent"]]

# Long format: per.neg and Accessory are stacked into the "plotting" column,
# with "bins_PPanGGOLiN" recording which of the two each row came from.
GCsbyCOG_PervsAccLong <- gather(
  GCsbyCOG_PervsAcc,
  key = "bins_PPanGGOLiN",
  value = "plotting",
  per.neg, Accessory
)
kable(GCsbyCOG_PervsAccLong)
COGs Persistent pTotal.Accessory pTotal.Persistent total pTotal.total p.accessory p.Persistent ratio bins_PPanGGOLiN plotting
Secondary metabolites biosynthesis, transport, and catabolism 12.690476 0.8 0.7 27.69048 0.8 54.2 45.8 1.18 per.neg -12.690476
Inorganic ion transport and metabolism 89.287350 2.3 5.0 131.61356 3.7 32.2 67.8 0.47 per.neg -89.287350
Lipid transport and metabolism 66.279070 0.8 3.7 81.27907 2.3 18.5 81.5 0.23 per.neg -66.279070
Coenzyme transport and metabolism 86.006037 0.6 4.9 97.00604 2.7 11.3 88.7 0.13 per.neg -86.006037
Carbohydrate transport and metabolism 76.712727 0.9 4.3 92.51665 2.6 17.1 82.9 0.21 per.neg -76.712727
Nucleotide transport and metabolism 52.511628 0.5 3.0 61.14799 1.7 14.1 85.9 0.16 per.neg -52.511628
Amino acid transport and metabolism 125.886317 1.1 7.1 146.80060 4.1 14.2 85.8 0.17 per.neg -125.886317
Energy production and conversion 68.225000 0.2 3.9 72.72500 2.0 6.2 93.8 0.07 per.neg -68.225000
Mobilome: prophages, transposons 2.987879 5.9 0.2 110.96084 3.1 97.3 2.7 36.14 per.neg -2.987879
Replication, recombination and repair 75.227273 4.8 4.2 162.42803 4.5 53.7 46.3 1.16 per.neg -75.227273
Transcription 71.815747 1.8 4.1 103.96019 2.9 30.9 69.1 0.45 per.neg -71.815747
Translation, ribosomal structure and biogenesis 168.955556 0.9 9.5 184.91010 5.2 8.6 91.4 0.09 per.neg -168.955556
Defense mechanisms 29.679939 4.6 1.7 114.15166 3.2 74.0 26.0 2.85 per.neg -29.679939
Intracellular trafficking, secretion, and vesicular transport 14.139535 0.1 0.8 15.13953 0.4 6.6 93.4 0.07 per.neg -14.139535
Signal transduction mechanisms 35.093023 0.8 2.0 48.96802 1.4 28.3 71.7 0.40 per.neg -35.093023
Post-translational modification, protein turnover, and chaperones 79.721987 1.3 4.5 103.15627 2.9 22.7 77.3 0.29 per.neg -79.721987
Cell Motility 2.000000 0.0 0.1 2.00000 0.1 0.0 100.0 0.00 per.neg -2.000000
Cell wall/membrane/envelope biogenesis 80.496987 2.4 4.5 123.97174 3.5 35.1 64.9 0.54 per.neg -80.496987
Cell cycle control, cell division, chromosome partitioning 16.932323 0.1 1.0 17.93232 0.5 5.6 94.4 0.06 per.neg -16.932323
RNA processing and modification 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 per.neg -1.000000
Extracellular structures 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 per.neg -1.000000
Uninformative 122.395321 2.9 6.9 174.85258 4.9 30.0 70.0 0.43 per.neg -122.395321
Ambiguous 184.110527 4.4 10.4 264.49443 7.4 30.4 69.6 0.44 per.neg -184.110527
Unclassified 307.845298 62.8 17.4 1450.29489 40.4 78.8 21.2 3.71 per.neg -307.845298
Secondary metabolites biosynthesis, transport, and catabolism 12.690476 0.8 0.7 27.69048 0.8 54.2 45.8 1.18 Accessory 15.000000
Inorganic ion transport and metabolism 89.287350 2.3 5.0 131.61356 3.7 32.2 67.8 0.47 Accessory 42.326210
Lipid transport and metabolism 66.279070 0.8 3.7 81.27907 2.3 18.5 81.5 0.23 Accessory 15.000000
Coenzyme transport and metabolism 86.006037 0.6 4.9 97.00604 2.7 11.3 88.7 0.13 Accessory 11.000000
Carbohydrate transport and metabolism 76.712727 0.9 4.3 92.51665 2.6 17.1 82.9 0.21 Accessory 15.803922
Nucleotide transport and metabolism 52.511628 0.5 3.0 61.14799 1.7 14.1 85.9 0.16 Accessory 8.636364
Amino acid transport and metabolism 125.886317 1.1 7.1 146.80060 4.1 14.2 85.8 0.17 Accessory 20.914286
Energy production and conversion 68.225000 0.2 3.9 72.72500 2.0 6.2 93.8 0.07 Accessory 4.500000
Mobilome: prophages, transposons 2.987879 5.9 0.2 110.96084 3.1 97.3 2.7 36.14 Accessory 107.972960
Replication, recombination and repair 75.227273 4.8 4.2 162.42803 4.5 53.7 46.3 1.16 Accessory 87.200758
Transcription 71.815747 1.8 4.1 103.96019 2.9 30.9 69.1 0.45 Accessory 32.144444
Translation, ribosomal structure and biogenesis 168.955556 0.9 9.5 184.91010 5.2 8.6 91.4 0.09 Accessory 15.954546
Defense mechanisms 29.679939 4.6 1.7 114.15166 3.2 74.0 26.0 2.85 Accessory 84.471718
Intracellular trafficking, secretion, and vesicular transport 14.139535 0.1 0.8 15.13953 0.4 6.6 93.4 0.07 Accessory 1.000000
Signal transduction mechanisms 35.093023 0.8 2.0 48.96802 1.4 28.3 71.7 0.40 Accessory 13.875000
Post-translational modification, protein turnover, and chaperones 79.721987 1.3 4.5 103.15627 2.9 22.7 77.3 0.29 Accessory 23.434286
Cell Motility 2.000000 0.0 0.1 2.00000 0.1 0.0 100.0 0.00 Accessory 0.000000
Cell wall/membrane/envelope biogenesis 80.496987 2.4 4.5 123.97174 3.5 35.1 64.9 0.54 Accessory 43.474748
Cell cycle control, cell division, chromosome partitioning 16.932323 0.1 1.0 17.93232 0.5 5.6 94.4 0.06 Accessory 1.000000
RNA processing and modification 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 Accessory 0.000000
Extracellular structures 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 Accessory 0.000000
Uninformative 122.395321 2.9 6.9 174.85258 4.9 30.0 70.0 0.43 Accessory 52.457258
Ambiguous 184.110527 4.4 10.4 264.49443 7.4 30.4 69.6 0.44 Accessory 80.383908
Unclassified 307.845298 62.8 17.4 1450.29489 40.4 78.8 21.2 3.71 Accessory 1142.449594

Panel B in main figure:

# Diverging horizontal bar chart of the informative COG groups: negated
# Persistent counts extend left of zero, Accessory counts extend right.
pB <- ggplot(
  filter(
    GCsbyCOG_PervsAccLong,
    COGs != "Uninformative", COGs != "Ambiguous", COGs != "Unclassified"
  ),
  aes(x = COGs, y = plotting, fill = COGs)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = rev(palette2)) +
  scale_x_discrete(position = "top") +
  labs(x = "", y = "Number of Gene Clusters") +
  coord_flip() +
  # Axis labels show absolute values on both sides of zero. `labels` is
  # spelled out in full rather than relying on partial matching of `label`.
  scale_y_continuous(
    limits = c(-200, 200),
    breaks = c(-150, -100, -50, 0, 50, 100, 150),
    labels = c(150, 100, 50, 0, 50, 100, 150)
  ) +
  # Fixed decorations use annotate() so each is drawn exactly once; a geom_*
  # layer with constant aesthetics would be repeated for every data row.
  # `size` is kept for the segment (newer ggplot2 calls it `linewidth`) so
  # older ggplot2 versions still work.
  annotate(
    "segment",
    x = 0, xend = 19.5, y = 0, yend = 0,
    linetype = 3, size = 0.1
  ) +
  annotate(
    "label",
    x = 22.5, y = -95, label = "      Persistent       ",
    fontface = "bold", size = 3, fill = "grey90",
    label.size = NA, label.padding = unit(0.3, "lines")
  ) +
  annotate(
    "label",
    x = 22.5, y = 95, label = "     Accessory      ",
    fontface = "bold", size = 3, fill = "grey90",
    label.size = NA, label.padding = unit(0.3, "lines")
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(size = 9),
    axis.text.x = element_text(size = 7),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    legend.position = "none",
    plot.margin = unit(c(5, 10, 10, 25), "pt"),
    plot.title = element_text(face = "bold", hjust = 3, vjust = -3.9)
  )

# Turn off panel clipping so the header labels placed beyond the last
# category (x = 22.5) are not cut off.
gpB <- ggplotGrob(pB)
gpB$layout$clip[gpB$layout$name == "panel"] <- "off"
ggarrange(gpB, labels = "COG Categories", label.x = 0.5, vjust = 1.1)