Supplemental Methods: COGs Functional Analysis for Ctu

1 Data Import

This Notebook uses the anvi-summarize output files saved in the “analysis_Anvio7” folder. In order to automatically run this Notebook for each of the four Corynebacterium species, you can source the script run_COGS.R.

We select the most relevant variables for the functional analysis:

# Drop the homogeneity indices and the amino-acid sequences; every other
# column of Pangenome is kept.
Pangenome <- select(
  Pangenome,
  -functional_homogeneity_index,
  -geometric_homogeneity_index,
  -combined_homogeneity_index,
  -aa_sequence
)

In the new variable “bins_PPanGGOLiN” we define “Persistent” and “Accessory” (as “shell” + “cloud”):

# Collapse the PPanGGOLiN partition into two bins: rows whose PPanGGOLiN value
# contains "persistent" become "Persistent", everything else is "Accessory".
Pangenome <- mutate(
  Pangenome,
  bins_PPanGGOLiN = ifelse(
    grepl("persistent", PPanGGOLiN),
    "Persistent",
    "Accessory"
  )
)

Number and percentage of Persistent vs Accessory:

# Number of distinct gene clusters that have at least one gene in each bin.
# which() drops NA comparisons, mirroring how filter() discards them.
vPersistent <- n_distinct(
  Pangenome$gene_cluster_id[which(Pangenome$bins_PPanGGOLiN == "Persistent")]
)
vAccesory <- n_distinct(
  Pangenome$gene_cluster_id[which(Pangenome$bins_PPanGGOLiN == "Accessory")]
)

# Share of each bin among all gene clusters, in percent with one decimal.
vPersistent.p <- round(100 * vPersistent / (vAccesory + vPersistent), 1)
vAccesory.p <- round(100 * vAccesory / (vAccesory + vPersistent), 1)

There are 916 gene clusters (GC) (31.5%) in the “Accessory” vs. 1991 (68.5%) in the “Persistent” at the pangenome level

2 COG Analysis at the Gene Level

We define a new variable COGs to use in the plots. This variable is based on COG20_CATEGORY but with a cleaner definition of unclassified, uninformative, or mixed assignments:

  • COG categories “Function Unknown” and “General function prediction only” were considered as “Uninformative”.
  • If the COG category is mixed (e.g., G|S|M) it gets labeled as “Ambiguous”.
  • Missing values (NA) are labeled as “Unclassified”.
# Build the plotting variable COGs from the per-gene COG category accessions.
Pangenome$COGs <- Pangenome$COG20_CATEGORY_ACC
# Single-letter categories "S" and "R" are both treated as "Uninformative".
Pangenome$COGs[Pangenome$COGs %in% c("S", "R")] <- "Uninformative"
# Any value containing a literal "|" lists several categories (e.g. G|S|M).
# This runs after the S/R step, so a mixed value that includes S or R is
# still labeled "Ambiguous".
Pangenome$COGs[grepl("|", Pangenome$COGs, fixed = TRUE)] <- "Ambiguous"
# Genes without any COG assignment: cover both the empty string and NA, so
# that missing values are labeled instead of silently staying NA.
Pangenome$COGs[is.na(Pangenome$COGs) | Pangenome$COGs == ""] <- "Unclassified"

Summary of COG annotated genes:

Genes Count Percentage
Total in Pangenome 17992 100.0
COG Category Uninformative = Function Unknown 587 3.3
COG Category Uninformative = General function prediction only 716 4.0
COG Category Ambiguous (Mixed COG Category) 1750 9.7
COG Category Unclassified (Non-assigned) 3829 21.3
Informative COGs (Total - Uninformative, Ambiguous & Unclassified) 11110 61.7

3 COG Analysis at the Gene Cluster Level

This analysis was done at the pangenomic gene cluster (GC) level. Since many gene clusters had mixed COG category assignments, each individual gene call was assigned to its corresponding Genome/bins_PPanGGOLiN/COG grouping, and its contribution was weighted by dividing its count by the number of genes in its GC, so that each GC contributes a total of 1.

3.1 GCs by COG Category and Genome

The table “GCsbyCOG_Genome” groups the genes by genome, then within each genome by “Accessory” vs. “Persistent” status, and within each status by COG category. However, instead of counting the elements in each group, we calculate the sum of 1/num_genes_in_gene_cluster.

# COGs must be a factor so that .drop = FALSE can keep empty categories.
Pangenome$COGs <- as.factor(Pangenome$COGs)

# Every gene contributes 1 / (size of its gene cluster) to its
# genome / bin / COG cell; combinations with no genes are kept with a sum of 0.
GCsbyCOG_Genome <- summarise(
  group_by(Pangenome, genome_name, bins_PPanGGOLiN, COGs, .drop = FALSE),
  num_corrected_genes = sum(1 / num_genes_in_gene_cluster)
)

The total sum of all values in the num_corrected_genes variable should add up to the number of GCs:

# Consistency check: the weighted counts must sum to the number of distinct
# gene clusters. The "##" lines are the recorded outputs.
sum(GCsbyCOG_Genome[["num_corrected_genes"]])
## [1] 2907
n_distinct(Pangenome$gene_cluster_id)
## [1] 2907

Adding extra column to label the gray scale portion of the plots:

# Assignment keeps the three gray-scale labels as they are and collapses every
# other COG category into "Informative".
GCsbyCOG_Genome <- mutate(
  GCsbyCOG_Genome,
  Assignment = ifelse(
    COGs == "Uninformative" | COGs == "Ambiguous" | COGs == "Unclassified",
    as.character(COGs),
    "Informative"
  )
)

3.1.1 Summary of COG-annotated GCs in the Accessory vs. Persistent

# Weighted gene-cluster totals for every bin / assignment combination.
TableGC <- summarize(
  group_by(GCsbyCOG_Genome, bins_PPanGGOLiN, Assignment),
  corrected_genes = sum(num_corrected_genes)
)

# Percentages are relative to the grand total over both bins, so the column is
# computed on the whole vector rather than per group.
TableGC$Percentages <- with(
  TableGC,
  round(100 * corrected_genes / sum(corrected_genes), 1)
)

kable(TableGC)
bins_PPanGGOLiN Assignment corrected_genes Percentages
Accessory Ambiguous 56.85152 2.0
Accessory Informative 326.01640 11.2
Accessory Unclassified 505.13209 17.4
Accessory Uninformative 28.00000 1.0
Persistent Ambiguous 200.12897 6.9
Persistent Informative 1264.16198 43.5
Persistent Unclassified 372.18128 12.8
Persistent Uninformative 154.52778 5.3

3.1.2 Summary of COG-annotated GCs in the Accessory

# Same summary as TableGC, restricted to the Accessory bin.
TableGCAccessory <- summarize(
  group_by(
    filter(GCsbyCOG_Genome, bins_PPanGGOLiN == "Accessory"),
    bins_PPanGGOLiN, Assignment
  ),
  corrected_genes = sum(num_corrected_genes)
)

# Percentages are relative to the Accessory total only.
TableGCAccessory$Percentages <- with(
  TableGCAccessory,
  round(100 * corrected_genes / sum(corrected_genes), 1)
)

kable(TableGCAccessory)
bins_PPanGGOLiN Assignment corrected_genes Percentages
Accessory Ambiguous 56.85152 6.2
Accessory Informative 326.01640 35.6
Accessory Unclassified 505.13209 55.1
Accessory Uninformative 28.00000 3.1

3.1.3 Summary of COG-annotated GCs in the Persistent

# Same summary as TableGC, restricted to the Persistent bin.
TableGCPersistent <- summarize(
  group_by(
    filter(GCsbyCOG_Genome, bins_PPanGGOLiN == "Persistent"),
    bins_PPanGGOLiN, Assignment
  ),
  corrected_genes = sum(num_corrected_genes)
)

# Percentages are relative to the Persistent total only.
TableGCPersistent$Percentages <- with(
  TableGCPersistent,
  round(100 * corrected_genes / sum(corrected_genes), 1)
)

kable(TableGCPersistent)
bins_PPanGGOLiN Assignment corrected_genes Percentages
Persistent Ambiguous 200.1290 10.1
Persistent Informative 1264.1620 63.5
Persistent Unclassified 372.1813 18.7
Persistent Uninformative 154.5278 7.8

3.1.4 Summary of COG-annotated GCs by Genome in the Accessory vs. Persistent

# Weighted gene-cluster total per genome, split by Accessory / Persistent.
TableGenomes <- summarize(
  group_by(GCsbyCOG_Genome, genome_name, bins_PPanGGOLiN),
  corrected_genes = sum(num_corrected_genes)
)

kable(TableGenomes)
genome_name bins_PPanGGOLiN corrected_genes
Ctu_BWA150 Accessory 105.99015
Ctu_BWA150 Persistent 249.37606
Ctu_BWA158 Accessory 153.98734
Ctu_BWA158 Persistent 247.80976
Ctu_BWA195 Accessory 134.31496
Ctu_BWA195 Persistent 249.24472
Ctu_BWA207 Accessory 84.64527
Ctu_BWA207 Persistent 248.42905
Ctu_DSM_44922 Accessory 101.66341
Ctu_DSM_44922 Persistent 250.77091
Ctu_DU074 Accessory 93.25170
Ctu_DU074 Persistent 248.90180
Ctu_KPL_3807 Accessory 117.20552
Ctu_KPL_3807 Persistent 247.90377
Ctu_SK141 Accessory 124.94164
Ctu_SK141 Persistent 248.56393

3.1.5 Renaming and ordering variables factor levels for plotting:

# Fix the bin order so that "Persistent" comes before "Accessory".
GCsbyCOG_Genome$bins_PPanGGOLiN <- factor(
  GCsbyCOG_Genome$bins_PPanGGOLiN,
  levels = c("Persistent", "Accessory")
)

# Replace the one-letter COG codes by their full names. The order of the
# arguments defines the order of the resulting ordered factor levels.
GCsbyCOG_Genome$COGs <- recode_factor(
  GCsbyCOG_Genome$COGs,
  "Q" = "Secondary metabolites biosynthesis, transport, and catabolism",
  "P" = "Inorganic ion transport and metabolism",
  "I" = "Lipid transport and metabolism",
  "H" = "Coenzyme transport and metabolism",
  "G" = "Carbohydrate transport and metabolism",
  "F" = "Nucleotide transport and metabolism",
  "E" = "Amino acid transport and metabolism",
  "C" = "Energy production and conversion",
  "X" = "Mobilome: prophages, transposons",
  "L" = "Replication, recombination and repair",
  "K" = "Transcription",
  "J" = "Translation, ribosomal structure and biogenesis",
  "V" = "Defense mechanisms",
  "U" = "Intracellular trafficking, secretion, and vesicular transport",
  "T" = "Signal transduction mechanisms",
  "O" = "Post-translational modification, protein turnover, and chaperones",
  "N" = "Cell Motility",
  "M" = "Cell wall/membrane/envelope biogenesis",
  "D" = "Cell cycle control, cell division, chromosome partitioning",
  "A" = "RNA processing and modification",
  "W" = "Extracellular structures",
  "Uninformative" = "Uninformative",
  "Ambiguous" = "Ambiguous",
  "Unclassified" = "Unclassified",
  .ordered = TRUE
)

# "Informative" is relabeled to a single blank so it carries no visible label.
GCsbyCOG_Genome$Assignment <- recode_factor(
  GCsbyCOG_Genome$Assignment,
  "Informative" = " ",
  "Uninformative" = "Uninformative",
  "Ambiguous" = "Ambiguous",
  "Unclassified" = "Unclassified",
  .ordered = TRUE
)

3.2 GCs by COG Category

The table “GCsbyCOG” groups the genes by “Accessory” vs. “Persistent” status, and within each status by COG category.

# Weighted gene-cluster count per bin and COG category, pooled over genomes:
# each gene adds 1 / (size of its gene cluster).
GCsbyCOG <- summarise(
  group_by(Pangenome, bins_PPanGGOLiN, COGs),
  num_corrected_genes = sum(1 / num_genes_in_gene_cluster)
)

3.2.1 Renaming and ordering variables factor levels for plotting:

# Replace the one-letter COG codes by their full names. The named vector is
# spliced into recode_factor() with !!!, so its element order defines the
# order of the resulting ordered factor levels.
GCsbyCOG$COGs <- recode_factor(
  GCsbyCOG$COGs,
  !!!c(
    Q = "Secondary metabolites biosynthesis, transport, and catabolism",
    P = "Inorganic ion transport and metabolism",
    I = "Lipid transport and metabolism",
    H = "Coenzyme transport and metabolism",
    G = "Carbohydrate transport and metabolism",
    F = "Nucleotide transport and metabolism",
    E = "Amino acid transport and metabolism",
    C = "Energy production and conversion",
    X = "Mobilome: prophages, transposons",
    L = "Replication, recombination and repair",
    K = "Transcription",
    J = "Translation, ribosomal structure and biogenesis",
    V = "Defense mechanisms",
    U = "Intracellular trafficking, secretion, and vesicular transport",
    T = "Signal transduction mechanisms",
    O = "Post-translational modification, protein turnover, and chaperones",
    N = "Cell Motility",
    M = "Cell wall/membrane/envelope biogenesis",
    D = "Cell cycle control, cell division, chromosome partitioning",
    A = "RNA processing and modification",
    W = "Extracellular structures",
    Uninformative = "Uninformative",
    Ambiguous = "Ambiguous",
    Unclassified = "Unclassified"
  ),
  .ordered = TRUE
)

3.2.2 Summary of COG-annotated GCs by COG Category:

New table “GCsbyCOG_PervsAcc” in wide format. % of each category relative to the “Accessory” or “Persistent” was calculated (pTotal. variables). Total GCs for each COG category calculated, and % of GCs in the “Accessory” and “Persistent” relative to each category (p. values) were calculated as well. The ratio between the number of GC in the “Accessory” vs. the “Persistent” is calculated for each COG:

# Wide format: one row per COG category, one column per bin; category / bin
# combinations that do not occur are filled with 0.
GCsbyCOG_PervsAcc <- GCsbyCOG %>%
  spread(key = bins_PPanGGOLiN, value = num_corrected_genes, fill = 0)

# pTotal.*: share of each category within the Accessory / Persistent bin (%).
GCsbyCOG_PervsAcc$pTotal.Accessory <- with(
  GCsbyCOG_PervsAcc, round(100 * Accessory / sum(Accessory), 1)
)
GCsbyCOG_PervsAcc$pTotal.Persistent <- with(
  GCsbyCOG_PervsAcc, round(100 * Persistent / sum(Persistent), 1)
)

# total: GCs per category over both bins, and its share of all GCs (%).
GCsbyCOG_PervsAcc$total <- with(GCsbyCOG_PervsAcc, Accessory + Persistent)
GCsbyCOG_PervsAcc$pTotal.total <- with(
  GCsbyCOG_PervsAcc, round(100 * total / sum(total), 1)
)

# p.*: within each category, the share found in each bin (%).
GCsbyCOG_PervsAcc$p.accessory <- with(
  GCsbyCOG_PervsAcc, round(100 * (Accessory / total), 1)
)
GCsbyCOG_PervsAcc$p.Persistent <- with(
  GCsbyCOG_PervsAcc, round(100 * (Persistent / total), 1)
)

# ratio: Accessory over Persistent GCs for each category.
GCsbyCOG_PervsAcc$ratio <- with(
  GCsbyCOG_PervsAcc, round(Accessory / Persistent, 2)
)

kable(GCsbyCOG_PervsAcc)
COGs Accessory Persistent pTotal.Accessory pTotal.Persistent total pTotal.total p.accessory p.Persistent ratio
Secondary metabolites biosynthesis, transport, and catabolism 1.000000 15.571429 0.1 0.8 16.57143 0.6 6.0 94.0 0.06
Inorganic ion transport and metabolism 44.750000 80.000000 4.9 4.0 124.75000 4.3 35.9 64.1 0.56
Lipid transport and metabolism 11.333333 60.888889 1.2 3.1 72.22222 2.5 15.7 84.3 0.19
Coenzyme transport and metabolism 9.000000 85.125000 1.0 4.3 94.12500 3.2 9.6 90.4 0.11
Carbohydrate transport and metabolism 24.000000 101.000000 2.6 5.1 125.00000 4.3 19.2 80.8 0.24
Nucleotide transport and metabolism 1.000000 62.750000 0.1 3.2 63.75000 2.2 1.6 98.4 0.02
Amino acid transport and metabolism 14.925000 130.000000 1.6 6.5 144.92500 5.0 10.3 89.7 0.11
Energy production and conversion 6.000000 66.888889 0.7 3.4 72.88889 2.5 8.2 91.8 0.09
Mobilome: prophages, transposons 33.527112 7.958333 3.7 0.4 41.48545 1.4 80.8 19.2 4.21
Replication, recombination and repair 29.666667 85.767857 3.2 4.3 115.43452 4.0 25.7 74.3 0.35
Transcription 36.266667 94.250000 4.0 4.7 130.51667 4.5 27.8 72.2 0.38
Translation, ribosomal structure and biogenesis 6.500000 178.000000 0.7 8.9 184.50000 6.3 3.5 96.5 0.04
Defense mechanisms 48.714286 48.410714 5.3 2.4 97.12500 3.3 50.2 49.8 1.01
Intracellular trafficking, secretion, and vesicular transport 0.000000 16.000000 0.0 0.8 16.00000 0.6 0.0 100.0 0.00
Signal transduction mechanisms 6.500000 46.250000 0.7 2.3 52.75000 1.8 12.3 87.7 0.14
Post-translational modification, protein turnover, and chaperones 9.666667 74.553571 1.1 3.7 84.22024 2.9 11.5 88.5 0.13
Cell Motility 0.000000 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00
Cell wall/membrane/envelope biogenesis 40.166667 87.566558 4.4 4.4 127.73323 4.4 31.4 68.6 0.46
Cell cycle control, cell division, chromosome partitioning 3.000000 20.180736 0.3 1.0 23.18074 0.8 12.9 87.1 0.15
RNA processing and modification 0.000000 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00
Extracellular structures 0.000000 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00
Uninformative 28.000000 154.527778 3.1 7.8 182.52778 6.3 15.3 84.7 0.18
Ambiguous 56.851515 200.128968 6.2 10.1 256.98048 8.8 22.1 77.9 0.28
Unclassified 505.132087 372.181277 55.1 18.7 877.31336 30.2 57.6 42.4 1.36

4 Plots

Color Palettes

# Interpolating palette built from 8 colors of the Brewer "Set1" scheme.
getPalette <- colorRampPalette(brewer.pal(8, "Set1"))
# Number of distinct COG labels, including the three gray-scale ones.
CountTotalCOGs <- n_distinct(GCsbyCOG_Genome$COGs)

# One color for each COG label other than the three gray-scale ones.
palette2 <- getPalette(CountTotalCOGs - 3) # Colors
# The same colors preceded by three grays.
palette1 <- c("grey60", "grey40", "grey20", palette2) # Grays + Colors
palette3 <- c("grey60", "grey40", "grey20", "white") # Grays + White

4.1 Plots Accessory vs. Persistent

Panel A in main figure:

# Panel A: stacked bars with the total number of gene clusters per bin,
# colored by COG category (fill order reversed).
pA <- ggplot(
  GCsbyCOG_Genome,
  aes(x = bins_PPanGGOLiN, y = num_corrected_genes, fill = fct_rev(COGs))
) +
  # Sum the per-genome values into one stacked bar per bin.
  stat_summary(fun = sum, geom = "bar", position = "stack") +
  scale_x_discrete(labels = c("Persistent", "Accessory")) +
  scale_fill_manual(values = palette1) +
  scale_y_continuous(expand = c(0, 0), breaks = seq(0, 2250, by = 250)) +
  labs(fill = "COG Categories", x = " ", y = "Number of Gene Clusters") +
  theme_classic() +
  theme(
    axis.title = element_text(size = 9),
    axis.text = element_text(size = 7),
    plot.margin = unit(c(10, 0, 10, 20), "pt"),
    legend.position = "none"
  )
pA

4.2 Plots by Genome (Accessory)

Panel A in supplemental figure:

# Supplemental panel A: Accessory gene clusters per genome, stacked by COG
# category (all categories, including the gray-scale ones).
pAS <- GCsbyCOG_Genome %>%
  filter(bins_PPanGGOLiN == "Accessory") %>%
  ggplot(aes(x = genome_name, y = num_corrected_genes, fill = fct_rev(COGs))) +
  stat_summary(fun = sum, geom = "bar", position = "stack") +
  scale_fill_manual(values = palette1) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(fill = "COG Assignment", x = "", y = "Number of Gene Clusters") +
  theme_classic() +
  theme(
    axis.text.y = element_text(size = 7),
    axis.text.x = element_text(size = 8, angle = 75, hjust = 1),
    legend.position = "none",
    plot.margin = unit(c(15, 15, -10, 20), "pt")
  )
pAS

Panel B in supplemental figure:

# Supplemental panel B: like panel A, but only the informative COG categories
# of the Accessory bin are shown, with the legend placed below the plot.
pBS <- GCsbyCOG_Genome %>%
  filter(
    COGs != "Uninformative",
    COGs != "Ambiguous",
    COGs != "Unclassified",
    bins_PPanGGOLiN == "Accessory"
  ) %>%
  ggplot(aes(x = genome_name, y = num_corrected_genes, fill = fct_rev(COGs))) +
  stat_summary(fun = sum, geom = "bar", position = "stack") +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_manual(values = palette2) +
  labs(fill = "COG Categories", x = "", y = "Number of Informative Gene Clusters") +
  theme_classic() +
  theme(
    axis.text.y = element_text(size = 7),
    axis.text.x = element_text(size = 8, angle = 75, hjust = 1),
    legend.position = "bottom",
    legend.key.size = unit(0.7, "line"),
    legend.text = element_text(size = 8),
    plot.margin = unit(c(0, 15, 0, 20), "pt")
  ) +
  guides(fill = guide_legend(ncol = 2, title.position = "top", title.hjust = 0.5))
pBS

4.3 Plots by COG Category

In order to represent the Persistent on the left of the plot with absolute values per COG category we create per.neg; a negative version of the persistent variable in GCsbyCOG_PervsAcc. Table converted to the long format for plotting.

# Negated Persistent counts, so that they are plotted on the negative side of
# the axis.
GCsbyCOG_PervsAcc$per.neg <- -GCsbyCOG_PervsAcc$Persistent

# Long format: per.neg and Accessory are stacked into the value column
# "plotting", with their column name stored in "bins_PPanGGOLiN".
GCsbyCOG_PervsAccLong <- GCsbyCOG_PervsAcc %>%
  gather(key = "bins_PPanGGOLiN", value = "plotting", per.neg, Accessory)
kable(GCsbyCOG_PervsAccLong)
COGs Persistent pTotal.Accessory pTotal.Persistent total pTotal.total p.accessory p.Persistent ratio bins_PPanGGOLiN plotting
Secondary metabolites biosynthesis, transport, and catabolism 15.571429 0.1 0.8 16.57143 0.6 6.0 94.0 0.06 per.neg -15.571429
Inorganic ion transport and metabolism 80.000000 4.9 4.0 124.75000 4.3 35.9 64.1 0.56 per.neg -80.000000
Lipid transport and metabolism 60.888889 1.2 3.1 72.22222 2.5 15.7 84.3 0.19 per.neg -60.888889
Coenzyme transport and metabolism 85.125000 1.0 4.3 94.12500 3.2 9.6 90.4 0.11 per.neg -85.125000
Carbohydrate transport and metabolism 101.000000 2.6 5.1 125.00000 4.3 19.2 80.8 0.24 per.neg -101.000000
Nucleotide transport and metabolism 62.750000 0.1 3.2 63.75000 2.2 1.6 98.4 0.02 per.neg -62.750000
Amino acid transport and metabolism 130.000000 1.6 6.5 144.92500 5.0 10.3 89.7 0.11 per.neg -130.000000
Energy production and conversion 66.888889 0.7 3.4 72.88889 2.5 8.2 91.8 0.09 per.neg -66.888889
Mobilome: prophages, transposons 7.958333 3.7 0.4 41.48545 1.4 80.8 19.2 4.21 per.neg -7.958333
Replication, recombination and repair 85.767857 3.2 4.3 115.43452 4.0 25.7 74.3 0.35 per.neg -85.767857
Transcription 94.250000 4.0 4.7 130.51667 4.5 27.8 72.2 0.38 per.neg -94.250000
Translation, ribosomal structure and biogenesis 178.000000 0.7 8.9 184.50000 6.3 3.5 96.5 0.04 per.neg -178.000000
Defense mechanisms 48.410714 5.3 2.4 97.12500 3.3 50.2 49.8 1.01 per.neg -48.410714
Intracellular trafficking, secretion, and vesicular transport 16.000000 0.0 0.8 16.00000 0.6 0.0 100.0 0.00 per.neg -16.000000
Signal transduction mechanisms 46.250000 0.7 2.3 52.75000 1.8 12.3 87.7 0.14 per.neg -46.250000
Post-translational modification, protein turnover, and chaperones 74.553571 1.1 3.7 84.22024 2.9 11.5 88.5 0.13 per.neg -74.553571
Cell Motility 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 per.neg -1.000000
Cell wall/membrane/envelope biogenesis 87.566558 4.4 4.4 127.73323 4.4 31.4 68.6 0.46 per.neg -87.566558
Cell cycle control, cell division, chromosome partitioning 20.180736 0.3 1.0 23.18074 0.8 12.9 87.1 0.15 per.neg -20.180736
RNA processing and modification 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 per.neg -1.000000
Extracellular structures 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 per.neg -1.000000
Uninformative 154.527778 3.1 7.8 182.52778 6.3 15.3 84.7 0.18 per.neg -154.527778
Ambiguous 200.128968 6.2 10.1 256.98048 8.8 22.1 77.9 0.28 per.neg -200.128968
Unclassified 372.181277 55.1 18.7 877.31336 30.2 57.6 42.4 1.36 per.neg -372.181277
Secondary metabolites biosynthesis, transport, and catabolism 15.571429 0.1 0.8 16.57143 0.6 6.0 94.0 0.06 Accessory 1.000000
Inorganic ion transport and metabolism 80.000000 4.9 4.0 124.75000 4.3 35.9 64.1 0.56 Accessory 44.750000
Lipid transport and metabolism 60.888889 1.2 3.1 72.22222 2.5 15.7 84.3 0.19 Accessory 11.333333
Coenzyme transport and metabolism 85.125000 1.0 4.3 94.12500 3.2 9.6 90.4 0.11 Accessory 9.000000
Carbohydrate transport and metabolism 101.000000 2.6 5.1 125.00000 4.3 19.2 80.8 0.24 Accessory 24.000000
Nucleotide transport and metabolism 62.750000 0.1 3.2 63.75000 2.2 1.6 98.4 0.02 Accessory 1.000000
Amino acid transport and metabolism 130.000000 1.6 6.5 144.92500 5.0 10.3 89.7 0.11 Accessory 14.925000
Energy production and conversion 66.888889 0.7 3.4 72.88889 2.5 8.2 91.8 0.09 Accessory 6.000000
Mobilome: prophages, transposons 7.958333 3.7 0.4 41.48545 1.4 80.8 19.2 4.21 Accessory 33.527112
Replication, recombination and repair 85.767857 3.2 4.3 115.43452 4.0 25.7 74.3 0.35 Accessory 29.666667
Transcription 94.250000 4.0 4.7 130.51667 4.5 27.8 72.2 0.38 Accessory 36.266667
Translation, ribosomal structure and biogenesis 178.000000 0.7 8.9 184.50000 6.3 3.5 96.5 0.04 Accessory 6.500000
Defense mechanisms 48.410714 5.3 2.4 97.12500 3.3 50.2 49.8 1.01 Accessory 48.714286
Intracellular trafficking, secretion, and vesicular transport 16.000000 0.0 0.8 16.00000 0.6 0.0 100.0 0.00 Accessory 0.000000
Signal transduction mechanisms 46.250000 0.7 2.3 52.75000 1.8 12.3 87.7 0.14 Accessory 6.500000
Post-translational modification, protein turnover, and chaperones 74.553571 1.1 3.7 84.22024 2.9 11.5 88.5 0.13 Accessory 9.666667
Cell Motility 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 Accessory 0.000000
Cell wall/membrane/envelope biogenesis 87.566558 4.4 4.4 127.73323 4.4 31.4 68.6 0.46 Accessory 40.166667
Cell cycle control, cell division, chromosome partitioning 20.180736 0.3 1.0 23.18074 0.8 12.9 87.1 0.15 Accessory 3.000000
RNA processing and modification 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 Accessory 0.000000
Extracellular structures 1.000000 0.0 0.1 1.00000 0.0 0.0 100.0 0.00 Accessory 0.000000
Uninformative 154.527778 3.1 7.8 182.52778 6.3 15.3 84.7 0.18 Accessory 28.000000
Ambiguous 200.128968 6.2 10.1 256.98048 8.8 22.1 77.9 0.28 Accessory 56.851515
Unclassified 372.181277 55.1 18.7 877.31336 30.2 57.6 42.4 1.36 Accessory 505.132087

Panel B in main figure:

# Panel B: mirrored horizontal bars per informative COG category, with the
# negated Persistent counts (per.neg) on one side of zero and the Accessory
# counts on the other.
pB <- ggplot(
  filter(
    GCsbyCOG_PervsAccLong,
    COGs != "Uninformative", COGs != "Ambiguous", COGs != "Unclassified"
  ),
  aes(x = COGs, y = plotting, fill = COGs)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = rev(palette2)) +
  scale_x_discrete(position = "top") +
  labs(x = "", y = "Number of Gene Clusters") +
  coord_flip() +
  # Tick labels show absolute counts on both sides of zero.
  # NOTE(review): values outside the +/-200 limits would be dropped by ggplot2.
  scale_y_continuous(
    limits = c(-200, 200),
    breaks = c(-150, -100, -50, 0, 50, 100, 150),
    labels = c(150, 100, 50, 0, 50, 100, 150)
  ) +
  # Fixed decorations are added with annotate() so each is drawn exactly once.
  # As constants inside aes() they were repeated (and overplotted) for every
  # row of the plot data.
  annotate(
    "segment",
    x = 0, xend = 19.5, y = 0, yend = 0,
    linetype = 3, size = 0.1
  ) +
  annotate(
    "label",
    x = 22.5, y = -95, label = "      Persistent       ",
    fontface = "bold", size = 3, fill = "grey90",
    label.size = NA, label.padding = unit(0.3, "lines")
  ) +
  annotate(
    "label",
    x = 22.5, y = 95, label = "     Accessory      ",
    fontface = "bold", size = 3, fill = "grey90",
    label.size = NA, label.padding = unit(0.3, "lines")
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(size = 9),
    axis.text.x = element_text(size = 7),
    axis.ticks.y = element_blank(),
    axis.line.y = element_blank(),
    legend.position = "none",
    plot.margin = unit(c(5, 10, 10, 25), "pt"),
    plot.title = element_text(face = "bold", hjust = 3, vjust = -3.9)
  )

# Disable clipping on the panel; presumably this keeps the two header labels
# at x = 22.5 visible outside the panel area.
gpB <- ggplotGrob(pB)
gpB$layout$clip[gpB$layout$name == "panel"] <- "off"
ggarrange(gpB, labels = "COG Categories", label.x = 0.5, vjust = 1.1)