RNASeq

library(tidyverse)
library(readxl)
library(ggtext)
library(magrittr)
library(DESeq2)

Data Input and Selection

File Paths

# Folder paths
input_path <- "data/input_data/RNASeq/"
metadata_path <- "data/metadata/RNASeq"

# Create subfolders for output files
dataframes_folder <- "data/dataframes"
if (!file.exists("data/dataframes")) {
  dir.create("data/dataframes", recursive = TRUE)
}
outputs_folder <- "data/outputs/RNASeq"
if (!file.exists("data/outputs/RNASeq")) {
  dir.create("data/outputs/RNASeq", recursive = TRUE)
}

# Load data and metadata
metadata <- read_excel(file.path(metadata_path, "hnoseq3437-metadata.xlsx")) 
counts <- read_csv(file.path(input_path, "hnoseq3437-hisat2.featureCounts-genes2-2batches.csv"))

Data clean-up

# Clean up unneeded columns and transform all columns into factors
metadata <- metadata %>%
  select(-contains(c('_func','original_name'))) %>%
  mutate_all(factor)

# Select only the rows with "protein_coding" as the GeneBiotype and transform in a matrix with GeneID as the rownames. Sort by GeneID
counts <- counts %>%
  filter(GeneBiotype == "protein_coding") %>% 
  column_to_rownames(var = "GeneID") %>%
  select(-contains('Gene')) %>% 
  .[ order(names(.)) ]

# Subset metadata to include only the samples relevant to this analysis 
subset.metadata <- metadata %>% 
  subset(line != 'HVO262') %>% # remove HVO262 samples
  subset(temp != 'CT') %>%  # remove CONTROL samples
  subset(temp != '34') %>%  # remove 34C samples
  droplevels() %>% 
  #mutate(temp = factor(temp, labels = c("37 °C", "37 °C → 34 °C")))  %>% 
  mutate(temp = factor(temp, labels = c("<b><span style='color:red3;'>37 °C</span></b>", 
                                        "<b><span style='color:#5b5b5b;'>37 °C → 34 °C</span></b>")))  %>%
  mutate(line = factor(line, levels = c("HNO918", "HNO204", "HNO919")))  %>% 
  set_rownames(.$sampleid)

# Keep counts column only for the selected samples 
subset.counts <- counts %>% 
  select(contains(rownames(subset.metadata))) 

# Make the correesponding design matrix 
coldata <- data.frame(subset.metadata$line) %>%
  set_rownames(rownames(subset.metadata)) %>%
  setNames('condition')

Saving files

# Save data frames as CSV files in the dataframes folder
write.csv(subset.counts, file.path(dataframes_folder, "RNASeq_counts.csv"), row.names = TRUE)

# Save data frames as R objects in the dataframes folder
saveRDS(subset.counts, file.path(dataframes_folder, "RNASeq_counts.rds"))

# Use this to read the final objects
subset.counts <- readRDS("data/dataframes/RNASeq_counts.rds")

DESeq

# Run DESeq
dds01.cnts <- DESeqDataSetFromMatrix(subset.counts, colData = coldata, design = ~ condition)
dds02.dseq <- DESeq(dds01.cnts)
dds03.resu <- results(dds02.dseq)
dds04.vstf <- vst(dds02.dseq,  blind = FALSE)

# Get PCA dataframe and percentVar
pcadata <- plotPCA(dds04.vstf, returnData = TRUE) 
pcadata <- left_join(pcadata, subset.metadata, by = c("name" = "sampleid"))

Saving files

# Save data frames as CSV files in the output folder
write_csv(pcadata, file.path(outputs_folder, "PCA_values.csv"))

# Save data frames as R objects in the output folder
saveRDS(pcadata, file.path(outputs_folder, "PCA_values.rds"))

# Cleaning-up all objects from the environment
rm(list = ls())

# Use this to read the final objects
pcadata <- readRDS("data/outputs/RNASeq/PCA_values.rds")

PCA Plot

File Paths

# Folder paths
outputs_folder <- "data/outputs/RNASeq"

# Load data 
pcadata <- readRDS("data/outputs/RNASeq/PCA_values.rds")

Plot

percentVar = round(100 * attr(pcadata, "percentVar"))

PCAplot = ggplot(pcadata, aes(x = PC1, y = PC2, group = condition)) +
  geom_point(aes(fill = temp, color = temp, shape = line), alpha = 0.75, size = 3, show.legend = c(shape = FALSE)) + 
  scale_fill_manual(values = c('red3', '#5b5b5b')) +
  scale_color_manual(values = c('red3', '#5b5b5b')) +
  scale_shape_manual(values = c(21, 22, 24)) +
  labs(x = paste0("PC1: ", percentVar[1], "% variance"),
       y = paste0("PC2: ", percentVar[2], "% variance"),
       fill = "Temperature", color = "Temperature") +
  #coord_fixed() +
  theme_bw() +
  theme(panel.grid = element_blank(),
        text = element_text(size = 20),
        legend.text = element_markdown(),
        axis.text.x = element_text(color = "black"),
        axis.text.y = element_text(color = "black")) +
  guides(fill = guide_legend(override.aes = list(shape = 21))) 

PCAplot

Saving files

ggsave(PCAplot, filename = paste0(outputs_folder, "/plotPCA.png"), width = 6, height = 6)
saveRDS(PCAplot, file.path(outputs_folder, paste0("plotPCA.rds")))

--- execute: message: FALSE warning: FALSE --- # RNASeq {.unnumbered} ```{r} library(tidyverse) library(readxl) library(ggtext) library(magrittr) library(DESeq2) ``` ## Data Input and Selection ### File Paths ```{r} # Folder paths input_path <- "data/input_data/RNASeq/" metadata_path <- "data/metadata/RNASeq" # Create subfolders for output files dataframes_folder <- "data/dataframes" if (!file.exists("data/dataframes")) { dir.create("data/dataframes", recursive = TRUE) } outputs_folder <- "data/outputs/RNASeq" if (!file.exists("data/outputs/RNASeq")) { dir.create("data/outputs/RNASeq", recursive = TRUE) } # Load data and metadata metadata <- read_excel(file.path(metadata_path, "hnoseq3437-metadata.xlsx")) counts <- read_csv(file.path(input_path, "hnoseq3437-hisat2.featureCounts-genes2-2batches.csv")) ``` ### Data clean-up ```{r} # Clean up unneeded columns and transform all columns into factors metadata <- metadata %>% select(-contains(c('_func','original_name'))) %>% mutate_all(factor) # Select only the rows with "protein_coding" as the GeneBiotype and transform in a matrix with GeneID as the rownames. Sort by GeneID counts <- counts %>% filter(GeneBiotype == "protein_coding") %>% column_to_rownames(var = "GeneID") %>% select(-contains('Gene')) %>% .[ order(names(.)) ] ``` ```{r} # Subset metadata to include only the samples relevant to this analysis subset.metadata <- metadata %>% subset(line != 'HVO262') %>% # remove HVO262 samples subset(temp != 'CT') %>% # remove CONTROL samples subset(temp != '34') %>% # remove 34C samples droplevels() %>% #mutate(temp = factor(temp, labels = c("37 °C", "37 °C → 34 °C"))) %>% mutate(temp = factor(temp, labels = c("37 °C", "37 °C → 34 °C"))) %>% mutate(line = factor(line, levels = c("HNO918", "HNO204", "HNO919"))) %>% set_rownames(.$sampleid) # Keep counts column only for the selected samples subset.counts <- counts %>% select(contains(rownames(subset.metadata))) # Make the correesponding design matrix coldata <- data.frame(subset.metadata$line) %>% set_rownames(rownames(subset.metadata)) %>% setNames('condition') ``` ### Saving files ```{r} # Save data frames as CSV files in the dataframes folder write.csv(subset.counts, file.path(dataframes_folder, "RNASeq_counts.csv"), row.names = TRUE) # Save data frames as R objects in the dataframes folder saveRDS(subset.counts, file.path(dataframes_folder, "RNASeq_counts.rds")) # Use this to read the final objects subset.counts <- readRDS("data/dataframes/RNASeq_counts.rds") ``` ## DESeq ```{r} # Run DESeq dds01.cnts <- DESeqDataSetFromMatrix(subset.counts, colData = coldata, design = ~ condition) dds02.dseq <- DESeq(dds01.cnts) dds03.resu <- results(dds02.dseq) dds04.vstf <- vst(dds02.dseq, blind = FALSE) ``` ```{r} # Get PCA dataframe and percentVar pcadata <- plotPCA(dds04.vstf, returnData = TRUE) pcadata <- left_join(pcadata, subset.metadata, by = c("name" = "sampleid")) ``` ### Saving files ```{r} # Save data frames as CSV files in the output folder write_csv(pcadata, file.path(outputs_folder, "PCA_values.csv")) # Save data frames as R objects in the output folder saveRDS(pcadata, file.path(outputs_folder, "PCA_values.rds")) # Cleaning-up all objects from the environment rm(list = ls()) # Use this to read the final objects pcadata <- readRDS("data/outputs/RNASeq/PCA_values.rds") ``` ## PCA Plot ### File Paths ```{r} # Folder paths outputs_folder <- "data/outputs/RNASeq" # Load data pcadata <- readRDS("data/outputs/RNASeq/PCA_values.rds") ``` ### Plot ```{r} percentVar = round(100 * attr(pcadata, "percentVar")) ``` ```{r} PCAplot = ggplot(pcadata, aes(x = PC1, y = PC2, group = condition)) + geom_point(aes(fill = temp, color = temp, shape = line), alpha = 0.75, size = 3, show.legend = c(shape = FALSE)) + scale_fill_manual(values = c('red3', '#5b5b5b')) + scale_color_manual(values = c('red3', '#5b5b5b')) + scale_shape_manual(values = c(21, 22, 24)) + labs(x = paste0("PC1: ", percentVar[1], "% variance"), y = paste0("PC2: ", percentVar[2], "% variance"), fill = "Temperature", color = "Temperature") + #coord_fixed() + theme_bw() + theme(panel.grid = element_blank(), text = element_text(size = 20), legend.text = element_markdown(), axis.text.x = element_text(color = "black"), axis.text.y = element_text(color = "black")) + guides(fill = guide_legend(override.aes = list(shape = 21))) PCAplot ``` ### Saving files ```{r} ggsave(PCAplot, filename = paste0(outputs_folder, "/plotPCA.png"), width = 6, height = 6) saveRDS(PCAplot, file.path(outputs_folder, paste0("plotPCA.rds"))) ```