# Folder pathsinput_path <-"data/input_data/RNASeq/"metadata_path <-"data/metadata/RNASeq"# Create subfolders for output filesdataframes_folder <-"data/dataframes"if (!file.exists("data/dataframes")) {dir.create("data/dataframes", recursive =TRUE)}outputs_folder <-"data/outputs/RNASeq"if (!file.exists("data/outputs/RNASeq")) {dir.create("data/outputs/RNASeq", recursive =TRUE)}# Load data and metadatametadata <-read_excel(file.path(metadata_path, "hnoseq3437-metadata.xlsx")) counts <-read_csv(file.path(input_path, "hnoseq3437-hisat2.featureCounts-genes2-2batches.csv"))
Data clean-up
# Clean up unneeded columns and transform all columns into factorsmetadata <- metadata %>%select(-contains(c('_func','original_name'))) %>%mutate_all(factor)# Select only the rows with "protein_coding" as the GeneBiotype and transform in a matrix with GeneID as the rownames. Sort by GeneIDcounts <- counts %>%filter(GeneBiotype =="protein_coding") %>%column_to_rownames(var ="GeneID") %>%select(-contains('Gene')) %>% .[ order(names(.)) ]
# Subset metadata to include only the samples relevant to this analysis subset.metadata <- metadata %>%subset(line !='HVO262') %>%# remove HVO262 samplessubset(temp !='CT') %>%# remove CONTROL samplessubset(temp !='34') %>%# remove 34C samplesdroplevels() %>%#mutate(temp = factor(temp, labels = c("37 °C", "37 °C → 34 °C"))) %>% mutate(temp =factor(temp, labels =c("<b><span style='color:red3;'>37 °C</span></b>", "<b><span style='color:#5b5b5b;'>37 °C → 34 °C</span></b>"))) %>%mutate(line =factor(line, levels =c("HNO918", "HNO204", "HNO919"))) %>%set_rownames(.$sampleid)# Keep counts column only for the selected samples subset.counts <- counts %>%select(contains(rownames(subset.metadata))) # Make the correesponding design matrix coldata <-data.frame(subset.metadata$line) %>%set_rownames(rownames(subset.metadata)) %>%setNames('condition')
Saving files
# Save data frames as CSV files in the dataframes folderwrite.csv(subset.counts, file.path(dataframes_folder, "RNASeq_counts.csv"), row.names =TRUE)# Save data frames as R objects in the dataframes foldersaveRDS(subset.counts, file.path(dataframes_folder, "RNASeq_counts.rds"))# Use this to read the final objectssubset.counts <-readRDS("data/dataframes/RNASeq_counts.rds")
# Get PCA dataframe and percentVarpcadata <-plotPCA(dds04.vstf, returnData =TRUE) pcadata <-left_join(pcadata, subset.metadata, by =c("name"="sampleid"))
Saving files
# Save data frames as CSV files in the output folderwrite_csv(pcadata, file.path(outputs_folder, "PCA_values.csv"))# Save data frames as R objects in the output foldersaveRDS(pcadata, file.path(outputs_folder, "PCA_values.rds"))# Cleaning-up all objects from the environmentrm(list =ls())# Use this to read the final objectspcadata <-readRDS("data/outputs/RNASeq/PCA_values.rds")
PCA Plot
File Paths
# Folder pathsoutputs_folder <-"data/outputs/RNASeq"# Load data pcadata <-readRDS("data/outputs/RNASeq/PCA_values.rds")
---execute: message: FALSE warning: FALSE---# RNASeq {.unnumbered}```{r}library(tidyverse)library(readxl)library(ggtext)library(magrittr)library(DESeq2)```## Data Input and Selection### File Paths```{r}# Folder pathsinput_path <-"data/input_data/RNASeq/"metadata_path <-"data/metadata/RNASeq"# Create subfolders for output filesdataframes_folder <-"data/dataframes"if (!file.exists("data/dataframes")) {dir.create("data/dataframes", recursive =TRUE)}outputs_folder <-"data/outputs/RNASeq"if (!file.exists("data/outputs/RNASeq")) {dir.create("data/outputs/RNASeq", recursive =TRUE)}# Load data and metadatametadata <-read_excel(file.path(metadata_path, "hnoseq3437-metadata.xlsx")) counts <-read_csv(file.path(input_path, "hnoseq3437-hisat2.featureCounts-genes2-2batches.csv"))```### Data clean-up```{r}# Clean up unneeded columns and transform all columns into factorsmetadata <- metadata %>%select(-contains(c('_func','original_name'))) %>%mutate_all(factor)# Select only the rows with "protein_coding" as the GeneBiotype and transform in a matrix with GeneID as the rownames. Sort by GeneIDcounts <- counts %>%filter(GeneBiotype =="protein_coding") %>%column_to_rownames(var ="GeneID") %>%select(-contains('Gene')) %>% .[ order(names(.)) ]``````{r}# Subset metadata to include only the samples relevant to this analysis subset.metadata <- metadata %>%subset(line !='HVO262') %>%# remove HVO262 samplessubset(temp !='CT') %>%# remove CONTROL samplessubset(temp !='34') %>%# remove 34C samplesdroplevels() %>%#mutate(temp = factor(temp, labels = c("37 °C", "37 °C → 34 °C"))) %>% mutate(temp =factor(temp, labels =c("<b><span style='color:red3;'>37 °C</span></b>", "<b><span style='color:#5b5b5b;'>37 °C → 34 °C</span></b>"))) %>%mutate(line =factor(line, levels =c("HNO918", "HNO204", "HNO919"))) %>%set_rownames(.$sampleid)# Keep counts column only for the selected samples subset.counts <- counts %>%select(contains(rownames(subset.metadata))) # Make the correesponding design matrix coldata <-data.frame(subset.metadata$line) %>%set_rownames(rownames(subset.metadata)) %>%setNames('condition') ```### Saving files```{r}# Save data frames as CSV files in the dataframes folderwrite.csv(subset.counts, file.path(dataframes_folder, "RNASeq_counts.csv"), row.names =TRUE)# Save data frames as R objects in the dataframes foldersaveRDS(subset.counts, file.path(dataframes_folder, "RNASeq_counts.rds"))# Use this to read the final objectssubset.counts <-readRDS("data/dataframes/RNASeq_counts.rds")```## DESeq```{r}# Run DESeqdds01.cnts <-DESeqDataSetFromMatrix(subset.counts, colData = coldata, design =~ condition)dds02.dseq <-DESeq(dds01.cnts)dds03.resu <-results(dds02.dseq)dds04.vstf <-vst(dds02.dseq, blind =FALSE)``````{r}# Get PCA dataframe and percentVarpcadata <-plotPCA(dds04.vstf, returnData =TRUE) pcadata <-left_join(pcadata, subset.metadata, by =c("name"="sampleid")) ```### Saving files```{r}# Save data frames as CSV files in the output folderwrite_csv(pcadata, file.path(outputs_folder, "PCA_values.csv"))# Save data frames as R objects in the output foldersaveRDS(pcadata, file.path(outputs_folder, "PCA_values.rds"))# Cleaning-up all objects from the environmentrm(list =ls())# Use this to read the final objectspcadata <-readRDS("data/outputs/RNASeq/PCA_values.rds")```## PCA Plot### File Paths```{r}# Folder pathsoutputs_folder <-"data/outputs/RNASeq"# Load data pcadata <-readRDS("data/outputs/RNASeq/PCA_values.rds")```### Plot```{r}percentVar =round(100*attr(pcadata, "percentVar"))``````{r}PCAplot =ggplot(pcadata, aes(x = PC1, y = PC2, group = condition)) +geom_point(aes(fill = temp, color = temp, shape = line), alpha =0.75, size =3, show.legend =c(shape =FALSE)) +scale_fill_manual(values =c('red3', '#5b5b5b')) +scale_color_manual(values =c('red3', '#5b5b5b')) +scale_shape_manual(values =c(21, 22, 24)) +labs(x =paste0("PC1: ", percentVar[1], "% variance"),y =paste0("PC2: ", percentVar[2], "% variance"),fill ="Temperature", color ="Temperature") +#coord_fixed() +theme_bw() +theme(panel.grid =element_blank(),text =element_text(size =20),legend.text =element_markdown(),axis.text.x =element_text(color ="black"),axis.text.y =element_text(color ="black")) +guides(fill =guide_legend(override.aes =list(shape =21))) PCAplot```### Saving files```{r}ggsave(PCAplot, filename =paste0(outputs_folder, "/plotPCA.png"), width =6, height =6)saveRDS(PCAplot, file.path(outputs_folder, paste0("plotPCA.rds")))```