HNOBac Manuscript
  1. Methods
  2. RNASeq
  • Introduction
  • Methods
    • RNASeq
    • Cell Counts & TEER
    • CFUs 48h
    • CFUs Epithelial Lines
    • LDH 48h
    • LDH Epithelial Lines
    • Cytokines
    • HEK-Blue
  • R Session Info

Table of contents

  • Data Input and Selection
    • File Paths
    • Data clean-up
    • Saving files
  • DESeq
    • Saving files
  • PCA Plot
    • File Paths
    • Plot
    • Saving files

RNASeq

library(tidyverse)
library(readxl)
library(ggtext)
library(magrittr)
library(DESeq2)

Data Input and Selection

File Paths

# Folder paths
input_path <- "data/input_data/RNASeq/"
metadata_path <- "data/metadata/RNASeq"

# Create subfolders for output files
dataframes_folder <- "data/dataframes"
if (!file.exists("data/dataframes")) {
  dir.create("data/dataframes", recursive = TRUE)
}
outputs_folder <- "data/outputs/RNASeq"
if (!file.exists("data/outputs/RNASeq")) {
  dir.create("data/outputs/RNASeq", recursive = TRUE)
}

# Load data and metadata
metadata <- read_excel(file.path(metadata_path, "hnoseq3437-metadata.xlsx")) 
counts <- read_csv(file.path(input_path, "hnoseq3437-hisat2.featureCounts-genes2-2batches.csv"))

Data clean-up

# Clean up unneeded columns and transform all columns into factors
metadata <- metadata %>%
  select(-contains(c('_func','original_name'))) %>%
  mutate_all(factor)

# Select only the rows with "protein_coding" as the GeneBiotype and transform in a matrix with GeneID as the rownames. Sort by GeneID
counts <- counts %>%
  filter(GeneBiotype == "protein_coding") %>% 
  column_to_rownames(var = "GeneID") %>%
  select(-contains('Gene')) %>% 
  .[ order(names(.)) ]
# Subset metadata to include only the samples relevant to this analysis 
subset.metadata <- metadata %>% 
  subset(line != 'HVO262') %>% # remove HVO262 samples
  subset(temp != 'CT') %>%  # remove CONTROL samples
  subset(temp != '34') %>%  # remove 34C samples
  droplevels() %>% 
  #mutate(temp = factor(temp, labels = c("37 °C", "37 °C → 34 °C")))  %>% 
  mutate(temp = factor(temp, labels = c("<b><span style='color:red3;'>37 °C</span></b>", 
                                        "<b><span style='color:#5b5b5b;'>37 °C → 34 °C</span></b>")))  %>%
  mutate(line = factor(line, levels = c("HNO918", "HNO204", "HNO919")))  %>% 
  set_rownames(.$sampleid)

# Keep counts column only for the selected samples 
subset.counts <- counts %>% 
  select(contains(rownames(subset.metadata))) 

# Make the correesponding design matrix 
coldata <- data.frame(subset.metadata$line) %>%
  set_rownames(rownames(subset.metadata)) %>%
  setNames('condition') 

Saving files

# Save data frames as CSV files in the dataframes folder
write.csv(subset.counts, file.path(dataframes_folder, "RNASeq_counts.csv"), row.names = TRUE)

# Save data frames as R objects in the dataframes folder
saveRDS(subset.counts, file.path(dataframes_folder, "RNASeq_counts.rds"))

# Use this to read the final objects
subset.counts <- readRDS("data/dataframes/RNASeq_counts.rds")

DESeq

# Run DESeq
dds01.cnts <- DESeqDataSetFromMatrix(subset.counts, colData = coldata, design = ~ condition)
dds02.dseq <- DESeq(dds01.cnts)
dds03.resu <- results(dds02.dseq)
dds04.vstf <- vst(dds02.dseq,  blind = FALSE)
# Get PCA dataframe and percentVar
pcadata <- plotPCA(dds04.vstf, returnData = TRUE) 
pcadata <- left_join(pcadata, subset.metadata, by = c("name" = "sampleid")) 

Saving files

# Save data frames as CSV files in the output folder
write_csv(pcadata, file.path(outputs_folder, "PCA_values.csv"))

# Save data frames as R objects in the output folder
saveRDS(pcadata, file.path(outputs_folder, "PCA_values.rds"))

# Cleaning-up all objects from the environment
rm(list = ls())

# Use this to read the final objects
pcadata <- readRDS("data/outputs/RNASeq/PCA_values.rds")

PCA Plot

File Paths

# Folder paths
outputs_folder <- "data/outputs/RNASeq"

# Load data 
pcadata <- readRDS("data/outputs/RNASeq/PCA_values.rds")

Plot

percentVar = round(100 * attr(pcadata, "percentVar"))
PCAplot = ggplot(pcadata, aes(x = PC1, y = PC2, group = condition)) +
  geom_point(aes(fill = temp, color = temp, shape = line), alpha = 0.75, size = 3, show.legend = c(shape = FALSE)) + 
  scale_fill_manual(values = c('red3', '#5b5b5b')) +
  scale_color_manual(values = c('red3', '#5b5b5b')) +
  scale_shape_manual(values = c(21, 22, 24)) +
  labs(x = paste0("PC1: ", percentVar[1], "% variance"),
       y = paste0("PC2: ", percentVar[2], "% variance"),
       fill = "Temperature", color = "Temperature") +
  #coord_fixed() +
  theme_bw() +
  theme(panel.grid = element_blank(),
        text = element_text(size = 20),
        legend.text = element_markdown(),
        axis.text.x = element_text(color = "black"),
        axis.text.y = element_text(color = "black")) +
  guides(fill = guide_legend(override.aes = list(shape = 21))) 

PCAplot

Saving files

ggsave(PCAplot, filename = paste0(outputs_folder, "/plotPCA.png"), width = 6, height = 6)
saveRDS(PCAplot, file.path(outputs_folder, paste0("plotPCA.rds")))
Introduction
Cell Counts & TEER
Source Code
---
execute:
  message: FALSE
  warning: FALSE
---

# RNASeq {.unnumbered}

```{r}
library(tidyverse)
library(readxl)
library(ggtext)
library(magrittr)
library(DESeq2)
```

## Data Input and Selection

### File Paths

```{r}
# Folder paths
input_path <- "data/input_data/RNASeq/"
metadata_path <- "data/metadata/RNASeq"

# Create subfolders for output files
dataframes_folder <- "data/dataframes"
if (!file.exists("data/dataframes")) {
  dir.create("data/dataframes", recursive = TRUE)
}
outputs_folder <- "data/outputs/RNASeq"
if (!file.exists("data/outputs/RNASeq")) {
  dir.create("data/outputs/RNASeq", recursive = TRUE)
}

# Load data and metadata
metadata <- read_excel(file.path(metadata_path, "hnoseq3437-metadata.xlsx")) 
counts <- read_csv(file.path(input_path, "hnoseq3437-hisat2.featureCounts-genes2-2batches.csv"))
```

### Data clean-up

```{r}
# Clean up unneeded columns and transform all columns into factors
metadata <- metadata %>%
  select(-contains(c('_func','original_name'))) %>%
  mutate_all(factor)

# Select only the rows with "protein_coding" as the GeneBiotype and transform in a matrix with GeneID as the rownames. Sort by GeneID
counts <- counts %>%
  filter(GeneBiotype == "protein_coding") %>% 
  column_to_rownames(var = "GeneID") %>%
  select(-contains('Gene')) %>% 
  .[ order(names(.)) ]
```

```{r}
# Subset metadata to include only the samples relevant to this analysis 
subset.metadata <- metadata %>% 
  subset(line != 'HVO262') %>% # remove HVO262 samples
  subset(temp != 'CT') %>%  # remove CONTROL samples
  subset(temp != '34') %>%  # remove 34C samples
  droplevels() %>% 
  #mutate(temp = factor(temp, labels = c("37 °C", "37 °C → 34 °C")))  %>% 
  mutate(temp = factor(temp, labels = c("<b><span style='color:red3;'>37 °C</span></b>", 
                                        "<b><span style='color:#5b5b5b;'>37 °C → 34 °C</span></b>")))  %>%
  mutate(line = factor(line, levels = c("HNO918", "HNO204", "HNO919")))  %>% 
  set_rownames(.$sampleid)

# Keep counts column only for the selected samples 
subset.counts <- counts %>% 
  select(contains(rownames(subset.metadata))) 

# Make the correesponding design matrix 
coldata <- data.frame(subset.metadata$line) %>%
  set_rownames(rownames(subset.metadata)) %>%
  setNames('condition') 
```

### Saving files

```{r}
# Save data frames as CSV files in the dataframes folder
write.csv(subset.counts, file.path(dataframes_folder, "RNASeq_counts.csv"), row.names = TRUE)

# Save data frames as R objects in the dataframes folder
saveRDS(subset.counts, file.path(dataframes_folder, "RNASeq_counts.rds"))

# Use this to read the final objects
subset.counts <- readRDS("data/dataframes/RNASeq_counts.rds")
```

## DESeq

```{r}
# Run DESeq
dds01.cnts <- DESeqDataSetFromMatrix(subset.counts, colData = coldata, design = ~ condition)
dds02.dseq <- DESeq(dds01.cnts)
dds03.resu <- results(dds02.dseq)
dds04.vstf <- vst(dds02.dseq,  blind = FALSE)
```

```{r}
# Get PCA dataframe and percentVar
pcadata <- plotPCA(dds04.vstf, returnData = TRUE) 
pcadata <- left_join(pcadata, subset.metadata, by = c("name" = "sampleid")) 
```

### Saving files

```{r}
# Save data frames as CSV files in the output folder
write_csv(pcadata, file.path(outputs_folder, "PCA_values.csv"))

# Save data frames as R objects in the output folder
saveRDS(pcadata, file.path(outputs_folder, "PCA_values.rds"))

# Cleaning-up all objects from the environment
rm(list = ls())

# Use this to read the final objects
pcadata <- readRDS("data/outputs/RNASeq/PCA_values.rds")
```

## PCA Plot

### File Paths

```{r}
# Folder paths
outputs_folder <- "data/outputs/RNASeq"

# Load data 
pcadata <- readRDS("data/outputs/RNASeq/PCA_values.rds")
```

### Plot

```{r}
percentVar = round(100 * attr(pcadata, "percentVar"))
```

```{r}
PCAplot = ggplot(pcadata, aes(x = PC1, y = PC2, group = condition)) +
  geom_point(aes(fill = temp, color = temp, shape = line), alpha = 0.75, size = 3, show.legend = c(shape = FALSE)) + 
  scale_fill_manual(values = c('red3', '#5b5b5b')) +
  scale_color_manual(values = c('red3', '#5b5b5b')) +
  scale_shape_manual(values = c(21, 22, 24)) +
  labs(x = paste0("PC1: ", percentVar[1], "% variance"),
       y = paste0("PC2: ", percentVar[2], "% variance"),
       fill = "Temperature", color = "Temperature") +
  #coord_fixed() +
  theme_bw() +
  theme(panel.grid = element_blank(),
        text = element_text(size = 20),
        legend.text = element_markdown(),
        axis.text.x = element_text(color = "black"),
        axis.text.y = element_text(color = "black")) +
  guides(fill = guide_legend(override.aes = list(shape = 21))) 

PCAplot
```

### Saving files

```{r}
ggsave(PCAplot, filename = paste0(outputs_folder, "/plotPCA.png"), width = 6, height = 6)
saveRDS(PCAplot, file.path(outputs_folder, paste0("plotPCA.rds")))
```