Example analysis steps/calculations and visualizations for RNA sequencing (RNA-seq) analysis.
# get list of files
list.files(path = 'count_tables', 
           pattern = 'counts.txt',
           full.names = T) -> files
# Read files in recursively
vroom(file = files, id = 'file_path', delim = '\t', comment = '#', skip = 2,
      col_names = c('ensembl_gene_id', 'chr', 'start', 'end', 
                    'strand', 'length', 'count')) %>%
  mutate(sample_id = str_extract(file_path, 'ENCFF[0-9,A-Z]{6}'),
         tissue = ifelse(str_detect(file_path, 'breast'), 
                            'breast', 'colon')) %>%
  dplyr::select(-file_path) -> data## Rows: 406,357
## Columns: 8
## Delimiter: "\t"
## chr [5]: ensembl_gene_id, chr, start, end, strand
## dbl [2]: length, count
## 
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this messageDESeq2Transform the data into a count matrix and an annotation table.
data %>%
  dplyr::select(ensembl_gene_id, sample_id, count) %>%
  pivot_wider(names_from = sample_id, values_from = count) %>% 
  as.data.frame() -> count_matrix
data %>%
  dplyr::select(sample_id, tissue) %>%
  distinct() -> metadatadds <- DESeqDataSetFromMatrix(countData = count_matrix,
                              colData = metadata,
                              tidy = TRUE,
                              design = ~ tissue)## converting counts to integer mode## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factorsRemove rows that contain only zero counts. There’s no point in testing genes where nothing was detected.
### filter out rows that contain only zero counts
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep, ]Check to make sure that our groups are distinct from one another and there’s no obvious clustering by technical effects. As seen in the PCA plot generated below, PC1 and PC2 separate samples by tissue type, the effect we expect to see.
# normalize the counts
vsd <- varianceStabilizingTransformation(dds, blind = FALSE)
# plot the PCa
plotPCA(vsd, intgroup = 'tissue') +
  geom_point() +
  scale_color_manual(values = c('hotpink2', 'hotpink4')) +
  theme_classic()dds <- DESeq(dds)## estimating size factors## estimating dispersions## gene-wise dispersion estimates## mean-dispersion relationship## final dispersion estimates## fitting model and testingGet human readable gene names
# check which annotations are available.
columns(org.Hs.eg.db)##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GO"           "GOALL"        "IPI"          "MAP"          "OMIM"        
## [16] "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"       "UNIGENE"     
## [26] "UNIPROT"# use the Ensembl IDs to find the corresponding HGNC IDs
mapIds(org.Hs.eg.db, 
       keys = rownames(results(dds)), 
       column = 'SYMBOL',
       keytype = 'ENSEMBL',
       multiVals = 'first') %>% 
  enframe(name = 'ensembl_gene_id', value = 'gene') -> gene_names## 'select()' returned 1:many mapping between keys and columnsWrangle the differential expression result object to get a rectangular table with the human readable gene IDs.
results(dds) %>% 
  as.data.frame() %>%
  rownames_to_column('ensembl_gene_id') %>%
  left_join(gene_names, by = 'ensembl_gene_id') %>%
  dplyr::select(gene, ensembl_gene_id, everything()) %>%
  mutate(sig = ifelse(padj < 0.05 & abs(log2FoldChange) >= 1, 
                      'sig', 'notsig'),
         log_qvalue = -log10(padj)) %>%
  replace_na(list(sig = 'notsig', log_qvalue = 0)) -> diff_exp_tblSave the results of the differential expression test.
# write_tsv(diff_exp_tbl, 'diff_exp.tsv')Only a few genes are differentially expressed, many of them with originally low counts.
diff_exp_tbl %>%
  mutate(direction = ifelse(log2FoldChange < 0, 'down', 'up')) %>%
  group_by(direction, sig) %>%
  dplyr::count() %>%
  ungroup() %>%
  filter(sig == 'sig') %>%
  mutate(label = paste0(n, ', ', round((n / nrow(diff_exp_tbl)), 1), '%'),
         baseMean = 850,
         log2FoldChange = c(-8, 8)) -> ma_labels
# plot
ggplot(diff_exp_tbl, aes(x = baseMean, y = log2FoldChange)) +
  geom_text(data = ma_labels, aes(label = label), size = 8) +
  geom_point(aes(color = sig)) +
  scale_color_manual(values = c('gray30', 'firebrick3')) +
  geom_hline(yintercept = 0, color = 'gray60', linetype = 'dashed') +
  labs(x = 'Mean Expression (Counts)', y = 'Log2 Fold Change') +
  coord_cartesian(xlim = c(0, 1000), ylim = c(-10, 10)) +
  theme_classic(base_size = 20) +
  theme(legend.position = 'none')Only a few up- and down-regulated genes.
# Create labels for the number and percentage of significantly up- and down- 
# regulated genes
diff_exp_tbl %>%
  mutate(direction = ifelse(log2FoldChange < 0, 'down', 'up')) %>%
  group_by(direction, sig) %>%
  dplyr::count() %>%
  ungroup() %>%
  # complete(direction, sig, fill = list(n = 0)) %>%
  # na.omit() %>%
  filter(sig == 'sig') %>%
  mutate(label = paste0(n, ', ', round((n / nrow(diff_exp_tbl)), 1), '%'),
         log2FoldChange = c(-3.5, 3.5),
         log_qvalue = 4) -> volc_labels
# plot
ggplot(diff_exp_tbl, aes(x = log2FoldChange, y = log_qvalue)) +
  geom_point(aes(color = sig)) +
  scale_color_manual(values = c('gray30', 'firebrick3')) +
  geom_hline(yintercept = -log10(0.05), color = 'gray60', linetype = 'dashed') +
  geom_vline(xintercept = c(-1, 1), color = 'gray60', linetype = 'dashed') +
  geom_text(data = volc_labels, aes(label = label), size = 8) +
  labs(x = 'Log2 Fold Change', y = '-Log10 QValue') +
  coord_cartesian(xlim = c(-5, 5), ylim = c(0, 5)) +
  theme_classic(base_size = 20) +
  theme(legend.position = 'none')Get the list of Entrez IDs that correspond to all our genes detected in the differential expression analysis.
# get Entrez IDs from our Ensembl IDs for fgsea()
AnnotationDbi::mapIds(org.Hs.eg.db,
                      keys = unique(diff_exp_tbl$ensembl_gene_id),
                      keytype = 'ENSEMBL',
                      column = 'ENTREZID',
                      multiVals = 'first') %>%
  enframe(name = 'gene', value = 'entrez_id') %>%
  unnest(c(entrez_id)) -> entrez_ids## 'select()' returned 1:many mapping between keys and columns# make a named vector of our genes for fgsea()
diff_exp_tbl %>%
  dplyr::select(ensembl_gene_id, stat) %>% 
  left_join(entrez_ids, by = c('ensembl_gene_id' = 'gene')) %>%
  dplyr::select(entrez_id, stat) %>%
  deframe() -> diff_exp_resLoad the Reactome pathways
# use Reactome pathwayse
reactome_pathways <- reactomePathways(names(exampleRanks)) ## Loading required namespace: reactome.db## 'select()' returned 1:many mapping between keys and columns## 'select()' returned 1:1 mapping between keys and columnsRun the pathway analysis
fgsea_res <- fgsea(pathways = reactome_pathways, 
                   stats = exampleRanks, 
                   nperm = 1000)Save results
### save results
fgsea_res %>%
  unnest(c(leadingEdge)) #%>%## # A tibble: 17,134 x 8
##    pathway               pval  padj     ES    NES nMoreExtreme  size leadingEdge
##    <chr>                <dbl> <dbl>  <dbl>  <dbl>        <dbl> <int> <chr>      
##  1 5-Phosphoribose 1-d… 0.878 0.955  0.427  0.679          444     2 328099     
##  2 5-Phosphoribose 1-d… 0.878 0.955  0.427  0.679          444     2 19139      
##  3 A tetrasaccharide l… 0.702 0.889  0.322  0.825          406    11 14733      
##  4 A tetrasaccharide l… 0.702 0.889  0.322  0.825          406    11 20971      
##  5 A tetrasaccharide l… 0.702 0.889  0.322  0.825          406    11 20970      
##  6 A tetrasaccharide l… 0.702 0.889  0.322  0.825          406    11 12032      
##  7 A tetrasaccharide l… 0.702 0.889  0.322  0.825          406    11 29873      
##  8 A tetrasaccharide l… 0.702 0.889  0.322  0.825          406    11 218271     
##  9 A tetrasaccharide l… 0.702 0.889  0.322  0.825          406    11 13179      
## 10 Abacavir metabolism  0.362 0.730 -0.628 -1.13           173     3 11522      
## # … with 17,124 more rows  # write_tsv('all_pathway_results.tsv')
### save collapsed results
# find the essential top-level pathways
collapsed_pathways <- collapsePathways(fgsea_res, 
                                       pathways = reactome_pathways, 
                                       stats = exampleRanks)
# filter the results for the essential pathways
fgsea_res %>% 
  filter(pathway %in% collapsed_pathways$mainPathways) %>%
  arrange(pathway) %>% 
  unnest(c(leadingEdge)) #%>%## # A tibble: 1,517 x 8
##    pathway               pval   padj     ES   NES nMoreExtreme  size leadingEdge
##    <chr>                <dbl>  <dbl>  <dbl> <dbl>        <dbl> <int> <chr>      
##  1 Activated NOTCH1 … 0.00673 0.0680 -0.763 -1.82            2     7 14357      
##  2 Activated NOTCH1 … 0.00673 0.0680 -0.763 -1.82            2     7 18128      
##  3 Activated NOTCH1 … 0.00673 0.0680 -0.763 -1.82            2     7 74198      
##  4 Activation of IRF… 0.0145  0.112  -0.655 -1.76            5    10 56489      
##  5 Activation of IRF… 0.0145  0.112  -0.655 -1.76            5    10 54131      
##  6 Activation of IRF… 0.0145  0.112  -0.655 -1.76            5    10 54123      
##  7 Activation of IRF… 0.0145  0.112  -0.655 -1.76            5    10 56480      
##  8 Activation of IRF… 0.0145  0.112  -0.655 -1.76            5    10 22031      
##  9 Apoptosis          0.00147 0.0310  0.516  2.04            0    71 58801      
## 10 Apoptosis          0.00147 0.0310  0.516  2.04            0    71 14958      
## # … with 1,507 more rows  # write_tsv('top_level_pathway_results.tsv')Look at the top 10 most up- and down-regulated statistically significant pathways.
fgsea_res %>% #arrange(desc(nchar(pathway)))
# filter for significant pathways
  filter(padj < 0.05) %>%
# arrange by the normalized enrichment score
  arrange(NES) %>% 
# get the first and last 10 rows which will be the 10 most up- and down- 
# regulated pathways
  do(rbind(head(., 10), tail(., 10))) %>%
# the pathway names can be long, so if they're over 20 characters, subset them
# otherwise use the whole name
  mutate(pathway_short = ifelse(nchar(pathway) <= 30, 
                                pathway, 
                                paste0(str_sub(pathway, start = 1, end = 27), 
                                       '...'))) %>%
ggplot(aes(x = reorder(pathway_short, NES), y = NES)) +
  geom_col(aes(fill = padj)) +
  scale_fill_viridis(direction = -1) +
  coord_flip() +
  labs(x = 'Pathway', 
       y = 'Normalized Enrichment Score (NES)', 
       fill = 'Q-Value') +
  theme_minimal(base_size = 16)