Setup

Load packages.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 1.0.0 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(Seurat)
## Attaching SeuratObject
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.12.1
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================
library(ggalluvial)

Load data.

load("pbmc_clean.RData")

Exercises

Practice violin plot

Further customize the mitochondrial plot by adding a horizontal line showing the 5% cutoff used in data clean. Hint: checkout geom_hline( )

ggplot(data = pbmc[["percent.mt"]]) +
  aes(x = 1, y = percent.mt) +
  geom_violin(fill = "salmon") + #Add color to violin
  geom_jitter(height = 0, width = 0.4, size = 0.5) + #Make points smaller
  #Add nice labels
  labs(x = "Identity pbmc3k", 
       y = "",
       title = "percent.mt") +
  #Format background and axes
  theme_classic() +
  #Remove meaningless x-axis values
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank()) +
  #Add horizontal cutoff line
  geom_hline(yintercept = 5, linetype = "dashed")

Make a violin plot showing the number of RNA features per cell. These data are in pbmc[["nFeature_RNA"]] and our original quality filtering retained cells with 200 - 2500 features.

ggplot(data = pbmc[["nFeature_RNA"]]) +
  aes(x = 1, y = nFeature_RNA) +
  geom_violin(fill = "salmon") + #Add color to violin
  geom_jitter(height = 0, width = 0.4, size = 0.5) + #Make points smaller
  #Add nice labels
  labs(x = "Identity pbmc3k", 
       y = "",
       title = "nFeature_RNA") +
  #Format background and axes
  theme_classic() +
  #Remove meaningless x-axis values
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank()) +
  #Add horizontal cutoff lines. Notice how you can add 2 at once
  geom_hline(yintercept = c(200, 2500), linetype = "dashed")

Practice ordination

Re-run tSNE parts of the workshop if needed. This is copied exactly from the workshop.

Calculate tSNE.

pbmc <- RunTSNE(pbmc, dims = 1:10)

Extract tSNE data.

#Extract the XY values for UMP
tsne_xy <- as.data.frame(pbmc[["tsne"]]@cell.embeddings)
#Add cell cluster information
cell_clusters <- as.data.frame(pbmc$seurat_clusters)

#Use the same cell clusters as UMAP
#Combine
tsne_xy_clusters <- tsne_xy %>% 
  #move rownames to data column
  rownames_to_column("cell") %>% 
  #merge with cluster info
  full_join(
    cell_clusters %>% rownames_to_column("cell")
  ) %>% 
  #clean column names
  rename(ident = `pbmc$seurat_clusters`)
## Joining, by = "cell"

Color gene expression of CD14 and CD8A in the tSNE ordination

First, extract and format gene expression data similar to what we did for the UMAP.

#Define gene of interest
genes_of_interest <- c("CD14", "CD8A")

#Extract normalized gene expression data
pbmc_counts <- as.data.frame(GetAssayData(
  object = pbmc, slot = "data"))

#Filter for gene of interest
tsne_xy_expression <- pbmc_counts %>% 
  rownames_to_column("gene") %>% 
  filter(gene %in% genes_of_interest) %>% 
  #long format
  pivot_longer(-gene, names_to = "cell") %>% 
  #combine with UMAP data
  full_join(tsne_xy_clusters)
## Joining, by = "cell"

Then create the plot.

ggplot(data = tsne_xy_expression) +
  aes(x = tSNE_1, y = tSNE_2) +
  geom_point(size = 0.5) +
  aes(color = value) + #Color by expression
  scale_color_gradient(low = "lightgrey", 
                       high = "blue") + #Change colors
  facet_wrap(~ gene, scales = "free") + #Split by gene
  labs(color = "") +
  theme_classic()

Change the color scale of one UMAP or tSNE to your colors of choice! Checkout https://colorbrewer2.org/ and https://davidmathlogic.com/colorblind/ for some accessible color examples.

We will use the tSNE and the Wong colors from the second reference above.

#List colors
col.palette <- c("#000000","#E69F00","#56B4E9","#009E73",
                 "#F0E442","#0072B2","#D55E00","#CC79A7",
                 "#6F0A4C")
#Add names to the vector to specify which color goes to which cluster
##Here we're keeping mostly the default order but forcing the lower right 
## cluster to be black instead
names(col.palette) <- c("3","0","1","2",
                        "4","5","6","7","8")

ggplot(data = tsne_xy_clusters) +
  aes(x = tSNE_1, y = tSNE_2) +
  geom_point(size = 0.5) +
  aes(color = ident) +
  labs(title = "ident", color = "") +
  theme_classic() +
  scale_color_manual(values = col.palette)

Practice heatmap

Re-run heatmap part of the workshop if needed. This is copied exactly from the workshop.

#Extract scaled expression data
pbmc_counts_log <- as.data.frame(GetAssayData(
  object = pbmc, 
  slot = "scale.data"))

#Order cells by clusters. Remember that we originally 
# got these data from as.data.frame(pbmc$seurat_clusters)
cell_order <- cell_clusters %>% 
  arrange(`pbmc$seurat_clusters`)

Create “top 3” data for second exercise. Again this is copied from the workshop

top3 <- pbmc_markers %>%
  group_by(cluster) %>%
  top_n(n = 3, wt = avg_log2FC)

pbmc_counts_log_top3 <- pbmc_counts_log %>% 
  #Filter top genes
  rownames_to_column("gene") %>% 
  filter(gene %in% top3$gene) %>% 
  #Remove outliers
  mutate_if(is.numeric, ~ifelse(. > 2.5, 2.5, .)) %>% 
  #Reorder cells
  select(gene, all_of(rownames(cell_order))) %>% 
  #Reorder genes
  mutate(gene = factor(gene, levels = unique(top3$gene))) %>% 
  arrange(gene) %>% 
  #Convert to matrix
  column_to_rownames("gene") %>% 
  as.matrix()

Create a heatmap of the top 20 marker genes in clusters. Which clusters better resolve with de novo clustering?

This time, let’s select “top” as the most significant, instead of the largest fold change. We’ll use slice_min( ) for this.

Format data.

top20 <- pbmc_markers %>%
  group_by(cluster) %>%
  slice_min(n = 20, order_by = p_val_adj)

pbmc_counts_log_top20 <- pbmc_counts_log %>% 
  #Filter top genes
  rownames_to_column("gene") %>% 
  filter(gene %in% top20$gene) %>% 
  #Remove outliers
  mutate_if(is.numeric, ~ifelse(. > 2.5, 2.5, .)) %>% 
  #Reorder cells
  select(gene, all_of(rownames(cell_order))) %>% 
  #Reorder genes
  mutate(gene = factor(gene, levels = unique(top20$gene))) %>% 
  arrange(gene) %>% 
  #Convert to matrix
  column_to_rownames("gene") %>% 
  as.matrix()

Plot without clustering. Here we include the cluster annotation at top but don’t recolor like Seurat (because I personally don’t like those colors). Since we can’t read the 200+ gene names anyway, I will remove those as well.

#Format cluster annotation
hm_clusters <- cell_order$`pbmc$seurat_clusters`
names(hm_clusters) <- rownames(cell_order)

#Make cluster colors the same as ggplot default
hm_colors <- scales::hue_pal()(9)
names(hm_colors) <- as.character(c(0:8))

#Format ComplexHeatmap annotation
#This is the not great syntax that the reference can help with
hm_anno <- HeatmapAnnotation(
  Identity = hm_clusters,
  col = list(Identity = hm_colors),
  show_annotation_name = FALSE)

#Make heatmap
Heatmap(pbmc_counts_log_top20, use_raster = FALSE,
        cluster_rows = FALSE, cluster_columns = FALSE,
        name = "Expression",
        column_split = hm_clusters,
        show_column_names = FALSE, #remove cell (column) labels
        show_row_names = FALSE, #remove gene (row) labels
        top_annotation = hm_anno) #add cluster annotation
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.

And with clustering, where we see that clusters 1 and 3 are nicely resolved. In contrast, notice how clusters 0 and 2 are heavily mixed. If we knew the cell-type identities of these clusters, these results could lead to interesting findings such as showing that gene expression is more or less similar in different cell types.

Heatmap(pbmc_counts_log_top20, use_raster = FALSE,
        cluster_rows = TRUE, cluster_columns = TRUE,
        name = "Expression",
        show_column_names = FALSE, #remove cell (column) labels
        show_row_names = FALSE, #remove gene (row) labels
        top_annotation = hm_anno) #add cluster annotation
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.

Create a heatmap of the top 3 marker genes in only clusters 0 and 1.

First, list all the cells in these two clusters.

cells_c0_1 <- cell_clusters %>% 
  rownames_to_column("cell") %>% 
  filter(`pbmc$seurat_clusters` %in% c(0,1)) %>% 
  pull(cell)

Then filter the expression data to these cells.

pbmc_counts_log_top3_c01 <- as.data.frame(pbmc_counts_log_top3) %>% 
  rownames_to_column() %>% 
  # move cells (which are indiv columns) into 1 column so we can filter it
  pivot_longer(-rowname, names_to = "cell") %>% 
  # filer cells
  filter(cell %in% cells_c0_1) %>% 
  #Put back to wide matrix for heatmap
  pivot_wider(names_from = "cell") %>% 
  column_to_rownames() %>% 
  as.matrix()

#also filter the cluster splitting vector
hm_clusters_01 <- hm_clusters[names(hm_clusters) %in% cells_c0_1]

#And the annotation object
hm_anno_01 <- HeatmapAnnotation(
  Identity = hm_clusters_01,
  col = list(Identity = hm_colors[1:2]),
  show_annotation_name = FALSE)

Plot! Let’s cluster the genes (rows) to highlight similarities between the two clusters.

Heatmap(pbmc_counts_log_top3_c01, use_raster = FALSE,
        cluster_rows = TRUE, cluster_columns = FALSE, #Turn clustering on
        name = "Expression",
        column_split = hm_clusters_01,
        show_column_names = FALSE, #remove cell (column) labels
        row_names_side = "left", #move gene (row) labels
        top_annotation = hm_anno_01 #add cluster annotation
        )