Load packages.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 1.0.0 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(Seurat)
## Attaching SeuratObject
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.12.1
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================
library(ggalluvial)
Load data.
load("pbmc_clean.RData")
geom_hline( )ggplot(data = pbmc[["percent.mt"]]) +
  aes(x = 1, y = percent.mt) +
  geom_violin(fill = "salmon") + #Add color to violin
  geom_jitter(height = 0, width = 0.4, size = 0.5) + #Make points smaller
  #Add nice labels
  labs(x = "Identity pbmc3k", 
       y = "",
       title = "percent.mt") +
  #Format background and axes
  theme_classic() +
  #Remove meaningless x-axis values
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank()) +
  #Add horizontal cutoff line
  geom_hline(yintercept = 5, linetype = "dashed")
pbmc[["nFeature_RNA"]] and our original
quality filtering retained cells with 200 - 2500 features.ggplot(data = pbmc[["nFeature_RNA"]]) +
  aes(x = 1, y = nFeature_RNA) +
  geom_violin(fill = "salmon") + #Add color to violin
  geom_jitter(height = 0, width = 0.4, size = 0.5) + #Make points smaller
  #Add nice labels
  labs(x = "Identity pbmc3k", 
       y = "",
       title = "nFeature_RNA") +
  #Format background and axes
  theme_classic() +
  #Remove meaningless x-axis values
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank()) +
  #Add horizontal cutoff lines. Notice how you can add 2 at once
  geom_hline(yintercept = c(200, 2500), linetype = "dashed")
Re-run tSNE parts of the workshop if needed. This is copied exactly from the workshop.
Calculate tSNE.
pbmc <- RunTSNE(pbmc, dims = 1:10)
Extract tSNE data.
#Extract the XY values for UMP
tsne_xy <- as.data.frame(pbmc[["tsne"]]@cell.embeddings)
#Add cell cluster information
cell_clusters <- as.data.frame(pbmc$seurat_clusters)
#Use the same cell clusters as UMAP
#Combine
tsne_xy_clusters <- tsne_xy %>% 
  #move rownames to data column
  rownames_to_column("cell") %>% 
  #merge with cluster info
  full_join(
    cell_clusters %>% rownames_to_column("cell")
  ) %>% 
  #clean column names
  rename(ident = `pbmc$seurat_clusters`)
## Joining, by = "cell"
First, extract and format gene expression data similar to what we did for the UMAP.
#Define gene of interest
genes_of_interest <- c("CD14", "CD8A")
#Extract normalized gene expression data
pbmc_counts <- as.data.frame(GetAssayData(
  object = pbmc, slot = "data"))
#Filter for gene of interest
tsne_xy_expression <- pbmc_counts %>% 
  rownames_to_column("gene") %>% 
  filter(gene %in% genes_of_interest) %>% 
  #long format
  pivot_longer(-gene, names_to = "cell") %>% 
  #combine with UMAP data
  full_join(tsne_xy_clusters)
## Joining, by = "cell"
Then create the plot.
ggplot(data = tsne_xy_expression) +
  aes(x = tSNE_1, y = tSNE_2) +
  geom_point(size = 0.5) +
  aes(color = value) + #Color by expression
  scale_color_gradient(low = "lightgrey", 
                       high = "blue") + #Change colors
  facet_wrap(~ gene, scales = "free") + #Split by gene
  labs(color = "") +
  theme_classic()
We will use the tSNE and the Wong colors from the second reference above.
#List colors
col.palette <- c("#000000","#E69F00","#56B4E9","#009E73",
                 "#F0E442","#0072B2","#D55E00","#CC79A7",
                 "#6F0A4C")
#Add names to the vector to specify which color goes to which cluster
##Here we're keeping mostly the default order but forcing the lower right 
## cluster to be black instead
names(col.palette) <- c("3","0","1","2",
                        "4","5","6","7","8")
ggplot(data = tsne_xy_clusters) +
  aes(x = tSNE_1, y = tSNE_2) +
  geom_point(size = 0.5) +
  aes(color = ident) +
  labs(title = "ident", color = "") +
  theme_classic() +
  scale_color_manual(values = col.palette)
Re-run heatmap part of the workshop if needed. This is copied exactly from the workshop.
#Extract scaled expression data
pbmc_counts_log <- as.data.frame(GetAssayData(
  object = pbmc, 
  slot = "scale.data"))
#Order cells by clusters. Remember that we originally 
# got these data from as.data.frame(pbmc$seurat_clusters)
cell_order <- cell_clusters %>% 
  arrange(`pbmc$seurat_clusters`)
Create “top 3” data for second exercise. Again this is copied from the workshop
top3 <- pbmc_markers %>%
  group_by(cluster) %>%
  top_n(n = 3, wt = avg_log2FC)
pbmc_counts_log_top3 <- pbmc_counts_log %>% 
  #Filter top genes
  rownames_to_column("gene") %>% 
  filter(gene %in% top3$gene) %>% 
  #Remove outliers
  mutate_if(is.numeric, ~ifelse(. > 2.5, 2.5, .)) %>% 
  #Reorder cells
  select(gene, all_of(rownames(cell_order))) %>% 
  #Reorder genes
  mutate(gene = factor(gene, levels = unique(top3$gene))) %>% 
  arrange(gene) %>% 
  #Convert to matrix
  column_to_rownames("gene") %>% 
  as.matrix()
This time, let’s select “top” as the most significant, instead of the
largest fold change. We’ll use slice_min( ) for this.
Format data.
top20 <- pbmc_markers %>%
  group_by(cluster) %>%
  slice_min(n = 20, order_by = p_val_adj)
pbmc_counts_log_top20 <- pbmc_counts_log %>% 
  #Filter top genes
  rownames_to_column("gene") %>% 
  filter(gene %in% top20$gene) %>% 
  #Remove outliers
  mutate_if(is.numeric, ~ifelse(. > 2.5, 2.5, .)) %>% 
  #Reorder cells
  select(gene, all_of(rownames(cell_order))) %>% 
  #Reorder genes
  mutate(gene = factor(gene, levels = unique(top20$gene))) %>% 
  arrange(gene) %>% 
  #Convert to matrix
  column_to_rownames("gene") %>% 
  as.matrix()
Plot without clustering. Here we include the cluster annotation at
top but don’t recolor like Seurat (because I personally
don’t like those colors). Since we can’t read the 200+ gene names
anyway, I will remove those as well.
#Format cluster annotation
hm_clusters <- cell_order$`pbmc$seurat_clusters`
names(hm_clusters) <- rownames(cell_order)
#Make cluster colors the same as ggplot default
hm_colors <- scales::hue_pal()(9)
names(hm_colors) <- as.character(c(0:8))
#Format ComplexHeatmap annotation
#This is the not great syntax that the reference can help with
hm_anno <- HeatmapAnnotation(
  Identity = hm_clusters,
  col = list(Identity = hm_colors),
  show_annotation_name = FALSE)
#Make heatmap
Heatmap(pbmc_counts_log_top20, use_raster = FALSE,
        cluster_rows = FALSE, cluster_columns = FALSE,
        name = "Expression",
        column_split = hm_clusters,
        show_column_names = FALSE, #remove cell (column) labels
        show_row_names = FALSE, #remove gene (row) labels
        top_annotation = hm_anno) #add cluster annotation
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
And with clustering, where we see that clusters 1 and 3 are nicely resolved. In contrast, notice how clusters 0 and 2 are heavily mixed. If we knew the cell-type identities of these clusters, these results could lead to interesting findings such as showing that gene expression is more or less similar in different cell types.
Heatmap(pbmc_counts_log_top20, use_raster = FALSE,
        cluster_rows = TRUE, cluster_columns = TRUE,
        name = "Expression",
        show_column_names = FALSE, #remove cell (column) labels
        show_row_names = FALSE, #remove gene (row) labels
        top_annotation = hm_anno) #add cluster annotation
## The automatically generated colors map from the 1^st and 99^th of the
## values in the matrix. There are outliers in the matrix whose patterns
## might be hidden by this color mapping. You can manually set the color
## to `col` argument.
## 
## Use `suppressMessages()` to turn off this message.
First, list all the cells in these two clusters.
cells_c0_1 <- cell_clusters %>% 
  rownames_to_column("cell") %>% 
  filter(`pbmc$seurat_clusters` %in% c(0,1)) %>% 
  pull(cell)
Then filter the expression data to these cells.
pbmc_counts_log_top3_c01 <- as.data.frame(pbmc_counts_log_top3) %>% 
  rownames_to_column() %>% 
  # move cells (which are indiv columns) into 1 column so we can filter it
  pivot_longer(-rowname, names_to = "cell") %>% 
  # filer cells
  filter(cell %in% cells_c0_1) %>% 
  #Put back to wide matrix for heatmap
  pivot_wider(names_from = "cell") %>% 
  column_to_rownames() %>% 
  as.matrix()
#also filter the cluster splitting vector
hm_clusters_01 <- hm_clusters[names(hm_clusters) %in% cells_c0_1]
#And the annotation object
hm_anno_01 <- HeatmapAnnotation(
  Identity = hm_clusters_01,
  col = list(Identity = hm_colors[1:2]),
  show_annotation_name = FALSE)
Plot! Let’s cluster the genes (rows) to highlight similarities between the two clusters.
Heatmap(pbmc_counts_log_top3_c01, use_raster = FALSE,
        cluster_rows = TRUE, cluster_columns = FALSE, #Turn clustering on
        name = "Expression",
        column_split = hm_clusters_01,
        show_column_names = FALSE, #remove cell (column) labels
        row_names_side = "left", #move gene (row) labels
        top_annotation = hm_anno_01 #add cluster annotation
        )