## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✔ ggplot2 3.3.0     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.5
## ✔ tidyr   1.0.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Introduction

This document will outline the addition of HTO information to a Seurat object.

Starting off

There will be two sets of raw data consisting of fastq files, one set from the RNA and another from the HTO. These are used by different software, the RNA is coverted into a barcode, feature/gene and matrix set by cellranger. The HTO are instead converted into their barcode, feature and matrix set by Cite-seq-Count.

Cite seq information

Prepare RNA data as Seurat

This will not be new to you, but a quick rehash of generating Seurat object.
Two samples:
- D13 monolayer
- D13+14 organoid
Both of these samples consist of 4 replicates that were hashed.

raw <- list(d13 = Read10X(data.dir = "/group/kidn1/RNAseq/scRNA_Jessica/200403_A00692_0080_AHHVTYDSXX/cellranger/outs/filtered_feature_bc_matrix/"), 
    d13p14 = Read10X(data.dir = "200421_A00692_0090_AHMYW3DMXX/DD156_D13p12/outs/filtered_feature_bc_matrix/"))
seurat <- map(raw, function(x) CreateSeuratObject(counts = x, project = "ExtDiff_Hashing"))
tracking <- data.frame(sample = names(seurat), genes = c(nrow(seurat$d13), nrow(seurat$d13p14)), 
    cells.RNA = c(ncol(seurat$d13), ncol(seurat$d13p14)))
tracking

  sample genes cells.RNA
1    d13 33538     20572
2 d13p14 33538     16412

Read the HTO counts

Now that we have the RNA information into a Seurat object, we can add in the HTO data. We add this in as an Assay, the RNA data is maintained in an “RNA” assay. This is an additional layer of information within the object that we can call upon when needed. Much like the RNA assay contains the transcript counts, the HTO assay will have the HTO barcode counts within each cell.

Because the data is in the same format as the RNA, we read it in using the same Read10X() function.

hash <- list(d13.hash = Read10X(data.dir = "200421_A00692_0090_AHMYW3DMXX_repeat/HASHED/umi_count/", 
    gene.column = 1), d13p14.hash = Read10X(data.dir = "200522_A00692_0102_AHMYW2DMXX/HASHED/umi_count/", 
    gene.column = 1))

This now contains the raw counts for the HTO barcodes, instead of genes like in RNA.

as.data.frame(hash$d13p14.hash)[1:5, 1:5]

                      GCCGATGGTTACGGAG TGTTTGTTCTTGAGCA GTCGAATTCAGGCGAA
A0255-AAGTATCGTTTCGCA               24                6              439
A0256-GGTTGCCAGATGTCA              190               10                5
A0257-TGTCTTTCCTGCCAG                8              279                6
A0258-CTCCTCTGCAATTAC               12                7               16
unmapped                            56               24               23
                      CAAGGGACACACAGAG TGTCCACCACATTACG
A0255-AAGTATCGTTTCGCA               48              145
A0256-GGTTGCCAGATGTCA                4                5
A0257-TGTCTTTCCTGCCAG               96                6
A0258-CTCCTCTGCAATTAC                9                8
unmapped                            10               48

Match cell barcodes

The cell barcodes within the HTO data will match to the cell barcodes within the RNA data. We need to filter the barcodes that are present in both.

RNA <- map(seurat, function(x) ncol(x))  # if you double click on RNA, or HTO or tracking  & cntrl enter, you will see
HTO <- map(hash, function(x) ncol(x))
joint.bcs <- map2(seurat, hash, function(x, y) intersect(colnames(x), colnames(y)))
tracking <- tracking %>% mutate(cells.HTO = c(HTO$d13.hash, HTO$d13p14.hash), cells.both = c(length(joint.bcs$d13), 
    length(joint.bcs$d13p14)))
tracking

  sample genes cells.RNA cells.HTO cells.both
1    d13 33538     20572     20565      20565
2 d13p14 33538     16412     16408      16408

Integrate HTO assay

We can see here that the majority of the RNA cells have been identified in the HTO dataset. We can now subset these cells and integrate the HTO information.

for (i in 1:2) {
    # will run through once for each sample
    seurat[[i]] <- seurat[[i]][, joint.bcs[[i]]]  # these lines are saying that we want only the cells present in both RNA and HTO data
    hash[[i]] <- hash[[i]][, joint.bcs[[i]]]
    seurat[[i]][["HTO"]] <- CreateAssayObject(counts = hash[[i]])
    seurat[[i]] <- NormalizeData(seurat[[i]], assay = "HTO", normalization.method = "CLR", 
        verbose = F)
    seurat[[i]] <- HTODemux(seurat[[i]], assay = "HTO", positive.quantile = 0.99)
}

Output

We now have a bunch more information regarding the hashing within the object. A quick look at the meta.data columns will tell us what we have.

head(as.tibble(seurat[[1]]@meta.data))

# A tibble: 6 x 11
  orig.ident nCount_RNA nFeature_RNA nCount_HTO nFeature_HTO HTO_maxID
  <fct>           <dbl>        <int>      <dbl>        <int> <fct>    
1 ExtDiff_H…      15335         3556        442            5 A0253-TT…
2 ExtDiff_H…      13049         3285        391            5 A0253-TT…
3 ExtDiff_H…      59375         7127       3579            5 A0252-TG…
4 ExtDiff_H…      32296         5598        502            5 A0252-TG…
5 ExtDiff_H…      29199         5546       1803            5 A0254-AG…
6 ExtDiff_H…      36918         5765        700            5 A0254-AG…
# … with 5 more variables: HTO_secondID <fct>, HTO_margin <dbl>,
#   HTO_classification <fct>, HTO_classification.global <fct>, hash.ID <fct>

head(as.tibble(seurat[[2]]@meta.data))

# A tibble: 6 x 11
  orig.ident nCount_RNA nFeature_RNA nCount_HTO nFeature_HTO HTO_maxID
  <fct>           <dbl>        <int>      <dbl>        <int> <fct>    
1 ExtDiff_H…       9208         2666        262            5 A0256-GG…
2 ExtDiff_H…       9868         3035        409            5 A0255-AA…
3 ExtDiff_H…      13044         3707        239            5 A0257-TG…
4 ExtDiff_H…       6284         2105        162            5 A0256-GG…
5 ExtDiff_H…      41695         6973        543            5 A0255-AA…
6 ExtDiff_H…      21252         4657         72            5 A0256-GG…
# … with 5 more variables: HTO_secondID <fct>, HTO_margin <dbl>,
#   HTO_classification <fct>, HTO_classification.global <fct>, hash.ID <fct>

Here we can see:
nCount_HTO and nFeature_HTO are the same as their “RNA” counterparts
HTO_maxID is the barcode that has the most hits within the cell
HTO_secondID is the barcode that has the second most hits, if there is one otherwise it will be called unmapped
HTO_classification is the full classification of the cell, if it is a doublet it will have 2 names here
HTO_classification.global identifier of Singlet, Doublet or Negative
hash.ID ID given to each cell, either the most likely single call, Doublet or Negative

temp.counts <- as.data.frame(table(seurat[[1]]$HTO_maxID, seurat[[1]]$HTO_classification.global)) %>% 
    mutate(sample = names(seurat)[1])
g1 <- temp.counts %>% ggplot(aes(Var1, Freq, fill = Var2)) + geom_bar(stat = "identity", 
    colour = "black", width = 1) + # geom_hline(yintercept=1000, linetype='dashed', color = 'red') + facet_wrap(ncol
# = 5, facets = 'capture', scales = 'free_x') +
theme(legend.position = "right") + ggtitle(paste0(names(seurat)[1])) + geom_text(aes(label = ifelse(Freq >= 
    0, Freq, "")), size = 3, position = position_stack(vjust = 0.5), colour = "white") + 
    theme(axis.text.x = element_text(angle = -45, hjust = 0, vjust = 0.5)) + theme(legend.title = element_text(size = rel(1.1)))

temp.counts <- as.data.frame(table(seurat[[2]]$HTO_maxID, seurat[[2]]$HTO_classification.global)) %>% 
    mutate(sample = names(seurat)[2])
g2 <- temp.counts %>% ggplot(aes(Var1, Freq, fill = Var2)) + geom_bar(stat = "identity", 
    colour = "black", width = 1) + # geom_hline(yintercept=1000, linetype='dashed', color = 'red') + facet_wrap(ncol
# = 5, facets = 'capture', scales = 'free_x') +
theme(legend.position = "right") + ggtitle(paste0(names(seurat)[2])) + geom_text(aes(label = ifelse(Freq >= 
    0, Freq, "")), size = 3, position = position_stack(vjust = 0.5), colour = "white") + 
    theme(axis.text.x = element_text(angle = -45, hjust = 0, vjust = 0.5)) + theme(legend.title = element_text(size = rel(1.1)))

g1 + g2 + patchwork::plot_layout(guides = "collect")

These look like really nice distributions of counts. WooHoo!!
There are a number of Doublets predicted in the first sample, something to keep in mind.

Let’s export the list of seurat objects and you can use this for your integration and analysis going forward.

write_rds(seurat, "ExtDiff_Hashing.rds")

sessionInfo()

R version 3.6.1 (2019-07-05)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS:   /hpc/software/installed/R/3.6.1/lib64/R/lib/libRblas.so
LAPACK: /hpc/software/installed/R/3.6.1/lib64/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_0.4.0    stringr_1.4.0    dplyr_0.8.5      purrr_0.3.3     
 [5] readr_1.3.1      tidyr_1.0.2      tibble_2.1.3     ggplot2_3.3.0   
 [9] tidyverse_1.3.0  Seurat_3.1.4     rmdformats_0.3.7 knitr_1.28      

loaded via a namespace (and not attached):
 [1] Rtsne_0.15         colorspace_1.4-1   ggridges_0.5.1     fs_1.3.1          
 [5] rstudioapi_0.10    leiden_0.3.3       listenv_0.8.0      npsurv_0.4-0      
 [9] ggrepel_0.8.1      fansi_0.4.0        lubridate_1.7.4    xml2_1.2.5        
[13] codetools_0.2-16   splines_3.6.1      lsei_1.2-0         jsonlite_1.6      
[17] broom_0.5.2        ica_1.0-2          cluster_2.1.0      dbplyr_1.4.2      
[21] png_0.1-7          uwot_0.1.5         sctransform_0.2.0  compiler_3.6.1    
[25] httr_1.4.1         backports_1.1.5    assertthat_0.2.1   Matrix_1.2-17     
[29] lazyeval_0.2.2     cli_1.1.0          formatR_1.7        htmltools_0.4.0   
[33] tools_3.6.1        rsvd_1.0.3         igraph_1.2.4.2     gtable_0.3.0      
[37] glue_1.3.1         RANN_2.6.1         reshape2_1.4.3     rappdirs_0.3.1    
[41] Rcpp_1.0.3         cellranger_1.1.0   vctrs_0.2.4        gdata_2.18.0      
[45] ape_5.3            nlme_3.1-140       gbRd_0.4-11        lmtest_0.9-37     
[49] xfun_0.12          globals_0.12.5     rvest_0.3.5        lifecycle_0.2.0   
[53] irlba_2.3.3        gtools_3.8.1       future_1.16.0      MASS_7.3-51.4     
[57] zoo_1.8-7          scales_1.0.0       hms_0.5.3          parallel_3.6.1    
[61] RColorBrewer_1.1-2 yaml_2.2.0         reticulate_1.14    pbapply_1.4-2     
[65] gridExtra_2.3      stringi_1.4.3      caTools_1.17.1.2   bibtex_0.4.2      
[69] Rdpack_0.11-0      rlang_0.4.5        pkgconfig_2.0.3    bitops_1.0-6      
[73] evaluate_0.14      lattice_0.20-38    ROCR_1.0-7        
 [ reached getOption("max.print") -- omitted 35 entries ]

Add HTO to Seurat