Go back to desc homepage

Data Summary

In order to evaluate the performance of DESC for data generated from different scRNA-seq protocols, we analyzed four human pancreatic islet datasets, and compared DESC with six other batch effect removal methods, including Seurat3.0, CCA, MNN, scVI, BERMUDA and scanorama. The four protocols we considered include CelSeq (GSE81076), CelSeq2 (GSE85241), Fluidigm C1 (GSE86469), and SMART-Seq2 (E-MTAB-5061). The combined raw data matrix and associated metadata file can be downloaded from here

The combined dataset has 6,321 cells, with 1,004 cells from GSE81076, 2,285 cells from GSE85241, 638 cells from GSE86469, and 2,394 cells from E-MTAB-5061. This combined dataset contains 13 cell types: acinar, activated_stellate, alpha, beta, delta, ductal, endothelia, epsilon, gamma, macrophage, mast, quiescent_stellate and schwann.

options(warn = -1)
.libPaths(c("/home/xiaoxiang/R/x86_64-pc-linux-gnu-library/3.5",.libPaths()))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(png))
suppressPackageStartupMessages(library(RANN))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(kableExtra))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(vioplot))
suppressPackageStartupMessages(library(googleVis))
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(googleVis))
suppressPackageStartupMessages(library(rsvg))
#knitr::opts_chunk$set(echo=T)
datadirpath="/media/xiaoxiang/D/raw_data/DECv.1/pancreas_v3_result/"
fig_path="/home/xiaoxiang/Documents/DESC_paper_prepare/DESC_paper_final/formal_revised/figures_sep/"
# get sankey plot 
getmy.sankey=function (reference, clusters,reorder_label=F,plot_width = 400, plot_height = 600,fontsize=12) 
{
  Var1 <- value <- NULL
  stopifnot(class(reference)=="factor")
  stopifnot(class(clusters)=="factor")
  res.all <- NULL
  for (j in names(table(reference))) {
    res <- NULL
    for (i in names(table(clusters))) {
      tmp <- length(intersect(which(clusters == i), which(reference == 
                                                            j)))
      res <- c(res, tmp)
    }
    res.all <- rbind(res.all, res)
  }
  colnames(res.all) <- names(table(clusters))
  rownames(res.all) <- names(table(reference))
  
  if (ncol(res.all) > 1) {
    res.all <- res.all[order(as.numeric(table(reference)), 
                             decreasing = TRUE), order(as.numeric(table(clusters)), 
                                                       decreasing = TRUE), drop = FALSE]
  }
  res <- reshape2::melt(res.all)
  res <- res[res$value != 0, ]
  if (ncol(res.all) > 1) {
    maxs <- res %>% dplyr::group_by(Var1) %>% dplyr::summarise(max = max(value))
    res <- merge(res, maxs)
    maxs <- res[res$value == res$max, ]
    maxs <- maxs[order(maxs$value, decreasing = TRUE), ]
    res <- res[res$value != res$max, ]
    if(reorder_label){
      maxs=maxs[match(rev(levels(reference)),maxs$Var1),]
    }
    res <- rbind(maxs, res)
    res <- res[, 1:3]
  }
  res[, 1] <- paste0(res[, 1], " ")
  res[, 2] <- paste0(" ", res[, 2])
  colnames(res) <- c("From", "To", "# of cells")
  colors_link <- colorRampPalette(brewer.pal(5, "Set2"))(length(unique(reference)))
  colors_link_array <- paste0("[", paste0("'", colors_link,"'", collapse = ','), "]")
  colors_node <- colorRampPalette(brewer.pal(5, "Dark2"))(length(unique(reference))+length(unique(clusters)))
  colors_node_array <- paste0("[", paste0("'", colors_node,"'", collapse = ','), "]")
  opts <- paste0("{
                 link: { colorMode: 'source',
                 colors: ", colors_link_array ," },
                 node: { colors: ", colors_node_array ," ,
                 label:{fontSize:",fontsize,",fontName:'Arial'},
                 fontName: 'Arial',
                 fontSize: ", fontsize, " },
                 iterations:0
}" )
    Sankey=gvisSankey(res, from = "From", to = "To", weight = "# of cells",options=list(width = plot_width, height = plot_height, sankey=opts))
    return(Sankey)
  }
# load necessay function
source("/media/xiaoxiang/D/DESC_reproducible_file/helpfunc_new.R")
old=theme_set(theme_bw()+theme(strip.background = element_rect(fill="white"),
                                         panel.background = element_blank(),
                                         panel.grid =element_blank()))
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}
op=par(mar=c(5,4,6,4))
color_celltype=c('#1CE6FF','#FF34FF','#FF4A46','#008941','#A30059','#FF2F80','#0000A6','#63FFAC','#004D43','#8FB0FF','#4FC601','#3B5DFF','#4A3B53','#61615A','#BA0900','#6B7900','#00C2A0','#FFAA92','#FF90C9','#B903AA')
image(1:length(color_celltype),1, as.matrix(1:length(color_celltype)),col=color_celltype,ylab="",xlab="",axes=F)
axis(3,at=seq(1:length(color_celltype)),labels=color_celltype,las=2,lwd=0)

op=par(mar=c(5,4,6,4))
col.set.update <- c("#c10023", "#008e17", "#fb8500", "#f60000", "#FE0092", "#bc9000","#4ffc00", "#00bcac", "#0099cc",
                    "#D35400", "#00eefd", "#cf6bd6", "#99cc00", "#aa00ff", "#ff00ff", "#0053c8",
                    "#f2a287","#ffb3ff", "#800000", "#77a7b7", "#00896e", "#00cc99", "#007CC8")
image(1:length(col.set.update),1, as.matrix(1:length(col.set.update)),col=col.set.update,ylab="",xlab="",axes=F)
axis(3,at=seq(1:length(col.set.update)),labels=col.set.update,las=2,lwd=0)

The Clustering Result

DESC

df0=list()
df0$desc=list()
df0$cca=list()
df0$cca3.0=list()
df0$mnn=list()
df0$scVI=list()
df0$saucie=list()
df0$BERMUDA=list()
df0$scanorama=list()
KL_list=list()
KL_list$desc=list()
KL_list$cca=list()
KL_list$cca3.0=list()
KL_list$mnn=list()
KL_list$scVI=list()
KL_list$saucie=list()
KL_list$BERMUDA=list()
KL_list$scanorama=list()
#if (file.exists("df0.RData")) load("df0.RData")
resolution_use="0.2"
desc_use=paste0("desc_",resolution_use)
tsne_use=paste0("tsne",resolution_use)
obsm_use=paste0("X_Embeded_z",resolution_use)
dataset="pancreas_v3_result/"
filename=paste0(datadirpath,"/",dataset,"/adata_desc.h5ad")
Sys.setenv(RETICULATE_PYTHON="/usr/bin/python3")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
obj0=Convert_from_anndata_to_seurat(adata,raw.X.slot = "count.data")
obj0=NormalizeData(obj0,display.progress = F)
#obj=Convert_from_anndata_to_seurat(adata,raw.X.slot="count.data")
df_desc=getplotdata(adata,reduction.use = tsne_use)
df0$desc=list()
df0$desc$df_desc=df_desc
df0$desc$dr_desc=py_to_r(adata$obsm[[obsm_use]]) #just for KL
if (! desc_use %in% names(KL_list$desc)){
tmp=data.frame(tsne_tech=BatchKL(df_desc,n_cells=100,batch="tech"),
               dr_tech=BatchKL(df_desc,df0$desc$dr_desc,n_cells=100,batch="tech"))
KL_list$desc[[desc_use]]=tmp
}
df=reshape2::melt(KL_list$desc[[desc_use]],id.vars=NULL)
colnames(df)=c("Method","value")
df$Group=sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][1])
df$Method=factor(df$Method,levels = c("tsne_tech","dr_tech"))
df$batchid=factor(sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][2]),levels=c("tech"))
p_KL=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=batchid,fill=Group))+
  scale_fill_manual(values = c("dr"="white","tsne"="grey"))+
   theme_self+theme(
   aspect.ratio = 1,
   axis.text.x   = element_text(angle = 60,hjust=1,vjust=1),
   panel.border =element_blank(),
   axis.line = element_line())+
   ggtitle("")
df_desc[,desc_use]=factor(as.numeric(as.character(df_desc[,desc_use])))
ari=mclust::adjustedRandIndex(df_desc[,'celltype'],df_desc[,desc_use])
#pdesc_1=getplot(df_desc,by.group = desc_use,ggtitle0 = paste0("Cluster","(ARI=",round(ari,3),")"),pt.size = 0.2)+theme(plot.title = element_text(color="red"))
df_desc$res=df_desc[,desc_use]
pdesc_1=getplot(df_desc,by.group = "res",ggtitle0 = "DESC",pt.size = 0.5)+theme(plot.title = element_text(color="red"))+
  scale_color_manual(values=colorRampPalette(col.set.update)(15))+
  coord_cartesian()
pdesc_2=getplot(df_desc,by.group = "celltype",ggtitle0 = "Celltype",pt.size = 0.5)+coord_cartesian()+
  scale_color_manual(values=colorRampPalette(color_celltype[1:13])(13))
pdesc_3=getplot(df_desc,by.group = "tech",ggtitle0 = "tech",pt.size = 0.5)+
   scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
x=cbind(df_desc[,c(1,2)],"max.prob"=apply(py_to_r(adata$uns[paste0("prob_matrix",resolution_use)]),1,max))
pdesc_4=getfeature.plot(x,pt.size = 0.5,cols.use = c("red","grey"))[[1]]+coord_cartesian()
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
p0=egg::ggarrange(pdesc_1,pdesc_2,pdesc_3,pdesc_4,p_KL,ncol=3,draw = F)
## adding dummy grobs
p0

Seurat3.0

batch_id="tech"
detach("package:Seurat",unload = T)
suppressPackageStartupMessages(library("Seurat",lib.loc = "/usr/lib/R/self_library/"))
filename=paste0(datadirpath,"/cca3.0_result/","obj_cca3.0_",batch_id,"_2000.rds")
obj=readRDS(filename)
df_cca3.0=cbind(obj[["tsne"]]@cell.embeddings[,c(1,2)],obj@meta.data)
df_cca3.0$sample=df_cca3.0$BatchID2
df0$cca3.0$df_cca3.0=df_cca3.0
df0$cca3.0$dr_cca3.0=obj[["pca"]]@cell.embeddings[,1:20]
detach("package:Seurat",unload = T)
suppressPackageStartupMessages(library("Seurat"))
#print(mclust::adjustedRandIndex(df_cca$cluster,df_cca$integrated_snn_res.0.5))
#getplot(df_cca,by.group = "sample",pt.size = 0.1)
cca3.0_use="techtop2000"
if (! cca3.0_use %in% names(KL_list$cca3.0)){
tmp=data.frame(tsne_tech=BatchKL(df0$cca3.0$df_cca3.0,n_cells=100,batch="tech"),
               dr_tech=BatchKL(df0$cca3.0$df_cca3.0,df0$cca3.0$dr_cca3.0,n_cells=100,batch="tech"))
KL_list$cca3.0[[cca3.0_use]]=tmp
}
df=reshape2::melt(KL_list$cca3.0[[cca3.0_use]],id.vars=NULL)
colnames(df)=c("Method","value")
df$Group=sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][1])
df$Method=factor(df$Method,levels = c("tsne_tech","dr_tech"))
df$batchid=factor(sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][2]),levels=c("tech"))
p_KL=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=batchid,fill=Group))+
  scale_fill_manual(values = c("dr"="white","tsne"="grey"))+
   theme_self+theme(
   aspect.ratio = 1,
   axis.text.x   = element_text(angle = 60,hjust=1,vjust=1),
   panel.border =element_blank(),
   axis.line = element_line())+
   ggtitle("")
df_cca3.0[,"integrated_snn_res.0.25"]=factor(as.numeric(as.character(df_cca3.0[,"integrated_snn_res.0.25"])))
df_cca3.0[,"res"]=df_cca3.0[,"integrated_snn_res.0.25"]
ari=mclust::adjustedRandIndex(df_cca3.0[,'celltype'],df_cca3.0[,"res"])
#pcca3.0_1=getplot(df_cca3.0,by.group = "res",ggtitle0 = paste0("Cluster","(ARI=",round(ari,3),")"),pt.size = 0.5)+theme(plot.title = element_text(color="red"))
pcca3.0_1=getplot(df_cca3.0,by.group = "res",ggtitle0 = "Seurat3.0",pt.size = 0.5)+theme(plot.title = element_text(color="red"))+
  scale_color_manual(values=colorRampPalette(col.set.update)(15))+
  coord_cartesian()
pcca3.0_2=getplot(df_cca3.0,by.group = "celltype",ggtitle0 = "celltype",pt.size = 0.5)+
  scale_color_manual(values=colorRampPalette(color_celltype[1:13])(13))+
  coord_cartesian()
pcca3.0_3=getplot(df_cca3.0,by.group = "tech",ggtitle0 = "tech",pt.size = 0.5)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
p0=egg::ggarrange(pcca3.0_1,pcca3.0_2,pcca3.0_3,p_KL,ncol=3,draw = F)
## adding dummy grobs
p0

CCA

batch_id="tech"
filename=paste0(datadirpath,"/cca_result/","obj_cca_",batch_id,"_2000_False.rds")
obj=readRDS(filename)
df_cca=cbind(obj@dr$tsne@cell.embeddings[,c(1,2)],obj@meta.data)
df0$cca$df_cca=df_cca
df0$cca$dr_cca=obj@dr$cca@cell.embeddings
cca_use="techtop2000"
if (! cca_use %in% names(KL_list$cca)){
tmp=data.frame(tsne_tech=BatchKL(df0$cca$df_cca,n_cells=100,batch="tech"),
               dr_tech=BatchKL(df0$cca$df_cca,df0$cca$dr_cca,n_cells=100,batch="tech"))
KL_list$cca[[cca_use]]=tmp
}
df=reshape2::melt(KL_list$cca[[cca_use]],id.vars=NULL)
colnames(df)=c("Method","value")
df$Group=sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][1])
df$Method=factor(df$Method,levels = c("tsne_tech","dr_tech"))
df$batchid=factor(sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][2]),levels=c("tech"))
p_KL=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=batchid,fill=Group))+
  scale_fill_manual(values = c("dr"="white","tsne"="grey"))+
   theme_self+theme(
   aspect.ratio = 1,
   axis.text.x   = element_text(angle = 60,hjust=1,vjust=1),
   panel.border =element_blank(),
   axis.line = element_line())+
   ggtitle("")
df_cca[,"res.0.6"]=factor(as.numeric(as.character(df_cca[,"res.0.6"])))
df_cca[,"res"]=df_cca[,"res.0.6"]
ari=mclust::adjustedRandIndex(df_cca[,'celltype'],df_cca[,"res"])
#pcca_1=getplot(df_cca,by.group = "res",ggtitle0 = paste0("Cluster","(ARI=",round(ari,3),")"),pt.size = 0.5)+theme(plot.title = element_text(color="red"))
pcca_1=getplot(df_cca,by.group = "res",ggtitle0 = "CCA",pt.size = 0.5)+theme(plot.title = element_text(color="red"))+
   scale_color_manual(values=colorRampPalette(col.set.update)(15))+
  coord_cartesian()

pcca_2=getplot(df_cca,by.group = "celltype",ggtitle0 = "celltype",pt.size = 0.5)+
   scale_color_manual(values=colorRampPalette(color_celltype[1:13])(13))+
  coord_cartesian()
pcca_3=getplot(df_cca,by.group = "tech",ggtitle0 = "tech",pt.size = 0.5)+
   scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
p0=egg::ggarrange(pcca_1,pcca_2,pcca_3,p_KL,ncol=3,draw = F)
## adding dummy grobs
p0

MNN

batch_id="tech"
filename=paste0(datadirpath,"/mnn_result/","adata_mnn_",batch_id,"_2000_False.h5ad")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
df_mnn=getplotdata(adata,reduction.use = "tsne")
df0$mnn$df_mnn=df_mnn
df0$mnn$dr_mnn=py_to_r(adata$obsm["X_pca"])[,c(1:20)]
mnn_use="techtop2000"
if (! mnn_use %in% names(KL_list$mnn)){
tmp=data.frame(tsne_tech=BatchKL(df0$mnn$df_mnn,n_cells=100,batch="tech"),
               dr_tech=BatchKL(df0$mnn$df_mnn,df0$mnn$dr_mnn,n_cells=100,batch="tech"))
KL_list$mnn[[mnn_use]]=tmp
}
df=reshape2::melt(KL_list$mnn[[mnn_use]],id.vars=NULL)
colnames(df)=c("Method","value")
df$Group=sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][1])
df$Method=factor(df$Method,levels = c("tsne_tech","dr_tech"))
df$batchid=factor(sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][2]),levels=c("tech"))
p_KL=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=batchid,fill=Group))+
  scale_fill_manual(values = c("dr"="white","tsne"="grey"))+
   theme_self+theme(
   aspect.ratio = 1,
   axis.text.x   = element_text(angle = 60,hjust=1,vjust=1),
   panel.border =element_blank(),
   axis.line = element_line())+
   ggtitle("")
df_mnn[,"louvain_ori0.1"]=factor(as.numeric(as.character(df_mnn[,"louvain_ori0.1"])))
df_mnn[,"res"]=df_mnn[,"louvain_ori0.1"]
ari=mclust::adjustedRandIndex(df_mnn[,'celltype'],df_mnn[,"res"])
#pmnn_1=getplot(df_mnn,by.group = "res",ggtitle0 = paste0("Cluster","(ARI=",round(ari,3),")"),pt.size = 0.5)+theme(plot.title = element_text(color="red"))
pmnn_1=getplot(df_mnn,by.group = "res",ggtitle0 = "MNN",pt.size = 0.5)+theme(plot.title = element_text(color="red"))+
     scale_color_manual(values=colorRampPalette(col.set.update)(15))+
  coord_cartesian()
pmnn_2=getplot(df_mnn,by.group = "celltype",ggtitle0 = "celltype",pt.size = 0.5)+
     scale_color_manual(values=colorRampPalette(color_celltype[1:13])(13))+
  coord_cartesian()
pmnn_3=getplot(df_mnn,by.group = "tech",ggtitle0 = "tech",pt.size = 0.5)+
     scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
p0=egg::ggarrange(pmnn_1,pmnn_2,pmnn_3,p_KL,ncol=3,draw = F)
## adding dummy grobs
p0

scVI

batch_id="tech"
filename=paste0(datadirpath,"/scvi_result/","adata_scvi_",batch_id,"_2000_False.h5ad")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
df_scvi=getplotdata(adata,reduction.use = "tsne")
df0$scvi$df_scvi=df_scvi
df0$scvi$dr_scvi=py_to_r(adata$X)
scvi_use="techtop2000"
if (! scvi_use %in% names(KL_list$scvi)){
tmp=data.frame(tsne_tech=BatchKL(df0$scvi$df_scvi,n_cells=100,batch="tech"),
               dr_tech=BatchKL(df0$scvi$df_scvi,df0$scvi$dr_scvi,n_cells=100,batch="tech"))
KL_list$scvi[[scvi_use]]=tmp
}
df=reshape2::melt(KL_list$scvi[[scvi_use]],id.vars=NULL)
colnames(df)=c("Method","value")
df$Group=sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][1])
df$Method=factor(df$Method,levels = c("tsne_tech","dr_tech"))
df$batchid=factor(sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][2]),levels=c("tech"))
p_KL=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=batchid,fill=Group))+
  scale_fill_manual(values = c("dr"="white","tsne"="grey"))+
   theme_self+theme(
   aspect.ratio = 1,
   axis.text.x   = element_text(angle = 60,hjust=1,vjust=1),
   panel.border =element_blank(),
   axis.line = element_line())+
   ggtitle("")
df_scvi[,"louvain_ori0.4"]=factor(as.numeric(as.character(df_scvi[,"louvain_ori0.4"])))
df_scvi[,"res"]=df_scvi[,"louvain_ori0.4"]
ari=mclust::adjustedRandIndex(df_scvi[,'celltype'],df_scvi[,"res"])
#pscvi_1=getplot(df_scvi,by.group = "res",ggtitle0 = paste0("Cluster","(ARI=",round(ari,3),")"),pt.size = 0.5)+theme(plot.title = element_text(color="red"))
pscvi_1=getplot(df_scvi,by.group = "res",ggtitle0 = "scVI",pt.size = 0.5)+theme(plot.title = element_text(color="red"))+
   scale_color_manual(values=colorRampPalette(col.set.update)(15))+
  coord_cartesian()
pscvi_2=getplot(df_scvi,by.group = "celltype",ggtitle0 = "celltype",pt.size = 0.5)+
   scale_color_manual(values=colorRampPalette(color_celltype[1:13])(13))+
  coord_cartesian()
pscvi_3=getplot(df_scvi,by.group = "tech",ggtitle0 = "tech",pt.size = 0.5)+
   scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
p0=egg::ggarrange(pscvi_1,pscvi_2,pscvi_3,p_KL,ncol=3,draw = F)
## adding dummy grobs
p0

BERMUDA

BERMUDA threshold=0.9

batch_id="BatchID"
filename=paste0(datadirpath,"/BERMUDA_result/adata_bermuda0.9_2000.h5ad")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
#print(mclust::adjustedRandIndex(df_cca$cluster,df_cca$integrated_snn_res.0.2))
df_bermuda=getplotdata(adata,reduction.use = "tsne")
df0$BERMUDA$df_bermuda=df_bermuda
df0$BERMUDA$dr_bermuda=py_to_r(adata$obsm["X_bermuda"])[,c(1:20)]
BERMUDA_use="tech0.9"

if (! BERMUDA_use %in% names(KL_list$BERMUDA)){
tmp=data.frame(tsne_tech=BatchKL(df_bermuda,n_cells=100,batch="tech"),
               dr_tech=BatchKL(df_bermuda,df0$BERMUDA$dr_bermuda,n_cells=100,batch="tech"))
KL_list$BERMUDA[[BERMUDA_use]]=tmp
}
df=reshape2::melt(KL_list$BERMUDA[[BERMUDA_use]],id.vars=NULL)
colnames(df)=c("Method","value")
df$Group=sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][1])
df$Method=factor(df$Method,levels = c("tsne_tech","dr_tech"))
df$batchid=factor(sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][2]),levels=c("tech"))
p_KL=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=batchid,fill=Group))+
  scale_fill_manual(values = c("dr"="white","tsne"="grey"))+
   theme_self+theme(
   aspect.ratio = 1,
   axis.text.x   = element_text(angle = 60,hjust=1,vjust=1),
   panel.border =element_blank(),
   axis.line = element_line())+
   ggtitle("")
#stopifnot(any(rownames(df_desc)==df_mnn$cellname))
id0=match(rownames(df_desc),df_bermuda$cellname)
df_bermuda[,"louvain_0.4"]=factor(as.numeric(as.character(df_bermuda[,"louvain_0.4"])))
df_bermuda[,"res"]=df_bermuda[,"louvain_0.4"]
df_bermuda_0.90=df_bermuda
pbermuda0.90_1=getplot(df_bermuda,by.group = "res",ggtitle0 = "BERMUDA0.90",pt.size = 0.5)+theme(plot.title = element_text(color="red"))+
   scale_color_manual(values=colorRampPalette(col.set.update)(15))+
  coord_cartesian()
pbermuda0.90_2=getplot(df_bermuda,by.group = "celltype",ggtitle0 = "BERMUDA0.90",pt.size = 0.5)+
   scale_color_manual(values=colorRampPalette(color_celltype[1:13])(13))+
  coord_cartesian()
pbermuda0.90_3=getplot(df_bermuda,by.group = "tech",ggtitle0 = "BERMUDA0.90",pt.size = 0.5)+
   scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
p0=egg::ggarrange(pbermuda0.90_1,pbermuda0.90_2,pbermuda0.90_3,p_KL,ncol=3,draw = F)
## adding dummy grobs
p0

BERMUDA threshold=0.85

batch_id="BatchID"
filename=paste0(datadirpath,"/BERMUDA_result/adata_bermuda0.85_2000.h5ad")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
#print(mclust::adjustedRandIndex(df_cca$cluster,df_cca$integrated_snn_res.0.2))
df_bermuda=getplotdata(adata,reduction.use = "tsne")
df0$BERMUDA$df_bermuda=df_bermuda
df0$BERMUDA$dr_bermuda=py_to_r(adata$obsm["X_bermuda"])[,c(1:20)]
BERMUDA_use="tech0.85"
if (! BERMUDA_use %in% names(KL_list$BERMUDA)){
  tmp=data.frame(tsne_tech=BatchKL(df_bermuda,n_cells=100,batch="tech"),
               dr_tech=BatchKL(df_bermuda,df0$BERMUDA$dr_bermuda,n_cells=100,batch="tech"))
  KL_list$BERMUDA[[BERMUDA_use]]=tmp
}
df=reshape2::melt(KL_list$BERMUDA[[BERMUDA_use]],id.vars=NULL)
colnames(df)=c("Method","value")
df$Group=sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][1])
df$Method=factor(df$Method,levels = c("tsne_tech","dr_tech"))
df$batchid=factor(sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][2]),levels=c("tech"))
p_KL=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=batchid,fill=Group))+
  scale_fill_manual(values = c("dr"="white","tsne"="grey"))+
   theme_self+theme(
   aspect.ratio = 1,
   axis.text.x   = element_text(angle = 60,hjust=1,vjust=1),
   panel.border =element_blank(),
   axis.line = element_line())+
   ggtitle("")
#stopifnot(any(rownames(df_desc)==df_mnn$cellname))
id0=match(rownames(df_desc),df_bermuda$cellname)

df_bermuda[,"louvain_0.4"]=factor(as.numeric(as.character(df_bermuda[,"louvain_0.4"])))
df_bermuda[,"res"]=df_bermuda[,"louvain_0.4"]
df_bermuda_0.85=df_bermuda
pbermuda0.85_1=getplot(df_bermuda,by.group = "res",ggtitle0 = "BERMUDA0.85",pt.size = 0.5)+theme(plot.title = element_text(color="red"))+
   scale_color_manual(values=colorRampPalette(col.set.update)(15))+
  coord_cartesian()
pbermuda0.85_2=getplot(df_bermuda,by.group = "celltype",ggtitle0 = "BERMUDA0.85",pt.size = 0.5)+
   scale_color_manual(values=colorRampPalette(color_celltype[1:13])(13))+
  coord_cartesian()
pbermuda0.85_3=getplot(df_bermuda,by.group = "tech",ggtitle0 = "BERMUDA0.85",pt.size = 0.5)+
   scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
p0=egg::ggarrange(pbermuda0.85_1,pbermuda0.85_2,pbermuda0.85_3,p_KL,ncol=3,draw = F)
## adding dummy grobs
p0

scanorama

scanorama embeded

batch_id="tech"
filename=paste0(datadirpath,"/scanorama_result/adata_scanorama_tech_2000_False.h5ad")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
#print(mclust::adjustedRandIndex(df_cca$cluster,df_cca$integrated_snn_res.0.2))
df_scanorama=getplotdata(adata,reduction.use = "tsne")
df0$scanorama$df_scanorama=df_scanorama
df0$scanorama$dr_scanorama=py_to_r(adata$obsm["X_scanorama"])[,c(1:100)]
scanorama_use="tech.emd"
if (! scanorama_use %in% names(KL_list$scanorama)){
  tmp=data.frame(tsne_tech=BatchKL(df_scanorama,n_cells=100,batch="tech"),
               dr_tech=BatchKL(df_scanorama,df0$scanorama$dr_scanorama,n_cells=100,batch="tech"))
  KL_list$scanorama[[scanorama_use]]=tmp
}
df=reshape2::melt(KL_list$scanorama[[scanorama_use]],id.vars=NULL)
colnames(df)=c("Method","value")
df$Group=sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][1])
df$Method=factor(df$Method,levels = c("tsne_tech","dr_tech"))
df$batchid=factor(sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][2]),levels=c("tech"))
p_KL=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=batchid,fill=Group))+
  scale_fill_manual(values = c("dr"="white","tsne"="grey"))+
   theme_self+theme(
   aspect.ratio = 1,
   axis.text.x   = element_text(angle = 60,hjust=1,vjust=1),
   panel.border =element_blank(),
   axis.line = element_line())+
   ggtitle("")
#stopifnot(any(rownames(df_desc)==df_mnn$cellname))
id0=match(rownames(df_desc),df_scanorama$cellname)
df_scanorama[,"louvain_0.4"]=factor(as.numeric(as.character(df_scanorama[,"louvain_0.4"])))
df_scanorama[,"res"]=df_scanorama[,"louvain_0.4"]
pscanorama_1=getplot(df_scanorama,by.group = "res",ggtitle0 = "scanorama",pt.size = 0.2)+theme(plot.title = element_text(color="red"))+
  scale_color_manual(values=colorRampPalette(col.set.update)(15))+
  coord_cartesian()
pscanorama_2=getplot(df_scanorama,by.group = "celltype",ggtitle0 = "scanorama",pt.size = 0.1)+
  scale_color_manual(values=colorRampPalette(color_celltype[1:13])(13))+
  coord_cartesian()
pscanorama_3=getplot(df_scanorama,by.group = "tech",ggtitle0 = "scanorama",pt.size = 0.1)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
p0=egg::ggarrange(pscanorama_1,pscanorama_2,pscanorama_3,p_KL,ncol=3,draw = F)
## adding dummy grobs
p0

Computing ARI & KL

KL_list$desc[[1]]$Method="DESC"
KL_list$cca3.0[[1]]$Method="Seurat3.0"
KL_list$cca[[1]]$Method="CCA"
KL_list$mnn[[1]]$Method="MNN"
KL_list$scvi[[1]]$Method="scVI"
KL_list$BERMUDA[[1]]$Method="BERMUDA0.90"
KL_list$BERMUDA[[2]]$Method="BERMUDA0.85"
KL_list$scanorama[[1]]$Method="scanorama"
data_KL=do.call("rbind",list(KL_list$desc[[1]],
                             KL_list$cca3.0[[1]],
                             KL_list$cca[[1]],
                             KL_list$mnn[[1]],
                             KL_list$scvi[[1]],
                             KL_list$BERMUDA[[1]],
                             KL_list$BERMUDA[[2]],
                             KL_list$scanorama[[1]]))
data_KL=data_KL[,c(2,3)]
data_KL=subset(data_KL,!Method%in%"BERMUDA0.85")
df=reshape2::melt(data_KL,id.vars="Method")
df$Method=factor(df$Method,levels=c("DESC","Seurat3.0","CCA","MNN","scVI","BERMUDA0.90","scanorama"),
                                    labels=c("DESC","Seurat3.0","CCA","MNN","scVI","BERMUDA","scanorama"))

colnames(df)=c("Method","Group","value")
p_KL=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=Method))+
   theme_self+theme(
   #aspect.ratio = 1,
   panel.border =element_blank(),
   axis.line = element_line())+
  theme22(20)+
  theme(axis.text.x   = element_text(angle = 30,hjust=1,size=20,vjust=1))+
   ggtitle("KL divergence")
#p_KL
theme_use=theme(legend.key.size = unit(1,"cm"),
                legend.key.width = unit(0,'cm'),
                legend.text = element_text(size=16),
                legend.title = element_text(size=20),
                legend.key.height = unit(1.0,"cm"),
                axis.text.x = element_blank(),
                axis.text.y = element_blank())

theme_use2=theme(legend.key.size = unit(0.5,"cm"),
                  legend.text = element_text(size=16),
                  legend.key.height = unit(0.5,"cm"),
                  legend.title = element_text(size=20),
                 #axis.title.x = element_blank(),
                 axis.text.x = element_blank(),
                axis.text.y = element_blank())

theme_use_feature=theme(legend.justification=0.5,
                legend.key.size = unit(1,"cm"),
                legend.text = element_text(size=25),
                legend.title = element_text(size=30),
                legend.key.height = unit(1.3,"cm"),
                axis.text.x = element_blank(),
                axis.text.y=element_blank())

p1=egg::ggarrange(pdesc_1+theme_use,
                  pcca3.0_1+theme_use,
                  pcca_1+theme_use,
                  pmnn_1+theme_use,
                  pscvi_1+theme_use,
                  pdesc_2+theme_use+theme(legend.position = "none"),
                  pcca3.0_2+theme_use+theme(legend.position = "none"),
                  pcca_2+theme_use+theme(legend.position = "none"),
                  pmnn_2+theme_use+theme(legend.position = "none"),
                  pscvi_2+theme_use,
                  pdesc_3+theme_use+theme(legend.position = "none"),
                  pcca3.0_3+theme_use+theme(legend.position = "none"),
                  pcca_3+theme_use+theme(legend.position = "none"),
                  pmnn_3+theme_use+theme(legend.position = "none"),
                  pscvi_3+theme_use,
                  ncol=5,draw=F)
data_ARI=data.frame(ARI=c(mclust::adjustedRandIndex(df_desc[,desc_use],df_desc$celltype),
                       mclust::adjustedRandIndex(df_cca3.0[,"res"],df_cca3.0$celltype),
                       mclust::adjustedRandIndex(df_cca[,"res"],df_cca$celltype),
                       mclust::adjustedRandIndex(df_mnn[,"res"],df_mnn$celltype),
                       mclust::adjustedRandIndex(df_scvi[,"res"],df_scvi$celltype),
                       mclust::adjustedRandIndex(df_bermuda_0.90[,"res"],df_bermuda_0.90$celltype),
                       #mclust::adjustedRandIndex(df_bermuda_0.85[,"res"],df_bermuda_0.85$celltype),
                       mclust::adjustedRandIndex(df_scanorama[,"res"],df_scanorama$celltype))
                 )
data_ARI$Method=factor(c("DESC","Seurat3.0","CCA","MNN","scVI","BERMUDA","scanorama"),
                       levels = c("DESC","Seurat3.0","CCA","MNN","scVI","BERMUDA","scanorama"))
p_ARI=ggplot(data=data_ARI,aes(x=Method,y=ARI))+
  geom_col(aes(fill=Method),position="dodge")+
  scale_y_continuous(expand=c(0,0),limits=c(0,1.0))+
  geom_text(aes(label=round(ARI,3)),vjust=0.1)+
  ggtitle("Adjusted Rand Index")+
  ylab("ARI")+
   theme_self+
  theme22(20)+
  theme(axis.text.x   = element_text(angle = 30,hjust=1,size=20,vjust=1))+
  xlab("")
p=egg::ggarrange(p_ARI,p_KL,ncol=2,draw=F)

We used the following code to generate sankey plot, but the output is in html format, which can be saved as pdf or other format (such as *png, or *.jpeg) manually. We combined the sankey plots in Supplementary Figure 4 and Supplementary Figure 6 using Inkspace software.

# generate sankey plots 
s_DESC=getmy.sankey(factor(df_desc$celltype),factor(df_desc[,desc_use]),plot_height=650,fontsize=17)
print(s_DESC,tag=NULL,file="DESC_sankey_retina.html")
#louvain
s_Seurat3.0=getmy.sankey(factor(df_cca3.0$celltype),factor(df_cca3.0$res),fontsize=17)
print(s_Seurat3.0,tag=NULL,file="Seurat3.0_sankey_retina.html")

#cca
s_mnn=getmy.sankey(factor(df_cca$celltype),factor(df_cca$res),plot_height=650,fontsize =17)
print(s_mnn,tag=NULL,file="CCA_sankey_retina.html")

#MNN
s_mnn_top1000=getmy.sankey(factor(df_mnn$celltype),factor(df_mnn$res),plot_height=650,fontsize =17)
print(s_mnn_top1000,tag=NULL,file="MNN_sankey_retina.html")

#scVI
s_seurat=getmy.sankey(factor(df_scvi$celltype),factor(df_scvi$res),plot_height=650,fontsize =17)
print(s_seurat,tag=NULL,file="scVI_sankey_retina.html")

#BERMUDA0.90
s_seurat=getmy.sankey(factor(df_bermuda_0.90$celltype),factor(df_bermuda_0.90$res),plot_height=650,fontsize =17)
print(s_seurat,tag=NULL,file="BERMUDA0.90_sankey_retina.html")


#BERMUDA0.85
s_seurat=getmy.sankey(factor(df_bermuda_0.85$celltype),factor(df_bermuda_0.85$res),plot_height=650,fontsize =17)
print(s_seurat,tag=NULL,file="BERMUDA0.85_sankey_retina.html")


#scanorama
s_seurat=getmy.sankey(factor(df_scanorama$celltype),factor(df_scanorama$res),plot_height=650,fontsize =17)
print(s_seurat,tag=NULL,file="scanorama_sankey_retina.html")
pdesc_3=getplot(df_desc,by.group = "tech",ggtitle0 = "DESC",pt.size = 0.8)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
pcca3.0_3=getplot(df_cca3.0,by.group = "tech",ggtitle0 = "Seurat3.0",pt.size = 0.8)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
pcca_3=getplot(df_cca,by.group = "tech",ggtitle0 = "CCA",pt.size = 0.8)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
pmnn_3=getplot(df_mnn,by.group = "tech",ggtitle0 = "MNN",pt.size = 0.8)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
pscvi_3=getplot(df_scvi,by.group = "tech",ggtitle0 = "scVI",pt.size = 0.8)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
pbermuda0.90_3=getplot(df_bermuda_0.90,by.group = "tech",ggtitle0 = "BERMUDA",pt.size = 0.8)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
pbermuda0.85_3=getplot(df_bermuda_0.85,by.group = "tech",ggtitle0 = "BERMUDA0.85",pt.size = 0.8)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
pscanorama_3=getplot(df_scanorama,by.group = "tech",ggtitle0 = "scanorama",pt.size = 0.8)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()


p_ARI=ggplot(data=data_ARI,aes(x=Method,y=ARI))+
  geom_col(aes(fill=Method),position="dodge")+
  scale_y_continuous(expand=c(0,0),limits=c(0,1.0))+
  geom_text(aes(label=round(ARI,3)),vjust=0.1,size=5)+
  ggtitle("Adjusted Rand Index")+
  ylab("ARI")+
  xlab("")+
  theme(axis.text.x   = element_text(angle = 60,hjust=1,vjust=1,size=18),
        axis.text.y   = element_text(size=18),
        plot.title = element_text(size=22)
        )

p2d=egg::ggarrange(pdesc_3+theme_use+theme(legend.position = "none"),
                  pcca3.0_3+theme_use+theme(legend.position = "none"),
                  pcca_3+theme_use+theme(legend.position = "none"),
                  pmnn_3+theme_use+theme(legend.position = "none"),
                  pscvi_3+theme_use+theme(legend.position = "none"),
                  pbermuda0.90_3+theme_use+theme(legend.position = "none"),
                  #pbermuda0.85_3+theme_use+theme(legend.position = "none"),
                  pscanorama_3+theme_use+theme(legend.position = "none"),
                  ncol=4,draw=F)
## adding dummy grobs
#put legend below figure
#p1=grid.arrange(p2d,get_legend(pdesc_3+theme(legend.position = "top",legend.title = element_blank(),legend.text = element_text(size=25))+
#                             guides(colour = guide_legend(override.aes = list(alpha = 1, size=15),nrow=1))),nrow=2,heights=c(10, 1))
p_legend=get_legend(pdesc_3+theme(legend.position = "top",legend.title = element_blank(),legend.text = element_text(size=25))+
                              guides(colour = guide_legend(override.aes = list(alpha = 1, size=15),ncol=1)))
pa=egg::ggarrange(plots =list(pdesc_3+theme_use+theme(legend.position = "none"),
                  pcca3.0_3+theme_use+theme(legend.position = "none"),
                  pcca_3+theme_use+theme(legend.position = "none"),
                  pmnn_3+theme_use+theme(legend.position = "none"),
                  pscvi_3+theme_use+theme(legend.position = "none"),
                  pbermuda0.90_3+theme_use+theme(legend.position = "none"),
                  pscanorama_3+theme_use+theme(legend.position = "none"),
             plot_grid(p_legend)),ncol=4,draw = F)
pa

Removing batch effect gradually

dataset="pancreas_v3_result/"
filename=paste0(datadirpath,"/",dataset,"/adata_desc.h5ad")
Sys.setenv(RETICULATE_PYTHON="/usr/bin/python3")
suppressPackageStartupMessages(library(reticulate))
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata)
#generate data
#drs0= unlist(x = py_to_r(adata$obsm$keys()))
drs0<-unlist(strsplit(gsub(".{1,50}:|\\s|)",replacement = "",x = as.character(py_to_r(adata$obsm$keys()))),split = ","))
drs=c(drs0[grepl(pattern = "tsne_AE_epoch_0.2",drs0)],"X_tsne0.2")
#df_desc=getplotdata(adata,reduction.use = "tsne0.2")
df_list=lapply(drs,function(x){
  return(getplotdata(adata,reduction.use =paste0(strsplit(x,"_")[[1]][-1],collapse = "_") ))
})
names(df_list)=c(drs[-length(drs)],"X_tsne_AE_epoch_0.2_final")

drs=c(drs0[grepl(pattern = "X_Embeded_AE_epoch_0.2",drs0)],"X_Embeded_z0.2")
dr_list=lapply(drs,function(x){
  return(py_to_r(adata$obsm[[x]]))
})
names(dr_list)=c(drs[-length(drs)],"X_Embeded_AE_epoch_0.2_final")

pList=lapply(names(df_list),function(x){
  p=getplot(df_list[[x]],by.group = "tech",ggtitle0 = paste0("epoch:",strsplit(x,"_")[[1]][6]),pt.size = 0.8)+
    theme_use+
    scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
    theme(legend.position = "none")+
    coord_cartesian()
})

pList_legend=get_legend(getplot(df_list[["X_tsne_AE_epoch_0.2_0"]],by.group = "tech")+theme_use+
                          theme(legend.position = "bottom")+
  scale_color_manual(name="",values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  guides(colour = guide_legend(override.aes = list(alpha = 1, size=15),nrow =1)))
  


KL_list=lapply(names(dr_list),function(x){
  BatchKL(df_desc,dr_list[[x]],n_cells=100,batch="tech")
})
KL_data=do.call("cbind",KL_list)
colnames(KL_data)=unlist(lapply(names(dr_list),function(x) paste0("epoch:",strsplit(x,"_")[[1]][6])))

df=reshape2::melt(KL_data,id.vars=NULL)
colnames(df)[-1]=c("Method","value")
pe=ggplot(df,aes(Method,value))+xlab("Method")+
  ylab("KL divergence")+
  geom_boxplot(aes(color=Method),fill = "white",width=0.4,size=1.2)+
  theme_self+theme(panel.border =element_blank(),
   axis.line = element_line())+theme22(20)+
  theme(axis.text.x   = element_text(angle = 30,hjust=1,size=20,vjust=1))+
  ggtitle("")+theme(legend.position = "none")+
  xlab("")
#save_plot(filename ="figures/res=0.8.pdf",plot_grid(plotlist = rlist::list.append(pList,p0),ncol=4),base_height = 14,base_width = 28)
p=egg::ggarrange(plots=pList,ncol=3,draw=F)
pd=egg::ggarrange(plot_grid(p),plot_grid(pList_legend),nrow=2,heights=c(8,1),draw = F)

Figure 4

fig_height0=26
p=ggdraw()+
  draw_label("a",x=0.005,y=1,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_plot(pa,x=0.005,y=14/fig_height0,width=0.995,height=12/fig_height0)+
  draw_label("b",x=0.005,y=14/fig_height0,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_plot(p_ARI+theme(legend.position = "none"),x=0.02,y=7/fig_height0,width=8/24,height=6/fig_height0)+
  draw_label("d",x=10.2/24,y=14/fig_height0,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_plot(pd,x=10.5/24,y=6/fig_height0,width=13/24,height=8/fig_height0)+
  draw_label("c",x=0.005,y=6/fig_height0,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_plot(egg::ggarrange(p_KL+theme(legend.position = "none"),
                           ggplot()+theme_void(),
                           pe,ncol=3,widths = c(8,1.5,13),draw=F),x=0.02,y=0,width=22/24,height=6/fig_height0)+
  draw_label("e",x=10.2/24,y=6/fig_height0,hjust=0.5,vjust=1,fontface = "bold",size=30)
#Figure 4
ggsave(file.path(fig_path,"Figure.4.tiff"),p,width=24,height = 26,dpi=300,compression="lzw")
ggsave(file.path(fig_path,"Figure.4.pdf"),p,width=24,height = 26,dpi=300)
p

Supplementary Figure 4

p1=egg::ggarrange(pdesc_1+theme_use+theme(plot.title = element_text(color="black")),
                  pcca3.0_1+theme_use+theme(plot.title = element_text(color="black")),
                  pcca_1+theme_use+theme(plot.title = element_text(color="black")),
                  pmnn_1+theme_use+theme(plot.title = element_text(color="black")),
                  pscvi_1+theme_use+theme(plot.title = element_text(color="black")),
                  pbermuda0.90_1+theme_use+theme(plot.title = element_text(color="black")),
                  #pbermuda0.85_1+theme_use+theme(plot.title = element_text(color="black")),
                  pscanorama_1+theme_use+theme(plot.title = element_text(color="black")),
                  ncol=4,draw=F)
## adding dummy grobs
right_margin=2.1                 
p2=egg::ggarrange( pdesc_2+theme_use+ggtitle("DESC")+theme(legend.position = "none",plot.margin = unit(c(0,right_margin,0,0), "cm")),
                  pcca3.0_2+theme_use+ggtitle("Seurat3.0")+theme(legend.position = "none",plot.margin = unit(c(0,right_margin,0,0), "cm")),
                  pcca_2+theme_use+ggtitle("CCA")+theme(legend.position = "none",plot.margin = unit(c(0,right_margin,0,0), "cm")),
                  pmnn_2+theme_use+ggtitle("MNN")+theme(legend.position = "none",plot.margin = unit(c(0,right_margin,0,0), "cm")),
                  pscvi_2+theme_use+ggtitle("scVI")+theme(legend.position = "none"),
                  pbermuda0.90_2+theme_use+ggtitle("BERMUDA")+theme(legend.position = "none"),
                  #pbermuda0.85_2+theme_use+ggtitle("BERMUDA0.85")+theme(legend.position = "none"),
                  pscanorama_2+theme_use+ggtitle("scanorama")+theme(legend.position = "none"),
                  ncol=4,draw=F)
## adding dummy grobs
p_legend=get_legend(pdesc_2+theme(legend.title = element_blank(),
                                  legend.position = "top",
                                  legend.text = element_text(size=25))+
                      guides(colour = guide_legend(override.aes = list(alpha = 1, size=8),nrow=2)))


p2=egg::ggarrange(plot_grid(p2),plot_grid(p_legend),ncol=1,heights=c(6,1),draw = F)
p00=egg::ggarrange(plot_grid(p1),ggplot()+ theme_void(),plot_grid(p2),ncol=1,heights = c(12,1,12.2),
                   labels = c("a","","b"),
                 label.args = list(gp = grid::gpar(cex =3.5,color="black",face="bold"),just="bottom"),draw=F)
#Supplementary Figure 4
ggsave(file.path(fig_path,"Figure.S4.tiff"),p00,width=24,height = 26,dpi=300,compression="lzw")
ggsave(file.path(fig_path,"Figure.S4.pdf"),p00,width=24,height = 26,dpi=300)
p00

Supplementary Figure 5

psankey_desc=ggdraw()+draw_image(magick::image_read("./DESC_sankey_plot.png"),scale=0.9)+
  draw_text("DESC",x=0.5,y=0.95,vjust=0,size=35)
psankey_cca3.0=ggdraw()+draw_image(magick::image_read("./Seurat3.0_sankey_plot.png"),scale=0.9)+
  draw_text("Seurat3.0",x=0.5,y=0.95,vjust=0,size=35)
psankey_cca=ggdraw()+draw_image(magick::image_read("./cca_sankey_plot.png"),scale=0.9)+
  draw_text("CCA",x=0.5,y=0.95,vjust=0,size=35)
psankey_mnn=ggdraw()+draw_image(magick::image_read("./MNN_sankey_plot.png"),scale=0.9)+
  draw_text("MNN",x=0.5,y=0.95,vjust=0,size=35)
psankey_scVI=ggdraw()+draw_image(magick::image_read("./scVI_sankey_plot.png"),scale=0.9)+
  draw_text("scVI",x=0.5,y=0.95,vjust=0,size=35)
psankey_BERMUDA=ggdraw()+draw_image(magick::image_read("./BERMUDA0.90_sankey_retina.png"),scale=0.9)+
  draw_text("BERMUDA",x=0.5,y=0.95,vjust=0,size=35)
psankey_scanorama=ggdraw()+draw_image(magick::image_read("./scanorama_correct_sankey_retina.png"),scale=0.9)+
  draw_text("scanorama",x=0.5,y=0.95,vjust=0,size=35)
ps5=egg::ggarrange(plots  = list(psankey_desc,
                               psankey_cca3.0,
                               psankey_cca,
                               psankey_mnn,
                               psankey_scVI,
                               psankey_BERMUDA,
                               psankey_scanorama),ncol=4,draw=F)
## adding dummy grobs
#Supplementary Figure 5
ggsave(file.path(fig_path,"Figure.S5.tiff"),ps5,width=20,height = 14,dpi=300,compression="lzw")
ggsave(file.path(fig_path,"Figure.S5.pdf"),ps5,width=20,height = 14,dpi=300)
ps5

Supplementary Figure 6

Compare t-distribution and gaussian

dataset="compare_t_gaussian/pancreas_v3_result_gaussian/"
filename=paste0(datadirpath,"/",dataset,"/adata_desc.h5ad")
Sys.setenv(RETICULATE_PYTHON="/usr/bin/python3")
ad=import("anndata",convert = FALSE)
adata_gaussian=ad$read_h5ad(filename)
adata_gaussian=change_obs(adata_gaussian)
df_desc_gaussian=getplotdata(adata_gaussian,reduction.use = "tsne0.3")
dr_desc_gaussian=py_to_r(adata_gaussian$obsm[["X_Embeded_z0.3"]]) #just for KL
df_desc_gaussian2=getplotdata(adata_gaussian,reduction.use = "tsne0.4")
dr_desc_gaussian2=py_to_r(adata_gaussian$obsm[["X_Embeded_z0.4"]]) #just for KL

tmp=data.frame(dr_tech=BatchKL(df_desc_gaussian,dr_desc_gaussian,n_cells=100,batch="tech"),
               dr_tech2=BatchKL(df_desc_gaussian2,dr_desc_gaussian2,n_cells=100,batch="tech"))
df=reshape2::melt(tmp,id.vars=NULL)
colnames(df)=c("Method","value")
df$Group=sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][1])
df$Method=factor(df$Method,levels = c("dr_tech","dr_tech2"),labels=c("res=0.3","res=0.4"))
#df$batchid=factor(sapply(as.character(df$Method),function(x) strsplit(x,"_")[[1]][2]),levels=c("tech"))
p_KL_gaussian=ggplot(df,aes(Method,value))+
   xlab("")+
   ylab("KL divergence")+
   geom_boxplot(aes(color=Method,fill=Group))+
  scale_fill_manual(values = c("dr"="white","tsne"="grey"))+
   theme_self+theme(
   #aspect.ratio = 1,
   axis.text.x   = element_text(angle = 30,hjust=1,vjust=1,size=20),
   axis.text.y=element_text(size=20),
   axis.title.y = element_text(size=25),
   panel.border =element_blank(),
  legend.position = "none",
   axis.line = element_line())+
  theme(panel.border = element_rect(fill=NA,color="white"),
        panel.background = element_blank())+
   ggtitle("")
#p_KL_gaussian
df_desc_gaussian[,"desc_0.3"]=factor(as.numeric(as.character(df_desc_gaussian[,"desc_0.3"])))
ari=mclust::adjustedRandIndex(df_desc_gaussian[,'celltype'],df_desc_gaussian[,"desc_0.3"])
df_desc_gaussian$res=df_desc_gaussian[,"desc_0.3"]
pdesc_1=getplot(df_desc_gaussian,by.group = "res",ggtitle0  = paste0("DESC","(ARI=",round(ari,3),")"),pt.size = 0.2)+theme(plot.title = element_text(color="red"))+
  scale_color_manual(values=colorRampPalette(col.set.update)(15))+
  coord_cartesian()
pdesc_2=getplot(df_desc_gaussian,by.group = "celltype",ggtitle0 = "Celltype",pt.size = 0.2)+
   scale_color_manual(values=colorRampPalette(color_celltype[1:13])(13))+
  coord_cartesian()
pdesc_3=getplot(df_desc_gaussian,by.group = "tech",ggtitle0 = "tech",pt.size = 0.2)+
  scale_color_manual(values=colorRampPalette(col.set.update[c(2,3,6,8)])(4))+
  coord_cartesian()
x=cbind(df_desc_gaussian[,c(1,2)],"max.prob"=apply(py_to_r(adata$uns[paste0("prob_matrix",resolution_use)]),1,max))
pdesc_4=getfeature.plot(x,pt.size = 0.5,cols.use = c("red","grey"))[[1]]
p0=egg::ggarrange(pdesc_1+theme_use2+theme(plot.margin=unit(c(0,4,0,0),"cm")),
                  pdesc_2+theme_use2,
                  pdesc_3+theme_use,
                  pdesc_4+theme_use_feature,
                  ncol=2,draw = F)
#generate sankey plots for gaussian distribution
s_gaussian=getmy.sankey(factor(df_desc_gaussian$celltype),factor(df_desc_gaussian$res),plot_height=650,fontsize =17)
print(s_gaussian,tag=NULL,file="desc_sankey_gaussian.html")
s_gaussian=getmy.sankey(factor(df_desc_gaussian$celltype),factor(df_desc_gaussian$desc_0.4),plot_height=650,fontsize =17)
print(s_gaussian,tag=NULL,file="desc_sankey_gaussian2.html")
psankey_desc_res0.3=ggdraw()+draw_image(magick::image_read("./DESC_sankey_gaussian_plot.png"),scale=0.9)+
  draw_text("DESC-gaussian(res=0.3)",x=0.5,y=0.95,vjust=0,size=25)
psankey_desc_res0.4=ggdraw()+draw_image(magick::image_read("./DESC_sankey_gaussian_plot2.png"),scale=0.9)+
  draw_text("DESC-gaussian(res=0.4)",x=0.5,y=0.95,vjust=0,size=25)

ps6=ggdraw()+
  draw_plot(p0,x=0.01,y=0.5,width=0.99,height=0.5)+
  draw_plot(psankey_desc_res0.3,x=0,y=0,width=0.35,height=0.45)+
  draw_plot(psankey_desc_res0.4,x=0.375,y=0,width=0.35,height=0.45)+
  draw_plot(p_KL_gaussian,x=0.75,y=0.025,width=0.2,height=0.425)+
  draw_label("a",x=0.01,y=1,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_label("c",x=0.375,y=0.45,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_label("b",x=0.01,y=0.45,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_label("d",x=0.75,y=0.45,hjust=0.5,vjust=1,fontface = "bold",size=30)
#Supplementary Figure 6
ggsave(file.path(fig_path,"Figure.S6.tiff"),ps6,width=16,height = 18,dpi=300,compression="lzw")
ggsave(file.path(fig_path,"Figure.S6.pdf"),ps6,width=16,height = 18,dpi=300)
ps6