Data Summary

This dataset was generated by our group, which can be downloaded from GEO (GSE146974). This dataset was generated from human peripheral blood mononuclear clear cells by Ficoll Separation followed by CD14 and CD16 positive cell selection. Since the CD14 and CD16 antibodies are not 100% specific, some T cells were also present in the scRNA-seq data. We performed clustering analysis using Louvain’s algorithm for each batch and identified 288 T cells in total based on the T cell marker genes CD3D, CD3E and CD3G. Aftering removing these 288 T cells, there are 10,878 cells and 21,289 genes, which was processed and sequenced at three different days, resulting in three batches (3,640 cells in T1, 4,833 cells in T2 and 2,405 cells in T3) left in the remaining analysis. Figs. 7- 8 and Supplementary Figs. 14 and 15 show the results for analysis of these data.

Human monocyte preparation: Monocyte preparation uses a modification of published protocols. Briefly, ~20 ml blood drawn in sodium heparin was processed immediately in the lab in the Clinical Research Center at Columbia University. PBMCs were isolated by gradient Ficoll paque centrifugation, which maintains cell viability and prevents ex vivo activation during cell recovery. Cells were stained with antibodies against human HLADR, CD14 and CD16 and monocyte subsets defined as HLADR+CD14++CD16-(classical), HLADR+CD14++CD16+ (intermediate), HLADR+CD14dim/CD16++ (nonclassical, patrolling monocyte). DAPI staining was used to exclude dead cells. Monocytes were sorted by a BD Influx Sorter into tubes for real-time 10x Genomics analysis.

options(warn=-1) # turn off warning message globally
.libPaths(c("/home/xiaoxiang/R/x86_64-pc-linux-gnu-library/3.5",.libPaths()))
Sys.setenv(RETICULATE_PYTHON_ENV="/home/xiaoxiang/anaconda3/envs/py36")#="/home/xiaoxiang/.conda/envs/DESCVIR"
#RETICULATE_PYTHON="/home/xiaoxiang/anaconda3/bin/python3",
if ("Seurat" %in% loadedNamespaces()) detach("package:Seurat",unload = T)
suppressPackageStartupMessages(library(reticulate))
suppressPackageStartupMessages(library(devtools))
dyn.load("/home/xiaoxiang/R/x86_64-pc-linux-gnu-library/3.5/sf/libs/sf.so")
suppressPackageStartupMessages(library(monocle,lib.loc = "/usr/lib/R/monocle_alpha"))# devtools::install_github("")
#devtools::install_github("cole-trapnell-lab/DDRTree", ref="simple-ppt-like",lib="/usr/lib/R/monocle_alpha")
#devtools::install_github("r-spatial/sf") if 
suppressPackageStartupMessages(library(flexclust))
suppressPackageStartupMessages(library(mcclust))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggjoy))
suppressPackageStartupMessages(library(VGAM))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(kableExtra))
#py_install('umap-learn', pip = T, pip_ignore_installed = T)
#import("louvain")
fig_path="/home/xiaoxiang/Documents/DESC_paper_prepare/DESC_paper_final/formal_revised/figures_sep/"
datadirpath="./"

df_pseudotime_list=list()

get_p_new=function(x) sapply(x,function(i0) ifelse(i0<=0,"<2.2e-16",scales::scientific(i0,digits = 3)))
Stable5=data.frame(Method=c("raw+monocle3","DESC+monocle3","Seurat3.0+monocle3","CCA+monocle3","MNN+monocle3","scVI+monocle3","BERMUDA+monocle3","scanorama+monocle3"),
                  `T1 v.s. T2`=rep("<2.2e-16",length=8),
                  `T1 v.s. T3`=rep("<2.2e-16",length=8),
                  `T2 v.s. T3`=rep("<2.2e-16",length=8),stringsAsFactors = F)
rownames(Stable5)=Stable5$Method
#table5 %>%
#  kable() %>%
#  kable_styling()

# load necessay
source("/media/xiaoxiang/D/DESC_reproducible_file/helpfunc_new.R")
#source("/media/xiaoxiang/D/Upenn_computer_backup/Documents/Human_Heart_Project/heart/Heart_result_updated/helpfunc_new.R")
old=theme_set(theme_bw()+theme(strip.background = element_rect(fill="white"),
                                         panel.background = element_blank(),
                                         panel.grid =element_blank()))

KL_list=list()

resolution_use="0.7"
desc_use=paste0("desc_",resolution_use)
tsne_use=paste0("tsne",resolution_use)
obsm_use=paste0("X_Embeded_z",resolution_use)

#dataset="desc_onlyMPH001_resultdataset_label_128_top1000"
dataset="onlyMPH001new/desc_result"
filename=paste0("/media/xiaoxiang/D/raw_data/DECv.1/monocyte_result/",dataset,"/adata_desc.h5ad")
Sys.setenv(RETICULATE_PYTHON="/usr/bin/python3")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts"))# colum
obj0=Convert_from_anndata_to_seurat(adata,raw.X.slot = "count.data")
obj0=NormalizeData(obj0,display.progress = F)
df_desc=getplotdata(adata,reduction.use = tsne_use)

raw.data=obj0@raw.data
maprules=c("2017_0801"="T1","2017_1017"="T2","2017_1120"="T3")
#obj0@meta.data$dataset_batch=plyr::maply(obj0@meta.data$batch_label,names(maprules),maprules)

Compared methods

Monocle3 using raw data

cell.meta.data=obj0@meta.data
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann=data.frame(gene_short_name = make.unique(rownames(raw.data)),row.names = make.unique(rownames(raw.data)))
pd <- new("AnnotatedDataFrame",data=cell.meta.data)
fd <- new("AnnotatedDataFrame",data=gene_ann)
cds <- newCellDataSet(raw.data, phenoData = pd,featureData =fd,
                          expressionFamily = negbinomial.size(),
                          lowerDetectionLimit=1)

options(DelayedArray.block.size=1000e6)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

## Removing 12 outliers

disp_table = dispersionTable(cds)
disp_table = disp_table %>% mutate(excess_disp =
  (dispersion_empirical - dispersion_fit) / dispersion_fit) %>%
    arrange(plyr::desc(excess_disp))
top_subset_genes = as.character(head(disp_table, 2500)$gene_id)
cds = setOrderingFilter(cds, top_subset_genes)
cds <- preprocessCDS(cds,  method = 'PCA',
                         norm_method = 'log',
                         num_dim = 50,
                         verbose = T)

## Remove noise by PCA ...

df_ori=pData(cds)
ori_dim_use=cds@normalized_data_projection[,1:50]

cds <- reduceDimension(cds, max_components = 2,
                       reduction_method = 'UMAP',
                       metric="correlation",
                       min_dist = 0.75,
                       n_neighbors = 50,
                       verbose = F)
#will give the following 
#Retrieving normalized data ...
#Running Uniform Manifold Approximation and Projection
#Python '/usr/bin/python' was requested but '/home/xiaoxiang/anaconda3/envs/py36/bin/python' was loaded instead (see reticulate::py_config() for more information)
cds <- clusterCells(cds, use_pca=T,
                    method = 'louvain',
                    res = 1e-3,
                    louvain_iter = 1,
                    verbose = F)

KL_list$raw=BatchKL(df_ori,ori_dim_use,n_cells=100,batch="batch_label")

cds <- partitionCells(cds)
cds <- learnGraph(cds,  RGE_method = 'SimplePPT')
#plot_cell_trajectory(cds,color_by = "Cluster")
# a helper function to identify the root principal points:
get_correct_root_state <- function(cds, cell_phenotype, root_type){
  cell_ids <- which(pData(cds)[, cell_phenotype] == root_type)

  closest_vertex <-
    cds@auxOrderingData[[cds@rge_method]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  root_pr_nodes <-
    V(cds@minSpanningTree)$name[as.numeric(names
      (which.max(table(closest_vertex[cell_ids,]))))]

  root_pr_nodes
}
MPP_node_ids = get_correct_root_state(cds,cell_phenotype ='Cluster', "3")
cds <- orderCells(cds, root_pr_nodes = MPP_node_ids)
#plot_cell_trajectory(cds,color_by = "Cluster")

df_den=pData(cds)[,c("Pseudotime","dataset_batch")]
df_den=df_den[!is.infinite(df_den$Pseudotime),]
set.seed(10)

theme_use=theme(legend.text = element_text(size=16),
                legend.title = element_text(size=20))

p_ori_1=plot_cell_trajectory(cds,color_by = "dataset_batch",cell_link_size=0,alpha=0.7,cell_size = 0.2)+
            guides(colour = guide_legend(override.aes = list(alpha=0.7, size=5)))+theme_use
          
p_ori_2=plot_cell_trajectory(cds,show_branch_points=T,cell_link_size=2,alpha=0.8,cell_size = 0.2)+
            theme(legend.position = "top",
              legend.title = element_text(vjust = 0.2),
              legend.text = element_text(angle=-50 ),
                  legend.key.height = unit(0.5,"cm"),
                  legend.key.width = unit(1,"cm"))+
            guides(color = guide_colourbar(label.position = "top"))+theme_use

p_ori_3=ggplot(data=df_den)+geom_density(aes(x=Pseudotime,fill=dataset_batch),alpha=0.7)+
            scale_y_continuous(expand = c(0,0))+
            scale_x_continuous(expand = c(0,0))+
            theme(legend.position="top")+theme_use

p_monocle_ori=egg::ggarrange(p_ori_1,p_ori_2,p_ori_3,ncol=3,draw=F)

# how many cells with no pseudotime
table(is.infinite(pData(cds)[,c("Pseudotime")]))

## 
## FALSE 
## 10878

p_monocle_ori

ggplot(df_den, aes(x=Pseudotime, y=dataset_batch,fill=dataset_batch)) +
         ggjoy::geom_joy(scale=2,alpha=0.3) +
         scale_y_discrete(expand = c(0.001, 0))+
         xlab('') +
         #scale_x_continuous(limits = c(0,max(df_den$Pseudotime)))+
         theme_joy() +
         theme(axis.title.y = element_blank())+theme(legend.position = "none")

## Picking joint bandwidth of 0.0476

tt1=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt1

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.83192, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt2=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T2"])
tt2

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T2"]
## D = 0.5303, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt3=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T2"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt3

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T2"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.91617, p-value < 2.2e-16
## alternative hypothesis: two-sided

Stable5[1,2:4]=matrix(get_p_new(c(tt1$p.value,tt2$p.value,tt3$p.value)),1,3)

p_feature_ori=plot_cell_clusters(cds,markers =c("FCGR3A","S100A8"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Loading required package: ggrastr

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

#p2=plot_cell_clusters(cds,markers =c("CD14","CD163"),cell_size = 0.8,alpha=1)+
#  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_ori_monocle3_feature.tiff",p1,width=12,height = 5,dpi=300,compression="lzw")
#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_ori_add_monocle3_feature.tiff",p2,width=12,height = 5,dpi=300,compression="lzw")

cds_exprs <-  as.matrix(exprs(cds[c("FCGR3A","S100A8")]))[1:2,]
df0=data.frame(cbind(pseudotime=pData(cds)$Pseudotime,log1p(t(cds_exprs)/sizeFactors(cds))))
df0$BatchID=pData(cds)$dataset_batch
df0=df0[is.finite(df0$pseudotime),]
df0=df0[order(df0$pseudotime,decreasing = F),,drop=F]
df0$x=df0$pseudotime/max(df0$pseudotime)
df_pseudotime_list$raw=df0

Monocle using representation of DESC

cell.meta.data=obj0@meta.data
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann=data.frame(gene_short_name = make.unique(rownames(raw.data)),row.names = make.unique(rownames(raw.data)))
pd <- new("AnnotatedDataFrame",data=cell.meta.data)
fd <- new("AnnotatedDataFrame",data=gene_ann)
cds <- newCellDataSet(raw.data, phenoData = pd,featureData =fd,
                          expressionFamily = negbinomial.size(),
                          lowerDetectionLimit=1)

options(DelayedArray.block.size=1000e6)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

## Removing 12 outliers

disp_table = dispersionTable(cds)
disp_table = disp_table %>% mutate(excess_disp =
  (dispersion_empirical - dispersion_fit) / dispersion_fit) %>%
    arrange(plyr::desc(excess_disp))
top_subset_genes = as.character(head(disp_table, 2500)$gene_id)
cds = setOrderingFilter(cds, top_subset_genes)
#cds <- preprocessCDS(cds,  method = 'PCA',
#                         norm_method = 'log',
#                         num_dim = 32,
#                         verbose = T)
desc_dim_use=as.matrix(obj0@dr$Embeded_z0.7@cell.embeddings)
colnames(desc_dim_use)=paste0("PC_",1:ncol(desc_dim_use))
rownames(desc_dim_use)=df_desc$cellname
cds@normalized_data_projection=desc_dim_use[match(cell.meta.data$cellname,rownames(desc_dim_use)),]
#replcae PCA
cds <- reduceDimension(cds, max_components = 2,
                       reduction_method = 'UMAP',
                       metric="correlation",
                       min_dist = 0.75,
                       n_neighbors = 50,
                       verbose = F)
cds <- clusterCells(cds, use_pca=T,
                    method = 'louvain',
                    res = 1e-3,
                    louvain_iter = 1,
                    verbose = F)
#plot_cell_clusters(cds,markers = c("S100A8","FCGR3A"),show_group_id = T, cell_size = 0.1)

KL_list$DESC=BatchKL(df_desc,desc_dim_use,n_cells=100,batch="batch_label")

cds <- partitionCells(cds)
cds <- learnGraph(cds,  RGE_method = 'SimplePPT')
#plot_cell_trajectory(cds,color_by = "Cluster")
# a helper function to identify the root principal points:
get_correct_root_state <- function(cds, cell_phenotype, root_type){
  cell_ids <- which(pData(cds)[, cell_phenotype] == root_type)

  closest_vertex <-
    cds@auxOrderingData[[cds@rge_method]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  root_pr_nodes <-
    V(cds@minSpanningTree)$name[as.numeric(names
      (which.max(table(closest_vertex[cell_ids,]))))]

  root_pr_nodes
}
MPP_node_ids = get_correct_root_state(cds,cell_phenotype ='Cluster', "1")
cds <- orderCells(cds, root_pr_nodes = MPP_node_ids)
#plot_cell_trajectory(cds,color_by = "Cluster")

df_den=pData(cds)[,c("Pseudotime","dataset_batch")]
df_den=df_den[!is.infinite(df_den$Pseudotime),]
set.seed(10)
theme_use=theme(legend.text = element_text(size=16),
                legend.title = element_text(size=20))

p_desc_1=plot_cell_trajectory(cds,color_by = "dataset_batch",cell_link_size=0,alpha=0.7,cell_size = 0.2)+
            guides(colour = guide_legend(override.aes = list(alpha=0.7, size=5)))+theme_use
  
p_desc_2=plot_cell_trajectory(cds,show_branch_points=T,cell_link_size=2,alpha=0.8,cell_size = 0.2)+
            theme(legend.position = "top",
              legend.title = element_text(vjust = 0.2),
              legend.text = element_text(angle=-50 ),
                  legend.key.height = unit(0.5,"cm"),
                  legend.key.width = unit(1,"cm"))+
            guides(color = guide_colourbar(label.position = "top"))+theme_use
  
p_desc_3=ggplot(data=df_den)+geom_density(aes(x=Pseudotime,fill=dataset_batch),alpha=0.7)+
            scale_y_continuous(expand = c(0,0))+
            scale_x_continuous(expand = c(0,0))+
            theme(legend.position="top")+theme_use
p_monocle_desc=egg::ggarrange(p_desc_1,p_desc_2,p_desc_3,ncol=3,draw=F)

table(is.infinite(pData(cds)[,c("Pseudotime")]))

## 
## FALSE 
## 10878

p_monocle_desc

#tiff("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_main/monocyte_monocle_desc.tiff",width=1800*0.8,height=0.8*500)
#p_monocle_desc
#dev.off()

uniquebatch=sort(unique(as.character(pData(cds)$dataset_batch)))
plot_grid(plotlist = lapply(uniquebatch,function(x){
  return(plot_cell_trajectory(cds[,pData(cds)$dataset_batch==x],show_branch_points=T,cell_link_size=2,alpha=0.8,cell_size = 0.2)+ggtitle(x)+
            theme(legend.text = element_text(angle=-90,hjust=1.2,vjust=0),
                  plot.title = element_text(vjust=-15,hjust=0.9,color="darkblue",face="bold"),
                  legend.key.height = unit(0.5,"cm")))}),ncol=3)

tt1=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt1

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.054188, p-value = 0.0004051
## alternative hypothesis: two-sided

tt2=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T2"])
tt2

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T2"]
## D = 0.067203, p-value = 1.433e-08
## alternative hypothesis: two-sided

tt3=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T2"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt3

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T2"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.037531, p-value = 0.02169
## alternative hypothesis: two-sided

Stable5[2,2:4]=matrix(get_p_new(c(tt1$p.value,tt2$p.value,tt3$p.value)),1,3)

p_feature_desc=plot_cell_clusters(cds,markers =c("FCGR3A","S100A8"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

p2=plot_cell_clusters(cds,markers =c("CD14","CD163"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_desc_monocle3_feature.tiff",p1,width=12,height = 5,dpi=200)
#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_desc_add_monocle3_feature.tiff",p2,width=12,height = 5,dpi=200)

cds_exprs <-  as.matrix(exprs(cds[c("FCGR3A","S100A8")]))[1:2,]
df0=data.frame(cbind(pseudotime=pData(cds)$Pseudotime,log1p(t(cds_exprs)/sizeFactors(cds))))
df0$BatchID=pData(cds)$dataset_batch
df0=df0[is.finite(df0$pseudotime),]
df0=df0[order(df0$pseudotime,decreasing = F),,drop=F]
df0$x=df0$pseudotime/max(df0$pseudotime)
df_pseudotime_list$desc=df0

Monocle using representation of Seurat3.0

batch_id="dataset_batch"
if ("Seurat" %in% loadedNamespaces()) detach("package:Seurat",unload = T)
suppressPackageStartupMessages(library("Seurat",lib.loc = "/usr/lib/R/self_library/"))
filename=paste0(datadirpath,"/cca3.0_result/","obj_cca3.0_",batch_id,"_2000.rds")
obj=readRDS(filename)
df_cca3.0=cbind(obj[["tsne"]]@cell.embeddings[,c(1,2)],obj@meta.data)
#cca3.0_dim_use=t(obj@assays$integrated@data)
cca3.0_dim_use=obj@reductions$pca@cell.embeddings
raw.data.3.0=obj@assays$RNA@counts
#cca3.0_dim_use=t(obj@assays$integrated@data)
#df0$cca3.0$df_cca3.0=df_cca3.0
#df0$cca3.0$dr_cca3.0=obj[["pca"]]@cell.embeddings[,1:20]
detach("package:Seurat",unload = T)

KL_list$Seurat3.0=BatchKL(df_cca3.0,cca3.0_dim_use,n_cells=100,batch="batch_label")

cell.meta.data=obj@meta.data
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann=data.frame(gene_short_name = make.unique(rownames(raw.data.3.0)),row.names = make.unique(rownames(raw.data.3.0)))
                      #make.unique(rownames(obj@assays$RNA@counts)))
pd <- new("AnnotatedDataFrame",data=cell.meta.data)
fd <- new("AnnotatedDataFrame",data=gene_ann)
cds <- newCellDataSet(raw.data.3.0, phenoData = pd,featureData =fd,
                          expressionFamily = negbinomial.size(),
                          lowerDetectionLimit=1)
options(DelayedArray.block.size=1000e6)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

## Removing 12 outliers

disp_table = dispersionTable(cds)
disp_table = disp_table %>% mutate(excess_disp =
  (dispersion_empirical - dispersion_fit) / dispersion_fit) %>%
    arrange(plyr::desc(excess_disp))
top_subset_genes = as.character(head(disp_table, 2500)$gene_id)
cds = setOrderingFilter(cds, top_subset_genes)
#cds <- preprocessCDS(cds,  method = 'PCA',
#                         norm_method = 'log',
#                         num_dim = ncol(cca3.0_dim_use),
#                         verbose = T)
colnames(cca3.0_dim_use)=paste0("PC_",1:ncol(cca3.0_dim_use))

rownames(cca3.0_dim_use)=df_cca3.0$cellname
cds@normalized_data_projection=cca3.0_dim_use[match(cell.meta.data$cellname,rownames(cca3.0_dim_use)),]


cds@normalized_data_projection=cca3.0_dim_use[,1:20] #only top 20 PCs
#replcae PCA
cds <- reduceDimension(cds, max_components = 2,
                       reduction_method = 'UMAP',
                       metric="correlation",
                       min_dist = 0.75,
                       n_neighbors = 50,
                       verbose = F)
cds <- clusterCells(cds, use_pca=T,
                    method = 'louvain',
                    res = 3e-3,
                    louvain_iter = 1,
                    verbose = F)
#plot_cell_clusters(cds,markers = c("S100A8","FCGR3A"),show_group_id = T, cell_size = 0.1)

cds <- partitionCells(cds)
cds <- learnGraph(cds,  RGE_method = 'SimplePPT')
#plot_cell_trajectory(cds,color_by = "Cluster")
# a helper function to identify the root principal points:
get_correct_root_state <- function(cds, cell_phenotype, root_type){
  cell_ids <- which(pData(cds)[, cell_phenotype] == root_type)

  closest_vertex <-
    cds@auxOrderingData[[cds@rge_method]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  root_pr_nodes <-
    V(cds@minSpanningTree)$name[as.numeric(names
      (which.max(table(closest_vertex[cell_ids,]))))]

  root_pr_nodes
}
MPP_node_ids = get_correct_root_state(cds,cell_phenotype ='Cluster', "5")
cds <- orderCells(cds, root_pr_nodes = MPP_node_ids)
#plot_cell_trajectory(cds)
#plot_cell_trajectory(cds,color_by = "Cluster")

df_den=pData(cds)[,c("Pseudotime","dataset_batch")]
df_den=df_den[!is.infinite(df_den$Pseudotime),]
set.seed(10)
theme_use=theme(legend.text = element_text(size=16),
                legend.title = element_text(size=20))

p_cca3.0_1=plot_cell_trajectory(cds,color_by = "dataset_batch",cell_link_size=0,alpha=0.7,cell_size = 0.2)+
            guides(colour = guide_legend(override.aes = list(alpha=0.7, size=5)))+theme_use
p_cca3.0_2=plot_cell_trajectory(cds,show_branch_points=T,cell_link_size=2,alpha=0.8,cell_size = 0.2)+
            theme(legend.position = "top",
              legend.title = element_text(vjust = 0.2),
              legend.text = element_text(angle=-50 ),
                  legend.key.height = unit(0.5,"cm"),
                  legend.key.width = unit(1,"cm"))+
            guides(color = guide_colourbar(label.position = "top"))+theme_use
p_cca3.0_3=ggplot(data=df_den)+geom_density(aes(x=Pseudotime,fill=dataset_batch),alpha=0.7)+
            scale_y_continuous(expand = c(0,0))+
            scale_x_continuous(expand = c(0,0))+
            theme(legend.position="top")+theme_use

p_monocle_cca3.0=egg::ggarrange(p_cca3.0_1,p_cca3.0_2,p_cca3.0_3,ncol=3,draw=F)

table(is.infinite(pData(cds)[,c("Pseudotime")]))

## 
## FALSE 
## 10878

p_monocle_cca3.0

#tiff("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_main/monocyte_monocle_cca3.0.tiff",width=1800*0.8,height=0.8*500)
#p_monocle_cca3.0
#dev.off()

tt1=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt1

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.16822, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt2=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T2"])
tt2

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T2"]
## D = 0.16158, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt3=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T2"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt3

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T2"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.078667, p-value = 4.666e-09
## alternative hypothesis: two-sided

Stable5[3,2:4]=matrix(get_p_new(c(tt1$p.value,tt2$p.value,tt3$p.value)),1,3)

p_feature_cca3.0=plot_cell_clusters(cds,markers =c("FCGR3A","S100A8"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

p2=plot_cell_clusters(cds,markers =c("CD14","CD163"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_Seurat3.0_monocle3_feature.tiff",p1,width=12,height = 5,dpi=200)
#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_Seurat3.0_add_monocle3_feature.tiff",p2,width=12,height = 5,dpi=200)

cds_exprs <-  as.matrix(exprs(cds[c("FCGR3A","S100A8")]))[1:2,]
df0=data.frame(cbind(pseudotime=pData(cds)$Pseudotime,log1p(t(cds_exprs)/sizeFactors(cds))))
df0$BatchID=pData(cds)$dataset_batch
df0=df0[is.finite(df0$pseudotime),]
df0=df0[order(df0$pseudotime,decreasing = F),,drop=F]
df0$x=df0$pseudotime/max(df0$pseudotime)
df_pseudotime_list$Seurat3.0=df0

if ("Seurat" %in% loadedNamespaces()) detach("package:Seurat",unload = T)

Monocle using representation of CCA

batch_id="dataset_batch"
suppressPackageStartupMessages(library(Seurat))
filename=paste0(datadirpath,"/cca_result/","obj_cca_",batch_id,"_2000_False.rds")
obj=readRDS(filename)
df_cca=cbind(obj@dr$tsne@cell.embeddings[,c(1,2)],obj@meta.data)
#cca3.0_dim_use=t(obj@assays$integrated@data)
cca_dim_use=obj@dr$cca.aligned@cell.embeddings

KL_list$CCA=BatchKL(df_cca,cca_dim_use,n_cells=100,batch="batch_label")

cell.meta.data=obj@meta.data
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann=data.frame(gene_short_name = make.unique(rownames(raw.data)),row.names = make.unique(rownames(raw.data)))
pd <- new("AnnotatedDataFrame",data=cell.meta.data)
fd <- new("AnnotatedDataFrame",data=gene_ann)
cds <- newCellDataSet(raw.data, 
                      phenoData = pd,
                      featureData =fd,
                          expressionFamily = negbinomial.size(),
                          lowerDetectionLimit=1)
options(DelayedArray.block.size=1000e6)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

## Removing 12 outliers

disp_table = dispersionTable(cds)
disp_table = disp_table %>% mutate(excess_disp =
  (dispersion_empirical - dispersion_fit) / dispersion_fit) %>%
    arrange(plyr::desc(excess_disp))
top_subset_genes = as.character(head(disp_table, 2500)$gene_id)
cds = setOrderingFilter(cds, top_subset_genes)
#cds <- preprocessCDS(cds,  method = 'PCA',
#                         norm_method = 'log',
#                         num_dim = ncol(cca3.0_dim_use),
#                         verbose = T)
colnames(cca_dim_use)=paste0("PC_",1:ncol(cca_dim_use))
rownames(cca_dim_use)=df_cca$cellname
cds@normalized_data_projection=cca_dim_use[match(cell.meta.data$cellname,rownames(cca_dim_use)),]

#replcae PCA
cds <- reduceDimension(cds, max_components = 2,
                       reduction_method = 'UMAP',
                       metric="correlation",
                       min_dist = 0.75,
                       n_neighbors = 50,
                       verbose = F)
cds <- clusterCells(cds, use_pca=T,
                    method = 'louvain',
                    res = 3.5e-3,
                    louvain_iter = 1,
                    verbose = F)
#plot_cell_clusters(cds,markers = c("S100A8","FCGR3A"),show_group_id = T, cell_size = 0.1)

cds <- partitionCells(cds)
cds <- learnGraph(cds,  RGE_method = 'SimplePPT')
#plot_cell_trajectory(cds,color_by = "Cluster")
# a helper function to identify the root principal points:
get_correct_root_state <- function(cds, cell_phenotype, root_type){
  cell_ids <- which(pData(cds)[, cell_phenotype] == root_type)

  closest_vertex <-
    cds@auxOrderingData[[cds@rge_method]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  root_pr_nodes <-
    V(cds@minSpanningTree)$name[as.numeric(names
      (which.max(table(closest_vertex[cell_ids,]))))]

  root_pr_nodes
}
MPP_node_ids = get_correct_root_state(cds,cell_phenotype ='Cluster', "3")
cds <- orderCells(cds, root_pr_nodes = MPP_node_ids)
#plot_cell_trajectory(cds)
#plot_cell_trajectory(cds,color_by = "Cluster")

df_den=pData(cds)[,c("Pseudotime","dataset_batch")]
df_den=df_den[!is.infinite(df_den$Pseudotime),]
set.seed(10)
theme_use=theme(legend.text = element_text(size=16),
                legend.title = element_text(size=20))

p_cca_1=plot_cell_trajectory(cds,color_by = "dataset_batch",cell_link_size=0,alpha=0.7,cell_size = 0.2)+
            guides(colour = guide_legend(override.aes = list(alpha=0.7, size=5)))+theme_use
p_cca_2=plot_cell_trajectory(cds,show_branch_points=T,cell_link_size=2,alpha=0.8,cell_size = 0.2)+
            theme(legend.position = "top",
              legend.title = element_text(vjust = 0.2),
              legend.text = element_text(angle=-50 ),
                  legend.key.height = unit(0.5,"cm"),
                  legend.key.width = unit(1,"cm"))+
            guides(color = guide_colourbar(label.position = "top"))+theme_use
p_cca_3=ggplot(data=df_den)+geom_density(aes(x=Pseudotime,fill=dataset_batch),alpha=0.7)+
            scale_y_continuous(expand = c(0,0))+
            scale_x_continuous(expand = c(0,0))+
            theme(legend.position="top")+theme_use
p_monocle_cca=egg::ggarrange(p_cca_1,p_cca_2,p_cca_3,ncol=3,draw = F)

table(is.infinite(pData(cds)[,c("Pseudotime")]))

## 
## FALSE 
## 10878

p_monocle_cca

tt1=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt1

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.15916, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt2=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T2"])
tt2

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T2"]
## D = 0.099524, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt3=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T2"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt3

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T2"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.16291, p-value < 2.2e-16
## alternative hypothesis: two-sided

Stable5[4,2:4]=matrix(get_p_new(c(tt1$p.value,tt2$p.value,tt3$p.value)),1,3)

p_feature_cca=plot_cell_clusters(cds,markers =c("FCGR3A","S100A8"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

p2=plot_cell_clusters(cds,markers =c("CD14","CD163"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_CCA_monocle3_feature.tiff",p1,width=12,height = 5,dpi=200)
#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_CCA_add_monocle3_feature.tiff",p2,width=12,height = 5,dpi=200)

cds_exprs <-  as.matrix(exprs(cds[c("FCGR3A","S100A8")]))[1:2,]
df0=data.frame(cbind(pseudotime=pData(cds)$Pseudotime,log1p(t(cds_exprs)/sizeFactors(cds))))
df0$BatchID=pData(cds)$dataset_batch
df0=df0[is.finite(df0$pseudotime),]
df0=df0[order(df0$pseudotime,decreasing = F),,drop=F]
df0$x=df0$pseudotime/max(df0$pseudotime)
df_pseudotime_list$CCA=df0

if ("Seurat" %in% loadedNamespaces()) detach("package:Seurat",unload = T)

Monocle using representation of MNN

batch_id="dataset_batch"
filename=paste0(datadirpath,"/mnn_result/","adata_mnn_",batch_id,"_2000_False.h5ad")
suppressPackageStartupMessages(library(Seurat))
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
#print(mclust::adjustedRandIndex(df_cca$cluster,df_cca$integrated_snn_res.0.2))
mnn_dim_use=py_to_r(adata$obsm["X_pca"])[,c(1:20)]
df_mnn=py_to_r(adata$obs)
id0=match(rownames(df_desc),paste0(df_mnn$cellname,"-",df_mnn$dataset_batch))
mnn_dim_use=mnn_dim_use[id0,]
rownames(mnn_dim_use)=rownames(df_desc)# 因为是match之后的

KL_list$MNN=BatchKL(df_mnn,mnn_dim_use,n_cells=100,batch="batch_label")

cell.meta.data=obj0@meta.data
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann=data.frame(gene_short_name = make.unique(rownames(raw.data)),row.names = make.unique(rownames(raw.data)))
pd <- new("AnnotatedDataFrame",data=cell.meta.data)
fd <- new("AnnotatedDataFrame",data=gene_ann)
cds <- newCellDataSet(raw.data, phenoData = pd,featureData =fd,
                          expressionFamily = negbinomial.size(),
                          lowerDetectionLimit=1)
options(DelayedArray.block.size=1000e6)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

## Removing 12 outliers

disp_table = dispersionTable(cds)
disp_table = disp_table %>% mutate(excess_disp =
  (dispersion_empirical - dispersion_fit) / dispersion_fit) %>%
    arrange(plyr::desc(excess_disp))
top_subset_genes = as.character(head(disp_table, 2500)$gene_id)
cds = setOrderingFilter(cds, top_subset_genes)
#cds <- preprocessCDS(cds,  method = 'PCA',
#                         norm_method = 'log',
#                         num_dim = ncol(cca3.0_dim_use),
#                         verbose = T)
colnames(mnn_dim_use)=paste0("PC_",1:ncol(mnn_dim_use))
cds@normalized_data_projection=mnn_dim_use[match(cell.meta.data$cellname,rownames(mnn_dim_use)),]

#replcae PCA
cds <- reduceDimension(cds, max_components = 2,
                       reduction_method = 'UMAP',
                       metric="correlation",
                       min_dist = 0.75,
                       n_neighbors = 50,
                       verbose = F)
cds <- clusterCells(cds, use_pca=T,
                    method = 'louvain',
                    res = 3e-3,
                    louvain_iter = 1,
                    verbose = F)
#plot_cell_clusters(cds,markers = c("S100A8","FCGR3A"),show_group_id = T, cell_size = 0.1)

cds <- partitionCells(cds)
cds <- learnGraph(cds,  RGE_method = 'SimplePPT')
#plot_cell_trajectory(cds,color_by = "Cluster")
# a helper function to identify the root principal points:
get_correct_root_state <- function(cds, cell_phenotype, root_type){
  cell_ids <- which(pData(cds)[, cell_phenotype] == root_type)
  closest_vertex <-
    cds@auxOrderingData[[cds@rge_method]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  root_pr_nodes <-
    V(cds@minSpanningTree)$name[as.numeric(names
      (which.max(table(closest_vertex[cell_ids,]))))]

  root_pr_nodes
}
MPP_node_ids = get_correct_root_state(cds,cell_phenotype ='Cluster', "4")
cds <- orderCells(cds, root_pr_nodes = MPP_node_ids)
#plot_cell_trajectory(cds)
#plot_cell_trajectory(cds,color_by = "Cluster")

if ("Seurat" %in% loadedNamespaces()) detach("package:Seurat",unload = T)

df_den=pData(cds)[,c("Pseudotime","dataset_batch")]
df_den=df_den[!is.infinite(df_den$Pseudotime),]
set.seed(10)
theme_use=theme(legend.text = element_text(size=16),
                legend.title = element_text(size=20))

p_mnn_1=plot_cell_trajectory(cds,color_by = "dataset_batch",cell_link_size=0,alpha=0.7,cell_size = 0.2)+
            guides(colour = guide_legend(override.aes = list(alpha=0.7, size=5)))+theme_use

p_mnn_2=plot_cell_trajectory(cds,show_branch_points=T,cell_link_size=2,alpha=0.8,cell_size = 0.2)+
            theme(legend.position = "top",
              legend.title = element_text(vjust = 0.2),
              legend.text = element_text(angle=-50 ),
                  legend.key.height = unit(0.5,"cm"),
                  legend.key.width = unit(1,"cm"))+
            guides(color = guide_colourbar(label.position = "top"))+theme_use

p_mnn_3=ggplot(data=df_den)+geom_density(aes(x=Pseudotime,fill=dataset_batch),alpha=0.7)+
            scale_y_continuous(expand = c(0,0))+
            scale_x_continuous(expand = c(0,0))+
            theme(legend.position="top")+theme_use
p_monocle_mnn=egg::ggarrange(p_mnn_1,p_mnn_2,p_mnn_3,ncol=3,draw=F)

table(is.infinite(pData(cds)[,c("Pseudotime")]))

## 
## FALSE 
## 10878

p_monocle_mnn

#tiff("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_main/monocyte_monocle_mnn.tiff",width=1800*0.8,height=0.8*500)
#p_monocle_mnn
#dev.off()

tt1=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt1

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.10442, p-value = 3.864e-14
## alternative hypothesis: two-sided

tt2=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T2"])
tt2

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T2"]
## D = 0.38753, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt3=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T2"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt3

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T2"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.41069, p-value < 2.2e-16
## alternative hypothesis: two-sided

Stable5[5,2:4]=matrix(get_p_new(c(tt1$p.value,tt2$p.value,tt3$p.value)),1,3)

#tiff("~/Documents/DESC_paper_prepare/DESC_paper_v2/figures/figure_supp/monocyte_MNN_monocle3_feature.tiff",width=1200*0.65,height=0.65*500)
#plot_grid(plot_cell_clusters(cds,markers = "FCGR3A",cell_size = 0.8)+theme_use,
 #         plot_cell_clusters(cds,markers = "S100A8",cell_size = 0.8)+theme_use,ncol=2)
#dev.off()
p_feature_mnn=plot_cell_clusters(cds,markers =c("FCGR3A","S100A8"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

p2=plot_cell_clusters(cds,markers =c("CD14","CD163"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_MNN_monocle3_feature.tiff",p1,width=12,height = 5,dpi=200)
#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_MNN_add_monocle3_feature.tiff",p2,width=12,height = 5,dpi=200)

cds_exprs <-  as.matrix(exprs(cds[c("FCGR3A","S100A8")]))[1:2,]
df0=data.frame(cbind(pseudotime=pData(cds)$Pseudotime,log1p(t(cds_exprs)/sizeFactors(cds))))
df0$BatchID=pData(cds)$dataset_batch
df0=df0[is.finite(df0$pseudotime),]
df0=df0[order(df0$pseudotime,decreasing = F),,drop=F]
df0$x=df0$pseudotime/max(df0$pseudotime)
df_pseudotime_list$MNN=df0

Monocle using representation of scVI

batch_id="dataset_batch"
filename=paste0(datadirpath,"/scvi_result/","adata_scvi_",batch_id,"_2000_False.h5ad")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
df_scvi=getplotdata(adata,reduction.use = "tsne")
#print(mclust::adjustedRandIndex(df_cca$cluster,df_cca$integrated_snn_res.0.2))
scvi_dim_use=py_to_r(adata$X)

KL_list$scVI=BatchKL(df_scvi,scvi_dim_use,n_cells=100,batch="batch_label")

cell.meta.data=obj0@meta.data
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann=data.frame(gene_short_name = make.unique(rownames(raw.data)),row.names = make.unique(rownames(raw.data)))
pd <- new("AnnotatedDataFrame",data=cell.meta.data)
fd <- new("AnnotatedDataFrame",data=gene_ann)
cds <- newCellDataSet(obj0@raw.data, phenoData = pd,featureData =fd,
                          expressionFamily = negbinomial.size(),
                          lowerDetectionLimit=1)
options(DelayedArray.block.size=1000e6)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

## Removing 12 outliers

disp_table = dispersionTable(cds)
disp_table = disp_table %>% mutate(excess_disp =
  (dispersion_empirical - dispersion_fit) / dispersion_fit) %>%
    arrange(plyr::desc(excess_disp))
top_subset_genes = as.character(head(disp_table, 2500)$gene_id)
cds = setOrderingFilter(cds, top_subset_genes)
#cds <- preprocessCDS(cds,  method = 'PCA',
#                         norm_method = 'log',
#                         num_dim = 20,
#                         verbose = T)
colnames(scvi_dim_use)=paste0("PC_",1:ncol(scvi_dim_use))
rownames(scvi_dim_use)=df_scvi$cellname
cds@normalized_data_projection=scvi_dim_use[match(cell.meta.data$cellname,rownames(scvi_dim_use)),]

#replcae PCA
cds <- reduceDimension(cds, max_components = 2,
                       reduction_method = 'UMAP',
                       metric="correlation",
                       min_dist = 0.75,
                       n_neighbors = 50,
                       verbose = F)
cds <- clusterCells(cds, use_pca=T,
                    method = 'louvain',
                    res = 3e-3,
                    louvain_iter = 1,
                    verbose = F)
#plot_cell_clusters(cds,markers = c("S100A8","FCGR3A"),show_group_id = T, cell_size = 0.1)

cds <- partitionCells(cds)
cds <- learnGraph(cds,  RGE_method = 'SimplePPT')
get_correct_root_state <- function(cds, cell_phenotype, root_type){
  cell_ids <- which(pData(cds)[, cell_phenotype] == root_type)

  closest_vertex <-
    cds@auxOrderingData[[cds@rge_method]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  root_pr_nodes <-
    V(cds@minSpanningTree)$name[as.numeric(names
      (which.max(table(closest_vertex[cell_ids,]))))]

  root_pr_nodes
}
MPP_node_ids = get_correct_root_state(cds,cell_phenotype ='Cluster', "5")
cds <- orderCells(cds, root_pr_nodes = MPP_node_ids)
#plot_cell_trajectory(cds)
#plot_cell_trajectory(cds,color_by = "Cluster")

df_den=pData(cds)[,c("Pseudotime","dataset_batch")]
df_den=df_den[!is.infinite(df_den$Pseudotime),]
set.seed(10)
theme_use=theme(legend.text = element_text(size=16),
                legend.title = element_text(size=20))

p_scvi_1=plot_cell_trajectory(cds,color_by = "dataset_batch",cell_link_size=0,alpha=0.7,cell_size = 0.2)+
            guides(colour = guide_legend(override.aes = list(alpha=0.7, size=5)))+theme_use

p_scvi_2=plot_cell_trajectory(cds,show_branch_points=T,cell_link_size=2,alpha=0.8,cell_size = 0.2)+
            theme(legend.position = "top",
              legend.title = element_text(vjust = 0.2),
              legend.text = element_text(angle=-50 ),
                  legend.key.height = unit(0.5,"cm"),
                  legend.key.width = unit(1,"cm"))+
            guides(color = guide_colourbar(label.position = "top"))+theme_use

p_scvi_3=ggplot(data=df_den)+geom_density(aes(x=Pseudotime,fill=dataset_batch),alpha=0.7)+
            scale_y_continuous(expand = c(0,0))+
            scale_x_continuous(expand = c(0,0))+
            theme(legend.position="top")+theme_use
p_monocle_scvi=egg::ggarrange(p_scvi_1,p_scvi_2,p_scvi_3,ncol=3,draw=F)

table(is.infinite(pData(cds)[,c("Pseudotime")]))

## 
## FALSE 
## 10878

p_monocle_scvi

#tiff("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_main/monocyte_monocle_scvi.tiff",width=1800*0.8,height=0.8*500)
#p_monocle_scvi
#dev.off()

tt1=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt1

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.1548, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt2=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T2"])
tt2

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T2"]
## D = 0.29338, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt3=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T2"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt3

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T2"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.17777, p-value < 2.2e-16
## alternative hypothesis: two-sided

Stable5[6,2:4]=matrix(get_p_new(c(tt1$p.value,tt2$p.value,tt3$p.value)),1,3)

p_feature_scvi=plot_cell_clusters(cds,markers =c("FCGR3A","S100A8"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

p2=plot_cell_clusters(cds,markers =c("CD14","CD163"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_scVI_monocle3_feature.tiff",p1,width=12,height = 5,dpi=200)
#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_scVI_add_monocle3_feature.tiff",p2,width=12,height = 5,dpi=200)

cds_exprs <-  as.matrix(exprs(cds[c("FCGR3A","S100A8")]))[1:2,]
df0=data.frame(cbind(pseudotime=pData(cds)$Pseudotime,log1p(t(cds_exprs)/sizeFactors(cds))))
df0$BatchID=pData(cds)$dataset_batch
df0=df0[is.finite(df0$pseudotime),]
df0=df0[order(df0$pseudotime,decreasing = F),,drop=F]
df0$x=df0$pseudotime/max(df0$pseudotime)
df_pseudotime_list$scVI=df0

BERMUDA0.90

filename=paste0(datadirpath,"/BERMUDA_result/adata_bermuda0.9_dataset_batch.h5ad")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
df_bermuda=getplotdata(adata,reduction.use = "tsne")%>%left_join(df_desc[,c("cellname","batch_label","dataset_batch","status_label")],by="cellname")
#print(mclust::adjustedRandIndex(df_cca$cluster,df_cca$integrated_snn_res.0.2))
bermuda_dim_use=py_to_r(adata$obsm[["X_bermuda"]])

KL_list$BERMUDA0.90=BatchKL(df_bermuda,bermuda_dim_use,n_cells=100,batch="batch_label")

cell.meta.data=obj0@meta.data
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann=data.frame(gene_short_name = make.unique(rownames(obj0@raw.data)),row.names = make.unique(rownames(obj0@raw.data)))
pd <- new("AnnotatedDataFrame",data=cell.meta.data)
fd <- new("AnnotatedDataFrame",data=gene_ann)
cds <- newCellDataSet(obj0@raw.data, phenoData = pd,featureData =fd,
                      expressionFamily = negbinomial.size(),
                      lowerDetectionLimit=1)
options(DelayedArray.block.size=1000e6)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

## Removing 12 outliers

disp_table = dispersionTable(cds)
disp_table = disp_table %>% mutate(excess_disp =
                                     (dispersion_empirical - dispersion_fit) / dispersion_fit) %>%
  arrange(plyr::desc(excess_disp))
top_subset_genes = as.character(head(disp_table, 2500)$gene_id)
cds = setOrderingFilter(cds, top_subset_genes)
#cds <- preprocessCDS(cds,  method = 'PCA',
#                         norm_method = 'log',
#                         num_dim = 20,
#                         verbose = T)
colnames(bermuda_dim_use)=paste0("PC_",1:ncol(bermuda_dim_use))
rownames(bermuda_dim_use)=df_bermuda$cellname
cds@normalized_data_projection=bermuda_dim_use[match(cell.meta.data$cellname,rownames(bermuda_dim_use)),]
#replcae PCA
cds <- reduceDimension(cds, max_components = 2,
                       reduction_method = 'UMAP',
                       metric="correlation",
                       min_dist = 0.75,
                       n_neighbors = 50,
                       verbose = F)
cds <- clusterCells(cds, use_pca=T,
                    method = 'louvain',
                    res = 3e-3,
                    louvain_iter = 1,
                    verbose = F)
#plot_cell_clusters(cds,markers = c("S100A8","FCGR3A"),show_group_id = T, cell_size = 0.1)

cds <- partitionCells(cds)
cds <- learnGraph(cds,  RGE_method = 'SimplePPT')
#plot_cell_trajectory(cds,color_by = "Cluster")
# a helper function to identify the root principal points:
get_correct_root_state <- function(cds, cell_phenotype, root_type){
  cell_ids <- which(pData(cds)[, cell_phenotype] == root_type)
  
  closest_vertex <-
    cds@auxOrderingData[[cds@rge_method]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  root_pr_nodes <-
    V(cds@minSpanningTree)$name[as.numeric(names
                                           (which.max(table(closest_vertex[cell_ids,]))))]
  
  root_pr_nodes
}
MPP_node_ids = get_correct_root_state(cds,cell_phenotype ='Cluster', "4")
cds <- orderCells(cds, root_pr_nodes = MPP_node_ids)
#plot_cell_trajectory(cds,color_by = "Cluster")

df_den=pData(cds)[,c("Pseudotime","dataset_batch")]
df_den=df_den[!is.infinite(df_den$Pseudotime),]
set.seed(10)
theme_use=theme(legend.text = element_text(size=16),
                legend.title = element_text(size=20))

p_bermuda_1=plot_cell_trajectory(cds,color_by = "dataset_batch",cell_link_size=0,alpha=0.7,cell_size = 0.2)+
                                   guides(colour = guide_legend(override.aes = list(alpha=0.7, size=5)))+theme_use

p_bermuda_2=plot_cell_trajectory(cds,show_branch_points=T,cell_link_size=2,alpha=0.8,cell_size = 0.2)+
                                   theme(legend.position = "top",
                                         legend.title = element_text(vjust = 0.2),
                                         legend.text = element_text(angle=-50 ),
                                         legend.key.height = unit(0.5,"cm"),
                                         legend.key.width = unit(1,"cm"))+
                                   guides(color = guide_colourbar(label.position = "top"))+theme_use

p_bermuda_3=ggplot(data=df_den)+geom_density(aes(x=Pseudotime,fill=dataset_batch),alpha=0.7)+
                                   scale_y_continuous(expand = c(0,0))+
                                   scale_x_continuous(expand = c(0,0))+
                                   theme(legend.position="top")+theme_use
p_monocle_bermuda0.90=egg::ggarrange(p_bermuda_1,p_bermuda_2,p_bermuda_3,ncol=3,draw=F)

table(is.infinite(pData(cds)[,c("Pseudotime")]))

## 
## FALSE  TRUE 
## 10825    53

p_monocle_bermuda0.90

tt1=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt1

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.54208, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt2=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T2"])
tt2

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T2"]
## D = 0.55244, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt3=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T2"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt3

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T2"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.13306, p-value < 2.2e-16
## alternative hypothesis: two-sided

Stable5[7,2:4]=matrix(get_p_new(c(tt1$p.value,tt2$p.value,tt3$p.value)),1,3)

p_feature_bermuda0.90=plot_cell_clusters(cds,markers =c("FCGR3A","S100A8"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_bermuda0.90_monocle3_feature.tiff",p1,width=12,height = 5,dpi=200)
#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_bermuda0.90_add_monocle3_feature.tiff",p2,width=12,height = 5,dpi=200)

cds_exprs <-  as.matrix(exprs(cds[c("FCGR3A","S100A8")]))[1:2,]
df0=data.frame(cbind(pseudotime=pData(cds)$Pseudotime,log1p(t(cds_exprs)/sizeFactors(cds))))
df0$BatchID=pData(cds)$dataset_batch
df0=df0[is.finite(df0$pseudotime),]
df0=df0[order(df0$pseudotime,decreasing = F),,drop=F]
df0$x=df0$pseudotime/max(df0$pseudotime)
df_pseudotime_list$BERMUDA0.90=df0

scanorama

filename=paste0(datadirpath,"/scanorama_result/adata_scanorama_dataset_batch_2000_False.h5ad")
ad=import("anndata",convert = FALSE)
adata=ad$read_h5ad(filename)
adata=change_obs(adata,exclude=c("n_genes","n_counts","sample"))# colum
df_scanorama=getplotdata(adata,reduction.use = "tsne")
#print(mclust::adjustedRandIndex(df_cca$cluster,df_cca$integrated_snn_res.0.2))
scanorama_dim_use=py_to_r(adata$obsm[["X_scanorama"]])
rownames(scanorama_dim_use)=paste0(df_scanorama$cellname,"-",df_scanorama$dataset_batch)

KL_list$scanorama=BatchKL(df_scanorama,scanorama_dim_use,n_cells=100,batch="batch_label")

cell.meta.data=obj0@meta.data
cell.meta.data$dataset_batch=plyr::mapvalues(cell.meta.data$batch_label,names(maprules),maprules)
gene_ann=data.frame(gene_short_name = make.unique(rownames(obj0@raw.data)),row.names = make.unique(rownames(obj0@raw.data)))
pd <- new("AnnotatedDataFrame",data=cell.meta.data)
fd <- new("AnnotatedDataFrame",data=gene_ann)
cds <- newCellDataSet(obj0@raw.data, phenoData = pd,featureData =fd,
                      expressionFamily = negbinomial.size(),
                      lowerDetectionLimit=1)
options(DelayedArray.block.size=1000e6)
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)

## Removing 12 outliers

disp_table = dispersionTable(cds)
disp_table = disp_table %>% mutate(excess_disp =
                                     (dispersion_empirical - dispersion_fit) / dispersion_fit) %>%
  arrange(plyr::desc(excess_disp))
top_subset_genes = as.character(head(disp_table, 2500)$gene_id)
cds = setOrderingFilter(cds, top_subset_genes)
#cds <- preprocessCDS(cds,  method = 'PCA',
#                         norm_method = 'log',
#                         num_dim = 20,
#                         verbose = T)
colnames(scanorama_dim_use)=paste0("PC_",1:ncol(scanorama_dim_use))
rownames(scanorama_dim_use)=paste0(df_scanorama$cellname,"-",df_scanorama$dataset_batch)
cds@normalized_data_projection=scanorama_dim_use[match(cell.meta.data$cellname,rownames(scanorama_dim_use)),]

#replcae PCA
cds <- reduceDimension(cds, max_components = 2,
                       reduction_method = 'UMAP',
                       metric="correlation",
                       min_dist = 0.75,
                       n_neighbors = 50,
                       verbose = F)
cds <- clusterCells(cds, use_pca=T,
                    method = 'louvain',
                    res = 3e-3,
                    louvain_iter = 1,
                    verbose = F)
#plot_cell_clusters(cds,markers = c("S100A8","FCGR3A"),show_group_id = T, cell_size = 0.1)

cds <- partitionCells(cds)
cds <- learnGraph(cds,  RGE_method = 'SimplePPT')
#plot_cell_trajectory(cds,color_by = "Cluster")
# a helper function to identify the root principal points:
get_correct_root_state <- function(cds, cell_phenotype, root_type){
  cell_ids <- which(pData(cds)[, cell_phenotype] == root_type)
  
  closest_vertex <-
    cds@auxOrderingData[[cds@rge_method]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
  root_pr_nodes <-
    V(cds@minSpanningTree)$name[as.numeric(names
                                           (which.max(table(closest_vertex[cell_ids,]))))]
  
  root_pr_nodes
}
MPP_node_ids = get_correct_root_state(cds,cell_phenotype ='Cluster', "5")
cds <- orderCells(cds, root_pr_nodes = MPP_node_ids)
#plot_cell_trajectory(cds,gr)
#plot_cell_trajectory(cds,color_by = "Cluster")

df_den=pData(cds)[,c("Pseudotime","dataset_batch")]
df_den=df_den[!is.infinite(df_den$Pseudotime),]
set.seed(10)
theme_use=theme(legend.text = element_text(size=16),
                legend.title = element_text(size=20))

p_scanorama_1=plot_cell_trajectory(cds,color_by = "dataset_batch",cell_link_size=0,alpha=0.7,cell_size = 0.2)+
                                   guides(colour = guide_legend(override.aes = list(alpha=0.7, size=5)))+theme_use

p_scanorama_2=plot_cell_trajectory(cds,show_branch_points=T,cell_link_size=2,alpha=0.8,cell_size = 0.2)+
                                   theme(legend.position = "top",
                                         legend.title = element_text(vjust = 0.2),
                                         legend.text = element_text(angle=-50 ),
                                         legend.key.height = unit(0.5,"cm"),
                                         legend.key.width = unit(1,"cm"))+
                                   guides(color = guide_colourbar(label.position = "top"))+theme_use

p_scanorama_3=ggplot(data=df_den)+geom_density(aes(x=Pseudotime,fill=dataset_batch),alpha=0.7)+
                                   scale_y_continuous(expand = c(0,0))+
                                   scale_x_continuous(expand = c(0,0))+
                                   theme(legend.position="top")+theme_use

p_monocle_scanorama=egg::ggarrange(p_scanorama_1,p_scanorama_2,p_scanorama_3,ncol=3,draw=F)

table(is.infinite(pData(cds)[,c("Pseudotime")]))

## 
## FALSE 
## 10878

p_monocle_scanorama

tt1=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt1

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.22419, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt2=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T1"],df_den$Pseudotime[df_den$dataset_batch=="T2"])
tt2

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T1"] and df_den$Pseudotime[df_den$dataset_batch == "T2"]
## D = 0.41335, p-value < 2.2e-16
## alternative hypothesis: two-sided

tt3=ks.test(df_den$Pseudotime[df_den$dataset_batch=="T2"],df_den$Pseudotime[df_den$dataset_batch=="T3"])
tt3

## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  df_den$Pseudotime[df_den$dataset_batch == "T2"] and df_den$Pseudotime[df_den$dataset_batch == "T3"]
## D = 0.22625, p-value < 2.2e-16
## alternative hypothesis: two-sided

Stable5[8,2:4]=matrix(get_p_new(c(tt1$p.value,tt2$p.value,tt3$p.value)),1,3)

p_feature_scanorama=plot_cell_clusters(cds,markers =c("FCGR3A","S100A8"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

p2=plot_cell_clusters(cds,markers =c("CD14","CD163"),cell_size = 0.8,alpha=1)+
  scale_colour_gradient(name="",low="grey",high="red")+theme(legend.position = "right",strip.text = element_text(size=20,face="bold"))

## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_scanorama_monocle3_feature.tiff",p1,width=12,height = 5,dpi=200)
#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_supp/monocyte_scanorama_add_monocle3_feature.tiff",p2,width=12,height = 5,dpi=200)

cds_exprs <-  as.matrix(exprs(cds[c("FCGR3A","S100A8")]))[1:2,]
df0=data.frame(cbind(pseudotime=pData(cds)$Pseudotime,log1p(t(cds_exprs)/sizeFactors(cds))))
df0$BatchID=pData(cds)$dataset_batch
df0=df0[is.finite(df0$pseudotime),]
df0=df0[order(df0$pseudotime,decreasing = F),,drop=F]
df0$x=df0$pseudotime/max(df0$pseudotime)
df_pseudotime_list$scanorama=df0

names(df_pseudotime_list)[which(names(df_pseudotime_list)=="desc")]="DESC"
names(df_pseudotime_list)[which(names(df_pseudotime_list)=="BERMUDA0.90")]="BERMUDA"
save(df_pseudotime_list,file = "df_pseudotime_list_paper_final.RData")
#load("./df_pseudotime_list_paper_final.RData")

Figure 7

#top,right, bottom, right
p=egg::ggarrange(p_desc_2+theme(plot.margin = unit(c(0,0,2,0), "cm")),p_desc_3+theme(plot.margin = unit(c(0,2,0,0), "cm")),
                 p_cca3.0_2,p_cca3.0_3,
                 p_cca_2+theme(plot.margin = unit(c(0,0,2,0), "cm")),p_cca_3,p_mnn_2,p_mnn_3,
                 p_scvi_2+theme(plot.margin = unit(c(0,0,2,0), "cm")),p_scvi_3,p_bermuda_2,p_bermuda_3,
                 p_scanorama_2,p_scanorama_3,p_ori_2,p_ori_3,
                 ncol=4,draw=F,
                 labels = c("a","","b","","c","","d","","e","","f","","g","","h",""),
                 label.args = list(gp = grid::gpar(cex =3.5,color="black",face="bold"),just="bottom"))
                       #label.args = list(gp = grid::gpar(cex =3.5,color="black",face="bold"),y=unit(7.5,"line")))

pp=ggdraw() +
  draw_plot(p, x=0, y=0, width=1, height = 1)+
  draw_label("DESC+monocle3",x=0.25,y=1,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_label("Seurat3.0+monocle3",x=0.75,y=1,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_label("CCA+monocle3",x=0.25,y=0.75,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_label("MNN+monocle3",x=0.75,y=0.75,hjust=0.5,vjust=1,fontface = "bold",size=30)+
 draw_label("scVI+monocle3",x=0.25,y=0.496,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_label("BERMUDA+monocle3",x=0.75,y=0.496,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_label("scanorama+monocle3",x=0.248,y=0.25,hjust=0.5,vjust=1,fontface = "bold",size=30)+
  draw_label("raw+monocle3",x=0.75,y=0.248,hjust=0.5,vjust=1,fontface = "bold",size=30)

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_main/monocyte_pseudotime_batch.tiff",p,width=24,height=22,dpi=300,compression="lzw")
pp

-bottom

data_KL0=do.call("rbind",lapply(names(KL_list),function(x){
  tmp=data.frame(value=KL_list[[x]])
  tmp$Method=x
  return(tmp)
}))
df=data_KL0
colnames(df)=c("value","Method")
df$Method=factor(df$Method,levels=c("DESC","Seurat3.0","CCA","MNN","scVI","BERMUDA0.90","scanorama","raw"),
                 labels=c("DESC","Seurat3.0","CCA","MNN","scVI","BERMUDA","scanorama","raw"))
p_KL=ggplot(df,aes(Method,value))+
   geom_boxplot(aes(color=Method))+
   xlab("")+
   ylab("KL divergence")+
   theme_self+theme(
   axis.text.x   = element_text(angle = 30,hjust=1,vjust=1),
   panel.border =element_blank(),
   axis.line = element_line())+
   ggtitle("")+theme(legend.position = "none")
p_KL

pp2=ggdraw() +
  draw_plot(pp, x=0, y=0.2, width=1, height = 0.8)+
  draw_plot(p_KL,x=0.25, y=0, width=0.5, height = 0.2)+
  draw_label("i",x=0.25,y=0.2,hjust=0.5,vjust=1,fontface = "bold",size=35)

#ggsave("~/Documents/DESC_paper_prepare/DESC_paper_v3/figures/figure_main/monocyte_pseudotime_batch.tiff",p,width=24,height=22,dpi=300,compression="lzw")
pp2

## Figure 7
ggsave(file.path(fig_path,"Figure.7.tiff"),pp2,width=24,height = 26,dpi=300,compression="lzw")
ggsave(file.path(fig_path,"Figure.7.pdf"),pp2,width=24,height = 26,dpi=300)

Figure 8

xx=lapply(names(df_pseudotime_list),function(x){
  x1=df_pseudotime_list[[x]]
  x1$Method=x
  return(x1)
})
df_all=do.call("rbind",xx)
df_all$Method=factor(df_all$Method,names(df_pseudotime_list[c(2:length(df_pseudotime_list),1)]))

ggplot(data=df_all,aes(x=x,y=S100A8))+
  geom_point(aes(color=BatchID),size=0.01)+
  geom_smooth(aes(color=BatchID),method="gam",formula = y ~ s(x, bs="cs"))+
  geom_smooth(color="black",method="gam",formula = y ~ s(x, bs="cs"),size=0.5)+
  facet_wrap(~Method,ncol=3)+
  theme(strip.background = element_blank(),
        strip.text = element_text(size=15,face="bold"),
        axis.text.y = element_text(size=12))

#ggplot(data=df_all0,aes(x=pseudotime,y=FCGR3A))+geom_smooth(aes(color=Method))
df_all0=df_pseudotime_list[c(2:length(df_pseudotime_list),1)]
plist=lapply(names(df_all0),function(x) {
  x1=df_all0[[x]]
  p=ggplot(data=x1,aes(x=x,y=S100A8))+
    geom_point(aes(color=BatchID),size=0.01)+
    guides(color=guide_legend(override.aes = list(size=5)))+
    geom_smooth(aes(color=BatchID),method="gam",formula = y ~ s(x, bs="cs"))+
     geom_smooth(color="black",method="gam",formula = y ~ s(x, bs="cs"),size=0.5)+
    ggtitle(x)+xlab("Pseudotime")+theme(legend.position = "top",
                             plot.title = element_text(size=18,face="bold",hjust=0.5),
                             legend.text = element_text(size=15,face="bold"),
                             legend.title = element_blank())
  return(p)
  #geom_point(size=0.01,position = position_jitter(width = 0.05))+geom_smooth(method="gam",formula = y ~ s(x, bs="cs"))
})


p_legend=get_legend(plist[[1]])

p=egg::ggarrange(plots = lapply(plist,function(x) x+theme(legend.position = "none")),
                                            ncol=3,draw=F, 
                 labels = c("a","b","c","d","e","f","g","h"),
                 label.args = list(gp = grid::gpar(cex =2.5,color="black",face="bold"),just="bottom"))

## adding dummy grobs

                       #label.args = list(gp = grid::gpar(cex =3.5,color="black",face="bold"),y=unit(7.5,"line")))

p_legend=get_legend(plist[[1]])
pp2=ggdraw() +
  draw_plot(p, x=0, y=0, width=1, height = 1)+
  draw_plot(plot_grid(p_legend),x=0.60, y=0, width=0.4, height = 0.4)

pp2

## Figure 8
ggsave(file.path(fig_path,"Figure.8.tiff"),pp2,width=15,height = 12,dpi=300,compression="lzw")
ggsave(file.path(fig_path,"Figure.8.pdf"),pp2,width=15,height = 12,dpi=300)

Supplementary Figure 14

df_all0=

xx0=list(p_feature_ori,
         p_feature_desc,
         p_feature_cca3.0,
         p_feature_cca,
         p_feature_mnn,
         p_feature_scvi,
         p_feature_bermuda0.90,
         p_feature_scanorama)
names(xx0)=names(df_pseudotime_list)
plist0=xx0[c(2:length(xx0),1)]

plist=lapply(names(plist0),function(x) {
  plist0[[x]]+ggtitle(x)+theme(plot.title = element_text(size=25,face="bold",color="blue",hjust=0.5),
                             legend.text = element_text(size=15,face="bold"),
                             legend.title = element_blank())
})
p=egg::ggarrange(plots = plist,ncol=2,draw=F,
                  labels = c("a","b","c","d","e","f","g","h"),
                 label.args = list(gp = grid::gpar(cex =2.5,color="black",face="bold"),just="bottom"))

## Supplementary Figure 14
ggsave(file.path(fig_path,"Figure.S14.tiff"),p,width=24,height = 20,dpi=300,compression="lzw")
ggsave(file.path(fig_path,"Figure.S14.pdf"),p,width=24,height = 20,dpi=300)
p

Supplementary Figure 15

#top,right, bottom, right
xx0=list(p_ori_1,p_desc_1,p_cca3.0_1,p_cca_1,p_mnn_1,p_scvi_1,p_bermuda_1,p_scanorama_1)
names(xx0)=names(df_pseudotime_list)

plist0=xx0[c(2:length(xx0),1)]
plist=lapply(names(plist0),function(x) {
  plist0[[x]]+ggtitle(x)+theme(legend.position = "top",
                             plot.title = element_text(size=18,face="bold",hjust=0.5),
                             legend.text = element_text(size=15,face="bold"),
                             legend.title = element_blank())
})

p_legend=get_legend(plist[[1]])

p=egg::ggarrange(plots = rlist::list.append(lapply(plist,function(x) x+theme(legend.position = "none")),plot_grid(p_legend)),
                                            ncol=3,draw=F,
                  labels = c("a","b","c","d","e","f","g","h",""),
                 label.args = list(gp = grid::gpar(cex =2.5,color="black",face="bold"),just="bottom"))

## Supplementary Figure 15
ggsave(file.path(fig_path,"Figure.S15.tiff"),p,width=15,height = 12,dpi=300,compression="lzw")
ggsave(file.path(fig_path,"Figure.S15.pdf"),p,width=15,height = 12,dpi=300)

Supplementary Table 5

#Supplementary Table 5
Stable5 %>%
  kable() %>%
  kable_styling()

	Method	T1.v.s..T2	T1.v.s..T3	T2.v.s..T3
raw+monocle3	raw+monocle3	<2.2e-16	<2.2e-16	<2.2e-16
DESC+monocle3	DESC+monocle3	4.05e-04	1.43e-08	2.17e-02
Seurat3.0+monocle3	Seurat3.0+monocle3	<2.2e-16	<2.2e-16	4.67e-09
CCA+monocle3	CCA+monocle3	<2.2e-16	<2.2e-16	<2.2e-16
MNN+monocle3	MNN+monocle3	3.86e-14	<2.2e-16	<2.2e-16
scVI+monocle3	scVI+monocle3	<2.2e-16	<2.2e-16	<2.2e-16
BERMUDA+monocle3	BERMUDA+monocle3	<2.2e-16	<2.2e-16	<2.2e-16
scanorama+monocle3	scanorama+monocle3	<2.2e-16	<2.2e-16	<2.2e-16

Deep learning enables accurate clustering with batch effect removal in single-cell RNA-seq analysis

Figures related to the human monocyte dataset

Xiangjie Li, Kui Wang, Yafei Lyu, Huize Pan, Jingxiao Zhang, Dwight Stambolian, Katalin Susztak, Muredach P. Reilly, Gang Hu, Mingyao Li

03/10/2020