呜呜最近发现我工作效率低的一个原因就是重复性工作没有流程化,一气之下,把seurat分析单套数据的流程封装了起来,步骤包含数据质控、数据标准化、聚类以及初步的细胞类型鉴定。细胞类型鉴定是用每个cluster的top marker来标注的。之后再更新整合多套数据的流程,希望与君分享
1. 用到的所有函数放在了SeuratWrapperFunction.R中了
这个需要用source()函数导入到下面封装好的代码中的
### Time: 20221025
### Author: zhengyiyi
## load function
library(Seurat)
library(SingleCellExperiment)
library(scater)
library(MAST)
library(ggplot2)
library(patchwork)
library(RColorBrewer)
library(Seurat)
library(openxlsx)
library("grid")
library("ggplotify")
library(magrittr)
library(dplyr)
library(ggsignif)
library("ggplot2")
library(cowplot)
library(openxlsx)
CreateFirstFilterSeuratObject <- function(exp_count, pr_name, Or_ident,
min_cell, min_gene, mt_pattern,rb_pattern){
# CreateSeuratObject : filter gene and cell by min.cells(Include features detected in at least this many cells) and min.features(Include cells where at least this many features are detected.)
exp_count.seurat <- CreateSeuratObject(exp_count,
project = pr_name,
min.cells = min_cell,
min.features = min_gene)
exp_count.seurat@meta.data$orig.ident <- as.factor(Or_ident)
exp_count.seurat@active.ident <- exp_count.seurat$orig.ident
## Check the mt gene and rp gene
exp_count.seurat[["percent.mt"]] <- PercentageFeatureSet(object = exp_count.seurat, pattern = mt_pattern)
exp_count.seurat[["percent.rb"]] <- PercentageFeatureSet(object = exp_count.seurat, pattern = rb_pattern)
print(levels(exp_count.seurat@active.ident))
print(table(exp_count.seurat$orig.ident))
print(head(x = exp_count.seurat@meta.data, 5))
print(dim(exp_count.seurat@meta.data))
return(exp_count.seurat)
}
SeuratQC <- function(exp_count.seurat, name_pre){
folder_name="1.QC"
if (file.exists(folder_name)){
print("1.QC existed.")
}
else{
dir.create(folder_name)
}
file_name=paste(name_pre,"Violinplot.pdf",sep="_")
reads.drop <- isOutlier(as.numeric(exp_count.seurat$nCount_RNA), nmads = 3, type = "lower")
feature.drop <- isOutlier(as.numeric(exp_count.seurat$nFeature_RNA), nmads = 3, type = "lower")
qc.mito2 <- isOutlier(exp_count.seurat$percent.mt, nmads = 3, type="higher")
qc.ribo2 <- isOutlier(exp_count.seurat$percent.rb, nmads = 3, type="higher")
### 保留下那些质量差的细胞,看看这些细胞的情况
low_q_cells <- (reads.drop | feature.drop | qc.mito2 | qc.ribo2)
exp_count.seurat$lowquality_cells <- "High_qualitycells"
exp_count.seurat$lowquality_cells[low_q_cells] <- "Low_qualitycells"
plot1=FeatureScatter(object = exp_count.seurat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "lowquality_cells")
plot2=FeatureScatter(object = exp_count.seurat, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "lowquality_cells")
plot3=FeatureScatter(object = exp_count.seurat, feature1 = "nCount_RNA", feature2 = "percent.rb", group.by = "lowquality_cells")
pdf(file = file_name, height = 10, width = 20)
print(VlnPlot(object = exp_count.seurat, features = c("nFeature_RNA", "nCount_RNA",
"percent.mt","percent.rb"), ncol = 2, pt.size=0))
opar <- par(no.readonly = TRUE)
par(mfrow=c(2,2))
print(hist(exp_count.seurat$nCount_RNA, breaks = 100, main = "Reads Count Distribution", col = "grey80",
xlab = "library size", ylab = "frequency",
cex.lab = 1.4, cex.axis = 1.4))
print(abline(v = 10000, col = "red", lwd = 2, lty = 2))
print(hist(exp_count.seurat$nFeature_RNA, breaks = 100, main = "Gene Count Distribution",
col = "grey80", xlab = "total mRNA", ylab = "frequency", cex.lab = 1.4, cex.axis = 1.4))
print(abline(v = 1500, col = "red", lwd = 2, lty = 2))
print(hist(exp_count.seurat$percent.mt, breaks = 10, main = "Mitochondrial Percentage Distribution",
col = "grey80", xlab = "total mRNA", ylab = "frequency", cex.lab = 1.4, cex.axis = 1.4))
print(abline(v = 20, col = "red", lwd = 2, lty = 2))
print(hist(exp_count.seurat$percent.rb, breaks = 10, main = "Ribosome Percentage Distribution",
col = "grey80", xlab = "total mRNA", ylab = "frequency", cex.lab = 1.4, cex.axis = 1.4))
print(abline(v = 20, col = "red", lwd = 2, lty = 2))
par(opar)
print(plot1 + plot2 + plot3)
dev.off()
#copy files to 1.QC
result_files <- c(file_name)
file.copy(result_files, folder_name ,overwrite = T)
file.remove(result_files)
print("based on the Violin plot, you can filter the cell by the nFeature_RNA, nCount_RNA and/or percent.mt rb")
}
SubsetSeuratData <- function(exp_count.seurat, min_gene, max_gene,
min_ncountRNA, max_ncountRNA,
max_mt_percent, max_rb_percent){
exp_count.seurat <- subset(exp_count.seurat, subset = nCount_RNA < max_ncountRNA & nCount_RNA > min_ncountRNA & nFeature_RNA < max_gene & nFeature_RNA > min_gene & percent.mt < max_mt_percent & percent.rb < max_rb_percent )
# QC
qc_pdf_name <- paste("filtered_by_gene", min_gene, "UMI", min_ncountRNA, "percent.mt", max_mt_percent, "percent.rb", max_rb_percent, sep="_")
SeuratQC(exp_count.seurat, qc_pdf_name)
dim(exp_count.seurat)
print(levels(exp_count.seurat$orig.ident))
return(exp_count.seurat)
}
SelectPCsByLogNormalize <- function(exp_count.seurat, file_prefix, scale_factor,
nfeatures, npc_plot){
folder_name="2.RunPCA"
if (file.exists(folder_name)){
print("2.RunPCA existed.")
}
else{
dir.create(folder_name)
}
DefaultAssay(exp_count.seurat) <- "RNA"
# Normalize Find VariableGene and Scale Data
exp_count.seurat <- NormalizeData(exp_count.seurat, normalization.method = "LogNormalize",
scale.factor = scale_factor)
exp_count.seurat <- FindVariableFeatures(exp_count.seurat, selection.method = "vst", nfeatures = nfeatures)
exp_count.seurat <- ScaleData(exp_count.seurat, vars.to.regress = c("percent.rb","percent.mt"))
# RunPCA
exp_count.seurat <- RunPCA(exp_count.seurat, pc.genes = exp_count.seurat@var.genes,
npcs = npc_plot, verbose = FALSE,)
# Select PCs:1:30pcs
selected_pcs_name <- paste0(file_prefix, "_Selected_PCs.pdf")
pdf(file = selected_pcs_name, height = 8, width = 8)
print(ElbowPlot(object = exp_count.seurat, ndims = npc_plot, reduction = "pca"))
dev.off()
# Plot
pca_plot_name <- paste0(file_prefix, "_PCA_plot",".pdf")
pdf(pca_plot_name,8,8)
print(DimPlot(exp_count.seurat, reduction="pca", label = TRUE, pt.size=0.5, group.by="ident"))
dev.off()
#Copy files to 2.RunPCA
result_files <- c(selected_pcs_name,pca_plot_name)
file.copy(result_files, folder_name ,overwrite = T)#拷贝文件
file.remove(result_files) #移除拷贝完的文件
return(exp_count.seurat)
}
SelectPCsBySCTransform <- function(exp_count.seurat, file_prefix, npc_plot){
folder_name="2.RunPCA"
if (file.exists(folder_name)){
print("2.RunPCA existed.")
}
else{
dir.create(folder_name)
}
# Normalize Find VariableGene and Scale Data
exp_count.seurat <- SCTransform(exp_count.seurat, vars.to.regress = c("percent.rb","percent.mt"))
# RunPCA
exp_count.seurat <- RunPCA(exp_count.seurat, verbose = FALSE, npcs = npc_plot)
# Select PCs:1:30pcs
selected_pcs_name <- paste0(file_prefix, "_Selected_PCs.pdf")
pdf(file = selected_pcs_name, height = 8, width = 8)
print(ElbowPlot(object = exp_count.seurat, ndims = npc_plot, reduction = "pca"))
dev.off()
# Plot
pca_plot_name <- paste0(file_prefix, "_PCA_plot",".pdf")
pdf(pca_plot_name, 8, 8)
print(DimPlot(exp_count.seurat,reduction="pca", label = TRUE, pt.size= 0.5,group.by="ident",shape.by="orig.ident"))
dev.off()
#Copy files to 2.RunPCA
result_files <- c(selected_pcs_name,pca_plot_name)
file.copy(result_files, folder_name ,overwrite = T)#拷贝文件
file.remove(result_files) #移除拷贝完的文件
return(exp_count.seurat)
}
# RunUMAP and RunTSNE and Then find cluster
PlotCluster <- function(exp_count.seurat, file_prefix, npc_used, k_param, resolution_number){
folder_name="3.PlotCluster"
if (file.exists(folder_name)){
print("3.PlotCluster existed.")
}
else{
dir.create(folder_name)
}
# FinderNeighbors
exp_count.seurat <- FindNeighbors(exp_count.seurat, dims = 1:npc_used, verbose = FALSE, k.param = k_param)
exp_count.seurat <- FindClusters(exp_count.seurat, verbose = FALSE, resolution = resolution_number)
# RunTSNE and RunUMAP
exp_count.seurat <- RunTSNE(exp_count.seurat, dims = 1:npc_used, verbose = FALSE, check_duplicates = FALSE)
exp_count.seurat <- RunUMAP(exp_count.seurat, dims = 1:npc_used, verbose = FALSE)
# RenameIdents from zero to one
levels_define <- as.numeric(levels(exp_count.seurat))
new.cluster.ids <- levels_define + 1
names(new.cluster.ids) <- levels(exp_count.seurat)
exp_count.seurat <- RenameIdents(exp_count.seurat, new.cluster.ids)
combined_cluster_plotname <- paste0(file_prefix, "_combined_cluster_resolution_",resolution_number,".pdf")
pdf(combined_cluster_plotname,7,7)
print(DimPlot(exp_count.seurat,reduction="umap",label = TRUE, pt.size = 0.5,
label.size = 4.5, repel=TRUE, group.by="ident"))
print(DimPlot(exp_count.seurat,reduction="tsne", label = TRUE, pt.size = 0.5,
label.size = 4.5, repel=TRUE, group.by="ident"))
dev.off()
split_clustering_plot_name <- paste0(file_prefix, "_split_clustering_resolution_", resolution_number, ".pdf")
pdf(split_clustering_plot_name,14,7)
print(DimPlot(exp_count.seurat,reduction="umap",label = TRUE, pt.size = 0.5,
label.size = 4.5, group.by="ident", repel=TRUE, split.by="orig.ident"))
print(DimPlot(exp_count.seurat,reduction="tsne", label = TRUE, pt.size = 0.5,
label.size = 4.5, group.by="ident", repel=TRUE, split.by="orig.ident"))
dev.off()
# Copy files to 2.Cluster
file.copy(combined_cluster_plotname, folder_name ,overwrite = T)
file.remove(combined_cluster_plotname)
file.copy(split_clustering_plot_name, folder_name ,overwrite = T)
file.remove(split_clustering_plot_name)
return(exp_count.seurat)
}
# Find markers for each cluster
FindmarkerForCluster <- function(exp_count.seurat, file_prefix, min.pct, logfc.threshold, p_val_adj, mt_rb_pattern){
folder_name="4.MarkersInCluster"
if (file.exists(folder_name)){
print("4.MarkersInCluster file existed.")
}
else{
dir.create(folder_name)
}
cluster.markers <- FindAllMarkers(object = exp_count.seurat, only.pos = TRUE,
min.pct = min.pct, logfc.threshold = logfc.threshold)
index <- cluster.markers$p_val_adj < p_val_adj
cluster.markers <- cluster.markers[index,]
# remove mt and rb gene
index <- grep(mt_rb_pattern, cluster.markers$gene)
if (length(index) > 0){
cluster.markers <- cluster.markers[-index, ]
}
save_name <- paste0(file_prefix,"_MarkersInClusters.csv")
write.csv(cluster.markers,save_name)
file.copy(save_name, folder_name,overwrite = T)
file.remove(save_name)
return(cluster.markers)
}
# Top markers for each cluster
TopMarkersInCluster <- function(cluster.markers, file_prefix, top_num){
library(dplyr)
folder_name="4.MarkersInCluster"
if (file.exists(folder_name)){
print("4.MarkersInCluster file existed.")
}
else{
dir.create(folder_name)
}
#将readsCountSM.markers传给group_by,group_by按cluster 排序,再将结果传给top_n,top_n按avg_logFC排序,显示每个类中的前两个
top_marker <- cluster.markers %>% group_by(cluster) %>% top_n(n = top_num, wt = avg_log2FC)
file_name=paste(file_prefix, "_top_marker", top_num,".csv",sep="")
write.csv(top_marker, file =file_name)
file.copy(file_name, folder_name,overwrite = T)
return(top_marker)
file.remove(file_name)
}
# Rename each cluster with top2 markers
MapTop2MarkerEachCluster <- function(exp_count.seurat, cluster.markers, file_prefix){
library(dplyr)
library(plyr)
exp_count.seurat$seurat_clusters <- exp_count.seurat@active.ident
top2 <- cluster.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
top2 <- top2 %>% group_by(cluster) %>%
dplyr::mutate(markers = paste0(gene, collapse = "/")) %>% dplyr::slice(1)
marker.names <- top2$markers
current.cluster.ids <- as.character(1:(length(unique(Idents(exp_count.seurat)))))
new.cluster.ids <- marker.names
Idents(exp_count.seurat) <- plyr::mapvalues(Idents(exp_count.seurat),
from = current.cluster.ids,
to = new.cluster.ids)
return(exp_count.seurat)
}
2.DataQualityOverviewSingle.R: 数据质控的函数是放在了这个脚本
使用的话需要把source()那行代码改成自己的路径
这边有个特别注意的点就是这个脚本里的第一行
“#!/home/zhengjh/miniconda3/envs/r403/bin/Rscript”
之前我是把脚本放环境变量里了,每次source环境的时候总是就不能直接使用,必须全路径,只要把这个Rscript所在位置换成你用的conda环境中的Rscript就可以了。另外需要chmod +x DataQualityOverviewSingle.R 这个代码,才可以执行
#!/home/zhengjh/miniconda3/envs/r403/bin/Rscript
# parameter
library(optparse)
library(getopt)
option_list <- list(
make_option(c("-o", "--output"), type = "character", default = FALSE,
action = "store", help = "This is the output directory."
),
make_option(c("-t", "--Type"), type = "character", default = "10X",
action = "store", help = "This is type of exp matrix, default is 10X, the other is .csv which row is gene, column is cell."
),
make_option(c("--datapath"), type = "character", default = FALSE,
action = "store", help = "This is the path of exp matrix."
),
make_option(c("--project_name"), type = "character", default = FALSE,
action = "store", help = "This is project name built in seurat object and the pre_fix of files generated by this function."
),
make_option(c("--mt_pattern"), type = "character", default = "^mt-",
action = "store", help = "This is mitochondria pattern used to calculate its percentage, default is the ^mt-."
),
make_option(c("--rb_pattern"), type = "character", default = "^Rpl|^Rps",
action = "store", help = "This is ribosome pattern used to calculate its percentage, default is ^Rpl|^Rps"
),
make_option(c("--min_cell"), type = "integer", default = 3,
action = "store", help = "This is min cells number to filter, default is 3."
),
make_option(c("--min_gene"), type = "integer", default = 0,
action = "store", help = "This is min gene number to filter, default is 0."
)
)
# -help
opt = parse_args(OptionParser(option_list = option_list,
usage = "This script is general overview data quality of single cell exp matrix!"))
print(opt)
#### step0.load data ####
source("/home/zhengjh/scripts/seurat/function/SeuratWrapperFunction.R")
print("Step0: Load Data")
##### step00. load functions ####
# read in the functions
timestart<-Sys.time()
# set the output path
cat("Current working dir: ", opt$output)
cat("\n")
setwd(opt$output)
# load ref object
#### step01. load exp matrix ####
print("Step0: Load exp matrix data")
if(opt$Type=="10X"){
print("Read exp matrix from 10X cellranger output")
exp_count <- Read10X(data.dir = opt$datapath)
print("The rownum and colnum of exp_count is ")
print(dim(exp_count))
}else{
print("Read exp matrix from csv file")
exp_count <- read.csv(opt$datapath, header = T, row.names = 1)
print("The rownum and colnum of normal exp is ")
print(dim(exp_count))
}
#### step1: Create_seurat_filter_cell ######
print("Step1: Creat Seuart Object with filtering lower genes and lower cells")
Or_ident <- rep(opt$project_name, ncol(exp_count))
exp_count.seurat <- CreateFirstFilterSeuratObject(exp_count, opt$project_name, Or_ident,
opt$min_cell, opt$min_gene,
opt$mt_pattern, opt$rb_pattern)
print("The row_num and col_num of the first filtered matrix:")
print(dim(exp_count.seurat))
print("The cellnumber of samples:")
print(table(exp_count.seurat$orig.ident))
name_pre <- paste0("filter_gene_expressed_", opt$min_cell, "cells", opt$min_gene, "genes")
#### step2: Overview of the data quality ######
print("Step2: Overview of the data quality")
SeuratQC(exp_count.seurat, name_pre)
rds_name <- paste0(opt$project_name, "_dataquality_overview_seurat.rds")
saveRDS(exp_count.seurat, file = rds_name)
print("Happy~Finished!!!")
timeend<-Sys.time()
runningtime<-timeend-timestart
print(runningtime)
3.LogNormalizeClusteringSingle.R脚本: LogNormalize数据并聚类
#!/home/zhengjh/miniconda3/envs/r403/bin/Rscript
# parameter
library(optparse)
library(getopt)
option_list <- list(
make_option(c("-o", "--output"), type = "character", default = FALSE,
action = "store", help = "This is the output directory."
),
make_option(c( "--seuratobject"), type = "character", default = "10X",
action = "store", help = "This is absoulte path of seuratobject."
),
make_option(c("--min_gene"), type = "integer", default = 50,
action = "store", help = "This is min gene number to filter, default is 50."
),
make_option(c("--max_gene"), type = "integer", default = 10000,
action = "store", help = "This is max gene number to filter, default is 10000."
),
make_option(c("--min_ncountRNA"), type = "integer", default = 500,
action = "store", help = "This is min ncountRNA to filter, default is 5000."
),
make_option(c("--max_ncountRNA"), type = "integer", default = 100000,
action = "store", help = "This is min ncountRNA to filter, default is 100000."
),
make_option(c("--max_mt_percent"), type = "integer", default = 60,
action = "store", help = "This is min mitochondria percentage to filter, default is 60."
),
make_option(c("--max_rb_percent"), type = "integer", default = 60,
action = "store", help = "This is min ribosome percentage to filter, default is 60."
),
make_option(c("--file_prefix"), type = "character",
action = "store", help = "This is number of file prefix to used in the following analysis."
),
make_option(c("--scale_factor"), type = "integer", default = 10000,
action = "store", help = "This is scale factor used in Seurat NormalizeData function, default is 10000."
),
make_option(c("--nfeatures"), type = "integer", default = 2000,
action = "store", help = "This is number of features used in Seurat FindVariableFeatures function, default is 2000."
),
make_option(c("--npc_plot"), type = "integer", default = 50,
action = "store", help = "This is number of principles to plot, default is 50."
),
make_option(c("--npc_used"), type = "integer", default = 20,
action = "store", help = "This is number of principles to use in RunTSNE and RunUMAP function, default is 20."
),
make_option(c("--k_parameter"), type = "integer", default = 20,
action = "store", help = "This is k parameter used in FindNeighbors function which will influence the clusering number, default is 20."
),
make_option(c("--resolution_number"), type = "double", default = 0.5,
action = "store", help = "This is resolution number used in FindClusters function , default is 0.5"
),
make_option(c("--min_pct"), type = "double", default = 0.25,
action = "store", help = "This is min.pct used in FindAllMarkers function , default is 0.25"
),
make_option(c("--logfc_threshold"), type = "double", default = 0.25,
action = "store", help = "This is logfc.threshold used in FindClusters function , default is 0.25"
),
make_option(c("--p_val_adj"), type = "double", default = 0.05,
action = "store", help = "This is p_val_adj used to filter marker , default is 0.05"
),
make_option(c("--top_num"), type = "integer", default = 10,
action = "store", help = "This is top num marker of each cluster, default is 10"
),
make_option(c("--mt_rb_pattern"), type = "character", default = "^mt-|^Rpl-|^Rps-",
action = "store", help = "This is mt_rb_pattern used to filter mt/rb genes found in cluster markers, default is ^mt-|^Rpl-|^Rps-"
)
)
# -help
opt = parse_args(OptionParser(option_list = option_list, usage = "This script is general processing of single cell exp matrix!"))
print(opt)
source("/home/zhengjh/scripts/seurat/function/SeuratWrapperFunction.R")
###### step0. subset seurat object ####
timestart<-Sys.time()
# set the output path
setwd(opt$output)
exp_count.seurat <- readRDS(opt$seuratobject)
print("Step0: Subset Seurat Object")
cat(opt$min_gene, "< gene number <", opt$max_gene, "\n")
cat(opt$min_ncountRNA, "< ncountRNA <", opt$max_ncountRNA, "\n")
cat("max_mt_percent ", opt$max_mt_percent, "\n")
cat("max_rb_percent ", opt$max_rb_percent, "\n")
exp_count.seurat <- SubsetSeuratData(exp_count.seurat = exp_count.seurat,
min_gene = opt$min_gene, max_gene = opt$max_gene,
min_ncountRNA = opt$min_ncountRNA, max_ncountRNA = opt$max_ncountRNA,
max_mt_percent = opt$max_mt_percent, max_rb_percent = opt$max_rb_percent)
#### step1. normalize data and run pca ####
print("Step1: LogNormalize Data and Run PCA")
cat("scale factor used in NormalizeData function is ", opt$scale_factor, "\n")
cat("nfeatures used in Seurat FindVariableFeatures function is ", opt$nfeatures, "\n")
cat("npc_plot used in ElbowPlot is ", opt$npc_plot, "\n")
exp_count.seurat <- SelectPCsByLogNormalize(exp_count.seurat = exp_count.seurat,
file_prefix = opt$file_prefix,
scale_factor = opt$scale_factor,
nfeatures = opt$nfeatures,
npc_plot = opt$npc_plot)
#### step2. find clusters ####
print("Step2: FindNeighbors and clusters and then run tsne and umap reduction")
cat("k_param used in Seurat FindNeighbors function is ", opt$k_param, "\n")
cat("resolution_number used in Seurat FindClusters function is ", opt$resolution_number, "\n")
cat("npc_used used in RunTSNE and RUNUMAP is ", opt$npc_used, "\n")
exp_count.seurat <- PlotCluster(exp_count.seurat = exp_count.seurat,
file_prefix = opt$file_prefix,
npc_used = opt$npc_used,
k_param = opt$k_param,
resolution_number = opt$resolution_number)
#### step3. find top markers #####
print("Step3: find top markers and then map cluster with top2 marke")
cat("min.pct used in Seurat FindAllMarkers function is ", opt$min.pct, "\n")
cat("logfc.threshold used in Seurat FindAllMarkers function is ", opt$logfc.threshold, "\n")
cat("p_val_adj used to filter markers is ", opt$p_val_adj, "\n")
cluster.markers <- FindmarkerForCluster(exp_count.seurat = exp_count.seurat,
file_prefix = opt$file_prefix,
min.pct = opt$min_pct,
logfc.threshold = opt$logfc_threshold,
p_val_adj = opt$p_val_adj,
mt_rb_pattern = opt$mt_rb_pattern)
TopMarker <- TopMarkersInCluster(cluster.markers = cluster.markers,
file_prefix = opt$file_prefix,
top_num = opt$top_num)
# Top markers heatmap
P_heatmap_top_marker <- DoHeatmap(exp_count.seurat, features = TopMarker$gene)
filename <- paste0(opt$file_prefix, "_Heatmap_plot_top_markers.pdf")
folder_name <- "4.MarkersInCluster"
pdf(filename, 30, 15)
print(P_heatmap_top_marker)
dev.off()
file.copy(filename, folder_name, overwrite = T) #copy files
file.remove(filename)
#### step4. map top2 marker to each cluster #####
print("step4: map top2 marker to each cluster")
exp_count.seurat <- MapTop2MarkerEachCluster(exp_count.seurat = exp_count.seurat,
cluster.markers = cluster.markers,
file_prefix = opt$file_prefix)
#### step5. save the clustering plots aftering assign top2 markers ###
print("step5. save the clustering plots aftering assign top2 markers")
p1 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident") + NoLegend()
p1 <- p1 + labs(title="Clustering UMAP Reduction")
p1 <- p1 + theme(plot.title = element_text(hjust = 0.5))
p2 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident")
p2 <- p2 + labs(title="Clustering TSNE Reduction")
p2 <- p2 + theme(plot.title = element_text(hjust = 0.5))
plots <- plot_grid(p1, p2, ncol=2,rel_widths = c(1.7, 2.2))
cluster_pdfname <- paste0(opt$file_prefix, "_Clustering_TSNE_UMAP.pdf")
folder_name <- "3.PlotCluster"
pdf(cluster_pdfname, width = 18, height = 6)
print(plots)
dev.off()
file.copy(cluster_pdfname, folder_name, overwrite = T)#拷贝文件
file.remove(cluster_pdfname)
rds_name <- paste0(opt$file_prefix, "_lognormalize_clustering_seurat.rds.rds")
saveRDS(exp_count.seurat, file = rds_name)
print("Happy~Finished!!")
timeend<-Sys.time()
runningtime<-timeend-timestart
print(runningtime)
4. SCTransformClusteringSingle.R 脚本: SCTtransform数据并聚类
#!/home/zhengjh/miniconda3/envs/r403/bin/Rscript
# parameter
library(optparse)
library(getopt)
option_list <- list(
make_option(c("-o", "--output"), type = "character", default = FALSE,
action = "store", help = "This is the output directory."
),
make_option(c( "--seuratobject"), type = "character", default = "10X",
action = "store", help = "This is absoulte path of seuratobject."
),
make_option(c("--min_gene"), type = "integer", default = 50,
action = "store", help = "This is min gene number to filter, default is 50."
),
make_option(c("--max_gene"), type = "integer", default = 10000,
action = "store", help = "This is max gene number to filter, default is 10000."
),
make_option(c("--min_ncountRNA"), type = "integer", default = 500,
action = "store", help = "This is min ncountRNA to filter, default is 5000."
),
make_option(c("--max_ncountRNA"), type = "integer", default = 100000,
action = "store", help = "This is min ncountRNA to filter, default is 100000."
),
make_option(c("--max_mt_percent"), type = "integer", default = 60,
action = "store", help = "This is min mitochondria percentage to filter, default is 60."
),
make_option(c("--max_rb_percent"), type = "integer", default = 60,
action = "store", help = "This is min ribosome percentage to filter, default is 60."
),
make_option(c("--file_prefix"), type = "character",
action = "store", help = "This is number of file prefix to used in the following analysis."
),
make_option(c("--npc_plot"), type = "integer", default = 50,
action = "store", help = "This is number of principles to plot, default is 50."
),
make_option(c("--npc_used"), type = "integer", default = 20,
action = "store", help = "This is number of principles to use in RunTSNE and RunUMAP function, default is 20."
),
make_option(c("--k_parameter"), type = "integer", default = 20,
action = "store", help = "This is k parameter used in FindNeighbors function which will influence the clusering number, default is 20."
),
make_option(c("--resolution_number"), type = "double", default = 0.5,
action = "store", help = "This is resolution number used in FindClusters function , default is 0.5"
),
make_option(c("--min_pct"), type = "double", default = 0.25,
action = "store", help = "This is min.pct used in FindAllMarkers function , default is 0.25"
),
make_option(c("--logfc_threshold"), type = "double", default = 0.25,
action = "store", help = "This is logfc.threshold used in FindClusters function , default is 0.25"
),
make_option(c("--p_val_adj"), type = "double", default = 0.05,
action = "store", help = "This is p_val_adj used to filter marker , default is 0.05"
),
make_option(c("--top_num"), type = "integer", default = 10,
action = "store", help = "This is top num marker of each cluster, default is 10"
),
make_option(c("--mt_rb_pattern"), type = "character", default = "^mt-|^Rpl-|^Rps-",
action = "store", help = "This is mt_rb_pattern used to filter mt/rb genes found in cluster markers, default is ^mt-|^Rpl-|^Rps-"
)
)
# -help
opt = parse_args(OptionParser(option_list = option_list, usage = "This script is general processing of single cell exp matrix!"))
print(opt)
source("/home/zhengjh/scripts/seurat/function/SeuratWrapperFunction.R")
###### step0. subset seurat object ####
timestart<-Sys.time()
# set the output path
setwd(opt$output)
exp_count.seurat <- readRDS(opt$seuratobject)
print("Step0: Subset Seurat Object")
cat(opt$min_gene, "< gene number <", opt$max_gene, "\n")
cat(opt$min_ncountRNA, "< ncountRNA <", opt$max_ncountRNA, "\n")
cat("max_mt_percent ", opt$max_mt_percent, "\n")
cat("max_rb_percent ", opt$max_rb_percent, "\n")
exp_count.seurat <- SubsetSeuratData(exp_count.seurat = exp_count.seurat,
min_gene = opt$min_gene, max_gene = opt$max_gene,
min_ncountRNA = opt$min_ncountRNA, max_ncountRNA = opt$max_ncountRNA,
max_mt_percent = opt$max_mt_percent, max_rb_percent = opt$max_rb_percent)
#### step1. normalize data and run pca ####
print("Step1: SCTtransform Data and Run PCA")
cat("scale factor used in NormalizeData function is ", opt$scale_factor, "\n")
cat("nfeatures used in Seurat FindVariableFeatures function is ", opt$nfeatures, "\n")
cat("npc_plot used in ElbowPlot is ", opt$npc_plot, "\n")
exp_count.seurat <- SelectPCsBySCTransform(exp_count.seurat = exp_count.seurat,
file_prefix = opt$file_prefix,
npc_plot = opt$npc_plot)
#### step2. find clusters ####
print("Step2: FindNeighbors and clusters and then run tsne and umap reduction")
cat("k_param used in Seurat FindNeighbors function is ", opt$k_param, "\n")
cat("resolution_number used in Seurat FindClusters function is ", opt$resolution_number, "\n")
cat("npc_used used in RunTSNE and RUNUMAP is ", opt$npc_used, "\n")
exp_count.seurat <- PlotCluster(exp_count.seurat = exp_count.seurat,
file_prefix = opt$file_prefix,
npc_used = opt$npc_used,
k_param = opt$k_param,
resolution_number = opt$resolution_number)
#### step3. find top markers #####
print("Step3: find top markers and then map cluster with top2 marke")
cat("min.pct used in Seurat FindAllMarkers function is ", opt$min.pct, "\n")
cat("logfc.threshold used in Seurat FindAllMarkers function is ", opt$logfc.threshold, "\n")
cat("p_val_adj used to filter markers is ", opt$p_val_adj, "\n")
cluster.markers <- FindmarkerForCluster(exp_count.seurat = exp_count.seurat,
file_prefix = opt$file_prefix,
min.pct = opt$min_pct,
logfc.threshold = opt$logfc_threshold,
p_val_adj = opt$p_val_adj,
mt_rb_pattern = opt$mt_rb_pattern)
TopMarker <- TopMarkersInCluster(cluster.markers = cluster.markers,
file_prefix = opt$file_prefix,
top_num = opt$top_num)
# Top markers heatmap
P_heatmap_top_marker <- DoHeatmap(exp_count.seurat, features = TopMarker$gene)
filename <- paste0(opt$file_prefix, "_Heatmap_plot_top_markers.pdf")
folder_name <- "4.MarkersInCluster"
pdf(filename, 30, 15)
print(P_heatmap_top_marker)
dev.off()
file.copy(filename, folder_name, overwrite = T)# copy files
file.remove(filename)
#### step4. map top2 marker to each cluster #####
print("step4: map top2 marker to each cluster")
exp_count.seurat <- MapTop2MarkerEachCluster(exp_count.seurat = exp_count.seurat,
cluster.markers = cluster.markers,
file_prefix = opt$file_prefix)
#### step5. save the clustering plots aftering assign top2 markers ###
print("step5. save the clustering plots aftering assign top2 markers")
p1 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident") + NoLegend()
p1 <- p1 + labs(title="Clustering UMAP Reduction")
p1 <- p1 + theme(plot.title = element_text(hjust = 0.5))
p2 <- DimPlot(exp_count.seurat, reduction = "umap",label = TRUE,
pt.size = 0.5, label.size = 4.5, repel = TRUE, group.by = "ident")
p2 <- p2 + labs(title="Clustering TSNE Reduction")
p2 <- p2 + theme(plot.title = element_text(hjust = 0.5))
plots <- plot_grid(p1, p2, ncol=2,rel_widths = c(1.7, 2.2))
cluster_pdfname <- paste0(opt$file_prefix, "_Clustering_TSNE_UMAP.pdf")
folder_name <- "3.PlotCluster"
pdf(cluster_pdfname, width = 18, height = 6)
print(plots)
dev.off()
file.copy(cluster_pdfname, folder_name, overwrite = T)#拷贝文件
file.remove(cluster_pdfname)
rds_name <- paste0(opt$file_prefix, "_sctransform_clustering_seurat.rds.rds")
saveRDS(exp_count.seurat, file = rds_name)
print("Happy~Finished!!")
timeend<-Sys.time()
runningtime<-timeend-timestart
print(runningtime)
5.附上使用说明
1) 把SeuratWrapperFunction.R模块下面的代码复制到文件SeuratWrapperFunction.R,并存储
2) 把DataQualityOverviewSingle.R模块下面的代码复制到文件DataQualityOverviewSingle.R,并存储
3) 把LogNormalizeClusteringSingle.R模块下面的代码复制到文件LogNormalizeClusteringSingle.R,并存储
4) 把SCTransformClusteringSingle.R模块下面的代码复制到文件SCTransformClusteringSingle.R,并存储
5) 可执行权限
chmod +x DataQualityOverviewSingle.R
chmod +x LogNormalizeClusteringSingle.R
chmod +x SCTransformClusteringSingle.R
6)使用
## 质量控制
Rscript ~/scripts/seurat/pipeline/DataQualityOverviewSingle.R -o /home/zhengjh/other/PinealGland/Results/Zebrafish -t 10X --datapath /home/zhengjh/other/PinealGland/Data/Zebrafish_GSE123778_RAW/GSM3511192_scSeq1 \
--project_name "zebrafish_scseq1" --mt_pattern "^mt-" --rb_pattern "^rpl|^rps" --min_cell 3 --min_gene 100
## LogNormalize
Rscript ~/scripts/seurat/pipeline/LogNormalizeClusteringSingle.R -o ~/other/PinealGland/Results/Zebrafish --seuratobject ~/other/PinealGland/Results/Zebrafish/zebrafish_scseq1_dataquality_overview_seurat.rds \
--min_gene 200 --max_gene 6500 --min_ncountRNA 200 --max_ncountRNA 75000 \
--max_mt_percent 20 --max_rb_percent 20 --file_prefix zebrafish_scseq1_logtransform \
--scale_factor 10000 --nfeatures 2000 --npc_plot 50 --npc_used 30 \
--k_parameter 20 --resolution_number 0.5 --min_pct 0.25 --logfc_threshold 0.25 \
--p_val_adj 0.05 --top_num 10 --mt_rb_pattern "^mt-|rpl|rps"
## SCTransform
Rscript ~/scripts/seurat/pipeline/SCTransformClusteringSingle.R -o ~/other/PinealGland/Results/Zebrafish --seuratobject ~/other/PinealGland/Results/Zebrafish/zebrafish_scseq1_dataquality_overview_seurat.rds --min_gene 200 --max_gene 6500 --min_ncountRNA 200 --max_ncountRNA 75000 \
--max_mt_percent 20 --max_rb_percent 20 --file_prefix zebrafish_scseq1_scttransform \
--npc_plot 50 --npc_used 30 \
--k_parameter 20 --resolution_number 0.5 --min_pct 0.25 --logfc_threshold 0.25 \
--p_val_adj 0.05 --top_num 10 --mt_rb_pattern "^mt-|rpl|rps"
7) 结果展示
结果展示
最后放个图片
祝君开心快乐 健康!
也希望自己一直保持学习 不郁闷呜呜u
image.png