3大在线分析工具:Enrichr、WebGestalt、gprofiler与R包clusterprofiler的比较 - 开发者社区

link之家

链接快照平台

输入网页链接，自动生成快照
标签化管理网页链接

3大在线分析工具:Enrichr、WebGestalt、gprofiler与R包clusterprofiler的比较

# ---- Enrichr via the enrichR package ----
# Install enrichR only when it is missing, so re-running the script does not
# reinstall the package on every run.
if (!requireNamespace("enrichR", quietly = TRUE)) {
  install.packages("enrichR")
}
library(enrichR)
dbs <- listEnrichrDbs() # list the available libraries (164 when the author ran this)
dbs[1:4, 1:4]
#   geneCoverage genesPerTerm               libraryName
# 1        13362          275       Genome_Browser_PWMs
# 2        27884         1284  TRANSFAC_and_JASPAR_PWMs
# 3         6002           77 Transcription_Factor_PPIs
# 4        47172         1370                 ChEA_2013
#                                                       link
# 1 http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/
# 2                 http://jaspar.genereg.net/html/DOWNLOAD/
# 4           http://amp.pharm.mssm.edu/lib/cheadownload.jsp
# Choose the libraries to run the enrichment against.
dbs$libraryName # show every library name
# The three GO 2018 libraries (MF, CC, BP) are selected here.
dbs <- c(
  "GO_Molecular_Function_2018",
  "GO_Cellular_Component_2018",
  "GO_Biological_Process_2018"
)
library(clusterProfiler)
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)
symbol <- read.csv("igno.txt", header = FALSE)
# BiocManager::install("org.Hs.eg.db")
# ID conversion: column V1 of `symbol` is treated as Ensembl gene IDs and is
# mapped to gene symbols and Entrez IDs.
df <- bitr(unique(symbol$V1),
  fromType = "ENSEMBL",
  toType = c("SYMBOL", "ENTREZID"),
  OrgDb = org.Hs.eg.db
)
# enrichr() takes a character vector of gene symbols. The original call passed
# `symbol`, the whole data.frame of Ensembl IDs, so the converted symbols in
# `df$SYMBOL` are used instead.
enrichr <- enrichr(df$SYMBOL, dbs)
# This step is slow because it queries the Enrichr web service.

# ---- WebGestalt via the WebGestaltR package ----
# Install the packages only when they are missing.
if (!requireNamespace("WebGestaltR", quietly = TRUE)) {
  install.packages("WebGestaltR")
}
if (!requireNamespace("gdtools", quietly = TRUE)) {
  install.packages("gdtools")
}
library(WebGestaltR)
# Over-representation analysis (ORA)
head(listGeneSet()) # first six of the available gene-set databases
#                                          name
# 1             geneontology_Biological_Process
# 2 geneontology_Biological_Process_noRedundant
# 3             geneontology_Cellular_Component
# 4 geneontology_Cellular_Component_noRedundant
# 5             geneontology_Molecular_Function
# 6 geneontology_Molecular_Function_noRedundant
enrichResult <- WebGestaltR(
  enrichMethod = "ORA",
  organism = "hsapiens",
  # The three non-redundant GO categories (BP, CC, MF).
  enrichDatabase = c(
    "geneontology_Biological_Process_noRedundant",
    "geneontology_Cellular_Component_noRedundant",
    "geneontology_Molecular_Function_noRedundant"
  ),
  # `df$SYMBOL` is an in-memory vector, so it belongs in `interestGene`;
  # `interestGeneFile` expects the path of a file on disk.
  interestGene = df$SYMBOL,
  interestGeneType = "genesymbol",
  # NOTE(review): ORA needs a reference gene set and the original call gave
  # none; "genome" is assumed here -- confirm it matches the web-version run.
  referenceSet = "genome",
  isOutput = TRUE,
  outputDirectory = "./",
  projectName = NULL
)
# Also slow: the call needs access to the external WebGestalt server.

# ---- g:Profiler via the gprofiler2 package ----
# Install gprofiler2 only when it is missing.
if (!requireNamespace("gprofiler2", quietly = TRUE)) {
  install.packages("gprofiler2")
}
library(gprofiler2)
gostres <- gost(
  query = df$SYMBOL,
  organism = "hsapiens",
  ordered_query = FALSE,
  multi_query = FALSE,
  significant = TRUE,
  exclude_iea = FALSE,
  measure_underrepresentation = FALSE,
  evcodes = FALSE,
  user_threshold = 0.05,
  correction_method = "fdr",
  domain_scope = "annotated",
  custom_bg = NULL,
  numeric_ns = "",
  # `sources` must be a character vector. The original passed the bare symbol
  # `GO`, which is not defined anywhere in this script. The three GO branches
  # are named explicitly, matching the "GO:CC"/"GO:BP"/"GO:MF" values the
  # result table is filtered on later.
  sources = c("GO:BP", "GO:CC", "GO:MF"),
  as_short_link = FALSE
)
# This call also needs access to the external g:Profiler server.

# ---- clusterProfiler (runs locally) ----
library(clusterProfiler)
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)
symbol <- read.csv("igno.txt", header = FALSE)
# BiocManager::install("org.Hs.eg.db")
# enrichGO() is given Entrez IDs below, so the IDs in column V1 are converted
# first (from Ensembl to symbol + Entrez).
df <- bitr(
  geneID = unique(symbol$V1),
  fromType = "ENSEMBL",
  toType = c("SYMBOL", "ENTREZID"),
  OrgDb = org.Hs.eg.db
)
# Per the original author's note: the defaults are a p-value cutoff of 0.05
# with "BH" adjustment, for human; the q-value (FDR) cutoff is set to 0.05 so
# that all four tools are compared at the same threshold.
go <- enrichGO(
  gene = df$ENTREZID,
  OrgDb = "org.Hs.eg.db",
  ont = "all",
  readable = TRUE,
  qvalueCutoff = 0.05
)

# ---- Inspect the results ----
# clusterProfiler: pull the result table and split it by GO ontology.
cgo <- go@result
cmf <- cgo[cgo$ONTOLOGY == "MF", ]
cbp <- cgo[cgo$ONTOLOGY == "BP", ]
ccc <- cgo[cgo$ONTOLOGY == "CC", ]
# Row counts per ontology: 19 (CC), 19 (BP) and 14 (MF) terms.
dim(ccc)
dim(cbp)
dim(cmf)
# [1] 19 10
# [1] 19 10
# [1] 14 10
# Enrichr: load the exported result tables (tab-separated) and keep the rows
# whose adjusted p-value is below 0.05.
# NOTE(review): only the BP table is read from a sub-directory; confirm the
# paths of the CC and MF tables.
rbp <- read.csv(file = "mrna+lncrna/salmon/GO_Biological_Process_2018_table.txt", sep = "\t")
rbp <- rbp[rbp$Adjusted.P.value < 0.05, ]
rcc <- read.csv(file = "GO_Cellular_Component_2018_table.txt", sep = "\t")
rcc <- rcc[rcc$Adjusted.P.value < 0.05, ]
rmf <- read.csv(file = "GO_Molecular_Function_2018_table.txt", sep = "\t")
rmf <- rmf[rmf$Adjusted.P.value < 0.05, ]
# Filtering on the adjusted p-value leaves no significant CC or BP terms. The
# original author speculates that this is because Enrichr relies on Fisher's
# exact test rather than on an FDR/BH procedure.
dim(rcc)
dim(rbp)
dim(rmf)
# [1] 0 9
# [1] 0 9
# [1] 4 9
# g:Profiler: load the exported result table and split it by GO branch.
g <- read.csv(file = "gProfiler_hsapiens.csv")
gmf <- g[g$source == "GO:MF", ]
gbp <- g[g$source == "GO:BP", ]
gcc <- g[g$source == "GO:CC", ]
dim(gcc)
dim(gbp)
dim(gmf)
# [1] 20 10
# [1] 54 10
# [1]  8 10
# WebGestalt: load the three exported GO-slim summary tables (tab-separated),
# one per ontology.
wmf <- read.csv(file = "goslim_summary_wg_result1586173032_mf.txt", sep = "\t")
wbp <- read.csv(file = "goslim_summary_wg_result1586186048_bp.txt", sep = "\t")
wcc <- read.csv(file = "goslim_summary_wg_result1586186048_cc.txt", sep = "\t")
dim(wcc)
dim(wbp)
dim(wmf)
# [1] 21  3
# [1] 12  3
# [1] 17  3
# ---- Overlap of significant terms between the four tools ----
library(venn)

# Enrichr's `Term` column holds a label with the GO ID embedded in it, while
# the other three inputs used below are ID columns. Pull out the "GO:<digits>"
# part so the sets are compared on the same key; entries without a GO ID are
# dropped.
enrichr_term_to_go_id <- function(term) {
  term <- as.character(term)
  regmatches(term, regexpr("GO:[0-9]+", term))
}

# NOTE(review): wcc/wbp/wmf are read with read.csv()'s default header
# handling, so a column called `V1` exists only if the file header names it
# that way -- confirm.
cc <- venn(
  list(ccc$ID, enrichr_term_to_go_id(rcc$Term), gcc$term_id, wcc$V1),
  snames = c("cluster profiler", "enrichr", "gprofiler", "WebGestalt"),
  zcolor = "style", sncs = 1, ellipse = TRUE, box = FALSE
)
BP <- venn(
  list(cbp$ID, enrichr_term_to_go_id(rbp$Term), gbp$term_id, wbp$V1),
  snames = c("cluster profiler", "enrichr", "gprofiler", "WebGestalt"),
  zcolor = "style", sncs = 1, ellipse = TRUE, box = FALSE
)
mf <- venn(
  list(cmf$ID, enrichr_term_to_go_id(rmf$Term), gmf$term_id, wmf$V1),
  snames = c("cluster profiler", "enrichr", "gprofiler", "WebGestalt"),
  zcolor = "style", sncs = 1, ellipse = TRUE, box = FALSE
)

# ---- UpSet plot of the CC terms reported by each tool ----
# NOTE(review): upset(..., nsets = ) is assumed to be UpSetR::upset; the
# package is not loaded anywhere in the visible script, so it is loaded here.
library(UpSetR)

# Enrichr's `Term` column embeds the GO ID inside a label; extract the
# "GO:<digits>" part so it is comparable with the ID columns of the other
# tools.
rcc_term <- as.character(rcc$Term)
rcc_go_id <- regmatches(rcc_term, regexpr("GO:[0-9]+", rcc_term))

# `ccu` was used but never defined in the visible script; it is taken to be
# the union of the CC terms reported by the four tools.
ccu <- unique(c(
  as.character(ccc$ID),
  rcc_go_id,
  as.character(gcc$term_id),
  as.character(wcc$V1)
))

# One 0/1 membership column per tool (1 = the tool reported the term).
clcc <- as.numeric(ccu %in% ccc$ID)
rlcc <- as.numeric(ccu %in% rcc_go_id)
glcc <- as.numeric(ccu %in% gcc$term_id)
wlcc <- as.numeric(ccu %in% wcc$V1)
cccc <- data.frame(clcc, rlcc, glcc, wlcc)
rownames(cccc) <- ccu
upset(cccc, nsets = 4)

# Extract every GO identifier ("GO:" followed by digits) found in column `V1`
# of `x`.
#
# x: a data.frame whose column `V1` holds one line of text per row.
# Returns a character vector with all matches, in row order; a row may
# contribute zero, one or several identifiers.
#
# Written with base R (regmatches/gregexpr) instead of
# stringr::str_extract_all, so the function does not depend on stringr being
# attached before its first call. The original definition was also missing
# its closing brace.
myfun <- function(x) {
  lines <- as.character(x$V1) # also covers V1 being read in as a factor
  unlist(regmatches(lines, gregexpr("GO:\\d+", lines)))
}
# Enrichr: read each downloaded library file one line per row (sep = "\n")
# and collect the GO IDs it contains.
# NOTE(review): the CC library is read from "gocc.csv" while BP and MF use the
# "GO_*_2018.txt" naming -- confirm this is the intended file.
rbp <- myfun(read.csv("GO_Biological_Process_2018.txt", sep = "\n", header = FALSE))
rcc <- myfun(read.csv("gocc.csv", sep = "\n", header = FALSE))
rmf <- myfun(read.csv("GO_Molecular_Function_2018.txt", sep = "\n", header = FALSE))
length(rcc)
length(rbp)
length(rmf)
# [1] 446
# [1] 5103
# [1] 1151
# g:Profiler: read each GMT file one line per row (sep = "\n") and collect the
# GO IDs it contains.
library(tidyr)
library(stringr)
gcc <- myfun(read.csv("hsapiens.GO_CC.name.gmt", sep = "\n", header = FALSE))
gbp <- myfun(read.csv("hsapiens.GO_BP.name.gmt", sep = "\n", header = FALSE))
gmf <- myfun(read.csv("hsapiens.GO_MF.name.gmt", sep = "\n", header = FALSE))
length(gcc)
length(gbp)
length(gmf)
# [1] 2005
# [1] 16262
# [1] 4704
# clusterProfiler: run enrichGO() once per ontology with both cutoffs set to 1
# and keep only the names of the gene sets stored on the result object.
library(clusterProfiler)
cmf <- names(enrichGO(
  gene = df$ENTREZID, OrgDb = "org.Hs.eg.db", ont = "MF",
  readable = TRUE, qvalueCutoff = 1, pvalueCutoff = 1
)@geneSets)
ccc <- names(enrichGO(
  gene = df$ENTREZID, OrgDb = "org.Hs.eg.db", ont = "cc",
  readable = TRUE, qvalueCutoff = 1, pvalueCutoff = 1
)@geneSets)
cbp <- names(enrichGO(
  gene = df$ENTREZID, OrgDb = "org.Hs.eg.db", ont = "BP",
  readable = TRUE, qvalueCutoff = 1, pvalueCutoff = 1
)@geneSets)
length(ccc)
length(cbp)
length(cmf)
# [1] 692
# [1] 4937
# [1] 872

# BP terms present in the g:Profiler set but in neither the clusterProfiler
# nor the Enrichr set.
bp <- setdiff(setdiff(gbp, cbp), rbp)
bp[1]
# [1] "GO:0000019"
# All genes annotated to that term:
# ENSG00000020922
# ENSG00000076242
# ENSG00000104884
# ENSG00000113522
# ENSG00000132604
# ENSG00000180532
# Look these genes up with clusterProfiler: the six Ensembl gene IDs listed
# for the term, built from their shared "ENSG00000" prefix.
term <- paste0(
  "ENSG00000",
  c("020922", "076242", "104884", "113522", "132604", "180532")
)
###id转换
df_1 <- bitr(unique(term), fromType = "ENSEMBL",
           toType = c("SYMBOL","ENTREZID"),

3大在线分析工具:Enrichr、WebGestalt、gprofiler与R包clusterprofiler的比较

3大在线分析工具:Enrichr、WebGestalt、gprofiler与R包clusterprofiler的比较

网页版

R包版

网页版

R包版

网页版

R包版

对4种工具筛选结果查看

3种原始库文件比较

差异原因

1.不同工具的GO数据库更新时间不同