看到我们生信入门班里有学员问:单细胞里面怎么将亚群与预后关联起来呢?来看看~
比如这篇2025年10月10号发表在Oncogene杂志的研究mast细胞的泛癌文献,标题为《Comprehensive single-cell analysis reveals mast cells’ roles in cancer immunity》。里面就有很好的一个示例~
亚群生存分析策略
这篇文献整理了超多公共数据库的数据,然后注释,提取其中的mast细胞整合到一起,细分了9个亚群。其中重点研究了 C7-HLA-DR cluster 这个亚群与临床生存结局之间的相关性,图如下:
策略:拿到 C7-HLA-DR cluster 这个亚群的特征基因作为signature,使用带有生存信息的bulk-RNA seq数据做打分,根据打分对bulk-RNA seq数据样本进行分组,然后看两个组别之间的OS等生存差异。
Fig. 4 The relationship between C7-HLA-DR cluster and clinical outcomes.
我们这里以TCGA数据库中的肝癌样本为例,来分析看看~
step1:TCGA肝癌样本数据下载
这里参考我们之前的文章:TCGA数据库| 如何将表达矩阵与样本临床数据进行合并?
## download tcga data
## update: 2024-02-22
## Author: zhang juan
## load packages
rm(list=ls())
library(TCGAbiolinks)
library(SummarizedExperiment)
library(tidyverse)
# 癌症类型,用 getGDCprojects()$project_id 查看所有
getGDCprojects()$project_id
# "TCGA-LIHC"
# 设置query
query <- GDCquery(
project = "TCGA-LIHC", # 癌症类型,用 getGDCprojects()$project_id 查看所有
data.category="Transcriptome Profiling", # 数据类别, 用getProjectSummary(project)查看所有类别
data.type ="Gene Expression Quantification", # 数据类型
workflow.type="STAR - Counts" # 工作流类型
)
## 下载数据
GDCdownload(query=query, files.per.chunk= 50, directory = "./")
## 整理数据并存储为 R对象
GDCprepare(query,save=T, save.filename="TCGA-LIHC.transcriptome.Rdata", directory = "./")
load("TCGA-LIHC.transcriptome.Rdata")
ls()
names(assays(data))
rowdata <- rowData(data)
提取其中的表达矩阵:
table(rowdata$gene_type)
count <- assay(data,"unstranded") # counts矩阵
tpm <- assay(data, "tpm_unstrand") # tpm矩阵
fpkm <- assay(data,"fpkm_unstrand") # fpkm矩阵
count[1:5,1:6]
# 添加gene_symbol, 先提取gene_name
symbol <- rowData(data)
head(symbol)
count_symbol <- cbind(data.frame(symbol$gene_name), as.data.frame(count))
kp <- duplicated(count_symbol$symbol.gene_name)
temp <- count_symbol[kp,]
count_symbol <- count_symbol[!kp,]
tpm_symbol <- cbind(data.frame(symbol$gene_name), as.data.frame(tpm)) # tpm矩阵
tpm_symbol <- tpm_symbol[!kp,]
# 最后保存
save(count_symbol, tpm_symbol, file = "exp.RData")
step2:临床信息下载
这里主要是得到样本的生存时间和生存结局:
query <- GDCquery(
project = "TCGA-LIHC",
data.category = "Clinical",
data.format = "bcr xml"
)
# save(query, file = "TCGA-LIHC.clinic.query.rdata")
# #load("TCGA-LIHC.clinic.query.rdata")
GDCdownload(query, files.per.chunk= 50, directory = "./")
clinical <- GDCprepare_clinic(query, clinical.info = "patient", directory = "./")
clinical.follow_up <- GDCprepare_clinic(query, clinical.info = "follow_up", directory = "./")
clinical.stage_event <- GDCprepare_clinic(query, clinical.info = "stage_event", directory = "./")
clinical.drug <- GDCprepare_clinic(query, clinical.info = "drug", directory = "./")
clinical.radiation <- GDCprepare_clinic(query, clinical.info = "radiation", directory = "./")
clinical.admin <- GDCprepare_clinic(query, clinical.info = "admin", directory = "./")
# 保存
saveRDS(clinical, file = "TCGA-LIHC.clinical_patient.rds")
saveRDS(clinical.admin, file = "TCGA-LIHC.clinical_admin.rds")
saveRDS(clinical.drug, file = "TCGA-LIHC.clinical_drug.rds")
saveRDS(clinical.follow_up, file = "TCGA-LIHC.clinical_follow_up.rds")
saveRDS(clinical.radiation, file = "TCGA-LIHC.clinical_radiation.rds")
saveRDS(clinical.stage_event, file = "TCGA-LIHC.clinical_stage_event.rds")
step3:计算打分
使用AUCell:https://bioconductor.org/packages/release/bioc/vignettes/AUCell/inst/doc/AUCell.html
输入表达矩阵:使用count矩阵,这里的基因集使用的文献给的代码中放好的,就是有点奇怪,每个亚群的基因只用了5个,但是表格里面不止这么多。
library(AUCell)
# 定义基因集
geneSets = list(c0=c("TPSAB1","RPL27A","RPS27","RPL41","RPL35"),
c1=c("TPSAB1","DNAJB1","HSPH1","JUN","BAG3"),
c2=c("TPSAB1","NFKB1","NR4A3","VEGFA","KDM6B"),
c3=c("TPSAB1","RGS1","CMA1","CTSG","FOS"),
c4=c("TPSAB1","PKM","MYDGF","PDIA3","PPIB"),
c5=c("TPSAB1","CREM","BCL2A1","BIRC3","EIF1"),
c6=c("TPSAB1","TXNIP","TPSB2","CTSD","S100A4"),
c7=c("TPSAB1","HLA-DPB1","HLA-DPA1","HLA-DQB1","HLA-DMB"),
c8=c("TPSAB1","HSPA1A","EGR1","HSPE1","FOS"),
c9=c("TPSAB1","CCL5","IL32","NKG7","IL7R"))
# 加载表达矩阵
load("./TCGA-LIHC/exp.RData")
count_symbol[1:6,1:5]
rownames(count_symbol) <- count_symbol$symbol.gene_name
count_symbol <- count_symbol[,-1]
count_symbol[1:6,1:5]
# 过滤低表达
kp <- rowSums(count_symbol)>1
table(kp)
expr_data <- count_symbol[kp,]
sparse_expr <- as(as.matrix(expr_data), "sparseMatrix")
cells_rankings <- AUCell_buildRankings(sparse_expr )
auc_scores <- AUCell_calcAUC(geneSets, cells_rankings)
temp <- getAUC(auc_scores)
auc_df <- data.frame(id = colnames(expr_data), score = t(temp))
head(auc_df)
这样就得到了每个样本的9个基因集的打分。
step4:打分与临床信息整合
现在加载之前的临床信息与上面的打分合并,整理后就可以根据打分对样本进行分组,看两组间的生存时间差异啦!
先加载临床信息,预处理
过滤小于30天的patients,没有临床结局的patients。
# 加载临床信息
clinical <- readRDS("./TCGA-LIHC/TCGA-LIHC.clinical_follow_up.rds")
colnames(clinical)
clinical <- clinical[,c("bcr_followup_barcode","vital_status","days_to_last_followup","days_to_death" )]
head(clinical)
clinical$OS <- ifelse(!is.na(clinical$days_to_death), clinical$days_to_death, clinical$days_to_last_followup)
clinical <- clinical[clinical$OS>30,]
table(clinical$vital)
clinical <- clinical[clinical$vital!="",]
clinical$vital <- ifelse(clinical$vital_status=="Dead",1,0)
dim(clinical)
与上面的表达矩阵合并
这里先提取肿瘤样本:
# auc打分选肿瘤样本
kp <- substr(rownames(auc_df),14,16)
table(kp)
auc_df_tumor <- auc_df[rownames(auc_df)[kp=="01A"],]
auc_df_tumor$patient <- substr(auc_df_tumor$id,1,12)
# 合并
auc_clin <- merge(clinical, auc_df_tumor,by.x = "bcr_patient_barcode", by.y = "patient" )
head(auc_clin)
step5:绘制 C7-HLA-DR cluster 亚群相关KM曲线
与LICH的预后相关性:
# 选择最佳阈值分组点
res.cut <- surv_cutpoint(auc_clin, time = "OS", event = "vital", variables = "score.c7" )
res.cat <- surv_categorize(res.cut)
head(res.cat)
auc_clin$group <- res.cat[,"score.c7"]
绘制KM曲线:
# 生存分析
fit <- survfit(Surv(OS, vital) ~score.c7, data = res.cat)
p3 <- ggsurvplot(fit, pval=TRUE,
legend.labs=c( "High","Low"), size = 1,
risk.table = F, palette = c("#fc9272","#6baed6"))
p <- p3$plot +
xlab("Time(Days) ") +
labs(title = "LIHC", color = "C7-HLA-DR" ) + # 修改颜色图例的标题
theme(
plot.title = element_text(hjust = 0.5), # 标题居中
legend.position = "right" ) # 图例放到右侧
p
上面的pvalue小于0.05,说明这个亚群可能与病人的生存相关~
这一套学废了吗?
后面还会有其他的单细胞生存分析技巧,欢迎关注~
如果上面的内容对你有用,欢迎一键三连~
转发:
-
生信入门&数据挖掘线上直播课2026年1月班,你的生物信息学入门课 -
时隔5年,我们的生信技能树VIP学徒继续招生啦 -
满足你生信分析计算需求的低价解决方案 -
生信故事会,来看看他们的生信入门故事 -
生信马拉松答疑专辑,获取你的生信专属答疑

