欢迎关注R语言数据分析指南
❝在上一节中,我们结合Nature Microbiology论文中的代码框架介绍了GSEA(基因集富集分析)。此代码中还有一段值得学习的代码即按基因聚类模块进行KEGG功能富集分析,下面小编结合自己得到的数据来进行演示,因此富集结果会与原文有所差异,个观点仅供参考。更完整的背景和生物学解释请参阅原论文的详细描述。
论文信息
Enteric bacterial infection stimulates remodelling of bile metabolites to promote intestinal homeostasis
论文图
论文代码
https://github.com/yh766/NMICROBIOL_Bile_2024
library(DESeq2)
library(tidyverse)
library(tximport)
#BiocManager::install("apeglm")
library(apeglm)
library(cluster)
library(AnnotationDbi)
library(org.Mm.eg.db)
library(clusterProfiler)
library(RColorBrewer)
library(pals)
library(ggtext)
❝
DESeq2 结果清洗 基因名注释 Z-score PAM 模块 compareCluster pathway筛选 绘图
DESeq2 结果清洗
## 读取 tx2gene
tx2gene <- read_tsv("tx2gene.tsv") %>%
distinct(gene_id, gene_name)
# 差异分析结果
list.srnk <- list(lm.srnk, cr.pos.srnk)
names(list.srnk) <- c("lm.srnk", "cr.pos.srnk")
# 清洗循环
for (i in seq_along(list.srnk)) {
df <- list.srnk[[i]] %>% as.data.frame() %>%
tibble::rownames_to_column("gene_id")
# 注释 gene_name
df <- df %>% left_join(tx2gene, by = c("gene_id"))
# 列顺序:geneid, gene_name, 其他 DESeq2 统计列
df <- df[, c("gene_id", "gene_name",
setdiff(colnames(df), c("gene_id", "gene_name")))]
# 给统计列加后缀
colnames(df)[-(1:2)] <- paste0(colnames(df)[-(1:2)],".",
names(list.srnk)[i])
list.srnk[[i]] <- df
}
## 合并
table.srnk <- reduce(list.srnk, left_join,
by = c("gene_id", "gene_name"))
table.srnk <- list.srnk %>% purrr::reduce(
left_join, by = c("gene_id","gene_name"))
table.srnk <- table.srnk %>%
mutate(class=case_when(padj.lm.srnk < 0.05 &
padj.cr.pos.srnk < 0.05 ~ "Both",
padj.lm.srnk < 0.05 & padj.cr.pos.srnk > 0.05 ~ "Lm DEGs",
padj.lm.srnk > 0.05 & padj.cr.pos.srnk < 0.05 ~ "Cr DEGs",
TRUE ~ "not DEGs"))
3.Z-score
vsd <- vst(dds, blind=FALSE)
assay(vsd) -> counts.nom
data.frame(counts.nom) -> counts.nom
t(scale(t(counts.nom))) -> counts.nom.z
data.frame(counts.nom.z) -> counts.nom.z
table.sub <- subset(table.srnk, padj.lm.srnk < 0.05 | padj.cr.pos.srnk < 0.05)
table.sub <- subset(table.sub, abs(log2FoldChange.lm.srnk) > 1 |
abs(log2FoldChange.cr.pos.srnk) > 1)
genes <- table.sub$gene_id
data <- subset(counts.nom.z, rownames(counts.nom.z) %in% genes) %>%
rownames_to_column(var="gene_id")
4. PAM 模块
data$Pam <- factor(paste0("c", pam(data, 7)$clustering))
# reorder clusters
data$Pam <- factor(paste0("c", pam(data, 7)$clustering),
levels=c("c1","c5","c4","c3","c7","c6","c2"))
set.seed(1234)
clusters <- rev(levels(data$Pam))
genelist <- list()
for(i in1:length(clusters)){
table <- subset(data, Pam==clusters[i])
table$entrez = mapIds(org.Mm.eg.db, keys=gsub("\\.[0-9]*","",table$gene_id),
column="ENTREZID", keytype="ENSEMBL", multiVals="first")
genelist[[i]] <- na.omit(table$entrez)
}
names(genelist) <- clusters
compareCluster
cc.kegg <- compareCluster(geneCluster = genelist, fun = "enrichKEGG",
organism="mmu", keyType = "ncbi-geneid",
pAdjustMethod = "fdr", qvalueCutoff=0.2,
minGSSize = 10, maxGSSize = 500)
cc.kegg <- setReadable(cc.kegg, OrgDb = org.Mm.eg.db, keyType="ENTREZID")
cc.kegg.result <- cc.kegg@compareClusterResult
pathway筛选
nm_kegg_list <- c(
"Antifolate resistance",
"Fructose and mannose metabolism",
"Biosynthesis of nucleotide sugars",
"Galactose metabolism",
"Purine metabolism",
"Metabolism of xenobiotics by cytochrome P450",
"Steroid hormone biosynthesis",
"Drug metabolism - other enzymes",
"Retinol metabolism",
"Drug metabolism - cytochrome P450",
"Linoleic acid metabolism",
"Glutathione metabolism",
"Arachidonic acid metabolism",
"Nitrogen metabolism",
"Alanine, aspartate and glutamate metabolism",
"Glycine, serine and threonine metabolism",
"Pentose and glucuronate interconversions",
"Fatty acid elongation",
"Fatty acid degradation",
"Tryptophan metabolism",
"Ascorbate and aldarate metabolism",
"Biosynthesis of unsaturated fatty acids",
"Selenocompound metabolism",
"Porphyrin metabolism")
filtered_kegg <- cc.kegg.result %>%
filter(Description %in% nm_kegg_list)
top_kegg <- filtered_kegg %>%
dplyr::group_by(Cluster) %>%
dplyr::arrange(p.adjust) %>%
dplyr::ungroup() %>%
mutate(Count_num = as.numeric(sub("/.*", "", GeneRatio)),
Bg_num = as.numeric(sub(".*/", "", GeneRatio))) %>%
mutate(GeneRatio = Count_num / Bg_num,
Cluster=paste0(Cluster,"<br>","(",Bg_num,")"))
绘图
ggplot(top_kegg,aes(x = Cluster,y = Description,
size = GeneRatio,color = p.adjust)) +
geom_point(alpha = 0.9) +
scale_color_gradientn(
colours = rev(RColorBrewer::brewer.pal(9, "Blues")),
limits = c(0,0.2)) +
theme_bw() +
theme(
axis.title = element_blank(),
axis.text.y = element_text(size = 10,color="black"),
axis.text.x = element_markdown(size = 10,color="black"),
legend.title = element_text(size = 12),
legend.text = element_text(size = 10))

