大数跨境
0
0

【定量因变量机器学习】R语言05-梯度提升回归树(GBDT,Gradient Boosting DTs) 标准化代码

【定量因变量机器学习】R语言05-梯度提升回归树(GBDT,Gradient Boosting DTs) 标准化代码 医学统计数据分析
2025-11-29
1
导读:定量因变量机器学习05.梯度提升回归树(GBDT,Gradient Boosting Decision Tre




定量因变量机器学习



05.梯度提升回归树(GBDT,Gradient Boosting Decision Trees)


R语言教程(标准化代码)

01

概念、原理、思想、应用

概念:GBDT是一种迭代的决策树算法,通过逐步减小残差来改进模型。

原理:每一棵树学习的是之前所有树的残差(负梯度),通过不断添加树来减少残差,最终将多个弱学习器组合成强学习器。

思想:利用梯度下降法优化损失函数,每一步都学习一个新的树来拟合残差。

应用:广泛应用于各种回归和分类问题,如搜索排序、推荐系统等。

02

操作流程

-数据预处理:

-模型构建:

-训练:

-评估:

-可视化:

-保存结果:


03

代码及操作演示与功能解析

定量变量预测的机器学习模型可分为传统统计模型、树基集成模型、核方法和深度学习模型四大类,每类模型通过不同机制捕捉数据模式,适用于从线性到复杂非线性关系的预测任务。

代码涵盖了从数据准备到结果保存的自动化过程,包括数据预处理、模型配置、性能评估和报告生成。

# 设置工作目录和清理环境
rm(list = ls())
if (!is.null(dev.list())) dev.off()
setwd("C:/Users/hyy/Desktop/")
if (!dir.exists("Results-GBDT-ML")) dir.create("Results-GBDT-ML")

# 加载必要的包
if (!require(pacman)) install.packages("pacman")
pacman::p_load(
  readxl, dplyr, ggplot2, car, lmtest, ggpubr, corrplot, 
  performance, see, effects, sjPlot, report, officer, flextable,
  broom, gridExtra, patchwork, gbm, caret, vip, pdp,
  xgboost, pROC, MLmetrics, ModelMetrics, doParallel
)

# 设置中文字体
font_family <- "SimSun"

# 读取数据
data <- read_excel("示例数据.xlsx", sheet = "示例数据")

# 数据预处理
str(data)
summary(data)

# 处理分类变量
data$结局 <- as.factor(data$结局)
data$肥胖程度 <- as.factor(data$肥胖程度)
data$教育水平 <- as.factor(data$教育水平)
data$血型 <- as.factor(data$血型)
data$指标8 <- as.factor(data$指标8)

# 检查缺失值
sum(is.na(data))

# 机器学习数据划分
set.seed(123)
train_index <- createDataPartition(data$指标1, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

cat("训练集样本数:", nrow(train_data), "\n")
cat("测试集样本数:", nrow(test_data), "\n")

# 准备训练集和测试集的特征和目标变量
x_train <- train_data[, !names(train_data) %in% c("序号""指标1")]
y_train <- train_data$指标1
x_test <- test_data[, !names(test_data) %in% c("序号""指标1")]
y_test <- test_data$指标1

# 检查数据结构
cat("训练集自变量维度:", dim(x_train), "\n")
cat("训练集因变量长度:", length(y_train), "\n")
cat("测试集自变量维度:", dim(x_test), "\n")
cat("测试集因变量长度:", length(y_test), "\n")

# 设置并行计算以加速模型训练
cl <- makePSOCKcluster(detectCores() - 1)
registerDoParallel(cl)


  # 简化模型
  gbdt_model <- gbm(
    formula = 指标1 ~ . - 序号,
    data = train_data,
    distribution = "gaussian",
    n.trees = 500,
    interaction.depth = 2,
    shrinkage = 0.05,
    verbose = FALSE
  )
})

# 确定最优树的数量
best_iter <- gbm.perf(gbdt_model, method = "cv")
cat("最优树的数量:", best_iter, "\n")

# 使用caret包进行交叉验证调优(在训练集上)
set.seed(123)
train_control <- trainControl(
  method = "cv",
  number = 5,
  verboseIter = FALSE,
  savePredictions = TRUE,
  allowParallel = TRUE
)

# 调优参数网格
tune_grid <- expand.grid(
  n.trees = c(100, 200, 300),
  interaction.depth = c(2, 3, 4),
  shrinkage = c(0.01, 0.05, 0.1),
  n.minobsinnode = 10
)

# 训练调优模型
tryCatch({
  gbdt_caret <- train(
    x = x_train,
    y = y_train,
    method = "gbm",
    trControl = train_control,
    tuneGrid = tune_grid,
    verbose = FALSE,
    metric = "RMSE"
  )

  best_params <- gbdt_caret$bestTune
  cat("最佳参数:\n")
print(best_params)

}, error = function(e) {
  cat("Caret tuning failed:", e$message"\n")
  best_params <- list(
    n.trees = best_iter,
    interaction.depth = 3,
    shrinkage = 0.01,
    n.minobsinnode = 10
  )
  gbdt_caret <- NULL
})

# 使用最佳参数重新训练最终模型(在训练集上)
tryCatch({
  final_gbdt <- gbm(
    formula = 指标1 ~ . - 序号,
    data = train_data,
    distribution = "gaussian",
    n.trees = if(!is.null(gbdt_caret)) gbdt_caret$bestTune$n.trees else best_params$n.trees,
    interaction.depth = if(!is.null(gbdt_caret)) gbdt_caret$bestTune$interaction.depth else best_params$interaction.depth,
    shrinkage = if(!is.null(gbdt_caret)) gbdt_caret$bestTune$shrinkageelse best_params$shrinkage,
    n.minobsinnode = if(!is.null(gbdt_caret)) gbdt_caret$bestTune$n.minobsinnode else best_params$n.minobsinnode,
    cv.folds = 5,
    verbose = FALSE,
    n.cores = 1
  )
}, error = function(e) {
  cat("Final model training failed:", e$message"\n")
  final_gbdt <- gbdt_model
})

# 停止并行计算
stopCluster(cl)

# 模型预测 - 训练集和测试集
y_pred_train <- predict(final_gbdt, train_data, n.trees = gbm.perf(final_gbdt, method = "cv"))
y_pred_test <- predict(final_gbdt, test_data, n.trees = gbm.perf(final_gbdt, method = "cv"))

# 计算训练集性能指标
mse_train <- mean((y_train - y_pred_train)^2)
rmse_train <- sqrt(mse_train)
mae_train <- mean(abs(y_train - y_pred_train))
r_squared_train <- 1 - sum((y_train - y_pred_train)^2) / sum((y_train - mean(y_train))^2)

# 计算测试集性能指标
mse_test <- mean((y_test - y_pred_test)^2)
rmse_test <- sqrt(mse_test)
mae_test <- mean(abs(y_test - y_pred_test))
r_squared_test <- 1 - sum((y_test - y_pred_test)^2) / sum((y_test - mean(y_test))^2)

# 计算OOB改进(类似于随机森林的OOB误差)
best_tree <- gbm.perf(final_gbdt, method = "cv")
cv_error <- final_gbdt$cv.error[best_tree]

# 保存模型结果
model_performance <- data.frame(
  Dataset = c("Training""Training""Training""Training"
              "Testing""Testing""Testing""Testing""CV"),
  Metric = c("MSE""RMSE""MAE""R-squared"
             "MSE""RMSE""MAE""R-squared""CV-Error"),
  Value = c(mse_train, rmse_train, mae_train, r_squared_train,
            mse_test, rmse_test, mae_test, r_squared_test, cv_error)
)

# 变量重要性
importance_data <- summary(final_gbdt, plotit = FALSE)
var_importance <- data.frame(
  Variable = importance_data$var,
  RelativeInfluence = importance_data$rel.inf
) %>% arrange(desc(RelativeInfluence))

# 保存结果
write.csv(model_performance, "Results-GBDT-ML/GBDT模型性能_ML.csv", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(var_importance, "Results-GBDT-ML/GBDT变量重要性.csv", row.names = FALSE, fileEncoding = "UTF-8")

# 1. 变量重要性可视化
p_var_importance <- ggplot(var_importance, 
                           aes(x = RelativeInfluence, y = reorder(Variable, RelativeInfluence))) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  labs(title = "GBDT变量重要性 (相对影响力)",
       x = "相对影响力"
       y = "变量") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))


# 2. 训练集和测试集预测效果可视化
pred_actual_train <- data.frame(
  Dataset = "Training",
  Actual = y_train,
  Predicted = y_pred_train
)

pred_actual_test <- data.frame(
  Dataset = "Testing",
  Actual = y_test,
  Predicted = y_pred_test
)

pred_actual_combined <- rbind(pred_actual_train, pred_actual_test)

p_pred_vs_actual <- ggplot(pred_actual_combined, aes(x = Actual, y = Predicted, color = Dataset)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "预测值 vs 实际值 - 训练集 vs 测试集 (GBDT)",
       x = "实际值"
       y = "预测值") +
  theme_minimal() +
  scale_color_manual(values = c("Training" = "blue""Testing" = "green")) +
  theme(legend.position = "bottom")

# 3. 残差分析 - 训练集和测试集
residuals_train <- data.frame(
  Dataset = "Training",
  Fitted = y_pred_train,
  Residuals = y_train - y_pred_train
)

residuals_test <- data.frame(
  Dataset = "Testing",
  Fitted = y_pred_test,
  Residuals = y_test - y_pred_test
)

residuals_combined <- rbind(residuals_train, residuals_test)

p_residual <- ggplot(residuals_combined, aes(x = Fitted, y = Residuals, color = Dataset)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "GBDT回归残差图 - 训练集 vs 测试集"
       x = "拟合值"
       y = "残差") +
  theme_minimal() +
  scale_color_manual(values = c("Training" = "blue""Testing" = "green"))


# Q-Q图 - 训练集和测试集
p_qq_train <- ggplot(residuals_train, aes(sample = Residuals)) +
  stat_qq() + 
  stat_qq_line() +
  labs(title = "训练集残差Q-Q图 (GBDT)") +
  theme_minimal()

p_qq_test <- ggplot(residuals_test, aes(sample = Residuals)) +
  stat_qq() + 
  stat_qq_line() +
  labs(title = "测试集残差Q-Q图 (GBDT)") +
  theme_minimal()

# 残差分布 - 训练集和测试集
p_resid_hist_train <- ggplot(residuals_train, aes(x = Residuals)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightblue", alpha = 0.7) +
  geom_density(color = "blue") +
  labs(title = "训练集残差分布 (GBDT)"
       x = "残差"
       y = "密度") +
  theme_minimal()

p_resid_hist_test <- ggplot(residuals_test, aes(x = Residuals)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightgreen", alpha = 0.7) +
  geom_density(color = "darkgreen") +
  labs(title = "测试集残差分布 (GBDT)"
       x = "残差"
       y = "密度") +
  theme_minimal()

# 4. 训练误差随树的数量变化
train_error_data <- data.frame(
  Trees = 1:length(final_gbdt$train.error),
  Train_Error = final_gbdt$train.error,
  CV_Error = final_gbdt$cv.error
)

p_train_error <- ggplot(train_error_data, aes(x = Trees)) +
  geom_line(aes(y = Train_Error, color = "训练误差"), size = 1) +
  geom_line(aes(y = CV_Error, color = "交叉验证误差"), size = 1) +
  geom_vline(xintercept = best_tree, linetype = "dashed", color = "red") +
  labs(title = "GBDT训练误差和交叉验证误差",
       x = "树的数量",
       y = "误差") +
  theme_minimal() +
  scale_color_manual(values = c("训练误差" = "blue""交叉验证误差" = "orange")) +
  theme(legend.position = "bottom")

# 5. 参数调优结果可视化(如果caret调优成功)
if(!is.null(gbdt_caret)) {
  tune_results <- gbdt_caret$results
  p_tuning <- ggplot(tune_results, aes(x = n.trees, y = RMSE, color = as.factor(interaction.depth))) +
    geom_line(size = 1) +
    geom_point(size = 2) +
    labs(title = "GBDT参数调优结果",
         x = "树的数量",
         y = "交叉验证RMSE",
         color = "树深度") +
    theme_minimal() +
    theme(legend.position = "bottom")
else {
  p_tuning <- ggplot() + 
    labs(title = "参数调优结果不可用") +
    theme_minimal()
}

# 6. 部分依赖图 (Partial Dependence Plots) - 基于训练集
important_vars <- head(var_importance$Variable, 3)
pdp_plots <- list()

for (var in important_vars) {
  tryCatch({
    pdp_data <- plot(final_gbdt, i.var = var, return.grid = TRUE)
    colnames(pdp_data) <- c("x""y")
    
    p <- ggplot(pdp_data, aes(x = x, y = y)) +
      geom_line(color = "steelblue", size = 1) +
      labs(title = paste("部分依赖图:", var),
           x = var,
           y = "预测值") +
      theme_minimal()
    pdp_plots[[var]] <- p
  }, error = function(e) {
    cat("Partial dependence plot for", var, "failed:", e$message"\n")
  })
}

# 7. 模型比较:GBDT vs 线性回归(在测试集上)
lm_model <- lm(指标1 ~ . - 序号, data = train_data)
y_pred_lm_test <- predict(lm_model, test_data)

comparison_data <- data.frame(
  Actual = y_test,
  GBDT = y_pred_test,
  LinearRegression = y_pred_lm_test
)

p_comparison <- ggplot(comparison_data, aes(x = Actual)) +
  geom_point(aes(y = GBDT, color = "GBDT"), alpha = 0.6) +
  geom_point(aes(y = LinearRegression, color = "线性回归"), alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
  labs(title = "模型比较: GBDT vs 线性回归 (测试集)",
       x = "实际值",
       y = "预测值",
       color = "模型") +
  theme_minimal() +
  theme(legend.position = "bottom")


# 8. 性能对比图
performance_comparison <- data.frame(
  Metric = rep(c("RMSE""MAE""R-squared"), 2),
  Value = c(rmse_train, mae_train, r_squared_train, 
            rmse_test, mae_test, r_squared_test),
  Dataset = rep(c("Training""Testing"), each = 3)
)

p_performance <- ggplot(performance_comparison, aes(x = Metric, y = Value, fill = Dataset)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  labs(title = "模型性能比较: 训练集 vs 测试集 (GBDT)",
       x = "性能指标",
       y = "值") +
  theme_minimal() +
  scale_fill_manual(values = c("Training" = "steelblue""Testing" = "orange")) +
  theme(legend.position = "bottom")

# 9. 学习曲线分析(训练集大小 vs 性能)
learning_curve_data <- data.frame()
train_sizes <- seq(0.1, 0.9, by = 0.1)

for (size in train_sizes) {
  set.seed(123)
  small_train_index <- createDataPartition(train_data$指标1, p = size, list = FALSE)
  small_train <- train_data[small_train_index, ]

# 训练小模型
  small_gbdt <- gbm(
    formula = 指标1 ~ . - 序号,
    data = small_train,
    distribution = "gaussian",
    n.trees = 100,
    interaction.depth = 2,
    shrinkage = 0.05,
    verbose = FALSE
  )

# 训练集性能
  small_pred_train <- predict(small_gbdt, small_train, n.trees = 100)
  rmse_small_train <- sqrt(mean((small_train$指标1 - small_pred_train)^2))

# 测试集性能
  small_pred_test <- predict(small_gbdt, test_data, n.trees = 100)
  rmse_small_test <- sqrt(mean((test_data$指标1 - small_pred_test)^2))

  learning_curve_data <- rbind(learning_curve_data, 
                               data.frame(
                                 TrainSize = size * nrow(train_data),
                                 RMSE_Train = rmse_small_train,
                                 RMSE_Test = rmse_small_test
                               ))
}

p_learning_curve <- ggplot(learning_curve_data, aes(x = TrainSize)) +
  geom_line(aes(y = RMSE_Train, color = "Training"), size = 1) +
  geom_line(aes(y = RMSE_Test, color = "Testing"), size = 1) +
  labs(title = "学习曲线: 训练集大小 vs RMSE (GBDT)",
       x = "训练集大小",
       y = "RMSE",
       color = "数据集") +
  theme_minimal() +
  scale_color_manual(values = c("Training" = "blue""Testing" = "red"))

# 10. 模型稳定性分析 - 多次随机划分的性能分布
stability_results <- data.frame()
n_iterations <- 10

for (i in 1:n_iterations) {
  set.seed(100 + i)
  stability_index <- createDataPartition(data$指标1, p = 0.7, list = FALSE)
  stability_train <- data[stability_index, ]
  stability_test <- data[-stability_index, ]

  stability_gbdt <- gbm(
    formula = 指标1 ~ . - 序号,
    data = stability_train,
    distribution = "gaussian",
    n.trees = 100,
    interaction.depth = 2,
    shrinkage = 0.05,
    verbose = FALSE
  )

  stability_pred <- predict(stability_gbdt, stability_test, n.trees = 100)

  stability_rmse <- sqrt(mean((stability_test$指标1 - stability_pred)^2))
  stability_r2 <- 1 - sum((stability_test$指标1 - stability_pred)^2) / 
    sum((stability_test$指标1 - mean(stability_test$指标1))^2)

  stability_results <- rbind(stability_results,
                             data.frame(
                               Iteration = i,
                               RMSE = stability_rmse,
                               R_squared = stability_r2
                             ))
}

p_stability <- ggplot(stability_results, aes(x = Iteration, y = RMSE)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(color = "red", size = 2) +
  labs(title = "模型稳定性分析: 不同数据划分下的测试RMSE (GBDT)",
       x = "迭代次数",
       y = "测试集RMSE") +
  theme_minimal()

# 11. 特征交互作用分析(使用前2个最重要的变量)
if(nrow(var_importance) >= 2) {
  top_vars <- head(var_importance$Variable, 2)
  tryCatch({
    # GBDT包没有直接提供交互作用图,我们可以手动计算
    p_interaction <- plot(final_gbdt, i.var = top_vars, return.grid = TRUE)
    
    # 创建交互作用热图
    p_interaction_plot <- ggplot(p_interaction, aes_string(x = top_vars[1], y = top_vars[2], z = "y")) +
      geom_tile(aes(fill = y)) +
      scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = mean(p_interaction$y)) +
      labs(title = paste("特征交互作用:", paste(top_vars, collapse = " 和 ")),
           fill = "预测值") +
      theme_minimal()
  }, error = function(e) {
    cat("Interaction plot failed:", e$message"\n")
    p_interaction_plot <- NULL
  })
}

# 12. 模型诊断图 - 残差与重要特征的关系
if(nrow(var_importance) > 0) {
  top_var <- var_importance$Variable[1]
  diagnostic_data <- data.frame(
    Feature = train_data[[top_var]],
    Residuals = residuals_train$Residuals
  )

  p_diagnostic <- ggplot(diagnostic_data, aes(x = Feature, y = Residuals)) +
    geom_point(alpha = 0.6, color = "steelblue") +
    geom_smooth(method = "loess", color = "red", se = TRUE) +
    labs(title = paste("残差与最重要特征", top_var, "的关系"),
         x = top_var,
         y = "残差") +
    theme_minimal()
else {
  p_diagnostic <- ggplot() + 
    labs(title = "诊断图不可用") +
    theme_minimal()
}

# 保存所有图形
# 基本图形
ggsave("Results-GBDT-ML/variable_importance.jpg", p_var_importance, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/predicted_vs_actual.jpg", p_pred_vs_actual, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/residual_plot.jpg", p_residual, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/qq_plot_train.jpg", p_qq_train, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/qq_plot_test.jpg", p_qq_test, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/residual_distribution_train.jpg", p_resid_hist_train, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/residual_distribution_test.jpg", p_resid_hist_test, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/training_error.jpg", p_train_error, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/performance_comparison.jpg", p_performance, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/learning_curve.jpg", p_learning_curve, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/model_stability.jpg", p_stability, 
       width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-GBDT-ML/model_diagnostic.jpg", p_diagnostic, 
       width = 10, height = 8, units = "in", dpi = 300)

if(exists("p_tuning")) {
  ggsave("Results-GBDT-ML/parameter_tuning.jpg", p_tuning, 
         width = 10, height = 8, units = "in", dpi = 300)
}

ggsave("Results-GBDT-ML/model_comparison.jpg", p_comparison, 
       width = 10, height = 8, units = "in", dpi = 300)

# 保存部分依赖图
for (var in names(pdp_plots)) {
  ggsave(paste0("Results-GBDT-ML/partial_dependence_", var, ".jpg"), 
         pdp_plots[[var]], width = 10, height = 8, units = "in", dpi = 300)
}

# 保存交互作用图
if(exists("p_interaction_plot") && !is.null(p_interaction_plot)) {
  ggsave("Results-GBDT-ML/feature_interaction.jpg", p_interaction_plot, 
         width = 10, height = 8, units = "in", dpi = 300)
}

# 生成详细的机器学习评估报告
tryCatch({
  doc <- read_docx()

# 标题
  doc <- doc %>% 
    body_add_par("梯度提升回归 (GBDT) - 机器学习分析报告", style = "heading 1") %>%
    body_add_par(paste("生成日期:", Sys.Date()), style = "Normal") %>%
    body_add_par("", style = "Normal")

# 数据概述
  doc <- doc %>% 
    body_add_par("数据概述", style = "heading 2") %>%
    body_add_par(paste("总样本量:", nrow(data)), style = "Normal") %>%
    body_add_par(paste("训练集样本量:", nrow(train_data)), style = "Normal") %>%
    body_add_par(paste("测试集样本量:", nrow(test_data)), style = "Normal") %>%
    body_add_par(paste("变量数量:", ncol(data)), style = "Normal") %>%
    body_add_par("", style = "Normal")

# 模型参数
  doc <- doc %>% 
    body_add_par("模型参数", style = "heading 2") %>%
    body_add_par(paste("最优树的数量:", best_tree), style = "Normal") %>%
    body_add_par(paste("树深度:"if(!is.null(gbdt_caret)) gbdt_caret$bestTune$interaction.depth else best_params$interaction.depth), style = "Normal") %>%
    body_add_par(paste("学习率:"if(!is.null(gbdt_caret)) gbdt_caret$bestTune$shrinkageelse best_params$shrinkage), style = "Normal") %>%
    body_add_par(paste("叶节点最小样本数:"if(!is.null(gbdt_caret)) gbdt_caret$bestTune$n.minobsinnode else best_params$n.minobsinnode), style = "Normal") %>%
    body_add_par("", style = "Normal")

# 模型性能摘要
  doc <- doc %>% 
    body_add_par("模型性能摘要", style = "heading 2")

  perf_table <- model_performance %>%
    flextable() %>%
    theme_zebra() %>%
    autofit()

  doc <- doc %>% 
    body_add_flextable(perf_table) %>%
    body_add_par("", style = "Normal")

# 过拟合分析
  overfitting_gap <- rmse_test - rmse_train
  doc <- doc %>% 
    body_add_par("过拟合分析", style = "heading 2") %>%
    body_add_par(paste("RMSE差距 (测试集 - 训练集):", round(overfitting_gap, 4)), style = "Normal") %>%
    body_add_par(ifelse(overfitting_gap > 0.1, 
                        "警告: 检测到潜在的过拟合 (训练集和测试集性能差距较大)",
                        "良好: 模型泛化能力良好 (训练集和测试集性能差距较小)"), 
                 style = "Normal") %>%
    body_add_par("", style = "Normal")

# 变量重要性
  doc <- doc %>% 
    body_add_par("变量重要性", style = "heading 2")

  importance_ft <- var_importance %>%
    flextable() %>%
    theme_zebra() %>%
    autofit()

  doc <- doc %>% 
    body_add_flextable(importance_ft) %>%
    body_add_par("", style = "Normal")

# 稳定性分析
  stability_summary <- data.frame(
    Metric = c("平均测试RMSE""测试RMSE标准差""平均测试R平方""测试R平方标准差"),
    Value = c(mean(stability_results$RMSE), sd(stability_results$RMSE),
              mean(stability_results$R_squared), sd(stability_results$R_squared))
  )

  doc <- doc %>% 
    body_add_par("模型稳定性分析", style = "heading 2")

  stability_ft <- stability_summary %>%
    flextable() %>%
    theme_zebra() %>%
    autofit()

  doc <- doc %>% 
    body_add_flextable(stability_ft) %>%
    body_add_par("", style = "Normal")

# 添加图片到报告
  doc <- doc %>% 
    body_add_par("可视化结果", style = "heading 2") %>%
    body_add_par("变量重要性图:", style = "Normal") %>%
    body_add_img("Results-GBDT-ML/variable_importance.jpg", width = 6, height = 5) %>%
    body_add_par("性能比较图:", style = "Normal") %>%
    body_add_img("Results-GBDT-ML/performance_comparison.jpg", width = 6, height = 5) %>%
    body_add_par("学习曲线:", style = "Normal") %>%
    body_add_img("Results-GBDT-ML/learning_curve.jpg", width = 6, height = 5) %>%
    body_add_par("模型稳定性:", style = "Normal") %>%
    body_add_img("Results-GBDT-ML/model_stability.jpg", width = 6, height = 5) %>%
    body_add_par("预测值 vs 实际值:", style = "Normal") %>%
    body_add_img("Results-GBDT-ML/predicted_vs_actual.jpg", width = 6, height = 5) %>%
    body_add_par("训练误差图:", style = "Normal") %>%
    body_add_img("Results-GBDT-ML/training_error.jpg", width = 6, height = 5) %>%
    body_add_par("模型诊断图:", style = "Normal") %>%
    body_add_img("Results-GBDT-ML/model_diagnostic.jpg", width = 6, height = 5)

# 添加部分依赖图
if(length(pdp_plots) > 0) {
    doc <- doc %>% 
      body_add_par("部分依赖图:", style = "Normal")
    
    for (var in names(pdp_plots)[1:min(2, length(pdp_plots))]) {
      doc <- doc %>% 
        body_add_img(paste0("Results-GBDT-ML/partial_dependence_", var, ".jpg"), width = 6, height = 5)
    }
  }

# 添加交互作用图
if(exists("p_interaction_plot") && !is.null(p_interaction_plot)) {
    doc <- doc %>% 
      body_add_par("特征交互作用图:", style = "Normal") %>%
      body_add_img("Results-GBDT-ML/feature_interaction.jpg", width = 6, height = 5)
  }

# 结论和建议
  doc <- doc %>% 
    body_add_par("结论和建议", style = "heading 2") %>%
    body_add_par("基于梯度提升回归 (GBDT) 的机器学习分析:", style = "Normal") %>%
    body_add_par(paste("- 训练集R平方:", round(r_squared_train * 100, 2), "%"), style = "Normal") %>%
    body_add_par(paste("- 测试集R平方:", round(r_squared_test * 100, 2), "%"), style = "Normal") %>%
    body_add_par(paste("- 模型泛化差距:", round(overfitting_gap, 4)), style = "Normal") %>%
    body_add_par(paste("- 最重要的变量:", paste(head(var_importance$Variable, 3), collapse = ", ")), style = "Normal") %>%
    body_add_par(paste("- 模型稳定性 (RMSE标准差):", round(sd(stability_results$RMSE), 4)), style = "Normal") %>%
    body_add_par(paste("- 最优树的数量:", best_tree), style = "Normal") %>%
    body_add_par("", style = "Normal") %>%
    body_add_par("GBDT模型特点:", style = "Normal") %>%
    body_add_par("- 通过梯度提升逐步减少残差,构建强预测模型", style = "Normal") %>%
    body_add_par("- 能够处理复杂的非线性关系和特征交互", style = "Normal") %>%
    body_add_par("- 对异常值相对稳健,但需要仔细调参", style = "Normal") %>%
    body_add_par("", style = "Normal") %>%
    body_add_par("建议:", style = "Normal") %>%
    body_add_par("1. 如果检测到过拟合,降低树深度或增加正则化", style = "Normal") %>%
    body_add_par("2. 考虑使用XGBoost或LightGBM等更高效的梯度提升实现", style = "Normal") %>%
    body_add_par("3. 进行更精细的超参数调优以获得最佳性能", style = "Normal") %>%
    body_add_par("4. 监控模型性能并根据新数据定期重新训练", style = "Normal")

# 保存Word文档
print(doc, target = "Results-GBDT-ML/GBDT机器学习分析报告.docx")
}, error = function(e) {
  cat("Error generating Word report:", e$message"\n")
})

# 保存R工作环境
save.image("Results-GBDT-ML/GBDT机器学习分析.RData")

# 保存模型对象
saveRDS(final_gbdt, "Results-GBDT-ML/gbdt_model.rds")
if(!is.null(gbdt_caret)) {
  saveRDS(gbdt_caret, "Results-GBDT-ML/tuned_gbdt_model.rds")
}

# 输出完成信息
cat("GBDT机器学习分析完成!\n")
cat("结果已保存到 Results-GBDT-ML 文件夹:\n")
cat("- 模型性能比较 (训练集 vs 测试集)\n")
cat("- 综合可视化包括学习曲线和稳定性分析\n")
cat("- 详细的Word分析报告\n")
cat("- R工作环境文件和模型对象\n")





 5. 梯度提升回归树(GBDT,Gradient Boosting Decision Trees)

 概念:串行集成方法,每棵树学习前序树的残差。

 原理

- 梯度下降:在函数空间进行优化

- 加法模型:$F_m(x) = F_{m-1}(x) + \nu h_m(x)$

- 损失函数:可微损失函数(如平方损失)

 思想:逐步改进模型,每步专注于当前模型表现最差的部分。

 应用

- 搜索排序

- 推荐系统

- 风险建模




医学统计数据分析分享交流SPSS、R语言、Python、ArcGis、Geoda、GraphPad、数据分析图表制作等心得。承接数据分析,论文返修,医学统计,机器学习,生存分析,空间分析,问卷分析业务。若有投稿和数据分析代做需求,可以直接联系我,谢谢!



!!!可加我粉丝群!!!

“医学统计数据分析”公众号右下角;

找到“联系作者”,

可加我微信,邀请入粉丝群!

【医学统计数据分析】工作室“粉丝群”

01

【临床】粉丝群

有临床流行病学数据分析

如(t检验、方差分析、χ2检验、logistic回归)、

(重复测量方差分析与配对T检验、ROC曲线)、

(非参数检验、生存分析、样本含量估计)、

(筛检试验:灵敏度、特异度、约登指数等计算)、

(绘制柱状图、散点图、小提琴图、列线图等)、

机器学习、深度学习、生存分析

等需求的同仁们,加入【临床】粉丝群

02

【公卫】粉丝群

疾控,公卫岗位的同仁,可以加一下【公卫】粉丝群,分享生态学研究、空间分析、时间序列、监测数据分析、时空面板技巧等工作科研自动化内容。

03

【生信】粉丝群

有实验室数据分析需求的同仁们,可以加入【生信】粉丝群,交流NCBI(基因序列)、UniProt(蛋白质)、KEGG(通路)、GEO(公共数据集)等公共数据库、基因组学转录组学蛋白组学代谢组学表型组学等数据分析和可视化内容。



或者可扫码直接加微信进群!!!





精品视频课程-“医学统计数据分析”视频号付费合集

“医学统计数据分析”视频号-付费合集兑换相应课程后,获取课程理论课PPT、代码、基础数据等相关资料,请大家在【医学统计数据分析】公众号右下角,找到“联系作者”,加我微信后打包发送。感谢您的支持!!



【声明】内容源于网络
0
0
医学统计数据分析
分享交流SPSS、R语言、Python、ArcGis、Geoda、GraphPad、数据分析图表制作等心得。承接数据分析,论文返修,医学统计,空间分析,机器学习,生存分析,时间序列,时空面板,深度学习,问卷分析等业务。公众号右下角可联系作者
内容 323
粉丝 0
医学统计数据分析 分享交流SPSS、R语言、Python、ArcGis、Geoda、GraphPad、数据分析图表制作等心得。承接数据分析,论文返修,医学统计,空间分析,机器学习,生存分析,时间序列,时空面板,深度学习,问卷分析等业务。公众号右下角可联系作者
总阅读32
粉丝0
内容323