Concept: Random forest is an ensemble learning algorithm that improves generalization by building many decision trees and combining their predictions.
Principle: Bootstrap sampling generates multiple training subsets, each of which trains one decision tree; at each split, only a random subset of features is considered. For regression, the final prediction is the average of all trees' predictions.
Idea: Averaging over many trees reduces the variance of any single tree, improving the model's stability and accuracy.
Applications: A wide range of regression problems, especially high-dimensional data and complex nonlinear relationships.
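A quick way to see the averaging at work (a minimal sketch, assuming the randomForest package and R's built-in mtcars data; for regression, predict() with predict.all = TRUE exposes the per-tree predictions):
library(randomForest)
set.seed(123)
rf_demo <- randomForest(mpg ~ ., data = mtcars, ntree = 200)
p <- predict(rf_demo, mtcars, predict.all = TRUE)
# The aggregate prediction is exactly the mean over the individual trees
all.equal(p$aggregate, rowMeans(p$individual))  # TRUE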
- Data preprocessing: read the data, convert categorical variables to factors, check for missing values
- Model construction: split the data into training and testing sets and tune mtry by cross-validation
- Training: fit the final random forest with the best parameters
- Evaluation: training/testing metrics, OOB error, and variable importance
- Visualization: importance, predicted vs. actual, residuals, learning curve, and stability plots
- Saving results: CSV tables, figures, a Word report, and the model objects
Machine learning models for predicting quantitative outcomes fall into four broad families: traditional statistical models, tree-based ensembles, kernel methods, and deep learning models. Each family captures data patterns through a different mechanism and suits prediction tasks ranging from linear to complex nonlinear relationships.
The code below automates the workflow from data preparation to saving results, covering data preprocessing, model configuration, performance evaluation, and report generation.
# Set the working directory and clean the environment
rm(list = ls())
if (!is.null(dev.list())) dev.off()
setwd("C:/Users/hyy/Desktop/")
if (!dir.exists("Results-RandomForest-ML")) dir.create("Results-RandomForest-ML")
# Load the required packages
if (!require(pacman)) install.packages("pacman")
pacman::p_load(
readxl, dplyr, ggplot2, car, lmtest, ggpubr, corrplot,
performance, see, effects, sjPlot, report, officer, flextable,
broom, gridExtra, patchwork, randomForest, caret, vip, pdp,
randomForestExplainer, DALEX, pROC, MLmetrics, ModelMetrics
)
# Set a Chinese font (fixes Chinese text rendering in PDF output)
font_family <- "SimSun"  # SimSun; adjust to your system, e.g. "STKaiti" (KaiTi)
# Read the data
data <- read_excel("示例数据.xlsx", sheet = "示例数据")
# Data preprocessing
str(data)
summary(data)
# Convert categorical variables to factors
data$结局 <- as.factor(data$结局)
data$肥胖程度 <- as.factor(data$肥胖程度)
data$教育水平 <- as.factor(data$教育水平)
data$血型 <- as.factor(data$血型)
data$指标8 <- as.factor(data$指标8)
# Check for missing values
sum(is.na(data))
# Train/test split for machine learning
set.seed(123)
train_index <- createDataPartition(data$指标1, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
cat("训练集样本数:", nrow(train_data), "\n")
cat("测试集样本数:", nrow(test_data), "\n")
# Separate features and target for the training and testing sets
x_train <- train_data[, !names(train_data) %in% c("序号", "指标1")]
y_train <- train_data$指标1
x_test <- test_data[, !names(test_data) %in% c("序号", "指标1")]
y_test <- test_data$指标1
# Check the data structure
cat("Training feature dimensions:", dim(x_train), "\n")
cat("Training target length:", length(y_train), "\n")
cat("Testing feature dimensions:", dim(x_test), "\n")
cat("Testing target length:", length(y_test), "\n")
# Cross-validation tuning with caret (on the training set)
set.seed(123)
train_control <- trainControl(
method = "cv",
number = 5,
verboseIter = FALSE,
savePredictions = TRUE
)
# Tuning grid
tune_grid <- expand.grid(
mtry = c(2, 4, 6, 8)
)
# Tune the model; assign the tryCatch() result so the fallback survives
# outside the error handler (a plain <- inside the handler only assigns
# within the handler's own environment)
rf_caret <- tryCatch(
  train(
    x = x_train,
    y = y_train,
    method = "rf",
    trControl = train_control,
    tuneGrid = tune_grid,
    ntree = 100,
    importance = TRUE
  ),
  error = function(e) {
    cat("Caret tuning failed:", e$message, "\n")
    NULL
  }
)
# Fall back to a rule-of-thumb mtry if tuning failed
# (note: randomForest's own default for regression is floor(p / 3))
best_mtry <- if (!is.null(rf_caret)) rf_caret$bestTune$mtry else floor(sqrt(ncol(x_train)))
cat("Best mtry:", best_mtry, "\n")
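As an alternative to the caret grid search, randomForest's own tuneRF() does a quick stepwise search over mtry guided by OOB error rather than cross-validation (a sketch reusing x_train and y_train from above; with doBest = FALSE it just reports the OOB error for each mtry tried):
tuned <- tuneRF(x_train, y_train, ntreeTry = 100,
                stepFactor = 1.5, improve = 0.01, doBest = FALSE)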
# Refit the final model on the training set with the best parameters;
# again assign the tryCatch() result (the original fallback referenced
# an undefined object, rf_model)
final_rf <- tryCatch(
  randomForest(
    x = x_train,
    y = y_train,
    ntree = 300,
    mtry = best_mtry,
    importance = TRUE,
    proximity = TRUE
  ),
  error = function(e) {
    cat("Final model training failed:", e$message, "\n")
    NULL
  }
)
if (is.null(final_rf)) stop("Random forest training failed; cannot continue.")
# Model predictions on the training and testing sets
y_pred_train <- predict(final_rf, x_train)
y_pred_test <- predict(final_rf, x_test)
# Training-set performance metrics
mse_train <- mean((y_train - y_pred_train)^2)
rmse_train <- sqrt(mse_train)
mae_train <- mean(abs(y_train - y_pred_train))
r_squared_train <- 1 - sum((y_train - y_pred_train)^2) / sum((y_train - mean(y_train))^2)
# Testing-set performance metrics
mse_test <- mean((y_test - y_pred_test)^2)
rmse_test <- sqrt(mse_test)
mae_test <- mean(abs(y_test - y_pred_test))
r_squared_test <- 1 - sum((y_test - y_pred_test)^2) / sum((y_test - mean(y_test))^2)
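As a cross-check, caret's postResample() returns the same three test metrics in one call (note that caret defines R-squared as the squared correlation between predictions and observations, so it can differ slightly from the 1 - SSres/SStot form used above):
postResample(pred = y_pred_test, obs = y_test)  # named vector: RMSE, Rsquared, MAE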
# OOB error (out-of-bag MSE after the last tree)
oob_error <- tail(final_rf$mse, 1)
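The same value can be recovered by hand, since final_rf$predicted stores each training observation's out-of-bag prediction (averaged only over trees whose bootstrap sample excluded it):
# OOB MSE computed directly from the stored OOB predictions
mean((y_train - final_rf$predicted)^2)  # matches tail(final_rf$mse, 1)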
# Collect model performance
model_performance <- data.frame(
Dataset = c("Training", "Training", "Training", "Training",
"Testing", "Testing", "Testing", "Testing", "OOB"),
Metric = c("MSE", "RMSE", "MAE", "R-squared",
"MSE", "RMSE", "MAE", "R-squared", "OOB-MSE"),
Value = c(mse_train, rmse_train, mae_train, r_squared_train,
mse_test, rmse_test, mae_test, r_squared_test, oob_error)
)
# Variable importance
importance_data <- importance(final_rf)
var_importance <- data.frame(
Variable = rownames(importance_data),
IncNodePurity = importance_data[, "IncNodePurity"],
IncMSE = if("%IncMSE" %in% colnames(importance_data)) importance_data[, "%IncMSE"] else NA
) %>% arrange(desc(IncNodePurity))
# Save results
write.csv(model_performance, "Results-RandomForest-ML/随机森林模型性能_ML.csv", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(var_importance, "Results-RandomForest-ML/随机森林变量重要性.csv", row.names = FALSE, fileEncoding = "UTF-8")
# 1. Variable importance plot
p_var_importance <- ggplot(var_importance,
aes(x = IncNodePurity, y = reorder(Variable, IncNodePurity))) +
geom_col(fill = "steelblue", alpha = 0.8) +
labs(title = "随机森林变量重要性 (节点纯度)",
x = "节点纯度增加量",
y = "变量") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
# 2. Predicted vs. actual for the training and testing sets
pred_actual_train <- data.frame(
Dataset = "Training",
Actual = y_train,
Predicted = y_pred_train
)
pred_actual_test <- data.frame(
Dataset = "Testing",
Actual = y_test,
Predicted = y_pred_test
)
pred_actual_combined <- rbind(pred_actual_train, pred_actual_test)
p_pred_vs_actual <- ggplot(pred_actual_combined, aes(x = Actual, y = Predicted, color = Dataset)) +
geom_point(alpha = 0.6) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(title = "预测值 vs 实际值 - 训练集 vs 测试集",
x = "实际值",
y = "预测值") +
theme_minimal() +
scale_color_manual(values = c("Training" = "blue", "Testing" = "green")) +
theme(legend.position = "bottom")
# 3. Residual analysis - training and testing sets
residuals_train <- data.frame(
Dataset = "Training",
Fitted = y_pred_train,
Residuals = y_train - y_pred_train
)
residuals_test <- data.frame(
Dataset = "Testing",
Fitted = y_pred_test,
Residuals = y_test - y_pred_test
)
residuals_combined <- rbind(residuals_train, residuals_test)
p_residual <- ggplot(residuals_combined, aes(x = Fitted, y = Residuals, color = Dataset)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "随机森林回归残差图 - 训练集 vs 测试集",
x = "拟合值",
y = "残差") +
theme_minimal() +
scale_color_manual(values = c("Training" = "blue", "Testing" = "green"))
# Q-Q plots - training and testing sets
p_qq_train <- ggplot(residuals_train, aes(sample = Residuals)) +
stat_qq() +
stat_qq_line() +
labs(title = "训练集残差Q-Q图") +
theme_minimal()
p_qq_test <- ggplot(residuals_test, aes(sample = Residuals)) +
stat_qq() +
stat_qq_line() +
labs(title = "测试集残差Q-Q图") +
theme_minimal()
# Residual distributions - training and testing sets
p_resid_hist_train <- ggplot(residuals_train, aes(x = Residuals)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightblue", alpha = 0.7) +
geom_density(color = "blue") +
labs(title = "训练集残差分布",
x = "残差",
y = "密度") +
theme_minimal()
p_resid_hist_test <- ggplot(residuals_test, aes(x = Residuals)) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightgreen", alpha = 0.7) +
geom_density(color = "darkgreen") +
labs(title = "测试集残差分布",
x = "残差",
y = "密度") +
theme_minimal()
# 4. OOB error versus the number of trees
oob_error_data <- data.frame(
Trees = 1:final_rf$ntree,
OOB_Error = final_rf$mse
)
p_oob_error <- ggplot(oob_error_data, aes(x = Trees, y = OOB_Error)) +
geom_line(color = "steelblue", size = 1) +
labs(title = "OOB误差随树的数量变化",
x = "树的数量",
y = "OOB均方误差") +
theme_minimal()
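If a specific forest size is needed, the stored OOB trace also gives the number of trees at which the OOB error bottoms out (a one-line sketch):
which.min(final_rf$mse)  # number of trees with the lowest OOB MSE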
# 5. Tuning results (if caret tuning succeeded)
if(!is.null(rf_caret)) {
tune_results <- rf_caret$results
p_tuning <- ggplot(tune_results, aes(x = mtry, y = RMSE)) +
geom_line(color = "steelblue", size = 1) +
geom_point(color = "red", size = 3) +
geom_vline(xintercept = best_mtry, linetype = "dashed", color = "darkgreen") +
labs(title = "随机森林参数调优结果",
x = "mtry (每棵树使用的变量数)",
y = "交叉验证RMSE") +
theme_minimal()
} else {
p_tuning <- ggplot() +
labs(title = "参数调优结果不可用") +
theme_minimal()
}
# 6. Partial dependence plots (based on the training set)
important_vars <- head(var_importance$Variable, 3)
pdp_plots <- list()
for (var in important_vars) {
tryCatch({
pdp_data <- partial(final_rf, pred.var = var, train = x_train, plot = FALSE)
p <- autoplot(pdp_data) +
labs(title = paste("部分依赖图:", var),
x = var,
y = "预测值") +
theme_minimal()
pdp_plots[[var]] <- p
}, error = function(e) {
cat("Partial dependence plot for", var, "failed:", e$message, "\n")
})
}
# 7. Model comparison: random forest vs. linear regression (on the test set)
lm_model <- lm(指标1 ~ . - 序号, data = train_data)
y_pred_lm_test <- predict(lm_model, test_data)
comparison_data <- data.frame(
Actual = y_test,
RandomForest = y_pred_test,
LinearRegression = y_pred_lm_test
)
p_comparison <- ggplot(comparison_data, aes(x = Actual)) +
geom_point(aes(y = RandomForest, color = "Random Forest"), alpha = 0.6) +
geom_point(aes(y = LinearRegression, color = "Linear Regression"), alpha = 0.6) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "black") +
labs(title = "模型比较: 随机森林 vs 线性回归 (测试集)",
x = "实际值",
y = "预测值",
color = "模型") +
theme_minimal() +
theme(legend.position = "bottom")
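The comparison is easier to judge with a number next to the plot; a short sketch that computes the linear model's test RMSE from the predictions above:
# Test RMSE of the linear model, for a direct numeric comparison
rmse_lm_test <- sqrt(mean((y_test - y_pred_lm_test)^2))
cat("Test RMSE - random forest:", round(rmse_test, 4),
    "| linear regression:", round(rmse_lm_test, 4), "\n")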
# 8. Performance comparison chart
performance_comparison <- data.frame(
Metric = rep(c("RMSE", "MAE", "R-squared"), 2),
Value = c(rmse_train, mae_train, r_squared_train,
rmse_test, mae_test, r_squared_test),
Dataset = rep(c("Training", "Testing"), each = 3)
)
p_performance <- ggplot(performance_comparison, aes(x = Metric, y = Value, fill = Dataset)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
labs(title = "模型性能比较: 训练集 vs 测试集",
x = "性能指标",
y = "值") +
theme_minimal() +
scale_fill_manual(values = c("Training" = "steelblue", "Testing" = "orange")) +
theme(legend.position = "bottom")
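One caveat: RMSE/MAE and R-squared live on different scales, so a single y-axis can mask differences. A faceted variant (a sketch reusing performance_comparison) gives each metric its own axis:
p_performance_facet <- ggplot(performance_comparison, aes(x = Dataset, y = Value, fill = Dataset)) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ Metric, scales = "free_y") +
  theme_minimal()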
# 9. Learning curve analysis (training-set size vs. performance)
learning_curve_data <- data.frame()
train_sizes <- seq(0.1, 0.9, by = 0.1)
for (size in train_sizes) {
set.seed(123)
small_train_index <- createDataPartition(train_data$指标1, p = size, list = FALSE)
small_train <- train_data[small_train_index, ]
x_small <- small_train[, !names(small_train) %in% c("序号", "指标1")]
y_small <- small_train$指标1
# Fit a smaller forest on the subsample
small_rf <- randomForest(x = x_small, y = y_small, ntree = 100, importance = FALSE)
# Training performance
small_pred_train <- predict(small_rf, x_small)
rmse_small_train <- sqrt(mean((y_small - small_pred_train)^2))
# Testing performance
small_pred_test <- predict(small_rf, x_test)
rmse_small_test <- sqrt(mean((y_test - small_pred_test)^2))
learning_curve_data <- rbind(learning_curve_data,
data.frame(
TrainSize = size * nrow(train_data),
RMSE_Train = rmse_small_train,
RMSE_Test = rmse_small_test
))
}
p_learning_curve <- ggplot(learning_curve_data, aes(x = TrainSize)) +
geom_line(aes(y = RMSE_Train, color = "Training"), linewidth = 1) +
geom_line(aes(y = RMSE_Test, color = "Testing"), linewidth = 1) +
labs(title = "Learning Curve: Training Set Size vs. RMSE",
x = "Training set size",
y = "RMSE",
color = "Dataset") +
theme_minimal() +
scale_color_manual(values = c("Training" = "blue", "Testing" = "red"))
# 10. Model stability analysis - performance across repeated random splits
stability_results <- data.frame()
n_iterations <- 10
for (i in 1:n_iterations) {
set.seed(100 + i)
stability_index <- createDataPartition(data$指标1, p = 0.7, list = FALSE)
stability_train <- data[stability_index, ]
stability_test <- data[-stability_index, ]
x_stability_train <- stability_train[, !names(stability_train) %in% c("序号", "指标1")]
y_stability_train <- stability_train$指标1
x_stability_test <- stability_test[, !names(stability_test) %in% c("序号", "指标1")]
y_stability_test <- stability_test$指标1
stability_rf <- randomForest(x = x_stability_train, y = y_stability_train,
ntree = 100, importance = FALSE)
stability_pred <- predict(stability_rf, x_stability_test)
stability_rmse <- sqrt(mean((y_stability_test - stability_pred)^2))
stability_r2 <- 1 - sum((y_stability_test - stability_pred)^2) /
sum((y_stability_test - mean(y_stability_test))^2)
stability_results <- rbind(stability_results,
data.frame(
Iteration = i,
RMSE = stability_rmse,
R_squared = stability_r2
))
}
p_stability <- ggplot(stability_results, aes(x = Iteration, y = RMSE)) +
geom_line(color = "steelblue", size = 1) +
geom_point(color = "red", size = 2) +
labs(title = "模型稳定性分析: 不同数据划分下的测试RMSE",
x = "迭代次数",
y = "测试集RMSE") +
theme_minimal()
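A quick console summary of the same runs (a sketch; the Word report below tabulates these values as well):
cat("Stability over", n_iterations, "splits - mean test RMSE:",
    round(mean(stability_results$RMSE), 4),
    "| SD:", round(sd(stability_results$RMSE), 4), "\n")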
# 11. Feature interaction analysis (top 2 most important variables)
if(nrow(var_importance) >= 2) {
  top_vars <- head(var_importance$Variable, 2)
  # Assign the tryCatch() result so the plot survives outside the handler;
  # use autoplot() because pdp::plotPartial() returns a lattice object,
  # which cannot take ggplot2 layers via + (the progress-bar argument is
  # dropped for portability across pdp versions)
  p_interaction_plot <- tryCatch({
    p_interaction <- partial(final_rf, pred.var = top_vars, train = x_train,
                             chull = TRUE)
    autoplot(p_interaction) +
      labs(title = paste("Feature interaction:", paste(top_vars, collapse = " and "))) +
      theme_minimal()
  }, error = function(e) {
    cat("Interaction plot failed:", e$message, "\n")
    NULL
  })
}
# Save all figures
# Basic figures
ggsave("Results-RandomForest-ML/variable_importance.jpg", p_var_importance,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/predicted_vs_actual.jpg", p_pred_vs_actual,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/residual_plot.jpg", p_residual,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/qq_plot_train.jpg", p_qq_train,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/qq_plot_test.jpg", p_qq_test,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/residual_distribution_train.jpg", p_resid_hist_train,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/residual_distribution_test.jpg", p_resid_hist_test,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/oob_error.jpg", p_oob_error,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/performance_comparison.jpg", p_performance,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/learning_curve.jpg", p_learning_curve,
width = 10, height = 8, units = "in", dpi = 300)
ggsave("Results-RandomForest-ML/model_stability.jpg", p_stability,
width = 10, height = 8, units = "in", dpi = 300)
if(exists("p_tuning")) {
ggsave("Results-RandomForest-ML/parameter_tuning.jpg", p_tuning,
width = 10, height = 8, units = "in", dpi = 300)
}
ggsave("Results-RandomForest-ML/model_comparison.jpg", p_comparison,
width = 10, height = 8, units = "in", dpi = 300)
# Save the partial dependence plots
for (var in names(pdp_plots)) {
ggsave(paste0("Results-RandomForest-ML/partial_dependence_", var, ".jpg"),
pdp_plots[[var]], width = 10, height = 8, units = "in", dpi = 300)
}
# Save the interaction plot
if(exists("p_interaction_plot") && !is.null(p_interaction_plot)) {
ggsave("Results-RandomForest-ML/feature_interaction.jpg", p_interaction_plot,
width = 10, height = 8, units = "in", dpi = 300)
}
# Generate a detailed machine learning evaluation report
tryCatch({
doc <- read_docx()
# Title
doc <- doc %>%
body_add_par("Random Forest Regression - Machine Learning Analysis Report", style = "heading 1") %>%
body_add_par(paste("Generated on:", Sys.Date()), style = "Normal") %>%
body_add_par("", style = "Normal")
# Data overview
doc <- doc %>%
body_add_par("Data Overview", style = "heading 2") %>%
body_add_par(paste("Total sample size:", nrow(data)), style = "Normal") %>%
body_add_par(paste("Training set size:", nrow(train_data)), style = "Normal") %>%
body_add_par(paste("Testing set size:", nrow(test_data)), style = "Normal") %>%
body_add_par(paste("Number of variables:", ncol(data)), style = "Normal") %>%
body_add_par("", style = "Normal")
# Model performance summary
doc <- doc %>%
body_add_par("Model Performance Summary", style = "heading 2")
perf_table <- model_performance %>%
flextable() %>%
theme_zebra() %>%
autofit()
doc <- doc %>%
body_add_flextable(perf_table) %>%
body_add_par("", style = "Normal")
# Overfitting analysis (note: the 0.1 RMSE threshold is on the scale of the outcome)
overfitting_gap <- rmse_test - rmse_train
doc <- doc %>%
body_add_par("Overfitting Analysis", style = "heading 2") %>%
body_add_par(paste("RMSE gap (testing - training):", round(overfitting_gap, 4)), style = "Normal") %>%
body_add_par(ifelse(overfitting_gap > 0.1,
"Warning: potential overfitting detected (large gap between training and testing performance)",
"Good: the model generalizes well (small gap between training and testing performance)"),
style = "Normal") %>%
body_add_par("", style = "Normal")
# Variable importance
doc <- doc %>%
body_add_par("Variable Importance", style = "heading 2")
importance_ft <- var_importance %>%
flextable() %>%
theme_zebra() %>%
autofit()
doc <- doc %>%
body_add_flextable(importance_ft) %>%
body_add_par("", style = "Normal")
# Stability analysis
stability_summary <- data.frame(
Metric = c("Mean test RMSE", "SD of test RMSE", "Mean test R-squared", "SD of test R-squared"),
Value = c(mean(stability_results$RMSE), sd(stability_results$RMSE),
mean(stability_results$R_squared), sd(stability_results$R_squared))
)
doc <- doc %>%
body_add_par("Model Stability Analysis", style = "heading 2")
stability_ft <- stability_summary %>%
flextable() %>%
theme_zebra() %>%
autofit()
doc <- doc %>%
body_add_flextable(stability_ft) %>%
body_add_par("", style = "Normal")
# Add figures to the report
doc <- doc %>%
body_add_par("Visualizations", style = "heading 2") %>%
body_add_par("Variable importance plot:", style = "Normal") %>%
body_add_img("Results-RandomForest-ML/variable_importance.jpg", width = 6, height = 5) %>%
body_add_par("Performance comparison:", style = "Normal") %>%
body_add_img("Results-RandomForest-ML/performance_comparison.jpg", width = 6, height = 5) %>%
body_add_par("Learning curve:", style = "Normal") %>%
body_add_img("Results-RandomForest-ML/learning_curve.jpg", width = 6, height = 5) %>%
body_add_par("Model stability:", style = "Normal") %>%
body_add_img("Results-RandomForest-ML/model_stability.jpg", width = 6, height = 5) %>%
body_add_par("Predicted vs. actual:", style = "Normal") %>%
body_add_img("Results-RandomForest-ML/predicted_vs_actual.jpg", width = 6, height = 5) %>%
body_add_par("OOB error plot:", style = "Normal") %>%
body_add_img("Results-RandomForest-ML/oob_error.jpg", width = 6, height = 5)
# Add the partial dependence plots
if(length(pdp_plots) > 0) {
doc <- doc %>%
body_add_par("Partial dependence plots:", style = "Normal")
for (var in names(pdp_plots)[1:min(2, length(pdp_plots))]) {
doc <- doc %>%
body_add_img(paste0("Results-RandomForest-ML/partial_dependence_", var, ".jpg"), width = 6, height = 5)
}
}
# Add the interaction plot
if(exists("p_interaction_plot") && !is.null(p_interaction_plot)) {
doc <- doc %>%
body_add_par("Feature interaction plot:", style = "Normal") %>%
body_add_img("Results-RandomForest-ML/feature_interaction.jpg", width = 6, height = 5)
}
# Conclusions and recommendations
doc <- doc %>%
body_add_par("Conclusions and Recommendations", style = "heading 2") %>%
body_add_par("Machine learning analysis based on random forest regression:", style = "Normal") %>%
body_add_par(paste("- Training R-squared:", round(r_squared_train * 100, 2), "%"), style = "Normal") %>%
body_add_par(paste("- Testing R-squared:", round(r_squared_test * 100, 2), "%"), style = "Normal") %>%
body_add_par(paste("- Generalization gap:", round(overfitting_gap, 4)), style = "Normal") %>%
body_add_par(paste("- Most important variables:", paste(head(var_importance$Variable, 3), collapse = ", ")), style = "Normal") %>%
body_add_par(paste("- Model stability (SD of RMSE):", round(sd(stability_results$RMSE), 4)), style = "Normal") %>%
body_add_par("", style = "Normal") %>%
body_add_par("Recommendations:", style = "Normal") %>%
body_add_par("1. If overfitting is detected, consider regularization or a simpler model", style = "Normal") %>%
body_add_par("2. Try other ensembles such as gradient-boosted trees to improve performance further", style = "Normal") %>%
body_add_par("3. Monitor performance on new data to detect concept drift", style = "Normal") %>%
body_add_par("4. Consider feature engineering to improve model performance", style = "Normal")
# Save the Word document
print(doc, target = "Results-RandomForest-ML/随机森林机器学习分析报告.docx")
}, error = function(e) {
cat("Error generating Word report:", e$message, "\n")
})
# Save the R workspace
save.image("Results-RandomForest-ML/随机森林机器学习分析.RData")
# Save the model objects
saveRDS(final_rf, "Results-RandomForest-ML/random_forest_model.rds")
if(!is.null(rf_caret)) {
saveRDS(rf_caret, "Results-RandomForest-ML/tuned_random_forest_model.rds")
}
# Print completion messages
cat("Random forest machine learning analysis complete!\n")
cat("Results saved to the Results-RandomForest-ML folder:\n")
cat("- Model performance comparison (training vs. testing)\n")
cat("- Comprehensive visualizations including the learning curve and stability analysis\n")
cat("- Detailed Word analysis report\n")
cat("- R workspace file and model objects\n")
4. Random Forest Regression
Concept: an ensemble learning method that builds many decision trees and averages their predictions.
Principle
- Bagging: bootstrap sampling builds multiple training sets
- Feature randomness: each tree considers only a subset of features at each split
- Prediction: the average of all trees' predictions (see the sketch at the end of this section)
Idea: "two heads are better than one" - collective intelligence improves accuracy and stability.
Applications
- House price prediction
- Stock price prediction
- Bioinformatics
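To make the bagging mechanics concrete, here is a minimal from-scratch sketch (assuming the rpart package and R's built-in mtcars data; a real random forest additionally samples mtry candidate features at each split):
library(rpart)
set.seed(123)
n <- nrow(mtcars)
tree_preds <- replicate(50, {
  idx <- sample(n, n, replace = TRUE)          # bootstrap resample (bagging)
  fit <- rpart(mpg ~ ., data = mtcars[idx, ])  # one tree per resample
  predict(fit, mtcars)
})
bagged_pred <- rowMeans(tree_preds)  # ensemble prediction = average over trees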