【定量因变量机器学习】Python-03. 决策树回归（Decision Tree Regression）标准化代码

医学统计数据分析

2025-11-27

导读：定量因变量机器学习03.决策树回归（Decision Tree Regression）Python教程（标准化

定量因变量机器学习

03.决策树回归（Decision Tree Regression）

Python教程（标准化代码）

概念、原理、思想、应用

概念：决策树回归通过树状结构进行回归预测，每个内部节点表示一个特征测试，每个叶节点表示一个预测值。

原理：通过递归地划分特征空间，使得每个子空间内的样本尽可能同质。常用划分准则有均方误差（MSE）等。

思想：将特征空间划分为多个矩形区域，并在每个区域上使用常量进行预测。

应用：非线性关系建模、可解释性要求较高的场景。

操作流程

-数据预处理：

-模型构建：

-训练：

-评估：

-可视化：

-保存结果：

代码及操作演示与功能解析

定量变量预测的机器学习模型可分为传统统计模型、树基集成模型、核方法和深度学习模型四大类，每类模型通过不同机制捕捉数据模式，适用于从线性到复杂非线性关系的预测任务。

代码涵盖了从数据准备到结果保存的自动化过程，包括数据预处理、模型配置、性能评估和报告生成。

# pip install pandas numpy matplotlib seaborn scikit-learn scipy python-docx openpyxl joblib

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
import scipy.stats as stats
from docx import Document
from docx.shared import Inches
import warnings

warnings.filterwarnings('ignore')

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False


class DecisionTreeAnalysis:
    def __init__(self):
        self.tree_model = None
        self.train_data = None
        self.test_data = None
        self.results_dir = os.path.join(os.path.expanduser("~"), "Desktop", "Results模型-DecisionTree-ML")

        # 创建结果目录
        if not os.path.exists(self.results_dir):
            os.makedirs(self.results_dir)

    def load_and_preprocess_data(self):
        """加载和预处理数据"""
        try:
            # 读取Excel数据
            file_path = os.path.join(os.path.expanduser("~"), "Desktop", "示例数据.xlsx")
            self.data = pd.read_excel(file_path, sheet_name="示例数据")

            print("数据基本信息:")
            print(f"数据形状: {self.data.shape}")
            print("\n数据类型:")
            print(self.data.dtypes)
            print("\n数据摘要:")
            print(self.data.describe())

            # 处理分类变量
            categorical_columns = ['结局', '肥胖程度', '教育水平', '血型', '指标8']
            for col in categorical_columns:
                if col in self.data.columns:
                    self.data[col] = self.data[col].astype('category')

            # 检查缺失值
            missing_values = self.data.isnull().sum().sum()
            print(f"\n总缺失值数量: {missing_values}")

            # 如果有缺失值，进行简单处理
            if missing_values > 0:
                self.data = self.data.fillna(self.data.mean(numeric_only=True))
                for col in categorical_columns:
                    if col in self.data.columns and self.data[col].isnull().any():
                        self.data[col] = self.data[col].fillna(
                            self.data[col].mode()[0] if not self.data[col].mode().empty else'Unknown')

            return True

        except Exception as e:
            print(f"数据加载失败: {e}")
            return False

    def split_data(self):
        """划分训练集和测试集"""
        X = self.data.drop(['指标1', '序号'], axis=1, errors='ignore')
        y = self.data['指标1']

        # 对分类变量进行one-hot编码
        X_encoded = pd.get_dummies(X, drop_first=True)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X_encoded, y, test_size=0.3, random_state=123
        )

        print(f"训练集样本数: {len(self.X_train)}")
        print(f"测试集样本数: {len(self.X_test)}")




    def evaluate_model(self):
        """评估模型性能"""
        # 训练集预测
        self.y_pred_train = self.tree_model.predict(self.X_train)
        # 测试集预测
        self.y_pred_test = self.tree_model.predict(self.X_test)

        # 计算训练集性能指标
        self.mse_train = mean_squared_error(self.y_train, self.y_pred_train)
        self.rmse_train = np.sqrt(self.mse_train)
        self.mae_train = mean_absolute_error(self.y_train, self.y_pred_train)
        self.r2_train = r2_score(self.y_train, self.y_pred_train)

        # 计算测试集性能指标
        self.mse_test = mean_squared_error(self.y_test, self.y_pred_test)
        self.rmse_test = np.sqrt(self.mse_test)
        self.mae_test = mean_absolute_error(self.y_test, self.y_pred_test)
        self.r2_test = r2_score(self.y_test, self.y_pred_test)

        # 保存性能指标
        performance_data = {
            'Dataset': ['Training', 'Training', 'Training', 'Training',
                        'Testing', 'Testing', 'Testing', 'Testing', 'CV'],
            'Metric': ['MSE', 'RMSE', 'MAE', 'R-squared',
                       'MSE', 'RMSE', 'MAE', 'R-squared', 'CV-RMSE'],
            'Value': [self.mse_train, self.rmse_train, self.mae_train, self.r2_train,
                      self.mse_test, self.rmse_test, self.mae_test, self.r2_test, self.cv_rmse]
        }

        self.performance_df = pd.DataFrame(performance_data)
        self.performance_df.to_csv(os.path.join(self.results_dir, "决策树模型性能_ML.csv"),
                                   index=False, encoding='utf-8-sig')

        print("\n模型性能指标:")
        print(self.performance_df)

    def get_feature_importance(self):
        """获取特征重要性"""
        feature_importance = self.tree_model.feature_importances_
        self.feature_importance_df = pd.DataFrame({
            'Variable': self.X_train.columns,
            'Importance': feature_importance
        }).sort_values('Importance', ascending=False)

        self.feature_importance_df.to_csv(
            os.path.join(self.results_dir, "决策树变量重要性.csv"),
            index=False, encoding='utf-8-sig'
        )

        print("\n特征重要性:")
        print(self.feature_importance_df.head(10))

def plot_decision_tree(self):
        """绘制决策树"""
        plt.figure(figsize=(20, 10))
        plot_tree(self.tree_model,
                  feature_names=self.X_train.columns,
                  filled=True,
                  rounded=True,
                  max_depth=3,  # 限制深度以便可视化
                  fontsize=10)
        plt.title("Decision Tree Structure", fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.savefig(os.path.join(self.results_dir, "decision_tree_structure.jpg"),
                    dpi=300, bbox_inches='tight')
        plt.close()

    def plot_feature_importance(self):
        """绘制特征重要性图"""
        plt.figure(figsize=(10, 8))
        top_features = self.feature_importance_df.head(15)

        plt.barh(range(len(top_features)), top_features['Importance'],
                 color='steelblue', alpha=0.8)
        plt.yticks(range(len(top_features)), top_features['Variable'])
        plt.xlabel('Importance Score')
        plt.title('Decision Tree Variable Importance', fontsize=14, fontweight='bold')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.savefig(os.path.join(self.results_dir, "variable_importance.jpg"),
                    dpi=300, bbox_inches='tight')
        plt.close()

    def plot_prediction_vs_actual(self):
        """绘制预测值与实际值对比图"""
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # 训练集
        ax1.scatter(self.y_train, self.y_pred_train, alpha=0.6, color='blue', label='Training')
        ax1.plot([self.y_train.min(), self.y_train.max()],
                 [self.y_train.min(), self.y_train.max()], 'r--', lw=2)
        ax1.set_xlabel('Actual Values')
        ax1.set_ylabel('Predicted Values')
        ax1.set_title('Training Set: Predicted vs Actual')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # 测试集
        ax2.scatter(self.y_test, self.y_pred_test, alpha=0.6, color='green', label='Testing')
        ax2.plot([self.y_test.min(), self.y_test.max()],
                 [self.y_test.min(), self.y_test.max()], 'r--', lw=2)
        ax2.set_xlabel('Actual Values')
        ax2.set_ylabel('Predicted Values')
        ax2.set_title('Testing Set: Predicted vs Actual')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(self.results_dir, "predicted_vs_actual.jpg"),
                    dpi=300, bbox_inches='tight')
        plt.close()

def plot_residual_analysis(self):
        """绘制残差分析图"""
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))

        # 训练集残差
        residuals_train = self.y_train - self.y_pred_train
        # 测试集残差
        residuals_test = self.y_test - self.y_pred_test

        # 残差散点图 - 训练集
        axes[0, 0].scatter(self.y_pred_train, residuals_train, alpha=0.6, color='blue')
        axes[0, 0].axhline(y=0, color='red', linestyle='--')
        axes[0, 0].set_xlabel('Fitted Values')
        axes[0, 0].set_ylabel('Residuals')
        axes[0, 0].set_title('Training Set Residuals')
        axes[0, 0].grid(True, alpha=0.3)

        # 残差散点图 - 测试集
        axes[0, 1].scatter(self.y_pred_test, residuals_test, alpha=0.6, color='green')
        axes[0, 1].axhline(y=0, color='red', linestyle='--')
        axes[0, 1].set_xlabel('Fitted Values')
        axes[0, 1].set_ylabel('Residuals')
        axes[0, 1].set_title('Testing Set Residuals')
        axes[0, 1].grid(True, alpha=0.3)

        # Q-Q图 - 训练集
        stats.probplot(residuals_train, dist="norm", plot=axes[0, 2])
        axes[0, 2].set_title('Q-Q Plot - Training Set')

        # Q-Q图 - 测试集
        stats.probplot(residuals_test, dist="norm", plot=axes[1, 0])
        axes[1, 0].set_title('Q-Q Plot - Testing Set')

        # 残差分布 - 训练集
        axes[1, 1].hist(residuals_train, bins=30, density=True, alpha=0.7, color='blue', label='Training')
        axes[1, 1].set_xlabel('Residuals')
        axes[1, 1].set_ylabel('Density')
        axes[1, 1].set_title('Residual Distribution - Training')
        axes[1, 1].legend()

        # 残差分布 - 测试集
        axes[1, 2].hist(residuals_test, bins=30, density=True, alpha=0.7, color='green', label='Testing')
        axes[1, 2].set_xlabel('Residuals')
        axes[1, 2].set_ylabel('Density')
        axes[1, 2].set_title('Residual Distribution - Testing')
        axes[1, 2].legend()

        plt.tight_layout()
        plt.savefig(os.path.join(self.results_dir, "residual_analysis.jpg"),
                    dpi=300, bbox_inches='tight')
        plt.close()

def plot_performance_comparison(self):
        """绘制性能对比图"""
        performance_metrics = pd.DataFrame({
            'Metric': ['RMSE', 'MAE', 'R-squared'] * 2,
            'Value': [self.rmse_train, self.mae_train, self.r2_train,
                      self.rmse_test, self.mae_test, self.r2_test],
            'Dataset': ['Training'] * 3 + ['Testing'] * 3
        })

        plt.figure(figsize=(10, 6))
        sns.barplot(data=performance_metrics, x='Metric', y='Value', hue='Dataset',
                    palette=['steelblue', 'orange'], alpha=0.8)
        plt.title('Model Performance Comparison: Training vs Testing', fontsize=14, fontweight='bold')
        plt.legend(loc='best')
        plt.tight_layout()
        plt.savefig(os.path.join(self.results_dir, "performance_comparison.jpg"),
                    dpi=300, bbox_inches='tight')
        plt.close()

    def learning_curve_analysis(self):
        """学习曲线分析"""
        train_sizes = np.linspace(0.1, 0.9, 9)
        train_rmse = []
        test_rmse = []

        for size in train_sizes:
            # 从训练集中抽取子集
            n_samples = int(size * len(self.X_train))
            indices = np.random.choice(len(self.X_train), n_samples, replace=False)

            X_subset = self.X_train.iloc[indices]
            y_subset = self.y_train.iloc[indices]

            # 训练小模型
            small_tree = DecisionTreeRegressor(**self.tree_model.get_params())
            small_tree.fit(X_subset, y_subset)

            # 训练集性能
            pred_train = small_tree.predict(X_subset)
            rmse_train = np.sqrt(mean_squared_error(y_subset, pred_train))
            train_rmse.append(rmse_train)

            # 测试集性能
            pred_test = small_tree.predict(self.X_test)
            rmse_test = np.sqrt(mean_squared_error(self.y_test, pred_test))
            test_rmse.append(rmse_test)

        # 绘制学习曲线
        plt.figure(figsize=(10, 6))
        plt.plot(train_sizes * len(self.X_train), train_rmse, 'o-', color='blue',
                 label='Training RMSE', linewidth=2)
        plt.plot(train_sizes * len(self.X_train), test_rmse, 'o-', color='red',
                 label='Testing RMSE', linewidth=2)
        plt.xlabel('Training Set Size')
        plt.ylabel('RMSE')
        plt.title('Learning Curve: Training Size vs RMSE', fontsize=14, fontweight='bold')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(self.results_dir, "learning_curve.jpg"),
                    dpi=300, bbox_inches='tight')
        plt.close()

    def model_stability_analysis(self):
        """模型稳定性分析"""
        n_iterations = 10
        stability_results = []

        for i in range(n_iterations):
            # 重新划分数据
            X_temp = self.data.drop(['指标1', '序号'], axis=1, errors='ignore')
            y_temp = self.data['指标1']
            X_encoded = pd.get_dummies(X_temp, drop_first=True)

            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
                X_encoded, y_temp, test_size=0.3, random_state=100 + i
            )

            # 训练模型
            temp_model = DecisionTreeRegressor(**self.tree_model.get_params())
            temp_model.fit(X_train_temp, y_train_temp)

            # 预测和评估
            y_pred_temp = temp_model.predict(X_test_temp)
            rmse_temp = np.sqrt(mean_squared_error(y_test_temp, y_pred_temp))
            r2_temp = r2_score(y_test_temp, y_pred_temp)

            stability_results.append({
                'Iteration': i + 1,
                'RMSE': rmse_temp,
                'R_squared': r2_temp
            })

        self.stability_df = pd.DataFrame(stability_results)

        # 绘制稳定性图
        plt.figure(figsize=(10, 6))
        plt.plot(self.stability_df['Iteration'], self.stability_df['RMSE'],
                 'o-', color='steelblue', linewidth=2, markersize=8)
        plt.xlabel('Iteration')
        plt.ylabel('Test RMSE')
        plt.title('Model Stability Across Different Data Splits', fontsize=14, fontweight='bold')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(self.results_dir, "model_stability.jpg"),
                    dpi=300, bbox_inches='tight')
        plt.close()

    def generate_report(self):
        """生成分析报告"""
        doc = Document()

        # 标题
        doc.add_heading('决策树回归 - 机器学习分析报告', 0)
        doc.add_paragraph(f'生成日期: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")}')
        doc.add_paragraph()

        # 数据概述
        doc.add_heading('数据概述', level=1)
        doc.add_paragraph(f'总样本数: {len(self.data)}')
        doc.add_paragraph(f'训练集样本数: {len(self.X_train)}')
        doc.add_paragraph(f'测试集样本数: {len(self.X_test)}')
        doc.add_paragraph(f'特征数量: {len(self.X_train.columns)}')
        doc.add_paragraph()

        # 模型性能摘要
        doc.add_heading('模型性能摘要', level=1)

        # 创建性能表格
        table = doc.add_table(rows=1, cols=3)
        table.style = 'Light Grid'
        hdr_cells = table.rows[0].cells
        hdr_cells[0].text = '数据集'
        hdr_cells[1].text = '指标'
        hdr_cells[2].text = '值'

        for _, row in self.performance_df.iterrows():
            row_cells = table.add_row().cells
            row_cells[0].text = str(row['Dataset'])
            row_cells[1].text = str(row['Metric'])
            row_cells[2].text = f"{row['Value']:.4f}"

        doc.add_paragraph()

        # 过拟合分析
        doc.add_heading('过拟合分析', level=1)
        overfitting_gap = self.rmse_test - self.rmse_train
        doc.add_paragraph(f'RMSE差距 (测试集 - 训练集): {overfitting_gap:.4f}')

        if overfitting_gap > 0.1:
            doc.add_paragraph('警告: 检测到可能的过拟合 (训练集和测试集性能差距较大)')
        else:
            doc.add_paragraph('良好: 模型表现出良好的泛化能力 (训练集和测试集性能差距较小)')

        doc.add_paragraph()

        # 特征重要性
        doc.add_heading('特征重要性', level=1)
        doc.add_paragraph('前10个最重要的特征:')

        table = doc.add_table(rows=1, cols=2)
        table.style = 'Light Grid'
        hdr_cells = table.rows[0].cells
        hdr_cells[0].text = '特征'
        hdr_cells[1].text = '重要性'

        for _, row in self.feature_importance_df.head(10).iterrows():
            row_cells = table.add_row().cells
            row_cells[0].text = str(row['Variable'])
            row_cells[1].text = f"{row['Importance']:.4f}"

        doc.add_paragraph()

        # 稳定性分析
        doc.add_heading('模型稳定性分析', level=1)
        stability_summary = pd.DataFrame({
            '指标': ['平均测试RMSE', '测试RMSE标准差', '平均测试R平方', '测试R平方标准差'],
            '值': [
                self.stability_df['RMSE'].mean(),
                self.stability_df['RMSE'].std(),
                self.stability_df['R_squared'].mean(),
                self.stability_df['R_squared'].std()
            ]
        })

        table = doc.add_table(rows=1, cols=2)
        table.style = 'Light Grid'
        hdr_cells = table.rows[0].cells
        hdr_cells[0].text = '指标'
        hdr_cells[1].text = '值'

        for _, row in stability_summary.iterrows():
            row_cells = table.add_row().cells
            row_cells[0].text = str(row['指标'])
            row_cells[1].text = f"{row['值']:.4f}"

        doc.add_paragraph()

        # 可视化结果
        doc.add_heading('可视化结果', level=1)

        # 添加图片
        image_files = [
            "variable_importance.jpg",
            "performance_comparison.jpg",
            "learning_curve.jpg",
            "model_stability.jpg",
            "predicted_vs_actual.jpg"
        ]

        titles = [
            "特征重要性",
            "性能对比",
            "学习曲线",
            "模型稳定性",
            "预测值 vs 实际值"
        ]

        for img_file, title in zip(image_files, titles):
            doc.add_paragraph(title)
            img_path = os.path.join(self.results_dir, img_file)
            if os.path.exists(img_path):
                doc.add_picture(img_path, width=Inches(6))
            doc.add_paragraph()

        # 结论和建议
        doc.add_heading('结论和建议', level=1)
        doc.add_paragraph('基于决策树回归的机器学习分析:')
        doc.add_paragraph(f'- 训练集 R平方: {self.r2_train * 100:.2f}%')
        doc.add_paragraph(f'- 测试集 R平方: {self.r2_test * 100:.2f}%')
        doc.add_paragraph(f'- 模型泛化差距: {overfitting_gap:.4f}')
        doc.add_paragraph(f'- 最重要的特征: {", ".join(self.feature_importance_df.head(3)["Variable"].tolist())}')
        doc.add_paragraph()

        doc.add_paragraph('建议:')
        doc.add_paragraph('1. 如果检测到过拟合，考虑进行超参数调优')
        doc.add_paragraph('2. 探索集成方法 (随机森林、梯度提升) 以提高性能')
        doc.add_paragraph('3. 在新数据上监控模型性能以检测概念漂移')

        # 保存文档
        report_path = os.path.join(self.results_dir, "DecisionTree_ML_Analysis_Report.docx")
        doc.save(report_path)
        print(f"分析报告已保存至: {report_path}")

    def save_models(self):
        """保存模型"""
        import joblib

        # 保存模型对象
        joblib.dump(self.tree_model, os.path.join(self.results_dir, "decision_tree_model.pkl"))

        # 保存处理后的数据
        self.data.to_csv(os.path.join(self.results_dir, "processed_data.csv"),
                         index=False, encoding='utf-8-sig')

        print("模型和数据已保存")

    def run_analysis(self):
        """运行完整分析流程"""
        print("开始决策树回归分析...")

        # 1. 加载和预处理数据
        if not self.load_and_preprocess_data():
            return

        # 2. 划分数据
        self.split_data()

        # 3. 训练决策树模型
        print("训练决策树模型中...")
        self.train_decision_tree()

        # 4. 评估模型
        print("评估模型性能...")
        self.evaluate_model()

        # 5. 获取特征重要性
        self.get_feature_importance()

        # 6. 生成可视化
        print("生成可视化图表...")
        self.plot_decision_tree()
        self.plot_feature_importance()
        self.plot_prediction_vs_actual()
        self.plot_residual_analysis()
        self.plot_performance_comparison()
        self.learning_curve_analysis()
        self.model_stability_analysis()

        # 7. 生成报告
        print("生成分析报告...")
        self.generate_report()

        # 8. 保存模型
        self.save_models()

        print("\n决策树机器学习分析完成!")
        print(f"结果已保存到: {self.results_dir}")
        print("- 模型性能比较 (训练集 vs 测试集)")
        print("- 综合可视化图表包括学习曲线")
        print("- 模型稳定性分析")
        print("- 详细的Word分析报告")
        print("- 模型对象和处理后的数据")


# 主程序
if __name__ == "__main__":
    analysis = DecisionTreeAnalysis()
    analysis.run_analysis()