
一、数据说明及读取
1、数据集信息
2、数据属性信息
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3、读取数据
# Load the training data.
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

path = '/Users/wuzhengxiang/Documents/DataSets/RizhaoGongJiJin/train.csv'
# Missing values are replaced with the sentinel -1.
train = pd.read_csv(path).fillna(-1)

train.columns
# Output shown in the article:
# Index(['id', 'XINGBIE', 'CSNY', 'HYZK', 'ZHIYE', 'ZHICHEN', 'ZHIWU', 'XUELI',
#        'DWJJLX', 'DWSSHY', 'GRJCJS', 'GRZHZT', 'GRZHYE', 'GRZHSNJZYE',
#        'GRZHDNGJYE', 'GRYJCE', 'DWYJCE', 'DKFFE', 'DKYE', 'DKLL', 'label'],
#       dtype='object')

train.head()  # look at the first rows
# Output shown in the article:
#         id  XINGBIE        CSNY  HYZK  ZHIYE  ZHICHEN  ZHIWU  XUELI  DWJJLX  \
# 0  train_0        1  1038672000    90     90      999      0     99     150
# 1  train_1        2   504892800    90     90      999      0     99     110
# 2  train_2        1   736185600    90     90      999      0     99     150
# 3  train_3        1   428515200    90     90      999      0     99     150
# 4  train_4        2   544204800    90     90      999      0     99     900
#
#    DWSSHY   GRJCJS  GRZHZT      GRZHYE  GRZHSNJZYE  GRZHDNGJYE   GRYJCE  \
# 0      12   1737.0       1    3223.515     801.310     837.000   312.00
# 1       0   4894.0       1   18055.195   53213.220    1065.200   795.84
# 2       9  10297.0       1   27426.600   13963.140    7230.020  1444.20
# 3       7  10071.5       1  111871.130   99701.265    2271.295  1417.14
# 4      14   2007.0       1     237.000   11028.875      35.780   325.50
#
#     DWYJCE   DKFFE        DKYE   DKLL  label
# 0   312.00  175237  154112.935  2.708      0
# 1   795.84  300237  298252.945  2.979      0
# 2  1444.20  150237  147339.130  2.708      0
# 3  1417.14  350237  300653.780  2.708      0
# 4   325.50  150237  145185.010  2.708      0

# Build the training set: features are the columns from XINGBIE through DKLL
# (label-based slices include both ends); the target is the `label` column.
X = train.loc[:, 'XINGBIE':'DKLL']
Y = train['label']
二、两种决策树的构建
- sklearn.tree.DecisionTreeClassifier 类:分类树的实现。
- sklearn.tree.DecisionTreeRegressor 类:回归树的实现。
1、分类树构建
# Train a decision tree classifier; the maximum depth and the minimum number
# of samples per leaf are both capped.
from sklearn import tree

clf = tree.DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=50,
).fit(X, Y)  # fit() returns the fitted estimator itself
2、回归树构建
# Train a regression tree; the maximum depth and the minimum number of
# samples per leaf are both capped.
from sklearn import tree

# This section is about regression trees, so the regressor class is used here.
# The model is bound to `reg` so that `clf` keeps referring to the
# classification tree used by the visualisation cells that follow.
reg = tree.DecisionTreeRegressor(max_depth=3, min_samples_leaf=50)
reg = reg.fit(X, Y)
三、决策树的可视化
1、plot_tree可视化
# plot_tree ships with scikit-learn; the default picture is plain, but it is
# usable once the resolution is raised.
import matplotlib.pyplot as plt

tree.plot_tree(clf)
plt.show()

# Set the figure size and DPI; use larger values for a sharper picture.
plt.figure(
    figsize=(8, 8),
    dpi=1000,
)
tree.plot_tree(clf)
plt.show()
2、graphviz可视化
import graphviz

# Export the fitted tree as DOT source (out_file=None returns it as a string).
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=X.columns,
    class_names=['good', 'bad'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph  # displayed when this is the last expression of a notebook cell
# Build the training set.
X = train.loc[:, 'XINGBIE':'DKLL']
Y = train['label']

# Train a model.
from sklearn import tree

clf = tree.DecisionTreeClassifier(
    max_depth=9,
    min_samples_leaf=1800,
).fit(X, Y)

import graphviz

# Export the fitted tree as DOT source and wrap it for display.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=X.columns,
    class_names=['good', 'bad'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph
#graph.save('prueba2.png')
3、dtreeviz可视化
dtreeviz.model(model,X_train,y_train,tree_index: int=None,feature_names: List[str]=None,target_name: str=None,class_names: (List[str], Mapping[int, str])=None) -> DTreeVizAPI
# Train a model.
from sklearn import tree

clf = tree.DecisionTreeClassifier(max_depth=6, min_samples_leaf=20)
clf = clf.fit(X, Y)

# Start the visualisation.
import dtreeviz
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

viz_model = dtreeviz.model(
    clf,
    X_train=X,
    y_train=Y,
    target_name='label',
    feature_names=X.columns,
    class_names={0: 'good', 1: 'bad'},
)

v = viz_model.view()         # render as SVG into internal object
v.show()                     # pop up window
v.save("/tmp/gonjijin.svg")  # optionally save as svg
# Pick one training row and render the tree with that sample passed as `x`.
testX = X.iloc[77]
v = viz_model.view(x=testX)
v.show()

# Same sample again, this time with show_just_path=True.
v = viz_model.view(x=testX, show_just_path=True)
v.show()
v.save("gonjijin.svg")  # optionally save as svg
# Load the data.
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)  # show all columns
path = 'train.csv'
train = pd.read_csv(path).fillna(-1)

# Build the training set.
X = train.loc[:, 'XINGBIE':'DKLL']
Y = train['label']

# Train a model.
from sklearn import tree

clf = tree.DecisionTreeClassifier(
    max_depth=7,
    min_samples_leaf=20,
).fit(X, Y)

# Start the visualisation.
import dtreeviz
import warnings

warnings.filterwarnings("ignore")

viz_model = dtreeviz.model(
    clf,
    X_train=X,
    y_train=Y,
    target_name='label',
    feature_names=X.columns,
    class_names={0: 'good', 1: 'bad'},
)
v = viz_model.view()    # render as SVG into internal object
v.show()                # pop up window
v.save("gonjijin.svg")  # optionally save as svg
# Load the data.
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)  # show all columns
path = 'train.csv'
train = pd.read_csv(path).fillna(-1)

# Build the training set.
X = train.loc[:, 'XINGBIE':'DKLL']
Y = train['label']

# Train a model.
from sklearn import tree

clf = tree.DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=20,
).fit(X, Y)
testX = X.iloc[77]

# Start the visualisation.
import dtreeviz
import warnings

warnings.filterwarnings("ignore")

viz_model = dtreeviz.model(
    clf,
    X_train=X,
    y_train=Y,
    target_name='label',
    feature_names=X.columns,
    class_names={0: 'good', 1: 'bad'},
)
# orientation="LR" requests a left-to-right layout.
v = viz_model.view(orientation="LR")
v.show()
v.save("gonjijin.svg")
# Load the data.
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)  # show all columns
path = 'train.csv'
train = pd.read_csv(path).fillna(-1)

# Build the training set.
X = train.loc[:, 'XINGBIE':'DKLL']
Y = train['label']

# Train a model.
from sklearn import tree

clf = tree.DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=50,
).fit(X, Y)
testX = X.iloc[77]

# Start the visualisation.
import dtreeviz
import warnings

warnings.filterwarnings("ignore")

viz_model = dtreeviz.model(
    clf,
    X_train=X,
    y_train=Y,
    target_name='label',
    feature_names=X.columns,
    class_names={0: 'good', 1: 'bad'},
)
# fancy=False requests the simplified rendering.
v = viz_model.view(fancy=False)
v.show()
v.save("gonjijin.svg")
4、回归树可视化
# Regression-tree visualisation.
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import dtreeviz

clf = DecisionTreeRegressor(max_depth=5)
data = fetch_california_housing()
X = data.data
Y = data.target
clf.fit(X, Y)

viz_model = dtreeviz.model(
    clf,
    X_train=X,
    y_train=Y,
    # target_name is a single label; data.target_names is a list, so its
    # only entry is passed rather than the list itself.
    target_name=data.target_names[0],
    feature_names=data.feature_names,
)
v = viz_model.view()     # render as SVG into internal object
v.show()                 # pop up window
v.save("/tmp/iris.svg")  # optionally save as svg
# Regression-tree visualisation, left-to-right layout.
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import dtreeviz

clf = DecisionTreeRegressor(max_depth=4)
data = fetch_california_housing()
X = data.data
Y = data.target
clf.fit(X, Y)

viz_model = dtreeviz.model(
    clf,
    X_train=X,
    y_train=Y,
    # target_name is a single label; data.target_names is a list, so its
    # only entry is passed rather than the list itself.
    target_name=data.target_names[0],
    feature_names=data.feature_names,
)
v = viz_model.view(orientation="LR")  # render as SVG into internal object
v.show()                              # pop up window
v.save("gonjijin.svg")
四、理解决策树的底层设计
from sklearn.datasets import load_iris
from sklearn import tree

iris = load_iris()
# max_depth=2 limits the tree to 5 nodes (3 of them leaves), which is the
# tree described in the walkthrough that follows this code; an unrestricted
# tree on this dataset grows much larger.
# NOTE(review): the root split may be reported on feature 2 or feature 3
# depending on random tie-breaking — set random_state if exact reproduction
# matters.
clf = tree.DecisionTreeClassifier(max_depth=2)
clf = clf.fit(iris.data, iris.target)

clf.classes_
# Public attributes and methods of the fitted estimator.
[x for x in dir(clf) if not x.startswith('_')]
# Explore the structure of the fitted tree.
dir(clf.tree_)
# Output shown in the article:
# ['apply', 'capacity', 'children_left', 'children_right',
#  'compute_feature_importances', 'compute_partial_dependence',
#  'decision_path', 'feature', 'impurity', 'max_depth', 'max_n_classes',
#  'n_classes', 'n_features', 'n_leaves', 'n_node_samples', 'n_outputs',
#  'node_count', 'predict', 'threshold', 'value', 'weighted_n_node_samples']

# Each bare expression below shows its value only when evaluated on its own
# in an interactive session.
clf.tree_.children_left
clf.tree_.children_right
clf.tree_.feature
clf.tree_.capacity
clf.tree_.threshold
clf.tree_.value
clf.tree_.impurity
clf.tree_.decision_path  # accessed here without being called
- 训练后的决策树共包含5个节点,其中3个叶子节点
- 通过children_left和children_right两个属性,可以知道第0个节点(也就是根节点)的左子节点索引为1,右子节点索引为2;第1个节点的左右子节点均为-1,意味着该节点即为叶子节点;第2个节点的左右子节点分别为3和4,说明它是一个内部节点,并做了进一步分裂
- 通过feature和threshold两个属性,可以知道第0个节点(根节点)使用索引为3的特征(对应第4列特征)进行分裂,且其最优分割阈值为0.8;第1个节点因为是叶子节点,所以不再分裂,其对应feature和threshold字段均为-2
- 通过value属性,可以查看落入每个节点的各类样本数量,由于鸢尾花数据集是一个三分类问题,且该决策树共有5个节点,所以value的取值为一个5×3的二维数组,例如第一行代表落入根节点的样本计数为[50, 50, 50],第二行代表落入左子节点的样本计数为[50, 0, 0],由于已经是纯的了,所以不再继续分裂。
- 另外,tree中实际上并未直接标出各叶节点所对应的标签值,但完全可通过value属性来得到,即各叶子节点中落入样本最多的类别即为相应标签。甚至说,不仅可知道对应标签,还可通过计算数量之比得到相应的概率!
五、决策规则提取
1、老方法提取策略
# Load the data.
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)  # show all columns
path = '/Users/wuzhengxiang/Documents/DataSets/RizhaoGongJiJin/train.csv'
train = pd.read_csv(path).fillna(-1)
train.columns

# Build the training set.
X = train.loc[:, 'XINGBIE':'DKLL']
Y = train['label']

# Train a decision tree; the maximum depth and the minimum number of samples
# per leaf are both capped.
from sklearn import tree

clf = tree.DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=50,
).fit(X, Y)

# Decision-rule extraction, the older approach.
from sklearn.tree import _tree


def tree_to_code(tree, feature_names):
    """Print a fitted tree as the source of a nested if/else function.

    Args:
        tree: fitted estimator exposing a ``tree_`` attribute.
        feature_names: names indexed by the feature ids stored in the tree.
    """
    tree_ = tree.tree_
    # One name per node; nodes without a split feature get a placeholder.
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print ("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = " " * depth
        # Nodes without a split feature print their stored value and stop.
        if tree_.feature[node] == _tree.TREE_UNDEFINED:
            print("{}return {}".format(indent, tree_.value[node]))
            return
        name = feature_name[node]
        threshold = tree_.threshold[node]
        print("{}if {} <= {}:".format(indent, name, threshold))
        recurse(tree_.children_left[node], depth + 1)
        print("{}else: # if {} > {}".format(indent, name, threshold))
        recurse(tree_.children_right[node], depth + 1)

    recurse(0, 1)


tree_to_code(clf, X.columns)
# Fragment of the printed output shown in the article:
# return [[161. 0.]]
2、新方法提取策略
def XiaoWuGe_Get_Rules(clf, X):
    """Return one rule string per leaf of a fitted decision tree.

    Each rule has the form ``"<cond> & <cond> & ...:<v0>:<v1>..."``: the split
    conditions on the path from the root to the leaf joined by ``" & "``,
    followed by the per-class entries of ``tree_.value`` for that leaf, each
    prefixed by ``":"``. For a two-class tree this is
    ``"<conditions>:<class 0>:<class 1>"``.

    Args:
        clf: fitted estimator exposing ``tree_`` with ``children_left``,
            ``children_right``, ``feature``, ``threshold`` and ``value``.
        X: object whose ``columns`` maps a feature index to its name.

    Returns:
        list[str]: the rules, left branch before right branch at every split.
        A tree whose root is a leaf yields a single rule with an empty
        condition part.
    """
    # NOTE(review): depending on the scikit-learn version, tree_.value may
    # hold class fractions rather than sample counts — confirm before reading
    # the numbers as counts.
    tree_ = clf.tree_
    left = tree_.children_left
    right = tree_.children_right
    feature = tree_.feature
    threshold = tree_.threshold
    value = tree_.value
    columns = X.columns

    rules = []
    # Each entry is (node id, conditions collected on the way to that node).
    pending = [(0, [])]
    while pending:
        node, conditions = pending.pop()
        # Only leaves have identical left and right child ids.
        if left[node] == right[node]:
            # Only the first output's row of `value` is reported.
            counts = ':'.join(str(v) for v in value[node][0])
            rules.append(' & '.join(conditions) + ':' + counts)
            continue
        name = columns[feature[node]]
        cut = round(threshold[node], 4)
        # Push right first so the left branch is processed first.
        pending.append(
            (right[node], conditions + ['{f}>{th}'.format(f=name, th=cut)])
        )
        pending.append(
            (left[node], conditions + ['{f}<={th}'.format(f=name, th=cut)])
        )
    return rules
# Train a decision tree and extract its rules.
clf = tree.DecisionTreeClassifier(
    max_depth=10,
    min_samples_leaf=50,
).fit(X, Y)
Rules = XiaoWuGe_Get_Rules(clf, X)

Rules[0:5]  # look at the first 5 rules
# Output shown in the article:
# ['GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE<=663.54 & DKYE<=67419.1094:45.0:8.0',
#  'GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE<=663.54 & DKYE >67419.1094:61.0:3.0',
#  'GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE >663.54 & GRZHYE<=45622.4883 & DKYE<=1825.5625:63.0:2.0',
#  'GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE >663.54 & GRZHYE<=45622.4883 & DKYE >1825.5625:188.0:0.0',
#  'GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE >663.54 & GRZHYE >45622.4883:46.0:4.0']

len(Rules)  # total number of rules
# Output shown in the article:
# 182
# Train a deeper tree and extract its rules.
clf = tree.DecisionTreeClassifier(max_depth=15, min_samples_leaf=20)
clf = clf.fit(X, Y)
# The extraction function is named XiaoWuGe_Get_Rules.
Rules = XiaoWuGe_Get_Rules(clf, X)

Rules[0:5]
# Output shown in the article:
# ['GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE<=663.54 & GRZHSNJZYE<=19428.9082 & DKFFE<=142737.0 & CSNY<=600926400.0:54.0:0.0',
#  'GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE<=663.54 & GRZHSNJZYE<=19428.9082 & DKFFE<=142737.0 & CSNY >600926400.0:18.0:2.0',
#  'GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE<=663.54 & GRZHSNJZYE<=19428.9082 & DKFFE >142737.0:19.0:4.0',
#  'GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE<=663.54 & GRZHSNJZYE >19428.9082:15.0:5.0',
#  'GRZHZT<=1.5 & DWSSHY<=14.5 & DWJJLX<=177.0 & DWJJLX<=115.0 & DKYE<=111236.2852 & DWSSHY<=4.5 & DWYJCE >663.54 & GRZHYE<=73608.0156 & DKYE<=1825.5625 & GRZHSNJZYE<=9524.7949:21.0:2.0']

len(Rules)
# Output shown in the article:
# 521

# Every rule can be iterated over.
for i in Rules:
    print(i)
六、策略结果整理保存
# Tidy the extracted rules into a table.
df = pd.DataFrame(Rules)
df.columns = ['allrules']

# Each rule string is "<conditions>:<good>:<bad>"; split it once and reuse
# the pieces.
_fields = df['allrules'].str.split(':')
df['Rules'] = _fields.str.get(0)
df['good'] = _fields.str.get(1).astype(float)
df['bad'] = _fields.str.get(2).astype(float)

df['all'] = df['bad'] + df['good']
df['bad_rate'] = df['bad'] / df['all']

# Highest bad rate first; the raw rule string is no longer needed.
df = df.sort_values(by='bad_rate', ascending=False).drop(columns='allrules')
七、特征采样
2、根据重要性采样
八、软件安装
pip install dtreeviz
conda install python-graphviz
了解更多数据分析知识、与更多优秀的人一起进群交流请扫码


