编程实现基于信息熵/基尼指数划分选择的决策树算法
手动建立一个csv文件
#csv的内容为
Idx,color,root,knocks,texture,navel,touch,density,sugar_ratio,label
1,dark_green,curl_up,little_heavily,distinct,sinking,hard_smooth,0.697,0.46,1
2,black,curl_up,heavily,distinct,sinking,hard_smooth,0.774,0.376,1
3,black,curl_up,little_heavily,distinct,sinking,hard_smooth,0.634,0.264,1
4,dark_green,curl_up,heavily,distinct,sinking,hard_smooth,0.608,0.318,1
5,light_white,curl_up,little_heavily,distinct,sinking,hard_smooth,0.556,0.215,1
6,dark_green,little_curl_up,little_heavily,distinct,little_sinking,soft_stick,0.403,0.237,1
7,black,little_curl_up,little_heavily,little_blur,little_sinking,soft_stick,0.481,0.149,1
8,black,little_curl_up,little_heavily,distinct,little_sinking,hard_smooth,0.437,0.211,1
9,black,little_curl_up,heavily,little_blur,little_sinking,hard_smooth,0.666,0.091,0
10,dark_green,stiff,clear,distinct,even,soft_stick,0.243,0.267,0
11,light_white,stiff,clear,blur,even,hard_smooth,0.245,0.057,0
12,light_white,curl_up,little_heavily,blur,even,soft_stick,0.343,0.099,0
13,dark_green,little_curl_up,little_heavily,little_blur,sinking,hard_smooth,0.639,0.161,0
14,light_white,little_curl_up,heavily,little_blur,sinking,hard_smooth,0.657,0.198,0
15,black,little_curl_up,little_heavily,distinct,little_sinking,soft_stick,0.36,0.37,0
16,light_white,curl_up,little_heavily,blur,even,hard_smooth,0.593,0.042,0
17,dark_green,curl_up,heavily,little_blur,little_sinking,hard_smooth,0.719,0.103,0
 
代码
import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn import tree
from matplotlib import pyplot as plt
import graphviz
import os     
from IPython.core.interactiveshell import InteractiveShell
# Make notebooks echo EVERY top-level expression (so the bare `graph` /
# `graph_gini` expressions at the bottom of the script actually render),
# not just the last one in a cell.
InteractiveShell.ast_node_interactivity = "all"
# Put the Graphviz `dot` binary on PATH so graphviz.Source can render.
# Raw string: the original '\P' / '\G' were invalid escape sequences
# (SyntaxWarning on Python 3.12+, error in future versions); r'...' keeps
# the backslashes literal and evaluates to the exact same path.
os.environ["PATH"] += os.pathsep + r'C:\Program Files\Graphviz\bin'
def is_number(n):
    """Return True if *n* parses as a float, excluding NaN.

    Used by loadData() to decide whether a CSV cell should be kept as a
    continuous feature (float) or as a categorical string.

    Args:
        n: a string cell read from the CSV.

    Returns:
        bool: True for finite/infinite numerics ("0.697", "-3", "inf"),
        False for non-numeric text and for "nan".
    """
    try:
        num = float(n)
    except ValueError:
        # Not parseable as float -> categorical value.
        return False
    # NaN is the only float unequal to itself; deliberately reject it so a
    # literal "nan" cell is treated as categorical text, not a number.
    return num == num
def loadData(filename):
    """Read the watermelon CSV into parallel feature/label lists.

    Column 0 (Idx) is skipped, the last column is the class label, and
    every column in between becomes a feature.  Numeric-looking cells are
    converted to float so DictVectorizer passes them through as continuous
    features instead of one-hot encoding them.

    Args:
        filename: path to the CSV file (first row must be the header).

    Returns:
        (featureList, labelList): a list of {header: value} dicts and the
        parallel list of label strings.
    """
    featureList = []  # one {column header: value} dict per data row
    labelList = []    # class label (last column) of each data row

    # `with` guarantees the handle is closed (the original opened the file
    # and never closed it).  newline='' is the csv-module recommendation.
    with open(filename, 'r', encoding='utf-8', newline='') as data:
        reader = csv.reader(data)
        headers = next(reader)  # first row holds the column names

        for row in reader:
            labelList.append(row[-1])
            rowDict = {}
            # Skip the Idx column (index 0) and the label column (last).
            for i in range(1, len(row) - 1):
                rowDict[headers[i]] = float(row[i]) if is_number(row[i]) else row[i]
            featureList.append(rowDict)

    return featureList, labelList
def createDTree_information(featureList,labelList):
    """Fit an information-entropy decision tree and return its Graphviz source.

    Args:
        featureList: list of {header: value} dicts (mixed categorical/numeric).
        labelList:   parallel list of class-label strings ('0'/'1').

    Returns:
        graphviz.Source: renderable DOT description of the fitted tree.
    """
    # One-hot encode the categorical features; floats pass through unchanged.
    vectorizer = DictVectorizer()
    encoded_features = vectorizer.fit_transform(featureList).toarray()

    # Binarize the string labels into a 0/1 column vector.
    binarizer = preprocessing.LabelBinarizer()
    encoded_labels = binarizer.fit_transform(labelList)

    # Information-gain (entropy) splitting criterion.
    classifier = tree.DecisionTreeClassifier(criterion='entropy')
    classifier = classifier.fit(encoded_features, encoded_labels)

    dot_source = tree.export_graphviz(
        classifier,
        feature_names=vectorizer.get_feature_names_out(),
        class_names=['0','1'],
        out_file=None,
        filled=True,
        rounded=True,
    )
    return graphviz.Source(dot_source)
def createDTree_GiNi(featureList,labelList):
    """Fit a Gini-index decision tree and return its Graphviz source.

    Args:
        featureList: list of {header: value} dicts (mixed categorical/numeric).
        labelList:   parallel list of class-label strings ('0'/'1').

    Returns:
        graphviz.Source: renderable DOT description of the fitted tree.
    """
    # One-hot encode the categorical features; floats pass through unchanged.
    vec = DictVectorizer()
    dummyX = vec.fit_transform(featureList).toarray()

    # Binarize the string labels into a 0/1 column vector.
    lb = preprocessing.LabelBinarizer()
    dummyY = lb.fit_transform(labelList)

    # BUG FIX: the original passed criterion='entropy' here, so the "Gini"
    # tree was identical to the entropy tree.  'gini' is what this function
    # is named for and documents.
    clf = tree.DecisionTreeClassifier(criterion='gini')
    clf = clf.fit(dummyX, dummyY)
    target_name = ['0','1']

    dot_data = tree.export_graphviz(
        clf,
        feature_names=vec.get_feature_names_out(),
        class_names=target_name,
        out_file=None,
        filled=True,
        rounded=True,
    )
    graph = graphviz.Source(dot_data)
    return graph
# Load the hand-made watermelon data set and build both trees.
# NOTE(review): 'watermelo.csv' looks like a typo for 'watermelon.csv' —
# confirm it matches the file name actually created on disk.
featureList,labelList = loadData('watermelo.csv')
graph = createDTree_information(featureList,labelList)
graph_gini = createDTree_GiNi(featureList,labelList)
print('以基尼指数作为划分准则的决策树')
# Bare expressions only render because ast_node_interactivity = "all" was
# set above; in a plain (non-IPython) script these two lines are no-ops.
graph_gini 
print('以信息熵作为划分准则的决策树')
graph
 
注意,前面导包的时候path的值是Graphviz的bin文件夹路径,这个Graphviz要手动去官网离线下载,然后记住它的安装位置
输出:以基尼指数作为划分准则的决策树
以信息熵作为划分准则的决策树




















