1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
| ''' 朴素贝叶斯模型 ''' import pandas as pd import numpy as np from sklearn.datasets import load_iris from collections import defaultdict from sklearn.cross_validation import train_test_split
def load_data():
    """Load the iris dataset and return it as a ``(features, labels)`` pair."""
    iris = load_iris()
    features = iris['data']
    labels = iris['target']
    return features, labels
class NBClassifier(object):
    """Naive Bayes classifier over equal-width-binned numeric features.

    Every feature column is discretized into ``n`` equal-width bins labelled
    ``1..n``.  ``fit`` learns the bin ranges from the training data together
    with the class priors ``P(y)`` and the per-feature conditionals
    ``P(x_i | y)``; ``predict`` reuses those same ranges so that training and
    test samples are discretized consistently.

    Attributes:
        y: distinct class labels seen during ``fit`` (sorted).
        x: for each feature, the distinct bin labels seen during ``fit``.
        py: mapping ``label -> P(label)``.
        pxy: mapping ``label -> {feature index -> [P(bin | label), ...]}``,
            aligned with ``x[feature index]``.
        n: number of bins per feature.
        ranges: per-feature ``(min, max)`` learned by ``fit``; ``None`` until
            the model has been fitted.
    """

    def __init__(self, n=5):
        self.y = []
        self.x = []
        self.py = defaultdict(float)
        self.pxy = defaultdict(dict)
        self.n = n
        self.ranges = None

    def prob(self, element, arr):
        """Return the relative frequency of ``element`` in ``arr``.

        A frequency of zero (including an empty ``arr``) is replaced by the
        smoothing constant ``0.001`` so that products of probabilities never
        collapse to zero.
        """
        total = len(arr)
        if total == 0:
            return 0.001
        hits = sum(1 for item in arr if item == element)
        if hits == 0:
            return 0.001
        return hits / float(total)

    def get_set(self, x, y):
        """Record the distinct class labels and per-feature bin labels.

        The stored lists are rebuilt from scratch on every call so that
        refitting a model does not keep values from an earlier fit.
        """
        self.y = list(np.unique(y))
        self.x = [list(np.unique(x[:, i])) for i in range(x.shape[1])]

    def fit(self, x, y):
        """Train the model on samples ``x`` (2-D) with labels ``y`` (1-D).

        The caller's arrays are never modified.  Prints the accuracy on the
        training data once the probabilities have been estimated.

        Raises:
            ValueError: if ``x`` contains no samples.
        """
        raw = np.array(x, dtype=float)  # private copy of the training data
        y = np.asarray(y)
        if raw.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")

        # Learn the bin ranges once; predict() reuses them for new samples.
        self.ranges = [
            (raw[:, i].min(), raw[:, i].max()) for i in range(raw.shape[1])
        ]
        binned = self.preprocess(raw)
        self.get_set(binned, y)

        # Drop estimates left over from any previous fit.
        self.py = defaultdict(float)
        self.pxy = defaultdict(dict)
        for yi in self.y:
            self.py[yi] = self.prob(yi, y)
        for yi in self.y:
            for i in range(binned.shape[1]):
                sample = binned[y == yi, i]
                self.pxy[yi][i] = [self.prob(xi, sample) for xi in self.x[i]]

        # Score on the raw data: score() performs the binning itself.
        print("train score", self.score(raw, y))

    def predict_one(self, x):
        """Return the most probable label for one already-binned sample.

        Scores are accumulated in log space so that many features cannot
        underflow to zero.  A bin label that was never seen during training
        contributes the smoothing probability ``0.001`` instead of raising.
        Ties are resolved in favour of the label that comes first in
        ``self.y``.
        """
        best_label = self.y[0]
        best_score = -np.inf
        for yi in self.y:
            score = np.log(self.py[yi])
            for i, value in enumerate(x):
                try:
                    position = self.x[i].index(value)
                except ValueError:
                    likelihood = 0.001
                else:
                    likelihood = self.pxy[yi][i][position]
                score += np.log(likelihood)
            if score > best_score:
                best_score = score
                best_label = yi
        return best_label

    def predict(self, samples):
        """Return an array with the predicted label of every row in ``samples``."""
        binned = self.preprocess(samples)
        return np.array([self.predict_one(row) for row in binned])

    def preprocess(self, x):
        """Return a binned copy of the 2-D array ``x``.

        Raw feature values take very many distinct values, which would make
        the probability tables sparse, so every column is discretized into
        ``self.n`` bins.  Once the model is fitted, the ranges learned from
        the training data are used; before that, each column's own minimum
        and maximum are used.  The input array is left untouched.
        """
        data = np.array(x, dtype=float)  # copy: never overwrite caller data
        ranges = getattr(self, "ranges", None)
        for i in range(data.shape[1]):
            if ranges is not None and i < len(ranges):
                lo, hi = ranges[i]
            else:
                lo = hi = None
            data[:, i] = self.step(data[:, i], self.n, lo, hi)
        return data

    def step(self, arr, n, lo=None, hi=None):
        """Discretize ``arr`` into ``n`` equal-width bins labelled ``1..n``.

        Args:
            arr: 1-D sequence of numeric values; it is not modified.
            n: number of bins (must be at least 1).
            lo: lower end of the binned range; defaults to ``min(arr)``.
            hi: upper end of the binned range; defaults to ``max(arr)``.

        Returns:
            A new float array of bin labels.  A value lying exactly on an
            inner edge belongs to the lower bin; values outside
            ``[lo, hi]`` are clamped to the first or last bin.

        Raises:
            ValueError: if ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be at least 1")
        values = np.asarray(arr, dtype=float)
        if values.size == 0:
            return values.copy()
        if lo is None:
            lo = values.min()
        if hi is None:
            hi = values.max()
        # Upper edge of every bin; the first edge >= value selects its bin.
        upper = lo + (hi - lo) * (np.arange(1, n + 1) / float(n))
        index = np.searchsorted(upper, values, side="left")
        # Clamp so rounding in the last edge cannot push the maximum out.
        return np.minimum(index, n - 1) + 1.0

    def score(self, x, y):
        """Return the fraction of samples in ``x`` whose prediction equals ``y``.

        Returns ``0.0`` when ``y`` is empty.
        """
        truth = np.asarray(y)
        if truth.shape[0] == 0:
            return 0.0
        predicted = self.predict(x)
        return np.count_nonzero(predicted == truth) / float(truth.shape[0])
if __name__ == "__main__":
    # Hold out half of the iris samples for evaluation (fixed seed).
    features, labels = load_data()
    split = train_test_split(features, labels, test_size=0.5, random_state=100)
    x_train, x_test, y_train, y_test = split

    # Fit on the training half, then report accuracy on the held-out half.
    clf = NBClassifier()
    clf.fit(x_train, y_train)
    print('test score', clf.score(x_test, y_test))
|