# Chapter 6: Automatic identification of electricity theft and leakage users
from random import shuffle
import matplotlib.pyplot as plt
import pandas as pd
from keras.layers.core import Activation, Dense
from keras.models import Sequential
from scipy.interpolate import lagrange
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.tree import DecisionTreeClassifier
"""
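cm_plot --> custom confusion-matrix visualization
programmer_1 --> fill missing values using Lagrange interpolation
programmer_2 --> build a CART decision-tree model, predict and report the training result, and plot the ROC curve
programmer_3 --> build a neural-network model, predict and report the training result, and plot the ROC curve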
"""
def cm_plot(y, yp):
    """Draw a confusion matrix as an annotated heat map.

    Args:
        y: True labels.
        yp: Predicted labels, in the same order as ``y``.

    Returns:
        The ``matplotlib.pyplot`` module with the figure drawn on it, so the
        caller can chain ``.show()``.
    """
    cm = confusion_matrix(y, yp)
    plt.matshow(cm, cmap=plt.cm.Greens)
    plt.colorbar()
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # matshow puts the row index on the vertical axis and the column
            # index on the horizontal axis, so the count for cm[row, col]
            # belongs at xy=(col, row); using (row, col) would transpose the
            # numbers relative to the coloured cells and the axis labels.
            plt.annotate(
                cm[row, col],
                xy=(col, row),
                horizontalalignment='center',
                verticalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt
def programmer_1():
    """Fill the missing cells of ``data/missing_data.xls`` by interpolation.

    The sheet is read without a header row.  Every missing cell is replaced
    by the value of the Lagrange polynomial through the non-missing
    neighbours in the same column, and the completed table is written to
    ``tmp/missing_data_processed.xls`` without header or index.
    """
    inputfile = 'data/missing_data.xls'
    outputfile = 'tmp/missing_data_processed.xls'
    data = pd.read_excel(inputfile, header=None)

    def ployinterp_column(s, n, k=5):
        """Interpolate column ``s`` at row label ``n``.

        Args:
            s: The column (a ``pandas.Series``) being repaired.
            n: Row label of the missing value.
            k: Number of rows taken before and after ``n``.

        Returns:
            The Lagrange polynomial through the non-missing neighbours,
            evaluated at ``n``.
        """
        neighbours = list(range(n - k, n)) + list(range(n + 1, n + 1 + k))
        # Near the top or bottom of the column some of these labels do not
        # exist.  reindex() yields NaN for them (dropped on the next line),
        # whereas list-based ``s[...]`` raises KeyError in current pandas.
        y = s.reindex(neighbours)
        y = y[y.notnull()]
        return lagrange(y.index, list(y))(n)

    for i in data.columns:
        # Locate the gaps once per column instead of rebuilding the null
        # mask for every row.  Filling an earlier row never changes whether
        # a later row is missing, so the set of rows visited is the same.
        missing_rows = data.index[data[i].isnull()]
        for j in missing_rows:
            # .at writes into the frame itself; the chained form
            # ``data[i][j] = ...`` may assign to a temporary copy.
            # data[i] is re-read each time so that values filled earlier in
            # this column take part in later interpolations.
            data.at[j, i] = ployinterp_column(data[i], j)
    data.to_excel(outputfile, header=False, index=False)
def programmer_2():
    """Train a CART decision tree on ``data/model.xls`` and evaluate it.

    The rows are shuffled; the first 80% form the training split and the
    rest the test split.  Columns 0-2 are used as features and column 3 as
    the label.  The fitted tree is saved to ``tmp/tree.pkl``, the confusion
    matrix of the training split is shown, then the ROC curve of the test
    split is shown and its thresholds are printed.
    """
    datafile = 'data/model.xls'
    # sample(frac=1) returns every row in random order.  random.shuffle()
    # must not be used on a 2-D NumPy array: its element swap operates on
    # row views, which silently duplicates some rows and loses others.
    # ``.values`` replaces DataFrame.as_matrix(), removed in pandas 1.0.
    data = pd.read_excel(datafile).sample(frac=1).values

    p = 0.8  # fraction of rows used for training
    split = int(len(data) * p)
    train = data[:split, :]
    test = data[split:, :]

    treefile = 'tmp/tree.pkl'
    tree = DecisionTreeClassifier()
    tree.fit(train[:, :3], train[:, 3])
    joblib.dump(tree, treefile)

    # Confusion matrix on the training split.
    cm_plot(train[:, 3], tree.predict(train[:, :3])).show()

    # ROC curve on the held-out split, scored with the probability of the
    # second class returned by predict_proba.
    fpr, tpr, thresholds = roc_curve(
        test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1)
    plt.plot(fpr, tpr, linewidth=2, label='ROC of CART', color='green')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.ylim(0, 1.05)
    plt.xlim(0, 1.05)
    plt.legend(loc=4)
    plt.show()
    print(thresholds)
def programmer_3():
    """Train a small neural network on ``data/model.xls`` and evaluate it.

    The rows are shuffled; the first 80% form the training split and the
    rest the test split.  Columns 0-2 are used as features and column 3 as
    the label.  The network (3 inputs -> 10 ReLU units -> 1 sigmoid unit)
    is trained for 100 epochs with batch size 1 and its weights are saved
    to ``tmp/net.model``.  The confusion matrix of the training split is
    shown, then the ROC curve of the test split is shown and its thresholds
    are printed.
    """
    datafile = 'data/model.xls'
    # sample(frac=1) returns every row in random order.  random.shuffle()
    # must not be used on a 2-D NumPy array: its element swap operates on
    # row views, which silently duplicates some rows and loses others.
    # ``.values`` replaces DataFrame.as_matrix(), removed in pandas 1.0.
    data = pd.read_excel(datafile).sample(frac=1).values

    p = 0.8  # fraction of rows used for training
    split = int(len(data) * p)
    train = data[:split, :]
    test = data[split:, :]

    netfile = 'tmp/net.model'
    net = Sequential()
    net.add(Dense(10, input_shape=(3, )))
    net.add(Activation('relu'))
    # The input size of a non-first layer is taken from the previous layer.
    net.add(Dense(1))
    net.add(Activation('sigmoid'))
    # sample_weight_mode="binary" was dropped: it is not a valid mode
    # (only None or "temporal" are), so the default is what was in effect.
    net.compile(loss='binary_crossentropy', optimizer='adam')
    net.fit(train[:, :3], train[:, 3], epochs=100, batch_size=1)
    net.save_weights(netfile)

    # predict() returns probabilities as an (n, 1) array rather than a flat
    # length-n vector, hence the reshape.  Thresholding at 0.5 gives the
    # class labels, which is what the removed Sequential.predict_classes()
    # did for a single sigmoid output.
    train_proba = net.predict(train[:, :3])
    predict_result = (train_proba > 0.5).astype('int32').reshape(len(train))
    cm_plot(train[:, 3], predict_result).show()

    # ROC curve on the held-out split, scored with the raw probabilities.
    predict_result = net.predict(test[:, :3]).reshape(len(test))
    fpr, tpr, thresholds = roc_curve(test[:, 3], predict_result, pos_label=1)
    plt.plot(fpr, tpr, linewidth=2, label='ROC of LM')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.ylim(0, 1.05)
    plt.xlim(0, 1.05)
    plt.legend(loc=4)
    plt.show()
    print(thresholds)
if __name__ == "__main__":
    # Nothing runs by default; call programmer_1(), programmer_2() or
    # programmer_3() here to execute the corresponding step.
    pass