Chapter 5: Mining and Modeling
"""
Created on Fri Oct 20 16:06:09 2017
@author: wnma3
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from keras.layers import Activation, Dense
from keras.models import Sequential
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression as LR
# RandomizedLogisticRegression was deprecated in scikit-learn 0.19 and removed
# in 0.21, so this import needs scikit-learn < 0.21; a modern alternative is
# sketched after programmer_1 below.
from sklearn.linear_model import RandomizedLogisticRegression as RLR
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import export_graphviz
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
# statsmodels.tsa.arima_model was removed in statsmodels 0.13; the replacement
# class lives in statsmodels.tsa.arima.model and takes order= as a keyword.
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller as ADF
"""
programmer_1-->使用随机森林算出有效特征,使用线性回归计算相关系数
programmer_2-->使用决策数模型,生成决策树过程并保存为dot文件,天气、周末、促销决定销量
programmer_3-->使用Keras神经网络模型,训练数据预测销量高低
cm_plot-->自定义混淆矩阵可视化
density_plot-->自定义概率密度图函数
programmer_4-->使用KMeans聚类,做可视化操作(概率密度图)
programmer_5-->继programmer_4将数据做降维处理,并且可视化不同聚类的类别
programmer_6-->进行白噪声、平稳性检测,建立ARIMA(0, 1, 1)模型预测之后五天的结果
programmer_7-->使用Kmeans聚类之后,画出散点图,标记离群点
find_rule-->寻找关联规则的函数
connect_string-->自定义连接函数,用于实现L_{k-1}到C_k的连接
programmer_8-->菜单中各个菜品的关联程度
"""

def programmer_1():
    filename = "data/bankloan.xls"
    data = pd.read_excel(filename)
    # .as_matrix() was removed in pandas 1.0; .values is the equivalent
    x = data.iloc[:, :8].values
    y = data.iloc[:, 8].values

    # Randomized logistic regression scores feature stability; get_support()
    # flags the columns worth keeping ('违约' is the label column)
    rlr = RLR()
    rlr.fit(x, y)
    rlr_support = rlr.get_support()
    support_col = data.drop('违约', axis=1).columns[rlr_support]
    print("rlr_support_columns: {columns}".format(columns=','.join(support_col)))

    # Refit a plain logistic regression on the selected features only
    x = data[support_col].values
    lr = LR()
    lr.fit(x, y)
    print("lr: {score}".format(score=lr.score(x, y)))

def programmer_2():
    inputfile = "data/sales_data.xls"
    data = pd.read_excel(inputfile, index_col=u'序号')

    # Recode the categorical answers as 1/-1: yes/high/good -> 1, else -> -1
    data[data == u'是'] = 1
    data[data == u'高'] = 1
    data[data == u'好'] = 1
    data[data != 1] = -1
    x = data.iloc[:, :3].values.astype(int)
    y = data.iloc[:, 3].values.astype(int)

    dtc = DTC()
    dtc.fit(x, y)

    # Export the fitted tree; use the real column names so the .dot is readable
    with open("tree.dot", "w") as f:
        export_graphviz(dtc, feature_names=data.columns[:3], out_file=f)
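
# Hedged usage note: the tree.dot file written above can be rendered with the
# Graphviz toolchain, assuming it is installed, e.g. from the shell
#   dot -Tpng tree.dot -o tree.png
# or through the optional graphviz Python package:
#   import graphviz
#   with open("tree.dot") as f:
#       graphviz.Source(f.read()).render("tree", format="png")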

def programmer_3():
    inputfile = "data/sales_data.xls"
    data = pd.read_excel(inputfile, index_col=u'序号')

    # Recode the categorical answers as 1/0 this time (sigmoid output)
    data[data == u'好'] = 1
    data[data == u'是'] = 1
    data[data == u'高'] = 1
    data[data != 1] = 0
    x = data.iloc[:, :3].values.astype(int)
    y = data.iloc[:, 3].values.astype(int)

    # A small feed-forward network: 3 inputs -> 10 relu units -> 1 sigmoid unit
    model = Sequential()
    model.add(Dense(input_dim=3, units=10))
    model.add(Activation('relu'))
    model.add(Dense(input_dim=10, units=1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    model.fit(x, y, epochs=1000, batch_size=10)
    # predict_classes() was removed in newer Keras; thresholding the sigmoid
    # output at 0.5 is what it did for binary models
    yp = (model.predict(x) > 0.5).astype(int).reshape(len(y))
    def cm_plot(y, yp):
        cm = confusion_matrix(y, yp)
        plt.matshow(cm, cmap=plt.cm.Greens)
        plt.colorbar()
        # matshow draws row i of cm at vertical position i, so cell (i, j)
        # is annotated at xy=(j, i); the loop variables no longer shadow y/yp
        for i in range(len(cm)):
            for j in range(len(cm)):
                plt.annotate(
                    cm[i, j],
                    xy=(j, i),
                    horizontalalignment='center',
                    verticalalignment='center')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        return plt

    cm_plot(y, yp).show()
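
# scikit-learn >= 1.0 ships an equivalent plot out of the box; a minimal
# sketch, assuming that version is available:
def cm_plot_sklearn(y, yp):
    from sklearn.metrics import ConfusionMatrixDisplay
    # Computes the confusion matrix and draws the annotated heatmap in one call
    ConfusionMatrixDisplay.from_predictions(y, yp, cmap=plt.cm.Greens)
    return plt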

def programmer_4():
    inputfile = 'data/consumption_data.xls'
    outputfile = 'tmp/data_type.xls'
    """
    k: number of clusters
    iteration: maximum number of KMeans iterations
    model.labels_: cluster label of each sample
    model.cluster_centers_: cluster centers
    """
    k = 3
    iteration = 500

    data = pd.read_excel(inputfile, index_col='Id')
    # Standardize each column to zero mean and unit variance
    data_zs = 1.0 * (data - data.mean()) / data.std()

    # n_jobs was removed from KMeans in scikit-learn 1.0, so it is not passed
    model = KMeans(n_clusters=k, max_iter=iteration)
    model.fit(data_zs)

    # Per-cluster sample counts next to the cluster centers
    r1 = pd.Series(model.labels_).value_counts()
    r2 = pd.DataFrame(model.cluster_centers_)
    r = pd.concat([r2, r1], axis=1)
    r.columns = list(data.columns) + [u'类别数目']
    print(r)

    # Attach the cluster label to every original record and save it
    r = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1)
    r.columns = list(data.columns) + [u'聚类类别']
    r.to_excel(outputfile)
    def density_plot(data, k):
        # One KDE subplot per feature column
        p = data.plot(kind='kde', linewidth=2, subplots=True, sharex=False)
        [p[i].set_ylabel('density') for i in range(k)]
        plt.legend()
        return plt

    # One density figure per cluster
    pic_output = 'tmp/pd_'
    for i in range(k):
        density_plot(data[r[u'聚类类别'] == i],
                     k).savefig(u'%s%s.png' % (pic_output, i))
    return data_zs, r
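
# programmer_4 fixes k = 3; a hedged sketch of one common sanity check for
# that choice, using the silhouette coefficient (closer to 1 is better):
def choose_k(data_zs, k_range=range(2, 7)):
    from sklearn.metrics import silhouette_score

    for k in k_range:
        labels = KMeans(n_clusters=k, max_iter=500).fit_predict(data_zs)
        # Mean silhouette over all samples for this number of clusters
        print(k, silhouette_score(data_zs, labels))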

def programmer_5(data_zs, r):
    # Embed the standardized data in two dimensions for plotting
    tsne = TSNE()
    tsne.fit_transform(data_zs)
    tsne = pd.DataFrame(tsne.embedding_, index=data_zs.index)

    # One marker style per cluster label
    for label, style in enumerate(['r.', 'go', 'b*']):
        d = tsne[r[u'聚类类别'] == label]
        plt.plot(d[0], d[1], style)
    plt.show()
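
# Hedged note: t-SNE is stochastic, so the scatter plot above changes from
# run to run; passing a seed makes it reproducible, e.g. TSNE(random_state=105).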

def programmer_6():
    """
    Notes on warnings seen when running this function:
    # UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
      caused by calling plt.show() several times; drawing into subplots avoids it
    # RuntimeWarning: overflow encountered in exp
      limited numeric precision during the ARIMA fits

    forecastnum --> number of days to forecast
    plot_acf() --> autocorrelation plot
    plot_pacf() --> partial autocorrelation plot
    """
    discfile = 'data/arima_data.xls'
    forecastnum = 5
    data = pd.read_excel(discfile, index_col=u'日期')

    # ACF of the raw series, plus an ADF stationarity test
    fig = plt.figure(figsize=(8, 6))
    ax1 = plt.subplot(411)
    fig = plot_acf(data, ax=ax1)
    print('ADF test on the original series:', ADF(data[u'销量']))

    # First-order differencing usually removes the trend
    D_data = data.diff().dropna()
    D_data.columns = [u'销量差分']
    D_data.plot()
    plt.show()

    # ACF and PACF of the differenced series
    fig = plt.figure(figsize=(8, 6))
    ax2 = plt.subplot(412)
    fig = plot_acf(D_data, ax=ax2)
    ax3 = plt.subplot(414)
    fig = plot_pacf(D_data, ax=ax3)
    plt.show()
    fig.clf()

    print('ADF test on the differenced series:', ADF(D_data[u'销量差分']))
    print('Ljung-Box white-noise test on the differenced series:',
          acorr_ljungbox(D_data, lags=1))

    data[u'销量'] = data[u'销量'].astype(float)

    # Grid-search p and q (d fixed at 1) and keep the pair with the lowest BIC
    pmax = int(len(D_data) / 10)
    qmax = int(len(D_data) / 10)
    bic_matrix = []
    data.dropna(inplace=True)

    import warnings
    warnings.filterwarnings('error')
    for p in range(pmax + 1):
        tmp = []
        for q in range(qmax + 1):
            try:
                tmp.append(ARIMA(data, order=(p, 1, q)).fit().bic)
            except Exception:
                # some (p, q) pairs fail to converge; skip them
                tmp.append(None)
        bic_matrix.append(tmp)
    bic_matrix = pd.DataFrame(bic_matrix)

    p, q = bic_matrix.stack().idxmin()
    print('p and q with the smallest BIC: %s, %s' % (p, q))
    model = ARIMA(data, order=(p, 1, q)).fit()
    print(model.summary())
    print(model.forecast(forecastnum))
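
# Hedged follow-up: the same Ljung-Box test used on the differenced series can
# be pointed at the fitted model's residuals to confirm they look like white
# noise (a large p-value means no autocorrelation is left), e.g.
#   print(acorr_ljungbox(model.resid, lags=[1]))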

def programmer_7():
    """
    k: number of clusters
    threshold: outlier threshold on the relative distance
    iteration: maximum number of KMeans iterations
    """
    inputfile = 'data/consumption_data.xls'
    k = 3
    threshold = 2
    iteration = 500
    data = pd.read_excel(inputfile, index_col='Id')
    # Standardize, then cluster (n_jobs was removed from KMeans in scikit-learn 1.0)
    data_zs = 1.0 * (data - data.mean()) / data.std()
    model = KMeans(n_clusters=k, max_iter=iteration)
    model.fit(data_zs)

    r = pd.concat(
        [data_zs, pd.Series(model.labels_, index=data.index)], axis=1)
    r.columns = list(data.columns) + [u'聚类类别']

    # Distance of every sample to its own cluster center, scaled by the
    # cluster's median distance; points beyond `threshold` count as outliers
    norm = []
    for i in range(k):
        norm_tmp = r[['R', 'F', 'M']][r[u'聚类类别'] == i] - \
            model.cluster_centers_[i]
        norm_tmp = norm_tmp.apply(np.linalg.norm, axis=1)
        norm.append(norm_tmp / norm_tmp.median())
    norm = pd.concat(norm)

    norm[norm <= threshold].plot(style='go')
    discrete_points = norm[norm > threshold]
    discrete_points.plot(style='ro')

    # Annotate each outlier with its id and relative distance
    for i in range(len(discrete_points)):
        _id = discrete_points.index[i]
        n = discrete_points.iloc[i]
        plt.annotate('(%s, %0.2f)' % (_id, n), xy=(_id, n), xytext=(_id, n))
    plt.xlabel('id')
    plt.ylabel('relative distance')
    plt.show()

def connect_string(x, ms):
    """Join step of Apriori: build candidate C_k from the frequent set L_{k-1}.

    Two (k-1)-itemsets are joined when they share all but their last item.
    """
    x = list(map(lambda i: sorted(i.split(ms)), x))
    n = len(x[0])
    r = []
    for i in range(len(x)):
        for j in range(i, len(x)):
            if x[i][:n - 1] == x[j][:n - 1] and x[i][n - 1] != x[j][n - 1]:
                r.append(x[i][:n - 1] + sorted([x[j][n - 1], x[i][n - 1]]))
    return r
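
# A quick worked example of the join step with hypothetical items a, b, c:
# connect_string(['a--b', 'a--c', 'b--c'], '--') pairs the itemsets that share
# their first item and returns [['a', 'b', 'c']] as the candidate 3-itemset.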

def find_rule(d, support, confidence, ms=u'--'):
    """Apriori-style search over the 0-1 matrix d for rules whose support and
    confidence exceed the given thresholds; ms separates items in rule names.
    """
    result = pd.DataFrame(index=['support', 'confidence'])

    # Support of the single items; keep only the frequent ones
    support_series = 1.0 * d.sum() / len(d)
    column = list(support_series[support_series > support].index)
    k = 0

    while len(column) > 1:
        k = k + 1
        print('\nSearch pass %s...' % k)
        column = connect_string(column, ms)
        print('Number of candidates: %s...' % len(column))

        def sf(i):
            # An itemset is present in a row iff all its item columns are 1
            return d[i].prod(axis=1, numeric_only=True)

        d_2 = pd.DataFrame(
            list(map(sf, column)), index=[ms.join(i) for i in column]).T

        support_series_2 = 1.0 * \
            d_2[[ms.join(i) for i in column]].sum() / len(d)
        column = list(
            support_series_2[support_series_2 > support].index)
        # Series.append was removed in pandas 2.0; concatenate instead
        support_series = pd.concat([support_series, support_series_2])

        # Every rotation of an itemset gives one candidate rule A --> b
        column2 = []
        for i in column:
            i = i.split(ms)
            for j in range(len(i)):
                column2.append(i[:j] + i[j + 1:] + i[j:j + 1])

        confidence_series = pd.Series(
            index=[ms.join(i) for i in column2], dtype='float64')
        for i in column2:
            # confidence(A --> b) = support(A + b) / support(A)
            confidence_series[ms.join(i)] = support_series[ms.join(
                sorted(i))] / support_series[ms.join(i[:len(i) - 1])]

        for i in confidence_series[confidence_series > confidence].index:
            result[i] = 0.0
            result.loc['confidence', i] = confidence_series[i]
            result.loc['support', i] = support_series[ms.join(sorted(i.split(ms)))]

    result = result.T.sort_values(
        ['confidence', 'support'], ascending=False)
    print('\nResult:')
    print(result)
    return result

def programmer_8():
    inputfile = 'data/menu_orders.xls'
    outputfile = 'tmp/apriori_rules.xls'
    data = pd.read_excel(inputfile, header=None)

    print('\nConverting the raw orders to a 0-1 matrix...')

    def ct(x):
        # One row per order: 1 under every dish that appears in it
        return pd.Series(1, index=x[pd.notnull(x)])

    b = map(ct, data.values)
    data = pd.DataFrame(list(b)).fillna(0)
    print('\nDone.')
    del b

    support = 0.2
    confidence = 0.5
    ms = '---'
    find_rule(data, support, confidence, ms).to_excel(outputfile)
if __name__ == "__main__":
pass