# 第10章 家用电器用户行为分析与事件识别 (Chapter 10: household appliance user behavior analysis and event recognition)
"""
Created on Fri Nov 3 13:09:18 2017
@author: lu
"""
import numpy as np
import pandas as pd
from keras.layers.core import Activation, Dense
from keras.models import Sequential
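"""
programmer_1 --> filter the water-heater records and split them into water-use events
programmer_2 --> search for the best time threshold for splitting events
programmer_3 --> build and train the neural network, then check the model on the test data
programmer_4 --> derive per-event attributes from the records (durations, switch count, water volume, flow rate)
event_num --> count events: difference adjacent record times and compare each gap with the threshold
"""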
def programmer_1(inputfile="data/water_heater.xls",
                 outputfile="tmp/dividsequence.xls",
                 threshold="4 min",
                 data=None):
    """Split water-heater records into numbered water-use events.

    Records whose water flow is not positive are dropped. Among the remaining
    records, a new event starts whenever the time gap to the previous record
    is larger than ``threshold``.

    Args:
        inputfile: Excel file that is read when ``data`` is not supplied.
        outputfile: Excel file the labelled records are written to; pass
            ``None`` to skip writing.
        threshold: Largest gap allowed inside one event; anything accepted by
            ``pd.Timedelta`` (string or Timedelta).
        data: Optional, already loaded records with the columns "发生时间"
            (timestamps in ``%Y%m%d%H%M%S`` form) and "水流量". The frame
            passed in is not modified.

    Returns:
        pandas.DataFrame: the positive-flow records with an added, 1-based
        "事件编号" (event number) column.
    """
    threshold = pd.Timedelta(threshold)
    records = pd.read_excel(inputfile) if data is None else data.copy()
    records[u"发生时间"] = pd.to_datetime(
        records[u"发生时间"], format="%Y%m%d%H%M%S")

    # .copy() makes the filtered result an independent frame, so the column
    # assignment below does not write into a slice of the unfiltered data
    # (the original triggered pandas' SettingWithCopyWarning here).
    records = records[records[u"水流量"] > 0].copy()

    # The first row has no predecessor: its gap is NaT, which compares False,
    # so the first record always belongs to event 1.
    starts_new_event = records[u"发生时间"].diff() > threshold
    records[u"事件编号"] = starts_new_event.cumsum() + 1

    if outputfile is not None:
        records.to_excel(outputfile)
    return records
def programmer_2(inputfile="data/water_heater.xls", n=4, threshold="5 min",
                 data=None):
    """Search for the time threshold used to split records into events.

    Candidate thresholds run from 1 to 8.75 minutes in 0.25-minute steps. For
    each candidate the number of resulting events is counted, the slope of
    that count between neighbouring candidates is taken, and the absolute
    slope is averaged over a trailing window of ``n`` candidates. The
    candidate ``n`` positions before the window with the smallest average is
    selected. If it exceeds ``threshold`` the fixed value of 4 minutes is
    used instead.

    Args:
        inputfile: Excel file that is read when ``data`` is not supplied.
        n: Window length (in candidates) of the slope average.
        threshold: Upper bound for an acceptable result; anything accepted by
            ``pd.Timedelta``.
        data: Optional, already loaded records with the columns "发生时间"
            (timestamps in ``%Y%m%d%H%M%S`` form) and "水流量". The frame
            passed in is not modified.

    Returns:
        pandas.Timedelta: the selected threshold (it is also printed, as the
        original did).
    """
    step = 0.25  # minutes between candidates; also the slope denominator
    threshold = pd.Timedelta(threshold)

    records = pd.read_excel(inputfile) if data is None else data.copy()
    records[u"发生时间"] = pd.to_datetime(
        records[u"发生时间"], format="%Y%m%d%H%M%S")
    records = records[records[u"水流量"] > 0]
    # The gaps between consecutive records do not depend on the candidate,
    # so compute them once instead of once per candidate.
    gaps = records[u"发生时间"].diff()

    dt = [pd.Timedelta(minutes=float(i)) for i in np.arange(1, 9, step)]
    h = pd.DataFrame({u"阈值": dt})

    def event_num(ts):
        """Number of events when a gap longer than ``ts`` starts a new one."""
        return int((gaps > ts).sum()) + 1

    h[u"事件数"] = [event_num(candidate) for candidate in dt]
    h[u"斜率"] = h[u"事件数"].diff() / step
    h[u"斜率指标"] = h[u"斜率"].abs().rolling(n).mean()

    # The rolling mean is a trailing window, so step back n rows from the
    # minimum to reach the candidate at the start of the flattest stretch.
    flattest = h[u"斜率指标"].idxmin()
    ts = h.loc[flattest - n, u"阈值"]
    if ts > threshold:
        ts = pd.Timedelta(minutes=4)
    print(ts)
    return ts
def programmer_3():
    """Train a small feed-forward network and write its test predictions.

    Reads the training and test spreadsheets, uses column 4 as the binary
    label and columns 5..16 as the features, fits a 17-10-1 dense network
    (ReLU, ReLU, sigmoid) with binary cross-entropy and Adam, saves the
    weights to ``tmp/net.model`` and writes the first five test columns next
    to the predicted class to ``tmp/test_output_data.xls``.

    Returns:
        numpy.ndarray: the true labels of the test set (column 4 of the test
        spreadsheet).
    """
    inputfile1 = "data/train_neural_network_data.xls"
    inputfile2 = "data/test_neural_network_data.xls"
    testoutputfile = "tmp/test_output_data.xls"

    data_train = pd.read_excel(inputfile1)
    data_test = pd.read_excel(inputfile2)

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    y_train = data_train.iloc[:, 4].values
    x_train = data_train.iloc[:, 5:17].values
    y_test = data_test.iloc[:, 4].values
    x_test = data_test.iloc[:, 5:17].values

    # Take the input width from the data instead of hard-coding 11, so the
    # first layer always matches however many columns the 5:17 slice yields.
    n_features = x_train.shape[1]

    model = Sequential()
    model.add(Dense(17, input_shape=(n_features, )))
    model.add(Activation("relu"))
    # Only the first layer needs an input shape; later layers infer theirs.
    model.add(Dense(10))
    model.add(Activation("relu"))
    model.add(Dense(1))
    model.add(Activation("sigmoid"))

    # NOTE(review): the original also passed sample_weight_mode="binary".
    # The documented values are None and "temporal", and no sample weights
    # are supplied to fit(), so the argument is dropped here.
    model.compile(loss="binary_crossentropy", optimizer="adam")
    # `epochs` is the Keras 2 name of the former `nb_epoch` argument.
    model.fit(x_train, y_train, epochs=100, batch_size=1)
    model.save_weights("tmp/net.model")

    # Threshold the sigmoid output at 0.5. This is the documented replacement
    # for Sequential.predict_classes() on a single-unit sigmoid model, and it
    # uses the prediction the original computed and then discarded.
    predicted = (model.predict(x_test) > 0.5).astype("int32")
    r = pd.DataFrame(predicted, columns=[u"预测结果"])
    pd.concat([data_test.iloc[:, :5], r], axis=1).to_excel(testoutputfile)
    return y_test
def _summarize_water_event(g, dt):
    """Return the attribute row (a dict) for the records ``g`` of one event."""
    times = g[u"发生时间"]
    tdiff = times.diff()

    # A pair of adjacent records counts as one switch when exactly one of the
    # two is in the "关" state, i.e. the two-row rolling sum equals 1. The
    # comparison is cast to int so the rolling sum runs on a numeric dtype.
    is_off = (g[u"开关机状态"] == u"关").astype(int)
    switch_count = int((is_off.rolling(2).sum() == 1).sum())

    total_volume = g[u"水流量"].sum()

    if len(times) == 1:
        use_minutes = dt.total_seconds() / 60
    else:
        # Formula kept exactly as in the original: the summed gaps minus half
        # of the first and half of the last gap.
        # NOTE(review): for an event with exactly two records this is zero,
        # which makes the average flow below infinite - confirm intent.
        use_minutes = (tdiff.sum() - tdiff.iloc[1] / 2 -
                       tdiff.iloc[-1] / 2).total_seconds() / 60

    # The original divided two pandas columns, which yields inf for a zero
    # duration; keep that result without raising or warning.
    if use_minutes == 0:
        mean_flow = float("inf")
    else:
        mean_flow = total_volume / use_minutes

    return {
        u"用水事件时长(M)": (dt + times.max() - times.min()).total_seconds() / 60,
        u"开关机切换次数": switch_count,
        u"总用水量(L)": total_volume,
        u"总用水时长(Min)": use_minutes,
        u"平均水流量(L/min)": mean_flow,
    }


def programmer_4(inputfile="data/water_heater.xls",
                 outputfile="tmp/attribute_extract.xls",
                 threshold="4 min",
                 data=None):
    """Extract one row of attributes per water-use event.

    Records whose water flow is not positive are dropped and the rest are
    grouped into events (a gap larger than ``threshold`` starts a new event).
    For every event the event duration, the number of on/off switches, the
    total water volume, the total water-use duration and the average flow
    are computed.

    Args:
        inputfile: Excel file that is read when ``data`` is not supplied.
        outputfile: Excel file the attribute table is written to; pass
            ``None`` to skip writing.
        threshold: Largest gap allowed inside one event; anything accepted by
            ``pd.Timedelta``.
        data: Optional, already loaded records with the columns "发生时间"
            (timestamps in ``%Y%m%d%H%M%S`` form), "开关机状态" and "水流量".
            The frame passed in is not modified.

    Returns:
        pandas.DataFrame: one row per event, in event order, indexed 0..k-1.
    """
    columns = [
        u"用水事件时长(M)",
        u"开关机切换次数",
        u"总用水量(L)",
        u"总用水时长(Min)",
        u"平均水流量(L/min)",
    ]
    threshold = pd.Timedelta(threshold)
    # 2 s is added to every event duration and is the duration assigned to a
    # single-record event - presumably the sampling interval; TODO confirm.
    dt = pd.Timedelta(seconds=2)

    records = pd.read_excel(inputfile) if data is None else data.copy()
    records[u"发生时间"] = pd.to_datetime(
        records[u"发生时间"], format="%Y%m%d%H%M%S")
    # .copy() so that adding the event-number column does not write into a
    # slice of the unfiltered frame (SettingWithCopyWarning in the original).
    records = records[records[u"水流量"] > 0].copy()
    starts_new_event = records[u"发生时间"].diff() > threshold
    records[u"事件编号"] = starts_new_event.cumsum() + 1

    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole table on every iteration.
    rows = [_summarize_water_event(g, dt)
            for _, g in records.groupby(u"事件编号")]
    result = pd.DataFrame(rows, columns=columns)

    if outputfile is not None:
        result.to_excel(outputfile)
    return result
# Script entry point. Running this module directly currently does nothing:
# none of the programmer_N() steps is called from here.
if __name__ == "__main__":
    pass