Intro
import numpy as np
import numpy.random as rnd
import matplotlib.pyplot as plt
Voting Classifiers
- Good classifiers can be built by aggregating the predictions of several weaker classifiers and returning the class that gets the most votes (a "hard voting" classifier).
heads_proba = 0.51
coin_tosses = (rnd.rand(10000, 10) < heads_proba).astype(np.int32)
cumulative_heads_ratio = np.cumsum(
coin_tosses, axis=0) / np.arange(1, 10001).reshape(-1, 1)
#cumulative_heads_ratio
plt.figure(figsize=(8,3.5))
plt.plot(cumulative_heads_ratio)
plt.plot([0, 10000], [0.51, 0.51], "k--", linewidth=2, label="51%")
plt.plot([0, 10000], [0.5, 0.5], "k-", label="50%")
plt.xlabel("Number of coin tosses")
plt.ylabel("Heads ratio")
plt.legend(loc="lower right")
plt.title("The law of large numbers:")
plt.axis([0, 10000, 0.42, 0.58])
#save_fig("law_of_large_numbers_plot")
plt.show()
# build a voting classifier in Scikit using three weaker classifiers
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
# use moons dataset
X, y = make_moons(
n_samples=500,
noise=0.30,
random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42)
# voting classifier = logistic + random forest + SVC
voting_clf = VotingClassifier(
estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
voting='soft'
)
voting_clf.fit(X_train, y_train)
# let's see how each individual classifier did:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
# voting classifier did better than 3 individual ones!
LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.912
If all classifiers can estimate class probabilities (i.e. they have a predict_proba() method), Scikit-Learn can instead predict the class with the highest probability averaged over all the individual classifiers ("soft voting").
Soft voting is often better than hard voting because it gives more weight to highly confident votes. Use voting="soft" instead of voting="hard" and make sure every classifier can estimate class probabilities. (SVC cannot by default - set its probability param to True.)
probability=True tells SVC to use cross-validation to estimate class probabilities. This slows training but adds a predict_proba() method.
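For comparison, a hard-voting version of the same ensemble is a one-line change - a minimal sketch reusing the classifiers defined above (with hard voting, probability=True on the SVC is no longer needed):
voting_clf_hard = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')  # majority vote over the classes predicted by each classifier
voting_clf_hard.fit(X_train, y_train)
print(accuracy_score(y_test, voting_clf_hard.predict(X_test)))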
Bagging & Pasting
- Another approach: use same training algorithm, but apply it to different subsets of the training dataset.
- bagging: sampling the dataset with replacement.
- pasting: sampling the dataset without replacement.
- Final prediction = based on an aggregation function.
- Predictions can be made in parallel -- good scaling properties.
from sklearn.datasets import make_moons
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
# Train ensemble of 500 Decision Tree classifiers
# each using 100 training instances - randomly sampled from training set
# with replacement.
bag_clf = BaggingClassifier(
DecisionTreeClassifier(random_state=42),
n_estimators=500,
max_samples=100,
bootstrap=True, # set to False for pasting instead of bagging.
n_jobs=-1,
random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.904
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))
0.856
from matplotlib.colors import ListedColormap
def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.5, contour=True):
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0', '#9898ff', '#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(['#7d7d58', '#4c4c7f', '#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha)
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_decision_boundary(tree_clf, X, y)
plt.title("Decision Tree", fontsize=14)
plt.subplot(122)
plot_decision_boundary(bag_clf, X, y)
plt.title("Decision Trees with Bagging", fontsize=14)
#save_fig("decision_tree_without_and_with_bagging_plot")
plt.show()
Out of Bag Evaluation
- Bagging: some instances may be sampled multiple times for a given predictor, others not at all. On average only ~63% of the training instances are sampled for each predictor; the remaining ~37% are its "out-of-bag" (oob) instances. (See the quick check below.)
- use oob_score=True in Scikit to do automatic oob evaluation after training.
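A quick sanity check of the ~63% figure (a small sketch using the np/rnd aliases imported above; the exact ratio varies by run but tends to 1 - 1/e ≈ 0.632 as m grows):
m = 10000
bootstrap_sample = rnd.randint(0, m, m)      # draw m indices with replacement
print(len(np.unique(bootstrap_sample)) / m)  # fraction of distinct instances, roughly 0.63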
# oob_score_: each predictor is evaluated on the training instances it never saw,
# so the score estimates generalization accuracy without a separate validation set.
bag_clf = BaggingClassifier(
DecisionTreeClassifier(),
n_estimators=500,
bootstrap=True,
n_jobs=-1,
oob_score=True
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_
0.89866666666666661
# did oob_score_ do a good job?
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)
0.90400000000000003
# oob decision functionfor each training instance
bag_clf.oob_decision_function_
array([[ 0.36363636,  0.63636364],
       [ 0.38586957,  0.61413043],
       [ 1.        ,  0.        ],
       [ 0.        ,  1.        ],
       ...,
       [ 0.03508772,  0.96491228],
       [ 0.64864865,  0.35135135]])
(output truncated: 375 rows in total, one pair of class probabilities per training instance)
Random Patches - Random Subspaces
- BaggingClassifier also supports feature sampling, controlled by the max_features and bootstrap_features params (the feature-sampling counterparts of max_samples and bootstrap).
- Very useful when handling high-dimensional datasets.
- "Random Patches": sampling both training instances and features.
- "Random Subspaces": keeping all training instances but sampling features. (See the sketch below.)
Random Forests
- RF = ensemble of Decision Trees
- Typically trained via bagging
- RandomForestClassifier: designed for DT classification
- RandomForestRegressor: designed for regression
# A bagging ensemble of 500 Decision Trees, each limited to 16 leaf nodes -
# roughly equivalent to a Random Forest.
# splitter="random" makes each node split on the best of a set of random
# feature/threshold candidates instead of searching for the very best split.
bag_clf = BaggingClassifier(
DecisionTreeClassifier(
splitter="random",
max_leaf_nodes=16,
random_state=42),
n_estimators=500,
max_samples=1.0,
bootstrap=True,
n_jobs=-1,
random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(
n_estimators=500,
max_leaf_nodes=16,
n_jobs=-1,
random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
# almost identical predictions
np.sum(y_pred == y_pred_rf) / len(y_pred)
0.97599999999999998
Feature importance
- important features likely to appear closer to root of tree
- unimportant features likely to appear closer to leaves - if at all.
- Scikit-Learn measures a feature's importance by how much the tree nodes that use it reduce impurity on average across all trees in the forest (a weighted average, where each node's weight is the number of training samples that reach it).
# rank features by importance in iris
# #1: petal length: 44%
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(
n_estimators=500,
n_jobs=-1,
random_state=42)
rnd_clf.fit(iris["data"], iris["target"])
for name, importance in zip(
        iris["feature_names"],
        rnd_clf.feature_importances_):
    print(name, "=", importance)
sepal length (cm) = 0.112492250999
sepal width (cm) = 0.0231192882825
petal length (cm) = 0.441030464364
petal width (cm) = 0.423357996355
rnd_clf.feature_importances_
array([ 0.11249225, 0.02311929, 0.44103046, 0.423358 ])
# Plot the decision boundaries of 15 trees, each trained on a different
# bootstrap sample of the moons training set (emulating bagging by hand).
plt.figure(figsize=(6, 4))
for i in range(15):
    tree_clf = DecisionTreeClassifier(
        max_leaf_nodes=16,
        random_state=42 + i)
    indices_with_replacement = rnd.randint(
        0,
        len(X_train),
        len(X_train))
    tree_clf.fit(
        X[indices_with_replacement],
        y[indices_with_replacement])
    plot_decision_boundary(
        tree_clf, X, y,
        axes=[-1.5, 2.5, -1, 1.5],
        alpha=0.02,
        contour=False)
plt.show()
Boosting - AdaBoost
- One strategy (AdaBoost): pay more attention to the training instances that the predecessor underfitted, forcing new predictors to concentrate on the "hard cases".
- Disadvantage: each predictor depends on its predecessor being trained first, so training is sequential and cannot be parallelized - it doesn't scale as well as bagging or pasting.
# Plot decision boundaries of five predictors on moons dataset
m = len(X_train)
plt.figure(figsize=(11, 4))
for subplot, learning_rate in ((121, 1), (122, 0.5)):
    sample_weights = np.ones(m)
    for i in range(5):
        plt.subplot(subplot)
        svm_clf = SVC(
            kernel="rbf",
            C=0.05)
        svm_clf.fit(
            X_train, y_train,
            sample_weight=sample_weights)
        y_pred = svm_clf.predict(X_train)
        # boost the weights of the instances this predictor got wrong
        sample_weights[y_pred != y_train] *= (1 + learning_rate)
        plot_decision_boundary(
            svm_clf,
            X, y,
            alpha=0.2)
        plt.title("learning_rate = {}".format(learning_rate),
                  fontsize=16)
plt.subplot(121)
plt.text(-0.7, -0.65, "1", fontsize=14)
plt.text(-0.6, -0.10, "2", fontsize=14)
plt.text(-0.5, 0.10, "3", fontsize=14)
plt.text(-0.4, 0.55, "4", fontsize=14)
plt.text(-0.3, 0.90, "5", fontsize=14)
#save_fig("boosting_plot")
plt.show()
# left: the 1st clf misclassifies many instances, so their weights are boosted and the 2nd clf focuses on them, and so on.
# right: same sequence, but learning rate cut in half.
# train AdaBoost classifier on 200 decision stumps (DS)
# DS = decision tree with max_depth=1
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=200,
algorithm="SAMME.R", learning_rate=0.5, random_state=42
)
ada_clf.fit(X_train, y_train)
plot_decision_boundary(ada_clf, X, y)
plt.show()
Boosting - Gradient Boosting
- Similar to AdaBoost (sequentially correcting the predecessors in an ensemble), but instead of tweaking instance weights at each iteration, Gradient Boosting fits each new predictor to the residual errors made by the previous predictor.
from sklearn.tree import DecisionTreeRegressor
# training set: a noisy quadratic function
rnd.seed(42)
X = rnd.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * rnd.randn(100)
# train Regressor
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)
# now train 2nd Regressor using errors made by 1st one.
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)
# now train 3rd Regressor using errors made by 2nd one.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)
X_new = np.array([[0.8]])
# now have ensemble w/ three trees.
y_pred = sum(tree.predict(X_new) for tree in (
tree_reg1, tree_reg2, tree_reg3))
print(y_pred)
[ 0.75026781]
def plot_predictions(
        regressors, X, y, axes,
        label=None,
        style="r-",
        data_style="b.",
        data_label=None):
    x1 = np.linspace(axes[0], axes[1], 500)
    y_pred = sum(
        regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)
    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(x1, y_pred, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)
plt.figure(figsize=(11,11))
plt.subplot(321)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h_1(x_1)$", style="g-", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Residuals and tree predictions", fontsize=16)
plt.subplot(322)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1)$", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Ensemble predictions", fontsize=16)
plt.subplot(323)
plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals")
plt.ylabel("$y - h_1(x_1)$", fontsize=16)
plt.subplot(324)
plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1)$")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.subplot(325)
plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_3(x_1)$", style="g-", data_style="k+")
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=16)
plt.subplot(326)
plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$")
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)
#save_fig("gradient_boosting_plot")
plt.show()
# 1st row: the ensemble has only one tree, so its predictions match the 1st tree's.
# 2nd row: a new tree trained on the 1st tree's residual errors (left); the ensemble's
#          predictions (right) equal the sum of the first two trees' predictions.
# 3rd row: another tree trained on the 2nd tree's residual errors.
# result: the ensemble's predictions improve as trees are added.
- The learning_rate param scales the contribution of each tree. Low values (e.g. 0.1) require more trees in the ensemble to fit the training set, but the predictions usually generalize better. (This regularization technique is called shrinkage.)
# two GBRT ensembles trained with low learning rate
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(
max_depth=2,
n_estimators=3,
learning_rate=0.1,
random_state=42)
gbrt.fit(X, y)
gbrt_slow = GradientBoostingRegressor(
max_depth=2,
n_estimators=200,
learning_rate=0.1,
random_state=42)
gbrt_slow.fit(X, y)
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_predictions(
[gbrt], X, y,
axes=[-0.5, 0.5, -0.1, 0.8],
label="Ensemble predictions")
plt.title("learning_rate={}, n_estimators={}".format(gbrt.learning_rate, gbrt.n_estimators), fontsize=14)
plt.subplot(122)
plot_predictions(
[gbrt_slow], X, y,
axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("learning_rate={}, n_estimators={}".format(gbrt_slow.learning_rate, gbrt_slow.n_estimators), fontsize=14)
#save_fig("gbrt_learning_rate_plot")
plt.show()
# left: not enough trees (underfits)
# right: too many trees (overfits)
- To find the optimal number of trees, use early stopping.
- The staged_predict() method returns an iterator over the ensemble's predictions at each stage of training (1 tree, 2 trees, etc.), so the validation error can be measured after every added tree.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
X_train, X_val, y_train, y_val = train_test_split(X, y)
# train a GBRT regressor with 120 trees
gbrt = GradientBoostingRegressor(
max_depth=2,
n_estimators=120,
learning_rate=0.1,
random_state=42)
gbrt.fit(X_train, y_train)
# measure MSE validation error at each stage
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
errors
[0.05877146809545241,
0.050146609664278821,
0.042693525239940654,
0.036758764317358611,
0.032342621749728441,
0.028407668512271105,
0.024897554253370889,
0.022344405311247584,
0.019535997367701449,
0.017423553892941333,
0.015298227412102105,
0.013614891608372095,
0.01241865401978786,
0.01114950733723946,
0.010131360091843384,
0.0091854704682465919,
0.0085684302891776056,
0.0078525358395017328,
0.0072105819722258777,
0.0067708705683962693,
0.0062415649764643415,
0.0058360573276457243,
0.0053862983457847987,
0.0051345071507873903,
0.0048692096567381805,
0.0045993749990593299,
0.0043550054844811968,
0.0041542481413648245,
0.0039794595160053785,
0.0038058301746231277,
0.0036528925611761264,
0.0035903310836105469,
0.0035078898256137104,
0.0034145667924260869,
0.0033091498103360911,
0.0032216349333429491,
0.0031684358902285465,
0.0031067035318094903,
0.0030811367114601672,
0.0030602631146299077,
0.003000040093686018,
0.0029246869254349805,
0.0028559321605494477,
0.0028308419421558683,
0.0028218777360194264,
0.0027941065824977074,
0.0027733228935542496,
0.0027805517665357811,
0.0027523772234700978,
0.0027297064654860348,
0.0027248578787871292,
0.0027111390401517179,
0.0027041926119007326,
0.0026930464329994377,
0.0027047076934144398,
0.0027194180251317295,
0.0027010027055809748,
0.0026976053707465464,
0.0026946405089738347,
0.0026713744909731395,
0.0026633491003786457,
0.0026694977341077202,
0.0026594592750579836,
0.0026425819418378605,
0.0026524409142755744,
0.0026418897165154491,
0.0026483360802177103,
0.0026456393608631189,
0.0026465080389023671,
0.0026396693211148074,
0.002649273120700455,
0.002643721514468783,
0.0026463988198929221,
0.0026333618213948747,
0.0026314011519099879,
0.0026349113355268257,
0.0026387528659342825,
0.0026345585421650142,
0.0026355886319374901,
0.0026310345391991532,
0.0026519658939712061,
0.0026467700098620557,
0.00264498239665715,
0.0026475491456891486,
0.0026474836942911913,
0.0026530458155365681,
0.0026478335004093052,
0.0026564768881028435,
0.0026574608795571115,
0.0026537575609276061,
0.0026559108292476983,
0.0026528848367343987,
0.0026533895549644779,
0.0026520896622857252,
0.0026416985817433059,
0.0026497886163651938,
0.0026430582537166087,
0.0026548742317473117,
0.002660275592603878,
0.0026582571161537366,
0.0026570823709750535,
0.0026557538081706522,
0.002675470519360824,
0.0026762761989050578,
0.0026742086578626454,
0.0026957941482744232,
0.0026964801899977998,
0.0026939578807501376,
0.0026959742963617757,
0.0026949319702616616,
0.0026988916344244736,
0.0027169473218451121,
0.0027148017926961689,
0.0027192710134859655,
0.0027358435370699618,
0.0027346474658663323,
0.0027351047440069571,
0.0027459941366245631,
0.0027441324932851491,
0.002756368378237764]
# train another GBRT ensemble using the optimal number of trees
# (note: np.argmin returns the index of the lowest error, which corresponds
#  to argmin+1 trees, so strictly this is off by one tree)
best_n_estimators = np.argmin(errors)
min_error = errors[best_n_estimators]
gbrt_best = GradientBoostingRegressor(
max_depth=2,
n_estimators=best_n_estimators,
learning_rate=0.1,
random_state=42)
gbrt_best.fit(X_train, y_train)
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
max_leaf_nodes=None, min_impurity_split=1e-07,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=79, presort='auto',
random_state=42, subsample=1.0, verbose=0, warm_start=False)
plt.figure(figsize=(11, 4))
plt.subplot(121)
plt.plot(errors, "b.-")
plt.plot([best_n_estimators, best_n_estimators], [0, min_error], "k--")
plt.plot([0, 120], [min_error, min_error], "k--")
plt.plot(best_n_estimators, min_error, "ko")
plt.text(best_n_estimators, min_error*1.2, "Minimum", ha="center", fontsize=14)
plt.axis([0, 120, 0, 0.01])
plt.xlabel("Number of trees")
plt.title("Validation error", fontsize=14)
plt.subplot(122)
plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("Best model (55 trees)", fontsize=14)
#save_fig("early_stopping_gbrt_plot")
plt.show()
- Another method: actually stop training early instead of training a large ensemble first and then looking back for the optimal size.
- Implement it with warm_start=True, which tells Scikit-Learn to keep the existing trees when fit() is called again, allowing incremental training.
gbrt = GradientBoostingRegressor(
max_depth=2,
n_estimators=1,
learning_rate=0.1,
random_state=42,
warm_start=True)
min_val_error = float("inf")
error_going_up = 0
# up to 120 estimators:
# stop training when the validation error doesn't improve for
# five consecutive iterations
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping
print(gbrt.n_estimators)
59
Stacking
- Instead of using a simple voting function to aggregate the predictions of an ensemble's predictors, train a model (a "blender", or meta-learner) to do the aggregation.
- Common way to train the blender: use a hold-out set - train the first-layer predictors on one subset, have them predict on the held-out subset, and train the blender on those predictions.
# todo: full stacking implementation (a rough sketch follows)
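A minimal blending sketch along those lines. It recreates the moons split (X_train/y_train were reassigned in the gradient boosting section); the variable names and the choice of base models and blender are illustrative, not a fixed recipe:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X_m, y_m = make_moons(n_samples=500, noise=0.30, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_m, y_m, random_state=42)

# split the training set: one part for the first layer, a hold-out part for the blender
X_layer1, X_hold, y_layer1, y_hold = train_test_split(
    X_tr, y_tr, test_size=0.5, random_state=42)

# first layer: train each base predictor on the first subset
base_clfs = [
    RandomForestClassifier(random_state=42),
    SVC(probability=True, random_state=42),
    LogisticRegression(random_state=42)]
for clf in base_clfs:
    clf.fit(X_layer1, y_layer1)

# blender training set: each base predictor's class-1 probability on the
# hold-out instances becomes one input feature
X_blend = np.column_stack(
    [clf.predict_proba(X_hold)[:, 1] for clf in base_clfs])
blender = LogisticRegression(random_state=42)
blender.fit(X_blend, y_hold)

# to predict: run new instances through the first layer, then through the blender
X_test_blend = np.column_stack(
    [clf.predict_proba(X_te)[:, 1] for clf in base_clfs])
print(accuracy_score(y_te, blender.predict(X_test_blend)))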