Get Dataset & Create Workspace

import os
import tarfile
from six.moves import urllib  # Python 2/3 compat; on Python 3 alone, "import urllib.request" works directly

import pandas as pd
import numpy as np
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
def fetch_housing_data(
    housing_url=HOUSING_URL, 
    housing_path=HOUSING_PATH):

    # create datasets/housing directory if needed
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)

    tgz_path = os.path.join(housing_path, "housing.tgz")

    # retrieve tarfile
    urllib.request.urlretrieve(housing_url, tgz_path)

    # extract tarfile & close path
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
def load_housing_data(
    housing_path=HOUSING_PATH):

    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
# do it
#fetch_housing_data() -- already downloaded - static dataset
housing = load_housing_data()

Data structure - quick peek

housing.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY

So... what's in the dataset?

# housing is a Pandas DataFrame.
# untouched datafile: 20640 records, 10 cols (9 float, 1 text)
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
# let's see if ocean_proximity can be lumped into categories:
housing['ocean_proximity'].value_counts()
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
# summary statistics & percentiles for each numeric feature
housing.describe()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
# feature histograms
%matplotlib inline
import matplotlib.pyplot as plt

housing.hist(bins=50, figsize=(20,15))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f4a792b5438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f4a75f1c2e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f4a75f39d68>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f4a75eaf7b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f4a75e7acc0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f4a75e40438>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f4a75e0a860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f4a75dce198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f4a75d1d1d0>]], dtype=object)

[figure: histograms of each numeric feature, 50 bins]

Create a test set

# split dataset into training (80%) and test (20%) subsets

import numpy as np

def split_train_test(
    data, test_ratio):

    shuffled_indices = np.random.permutation(len(data))

    test_set_size = int(len(data) * test_ratio)

    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]

    return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")
16512 train + 4128 test
# create a method for ensuring consistent test sets across multiple runs
# (a new test set will never contain instances from a previous training set.)

# example method:
# compute a hash of each instance's identifier
# keep only the last byte of the hash
# include the instance in the test set if that byte is < 51 (~20% of 256)

import hashlib

def test_set_check(
    identifier, test_ratio, hash):

    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(
    data, test_ratio, id_column, hash=hashlib.md5):

    ids = data[id_column]
    in_test_set = ids.apply(
        lambda id_: test_set_check(
            id_, test_ratio, hash))

    return data.loc[~in_test_set], data.loc[in_test_set]
# the housing dataset doesn't have an ID attribute,
# so let's add an index column to it.

housing_with_id = housing.reset_index()

train_set, test_set = split_train_test_by_id(
    housing_with_id, 0.2, "index")
train_set.head()
index longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
6 6 -122.25 37.84 52.0 2535.0 489.0 1094.0 514.0 3.6591 299200.0 NEAR BAY
test_set.head()
index longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
4 4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
5 5 -122.25 37.85 52.0 919.0 213.0 413.0 193.0 4.0368 269700.0 NEAR BAY
11 11 -122.26 37.85 52.0 3503.0 752.0 1504.0 734.0 3.2705 241800.0 NEAR BAY
20 20 -122.27 37.85 40.0 751.0 184.0 409.0 166.0 1.3578 147500.0 NEAR BAY
23 23 -122.27 37.84 52.0 1688.0 337.0 853.0 325.0 2.1806 99700.0 NEAR BAY
# a better index:
# use longitude & latitude to build a stable identifier

housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]

train_set, test_set = split_train_test_by_id(
    housing_with_id, 0.2, "id")
train_set.head()
index longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity id
0 0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY -122192.12
1 1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY -122182.14
2 2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY -122202.15
3 3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY -122212.15
4 4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY -122212.15
test_set.head()
index longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity id
8 8 -122.26 37.84 42.0 2555.0 665.0 1206.0 595.0 2.0804 226700.0 NEAR BAY -122222.16
10 10 -122.26 37.85 52.0 2202.0 434.0 910.0 402.0 3.2031 281500.0 NEAR BAY -122222.15
11 11 -122.26 37.85 52.0 3503.0 752.0 1504.0 734.0 3.2705 241800.0 NEAR BAY -122222.15
12 12 -122.26 37.85 52.0 2491.0 474.0 1098.0 468.0 3.0750 213500.0 NEAR BAY -122222.15
13 13 -122.26 37.84 52.0 696.0 191.0 345.0 174.0 2.6736 191300.0 NEAR BAY -122222.16
# another option: scikit-learn splitters

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    housing, test_size=0.2, random_state=42)
test_set.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
20046 -119.01 36.06 25.0 1505.0 NaN 1392.0 359.0 1.6812 47700.0 INLAND
3024 -119.46 35.14 30.0 2943.0 NaN 1565.0 584.0 2.5313 45800.0 INLAND
15663 -122.44 37.80 52.0 3830.0 NaN 1310.0 963.0 3.4801 500001.0 NEAR BAY
20484 -118.72 34.28 17.0 3051.0 NaN 1705.0 495.0 5.7376 218600.0 <1H OCEAN
9814 -121.93 36.62 34.0 2351.0 NaN 1063.0 428.0 3.7250 278000.0 NEAR OCEAN
train_set.head()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
14196 -117.03 32.71 33.0 3126.0 627.0 2300.0 623.0 3.2596 103000.0 NEAR OCEAN
8267 -118.16 33.77 49.0 3382.0 787.0 1314.0 756.0 3.8125 382100.0 NEAR OCEAN
17445 -120.48 34.66 4.0 1897.0 331.0 915.0 336.0 4.1563 172600.0 NEAR OCEAN
14265 -117.11 32.69 36.0 1421.0 367.0 1418.0 355.0 1.9425 93400.0 NEAR OCEAN
2271 -119.80 36.78 43.0 2382.0 431.0 874.0 380.0 3.5542 96500.0 INLAND
# does the sampling plan introduce a sampling bias?
# each stratum in the test dataset should mimic reality

housing['median_income'].hist(bins=5)
<matplotlib.axes._subplots.AxesSubplot at 0x7f15f7250588>

[figure: histogram of median_income, 5 bins]

housing.describe()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000
housing["income_cat"]=np.ceil(housing["median_income"]/1.5)

housing["income_cat"].where(housing["income_cat"]<5, 5.0, inplace=True)
housing.describe()
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value income_cat
count 20640.000000 20640.000000 20640.000000 20640.000000 20433.000000 20640.000000 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081 537.870553 1425.476744 499.539680 3.870671 206855.816909 3.006686
std 2.003532 2.135952 12.585558 2181.615252 421.385070 1132.462122 382.329753 1.899822 115395.615874 1.054618
min -124.350000 32.540000 1.000000 2.000000 1.000000 3.000000 1.000000 0.499900 14999.000000 1.000000
25% -121.800000 33.930000 18.000000 1447.750000 296.000000 787.000000 280.000000 2.563400 119600.000000 2.000000
50% -118.490000 34.260000 29.000000 2127.000000 435.000000 1166.000000 409.000000 3.534800 179700.000000 3.000000
75% -118.010000 37.710000 37.000000 3148.000000 647.000000 1725.000000 605.000000 4.743250 264725.000000 4.000000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000000 15.000100 500001.000000 5.000000
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(
    n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set  = housing.loc[test_index]
# review income category proportions
housing["income_cat"].value_counts() / len(housing)
3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64
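To verify the stratification worked, compare those overall proportions against the stratified test set's. This quick check is an addition to the original run (income_cat is still present on both sets at this point):

compare = pd.DataFrame({
    "overall":    housing["income_cat"].value_counts() / len(housing),
    "stratified": strat_test_set["income_cat"].value_counts() / len(strat_test_set),
})
compare["% error"] = 100 * (compare["stratified"] / compare["overall"] - 1)
compare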
# remove income_cat attribute (return dataset to original state)
# (loop variable named set_ to avoid shadowing the built-in set)

for set_ in (strat_train_set, strat_test_set):
    set_.drop(["income_cat"], axis=1, inplace=True)

Visualization

housing = strat_train_set.copy()

# first: basic geographic distribution

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f15f71c4668>

[figure: scatter plot of district locations, longitude vs. latitude]

# next: housing prices.
# color = price 
# radius = population
# use predefined "jet" color map 

housing.plot(
    kind="scatter", 
    x="longitude", 
    y="latitude", 
    alpha=0.4,
    #s=housing["population"].apply(lambda n: n/100), 
    s=housing["population"]/100,
    label="population",
    c="median_house_value", 
    cmap=plt.get_cmap("jet"), 
    colorbar=True,
)
plt.legend()
<matplotlib.legend.Legend at 0x7f15f5eff1d0>

[figure: housing prices map - marker size = population, color = median_house_value (jet colormap)]

Correlations

# next: look for correlations to median house value.

corr_matrix = housing.corr()

corr_matrix['median_house_value'].sort_values(ascending=False)
median_house_value    1.000000
median_income         0.687160
total_rooms           0.135097
housing_median_age    0.114110
households            0.064506
total_bedrooms        0.047689
population           -0.026920
longitude            -0.047432
latitude             -0.142724
Name: median_house_value, dtype: float64
# another way of looking for correlations: scatter_matrix
# focus on the target plus the top 3 factors from above

# (note: in pandas >= 0.20, this import moved to pandas.plotting)
from pandas.tools.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]

scatter_matrix(housing[attributes], figsize=(12, 8))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f15f5fc20f0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f5eda860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f602f898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f5ff2860>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f15f5f7dd68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f5e836d8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f460e710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f45d50b8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f15f45224e0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f454ab38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f449ec88>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f44e5898>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f15f44380b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f4450be0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f43c2da0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f15f43960f0>]], dtype=object)

[figure: scatter matrix of the four selected attributes]

# median_income vs. median_house_value looks the most promising.
# let's zoom in.

housing.plot(
    kind="scatter", x="median_income", y="median_house_value",
    alpha=0.1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f15f41b1a90>

[figure: median_income vs. median_house_value scatter, alpha=0.1]

# combine some attributes to create more useful ones
# then rebuild the correlation matrix.

housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

corr_matrix = housing.corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

# *** NOTE: the book's rooms_per_household corr shows more improvement (~0.199)
# than our 0.146. Root cause not yet identified. ***
median_house_value          1.000000
median_income               0.687160
rooms_per_household         0.146285
total_rooms                 0.135097
housing_median_age          0.114110
households                  0.064506
total_bedrooms              0.047689
population_per_household   -0.021985
population                 -0.026920
longitude                  -0.047432
latitude                   -0.142724
bedrooms_per_room          -0.259984
Name: median_house_value, dtype: float64

Data Cleanup

# revert to clean copy of stratified training dataset
# separate predictors from labels

housing = strat_train_set.drop("median_house_value", axis=1)

housing_labels = strat_train_set["median_house_value"].copy()
# 'total bedrooms' has some missing values - fix
# can use DataFrame dropna(), drop(), fillna()

# use a Scikit-Learn class to handle missing values
# (note: in sklearn >= 0.20, Imputer was replaced by sklearn.impute.SimpleImputer)
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
# drop ocean_proximity attribute, since it's non-numeric.
# then fit to training data.

housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)
# now what do we have?
imputer.statistics_
array([ -118.51  ,    34.26  ,    29.    ,  2119.5   ,   433.    ,
        1164.    ,   408.    ,     3.5409])
housing_num.median().values
array([ -118.51  ,    34.26  ,    29.    ,  2119.5   ,   433.    ,
        1164.    ,   408.    ,     3.5409])
# update training set by replacing missing values with learned medians
X = imputer.transform(housing_num)
pd.DataFrame(X, columns=housing_num.columns).info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 8 columns):
longitude             16512 non-null float64
latitude              16512 non-null float64
housing_median_age    16512 non-null float64
total_rooms           16512 non-null float64
total_bedrooms        16512 non-null float64
population            16512 non-null float64
households            16512 non-null float64
median_income         16512 non-null float64
dtypes: float64(8)
memory usage: 1.0 MB
# convert ocean_proximity feature to numbers using LabelEncoder.

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

housing_cat = housing['ocean_proximity']
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded
array([0, 0, 4, ..., 1, 0, 3])
# how is 'ocean_proximity' mapped?
print(encoder.classes_)
['<1H OCEAN' 'INLAND' 'ISLAND' 'NEAR BAY' 'NEAR OCEAN']
# a better solution for categorical data: one-hot encoding

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

# output = SciPy sparse matrix, better for memory usage
# if you need a dense NumPy array, call toarray()

housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot
<16512x5 sparse matrix of type '<class 'numpy.float64'>'
    with 16512 stored elements in Compressed Sparse Row format>
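As the comment above notes, toarray() gives the dense view when needed, a quick illustration:

housing_cat_1hot.toarray()    # dense (16512, 5) NumPy array, one column per category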

Label Binarization:

  • A shortcut (text categories => integer categories => one-hot vectors)
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()

housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot
array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ..., 
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

Custom Transformers:

  • Create your own using Scikit-Learn base classes
  • implement fit(), transform() and fit_transform() methods
  • (fit_transform comes for free by using TransformerMixin as a base class.)
from sklearn.base import BaseEstimator, TransformerMixin

# column indices (total_rooms, total_bedrooms, population, households)
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs

        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self # nothing else to do

    def transform(self, X, y=None):
        rooms_per_household      = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, 
                         rooms_per_household, 
                         population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, 
                         rooms_per_household, 
                         population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)

housing_extra_attribs = attr_adder.transform(housing.values)

Feature Scaling

  • Min-max scaling (normalization) = shift & rescale to [0,1]
  • Scikit-Learn's MinMaxScaler will do this for you.
  • Standardization subtracts the mean & divides by the standard deviation - result has zero mean and unit variance
  • Scikit-Learn's StandardScaler does this for you. (Quick sketch below.)
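A minimal sketch of both scalers, run on the imputed numeric array X from earlier (illustrative only - the real project uses the pipeline below):

from sklearn.preprocessing import MinMaxScaler, StandardScaler

housing_minmax   = MinMaxScaler().fit_transform(X)    # every feature shifted/rescaled into [0, 1]
housing_standard = StandardScaler().fit_transform(X)  # every feature: zero mean, unit variance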

Pipelining

  • Scikit-Learn's Pipeline class helps to standardize the sequence of transforms you need for your project.
  • A Pipeline = a list of named estimator steps. All but the last must be transformers (i.e. they must have a fit_transform() method.)
# "DataFrameSelector" is a custom transformer class.
# grabs the specified feature, drops the rest, converts the DF into a NumPy array.

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__ (self, attribute_names):
        self.attribute_names = attribute_names

    def fit (self, X, y=None):
        return self

    def transform (self, X):
        return X[self.attribute_names].values
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

num_pipeline = Pipeline([
    ('selector',      DataFrameSelector(num_attribs)),
    ('imputer',       Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler',    StandardScaler()),
    ])

cat_pipeline = Pipeline([
    ('selector',      DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list =[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)
])
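One caveat, hedged: on Scikit-Learn 0.19+, LabelBinarizer.fit_transform() no longer accepts the extra y argument that Pipeline passes along, so cat_pipeline raises a TypeError there. A minimal wrapper (the name and code are mine, not the book's) restores compatibility:

from sklearn.base import BaseEstimator, TransformerMixin

class PipelineFriendlyLabelBinarizer(BaseEstimator, TransformerMixin):
    # delegates to LabelBinarizer but tolerates Pipeline's fit(X, y) call signature
    def fit(self, X, y=None):
        self.encoder_ = LabelBinarizer().fit(X)
        return self

    def transform(self, X):
        return self.encoder_.transform(X)

# drop-in replacement inside cat_pipeline:
#     ('label_binarizer', PipelineFriendlyLabelBinarizer()),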
# let's try it out:

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ..., 
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])
housing_prepared.shape
(16512, 16)
  • Note: pip3 install sklearn-pandas => gets a DataFrameMapper class
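A rough sketch of how DataFrameMapper could replace the selector-per-pipeline pattern above (assumes sklearn-pandas is installed; the column handling is illustrative, not verified against a specific version):

from sklearn_pandas import DataFrameMapper

mapper = DataFrameMapper([
    (num_attribs, [Imputer(strategy="median"), StandardScaler()]),  # numeric columns: impute, then scale
    ('ocean_proximity', LabelBinarizer()),                          # single text column: one-hot encode
])
# housing_mapped = mapper.fit_transform(housing)  # -> NumPy array, like FeatureUnion above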

Model Selection & Training

# let's start with a linear regression

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
# first try. NOT very accurate.

some_data          = housing.iloc[:5]
some_labels        = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print ("predictions:\t", lin_reg.predict(some_data_prepared))
print ("labels:\t", list(some_labels))
predictions:     [ 210644.60459286  317768.80697211  210956.43331178   59218.98886849
  189747.55849879]
labels:     [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
# why? look at RMSE on whole training set.

from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse             = mean_squared_error(housing_labels, housing_predictions)
lin_rmse            = np.sqrt(lin_mse)

print ("typical prediction error:\t", lin_rmse)
typical prediction error:     68628.1981985
# Hmmm. Not good. Underfit situation. 
# Let's try a more powerful model, like a Decision Tree.

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')
# Zero error? No way...

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)

print ("typical prediction error:\t", tree_rmse)
typical prediction error:     0.0
# Use K-fold cross-validation
# Train & eval Decision Tree model against 10 splits of training dataset
# Returns 10 evaluation scores.

from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    tree_reg, 
    housing_prepared, 
    housing_labels,
    scoring="neg_mean_squared_error", 
    cv=10)

rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(rmse_scores)
Scores: [ 69368.62190153  66248.56520386  72284.6557095   68417.57732406
  70049.44916939  74941.75765797  70236.59348749  69466.63688954
  76140.22952307  70217.59755116]
Mean: 70737.1684418
Standard deviation: 2815.58298405
# So, Decision Tree cross-val RMSE: mean ~70737, stdev ~2816 (still sucks.)
# compare to earlier Linear Regression:

lin_scores = cross_val_score(
    lin_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10)

lin_rmse_scores = np.sqrt(-lin_scores)

display_scores(lin_rmse_scores)
Scores: [ 66782.73843989  66960.118071    70347.95244419  74739.57052552
  68031.13388938  71193.84183426  64969.63056405  68281.61137997
  71552.91566558  67665.10082067]
Mean: 69052.4613635
Standard deviation: 2731.6740018
# Yep: the Decision Tree is overfitting so badly it scores slightly worse
# than plain Linear Regression (LR RMSE mean ~69052, stdev ~2732).
# Let's try a RandomForest.

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10)

forest_rmse_scores = np.sqrt(-forest_scores)

display_scores(forest_rmse_scores)
Scores: [ 52480.82629458  50035.41358467  53747.69332484  55053.95194112
  51800.65152945  55919.01705209  52226.75176017  50912.82366116
  55708.47271341  51931.81080304]
Mean: 52981.7412665
Standard deviation: 1929.32402243
# OK, RandomForest is noticeably better.
# RMSE mean ~52982, stdev ~1929

Fine-Tuning Model with Grid Search of Hyperparameters

from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 
     'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], # bootstrap = True = default setting
     'n_estimators': [3, 10], 
     'max_features': [2, 3, 4]},
]

forest_reg  = RandomForestRegressor()

grid_search = GridSearchCV(
    forest_reg, 
    param_grid, 
    cv=5,
    scoring = 'neg_mean_squared_error')

grid_search.fit(housing_prepared, housing_labels)
GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'max_features': [2, 4, 6, 8], 'n_estimators': [3, 10, 30]}, {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)
# Best combination of parameters?
# (n_estimators=30 sits at the grid's edge, so searching higher values might do even better.)
grid_search.best_params_
{'max_features': 6, 'n_estimators': 30}
# Best estimator?
grid_search.best_estimator_
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
# Evaluation scores:

cvres = grid_search.cv_results_

for mean_score, params in zip(cvres["mean_test_score"],
                              cvres["params"]):
    print(np.sqrt(-mean_score), params)
63492.9975584 {'max_features': 2, 'n_estimators': 3}
55677.1037862 {'max_features': 2, 'n_estimators': 10}
52917.801725 {'max_features': 2, 'n_estimators': 30}
60442.2787178 {'max_features': 4, 'n_estimators': 3}
53209.7111283 {'max_features': 4, 'n_estimators': 10}
50621.1191846 {'max_features': 4, 'n_estimators': 30}
58591.8196313 {'max_features': 6, 'n_estimators': 3}
52353.3606044 {'max_features': 6, 'n_estimators': 10}
49838.3807 {'max_features': 6, 'n_estimators': 30}
58615.6100561 {'max_features': 8, 'n_estimators': 3}
51726.2593734 {'max_features': 8, 'n_estimators': 10}
50074.3050139 {'max_features': 8, 'n_estimators': 30}
62010.5215854 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54852.7770725 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60246.2164711 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52752.4109521 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
58355.1846204 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
51724.6800894 {'bootstrap': False, 'max_features': 4, 'n_estimators': 10}
# best solution:
# max_features = 6, n_estimators = 30 (RMSE ~49838)
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
array([  8.00229340e-02,   7.13499357e-02,   4.21346911e-02,
         1.73340009e-02,   1.55694906e-02,   1.76527489e-02,
         1.56813711e-02,   3.21068169e-01,   7.54675530e-02,
         1.07645094e-01,   5.74608930e-02,   1.47327045e-02,
         1.57310792e-01,   9.20951468e-05,   2.63317542e-03,
         3.84435167e-03])
# display feature "importance" scores next to their names:

extra_attribs       = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes          = num_attribs + extra_attribs + cat_one_hot_attribs

sorted(zip(feature_importances, attributes), reverse=True)
[(0.32106816893273865, 'median_income'),
 (0.15731079177984286, 'INLAND'),
 (0.10764509417315272, 'pop_per_hhold'),
 (0.080022934000105003, 'longitude'),
 (0.075467553036607335, 'rooms_per_hhold'),
 (0.071349935674308126, 'latitude'),
 (0.057460893036370447, 'bedrooms_per_room'),
 (0.04213469106714228, 'housing_median_age'),
 (0.017652748894983483, 'population'),
 (0.017334000890698829, 'total_rooms'),
 (0.015681371107232313, 'households'),
 (0.015569490624941605, 'total_bedrooms'),
 (0.014732704544371122, '<1H OCEAN'),
 (0.0038443516681782959, 'NEAR OCEAN'),
 (0.00263317542255579, 'NEAR BAY'),
 (9.2095146771177451e-05, 'ISLAND')]

Time to Eval the System on the Test Dataset

final_model = grid_search.best_estimator_

X_test          = strat_test_set.drop("median_house_value", axis=1)
y_test          = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
47574.62166586089
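A single number hides how uncertain this estimate is; here is a rough 95% confidence interval for the generalization RMSE (an extra step, not part of the original run - standard scipy.stats usage):

from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))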
