Intro

  • Dimensionality reduction is lossy. It may speed up training but can degrade result quality, and it makes pipelines more complex. Try training on the original data first before considering dimensionality reduction.
  • Very useful for visualization (2D & 3D representations are more intuitive).

  • Two main approaches: projection, manifold learning.

  • Three most popular techniques: PCA, Kernel PCA, LLE.

Curse of Dimensionality

  • Many things behave differently in high-D space.

1) Most points in a high-D hypercube sit very close to a border.

2) Distances between random points are much greater, so high-D datasets are at high risk of being very sparse. Average distance between two random points in a unit hypercube:

  • In 2D: ~0.52
  • In 3D: ~0.66
  • In 1,000,000D: ~408.25 (≈ sqrt(1000000/6)) — see the quick check below.
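
A quick Monte Carlo sanity check of those averages (a sketch; exact numbers vary slightly with sample size, and 1,000D stands in for 1,000,000D to keep it cheap):

import numpy as np

rng = np.random.default_rng(42)

def avg_distance(d, n_pairs=10_000):
    # average Euclidean distance between two uniform random points
    # in the d-dimensional unit hypercube
    a = rng.random((n_pairs, d))
    b = rng.random((n_pairs, d))
    return np.linalg.norm(a - b, axis=1).mean()

for d in (2, 3, 1000):
    print(d, avg_distance(d), np.sqrt(d / 6))  # sqrt(d/6): large-d approximation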

Approaches: Projection

  • Training instances are usually not spread uniformly across all dimensions; they're concentrated in a few, so most of what's learnable lies in a low-D subspace.
import numpy as np
import numpy.random as rnd
# build a 3D dataset

rnd.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * rnd.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * rnd.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * rnd.randn(m)

# mean-normalize the data
X = X - X.mean(axis=0)

# apply PCA to reduce to 2D
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)

# recover 3D points projected on 2D plane
X2D_inv = pca.inverse_transform(X2D)
# utility to draw 3D arrows
from matplotlib.patches import FancyArrowPatch
from mpl_toolkits.mplot3d import proj3d

# note: this recipe relies on older Matplotlib internals (renderer.M);
# on recent Matplotlib versions you may need self.axes.M instead.
class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))
        FancyArrowPatch.draw(self, renderer)

# express plane as function of x,y
axes = [-1.8, 1.8, -1.3, 1.3, -1.0, 1.0]

x1s = np.linspace(axes[0], axes[1], 10)
x2s = np.linspace(axes[2], axes[3], 10)
x1, x2 = np.meshgrid(x1s, x2s)

C = pca.components_
R = C.T.dot(C)
z = (R[0, 2] * x1 + R[1, 2] * x2) / (1 - R[2, 2])
# plot 3D dataset, plane & projections

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

X3D_above = X[X[:, 2] > X2D_inv[:, 2]]
X3D_below = X[X[:, 2] <= X2D_inv[:, 2]]

ax.plot(X3D_below[:, 0], X3D_below[:, 1], X3D_below[:, 2], "bo", alpha=0.5)

ax.plot_surface(x1, x2, z, alpha=0.2, color="k")
print(np.linalg.norm(C, axis=1))  # sanity check: each principal component is unit-norm
ax.add_artist(Arrow3D([0, C[0, 0]],[0, C[0, 1]],[0, C[0, 2]], mutation_scale=15, lw=1, arrowstyle="-|>", color="k"))
ax.add_artist(Arrow3D([0, C[1, 0]],[0, C[1, 1]],[0, C[1, 2]], mutation_scale=15, lw=1, arrowstyle="-|>", color="k"))
ax.plot([0], [0], [0], "k.")

for i in range(m):
    if X[i, 2] > X2D_inv[i, 2]:
        ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], "k-")
    else:
        ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], "-", color="#505050")

ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], "k+")
ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], "k.")
ax.plot(X3D_above[:, 0], X3D_above[:, 1], X3D_above[:, 2], "bo")
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])

#save_fig("dataset_3d_plot")
plt.show()

[figure: 3D dataset with projection plane & projections]

# 2D projection equivalent:
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax.plot(X2D[:, 0], X2D[:, 1], "k+")
ax.plot(X2D[:, 0], X2D[:, 1], "k.")
ax.plot([0], [0], "ko")
ax.arrow(0, 0, 0, 1, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')
ax.arrow(0, 0, 1, 0, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')
ax.set_xlabel("$z_1$", fontsize=18)
ax.set_ylabel("$z_2$", fontsize=18, rotation=0)
ax.axis([-1.5, 1.3, -1.2, 1.2])
ax.grid(True)
plt.show()

[figure: 2D projection onto the plane]

Approaches: Manifolds

  • A manifold is a shape that locally resembles a lower-D hyperplane but can be bent/twisted in higher-D space.
  • ex: "Swiss roll" problem
# Swiss roll visualization:
from sklearn.datasets import make_swiss_roll
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)

axes = [-11.5, 14, -2, 23, -12, 15]

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=plt.cm.hot)
ax.view_init(10, -70)
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])

#save_fig("swiss_roll_plot")
plt.show()

[figure: Swiss roll dataset]

# "squashed" swiss roll visualization:
plt.figure(figsize=(11, 4))

plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=t, cmap=plt.cm.hot)
plt.axis(axes[:4])
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$x_2$", fontsize=18, rotation=0)
plt.grid(True)

plt.subplot(122)
plt.scatter(t, X[:, 1], c=t, cmap=plt.cm.hot)
plt.axis([4, 15, axes[2], axes[3]])
plt.xlabel("$z_1$", fontsize=18)
plt.grid(True)

#save_fig("squished_swiss_roll_plot")
plt.show()

[figure: squashed vs. unrolled Swiss roll]


axes = [-11.5, 14, -2, 23, -12, 15]

x2s = np.linspace(axes[2], axes[3], 10)
x3s = np.linspace(axes[4], axes[5], 10)
x2, x3 = np.meshgrid(x2s, x3s)

fig = plt.figure(figsize=(6, 5))
ax = plt.subplot(111, projection='3d')

positive_class = X[:, 0] > 5
X_pos = X[positive_class]
X_neg = X[~positive_class]

ax.view_init(10, -70)
ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], "y^")
ax.plot_wireframe(np.full_like(x2, 5), x2, x3, alpha=0.5)  # vertical plane x1 = 5
ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], "gs")
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])

#save_fig("manifold_decision_boundary_plot1")
plt.show()

fig = plt.figure(figsize=(5, 4))
ax = plt.subplot(111)

plt.plot(t[positive_class], X[positive_class, 1], "gs")
plt.plot(t[~positive_class], X[~positive_class, 1], "y^")
plt.axis([4, 15, axes[2], axes[3]])
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)
plt.grid(True)

#save_fig("manifold_decision_boundary_plot2")
plt.show()

fig = plt.figure(figsize=(6, 5))
ax = plt.subplot(111, projection='3d')

positive_class = 2 * (t[:] - 4) > X[:, 1]
X_pos = X[positive_class]
X_neg = X[~positive_class]
ax.view_init(10, -70)
ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], "y^")
ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], "gs")
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])

#save_fig("manifold_decision_boundary_plot3")
plt.show()

fig = plt.figure(figsize=(5, 4))
ax = plt.subplot(111)

plt.plot(t[positive_class], X[positive_class, 1], "gs")
plt.plot(t[~positive_class], X[~positive_class, 1], "y^")
plt.plot([4, 15], [0, 22], "b-", linewidth=2)
plt.axis([4, 15, axes[2], axes[3]])
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)
plt.grid(True)

#save_fig("manifold_decision_boundary_plot4")
plt.show()

# Lesson learned (below):
# Unrolling a dataset to a lower dimension doesn't necessarily lead to
# a simpler representation.

[figures: manifold decision boundary plots 1-4]

PCA (Principal Component Analysis)

  • Most popular dimensionality reduction algorithm.
  • 1) Finds the hyperplane that lies closest to the data.
  • 2) Projects the data onto it.

Preserving Variance

  • Below: a simple 2D dataset projected onto 3 different 1D axes.
  • The projection onto the solid line preserves the maximum variance, and is therefore least likely to lose information.
angle = np.pi / 5
stretch = 5
m = 200

rnd.seed(3)
X = rnd.randn(m, 2) / 10
X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch
X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate

u1 = np.array([np.cos(angle), np.sin(angle)])
u2 = np.array([np.cos(angle - 2 * np.pi/6), np.sin(angle - 2 * np.pi/6)])
u3 = np.array([np.cos(angle - np.pi/2), np.sin(angle - np.pi/2)])

X_proj1 = X.dot(u1.reshape(-1, 1))
X_proj2 = X.dot(u2.reshape(-1, 1))
X_proj3 = X.dot(u3.reshape(-1, 1))

plt.figure(figsize=(8,4))
plt.subplot2grid((3,2), (0, 0), rowspan=3)
plt.plot([-1.4, 1.4], [-1.4*u1[1]/u1[0], 1.4*u1[1]/u1[0]], "k-", linewidth=1)
plt.plot([-1.4, 1.4], [-1.4*u2[1]/u2[0], 1.4*u2[1]/u2[0]], "k--", linewidth=1)
plt.plot([-1.4, 1.4], [-1.4*u3[1]/u3[0], 1.4*u3[1]/u3[0]], "k:", linewidth=2)
plt.plot(X[:, 0], X[:, 1], "bo", alpha=0.5)
plt.axis([-1.4, 1.4, -1.4, 1.4])
plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')
plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')
plt.text(u1[0] + 0.1, u1[1] - 0.05, r"$\mathbf{c_1}$", fontsize=22)
plt.text(u3[0] + 0.1, u3[1], r"$\mathbf{c_2}$", fontsize=22)
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$x_2$", fontsize=18, rotation=0)
plt.grid(True)

plt.subplot2grid((3,2), (0, 1))
plt.plot([-2, 2], [0, 0], "k-", linewidth=1)
plt.plot(X_proj1[:, 0], np.zeros(m), "bo", alpha=0.3)
plt.gca().get_yaxis().set_ticks([])
plt.gca().get_xaxis().set_ticklabels([])
plt.axis([-2, 2, -1, 1])
plt.grid(True)

plt.subplot2grid((3,2), (1, 1))
plt.plot([-2, 2], [0, 0], "k--", linewidth=1)
plt.plot(X_proj2[:, 0], np.zeros(m), "bo", alpha=0.3)
plt.gca().get_yaxis().set_ticks([])
plt.gca().get_xaxis().set_ticklabels([])
plt.axis([-2, 2, -1, 1])
plt.grid(True)

plt.subplot2grid((3,2), (2, 1))
plt.plot([-2, 2], [0, 0], "k:", linewidth=2)
plt.plot(X_proj3[:, 0], np.zeros(m), "bo", alpha=0.3)
plt.gca().get_yaxis().set_ticks([])
plt.axis([-2, 2, -1, 1])
plt.xlabel("$z_1$", fontsize=18)
plt.grid(True)

#save_fig("pca_best_projection")
plt.show()

[figure: projecting onto three different axes]

Principal Components

  • PCA finds the axis that accounts for the largest amount of variance in the dataset.
  • It also finds a 2nd axis, orthogonal to the 1st, responsible for the next largest amount.
  • If the dataset is higher-D, PCA also finds a 3rd axis...
  • ...and so on, up to the number of dimensions in the dataset.
  • The unit vector defining each axis is called a principal component (PC).

  • PCs are found using Singular Value Decomposition (SVD), a matrix factorization technique.

  • SVD decomposes the training set matrix X into the matrix product U · Σ · Vᵀ, where the columns of V are the principal components.
  • Note: PCA assumes the data is centered around the origin. Scikit-Learn's PCA centers the data for you.
# use NumPy svd() to get the principal components of the training set,
# then extract the 1st two PCs.

X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)  # note: svd() returns V already transposed

c1, c2 = Vt.T[:, 0], Vt.T[:, 1]
print(c1, c2)
[-0.79644131 -0.60471583] [-0.60471583  0.79644131]

Projecting Training Data Down to d Dimensions

  • Done by computing the matrix product of the training data X and the matrix Wd containing the first d principal components as columns: X_d-proj = X · Wd.
# project training set onto plane defined by 1st two PCs.

W2 = Vt.T[:, :2]
X2D = X_centered.dot(W2)
print(X2D)
[[ -8.96088137e-01   2.61576283e-02]
 [ -4.53603363e-02  -1.85948860e-01]
 [  1.38359166e-01  -3.11666166e-02]
 ...
 [ -1.02539769e+00   4.64829485e-02]
 [ -3.85549579e-02  -9.59123942e-02]]
(output truncated: 200 rows in total)

Scikit PCA

  • Uses SVD decomposition, as before.
  • You can access each PC via the components_ attribute. Note it holds the PCs as horizontal (row) vectors, so the two prints below are equivalent.
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)

print(pca.components_[0])
print(pca.components_.T[:,0])
[-0.79644131 -0.60471583]
[-0.79644131 -0.60471583]

Explained Variance Ratio

  • Very useful metric: the proportion of the dataset's variance that lies along the axis of each principal component.
# 95% of dataset variance explained by 1st axis.
print(pca.explained_variance_ratio_)
[ 0.95369864  0.04630136]

Choosing the Right Number of Dimensions

  • No need to choose an arbitrary number of dimensions. Instead, pick the smallest d that cumulatively accounts for a sufficient share of the variance, e.g. 95%.
# find minimum d to preserve 95% of training set variance
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
print(d)
1
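
An alternative is to plot the cumulative explained variance as a function of the number of dimensions and look for an elbow — a minimal sketch (trivial for this 2D dataset, but useful on real data):

plt.plot(np.arange(1, len(cumsum) + 1), cumsum, "b-o")
plt.axhline(0.95, color="k", linestyle=":")  # the 95% threshold
plt.xlabel("Dimensions")
plt.ylabel("Cumulative explained variance")
plt.grid(True)
plt.show()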

PCA for Compression

  • Example: applying PCA to MNIST while preserving 95% of the variance leaves ~154 features (original = 28x28 = 784), i.e. less than 20% of the original size.
#MNIST compression:
from sklearn.model_selection import train_test_split

# note: fetch_mldata is gone from recent scikit-learn (mldata.org shut down);
# use fetch_openml('mnist_784', version=1) instead, or load a local copy:
#from sklearn.datasets import fetch_mldata
#mnist = fetch_mldata('MNIST original')
mnist_path = "./mnist-original.mat"

from scipy.io import loadmat
mnist_raw = loadmat(mnist_path)
mnist = {
    "data": mnist_raw["data"].T,
    "target": mnist_raw["label"][0],
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.org dataset: mnist-original",
    }

X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

X = X_train

pca = PCA()
pca.fit(X)
d = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1
d
154
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)
pca.n_components_
154
# did you hit your 95% minimum?
np.sum(pca.explained_variance_ratio_)
0.9503623084769206
# use inverse_transform to decompress back to 784 dimensions
X_mnist = X_train

pca = PCA(n_components = 154)
X_mnist_reduced = pca.fit_transform(X_mnist)
X_mnist_recovered = pca.inverse_transform(X_mnist_reduced)
import matplotlib
import matplotlib.pyplot as plt

def plot_digits(instances, images_per_row=5, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size,size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap = matplotlib.cm.binary, **options)
    plt.axis("off")

plt.figure(figsize=(7, 4))
plt.subplot(121)
plot_digits(X_mnist[::2100])
plt.title("Original", fontsize=16)
plt.subplot(122)
plot_digits(X_mnist_recovered[::2100])
plt.title("Compressed", fontsize=16)
#save_fig("mnist_compression_plot")
plt.show()

[figure: original vs. compressed (reconstructed) MNIST digits]

Incremental PCA

  • PCA normally requires the entire dataset to fit in memory for the SVD algorithm.
  • Incremental PCA (IPCA) splits the dataset into mini-batches and feeds them in one at a time.
# split MNIST into 100 minibatches using Numpy array_split()
# reduce MNIST down to 154 dimensions as before.
# note use of partial_fit() for each batch.

from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components=154)

for X_batch in np.array_split(X_mnist, n_batches):
    print(".", end="")
    inc_pca.partial_fit(X_batch)

X_mnist_reduced_inc = inc_pca.transform(X_mnist)
....................................................................................................
# alternative: Numpy memmap class (use binary array on disk as if it was in memory)

filename = "my_mnist.data"

X_mm = np.memmap(
    filename, dtype='float32', mode='write', shape=X_mnist.shape)

X_mm[:] = X_mnist
del X_mm
X_mm = np.memmap(filename, dtype='float32', mode='readonly', shape=X_mnist.shape)

batch_size = len(X_mnist) // n_batches
inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
inc_pca.fit(X_mm)
IncrementalPCA(batch_size=525, copy=True, n_components=154, whiten=False)
rnd_pca = PCA(
    n_components=154, 
    random_state=42, 
    svd_solver="randomized")

X_reduced = rnd_pca.fit_transform(X_mnist)
import time

for n_components in (2, 10, 154):
    print("n_components =", n_components)
    regular_pca = PCA(
        n_components=n_components)
    inc_pca     = IncrementalPCA(
        n_components=n_components,
        batch_size=500)
    rnd_pca     = PCA(
        n_components=n_components,
        random_state=42,
        svd_solver="randomized")

    for pca in (regular_pca, inc_pca, rnd_pca):
        t1 = time.time()
        pca.fit(X_mnist)
        t2 = time.time()
        print(pca.__class__.__name__, t2 - t1, "seconds")
n_components = 2
PCA 1.308387279510498 seconds
IncrementalPCA 18.326093673706055 seconds
PCA 3.998342514038086 seconds
n_components = 10
PCA 1.4705824851989746 seconds
IncrementalPCA 16.598721742630005 seconds
PCA 4.156355619430542 seconds
n_components = 154
PCA 4.129154682159424 seconds
IncrementalPCA 16.597434043884277 seconds
PCA 4.0131142139434814 seconds

Randomized PCA

  • Stochastic algorithm that quickly finds an approximation of the first d principal components. Dramatically faster than full SVD when d is much smaller than n: O(m·d²) + O(d³) instead of O(m·n²) + O(n³).
rnd_pca = PCA(n_components=154, svd_solver="randomized")

t1 = time.time()
X_reduced = rnd_pca.fit_transform(X_mnist)
t2 = time.time()
print(t2-t1, "seconds")
4.414088487625122 seconds

Kernel PCA

  • The kernel trick implicitly maps instances into a very high-D feature space; it's what enables nonlinear classification & regression with SVMs. Applying the same trick to PCA makes complex nonlinear projections possible.
  • Good at preserving clusters after projection.
# Below: Swiss roll reduced to 2D using 3 techniques:
# 1) linear kernel (equiv to PCA)
# 2) RBF kernel
# 3) sigmoid kernel (logistic)

from sklearn.decomposition import KernelPCA

X, t = make_swiss_roll(
    n_samples=1000, 
    noise=0.2, 
    random_state=42)

lin_pca = KernelPCA(
    n_components = 2, 
    kernel="linear", 
    fit_inverse_transform=True)

rbf_pca = KernelPCA(
    n_components = 2, 
    kernel="rbf", 
    gamma=0.0433, 
    fit_inverse_transform=True)

sig_pca = KernelPCA(
    n_components = 2, 
    kernel="sigmoid", 
    gamma=0.001, 
    coef0=1, 
    fit_inverse_transform=True)

y = t > 6.9

plt.figure(figsize=(11, 4))

for subplot, pca, title in (
    (131, lin_pca, "Linear kernel"), 
    (132, rbf_pca, r"RBF kernel, $\gamma=0.04$"), 
    (133, sig_pca, r"Sigmoid kernel, $\gamma=10^{-3}, r=1$")):

    X_reduced = pca.fit_transform(X)
    if subplot == 132:
        X_reduced_rbf = X_reduced

    plt.subplot(subplot)
    #plt.plot(X_reduced[y, 0], X_reduced[y, 1], "gs")
    #plt.plot(X_reduced[~y, 0], X_reduced[~y, 1], "y^")
    plt.title(title, fontsize=14)
    plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)
    plt.xlabel("$z_1$", fontsize=18)
    if subplot == 131:
        plt.ylabel("$z_2$", fontsize=18, rotation=0)
    plt.grid(True)

#save_fig("kernel_pca_plot")
plt.show()

[figure: Swiss roll reduced to 2D via linear, RBF & sigmoid kernels]

Selecting a Kernel & Hyperparameters

  • Dimensionality reduction is often a preparation step for a supervised learning task.
  • So you can use grid search to select the kernel & params that yield the best performance on that downstream task.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression())])

param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"]}]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

# best kernel & params?
print(grid_search.best_params_)
{'kpca__gamma': 0.043333333333333335, 'kpca__kernel': 'rbf'}
  • Another, entirely unsupervised, approach: select the kernel & params that yield the lowest reconstruction error. Reconstruction is not as easy as with linear PCA; set fit_inverse_transform=True so KernelPCA learns a pre-image mapping.
rbf_pca = KernelPCA(
    n_components = 2, 
    kernel="rbf", 
    gamma=0.0433,
    fit_inverse_transform=True) # perform reconstruction

X_reduced  = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)

# return reconstruction pre-image error
from sklearn.metrics import mean_squared_error
mean_squared_error(X, X_preimage)
32.786308795766082
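
Building on the snippet above, a minimal sketch of the unsupervised selection itself — scanning a (hypothetical) range of gamma values and keeping the one with the lowest pre-image reconstruction MSE:

best_gamma, best_mse = None, float("inf")
for gamma in np.linspace(0.01, 0.1, 10):  # candidate range is an assumption
    kpca = KernelPCA(n_components=2, kernel="rbf", gamma=gamma,
                     fit_inverse_transform=True)
    X_preimage = kpca.inverse_transform(kpca.fit_transform(X))
    mse = mean_squared_error(X, X_preimage)
    if mse < best_mse:
        best_gamma, best_mse = gamma, mse
print(best_gamma, best_mse)
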
# timing comparison: randomized PCA vs. full PCA as the dataset grows
times_rpca = []
times_pca = []
sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000,
         100000, 200000, 500000]

for n_samples in sizes:

    X = rnd.randn(n_samples, 5)

    pca = PCA(
        n_components = 2, 
        random_state=42, 
        svd_solver="randomized")

    t1 = time.time()
    pca.fit(X)
    t2 = time.time()
    times_rpca.append(t2 - t1)

    pca = PCA(n_components = 2)

    t1 = time.time()
    pca.fit(X)
    t2 = time.time()
    times_pca.append(t2 - t1)

plt.plot(sizes, times_rpca, "b-o", label="RPCA")
plt.plot(sizes, times_pca, "r-s", label="PCA")
plt.xlabel("n_samples")
plt.ylabel("Training time")
plt.legend(loc="upper left")
plt.title("PCA and Randomized PCA time complexity ")
plt.show()

[figure: PCA vs. randomized PCA training time by dataset size]

LLE (Locally Linear Embedding)

  • Powerful nonlinear dimensionality reduction technique.
  • A Manifold Learning technique; doesn't rely on projections.
  • LLE measures how each instance linearly relates to its closest neighbors, then looks for a low-D representation where these local relations are best preserved.
# Use LLE to unroll a Swiss Roll.

from sklearn.manifold import LocallyLinearEmbedding

X, t = make_swiss_roll(
    n_samples=1000, 
    noise=0.2, 
    random_state=41)

lle = LocallyLinearEmbedding(
    n_neighbors=10, 
    n_components=2, 
    random_state=42)

X_reduced = lle.fit_transform(X)

plt.title("Unrolled swiss roll using LLE", fontsize=14)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18)
plt.axis([-0.065, 0.055, -0.1, 0.12])
plt.grid(True)

#save_fig("lle_unrolling_plot")
plt.show()

[figure: unrolled Swiss roll using LLE]

  • 1st: For each instance, LLE finds its k nearest neighbors & tries to reconstruct the instance as a linear function of them, choosing weights so the squared reconstruction distance is minimal (see the sketch after this list).
  • The weight matrix W then encodes all the local linear relations between instances.
  • 2nd: Map the instances into a d-dimensional space while preserving those local relations as well as possible.
  • Scikit-Learn's computational complexity:
  • finding the k nearest neighbors: O(m log(m) n log(k))
  • optimizing the weights: O(m n k^3)
  • constructing the low-D representations: O(d m^2)
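
A minimal sketch of step 1 for a single instance (a toy illustration, not Scikit-Learn's implementation; the regularization term is a common stabilizing assumption):

def lle_weights(X, i, k=10, reg=1e-3):
    # reconstruct X[i] as a weighted combination of its k nearest neighbors,
    # with the weights constrained to sum to 1
    dists = np.linalg.norm(X - X[i], axis=1)
    neighbors = np.argsort(dists)[1:k + 1]  # skip the instance itself
    Z = X[neighbors] - X[i]                 # neighbors centered on X[i]
    G = Z @ Z.T                             # local Gram matrix (k x k)
    G += reg * np.trace(G) * np.eye(k)      # regularize for numerical stability
    w = np.linalg.solve(G, np.ones(k))
    return neighbors, w / w.sum()           # enforce the sum-to-one constraint

neighbors, w = lle_weights(X, i=0)
print(neighbors, w.round(3))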

MDS, Isomap, t-SNE, LDA

from sklearn.manifold import MDS
mds = MDS(n_components=2, random_state=42)
X_reduced_mds = mds.fit_transform(X)

from sklearn.manifold import Isomap
isomap = Isomap(n_components=2)
X_reduced_isomap = isomap.fit_transform(X)

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
X_reduced_tsne = tsne.fit_transform(X)

# LDA is supervised, so it needs the class labels:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
X_mnist = mnist["data"]
y_mnist = mnist["target"]
lda.fit(X_mnist, y_mnist)
X_reduced_lda = lda.transform(X_mnist)
/home/bjpcjp/anaconda3/lib/python3.5/site-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
titles = ["MDS", "Isomap", "t-SNE"]

plt.figure(figsize=(11,4))

for subplot, title, X_reduced in zip((131, 132, 133), titles,
                                     (X_reduced_mds, X_reduced_isomap, X_reduced_tsne)):
    plt.subplot(subplot)
    plt.title(title, fontsize=14)
    plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)
    plt.xlabel("$z_1$", fontsize=18)
    if subplot == 131:
        plt.ylabel("$z_2$", fontsize=18, rotation=0)
    plt.grid(True)

#save_fig("other_dim_reduction_plot")
plt.show()

[figure: MDS, Isomap & t-SNE projections of the Swiss roll]
