Intro
- Dimensionality reduction is lossy. It may speed up training, but it can degrade result quality and it makes pipelines more complex. Try training on the original data first before resorting to dimensionality reduction.
- Very useful for visualization (2D/3D representations are much more intuitive).
- Two main approaches: projection and manifold learning.
- Three most popular techniques: PCA, Kernel PCA, LLE.
Curse of Dimensionality
- Many things behave differently in high-D space.
1) Most points in a high-D hypercube end up very close to a border.
2) The average distance between two random points grows with dimensionality, so high-D datasets are very sparse - most instances are far away from each other.
- Average distance between two random points in a unit hypercube:
- In 2D: ~0.52
- In 3D: ~0.66
- In 1,000,000D: ~408.25, roughly sqrt(1000000/6)
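- Quick sanity check: a small Monte Carlo sketch estimating the average distance between random points in a unit hypercube (the helper name avg_distance is mine).
import numpy as np
def avg_distance(n_dims, n_pairs, seed=42):
    # average Euclidean distance between pairs of points drawn uniformly in the unit hypercube
    rng = np.random.RandomState(seed)
    a = rng.rand(n_pairs, n_dims)
    b = rng.rand(n_pairs, n_dims)
    return np.linalg.norm(a - b, axis=1).mean()
for n_dims, n_pairs in ((2, 100000), (3, 100000), (1000000, 10)):
    print(n_dims, "->", round(avg_distance(n_dims, n_pairs), 2))
# prints roughly 0.52, 0.66 and ~408 (close to sqrt(n_dims / 6) for large n_dims)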
Approaches: Projection
- Training instances are usually not spread out uniformly across all dimensions: many features are nearly constant and others are highly correlated. As a result the data lies close to a much lower-dimensional subspace, and we can project it onto that subspace.
import numpy as np
import numpy.random as rnd
# build a 3D dataset
rnd.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1
angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * rnd.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * rnd.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * rnd.randn(m)
# mean-normalize the data
X = X - X.mean(axis=0)
# apply PCA to reduce to 2D
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)
# recover 3D points projected on 2D plane
X2D_inv = pca.inverse_transform(X2D)
# utility to draw 3D arrows
from matplotlib.patches import FancyArrowPatch
from mpl_toolkits.mplot3d import proj3d
class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0, 0), (0, 0), *args, **kwargs)
        self._verts3d = xs, ys, zs
    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0], ys[0]), (xs[1], ys[1]))
        FancyArrowPatch.draw(self, renderer)
# express plane as function of x,y
axes = [-1.8, 1.8, -1.3, 1.3, -1.0, 1.0]
x1s = np.linspace(axes[0], axes[1], 10)
x2s = np.linspace(axes[2], axes[3], 10)
x1, x2 = np.meshgrid(x1s, x2s)
C = pca.components_
R = C.T.dot(C)
z = (R[0, 2] * x1 + R[1, 2] * x2) / (1 - R[2, 2])
# plot 3D dataset, plane & projections
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
X3D_above = X[X[:, 2] > X2D_inv[:, 2]]
X3D_below = X[X[:, 2] <= X2D_inv[:, 2]]
ax.plot(X3D_below[:, 0], X3D_below[:, 1], X3D_below[:, 2], "bo", alpha=0.5)
ax.plot_surface(x1, x2, z, alpha=0.2, color="k")
np.linalg.norm(C, axis=0)
ax.add_artist(Arrow3D([0, C[0, 0]],[0, C[0, 1]],[0, C[0, 2]], mutation_scale=15, lw=1, arrowstyle="-|>", color="k"))
ax.add_artist(Arrow3D([0, C[1, 0]],[0, C[1, 1]],[0, C[1, 2]], mutation_scale=15, lw=1, arrowstyle="-|>", color="k"))
ax.plot([0], [0], [0], "k.")
for i in range(m):
    if X[i, 2] > X2D_inv[i, 2]:
        ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], "k-")
    else:
        ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], "k-", color="#505050")
ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], "k+")
ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], "k.")
ax.plot(X3D_above[:, 0], X3D_above[:, 1], X3D_above[:, 2], "bo")
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])
#save_fig("dataset_3d_plot")
plt.show()
# 2D projection equivalent:
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax.plot(X2D[:, 0], X2D[:, 1], "k+")
ax.plot(X2D[:, 0], X2D[:, 1], "k.")
ax.plot([0], [0], "ko")
ax.arrow(0, 0, 0, 1, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')
ax.arrow(0, 0, 1, 0, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')
ax.set_xlabel("$z_1$", fontsize=18)
ax.set_ylabel("$z_2$", fontsize=18, rotation=0)
ax.axis([-1.5, 1.3, -1.2, 1.2])
ax.grid(True)
plt.show()
Approaches: Manifolds
- A d-dimensional manifold is a part of an n-dimensional space (d < n) that locally resembles a d-dimensional hyperplane - i.e. a lower-D shape that has been bent/twisted in a higher-D space.
- ex: the "Swiss roll" dataset - a 2D plane rolled up in 3D.
# Swiss roll visualization:
from sklearn.datasets import make_swiss_roll
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
axes = [-11.5, 14, -2, 23, -12, 15]
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=plt.cm.hot)
ax.view_init(10, -70)
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])
#save_fig("swiss_roll_plot")
plt.show()
# "squashed" swiss roll visualization:
plt.figure(figsize=(11, 4))
plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=t, cmap=plt.cm.hot)
plt.axis(axes[:4])
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$x_2$", fontsize=18, rotation=0)
plt.grid(True)
plt.subplot(122)
plt.scatter(t, X[:, 1], c=t, cmap=plt.cm.hot)
plt.axis([4, 15, axes[2], axes[3]])
plt.xlabel("$z_1$", fontsize=18)
plt.grid(True)
#save_fig("squished_swiss_roll_plot")
plt.show()
from matplotlib import gridspec
axes = [-11.5, 14, -2, 23, -12, 15]
x2s = np.linspace(axes[2], axes[3], 10)
x3s = np.linspace(axes[4], axes[5], 10)
x2, x3 = np.meshgrid(x2s, x3s)
fig = plt.figure(figsize=(6, 5))
ax = plt.subplot(111, projection='3d')
positive_class = X[:, 0] > 5
X_pos = X[positive_class]
X_neg = X[~positive_class]
ax.view_init(10, -70)
ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], "y^")
ax.plot_wireframe(5, x2, x3, alpha=0.5)
ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], "gs")
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])
#save_fig("manifold_decision_boundary_plot1")
plt.show()
fig = plt.figure(figsize=(5, 4))
ax = plt.subplot(111)
plt.plot(t[positive_class], X[positive_class, 1], "gs")
plt.plot(t[~positive_class], X[~positive_class, 1], "y^")
plt.axis([4, 15, axes[2], axes[3]])
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)
plt.grid(True)
#save_fig("manifold_decision_boundary_plot2")
plt.show()
fig = plt.figure(figsize=(6, 5))
ax = plt.subplot(111, projection='3d')
positive_class = 2 * (t[:] - 4) > X[:, 1]
X_pos = X[positive_class]
X_neg = X[~positive_class]
ax.view_init(10, -70)
ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], "y^")
ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], "gs")
ax.set_xlabel("$x_1$", fontsize=18)
ax.set_ylabel("$x_2$", fontsize=18)
ax.set_zlabel("$x_3$", fontsize=18)
ax.set_xlim(axes[0:2])
ax.set_ylim(axes[2:4])
ax.set_zlim(axes[4:6])
#save_fig("manifold_decision_boundary_plot3")
plt.show()
fig = plt.figure(figsize=(5, 4))
ax = plt.subplot(111)
plt.plot(t[positive_class], X[positive_class, 1], "gs")
plt.plot(t[~positive_class], X[~positive_class, 1], "y^")
plt.plot([4, 15], [0, 22], "b-", linewidth=2)
plt.axis([4, 15, axes[2], axes[3]])
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18, rotation=0)
plt.grid(True)
#save_fig("manifold_decision_boundary_plot4")
plt.show()
# Lesson learned from the plots above:
# unrolling a dataset to a lower dimension doesn't necessarily lead to
# a simpler representation or decision boundary - it depends on the dataset.
PCA (Principal Component Analysis)
- Most popular DR algorithm
- 1) Finds hyperplane that lies closest to the data
- 2) Projects data onto it
Preserving Variance
- Below: simple 2D dataset projected onto 3 different axes.
- Projection on solid line preserves the maximum variance. (Therefore less likely to lose information.)
angle = np.pi / 5
stretch = 5
m = 200
rnd.seed(3)
X = rnd.randn(m, 2) / 10
X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch
X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate
u1 = np.array([np.cos(angle), np.sin(angle)])
u2 = np.array([np.cos(angle - 2 * np.pi/6), np.sin(angle - 2 * np.pi/6)])
u3 = np.array([np.cos(angle - np.pi/2), np.sin(angle - np.pi/2)])
X_proj1 = X.dot(u1.reshape(-1, 1))
X_proj2 = X.dot(u2.reshape(-1, 1))
X_proj3 = X.dot(u3.reshape(-1, 1))
plt.figure(figsize=(8,4))
plt.subplot2grid((3,2), (0, 0), rowspan=3)
plt.plot([-1.4, 1.4], [-1.4*u1[1]/u1[0], 1.4*u1[1]/u1[0]], "k-", linewidth=1)
plt.plot([-1.4, 1.4], [-1.4*u2[1]/u2[0], 1.4*u2[1]/u2[0]], "k--", linewidth=1)
plt.plot([-1.4, 1.4], [-1.4*u3[1]/u3[0], 1.4*u3[1]/u3[0]], "k:", linewidth=2)
plt.plot(X[:, 0], X[:, 1], "bo", alpha=0.5)
plt.axis([-1.4, 1.4, -1.4, 1.4])
plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')
plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')
plt.text(u1[0] + 0.1, u1[1] - 0.05, r"$\mathbf{c_1}$", fontsize=22)
plt.text(u3[0] + 0.1, u3[1], r"$\mathbf{c_2}$", fontsize=22)
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$x_2$", fontsize=18, rotation=0)
plt.grid(True)
plt.subplot2grid((3,2), (0, 1))
plt.plot([-2, 2], [0, 0], "k-", linewidth=1)
plt.plot(X_proj1[:, 0], np.zeros(m), "bo", alpha=0.3)
plt.gca().get_yaxis().set_ticks([])
plt.gca().get_xaxis().set_ticklabels([])
plt.axis([-2, 2, -1, 1])
plt.grid(True)
plt.subplot2grid((3,2), (1, 1))
plt.plot([-2, 2], [0, 0], "k--", linewidth=1)
plt.plot(X_proj2[:, 0], np.zeros(m), "bo", alpha=0.3)
plt.gca().get_yaxis().set_ticks([])
plt.gca().get_xaxis().set_ticklabels([])
plt.axis([-2, 2, -1, 1])
plt.grid(True)
plt.subplot2grid((3,2), (2, 1))
plt.plot([-2, 2], [0, 0], "k:", linewidth=2)
plt.plot(X_proj3[:, 0], np.zeros(m), "bo", alpha=0.3)
plt.gca().get_yaxis().set_ticks([])
plt.axis([-2, 2, -1, 1])
plt.xlabel("$z_1$", fontsize=18)
plt.grid(True)
#save_fig("pca_best_projection")
plt.show()
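- Quick numeric check: the variance retained by each 1D projection should confirm that the solid-line axis (c1) preserves the most and the dotted axis the least.
# variance retained by each of the three 1D projections above
print("c1 (solid):     ", X_proj1.var())
print("middle (dashed):", X_proj2.var())
print("c2 (dotted):    ", X_proj3.var())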
Principal Components
- PCA finds the axis that accounts for the largest amount of variance in the dataset.
- It also finds a 2nd axis, orthogonal to the 1st, that accounts for the next largest amount.
- In a higher-D dataset PCA would also find a 3rd axis, and so on -
- one axis per dimension of the dataset.
- The unit vector defining the i-th axis is called the i-th principal component (PC).
- PCs are found using Singular Value Decomposition (SVD), a matrix factorization technique.
- SVD decomposes the training set matrix X into the matrix product U · Σ · V^T, where the columns of V are the principal components.
- Note: PCA assumes the data is centered around the origin. Scikit-Learn's PCA class takes care of centering the data for you.
# use NumPy svd() to get the principal components of the training set,
# then extract the first two PCs.
# note: np.linalg.svd() returns V already transposed (this "V" is really V^T),
# so the PCs are its rows - or, as used here, the columns of V.T.
X_centered = X - X.mean(axis=0)
U, s, V = np.linalg.svd(X_centered)
c1, c2 = V.T[:, 0], V.T[:, 1]
print(c1, c2)
[-0.79644131 -0.60471583] [-0.60471583 0.79644131]
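- Quick check: verify that the decomposition reconstructs the centered data (the Σ matrix S below is rebuilt from the singular values s returned above).
# rebuild the full (m x n) Σ matrix from the singular values, then confirm
# that U · Σ · V^T reproduces X_centered (remember: the "V" above is already V^T)
m_svd, n_svd = X_centered.shape
S = np.zeros((m_svd, n_svd))
S[:n_svd, :n_svd] = np.diag(s)
print(np.allclose(X_centered, U.dot(S).dot(V)))   # True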
Projecting Training Data Down to d Dimensions
- Done by computing the matrix product of the training set X and W_d, the matrix whose columns are the first d principal components: X_d-proj = X · W_d.
# project training set onto plane defined by 1st two PCs.
W2 = V.T[:, :2]
X2D = X_centered.dot(W2)
print(X2D)
[[ -8.96088137e-01   2.61576283e-02]
 [ -4.53603363e-02  -1.85948860e-01]
 [  1.38359166e-01  -3.11666166e-02]
 [  4.16315780e-02  -6.04371773e-02]
 ...
 [ -3.85549579e-02  -9.59123942e-02]]
(output truncated: 200 rows, one per training instance)
Scikit PCA
- Uses the same SVD decomposition as before, under the hood.
- You can access each PC via the components_ attribute. Note that it holds the PCs as horizontal (row) vectors, so the 1st PC is pca.components_[0] - or equivalently pca.components_.T[:, 0], as shown below.
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)
print(pca.components_[0])
print(pca.components_.T[:,0])
[-0.79644131 -0.60471583]
[-0.79644131 -0.60471583]
Explained Variance Ratio
- Very useful metric: the proportion of the dataset's variance that lies along the axis of each principal component.
# 95% of dataset variance explained by 1st axis.
print(pca.explained_variance_ratio_)
[ 0.95369864 0.04630136]
Choosing Right #Dimensions
- No need to choose an arbitrary number of dimensions. Instead, pick the smallest d whose cumulative explained variance reaches a sufficient threshold, e.g. 95% (unless you're reducing for visualization, where d = 2 or 3).
# find minimum d to preserve 95% of training set variance
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
print(d)
1
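- Alternative sketch: plot the cumulative explained variance vs. the number of dimensions and look for an elbow. With this 2D toy dataset the curve has only two points, but the same code applies to larger datasets like MNIST.
# cumulative explained variance vs. number of dimensions
plt.figure(figsize=(6, 4))
plt.plot(np.arange(1, len(cumsum) + 1), cumsum, "b-o")
plt.axhline(y=0.95, color="k", linestyle=":")
plt.xlabel("number of dimensions $d$")
plt.ylabel("cumulative explained variance")
plt.grid(True)
plt.show()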
PCA for Compression
- Example: applying PCA to the MNIST dataset while preserving 95% of its variance leaves ~150 features (the original has 28x28 = 784).
#MNIST compression:
from sklearn.model_selection import train_test_split
# from sklearn.datasets import fetch_mldata     # removed in newer scikit-learn versions
# mnist = fetch_mldata('MNIST original')
# load the raw .mat file directly instead:
mnist_path = "./mnist-original.mat"
from scipy.io import loadmat
mnist_raw = loadmat(mnist_path)
mnist = {
"data": mnist_raw["data"].T,
"target": mnist_raw["label"][0],
"COL_NAMES": ["label", "data"],
"DESCR": "mldata.org dataset: mnist-original",
}
X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y)
X = X_train
pca = PCA()
pca.fit(X)
d = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1
d
154
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)
pca.n_components_
154
# did you hit your 95% minimum?
np.sum(pca.explained_variance_ratio_)
0.9503623084769206
# use inverse_transform to decompress back to 784 dimensions
X_mnist = X_train
pca = PCA(n_components = 154)
X_mnist_reduced = pca.fit_transform(X_mnist)
X_mnist_recovered = pca.inverse_transform(X_mnist_reduced)
import matplotlib
import matplotlib.pyplot as plt
def plot_digits(instances, images_per_row=5, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=matplotlib.cm.binary, **options)
    plt.axis("off")
plt.figure(figsize=(7, 4))
plt.subplot(121)
plot_digits(X_mnist[::2100])
plt.title("Original", fontsize=16)
plt.subplot(122)
plot_digits(X_mnist_recovered[::2100])
plt.title("Compressed", fontsize=16)
#save_fig("mnist_compression_plot")
plt.show()
Incremental PCA
- PCA normally requires the entire dataset to fit in memory for the SVD algorithm to run.
- Incremental PCA (IPCA) splits the training set into mini-batches and feeds them in one at a time.
# split MNIST into 100 minibatches using Numpy array_split()
# reduce MNIST down to 154 dimensions as before.
# note use of partial_fit() for each batch.
from sklearn.decomposition import IncrementalPCA
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_mnist, n_batches):
    print(".", end="")
    inc_pca.partial_fit(X_batch)
X_mnist_reduced_inc = inc_pca.transform(X_mnist)
....................................................................................................
# alternative: Numpy memmap class (use binary array on disk as if it was in memory)
filename = "my_mnist.data"
X_mm = np.memmap(
filename, dtype='float32', mode='write', shape=X_mnist.shape)
X_mm[:] = X_mnist
del X_mm
X_mm = np.memmap(filename, dtype='float32', mode='readonly', shape=X_mnist.shape)
batch_size = len(X_mnist) // n_batches
inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
inc_pca.fit(X_mm)
IncrementalPCA(batch_size=525, copy=True, n_components=154, whiten=False)
# randomized PCA (covered in its own section below), included here for the timing comparison:
rnd_pca = PCA(
n_components=154,
random_state=42,
svd_solver="randomized")
X_reduced = rnd_pca.fit_transform(X_mnist)
import time
# time the three PCA variants for several values of n_components
for n_components in (2, 10, 154):
    print("n_components =", n_components)
    regular_pca = PCA(n_components=n_components)
    inc_pca = IncrementalPCA(n_components=n_components, batch_size=500)
    rnd_pca = PCA(n_components=n_components, random_state=42, svd_solver="randomized")
    for pca in (regular_pca, inc_pca, rnd_pca):
        t1 = time.time()
        pca.fit(X_mnist)
        t2 = time.time()
        print(pca.__class__.__name__, t2 - t1, "seconds")
n_components = 2
PCA 1.308387279510498 seconds
IncrementalPCA 18.326093673706055 seconds
PCA 3.998342514038086 seconds
n_components = 10
PCA 1.4705824851989746 seconds
IncrementalPCA 16.598721742630005 seconds
PCA 4.156355619430542 seconds
n_components = 154
PCA 4.129154682159424 seconds
IncrementalPCA 16.597434043884277 seconds
PCA 4.0131142139434814 seconds
Randomized PCA
- A stochastic algorithm that quickly finds an approximation of the first d principal components. Dramatically faster than full SVD when d is much smaller than n: roughly O(m·d² + d³) instead of O(m·n² + n³).
rnd_pca = PCA(n_components=154, svd_solver="randomized")
t1 = time.time()
X_reduced = rnd_pca.fit_transform(X_mnist)
t2 = time.time()
print(t2-t1, "seconds")
4.414088487625122 seconds
Kernel PCA
- The kernel trick implicitly maps instances into a very high-D feature space; it is what enables nonlinear classification & regression with SVMs. Applying the same trick to PCA (Kernel PCA) makes it possible to perform complex nonlinear projections.
- Good at preserving clusters after projection, and sometimes at unrolling datasets that lie close to a twisted manifold.
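- Reminder: the RBF kernel is K(a, b) = exp(-γ·||a − b||²), so gamma controls how quickly similarity decays with distance - hence it's the main hyperparameter tuned in the grid search further below.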
# Below: Swiss roll reduced to 2D using 3 techniques:
# 1) linear kernel (equiv to PCA)
# 2) RBF kernel
# 3) sigmoid kernel (logistic)
from sklearn.decomposition import KernelPCA
X, t = make_swiss_roll(
n_samples=1000,
noise=0.2,
random_state=42)
lin_pca = KernelPCA(
n_components = 2,
kernel="linear",
fit_inverse_transform=True)
rbf_pca = KernelPCA(
n_components = 2,
kernel="rbf",
gamma=0.0433,
fit_inverse_transform=True)
sig_pca = KernelPCA(
n_components = 2,
kernel="sigmoid",
gamma=0.001,
coef0=1,
fit_inverse_transform=True)
y = t > 6.9
plt.figure(figsize=(11, 4))
for subplot, pca, title in (
        (131, lin_pca, "Linear kernel"),
        (132, rbf_pca, r"RBF kernel, $\gamma=0.04$"),
        (133, sig_pca, r"Sigmoid kernel, $\gamma=10^{-3}, r=1$")):
    X_reduced = pca.fit_transform(X)
    if subplot == 132:
        X_reduced_rbf = X_reduced
    plt.subplot(subplot)
    #plt.plot(X_reduced[y, 0], X_reduced[y, 1], "gs")
    #plt.plot(X_reduced[~y, 0], X_reduced[~y, 1], "y^")
    plt.title(title, fontsize=14)
    plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)
    plt.xlabel("$z_1$", fontsize=18)
    if subplot == 131:
        plt.ylabel("$z_2$", fontsize=18, rotation=0)
    plt.grid(True)
#save_fig("kernel_pca_plot")
plt.show()
Selecting a Kernel & Hyperparameters
- kPCA is unsupervised, so there's no obvious performance measure for choosing the kernel & hyperparameters.
- But dimensionality reduction is often a prep step for a supervised learning task, so we can use grid search to select the kernel & params that give the best downstream performance.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
clf = Pipeline([
("kpca", KernelPCA(n_components=2)),
("log_reg", LogisticRegression())])
param_grid = [{
"kpca__gamma": np.linspace(0.03, 0.05, 10),
"kpca__kernel": ["rbf", "sigmoid"]}]
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)
# best kernel & params?
print(grid_search.best_params_)
{'kpca__gamma': 0.043333333333333335, 'kpca__kernel': 'rbf'}
- Another option (entirely unsupervised): select the kernel & params that yield the lowest reconstruction error. Reconstruction is not as easy as with linear PCA - each point must be mapped back to a "pre-image" in the original space, which is what fit_inverse_transform=True enables.
rbf_pca = KernelPCA(
n_components = 2,
kernel="rbf",
gamma=0.0433,
fit_inverse_transform=True) # perform reconstruction
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)
# compute the reconstruction pre-image error
from sklearn.metrics import mean_squared_error
mean_squared_error(X, X_preimage)
32.786308795766082
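# Aside: timing comparison of full PCA vs. randomized PCA on random datasets
# of increasing size (5 features, 2 components kept).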
times_rpca = []
times_pca = []
sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000,
100000, 200000, 500000]
for n_samples in sizes:
    X = rnd.randn(n_samples, 5)
    pca = PCA(n_components=2, random_state=42, svd_solver="randomized")
    t1 = time.time()
    pca.fit(X)
    t2 = time.time()
    times_rpca.append(t2 - t1)
    pca = PCA(n_components=2)
    t1 = time.time()
    pca.fit(X)
    t2 = time.time()
    times_pca.append(t2 - t1)
plt.plot(sizes, times_rpca, "b-o", label="RPCA")
plt.plot(sizes, times_pca, "r-s", label="PCA")
plt.xlabel("n_samples")
plt.ylabel("Training time")
plt.legend(loc="upper left")
plt.title("PCA and Randomized PCA time complexity ")
plt.show()
LLE (Locally Linear Embedding)
- A powerful nonlinear dimensionality reduction technique.
- A manifold learning technique - it doesn't rely on projections.
- LLE first measures how each instance linearly relates to its closest neighbors, then looks for a low-D representation of the training set in which these local relationships are best preserved.
# Use LLE to unroll a Swiss Roll.
from sklearn.manifold import LocallyLinearEmbedding
X, t = make_swiss_roll(
n_samples=1000,
noise=0.2,
random_state=41)
lle = LocallyLinearEmbedding(
n_neighbors=10,
n_components=2,
random_state=42)
X_reduced = lle.fit_transform(X)
plt.title("Unrolled swiss roll using LLE", fontsize=14)
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)
plt.xlabel("$z_1$", fontsize=18)
plt.ylabel("$z_2$", fontsize=18)
plt.axis([-0.065, 0.055, -0.1, 0.12])
plt.grid(True)
#save_fig("lle_unrolling_plot")
plt.show()
- Step 1: for each instance, LLE finds its k nearest neighbors and tries to reconstruct the instance as a linear combination of those neighbors, choosing weights that minimize the squared reconstruction error (see the equations after this list).
- The weight matrix W then encodes the local linear relationships between instances.
- Step 2: map the instances into a d-dimensional space while preserving as many of these local relationships as possible - the weights are now fixed and the low-D positions are optimized.
- Scikit-Learn's computational complexity:
- finding the k nearest neighbors: O(m · log(m) · n · log(k))
- optimizing the weights: O(m · n · k^3)
- constructing the low-D representations: O(d · m^2) - this last term makes LLE scale poorly to very large datasets.
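- Written out (standard LLE formulation), the two optimization steps are:
$$\hat{W} = \underset{W}{\operatorname{argmin}} \sum_{i=1}^{m} \Big\| \mathbf{x}^{(i)} - \sum_{j=1}^{m} w_{i,j}\,\mathbf{x}^{(j)} \Big\|^2 \quad \text{s.t. } w_{i,j}=0 \text{ if } \mathbf{x}^{(j)} \notin k\text{-NN}(\mathbf{x}^{(i)}), \;\; \sum_{j=1}^{m} w_{i,j}=1$$
$$\hat{Z} = \underset{Z}{\operatorname{argmin}} \sum_{i=1}^{m} \Big\| \mathbf{z}^{(i)} - \sum_{j=1}^{m} \hat{w}_{i,j}\,\mathbf{z}^{(j)} \Big\|^2$$
- where z^(i) is the image of x^(i) in the d-dimensional space: step 1 keeps the instances fixed and optimizes the weights, step 2 keeps the weights fixed and optimizes the low-D images.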
MDS, Isomap, t-SNE, LDA
from sklearn.manifold import MDS
mds = MDS(n_components=2, random_state=42)
X_reduced_mds = mds.fit_transform(X)
from sklearn.manifold import Isomap
isomap = Isomap(n_components=2)
X_reduced_isomap = isomap.fit_transform(X)
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
X_reduced_tsne = tsne.fit_transform(X)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
X_mnist = mnist["data"]
y_mnist = mnist["target"]
lda.fit(X_mnist, y_mnist)
X_reduced_lda = lda.transform(X_mnist)
/home/bjpcjp/anaconda3/lib/python3.5/site-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.
warnings.warn("Variables are collinear.")
titles = ["MDS", "Isomap", "t-SNE"]
plt.figure(figsize=(11,4))
for subplot, title, X_reduced in zip((131, 132, 133), titles,
                                     (X_reduced_mds, X_reduced_isomap, X_reduced_tsne)):
    plt.subplot(subplot)
    plt.title(title, fontsize=14)
    plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)
    plt.xlabel("$z_1$", fontsize=18)
    if subplot == 131:
        plt.ylabel("$z_2$", fontsize=18, rotation=0)
    plt.grid(True)
#save_fig("other_dim_reduction_plot")
plt.show()