
Machine Learning Algorithms: Image Recognition - Digits
- Training a neural network to recognize digits, 0-9, from greyscale 20x20px images
- Implementing forward propagation & backpropagation to compute gradients
- Regularizing gradients & cost function
- Randomly initializing weights & bug-proofing gradients
- Classifying images with trained neural network
- Visualizing image data & neural network layers

[Algorithm]
// Neural Network Forward-prop, Backprop, Cost Function
function [J, grad] = nnCostFunction(nn_params, ...
input_layer_size, ...
hidden_layer_size, num_labels, ...
X, y, lambda)
L_input = input_layer_size;
L_hidden = hidden_layer_size;
Theta1 = reshape(nn_params(1:L_hidden*(L_input+1)), ...
L_hidden, (L_input+1));
Theta2 = reshape(nn_params((1 + (L_hidden*(L_input+1))):end), ...
num_labels, (L_hidden+1));
m = size(X, 1);

// FORWARD PROPAGATION (see Note 1)
X = [ones(m,1) X]';   % [(neurons1 + bias) x examples]
z2 = Theta1*X;        % [neur2 x (neur1 + b)]*[(neur1 + b) x ex's]
a2 = sigmoid(z2);     % [neur2 x ex's]
a2 = [ones(1,m); a2]; % [(neur2 + b) x ex's]
z3 = Theta2*a2;       % [neur3 x (neur2 + b)]*[(neur2 + b) x ex's]
h = sigmoid(z3);      % [neur3 x ex's]
h = h(:);         % column-stacked: [ex1(n1;...;n10); ex2(n1;...); ...]

// COST FUNCTION (REGULARIZED)
yk = repmat((1:num_labels)',1,m);   % [neur3 x ex's]
ycompare = repmat(y',num_labels,1); % [neur3 x ex's]
yk = (ycompare(:) == yk(:));        % dim same as h

J = (1/m)*(-yk'*log(h)-(1-yk')*log(1-h));
J_reg = (lambda/(2*m))*(sum(Theta1(size(Theta1,1)+1:end).^2) ...
+ sum(Theta2(size(Theta2,1)+1:end).^2));
J = J + J_reg;

// BACKPROPAGATION (REGULARIZED)
Delta1 = zeros(L_hidden,L_input+1);      % [neur2 x (neur1 + b)]
Delta2 = zeros(num_labels,L_hidden+1);   % [neur3 x (neur2 + b)]

delta3 = (h - yk);
delta3 = reshape(delta3, num_labels, m); % [neur3 x ex's]
delta2 = (Theta2(:,2:end)'*delta3).*sigmoidGradient(z2);
% [neur2 x neur3]*[neur3 x ex's].*[neur2 x ex's]

Delta1 = Delta1 + delta2*X';  % +[neur2 x ex's]*[ex's x (neur1+b)]
Delta2 = Delta2 + delta3*a2'; % +[neur3 x ex's]*[ex's x (neur2+b)]

D1 = Delta1/m;  % [neur2 x (neur1 + b)]
D2 = Delta2/m;  % [neur3 x (neur2 + b)]

Theta1_grad = D1 + (lambda/m)*Theta1;   % regularize gradients
Theta2_grad = D2 + (lambda/m)*Theta2;
Theta1_grad(:,1) = D1(:,1);   % un-regularize bias term
Theta2_grad(:,1) = D2(:,1);   % un-regularize bias term

grad = [Theta1_grad(:); Theta2_grad(:)];  % unroll gradients

end
%=================================================================
// Numerical Gradient Check
function numgrad = computeNumericalGradient(J, theta)
%  Computes the gradient using "finite differences"
%  and gives us a numerical estimate of the gradient.
numgrad = zeros(size(theta));
perturb = zeros(size(theta));
e = 1e-4;  % set epsilon
for p = 1:numel(theta)
perturb(p) = e;  % set perturbation vector
loss1 = J(theta - perturb);
loss2 = J(theta + perturb);
numgrad(p) = (loss2 - loss1)/(2*e);  % central difference estimate
perturb(p) = 0;  % reset perturbation vector
end
end
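
As a quick sanity check of the finite-difference idea (separate from the
digit network; the quadratic cost below is only an illustrative assumption),
the numerical estimate should match the analytical gradient almost exactly:

Jfun = @(t) t'*t;                       % analytical gradient is 2*t
theta = [1; -2; 0.5];
numgrad = computeNumericalGradient(Jfun, theta);
disp([numgrad, 2*theta]);               % columns should agree to ~1e-9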
%=================================================================
// Prediction
function p = predict(Theta1, Theta2, X)
%  outputs the predicted label of X given the
%  trained weights of a neural network (Theta1, Theta2)
m = size(X, 1);
h = sigmoid([ones(m,1), sigmoid([ones(m,1),X]*Theta1')]*Theta2');
[void, p] = max(h, [], 2);
end


[Application] Train & test neural network for digit image recognition
// Initialization
L_input  = 400;   % 20x20 Input Images of Digits
L_hidden = 25;    % 25 hidden units
num_labels = 10;  % 10 labels, from 1 to 10 (& "0"->10)
%==================================================================
// Load Data
load('imgdata.mat');  % load image matrix X and label vector y
m = size(X, 1);

sel = randperm(size(X, 1));  % randomly select 100
sel = sel(1:100);            % datapoints to display
displayData(X(sel, :)); pause;

% Load pre-trained weights to check NN later before training
load('pretrained_weights.mat');  % load Theta1, Theta2
nn_params = [Theta1(:) ; Theta2(:)];
%==================================================================
// Initialize Parameters

initial_Theta1 = randInitializeWeights(L_input, L_hidden);
initial_Theta2 = randInitializeWeights(L_hidden, num_labels);

initial_nn_params = [initial_Theta1(:) ; initial_Theta2(:)];
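
randInitializeWeights itself is not reproduced on this page (it ships in the
zip under [Data + Code]); a plausible sketch of the standard symmetry-breaking
initialization it performs, with epsilon_init = 0.12 as an assumed value:

function W = randInitializeWeights(L_in, L_out)
%  returns a [L_out x (1 + L_in)] matrix of small random weights,
%  breaking symmetry between hidden units (sketch; epsilon assumed)
epsilon_init = 0.12;
W = rand(L_out, 1 + L_in)*2*epsilon_init - epsilon_init;
end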
%==================================================================
// Check Backpropagation
lambda = 3;

debug_J  = nnCostFunction(nn_params, L_input, ...
L_hidden, num_labels, X, y, lambda);

fprintf(['\nCost at (fixed) debugging params (w/ lambda=%f):'...
'%f\n(for lambda=3, expected val~0.576051)\n\n'],lambda,debug_J);
pause;
%==================================================================
// Train Neural Network
options = optimset('MaxIter', 700);
lambda = 1.5;

% create 'shorthand' for nnCostFunction, with one arg 'nn_params'
costFunc = @(p) nnCostFunction(p, L_input, L_hidden, ...
num_labels, X, y, lambda);

% minimize cost function w/ advanced optimization algorithm,
% supplying initial parameters, cost function, & gradient
[nn_params, cost] = fmincg(costFunc, initial_nn_params, options);

% retrieve Theta1 and Theta2 back from nn_params
Theta1 = reshape(nn_params(1:L_hidden * (L_input + 1)), ...
L_hidden, (L_input + 1));
Theta2 = reshape(nn_params((1+(L_hidden*(L_input + 1))):end), ...
num_labels, (L_hidden + 1));
%==================================================================
// Predict Digits
% Compute NN accuracy
pred = predict(Theta1, Theta2, X);
fprintf('\nTraining Set Accuracy: %f\n',mean(double(pred==y))*100);

% Visualize neural network (see Note 5)
displayData(Theta1(:, 2:end)); pause;
displayData(Theta2(:, 2:end)); pause;

% Predict digits, one-by-one
rp = randperm(m);  % randomly permute examples
for i = 1:m
displayData(X(rp(i), :));

pred = predict(Theta1, Theta2, X(rp(i),:));
fprintf('\nNeural Network Prediction: %d (digit %d)\n', ...
pred, mod(pred, 10));

s = input('Paused; press enter to continue, q to exit:','s');
if s == 'q'  % pause with quit option
break
end
end


[Data + Code]  imgdata.mat, pretrained_weights.mat
digitRecognition_NN.zip (algorithm + function code; data)
[Functions]
function g = sigmoid(z)
g = 1./(1+exp(-z));  % element-wise logistic function
end
%==================================================================
function g = sigmoidGradient(z)
%  returns the gradient of the sigmoid function evaluated at z
g = sigmoid(z).*(1-sigmoid(z));
end
%==================================================================
function [h, display_array] = displayData(X, example_width)
%  [h, display_array] = DISPLAYDATA(X, example_width) displays 2D
%   data stored in X in a nice grid. It returns the figure handle
%   h and the displayed array if requested.

% "example" = image representation array;
% "example_width" = image width in pixels (= sqrt(row length))
% row of X = sequence of greyscale values of an image

% Set example_width automatically if not passed in
if ~exist('example_width', 'var') || isempty(example_width)
example_width = round(sqrt(size(X, 2)));
end

colormap(gray);  % grayscale Image

[m n] = size(X); % compute rows, cols
example_height = (n / example_width);

display_rows = floor(sqrt(m)); % compute # of items to display
display_cols = ceil(m / display_rows);

% setup blank display (pad = 1px gap between image patches)
pad = 1;
display_array = -ones(pad + display_rows * (example_height + pad), ...
pad + display_cols * (example_width + pad));

% copy each example into a patch on the display array; see Note 3
curr_ex = 1;
for j = 1:display_rows
for i = 1:display_cols
if curr_ex > m,
break;
end
% copy the patch
% get the max value of the patch
max_val = max(abs(X(curr_ex, :)));
display_array(pad + (j - 1) * (example_height + pad) + ...
(1:example_height), pad + (i - 1) * ...
(example_width + pad) + (1:example_width)) = ...
reshape(X(curr_ex, :), example_height, ...
example_width) / max_val;
curr_ex = curr_ex + 1;
end
if curr_ex > m,
break;
end
end

h = imagesc(display_array, [-1 1]); % Display Image; see Note 4
axis image off % do not show axis

drawnow;
% updates figures and processes any pending callbacks; used to
% modify graphics objects to see results immediately
end
%==================================================================
function W = debugInitializeWeights(fan_out, fan_in)
% initializes the weights of a layer with fan_in incoming connections
% and fan_out outgoing connections using a fixed set of values
W = zeros(fan_out, 1 + fan_in);
W = reshape(sin(1:numel(W)), size(W)) / 10;
% Initializing W with "sin" ensures W always contains the same
% values, which is useful for debugging
end
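
Because W is filled from a fixed sin sequence rather than rand, repeated
calls return identical matrices, which keeps gradient-checking runs
reproducible (a trivial illustration):

W1 = debugInitializeWeights(5, 3);
W2 = debugInitializeWeights(5, 3);
assert(isequal(W1, W2));   % deterministic, unlike randInitializeWeights
disp(size(W1));            % 5 x 4, i.e. fan_out x (1 + fan_in)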
%=================================================================
function checkNNGradients(lambda)
%  Creates a small neural network to check the backpropagation

if ~exist('lambda', 'var') || isempty(lambda)
lambda = 0;
end

L_input = 3; L_hidden = 5; num_labels = 3; m = 5;

% Generate 'random' test data
Theta1 = debugInitializeWeights(L_hidden, L_input);
Theta2 = debugInitializeWeights(num_labels, L_hidden);
X  = debugInitializeWeights(m, L_input - 1);
y  = 1 + mod(1:m, num_labels)';

nn_params = [Theta1(:) ; Theta2(:)];  % unroll parameters

costFunc = @(p)nnCostFunction(p, L_input, L_hidden, ...
num_labels, X, y, lambda); % see Note 2

[cost, grad] = costFunc(nn_params);       % analytical gradient (backprop)
numgrad = computeNumericalGradient(costFunc, nn_params); % numerical estimate

disp([numgrad grad]);
fprintf('The above two columns should be very similar.\n');

diff = norm(numgrad-grad)/norm(numgrad+grad);
fprintf(['Expected relative difference: less than 1e-9 \n' ...
'\nRelative Difference: %g\n'], diff);
end


[Notes]
// Note 1
Fully-vectorized forward propagation, free of sum()'s and for-
loops. A helpful habit is to express each variable in terms of
its dimensions, naming what each dimension counts. In this doc,
'neur1' refers to the activation units in layer 1, 'b' to the
bias unit, and "ex's" to the training examples:

X = [ones(m,1) X]' -- prepends a bias (intercept) column to X,
then transposes to set the dim as [(neur1 + bias) x ex's]

z2 = Theta1*X -- note that in matrix multiplication, the inner
dimensions cancel and the result takes the outer dimensions:
[neur2 x (neur1 + b)]*[(neur1 + b) x ex's]
= [neur2 x ex's]

The result is sensible: z2 holds the input to each neuron in
layer 2, with one column per example. For further intuition,
see "NN Complete Vectorization" below.
%=================================================================
// Note 2
'p' serves as a stand-in for nn_params: costFunc takes p as its
only input, while the remaining arguments of nnCostFunction
(layer sizes, X, y, lambda) are captured from the workspace at
the moment the anonymous function is created.
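
A small illustration of this capture behavior (the values here are
arbitrary and unrelated to the digit pipeline):

lambda = 1.5;
f = @(p) p^2 + lambda;   % lambda is captured now, at creation time
lambda = 99;             % later changes do not affect f
disp(f(2));              % prints 5.5, not 103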
%=================================================================
// Note 3
for j = 1:display_rows -- Sample execution for (j,i) = (1,1):

display_array(1 + 0 + (1:20), 1 + 0 + (1:20)) =
display_array(1 + (1:20), 1 + (1:20)) =
display_array(2:21, 2:21) = select rows & cols 2 to 21, skipping 1;
reshape(X(1, :), 20, 20)
= reshapes the 400 greyscale values of example 1 (row 1 of the X
passed in, i.e. X(sel, :)) into a 20x20 image matrix
'display_array(...) = reshape(...)' = fills a 20x20 patch of
display_array w/ entries from a 20x20 image taken from X
'/ max_val' = scales the patch entries to range between -1 and 1
For (j,i) = (1,2), repeat the same - filling rows 2 to 21 and
cols 23 to 42 (skipping col 22 to keep the grid lines);
repeat for all (j,i) <= (display_rows, display_cols)
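
The claimed row/column ranges follow directly from the indexing
expression in displayData; a minimal check (pad, example_height,
example_width as in the function):

pad = 1; example_height = 20; example_width = 20;
j = 1; i = 2;  % first grid row, second grid column
rows = pad + (j-1)*(example_height+pad) + (1:example_height);  % 2:21
cols = pad + (i-1)*(example_width+pad)  + (1:example_width);   % 23:42
fprintf('rows %d-%d, cols %d-%d\n', rows(1), rows(end), cols(1), cols(end));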
%=================================================================
// Note 4
h = imagesc(display_array, [-1 1]) -- displays the data in
display_array as an image; each element of the array specifies
the greyscale value for 1 pixel of the image. [-1 1] uses scaled
data to utilize the full greyscale range, w/ -1 = black, 1 = white
%=================================================================
// Note 5
Given a hidden unit, one way to visualize what it computes,
loosely, is to find an input x that activates it (that is, makes
its activation value a_i^(l) close to 1). Different # of training
iterations & lambdas can yield notably different NN visuals.
Further, the hidden units can be seen to roughly correspond to
'detectors' that look for strokes and other patterns in the input.
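
One concrete way to act on this, under the simplifying assumption that
the input is constrained to unit norm, is to note that the input
maximizing a unit's pre-activation is proportional to its weight vector,
which is exactly what displaying the rows of Theta1 shows:

i = 7;                          % any hidden unit (arbitrary choice)
w = Theta1(i, 2:end);           % its 400 input weights, bias dropped
x_max = w / norm(w);            % unit-norm input maximizing the pre-activation
displayData(x_max);             % same image as displayData(Theta1(i, 2:end))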


NN Complete Vectorization

Forward propagation, backpropagation, cost function, and regularization can be vectorized entirely, free of for-loops:
Consider a simple NN with 4 classes, $$m=3$$ training examples, and hidden layer size 2:   
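
The original step-by-step walkthrough for this example is not reproduced
here; below is a minimal sketch of what one fully vectorized pass over
such a network could look like (input size, weight values, and labels are
arbitrary illustrative assumptions):

m = 3; L_input = 4; L_hidden = 2; num_labels = 4;
X = rand(m, L_input);                            % placeholder inputs
y = [1; 3; 4];                                   % placeholder labels
Theta1 = rand(L_hidden, L_input+1) - 0.5;        % [2 x 5]
Theta2 = rand(num_labels, L_hidden+1) - 0.5;     % [4 x 3]

a1 = [ones(m,1) X]';                   % [(neur1+b) x ex's] = [5 x 3]
a2 = [ones(1,m); sigmoid(Theta1*a1)];  % [(neur2+b) x ex's] = [3 x 3]
h  = sigmoid(Theta2*a2);               % [neur3 x ex's]     = [4 x 3]

yk = (repmat((1:num_labels)',1,m) == repmat(y',num_labels,1));  % one-hot
J  = (1/m)*sum(sum(-yk.*log(h) - (1-yk).*log(1-h)));  % unregularized cost

delta3 = h - yk;                                              % [4 x 3]
delta2 = (Theta2(:,2:end)'*delta3).*(a2(2:end,:).*(1-a2(2:end,:)));  % [2 x 3]
Theta1_grad = (delta2*a1')/m;          % [2 x 5], no loops anywhere
Theta2_grad = (delta3*a2')/m;          % [4 x 3]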