# Machine Learning Algorithms

Linear Regression

- Learning algorithm using gradient descent to perform linear regression on input data with any # of variables
- Batch gradient descent
- Scatter, regression line plots of data
- Applied examples with one and two features
- Making predictions using obtained regression
- Contour, surface plots of cost function
- Cost function for varying alpha

[Results]    [Algorithm]
// Normalization
function [X_norm, mu, sigma] = featureNormalize(X)
% FEATURENORMALIZE  Standardize every column (feature) of X.
%   [X_norm, mu, sigma] = featureNormalize(X)
%   X      - (training examples) x (features) matrix
%   X_norm - X with each column shifted by its mean and divided by its
%            sample standard deviation
%   mu     - 1 x (features) row vector of column means
%   sigma  - 1 x (features) row vector of column standard deviations

m = size(X, 1);          % # of training examples (rows). length(X) would
                         % return the # of columns whenever n > m.
mu = sum(X, 1) / m;      % sum down each column, average over m
sigma = std(X, 0, 1);    % explicit dim: a single-row X must not be
                         % treated as one long vector

% A constant column has sigma = 0; dividing would give 0/0 = NaN.
% Divide by 1 instead so the column normalizes to all zeros. The
% returned sigma still reports the true value (0).
scale = sigma;
scale(scale == 0) = 1;

% X(row, col) = (training ex., feature): subtract mean, divide by std
X_norm = (X - repmat(mu, m, 1)) ./ repmat(scale, m, 1);

end
=================================================================
function [theta, J_history] = gradientDescent(X, y, theta, ...
                                              alpha, num_iters)
% GRADIENTDESCENT  Batch gradient descent for linear regression.
%   [theta, J_history] = gradientDescent(X, y, theta, alpha, num_iters)
%   X         - (training examples) x (parameters) design matrix
%   y         - column vector of outputs, one per training example
%   theta     - initial parameter column vector; the updated vector is
%               returned
%   alpha     - learning rate
%   num_iters - number of update steps to take
%   J_history - num_iters x 1 vector; entry k is the cost computed by
%               computeCost right after the k-th update

m = length(y);                     % # of training examples
J_history = zeros(num_iters, 1);

for iter = 1:num_iters
    h = X * theta;                     % predictions, same size as y
    delta = (1/m) * (X' * (h - y));    % see [1*]
    theta = theta - alpha * delta;     % update all parameters at once
    J_history(iter) = computeCost(X, y, theta);
end

end
=================================================================
// Cost Function
function J = computeCost(X, y, theta)
% COMPUTECOST  Squared-error cost of a linear model.
%   J = computeCost(X, y, theta) returns the sum of squared differences
%   between the predictions for X under theta and the outputs y,
%   divided by twice the number of training examples.

numExamples = length(y);

% X' lists features down the rows and training examples across the
% columns; transposing the product back makes it the same shape as y.
predictions = (theta' * X')';

squaredErrors = (predictions - y) .^ 2;   % elementwise
J = 1/(2*numExamples) * sum(squaredErrors);

end


// [1*]
% Takes element-wise product of (h-y) and training examples, per
% feature, and sums them:

% (h - y) = [a1
%            a2
%            a3
%            ...
%            am]
% Xtp = [b1 b2 b3 ... bm
%        c1 c2 c3 ... cm]
%
% Xtp*(h - y) = [b1*a1 + b2*a2 + ... + bm*am
%                c1*a1 + c2*a2 + ... + cm*am]
%             = [delta1
%                delta2]

[Functions]
function plotData(x, y)
% PLOTDATA  Scatter-plot y against x as red crosses in a new figure.
figure;                               % open a new figure window
plot(x, y, 'rx', 'MarkerSize', 7);
xlabel('Population (in 10,000s)');
ylabel('Profit (in $10,000s)');
end

function plotSurf(theta0_vals, theta1_vals, theta)
% PLOTSURF  Surface and contour plots of the cost over a theta grid.
%   theta0_vals, theta1_vals - grid values for the two parameters
%   theta                    - parameter vector marked with a red cross
%                              on the contour plot
%   The cost is evaluated with computeCost on 'sample_data1.txt'.
data = load('sample_data1.txt');
X = data(:, 1);
y = data(:, 2);
m = length(y);
X = [ones(m, 1), X];                  % add intercept term

% initialize J_vals, then fill it out over the grid
J_vals = zeros(length(theta0_vals), length(theta1_vals));
for i = 1:length(theta0_vals)
    for j = 1:length(theta1_vals)
        t = [theta0_vals(i); theta1_vals(j)];
        J_vals(i,j) = computeCost(X, y, t);
    end
end
J_vals = J_vals';  % transpose before calling surf to match axes

figure;
surf(theta0_vals, theta1_vals, J_vals)
xlabel('\theta_0'); ylabel('\theta_1');

figure;
% plot J_vals as 20 contours log-spaced between 0.01 and 1000
contour(theta0_vals, theta1_vals, J_vals, logspace(-2, 3, 20))
xlabel('\theta_0'); ylabel('\theta_1');
hold on;
title('Cost function contours', 'FontSize', 14);
plot(theta(1), theta(2), 'rx', 'MarkerSize', 10, 'LineWidth', 2);
end

function printSample(X, y)
% PRINTSAMPLE  Print the first 10 rows of a two-feature data set.
fprintf('First 10 examples from the dataset: \n');
fprintf(' x = [%.0f %.0f], y = %.0f \n', ...
        [X(1:10,:) y(1:10,:)]');
end

function plotCostFunc(J_history1, J_history2, J_history3, J_history4)
% PLOTCOSTFUNC  Overlay four cost histories against iteration number.
%   Each J_historyK is the cost vector returned by gradientDescent for
%   one learning rate; the legend labels them 0.01, 0.03, 0.1 and 1.33.
figure;
plot(1:numel(J_history1), J_history1, '-b', 'LineWidth', 2);
hold on
plot(1:numel(J_history2), J_history2, '-r', 'LineWidth', 2);
plot(1:numel(J_history3), J_history3, '-g', 'LineWidth', 2);
% fourth curve must show J_history4 (it previously re-plotted J_history3)
plot(1:numel(J_history4), J_history4, '-k', 'LineWidth', 2);

xlabel('Number of iterations');
ylabel('Cost J');
title('J vs. # of iterations', 'FontSize', 14)
L1 = legend('alpha = 0.01','alpha = 0.03','alpha = 0.1', 'alpha = 1.33');
L1.FontSize = 11;
axis([0 400 0 7*10^10]);
end

function predictPrice(sqft, rooms, theta)
% PREDICTPRICE  Print the predicted price for a given house.
%   sqft, rooms - raw (un-normalized) feature values
%   theta       - 3 x 1 parameters learned on the normalized features
%   The normalization constants are recomputed from the two-feature
%   training file, 'sample_data2.txt'.
data = load('sample_data2.txt');
X = data(:, 1:2);
[~, mu, sigma] = featureNormalize(X);   % ~ discards the unused output

% theta was fit on normalized inputs, so BOTH features are normalized
% with the training mean and standard deviation before predicting.
sqft_norm  = (sqft  - mu(1)) / sigma(1);
rooms_norm = (rooms - mu(2)) / sigma(2);

price = theta(1) + theta(2)*sqft_norm + theta(3)*rooms_norm;
fprintf(['Predicted price of a %d sq-ft, %d rooms house ' ...
         '(using gradient descent):\n$%.2f\n'], sqft, rooms, price);
end


[Application] Predict sales profits from population (single feature)
% Load the example data: column 1 is the input, column 2 the output.
data = load('sample_data1.txt');
X = data(:, 1);
y = data(:, 2);
m = length(y);          % # of training examples

% 2D scatter of the raw data, drawn before the intercept column exists.
plotData(X, y);

% Prepend a column of ones so theta(1) acts as the intercept term.
X = [ones(m, 1), X];

// Run gradient descent and store cost function history
% Settings for the run
num_iter = 1500;        % # of iterations
theta = zeros(2, 1);    % starting parameters
alpha = 0.01;           % learning rate

% No trailing semicolon on the next lines: the values are meant to be
% echoed to the console.
J = computeCost(X, y, theta)     % cost at the starting theta
J = computeCost(X, y, [-1; 2])   % second check of the cost function

% Run gradient descent and display the resulting theta.
theta = gradientDescent(X, y, theta, alpha, num_iter)
================================================================

% Overlay the fitted line on the existing scatter plot.
hold on;
plot(X(:,2), X*theta, '-', 'LineWidth', 2)

L1 = legend('Training data', 'Linear regression');
set(L1, 'FontSize', 11);
title('Profit vs. Population', 'FontSize', 14)

hold off  % later plots should not be added to this figure

// Predict values for population sizes of 35,000 & 70,000
% The model is fit on population in units of 10,000 and profit in units
% of $10,000 (see the axis labels in plotData), so 35,000 and 70,000
% people enter as 3.5 and 7. Multiplying by 10000 reports the profit in
% dollars.
predict1 = [1, 3.5]*theta*10000
predict2 = [1, 7]*theta*10000
================================================================

// Visualize J
% 100 x 100 grid of (theta0, theta1) pairs over which J is evaluated
theta1_vals = linspace(-1, 4, 100);
theta0_vals = linspace(-10, 10, 100);

% Surface and contour plots of J, with the learned theta marked
plotSurf(theta0_vals, theta1_vals, theta)

[Application] Predict house prices from sq-footage & # of rooms (2 features)
% Load the example data: columns 1-2 are the inputs, column 3 the output.
data = load('sample_data2.txt');
X = data(:, 1:2);
y = data(:, 3);
m = length(y);                         % # of training examples

printSample(X, y);                     % first 10 training examples

% Normalize features (zero mean, scaled by standard deviation), then
% prepend the intercept column of ones.
[X, mu, sigma] = featureNormalize(X);
X = [ones(m, 1), X];

// Run gradient descent and store cost function history
num_iter = 400;         % # of iterations used for every run

% Each run starts from an all-zero theta and records its own cost
% history. Only alpha changes between runs.
alpha = 0.01;
[theta, J_history1] = gradientDescent(X, y, zeros(3, 1), alpha, num_iter);

alpha = 0.03;
[theta, J_history2] = gradientDescent(X, y, zeros(3, 1), alpha, num_iter);

alpha = 0.1;
[theta, J_history3] = gradientDescent(X, y, zeros(3, 1), alpha, num_iter);

% NOTE(review): theta left in the workspace comes from this last run.
alpha = 1.33;           % found by trial & error for plotting
[theta, J_history4] = gradientDescent(X, y, zeros(3, 1), alpha, num_iter);
====================================================================

% Display gradient descent's result (no semicolon: echo to console),
% then draw the convergence graph for the four recorded histories.
theta
plotCostFunc(J_history1, J_history2, J_history3, J_history4)

// Estimate the price of a 1650 sq-ft, 3 room house
% Raw (un-normalized) description of the house to price
sqft = 1650;
rooms = 3;

predictPrice(sqft, rooms, theta)   % prints the estimate


[Data + Code]  sample_data1.txt, sample_data2.txt
linreg_ML.zip (algorithm + function code; data)

Logistic Regression

- Learning algorithm using gradient descent to perform logistic regression on input data with any # of variables
- Feature-mapping for non-linear fitting
- Applied examples with linear & non-linear decision boundaries
- Making predictions using obtained regression
- Line, contour plots of decision boundaries
- Decision boundaries for varying lambda

[Results]    [Algorithm]
// Cost Function (SIMPLIFIED - one iter)
function [J, grad] = costFunction(theta, X, y)
% COSTFUNCTION  Logistic-regression cost and its gradient.
%   [J, grad] = costFunction(theta, X, y)
%   theta - parameter column vector
%   X     - (training examples) x (parameters) design matrix
%   y     - column vector of 0/1 labels
%   J     - cost at theta
%   grad  - gradient of J with respect to theta, same size as theta
m = length(y);
h = sigmoid(X*theta);          % predicted probabilities, m x 1

J = (-1/m)*(y'*log(h) + (1-y')*log(1-h));

% grad must be assigned: callers request it as a second output, and
% fminunc needs it when the 'GradObj' option is 'on'.
grad = (1/m)*(X'*(h - y));
end
=================================================================
// Prediction
function accur = predict(theta, X, y)
% PREDICT  Accuracy of learned logistic-regression parameters on (X, y).
%   accur = predict(theta, X, y) labels each row of X as 1 when
%   sigmoid(X*theta) >= 0.5 and as 0 otherwise, then returns the
%   fraction of those labels that equal y.

% Threshold all examples at once instead of looping row by row.
p = double(sigmoid(X*theta) >= 0.5);

% Averaging the truth value of 'p is y' gives the prediction accuracy.
accur = mean(double(p == y));
end
=================================================================
// Regularized Cost Function (SIMPLIFIED - one iter)
function [J, grad] = costFunctionReg(theta, X, y, lambda)
% COSTFUNCTIONREG  Regularized logistic-regression cost and gradient.
%   [J, grad] = costFunctionReg(theta, X, y, lambda)
%   theta  - parameter column vector; theta(1) is not penalized
%   X      - (training examples) x (parameters) design matrix
%   y      - column vector of 0/1 labels
%   lambda - regularization strength
%   J      - regularized cost at theta
%   grad   - gradient of J with respect to theta, same size as theta
m = length(y);
h = sigmoid(X*theta);          % predicted probabilities, m x 1

J = (-1/m)*(y'*log(h) + (1-y')*log(1-h)) + ...
    (lambda/(2*m))*sum(theta(2:end).^2);

% grad must be assigned: callers request it as a second output, and
% fminunc needs it when the 'GradObj' option is 'on'.
grad = (1/m)*(X'*(h - y));
% The penalty skips theta(1), so its derivative only touches 2:end.
grad(2:end) = grad(2:end) + (lambda/m)*theta(2:end);
end
=================================================================
// Feature mapping
function out = mapFeature(X1, X2)
% MAPFEATURE  Expand two feature columns into polynomial terms.
%   out = mapFeature(X1, X2) returns a matrix whose first column is all
%   ones, followed by every product X1.^a .* X2.^b with a + b running
%   from 1 up to 6. Columns are ordered by total power, and within one
%   total power by increasing power of X2.
degree = 6;
m = size(X1, 1);
num_feat = sum(1:degree+1);   % a total power d contributes d + 1 terms
out = zeros(m, num_feat);
out(:, 1) = 1;                % leading column of ones

col = 1;                      % last column filled so far
for total = 1:degree
    for pow2 = 0:total        % powers (total - pow2, pow2) sum to total
        col = col + 1;
        out(:, col) = (X1.^(total - pow2)) .* (X2.^pow2);
    end
end
end


[Notes]
- To compute the decision boundary, find relation that satisfies $$\bn{\theta}^T\bn{x}=0$$:
$$\up 0 = x_0\theta_0 + x_1\theta_1 + x_2\theta_2; x_0 = 1 \Rightarrow 0 = \theta_0 + x_1\theta_1 + x_2\theta_2$$
Solve for $$\up x_2$$: $$\boxed{x_2 = -(\theta_0 + x_1\theta_1)/\theta_2}$$
$$\up$$- To draw a line, we need two points; pick two values for $$x_1$$ - anything near the min and max of the training set will do. Compute the corresponding $$x_2$$, and draw a line through them.
$$\up$$- Use
pbaspect([1 1 1])
to display the plot with a square aspect ratio
- fminunc optimizes theta w/ up to 400 iterations (MaxIter), using the coded J and grad
[Functions]
function g = sigmoid(z)
% SIGMOID  Elementwise logistic function, g = 1 ./ (1 + e.^(-z)).
%   z may be a scalar, vector or matrix; g has the same size.

% Use the built-in exp rather than a truncated literal for e: a
% 10-digit constant limits the result to roughly 10 correct digits.
g = 1 ./ (1 + exp(-z));
end

function plotData(X, y)
% PLOTDATA  Scatter the two columns of X in a new figure, by label.
%   Rows with y == 1 are drawn as black '+' markers, rows with y == 0
%   as yellow-filled black circles.
figure;
hold on;

isPos = (y == 1);   % logical masks for the positive
isNeg = (y == 0);   % and negative examples

plot(X(isPos,1), X(isPos,2), 'k+', 'LineWidth',2, 'MarkerSize',7);
plot(X(isNeg,1), X(isNeg,2), 'ko', 'MarkerFaceColor','y', ...
     'MarkerSize',7);

hold off;
end

function plotDecisionBoundary(theta, X, y)
% PLOTDECISIONBOUNDARY  Plot the data and the boundary given by theta.
%   X carries the intercept column first; columns 2 and 3 are the two
%   plotted features. With at most 3 columns the boundary is drawn as a
%   straight line; otherwise it is drawn as the zero-level contour of
%   mapFeature(u, v)*theta over a grid.
plotData(X(:,2:3), y);
hold on;

if size(X, 2) > 3
    % Evaluate theta' * (mapped features) on a 50 x 50 grid.
    gridU = linspace(-1, 1.5, 50);
    gridV = linspace(-1, 1.5, 50);
    score = zeros(length(gridU), length(gridV));
    for a = 1:length(gridU)
        for b = 1:length(gridV)
            score(a,b) = mapFeature(gridU(a), gridV(b))*theta;
        end
    end
    score = score';   % transpose for contour plotting

    % [0 0] requests the single contour level 0
    contour(gridU, gridV, score, [0 0], 'LineWidth', 2)
else
    % Two end points, 2 units beyond the range of the first feature;
    % the second coordinate follows from theta' * x = 0 (see Notes).
    lineX = [min(X(:,2))-2, max(X(:,2))+2];
    lineY = (-1./theta(3)).*(theta(2).*lineX + theta(1));
    plot(lineX, lineY, 'LineWidth', 2)

    title('Decision boundary (Linear)', 'FontSize', 14)
    axis([30, 100, 30, 100])
end

hold off
end

% labels1.m
% Axis labels for the exam-score plot; applied to the current figure.
hold on;
xlabel('Exam 1 score');
ylabel('Exam 2 score');
hold off;

% labels2.m
% Axis labels and legend for the microchip-test plot; applied to the
% current figure.
hold on;
xlabel('Microchip Test 1');
ylabel('Microchip Test 2');
legend('y = 1', 'y = 0');
hold off;

% dispResults1.m
% Prints the workspace variables cost and grad set by the caller.
fprintf('Cost at theta: %f\n', cost);
fprintf('Gradient at theta: \n');
fprintf(' %f \n', grad);   % one value per line; the header used to be
                           % printed with nothing after it

% dispResults2.m
% Prints the workspace variable cost and the first five entries of the
% workspace variable grad, one per line.
fprintf('Cost at theta: %f\n', cost);

fprintf('Gradient at theta - first five vals:\n');
fprintf(' %f \n', grad(1:5));


[Application] Predict admission chance from test scores

(linear boundary)

% Load the data: columns 1-2 are the inputs, column 3 the 0/1 label.
data = load('sample_data1.txt');
X = data(:, 1:2);
y = data(:, 3);

% Visualize the data and label the axes.
plotData(X, y);
labels1;

% m = # of training examples, n = # of features
[m, n] = size(X);
X = [ones(m, 1), X];        % add intercept term

% Initial cost & gradient at an all-zero theta
init_theta = zeros(n+1, 1);
[cost, grad] = costFunction(init_theta, X, y);
dispResults1

% Same check at a non-zero theta
test_theta = [-24; 0.2; 0.2];
[cost, grad] = costFunction(test_theta, X, y);
dispResults1
==================================================================
% // Optimizing using fminunc

% 'GradObj' 'on': the objective returns its gradient as a second output.
% 'MaxIter' 400: cap on the number of iterations.
options = optimset('GradObj', 'on', 'MaxIter', 400);
[theta, cost] = fminunc(@(t)(costFunction(t, X, y)), init_theta, options);

% Draw the learned decision boundary and label the axes.
plotDecisionBoundary(theta, X, y);
labels1;
==================================================================
% // Predicting & Accuracies
% With the parameters learned, use them to predict an outcome for a new
% input and to measure how well the model reproduces the training
% labels. The example input is a pair of exam scores, 45 and 85.

accur = predict(theta, X, y);       % accuracy on the training set
prob = sigmoid([1 45 85]*theta);    % leading 1 is the intercept term

fprintf(['For a student with scores 45 and 85, we predict an ' ...
         'admission probability of %f\n'], prob);
fprintf('Train Accuracy: %f\n', accur);

[Application] Predict chip viability from prior tests

(non-linear bound.)

% Load the data: columns 1-2 are the inputs, column 3 the 0/1 label.
data = load('sample_data2.txt');
X = data(:, 1:2);
y = data(:, 3);

plotData(X, y);
labels2;

% Add polynomial features so a non-linear boundary can be fit.
X = mapFeature(X(:,1), X(:,2));

% Initial cost and gradient for regularized logistic regression
init_theta = zeros(size(X,2), 1);
lambda = 1;
[cost, grad] = costFunctionReg(init_theta, X, y, lambda);
dispResults2

% Same check for an all-ones theta with lambda = 10
test_theta = ones(size(X,2),1);
[cost, grad] = costFunctionReg(test_theta, X, y, 10);
dispResults2
==================================================================
% // Regularization & Accuracies
init_theta = zeros(size(X,2), 1);
lambda = 1;

% 'GradObj' 'on': the objective returns its gradient as a second output.
options = optimset('GradObj', 'on', 'MaxIter', 400);
[theta, J, exit_flag] = ...
    fminunc(@(t)(costFunctionReg(t, X, y, lambda)), init_theta, options);

% Boundary plot, labels, then a title that reports the lambda used
plotDecisionBoundary(theta, X, y);
labels2;
title(sprintf('Decision boundary (non-lin), lambda = %g', lambda), ...
      'FontSize', 14);

accur = predict(theta, X, y);   % accuracy on the training set
fprintf('Train Accuracy: %f\n', accur);


[Data + Code]  sample_data1.txt, sample_data2.txt
logreg_ML.zip (algorithm + function code; data)

