Machine learning assignment 4

This commit is contained in:
2017-05-16 22:11:33 +01:00
parent 18453d2e37
commit a9cee6e867
18 changed files with 246 additions and 0 deletions


@@ -0,0 +1,27 @@
global randomness_source
load a4_randomness_source
global data_sets
temp = load('data_set'); % same as in PA3
data_sets = temp.data;
global report_calls_to_sample_bernoulli
report_calls_to_sample_bernoulli = false;
test_rbm_w = a4_rand([100, 256], 0) * 2 - 1;
small_test_rbm_w = a4_rand([10, 256], 0) * 2 - 1;
temp = extract_mini_batch(data_sets.training, 1, 1);
data_1_case = sample_bernoulli(temp.inputs);
temp = extract_mini_batch(data_sets.training, 100, 10);
data_10_cases = sample_bernoulli(temp.inputs);
temp = extract_mini_batch(data_sets.training, 200, 37);
data_37_cases = sample_bernoulli(temp.inputs);
test_hidden_state_1_case = sample_bernoulli(a4_rand([100, 1], 0));
test_hidden_state_10_cases = sample_bernoulli(a4_rand([100, 10], 1));
test_hidden_state_37_cases = sample_bernoulli(a4_rand([100, 37], 2));
report_calls_to_sample_bernoulli = true;
clear temp;
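A minimal run-through of the assignment code, assuming the script above is saved as a4_init.m (the name that a4_main checks for); the hyperparameter values below are illustrative only, not the assignment's prescribed settings:

a4_init;                           % load the randomness source and the data set
a4_main(300, 0.02, 0.005, 1000);   % n_hid, lr_rbm, lr_classification, n_iterations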


@@ -0,0 +1,67 @@
% This file was published on Wed Nov 14 20:48:30 2012, UTC.
function a4_main(n_hid, lr_rbm, lr_classification, n_iterations)
% first, train the rbm
global report_calls_to_sample_bernoulli
report_calls_to_sample_bernoulli = false;
global data_sets
if prod(size(data_sets)) ~= 1,
error('You must run a4_init before you do anything else.');
end
rbm_w = optimize([n_hid, 256], ...
@(rbm_w, data) cd1(rbm_w, data.inputs), ... % discard labels
data_sets.training, ...
lr_rbm, ...
n_iterations);
% rbm_w is now a weight matrix of <n_hid> by <number of visible units, i.e. 256>
show_rbm(rbm_w);
input_to_hid = rbm_w;
% calculate the hidden layer representation of the labeled data
hidden_representation = logistic(input_to_hid * data_sets.training.inputs);
% train hid_to_class
data_2.inputs = hidden_representation;
data_2.targets = data_sets.training.targets;
hid_to_class = optimize([10, n_hid], @(model, data) classification_phi_gradient(model, data), data_2, lr_classification, n_iterations);
% report results
for data_details = reshape({'training', data_sets.training, 'validation', data_sets.validation, 'test', data_sets.test}, [2, 3]),
data_name = data_details{1};
data = data_details{2};
hid_input = input_to_hid * data.inputs; % size: <number of hidden units> by <number of data cases>
hid_output = logistic(hid_input); % size: <number of hidden units> by <number of data cases>
class_input = hid_to_class * hid_output; % size: <number of classes> by <number of data cases>
class_normalizer = log_sum_exp_over_rows(class_input); % log(sum(exp of class_input)) is what we subtract to get properly normalized log class probabilities. size: <1> by <number of data cases>
log_class_prob = class_input - repmat(class_normalizer, [size(class_input, 1), 1]); % log of probability of each class. size: <number of classes, i.e. 10> by <number of data cases>
error_rate = mean(double(argmax_over_rows(class_input) ~= argmax_over_rows(data.targets))); % scalar
loss = -mean(sum(log_class_prob .* data.targets, 1)); % scalar. select the right log class probability using that sum; then take the mean over all data cases.
fprintf('For the %s data, the classification cross-entropy loss is %f, and the classification error rate (i.e. the misclassification rate) is %f\n', data_name, loss, error_rate);
end
report_calls_to_sample_bernoulli = true;
end
function d_phi_by_d_input_to_class = classification_phi_gradient(input_to_class, data)
% This is about a very simple model: there's an input layer, and a softmax output layer. There are no hidden layers, and no biases.
% This returns the gradient of phi (a.k.a. negative the loss) for the <input_to_class> matrix.
% <input_to_class> is a matrix of size <number of classes> by <number of input units>.
% <data> has fields .inputs (matrix of size <number of input units> by <number of data cases>) and .targets (matrix of size <number of classes> by <number of data cases>).
% first: forward pass
class_input = input_to_class * data.inputs; % input to the components of the softmax. size: <number of classes> by <number of data cases>
class_normalizer = log_sum_exp_over_rows(class_input); % log(sum(exp)) is what we subtract to get normalized log class probabilities. size: <1> by <number of data cases>
log_class_prob = class_input - repmat(class_normalizer, [size(class_input, 1), 1]); % log of probability of each class. size: <number of classes> by <number of data cases>
class_prob = exp(log_class_prob); % probability of each class. Each column (i.e. each case) sums to 1. size: <number of classes> by <number of data cases>
% now: gradient computation
d_loss_by_d_class_input = -(data.targets - class_prob) ./ size(data.inputs, 2); % size: <number of classes> by <number of data cases>
d_loss_by_d_input_to_class = d_loss_by_d_class_input * data.inputs.'; % size: <number of classes> by <number of input units>
d_phi_by_d_input_to_class = -d_loss_by_d_input_to_class;
end
function indices = argmax_over_rows(matrix)
[dump, indices] = max(matrix);
end
function ret = log_sum_exp_over_rows(matrix)
% This computes log(sum(exp(a), 1)) in a numerically stable way
maxs_small = max(matrix, [], 1);
maxs_big = repmat(maxs_small, [size(matrix, 1), 1]);
ret = log(sum(exp(matrix - maxs_big), 1)) + maxs_small;
end
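A quick sanity check of classification_phi_gradient, not part of the assignment: compare the analytic gradient against a centered finite-difference estimate of phi on a tiny random problem. This assumes the function is callable from the command line (e.g. saved in its own .m file rather than as a subfunction of a4_main.m).

data.inputs = rand(5, 7);                        % 5 input units, 7 cases
I = eye(3);
data.targets = I(:, randi(3, 1, 7));             % one-hot targets for 3 classes
w = randn(3, 5) * 0.1;
phi = @(ww) mean(sum((ww * data.inputs - repmat(log(sum(exp(ww * data.inputs), 1)), [3, 1])) .* data.targets, 1));
analytic = classification_phi_gradient(w, data);
numeric = zeros(size(w));
delta = 1e-6;
for k = 1:numel(w)
    wp = w; wp(k) = wp(k) + delta;
    wm = w; wm(k) = wm(k) - delta;
    numeric(k) = (phi(wp) - phi(wm)) / (2 * delta);   % finite-difference estimate
end
max(abs(analytic(:) - numeric(:)))               % should be on the order of 1e-9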


@@ -0,0 +1,9 @@
function ret = a4_rand(requested_size, seed)
global randomness_source
start_i = mod(round(seed), round(size(randomness_source, 2) / 10)) + 1;
if start_i + prod(requested_size) >= size(randomness_source, 2) + 1,
error('a4_rand failed to generate an array of that size (too big)')
end
ret = reshape(randomness_source(start_i : start_i+prod(requested_size)-1), requested_size);
end
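A small illustration of the deterministic "random" numbers, using a stand-in randomness source instead of the supplied a4_randomness_source file (note this overwrites any source already loaded by a4_init):

global randomness_source
randomness_source = rand(1, 100000);   % stand-in; the assignment loads a fixed file
a = a4_rand([3, 4], 7);
b = a4_rand([3, 4], 7);
isequal(a, b)                          % 1: the same seed always gives the same array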


@@ -0,0 +1,17 @@
2.360736
1724.967611
4391.169583
-18.391
3166.216216
-4669.675676
-4716.094972
0.1
0.065889


@@ -0,0 +1,26 @@
function ret = cd1(rbm_w, visible_data)
% <rbm_w> is a matrix of size <number of hidden units> by <number of visible units>
% <visible_data> is a (possibly but not necessarily binary) matrix of size <number of visible units> by <number of data cases>
% The returned value is the gradient approximation produced by CD-1. It's of the same shape as <rbm_w>.
% This is needed for questions up to question #9
% v0 = visible_data;   % alternative: use the data directly, without binarizing it first
v0 = sample_bernoulli(visible_data);
h0p = visible_state_to_hidden_probabilities(rbm_w, v0);
h0 = sample_bernoulli(h0p);
v1p = hidden_state_to_visible_probabilities(rbm_w, h0);
v1 = sample_bernoulli(v1p);
h1p = visible_state_to_hidden_probabilities(rbm_w, v1);
% h1 = sample_bernoulli(h1p); % not needed: using the probabilities h1p directly gives a lower-variance estimate than sampling h1 would.
ret = configuration_goodness_gradient(v0, h0) - configuration_goodness_gradient(v1, h1p);
end
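A shape check, assuming a4_init has already been run so that small_test_rbm_w (10 hidden by 256 visible) and data_10_cases (256 visible units by 10 cases) exist in the workspace:

grad = cd1(small_test_rbm_w, data_10_cases);
size(grad)          % [10 256], the same shape as small_test_rbm_w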


@@ -0,0 +1,9 @@
function G = configuration_goodness(rbm_w, visible_state, hidden_state)
% <rbm_w> is a matrix of size <number of hidden units> by <number of visible units>
% <visible_state> is a binary matrix of size <number of visible units> by <number of configurations that we're handling in parallel>.
% <hidden_state> is a binary matrix of size <number of hidden units> by <number of configurations that we're handling in parallel>.
% This returns a scalar: the mean over cases of the goodness (negative energy) of the described configurations.
cases = size( visible_state, 2 );
sisj_wij = rbm_w .* (hidden_state * visible_state');
G = sum( sum( sisj_wij )) / cases;
end
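A tiny worked example with made-up numbers: for a single configuration, the goodness is h' * w * v, i.e. the sum of the weights w_ij over the pairs where both the hidden unit i and the visible unit j are on.

w = [1 2; 3 4];                       % 2 hidden x 2 visible
v = [1; 0];                           % visible unit 1 on
h = [0; 1];                           % hidden unit 2 on
configuration_goodness(w, v, h)       % = w(2,1) = 3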


@@ -0,0 +1,8 @@
function d_G_by_rbm_w = configuration_goodness_gradient(visible_state, hidden_state)
% <visible_state> is a binary matrix of size <number of visible units> by <number of configurations that we're handling in parallel>.
% <hidden_state> is a (possibly but not necessarily binary) matrix of size <number of hidden units> by <number of configurations that we're handling in parallel>.
% You don't need the model parameters for this computation.
% This returns the gradient of the mean configuration goodness (negative energy, as computed by function <configuration_goodness>) with respect to the model parameters. Thus, the returned value is of the same shape as the model parameters, which by the way are not provided to this function. Notice that we're talking about the mean over data cases (as opposed to the sum over data cases).
cases = size( visible_state, 2 );
d_G_by_rbm_w = (hidden_state * visible_state') ./ cases;
end
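An illustration with two configurations (made-up values): each entry (i, j) of the result is the average over cases of hidden_state(i) * visible_state(j).

v = [1 0; 0 1; 1 1];                          % 3 visible units, 2 cases
h = [1 0; 0 1];                               % 2 hidden units, 2 cases
configuration_goodness_gradient(v, h)         % = [0.5 0 0.5; 0 0.5 0.5]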

Binary file not shown.


@@ -0,0 +1,4 @@
function describe_matrix(matrix)
fprintf('Describing a matrix of size %d by %d. The mean of the elements is %f. The sum of the elements is %f\n', size(matrix, 1), size(matrix, 2), mean(matrix(:)), sum(matrix(:)))
end


@@ -0,0 +1,5 @@
function mini_batch = extract_mini_batch(data_set, start_i, n_cases)
mini_batch.inputs = data_set.inputs(:, start_i : start_i + n_cases - 1);
mini_batch.targets = data_set.targets(:, start_i : start_i + n_cases - 1);
end


@@ -0,0 +1,9 @@
function visible_probability = hidden_state_to_visible_probabilities(rbm_w, hidden_state)
% <rbm_w> is a matrix of size <number of hidden units> by <number of visible units>
% <hidden_state> is a binary matrix of size <number of hidden units> by <number of configurations that we're handling in parallel>.
% The returned value is a matrix of size <number of visible units> by <number of configurations that we're handling in parallel>.
% This takes in the (binary) states of the hidden units, and returns the activation probabilities of the visible units, conditional on those states.
deltaE = rbm_w' * hidden_state;
visible_probability = logistic(deltaE);
end
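A quick numerical example with made-up weights: each visible unit's probability is the logistic of the total input it receives from the active hidden units.

w = [2 -2; 0.5 1];                                % 2 hidden x 2 visible
h = [1; 0];                                       % only hidden unit 1 is on
hidden_state_to_visible_probabilities(w, h)       % = logistic([2; -2]), roughly [0.88; 0.12]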


@@ -0,0 +1,4 @@
function ret = logistic(input)
ret = 1 ./ (1 + exp(-input));
end

Binary file not shown.


@@ -0,0 +1,19 @@
function model = optimize(model_shape, gradient_function, training_data, learning_rate, n_iterations)
% This trains a model that's defined by a single matrix of weights.
% <model_shape> is the shape of the array of weights.
% <gradient_function> is a function that takes parameters <model> and <data> and returns the gradient (or approximate gradient in the case of CD-1) of the function that we're maximizing. Note the contrast with the loss function that we saw in PA3, which we were minimizing. The returned gradient is an array of the same shape as the provided <model> parameter.
% This uses mini-batches of size 100, momentum of 0.9, no weight decay, and no early stopping.
% This returns the matrix of weights of the trained model.
model = (a4_rand(model_shape, prod(model_shape)) * 2 - 1) * 0.1;
momentum_speed = zeros(model_shape);
mini_batch_size = 100;
start_of_next_mini_batch = 1;
for iteration_number = 1:n_iterations,
mini_batch = extract_mini_batch(training_data, start_of_next_mini_batch, mini_batch_size);
start_of_next_mini_batch = mod(start_of_next_mini_batch + mini_batch_size, size(training_data.inputs, 2));
gradient = gradient_function(model, mini_batch);
momentum_speed = 0.9 * momentum_speed + gradient;
model = model + momentum_speed * learning_rate;
end
end
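A usage sketch mirroring the call made inside a4_main, assuming a4_init has been run; the number of hidden units, learning rate, and iteration count here are illustrative only:

rbm_w = optimize([30, 256], @(w, d) cd1(w, d.inputs), data_sets.training, 0.01, 500);
show_rbm(rbm_w);          % visualize the learned receptive fields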


@@ -0,0 +1,9 @@
function binary = sample_bernoulli(probabilities)
global report_calls_to_sample_bernoulli
if report_calls_to_sample_bernoulli,
fprintf('sample_bernoulli() was called with a matrix of size %d by %d. ', size(probabilities, 1), size(probabilities, 2));
end
seed = sum(probabilities(:));
binary = +(probabilities > a4_rand(size(probabilities), seed)); % the "+" is to avoid the "logical" data type, which just confuses things.
end
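A small illustration, assuming the randomness source is loaded (e.g. by a4_init): each entry becomes 1 with the given probability, but deterministically, because the threshold values come from a4_rand seeded with the sum of the probabilities.

probs = [0.1 0.5 0.9; 0.9 0.5 0.1];
sample_bernoulli(probs)       % a 2 x 3 matrix of 0s and 1s, identical on every call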


@@ -0,0 +1,23 @@
function show_rbm(rbm_w)
n_hid = size(rbm_w, 1);
n_rows = ceil(sqrt(n_hid));
blank_lines = 4;
distance = 16 + blank_lines;
to_show = zeros([n_rows * distance + blank_lines, n_rows * distance + blank_lines]);
for i = 0:n_hid-1,
row_i = floor(i / n_rows);
col_i = mod(i, n_rows);
pixels = reshape(rbm_w(i+1, :), [16, 16]).';
row_base = row_i*distance + blank_lines;
col_base = col_i*distance + blank_lines;
to_show(row_base+1:row_base+16, col_base+1:col_base+16) = pixels;
end
extreme = max(abs(to_show(:)));
try
imshow(to_show, [-extreme, extreme]);
title('hidden units of the RBM');
catch err
fprintf('Failed to display the RBM. No big deal (you do not need the display to finish the assignment), but you are missing out on an interesting picture.\n');
end
end


@@ -0,0 +1,10 @@
function hidden_probability = visible_state_to_hidden_probabilities(rbm_w, visible_state)
% <rbm_w> is a matrix of size <number of hidden units> by <number of visible units>
% <visible_state> is a binary matrix of size <number of visible units> by <number of configurations that we're handling in parallel>.
% The returned value is a matrix of size <number of hidden units> by <number of configurations that we're handling in parallel>.
% This takes in the (binary) states of the visible units, and returns the activation probabilities of the hidden units conditional on those states.
deltaE = rbm_w * visible_state;
hidden_probability = logistic(deltaE);
end
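A shape check, assuming a4_init has been run: with the 100-by-256 test weights and a 256-by-10 batch of binary visible states, the result is a 100-by-10 matrix of hidden probabilities.

p = visible_state_to_hidden_probabilities(test_rbm_w, data_10_cases);
size(p)          % [100 10]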