Machine learning assignment 4

This commit is contained in:
2017-05-16 22:11:33 +01:00
parent 18453d2e37
commit a9cee6e867
18 changed files with 246 additions and 0 deletions


@@ -0,0 +1,27 @@
global randomness_source
load a4_randomness_source
global data_sets
temp = load('data_set'); % same as in PA3
data_sets = temp.data;
global report_calls_to_sample_bernoulli
report_calls_to_sample_bernoulli = false;
test_rbm_w = a4_rand([100, 256], 0) * 2 - 1;
small_test_rbm_w = a4_rand([10, 256], 0) * 2 - 1;
temp = extract_mini_batch(data_sets.training, 1, 1);
data_1_case = sample_bernoulli(temp.inputs);
temp = extract_mini_batch(data_sets.training, 100, 10);
data_10_cases = sample_bernoulli(temp.inputs);
temp = extract_mini_batch(data_sets.training, 200, 37);
data_37_cases = sample_bernoulli(temp.inputs);
test_hidden_state_1_case = sample_bernoulli(a4_rand([100, 1], 0));
test_hidden_state_10_cases = sample_bernoulli(a4_rand([100, 10], 1));
test_hidden_state_37_cases = sample_bernoulli(a4_rand([100, 37], 2));
report_calls_to_sample_bernoulli = true;
clear temp;
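A minimal run-through of the assignment code, assuming the script above is saved as a4_init.m (the name that a4_main checks for); the hyperparameter values below are illustrative only, not the assignment's prescribed settings:

a4_init;                           % load the randomness source and the data set
a4_main(300, 0.02, 0.005, 1000);   % n_hid, lr_rbm, lr_classification, n_iterations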


@@ -0,0 +1,67 @@
% This file was published on Wed Nov 14 20:48:30 2012, UTC.
function a4_main(n_hid, lr_rbm, lr_classification, n_iterations)
% first, train the rbm
global report_calls_to_sample_bernoulli
report_calls_to_sample_bernoulli = false;
global data_sets
if prod(size(data_sets)) ~= 1,
error('You must run a4_init before you do anything else.');
end
rbm_w = optimize([n_hid, 256], ...
@(rbm_w, data) cd1(rbm_w, data.inputs), ... % discard labels
data_sets.training, ...
lr_rbm, ...
n_iterations);
% rbm_w is now a weight matrix of <n_hid> by <number of visible units, i.e. 256>
show_rbm(rbm_w);
input_to_hid = rbm_w;
% calculate the hidden layer representation of the labeled data
hidden_representation = logistic(input_to_hid * data_sets.training.inputs);
% train hid_to_class
data_2.inputs = hidden_representation;
data_2.targets = data_sets.training.targets;
hid_to_class = optimize([10, n_hid], @(model, data) classification_phi_gradient(model, data), data_2, lr_classification, n_iterations);
% report results
for data_details = reshape({'training', data_sets.training, 'validation', data_sets.validation, 'test', data_sets.test}, [2, 3]),
data_name = data_details{1};
data = data_details{2};
hid_input = input_to_hid * data.inputs; % size: <number of hidden units> by <number of data cases>
hid_output = logistic(hid_input); % size: <number of hidden units> by <number of data cases>
class_input = hid_to_class * hid_output; % size: <number of classes> by <number of data cases>
class_normalizer = log_sum_exp_over_rows(class_input); % log(sum(exp of class_input)) is what we subtract to get properly normalized log class probabilities. size: <1> by <number of data cases>
log_class_prob = class_input - repmat(class_normalizer, [size(class_input, 1), 1]); % log of probability of each class. size: <number of classes, i.e. 10> by <number of data cases>
error_rate = mean(double(argmax_over_rows(class_input) ~= argmax_over_rows(data.targets))); % scalar
loss = -mean(sum(log_class_prob .* data.targets, 1)); % scalar. select the right log class probability using that sum; then take the mean over all data cases.
fprintf('For the %s data, the classification cross-entropy loss is %f, and the classification error rate (i.e. the misclassification rate) is %f\n', data_name, loss, error_rate);
end
report_calls_to_sample_bernoulli = true;
end
function d_phi_by_d_input_to_class = classification_phi_gradient(input_to_class, data)
% This is about a very simple model: there's an input layer, and a softmax output layer. There are no hidden layers, and no biases.
% This returns the gradient of phi (a.k.a. negative the loss) for the <input_to_class> matrix.
% <input_to_class> is a matrix of size <number of classes> by <number of input units>.
% <data> has fields .inputs (matrix of size <number of input units> by <number of data cases>) and .targets (matrix of size <number of classes> by <number of data cases>).
% first: forward pass
class_input = input_to_class * data.inputs; % input to the components of the softmax. size: <number of classes> by <number of data cases>
class_normalizer = log_sum_exp_over_rows(class_input); % log(sum(exp)) is what we subtract to get normalized log class probabilities. size: <1> by <number of data cases>
log_class_prob = class_input - repmat(class_normalizer, [size(class_input, 1), 1]); % log of probability of each class. size: <number of classes> by <number of data cases>
class_prob = exp(log_class_prob); % probability of each class. Each column (i.e. each case) sums to 1. size: <number of classes> by <number of data cases>
% now: gradient computation
d_loss_by_d_class_input = -(data.targets - class_prob) ./ size(data.inputs, 2); % size: <number of classes> by <number of data cases>
d_loss_by_d_input_to_class = d_loss_by_d_class_input * data.inputs.'; % size: <number of classes> by <number of input units>
d_phi_by_d_input_to_class = -d_loss_by_d_input_to_class;
end
function indices = argmax_over_rows(matrix)
[dump, indices] = max(matrix);
end
function ret = log_sum_exp_over_rows(matrix)
% This computes log(sum(exp(a), 1)) in a numerically stable way
maxs_small = max(matrix, [], 1);
maxs_big = repmat(maxs_small, [size(matrix, 1), 1]);
ret = log(sum(exp(matrix - maxs_big), 1)) + maxs_small;
end
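A quick sanity check of classification_phi_gradient, not part of the assignment: compare the analytic gradient against a centered finite-difference estimate of phi on a tiny random problem. This assumes the function is callable from the command line (e.g. saved in its own .m file rather than as a subfunction of a4_main.m).

data.inputs = rand(5, 7);                        % 5 input units, 7 cases
I = eye(3);
data.targets = I(:, randi(3, 1, 7));             % one-hot targets for 3 classes
w = randn(3, 5) * 0.1;
phi = @(ww) mean(sum((ww * data.inputs - repmat(log(sum(exp(ww * data.inputs), 1)), [3, 1])) .* data.targets, 1));
analytic = classification_phi_gradient(w, data);
numeric = zeros(size(w));
delta = 1e-6;
for k = 1:numel(w)
    wp = w; wp(k) = wp(k) + delta;
    wm = w; wm(k) = wm(k) - delta;
    numeric(k) = (phi(wp) - phi(wm)) / (2 * delta);   % finite-difference estimate
end
max(abs(analytic(:) - numeric(:)))               % should be on the order of 1e-9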


@@ -0,0 +1,9 @@
function ret = a4_rand(requested_size, seed)
global randomness_source
start_i = mod(round(seed), round(size(randomness_source, 2) / 10)) + 1;
if start_i + prod(requested_size) >= size(randomness_source, 2) + 1,
error('a4_rand failed to generate an array of that size (too big)')
end
ret = reshape(randomness_source(start_i : start_i+prod(requested_size)-1), requested_size);
end
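A small illustration of the deterministic "random" numbers, using a stand-in randomness source instead of the supplied a4_randomness_source file (note this overwrites any source already loaded by a4_init):

global randomness_source
randomness_source = rand(1, 100000);   % stand-in; the assignment loads a fixed file
a = a4_rand([3, 4], 7);
b = a4_rand([3, 4], 7);
isequal(a, b)                          % 1: the same seed always gives the same array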


@@ -0,0 +1,17 @@
2.360736
1724.967611
4391.169583
-18.391
3166.216216
-4669.675676
-4716.094972
0.1
0.065889


@@ -0,0 +1,26 @@
function ret = cd1(rbm_w, visible_data)
% <rbm_w> is a matrix of size <number of hidden units> by <number of visible units>
% <visible_data> is a (possibly but not necessarily binary) matrix of size <number of visible units> by <number of data cases>
% The returned value is the gradient approximation produced by CD-1. It's of the same shape as <rbm_w>.
% This is needed for questions up to question #9
% v0 = visible_data;   % alternative: use the data directly, without binarizing it first
v0 = sample_bernoulli(visible_data);
h0p = visible_state_to_hidden_probabilities(rbm_w, v0);
h0 = sample_bernoulli(h0p);
v1p = hidden_state_to_visible_probabilities(rbm_w, h0);
v1 = sample_bernoulli(v1p);
h1p = visible_state_to_hidden_probabilities(rbm_w, v1);
% h1 = sample_bernoulli(h1p); % not needed: using the probabilities h1p directly gives a lower-variance estimate than sampling h1 would.
ret = configuration_goodness_gradient(v0, h0) - configuration_goodness_gradient(v1, h1p);
end
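A shape check, assuming a4_init has already been run so that small_test_rbm_w (10 hidden by 256 visible) and data_10_cases (256 visible units by 10 cases) exist in the workspace:

grad = cd1(small_test_rbm_w, data_10_cases);
size(grad)          % [10 256], the same shape as small_test_rbm_w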


@@ -0,0 +1,9 @@
function G = configuration_goodness(rbm_w, visible_state, hidden_state)
% <rbm_w> is a matrix of size <number of hidden units> by <number of visible units>
% <visible_state> is a binary matrix of size <number of visible units> by <number of configurations that we're handling in parallel>.
% <hidden_state> is a binary matrix of size <number of hidden units> by <number of configurations that we're handling in parallel>.
% This returns a scalar: the mean over cases of the goodness (negative energy) of the described configurations.
cases = size( visible_state, 2 );
sisj_wij = rbm_w .* (hidden_state * visible_state');
G = sum( sum( sisj_wij )) / cases;
end
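A tiny worked example with made-up numbers: for a single configuration, the goodness is h' * w * v, i.e. the sum of the weights w_ij over the pairs where both the hidden unit i and the visible unit j are on.

w = [1 2; 3 4];                       % 2 hidden x 2 visible
v = [1; 0];                           % visible unit 1 on
h = [0; 1];                           % hidden unit 2 on
configuration_goodness(w, v, h)       % = w(2,1) = 3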


@@ -0,0 +1,8 @@
function d_G_by_rbm_w = configuration_goodness_gradient(visible_state, hidden_state)
% <visible_state> is a binary matrix of size <number of visible units> by <number of configurations that we're handling in parallel>.
% <hidden_state> is a (possibly but not necessarily binary) matrix of size <number of hidden units> by <number of configurations that we're handling in parallel>.
% You don't need the model parameters for this computation.
% This returns the gradient of the mean configuration goodness (negative energy, as computed by function <configuration_goodness>) with respect to the model parameters. Thus, the returned value is of the same shape as the model parameters, which by the way are not provided to this function. Notice that we're talking about the mean over data cases (as opposed to the sum over data cases).
cases = size( visible_state, 2 );
d_G_by_rbm_w = (hidden_state * visible_state') ./ cases;
end
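An illustration with two configurations (made-up values): each entry (i, j) of the result is the average over cases of hidden_state(i) * visible_state(j).

v = [1 0; 0 1; 1 1];                          % 3 visible units, 2 cases
h = [1 0; 0 1];                               % 2 hidden units, 2 cases
configuration_goodness_gradient(v, h)         % = [0.5 0 0.5; 0 0.5 0.5]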

Binary file not shown.


@@ -0,0 +1,4 @@
function describe_matrix(matrix)
fprintf('Describing a matrix of size %d by %d. The mean of the elements is %f. The sum of the elements is %f\n', size(matrix, 1), size(matrix, 2), mean(matrix(:)), sum(matrix(:)))
end


@@ -0,0 +1,5 @@
function mini_batch = extract_mini_batch(data_set, start_i, n_cases)
mini_batch.inputs = data_set.inputs(:, start_i : start_i + n_cases - 1);
mini_batch.targets = data_set.targets(:, start_i : start_i + n_cases - 1);
end


@@ -0,0 +1,9 @@
function visible_probability = hidden_state_to_visible_probabilities(rbm_w, hidden_state)
% <rbm_w> is a matrix of size <number of hidden units> by <number of visible units>
% <hidden_state> is a binary matrix of size <number of hidden units> by <number of configurations that we're handling in parallel>.
% The returned value is a matrix of size <number of visible units> by <number of configurations that we're handling in parallel>.
% This takes in the (binary) states of the hidden units, and returns the activation probabilities of the visible units, conditional on those states.
deltaE = rbm_w' * hidden_state;
visible_probability = logistic(deltaE);
end
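A quick numerical example with made-up weights: each visible unit's probability is the logistic of the total input it receives from the active hidden units.

w = [2 -2; 0.5 1];                                % 2 hidden x 2 visible
h = [1; 0];                                       % only hidden unit 1 is on
hidden_state_to_visible_probabilities(w, h)       % = logistic([2; -2]), roughly [0.88; 0.12]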


@@ -0,0 +1,4 @@
function ret = logistic(input)
ret = 1 ./ (1 + exp(-input));
end

Binary file not shown.


@@ -0,0 +1,19 @@
function model = optimize(model_shape, gradient_function, training_data, learning_rate, n_iterations)
% This trains a model that's defined by a single matrix of weights.
% <model_shape> is the shape of the array of weights.
% <gradient_function> is a function that takes parameters <model> and <data> and returns the gradient (or approximate gradient in the case of CD-1) of the function that we're maximizing. Note the contrast with the loss function that we saw in PA3, which we were minimizing. The returned gradient is an array of the same shape as the provided <model> parameter.
% This uses mini-batches of size 100, momentum of 0.9, no weight decay, and no early stopping.
% This returns the matrix of weights of the trained model.
model = (a4_rand(model_shape, prod(model_shape)) * 2 - 1) * 0.1;
momentum_speed = zeros(model_shape);
mini_batch_size = 100;
start_of_next_mini_batch = 1;
for iteration_number = 1:n_iterations,
mini_batch = extract_mini_batch(training_data, start_of_next_mini_batch, mini_batch_size);
start_of_next_mini_batch = mod(start_of_next_mini_batch + mini_batch_size, size(training_data.inputs, 2));
gradient = gradient_function(model, mini_batch);
momentum_speed = 0.9 * momentum_speed + gradient;
model = model + momentum_speed * learning_rate;
end
end
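A usage sketch mirroring the call made inside a4_main, assuming a4_init has been run; the number of hidden units, learning rate, and iteration count here are illustrative only:

rbm_w = optimize([30, 256], @(w, d) cd1(w, d.inputs), data_sets.training, 0.01, 500);
show_rbm(rbm_w);          % visualize the learned receptive fields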


@@ -0,0 +1,9 @@
function binary = sample_bernoulli(probabilities)
global report_calls_to_sample_bernoulli
if report_calls_to_sample_bernoulli,
fprintf('sample_bernoulli() was called with a matrix of size %d by %d. ', size(probabilities, 1), size(probabilities, 2));
end
seed = sum(probabilities(:));
binary = +(probabilities > a4_rand(size(probabilities), seed)); % the "+" is to avoid the "logical" data type, which just confuses things.
end
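A small illustration, assuming the randomness source is loaded (e.g. by a4_init): each entry becomes 1 with the given probability, but deterministically, because the threshold values come from a4_rand seeded with the sum of the probabilities.

probs = [0.1 0.5 0.9; 0.9 0.5 0.1];
sample_bernoulli(probs)       % a 2 x 3 matrix of 0s and 1s, identical on every call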


@@ -0,0 +1,23 @@
function show_rbm(rbm_w)
n_hid = size(rbm_w, 1);
n_rows = ceil(sqrt(n_hid));
blank_lines = 4;
distance = 16 + blank_lines;
to_show = zeros([n_rows * distance + blank_lines, n_rows * distance + blank_lines]);
for i = 0:n_hid-1,
row_i = floor(i / n_rows);
col_i = mod(i, n_rows);
pixels = reshape(rbm_w(i+1, :), [16, 16]).';
row_base = row_i*distance + blank_lines;
col_base = col_i*distance + blank_lines;
to_show(row_base+1:row_base+16, col_base+1:col_base+16) = pixels;
end
extreme = max(abs(to_show(:)));
try
imshow(to_show, [-extreme, extreme]);
title('hidden units of the RBM');
catch err
fprintf('Failed to display the RBM. No big deal (you do not need the display to finish the assignment), but you are missing out on an interesting picture.\n');
end
end


@@ -0,0 +1,10 @@
function hidden_probability = visible_state_to_hidden_probabilities(rbm_w, visible_state)
% <rbm_w> is a matrix of size <number of hidden units> by <number of visible units>
% <visible_state> is a binary matrix of size <number of visible units> by <number of configurations that we're handling in parallel>.
% The returned value is a matrix of size <number of hidden units> by <number of configurations that we're handling in parallel>.
% This takes in the (binary) states of the visible units, and returns the activation probabilities of the hidden units conditional on those states.
deltaE = rbm_w * visible_state;
hidden_probability = logistic(deltaE);
end
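A shape check, assuming a4_init has been run: with the 100-by-256 test weights and a 256-by-10 batch of binary visible states, the result is a 100-by-10 matrix of hidden probabilities.

p = visible_state_to_hidden_probabilities(test_rbm_w, data_10_cases);
size(p)          % [100 10]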