Coursera: machine learning ex6.

2015-03-21 11:35:11 +04:00
parent 97bf613378
commit 29c396874e
28 changed files with 3870 additions and 0 deletions
--- a/machine_learning/mlclass-ex6-008/mlclass-ex6/processEmail.m
+++ b/machine_learning/mlclass-ex6-008/mlclass-ex6/processEmail.m
@@ -0,0 +1,123 @@
+function word_indices = processEmail(email_contents)
+%PROCESSEMAIL preprocesses the body of an email and
+%returns a list of word_indices
+%   word_indices = PROCESSEMAIL(email_contents) preprocesses
+%   the body of an email and returns a list of indices of the
+%   words contained in the email. Indices refer to positions in the
+%   list returned by getVocabList(), appear in email order, and are
+%   stacked as a column. The processed tokens are also printed.
+%
+
+% Load Vocabulary
+vocabList = getVocabList();
+
+% Init return value
+word_indices = [];
+
+% ========================== Preprocess Email ===========================
+
+% Find the Headers ( \n\n and remove )
+% Uncomment the following lines if you are working with raw emails with the
+% full headers
+
+% hdrstart = strfind(email_contents, ([char(10) char(10)]));
+% email_contents = email_contents(hdrstart(1):end);
+
+% Lower case
+email_contents = lower(email_contents);
+
+% Strip all HTML
+% Looks for any expression that starts with < and ends with >, has no
+% < or > inside the tag, and replaces it with a space
+email_contents = regexprep(email_contents, '<[^<>]+>', ' ');
+
+% Handle Numbers
+% Look for one or more characters between 0-9
+email_contents = regexprep(email_contents, '[0-9]+', 'number');
+
+% Handle URLS
+% Look for strings starting with http:// or https://
+email_contents = regexprep(email_contents, ...
+                           '(http|https)://[^\s]*', 'httpaddr');
+
+% Handle Email Addresses
+% Look for strings with @ in the middle
+email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');
+
+% Handle $ sign
+email_contents = regexprep(email_contents, '[$]+', 'dollar');
+
+% ========================== Tokenize Email ===========================
+
+% Output the email to screen as well
+fprintf('\n==== Processed Email ====\n\n');
+
+% Process file
+l = 0;   % characters already printed on the current output line
+
+while ~isempty(email_contents)
+
+    % Tokenize and also get rid of any punctuation
+    [str, email_contents] = ...
+       strtok(email_contents, ...
+              [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
+
+    % Remove any non alphanumeric characters
+    str = regexprep(str, '[^a-zA-Z0-9]', '');
+
+    % Stem the word
+    % (the porterStemmer sometimes has issues, so we use a try catch block)
+    try
+        str = porterStemmer(strtrim(str));
+    catch
+        % Stemming failed: drop this token and go on with the next one
+        continue;
+    end
+
+    % Skip the word if it is too short
+    if length(str) < 1
+       continue;
+    end
+
+    % Look up the word in the dictionary and add to word_indices if
+    % found
+    % ====================== YOUR CODE HERE ======================
+    % Instructions: Fill in this function to add the index of str to
+    %               word_indices if it is in the vocabulary. At this point
+    %               of the code, you have a stemmed word from the email in
+    %               the variable str. You should look up str in the
+    %               vocabulary list (vocabList). If a match exists, you
+    %               should add the index of the word to the word_indices
+    %               vector. For example, if vocabList{18} = 'action' and
+    %               str = 'action', then you should add 18 to word_indices
+    %               (e.g., word_indices = [word_indices ; 18]; ).
+    %
+    % Note: vocabList{idx} returns the word with index idx in the
+    %       vocabulary list.
+    %
+    % Note: You can use strcmp(str1, str2) to compare two strings (str1 and
+    %       str2). It will return 1 only if the two strings are equivalent.
+    %
+    % strcmp gives a logical match mask over vocabList; find(..., 1) turns
+    % it into the position of the first match (empty when there is none).
+    idx = find(strcmp(str, vocabList), 1);
+    if ~isempty(idx)
+        word_indices = [word_indices; idx];
+    end
+
+    % =============================================================
+
+    % Print to screen, ensuring that the output lines are not too long
+    if (l + length(str) + 1) > 78
+        fprintf('\n');
+        l = 0;
+    end
+    fprintf('%s ', str);
+    l = l + length(str) + 1;
+
+end
+
+% Print footer
+fprintf('\n\n=========================\n');
+
+end