diff --git a/ex6/processEmail.m b/ex6/processEmail.m
index 234bae0..ba74f51 100644
--- a/ex6/processEmail.m
+++ b/ex6/processEmail.m
@@ -1,9 +1,9 @@
 function word_indices = processEmail(email_contents)
 %PROCESSEMAIL preprocesses a the body of an email and
-%returns a list of word_indices 
-%   word_indices = PROCESSEMAIL(email_contents) preprocesses 
-%   the body of an email and returns a list of indices of the 
-%   words contained in the email. 
+%returns a list of word_indices
+%   word_indices = PROCESSEMAIL(email_contents) preprocesses
+%   the body of an email and returns a list of indices of the
+%   words contained in the email.
 %
 
 % Load Vocabulary
@@ -60,13 +60,13 @@ while ~isempty(email_contents)
     [str, email_contents] = ...
        strtok(email_contents, ...
               [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
-   
+
     % Remove any non alphanumeric characters
     str = regexprep(str, '[^a-zA-Z0-9]', '');
 
-    % Stem the word 
+    % Stem the word
     % (the porterStemmer sometimes has issues, so we use a try catch block)
-    try str = porterStemmer(strtrim(str)); 
+    try str = porterStemmer(strtrim(str));
     catch str = ''; continue;
     end;
 
@@ -87,24 +87,22 @@ while ~isempty(email_contents)
     %               vector. Concretely, if str = 'action', then you should
     %               look up the vocabulary list to find where in vocabList
     %               'action' appears. For example, if vocabList{18} =
-    %               'action', then, you should add 18 to the word_indices 
+    %               'action', then, you should add 18 to the word_indices
     %               vector (e.g., word_indices = [word_indices ; 18]; ).
-    % 
+    %
     % Note: vocabList{idx} returns a the word with index idx in the
     %       vocabulary list.
-    % 
+    %
     % Note: You can use strcmp(str1, str2) to compare two strings (str1 and
     %       str2). It will return 1 only if the two strings are equivalent.
     %
 
-    
-    
-    
-    
-    
-    
-    
-    
+    % Vectorised lookup: strcmp against the whole cell array gives a logical
+    % mask, and find(..., 1) keeps only the first match (same as loop+break).
+    idx = find(strcmp(str, vocabList), 1);
+    if ~isempty(idx)
+        word_indices = [word_indices ; idx];
+    end
 
 
     % =============================================================