2014-11-10 23:44:39 +01:00
function word_indices = processEmail ( email_contents )
% PROCESSEMAIL preprocesses a the body of an email and
2014-11-13 23:47:04 +01:00
% returns a list of word_indices
% word_indices = PROCESSEMAIL ( email_contents ) preprocesses
% the body of an email and returns a list of indices of the
% words contained in the email .
2014-11-10 23:44:39 +01:00
%
% Load Vocabulary
vocabList = getVocabList ( ) ;
% Init return value
word_indices = [ ] ;
% === === === === === === === === = = Preprocess Email === === === === === === === === ===
% Find the Headers ( \ n \ n and remove )
% Uncomment the following lines if you are working with raw emails with the
% full headers
% hdrstart = strfind ( email_contents , ( [ char ( 10 ) char ( 10 ) ] ) ) ;
% email_contents = email_contents ( hdrstart ( 1 ) : end ) ;
% Lower case
email_contents = lower ( email_contents ) ;
% Strip all HTML
% Looks for any expression that starts with < and ends with > and replace
% and does not have any < or > in the tag it with a space
email_contents = regexprep ( email_contents , ' < [ ^ <> ] + > ' , ' ' ) ;
% Handle Numbers
% Look for one or more characters between 0 -9
email_contents = regexprep ( email_contents , ' [ 0 -9 ] + ' , ' number ' ) ;
% Handle URLS
% Look for strings starting with http : // or https : //
email_contents = regexprep ( email_contents , . . .
' ( http | https ) : // [ ^ \ s ] * ' , ' httpaddr ' ) ;
% Handle Email Addresses
% Look for strings with @ in the middle
email_contents = regexprep ( email_contents , ' [ ^ \ s ] + @ [ ^ \ s ] + ' , ' emailaddr ' ) ;
% Handle $ sign
email_contents = regexprep ( email_contents , ' [ $ ] + ' , ' dollar ' ) ;
% === === === === === === === === = = Tokenize Email === === === === === === === === ===
% Output the email to screen as well
fprintf ( ' \ n === = Processed Email === = \ n \ n ' ) ;
% Process file
l = 0 ;
while ~ isempty ( email_contents )
% Tokenize and also get rid of any punctuation
[ str , email_contents ] = . . .
strtok ( email_contents , . . .
[ ' @ $ / # . - : & * + = [ ] ? ! ( ) { } , ' ' " > _ < ; % ' char ( 10 ) char ( 13 ) ] ) ;
2014-11-13 23:47:04 +01:00
2014-11-10 23:44:39 +01:00
% Remove any non alphanumeric characters
str = regexprep ( str , ' [ ^ a - zA - Z0 -9 ] ' , ' ' ) ;
2014-11-13 23:47:04 +01:00
% Stem the word
2014-11-10 23:44:39 +01:00
% ( the porterStemmer sometimes has issues , so we use a try catch block )
2014-11-13 23:47:04 +01:00
try str = porterStemmer ( strtrim ( str ) ) ;
2014-11-10 23:44:39 +01:00
catch str = ' ' ; continue ;
end ;
% Skip the word if it is too short
if length ( str ) < 1
continue ;
end
% Look up the word in the dictionary and add to word_indices if
% found
% === === === === === === === = YOUR CODE HERE === === === === === === === =
% Instructions : Fill in this function to add the index of str to
% word_indices if it is in the vocabulary . At this point
% of the code , you have a stemmed word from the email in
% the variable str . You should look up str in the
% vocabulary list ( vocabList ) . If a match exists , you
% should add the index of the word to the word_indices
% vector . Concretely , if str = ' action ' , then you should
% look up the vocabulary list to find where in vocabList
% ' action ' appears . For example , if vocabList { 18 } =
2014-11-13 23:47:04 +01:00
% ' action ' , then , you should add 18 to the word_indices
2014-11-10 23:44:39 +01:00
% vector ( e . g . , word_indices = [ word_indices ; 18 ] ; ) .
2014-11-13 23:47:04 +01:00
%
2014-11-10 23:44:39 +01:00
% Note : vocabList { idx } returns a the word with index idx in the
% vocabulary list .
2014-11-13 23:47:04 +01:00
%
2014-11-10 23:44:39 +01:00
% Note : You can use strcmp ( str1 , str2 ) to compare two strings ( str1 and
% str2 ) . It will return 1 only if the two strings are equivalent .
%
2014-11-13 23:47:04 +01:00
for idx = 1 : length ( vocabList )
if strcmp ( str , vocabList { idx } )
word_indices = [ word_indices ; idx ] ;
break ;
end
end
2014-11-10 23:44:39 +01:00
% === === === === === === === === === === === === === === === === === === === === =
% Print to screen , ensuring that the output lines are not too long
if ( l + length ( str ) + 1 ) > 78
fprintf ( ' \ n ' ) ;
l = 0 ;
end
fprintf ( ' % s ' , str ) ;
l = l + length ( str ) + 1 ;
end
% Print footer
fprintf ( ' \ n \ n === === === === === === === === = \ n ' ) ;
end