1
0
Fork 0

Preprocess email

master
neingeist 10 years ago
parent 203cbc997c
commit f0d4b4d208

@ -1,9 +1,9 @@
function word_indices = processEmail(email_contents) function word_indices = processEmail(email_contents)
%PROCESSEMAIL preprocesses the body of an email and %PROCESSEMAIL preprocesses the body of an email and
%returns a list of word_indices %returns a list of word_indices
% word_indices = PROCESSEMAIL(email_contents) preprocesses % word_indices = PROCESSEMAIL(email_contents) preprocesses
% the body of an email and returns a list of indices of the % the body of an email and returns a list of indices of the
% words contained in the email. % words contained in the email.
% %
% Load Vocabulary % Load Vocabulary
@ -60,13 +60,13 @@ while ~isempty(email_contents)
[str, email_contents] = ... [str, email_contents] = ...
strtok(email_contents, ... strtok(email_contents, ...
[' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]); [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
% Remove any non-alphanumeric characters % Remove any non-alphanumeric characters
str = regexprep(str, '[^a-zA-Z0-9]', ''); str = regexprep(str, '[^a-zA-Z0-9]', '');
% Stem the word % Stem the word
% (the porterStemmer sometimes has issues, so we use a try catch block) % (the porterStemmer sometimes has issues, so we use a try catch block)
try str = porterStemmer(strtrim(str)); try str = porterStemmer(strtrim(str));
catch str = ''; continue; catch str = ''; continue;
end; end;
@ -87,24 +87,22 @@ while ~isempty(email_contents)
% vector. Concretely, if str = 'action', then you should % vector. Concretely, if str = 'action', then you should
% look up the vocabulary list to find where in vocabList % look up the vocabulary list to find where in vocabList
% 'action' appears. For example, if vocabList{18} = % 'action' appears. For example, if vocabList{18} =
% 'action', then, you should add 18 to the word_indices % 'action', then, you should add 18 to the word_indices
% vector (e.g., word_indices = [word_indices ; 18]; ). % vector (e.g., word_indices = [word_indices ; 18]; ).
% %
% Note: vocabList{idx} returns the word with index idx in the % Note: vocabList{idx} returns the word with index idx in the
% vocabulary list. % vocabulary list.
% %
% Note: You can use strcmp(str1, str2) to compare two strings (str1 and % Note: You can use strcmp(str1, str2) to compare two strings (str1 and
% str2). It will return 1 only if the two strings are equivalent. % str2). It will return 1 only if the two strings are equivalent.
% %
for idx = 1:length(vocabList)
if strcmp(str, vocabList{idx})
word_indices = [word_indices ; idx];
break;
end
end
% ============================================================= % =============================================================