Preprocess email
This commit is contained in:
parent
203cbc997c
commit
f0d4b4d208
1 changed files with 16 additions and 18 deletions
|
@ -1,9 +1,9 @@
|
||||||
function word_indices = processEmail(email_contents)
|
function word_indices = processEmail(email_contents)
|
||||||
%PROCESSEMAIL preprocesses the body of an email and
|
%PROCESSEMAIL preprocesses the body of an email and
|
||||||
%returns a list of word_indices
|
%returns a list of word_indices
|
||||||
% word_indices = PROCESSEMAIL(email_contents) preprocesses
|
% word_indices = PROCESSEMAIL(email_contents) preprocesses
|
||||||
% the body of an email and returns a list of indices of the
|
% the body of an email and returns a list of indices of the
|
||||||
% words contained in the email.
|
% words contained in the email.
|
||||||
%
|
%
|
||||||
|
|
||||||
% Load Vocabulary
|
% Load Vocabulary
|
||||||
|
@ -60,13 +60,13 @@ while ~isempty(email_contents)
|
||||||
[str, email_contents] = ...
|
[str, email_contents] = ...
|
||||||
strtok(email_contents, ...
|
strtok(email_contents, ...
|
||||||
[' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
|
[' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
|
||||||
|
|
||||||
% Remove any non alphanumeric characters
|
% Remove any non alphanumeric characters
|
||||||
str = regexprep(str, '[^a-zA-Z0-9]', '');
|
str = regexprep(str, '[^a-zA-Z0-9]', '');
|
||||||
|
|
||||||
% Stem the word
|
% Stem the word
|
||||||
% (the porterStemmer sometimes has issues, so we use a try catch block)
|
% (the porterStemmer sometimes has issues, so we use a try catch block)
|
||||||
try str = porterStemmer(strtrim(str));
|
try str = porterStemmer(strtrim(str));
|
||||||
catch str = ''; continue;
|
catch str = ''; continue;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
@ -87,24 +87,22 @@ while ~isempty(email_contents)
|
||||||
% vector. Concretely, if str = 'action', then you should
|
% vector. Concretely, if str = 'action', then you should
|
||||||
% look up the vocabulary list to find where in vocabList
|
% look up the vocabulary list to find where in vocabList
|
||||||
% 'action' appears. For example, if vocabList{18} =
|
% 'action' appears. For example, if vocabList{18} =
|
||||||
% 'action', then, you should add 18 to the word_indices
|
% 'action', then, you should add 18 to the word_indices
|
||||||
% vector (e.g., word_indices = [word_indices ; 18]; ).
|
% vector (e.g., word_indices = [word_indices ; 18]; ).
|
||||||
%
|
%
|
||||||
% Note: vocabList{idx} returns the word with index idx in the
|
% Note: vocabList{idx} returns the word with index idx in the
|
||||||
% vocabulary list.
|
% vocabulary list.
|
||||||
%
|
%
|
||||||
% Note: You can use strcmp(str1, str2) to compare two strings (str1 and
|
% Note: You can use strcmp(str1, str2) to compare two strings (str1 and
|
||||||
% str2). It will return 1 only if the two strings are equivalent.
|
% str2). It will return 1 only if the two strings are equivalent.
|
||||||
%
|
%
|
||||||
|
|
||||||
|
for idx = 1:length(vocabList)
|
||||||
|
if strcmp(str, vocabList{idx})
|
||||||
|
word_indices = [word_indices ; idx];
|
||||||
|
break;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
% =============================================================
|
% =============================================================
|
||||||
|
|
||||||
|
|
Reference in a new issue