You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
139 lines
4.5 KiB
Matlab
139 lines
4.5 KiB
Matlab
10 years ago
|
%% Machine Learning Online Class
%  Exercise 6 | Spam Classification with SVMs
%
%  Instructions
%  ------------
%
%  This file contains code that helps you get started on the
%  exercise. You will need to complete the following functions:
%
%     gaussianKernel.m
%     dataset3Params.m
%     processEmail.m
%     emailFeatures.m
%
%  For this exercise, you will not need to change any code in this file,
%  or in any files other than those mentioned above.
%

%% Initialization
%  Start from a clean session: remove all workspace variables, close every
%  open figure window, and clear the command window.
clear; close all; clc

%% ==================== Part 1: Email Preprocessing ====================
%  To use an SVM to classify emails into Spam vs. Non-Spam, you first need
%  to convert each email into a vector of features. In this part, you will
%  implement the preprocessing steps for each email. You should
%  complete the code in processEmail.m to produce a word indices vector
%  for a given email.

fprintf('\nPreprocessing sample email (emailSample1.txt)\n');

% Read the sample email and convert it to a vector of word indices
file_contents = readFile('emailSample1.txt');
word_indices  = processEmail(file_contents);

% Print every word index on a single line, separated by spaces
fprintf('Word Indices: \n');
fprintf(' %d', word_indices);
fprintf('\n\n');

% Wait for the user before moving on to the next part
fprintf('Program paused. Press enter to continue.\n');
pause;

%% ==================== Part 2: Feature Extraction ====================
%  Now, you will convert each email into a vector of features in R^n.
%  You should complete the code in emailFeatures.m to produce a feature
%  vector for a given email.

fprintf('\nExtracting features from sample email (emailSample1.txt)\n');

% Re-read and re-process the sample email so this cell does not depend on
% variables left behind by an earlier cell, then build its feature vector.
file_contents = readFile('emailSample1.txt');
word_indices  = processEmail(file_contents);
features      = emailFeatures(word_indices);

% Report the size of the feature vector and how many entries are positive
fprintf('Length of feature vector: %d\n', length(features));
fprintf('Number of non-zero entries: %d\n', sum(features > 0));

% Wait for the user before moving on to the next part
fprintf('Program paused. Press enter to continue.\n');
pause;

%% =========== Part 3: Train Linear SVM for Spam Classification ========
%  In this section, you will train a linear classifier to determine if an
%  email is Spam or Not-Spam.

% Load the Spam Email dataset
% You will have X, y in your environment
load('spamTrain.mat');

fprintf('\nTraining Linear SVM (Spam Classification)\n')
fprintf('(this may take 1 to 2 minutes) ...\n')

% C is handed to svmTrain as its third argument (presumably the SVM
% regularization parameter -- confirm against svmTrain.m).
C = 0.1;
model = svmTrain(X, y, C, @linearKernel);

% Predict on the training set itself and report the percentage of
% predictions that match the labels in y.
p = svmPredict(model, X);

fprintf('Training Accuracy: %f\n', mean(double(p == y)) * 100);

%% =================== Part 4: Test Spam Classification ================
%  After training the classifier, we can evaluate it on a test set. We have
%  included a test set in spamTest.mat

% Load the test dataset
% You will have Xtest, ytest in your environment
load('spamTest.mat');

fprintf('\nEvaluating the trained Linear SVM on a test set ...\n')

% Predict on the held-out examples using the model trained in Part 3 and
% report the percentage of predictions that match the labels in ytest.
p = svmPredict(model, Xtest);

fprintf('Test Accuracy: %f\n', mean(double(p == ytest)) * 100);

% Announce the pause so the script does not appear to hang while it waits
% for the user to press enter.
fprintf('Program paused. Press enter to continue.\n');
pause;


%% ================= Part 5: Top Predictors of Spam ====================
%  Since the model we are training is a linear SVM, we can inspect the
%  weights learned by the model to understand better how it is determining
%  whether an email is spam or not. The following code finds the words with
%  the highest weights in the classifier. Informally, the classifier
%  'thinks' that these words are the most likely indicators of spam.
%

% Sort the weights in descending order and obtain the vocabulary list.
% idx maps each sorted position back to its original position in model.w,
% which is used below to look up the matching vocabulary entry.
[weight, idx] = sort(model.w, 'descend');
vocabList = getVocabList();

% Print the 15 highest-weighted words alongside their weights
fprintf('\nTop predictors of spam: \n');
for i = 1:15
    fprintf(' %-15s (%f) \n', vocabList{idx(i)}, weight(i));
end

fprintf('\n\n');
fprintf('\nProgram paused. Press enter to continue.\n');
pause;

%% =================== Part 6: Try Your Own Emails =====================
%  Now that you've trained the spam classifier, you can use it on your own
%  emails! In the starter code, we have included spamSample1.txt,
%  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
%  The following code reads in one of these emails and then uses your
%  learned SVM classifier to determine whether the email is Spam or
%  Not Spam

% Set the file to be read in (change this to spamSample2.txt,
% emailSample1.txt or emailSample2.txt to see different predictions on
% different email types). Try your own emails as well!
filename = 'spamSample1.txt';

% Read the email, convert it to word indices, build its feature vector,
% and classify it with the model trained in Part 3.
file_contents = readFile(filename);
word_indices  = processEmail(file_contents);
x             = emailFeatures(word_indices);
p = svmPredict(model, x);

fprintf('\nProcessed %s\n\nSpam Classification: %d\n', filename, p);
fprintf('(1 indicates spam, 0 indicates not spam)\n\n');
