I have to do a spam filter in python using scikit-learn, and I do not know how the function multinomialnb.fit(x,y)
works. What parameters should I put as x
e y
?
I have all the ham and spam data with countvectorizer
loaded in a vector, but I will not move forward.
I'm looking at an example of how it's done here , but I'm lost.
Could someone explain me?
This is my code:
######################################################
# Imports
######################################################
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
import numpy as np
import json
import glob
from sklearn import metrics
import random
######################################################
# Aux. functions
######################################################
# load_enron_folder: load training, validation and test sets from an enron path
def load_enron_folder(path):
### Load ham mails ###
ham_folder = path + '/ham/*.txt' # save path in ham_folder
print("Loading files:", ham_folder)
ham_list = glob.glob(ham_folder) # create a list with filenames on ham_folder
num_ham_mails = len(ham_list)
ham_mail = []
for i in range(0,num_ham_mails): # scroll through each item on num_ham_mails
ham_i_path = ham_list[i] # save in ham_i_path each item
ham_i_file = open(ham_i_path, 'rb') # open that file with read-binary permissions
ham_i_str = ham_i_file.read()
ham_i_text = ham_i_str.decode('utf-8',errors='ignore') # Convert to Unicode
ham_mail.append(ham_i_text) # Append to the mail structure
ham_i_file.close()
random.shuffle(ham_mail) # Random order
# Separate into training, validation and test
num_ham_training = int(round(0.8*num_ham_mails))
ham_training_mail = ham_mail[0:num_ham_training]
ham_training_labels = [0]*num_ham_training
num_ham_validation = int(round(0.1*num_ham_mails))
ham_validation_mail = ham_mail[num_ham_training:num_ham_training+num_ham_validation]
ham_validation_labels = [0]*num_ham_validation
num_ham_test = num_ham_mails - num_ham_training - num_ham_validation
ham_test_mail = ham_mail[num_ham_training+num_ham_validation:num_ham_mails]
ham_test_labels = [0]*num_ham_test
print("ham mails :", num_ham_mails)
print("..for training :", num_ham_training)
print("..for validation:", num_ham_validation)
print("..for testing :", num_ham_test)
### Load spam mails ###
spam_folder = path + '/spam/*.txt'
print("Loading files:", spam_folder)
spam_list = glob.glob(spam_folder)
num_spam_mails = len(spam_list)
spam_mail = []
for i in range(0,num_spam_mails):
spam_i_path = spam_list[i]
#print(spam_i_path)
spam_i_file = open(spam_i_path, 'rb')
spam_i_str = spam_i_file.read()
spam_i_text = spam_i_str.decode('utf-8',errors='ignore') # Convert to Unicode
spam_mail.append(spam_i_text) # Append to the mail structure
spam_i_file.close()
random.shuffle(spam_mail) # Random order
# Separate into training, validation and test
num_spam_training = int(round(0.8*num_spam_mails))
spam_training_mail = spam_mail[0:num_spam_training]
spam_training_labels = [1]*num_spam_training
num_spam_validation = int(round(0.1*num_spam_mails))
spam_validation_mail = spam_mail[num_spam_training:num_spam_training+num_spam_validation]
spam_validation_labels = [1]*num_spam_validation
num_spam_test = num_spam_mails - num_spam_training - num_spam_validation
spam_test_mail = spam_mail[num_spam_training+num_spam_validation:num_spam_mails]
spam_test_labels = [1]*num_spam_test
print("spam mails :", num_spam_mails)
print("..for training :", num_spam_training)
print("..for validation:", num_spam_validation)
print("..for testing :", num_spam_test)
### spam + ham together ###
training_mails = ham_training_mail + spam_training_mail
training_labels = ham_training_labels + spam_training_labels
validation_mails = ham_validation_mail + spam_validation_mail
validation_labels = ham_validation_labels + spam_validation_labels
test_mails = ham_test_mail + spam_test_mail
test_labels = ham_test_labels + spam_test_labels
data = {'training_mails': training_mails, 'training_labels': training_labels, 'validation_mails': validation_mails, 'validation_labels': validation_labels, 'test_mails': test_mails, 'test_labels': test_labels}
return data
# my code
def separate_data(list_mails_training):
random.shuffle(list_mails_training)
cv = CountVectorizer();
vec = cv.fit_transform(list_mails_training) #training_mails
vecform = TfidfTransformer().fit(vec)
vectrain = vecform.transform(vec)
classifier = MultinomialNB()
# sacar xtrain, ytrain para classifier.fit()
classifier.fit(list_mails_training)
# pred = classifier.predict(test_mails)
# metrics.confusion_matrix(test_mails,pred)
# metrics.precision_recall_curve(test_mails,pred)
# metrics.f1_score(test_mails,pred)
######################################################
# Main
######################################################
print("Starting...")
# Path to the folder containing the mails
folder_enron1 = '/home/dexras/spamIA/enron/enron1'
folder_enron2 = '/home/dexras/spamIA/enron/enron2'
folder_enron3 = '/home/dexras/spamIA/enron/enron3'
folder_enron4 = '/home/dexras/spamIA/enron/enron4'
folder_enron5 = '/home/dexras/spamIA/enron/enron5'
folder_enron6_test = '/home/dexras/spamIA/enron/enron6'
# Load mails
data1 = load_enron_folder(folder_enron1)
data2 = load_enron_folder(folder_enron2)
data3 = load_enron_folder(folder_enron3)
data4 = load_enron_folder(folder_enron4)
data5 = load_enron_folder(folder_enron5)
data6 = load_enron_folder(folder_enron6_test)
# Prepare data
training_mails = data1['training_mails']+data2['training_mails']+data3['training_mails']+data4['training_mails']+data5['training_mails']
training_labels = data1['training_labels']+data2['training_labels']+data3['training_labels']+data4['training_labels']+data5['training_labels']
validation_mails = data1['validation_mails']+data2['validation_mails']+data3['validation_mails']+data4['validation_mails']+data5['validation_mails']
validation_labels = data1['validation_labels']+data2['validation_labels']+data3['validation_labels']+data4['validation_labels']+data5['validation_labels']
# test_mails = data1['test_mails']+data2['test_mails']+data3['test_mails']+data4['test_mails']+data5['test_mails']+data6['test_mails']
# test_labels = data1['test_labels']+data2['test_labels']+data3['test_labels']+data4['test_labels']+data5['test_labels']+data6['test_labels']
test_mails = data6['test_mails']
test_labels = data6['test_labels']
#my code
separate_data(training_mails)
list_mails = training_mails + validation_mails