Spam filter, operation of MultinomialNB (sklearn.naive_bayes)

1

I have to build a spam filter in Python using scikit-learn, and I do not understand how the method MultinomialNB.fit(X, y) works. What should I pass as X and y?

I have all the ham and spam data loaded into a vector with CountVectorizer, but I am stuck and cannot move forward.

I'm looking at an example of how it's done here, but I'm lost.

Could someone explain it to me?

This is my code:

######################################################
# Imports
######################################################

import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
import numpy as np
import json
import glob
from sklearn import metrics
import random

######################################################
# Aux. functions
######################################################

# load_enron_folder: load training, validation and test sets from an enron path
def load_enron_folder(path):
    """Load one Enron folder and split it into training/validation/test sets.

    The folder must contain a ``ham`` and a ``spam`` sub-folder, each holding
    one ``.txt`` file per mail.  Every class is shuffled on its own and then
    split 80% / 10% / remainder, so each set keeps the folder's ham/spam
    proportion.

    Args:
        path: Folder that contains the ``ham`` and ``spam`` sub-folders.

    Returns:
        A dict with the keys ``training_mails``, ``training_labels``,
        ``validation_mails``, ``validation_labels``, ``test_mails`` and
        ``test_labels``.  Mails are ``str``; labels are ``0`` for ham and
        ``1`` for spam.  Inside each set all ham mails come first, followed
        by all spam mails, and ``*_labels[i]`` belongs to ``*_mails[i]``.
    """
    ham_sets = _load_enron_class(path, 'ham', 0)
    spam_sets = _load_enron_class(path, 'spam', 1)

    ### spam + ham together ###
    data = {}
    set_names = ('training', 'validation', 'test')
    for set_name, ham_set, spam_set in zip(set_names, ham_sets, spam_sets):
        ham_mails, ham_labels = ham_set
        spam_mails, spam_labels = spam_set
        data[set_name + '_mails'] = ham_mails + spam_mails
        data[set_name + '_labels'] = ham_labels + spam_labels

    return data


def _load_enron_class(path, kind, label):
    """Read, shuffle and split every mail of one class ('ham' or 'spam').

    Args:
        path: Enron folder containing the ``kind`` sub-folder.
        kind: Sub-folder name, also used in the progress messages.
        label: Integer label given to every mail of this class.

    Returns:
        Three ``(mails, labels)`` pairs, in the order training, validation,
        test.
    """
    pattern = path + '/' + kind + '/*.txt'
    print("Loading files:", pattern)

    mails = []
    for mail_path in glob.glob(pattern):
        # Read raw bytes and decode leniently: bytes that are not valid UTF-8
        # are dropped instead of aborting the whole load.  The ``with`` block
        # closes the file even if reading fails.
        with open(mail_path, 'rb') as mail_file:
            mails.append(mail_file.read().decode('utf-8', errors='ignore'))
    random.shuffle(mails)  # Random order before splitting

    # Separate into training, validation and test
    total = len(mails)
    num_training = int(round(0.8 * total))
    num_validation = int(round(0.1 * total))
    split_end = num_training + num_validation
    num_test = total - split_end  # whatever is left after the two rounded parts

    print((kind + " mails").ljust(16) + ":", total)
    print("..for training  :", num_training)
    print("..for validation:", num_validation)
    print("..for testing   :", num_test)

    return (
        (mails[:num_training], [label] * num_training),
        (mails[num_training:split_end], [label] * num_validation),
        (mails[split_end:total], [label] * num_test),
    )



    # my code
def separate_data(list_mails_training, list_labels_training=None):
    """Train a TF-IDF + multinomial Naive Bayes spam classifier.

    ``MultinomialNB.fit(X, y)`` needs two arguments: ``X`` is the numeric
    feature matrix (one row per mail, here the TF-IDF weighted word counts)
    and ``y`` is the list of class labels, one per row of ``X``.  Raw mail
    text cannot be passed to ``fit`` directly.

    Args:
        list_mails_training: List of mail texts (``str``) to train on.
        list_labels_training: Labels aligned with ``list_mails_training``
            (``0`` = ham, ``1`` = spam in this script).  It defaults to
            ``None`` only to keep the old one-argument signature callable;
            training cannot proceed without it.

    Returns:
        A ``(vectorizer, tfidf, classifier)`` tuple of fitted objects.  To
        classify new mails, run them through the same fitted objects:
        ``classifier.predict(tfidf.transform(vectorizer.transform(mails)))``.

    Raises:
        ValueError: If the labels are missing or their number differs from
            the number of mails.
    """
    if list_labels_training is None:
        raise ValueError(
            "separate_data needs the training labels: "
            "MultinomialNB.fit(X, y) requires one label per mail")
    if len(list_mails_training) != len(list_labels_training):
        raise ValueError(
            "got %d mails but %d labels"
            % (len(list_mails_training), len(list_labels_training)))

    # Shuffle mails and labels together, on a copy.  Shuffling only the mails
    # (or shuffling the caller's list in place) would break the mail/label
    # alignment and the classifier would learn from wrong labels.
    paired = list(zip(list_mails_training, list_labels_training))
    random.shuffle(paired)
    mails = [mail for mail, _ in paired]
    labels = [label for _, label in paired]

    # X: word counts per mail, then re-weighted with TF-IDF
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(mails)
    tfidf = TfidfTransformer().fit(counts)
    features = tfidf.transform(counts)

    # y: the labels, in the same (shuffled) order as the rows of X
    classifier = MultinomialNB()
    classifier.fit(features, labels)

    return vectorizer, tfidf, classifier



######################################################
# Main
######################################################

print("Starting...")

# Path to the folder containing the mails
folder_enron1 = '/home/dexras/spamIA/enron/enron1'
folder_enron2 = '/home/dexras/spamIA/enron/enron2'
folder_enron3 = '/home/dexras/spamIA/enron/enron3'
folder_enron4 = '/home/dexras/spamIA/enron/enron4'
folder_enron5 = '/home/dexras/spamIA/enron/enron5'
folder_enron6_test = '/home/dexras/spamIA/enron/enron6'
# Load mails
data1 = load_enron_folder(folder_enron1)
data2 = load_enron_folder(folder_enron2)
data3 = load_enron_folder(folder_enron3)
data4 = load_enron_folder(folder_enron4)
data5 = load_enron_folder(folder_enron5)
data6 = load_enron_folder(folder_enron6_test)
# Prepare data: enron1..enron5 provide the training and validation sets
training_sources = [data1, data2, data3, data4, data5]
training_mails = [mail for data in training_sources for mail in data['training_mails']]
training_labels = [label for data in training_sources for label in data['training_labels']]
validation_mails = [mail for data in training_sources for mail in data['validation_mails']]
validation_labels = [label for data in training_sources for label in data['validation_labels']]
# Only the test split of enron6 is used for testing; the test splits of
# enron1..enron5 are left out.
test_mails = data6['test_mails']
test_labels = data6['test_labels']

# Train the classifier: the mails become X, the labels are y
vectorizer, tfidf, classifier = separate_data(training_mails, training_labels)
# To evaluate, transform the test mails with the SAME fitted objects, e.g.:
#   pred = classifier.predict(tfidf.transform(vectorizer.transform(test_mails)))
#   metrics.confusion_matrix(test_labels, pred)
#   metrics.f1_score(test_labels, pred)
list_mails = training_mails + validation_mails
    
asked by user3138958 27.01.2017 в 22:17
source

1 answer

2

Hello, the basic usage would be the following:

def magic_function(train, train_tar, test, test_tar, test_una_muestra=None):
    """Fit a multinomial Naive Bayes model, print its test score and return it.

    Args:
        train: Feature matrix (array or sparse matrix) holding the features
            the model is trained on, one row per sample.
        train_tar: List/array with the targets, i.e. the class that each row
            of ``train`` belongs to.
        test: Feature matrix of the data set used for testing.  It must have
            the same number of feature columns as ``train``, so build it with
            the same fitted vectorizer that produced ``train``.
        test_tar: List/array with the targets of ``test``, as for ``fit``.
        test_una_muestra: Optional feature matrix of one or more new samples
            (for example a single vectorized e-mail).  When given, the
            predicted class of each sample is printed.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    # algorithm to use
    mnb = MultinomialNB()
    mnb.fit(train, train_tar)

    # show the result of evaluating the model on the test set
    # (mean accuracy of the predictions for ``test`` against ``test_tar``)
    print(mnb.score(test, test_tar))

    # to make a prediction for individual samples (here a single e-mail),
    # use the predict method
    if test_una_muestra is not None:
        print(mnb.predict(test_una_muestra))

    return mnb

I recommend checking the official scikit-learn documentation; it is explained better there. link

    
answered by 28.01.2017 в 08:18