I am trying to run the code given by Sachin Joglekar for a project that classifies toxic comments with a convolutional neural network, using Python 3, but I have run into several problems while setting it up on my machine. At the moment, when the tokenizer processes my texts (tokenizer.fit_on_texts(texts), see the traceback below), Python complains that a bytes-like object is required and that it got a 'dict'. So how do I turn a dict into bytes, or how should I adapt my code?
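From what I can tell, Tokenizer.fit_on_texts expects a plain list of Python strings. Is something like the sketch below (taking the 'comment_text' field of each dict-shaped row and skipping the .encode('ascii', 'replace') call from my script) the right way to adapt the code? The toy rows just mirror the data shown below.

from keras.preprocessing.text import Tokenizer

# Toy rows shaped like my data: dicts with 'comment_text' and 'id' keys.
rows = [
    {'comment_text': '== From RfC == \n\n The title is fine as it is, IMO.', 'id': '0000247867823ef7'},
    {'comment_text': "I don't anonymously edit articles at all.", 'id': '00017695ad8997eb'},
]

# Keep the comments as plain str instead of encoding them to bytes.
texts = [row['comment_text'].strip() for row in rows]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
print(tokenizer.texts_to_sequences(texts))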
Here is some of the data:
{'comment_text': "Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,", 'id': '00001cee341fdb12'}
{'comment_text': '== From RfC == \n\n The title is fine as it is, IMO.', 'id': '0000247867823ef7'}
{'comment_text': '" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / "', 'id': '00013b17ad220c46'}
{'comment_text': ":If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message.", 'id': '00017563c3f7919a'}
{'comment_text': "I don't anonymously edit articles at all.", 'id': '00017695ad8997eb'}
{'comment_text': 'Thank you for understanding. I think very highly of you and would not revert without discussion.', 'id': '0001ea8717f6de06'}
{'comment_text': 'Please do not add nonsense to Wikipedia. Such edits are considered vandalism and quickly undone. If you would like to experiment, please use the sandbox instead. Thank you. -', 'id': '00024115d4cbde0f'}
{'comment_text': ':Dear god this site is horrible.', 'id': '000247e83dcc1211'}
{'comment_text': '" \n Only a fool can believe in such numbers. \n The correct number lies between 10 000 to 15 000. \n Ponder the numbers carefully. \n\n This error will persist for a long time as it continues to reproduce... The latest reproduction I know is from ENCYCLOPÆDIA BRITANNICA ALMANAC 2008 wich states \n Magnittude: 8.7 (fair enough) \n victims: 70 000 (today 10 000 to 15 000 is not ""a lot"" so I guess people just come out with a number that impresses enough, I don\'t know. But I know this: it\'s just a shameless lucky number that they throw in the air. \n GC \n\n "', 'id': '00025358d4737918'}
{'comment_text': "== Double Redirects == \n\n When fixing double redirects, don't just blank the outer one, you need edit it to point it to the final target, unless you think it's inappropriate, in which case, it needs to be nominated at WP:RfD", 'id': '00026d1092fe71cc'}
Here is the error:
mike@mike-thinks:~/Kaggle$ python3 medium.py
Using TensorFlow backend.
Total number of texts: 159571
Traceback (most recent call last):
  File "medium.py", line 107, in <module>
    tokenizer, word_index, x_train, y_train, x_val, y_val = get_datasets(texts, targets)
  File "medium.py", line 37, in get_datasets
    tokenizer.fit_on_texts(texts)
  File "/usr/local/lib/python3.5/dist-packages/keras/preprocessing/text.py", line 194, in fit_on_texts
    self.split)
  File "/usr/local/lib/python3.5/dist-packages/keras/preprocessing/text.py", line 46, in text_to_word_sequence
    text = text.translate(translate_map)
TypeError: a bytes-like object is required, not 'dict'
Here is the entire code:
from backports import csv
import numpy as np
import os  # needed for os.path.join in load_glove_model below
import sys
import io
# Helps in reading long texts
csv.field_size_limit(sys.maxsize)
def get_texts_and_targets(filename):
    texts = []
    targets = []
    with io.open(filename, encoding='utf-8') as csvfile:
        readCSV = csv.reader(csvfile)
        for i, row in enumerate(readCSV):
            if i == 0:
                # Header row
                continue
            texts.append(row[1].strip().encode('ascii', 'replace'))
            targets.append(np.array([float(x) for x in row[2:]]))
    print("Total number of texts: %s" % len(texts))
    return texts, targets
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Max number of input words in any sample
MAX_SEQUENCE_LENGTH = 200
VALIDATION_SPLIT = 0.1
def get_datasets(texts, targets, tokenizer=None):
    if tokenizer is None:
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    targets = np.asarray(targets)
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    targets = targets[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
    x_train = data[:-nb_validation_samples]
    y_train = targets[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = targets[-nb_validation_samples:]
    return tokenizer, word_index, x_train, y_train, x_val, y_val
# For word embeddings, he used the Glove Twitter vectors with 100 dimensions.
from gensim.models import KeyedVectors
def load_glove_model():
    word2vec = KeyedVectors.load_word2vec_format(
        os.path.join(WORD2VEC_FOLDER, 'word2vec_twitter_glove.txt'),
        binary=False)
    return word2vec
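# (I assume the raw GloVe Twitter vectors first have to be converted to word2vec
# text format, e.g. with gensim's glove2word2vec script, so that
# load_word2vec_format can read 'word2vec_twitter_glove.txt' above.)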
# embedding layer in Keras
def get_embedding_layer(word_index, gensim_model):
    embedding_dim = len(gensim_model.wv['apple'])
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        if word in gensim_model.wv.vocab:
            embedding_matrix[i] = gensim_model.wv[word]
    embedding_layer = Embedding(len(word_index) + 1,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    return embedding_layer
# We use 2 Convolutional + Max-Pooling blocks followed by 3 dense layers:
from keras.layers import *
from keras.models import Model
N_TARGET_CLASSES = 6
def get_convnet_model(embedding_layer):
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    # Sigmoid (and not softmax) is the more appropriate output activation here, since
    # each sample can belong to multiple classes (a comment can be both an insult and obscene).
    preds = Dense(N_TARGET_CLASSES, activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    return model
# Training with Adagrad (with its default settings) as it worked best for this use-case.
texts, targets = get_texts_and_targets('data/train.csv')
tokenizer, word_index, x_train, y_train, x_val, y_val = get_datasets(texts, targets)
word2vec = load_glove_model()
embedding_layer = get_embedding_layer(word_index, word2vec)
model = get_convnet_model(embedding_layer)
# The binary_crossentropy objective is the Keras version of log-loss
model.compile(loss='binary_crossentropy',
              optimizer='adagrad',
              metrics=['accuracy'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=32, verbose=1)
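For completeness, here is a tiny standalone snippet that, as far as I can tell, reproduces the same TypeError as soon as the text is bytes rather than str, which makes me suspect the .encode('ascii', 'replace') call in get_texts_and_targets:

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

# Plain strings go through without a problem:
tokenizer.fit_on_texts(["Thank you for understanding."])

# Bytes seem to hit the same "a bytes-like object is required, not 'dict'"
# TypeError inside text_to_word_sequence:
tokenizer.fit_on_texts(["Thank you for understanding.".encode('ascii', 'replace')])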