What I'm trying to do is transform a column of the dataframe `data`, which is the result of merging two dataframes. Within this dataframe I have the following column: `data['genres']`.
The code:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the two tab-separated input files (paths are relative to the working directory).
tdata = pd.read_csv('../dB/title.basics.csv', encoding="utf-8", delimiter='\t', skip_blank_lines=True)
ratings = pd.read_csv('../dB/data.csv', encoding="utf-8", delimiter='\t', skip_blank_lines=True)
tdata.drop(['isAdult', 'endYear'], inplace=True, axis=1)

# No join key is given, so pandas merges on every column name the two frames share.
data = pd.merge(ratings, tdata)

# Turn the comma-separated genre string into a list of genre names.
# Missing values become an empty list.  The previous guard `x != np.nan` was
# always True (NaN never compares equal to anything), and `fillna('[]')`
# turned missing values into the bogus single genre '[]'.
data['genres'] = data['genres'].apply(lambda x: str(x).split(',') if pd.notna(x) else [])

numsVotes = ratings[ratings['numVotes'].notnull()]['numVotes'].astype('int64')  # vote counts
averageRatings = ratings[ratings['averageRating'].notnull()]['averageRating'].astype('double')  # average rating

# C: mean rating over all rated titles; m: vote count at the 95th percentile.
# Both are read later by weighted_rating().
C = averageRatings.mean()
C
m = numsVotes.quantile(0.95)
m

# Keep only titles with at least m votes and a known rating.  `.copy()` makes
# the result an independent frame, so the later column assignment on `data`
# does not write into a slice of the merged frame (SettingWithCopyWarning).
qualified_mask = (data['numVotes'] >= m) & data['numVotes'].notnull() & data['averageRating'].notnull()
data = data.loc[
    qualified_mask,
    ['primaryTitle', 'originalTitle', 'startYear', 'genres', 'numVotes', 'averageRating'],
].copy()
data.shape
### Function that computes the weighted rating of each movie
def weighted_rating(x):
    """Return the weighted rating for one row.

    `x` is a row-like object providing 'numVotes' and 'averageRating'.
    The row's own rating is blended with the module-level mean `C`; the
    row's weight is votes / (votes + m) and the mean's weight is
    m / (votes + m), where `m` is the module-level vote threshold.
    """
    votes = x['numVotes']
    rating = x['averageRating']
    total = votes + m
    return (votes / total * rating) + (m / total * C)
# Overwrite each row's rating with its weighted rating, then keep the
# 250 highest-rated rows.
data['averageRating'] = data.apply(weighted_rating, axis=1)
data = data.sort_values(by='averageRating', ascending=False).iloc[:250]
data.head(25)

# Build one row per (title, genre): spread each genre list into columns,
# stack them into a single Series keyed by the original row index, and
# join that Series back onto the frame without the list column.
genre_columns = data.apply(lambda row: pd.Series(row.genres), axis=1)
s = genre_columns.stack().reset_index(level=1, drop=True)
s = s.rename('genre')
gen_data = data.drop(columns='genres').join(s)
######### Recommendation by genre
def build_chart(genre, percentile=0.85, source=None):
    """Return up to 250 titles of one genre, ranked by weighted rating.

    Parameters
    ----------
    genre : value compared for equality against the 'genre' column.
    percentile : quantile of the genre's vote counts used as the minimum
        number of votes (`m`) a title needs to be listed.
    source : optional DataFrame with the columns 'primaryTitle',
        'originalTitle', 'startYear', 'numVotes', 'averageRating' and
        'genre'.  Defaults to the module-level `gen_data`.

    Returns
    -------
    DataFrame sorted by the recomputed 'averageRating' in descending order.
    It is empty when no row matches `genre`.
    """
    if source is None:
        source = gen_data

    df = source[source['genre'] == genre]
    vote_counts = df.loc[df['numVotes'].notnull(), 'numVotes'].astype('int')
    rating_values = df.loc[df['averageRating'].notnull(), 'averageRating'].astype('float')
    C = rating_values.mean()
    m = vote_counts.quantile(percentile)

    keep = (df['numVotes'] >= m) & df['numVotes'].notnull() & df['averageRating'].notnull()
    columns = ['primaryTitle', 'originalTitle', 'startYear', 'numVotes', 'averageRating', 'genre']
    # Copy so the column assignments below modify an independent frame rather
    # than a slice of `source`.
    chart = df.loc[keep, columns].copy()
    chart['numVotes'] = chart['numVotes'].astype('int')
    chart['averageRating'] = chart['averageRating'].astype('float')

    # Vectorized weighted rating.  Unlike a row-wise apply, this also works
    # when `chart` has no rows (e.g. an unknown genre).
    v = chart['numVotes']
    R = chart['averageRating']
    chart['averageRating'] = (v / (v + m) * R) + (m / (m + v) * C)

    return chart.sort_values('averageRating', ascending=False).head(250)
build_chart('Comedy').head(15)


def _genres_to_text(genres):
    """Join a list of genre names into one space-separated string."""
    if isinstance(genres, (list, tuple)):
        return ' '.join(str(g) for g in genres)
    if isinstance(genres, str):
        return genres
    # Anything else (e.g. a missing value) contributes no terms.
    return ''


# min_df=1 keeps every term that occurs in at least one document, which is
# what min_df=0 amounted to; recent scikit-learn releases reject an integer
# min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')

# TfidfVectorizer expects one string per document.  data['genres'] holds
# lists, which is what made fit_transform fail on this line, so each list is
# joined into a single string first.
genre_docs = data['genres'].apply(_genres_to_text)
tfidf_matrix = tf.fit_transform(genre_docs)