How to solve AttributeError: 'list' object has no attribute 'lower'?

0

I am trying to transform a column of the DataFrame data, which is the result of merging two DataFrames. The column in question is data['genres'].

The code:

import pandas as pd

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

# Load the title metadata and the ratings table (both tab-separated).
tdata = pd.read_csv('../dB/title.basics.csv', encoding="utf-8", delimiter='\t', skip_blank_lines=True)
ratings = pd.read_csv('../dB/data.csv', encoding="utf-8", delimiter='\t', skip_blank_lines=True)

# Drop two columns that the rest of this script never reads.
tdata.drop(['isAdult', 'endYear'], inplace=True, axis=1)

# No join key is given, so pandas merges on the columns both frames share.
data = pd.merge(ratings, tdata)

# Turn the comma-separated genres string into a list of genre names.
# Missing values become an empty list. The check has to be pd.notna():
# `x != np.nan` is True for every x (NaN never compares equal, not even to
# itself), and pre-filling with the string '[]' produced the bogus
# one-element genre list ['[]'] for titles without genres.
data['genres'] = data['genres'].apply(
    lambda x: str(x).split(',') if pd.notna(x) else []
)

# Non-null vote counts and average ratings taken from the full ratings table.
numsVotes = ratings[ratings['numVotes'].notnull()]['numVotes'].astype('int64')  # counted votes
averageRatings = ratings[ratings['averageRating'].notnull()]['averageRating'].astype('double')  # average vote

# C is the mean rating over all rated titles; m is the vote count at the
# 95th percentile. The bare names below only echo the value when the script
# is run cell by cell (notebook / REPL); they have no other effect.
C = averageRatings.mean()
C
m = numsVotes.quantile(0.95)
m

# Keep titles that have at least m votes and a rating, and narrow the frame
# to the columns used from here on.
data = data[(data['numVotes'] >= m) & (data['numVotes'].notnull()) & (data['averageRating'].notnull())][['primaryTitle', 'originalTitle', 'startYear', 'genres', 'numVotes', 'averageRating']]
data.shape

### Function that computes the weighted rating of each movie
def weighted_rating(x):
    """Return the weighted rating for one row.

    Blends the row's own 'averageRating' with the module-level mean rating
    ``C``. The row's share is numVotes / (numVotes + m) and the mean's share
    is m / (numVotes + m), where ``m`` is the module-level vote threshold.

    Args:
        x: Row-like object indexable by 'numVotes' and 'averageRating'.

    Returns:
        The blended rating.
    """
    votes = x['numVotes']
    rating = x['averageRating']
    own_share = votes / (votes + m)
    mean_share = m / (m + votes)
    return (own_share * rating) + (mean_share * C)

# Overwrite each raw average with its weighted rating, then keep the 250
# highest-scoring titles.
data['averageRating'] = data.apply(weighted_rating, axis=1)
data = data.sort_values('averageRating', ascending=False).head(250)
data.head(25)

# Build one row per (title, genre): spread every genres list across columns,
# stack those columns into a single Series, and drop the inner index level so
# the Series lines up with the original row index for the join.
s = data.apply(lambda row: pd.Series(row.genres), axis=1)
s = s.stack().reset_index(level=1, drop=True).rename('genre')
gen_data = data.drop('genres', axis=1).join(s)

######### Recommendation by genre
def build_chart(genre, percentile=0.85, frame=None):
    """Build the weighted-rating chart for a single genre.

    Args:
        genre: Value matched exactly against the 'genre' column.
        percentile: Quantile of the genre's vote counts that a title must
            reach to be listed.
        frame: Table to rank. It must have the columns 'primaryTitle',
            'originalTitle', 'startYear', 'numVotes', 'averageRating' and
            'genre'. Defaults to the module-level ``gen_data``.

    Returns:
        A new DataFrame with at most 250 qualifying titles, sorted by
        weighted rating in descending order. Its 'averageRating' column
        holds the weighted score. The frame is empty when no title of the
        genre qualifies.
    """
    source = gen_data if frame is None else frame
    df = source[source['genre'] == genre]

    vote_counts = df['numVotes'].dropna().astype('int')
    rating_values = df['averageRating'].dropna().astype('float')
    mean_rating = rating_values.mean()
    min_votes = vote_counts.quantile(percentile)

    columns = ['primaryTitle', 'originalTitle', 'startYear', 'numVotes', 'averageRating', 'genre']
    # A missing vote count already fails the >= comparison, so only the
    # rating needs an explicit null check.
    qualifies = (df['numVotes'] >= min_votes) & df['averageRating'].notnull()
    # Copy the selection so the assignments below never write into a slice
    # of the source table (this also avoids SettingWithCopyWarning).
    chart = df.loc[qualifies, columns].copy()
    chart['numVotes'] = chart['numVotes'].astype('int')
    chart['averageRating'] = chart['averageRating'].astype('float')

    # Column arithmetic rather than a row-wise apply: the formula is the
    # same, but it still works when the selection is empty. An axis=1 apply
    # on an empty frame hands back a frame, which cannot be assigned to a
    # single column.
    votes = chart['numVotes']
    chart['averageRating'] = (
        (votes / (votes + min_votes) * chart['averageRating'])
        + (min_votes / (min_votes + votes) * mean_rating)
    )

    return chart.sort_values('averageRating', ascending=False).head(250)

# Show the first 15 rows of the chart built for the 'Comedy' genre.
build_chart('Comedy').head(15)

# Fit a TF-IDF model (unigrams and bigrams, English stop words) on the genres.
# NOTE(review): data['genres'] was split into Python lists earlier in this
# script, while the 'word' analyzer presumably lowercases each document as a
# string. That would explain "'list' object has no attribute 'lower'".
# Joining each list back into a single string before fitting
# (e.g. data['genres'].str.join(' ')) should avoid it; verify against the
# scikit-learn documentation.
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(data['genres']) # This is where the error is raised!
    
asked by Bruno Soto 08.06.2018 at 14:37
source

0 answers