Saturday, June 12, 2021

【PYTHON】Data Transformations

# Track length in seconds, derived from the 'Milliseconds' column.
itunes_df['Seconds'] = itunes_df['Milliseconds'] / 1000


# Ratio of track length (milliseconds) to file size (bytes).
itunes_df['len_byte_ratio'] = itunes_df['Milliseconds'] / itunes_df['Bytes']


# Map variant genre spellings onto the single label 'Metal'.
genre_dict = {'metal': 'Metal', 'met': 'Metal'}

# The replaced Series is only returned (displayed in a notebook); it is not
# assigned back, so itunes_df['Genre'] is left unchanged by this line.
itunes_df['Genre'].replace(genre_dict)


# Lowercase every genre by calling the lambda on each element; again the
# result is returned rather than stored.
itunes_df['Genre'].apply(lambda x: x.lower())


# the above is the same as this

def lowercase(x):
    """Return ``x`` converted to lowercase via its ``lower()`` method."""
    lowered = x.lower()
    return lowered


# Passing the named function to apply() gives the same result as the lambda form.
itunes_df['Genre'].apply(lowercase)


# but using pandas' built-in .str methods is almost always faster than apply()

itunes_df['Genre'].str.lower()


# this is a common sentiment analysis library; polarity is positive/negative sentiment,

# subjectivity is the subjective/objective rating.

from textblob import TextBlob

# Wrap a sample sentence in a TextBlob so its sentiment scores can be inspected.
test = TextBlob("Textblob is amazingly simple to use. What great fun!")

# Full sentiment result for the sample text (displayed when run in a notebook).
test.sentiment


# Only the polarity component of that sentiment result.
test.sentiment.polarity


# A list comprehension is a better choice than apply() for getting the sentiment polarity of each track name:

itunes_df['Track_sentiment'] = [TextBlob(x).sentiment.polarity for x in itunes_df['Track']]


# but, if we wanted to mix polarity and subjectivity into one column, it would be best to use apply:

def pol_sub_mix(x):
    """Return the product of a text's polarity and subjectivity scores.

    ``x`` is the text to analyse. It is wrapped in a ``TextBlob`` and the
    blob's ``polarity`` and ``subjectivity`` attributes are multiplied.
    """
    blob = TextBlob(x)
    polarity = blob.polarity
    subjectivity = blob.subjectivity
    return polarity * subjectivity


# Compute the combined polarity * subjectivity score for each track name and
# store it in a new column.
itunes_df['Track_pol_sub_mix'] = itunes_df['Track'].apply(pol_sub_mix)


# Delete the two demonstration columns again. inplace=True modifies itunes_df
# directly, and axis=1 means the labels refer to columns rather than rows.

itunes_df.drop(['Track_pol_sub_mix', 'Track_sentiment'], inplace=True, axis=1)


# NOTE: at the time of writing, swifter did not work with Python 3.9.

# swifter is imported for the .swifter accessor used on the next statement.
import swifter

# Same lowercase transformation as before, routed through swifter's apply().
itunes_df['Genre'].swifter.apply(lambda x: x.lower())


# Write the DataFrame to CSV; index=False leaves the row index out of the file.
itunes_df.to_csv('cleaned_itunes_data.csv', index=False)


# Five genres with the shortest average track length, in seconds.
# 'Seconds' is selected before aggregating: calling mean() on the whole grouped
# frame also tries to average the text columns (e.g. 'Track'), which raises a
# TypeError on pandas >= 2.0 and does needless work on older versions.
itunes_df.groupby('Genre')['Seconds'].mean().sort_values().head()

No comments:

Post a Comment