e-techbytes: 【PYTHON】Data Transformations

Saturday, June 12, 2021

# Track length in seconds, derived from the millisecond duration.
itunes_df['Seconds'] = itunes_df['Milliseconds'].div(1000)

# Milliseconds of audio per byte of file size.
itunes_df['len_byte_ratio'] = itunes_df['Milliseconds'].div(itunes_df['Bytes'])

# Lookup table for genre spellings: both keys map to 'Metal'.
genre_dict = {'metal': 'Metal', 'met': 'Metal'}

# Replace Genre values that match a key of genre_dict with the mapped value.
# The result is only evaluated here, not assigned back to itunes_df.
itunes_df['Genre'].replace(genre_dict)

# Lowercase each genre by calling .lower() on every element through apply.
# As above, the result is evaluated but not stored.
itunes_df['Genre'].apply(lambda x: x.lower())

# the above is the same as this

def lowercase(x):
    """Return a lowercase copy of ``x``.

    ``x`` must provide a ``lower()`` method (for example a ``str``); the
    function simply delegates to it, which makes it usable as the
    element-wise callable passed to ``Series.apply``.
    """
    return x.lower()

# Lowercase each genre by passing the named function to apply.
# The result is evaluated but not assigned.
itunes_df['Genre'].apply(lowercase)

# but using built-in functions is almost always faster

# Lowercase through the Series .str accessor instead of a per-element apply.
itunes_df['Genre'].str.lower()

# this is a common sentiment analysis library; polarity is positive/negative sentiment,

# subjectivity is subjective/objective rating.

from textblob import TextBlob

# Build a TextBlob from a sample sentence to try out the sentiment attributes.
test = TextBlob("Textblob is amazingly simple to use. What great fun!")

# Evaluate the blob's sentiment attribute (result is not stored).
test.sentiment

# Evaluate only the polarity component of that sentiment.
test.sentiment.polarity

# For the polarity of each track name, a list comprehension is preferable to apply:
itunes_df['Track_sentiment'] = [
    TextBlob(track_name).sentiment.polarity
    for track_name in itunes_df['Track']
]

# but, if we wanted to mix polarity and subjectivity into one column, it would be best to use apply:

def pol_sub_mix(x):
    """Return the polarity of the text ``x`` multiplied by its subjectivity.

    ``x`` is wrapped in a ``TextBlob``; its sentiment is read once and both
    components (``polarity`` and ``subjectivity``) are taken from that single
    result before being multiplied together.
    """
    sentiment = TextBlob(x).sentiment
    return sentiment.polarity * sentiment.subjectivity

# Store the combined polarity/subjectivity score of every track name.
itunes_df['Track_pol_sub_mix'] = itunes_df['Track'].apply(pol_sub_mix)

# Remove both sentiment columns again, modifying itunes_df in place.
itunes_df.drop(columns=['Track_pol_sub_mix', 'Track_sentiment'], inplace=True)

# currently doesn't work with python 3.9

# NOTE(review): presumably this import is what makes the `.swifter` accessor
# used below available on the Series -- confirm before moving or removing it.
import swifter

# Lowercase each genre with the same lambda, routed through swifter's apply.
# The result is evaluated but not assigned.
itunes_df['Genre'].swifter.apply(lambda x: x.lower())

# Write the cleaned data to CSV; index=False is passed so the index is not written.
itunes_df.to_csv('cleaned_itunes_data.csv', index=False)

# Mean track length in seconds per genre, sorted ascending, first rows only.
# 'Seconds' is selected *before* aggregating so that only that column is
# averaged: calling .mean() on the whole grouped frame would also try to
# average text columns such as 'Track', which newer pandas versions reject.
itunes_df.groupby('Genre')['Seconds'].mean().sort_values().head()

e-techbytes