Friday, June 11, 2021

【PYTHON】Cleaning Data, Filtering DataFrames

 pip install pygments


!pygmentize -l text itunes_data.csv


import pandas as pd


csv_df = pd.read_csv('itunes_data.csv')

csv_df.head()


excel_df = pd.read_excel('itunes_data.xlsx', engine='openpyxl')

excel_df.head()


from sqlalchemy import create_engine

engine = create_engine('sqlite:///chinook.db')


query = """SELECT tracks.name as Track, tracks.composer, tracks.milliseconds,

tracks.bytes, tracks.unitprice,

genres.name as Genre,

albums.title as Album,

artists.name as Artist

FROM tracks

JOIN genres ON tracks.genreid = genres.genreid

JOIN albums ON tracks.albumid = albums.albumid

JOIN artists ON albums.artistid = artists.artistid;

"""


with engine.connect() as connection:

  sql_df = pd.read_sql_query(query, connection)


sql_df.head(2).T


# create dataframe from lists

df = pd.DataFrame(data={'seconds': [1, 2, 3, 4], 'intensity': [12, 11, 12, 14]})

df.head()


sql_df.index


sql_df.columns


type(sql_df)


itunes_df = pd.concat([csv_df, excel_df, sql_df], axis=0)

itunes_df.head()

itunes_df.tail()


print(itunes_df.iloc[0])

print(itunes_df.iloc[-1])


itunes_df.iloc[0, 0]


itunes_df.iloc[-1, -1]


itunes_df.loc[3502]


test_df = itunes_df.copy()

test_df = test_df.append(itunes_df.loc[3502])

test_df.loc[3502]


import matplotlib.pyplot as plt


itunes_df['Milliseconds'].hist(bins=30)

plt.show()


# this cell is for saving the image; more on this is chapter 5

f = plt.figure(figsize=(5.5, 5.5))  # this changes the size of the image -- more on this is chapter 5

f.patch.set_facecolor('w')  # sets background color behind axis labels

itunes_df['Milliseconds'].hist(bins=30)

plt.tight_layout()  # auto-adjust margins

plt.savefig('B17030_04_06.png', dpi=300)


# figsize increases the size of the image -- more on this is chapter 5

itunes_df.plot.scatter(x='Milliseconds', y='Bytes', figsize=(8, 8))

plt.show()


# saving the image -- more on this in chapter 5

f = plt.figure()

itunes_df.plot.scatter(x='Milliseconds', y='Bytes', figsize=(5.5, 5.5))

f.patch.set_facecolor('w')  # sets background color behind axis labels

plt.tight_layout()  # auto-adjust margins

plt.savefig('B17030_04_07.png', dpi=300)


itunes_df.iloc[-1, -1]


itunes_df.loc[3502]


itunes_df[itunes_df['Milliseconds'] > 4e6]


print(itunes_df[itunes_df['Milliseconds'] > 4e6])


print(itunes_df[itunes_df['Milliseconds'] > 4e6][['Genre', 'Artist']])


itunes_df['Milliseconds'] > 4e6


itunes_df[itunes_df['Milliseconds'] > 2e6]['Genre'].value_counts()


itunes_df[(itunes_df['Milliseconds'] > 2e6) & (itunes_df['Bytes'] < 0.4e9)]['Genre'].value_counts()


itunes_df[itunes_df['Genre'] != 'TV Shows']['Genre'].value_counts()


itunes_df[~(itunes_df['Genre'] == 'TV Shows')]['Genre'].value_counts()


itunes_df[itunes_df['Genre'].str.contains('TV')]['Genre'].value_counts()


itunes_df_copy = itunes_df.copy()  # make a backup for testing dropping a column

itunes_df_copy.drop('Composer', axis=1, inplace=True)

itunes_df_copy.columns


only_music = itunes_df[~itunes_df['Genre'].isin(['Drama', 'TV Shows', 'Sci Fi & Fantasy', 'Science Fiction', 'Comedy'])]


#@title Default title text

itunes_df[itunes_df['Composer'].isna()].head()


itunes_df[itunes_df['Composer'].isna()].head()


itunes_df_copy = itunes_df.copy()  # make a backup for testing dropping a column

# print shapes to check that rows were dropped

print(itunes_df_copy.shape)

itunes_df_copy.dropna(inplace=True)

print(itunes_df_copy.shape)


itunes_df_copy = itunes_df.copy()  # make a backup for testing dropping a column

itunes_df_copy.loc[itunes_df['Composer'].isna(), 'Composer'] = 'Unknown'

itunes_df_copy.head()


itunes_df_copy = itunes_df.copy()  # make a backup for testing dropping a column

itunes_df_copy['Composer'].fillna('Unknown', inplace=True)

itunes_df_copy.head()


itunes_df['UnitPrice'].value_counts().iloc[0]/itunes_df.shape[0]


itunes_df_copy = itunes_df.copy()  # make a backup for testing dropping a column

itunes_df_copy['UnitPrice'].fillna(itunes_df_copy['UnitPrice'].mode(), inplace=True)

itunes_df_copy.head()


# create distributions for plotting

from scipy.stats import skewnorm

df = pd.DataFrame({'normal': skewnorm.rvs(0, size=10000), 'skewed': skewnorm.rvs(10, size=10000)})


df.plot.hist(bins=30, alpha=0.5, figsize=(5.5, 5.5))

plt.vlines(df['skewed'].mean(), ymin=0, ymax=2000, label='mean', color='r')

plt.vlines(df['skewed'].median(), ymin=0, ymax=2000, label='median', color='b')

#f.patch.set_facecolor('w')  # sets background color behind axis labels

plt.tight_layout()  # auto-adjust margins

plt.savefig('B17030_04_09.png', dpi=300)


# there are multiple values with the index value of 0, which is why we get multiple results

itunes_df.loc[0, 'Bytes']


from sklearn.impute import KNNImputer

imputer = KNNImputer()

imputed = imputer.fit_transform(itunes_df_copy[['Milliseconds', 'Bytes', 'UnitPrice']])

itunes_df_copy['Bytes'] = imputed[:, 1]


itunes_df_copy['Bytes'] = imputed[:, 1]


itunes_df_copy.loc[0, 'Bytes']

No comments:

Post a Comment