pip install pygments
!pygmentize -l text itunes_data.csv
import pandas as pd
csv_df = pd.read_csv('itunes_data.csv')
csv_df.head()
excel_df = pd.read_excel('itunes_data.xlsx', engine='openpyxl')
excel_df.head()
from sqlalchemy import create_engine
engine = create_engine('sqlite:///chinook.db')
query = """SELECT tracks.name as Track, tracks.composer, tracks.milliseconds,
tracks.bytes, tracks.unitprice,
genres.name as Genre,
albums.title as Album,
artists.name as Artist
FROM tracks
JOIN genres ON tracks.genreid = genres.genreid
JOIN albums ON tracks.albumid = albums.albumid
JOIN artists ON albums.artistid = artists.artistid;
"""
with engine.connect() as connection:
sql_df = pd.read_sql_query(query, connection)
sql_df.head(2).T
# create dataframe from lists
df = pd.DataFrame(data={'seconds': [1, 2, 3, 4], 'intensity': [12, 11, 12, 14]})
df.head()
sql_df.index
sql_df.columns
type(sql_df)
itunes_df = pd.concat([csv_df, excel_df, sql_df], axis=0)
itunes_df.head()
itunes_df.tail()
print(itunes_df.iloc[0])
print(itunes_df.iloc[-1])
itunes_df.iloc[0, 0]
itunes_df.iloc[-1, -1]
itunes_df.loc[3502]
test_df = itunes_df.copy()
test_df = test_df.append(itunes_df.loc[3502])
test_df.loc[3502]
import matplotlib.pyplot as plt
itunes_df['Milliseconds'].hist(bins=30)
plt.show()
# this cell is for saving the image; more on this is chapter 5
f = plt.figure(figsize=(5.5, 5.5)) # this changes the size of the image -- more on this is chapter 5
f.patch.set_facecolor('w') # sets background color behind axis labels
itunes_df['Milliseconds'].hist(bins=30)
plt.tight_layout() # auto-adjust margins
plt.savefig('B17030_04_06.png', dpi=300)
# figsize increases the size of the image -- more on this is chapter 5
itunes_df.plot.scatter(x='Milliseconds', y='Bytes', figsize=(8, 8))
plt.show()
# saving the image -- more on this in chapter 5
f = plt.figure()
itunes_df.plot.scatter(x='Milliseconds', y='Bytes', figsize=(5.5, 5.5))
f.patch.set_facecolor('w') # sets background color behind axis labels
plt.tight_layout() # auto-adjust margins
plt.savefig('B17030_04_07.png', dpi=300)
itunes_df.iloc[-1, -1]
itunes_df.loc[3502]
itunes_df[itunes_df['Milliseconds'] > 4e6]
print(itunes_df[itunes_df['Milliseconds'] > 4e6])
print(itunes_df[itunes_df['Milliseconds'] > 4e6][['Genre', 'Artist']])
itunes_df['Milliseconds'] > 4e6
itunes_df[itunes_df['Milliseconds'] > 2e6]['Genre'].value_counts()
itunes_df[(itunes_df['Milliseconds'] > 2e6) & (itunes_df['Bytes'] < 0.4e9)]['Genre'].value_counts()
itunes_df[itunes_df['Genre'] != 'TV Shows']['Genre'].value_counts()
itunes_df[~(itunes_df['Genre'] == 'TV Shows')]['Genre'].value_counts()
itunes_df[itunes_df['Genre'].str.contains('TV')]['Genre'].value_counts()
itunes_df_copy = itunes_df.copy() # make a backup for testing dropping a column
itunes_df_copy.drop('Composer', axis=1, inplace=True)
itunes_df_copy.columns
only_music = itunes_df[~itunes_df['Genre'].isin(['Drama', 'TV Shows', 'Sci Fi & Fantasy', 'Science Fiction', 'Comedy'])]
#@title Default title text
itunes_df[itunes_df['Composer'].isna()].head()
itunes_df[itunes_df['Composer'].isna()].head()
itunes_df_copy = itunes_df.copy() # make a backup for testing dropping a column
# print shapes to check that rows were dropped
print(itunes_df_copy.shape)
itunes_df_copy.dropna(inplace=True)
print(itunes_df_copy.shape)
itunes_df_copy = itunes_df.copy() # make a backup for testing dropping a column
itunes_df_copy.loc[itunes_df['Composer'].isna(), 'Composer'] = 'Unknown'
itunes_df_copy.head()
itunes_df_copy = itunes_df.copy() # make a backup for testing dropping a column
itunes_df_copy['Composer'].fillna('Unknown', inplace=True)
itunes_df_copy.head()
itunes_df['UnitPrice'].value_counts().iloc[0]/itunes_df.shape[0]
itunes_df_copy = itunes_df.copy() # make a backup for testing dropping a column
itunes_df_copy['UnitPrice'].fillna(itunes_df_copy['UnitPrice'].mode(), inplace=True)
itunes_df_copy.head()
# create distributions for plotting
from scipy.stats import skewnorm
df = pd.DataFrame({'normal': skewnorm.rvs(0, size=10000), 'skewed': skewnorm.rvs(10, size=10000)})
df.plot.hist(bins=30, alpha=0.5, figsize=(5.5, 5.5))
plt.vlines(df['skewed'].mean(), ymin=0, ymax=2000, label='mean', color='r')
plt.vlines(df['skewed'].median(), ymin=0, ymax=2000, label='median', color='b')
#f.patch.set_facecolor('w') # sets background color behind axis labels
plt.tight_layout() # auto-adjust margins
plt.savefig('B17030_04_09.png', dpi=300)
# there are multiple values with the index value of 0, which is why we get multiple results
itunes_df.loc[0, 'Bytes']
from sklearn.impute import KNNImputer
imputer = KNNImputer()
imputed = imputer.fit_transform(itunes_df_copy[['Milliseconds', 'Bytes', 'UnitPrice']])
itunes_df_copy['Bytes'] = imputed[:, 1]
itunes_df_copy['Bytes'] = imputed[:, 1]
itunes_df_copy.loc[0, 'Bytes']
No comments:
Post a Comment