e-techbytes: 【PYTHON】Cleaning Data, Filtering DataFrames

Friday, June 11, 2021

【PYTHON】Cleaning Data, Filtering DataFrames

pip install pygments

!pygmentize -l text itunes_data.csv

import pandas as pd

csv_df = pd.read_csv('itunes_data.csv')

csv_df.head()

excel_df = pd.read_excel('itunes_data.xlsx', engine='openpyxl')

excel_df.head()

from sqlalchemy import create_engine

engine = create_engine('sqlite:///chinook.db')

query = """SELECT tracks.name as Track, tracks.composer, tracks.milliseconds,

tracks.bytes, tracks.unitprice,

genres.name as Genre,

albums.title as Album,

artists.name as Artist

FROM tracks

JOIN genres ON tracks.genreid = genres.genreid

JOIN albums ON tracks.albumid = albums.albumid

JOIN artists ON albums.artistid = artists.artistid;

"""

with engine.connect() as connection:

sql_df = pd.read_sql_query(query, connection)

sql_df.head(2).T

# create dataframe from lists

df = pd.DataFrame(data={'seconds': [1, 2, 3, 4], 'intensity': [12, 11, 12, 14]})

df.head()

sql_df.index

sql_df.columns

type(sql_df)

itunes_df = pd.concat([csv_df, excel_df, sql_df], axis=0)

itunes_df.head()

itunes_df.tail()

print(itunes_df.iloc[0])

print(itunes_df.iloc[-1])

itunes_df.iloc[0, 0]

itunes_df.iloc[-1, -1]

itunes_df.loc[3502]

test_df = itunes_df.copy()

test_df = test_df.append(itunes_df.loc[3502])

test_df.loc[3502]

import matplotlib.pyplot as plt

itunes_df['Milliseconds'].hist(bins=30)

plt.show()

# this cell is for saving the image; more on this is chapter 5

f = plt.figure(figsize=(5.5, 5.5)) # this changes the size of the image -- more on this is chapter 5

f.patch.set_facecolor('w') # sets background color behind axis labels

itunes_df['Milliseconds'].hist(bins=30)

plt.tight_layout() # auto-adjust margins

plt.savefig('B17030_04_06.png', dpi=300)

# figsize increases the size of the image -- more on this is chapter 5

itunes_df.plot.scatter(x='Milliseconds', y='Bytes', figsize=(8, 8))

plt.show()

# saving the image -- more on this in chapter 5

f = plt.figure()

itunes_df.plot.scatter(x='Milliseconds', y='Bytes', figsize=(5.5, 5.5))

f.patch.set_facecolor('w') # sets background color behind axis labels

plt.tight_layout() # auto-adjust margins

plt.savefig('B17030_04_07.png', dpi=300)

itunes_df.iloc[-1, -1]

itunes_df.loc[3502]

itunes_df[itunes_df['Milliseconds'] > 4e6]

print(itunes_df[itunes_df['Milliseconds'] > 4e6])

print(itunes_df[itunes_df['Milliseconds'] > 4e6][['Genre', 'Artist']])

itunes_df['Milliseconds'] > 4e6

itunes_df[itunes_df['Milliseconds'] > 2e6]['Genre'].value_counts()

itunes_df[(itunes_df['Milliseconds'] > 2e6) & (itunes_df['Bytes'] < 0.4e9)]['Genre'].value_counts()

itunes_df[itunes_df['Genre'] != 'TV Shows']['Genre'].value_counts()

itunes_df[~(itunes_df['Genre'] == 'TV Shows')]['Genre'].value_counts()

itunes_df[itunes_df['Genre'].str.contains('TV')]['Genre'].value_counts()