Pandas
Data Analysis using Pandas II (Plot representation with Matplotlib)
In [ ]:
## Advanced analysis using pandas
# Example: US States Data
# Rank US states and territories by their 2010 population density
# source: Python Data Science Handbook (by Jake VanderPlas)
In [3]:
import pandas as pd
df1 = pd.read_csv('./state-abbrevs.csv')
df2 = pd.read_csv('./state-population.csv')
df3 = pd.read_csv('./state-areas.csv')
In [12]:
df1.info()
df1.head()
Out[12]:
In [11]:
df2.info()
df2.head()
Out[11]:
In [14]:
df3.info()
df3.head()
Out[14]:
In [ ]:
#Check for null data in the dataframes with df.info(): there is no missing data.
#The abbreviation table (df1) has 51 entries, the area table (df3) has 52, and the
#population table (df2) has 2544 entries.
In [ ]:
#Combining data: pd.merge(df1, df2, options)
In [ ]:
# merge 'how' options: 'left' (keys from the left frame), 'right' (keys from the right frame), 'outer' (union of all keys), 'inner' (intersection of keys) -- illustrated in the next cell
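In [ ]:
# Small illustration (hypothetical frames, not the states data) of how the 'how' option
# changes the result of pd.merge; only a sketch of the behaviour described above.
left = pd.DataFrame({'key': ['A', 'B'], 'x': [1, 2]})
right = pd.DataFrame({'key': ['B', 'C'], 'y': [3, 4]})
pd.merge(left, right, on='key', how='inner')  # only key 'B' (intersection)
pd.merge(left, right, on='key', how='left')   # keys 'A', 'B' (from the left frame), y is NaN for 'A'
pd.merge(left, right, on='key', how='outer')  # keys 'A', 'B', 'C' (union), NaN where missing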
In [34]:
df = pd.merge(df1, df3, how='outer')
In [71]:
df1.head()
df2.head()
Out[71]:
In [76]:
df2.head()
Out[76]:
In [77]:
df.head()
Out[77]:
In [104]:
#Merge dataframes whose key columns have different names: pd.merge(left, right, left_on=column1, right_on=column2, options)
#how options: right, left, inner, outer
dft = pd.merge(df, df2, left_on='abbreviation', right_on='state/region', how='outer')
#How to delete a column: df.drop(label, axis=1), where axis is {0 or 'index', 1 or 'columns'}
dft = dft.drop('state/region', axis=1) #drop the duplicated key column
In [88]:
dft.head()
Out[88]:
In [101]:
dft.columns
Out[101]:
In [106]:
#After this merge dft has missing data: list the columns that contain nulls
dft.isnull().any()
Out[106]:
In [108]:
null_datat = dft[dft.isnull().any(axis=1)] #print null rows
null_datat.head()
Out[108]:
In [109]:
df['abbreviation'].unique()
Out[109]:
In [ ]:
df.loc[df['state']=='Puerto Rico'].fillna({'abbreviation': 'PR'})
df[df.isnull().any(axis=1)]
In [ ]:
#There is no abbreviation for Puerto Rico: add the missing value
# Dealing with missing data: dropna() and fillna() (a small illustration of both follows)
#df_new = df.loc[df['state']=='Puerto Rico'].fillna({'abbreviation': 'PR'})
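In [ ]:
# Small illustration (hypothetical frame, not the states data) of the two strategies:
# dropna() removes rows that contain NaN, fillna() replaces NaN with chosen values
# (a dict maps column names to fill values, as used for the abbreviation below).
import numpy as np
tmp = pd.DataFrame({'state': ['X', 'Y'], 'abbreviation': ['XX', np.nan]})
tmp.dropna()                        # keeps only the complete row
tmp.fillna({'abbreviation': 'YY'})  # fills the missing abbreviation only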
In [113]:
dft.loc[dft['state']=='Puerto Rico'].fillna({'abbreviation': 'PR'}) #returns a filled copy; dft itself is not modified
len(dft[dft.isnull().any(axis=1)])
Out[113]:
In [171]:
null_data2 = dft[dft.isnull().any(axis=1)] #print null rows
null_data2.head()
null_data2.abbreviation.unique()
Out[171]:
In [165]:
dfnew = dft[dft['state']=='Puerto Rico'] #new dataframe with only the Puerto Rico rows
dfnew.head()
null_data = dfnew[dfnew.isnull().any(axis=1)] #print null rows
null_data.head()
Out[165]:
In [168]:
len(null_data.population.unique())
null_data.year.unique() #there is no data for PR from 1990 to 1999
Out[168]:
In [ ]:
#Identify duplicate info
# DataFrame.duplicated(subset=None, keep='first')
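In [ ]:
# Small illustration (hypothetical frame): duplicated() flags repeated rows. keep='first'
# marks all but the first occurrence, keep=False marks every occurrence, and subset=
# restricts the comparison to the listed columns.
tmp = pd.DataFrame({'abbreviation': ['CA', 'CA', 'NY'], 'year': [2010, 2011, 2010]})
tmp.duplicated(subset=['abbreviation'], keep='first')  # False, True, False
tmp.duplicated(subset=['abbreviation'], keep=False)    # True, True, False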
In [28]:
df[df['abbreviation'].duplicated(keep='first')]
Out[28]:
In [ ]:
#Example: rank US states by their 2010 population density
In [180]:
data_2010=dft[dft['year']==2010]
data_2010.info()
null_2010=data_2010[data_2010.isnull().any(axis=1)]
null_2010.head()
#null_2010.abbreviation.unique()
Out[180]:
In [191]:
data_2010nonull = data_2010.dropna() #drop the rows that still contain NaN
data_2010_total = data_2010nonull[data_2010nonull['ages']=='total'].copy() #keep only the 'total' age group; .copy() avoids a SettingWithCopyWarning when the density column is added below
data_2010_total['population'].sum() #sanity check: total 2010 population
data_2010_total.head()
Out[191]:
In [204]:
data_2010_total['density'] = data_2010_total['population']/data_2010_total['area (sq. mi)']
data_2010_total.head()
Out[204]:
In [206]:
#order the dataframe by the density value
#DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
data_2010_total.sort_values('density', ascending=False).head()
Out[206]:
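In [ ]:
# A minimal plotting sketch (assumes matplotlib is installed), since the notebook title
# mentions Matplotlib: show the ten densest states/territories as a horizontal bar chart.
# The column names 'state' and 'density' are the ones built above.
import matplotlib.pyplot as plt
top10 = data_2010_total.sort_values('density', ascending=False).head(10)
plt.barh(top10['state'], top10['density'])
plt.xlabel('population density (people per sq. mi)')
plt.title('Top 10 US states/territories by 2010 population density')
plt.gca().invert_yaxis()  # densest at the top
plt.show()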