linkedin sandra-acebes mail google github
abrir

Pandas

Data Analysis using Pandas II (Plot representation with Matplotlib)

In [ ]:
## Advanced analysis using pandas
# Example: US States Data
# Rank US states and territories by their 2010 population density
# source: Python DataScience Handbook (by Jake VanderPlas)
In [3]:
import pandas as pd
df1 = pd.read_csv('./state-abbrevs.csv')
df2= pd.read_csv('./state-population.csv')
df3=pd.read_csv('./state-areas.csv')
In [12]:
df1.info()
df1.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
state           51 non-null object
abbreviation    51 non-null object
dtypes: object(2)
memory usage: 888.0+ bytes
Out[12]:
state abbreviation
0 Alabama AL
1 Alaska AK
2 Arizona AZ
3 Arkansas AR
4 California CA
In [11]:
df2.info()
df2.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2544 entries, 0 to 2543
Data columns (total 4 columns):
state/region    2544 non-null object
ages            2544 non-null object
year            2544 non-null int64
population      2524 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 79.6+ KB
Out[11]:
state/region ages year population
0 AL under18 2012 1117489.0
1 AL total 2012 4817528.0
2 AL under18 2010 1130966.0
3 AL total 2010 4785570.0
4 AL under18 2011 1125763.0
In [14]:
df3.info()
df3.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 2 columns):
state            52 non-null object
area (sq. mi)    52 non-null int64
dtypes: int64(1), object(1)
memory usage: 904.0+ bytes
Out[14]:
state area (sq. mi)
0 Alabama 52423
1 Alaska 656425
2 Arizona 114006
3 Arkansas 53182
4 California 163707
In [ ]:
#check null data in dataframes. df.info() There is no missing data. One has 52, the other
#51 entries, the second dataframe contains 2544 entries; 
In [ ]:
#Combining data : pd.merge(df1,df2,options)
In [ ]:
# merge options : right(keys from right dataframe), left, outer(all keys), inner(intersection)
In [34]:
df =pd.merge(df1,df3, how='outer')
In [71]:
df1.head()
df2.head()
Out[71]:
state/region ages year population
0 AL under18 2012 1117489.0
1 AL total 2012 4817528.0
2 AL under18 2010 1130966.0
3 AL total 2010 4785570.0
4 AL under18 2011 1125763.0
In [76]:
df2.head()
Out[76]:
state/region ages year population
0 AL under18 2012 1117489.0
1 AL total 2012 4817528.0
2 AL under18 2010 1130966.0
3 AL total 2010 4785570.0
4 AL under18 2011 1125763.0
In [77]:
df.head()
Out[77]:
state abbreviation area (sq. mi)
0 Alabama AL 52423
1 Alaska AK 656425
2 Arizona AZ 114006
3 Arkansas AR 53182
4 California CA 163707
In [104]:
#Merge lines with different size (dataframe1,dataframe2, column1, column2, options)
#options: right, left, inner , outer
dft=pd.merge(df,df2,left_on='abbreviation',right_on='state/region',how='outer')

#How to delete a column ---- df.drop(column,1)  axis : {0 or ‘index’, 1 or ‘columns’}                                
dft=dft.drop('state/region',1)      #drop the duplicated column
In [88]:
dft.head()
Out[88]:
state abbreviation area (sq. mi) ages year population
0 Alabama AL 52423.0 under18 2012 1117489.0
1 Alabama AL 52423.0 total 2012 4817528.0
2 Alabama AL 52423.0 under18 2010 1130966.0
3 Alabama AL 52423.0 total 2010 4785570.0
4 Alabama AL 52423.0 under18 2011 1125763.0
In [101]:
dft.columns
Out[101]:
Index([u'state', u'abbreviation', u'area (sq. mi)', u'year', u'ages',
       u'population'],
      dtype='object')
In [106]:
#df now has one missing data! print null columns
dft.isnull().any()
Out[106]:
state             True
abbreviation      True
area (sq. mi)     True
ages             False
year             False
population        True
dtype: bool
In [108]:
null_datat = dft[dft.isnull().any(axis=1)]   #print null rows
null_datat.head()
Out[108]:
state abbreviation area (sq. mi) ages year population
2448 Puerto Rico PR 3515.0 under18 1990 NaN
2449 Puerto Rico PR 3515.0 total 1990 NaN
2450 Puerto Rico PR 3515.0 total 1991 NaN
2451 Puerto Rico PR 3515.0 under18 1991 NaN
2452 Puerto Rico PR 3515.0 total 1993 NaN
In [109]:
df['abbreviation'].unique()
Out[109]:
array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MT', 'NE',
       'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'MD',
       'MA', 'MI', 'MN', 'MS', 'MO', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'PR'], dtype=object)
In [ ]:
df.loc[df['state']=='Puerto Rico'].fillna('abbreviation'=='PR')
df[df.isnull().any(axis=1)]
In [ ]:
#There is no abbreviation for Puerto Rico: add missing valuen to df

# Dealing with missing data: dropna() and fillna()
#df_new=df.loc[df['state']=='Puerto Rico'].fillna('abbreviation'=='PR')
In [113]:
dft.loc[dft['state']=='Puerto Rico'].fillna('abbreviation'=='PR')
len(dft[dft.isnull().any(axis=1)])
Out[113]:
68
In [171]:
null_data2 = dft[dft.isnull().any(axis=1)]   #print null rows
null_data2.head()
null_data2.abbreviation.unique()
Out[171]:
array(['PR', nan], dtype=object)
In [165]:
dfnew=dft[dft['state']=='Puerto Rico']  #new dataframe only with puerto rico
dfnew.head()

null_data = dfnew[dfnew.isnull().any(axis=1)]   #print null rows
null_data.head()
Out[165]:
state abbreviation area (sq. mi) ages year population
2448 Puerto Rico PR 3515.0 under18 1990 NaN
2449 Puerto Rico PR 3515.0 total 1990 NaN
2450 Puerto Rico PR 3515.0 total 1991 NaN
2451 Puerto Rico PR 3515.0 under18 1991 NaN
2452 Puerto Rico PR 3515.0 total 1993 NaN
In [168]:
len(null_data.population.unique())
null_data.year.unique()  #there is no data for PR from 1990 to 1999
Out[168]:
array([1990, 1991, 1993, 1992, 1994, 1995, 1996, 1998, 1997, 1999])
In [ ]:
#Identify duplicate info 
# DataFrame.duplicated(subset=None, keep='first')[source]
In [28]:
df[df['abbreviation'].duplicated(keep='first')]
Out[28]:
state abbreviation area (sq. mi)
In [ ]:
#Exemple: rank US states by their 2010 population density
In [180]:
data_2010=dft[dft['year']==2010]
data_2010.info()
null_2010=data_2010[data_2010.isnull().any(axis=1)]


null_2010.head()
#null_2010.abbreviation.unique()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 2 to 2539
Data columns (total 6 columns):
state            104 non-null object
abbreviation     104 non-null object
area (sq. mi)    104 non-null float64
ages             106 non-null object
year             106 non-null int64
population       106 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 5.8+ KB
Out[180]:
state abbreviation area (sq. mi) ages year population
2538 NaN NaN NaN under18 2010 74119556.0
2539 NaN NaN NaN total 2010 309326295.0
In [191]:
data_2010nonull=data_2010.dropna()  #ignore NaN columns
data_2010_total=data_2010nonull[data_2010nonull['ages']=='total']
sum(data_2010_total['population'])  # 
data_2010_total.head()
Out[191]:
state abbreviation area (sq. mi) ages year population
3 Alabama AL 52423.0 total 2010 4785570.0
91 Alaska AK 656425.0 total 2010 713868.0
101 Arizona AZ 114006.0 total 2010 6408790.0
189 Arkansas AR 53182.0 total 2010 2922280.0
197 California CA 163707.0 total 2010 37333601.0
In [204]:
data_2010_total['density'] = data_2010_total['population']/data_2010_total['area (sq. mi)']
data_2010_total.head()
/Users/sandra/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
Out[204]:
state abbreviation area (sq. mi) ages year population density
3 Alabama AL 52423.0 total 2010 4785570.0 91.287603
91 Alaska AK 656425.0 total 2010 713868.0 1.087509
101 Arizona AZ 114006.0 total 2010 6408790.0 56.214497
189 Arkansas AR 53182.0 total 2010 2922280.0 54.948667
197 California CA 163707.0 total 2010 37333601.0 228.051342
In [206]:
#order the dataframe by the density value
#DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')[source]
data_2010_total.sort_values('density', ascending=False).head()
Out[206]:
state abbreviation area (sq. mi) ages year population density
389 District of Columbia DC 68.0 total 2010 605125.0 8898.897059
2490 Puerto Rico PR 3515.0 total 2010 3721208.0 1058.665149
1157 New Jersey NJ 8722.0 total 2010 8802707.0 1009.253268
1914 Rhode Island RI 1545.0 total 2010 1052669.0 681.339159
293 Connecticut CT 5544.0 total 2010 3579210.0 645.600649