## Advanced analysis using pandas
# Example: US States Data
# Rank US states and territories by their 2010 population density
# source: Python DataScience Handbook (by Jake VanderPlas)

import pandas as pd
df1 = pd.read_csv('./state-abbrevs.csv')
df2= pd.read_csv('./state-population.csv')
df3=pd.read_csv('./state-areas.csv')

df1.info()
df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
state           51 non-null object
abbreviation    51 non-null object
dtypes: object(2)
memory usage: 888.0+ bytes

df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2544 entries, 0 to 2543
Data columns (total 4 columns):
state/region    2544 non-null object
ages            2544 non-null object
year            2544 non-null int64
population      2524 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 79.6+ KB

df3.info()
df3.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 2 columns):
state            52 non-null object
area (sq. mi)    52 non-null int64
dtypes: int64(1), object(1)
memory usage: 904.0+ bytes

#check null data in dataframes. df.info() There is no missing data. One has 52, the other
#51 entries, the second dataframe contains 2544 entries;

#Combining data : pd.merge(df1,df2,options)

# merge options : right(keys from right dataframe), left, outer(all keys), inner(intersection)

df =pd.merge(df1,df3, how='outer')

df1.head()
df2.head()

df2.head()

df.head()

#Merge lines with different size (dataframe1,dataframe2, column1, column2, options)
#options: right, left, inner , outer
dft=pd.merge(df,df2,left_on='abbreviation',right_on='state/region',how='outer')

#How to delete a column ---- df.drop(column,1)  axis : {0 or ‘index’, 1 or ‘columns’}                                
dft=dft.drop('state/region',1)      #drop the duplicated column

dft.head()

dft.columns

Index([u'state', u'abbreviation', u'area (sq. mi)', u'year', u'ages',
       u'population'],
      dtype='object')

#df now has one missing data! print null columns
dft.isnull().any()

state             True
abbreviation      True
area (sq. mi)     True
ages             False
year             False
population        True
dtype: bool

null_datat = dft[dft.isnull().any(axis=1)]   #print null rows
null_datat.head()

df['abbreviation'].unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MT', 'NE',
       'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'MD',
       'MA', 'MI', 'MN', 'MS', 'MO', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'PR'], dtype=object)

df.loc[df['state']=='Puerto Rico'].fillna('abbreviation'=='PR')
df[df.isnull().any(axis=1)]

#There is no abbreviation for Puerto Rico: add missing valuen to df

# Dealing with missing data: dropna() and fillna()
#df_new=df.loc[df['state']=='Puerto Rico'].fillna('abbreviation'=='PR')

dft.loc[dft['state']=='Puerto Rico'].fillna('abbreviation'=='PR')
len(dft[dft.isnull().any(axis=1)])

68

null_data2 = dft[dft.isnull().any(axis=1)]   #print null rows
null_data2.head()
null_data2.abbreviation.unique()

array(['PR', nan], dtype=object)

dfnew=dft[dft['state']=='Puerto Rico']  #new dataframe only with puerto rico
dfnew.head()

null_data = dfnew[dfnew.isnull().any(axis=1)]   #print null rows
null_data.head()

len(null_data.population.unique())
null_data.year.unique()  #there is no data for PR from 1990 to 1999

array([1990, 1991, 1993, 1992, 1994, 1995, 1996, 1998, 1997, 1999])

#Identify duplicate info 
# DataFrame.duplicated(subset=None, keep='first')[source]

df[df['abbreviation'].duplicated(keep='first')]

#Exemple: rank US states by their 2010 population density

data_2010=dft[dft['year']==2010]
data_2010.info()
null_2010=data_2010[data_2010.isnull().any(axis=1)]


null_2010.head()
#null_2010.abbreviation.unique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 2 to 2539
Data columns (total 6 columns):
state            104 non-null object
abbreviation     104 non-null object
area (sq. mi)    104 non-null float64
ages             106 non-null object
year             106 non-null int64
population       106 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 5.8+ KB

data_2010nonull=data_2010.dropna()  #ignore NaN columns
data_2010_total=data_2010nonull[data_2010nonull['ages']=='total']
sum(data_2010_total['population'])  # 
data_2010_total.head()

data_2010_total['density'] = data_2010_total['population']/data_2010_total['area (sq. mi)']
data_2010_total.head()

/Users/sandra/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys

#order the dataframe by the density value
#DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')[source]
data_2010_total.sort_values('density', ascending=False).head()

	state	area (sq. mi)
0	Alabama	52423
1	Alaska	656425
2	Arizona	114006
3	Arkansas	53182
4	California	163707

	state	abbreviation	area (sq. mi)
0	Alabama	AL	52423
1	Alaska	AK	656425
2	Arizona	AZ	114006
3	Arkansas	AR	53182
4	California	CA	163707

	state	abbreviation	area (sq. mi)	ages	year	population
0	Alabama	AL	52423.0	under18	2012	1117489.0
1	Alabama	AL	52423.0	total	2012	4817528.0
2	Alabama	AL	52423.0	under18	2010	1130966.0
3	Alabama	AL	52423.0	total	2010	4785570.0
4	Alabama	AL	52423.0	under18	2011	1125763.0

	state	abbreviation	area (sq. mi)	ages	year	population	density
389	District of Columbia	DC	68.0	total	2010	605125.0	8898.897059
2490	Puerto Rico	PR	3515.0	total	2010	3721208.0	1058.665149
1157	New Jersey	NJ	8722.0	total	2010	8802707.0	1009.253268
1914	Rhode Island	RI	1545.0	total	2010	1052669.0	681.339159
293	Connecticut	CT	5544.0	total	2010	3579210.0	645.600649

Pandas

Data Analysis using Pandas II (Plot representation with Matplotlib)

	state	abbreviation	area (sq. mi)	ages	year	population
2448	Puerto Rico	PR	3515.0	under18	1990	NaN
2449	Puerto Rico	PR	3515.0	total	1990	NaN
2450	Puerto Rico	PR	3515.0	total	1991	NaN
2451	Puerto Rico	PR	3515.0	under18	1991	NaN
2452	Puerto Rico	PR	3515.0	total	1993	NaN

	state	abbreviation	area (sq. mi)	ages	year	population
2538	NaN	NaN	NaN	under18	2010	74119556.0
2539	NaN	NaN	NaN	total	2010	309326295.0