linkedin sandra-acebes mail google github
abrir

Pandas

Data Analysis using Pandas II (Plot representation with Matplotlib)

In [ ]:
# Example Data Exploration with Pandas 
In [ ]:
## 1.- Data exploration   
In [10]:
import seaborn as sns
planets=sns.load_dataset('planets')
In [11]:
planets.tail()
Out[11]:
method number orbital_period mass distance year
1030 Transit 1 3.941507 NaN 172.0 2006
1031 Transit 1 2.615864 NaN 148.0 2007
1032 Transit 1 3.191524 NaN 174.0 2007
1033 Transit 1 4.125083 NaN 293.0 2008
1034 Transit 1 4.187757 NaN 260.0 2008
In [8]:
planets.head()
Out[8]:
method number orbital_period mass distance year
0 Radial Velocity 1 269.300 7.10 77.40 2006
1 Radial Velocity 1 874.774 2.21 56.95 2008
2 Radial Velocity 1 763.000 2.60 19.84 2011
3 Radial Velocity 1 326.030 19.40 110.62 2007
4 Radial Velocity 1 516.220 10.50 119.47 2009
In [9]:
planets.shape
Out[9]:
(1035, 6)
In [13]:
planets.dropna().describe()
Out[13]:
number orbital_period mass distance year
count 498.00000 498.000000 498.000000 498.000000 498.000000
mean 1.73494 835.778671 2.509320 52.068213 2007.377510
std 1.17572 1469.128259 3.636274 46.596041 4.167284
min 1.00000 1.328300 0.003600 1.350000 1989.000000
25% 1.00000 38.272250 0.212500 24.497500 2005.000000
50% 1.00000 357.000000 1.245000 39.940000 2009.000000
75% 2.00000 999.600000 2.867500 59.332500 2011.000000
max 6.00000 17337.500000 25.000000 354.000000 2014.000000
In [ ]:
#Listing of Pandas aggregation methods: 
# count(); first(); last(); mean(); median(); min(); max(); std(); var(); mad(); prod(); sum()    
In [14]:
planets.number.mean()
Out[14]:
1.7855072463768116
In [15]:
planets.mass.min()
Out[15]:
0.0036
In [ ]:
# Group by
In [20]:
planets.dropna().groupby('year').sum()
Out[20]:
number orbital_period mass distance
year
1989 1 83.888000 11.68000 40.57
1995 1 4.230785 0.47200 15.36
1996 13 1100.581533 7.91760 55.68
1997 1 39.845000 1.04000 17.43
1998 11 571.553306 13.43560 131.51
1999 23 8280.689040 45.31300 417.13
2000 22 8008.387400 50.86220 396.76
2001 14 8712.217000 37.74000 409.34
2002 45 31097.482023 122.05600 1376.28
2003 30 17364.364880 80.54100 979.13
2004 27 11811.296000 21.81400 640.68
2005 59 21090.221532 68.13250 1576.29
2006 29 15711.295275 35.99600 887.89
2007 39 12928.331900 75.53700 1681.90
2008 76 31883.047900 147.24658 2449.61
2009 98 96722.469100 221.67400 4169.84
2010 90 33603.340430 52.87400 2031.11
2011 166 54341.398100 109.11660 5379.56
2012 51 45216.585700 83.60630 1494.49
2013 65 10367.553090 59.52700 1690.39
2014 3 7279.000000 3.06000 89.02
In [24]:
planets.dropna().groupby('year')['mass'].mean()
Out[24]:
year
1989    11.680000
1995     0.472000
1996     1.979400
1997     1.040000
1998     2.687120
1999     3.236643
2000     3.633014
2001     3.430909
2002     3.937290
2003     3.660955
2004     1.454267
2005     2.003897
2006     1.799800
2007     2.797667
2008     3.424339
2009     3.166771
2010     1.289610
2011     1.212407
2012     3.635057
2013     2.052655
2014     1.530000
Name: mass, dtype: float64
In [26]:
#Iteration over groups
planets.groupby('method')['year'].describe().unstack()[:3]
Out[26]:
       method
count  Astrometry                    2.0
       Eclipse Timing Variations     9.0
       Imaging                      38.0
dtype: float64
In [ ]:
# aggregate, filter, transform, apply
In [ ]:
#Transform. Syntaxis: df.transform(lambda x:x-x.mean())
In [34]:
#pivot tables
planets.pivot_table('distance', index='year', columns='method')[0:3]
#makes a summary of the information
Out[34]:
method Astrometry Eclipse Timing Variations Imaging Microlensing Orbital Brightness Modulation Pulsar Timing Radial Velocity Transit Transit Timing Variations
year
1989 NaN NaN NaN NaN NaN NaN 40.57 NaN NaN
1992 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1994 NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [35]:
planets.pivot_table('distance', 'year', 'method')[0:3]
Out[35]:
method Astrometry Eclipse Timing Variations Imaging Microlensing Orbital Brightness Modulation Pulsar Timing Radial Velocity Transit Transit Timing Variations
year
1989 NaN NaN NaN NaN NaN NaN 40.57 NaN NaN
1992 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1994 NaN NaN NaN NaN NaN NaN NaN NaN NaN