linkedin sandra-acebes mail google github
abrir

Pandas

Data Analysis using Pandas

DATA ANALYSIS USING PANDAS

In [ ]:
# DATA ANALYSIS USING PANDAS 
# Analysis of the protein database (Result_protein.txt)
# that has been previously preprocessed using Python 
# (Orginal data from the protein data bank: https://www.rcsb.org/)
In [ ]:
# Pandas is built on Numpy. 
# The main difference between Numpy and Pandas are the indexes and labels
#Documentation:  https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html
In [2]:
import pandas as pd
import numpy as np
import sys
In [ ]:
# READ
# There are several readers and writers available (CSV; JSON; html ; SAS; SQL; ...)
# Readers: read_csv ; read_sas ; ...etc.
# Writers: to_csv; to_sql ; to_sas ; ...etc
# Specify the absence of header:  pd.read_csv(file, header=None)
# Generate a header: pd.read_csv(file, names=[n1,n2,n3])
In [3]:
df =pd.read_csv('Result_protein.txt', sep=" " )

Explore de data: head, info, columns, count, shape, describe, unique

In [4]:
df.head(3)
Out[4]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
0 101m A protein 154 MYOGLOBIN 60 0.361446 29 0.174699 22 0.132530 35 0.210843 20 0.120482
1 102l A protein 165 T4_LYSOZYME 60 0.361446 46 0.277108 15 0.090361 27 0.162651 18 0.108434
2 102m A protein 154 MYOGLOBIN 60 0.363636 29 0.175758 22 0.133333 34 0.206061 20 0.121212
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431023 entries, 0 to 431022
Data columns (total 15 columns):
pdbid         431023 non-null object
chain         431023 non-null object
type          431023 non-null object
length        431023 non-null int64
family        430997 non-null object
nhydropho     431023 non-null int64
%nhydropho    431023 non-null float64
npolar        431023 non-null int64
%npolar       431023 non-null float64
naroma        431023 non-null int64
%naroma       431023 non-null float64
nposich       431023 non-null int64
%nposich      431023 non-null float64
nnegach       431023 non-null int64
%nnegach      431023 non-null float64
dtypes: float64(5), int64(6), object(4)
memory usage: 49.3+ MB
In [10]:
df.columns
Out[10]:
Index([u'pdbid', u'chain', u'type', u'length', u'family', u'nhydropho',
       u'%nhydropho', u'npolar', u'%npolar', u'naroma', u'%naroma', u'nposich',
       u'%nposich', u'nnegach', u'%nnegach'],
      dtype='object')
In [11]:
df.count()
Out[11]:
pdbid         431023
chain         431023
type          431023
length        431023
family        430997
nhydropho     431023
%nhydropho    431023
npolar        431023
%npolar       431023
naroma        431023
%naroma       431023
nposich       431023
%nposich      431023
nnegach       431023
%nnegach      431023
dtype: int64
In [22]:
df.shape
Out[22]:
(431023, 15)
In [28]:
df.describe()
Out[28]:
length nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
count 431023.000000 431023.000000 431023.000000 431023.000000 431023.000000 431023.000000 431023.000000 431023.000000 431023.000000 431023.000000 431023.000000
mean 257.625333 94.387933 0.353195 73.698833 0.278891 29.039411 0.107032 35.994399 0.144640 31.361663 0.116242
std 200.582322 75.572272 0.063410 58.858810 0.066262 24.819059 0.036343 27.248176 0.054907 26.623412 0.040079
min 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 129.000000 45.000000 0.317757 35.000000 0.239171 13.000000 0.084848 20.000000 0.115917 14.000000 0.096591
50% 219.000000 77.000000 0.352518 62.000000 0.272289 23.000000 0.107143 30.000000 0.136574 26.000000 0.116279
75% 333.000000 125.000000 0.388679 96.000000 0.307692 39.000000 0.128440 46.000000 0.160000 41.000000 0.135762
max 5037.000000 1887.000000 1.000000 1381.000000 1.000000 605.000000 1.000000 713.000000 1.000000 797.000000 1.000000
In [19]:
df['family'].unique()  #Get unique values of a column
Out[19]:
array(['MYOGLOBIN', 'T4_LYSOZYME', 'GLUTATHIONE_S-TRANSFERASE_P1-1', ...,
       'FOXO3a_peptide', 'GLY_ALA_TRP_PEPTIDE', 'N-ACETYLHIRUDIN'],
      dtype=object)
In [20]:
len(df['family'].unique())
Out[20]:
59018

Filter specific values: str.contains, loc, iloc, isin

In [24]:
#Select a column
#df['type']  # Select 'type' column
#df[0:3]     # Select rows from 0 to 3
df['type'][0:3] #Select 'type' column, rows from 0 to 3
Out[24]:
0    protein
1    protein
2    protein
Name: type, dtype: object
In [46]:
df[df['type'].str.contains('pro')][0:3]
Out[46]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
0 101m A protein 154 MYOGLOBIN 60 0.361446 29 0.174699 22 0.132530 35 0.210843 20 0.120482
1 102l A protein 165 T4_LYSOZYME 60 0.361446 46 0.277108 15 0.090361 27 0.162651 18 0.108434
2 102m A protein 154 MYOGLOBIN 60 0.363636 29 0.175758 22 0.133333 34 0.206061 20 0.121212
In [4]:
len(df[df['type'].str.contains('pro')])
Out[4]:
431023
In [ ]:
#Selection by label
df.loc[]

#Selection by position
df.iloc[]
In [14]:
#df.loc[df['column_name'] == some_value]
df.loc[df['pdbid'] == '1dy3']
Out[14]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
8040 1dy3 A protein 158 NaN 68 0.419753 38 0.234568 17 0.104938 19 0.117284 20 0.123457
In [29]:
df.iloc[[1,2,3,4],:]
Out[29]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
1 102l A protein 165 T4_LYSOZYME 60 0.361446 46 0.277108 15 0.090361 27 0.162651 18 0.108434
2 102m A protein 154 MYOGLOBIN 60 0.363636 29 0.175758 22 0.133333 34 0.206061 20 0.121212
3 103l A protein 167 T4_LYSOZYME 60 0.357143 47 0.279762 15 0.089286 27 0.160714 19 0.113095
4 103m A protein 154 MYOGLOBIN 60 0.363636 29 0.175758 22 0.133333 34 0.206061 20 0.121212
In [49]:
# Boolean Indexing
#df[df.%npolar > 0.4] Invalid syntax:
df[df.iloc[:,8]>0.40][0:3]   #[0:3] limits the amount of output to display
Out[49]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
34 11ba A protein 124 PROTEIN_(RIBONUCLEASE,_SEMINAL) 33 0.257812 53 0.414062 11 0.085938 22 0.171875 9 0.070312
35 11ba B protein 124 PROTEIN_(RIBONUCLEASE,_SEMINAL) 33 0.257812 53 0.414062 11 0.085938 22 0.171875 9 0.070312
36 11bg A protein 124 PROTEIN_(BOVINE_SEMINAL_RIBONUCLEASE) 33 0.257812 53 0.414062 11 0.085938 22 0.171875 9 0.070312
In [48]:
#[df.npolar>40  #returns a true/false list
df[df.npolar>40][0:3]
Out[48]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
1 102l A protein 165 T4_LYSOZYME 60 0.361446 46 0.277108 15 0.090361 27 0.162651 18 0.108434
3 103l A protein 167 T4_LYSOZYME 60 0.357143 47 0.279762 15 0.089286 27 0.160714 19 0.113095
5 104l A protein 166 T4_LYSOZYME 61 0.365269 46 0.275449 15 0.089820 27 0.161677 18 0.107784
In [60]:
df[df['chain'].isin(['A', 'B'])][0:6]
#df.isin({'chain': "A", 'type': "protein"})
#df2[df2['E'].isin(['two', 'four'])]

#countries = ["T4_LYSOZIME","MYOGLOBIN"]
#df.family.isin(countries)
Out[60]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
0 101m A protein 154 MYOGLOBIN 60 0.361446 29 0.174699 22 0.132530 35 0.210843 20 0.120482
1 102l A protein 165 T4_LYSOZYME 60 0.361446 46 0.277108 15 0.090361 27 0.162651 18 0.108434
2 102m A protein 154 MYOGLOBIN 60 0.363636 29 0.175758 22 0.133333 34 0.206061 20 0.121212
3 103l A protein 167 T4_LYSOZYME 60 0.357143 47 0.279762 15 0.089286 27 0.160714 19 0.113095
4 103m A protein 154 MYOGLOBIN 60 0.363636 29 0.175758 22 0.133333 34 0.206061 20 0.121212
5 104l A protein 166 T4_LYSOZYME 61 0.365269 46 0.275449 15 0.089820 27 0.161677 18 0.107784
In [91]:
familyname = ["T4_LYSOZYME","MYOGLOBIN"]
len(df[df.family.isin(familyname)])
df[df.family.isin(familyname)][0:5]
Out[91]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
0 101m A protein 154 MYOGLOBIN 60 0.361446 29 0.174699 22 0.132530 35 0.210843 20 0.120482
1 102l A protein 165 T4_LYSOZYME 60 0.361446 46 0.277108 15 0.090361 27 0.162651 18 0.108434
2 102m A protein 154 MYOGLOBIN 60 0.363636 29 0.175758 22 0.133333 34 0.206061 20 0.121212
3 103l A protein 167 T4_LYSOZYME 60 0.357143 47 0.279762 15 0.089286 27 0.160714 19 0.113095
4 103m A protein 154 MYOGLOBIN 60 0.363636 29 0.175758 22 0.133333 34 0.206061 20 0.121212
In [92]:
#not in

familyname = ["T4_LYSOZYME","MYOGLOBIN"]
df[~df.family.isin(familyname)][0:6]
Out[92]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
16 10gs A protein 209 GLUTATHIONE_S-TRANSFERASE_P1-1 81 0.383886 62 0.293839 23 0.109005 22 0.104265 23 0.109005
17 10gs B protein 209 GLUTATHIONE_S-TRANSFERASE_P1-1 81 0.383886 62 0.293839 23 0.109005 22 0.104265 23 0.109005
18 10mh A protein 327 PROTEIN_(CYTOSINE-SPECIFIC_METHYLTRANSFERASE_H... 106 0.318318 100 0.300300 42 0.126126 46 0.138138 39 0.117117
28 117e A protein 286 PROTEIN_(INORGANIC_PYROPHOSPHATASE) 102 0.349315 72 0.246575 34 0.116438 41 0.140411 43 0.147260
29 117e B protein 286 PROTEIN_(INORGANIC_PYROPHOSPHATASE) 102 0.349315 72 0.246575 34 0.116438 41 0.140411 43 0.147260
32 11as A protein 330 ASPARAGINE_SYNTHETASE 127 0.370262 86 0.250729 36 0.104956 47 0.137026 47 0.137026

Dealing with missing data: dropna, fillna

In [103]:
# [In] len(df[df['family'].str.contains('MYOGLOBIN')])   
# [Out] cannot index with vector containing NA / NaN values
# How to deal with missing data?
In [ ]:
# 1.- How to identify missing values??
In [95]:
df.info()   #family contains null elements
df.isnull().any()    # family true: means it contains null elements
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431023 entries, 0 to 431022
Data columns (total 15 columns):
pdbid         431023 non-null object
chain         431023 non-null object
type          431023 non-null object
length        431023 non-null int64
family        430997 non-null object
nhydropho     431023 non-null int64
%nhydropho    431023 non-null float64
npolar        431023 non-null int64
%npolar       431023 non-null float64
naroma        431023 non-null int64
%naroma       431023 non-null float64
nposich       431023 non-null int64
%nposich      431023 non-null float64
nnegach       431023 non-null int64
%nnegach      431023 non-null float64
dtypes: float64(5), int64(6), object(4)
memory usage: 49.3+ MB
Out[95]:
pdbid         False
chain         False
type          False
length        False
family         True
nhydropho     False
%nhydropho    False
npolar        False
%npolar       False
naroma        False
%naroma       False
nposich       False
%nposich      False
nnegach       False
%nnegach      False
dtype: bool
In [98]:
df.isnull().sum()   #isnull returns a true/false matrix
Out[98]:
pdbid          0
chain          0
type           0
length         0
family        26
nhydropho      0
%nhydropho     0
npolar         0
%npolar        0
naroma         0
%naroma        0
nposich        0
%nposich       0
nnegach        0
%nnegach       0
dtype: int64
In [114]:
null_data = df[df.isnull().any(axis=1)]   #print null rows
null_data.head()
Out[114]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
8040 1dy3 A protein 158 NaN 68 0.419753 38 0.234568 17 0.104938 19 0.117284 20 0.123457
39551 1qo2 A protein 241 NaN 93 0.378049 53 0.215447 20 0.081301 40 0.162602 40 0.162602
39552 1qo2 B protein 241 NaN 93 0.378049 53 0.215447 20 0.081301 40 0.162602 40 0.162602
109996 2wnb A protein 298 NaN 94 0.304207 83 0.268608 46 0.148867 51 0.165049 35 0.113269
302171 5a5w A protein 253 NaN 106 0.403042 73 0.277567 23 0.087452 35 0.133080 26 0.098859
In [ ]:
# 2.- dropna --- returns a serie of non-null data : drop 
In [115]:
len(df['type'].str.contains('protein'))
Out[115]:
431023
In [117]:
len(df['family'].dropna())  #Not null
Out[117]:
430997
In [17]:
null_data = df[df.isnull().any(axis=1)]
null_data.head()
Out[17]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
8040 1dy3 A protein 158 NaN 68 0.419753 38 0.234568 17 0.104938 19 0.117284 20 0.123457
39551 1qo2 A protein 241 NaN 93 0.378049 53 0.215447 20 0.081301 40 0.162602 40 0.162602
39552 1qo2 B protein 241 NaN 93 0.378049 53 0.215447 20 0.081301 40 0.162602 40 0.162602
109996 2wnb A protein 298 NaN 94 0.304207 83 0.268608 46 0.148867 51 0.165049 35 0.113269
302171 5a5w A protein 253 NaN 106 0.403042 73 0.277567 23 0.087452 35 0.133080 26 0.098859
In [ ]:
#transform the database to consider only non null values
In [107]:
df2=df.dropna()
df2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 430997 entries, 0 to 431022
Data columns (total 15 columns):
pdbid         430997 non-null object
chain         430997 non-null object
type          430997 non-null object
length        430997 non-null int64
family        430997 non-null object
nhydropho     430997 non-null int64
%nhydropho    430997 non-null float64
npolar        430997 non-null int64
%npolar       430997 non-null float64
naroma        430997 non-null int64
%naroma       430997 non-null float64
nposich       430997 non-null int64
%nposich      430997 non-null float64
nnegach       430997 non-null int64
%nnegach      430997 non-null float64
dtypes: float64(5), int64(6), object(4)
memory usage: 52.6+ MB
In [13]:
df.loc[df['pdbid'] == '1dy3']
Out[13]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
8040 1dy3 A protein 158 NaN 68 0.419753 38 0.234568 17 0.104938 19 0.117284 20 0.123457
In [15]:
# Filling missing values

df3 = df.fillna('Unknown')
In [16]:
df3.loc[df3['pdbid'] == '1dy3']
Out[16]:
pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach
8040 1dy3 A protein 158 Unknown 68 0.419753 38 0.234568 17 0.104938 19 0.117284 20 0.123457