DATA ANALYSIS USING PANDAS¶

# DATA ANALYSIS USING PANDAS 
# Analysis of the protein database (Result_protein.txt)
# that has been previously preprocessed using Python 
# (Orginal data from the protein data bank: https://www.rcsb.org/)

# Pandas is built on Numpy. 
# The main difference between Numpy and Pandas are the indexes and labels
#Documentation:  https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html

import pandas as pd
import numpy as np
import sys

# READ
# There are several readers and writers available (CSV; JSON; html ; SAS; SQL; ...)
# Readers: read_csv ; read_sas ; ...etc.
# Writers: to_csv; to_sql ; to_sas ; ...etc
# Specify the absence of header:  pd.read_csv(file, header=None)
# Generate a header: pd.read_csv(file, names=[n1,n2,n3])

df =pd.read_csv('Result_protein.txt', sep=" " )

Explore de data: head, info, columns, count, shape, describe, unique¶

df.head(3)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431023 entries, 0 to 431022
Data columns (total 15 columns):
pdbid         431023 non-null object
chain         431023 non-null object
type          431023 non-null object
length        431023 non-null int64
family        430997 non-null object
nhydropho     431023 non-null int64
%nhydropho    431023 non-null float64
npolar        431023 non-null int64
%npolar       431023 non-null float64
naroma        431023 non-null int64
%naroma       431023 non-null float64
nposich       431023 non-null int64
%nposich      431023 non-null float64
nnegach       431023 non-null int64
%nnegach      431023 non-null float64
dtypes: float64(5), int64(6), object(4)
memory usage: 49.3+ MB

df.columns

Index([u'pdbid', u'chain', u'type', u'length', u'family', u'nhydropho',
       u'%nhydropho', u'npolar', u'%npolar', u'naroma', u'%naroma', u'nposich',
       u'%nposich', u'nnegach', u'%nnegach'],
      dtype='object')

df.count()

pdbid         431023
chain         431023
type          431023
length        431023
family        430997
nhydropho     431023
%nhydropho    431023
npolar        431023
%npolar       431023
naroma        431023
%naroma       431023
nposich       431023
%nposich      431023
nnegach       431023
%nnegach      431023
dtype: int64

df.shape

(431023, 15)

df.describe()

df['family'].unique()  #Get unique values of a column

array(['MYOGLOBIN', 'T4_LYSOZYME', 'GLUTATHIONE_S-TRANSFERASE_P1-1', ...,
       'FOXO3a_peptide', 'GLY_ALA_TRP_PEPTIDE', 'N-ACETYLHIRUDIN'],
      dtype=object)

len(df['family'].unique())

59018

Filter specific values: str.contains, loc, iloc, isin¶

#Select a column
#df['type']  # Select 'type' column
#df[0:3]     # Select rows from 0 to 3
df['type'][0:3] #Select 'type' column, rows from 0 to 3

0    protein
1    protein
2    protein
Name: type, dtype: object

df[df['type'].str.contains('pro')][0:3]

len(df[df['type'].str.contains('pro')])

431023

#Selection by label
df.loc[]

#Selection by position
df.iloc[]

#df.loc[df['column_name'] == some_value]
df.loc[df['pdbid'] == '1dy3']

df.iloc[[1,2,3,4],:]

# Boolean Indexing
#df[df.%npolar > 0.4] Invalid syntax:
df[df.iloc[:,8]>0.40][0:3]   #[0:3] limits the amount of output to display

#[df.npolar>40  #returns a true/false list
df[df.npolar>40][0:3]

df[df['chain'].isin(['A', 'B'])][0:6]
#df.isin({'chain': "A", 'type': "protein"})
#df2[df2['E'].isin(['two', 'four'])]

#countries = ["T4_LYSOZIME","MYOGLOBIN"]
#df.family.isin(countries)

familyname = ["T4_LYSOZYME","MYOGLOBIN"]
len(df[df.family.isin(familyname)])
df[df.family.isin(familyname)][0:5]

#not in

familyname = ["T4_LYSOZYME","MYOGLOBIN"]
df[~df.family.isin(familyname)][0:6]

Dealing with missing data: dropna, fillna¶

# [In] len(df[df['family'].str.contains('MYOGLOBIN')])   
# [Out] cannot index with vector containing NA / NaN values
# How to deal with missing data?

# 1.- How to identify missing values??

df.info()   #family contains null elements
df.isnull().any()    # family true: means it contains null elements

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431023 entries, 0 to 431022
Data columns (total 15 columns):
pdbid         431023 non-null object
chain         431023 non-null object
type          431023 non-null object
length        431023 non-null int64
family        430997 non-null object
nhydropho     431023 non-null int64
%nhydropho    431023 non-null float64
npolar        431023 non-null int64
%npolar       431023 non-null float64
naroma        431023 non-null int64
%naroma       431023 non-null float64
nposich       431023 non-null int64
%nposich      431023 non-null float64
nnegach       431023 non-null int64
%nnegach      431023 non-null float64
dtypes: float64(5), int64(6), object(4)
memory usage: 49.3+ MB

pdbid         False
chain         False
type          False
length        False
family         True
nhydropho     False
%nhydropho    False
npolar        False
%npolar       False
naroma        False
%naroma       False
nposich       False
%nposich      False
nnegach       False
%nnegach      False
dtype: bool

df.isnull().sum()   #isnull returns a true/false matrix

pdbid          0
chain          0
type           0
length         0
family        26
nhydropho      0
%nhydropho     0
npolar         0
%npolar        0
naroma         0
%naroma        0
nposich        0
%nposich       0
nnegach        0
%nnegach       0
dtype: int64

null_data = df[df.isnull().any(axis=1)]   #print null rows
null_data.head()

# 2.- dropna --- returns a serie of non-null data : drop

len(df['type'].str.contains('protein'))

431023

len(df['family'].dropna())  #Not null

430997

null_data = df[df.isnull().any(axis=1)]
null_data.head()

#transform the database to consider only non null values

df2=df.dropna()
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 430997 entries, 0 to 431022
Data columns (total 15 columns):
pdbid         430997 non-null object
chain         430997 non-null object
type          430997 non-null object
length        430997 non-null int64
family        430997 non-null object
nhydropho     430997 non-null int64
%nhydropho    430997 non-null float64
npolar        430997 non-null int64
%npolar       430997 non-null float64
naroma        430997 non-null int64
%naroma       430997 non-null float64
nposich       430997 non-null int64
%nposich      430997 non-null float64
nnegach       430997 non-null int64
%nnegach      430997 non-null float64
dtypes: float64(5), int64(6), object(4)
memory usage: 52.6+ MB

df.loc[df['pdbid'] == '1dy3']

# Filling missing values

df3 = df.fillna('Unknown')

df3.loc[df3['pdbid'] == '1dy3']

	pdbid	chain	type	length	family	nhydropho	%nhydropho	npolar	%npolar	naroma	%naroma	nposich	%nposich	nnegach	%nnegach
0	101m	A	protein	154	MYOGLOBIN	60	0.361446	29	0.174699	22	0.132530	35	0.210843	20	0.120482
1	102l	A	protein	165	T4_LYSOZYME	60	0.361446	46	0.277108	15	0.090361	27	0.162651	18	0.108434
2	102m	A	protein	154	MYOGLOBIN	60	0.363636	29	0.175758	22	0.133333	34	0.206061	20	0.121212

	length	nhydropho	%nhydropho	npolar	%npolar	naroma	%naroma	nposich	%nposich	nnegach	%nnegach
count	431023.000000	431023.000000	431023.000000	431023.000000	431023.000000	431023.000000	431023.000000	431023.000000	431023.000000	431023.000000	431023.000000
mean	257.625333	94.387933	0.353195	73.698833	0.278891	29.039411	0.107032	35.994399	0.144640	31.361663	0.116242
std	200.582322	75.572272	0.063410	58.858810	0.066262	24.819059	0.036343	27.248176	0.054907	26.623412	0.040079
min	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	129.000000	45.000000	0.317757	35.000000	0.239171	13.000000	0.084848	20.000000	0.115917	14.000000	0.096591
50%	219.000000	77.000000	0.352518	62.000000	0.272289	23.000000	0.107143	30.000000	0.136574	26.000000	0.116279
75%	333.000000	125.000000	0.388679	96.000000	0.307692	39.000000	0.128440	46.000000	0.160000	41.000000	0.135762
max	5037.000000	1887.000000	1.000000	1381.000000	1.000000	605.000000	1.000000	713.000000	1.000000	797.000000	1.000000

	pdbid	chain	type	length	family	nhydropho	%nhydropho	npolar	%npolar	naroma	%naroma	nposich	%nposich	nnegach	%nnegach
0	101m	A	protein	154	MYOGLOBIN	60	0.361446	29	0.174699	22	0.132530	35	0.210843	20	0.120482
1	102l	A	protein	165	T4_LYSOZYME	60	0.361446	46	0.277108	15	0.090361	27	0.162651	18	0.108434
2	102m	A	protein	154	MYOGLOBIN	60	0.363636	29	0.175758	22	0.133333	34	0.206061	20	0.121212

	pdbid	chain	type	length	family	nhydropho	%nhydropho	npolar	%npolar	naroma	%naroma	nposich	%nposich	nnegach	%nnegach
1	102l	A	protein	165	T4_LYSOZYME	60	0.361446	46	0.277108	15	0.090361	27	0.162651	18	0.108434
2	102m	A	protein	154	MYOGLOBIN	60	0.363636	29	0.175758	22	0.133333	34	0.206061	20	0.121212
3	103l	A	protein	167	T4_LYSOZYME	60	0.357143	47	0.279762	15	0.089286	27	0.160714	19	0.113095
4	103m	A	protein	154	MYOGLOBIN	60	0.363636	29	0.175758	22	0.133333	34	0.206061	20	0.121212

	pdbid	chain	type	length	family	nhydropho	%nhydropho	npolar	%npolar	naroma	%naroma	nposich	%nposich	nnegach	%nnegach
34	11ba	A	protein	124	PROTEIN_(RIBONUCLEASE,_SEMINAL)	33	0.257812	53	0.414062	11	0.085938	22	0.171875	9	0.070312
35	11ba	B	protein	124	PROTEIN_(RIBONUCLEASE,_SEMINAL)	33	0.257812	53	0.414062	11	0.085938	22	0.171875	9	0.070312
36	11bg	A	protein	124	PROTEIN_(BOVINE_SEMINAL_RIBONUCLEASE)	33	0.257812	53	0.414062	11	0.085938	22	0.171875	9	0.070312

	pdbid	chain	type	length	family	nhydropho	%nhydropho	npolar	%npolar	naroma	%naroma	nposich	%nposich	nnegach	%nnegach
16	10gs	A	protein	209	GLUTATHIONE_S-TRANSFERASE_P1-1	81	0.383886	62	0.293839	23	0.109005	22	0.104265	23	0.109005
17	10gs	B	protein	209	GLUTATHIONE_S-TRANSFERASE_P1-1	81	0.383886	62	0.293839	23	0.109005	22	0.104265	23	0.109005
18	10mh	A	protein	327	PROTEIN_(CYTOSINE-SPECIFIC_METHYLTRANSFERASE_H...	106	0.318318	100	0.300300	42	0.126126	46	0.138138	39	0.117117
28	117e	A	protein	286	PROTEIN_(INORGANIC_PYROPHOSPHATASE)	102	0.349315	72	0.246575	34	0.116438	41	0.140411	43	0.147260
29	117e	B	protein	286	PROTEIN_(INORGANIC_PYROPHOSPHATASE)	102	0.349315	72	0.246575	34	0.116438	41	0.140411	43	0.147260
32	11as	A	protein	330	ASPARAGINE_SYNTHETASE	127	0.370262	86	0.250729	36	0.104956	47	0.137026	47	0.137026

	pdbid	chain	type	length	family	nhydropho	%nhydropho	npolar	%npolar	naroma	%naroma	nposich	%nposich	nnegach	%nnegach
8040	1dy3	A	protein	158	NaN	68	0.419753	38	0.234568	17	0.104938	19	0.117284	20	0.123457
39551	1qo2	A	protein	241	NaN	93	0.378049	53	0.215447	20	0.081301	40	0.162602	40	0.162602
39552	1qo2	B	protein	241	NaN	93	0.378049	53	0.215447	20	0.081301	40	0.162602	40	0.162602
109996	2wnb	A	protein	298	NaN	94	0.304207	83	0.268608	46	0.148867	51	0.165049	35	0.113269
302171	5a5w	A	protein	253	NaN	106	0.403042	73	0.277567	23	0.087452	35	0.133080	26	0.098859

Pandas

Data Analysis using Pandas

DATA ANALYSIS USING PANDAS¶

Explore de data: head, info, columns, count, shape, describe, unique¶

Filter specific values: str.contains, loc, iloc, isin¶

Dealing with missing data: dropna, fillna¶