# DATA ANALYSIS USING PANDAS
# Analysis of the protein database (Result_protein.txt)
# that has been previously preprocessed using Python
# (Original data from the Protein Data Bank: https://www.rcsb.org/)
# Pandas is built on Numpy.
# The main difference between NumPy and pandas is that pandas adds indexes and labels
#Documentation: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html
import pandas as pd
import numpy as np
import sys
# LOADING AND INSPECTING THE DATA
# pandas ships reader/writer functions for many formats
# (CSV, JSON, HTML, SAS, SQL, ...):
#   readers: read_csv, read_sas, ...
#   writers: to_csv, to_sql, to_json, ...
# File without a header row:   pd.read_csv(file, header=None)
# Supply the column names:     pd.read_csv(file, names=[n1, n2, n3])
df = pd.read_csv("Result_protein.txt", sep=" ")

# Quick overview of the table. In an interactive session each expression
# below displays its value; run as a plain script the values are discarded
# and only df.info() prints on its own.
df.head(n=3)                 # first three rows
df.info()                    # column dtypes and non-null counts
df.columns                   # column labels
df.count()                   # number of non-null values per column
df.shape                     # (number of rows, number of columns)
df.describe()                # summary statistics

# Distinct values of a single column, and how many there are.
df.family.unique()
len(df.family.unique())
# SELECTING DATA
# Plain [] selects a column by name or rows by slice:
#   df['type']  -> the 'type' column
#   df[0:3]     -> the first three rows (positions 0, 1 and 2; the stop is excluded)
df['type'][0:3]  # 'type' column, first three rows

# Rows whose 'type' matches the pattern 'pro' (only the first three are
# shown), followed by the total number of such rows.
df[df['type'].str.contains('pro')][0:3]
len(df[df['type'].str.contains('pro')])

# Selection by label:     df.loc[row_labels, column_labels]
# Selection by position:  df.iloc[row_positions, column_positions]
# Both accessors need an index expression: a bare `df.loc[]` or `df.iloc[]`
# is a SyntaxError and would stop the whole script from running.
# Typical filter pattern: df.loc[df['column_name'] == some_value]
df.loc[df['pdbid'] == '1dy3']   # rows whose 'pdbid' equals '1dy3'
df.iloc[[1, 2, 3, 4], :]        # rows at positions 1-4, all columns
# BOOLEAN INDEXING
# A comparison on a column (e.g. df.npolar > 40) yields a True/False Series;
# indexing the frame with that mask keeps only the rows where it is True.
# head(n) merely limits how many rows are displayed.

# Attribute access only works for names that are valid identifiers
# (df.%npolar is a syntax error), so the column at position 8 --
# presumably '%npolar', confirm against the file header -- is reached
# by position instead.
df.loc[df.iloc[:, 8] > 0.40].head(3)
df.loc[df.npolar > 40].head(3)

# isin() tests membership in a list of allowed values.
# (DataFrame.isin also accepts a dict of column -> values,
#  e.g. df.isin({'chain': "A", 'type': "protein"}).)
df.loc[df['chain'].isin(['A', 'B'])].head(6)

familyname = ["T4_LYSOZYME", "MYOGLOBIN"]
len(df.loc[df.family.isin(familyname)])       # number of rows in those families
df.loc[df.family.isin(familyname)].head(5)

# "not in": ~ inverts the mask, keeping rows outside the same family list.
df.loc[~df.family.isin(familyname)].head(6)
# MISSING DATA
# Building a mask from a column that contains NaN fails:
#   [In]  len(df[df['family'].str.contains('MYOGLOBIN')])
#   [Out] cannot index with vector containing NA / NaN values
#
# 1. Identifying missing values
df.info()                    # non-null counts reveal that 'family' has nulls
df.isna().any()              # per column: True if any value is missing
df.isna().sum()              # per column: number of missing values
null_data = df[df.isna().any(axis=1)]   # rows with at least one missing value
null_data.head()

# 2. dropna() returns only the non-null data
# NOTE(review): len() of a boolean mask is the total number of rows, not the
# number of matches -- confirm whether .sum() was intended here.
len(df['type'].str.contains('protein'))
len(df.family.dropna())      # number of non-null 'family' values

# New frame restricted to rows without any missing value; df is not modified.
df2 = df.dropna()
df2.info()
df[df['pdbid'] == '1dy3']    # same lookup on the original, unfiltered frame

# 3. Filling missing values with a placeholder
df3 = df.fillna(value='Unknown')
df3[df3['pdbid'] == '1dy3']