# Pandas is able to process files with a specific format (table, csv, sql, ...)
# However it doesn't understand other formats as FASTA.
# In bioinformatics, FASTA format is a text-based format for representing either
# nucleotide or peptide sequences, in which nucleotides or amino acids
# are represented using single-letter codes.
# A possible solution: preprocess the data with Python
# This is an example of a FASTA file (for a protein called Myoglobin):
'''
>101m_A mol:protein length:154 MYOGLOBIN
MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKF
DRVKHLKTEAEMKASEDLKKHGVTVLTALGAILKKKGHHEAELK
PLAQSHATKHKIPIKYLEFISEAIIHVLHSRHPGNFGADAQGAM
NKALELFRKDIAAKYKELGYQG
'''
# Every protein starts with '>', then the name of the protein (101m_A),
# the type of molecule (protein, DNA or RNA), the lenght of the protein, the name, and finally
# the sequence of aminoacids or nucleotids (represented by single-letter code)
def join_lines_in_protein(file_in):
#The info of each protein is contained into two different lines
#This funtion joins that information into a list
all_lines=[]
info=[]
sequence=[]
for line in open(file_in):
all_lines.append(line)
#save all lines as a list.
#The odd lines contatain the info and the pair ones the sequences.
for N in range(0,(len(all_lines)/2)):
info.append(all_lines[2*N]) # info : pdb name, type, lenght, name
sequence.append(all_lines[2*N+1]) # sequence of aminoacids
return info, sequence
## add some columns to my file
# pdb name, type, lenght, name , sequence, nhydropho, %, npolar, %, naroma, % ,nposich, %, nnegach,
def Countresi(Sequence):
#This function counts the number of aa in a protein and the
#percentage
aminos= {"hydropho":["A","L","M","I","P","V"],
"polar":["C","G","Q","N","S","T"],"aroma":["F","Y","W","H"],
"posich":["H","R","K"],"negach":["E","D"]}
nhydropho=0
npolar=0
naroma=0
nposich=0
nnegach=0
for amino in Sequence:
if amino in aminos["hydropho"]:
nhydropho+=1
if amino in aminos["polar"]:
npolar+=1
if amino in aminos["aroma"]:
naroma+=1
if amino in aminos["posich"]:
nposich+=1
if amino in aminos["negach"]:
nnegach+=1
total= float(nhydropho+npolar+naroma+nposich+nnegach)
#print total
result = str(nhydropho)+' '+str(nhydropho/total)+' '+str(npolar)+' '+str(npolar/total)+' '+str(naroma)+' '+str(naroma/total)+' '+str(nposich)+' '+str(nposich/total)+' '+str(nnegach)+' '+str(nnegach/total)
return result
from timeit import default_timer as timer
# START MY TIMER
start = timer()
file_in2= './pdb_seqres.txt' #FASTA file
info,sequence=join_lines_in_protein(file_in2)
join_lines_in_protein(file_in2) #funtion that joins two lines into one (per protein)
fileout=open('./Result_protein.txt','w')
fileout.write('pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach'+'\n')
allb=[]
for ele1,ele2 in zip(info,sequence):
if ele1.split()[1]=="mol:protein" and not 'X' in ele2: #to remove DNA and remove X
pdbid =(ele1.split('_')[0])[1:]
chain =(ele1.split('_')[1])[0]
ptype=(ele1.split()[1])[4:]
length=(ele1.split()[2])[7:]
family=ele1.split()[3:] #There are families with several words
ele3=pdbid+' '+chain+' '+ptype+' '+length+' '+ " ".join([str(i) for i in family])+' '+Countresi(ele2)
fileout.write(ele3+'\n')
allb.append(ele3)
fileout.close()
print 'Total number of proteins', len(allb) # less than the original number of proteins because
#I have eliminated DNA, RNA and proteins with
# non-standar residues, as X.
# STOP MY TIMER
elapsed_time = timer() - start # in seconds
print "time in seconds", elapsed_time