linkedin sandra-acebes mail google github
abrir

Python

Data preprocessing in Python

Python transformation (from FASTA format to CSV)

In [ ]:
# Pandas is able to process files with a specific format (table, csv, sql, ...)
# However it doesn't understand other formats as FASTA. 

# In bioinformatics, FASTA format is a text-based format for representing either 
# nucleotide or peptide sequences, in which nucleotides or amino acids 
# are represented using single-letter codes. 

# A possible solution: preprocess the data with Python
In [ ]:
# This is an example of a FASTA file (for a protein called Myoglobin):
'''
>101m_A mol:protein length:154  MYOGLOBIN
MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKF
DRVKHLKTEAEMKASEDLKKHGVTVLTALGAILKKKGHHEAELK
PLAQSHATKHKIPIKYLEFISEAIIHVLHSRHPGNFGADAQGAM
NKALELFRKDIAAKYKELGYQG
'''
# Every protein starts with '>', then the name of the protein (101m_A), 
# the type of molecule (protein, DNA or RNA), the lenght of the protein, the name, and finally
# the sequence of aminoacids or nucleotids (represented by single-letter code)
In [15]:
def join_lines_in_protein(file_in):
    #The info of each protein is contained into two different lines
    #This funtion joins that information into a list
    all_lines=[]
    info=[]
    sequence=[]

    for line in open(file_in):
        all_lines.append(line)
        #save all lines as a list. 

    #The odd lines contatain the info and the pair ones the sequences.

    for N in range(0,(len(all_lines)/2)):

        info.append(all_lines[2*N])    # info : pdb name, type, lenght, name
        sequence.append(all_lines[2*N+1]) # sequence of aminoacids

    return info, sequence
In [12]:
## add some columns to my file
# pdb name, type, lenght, name , sequence, nhydropho, %, npolar, %, naroma, % ,nposich, %,  nnegach, 

def Countresi(Sequence):
    #This function counts the number of aa in a protein and the 
    #percentage


    aminos= {"hydropho":["A","L","M","I","P","V"],
    "polar":["C","G","Q","N","S","T"],"aroma":["F","Y","W","H"],
             "posich":["H","R","K"],"negach":["E","D"]}

    nhydropho=0
    npolar=0
    naroma=0
    nposich=0
    nnegach=0

    for amino in Sequence:
        if amino in aminos["hydropho"]:
            nhydropho+=1
        if amino in aminos["polar"]:
            npolar+=1
        if amino in aminos["aroma"]:
            naroma+=1
        if amino in aminos["posich"]:
            nposich+=1
        if amino in aminos["negach"]:
            nnegach+=1
    total= float(nhydropho+npolar+naroma+nposich+nnegach)
    #print total  
    result = str(nhydropho)+' '+str(nhydropho/total)+' '+str(npolar)+' '+str(npolar/total)+' '+str(naroma)+' '+str(naroma/total)+' '+str(nposich)+' '+str(nposich/total)+' '+str(nnegach)+' '+str(nnegach/total)
    return result

In [17]:
from timeit import default_timer as timer
# START MY TIMER
start = timer()

file_in2= './pdb_seqres.txt'  #FASTA file
info,sequence=join_lines_in_protein(file_in2)
join_lines_in_protein(file_in2)   #funtion that joins two lines into one (per protein)


fileout=open('./Result_protein.txt','w')

fileout.write('pdbid chain type length family nhydropho %nhydropho npolar %npolar naroma %naroma nposich %nposich nnegach %nnegach'+'\n')
allb=[]




for ele1,ele2 in zip(info,sequence):

    if ele1.split()[1]=="mol:protein" and not 'X' in ele2:   #to remove DNA and remove X  

            pdbid =(ele1.split('_')[0])[1:]
            chain =(ele1.split('_')[1])[0]
            ptype=(ele1.split()[1])[4:]
            length=(ele1.split()[2])[7:]
            family=ele1.split()[3:]  #There are families with several words

            ele3=pdbid+' '+chain+' '+ptype+' '+length+' '+ " ".join([str(i) for i in family])+' '+Countresi(ele2)


            fileout.write(ele3+'\n')
            allb.append(ele3)


fileout.close()

print 'Total number of proteins', len(allb) # less than the original number of proteins because
                                            #I have eliminated DNA, RNA and proteins with
                                            # non-standar residues, as X.
# STOP MY TIMER
elapsed_time = timer() - start # in seconds
print "time in seconds", elapsed_time

LEN 431023
time in seconds 139.081338882