首页 > 技术文章 > 在gff中切fa的内容

yuanjingnan 2019-07-05 00:59 原文

#!/usr/bin/python
import re

def readfa(l):
    col={}
    arr =[]
    sca =''
    li = open(l)
    for line in li:
        if re.match(r'>(\w*)',line):
            match = re.match(r'>(\w*)',line)
            sca = match.group(1)
            col[sca]=arr
            arr =[]
        else:
            without = re.sub(r'\n',"",line)
            arr.append(without)
    return col

def readgff(l):
    col ={}
    arr =[]
    li = open(l)
    for line in li:
        sp = line.split( )
        if sp[2] == 'mRNA':
            gene = re.match(r'ID=(.*?);',sp[8]).group(1)
            start =sp[3]
            arr=[]
            col[gene]=[sp[3],sp[4],arr,sp[0],sp[6]]
        elif sp[2] == 'CDS':
            gene = re.match(r'Parent=(.*?);',sp[8]).group(1)
            col[gene][2].append([sp[3],sp[4]])
    return col

def deal_gff(l):
    col ={}
    for key,value in l.items():
        start=value[0]
        end = value[1]
        arr = value[2]
        sca = value[3]
        pos = value[4]
        if pos == '+':
            for single in arr:
                single[0] = int(single[0]) - int(start)
                single[1] = int(single[1]) - int(start)+1
        elif pos =='-':
            for single in arr:
                off= int(end)-int(single[1])
                lon= int(end)-int(single[0])+1
                single[0] = off
                single[1] = lon
            arr.reverse
        col[sca]=arr
        del value[0]
        del value[0]
        del value[2]
    return l
###main###



gff=readgff('gff')
c=gff

fa =readfa('fa')

g=deal_gff(c)

col = {}
s=''

for k,v in g.items():
    sca = v[1]
    if fa[sca]:
        lon=s.join(fa[sca])
        short=''
        for i in v[0]:
            short += lon[i[0]:i[1]]
            col[k]=short

for k1,v1 in col.items():
    print k1,"\n",v1

 

推荐阅读