python - 如何使用多个字典中的子字符串选择序列
问题描述
我有四个包含子字符串的字典:
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
和一个fasta文件:
>MN00153:75:000H37WNG:1:11102:8051:1085
NNNNNNNNCCCFFFNNNNNGGGTTTNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
>MN00153:75:000H37WNG:1:11102:16389:1090
NNNHHHFFFNNNNNNNOOOPPPNNNNNNN
>MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
如果两个子字符串来自特定字典,我想选择序列。子字符串的顺序并不重要。
换句话说,如果一个子字符串来自fw1
另一个子字符串来自fw2_rc
字典,我希望我的代码选择读取一个子字符串来自字典另一个OR
子字符串。fw1_rc
fw2
这是我的代码;它选择正确的读取,但多次重复输出:
from Bio import SeqIO
count=0
with open('file.fasta','r') as f:
for record in SeqIO.parse(f,'fasta'):
for k1,Fw1 in fw1.items():
for k2,Fw1_rc in fw1_rc.items():
for k3,Fw2 in fw2.items():
for k4,Fw2_rc in fw2_rc.items():
if Fw1 in record.seq and Fw2_rc in record.seq:
pos1 = record.seq.find(Fw1) + 1
pos2 = record.seq.find(Fw2_rc) + 1
if pos1 < pos2:
distance = pos2 - pos1
if pos1 > pos2:
distance = pos1 - pos2
print("sample_2")
print(record.id)
print(record.seq)
print(str(k1) + " : " + str(Fw1) + " - The position is " + str(pos1))
print(str(k4) + " : " + str(Fw2_rc) + " - The position is " + str(pos2))
print('\n')
if Fw1_rc in record.seq and Fw2 in record.seq:
pos1 = record.seq.find(Fw1_rc) + 1
pos2 = record.seq.find(Fw2) + 1
if pos1 < pos2:
distance = pos2 - pos1
if pos1 > pos2:
distance = pos1 - pos2
print(record.id)
print(record.seq)
print(str(k2) + " : " + str(Fw1_rc) + " - The position is " + str(pos1))
print(str(k3) + " : " + str(Fw2) + " - The position is " + str(pos2))
print('\n')
count+=1
print("The total number of reads that have both 21nt protein-specific sequences is " + str(count))
所需的输出应该是:
sample_2
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF - The position is 7
PLAU_fw2_rc : rrreee - The position is 18
sample_2
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff - The position is 1
MIF_fw2 : KKKZZZ - The position is 14
The total number of reads that have both 21nt protein-specific sequences is 2
解决方案
我无法获得计数,并且我反转了字典中的键/值项以允许查找(否则无法获得您想要的结果)
此外,我无法使用Bio
,只能从文本文件中读取,但我的代码的更改可以轻松更改为Bio
seq
and id
。
import re
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw1_rc= dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
fw2_rc= dict(zip(fw2_rc.values(), fw2_rc.keys()))
one_upcase = '(' + '|'.join(fw1.keys()) + ')'
one_locase = '(' + '|'.join(fw1_rc.keys()) + ')'
two_upcase = '(' + '|'.join(fw2.keys()) + ')'
two_locase = '(' + '|'.join(fw2_rc.keys()) + ')'
with open('f1.txt', 'r') as f:
_id = ''
count = 0
for line in f:
line = line.rstrip()
if line.startswith('>'):
_id = line[1:]
else:
if match := re.search(f'(?=.*{one_upcase})(?=.*{two_locase})', line):
print(_id)
print(line)
for item in match.groups():
idx = 1 + line.index(item)
if item.isupper():
print(fw1[item], ': ', end='')
else:
print(fw2_rc[item], ': ', end='')
print(item, 'The position is', idx)
print()
elif match := re.search(f'(?=.*{one_locase})(?=.*{two_upcase})', line):
print(_id)
print(line)
for item in match.groups():
idx = 1 + line.index(item)
if item.isupper():
print(fw2[item], ': ', end='')
else:
print(fw1_rc[item], ': ', end='')
print(item, 'The position is', idx)
print()
输出与您的输出相匹配:
MN00153:75:000H37WNG:1:11102:00000:1088
NNNNNNCCCFFFNNNNNrrreeeNNNNNNN
PLAU_fw1 : CCCFFF The position is 7
PLAU_fw2_rc : rrreee The position is 18
MN00153:75:000H37WNG:1:11102:00000:1095
cccfffNNNNNNNKKKZZZNNNNNNN
PLAU_fw1_rc : cccfff The position is 1
MIF_fw2 : KKKZZZ The position is 14
更新
这是不使用正则表达式的解决方案(另外,我安装Bio
它以将其用于此解决方案)
from Bio import SeqIO
# make values, (fasta seq), keys in dict and original keys now become values
fw1={'PLAU_fw1':'CCCFFF','EPCAM_fw1':'GGGTTT','MIF_fw1':'HHHFFF'}
fw1 = dict(zip(fw1.values(), fw1.keys()))
fw1_rc={'PLAU_fw1_rc':'cccfff','EPCAM_fw1_rc':'gggttt','MIF_fw1_rc':'hhhfff'}
fw1_rc= dict(zip(fw1_rc.values(), fw1_rc.keys()))
fw2={'PLAU_fw2':'RRREEE','EPCAM_fw2':'OOOPPP','MIF_fw2':'KKKZZZ'}
fw2 = dict(zip(fw2.values(), fw2.keys()))
fw2_rc={'PLAU_fw2_rc':'rrreee','EPCAM_fw2_rc':'oooppp','MIF_fw2_rc':'kkkzzz'}
fw2_rc= dict(zip(fw2_rc.values(), fw2_rc.keys()))
# store fasta substrings in lists
one_upcase = list(fw1.keys())
one_locase = list(fw1_rc.keys())
two_upcase = list(fw2.keys())
two_locase = list(fw2_rc.keys())
with open('fasta.txt', 'r') as f:
count = 0
for record in SeqIO.parse(f,'fasta'):
_id = record.id
seq = record.seq
last = False
for token_fw1 in one_upcase:
if last == True:
break
for token_fw2_rc in two_locase:
if token_fw1 in seq and token_fw2_rc in seq:
print(_id)
print(seq)
print(fw1[token_fw1], ':', token_fw1,
'in position', str(1+seq.index(token_fw1)))
print(fw2_rc[token_fw2_rc], ':', token_fw2_rc,
'in position', str(1+seq.index(token_fw2_rc)))
print()
last = True
break
for token_fw1_rc in one_locase:
if last == True:
break
for token_fw2 in two_upcase:
if token_fw1_rc in seq and token_fw2 in seq:
print(_id)
print(seq)
print(fw1_rc[token_fw1_rc], ':', token_fw1_rc,
'in position', str(1+seq.index(token_fw1_rc)))
print(fw2[token_fw2], ':', token_fw2,
'in position', str(1+seq.index(token_fw2)))
print()
last = True
break
我这里没有数,因为我不知道你想数什么。
我颠倒了字典(我认为你创建的字典错误),其中值(fasta 子字符串)成为键,键成为值。这允许在我的解决方案中进行查找:
print(fw1_rc[token_fw1_rc] etc
和print(fw2_rc[token_fw2_rc] etc
(此查找已完成 4 次中的 2 次)。
另外,请注意,我token
在这里说的是 fasta 子字符串。
推荐阅读
- java - 为什么 for 循环可以保存返回的布尔值但不能保存原始布尔值?
- r - 在 shinyapps.io RShiny 应用程序中使用 h2o 包
- kubernetes - 仅在 tidb 集群中由于 tidb pod 中的连接被拒绝而导致准备就绪问题失败
- python - 如何在 Python 中“堆叠”嵌套列表?
- influxdb-python - 如何从 Python 客户端为 InfluxDB 指定 range() 中的绝对时间?
- go - What errors are outputted to http.Server's ErrorLog struct field?
- python - 如何在字典中附加所有重复的字符串名称以使其成为对象列表?
- javascript - 为什么未定义元素数组上的`join()`在javascript中返回少1个元素?
- javascript - Whatsapp 只分享 window.location.search 的第一个参数
- spring-boot - 部署一个 Spring Boot 应用的战争到本地 Tomcat 服务器:404 响应