首页 > 解决方案 > 如何创建散点图以将 2 个嵌套字典与另一个具有相同键的嵌套字典进行比较

问题描述

我有三个嵌套字典:data_geo1、data_geo2、data_ali,它们都由两级键和一个值组成。目标是相互比较这些值。三个字典的第一级键是相同的;至于第二级键,data_ali 中有 30 个与第一个字典相同,另有 20 个与第二个字典相同。不幸的是,各字典中键的顺序并不一致。目标是使用 matplotlib 和 numpy 数组创建散点图,将第一个和第二个字典分别与 data_ali 中的数据进行比较。这有点复杂,我不知道从哪里开始!以下是我为创建这些字典而编写的代码:

import os
import numpy as np

# Directory holding the two GSE98212 count tables opened below.
path = "/home/ali/Desktop/data/"
# Directory listed with os.listdir(); every entry is opened as <entry>/counts.txt.
root = "/home/ali/Desktop/SAMPLES/"



# Build data_geo1 as {gene: {sample_name: count}} from the "H" count table.
# Header fields 1..30 are used as sample names; surrounding double quotes
# are stripped from both sample names and gene identifiers.
data_geo1 = {}
with open(path + "GSE98212_H_DE_genes_count.txt", "rt") as fin:
    sample1 = fin.readline().split()
    sample_names = [name.strip('"') for name in sample1[1:31]]
    for raw_line in fin:
        fields = raw_line.strip().split()
        if not fields:
            # Blank line: nothing to record.
            continue
        gene = fields[0].strip('"')
        per_sample = {}
        data_geo1[gene] = per_sample
        for column, value in enumerate(fields[1:31]):
            per_sample[sample_names[column]] = int(value)

#print(data_geo1)

# Build data_geo2 as {gene: {sample_name: count}} from the "L" count table.
# Header fields 1..20 are used as sample names.
data_geo2 = {}
with open(path + "GSE98212_L_DE_genes_count.txt", "rt") as fin:
    h = fin.readline()
    sample2 = h.split()
    # Strip double quotes the same way the data_geo1 loader does; this is a
    # no-op when the file does not quote its fields.
    sample_names = [s.strip('"') for s in sample2[1:21]]
    for line in fin:
        fields = line.split()
        if not fields:
            # Skip blank lines entirely. Previously only the gene assignment
            # was guarded, so a blank line reset the previous gene's entry
            # to an empty dict.
            continue
        gene = fields[0].strip('"')
        data_geo2[gene] = {}
        for i, x in enumerate(fields[1:21]):
            data_geo2[gene][sample_names[i]] = int(x)

#print(data_geo2)

# Build data_ali as {gene: {sample_name: reads}}.
# Every entry of `root` is treated as a sample directory containing a
# counts.txt file with two whitespace-separated fields per line.
data_ali = {}

for sample_name in os.listdir(root):
    counts_file = os.path.join(root, sample_name, "counts.txt")
    with open(counts_file, "r") as fin:
        for record in fin:
            gene, reads = record.split()
            reads = int(reads)
            # Only identifiers starting with 'ENSG' are kept.
            if not gene.startswith('ENSG'):
                continue
            if gene not in data_ali:
                data_ali[gene] = {}
            data_ali[gene][sample_name] = reads

#print(data_ali)

每个字典结构的示例:

data_geo1: {'ENSG00000110514': {'Sample_19-leish_023_v2': 709, 'Sample_4-leish_012_v3': 501, 'Sample_25-leish027_v2': 690, 'Sample_6-leish_015_v3': 463, 'Sample_23-leish026_v2': 707, 'Sample_20-leish_023_v3': 619, 'Sample_18-leish_022_v3': 678, 'Sample_10-leish_017_v3': 477, 'Sample_13-leish_019_v2': 460, 'Sample_1-Leish_011_v2': 574, 'Sample_11-leish_018_v2': 566, 'Sample_3-leish_012_v2':632,'Sample_2-leish_011_v3':388,'Sample_29-leish032_v2':661,'Sample_8-leish_016_v3':372,'Sample_28-leish028_v3':533,'Sample_27-leish028_v2_2':661'Sample_28-leish028_v2_2':6-02 624,'Sample_12-leish_018_v3':653,'Sample_5-leish_015_v2':421,'Sample_16-leish_021_v3':376,'Sample_21-leish_024_v2': 668, 'Sample_9-leish_017_v2': 583, 'Sample_24-leish026_v3': 590, 'Sample_22-leish_024_v3': 537, 'Sample_14-leish_019_v3': 438, 'Sample_30-leish032_v3': 494, 'Sample_7-leish_016_v2':518,'Sample_15-leish_021_v2':834,'Sample_17-leish_022_v2':742}}

data_geo2: {'ENSG00000110514': {'Sample_19': 518, 'Sample_10': 468, 'Sample_20': 517, 'Sample_9': 431, 'Sample_8': 522, 'Sample_7': 437, 'Sample_6': 491, 'Sample_5':461,'Sample_4':442,'Sample_3':667,'Sample_2':438,'Sample_1':378,'Sample_14':345,'Sample_13':424,'Sample_18':570,'Sample_15':492,'Sample_16':486,'Sample_12':401,'Sample_17':489,'Sample_11':464}}

data_ali: {'ENSG00000110514': {'Sample_19-leish_023_v2': 710, 'Sample_16-leish_021_v3': 380, 'Sample_20': 517, 'Sample_24-leish026_v3': 593, 'Sample_6-leish_015_v3': 468, 'Sample_12-leish_018_v3' : 661, 'Sample_22-leish_024_v3': 539, 'Sample_23-leish026_v2': 710, 'Sample_25-leish027_v2': 689, 'Sample_18-leish_022_v3': 681, 'Sample_14': 394, 1'Sample'_2 Sample_13-leish_019_v2': 464, 'Sample_1-Leish_011_v2': 574, 'Sample_11-leish_018_v2': 571, 'Sample_20-leish_023_v3': 625, 'Sample_3-leish_012_v2': 637, 'Sample_10-leish_017_v3': 479, 'Sample_7' :436,'Sample_29-leish032_v2':659,'Sample_8-leish_016_v3':375,'Sample_6':492,'Sample_7-leish_016_v2':517,'Sample_9':432,'Sample_8':521,'Sample_27-leish028_v2':584,'Sample_26-leish027_v3':629,'Sample_5':460,'Sample_4':441,' :668,'Sample_19':516,'Sample_1':378,'Sample_2':437,'Sample_9-leish_017_v2':582,'Sample_5-leish_015_v2':421,'Sample_4-leish_012_v3':502,'4-leish_2 :670,'Sample_18':573,'Sample_13':426,'Sample_12':403,'Sample_11':463,'Sample_10':466,'Sample_17':488,'Sample_16':487,'Sample_15':490 ,'Sample_14-leish_019_v3':441,'Sample_30-leish032_v3':497,'Sample_28-leish028_v3':542,'Sample_15-leish_021_v2':837,'Sample_17-leish_022_v2':747}}

标签: python、numpy、dictionary、matplotlib、nested

解决方案


您应该可以像这样把字典“拆开”,提取出标签和与之对应的值:

# Pull the per-sample values for one gene out of the nested dicts so that
# each data_geo* list lines up, position by position, with the data_ali list.
main_key = u'ENSG00000110514'

# Materialise the keys as a list: in Python 3 dict.keys() is a view, not a
# sequence, and the labels are later needed as an ordered sequence.
geo1_labels = list(data_geo1[main_key])
geo1_ys = [data_geo1[main_key][x] for x in geo1_labels]
# Look data_ali up by the data_geo1 sample names so both lists share one
# order; raises KeyError if data_ali lacks one of those samples.
ali_geo1_ys = [data_ali[main_key][x] for x in geo1_labels]

geo2_labels = list(data_geo2[main_key])
geo2_ys = [data_geo2[main_key][x] for x in geo2_labels]
# Same alignment as above, driven by the data_geo2 sample names.
ali_geo2_ys = [data_ali[main_key][x] for x in geo2_labels]

然后,您就可以用散点图来比较这些值,例如:

import matplotlib.pyplot as plt

# Scatter the data_geo1 values and the matching data_ali values against the
# sample index, one x position per sample.
fig, ax = plt.subplots()

# Materialise the labels so they form a real sequence even when
# geo1_labels is a dict view.
tick_labels = list(geo1_labels)
xs = range(len(tick_labels))

# Hollow blue circles: data_geo1; hollow red squares: data_ali.
ax.scatter(xs, geo1_ys, facecolors="None", edgecolors="b", marker="o",
           label="data_geo1")
ax.scatter(xs, ali_geo1_ys, facecolors="None", edgecolors="r", marker="s",
           label="data_ali")

ax.set_xticks(xs)
ax.set_xticklabels(tick_labels, rotation=90)
# The legend tells the reader which marker belongs to which dataset.
ax.legend()

plt.tight_layout()
plt.show()

更新:有几种方法可以得到您所描述的图,一种方法是使用 pandas 模块;在这个具体例子中,这乍看起来要多费些功夫,但 pandas 非常流行:

import matplotlib.pyplot as plt
import pandas as pd

# Put the values of one gene into two small DataFrames (one row per data
# source, one column per sample) and plot a single sample as a bar chart.
main_key = u'ENSG00000110514'

# Row labels name the source dict. The first row was previously mislabelled
# 'data_geo2' although it holds data_geo1 values.
compare1 = pd.DataFrame.from_dict({
    'data_geo1': data_geo1[main_key],
    # Restrict data_ali to the samples present in data_geo1.
    'data_ali': {k: data_ali[main_key][k] for k in data_geo1[main_key]}
}, orient='index')

compare2 = pd.DataFrame.from_dict({
    'data_geo2': data_geo2[main_key],
    # Restrict data_ali to the samples present in data_geo2.
    'data_ali': {k: data_ali[main_key][k] for k in data_geo2[main_key]}
}, orient='index')

# Selecting one sample column gives a Series indexed by data source.
compare1['Sample_19-leish_023_v2'].plot.bar()

plt.tight_layout()
plt.show()

另一种选择是直接使用上面的dicts进行绘图:

import matplotlib.pyplot as plt

# Bar chart comparing one gene/sample value between data_geo1 and data_ali,
# read straight from the nested dicts.
main_key = u'ENSG00000110514'
sample_key = u'Sample_19-leish_023_v2'

fig, ax = plt.subplots()

# One bar per data source, at x positions 0 and 1.
xs = [0, 1]
geo1_value = data_geo1[main_key][sample_key]
ali_value = data_ali[main_key][sample_key]
ys = [geo1_value, ali_value]

ax.bar(xs, ys)
ax.set_xticks(xs)
ax.set_xticklabels(['data_geo1', 'data_ali'])

plt.tight_layout()
plt.show()

推荐阅读