首页 > 解决方案 > Python:从 800 万行文件中读取并存储到单独文件中的程序的效率

问题描述

我的程序会把一行中的第一个单词和第二个单词写入一个以第一个单词首字母命名的 csv 文件中

例如:“Learn Python”将放在“l_words.csv”中,并将 [Learn, Python] 附加到文件中。

问题在于我要读取的数据文件长达 800 万行。程序运行了 6 个小时,只完成了 2.5%(我写了代码来显示进度)。函数“add_to_file(a,fw,sw)”是否效率低下?如果是这样,我该如何改进它?

def add_to_file(a, fw, sw): #Function to add to file
    """Append the pair ``fw,sw`` as one CSV row to ``<a.lower()>_words.csv``.

    Args:
        a: Letter that selects the target file; it is lower-cased first.
        fw: First word of the pair.
        sw: Second word of the pair.
    """
    # 'a+' creates the file when it is missing. The with-block closes the
    # handle on every call instead of leaving that to the garbage collector.
    with open(a.lower()+'_words.csv', 'a+') as file:
        file.write(fw + ',' + sw + '\n')



# Driver sketch: walk data.txt and pass each adjacent word pair to add_to_file.
Data = open('data.txt', 'r') # Open the input file for reading

for line in Data: # Loop over every line in the file

    for i in line: # Loop over every character in the line

        ### Placeholder: insert code that gets the 1st and 2nd word and stores them with add_to_file, then the 2nd and 3rd, then the 3rd and 4th, and so on.
        # NOTE(review): f_char, first_word and second_word are not defined in this snippet; presumably the placeholder code above sets them.

        add_to_file(f_char, first_word, second_word)

我已尽力让 ### 处的代码保持高效,那部分代码内部没有循环。

我认为 for line in Data 和 for i in line 这两个循环是必要的,已经无法再改进了,对吗?

编辑:感谢那条评论,我想通了。新代码:

def add_to_file(a, f_w, s_w):   #Function to add to file
    """Write ``f_w,s_w`` as one CSV row to the already-open handle for letter ``a``.

    The handles are the module-level names ``fa`` .. ``fz``; ``a`` selects
    the one whose name is ``'f' + a``.

    Args:
        a: Single lowercase ASCII letter ('a'..'z'). Any other value is
            ignored and nothing is written.
        f_w: First word of the pair.
        s_w: Second word of the pair.
    """
    # Only the 26 one-character keys 'a'..'z' map to a handle; everything
    # else (uppercase, digits, longer strings, non-strings) is a no-op.
    is_known_letter = isinstance(a, str) and len(a) == 1 and 'a' <= a <= 'z'
    if not is_known_letter:
        return

    # Resolve the module-level handle by name instead of spelling out one
    # branch per letter.
    handle = globals()['f' + a]
    handle.write(f_w + ',' + s_w + '\n')    #write to file

# Open (or create) one append-mode CSV per lowercase letter up front, in
# alphabetical order, and bind the handles to the module-level names fa..fz.
(fa, fb, fc, fd, fe, ff, fg, fh, fi, fj, fk, fl, fm,
 fn, fo, fp, fq, fr, fs, ft, fu, fv, fw, fx, fy, fz) = [
    open(letter + '_words.csv', 'a+')
    for letter in 'abcdefghijklmnopqrstuvwxyz'
]

# Driver sketch: walk data.txt and pass each adjacent word pair to add_to_file.
# The with-block closes data.txt once the loop finishes.
with open('data.txt', 'r') as Data: # Read data
    for line in Data: # Loop over every line in the file
        for i in line: # Loop over every character in the line
            ### Placeholder: insert code that gets the 1st and 2nd word and stores them with add_to_file, then the 2nd and 3rd, then the 3rd and 4th, and so on.
            # NOTE(review): f_char, first_word and second_word are not defined in this snippet; presumably the placeholder code above sets them.
            add_to_file(f_char, first_word, second_word)

# Close every per-letter handle, in alphabetical order.
for handle in (fa, fb, fc, fd, fe, ff, fg, fh, fi, fj, fk, fl, fm,
               fn, fo, fp, fq, fr, fs, ft, fu, fv, fw, fx, fy, fz):
    handle.close()

标签: python, python-3.x, performance, file

解决方案


一种更“pythonic”的方法是使用一个以字母为键、保存文件对象的字典:

import string
# One append-mode handle per lowercase letter, keyed by that letter.
# The explicit 'a+' mode matters: open() defaults to read-only, which fails
# for files that do not exist yet and rejects write().
file = {char: open(char + '_words.csv', 'a+') for char in string.ascii_lowercase}

然后您可以通过字符访问每个文件:

file[char].write(...)  # e.g. file['l'].write('Learn,Python\n')

推荐阅读