python - 连接多个文件python
问题描述
我正在尝试使用 python 将多个文件合并到一个文件中,我尝试了几种方法,但它们都导致最终文件丢失了某些行。文件的大小可能相差很大,所以我更倾向于使用不会将整个文件加载到内存中的方法。
我对此的了解有点有限,但我读到这可能是由于写入缓冲,也就是文件没有立即写入,信息暂时保存在内存中,然后写入文件。
我尝试了多种方法来解决这个问题:使用 shutil.copyfileobj,经典的 python 读/写,在文件末尾添加标签,检查两个文件的尾部,使用 file.flush 后跟 os.fsync,最后还添加了几秒钟的 time.sleep。全部都失败了,有人能就合并文件的可靠方法提出建议吗?有些方法在我的本地 PC 上似乎可以正常工作,但在另一个系统 (HPC) 上尝试时会出错,所以这个问题有点难以复现。
这些是我迄今为止尝试过的所有方法:
#support functions
def tail(file_path):
    """Return the last line of a text file, or ``None`` if it has no lines.

    The file is streamed line by line, so only one line is held in memory
    at a time regardless of the file size.

    Args:
        file_path: Path of the file to read (opened in text mode).

    Returns:
        The final line exactly as read, including its trailing newline when
        the file has one, or ``None`` when the file is empty.
    """
    last_line = None
    with open(file_path) as file:
        # Iterating the file object reads lazily; keep only the newest line.
        for line in file:
            last_line = line
    return last_line
def wait_for_flush(output_file,tail_in):
    """Block until the last line of ``output_file`` equals ``tail_in``.

    Polls in two phases: first until ``file_exists(output_file)`` is true
    (5 second pauses), then until ``tail(output_file)`` returns ``tail_in``
    (2 second pauses). A single retry counter is shared by both phases.

    Args:
        output_file: Path of the file that is being written.
        tail_in: The line expected to be the last line of ``output_file``.

    Raises:
        BrokenConcatenation: If more than 100 polls elapse in total without
            the expected tail showing up.
    """
    polls = 0
    # Phase 1: wait for the output file to appear at all.
    while not file_exists(output_file):
        sleep(5)
        polls += 1
        if polls > 100:
            raise BrokenConcatenation(output_file)
    # Phase 2: wait for the expected last line. One loop covers both the
    # "file still empty" and the "tail does not match yet" cases, so the
    # file is read once per poll and every poll is counted once.
    tail_out = tail(output_file)
    while tail_out != tail_in:
        sleep(2)
        tail_out = tail(output_file)
        polls += 1
        if polls > 100:
            raise BrokenConcatenation(output_file)
def merge_two_files(file1,file2):
    """Append the contents of ``file2`` to the end of ``file1``.

    ``file2`` is streamed line by line in text mode, so it is never loaded
    into memory as a whole. ``file1`` is created if it does not exist.

    Args:
        file1: Path of the file that receives the data (opened for append).
        file2: Path of the file whose contents are appended; left untouched.
    """
    # Append-only mode is enough: file1 is never read back here.
    with open(file1, 'a') as f1:
        with open(file2) as f2:
            for line in f2:
                f1.write(line)
        #forcing disk write
        f1.flush()
        os.fsync(f1.fileno())
#main functions
def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
    """Concatenate files in order by appending each one to the first.

    The inputs are consumed: every file after the first is appended to the
    first one with ``merge_two_files`` and then deleted, and the first file
    is finally handed to ``move_file`` to become ``output_file``.

    Appending everything to the first file keeps the output in the same
    order as ``list_file_paths``. Re-queuing the merged file behind the
    remaining ones would reorder the data for some input counts (three
    inputs a, b, c would come out as c, a, b).

    Args:
        output_file: Destination path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message (``None`` means
            standard output).
        add_tag: Accepted for signature compatibility; not used here.

    Raises:
        IndexError: If ``list_file_paths`` is empty.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    list_files = list(list_file_paths)
    merged = list_files[0]
    for path in list_files[1:]:
        merge_two_files(merged, path)
        # Short pause before deleting the source, kept from the original
        # workaround.
        sleep(1)
        os.remove(path)
    move_file(merged, output_file)
def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
    """Concatenate files byte for byte into ``output_file``.

    Each input is streamed with ``shutil.copyfileobj``, so no file is held
    in memory as a whole. The output is created or truncated.

    Args:
        output_file: Destination path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message (``None`` means
            standard output).
        add_tag: Accepted for signature compatibility; not used here.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    # Use the default buffered writer. With buffering=0 the file is a raw
    # stream whose write() may accept fewer bytes than it was given, and
    # copyfileobj does not retry the remainder, so data could be dropped
    # silently. The buffered writer keeps writing until all bytes are taken.
    with open(output_file, 'wb') as wfd:
        for f in list_file_paths:
            with open(f,'rb') as fd:
                shutil.copyfileobj(fd, wfd)
        #forcing disk write
        wfd.flush()
        os.fsync(wfd.fileno())
    # Grace period kept from the original workaround.
    # NOTE(review): probably unnecessary after fsync - confirm and remove.
    sleep(2)
def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
    """Concatenate text files and verify the output tail after each one.

    Every input is streamed line by line into ``output_file``. After each
    input the output is flushed and fsynced, and ``wait_for_flush`` is asked
    to confirm that the last line of the output is the one just written.

    The expected last line is tracked while writing instead of re-reading
    each input. This also keeps it correct when an input is empty (the
    previous tail is still expected) or when an input does not end with a
    newline (the next line written continues that same output line).

    Args:
        output_file: Destination path of the merged file.
        list_file_paths: Iterable of input paths (strings), in order.
        stdout_file: File object for the progress message (``None`` means
            standard output).
        add_tag: When true, write a ``#<path>`` line after each input.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    expected_tail = None  # last line the output should currently end with
    with open(output_file, 'w') as wfd:
        for f in list_file_paths:
            with open(f) as fd:
                for line in fd:
                    wfd.write(line)
                    if expected_tail is None or expected_tail.endswith('\n'):
                        expected_tail = line
                    else:
                        # Previous data ended mid-line; this text extends it.
                        expected_tail += line
            if add_tag:
                # Keep the tag on a line of its own even when the input did
                # not end with a newline.
                if expected_tail is not None and not expected_tail.endswith('\n'):
                    wfd.write('\n')
                expected_tail = '#'+f+'\n'
                wfd.write(expected_tail)
            # forcing disk write
            wfd.flush()
            os.fsync(wfd.fileno())
            wait_for_flush(output_file,expected_tail)
#reopens the output for every input; truncates once up front and then appends
def concat_files(output_file,list_file_paths,stdout_file=None):
    """Concatenate files byte for byte, reopening the output per input.

    The output is created or emptied once and then opened in append mode
    for each input. Opening it with ``'wb'`` inside the loop would truncate
    it every time and leave only the last input in the result.

    Args:
        output_file: Destination path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message (``None`` means
            standard output).
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    # Start from an empty output so a rerun does not append to stale data.
    with open(output_file, 'wb'):
        pass
    for f in list_file_paths:
        with open(output_file, 'ab') as wfd:
            with open(f,'rb') as fd:
                shutil.copyfileobj(fd, wfd)
            #forcing disk write
            wfd.flush()
            os.fsync(wfd.fileno())
def concat_files(output_file,list_file_paths,stdout_file=None):
    """Concatenate text files line by line into ``output_file``.

    Inputs are streamed one line at a time, so no file is loaded into
    memory as a whole. The output is created or truncated.

    Args:
        output_file: Destination path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message (``None`` means
            standard output).
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    # Write-only mode is enough: the output is never read back here.
    with open(output_file, 'w') as outfile:
        for f in list_file_paths:
            with open(f) as infile:
                for line in infile:
                    outfile.write(line)
        #forcing disk write
        outfile.flush()
        os.fsync(outfile.fileno())
def concat_files(output_file,list_file_paths,stdout_file=None):
    """Concatenate files byte for byte into ``output_file``.

    Each input is streamed in chunks with ``shutil.copyfileobj``, so no
    file is loaded into memory as a whole. The output is created or
    truncated, then flushed and fsynced once after all inputs are copied.

    Args:
        output_file: Destination path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message (``None`` means
            standard output).
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    with open(output_file, 'wb') as wfd:
        for f in list_file_paths:
            with open(f,'rb') as fd:
                shutil.copyfileobj(fd, wfd)
        #forcing disk write
        wfd.flush()
        os.fsync(wfd.fileno())
解决方案
如果您不想将大文件读入内存,我会说这应该可以工作:
def concat_files(output_file, list_file_paths):
    """Concatenate text files, writing an ``eof - <path>`` line after each.

    Inputs are streamed one line at a time, so no file is loaded into
    memory as a whole. Progress is printed to standard output.

    Args:
        output_file: Destination path; created or truncated.
        list_file_paths: Iterable of input paths, in the desired order.
    """
    print('Concatenating files into', output_file)
    with open(output_file, 'w') as wfd:
        for f in list_file_paths:
            print(f, '...')
            last_line = ''
            with open(f) as fd:
                for line in fd:
                    wfd.write(line)
                    last_line = line
            # An input without a trailing newline would otherwise get the
            # marker glued onto its last data line.
            if last_line and not last_line.endswith('\n'):
                wfd.write('\n')
            wfd.write(f'eof - {f}\n')  # mod to indicate end of this file
    print('Done.')
这应该会创建一个新文件 output_file,并逐个读取 list_file_paths 中的每个文件,一次一行地写入新文件。
更新:参见代码中的 mod 注释,它用于标记每个文件的结尾。
推荐阅读
- php - 如何从其他共享主机服务器访问 VPS 根目录中的文件?
- azure - 如果我移动资源,如何在 Azure 中保留应用服务的入站和出站 IP 地址?
- odoo - 如何打印索引号而不是服务产品 qweb?
- file-extension - 为什么要创建文件扩展名?
- matrix-synapse - 迁移 Synapse 后未加载用户/房间的远程头像
- mongodb - MongoDB Realm:环境值存在但在 Realm Function 内部未定义
- security - 实施 ISO 27001
- regex - 如何编写匹配所有不包含任何控制字符的字符的正则表达式?
- html - 容器 xxl 不适用于 Angular 的引导程序 5
- arrays - 为什么String Array在JSP中会抛出错误