python - 在文件路径/文件名之间查找序列的快速方法
问题描述
我正在尝试在给定的文件名列表中查找文件名之间的文件序列。然而,潜在文件序列的位置是未知的。它可以在文件名中的任何位置。
序列有以下规则:
- 序列中数字的位数总是相同的(不足时用前导零补齐)。所以 01 - 12 和 070 - 110 是有效的序列,而 1 - 12 和 70 - 110 不是。
- 序列允许有间隙。01, 02, 05, 10, 21 将是一个有效的序列。
例子:
输入:
[
"fde302be-4d3e-xxxx_abc08_xyz_05.png",
"fde302be-4d3e-xxxx_abc09_xyz_05.png",
"fde302be-4d3e-xxxx_abc10_xyz_05.JPG",
"fde302be-4d3e-xxxx_abc15_xyz_05.JPG",
"fde302be-4d3e-xxxx_abc16_xyz_05.png",
"fde305be-4d3e-xxxx_abc08_abcde_05.png",
"fde309be-4d3e-xxxx_abc08_abcde_05.png",
"fde310be-4d3e-xxxx_abc08_abcde_05.png"
]
期望的输出:
[
[["fde302be-4d3e-xxxx_abc08_xyz_05.png",
"fde302be-4d3e-xxxx_abc09_xyz_05.png",
"fde302be-4d3e-xxxx_abc16_xyz_05.png"], (22, 24)],
[["fde302be-4d3e-xxxx_abc10_xyz_05.JPG",
"fde302be-4d3e-xxxx_abc15_xyz_05.JPG"], (22, 24)],
[["fde305be-4d3e-xxxx_abc08_abcde_05.png",
"fde309be-4d3e-xxxx_abc08_abcde_05.png",
"fde310be-4d3e-xxxx_abc08_abcde_05.png"], (3, 6)]
]
其中 (22, 24) 和 (3, 6) 描述了序列在字符串中的位置。对于最后一个,我并不特别关心它是 (3, 6) 还是 (4, 6),两者中的任何一个都可以。列表元素(输入和输出)不必按顺序排列。
我有一个可以工作的版本,但它真的很慢,并且随着文件名和序列数量的增加而变得越来越慢,而且代码大约有 100 行。
有没有更快更简单的方法来获得所需的输出?
这是我目前的解决方案。我知道有些地方可以优化,比如:由于文件名是预先排序的,当我知道正在检查的文件名不可能再属于某个序列时,就可以忽略该序列,等等。但我觉得我的方法从一开始就有缺陷,应该有更好的方法来解决这个问题。
import os
import re
def get_seq_position(this_path, other_path, compiled_pattern,
                     seq_position=None):
    """
    Return the span of the number that distinguishes two file paths.

    Both paths must live in the same directory and their file names must
    contain the same count of ``compiled_pattern`` matches. The numbers are
    tried from the end of the name towards its start. A span is returned
    for the first number whose removal makes both names identical and whose
    integer values differ.

    Args:
        this_path: Path the returned span refers to.
        other_path: Path compared against ``this_path``.
        compiled_pattern: Compiled regex whose matches are the candidate
            numbers; every match must be convertible with ``int()``.
        seq_position: Optional ``(start, end)`` span that is already known.
            When given, the check is delegated to
            ``get_seq_position_range()``.

    Returns:
        A ``(start, end)`` tuple of indices into ``this_path`` or ``None``
        if the two paths do not form a sequence.
    """
    if seq_position is not None:
        return get_seq_position_range(this_path, other_path, seq_position)

    if os.path.dirname(this_path) != os.path.dirname(other_path):
        return None

    this_name = os.path.basename(this_path)
    other_name = os.path.basename(other_path)

    # Offset of the file name inside the full path. Deriving it from the
    # name length (instead of len(dirname) + 1) stays correct when the
    # directory is a root such as "/" or "C:\\", whose dirname already ends
    # with the separator.
    name_offset = len(this_path) - len(this_name)

    matches_this = list(compiled_pattern.finditer(this_name))
    matches_other = list(compiled_pattern.finditer(other_name))
    if not matches_this or len(matches_this) != len(matches_other):
        return None

    # Sequences are more often towards the end, so walk the matches in
    # reverse order.
    for match_this, match_other in zip(reversed(matches_this),
                                       reversed(matches_other)):
        this_value = match_this.group(0)
        other_value = match_other.group(0)
        # Numbers of different width can never belong to one sequence.
        if len(this_value) != len(other_value):
            return None

        this_rest = (this_name[:match_this.start(0)]
                     + this_name[match_this.end(0):])
        other_rest = (other_name[:match_other.start(0)]
                      + other_name[match_other.end(0):])
        if this_rest == other_rest and int(this_value) != int(other_value):
            return (name_offset + match_this.start(0),
                    name_offset + match_this.end(0))
    return None
def get_seq_position_range(this_path, other_path, seq_position):
    """
    Check two paths against an already known sequence span.

    Does the same job as get_seq_position() when the ``(start, end)`` span
    of the sequence number is already known for one of the two paths, which
    avoids running the regex again.

    Args:
        this_path: First path.
        other_path: Second path.
        seq_position: ``(start, end)`` slice indices of the sequence number.

    Returns:
        ``seq_position`` if both paths are identical outside the span and
        hold different decimal numbers of equal width inside it, otherwise
        ``None``.
    """
    seq_start, seq_end = seq_position
    this_sequence = this_path[seq_start:seq_end]
    other_sequence = other_path[seq_start:seq_end]

    if len(this_sequence) != len(other_sequence):
        return None
    # The span of the other path may hold letters (e.g. "abcde" vs "abc08");
    # reject it here instead of letting int() raise ValueError below.
    if not (this_sequence.isdecimal() and other_sequence.isdecimal()):
        return None

    this_rest = this_path[:seq_start] + this_path[seq_end:]
    other_rest = other_path[:seq_start] + other_path[seq_end:]
    if this_rest != other_rest:
        return None

    if int(this_sequence) == int(other_sequence):
        return None
    return seq_position
def create_sequences(filepaths, flavor=""):
    """
    Group file paths into sequences.

    The result has the following shape, where seq_position is a tuple with
    the position of the sequence number in the file path, or ``None`` for
    an entry that holds a single path only:

        [[[filepath_1, filepath_2, ...], seq_position],
         [[filepath_a, filepath_b, ...], seq_position], ...]

    Args:
        filepaths: List of paths. It is sorted in place.
        flavor: When "nuke" or "nuke_info", the result is passed through
            sequences_to_nuke() (not defined in this part of the file).

    Returns:
        The list described above, or whatever sequences_to_nuke() returns
        for the two nuke flavors.
    """
    filepaths.sort()
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    compiled_pattern = re.compile(r"\d+")
    sequences = []
    for filepath in filepaths:
        for sequence in sequences:
            # Every candidate is compared with the first member of the
            # sequence; sequence[1] is None until a second member is found.
            seq_position = get_seq_position(sequence[0][0], filepath,
                                            compiled_pattern, sequence[1])
            if seq_position is None:
                continue
            sequence[0].append(filepath)
            if sequence[1] is None:
                sequence[1] = seq_position
            break
        else:
            # No existing sequence accepted the path: start a new one.
            sequences.append([[filepath], None])
    if flavor in ("nuke", "nuke_info"):
        return sequences_to_nuke(sequences, flavor)
    return sequences
def main(dir_path):
    """
    Print the sequences found among the files of a directory.

    Only entries of ``dir_path`` for which os.path.isfile() is true are
    passed on to create_sequences(); each resulting entry is printed on
    its own line.
    """
    filepaths = []
    for filename in os.listdir(dir_path):
        candidate = os.path.join(dir_path, filename)
        if os.path.isfile(candidate):
            filepaths.append(candidate)

    for seq in create_sequences(filepaths):
        print(seq)
    return None


if __name__ == "__main__":
    main(r"C:\path\to\sequence_folder")
解决方案
第一步可能是这样的:
from collections import defaultdict
from pprint import pprint
import re
names = [
    "fde302be-4d3e-xxxx_abc08_xyz_05.png",
    "fde302be-4d3e-xxxx_abc09_xyz_05.png",
    "fde302be-4d3e-xxxx_abc10_xyz_05.JPG",
    "fde302be-4d3e-xxxx_abc15_xyz_05.JPG",
    "fde302be-4d3e-xxxx_abc16_xyz_05.png",
    "fde305be-4d3e-xxxx_abc08_abc_05.png",
    "fde309be-4d3e-xxxx_abc08_abc_05.png",
    "fde310be-4d3e-xxxx_abc08_abc_05.png",
]

# Only runs of two or more digits count as candidate sequence numbers.
pat = re.compile(r'\d\d+')

# Maps (name with every digit run replaced by "#", spans of the digit runs)
# to the names sharing that shape. Names in one group can only differ in
# the digits found at identical positions.
seqs = defaultdict(list)
for filename in names:
    masked = pat.sub("#", filename)
    spans = tuple(match.span() for match in pat.finditer(filename))
    seqs[masked, spans].append(filename)

pprint(seqs)
defaultdict(<class 'list'>,
{('fde#be-4d3e-xxxx_abc#_abc_#.png', ((3, 6), (22, 24), (29, 31))): ['fde305be-4d3e-xxxx_abc08_abc_05.png',
'fde309be-4d3e-xxxx_abc08_abc_05.png',
'fde310be-4d3e-xxxx_abc08_abc_05.png'],
('fde#be-4d3e-xxxx_abc#_xyz_#.JPG', ((3, 6), (22, 24), (29, 31))): ['fde302be-4d3e-xxxx_abc10_xyz_05.JPG',
'fde302be-4d3e-xxxx_abc15_xyz_05.JPG'],
('fde#be-4d3e-xxxx_abc#_xyz_#.png', ((3, 6), (22, 24), (29, 31))): ['fde302be-4d3e-xxxx_abc08_xyz_05.png',
'fde302be-4d3e-xxxx_abc09_xyz_05.png',
'fde302be-4d3e-xxxx_abc16_xyz_05.png']})
这给出了一个字典,其值是文件名列表,仅在相同位置的数字字符之间存在差异。
在这一点上,我们仍然必须 a) 丢弃长度为 1 的“序列”和 b) 找出名称实际不同的位置。
步骤 a) 很简单。至于步骤 b),假设我们有“aaa01bbb02.jpg”、“aaa02bbb02.jpg”、“aaa01bbb03.jpg”,那么第一个名字和第二个名字组成一个序列,而第一个和第三个组成另一个序列(第二个和第三个在两个位置上不同,所以不构成序列):你会如何处理这种情况?
编辑
好的,根据 OP 的最后一个答案,这里是缺失部分的可能解决方案:
# Final result: one [[file names, ...], (start, stop)] entry per detected
# sequence.
trueseqs = []
for group_key, group in seqs.items():
    # A group holding a single file name cannot form a sequence.
    if len(group) == 1:
        continue
    # group_key[1] holds the digit spans; try them from the end of the
    # name towards its start.
    for start, stop in reversed(group_key[1]):
        diff = False
        for first in group[:-1]:
            first_rest = first[:start] + first[stop:]
            for idx, other in enumerate(group[1:]):
                if first[start:stop] == other[start:stop]:
                    continue
                if first_rest == other[:start] + other[stop:]:
                    # Names differ inside this span only: a sequence starts.
                    diff = (start, stop)
                    trueseqs.append([[first], diff])
                    print(f'insert {first} with span {diff}')
                    break
            if diff:
                break
        if not diff:
            # Nothing varies at this span alone; try the next one.
            continue
        # idx indexes group[1:], so group[idx + 1:] starts at the partner
        # that was just found.
        for candidate in group[idx + 1:]:
            if (first[start:stop] != candidate[start:stop]
                    and first_rest == candidate[:start] + candidate[stop:]):
                trueseqs[-1][0].append(candidate)
                print(f'add {candidate}')
        # Only the first span that yields a sequence is used for a group.
        break
insert fde302be-4d3e-xxxx_abc08_xyz_05.png with span (22, 24)
add fde302be-4d3e-xxxx_abc09_xyz_05.png
add fde302be-4d3e-xxxx_abc16_xyz_05.png
insert fde302be-4d3e-xxxx_abc10_xyz_05.JPG with span (22, 24)
add fde302be-4d3e-xxxx_abc15_xyz_05.JPG
insert fde305be-4d3e-xxxx_abc08_abc_05.png with span (3, 6)
add fde309be-4d3e-xxxx_abc08_abc_05.png
add fde310be-4d3e-xxxx_abc08_abc_05.png
# Show the collected [file names, span] entries.
pprint(trueseqs)
[[['fde302be-4d3e-xxxx_abc08_xyz_05.png',
'fde302be-4d3e-xxxx_abc09_xyz_05.png',
'fde302be-4d3e-xxxx_abc16_xyz_05.png'],
(22, 24)],
[['fde302be-4d3e-xxxx_abc10_xyz_05.JPG',
'fde302be-4d3e-xxxx_abc15_xyz_05.JPG'],
(22, 24)],
[['fde305be-4d3e-xxxx_abc08_abc_05.png',
'fde309be-4d3e-xxxx_abc08_abc_05.png',
'fde310be-4d3e-xxxx_abc08_abc_05.png'],
(3, 6)]]
推荐阅读
- javascript - 从Javascript中的字符串中删除子字符串
- nest-device-access - 列出设备和结构为空
- data-structures - 如何断言/内省嵌套的递归结构/枚举?
- php - 不在 Nginx 服务器上加载 css 文件
- android - MutableMap/HashMap 的快捷方式
- assembly - 想用Labels简化小人电脑程序BubbleSort
- firebase - Google 应用 ID 丢失 Firebase Analytics 已禁用
- android - 错误:在此 Consumer 小部件之上找不到正确的 Provider
- python - 如何在文档集合中找到术语的熵?
- c++ - 用户制作对象的属性不会打印(C++)