首页 > 解决方案 > 在文件路径/文件名之间查找序列的快速方法

问题描述

我正在尝试在给定的文件名列表中查找文件名之间的文件序列。然而,潜在文件序列的位置是未知的。它可以在文件名中的任何位置。

序列有以下规则:

  1. 同一序列中编号的位数(字符长度)总是相同的。所以 01 - 12 和 070 - 110 是有效的序列,而 1 - 12 和 70 - 110 不是。
  2. 序列允许有间隙。01, 02, 05, 10, 21 将是一个有效的序列。

例子:

输入:

[
"fde302be-4d3e-xxxx_abc08_xyz_05.png",
"fde302be-4d3e-xxxx_abc09_xyz_05.png",
"fde302be-4d3e-xxxx_abc10_xyz_05.JPG",
"fde302be-4d3e-xxxx_abc15_xyz_05.JPG",
"fde302be-4d3e-xxxx_abc16_xyz_05.png",
"fde305be-4d3e-xxxx_abc08_abcde_05.png",
"fde309be-4d3e-xxxx_abc08_abcde_05.png",
"fde310be-4d3e-xxxx_abc08_abcde_05.png"
]

期望的输出:

[
[["fde302be-4d3e-xxxx_abc08_xyz_05.png",
"fde302be-4d3e-xxxx_abc09_xyz_05.png",
"fde302be-4d3e-xxxx_abc16_xyz_05.png"], (22, 24)],

[["fde302be-4d3e-xxxx_abc10_xyz_05.JPG",
"fde302be-4d3e-xxxx_abc15_xyz_05.JPG"], (22, 24)],

[["fde305be-4d3e-xxxx_abc08_abcde_05.png",
"fde309be-4d3e-xxxx_abc08_abcde_05.png",
"fde310be-4d3e-xxxx_abc08_abcde_05.png"], (3, 6)]
]

其中 (22, 24) 和 (3, 6) 描述了序列在字符串中的位置。对于最后一个,我并不特别在意它是 (3, 6) 还是 (4, 6),两者中任意一个都可以。列表元素(输入和输出)不必按顺序排列。

我有一个可以工作的版本,但它真的很慢,而且随着文件名和序列数量的增加会变得更慢;此外它大约有 100 行代码。

有没有更快更简单的方法来获得所需的输出?

这是我目前的解决方案。我知道其中有些部分可以优化,例如:由于文件名是预先排序的,当我确定正在检查的文件名不可能再属于某个序列时,就可以直接跳过该序列等等。但我觉得我的方法从一开始就有缺陷,应该有更好的方法来解决这个问题。

import os
import re


def get_seq_position(this_path, other_path, compiled_pattern,
                     seq_position=None):
    """
    Find the position of a numeric sequence between two file paths.

    The two paths form a sequence when they live in the same directory,
    their file names contain the same number of pattern matches, and there
    is one pair of matches of equal length but different integer value such
    that the file names are identical once that match is cut out.

    Args:
        this_path: Reference file path.
        other_path: File path compared against this_path.
        compiled_pattern: Compiled regular expression whose matches are the
            candidate sequence numbers. The matched text is passed to int(),
            so the pattern is expected to match digits only.
        seq_position: Optional, already known (start, end) position. When
            given, the check is delegated to get_seq_position_range().

    Returns:
        A (start, end) tuple of indices into this_path that delimit the
        sequence number, or None if the two paths do not form a sequence.
    """

    if seq_position is not None:
        return get_seq_position_range(this_path, other_path, seq_position)

    this_dir, this_name = os.path.split(this_path)
    other_dir, other_name = os.path.split(other_path)

    if this_dir != other_dir:
        return None

    # Offset of the file name inside the full path. It is derived from the
    # path itself rather than from len(dirname) + 1, which is off by one for
    # files directly under a root directory ("/a01.png" has dirname "/").
    name_offset = len(this_path) - len(this_name)

    # Sequences are more often towards the end, so test back to front.
    matches_this = list(compiled_pattern.finditer(this_name))[::-1]
    matches_other = list(compiled_pattern.finditer(other_name))[::-1]

    if not matches_this or len(matches_this) != len(matches_other):
        return None

    for match_this, match_other in zip(matches_this, matches_other):
        value_this = match_this.group(0)
        value_other = match_other.group(0)

        # A length mismatch in any pair rules out a fixed-width sequence.
        if len(value_this) != len(value_other):
            return None

        rest_this = (this_name[:match_this.start(0)]
                     + this_name[match_this.end(0):])
        rest_other = (other_name[:match_other.start(0)]
                      + other_name[match_other.end(0):])

        if rest_this == rest_other and int(value_this) != int(value_other):
            return (name_offset + match_this.start(0),
                    name_offset + match_this.end(0))

    return None


def get_seq_position_range(this_path, other_path, seq_position):
    """
    Check whether two paths form a sequence at an already known position.

    Does the same job as get_seq_position() but skips the pattern search:
    the caller supplies the (start, end) position of the sequence number.

    Args:
        this_path: Reference file path.
        other_path: File path compared against this_path.
        seq_position: (start, end) indices of the sequence number, applied
            to both paths.

    Returns:
        seq_position if both paths hold different decimal numbers of equal
        length at that position and are otherwise identical, else None.
    """

    seq_start, seq_end = seq_position

    this_sequence = this_path[seq_start:seq_end]
    other_sequence = other_path[seq_start:seq_end]

    # Both slices must be decimal digit runs of the same length. Without
    # this guard int() raises ValueError on text such as "XY" and silently
    # accepts non-numbers such as "1_0" or " 1".
    if (len(this_sequence) != len(other_sequence)
            or not this_sequence.isdecimal()
            or not other_sequence.isdecimal()):
        return None

    this_rest = this_path[:seq_start] + this_path[seq_end:]
    other_rest = other_path[:seq_start] + other_path[seq_end:]
    if this_rest != other_rest:
        return None

    # Equal numbers mean the same file, not two members of a sequence.
    if int(this_sequence) == int(other_sequence):
        return None

    return seq_position


def create_sequences(filepaths, flavor=""):
    """
    Group file paths into sequences.

    The paths are processed in sorted order. Each path is compared with the
    first member of every sequence found so far and joins the first one it
    forms a sequence with; otherwise it starts a new entry of its own.

    Args:
        filepaths: Iterable of file paths. It is not modified.
        flavor: When "nuke" or "nuke_info", the result is passed through
            sequences_to_nuke() (not defined in the visible part of this
            file; presumably provided elsewhere).

    Returns:
        A list shaped as follows, where seq_position is the (start, end)
        position of the sequence number in the file path, or None for an
        entry that holds a single path:
        [[[filepath_1, filepath_2, ...], seq_position],
         [[filepath_a, filepath_b, ...], seq_position], ...]
    """

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    compiled_pattern = re.compile(r"\d+")

    sequences = []
    # sorted() leaves the caller's list untouched and accepts any iterable.
    for filepath in sorted(filepaths):
        for sequence in sequences:
            seq_position = get_seq_position(sequence[0][0], filepath,
                                            compiled_pattern, sequence[1])
            if seq_position is None:
                continue

            sequence[0].append(filepath)
            if sequence[1] is None:
                sequence[1] = seq_position
            break
        else:
            # No existing sequence accepted the path.
            sequences.append([[filepath], None])

    if flavor in ("nuke", "nuke_info"):
        return sequences_to_nuke(sequences, flavor)

    return sequences


def main(dir_path):
    """
    Print the file sequences found among the regular files of dir_path.

    Directory entries that are not files are ignored. Always returns None.
    """
    filepaths = []
    for entry in os.listdir(dir_path):
        candidate = os.path.join(dir_path, entry)
        if os.path.isfile(candidate):
            filepaths.append(candidate)

    for found in create_sequences(filepaths):
        print(found)

    return None


# Script entry point: scan a hard-coded example folder for sequences.
if __name__ == "__main__":
    main(r"C:\path\to\sequence_folder")

标签: python

解决方案


第一步可能是这样的:

from collections import defaultdict
from pprint import pprint
import re

# Sample file names used to demonstrate the grouping.
names = [
    "fde302be-4d3e-xxxx_abc08_xyz_05.png",
    "fde302be-4d3e-xxxx_abc09_xyz_05.png",
    "fde302be-4d3e-xxxx_abc10_xyz_05.JPG",
    "fde302be-4d3e-xxxx_abc15_xyz_05.JPG",
    "fde302be-4d3e-xxxx_abc16_xyz_05.png",
    "fde305be-4d3e-xxxx_abc08_abc_05.png",
    "fde309be-4d3e-xxxx_abc08_abc_05.png",
    "fde310be-4d3e-xxxx_abc08_abc_05.png",
]

# Runs of two or more digits are the candidate sequence numbers.
pat = re.compile(r'\d\d+')

# Bucket the names by (name with every digit run replaced by "#",
# spans of those digit runs).
seqs = defaultdict(list)
for name in names:
    spans = tuple(match.span() for match in pat.finditer(name))
    template = pat.sub("#", name)
    seqs[template, spans].append(name)

pprint(seqs)
defaultdict(<class 'list'>,
            {('fde#be-4d3e-xxxx_abc#_abc_#.png', ((3, 6), (22, 24), (29, 31))): ['fde305be-4d3e-xxxx_abc08_abc_05.png',
                                                                                 'fde309be-4d3e-xxxx_abc08_abc_05.png',
                                                                                 'fde310be-4d3e-xxxx_abc08_abc_05.png'],
             ('fde#be-4d3e-xxxx_abc#_xyz_#.JPG', ((3, 6), (22, 24), (29, 31))): ['fde302be-4d3e-xxxx_abc10_xyz_05.JPG',
                                                                                 'fde302be-4d3e-xxxx_abc15_xyz_05.JPG'],
             ('fde#be-4d3e-xxxx_abc#_xyz_#.png', ((3, 6), (22, 24), (29, 31))): ['fde302be-4d3e-xxxx_abc08_xyz_05.png',
                                                                                 'fde302be-4d3e-xxxx_abc09_xyz_05.png',
                                                                                 'fde302be-4d3e-xxxx_abc16_xyz_05.png']})

这会得到一个字典,其每个值都是一个文件名列表,列表中的文件名只在相同位置的数字字符上有所不同。

在这一点上,我们仍然必须 a) 丢弃长度为 1 的“序列”和 b) 找出名称实际不同的位置。

步骤 a) 很简单。至于步骤 b),假设我们有“aaa01bbb02.jpg”、“aaa02bbb02.jpg”、“aaa01bbb03.jpg”,那么第一个名字和第二个名字组成一个序列,但第一个和第三个又组成另一个序列(而第二个和第三个在两个位置上都不同,所以不构成序列):你会如何处理这种情况?

编辑

好的,根据 OP 的最后一个答案,这里是缺失部分的可能解决方案:

# Split every candidate group into true sequences: names that are identical
# except for a single digit run. Each name ends up in at most one sequence.
trueseqs = []
for (_, spans), group in seqs.items():
    # Drop duplicate names (keeping order) so that a bucket of two or more
    # names always holds names that differ inside the span.
    remaining = list(dict.fromkeys(group))

    # Try the digit runs back to front, so a run near the end of the name
    # wins when a name could belong to sequences at several positions.
    for start, stop in reversed(spans):
        if len(remaining) < 2:
            break
        span = (start, stop)

        # Names that are equal once the span is cut out belong together.
        by_rest = defaultdict(list)
        for name in remaining:
            by_rest[name[:start] + name[stop:]].append(name)

        remaining = []
        for members in by_rest.values():
            if len(members) < 2:
                # Not a sequence at this span; retry on an earlier span.
                remaining.extend(members)
                continue
            trueseqs.append([members, span])
            print(f'insert {members[0]} with span {span}')
            for member in members[1:]:
                print(f'add    {member}')
insert fde302be-4d3e-xxxx_abc08_xyz_05.png with span (22, 24)
add    fde302be-4d3e-xxxx_abc09_xyz_05.png
add    fde302be-4d3e-xxxx_abc16_xyz_05.png
insert fde302be-4d3e-xxxx_abc10_xyz_05.JPG with span (22, 24)
add    fde302be-4d3e-xxxx_abc15_xyz_05.JPG
insert fde305be-4d3e-xxxx_abc08_abc_05.png with span (3, 6)
add    fde309be-4d3e-xxxx_abc08_abc_05.png
add    fde310be-4d3e-xxxx_abc08_abc_05.png

# Display the final list of [names, span] entries.
pprint(trueseqs)
[[['fde302be-4d3e-xxxx_abc08_xyz_05.png',
   'fde302be-4d3e-xxxx_abc09_xyz_05.png',
   'fde302be-4d3e-xxxx_abc16_xyz_05.png'],
  (22, 24)],
 [['fde302be-4d3e-xxxx_abc10_xyz_05.JPG',
   'fde302be-4d3e-xxxx_abc15_xyz_05.JPG'],
  (22, 24)],
 [['fde305be-4d3e-xxxx_abc08_abc_05.png',
   'fde309be-4d3e-xxxx_abc08_abc_05.png',
   'fde310be-4d3e-xxxx_abc08_abc_05.png'],
  (3, 6)]]

推荐阅读