首页 > 解决方案 > 尝试按相似属性对对象进行分组时,我得到重复的分组

问题描述

这是一个脚本的隐喻模型,它将使用“关键字”和“搜索结果”代替“篮子”和“食物”。目的是按内容对篮子进行分组。

生成:生成 100 个篮子,并从列表中抽取 10 种食物。每个篮子都有一个随机的“体积”编号分配给它。卷号是任意的,与篮子的内容无关

分析:按具有 6 个或更多相同项目的篮子将篮子组合在一起。最大容量的篮子应该是“主篮子”,其余的应该是按容量降序排列的列表。

输出:“行”对象最终将被发送到谷歌表格。不要担心这个

期望的结果:没有重复的不同组,以及少于 5 个相同对象的一大组。

实际结果:关闭,但我得到了几个重复的组和条目,我不知道为什么

import random
from typing import Generic, TypeVar, List, Set
from uuid import uuid4

import attr

food = [
    "apple",
    "banana",
    "grapes",
    "orange",
    "potato",
    "kiwi",
    "pomegranate",
    "blueberry",
    "strawberry",
    "cantalope",
    "honeydew",
    "papaya",
    "mango",
    "raspberry",
    "celery",
    "carrot",
    "potato",
    "raddish",
    "lettuce",
    "tomato",
    "garlic",
    "onion",
    "cabbage",
    "corn",
    "shallot",
    "peas",
    "squash",
    "broccoli",
    "spinach",
]

ItemType = TypeVar("ItemType")


@attr.dataclass
class Basket(Generic[ItemType]):
    items: Set[ItemType]
    volume: int
    id: str = attr.ib(factory=lambda: str(uuid4()))

@attr.dataclass
class Row(Generic[ItemType]):
    num: int
    main_keyword: str
    similiar_keywords: Set[ItemType]
    total_volume: int

#Creates a list of basket objects
basket_names = [f"basket{i}" for i in range(1, 101)]
baskets: List[Basket[str]] = [
    Basket(items=set(random.sample(food, 10)), volume=random.randint(0, 1000), id=name)
    for name in basket_names
]

#Orders basket objects by their volume in descending order
baskets.sort(key=lambda basket: basket.volume, reverse=True)

THRESHOLD = 6
clusters = {}

def get_total_volume(basket_list: List, baskets: List):
    total_volume = 0
    for basket in baskets:
        for name in basket_list:
            if name == basket.id:
                total_volume += basket.volume
    return total_volume

#Orders list by volume
def order_by_volume(inlist, baskets: List):
    outlist = []
    for basket in baskets:
        for name in inlist:
            if name == basket.id:
                outlist.append(name)
    return outlist

# for each basket
for basket in baskets:
    # if the basket isn't already accounted for.
    if basket.id not in clusters:
        # initialize a new cluster for it, feeding its own ID
        clusters[basket.id] = {basket.id}
        # then for all other baskets
        for other_basket in baskets:
            # skipping itself, of course
            if basket.id == other_basket.id:
                # don't re-count ourselves.
                continue
            # if they are similar enough (above THRESHOLD)
            if len(other_basket.items.intersection(basket.items)) >= THRESHOLD:
                # add them to the cluster
                clusters[basket.id].add(other_basket.id)
                # and mark the "other" basket as visited via sentinel.
                clusters[other_basket.id] = None

# now we just:tm: reduce the clusters by filtering out all the Nones.
clusters_out = {
    name: related for name, related in clusters.items() if related is not None
}


for key, related in clusters_out.items():
    for term in related:
        if term == key:
            continue
        assert (
            term not in clusters_out
        ), f"term {term!r} appears as a key in the output. this breaks constraint."

print("Clusters Out:")
print(clusters_out)

row_objects = []
row_number = 2
baskets_used = 0
for key in clusters_out:
    print("\nKey: "+key)
    print("Values: "+str(clusters_out[key]))

    # Creates a list of baskets from the dict, with the duplicate key removed from the values
    values = clusters_out[key]
    values.remove(key)
    newlist = [key] + list(values)
    print("Newlist: " + str(newlist))

    # Sorts baskets by their volume
    sorted_list = order_by_volume(newlist, baskets)
    print("Sorted list: " + str(sorted_list))
    print("Length of sorted list: " + str(len(sorted_list)))

    #For debug purposes, counts the number of baskets used
    for i in sorted_list:
        baskets_used+=1

    #Creates a list of row objects
    row_objects.append(Row(num=row_number, main_keyword=sorted_list[0], similiar_keywords=sorted_list[1:], total_volume=get_total_volume(sorted_list, baskets)))
    row_number += 1

print(row_objects)
print("Number of baskets: " + str(len(baskets)))
print("Number of baskets used: " + str(baskets_used))

for i in row_objects:
    print(str(i))





标签: pythonalgorithmsortinggrouping

解决方案


推荐阅读