python - 尝试按相似属性对对象进行分组时,我得到重复的分组
问题描述
这是一个脚本的隐喻模型,它将使用“关键字”和“搜索结果”代替“篮子”和“食物”。目的是按内容对篮子进行分组。
生成:生成 100 个篮子,并从列表中抽取 10 种食物。每个篮子都有一个随机的“体积”编号分配给它。卷号是任意的,与篮子的内容无关
分析:按具有 6 个或更多相同项目的篮子将篮子组合在一起。最大容量的篮子应该是“主篮子”,其余的应该是按容量降序排列的列表。
输出:“行”对象最终将被发送到谷歌表格。不要担心这个
期望的结果:没有重复的不同组,以及少于 5 个相同对象的一大组。
实际结果:关闭,但我得到了几个重复的组和条目,我不知道为什么
import random
from typing import Generic, TypeVar, List, Set
from uuid import uuid4
import attr
food = [
"apple",
"banana",
"grapes",
"orange",
"potato",
"kiwi",
"pomegranate",
"blueberry",
"strawberry",
"cantalope",
"honeydew",
"papaya",
"mango",
"raspberry",
"celery",
"carrot",
"potato",
"raddish",
"lettuce",
"tomato",
"garlic",
"onion",
"cabbage",
"corn",
"shallot",
"peas",
"squash",
"broccoli",
"spinach",
]
ItemType = TypeVar("ItemType")
@attr.dataclass
class Basket(Generic[ItemType]):
items: Set[ItemType]
volume: int
id: str = attr.ib(factory=lambda: str(uuid4()))
@attr.dataclass
class Row(Generic[ItemType]):
num: int
main_keyword: str
similiar_keywords: Set[ItemType]
total_volume: int
#Creates a list of basket objects
basket_names = [f"basket{i}" for i in range(1, 101)]
baskets: List[Basket[str]] = [
Basket(items=set(random.sample(food, 10)), volume=random.randint(0, 1000), id=name)
for name in basket_names
]
#Orders basket objects by their volume in descending order
baskets.sort(key=lambda basket: basket.volume, reverse=True)
THRESHOLD = 6
clusters = {}
def get_total_volume(basket_list: List, baskets: List):
total_volume = 0
for basket in baskets:
for name in basket_list:
if name == basket.id:
total_volume += basket.volume
return total_volume
#Orders list by volume
def order_by_volume(inlist, baskets: List):
outlist = []
for basket in baskets:
for name in inlist:
if name == basket.id:
outlist.append(name)
return outlist
# for each basket
for basket in baskets:
# if the basket isn't already accounted for.
if basket.id not in clusters:
# initialize a new cluster for it, feeding its own ID
clusters[basket.id] = {basket.id}
# then for all other baskets
for other_basket in baskets:
# skipping itself, of course
if basket.id == other_basket.id:
# don't re-count ourselves.
continue
# if they are similar enough (above THRESHOLD)
if len(other_basket.items.intersection(basket.items)) >= THRESHOLD:
# add them to the cluster
clusters[basket.id].add(other_basket.id)
# and mark the "other" basket as visited via sentinel.
clusters[other_basket.id] = None
# now we just:tm: reduce the clusters by filtering out all the Nones.
clusters_out = {
name: related for name, related in clusters.items() if related is not None
}
for key, related in clusters_out.items():
for term in related:
if term == key:
continue
assert (
term not in clusters_out
), f"term {term!r} appears as a key in the output. this breaks constraint."
print("Clusters Out:")
print(clusters_out)
row_objects = []
row_number = 2
baskets_used = 0
for key in clusters_out:
print("\nKey: "+key)
print("Values: "+str(clusters_out[key]))
# Creates a list of baskets from the dict, with the duplicate key removed from the values
values = clusters_out[key]
values.remove(key)
newlist = [key] + list(values)
print("Newlist: " + str(newlist))
# Sorts baskets by their volume
sorted_list = order_by_volume(newlist, baskets)
print("Sorted list: " + str(sorted_list))
print("Length of sorted list: " + str(len(sorted_list)))
#For debug purposes, counts the number of baskets used
for i in sorted_list:
baskets_used+=1
#Creates a list of row objects
row_objects.append(Row(num=row_number, main_keyword=sorted_list[0], similiar_keywords=sorted_list[1:], total_volume=get_total_volume(sorted_list, baskets)))
row_number += 1
print(row_objects)
print("Number of baskets: " + str(len(baskets)))
print("Number of baskets used: " + str(baskets_used))
for i in row_objects:
print(str(i))