python - 将点移动到最近的未占用网格位置
问题描述
我在二维空间中有一个随机分布的点,如下所示:
from sklearn import datasets
import pandas as pd
import numpy as np
arr, labels = datasets.make_moons()
arr, labels = datasets.make_blobs(n_samples=1000, centers=3)
pd.DataFrame(arr, columns=['x', 'y']).plot.scatter('x', 'y', s=1)
我试图将这些点中的每一个分配给假想的六角网格上最近的未占用插槽,以确保这些点不会重叠。我用来实现这个目标的代码产生了下面的图。总体思路是创建 hex bin,然后遍历每个点并找到允许算法将该点分配给未占用的 hex bin 的最小半径:
from scipy.spatial.distance import euclidean
def get_bounds(arr):
'''Given a 2D array return the y_min, y_max, x_min, x_max'''
return [
np.min(arr[:,1]),
np.max(arr[:,1]),
np.min(arr[:,0]),
np.max(arr[:,0]),
]
def create_mesh(arr, h=100, w=100):
'''arr is a 2d array; h=number of vertical divisions; w=number of horizontal divisions'''
print(' * creating mesh with size', h, w)
bounds = get_bounds(arr)
# create array of valid positions
y_vals = np.arange(bounds[0], bounds[1], (bounds[1]-bounds[0])/h)
x_vals = np.arange(bounds[2], bounds[3], (bounds[3]-bounds[2])/w)
# create the dense mesh
data = np.tile(
[[0, 1], [1, 0]],
np.array([
int(np.ceil(len(y_vals) / 2)),
int(np.ceil(len(x_vals) / 2)),
]))
# ensure each axis has an even number of slots
if len(y_vals) % 2 != 0 or len(x_vals) % 2 != 0:
data = data[0:len(y_vals), 0:len(x_vals)]
return pd.DataFrame(data, index=y_vals, columns=x_vals)
def align_points_to_grid(arr, h=100, w=100, verbose=False):
'''arr is a 2d array; h=number of vertical divisions; w=number of horizontal divisions'''
h = w = len(arr)/10
grid = create_mesh(arr, h=h, w=w)
# fill the mesh
print(' * filling mesh')
df = pd.DataFrame(arr, columns=['x', 'y'])
bounds = get_bounds(arr)
# store the number of points slotted
c = 0
for site, point in df[['x', 'y']].iterrows():
# skip points not in original points domain
if point.y < bounds[0] or point.y > bounds[1] or \
point.x < bounds[2] or point.x > bounds[3]:
raise Exception('Input point is out of bounds', point.x, point.y, bounds)
# assign this point to the nearest open slot
r_y = (bounds[1]-bounds[0])/h
r_x = (bounds[3]-bounds[2])/w
slotted = False
while not slotted:
bottom = grid.index.searchsorted(point.y - r_y)
top = grid.index.searchsorted(point.y + r_y, side='right')
left = grid.columns.searchsorted(point.x - r_x)
right = grid.columns.searchsorted(point.x + r_x, side='right')
close_grid_points = grid.iloc[bottom:top, left:right]
# store the position in this point's radius that minimizes distortion
best_dist = np.inf
grid_loc = [np.nan, np.nan]
for x, col in close_grid_points.iterrows():
for y, val in col.items():
if val != 1: continue
dist = euclidean(point, (x,y))
if dist < best_dist:
best_dist = dist
grid_loc = [x,y]
# no open slots were found so increase the search radius
if np.isnan(grid_loc[0]):
r_y *= 2
r_x *= 2
# success! report the slotted position to the user
else:
# assign a value other than 1 to mark this slot as filled
grid.loc[grid_loc[0], grid_loc[1]] = 2
df.loc[site, ['x', 'y']] = grid_loc
slotted = True
c += 1
if verbose:
print(' * completed', c, 'of', len(arr), 'assignments')
return df
# plot sample data
df = align_points_to_grid(arr, verbose=False)
df.plot.scatter('x', 'y', s=1)
我对这个算法的结果很满意,但对性能不满意。
Python中这种hexbin分配问题有更快的解决方案吗?我觉得其他更多接触线性分配问题或匈牙利算法的人可能对这个问题有宝贵的见解。任何建议都会非常有帮助!
解决方案
事实证明,将每个点分配给当前半径内的第一个可用网格点就足够了。
对于最终来到这里的其他人,为了方便起见,我的实验室将此功能包装到了一个小的Python 包中。您可以pip install pointgrid
然后:
from pointgrid import align_points_to_grid
from sklearn import datasets
# create fake data
arr, labels = datasets.make_blobs(n_samples=1000, centers=5)
# get updated point positions
updated = align_points_to_grid(arr)
updated
将是一个与输入数组具有相同形状的 numpy 数组arr
。
推荐阅读
- javascript - 如何防止不再关注的可内容编辑 div 中出现红色曲线?
- java - 如何使用 maven-flatten 解决 pom.xml 父项目修订中的 fecth 问题?
- sql - SQLite - 删除所有空格
- groovy - 尝试断言 500 错误,但满足条件时响应为空
- javascript - JS 类抽象
- java - 使用 OpenJDK 11.0.4 编译 CoreNLP 时出错
- java - 如果加载时间过长,如何使 selenium 重新加载所需的 url
- css - 如何在 DT::datatable 中禁用 scrollX
- javascript - Chrome 扩展程序:如何在 chrom 扩展程序更新后删除孤立的脚本
- ssl - 为什么在我的表单中添加一个简单的星形图标会禁用 SSL?