Cythonize a list of ndarrays to indirect_contiguous

Problem description

I want to cythonize a list of ndarrays (of different sizes) to speed up a function. A data structure of type [::view.indirect_contiguous, ::1] seems to be the way to go, i.e. a contiguous array of pointers linking to contiguous memoryviews of different sizes, but it is not clear to me how to set it up correctly. How do I do that, and how do I access its elements?

In the MWE below I just compute a simple sum of the elements, only to test element access (I am not interested in speeding it up with a different formula).

from typing import List
import numpy as np

def python_foo(array_list: List[np.ndarray]):
  list_len = len(array_list)
  results = np.zeros((list_len,1), dtype=np.int8)
  # print a few elements, then sum each array
  print(array_list[0][0], array_list[0][1], array_list[1][0], array_list[1][1])
  for k in range(list_len):
    results[k] = np.sum(array_list[k])
  return results
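
For example, it would be called like this (array contents and shapes chosen purely for illustration):

import numpy as np

a = np.arange(6, dtype=np.float64).reshape(2, 3)  # a 2x3 array
b = np.arange(4, dtype=np.float64)                # a length-4 vector
print(python_foo([a, b]))                         # prints one sum per input array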


import cython
import numpy as np
cimport numpy as np
from typing import List
from cython cimport view
from libc.stdio cimport printf

DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

def cython_foo(array_list: List[np.ndarray]):
  cdef int list_len = len(array_list)
  cdef DTYPE_t[::view.indirect_contiguous, ::1] my_mem_view
  # (1) - how do I assign the ndarrays to my_mem_view? 

  # prints few elements and sum them
  # (2) - how do I access the elements of my_mem_view? is this correct?
  printf("%f %f %f %f\n", my_mem_view[0,0], my_mem_view[0,1], my_mem_view[1,0], my_mem_view[1,1])
  cdef DTYPE_t results[list_len] = {0}
  cdef int k
  cdef int n
  for k in range(list_len):
    for n in range(array_list[k].size):  # should I also create an array of lengths?
      results[k] += my_mem_view[k,n] 
  # BONUS question: I probably need to convert results to Python objects (list, ndarrays), right?
  return results

Tags: python, performance, numpy, cython

Solution


Here is a possible solution that uses temporary memoryviews to obtain pointers to the data. If anyone finds a better, cleaner or faster answer, please let me know.

I am wondering whether my memory management is correct here, or whether something is missing.

# indirect_contiguous.pyx
from typing import List
import numpy as np
cimport numpy as np
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.stdio cimport printf

DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

def cython_foo(array_list: List[np.ndarray]):
  cdef int list_len = len(array_list)

  # (1) 
  cdef DTYPE_t ** my_mem_view = <DTYPE_t **> PyMem_Malloc(list_len * sizeof(DTYPE_t *))
  cdef int idx
  cdef DTYPE_t[:] item_1 # cdef not allowed in conditionals
  cdef DTYPE_t[:,:] item_2
  cdef DTYPE_t[:,:,:] item_3
  # ... other cdef for DTYPE_t[...] up to dimension 8 - the maximum allowed for a memoryview
  cdef int *array_len = <int *> PyMem_Malloc(list_len * sizeof(int))
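  # Note: np.ascontiguousarray returns the input array itself when it is already
  # C-contiguous float64; otherwise it creates a temporary copy, and the pointer
  # stored below would dangle once item_1/item_2/item_3 is rebound on a later
  # iteration. The inputs are therefore assumed to already be contiguous float64.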
  for idx in range(list_len):
    if len(array_list[idx].shape) == 1:
      item_1 = np.ascontiguousarray(array_list[idx], dtype=DTYPE)
      my_mem_view[idx] = <DTYPE_t *> &item_1[0]
    elif len(array_list[idx].shape) == 2:
      item_2 = np.ascontiguousarray(array_list[idx], dtype=DTYPE)
      my_mem_view[idx] = <DTYPE_t *> &item_2[0, 0]
    elif len(array_list[idx].shape) == 3:
      item_3 = np.ascontiguousarray(array_list[idx], dtype=DTYPE)
      my_mem_view[idx] = <DTYPE_t *> &item_3[0, 0, 0]
    # ... other elif for DTYPE_t[:,:,:] up to dimension 8
    array_len[idx] = array_list[idx].size

  # print a few elements, then sum each array
  # (2)
  printf("%f %f %f %f\n", my_mem_view[0][0], my_mem_view[0][1], my_mem_view[1][0], my_mem_view[1][1])
  cdef DTYPE_t *results = <DTYPE_t *> PyMem_Malloc(list_len * sizeof(DTYPE_t))
  cdef int k
  cdef int n
  for k in range(list_len):
    results[k] = 0
    for n in range(array_len[k]):
      results[k] += my_mem_view[k][n]

  py_results = []
  for k in range(list_len):
    py_results.append(results[k])

  # free memory
  PyMem_Free(my_mem_view)
  PyMem_Free(array_len)
  PyMem_Free(results)

  return py_results
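
The .pyx file has to be compiled before the timing below can run. A minimal setup.py sketch (assuming the file is named indirect_contiguous.pyx and that python_foo is defined in it as well, as the timing code implies):

# setup.py
import numpy as np
from setuptools import setup, Extension
from Cython.Build import cythonize

extensions = [
    Extension(
        "indirect_contiguous",
        ["indirect_contiguous.pyx"],
        include_dirs=[np.get_include()],  # needed because the module cimports numpy
    )
]

setup(ext_modules=cythonize(extensions))

Build it in place with "python setup.py build_ext --inplace" before running the benchmark.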

Speed test

import timeit
print(timeit.timeit(stmt="indirect_contiguous.python_foo([np.random.random((100,100,100)), np.random.random((100,100))])",
                    setup="import numpy as np; import indirect_contiguous; ", number=100))
print(timeit.timeit(stmt="indirect_contiguous.cython_foo([np.random.random((100,100,100)), np.random.random((100,100))])",
                    setup="import numpy as np; import indirect_contiguous; ", number=100))

I get a small improvement of 2-3% (1.45 s vs 1.42 s), probably because I am just computing a sum of the elements (which numpy already optimizes).
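
As a quick sanity check, the sums can be compared against plain np.sum (rather than python_foo, whose int8 result array would overflow for arrays this large), assuming the module was built as above:

import numpy as np
import indirect_contiguous

arrays = [np.random.random((100, 100, 100)), np.random.random((100, 100))]
expected = [float(np.sum(a)) for a in arrays]
print(np.allclose(indirect_contiguous.cython_foo(arrays), expected))  # should print True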

