我的Python / Cython迭代基准测试代表吗?

我想遍历Python程序中的大型数据结构并为每个元素执行一个任务。为了简单起见,假设元素是整数,而任务只是一个增量。最后,最后一个递增的元素将作为(虚拟)结果返回。为了寻求最佳的结构/方法,我比较了纯Python和Cython中这些结构的时序(我无法在其他地方找到它们的直接比较):

  • Python列表
  • NumPy数组/键入的内存视图
  • 具有基础C ++向量的Cython扩展类型

我计时的迭代是:

  • 列表迭代中的Python foreach(it_list)
  • 具有显式元素访问(cit_list)的Cython列表迭代
  • 数组迭代中的Python foreach(it_nparray)
  • Python NumPy向量化运算(vec_nparray)
  • 具有显式元素访问(cit_memview)的Cython内存视图迭代
  • 基础向量迭代(it_pyvector)中的Python foreach
  • Python foreach通过__iter__(it_pyvector_iterator)在基础向量迭代中
  • 具有显式元素访问(cit_pyvector)的Cython向量迭代
  • 通过vector.iterator(cit_pyvector_iterator)进行Cython向量迭代

我从中得出结论(时序如下):

  • 在NumPy数组上进行普通的Python迭代非常慢(比Python列表迭代慢大约10倍)->这不是一个好主意
  • 包装的C ++向量上的Python迭代也很慢(比Python列表迭代慢约1.5倍)->不好
  • 包装的C ++向量上的Cython迭代是最快的选择,大约等于C连续内存视图
  • 使用显式元素访问的向量迭代比使用迭代器的迭代要快->为什么要麻烦使用迭代器?
  • 内存视图方法的开销比扩展类型方法大得多

现在的问题是:我的数字可靠吗(我做错了什么还是错过了什么)?这是否符合您在实际示例中的经验?还有什么我可以做来改善迭代的吗?下面是我使用的代码和时间。顺便说一句,我正在Jupyter笔记本中使用它。建议和评论高度赞赏!

不同数据结构大小n的相对定时(最小值1.000):

================================================================================
Timings for n = 1:
--------------------------------------------------------------------------------
    cit_pyvector_iterator:   1.000
             cit_pyvector:   1.005
                 cit_list:   1.023
                  it_list:   3.064
              it_pyvector:   4.230
     it_pyvector_iterator:   4.937
              cit_memview:   8.196
              vec_nparray:  20.187
               it_nparray:  25.310
================================================================================ 

================================================================================
Timings for n = 1000:
--------------------------------------------------------------------------------
    cit_pyvector_iterator:   1.000
             cit_pyvector:   1.001
              cit_memview:   2.453
              vec_nparray:   5.845
                 cit_list:   9.944
                  it_list: 137.694
              it_pyvector: 199.702
     it_pyvector_iterator: 218.699
               it_nparray: 1516.080
================================================================================ 

================================================================================
Timings for n = 1000000:
--------------------------------------------------------------------------------
             cit_pyvector:   1.000
              cit_memview:   1.056
    cit_pyvector_iterator:   1.197
              vec_nparray:   2.516
                 cit_list:   7.089
                  it_list:  87.099
     it_pyvector_iterator: 143.232
              it_pyvector: 162.374
               it_nparray: 897.602
================================================================================ 

================================================================================
Timings for n = 10000000:
--------------------------------------------------------------------------------
             cit_pyvector:   1.000
              cit_memview:   1.004
    cit_pyvector_iterator:   1.060
              vec_nparray:   2.721
                 cit_list:   7.714
                  it_list:  88.792
     it_pyvector_iterator: 130.116
              it_pyvector: 149.497
               it_nparray: 872.798
================================================================================

Cython代码:

%%cython --annotate

# distutils: language = c++
# cython: boundscheck = False
# cython: wraparound = False

from libcpp.vector cimport vector
from cython.operator cimport dereference as deref, preincrement as princ


# Extension type wrapping a vector
cdef class pyvector:
    """Extension type wrapping a C++ ``vector[long]``.

    Python-level iteration goes through ``__iter__`` (a generator that
    boxes each C long into a Python int); Cython code can bypass that and
    read ``_data`` directly.
    """
    # Underlying C++ storage; only visible to Cython code.
    cdef vector[long] _data

    cpdef void push_back(self, long x):
        # Append one element to the underlying C++ vector.
        self._data.push_back(x)

    def __iter__(self):
        # Generator-based iteration: each yield boxes a C long into a
        # Python int, which dominates the cost of Python-level loops.
        cdef size_t i, n = self._data.size()

        for i in range(n):
            yield self._data[i]

    @property
    def data(self):
        # NOTE: Cython auto-converts vector[long] to a new Python list
        # here, i.e. a full copy on every access.
        return self._data


# Cython iteration over Python list      
cpdef long cit_list(list l):
    """Cython loop over a Python list with explicit indexing.

    Each ``l[i]`` access still unboxes a Python int into a C long; the
    last element + 1 is returned as a dummy result.
    NOTE(review): for an empty list ``j`` is returned uninitialized.
    """
    cdef:
        long j, ii
        size_t i, n = len(l)

    for i in range(n):
        ii = l[i]
        j = ii + 1  
    return j


# Cython iteration over NumPy array
# Cython loop over a C-contiguous typed memoryview of longs, run without
# the GIL. Returns the last element + 1 as a dummy result.
# NOTE(review): j is returned uninitialized when v is empty.
cpdef long cit_memview(long[::1] v) nogil:
    cdef:
        size_t i, n = v.shape[0]
        long j

    for i in range(n):
        j = v[i] + 1   
    return j


# Iterate over pyvector
# Cython loop over the wrapped C++ vector with explicit element access
# (v._data[i]), run without the GIL. Returns last element + 1.
# NOTE(review): j is returned uninitialized when the vector is empty.
cpdef long cit_pyvector(pyvector v) nogil:
    cdef:
        size_t i, n = v._data.size()
        long j

    for i in range(n):
        j = v._data[i] + 1
    return j    


# Same loop as cit_pyvector but via a C++ vector iterator
# (deref/preincrement, see the `princ` alias imported above).
# NOTE(review): j is returned uninitialized when the vector is empty.
cpdef long cit_pyvector_iterator(pyvector v) nogil:
    cdef:
        vector[long].iterator it = v._data.begin()
        long j

    while it != v._data.end():
        j = deref(it) + 1
        princ(it)
    return j

Python代码:

# Python iteration over Python list
def it_list(l):
    """Pure-Python foreach over a list.

    Performs a dummy increment per element and returns the last
    incremented value. Raises UnboundLocalError on an empty input.
    """
    for element in l:
        incremented = element + 1
    return incremented


# Python iteration over NumPy array
def it_nparray(a):
    """Pure-Python foreach over a NumPy array.

    Performs a dummy increment per element (each access boxes a scalar)
    and returns the last incremented value. Raises UnboundLocalError on
    an empty input.
    """
    for element in a:
        incremented = element + 1
    return incremented


# Vectorised NumPy operation
def vec_nparray(a):
    """Vectorised NumPy increment.

    Computes ``a + 1`` in a single vectorised operation and returns the
    last incremented element, matching the contract of the loop-based
    variants (they all return last element + 1). The input array is not
    modified.

    Fix: the original discarded the result of ``a + 1`` and returned the
    un-incremented ``a[-1]``, making its result inconsistent with every
    other benchmarked function.
    """
    incremented = a + 1
    return incremented[-1]


# Python iteration over C++ vector extension type
def it_pyvector_iterator(v):
    """Pure-Python foreach over the extension type via its __iter__.

    Performs a dummy increment per element and returns the last
    incremented value. Raises UnboundLocalError on an empty input.
    """
    for element in v:
        incremented = element + 1
    return incremented


def it_pyvector(v):
    """Pure-Python foreach over the extension type's `data` property.

    Iterates the (list-converted) underlying vector, performs a dummy
    increment per element, and returns the last incremented value.
    Raises UnboundLocalError on an empty input.
    """
    for element in v.data:
        incremented = element + 1
    return incremented

对于基准:

import numpy as np
from operator import itemgetter


def bm(sizes):
    """Benchmark every iteration variant for each structure size.

    For each n in `sizes`, builds a Python list, a NumPy int64 array and
    a pyvector (C++ vector wrapper) holding 0..n-1, times every
    (function, argument) pair with IPython's %timeit, and returns a dict
    {n: {function_name: TimeitResult}}.

    NOTE: uses the %timeit line magic, so this runs only inside
    IPython/Jupyter, not plain Python.
    """

    Timings = {}
    for n in sizes:
        Timings[n] = {}

        # Python list holding 0..n-1
        list_ = list(range(n))

        # NumPy array; int64 chosen to match the Cython `long` views
        # (assumes a 64-bit C long — TODO confirm on Windows)
        a = np.arange(n, dtype=np.int64)

        # C++ vector extension type, filled one push_back at a time
        pyv = pyvector()
        for i in range(n):
            pyv.push_back(i)


        # (function, argument) pairs covering every iteration strategy
        calls = [
            (it_list, list_),
            (cit_list, list_),
            (it_nparray, a),
            (vec_nparray, a),
            (cit_memview, a),
            (it_pyvector, pyv),
            (it_pyvector_iterator, pyv),
            (cit_pyvector, pyv),
            (cit_pyvector_iterator, pyv),
        ]

        # %timeit -o returns the TimeitResult object (IPython magic)
        for fxn, arg in calls:
            Timings[n][fxn.__name__] = %timeit -o fxn(arg)

    return Timings


def ratios(timings, base=None):
    """Return (name, relative_time) pairs sorted by relative time.

    Parameters
    ----------
    timings : dict
        Maps a run name to a TimeitResult-like object exposing an
        ``average`` attribute (seconds).
    base : hashable, optional
        Key into `timings` whose average becomes the denominator. When
        omitted, the fastest run is the base, so the minimum ratio is
        exactly 1.000.
    """
    if base is not None:
        denominator = timings[base].average
    else:
        denominator = min(t.average for t in timings.values())

    # Feed sorted() a generator directly — no need to materialize an
    # intermediate list first.
    return sorted(
        ((name, t.average / denominator) for name, t in timings.items()),
        key=itemgetter(1),
    )


# Run the benchmark for each size and print the sorted relative timings.
Timings = {}
sizes = [1, 1000, 1000000, 10000000]
Timings.update(bm(sizes))
for size in sizes:
    separator = "=" * 80
    print(separator)
    print(f"Timings for n = {size}:")
    print("-" * 80)
    for name, ratio in ratios(Timings[size]):
        print(f"{name:>25}: {ratio:7.3f}")
    print(separator, "\n")
评论