Logistic map: speed comparison between PyCUDA, CuPy, and NumPy

I am a graduate student (Master's) testing different implementations of the same code in CUDA C, PyCUDA, and CuPy to compare their speed. The goal is to pick the best trade-off between execution speed and programming time for the simulations in my plasma physics thesis. Below are snippets of the logistic map code in PyCUDA and CuPy.
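For context, every version computes the same quantity: the Lyapunov exponent of the logistic map x_{n+1} = r*x_n*(1 - x_n), estimated as

    lambda = (1/t_run) * sum_{n=0..t_run-1} ln|r - 2*r*x_n|

for N independent (r, x0) pairs in parallel. The host-side setup is not shown in the snippets; roughly, it looks like the sketch below (the sizes and ranges here are placeholders, not the values used for the timings):

import numpy as np

N = 1_000_000        # number of (r, x0) pairs evaluated in parallel (placeholder)
t_run = 1000         # iterations of the map per pair (placeholder)

r_host = np.linspace(2.5, 4.0, N).astype(np.float32)   # control parameter r
x_host = np.random.rand(N).astype(np.float32)           # initial conditions in (0, 1)
lambda_host = np.zeros(N, dtype=np.float32)              # Lyapunov exponent accumulator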

PyCUDA:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""

  __global__ void mapeo(int N, int t_run, float *x, float *r, float *lambda)
  {
    int id = blockIdx.x* blockDim.x + threadIdx.x;
    if(id<N){
            for(int jd=0; jd<t_run; jd++){
                lambda[id] += logf(fabsf(r[id]-2*r[id]*x[id]));
                x[id] = r[id]*x[id]*(1.0-x[id]);
            }
            lambda[id] = lambda[id]/t_run;
    }
  }
""")

# Host-side copies; device allocation and transfer are handled by cuda.In/cuda.InOut below:
x_m02 = np.copy(x_host)
lambda_m02 = np.copy(lambda_host)

# Kernel configuration and launch
mapeo_kernel = mod.get_function('mapeo')
numThreads = 128
numBlocks = (N + numThreads - 1 )//numThreads

start = cuda.Event()  # timing with PyCUDA events
end = cuda.Event()
start.record()
# kernel launch
mapeo_kernel(np.int32(N), np.int32(t_run),
             cuda.InOut(x_m02), cuda.In(r_host), cuda.InOut(lambda_m02),
             block=(numThreads, 1, 1), grid=(numBlocks, 1, 1))

end.record() # end timing
end.synchronize()
secs = start.time_till(end)*1e-3

print("----")
print('Method 02 (PyCUDA): %f s' % secs)
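
Since the title also mentions NumPy, this is roughly what the CPU baseline looks like (a sketch on the same host arrays; not the exact script behind the numbers below):

from time import time

x_np = x_host.copy()
lambda_np = lambda_host.copy()

t1 = time()
for _ in range(t_run):
    lambda_np += np.log(np.abs(r_host - 2 * r_host * x_np))
    x_np = r_host * x_np * (1.0 - x_np)
lambda_np /= t_run
t2 = time()

print("NumPy time: %f s" % (t2 - t1))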

CuPy:

import cupy as cp
from time import time

# Device copies of the host arrays
x_gpu = cp.asarray(x_host)
r_gpu = cp.asarray(r_host)
l_gpu = cp.asarray(lambda_host)

def mapeo(x, r, lamb):
    for i in range(t_run):
        lamb += cp.log(cp.abs(r - 2 * r * x))
        x = r * x * (1.0 - x)
    lamb /= t_run  # average over iterations, matching the CUDA kernels

cp.cuda.Device().synchronize()
t1 = time()

mapeo(x_gpu, r_gpu, l_gpu)

cp.cuda.Device().synchronize()
t2 = time()

print("tiempo cupy: %f ", t2-t1)

CuPy with a custom kernel (RawKernel):

kernel_1 = cp.RawKernel( r'''
    extern "C" __global__ 
    void mapeo_logistico( int t_run, 
                            float *r, float *x, float *lamb){

            int id = blockDim.x * blockIdx.x + threadIdx.x;

            for(int jd=0; jd<t_run; jd++){
                lamb[id] += logf(fabsf(r[id]-2*r[id]*x[id]));
                x[id] = r[id]*x[id]*(1.0-x[id]);
            }
            lamb[id] = lamb[id]/t_run;      

    }
    ''', 'mapeo_logistico')

r_gpu_2 = cp.asarray(r_host)
x_gpu_2 = cp.asarray(x_host)
l_gpu_2 = cp.asarray(lambda_host)

cp.cuda.Device().synchronize()
t1 = time()
kernel_1((N,), (1,), (cp.int32(t_run), r_gpu_2, x_gpu_2, l_gpu_2))  # grid=(N,), block=(1,): one thread per block
cp.cuda.Device().synchronize()
t2 = time()

print("CuPy RawKernel time: %f s" % (t2 - t1))
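
For a closer apples-to-apples comparison with the PyCUDA event timing above, the RawKernel launch can also be timed with CuPy's CUDA events; a sketch (it re-runs the kernel, so the device arrays are reset first):

# reset the device arrays so the re-timed run starts from the same state
x_gpu_2 = cp.asarray(x_host)
l_gpu_2 = cp.asarray(lambda_host)

start_ev = cp.cuda.Event()
end_ev = cp.cuda.Event()

start_ev.record()
kernel_1((N,), (1,), (cp.int32(t_run), r_gpu_2, x_gpu_2, l_gpu_2))
end_ev.record()
end_ev.synchronize()

secs_raw = cp.cuda.get_elapsed_time(start_ev, end_ev) * 1e-3  # ms -> s
print("CuPy RawKernel time (events): %f s" % secs_raw)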

The results are as follows:

[plots: Speed Comparison, SpeedUp]

Am I doing something wrong? I can't understand why CuPy is slower even with the custom kernel... Is PyCUDA better, or am I missing something?