CUDA多维数组内存分配时,内存对齐问题

1.简介
以二维数组为例,cuda内对二维数组的存储时按照行优先,一维方式存储的。在分配二维数组空间和数据操作的时候为了提高效率,需要对二维空间的行进行内存对齐。行对齐后的字节数(pitch)由具体设备的硬件决定(常见为 128/256/512 Byte 的整数倍),因此不应假设固定的对齐大小,而应始终使用 cudaMallocPitch 返回的 pitch 值进行行寻址。并行处理时,可以用二维线程块对数组按行进行索引,这也是官方文档例子中使用的方法。

2.使用方法
首先创建二维数组,然后相加。代码如下。


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>

using namespace std;

//测试数据列数

#define M 1024

#define N 129

//用一维方式操作二维数组-2  数组相加

cudaError_t AddData_2DArray(/*float* d_transMat, size_t pitch,*/size_t blockSize, size_t threadSize);
// Entry point: launches the pitched 2-D array addition demo.
// blockSize/threadSize are the grid and block extents handed to the wrapper.
int main()
{
    size_t blockSize = 512;
    size_t threadSize = 256;

    // The wrapper reports CUDA failures through its return code; do not
    // silently discard it — surface the error and exit non-zero.
    cudaError_t status = AddData_2DArray(blockSize, threadSize);
    if (status != cudaSuccess) {
        fprintf(stderr, "AddData_2DArray failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
//这里相加的两个数组大小相等,所以两个pitch相等

//d_transMat1和2相加,结果放到1内

// Element-wise add of two pitched M x N float matrices; the sum is stored
// in d_transMat1 (each element is first seeded: mat1=1, mat2=2, result=3).
//
// Expects a 2-D launch (x indexes columns, y indexes rows); grid-stride
// loops in both dimensions make any grid/block size correct.
// pitch1/pitch2 are the row strides IN BYTES returned by cudaMallocPitch.
//
// NOTE: the original code assumed pitch1 == pitch2 and collapsed them into
// one value (overwriting pitch1 with pitch2). Each matrix now uses its own
// pitch, so the kernel is also correct when the two allocations differ.
__global__ void Add_transMat (float* d_transMat1, size_t pitch1,float* d_transMat2, size_t pitch2)
{
    for (int row = blockIdx.y * blockDim.y + threadIdx.y; row < M; row += blockDim.y * gridDim.y)
    {
        // Pitch is in bytes, so step through rows via char* before casting
        // back to float* (the idiomatic cudaMallocPitch addressing pattern).
        float* row1 = (float*)((char*)d_transMat1 + (size_t)row * pitch1);
        float* row2 = (float*)((char*)d_transMat2 + (size_t)row * pitch2);

        for (int col = blockIdx.x * blockDim.x + threadIdx.x; col < N; col += blockDim.x * gridDim.x)
        {
            row1[col] = 1.0f;   // float literals avoid int->float conversion
            row2[col] = 2.0f;
            row1[col] += row2[col];
        }
    }
}

cudaError_t AddData_2DArray(/*float* d_transMat, size_t pitch,*/size_t blockSize, size_t threadSize)

{

float* d_transMat1, * d_transMat2;

float* transMat1;

size_t pitch1, pitch2;

cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.

cudaStatus = cudaSetDevice(0);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");

goto Error;

}

transMat1 = (float*)malloc(sizeof(float)*M*N);

// Allocate GPU buffers for three vectors (two input, one output)    .

cudaStatus = cudaMallocPitch(&d_transMat1, &pitch1, sizeof(float) * N, M);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaMalloc failed!");

goto Error;

}

cudaStatus = cudaMallocPitch(&d_transMat2, &pitch2, sizeof(float) * N, M);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaMalloc failed!");

goto Error;

}

// Launch a kernel on the GPU with one thread for each element.

Add_transMat << > > (d_transMat1, pitch1, d_transMat2, pitch2);

cudaStatus = cudaMemcpy2D(transMat1, sizeof(float) * N, d_transMat1, pitch1, sizeof(float) * N, M, cudaMemcpyDeviceToHost);

if (cudaStatus != cudaSuccess) {

fprintf(stderr, "cudaMemcpy failed!");

goto Error;

}

for (int i = 0; i < N*M; i++)

cout << i << "  "<

你可能感兴趣的