当前位置:首页 > 开发 > 编程语言 > 多线程 > 正文

CUDA程序block和thread超出硬件允许值时的异常

发表于: 2015-07-13   作者:cherishLC   来源:转载   浏览:
摘要: 调用CUDA的核函数时指定block 和 thread大小,该大小可以是dim3类型的(三维数组),只用一维时可以是unsigned int型的。 以下程序验证了当block或thread大小超出硬件允许值时会产生异常!!!GPU根本不会执行运算!!! 所以验证结果的正确性很重要!!! 在VS中创建CUDA项目会有一个模板,里面有更详细的状态验证。 以下程序在K5000GPU上跑的。
调用CUDA的核函数时指定block 和 thread大小,该大小可以是dim3类型的(三维数组),只用一维时可以是unsigned int型的。
以下程序验证了当block或thread大小超出硬件允许值时会产生异常!!!GPU根本不会执行运算!!!
所以验证结果的正确性很重要!!!
在VS中创建CUDA项目会有一个模板,里面有更详细的状态验证。


以下程序在K5000GPU上跑的。
奇怪的是cuda samples中的deviceQuery程序给出的grid大小(即block数量)的第一维可以到2^31-1,但实测还是只有65535!!!(可能的原因:编译时的目标架构是计算能力2.x,其grid第一维的上限就是65535;可以用 -arch=sm_30 重新编译后再验证。)
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <string>
#include <sstream>
#include <random>


/// Selects device `deviceIdx` as the current CUDA device and returns its properties.
///
/// @param deviceIdx  Index of the device to select and query (defaults to 0).
/// @return The properties reported by cudaGetDeviceProperties. If the query
///         fails, the returned struct is value-initialized (all zero) instead
///         of containing indeterminate data, and a diagnostic is written to
///         std::cerr.
cudaDeviceProp getCudaDeviceProperties(int deviceIdx = 0) {
	// Value-initialize so a failed query never hands back uninitialized memory.
	cudaDeviceProp deviceProp = cudaDeviceProp();

	cudaError_t status = cudaSetDevice(deviceIdx);
	if (status != cudaSuccess)
		std::cerr << "cudaSetDevice(" << deviceIdx << ") failed: " << cudaGetErrorString(status) << std::endl;

	// Still attempt the query, as the original did, even if selecting the device failed.
	status = cudaGetDeviceProperties(&deviceProp, deviceIdx);
	if (status != cudaSuccess)
		std::cerr << "cudaGetDeviceProperties(" << deviceIdx << ") failed: " << cudaGetErrorString(status) << std::endl;

	return deviceProp;
}




/// Kernel function (the code that runs on the GPU): element-wise product,
/// pResult[i] = pa[i] * pb[i].
/// Indexing uses only the x dimension of the launch; each thread handles at
/// most one element, and threads whose global index is >= N do nothing.
__global__ void productArray_bt_kernel(float *pa, float *pb, float *pResult, int N) {
	const int globalIdx = threadIdx.x + blockIdx.x * blockDim.x;
	if (globalIdx >= N)
		return;	// thread falls past the end of the arrays
	pResult[globalIdx] = pa[globalIdx] * pb[globalIdx];
}

/// Host-side helper for productArray_bt_kernel: computes the launch
/// configuration, launches the kernel and checks for errors. Allocation and
/// copying of the buffers are deliberately left to the caller, because the
/// same buffers are reused for many launches.
///
/// @param pa, pb     Input arrays, passed straight to the kernel.
/// @param pResult    Output array, passed straight to the kernel.
/// @param N          Number of elements; N <= 0 is a no-op.
/// @param threadNum  Threads per block (x dimension); must be positive.
/// @throws std::string describing the failure when threadNum is not positive,
///         when the launch is rejected (e.g. a block or grid size beyond what
///         the hardware allows), or when cudaDeviceSynchronize reports an error.
///
/// Note: the former dynamic exception specification `throw(std::string)` was
/// dropped because it is ill-formed since C++17; the function still throws
/// only std::string.
void productArray_bt(float *pa, float *pb, float *pResult, int N, int threadNum = 32) {
	if (threadNum <= 0) {
		// Guard the integer division below against a zero/negative divisor.
		std::stringstream ss;
		ss << "productArray_bt: threadNum must be positive" << "\n\tthreadNum=" << threadNum << ";\t";
		throw ss.str();
	}
	if (N <= 0)
		return;	// nothing to compute; also avoids a zero-block launch

	// Ceiling division: enough blocks to cover all N elements.
	const int blockNum = (N - 1) / threadNum + 1;
	productArray_bt_kernel<<<blockNum, threadNum>>>(pa, pb, pResult, N);

	// An invalid launch configuration is reported here, not by the launch itself.
	cudaError_t cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		std::stringstream ss;
		ss << "productArray_bt_kernel launch failed: " << cudaGetErrorString(cudaStatus) << "\n\tblockNum=" << blockNum << ";\tthreadNum=" << threadNum << ";\t";
		throw ss.str();
	}

	// Wait for the kernel to finish; errors raised while it ran surface here.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		std::stringstream ss;
		ss << "cudaDeviceSynchronize returned error code " << cudaStatus << " (" << cudaGetErrorString(cudaStatus) << ") after launching productArray_bt_kernel!" << "\n\tblockNum=" << blockNum << ";\tthreadNum=" << threadNum << ";\t";
		throw ss.str();
	}
}


/// Writes a diagnostic to std::cerr when a CUDA runtime call failed; returns true on cudaSuccess.
static bool cudaCallSucceeded(cudaError_t status, const char *what) {
	if (status == cudaSuccess)
		return true;
	std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
	return false;
}

/// Times productArray_bt for array lengths 1, 2, 4, ..., 2^20 and, for each
/// length, block sizes 1, 4, 16, ..., 4096 threads, printing the elapsed time
/// of each launch or the exception raised when the launch configuration is
/// rejected. After every successful launch a sample of the result (index 0 and
/// every power-of-two index) is compared against the host-computed product.
void TestProductSpeed() {
	for (int N = 1; N <= (1 << 20); N *= 2) {
		std::cout << "\n\n**************数组长度为  " << N << "  的测试\n";
		const size_t size = sizeof(float) * N;

		// Host copies of the inputs and the result.
		float *pa = new float[N];
		float *pb = new float[N];
		float *pResult = new float[N];
		for (int i = 0; i < N; ++i) {
			pa[i] = i;
			pb[i] = i * 10;
		}

		// Device copies; allocated once per array length and reused for every block size.
		float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
		const bool ready =
			cudaCallSucceeded(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)") &&
			cudaCallSucceeded(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)") &&
			cudaCallSucceeded(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)") &&
			cudaCallSucceeded(cudaMemcpy(d_a, pa, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)") &&
			cudaCallSucceeded(cudaMemcpy(d_b, pb, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

		// If setup failed the launches are skipped, but the cleanup below still runs.
		for (int threadNum = 1; ready && threadNum <= 4096; threadNum *= 4) {
			cudaEvent_t start, stop;
			if (!cudaCallSucceeded(cudaEventCreate(&start), "cudaEventCreate(start)"))
				break;
			if (!cudaCallSucceeded(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
				cudaEventDestroy(start);
				break;
			}

			// Poison the output with 0xFF bytes (a NaN bit pattern for float) so that
			// results left over from an earlier launch can never pass the validation below.
			cudaCallSucceeded(cudaMemset(d_c, 0xFF, size), "cudaMemset(d_c)");

			try {
				cudaEventRecord(start, 0);
				{
					// Section whose GPU time is measured.
					productArray_bt(d_a, d_b, d_c, N, threadNum);
				}
				cudaEventRecord(stop, 0);
				cudaEventSynchronize(stop);
				float costtime = 0.0f;
				cudaEventElapsedTime(&costtime, start, stop);	// milliseconds

				std::cout << "数组长度=" << N << ";\t" << "treadNum=" << threadNum << ";\t" << "点积用时:" << costtime / 1000 << "s" << std::endl;

				// Copy the result back to the host and validate it.
				if (cudaCallSucceeded(cudaMemcpy(pResult, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(pResult)")) {
					// Sample index 0 and every power-of-two index below N.
					for (int i = 0; i < N; i = (i == 0) ? 1 : i * 2) {
						if (pResult[i] != pa[i] * pb[i])
							std::cout << "错误: " << "i=" << i << ";\ti*10i=" << pResult[i] << std::endl;
					}
				}
			}
			catch (const std::string &s) {
				std::cout << "异常:" << s << std::endl;
				std::cout << "\t" << "数组长度=" << N << ";\t" << "treadNum=" << threadNum << ";\t" << std::endl;
			}
			catch (...) {
				std::cout << "未知的异常类型" << std::endl;
				std::cout << "\t\t" << "数组长度=" << N << ";\t" << "treadNum=" << threadNum << ";\t" << std::endl;
			}

			// Release the events on both the normal and the exception path.
			cudaEventDestroy(start);
			cudaEventDestroy(stop);

			std::cout << std::endl;
		}

		// Cleanup (cudaFree accepts a null pointer, so partial allocation is fine).
		cudaFree(d_a);
		cudaFree(d_b);
		cudaFree(d_c);

		delete[] pa;
		delete[] pb;
		delete[] pResult;
	}
}





/// Debug helper: prints the source text of expression `x`, then " = ", then its value.
/// The argument is parenthesized so expressions containing operators that bind
/// more loosely than `<<` (e.g. `a < b`, `c ? d : e`) print correctly.
/// The trailing semicolon is part of the macro and is kept for existing call sites.
#define printExp(x) std::cout<< #x <<" = "<< (x) <<std::endl;

/// Program entry point: prints the path of this source file, then runs the
/// block/thread-size timing test.
int main() {
	const char *sourcePath = __FILE__;
	std::cout << sourcePath << std::endl;

	TestProductSpeed();

	return 0;
}




程序输出如下:

***/testThreadSpeed.cu


**************数组长度为  1  的测试
数组长度=1;	treadNum=1;	点积用时:3.5904e-005s

数组长度=1;	treadNum=4;	点积用时:0.000135648s

数组长度=1;	treadNum=16;	点积用时:3.4592e-005s

数组长度=1;	treadNum=64;	点积用时:3.2384e-005s

数组长度=1;	treadNum=256;	点积用时:3.2928e-005s

数组长度=1;	treadNum=1024;	点积用时:3.1296e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=1;	treadNum=4096;	



**************数组长度为  2  的测试
数组长度=2;	treadNum=1;	点积用时:3.6448e-005s

数组长度=2;	treadNum=4;	点积用时:3.3696e-005s

数组长度=2;	treadNum=16;	点积用时:3.4368e-005s

数组长度=2;	treadNum=64;	点积用时:2.9024e-005s

数组长度=2;	treadNum=256;	点积用时:3.232e-005s

数组长度=2;	treadNum=1024;	点积用时:3.2352e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=2;	treadNum=4096;	



**************数组长度为  4  的测试
数组长度=4;	treadNum=1;	点积用时:3.6448e-005s

数组长度=4;	treadNum=4;	点积用时:4.3488e-005s

数组长度=4;	treadNum=16;	点积用时:3.248e-005s

数组长度=4;	treadNum=64;	点积用时:3.3984e-005s

数组长度=4;	treadNum=256;	点积用时:4.1952e-005s

数组长度=4;	treadNum=1024;	点积用时:3.3632e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=4;	treadNum=4096;	



**************数组长度为  8  的测试
数组长度=8;	treadNum=1;	点积用时:3.328e-005s

数组长度=8;	treadNum=4;	点积用时:3.36e-005s

数组长度=8;	treadNum=16;	点积用时:3.2032e-005s

数组长度=8;	treadNum=64;	点积用时:3.2736e-005s

数组长度=8;	treadNum=256;	点积用时:3.2416e-005s

数组长度=8;	treadNum=1024;	点积用时:3.3376e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=8;	treadNum=4096;	



**************数组长度为  16  的测试
数组长度=16;	treadNum=1;	点积用时:3.472e-005s

数组长度=16;	treadNum=4;	点积用时:3.2224e-005s

数组长度=16;	treadNum=16;	点积用时:3.2352e-005s

数组长度=16;	treadNum=64;	点积用时:3.28e-005s

数组长度=16;	treadNum=256;	点积用时:3.1648e-005s

数组长度=16;	treadNum=1024;	点积用时:3.2672e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=16;	treadNum=4096;	



**************数组长度为  32  的测试
数组长度=32;	treadNum=1;	点积用时:3.44e-005s

数组长度=32;	treadNum=4;	点积用时:3.3472e-005s

数组长度=32;	treadNum=16;	点积用时:3.1968e-005s

数组长度=32;	treadNum=64;	点积用时:4.8256e-005s

数组长度=32;	treadNum=256;	点积用时:3.4592e-005s

数组长度=32;	treadNum=1024;	点积用时:3.3152e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=32;	treadNum=4096;	



**************数组长度为  64  的测试
数组长度=64;	treadNum=1;	点积用时:3.5712e-005s

数组长度=64;	treadNum=4;	点积用时:3.12e-005s

数组长度=64;	treadNum=16;	点积用时:3.2352e-005s

数组长度=64;	treadNum=64;	点积用时:3.1168e-005s

数组长度=64;	treadNum=256;	点积用时:3.2064e-005s

数组长度=64;	treadNum=1024;	点积用时:3.3376e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=64;	treadNum=4096;	



**************数组长度为  128  的测试
数组长度=128;	treadNum=1;	点积用时:3.504e-005s

数组长度=128;	treadNum=4;	点积用时:3.2544e-005s

数组长度=128;	treadNum=16;	点积用时:3.1776e-005s

数组长度=128;	treadNum=64;	点积用时:3.1744e-005s

数组长度=128;	treadNum=256;	点积用时:3.1776e-005s

数组长度=128;	treadNum=1024;	点积用时:3.1872e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=128;	treadNum=4096;	



**************数组长度为  256  的测试
数组长度=256;	treadNum=1;	点积用时:3.5328e-005s

数组长度=256;	treadNum=4;	点积用时:3.3408e-005s

数组长度=256;	treadNum=16;	点积用时:3.456e-005s

数组长度=256;	treadNum=64;	点积用时:3.3952e-005s

数组长度=256;	treadNum=256;	点积用时:4.6336e-005s

数组长度=256;	treadNum=1024;	点积用时:3.1776e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=256;	treadNum=4096;	



**************数组长度为  512  的测试
数组长度=512;	treadNum=1;	点积用时:3.68e-005s

数组长度=512;	treadNum=4;	点积用时:3.2064e-005s

数组长度=512;	treadNum=16;	点积用时:3.2512e-005s

数组长度=512;	treadNum=64;	点积用时:3.2736e-005s

数组长度=512;	treadNum=256;	点积用时:3.136e-005s

数组长度=512;	treadNum=1024;	点积用时:3.2128e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=512;	treadNum=4096;	



**************数组长度为  1024  的测试
数组长度=1024;	treadNum=1;	点积用时:3.9552e-005s

数组长度=1024;	treadNum=4;	点积用时:3.3568e-005s

数组长度=1024;	treadNum=16;	点积用时:3.1712e-005s

数组长度=1024;	treadNum=64;	点积用时:3.184e-005s

数组长度=1024;	treadNum=256;	点积用时:3.5264e-005s

数组长度=1024;	treadNum=1024;	点积用时:3.2544e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=1024;	treadNum=4096;	



**************数组长度为  2048  的测试
数组长度=2048;	treadNum=1;	点积用时:4.7872e-005s

数组长度=2048;	treadNum=4;	点积用时:3.4368e-005s

数组长度=2048;	treadNum=16;	点积用时:3.2544e-005s

数组长度=2048;	treadNum=64;	点积用时:5.648e-005s

数组长度=2048;	treadNum=256;	点积用时:3.4336e-005s

数组长度=2048;	treadNum=1024;	点积用时:3.5296e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=2048;	treadNum=4096;	



**************数组长度为  4096  的测试
数组长度=4096;	treadNum=1;	点积用时:6.0384e-005s

数组长度=4096;	treadNum=4;	点积用时:3.872e-005s

数组长度=4096;	treadNum=16;	点积用时:3.328e-005s

数组长度=4096;	treadNum=64;	点积用时:3.3344e-005s

数组长度=4096;	treadNum=256;	点积用时:3.2832e-005s

数组长度=4096;	treadNum=1024;	点积用时:3.3088e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=1;	threadNum=4096;	
	数组长度=4096;	treadNum=4096;	



**************数组长度为  8192  的测试
数组长度=8192;	treadNum=1;	点积用时:8.7168e-005s

数组长度=8192;	treadNum=4;	点积用时:4.5664e-005s

数组长度=8192;	treadNum=16;	点积用时:3.504e-005s

数组长度=8192;	treadNum=64;	点积用时:3.2928e-005s

数组长度=8192;	treadNum=256;	点积用时:3.2096e-005s

数组长度=8192;	treadNum=1024;	点积用时:3.2128e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=2;	threadNum=4096;	
	数组长度=8192;	treadNum=4096;	



**************数组长度为  16384  的测试
数组长度=16384;	treadNum=1;	点积用时:0.000143616s

数组长度=16384;	treadNum=4;	点积用时:6.08e-005s

数组长度=16384;	treadNum=16;	点积用时:4.4512e-005s

数组长度=16384;	treadNum=64;	点积用时:3.7184e-005s

数组长度=16384;	treadNum=256;	点积用时:3.3088e-005s

数组长度=16384;	treadNum=1024;	点积用时:3.2736e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=4;	threadNum=4096;	
	数组长度=16384;	treadNum=4096;	



**************数组长度为  32768  的测试
数组长度=32768;	treadNum=1;	点积用时:0.000344064s

数组长度=32768;	treadNum=4;	点积用时:8.608e-005s

数组长度=32768;	treadNum=16;	点积用时:4.4608e-005s

数组长度=32768;	treadNum=64;	点积用时:3.568e-005s

数组长度=32768;	treadNum=256;	点积用时:3.248e-005s

数组长度=32768;	treadNum=1024;	点积用时:3.5552e-005s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=8;	threadNum=4096;	
	数组长度=32768;	treadNum=4096;	



**************数组长度为  65536  的测试
异常:productArray_bt_kernel launch failed: invalid argument
	blockNum=65536;	threadNum=1;	
	数组长度=65536;	treadNum=1;	

数组长度=65536;	treadNum=4;	点积用时:0.000200448s

数组长度=65536;	treadNum=16;	点积用时:0.000135808s

数组长度=65536;	treadNum=64;	点积用时:0.00033584s

数组长度=65536;	treadNum=256;	点积用时:0.0003568s

数组长度=65536;	treadNum=1024;	点积用时:0.000324992s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=16;	threadNum=4096;	
	数组长度=65536;	treadNum=4096;	



**************数组长度为  131072  的测试
异常:productArray_bt_kernel launch failed: invalid argument
	blockNum=131072;	threadNum=1;	
	数组长度=131072;	treadNum=1;	

数组长度=131072;	treadNum=4;	点积用时:0.00029168s

数组长度=131072;	treadNum=16;	点积用时:0.00015808s

数组长度=131072;	treadNum=64;	点积用时:0.000143424s

数组长度=131072;	treadNum=256;	点积用时:4.2816e-005s

数组长度=131072;	treadNum=1024;	点积用时:0.000152416s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=32;	threadNum=4096;	
	数组长度=131072;	treadNum=4096;	



**************数组长度为  262144  的测试
异常:productArray_bt_kernel launch failed: invalid argument
	blockNum=262144;	threadNum=1;	
	数组长度=262144;	treadNum=1;	

异常:productArray_bt_kernel launch failed: invalid argument
	blockNum=65536;	threadNum=4;	
	数组长度=262144;	treadNum=4;	

数组长度=262144;	treadNum=16;	点积用时:0.000238848s

数组长度=262144;	treadNum=64;	点积用时:0.000137728s

数组长度=262144;	treadNum=256;	点积用时:0.000148288s

数组长度=262144;	treadNum=1024;	点积用时:0.000140192s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=64;	threadNum=4096;	
	数组长度=262144;	treadNum=4096;	



**************数组长度为  524288  的测试
异常:productArray_bt_kernel launch failed: invalid argument
	blockNum=524288;	threadNum=1;	
	数组长度=524288;	treadNum=1;	

异常:productArray_bt_kernel launch failed: invalid argument
	blockNum=131072;	threadNum=4;	
	数组长度=524288;	treadNum=4;	

数组长度=524288;	treadNum=16;	点积用时:0.000356736s

数组长度=524288;	treadNum=64;	点积用时:0.00019056s

数组长度=524288;	treadNum=256;	点积用时:0.000161248s

数组长度=524288;	treadNum=1024;	点积用时:0.000157632s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=128;	threadNum=4096;	
	数组长度=524288;	treadNum=4096;	



**************数组长度为  1048576  的测试
异常:productArray_bt_kernel launch failed: invalid argument
	blockNum=1048576;	threadNum=1;	
	数组长度=1048576;	treadNum=1;	

异常:productArray_bt_kernel launch failed: invalid argument
	blockNum=262144;	threadNum=4;	
	数组长度=1048576;	treadNum=4;	

异常:productArray_bt_kernel launch failed: invalid argument
	blockNum=65536;	threadNum=16;	
	数组长度=1048576;	treadNum=16;	

数组长度=1048576;	treadNum=64;	点积用时:0.000241312s

数组长度=1048576;	treadNum=256;	点积用时:0.000206912s

数组长度=1048576;	treadNum=1024;	点积用时:0.000214688s

异常:productArray_bt_kernel launch failed: invalid configuration argument
	blockNum=256;	threadNum=4096;	
	数组长度=1048576;	treadNum=4096;	


CUDA程序block和thread超出硬件允许值时的异常

  • 0

    开心

    开心

  • 0

    板砖

    板砖

  • 0

    感动

    感动

  • 0

    有用

    有用

  • 0

    疑问

    疑问

  • 0

    难过

    难过

  • 0

    无聊

    无聊

  • 0

    震惊

    震惊

编辑推荐
CUDA Thread Block:transpose 在 Heresy ��的前�善� sample 程式(VectorAdd、DeviceInfo) �
测试环境   测试的GPU平台为GTX660M,计算能力为3.0 首先介绍一下GPU提供的函数: int printf(con
#include "cuda_runtime.h" #include <stdio.h> #include <stdlib.h> #include <mat
#include "cuda_runtime.h" #include <stdio.h> #include <stdlib.h> #include <mat
今天管理一台服务器,远程连接时帐号密码都输入完后,点连接后弹出一个“终端服务器超出最大允许连
LSComboBoxItem.cs using System; using System.Net; using System.Windows; using System.Windows.
一个简单的CUDA程序 上一篇博客中讲到了如何下载并且搭建CUDA的开发环境,这次我将我第一次学到的CU
转载请注明来源:http://blog.csdn.net/singwhatiwanna/article/details/17289479 前言 大家都知道
转载请注明来源:http://blog.csdn.net/singwhatiwanna/article/details/17289479 前言 大家都知道
转载请注明来源:http://blog.csdn.net/singwhatiwanna/article/details/17289479 前言 大家都知道
版权所有 IT知识库 CopyRight © 2009-2015 IT知识库 IT610.com , All Rights Reserved. 京ICP备09083238号