12 #ifndef _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
13 #define _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
16 #ifdef _DEBUG // Do this only in debug mode...
17 # define WINDOWS_LEAN_AND_MEAN
33 #define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__)
34 #define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)
35 #define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__)
36 #define cufftSafeCall(err) __cufftSafeCall (err, __FILE__, __LINE__)
37 #define cutilCheckError(err) __cutilCheckError (err, __FILE__, __LINE__)
38 #define cutilCheckMsg(msg) __cutilGetLastError (msg, __FILE__, __LINE__)
39 #define cutilCheckMsgAndSync(msg) __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
40 #define cutilSafeMalloc(mallocCall) __cutilSafeMalloc ((mallocCall), __FILE__, __LINE__)
41 #define cutilCondition(val) __cutilCondition (val, __FILE__, __LINE__)
42 #define cutilExit(argc, argv) __cutilExit (argc, argv)
44 inline cudaError cutilDeviceSynchronize()
46 #if CUDART_VERSION >= 4000
47 return cudaDeviceSynchronize();
49 return cudaThreadSynchronize();
53 inline cudaError cutilDeviceReset()
55 #if CUDART_VERSION >= 4000
56 return cudaDeviceReset();
58 return cudaThreadExit();
62 inline void __cutilCondition(
int val,
char *file,
int line)
64 if( CUTFalse == cutCheckCondition( val, file, line ) ) {
69 inline void __cutilExit(
int argc,
char **argv)
71 if (!cutCheckCmdLineFlag(argc, (
const char**)argv,
"noprompt")) {
72 printf(
"\nPress ENTER to exit...\n");
80 #define MIN(a,b) ((a < b) ? a : b)
81 #define MAX(a,b) ((a > b) ? a : b)
84 inline int _ConvertSMVer2Cores(
int major,
int minor)
92 sSMtoCores nGpuArchCoresPerSM[] =
103 while (nGpuArchCoresPerSM[index].SM != -1) {
104 if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
105 return nGpuArchCoresPerSM[index].Cores;
109 printf(
"MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
115 inline int cutGetMaxGflopsDeviceId()
117 int current_device = 0, sm_per_multiproc = 0;
118 int max_compute_perf = 0, max_perf_device = 0;
119 int device_count = 0, best_SM_arch = 0;
120 cudaDeviceProp deviceProp;
122 cudaGetDeviceCount( &device_count );
124 while ( current_device < device_count ) {
125 cudaGetDeviceProperties( &deviceProp, current_device );
126 if (deviceProp.major > 0 && deviceProp.major < 9999) {
127 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
134 while( current_device < device_count ) {
135 cudaGetDeviceProperties( &deviceProp, current_device );
136 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
137 sm_per_multiproc = 1;
139 sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
142 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
143 if( compute_perf > max_compute_perf ) {
145 if ( best_SM_arch > 2 ) {
147 if (deviceProp.major == best_SM_arch) {
148 max_compute_perf = compute_perf;
149 max_perf_device = current_device;
152 max_compute_perf = compute_perf;
153 max_perf_device = current_device;
158 return max_perf_device;
162 inline int cutGetMaxGflopsGraphicsDeviceId()
164 int current_device = 0, sm_per_multiproc = 0;
165 int max_compute_perf = 0, max_perf_device = 0;
166 int device_count = 0, best_SM_arch = 0;
168 cudaDeviceProp deviceProp;
170 cudaGetDeviceCount( &device_count );
172 while ( current_device < device_count ) {
173 cudaGetDeviceProperties( &deviceProp, current_device );
175 #if CUDA_VERSION >= 3020
176 if (deviceProp.tccDriver) bTCC = 1;
179 if (deviceProp.name[0] ==
'T') bTCC = 1;
183 if (deviceProp.major > 0 && deviceProp.major < 9999) {
184 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
192 while( current_device < device_count ) {
193 cudaGetDeviceProperties( &deviceProp, current_device );
194 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
195 sm_per_multiproc = 1;
197 sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
200 #if CUDA_VERSION >= 3020
201 if (deviceProp.tccDriver) bTCC = 1;
204 if (deviceProp.name[0] ==
'T') bTCC = 1;
209 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
210 if( compute_perf > max_compute_perf ) {
212 if ( best_SM_arch > 2 ) {
214 if (deviceProp.major == best_SM_arch) {
215 max_compute_perf = compute_perf;
216 max_perf_device = current_device;
219 max_compute_perf = compute_perf;
220 max_perf_device = current_device;
226 return max_perf_device;
232 # ifdef _DEBUG // Do this only in debug mode...
233 inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
235 size_t fmt2_sz = 2048;
236 char *fmt2 = (
char*)malloc(fmt2_sz);
238 va_start(vlist, fmt);
239 while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0)
243 fmt2 = (
char*)malloc(fmt2_sz);
245 OutputDebugStringA(fmt2);
249 # define FPRINTF(a) VSPrintf a
251 # define FPRINTF(a) fprintf a
256 # define FPRINTF(a) fprintf a
259 # define FPRINTF(a) fprintf a
265 inline void __cudaSafeCallNoSync( cudaError err,
const char *file,
const int line )
267 if( cudaSuccess != err) {
268 FPRINTF((stderr,
"%s(%i) : cudaSafeCallNoSync() Runtime API error : %s.\n",
269 file, line, cudaGetErrorString( err) ));
274 inline void __cudaSafeCall( cudaError err,
const char *file,
const int line )
276 if( cudaSuccess != err) {
277 FPRINTF((stderr,
"%s(%i) : cudaSafeCall() Runtime API error : %s.\n",
278 file, line, cudaGetErrorString( err) ));
283 inline void __cudaSafeThreadSync(
const char *file,
const int line )
285 cudaError err = cutilDeviceSynchronize();
286 if ( cudaSuccess != err) {
287 FPRINTF((stderr,
"%s(%i) : cudaDeviceSynchronize() Runtime API error : %s.\n",
288 file, line, cudaGetErrorString( err) ));
293 inline void __cufftSafeCall( cufftResult err,
const char *file,
const int line )
295 if( CUFFT_SUCCESS != err) {
296 FPRINTF((stderr,
"%s(%i) : cufftSafeCall() CUFFT error.\n",
302 inline void __cutilCheckError( CUTBoolean err,
const char *file,
const int line )
304 if( CUTTrue != err) {
305 FPRINTF((stderr,
"%s(%i) : CUTIL CUDA error.\n",
311 inline void __cutilGetLastError(
const char *errorMessage,
const char *file,
const int line )
313 cudaError_t err = cudaGetLastError();
314 if( cudaSuccess != err) {
315 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
316 file, line, errorMessage, cudaGetErrorString( err) ));
321 inline void __cutilGetLastErrorAndSync(
const char *errorMessage,
const char *file,
const int line )
323 cudaError_t err = cudaGetLastError();
324 if( cudaSuccess != err) {
325 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
326 file, line, errorMessage, cudaGetErrorString( err) ));
330 err = cutilDeviceSynchronize();
331 if( cudaSuccess != err) {
332 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : %s.\n",
333 file, line, errorMessage, cudaGetErrorString( err) ));
338 inline void __cutilSafeMalloc(
void *pointer,
const char *file,
const int line )
341 FPRINTF((stderr,
"%s(%i) : cutilSafeMalloc host malloc failure\n",
347 #if __DEVICE_EMULATION__
348 inline int cutilDeviceInit(
int ARGC,
char **ARGV) { }
349 inline int cutilChooseCudaDevice(
int ARGC,
char **ARGV) { }
351 inline int cutilDeviceInit(
int ARGC,
char **ARGV)
354 cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
355 if (deviceCount == 0) {
356 FPRINTF((stderr,
"CUTIL CUDA error: no devices supporting CUDA.\n"));
360 cutGetCmdLineArgumenti(ARGC, (
const char **) ARGV,
"device", &dev);
363 if (dev > deviceCount-1) {
364 fprintf(stderr,
"\n");
365 fprintf(stderr,
">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
366 fprintf(stderr,
">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
367 fprintf(stderr,
"\n");
370 cudaDeviceProp deviceProp;
371 cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
372 if (deviceProp.major < 1) {
373 FPRINTF((stderr,
"cutil error: GPU device does not support CUDA.\n"));
376 printf(
"> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
377 cutilSafeCall(cudaSetDevice(dev));
383 inline int cutilChooseCudaDevice(
int argc,
char **argv)
385 cudaDeviceProp deviceProp;
388 if( cutCheckCmdLineFlag(argc, (
const char**)argv,
"device") ) {
389 devID = cutilDeviceInit(argc, argv);
391 printf(
"exiting...\n");
392 cutilExit(argc, argv);
397 devID = cutGetMaxGflopsDeviceId();
398 cutilSafeCallNoSync( cudaSetDevice( devID ) );
399 cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
400 printf(
"> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
408 inline void cutilCudaCheckCtxLost(
const char *errorMessage,
const char *file,
const int line )
410 cudaError_t err = cudaGetLastError();
411 if( cudaSuccess != err) {
412 FPRINTF((stderr,
"%s(%i) : CUDA error: %s : %s.\n",
413 file, line, errorMessage, cudaGetErrorString( err) ));
416 err = cutilDeviceSynchronize();
417 if( cudaSuccess != err) {
418 FPRINTF((stderr,
"%s(%i) : CUDA error: %s : %s.\n",
419 file, line, errorMessage, cudaGetErrorString( err) ));
426 #define STRCASECMP _stricmp
428 #define STRCASECMP strcasecmp
434 #define STRNCASECMP _strnicmp
436 #define STRNCASECMP strncasecmp
440 inline void __cutilQAFinish(
int argc,
char **argv,
bool bStatus)
442 const char *sStatus[] = {
"FAILED",
"PASSED",
"WAIVED", NULL };
445 for (
int i=1; i < argc; i++) {
446 if (!STRCASECMP(argv[i],
"-qatest") || !STRCASECMP(argv[i],
"-noprompt")) {
452 printf(
"&&&& %s %s", sStatus[bStatus], argv[0]);
453 for (
int i=1; i < argc; i++) printf(
" %s", argv[i]);
455 printf(
"[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
460 inline bool cutilCudaCapabilities(
int major_version,
int minor_version,
int argc,
char **argv)
462 cudaDeviceProp deviceProp;
463 deviceProp.major = 0;
464 deviceProp.minor = 0;
467 #ifdef __DEVICE_EMULATION__
468 printf(
"> Compute Device Emulation Mode \n");
471 cutilSafeCall( cudaGetDevice(&dev) );
472 cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
474 if((deviceProp.major > major_version) ||
475 (deviceProp.major == major_version && deviceProp.minor >= minor_version))
477 printf(
"> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
482 printf(
"There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
483 __cutilQAFinish(argc, argv,
true);
488 #endif // _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_