Point Cloud Library (PCL)  1.9.1
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Modules Pages
cutil_inline_runtime.h
1 /*
2  * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3  *
4  * Please refer to the NVIDIA end user license agreement (EULA) associated
5  * with this source code for terms and conditions that govern your use of
6  * this software. Any use, reproduction, disclosure, or distribution of
7  * this software and related documentation outside the terms of the EULA
8  * is strictly prohibited.
9  *
10  */
11 
12 #ifndef _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
13 #define _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
14 
#ifdef _WIN32
#ifdef _DEBUG // Do this only in debug mode...
// WIN32_LEAN_AND_MEAN is the macro that <windows.h> checks to skip rarely used
// APIs. The historical spelling WINDOWS_LEAN_AND_MEAN is still defined below in
// case other code tests for it, but on its own it has no effect on <windows.h>.
#  ifndef WIN32_LEAN_AND_MEAN
#    define WIN32_LEAN_AND_MEAN
#  endif
#  ifndef WINDOWS_LEAN_AND_MEAN
#    define WINDOWS_LEAN_AND_MEAN
#  endif
#  include <windows.h>
#  include <stdlib.h>
// Remove any min/max macros so they cannot clash with functions of the same name.
#  undef min
#  undef max
#endif
#endif
24 
25 #include <stdio.h>
26 #include <string.h>
27 #include <stdlib.h>
28 
29 #include <cufft.h>
30 
// Call-site wrappers. Each macro forwards to the matching inline checker and
// fills in __FILE__ / __LINE__ so the caller does not have to. Because the
// checkers are ordinary inline functions, developers can still step into them
// in a debugger.
#define cutilSafeCallNoSync(status)     __cudaSafeCallNoSync(status, __FILE__, __LINE__)
#define cutilSafeCall(status)           __cudaSafeCall(status, __FILE__, __LINE__)
#define cutilSafeThreadSync()           __cudaSafeThreadSync(__FILE__, __LINE__)
#define cufftSafeCall(status)           __cufftSafeCall(status, __FILE__, __LINE__)
#define cutilCheckError(status)         __cutilCheckError(status, __FILE__, __LINE__)
#define cutilCheckMsg(text)             __cutilGetLastError(text, __FILE__, __LINE__)
#define cutilCheckMsgAndSync(text)      __cutilGetLastErrorAndSync(text, __FILE__, __LINE__)
#define cutilSafeMalloc(allocExpr)      __cutilSafeMalloc((allocExpr), __FILE__, __LINE__)
#define cutilCondition(cond)            __cutilCondition(cond, __FILE__, __LINE__)
#define cutilExit(argCount, argValues)  __cutilExit(argCount, argValues)
43 
44 inline cudaError cutilDeviceSynchronize()
45 {
46 #if CUDART_VERSION >= 4000
47  return cudaDeviceSynchronize();
48 #else
49  return cudaThreadSynchronize();
50 #endif
51 }
52 
53 inline cudaError cutilDeviceReset()
54 {
55 #if CUDART_VERSION >= 4000
56  return cudaDeviceReset();
57 #else
58  return cudaThreadExit();
59 #endif
60 }
61 
62 inline void __cutilCondition(int val, char *file, int line)
63 {
64  if( CUTFalse == cutCheckCondition( val, file, line ) ) {
65  exit(EXIT_FAILURE);
66  }
67 }
68 
69 inline void __cutilExit(int argc, char **argv)
70 {
71  if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {
72  printf("\nPress ENTER to exit...\n");
73  fflush( stdout);
74  fflush( stderr);
75  getchar();
76  }
77  exit(EXIT_SUCCESS);
78 }
79 
// Smaller / larger of two values.
// Every argument use is parenthesised so that operands containing
// lower-precedence operators (e.g. MAX(x & 3, 1)) are compared as written.
// As with any function-like macro, each argument may be evaluated twice, so
// avoid arguments with side effects.
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
82 
// Beginning of GPU Architecture definitions

// Maps a compute capability (major.minor) to the number of cores per SM.
//   major, minor : SM version, e.g. (2, 1) for SM 2.1
// Returns the core count from the table below, or -1 (after printing a
// diagnostic to stdout) when the version is not listed.
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
    typedef struct {
        int SM;     // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    // Entries after SM 2.1 follow the table in NVIDIA's CUDA samples helper_cuda.h.
    static const sSMtoCores nGpuArchCoresPerSM[] =
    { { 0x10,   8 },
      { 0x11,   8 },
      { 0x12,   8 },
      { 0x13,   8 },
      { 0x20,  32 },
      { 0x21,  48 },
      { 0x30, 192 },
      { 0x32, 192 },
      { 0x35, 192 },
      { 0x37, 192 },
      { 0x50, 128 },
      { 0x52, 128 },
      { 0x53, 128 },
      { 0x60,  64 },
      { 0x61, 128 },
      { 0x62, 128 },
      { 0x70,  64 },
      { 0x72,  64 },
      { 0x75,  64 },
      { 0x80,  64 },
      { 0x86, 128 },
      { 0x87, 128 },
      { 0x89, 128 },
      { 0x90, 128 }
    };
    const int tableSize = (int)(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));

    // Only encode versions that fit the 0xMm scheme; this also avoids shifting
    // a negative value.
    if (major >= 0 && minor >= 0 && minor <= 0xF) {
        const int smCode = (major << 4) + minor;
        for (int i = 0; i < tableSize; ++i) {
            if (nGpuArchCoresPerSM[i].SM == smCode) {
                return nGpuArchCoresPerSM[i].Cores;
            }
        }
    }

    printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
    return -1;
}
// end of GPU Architecture definitions
113 
114 // This function returns the best GPU (with maximum GFLOPS)
115 inline int cutGetMaxGflopsDeviceId()
116 {
117  int current_device = 0, sm_per_multiproc = 0;
118  int max_compute_perf = 0, max_perf_device = 0;
119  int device_count = 0, best_SM_arch = 0;
120  cudaDeviceProp deviceProp;
121 
122  cudaGetDeviceCount( &device_count );
123  // Find the best major SM Architecture GPU device
124  while ( current_device < device_count ) {
125  cudaGetDeviceProperties( &deviceProp, current_device );
126  if (deviceProp.major > 0 && deviceProp.major < 9999) {
127  best_SM_arch = MAX(best_SM_arch, deviceProp.major);
128  }
129  current_device++;
130  }
131 
132  // Find the best CUDA capable GPU device
133  current_device = 0;
134  while( current_device < device_count ) {
135  cudaGetDeviceProperties( &deviceProp, current_device );
136  if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
137  sm_per_multiproc = 1;
138  } else {
139  sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
140  }
141 
142  int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
143  if( compute_perf > max_compute_perf ) {
144  // If we find GPU with SM major > 2, search only these
145  if ( best_SM_arch > 2 ) {
146  // If our device==dest_SM_arch, choose this, or else pass
147  if (deviceProp.major == best_SM_arch) {
148  max_compute_perf = compute_perf;
149  max_perf_device = current_device;
150  }
151  } else {
152  max_compute_perf = compute_perf;
153  max_perf_device = current_device;
154  }
155  }
156  ++current_device;
157  }
158  return max_perf_device;
159 }
160 
161 // This function returns the best GPU (with maximum GFLOPS)
162 inline int cutGetMaxGflopsGraphicsDeviceId()
163 {
164  int current_device = 0, sm_per_multiproc = 0;
165  int max_compute_perf = 0, max_perf_device = 0;
166  int device_count = 0, best_SM_arch = 0;
167  int bTCC = 0;
168  cudaDeviceProp deviceProp;
169 
170  cudaGetDeviceCount( &device_count );
171  // Find the best major SM Architecture GPU device that is graphics capable
172  while ( current_device < device_count ) {
173  cudaGetDeviceProperties( &deviceProp, current_device );
174 
175 #if CUDA_VERSION >= 3020
176  if (deviceProp.tccDriver) bTCC = 1;
177 #else
178  // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
179  if (deviceProp.name[0] == 'T') bTCC = 1;
180 #endif
181 
182  if (!bTCC) {
183  if (deviceProp.major > 0 && deviceProp.major < 9999) {
184  best_SM_arch = MAX(best_SM_arch, deviceProp.major);
185  }
186  }
187  current_device++;
188  }
189 
190  // Find the best CUDA capable GPU device
191  current_device = 0;
192  while( current_device < device_count ) {
193  cudaGetDeviceProperties( &deviceProp, current_device );
194  if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
195  sm_per_multiproc = 1;
196  } else {
197  sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
198  }
199 
200 #if CUDA_VERSION >= 3020
201  if (deviceProp.tccDriver) bTCC = 1;
202 #else
203  // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
204  if (deviceProp.name[0] == 'T') bTCC = 1;
205 #endif
206 
207  if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
208  {
209  int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
210  if( compute_perf > max_compute_perf ) {
211  // If we find GPU with SM major > 2, search only these
212  if ( best_SM_arch > 2 ) {
213  // If our device==dest_SM_arch, choose this, or else pass
214  if (deviceProp.major == best_SM_arch) {
215  max_compute_perf = compute_perf;
216  max_perf_device = current_device;
217  }
218  } else {
219  max_compute_perf = compute_perf;
220  max_perf_device = current_device;
221  }
222  }
223  }
224  ++current_device;
225  }
226  return max_perf_device;
227 }
228 
// Give a little more for Windows : the console window often disappears before we can read the message
#ifdef _WIN32
# if 1//ndef UNICODE
#  ifdef _DEBUG // Do this only in debug mode...
#   include <stdarg.h>
    // printf-style helper for Windows debug builds: formats the message once,
    // sends it to the debugger via OutputDebugStringA() and writes the same
    // text to `file`.
    //   file : destination stream
    //   fmt  : printf-style format string, followed by its arguments
    // If the scratch buffer cannot be allocated, the message is written to
    // `file` only.
    inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
    {
        size_t bufSize = 2048;
        char  *buf     = NULL;

        for (;;) {
            buf = (char*)malloc(bufSize);
            if (!buf) {
                // Out of memory: still deliver the message to the stream.
                va_list fallbackArgs;
                va_start(fallbackArgs, fmt);
                vfprintf(file, fmt, fallbackArgs);
                va_end(fallbackArgs);
                return;
            }

            // A va_list must not be reused after a v*printf call, so restart
            // it for every attempt.
            va_list args;
            va_start(args, fmt);
            // Leave one byte spare: _vsnprintf does not write a terminator
            // when the output exactly fills the count it was given.
            const int written = _vsnprintf(buf, bufSize - 1, fmt, args);
            va_end(args);

            if (written >= 0) {
                buf[written] = '\0';
                break;
            }

            // Negative result means there wasn't enough room: retry with twice the space.
            free(buf);
            bufSize *= 2;
        }

        OutputDebugStringA(buf);
        // The text is already formatted; write it verbatim so that any '%'
        // it contains is not interpreted as a conversion specifier.
        fputs(buf, file);
        free(buf);
    }
#   define FPRINTF(a) VSPrintf a
#  else //debug
#   define FPRINTF(a) fprintf a
// For other than Win32
#  endif //debug
# else //unicode
// Unicode case... let's give-up for now and keep basic printf
#  define FPRINTF(a) fprintf a
# endif //unicode
#else //win32
# define FPRINTF(a) fprintf a
#endif //win32
261 
262 // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
263 // when the user double clicks on the error line in the Output pane. Like any compile error.
264 
265 inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
266 {
267  if( cudaSuccess != err) {
268  FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error : %s.\n",
269  file, line, cudaGetErrorString( err) ));
270  exit(-1);
271  }
272 }
273 
274 inline void __cudaSafeCall( cudaError err, const char *file, const int line )
275 {
276  if( cudaSuccess != err) {
277  FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error : %s.\n",
278  file, line, cudaGetErrorString( err) ));
279  exit(-1);
280  }
281 }
282 
283 inline void __cudaSafeThreadSync( const char *file, const int line )
284 {
285  cudaError err = cutilDeviceSynchronize();
286  if ( cudaSuccess != err) {
287  FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error : %s.\n",
288  file, line, cudaGetErrorString( err) ));
289  exit(-1);
290  }
291 }
292 
293 inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
294 {
295  if( CUFFT_SUCCESS != err) {
296  FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error.\n",
297  file, line));
298  exit(-1);
299  }
300 }
301 
302 inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
303 {
304  if( CUTTrue != err) {
305  FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
306  file, line));
307  exit(-1);
308  }
309 }
310 
311 inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
312 {
313  cudaError_t err = cudaGetLastError();
314  if( cudaSuccess != err) {
315  FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
316  file, line, errorMessage, cudaGetErrorString( err) ));
317  exit(-1);
318  }
319 }
320 
321 inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
322 {
323  cudaError_t err = cudaGetLastError();
324  if( cudaSuccess != err) {
325  FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
326  file, line, errorMessage, cudaGetErrorString( err) ));
327  exit(-1);
328  }
329 
330  err = cutilDeviceSynchronize();
331  if( cudaSuccess != err) {
332  FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : %s.\n",
333  file, line, errorMessage, cudaGetErrorString( err) ));
334  exit(-1);
335  }
336 }
337 
338 inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
339 {
340  if( !(pointer)) {
341  FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
342  file, line));
343  exit(-1);
344  }
345 }
346 
347 #if __DEVICE_EMULATION__
// Device-emulation stub: there is nothing to initialise, so report device 0.
// A non-void function must return a value; the arguments are intentionally unused.
inline int cutilDeviceInit(int ARGC, char **ARGV) { (void)ARGC; (void)ARGV; return 0; }
// Device-emulation stub: there is nothing to choose from, so report device 0.
// A non-void function must return a value; the arguments are intentionally unused.
inline int cutilChooseCudaDevice(int ARGC, char **ARGV) { (void)ARGC; (void)ARGV; return 0; }
350 #else
351  inline int cutilDeviceInit(int ARGC, char **ARGV)
352  {
353  int deviceCount;
354  cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
355  if (deviceCount == 0) {
356  FPRINTF((stderr, "CUTIL CUDA error: no devices supporting CUDA.\n"));
357  exit(-1);
358  }
359  int dev = 0;
360  cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
361  if (dev < 0)
362  dev = 0;
363  if (dev > deviceCount-1) {
364  fprintf(stderr, "\n");
365  fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
366  fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
367  fprintf(stderr, "\n");
368  return -dev;
369  }
370  cudaDeviceProp deviceProp;
371  cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
372  if (deviceProp.major < 1) {
373  FPRINTF((stderr, "cutil error: GPU device does not support CUDA.\n"));
374  exit(-1); \
375  }
376  printf("> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
377  cutilSafeCall(cudaSetDevice(dev));
378 
379  return dev;
380  }
381 
382  // General initialization call to pick the best CUDA Device
383  inline int cutilChooseCudaDevice(int argc, char **argv)
384  {
385  cudaDeviceProp deviceProp;
386  int devID = 0;
387  // If the command-line has a device number specified, use it
388  if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
389  devID = cutilDeviceInit(argc, argv);
390  if (devID < 0) {
391  printf("exiting...\n");
392  cutilExit(argc, argv);
393  exit(0);
394  }
395  } else {
396  // Otherwise pick the device with highest Gflops/s
397  devID = cutGetMaxGflopsDeviceId();
398  cutilSafeCallNoSync( cudaSetDevice( devID ) );
399  cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
400  printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
401  }
402  return devID;
403  }
404 #endif
405 
406 
407 //! Check for CUDA context lost
408 inline void cutilCudaCheckCtxLost(const char *errorMessage, const char *file, const int line )
409 {
410  cudaError_t err = cudaGetLastError();
411  if( cudaSuccess != err) {
412  FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
413  file, line, errorMessage, cudaGetErrorString( err) ));
414  exit(-1);
415  }
416  err = cutilDeviceSynchronize();
417  if( cudaSuccess != err) {
418  FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
419  file, line, errorMessage, cudaGetErrorString( err) ));
420  exit(-1);
421  }
422 }
423 
// Case-insensitive string comparison under one spelling per platform.
// Each macro is only defined when the including code has not already
// provided its own definition.
#ifdef _WIN32
#  ifndef STRCASECMP
#    define STRCASECMP _stricmp
#  endif
#  ifndef STRNCASECMP
#    define STRNCASECMP _strnicmp
#  endif
#else
#  ifndef STRCASECMP
#    define STRCASECMP strcasecmp
#  endif
#  ifndef STRNCASECMP
#    define STRNCASECMP strncasecmp
#  endif
#endif
439 
440 inline void __cutilQAFinish(int argc, char **argv, bool bStatus)
441 {
442  const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
443 
444  bool bFlag = false;
445  for (int i=1; i < argc; i++) {
446  if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
447  bFlag |= true;
448  }
449  }
450 
451  if (bFlag) {
452  printf("&&&& %s %s", sStatus[bStatus], argv[0]);
453  for (int i=1; i < argc; i++) printf(" %s", argv[i]);
454  } else {
455  printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
456  }
457 }
458 
459 // General check for CUDA GPU SM Capabilities
460 inline bool cutilCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
461 {
462  cudaDeviceProp deviceProp;
463  deviceProp.major = 0;
464  deviceProp.minor = 0;
465  int dev;
466 
467 #ifdef __DEVICE_EMULATION__
468  printf("> Compute Device Emulation Mode \n");
469 #endif
470 
471  cutilSafeCall( cudaGetDevice(&dev) );
472  cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
473 
474  if((deviceProp.major > major_version) ||
475  (deviceProp.major == major_version && deviceProp.minor >= minor_version))
476  {
477  printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
478  return true;
479  }
480  else
481  {
482  printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
483  __cutilQAFinish(argc, argv, true);
484  return false;
485  }
486 }
487 
488 #endif // _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_