Point Cloud Library (PCL)  1.9.1
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Modules Pages
cutil_inline_drvapi.h
1 /*
2  * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3  *
4  * Please refer to the NVIDIA end user license agreement (EULA) associated
5  * with this source code for terms and conditions that govern your use of
6  * this software. Any use, reproduction, disclosure, or distribution of
7  * this software and related documentation outside the terms of the EULA
8  * is strictly prohibited.
9  *
10  */
11 
12 #ifndef _CUTIL_INLINE_FUNCTIONS_DRVAPI_H_
13 #define _CUTIL_INLINE_FUNCTIONS_DRVAPI_H_
14 
15 #include <stdio.h>
16 #include <string.h>
17 #include <stdlib.h>
18 
19 
// These wrappers are defined here so that callers do not have to pass __FILE__ and __LINE__ themselves.
// The checks are implemented as inline functions rather than macro bodies, so developers can step into them when debugging.
// Error-checking wrappers. Each one forwards to the inline helper of the
// matching name and appends the call site (__FILE__, __LINE__) so that a
// failure message can say where the check was made.
#define cutilDrvSafeCallNoSync(err)  __cuSafeCallNoSync (err, __FILE__, __LINE__)
#define cutilDrvSafeCall(err)        __cuSafeCall (err, __FILE__, __LINE__)
#define cutilDrvCtxSync()            __cuCtxSync (__FILE__, __LINE__)
#define cutilDrvCheckMsg(msg)        __cuCheckMsg (msg, __FILE__, __LINE__)

// Rounds `offset` up, in place, to the next multiple of `alignment` and
// yields the new value. The mask arithmetic is only meaningful when
// `alignment` is a power of two.
//   offset    - modifiable lvalue; it is read and then assigned
//   alignment - alignment in the same units as `offset`
// Both arguments are parenthesized so that expressions such as `1 << 3`
// expand with the intended precedence. Each argument is evaluated twice,
// so neither should have side effects.
#define cutilDrvAlignOffset(offset, alignment) \
    ( (offset) = ((offset) + ((alignment) - 1)) & ~((alignment) - 1) )
27 
28 // These are the inline versions for all of the CUTIL functions
29 inline void __cuSafeCallNoSync( CUresult err, const char *file, const int line )
30 {
31  if( CUDA_SUCCESS != err) {
32  fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
33  err, file, line );
34  exit(-1);
35  }
36 }
37 inline void __cuSafeCall( CUresult err, const char *file, const int line )
38 {
39  __cuSafeCallNoSync( err, file, line );
40 }
41 
42 inline void __cuCtxSync(const char *file, const int line )
43 {
44  CUresult err = cuCtxSynchronize();
45  if( CUDA_SUCCESS != err ) {
46  fprintf(stderr, "cuCtxSynchronize() API error = %04d in file <%s>, line %i.\n",
47  err, file, line );
48  exit(-1);
49  }
50 }
51 
// Smaller / larger of two values.
// Every use of an argument is parenthesized so that operands containing
// lower-precedence operators (for example `x & mask`) are compared as
// written. The selected argument is evaluated twice, so arguments should not
// have side effects (e.g. `i++`).
// Each macro is only defined when no definition exists yet, which avoids a
// conflicting redefinition if another header has already provided it.
#ifndef MIN
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef MAX
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif
54 
55 // Beginning of GPU Architecture definitions
// Maps an SM (compute capability) version to the number of cores per
// multiprocessor listed in the table below.
//   major - SM major version
//   minor - SM minor version
// Returns the core count for a listed version. For any version that is not
// listed, a message is printed to stdout and -1 is returned.
//
// Declared `static inline` so that every translation unit including this
// header gets its own copy and no separate external definition is needed.
static inline int _ConvertSMVer2CoresDrvApi(int major, int minor)
{
    // One table row per known SM version.
    typedef struct {
        int SM;     // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
        int Cores;  // cores per multiprocessor for that version
    } sSMtoCores;

    static const sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x10,  8 },
        { 0x11,  8 },
        { 0x12,  8 },
        { 0x13,  8 },
        { 0x20, 32 },
        { 0x21, 48 }
    };
    const int numArchs = (int)(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));

    // The key stores each version component in a single hex digit, so only
    // components in 0..15 can be represented. Anything outside that range is
    // rejected up front: packing it would either alias a different entry
    // (1.16 would pack to 0x20 and look like 2.0) or shift a negative value.
    if (major >= 0 && major <= 0xF && minor >= 0 && minor <= 0xF) {
        const int key = (major << 4) + minor;
        for (int i = 0; i < numArchs; ++i) {
            if (nGpuArchCoresPerSM[i].SM == key) {
                return nGpuArchCoresPerSM[i].Cores;
            }
        }
    }

    printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
    return -1;
}
84 // end of GPU Architecture definitions
85 
86 // This function returns the best GPU based on performance
87 inline int cutilDrvGetMaxGflopsDeviceId()
88 {
89  CUdevice current_device = 0, max_perf_device = 0;
90  int device_count = 0, sm_per_multiproc = 0;
91  int max_compute_perf = 0, best_SM_arch = 0;
92  int major = 0, minor = 0, multiProcessorCount, clockRate;
93 
94  cuInit(0);
95  cutilDrvSafeCallNoSync(cuDeviceGetCount(&device_count));
96 
97  // Find the best major SM Architecture GPU device
98  while ( current_device < device_count ) {
99  cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, current_device ) );
100  if (major > 0 && major < 9999) {
101  best_SM_arch = MAX(best_SM_arch, major);
102  }
103  current_device++;
104  }
105 
106  // Find the best CUDA capable GPU device
107  current_device = 0;
108  while( current_device < device_count ) {
109  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &multiProcessorCount,
110  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
111  current_device ) );
112  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &clockRate,
113  CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
114  current_device ) );
115  cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, current_device ) );
116 
117  if (major == 9999 && minor == 9999) {
118  sm_per_multiproc = 1;
119  } else {
120  sm_per_multiproc = _ConvertSMVer2CoresDrvApi(major, minor);
121  }
122 
123  int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
124  if( compute_perf > max_compute_perf ) {
125  // If we find GPU with SM major > 2, search only these
126  if ( best_SM_arch > 2 ) {
127  // If our device==dest_SM_arch, choose this, or else pass
128  if (major == best_SM_arch) {
129  max_compute_perf = compute_perf;
130  max_perf_device = current_device;
131  }
132  } else {
133  max_compute_perf = compute_perf;
134  max_perf_device = current_device;
135  }
136  }
137  ++current_device;
138  }
139  return max_perf_device;
140 }
141 
142 // This function returns the best Graphics GPU based on performance
143 inline int cutilDrvGetMaxGflopsGraphicsDeviceId()
144 {
145  CUdevice current_device = 0, max_perf_device = 0;
146  int device_count = 0, sm_per_multiproc = 0;
147  int max_compute_perf = 0, best_SM_arch = 0;
148  int major = 0, minor = 0, multiProcessorCount, clockRate;
149  int bTCC = 0;
150  char deviceName[256];
151 
152  cuInit(0);
153  cutilDrvSafeCallNoSync(cuDeviceGetCount(&device_count));
154 
155  // Find the best major SM Architecture GPU device that are graphics devices
156  while ( current_device < device_count ) {
157  cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, 256, current_device) );
158  cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, current_device ) );
159 
160 #if CUDA_VERSION >= 3020
161  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device ) );
162 #else
163  // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
164  if (deviceName[0] == 'T') bTCC = 1;
165 #endif
166  if (!bTCC) {
167  if (major > 0 && major < 9999) {
168  best_SM_arch = MAX(best_SM_arch, major);
169  }
170  }
171  current_device++;
172  }
173 
174  // Find the best CUDA capable GPU device
175  current_device = 0;
176  while( current_device < device_count ) {
177  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &multiProcessorCount,
178  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
179  current_device ) );
180  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &clockRate,
181  CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
182  current_device ) );
183  cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, current_device ) );
184 
185 #if CUDA_VERSION >= 3020
186  cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device ) );
187 #else
188  // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
189  if (deviceName[0] == 'T') bTCC = 1;
190 #endif
191 
192  if (major == 9999 && minor == 9999) {
193  sm_per_multiproc = 1;
194  } else {
195  sm_per_multiproc = _ConvertSMVer2CoresDrvApi(major, minor);
196  }
197 
198  // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contender
199  if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
200  {
201  int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
202  if( compute_perf > max_compute_perf ) {
203  // If we find GPU with SM major > 2, search only these
204  if ( best_SM_arch > 2 ) {
205  // If our device = dest_SM_arch, then we pick this one
206  if (major == best_SM_arch) {
207  max_compute_perf = compute_perf;
208  max_perf_device = current_device;
209  }
210  } else {
211  max_compute_perf = compute_perf;
212  max_perf_device = current_device;
213  }
214  }
215  }
216  ++current_device;
217  }
218  return max_perf_device;
219 }
220 
221 inline void __cuCheckMsg( const char * msg, const char *file, const int line )
222 {
223  CUresult err = cuCtxSynchronize();
224  if( CUDA_SUCCESS != err) {
225  fprintf(stderr, "cutilDrvCheckMsg -> %s", msg);
226  fprintf(stderr, "cutilDrvCheckMsg -> cuCtxSynchronize API error = %04d in file <%s>, line %i.\n",
227  err, file, line );
228  exit(-1);
229  }
230 }
231 
232 
233 #if __DEVICE_EMULATION__
234  inline int cutilDeviceInitDrv(int ARGC, char **ARGV) { }
235 #else
236  inline int cutilDeviceInitDrv(int ARGC, char ** ARGV)
237  {
238  int cuDevice = 0;
239  int deviceCount = 0;
240  CUresult err = cuInit(0);
241  if (CUDA_SUCCESS == err)
242  cutilDrvSafeCallNoSync(cuDeviceGetCount(&deviceCount));
243  if (deviceCount == 0) {
244  fprintf(stderr, "CUTIL DeviceInitDrv error: no devices supporting CUDA\n");
245  exit(-1);
246  }
247  int dev = 0;
248  cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
249  if (dev < 0) dev = 0;
250  if (dev > deviceCount-1) {
251  fprintf(stderr, "\n");
252  fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
253  fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
254  fprintf(stderr, "\n");
255  return -dev;
256  }
257  cutilDrvSafeCallNoSync(cuDeviceGet(&cuDevice, dev));
258  char name[100];
259  cuDeviceGetName(name, 100, cuDevice);
260  if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) {
261  printf("> Using CUDA Device [%d]: %s\n", dev, name);
262  }
263  return dev;
264  }
265 #endif
266 
267  // General initialization call to pick the best CUDA Device
268 #if __DEVICE_EMULATION__
269  inline CUdevice cutilChooseCudaDeviceDrv(int argc, char **argv, int *p_devID)
270 #else
271  inline CUdevice cutilChooseCudaDeviceDrv(int argc, char **argv, int *p_devID)
272  {
273  CUdevice cuDevice;
274  int devID = 0;
275  // If the command-line has a device number specified, use it
276  if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
277  devID = cutilDeviceInitDrv(argc, argv);
278  if (devID < 0) {
279  printf("exiting...\n");
280  exit(0);
281  }
282  } else {
283  // Otherwise pick the device with highest Gflops/s
284  char name[100];
285  devID = cutilDrvGetMaxGflopsDeviceId();
286  cutilDrvSafeCallNoSync(cuDeviceGet(&cuDevice, devID));
287  cuDeviceGetName(name, 100, cuDevice);
288  printf("> Using CUDA Device [%d]: %s\n", devID, name);
289  }
290  cuDeviceGet(&cuDevice, devID);
291  if (p_devID) *p_devID = devID;
292  return cuDevice;
293  }
294 #endif
295 
296 
297 //! Check for CUDA context lost
298 inline void cutilDrvCudaCheckCtxLost(const char *errorMessage, const char *file, const int line )
299 {
300  CUresult err = cuCtxSynchronize();
301  if( CUDA_ERROR_INVALID_CONTEXT != err) {
302  fprintf(stderr, "Cuda error: %s in file '%s' in line %i\n",
303  errorMessage, file, line );
304  exit(-1);
305  }
306  err = cuCtxSynchronize();
307  if( CUDA_SUCCESS != err) {
308  fprintf(stderr, "Cuda error: %s in file '%s' in line %i\n",
309  errorMessage, file, line );
310  exit(-1);
311  }
312 }
313 
// Portable names for the case-insensitive string comparison routines.
// On Windows (_WIN32) they map to _stricmp / _strnicmp, elsewhere to
// strcasecmp / strncasecmp. A definition that already exists is left as is.
#if !defined(STRCASECMP)
#  if defined(_WIN32)
#    define STRCASECMP _stricmp
#  else
#    define STRCASECMP strcasecmp
#  endif
#endif

#if !defined(STRNCASECMP)
#  if defined(_WIN32)
#    define STRNCASECMP _strnicmp
#  else
#    define STRNCASECMP strncasecmp
#  endif
#endif
329 
330 inline void __cutilDrvQAFinish(int argc, char **argv, bool bStatus)
331 {
332  const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
333 
334  bool bFlag = false;
335  for (int i=1; i < argc; i++) {
336  if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
337  bFlag |= true;
338  }
339  }
340 
341  if (bFlag) {
342  printf("&&&& %s %s", sStatus[bStatus], argv[0]);
343  for (int i=1; i < argc; i++) printf(" %s", argv[i]);
344  } else {
345  printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
346  }
347 }
348 
349 // General check for CUDA GPU SM Capabilities for a specific device #
350 inline bool cutilDrvCudaDevCapabilities(int major_version, int minor_version, int deviceNum, int argc, char** argv)
351 {
352  int major, minor, dev;
353  char device_name[256];
354 
355 #ifdef __DEVICE_EMULATION__
356  printf("> Compute Device Emulation Mode \n");
357 #endif
358 
359  cutilDrvSafeCallNoSync( cuDeviceGet(&dev, deviceNum) );
360  cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, dev));
361  cutilDrvSafeCallNoSync( cuDeviceGetName(device_name, 256, dev) );
362 
363  if((major > major_version) ||
364  (major == major_version && minor >= minor_version))
365  {
366  printf("> Device %d: < %s >, Compute SM %d.%d detected\n", dev, device_name, major, minor);
367  return true;
368  }
369  else
370  {
371  printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
372  __cutilDrvQAFinish(argc, argv, true);
373  return false;
374  }
375 }
376 
377 // General check for CUDA GPU SM Capabilities
378 inline bool cutilDrvCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
379 {
380  return cutilDrvCudaDevCapabilities(major_version, minor_version, 0, argc, argv);
381 }
382 
383 
384 #endif // _CUTIL_INLINE_FUNCTIONS_DRVAPI_H_