Point Cloud Library (PCL) 1.12.1
cutil_inline_runtime.h
1/*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
9 *
10 */
11
12#pragma once
13
14#ifdef _WIN32
15#ifdef _DEBUG // Do this only in debug mode...
16# define WINDOWS_LEAN_AND_MEAN
17# include <windows.h>
18# include <stdlib.h>
19# undef min
20# undef max
21#endif
22#endif
23
24#include <stdio.h>
25#include <string.h>
26#include <stdlib.h>
27
28#include <cufft.h>
29
// Call-site wrappers. Each macro forwards to the matching checking function
// and supplies __FILE__ / __LINE__ itself, so callers never pass them by hand.
// The checks stay ordinary inline functions, which keeps them debuggable.

// CUDA runtime API checks
#define cutilSafeCall(err)             __cudaSafeCall(err, __FILE__, __LINE__)
#define cutilSafeCallNoSync(err)       __cudaSafeCallNoSync(err, __FILE__, __LINE__)
#define cutilSafeThreadSync()          __cudaSafeThreadSync(__FILE__, __LINE__)

// CUFFT check
#define cufftSafeCall(err)             __cufftSafeCall(err, __FILE__, __LINE__)

// CUTIL checks
#define cutilCheckError(err)           __cutilCheckError(err, __FILE__, __LINE__)
#define cutilCheckMsg(msg)             __cutilGetLastError(msg, __FILE__, __LINE__)
#define cutilCheckMsgAndSync(msg)      __cutilGetLastErrorAndSync(msg, __FILE__, __LINE__)
#define cutilSafeMalloc(mallocCall)    __cutilSafeMalloc((mallocCall), __FILE__, __LINE__)
#define cutilCondition(val)            __cutilCondition(val, __FILE__, __LINE__)

// Program exit helper (no source location needed)
#define cutilExit(argc, argv)          __cutilExit(argc, argv)
42
43inline cudaError cutilDeviceSynchronize()
44{
45 return cudaDeviceSynchronize();
46}
47
48inline cudaError cutilDeviceReset()
49{
50 return cudaDeviceReset();
51}
52
53inline void __cutilCondition(int val, char *file, int line)
54{
55 if( CUTFalse == cutCheckCondition( val, file, line ) ) {
56 exit(EXIT_FAILURE);
57 }
58}
59
60inline void __cutilExit(int argc, char **argv)
61{
62 if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {
63 printf("\nPress ENTER to exit...\n");
64 fflush( stdout);
65 fflush( stderr);
66 getchar();
67 }
68 exit(EXIT_SUCCESS);
69}
70
// Function-like minimum / maximum.
// Every use of an argument is parenthesized so that operators inside the
// arguments (e.g. `MIN(x & mask, y)` or `MAX(a, c ? p : q)`) cannot re-bind
// against the comparison or the conditional operator.
// NOTE: the selected argument is still evaluated twice; do not pass
// expressions with side effects.
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
73
// Beginning of GPU Architecture definitions

// Maps a compute capability (SM version) to the number of CUDA cores per
// multiprocessor.
//   major, minor : compute capability, e.g. 8 and 6 for SM 8.6
//   returns      : cores per multiprocessor, or -1 (after printing a
//                  diagnostic to stdout) when the version is not in the table
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // One table row per known SM version.
    struct sSMtoCores {
        int SM;    // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores; // CUDA cores per multiprocessor for that version
    };

    // static const: built once instead of on every call.
    static const sSMtoCores nGpuArchCoresPerSM[] = {
        { 0x10,   8 },  // Tesla
        { 0x11,   8 },
        { 0x12,   8 },
        { 0x13,   8 },
        { 0x20,  32 },  // Fermi
        { 0x21,  48 },
        { 0x30, 192 },  // Kepler
        { 0x32, 192 },
        { 0x35, 192 },
        { 0x37, 192 },
        { 0x50, 128 },  // Maxwell
        { 0x52, 128 },
        { 0x53, 128 },
        { 0x60,  64 },  // Pascal
        { 0x61, 128 },
        { 0x62, 128 },
        { 0x70,  64 },  // Volta
        { 0x72,  64 },
        { 0x75,  64 },  // Turing
        { 0x80,  64 },  // Ampere
        { 0x86, 128 },
        { 0x87, 128 },
        { 0x89, 128 },  // Ada
        { 0x90, 128 }   // Hopper
    };
    const int entryCount = (int)(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));

    // Negative components can never match a table key; rejecting them here
    // also avoids left-shifting a negative value.
    if (major >= 0 && minor >= 0) {
        const int smKey = (major << 4) + minor;
        for (int i = 0; i < entryCount; ++i) {
            if (nGpuArchCoresPerSM[i].SM == smKey) {
                return nGpuArchCoresPerSM[i].Cores;
            }
        }
    }

    printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
    return -1;
}
// end of GPU Architecture definitions
104
105// This function returns the best GPU (with maximum GFLOPS)
106inline int cutGetMaxGflopsDeviceId()
107{
108 int current_device = 0;
109 int max_compute_perf = 0;
110 int max_perf_device = 0;
111 int device_count = 0;
112 int best_SM_arch = 0;
113
114 cudaGetDeviceCount( &device_count );
115 // Find the best major SM Architecture GPU device
116 while ( current_device < device_count ) {
117 cudaDeviceProp deviceProp;
118 cudaGetDeviceProperties( &deviceProp, current_device );
119 if (deviceProp.major > 0 && deviceProp.major < 9999) {
120 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
121 }
122 current_device++;
123 }
124
125 // Find the best CUDA capable GPU device
126 current_device = 0;
127 while( current_device < device_count ) {
128 cudaDeviceProp deviceProp;
129 cudaGetDeviceProperties( &deviceProp, current_device );
130 int sm_per_multiproc = (deviceProp.major == 9999 && deviceProp.minor == 9999) ? 1 : _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
131
132 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
133 if( compute_perf > max_compute_perf ) {
134 // If we find GPU with SM major > 2, search only these
135 if ( best_SM_arch > 2 ) {
136 // If our device==dest_SM_arch, choose this, or else pass
137 if (deviceProp.major == best_SM_arch) {
138 max_compute_perf = compute_perf;
139 max_perf_device = current_device;
140 }
141 } else {
142 max_compute_perf = compute_perf;
143 max_perf_device = current_device;
144 }
145 }
146 ++current_device;
147 }
148 return max_perf_device;
149}
150
151// This function returns the best GPU (with maximum GFLOPS)
152inline int cutGetMaxGflopsGraphicsDeviceId()
153{
154 int current_device = 0;
155 int max_compute_perf = 0;
156 int max_perf_device = 0;
157 int device_count = 0;
158 int best_SM_arch = 0;
159 int bTCC = 0;
160
161 cudaGetDeviceCount( &device_count );
162 // Find the best major SM Architecture GPU device that is graphics capable
163 while ( current_device < device_count ) {
164 cudaDeviceProp deviceProp;
165 cudaGetDeviceProperties( &deviceProp, current_device );
166
167 if (deviceProp.tccDriver) bTCC = 1;
168
169 if (!bTCC) {
170 if (deviceProp.major > 0 && deviceProp.major < 9999) {
171 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
172 }
173 }
174 current_device++;
175 }
176
177 // Find the best CUDA capable GPU device
178 current_device = 0;
179 while( current_device < device_count ) {
180 cudaDeviceProp deviceProp;
181 cudaGetDeviceProperties( &deviceProp, current_device );
182 int sm_per_multiproc = (deviceProp.major == 9999 && deviceProp.minor == 9999) ? 1 : _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
183
184 if (deviceProp.tccDriver) bTCC = 1;
185
186 if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
187 {
188 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
189 if( compute_perf > max_compute_perf ) {
190 // If we find GPU with SM major > 2, search only these
191 if ( best_SM_arch > 2 ) {
192 // If our device==dest_SM_arch, choose this, or else pass
193 if (deviceProp.major == best_SM_arch) {
194 max_compute_perf = compute_perf;
195 max_perf_device = current_device;
196 }
197 }
198 else {
199 max_compute_perf = compute_perf;
200 max_perf_device = current_device;
201 }
202 }
203 }
204 ++current_device;
205 }
206 return max_perf_device;
207}
208
209// Give a little more for Windows : the console window often disappears before we can read the message
// FPRINTF((stream, fmt, ...)) -- note the double parentheses -- expands to a
// printf-style call. In Win32 debug builds it goes through VSPrintf(), which
// also sends the text to OutputDebugStringA(); everywhere else it is fprintf.
#ifdef _WIN32
#  if 1//ndef UNICODE
#    ifdef _DEBUG // Do this only in debug mode...
    // Formats the message once into a heap buffer, then writes the same text
    // to the debugger output and to `file`.
    //   file : destination stream
    //   fmt  : printf-style format string, followed by its arguments
    // If the buffer cannot be allocated, or the message does not fit within
    // the size cap, the message is written to `file` only.
    inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
    {
        const size_t max_sz = (size_t)1 << 24; // stop growing at 16 MiB

        for (size_t buf_sz = 2048; buf_sz <= max_sz; buf_sz *= 2) {
            char *buf = (char*)malloc(buf_sz);
            if (!buf) {
                break; // out of memory: use the direct path below
            }

            // The argument list is restarted for every attempt; a va_list
            // must not be reused after it has been consumed.
            va_list args;
            va_start(args, fmt);
            // One byte is reserved: _vsnprintf does not write a terminator
            // when the output fills the count exactly.
            const int written = _vsnprintf(buf, buf_sz - 1, fmt, args);
            va_end(args);

            if (written >= 0) {
                buf[written] = '\0';
                OutputDebugStringA(buf);
                // fputs, not fprintf: the text is already formatted and may
                // itself contain '%' characters.
                fputs(buf, file);
                free(buf);
                return;
            }

            free(buf); // negative result: there wasn't enough room, retry larger
        }

        // Fallback: format straight to the stream.
        va_list args;
        va_start(args, fmt);
        vfprintf(file, fmt, args);
        va_end(args);
    }
#      define FPRINTF(a) VSPrintf a
#    else //debug
#      define FPRINTF(a) fprintf a
#    endif //debug
#  else //unicode
// Unicode case... let's give-up for now and keep basic printf
#    define FPRINTF(a) fprintf a
#  endif //unicode
#else //win32
// For other than Win32
#  define FPRINTF(a) fprintf a
#endif //win32
241
242// NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
243// when the user double clicks on the error line in the Output pane. Like any compile error.
244
245inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
246{
247 if( cudaSuccess != err) {
248 FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error : %s.\n",
249 file, line, cudaGetErrorString( err) ));
250 exit(-1);
251 }
252}
253
254inline void __cudaSafeCall( cudaError err, const char *file, const int line )
255{
256 if( cudaSuccess != err) {
257 FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error : %s.\n",
258 file, line, cudaGetErrorString( err) ));
259 exit(-1);
260 }
261}
262
263inline void __cudaSafeThreadSync( const char *file, const int line )
264{
265 cudaError err = cutilDeviceSynchronize();
266 if ( cudaSuccess != err) {
267 FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error : %s.\n",
268 file, line, cudaGetErrorString( err) ));
269 exit(-1);
270 }
271}
272
273inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
274{
275 if( CUFFT_SUCCESS != err) {
276 FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error.\n",
277 file, line));
278 exit(-1);
279 }
280}
281
282inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
283{
284 if( CUTTrue != err) {
285 FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
286 file, line));
287 exit(-1);
288 }
289}
290
291inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
292{
293 cudaError_t err = cudaGetLastError();
294 if( cudaSuccess != err) {
295 FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
296 file, line, errorMessage, cudaGetErrorString( err) ));
297 exit(-1);
298 }
299}
300
301inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
302{
303 cudaError_t err = cudaGetLastError();
304 if( cudaSuccess != err) {
305 FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
306 file, line, errorMessage, cudaGetErrorString( err) ));
307 exit(-1);
308 }
309
310 err = cutilDeviceSynchronize();
311 if( cudaSuccess != err) {
312 FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : %s.\n",
313 file, line, errorMessage, cudaGetErrorString( err) ));
314 exit(-1);
315 }
316}
317
318inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
319{
320 if( !(pointer)) {
321 FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
322 file, line));
323 exit(-1);
324 }
325}
326
327#if __DEVICE_EMULATION__
// Device-emulation stub: performs no initialization and reports device 0.
// (A non-void function must return a value; falling off the end is undefined.)
inline int cutilDeviceInit(int ARGC, char **ARGV) { (void)ARGC; (void)ARGV; return 0; }
// Device-emulation stub: performs no selection and reports device 0.
// (A non-void function must return a value; falling off the end is undefined.)
inline int cutilChooseCudaDevice(int ARGC, char **ARGV) { (void)ARGC; (void)ARGV; return 0; }
330#else
331 inline int cutilDeviceInit(int ARGC, char **ARGV)
332 {
333 int deviceCount;
334 cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
335 if (deviceCount == 0) {
336 FPRINTF((stderr, "CUTIL CUDA error: no devices supporting CUDA.\n"));
337 exit(-1);
338 }
339 int dev = 0;
340 cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
341 if (dev < 0)
342 dev = 0;
343 if (dev > deviceCount-1) {
344 fprintf(stderr, "\n");
345 fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
346 fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
347 fprintf(stderr, "\n");
348 return -dev;
349 }
350 cudaDeviceProp deviceProp;
351 cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
352 if (deviceProp.major < 1) {
353 FPRINTF((stderr, "cutil error: GPU device does not support CUDA.\n"));
354 exit(-1); \
355 }
356 printf("> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
357 cutilSafeCall(cudaSetDevice(dev));
358
359 return dev;
360 }
361
362 // General initialization call to pick the best CUDA Device
363 inline int cutilChooseCudaDevice(int argc, char **argv)
364 {
365 cudaDeviceProp deviceProp;
366 int devID = 0;
367 // If the command-line has a device number specified, use it
368 if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
369 devID = cutilDeviceInit(argc, argv);
370 if (devID < 0) {
371 printf("exiting...\n");
372 cutilExit(argc, argv);
373 exit(0);
374 }
375 } else {
376 // Otherwise pick the device with highest Gflops/s
377 devID = cutGetMaxGflopsDeviceId();
378 cutilSafeCallNoSync( cudaSetDevice( devID ) );
379 cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
380 printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
381 }
382 return devID;
383 }
384#endif
385
386
387//! Check for CUDA context lost
388inline void cutilCudaCheckCtxLost(const char *errorMessage, const char *file, const int line )
389{
390 cudaError_t err = cudaGetLastError();
391 if( cudaSuccess != err) {
392 FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
393 file, line, errorMessage, cudaGetErrorString( err) ));
394 exit(-1);
395 }
396 err = cutilDeviceSynchronize();
397 if( cudaSuccess != err) {
398 FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
399 file, line, errorMessage, cudaGetErrorString( err) ));
400 exit(-1);
401 }
402}
403
// Case-insensitive string comparison under one name on every platform.
// Each macro is defined only when the name is not already taken.
#if !defined(STRCASECMP)
#  if defined(_WIN32)
#    define STRCASECMP _stricmp
#  else
#    define STRCASECMP strcasecmp
#  endif
#endif

// Length-limited variant of the above.
#if !defined(STRNCASECMP)
#  if defined(_WIN32)
#    define STRNCASECMP _strnicmp
#  else
#    define STRNCASECMP strncasecmp
#  endif
#endif
419
420inline void __cutilQAFinish(int argc, char **argv, bool bStatus)
421{
422 const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
423
424 bool bFlag = false;
425 for (int i=1; i < argc; i++) {
426 if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
427 bFlag |= true;
428 }
429 }
430
431 if (bFlag) {
432 printf("&&&& %s %s", sStatus[bStatus], argv[0]);
433 for (int i=1; i < argc; i++) printf(" %s", argv[i]);
434 }
435 else {
436 printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
437 }
438}
439
440// General check for CUDA GPU SM Capabilities
441inline bool cutilCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
442{
443 cudaDeviceProp deviceProp;
444 deviceProp.major = 0;
445 deviceProp.minor = 0;
446 int dev;
447
448#ifdef __DEVICE_EMULATION__
449 printf("> Compute Device Emulation Mode \n");
450#endif
451
452 cutilSafeCall( cudaGetDevice(&dev) );
453 cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
454
455 if((deviceProp.major > major_version) ||
456 (deviceProp.major == major_version && deviceProp.minor >= minor_version)) {
457 printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
458 return true;
459 }
460 else {
461 printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
462 __cutilQAFinish(argc, argv, true);
463 return false;
464 }
465}