Point Cloud Library (PCL) 1.12.1
cutil_inline_drvapi.h
1/*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
9 *
10 */
11
12#pragma once
13
14#include <stdio.h>
15#include <string.h>
16#include <stdlib.h>
17
18
// Convenience wrappers that capture the call site automatically, so callers
// do not have to pass __FILE__ and __LINE__ themselves. The actual checks live
// in inline functions (rather than in the macros) so that they can be stepped
// through in a debugger.
#define cutilDrvSafeCallNoSync(err)     __cuSafeCallNoSync (err, __FILE__, __LINE__)
#define cutilDrvSafeCall(err)           __cuSafeCall       (err, __FILE__, __LINE__)
#define cutilDrvCtxSync()               __cuCtxSync        (__FILE__, __LINE__)
#define cutilDrvCheckMsg(msg)           __cuCheckMsg       (msg, __FILE__, __LINE__)

// Rounds `offset` up, in place, to the next multiple of `alignment`.
// The mask arithmetic is only correct when `alignment` is a power of two.
// Every argument use is parenthesized so that expressions such as `1 << 3`
// expand with the intended precedence; note that both arguments are still
// evaluated more than once, so avoid arguments with side effects.
#define cutilDrvAlignOffset(offset, alignment) \
    ( (offset) = ((offset) + ((alignment) - 1)) & ~((alignment) - 1) )
26
27// These are the inline versions for all of the CUTIL functions
28inline void __cuSafeCallNoSync( CUresult err, const char *file, const int line )
29{
30 if( CUDA_SUCCESS != err) {
31 fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
32 err, file, line );
33 exit(-1);
34 }
35}
36inline void __cuSafeCall( CUresult err, const char *file, const int line )
37{
38 __cuSafeCallNoSync( err, file, line );
39}
40
41inline void __cuCtxSync(const char *file, const int line )
42{
43 CUresult err = cuCtxSynchronize();
44 if( CUDA_SUCCESS != err ) {
45 fprintf(stderr, "cuCtxSynchronize() API error = %04d in file <%s>, line %i.\n",
46 err, file, line );
47 exit(-1);
48 }
49}
50
// Two-argument minimum / maximum.
// Each use of an argument is parenthesized so that operands containing
// lower-precedence operators (for example `x & mask`) group as written.
// An argument may be evaluated twice, so avoid operands with side effects.
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
53
54// Beginning of GPU Architecture definitions
// Maps a compute capability (SM version) to the number of CUDA cores per
// multiprocessor.
//
//   major, minor : compute capability of the device, e.g. 2 and 1 for SM 2.1
//   returns      : cores per multiprocessor, or -1 when the SM version is not
//                  listed (a diagnostic is printed to stdout in that case)
inline int _ConvertSMVer2CoresDrvApi(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    struct sSMtoCores {
        int SM;     // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    };

    // Immutable lookup table; static so it is not rebuilt on every call.
    static const struct sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x10,   8 },
        { 0x11,   8 },
        { 0x12,   8 },
        { 0x13,   8 },
        { 0x20,  32 },
        { 0x21,  48 },
        { 0x30, 192 },
        { 0x32, 192 },
        { 0x35, 192 },
        { 0x37, 192 },
        { 0x50, 128 },
        { 0x52, 128 },
        { 0x53, 128 },
        { 0x60,  64 },
        { 0x61, 128 },
        { 0x62, 128 },
        { 0x70,  64 },
        { 0x72,  64 },
        { 0x75,  64 },
        { 0x80,  64 },
        { 0x86, 128 },
        { 0x87, 128 },
        { 0x89, 128 },
        { 0x90, 128 }
    };
    const int numEntries = (int)(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));

    // Encode the requested version the same way the table does.
    const int smVersion = (major << 4) + minor;

    for (int i = 0; i < numEntries; ++i) {
        if (nGpuArchCoresPerSM[i].SM == smVersion) {
            return nGpuArchCoresPerSM[i].Cores;
        }
    }

    printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
    return -1;
}
83// end of GPU Architecture definitions
84
85// This function returns the best GPU based on performance
86inline int cutilDrvGetMaxGflopsDeviceId()
87{
88 CUdevice current_device = 0;
89 CUdevice max_perf_device = 0;
90 int device_count = 0;
91 int max_compute_perf = 0;
92 int best_SM_arch = 0;
93
94 cuInit(0);
95 cutilDrvSafeCallNoSync(cuDeviceGetCount(&device_count));
96
97 // Find the best major SM Architecture GPU device
98 while ( current_device < device_count ) {
99 int major = 0;
100 int minor = 0;
101 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
102 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
103 if (major > 0 && major < 9999) {
104 best_SM_arch = MAX(best_SM_arch, major);
105 }
106 current_device++;
107 }
108
109 // Find the best CUDA capable GPU device
110 current_device = 0;
111 while( current_device < device_count ) {
112 int multiProcessorCount;
113 int clockRate;
114 int major = 0;
115 int minor = 0;
116 cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &multiProcessorCount,
117 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
118 current_device ) );
119 cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &clockRate,
120 CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
121 current_device ) );
122 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
123 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
124
125 int sm_per_multiproc = (major == 9999 && minor == 9999) ? 1 : _ConvertSMVer2CoresDrvApi(major, minor);
126
127 int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
128 if( compute_perf > max_compute_perf ) {
129 // If we find GPU with SM major > 2, search only these
130 if ( best_SM_arch > 2 ) {
131 // If our device==dest_SM_arch, choose this, or else pass
132 if (major == best_SM_arch) {
133 max_compute_perf = compute_perf;
134 max_perf_device = current_device;
135 }
136 }
137 else {
138 max_compute_perf = compute_perf;
139 max_perf_device = current_device;
140 }
141 }
142 ++current_device;
143 }
144 return max_perf_device;
145}
146
147// This function returns the best Graphics GPU based on performance
148inline int cutilDrvGetMaxGflopsGraphicsDeviceId()
149{
150 CUdevice current_device = 0;
151 CUdevice max_perf_device = 0;
152 int device_count = 0;
153 int max_compute_perf = 0;
154 int best_SM_arch = 0;
155
156 cuInit(0);
157 cutilDrvSafeCallNoSync(cuDeviceGetCount(&device_count));
158
159 // Find the best major SM Architecture GPU device that are graphics devices
160 while ( current_device < device_count ) {
161 char deviceName[256];
162 int major = 0;
163 int minor = 0;
164 int bTCC = 0;
165 cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, 256, current_device) );
166 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
167 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
168 cutilDrvSafeCallNoSync( cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device) );
169
170 if (!bTCC) {
171 if (major > 0 && major < 9999) {
172 best_SM_arch = MAX(best_SM_arch, major);
173 }
174 }
175 current_device++;
176 }
177
178 // Find the best CUDA capable GPU device
179 current_device = 0;
180 while( current_device < device_count ) {
181 int multiProcessorCount;
182 int clockRate;
183 int major = 0;
184 int minor = 0;
185 int bTCC = 0;
186 cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &multiProcessorCount,
187 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
188 current_device ) );
189 cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &clockRate,
190 CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
191 current_device ) );
192 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
193 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
194
195 cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device ) );
196
197 int sm_per_multiproc = (major == 9999 && minor == 9999) ? 1 : _ConvertSMVer2CoresDrvApi(major, minor);
198
199 // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contender
200 if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
201 {
202 int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
203 if( compute_perf > max_compute_perf ) {
204 // If we find GPU with SM major > 2, search only these
205 if ( best_SM_arch > 2 ) {
206 // If our device = dest_SM_arch, then we pick this one
207 if (major == best_SM_arch) {
208 max_compute_perf = compute_perf;
209 max_perf_device = current_device;
210 }
211 }
212 else {
213 max_compute_perf = compute_perf;
214 max_perf_device = current_device;
215 }
216 }
217 }
218 ++current_device;
219 }
220 return max_perf_device;
221}
222
223inline void __cuCheckMsg( const char * msg, const char *file, const int line )
224{
225 CUresult err = cuCtxSynchronize();
226 if( CUDA_SUCCESS != err) {
227 fprintf(stderr, "cutilDrvCheckMsg -> %s", msg);
228 fprintf(stderr, "cutilDrvCheckMsg -> cuCtxSynchronize API error = %04d in file <%s>, line %i.\n",
229 err, file, line );
230 exit(-1);
231 }
232}
233
234
235#if __DEVICE_EMULATION__
236 inline int cutilDeviceInitDrv(int ARGC, char **ARGV) { }
237#else
238 inline int cutilDeviceInitDrv(int ARGC, char ** ARGV)
239 {
240 int cuDevice = 0;
241 int deviceCount = 0;
242 CUresult err = cuInit(0);
243 if (CUDA_SUCCESS == err)
244 cutilDrvSafeCallNoSync(cuDeviceGetCount(&deviceCount));
245 if (deviceCount == 0) {
246 fprintf(stderr, "CUTIL DeviceInitDrv error: no devices supporting CUDA\n");
247 exit(-1);
248 }
249 int dev = 0;
250 cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
251 if (dev < 0) dev = 0;
252 if (dev > deviceCount-1) {
253 fprintf(stderr, "\n");
254 fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
255 fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
256 fprintf(stderr, "\n");
257 return -dev;
258 }
259 cutilDrvSafeCallNoSync(cuDeviceGet(&cuDevice, dev));
260 char name[100];
261 cuDeviceGetName(name, 100, cuDevice);
262 if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) {
263 printf("> Using CUDA Device [%d]: %s\n", dev, name);
264 }
265 return dev;
266 }
267#endif
268
269 // General initialization call to pick the best CUDA Device
270#if __DEVICE_EMULATION__
271 inline CUdevice cutilChooseCudaDeviceDrv(int argc, char **argv, int *p_devID)
272#else
273 inline CUdevice cutilChooseCudaDeviceDrv(int argc, char **argv, int *p_devID)
274 {
275 CUdevice cuDevice;
276 int devID = 0;
277 // If the command-line has a device number specified, use it
278 if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
279 devID = cutilDeviceInitDrv(argc, argv);
280 if (devID < 0) {
281 printf("exiting...\n");
282 exit(0);
283 }
284 } else {
285 // Otherwise pick the device with highest Gflops/s
286 char name[100];
287 devID = cutilDrvGetMaxGflopsDeviceId();
288 cutilDrvSafeCallNoSync(cuDeviceGet(&cuDevice, devID));
289 cuDeviceGetName(name, 100, cuDevice);
290 printf("> Using CUDA Device [%d]: %s\n", devID, name);
291 }
292 cuDeviceGet(&cuDevice, devID);
293 if (p_devID) *p_devID = devID;
294 return cuDevice;
295 }
296#endif
297
298
299//! Check for CUDA context lost
300inline void cutilDrvCudaCheckCtxLost(const char *errorMessage, const char *file, const int line )
301{
302 CUresult err = cuCtxSynchronize();
303 if( CUDA_ERROR_INVALID_CONTEXT != err) {
304 fprintf(stderr, "Cuda error: %s in file '%s' in line %i\n",
305 errorMessage, file, line );
306 exit(-1);
307 }
308 err = cuCtxSynchronize();
309 if( CUDA_SUCCESS != err) {
310 fprintf(stderr, "Cuda error: %s in file '%s' in line %i\n",
311 errorMessage, file, line );
312 exit(-1);
313 }
314}
315
// Portable names for case-insensitive string comparison.
// POSIX declares strcasecmp()/strncasecmp() in <strings.h> (not <string.h>),
// so pull that header in on non-Windows builds; the Windows runtime provides
// _stricmp()/_strnicmp() instead.
#ifndef _WIN32
#include <strings.h>
#endif

#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP  _stricmp
#else
#define STRCASECMP  strcasecmp
#endif
#endif

#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
331
332inline void __cutilDrvQAFinish(int argc, char **argv, bool bStatus)
333{
334 const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
335
336 bool bFlag = false;
337 for (int i=1; i < argc; i++) {
338 if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
339 bFlag |= true;
340 }
341 }
342
343 if (bFlag) {
344 printf("&&&& %s %s", sStatus[bStatus], argv[0]);
345 for (int i=1; i < argc; i++) printf(" %s", argv[i]);
346 } else {
347 printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
348 }
349}
350
351// General check for CUDA GPU SM Capabilities for a specific device #
352inline bool cutilDrvCudaDevCapabilities(int major_version, int minor_version, int deviceNum, int argc, char** argv)
353{
354 int major, minor, dev;
355 char device_name[256];
356
357#ifdef __DEVICE_EMULATION__
358 printf("> Compute Device Emulation Mode \n");
359#endif
360
361 cutilDrvSafeCallNoSync( cuDeviceGet(&dev, deviceNum) );
362 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
363 cutilDrvSafeCallNoSync (cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
364 cutilDrvSafeCallNoSync( cuDeviceGetName(device_name, 256, dev) );
365
366 if((major > major_version) ||
367 (major == major_version && minor >= minor_version))
368 {
369 printf("> Device %d: < %s >, Compute SM %d.%d detected\n", dev, device_name, major, minor);
370 return true;
371 }
372 else
373 {
374 printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
375 __cutilDrvQAFinish(argc, argv, true);
376 return false;
377 }
378}
379
380// General check for CUDA GPU SM Capabilities
381inline bool cutilDrvCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
382{
383 return cutilDrvCudaDevCapabilities(major_version, minor_version, 0, argc, argv);
384}