Actual source code: sfcuda.cu
1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
4: __device__ static inline PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
5: {
6: PetscInt i,j,k,m,n,r;
7: const PetscInt *offset,*start,*dx,*dy,*X,*Y;
9: n = opt[0];
10: offset = opt + 1;
11: start = opt + n + 2;
12: dx = opt + 2*n + 2;
13: dy = opt + 3*n + 2;
14: X = opt + 5*n + 2;
15: Y = opt + 6*n + 2;
16: for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
17: m = (tid - offset[r]);
18: k = m/(dx[r]*dy[r]);
19: j = (m - k*dx[r]*dy[r])/dx[r];
20: i = m - k*dx[r]*dy[r] - j*dx[r];
22: return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
23: }
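/* Worked example (reader's illustration; the opt arrays themselves are built elsewhere, see PetscSFPackOpt in
   sfpack.h): per the pointer arithmetic above, opt = {n, offset[0..n], start[0..n-1], dx[0..n-1], dy[0..n-1],
   ..., X[0..n-1], Y[0..n-1]}. Take a single 2x2x2 subdomain (n=1, dx=2, dy=2, offset={0,8}) embedded in a
   4x4x4 array (X=4, Y=4) with start=0. For tid=5: r=0, m=5, k=5/(2*2)=1, j=(5-4)/2=0, i=5-4-0=1, so the
   returned index is 0 + 1*4*4 + 0*4 + 1 = 17, i.e. entry (i,j,k)=(1,0,1) of the enclosing array. */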
25: /*====================================================================================*/
26: /* Templated CUDA kernels for pack/unpack. The Op can be regular or atomic */
27: /*====================================================================================*/
29: /* Suppose a user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals; then
30: <Type> is PetscReal, which is the primitive type we operate on.
31: <bs> is 16, which says <unit> contains 16 primitive types.
32: <BS> is 8, which is the maximal SIMD width with which we try to vectorize operations on <unit>.
33: <EQ> is 0, which is (bs == BS ? 1 : 0)
35: If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, turning MBS below into a compile-time constant.
36: For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are fully unrolled.
37: */
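/* Illustration (not part of the source): for the 16-PetscReal unit above, the setup code below (PackInit_RealType
   called from PetscSFLinkSetUp_CUDA) ends up instantiating d_Pack<PetscReal,8,0> and calling it with bs=16, so
   M=2 and MBS=16, and each thread copies 16 contiguous PetscReals per index. Assuming the sparse-index case
   (opt == NULL, idx != NULL), the grid-stride loop of d_Pack is then equivalent to

     for (tid = blockIdx.x*blockDim.x + threadIdx.x; tid < count; tid += grid_size) {
       t = idx[tid]*16;
       s = tid*16;
       for (i=0; i<16; i++) buf[s+i] = data[t+i];
     }

   When EQ=1, MBS equals the template constant BS, so the compiler can fully unroll the inner loop, e.g. MBS=1
   for the common VecScatter case bs=1. */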
38: template<class Type,PetscInt BS,PetscInt EQ>
39: __global__ static void d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,const Type *data,Type *buf)
40: {
41: PetscInt i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
42: const PetscInt grid_size = gridDim.x * blockDim.x;
43: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */
44: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */
46: for (; tid<count; tid += grid_size) {
47: /* opt != NULL ==> idx == NULL, i.e., the indices have patterns but are not contiguous;
48: opt == NULL && idx == NULL ==> the indices are contiguous; otherwise (opt == NULL, idx != NULL) idx gives the indices explicitly;
49: */
50: t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
51: s = tid*MBS;
52: for (i=0; i<MBS; i++) buf[s+i] = data[t+i];
53: }
54: }
56: template<class Type,class Op,PetscInt BS,PetscInt EQ>
57: __global__ static void d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,Type *data,const Type *buf)
58: {
59: PetscInt i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
60: const PetscInt grid_size = gridDim.x * blockDim.x;
61: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
62: Op op;
64: for (; tid<count; tid += grid_size) {
65: t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
66: s = tid*MBS;
67: for (i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
68: }
69: }
71: template<class Type,class Op,PetscInt BS,PetscInt EQ>
72: __global__ static void d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,Type *leafbuf)
73: {
74: PetscInt i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
75: const PetscInt grid_size = gridDim.x * blockDim.x;
76: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
77: Op op;
79: for (; tid<count; tid += grid_size) {
80: r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
81: l = tid*MBS;
82: for (i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
83: }
84: }
86: template<class Type,class Op,PetscInt BS,PetscInt EQ>
87: __global__ static void d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt* srcIdx,const Type *src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt *dstIdx,Type *dst)
88: {
89: PetscInt i,j,k,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
90: const PetscInt grid_size = gridDim.x * blockDim.x;
91: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
92: Op op;
94: for (; tid<count; tid += grid_size) {
95: if (!srcIdx) { /* src is either contiguous or 3D */
96: k = tid/(srcx*srcy);
97: j = (tid - k*srcx*srcy)/srcx;
98: i = tid - k*srcx*srcy - j*srcx;
99: s = srcStart + k*srcX*srcY + j*srcX + i;
100: } else {
101: s = srcIdx[tid];
102: }
104: if (!dstIdx) { /* dst is either contiguous or 3D */
105: k = tid/(dstx*dsty);
106: j = (tid - k*dstx*dsty)/dstx;
107: i = tid - k*dstx*dsty - j*dstx;
108: t = dstStart + k*dstX*dstY + j*dstX + i;
109: } else {
110: t = dstIdx[tid];
111: }
113: s *= MBS;
114: t *= MBS;
115: for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
116: }
117: }
119: template<class Type,class Op,PetscInt BS,PetscInt EQ>
120: __global__ static void d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,PetscInt leafstart,const PetscInt *leafopt,const PetscInt *leafidx,const Type *leafdata,Type *leafupdate)
121: {
122: PetscInt i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
123: const PetscInt grid_size = gridDim.x * blockDim.x;
124: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
125: Op op;
127: for (; tid<count; tid += grid_size) {
128: r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
129: l = (leafopt? MapTidToIndex(leafopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
130: for (i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
131: }
132: }
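/* Illustration (not part of the source): the Op functors defined below return the old value of x and update x
   in place. With Op = Add, rootdata[r] = 5 and leafdata[l] = 2, a thread executing
     leafupdate[l] = op(rootdata[r],leafdata[l]);
   leaves leafupdate[l] = 5 (the old root value) and rootdata[r] = 7. d_FetchAndOp above does the same, with
   leafbuf serving as both the input and the output. */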
134: /*====================================================================================*/
135: /* Regular operations on device */
136: /*====================================================================================*/
137: template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = y; return old;}};
138: template<typename Type> struct Add {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y; return old;}};
139: template<typename Type> struct Mult {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y; return old;}};
140: template<typename Type> struct Min {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = PetscMin(x,y); return old;}};
141: template<typename Type> struct Max {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = PetscMax(x,y); return old;}};
142: template<typename Type> struct LAND {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x && y; return old;}};
143: template<typename Type> struct LOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x || y; return old;}};
144: template<typename Type> struct LXOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = !x != !y; return old;}};
145: template<typename Type> struct BAND {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x & y; return old;}};
146: template<typename Type> struct BOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x | y; return old;}};
147: template<typename Type> struct BXOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x ^ y; return old;}};
148: template<typename Type> struct Minloc {
149: __device__ Type operator() (Type& x,Type y) const {
150: Type old = x;
151: if (y.a < x.a) x = y;
152: else if (y.a == x.a) x.b = min(x.b,y.b);
153: return old;
154: }
155: };
156: template<typename Type> struct Maxloc {
157: __device__ Type operator() (Type& x,Type y) const {
158: Type old = x;
159: if (y.a > x.a) x = y;
160: else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC */
161: return old;
162: }
163: };
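/* Illustration (not part of the source): Minloc/Maxloc mirror MPI_MINLOC/MPI_MAXLOC on a (value a, index b)
   pair. For Maxloc with x = {a=3,b=5} and y = {a=3,b=2}, the values tie, so the smaller index wins and x
   becomes {a=3,b=2}; the pre-update x = {a=3,b=5} is returned. */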
165: /*====================================================================================*/
166: /* Atomic operations on device */
167: /*====================================================================================*/
169: /*
170: Atomic Insert (exchange) operations
172: CUDA C Programming Guide V10.1 Chapter B.12.1.3:
174: int atomicExch(int* address, int val);
175: unsigned int atomicExch(unsigned int* address, unsigned int val);
176: unsigned long long int atomicExch(unsigned long long int* address, unsigned long long int val);
177: float atomicExch(float* address, float val);
179: reads the 32-bit or 64-bit word old located at the address address in global or shared
180: memory and stores val back to memory at the same address. These two operations are
181: performed in one atomic transaction. The function returns old.
183: PETSc notes:
185: It may be useful in PetscSFFetchAndOp with op = MPI_REPLACE.
187: VecScatter with multiple entries scattered to the same location using INSERT_VALUES does not need
188: atomic insertion, since it does not need the old value. A 32-bit or 64-bit store instruction should
189: itself be atomic.
191: With bs>1 and a unit larger than 64 bits, the current element-wise atomic approach cannot guarantee that the whole
192: insertion is atomic. We hope no user code relies on that.
193: */
194: __device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((ullint*)address,__double_as_longlong(val)));}
196: __device__ static llint atomicExch(llint* address,llint val) {return (llint)(atomicExch((ullint*)address,(ullint)val));}
198: template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};
200: #if defined(PETSC_HAVE_COMPLEX)
201: #if defined(PETSC_USE_REAL_DOUBLE)
202: /* CUDA does not support 128-bit atomics. Users should not concurrently insert different 128-bit PetscComplex values into the same location */
203: template<> struct AtomicInsert<PetscComplex> {
204: __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
205: PetscComplex old, *z = &old;
206: double *xp = (double*)&x,*yp = (double*)&y;
207: AtomicInsert<double> op;
208: z[0] = op(xp[0],yp[0]);
209: z[1] = op(xp[1],yp[1]);
210: return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
211: }
212: };
213: #elif defined(PETSC_USE_REAL_SINGLE)
214: template<> struct AtomicInsert<PetscComplex> {
215: __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
216: double *xp = (double*)&x,*yp = (double*)&y;
217: AtomicInsert<double> op;
218: return op(xp[0],yp[0]);
219: }
220: };
221: #endif
222: #endif
224: /*
225: Atomic add operations
227: CUDA C Programming Guide V10.1 Chapter B.12.1.1:
229: int atomicAdd(int* address, int val);
230: unsigned int atomicAdd(unsigned int* address,unsigned int val);
231: unsigned long long int atomicAdd(unsigned long long int* address,unsigned long long int val);
232: float atomicAdd(float* address, float val);
233: double atomicAdd(double* address, double val);
234: __half2 atomicAdd(__half2 *address, __half2 val);
235: __half atomicAdd(__half *address, __half val);
237: reads the 16-bit, 32-bit or 64-bit word old located at the address address in global or shared memory, computes (old + val),
238: and stores the result back to memory at the same address. These three operations are performed in one atomic transaction. The
239: function returns old.
241: The 32-bit floating-point version of atomicAdd() is only supported by devices of compute capability 2.x and higher.
242: The 64-bit floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and higher.
243: The 32-bit __half2 floating-point version of atomicAdd() is only supported by devices of compute capability 6.x and
244: higher. The atomicity of the __half2 add operation is guaranteed separately for each of the two __half elements;
245: the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
246: The 16-bit __half floating-point version of atomicAdd() is only supported by devices of compute capability 7.x and higher.
247: */
248: __device__ static llint atomicAdd(llint* address,llint val) {return (llint)atomicAdd((ullint*)address,(ullint)val);}
250: template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};
252: template<> struct AtomicAdd<double> {
253: __device__ double operator() (double& x,double y) const {
254: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)
255: return atomicAdd(&x,y);
256: #else
257: double *address = &x, val = y;
258: ullint *address_as_ull = (ullint*)address;
259: ullint old = *address_as_ull, assumed;
260: do {
261: assumed = old;
262: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
263: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
264: } while (assumed != old);
265: return __longlong_as_double(old);
266: #endif
267: }
268: };
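/* The same compare-and-swap pattern (read the word, compute a new value from the assumed old value, attempt
   atomicCAS, retry if another thread changed the word in between) is reused below wherever CUDA lacks a native
   atomic, e.g. atomicMult and the 64-bit Min/Max/And/Or/Xor emulations. */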
270: template<> struct AtomicAdd<float> {
271: __device__ float operator() (float& x,float y) const {
272: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
273: return atomicAdd(&x,y);
274: #else
275: float *address = &x, val = y;
276: int *address_as_int = (int*)address;
277: int old = *address_as_int, assumed;
278: do {
279: assumed = old;
280: old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
281: /* Note: uses integer comparison to avoid hang in case of NaN (since NaN !=NaN) */
282: } while (assumed != old);
283: return __int_as_float(old);
284: #endif
285: }
286: };
288: #if defined(PETSC_HAVE_COMPLEX)
289: template<> struct AtomicAdd<PetscComplex> {
290: __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
291: PetscComplex old, *z = &old;
292: PetscReal *xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
293: AtomicAdd<PetscReal> op;
294: z[0] = op(xp[0],yp[0]);
295: z[1] = op(xp[1],yp[1]);
296: return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
297: }
298: };
299: #endif
301: /*
302: Atomic Mult operations:
304: CUDA has no atomicMult at all, so we build our own with atomicCAS
305: */
306: #if defined(PETSC_USE_REAL_DOUBLE)
307: __device__ static double atomicMult(double* address, double val)
308: {
309: ullint *address_as_ull = (ullint*)(address);
310: ullint old = *address_as_ull, assumed;
311: do {
312: assumed = old;
313: /* Other threads can access and modify the value of *address_as_ull between the read above and the write below */
314: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
315: } while (assumed != old);
316: return __longlong_as_double(old);
317: }
318: #elif defined(PETSC_USE_REAL_SINGLE)
319: __device__ static float atomicMult(float* address,float val)
320: {
321: int *address_as_int = (int*)(address);
322: int old = *address_as_int, assumed;
323: do {
324: assumed = old;
325: old = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
326: } while (assumed != old);
327: return __int_as_float(old);
328: }
329: #endif
331: __device__ static int atomicMult(int* address,int val)
332: {
333: int *address_as_int = (int*)(address);
334: int old = *address_as_int, assumed;
335: do {
336: assumed = old;
337: old = atomicCAS(address_as_int, assumed, val*assumed);
338: } while (assumed != old);
339: return (int)old;
340: }
342: __device__ static llint atomicMult(llint* address,llint val)
343: {
344: ullint *address_as_ull = (ullint*)(address);
345: ullint old = *address_as_ull, assumed;
346: do {
347: assumed = old;
348: old = atomicCAS(address_as_ull, assumed, (ullint)(val*(llint)assumed));
349: } while (assumed != old);
350: return (llint)old;
351: }
353: template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};
355: /*
356: Atomic Min/Max operations
358: CUDA C Programming Guide V10.1 Chapter B.12.1.4~5:
360: int atomicMin(int* address, int val);
361: unsigned int atomicMin(unsigned int* address,unsigned int val);
362: unsigned long long int atomicMin(unsigned long long int* address,unsigned long long int val);
364: reads the 32-bit or 64-bit word old located at the address address in global or shared
365: memory, computes the minimum of old and val, and stores the result back to memory
366: at the same address. These three operations are performed in one atomic transaction.
367: The function returns old.
368: The 64-bit version of atomicMin() is only supported by devices of compute capability 3.5 and higher.
370: atomicMax() is similar.
371: */
373: #if defined(PETSC_USE_REAL_DOUBLE)
374: __device__ static double atomicMin(double* address, double val)
375: {
376: ullint *address_as_ull = (ullint*)(address);
377: ullint old = *address_as_ull, assumed;
378: do {
379: assumed = old;
380: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
381: } while (assumed != old);
382: return __longlong_as_double(old);
383: }
385: __device__ static double atomicMax(double* address, double val)
386: {
387: ullint *address_as_ull = (ullint*)(address);
388: ullint old = *address_as_ull, assumed;
389: do {
390: assumed = old;
391: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
392: } while (assumed != old);
393: return __longlong_as_double(old);
394: }
395: #elif defined(PETSC_USE_REAL_SINGLE)
396: __device__ static float atomicMin(float* address,float val)
397: {
398: int *address_as_int = (int*)(address);
399: int old = *address_as_int, assumed;
400: do {
401: assumed = old;
402: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
403: } while (assumed != old);
404: return __int_as_float(old);
405: }
407: __device__ static float atomicMax(float* address,float val)
408: {
409: int *address_as_int = (int*)(address);
410: int old = *address_as_int, assumed;
411: do {
412: assumed = old;
413: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
414: } while (assumed != old);
415: return __int_as_float(old);
416: }
417: #endif
419: /*
420: atomicMin/Max(long long *, long long) are not in NVIDIA's documentation. But on OLCF Summit we found
421: atomicMin/Max/And/Or/Xor(long long *, long long) in /sw/summit/cuda/10.1.243/include/sm_32_atomic_functions.h.
422: This causes compilation errors with PGI compilers and 64-bit indices:
423: error: function "atomicMin(long long *, long long)" has already been defined
425: So we guard our definitions with the extra condition defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
426: */
427: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320)
428: __device__ static llint atomicMin(llint* address,llint val)
429: {
430: ullint *address_as_ull = (ullint*)(address);
431: ullint old = *address_as_ull, assumed;
432: do {
433: assumed = old;
434: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val,(llint)assumed)));
435: } while (assumed != old);
436: return (llint)old;
437: }
439: __device__ static llint atomicMax(llint* address,llint val)
440: {
441: ullint *address_as_ull = (ullint*)(address);
442: ullint old = *address_as_ull, assumed;
443: do {
444: assumed = old;
445: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val,(llint)assumed)));
446: } while (assumed != old);
447: return (llint)old;
448: }
449: #endif
451: template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
452: template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};
454: /*
455: Atomic bitwise operations
457: CUDA C Programming Guide V10.1 Chapter B.12.2.1 ~ B.12.2.3:
459: int atomicAnd(int* address, int val);
460: unsigned int atomicAnd(unsigned int* address,unsigned int val);
461: unsigned long long int atomicAnd(unsigned long long int* address,unsigned long long int val);
463: reads the 32-bit or 64-bit word old located at the address address in global or shared
464: memory, computes (old & val), and stores the result back to memory at the same
465: address. These three operations are performed in one atomic transaction.
466: The function returns old.
468: The 64-bit version of atomicAnd() is only supported by devices of compute capability 3.5 and higher.
470: atomicOr() and atomicXor are similar.
471: */
473: #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 320) /* Why 320? see comments at atomicMin() above */
474: __device__ static llint atomicAnd(llint* address,llint val)
475: {
476: ullint *address_as_ull = (ullint*)(address);
477: ullint old = *address_as_ull, assumed;
478: do {
479: assumed = old;
480: old = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
481: } while (assumed != old);
482: return (llint)old;
483: }
484: __device__ static llint atomicOr(llint* address,llint val)
485: {
486: ullint *address_as_ull = (ullint*)(address);
487: ullint old = *address_as_ull, assumed;
488: do {
489: assumed = old;
490: old = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
491: } while (assumed != old);
492: return (llint)old;
493: }
495: __device__ static llint atomicXor(llint* address,llint val)
496: {
497: ullint *address_as_ull = (ullint*)(address);
498: ullint old = *address_as_ull, assumed;
499: do {
500: assumed = old;
501: old = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
502: } while (assumed != old);
503: return (llint)old;
504: }
505: #endif
507: template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
508: template<typename Type> struct AtomicBOR {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
509: template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};
511: /*
512: Atomic logical operations:
514: CUDA has no atomic logical operations at all. We support them on integer types.
515: */
517: /* A template without a definition makes any instantiation that does not match one of the given specializations fail at compile time,
518: which is what we want since we only support 32-bit and 64-bit integers.
519: */
520: template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;
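/* Illustration (not part of the source): because the primary template above has no definition, an instantiation
   such as
     AtomicLogical<short,land<short>,sizeof(short)> op;   // sizeof(short)==2 matches no specialization
   fails to compile with an incomplete-type error, while the 4-byte and 8-byte specializations below cover the
   supported int and long long cases. */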
522: template<typename Type,class Op>
523: struct AtomicLogical<Type,Op,4> {
524: __device__ Type operator()(Type& x,Type y) const {
525: int *address_as_int = (int*)(&x);
526: int old = *address_as_int, assumed;
527: Op op;
528: do {
529: assumed = old;
530: old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
531: } while (assumed != old);
532: return (Type)old;
533: }
534: };
536: template<typename Type,class Op>
537: struct AtomicLogical<Type,Op,8> {
538: __device__ Type operator()(Type& x,Type y) const {
539: ullint *address_as_ull = (ullint*)(&x);
540: ullint old = *address_as_ull, assumed;
541: Op op;
542: do {
543: assumed = old;
544: old = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed,y)));
545: } while (assumed != old);
546: return (Type)old;
547: }
548: };
550: /* Note that land/lor/lxor below are different from LAND etc. above. Here we pass arguments by value and return the result of the op (not the old value) */
551: template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
552: template<typename Type> struct lor {__device__ Type operator()(Type x, Type y) {return x || y;}};
553: template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};
555: template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
556: template<typename Type> struct AtomicLOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
557: template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};
559: /*====================================================================================*/
560: /* Wrapper functions of the CUDA kernels. Function pointers are stored in 'link' */
561: /*====================================================================================*/
562: template<typename Type,PetscInt BS,PetscInt EQ>
563: static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data,void *buf)
564: {
565: cudaError_t cerr;
566: PetscInt nthreads=256;
567: PetscInt nblocks=(count+nthreads-1)/nthreads;
568: const PetscInt *iarray=opt ? opt->array : NULL;
571: if (!count) return(0);
572: if (!opt && !idx) { /* It is a 'CUDA data to nvshmem buf' memory copy */
573: cerr = cudaMemcpyAsync(buf,(char*)data+start*link->unitbytes,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(cerr);
574: } else {
575: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
576: d_Pack<Type,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(const Type*)data,(Type*)buf);
577: cerr = cudaGetLastError();CHKERRCUDA(cerr);
578: }
579: return(0);
580: }
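/* Illustration (not part of the source): with nthreads=256 and count=1,000,000, nblocks is first 3907; if
   link->maxResidentThreadsPerGPU were, say, 2048*80 = 163840 (a hypothetical device), the cap would reduce it
   to 163840/256 = 640 blocks, and the grid-stride loop in d_Pack (tid += grid_size) lets those 640*256 threads
   cover all 1,000,000 indices. */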
582: /* This specializes UnpackAndOp for the cudaMemcpyAsync() below. Usually, if this is a contiguous memcpy, we use root/leafdirect and do
583: not need UnpackAndOp. Only with nvshmem do we need this 'nvshmem buf to CUDA data' memory copy
584: */
585: template<typename Type,PetscInt BS,PetscInt EQ>
586: static PetscErrorCode Unpack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
587: {
588: cudaError_t cerr;
589: PetscInt nthreads=256;
590: PetscInt nblocks=(count+nthreads-1)/nthreads;
591: const PetscInt *iarray=opt ? opt->array : NULL;
594: if (!count) return(0);
595: if (!opt && !idx) { /* It is a 'nvshmem buf to CUDA data' memory copy */
596: cerr = cudaMemcpyAsync((char*)data+start*link->unitbytes,buf,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(cerr);
597: } else {
598: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
599: d_UnpackAndOp<Type,Insert<Type>,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
600: cerr = cudaGetLastError();CHKERRCUDA(cerr);
601: }
602: return(0);
603: }
605: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
606: static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
607: {
608: cudaError_t cerr;
609: PetscInt nthreads=256;
610: PetscInt nblocks=(count+nthreads-1)/nthreads;
611: const PetscInt *iarray=opt ? opt->array : NULL;
614: if (!count) return(0);
615: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
616: d_UnpackAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
617: cerr = cudaGetLastError();CHKERRCUDA(cerr);
618: return(0);
619: }
621: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
622: static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
623: {
624: cudaError_t cerr;
625: PetscInt nthreads=256;
626: PetscInt nblocks=(count+nthreads-1)/nthreads;
627: const PetscInt *iarray=opt ? opt->array : NULL;
630: if (!count) return(0);
631: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
632: d_FetchAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,start,iarray,idx,(Type*)data,(Type*)buf);
633: cerr = cudaGetLastError();CHKERRCUDA(cerr);
634: return(0);
635: }
637: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
638: static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
639: {
640: cudaError_t cerr;
641: PetscInt nthreads=256;
642: PetscInt nblocks=(count+nthreads-1)/nthreads;
643: PetscInt srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;
646: if (!count) return(0);
647: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
649: /* The 3D shape of the source subdomain may differ from that of the destination, which makes it difficult to use a CUDA 3D grid and block */
650: if (srcOpt) {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
651: else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}
653: if (dstOpt) {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
654: else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}
656: d_ScatterAndOp<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,srcx,srcy,srcX,srcY,srcStart,srcIdx,(const Type*)src,dstx,dsty,dstX,dstY,dstStart,dstIdx,(Type*)dst);
657: cerr = cudaGetLastError();CHKERRCUDA(cerr);
658: return(0);
659: }
661: /* Specialization for Insert since we may use cudaMemcpyAsync */
662: template<typename Type,PetscInt BS,PetscInt EQ>
663: static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
664: {
665: PetscErrorCode ierr;
666: cudaError_t cerr;
669: if (!count) return(0);
670: /*src and dst are contiguous */
671: if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
672: cerr = cudaMemcpyAsync((Type*)dst+dstStart*link->bs,(const Type*)src+srcStart*link->bs,count*link->unitbytes,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(cerr);
673: } else {
674: ierr = ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);CHKERRQ(ierr);
675: }
676: return(0);
677: }
679: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
680: static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate)
681: {
682: cudaError_t cerr;
683: PetscInt nthreads=256;
684: PetscInt nblocks=(count+nthreads-1)/nthreads;
685: const PetscInt *rarray = rootopt ? rootopt->array : NULL;
686: const PetscInt *larray = leafopt ? leafopt->array : NULL;
689: if (!count) return(0);
690: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
691: d_FetchAndOpLocal<Type,Op,BS,EQ><<<nblocks,nthreads,0,link->stream>>>(link->bs,count,rootstart,rarray,rootidx,(Type*)rootdata,leafstart,larray,leafidx,(const Type*)leafdata,(Type*)leafupdate);
692: cerr = cudaGetLastError();CHKERRCUDA(cerr);
693: return(0);
694: }
696: /*====================================================================================*/
697: /* Init various types and instantiate pack/unpack function pointers */
698: /*====================================================================================*/
699: template<typename Type,PetscInt BS,PetscInt EQ>
700: static void PackInit_RealType(PetscSFLink link)
701: {
702: /* Pack/unpack for remote communication */
703: link->d_Pack = Pack<Type,BS,EQ>;
704: link->d_UnpackAndInsert = Unpack<Type,BS,EQ>;
705: link->d_UnpackAndAdd = UnpackAndOp <Type,Add<Type> ,BS,EQ>;
706: link->d_UnpackAndMult = UnpackAndOp <Type,Mult<Type> ,BS,EQ>;
707: link->d_UnpackAndMin = UnpackAndOp <Type,Min<Type> ,BS,EQ>;
708: link->d_UnpackAndMax = UnpackAndOp <Type,Max<Type> ,BS,EQ>;
709: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
711: /* Scatter for local communication */
712: link->d_ScatterAndInsert = ScatterAndInsert<Type ,BS,EQ>; /* Has special optimizations */
713: link->d_ScatterAndAdd = ScatterAndOp <Type,Add<Type> ,BS,EQ>;
714: link->d_ScatterAndMult = ScatterAndOp <Type,Mult<Type> ,BS,EQ>;
715: link->d_ScatterAndMin = ScatterAndOp <Type,Min<Type> ,BS,EQ>;
716: link->d_ScatterAndMax = ScatterAndOp <Type,Max<Type> ,BS,EQ>;
717: link->d_FetchAndAddLocal = FetchAndOpLocal <Type,Add <Type> ,BS,EQ>;
719: /* Atomic versions when there are data-race possibilities */
720: link->da_UnpackAndInsert = UnpackAndOp <Type,AtomicInsert<Type>,BS,EQ>;
721: link->da_UnpackAndAdd = UnpackAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
722: link->da_UnpackAndMult = UnpackAndOp <Type,AtomicMult<Type> ,BS,EQ>;
723: link->da_UnpackAndMin = UnpackAndOp <Type,AtomicMin<Type> ,BS,EQ>;
724: link->da_UnpackAndMax = UnpackAndOp <Type,AtomicMax<Type> ,BS,EQ>;
725: link->da_FetchAndAdd = FetchAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
727: link->da_ScatterAndInsert = ScatterAndOp <Type,AtomicInsert<Type>,BS,EQ>;
728: link->da_ScatterAndAdd = ScatterAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
729: link->da_ScatterAndMult = ScatterAndOp <Type,AtomicMult<Type> ,BS,EQ>;
730: link->da_ScatterAndMin = ScatterAndOp <Type,AtomicMin<Type> ,BS,EQ>;
731: link->da_ScatterAndMax = ScatterAndOp <Type,AtomicMax<Type> ,BS,EQ>;
732: link->da_FetchAndAddLocal = FetchAndOpLocal <Type,AtomicAdd<Type> ,BS,EQ>;
733: }
735: /* We have this templated class so that we can specialize it for char-sized integers */
736: template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
737: struct PackInit_IntegerType_Atomic {
738: static void Init(PetscSFLink link)
739: {
740: link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
741: link->da_UnpackAndAdd = UnpackAndOp<Type,AtomicAdd<Type> ,BS,EQ>;
742: link->da_UnpackAndMult = UnpackAndOp<Type,AtomicMult<Type> ,BS,EQ>;
743: link->da_UnpackAndMin = UnpackAndOp<Type,AtomicMin<Type> ,BS,EQ>;
744: link->da_UnpackAndMax = UnpackAndOp<Type,AtomicMax<Type> ,BS,EQ>;
745: link->da_UnpackAndLAND = UnpackAndOp<Type,AtomicLAND<Type> ,BS,EQ>;
746: link->da_UnpackAndLOR = UnpackAndOp<Type,AtomicLOR<Type> ,BS,EQ>;
747: link->da_UnpackAndLXOR = UnpackAndOp<Type,AtomicLXOR<Type> ,BS,EQ>;
748: link->da_UnpackAndBAND = UnpackAndOp<Type,AtomicBAND<Type> ,BS,EQ>;
749: link->da_UnpackAndBOR = UnpackAndOp<Type,AtomicBOR<Type> ,BS,EQ>;
750: link->da_UnpackAndBXOR = UnpackAndOp<Type,AtomicBXOR<Type> ,BS,EQ>;
751: link->da_FetchAndAdd = FetchAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
753: link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
754: link->da_ScatterAndAdd = ScatterAndOp<Type,AtomicAdd<Type> ,BS,EQ>;
755: link->da_ScatterAndMult = ScatterAndOp<Type,AtomicMult<Type> ,BS,EQ>;
756: link->da_ScatterAndMin = ScatterAndOp<Type,AtomicMin<Type> ,BS,EQ>;
757: link->da_ScatterAndMax = ScatterAndOp<Type,AtomicMax<Type> ,BS,EQ>;
758: link->da_ScatterAndLAND = ScatterAndOp<Type,AtomicLAND<Type> ,BS,EQ>;
759: link->da_ScatterAndLOR = ScatterAndOp<Type,AtomicLOR<Type> ,BS,EQ>;
760: link->da_ScatterAndLXOR = ScatterAndOp<Type,AtomicLXOR<Type> ,BS,EQ>;
761: link->da_ScatterAndBAND = ScatterAndOp<Type,AtomicBAND<Type> ,BS,EQ>;
762: link->da_ScatterAndBOR = ScatterAndOp<Type,AtomicBOR<Type> ,BS,EQ>;
763: link->da_ScatterAndBXOR = ScatterAndOp<Type,AtomicBXOR<Type> ,BS,EQ>;
764: link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
765: }
766: };
768: /* CUDA does not support atomics on chars. It is TBD in PETSc. */
769: template<typename Type,PetscInt BS,PetscInt EQ>
770: struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
771: static void Init(PetscSFLink link) {/* Do nothing, leaving the function pointers NULL */}
772: };
774: template<typename Type,PetscInt BS,PetscInt EQ>
775: static void PackInit_IntegerType(PetscSFLink link)
776: {
777: link->d_Pack = Pack<Type,BS,EQ>;
778: link->d_UnpackAndInsert = Unpack<Type,BS,EQ>;
779: link->d_UnpackAndAdd = UnpackAndOp<Type,Add<Type> ,BS,EQ>;
780: link->d_UnpackAndMult = UnpackAndOp<Type,Mult<Type> ,BS,EQ>;
781: link->d_UnpackAndMin = UnpackAndOp<Type,Min<Type> ,BS,EQ>;
782: link->d_UnpackAndMax = UnpackAndOp<Type,Max<Type> ,BS,EQ>;
783: link->d_UnpackAndLAND = UnpackAndOp<Type,LAND<Type> ,BS,EQ>;
784: link->d_UnpackAndLOR = UnpackAndOp<Type,LOR<Type> ,BS,EQ>;
785: link->d_UnpackAndLXOR = UnpackAndOp<Type,LXOR<Type> ,BS,EQ>;
786: link->d_UnpackAndBAND = UnpackAndOp<Type,BAND<Type> ,BS,EQ>;
787: link->d_UnpackAndBOR = UnpackAndOp<Type,BOR<Type> ,BS,EQ>;
788: link->d_UnpackAndBXOR = UnpackAndOp<Type,BXOR<Type> ,BS,EQ>;
789: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
791: link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
792: link->d_ScatterAndAdd = ScatterAndOp<Type,Add<Type> ,BS,EQ>;
793: link->d_ScatterAndMult = ScatterAndOp<Type,Mult<Type> ,BS,EQ>;
794: link->d_ScatterAndMin = ScatterAndOp<Type,Min<Type> ,BS,EQ>;
795: link->d_ScatterAndMax = ScatterAndOp<Type,Max<Type> ,BS,EQ>;
796: link->d_ScatterAndLAND = ScatterAndOp<Type,LAND<Type> ,BS,EQ>;
797: link->d_ScatterAndLOR = ScatterAndOp<Type,LOR<Type> ,BS,EQ>;
798: link->d_ScatterAndLXOR = ScatterAndOp<Type,LXOR<Type> ,BS,EQ>;
799: link->d_ScatterAndBAND = ScatterAndOp<Type,BAND<Type> ,BS,EQ>;
800: link->d_ScatterAndBOR = ScatterAndOp<Type,BOR<Type> ,BS,EQ>;
801: link->d_ScatterAndBXOR = ScatterAndOp<Type,BXOR<Type> ,BS,EQ>;
802: link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
803: PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
804: }
806: #if defined(PETSC_HAVE_COMPLEX)
807: template<typename Type,PetscInt BS,PetscInt EQ>
808: static void PackInit_ComplexType(PetscSFLink link)
809: {
810: link->d_Pack = Pack<Type,BS,EQ>;
811: link->d_UnpackAndInsert = Unpack<Type,BS,EQ>;
812: link->d_UnpackAndAdd = UnpackAndOp<Type,Add<Type> ,BS,EQ>;
813: link->d_UnpackAndMult = UnpackAndOp<Type,Mult<Type> ,BS,EQ>;
814: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
816: link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
817: link->d_ScatterAndAdd = ScatterAndOp<Type,Add<Type> ,BS,EQ>;
818: link->d_ScatterAndMult = ScatterAndOp<Type,Mult<Type> ,BS,EQ>;
819: link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
821: link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
822: link->da_UnpackAndAdd = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
823: link->da_UnpackAndMult = NULL; /* Not implemented yet */
824: link->da_FetchAndAdd = NULL; /* Return value of atomicAdd on complex is not atomic */
826: link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
827: link->da_ScatterAndAdd = ScatterAndOp<Type,AtomicAdd<Type>,BS,EQ>;
828: }
829: #endif
831: typedef signed char SignedChar;
832: typedef unsigned char UnsignedChar;
833: typedef struct {int a; int b; } PairInt;
834: typedef struct {PetscInt a; PetscInt b;} PairPetscInt;
836: template<typename Type>
837: static void PackInit_PairType(PetscSFLink link)
838: {
839: link->d_Pack = Pack<Type,1,1>;
840: link->d_UnpackAndInsert = Unpack<Type,1,1>;
841: link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
842: link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;
844: link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
845: link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
846: link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
847: /* Atomics for pair types are not implemented yet */
848: }
850: template<typename Type,PetscInt BS,PetscInt EQ>
851: static void PackInit_DumbType(PetscSFLink link)
852: {
853: link->d_Pack = Pack<Type,BS,EQ>;
854: link->d_UnpackAndInsert = Unpack<Type,BS,EQ>;
855: link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
856: /* Atomics for dumb types are not implemented yet */
857: }
859: /* Some device-specific utilities */
860: static PetscErrorCode PetscSFLinkSyncDevice_CUDA(PetscSFLink link)
861: {
862: cudaError_t cerr;
864: cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
865: return(0);
866: }
868: static PetscErrorCode PetscSFLinkSyncStream_CUDA(PetscSFLink link)
869: {
870: cudaError_t cerr;
872: cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
873: return(0);
874: }
876: static PetscErrorCode PetscSFLinkMemcpy_CUDA(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
877: {
879: enum cudaMemcpyKind kinds[2][2] = {{cudaMemcpyHostToHost,cudaMemcpyHostToDevice},{cudaMemcpyDeviceToHost,cudaMemcpyDeviceToDevice}};
881: if (n) {
882: if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Separate HostToHost so that pure-cpu code won't call cuda runtime */
883: PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);
884: } else {
885: int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
886: int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
887: cudaError_t cerr = cudaMemcpyAsync(dst,src,n,kinds[stype][dtype],link->stream);CHKERRCUDA(cerr);
888: }
889: }
890: return(0);
891: }
893: PetscErrorCode PetscSFMalloc_CUDA(PetscMemType mtype,size_t size,void** ptr)
894: {
896: if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscMalloc(size,ptr);CHKERRQ(ierr);}
897: else if (PetscMemTypeDevice(mtype)) {
898: if (!PetscCUDAInitialized) {PetscErrorCode ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);}
899: cudaError_t err = cudaMalloc(ptr,size);CHKERRCUDA(err);
900: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d", (int)mtype);
901: return(0);
902: }
904: PetscErrorCode PetscSFFree_CUDA(PetscMemType mtype,void* ptr)
905: {
907: if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscFree(ptr);CHKERRQ(ierr);}
908: else if (PetscMemTypeDevice(mtype)) {cudaError_t err = cudaFree(ptr);CHKERRCUDA(err);}
909: else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
910: return(0);
911: }
913: /* Destructor when the link uses MPI for communication on CUDA device */
914: static PetscErrorCode PetscSFLinkDestroy_MPI_CUDA(PetscSF sf,PetscSFLink link)
915: {
916: cudaError_t cerr;
919: for (int i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
920: cerr = cudaFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRCUDA(cerr);
921: cerr = cudaFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRCUDA(cerr);
922: }
923: return(0);
924: }
926: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
927: PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
928: {
930: cudaError_t cerr;
931: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
932: PetscBool is2Int,is2PetscInt;
933: #if defined(PETSC_HAVE_COMPLEX)
934: PetscInt nPetscComplex=0;
935: #endif
938: if (link->deviceinited) return(0);
939: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
940: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
941: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
942: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
943: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
944: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
945: #if defined(PETSC_HAVE_COMPLEX)
946: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
947: #endif
948: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
949: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
951: if (is2Int) {
952: PackInit_PairType<PairInt>(link);
953: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
954: PackInit_PairType<PairPetscInt>(link);
955: } else if (nPetscReal) {
956: if (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
957: else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
958: else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
959: else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
960: } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
961: if (nPetscInt == 8) PackInit_IntegerType<llint,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<llint,8,0>(link);
962: else if (nPetscInt == 4) PackInit_IntegerType<llint,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<llint,4,0>(link);
963: else if (nPetscInt == 2) PackInit_IntegerType<llint,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<llint,2,0>(link);
964: else if (nPetscInt == 1) PackInit_IntegerType<llint,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<llint,1,0>(link);
965: } else if (nInt) {
966: if (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
967: else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
968: else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
969: else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
970: } else if (nSignedChar) {
971: if (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
972: else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
973: else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
974: else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
975: } else if (nUnsignedChar) {
976: if (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
977: else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
978: else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
979: else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
980: #if defined(PETSC_HAVE_COMPLEX)
981: } else if (nPetscComplex) {
982: if (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
983: else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
984: else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
985: else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
986: #endif
987: } else {
988: MPI_Aint lb,nbyte;
989: MPI_Type_get_extent(unit,&lb,&nbyte);
990: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
991: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
992: if (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
993: else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
994: else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
995: } else {
996: nInt = nbyte / sizeof(int);
997: if (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
998: else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
999: else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
1000: else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
1001: }
1002: }
1004: if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
1005: int device;
1006: struct cudaDeviceProp props;
1007: cerr = cudaGetDevice(&device);CHKERRCUDA(cerr);
1008: cerr = cudaGetDeviceProperties(&props,device);CHKERRCUDA(cerr);
1009: sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor*props.multiProcessorCount;
1010: }
1011: link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;
1013: link->stream = PetscDefaultCudaStream;
1014: link->Destroy = PetscSFLinkDestroy_MPI_CUDA;
1015: link->SyncDevice = PetscSFLinkSyncDevice_CUDA;
1016: link->SyncStream = PetscSFLinkSyncStream_CUDA;
1017: link->Memcpy = PetscSFLinkMemcpy_CUDA;
1018: link->deviceinited = PETSC_TRUE;
1019: return(0);
1020: }