Actual source code: aijcusparseband.cu
1: /*
2: AIJCUSPARSE methods implemented with CUDA kernels. Uses cuSparse/Thrust maps from AIJCUSPARSE
3: */
4: #define PETSC_SKIP_SPINLOCK
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #undef VecType
11: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
12: #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 600 && PETSC_PKG_CUDA_VERSION_GE(11,0,0)
13: #define AIJBANDUSEGROUPS 1
14: #endif
15: #if defined(AIJBANDUSEGROUPS)
16: #include <cooperative_groups.h>
17: #endif
19: #define CHECK_LAUNCH_ERROR() \
20: do { \
21: /* Check synchronous errors, i.e. pre-launch */ \
22: cudaError_t err = cudaGetLastError(); \
23: if (cudaSuccess != err) { \
24: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
25: } \
26: /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
27: err = cudaDeviceSynchronize(); \
28: if (cudaSuccess != err) { \
29: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
30: } \
31: } while (0)
33: /*
34: LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)
36: requires:
37:   structurally symmetric: fix with transpose/column metadata
38: */
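/*
  Editorial sketch (not part of this source file): one way a user would select this band
  factorization from code. It assumes a CUDA build of PETSc, petscksp.h, and a KSP whose
  operator is a MATSEQAIJCUSPARSE matrix; the helper name is illustrative only. The
  options-database equivalent is
    -pc_type lu -pc_factor_mat_solver_type cusparseband -pc_factor_mat_ordering_type rcm -mat_no_inode
*/
static PetscErrorCode SelectCusparseBandLU_Sketch(KSP ksp)
{
  PC pc;

  KSPGetPC(ksp,&pc);
  PCSetType(pc,PCLU);
  PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSEBAND);
  PCFactorSetMatOrderingType(pc,MATORDERINGRCM);
  return(0);
}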
40: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
41: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
43: /*
44: The GPU LU factor kernel
45: */
46: __global__
47: void __launch_bounds__(1024,1)
48: mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
49: {
50: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
51: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
52: const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
54: // set i (row+1)
55: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
56: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
57: if (rowb < end_i && threadIdx.x==0) {
58: PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
59: bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
60: }
61: }
62: }
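/*
  Editorial sketch (not part of this source file): a host-side reference for the closed-form
  row offsets computed by the kernel above (single-field case, Nf = 1). Row r of the band
  stores min(r,bw) entries of L, the diagonal, and min(n-1-r,bw) entries of U, so the prefix
  sum of these counts should reproduce bi_csr[]. The helper name is illustrative only.
*/
static void band_row_offsets_reference(PetscInt n, PetscInt bw, PetscInt bi[])
{
  bi[0] = 0; /* dummy at zero, as in the kernel */
  for (PetscInt r = 0; r < n; r++) {
    const PetscInt nL = (r < bw) ? r : bw;         /* strictly-lower entries kept in the band */
    const PetscInt nU = (n-1-r < bw) ? n-1-r : bw; /* strictly-upper entries kept in the band */
    bi[r+1] = bi[r] + nL + 1 + nU;                 /* +1 for the diagonal */
  }
}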
63: // copy AIJ to AIJ_BAND
64: __global__
65: void __launch_bounds__(1024,1)
66: mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
67: const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
68: const int bi_csr[], PetscScalar ba_csr[])
69: {
70: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
71: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
72: const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
74: // zero B
75: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
76: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
77: if (rowb < end_i) {
78: PetscScalar *batmp = ba_csr + bi_csr[rowb];
79: const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
80: for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
81: if (j<nzb) {
82: batmp[j] = 0;
83: }
84: }
85: }
86: }
88: // copy A into B with CSR format -- these two loops can be fused
89: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
90: if (rowb < end_i) {
91: const PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
92: const int *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
93: const PetscScalar *av = aa_d + ai_d[rowa];
94: PetscScalar *batmp = ba_csr + bi_csr[rowb];
95: /* load in initial (unfactored row) */
96: for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
97: if (j<nza) {
98: PetscInt colb = ic[ajtmp[j]], idx = colb - bjStart;
99: PetscScalar vala = av[j];
100: batmp[idx] = vala;
101: }
102: }
103: }
104: }
105: }
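/*
  Editorial sketch (not part of this source file): the per-row scatter performed by the copy
  kernel above, as a host reference. A permuted AIJ row rowb lands in a zero-filled band row
  whose first stored column is max(rowb-bw,0), and each entry goes to offset ic[col]-bjStart.
  The helper name is illustrative only.
*/
static void scatter_aij_row_to_band_sketch(PetscInt rowb, PetscInt bw, PetscInt nza,
                                           const PetscInt ajtmp[], const PetscScalar av[],
                                           const PetscInt ic[], PetscScalar brow[])
{
  const PetscInt bjStart = (rowb > bw) ? rowb - bw : 0; /* first column stored in this band row */
  for (PetscInt j = 0; j < nza; j++) brow[ic[ajtmp[j]] - bjStart] = av[j];
}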
106: // print AIJ_BAND
107: __global__
108: void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
109: {
110: // debug
111: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) {
112: printf("B (AIJ) n=%d:\n",(int)n);
113: for (int rowb=0;rowb<n;rowb++) {
114: const PetscInt nz = bi_csr[rowb+1] - bi_csr[rowb];
115: const PetscScalar *batmp = ba_csr + bi_csr[rowb];
116: for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
117: printf(" bi=%d\n",bi_csr[rowb+1]);
118: }
119: }
120: }
121: // Band LU kernel --- ba_csr bi_csr
122: __global__
123: void __launch_bounds__(1024,1)
124: mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[], int *use_group_sync)
125: {
126: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
127: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
128: const PetscInt start = field*nloc, end = start + nloc;
129: #if defined(AIJBANDUSEGROUPS)
130: auto g = cooperative_groups::this_grid();
131: #endif
132: // A22 panel update for each row A(1,:) and col A(:,1)
133: for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
134:     PetscInt tnzUd = bw, maxU = end-1 - glbDD; // we are chopping off the band's triangular corners (the "ears")
135: const PetscInt nzUd = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // global to go past ears after first
136: PetscScalar *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
137: const PetscScalar *baUd = pBdd + 1; // vector of data U(i,i+1:end)
138: const PetscScalar Bdd = *pBdd;
139: const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
140: if (threadIdx.x==0) {
141: for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) { /* assuming symmetric structure */
142: const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
143: PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
144: *Aid = *Aid/Bdd;
145: }
146: }
147:     __syncthreads(); // sync on threadIdx.x only
148: for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) {
149: const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
150: PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
151: PetscScalar *Aij = Aid + 1;
152: const PetscScalar Lid = *Aid;
153: for (int jIdx=threadIdx.x ; jIdx<nzUd; jIdx += blockDim.x) {
154: Aij[jIdx] -= Lid*baUd[jIdx];
155: }
156: }
157: #if defined(AIJBANDUSEGROUPS)
158: if (use_group_sync) {
159: g.sync();
160: } else {
161: __syncthreads();
162: }
163: #else
164: __syncthreads();
165: #endif
166:   } /* end of for (glbDD=start; glbDD<end; glbDD++) */
167: }
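/*
  Editorial sketch (not part of this source file): the sequential algorithm the kernel above
  parallelizes, written on a dense n x n array rather than the packed band layout. For each
  pivot k (no pivoting), the column below the diagonal is scaled by 1/A[k][k] and the trailing
  block inside the band is updated; in the kernel the column scale is done by the threadIdx.x==0
  threads and the trailing update by the whole block, with a grid (or block) sync between pivots.
*/
static void band_lu_reference(PetscInt n, PetscInt bw, PetscScalar **A)
{
  for (PetscInt k = 0; k < n; k++) {
    const PetscInt iend = (k + bw < n-1) ? k + bw : n-1;                  /* band limit past the pivot */
    for (PetscInt i = k+1; i <= iend; i++) A[i][k] /= A[k][k];            /* form L(:,k) */
    for (PetscInt i = k+1; i <= iend; i++) {
      for (PetscInt j = k+1; j <= iend; j++) A[i][j] -= A[i][k]*A[k][j];  /* trailing update */
    }
  }
}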
169: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
170: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
171: {
172: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
173: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
174: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
175: Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
176: Mat_SeqAIJCUSPARSEMultStruct *matstructA;
177: CsrMatrix *matrixA;
178: PetscErrorCode ierr;
179: cudaError_t cerr;
180: const PetscInt n=A->rmap->n, *ic, *r;
181: const int *ai_d, *aj_d;
182: const PetscScalar *aa_d;
183: PetscScalar *ba_t = cusparseTriFactors->a_band_d;
184: int *bi_t = cusparseTriFactors->i_band_d;
185: PetscContainer container;
186: int Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;
189: if (A->rmap->n == 0) {
190: return(0);
191: }
192: // cusparse setup
193: if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
194: matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; // matstruct->cprowIndices
195: if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
196: matrixA = (CsrMatrix*)matstructA->mat;
197: if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");
199: // factor: get Nf if available
200: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
201: if (container) {
202: PetscInt *pNf=NULL;
203: PetscContainerGetPointer(container, (void **) &pNf);
204: Nf = (*pNf)%1000;
205: if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
206: } else Nf = 1;
207: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
209: // get data
210: ic = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
211: ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
212: aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
213: aa_d = thrust::raw_pointer_cast(matrixA->values->data().get());
214: r = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());
216: cerr = WaitForCUDA();CHKERRCUDA(cerr);
217: PetscLogGpuTimeBegin();
218: {
219: int bw = (int)(2.*(double)n-1. - (double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
220: #if !defined(AIJBANDUSEGROUPS)
221: Ni = 1/nconcurrent;
222: Ni = 1;
223: #else
224: if (!cusparseTriFactors->init_dev_prop) {
225: int gpuid;
226: cusparseTriFactors->init_dev_prop = PETSC_TRUE;
227: cudaGetDevice(&gpuid);
228: cudaGetDeviceProperties(&cusparseTriFactors->dev_prop, gpuid);
229: }
230: nsm = cusparseTriFactors->dev_prop.multiProcessorCount;
231: Ni = nsm/Nf/nconcurrent;
232: #endif
233: team_size = bw/Ni + !!(bw%Ni);
234: nVec = PetscMin(bw, 1024/team_size);
235:     PetscInfo7(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d, thread group size = %d,%d\n",bw,Ni,nconcurrent,Nf,nsm,team_size,nVec);
236: {
237: dim3 dimBlockTeam(nVec,team_size);
238: dim3 dimBlockLeague(Nf,Ni);
239: mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
240: CHECK_LAUNCH_ERROR(); // does a sync
241: #if defined(AIJBANDUSEGROUPS)
242: if (Ni > 1) {
243: void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t, (void*)&nsm };
244: cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, 0, NULL);
245: } else {
246: mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
247: }
248: #else
249: mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
250: #endif
251: CHECK_LAUNCH_ERROR(); // does a sync
252: #if defined(PETSC_USE_LOG)
253: PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(PetscLogDouble)(2*bm1 + 1)/3 + (PetscLogDouble)2*(nl-bw)*bw*bw + (PetscLogDouble)nl*(nl+1)/2));
254: #endif
255: }
256: }
257: PetscLogGpuTimeEnd();
258:   /* determine which version of MatSolve needs to be used (adapted from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE) */
259: B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
260: B->ops->solvetranspose = NULL; // need transpose
261: B->ops->matsolve = NULL;
262: B->ops->matsolvetranspose = NULL;
263: return(0);
264: }
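/*
  Editorial sketch (not part of this source file): the bandwidth/nonzero relation used above
  and in MatSolve below. The symbolic phase stores nz = n + (2n-1)*bw - bw^2 entries (a full
  (2*bw+1)-wide band minus the two triangular "ears"), and the numeric and solve phases invert
  that with the quadratic formula. The helpers below mirror those two formulas and should
  round-trip for typical sizes; the names are illustrative only.
*/
static PetscInt band_nnz_from_bw(PetscInt n, PetscInt bw)
{
  return n + (2*n - 1)*bw - bw*bw; /* nzBcsr in MatLUFactorSymbolic_SeqAIJCUSPARSEBAND */
}

static PetscInt band_bw_from_nnz(PetscInt n, PetscInt nz)
{
  return (PetscInt)(2.*(double)n - 1. - (double)(PetscSqrtReal(1. + 4.*((double)n*(double)n - (double)nz)) + PETSC_MACHINE_EPSILON))/2;
}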
266: static PetscErrorCode MatrixNfDestroy(void *ptr)
267: {
268: PetscInt *nf = (PetscInt *)ptr;
269: PetscErrorCode ierr;
271: PetscFree(nf);
272: return(0);
273: }
275: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
276: {
277: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b;
278: IS isicol;
279: PetscErrorCode ierr;
280: cudaError_t cerr;
281: const PetscInt *ic,*ai=a->i,*aj=a->j;
282: PetscScalar *ba_t;
283: int *bi_t;
284: PetscInt i,n=A->rmap->n,Nf;
285: PetscInt nzBcsr,bwL,bwU;
286: PetscBool missing;
287: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
288: PetscContainer container;
291: if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
292: MatMissingDiagonal(A,&missing,&i);
293: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
294: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
295: MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);
296:   if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices are supported");
298: // factor: get Nf if available
299: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
300: if (container) {
301: PetscInt *pNf=NULL;
302: PetscContainerGetPointer(container, (void **) &pNf);
303: Nf = (*pNf)%1000;
304: PetscContainerCreate(PETSC_COMM_SELF, &container);
305: PetscMalloc(sizeof(PetscInt), &pNf);
306: *pNf = Nf;
307: PetscContainerSetPointer(container, (void *)pNf);
308: PetscContainerSetUserDestroy(container, MatrixNfDestroy);
309: PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);
310: PetscContainerDestroy(&container);
311: } else Nf = 1;
312: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
314: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
315: ISGetIndices(isicol,&ic);
317: MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
318: PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);
319: b = (Mat_SeqAIJ*)(B)->data;
321: /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
322: bwL = bwU = 0;
323: for (int rwb=0; rwb<n; rwb++) {
324: const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
325: for (int j=0;j<anz;j++) {
326: PetscInt colb = ic[ajtmp[j]];
327: if (colb<rwa) { // L
328: if (rwa-colb > bwL) bwL = rwa-colb;
329: } else {
330: if (colb-rwa > bwU) bwU = colb-rwa;
331: }
332: }
333: }
334: ISRestoreIndices(isicol,&ic);
335:   /* only structurally symmetric matrices are supported for now, though unequal bandwidths might work */
336: if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
337: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
338: nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
339: b->maxnz = b->nz = nzBcsr;
340:   cusparseTriFactors->nnz = b->nz; // only metadata needed: n & nz
341: PetscInfo2(A,"Matrix Bandwidth = %D, nnz = %D\n",bwL,b->nz);
342: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
343: cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
344: cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
345: cusparseTriFactors->a_band_d = ba_t;
346: cusparseTriFactors->i_band_d = bi_t;
347: /* In b structure: Free imax, ilen, old a, old j. Allocate solve_work, new a, new j */
348: PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));
349: {
350: dim3 dimBlockTeam(1,128);
351: dim3 dimBlockLeague(Nf,1);
352: mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
353: }
354: CHECK_LAUNCH_ERROR(); // does a sync
356: // setup data
357: if (!cusparseTriFactors->rpermIndices) {
358: const PetscInt *r;
360: ISGetIndices(isrow,&r);
361: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
362: cusparseTriFactors->rpermIndices->assign(r, r+n);
363: ISRestoreIndices(isrow,&r);
364: PetscLogCpuToGpu(n*sizeof(PetscInt));
365: }
366:   /* column permutation indices */
367: if (!cusparseTriFactors->cpermIndices) {
368: const PetscInt *c;
370: ISGetIndices(isicol,&c);
371: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
372: cusparseTriFactors->cpermIndices->assign(c, c+n);
373: ISRestoreIndices(isicol,&c);
374: PetscLogCpuToGpu(n*sizeof(PetscInt));
375: }
377: /* put together the new matrix */
378: b->free_a = PETSC_FALSE;
379: b->free_ij = PETSC_FALSE;
380: b->singlemalloc = PETSC_FALSE;
381: b->ilen = NULL;
382: b->imax = NULL;
383: b->row = isrow;
384: b->col = iscol;
385: PetscObjectReference((PetscObject)isrow);
386: PetscObjectReference((PetscObject)iscol);
387: b->icol = isicol;
388: PetscMalloc1(n+1,&b->solve_work);
390: B->factortype = MAT_FACTOR_LU;
391: B->info.factor_mallocs = 0;
392: B->info.fill_ratio_given = 0;
394: if (ai[n]) {
395: B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
396: } else {
397: B->info.fill_ratio_needed = 0.0;
398: }
399: #if defined(PETSC_USE_INFO)
400: if (ai[n] != 0) {
401: PetscReal af = B->info.fill_ratio_needed;
402: PetscInfo1(A,"Band fill ratio %g\n",(double)af);
403: } else {
404: PetscInfo(A,"Empty matrix\n");
405: }
406: #endif
407: if (a->inode.size) {
408: PetscInfo(A,"Warning: using inodes in band solver.\n");
409: }
410: MatSeqAIJCheckInode_FactorLU(B);
411: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
412: B->offloadmask = PETSC_OFFLOAD_GPU;
414: return(0);
415: }
417: /* Use -pc_factor_mat_solver_type cusparseband */
418: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
419: {
421: *type = MATSOLVERCUSPARSEBAND;
422: return(0);
423: }
425: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
426: {
428: PetscInt n = A->rmap->n;
431: MatCreate(PetscObjectComm((PetscObject)A),B);
432: MatSetSizes(*B,n,n,n,n);
433: (*B)->factortype = ftype;
434: (*B)->canuseordering = PETSC_TRUE;
435: MatSetType(*B,MATSEQAIJCUSPARSE);
437: if (ftype == MAT_FACTOR_LU) {
438: MatSetBlockSizesFromMats(*B,A,A);
439: (*B)->ops->ilufactorsymbolic = NULL; // MatILUFactorSymbolic_SeqAIJCUSPARSE;
440: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
441: PetscStrallocpy(MATORDERINGRCM,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
442: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");
444: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
445: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);
446: return(0);
447: }
449: #define WARP_SIZE 32
450: template <typename T>
451: __forceinline__ __device__
452: T wreduce(T a)
453: {
454: T b;
455: #pragma unroll
456: for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
457: b = __shfl_down_sync(0xffffffff, a, i);
458: a += b;
459: }
460: return a;
461: }
462: // reduce in a block, returns result in thread 0
463: template <typename T, int BLOCK_SIZE>
464: __device__
465: T breduce(T a)
466: {
467: constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
468: __shared__ double buf[NWARP];
469: int wid = threadIdx.x / WARP_SIZE;
470: int laneid = threadIdx.x % WARP_SIZE;
471: T b = wreduce<T>(a);
472: if (laneid == 0)
473: buf[wid] = b;
474: __syncthreads();
475: if (wid == 0) {
476: if (threadIdx.x < NWARP)
477: a = buf[threadIdx.x];
478: else
479: a = 0;
480: for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
481: a += __shfl_down_sync(0xffffffff, a, i);
482: }
483: }
484: return a;
485: }
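/*
  Editorial sketch (not part of this source file): how the block reduction above is meant to be
  used. Every thread of the block contributes a partial value, all threads call breduce with the
  compile-time block size, and the block total is valid in threadIdx.x == 0 only. Assumes the
  kernel is launched with blockDim.x == 128; the kernel name and output array are illustrative.
*/
__global__ void block_sum_sketch(const PetscReal v[], int n, PetscReal out[])
{
  PetscReal a = 0;
  for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x) a += v[i];
  a = breduce<PetscReal,128>(a);             /* result valid in threadIdx.x == 0 only */
  if (threadIdx.x == 0) out[blockIdx.x] = a;
}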
487: // Band LU solve kernel --- ba_csr bi_csr
488: template <int BLOCK_SIZE>
489: __global__
490: void __launch_bounds__(256,1)
491: mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
492: {
493: const PetscInt Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
494: const PetscScalar *pLi;
495: const int tid = threadIdx.x;
497: /* Next, solve L */
498: pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
499: for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
500: const PetscInt col = locDD<bw ? start : (glbDD-bw);
501: PetscScalar t = 0;
502: for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
503: t += pLi[idx]*x[j];
504: }
505: #if defined(PETSC_USE_COMPLEX)
506: PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
507: PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
508: t = tt;
509: #else
510: t = breduce<PetscReal,BLOCK_SIZE>(t);
511: #endif
512: if (threadIdx.x == 0)
513: x[glbDD] -= t; // /1.0
514: __syncthreads();
515: // inc
516: pLi += glbDD-col; // get to diagonal
517: if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
518: else pLi += bw;
519: pLi += 1; // skip to next row
520: if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
521: }
522: /* Then, solve U */
523: pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
524: if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
526: for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
527: const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
528: PetscScalar t = 0;
529: for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
530: t += pLi[-idx]*x[j];
531: }
532: #if defined(PETSC_USE_COMPLEX)
533: PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
534: PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
535: t = tt;
536: #else
537: t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
538: #endif
539: pLi -= col-glbDD; // diagonal
540: if (threadIdx.x == 0) {
541: x[glbDD] -= t;
542: x[glbDD] /= pLi[0];
543: }
544: __syncthreads();
545: // inc past L to start of previous U
546: pLi -= bw+1;
547: if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
548: if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
549: }
550: }
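/*
  Editorial sketch (not part of this source file): the sequential substitution the solve kernel
  above parallelizes, written on a dense factored matrix (unit-diagonal L strictly below the
  diagonal, U on and above it) rather than the packed band layout. Forward substitution touches
  at most bw entries left of the diagonal and back substitution at most bw entries right of it;
  in the kernel each of these dot products is split across threadIdx.x and combined with breduce.
*/
static void band_solve_reference(PetscInt n, PetscInt bw, PetscScalar **A, PetscScalar x[])
{
  for (PetscInt i = 0; i < n; i++) {            /* solve L y = b (b arrives in x) */
    const PetscInt j0 = (i > bw) ? i - bw : 0;
    for (PetscInt j = j0; j < i; j++) x[i] -= A[i][j]*x[j];
  }
  for (PetscInt i = n-1; i >= 0; i--) {         /* solve U x = y */
    const PetscInt j1 = (i + bw < n-1) ? i + bw : n-1;
    for (PetscInt j = i+1; j <= j1; j++) x[i] -= A[i][j]*x[j];
    x[i] /= A[i][i];
  }
}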
552: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
553: {
554: const PetscScalar *barray;
555: PetscScalar *xarray;
556: thrust::device_ptr<const PetscScalar> bGPU;
557: thrust::device_ptr<PetscScalar> xGPU;
558: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
559: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
560: PetscInt n=A->rmap->n, nz=cusparseTriFactors->nnz, Nf;
561:   PetscInt bw = (int)(2.*(double)n-1.-(double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)nz))+PETSC_MACHINE_EPSILON))/2; // quadratic formula for bandwidth
562: PetscErrorCode ierr;
563: PetscContainer container;
566: if (A->rmap->n == 0) {
567: return(0);
568: }
569: // factor: get Nf if available
570: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
571: if (container) {
572: PetscInt *pNf=NULL;
573: PetscContainerGetPointer(container, (void **) &pNf);
574: Nf = (*pNf)%1000;
575: } else Nf = 1;
576: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n(%D) % Nf(%D) != 0",n,Nf);
578: /* Get the GPU pointers */
579: VecCUDAGetArrayWrite(xx,&xarray);
580: VecCUDAGetArrayRead(bb,&barray);
581: xGPU = thrust::device_pointer_cast(xarray);
582: bGPU = thrust::device_pointer_cast(barray);
584: PetscLogGpuTimeBegin();
585: /* First, reorder with the row permutation */
586: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
587: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
588: tempGPU->begin());
589: constexpr int block = 128;
590: mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
591: CHECK_LAUNCH_ERROR(); // does a sync
593: /* Last, reorder with the column permutation */
594: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
595: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
596: xGPU);
598: VecCUDARestoreArrayRead(bb,&barray);
599: VecCUDARestoreArrayWrite(xx,&xarray);
600: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
601: PetscLogGpuTimeEnd();
603: return(0);
604: }