Actual source code: baijov.c
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4: and to find submatrices that are shared across processors.
5: */
6: #include <../src/mat/impls/baij/mpi/mpibaij.h>
7: #include <petscbt.h>
9: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11: extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
15: {
17: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
18: IS *is_new;
21: PetscMalloc1(imax,&is_new);
22: /* Convert the indices into block format */
23: ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);
24: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");
25: for (i=0; i<ov; ++i) {
26: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
27: }
28: for (i=0; i<imax; i++) {ISDestroy(&is[i]);}
29: ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);
30: for (i=0; i<imax; i++) {ISDestroy(&is_new[i]);}
31: PetscFree(is_new);
32: return(0);
33: }
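/*
   A minimal usage sketch (illustration only, not part of this source) of how the routine
   above is reached through the public MatIncreaseOverlap() interface; A and the stride IS
   are placeholders assumed to exist on the caller's side.

     Mat      A;                  // an assembled MATMPIBAIJ matrix
     IS       is;
     PetscInt rstart,rend;

     MatGetOwnershipRange(A,&rstart,&rend);
     ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&is);  // the locally owned rows
     MatIncreaseOverlap(A,1,&is,2);                             // grow the index set by an overlap of 2
     ISDestroy(&is);
*/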
35: /*
36: Sample message format:
37: If a processor A wants processor B to process some elements corresponding
38: to index sets is[1], is[5]
39: mesg [0] = 2 (no of index sets in the mesg)
40: -----------
41: mesg [1] = 1 => is[1]
42: mesg [2] = sizeof(is[1]);
43: -----------
44: mesg [5] = 5 => is[5]
45: mesg [6] = sizeof(is[5]);
46: -----------
47: mesg [7]
48: mesg [n] data(is[1])
49: -----------
50: mesg[n+1]
51: mesg[m] data(is[5])
52: -----------
54: Notes:
55: nrqs - no of requests sent (or to be sent out)
56: nrqr - no of requests received (which have to be or which have been processed)
57: */
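/*
   A concrete illustration (made-up numbers, not taken from a run) of the layout above:
   if processor A asks processor B to process 3 rows of is[1] and 2 rows of is[5], the
   outgoing integer buffer is

     mesg = { 2,              number of index sets in the message
              1, 3,           is[1] contributes 3 rows
              5, 2,           is[5] contributes 2 rows
              r0, r1, r2,     the 3 block-row indices belonging to is[1]
              r3, r4 }        the 2 block-row indices belonging to is[5]

   so the message length is 1 + 2*(number of index sets) + (number of row indices),
   which is exactly the w1[] computation done below.
*/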
58: PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
59: {
60: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
61: const PetscInt **idx,*idx_i;
62: PetscInt *n,*w3,*w4,**data,len;
64: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
65: PetscInt Mbs,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
66: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
67: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2,proc=-1;
68: PetscBT *table;
69: MPI_Comm comm,*iscomms;
70: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
71: char *t_p;
74: PetscObjectGetComm((PetscObject)C,&comm);
75: size = c->size;
76: rank = c->rank;
77: Mbs = c->Mbs;
79: PetscObjectGetNewTag((PetscObject)C,&tag1);
80: PetscObjectGetNewTag((PetscObject)C,&tag2);
82: PetscMalloc2(imax+1,(PetscInt***)&idx,imax,&n);
84: for (i=0; i<imax; i++) {
85: ISGetIndices(is[i],&idx[i]);
86: ISGetLocalSize(is[i],&n[i]);
87: }
89: /* evaluate communication - mesg to whom, length of mesg, and buffer space
90: required. Based on this, buffers are allocated, and data copied into them */
91: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
92: for (i=0; i<imax; i++) {
93: PetscArrayzero(w4,size); /* initialise work vector*/
94: idx_i = idx[i];
95: len = n[i];
96: for (j=0; j<len; j++) {
97: row = idx_i[j];
98: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
99: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
100: w4[proc]++;
101: }
102: for (j=0; j<size; j++) {
103: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
104: }
105: }
107: nrqs = 0; /* no of outgoing messages */
108: msz = 0; /* total mesg length (for all procs) */
109: w1[rank] = 0; /* no mesg sent to itself */
110: w3[rank] = 0;
111: for (i=0; i<size; i++) {
112: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
113: }
114: /* pa - is list of processors to communicate with */
115: PetscMalloc1(nrqs,&pa);
116: for (i=0,j=0; i<size; i++) {
117: if (w1[i]) {pa[j] = i; j++;}
118: }
120: /* Each message would have a header = 1 + 2*(no of IS) + data */
121: for (i=0; i<nrqs; i++) {
122: j = pa[i];
123: w1[j] += w2[j] + 2*w3[j];
124: msz += w1[j];
125: }
127: /* Determine the number of messages to expect, their lengths, and the from-ids */
128: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
129: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
131: /* Now post the Irecvs corresponding to these messages */
132: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
134: /* Allocate Memory for outgoing messages */
135: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
136: PetscArrayzero(outdat,size);
137: PetscArrayzero(ptr,size);
138: {
139: PetscInt *iptr = tmp,ict = 0;
140: for (i=0; i<nrqs; i++) {
141: j = pa[i];
142: iptr += ict;
143: outdat[j] = iptr;
144: ict = w1[j];
145: }
146: }
148: /* Form the outgoing messages */
149: /*plug in the headers*/
150: for (i=0; i<nrqs; i++) {
151: j = pa[i];
152: outdat[j][0] = 0;
153: PetscArrayzero(outdat[j]+1,2*w3[j]);
154: ptr[j] = outdat[j] + 2*w3[j] + 1;
155: }
157: /* Memory for doing local proc's work*/
158: {
159: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);
161: for (i=0; i<imax; i++) {
162: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
163: data[i] = d_p + (Mbs)*i;
164: }
165: }
167: /* Parse the IS and update local tables and the outgoing buf with the data*/
168: {
169: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
170: PetscBT table_i;
172: for (i=0; i<imax; i++) {
173: PetscArrayzero(ctr,size);
174: n_i = n[i];
175: table_i = table[i];
176: idx_i = idx[i];
177: data_i = data[i];
178: isz_i = isz[i];
179: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
180: row = idx_i[j];
181: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
182: if (proc != rank) { /* copy to the outgoing buffer */
183: ctr[proc]++;
184: *ptr[proc] = row;
185: ptr[proc]++;
186: } else { /* Update the local table */
187: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
188: }
189: }
190: /* Update the headers for the current IS */
191: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
192: if ((ctr_j = ctr[j])) {
193: outdat_j = outdat[j];
194: k = ++outdat_j[0];
195: outdat_j[2*k] = ctr_j;
196: outdat_j[2*k-1] = i;
197: }
198: }
199: isz[i] = isz_i;
200: }
201: }
203: /* Now post the sends */
204: PetscMalloc1(nrqs,&s_waits1);
205: for (i=0; i<nrqs; ++i) {
206: j = pa[i];
207: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
208: }
210: /* No longer need the original indices*/
211: for (i=0; i<imax; ++i) {
212: ISRestoreIndices(is[i],idx+i);
213: }
214: PetscFree2(*(PetscInt***)&idx,n);
216: PetscMalloc1(imax,&iscomms);
217: for (i=0; i<imax; ++i) {
218: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
219: ISDestroy(&is[i]);
220: }
222: /* Do Local work*/
223: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
225: /* Receive messages*/
226: MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);
227: MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
229: /* Phase 1 sends are complete - deallocate buffers */
230: PetscFree4(outdat,ptr,tmp,ctr);
231: PetscFree4(w1,w2,w3,w4);
233: PetscMalloc1(nrqr,&xdata);
234: PetscMalloc1(nrqr,&isz1);
235: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
236: if (rbuf) {
237: PetscFree(rbuf[0]);
238: PetscFree(rbuf);
239: }
241: /* Send the data back*/
242: /* Do a global reduction to know the buffer space req for incoming messages*/
243: {
244: PetscMPIInt *rw1;
246: PetscCalloc1(size,&rw1);
248: for (i=0; i<nrqr; ++i) {
249: proc = onodes1[i];
250: rw1[proc] = isz1[i];
251: }
253: /* Determine the number of messages to expect, their lengths, and the from-ids */
254: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
255: PetscFree(rw1);
256: }
257: /* Now post the Irecvs corresponding to these messages */
258: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
260: /* Now post the sends */
261: PetscMalloc1(nrqr,&s_waits2);
262: for (i=0; i<nrqr; ++i) {
263: j = onodes1[i];
264: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
265: }
267: PetscFree(onodes1);
268: PetscFree(olengths1);
270: /* receive work done on other processors*/
271: {
272: PetscMPIInt idex;
273: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
274: PetscBT table_i;
276: for (i=0; i<nrqs; ++i) {
277: MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);
278: /* Process the message*/
279: rbuf2_i = rbuf2[idex];
280: ct1 = 2*rbuf2_i[0]+1;
281: jmax = rbuf2[idex][0];
282: for (j=1; j<=jmax; j++) {
283: max = rbuf2_i[2*j];
284: is_no = rbuf2_i[2*j-1];
285: isz_i = isz[is_no];
286: data_i = data[is_no];
287: table_i = table[is_no];
288: for (k=0; k<max; k++,ct1++) {
289: row = rbuf2_i[ct1];
290: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
291: }
292: isz[is_no] = isz_i;
293: }
294: }
295: MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
296: }
298: for (i=0; i<imax; ++i) {
299: ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
300: PetscCommDestroy(&iscomms[i]);
301: }
303: PetscFree(iscomms);
304: PetscFree(onodes2);
305: PetscFree(olengths2);
307: PetscFree(pa);
308: if (rbuf2) {
309: PetscFree(rbuf2[0]);
310: PetscFree(rbuf2);
311: }
312: PetscFree(s_waits1);
313: PetscFree(r_waits1);
314: PetscFree(s_waits2);
315: PetscFree(r_waits2);
316: PetscFree5(table,data,isz,d_p,t_p);
317: if (xdata) {
318: PetscFree(xdata[0]);
319: PetscFree(xdata);
320: }
321: PetscFree(isz1);
322: return(0);
323: }
325: /*
326: MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
327: the work on the local processor.
329: Inputs:
330: C - MAT_MPIBAIJ;
331: imax - total no of index sets processed at a time;
332: table - an array of PetscBT bit tables, one per index set, each of size Mbs bits.
334: Output:
335: isz - array containing the count of the solution elements corresponding
336: to each index set;
337: data - pointer to the solutions
338: */
339: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
340: {
341: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
342: Mat A = c->A,B = c->B;
343: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
344: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
345: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
346: PetscBT table_i;
349: rstart = c->rstartbs;
350: cstart = c->cstartbs;
351: ai = a->i;
352: aj = a->j;
353: bi = b->i;
354: bj = b->j;
355: garray = c->garray;
357: for (i=0; i<imax; i++) {
358: data_i = data[i];
359: table_i = table[i];
360: isz_i = isz[i];
361: for (j=0,max=isz[i]; j<max; j++) {
362: row = data_i[j] - rstart;
363: start = ai[row];
364: end = ai[row+1];
365: for (k=start; k<end; k++) { /* Amat */
366: val = aj[k] + cstart;
367: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
368: }
369: start = bi[row];
370: end = bi[row+1];
371: for (k=start; k<end; k++) { /* Bmat */
372: val = garray[bj[k]];
373: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
374: }
375: }
376: isz[i] = isz_i;
377: }
378: return(0);
379: }
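/*
   A minimal sketch (illustration only, not part of this source) of the bit-table pattern
   used in MatIncreaseOverlap_MPIBAIJ_Local() above: PetscBTLookupSet() returns the previous
   state of the bit, so a value is appended to the output list only the first time it is seen.

     PetscBT  seen;
     PetscInt vals[] = {3,7,3,1,7},uniq[5],nuniq = 0,i;

     PetscBTCreate(10,&seen);                                        // bit table over 0..9
     for (i=0; i<5; i++) {
       if (!PetscBTLookupSet(seen,vals[i])) uniq[nuniq++] = vals[i]; // first occurrence only
     }
     // uniq = {3,7,1}, nuniq = 3
     PetscBTDestroy(&seen);
*/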
380: /*
381: MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
382: and return the output
384: Input:
385: C - the matrix
386: nrqr - no of messages being processed.
387: rbuf - an array of pointers to the received requests
389: Output:
390: xdata - array of messages to be sent back
391: isz1 - size of each message
393: For better efficiency perhaps we should malloc each xdata[i] separately;
394: then if a reallocation is required we need only copy the data for that one
395: message rather than for all previous messages, as happens now where a single
396: large chunk of memory is used.
398: */
399: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
400: {
401: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
402: Mat A = c->A,B = c->B;
403: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
405: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
406: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
407: PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
408: PetscInt *rbuf_i,kmax,rbuf_0;
409: PetscBT xtable;
412: Mbs = c->Mbs;
413: rstart = c->rstartbs;
414: cstart = c->cstartbs;
415: ai = a->i;
416: aj = a->j;
417: bi = b->i;
418: bj = b->j;
419: garray = c->garray;
421: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
422: rbuf_i = rbuf[i];
423: rbuf_0 = rbuf_i[0];
424: ct += rbuf_0;
425: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
426: }
428: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
429: else max1 = 1;
430: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
431: if (nrqr) {
432: PetscMalloc1(mem_estimate,&xdata[0]);
433: ++no_malloc;
434: }
435: PetscBTCreate(Mbs,&xtable);
436: PetscArrayzero(isz1,nrqr);
438: ct3 = 0;
439: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
440: rbuf_i = rbuf[i];
441: rbuf_0 = rbuf_i[0];
442: ct1 = 2*rbuf_0+1;
443: ct2 = ct1;
444: ct3 += ct1;
445: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
446: PetscBTMemzero(Mbs,xtable);
447: oct2 = ct2;
448: kmax = rbuf_i[2*j];
449: for (k=0; k<kmax; k++,ct1++) {
450: row = rbuf_i[ct1];
451: if (!PetscBTLookupSet(xtable,row)) {
452: if (!(ct3 < mem_estimate)) {
453: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
454: PetscMalloc1(new_estimate,&tmp);
455: PetscArraycpy(tmp,xdata[0],mem_estimate);
456: PetscFree(xdata[0]);
457: xdata[0] = tmp;
458: mem_estimate = new_estimate; ++no_malloc;
459: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
460: }
461: xdata[i][ct2++] = row;
462: ct3++;
463: }
464: }
465: for (k=oct2,max2=ct2; k<max2; k++) {
466: row = xdata[i][k] - rstart;
467: start = ai[row];
468: end = ai[row+1];
469: for (l=start; l<end; l++) {
470: val = aj[l] + cstart;
471: if (!PetscBTLookupSet(xtable,val)) {
472: if (!(ct3 < mem_estimate)) {
473: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
474: PetscMalloc1(new_estimate,&tmp);
475: PetscArraycpy(tmp,xdata[0],mem_estimate);
476: PetscFree(xdata[0]);
477: xdata[0] = tmp;
478: mem_estimate = new_estimate; ++no_malloc;
479: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
480: }
481: xdata[i][ct2++] = val;
482: ct3++;
483: }
484: }
485: start = bi[row];
486: end = bi[row+1];
487: for (l=start; l<end; l++) {
488: val = garray[bj[l]];
489: if (!PetscBTLookupSet(xtable,val)) {
490: if (!(ct3 < mem_estimate)) {
491: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
492: PetscMalloc1(new_estimate,&tmp);
493: PetscArraycpy(tmp,xdata[0],mem_estimate);
494: PetscFree(xdata[0]);
495: xdata[0] = tmp;
496: mem_estimate = new_estimate; ++no_malloc;
497: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
498: }
499: xdata[i][ct2++] = val;
500: ct3++;
501: }
502: }
503: }
504: /* Update the header*/
505: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
506: xdata[i][2*j-1] = rbuf_i[2*j-1];
507: }
508: xdata[i][0] = rbuf_0;
509: if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2;
510: isz1[i] = ct2; /* size of each message */
511: }
512: PetscBTDestroy(&xtable);
513: PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
514: return(0);
515: }
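/*
   A rough sketch (assumption, not an implemented alternative) of the per-message allocation
   suggested in the note above MatIncreaseOverlap_MPIBAIJ_Receive(): give each xdata[i] its
   own buffer sized from the corresponding request, so that growing one message never copies
   the data of the others. The 3*(sz+1) sizing loosely mirrors the heuristic already used
   above and would still need the same grow-and-copy fallback if exceeded.

     for (i=0; i<nrqr; i++) {
       PetscInt sz = 0;
       for (j=1; j<=rbuf[i][0]; j++) sz += rbuf[i][2*j];   // rows requested in message i
       PetscMalloc1(3*(sz+1),&xdata[i]);                   // rough per-message estimate
     }
*/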
517: PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
518: {
519: IS *isrow_block,*iscol_block;
520: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
522: PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
523: Mat_SeqBAIJ *subc;
524: Mat_SubSppt *smat;
527: /* The compression and expansion should be avoided. It does not flag
528: errors and might change the indices, hence buggy */
529: PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block);
530: ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block);
531: ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block);
533: /* Determine the number of stages through which submatrices are done */
534: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
535: else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
536: if (!nmax) nmax = 1;
538: if (scall == MAT_INITIAL_MATRIX) {
539: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
541: /* Make sure every processor loops through the nstages */
542: MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
544: /* Allocate memory to hold all the submatrices and dummy submatrices */
545: PetscCalloc1(ismax+nstages,submat);
546: } else { /* MAT_REUSE_MATRIX */
547: if (ismax) {
548: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
549: smat = subc->submatis1;
550: } else { /* (*submat)[0] is a dummy matrix */
551: smat = (Mat_SubSppt*)(*submat)[0]->data;
552: }
553: if (!smat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"MatCreateSubMatrices(...,MAT_REUSE_MATRIX,...) requires submat");
554: nstages = smat->nstages;
555: }
557: for (i=0,pos=0; i<nstages; i++) {
558: if (pos+nmax <= ismax) max_no = nmax;
559: else if (pos >= ismax) max_no = 0;
560: else max_no = ismax-pos;
562: MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos);
563: if (!max_no) {
564: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
565: smat = (Mat_SubSppt*)(*submat)[pos]->data;
566: smat->nstages = nstages;
567: }
568: pos++; /* advance to next dummy matrix if any */
569: } else pos += max_no;
570: }
572: if (scall == MAT_INITIAL_MATRIX && ismax) {
573: /* save nstages for reuse */
574: subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
575: smat = subc->submatis1;
576: smat->nstages = nstages;
577: }
579: for (i=0; i<ismax; i++) {
580: ISDestroy(&isrow_block[i]);
581: ISDestroy(&iscol_block[i]);
582: }
583: PetscFree2(isrow_block,iscol_block);
584: return(0);
585: }
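/*
   A minimal usage sketch (illustration only, not part of this source) of how the routine
   above is reached through the public MatCreateSubMatrices() interface; C, isrow and iscol
   are placeholders assumed to exist on the caller's side, with the index sets selecting
   whole blocks (the routine compresses them to block indices).

     Mat C;                    // an assembled MATMPIBAIJ matrix
     Mat *submats;
     IS  isrow,iscol;          // row/column index sets selecting whole blocks

     MatCreateSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);
     // change the numerical values of C (same nonzero pattern), then reuse the layout:
     MatCreateSubMatrices(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submats);
     MatDestroySubMatrices(1,&submats);
*/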
587: #if defined(PETSC_USE_CTABLE)
588: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
589: {
590: PetscInt nGlobalNd = proc_gnode[size];
591: PetscMPIInt fproc;
595: PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);
596: if (fproc > size) fproc = size;
597: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
598: if (row < proc_gnode[fproc]) fproc--;
599: else fproc++;
600: }
601: *rank = fproc;
602: return(0);
603: }
604: #endif
606: /* -------------------------------------------------------------------------*/
607: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
608: PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
609: {
610: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
611: Mat A = c->A;
612: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc;
613: const PetscInt **icol,**irow;
614: PetscInt *nrow,*ncol,start;
616: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
617: PetscInt **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
618: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
619: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
620: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
621: #if defined(PETSC_USE_CTABLE)
622: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
623: #else
624: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
625: #endif
626: const PetscInt *irow_i,*icol_i;
627: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
628: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
629: MPI_Request *r_waits4,*s_waits3,*s_waits4;
630: MPI_Comm comm;
631: PetscScalar **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a=NULL,*sbuf_aa_i;
632: PetscMPIInt *onodes1,*olengths1,end;
633: PetscInt **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i;
634: Mat_SubSppt *smat_i;
635: PetscBool *issorted,colflag,iscsorted=PETSC_TRUE;
636: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
637: PetscInt bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs;
638: PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
639: PetscInt nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB;
640: PetscScalar *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a;
641: PetscInt cstart = c->cstartbs,*bmap = c->garray;
642: PetscBool *allrows,*allcolumns;
645: PetscObjectGetComm((PetscObject)C,&comm);
646: size = c->size;
647: rank = c->rank;
649: PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows);
650: PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
652: for (i=0; i<ismax; i++) {
653: ISSorted(iscol[i],&issorted[i]);
654: if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */
655: ISSorted(isrow[i],&issorted[i]);
657: /* Check for special case: allcolumns */
658: ISIdentity(iscol[i],&colflag);
659: ISGetLocalSize(iscol[i],&ncol[i]);
661: if (colflag && ncol[i] == c->Nbs) {
662: allcolumns[i] = PETSC_TRUE;
663: icol[i] = NULL;
664: } else {
665: allcolumns[i] = PETSC_FALSE;
666: ISGetIndices(iscol[i],&icol[i]);
667: }
669: /* Check for special case: allrows */
670: ISIdentity(isrow[i],&colflag);
671: ISGetLocalSize(isrow[i],&nrow[i]);
672: if (colflag && nrow[i] == c->Mbs) {
673: allrows[i] = PETSC_TRUE;
674: irow[i] = NULL;
675: } else {
676: allrows[i] = PETSC_FALSE;
677: ISGetIndices(isrow[i],&irow[i]);
678: }
679: }
681: if (scall == MAT_REUSE_MATRIX) {
682: /* Assumes new rows are same length as the old rows */
683: for (i=0; i<ismax; i++) {
684: subc = (Mat_SeqBAIJ*)(submats[i]->data);
685: if (subc->mbs != nrow[i] || subc->nbs != ncol[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
687: /* Initial matrix as if empty */
688: PetscArrayzero(subc->ilen,subc->mbs);
690: /* Inherit the factor type of the original matrix */
691: submats[i]->factortype = C->factortype;
693: smat_i = subc->submatis1;
695: nrqs = smat_i->nrqs;
696: nrqr = smat_i->nrqr;
697: rbuf1 = smat_i->rbuf1;
698: rbuf2 = smat_i->rbuf2;
699: rbuf3 = smat_i->rbuf3;
700: req_source2 = smat_i->req_source2;
702: sbuf1 = smat_i->sbuf1;
703: sbuf2 = smat_i->sbuf2;
704: ptr = smat_i->ptr;
705: tmp = smat_i->tmp;
706: ctr = smat_i->ctr;
708: pa = smat_i->pa;
709: req_size = smat_i->req_size;
710: req_source1 = smat_i->req_source1;
712: allcolumns[i] = smat_i->allcolumns;
713: allrows[i] = smat_i->allrows;
714: row2proc[i] = smat_i->row2proc;
715: rmap[i] = smat_i->rmap;
716: cmap[i] = smat_i->cmap;
717: }
719: if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
720: if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
721: smat_i = (Mat_SubSppt*)submats[0]->data;
723: nrqs = smat_i->nrqs;
724: nrqr = smat_i->nrqr;
725: rbuf1 = smat_i->rbuf1;
726: rbuf2 = smat_i->rbuf2;
727: rbuf3 = smat_i->rbuf3;
728: req_source2 = smat_i->req_source2;
730: sbuf1 = smat_i->sbuf1;
731: sbuf2 = smat_i->sbuf2;
732: ptr = smat_i->ptr;
733: tmp = smat_i->tmp;
734: ctr = smat_i->ctr;
736: pa = smat_i->pa;
737: req_size = smat_i->req_size;
738: req_source1 = smat_i->req_source1;
740: allcolumns[0] = PETSC_FALSE;
741: }
742: } else { /* scall == MAT_INITIAL_MATRIX */
743: /* Get some new tags to keep the communication clean */
744: PetscObjectGetNewTag((PetscObject)C,&tag2);
745: PetscObjectGetNewTag((PetscObject)C,&tag3);
747: /* evaluate communication - mesg to whom, length of mesg, and buffer space
748: required. Based on this, buffers are allocated, and data copied into them */
749: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
751: for (i=0; i<ismax; i++) {
752: jmax = nrow[i];
753: irow_i = irow[i];
755: PetscMalloc1(jmax,&row2proc_i);
756: row2proc[i] = row2proc_i;
758: if (issorted[i]) proc = 0;
759: for (j=0; j<jmax; j++) {
760: if (!issorted[i]) proc = 0;
761: if (allrows[i]) row = j;
762: else row = irow_i[j];
764: while (row >= c->rangebs[proc+1]) proc++;
765: w4[proc]++;
766: row2proc_i[j] = proc; /* map row index to proc */
767: }
768: for (j=0; j<size; j++) {
769: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
770: }
771: }
773: nrqs = 0; /* no of outgoing messages */
774: msz = 0; /* total mesg length (for all procs) */
775: w1[rank] = 0; /* no mesg sent to self */
776: w3[rank] = 0;
777: for (i=0; i<size; i++) {
778: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
779: }
780: PetscMalloc1(nrqs,&pa); /*(proc -array)*/
781: for (i=0,j=0; i<size; i++) {
782: if (w1[i]) { pa[j] = i; j++; }
783: }
785: /* Each message would have a header = 1 + 2*(no of IS) + data */
786: for (i=0; i<nrqs; i++) {
787: j = pa[i];
788: w1[j] += w2[j] + 2* w3[j];
789: msz += w1[j];
790: }
791: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
793: /* Determine the number of messages to expect, their lengths, and the from-ids */
794: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
795: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
797: /* Now post the Irecvs corresponding to these messages */
798: PetscObjectGetNewTag((PetscObject)C,&tag0);
799: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
801: /* Allocate Memory for outgoing messages */
802: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
803: PetscArrayzero(sbuf1,size);
804: PetscArrayzero(ptr,size);
806: {
807: PetscInt *iptr = tmp;
808: k = 0;
809: for (i=0; i<nrqs; i++) {
810: j = pa[i];
811: iptr += k;
812: sbuf1[j] = iptr;
813: k = w1[j];
814: }
815: }
817: /* Form the outgoing messages. Initialize the header space */
818: for (i=0; i<nrqs; i++) {
819: j = pa[i];
820: sbuf1[j][0] = 0;
821: PetscArrayzero(sbuf1[j]+1,2*w3[j]);
822: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
823: }
825: /* Parse the isrow and copy data into outbuf */
826: for (i=0; i<ismax; i++) {
827: row2proc_i = row2proc[i];
828: PetscArrayzero(ctr,size);
829: irow_i = irow[i];
830: jmax = nrow[i];
831: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
832: proc = row2proc_i[j];
833: if (allrows[i]) row = j;
834: else row = irow_i[j];
836: if (proc != rank) { /* copy to the outgoing buf*/
837: ctr[proc]++;
838: *ptr[proc] = row;
839: ptr[proc]++;
840: }
841: }
842: /* Update the headers for the current IS */
843: for (j=0; j<size; j++) { /* Can Optimise this loop too */
844: if ((ctr_j = ctr[j])) {
845: sbuf1_j = sbuf1[j];
846: k = ++sbuf1_j[0];
847: sbuf1_j[2*k] = ctr_j;
848: sbuf1_j[2*k-1] = i;
849: }
850: }
851: }
853: /* Now post the sends */
854: PetscMalloc1(nrqs,&s_waits1);
855: for (i=0; i<nrqs; ++i) {
856: j = pa[i];
857: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
858: }
860: /* Post Receives to capture the buffer size */
861: PetscMalloc1(nrqs,&r_waits2);
862: PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);
863: if (nrqs) rbuf2[0] = tmp + msz;
864: for (i=1; i<nrqs; ++i) {
865: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
866: }
867: for (i=0; i<nrqs; ++i) {
868: j = pa[i];
869: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
870: }
872: /* Send to other procs the buf size they should allocate */
873: /* Receive messages*/
874: PetscMalloc1(nrqr,&s_waits2);
875: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
877: MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);
878: for (i=0; i<nrqr; ++i) {
879: req_size[i] = 0;
880: rbuf1_i = rbuf1[i];
881: start = 2*rbuf1_i[0] + 1;
882: end = olengths1[i];
883: PetscMalloc1(end,&sbuf2[i]);
884: sbuf2_i = sbuf2[i];
885: for (j=start; j<end; j++) {
886: row = rbuf1_i[j] - rstart;
887: ncols = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row];
888: sbuf2_i[j] = ncols;
889: req_size[i] += ncols;
890: }
891: req_source1[i] = onodes1[i];
892: /* form the header */
893: sbuf2_i[0] = req_size[i];
894: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
896: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
897: }
899: PetscFree(onodes1);
900: PetscFree(olengths1);
902: PetscFree(r_waits1);
903: PetscFree4(w1,w2,w3,w4);
905: /* Receive messages*/
906: PetscMalloc1(nrqs,&r_waits3);
908: MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);
909: for (i=0; i<nrqs; ++i) {
910: PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
911: req_source2[i] = pa[i];
912: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
913: }
914: PetscFree(r_waits2);
916: /* Wait on sends1 and sends2 */
917: MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
918: MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
919: PetscFree(s_waits1);
920: PetscFree(s_waits2);
922: /* Now allocate sending buffers for a->j, and send them off */
923: PetscMalloc1(nrqr,&sbuf_aj);
924: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
925: if (nrqr) {PetscMalloc1(j,&sbuf_aj[0]);}
926: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
928: PetscMalloc1(nrqr,&s_waits3);
929: {
931: for (i=0; i<nrqr; i++) {
932: rbuf1_i = rbuf1[i];
933: sbuf_aj_i = sbuf_aj[i];
934: ct1 = 2*rbuf1_i[0] + 1;
935: ct2 = 0;
936: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
937: kmax = rbuf1[i][2*j];
938: for (k=0; k<kmax; k++,ct1++) {
939: row = rbuf1_i[ct1] - rstart;
940: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
941: ncols = nzA + nzB;
942: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
944: /* load the column indices for this row into cols */
945: cols = sbuf_aj_i + ct2;
946: for (l=0; l<nzB; l++) {
947: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
948: else break;
949: }
950: imark = l;
951: for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];}
952: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
953: ct2 += ncols;
954: }
955: }
956: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
957: }
958: }
960: /* create col map: global col of C -> local col of submatrices */
961: #if defined(PETSC_USE_CTABLE)
962: for (i=0; i<ismax; i++) {
963: if (!allcolumns[i]) {
964: PetscTableCreate(ncol[i],c->Nbs,&cmap[i]);
966: jmax = ncol[i];
967: icol_i = icol[i];
968: cmap_i = cmap[i];
969: for (j=0; j<jmax; j++) {
970: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
971: }
972: } else cmap[i] = NULL;
973: }
974: #else
975: for (i=0; i<ismax; i++) {
976: if (!allcolumns[i]) {
977: PetscCalloc1(c->Nbs,&cmap[i]);
978: jmax = ncol[i];
979: icol_i = icol[i];
980: cmap_i = cmap[i];
981: for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1;
982: } else cmap[i] = NULL;
983: }
984: #endif
986: /* Create lens which is required for MatCreate... */
987: for (i=0,j=0; i<ismax; i++) j += nrow[i];
988: PetscMalloc1(ismax,&lens);
990: if (ismax) {
991: PetscCalloc1(j,&lens[0]);
992: }
993: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
995: /* Update lens from local data */
996: for (i=0; i<ismax; i++) {
997: row2proc_i = row2proc[i];
998: jmax = nrow[i];
999: if (!allcolumns[i]) cmap_i = cmap[i];
1000: irow_i = irow[i];
1001: lens_i = lens[i];
1002: for (j=0; j<jmax; j++) {
1003: if (allrows[i]) row = j;
1004: else row = irow_i[j]; /* global blocked row of C */
1006: proc = row2proc_i[j];
1007: if (proc == rank) {
1008: /* Get indices from matA and then from matB */
1009: #if defined(PETSC_USE_CTABLE)
1010: PetscInt tt;
1011: #endif
1012: row = row - rstart;
1013: nzA = a_i[row+1] - a_i[row];
1014: nzB = b_i[row+1] - b_i[row];
1015: cworkA = a_j + a_i[row];
1016: cworkB = b_j + b_i[row];
1018: if (!allcolumns[i]) {
1019: #if defined(PETSC_USE_CTABLE)
1020: for (k=0; k<nzA; k++) {
1021: PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);
1022: if (tt) lens_i[j]++;
1023: }
1024: for (k=0; k<nzB; k++) {
1025: PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);
1026: if (tt) lens_i[j]++;
1027: }
1029: #else
1030: for (k=0; k<nzA; k++) {
1031: if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1032: }
1033: for (k=0; k<nzB; k++) {
1034: if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1035: }
1036: #endif
1037: } else { /* allcolumns */
1038: lens_i[j] = nzA + nzB;
1039: }
1040: }
1041: }
1042: }
1044: /* Create row map: global row of C -> local row of submatrices */
1045: for (i=0; i<ismax; i++) {
1046: if (!allrows[i]) {
1047: #if defined(PETSC_USE_CTABLE)
1048: PetscTableCreate(nrow[i],c->Mbs,&rmap[i]);
1049: irow_i = irow[i];
1050: jmax = nrow[i];
1051: for (j=0; j<jmax; j++) {
1052: if (allrows[i]) {
1053: PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES);
1054: } else {
1055: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
1056: }
1057: }
1058: #else
1059: PetscCalloc1(c->Mbs,&rmap[i]);
1060: rmap_i = rmap[i];
1061: irow_i = irow[i];
1062: jmax = nrow[i];
1063: for (j=0; j<jmax; j++) {
1064: if (allrows[i]) rmap_i[j] = j;
1065: else rmap_i[irow_i[j]] = j;
1066: }
1067: #endif
1068: } else rmap[i] = NULL;
1069: }
1071: /* Update lens from offproc data */
1072: {
1073: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1075: MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);
1076: for (tmp2=0; tmp2<nrqs; tmp2++) {
1077: sbuf1_i = sbuf1[pa[tmp2]];
1078: jmax = sbuf1_i[0];
1079: ct1 = 2*jmax+1;
1080: ct2 = 0;
1081: rbuf2_i = rbuf2[tmp2];
1082: rbuf3_i = rbuf3[tmp2];
1083: for (j=1; j<=jmax; j++) {
1084: is_no = sbuf1_i[2*j-1];
1085: max1 = sbuf1_i[2*j];
1086: lens_i = lens[is_no];
1087: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1088: rmap_i = rmap[is_no];
1089: for (k=0; k<max1; k++,ct1++) {
1090: if (allrows[is_no]) {
1091: row = sbuf1_i[ct1];
1092: } else {
1093: #if defined(PETSC_USE_CTABLE)
1094: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1095: row--;
1096: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1097: #else
1098: row = rmap_i[sbuf1_i[ct1]]; /* the row number in the new submatrix */
1099: #endif
1100: }
1101: max2 = rbuf2_i[ct1];
1102: for (l=0; l<max2; l++,ct2++) {
1103: if (!allcolumns[is_no]) {
1104: #if defined(PETSC_USE_CTABLE)
1105: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1106: #else
1107: tcol = cmap_i[rbuf3_i[ct2]];
1108: #endif
1109: if (tcol) lens_i[row]++;
1110: } else { /* allcolumns */
1111: lens_i[row]++; /* lens_i[row] += max2 ? */
1112: }
1113: }
1114: }
1115: }
1116: }
1117: }
1118: PetscFree(r_waits3);
1119: MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);
1120: PetscFree(s_waits3);
1122: /* Create the submatrices */
1123: for (i=0; i<ismax; i++) {
1124: PetscInt bs_tmp;
1125: if (ijonly) bs_tmp = 1;
1126: else bs_tmp = bs;
1128: MatCreate(PETSC_COMM_SELF,submats+i);
1129: MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE);
1131: MatSetType(submats[i],((PetscObject)A)->type_name);
1132: MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);
1133: MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]); /* this subroutine is used by SBAIJ routines */
1135: /* create struct Mat_SubSppt and attach it to submat */
1136: PetscNew(&smat_i);
1137: subc = (Mat_SeqBAIJ*)submats[i]->data;
1138: subc->submatis1 = smat_i;
1140: smat_i->destroy = submats[i]->ops->destroy;
1141: submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ;
1142: submats[i]->factortype = C->factortype;
1144: smat_i->id = i;
1145: smat_i->nrqs = nrqs;
1146: smat_i->nrqr = nrqr;
1147: smat_i->rbuf1 = rbuf1;
1148: smat_i->rbuf2 = rbuf2;
1149: smat_i->rbuf3 = rbuf3;
1150: smat_i->sbuf2 = sbuf2;
1151: smat_i->req_source2 = req_source2;
1153: smat_i->sbuf1 = sbuf1;
1154: smat_i->ptr = ptr;
1155: smat_i->tmp = tmp;
1156: smat_i->ctr = ctr;
1158: smat_i->pa = pa;
1159: smat_i->req_size = req_size;
1160: smat_i->req_source1 = req_source1;
1162: smat_i->allcolumns = allcolumns[i];
1163: smat_i->allrows = allrows[i];
1164: smat_i->singleis = PETSC_FALSE;
1165: smat_i->row2proc = row2proc[i];
1166: smat_i->rmap = rmap[i];
1167: smat_i->cmap = cmap[i];
1168: }
1170: if (!ismax) { /* Create a dummy submats[0] to carry the reuse struct */
1171: MatCreate(PETSC_COMM_SELF,&submats[0]);
1172: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
1173: MatSetType(submats[0],MATDUMMY);
1175: /* create struct Mat_SubSppt and attach it to submat */
1176: PetscNewLog(submats[0],&smat_i);
1177: submats[0]->data = (void*)smat_i;
1179: smat_i->destroy = submats[0]->ops->destroy;
1180: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
1181: submats[0]->factortype = C->factortype;
1183: smat_i->id = 0;
1184: smat_i->nrqs = nrqs;
1185: smat_i->nrqr = nrqr;
1186: smat_i->rbuf1 = rbuf1;
1187: smat_i->rbuf2 = rbuf2;
1188: smat_i->rbuf3 = rbuf3;
1189: smat_i->sbuf2 = sbuf2;
1190: smat_i->req_source2 = req_source2;
1192: smat_i->sbuf1 = sbuf1;
1193: smat_i->ptr = ptr;
1194: smat_i->tmp = tmp;
1195: smat_i->ctr = ctr;
1197: smat_i->pa = pa;
1198: smat_i->req_size = req_size;
1199: smat_i->req_source1 = req_source1;
1201: smat_i->allcolumns = PETSC_FALSE;
1202: smat_i->singleis = PETSC_FALSE;
1203: smat_i->row2proc = NULL;
1204: smat_i->rmap = NULL;
1205: smat_i->cmap = NULL;
1206: }
1208: if (ismax) {PetscFree(lens[0]);}
1209: PetscFree(lens);
1210: if (sbuf_aj) {
1211: PetscFree(sbuf_aj[0]);
1212: PetscFree(sbuf_aj);
1213: }
1215: } /* end of scall == MAT_INITIAL_MATRIX */
1217: /* Post recv matrix values */
1218: if (!ijonly) {
1219: PetscObjectGetNewTag((PetscObject)C,&tag4);
1220: PetscMalloc1(nrqs,&rbuf4);
1221: PetscMalloc1(nrqs,&r_waits4);
1222: for (i=0; i<nrqs; ++i) {
1223: PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i]);
1224: MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1225: }
1227: /* Allocate sending buffers for a->a, and send them off */
1228: PetscMalloc1(nrqr,&sbuf_aa);
1229: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1231: if (nrqr) {PetscMalloc1(j*bs2,&sbuf_aa[0]);}
1232: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1234: PetscMalloc1(nrqr,&s_waits4);
1236: for (i=0; i<nrqr; i++) {
1237: rbuf1_i = rbuf1[i];
1238: sbuf_aa_i = sbuf_aa[i];
1239: ct1 = 2*rbuf1_i[0]+1;
1240: ct2 = 0;
1241: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1242: kmax = rbuf1_i[2*j];
1243: for (k=0; k<kmax; k++,ct1++) {
1244: row = rbuf1_i[ct1] - rstart;
1245: nzA = a_i[row+1] - a_i[row];
1246: nzB = b_i[row+1] - b_i[row];
1247: ncols = nzA + nzB;
1248: cworkB = b_j + b_i[row];
1249: vworkA = a_a + a_i[row]*bs2;
1250: vworkB = b_a + b_i[row]*bs2;
1252: /* load the column values for this row into vals*/
1253: vals = sbuf_aa_i+ct2*bs2;
1254: for (l=0; l<nzB; l++) {
1255: if ((bmap[cworkB[l]]) < cstart) {
1256: PetscArraycpy(vals+l*bs2,vworkB+l*bs2,bs2);
1257: } else break;
1258: }
1259: imark = l;
1260: for (l=0; l<nzA; l++) {
1261: PetscArraycpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2);
1262: }
1263: for (l=imark; l<nzB; l++) {
1264: PetscArraycpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2);
1265: }
1267: ct2 += ncols;
1268: }
1269: }
1270: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1271: }
1272: }
1274: /* Assemble the matrices */
1275: /* First assemble the local rows */
1276: for (i=0; i<ismax; i++) {
1277: row2proc_i = row2proc[i];
1278: subc = (Mat_SeqBAIJ*)submats[i]->data;
1279: imat_ilen = subc->ilen;
1280: imat_j = subc->j;
1281: imat_i = subc->i;
1282: imat_a = subc->a;
1284: if (!allcolumns[i]) cmap_i = cmap[i];
1285: rmap_i = rmap[i];
1286: irow_i = irow[i];
1287: jmax = nrow[i];
1288: for (j=0; j<jmax; j++) {
1289: if (allrows[i]) row = j;
1290: else row = irow_i[j];
1291: proc = row2proc_i[j];
1293: if (proc == rank) {
1295: row = row - rstart;
1296: nzA = a_i[row+1] - a_i[row];
1297: nzB = b_i[row+1] - b_i[row];
1298: cworkA = a_j + a_i[row];
1299: cworkB = b_j + b_i[row];
1300: if (!ijonly) {
1301: vworkA = a_a + a_i[row]*bs2;
1302: vworkB = b_a + b_i[row]*bs2;
1303: }
1305: if (allrows[i]) {
1306: row = row+rstart;
1307: } else {
1308: #if defined(PETSC_USE_CTABLE)
1309: PetscTableFind(rmap_i,row+rstart+1,&row);
1310: row--;
1312: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1313: #else
1314: row = rmap_i[row + rstart];
1315: #endif
1316: }
1317: mat_i = imat_i[row];
1318: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1319: mat_j = imat_j + mat_i;
1320: ilen = imat_ilen[row];
1322: /* load the column indices for this row into cols*/
1323: if (!allcolumns[i]) {
1324: for (l=0; l<nzB; l++) {
1325: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1326: #if defined(PETSC_USE_CTABLE)
1327: PetscTableFind(cmap_i,ctmp+1,&tcol);
1328: if (tcol) {
1329: #else
1330: if ((tcol = cmap_i[ctmp])) {
1331: #endif
1332: *mat_j++ = tcol - 1;
1333: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1334: mat_a += bs2;
1335: ilen++;
1336: }
1337: } else break;
1338: }
1339: imark = l;
1340: for (l=0; l<nzA; l++) {
1341: #if defined(PETSC_USE_CTABLE)
1342: PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);
1343: if (tcol) {
1344: #else
1345: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1346: #endif
1347: *mat_j++ = tcol - 1;
1348: if (!ijonly) {
1349: PetscArraycpy(mat_a,vworkA+l*bs2,bs2);
1350: mat_a += bs2;
1351: }
1352: ilen++;
1353: }
1354: }
1355: for (l=imark; l<nzB; l++) {
1356: #if defined(PETSC_USE_CTABLE)
1357: PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);
1358: if (tcol) {
1359: #else
1360: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1361: #endif
1362: *mat_j++ = tcol - 1;
1363: if (!ijonly) {
1364: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1365: mat_a += bs2;
1366: }
1367: ilen++;
1368: }
1369: }
1370: } else { /* allcolumns */
1371: for (l=0; l<nzB; l++) {
1372: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1373: *mat_j++ = ctmp;
1374: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1375: mat_a += bs2;
1376: ilen++;
1377: } else break;
1378: }
1379: imark = l;
1380: for (l=0; l<nzA; l++) {
1381: *mat_j++ = cstart+cworkA[l];
1382: if (!ijonly) {
1383: PetscArraycpy(mat_a,vworkA+l*bs2,bs2);
1384: mat_a += bs2;
1385: }
1386: ilen++;
1387: }
1388: for (l=imark; l<nzB; l++) {
1389: *mat_j++ = bmap[cworkB[l]];
1390: if (!ijonly) {
1391: PetscArraycpy(mat_a,vworkB+l*bs2,bs2);
1392: mat_a += bs2;
1393: }
1394: ilen++;
1395: }
1396: }
1397: imat_ilen[row] = ilen;
1398: }
1399: }
1400: }
1402: /* Now assemble the off proc rows */
1403: if (!ijonly) {
1404: MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);
1405: }
1406: for (tmp2=0; tmp2<nrqs; tmp2++) {
1407: sbuf1_i = sbuf1[pa[tmp2]];
1408: jmax = sbuf1_i[0];
1409: ct1 = 2*jmax + 1;
1410: ct2 = 0;
1411: rbuf2_i = rbuf2[tmp2];
1412: rbuf3_i = rbuf3[tmp2];
1413: if (!ijonly) rbuf4_i = rbuf4[tmp2];
1414: for (j=1; j<=jmax; j++) {
1415: is_no = sbuf1_i[2*j-1];
1416: rmap_i = rmap[is_no];
1417: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1418: subc = (Mat_SeqBAIJ*)submats[is_no]->data;
1419: imat_ilen = subc->ilen;
1420: imat_j = subc->j;
1421: imat_i = subc->i;
1422: if (!ijonly) imat_a = subc->a;
1423: max1 = sbuf1_i[2*j];
1424: for (k=0; k<max1; k++,ct1++) { /* for each received block row */
1425: row = sbuf1_i[ct1];
1427: if (allrows[is_no]) {
1428: row = sbuf1_i[ct1];
1429: } else {
1430: #if defined(PETSC_USE_CTABLE)
1431: PetscTableFind(rmap_i,row+1,&row);
1432: row--;
1433: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1434: #else
1435: row = rmap_i[row];
1436: #endif
1437: }
1438: ilen = imat_ilen[row];
1439: mat_i = imat_i[row];
1440: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1441: mat_j = imat_j + mat_i;
1442: max2 = rbuf2_i[ct1];
1443: if (!allcolumns[is_no]) {
1444: for (l=0; l<max2; l++,ct2++) {
1445: #if defined(PETSC_USE_CTABLE)
1446: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1447: #else
1448: tcol = cmap_i[rbuf3_i[ct2]];
1449: #endif
1450: if (tcol) {
1451: *mat_j++ = tcol - 1;
1452: if (!ijonly) {
1453: PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);
1454: mat_a += bs2;
1455: }
1456: ilen++;
1457: }
1458: }
1459: } else { /* allcolumns */
1460: for (l=0; l<max2; l++,ct2++) {
1461: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1462: if (!ijonly) {
1463: PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);
1464: mat_a += bs2;
1465: }
1466: ilen++;
1467: }
1468: }
1469: imat_ilen[row] = ilen;
1470: }
1471: }
1472: }
1474: if (!iscsorted) { /* sort column indices of the rows */
1475: MatScalar *work;
1477: PetscMalloc1(bs2,&work);
1478: for (i=0; i<ismax; i++) {
1479: subc = (Mat_SeqBAIJ*)submats[i]->data;
1480: imat_ilen = subc->ilen;
1481: imat_j = subc->j;
1482: imat_i = subc->i;
1483: if (!ijonly) imat_a = subc->a;
1484: if (allcolumns[i]) continue;
1486: jmax = nrow[i];
1487: for (j=0; j<jmax; j++) {
1488: mat_i = imat_i[j];
1489: mat_j = imat_j + mat_i;
1490: ilen = imat_ilen[j];
1491: if (ijonly) {
1492: PetscSortInt(ilen,mat_j);
1493: } else {
1494: mat_a = imat_a + mat_i*bs2;
1495: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
1496: }
1497: }
1498: }
1499: PetscFree(work);
1500: }
1502: if (!ijonly) {
1503: PetscFree(r_waits4);
1504: MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);
1505: PetscFree(s_waits4);
1506: }
1508: /* Restore the indices */
1509: for (i=0; i<ismax; i++) {
1510: if (!allrows[i]) {
1511: ISRestoreIndices(isrow[i],irow+i);
1512: }
1513: if (!allcolumns[i]) {
1514: ISRestoreIndices(iscol[i],icol+i);
1515: }
1516: }
1518: for (i=0; i<ismax; i++) {
1519: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1520: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1521: }
1523: PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);
1524: PetscFree5(row2proc,cmap,rmap,allcolumns,allrows);
1526: if (!ijonly) {
1527: if (sbuf_aa) {
1528: PetscFree(sbuf_aa[0]);
1529: PetscFree(sbuf_aa);
1530: }
1532: for (i=0; i<nrqs; ++i) {
1533: PetscFree(rbuf4[i]);
1534: }
1535: PetscFree(rbuf4);
1536: }
1537: c->ijonly = PETSC_FALSE; /* set back to the default */
1538: return(0);
1539: }