1 /*
2 Defines matrix-matrix product routines for pairs of SeqAIJ matrices
3 C = A * B
4 */
5
6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7 #include <../src/mat/utils/freespace.h>
8 #include <petscbt.h>
9 #include <petsc/private/isimpl.h>
10 #include <../src/mat/impls/dense/seq/dense.h>
11
MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)12 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
13 {
14 PetscFunctionBegin;
15 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C));
16 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C));
17 PetscFunctionReturn(PETSC_SUCCESS);
18 }
19
20 /* Modified from MatCreateSeqAIJWithArrays() */
MatSetSeqAIJWithArrays_private(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt i[],PetscInt j[],PetscScalar a[],MatType mtype,Mat mat)21 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat)
22 {
23 PetscInt ii;
24 Mat_SeqAIJ *aij;
25 PetscBool isseqaij, ofree_a, ofree_ij;
26
27 PetscFunctionBegin;
28 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
29 PetscCall(MatSetSizes(mat, m, n, m, n));
30
31 if (!mtype) {
32 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij));
33 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ));
34 } else {
35 PetscCall(MatSetType(mat, mtype));
36 }
37
38 aij = (Mat_SeqAIJ *)mat->data;
39 ofree_a = aij->free_a;
40 ofree_ij = aij->free_ij;
41 /* changes the free flags */
42 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL));
43
44 PetscCall(PetscFree(aij->ilen));
45 PetscCall(PetscFree(aij->imax));
46 PetscCall(PetscMalloc1(m, &aij->imax));
47 PetscCall(PetscMalloc1(m, &aij->ilen));
48 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) {
49 const PetscInt rnz = i[ii + 1] - i[ii];
50 aij->nonzerorowcnt += !!rnz;
51 aij->rmax = PetscMax(aij->rmax, rnz);
52 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii];
53 }
54 aij->maxnz = i[m];
55 aij->nz = i[m];
56
57 if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a));
58 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j));
59 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i));
60
61 aij->i = i;
62 aij->j = j;
63 aij->a = a;
64 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */
65 aij->free_a = PETSC_FALSE;
66 aij->free_ij = PETSC_FALSE;
67 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6));
68 // Always build the diag info when i, j are set
69 PetscFunctionReturn(PETSC_SUCCESS);
70 }
71
MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)72 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
73 {
74 Mat_Product *product = C->product;
75 MatProductAlgorithm alg;
76 PetscBool flg;
77
78 PetscFunctionBegin;
79 if (product) {
80 alg = product->alg;
81 } else {
82 alg = "sorted";
83 }
84 /* sorted */
85 PetscCall(PetscStrcmp(alg, "sorted", &flg));
86 if (flg) {
87 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C));
88 PetscFunctionReturn(PETSC_SUCCESS);
89 }
90
91 /* scalable */
92 PetscCall(PetscStrcmp(alg, "scalable", &flg));
93 if (flg) {
94 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C));
95 PetscFunctionReturn(PETSC_SUCCESS);
96 }
97
98 /* scalable_fast */
99 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg));
100 if (flg) {
101 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C));
102 PetscFunctionReturn(PETSC_SUCCESS);
103 }
104
105 /* heap */
106 PetscCall(PetscStrcmp(alg, "heap", &flg));
107 if (flg) {
108 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C));
109 PetscFunctionReturn(PETSC_SUCCESS);
110 }
111
112 /* btheap */
113 PetscCall(PetscStrcmp(alg, "btheap", &flg));
114 if (flg) {
115 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C));
116 PetscFunctionReturn(PETSC_SUCCESS);
117 }
118
119 /* llcondensed */
120 PetscCall(PetscStrcmp(alg, "llcondensed", &flg));
121 if (flg) {
122 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C));
123 PetscFunctionReturn(PETSC_SUCCESS);
124 }
125
126 /* rowmerge */
127 PetscCall(PetscStrcmp(alg, "rowmerge", &flg));
128 if (flg) {
129 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C));
130 PetscFunctionReturn(PETSC_SUCCESS);
131 }
132
133 #if defined(PETSC_HAVE_HYPRE)
134 PetscCall(PetscStrcmp(alg, "hypre", &flg));
135 if (flg) {
136 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C));
137 PetscFunctionReturn(PETSC_SUCCESS);
138 }
139 #endif
140
141 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
142 }
143
MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A,Mat B,PetscReal fill,Mat C)144 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C)
145 {
146 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
147 PetscInt *ai = a->i, *bi = b->i, *ci, *cj;
148 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
149 PetscReal afill;
150 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
151 PetscHMapI ta;
152 PetscBT lnkbt;
153 PetscFreeSpaceList free_space = NULL, current_space = NULL;
154
155 PetscFunctionBegin;
156 /* Get ci and cj */
157 /* Allocate ci array, arrays for fill computation and */
158 /* free space for accumulating nonzero column info */
159 PetscCall(PetscMalloc1(am + 2, &ci));
160 ci[0] = 0;
161
162 /* create and initialize a linked list */
163 PetscCall(PetscHMapICreateWithSize(bn, &ta));
164 MatRowMergeMax_SeqAIJ(b, bm, ta);
165 PetscCall(PetscHMapIGetSize(ta, &Crmax));
166 PetscCall(PetscHMapIDestroy(&ta));
167
168 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt));
169
170 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
171 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
172
173 current_space = free_space;
174
175 /* Determine ci and cj */
176 for (i = 0; i < am; i++) {
177 anzi = ai[i + 1] - ai[i];
178 aj = a->j + ai[i];
179 for (j = 0; j < anzi; j++) {
180 brow = aj[j];
181 bnzj = bi[brow + 1] - bi[brow];
182 bj = b->j + bi[brow];
183 /* add non-zero cols of B into the sorted linked list lnk */
184 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt));
185 }
186 /* add possible missing diagonal entry */
187 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt));
188 cnzi = lnk[0];
189
190 /* If free space is not available, make more free space */
191 /* Double the amount of total space in the list */
192 if (current_space->local_remaining < cnzi) {
193 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space));
194 ndouble++;
195 }
196
197 /* Copy data into free space, then initialize lnk */
198 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt));
199
200 current_space->array += cnzi;
201 current_space->local_used += cnzi;
202 current_space->local_remaining -= cnzi;
203
204 ci[i + 1] = ci[i] + cnzi;
205 }
206
207 /* Column indices are in the list of free space */
208 /* Allocate space for cj, initialize cj, and */
209 /* destroy list of free space and other temporary array(s) */
210 PetscCall(PetscMalloc1(ci[am] + 1, &cj));
211 PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
212 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt));
213
214 /* put together the new symbolic matrix */
215 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
216 PetscCall(MatSetBlockSizesFromMats(C, A, B));
217
218 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
219 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
220 c = (Mat_SeqAIJ *)C->data;
221 c->free_a = PETSC_FALSE;
222 c->free_ij = PETSC_TRUE;
223 c->nonew = 0;
224
225 /* fast, needs non-scalable O(bn) array 'abdense' */
226 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
227
228 /* set MatInfo */
229 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
230 if (afill < 1.0) afill = 1.0;
231 C->info.mallocs = ndouble;
232 C->info.fill_ratio_given = fill;
233 C->info.fill_ratio_needed = afill;
234
235 #if defined(PETSC_USE_INFO)
236 if (ci[am]) {
237 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
238 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
239 } else {
240 PetscCall(PetscInfo(C, "Empty matrix product\n"));
241 }
242 #endif
243 PetscFunctionReturn(PETSC_SUCCESS);
244 }
245
MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,Mat C)246 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C)
247 {
248 PetscLogDouble flops = 0.0;
249 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
250 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
251 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
252 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
253 PetscInt am = A->rmap->n, cm = C->rmap->n;
254 PetscInt i, j, k, anzi, bnzi, cnzi, brow;
255 PetscScalar *ca, valtmp;
256 PetscScalar *ab_dense;
257 PetscContainer cab_dense;
258 const PetscScalar *aa, *ba, *baj;
259
260 PetscFunctionBegin;
261 PetscCall(MatSeqAIJGetArrayRead(A, &aa));
262 PetscCall(MatSeqAIJGetArrayRead(B, &ba));
263 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
264 PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
265 c->a = ca;
266 c->free_a = PETSC_TRUE;
267 } else ca = c->a;
268
269 /* TODO this should be done in the symbolic phase */
270 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers
271 that is hard to eradicate) */
272 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense));
273 if (!cab_dense) {
274 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense));
275 PetscCall(PetscObjectContainerCompose((PetscObject)C, "__PETSc__ab_dense", ab_dense, PetscCtxDestroyDefault));
276 } else PetscCall(PetscContainerGetPointer(cab_dense, &ab_dense));
277 PetscCall(PetscArrayzero(ab_dense, B->cmap->N));
278
279 /* clean old values in C */
280 PetscCall(PetscArrayzero(ca, ci[cm]));
281 /* Traverse A row-wise. */
282 /* Build the ith row in C by summing over nonzero columns in A, */
283 /* the rows of B corresponding to nonzeros of A. */
284 for (i = 0; i < am; i++) {
285 anzi = ai[i + 1] - ai[i];
286 for (j = 0; j < anzi; j++) {
287 brow = aj[j];
288 bnzi = bi[brow + 1] - bi[brow];
289 bjj = PetscSafePointerPlusOffset(bj, bi[brow]);
290 baj = PetscSafePointerPlusOffset(ba, bi[brow]);
291 /* perform dense axpy */
292 valtmp = aa[j];
293 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k];
294 flops += 2 * bnzi;
295 }
296 aj = PetscSafePointerPlusOffset(aj, anzi);
297 aa = PetscSafePointerPlusOffset(aa, anzi);
298
299 cnzi = ci[i + 1] - ci[i];
300 for (k = 0; k < cnzi; k++) {
301 ca[k] += ab_dense[cj[k]];
302 ab_dense[cj[k]] = 0.0; /* zero ab_dense */
303 }
304 flops += cnzi;
305 cj = PetscSafePointerPlusOffset(cj, cnzi);
306 ca += cnzi;
307 }
308 #if defined(PETSC_HAVE_DEVICE)
309 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
310 #endif
311 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
312 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
313 PetscCall(PetscLogFlops(flops));
314 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
315 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
316 PetscFunctionReturn(PETSC_SUCCESS);
317 }
318
MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,Mat C)319 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C)
320 {
321 PetscLogDouble flops = 0.0;
322 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
323 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
324 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
325 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j;
326 PetscInt am = A->rmap->N, cm = C->rmap->N;
327 PetscInt i, j, k, anzi, bnzi, cnzi, brow;
328 PetscScalar *ca = c->a, valtmp;
329 const PetscScalar *aa, *ba, *baj;
330 PetscInt nextb;
331
332 PetscFunctionBegin;
333 PetscCall(MatSeqAIJGetArrayRead(A, &aa));
334 PetscCall(MatSeqAIJGetArrayRead(B, &ba));
335 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */
336 PetscCall(PetscMalloc1(ci[cm] + 1, &ca));
337 c->a = ca;
338 c->free_a = PETSC_TRUE;
339 }
340
341 /* clean old values in C */
342 PetscCall(PetscArrayzero(ca, ci[cm]));
343 /* Traverse A row-wise. */
344 /* Build the ith row in C by summing over nonzero columns in A, */
345 /* the rows of B corresponding to nonzeros of A. */
346 for (i = 0; i < am; i++) {
347 anzi = ai[i + 1] - ai[i];
348 cnzi = ci[i + 1] - ci[i];
349 for (j = 0; j < anzi; j++) {
350 brow = aj[j];
351 bnzi = bi[brow + 1] - bi[brow];
352 bjj = bj + bi[brow];
353 baj = ba + bi[brow];
354 /* perform sparse axpy */
355 valtmp = aa[j];
356 nextb = 0;
357 for (k = 0; nextb < bnzi; k++) {
358 if (cj[k] == bjj[nextb]) { /* ccol == bcol */
359 ca[k] += valtmp * baj[nextb++];
360 }
361 }
362 flops += 2 * bnzi;
363 }
364 aj += anzi;
365 aa += anzi;
366 cj += cnzi;
367 ca += cnzi;
368 }
369 #if defined(PETSC_HAVE_DEVICE)
370 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU;
371 #endif
372 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
373 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
374 PetscCall(PetscLogFlops(flops));
375 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa));
376 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
377 PetscFunctionReturn(PETSC_SUCCESS);
378 }
379
MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A,Mat B,PetscReal fill,Mat C)380 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C)
381 {
382 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
383 PetscInt *ai = a->i, *bi = b->i, *ci, *cj;
384 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
385 MatScalar *ca;
386 PetscReal afill;
387 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
388 PetscHMapI ta;
389 PetscFreeSpaceList free_space = NULL, current_space = NULL;
390
391 PetscFunctionBegin;
392 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */
393 /* Allocate arrays for fill computation and free space for accumulating nonzero column */
394 PetscCall(PetscMalloc1(am + 2, &ci));
395 ci[0] = 0;
396
397 /* create and initialize a linked list */
398 PetscCall(PetscHMapICreateWithSize(bn, &ta));
399 MatRowMergeMax_SeqAIJ(b, bm, ta);
400 PetscCall(PetscHMapIGetSize(ta, &Crmax));
401 PetscCall(PetscHMapIDestroy(&ta));
402
403 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk));
404
405 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
406 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
407 current_space = free_space;
408
409 /* Determine ci and cj */
410 for (i = 0; i < am; i++) {
411 anzi = ai[i + 1] - ai[i];
412 aj = a->j + ai[i];
413 for (j = 0; j < anzi; j++) {
414 brow = aj[j];
415 bnzj = bi[brow + 1] - bi[brow];
416 bj = b->j + bi[brow];
417 /* add non-zero cols of B into the sorted linked list lnk */
418 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk));
419 }
420 /* add possible missing diagonal entry */
421 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk));
422 cnzi = lnk[1];
423
424 /* If free space is not available, make more free space */
425 /* Double the amount of total space in the list */
426 if (current_space->local_remaining < cnzi) {
427 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space));
428 ndouble++;
429 }
430
431 /* Copy data into free space, then initialize lnk */
432 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk));
433
434 current_space->array += cnzi;
435 current_space->local_used += cnzi;
436 current_space->local_remaining -= cnzi;
437
438 ci[i + 1] = ci[i] + cnzi;
439 }
440
441 /* Column indices are in the list of free space */
442 /* Allocate space for cj, initialize cj, and */
443 /* destroy list of free space and other temporary array(s) */
444 PetscCall(PetscMalloc1(ci[am] + 1, &cj));
445 PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
446 PetscCall(PetscLLCondensedDestroy_fast(lnk));
447
448 /* Allocate space for ca */
449 PetscCall(PetscCalloc1(ci[am] + 1, &ca));
450
451 /* put together the new symbolic matrix */
452 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
453 PetscCall(MatSetBlockSizesFromMats(C, A, B));
454
455 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
456 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
457 c = (Mat_SeqAIJ *)C->data;
458 c->free_a = PETSC_TRUE;
459 c->free_ij = PETSC_TRUE;
460 c->nonew = 0;
461
462 /* slower, less memory */
463 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
464
465 /* set MatInfo */
466 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
467 if (afill < 1.0) afill = 1.0;
468 C->info.mallocs = ndouble;
469 C->info.fill_ratio_given = fill;
470 C->info.fill_ratio_needed = afill;
471
472 #if defined(PETSC_USE_INFO)
473 if (ci[am]) {
474 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
475 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
476 } else {
477 PetscCall(PetscInfo(C, "Empty matrix product\n"));
478 }
479 #endif
480 PetscFunctionReturn(PETSC_SUCCESS);
481 }
482
MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A,Mat B,PetscReal fill,Mat C)483 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C)
484 {
485 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
486 PetscInt *ai = a->i, *bi = b->i, *ci, *cj;
487 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
488 MatScalar *ca;
489 PetscReal afill;
490 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax;
491 PetscHMapI ta;
492 PetscFreeSpaceList free_space = NULL, current_space = NULL;
493
494 PetscFunctionBegin;
495 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */
496 /* Allocate arrays for fill computation and free space for accumulating nonzero column */
497 PetscCall(PetscMalloc1(am + 2, &ci));
498 ci[0] = 0;
499
500 /* create and initialize a linked list */
501 PetscCall(PetscHMapICreateWithSize(bn, &ta));
502 MatRowMergeMax_SeqAIJ(b, bm, ta);
503 PetscCall(PetscHMapIGetSize(ta, &Crmax));
504 PetscCall(PetscHMapIDestroy(&ta));
505 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk));
506
507 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
508 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
509 current_space = free_space;
510
511 /* Determine ci and cj */
512 for (i = 0; i < am; i++) {
513 anzi = ai[i + 1] - ai[i];
514 aj = a->j + ai[i];
515 for (j = 0; j < anzi; j++) {
516 brow = aj[j];
517 bnzj = bi[brow + 1] - bi[brow];
518 bj = b->j + bi[brow];
519 /* add non-zero cols of B into the sorted linked list lnk */
520 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk));
521 }
522 /* add possible missing diagonal entry */
523 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk));
524
525 cnzi = lnk[0];
526
527 /* If free space is not available, make more free space */
528 /* Double the amount of total space in the list */
529 if (current_space->local_remaining < cnzi) {
530 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space));
531 ndouble++;
532 }
533
534 /* Copy data into free space, then initialize lnk */
535 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk));
536
537 current_space->array += cnzi;
538 current_space->local_used += cnzi;
539 current_space->local_remaining -= cnzi;
540
541 ci[i + 1] = ci[i] + cnzi;
542 }
543
544 /* Column indices are in the list of free space */
545 /* Allocate space for cj, initialize cj, and */
546 /* destroy list of free space and other temporary array(s) */
547 PetscCall(PetscMalloc1(ci[am] + 1, &cj));
548 PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
549 PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
550
551 /* Allocate space for ca */
552 PetscCall(PetscCalloc1(ci[am] + 1, &ca));
553
554 /* put together the new symbolic matrix */
555 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C));
556 PetscCall(MatSetBlockSizesFromMats(C, A, B));
557
558 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
559 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
560 c = (Mat_SeqAIJ *)C->data;
561 c->free_a = PETSC_TRUE;
562 c->free_ij = PETSC_TRUE;
563 c->nonew = 0;
564
565 /* slower, less memory */
566 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable;
567
568 /* set MatInfo */
569 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
570 if (afill < 1.0) afill = 1.0;
571 C->info.mallocs = ndouble;
572 C->info.fill_ratio_given = fill;
573 C->info.fill_ratio_needed = afill;
574
575 #if defined(PETSC_USE_INFO)
576 if (ci[am]) {
577 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
578 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
579 } else {
580 PetscCall(PetscInfo(C, "Empty matrix product\n"));
581 }
582 #endif
583 PetscFunctionReturn(PETSC_SUCCESS);
584 }
585
MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A,Mat B,PetscReal fill,Mat C)586 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C)
587 {
588 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
589 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
590 PetscInt *ci, *cj, *bb;
591 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
592 PetscReal afill;
593 PetscInt i, j, col, ndouble = 0;
594 PetscFreeSpaceList free_space = NULL, current_space = NULL;
595 PetscHeap h;
596
597 PetscFunctionBegin;
598 /* Get ci and cj - by merging sorted rows using a heap */
599 /* Allocate arrays for fill computation and free space for accumulating nonzero column */
600 PetscCall(PetscMalloc1(am + 2, &ci));
601 ci[0] = 0;
602
603 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
604 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
605 current_space = free_space;
606
607 PetscCall(PetscHeapCreate(a->rmax, &h));
608 PetscCall(PetscMalloc1(a->rmax, &bb));
609
610 /* Determine ci and cj */
611 for (i = 0; i < am; i++) {
612 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
613 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
614 ci[i + 1] = ci[i];
615 /* Populate the min heap */
616 for (j = 0; j < anzi; j++) {
617 bb[j] = bi[acol[j]]; /* bb points at the start of the row */
618 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */
619 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++]));
620 }
621 }
622 /* Pick off the min element, adding it to free space */
623 PetscCall(PetscHeapPop(h, &j, &col));
624 while (j >= 0) {
625 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
626 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space));
627 ndouble++;
628 }
629 *(current_space->array++) = col;
630 current_space->local_used++;
631 current_space->local_remaining--;
632 ci[i + 1]++;
633
634 /* stash if anything else remains in this row of B */
635 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++]));
636 while (1) { /* pop and stash any other rows of B that also had an entry in this column */
637 PetscInt j2, col2;
638 PetscCall(PetscHeapPeek(h, &j2, &col2));
639 if (col2 != col) break;
640 PetscCall(PetscHeapPop(h, &j2, &col2));
641 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++]));
642 }
643 /* Put any stashed elements back into the min heap */
644 PetscCall(PetscHeapUnstash(h));
645 PetscCall(PetscHeapPop(h, &j, &col));
646 }
647 }
648 PetscCall(PetscFree(bb));
649 PetscCall(PetscHeapDestroy(&h));
650
651 /* Column indices are in the list of free space */
652 /* Allocate space for cj, initialize cj, and */
653 /* destroy list of free space and other temporary array(s) */
654 PetscCall(PetscMalloc1(ci[am], &cj));
655 PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
656
657 /* put together the new symbolic matrix */
658 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
659 PetscCall(MatSetBlockSizesFromMats(C, A, B));
660
661 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
662 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
663 c = (Mat_SeqAIJ *)C->data;
664 c->free_a = PETSC_TRUE;
665 c->free_ij = PETSC_TRUE;
666 c->nonew = 0;
667
668 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
669
670 /* set MatInfo */
671 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
672 if (afill < 1.0) afill = 1.0;
673 C->info.mallocs = ndouble;
674 C->info.fill_ratio_given = fill;
675 C->info.fill_ratio_needed = afill;
676
677 #if defined(PETSC_USE_INFO)
678 if (ci[am]) {
679 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
680 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
681 } else {
682 PetscCall(PetscInfo(C, "Empty matrix product\n"));
683 }
684 #endif
685 PetscFunctionReturn(PETSC_SUCCESS);
686 }
687
MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A,Mat B,PetscReal fill,Mat C)688 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C)
689 {
690 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
691 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
692 PetscInt *ci, *cj, *bb;
693 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
694 PetscReal afill;
695 PetscInt i, j, col, ndouble = 0;
696 PetscFreeSpaceList free_space = NULL, current_space = NULL;
697 PetscHeap h;
698 PetscBT bt;
699
700 PetscFunctionBegin;
701 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */
702 /* Allocate arrays for fill computation and free space for accumulating nonzero column */
703 PetscCall(PetscMalloc1(am + 2, &ci));
704 ci[0] = 0;
705
706 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
707 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space));
708
709 current_space = free_space;
710
711 PetscCall(PetscHeapCreate(a->rmax, &h));
712 PetscCall(PetscMalloc1(a->rmax, &bb));
713 PetscCall(PetscBTCreate(bn, &bt));
714
715 /* Determine ci and cj */
716 for (i = 0; i < am; i++) {
717 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
718 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
719 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */
720 ci[i + 1] = ci[i];
721 /* Populate the min heap */
722 for (j = 0; j < anzi; j++) {
723 PetscInt brow = acol[j];
724 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) {
725 PetscInt bcol = bj[bb[j]];
726 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
727 PetscCall(PetscHeapAdd(h, j, bcol));
728 bb[j]++;
729 break;
730 }
731 }
732 }
733 /* Pick off the min element, adding it to free space */
734 PetscCall(PetscHeapPop(h, &j, &col));
735 while (j >= 0) {
736 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */
737 fptr = NULL; /* need PetscBTMemzero */
738 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space));
739 ndouble++;
740 }
741 *(current_space->array++) = col;
742 current_space->local_used++;
743 current_space->local_remaining--;
744 ci[i + 1]++;
745
746 /* stash if anything else remains in this row of B */
747 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) {
748 PetscInt bcol = bj[bb[j]];
749 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */
750 PetscCall(PetscHeapAdd(h, j, bcol));
751 bb[j]++;
752 break;
753 }
754 }
755 PetscCall(PetscHeapPop(h, &j, &col));
756 }
757 if (fptr) { /* Clear the bits for this row */
758 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr));
759 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */
760 PetscCall(PetscBTMemzero(bn, bt));
761 }
762 }
763 PetscCall(PetscFree(bb));
764 PetscCall(PetscHeapDestroy(&h));
765 PetscCall(PetscBTDestroy(&bt));
766
767 /* Column indices are in the list of free space */
768 /* Allocate space for cj, initialize cj, and */
769 /* destroy list of free space and other temporary array(s) */
770 PetscCall(PetscMalloc1(ci[am], &cj));
771 PetscCall(PetscFreeSpaceContiguous(&free_space, cj));
772
773 /* put together the new symbolic matrix */
774 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
775 PetscCall(MatSetBlockSizesFromMats(C, A, B));
776
777 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
778 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
779 c = (Mat_SeqAIJ *)C->data;
780 c->free_a = PETSC_TRUE;
781 c->free_ij = PETSC_TRUE;
782 c->nonew = 0;
783
784 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
785
786 /* set MatInfo */
787 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
788 if (afill < 1.0) afill = 1.0;
789 C->info.mallocs = ndouble;
790 C->info.fill_ratio_given = fill;
791 C->info.fill_ratio_needed = afill;
792
793 #if defined(PETSC_USE_INFO)
794 if (ci[am]) {
795 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
796 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
797 } else {
798 PetscCall(PetscInfo(C, "Empty matrix product\n"));
799 }
800 #endif
801 PetscFunctionReturn(PETSC_SUCCESS);
802 }
803
MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A,Mat B,PetscReal fill,Mat C)804 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C)
805 {
806 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
807 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1;
808 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9];
809 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz;
810 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7};
811 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
812 const PetscInt *brow_ptr[8], *brow_end[8];
813 PetscInt window[8];
814 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows;
815 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft;
816 PetscReal afill;
817 PetscInt *workj_L1, *workj_L2, *workj_L3;
818 PetscInt L1_nnz, L2_nnz;
819
820 /* Step 1: Get upper bound on memory required for allocation.
821 Because of the way virtual memory works,
822 only the memory pages that are actually needed will be physically allocated. */
823 PetscFunctionBegin;
824 PetscCall(PetscMalloc1(am + 1, &ci));
825 for (i = 0; i < am; i++) {
826 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
827 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
828 a_rownnz = 0;
829 for (k = 0; k < anzi; ++k) {
830 a_rownnz += bi[acol[k] + 1] - bi[acol[k]];
831 if (a_rownnz > bn) {
832 a_rownnz = bn;
833 break;
834 }
835 }
836 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz);
837 }
838 /* temporary work areas for merging rows */
839 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1));
840 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2));
841 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3));
842
843 /* This should be enough for almost all matrices. If not, memory is reallocated later. */
844 c_maxmem = 8 * (ai[am] + bi[bm]);
845 /* Step 2: Populate pattern for C */
846 PetscCall(PetscMalloc1(c_maxmem, &cj));
847
848 ci_nnz = 0;
849 ci[0] = 0;
850 worki_L1[0] = 0;
851 worki_L2[0] = 0;
852 for (i = 0; i < am; i++) {
853 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
854 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */
855 rowsleft = anzi;
856 inputcol_L1 = acol;
857 L2_nnz = 0;
858 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */
859 worki_L2[1] = 0;
860 outputi_nnz = 0;
861
862 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */
863 while (ci_nnz + a_maxrownnz > c_maxmem) {
864 c_maxmem *= 2;
865 ndouble++;
866 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj));
867 }
868
869 while (rowsleft) {
870 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */
871 L1_nrows = 0;
872 L1_nnz = 0;
873 inputcol = inputcol_L1;
874 inputi = bi;
875 inputj = bj;
876
877 /* The following macro is used to specialize for small rows in A.
878 This helps with compiler unrolling, improving performance substantially.
879 Input: inputj inputi inputcol bn
880 Output: outputj outputi_nnz */
881 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \
882 do { \
883 window_min = bn; \
884 outputi_nnz = 0; \
885 for (k = 0; k < ANNZ; ++k) { \
886 brow_ptr[k] = inputj + inputi[inputcol[k]]; \
887 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \
888 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
889 window_min = PetscMin(window[k], window_min); \
890 } \
891 while (window_min < bn) { \
892 outputj[outputi_nnz++] = window_min; \
893 /* advance front and compute new minimum */ \
894 old_window_min = window_min; \
895 window_min = bn; \
896 for (k = 0; k < ANNZ; ++k) { \
897 if (window[k] == old_window_min) { \
898 brow_ptr[k]++; \
899 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \
900 } \
901 window_min = PetscMin(window[k], window_min); \
902 } \
903 } \
904 } while (0)
905
906 /************** L E V E L 1 ***************/
907 /* Merge up to 8 rows of B to L1 work array*/
908 while (L1_rowsleft) {
909 outputi_nnz = 0;
910 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/
911 else outputj = cj + ci_nnz; /* Merge directly to C */
912
913 switch (L1_rowsleft) {
914 case 1:
915 brow_ptr[0] = inputj + inputi[inputcol[0]];
916 brow_end[0] = inputj + inputi[inputcol[0] + 1];
917 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
918 inputcol += L1_rowsleft;
919 rowsleft -= L1_rowsleft;
920 L1_rowsleft = 0;
921 break;
922 case 2:
923 MatMatMultSymbolic_RowMergeMacro(2);
924 inputcol += L1_rowsleft;
925 rowsleft -= L1_rowsleft;
926 L1_rowsleft = 0;
927 break;
928 case 3:
929 MatMatMultSymbolic_RowMergeMacro(3);
930 inputcol += L1_rowsleft;
931 rowsleft -= L1_rowsleft;
932 L1_rowsleft = 0;
933 break;
934 case 4:
935 MatMatMultSymbolic_RowMergeMacro(4);
936 inputcol += L1_rowsleft;
937 rowsleft -= L1_rowsleft;
938 L1_rowsleft = 0;
939 break;
940 case 5:
941 MatMatMultSymbolic_RowMergeMacro(5);
942 inputcol += L1_rowsleft;
943 rowsleft -= L1_rowsleft;
944 L1_rowsleft = 0;
945 break;
946 case 6:
947 MatMatMultSymbolic_RowMergeMacro(6);
948 inputcol += L1_rowsleft;
949 rowsleft -= L1_rowsleft;
950 L1_rowsleft = 0;
951 break;
952 case 7:
953 MatMatMultSymbolic_RowMergeMacro(7);
954 inputcol += L1_rowsleft;
955 rowsleft -= L1_rowsleft;
956 L1_rowsleft = 0;
957 break;
958 default:
959 MatMatMultSymbolic_RowMergeMacro(8);
960 inputcol += 8;
961 rowsleft -= 8;
962 L1_rowsleft -= 8;
963 break;
964 }
965 inputcol_L1 = inputcol;
966 L1_nnz += outputi_nnz;
967 worki_L1[++L1_nrows] = L1_nnz;
968 }
969
970 /********************** L E V E L 2 ************************/
971 /* Merge from L1 work array to either C or to L2 work array */
972 if (anzi > 8) {
973 inputi = worki_L1;
974 inputj = workj_L1;
975 inputcol = workcol;
976 outputi_nnz = 0;
977
978 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */
979 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */
980
981 switch (L1_nrows) {
982 case 1:
983 brow_ptr[0] = inputj + inputi[inputcol[0]];
984 brow_end[0] = inputj + inputi[inputcol[0] + 1];
985 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
986 break;
987 case 2:
988 MatMatMultSymbolic_RowMergeMacro(2);
989 break;
990 case 3:
991 MatMatMultSymbolic_RowMergeMacro(3);
992 break;
993 case 4:
994 MatMatMultSymbolic_RowMergeMacro(4);
995 break;
996 case 5:
997 MatMatMultSymbolic_RowMergeMacro(5);
998 break;
999 case 6:
1000 MatMatMultSymbolic_RowMergeMacro(6);
1001 break;
1002 case 7:
1003 MatMatMultSymbolic_RowMergeMacro(7);
1004 break;
1005 case 8:
1006 MatMatMultSymbolic_RowMergeMacro(8);
1007 break;
1008 default:
1009 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!");
1010 }
1011 L2_nnz += outputi_nnz;
1012 worki_L2[++L2_nrows] = L2_nnz;
1013
1014 /************************ L E V E L 3 **********************/
1015 /* Merge from L2 work array to either C or to L2 work array */
1016 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) {
1017 inputi = worki_L2;
1018 inputj = workj_L2;
1019 inputcol = workcol;
1020 outputi_nnz = 0;
1021 if (rowsleft) outputj = workj_L3;
1022 else outputj = cj + ci_nnz;
1023 switch (L2_nrows) {
1024 case 1:
1025 brow_ptr[0] = inputj + inputi[inputcol[0]];
1026 brow_end[0] = inputj + inputi[inputcol[0] + 1];
1027 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */
1028 break;
1029 case 2:
1030 MatMatMultSymbolic_RowMergeMacro(2);
1031 break;
1032 case 3:
1033 MatMatMultSymbolic_RowMergeMacro(3);
1034 break;
1035 case 4:
1036 MatMatMultSymbolic_RowMergeMacro(4);
1037 break;
1038 case 5:
1039 MatMatMultSymbolic_RowMergeMacro(5);
1040 break;
1041 case 6:
1042 MatMatMultSymbolic_RowMergeMacro(6);
1043 break;
1044 case 7:
1045 MatMatMultSymbolic_RowMergeMacro(7);
1046 break;
1047 case 8:
1048 MatMatMultSymbolic_RowMergeMacro(8);
1049 break;
1050 default:
1051 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!");
1052 }
1053 L2_nrows = 1;
1054 L2_nnz = outputi_nnz;
1055 worki_L2[1] = outputi_nnz;
1056 /* Copy to workj_L2 */
1057 if (rowsleft) {
1058 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k];
1059 }
1060 }
1061 }
1062 } /* while (rowsleft) */
1063 #undef MatMatMultSymbolic_RowMergeMacro
1064
1065 /* terminate current row */
1066 ci_nnz += outputi_nnz;
1067 ci[i + 1] = ci_nnz;
1068 }
1069
1070 /* Step 3: Create the new symbolic matrix */
1071 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1072 PetscCall(MatSetBlockSizesFromMats(C, A, B));
1073
1074 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1075 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1076 c = (Mat_SeqAIJ *)C->data;
1077 c->free_a = PETSC_TRUE;
1078 c->free_ij = PETSC_TRUE;
1079 c->nonew = 0;
1080
1081 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1082
1083 /* set MatInfo */
1084 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5;
1085 if (afill < 1.0) afill = 1.0;
1086 C->info.mallocs = ndouble;
1087 C->info.fill_ratio_given = fill;
1088 C->info.fill_ratio_needed = afill;
1089
1090 #if defined(PETSC_USE_INFO)
1091 if (ci[am]) {
1092 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1093 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1094 } else {
1095 PetscCall(PetscInfo(C, "Empty matrix product\n"));
1096 }
1097 #endif
1098
1099 /* Step 4: Free temporary work areas */
1100 PetscCall(PetscFree(workj_L1));
1101 PetscCall(PetscFree(workj_L2));
1102 PetscCall(PetscFree(workj_L3));
1103 PetscFunctionReturn(PETSC_SUCCESS);
1104 }
1105
1106 /* concatenate unique entries and then sort */
MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A,Mat B,PetscReal fill,Mat C)1107 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C)
1108 {
1109 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
1110 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j;
1111 PetscInt *ci, *cj, bcol;
1112 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N;
1113 PetscReal afill;
1114 PetscInt i, j, ndouble = 0;
1115 PetscSegBuffer seg, segrow;
1116 char *seen;
1117
1118 PetscFunctionBegin;
1119 PetscCall(PetscMalloc1(am + 1, &ci));
1120 ci[0] = 0;
1121
1122 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */
1123 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg));
1124 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow));
1125 PetscCall(PetscCalloc1(bn, &seen));
1126
1127 /* Determine ci and cj */
1128 for (i = 0; i < am; i++) {
1129 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */
1130 const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */
1131 PetscInt packlen = 0, *PETSC_RESTRICT crow;
1132
1133 /* Pack segrow */
1134 for (j = 0; j < anzi; j++) {
1135 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k;
1136 for (k = bjstart; k < bjend; k++) {
1137 bcol = bj[k];
1138 if (!seen[bcol]) { /* new entry */
1139 PetscInt *PETSC_RESTRICT slot;
1140 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1141 *slot = bcol;
1142 seen[bcol] = 1;
1143 packlen++;
1144 }
1145 }
1146 }
1147
1148 /* Check i-th diagonal entry */
1149 if (C->force_diagonals && !seen[i]) {
1150 PetscInt *PETSC_RESTRICT slot;
1151 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot));
1152 *slot = i;
1153 seen[i] = 1;
1154 packlen++;
1155 }
1156
1157 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow));
1158 PetscCall(PetscSegBufferExtractTo(segrow, crow));
1159 PetscCall(PetscSortInt(packlen, crow));
1160 ci[i + 1] = ci[i] + packlen;
1161 for (j = 0; j < packlen; j++) seen[crow[j]] = 0;
1162 }
1163 PetscCall(PetscSegBufferDestroy(&segrow));
1164 PetscCall(PetscFree(seen));
1165
1166 /* Column indices are in the segmented buffer */
1167 PetscCall(PetscSegBufferExtractAlloc(seg, &cj));
1168 PetscCall(PetscSegBufferDestroy(&seg));
1169
1170 /* put together the new symbolic matrix */
1171 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C));
1172 PetscCall(MatSetBlockSizesFromMats(C, A, B));
1173
1174 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1175 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */
1176 c = (Mat_SeqAIJ *)C->data;
1177 c->free_a = PETSC_TRUE;
1178 c->free_ij = PETSC_TRUE;
1179 c->nonew = 0;
1180
1181 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted;
1182
1183 /* set MatInfo */
1184 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5;
1185 if (afill < 1.0) afill = 1.0;
1186 C->info.mallocs = ndouble;
1187 C->info.fill_ratio_given = fill;
1188 C->info.fill_ratio_needed = afill;
1189
1190 #if defined(PETSC_USE_INFO)
1191 if (ci[am]) {
1192 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill));
1193 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
1194 } else {
1195 PetscCall(PetscInfo(C, "Empty matrix product\n"));
1196 }
1197 #endif
1198 PetscFunctionReturn(PETSC_SUCCESS);
1199 }
1200
MatProductCtxDestroy_SeqAIJ_MatMatMultTrans(PetscCtxRt data)1201 static PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatMatMultTrans(PetscCtxRt data)
1202 {
1203 MatProductCtx_MatMatTransMult *abt = *(MatProductCtx_MatMatTransMult **)data;
1204
1205 PetscFunctionBegin;
1206 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring));
1207 PetscCall(MatDestroy(&abt->Bt_den));
1208 PetscCall(MatDestroy(&abt->ABt_den));
1209 PetscCall(PetscFree(abt));
1210 PetscFunctionReturn(PETSC_SUCCESS);
1211 }
1212
MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)1213 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1214 {
1215 Mat Bt;
1216 MatProductCtx_MatMatTransMult *abt;
1217 Mat_Product *product = C->product;
1218 char *alg;
1219
1220 PetscFunctionBegin;
1221 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1222 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1223
1224 /* create symbolic Bt */
1225 PetscCall(MatTransposeSymbolic(B, &Bt));
1226 PetscCall(MatSetBlockSizes(Bt, A->cmap->bs, B->cmap->bs));
1227 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name));
1228
1229 /* get symbolic C=A*Bt */
1230 PetscCall(PetscStrallocpy(product->alg, &alg));
1231 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */
1232 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C));
1233 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */
1234 PetscCall(PetscFree(alg));
1235
1236 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */
1237 PetscCall(PetscNew(&abt));
1238
1239 product->data = abt;
1240 product->destroy = MatProductCtxDestroy_SeqAIJ_MatMatMultTrans;
1241
1242 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ;
1243
1244 abt->usecoloring = PETSC_FALSE;
1245 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring));
1246 if (abt->usecoloring) {
1247 /* Create MatTransposeColoring from symbolic C=A*B^T */
1248 MatTransposeColoring matcoloring;
1249 MatColoring coloring;
1250 ISColoring iscoloring;
1251 Mat Bt_dense, C_dense;
1252
1253 /* inode causes memory problem */
1254 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE));
1255
1256 PetscCall(MatColoringCreate(C, &coloring));
1257 PetscCall(MatColoringSetDistance(coloring, 2));
1258 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL));
1259 PetscCall(MatColoringSetFromOptions(coloring));
1260 PetscCall(MatColoringApply(coloring, &iscoloring));
1261 PetscCall(MatColoringDestroy(&coloring));
1262 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring));
1263
1264 abt->matcoloring = matcoloring;
1265
1266 PetscCall(ISColoringDestroy(&iscoloring));
1267
1268 /* Create Bt_dense and C_dense = A*Bt_dense */
1269 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense));
1270 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors));
1271 PetscCall(MatSetType(Bt_dense, MATSEQDENSE));
1272 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL));
1273
1274 Bt_dense->assembled = PETSC_TRUE;
1275 abt->Bt_den = Bt_dense;
1276
1277 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense));
1278 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors));
1279 PetscCall(MatSetType(C_dense, MATSEQDENSE));
1280 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL));
1281
1282 Bt_dense->assembled = PETSC_TRUE;
1283 abt->ABt_den = C_dense;
1284
1285 #if defined(PETSC_USE_INFO)
1286 {
1287 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data;
1288 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n,
1289 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors)))));
1290 }
1291 #endif
1292 }
1293 /* clean up */
1294 PetscCall(MatDestroy(&Bt));
1295 PetscFunctionReturn(PETSC_SUCCESS);
1296 }
1297
MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)1298 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1299 {
1300 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1301 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow;
1302 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol;
1303 PetscLogDouble flops = 0.0;
1304 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval;
1305 MatProductCtx_MatMatTransMult *abt;
1306 Mat_Product *product = C->product;
1307
1308 PetscFunctionBegin;
1309 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1310 abt = (MatProductCtx_MatMatTransMult *)product->data;
1311 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1312 /* clear old values in C */
1313 if (!c->a) {
1314 PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1315 c->a = ca;
1316 c->free_a = PETSC_TRUE;
1317 } else {
1318 ca = c->a;
1319 PetscCall(PetscArrayzero(ca, ci[cm] + 1));
1320 }
1321
1322 if (abt->usecoloring) {
1323 MatTransposeColoring matcoloring = abt->matcoloring;
1324 Mat Bt_dense, C_dense = abt->ABt_den;
1325
1326 /* Get Bt_dense by Apply MatTransposeColoring to B */
1327 Bt_dense = abt->Bt_den;
1328 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense));
1329
1330 /* C_dense = A*Bt_dense */
1331 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense));
1332
1333 /* Recover C from C_dense */
1334 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C));
1335 PetscFunctionReturn(PETSC_SUCCESS);
1336 }
1337
1338 for (i = 0; i < cm; i++) {
1339 anzi = ai[i + 1] - ai[i];
1340 acol = PetscSafePointerPlusOffset(aj, ai[i]);
1341 aval = PetscSafePointerPlusOffset(aa, ai[i]);
1342 cnzi = ci[i + 1] - ci[i];
1343 ccol = PetscSafePointerPlusOffset(cj, ci[i]);
1344 cval = ca + ci[i];
1345 for (j = 0; j < cnzi; j++) {
1346 brow = ccol[j];
1347 bnzj = bi[brow + 1] - bi[brow];
1348 bcol = bj + bi[brow];
1349 bval = ba + bi[brow];
1350
1351 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */
1352 nexta = 0;
1353 nextb = 0;
1354 while (nexta < anzi && nextb < bnzj) {
1355 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++;
1356 if (nexta == anzi) break;
1357 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++;
1358 if (nextb == bnzj) break;
1359 if (acol[nexta] == bcol[nextb]) {
1360 cval[j] += aval[nexta] * bval[nextb];
1361 nexta++;
1362 nextb++;
1363 flops += 2;
1364 }
1365 }
1366 }
1367 }
1368 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1369 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1370 PetscCall(PetscLogFlops(flops));
1371 PetscFunctionReturn(PETSC_SUCCESS);
1372 }
1373
MatProductCtxDestroy_SeqAIJ_MatTransMatMult(PetscCtxRt data)1374 PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatTransMatMult(PetscCtxRt data)
1375 {
1376 MatProductCtx_MatTransMatMult *atb = *(MatProductCtx_MatTransMatMult **)data;
1377
1378 PetscFunctionBegin;
1379 PetscCall(MatDestroy(&atb->At));
1380 if (atb->destroy) PetscCall((*atb->destroy)(&atb->data));
1381 PetscCall(PetscFree(atb));
1382 PetscFunctionReturn(PETSC_SUCCESS);
1383 }
1384
MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat B,PetscReal fill,Mat C)1385 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C)
1386 {
1387 Mat At = NULL;
1388 Mat_Product *product = C->product;
1389 PetscBool flg, def, square;
1390
1391 PetscFunctionBegin;
1392 MatCheckProduct(C, 4);
1393 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE);
1394 /* outerproduct */
1395 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg));
1396 if (flg) {
1397 /* create symbolic At */
1398 if (!square) {
1399 PetscCall(MatTransposeSymbolic(A, &At));
1400 PetscCall(MatSetBlockSizes(At, A->cmap->bs, B->cmap->bs));
1401 PetscCall(MatSetType(At, ((PetscObject)A)->type_name));
1402 }
1403 /* get symbolic C=At*B */
1404 PetscCall(MatProductSetAlgorithm(C, "sorted"));
1405 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1406
1407 /* clean up */
1408 if (!square) PetscCall(MatDestroy(&At));
1409
1410 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */
1411 PetscCall(MatProductSetAlgorithm(C, "outerproduct"));
1412 PetscFunctionReturn(PETSC_SUCCESS);
1413 }
1414
1415 /* matmatmult */
1416 PetscCall(PetscStrcmp(product->alg, "default", &def));
1417 PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
1418 if (flg || def) {
1419 MatProductCtx_MatTransMatMult *atb;
1420
1421 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
1422 PetscCall(PetscNew(&atb));
1423 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
1424 PetscCall(MatProductSetAlgorithm(C, "sorted"));
1425 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C));
1426 PetscCall(MatProductSetAlgorithm(C, "at*b"));
1427 product->data = atb;
1428 product->destroy = MatProductCtxDestroy_SeqAIJ_MatTransMatMult;
1429 atb->At = At;
1430
1431 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */
1432 PetscFunctionReturn(PETSC_SUCCESS);
1433 }
1434
1435 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported");
1436 }
1437
MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A,Mat B,Mat C)1438 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C)
1439 {
1440 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data;
1441 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb;
1442 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k;
1443 PetscLogDouble flops = 0.0;
1444 MatScalar *aa = a->a, *ba, *ca, *caj;
1445
1446 PetscFunctionBegin;
1447 if (!c->a) {
1448 PetscCall(PetscCalloc1(ci[cm] + 1, &ca));
1449
1450 c->a = ca;
1451 c->free_a = PETSC_TRUE;
1452 } else {
1453 ca = c->a;
1454 PetscCall(PetscArrayzero(ca, ci[cm]));
1455 }
1456
1457 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */
1458 for (i = 0; i < am; i++) {
1459 bj = b->j + bi[i];
1460 ba = b->a + bi[i];
1461 bnzi = bi[i + 1] - bi[i];
1462 anzi = ai[i + 1] - ai[i];
1463 for (j = 0; j < anzi; j++) {
1464 nextb = 0;
1465 crow = *aj++;
1466 cjj = cj + ci[crow];
1467 caj = ca + ci[crow];
1468 /* perform sparse axpy operation. Note cjj includes bj. */
1469 for (k = 0; nextb < bnzi; k++) {
1470 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */
1471 caj[k] += (*aa) * (*(ba + nextb));
1472 nextb++;
1473 }
1474 }
1475 flops += 2 * bnzi;
1476 aa++;
1477 }
1478 }
1479
1480 /* Assemble the final matrix and clean up */
1481 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1482 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1483 PetscCall(PetscLogFlops(flops));
1484 PetscFunctionReturn(PETSC_SUCCESS);
1485 }
1486
MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)1487 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1488 {
1489 PetscFunctionBegin;
1490 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C));
1491 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense;
1492 PetscFunctionReturn(PETSC_SUCCESS);
1493 }
1494
MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A,Mat B,Mat C,const PetscBool add)1495 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add)
1496 {
1497 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1498 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4;
1499 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av;
1500 const PetscInt *aj;
1501 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n;
1502 PetscInt clda;
1503 PetscInt am4, bm4, col, i, j, n;
1504
1505 PetscFunctionBegin;
1506 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS);
1507 PetscCall(MatSeqAIJGetArrayRead(A, &av));
1508 if (add) {
1509 PetscCall(MatDenseGetArray(C, &c));
1510 } else {
1511 PetscCall(MatDenseGetArrayWrite(C, &c));
1512 }
1513 PetscCall(MatDenseGetArrayRead(B, &b));
1514 PetscCall(MatDenseGetLDA(B, &bm));
1515 PetscCall(MatDenseGetLDA(C, &clda));
1516 am4 = 4 * clda;
1517 bm4 = 4 * bm;
1518 if (b) {
1519 b1 = b;
1520 b2 = b1 + bm;
1521 b3 = b2 + bm;
1522 b4 = b3 + bm;
1523 } else b1 = b2 = b3 = b4 = NULL;
1524 c1 = c;
1525 c2 = c1 + clda;
1526 c3 = c2 + clda;
1527 c4 = c3 + clda;
1528 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */
1529 for (i = 0; i < am; i++) { /* over rows of A in those columns */
1530 r1 = r2 = r3 = r4 = 0.0;
1531 n = a->i[i + 1] - a->i[i];
1532 aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1533 aa = PetscSafePointerPlusOffset(av, a->i[i]);
1534 for (j = 0; j < n; j++) {
1535 const PetscScalar aatmp = aa[j];
1536 const PetscInt ajtmp = aj[j];
1537 r1 += aatmp * b1[ajtmp];
1538 r2 += aatmp * b2[ajtmp];
1539 r3 += aatmp * b3[ajtmp];
1540 r4 += aatmp * b4[ajtmp];
1541 }
1542 if (add) {
1543 c1[i] += r1;
1544 c2[i] += r2;
1545 c3[i] += r3;
1546 c4[i] += r4;
1547 } else {
1548 c1[i] = r1;
1549 c2[i] = r2;
1550 c3[i] = r3;
1551 c4[i] = r4;
1552 }
1553 }
1554 if (b) {
1555 b1 += bm4;
1556 b2 += bm4;
1557 b3 += bm4;
1558 b4 += bm4;
1559 }
1560 c1 += am4;
1561 c2 += am4;
1562 c3 += am4;
1563 c4 += am4;
1564 }
1565 /* process remaining columns */
1566 if (col != cn) {
1567 PetscInt rc = cn - col;
1568
1569 if (rc == 1) {
1570 for (i = 0; i < am; i++) {
1571 r1 = 0.0;
1572 n = a->i[i + 1] - a->i[i];
1573 aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1574 aa = PetscSafePointerPlusOffset(av, a->i[i]);
1575 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]];
1576 if (add) c1[i] += r1;
1577 else c1[i] = r1;
1578 }
1579 } else if (rc == 2) {
1580 for (i = 0; i < am; i++) {
1581 r1 = r2 = 0.0;
1582 n = a->i[i + 1] - a->i[i];
1583 aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1584 aa = PetscSafePointerPlusOffset(av, a->i[i]);
1585 for (j = 0; j < n; j++) {
1586 const PetscScalar aatmp = aa[j];
1587 const PetscInt ajtmp = aj[j];
1588 r1 += aatmp * b1[ajtmp];
1589 r2 += aatmp * b2[ajtmp];
1590 }
1591 if (add) {
1592 c1[i] += r1;
1593 c2[i] += r2;
1594 } else {
1595 c1[i] = r1;
1596 c2[i] = r2;
1597 }
1598 }
1599 } else {
1600 for (i = 0; i < am; i++) {
1601 r1 = r2 = r3 = 0.0;
1602 n = a->i[i + 1] - a->i[i];
1603 aj = PetscSafePointerPlusOffset(a->j, a->i[i]);
1604 aa = PetscSafePointerPlusOffset(av, a->i[i]);
1605 for (j = 0; j < n; j++) {
1606 const PetscScalar aatmp = aa[j];
1607 const PetscInt ajtmp = aj[j];
1608 r1 += aatmp * b1[ajtmp];
1609 r2 += aatmp * b2[ajtmp];
1610 r3 += aatmp * b3[ajtmp];
1611 }
1612 if (add) {
1613 c1[i] += r1;
1614 c2[i] += r2;
1615 c3[i] += r3;
1616 } else {
1617 c1[i] = r1;
1618 c2[i] = r2;
1619 c3[i] = r3;
1620 }
1621 }
1622 }
1623 }
1624 PetscCall(PetscLogFlops(cn * (2.0 * a->nz)));
1625 if (add) {
1626 PetscCall(MatDenseRestoreArray(C, &c));
1627 } else {
1628 PetscCall(MatDenseRestoreArrayWrite(C, &c));
1629 }
1630 PetscCall(MatDenseRestoreArrayRead(B, &b));
1631 PetscCall(MatSeqAIJRestoreArrayRead(A, &av));
1632 PetscFunctionReturn(PETSC_SUCCESS);
1633 }
1634
MatMatMultNumeric_SeqAIJ_SeqDense(Mat A,Mat B,Mat C)1635 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C)
1636 {
1637 PetscFunctionBegin;
1638 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n);
1639 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n);
1640 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n);
1641
1642 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE));
1643 PetscFunctionReturn(PETSC_SUCCESS);
1644 }
1645
MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)1646 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C)
1647 {
1648 PetscFunctionBegin;
1649 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense;
1650 C->ops->productsymbolic = MatProductSymbolic_AB;
1651 PetscFunctionReturn(PETSC_SUCCESS);
1652 }
1653
1654 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat);
1655
MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)1656 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C)
1657 {
1658 PetscFunctionBegin;
1659 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1660 C->ops->productsymbolic = MatProductSymbolic_AtB;
1661 PetscFunctionReturn(PETSC_SUCCESS);
1662 }
1663
MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)1664 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C)
1665 {
1666 PetscFunctionBegin;
1667 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense;
1668 C->ops->productsymbolic = MatProductSymbolic_ABt;
1669 PetscFunctionReturn(PETSC_SUCCESS);
1670 }
1671
MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)1672 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C)
1673 {
1674 Mat_Product *product = C->product;
1675
1676 PetscFunctionBegin;
1677 switch (product->type) {
1678 case MATPRODUCT_AB:
1679 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C));
1680 break;
1681 case MATPRODUCT_AtB:
1682 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C));
1683 break;
1684 case MATPRODUCT_ABt:
1685 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C));
1686 break;
1687 default:
1688 break;
1689 }
1690 PetscFunctionReturn(PETSC_SUCCESS);
1691 }
1692
MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)1693 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C)
1694 {
1695 Mat_Product *product = C->product;
1696 Mat A = product->A;
1697 PetscBool baij;
1698
1699 PetscFunctionBegin;
1700 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij));
1701 if (!baij) { /* A is seqsbaij */
1702 PetscBool sbaij;
1703 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij));
1704 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format");
1705
1706 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense;
1707 } else { /* A is seqbaij */
1708 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense;
1709 }
1710
1711 C->ops->productsymbolic = MatProductSymbolic_AB;
1712 PetscFunctionReturn(PETSC_SUCCESS);
1713 }
1714
MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)1715 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C)
1716 {
1717 Mat_Product *product = C->product;
1718
1719 PetscFunctionBegin;
1720 MatCheckProduct(C, 1);
1721 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A");
1722 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C));
1723 else if (product->type == MATPRODUCT_AtB) {
1724 PetscBool flg;
1725
1726 PetscCall(PetscObjectTypeCompare((PetscObject)product->A, MATSEQBAIJ, &flg));
1727 if (flg) {
1728 C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense;
1729 C->ops->productsymbolic = MatProductSymbolic_AtB;
1730 }
1731 }
1732 PetscFunctionReturn(PETSC_SUCCESS);
1733 }
1734
MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)1735 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C)
1736 {
1737 PetscFunctionBegin;
1738 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ;
1739 C->ops->productsymbolic = MatProductSymbolic_AB;
1740 PetscFunctionReturn(PETSC_SUCCESS);
1741 }
1742
MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)1743 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C)
1744 {
1745 Mat_Product *product = C->product;
1746
1747 PetscFunctionBegin;
1748 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C));
1749 PetscFunctionReturn(PETSC_SUCCESS);
1750 }
1751
MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring,Mat B,Mat Btdense)1752 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense)
1753 {
1754 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
1755 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data;
1756 PetscInt *bi = b->i, *bj = b->j;
1757 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns;
1758 MatScalar *btval, *btval_den, *ba = b->a;
1759 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors;
1760
1761 PetscFunctionBegin;
1762 btval_den = btdense->v;
1763 PetscCall(PetscArrayzero(btval_den, m * n));
1764 for (k = 0; k < ncolors; k++) {
1765 ncolumns = coloring->ncolumns[k];
1766 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */
1767 col = *(columns + colorforcol[k] + l);
1768 btcol = bj + bi[col];
1769 btval = ba + bi[col];
1770 anz = bi[col + 1] - bi[col];
1771 for (j = 0; j < anz; j++) {
1772 brow = btcol[j];
1773 btval_den[brow] = btval[j];
1774 }
1775 }
1776 btval_den += m;
1777 }
1778 PetscFunctionReturn(PETSC_SUCCESS);
1779 }
1780
MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring,Mat Cden,Mat Csp)1781 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp)
1782 {
1783 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data;
1784 const PetscScalar *ca_den, *ca_den_ptr;
1785 PetscScalar *ca = csp->a;
1786 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors;
1787 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp;
1788 PetscInt nrows, *row, *idx;
1789 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow;
1790
1791 PetscFunctionBegin;
1792 PetscCall(MatDenseGetArrayRead(Cden, &ca_den));
1793
1794 if (brows > 0) {
1795 PetscInt *lstart, row_end, row_start;
1796 lstart = matcoloring->lstart;
1797 PetscCall(PetscArrayzero(lstart, ncolors));
1798
1799 row_end = brows;
1800 if (row_end > m) row_end = m;
1801 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */
1802 ca_den_ptr = ca_den;
1803 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */
1804 nrows = matcoloring->nrows[k];
1805 row = rows + colorforrow[k];
1806 idx = den2sp + colorforrow[k];
1807 for (l = lstart[k]; l < nrows; l++) {
1808 if (row[l] >= row_end) {
1809 lstart[k] = l;
1810 break;
1811 } else {
1812 ca[idx[l]] = ca_den_ptr[row[l]];
1813 }
1814 }
1815 ca_den_ptr += m;
1816 }
1817 row_end += brows;
1818 if (row_end > m) row_end = m;
1819 }
1820 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */
1821 ca_den_ptr = ca_den;
1822 for (k = 0; k < ncolors; k++) {
1823 nrows = matcoloring->nrows[k];
1824 row = rows + colorforrow[k];
1825 idx = den2sp + colorforrow[k];
1826 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]];
1827 ca_den_ptr += m;
1828 }
1829 }
1830
1831 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den));
1832 #if defined(PETSC_USE_INFO)
1833 if (matcoloring->brows > 0) {
1834 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows));
1835 } else {
1836 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n"));
1837 }
1838 #endif
1839 PetscFunctionReturn(PETSC_SUCCESS);
1840 }
1841
MatTransposeColoringCreate_SeqAIJ(Mat mat,ISColoring iscoloring,MatTransposeColoring c)1842 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c)
1843 {
1844 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm;
1845 const PetscInt *is, *ci, *cj, *row_idx;
1846 PetscInt nis = iscoloring->n, *rowhit, bs = 1;
1847 IS *isa;
1848 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data;
1849 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i;
1850 PetscInt *colorforcol, *columns, *columns_i, brows;
1851 PetscBool flg;
1852
1853 PetscFunctionBegin;
1854 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa));
1855
1856 /* bs > 1 is not being tested yet! */
1857 Nbs = mat->cmap->N / bs;
1858 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */
1859 c->N = Nbs;
1860 c->m = c->M;
1861 c->rstart = 0;
1862 c->brows = 100;
1863
1864 c->ncolors = nis;
1865 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow));
1866 PetscCall(PetscMalloc1(csp->nz + 1, &rows));
1867 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp));
1868
1869 brows = c->brows;
1870 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg));
1871 if (flg) c->brows = brows;
1872 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart));
1873
1874 colorforrow[0] = 0;
1875 rows_i = rows;
1876 den2sp_i = den2sp;
1877
1878 PetscCall(PetscMalloc1(nis + 1, &colorforcol));
1879 PetscCall(PetscMalloc1(Nbs + 1, &columns));
1880
1881 colorforcol[0] = 0;
1882 columns_i = columns;
1883
1884 /* get column-wise storage of mat */
1885 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1886
1887 cm = c->m;
1888 PetscCall(PetscMalloc1(cm + 1, &rowhit));
1889 PetscCall(PetscMalloc1(cm + 1, &idxhit));
1890 for (i = 0; i < nis; i++) { /* loop over color */
1891 PetscCall(ISGetLocalSize(isa[i], &n));
1892 PetscCall(ISGetIndices(isa[i], &is));
1893
1894 c->ncolumns[i] = n;
1895 if (n) PetscCall(PetscArraycpy(columns_i, is, n));
1896 colorforcol[i + 1] = colorforcol[i] + n;
1897 columns_i += n;
1898
1899 /* fast, crude version requires O(N*N) work */
1900 PetscCall(PetscArrayzero(rowhit, cm));
1901
1902 for (j = 0; j < n; j++) { /* loop over columns*/
1903 col = is[j];
1904 row_idx = cj + ci[col];
1905 m = ci[col + 1] - ci[col];
1906 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */
1907 idxhit[*row_idx] = spidx[ci[col] + k];
1908 rowhit[*row_idx++] = col + 1;
1909 }
1910 }
1911 /* count the number of hits */
1912 nrows = 0;
1913 for (j = 0; j < cm; j++) {
1914 if (rowhit[j]) nrows++;
1915 }
1916 c->nrows[i] = nrows;
1917 colorforrow[i + 1] = colorforrow[i] + nrows;
1918
1919 nrows = 0;
1920 for (j = 0; j < cm; j++) { /* loop over rows */
1921 if (rowhit[j]) {
1922 rows_i[nrows] = j;
1923 den2sp_i[nrows] = idxhit[j];
1924 nrows++;
1925 }
1926 }
1927 den2sp_i += nrows;
1928
1929 PetscCall(ISRestoreIndices(isa[i], &is));
1930 rows_i += nrows;
1931 }
1932 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL));
1933 PetscCall(PetscFree(rowhit));
1934 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa));
1935 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]);
1936
1937 c->colorforrow = colorforrow;
1938 c->rows = rows;
1939 c->den2sp = den2sp;
1940 c->colorforcol = colorforcol;
1941 c->columns = columns;
1942
1943 PetscCall(PetscFree(idxhit));
1944 PetscFunctionReturn(PETSC_SUCCESS);
1945 }
1946
MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)1947 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C)
1948 {
1949 Mat_Product *product = C->product;
1950 Mat A = product->A, B = product->B;
1951
1952 PetscFunctionBegin;
1953 if (C->ops->mattransposemultnumeric) {
1954 /* Alg: "outerproduct" */
1955 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C));
1956 } else {
1957 /* Alg: "matmatmult" -- C = At*B */
1958 MatProductCtx_MatTransMatMult *atb = (MatProductCtx_MatTransMatMult *)product->data;
1959
1960 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct");
1961 if (atb->At) {
1962 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ();
1963 user may have called MatProductReplaceMats() to get this A=product->A */
1964 PetscCall(MatTransposeSetPrecursor(A, atb->At));
1965 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At));
1966 }
1967 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C));
1968 }
1969 PetscFunctionReturn(PETSC_SUCCESS);
1970 }
1971
MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)1972 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C)
1973 {
1974 Mat_Product *product = C->product;
1975 Mat A = product->A, B = product->B;
1976 PetscReal fill = product->fill;
1977
1978 PetscFunctionBegin;
1979 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C));
1980
1981 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ;
1982 PetscFunctionReturn(PETSC_SUCCESS);
1983 }
1984
MatProductSetFromOptions_SeqAIJ_AB(Mat C)1985 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C)
1986 {
1987 Mat_Product *product = C->product;
1988 PetscInt alg = 0; /* default algorithm */
1989 PetscBool flg = PETSC_FALSE;
1990 #if !defined(PETSC_HAVE_HYPRE)
1991 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
1992 PetscInt nalg = 7;
1993 #else
1994 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"};
1995 PetscInt nalg = 8;
1996 #endif
1997
1998 PetscFunctionBegin;
1999 /* Set default algorithm */
2000 PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2001 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2002
2003 /* Get runtime option */
2004 if (product->api_user) {
2005 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
2006 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg));
2007 PetscOptionsEnd();
2008 } else {
2009 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
2010 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg));
2011 PetscOptionsEnd();
2012 }
2013 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2014
2015 C->ops->productsymbolic = MatProductSymbolic_AB;
2016 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ;
2017 PetscFunctionReturn(PETSC_SUCCESS);
2018 }
2019
MatProductSetFromOptions_SeqAIJ_AtB(Mat C)2020 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C)
2021 {
2022 Mat_Product *product = C->product;
2023 PetscInt alg = 0; /* default algorithm */
2024 PetscBool flg = PETSC_FALSE;
2025 const char *algTypes[3] = {"default", "at*b", "outerproduct"};
2026 PetscInt nalg = 3;
2027
2028 PetscFunctionBegin;
2029 /* Get runtime option */
2030 if (product->api_user) {
2031 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat");
2032 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2033 PetscOptionsEnd();
2034 } else {
2035 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
2036 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg));
2037 PetscOptionsEnd();
2038 }
2039 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2040
2041 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ;
2042 PetscFunctionReturn(PETSC_SUCCESS);
2043 }
2044
MatProductSetFromOptions_SeqAIJ_ABt(Mat C)2045 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C)
2046 {
2047 Mat_Product *product = C->product;
2048 PetscInt alg = 0; /* default algorithm */
2049 PetscBool flg = PETSC_FALSE;
2050 const char *algTypes[2] = {"default", "color"};
2051 PetscInt nalg = 2;
2052
2053 PetscFunctionBegin;
2054 /* Set default algorithm */
2055 PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
2056 if (!flg) {
2057 alg = 1;
2058 PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2059 }
2060
2061 /* Get runtime option */
2062 if (product->api_user) {
2063 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat");
2064 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2065 PetscOptionsEnd();
2066 } else {
2067 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat");
2068 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg));
2069 PetscOptionsEnd();
2070 }
2071 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2072
2073 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ;
2074 C->ops->productsymbolic = MatProductSymbolic_ABt;
2075 PetscFunctionReturn(PETSC_SUCCESS);
2076 }
2077
MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)2078 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C)
2079 {
2080 Mat_Product *product = C->product;
2081 PetscBool flg = PETSC_FALSE;
2082 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */
2083 #if !defined(PETSC_HAVE_HYPRE)
2084 const char *algTypes[2] = {"scalable", "rap"};
2085 PetscInt nalg = 2;
2086 #else
2087 const char *algTypes[3] = {"scalable", "rap", "hypre"};
2088 PetscInt nalg = 3;
2089 #endif
2090
2091 PetscFunctionBegin;
2092 /* Set default algorithm */
2093 PetscCall(PetscStrcmp(product->alg, "default", &flg));
2094 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2095
2096 /* Get runtime option */
2097 if (product->api_user) {
2098 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
2099 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2100 PetscOptionsEnd();
2101 } else {
2102 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
2103 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg));
2104 PetscOptionsEnd();
2105 }
2106 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2107
2108 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ;
2109 PetscFunctionReturn(PETSC_SUCCESS);
2110 }
2111
MatProductSetFromOptions_SeqAIJ_RARt(Mat C)2112 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C)
2113 {
2114 Mat_Product *product = C->product;
2115 PetscBool flg = PETSC_FALSE;
2116 PetscInt alg = 0; /* default algorithm */
2117 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"};
2118 PetscInt nalg = 3;
2119
2120 PetscFunctionBegin;
2121 /* Set default algorithm */
2122 PetscCall(PetscStrcmp(product->alg, "default", &flg));
2123 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2124
2125 /* Get runtime option */
2126 if (product->api_user) {
2127 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat");
2128 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg));
2129 PetscOptionsEnd();
2130 } else {
2131 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat");
2132 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg));
2133 PetscOptionsEnd();
2134 }
2135 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2136
2137 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ;
2138 PetscFunctionReturn(PETSC_SUCCESS);
2139 }
2140
2141 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */
MatProductSetFromOptions_SeqAIJ_ABC(Mat C)2142 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C)
2143 {
2144 Mat_Product *product = C->product;
2145 PetscInt alg = 0; /* default algorithm */
2146 PetscBool flg = PETSC_FALSE;
2147 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"};
2148 PetscInt nalg = 7;
2149
2150 PetscFunctionBegin;
2151 /* Set default algorithm */
2152 PetscCall(PetscStrcmp(product->alg, "default", &flg));
2153 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2154
2155 /* Get runtime option */
2156 if (product->api_user) {
2157 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
2158 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
2159 PetscOptionsEnd();
2160 } else {
2161 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
2162 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
2163 PetscOptionsEnd();
2164 }
2165 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg]));
2166
2167 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ;
2168 C->ops->productsymbolic = MatProductSymbolic_ABC;
2169 PetscFunctionReturn(PETSC_SUCCESS);
2170 }
2171
MatProductSetFromOptions_SeqAIJ(Mat C)2172 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C)
2173 {
2174 Mat_Product *product = C->product;
2175
2176 PetscFunctionBegin;
2177 switch (product->type) {
2178 case MATPRODUCT_AB:
2179 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C));
2180 break;
2181 case MATPRODUCT_AtB:
2182 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C));
2183 break;
2184 case MATPRODUCT_ABt:
2185 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C));
2186 break;
2187 case MATPRODUCT_PtAP:
2188 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C));
2189 break;
2190 case MATPRODUCT_RARt:
2191 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C));
2192 break;
2193 case MATPRODUCT_ABC:
2194 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C));
2195 break;
2196 default:
2197 break;
2198 }
2199 PetscFunctionReturn(PETSC_SUCCESS);
2200 }
2201