1 2 /* 3 Defines basic operations for the MATSEQAIJPERM matrix class. 4 This class is derived from the MATSEQAIJ class and retains the 5 compressed row storage (aka Yale sparse matrix format) but augments 6 it with some permutation information that enables some operations 7 to be more vectorizable. A physically rearranged copy of the matrix 8 may be stored if the user desires. 9 10 Eventually a variety of permutations may be supported. 11 */ 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 16 #include <immintrin.h> 17 18 #if !defined(_MM_SCALE_8) 19 #define _MM_SCALE_8 8 20 #endif 21 #if !defined(_MM_SCALE_4) 22 #define _MM_SCALE_4 4 23 #endif 24 #endif 25 26 #define NDIM 512 27 /* NDIM specifies how many rows at a time we should work with when 28 * performing the vectorized mat-vec. This depends on various factors 29 * such as vector register length, etc., and I really need to add a 30 * way for the user (or the library) to tune this. I'm setting it to 31 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 32 * routines. */ 33 34 typedef struct { 35 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 36 37 PetscInt ngroup; 38 PetscInt *xgroup; 39 /* Denotes where groups of rows with same number of nonzeros 40 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 41 * where the ith group begins. */ 42 43 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 44 PetscInt *iperm; /* The permutation vector. */ 45 46 /* Some of this stuff is for Ed's recursive triangular solve. 47 * I'm not sure what I need yet. */ 48 PetscInt blocksize; 49 PetscInt nstep; 50 PetscInt *jstart_list; 51 PetscInt *jend_list; 52 PetscInt *action_list; 53 PetscInt *ngroup_list; 54 PetscInt **ipointer_list; 55 PetscInt **xgroup_list; 56 PetscInt **nzgroup_list; 57 PetscInt **iperm_list; 58 } Mat_SeqAIJPERM; 59 60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat) 61 { 62 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 63 /* so we will ignore 'MatType type'. */ 64 Mat B = *newmat; 65 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 66 67 PetscFunctionBegin; 68 if (reuse == MAT_INITIAL_MATRIX) { 69 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B)); 70 aijperm = (Mat_SeqAIJPERM *)B->spptr; 71 } 72 73 /* Reset the original function pointers. */ 74 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 75 B->ops->destroy = MatDestroy_SeqAIJ; 76 B->ops->duplicate = MatDuplicate_SeqAIJ; 77 B->ops->mult = MatMult_SeqAIJ; 78 B->ops->multadd = MatMultAdd_SeqAIJ; 79 80 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL)); 81 82 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 83 PetscCall(PetscFree(aijperm->xgroup)); 84 PetscCall(PetscFree(aijperm->nzgroup)); 85 PetscCall(PetscFree(aijperm->iperm)); 86 PetscCall(PetscFree(B->spptr)); 87 88 /* Change the type of B to MATSEQAIJ. */ 89 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ)); 90 91 *newmat = B; 92 PetscFunctionReturn(PETSC_SUCCESS); 93 } 94 95 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 96 { 97 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 98 99 PetscFunctionBegin; 100 if (aijperm) { 101 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 102 PetscCall(PetscFree(aijperm->xgroup)); 103 PetscCall(PetscFree(aijperm->nzgroup)); 104 PetscCall(PetscFree(aijperm->iperm)); 105 PetscCall(PetscFree(A->spptr)); 106 } 107 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 108 * to destroy everything that remains. */ 109 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ)); 110 /* Note that I don't call MatSetType(). I believe this is because that 111 * is only to be called when *building* a matrix. I could be wrong, but 112 * that is how things work for the SuperLU matrix class. */ 113 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL)); 114 PetscCall(MatDestroy_SeqAIJ(A)); 115 PetscFunctionReturn(PETSC_SUCCESS); 116 } 117 118 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 119 { 120 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 121 Mat_SeqAIJPERM *aijperm_dest; 122 PetscBool perm; 123 124 PetscFunctionBegin; 125 PetscCall(MatDuplicate_SeqAIJ(A, op, M)); 126 PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm)); 127 if (perm) { 128 aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr; 129 PetscCall(PetscFree(aijperm_dest->xgroup)); 130 PetscCall(PetscFree(aijperm_dest->nzgroup)); 131 PetscCall(PetscFree(aijperm_dest->iperm)); 132 } else { 133 PetscCall(PetscNew(&aijperm_dest)); 134 (*M)->spptr = (void *)aijperm_dest; 135 PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM)); 136 PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ)); 137 } 138 PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1)); 139 /* Allocate space for, and copy the grouping and permutation info. 140 * I note that when the groups are initially determined in 141 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 142 * necessary. But at this point, we know how large they need to be, and 143 * allocate only the necessary amount of memory. So the duplicated matrix 144 * may actually use slightly less storage than the original! */ 145 PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm)); 146 PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup)); 147 PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup)); 148 PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n)); 149 PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1)); 150 PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup)); 151 PetscFunctionReturn(PETSC_SUCCESS); 152 } 153 154 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ *)(A)->data; 157 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 158 PetscInt m; /* Number of rows in the matrix. */ 159 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 160 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 161 PetscInt *rows_in_bucket; 162 /* To construct the permutation, we sort each row into one of maxnz 163 * buckets based on how many nonzeros are in the row. */ 164 PetscInt nz; 165 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 166 PetscInt *ipnz; 167 /* When constructing the iperm permutation vector, 168 * ipnz[nz] is used to point to the next place in the permutation vector 169 * that a row with nz nonzero elements should be placed.*/ 170 PetscInt i, ngroup, istart, ipos; 171 172 PetscFunctionBegin; 173 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */ 174 aijperm->nonzerostate = A->nonzerostate; 175 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 176 PetscCall(PetscFree(aijperm->xgroup)); 177 PetscCall(PetscFree(aijperm->nzgroup)); 178 PetscCall(PetscFree(aijperm->iperm)); 179 180 m = A->rmap->n; 181 ia = a->i; 182 183 /* Allocate the arrays that will hold the permutation vector. */ 184 PetscCall(PetscMalloc1(m, &aijperm->iperm)); 185 186 /* Allocate some temporary work arrays that will be used in 187 * calculating the permutation vector and groupings. */ 188 PetscCall(PetscMalloc1(m, &nz_in_row)); 189 190 /* Now actually figure out the permutation and grouping. */ 191 192 /* First pass: Determine number of nonzeros in each row, maximum 193 * number of nonzeros in any row, and how many rows fall into each 194 * "bucket" of rows with same number of nonzeros. */ 195 maxnz = 0; 196 for (i = 0; i < m; i++) { 197 nz_in_row[i] = ia[i + 1] - ia[i]; 198 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 199 } 200 PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket)); 201 PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz)); 202 203 for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0; 204 for (i = 0; i < m; i++) { 205 nz = nz_in_row[i]; 206 rows_in_bucket[nz]++; 207 } 208 209 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 210 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 211 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 212 * then xgroup[] must consist of (maxnz + 2) elements, since the last 213 * element of xgroup will tell us where the (maxnz + 1)th group ends. 214 * We allocate space for the maximum number of groups; 215 * that is potentially a little wasteful, but not too much so. 216 * Perhaps I should fix it later. */ 217 PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup)); 218 PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup)); 219 220 /* Second pass. Look at what is in the buckets and create the groupings. 221 * Note that it is OK to have a group of rows with no non-zero values. */ 222 ngroup = 0; 223 istart = 0; 224 for (i = 0; i <= maxnz; i++) { 225 if (rows_in_bucket[i] > 0) { 226 aijperm->nzgroup[ngroup] = i; 227 aijperm->xgroup[ngroup] = istart; 228 ngroup++; 229 istart += rows_in_bucket[i]; 230 } 231 } 232 233 aijperm->xgroup[ngroup] = istart; 234 aijperm->ngroup = ngroup; 235 236 /* Now fill in the permutation vector iperm. */ 237 ipnz[0] = 0; 238 for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i]; 239 240 for (i = 0; i < m; i++) { 241 nz = nz_in_row[i]; 242 ipos = ipnz[nz]; 243 aijperm->iperm[ipos] = i; 244 ipnz[nz]++; 245 } 246 247 /* Clean up temporary work arrays. */ 248 PetscCall(PetscFree(rows_in_bucket)); 249 PetscCall(PetscFree(ipnz)); 250 PetscCall(PetscFree(nz_in_row)); 251 PetscFunctionReturn(PETSC_SUCCESS); 252 } 253 254 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 255 { 256 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 257 258 PetscFunctionBegin; 259 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 260 261 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 262 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 263 * I'm not sure if this is the best way to do this, but it avoids 264 * a lot of code duplication. 265 * I also note that currently MATSEQAIJPERM doesn't know anything about 266 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 267 * are many zero rows. If the SeqAIJ assembly end routine decides to use 268 * this, this may break things. (Don't know... haven't looked at it.) */ 269 a->inode.use = PETSC_FALSE; 270 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 271 272 /* Now calculate the permutation and grouping information. */ 273 PetscCall(MatSeqAIJPERM_create_perm(A)); 274 PetscFunctionReturn(PETSC_SUCCESS); 275 } 276 277 PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy) 278 { 279 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 280 const PetscScalar *x; 281 PetscScalar *y; 282 const MatScalar *aa; 283 const PetscInt *aj, *ai; 284 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 285 PetscInt i, j; 286 #endif 287 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 288 __m512d vec_x, vec_y, vec_vals; 289 __m256i vec_idx, vec_ipos, vec_j; 290 __mmask8 mask; 291 #endif 292 293 /* Variables that don't appear in MatMult_SeqAIJ. */ 294 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr; 295 PetscInt *iperm; /* Points to the permutation vector. */ 296 PetscInt *xgroup; 297 /* Denotes where groups of rows with same number of nonzeros 298 * begin and end in iperm. */ 299 PetscInt *nzgroup; 300 PetscInt ngroup; 301 PetscInt igroup; 302 PetscInt jstart, jend; 303 /* jstart is used in loops to denote the position in iperm where a 304 * group starts; jend denotes the position where it ends. 305 * (jend + 1 is where the next group starts.) */ 306 PetscInt iold, nz; 307 PetscInt istart, iend, isize; 308 PetscInt ipos; 309 PetscScalar yp[NDIM]; 310 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 311 312 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 313 #pragma disjoint(*x, *y, *aa) 314 #endif 315 316 PetscFunctionBegin; 317 PetscCall(VecGetArrayRead(xx, &x)); 318 PetscCall(VecGetArray(yy, &y)); 319 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 320 aa = a->a; /* Nonzero elements stored row-by-row. */ 321 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 322 323 /* Get the info we need about the permutations and groupings. */ 324 iperm = aijperm->iperm; 325 ngroup = aijperm->ngroup; 326 xgroup = aijperm->xgroup; 327 nzgroup = aijperm->nzgroup; 328 329 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 330 fortranmultaijperm_(&m, x, ii, aj, aa, y); 331 #else 332 333 for (igroup = 0; igroup < ngroup; igroup++) { 334 jstart = xgroup[igroup]; 335 jend = xgroup[igroup + 1] - 1; 336 nz = nzgroup[igroup]; 337 338 /* Handle the special cases where the number of nonzeros per row 339 * in the group is either 0 or 1. */ 340 if (nz == 0) { 341 for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0; 342 } else if (nz == 1) { 343 for (i = jstart; i <= jend; i++) { 344 iold = iperm[i]; 345 ipos = ai[iold]; 346 y[iold] = aa[ipos] * x[aj[ipos]]; 347 } 348 } else { 349 /* We work our way through the current group in chunks of NDIM rows 350 * at a time. */ 351 352 for (istart = jstart; istart <= jend; istart += NDIM) { 353 /* Figure out where the chunk of 'isize' rows ends in iperm. 354 * 'isize may of course be less than NDIM for the last chunk. */ 355 iend = istart + (NDIM - 1); 356 357 if (iend > jend) iend = jend; 358 359 isize = iend - istart + 1; 360 361 /* Initialize the yp[] array that will be used to hold part of 362 * the permuted results vector, and figure out where in aa each 363 * row of the chunk will begin. */ 364 for (i = 0; i < isize; i++) { 365 iold = iperm[istart + i]; 366 /* iold is a row number from the matrix A *before* reordering. */ 367 ip[i] = ai[iold]; 368 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 369 yp[i] = (PetscScalar)0.0; 370 } 371 372 /* If the number of zeros per row exceeds the number of rows in 373 * the chunk, we should vectorize along nz, that is, perform the 374 * mat-vec one row at a time as in the usual CSR case. */ 375 if (nz > isize) { 376 #if defined(PETSC_HAVE_CRAY_VECTOR) 377 #pragma _CRI preferstream 378 #endif 379 for (i = 0; i < isize; i++) { 380 #if defined(PETSC_HAVE_CRAY_VECTOR) 381 #pragma _CRI prefervector 382 #endif 383 384 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 385 vec_y = _mm512_setzero_pd(); 386 ipos = ip[i]; 387 for (j = 0; j < (nz >> 3); j++) { 388 vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]); 389 vec_vals = _mm512_loadu_pd(&aa[ipos]); 390 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); 391 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y); 392 ipos += 8; 393 } 394 if ((nz & 0x07) > 2) { 395 mask = (__mmask8)(0xff >> (8 - (nz & 0x07))); 396 vec_idx = _mm256_loadu_si256((__m256i const *)&aj[ipos]); 397 vec_vals = _mm512_loadu_pd(&aa[ipos]); 398 vec_x = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8); 399 vec_y = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask); 400 } else if ((nz & 0x07) == 2) { 401 yp[i] += aa[ipos] * x[aj[ipos]]; 402 yp[i] += aa[ipos + 1] * x[aj[ipos + 1]]; 403 } else if ((nz & 0x07) == 1) { 404 yp[i] += aa[ipos] * x[aj[ipos]]; 405 } 406 yp[i] += _mm512_reduce_add_pd(vec_y); 407 #else 408 for (j = 0; j < nz; j++) { 409 ipos = ip[i] + j; 410 yp[i] += aa[ipos] * x[aj[ipos]]; 411 } 412 #endif 413 } 414 } else { 415 /* Otherwise, there are enough rows in the chunk to make it 416 * worthwhile to vectorize across the rows, that is, to do the 417 * matvec by operating with "columns" of the chunk. */ 418 for (j = 0; j < nz; j++) { 419 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 420 vec_j = _mm256_set1_epi32(j); 421 for (i = 0; i < ((isize >> 3) << 3); i += 8) { 422 vec_y = _mm512_loadu_pd(&yp[i]); 423 vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]); 424 vec_ipos = _mm256_add_epi32(vec_ipos, vec_j); 425 vec_idx = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4); 426 vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8); 427 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); 428 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y); 429 _mm512_storeu_pd(&yp[i], vec_y); 430 } 431 for (i = isize - (isize & 0x07); i < isize; i++) { 432 ipos = ip[i] + j; 433 yp[i] += aa[ipos] * x[aj[ipos]]; 434 } 435 #else 436 for (i = 0; i < isize; i++) { 437 ipos = ip[i] + j; 438 yp[i] += aa[ipos] * x[aj[ipos]]; 439 } 440 #endif 441 } 442 } 443 444 #if defined(PETSC_HAVE_CRAY_VECTOR) 445 #pragma _CRI ivdep 446 #endif 447 /* Put results from yp[] into non-permuted result vector y. */ 448 for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i]; 449 } /* End processing chunk of isize rows of a group. */ 450 } /* End handling matvec for chunk with nz > 1. */ 451 } /* End loop over igroup. */ 452 #endif 453 PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0))); 454 PetscCall(VecRestoreArrayRead(xx, &x)); 455 PetscCall(VecRestoreArray(yy, &y)); 456 PetscFunctionReturn(PETSC_SUCCESS); 457 } 458 459 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 460 * Note that the names I used to designate the vectors differs from that 461 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 462 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 463 /* 464 I hate having virtually identical code for the mult and the multadd!!! 465 */ 466 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy) 467 { 468 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 469 const PetscScalar *x; 470 PetscScalar *y, *w; 471 const MatScalar *aa; 472 const PetscInt *aj, *ai; 473 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 474 PetscInt i, j; 475 #endif 476 477 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 478 Mat_SeqAIJPERM *aijperm; 479 PetscInt *iperm; /* Points to the permutation vector. */ 480 PetscInt *xgroup; 481 /* Denotes where groups of rows with same number of nonzeros 482 * begin and end in iperm. */ 483 PetscInt *nzgroup; 484 PetscInt ngroup; 485 PetscInt igroup; 486 PetscInt jstart, jend; 487 /* jstart is used in loops to denote the position in iperm where a 488 * group starts; jend denotes the position where it ends. 489 * (jend + 1 is where the next group starts.) */ 490 PetscInt iold, nz; 491 PetscInt istart, iend, isize; 492 PetscInt ipos; 493 PetscScalar yp[NDIM]; 494 PetscInt ip[NDIM]; 495 /* yp[] and ip[] are treated as vector "registers" for performing 496 * the mat-vec. */ 497 498 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 499 #pragma disjoint(*x, *y, *aa) 500 #endif 501 502 PetscFunctionBegin; 503 PetscCall(VecGetArrayRead(xx, &x)); 504 PetscCall(VecGetArrayPair(yy, ww, &y, &w)); 505 506 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 507 aa = a->a; /* Nonzero elements stored row-by-row. */ 508 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 509 510 /* Get the info we need about the permutations and groupings. */ 511 aijperm = (Mat_SeqAIJPERM *)A->spptr; 512 iperm = aijperm->iperm; 513 ngroup = aijperm->ngroup; 514 xgroup = aijperm->xgroup; 515 nzgroup = aijperm->nzgroup; 516 517 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 518 fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w); 519 #else 520 521 for (igroup = 0; igroup < ngroup; igroup++) { 522 jstart = xgroup[igroup]; 523 jend = xgroup[igroup + 1] - 1; 524 525 nz = nzgroup[igroup]; 526 527 /* Handle the special cases where the number of nonzeros per row 528 * in the group is either 0 or 1. */ 529 if (nz == 0) { 530 for (i = jstart; i <= jend; i++) { 531 iold = iperm[i]; 532 y[iold] = w[iold]; 533 } 534 } else if (nz == 1) { 535 for (i = jstart; i <= jend; i++) { 536 iold = iperm[i]; 537 ipos = ai[iold]; 538 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 539 } 540 } 541 /* For the general case: */ 542 else { 543 /* We work our way through the current group in chunks of NDIM rows 544 * at a time. */ 545 546 for (istart = jstart; istart <= jend; istart += NDIM) { 547 /* Figure out where the chunk of 'isize' rows ends in iperm. 548 * 'isize may of course be less than NDIM for the last chunk. */ 549 iend = istart + (NDIM - 1); 550 if (iend > jend) iend = jend; 551 isize = iend - istart + 1; 552 553 /* Initialize the yp[] array that will be used to hold part of 554 * the permuted results vector, and figure out where in aa each 555 * row of the chunk will begin. */ 556 for (i = 0; i < isize; i++) { 557 iold = iperm[istart + i]; 558 /* iold is a row number from the matrix A *before* reordering. */ 559 ip[i] = ai[iold]; 560 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 561 yp[i] = w[iold]; 562 } 563 564 /* If the number of zeros per row exceeds the number of rows in 565 * the chunk, we should vectorize along nz, that is, perform the 566 * mat-vec one row at a time as in the usual CSR case. */ 567 if (nz > isize) { 568 #if defined(PETSC_HAVE_CRAY_VECTOR) 569 #pragma _CRI preferstream 570 #endif 571 for (i = 0; i < isize; i++) { 572 #if defined(PETSC_HAVE_CRAY_VECTOR) 573 #pragma _CRI prefervector 574 #endif 575 for (j = 0; j < nz; j++) { 576 ipos = ip[i] + j; 577 yp[i] += aa[ipos] * x[aj[ipos]]; 578 } 579 } 580 } 581 /* Otherwise, there are enough rows in the chunk to make it 582 * worthwhile to vectorize across the rows, that is, to do the 583 * matvec by operating with "columns" of the chunk. */ 584 else { 585 for (j = 0; j < nz; j++) { 586 for (i = 0; i < isize; i++) { 587 ipos = ip[i] + j; 588 yp[i] += aa[ipos] * x[aj[ipos]]; 589 } 590 } 591 } 592 593 #if defined(PETSC_HAVE_CRAY_VECTOR) 594 #pragma _CRI ivdep 595 #endif 596 /* Put results from yp[] into non-permuted result vector y. */ 597 for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i]; 598 } /* End processing chunk of isize rows of a group. */ 599 600 } /* End handling matvec for chunk with nz > 1. */ 601 } /* End loop over igroup. */ 602 603 #endif 604 PetscCall(PetscLogFlops(2.0 * a->nz)); 605 PetscCall(VecRestoreArrayRead(xx, &x)); 606 PetscCall(VecRestoreArrayPair(yy, ww, &y, &w)); 607 PetscFunctionReturn(PETSC_SUCCESS); 608 } 609 610 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 611 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 612 * routine, but can also be used to convert an assembled SeqAIJ matrix 613 * into a SeqAIJPERM one. */ 614 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat) 615 { 616 Mat B = *newmat; 617 Mat_SeqAIJPERM *aijperm; 618 PetscBool sametype; 619 620 PetscFunctionBegin; 621 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B)); 622 PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype)); 623 if (sametype) PetscFunctionReturn(PETSC_SUCCESS); 624 625 PetscCall(PetscNew(&aijperm)); 626 B->spptr = (void *)aijperm; 627 628 /* Set function pointers for methods that we inherit from AIJ but override. */ 629 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 630 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 631 B->ops->destroy = MatDestroy_SeqAIJPERM; 632 B->ops->mult = MatMult_SeqAIJPERM; 633 B->ops->multadd = MatMultAdd_SeqAIJPERM; 634 635 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 636 /* If A has already been assembled, compute the permutation. */ 637 if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B)); 638 639 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ)); 640 641 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM)); 642 *newmat = B; 643 PetscFunctionReturn(PETSC_SUCCESS); 644 } 645 646 /*@C 647 MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`. 648 This type inherits from `MATSEQAIJ`, but calculates some additional permutation 649 information that is used to allow better vectorization of some 650 operations. At the cost of increased storage, the `MATSEQAIJ` formatted 651 matrix can be copied to a format in which pieces of the matrix are 652 stored in ELLPACK format, allowing the vectorized matrix multiply 653 routine to use stride-1 memory accesses. 654 655 Collective 656 657 Input Parameters: 658 + comm - MPI communicator, set to `PETSC_COMM_SELF` 659 . m - number of rows 660 . n - number of columns 661 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is given 662 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 663 664 Output Parameter: 665 . A - the matrix 666 667 Level: intermediate 668 669 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()` 670 @*/ 671 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 672 { 673 PetscFunctionBegin; 674 PetscCall(MatCreate(comm, A)); 675 PetscCall(MatSetSizes(*A, m, n, m, n)); 676 PetscCall(MatSetType(*A, MATSEQAIJPERM)); 677 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz)); 678 PetscFunctionReturn(PETSC_SUCCESS); 679 } 680 681 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 682 { 683 PetscFunctionBegin; 684 PetscCall(MatSetType(A, MATSEQAIJ)); 685 PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A)); 686 PetscFunctionReturn(PETSC_SUCCESS); 687 } 688