/*
  Defines basic operations for the MATSEQAIJPERM matrix class.
  This class is derived from the MATSEQAIJ class and retains the
  compressed row storage (aka Yale sparse matrix format) but augments
  it with some permutation information that enables some operations
  to be more vectorizable. A physically rearranged copy of the matrix
  may be stored if the user desires.

  Eventually a variety of permutations may be supported.
*/

#include <../src/mat/impls/aij/seq/aij.h>

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
#include <immintrin.h>

/* Scale factors for the AVX-512 gather intrinsics: 8 bytes per double
 * element, 4 bytes per 32-bit index. */
#if !defined(_MM_SCALE_8)
#define _MM_SCALE_8 8
#endif
#if !defined(_MM_SCALE_4)
#define _MM_SCALE_4 4
#endif
#endif

#define NDIM 512
/* NDIM specifies how many rows at a time we should work with when
 * performing the vectorized mat-vec. This depends on various factors
 * such as vector register length, etc., and I really need to add a
 * way for the user (or the library) to tune this. I'm setting it to
 * 512 for now since that is what Ed D'Azevedo was using in his Fortran
 * routines. */

/* Extra data hung off a MATSEQAIJ matrix (via A->spptr) that records a
 * row permutation grouping rows by their nonzero count. */
typedef struct {
  PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

  PetscInt ngroup;
  PetscInt *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end, i.e., xgroup[i] gives us the position in iperm[]
   * where the ith group begins. */

  PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
  PetscInt *iperm;   /* The permutation vector. */

  /* Some of this stuff is for Ed's recursive triangular solve.
   * I'm not sure what I need yet. */
  PetscInt blocksize;
  PetscInt nstep;
  PetscInt *jstart_list;
  PetscInt *jend_list;
  PetscInt *action_list;
  PetscInt *ngroup_list;
  PetscInt **ipointer_list;
  PetscInt **xgroup_list;
  PetscInt **nzgroup_list;
  PetscInt **iperm_list;
} Mat_SeqAIJPERM;

/* Convert a MATSEQAIJPERM back to plain MATSEQAIJ: restore the SeqAIJ
 * method table, free the permutation data, and rename the type. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
{
  /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
  /* so we will ignore 'MatType type'. */
  Mat            B        = *newmat;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*)A->spptr;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B));
    /* B carries its own copy of the perm data; free that copy below, not A's. */
    aijperm = (Mat_SeqAIJPERM*)B->spptr;
  }

  /* Reset the original function pointers. */
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
  B->ops->destroy     = MatDestroy_SeqAIJ;
  B->ops->duplicate   = MatDuplicate_SeqAIJ;
  B->ops->mult        = MatMult_SeqAIJ;
  B->ops->multadd     = MatMultAdd_SeqAIJ;

  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL));

  /* Free everything in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));
  PetscCall(PetscFree(B->spptr));

  /* Change the type of B to MATSEQAIJ. */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));

  *newmat = B;
  PetscFunctionReturn(0);
}

/* Destroy the permutation data, then fall through to the SeqAIJ destructor. */
PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
{
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;

  PetscFunctionBegin;
  if (aijperm) {
    /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spptr. */
    PetscCall(PetscFree(aijperm->xgroup));
    PetscCall(PetscFree(aijperm->nzgroup));
    PetscCall(PetscFree(aijperm->iperm));
    PetscCall(PetscFree(A->spptr));
  }
  /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
   * to destroy everything that remains. */
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
  /* Note that I don't call MatSetType(). I believe this is because that
   * is only to be called when *building* a matrix. I could be wrong, but
   * that is how things work for the SuperLU matrix class. */
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

/* Duplicate the matrix via the SeqAIJ machinery, then deep-copy the
 * permutation/grouping data into the new matrix's spptr. */
PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
{
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
  Mat_SeqAIJPERM *aijperm_dest;
  PetscBool      perm;

  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,op,M));
  PetscCall(PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm));
  if (perm) {
    /* *M already carries perm data; drop its arrays before overwriting below. */
    aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
    PetscCall(PetscFree(aijperm_dest->xgroup));
    PetscCall(PetscFree(aijperm_dest->nzgroup));
    PetscCall(PetscFree(aijperm_dest->iperm));
  } else {
    PetscCall(PetscNewLog(*M,&aijperm_dest));
    (*M)->spptr = (void*) aijperm_dest;
    PetscCall(PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM));
    PetscCall(PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ));
  }
  /* Struct-copy the scalar fields (nonzerostate, ngroup, ...); the array
   * pointers copied here are replaced by fresh allocations just below. */
  PetscCall(PetscArraycpy(aijperm_dest,aijperm,1));
  /* Allocate space for, and copy the grouping and permutation info.
   * I note that when the groups are initially determined in
   * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
   * necessary. But at this point, we know how large they need to be, and
   * allocate only the necessary amount of memory. So the duplicated matrix
   * may actually use slightly less storage than the original!
   * NOTE(review): assumes A's permutation arrays are populated (i.e. A has
   * been assembled); iperm==NULL with A->rmap->n>0 would be copied here —
   * TODO confirm callers never duplicate an unassembled AIJPERM matrix. */
  PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
  PetscCall(PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup));
  PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
  PetscCall(PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n));
  PetscCall(PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1));
  PetscCall(PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup));
  PetscFunctionReturn(0);
}

/* Build the row permutation: bucket-sort the rows by nonzero count so rows
 * with equal length are contiguous in iperm, and record the group layout
 * in xgroup/nzgroup. No-op if the nonzero structure is unchanged. */
PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
{
  Mat_SeqAIJ     *a       = (Mat_SeqAIJ*)(A)->data;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
  PetscInt       m;     /* Number of rows in the matrix. */
  PetscInt       *ia;   /* From the CSR representation; points to the beginning of each row. */
  PetscInt       maxnz; /* Maximum number of nonzeros in any row. */
  PetscInt       *rows_in_bucket;
  /* To construct the permutation, we sort each row into one of maxnz
   * buckets based on how many nonzeros are in the row. */
  PetscInt       nz;
  PetscInt       *nz_in_row; /* the number of nonzero elements in row k. */
  PetscInt       *ipnz;
  /* When constructing the iperm permutation vector,
   * ipnz[nz] is used to point to the next place in the permutation vector
   * that a row with nz nonzero elements should be placed.*/
  PetscInt       i, ngroup, istart, ipos;

  PetscFunctionBegin;
  if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
  aijperm->nonzerostate = A->nonzerostate;
  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));

  m  = A->rmap->n;
  ia = a->i;

  /* Allocate the arrays that will hold the permutation vector. */
  PetscCall(PetscMalloc1(m, &aijperm->iperm));

  /* Allocate some temporary work arrays that will be used in
   * calculating the permutation vector and groupings. */
  PetscCall(PetscMalloc1(m, &nz_in_row));

  /* Now actually figure out the permutation and grouping. */

  /* First pass: Determine number of nonzeros in each row, maximum
   * number of nonzeros in any row, and how many rows fall into each
   * "bucket" of rows with same number of nonzeros. */
  maxnz = 0;
  for (i=0; i<m; i++) {
    nz_in_row[i] = ia[i+1]-ia[i];
    if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
  }
  PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket));
  PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz));

  for (i=0; i<=maxnz; i++) {
    rows_in_bucket[i] = 0;
  }
  for (i=0; i<m; i++) {
    nz = nz_in_row[i];
    rows_in_bucket[nz]++;
  }

  /* Allocate space for the grouping info. There will be at most (maxnz + 1)
   * groups. (It is maxnz + 1 instead of simply maxnz because there may be
   * rows with no nonzero elements.) If there are (maxnz + 1) groups,
   * then xgroup[] must consist of (maxnz + 2) elements, since the last
   * element of xgroup will tell us where the (maxnz + 1)th group ends.
   * We allocate space for the maximum number of groups;
   * that is potentially a little wasteful, but not too much so.
   * Perhaps I should fix it later. */
  PetscCall(PetscMalloc1(maxnz+2, &aijperm->xgroup));
  PetscCall(PetscMalloc1(maxnz+1, &aijperm->nzgroup));

  /* Second pass. Look at what is in the buckets and create the groupings.
   * Note that it is OK to have a group of rows with no non-zero values. */
  ngroup = 0;
  istart = 0;
  for (i=0; i<=maxnz; i++) {
    if (rows_in_bucket[i] > 0) {
      aijperm->nzgroup[ngroup] = i;
      aijperm->xgroup[ngroup]  = istart;
      ngroup++;
      istart += rows_in_bucket[i];
    }
  }

  aijperm->xgroup[ngroup] = istart;
  aijperm->ngroup         = ngroup;

  /* Now fill in the permutation vector iperm. */
  ipnz[0] = 0;
  for (i=0; i<maxnz; i++) {
    ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
  }

  for (i=0; i<m; i++) {
    nz   = nz_in_row[i];
    ipos = ipnz[nz];
    aijperm->iperm[ipos] = i;
    ipnz[nz]++;
  }

  /* Clean up temporary work arrays. */
  PetscCall(PetscFree(rows_in_bucket));
  PetscCall(PetscFree(ipnz));
  PetscCall(PetscFree(nz_in_row));
  PetscFunctionReturn(0);
}

/* Run the inherited SeqAIJ assembly, then (re)build the permutation. */
PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);

  /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
   * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
   * I'm not sure if this is the best way to do this, but it avoids
   * a lot of code duplication.
   * I also note that currently MATSEQAIJPERM doesn't know anything about
   * the Mat_CompressedRow data structure that SeqAIJ now uses when there
   * are many zero rows. If the SeqAIJ assembly end routine decides to use
   * this, this may break things. (Don't know... haven't looked at it.) */
  a->inode.use = PETSC_FALSE;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));

  /* Now calculate the permutation and grouping information. */
  PetscCall(MatSeqAIJPERM_create_perm(A));
  PetscFunctionReturn(0);
}

/* Computes yy = A*xx by walking the rows in permuted order: within each
 * group of equal-length rows, chunks of up to NDIM rows are processed so
 * the inner loops can be vectorized (along nz for long rows, across rows
 * for short ones). */
PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  const PetscInt    *aj,*ai;
#if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
  PetscInt i,j;
#endif
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x,vec_y,vec_vals;
  __m256i  vec_idx,vec_ipos,vec_j;
  __mmask8 mask;
#endif

  /* Variables that don't appear in MatMult_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
  PetscInt       *iperm;  /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt       *nzgroup;
  PetscInt       ngroup;
  PetscInt       igroup;
  PetscInt       jstart,jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt       iold,nz;
  PetscInt       istart,iend,isize;
  PetscInt       ipos;
  PetscScalar    yp[NDIM];
  PetscInt       ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x,*y,*aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx,&x));
  PetscCall(VecGetArray(yy,&y));
  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
  fortranmultaijperm_(&m,x,ii,aj,aa,y);
#else

  for (igroup=0; igroup<ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup+1] - 1;
    nz     = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i=jstart; i<=jend; i++) {
        y[iperm[i]] = 0.0;
      }
    } else if (nz == 1) {
      for (i=jstart; i<=jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = aa[ipos] * x[aj[ipos]];
      }
    } else {

      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart=jstart; istart<=jend; istart+=NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);

        if (iend > jend) iend = jend;

        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i=0; i<isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = (PetscScalar) 0.0;
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i=0; i<isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Row dot-product: full 8-wide gather/FMA steps, then a masked
             * or scalar tail for the nz%8 leftover entries. */
            vec_y = _mm512_setzero_pd();
            ipos  = ip[i];
            for (j=0; j<(nz>>3); j++) {
              vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
              ipos += 8;
            }
            if ((nz&0x07)>2) {
              /* NOTE(review): vec_x is passed as the masked-gather source;
               * if nz < 8 it is uninitialized here, but only masked-off
               * lanes would carry that garbage and the mask3 FMA keeps
               * vec_y in those lanes — confirm this is benign as intended. */
              mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
              vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
              vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
            } else if ((nz&0x07)==2) {
              /* 1- and 2-element tails are cheaper done in scalar. */
              yp[i] += aa[ipos]*x[aj[ipos]];
              yp[i] += aa[ipos+1]*x[aj[ipos+1]];
            } else if ((nz&0x07)==1) {
              yp[i] += aa[ipos]*x[aj[ipos]];
            }
            /* Horizontal sum of the vector accumulator (runs in all tail cases). */
            yp[i] += _mm512_reduce_add_pd(vec_y);
#else
            for (j=0; j<nz; j++) {
              ipos   = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        } else {
          /* Otherwise, there are enough rows in the chunk to make it
           * worthwhile to vectorize across the rows, that is, to do the
           * matvec by operating with "columns" of the chunk. */
          for (j=0; j<nz; j++) {
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* Process the jth entry of 8 rows at a time; scalar loop
             * handles the final isize%8 rows. */
            vec_j = _mm256_set1_epi32(j);
            for (i=0; i<((isize>>3)<<3); i+=8) {
              vec_y    = _mm512_loadu_pd(&yp[i]);
              vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
              vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
              vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
              vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
              vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
              _mm512_storeu_pd(&yp[i],vec_y);
            }
            for (i=isize-(isize&0x07); i<isize; i++) {
              ipos   = ip[i]+j;
              yp[i] += aa[ipos]*x[aj[ipos]];
            }
#else
            for (i=0; i<isize; i++) {
              ipos   = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i=0; i<isize; i++) {
          y[iperm[istart+i]] = yp[i];
        }
      } /* End processing chunk of isize rows of a group. */
    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */
#endif
  PetscCall(PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0)));
  PetscCall(VecRestoreArrayRead(xx,&x));
  PetscCall(VecRestoreArray(yy,&y));
  PetscFunctionReturn(0);
}

/* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
 * Note that the names I used to designate the vectors differs from that
 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
/*
   I hate having virtually identical code for the mult and the multadd!!!
*/
/* Computes yy = ww + A*xx using the same grouped/chunked traversal as
 * MatMult_SeqAIJPERM(); the only difference is that yp[] is seeded with
 * w[] instead of zero (and the nz==0 group copies w through to y). */
PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
{
  Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
  const PetscScalar *x;
  PetscScalar       *y,*w;
  const MatScalar   *aa;
  const PetscInt    *aj,*ai;
#if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  PetscInt i,j;
#endif

  /* Variables that don't appear in MatMultAdd_SeqAIJ. */
  Mat_SeqAIJPERM * aijperm;
  PetscInt       *iperm;  /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt       *nzgroup;
  PetscInt       ngroup;
  PetscInt       igroup;
  PetscInt       jstart,jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt       iold,nz;
  PetscInt       istart,iend,isize;
  PetscInt       ipos;
  PetscScalar    yp[NDIM];
  PetscInt       ip[NDIM];
  /* yp[] and ip[] are treated as vector "registers" for performing
   * the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x,*y,*aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx,&x));
  PetscCall(VecGetArrayPair(yy,ww,&y,&w));

  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  aijperm = (Mat_SeqAIJPERM*) A->spptr;
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
#else

  for (igroup=0; igroup<ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup+1] - 1;

    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i=jstart; i<=jend; i++) {
        iold    = iperm[i];
        y[iold] = w[iold];
      }
    }
    else if (nz == 1) {
      for (i=jstart; i<=jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
      }
    }
    /* For the general case: */
    else {

      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart=jstart; istart<=jend; istart+=NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);
        if (iend > jend) iend = jend;
        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i=0; i<isize; i++) {
          iold = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = w[iold]; /* seed with w so the final store yields w + A*x */
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i=0; i<isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif
            for (j=0; j<nz; j++) {
              ipos   = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }
        /* Otherwise, there are enough rows in the chunk to make it
         * worthwhile to vectorize across the rows, that is, to do the
         * matvec by operating with "columns" of the chunk. */
        else {
          for (j=0; j<nz; j++) {
            for (i=0; i<isize; i++) {
              ipos   = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i=0; i<isize; i++) {
          y[iperm[istart+i]] = yp[i];
        }
      } /* End processing chunk of isize rows of a group. */

    } /* End handling matvec for chunk with nz > 1. */
  } /* End loop over igroup. */

#endif
  PetscCall(PetscLogFlops(2.0*a->nz));
  PetscCall(VecRestoreArrayRead(xx,&x));
  PetscCall(VecRestoreArrayPair(yy,ww,&y,&w));
  PetscFunctionReturn(0);
}

/* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
 * routine, but can also be used to convert an assembled SeqAIJ matrix
 * into a SeqAIJPERM one.
 */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
{
  Mat            B = *newmat;
  Mat_SeqAIJPERM *aijperm;
  PetscBool      sametype;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B));
  }
  PetscCall(PetscObjectTypeCompare((PetscObject)A,type,&sametype));
  /* NOTE(review): if sametype were true after a MAT_INITIAL_MATRIX
   * duplicate, this returns without setting *newmat (B would leak).
   * Presumably unreachable because MatConvert() short-circuits same-type
   * conversions before dispatching here — verify against MatConvert(). */
  if (sametype) PetscFunctionReturn(0);

  PetscCall(PetscNewLog(B,&aijperm));
  B->spptr = (void*) aijperm;

  /* Set function pointers for methods that we inherit from AIJ but override. */
  B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
  B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
  B->ops->destroy     = MatDestroy_SeqAIJPERM;
  B->ops->mult        = MatMult_SeqAIJPERM;
  B->ops->multadd     = MatMultAdd_SeqAIJPERM;

  aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
  /* If A has already been assembled, compute the permutation. */
  if (A->assembled) {
    PetscCall(MatSeqAIJPERM_create_perm(B));
  }

  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ));

  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM));
  *newmat = B;
  PetscFunctionReturn(0);
}

/*@C
   MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
   This type inherits from AIJ, but calculates some additional permutation
   information that is used to allow better vectorization of some
   operations. At the cost of increased storage, the AIJ formatted
   matrix can be copied to a format in which pieces of the matrix are
   stored in ELLPACK format, allowing the vectorized matrix multiply
   routine to use stride-1 memory accesses. As with the AIJ type, it is
   important to preallocate matrix storage in order to get good assembly
   performance.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   Notes:
   If nnz is given then nz is ignored

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
@*/
PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  PetscCall(MatSetType(*A,MATSEQAIJPERM));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz));
  PetscFunctionReturn(0);
}

/* Type constructor registered for MATSEQAIJPERM: build a SeqAIJ matrix,
 * then convert it in place to SeqAIJPERM. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
{
  PetscFunctionBegin;
  PetscCall(MatSetType(A,MATSEQAIJ));
  PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A));
  PetscFunctionReturn(0);
}