1 2 /* 3 Defines basic operations for the MATSEQAIJPERM matrix class. 4 This class is derived from the MATSEQAIJ class and retains the 5 compressed row storage (aka Yale sparse matrix format) but augments 6 it with some permutation information that enables some operations 7 to be more vectorizable. A physically rearranged copy of the matrix 8 may be stored if the user desires. 9 10 Eventually a variety of permutations may be supported. 11 */ 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 16 #include <immintrin.h> 17 18 #if !defined(_MM_SCALE_8) 19 #define _MM_SCALE_8 8 20 #endif 21 #if !defined(_MM_SCALE_4) 22 #define _MM_SCALE_4 4 23 #endif 24 #endif 25 26 #define NDIM 512 27 /* NDIM specifies how many rows at a time we should work with when 28 * performing the vectorized mat-vec. This depends on various factors 29 * such as vector register length, etc., and I really need to add a 30 * way for the user (or the library) to tune this. I'm setting it to 31 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 32 * routines. */ 33 34 typedef struct { 35 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 36 37 PetscInt ngroup; 38 PetscInt *xgroup; 39 /* Denotes where groups of rows with same number of nonzeros 40 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 41 * where the ith group begins. */ 42 43 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 44 PetscInt *iperm; /* The permutation vector. */ 45 46 /* Some of this stuff is for Ed's recursive triangular solve. 47 * I'm not sure what I need yet. */ 48 PetscInt blocksize; 49 PetscInt nstep; 50 PetscInt *jstart_list; 51 PetscInt *jend_list; 52 PetscInt *action_list; 53 PetscInt *ngroup_list; 54 PetscInt **ipointer_list; 55 PetscInt **xgroup_list; 56 PetscInt **nzgroup_list; 57 PetscInt **iperm_list; 58 } Mat_SeqAIJPERM; 59 60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 61 { 62 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 63 /* so we will ignore 'MatType type'. */ 64 PetscErrorCode ierr; 65 Mat B = *newmat; 66 Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr; 67 68 PetscFunctionBegin; 69 if (reuse == MAT_INITIAL_MATRIX) { 70 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 71 aijperm=(Mat_SeqAIJPERM*)B->spptr; 72 } 73 74 /* Reset the original function pointers. */ 75 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 76 B->ops->destroy = MatDestroy_SeqAIJ; 77 B->ops->duplicate = MatDuplicate_SeqAIJ; 78 B->ops->mult = MatMult_SeqAIJ; 79 B->ops->multadd = MatMultAdd_SeqAIJ; 80 81 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr); 82 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 83 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 84 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 85 ierr = PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",NULL);CHKERRQ(ierr); 86 87 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 88 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 89 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 90 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 91 ierr = PetscFree(B->spptr);CHKERRQ(ierr); 92 93 /* Change the type of B to MATSEQAIJ. */ 94 ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr); 95 96 *newmat = B; 97 PetscFunctionReturn(0); 98 } 99 100 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 101 { 102 PetscErrorCode ierr; 103 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 104 105 PetscFunctionBegin; 106 if (aijperm) { 107 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 108 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 109 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 110 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 111 ierr = PetscFree(A->spptr);CHKERRQ(ierr); 112 } 113 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 114 * to destroy everything that remains. */ 115 ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr); 116 /* Note that I don't call MatSetType(). I believe this is because that 117 * is only to be called when *building* a matrix. I could be wrong, but 118 * that is how things work for the SuperLU matrix class. */ 119 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 120 PetscFunctionReturn(0); 121 } 122 123 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 124 { 125 PetscErrorCode ierr; 126 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 127 Mat_SeqAIJPERM *aijperm_dest; 128 PetscBool perm; 129 130 PetscFunctionBegin; 131 ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr); 132 ierr = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr); 133 if (perm) { 134 aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr; 135 ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr); 136 ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr); 137 ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr); 138 } else { 139 ierr = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr); 140 (*M)->spptr = (void*) aijperm_dest; 141 ierr = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr); 142 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 143 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr); 144 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 145 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 146 } 147 ierr = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr); 148 /* Allocate space for, and copy the grouping and permutation info. 149 * I note that when the groups are initially determined in 150 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 151 * necessary. But at this point, we know how large they need to be, and 152 * allocate only the necessary amount of memory. So the duplicated matrix 153 * may actually use slightly less storage than the original! */ 154 ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr); 155 ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr); 156 ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr); 157 ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr); 158 ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr); 159 ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr); 160 PetscFunctionReturn(0); 161 } 162 163 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 164 { 165 PetscErrorCode ierr; 166 Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data; 167 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 168 PetscInt m; /* Number of rows in the matrix. */ 169 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 170 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 171 PetscInt *rows_in_bucket; 172 /* To construct the permutation, we sort each row into one of maxnz 173 * buckets based on how many nonzeros are in the row. */ 174 PetscInt nz; 175 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 176 PetscInt *ipnz; 177 /* When constructing the iperm permutation vector, 178 * ipnz[nz] is used to point to the next place in the permutation vector 179 * that a row with nz nonzero elements should be placed.*/ 180 PetscInt i, ngroup, istart, ipos; 181 182 PetscFunctionBegin; 183 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */ 184 aijperm->nonzerostate = A->nonzerostate; 185 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 186 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 187 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 188 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 189 190 m = A->rmap->n; 191 ia = a->i; 192 193 /* Allocate the arrays that will hold the permutation vector. */ 194 ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr); 195 196 /* Allocate some temporary work arrays that will be used in 197 * calculating the permuation vector and groupings. */ 198 ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr); 199 200 /* Now actually figure out the permutation and grouping. */ 201 202 /* First pass: Determine number of nonzeros in each row, maximum 203 * number of nonzeros in any row, and how many rows fall into each 204 * "bucket" of rows with same number of nonzeros. */ 205 maxnz = 0; 206 for (i=0; i<m; i++) { 207 nz_in_row[i] = ia[i+1]-ia[i]; 208 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 209 } 210 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr); 211 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr); 212 213 for (i=0; i<=maxnz; i++) { 214 rows_in_bucket[i] = 0; 215 } 216 for (i=0; i<m; i++) { 217 nz = nz_in_row[i]; 218 rows_in_bucket[nz]++; 219 } 220 221 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 222 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 223 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 224 * then xgroup[] must consist of (maxnz + 2) elements, since the last 225 * element of xgroup will tell us where the (maxnz + 1)th group ends. 226 * We allocate space for the maximum number of groups; 227 * that is potentially a little wasteful, but not too much so. 228 * Perhaps I should fix it later. */ 229 ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr); 230 ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr); 231 232 /* Second pass. Look at what is in the buckets and create the groupings. 233 * Note that it is OK to have a group of rows with no non-zero values. */ 234 ngroup = 0; 235 istart = 0; 236 for (i=0; i<=maxnz; i++) { 237 if (rows_in_bucket[i] > 0) { 238 aijperm->nzgroup[ngroup] = i; 239 aijperm->xgroup[ngroup] = istart; 240 ngroup++; 241 istart += rows_in_bucket[i]; 242 } 243 } 244 245 aijperm->xgroup[ngroup] = istart; 246 aijperm->ngroup = ngroup; 247 248 /* Now fill in the permutation vector iperm. */ 249 ipnz[0] = 0; 250 for (i=0; i<maxnz; i++) { 251 ipnz[i+1] = ipnz[i] + rows_in_bucket[i]; 252 } 253 254 for (i=0; i<m; i++) { 255 nz = nz_in_row[i]; 256 ipos = ipnz[nz]; 257 aijperm->iperm[ipos] = i; 258 ipnz[nz]++; 259 } 260 261 /* Clean up temporary work arrays. */ 262 ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr); 263 ierr = PetscFree(ipnz);CHKERRQ(ierr); 264 ierr = PetscFree(nz_in_row);CHKERRQ(ierr); 265 PetscFunctionReturn(0); 266 } 267 268 269 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 270 { 271 PetscErrorCode ierr; 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 274 PetscFunctionBegin; 275 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 276 277 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 278 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 279 * I'm not sure if this is the best way to do this, but it avoids 280 * a lot of code duplication. 281 * I also note that currently MATSEQAIJPERM doesn't know anything about 282 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 283 * are many zero rows. If the SeqAIJ assembly end routine decides to use 284 * this, this may break things. (Don't know... haven't looked at it.) */ 285 a->inode.use = PETSC_FALSE; 286 ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr); 287 288 /* Now calculate the permutation and grouping information. */ 289 ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr); 290 PetscFunctionReturn(0); 291 } 292 293 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy) 294 { 295 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 296 const PetscScalar *x; 297 PetscScalar *y; 298 const MatScalar *aa; 299 PetscErrorCode ierr; 300 const PetscInt *aj,*ai; 301 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 302 PetscInt i,j; 303 #endif 304 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 305 __m512d vec_x,vec_y,vec_vals; 306 __m256i vec_idx,vec_ipos,vec_j; 307 __mmask8 mask; 308 #endif 309 310 /* Variables that don't appear in MatMult_SeqAIJ. */ 311 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 312 PetscInt *iperm; /* Points to the permutation vector. */ 313 PetscInt *xgroup; 314 /* Denotes where groups of rows with same number of nonzeros 315 * begin and end in iperm. */ 316 PetscInt *nzgroup; 317 PetscInt ngroup; 318 PetscInt igroup; 319 PetscInt jstart,jend; 320 /* jstart is used in loops to denote the position in iperm where a 321 * group starts; jend denotes the position where it ends. 322 * (jend + 1 is where the next group starts.) */ 323 PetscInt iold,nz; 324 PetscInt istart,iend,isize; 325 PetscInt ipos; 326 PetscScalar yp[NDIM]; 327 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 328 329 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 330 #pragma disjoint(*x,*y,*aa) 331 #endif 332 333 PetscFunctionBegin; 334 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 335 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 336 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 337 aa = a->a; /* Nonzero elements stored row-by-row. */ 338 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 339 340 /* Get the info we need about the permutations and groupings. */ 341 iperm = aijperm->iperm; 342 ngroup = aijperm->ngroup; 343 xgroup = aijperm->xgroup; 344 nzgroup = aijperm->nzgroup; 345 346 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 347 fortranmultaijperm_(&m,x,ii,aj,aa,y); 348 #else 349 350 for (igroup=0; igroup<ngroup; igroup++) { 351 jstart = xgroup[igroup]; 352 jend = xgroup[igroup+1] - 1; 353 nz = nzgroup[igroup]; 354 355 /* Handle the special cases where the number of nonzeros per row 356 * in the group is either 0 or 1. */ 357 if (nz == 0) { 358 for (i=jstart; i<=jend; i++) { 359 y[iperm[i]] = 0.0; 360 } 361 } else if (nz == 1) { 362 for (i=jstart; i<=jend; i++) { 363 iold = iperm[i]; 364 ipos = ai[iold]; 365 y[iold] = aa[ipos] * x[aj[ipos]]; 366 } 367 } else { 368 369 /* We work our way through the current group in chunks of NDIM rows 370 * at a time. */ 371 372 for (istart=jstart; istart<=jend; istart+=NDIM) { 373 /* Figure out where the chunk of 'isize' rows ends in iperm. 374 * 'isize may of course be less than NDIM for the last chunk. */ 375 iend = istart + (NDIM - 1); 376 377 if (iend > jend) iend = jend; 378 379 isize = iend - istart + 1; 380 381 /* Initialize the yp[] array that will be used to hold part of 382 * the permuted results vector, and figure out where in aa each 383 * row of the chunk will begin. */ 384 for (i=0; i<isize; i++) { 385 iold = iperm[istart + i]; 386 /* iold is a row number from the matrix A *before* reordering. */ 387 ip[i] = ai[iold]; 388 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 389 yp[i] = (PetscScalar) 0.0; 390 } 391 392 /* If the number of zeros per row exceeds the number of rows in 393 * the chunk, we should vectorize along nz, that is, perform the 394 * mat-vec one row at a time as in the usual CSR case. */ 395 if (nz > isize) { 396 #if defined(PETSC_HAVE_CRAY_VECTOR) 397 #pragma _CRI preferstream 398 #endif 399 for (i=0; i<isize; i++) { 400 #if defined(PETSC_HAVE_CRAY_VECTOR) 401 #pragma _CRI prefervector 402 #endif 403 404 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 405 vec_y = _mm512_setzero_pd(); 406 ipos = ip[i]; 407 for (j=0; j<(nz>>3); j++) { 408 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 409 vec_vals = _mm512_loadu_pd(&aa[ipos]); 410 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 411 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 412 ipos += 8; 413 } 414 if ((nz&0x07)>2) { 415 mask = (__mmask8)(0xff >> (8-(nz&0x07))); 416 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 417 vec_vals = _mm512_loadu_pd(&aa[ipos]); 418 vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8); 419 vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask); 420 } else if ((nz&0x07)==2) { 421 yp[i] += aa[ipos]*x[aj[ipos]]; 422 yp[i] += aa[ipos+1]*x[aj[ipos+1]]; 423 } else if ((nz&0x07)==1) { 424 yp[i] += aa[ipos]*x[aj[ipos]]; 425 } 426 yp[i] += _mm512_reduce_add_pd(vec_y); 427 #else 428 for (j=0; j<nz; j++) { 429 ipos = ip[i] + j; 430 yp[i] += aa[ipos] * x[aj[ipos]]; 431 } 432 #endif 433 } 434 } else { 435 /* Otherwise, there are enough rows in the chunk to make it 436 * worthwhile to vectorize across the rows, that is, to do the 437 * matvec by operating with "columns" of the chunk. */ 438 for (j=0; j<nz; j++) { 439 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 440 vec_j = _mm256_set1_epi32(j); 441 for (i=0; i<((isize>>3)<<3); i+=8) { 442 vec_y = _mm512_loadu_pd(&yp[i]); 443 vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]); 444 vec_ipos = _mm256_add_epi32(vec_ipos,vec_j); 445 vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4); 446 vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8); 447 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 448 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 449 _mm512_storeu_pd(&yp[i],vec_y); 450 } 451 for (i=isize-(isize&0x07); i<isize; i++) { 452 ipos = ip[i]+j; 453 yp[i] += aa[ipos]*x[aj[ipos]]; 454 } 455 #else 456 for (i=0; i<isize; i++) { 457 ipos = ip[i] + j; 458 yp[i] += aa[ipos] * x[aj[ipos]]; 459 } 460 #endif 461 } 462 } 463 464 #if defined(PETSC_HAVE_CRAY_VECTOR) 465 #pragma _CRI ivdep 466 #endif 467 /* Put results from yp[] into non-permuted result vector y. */ 468 for (i=0; i<isize; i++) { 469 y[iperm[istart+i]] = yp[i]; 470 } 471 } /* End processing chunk of isize rows of a group. */ 472 } /* End handling matvec for chunk with nz > 1. */ 473 } /* End loop over igroup. */ 474 #endif 475 ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr); 476 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 477 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 478 PetscFunctionReturn(0); 479 } 480 481 482 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 483 * Note that the names I used to designate the vectors differs from that 484 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 485 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 486 /* 487 I hate having virtually identical code for the mult and the multadd!!! 488 */ 489 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy) 490 { 491 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 492 const PetscScalar *x; 493 PetscScalar *y,*w; 494 const MatScalar *aa; 495 PetscErrorCode ierr; 496 const PetscInt *aj,*ai; 497 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 498 PetscInt i,j; 499 #endif 500 501 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 502 Mat_SeqAIJPERM * aijperm; 503 PetscInt *iperm; /* Points to the permutation vector. */ 504 PetscInt *xgroup; 505 /* Denotes where groups of rows with same number of nonzeros 506 * begin and end in iperm. */ 507 PetscInt *nzgroup; 508 PetscInt ngroup; 509 PetscInt igroup; 510 PetscInt jstart,jend; 511 /* jstart is used in loops to denote the position in iperm where a 512 * group starts; jend denotes the position where it ends. 513 * (jend + 1 is where the next group starts.) */ 514 PetscInt iold,nz; 515 PetscInt istart,iend,isize; 516 PetscInt ipos; 517 PetscScalar yp[NDIM]; 518 PetscInt ip[NDIM]; 519 /* yp[] and ip[] are treated as vector "registers" for performing 520 * the mat-vec. */ 521 522 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 523 #pragma disjoint(*x,*y,*aa) 524 #endif 525 526 PetscFunctionBegin; 527 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 528 ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 529 530 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 531 aa = a->a; /* Nonzero elements stored row-by-row. */ 532 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 533 534 /* Get the info we need about the permutations and groupings. */ 535 aijperm = (Mat_SeqAIJPERM*) A->spptr; 536 iperm = aijperm->iperm; 537 ngroup = aijperm->ngroup; 538 xgroup = aijperm->xgroup; 539 nzgroup = aijperm->nzgroup; 540 541 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 542 fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w); 543 #else 544 545 for (igroup=0; igroup<ngroup; igroup++) { 546 jstart = xgroup[igroup]; 547 jend = xgroup[igroup+1] - 1; 548 549 nz = nzgroup[igroup]; 550 551 /* Handle the special cases where the number of nonzeros per row 552 * in the group is either 0 or 1. */ 553 if (nz == 0) { 554 for (i=jstart; i<=jend; i++) { 555 iold = iperm[i]; 556 y[iold] = w[iold]; 557 } 558 } 559 else if (nz == 1) { 560 for (i=jstart; i<=jend; i++) { 561 iold = iperm[i]; 562 ipos = ai[iold]; 563 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 564 } 565 } 566 /* For the general case: */ 567 else { 568 569 /* We work our way through the current group in chunks of NDIM rows 570 * at a time. */ 571 572 for (istart=jstart; istart<=jend; istart+=NDIM) { 573 /* Figure out where the chunk of 'isize' rows ends in iperm. 574 * 'isize may of course be less than NDIM for the last chunk. */ 575 iend = istart + (NDIM - 1); 576 if (iend > jend) iend = jend; 577 isize = iend - istart + 1; 578 579 /* Initialize the yp[] array that will be used to hold part of 580 * the permuted results vector, and figure out where in aa each 581 * row of the chunk will begin. */ 582 for (i=0; i<isize; i++) { 583 iold = iperm[istart + i]; 584 /* iold is a row number from the matrix A *before* reordering. */ 585 ip[i] = ai[iold]; 586 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 587 yp[i] = w[iold]; 588 } 589 590 /* If the number of zeros per row exceeds the number of rows in 591 * the chunk, we should vectorize along nz, that is, perform the 592 * mat-vec one row at a time as in the usual CSR case. */ 593 if (nz > isize) { 594 #if defined(PETSC_HAVE_CRAY_VECTOR) 595 #pragma _CRI preferstream 596 #endif 597 for (i=0; i<isize; i++) { 598 #if defined(PETSC_HAVE_CRAY_VECTOR) 599 #pragma _CRI prefervector 600 #endif 601 for (j=0; j<nz; j++) { 602 ipos = ip[i] + j; 603 yp[i] += aa[ipos] * x[aj[ipos]]; 604 } 605 } 606 } 607 /* Otherwise, there are enough rows in the chunk to make it 608 * worthwhile to vectorize across the rows, that is, to do the 609 * matvec by operating with "columns" of the chunk. */ 610 else { 611 for (j=0; j<nz; j++) { 612 for (i=0; i<isize; i++) { 613 ipos = ip[i] + j; 614 yp[i] += aa[ipos] * x[aj[ipos]]; 615 } 616 } 617 } 618 619 #if defined(PETSC_HAVE_CRAY_VECTOR) 620 #pragma _CRI ivdep 621 #endif 622 /* Put results from yp[] into non-permuted result vector y. */ 623 for (i=0; i<isize; i++) { 624 y[iperm[istart+i]] = yp[i]; 625 } 626 } /* End processing chunk of isize rows of a group. */ 627 628 } /* End handling matvec for chunk with nz > 1. */ 629 } /* End loop over igroup. */ 630 631 #endif 632 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 633 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 634 ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 635 PetscFunctionReturn(0); 636 } 637 638 /* This function prototype is needed in MatConvert_SeqAIJ_SeqAIJPERM(), below. */ 639 PETSC_INTERN PetscErrorCode MatPtAP_IS_XAIJ(Mat,Mat,MatReuse,PetscReal,Mat*); 640 641 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 642 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 643 * routine, but can also be used to convert an assembled SeqAIJ matrix 644 * into a SeqAIJPERM one. */ 645 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat) 646 { 647 PetscErrorCode ierr; 648 Mat B = *newmat; 649 Mat_SeqAIJPERM *aijperm; 650 PetscBool sametype; 651 652 PetscFunctionBegin; 653 if (reuse == MAT_INITIAL_MATRIX) { 654 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 655 } 656 ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr); 657 if (sametype) PetscFunctionReturn(0); 658 659 ierr = PetscNewLog(B,&aijperm);CHKERRQ(ierr); 660 B->spptr = (void*) aijperm; 661 662 /* Set function pointers for methods that we inherit from AIJ but override. */ 663 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 664 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 665 B->ops->destroy = MatDestroy_SeqAIJPERM; 666 B->ops->mult = MatMult_SeqAIJPERM; 667 B->ops->multadd = MatMultAdd_SeqAIJPERM; 668 669 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 670 /* If A has already been assembled, compute the permutation. */ 671 if (A->assembled) { 672 ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr); 673 } 674 675 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 676 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr); 677 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 678 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 679 ierr = PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",MatPtAP_IS_XAIJ);CHKERRQ(ierr); 680 681 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr); 682 *newmat = B; 683 PetscFunctionReturn(0); 684 } 685 686 /*@C 687 MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM. 688 This type inherits from AIJ, but calculates some additional permutation 689 information that is used to allow better vectorization of some 690 operations. At the cost of increased storage, the AIJ formatted 691 matrix can be copied to a format in which pieces of the matrix are 692 stored in ELLPACK format, allowing the vectorized matrix multiply 693 routine to use stride-1 memory accesses. As with the AIJ type, it is 694 important to preallocate matrix storage in order to get good assembly 695 performance. 696 697 Collective 698 699 Input Parameters: 700 + comm - MPI communicator, set to PETSC_COMM_SELF 701 . m - number of rows 702 . n - number of columns 703 . nz - number of nonzeros per row (same for all rows) 704 - nnz - array containing the number of nonzeros in the various rows 705 (possibly different for each row) or NULL 706 707 Output Parameter: 708 . A - the matrix 709 710 Notes: 711 If nnz is given then nz is ignored 712 713 Level: intermediate 714 715 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues() 716 @*/ 717 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 718 { 719 PetscErrorCode ierr; 720 721 PetscFunctionBegin; 722 ierr = MatCreate(comm,A);CHKERRQ(ierr); 723 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 724 ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr); 725 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr); 726 PetscFunctionReturn(0); 727 } 728 729 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 730 { 731 PetscErrorCode ierr; 732 733 PetscFunctionBegin; 734 ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr); 735 ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 736 PetscFunctionReturn(0); 737 } 738 739