1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at 2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights 3 // reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include "ceed-magma.h" 18 #include <string.h> 19 20 typedef struct { 21 CeedScalar *array; 22 CeedScalar *darray; 23 int own_; 24 int down_; 25 } CeedVector_Magma; 26 27 typedef struct { 28 CeedInt *indices; 29 CeedInt *dindices; 30 int own_; 31 int down_; // cover a case where we own Device memory 32 } CeedElemRestriction_Magma; 33 34 typedef struct { 35 CeedVector 36 *evecs; /// E-vectors needed to apply operator (input followed by outputs) 37 CeedScalar **edata; 38 CeedScalar **qdata; /// Inputs followed by outputs 39 CeedScalar 40 **qdata_alloc; /// Allocated quadrature data arrays (to be freed by us) 41 CeedScalar **indata; 42 CeedScalar **outdata; 43 CeedInt numein; 44 CeedInt numeout; 45 CeedInt numqin; 46 CeedInt numqout; 47 } CeedOperator_Magma; 48 49 // ***************************************************************************** 50 // * Initialize vector vec (after free mem) with values from array based on cmode 51 // * CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal 52 // * to array, and data is copied (not store passed pointer) 53 // * CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array 54 // * CEED_USE_POINTER: vec->data->array = array (can modify; no ownership) 55 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE 56 // ***************************************************************************** 57 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype, 58 CeedCopyMode cmode, CeedScalar *array) { 59 CeedVector_Magma *impl = vec->data; 60 int ierr; 61 62 // If own data, free the "old" data, e.g., as it may be of different size 63 if (impl->own_) { 64 magma_free( impl->darray ); 65 magma_free_pinned( impl->array ); 66 impl->darray = NULL; 67 impl->array = NULL; 68 impl->own_ = 0; 69 impl->down_= 0; 70 } 71 72 if (mtype == CEED_MEM_HOST) { 73 // memory is on the host; own_ = 0 74 switch (cmode) { 75 case CEED_COPY_VALUES: 76 ierr = magma_malloc( (void**)&impl->darray, 77 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 78 ierr = magma_malloc_pinned( (void**)&impl->array, 79 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 80 impl->own_ = 1; 81 82 if (array != NULL) 83 magma_setvector(vec->length, sizeof(array[0]), 84 array, 1, impl->darray, 1); 85 break; 86 case CEED_OWN_POINTER: 87 ierr = magma_malloc( (void**)&impl->darray, 88 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 89 // TODO: possible problem here is if we are passed non-pinned memory; 90 // (as we own it, lter in destroy, we use free for pinned memory). 91 impl->array = array; 92 impl->own_ = 1; 93 94 if (array != NULL) 95 magma_setvector(vec->length, sizeof(array[0]), 96 array, 1, impl->darray, 1); 97 break; 98 case CEED_USE_POINTER: 99 ierr = magma_malloc( (void**)&impl->darray, 100 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 101 magma_setvector(vec->length, sizeof(array[0]), 102 array, 1, impl->darray, 1); 103 104 impl->down_ = 1; 105 impl->array = array; 106 } 107 } else if (mtype == CEED_MEM_DEVICE) { 108 // memory is on the device; own = 0 109 switch (cmode) { 110 case CEED_COPY_VALUES: 111 ierr = magma_malloc( (void**)&impl->darray, 112 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 113 ierr = magma_malloc_pinned( (void**)&impl->array, 114 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 115 impl->own_ = 1; 116 117 if (array) 118 magma_copyvector(vec->length, sizeof(array[0]), 119 array, 1, impl->darray, 1); 120 else 121 // t30 assumes allocation initializes with 0s 122 magma_setvector(vec->length, sizeof(array[0]), 123 impl->array, 1, impl->darray, 1); 124 break; 125 case CEED_OWN_POINTER: 126 impl->darray = array; 127 ierr = magma_malloc_pinned( (void**)&impl->array, 128 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 129 impl->own_ = 1; 130 131 break; 132 case CEED_USE_POINTER: 133 impl->darray = array; 134 impl->array = NULL; 135 } 136 137 } else 138 return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported"); 139 140 return 0; 141 } 142 143 // ***************************************************************************** 144 // * Give data pointer from vector vec to array (on HOST or DEVICE) 145 // ***************************************************************************** 146 static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype, 147 CeedScalar **array) { 148 CeedVector_Magma *impl = vec->data; 149 int ierr; 150 151 if (mtype == CEED_MEM_HOST) { 152 if (impl->own_) { 153 // data is owned so GPU had the most up-to-date version; copy it 154 // TTT - apparantly it doesn't have most up to date data 155 magma_getvector(vec->length, sizeof(*array[0]), 156 impl->darray, 1, impl->array, 1); 157 CeedDebug("\033[31m[CeedVectorGetArray_Magma]"); 158 //fprintf(stderr,"rrrrrrrrrrrrrrr\n"); 159 } else if (impl->array == NULL) { 160 // Vector doesn't own the data and was set on GPU 161 if (impl->darray == NULL) { 162 // call was made just to allocate memory 163 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 164 CeedChk(ierr); 165 } else 166 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST"); 167 } 168 *array = impl->array; 169 } else if (mtype == CEED_MEM_DEVICE) { 170 if (impl->darray == NULL) { 171 // Vector doesn't own the data and was set on the CPU 172 if (impl->array == NULL) { 173 // call was made just to allocate memory 174 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 175 CeedChk(ierr); 176 } else 177 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE"); 178 } 179 *array = impl->darray; 180 } else 181 return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory"); 182 183 return 0; 184 } 185 186 // ***************************************************************************** 187 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it 188 // ***************************************************************************** 189 static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype, 190 const CeedScalar **array) { 191 CeedVector_Magma *impl = vec->data; 192 int ierr; 193 194 if (mtype == CEED_MEM_HOST) { 195 if (impl->own_) { 196 // data is owned so GPU had the most up-to-date version; copy it 197 magma_getvector(vec->length, sizeof(*array[0]), 198 impl->darray, 1, impl->array, 1); 199 } else if (impl->array == NULL) { 200 // Vector doesn't own the data and was set on GPU 201 if (impl->darray == NULL) { 202 // call was made just to allocate memory 203 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 204 CeedChk(ierr); 205 } else 206 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST"); 207 } 208 *array = impl->array; 209 } else if (mtype == CEED_MEM_DEVICE) { 210 if (impl->darray == NULL) { 211 // Vector doesn't own the data and was set on the CPU 212 if (impl->array == NULL) { 213 // call was made just to allocate memory 214 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 215 CeedChk(ierr); 216 } else 217 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE"); 218 } 219 *array = impl->darray; 220 } else 221 return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory"); 222 223 return 0; 224 } 225 226 // ***************************************************************************** 227 // * There is no mtype here for array so it is not clear if we restore from HOST 228 // * memory or from DEVICE memory. We assume that it is CPU memory because if 229 // * it was GPU memory we would not call this routine at all. 230 // * Restore vector vec with values from array, where array received its values 231 // * from vec and possibly modified them. 232 // ***************************************************************************** 233 static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) { 234 CeedVector_Magma *impl = vec->data; 235 236 // Check if the array is a CPU pointer 237 if (*array == impl->array) { 238 // Update device, if the device pointer is not NULL 239 if (impl->darray != NULL) { 240 magma_setvector(vec->length, sizeof(*array[0]), 241 *array, 1, impl->darray, 1); 242 } else { 243 // nothing to do (case of CPU use pointer) 244 } 245 246 } else if (impl->down_) { 247 // nothing to do if array is on GPU, except if down_=1(case CPU use pointer) 248 magma_getvector(vec->length, sizeof(*array[0]), 249 impl->darray, 1, impl->array, 1); 250 } 251 252 *array = NULL; 253 return 0; 254 } 255 256 // ***************************************************************************** 257 // * There is no mtype here for array so it is not clear if we restore from HOST 258 // * memory or from DEVICE memory. We assume that it is CPU memory because if 259 // * it was GPU memory we would not call this routine at all. 260 // * Restore vector vec with values from array, where array received its values 261 // * from vec to only read them; in this case vec may have been modified meanwhile 262 // * and needs to be restored here. 263 // ***************************************************************************** 264 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec, 265 const CeedScalar **array) { 266 CeedVector_Magma *impl = vec->data; 267 268 // Check if the array is a CPU pointer 269 if (*array == impl->array) { 270 // Update device, if the device pointer is not NULL 271 if (impl->darray != NULL) { 272 magma_setvector(vec->length, sizeof(*array[0]), 273 *array, 1, impl->darray, 1); 274 } else { 275 // nothing to do (case of CPU use pointer) 276 } 277 278 } else if (impl->down_) { 279 // nothing to do if array is on GPU, except if down_=1(case CPU use pointer) 280 magma_getvector(vec->length, sizeof(*array[0]), 281 impl->darray, 1, impl->array, 1); 282 } 283 284 *array = NULL; 285 return 0; 286 } 287 288 static int CeedVectorDestroy_Magma(CeedVector vec) { 289 CeedVector_Magma *impl = vec->data; 290 int ierr; 291 292 // Free if we own the data 293 if (impl->own_) { 294 ierr = magma_free_pinned(impl->array); CeedChk(ierr); 295 ierr = magma_free(impl->darray); CeedChk(ierr); 296 } else if (impl->down_) { 297 ierr = magma_free(impl->darray); CeedChk(ierr); 298 } 299 ierr = CeedFree(&vec->data); CeedChk(ierr); 300 return 0; 301 } 302 303 // ***************************************************************************** 304 // * Create vector vec of size n 305 // ***************************************************************************** 306 static int CeedVectorCreate_Magma(Ceed ceed, CeedInt n, CeedVector vec) { 307 CeedVector_Magma *impl; 308 int ierr; 309 310 vec->SetArray = CeedVectorSetArray_Magma; 311 vec->GetArray = CeedVectorGetArray_Magma; 312 vec->GetArrayRead = CeedVectorGetArrayRead_Magma; 313 vec->RestoreArray = CeedVectorRestoreArray_Magma; 314 vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma; 315 vec->Destroy = CeedVectorDestroy_Magma; 316 ierr = CeedCalloc(1,&impl); CeedChk(ierr); 317 impl->darray = NULL; 318 impl->array = NULL; 319 impl->own_ = 0; 320 impl->down_= 0; 321 vec->data = impl; 322 return 0; 323 } 324 325 326 // ***************************************************************************** 327 // * Apply restriction operator r to u: v = r(rmode) u 328 // ***************************************************************************** 329 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r, 330 CeedTransposeMode tmode, 331 CeedTransposeMode lmode, CeedVector u, 332 CeedVector v, CeedRequest *request) { 333 CeedElemRestriction_Magma *impl = r->data; 334 int ierr; 335 const CeedScalar *uu; 336 CeedScalar *vv; 337 CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof, 338 ncomp=r->ncomp; 339 CeedInt esize = nelem * elemsize; 340 341 #ifdef USE_MAGMA_BATCH2 342 CeedInt *dindices = impl->dindices; 343 // Get pointers on the device 344 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr); 345 ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr); 346 #else 347 CeedInt *indices = impl->indices; 348 ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr); 349 ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr); 350 #endif 351 352 if (tmode == CEED_NOTRANSPOSE) { 353 // Perform: v = r * u 354 if (ncomp == 1) { 355 #ifdef USE_MAGMA_BATCH2 356 magma_template<<i=0:esize>> 357 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 358 vv[i] = uu[dindices[i]]; 359 } 360 #else 361 for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]]; 362 #endif 363 } else { 364 // vv is (elemsize x ncomp x nelem), column-major 365 if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major 366 #ifdef USE_MAGMA_BATCH2 367 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 368 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) { 369 vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d]; 370 } 371 #else 372 for (CeedInt e = 0; e < nelem; e++) 373 for (CeedInt d = 0; d < ncomp; d++) 374 for (CeedInt i=0; i < elemsize; i++) { 375 vv[i + elemsize*(d+ncomp*e)] = 376 uu[indices[i+elemsize*e]+ndof*d]; 377 } 378 #endif 379 } else { // u is (ncomp x ndof), column-major 380 #ifdef USE_MAGMA_BATCH2 381 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 382 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 383 vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]]; 384 } 385 #else 386 for (CeedInt e = 0; e < nelem; e++) 387 for (CeedInt d = 0; d < ncomp; d++) 388 for (CeedInt i=0; i< elemsize; i++) { 389 vv[i + elemsize*(d+ncomp*e)] = 390 uu[d+ncomp*indices[i+elemsize*e]]; 391 } 392 #endif 393 } 394 } 395 } else { 396 // Note: in transpose mode, we perform: v += r^t * u 397 if (ncomp == 1) { 398 // fprintf(stderr,"3 ---------\n"); 399 #ifdef USE_MAGMA_BATCH2 400 magma_template<<i=0:esize>> 401 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 402 magmablas_datomic_add( &vv[dindices[i]], uu[i]); 403 } 404 #else 405 for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i]; 406 #endif 407 } else { // u is (elemsize x ncomp x nelem) 408 fprintf(stderr,"2 ---------\n"); 409 410 if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major 411 #ifdef USE_MAGMA_BATCH2 412 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 413 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) { 414 magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d], 415 uu[i+iend*(d+e*dend)]); 416 } 417 #else 418 for (CeedInt e = 0; e < nelem; e++) 419 for (CeedInt d = 0; d < ncomp; d++) 420 for (CeedInt i=0; i < elemsize; i++) { 421 vv[indices[i + elemsize*e]+ndof*d] += 422 uu[i + elemsize*(d+e*ncomp)]; 423 } 424 #endif 425 } else { // vv is (ncomp x ndof), column-major 426 #ifdef USE_MAGMA_BATCH2 427 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 428 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 429 magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]], 430 uu[i+iend*(d+e*dend)]); 431 } 432 #else 433 for (CeedInt e = 0; e < nelem; e++) 434 for (CeedInt d = 0; d < ncomp; d++) 435 for (CeedInt i=0; i < elemsize; i++) { 436 vv[d+ncomp*indices[i + elemsize*e]] += 437 uu[i + elemsize*(d+e*ncomp)]; 438 } 439 #endif 440 } 441 } 442 } 443 444 ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr); 445 ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr); 446 447 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) 448 *request = NULL; 449 return 0; 450 } 451 452 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) { 453 CeedElemRestriction_Magma *impl = r->data; 454 int ierr; 455 456 // Free if we own the data 457 if (impl->own_) { 458 ierr = magma_free_pinned(impl->indices); CeedChk(ierr); 459 ierr = magma_free(impl->dindices); CeedChk(ierr); 460 } else if (impl->down_) { 461 ierr = magma_free(impl->dindices); CeedChk(ierr); 462 } 463 ierr = CeedFree(&r->data); CeedChk(ierr); 464 return 0; 465 } 466 467 static int CeedElemRestrictionCreate_Magma(CeedElemRestriction r, 468 CeedMemType mtype, 469 CeedCopyMode cmode, 470 const CeedInt *indices) { 471 int ierr, size = r->nelem*r->elemsize; 472 CeedElemRestriction_Magma *impl; 473 474 // Allocate memory for the MAGMA Restricton and initializa pointers to NULL 475 ierr = CeedCalloc(1,&impl); CeedChk(ierr); 476 impl->dindices = NULL; 477 impl->indices = NULL; 478 impl->own_ = 0; 479 impl->down_= 0; 480 481 if (mtype == CEED_MEM_HOST) { 482 // memory is on the host; own_ = 0 483 switch (cmode) { 484 case CEED_COPY_VALUES: 485 ierr = magma_malloc( (void**)&impl->dindices, 486 size * sizeof(CeedInt)); CeedChk(ierr); 487 ierr = magma_malloc_pinned( (void**)&impl->indices, 488 size * sizeof(CeedInt)); CeedChk(ierr); 489 impl->own_ = 1; 490 491 if (indices != NULL) { 492 memcpy(impl->indices, indices, size * sizeof(indices[0])); 493 magma_setvector(size, sizeof(CeedInt), 494 impl->indices, 1, impl->dindices, 1); 495 } 496 break; 497 case CEED_OWN_POINTER: 498 ierr = magma_malloc( (void**)&impl->dindices, 499 size * sizeof(CeedInt)); CeedChk(ierr); 500 // TODO: possible problem here is if we are passed non-pinned memory; 501 // (as we own it, lter in destroy, we use free for pinned memory). 502 impl->indices = (CeedInt *)indices; 503 impl->own_ = 1; 504 505 if (indices != NULL) 506 magma_setvector(size, sizeof(CeedInt), 507 indices, 1, impl->dindices, 1); 508 break; 509 case CEED_USE_POINTER: 510 ierr = magma_malloc( (void**)&impl->dindices, 511 size * sizeof(CeedInt)); CeedChk(ierr); 512 magma_setvector(size, sizeof(CeedInt), 513 indices, 1, impl->dindices, 1); 514 impl->down_ = 1; 515 impl->indices = (CeedInt *)indices; 516 } 517 } else if (mtype == CEED_MEM_DEVICE) { 518 // memory is on the device; own = 0 519 switch (cmode) { 520 case CEED_COPY_VALUES: 521 ierr = magma_malloc( (void**)&impl->dindices, 522 size * sizeof(CeedInt)); CeedChk(ierr); 523 ierr = magma_malloc_pinned( (void**)&impl->indices, 524 size * sizeof(CeedInt)); CeedChk(ierr); 525 impl->own_ = 1; 526 527 if (indices) 528 magma_copyvector(size, sizeof(CeedInt), 529 indices, 1, impl->dindices, 1); 530 break; 531 case CEED_OWN_POINTER: 532 impl->dindices = (CeedInt *)indices; 533 ierr = magma_malloc_pinned( (void**)&impl->indices, 534 size * sizeof(CeedInt)); CeedChk(ierr); 535 impl->own_ = 1; 536 537 break; 538 case CEED_USE_POINTER: 539 impl->dindices = (CeedInt *)indices; 540 impl->indices = NULL; 541 } 542 543 } else 544 return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported"); 545 546 r->data = impl; 547 r->Apply = CeedElemRestrictionApply_Magma; 548 r->Destroy = CeedElemRestrictionDestroy_Magma; 549 550 return 0; 551 } 552 553 // Contracts on the middle index 554 // NOTRANSPOSE: V_ajc = T_jb U_abc 555 // TRANSPOSE: V_ajc = T_bj U_abc 556 // If Add != 0, "=" is replaced by "+=" 557 static int CeedTensorContract_Magma(Ceed ceed, 558 CeedInt A, CeedInt B, CeedInt C, CeedInt J, 559 const CeedScalar *t, CeedTransposeMode tmode, 560 const CeedInt Add, 561 const CeedScalar *u, CeedScalar *v) { 562 #ifdef USE_MAGMA_BATCH 563 magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v); 564 #else 565 CeedInt tstride0 = B, tstride1 = 1; 566 if (tmode == CEED_TRANSPOSE) { 567 tstride0 = 1; tstride1 = J; 568 } 569 CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d", 570 A,J,C,B,A*J*B*C, C*J*A, C*B*A); 571 for (CeedInt a=0; a<A; a++) { 572 for (CeedInt j=0; j<J; j++) { 573 if (!Add) { 574 for (CeedInt c=0; c<C; c++) 575 v[(a*J+j)*C+c] = 0; 576 } 577 for (CeedInt b=0; b<B; b++) { 578 for (CeedInt c=0; c<C; c++) { 579 v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c]; 580 } 581 } 582 } 583 } 584 #endif 585 return 0; 586 } 587 588 static int CeedBasisApply_Magma(CeedBasis basis, CeedTransposeMode tmode, 589 CeedEvalMode emode, 590 const CeedScalar *u, CeedScalar *v) { 591 int ierr; 592 const CeedInt dim = basis->dim; 593 const CeedInt ndof = basis->ndof; 594 const CeedInt nqpt = ndof*CeedPowInt(basis->Q1d, dim); 595 const CeedInt add = (tmode == CEED_TRANSPOSE); 596 597 CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d", 598 ndof*CeedPowInt(basis->P1d, dim)); 599 600 if (tmode == CEED_TRANSPOSE) { 601 const CeedInt vsize = ndof*CeedPowInt(basis->P1d, dim); 602 for (CeedInt i = 0; i < vsize; i++) 603 v[i] = (CeedScalar) 0; 604 } 605 if (emode & CEED_EVAL_INTERP) { 606 CeedInt P = basis->P1d, Q = basis->Q1d; 607 if (tmode == CEED_TRANSPOSE) { 608 P = basis->Q1d; Q = basis->P1d; 609 } 610 CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1; 611 CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)]; 612 CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d", 613 ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)); 614 for (CeedInt d=0; d<dim; d++) { 615 ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d, 616 tmode, add&&(d==dim-1), 617 d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]); 618 CeedChk(ierr); 619 pre /= P; 620 post *= Q; 621 } 622 if (tmode == CEED_NOTRANSPOSE) { 623 v += nqpt; 624 } else { 625 u += nqpt; 626 } 627 } 628 if (emode & CEED_EVAL_GRAD) { 629 CeedInt P = basis->P1d, Q = basis->Q1d; 630 // In CEED_NOTRANSPOSE mode: 631 // u is (P^dim x nc), column-major layout (nc = ndof) 632 // v is (Q^dim x nc x dim), column-major layout (nc = ndof) 633 // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 634 if (tmode == CEED_TRANSPOSE) { 635 P = basis->Q1d, Q = basis->P1d; 636 } 637 CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)]; 638 CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d", 639 ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)); 640 for (CeedInt p = 0; p < dim; p++) { 641 CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1; 642 for (CeedInt d=0; d<dim; d++) { 643 ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, 644 (p==d)?basis->grad1d:basis->interp1d, 645 tmode, add&&(d==dim-1), 646 d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]); 647 CeedChk(ierr); 648 pre /= P; 649 post *= Q; 650 } 651 if (tmode == CEED_NOTRANSPOSE) { 652 v += nqpt; 653 } else { 654 u += nqpt; 655 } 656 } 657 } 658 if (emode & CEED_EVAL_WEIGHT) { 659 if (tmode == CEED_TRANSPOSE) 660 return CeedError(basis->ceed, 1, 661 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 662 CeedInt Q = basis->Q1d; 663 for (CeedInt d=0; d<dim; d++) { 664 CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d); 665 for (CeedInt i=0; i<pre; i++) { 666 for (CeedInt j=0; j<Q; j++) { 667 for (CeedInt k=0; k<post; k++) { 668 v[(i*Q + j)*post + k] = basis->qweight1d[j] 669 * (d == 0 ? 1 : v[(i*Q + j)*post + k]); 670 } 671 } 672 } 673 } 674 } 675 return 0; 676 } 677 678 static int CeedBasisDestroy_Magma(CeedBasis basis) { 679 return 0; 680 } 681 682 static int CeedBasisCreateTensorH1_Magma(Ceed ceed, CeedInt dim, CeedInt P1d, 683 CeedInt Q1d, const CeedScalar *interp1d, 684 const CeedScalar *grad1d, 685 const CeedScalar *qref1d, 686 const CeedScalar *qweight1d, 687 CeedBasis basis) { 688 basis->Apply = CeedBasisApply_Magma; 689 basis->Destroy = CeedBasisDestroy_Magma; 690 return 0; 691 } 692 693 static int CeedQFunctionApply_Magma(CeedQFunction qf, CeedInt Q, 694 const CeedScalar *const *u, 695 CeedScalar *const *v) { 696 int ierr; 697 ierr = qf->function(qf->ctx, Q, u, v); CeedChk(ierr); 698 return 0; 699 } 700 701 static int CeedQFunctionDestroy_Magma(CeedQFunction qf) { 702 return 0; 703 } 704 705 static int CeedQFunctionCreate_Magma(CeedQFunction qf) { 706 qf->Apply = CeedQFunctionApply_Magma; 707 qf->Destroy = CeedQFunctionDestroy_Magma; 708 return 0; 709 } 710 711 static int CeedOperatorDestroy_Magma(CeedOperator op) { 712 CeedOperator_Magma *impl = op->data; 713 int ierr; 714 715 for (CeedInt i=0; i<impl->numein+impl->numeout; i++) { 716 ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChk(ierr); 717 } 718 ierr = CeedFree(&impl->evecs); CeedChk(ierr); 719 ierr = CeedFree(&impl->edata); CeedChk(ierr); 720 721 for (CeedInt i=0; i<impl->numqin+impl->numqout; i++) { 722 ierr = CeedFree(&impl->qdata_alloc[i]); CeedChk(ierr); 723 } 724 ierr = CeedFree(&impl->qdata_alloc); CeedChk(ierr); 725 ierr = CeedFree(&impl->qdata); CeedChk(ierr); 726 727 ierr = CeedFree(&impl->indata); CeedChk(ierr); 728 ierr = CeedFree(&impl->outdata); CeedChk(ierr); 729 730 ierr = CeedFree(&op->data); CeedChk(ierr); 731 return 0; 732 } 733 734 /* 735 Setup infields or outfields 736 */ 737 static int CeedOperatorSetupFields_Magma(struct CeedQFunctionField qfields[16], 738 struct CeedOperatorField ofields[16], 739 CeedVector *evecs, CeedScalar **qdata, 740 CeedScalar **qdata_alloc, CeedScalar **indata, 741 CeedInt starti, CeedInt starte, 742 CeedInt startq, CeedInt numfields, CeedInt Q) { 743 CeedInt dim, ierr, ie=starte, iq=startq, ncomp; 744 745 // Loop over fields 746 for (CeedInt i=0; i<numfields; i++) { 747 if (ofields[i].Erestrict) { 748 ierr = CeedElemRestrictionCreateVector(ofields[i].Erestrict, NULL, &evecs[ie]); 749 CeedChk(ierr); 750 ie++; 751 } 752 CeedEvalMode emode = qfields[i].emode; 753 switch(emode) { 754 case CEED_EVAL_NONE: 755 break; // No action 756 case CEED_EVAL_INTERP: 757 ncomp = qfields[i].ncomp; 758 ierr = CeedMalloc(Q*ncomp, &qdata_alloc[iq]); CeedChk(ierr); 759 qdata[i + starti] = qdata_alloc[iq]; 760 iq++; 761 break; 762 case CEED_EVAL_GRAD: 763 ncomp = qfields[i].ncomp; 764 dim = ofields[i].basis->dim; 765 ierr = CeedMalloc(Q*ncomp*dim, &qdata_alloc[iq]); CeedChk(ierr); 766 qdata[i + starti] = qdata_alloc[iq]; 767 iq++; 768 break; 769 case CEED_EVAL_WEIGHT: // Only on input fields 770 ierr = CeedMalloc(Q, &qdata_alloc[iq]); CeedChk(ierr); 771 ierr = CeedBasisApply(ofields[iq].basis, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, 772 NULL, qdata_alloc[iq]); CeedChk(ierr); 773 qdata[i] = qdata_alloc[iq]; 774 indata[i] = qdata[i]; 775 iq++; 776 break; 777 case CEED_EVAL_DIV: 778 break; // Not implimented 779 case CEED_EVAL_CURL: 780 break; // Not implimented 781 } 782 } 783 return 0; 784 } 785 786 /* 787 CeedOperator needs to connect all the named fields (be they active or passive) 788 to the named inputs and outputs of its CeedQFunction. 789 */ 790 static int CeedOperatorSetup_Magma(CeedOperator op) { 791 if (op->setupdone) return 0; 792 CeedOperator_Magma *opMagma = op->data; 793 CeedQFunction qf = op->qf; 794 CeedInt Q = op->numqpoints; 795 int ierr; 796 797 // Count infield and outfield array sizes and evectors 798 for (CeedInt i=0; i<qf->numinputfields; i++) { 799 CeedEvalMode emode = qf->inputfields[i].emode; 800 opMagma->numqin += !!(emode & CEED_EVAL_INTERP) + !!(emode & CEED_EVAL_GRAD) + 801 !! 802 (emode & CEED_EVAL_WEIGHT); 803 opMagma->numein += 804 !!op->inputfields[i].Erestrict; // Need E-vector when restriction exists 805 } 806 for (CeedInt i=0; i<qf->numoutputfields; i++) { 807 CeedEvalMode emode = qf->outputfields[i].emode; 808 opMagma->numqout += !!(emode & CEED_EVAL_INTERP) + !!(emode & CEED_EVAL_GRAD); 809 opMagma->numeout += !!op->outputfields[i].Erestrict; 810 } 811 812 // Allocate 813 ierr = CeedCalloc(opMagma->numein + opMagma->numeout, &opMagma->evecs); 814 CeedChk(ierr); 815 ierr = CeedCalloc(qf->numinputfields + qf->numoutputfields, &opMagma->edata); 816 CeedChk(ierr); 817 818 ierr = CeedCalloc(opMagma->numqin + opMagma->numqout, &opMagma->qdata_alloc); 819 CeedChk(ierr); 820 ierr = CeedCalloc(qf->numinputfields + qf->numoutputfields, &opMagma->qdata); 821 CeedChk(ierr); 822 823 ierr = CeedCalloc(16, &opMagma->indata); CeedChk(ierr); 824 ierr = CeedCalloc(16, &opMagma->outdata); CeedChk(ierr); 825 826 // Set up infield and outfield pointer arrays 827 // Infields 828 ierr = CeedOperatorSetupFields_Magma(qf->inputfields, op->inputfields, 829 opMagma->evecs, opMagma->qdata, opMagma->qdata_alloc, 830 opMagma->indata, 0, 0, 0, 831 qf->numinputfields, Q); CeedChk(ierr); 832 833 // Outfields 834 ierr = CeedOperatorSetupFields_Magma(qf->outputfields, op->outputfields, 835 opMagma->evecs, opMagma->qdata, opMagma->qdata_alloc, 836 opMagma->indata, qf->numinputfields, opMagma->numein, 837 opMagma->numqin, qf->numoutputfields, Q); CeedChk(ierr); 838 839 op->setupdone = 1; 840 841 return 0; 842 } 843 844 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector invec, 845 CeedVector outvec, CeedRequest *request) { 846 CeedOperator_Magma *opMagma = op->data; 847 CeedInt Q = op->numqpoints, elemsize; 848 int ierr; 849 CeedQFunction qf = op->qf; 850 CeedTransposeMode lmode = CEED_NOTRANSPOSE; 851 852 // Setup 853 ierr = CeedOperatorSetup_Magma(op); CeedChk(ierr); 854 855 // Input Evecs and Restriction 856 for (CeedInt i=0,iein=0; i<qf->numinputfields; i++) { 857 // Restriction 858 if (op->inputfields[i].Erestrict) { 859 // Passive 860 if (op->inputfields[i].vec) { 861 ierr = CeedElemRestrictionApply(op->inputfields[i].Erestrict, CEED_NOTRANSPOSE, 862 lmode, op->inputfields[i].vec, opMagma->evecs[iein], 863 request); CeedChk(ierr); 864 ierr = CeedVectorGetArrayRead(opMagma->evecs[iein], CEED_MEM_HOST, 865 (const CeedScalar **) &opMagma->edata[i]); CeedChk(ierr); 866 iein++; 867 } else { 868 // Active 869 ierr = CeedElemRestrictionApply(op->inputfields[i].Erestrict, CEED_NOTRANSPOSE, 870 lmode, invec, opMagma->evecs[iein], request); CeedChk(ierr); 871 ierr = CeedVectorGetArrayRead(opMagma->evecs[iein], CEED_MEM_HOST, 872 (const CeedScalar **) &opMagma->edata[i]); CeedChk(ierr); 873 iein++; 874 } 875 } else { 876 // No restriction 877 CeedEvalMode emode = qf->inputfields[i].emode; 878 if (emode & CEED_EVAL_WEIGHT) { 879 } else { 880 ierr = CeedVectorGetArrayRead(op->inputfields[i].vec, CEED_MEM_HOST, 881 (const CeedScalar **) &opMagma->edata[i]); CeedChk(ierr); 882 } 883 } 884 } 885 886 // Output Evecs 887 for (CeedInt i=0,ieout=opMagma->numein; i<qf->numoutputfields; i++) { 888 // Restriction 889 if (op->outputfields[i].Erestrict) { 890 ierr = CeedVectorGetArray(opMagma->evecs[ieout], CEED_MEM_HOST, 891 &opMagma->edata[i + qf->numinputfields]); CeedChk(ierr); 892 ieout++; 893 } else { 894 // No restriction 895 // Passive 896 if (op->inputfields[i].vec) { 897 ierr = CeedVectorGetArray(op->inputfields[i].vec, CEED_MEM_HOST, 898 &opMagma->edata[i + qf->numinputfields]); CeedChk(ierr); 899 } else { 900 // Active 901 ierr = CeedVectorGetArray(outvec, CEED_MEM_HOST, 902 &opMagma->edata[i + qf->numinputfields]); CeedChk(ierr); 903 } 904 } 905 } 906 907 // Output Qvecs 908 for (CeedInt i=0; i<qf->numoutputfields; i++) { 909 CeedEvalMode emode = qf->outputfields[i].emode; 910 if (emode != CEED_EVAL_NONE) { 911 opMagma->outdata[i] = opMagma->qdata[i + qf->numinputfields]; 912 } 913 } 914 915 // Loop through elements 916 for (CeedInt e=0; e<op->numelements; e++) { 917 // Input basis apply if needed 918 for (CeedInt i=0; i<qf->numinputfields; i++) { 919 // Get elemsize 920 if (op->inputfields[i].Erestrict) { 921 elemsize = op->inputfields[i].Erestrict->elemsize; 922 } else { 923 elemsize = Q; 924 } 925 // Get emode, ncomp 926 CeedEvalMode emode = qf->inputfields[i].emode; 927 CeedInt ncomp = qf->inputfields[i].ncomp; 928 // Basis action 929 switch(emode) { 930 case CEED_EVAL_NONE: 931 opMagma->indata[i] = &opMagma->edata[i][e*Q*ncomp]; 932 break; 933 case CEED_EVAL_INTERP: 934 ierr = CeedBasisApply(op->inputfields[i].basis, CEED_NOTRANSPOSE, 935 CEED_EVAL_INTERP, &opMagma->edata[i][e*elemsize*ncomp], opMagma->qdata[i]); 936 CeedChk(ierr); 937 opMagma->indata[i] = opMagma->qdata[i]; 938 break; 939 case CEED_EVAL_GRAD: 940 ierr = CeedBasisApply(op->inputfields[i].basis, CEED_NOTRANSPOSE, 941 CEED_EVAL_GRAD, &opMagma->edata[i][e*elemsize*ncomp], opMagma->qdata[i]); 942 CeedChk(ierr); 943 opMagma->indata[i] = opMagma->qdata[i]; 944 break; 945 case CEED_EVAL_WEIGHT: 946 break; // No action 947 case CEED_EVAL_DIV: 948 break; // Not implimented 949 case CEED_EVAL_CURL: 950 break; // Not implimented 951 } 952 } 953 // Output pointers 954 for (CeedInt i=0; i<qf->numoutputfields; i++) { 955 CeedEvalMode emode = qf->outputfields[i].emode; 956 if (emode == CEED_EVAL_NONE) { 957 CeedInt ncomp = qf->outputfields[i].ncomp; 958 opMagma->outdata[i] = &opMagma->edata[i + qf->numinputfields][e*Q*ncomp]; 959 } 960 } 961 // Q function 962 ierr = CeedQFunctionApply(op->qf, Q, 963 (const CeedScalar * const*) opMagma->indata, 964 opMagma->outdata); CeedChk(ierr); 965 966 // Output basis apply if needed 967 for (CeedInt i=0; i<qf->numoutputfields; i++) { 968 // Get elemsize 969 if (op->outputfields[i].Erestrict) { 970 elemsize = op->outputfields[i].Erestrict->elemsize; 971 } else { 972 elemsize = Q; 973 } 974 // Get emode, ncomp 975 CeedInt ncomp = qf->outputfields[i].ncomp; 976 CeedEvalMode emode = qf->outputfields[i].emode; 977 // Basis action 978 switch(emode) { 979 case CEED_EVAL_NONE: 980 break; // No action 981 case CEED_EVAL_INTERP: 982 ierr = CeedBasisApply(op->outputfields[i].basis, CEED_TRANSPOSE, 983 CEED_EVAL_INTERP, opMagma->outdata[i], 984 &opMagma->edata[i + qf->numinputfields][e*elemsize*ncomp]); CeedChk(ierr); 985 break; 986 case CEED_EVAL_GRAD: 987 ierr = CeedBasisApply(op->outputfields[i].basis, CEED_TRANSPOSE, CEED_EVAL_GRAD, 988 opMagma->outdata[i], &opMagma->edata[i + qf->numinputfields][e*elemsize*ncomp]); 989 CeedChk(ierr); 990 break; 991 case CEED_EVAL_WEIGHT: 992 break; // Should not occur 993 case CEED_EVAL_DIV: 994 break; // Not implimented 995 case CEED_EVAL_CURL: 996 break; // Not implimented 997 } 998 } 999 } 1000 1001 // Output restriction 1002 for (CeedInt i=0,ieout=opMagma->numein; i<qf->numoutputfields; i++) { 1003 // Restriction 1004 if (op->outputfields[i].Erestrict) { 1005 // Passive 1006 if (op->outputfields[i].vec) { 1007 ierr = CeedVectorRestoreArray(opMagma->evecs[ieout], 1008 &opMagma->edata[i + qf->numinputfields]); CeedChk(ierr); 1009 ierr = CeedElemRestrictionApply(op->outputfields[i].Erestrict, CEED_TRANSPOSE, 1010 lmode, opMagma->evecs[ieout], op->outputfields[i].vec, request); CeedChk(ierr); 1011 ieout++; 1012 } else { 1013 // Active 1014 ierr = CeedVectorRestoreArray(opMagma->evecs[ieout], 1015 &opMagma->edata[i + qf->numinputfields]); CeedChk(ierr); 1016 ierr = CeedElemRestrictionApply(op->outputfields[i].Erestrict, CEED_TRANSPOSE, 1017 lmode, opMagma->evecs[ieout], outvec, request); CeedChk(ierr); 1018 ieout++; 1019 } 1020 } else { 1021 // No Restriction 1022 // Passive 1023 if (op->outputfields[i].vec) { 1024 ierr = CeedVectorRestoreArray(op->outputfields[i].vec, 1025 &opMagma->edata[i + qf->numinputfields]); CeedChk(ierr); 1026 } else { 1027 // Active 1028 ierr = CeedVectorRestoreArray(outvec, &opMagma->edata[i + qf->numinputfields]); 1029 CeedChk(ierr); 1030 } 1031 } 1032 } 1033 1034 return 0; 1035 } 1036 1037 static int CeedOperatorCreate_Magma(CeedOperator op) { 1038 CeedOperator_Magma *impl; 1039 int ierr; 1040 1041 ierr = CeedCalloc(1, &impl); CeedChk(ierr); 1042 op->data = impl; 1043 op->Destroy = CeedOperatorDestroy_Magma; 1044 op->Apply = CeedOperatorApply_Magma; 1045 return 0; 1046 } 1047 1048 // ***************************************************************************** 1049 // * INIT 1050 // ***************************************************************************** 1051 static int CeedInit_Magma(const char *resource, Ceed ceed) { 1052 int ierr; 1053 if (strcmp(resource, "/gpu/magma")) 1054 return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource); 1055 1056 ierr = magma_init(); 1057 if (ierr) return CeedError(ceed, 1, "error in magma_init(): %d\n", ierr); 1058 //magma_print_environment(); 1059 1060 ceed->VecCreate = CeedVectorCreate_Magma; 1061 ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma; 1062 ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma; 1063 ceed->QFunctionCreate = CeedQFunctionCreate_Magma; 1064 ceed->OperatorCreate = CeedOperatorCreate_Magma; 1065 return 0; 1066 } 1067 1068 // ***************************************************************************** 1069 // * REGISTER 1070 // ***************************************************************************** 1071 __attribute__((constructor)) 1072 static void Register(void) { 1073 CeedRegister("/gpu/magma", CeedInit_Magma, 20); 1074 } 1075