1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at 2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights 3 // reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
16 17 #include "ceed-magma.h" 18 #include <string.h> 19 20 typedef struct { 21 CeedScalar *array; 22 CeedScalar *darray; 23 int own_; 24 int down_; 25 } CeedVector_Magma; 26 27 typedef struct { 28 CeedInt *indices; 29 CeedInt *dindices; 30 int own_; 31 int down_; // cover a case where we own Device memory 32 } CeedElemRestriction_Magma; 33 34 typedef struct { 35 CeedVector etmp; 36 CeedVector qdata; 37 } CeedOperator_Magma; 38 39 // ***************************************************************************** 40 // * Initialize vector vec (after free mem) with values from array based on cmode 41 // * CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal 42 // * to array, and data is copied (not store passed pointer) 43 // * CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array 44 // * CEED_USE_POINTER: vec->data->array = array (can modify; no ownership) 45 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE 46 // ***************************************************************************** 47 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype, 48 CeedCopyMode cmode, CeedScalar *array) { 49 CeedVector_Magma *impl = vec->data; 50 int ierr; 51 52 // If own data, free the "old" data, e.g., as it may be of different size 53 if (impl->own_) { 54 magma_free( impl->darray ); 55 magma_free_pinned( impl->array ); 56 impl->darray = NULL; 57 impl->array = NULL; 58 impl->own_ = 0; 59 impl->down_= 0; 60 } 61 62 if (mtype == CEED_MEM_HOST) { 63 // memory is on the host; own_ = 0 64 switch (cmode) { 65 case CEED_COPY_VALUES: 66 ierr = magma_malloc( (void**)&impl->darray, 67 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 68 ierr = magma_malloc_pinned( (void**)&impl->array, 69 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 70 impl->own_ = 1; 71 72 if (array != NULL) 73 magma_setvector(vec->length, sizeof(array[0]), 74 array, 1, impl->darray, 1); 75 break; 76 case CEED_OWN_POINTER: 77 ierr = magma_malloc( 
(void**)&impl->darray, 78 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 79 // TODO: possible problem here is if we are passed non-pinned memory; 80 // (as we own it, lter in destroy, we use free for pinned memory). 81 impl->array = array; 82 impl->own_ = 1; 83 84 if (array != NULL) 85 magma_setvector(vec->length, sizeof(array[0]), 86 array, 1, impl->darray, 1); 87 break; 88 case CEED_USE_POINTER: 89 ierr = magma_malloc( (void**)&impl->darray, 90 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 91 magma_setvector(vec->length, sizeof(array[0]), 92 array, 1, impl->darray, 1); 93 94 impl->down_ = 1; 95 impl->array = array; 96 } 97 } else if (mtype == CEED_MEM_DEVICE) { 98 // memory is on the device; own = 0 99 switch (cmode) { 100 case CEED_COPY_VALUES: 101 ierr = magma_malloc( (void**)&impl->darray, 102 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 103 ierr = magma_malloc_pinned( (void**)&impl->array, 104 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 105 impl->own_ = 1; 106 107 if (array) 108 magma_copyvector(vec->length, sizeof(array[0]), 109 array, 1, impl->darray, 1); 110 else 111 // t30 assumes allocation initializes with 0s 112 magma_setvector(vec->length, sizeof(array[0]), 113 impl->array, 1, impl->darray, 1); 114 break; 115 case CEED_OWN_POINTER: 116 impl->darray = array; 117 ierr = magma_malloc_pinned( (void**)&impl->array, 118 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 119 impl->own_ = 1; 120 121 break; 122 case CEED_USE_POINTER: 123 impl->darray = array; 124 impl->array = NULL; 125 } 126 127 } else 128 return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported"); 129 130 return 0; 131 } 132 133 // ***************************************************************************** 134 // * Give data pointer from vector vec to array (on HOST or DEVICE) 135 // ***************************************************************************** 136 static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype, 137 CeedScalar **array) 
{ 138 CeedVector_Magma *impl = vec->data; 139 int ierr; 140 141 if (mtype == CEED_MEM_HOST) { 142 if (impl->own_) { 143 // data is owned so GPU had the most up-to-date version; copy it 144 // TTT - apparantly it doesn't have most up to date data 145 magma_getvector(vec->length, sizeof(*array[0]), 146 impl->darray, 1, impl->array, 1); 147 CeedDebug("\033[31m[CeedVectorGetArray_Magma]"); 148 //fprintf(stderr,"rrrrrrrrrrrrrrr\n"); 149 } else if (impl->array == NULL) { 150 // Vector doesn't own the data and was set on GPU 151 if (impl->darray == NULL) { 152 // call was made just to allocate memory 153 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 154 CeedChk(ierr); 155 } else 156 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST"); 157 } 158 *array = impl->array; 159 } else if (mtype == CEED_MEM_DEVICE) { 160 if (impl->darray == NULL) { 161 // Vector doesn't own the data and was set on the CPU 162 if (impl->array == NULL) { 163 // call was made just to allocate memory 164 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 165 CeedChk(ierr); 166 } else 167 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE"); 168 } 169 *array = impl->darray; 170 } else 171 return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory"); 172 173 return 0; 174 } 175 176 // ***************************************************************************** 177 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it 178 // ***************************************************************************** 179 static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype, 180 const CeedScalar **array) { 181 CeedVector_Magma *impl = vec->data; 182 int ierr; 183 184 if (mtype == CEED_MEM_HOST) { 185 if (impl->own_) { 186 // data is owned so GPU had the most up-to-date version; copy it 187 magma_getvector(vec->length, sizeof(*array[0]), 188 impl->darray, 1, impl->array, 1); 189 } else if 
(impl->array == NULL) { 190 // Vector doesn't own the data and was set on GPU 191 if (impl->darray == NULL) { 192 // call was made just to allocate memory 193 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 194 CeedChk(ierr); 195 } else 196 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST"); 197 } 198 *array = impl->array; 199 } else if (mtype == CEED_MEM_DEVICE) { 200 if (impl->darray == NULL) { 201 // Vector doesn't own the data and was set on the CPU 202 if (impl->array == NULL) { 203 // call was made just to allocate memory 204 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 205 CeedChk(ierr); 206 } else 207 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE"); 208 } 209 *array = impl->darray; 210 } else 211 return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory"); 212 213 return 0; 214 } 215 216 // ***************************************************************************** 217 // * There is no mtype here for array so it is not clear if we restore from HOST 218 // * memory or from DEVICE memory. We assume that it is CPU memory because if 219 // * it was GPU memory we would not call this routine at all. 220 // * Restore vector vec with values from array, where array received its values 221 // * from vec and possibly modified them. 
222 // ***************************************************************************** 223 static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) { 224 CeedVector_Magma *impl = vec->data; 225 226 // Check if the array is a CPU pointer 227 if (*array == impl->array) { 228 // Update device, if the device pointer is not NULL 229 if (impl->darray != NULL) { 230 magma_setvector(vec->length, sizeof(*array[0]), 231 *array, 1, impl->darray, 1); 232 } else { 233 // nothing to do (case of CPU use pointer) 234 } 235 236 } else if (impl->down_) { 237 // nothing to do if array is on GPU, except if down_=1(case CPU use pointer) 238 magma_getvector(vec->length, sizeof(*array[0]), 239 impl->darray, 1, impl->array, 1); 240 } 241 242 *array = NULL; 243 return 0; 244 } 245 246 // ***************************************************************************** 247 // * There is no mtype here for array so it is not clear if we restore from HOST 248 // * memory or from DEVICE memory. We assume that it is CPU memory because if 249 // * it was GPU memory we would not call this routine at all. 250 // * Restore vector vec with values from array, where array received its values 251 // * from vec to only read them; in this case vec may have been modified meanwhile 252 // * and needs to be restored here. 
253 // ***************************************************************************** 254 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec, 255 const CeedScalar **array) { 256 CeedVector_Magma *impl = vec->data; 257 258 // Check if the array is a CPU pointer 259 if (*array == impl->array) { 260 // Update device, if the device pointer is not NULL 261 if (impl->darray != NULL) { 262 magma_setvector(vec->length, sizeof(*array[0]), 263 *array, 1, impl->darray, 1); 264 } else { 265 // nothing to do (case of CPU use pointer) 266 } 267 268 } else if (impl->down_) { 269 // nothing to do if array is on GPU, except if down_=1(case CPU use pointer) 270 magma_getvector(vec->length, sizeof(*array[0]), 271 impl->darray, 1, impl->array, 1); 272 } 273 274 *array = NULL; 275 return 0; 276 } 277 278 static int CeedVectorDestroy_Magma(CeedVector vec) { 279 CeedVector_Magma *impl = vec->data; 280 int ierr; 281 282 // Free if we own the data 283 if (impl->own_) { 284 ierr = magma_free_pinned(impl->array); CeedChk(ierr); 285 ierr = magma_free(impl->darray); CeedChk(ierr); 286 } else if (impl->down_) { 287 ierr = magma_free(impl->darray); CeedChk(ierr); 288 } 289 ierr = CeedFree(&vec->data); CeedChk(ierr); 290 return 0; 291 } 292 293 // ***************************************************************************** 294 // * Create vector vec of size n 295 // ***************************************************************************** 296 static int CeedVectorCreate_Magma(Ceed ceed, CeedInt n, CeedVector vec) { 297 CeedVector_Magma *impl; 298 int ierr; 299 300 vec->SetArray = CeedVectorSetArray_Magma; 301 vec->GetArray = CeedVectorGetArray_Magma; 302 vec->GetArrayRead = CeedVectorGetArrayRead_Magma; 303 vec->RestoreArray = CeedVectorRestoreArray_Magma; 304 vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma; 305 vec->Destroy = CeedVectorDestroy_Magma; 306 ierr = CeedCalloc(1,&impl); CeedChk(ierr); 307 impl->darray = NULL; 308 impl->array = NULL; 309 impl->own_ = 0; 310 
impl->down_= 0; 311 vec->data = impl; 312 return 0; 313 } 314 315 // ***************************************************************************** 316 // * Apply restriction operator r to u: v = r(rmode) u 317 // ***************************************************************************** 318 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r, 319 CeedTransposeMode tmode, CeedInt ncomp, 320 CeedTransposeMode lmode, CeedVector u, 321 CeedVector v, CeedRequest *request) { 322 CeedElemRestriction_Magma *impl = r->data; 323 int ierr; 324 const CeedScalar *uu; 325 CeedScalar *vv; 326 CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof; 327 CeedInt esize = nelem * elemsize; 328 329 #ifdef USE_MAGMA_BATCH2 330 CeedInt *dindices = impl->dindices; 331 // Get pointers on the device 332 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr); 333 ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr); 334 #else 335 CeedInt *indices = impl->indices; 336 ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr); 337 ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr); 338 #endif 339 340 if (tmode == CEED_NOTRANSPOSE) { 341 // Perform: v = r * u 342 if (ncomp == 1) { 343 #ifdef USE_MAGMA_BATCH2 344 magma_template<<i=0:esize>> 345 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 346 vv[i] = uu[dindices[i]]; 347 } 348 #else 349 for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]]; 350 #endif 351 } else { 352 // vv is (elemsize x ncomp x nelem), column-major 353 if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major 354 #ifdef USE_MAGMA_BATCH2 355 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 356 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) { 357 vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d]; 358 } 359 #else 360 for (CeedInt e = 0; e < nelem; e++) 361 for (CeedInt d = 0; d < ncomp; d++) 362 for (CeedInt i=0; i < elemsize; i++) { 363 vv[i + 
elemsize*(d+ncomp*e)] = 364 uu[indices[i+elemsize*e]+ndof*d]; 365 } 366 #endif 367 } else { // u is (ncomp x ndof), column-major 368 #ifdef USE_MAGMA_BATCH2 369 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 370 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 371 vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]]; 372 } 373 #else 374 for (CeedInt e = 0; e < nelem; e++) 375 for (CeedInt d = 0; d < ncomp; d++) 376 for (CeedInt i=0; i< elemsize; i++) { 377 vv[i + elemsize*(d+ncomp*e)] = 378 uu[d+ncomp*indices[i+elemsize*e]]; 379 } 380 #endif 381 } 382 } 383 } else { 384 // Note: in transpose mode, we perform: v += r^t * u 385 if (ncomp == 1) { 386 // fprintf(stderr,"3 ---------\n"); 387 #ifdef USE_MAGMA_BATCH2 388 magma_template<<i=0:esize>> 389 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 390 magmablas_datomic_add( &vv[dindices[i]], uu[i]); 391 } 392 #else 393 for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i]; 394 #endif 395 } else { // u is (elemsize x ncomp x nelem) 396 fprintf(stderr,"2 ---------\n"); 397 398 if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major 399 #ifdef USE_MAGMA_BATCH2 400 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 401 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) { 402 magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d], 403 uu[i+iend*(d+e*dend)]); 404 } 405 #else 406 for (CeedInt e = 0; e < nelem; e++) 407 for (CeedInt d = 0; d < ncomp; d++) 408 for (CeedInt i=0; i < elemsize; i++) { 409 vv[indices[i + elemsize*e]+ndof*d] += 410 uu[i + elemsize*(d+e*ncomp)]; 411 } 412 #endif 413 } else { // vv is (ncomp x ndof), column-major 414 #ifdef USE_MAGMA_BATCH2 415 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 416 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 417 magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]], 418 uu[i+iend*(d+e*dend)]); 419 } 420 #else 421 for (CeedInt e = 0; e < nelem; e++) 422 for (CeedInt d = 0; d < 
ncomp; d++) 423 for (CeedInt i=0; i < elemsize; i++) { 424 vv[d+ncomp*indices[i + elemsize*e]] += 425 uu[i + elemsize*(d+e*ncomp)]; 426 } 427 #endif 428 } 429 } 430 } 431 432 ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr); 433 ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr); 434 435 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) 436 *request = NULL; 437 return 0; 438 } 439 440 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) { 441 CeedElemRestriction_Magma *impl = r->data; 442 int ierr; 443 444 // Free if we own the data 445 if (impl->own_) { 446 ierr = magma_free_pinned(impl->indices); CeedChk(ierr); 447 ierr = magma_free(impl->dindices); CeedChk(ierr); 448 } else if (impl->down_) { 449 ierr = magma_free(impl->dindices); CeedChk(ierr); 450 } 451 ierr = CeedFree(&r->data); CeedChk(ierr); 452 return 0; 453 } 454 455 static int CeedElemRestrictionCreate_Magma(CeedElemRestriction r, 456 CeedMemType mtype, 457 CeedCopyMode cmode, 458 const CeedInt *indices) { 459 int ierr, size = r->nelem*r->elemsize; 460 CeedElemRestriction_Magma *impl; 461 462 // Allocate memory for the MAGMA Restricton and initializa pointers to NULL 463 ierr = CeedCalloc(1,&impl); CeedChk(ierr); 464 impl->dindices = NULL; 465 impl->indices = NULL; 466 impl->own_ = 0; 467 impl->down_= 0; 468 469 if (mtype == CEED_MEM_HOST) { 470 // memory is on the host; own_ = 0 471 switch (cmode) { 472 case CEED_COPY_VALUES: 473 ierr = magma_malloc( (void**)&impl->dindices, 474 size * sizeof(CeedInt)); CeedChk(ierr); 475 ierr = magma_malloc_pinned( (void**)&impl->indices, 476 size * sizeof(CeedInt)); CeedChk(ierr); 477 impl->own_ = 1; 478 479 if (indices != NULL) { 480 memcpy(impl->indices, indices, size * sizeof(indices[0])); 481 magma_setvector(size, sizeof(CeedInt), 482 impl->indices, 1, impl->dindices, 1); 483 } 484 break; 485 case CEED_OWN_POINTER: 486 ierr = magma_malloc( (void**)&impl->dindices, 487 size * sizeof(CeedInt)); CeedChk(ierr); 488 // 
TODO: possible problem here is if we are passed non-pinned memory; 489 // (as we own it, lter in destroy, we use free for pinned memory). 490 impl->indices = (CeedInt *)indices; 491 impl->own_ = 1; 492 493 if (indices != NULL) 494 magma_setvector(size, sizeof(CeedInt), 495 indices, 1, impl->dindices, 1); 496 break; 497 case CEED_USE_POINTER: 498 ierr = magma_malloc( (void**)&impl->dindices, 499 size * sizeof(CeedInt)); CeedChk(ierr); 500 magma_setvector(size, sizeof(CeedInt), 501 indices, 1, impl->dindices, 1); 502 impl->down_ = 1; 503 impl->indices = (CeedInt *)indices; 504 } 505 } else if (mtype == CEED_MEM_DEVICE) { 506 // memory is on the device; own = 0 507 switch (cmode) { 508 case CEED_COPY_VALUES: 509 ierr = magma_malloc( (void**)&impl->dindices, 510 size * sizeof(CeedInt)); CeedChk(ierr); 511 ierr = magma_malloc_pinned( (void**)&impl->indices, 512 size * sizeof(CeedInt)); CeedChk(ierr); 513 impl->own_ = 1; 514 515 if (indices) 516 magma_copyvector(size, sizeof(CeedInt), 517 indices, 1, impl->dindices, 1); 518 break; 519 case CEED_OWN_POINTER: 520 impl->dindices = (CeedInt *)indices; 521 ierr = magma_malloc_pinned( (void**)&impl->indices, 522 size * sizeof(CeedInt)); CeedChk(ierr); 523 impl->own_ = 1; 524 525 break; 526 case CEED_USE_POINTER: 527 impl->dindices = (CeedInt *)indices; 528 impl->indices = NULL; 529 } 530 531 } else 532 return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported"); 533 534 r->data = impl; 535 r->Apply = CeedElemRestrictionApply_Magma; 536 r->Destroy = CeedElemRestrictionDestroy_Magma; 537 538 return 0; 539 } 540 541 // Contracts on the middle index 542 // NOTRANSPOSE: V_ajc = T_jb U_abc 543 // TRANSPOSE: V_ajc = T_bj U_abc 544 // If Add != 0, "=" is replaced by "+=" 545 static int CeedTensorContract_Magma(Ceed ceed, 546 CeedInt A, CeedInt B, CeedInt C, CeedInt J, 547 const CeedScalar *t, CeedTransposeMode tmode, 548 const CeedInt Add, 549 const CeedScalar *u, CeedScalar *v) { 550 #ifdef USE_MAGMA_BATCH 551 
magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v); 552 #else 553 CeedInt tstride0 = B, tstride1 = 1; 554 if (tmode == CEED_TRANSPOSE) { 555 tstride0 = 1; tstride1 = J; 556 } 557 CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d", 558 A,J,C,B,A*J*B*C, C*J*A, C*B*A); 559 for (CeedInt a=0; a<A; a++) { 560 for (CeedInt j=0; j<J; j++) { 561 if (!Add) { 562 for (CeedInt c=0; c<C; c++) 563 v[(a*J+j)*C+c] = 0; 564 } 565 for (CeedInt b=0; b<B; b++) { 566 for (CeedInt c=0; c<C; c++) { 567 v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c]; 568 } 569 } 570 } 571 } 572 #endif 573 return 0; 574 } 575 576 static int CeedBasisApply_Magma(CeedBasis basis, CeedTransposeMode tmode, 577 CeedEvalMode emode, 578 const CeedScalar *u, CeedScalar *v) { 579 int ierr; 580 const CeedInt dim = basis->dim; 581 const CeedInt ndof = basis->ndof; 582 const CeedInt nqpt = ndof*CeedPowInt(basis->Q1d, dim); 583 const CeedInt add = (tmode == CEED_TRANSPOSE); 584 585 CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d", 586 ndof*CeedPowInt(basis->P1d, dim)); 587 588 if (tmode == CEED_TRANSPOSE) { 589 const CeedInt vsize = ndof*CeedPowInt(basis->P1d, dim); 590 for (CeedInt i = 0; i < vsize; i++) 591 v[i] = (CeedScalar) 0; 592 } 593 if (emode & CEED_EVAL_INTERP) { 594 CeedInt P = basis->P1d, Q = basis->Q1d; 595 if (tmode == CEED_TRANSPOSE) { 596 P = basis->Q1d; Q = basis->P1d; 597 } 598 CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1; 599 CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)]; 600 CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d", 601 ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)); 602 for (CeedInt d=0; d<dim; d++) { 603 ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d, 604 tmode, add&&(d==dim-1), 605 d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]); 606 CeedChk(ierr); 607 pre /= P; 608 post *= Q; 609 } 610 if (tmode == CEED_NOTRANSPOSE) { 611 v += nqpt; 612 } else { 613 u += nqpt; 614 } 615 } 616 if (emode & CEED_EVAL_GRAD) { 
617 CeedInt P = basis->P1d, Q = basis->Q1d; 618 // In CEED_NOTRANSPOSE mode: 619 // u is (P^dim x nc), column-major layout (nc = ndof) 620 // v is (Q^dim x nc x dim), column-major layout (nc = ndof) 621 // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 622 if (tmode == CEED_TRANSPOSE) { 623 P = basis->Q1d, Q = basis->P1d; 624 } 625 CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)]; 626 CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d", 627 ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)); 628 for (CeedInt p = 0; p < dim; p++) { 629 CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1; 630 for (CeedInt d=0; d<dim; d++) { 631 ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, 632 (p==d)?basis->grad1d:basis->interp1d, 633 tmode, add&&(d==dim-1), 634 d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]); 635 CeedChk(ierr); 636 pre /= P; 637 post *= Q; 638 } 639 if (tmode == CEED_NOTRANSPOSE) { 640 v += nqpt; 641 } else { 642 u += nqpt; 643 } 644 } 645 } 646 if (emode & CEED_EVAL_WEIGHT) { 647 if (tmode == CEED_TRANSPOSE) 648 return CeedError(basis->ceed, 1, 649 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 650 CeedInt Q = basis->Q1d; 651 for (CeedInt d=0; d<dim; d++) { 652 CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d); 653 for (CeedInt i=0; i<pre; i++) { 654 for (CeedInt j=0; j<Q; j++) { 655 for (CeedInt k=0; k<post; k++) { 656 v[(i*Q + j)*post + k] = basis->qweight1d[j] 657 * (d == 0 ? 
1 : v[(i*Q + j)*post + k]); 658 } 659 } 660 } 661 } 662 } 663 return 0; 664 } 665 666 static int CeedBasisDestroy_Magma(CeedBasis basis) { 667 return 0; 668 } 669 670 static int CeedBasisCreateTensorH1_Magma(Ceed ceed, CeedInt dim, CeedInt P1d, 671 CeedInt Q1d, const CeedScalar *interp1d, 672 const CeedScalar *grad1d, 673 const CeedScalar *qref1d, 674 const CeedScalar *qweight1d, 675 CeedBasis basis) { 676 basis->Apply = CeedBasisApply_Magma; 677 basis->Destroy = CeedBasisDestroy_Magma; 678 return 0; 679 } 680 681 static int CeedQFunctionApply_Magma(CeedQFunction qf, void *qdata, CeedInt Q, 682 const CeedScalar *const *u, 683 CeedScalar *const *v) { 684 int ierr; 685 ierr = qf->function(qf->ctx, qdata, Q, u, v); CeedChk(ierr); 686 return 0; 687 } 688 689 static int CeedQFunctionDestroy_Magma(CeedQFunction qf) { 690 return 0; 691 } 692 693 static int CeedQFunctionCreate_Magma(CeedQFunction qf) { 694 qf->Apply = CeedQFunctionApply_Magma; 695 qf->Destroy = CeedQFunctionDestroy_Magma; 696 return 0; 697 } 698 699 static int CeedOperatorDestroy_Magma(CeedOperator op) { 700 CeedOperator_Magma *impl = op->data; 701 int ierr; 702 703 ierr = CeedVectorDestroy(&impl->etmp); CeedChk(ierr); 704 ierr = CeedVectorDestroy(&impl->qdata); CeedChk(ierr); 705 ierr = CeedFree(&op->data); CeedChk(ierr); 706 return 0; 707 } 708 709 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector qdata, 710 CeedVector ustate, 711 CeedVector residual, CeedRequest *request) { 712 CeedOperator_Magma *impl = op->data; 713 CeedVector etmp; 714 CeedInt Q; 715 const CeedInt nc = op->basis->ndof, dim = op->basis->dim; 716 CeedScalar *Eu; 717 char *qd; 718 int ierr; 719 CeedTransposeMode lmode = CEED_NOTRANSPOSE; 720 721 if (!impl->etmp) { 722 ierr = CeedVectorCreate(op->ceed, 723 nc * op->Erestrict->nelem * op->Erestrict->elemsize, 724 &impl->etmp); CeedChk(ierr); 725 // etmp is allocated when CeedVectorGetArray is called below 726 } 727 etmp = impl->etmp; 728 if (op->qf->inmode & 
~CEED_EVAL_WEIGHT) { 729 ierr = CeedElemRestrictionApply(op->Erestrict, CEED_NOTRANSPOSE, 730 nc, lmode, ustate, etmp, 731 CEED_REQUEST_IMMEDIATE); CeedChk(ierr); 732 } 733 ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr); 734 ierr = CeedVectorGetArray(etmp, CEED_MEM_HOST, &Eu); CeedChk(ierr); 735 ierr = CeedVectorGetArray(qdata, CEED_MEM_HOST, (CeedScalar**)&qd); 736 CeedChk(ierr); 737 738 for (CeedInt e=0; e<op->Erestrict->nelem; e++) { 739 CeedScalar BEu[Q*nc*(dim+2)], BEv[Q*nc*(dim+2)], *out[5] = {0,0,0,0,0}; 740 const CeedScalar *in[5] = {0,0,0,0,0}; 741 // TODO: quadrature weights can be computed just once 742 CeedDebug("\033[11m[CeedOperatorApply_Magma] e=%d: Eu+%d, %d", 743 e, e*op->Erestrict->elemsize*nc, Q*nc*(dim+2)); 744 ierr = CeedBasisApply(op->basis, CEED_NOTRANSPOSE, op->qf->inmode, 745 &Eu[e*op->Erestrict->elemsize*nc], BEu); 746 CeedChk(ierr); 747 CeedScalar *u_ptr = BEu, *v_ptr = BEv; 748 if (op->qf->inmode & CEED_EVAL_INTERP) { in[0] = u_ptr; u_ptr += Q*nc; } 749 if (op->qf->inmode & CEED_EVAL_GRAD) { in[1] = u_ptr; u_ptr += Q*nc*dim; } 750 if (op->qf->inmode & CEED_EVAL_WEIGHT) { in[4] = u_ptr; u_ptr += Q; } 751 if (op->qf->outmode & CEED_EVAL_INTERP) { out[0] = v_ptr; v_ptr += Q*nc; } 752 if (op->qf->outmode & CEED_EVAL_GRAD) { out[1] = v_ptr; v_ptr += Q*nc*dim; } 753 ierr = CeedQFunctionApply(op->qf, &qd[e*Q*op->qf->qdatasize], Q, in, out); 754 CeedChk(ierr); 755 CeedDebug("\033[31m[CeedOperatorApply_Magma] e=%d: ",e); 756 ierr = CeedBasisApply(op->basis, CEED_TRANSPOSE, op->qf->outmode, BEv, 757 &Eu[e*op->Erestrict->elemsize*nc]); 758 CeedChk(ierr); 759 } 760 ierr = CeedVectorRestoreArray(etmp, &Eu); CeedChk(ierr); 761 // qdata must be restored 762 ierr = CeedVectorRestoreArray(qdata, (CeedScalar**)&qd); CeedChk(ierr); 763 if (residual) { 764 CeedScalar *res; 765 CeedVectorGetArray(residual, CEED_MEM_HOST, &res); 766 for (int i = 0; i < residual->length; i++) 767 res[i] = (CeedScalar)0; 768 // residual must be restored 
769 CeedVectorRestoreArray(residual, &res); 770 ierr = CeedElemRestrictionApply(op->Erestrict, CEED_TRANSPOSE, 771 nc, lmode, etmp, residual, 772 CEED_REQUEST_IMMEDIATE); CeedChk(ierr); 773 } 774 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) 775 *request = NULL; 776 return 0; 777 } 778 779 static int CeedOperatorGetQData_Magma(CeedOperator op, CeedVector *qdata) { 780 CeedOperator_Magma *impl = op->data; 781 int ierr; 782 783 if (!impl->qdata) { 784 CeedInt Q; 785 ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr); 786 ierr = CeedVectorCreate(op->ceed, 787 op->Erestrict->nelem * Q 788 * op->qf->qdatasize / sizeof(CeedScalar), 789 &impl->qdata); CeedChk(ierr); 790 } 791 *qdata = impl->qdata; 792 return 0; 793 } 794 795 static int CeedOperatorCreate_Magma(CeedOperator op) { 796 CeedOperator_Magma *impl; 797 int ierr; 798 799 ierr = CeedCalloc(1, &impl); CeedChk(ierr); 800 op->data = impl; 801 op->Destroy = CeedOperatorDestroy_Magma; 802 op->Apply = CeedOperatorApply_Magma; 803 op->GetQData = CeedOperatorGetQData_Magma; 804 return 0; 805 } 806 807 // ***************************************************************************** 808 // * INIT 809 // ***************************************************************************** 810 static int CeedInit_Magma(const char *resource, Ceed ceed) { 811 int ierr; 812 if (strcmp(resource, "/gpu/magma")) 813 return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource); 814 815 ierr = magma_init(); 816 if (ierr) return CeedError(ceed, 1, "error in magma_init(): %d\n", ierr); 817 //magma_print_environment(); 818 819 ceed->VecCreate = CeedVectorCreate_Magma; 820 ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma; 821 ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma; 822 ceed->QFunctionCreate = CeedQFunctionCreate_Magma; 823 ceed->OperatorCreate = CeedOperatorCreate_Magma; 824 return 0; 825 } 826 827 // 
// *****************************************************************************
// * REGISTER
// *****************************************************************************
// Self-registers the MAGMA backend under the "/gpu/magma" resource with
// priority 20; runs automatically at library load via the constructor
// attribute (GCC/Clang extension).
__attribute__((constructor))
static void Register(void) {
  CeedRegister("/gpu/magma", CeedInit_Magma, 20);
}