1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at 2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights 3 // reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include "ceed-magma.h" 18 #include <string.h> 19 20 typedef struct { 21 CeedScalar *array; 22 CeedScalar *darray; 23 int own_; 24 int down_; 25 } CeedVector_Magma; 26 27 typedef struct { 28 CeedInt *indices; 29 CeedInt *dindices; 30 int own_; 31 int down_; // cover a case where we own Device memory 32 } CeedElemRestriction_Magma; 33 34 typedef struct { 35 CeedVector 36 *evecs; /// E-vectors needed to apply operator (input followed by outputs) 37 CeedScalar **edata; 38 CeedScalar **qdata; /// Inputs followed by outputs 39 CeedScalar 40 **qdata_alloc; /// Allocated quadrature data arrays (to be freed by us) 41 CeedScalar **indata; 42 CeedScalar **outdata; 43 CeedInt numein; 44 CeedInt numeout; 45 CeedInt numqin; 46 CeedInt numqout; 47 } CeedOperator_Magma; 48 49 // ***************************************************************************** 50 // * Initialize vector vec (after free mem) with values from array based on cmode 51 // * CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal 52 // * to array, and data is copied (not store passed pointer) 53 // * CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array 54 // * CEED_USE_POINTER: vec->data->array = array (can modify; no ownership) 55 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE 56 // ***************************************************************************** 57 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype, 58 CeedCopyMode cmode, CeedScalar *array) { 59 CeedVector_Magma *impl = vec->data; 60 int ierr; 61 62 // If own data, free the "old" data, e.g., as it may be of different size 63 if (impl->own_) { 64 magma_free( impl->darray ); 65 magma_free_pinned( impl->array ); 66 impl->darray = NULL; 67 impl->array = NULL; 68 impl->own_ = 0; 69 impl->down_= 0; 70 } 71 72 if (mtype == CEED_MEM_HOST) { 73 // memory is on the host; own_ = 0 74 switch (cmode) { 75 case CEED_COPY_VALUES: 76 ierr = magma_malloc( (void**)&impl->darray, 77 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 78 ierr = magma_malloc_pinned( (void**)&impl->array, 79 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 80 impl->own_ = 1; 81 82 if (array != NULL) 83 magma_setvector(vec->length, sizeof(array[0]), 84 array, 1, impl->darray, 1); 85 break; 86 case CEED_OWN_POINTER: 87 ierr = magma_malloc( (void**)&impl->darray, 88 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 89 // TODO: possible problem here is if we are passed non-pinned memory; 90 // (as we own it, lter in destroy, we use free for pinned memory). 91 impl->array = array; 92 impl->own_ = 1; 93 94 if (array != NULL) 95 magma_setvector(vec->length, sizeof(array[0]), 96 array, 1, impl->darray, 1); 97 break; 98 case CEED_USE_POINTER: 99 ierr = magma_malloc( (void**)&impl->darray, 100 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 101 magma_setvector(vec->length, sizeof(array[0]), 102 array, 1, impl->darray, 1); 103 104 impl->down_ = 1; 105 impl->array = array; 106 } 107 } else if (mtype == CEED_MEM_DEVICE) { 108 // memory is on the device; own = 0 109 switch (cmode) { 110 case CEED_COPY_VALUES: 111 ierr = magma_malloc( (void**)&impl->darray, 112 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 113 ierr = magma_malloc_pinned( (void**)&impl->array, 114 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 115 impl->own_ = 1; 116 117 if (array) 118 magma_copyvector(vec->length, sizeof(array[0]), 119 array, 1, impl->darray, 1); 120 else 121 // t30 assumes allocation initializes with 0s 122 magma_setvector(vec->length, sizeof(array[0]), 123 impl->array, 1, impl->darray, 1); 124 break; 125 case CEED_OWN_POINTER: 126 impl->darray = array; 127 ierr = magma_malloc_pinned( (void**)&impl->array, 128 vec->length * sizeof(CeedScalar)); CeedChk(ierr); 129 impl->own_ = 1; 130 131 break; 132 case CEED_USE_POINTER: 133 impl->darray = array; 134 impl->array = NULL; 135 } 136 137 } else 138 return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported"); 139 140 return 0; 141 } 142 143 // ***************************************************************************** 144 // * Give data pointer from vector vec to array (on HOST or DEVICE) 145 // ***************************************************************************** 146 static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype, 147 CeedScalar **array) { 148 CeedVector_Magma *impl = vec->data; 149 int ierr; 150 151 if (mtype == CEED_MEM_HOST) { 152 if (impl->own_) { 153 // data is owned so GPU had the most up-to-date version; copy it 154 // TTT - apparantly it doesn't have most up to date data 155 magma_getvector(vec->length, sizeof(*array[0]), 156 impl->darray, 1, impl->array, 1); 157 CeedDebug("\033[31m[CeedVectorGetArray_Magma]"); 158 //fprintf(stderr,"rrrrrrrrrrrrrrr\n"); 159 } else if (impl->array == NULL) { 160 // Vector doesn't own the data and was set on GPU 161 if (impl->darray == NULL) { 162 // call was made just to allocate memory 163 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 164 CeedChk(ierr); 165 } else 166 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST"); 167 } 168 *array = impl->array; 169 } else if (mtype == CEED_MEM_DEVICE) { 170 if (impl->darray == NULL) { 171 // Vector doesn't own the data and was set on the CPU 172 if (impl->array == NULL) { 173 // call was made just to allocate memory 174 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 175 CeedChk(ierr); 176 } else 177 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE"); 178 } 179 *array = impl->darray; 180 } else 181 return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory"); 182 183 return 0; 184 } 185 186 // ***************************************************************************** 187 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it 188 // ***************************************************************************** 189 static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype, 190 const CeedScalar **array) { 191 CeedVector_Magma *impl = vec->data; 192 int ierr; 193 194 if (mtype == CEED_MEM_HOST) { 195 if (impl->own_) { 196 // data is owned so GPU had the most up-to-date version; copy it 197 magma_getvector(vec->length, sizeof(*array[0]), 198 impl->darray, 1, impl->array, 1); 199 } else if (impl->array == NULL) { 200 // Vector doesn't own the data and was set on GPU 201 if (impl->darray == NULL) { 202 // call was made just to allocate memory 203 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 204 CeedChk(ierr); 205 } else 206 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST"); 207 } 208 *array = impl->array; 209 } else if (mtype == CEED_MEM_DEVICE) { 210 if (impl->darray == NULL) { 211 // Vector doesn't own the data and was set on the CPU 212 if (impl->array == NULL) { 213 // call was made just to allocate memory 214 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL); 215 CeedChk(ierr); 216 } else 217 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE"); 218 } 219 *array = impl->darray; 220 } else 221 return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory"); 222 223 return 0; 224 } 225 226 // ***************************************************************************** 227 // * There is no mtype here for array so it is not clear if we restore from HOST 228 // * memory or from DEVICE memory. We assume that it is CPU memory because if 229 // * it was GPU memory we would not call this routine at all. 230 // * Restore vector vec with values from array, where array received its values 231 // * from vec and possibly modified them. 232 // ***************************************************************************** 233 static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) { 234 CeedVector_Magma *impl = vec->data; 235 236 // Check if the array is a CPU pointer 237 if (*array == impl->array) { 238 // Update device, if the device pointer is not NULL 239 if (impl->darray != NULL) { 240 magma_setvector(vec->length, sizeof(*array[0]), 241 *array, 1, impl->darray, 1); 242 } else { 243 // nothing to do (case of CPU use pointer) 244 } 245 246 } else if (impl->down_) { 247 // nothing to do if array is on GPU, except if down_=1(case CPU use pointer) 248 magma_getvector(vec->length, sizeof(*array[0]), 249 impl->darray, 1, impl->array, 1); 250 } 251 252 *array = NULL; 253 return 0; 254 } 255 256 // ***************************************************************************** 257 // * There is no mtype here for array so it is not clear if we restore from HOST 258 // * memory or from DEVICE memory. We assume that it is CPU memory because if 259 // * it was GPU memory we would not call this routine at all. 260 // * Restore vector vec with values from array, where array received its values 261 // * from vec to only read them; in this case vec may have been modified meanwhile 262 // * and needs to be restored here. 263 // ***************************************************************************** 264 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec, 265 const CeedScalar **array) { 266 CeedVector_Magma *impl = vec->data; 267 268 // Check if the array is a CPU pointer 269 if (*array == impl->array) { 270 // Update device, if the device pointer is not NULL 271 if (impl->darray != NULL) { 272 magma_setvector(vec->length, sizeof(*array[0]), 273 *array, 1, impl->darray, 1); 274 } else { 275 // nothing to do (case of CPU use pointer) 276 } 277 278 } else if (impl->down_) { 279 // nothing to do if array is on GPU, except if down_=1(case CPU use pointer) 280 magma_getvector(vec->length, sizeof(*array[0]), 281 impl->darray, 1, impl->array, 1); 282 } 283 284 *array = NULL; 285 return 0; 286 } 287 288 static int CeedVectorDestroy_Magma(CeedVector vec) { 289 CeedVector_Magma *impl = vec->data; 290 int ierr; 291 292 // Free if we own the data 293 if (impl->own_) { 294 ierr = magma_free_pinned(impl->array); CeedChk(ierr); 295 ierr = magma_free(impl->darray); CeedChk(ierr); 296 } else if (impl->down_) { 297 ierr = magma_free(impl->darray); CeedChk(ierr); 298 } 299 ierr = CeedFree(&vec->data); CeedChk(ierr); 300 return 0; 301 } 302 303 // ***************************************************************************** 304 // * Create vector vec of size n 305 // ***************************************************************************** 306 static int CeedVectorCreate_Magma(CeedInt n, CeedVector vec) { 307 CeedVector_Magma *impl; 308 int ierr; 309 310 vec->SetArray = CeedVectorSetArray_Magma; 311 vec->GetArray = CeedVectorGetArray_Magma; 312 vec->GetArrayRead = CeedVectorGetArrayRead_Magma; 313 vec->RestoreArray = CeedVectorRestoreArray_Magma; 314 vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma; 315 vec->Destroy = CeedVectorDestroy_Magma; 316 ierr = CeedCalloc(1,&impl); CeedChk(ierr); 317 impl->darray = NULL; 318 impl->array = NULL; 319 impl->own_ = 0; 320 impl->down_= 0; 321 vec->data = impl; 322 return 0; 323 } 324 325 326 // ***************************************************************************** 327 // * Apply restriction operator r to u: v = r(rmode) u 328 // ***************************************************************************** 329 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r, 330 CeedTransposeMode tmode, 331 CeedTransposeMode lmode, CeedVector u, 332 CeedVector v, CeedRequest *request) { 333 CeedElemRestriction_Magma *impl = r->data; 334 int ierr; 335 const CeedScalar *uu; 336 CeedScalar *vv; 337 CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof, 338 ncomp=r->ncomp; 339 CeedInt esize = nelem * elemsize; 340 341 #ifdef USE_MAGMA_BATCH2 342 CeedInt *dindices = impl->dindices; 343 // Get pointers on the device 344 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr); 345 ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr); 346 #else 347 CeedInt *indices = impl->indices; 348 ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr); 349 ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr); 350 #endif 351 352 if (tmode == CEED_NOTRANSPOSE) { 353 // Perform: v = r * u 354 if (!impl->indices) { 355 for (CeedInt i=0; i<esize*ncomp; i++) vv[i] = uu[i]; 356 } else if (ncomp == 1) { 357 #ifdef USE_MAGMA_BATCH2 358 magma_template<<i=0:esize>> 359 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 360 vv[i] = uu[dindices[i]]; 361 } 362 #else 363 for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]]; 364 #endif 365 } else { 366 // vv is (elemsize x ncomp x nelem), column-major 367 if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major 368 #ifdef USE_MAGMA_BATCH2 369 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 370 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) { 371 vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d]; 372 } 373 #else 374 for (CeedInt e = 0; e < nelem; e++) 375 for (CeedInt d = 0; d < ncomp; d++) 376 for (CeedInt i=0; i < elemsize; i++) { 377 vv[i + elemsize*(d+ncomp*e)] = 378 uu[indices[i+elemsize*e]+ndof*d]; 379 } 380 #endif 381 } else { // u is (ncomp x ndof), column-major 382 #ifdef USE_MAGMA_BATCH2 383 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 384 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 385 vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]]; 386 } 387 #else 388 for (CeedInt e = 0; e < nelem; e++) 389 for (CeedInt d = 0; d < ncomp; d++) 390 for (CeedInt i=0; i< elemsize; i++) { 391 vv[i + elemsize*(d+ncomp*e)] = 392 uu[d+ncomp*indices[i+elemsize*e]]; 393 } 394 #endif 395 } 396 } 397 } else { 398 // Note: in transpose mode, we perform: v += r^t * u 399 if (!impl->indices) { 400 for (CeedInt i=0; i<esize; i++) vv[i] += uu[i]; 401 } else if (ncomp == 1) { 402 // fprintf(stderr,"3 ---------\n"); 403 #ifdef USE_MAGMA_BATCH2 404 magma_template<<i=0:esize>> 405 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 406 magmablas_datomic_add( &vv[dindices[i]], uu[i]); 407 } 408 #else 409 for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i]; 410 #endif 411 } else { // u is (elemsize x ncomp x nelem) 412 fprintf(stderr,"2 ---------\n"); 413 414 if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major 415 #ifdef USE_MAGMA_BATCH2 416 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 417 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) { 418 magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d], 419 uu[i+iend*(d+e*dend)]); 420 } 421 #else 422 for (CeedInt e = 0; e < nelem; e++) 423 for (CeedInt d = 0; d < ncomp; d++) 424 for (CeedInt i=0; i < elemsize; i++) { 425 vv[indices[i + elemsize*e]+ndof*d] += 426 uu[i + elemsize*(d+e*ncomp)]; 427 } 428 #endif 429 } else { // vv is (ncomp x ndof), column-major 430 #ifdef USE_MAGMA_BATCH2 431 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>> 432 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) { 433 magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]], 434 uu[i+iend*(d+e*dend)]); 435 } 436 #else 437 for (CeedInt e = 0; e < nelem; e++) 438 for (CeedInt d = 0; d < ncomp; d++) 439 for (CeedInt i=0; i < elemsize; i++) { 440 vv[d+ncomp*indices[i + elemsize*e]] += 441 uu[i + elemsize*(d+e*ncomp)]; 442 } 443 #endif 444 } 445 } 446 } 447 448 ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr); 449 ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr); 450 451 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) 452 *request = NULL; 453 return 0; 454 } 455 456 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) { 457 CeedElemRestriction_Magma *impl = r->data; 458 int ierr; 459 460 // Free if we own the data 461 if (impl->own_) { 462 ierr = magma_free_pinned(impl->indices); CeedChk(ierr); 463 ierr = magma_free(impl->dindices); CeedChk(ierr); 464 } else if (impl->down_) { 465 ierr = magma_free(impl->dindices); CeedChk(ierr); 466 } 467 ierr = CeedFree(&r->data); CeedChk(ierr); 468 return 0; 469 } 470 471 static int CeedElemRestrictionCreate_Magma(CeedMemType mtype, 472 CeedCopyMode cmode, 473 const CeedInt *indices, CeedElemRestriction r) { 474 int ierr, size = r->nelem*r->elemsize; 475 CeedElemRestriction_Magma *impl; 476 477 // Allocate memory for the MAGMA Restricton and initializa pointers to NULL 478 ierr = CeedCalloc(1,&impl); CeedChk(ierr); 479 impl->dindices = NULL; 480 impl->indices = NULL; 481 impl->own_ = 0; 482 impl->down_= 0; 483 484 if (mtype == CEED_MEM_HOST) { 485 // memory is on the host; own_ = 0 486 switch (cmode) { 487 case CEED_COPY_VALUES: 488 ierr = magma_malloc( (void**)&impl->dindices, 489 size * sizeof(CeedInt)); CeedChk(ierr); 490 ierr = magma_malloc_pinned( (void**)&impl->indices, 491 size * sizeof(CeedInt)); CeedChk(ierr); 492 impl->own_ = 1; 493 494 if (indices != NULL) { 495 memcpy(impl->indices, indices, size * sizeof(indices[0])); 496 magma_setvector(size, sizeof(CeedInt), 497 impl->indices, 1, impl->dindices, 1); 498 } 499 break; 500 case CEED_OWN_POINTER: 501 ierr = magma_malloc( (void**)&impl->dindices, 502 size * sizeof(CeedInt)); CeedChk(ierr); 503 // TODO: possible problem here is if we are passed non-pinned memory; 504 // (as we own it, lter in destroy, we use free for pinned memory). 505 impl->indices = (CeedInt *)indices; 506 impl->own_ = 1; 507 508 if (indices != NULL) 509 magma_setvector(size, sizeof(CeedInt), 510 indices, 1, impl->dindices, 1); 511 break; 512 case CEED_USE_POINTER: 513 ierr = magma_malloc( (void**)&impl->dindices, 514 size * sizeof(CeedInt)); CeedChk(ierr); 515 magma_setvector(size, sizeof(CeedInt), 516 indices, 1, impl->dindices, 1); 517 impl->down_ = 1; 518 impl->indices = (CeedInt *)indices; 519 } 520 } else if (mtype == CEED_MEM_DEVICE) { 521 // memory is on the device; own = 0 522 switch (cmode) { 523 case CEED_COPY_VALUES: 524 ierr = magma_malloc( (void**)&impl->dindices, 525 size * sizeof(CeedInt)); CeedChk(ierr); 526 ierr = magma_malloc_pinned( (void**)&impl->indices, 527 size * sizeof(CeedInt)); CeedChk(ierr); 528 impl->own_ = 1; 529 530 if (indices) 531 magma_copyvector(size, sizeof(CeedInt), 532 indices, 1, impl->dindices, 1); 533 break; 534 case CEED_OWN_POINTER: 535 impl->dindices = (CeedInt *)indices; 536 ierr = magma_malloc_pinned( (void**)&impl->indices, 537 size * sizeof(CeedInt)); CeedChk(ierr); 538 impl->own_ = 1; 539 540 break; 541 case CEED_USE_POINTER: 542 impl->dindices = (CeedInt *)indices; 543 impl->indices = NULL; 544 } 545 546 } else 547 return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported"); 548 549 r->data = impl; 550 r->Apply = CeedElemRestrictionApply_Magma; 551 r->Destroy = CeedElemRestrictionDestroy_Magma; 552 553 return 0; 554 } 555 556 static int CeedElemRestrictionCreateBlocked_Magma(CeedMemType mtype, 557 CeedCopyMode cmode, 558 const CeedInt *indices, CeedElemRestriction r) { 559 return CeedError(r->ceed, 1, "Backend does not implement blocked restrictions"); 560 } 561 562 // Contracts on the middle index 563 // NOTRANSPOSE: V_ajc = T_jb U_abc 564 // TRANSPOSE: V_ajc = T_bj U_abc 565 // If Add != 0, "=" is replaced by "+=" 566 static int CeedTensorContract_Magma(Ceed ceed, 567 CeedInt A, CeedInt B, CeedInt C, CeedInt J, 568 const CeedScalar *t, CeedTransposeMode tmode, 569 const CeedInt Add, 570 const CeedScalar *u, CeedScalar *v) { 571 #ifdef USE_MAGMA_BATCH 572 magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v); 573 #else 574 CeedInt tstride0 = B, tstride1 = 1; 575 if (tmode == CEED_TRANSPOSE) { 576 tstride0 = 1; tstride1 = J; 577 } 578 CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d", 579 A,J,C,B,A*J*B*C, C*J*A, C*B*A); 580 for (CeedInt a=0; a<A; a++) { 581 for (CeedInt j=0; j<J; j++) { 582 if (!Add) { 583 for (CeedInt c=0; c<C; c++) 584 v[(a*J+j)*C+c] = 0; 585 } 586 for (CeedInt b=0; b<B; b++) { 587 for (CeedInt c=0; c<C; c++) { 588 v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c]; 589 } 590 } 591 } 592 } 593 #endif 594 return 0; 595 } 596 597 static int CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, 598 CeedTransposeMode tmode, CeedEvalMode emode, 599 const CeedScalar *u, CeedScalar *v) { 600 int ierr; 601 const CeedInt dim = basis->dim; 602 const CeedInt ncomp = basis->ncomp; 603 const CeedInt nqpt = ncomp*CeedPowInt(basis->Q1d, dim); 604 const CeedInt add = (tmode == CEED_TRANSPOSE); 605 606 if (nelem != 1) 607 return CeedError(basis->ceed, 1, 608 "This backend does not support BasisApply for multiple elements"); 609 610 CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d", 611 ncomp*CeedPowInt(basis->P1d, dim)); 612 613 if (tmode == CEED_TRANSPOSE) { 614 const CeedInt vsize = ncomp*CeedPowInt(basis->P1d, dim); 615 for (CeedInt i = 0; i < vsize; i++) 616 v[i] = (CeedScalar) 0; 617 } 618 if (emode & CEED_EVAL_INTERP) { 619 CeedInt P = basis->P1d, Q = basis->Q1d; 620 if (tmode == CEED_TRANSPOSE) { 621 P = basis->Q1d; Q = basis->P1d; 622 } 623 CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1; 624 CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)]; 625 CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d", 626 ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)); 627 for (CeedInt d=0; d<dim; d++) { 628 ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d, 629 tmode, add&&(d==dim-1), 630 d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]); 631 CeedChk(ierr); 632 pre /= P; 633 post *= Q; 634 } 635 if (tmode == CEED_NOTRANSPOSE) { 636 v += nqpt; 637 } else { 638 u += nqpt; 639 } 640 } 641 if (emode & CEED_EVAL_GRAD) { 642 CeedInt P = basis->P1d, Q = basis->Q1d; 643 // In CEED_NOTRANSPOSE mode: 644 // u is (P^dim x nc), column-major layout (nc = ncomp) 645 // v is (Q^dim x nc x dim), column-major layout (nc = ncomp) 646 // In CEED_TRANSPOSE mode, the sizes of u and v are switched. 647 if (tmode == CEED_TRANSPOSE) { 648 P = basis->Q1d, Q = basis->P1d; 649 } 650 CeedScalar tmp[2][ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)]; 651 CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d", 652 ncomp*Q*CeedPowInt(P>Q?P:Q, dim-1)); 653 for (CeedInt p = 0; p < dim; p++) { 654 CeedInt pre = ncomp*CeedPowInt(P, dim-1), post = 1; 655 for (CeedInt d=0; d<dim; d++) { 656 ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, 657 (p==d)?basis->grad1d:basis->interp1d, 658 tmode, add&&(d==dim-1), 659 d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]); 660 CeedChk(ierr); 661 pre /= P; 662 post *= Q; 663 } 664 if (tmode == CEED_NOTRANSPOSE) { 665 v += nqpt; 666 } else { 667 u += nqpt; 668 } 669 } 670 } 671 if (emode & CEED_EVAL_WEIGHT) { 672 if (tmode == CEED_TRANSPOSE) 673 return CeedError(basis->ceed, 1, 674 "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); 675 CeedInt Q = basis->Q1d; 676 for (CeedInt d=0; d<dim; d++) { 677 CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d); 678 for (CeedInt i=0; i<pre; i++) { 679 for (CeedInt j=0; j<Q; j++) { 680 for (CeedInt k=0; k<post; k++) { 681 v[(i*Q + j)*post + k] = basis->qweight1d[j] 682 * (d == 0 ? 1 : v[(i*Q + j)*post + k]); 683 } 684 } 685 } 686 } 687 } 688 return 0; 689 } 690 691 static int CeedBasisDestroy_Magma(CeedBasis basis) { 692 return 0; 693 } 694 695 static int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, 696 CeedInt Q1d, const CeedScalar *interp1d, 697 const CeedScalar *grad1d, 698 const CeedScalar *qref1d, 699 const CeedScalar *qweight1d, 700 CeedBasis basis) { 701 basis->Apply = CeedBasisApply_Magma; 702 basis->Destroy = CeedBasisDestroy_Magma; 703 return 0; 704 } 705 706 static int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, 707 CeedInt ndof, CeedInt nqpts, 708 const CeedScalar *interp, 709 const CeedScalar *grad, 710 const CeedScalar *qref, 711 const CeedScalar *qweight, 712 CeedBasis basis) { 713 return CeedError(basis->ceed, 1, "Backend does not implement non-tensor bases"); 714 } 715 716 static int CeedQFunctionApply_Magma(CeedQFunction qf, CeedInt Q, 717 const CeedScalar *const *u, 718 CeedScalar *const *v) { 719 int ierr; 720 ierr = qf->function(qf->ctx, Q, u, v); CeedChk(ierr); 721 return 0; 722 } 723 724 static int CeedQFunctionDestroy_Magma(CeedQFunction qf) { 725 return 0; 726 } 727 728 static int CeedQFunctionCreate_Magma(CeedQFunction qf) { 729 qf->Apply = CeedQFunctionApply_Magma; 730 qf->Destroy = CeedQFunctionDestroy_Magma; 731 return 0; 732 } 733 734 static int CeedOperatorDestroy_Magma(CeedOperator op) { 735 CeedOperator_Magma *impl = op->data; 736 int ierr; 737 738 for (CeedInt i=0; i<impl->numein+impl->numeout; i++) { 739 ierr = CeedVectorDestroy(&impl->evecs[i]); CeedChk(ierr); 740 } 741 742 ierr = CeedFree(&impl->evecs); CeedChk(ierr); 743 ierr = CeedFree(&impl->edata); CeedChk(ierr); 744 745 for (CeedInt i=0; i<impl->numqin+impl->numqout; i++) { 746 ierr = CeedFree(&impl->qdata_alloc[i]); CeedChk(ierr); 747 } 748 749 ierr = CeedFree(&impl->qdata_alloc); CeedChk(ierr); 750 ierr = CeedFree(&impl->qdata); CeedChk(ierr); 751 752 ierr = CeedFree(&impl->indata); CeedChk(ierr); 753 ierr = CeedFree(&impl->outdata); CeedChk(ierr); 754 755 ierr = CeedFree(&op->data); CeedChk(ierr); 756 return 0; 757 } 758 759 760 /* 761 Setup infields or outfields 762 */ 763 static int CeedOperatorSetupFields_Magma(CeedQFunctionField qfields[16], 764 CeedOperatorField ofields[16], 765 CeedVector *evecs, CeedScalar **qdata, 766 CeedScalar **qdata_alloc, CeedScalar **indata, 767 CeedInt starti, CeedInt startq, 768 CeedInt numfields, CeedInt Q) { 769 CeedInt dim, ierr, iq=startq, ncomp; 770 771 // Loop over fields 772 for (CeedInt i=0; i<numfields; i++) { 773 CeedEvalMode emode = qfields[i]->emode; 774 if (emode != CEED_EVAL_WEIGHT) { 775 ierr = CeedElemRestrictionCreateVector(ofields[i]->Erestrict, NULL, &evecs[i]); 776 CeedChk(ierr); 777 } 778 switch(emode) { 779 case CEED_EVAL_NONE: 780 break; // No action 781 case CEED_EVAL_INTERP: 782 ncomp = qfields[i]->ncomp; 783 ierr = CeedMalloc(Q*ncomp, &qdata_alloc[iq]); CeedChk(ierr); 784 qdata[i + starti] = qdata_alloc[iq]; 785 iq++; 786 break; 787 case CEED_EVAL_GRAD: 788 ncomp = qfields[i]->ncomp; 789 dim = ofields[i]->basis->dim; 790 ierr = CeedMalloc(Q*ncomp*dim, &qdata_alloc[iq]); CeedChk(ierr); 791 qdata[i + starti] = qdata_alloc[iq]; 792 iq++; 793 break; 794 case CEED_EVAL_WEIGHT: // Only on input fields 795 ierr = CeedMalloc(Q, &qdata_alloc[iq]); CeedChk(ierr); 796 ierr = CeedBasisApply(ofields[iq]->basis, 1, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, 797 NULL, qdata_alloc[iq]); CeedChk(ierr); 798 qdata[i] = qdata_alloc[iq]; 799 indata[i] = qdata[i]; 800 iq++; 801 break; 802 case CEED_EVAL_DIV: 803 break; // Not implimented 804 case CEED_EVAL_CURL: 805 break; // Not implimented 806 } 807 } 808 return 0; 809 } 810 811 /* 812 CeedOperator needs to connect all the named fields (be they active or passive) 813 to the named inputs and outputs of its CeedQFunction. 814 */ 815 static int CeedOperatorSetup_Magma(CeedOperator op) { 816 if (op->setupdone) return 0; 817 CeedOperator_Magma *opmagma = op->data; 818 CeedQFunction qf = op->qf; 819 CeedInt Q = op->numqpoints; 820 int ierr; 821 822 // Count infield and outfield array sizes and evectors 823 opmagma->numein = qf->numinfutfields; 824 for (CeedInt i=0; i<qf->numinputfields; i++) { 825 CeedEvalMode emode = qf->inputfields[i]->emode; 826 opmagma->numqin += !!(emode & CEED_EVAL_INTERP) + !!(emode & CEED_EVAL_GRAD) + !! 827 (emode & CEED_EVAL_WEIGHT); 828 } 829 qpmagma->numeout = qf->numoutputfields; 830 for (CeedInt i=0; i<qf->numoutputfields; i++) { 831 CeedEvalMode emode = qf->outputfields[i]->emode; 832 opmagma->numqout += !!(emode & CEED_EVAL_INTERP) + !!(emode & CEED_EVAL_GRAD); 833 } 834 835 // Allocate 836 ierr = CeedCalloc(opmagma->numein + opmagma->numeout, &opmagma->evecs); CeedChk(ierr); 837 ierr = CeedCalloc(opmagma->numein + opmagma->numeout, &opmagma->edata); 838 CeedChk(ierr); 839 840 ierr = CeedCalloc(opmagma->numqin + opmagma->numqout, &opmagma->qdata_alloc); 841 CeedChk(ierr); 842 ierr = CeedCalloc(qf->numinputfields + qf->numoutputfields, &opmagma->qdata); 843 CeedChk(ierr); 844 845 ierr = CeedCalloc(16, &opmagma->indata); CeedChk(ierr); 846 ierr = CeedCalloc(16, &opmagma->outdata); CeedChk(ierr); 847 848 // Set up infield and outfield pointer arrays 849 // Infields 850 ierr = CeedOperatorSetupFields_Magma(qf->inputfields, op->inputfields, 851 opmagma->evecs, opmagma->qdata, opmagma->qdata_alloc, 852 opmagma->indata, 0, 0, 853 qf->numinputfields, Q); CeedChk(ierr); 854 855 // Outfields 856 ierr = CeedOperatorSetupFields_Magma(qf->outputfields, op->outputfields, 857 opmagma->evecs, opmagma->qdata, opmagma->qdata_alloc, 858 opmagma->indata, qf->numinputfields, 859 opmagma->numqin, qf->numoutputfields, Q); CeedChk(ierr); 860 861 // Output Qvecs 862 for (CeedInt i=0; i<qf->numoutputfields; i++) { 863 CeedEvalMode emode = qf->outputfields[i]->emode; 864 if (emode != CEED_EVAL_NONE) { 865 opmagma->outdata[i] = opmagma->qdata[i + qf->numinputfields]; 866 } 867 } 868 869 op->setupdone = 1; 870 871 return 0; 872 } 873 874 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector invec, 875 CeedVector outvec, CeedRequest *request) { 876 CeedOperator_Magma *opmagma = op->data; 877 CeedInt Q = op->numqpoints, elemsize; 878 int ierr; 879 CeedQFunction qf = op->qf; 880 CeedScalar *vec_temp; 881 882 // Setup 883 ierr = CeedOperatorSetup_Magma(op); CeedChk(ierr); 884 885 // Input Evecs and Restriction 886 for (CeedInt i=0; i<qf->numinputfields; i++) { 887 CeedEvalMode emode = qf->inputfields[i]->emode; 888 if (emode & CEED_EVAL_WEIGHT) { // Skip 889 } else { 890 // Zero evec 891 ierr = CeedVectorGetArray(opmagma->evecs[i], CEED_MEM_HOST, &vec_temp); 892 CeedChk(ierr); 893 for (CeedInt j=0; j<opmagma->evecs[i]->length; j++) 894 vec_temp[j] = 0.; 895 ierr = CeedVectorRestoreArray(opmagma->evecs[i], &vec_temp); CeedChk(ierr); 896 // Active 897 if (op->inputfields[i]->vec == CEED_VECTOR_ACTIVE) { 898 // Restrict 899 ierr = CeedElemRestrictionApply(op->inputfields[i]->Erestrict, CEED_NOTRANSPOSE, 900 op->inputfields[i]->lmode, invec, opmagma->evecs[ieiin], 901 request); CeedChk(ierr); 902 // Get evec 903 ierr = CeedVectorGetArrayRead(opmagma->evecs[i], CEED_MEM_HOST, 904 (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr); 905 } else { 906 // Passive 907 // Restrict 908 ierr = CeedElemRestrictionApply(op->inputfields[i]->Erestrict, CEED_NOTRANSPOSE, 909 op->inputfields[i]->lmode, op->inputfields[i]->vec, opmagma->evecs[i], 910 request); CeedChk(ierr); 911 // Get evec 912 ierr = CeedVectorGetArrayRead(opmagma->evecs[i], CEED_MEM_HOST, 913 (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr); 914 } 915 } 916 } 917 918 // Output Evecs 919 for (CeedInt i=0; i<qf->numoutputfields; i++) { 920 ierr = CeedVectorGetArray(opmagma->evecs[i+opmagma->numein], CEED_MEM_HOST, 921 &opmagma->edata[i + qf->numinputfields]); 922 CeedChk(ierr); 923 } 924 925 // Loop through elements 926 for (CeedInt e=0; e<op->numelements; e++) { 927 // Input basis apply if needed 928 for (CeedInt i=0; i<qf->numinputfields; i++) { 929 // Get elemsize, emode, ncomp 930 elemsize = op->inputfields[i].Erestrict->elemsize; 931 CeedEvalMode emode = qf->inputfields[i]->emode; 932 CeedInt ncomp = qf->inputfields[i]->ncomp; 933 // Basis action 934 switch(emode) { 935 case CEED_EVAL_NONE: 936 opmagma->indata[i] = &opmagma->edata[i][e*Q*ncomp]; 937 break; 938 case CEED_EVAL_INTERP: 939 ierr = CeedBasisApply(op->inputfields[i]->basis, 1, CEED_NOTRANSPOSE, 940 CEED_EVAL_INTERP, &opmagma->edata[i][e*elemsize*ncomp], opmagma->qdata[i]); 941 CeedChk(ierr); 942 opmagma->indata[i] = opmagma->qdata[i]; 943 break; 944 case CEED_EVAL_GRAD: 945 ierr = CeedBasisApply(op->inputfields[i]->basis, 1, CEED_NOTRANSPOSE, 946 CEED_EVAL_GRAD, &opmagma->edata[i][e*elemsize*ncomp], opmagma->qdata[i]); 947 CeedChk(ierr); 948 opmagma->indata[i] = opmagma->qdata[i]; 949 break; 950 case CEED_EVAL_WEIGHT: 951 break; // No action 952 case CEED_EVAL_DIV: 953 break; // Not implimented 954 case CEED_EVAL_CURL: 955 break; // Not implimented 956 } 957 } 958 // Output pointers 959 for (CeedInt i=0; i<qf->numoutputfields; i++) { 960 CeedEvalMode emode = qf->outputfields[i]->emode; 961 if (emode == CEED_EVAL_NONE) { 962 CeedInt ncomp = qf->outputfields[i]->ncomp; 963 opmagma->outdata[i] = &opmagma->edata[i + qf->numinputfields][e*Q*ncomp]; 964 } 965 } 966 // Q function 967 ierr = CeedQFunctionApply(op->qf, Q, (const CeedScalar * const*) opmagma->indata, 968 opmagma->outdata); CeedChk(ierr); 969 970 // Output basis apply if needed 971 for (CeedInt i=0; i<qf->numoutputfields; i++) { 972 // Get elemsize, emode, ncomp 973 elemsize = op->outputfields[i]->Erestrict->elemsize; 974 CeedInt ncomp = qf->outputfields[i]->ncomp; 975 CeedEvalMode emode = qf->outputfields[i]->emode; 976 // Basis action 977 switch(emode) { 978 case CEED_EVAL_NONE: 979 break; // No action 980 case CEED_EVAL_INTERP: 981 ierr = CeedBasisApply(op->outputfields[i]->basis, 1, CEED_TRANSPOSE, 982 CEED_EVAL_INTERP, opmagma->outdata[i], 983 &opmagma->edata[i + qf->numinputfields][e*elemsize*ncomp]); CeedChk(ierr); 984 break; 985 case CEED_EVAL_GRAD: 986 ierr = CeedBasisApply(op->outputfields[i]->basis, 1, CEED_TRANSPOSE, CEED_EVAL_GRAD, 987 opmagma->outdata[i], &opmagma->edata[i + qf->numinputfields][e*elemsize*ncomp]); 988 CeedChk(ierr); 989 break; 990 case CEED_EVAL_WEIGHT: 991 break; // Should not occur 992 case CEED_EVAL_DIV: 993 break; // Not implimented 994 case CEED_EVAL_CURL: 995 break; // Not implimented 996 } 997 } 998 } 999 1000 // Output restriction 1001 for (CeedInt i=0; i<qf->numoutputfields; i++) { 1002 // Active 1003 if (op->outputfields[i]->vec == CEED_VECTOR_ACTIVE) { 1004 // Restore evec 1005 ierr = CeedVectorRestoreArray(opmagma->evecs[i+opmagma->numein], 1006 &opmagma->edata[i + qf->numinputfields]); CeedChk(ierr); 1007 // Zero lvec 1008 ierr = CeedVectorGetArray(outvec, CEED_MEM_HOST, &vec_temp); CeedChk(ierr); 1009 for (CeedInt j=0; j<outvec->length; j++) 1010 vec_temp[j] = 0.; 1011 ierr = CeedVectorRestoreArray(outvec, &vec_temp); CeedChk(ierr); 1012 // Restrict 1013 ierr = CeedElemRestrictionApply(op->outputfields[i]->Erestrict, CEED_TRANSPOSE, 1014 op->outputfields[i]->lmode, opmagma->evecs[i+opmagma->numein], outvec, request); CeedChk(ierr); 1015 } else { 1016 // Passive 1017 // Restore evec 1018 ierr = CeedVectorRestoreArray(opmagma->evecs[i+opmagma->numein], 1019 &opmagma->edata[i + qf->numinputfields]); CeedChk(ierr); 1020 // Zero lvec 1021 ierr = CeedVectorGetArray(op->outputfields[i]->vec, CEED_MEM_HOST, &vec_temp); 1022 CeedChk(ierr); 1023 for (CeedInt j=0; j<op->outputfields[i]->vec->length; j++) 1024 vec_temp[j] = 0.; 1025 ierr = CeedVectorRestoreArray(op->outputfields[i]->vec, &vec_temp); 1026 CeedChk(ierr); 1027 // Restrict 1028 ierr = CeedElemRestrictionApply(op->outputfields[i]->Erestrict, CEED_TRANSPOSE, 1029 op->outputfields[i]->lmode, opmagma->evecs[i+opmagma->numein], op->outputfields[i].vec, 1030 request); CeedChk(ierr); 1031 } 1032 } 1033 1034 // Restore input arrays 1035 for (CeedInt i=0; i<qf->numinputfields; i++) { 1036 CeedEvalMode emode = qf->inputfields[i]->emode; 1037 if (emode & CEED_EVAL_WEIGHT) { 1038 } else { 1039 ierr = CeedVectorRestoreArrayRead(opmagma->evecs[i], 1040 (const CeedScalar **) &opmagma->edata[i]); CeedChk(ierr); 1041 } 1042 } 1043 1044 return 0; 1045 } 1046 1047 static int CeedOperatorCreate_Magma(CeedOperator op) { 1048 CeedOperator_Magma *impl; 1049 int ierr; 1050 1051 ierr = CeedCalloc(1, &impl); CeedChk(ierr); 1052 op->data = impl; 1053 op->Destroy = CeedOperatorDestroy_Magma; 1054 op->Apply = CeedOperatorApply_Magma; 1055 return 0; 1056 } 1057 1058 // ***************************************************************************** 1059 // * INIT 1060 // ***************************************************************************** 1061 static int CeedInit_Magma(const char *resource, Ceed ceed) { 1062 int ierr; 1063 if (strcmp(resource, "/gpu/magma")) 1064 return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource); 1065 1066 ierr = magma_init(); 1067 if (ierr) return CeedError(ceed, 1, "error in magma_init(): %d\n", ierr); 1068 //magma_print_environment(); 1069 1070 ceed->VecCreate = CeedVectorCreate_Magma; 1071 ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma; 1072 ceed->BasisCreateH1 = CeedBasisCreateH1_Magma; 1073 ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma; 1074 ceed->ElemRestrictionCreateBlocked = CeedElemRestrictionCreateBlocked_Magma; 1075 ceed->QFunctionCreate = CeedQFunctionCreate_Magma; 1076 ceed->OperatorCreate = CeedOperatorCreate_Magma; 1077 return 0; 1078 } 1079 1080 // ***************************************************************************** 1081 // * REGISTER 1082 // ***************************************************************************** 1083 __attribute__((constructor)) 1084 static void Register(void) { 1085 CeedRegister("/gpu/magma", CeedInit_Magma, 20); 1086 } 1087