backends/cuda-gen/ceed-cuda-gen-operator.c

// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.
*241a4b83SYohann
*241a4b83SYohann#include <ceed-backend.h>
*241a4b83SYohann#include "ceed-cuda-gen.h"
*241a4b83SYohann#include "ceed-cuda-gen-operator-build.h"
*241a4b83SYohann#include "../cuda/ceed-cuda.h"
*241a4b83SYohann
*241a4b83SYohannstatic int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
*241a4b83SYohann  int ierr;
*241a4b83SYohann  CeedOperator_Cuda_gen *impl;
*241a4b83SYohann  ierr = CeedOperatorGetData(op, (void *)&impl); CeedChk(ierr);
*241a4b83SYohann
*241a4b83SYohann  ierr = CeedFree(&impl); CeedChk(ierr);
*241a4b83SYohann  return 0;
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohannstatic int CeedOperatorApply_Cuda_gen(CeedOperator op, CeedVector invec,
*241a4b83SYohann                                      CeedVector outvec, CeedRequest *request) {
*241a4b83SYohann  int ierr;
*241a4b83SYohann  Ceed ceed;
*241a4b83SYohann  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
*241a4b83SYohann  CeedOperator_Cuda_gen *data;
*241a4b83SYohann  ierr = CeedOperatorGetData(op, (void *)&data); CeedChk(ierr);
*241a4b83SYohann  CeedQFunction qf;
*241a4b83SYohann  CeedQFunction_Cuda_gen *qf_data;
*241a4b83SYohann  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
*241a4b83SYohann  ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
*241a4b83SYohann  CeedInt nelem, numinputfields, numoutputfields;
*241a4b83SYohann  ierr = CeedOperatorGetNumElements(op, &nelem); CeedChk(ierr);
*241a4b83SYohann  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
*241a4b83SYohann  CeedChk(ierr);
*241a4b83SYohann  CeedOperatorField *opinputfields, *opoutputfields;
*241a4b83SYohann  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
*241a4b83SYohann  CeedChk(ierr);
*241a4b83SYohann  CeedQFunctionField *qfinputfields, *qfoutputfields;
*241a4b83SYohann  ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
*241a4b83SYohann  CeedChk(ierr);
*241a4b83SYohann  CeedEvalMode emode;
*241a4b83SYohann  CeedVector vec;
*241a4b83SYohann
*241a4b83SYohann  //Creation of the operator
*241a4b83SYohann  ierr = CeedCudaGenOperatorBuild(op); CeedChk(ierr);
*241a4b83SYohann
*241a4b83SYohann  // Zero lvecs
*241a4b83SYohann  for (CeedInt i = 0; i < numoutputfields; i++) {
*241a4b83SYohann    ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
*241a4b83SYohann    if (vec == CEED_VECTOR_ACTIVE)
*241a4b83SYohann      vec = outvec;
*241a4b83SYohann    ierr = CeedVectorSetValue(vec, 0.0); CeedChk(ierr);
*241a4b83SYohann  }
*241a4b83SYohann
*241a4b83SYohann  // Input vectors
*241a4b83SYohann  for (CeedInt i = 0; i < numinputfields; i++) {
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    if (emode == CEED_EVAL_WEIGHT) { // Skip
*241a4b83SYohann      data->fields.in[i] = NULL;
*241a4b83SYohann    } else {
*241a4b83SYohann      // Get input vector
*241a4b83SYohann      ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr);
*241a4b83SYohann      if (vec == CEED_VECTOR_ACTIVE) vec = invec;
*241a4b83SYohann      ierr = CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->fields.in[i]);
*241a4b83SYohann      CeedChk(ierr);
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann
*241a4b83SYohann  // Output vectors
*241a4b83SYohann  for (CeedInt i = 0; i < numoutputfields; i++) {
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    if (emode == CEED_EVAL_WEIGHT) { // Skip
*241a4b83SYohann      data->fields.out[i] = NULL;
*241a4b83SYohann    } else {
*241a4b83SYohann      // Get output vector
*241a4b83SYohann      ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
*241a4b83SYohann      if (vec == CEED_VECTOR_ACTIVE) vec = outvec;
*241a4b83SYohann      ierr = CeedVectorGetArray(vec, CEED_MEM_DEVICE, &data->fields.out[i]);
*241a4b83SYohann      CeedChk(ierr);
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann
*241a4b83SYohann  // Copy the context
*241a4b83SYohann  size_t ctxsize;
*241a4b83SYohann  ierr = CeedQFunctionGetContextSize(qf, &ctxsize); CeedChk(ierr);
*241a4b83SYohann  if (ctxsize > 0) {
*241a4b83SYohann    if(!qf_data->d_c) {
*241a4b83SYohann      ierr = cudaMalloc(&qf_data->d_c, ctxsize); CeedChk_Cu(ceed, ierr);
*241a4b83SYohann    }
*241a4b83SYohann    void *ctx;
*241a4b83SYohann    ierr = CeedQFunctionGetInnerContext(qf, &ctx); CeedChk(ierr);
*241a4b83SYohann    ierr = cudaMemcpy(qf_data->d_c, ctx, ctxsize, cudaMemcpyHostToDevice);
*241a4b83SYohann    CeedChk_Cu(ceed, ierr);
*241a4b83SYohann  }
*241a4b83SYohann
*241a4b83SYohann  // Apply operator
*241a4b83SYohann  void *opargs[] = {(void *) &nelem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
*241a4b83SYohann  const CeedInt dim = data->dim;
*241a4b83SYohann  const CeedInt Q1d = data->Q1d;
*241a4b83SYohann  if (dim==1) {
*241a4b83SYohann    const CeedInt elemsPerBlock = 32;
*241a4b83SYohann    CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
*241a4b83SYohann                                           ? 1 : 0 );
*241a4b83SYohann    CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar);
*241a4b83SYohann    ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, 1, elemsPerBlock,
*241a4b83SYohann                                      sharedMem, opargs);
*241a4b83SYohann  } else if (dim==2) {
*241a4b83SYohann    const CeedInt elemsPerBlock = Q1d<4? 16 : 2;
*241a4b83SYohann    CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
*241a4b83SYohann                                           ? 1 : 0 );
*241a4b83SYohann    CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
*241a4b83SYohann    ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, elemsPerBlock,
*241a4b83SYohann                                      sharedMem, opargs);
*241a4b83SYohann  } else if (dim==3) {
*241a4b83SYohann    const CeedInt elemsPerBlock = Q1d<8? 4 : 1;
*241a4b83SYohann    CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
*241a4b83SYohann                                           ? 1 : 0 );
*241a4b83SYohann    CeedInt sharedMem = elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
*241a4b83SYohann    ierr = CeedRunKernelDimSharedCuda(ceed, data->op, grid, Q1d, Q1d, elemsPerBlock,
*241a4b83SYohann                                      sharedMem, opargs);
*241a4b83SYohann  }
*241a4b83SYohann  CeedChk(ierr);
*241a4b83SYohann
*241a4b83SYohann  // Restore input arrays
*241a4b83SYohann  for (CeedInt i = 0; i < numinputfields; i++) {
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    if (emode == CEED_EVAL_WEIGHT) { // Skip
*241a4b83SYohann    } else {
*241a4b83SYohann      ierr = CeedOperatorFieldGetVector(opinputfields[i], &vec); CeedChk(ierr);
*241a4b83SYohann      if (vec == CEED_VECTOR_ACTIVE) vec = invec;
*241a4b83SYohann      ierr = CeedVectorRestoreArrayRead(vec, &data->fields.in[i]);
*241a4b83SYohann      CeedChk(ierr);
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann
*241a4b83SYohann  // Restore output arrays
*241a4b83SYohann  for (CeedInt i = 0; i < numoutputfields; i++) {
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    if (emode == CEED_EVAL_WEIGHT) { // Skip
*241a4b83SYohann    } else {
*241a4b83SYohann      ierr = CeedOperatorFieldGetVector(opoutputfields[i], &vec); CeedChk(ierr);
*241a4b83SYohann      if (vec == CEED_VECTOR_ACTIVE) vec = outvec;
*241a4b83SYohann      ierr = CeedVectorRestoreArray(vec, &data->fields.out[i]);
*241a4b83SYohann      CeedChk(ierr);
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann
*241a4b83SYohann  return 0;
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohannint CeedOperatorCreate_Cuda_gen(CeedOperator op) {
*241a4b83SYohann  int ierr;
*241a4b83SYohann  Ceed ceed;
*241a4b83SYohann  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
*241a4b83SYohann  CeedOperator_Cuda_gen *impl;
*241a4b83SYohann
*241a4b83SYohann  ierr = CeedCalloc(1, &impl); CeedChk(ierr);
*241a4b83SYohann  ierr = CeedOperatorSetData(op, (void *)&impl);
*241a4b83SYohann
*241a4b83SYohann  ierr = CeedSetBackendFunction(ceed, "Operator", op, "Apply",
*241a4b83SYohann                                CeedOperatorApply_Cuda_gen); CeedChk(ierr);
*241a4b83SYohann  ierr = CeedSetBackendFunction(ceed, "Operator", op, "Destroy",
*241a4b83SYohann                                CeedOperatorDestroy_Cuda_gen); CeedChk(ierr);
*241a4b83SYohann  return 0;
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohannint CeedCompositeOperatorCreate_Cuda_gen(CeedOperator op) {
*241a4b83SYohann  int ierr;
*241a4b83SYohann  Ceed ceed;
*241a4b83SYohann  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
*241a4b83SYohann  return CeedError(ceed, 1, "Backend does not implement composite operators");
*241a4b83SYohann}