xref: /honee/src/smartsim/smartsim.c (revision 7ecf6641c340caefd7fa9f7fc7a6efebf89ae19d)
1ae2b091fSJames Wright // SPDX-FileCopyrightText: Copyright (c) 2017-2024, HONEE contributors.
2ae2b091fSJames Wright // SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause
37cd70835SJames Wright // Based on the instructions from https://www.craylabs.org/docs/sr_integration.html and PHASTA implementation
47cd70835SJames Wright 
59ae013d6SJames Wright #include <smartsim-impl.h>
67cd70835SJames Wright 
7149fb536SJames Wright #include <navierstokes.h>
87cd70835SJames Wright 
97ebeccb9SJames Wright #define SMARTSIM_KEY "SmartSimData"
10797f7eedSJames Wright 
SmartSimDataDestroy(SmartSimData * smartsim)117ebeccb9SJames Wright static PetscErrorCode SmartSimDataDestroy(SmartSimData *smartsim) {
127ebeccb9SJames Wright   SmartSimData smartsim_ = *smartsim;
13*14bd2a07SJames Wright 
147ebeccb9SJames Wright   PetscFunctionBeginUser;
157ebeccb9SJames Wright   if (!smartsim_) PetscFunctionReturn(PETSC_SUCCESS);
167ebeccb9SJames Wright 
177ebeccb9SJames Wright   PetscCallSmartRedis(DeleteCClient(&smartsim_->client));
187ebeccb9SJames Wright   PetscCall(PetscFree(smartsim_));
197ebeccb9SJames Wright   *smartsim = NULL;
207cd70835SJames Wright   PetscFunctionReturn(PETSC_SUCCESS);
217cd70835SJames Wright }
227cd70835SJames Wright 
SmartSimTrainingSetup(Honee honee)23797f7eedSJames Wright static PetscErrorCode SmartSimTrainingSetup(Honee honee) {
247ebeccb9SJames Wright   SmartSimData smartsim;
257cd70835SJames Wright   PetscMPIInt  rank;
267cd70835SJames Wright   PetscReal    checkrun[2] = {1};
277cd70835SJames Wright   size_t       dim_2[1]    = {2};
287cd70835SJames Wright 
297cd70835SJames Wright   PetscFunctionBeginUser;
307ebeccb9SJames Wright   PetscCall(HoneeGetSmartSimData(honee, &smartsim));
310c373b74SJames Wright   PetscCallMPI(MPI_Comm_rank(honee->comm, &rank));
327cd70835SJames Wright 
337cd70835SJames Wright   if (rank % smartsim->collocated_database_num_ranks == 0) {
347cd70835SJames Wright     // -- Send array that communicates when ML is done training
35ea615d4cSJames Wright     PetscCall(PetscLogEventBegin(HONEE_SmartRedis_Meta, 0, 0, 0, 0));
3643e9749fSJames Wright     PetscCallSmartRedis(put_tensor(smartsim->client, "check-run", 9, checkrun, dim_2, 1, SRTensorTypeDouble, SRMemLayoutContiguous));
377cd70835SJames Wright     PetscCall(SmartRedisVerifyPutTensor(smartsim->client, "check-run", 9));
38ea615d4cSJames Wright     PetscCall(PetscLogEventEnd(HONEE_SmartRedis_Meta, 0, 0, 0, 0));
397cd70835SJames Wright   }
40aa0b7f76SJames Wright   PetscFunctionReturn(PETSC_SUCCESS);
417cd70835SJames Wright }
427cd70835SJames Wright 
SmartSimSetup(Honee honee)437ebeccb9SJames Wright static PetscErrorCode SmartSimSetup(Honee honee) {
447cd70835SJames Wright   PetscMPIInt  rank;
457cd70835SJames Wright   PetscInt     num_orchestrator_nodes = 1;
467ebeccb9SJames Wright   SmartSimData smartsim;
477cd70835SJames Wright 
487cd70835SJames Wright   PetscFunctionBeginUser;
497ebeccb9SJames Wright   PetscCall(PetscNew(&smartsim));
507cd70835SJames Wright 
517cd70835SJames Wright   smartsim->collocated_database_num_ranks = 1;
520c373b74SJames Wright   PetscOptionsBegin(honee->comm, NULL, "Options for SmartSim integration", NULL);
537cd70835SJames Wright   PetscCall(PetscOptionsInt("-smartsim_collocated_database_num_ranks", "Number of ranks per collocated database instance", NULL,
547cd70835SJames Wright                             smartsim->collocated_database_num_ranks, &smartsim->collocated_database_num_ranks, NULL));
557cd70835SJames Wright   PetscOptionsEnd();
567cd70835SJames Wright 
577cd70835SJames Wright   // Create prefix to be put on tensor names
580c373b74SJames Wright   PetscCallMPI(MPI_Comm_rank(honee->comm, &rank));
594fa1625aSJames Wright   PetscCall(PetscSNPrintf(smartsim->rank_id_name, sizeof(smartsim->rank_id_name), "y.%d", rank));
607cd70835SJames Wright 
61ea615d4cSJames Wright   PetscCall(PetscLogEventBegin(HONEE_SmartRedis_Init, 0, 0, 0, 0));
6243e9749fSJames Wright   PetscCallSmartRedis(SmartRedisCClient(num_orchestrator_nodes != 1, smartsim->rank_id_name, strlen(smartsim->rank_id_name), &smartsim->client));
63ea615d4cSJames Wright   PetscCall(PetscLogEventEnd(HONEE_SmartRedis_Init, 0, 0, 0, 0));
647cd70835SJames Wright 
650c70a8bcSJames Wright   PetscCall(HoneeSetContainer(honee, SMARTSIM_KEY, smartsim, (PetscCtxDestroyFn *)SmartSimDataDestroy));
667ebeccb9SJames Wright 
670c373b74SJames Wright   PetscCall(SmartSimTrainingSetup(honee));
687cd70835SJames Wright   PetscFunctionReturn(PETSC_SUCCESS);
697cd70835SJames Wright }
70ec6e4151SJames Wright 
718fc6ab98SJames Wright /**
728fc6ab98SJames Wright   @brief Obtains the `SmartSimData` from the `Honee` object
738fc6ab98SJames Wright 
748fc6ab98SJames Wright   If `SmartSimData` has not already been initialized, this will initialize and create the struct.
758fc6ab98SJames Wright 
768fc6ab98SJames Wright   @param[in]  honee `Honee` object containing the SmartSim data
778fc6ab98SJames Wright   @param[out] smartsim `SmartSimData` containing the data
788fc6ab98SJames Wright **/
HoneeGetSmartSimData(Honee honee,SmartSimData * smartsim)797ebeccb9SJames Wright PetscErrorCode HoneeGetSmartSimData(Honee honee, SmartSimData *smartsim) {
800c70a8bcSJames Wright   PetscBool has_smartsim;
810c70a8bcSJames Wright 
827ebeccb9SJames Wright   PetscFunctionBeginUser;
830c70a8bcSJames Wright   PetscCall(HoneeHasContainer(honee, SMARTSIM_KEY, &has_smartsim));
840c70a8bcSJames Wright   if (!has_smartsim) PetscCall(SmartSimSetup(honee));
850c70a8bcSJames Wright   PetscCall(HoneeGetContainer(honee, SMARTSIM_KEY, smartsim));
867ebeccb9SJames Wright   PetscFunctionReturn(PETSC_SUCCESS);
877ebeccb9SJames Wright }
887ebeccb9SJames Wright 
898fc6ab98SJames Wright /**
908fc6ab98SJames Wright   @brief Checks if a tensor with `name` is in the SmartRedis database
918fc6ab98SJames Wright 
928fc6ab98SJames Wright   Function will error out if tensor does not exist.
938fc6ab98SJames Wright 
948fc6ab98SJames Wright   @param[in] c_client SmartRedis client object
958fc6ab98SJames Wright   @param[in] name Name of the tensor
968fc6ab98SJames Wright   @param[in] name_length Length of the tensor name
978fc6ab98SJames Wright   @return An error code: 0 - success, otherwise - failure
988fc6ab98SJames Wright **/
SmartRedisVerifyPutTensor(void * c_client,const char * name,const size_t name_length)99797f7eedSJames Wright PetscErrorCode SmartRedisVerifyPutTensor(void *c_client, const char *name, const size_t name_length) {
100797f7eedSJames Wright   bool does_exist = true;
101ec6e4151SJames Wright 
102797f7eedSJames Wright   PetscFunctionBeginUser;
103797f7eedSJames Wright   PetscCall(PetscLogEventBegin(HONEE_SmartRedis_Meta, 0, 0, 0, 0));
104797f7eedSJames Wright   PetscCallSmartRedis(tensor_exists(c_client, name, name_length, &does_exist));
105797f7eedSJames Wright   PetscCheck(does_exist, PETSC_COMM_SELF, -1, "Tensor of name '%s' was not written to the database successfully", name);
106797f7eedSJames Wright   PetscCall(PetscLogEventEnd(HONEE_SmartRedis_Meta, 0, 0, 0, 0));
107ec6e4151SJames Wright   PetscFunctionReturn(PETSC_SUCCESS);
108ec6e4151SJames Wright }
109