1 static char help[] = "Driver for benchmarking SpMV."; 2 3 #include <petscmat.h> 4 #include "cJSON.h" 5 #include "mmloader.h" 6 7 char *read_file(const char *filename) 8 { 9 FILE *file = NULL; 10 long length = 0; 11 char *content = NULL; 12 size_t read_chars = 0; 13 14 /* open in read binary mode */ 15 file = fopen(filename, "rb"); 16 if (file) { 17 /* get the length */ 18 fseek(file, 0, SEEK_END); 19 length = ftell(file); 20 fseek(file, 0, SEEK_SET); 21 /* allocate content buffer */ 22 content = (char *)malloc((size_t)length + sizeof("")); 23 /* read the file into memory */ 24 read_chars = fread(content, sizeof(char), (size_t)length, file); 25 content[read_chars] = '\0'; 26 fclose(file); 27 } 28 return content; 29 } 30 31 void write_file(const char *filename, const char *content) 32 { 33 FILE *file = NULL; 34 file = fopen(filename, "w"); 35 if (file) { fputs(content, file); } 36 fclose(file); 37 } 38 39 int ParseJSON(const char *const inputjsonfile, char ***outputfilenames, char ***outputgroupnames, char ***outputmatnames, int *nmat) 40 { 41 char *content = read_file(inputjsonfile); 42 cJSON *matrix_json = NULL; 43 const cJSON *problem = NULL, *elem = NULL; 44 const cJSON *item = NULL; 45 char **filenames, **groupnames, **matnames; 46 int i, n; 47 if (!content) return 0; 48 matrix_json = cJSON_Parse(content); 49 if (!matrix_json) return 0; 50 n = cJSON_GetArraySize(matrix_json); 51 *nmat = n; 52 filenames = (char **)malloc(sizeof(char *) * n); 53 groupnames = (char **)malloc(sizeof(char *) * n); 54 matnames = (char **)malloc(sizeof(char *) * n); 55 for (i = 0; i < n; i++) { 56 elem = cJSON_GetArrayItem(matrix_json, i); 57 item = cJSON_GetObjectItemCaseSensitive(elem, "filename"); 58 filenames[i] = (char *)malloc(sizeof(char) * (strlen(item->valuestring) + 1)); 59 strcpy(filenames[i], item->valuestring); 60 problem = cJSON_GetObjectItemCaseSensitive(elem, "problem"); 61 item = cJSON_GetObjectItemCaseSensitive(problem, "group"); 62 groupnames[i] = (char *)malloc(sizeof(char) * strlen(item->valuestring) + 1); 63 strcpy(groupnames[i], item->valuestring); 64 item = cJSON_GetObjectItemCaseSensitive(problem, "name"); 65 matnames[i] = (char *)malloc(sizeof(char) * strlen(item->valuestring) + 1); 66 strcpy(matnames[i], item->valuestring); 67 } 68 cJSON_Delete(matrix_json); 69 free(content); 70 *outputfilenames = filenames; 71 *outputgroupnames = groupnames; 72 *outputmatnames = matnames; 73 return 0; 74 } 75 76 int UpdateJSON(const char *const inputjsonfile, PetscReal *spmv_times, PetscReal starting_spmv_time, const char *const matformat, PetscBool use_gpu, PetscInt repetitions) 77 { 78 char *content = read_file(inputjsonfile); 79 cJSON *matrix_json = NULL; 80 cJSON *elem = NULL; 81 int i, n; 82 if (!content) return 0; 83 matrix_json = cJSON_Parse(content); 84 if (!matrix_json) return 0; 85 n = cJSON_GetArraySize(matrix_json); 86 for (i = 0; i < n; i++) { 87 cJSON *spmv = NULL; 88 cJSON *format = NULL; 89 elem = cJSON_GetArrayItem(matrix_json, i); 90 spmv = cJSON_GetObjectItem(elem, "spmv"); 91 if (spmv) { 92 format = cJSON_GetObjectItem(spmv, matformat); 93 if (format) { 94 cJSON_SetNumberValue(cJSON_GetObjectItem(format, "time"), (spmv_times[i] - ((i == 0) ? starting_spmv_time : spmv_times[i - 1])) / repetitions); 95 cJSON_SetIntValue(cJSON_GetObjectItem(format, "repetitions"), repetitions); 96 } else { 97 format = cJSON_CreateObject(); 98 cJSON_AddItemToObject(spmv, matformat, format); 99 cJSON_AddNumberToObject(format, "time", (spmv_times[i] - ((i == 0) ? starting_spmv_time : spmv_times[i - 1])) / repetitions); 100 cJSON_AddNumberToObject(format, "repetitions", repetitions); 101 } 102 } else { 103 spmv = cJSON_CreateObject(); 104 cJSON_AddItemToObject(elem, "spmv", spmv); 105 format = cJSON_CreateObject(); 106 cJSON_AddItemToObject(spmv, matformat, format); 107 cJSON_AddNumberToObject(format, "time", (spmv_times[i] - ((i == 0) ? starting_spmv_time : spmv_times[i - 1])) / repetitions); 108 cJSON_AddNumberToObject(format, "repetitions", repetitions); 109 } 110 } 111 free(content); 112 content = cJSON_Print(matrix_json); 113 write_file(inputjsonfile, content); 114 cJSON_Delete(matrix_json); 115 free(content); 116 return 0; 117 } 118 119 /* 120 For GPU formats, we keep two copies of the matrix on CPU and one copy on GPU. 121 The extra CPU copy allows us to destroy the GPU matrix and recreate it efficiently 122 in each repetition. As a result, each MatMult call is fresh, and we can capture 123 the first-time overhead (e.g. of CuSparse SpMV), and avoids the cache effect 124 during consecutive calls. 125 */ 126 PetscErrorCode TimedSpMV(Mat A, Vec b, PetscReal *time, const char *petscmatformat, PetscBool use_gpu, PetscInt repetitions) 127 { 128 Mat A2 = NULL; 129 PetscInt i; 130 Vec u; 131 PetscLogDouble vstart = 0, vend = 0; 132 PetscBool isaijcusparse, isaijhipsparse, isaijkokkos, issellcuda, issellhip; 133 134 PetscFunctionBeginUser; 135 PetscCall(PetscStrcmp(petscmatformat, MATAIJCUSPARSE, &isaijcusparse)); 136 PetscCall(PetscStrcmp(petscmatformat, MATAIJHIPSPARSE, &isaijhipsparse)); 137 PetscCall(PetscStrcmp(petscmatformat, MATAIJKOKKOS, &isaijkokkos)); 138 PetscCall(PetscStrcmp(petscmatformat, MATSELLCUDA, &issellcuda)); 139 PetscCall(PetscStrcmp(petscmatformat, MATSELLHIP, &issellhip)); 140 if (isaijcusparse || issellcuda) PetscCall(VecSetType(b, VECCUDA)); 141 if (isaijkokkos) PetscCall(VecSetType(b, VECKOKKOS)); 142 if (isaijhipsparse || issellhip) PetscCall(VecSetType(b, VECHIP)); 143 PetscCall(VecDuplicate(b, &u)); 144 if (time) *time = 0.0; 145 for (i = 0; i < repetitions; i++) { 146 if (use_gpu) { 147 PetscCall(MatDestroy(&A2)); 148 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &A2)); 149 PetscCall(MatSetType(A2, petscmatformat)); 150 PetscCall(MatSetFromOptions(A2)); // This allows to change parameters such as slice height in SpMV kernels for SELL 151 } else A2 = A; 152 /* Timing MatMult */ 153 if (time) PetscCall(PetscTime(&vstart)); 154 155 PetscCall(MatMult(A2, b, u)); 156 157 if (time) { 158 PetscCall(PetscTime(&vend)); 159 *time += (PetscReal)(vend - vstart); 160 } 161 } 162 PetscCall(VecDestroy(&u)); 163 if (repetitions > 0 && use_gpu) PetscCall(MatDestroy(&A2)); 164 PetscFunctionReturn(PETSC_SUCCESS); 165 } 166 167 PetscErrorCode WarmUpDevice(Mat A, Vec b, const char *petscmatformat) 168 { 169 Mat A2 = NULL; 170 PetscLogEvent event; 171 Vec u; 172 PetscBool isaijcusparse, isaijhipsparse, isaijkokkos, issellcuda, issellhip; 173 174 PetscFunctionBeginUser; 175 PetscCall(PetscStrcmp(petscmatformat, MATAIJCUSPARSE, &isaijcusparse)); 176 PetscCall(PetscStrcmp(petscmatformat, MATAIJHIPSPARSE, &isaijhipsparse)); 177 PetscCall(PetscStrcmp(petscmatformat, MATAIJKOKKOS, &isaijkokkos)); 178 PetscCall(PetscStrcmp(petscmatformat, MATSELLCUDA, &issellcuda)); 179 PetscCall(PetscStrcmp(petscmatformat, MATSELLHIP, &issellhip)); 180 if (!isaijcusparse && !isaijkokkos && !isaijhipsparse && !issellcuda && !issellhip) PetscFunctionReturn(PETSC_SUCCESS); 181 if (isaijcusparse || issellcuda) PetscCall(VecSetType(b, VECCUDA)); 182 if (isaijkokkos) PetscCall(VecSetType(b, VECKOKKOS)); 183 if (isaijhipsparse || issellhip) PetscCall(VecSetType(b, VECHIP)); 184 PetscCall(VecDuplicate(b, &u)); 185 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &A2)); 186 PetscCall(MatSetType(A2, petscmatformat)); 187 PetscCall(PetscLogEventGetId("MatMult", &event)); 188 PetscCall(PetscLogEventDeactivatePush(event)); 189 PetscCall(MatMult(A2, b, u)); 190 PetscCall(PetscLogEventDeactivatePop(event)); 191 PetscCall(VecDestroy(&u)); 192 PetscCall(MatDestroy(&A2)); 193 PetscFunctionReturn(PETSC_SUCCESS); 194 } 195 196 PetscErrorCode PetscLogSpMVTime(PetscReal *gputime, PetscReal *cputime, PetscReal *gpuflops, const char *petscmatformat) 197 { 198 PetscLogEvent event; 199 PetscEventPerfInfo eventInfo; 200 // PetscReal gpuflopRate; 201 202 // if (matformat) { 203 // PetscCall(PetscLogEventGetId("MatCUDACopyTo", &event)); 204 // } else { 205 // PetscCall(PetscLogEventGetId("MatCUSPARSCopyTo", &event)); 206 // } 207 // PetscCall(PetscLogEventGetPerfInfo(PETSC_DETERMINE, event, &eventInfo)); 208 // PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%.4e ", eventInfo.time)); 209 210 PetscFunctionBeginUser; 211 PetscCall(PetscLogEventGetId("MatMult", &event)); 212 PetscCall(PetscLogEventGetPerfInfo(PETSC_DETERMINE, event, &eventInfo)); 213 // gpuflopRate = eventInfo.GpuFlops/eventInfo.GpuTime; 214 // PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%.2f %.4e %.4e\n", gpuflopRate/1.e6, eventInfo.GpuTime, eventInfo.time)); 215 if (cputime) *cputime = eventInfo.time; 216 #if defined(PETSC_HAVE_DEVICE) 217 if (gputime) *gputime = eventInfo.GpuTime; 218 if (gpuflops) *gpuflops = eventInfo.GpuFlops / 1.e6; 219 #endif 220 PetscFunctionReturn(PETSC_SUCCESS); 221 } 222 223 PetscErrorCode MapToPetscMatType(const char *matformat, PetscBool use_gpu, char **petscmatformat) 224 { 225 PetscBool iscsr, issell, iscsrkokkos; 226 227 PetscFunctionBeginUser; 228 PetscCall(PetscStrcmp(matformat, "csr", &iscsr)); 229 if (iscsr) { 230 if (use_gpu) { 231 #if defined(PETSC_HAVE_CUDA) 232 PetscCall(PetscStrallocpy(MATAIJCUSPARSE, petscmatformat)); 233 #endif 234 #if defined(PETSC_HAVE_HIP) 235 PetscCall(PetscStrallocpy(MATAIJHIPSPARSE, petscmatformat)); 236 #endif 237 } else PetscCall(PetscStrallocpy(MATAIJ, petscmatformat)); 238 } else { 239 PetscCall(PetscStrcmp(matformat, "sell", &issell)); 240 if (issell) { 241 if (use_gpu) { 242 #if defined(PETSC_HAVE_CUDA) 243 PetscCall(PetscStrallocpy(MATSELLCUDA, petscmatformat)); 244 #endif 245 #if defined(PETSC_HAVE_HIP) 246 PetscCall(PetscStrallocpy(MATSELLHIP, petscmatformat)); 247 #endif 248 } else PetscCall(PetscStrallocpy(MATSELL, petscmatformat)); 249 } else { 250 PetscCall(PetscStrcmp(matformat, "csrkokkos", &iscsrkokkos)); 251 if (iscsrkokkos) PetscCall(PetscStrallocpy(MATAIJKOKKOS, petscmatformat)); 252 } 253 } 254 PetscFunctionReturn(PETSC_SUCCESS); 255 } 256 257 int main(int argc, char **args) 258 { 259 PetscInt nmat = 1, nformats = 5, i, j, repetitions = 1; 260 Mat A; 261 Vec b; 262 char jfilename[PETSC_MAX_PATH_LEN]; 263 char filename[PETSC_MAX_PATH_LEN], bfilename[PETSC_MAX_PATH_LEN]; 264 char groupname[PETSC_MAX_PATH_LEN], matname[PETSC_MAX_PATH_LEN]; 265 char *matformats[5]; 266 char **filenames = NULL, **groupnames = NULL, **matnames = NULL; 267 char ordering[256] = MATORDERINGRCM; 268 PetscBool bflg, flg1, flg2, flg3, use_gpu = PETSC_FALSE, permute = PETSC_FALSE; 269 IS rowperm = NULL, colperm = NULL; 270 PetscViewer fd; 271 PetscReal starting_spmv_time = 0, *spmv_times; 272 273 PetscCall(PetscOptionsInsertString(NULL, "-log_view_gpu_time -log_view :/dev/null")); 274 PetscCall(PetscInitialize(&argc, &args, (char *)0, help)); 275 PetscCall(PetscOptionsGetStringArray(NULL, NULL, "-formats", matformats, &nformats, &flg1)); 276 if (!flg1) { 277 nformats = 1; 278 PetscCall(PetscStrallocpy("csr", &matformats[0])); 279 } 280 PetscCall(PetscOptionsGetBool(NULL, NULL, "-use_gpu", &use_gpu, NULL)); 281 PetscCall(PetscOptionsGetInt(NULL, NULL, "-repetitions", &repetitions, NULL)); 282 /* Read matrix and RHS */ 283 PetscCall(PetscOptionsGetString(NULL, NULL, "-groupname", groupname, PETSC_MAX_PATH_LEN, NULL)); 284 PetscCall(PetscOptionsGetString(NULL, NULL, "-matname", matname, PETSC_MAX_PATH_LEN, NULL)); 285 PetscCall(PetscOptionsGetString(NULL, NULL, "-ABIN", filename, PETSC_MAX_PATH_LEN, &flg1)); 286 PetscCall(PetscOptionsGetString(NULL, NULL, "-AMTX", filename, PETSC_MAX_PATH_LEN, &flg2)); 287 PetscCall(PetscOptionsGetString(NULL, NULL, "-AJSON", jfilename, PETSC_MAX_PATH_LEN, &flg3)); 288 PetscOptionsBegin(PETSC_COMM_WORLD, NULL, "Extra options", ""); 289 PetscCall(PetscOptionsFList("-permute", "Permute matrix and vector to solving in new ordering", "", MatOrderingList, ordering, ordering, sizeof(ordering), &permute)); 290 PetscOptionsEnd(); 291 #if !defined(PETSC_HAVE_DEVICE) 292 PetscCheck(!use_gpu, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "To use the option -use_gpu 1, PETSc must be configured with GPU support"); 293 #endif 294 PetscCheck(flg1 || flg2 || flg3, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate an input file with the -ABIN or -AMTX or -AJSON depending on the file format"); 295 if (flg3) { 296 ParseJSON(jfilename, &filenames, &groupnames, &matnames, &nmat); 297 PetscCall(PetscCalloc1(nmat, &spmv_times)); 298 } else if (flg2) { 299 PetscCall(MatCreateFromMTX(&A, filename, PETSC_TRUE)); 300 } else if (flg1) { 301 PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, filename, FILE_MODE_READ, &fd)); 302 PetscCall(MatCreate(PETSC_COMM_WORLD, &A)); 303 PetscCall(MatSetType(A, MATAIJ)); 304 PetscCall(MatSetFromOptions(A)); 305 PetscCall(MatLoad(A, fd)); 306 PetscCall(PetscViewerDestroy(&fd)); 307 } 308 if (permute) { 309 Mat Aperm; 310 PetscCall(MatGetOrdering(A, ordering, &rowperm, &colperm)); 311 PetscCall(MatPermute(A, rowperm, colperm, &Aperm)); 312 PetscCall(MatDestroy(&A)); 313 A = Aperm; /* Replace original operator with permuted version */ 314 } 315 /* Let the vec object trigger the first CUDA call, which takes a relatively long time to init CUDA */ 316 PetscCall(PetscOptionsGetString(NULL, NULL, "-b", bfilename, PETSC_MAX_PATH_LEN, &bflg)); 317 if (bflg) { 318 PetscViewer fb; 319 PetscCall(VecCreate(PETSC_COMM_WORLD, &b)); 320 PetscCall(VecSetFromOptions(b)); 321 PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, bfilename, FILE_MODE_READ, &fb)); 322 PetscCall(VecLoad(b, fb)); 323 PetscCall(PetscViewerDestroy(&fb)); 324 } 325 326 for (j = 0; j < nformats; j++) { 327 char *petscmatformat = NULL; 328 PetscCall(MapToPetscMatType(matformats[j], use_gpu, &petscmatformat)); 329 PetscCheck(petscmatformat, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Invalid mat format %s, supported options include csr and sell.", matformats[j]); 330 if (flg3) { // mat names specified in a JSON file 331 for (i = 0; i < nmat; i++) { 332 PetscCall(MatCreateFromMTX(&A, filenames[i], PETSC_TRUE)); 333 if (!bflg) { 334 PetscCall(MatCreateVecs(A, &b, NULL)); 335 PetscCall(VecSet(b, 1.0)); 336 } 337 if (use_gpu) PetscCall(WarmUpDevice(A, b, petscmatformat)); 338 PetscCall(TimedSpMV(A, b, NULL, petscmatformat, use_gpu, repetitions)); 339 if (use_gpu) PetscCall(PetscLogSpMVTime(&spmv_times[i], NULL, NULL, petscmatformat)); 340 else PetscCall(PetscLogSpMVTime(NULL, &spmv_times[i], NULL, petscmatformat)); 341 PetscCall(MatDestroy(&A)); 342 if (!bflg) PetscCall(VecDestroy(&b)); 343 } 344 UpdateJSON(jfilename, spmv_times, starting_spmv_time, matformats[j], use_gpu, repetitions); 345 starting_spmv_time = spmv_times[nmat - 1]; 346 } else { 347 PetscReal spmv_time; 348 if (!bflg) { 349 PetscCall(MatCreateVecs(A, &b, NULL)); 350 PetscCall(VecSet(b, 1.0)); 351 } 352 if (use_gpu) PetscCall(WarmUpDevice(A, b, petscmatformat)); 353 PetscCall(TimedSpMV(A, b, &spmv_time, petscmatformat, use_gpu, repetitions)); 354 if (!bflg) PetscCall(VecDestroy(&b)); 355 } 356 PetscCall(PetscFree(petscmatformat)); 357 } 358 if (flg3) { 359 for (i = 0; i < nmat; i++) { 360 free(filenames[i]); 361 free(groupnames[i]); 362 free(matnames[i]); 363 } 364 free(filenames); 365 free(groupnames); 366 free(matnames); 367 PetscCall(PetscFree(spmv_times)); 368 } 369 for (j = 0; j < nformats; j++) PetscCall(PetscFree(matformats[j])); 370 if (flg1 || flg2) PetscCall(MatDestroy(&A)); 371 if (bflg) PetscCall(VecDestroy(&b)); 372 PetscCall(ISDestroy(&rowperm)); 373 PetscCall(ISDestroy(&colperm)); 374 PetscCall(PetscFinalize()); 375 return 0; 376 } 377 /*TEST 378 379 build: 380 requires: !complex double !windows_compilers !defined(PETSC_USE_64BIT_INDICES) 381 depends: mmloader.c mmio.c cJSON.c 382 383 test: 384 suffix: 1 385 args: -AMTX ${wPETSC_DIR}/share/petsc/datafiles/matrices/amesos2_test_mat0.mtx 386 387 test: 388 suffix: 2 389 args:-AMTX ${wPETSC_DIR}/share/petsc/datafiles/matrices/amesos2_test_mat0.mtx -use_gpu 390 output_file: output/bench_spmv_1.out 391 requires: cuda 392 393 test: 394 suffix: 3 395 args:-AMTX ${wPETSC_DIR}/share/petsc/datafiles/matrices/amesos2_test_mat0.mtx -use_gpu 396 output_file: output/bench_spmv_1.out 397 requires: hip 398 399 TEST*/ 400