1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Memory bandwidth monitoring and allocation library 4 * 5 * Copyright (C) 2018 Intel Corporation 6 * 7 * Authors: 8 * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>, 9 * Fenghua Yu <fenghua.yu@intel.com> 10 */ 11 #include "resctrl.h" 12 13 #define UNCORE_IMC "uncore_imc" 14 #define READ_FILE_NAME "cas_count_read" 15 #define DYN_PMU_PATH "/sys/bus/event_source/devices" 16 #define SCALE 0.00006103515625 17 #define MAX_IMCS 40 18 #define MAX_TOKENS 5 19 20 #define CON_MBM_LOCAL_BYTES_PATH \ 21 "%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes" 22 23 struct membw_read_format { 24 __u64 value; /* The value of the event */ 25 __u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ 26 __u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ 27 __u64 id; /* if PERF_FORMAT_ID */ 28 }; 29 30 struct imc_counter_config { 31 __u32 type; 32 __u64 event; 33 __u64 umask; 34 struct perf_event_attr pe; 35 int fd; 36 }; 37 38 static char mbm_total_path[1024]; 39 static int imcs; 40 static struct imc_counter_config imc_counters_config[MAX_IMCS]; 41 static const struct resctrl_test *current_test; 42 43 static void read_mem_bw_initialize_perf_event_attr(int i) 44 { 45 memset(&imc_counters_config[i].pe, 0, 46 sizeof(struct perf_event_attr)); 47 imc_counters_config[i].pe.type = imc_counters_config[i].type; 48 imc_counters_config[i].pe.size = sizeof(struct perf_event_attr); 49 imc_counters_config[i].pe.disabled = 1; 50 imc_counters_config[i].pe.inherit = 1; 51 imc_counters_config[i].pe.exclude_guest = 0; 52 imc_counters_config[i].pe.config = 53 imc_counters_config[i].umask << 8 | 54 imc_counters_config[i].event; 55 imc_counters_config[i].pe.sample_type = PERF_SAMPLE_IDENTIFIER; 56 imc_counters_config[i].pe.read_format = 57 PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; 58 } 59 60 static void read_mem_bw_ioctl_perf_event_ioc_reset_enable(int i) 61 { 62 ioctl(imc_counters_config[i].fd, PERF_EVENT_IOC_RESET, 0); 63 ioctl(imc_counters_config[i].fd, PERF_EVENT_IOC_ENABLE, 0); 64 } 65 66 static void read_mem_bw_ioctl_perf_event_ioc_disable(int i) 67 { 68 ioctl(imc_counters_config[i].fd, PERF_EVENT_IOC_DISABLE, 0); 69 } 70 71 /* 72 * get_read_event_and_umask: Parse config into event and umask 73 * @cas_count_cfg: Config 74 * @count: iMC number 75 */ 76 static void get_read_event_and_umask(char *cas_count_cfg, unsigned int count) 77 { 78 char *token[MAX_TOKENS]; 79 int i = 0; 80 81 token[0] = strtok(cas_count_cfg, "=,"); 82 83 for (i = 1; i < MAX_TOKENS; i++) 84 token[i] = strtok(NULL, "=,"); 85 86 for (i = 0; i < MAX_TOKENS - 1; i++) { 87 if (!token[i]) 88 break; 89 if (strcmp(token[i], "event") == 0) 90 imc_counters_config[count].event = strtol(token[i + 1], NULL, 16); 91 if (strcmp(token[i], "umask") == 0) 92 imc_counters_config[count].umask = strtol(token[i + 1], NULL, 16); 93 } 94 } 95 96 static int open_perf_read_event(int i, int cpu_no) 97 { 98 imc_counters_config[i].fd = 99 perf_event_open(&imc_counters_config[i].pe, -1, cpu_no, -1, 100 PERF_FLAG_FD_CLOEXEC); 101 102 if (imc_counters_config[i].fd == -1) { 103 fprintf(stderr, "Error opening leader %llx\n", 104 imc_counters_config[i].pe.config); 105 106 return -1; 107 } 108 109 return 0; 110 } 111 112 static int parse_imc_read_bw_events(char *imc_dir, unsigned int type, 113 unsigned int *count) 114 { 115 char imc_events_dir[PATH_MAX], imc_counter_cfg[PATH_MAX]; 116 unsigned int orig_count = *count; 117 char cas_count_cfg[1024]; 118 struct dirent *ep; 119 int path_len; 120 int ret = -1; 121 int num_cfg; 122 FILE *fp; 123 DIR *dp; 124 125 path_len = snprintf(imc_events_dir, sizeof(imc_events_dir), "%sevents", 126 imc_dir); 127 if (path_len >= sizeof(imc_events_dir)) { 128 ksft_print_msg("Unable to create path to %sevents\n", imc_dir); 129 return -1; 130 } 131 132 dp = opendir(imc_events_dir); 133 if (!dp) { 134 ksft_perror("Unable to open PMU events directory"); 135 return -1; 136 } 137 138 while ((ep = readdir(dp))) { 139 /* 140 * Parse all event files with READ_FILE_NAME prefix that 141 * contain the event number and umask. Skip files containing 142 * "." that contain unused properties of event. 143 */ 144 if (!strstr(ep->d_name, READ_FILE_NAME) || 145 strchr(ep->d_name, '.')) 146 continue; 147 148 path_len = snprintf(imc_counter_cfg, sizeof(imc_counter_cfg), 149 "%s/%s", imc_events_dir, ep->d_name); 150 if (path_len >= sizeof(imc_counter_cfg)) { 151 ksft_print_msg("Unable to create path to %s/%s\n", 152 imc_events_dir, ep->d_name); 153 goto out_close; 154 } 155 fp = fopen(imc_counter_cfg, "r"); 156 if (!fp) { 157 ksft_perror("Failed to open iMC config file"); 158 goto out_close; 159 } 160 num_cfg = fscanf(fp, "%1023s", cas_count_cfg); 161 fclose(fp); 162 if (num_cfg <= 0) { 163 ksft_perror("Could not get iMC cas count read"); 164 goto out_close; 165 } 166 if (*count >= MAX_IMCS) { 167 ksft_print_msg("Maximum iMC count exceeded\n"); 168 goto out_close; 169 } 170 171 imc_counters_config[*count].type = type; 172 get_read_event_and_umask(cas_count_cfg, *count); 173 /* Do not fail after incrementing *count. */ 174 *count += 1; 175 } 176 if (*count == orig_count) { 177 ksft_print_msg("Unable to find events in %s\n", imc_events_dir); 178 goto out_close; 179 } 180 ret = 0; 181 out_close: 182 closedir(dp); 183 return ret; 184 } 185 186 /* Get type and config of an iMC counter's read event. */ 187 static int read_from_imc_dir(char *imc_dir, unsigned int *count) 188 { 189 char imc_counter_type[PATH_MAX]; 190 unsigned int type; 191 int path_len; 192 FILE *fp; 193 int ret; 194 195 /* Get type of iMC counter */ 196 path_len = snprintf(imc_counter_type, sizeof(imc_counter_type), 197 "%s%s", imc_dir, "type"); 198 if (path_len >= sizeof(imc_counter_type)) { 199 ksft_print_msg("Unable to create path to %s%s\n", 200 imc_dir, "type"); 201 return -1; 202 } 203 fp = fopen(imc_counter_type, "r"); 204 if (!fp) { 205 ksft_perror("Failed to open iMC counter type file"); 206 207 return -1; 208 } 209 ret = fscanf(fp, "%u", &type); 210 fclose(fp); 211 if (ret <= 0) { 212 ksft_perror("Could not get iMC type"); 213 return -1; 214 } 215 ret = parse_imc_read_bw_events(imc_dir, type, count); 216 if (ret) { 217 ksft_print_msg("Unable to parse bandwidth event and umask\n"); 218 return ret; 219 } 220 221 return 0; 222 } 223 224 /* 225 * A system can have 'n' number of iMC (Integrated Memory Controller) 226 * counters, get that 'n'. Discover the properties of the available 227 * counters in support of needed performance measurement via perf. 228 * For each iMC counter get it's type and config. Also obtain each 229 * counter's event and umask for the memory read events that will be 230 * measured. 231 * 232 * Enumerate all these details into an array of structures. 233 * 234 * Return: >= 0 on success. < 0 on failure. 235 */ 236 static int num_of_imcs(void) 237 { 238 char imc_dir[512], *temp; 239 unsigned int count = 0; 240 struct dirent *ep; 241 int ret; 242 DIR *dp; 243 244 dp = opendir(DYN_PMU_PATH); 245 if (dp) { 246 while ((ep = readdir(dp))) { 247 temp = strstr(ep->d_name, UNCORE_IMC); 248 if (!temp) 249 continue; 250 251 /* 252 * imc counters are named as "uncore_imc_<n>", hence 253 * increment the pointer to point to <n>. Note that 254 * sizeof(UNCORE_IMC) would count for null character as 255 * well and hence the last underscore character in 256 * uncore_imc'_' need not be counted. 257 */ 258 temp = temp + sizeof(UNCORE_IMC); 259 260 /* 261 * Some directories under "DYN_PMU_PATH" could have 262 * names like "uncore_imc_free_running", hence, check if 263 * first character is a numerical digit or not. 264 */ 265 if (temp[0] >= '0' && temp[0] <= '9') { 266 sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, 267 ep->d_name); 268 ret = read_from_imc_dir(imc_dir, &count); 269 if (ret) { 270 closedir(dp); 271 272 return ret; 273 } 274 } 275 } 276 closedir(dp); 277 if (count == 0) { 278 ksft_print_msg("Unable to find iMC counters\n"); 279 280 return -1; 281 } 282 } else { 283 ksft_perror("Unable to open PMU directory"); 284 285 return -1; 286 } 287 288 return count; 289 } 290 291 int initialize_read_mem_bw_imc(void) 292 { 293 int imc; 294 295 imcs = num_of_imcs(); 296 if (imcs <= 0) 297 return imcs; 298 299 /* Initialize perf_event_attr structures for all iMC's */ 300 for (imc = 0; imc < imcs; imc++) 301 read_mem_bw_initialize_perf_event_attr(imc); 302 303 return 0; 304 } 305 306 static void perf_close_imc_read_mem_bw(void) 307 { 308 int mc; 309 310 for (mc = 0; mc < imcs; mc++) { 311 if (imc_counters_config[mc].fd != -1) 312 close(imc_counters_config[mc].fd); 313 } 314 } 315 316 /* 317 * perf_open_imc_read_mem_bw - Open perf fds for IMCs 318 * @cpu_no: CPU number that the benchmark PID is bound to 319 * 320 * Return: = 0 on success. < 0 on failure. 321 */ 322 static int perf_open_imc_read_mem_bw(int cpu_no) 323 { 324 int imc, ret; 325 326 for (imc = 0; imc < imcs; imc++) 327 imc_counters_config[imc].fd = -1; 328 329 for (imc = 0; imc < imcs; imc++) { 330 ret = open_perf_read_event(imc, cpu_no); 331 if (ret) 332 goto close_fds; 333 } 334 335 return 0; 336 337 close_fds: 338 perf_close_imc_read_mem_bw(); 339 return -1; 340 } 341 342 /* 343 * do_imc_read_mem_bw_test - Perform memory bandwidth test 344 * 345 * Runs memory bandwidth test over one second period. Also, handles starting 346 * and stopping of the IMC perf counters around the test. 347 */ 348 static void do_imc_read_mem_bw_test(void) 349 { 350 int imc; 351 352 for (imc = 0; imc < imcs; imc++) 353 read_mem_bw_ioctl_perf_event_ioc_reset_enable(imc); 354 355 sleep(1); 356 357 /* Stop counters after a second to get results. */ 358 for (imc = 0; imc < imcs; imc++) 359 read_mem_bw_ioctl_perf_event_ioc_disable(imc); 360 } 361 362 /* 363 * get_read_mem_bw_imc - Memory read bandwidth as reported by iMC counters 364 * 365 * Memory read bandwidth utilized by a process on a socket can be calculated 366 * using iMC counters' read events. Perf events are used to read these 367 * counters. 368 * 369 * Return: = 0 on success. < 0 on failure. 370 */ 371 static int get_read_mem_bw_imc(float *bw_imc) 372 { 373 float reads = 0, of_mul_read = 1; 374 int imc; 375 376 /* 377 * Log read event values from all iMC counters into 378 * struct imc_counter_config. 379 * Take overflow into consideration before calculating total bandwidth. 380 */ 381 for (imc = 0; imc < imcs; imc++) { 382 struct membw_read_format measurement; 383 struct imc_counter_config *r = 384 &imc_counters_config[imc]; 385 386 if (read(r->fd, &measurement, sizeof(measurement)) == -1) { 387 ksft_perror("Couldn't get read bandwidth through iMC"); 388 return -1; 389 } 390 391 __u64 r_time_enabled = measurement.time_enabled; 392 __u64 r_time_running = measurement.time_running; 393 394 if (r_time_enabled != r_time_running) 395 of_mul_read = (float)r_time_enabled / 396 (float)r_time_running; 397 398 reads += measurement.value * of_mul_read * SCALE; 399 } 400 401 *bw_imc = reads; 402 return 0; 403 } 404 405 /* 406 * initialize_mem_bw_resctrl: Appropriately populate "mbm_total_path" 407 * @param: Parameters passed to resctrl_val() 408 * @domain_id: Domain ID (cache ID; for MB, L3 cache ID) 409 */ 410 void initialize_mem_bw_resctrl(const struct resctrl_val_param *param, 411 int domain_id) 412 { 413 sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH, 414 param->ctrlgrp, domain_id); 415 } 416 417 /* 418 * Open file to read MBM local bytes from resctrl FS 419 */ 420 static FILE *open_mem_bw_resctrl(const char *mbm_bw_file) 421 { 422 FILE *fp; 423 424 fp = fopen(mbm_bw_file, "r"); 425 if (!fp) 426 ksft_perror("Failed to open total memory bandwidth file"); 427 428 return fp; 429 } 430 431 /* 432 * Get MBM Local bytes as reported by resctrl FS 433 */ 434 static int get_mem_bw_resctrl(FILE *fp, unsigned long *mbm_total) 435 { 436 if (fscanf(fp, "%lu\n", mbm_total) <= 0) { 437 ksft_perror("Could not get MBM local bytes"); 438 return -1; 439 } 440 return 0; 441 } 442 443 static pid_t bm_pid; 444 445 void ctrlc_handler(int signum, siginfo_t *info, void *ptr) 446 { 447 /* Only kill child after bm_pid is set after fork() */ 448 if (bm_pid) 449 kill(bm_pid, SIGKILL); 450 umount_resctrlfs(); 451 if (current_test && current_test->cleanup) 452 current_test->cleanup(); 453 ksft_print_msg("Ending\n\n"); 454 455 exit(EXIT_SUCCESS); 456 } 457 458 /* 459 * Register CTRL-C handler for parent, as it has to kill 460 * child process before exiting. 461 */ 462 int signal_handler_register(const struct resctrl_test *test) 463 { 464 struct sigaction sigact = {}; 465 int ret = 0; 466 467 bm_pid = 0; 468 469 current_test = test; 470 sigact.sa_sigaction = ctrlc_handler; 471 sigemptyset(&sigact.sa_mask); 472 sigact.sa_flags = SA_SIGINFO; 473 if (sigaction(SIGINT, &sigact, NULL) || 474 sigaction(SIGTERM, &sigact, NULL) || 475 sigaction(SIGHUP, &sigact, NULL)) { 476 ksft_perror("sigaction"); 477 ret = -1; 478 } 479 return ret; 480 } 481 482 /* 483 * Reset signal handler to SIG_DFL. 484 * Non-Value return because the caller should keep 485 * the error code of other path even if sigaction fails. 486 */ 487 void signal_handler_unregister(void) 488 { 489 struct sigaction sigact = {}; 490 491 current_test = NULL; 492 sigact.sa_handler = SIG_DFL; 493 sigemptyset(&sigact.sa_mask); 494 if (sigaction(SIGINT, &sigact, NULL) || 495 sigaction(SIGTERM, &sigact, NULL) || 496 sigaction(SIGHUP, &sigact, NULL)) { 497 ksft_perror("sigaction"); 498 } 499 } 500 501 /* 502 * print_results_bw: the memory bandwidth results are stored in a file 503 * @filename: file that stores the results 504 * @bm_pid: child pid that runs benchmark 505 * @bw_imc: perf imc counter value 506 * @bw_resc: memory bandwidth value 507 * 508 * Return: 0 on success, < 0 on error. 509 */ 510 static int print_results_bw(char *filename, pid_t bm_pid, float bw_imc, 511 unsigned long bw_resc) 512 { 513 unsigned long diff = fabs(bw_imc - bw_resc); 514 FILE *fp; 515 516 if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) { 517 printf("Pid: %d \t Mem_BW_iMC: %f \t ", (int)bm_pid, bw_imc); 518 printf("Mem_BW_resc: %lu \t Difference: %lu\n", bw_resc, diff); 519 } else { 520 fp = fopen(filename, "a"); 521 if (!fp) { 522 ksft_perror("Cannot open results file"); 523 524 return -1; 525 } 526 if (fprintf(fp, "Pid: %d \t Mem_BW_iMC: %f \t Mem_BW_resc: %lu \t Difference: %lu\n", 527 (int)bm_pid, bw_imc, bw_resc, diff) <= 0) { 528 ksft_print_msg("Could not log results\n"); 529 fclose(fp); 530 531 return -1; 532 } 533 fclose(fp); 534 } 535 536 return 0; 537 } 538 539 /* 540 * measure_read_mem_bw - Measures read memory bandwidth numbers while benchmark runs 541 * @uparams: User supplied parameters 542 * @param: Parameters passed to resctrl_val() 543 * @bm_pid: PID that runs the benchmark 544 * 545 * Measure memory bandwidth from resctrl and from another source which is 546 * perf imc value or could be something else if perf imc event is not 547 * available. Compare the two values to validate resctrl value. It takes 548 * 1 sec to measure the data. 549 * resctrl does not distinguish between read and write operations so 550 * its data includes all memory operations. 551 */ 552 int measure_read_mem_bw(const struct user_params *uparams, 553 struct resctrl_val_param *param, pid_t bm_pid) 554 { 555 unsigned long bw_resc, bw_resc_start, bw_resc_end; 556 FILE *mem_bw_fp; 557 float bw_imc; 558 int ret; 559 560 mem_bw_fp = open_mem_bw_resctrl(mbm_total_path); 561 if (!mem_bw_fp) 562 return -1; 563 564 ret = perf_open_imc_read_mem_bw(uparams->cpu); 565 if (ret < 0) 566 goto close_fp; 567 568 ret = get_mem_bw_resctrl(mem_bw_fp, &bw_resc_start); 569 if (ret < 0) 570 goto close_imc; 571 572 rewind(mem_bw_fp); 573 574 do_imc_read_mem_bw_test(); 575 576 ret = get_mem_bw_resctrl(mem_bw_fp, &bw_resc_end); 577 if (ret < 0) 578 goto close_imc; 579 580 ret = get_read_mem_bw_imc(&bw_imc); 581 if (ret < 0) 582 goto close_imc; 583 584 perf_close_imc_read_mem_bw(); 585 fclose(mem_bw_fp); 586 587 bw_resc = (bw_resc_end - bw_resc_start) / MB; 588 589 return print_results_bw(param->filename, bm_pid, bw_imc, bw_resc); 590 591 close_imc: 592 perf_close_imc_read_mem_bw(); 593 close_fp: 594 fclose(mem_bw_fp); 595 return ret; 596 } 597 598 /* 599 * resctrl_val: execute benchmark and measure memory bandwidth on 600 * the benchmark 601 * @test: test information structure 602 * @uparams: user supplied parameters 603 * @param: parameters passed to resctrl_val() 604 * 605 * Return: 0 when the test was run, < 0 on error. 606 */ 607 int resctrl_val(const struct resctrl_test *test, 608 const struct user_params *uparams, 609 struct resctrl_val_param *param) 610 { 611 unsigned char *buf = NULL; 612 cpu_set_t old_affinity; 613 int domain_id; 614 int ret = 0; 615 pid_t ppid; 616 617 if (strcmp(param->filename, "") == 0) 618 sprintf(param->filename, "stdio"); 619 620 ret = get_domain_id(test->resource, uparams->cpu, &domain_id); 621 if (ret < 0) { 622 ksft_print_msg("Could not get domain ID\n"); 623 return ret; 624 } 625 626 ppid = getpid(); 627 628 /* Taskset test to specified CPU. */ 629 ret = taskset_benchmark(ppid, uparams->cpu, &old_affinity); 630 if (ret) 631 return ret; 632 633 /* Write test to specified control & monitoring group in resctrl FS. */ 634 ret = write_bm_pid_to_resctrl(ppid, param->ctrlgrp, param->mongrp); 635 if (ret) 636 goto reset_affinity; 637 638 if (param->init) { 639 ret = param->init(test, uparams, param, domain_id); 640 if (ret) 641 goto reset_affinity; 642 } 643 644 /* 645 * If not running user provided benchmark, run the default 646 * "fill_buf". First phase of "fill_buf" is to prepare the 647 * buffer that the benchmark will operate on. No measurements 648 * are needed during this phase and prepared memory will be 649 * passed to next part of benchmark via copy-on-write thus 650 * no impact on the benchmark that relies on reading from 651 * memory only. 652 */ 653 if (param->fill_buf) { 654 buf = alloc_buffer(param->fill_buf->buf_size, 655 param->fill_buf->memflush); 656 if (!buf) { 657 ret = -ENOMEM; 658 goto reset_affinity; 659 } 660 } 661 662 fflush(stdout); 663 bm_pid = fork(); 664 if (bm_pid == -1) { 665 ret = -errno; 666 ksft_perror("Unable to fork"); 667 goto free_buf; 668 } 669 670 /* 671 * What needs to be measured runs in separate process until 672 * terminated. 673 */ 674 if (bm_pid == 0) { 675 if (param->fill_buf) 676 fill_cache_read(buf, param->fill_buf->buf_size, false); 677 else if (uparams->benchmark_cmd[0]) 678 execvp(uparams->benchmark_cmd[0], (char **)uparams->benchmark_cmd); 679 exit(EXIT_SUCCESS); 680 } 681 682 ksft_print_msg("Benchmark PID: %d\n", (int)bm_pid); 683 684 /* Give benchmark enough time to fully run. */ 685 sleep(1); 686 687 /* Test runs until the callback setup() tells the test to stop. */ 688 while (1) { 689 ret = param->setup(test, uparams, param); 690 if (ret == END_OF_TESTS) { 691 ret = 0; 692 break; 693 } 694 if (ret < 0) 695 break; 696 697 ret = param->measure(uparams, param, bm_pid); 698 if (ret) 699 break; 700 } 701 702 kill(bm_pid, SIGKILL); 703 free_buf: 704 free(buf); 705 reset_affinity: 706 taskset_restore(ppid, &old_affinity); 707 return ret; 708 } 709