1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * mem-memcpy.c 4 * 5 * Simple memcpy() and memset() benchmarks 6 * 7 * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> 8 */ 9 10 #include "debug.h" 11 #include "../perf-sys.h" 12 #include <subcmd/parse-options.h> 13 #include "../util/header.h" 14 #include "../util/cloexec.h" 15 #include "../util/string2.h" 16 #include "bench.h" 17 #include "mem-memcpy-arch.h" 18 #include "mem-memset-arch.h" 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include <string.h> 23 #include <unistd.h> 24 #include <sys/time.h> 25 #include <sys/mman.h> 26 #include <errno.h> 27 #include <linux/time64.h> 28 #include <linux/log2.h> 29 30 #define K 1024 31 32 #define PAGE_SHIFT_4KB 12 33 #define PAGE_SHIFT_2MB 21 34 #define PAGE_SHIFT_1GB 30 35 36 static const char *size_str = "1MB"; 37 static const char *function_str = "all"; 38 static const char *page_size_str = "4KB"; 39 static const char *chunk_size_str = "0"; 40 static unsigned int nr_loops = 1; 41 static bool use_cycles; 42 static int cycles_fd; 43 static unsigned int seed; 44 45 static const struct option bench_common_options[] = { 46 OPT_STRING('s', "size", &size_str, "1MB", 47 "Specify the size of the memory buffers. " 48 "Available units: B, KB, MB, GB and TB (case insensitive)"), 49 50 OPT_STRING('p', "page", &page_size_str, "4KB", 51 "Specify page-size for mapping memory buffers. " 52 "Available sizes: 4KB, 2MB, 1GB (case insensitive)"), 53 54 OPT_STRING('f', "function", &function_str, "all", 55 "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"), 56 57 OPT_UINTEGER('l', "nr_loops", &nr_loops, 58 "Specify the number of loops to run. (default: 1)"), 59 60 OPT_BOOLEAN('c', "cycles", &use_cycles, 61 "Use a cycles event instead of gettimeofday() to measure performance"), 62 63 OPT_END() 64 }; 65 66 static const struct option bench_mem_options[] = { 67 OPT_STRING('k', "chunk", &chunk_size_str, "0", 68 "Specify the chunk-size for each invocation. " 69 "Available units: B, KB, MB, GB and TB (case insensitive)"), 70 OPT_PARENT(bench_common_options), 71 OPT_END() 72 }; 73 74 union bench_clock { 75 u64 cycles; 76 struct timeval tv; 77 }; 78 79 struct bench_params { 80 size_t size; 81 size_t size_total; 82 size_t chunk_size; 83 unsigned int nr_loops; 84 unsigned int page_shift; 85 unsigned int seed; 86 }; 87 88 struct bench_mem_info { 89 const struct function *functions; 90 int (*do_op)(const struct function *r, struct bench_params *p, 91 void *src, void *dst, union bench_clock *rt); 92 const char *const *usage; 93 const struct option *options; 94 bool alloc_src; 95 }; 96 97 typedef bool (*mem_init_t)(struct bench_mem_info *, struct bench_params *, 98 void **, void **); 99 typedef void (*mem_fini_t)(struct bench_mem_info *, struct bench_params *, 100 void **, void **); 101 typedef void *(*memcpy_t)(void *, const void *, size_t); 102 typedef void *(*memset_t)(void *, int, size_t); 103 typedef void (*mmap_op_t)(void *, size_t, unsigned int, bool); 104 105 struct function { 106 const char *name; 107 const char *desc; 108 struct { 109 mem_init_t init; 110 mem_fini_t fini; 111 union { 112 memcpy_t memcpy; 113 memset_t memset; 114 mmap_op_t mmap_op; 115 }; 116 } fn; 117 }; 118 119 static struct perf_event_attr cycle_attr = { 120 .type = PERF_TYPE_HARDWARE, 121 .config = PERF_COUNT_HW_CPU_CYCLES 122 }; 123 124 static int init_cycles(void) 125 { 126 cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag()); 127 128 if (cycles_fd < 0 && errno == ENOSYS) { 129 pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); 130 return -1; 131 } 132 133 return cycles_fd; 134 } 135 136 static u64 get_cycles(void) 137 { 138 int ret; 139 u64 clk; 140 141 ret = read(cycles_fd, &clk, sizeof(u64)); 142 BUG_ON(ret != sizeof(u64)); 143 144 return clk; 145 } 146 147 static void clock_get(union bench_clock *t) 148 { 149 if (use_cycles) 150 t->cycles = get_cycles(); 151 else 152 BUG_ON(gettimeofday(&t->tv, NULL)); 153 } 154 155 static union bench_clock clock_diff(union bench_clock *s, union bench_clock *e) 156 { 157 union bench_clock t; 158 159 if (use_cycles) 160 t.cycles = e->cycles - s->cycles; 161 else 162 timersub(&e->tv, &s->tv, &t.tv); 163 164 return t; 165 } 166 167 static void clock_accum(union bench_clock *a, union bench_clock *b) 168 { 169 if (use_cycles) 170 a->cycles += b->cycles; 171 else 172 timeradd(&a->tv, &b->tv, &a->tv); 173 } 174 175 static double timeval2double(struct timeval *ts) 176 { 177 return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC; 178 } 179 180 #define print_bps(x) do { \ 181 if (x < K) \ 182 printf(" %14lf bytes/sec\n", x); \ 183 else if (x < K * K) \ 184 printf(" %14lfd KB/sec\n", x / K); \ 185 else if (x < K * K * K) \ 186 printf(" %14lf MB/sec\n", x / K / K); \ 187 else \ 188 printf(" %14lf GB/sec\n", x / K / K / K); \ 189 } while (0) 190 191 static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p, 192 int r_idx) 193 { 194 const struct function *r = &info->functions[r_idx]; 195 double result_bps = 0.0; 196 union bench_clock rt = { 0 }; 197 void *src = NULL, *dst = NULL; 198 199 printf("# function '%s' (%s)\n", r->name, r->desc); 200 201 if (r->fn.init && r->fn.init(info, p, &src, &dst)) 202 goto out_init_failed; 203 204 if (bench_format == BENCH_FORMAT_DEFAULT) 205 printf("# Copying %s bytes ...\n\n", size_str); 206 207 if (info->do_op(r, p, src, dst, &rt)) 208 goto out_test_failed; 209 210 switch (bench_format) { 211 case BENCH_FORMAT_DEFAULT: 212 if (use_cycles) { 213 printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)p->size_total); 214 } else { 215 result_bps = (double)p->size_total/timeval2double(&rt.tv); 216 print_bps(result_bps); 217 } 218 break; 219 220 case BENCH_FORMAT_SIMPLE: 221 if (use_cycles) { 222 printf("%lf\n", (double)rt.cycles/(double)p->size_total); 223 } else { 224 result_bps = (double)p->size_total/timeval2double(&rt.tv); 225 printf("%lf\n", result_bps); 226 } 227 break; 228 229 default: 230 BUG_ON(1); 231 break; 232 } 233 234 out_test_failed: 235 out_free: 236 if (r->fn.fini) r->fn.fini(info, p, &src, &dst); 237 return; 238 out_init_failed: 239 printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str, 240 p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large"); 241 goto out_free; 242 } 243 244 static int bench_mem_common(int argc, const char **argv, struct bench_mem_info *info) 245 { 246 int i; 247 struct bench_params p = { 0 }; 248 unsigned int page_size; 249 250 argc = parse_options(argc, argv, info->options, info->usage, 0); 251 252 if (use_cycles) { 253 i = init_cycles(); 254 if (i < 0) { 255 fprintf(stderr, "Failed to open cycles counter\n"); 256 return i; 257 } 258 } 259 260 p.nr_loops = nr_loops; 261 p.size = (size_t)perf_atoll((char *)size_str); 262 263 if ((s64)p.size <= 0) { 264 fprintf(stderr, "Invalid size:%s\n", size_str); 265 return 1; 266 } 267 p.size_total = p.size * p.nr_loops; 268 269 p.chunk_size = (size_t)perf_atoll((char *)chunk_size_str); 270 if ((s64)p.chunk_size < 0 || (s64)p.chunk_size > (s64)p.size) { 271 fprintf(stderr, "Invalid chunk_size:%s\n", chunk_size_str); 272 return 1; 273 } 274 if (!p.chunk_size) 275 p.chunk_size = p.size; 276 277 page_size = (unsigned int)perf_atoll((char *)page_size_str); 278 if (page_size != (1 << PAGE_SHIFT_4KB) && 279 page_size != (1 << PAGE_SHIFT_2MB) && 280 page_size != (1 << PAGE_SHIFT_1GB)) { 281 fprintf(stderr, "Invalid page-size:%s\n", page_size_str); 282 return 1; 283 } 284 p.page_shift = ilog2(page_size); 285 286 p.seed = seed; 287 288 if (!strncmp(function_str, "all", 3)) { 289 for (i = 0; info->functions[i].name; i++) 290 __bench_mem_function(info, &p, i); 291 return 0; 292 } 293 294 for (i = 0; info->functions[i].name; i++) { 295 if (!strcmp(info->functions[i].name, function_str)) 296 break; 297 } 298 if (!info->functions[i].name) { 299 if (strcmp(function_str, "help") && strcmp(function_str, "h")) 300 printf("Unknown function: %s\n", function_str); 301 printf("Available functions:\n"); 302 for (i = 0; info->functions[i].name; i++) { 303 printf("\t%s ... %s\n", 304 info->functions[i].name, info->functions[i].desc); 305 } 306 return 1; 307 } 308 309 __bench_mem_function(info, &p, i); 310 311 return 0; 312 } 313 314 static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst) 315 { 316 /* Make sure to always prefault zero pages even if MMAP_THRESH is crossed: */ 317 memset(src, 0, size); 318 319 /* 320 * We prefault the freshly allocated memory range here, 321 * to not measure page fault overhead: 322 */ 323 fn(dst, src, size); 324 } 325 326 static int do_memcpy(const struct function *r, struct bench_params *p, 327 void *src, void *dst, union bench_clock *rt) 328 { 329 union bench_clock start, end; 330 memcpy_t fn = r->fn.memcpy; 331 332 memcpy_prefault(fn, p->size, src, dst); 333 334 clock_get(&start); 335 for (unsigned int i = 0; i < p->nr_loops; ++i) 336 for (size_t off = 0; off < p->size; off += p->chunk_size) 337 fn(dst + off, src + off, min(p->chunk_size, p->size - off)); 338 clock_get(&end); 339 340 *rt = clock_diff(&start, &end); 341 342 return 0; 343 } 344 345 static void *bench_mmap(size_t size, bool populate, unsigned int page_shift) 346 { 347 void *p; 348 int extra = populate ? MAP_POPULATE : 0; 349 350 if (page_shift != PAGE_SHIFT_4KB) 351 extra |= MAP_HUGETLB | (page_shift << MAP_HUGE_SHIFT); 352 353 p = mmap(NULL, size, PROT_READ|PROT_WRITE, 354 extra | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); 355 356 return p == MAP_FAILED ? NULL : p; 357 } 358 359 static void bench_munmap(void *p, size_t size) 360 { 361 if (p) 362 munmap(p, size); 363 } 364 365 static bool mem_alloc(struct bench_mem_info *info, struct bench_params *p, 366 void **src, void **dst) 367 { 368 bool failed; 369 370 *dst = bench_mmap(p->size, true, p->page_shift); 371 failed = *dst == NULL; 372 373 if (info->alloc_src) { 374 *src = bench_mmap(p->size, true, p->page_shift); 375 failed = failed || *src == NULL; 376 } 377 378 return failed; 379 } 380 381 static void mem_free(struct bench_mem_info *info __maybe_unused, 382 struct bench_params *p __maybe_unused, 383 void **src, void **dst) 384 { 385 bench_munmap(*dst, p->size); 386 bench_munmap(*src, p->size); 387 388 *dst = *src = NULL; 389 } 390 391 struct function memcpy_functions[] = { 392 { .name = "default", 393 .desc = "Default memcpy() provided by glibc", 394 .fn.init = mem_alloc, 395 .fn.fini = mem_free, 396 .fn.memcpy = memcpy }, 397 398 #ifdef HAVE_ARCH_X86_64_SUPPORT 399 # define MEMCPY_FN(_fn, _init, _fini, _name, _desc) \ 400 {.name = _name, .desc = _desc, .fn.memcpy = _fn, .fn.init = _init, .fn.fini = _fini }, 401 # include "mem-memcpy-x86-64-asm-def.h" 402 # undef MEMCPY_FN 403 #endif 404 405 { .name = NULL, } 406 }; 407 408 static const char * const bench_mem_memcpy_usage[] = { 409 "perf bench mem memcpy <options>", 410 NULL 411 }; 412 413 int bench_mem_memcpy(int argc, const char **argv) 414 { 415 struct bench_mem_info info = { 416 .functions = memcpy_functions, 417 .do_op = do_memcpy, 418 .usage = bench_mem_memcpy_usage, 419 .options = bench_mem_options, 420 .alloc_src = true, 421 }; 422 423 return bench_mem_common(argc, argv, &info); 424 } 425 426 static int do_memset(const struct function *r, struct bench_params *p, 427 void *src __maybe_unused, void *dst, union bench_clock *rt) 428 { 429 union bench_clock start, end; 430 memset_t fn = r->fn.memset; 431 432 /* 433 * We prefault the freshly allocated memory range here, 434 * to not measure page fault overhead: 435 */ 436 fn(dst, -1, p->size); 437 438 clock_get(&start); 439 for (unsigned int i = 0; i < p->nr_loops; ++i) 440 for (size_t off = 0; off < p->size; off += p->chunk_size) 441 fn(dst + off, i, min(p->chunk_size, p->size - off)); 442 clock_get(&end); 443 444 *rt = clock_diff(&start, &end); 445 446 return 0; 447 } 448 449 static const char * const bench_mem_memset_usage[] = { 450 "perf bench mem memset <options>", 451 NULL 452 }; 453 454 static const struct function memset_functions[] = { 455 { .name = "default", 456 .desc = "Default memset() provided by glibc", 457 .fn.init = mem_alloc, 458 .fn.fini = mem_free, 459 .fn.memset = memset }, 460 461 #ifdef HAVE_ARCH_X86_64_SUPPORT 462 # define MEMSET_FN(_fn, _init, _fini, _name, _desc) \ 463 {.name = _name, .desc = _desc, .fn.memset = _fn, .fn.init = _init, .fn.fini = _fini }, 464 # include "mem-memset-x86-64-asm-def.h" 465 # undef MEMSET_FN 466 #endif 467 468 { .name = NULL, } 469 }; 470 471 int bench_mem_memset(int argc, const char **argv) 472 { 473 struct bench_mem_info info = { 474 .functions = memset_functions, 475 .do_op = do_memset, 476 .usage = bench_mem_memset_usage, 477 .options = bench_mem_options, 478 }; 479 480 return bench_mem_common(argc, argv, &info); 481 } 482 483 static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, bool random) 484 { 485 unsigned long npages = size / (1 << page_shift); 486 unsigned long offset = 0, r = 0; 487 488 for (unsigned long i = 0; i < npages; i++) { 489 if (random) 490 r = rand() % (1 << page_shift); 491 492 *((char *)dst + offset + r) = *(char *)(dst + offset + r) + i; 493 offset += 1 << page_shift; 494 } 495 } 496 497 static int do_mmap(const struct function *r, struct bench_params *p, 498 void *src __maybe_unused, void *dst __maybe_unused, 499 union bench_clock *accum) 500 { 501 union bench_clock start, end, diff; 502 mmap_op_t fn = r->fn.mmap_op; 503 bool populate = strcmp(r->name, "populate") == 0; 504 505 if (p->seed) 506 srand(p->seed); 507 508 for (unsigned int i = 0; i < p->nr_loops; i++) { 509 clock_get(&start); 510 dst = bench_mmap(p->size, populate, p->page_shift); 511 if (!dst) 512 goto out; 513 514 fn(dst, p->size, p->page_shift, p->seed); 515 clock_get(&end); 516 diff = clock_diff(&start, &end); 517 clock_accum(accum, &diff); 518 519 bench_munmap(dst, p->size); 520 } 521 522 return 0; 523 out: 524 printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str, 525 p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large"); 526 return -1; 527 } 528 529 static const char * const bench_mem_mmap_usage[] = { 530 "perf bench mem mmap <options>", 531 NULL 532 }; 533 534 static const struct function mmap_functions[] = { 535 { .name = "demand", 536 .desc = "Demand loaded mmap()", 537 .fn.mmap_op = mmap_page_touch }, 538 539 { .name = "populate", 540 .desc = "Eagerly populated mmap()", 541 .fn.mmap_op = mmap_page_touch }, 542 543 { .name = NULL, } 544 }; 545 546 int bench_mem_mmap(int argc, const char **argv) 547 { 548 static const struct option bench_mmap_options[] = { 549 OPT_UINTEGER('r', "randomize", &seed, 550 "Seed to randomize page access offset."), 551 OPT_PARENT(bench_common_options), 552 OPT_END() 553 }; 554 555 struct bench_mem_info info = { 556 .functions = mmap_functions, 557 .do_op = do_mmap, 558 .usage = bench_mem_mmap_usage, 559 .options = bench_mmap_options, 560 }; 561 562 return bench_mem_common(argc, argv, &info); 563 } 564