1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * mem-memcpy.c
4 *
5 * Simple memcpy() and memset() benchmarks
6 *
7 * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
8 */
9
10 #include "debug.h"
11 #include "../perf-sys.h"
12 #include <subcmd/parse-options.h>
13 #include "../util/header.h"
14 #include "../util/cloexec.h"
15 #include "../util/string2.h"
16 #include "bench.h"
17 #include "mem-memcpy-arch.h"
18 #include "mem-memset-arch.h"
19
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <unistd.h>
24 #include <sys/time.h>
25 #include <sys/mman.h>
26 #include <errno.h>
27 #include <linux/time64.h>
28 #include <linux/log2.h>
29
/* Unit multiplier used when scaling throughput values for printing. */
#define K			1024

/* log2 of the page sizes accepted by the --page option. */
#define PAGE_SHIFT_4KB		12
#define PAGE_SHIFT_2MB		21
#define PAGE_SHIFT_1GB		30
35
/* Raw command-line option values; parsed and validated in bench_mem_common(). */
static const char	*size_str	= "1MB";	/* --size */
static const char	*function_str	= "all";	/* --function */
static const char	*page_size_str	= "4KB";	/* --page */
static const char	*chunk_size_str	= "0";		/* --chunk, 0 means "whole buffer" */
static unsigned int	nr_loops	= 1;		/* --nr_loops */
static bool		use_cycles;			/* --cycles */
static int		cycles_fd;			/* set by init_cycles(), read by get_cycles() */
static unsigned int	seed;				/* --randomize (mmap benchmark only) */
44
45 static const struct option bench_common_options[] = {
46 OPT_STRING('s', "size", &size_str, "1MB",
47 "Specify the size of the memory buffers. "
48 "Available units: B, KB, MB, GB and TB (case insensitive)"),
49
50 OPT_STRING('p', "page", &page_size_str, "4KB",
51 "Specify page-size for mapping memory buffers. "
52 "Available sizes: 4KB, 2MB, 1GB (case insensitive)"),
53
54 OPT_STRING('f', "function", &function_str, "all",
55 "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"),
56
57 OPT_UINTEGER('l', "nr_loops", &nr_loops,
58 "Specify the number of loops to run. (default: 1)"),
59
60 OPT_BOOLEAN('c', "cycles", &use_cycles,
61 "Use a cycles event instead of gettimeofday() to measure performance"),
62
63 OPT_END()
64 };
65
66 static const struct option bench_mem_options[] = {
67 OPT_STRING('k', "chunk", &chunk_size_str, "0",
68 "Specify the chunk-size for each invocation. "
69 "Available units: B, KB, MB, GB and TB (case insensitive)"),
70 OPT_PARENT(bench_common_options),
71 OPT_END()
72 };
73
74 union bench_clock {
75 u64 cycles;
76 struct timeval tv;
77 };
78
/* Parsed benchmark parameters, filled in by bench_mem_common(). */
struct bench_params {
	size_t		size;		/* buffer size in bytes (--size) */
	size_t		size_total;	/* size * nr_loops, used for the reported rate */
	size_t		chunk_size;	/* bytes handled per call; defaults to size */
	unsigned int	nr_loops;	/* number of timed iterations */
	unsigned int	page_shift;	/* log2 of the requested page size */
	unsigned int	seed;		/* --randomize seed; 0 when not given */
};
87
/* Description of one benchmark family (memcpy, memset or mmap). */
struct bench_mem_info {
	/* Table of candidate functions, terminated by an entry with a NULL name. */
	const struct function	*functions;
	/* Runs the timed operation; a non-zero return is treated as failure. */
	int			(*do_op)(const struct function *r, struct bench_params *p,
					 void *src, void *dst, union bench_clock *rt);
	const char *const	*usage;
	const struct option	*options;
	/* When set, mem_alloc() maps a source buffer in addition to the destination. */
	bool			alloc_src;
};
96
/* Per-function setup hook; returns true on failure (see mem_alloc()). */
typedef bool (*mem_init_t)(struct bench_mem_info *, struct bench_params *,
			   void **, void **);
/* Per-function teardown hook, paired with mem_init_t. */
typedef void (*mem_fini_t)(struct bench_mem_info *, struct bench_params *,
			   void **, void **);
/* Signatures of the operations being benchmarked. */
typedef void *(*memcpy_t)(void *, const void *, size_t);
typedef void *(*memset_t)(void *, int, size_t);
typedef void (*mmap_op_t)(void *, size_t, unsigned int, bool);
104
105 struct function {
106 const char *name;
107 const char *desc;
108 struct {
109 mem_init_t init;
110 mem_fini_t fini;
111 union {
112 memcpy_t memcpy;
113 memset_t memset;
114 mmap_op_t mmap_op;
115 };
116 } fn;
117 };
118
119 static struct perf_event_attr cycle_attr = {
120 .type = PERF_TYPE_HARDWARE,
121 .config = PERF_COUNT_HW_CPU_CYCLES
122 };
123
init_cycles(void)124 static int init_cycles(void)
125 {
126 cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag());
127
128 if (cycles_fd < 0 && errno == ENOSYS) {
129 pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
130 return -1;
131 }
132
133 return cycles_fd;
134 }
135
get_cycles(void)136 static u64 get_cycles(void)
137 {
138 int ret;
139 u64 clk;
140
141 ret = read(cycles_fd, &clk, sizeof(u64));
142 BUG_ON(ret != sizeof(u64));
143
144 return clk;
145 }
146
clock_get(union bench_clock * t)147 static void clock_get(union bench_clock *t)
148 {
149 if (use_cycles)
150 t->cycles = get_cycles();
151 else
152 BUG_ON(gettimeofday(&t->tv, NULL));
153 }
154
clock_diff(union bench_clock * s,union bench_clock * e)155 static union bench_clock clock_diff(union bench_clock *s, union bench_clock *e)
156 {
157 union bench_clock t;
158
159 if (use_cycles)
160 t.cycles = e->cycles - s->cycles;
161 else
162 timersub(&e->tv, &s->tv, &t.tv);
163
164 return t;
165 }
166
clock_accum(union bench_clock * a,union bench_clock * b)167 static void clock_accum(union bench_clock *a, union bench_clock *b)
168 {
169 if (use_cycles)
170 a->cycles += b->cycles;
171 else
172 timeradd(&a->tv, &b->tv, &a->tv);
173 }
174
timeval2double(struct timeval * ts)175 static double timeval2double(struct timeval *ts)
176 {
177 return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC;
178 }
179
/*
 * Print a bytes-per-second value @x scaled to the largest fitting unit
 * (bytes, KB, MB or GB per second, in powers of K).
 *
 * @x must be a double; it is evaluated several times, so pass a plain
 * variable rather than an expression with side effects.
 */
#define print_bps(x) do {						\
		if ((x) < K)						\
			printf(" %14lf bytes/sec\n", (x));		\
		else if ((x) < K * K)					\
			printf(" %14lf KB/sec\n", (x) / K);		\
		else if ((x) < K * K * K)				\
			printf(" %14lf MB/sec\n", (x) / K / K);		\
		else							\
			printf(" %14lf GB/sec\n", (x) / K / K / K);	\
	} while (0)
190
__bench_mem_function(struct bench_mem_info * info,struct bench_params * p,int r_idx)191 static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p,
192 int r_idx)
193 {
194 const struct function *r = &info->functions[r_idx];
195 double result_bps = 0.0;
196 union bench_clock rt = { 0 };
197 void *src = NULL, *dst = NULL;
198
199 printf("# function '%s' (%s)\n", r->name, r->desc);
200
201 if (r->fn.init && r->fn.init(info, p, &src, &dst))
202 goto out_init_failed;
203
204 if (bench_format == BENCH_FORMAT_DEFAULT)
205 printf("# Copying %s bytes ...\n\n", size_str);
206
207 if (info->do_op(r, p, src, dst, &rt))
208 goto out_test_failed;
209
210 switch (bench_format) {
211 case BENCH_FORMAT_DEFAULT:
212 if (use_cycles) {
213 printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)p->size_total);
214 } else {
215 result_bps = (double)p->size_total/timeval2double(&rt.tv);
216 print_bps(result_bps);
217 }
218 break;
219
220 case BENCH_FORMAT_SIMPLE:
221 if (use_cycles) {
222 printf("%lf\n", (double)rt.cycles/(double)p->size_total);
223 } else {
224 result_bps = (double)p->size_total/timeval2double(&rt.tv);
225 printf("%lf\n", result_bps);
226 }
227 break;
228
229 default:
230 BUG_ON(1);
231 break;
232 }
233
234 out_test_failed:
235 out_free:
236 if (r->fn.fini) r->fn.fini(info, p, &src, &dst);
237 return;
238 out_init_failed:
239 printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
240 p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
241 goto out_free;
242 }
243
bench_mem_common(int argc,const char ** argv,struct bench_mem_info * info)244 static int bench_mem_common(int argc, const char **argv, struct bench_mem_info *info)
245 {
246 int i;
247 struct bench_params p = { 0 };
248 unsigned int page_size;
249
250 argc = parse_options(argc, argv, info->options, info->usage, 0);
251
252 if (use_cycles) {
253 i = init_cycles();
254 if (i < 0) {
255 fprintf(stderr, "Failed to open cycles counter\n");
256 return i;
257 }
258 }
259
260 p.nr_loops = nr_loops;
261 p.size = (size_t)perf_atoll((char *)size_str);
262
263 if ((s64)p.size <= 0) {
264 fprintf(stderr, "Invalid size:%s\n", size_str);
265 return 1;
266 }
267 p.size_total = p.size * p.nr_loops;
268
269 p.chunk_size = (size_t)perf_atoll((char *)chunk_size_str);
270 if ((s64)p.chunk_size < 0 || (s64)p.chunk_size > (s64)p.size) {
271 fprintf(stderr, "Invalid chunk_size:%s\n", chunk_size_str);
272 return 1;
273 }
274 if (!p.chunk_size)
275 p.chunk_size = p.size;
276
277 page_size = (unsigned int)perf_atoll((char *)page_size_str);
278 if (page_size != (1 << PAGE_SHIFT_4KB) &&
279 page_size != (1 << PAGE_SHIFT_2MB) &&
280 page_size != (1 << PAGE_SHIFT_1GB)) {
281 fprintf(stderr, "Invalid page-size:%s\n", page_size_str);
282 return 1;
283 }
284 p.page_shift = ilog2(page_size);
285
286 p.seed = seed;
287
288 if (!strncmp(function_str, "all", 3)) {
289 for (i = 0; info->functions[i].name; i++)
290 __bench_mem_function(info, &p, i);
291 return 0;
292 }
293
294 for (i = 0; info->functions[i].name; i++) {
295 if (!strcmp(info->functions[i].name, function_str))
296 break;
297 }
298 if (!info->functions[i].name) {
299 if (strcmp(function_str, "help") && strcmp(function_str, "h"))
300 printf("Unknown function: %s\n", function_str);
301 printf("Available functions:\n");
302 for (i = 0; info->functions[i].name; i++) {
303 printf("\t%s ... %s\n",
304 info->functions[i].name, info->functions[i].desc);
305 }
306 return 1;
307 }
308
309 __bench_mem_function(info, &p, i);
310
311 return 0;
312 }
313
/*
 * Touch both buffers once before timing starts so that page-fault cost is
 * not part of the measurement.
 */
static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst)
{
	/* Always fault in the source pages, even if MMAP_THRESH is crossed. */
	memset(src, 0, size);

	/* One untimed copy faults in the freshly allocated destination range. */
	fn(dst, src, size);
}
325
do_memcpy(const struct function * r,struct bench_params * p,void * src,void * dst,union bench_clock * rt)326 static int do_memcpy(const struct function *r, struct bench_params *p,
327 void *src, void *dst, union bench_clock *rt)
328 {
329 union bench_clock start, end;
330 memcpy_t fn = r->fn.memcpy;
331
332 memcpy_prefault(fn, p->size, src, dst);
333
334 clock_get(&start);
335 for (unsigned int i = 0; i < p->nr_loops; ++i)
336 for (size_t off = 0; off < p->size; off += p->chunk_size)
337 fn(dst + off, src + off, min(p->chunk_size, p->size - off));
338 clock_get(&end);
339
340 *rt = clock_diff(&start, &end);
341
342 return 0;
343 }
344
bench_mmap(size_t size,bool populate,unsigned int page_shift)345 static void *bench_mmap(size_t size, bool populate, unsigned int page_shift)
346 {
347 void *p;
348 int extra = populate ? MAP_POPULATE : 0;
349
350 if (page_shift != PAGE_SHIFT_4KB)
351 extra |= MAP_HUGETLB | (page_shift << MAP_HUGE_SHIFT);
352
353 p = mmap(NULL, size, PROT_READ|PROT_WRITE,
354 extra | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
355
356 return p == MAP_FAILED ? NULL : p;
357 }
358
/* Unmap a region obtained from bench_mmap(); a NULL @p is ignored. */
static void bench_munmap(void *p, size_t size)
{
	if (!p)
		return;

	munmap(p, size);
}
364
mem_alloc(struct bench_mem_info * info,struct bench_params * p,void ** src,void ** dst)365 static bool mem_alloc(struct bench_mem_info *info, struct bench_params *p,
366 void **src, void **dst)
367 {
368 bool failed;
369
370 *dst = bench_mmap(p->size, true, p->page_shift);
371 failed = *dst == NULL;
372
373 if (info->alloc_src) {
374 *src = bench_mmap(p->size, true, p->page_shift);
375 failed = failed || *src == NULL;
376 }
377
378 return failed;
379 }
380
/*
 * Fini hook: unmap both buffers (NULL ones are skipped by bench_munmap())
 * and reset the caller's pointers to NULL.
 */
static void mem_free(struct bench_mem_info *info __maybe_unused,
		     struct bench_params *p __maybe_unused,
		     void **src, void **dst)
{
	bench_munmap(*dst, p->size);
	bench_munmap(*src, p->size);

	*src = NULL;
	*dst = NULL;
}
390
391 struct function memcpy_functions[] = {
392 { .name = "default",
393 .desc = "Default memcpy() provided by glibc",
394 .fn.init = mem_alloc,
395 .fn.fini = mem_free,
396 .fn.memcpy = memcpy },
397
398 #ifdef HAVE_ARCH_X86_64_SUPPORT
399 # define MEMCPY_FN(_fn, _init, _fini, _name, _desc) \
400 {.name = _name, .desc = _desc, .fn.memcpy = _fn, .fn.init = _init, .fn.fini = _fini },
401 # include "mem-memcpy-x86-64-asm-def.h"
402 # undef MEMCPY_FN
403 #endif
404
405 { .name = NULL, }
406 };
407
/* Usage lines for "perf bench mem memcpy", NULL-terminated. */
static const char * const bench_mem_memcpy_usage[] = {
	"perf bench mem memcpy <options>",
	NULL
};
412
bench_mem_memcpy(int argc,const char ** argv)413 int bench_mem_memcpy(int argc, const char **argv)
414 {
415 struct bench_mem_info info = {
416 .functions = memcpy_functions,
417 .do_op = do_memcpy,
418 .usage = bench_mem_memcpy_usage,
419 .options = bench_mem_options,
420 .alloc_src = true,
421 };
422
423 return bench_mem_common(argc, argv, &info);
424 }
425
/*
 * Timed memset run: fill p->size bytes at @dst in pieces of at most
 * p->chunk_size bytes, p->nr_loops times, using the loop index as the fill
 * value, and store the elapsed time in @rt. Always returns 0.
 */
static int do_memset(const struct function *r, struct bench_params *p,
		     void *src __maybe_unused, void *dst, union bench_clock *rt)
{
	memset_t fill = r->fn.memset;
	union bench_clock t_begin, t_end;
	unsigned int loop;
	size_t pos;

	/* One untimed fill faults the freshly allocated range in up front. */
	fill(dst, -1, p->size);

	clock_get(&t_begin);
	for (loop = 0; loop < p->nr_loops; ++loop) {
		pos = 0;
		while (pos < p->size) {
			/* The final chunk may be shorter than chunk_size. */
			fill(dst + pos, loop, min(p->chunk_size, p->size - pos));
			pos += p->chunk_size;
		}
	}
	clock_get(&t_end);

	*rt = clock_diff(&t_begin, &t_end);

	return 0;
}
448
/* Usage lines for "perf bench mem memset", NULL-terminated. */
static const char * const bench_mem_memset_usage[] = {
	"perf bench mem memset <options>",
	NULL
};
453
454 static const struct function memset_functions[] = {
455 { .name = "default",
456 .desc = "Default memset() provided by glibc",
457 .fn.init = mem_alloc,
458 .fn.fini = mem_free,
459 .fn.memset = memset },
460
461 #ifdef HAVE_ARCH_X86_64_SUPPORT
462 # define MEMSET_FN(_fn, _init, _fini, _name, _desc) \
463 {.name = _name, .desc = _desc, .fn.memset = _fn, .fn.init = _init, .fn.fini = _fini },
464 # include "mem-memset-x86-64-asm-def.h"
465 # undef MEMSET_FN
466 #endif
467
468 { .name = NULL, }
469 };
470
bench_mem_memset(int argc,const char ** argv)471 int bench_mem_memset(int argc, const char **argv)
472 {
473 struct bench_mem_info info = {
474 .functions = memset_functions,
475 .do_op = do_memset,
476 .usage = bench_mem_memset_usage,
477 .options = bench_mem_options,
478 };
479
480 return bench_mem_common(argc, argv, &info);
481 }
482
/*
 * Touch one byte in every whole page of a buffer.
 *
 * @dst:        start of the buffer
 * @size:       buffer length in bytes; a trailing partial page is not touched
 * @page_shift: log2 of the page size
 * @random:     touch a rand()-chosen offset inside each page instead of the
 *              first byte
 *
 * The touched byte is read, incremented by the page index and written back,
 * so each page sees both a load and a store.
 *
 * The page size is computed in size_t so that large shifts do not overflow
 * an int, and the buffer is walked through a char pointer rather than with
 * arithmetic on void *.
 */
static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, bool random)
{
	const size_t page_size = (size_t)1 << page_shift;
	const size_t npages = size / page_size;
	char *page = dst;

	for (size_t i = 0; i < npages; i++, page += page_size) {
		size_t r = random ? (size_t)rand() % page_size : 0;

		page[r] = (char)(page[r] + i);
	}
}
496
/*
 * Timed mmap run. Each of the p->nr_loops iterations maps p->size bytes
 * (with MAP_POPULATE for the function named "populate"), touches the pages
 * through the function's mmap_op and adds the elapsed time to @accum. The
 * munmap() that follows is outside the timed region.
 *
 * rand() is seeded only when p->seed is non-zero; the seed also serves as
 * the "random" flag passed to the page-touch operation.
 *
 * Returns 0 on success, -1 if a mapping could not be created.
 */
static int do_mmap(const struct function *r, struct bench_params *p,
		   void *src __maybe_unused, void *dst __maybe_unused,
		   union bench_clock *accum)
{
	const bool populate = !strcmp(r->name, "populate");
	mmap_op_t touch = r->fn.mmap_op;
	union bench_clock t_begin, t_end, elapsed;
	unsigned int loop;

	if (p->seed)
		srand(p->seed);

	for (loop = 0; loop < p->nr_loops; loop++) {
		clock_get(&t_begin);

		dst = bench_mmap(p->size, populate, p->page_shift);
		if (!dst) {
			printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
			       p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
			return -1;
		}

		touch(dst, p->size, p->page_shift, p->seed);

		clock_get(&t_end);
		elapsed = clock_diff(&t_begin, &t_end);
		clock_accum(accum, &elapsed);

		bench_munmap(dst, p->size);
	}

	return 0;
}
528
/* Usage lines for "perf bench mem mmap", NULL-terminated. */
static const char * const bench_mem_mmap_usage[] = {
	"perf bench mem mmap <options>",
	NULL
};
533
534 static const struct function mmap_functions[] = {
535 { .name = "demand",
536 .desc = "Demand loaded mmap()",
537 .fn.mmap_op = mmap_page_touch },
538
539 { .name = "populate",
540 .desc = "Eagerly populated mmap()",
541 .fn.mmap_op = mmap_page_touch },
542
543 { .name = NULL, }
544 };
545
bench_mem_mmap(int argc,const char ** argv)546 int bench_mem_mmap(int argc, const char **argv)
547 {
548 static const struct option bench_mmap_options[] = {
549 OPT_UINTEGER('r', "randomize", &seed,
550 "Seed to randomize page access offset."),
551 OPT_PARENT(bench_common_options),
552 OPT_END()
553 };
554
555 struct bench_mem_info info = {
556 .functions = mmap_functions,
557 .do_op = do_mmap,
558 .usage = bench_mem_mmap_usage,
559 .options = bench_mmap_options,
560 };
561
562 return bench_mem_common(argc, argv, &info);
563 }
564