xref: /linux/tools/perf/bench/mem-functions.c (revision ec714e371f22f716a04e6ecb2a24988c92b26911)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * mem-functions.c
4  *
5  * Simple memcpy(), memset() and mmap() benchmarks
6  *
7  * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
8  */
9 
10 #include "debug.h"
11 #include "../perf-sys.h"
12 #include <subcmd/parse-options.h>
13 #include "../util/header.h"
14 #include "../util/cloexec.h"
15 #include "../util/string2.h"
16 #include "bench.h"
17 #include "mem-memcpy-arch.h"
18 #include "mem-memset-arch.h"
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <unistd.h>
24 #include <sys/time.h>
25 #include <sys/mman.h>
26 #include <errno.h>
27 #include <linux/time64.h>
28 #include <linux/log2.h>
29 
/* Scaling step (1024) used by print_bps() to pick bytes/KB/MB/GB per second. */
30 #define K 1024
31 
/*
 * log2 of the page sizes accepted for the --page option; bench_mem_common()
 * validates the requested size against 1 << PAGE_SHIFT_*.
 */
32 #define PAGE_SHIFT_4KB		12
33 #define PAGE_SHIFT_2MB		21
34 #define PAGE_SHIFT_1GB		30
35 
/*
 * Raw option values and their defaults. The option tables below point at
 * these variables; bench_mem_common() converts the strings into a
 * struct bench_params.
 */
36 static const char	*size_str	= "1MB";	/* -s/--size */
37 static const char	*function_str	= "all";	/* -f/--function */
38 static const char	*page_size_str	= "4KB";	/* -p/--page */
39 static const char	*chunk_size_str	= "0";		/* -k/--chunk, 0 means "whole buffer" */
40 static unsigned int	nr_loops	= 1;		/* -l/--nr_loops */
41 static bool		use_cycles;			/* -c/--cycles */
42 static int		cycles_fd;			/* set by init_cycles(), read by get_cycles() */
43 static unsigned int	seed;				/* -r/--randomize (mmap benchmark only) */
44 
/*
 * Options shared by every benchmark in this file. The per-benchmark tables
 * (bench_mem_options and the table local to bench_mem_mmap()) chain to this
 * one through OPT_PARENT().
 */
45 static const struct option bench_common_options[] = {
46 	OPT_STRING('s', "size", &size_str, "1MB",
47 		    "Specify the size of the memory buffers. "
48 		    "Available units: B, KB, MB, GB and TB (case insensitive)"),
49 
50 	OPT_STRING('p', "page", &page_size_str, "4KB",
51 		    "Specify page-size for mapping memory buffers. "
52 		    "Available sizes: 4KB, 2MB, 1GB (case insensitive)"),
53 
54 	OPT_STRING('f', "function", &function_str, "all",
55 		    "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"),
56 
57 	OPT_UINTEGER('l', "nr_loops", &nr_loops,
58 		    "Specify the number of loops to run. (default: 1)"),
59 
60 	OPT_BOOLEAN('c', "cycles", &use_cycles,
61 		    "Use a cycles event instead of gettimeofday() to measure performance"),
62 
63 	OPT_END()
64 };
65 
/*
 * Option table used by the memcpy and memset benchmarks: adds -k/--chunk on
 * top of the common options.
 */
66 static const struct option bench_mem_options[] = {
67 	OPT_STRING('k', "chunk", &chunk_size_str, "0",
68 		    "Specify the chunk-size for each invocation. "
69 		    "Available units: B, KB, MB, GB and TB (case insensitive)"),
70 	OPT_PARENT(bench_common_options),
71 	OPT_END()
72 };
73 
/*
 * A timestamp or an elapsed interval. Which member is valid is decided by
 * the global use_cycles flag: 'cycles' when it is set, 'tv' otherwise
 * (see clock_get(), clock_diff() and clock_accum()).
 */
74 union bench_clock {
75 	u64		cycles;
76 	struct timeval	tv;
77 };
78 
/* Parsed benchmark parameters, filled in by bench_mem_common(). */
79 struct bench_params {
80 	size_t		size;		/* bytes per buffer / mapping (--size) */
81 	size_t		size_total;	/* size * nr_loops, used for the reported rate */
82 	size_t		chunk_size;	/* bytes per call of the benchmarked function */
83 	unsigned int	nr_loops;	/* number of passes over the buffer */
84 	unsigned int	page_shift;	/* log2 of the requested page size */
85 	unsigned int	seed;		/* copy of the --randomize seed, 0 when unset */
86 };
87 
/* Description of one benchmark family, handed to bench_mem_common(). */
88 struct bench_mem_info {
	/* Table of candidate functions, terminated by an entry with a NULL name. */
89 	const struct function *functions;
	/* Runs the measured operation; the result goes to *rt, non-zero means failure. */
90 	int (*do_op)(const struct function *r, struct bench_params *p,
91 		     void *src, void *dst, union bench_clock *rt);
	/* Usage strings and option table passed to parse_options(). */
92 	const char *const *usage;
93 	const struct option *options;
	/* When true, mem_alloc() maps a source buffer in addition to the destination. */
94 	bool alloc_src;
95 };
96 
/*
 * Buffer setup hook: receives pointers to the src and dst buffer pointers.
 * __bench_mem_function() treats a true return value as failure.
 */
97 typedef bool (*mem_init_t)(struct bench_mem_info *, struct bench_params *,
98 			   void **, void **);
/* Buffer teardown hook, the counterpart of mem_init_t. */
99 typedef void (*mem_fini_t)(struct bench_mem_info *, struct bench_params *,
100 			   void **, void **);
/* Signatures of the benchmarked memcpy()- and memset()-style routines. */
101 typedef void *(*memcpy_t)(void *, const void *, size_t);
102 typedef void *(*memset_t)(void *, int, size_t);
/* Operation applied to a fresh mapping; do_mmap() passes (dst, size, page_shift, seed). */
103 typedef void (*mmap_op_t)(void *, size_t, unsigned int, bool);
104 
/* One selectable benchmark target (an entry of a *_functions[] table). */
105 struct function {
106 	const char *name;	/* matched against --function; NULL terminates a table */
107 	const char *desc;	/* human readable description, printed with the name */
108 	struct {
		/* Optional hooks; __bench_mem_function() skips them when NULL. */
109 		mem_init_t init;
110 		mem_fini_t fini;
		/* The routine under test; the member used depends on the benchmark's do_op. */
111 		union {
112 			memcpy_t memcpy;
113 			memset_t memset;
114 			mmap_op_t mmap_op;
115 		};
116 	} fn;
117 };
118 
/* Event attributes used by init_cycles(): the hardware CPU cycles counter. */
119 static struct perf_event_attr cycle_attr = {
120 	.type		= PERF_TYPE_HARDWARE,
121 	.config		= PERF_COUNT_HW_CPU_CYCLES
122 };
123 
init_cycles(void)124 static int init_cycles(void)
125 {
126 	cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag());
127 
128 	if (cycles_fd < 0 && errno == ENOSYS) {
129 		pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
130 		return -1;
131 	}
132 
133 	return cycles_fd;
134 }
135 
get_cycles(void)136 static u64 get_cycles(void)
137 {
138 	int ret;
139 	u64 clk;
140 
141 	ret = read(cycles_fd, &clk, sizeof(u64));
142 	BUG_ON(ret != sizeof(u64));
143 
144 	return clk;
145 }
146 
clock_get(union bench_clock * t)147 static void clock_get(union bench_clock *t)
148 {
149 	if (use_cycles)
150 		t->cycles = get_cycles();
151 	else
152 		BUG_ON(gettimeofday(&t->tv, NULL));
153 }
154 
clock_diff(union bench_clock * s,union bench_clock * e)155 static union bench_clock clock_diff(union bench_clock *s, union bench_clock *e)
156 {
157 	union bench_clock t;
158 
159 	if (use_cycles)
160 		t.cycles = e->cycles - s->cycles;
161 	else
162 		timersub(&e->tv, &s->tv, &t.tv);
163 
164 	return t;
165 }
166 
clock_accum(union bench_clock * a,union bench_clock * b)167 static void clock_accum(union bench_clock *a, union bench_clock *b)
168 {
169 	if (use_cycles)
170 		a->cycles += b->cycles;
171 	else
172 		timeradd(&a->tv, &b->tv, &a->tv);
173 }
174 
timeval2double(struct timeval * ts)175 static double timeval2double(struct timeval *ts)
176 {
177 	return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC;
178 }
179 
/*
 * Print a throughput given in bytes per second, scaled to the largest unit
 * (bytes, KB, MB or GB per second, in steps of K) that keeps the value >= 1.
 *
 * The argument is evaluated exactly once and converted to double, so any
 * arithmetic expression may be passed safely.
 */
#define print_bps(x) do {						\
		double print_bps_val_ = (x);				\
									\
		if (print_bps_val_ < K)					\
			printf(" %14lf bytes/sec\n", print_bps_val_);	\
		else if (print_bps_val_ < K * K)			\
			printf(" %14lf KB/sec\n", print_bps_val_ / K);	\
		else if (print_bps_val_ < K * K * K)			\
			printf(" %14lf MB/sec\n",			\
			       print_bps_val_ / K / K);			\
		else							\
			printf(" %14lf GB/sec\n",			\
			       print_bps_val_ / K / K / K);		\
	} while (0)
190 
__bench_mem_function(struct bench_mem_info * info,struct bench_params * p,int r_idx)191 static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p,
192 				 int r_idx)
193 {
194 	const struct function *r = &info->functions[r_idx];
195 	double result_bps = 0.0;
196 	union bench_clock rt = { 0 };
197 	void *src = NULL, *dst = NULL;
198 
199 	printf("# function '%s' (%s)\n", r->name, r->desc);
200 
201 	if (r->fn.init && r->fn.init(info, p, &src, &dst))
202 		goto out_init_failed;
203 
204 	if (bench_format == BENCH_FORMAT_DEFAULT)
205 		printf("# Copying %s bytes ...\n\n", size_str);
206 
207 	if (info->do_op(r, p, src, dst, &rt))
208 		goto out_test_failed;
209 
210 	switch (bench_format) {
211 	case BENCH_FORMAT_DEFAULT:
212 		if (use_cycles) {
213 			printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)p->size_total);
214 		} else {
215 			result_bps = (double)p->size_total/timeval2double(&rt.tv);
216 			print_bps(result_bps);
217 		}
218 		break;
219 
220 	case BENCH_FORMAT_SIMPLE:
221 		if (use_cycles) {
222 			printf("%lf\n", (double)rt.cycles/(double)p->size_total);
223 		} else {
224 			result_bps = (double)p->size_total/timeval2double(&rt.tv);
225 			printf("%lf\n", result_bps);
226 		}
227 		break;
228 
229 	default:
230 		BUG_ON(1);
231 		break;
232 	}
233 
234 out_test_failed:
235 out_free:
236 	if (r->fn.fini) r->fn.fini(info, p, &src, &dst);
237 	return;
238 out_init_failed:
239 	printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
240 			p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
241 	goto out_free;
242 }
243 
bench_mem_common(int argc,const char ** argv,struct bench_mem_info * info)244 static int bench_mem_common(int argc, const char **argv, struct bench_mem_info *info)
245 {
246 	int i;
247 	struct bench_params p = { 0 };
248 	unsigned int page_size;
249 
250 	argc = parse_options(argc, argv, info->options, info->usage, 0);
251 
252 	if (use_cycles) {
253 		i = init_cycles();
254 		if (i < 0) {
255 			fprintf(stderr, "Failed to open cycles counter\n");
256 			return i;
257 		}
258 	}
259 
260 	p.nr_loops = nr_loops;
261 	p.size = (size_t)perf_atoll((char *)size_str);
262 
263 	if ((s64)p.size <= 0) {
264 		fprintf(stderr, "Invalid size:%s\n", size_str);
265 		return 1;
266 	}
267 	p.size_total = p.size * p.nr_loops;
268 
269 	p.chunk_size = (size_t)perf_atoll((char *)chunk_size_str);
270 	if ((s64)p.chunk_size < 0 || (s64)p.chunk_size > (s64)p.size) {
271 		fprintf(stderr, "Invalid chunk_size:%s\n", chunk_size_str);
272 		return 1;
273 	}
274 	if (!p.chunk_size)
275 		p.chunk_size = p.size;
276 
277 	page_size = (unsigned int)perf_atoll((char *)page_size_str);
278 	if (page_size != (1 << PAGE_SHIFT_4KB) &&
279 	    page_size != (1 << PAGE_SHIFT_2MB) &&
280 	    page_size != (1 << PAGE_SHIFT_1GB)) {
281 		fprintf(stderr, "Invalid page-size:%s\n", page_size_str);
282 		return 1;
283 	}
284 	p.page_shift = ilog2(page_size);
285 
286 	p.seed = seed;
287 
288 	if (!strncmp(function_str, "all", 3)) {
289 		for (i = 0; info->functions[i].name; i++)
290 			__bench_mem_function(info, &p, i);
291 		return 0;
292 	}
293 
294 	for (i = 0; info->functions[i].name; i++) {
295 		if (!strcmp(info->functions[i].name, function_str))
296 			break;
297 	}
298 	if (!info->functions[i].name) {
299 		if (strcmp(function_str, "help") && strcmp(function_str, "h"))
300 			printf("Unknown function: %s\n", function_str);
301 		printf("Available functions:\n");
302 		for (i = 0; info->functions[i].name; i++) {
303 			printf("\t%s ... %s\n",
304 			       info->functions[i].name, info->functions[i].desc);
305 		}
306 		return 1;
307 	}
308 
309 	__bench_mem_function(info, &p, i);
310 
311 	return 0;
312 }
313 
/*
 * Touch both buffers once before the timed run so that page-fault overhead
 * is not part of the measurement.
 */
static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst)
{
	/* Always fault in the source explicitly, even if MMAP_THRESH is crossed. */
	memset(src, 0, size);

	/* One untimed copy with the routine under test faults in the destination. */
	fn(dst, src, size);
}
325 
do_memcpy(const struct function * r,struct bench_params * p,void * src,void * dst,union bench_clock * rt)326 static int do_memcpy(const struct function *r, struct bench_params *p,
327 		     void *src, void *dst, union bench_clock *rt)
328 {
329 	union bench_clock start, end;
330 	memcpy_t fn = r->fn.memcpy;
331 
332 	memcpy_prefault(fn, p->size, src, dst);
333 
334 	clock_get(&start);
335 	for (unsigned int i = 0; i < p->nr_loops; ++i)
336 		for (size_t off = 0; off < p->size; off += p->chunk_size)
337 			fn(dst + off, src + off, min(p->chunk_size, p->size - off));
338 	clock_get(&end);
339 
340 	*rt = clock_diff(&start, &end);
341 
342 	return 0;
343 }
344 
bench_mmap(size_t size,bool populate,unsigned int page_shift)345 static void *bench_mmap(size_t size, bool populate, unsigned int page_shift)
346 {
347 	void *p;
348 	int extra = populate ? MAP_POPULATE : 0;
349 
350 	if (page_shift != PAGE_SHIFT_4KB)
351 		extra |= MAP_HUGETLB | (page_shift << MAP_HUGE_SHIFT);
352 
353 	p = mmap(NULL, size, PROT_READ|PROT_WRITE,
354 		 extra | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
355 
356 	return p == MAP_FAILED ? NULL : p;
357 }
358 
/* Unmap a region obtained from bench_mmap(); a NULL pointer is ignored. */
static void bench_munmap(void *p, size_t size)
{
	if (!p)
		return;

	munmap(p, size);
}
364 
mem_alloc(struct bench_mem_info * info,struct bench_params * p,void ** src,void ** dst)365 static bool mem_alloc(struct bench_mem_info *info, struct bench_params *p,
366 		      void **src, void **dst)
367 {
368 	bool failed;
369 
370 	*dst = bench_mmap(p->size, true, p->page_shift);
371 	failed = *dst == NULL;
372 
373 	if (info->alloc_src) {
374 		*src = bench_mmap(p->size, true, p->page_shift);
375 		failed = failed || *src == NULL;
376 	}
377 
378 	return failed;
379 }
380 
/*
 * fini hook: unmap whichever of the two buffers were mapped and reset both
 * pointers to NULL.
 */
static void mem_free(struct bench_mem_info *info __maybe_unused,
		     struct bench_params *p __maybe_unused,
		     void **src, void **dst)
{
	void *dst_buf = *dst;
	void *src_buf = *src;

	*dst = NULL;
	*src = NULL;

	bench_munmap(dst_buf, p->size);
	bench_munmap(src_buf, p->size);
}
390 
/*
 * memcpy() implementations selectable with --function. The list is
 * terminated by an entry whose name is NULL. With HAVE_ARCH_X86_64_SUPPORT
 * further entries are generated by expanding MEMCPY_FN() for each line of
 * mem-memcpy-x86-64-asm-def.h.
 *
 * NOTE(review): unlike memset_functions and mmap_functions this table is
 * neither static nor const - confirm whether external linkage is required.
 */
391 struct function memcpy_functions[] = {
392 	{ .name		= "default",
393 	  .desc		= "Default memcpy() provided by glibc",
394 	  .fn.init	= mem_alloc,
395 	  .fn.fini	= mem_free,
396 	  .fn.memcpy	= memcpy },
397 
398 #ifdef HAVE_ARCH_X86_64_SUPPORT
399 # define MEMCPY_FN(_fn, _init, _fini, _name, _desc)	\
400 	{.name = _name, .desc = _desc, .fn.memcpy = _fn, .fn.init = _init, .fn.fini = _fini },
401 # include "mem-memcpy-x86-64-asm-def.h"
402 # undef MEMCPY_FN
403 #endif
404 
	/* Sentinel: loops over the table stop at the first NULL name. */
405 	{ .name = NULL, }
406 };
407 
/* NULL-terminated usage strings handed to parse_options() for the memcpy benchmark. */
408 static const char * const bench_mem_memcpy_usage[] = {
409 	"perf bench mem memcpy <options>",
410 	NULL
411 };
412 
bench_mem_memcpy(int argc,const char ** argv)413 int bench_mem_memcpy(int argc, const char **argv)
414 {
415 	struct bench_mem_info info = {
416 		.functions		= memcpy_functions,
417 		.do_op			= do_memcpy,
418 		.usage			= bench_mem_memcpy_usage,
419 		.options		= bench_mem_options,
420 		.alloc_src              = true,
421 	};
422 
423 	return bench_mem_common(argc, argv, &info);
424 }
425 
/*
 * Timed memset benchmark: after one untimed fill that faults in the buffer,
 * fill the p->size byte buffer p->nr_loops times in pieces of at most
 * p->chunk_size bytes, using the loop index as fill value, and store the
 * elapsed time in *rt. Always returns 0.
 */
static int do_memset(const struct function *r, struct bench_params *p,
		     void *src __maybe_unused, void *dst, union bench_clock *rt)
{
	memset_t fill = r->fn.memset;
	union bench_clock t0, t1;
	unsigned int loop;
	size_t pos;

	/* Prefault the freshly mapped range so page faults are not measured. */
	fill(dst, -1, p->size);

	clock_get(&t0);
	for (loop = 0; loop < p->nr_loops; ++loop) {
		pos = 0;
		while (pos < p->size) {
			/* The last piece may be shorter than chunk_size. */
			fill(dst + pos, loop, min(p->chunk_size, p->size - pos));
			pos += p->chunk_size;
		}
	}
	clock_get(&t1);

	*rt = clock_diff(&t0, &t1);

	return 0;
}
448 
/* NULL-terminated usage strings handed to parse_options() for the memset benchmark. */
449 static const char * const bench_mem_memset_usage[] = {
450 	"perf bench mem memset <options>",
451 	NULL
452 };
453 
/*
 * memset() implementations selectable with --function. The list is
 * terminated by an entry whose name is NULL. With HAVE_ARCH_X86_64_SUPPORT
 * further entries are generated by expanding MEMSET_FN() for each line of
 * mem-memset-x86-64-asm-def.h.
 */
454 static const struct function memset_functions[] = {
455 	{ .name		= "default",
456 	  .desc		= "Default memset() provided by glibc",
457 	  .fn.init	= mem_alloc,
458 	  .fn.fini	= mem_free,
459 	  .fn.memset	= memset },
460 
461 #ifdef HAVE_ARCH_X86_64_SUPPORT
462 # define MEMSET_FN(_fn, _init, _fini, _name, _desc) \
463 	{.name = _name, .desc = _desc, .fn.memset = _fn, .fn.init = _init, .fn.fini = _fini },
464 # include "mem-memset-x86-64-asm-def.h"
465 # undef MEMSET_FN
466 #endif
467 
	/* Sentinel: loops over the table stop at the first NULL name. */
468 	{ .name = NULL, }
469 };
470 
bench_mem_memset(int argc,const char ** argv)471 int bench_mem_memset(int argc, const char **argv)
472 {
473 	struct bench_mem_info info = {
474 		.functions		= memset_functions,
475 		.do_op			= do_memset,
476 		.usage			= bench_mem_memset_usage,
477 		.options		= bench_mem_options,
478 	};
479 
480 	return bench_mem_common(argc, argv, &info);
481 }
482 
/*
 * Write to one byte in every page of the @size byte region at @dst, where a
 * page is 1 << @page_shift bytes. The byte at the chosen offset is
 * incremented by the page index. With @random set the offset inside each
 * page is picked with rand(); otherwise the first byte of the page is used.
 */
static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, bool random)
{
	const int page_bytes = 1 << page_shift;
	unsigned long npages = size / page_bytes;
	unsigned long in_page = 0;
	char *page = dst;
	unsigned long idx;

	for (idx = 0; idx < npages; idx++, page += page_bytes) {
		if (random)
			in_page = rand() % page_bytes;

		page[in_page] = page[in_page] + idx;
	}
}
496 
/*
 * Timed mmap benchmark. For each of p->nr_loops iterations, map p->size
 * bytes (eagerly populated for the "populate" entry), run the entry's
 * mmap_op on the mapping and add the time for both steps to *accum; the
 * unmap that follows is not timed. rand() is seeded once when p->seed is
 * non-zero.
 *
 * Returns 0 on success, or -1 after printing a message when a mapping fails.
 */
static int do_mmap(const struct function *r, struct bench_params *p,
		  void *src __maybe_unused, void *dst __maybe_unused,
		  union bench_clock *accum)
{
	const bool populate = !strcmp(r->name, "populate");
	mmap_op_t touch = r->fn.mmap_op;
	unsigned int loop;

	if (p->seed)
		srand(p->seed);

	for (loop = 0; loop < p->nr_loops; loop++) {
		union bench_clock begin, finish, lap;

		clock_get(&begin);
		dst = bench_mmap(p->size, populate, p->page_shift);
		if (!dst) {
			printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
					p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
			return -1;
		}

		/* A non-zero seed also selects random in-page offsets. */
		touch(dst, p->size, p->page_shift, p->seed);
		clock_get(&finish);

		lap = clock_diff(&begin, &finish);
		clock_accum(accum, &lap);

		bench_munmap(dst, p->size);
	}

	return 0;
}
528 
/* NULL-terminated usage strings handed to parse_options() for the mmap benchmark. */
529 static const char * const bench_mem_mmap_usage[] = {
530 	"perf bench mem mmap <options>",
531 	NULL
532 };
533 
/*
 * Variants of the mmap benchmark. Both touch pages with mmap_page_touch();
 * do_mmap() tells them apart by name and maps with MAP_POPULATE only for
 * "populate". Neither entry sets init/fini hooks, because do_mmap() maps
 * and unmaps inside its own loop.
 */
534 static const struct function mmap_functions[] = {
535 	{ .name		= "demand",
536 	  .desc		= "Demand loaded mmap()",
537 	  .fn.mmap_op	= mmap_page_touch },
538 
539 	{ .name		= "populate",
540 	  .desc		= "Eagerly populated mmap()",
541 	  .fn.mmap_op	= mmap_page_touch },
542 
	/* Sentinel: loops over the table stop at the first NULL name. */
543 	{ .name = NULL, }
544 };
545 
bench_mem_mmap(int argc,const char ** argv)546 int bench_mem_mmap(int argc, const char **argv)
547 {
548 	static const struct option bench_mmap_options[] = {
549 		OPT_UINTEGER('r', "randomize", &seed,
550 			    "Seed to randomize page access offset."),
551 		OPT_PARENT(bench_common_options),
552 		OPT_END()
553 	};
554 
555 	struct bench_mem_info info = {
556 		.functions		= mmap_functions,
557 		.do_op			= do_mmap,
558 		.usage			= bench_mem_mmap_usage,
559 		.options		= bench_mmap_options,
560 	};
561 
562 	return bench_mem_common(argc, argv, &info);
563 }
564