1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2020 Facebook */
3 #define _GNU_SOURCE
4 #include <argp.h>
5 #include <unistd.h>
6 #include <stdint.h>
7 #include "bpf_util.h"
8 #include "bench.h"
9 #include "trigger_bench.skel.h"
10 #include "trace_helpers.h"
11
#define MAX_TRIG_BATCH_ITERS 1000

/* Runtime-configurable benchmark parameters (set via argp below). */
static struct {
	__u32 batch_iters; /* in-kernel trigger iterations per driver run */
} args = {
	.batch_iters = 100,
};

enum {
	ARG_TRIG_BATCH_ITERS = 7000,
};

static const struct argp_option opts[] = {
	{ "trig-batch-iters", ARG_TRIG_BATCH_ITERS, "BATCH_ITER_CNT", 0,
		"Number of in-kernel iterations per one driver test run"},
	{},
};

/* argp callback: parse and validate --trig-batch-iters.
 * Returns 0 on success, ARGP_ERR_UNKNOWN for keys we don't handle;
 * on invalid input argp_usage() prints usage and exits.
 */
static error_t parse_arg(int key, char *arg, struct argp_state *state)
{
	char *end;
	long ret;

	switch (key) {
	case ARG_TRIG_BATCH_ITERS:
		/* use endptr to reject empty strings and trailing garbage
		 * (e.g. "10abc"), which strtol(arg, NULL, 10) silently
		 * accepted; out-of-range/overflowed values fail the range
		 * check below
		 */
		ret = strtol(arg, &end, 10);
		if (end == arg || *end != '\0' ||
		    ret < 1 || ret > MAX_TRIG_BATCH_ITERS) {
			fprintf(stderr, "invalid --trig-batch-iters value (should be between %d and %d)\n",
				1, MAX_TRIG_BATCH_ITERS);
			argp_usage(state);
		}
		args.batch_iters = ret;
		break;
	default:
		return ARGP_ERR_UNKNOWN;
	}

	return 0;
}
50
/* Shared argp for all batched trigger benchmarks; referenced by the
 * BENCH_TRIG_KERNEL() bench definitions below.
 */
const struct argp bench_trigger_batch_argp = {
	.options = opts,
	.parser = parse_arg,
};
55
/* adjust slot shift in inc_counter() if changing */
#define MAX_BUCKETS 256

/* silence attribute warnings (e.g. for __nocf_check below) on compilers
 * that don't recognize them -- NOTE(review): presumably; confirm which
 * attribute actually triggers the warning
 */
#pragma GCC diagnostic ignored "-Wattributes"

/* BPF triggering benchmarks */
static struct trigger_ctx {
	struct trigger_bench *skel;   /* opened/loaded skeleton */
	bool usermode_counters;       /* count hits in user space instead of BPF */
	int driver_prog_fd;           /* optional override for the driver prog fd */
} ctx;

/* per-thread-bucketed hit counters used when counting in user space */
static struct counter base_hits[MAX_BUCKETS];
69
inc_counter(struct counter * counters)70 static __always_inline void inc_counter(struct counter *counters)
71 {
72 static __thread int tid = 0;
73 unsigned slot;
74
75 if (unlikely(tid == 0))
76 tid = sys_gettid();
77
78 /* multiplicative hashing, it's fast */
79 slot = 2654435769U * tid;
80 slot >>= 24;
81
82 atomic_inc(&base_hits[slot].value); /* use highest byte as an index */
83 }
84
sum_and_reset_counters(struct counter * counters)85 static long sum_and_reset_counters(struct counter *counters)
86 {
87 int i;
88 long sum = 0;
89
90 for (i = 0; i < MAX_BUCKETS; i++)
91 sum += atomic_swap(&counters[i].value, 0);
92 return sum;
93 }
94
trigger_validate(void)95 static void trigger_validate(void)
96 {
97 if (env.consumer_cnt != 0) {
98 fprintf(stderr, "benchmark doesn't support consumer!\n");
99 exit(1);
100 }
101 }
102
/* Producer for syscall-driven benchmarks: spin issuing a cheap syscall
 * (getpgid) that the attached BPF program triggers on.  When hits are
 * counted in user space, also bump the per-thread bucket; the flag check
 * is hoisted out of the loop so the hot path carries no extra branch.
 */
static void *trigger_producer(void *input)
{
	if (ctx.usermode_counters) {
		while (true) {
			(void)syscall(__NR_getpgid);
			inc_counter(base_hits);
		}
	} else {
		while (true)
			(void)syscall(__NR_getpgid);
	}
	/* unreachable: loops run until the benchmark process exits */
	return NULL;
}
116
/* Producer for batched benchmarks: repeatedly test-run the "driver" BPF
 * program, which performs a batch of trigger iterations per call (see
 * --trig-batch-iters).  A setup function may override the default driver
 * via ctx.driver_prog_fd; otherwise fall back to trigger_driver.
 */
static void *trigger_producer_batch(void *input)
{
	int fd = ctx.driver_prog_fd ?: bpf_program__fd(ctx.skel->progs.trigger_driver);

	while (true)
		bpf_prog_test_run_opts(fd, NULL);

	/* unreachable: loop runs until the benchmark process exits */
	return NULL;
}
126
trigger_measure(struct bench_res * res)127 static void trigger_measure(struct bench_res *res)
128 {
129 if (ctx.usermode_counters)
130 res->hits = sum_and_reset_counters(base_hits);
131 else
132 res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
133 }
134
/* Common open-and-configure step for skeleton-based benchmarks.  Opens
 * (but does not load) the skeleton so individual setup functions can
 * toggle autoload on exactly the programs they need before load_ctx().
 */
static void setup_ctx(void)
{
	setup_libbpf();

	ctx.skel = trigger_bench__open();
	if (!ctx.skel) {
		fprintf(stderr, "failed to open skeleton\n");
		exit(1);
	}

	/* default "driver" BPF program */
	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true);

	/* .rodata values must be set between open and load */
	ctx.skel->rodata->batch_iters = args.batch_iters;
	ctx.skel->rodata->stacktrace = env.stacktrace;
}
151
load_ctx(void)152 static void load_ctx(void)
153 {
154 int err;
155
156 err = trigger_bench__load(ctx.skel);
157 if (err) {
158 fprintf(stderr, "failed to open skeleton\n");
159 exit(1);
160 }
161 }
162
attach_bpf(struct bpf_program * prog)163 static void attach_bpf(struct bpf_program *prog)
164 {
165 struct bpf_link *link;
166
167 link = bpf_program__attach(prog);
168 if (!link) {
169 fprintf(stderr, "failed to attach program!\n");
170 exit(1);
171 }
172 }
173
/* Baseline syscall benchmark: no BPF at all, just count syscalls made by
 * trigger_producer() in user space.
 */
static void trigger_syscall_count_setup(void)
{
	ctx.usermode_counters = true;
}
178
/* Batched, staying mostly in-kernel triggering setups */

/* Baseline for batched benchmarks: the "kernel count" program acts as its
 * own driver, measuring pure in-kernel iteration overhead with no probe
 * attached.
 */
static void trigger_kernel_count_setup(void)
{
	setup_ctx();
	/* the default driver is replaced by trigger_kernel_count */
	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
	bpf_program__set_autoload(ctx.skel->progs.trigger_kernel_count, true);
	load_ctx();
	/* override driver program */
	ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_kernel_count);
}
189
/* Each of the following setups enables exactly one benchmark program of
 * the corresponding attach type, loads the skeleton, and attaches it;
 * the default trigger_driver then triggers it from trigger_producer_batch().
 */

/* kprobe benchmark */
static void trigger_kprobe_setup(void)
{
	setup_ctx();
	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe, true);
	load_ctx();
	attach_bpf(ctx.skel->progs.bench_trigger_kprobe);
}

/* kretprobe benchmark */
static void trigger_kretprobe_setup(void)
{
	setup_ctx();
	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe, true);
	load_ctx();
	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe);
}

/* kprobe-multi benchmark (single target) */
static void trigger_kprobe_multi_setup(void)
{
	setup_ctx();
	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kprobe_multi, true);
	load_ctx();
	attach_bpf(ctx.skel->progs.bench_trigger_kprobe_multi);
}

/* kretprobe-multi benchmark (single target) */
static void trigger_kretprobe_multi_setup(void)
{
	setup_ctx();
	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_kretprobe_multi, true);
	load_ctx();
	attach_bpf(ctx.skel->progs.bench_trigger_kretprobe_multi);
}

/* fentry benchmark */
static void trigger_fentry_setup(void)
{
	setup_ctx();
	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fentry, true);
	load_ctx();
	attach_bpf(ctx.skel->progs.bench_trigger_fentry);
}
229
/* Attach @empty (a no-op program) to essentially every attachable kernel
 * function via kprobe-multi, to measure the cost a wide attachment adds
 * to the actual benchmark probe.  Exits on any failure.
 *
 * @empty: the no-op kprobe-multi program to fan out
 * @kretprobe: attach as kretprobe-multi instead of kprobe-multi
 */
static void attach_ksyms_all(struct bpf_program *empty, bool kretprobe)
{
	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
	struct bpf_link *link = NULL;
	struct ksyms *ksyms = NULL;

	/* Some recursive functions will be skipped in
	 * bpf_get_ksyms -> skip_entry, as they can introduce sufficient
	 * overhead. However, it's difficult to skip all the recursive
	 * functions for a debug kernel.
	 *
	 * So, don't run the kprobe-multi-all and kretprobe-multi-all on
	 * a debug kernel.
	 */
	if (bpf_get_ksyms(&ksyms, true)) {
		fprintf(stderr, "failed to get ksyms\n");
		exit(1);
	}

	opts.syms = (const char **)ksyms->filtered_syms;
	opts.cnt = ksyms->filtered_cnt;
	opts.retprobe = kretprobe;
	/* attach empty to all the kernel functions except bpf_get_numa_node_id. */
	link = bpf_program__attach_kprobe_multi_opts(empty, NULL, &opts);
	/* syms were copied by the attach call; safe to free before the link check */
	free_kallsyms_local(ksyms);
	if (!link) {
		fprintf(stderr, "failed to attach bpf_program__attach_kprobe_multi_opts to all\n");
		exit(1);
	}
}
260
/* kprobe-multi benchmark with an empty program additionally attached to
 * (nearly) all kernel functions, to measure wide-attachment overhead.
 */
static void trigger_kprobe_multi_all_setup(void)
{
	struct bpf_program *prog, *empty;

	setup_ctx();
	empty = ctx.skel->progs.bench_kprobe_multi_empty;
	prog = ctx.skel->progs.bench_trigger_kprobe_multi;
	bpf_program__set_autoload(empty, true);
	bpf_program__set_autoload(prog, true);
	load_ctx();

	attach_ksyms_all(empty, false);
	attach_bpf(prog);
}

/* kretprobe-multi variant of the all-ksyms benchmark above */
static void trigger_kretprobe_multi_all_setup(void)
{
	struct bpf_program *prog, *empty;

	setup_ctx();
	empty = ctx.skel->progs.bench_kretprobe_multi_empty;
	prog = ctx.skel->progs.bench_trigger_kretprobe_multi;
	bpf_program__set_autoload(empty, true);
	bpf_program__set_autoload(prog, true);
	load_ctx();

	attach_ksyms_all(empty, true);
	attach_bpf(prog);
}
290
/* fexit benchmark */
static void trigger_fexit_setup(void)
{
	setup_ctx();
	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fexit, true);
	load_ctx();
	attach_bpf(ctx.skel->progs.bench_trigger_fexit);
}

/* fmod_ret benchmark; needs the kfunc-based driver instead of the default
 * one -- NOTE(review): presumably because fmod_ret must target a kfunc;
 * confirm against trigger_bench.bpf.c
 */
static void trigger_fmodret_setup(void)
{
	setup_ctx();
	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
	bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true);
	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fmodret, true);
	load_ctx();
	/* override driver program */
	ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc);
	attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
}

/* tracepoint benchmark; uses the kfunc-based driver like fmodret above */
static void trigger_tp_setup(void)
{
	setup_ctx();
	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
	bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true);
	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_tp, true);
	load_ctx();
	/* override driver program */
	ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc);
	attach_bpf(ctx.skel->progs.bench_trigger_tp);
}

/* raw tracepoint benchmark; uses the kfunc-based driver like tp above */
static void trigger_rawtp_setup(void)
{
	setup_ctx();
	bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
	bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true);
	bpf_program__set_autoload(ctx.skel->progs.bench_trigger_rawtp, true);
	load_ctx();
	/* override driver program */
	ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc);
	attach_bpf(ctx.skel->progs.bench_trigger_rawtp);
}
334
/* make sure call is not inlined and not avoided by compiler, so __weak and
 * inline asm volatile in the body of the function
 *
 * There is a performance difference between uprobing at nop location vs other
 * instructions. So use two different targets, one of which starts with nop
 * and another doesn't.
 *
 * GCC doesn't generate stack setup preamble for these functions due to them
 * having no input arguments and doing nothing in the body.
 */
/* uprobe target that begins with a single-byte nop */
__nocf_check __weak void uprobe_target_nop(void)
{
	asm volatile ("nop");
}

/* opaque callee for uprobe_target_push(); __weak prevents the compiler
 * from proving it empty and eliding the call
 */
__weak void opaque_noop_func(void)
{
}

/* uprobe target whose first instruction is NOT a nop (function-call setup) */
__nocf_check __weak int uprobe_target_push(void)
{
	/* overhead of function call is negligible compared to uprobe
	 * triggering, so this shouldn't affect benchmark results much
	 */
	opaque_noop_func();
	return 1;
}

/* uprobe target that is effectively just a return instruction */
__nocf_check __weak void uprobe_target_ret(void)
{
	asm volatile ("");
}
367
/* Producer threads for uprobe benchmarks: each spins calling one of the
 * uprobe_target_*() functions above.  All loop forever; the benchmark
 * harness terminates the process when done.
 */

/* baseline: call the target and count in user space (no probe attached) */
static void *uprobe_producer_count(void *input)
{
	while (true) {
		uprobe_target_nop();
		inc_counter(base_hits);
	}
	return NULL;
}

/* hammer the nop-first target */
static void *uprobe_producer_nop(void *input)
{
	while (true)
		uprobe_target_nop();
	return NULL;
}

/* hammer the call-first (non-nop) target */
static void *uprobe_producer_push(void *input)
{
	while (true)
		uprobe_target_push();
	return NULL;
}

/* hammer the return-only target */
static void *uprobe_producer_ret(void *input)
{
	while (true)
		uprobe_target_ret();
	return NULL;
}
397
#ifdef __x86_64__
/* x86-64-only target starting with a 5-byte nop (0f 1f 44 00 00) -- wide
 * enough for uprobe optimizations that rewrite a full 5-byte instruction
 */
__nocf_check __weak void uprobe_target_nop5(void)
{
	asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00");
}

/* hammer the 5-byte-nop target */
static void *uprobe_producer_nop5(void *input)
{
	while (true)
		uprobe_target_nop5();
	return NULL;
}
#endif
411
/* Common setup for uprobe/uretprobe benchmarks: load the skeleton with
 * either the single-uprobe or multi-uprobe benchmark program enabled and
 * attach it at @target_addr inside our own executable.  Exits on failure.
 *
 * @use_retprobe: attach as uretprobe (fires on return) instead of uprobe
 * @use_multi: use the uprobe-multi attach API and program flavor
 * @target_addr: address of one of the uprobe_target_*() functions
 */
static void usetup(bool use_retprobe, bool use_multi, void *target_addr)
{
	size_t uprobe_offset;
	struct bpf_link *link;
	int err;

	setup_libbpf();

	ctx.skel = trigger_bench__open();
	if (!ctx.skel) {
		fprintf(stderr, "failed to open skeleton\n");
		exit(1);
	}

	/* load only the program flavor this benchmark needs */
	if (use_multi)
		bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe_multi, true);
	else
		bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true);

	err = trigger_bench__load(ctx.skel);
	if (err) {
		fprintf(stderr, "failed to load skeleton\n");
		exit(1);
	}

	/* NOTE(review): get_uprobe_offset() result is not checked for
	 * failure -- presumably it cannot fail for our own symbols; confirm
	 * against trace_helpers.c
	 */
	uprobe_offset = get_uprobe_offset(target_addr);
	if (use_multi) {
		LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
			.retprobe = use_retprobe,
			.cnt = 1,
			.offsets = &uprobe_offset,
		);
		link = bpf_program__attach_uprobe_multi(
			ctx.skel->progs.bench_trigger_uprobe_multi,
			-1 /* all PIDs */, "/proc/self/exe", NULL, &opts);
		/* stash the link on the skeleton so it is destroyed with it */
		ctx.skel->links.bench_trigger_uprobe_multi = link;
	} else {
		link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
						  use_retprobe,
						  -1 /* all PIDs */,
						  "/proc/self/exe",
						  uprobe_offset);
		ctx.skel->links.bench_trigger_uprobe = link;
	}
	if (!link) {
		fprintf(stderr, "failed to attach %s!\n", use_multi ? "multi-uprobe" : "uprobe");
		exit(1);
	}
}
461
/* Baseline uprobe benchmark: no probe attached; uprobe_producer_count()
 * counts target calls in user space.
 */
static void usermode_count_setup(void)
{
	ctx.usermode_counters = true;
}
466
/* Thin wrappers binding every (uprobe|uretprobe) x (single|multi) x target
 * combination to usetup(); each backs one bench definition below.
 */

static void uprobe_nop_setup(void)
{
	usetup(false, false /* !use_multi */, &uprobe_target_nop);
}

static void uretprobe_nop_setup(void)
{
	usetup(true, false /* !use_multi */, &uprobe_target_nop);
}

static void uprobe_push_setup(void)
{
	usetup(false, false /* !use_multi */, &uprobe_target_push);
}

static void uretprobe_push_setup(void)
{
	usetup(true, false /* !use_multi */, &uprobe_target_push);
}

static void uprobe_ret_setup(void)
{
	usetup(false, false /* !use_multi */, &uprobe_target_ret);
}

static void uretprobe_ret_setup(void)
{
	usetup(true, false /* !use_multi */, &uprobe_target_ret);
}

static void uprobe_multi_nop_setup(void)
{
	usetup(false, true /* use_multi */, &uprobe_target_nop);
}

static void uretprobe_multi_nop_setup(void)
{
	usetup(true, true /* use_multi */, &uprobe_target_nop);
}

static void uprobe_multi_push_setup(void)
{
	usetup(false, true /* use_multi */, &uprobe_target_push);
}

static void uretprobe_multi_push_setup(void)
{
	usetup(true, true /* use_multi */, &uprobe_target_push);
}

static void uprobe_multi_ret_setup(void)
{
	usetup(false, true /* use_multi */, &uprobe_target_ret);
}

static void uretprobe_multi_ret_setup(void)
{
	usetup(true, true /* use_multi */, &uprobe_target_ret);
}
526
#ifdef __x86_64__
/* x86-64-only wrappers for the 5-byte-nop target */
static void uprobe_nop5_setup(void)
{
	usetup(false, false /* !use_multi */, &uprobe_target_nop5);
}

static void uretprobe_nop5_setup(void)
{
	usetup(true, false /* !use_multi */, &uprobe_target_nop5);
}

static void uprobe_multi_nop5_setup(void)
{
	usetup(false, true /* use_multi */, &uprobe_target_nop5);
}

static void uretprobe_multi_nop5_setup(void)
{
	usetup(true, true /* use_multi */, &uprobe_target_nop5);
}
#endif
548
/* syscall baseline benchmark: user-space counting, no BPF program */
const struct bench bench_trig_syscall_count = {
	.name = "trig-syscall-count",
	.validate = trigger_validate,
	.setup = trigger_syscall_count_setup,
	.producer_thread = trigger_producer,
	.measure = trigger_measure,
	.report_progress = hits_drops_report_progress,
	.report_final = hits_drops_report_final,
};
558
/* batched (staying mostly in kernel) kprobe/fentry benchmarks */
/* Stamps out a `bench_trig_<KIND>` definition named "trig-<NAME>" wired to
 * trigger_<KIND>_setup() and the batched producer; all share the
 * --trig-batch-iters argp.
 */
#define BENCH_TRIG_KERNEL(KIND, NAME) \
const struct bench bench_trig_##KIND = { \
	.name = "trig-" NAME, \
	.setup = trigger_##KIND##_setup, \
	.producer_thread = trigger_producer_batch, \
	.measure = trigger_measure, \
	.report_progress = hits_drops_report_progress, \
	.report_final = hits_drops_report_final, \
	.argp = &bench_trigger_batch_argp, \
}

BENCH_TRIG_KERNEL(kernel_count, "kernel-count");
BENCH_TRIG_KERNEL(kprobe, "kprobe");
BENCH_TRIG_KERNEL(kretprobe, "kretprobe");
BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi");
BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi");
BENCH_TRIG_KERNEL(fentry, "fentry");
BENCH_TRIG_KERNEL(kprobe_multi_all, "kprobe-multi-all");
BENCH_TRIG_KERNEL(kretprobe_multi_all, "kretprobe-multi-all");
BENCH_TRIG_KERNEL(fexit, "fexit");
BENCH_TRIG_KERNEL(fmodret, "fmodret");
BENCH_TRIG_KERNEL(tp, "tp");
BENCH_TRIG_KERNEL(rawtp, "rawtp");
583
/* uprobe benchmarks */
/* Stamps out a `bench_trig_<KIND>` definition named "trig-<NAME>" wired to
 * <KIND>_setup() and the uprobe_producer_<PRODUCER>() thread.
 */
#define BENCH_TRIG_USERMODE(KIND, PRODUCER, NAME) \
const struct bench bench_trig_##KIND = { \
	.name = "trig-" NAME, \
	.validate = trigger_validate, \
	.setup = KIND##_setup, \
	.producer_thread = uprobe_producer_##PRODUCER, \
	.measure = trigger_measure, \
	.report_progress = hits_drops_report_progress, \
	.report_final = hits_drops_report_final, \
}

BENCH_TRIG_USERMODE(usermode_count, count, "usermode-count");
BENCH_TRIG_USERMODE(uprobe_nop, nop, "uprobe-nop");
BENCH_TRIG_USERMODE(uprobe_push, push, "uprobe-push");
BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret");
BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop");
BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push");
BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret");
BENCH_TRIG_USERMODE(uprobe_multi_nop, nop, "uprobe-multi-nop");
BENCH_TRIG_USERMODE(uprobe_multi_push, push, "uprobe-multi-push");
BENCH_TRIG_USERMODE(uprobe_multi_ret, ret, "uprobe-multi-ret");
BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop");
BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push");
BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret");
#ifdef __x86_64__
BENCH_TRIG_USERMODE(uprobe_nop5, nop5, "uprobe-nop5");
BENCH_TRIG_USERMODE(uretprobe_nop5, nop5, "uretprobe-nop5");
BENCH_TRIG_USERMODE(uprobe_multi_nop5, nop5, "uprobe-multi-nop5");
BENCH_TRIG_USERMODE(uretprobe_multi_nop5, nop5, "uretprobe-multi-nop5");
#endif
615