108158c11SPuranjay Mohan // SPDX-License-Identifier: GPL-2.0 208158c11SPuranjay Mohan /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ 308158c11SPuranjay Mohan 408158c11SPuranjay Mohan #include <stdio.h> 508158c11SPuranjay Mohan #include <stdlib.h> 608158c11SPuranjay Mohan #include <string.h> 708158c11SPuranjay Mohan #include <math.h> 808158c11SPuranjay Mohan #include "bench_bpf_timing.h" 908158c11SPuranjay Mohan #include "bpf_util.h" 1008158c11SPuranjay Mohan 1108158c11SPuranjay Mohan struct timing_stats { 1208158c11SPuranjay Mohan double min, max; 1308158c11SPuranjay Mohan double median, p99; 1408158c11SPuranjay Mohan double mean, stddev; 1508158c11SPuranjay Mohan int count; 1608158c11SPuranjay Mohan }; 1708158c11SPuranjay Mohan 1808158c11SPuranjay Mohan static int cmp_double(const void *a, const void *b) 1908158c11SPuranjay Mohan { 2008158c11SPuranjay Mohan double da = *(const double *)a; 2108158c11SPuranjay Mohan double db = *(const double *)b; 2208158c11SPuranjay Mohan 2308158c11SPuranjay Mohan if (da < db) 2408158c11SPuranjay Mohan return -1; 2508158c11SPuranjay Mohan if (da > db) 2608158c11SPuranjay Mohan return 1; 2708158c11SPuranjay Mohan return 0; 2808158c11SPuranjay Mohan } 2908158c11SPuranjay Mohan 3008158c11SPuranjay Mohan static double percentile(const double *sorted, int n, double pct) 3108158c11SPuranjay Mohan { 3208158c11SPuranjay Mohan int idx = (int)(n * pct / 100.0); 3308158c11SPuranjay Mohan 3408158c11SPuranjay Mohan if (idx >= n) 3508158c11SPuranjay Mohan idx = n - 1; 3608158c11SPuranjay Mohan return sorted[idx]; 3708158c11SPuranjay Mohan } 3808158c11SPuranjay Mohan 3908158c11SPuranjay Mohan static int collect_samples(struct bpf_bench_timing *t, 4008158c11SPuranjay Mohan double *out, int max_out) 4108158c11SPuranjay Mohan { 4208158c11SPuranjay Mohan unsigned int nr_cpus = bpf_num_possible_cpus(); 4308158c11SPuranjay Mohan __u32 timed_iters = t->batch_iters; 4408158c11SPuranjay Mohan int total = 0; 4508158c11SPuranjay Mohan 4608158c11SPuranjay Mohan if (nr_cpus > BENCH_NR_CPUS) 4708158c11SPuranjay Mohan nr_cpus = BENCH_NR_CPUS; 4808158c11SPuranjay Mohan 4908158c11SPuranjay Mohan for (unsigned int cpu = 0; cpu < nr_cpus; cpu++) { 5008158c11SPuranjay Mohan __u32 count = t->idx[cpu]; 5108158c11SPuranjay Mohan 5208158c11SPuranjay Mohan if (count > BENCH_NR_SAMPLES) 5308158c11SPuranjay Mohan count = BENCH_NR_SAMPLES; 5408158c11SPuranjay Mohan 5508158c11SPuranjay Mohan for (__u32 i = 0; i < count && total < max_out; i++) { 5608158c11SPuranjay Mohan __u64 sample = t->samples[cpu][i]; 5708158c11SPuranjay Mohan 5808158c11SPuranjay Mohan if (sample == 0) 5908158c11SPuranjay Mohan continue; 6008158c11SPuranjay Mohan out[total++] = (double)sample / timed_iters; 6108158c11SPuranjay Mohan } 6208158c11SPuranjay Mohan } 6308158c11SPuranjay Mohan 6408158c11SPuranjay Mohan qsort(out, total, sizeof(double), cmp_double); 6508158c11SPuranjay Mohan return total; 6608158c11SPuranjay Mohan } 6708158c11SPuranjay Mohan 68*abac8acbSPuranjay Mohan static int filter_outliers_iqr(double *sorted, int n) 69*abac8acbSPuranjay Mohan { 70*abac8acbSPuranjay Mohan double q1, q3, iqr, lo, hi; 71*abac8acbSPuranjay Mohan int start = 0, end = n; 72*abac8acbSPuranjay Mohan 73*abac8acbSPuranjay Mohan if (n < 8) 74*abac8acbSPuranjay Mohan return n; 75*abac8acbSPuranjay Mohan 76*abac8acbSPuranjay Mohan q1 = sorted[n / 4]; 77*abac8acbSPuranjay Mohan q3 = sorted[3 * n / 4]; 78*abac8acbSPuranjay Mohan iqr = q3 - q1; 79*abac8acbSPuranjay Mohan lo = q1 - 1.5 * iqr; 80*abac8acbSPuranjay Mohan hi = q3 + 1.5 * iqr; 81*abac8acbSPuranjay Mohan 82*abac8acbSPuranjay Mohan while (start < end && sorted[start] < lo) 83*abac8acbSPuranjay Mohan start++; 84*abac8acbSPuranjay Mohan while (end > start && sorted[end - 1] > hi) 85*abac8acbSPuranjay Mohan end--; 86*abac8acbSPuranjay Mohan 87*abac8acbSPuranjay Mohan if (start > 0) 88*abac8acbSPuranjay Mohan memmove(sorted, sorted + start, (end - start) * sizeof(double)); 89*abac8acbSPuranjay Mohan 90*abac8acbSPuranjay Mohan return end - start; 91*abac8acbSPuranjay Mohan } 92*abac8acbSPuranjay Mohan 9308158c11SPuranjay Mohan static void compute_stats(const double *sorted, int n, 9408158c11SPuranjay Mohan struct timing_stats *s) 9508158c11SPuranjay Mohan { 9608158c11SPuranjay Mohan double sum = 0, var_sum = 0; 9708158c11SPuranjay Mohan 9808158c11SPuranjay Mohan memset(s, 0, sizeof(*s)); 9908158c11SPuranjay Mohan s->count = n; 10008158c11SPuranjay Mohan 10108158c11SPuranjay Mohan if (n == 0) 10208158c11SPuranjay Mohan return; 10308158c11SPuranjay Mohan 10408158c11SPuranjay Mohan s->min = sorted[0]; 10508158c11SPuranjay Mohan s->max = sorted[n - 1]; 10608158c11SPuranjay Mohan s->median = sorted[n / 2]; 10708158c11SPuranjay Mohan s->p99 = percentile(sorted, n, 99); 10808158c11SPuranjay Mohan 10908158c11SPuranjay Mohan for (int i = 0; i < n; i++) 11008158c11SPuranjay Mohan sum += sorted[i]; 11108158c11SPuranjay Mohan s->mean = sum / n; 11208158c11SPuranjay Mohan 11308158c11SPuranjay Mohan for (int i = 0; i < n; i++) { 11408158c11SPuranjay Mohan double d = sorted[i] - s->mean; 11508158c11SPuranjay Mohan 11608158c11SPuranjay Mohan var_sum += d * d; 11708158c11SPuranjay Mohan } 11808158c11SPuranjay Mohan s->stddev = n > 1 ? sqrt(var_sum / (n - 1)) : 0; 11908158c11SPuranjay Mohan } 12008158c11SPuranjay Mohan 12108158c11SPuranjay Mohan void bpf_bench_timing_measure(struct bpf_bench_timing *t, struct bench_res *res) 12208158c11SPuranjay Mohan { 12308158c11SPuranjay Mohan unsigned int nr_cpus; 12408158c11SPuranjay Mohan __u32 total_samples; 12508158c11SPuranjay Mohan int i; 12608158c11SPuranjay Mohan 12708158c11SPuranjay Mohan t->warmup_ticks++; 12808158c11SPuranjay Mohan 12908158c11SPuranjay Mohan if (t->warmup_ticks < env.warmup_sec) 13008158c11SPuranjay Mohan return; 13108158c11SPuranjay Mohan 13208158c11SPuranjay Mohan if (t->warmup_ticks == env.warmup_sec) { 13308158c11SPuranjay Mohan *t->timing_enabled = 1; 13408158c11SPuranjay Mohan return; 13508158c11SPuranjay Mohan } 13608158c11SPuranjay Mohan 13708158c11SPuranjay Mohan nr_cpus = bpf_num_possible_cpus(); 13808158c11SPuranjay Mohan if (nr_cpus > BENCH_NR_CPUS) 13908158c11SPuranjay Mohan nr_cpus = BENCH_NR_CPUS; 14008158c11SPuranjay Mohan 14108158c11SPuranjay Mohan total_samples = 0; 14208158c11SPuranjay Mohan for (i = 0; i < (int)nr_cpus; i++) { 14308158c11SPuranjay Mohan __u32 cnt = t->idx[i]; 14408158c11SPuranjay Mohan 14508158c11SPuranjay Mohan if (cnt > BENCH_NR_SAMPLES) 14608158c11SPuranjay Mohan cnt = BENCH_NR_SAMPLES; 14708158c11SPuranjay Mohan total_samples += cnt; 14808158c11SPuranjay Mohan } 14908158c11SPuranjay Mohan 15008158c11SPuranjay Mohan if (total_samples >= (__u32)env.producer_cnt * t->target_samples && !t->done) { 15108158c11SPuranjay Mohan t->done = true; 15208158c11SPuranjay Mohan *t->timing_enabled = 0; 15308158c11SPuranjay Mohan bench_force_done(); 15408158c11SPuranjay Mohan } 15508158c11SPuranjay Mohan } 15608158c11SPuranjay Mohan 15708158c11SPuranjay Mohan void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const char *description) 15808158c11SPuranjay Mohan { 15908158c11SPuranjay Mohan int max_out = BENCH_NR_CPUS * BENCH_NR_SAMPLES; 16008158c11SPuranjay Mohan struct timing_stats s; 16108158c11SPuranjay Mohan double *all; 16208158c11SPuranjay Mohan int total; 16308158c11SPuranjay Mohan 16408158c11SPuranjay Mohan all = calloc(max_out, sizeof(*all)); 16508158c11SPuranjay Mohan if (!all) { 16608158c11SPuranjay Mohan fprintf(stderr, "failed to allocate timing buffer\n"); 16708158c11SPuranjay Mohan return; 16808158c11SPuranjay Mohan } 16908158c11SPuranjay Mohan 17008158c11SPuranjay Mohan total = collect_samples(t, all, max_out); 17108158c11SPuranjay Mohan 17208158c11SPuranjay Mohan if (total == 0) { 17308158c11SPuranjay Mohan printf("No timing samples collected.\n"); 17408158c11SPuranjay Mohan free(all); 17508158c11SPuranjay Mohan return; 17608158c11SPuranjay Mohan } 17708158c11SPuranjay Mohan 178*abac8acbSPuranjay Mohan total = filter_outliers_iqr(all, total); 17908158c11SPuranjay Mohan compute_stats(all, total, &s); 18008158c11SPuranjay Mohan 18108158c11SPuranjay Mohan if (t->machine_readable) { 18208158c11SPuranjay Mohan printf("RESULT scenario=%s samples=%d median=%.2f stddev=%.2f cv=%.2f min=%.2f " 18308158c11SPuranjay Mohan "p99=%.2f max=%.2f\n", name, total, s.median, s.stddev, 18408158c11SPuranjay Mohan s.mean > 0 ? s.stddev / s.mean * 100.0 : 0.0, s.min, s.p99, s.max); 18508158c11SPuranjay Mohan } else { 18608158c11SPuranjay Mohan printf("%s: median %.2f ns/op, stddev %.2f, p99 %.2f (%d samples)\n", name, 18708158c11SPuranjay Mohan s.median, s.stddev, s.p99, total); 18808158c11SPuranjay Mohan } 18908158c11SPuranjay Mohan 19008158c11SPuranjay Mohan free(all); 19108158c11SPuranjay Mohan } 19208158c11SPuranjay Mohan 19308158c11SPuranjay Mohan #define CALIBRATE_SEED_BATCH 100 19408158c11SPuranjay Mohan #define CALIBRATE_MIN_BATCH 100 19508158c11SPuranjay Mohan #define CALIBRATE_MAX_BATCH 10000000 19608158c11SPuranjay Mohan #define CALIBRATE_TARGET_MS 10 19708158c11SPuranjay Mohan #define CALIBRATE_RUNS 5 19808158c11SPuranjay Mohan #define PROPORTIONALITY_TOL 0.05 /* 5% */ 19908158c11SPuranjay Mohan 20008158c11SPuranjay Mohan static void reset_timing(struct bpf_bench_timing *t) 20108158c11SPuranjay Mohan { 20208158c11SPuranjay Mohan *t->timing_enabled = 0; 20308158c11SPuranjay Mohan memset(t->samples, 0, sizeof(__u64) * BENCH_NR_CPUS * BENCH_NR_SAMPLES); 20408158c11SPuranjay Mohan memset(t->idx, 0, sizeof(__u32) * BENCH_NR_CPUS); 20508158c11SPuranjay Mohan } 20608158c11SPuranjay Mohan 20708158c11SPuranjay Mohan static __u64 measure_elapsed(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx, 20808158c11SPuranjay Mohan __u32 iters, int runs) 20908158c11SPuranjay Mohan { 21008158c11SPuranjay Mohan __u64 buf[CALIBRATE_RUNS]; 21108158c11SPuranjay Mohan int n = 0, i, j; 21208158c11SPuranjay Mohan 21308158c11SPuranjay Mohan reset_timing(t); 21408158c11SPuranjay Mohan *t->batch_iters_bss = iters; 21508158c11SPuranjay Mohan *t->timing_enabled = 1; 21608158c11SPuranjay Mohan 21708158c11SPuranjay Mohan for (i = 0; i < runs; i++) 21808158c11SPuranjay Mohan run_fn(run_ctx); 21908158c11SPuranjay Mohan 22008158c11SPuranjay Mohan *t->timing_enabled = 0; 22108158c11SPuranjay Mohan 22208158c11SPuranjay Mohan for (i = 0; i < BENCH_NR_CPUS && n < runs; i++) { 22308158c11SPuranjay Mohan __u32 cnt = t->idx[i]; 22408158c11SPuranjay Mohan 22508158c11SPuranjay Mohan for (j = 0; j < (int)cnt && n < runs; j++) 22608158c11SPuranjay Mohan buf[n++] = t->samples[i][j]; 22708158c11SPuranjay Mohan } 22808158c11SPuranjay Mohan 22908158c11SPuranjay Mohan if (n == 0) 23008158c11SPuranjay Mohan return 0; 23108158c11SPuranjay Mohan 23208158c11SPuranjay Mohan for (i = 1; i < n; i++) { 23308158c11SPuranjay Mohan __u64 key = buf[i]; 23408158c11SPuranjay Mohan 23508158c11SPuranjay Mohan j = i - 1; 23608158c11SPuranjay Mohan while (j >= 0 && buf[j] > key) { 23708158c11SPuranjay Mohan buf[j + 1] = buf[j]; 23808158c11SPuranjay Mohan j--; 23908158c11SPuranjay Mohan } 24008158c11SPuranjay Mohan buf[j + 1] = key; 24108158c11SPuranjay Mohan } 24208158c11SPuranjay Mohan 24308158c11SPuranjay Mohan return buf[n / 2]; 24408158c11SPuranjay Mohan } 24508158c11SPuranjay Mohan 24608158c11SPuranjay Mohan static __u32 compute_batch_iters(__u64 per_op_ns) 24708158c11SPuranjay Mohan { 24808158c11SPuranjay Mohan __u64 target_ns = (__u64)CALIBRATE_TARGET_MS * 1000000ULL; 24908158c11SPuranjay Mohan __u32 iters; 25008158c11SPuranjay Mohan 25108158c11SPuranjay Mohan if (per_op_ns == 0) 25208158c11SPuranjay Mohan return CALIBRATE_MIN_BATCH; 25308158c11SPuranjay Mohan 25408158c11SPuranjay Mohan iters = target_ns / per_op_ns; 25508158c11SPuranjay Mohan 25608158c11SPuranjay Mohan if (iters < CALIBRATE_MIN_BATCH) 25708158c11SPuranjay Mohan iters = CALIBRATE_MIN_BATCH; 25808158c11SPuranjay Mohan if (iters > CALIBRATE_MAX_BATCH) 25908158c11SPuranjay Mohan iters = CALIBRATE_MAX_BATCH; 26008158c11SPuranjay Mohan 26108158c11SPuranjay Mohan return iters; 26208158c11SPuranjay Mohan } 26308158c11SPuranjay Mohan 26408158c11SPuranjay Mohan void bpf_bench_calibrate(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx) 26508158c11SPuranjay Mohan { 26608158c11SPuranjay Mohan __u64 elapsed, per_op_ns; 26708158c11SPuranjay Mohan __u64 time_n, time_2n; 26808158c11SPuranjay Mohan double ratio; 26908158c11SPuranjay Mohan 27008158c11SPuranjay Mohan elapsed = measure_elapsed(t, run_fn, run_ctx, CALIBRATE_SEED_BATCH, CALIBRATE_RUNS); 27108158c11SPuranjay Mohan if (elapsed == 0) { 27208158c11SPuranjay Mohan fprintf(stderr, "calibration: no timing samples, using default\n"); 27308158c11SPuranjay Mohan t->batch_iters = 10000; 27408158c11SPuranjay Mohan *t->batch_iters_bss = t->batch_iters; 27508158c11SPuranjay Mohan reset_timing(t); 27608158c11SPuranjay Mohan return; 27708158c11SPuranjay Mohan } 27808158c11SPuranjay Mohan 27908158c11SPuranjay Mohan per_op_ns = elapsed / CALIBRATE_SEED_BATCH; 28008158c11SPuranjay Mohan t->batch_iters = compute_batch_iters(per_op_ns); 28108158c11SPuranjay Mohan 28208158c11SPuranjay Mohan time_n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters, CALIBRATE_RUNS); 28308158c11SPuranjay Mohan time_2n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters * 2, CALIBRATE_RUNS); 28408158c11SPuranjay Mohan 28508158c11SPuranjay Mohan if (time_n > 0 && time_2n > 0) { 28608158c11SPuranjay Mohan ratio = (double)time_2n / (double)time_n; 28708158c11SPuranjay Mohan 28808158c11SPuranjay Mohan if (fabs(ratio - 2.0) / 2.0 > PROPORTIONALITY_TOL) 28908158c11SPuranjay Mohan fprintf(stderr, 29008158c11SPuranjay Mohan "WARNING: proportionality check failed (2N/N ratio=%.3f, " 29108158c11SPuranjay Mohan "expected=2.000, error=%.1f%%)\n System noise may be affecting " 29208158c11SPuranjay Mohan "results.\n", 29308158c11SPuranjay Mohan ratio, fabs(ratio - 2.0) / 2.0 * 100.0); 29408158c11SPuranjay Mohan } 29508158c11SPuranjay Mohan 29608158c11SPuranjay Mohan *t->batch_iters_bss = t->batch_iters; 29708158c11SPuranjay Mohan reset_timing(t); 29808158c11SPuranjay Mohan } 299