xref: /linux/tools/testing/selftests/bpf/benchs/bench_bpf_timing.c (revision abac8acb633a9448369d658889ac2bcfbd96f54b)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
3 
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <math.h>
8 #include "bench_bpf_timing.h"
9 #include "bpf_util.h"
10 
11 struct timing_stats {
12 	double min, max;
13 	double median, p99;
14 	double mean, stddev;
15 	int count;
16 };
17 
18 static int cmp_double(const void *a, const void *b)
19 {
20 	double da = *(const double *)a;
21 	double db = *(const double *)b;
22 
23 	if (da < db)
24 		return -1;
25 	if (da > db)
26 		return 1;
27 	return 0;
28 }
29 
30 static double percentile(const double *sorted, int n, double pct)
31 {
32 	int idx = (int)(n * pct / 100.0);
33 
34 	if (idx >= n)
35 		idx = n - 1;
36 	return sorted[idx];
37 }
38 
39 static int collect_samples(struct bpf_bench_timing *t,
40 			   double *out, int max_out)
41 {
42 	unsigned int nr_cpus = bpf_num_possible_cpus();
43 	__u32 timed_iters = t->batch_iters;
44 	int total = 0;
45 
46 	if (nr_cpus > BENCH_NR_CPUS)
47 		nr_cpus = BENCH_NR_CPUS;
48 
49 	for (unsigned int cpu = 0; cpu < nr_cpus; cpu++) {
50 		__u32 count = t->idx[cpu];
51 
52 		if (count > BENCH_NR_SAMPLES)
53 			count = BENCH_NR_SAMPLES;
54 
55 		for (__u32 i = 0; i < count && total < max_out; i++) {
56 			__u64 sample = t->samples[cpu][i];
57 
58 			if (sample == 0)
59 				continue;
60 			out[total++] = (double)sample / timed_iters;
61 		}
62 	}
63 
64 	qsort(out, total, sizeof(double), cmp_double);
65 	return total;
66 }
67 
68 static int filter_outliers_iqr(double *sorted, int n)
69 {
70 	double q1, q3, iqr, lo, hi;
71 	int start = 0, end = n;
72 
73 	if (n < 8)
74 		return n;
75 
76 	q1 = sorted[n / 4];
77 	q3 = sorted[3 * n / 4];
78 	iqr = q3 - q1;
79 	lo = q1 - 1.5 * iqr;
80 	hi = q3 + 1.5 * iqr;
81 
82 	while (start < end && sorted[start] < lo)
83 		start++;
84 	while (end > start && sorted[end - 1] > hi)
85 		end--;
86 
87 	if (start > 0)
88 		memmove(sorted, sorted + start, (end - start) * sizeof(double));
89 
90 	return end - start;
91 }
92 
93 static void compute_stats(const double *sorted, int n,
94 			  struct timing_stats *s)
95 {
96 	double sum = 0, var_sum = 0;
97 
98 	memset(s, 0, sizeof(*s));
99 	s->count = n;
100 
101 	if (n == 0)
102 		return;
103 
104 	s->min    = sorted[0];
105 	s->max    = sorted[n - 1];
106 	s->median = sorted[n / 2];
107 	s->p99    = percentile(sorted, n, 99);
108 
109 	for (int i = 0; i < n; i++)
110 		sum += sorted[i];
111 	s->mean = sum / n;
112 
113 	for (int i = 0; i < n; i++) {
114 		double d = sorted[i] - s->mean;
115 
116 		var_sum += d * d;
117 	}
118 	s->stddev = n > 1 ? sqrt(var_sum / (n - 1)) : 0;
119 }
120 
121 void bpf_bench_timing_measure(struct bpf_bench_timing *t, struct bench_res *res)
122 {
123 	unsigned int nr_cpus;
124 	__u32 total_samples;
125 	int i;
126 
127 	t->warmup_ticks++;
128 
129 	if (t->warmup_ticks < env.warmup_sec)
130 		return;
131 
132 	if (t->warmup_ticks == env.warmup_sec) {
133 		*t->timing_enabled = 1;
134 		return;
135 	}
136 
137 	nr_cpus = bpf_num_possible_cpus();
138 	if (nr_cpus > BENCH_NR_CPUS)
139 		nr_cpus = BENCH_NR_CPUS;
140 
141 	total_samples = 0;
142 	for (i = 0; i < (int)nr_cpus; i++) {
143 		__u32 cnt = t->idx[i];
144 
145 		if (cnt > BENCH_NR_SAMPLES)
146 			cnt = BENCH_NR_SAMPLES;
147 		total_samples += cnt;
148 	}
149 
150 	if (total_samples >= (__u32)env.producer_cnt * t->target_samples && !t->done) {
151 		t->done = true;
152 		*t->timing_enabled = 0;
153 		bench_force_done();
154 	}
155 }
156 
157 void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const char *description)
158 {
159 	int max_out = BENCH_NR_CPUS * BENCH_NR_SAMPLES;
160 	struct timing_stats s;
161 	double *all;
162 	int total;
163 
164 	all = calloc(max_out, sizeof(*all));
165 	if (!all) {
166 		fprintf(stderr, "failed to allocate timing buffer\n");
167 		return;
168 	}
169 
170 	total = collect_samples(t, all, max_out);
171 
172 	if (total == 0) {
173 		printf("No timing samples collected.\n");
174 		free(all);
175 		return;
176 	}
177 
178 	total = filter_outliers_iqr(all, total);
179 	compute_stats(all, total, &s);
180 
181 	if (t->machine_readable) {
182 		printf("RESULT scenario=%s samples=%d median=%.2f stddev=%.2f cv=%.2f min=%.2f "
183 		       "p99=%.2f max=%.2f\n", name, total, s.median, s.stddev,
184 		       s.mean > 0 ? s.stddev / s.mean * 100.0 : 0.0, s.min, s.p99, s.max);
185 	} else {
186 		printf("%s: median %.2f ns/op, stddev %.2f, p99 %.2f (%d samples)\n", name,
187 		       s.median, s.stddev, s.p99, total);
188 	}
189 
190 	free(all);
191 }
192 
193 #define CALIBRATE_SEED_BATCH	100
194 #define CALIBRATE_MIN_BATCH	100
195 #define CALIBRATE_MAX_BATCH	10000000
196 #define CALIBRATE_TARGET_MS	10
197 #define CALIBRATE_RUNS		5
198 #define PROPORTIONALITY_TOL	0.05	/* 5% */
199 
200 static void reset_timing(struct bpf_bench_timing *t)
201 {
202 	*t->timing_enabled = 0;
203 	memset(t->samples, 0, sizeof(__u64) * BENCH_NR_CPUS * BENCH_NR_SAMPLES);
204 	memset(t->idx, 0, sizeof(__u32) * BENCH_NR_CPUS);
205 }
206 
207 static __u64 measure_elapsed(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx,
208 			     __u32 iters, int runs)
209 {
210 	__u64 buf[CALIBRATE_RUNS];
211 	int n = 0, i, j;
212 
213 	reset_timing(t);
214 	*t->batch_iters_bss = iters;
215 	*t->timing_enabled = 1;
216 
217 	for (i = 0; i < runs; i++)
218 		run_fn(run_ctx);
219 
220 	*t->timing_enabled = 0;
221 
222 	for (i = 0; i < BENCH_NR_CPUS && n < runs; i++) {
223 		__u32 cnt = t->idx[i];
224 
225 		for (j = 0; j < (int)cnt && n < runs; j++)
226 			buf[n++] = t->samples[i][j];
227 	}
228 
229 	if (n == 0)
230 		return 0;
231 
232 	for (i = 1; i < n; i++) {
233 		__u64 key = buf[i];
234 
235 		j = i - 1;
236 		while (j >= 0 && buf[j] > key) {
237 			buf[j + 1] = buf[j];
238 			j--;
239 		}
240 		buf[j + 1] = key;
241 	}
242 
243 	return buf[n / 2];
244 }
245 
246 static __u32 compute_batch_iters(__u64 per_op_ns)
247 {
248 	__u64 target_ns = (__u64)CALIBRATE_TARGET_MS * 1000000ULL;
249 	__u32 iters;
250 
251 	if (per_op_ns == 0)
252 		return CALIBRATE_MIN_BATCH;
253 
254 	iters = target_ns / per_op_ns;
255 
256 	if (iters < CALIBRATE_MIN_BATCH)
257 		iters = CALIBRATE_MIN_BATCH;
258 	if (iters > CALIBRATE_MAX_BATCH)
259 		iters = CALIBRATE_MAX_BATCH;
260 
261 	return iters;
262 }
263 
264 void bpf_bench_calibrate(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx)
265 {
266 	__u64 elapsed, per_op_ns;
267 	__u64 time_n, time_2n;
268 	double ratio;
269 
270 	elapsed = measure_elapsed(t, run_fn, run_ctx, CALIBRATE_SEED_BATCH, CALIBRATE_RUNS);
271 	if (elapsed == 0) {
272 		fprintf(stderr, "calibration: no timing samples, using default\n");
273 		t->batch_iters = 10000;
274 		*t->batch_iters_bss = t->batch_iters;
275 		reset_timing(t);
276 		return;
277 	}
278 
279 	per_op_ns = elapsed / CALIBRATE_SEED_BATCH;
280 	t->batch_iters = compute_batch_iters(per_op_ns);
281 
282 	time_n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters, CALIBRATE_RUNS);
283 	time_2n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters * 2, CALIBRATE_RUNS);
284 
285 	if (time_n > 0 && time_2n > 0) {
286 		ratio = (double)time_2n / (double)time_n;
287 
288 		if (fabs(ratio - 2.0) / 2.0 > PROPORTIONALITY_TOL)
289 			fprintf(stderr,
290 				"WARNING: proportionality check failed (2N/N ratio=%.3f, "
291 				"expected=2.000, error=%.1f%%)\n  System noise may be affecting "
292 				"results.\n",
293 				ratio, fabs(ratio - 2.0) / 2.0 * 100.0);
294 	}
295 
296 	*t->batch_iters_bss = t->batch_iters;
297 	reset_timing(t);
298 }
299