1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ 3 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <string.h> 7 #include <math.h> 8 #include "bench_bpf_timing.h" 9 #include "bpf_util.h" 10 11 struct timing_stats { 12 double min, max; 13 double median, p99; 14 double mean, stddev; 15 int count; 16 }; 17 18 static int cmp_double(const void *a, const void *b) 19 { 20 double da = *(const double *)a; 21 double db = *(const double *)b; 22 23 if (da < db) 24 return -1; 25 if (da > db) 26 return 1; 27 return 0; 28 } 29 30 static double percentile(const double *sorted, int n, double pct) 31 { 32 int idx = (int)(n * pct / 100.0); 33 34 if (idx >= n) 35 idx = n - 1; 36 return sorted[idx]; 37 } 38 39 static int collect_samples(struct bpf_bench_timing *t, 40 double *out, int max_out) 41 { 42 unsigned int nr_cpus = bpf_num_possible_cpus(); 43 __u32 timed_iters = t->batch_iters; 44 int total = 0; 45 46 if (nr_cpus > BENCH_NR_CPUS) 47 nr_cpus = BENCH_NR_CPUS; 48 49 for (unsigned int cpu = 0; cpu < nr_cpus; cpu++) { 50 __u32 count = t->idx[cpu]; 51 52 if (count > BENCH_NR_SAMPLES) 53 count = BENCH_NR_SAMPLES; 54 55 for (__u32 i = 0; i < count && total < max_out; i++) { 56 __u64 sample = t->samples[cpu][i]; 57 58 if (sample == 0) 59 continue; 60 out[total++] = (double)sample / timed_iters; 61 } 62 } 63 64 qsort(out, total, sizeof(double), cmp_double); 65 return total; 66 } 67 68 static int filter_outliers_iqr(double *sorted, int n) 69 { 70 double q1, q3, iqr, lo, hi; 71 int start = 0, end = n; 72 73 if (n < 8) 74 return n; 75 76 q1 = sorted[n / 4]; 77 q3 = sorted[3 * n / 4]; 78 iqr = q3 - q1; 79 lo = q1 - 1.5 * iqr; 80 hi = q3 + 1.5 * iqr; 81 82 while (start < end && sorted[start] < lo) 83 start++; 84 while (end > start && sorted[end - 1] > hi) 85 end--; 86 87 if (start > 0) 88 memmove(sorted, sorted + start, (end - start) * sizeof(double)); 89 90 return end - start; 91 } 92 93 static void compute_stats(const double *sorted, int n, 94 struct timing_stats *s) 95 { 96 double sum = 0, var_sum = 0; 97 98 memset(s, 0, sizeof(*s)); 99 s->count = n; 100 101 if (n == 0) 102 return; 103 104 s->min = sorted[0]; 105 s->max = sorted[n - 1]; 106 s->median = sorted[n / 2]; 107 s->p99 = percentile(sorted, n, 99); 108 109 for (int i = 0; i < n; i++) 110 sum += sorted[i]; 111 s->mean = sum / n; 112 113 for (int i = 0; i < n; i++) { 114 double d = sorted[i] - s->mean; 115 116 var_sum += d * d; 117 } 118 s->stddev = n > 1 ? sqrt(var_sum / (n - 1)) : 0; 119 } 120 121 void bpf_bench_timing_measure(struct bpf_bench_timing *t, struct bench_res *res) 122 { 123 unsigned int nr_cpus; 124 __u32 total_samples; 125 int i; 126 127 t->warmup_ticks++; 128 129 if (t->warmup_ticks < env.warmup_sec) 130 return; 131 132 if (t->warmup_ticks == env.warmup_sec) { 133 *t->timing_enabled = 1; 134 return; 135 } 136 137 nr_cpus = bpf_num_possible_cpus(); 138 if (nr_cpus > BENCH_NR_CPUS) 139 nr_cpus = BENCH_NR_CPUS; 140 141 total_samples = 0; 142 for (i = 0; i < (int)nr_cpus; i++) { 143 __u32 cnt = t->idx[i]; 144 145 if (cnt > BENCH_NR_SAMPLES) 146 cnt = BENCH_NR_SAMPLES; 147 total_samples += cnt; 148 } 149 150 if (total_samples >= (__u32)env.producer_cnt * t->target_samples && !t->done) { 151 t->done = true; 152 *t->timing_enabled = 0; 153 bench_force_done(); 154 } 155 } 156 157 void bpf_bench_timing_report(struct bpf_bench_timing *t, const char *name, const char *description) 158 { 159 int max_out = BENCH_NR_CPUS * BENCH_NR_SAMPLES; 160 struct timing_stats s; 161 double *all; 162 int total; 163 164 all = calloc(max_out, sizeof(*all)); 165 if (!all) { 166 fprintf(stderr, "failed to allocate timing buffer\n"); 167 return; 168 } 169 170 total = collect_samples(t, all, max_out); 171 172 if (total == 0) { 173 printf("No timing samples collected.\n"); 174 free(all); 175 return; 176 } 177 178 total = filter_outliers_iqr(all, total); 179 compute_stats(all, total, &s); 180 181 if (t->machine_readable) { 182 printf("RESULT scenario=%s samples=%d median=%.2f stddev=%.2f cv=%.2f min=%.2f " 183 "p99=%.2f max=%.2f\n", name, total, s.median, s.stddev, 184 s.mean > 0 ? s.stddev / s.mean * 100.0 : 0.0, s.min, s.p99, s.max); 185 } else { 186 printf("%s: median %.2f ns/op, stddev %.2f, p99 %.2f (%d samples)\n", name, 187 s.median, s.stddev, s.p99, total); 188 } 189 190 free(all); 191 } 192 193 #define CALIBRATE_SEED_BATCH 100 194 #define CALIBRATE_MIN_BATCH 100 195 #define CALIBRATE_MAX_BATCH 10000000 196 #define CALIBRATE_TARGET_MS 10 197 #define CALIBRATE_RUNS 5 198 #define PROPORTIONALITY_TOL 0.05 /* 5% */ 199 200 static void reset_timing(struct bpf_bench_timing *t) 201 { 202 *t->timing_enabled = 0; 203 memset(t->samples, 0, sizeof(__u64) * BENCH_NR_CPUS * BENCH_NR_SAMPLES); 204 memset(t->idx, 0, sizeof(__u32) * BENCH_NR_CPUS); 205 } 206 207 static __u64 measure_elapsed(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx, 208 __u32 iters, int runs) 209 { 210 __u64 buf[CALIBRATE_RUNS]; 211 int n = 0, i, j; 212 213 reset_timing(t); 214 *t->batch_iters_bss = iters; 215 *t->timing_enabled = 1; 216 217 for (i = 0; i < runs; i++) 218 run_fn(run_ctx); 219 220 *t->timing_enabled = 0; 221 222 for (i = 0; i < BENCH_NR_CPUS && n < runs; i++) { 223 __u32 cnt = t->idx[i]; 224 225 for (j = 0; j < (int)cnt && n < runs; j++) 226 buf[n++] = t->samples[i][j]; 227 } 228 229 if (n == 0) 230 return 0; 231 232 for (i = 1; i < n; i++) { 233 __u64 key = buf[i]; 234 235 j = i - 1; 236 while (j >= 0 && buf[j] > key) { 237 buf[j + 1] = buf[j]; 238 j--; 239 } 240 buf[j + 1] = key; 241 } 242 243 return buf[n / 2]; 244 } 245 246 static __u32 compute_batch_iters(__u64 per_op_ns) 247 { 248 __u64 target_ns = (__u64)CALIBRATE_TARGET_MS * 1000000ULL; 249 __u32 iters; 250 251 if (per_op_ns == 0) 252 return CALIBRATE_MIN_BATCH; 253 254 iters = target_ns / per_op_ns; 255 256 if (iters < CALIBRATE_MIN_BATCH) 257 iters = CALIBRATE_MIN_BATCH; 258 if (iters > CALIBRATE_MAX_BATCH) 259 iters = CALIBRATE_MAX_BATCH; 260 261 return iters; 262 } 263 264 void bpf_bench_calibrate(struct bpf_bench_timing *t, bpf_bench_run_fn run_fn, void *run_ctx) 265 { 266 __u64 elapsed, per_op_ns; 267 __u64 time_n, time_2n; 268 double ratio; 269 270 elapsed = measure_elapsed(t, run_fn, run_ctx, CALIBRATE_SEED_BATCH, CALIBRATE_RUNS); 271 if (elapsed == 0) { 272 fprintf(stderr, "calibration: no timing samples, using default\n"); 273 t->batch_iters = 10000; 274 *t->batch_iters_bss = t->batch_iters; 275 reset_timing(t); 276 return; 277 } 278 279 per_op_ns = elapsed / CALIBRATE_SEED_BATCH; 280 t->batch_iters = compute_batch_iters(per_op_ns); 281 282 time_n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters, CALIBRATE_RUNS); 283 time_2n = measure_elapsed(t, run_fn, run_ctx, t->batch_iters * 2, CALIBRATE_RUNS); 284 285 if (time_n > 0 && time_2n > 0) { 286 ratio = (double)time_2n / (double)time_n; 287 288 if (fabs(ratio - 2.0) / 2.0 > PROPORTIONALITY_TOL) 289 fprintf(stderr, 290 "WARNING: proportionality check failed (2N/N ratio=%.3f, " 291 "expected=2.000, error=%.1f%%)\n System noise may be affecting " 292 "results.\n", 293 ratio, fabs(ratio - 2.0) / 2.0 * 100.0); 294 } 295 296 *t->batch_iters_bss = t->batch_iters; 297 reset_timing(t); 298 } 299