1 /*
2 * Microbenchmark for math functions.
3 *
4 * Copyright (c) 2018-2024, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
/* When SVE benchmarks are requested, enable the SVE target for every function
   in this file.  Only AArch64 Linux is accepted; any other platform is a
   build error.  */
#if WANT_SVE_TESTS
# if !(__aarch64__ && __linux__)
#  error "SVE not supported - please disable WANT_SVE_TESTS"
# elif defined(__clang__)
#  pragma clang attribute push(__attribute__((target("sve"))), \
                               apply_to = any(function))
# else
#  pragma GCC target("+sve")
# endif
#endif
20
#undef _GNU_SOURCE
#define _GNU_SOURCE 1
#include <errno.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "mathlib.h"
30
/* Default number of timed measurements; the best result is reported.  */
#define MEASURE 60
/* Number of elements in each input array.  */
#define N 8000
/* Default number of passes over the input array per measurement.  */
#define ITER 125

/* Input buffers handed to the double and float benchmarks.  */
static double A[N];
static float Af[N];

/* Values loaded from a trace file and the number of slots allocated for
   them.  */
static double *Trace;
static size_t trace_size;

/* Repetition counts actually used; they start at the defaults above.  */
static long measurecount = MEASURE;
static long itercount = ITER;
44
/* Identity function on doubles.  It does no work of its own, so benchmarking
   it shows the cost of the measurement loop and the call itself.  */
static double
dummy (double x)
{
  return x;
}
50
/* Identity function on floats; single-precision counterpart of the double
   identity benchmark.  */
static float
dummyf (float x)
{
  return x;
}
#if __aarch64__ && __linux__
/* Identity function on a vector of two doubles, declared with the __vpcs
   qualifier like the other vector entry points in this file.  */
__vpcs static float64x2_t
__vn_dummy (float64x2_t x)
{
  return x;
}

/* Identity function on a vector of four floats.  */
__vpcs static float32x4_t
__vn_dummyf (float32x4_t x)
{
  return x;
}
#endif
#if WANT_SVE_TESTS
/* Identity function on an SVE vector of doubles.  The predicate PG is part of
   the benchmarked signature but is not looked at.  */
static svfloat64_t
__sv_dummy (svfloat64_t x, svbool_t pg)
{
  return x;
}

/* Identity function on an SVE vector of floats; PG is likewise ignored.  */
static svfloat32_t
__sv_dummyf (svfloat32_t x, svbool_t pg)
{
  return x;
}

#endif
83
84 #include "test/mathbench_wrappers.h"
85
86 static const struct fun
87 {
88 const char *name;
89 int prec;
90 int vec;
91 double lo;
92 double hi;
93 union
94 {
95 double (*d) (double);
96 float (*f) (float);
97 #if __aarch64__ && __linux__
98 __vpcs float64x2_t (*vnd) (float64x2_t);
99 __vpcs float32x4_t (*vnf) (float32x4_t);
100 #endif
101 #if WANT_SVE_TESTS
102 svfloat64_t (*svd) (svfloat64_t, svbool_t);
103 svfloat32_t (*svf) (svfloat32_t, svbool_t);
104 #endif
105 } fun;
106 } funtab[] = {
107 // clang-format off
108 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
109 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
110 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
111 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
112 #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
113 #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
114 D (dummy, 1.0, 2.0)
115 F (dummyf, 1.0, 2.0)
116 #if __aarch64__ && __linux__
117 VND (__vn_dummy, 1.0, 2.0)
118 VNF (__vn_dummyf, 1.0, 2.0)
119 #endif
120 #if WANT_SVE_TESTS
121 SVD (__sv_dummy, 1.0, 2.0)
122 SVF (__sv_dummyf, 1.0, 2.0)
123 #endif
124 #include "test/mathbench_funcs.h"
125 {0},
126 #undef F
127 #undef D
128 #undef VNF
129 #undef VND
130 #undef SVF
131 #undef SVD
132 // clang-format on
133 };
134
135 static void
gen_linear(double lo,double hi)136 gen_linear (double lo, double hi)
137 {
138 for (int i = 0; i < N; i++)
139 A[i] = (lo * (N - i) + hi * i) / N;
140 }
141
142 static void
genf_linear(double lo,double hi)143 genf_linear (double lo, double hi)
144 {
145 for (int i = 0; i < N; i++)
146 Af[i] = (float)(lo * (N - i) + hi * i) / N;
147 }
148
/* Reinterpret the 64 bits of I as a double, without any numeric
   conversion.  */
static inline double
asdouble (uint64_t i)
{
  double result;
  memcpy (&result, &i, sizeof result);
  return result;
}
159
/* State of the pseudo-random generator.  It starts from a fixed value, so
   every run produces the same sequence.  */
static uint64_t seed = 0x0123456789abcdef;

/* Return the next pseudo-random double in [LO, HI).  */
static double
frand (double lo, double hi)
{
  /* Advance the 64-bit linear congruential generator.  */
  seed = 6364136223846793005ULL * seed + 1;
  /* The top 52 bits of the state become the mantissa of a double with
     exponent field 0x3ff, i.e. a value in [1, 2).  */
  uint64_t bits = seed >> 12 | 0x3ffULL << 52;
  double unit = asdouble (bits) - 1.0;
  return lo + (hi - lo) * unit;
}
168
169 static void
gen_rand(double lo,double hi)170 gen_rand (double lo, double hi)
171 {
172 for (int i = 0; i < N; i++)
173 A[i] = frand (lo, hi);
174 }
175
176 static void
genf_rand(double lo,double hi)177 genf_rand (double lo, double hi)
178 {
179 for (int i = 0; i < N; i++)
180 Af[i] = (float)frand (lo, hi);
181 }
182
183 static void
gen_trace(int index)184 gen_trace (int index)
185 {
186 for (int i = 0; i < N; i++)
187 A[i] = Trace[index + i];
188 }
189
190 static void
genf_trace(int index)191 genf_trace (int index)
192 {
193 for (int i = 0; i < N; i++)
194 Af[i] = (float)Trace[index + i];
195 }
196
197 static void
run_thruput(double f (double))198 run_thruput (double f (double))
199 {
200 for (int i = 0; i < N; i++)
201 f (A[i]);
202 }
203
204 static void
runf_thruput(float f (float))205 runf_thruput (float f (float))
206 {
207 for (int i = 0; i < N; i++)
208 f (Af[i]);
209 }
210
211 volatile double zero = 0;
212
213 static void
run_latency(double f (double))214 run_latency (double f (double))
215 {
216 double z = zero;
217 double prev = z;
218 for (int i = 0; i < N; i++)
219 prev = f (A[i] + prev * z);
220 }
221
222 static void
runf_latency(float f (float))223 runf_latency (float f (float))
224 {
225 float z = (float)zero;
226 float prev = z;
227 for (int i = 0; i < N; i++)
228 prev = f (Af[i] + prev * z);
229 }
230
#if __aarch64__ && __linux__
/* Call F on A two doubles at a time; results are discarded.  */
static void
run_vn_thruput (__vpcs float64x2_t f (float64x2_t))
{
  int k = 0;
  while (k < N)
    {
      f (vld1q_f64 (A + k));
      k += 2;
    }
}

/* Call F on Af four floats at a time; results are discarded.  */
static void
runf_vn_thruput (__vpcs float32x4_t f (float32x4_t))
{
  int k = 0;
  while (k < N)
    {
      f (vld1q_f32 (Af + k));
      k += 4;
    }
}

/* Call F on A two doubles at a time as a dependency chain.  Each argument is
   a bitwise select between the previous result and freshly loaded data; the
   mask is all zeros, so the loaded data is chosen, but it is read through a
   volatile so the compiler cannot assume that.  */
static void
run_vn_latency (__vpcs float64x2_t f (float64x2_t))
{
  volatile uint64x2_t vmask = (uint64x2_t) { 0, 0 };
  uint64x2_t mask = vmask;
  float64x2_t prev = vdupq_n_f64 (0);
  int k = 0;
  while (k < N)
    {
      prev = f (vbslq_f64 (mask, prev, vld1q_f64 (A + k)));
      k += 2;
    }
}

/* Call F on Af four floats at a time as a dependency chain, using the same
   volatile all-zero select mask as the double variant.  */
static void
runf_vn_latency (__vpcs float32x4_t f (float32x4_t))
{
  volatile uint32x4_t vmask = (uint32x4_t) { 0, 0, 0, 0 };
  uint32x4_t mask = vmask;
  float32x4_t prev = vdupq_n_f32 (0);
  int k = 0;
  while (k < N)
    {
      prev = f (vbslq_f32 (mask, prev, vld1q_f32 (Af + k)));
      k += 4;
    }
}
#endif
266
#if WANT_SVE_TESTS
/* Call F on A one SVE vector of doubles at a time with an all-true predicate;
   results are discarded.  */
static void
run_sv_thruput (svfloat64_t f (svfloat64_t, svbool_t))
{
  int k = 0;
  while (k < N)
    {
      f (svld1_f64 (svptrue_b64 (), A + k), svptrue_b64 ());
      k += svcntd ();
    }
}

/* Call F on Af one SVE vector of floats at a time with an all-true predicate;
   results are discarded.  */
static void
runf_sv_thruput (svfloat32_t f (svfloat32_t, svbool_t))
{
  int k = 0;
  while (k < N)
    {
      f (svld1_f32 (svptrue_b32 (), Af + k), svptrue_b32 ());
      k += svcntw ();
    }
}

/* Call F on A one SVE vector of doubles at a time as a dependency chain.
   Each argument is a select between freshly loaded data and the previous
   result; the mask is all true, so the loaded data is chosen, but it is read
   through a volatile so the compiler cannot assume that.  */
static void
run_sv_latency (svfloat64_t f (svfloat64_t, svbool_t))
{
  volatile svbool_t vmask = svptrue_b64 ();
  svbool_t mask = vmask;
  svfloat64_t prev = svdup_f64 (0);
  int k = 0;
  while (k < N)
    {
      prev = f (svsel_f64 (mask, svld1_f64 (svptrue_b64 (), A + k), prev),
                svptrue_b64 ());
      k += svcntd ();
    }
}

/* Call F on Af one SVE vector of floats at a time as a dependency chain,
   using the same volatile all-true select mask as the double variant.  */
static void
runf_sv_latency (svfloat32_t f (svfloat32_t, svbool_t))
{
  volatile svbool_t vmask = svptrue_b32 ();
  svbool_t mask = vmask;
  svfloat32_t prev = svdup_f32 (0);
  int k = 0;
  while (k < N)
    {
      prev = f (svsel_f32 (mask, svld1_f32 (svptrue_b32 (), Af + k), prev),
                svptrue_b32 ());
      k += svcntw ();
    }
}
#endif
304
/* Return a timestamp in nanoseconds for measuring elapsed intervals.

   A monotonic clock is used when the platform provides one, so that
   adjustments to the system time during a measurement cannot distort (or
   make negative) the difference of two timestamps.  Otherwise the wall
   clock is used.  Aborts if the clock cannot be read.  */
static uint64_t
tic (void)
{
  struct timespec ts;
#if !defined(_MSC_VER) && defined(CLOCK_MONOTONIC)
  int failed = clock_gettime (CLOCK_MONOTONIC, &ts) != 0;
#elif !defined(_MSC_VER) && defined(CLOCK_REALTIME)
  int failed = clock_gettime (CLOCK_REALTIME, &ts) != 0;
#else
  /* timespec_get returns 0 on failure.  */
  int failed = timespec_get (&ts, TIME_UTC) == 0;
#endif
  if (failed)
    abort ();
  return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
}
317
/* Time RUN (F) and leave the best result in the caller's uint64_t variable
   `dt'.  After one untimed warm-up call, measurecount measurements are taken;
   each one times itercount back-to-back calls of RUN (F) using tic (), and
   the smallest elapsed time is kept.  If no measurement is taken, dt is left
   at UINT64_MAX.

   The loop counters are long, like measurecount and itercount, so that large
   counts cannot overflow an int.  */
#define TIMEIT(run, f) do { \
  dt = UINT64_MAX; \
  run (f); /* Warm up.  */ \
  for (long j = 0; j < measurecount; j++) \
    { \
      uint64_t t0 = tic (); \
      for (long i = 0; i < itercount; i++) \
        run (f); \
      uint64_t t1 = tic (); \
      if (t1 - t0 < dt) \
        dt = t1 - t0; \
    } \
} while (0)
331
332 static void
bench1(const struct fun * f,int type,double lo,double hi)333 bench1 (const struct fun *f, int type, double lo, double hi)
334 {
335 uint64_t dt = 0;
336 uint64_t ns100;
337 const char *s = type == 't' ? "rthruput" : "latency";
338 int vlen = 1;
339
340 if (f->vec == 'n')
341 vlen = f->prec == 'd' ? 2 : 4;
342 #if WANT_SVE_TESTS
343 else if (f->vec == 's')
344 vlen = f->prec == 'd' ? svcntd () : svcntw ();
345 #endif
346
347 if (f->prec == 'd' && type == 't' && f->vec == 0)
348 TIMEIT (run_thruput, f->fun.d);
349 else if (f->prec == 'd' && type == 'l' && f->vec == 0)
350 TIMEIT (run_latency, f->fun.d);
351 else if (f->prec == 'f' && type == 't' && f->vec == 0)
352 TIMEIT (runf_thruput, f->fun.f);
353 else if (f->prec == 'f' && type == 'l' && f->vec == 0)
354 TIMEIT (runf_latency, f->fun.f);
355 #if __aarch64__ && __linux__
356 else if (f->prec == 'd' && type == 't' && f->vec == 'n')
357 TIMEIT (run_vn_thruput, f->fun.vnd);
358 else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
359 TIMEIT (run_vn_latency, f->fun.vnd);
360 else if (f->prec == 'f' && type == 't' && f->vec == 'n')
361 TIMEIT (runf_vn_thruput, f->fun.vnf);
362 else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
363 TIMEIT (runf_vn_latency, f->fun.vnf);
364 #endif
365 #if WANT_SVE_TESTS
366 else if (f->prec == 'd' && type == 't' && f->vec == 's')
367 TIMEIT (run_sv_thruput, f->fun.svd);
368 else if (f->prec == 'd' && type == 'l' && f->vec == 's')
369 TIMEIT (run_sv_latency, f->fun.svd);
370 else if (f->prec == 'f' && type == 't' && f->vec == 's')
371 TIMEIT (runf_sv_thruput, f->fun.svf);
372 else if (f->prec == 'f' && type == 'l' && f->vec == 's')
373 TIMEIT (runf_sv_latency, f->fun.svf);
374 #endif
375
376 if (type == 't')
377 {
378 ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
379 printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
380 f->name, s,
381 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
382 (unsigned long long) dt, lo, hi, vlen);
383 }
384 else if (type == 'l')
385 {
386 ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
387 printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
388 f->name, s,
389 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
390 (unsigned long long) dt, lo, hi, vlen);
391 }
392 fflush (stdout);
393 }
394
395 static void
bench(const struct fun * f,double lo,double hi,int type,int gen)396 bench (const struct fun *f, double lo, double hi, int type, int gen)
397 {
398 if (f->prec == 'd' && gen == 'r')
399 gen_rand (lo, hi);
400 else if (f->prec == 'd' && gen == 'l')
401 gen_linear (lo, hi);
402 else if (f->prec == 'd' && gen == 't')
403 gen_trace (0);
404 else if (f->prec == 'f' && gen == 'r')
405 genf_rand (lo, hi);
406 else if (f->prec == 'f' && gen == 'l')
407 genf_linear (lo, hi);
408 else if (f->prec == 'f' && gen == 't')
409 genf_trace (0);
410
411 if (gen == 't')
412 hi = trace_size / N;
413
414 if (type == 'b' || type == 't')
415 bench1 (f, 't', lo, hi);
416
417 if (type == 'b' || type == 'l')
418 bench1 (f, 'l', lo, hi);
419
420 for (int i = N; i < trace_size; i += N)
421 {
422 if (f->prec == 'd')
423 gen_trace (i);
424 else
425 genf_trace (i);
426
427 lo = i / N;
428 if (type == 'b' || type == 't')
429 bench1 (f, 't', lo, hi);
430
431 if (type == 'b' || type == 'l')
432 bench1 (f, 'l', lo, hi);
433 }
434 }
435
436 static void
readtrace(const char * name)437 readtrace (const char *name)
438 {
439 int n = 0;
440 FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r");
441 if (!f)
442 {
443 printf ("openning \"%s\" failed: %m\n", name);
444 exit (1);
445 }
446 for (;;)
447 {
448 if (n >= trace_size)
449 {
450 trace_size += N;
451 Trace = realloc (Trace, trace_size * sizeof (Trace[0]));
452 if (Trace == NULL)
453 {
454 printf ("out of memory\n");
455 exit (1);
456 }
457 }
458 if (fscanf (f, "%lf", Trace + n) != 1)
459 break;
460 n++;
461 }
462 if (ferror (f) || n == 0)
463 {
464 printf ("reading \"%s\" failed: %m\n", name);
465 exit (1);
466 }
467 fclose (f);
468 if (n % N == 0)
469 trace_size = n;
470 for (int i = 0; n < trace_size; n++, i++)
471 Trace[n] = Trace[i];
472 }
473
474 static void
usage(void)475 usage (void)
476 {
477 printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "
478 "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "
479 "[func2 ..]\n");
480 printf ("func:\n");
481 printf ("%7s [run all benchmarks]\n", "all");
482 for (const struct fun *f = funtab; f->name; f++)
483 printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);
484 exit (1);
485 }
486
487 int
main(int argc,char * argv[])488 main (int argc, char *argv[])
489 {
490 int usergen = 0, gen = 'r', type = 'b', all = 0;
491 double lo = 0, hi = 0;
492 const char *tracefile = "-";
493
494 argv++;
495 argc--;
496 for (;;)
497 {
498 if (argc <= 0)
499 usage ();
500 if (argv[0][0] != '-')
501 break;
502 else if (argc >= 3 && strcmp (argv[0], "-i") == 0)
503 {
504 usergen = 1;
505 lo = strtod (argv[1], 0);
506 hi = strtod (argv[2], 0);
507 argv += 3;
508 argc -= 3;
509 }
510 else if (argc >= 2 && strcmp (argv[0], "-m") == 0)
511 {
512 measurecount = strtol (argv[1], 0, 0);
513 argv += 2;
514 argc -= 2;
515 }
516 else if (argc >= 2 && strcmp (argv[0], "-c") == 0)
517 {
518 itercount = strtol (argv[1], 0, 0);
519 argv += 2;
520 argc -= 2;
521 }
522 else if (argc >= 2 && strcmp (argv[0], "-g") == 0)
523 {
524 gen = argv[1][0];
525 if (strchr ("rlt", gen) == 0)
526 usage ();
527 argv += 2;
528 argc -= 2;
529 }
530 else if (argc >= 2 && strcmp (argv[0], "-f") == 0)
531 {
532 gen = 't'; /* -f implies -g trace. */
533 tracefile = argv[1];
534 argv += 2;
535 argc -= 2;
536 }
537 else if (argc >= 2 && strcmp (argv[0], "-t") == 0)
538 {
539 type = argv[1][0];
540 if (strchr ("ltb", type) == 0)
541 usage ();
542 argv += 2;
543 argc -= 2;
544 }
545 else
546 usage ();
547 }
548 if (gen == 't')
549 {
550 readtrace (tracefile);
551 lo = hi = 0;
552 usergen = 1;
553 }
554 while (argc > 0)
555 {
556 int found = 0;
557 all = strcmp (argv[0], "all") == 0;
558 for (const struct fun *f = funtab; f->name; f++)
559 if (all || strcmp (argv[0], f->name) == 0)
560 {
561 found = 1;
562 if (!usergen)
563 {
564 lo = f->lo;
565 hi = f->hi;
566 }
567 bench (f, lo, hi, type, gen);
568 if (usergen && !all)
569 break;
570 }
571 if (!found)
572 printf ("unknown function: %s\n", argv[0]);
573 argv++;
574 argc--;
575 }
576 return 0;
577 }
578
/* Close the clang attribute region that is opened at the top of this file
   when SVE benchmarks are enabled.  */
#if WANT_SVE_TESTS && defined(__clang__) && __aarch64__ && __linux__
# pragma clang attribute pop
#endif
582