1 /*
2 * Microbenchmark for math functions.
3 *
4 * Copyright (c) 2018-2023, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8 #undef _GNU_SOURCE
9 #define _GNU_SOURCE 1
10 #include <stdint.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <string.h>
14 #include <time.h>
15 #include <math.h>
16 #include "mathlib.h"
17
18 /* Number of measurements, best result is reported. */
19 #define MEASURE 60
20 /* Array size. */
21 #define N 8000
22 /* Iterations over the array. */
23 #define ITER 125
24
25 static double *Trace;
26 static size_t trace_size;
27 static double A[N];
28 static float Af[N];
29 static long measurecount = MEASURE;
30 static long itercount = ITER;
31
32 #ifdef __vpcs
33 #include <arm_neon.h>
34 typedef float64x2_t v_double;
35
36 #define v_double_len() 2
37
38 static inline v_double
v_double_load(const double * p)39 v_double_load (const double *p)
40 {
41 return (v_double){p[0], p[1]};
42 }
43
44 static inline v_double
v_double_dup(double x)45 v_double_dup (double x)
46 {
47 return (v_double){x, x};
48 }
49
50 typedef float32x4_t v_float;
51
52 #define v_float_len() 4
53
54 static inline v_float
v_float_load(const float * p)55 v_float_load (const float *p)
56 {
57 return (v_float){p[0], p[1], p[2], p[3]};
58 }
59
60 static inline v_float
v_float_dup(float x)61 v_float_dup (float x)
62 {
63 return (v_float){x, x, x, x};
64 }
65 #else
66 /* dummy definitions to make things compile. */
67 typedef double v_double;
68 typedef float v_float;
69 #define v_double_len(x) 1
70 #define v_double_load(x) (x)[0]
71 #define v_double_dup(x) (x)
72 #define v_float_len(x) 1
73 #define v_float_load(x) (x)[0]
74 #define v_float_dup(x) (x)
75
76 #endif
77
78 #if WANT_SVE_MATH
79 #include <arm_sve.h>
80 typedef svbool_t sv_bool;
81 typedef svfloat64_t sv_double;
82
83 #define sv_double_len() svcntd()
84
85 static inline sv_double
sv_double_load(const double * p)86 sv_double_load (const double *p)
87 {
88 svbool_t pg = svptrue_b64();
89 return svld1(pg, p);
90 }
91
92 static inline sv_double
sv_double_dup(double x)93 sv_double_dup (double x)
94 {
95 return svdup_n_f64(x);
96 }
97
98 typedef svfloat32_t sv_float;
99
100 #define sv_float_len() svcntw()
101
102 static inline sv_float
sv_float_load(const float * p)103 sv_float_load (const float *p)
104 {
105 svbool_t pg = svptrue_b32();
106 return svld1(pg, p);
107 }
108
109 static inline sv_float
sv_float_dup(float x)110 sv_float_dup (float x)
111 {
112 return svdup_n_f32(x);
113 }
114 #else
115 /* dummy definitions to make things compile. */
116 #define sv_double_len(x) 1
117 #define sv_float_len(x) 1
118 #endif
119
120 static double
dummy(double x)121 dummy (double x)
122 {
123 return x;
124 }
125
126 static float
dummyf(float x)127 dummyf (float x)
128 {
129 return x;
130 }
131 #ifdef __vpcs
132 __vpcs static v_double
__vn_dummy(v_double x)133 __vn_dummy (v_double x)
134 {
135 return x;
136 }
137
138 __vpcs static v_float
__vn_dummyf(v_float x)139 __vn_dummyf (v_float x)
140 {
141 return x;
142 }
143 #endif
144 #if WANT_SVE_MATH
145 static sv_double
__sv_dummy(sv_double x,sv_bool pg)146 __sv_dummy (sv_double x, sv_bool pg)
147 {
148 return x;
149 }
150
151 static sv_float
__sv_dummyf(sv_float x,sv_bool pg)152 __sv_dummyf (sv_float x, sv_bool pg)
153 {
154 return x;
155 }
156
157 #endif
158
159 #include "test/mathbench_wrappers.h"
160
161 static const struct fun
162 {
163 const char *name;
164 int prec;
165 int vec;
166 double lo;
167 double hi;
168 union
169 {
170 double (*d) (double);
171 float (*f) (float);
172 #ifdef __vpcs
173 __vpcs v_double (*vnd) (v_double);
174 __vpcs v_float (*vnf) (v_float);
175 #endif
176 #if WANT_SVE_MATH
177 sv_double (*svd) (sv_double, sv_bool);
178 sv_float (*svf) (sv_float, sv_bool);
179 #endif
180 } fun;
181 } funtab[] = {
182 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
183 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
184 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
185 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
186 #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
187 #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
188 D (dummy, 1.0, 2.0)
189 F (dummyf, 1.0, 2.0)
190 #ifdef __vpcs
191 VND (__vn_dummy, 1.0, 2.0)
192 VNF (__vn_dummyf, 1.0, 2.0)
193 #endif
194 #if WANT_SVE_MATH
195 SVD (__sv_dummy, 1.0, 2.0)
196 SVF (__sv_dummyf, 1.0, 2.0)
197 #endif
198 #include "test/mathbench_funcs.h"
199 {0},
200 #undef F
201 #undef D
202 #undef VNF
203 #undef VND
204 #undef SVF
205 #undef SVD
206 };
207
208 static void
gen_linear(double lo,double hi)209 gen_linear (double lo, double hi)
210 {
211 for (int i = 0; i < N; i++)
212 A[i] = (lo * (N - i) + hi * i) / N;
213 }
214
215 static void
genf_linear(double lo,double hi)216 genf_linear (double lo, double hi)
217 {
218 for (int i = 0; i < N; i++)
219 Af[i] = (float)(lo * (N - i) + hi * i) / N;
220 }
221
222 static inline double
asdouble(uint64_t i)223 asdouble (uint64_t i)
224 {
225 union
226 {
227 uint64_t i;
228 double f;
229 } u = {i};
230 return u.f;
231 }
232
233 static uint64_t seed = 0x0123456789abcdef;
234
235 static double
frand(double lo,double hi)236 frand (double lo, double hi)
237 {
238 seed = 6364136223846793005ULL * seed + 1;
239 return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0);
240 }
241
242 static void
gen_rand(double lo,double hi)243 gen_rand (double lo, double hi)
244 {
245 for (int i = 0; i < N; i++)
246 A[i] = frand (lo, hi);
247 }
248
249 static void
genf_rand(double lo,double hi)250 genf_rand (double lo, double hi)
251 {
252 for (int i = 0; i < N; i++)
253 Af[i] = (float)frand (lo, hi);
254 }
255
256 static void
gen_trace(int index)257 gen_trace (int index)
258 {
259 for (int i = 0; i < N; i++)
260 A[i] = Trace[index + i];
261 }
262
263 static void
genf_trace(int index)264 genf_trace (int index)
265 {
266 for (int i = 0; i < N; i++)
267 Af[i] = (float)Trace[index + i];
268 }
269
270 static void
run_thruput(double f (double))271 run_thruput (double f (double))
272 {
273 for (int i = 0; i < N; i++)
274 f (A[i]);
275 }
276
277 static void
runf_thruput(float f (float))278 runf_thruput (float f (float))
279 {
280 for (int i = 0; i < N; i++)
281 f (Af[i]);
282 }
283
284 volatile double zero = 0;
285
286 static void
run_latency(double f (double))287 run_latency (double f (double))
288 {
289 double z = zero;
290 double prev = z;
291 for (int i = 0; i < N; i++)
292 prev = f (A[i] + prev * z);
293 }
294
295 static void
runf_latency(float f (float))296 runf_latency (float f (float))
297 {
298 float z = (float)zero;
299 float prev = z;
300 for (int i = 0; i < N; i++)
301 prev = f (Af[i] + prev * z);
302 }
303
304 #ifdef __vpcs
305 static void
run_vn_thruput(__vpcs v_double f (v_double))306 run_vn_thruput (__vpcs v_double f (v_double))
307 {
308 for (int i = 0; i < N; i += v_double_len ())
309 f (v_double_load (A+i));
310 }
311
312 static void
runf_vn_thruput(__vpcs v_float f (v_float))313 runf_vn_thruput (__vpcs v_float f (v_float))
314 {
315 for (int i = 0; i < N; i += v_float_len ())
316 f (v_float_load (Af+i));
317 }
318
319 static void
run_vn_latency(__vpcs v_double f (v_double))320 run_vn_latency (__vpcs v_double f (v_double))
321 {
322 volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
323 uint64x2_t sel = vsel;
324 v_double prev = v_double_dup (0);
325 for (int i = 0; i < N; i += v_double_len ())
326 prev = f (vbslq_f64 (sel, prev, v_double_load (A+i)));
327 }
328
329 static void
runf_vn_latency(__vpcs v_float f (v_float))330 runf_vn_latency (__vpcs v_float f (v_float))
331 {
332 volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
333 uint32x4_t sel = vsel;
334 v_float prev = v_float_dup (0);
335 for (int i = 0; i < N; i += v_float_len ())
336 prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i)));
337 }
338 #endif
339
340 #if WANT_SVE_MATH
341 static void
run_sv_thruput(sv_double f (sv_double,sv_bool))342 run_sv_thruput (sv_double f (sv_double, sv_bool))
343 {
344 for (int i = 0; i < N; i += sv_double_len ())
345 f (sv_double_load (A+i), svptrue_b64 ());
346 }
347
348 static void
runf_sv_thruput(sv_float f (sv_float,sv_bool))349 runf_sv_thruput (sv_float f (sv_float, sv_bool))
350 {
351 for (int i = 0; i < N; i += sv_float_len ())
352 f (sv_float_load (Af+i), svptrue_b32 ());
353 }
354
355 static void
run_sv_latency(sv_double f (sv_double,sv_bool))356 run_sv_latency (sv_double f (sv_double, sv_bool))
357 {
358 volatile sv_bool vsel = svptrue_b64 ();
359 sv_bool sel = vsel;
360 sv_double prev = sv_double_dup (0);
361 for (int i = 0; i < N; i += sv_double_len ())
362 prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ());
363 }
364
365 static void
runf_sv_latency(sv_float f (sv_float,sv_bool))366 runf_sv_latency (sv_float f (sv_float, sv_bool))
367 {
368 volatile sv_bool vsel = svptrue_b32 ();
369 sv_bool sel = vsel;
370 sv_float prev = sv_float_dup (0);
371 for (int i = 0; i < N; i += sv_float_len ())
372 prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ());
373 }
374 #endif
375
376 static uint64_t
tic(void)377 tic (void)
378 {
379 struct timespec ts;
380 if (clock_gettime (CLOCK_REALTIME, &ts))
381 abort ();
382 return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
383 }
384
385 #define TIMEIT(run, f) do { \
386 dt = -1; \
387 run (f); /* Warm up. */ \
388 for (int j = 0; j < measurecount; j++) \
389 { \
390 uint64_t t0 = tic (); \
391 for (int i = 0; i < itercount; i++) \
392 run (f); \
393 uint64_t t1 = tic (); \
394 if (t1 - t0 < dt) \
395 dt = t1 - t0; \
396 } \
397 } while (0)
398
399 static void
bench1(const struct fun * f,int type,double lo,double hi)400 bench1 (const struct fun *f, int type, double lo, double hi)
401 {
402 uint64_t dt = 0;
403 uint64_t ns100;
404 const char *s = type == 't' ? "rthruput" : "latency";
405 int vlen = 1;
406
407 if (f->vec == 'n')
408 vlen = f->prec == 'd' ? v_double_len() : v_float_len();
409 else if (f->vec == 's')
410 vlen = f->prec == 'd' ? sv_double_len() : sv_float_len();
411
412 if (f->prec == 'd' && type == 't' && f->vec == 0)
413 TIMEIT (run_thruput, f->fun.d);
414 else if (f->prec == 'd' && type == 'l' && f->vec == 0)
415 TIMEIT (run_latency, f->fun.d);
416 else if (f->prec == 'f' && type == 't' && f->vec == 0)
417 TIMEIT (runf_thruput, f->fun.f);
418 else if (f->prec == 'f' && type == 'l' && f->vec == 0)
419 TIMEIT (runf_latency, f->fun.f);
420 #ifdef __vpcs
421 else if (f->prec == 'd' && type == 't' && f->vec == 'n')
422 TIMEIT (run_vn_thruput, f->fun.vnd);
423 else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
424 TIMEIT (run_vn_latency, f->fun.vnd);
425 else if (f->prec == 'f' && type == 't' && f->vec == 'n')
426 TIMEIT (runf_vn_thruput, f->fun.vnf);
427 else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
428 TIMEIT (runf_vn_latency, f->fun.vnf);
429 #endif
430 #if WANT_SVE_MATH
431 else if (f->prec == 'd' && type == 't' && f->vec == 's')
432 TIMEIT (run_sv_thruput, f->fun.svd);
433 else if (f->prec == 'd' && type == 'l' && f->vec == 's')
434 TIMEIT (run_sv_latency, f->fun.svd);
435 else if (f->prec == 'f' && type == 't' && f->vec == 's')
436 TIMEIT (runf_sv_thruput, f->fun.svf);
437 else if (f->prec == 'f' && type == 'l' && f->vec == 's')
438 TIMEIT (runf_sv_latency, f->fun.svf);
439 #endif
440
441 if (type == 't')
442 {
443 ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
444 printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
445 f->name, s,
446 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
447 (unsigned long long) dt, lo, hi, vlen);
448 }
449 else if (type == 'l')
450 {
451 ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
452 printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
453 f->name, s,
454 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
455 (unsigned long long) dt, lo, hi, vlen);
456 }
457 fflush (stdout);
458 }
459
460 static void
bench(const struct fun * f,double lo,double hi,int type,int gen)461 bench (const struct fun *f, double lo, double hi, int type, int gen)
462 {
463 if (f->prec == 'd' && gen == 'r')
464 gen_rand (lo, hi);
465 else if (f->prec == 'd' && gen == 'l')
466 gen_linear (lo, hi);
467 else if (f->prec == 'd' && gen == 't')
468 gen_trace (0);
469 else if (f->prec == 'f' && gen == 'r')
470 genf_rand (lo, hi);
471 else if (f->prec == 'f' && gen == 'l')
472 genf_linear (lo, hi);
473 else if (f->prec == 'f' && gen == 't')
474 genf_trace (0);
475
476 if (gen == 't')
477 hi = trace_size / N;
478
479 if (type == 'b' || type == 't')
480 bench1 (f, 't', lo, hi);
481
482 if (type == 'b' || type == 'l')
483 bench1 (f, 'l', lo, hi);
484
485 for (int i = N; i < trace_size; i += N)
486 {
487 if (f->prec == 'd')
488 gen_trace (i);
489 else
490 genf_trace (i);
491
492 lo = i / N;
493 if (type == 'b' || type == 't')
494 bench1 (f, 't', lo, hi);
495
496 if (type == 'b' || type == 'l')
497 bench1 (f, 'l', lo, hi);
498 }
499 }
500
501 static void
readtrace(const char * name)502 readtrace (const char *name)
503 {
504 int n = 0;
505 FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r");
506 if (!f)
507 {
508 printf ("openning \"%s\" failed: %m\n", name);
509 exit (1);
510 }
511 for (;;)
512 {
513 if (n >= trace_size)
514 {
515 trace_size += N;
516 Trace = realloc (Trace, trace_size * sizeof (Trace[0]));
517 if (Trace == NULL)
518 {
519 printf ("out of memory\n");
520 exit (1);
521 }
522 }
523 if (fscanf (f, "%lf", Trace + n) != 1)
524 break;
525 n++;
526 }
527 if (ferror (f) || n == 0)
528 {
529 printf ("reading \"%s\" failed: %m\n", name);
530 exit (1);
531 }
532 fclose (f);
533 if (n % N == 0)
534 trace_size = n;
535 for (int i = 0; n < trace_size; n++, i++)
536 Trace[n] = Trace[i];
537 }
538
539 static void
usage(void)540 usage (void)
541 {
542 printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "
543 "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "
544 "[func2 ..]\n");
545 printf ("func:\n");
546 printf ("%7s [run all benchmarks]\n", "all");
547 for (const struct fun *f = funtab; f->name; f++)
548 printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);
549 exit (1);
550 }
551
552 int
main(int argc,char * argv[])553 main (int argc, char *argv[])
554 {
555 int usergen = 0, gen = 'r', type = 'b', all = 0;
556 double lo = 0, hi = 0;
557 const char *tracefile = "-";
558
559 argv++;
560 argc--;
561 for (;;)
562 {
563 if (argc <= 0)
564 usage ();
565 if (argv[0][0] != '-')
566 break;
567 else if (argc >= 3 && strcmp (argv[0], "-i") == 0)
568 {
569 usergen = 1;
570 lo = strtod (argv[1], 0);
571 hi = strtod (argv[2], 0);
572 argv += 3;
573 argc -= 3;
574 }
575 else if (argc >= 2 && strcmp (argv[0], "-m") == 0)
576 {
577 measurecount = strtol (argv[1], 0, 0);
578 argv += 2;
579 argc -= 2;
580 }
581 else if (argc >= 2 && strcmp (argv[0], "-c") == 0)
582 {
583 itercount = strtol (argv[1], 0, 0);
584 argv += 2;
585 argc -= 2;
586 }
587 else if (argc >= 2 && strcmp (argv[0], "-g") == 0)
588 {
589 gen = argv[1][0];
590 if (strchr ("rlt", gen) == 0)
591 usage ();
592 argv += 2;
593 argc -= 2;
594 }
595 else if (argc >= 2 && strcmp (argv[0], "-f") == 0)
596 {
597 gen = 't'; /* -f implies -g trace. */
598 tracefile = argv[1];
599 argv += 2;
600 argc -= 2;
601 }
602 else if (argc >= 2 && strcmp (argv[0], "-t") == 0)
603 {
604 type = argv[1][0];
605 if (strchr ("ltb", type) == 0)
606 usage ();
607 argv += 2;
608 argc -= 2;
609 }
610 else
611 usage ();
612 }
613 if (gen == 't')
614 {
615 readtrace (tracefile);
616 lo = hi = 0;
617 usergen = 1;
618 }
619 while (argc > 0)
620 {
621 int found = 0;
622 all = strcmp (argv[0], "all") == 0;
623 for (const struct fun *f = funtab; f->name; f++)
624 if (all || strcmp (argv[0], f->name) == 0)
625 {
626 found = 1;
627 if (!usergen)
628 {
629 lo = f->lo;
630 hi = f->hi;
631 }
632 bench (f, lo, hi, type, gen);
633 if (usergen && !all)
634 break;
635 }
636 if (!found)
637 printf ("unknown function: %s\n", argv[0]);
638 argv++;
639 argc--;
640 }
641 return 0;
642 }
643