xref: /freebsd/contrib/arm-optimized-routines/math/test/mathbench.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*
2  * Microbenchmark for math functions.
3  *
4  * Copyright (c) 2018-2024, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #if WANT_SVE_TESTS
9 #  if __aarch64__ && __linux__
10 #    ifdef __clang__
11 #      pragma clang attribute push(__attribute__((target("sve"))),            \
12 				   apply_to = any(function))
13 #    else
14 #      pragma GCC target("+sve")
15 #    endif
16 #  else
17 #    error "SVE not supported - please disable WANT_SVE_TESTS"
18 #  endif
19 #endif
20 
21 #undef _GNU_SOURCE
22 #define _GNU_SOURCE 1
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <time.h>
28 #include <math.h>
29 #include "mathlib.h"
30 
/* Default number of timed measurements; the best result is reported.
   Can be overridden on the command line with -m.  */
#define MEASURE 60
/* Number of elements in the input arrays A and Af.  */
#define N 8000
/* Default number of passes over the input array per measurement.
   Can be overridden on the command line with -c.  */
#define ITER 125

/* Trace buffer filled by readtrace and the number of entries it holds.  */
static double *Trace;
static size_t trace_size;
/* Input arrays for double and single precision functions.  */
static double A[N];
static float Af[N];
/* Run-time settings, initialised from the defaults above.  */
static long measurecount = MEASURE;
static long itercount = ITER;
44 
/* Double precision identity: returns its argument unchanged.  Listed in
   funtab under the name "dummy".  */
static double
dummy (double x)
{
  const double unchanged = x;
  return unchanged;
}
50 
/* Single precision identity: returns its argument unchanged.  Listed in
   funtab under the name "dummyf".  */
static float
dummyf (float x)
{
  const float unchanged = x;
  return unchanged;
}
#if __aarch64__ && __linux__
/* Identity on a vector of two doubles, declared with __vpcs like the other
   'n' entries in funtab.  */
__vpcs static float64x2_t
__vn_dummy (float64x2_t x)
{
  return x;
}

/* Identity on a vector of four floats, declared with __vpcs like the other
   'n' entries in funtab.  */
__vpcs static float32x4_t
__vn_dummyf (float32x4_t x)
{
  return x;
}
#endif
#if WANT_SVE_TESTS
/* Identity on an SVE vector of doubles; the predicate PG is ignored.  */
static svfloat64_t
__sv_dummy (svfloat64_t x, svbool_t pg)
{
  (void) pg;
  return x;
}

/* Identity on an SVE vector of floats; the predicate PG is ignored.  */
static svfloat32_t
__sv_dummyf (svfloat32_t x, svbool_t pg)
{
  (void) pg;
  return x;
}

#endif
83 
84 #include "test/mathbench_wrappers.h"
85 
86 static const struct fun
87 {
88   const char *name;
89   int prec;
90   int vec;
91   double lo;
92   double hi;
93   union
94   {
95     double (*d) (double);
96     float (*f) (float);
97 #if __aarch64__ && __linux__
98     __vpcs float64x2_t (*vnd) (float64x2_t);
99     __vpcs float32x4_t (*vnf) (float32x4_t);
100 #endif
101 #if WANT_SVE_TESTS
102     svfloat64_t (*svd) (svfloat64_t, svbool_t);
103     svfloat32_t (*svf) (svfloat32_t, svbool_t);
104 #endif
105   } fun;
106 } funtab[] = {
107 // clang-format off
108 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
109 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
110 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
111 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
112 #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
113 #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
114 D (dummy, 1.0, 2.0)
115 F (dummyf, 1.0, 2.0)
116 #if  __aarch64__ && __linux__
117 VND (__vn_dummy, 1.0, 2.0)
118 VNF (__vn_dummyf, 1.0, 2.0)
119 #endif
120 #if WANT_SVE_TESTS
121 SVD (__sv_dummy, 1.0, 2.0)
122 SVF (__sv_dummyf, 1.0, 2.0)
123 #endif
124 #include "test/mathbench_funcs.h"
125 {0},
126 #undef F
127 #undef D
128 #undef VNF
129 #undef VND
130 #undef SVF
131 #undef SVD
132   // clang-format on
133 };
134 
135 static void
136 gen_linear (double lo, double hi)
137 {
138   for (int i = 0; i < N; i++)
139     A[i] = (lo * (N - i) + hi * i) / N;
140 }
141 
142 static void
143 genf_linear (double lo, double hi)
144 {
145   for (int i = 0; i < N; i++)
146     Af[i] = (float)(lo * (N - i) + hi * i) / N;
147 }
148 
/* Reinterpret the 64 bits of I as a double (no numeric conversion).  */
static inline double
asdouble (uint64_t i)
{
  double d;
  memcpy (&d, &i, sizeof d);
  return d;
}
159 
/* State of the pseudo random generator used by frand.  */
static uint64_t seed = 0x0123456789abcdef;

/* Return the next pseudo random double, scaled from [0, 1) to the span
   between LO and HI.  Advances the global seed on every call.  */
static double
frand (double lo, double hi)
{
  seed = 6364136223846793005ULL * seed + 1;
  /* Use the top 52 bits of the state as the mantissa of a double with
     biased exponent 0x3ff, i.e. a value in [1, 2).  */
  uint64_t bits = (seed >> 12) | (0x3ffULL << 52);
  double unit = asdouble (bits) - 1.0;
  return lo + (hi - lo) * unit;
}
168 
169 static void
170 gen_rand (double lo, double hi)
171 {
172   for (int i = 0; i < N; i++)
173     A[i] = frand (lo, hi);
174 }
175 
176 static void
177 genf_rand (double lo, double hi)
178 {
179   for (int i = 0; i < N; i++)
180     Af[i] = (float)frand (lo, hi);
181 }
182 
183 static void
184 gen_trace (int index)
185 {
186   for (int i = 0; i < N; i++)
187     A[i] = Trace[index + i];
188 }
189 
190 static void
191 genf_trace (int index)
192 {
193   for (int i = 0; i < N; i++)
194     Af[i] = (float)Trace[index + i];
195 }
196 
197 static void
198 run_thruput (double f (double))
199 {
200   for (int i = 0; i < N; i++)
201     f (A[i]);
202 }
203 
204 static void
205 runf_thruput (float f (float))
206 {
207   for (int i = 0; i < N; i++)
208     f (Af[i]);
209 }
210 
211 volatile double zero = 0;
212 
213 static void
214 run_latency (double f (double))
215 {
216   double z = zero;
217   double prev = z;
218   for (int i = 0; i < N; i++)
219     prev = f (A[i] + prev * z);
220 }
221 
222 static void
223 runf_latency (float f (float))
224 {
225   float z = (float)zero;
226   float prev = z;
227   for (int i = 0; i < N; i++)
228     prev = f (Af[i] + prev * z);
229 }
230 
231 #if  __aarch64__ && __linux__
232 static void
233 run_vn_thruput (__vpcs float64x2_t f (float64x2_t))
234 {
235   for (int i = 0; i < N; i += 2)
236     f (vld1q_f64 (A + i));
237 }
238 
239 static void
240 runf_vn_thruput (__vpcs float32x4_t f (float32x4_t))
241 {
242   for (int i = 0; i < N; i += 4)
243     f (vld1q_f32 (Af + i));
244 }
245 
246 static void
247 run_vn_latency (__vpcs float64x2_t f (float64x2_t))
248 {
249   volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
250   uint64x2_t sel = vsel;
251   float64x2_t prev = vdupq_n_f64 (0);
252   for (int i = 0; i < N; i += 2)
253     prev = f (vbslq_f64 (sel, prev, vld1q_f64 (A + i)));
254 }
255 
256 static void
257 runf_vn_latency (__vpcs float32x4_t f (float32x4_t))
258 {
259   volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
260   uint32x4_t sel = vsel;
261   float32x4_t prev = vdupq_n_f32 (0);
262   for (int i = 0; i < N; i += 4)
263     prev = f (vbslq_f32 (sel, prev, vld1q_f32 (Af + i)));
264 }
265 #endif
266 
267 #if WANT_SVE_TESTS
268 static void
269 run_sv_thruput (svfloat64_t f (svfloat64_t, svbool_t))
270 {
271   for (int i = 0; i < N; i += svcntd ())
272     f (svld1_f64 (svptrue_b64 (), A + i), svptrue_b64 ());
273 }
274 
275 static void
276 runf_sv_thruput (svfloat32_t f (svfloat32_t, svbool_t))
277 {
278   for (int i = 0; i < N; i += svcntw ())
279     f (svld1_f32 (svptrue_b32 (), Af + i), svptrue_b32 ());
280 }
281 
282 static void
283 run_sv_latency (svfloat64_t f (svfloat64_t, svbool_t))
284 {
285   volatile svbool_t vsel = svptrue_b64 ();
286   svbool_t sel = vsel;
287   svfloat64_t prev = svdup_f64 (0);
288   for (int i = 0; i < N; i += svcntd ())
289     prev = f (svsel_f64 (sel, svld1_f64 (svptrue_b64 (), A + i), prev),
290 	      svptrue_b64 ());
291 }
292 
293 static void
294 runf_sv_latency (svfloat32_t f (svfloat32_t, svbool_t))
295 {
296   volatile svbool_t vsel = svptrue_b32 ();
297   svbool_t sel = vsel;
298   svfloat32_t prev = svdup_f32 (0);
299   for (int i = 0; i < N; i += svcntw ())
300     prev = f (svsel_f32 (sel, svld1_f32 (svptrue_b32 (), Af + i), prev),
301 	      svptrue_b32 ());
302 }
303 #endif
304 
/* Return a timestamp in nanoseconds; only differences between two
   timestamps are meaningful.  Aborts if the clock cannot be read.

   The monotonic clock is preferred because the benchmark measures
   intervals and the realtime clock may be stepped while a measurement is
   in progress.  Where the POSIX clock identifiers are not available (for
   example with MSVC) the C11 timespec_get is used instead.  */
static uint64_t
tic (void)
{
  struct timespec ts;
#if defined(CLOCK_MONOTONIC)
  if (clock_gettime (CLOCK_MONOTONIC, &ts))
#elif defined(CLOCK_REALTIME)
  if (clock_gettime (CLOCK_REALTIME, &ts))
#else
  /* timespec_get returns 0 on failure, unlike clock_gettime.  */
  if (!timespec_get (&ts, TIME_UTC))
#endif
    abort ();
  return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}
317 
/* Time RUN (F) and store the smallest measurement, in nanoseconds, in the
   variable `dt' of the expanding function.  Each of the measurecount
   measurements times itercount back-to-back calls of RUN (F).  */
#define TIMEIT(run, f) do { \
  dt = -1; /* Wraps to the maximum when dt is unsigned, as in bench1.  */ \
  run (f); /* Warm up.  */ \
  for (int rep = 0; rep < measurecount; rep++) \
    { \
      uint64_t begin = tic (); \
      for (int pass = 0; pass < itercount; pass++) \
	run (f); \
      uint64_t took = tic () - begin; \
      if (took < dt) \
	dt = took; \
    } \
} while (0)
331 
332 static void
333 bench1 (const struct fun *f, int type, double lo, double hi)
334 {
335   uint64_t dt = 0;
336   uint64_t ns100;
337   const char *s = type == 't' ? "rthruput" : "latency";
338   int vlen = 1;
339 
340   if (f->vec == 'n')
341     vlen = f->prec == 'd' ? 2 : 4;
342 #if WANT_SVE_TESTS
343   else if (f->vec == 's')
344     vlen = f->prec == 'd' ? svcntd () : svcntw ();
345 #endif
346 
347   if (f->prec == 'd' && type == 't' && f->vec == 0)
348     TIMEIT (run_thruput, f->fun.d);
349   else if (f->prec == 'd' && type == 'l' && f->vec == 0)
350     TIMEIT (run_latency, f->fun.d);
351   else if (f->prec == 'f' && type == 't' && f->vec == 0)
352     TIMEIT (runf_thruput, f->fun.f);
353   else if (f->prec == 'f' && type == 'l' && f->vec == 0)
354     TIMEIT (runf_latency, f->fun.f);
355 #if __aarch64__ && __linux__
356   else if (f->prec == 'd' && type == 't' && f->vec == 'n')
357     TIMEIT (run_vn_thruput, f->fun.vnd);
358   else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
359     TIMEIT (run_vn_latency, f->fun.vnd);
360   else if (f->prec == 'f' && type == 't' && f->vec == 'n')
361     TIMEIT (runf_vn_thruput, f->fun.vnf);
362   else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
363     TIMEIT (runf_vn_latency, f->fun.vnf);
364 #endif
365 #if WANT_SVE_TESTS
366   else if (f->prec == 'd' && type == 't' && f->vec == 's')
367     TIMEIT (run_sv_thruput, f->fun.svd);
368   else if (f->prec == 'd' && type == 'l' && f->vec == 's')
369     TIMEIT (run_sv_latency, f->fun.svd);
370   else if (f->prec == 'f' && type == 't' && f->vec == 's')
371     TIMEIT (runf_sv_thruput, f->fun.svf);
372   else if (f->prec == 'f' && type == 'l' && f->vec == 's')
373     TIMEIT (runf_sv_latency, f->fun.svf);
374 #endif
375 
376   if (type == 't')
377     {
378       ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
379       printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
380 	      f->name, s,
381 	      (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
382 	      (unsigned long long) dt, lo, hi, vlen);
383     }
384   else if (type == 'l')
385     {
386       ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
387       printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
388 	      f->name, s,
389 	      (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
390 	      (unsigned long long) dt, lo, hi, vlen);
391     }
392   fflush (stdout);
393 }
394 
395 static void
396 bench (const struct fun *f, double lo, double hi, int type, int gen)
397 {
398   if (f->prec == 'd' && gen == 'r')
399     gen_rand (lo, hi);
400   else if (f->prec == 'd' && gen == 'l')
401     gen_linear (lo, hi);
402   else if (f->prec == 'd' && gen == 't')
403     gen_trace (0);
404   else if (f->prec == 'f' && gen == 'r')
405     genf_rand (lo, hi);
406   else if (f->prec == 'f' && gen == 'l')
407     genf_linear (lo, hi);
408   else if (f->prec == 'f' && gen == 't')
409     genf_trace (0);
410 
411   if (gen == 't')
412     hi = trace_size / N;
413 
414   if (type == 'b' || type == 't')
415     bench1 (f, 't', lo, hi);
416 
417   if (type == 'b' || type == 'l')
418     bench1 (f, 'l', lo, hi);
419 
420   for (int i = N; i < trace_size; i += N)
421     {
422       if (f->prec == 'd')
423 	gen_trace (i);
424       else
425 	genf_trace (i);
426 
427       lo = i / N;
428       if (type == 'b' || type == 't')
429 	bench1 (f, 't', lo, hi);
430 
431       if (type == 'b' || type == 'l')
432 	bench1 (f, 'l', lo, hi);
433     }
434 }
435 
436 static void
437 readtrace (const char *name)
438 {
439 	int n = 0;
440 	FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r");
441 	if (!f)
442 	  {
443 	    printf ("openning \"%s\" failed: %m\n", name);
444 	    exit (1);
445 	  }
446 	for (;;)
447 	  {
448 	    if (n >= trace_size)
449 	      {
450 		trace_size += N;
451 		Trace = realloc (Trace, trace_size * sizeof (Trace[0]));
452 		if (Trace == NULL)
453 		  {
454 		    printf ("out of memory\n");
455 		    exit (1);
456 		  }
457 	      }
458 	    if (fscanf (f, "%lf", Trace + n) != 1)
459 	      break;
460 	    n++;
461 	  }
462 	if (ferror (f) || n == 0)
463 	  {
464 	    printf ("reading \"%s\" failed: %m\n", name);
465 	    exit (1);
466 	  }
467 	fclose (f);
468 	if (n % N == 0)
469 	  trace_size = n;
470 	for (int i = 0; n < trace_size; n++, i++)
471 	  Trace[n] = Trace[i];
472 }
473 
474 static void
475 usage (void)
476 {
477   printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "
478 	  "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "
479 	  "[func2 ..]\n");
480   printf ("func:\n");
481   printf ("%7s [run all benchmarks]\n", "all");
482   for (const struct fun *f = funtab; f->name; f++)
483     printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);
484   exit (1);
485 }
486 
487 int
488 main (int argc, char *argv[])
489 {
490   int usergen = 0, gen = 'r', type = 'b', all = 0;
491   double lo = 0, hi = 0;
492   const char *tracefile = "-";
493 
494   argv++;
495   argc--;
496   for (;;)
497     {
498       if (argc <= 0)
499 	usage ();
500       if (argv[0][0] != '-')
501 	break;
502       else if (argc >= 3 && strcmp (argv[0], "-i") == 0)
503 	{
504 	  usergen = 1;
505 	  lo = strtod (argv[1], 0);
506 	  hi = strtod (argv[2], 0);
507 	  argv += 3;
508 	  argc -= 3;
509 	}
510       else if (argc >= 2 && strcmp (argv[0], "-m") == 0)
511 	{
512 	  measurecount = strtol (argv[1], 0, 0);
513 	  argv += 2;
514 	  argc -= 2;
515 	}
516       else if (argc >= 2 && strcmp (argv[0], "-c") == 0)
517 	{
518 	  itercount = strtol (argv[1], 0, 0);
519 	  argv += 2;
520 	  argc -= 2;
521 	}
522       else if (argc >= 2 && strcmp (argv[0], "-g") == 0)
523 	{
524 	  gen = argv[1][0];
525 	  if (strchr ("rlt", gen) == 0)
526 	    usage ();
527 	  argv += 2;
528 	  argc -= 2;
529 	}
530       else if (argc >= 2 && strcmp (argv[0], "-f") == 0)
531 	{
532 	  gen = 't';  /* -f implies -g trace.  */
533 	  tracefile = argv[1];
534 	  argv += 2;
535 	  argc -= 2;
536 	}
537       else if (argc >= 2 && strcmp (argv[0], "-t") == 0)
538 	{
539 	  type = argv[1][0];
540 	  if (strchr ("ltb", type) == 0)
541 	    usage ();
542 	  argv += 2;
543 	  argc -= 2;
544 	}
545       else
546 	usage ();
547     }
548   if (gen == 't')
549     {
550       readtrace (tracefile);
551       lo = hi = 0;
552       usergen = 1;
553     }
554   while (argc > 0)
555     {
556       int found = 0;
557       all = strcmp (argv[0], "all") == 0;
558       for (const struct fun *f = funtab; f->name; f++)
559 	if (all || strcmp (argv[0], f->name) == 0)
560 	  {
561 	    found = 1;
562 	    if (!usergen)
563 	      {
564 		lo = f->lo;
565 		hi = f->hi;
566 	      }
567 	    bench (f, lo, hi, type, gen);
568 	    if (usergen && !all)
569 	      break;
570 	  }
571       if (!found)
572 	printf ("unknown function: %s\n", argv[0]);
573       argv++;
574       argc--;
575     }
576   return 0;
577 }
578 
579 #if __aarch64__ && __linux__ && WANT_SVE_TESTS && defined(__clang__)
580 #  pragma clang attribute pop
581 #endif
582