1 /* 2 * Microbenchmark for math functions. 3 * 4 * Copyright (c) 2018-2023, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 8 #undef _GNU_SOURCE 9 #define _GNU_SOURCE 1 10 #include <stdint.h> 11 #include <stdlib.h> 12 #include <stdio.h> 13 #include <string.h> 14 #include <time.h> 15 #include <math.h> 16 #include "mathlib.h" 17 18 /* Number of measurements, best result is reported. */ 19 #define MEASURE 60 20 /* Array size. */ 21 #define N 8000 22 /* Iterations over the array. */ 23 #define ITER 125 24 25 static double *Trace; 26 static size_t trace_size; 27 static double A[N]; 28 static float Af[N]; 29 static long measurecount = MEASURE; 30 static long itercount = ITER; 31 32 #ifdef __vpcs 33 #include <arm_neon.h> 34 typedef float64x2_t v_double; 35 36 #define v_double_len() 2 37 38 static inline v_double 39 v_double_load (const double *p) 40 { 41 return (v_double){p[0], p[1]}; 42 } 43 44 static inline v_double 45 v_double_dup (double x) 46 { 47 return (v_double){x, x}; 48 } 49 50 typedef float32x4_t v_float; 51 52 #define v_float_len() 4 53 54 static inline v_float 55 v_float_load (const float *p) 56 { 57 return (v_float){p[0], p[1], p[2], p[3]}; 58 } 59 60 static inline v_float 61 v_float_dup (float x) 62 { 63 return (v_float){x, x, x, x}; 64 } 65 #else 66 /* dummy definitions to make things compile. */ 67 typedef double v_double; 68 typedef float v_float; 69 #define v_double_len(x) 1 70 #define v_double_load(x) (x)[0] 71 #define v_double_dup(x) (x) 72 #define v_float_len(x) 1 73 #define v_float_load(x) (x)[0] 74 #define v_float_dup(x) (x) 75 76 #endif 77 78 #if WANT_SVE_MATH 79 #include <arm_sve.h> 80 typedef svbool_t sv_bool; 81 typedef svfloat64_t sv_double; 82 83 #define sv_double_len() svcntd() 84 85 static inline sv_double 86 sv_double_load (const double *p) 87 { 88 svbool_t pg = svptrue_b64(); 89 return svld1(pg, p); 90 } 91 92 static inline sv_double 93 sv_double_dup (double x) 94 { 95 return svdup_n_f64(x); 96 } 97 98 typedef svfloat32_t sv_float; 99 100 #define sv_float_len() svcntw() 101 102 static inline sv_float 103 sv_float_load (const float *p) 104 { 105 svbool_t pg = svptrue_b32(); 106 return svld1(pg, p); 107 } 108 109 static inline sv_float 110 sv_float_dup (float x) 111 { 112 return svdup_n_f32(x); 113 } 114 #else 115 /* dummy definitions to make things compile. */ 116 #define sv_double_len(x) 1 117 #define sv_float_len(x) 1 118 #endif 119 120 static double 121 dummy (double x) 122 { 123 return x; 124 } 125 126 static float 127 dummyf (float x) 128 { 129 return x; 130 } 131 #ifdef __vpcs 132 __vpcs static v_double 133 __vn_dummy (v_double x) 134 { 135 return x; 136 } 137 138 __vpcs static v_float 139 __vn_dummyf (v_float x) 140 { 141 return x; 142 } 143 #endif 144 #if WANT_SVE_MATH 145 static sv_double 146 __sv_dummy (sv_double x, sv_bool pg) 147 { 148 return x; 149 } 150 151 static sv_float 152 __sv_dummyf (sv_float x, sv_bool pg) 153 { 154 return x; 155 } 156 157 #endif 158 159 #include "test/mathbench_wrappers.h" 160 161 static const struct fun 162 { 163 const char *name; 164 int prec; 165 int vec; 166 double lo; 167 double hi; 168 union 169 { 170 double (*d) (double); 171 float (*f) (float); 172 #ifdef __vpcs 173 __vpcs v_double (*vnd) (v_double); 174 __vpcs v_float (*vnf) (v_float); 175 #endif 176 #if WANT_SVE_MATH 177 sv_double (*svd) (sv_double, sv_bool); 178 sv_float (*svf) (sv_float, sv_bool); 179 #endif 180 } fun; 181 } funtab[] = { 182 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, 183 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, 184 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, 185 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, 186 #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, 187 #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, 188 D (dummy, 1.0, 2.0) 189 F (dummyf, 1.0, 2.0) 190 #ifdef __vpcs 191 VND (__vn_dummy, 1.0, 2.0) 192 VNF (__vn_dummyf, 1.0, 2.0) 193 #endif 194 #if WANT_SVE_MATH 195 SVD (__sv_dummy, 1.0, 2.0) 196 SVF (__sv_dummyf, 1.0, 2.0) 197 #endif 198 #include "test/mathbench_funcs.h" 199 {0}, 200 #undef F 201 #undef D 202 #undef VNF 203 #undef VND 204 #undef SVF 205 #undef SVD 206 }; 207 208 static void 209 gen_linear (double lo, double hi) 210 { 211 for (int i = 0; i < N; i++) 212 A[i] = (lo * (N - i) + hi * i) / N; 213 } 214 215 static void 216 genf_linear (double lo, double hi) 217 { 218 for (int i = 0; i < N; i++) 219 Af[i] = (float)(lo * (N - i) + hi * i) / N; 220 } 221 222 static inline double 223 asdouble (uint64_t i) 224 { 225 union 226 { 227 uint64_t i; 228 double f; 229 } u = {i}; 230 return u.f; 231 } 232 233 static uint64_t seed = 0x0123456789abcdef; 234 235 static double 236 frand (double lo, double hi) 237 { 238 seed = 6364136223846793005ULL * seed + 1; 239 return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0); 240 } 241 242 static void 243 gen_rand (double lo, double hi) 244 { 245 for (int i = 0; i < N; i++) 246 A[i] = frand (lo, hi); 247 } 248 249 static void 250 genf_rand (double lo, double hi) 251 { 252 for (int i = 0; i < N; i++) 253 Af[i] = (float)frand (lo, hi); 254 } 255 256 static void 257 gen_trace (int index) 258 { 259 for (int i = 0; i < N; i++) 260 A[i] = Trace[index + i]; 261 } 262 263 static void 264 genf_trace (int index) 265 { 266 for (int i = 0; i < N; i++) 267 Af[i] = (float)Trace[index + i]; 268 } 269 270 static void 271 run_thruput (double f (double)) 272 { 273 for (int i = 0; i < N; i++) 274 f (A[i]); 275 } 276 277 static void 278 runf_thruput (float f (float)) 279 { 280 for (int i = 0; i < N; i++) 281 f (Af[i]); 282 } 283 284 volatile double zero = 0; 285 286 static void 287 run_latency (double f (double)) 288 { 289 double z = zero; 290 double prev = z; 291 for (int i = 0; i < N; i++) 292 prev = f (A[i] + prev * z); 293 } 294 295 static void 296 runf_latency (float f (float)) 297 { 298 float z = (float)zero; 299 float prev = z; 300 for (int i = 0; i < N; i++) 301 prev = f (Af[i] + prev * z); 302 } 303 304 #ifdef __vpcs 305 static void 306 run_vn_thruput (__vpcs v_double f (v_double)) 307 { 308 for (int i = 0; i < N; i += v_double_len ()) 309 f (v_double_load (A+i)); 310 } 311 312 static void 313 runf_vn_thruput (__vpcs v_float f (v_float)) 314 { 315 for (int i = 0; i < N; i += v_float_len ()) 316 f (v_float_load (Af+i)); 317 } 318 319 static void 320 run_vn_latency (__vpcs v_double f (v_double)) 321 { 322 volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; 323 uint64x2_t sel = vsel; 324 v_double prev = v_double_dup (0); 325 for (int i = 0; i < N; i += v_double_len ()) 326 prev = f (vbslq_f64 (sel, prev, v_double_load (A+i))); 327 } 328 329 static void 330 runf_vn_latency (__vpcs v_float f (v_float)) 331 { 332 volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; 333 uint32x4_t sel = vsel; 334 v_float prev = v_float_dup (0); 335 for (int i = 0; i < N; i += v_float_len ()) 336 prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i))); 337 } 338 #endif 339 340 #if WANT_SVE_MATH 341 static void 342 run_sv_thruput (sv_double f (sv_double, sv_bool)) 343 { 344 for (int i = 0; i < N; i += sv_double_len ()) 345 f (sv_double_load (A+i), svptrue_b64 ()); 346 } 347 348 static void 349 runf_sv_thruput (sv_float f (sv_float, sv_bool)) 350 { 351 for (int i = 0; i < N; i += sv_float_len ()) 352 f (sv_float_load (Af+i), svptrue_b32 ()); 353 } 354 355 static void 356 run_sv_latency (sv_double f (sv_double, sv_bool)) 357 { 358 volatile sv_bool vsel = svptrue_b64 (); 359 sv_bool sel = vsel; 360 sv_double prev = sv_double_dup (0); 361 for (int i = 0; i < N; i += sv_double_len ()) 362 prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ()); 363 } 364 365 static void 366 runf_sv_latency (sv_float f (sv_float, sv_bool)) 367 { 368 volatile sv_bool vsel = svptrue_b32 (); 369 sv_bool sel = vsel; 370 sv_float prev = sv_float_dup (0); 371 for (int i = 0; i < N; i += sv_float_len ()) 372 prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ()); 373 } 374 #endif 375 376 static uint64_t 377 tic (void) 378 { 379 struct timespec ts; 380 if (clock_gettime (CLOCK_REALTIME, &ts)) 381 abort (); 382 return ts.tv_sec * 1000000000ULL + ts.tv_nsec; 383 } 384 385 #define TIMEIT(run, f) do { \ 386 dt = -1; \ 387 run (f); /* Warm up. */ \ 388 for (int j = 0; j < measurecount; j++) \ 389 { \ 390 uint64_t t0 = tic (); \ 391 for (int i = 0; i < itercount; i++) \ 392 run (f); \ 393 uint64_t t1 = tic (); \ 394 if (t1 - t0 < dt) \ 395 dt = t1 - t0; \ 396 } \ 397 } while (0) 398 399 static void 400 bench1 (const struct fun *f, int type, double lo, double hi) 401 { 402 uint64_t dt = 0; 403 uint64_t ns100; 404 const char *s = type == 't' ? "rthruput" : "latency"; 405 int vlen = 1; 406 407 if (f->vec == 'n') 408 vlen = f->prec == 'd' ? v_double_len() : v_float_len(); 409 else if (f->vec == 's') 410 vlen = f->prec == 'd' ? sv_double_len() : sv_float_len(); 411 412 if (f->prec == 'd' && type == 't' && f->vec == 0) 413 TIMEIT (run_thruput, f->fun.d); 414 else if (f->prec == 'd' && type == 'l' && f->vec == 0) 415 TIMEIT (run_latency, f->fun.d); 416 else if (f->prec == 'f' && type == 't' && f->vec == 0) 417 TIMEIT (runf_thruput, f->fun.f); 418 else if (f->prec == 'f' && type == 'l' && f->vec == 0) 419 TIMEIT (runf_latency, f->fun.f); 420 #ifdef __vpcs 421 else if (f->prec == 'd' && type == 't' && f->vec == 'n') 422 TIMEIT (run_vn_thruput, f->fun.vnd); 423 else if (f->prec == 'd' && type == 'l' && f->vec == 'n') 424 TIMEIT (run_vn_latency, f->fun.vnd); 425 else if (f->prec == 'f' && type == 't' && f->vec == 'n') 426 TIMEIT (runf_vn_thruput, f->fun.vnf); 427 else if (f->prec == 'f' && type == 'l' && f->vec == 'n') 428 TIMEIT (runf_vn_latency, f->fun.vnf); 429 #endif 430 #if WANT_SVE_MATH 431 else if (f->prec == 'd' && type == 't' && f->vec == 's') 432 TIMEIT (run_sv_thruput, f->fun.svd); 433 else if (f->prec == 'd' && type == 'l' && f->vec == 's') 434 TIMEIT (run_sv_latency, f->fun.svd); 435 else if (f->prec == 'f' && type == 't' && f->vec == 's') 436 TIMEIT (runf_sv_thruput, f->fun.svf); 437 else if (f->prec == 'f' && type == 'l' && f->vec == 's') 438 TIMEIT (runf_sv_latency, f->fun.svf); 439 #endif 440 441 if (type == 't') 442 { 443 ns100 = (100 * dt + itercount * N / 2) / (itercount * N); 444 printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", 445 f->name, s, 446 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 447 (unsigned long long) dt, lo, hi, vlen); 448 } 449 else if (type == 'l') 450 { 451 ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); 452 printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", 453 f->name, s, 454 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 455 (unsigned long long) dt, lo, hi, vlen); 456 } 457 fflush (stdout); 458 } 459 460 static void 461 bench (const struct fun *f, double lo, double hi, int type, int gen) 462 { 463 if (f->prec == 'd' && gen == 'r') 464 gen_rand (lo, hi); 465 else if (f->prec == 'd' && gen == 'l') 466 gen_linear (lo, hi); 467 else if (f->prec == 'd' && gen == 't') 468 gen_trace (0); 469 else if (f->prec == 'f' && gen == 'r') 470 genf_rand (lo, hi); 471 else if (f->prec == 'f' && gen == 'l') 472 genf_linear (lo, hi); 473 else if (f->prec == 'f' && gen == 't') 474 genf_trace (0); 475 476 if (gen == 't') 477 hi = trace_size / N; 478 479 if (type == 'b' || type == 't') 480 bench1 (f, 't', lo, hi); 481 482 if (type == 'b' || type == 'l') 483 bench1 (f, 'l', lo, hi); 484 485 for (int i = N; i < trace_size; i += N) 486 { 487 if (f->prec == 'd') 488 gen_trace (i); 489 else 490 genf_trace (i); 491 492 lo = i / N; 493 if (type == 'b' || type == 't') 494 bench1 (f, 't', lo, hi); 495 496 if (type == 'b' || type == 'l') 497 bench1 (f, 'l', lo, hi); 498 } 499 } 500 501 static void 502 readtrace (const char *name) 503 { 504 int n = 0; 505 FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r"); 506 if (!f) 507 { 508 printf ("openning \"%s\" failed: %m\n", name); 509 exit (1); 510 } 511 for (;;) 512 { 513 if (n >= trace_size) 514 { 515 trace_size += N; 516 Trace = realloc (Trace, trace_size * sizeof (Trace[0])); 517 if (Trace == NULL) 518 { 519 printf ("out of memory\n"); 520 exit (1); 521 } 522 } 523 if (fscanf (f, "%lf", Trace + n) != 1) 524 break; 525 n++; 526 } 527 if (ferror (f) || n == 0) 528 { 529 printf ("reading \"%s\" failed: %m\n", name); 530 exit (1); 531 } 532 fclose (f); 533 if (n % N == 0) 534 trace_size = n; 535 for (int i = 0; n < trace_size; n++, i++) 536 Trace[n] = Trace[i]; 537 } 538 539 static void 540 usage (void) 541 { 542 printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] " 543 "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func " 544 "[func2 ..]\n"); 545 printf ("func:\n"); 546 printf ("%7s [run all benchmarks]\n", "all"); 547 for (const struct fun *f = funtab; f->name; f++) 548 printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi); 549 exit (1); 550 } 551 552 int 553 main (int argc, char *argv[]) 554 { 555 int usergen = 0, gen = 'r', type = 'b', all = 0; 556 double lo = 0, hi = 0; 557 const char *tracefile = "-"; 558 559 argv++; 560 argc--; 561 for (;;) 562 { 563 if (argc <= 0) 564 usage (); 565 if (argv[0][0] != '-') 566 break; 567 else if (argc >= 3 && strcmp (argv[0], "-i") == 0) 568 { 569 usergen = 1; 570 lo = strtod (argv[1], 0); 571 hi = strtod (argv[2], 0); 572 argv += 3; 573 argc -= 3; 574 } 575 else if (argc >= 2 && strcmp (argv[0], "-m") == 0) 576 { 577 measurecount = strtol (argv[1], 0, 0); 578 argv += 2; 579 argc -= 2; 580 } 581 else if (argc >= 2 && strcmp (argv[0], "-c") == 0) 582 { 583 itercount = strtol (argv[1], 0, 0); 584 argv += 2; 585 argc -= 2; 586 } 587 else if (argc >= 2 && strcmp (argv[0], "-g") == 0) 588 { 589 gen = argv[1][0]; 590 if (strchr ("rlt", gen) == 0) 591 usage (); 592 argv += 2; 593 argc -= 2; 594 } 595 else if (argc >= 2 && strcmp (argv[0], "-f") == 0) 596 { 597 gen = 't'; /* -f implies -g trace. */ 598 tracefile = argv[1]; 599 argv += 2; 600 argc -= 2; 601 } 602 else if (argc >= 2 && strcmp (argv[0], "-t") == 0) 603 { 604 type = argv[1][0]; 605 if (strchr ("ltb", type) == 0) 606 usage (); 607 argv += 2; 608 argc -= 2; 609 } 610 else 611 usage (); 612 } 613 if (gen == 't') 614 { 615 readtrace (tracefile); 616 lo = hi = 0; 617 usergen = 1; 618 } 619 while (argc > 0) 620 { 621 int found = 0; 622 all = strcmp (argv[0], "all") == 0; 623 for (const struct fun *f = funtab; f->name; f++) 624 if (all || strcmp (argv[0], f->name) == 0) 625 { 626 found = 1; 627 if (!usergen) 628 { 629 lo = f->lo; 630 hi = f->hi; 631 } 632 bench (f, lo, hi, type, gen); 633 if (usergen && !all) 634 break; 635 } 636 if (!found) 637 printf ("unknown function: %s\n", argv[0]); 638 argv++; 639 argc--; 640 } 641 return 0; 642 } 643