1 /* 2 * Microbenchmark for math functions. 3 * 4 * Copyright (c) 2018-2022, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 8 #undef _GNU_SOURCE 9 #define _GNU_SOURCE 1 10 #include <stdint.h> 11 #include <stdlib.h> 12 #include <stdio.h> 13 #include <string.h> 14 #include <time.h> 15 #include <math.h> 16 #include "mathlib.h" 17 18 #ifndef WANT_VMATH 19 /* Enable the build of vector math code. */ 20 # define WANT_VMATH 1 21 #endif 22 23 /* Number of measurements, best result is reported. */ 24 #define MEASURE 60 25 /* Array size. */ 26 #define N 8000 27 /* Iterations over the array. */ 28 #define ITER 125 29 30 static double *Trace; 31 static size_t trace_size; 32 static double A[N]; 33 static float Af[N]; 34 static long measurecount = MEASURE; 35 static long itercount = ITER; 36 37 #if __aarch64__ && WANT_VMATH 38 typedef __f64x2_t v_double; 39 40 #define v_double_len() 2 41 42 static inline v_double 43 v_double_load (const double *p) 44 { 45 return (v_double){p[0], p[1]}; 46 } 47 48 static inline v_double 49 v_double_dup (double x) 50 { 51 return (v_double){x, x}; 52 } 53 54 typedef __f32x4_t v_float; 55 56 #define v_float_len() 4 57 58 static inline v_float 59 v_float_load (const float *p) 60 { 61 return (v_float){p[0], p[1], p[2], p[3]}; 62 } 63 64 static inline v_float 65 v_float_dup (float x) 66 { 67 return (v_float){x, x, x, x}; 68 } 69 #if WANT_SVE_MATH 70 #include <arm_sve.h> 71 typedef svbool_t sv_bool; 72 typedef svfloat64_t sv_double; 73 74 #define sv_double_len() svcntd() 75 76 static inline sv_double 77 sv_double_load (const double *p) 78 { 79 svbool_t pg = svptrue_b64(); 80 return svld1(pg, p); 81 } 82 83 static inline sv_double 84 sv_double_dup (double x) 85 { 86 return svdup_n_f64(x); 87 } 88 89 typedef svfloat32_t sv_float; 90 91 #define sv_float_len() svcntw() 92 93 static inline sv_float 94 sv_float_load (const float *p) 95 { 96 svbool_t pg = svptrue_b32(); 97 return svld1(pg, p); 98 } 99 100 static inline sv_float 101 sv_float_dup (float x) 102 { 103 return svdup_n_f32(x); 104 } 105 #endif 106 #else 107 /* dummy definitions to make things compile. */ 108 typedef double v_double; 109 typedef float v_float; 110 #define v_double_len(x) 1 111 #define v_double_load(x) (x)[0] 112 #define v_double_dup(x) (x) 113 #define v_float_len(x) 1 114 #define v_float_load(x) (x)[0] 115 #define v_float_dup(x) (x) 116 #endif 117 118 static double 119 dummy (double x) 120 { 121 return x; 122 } 123 124 static float 125 dummyf (float x) 126 { 127 return x; 128 } 129 #if WANT_VMATH 130 #if __aarch64__ 131 static v_double 132 __v_dummy (v_double x) 133 { 134 return x; 135 } 136 137 static v_float 138 __v_dummyf (v_float x) 139 { 140 return x; 141 } 142 143 #ifdef __vpcs 144 __vpcs static v_double 145 __vn_dummy (v_double x) 146 { 147 return x; 148 } 149 150 __vpcs static v_float 151 __vn_dummyf (v_float x) 152 { 153 return x; 154 } 155 #endif 156 #if WANT_SVE_MATH 157 static sv_double 158 __sv_dummy (sv_double x, sv_bool pg) 159 { 160 return x; 161 } 162 163 static sv_float 164 __sv_dummyf (sv_float x, sv_bool pg) 165 { 166 return x; 167 } 168 169 #endif 170 #endif 171 #endif 172 173 #include "test/mathbench_wrappers.h" 174 175 static const struct fun 176 { 177 const char *name; 178 int prec; 179 int vec; 180 double lo; 181 double hi; 182 union 183 { 184 double (*d) (double); 185 float (*f) (float); 186 v_double (*vd) (v_double); 187 v_float (*vf) (v_float); 188 #ifdef __vpcs 189 __vpcs v_double (*vnd) (v_double); 190 __vpcs v_float (*vnf) (v_float); 191 #endif 192 #if WANT_SVE_MATH 193 sv_double (*svd) (sv_double, sv_bool); 194 sv_float (*svf) (sv_float, sv_bool); 195 #endif 196 } fun; 197 } funtab[] = { 198 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, 199 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, 200 #define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, 201 #define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, 202 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, 203 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, 204 #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, 205 #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, 206 D (dummy, 1.0, 2.0) 207 F (dummyf, 1.0, 2.0) 208 #if WANT_VMATH 209 #if __aarch64__ 210 VD (__v_dummy, 1.0, 2.0) 211 VF (__v_dummyf, 1.0, 2.0) 212 #ifdef __vpcs 213 VND (__vn_dummy, 1.0, 2.0) 214 VNF (__vn_dummyf, 1.0, 2.0) 215 #endif 216 #if WANT_SVE_MATH 217 SVD (__sv_dummy, 1.0, 2.0) 218 SVF (__sv_dummyf, 1.0, 2.0) 219 #endif 220 #endif 221 #endif 222 #include "test/mathbench_funcs.h" 223 {0}, 224 #undef F 225 #undef D 226 #undef VF 227 #undef VD 228 #undef VNF 229 #undef VND 230 #undef SVF 231 #undef SVD 232 }; 233 234 static void 235 gen_linear (double lo, double hi) 236 { 237 for (int i = 0; i < N; i++) 238 A[i] = (lo * (N - i) + hi * i) / N; 239 } 240 241 static void 242 genf_linear (double lo, double hi) 243 { 244 for (int i = 0; i < N; i++) 245 Af[i] = (float)(lo * (N - i) + hi * i) / N; 246 } 247 248 static inline double 249 asdouble (uint64_t i) 250 { 251 union 252 { 253 uint64_t i; 254 double f; 255 } u = {i}; 256 return u.f; 257 } 258 259 static uint64_t seed = 0x0123456789abcdef; 260 261 static double 262 frand (double lo, double hi) 263 { 264 seed = 6364136223846793005ULL * seed + 1; 265 return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0); 266 } 267 268 static void 269 gen_rand (double lo, double hi) 270 { 271 for (int i = 0; i < N; i++) 272 A[i] = frand (lo, hi); 273 } 274 275 static void 276 genf_rand (double lo, double hi) 277 { 278 for (int i = 0; i < N; i++) 279 Af[i] = (float)frand (lo, hi); 280 } 281 282 static void 283 gen_trace (int index) 284 { 285 for (int i = 0; i < N; i++) 286 A[i] = Trace[index + i]; 287 } 288 289 static void 290 genf_trace (int index) 291 { 292 for (int i = 0; i < N; i++) 293 Af[i] = (float)Trace[index + i]; 294 } 295 296 static void 297 run_thruput (double f (double)) 298 { 299 for (int i = 0; i < N; i++) 300 f (A[i]); 301 } 302 303 static void 304 runf_thruput (float f (float)) 305 { 306 for (int i = 0; i < N; i++) 307 f (Af[i]); 308 } 309 310 volatile double zero = 0; 311 312 static void 313 run_latency (double f (double)) 314 { 315 double z = zero; 316 double prev = z; 317 for (int i = 0; i < N; i++) 318 prev = f (A[i] + prev * z); 319 } 320 321 static void 322 runf_latency (float f (float)) 323 { 324 float z = (float)zero; 325 float prev = z; 326 for (int i = 0; i < N; i++) 327 prev = f (Af[i] + prev * z); 328 } 329 330 static void 331 run_v_thruput (v_double f (v_double)) 332 { 333 for (int i = 0; i < N; i += v_double_len ()) 334 f (v_double_load (A+i)); 335 } 336 337 static void 338 runf_v_thruput (v_float f (v_float)) 339 { 340 for (int i = 0; i < N; i += v_float_len ()) 341 f (v_float_load (Af+i)); 342 } 343 344 static void 345 run_v_latency (v_double f (v_double)) 346 { 347 v_double z = v_double_dup (zero); 348 v_double prev = z; 349 for (int i = 0; i < N; i += v_double_len ()) 350 prev = f (v_double_load (A+i) + prev * z); 351 } 352 353 static void 354 runf_v_latency (v_float f (v_float)) 355 { 356 v_float z = v_float_dup (zero); 357 v_float prev = z; 358 for (int i = 0; i < N; i += v_float_len ()) 359 prev = f (v_float_load (Af+i) + prev * z); 360 } 361 362 #ifdef __vpcs 363 static void 364 run_vn_thruput (__vpcs v_double f (v_double)) 365 { 366 for (int i = 0; i < N; i += v_double_len ()) 367 f (v_double_load (A+i)); 368 } 369 370 static void 371 runf_vn_thruput (__vpcs v_float f (v_float)) 372 { 373 for (int i = 0; i < N; i += v_float_len ()) 374 f (v_float_load (Af+i)); 375 } 376 377 static void 378 run_vn_latency (__vpcs v_double f (v_double)) 379 { 380 v_double z = v_double_dup (zero); 381 v_double prev = z; 382 for (int i = 0; i < N; i += v_double_len ()) 383 prev = f (v_double_load (A+i) + prev * z); 384 } 385 386 static void 387 runf_vn_latency (__vpcs v_float f (v_float)) 388 { 389 v_float z = v_float_dup (zero); 390 v_float prev = z; 391 for (int i = 0; i < N; i += v_float_len ()) 392 prev = f (v_float_load (Af+i) + prev * z); 393 } 394 #endif 395 396 #if WANT_SVE_MATH 397 static void 398 run_sv_thruput (sv_double f (sv_double, sv_bool)) 399 { 400 for (int i = 0; i < N; i += sv_double_len ()) 401 f (sv_double_load (A+i), svptrue_b64 ()); 402 } 403 404 static void 405 runf_sv_thruput (sv_float f (sv_float, sv_bool)) 406 { 407 for (int i = 0; i < N; i += sv_float_len ()) 408 f (sv_float_load (Af+i), svptrue_b32 ()); 409 } 410 411 static void 412 run_sv_latency (sv_double f (sv_double, sv_bool)) 413 { 414 sv_double z = sv_double_dup (zero); 415 sv_double prev = z; 416 for (int i = 0; i < N; i += sv_double_len ()) 417 prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ()); 418 } 419 420 static void 421 runf_sv_latency (sv_float f (sv_float, sv_bool)) 422 { 423 sv_float z = sv_float_dup (zero); 424 sv_float prev = z; 425 for (int i = 0; i < N; i += sv_float_len ()) 426 prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ()); 427 } 428 #endif 429 430 static uint64_t 431 tic (void) 432 { 433 struct timespec ts; 434 if (clock_gettime (CLOCK_REALTIME, &ts)) 435 abort (); 436 return ts.tv_sec * 1000000000ULL + ts.tv_nsec; 437 } 438 439 #define TIMEIT(run, f) do { \ 440 dt = -1; \ 441 run (f); /* Warm up. */ \ 442 for (int j = 0; j < measurecount; j++) \ 443 { \ 444 uint64_t t0 = tic (); \ 445 for (int i = 0; i < itercount; i++) \ 446 run (f); \ 447 uint64_t t1 = tic (); \ 448 if (t1 - t0 < dt) \ 449 dt = t1 - t0; \ 450 } \ 451 } while (0) 452 453 static void 454 bench1 (const struct fun *f, int type, double lo, double hi) 455 { 456 uint64_t dt = 0; 457 uint64_t ns100; 458 const char *s = type == 't' ? "rthruput" : "latency"; 459 int vlen = 1; 460 461 if (f->vec && f->prec == 'd') 462 vlen = v_double_len(); 463 else if (f->vec && f->prec == 'f') 464 vlen = v_float_len(); 465 466 if (f->prec == 'd' && type == 't' && f->vec == 0) 467 TIMEIT (run_thruput, f->fun.d); 468 else if (f->prec == 'd' && type == 'l' && f->vec == 0) 469 TIMEIT (run_latency, f->fun.d); 470 else if (f->prec == 'f' && type == 't' && f->vec == 0) 471 TIMEIT (runf_thruput, f->fun.f); 472 else if (f->prec == 'f' && type == 'l' && f->vec == 0) 473 TIMEIT (runf_latency, f->fun.f); 474 else if (f->prec == 'd' && type == 't' && f->vec == 'v') 475 TIMEIT (run_v_thruput, f->fun.vd); 476 else if (f->prec == 'd' && type == 'l' && f->vec == 'v') 477 TIMEIT (run_v_latency, f->fun.vd); 478 else if (f->prec == 'f' && type == 't' && f->vec == 'v') 479 TIMEIT (runf_v_thruput, f->fun.vf); 480 else if (f->prec == 'f' && type == 'l' && f->vec == 'v') 481 TIMEIT (runf_v_latency, f->fun.vf); 482 #ifdef __vpcs 483 else if (f->prec == 'd' && type == 't' && f->vec == 'n') 484 TIMEIT (run_vn_thruput, f->fun.vnd); 485 else if (f->prec == 'd' && type == 'l' && f->vec == 'n') 486 TIMEIT (run_vn_latency, f->fun.vnd); 487 else if (f->prec == 'f' && type == 't' && f->vec == 'n') 488 TIMEIT (runf_vn_thruput, f->fun.vnf); 489 else if (f->prec == 'f' && type == 'l' && f->vec == 'n') 490 TIMEIT (runf_vn_latency, f->fun.vnf); 491 #endif 492 #if WANT_SVE_MATH 493 else if (f->prec == 'd' && type == 't' && f->vec == 's') 494 TIMEIT (run_sv_thruput, f->fun.svd); 495 else if (f->prec == 'd' && type == 'l' && f->vec == 's') 496 TIMEIT (run_sv_latency, f->fun.svd); 497 else if (f->prec == 'f' && type == 't' && f->vec == 's') 498 TIMEIT (runf_sv_thruput, f->fun.svf); 499 else if (f->prec == 'f' && type == 'l' && f->vec == 's') 500 TIMEIT (runf_sv_latency, f->fun.svf); 501 #endif 502 503 if (type == 't') 504 { 505 ns100 = (100 * dt + itercount * N / 2) / (itercount * N); 506 printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, 507 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 508 (unsigned long long) dt, lo, hi); 509 } 510 else if (type == 'l') 511 { 512 ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); 513 printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, 514 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 515 (unsigned long long) dt, lo, hi); 516 } 517 fflush (stdout); 518 } 519 520 static void 521 bench (const struct fun *f, double lo, double hi, int type, int gen) 522 { 523 if (f->prec == 'd' && gen == 'r') 524 gen_rand (lo, hi); 525 else if (f->prec == 'd' && gen == 'l') 526 gen_linear (lo, hi); 527 else if (f->prec == 'd' && gen == 't') 528 gen_trace (0); 529 else if (f->prec == 'f' && gen == 'r') 530 genf_rand (lo, hi); 531 else if (f->prec == 'f' && gen == 'l') 532 genf_linear (lo, hi); 533 else if (f->prec == 'f' && gen == 't') 534 genf_trace (0); 535 536 if (gen == 't') 537 hi = trace_size / N; 538 539 if (type == 'b' || type == 't') 540 bench1 (f, 't', lo, hi); 541 542 if (type == 'b' || type == 'l') 543 bench1 (f, 'l', lo, hi); 544 545 for (int i = N; i < trace_size; i += N) 546 { 547 if (f->prec == 'd') 548 gen_trace (i); 549 else 550 genf_trace (i); 551 552 lo = i / N; 553 if (type == 'b' || type == 't') 554 bench1 (f, 't', lo, hi); 555 556 if (type == 'b' || type == 'l') 557 bench1 (f, 'l', lo, hi); 558 } 559 } 560 561 static void 562 readtrace (const char *name) 563 { 564 int n = 0; 565 FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r"); 566 if (!f) 567 { 568 printf ("openning \"%s\" failed: %m\n", name); 569 exit (1); 570 } 571 for (;;) 572 { 573 if (n >= trace_size) 574 { 575 trace_size += N; 576 Trace = realloc (Trace, trace_size * sizeof (Trace[0])); 577 if (Trace == NULL) 578 { 579 printf ("out of memory\n"); 580 exit (1); 581 } 582 } 583 if (fscanf (f, "%lf", Trace + n) != 1) 584 break; 585 n++; 586 } 587 if (ferror (f) || n == 0) 588 { 589 printf ("reading \"%s\" failed: %m\n", name); 590 exit (1); 591 } 592 fclose (f); 593 if (n % N == 0) 594 trace_size = n; 595 for (int i = 0; n < trace_size; n++, i++) 596 Trace[n] = Trace[i]; 597 } 598 599 static void 600 usage (void) 601 { 602 printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] " 603 "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func " 604 "[func2 ..]\n"); 605 printf ("func:\n"); 606 printf ("%7s [run all benchmarks]\n", "all"); 607 for (const struct fun *f = funtab; f->name; f++) 608 printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi); 609 exit (1); 610 } 611 612 int 613 main (int argc, char *argv[]) 614 { 615 int usergen = 0, gen = 'r', type = 'b', all = 0; 616 double lo = 0, hi = 0; 617 const char *tracefile = "-"; 618 619 argv++; 620 argc--; 621 for (;;) 622 { 623 if (argc <= 0) 624 usage (); 625 if (argv[0][0] != '-') 626 break; 627 else if (argc >= 3 && strcmp (argv[0], "-i") == 0) 628 { 629 usergen = 1; 630 lo = strtod (argv[1], 0); 631 hi = strtod (argv[2], 0); 632 argv += 3; 633 argc -= 3; 634 } 635 else if (argc >= 2 && strcmp (argv[0], "-m") == 0) 636 { 637 measurecount = strtol (argv[1], 0, 0); 638 argv += 2; 639 argc -= 2; 640 } 641 else if (argc >= 2 && strcmp (argv[0], "-c") == 0) 642 { 643 itercount = strtol (argv[1], 0, 0); 644 argv += 2; 645 argc -= 2; 646 } 647 else if (argc >= 2 && strcmp (argv[0], "-g") == 0) 648 { 649 gen = argv[1][0]; 650 if (strchr ("rlt", gen) == 0) 651 usage (); 652 argv += 2; 653 argc -= 2; 654 } 655 else if (argc >= 2 && strcmp (argv[0], "-f") == 0) 656 { 657 gen = 't'; /* -f implies -g trace. */ 658 tracefile = argv[1]; 659 argv += 2; 660 argc -= 2; 661 } 662 else if (argc >= 2 && strcmp (argv[0], "-t") == 0) 663 { 664 type = argv[1][0]; 665 if (strchr ("ltb", type) == 0) 666 usage (); 667 argv += 2; 668 argc -= 2; 669 } 670 else 671 usage (); 672 } 673 if (gen == 't') 674 { 675 readtrace (tracefile); 676 lo = hi = 0; 677 usergen = 1; 678 } 679 while (argc > 0) 680 { 681 int found = 0; 682 all = strcmp (argv[0], "all") == 0; 683 for (const struct fun *f = funtab; f->name; f++) 684 if (all || strcmp (argv[0], f->name) == 0) 685 { 686 found = 1; 687 if (!usergen) 688 { 689 lo = f->lo; 690 hi = f->hi; 691 } 692 bench (f, lo, hi, type, gen); 693 if (usergen && !all) 694 break; 695 } 696 if (!found) 697 printf ("unknown function: %s\n", argv[0]); 698 argv++; 699 argc--; 700 } 701 return 0; 702 } 703