1 /* 2 * Microbenchmark for math functions. 3 * 4 * Copyright (c) 2018-2020, Arm Limited. 5 * SPDX-License-Identifier: MIT 6 */ 7 8 #undef _GNU_SOURCE 9 #define _GNU_SOURCE 1 10 #include <stdint.h> 11 #include <stdlib.h> 12 #include <stdio.h> 13 #include <string.h> 14 #include <time.h> 15 #include <math.h> 16 #include "mathlib.h" 17 18 #ifndef WANT_VMATH 19 /* Enable the build of vector math code. */ 20 # define WANT_VMATH 1 21 #endif 22 23 /* Number of measurements, best result is reported. */ 24 #define MEASURE 60 25 /* Array size. */ 26 #define N 8000 27 /* Iterations over the array. */ 28 #define ITER 125 29 30 static double *Trace; 31 static size_t trace_size; 32 static double A[N]; 33 static float Af[N]; 34 static long measurecount = MEASURE; 35 static long itercount = ITER; 36 37 #if __aarch64__ && WANT_VMATH 38 typedef __f64x2_t v_double; 39 40 #define v_double_len() 2 41 42 static inline v_double 43 v_double_load (const double *p) 44 { 45 return (v_double){p[0], p[1]}; 46 } 47 48 static inline v_double 49 v_double_dup (double x) 50 { 51 return (v_double){x, x}; 52 } 53 54 typedef __f32x4_t v_float; 55 56 #define v_float_len() 4 57 58 static inline v_float 59 v_float_load (const float *p) 60 { 61 return (v_float){p[0], p[1], p[2], p[3]}; 62 } 63 64 static inline v_float 65 v_float_dup (float x) 66 { 67 return (v_float){x, x, x, x}; 68 } 69 #else 70 /* dummy definitions to make things compile. */ 71 typedef double v_double; 72 typedef float v_float; 73 #define v_double_len(x) 1 74 #define v_double_load(x) (x)[0] 75 #define v_double_dup(x) (x) 76 #define v_float_len(x) 1 77 #define v_float_load(x) (x)[0] 78 #define v_float_dup(x) (x) 79 #endif 80 81 static double 82 dummy (double x) 83 { 84 return x; 85 } 86 87 static float 88 dummyf (float x) 89 { 90 return x; 91 } 92 93 #if WANT_VMATH 94 #if __aarch64__ 95 static v_double 96 __v_dummy (v_double x) 97 { 98 return x; 99 } 100 101 static v_float 102 __v_dummyf (v_float x) 103 { 104 return x; 105 } 106 107 #ifdef __vpcs 108 __vpcs static v_double 109 __vn_dummy (v_double x) 110 { 111 return x; 112 } 113 114 __vpcs static v_float 115 __vn_dummyf (v_float x) 116 { 117 return x; 118 } 119 120 __vpcs static v_float 121 xy__vn_powf (v_float x) 122 { 123 return __vn_powf (x, x); 124 } 125 126 __vpcs static v_float 127 xy_Z_powf (v_float x) 128 { 129 return _ZGVnN4vv_powf (x, x); 130 } 131 132 __vpcs static v_double 133 xy__vn_pow (v_double x) 134 { 135 return __vn_pow (x, x); 136 } 137 138 __vpcs static v_double 139 xy_Z_pow (v_double x) 140 { 141 return _ZGVnN2vv_pow (x, x); 142 } 143 #endif 144 145 static v_float 146 xy__v_powf (v_float x) 147 { 148 return __v_powf (x, x); 149 } 150 151 static v_double 152 xy__v_pow (v_double x) 153 { 154 return __v_pow (x, x); 155 } 156 #endif 157 158 static float 159 xy__s_powf (float x) 160 { 161 return __s_powf (x, x); 162 } 163 164 static double 165 xy__s_pow (double x) 166 { 167 return __s_pow (x, x); 168 } 169 #endif 170 171 static double 172 xypow (double x) 173 { 174 return pow (x, x); 175 } 176 177 static float 178 xypowf (float x) 179 { 180 return powf (x, x); 181 } 182 183 static double 184 xpow (double x) 185 { 186 return pow (x, 23.4); 187 } 188 189 static float 190 xpowf (float x) 191 { 192 return powf (x, 23.4f); 193 } 194 195 static double 196 ypow (double x) 197 { 198 return pow (2.34, x); 199 } 200 201 static float 202 ypowf (float x) 203 { 204 return powf (2.34f, x); 205 } 206 207 static float 208 sincosf_wrap (float x) 209 { 210 float s, c; 211 sincosf (x, &s, &c); 212 return s + c; 213 } 214 215 static const struct fun 216 { 217 const char *name; 218 int prec; 219 int vec; 220 double lo; 221 double hi; 222 union 223 { 224 double (*d) (double); 225 float (*f) (float); 226 v_double (*vd) (v_double); 227 v_float (*vf) (v_float); 228 #ifdef __vpcs 229 __vpcs v_double (*vnd) (v_double); 230 __vpcs v_float (*vnf) (v_float); 231 #endif 232 } fun; 233 } funtab[] = { 234 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, 235 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, 236 #define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, 237 #define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, 238 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, 239 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, 240 D (dummy, 1.0, 2.0) 241 D (exp, -9.9, 9.9) 242 D (exp, 0.5, 1.0) 243 D (exp2, -9.9, 9.9) 244 D (log, 0.01, 11.1) 245 D (log, 0.999, 1.001) 246 D (log2, 0.01, 11.1) 247 D (log2, 0.999, 1.001) 248 {"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, 249 D (xpow, 0.01, 11.1) 250 D (ypow, -9.9, 9.9) 251 D (erf, -6.0, 6.0) 252 253 F (dummyf, 1.0, 2.0) 254 F (expf, -9.9, 9.9) 255 F (exp2f, -9.9, 9.9) 256 F (logf, 0.01, 11.1) 257 F (log2f, 0.01, 11.1) 258 {"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, 259 F (xpowf, 0.01, 11.1) 260 F (ypowf, -9.9, 9.9) 261 {"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, 262 {"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, 263 {"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, 264 {"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, 265 {"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, 266 {"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, 267 F (sinf, 0.1, 0.7) 268 F (sinf, 0.8, 3.1) 269 F (sinf, -3.1, 3.1) 270 F (sinf, 3.3, 33.3) 271 F (sinf, 100, 1000) 272 F (sinf, 1e6, 1e32) 273 F (cosf, 0.1, 0.7) 274 F (cosf, 0.8, 3.1) 275 F (cosf, -3.1, 3.1) 276 F (cosf, 3.3, 33.3) 277 F (cosf, 100, 1000) 278 F (cosf, 1e6, 1e32) 279 F (erff, -4.0, 4.0) 280 #if WANT_VMATH 281 D (__s_sin, -3.1, 3.1) 282 D (__s_cos, -3.1, 3.1) 283 D (__s_exp, -9.9, 9.9) 284 D (__s_log, 0.01, 11.1) 285 {"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, 286 F (__s_expf, -9.9, 9.9) 287 F (__s_expf_1u, -9.9, 9.9) 288 F (__s_exp2f, -9.9, 9.9) 289 F (__s_exp2f_1u, -9.9, 9.9) 290 F (__s_logf, 0.01, 11.1) 291 {"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, 292 F (__s_sinf, -3.1, 3.1) 293 F (__s_cosf, -3.1, 3.1) 294 #if __aarch64__ 295 VD (__v_dummy, 1.0, 2.0) 296 VD (__v_sin, -3.1, 3.1) 297 VD (__v_cos, -3.1, 3.1) 298 VD (__v_exp, -9.9, 9.9) 299 VD (__v_log, 0.01, 11.1) 300 {"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, 301 VF (__v_dummyf, 1.0, 2.0) 302 VF (__v_expf, -9.9, 9.9) 303 VF (__v_expf_1u, -9.9, 9.9) 304 VF (__v_exp2f, -9.9, 9.9) 305 VF (__v_exp2f_1u, -9.9, 9.9) 306 VF (__v_logf, 0.01, 11.1) 307 {"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, 308 VF (__v_sinf, -3.1, 3.1) 309 VF (__v_cosf, -3.1, 3.1) 310 #ifdef __vpcs 311 VND (__vn_dummy, 1.0, 2.0) 312 VND (__vn_exp, -9.9, 9.9) 313 VND (_ZGVnN2v_exp, -9.9, 9.9) 314 VND (__vn_log, 0.01, 11.1) 315 VND (_ZGVnN2v_log, 0.01, 11.1) 316 {"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, 317 {"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, 318 VND (__vn_sin, -3.1, 3.1) 319 VND (_ZGVnN2v_sin, -3.1, 3.1) 320 VND (__vn_cos, -3.1, 3.1) 321 VND (_ZGVnN2v_cos, -3.1, 3.1) 322 VNF (__vn_dummyf, 1.0, 2.0) 323 VNF (__vn_expf, -9.9, 9.9) 324 VNF (_ZGVnN4v_expf, -9.9, 9.9) 325 VNF (__vn_expf_1u, -9.9, 9.9) 326 VNF (__vn_exp2f, -9.9, 9.9) 327 VNF (_ZGVnN4v_exp2f, -9.9, 9.9) 328 VNF (__vn_exp2f_1u, -9.9, 9.9) 329 VNF (__vn_logf, 0.01, 11.1) 330 VNF (_ZGVnN4v_logf, 0.01, 11.1) 331 {"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, 332 {"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, 333 VNF (__vn_sinf, -3.1, 3.1) 334 VNF (_ZGVnN4v_sinf, -3.1, 3.1) 335 VNF (__vn_cosf, -3.1, 3.1) 336 VNF (_ZGVnN4v_cosf, -3.1, 3.1) 337 #endif 338 #endif 339 #endif 340 {0}, 341 #undef F 342 #undef D 343 #undef VF 344 #undef VD 345 #undef VNF 346 #undef VND 347 }; 348 349 static void 350 gen_linear (double lo, double hi) 351 { 352 for (int i = 0; i < N; i++) 353 A[i] = (lo * (N - i) + hi * i) / N; 354 } 355 356 static void 357 genf_linear (double lo, double hi) 358 { 359 for (int i = 0; i < N; i++) 360 Af[i] = (float)(lo * (N - i) + hi * i) / N; 361 } 362 363 static inline double 364 asdouble (uint64_t i) 365 { 366 union 367 { 368 uint64_t i; 369 double f; 370 } u = {i}; 371 return u.f; 372 } 373 374 static uint64_t seed = 0x0123456789abcdef; 375 376 static double 377 frand (double lo, double hi) 378 { 379 seed = 6364136223846793005ULL * seed + 1; 380 return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0); 381 } 382 383 static void 384 gen_rand (double lo, double hi) 385 { 386 for (int i = 0; i < N; i++) 387 A[i] = frand (lo, hi); 388 } 389 390 static void 391 genf_rand (double lo, double hi) 392 { 393 for (int i = 0; i < N; i++) 394 Af[i] = (float)frand (lo, hi); 395 } 396 397 static void 398 gen_trace (int index) 399 { 400 for (int i = 0; i < N; i++) 401 A[i] = Trace[index + i]; 402 } 403 404 static void 405 genf_trace (int index) 406 { 407 for (int i = 0; i < N; i++) 408 Af[i] = (float)Trace[index + i]; 409 } 410 411 static void 412 run_thruput (double f (double)) 413 { 414 for (int i = 0; i < N; i++) 415 f (A[i]); 416 } 417 418 static void 419 runf_thruput (float f (float)) 420 { 421 for (int i = 0; i < N; i++) 422 f (Af[i]); 423 } 424 425 volatile double zero = 0; 426 427 static void 428 run_latency (double f (double)) 429 { 430 double z = zero; 431 double prev = z; 432 for (int i = 0; i < N; i++) 433 prev = f (A[i] + prev * z); 434 } 435 436 static void 437 runf_latency (float f (float)) 438 { 439 float z = (float)zero; 440 float prev = z; 441 for (int i = 0; i < N; i++) 442 prev = f (Af[i] + prev * z); 443 } 444 445 static void 446 run_v_thruput (v_double f (v_double)) 447 { 448 for (int i = 0; i < N; i += v_double_len ()) 449 f (v_double_load (A+i)); 450 } 451 452 static void 453 runf_v_thruput (v_float f (v_float)) 454 { 455 for (int i = 0; i < N; i += v_float_len ()) 456 f (v_float_load (Af+i)); 457 } 458 459 static void 460 run_v_latency (v_double f (v_double)) 461 { 462 v_double z = v_double_dup (zero); 463 v_double prev = z; 464 for (int i = 0; i < N; i += v_double_len ()) 465 prev = f (v_double_load (A+i) + prev * z); 466 } 467 468 static void 469 runf_v_latency (v_float f (v_float)) 470 { 471 v_float z = v_float_dup (zero); 472 v_float prev = z; 473 for (int i = 0; i < N; i += v_float_len ()) 474 prev = f (v_float_load (Af+i) + prev * z); 475 } 476 477 #ifdef __vpcs 478 static void 479 run_vn_thruput (__vpcs v_double f (v_double)) 480 { 481 for (int i = 0; i < N; i += v_double_len ()) 482 f (v_double_load (A+i)); 483 } 484 485 static void 486 runf_vn_thruput (__vpcs v_float f (v_float)) 487 { 488 for (int i = 0; i < N; i += v_float_len ()) 489 f (v_float_load (Af+i)); 490 } 491 492 static void 493 run_vn_latency (__vpcs v_double f (v_double)) 494 { 495 v_double z = v_double_dup (zero); 496 v_double prev = z; 497 for (int i = 0; i < N; i += v_double_len ()) 498 prev = f (v_double_load (A+i) + prev * z); 499 } 500 501 static void 502 runf_vn_latency (__vpcs v_float f (v_float)) 503 { 504 v_float z = v_float_dup (zero); 505 v_float prev = z; 506 for (int i = 0; i < N; i += v_float_len ()) 507 prev = f (v_float_load (Af+i) + prev * z); 508 } 509 #endif 510 511 static uint64_t 512 tic (void) 513 { 514 struct timespec ts; 515 if (clock_gettime (CLOCK_REALTIME, &ts)) 516 abort (); 517 return ts.tv_sec * 1000000000ULL + ts.tv_nsec; 518 } 519 520 #define TIMEIT(run, f) do { \ 521 dt = -1; \ 522 run (f); /* Warm up. */ \ 523 for (int j = 0; j < measurecount; j++) \ 524 { \ 525 uint64_t t0 = tic (); \ 526 for (int i = 0; i < itercount; i++) \ 527 run (f); \ 528 uint64_t t1 = tic (); \ 529 if (t1 - t0 < dt) \ 530 dt = t1 - t0; \ 531 } \ 532 } while (0) 533 534 static void 535 bench1 (const struct fun *f, int type, double lo, double hi) 536 { 537 uint64_t dt = 0; 538 uint64_t ns100; 539 const char *s = type == 't' ? "rthruput" : "latency"; 540 int vlen = 1; 541 542 if (f->vec && f->prec == 'd') 543 vlen = v_double_len(); 544 else if (f->vec && f->prec == 'f') 545 vlen = v_float_len(); 546 547 if (f->prec == 'd' && type == 't' && f->vec == 0) 548 TIMEIT (run_thruput, f->fun.d); 549 else if (f->prec == 'd' && type == 'l' && f->vec == 0) 550 TIMEIT (run_latency, f->fun.d); 551 else if (f->prec == 'f' && type == 't' && f->vec == 0) 552 TIMEIT (runf_thruput, f->fun.f); 553 else if (f->prec == 'f' && type == 'l' && f->vec == 0) 554 TIMEIT (runf_latency, f->fun.f); 555 else if (f->prec == 'd' && type == 't' && f->vec == 'v') 556 TIMEIT (run_v_thruput, f->fun.vd); 557 else if (f->prec == 'd' && type == 'l' && f->vec == 'v') 558 TIMEIT (run_v_latency, f->fun.vd); 559 else if (f->prec == 'f' && type == 't' && f->vec == 'v') 560 TIMEIT (runf_v_thruput, f->fun.vf); 561 else if (f->prec == 'f' && type == 'l' && f->vec == 'v') 562 TIMEIT (runf_v_latency, f->fun.vf); 563 #ifdef __vpcs 564 else if (f->prec == 'd' && type == 't' && f->vec == 'n') 565 TIMEIT (run_vn_thruput, f->fun.vnd); 566 else if (f->prec == 'd' && type == 'l' && f->vec == 'n') 567 TIMEIT (run_vn_latency, f->fun.vnd); 568 else if (f->prec == 'f' && type == 't' && f->vec == 'n') 569 TIMEIT (runf_vn_thruput, f->fun.vnf); 570 else if (f->prec == 'f' && type == 'l' && f->vec == 'n') 571 TIMEIT (runf_vn_latency, f->fun.vnf); 572 #endif 573 574 if (type == 't') 575 { 576 ns100 = (100 * dt + itercount * N / 2) / (itercount * N); 577 printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, 578 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 579 (unsigned long long) dt, lo, hi); 580 } 581 else if (type == 'l') 582 { 583 ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); 584 printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, 585 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 586 (unsigned long long) dt, lo, hi); 587 } 588 fflush (stdout); 589 } 590 591 static void 592 bench (const struct fun *f, double lo, double hi, int type, int gen) 593 { 594 if (f->prec == 'd' && gen == 'r') 595 gen_rand (lo, hi); 596 else if (f->prec == 'd' && gen == 'l') 597 gen_linear (lo, hi); 598 else if (f->prec == 'd' && gen == 't') 599 gen_trace (0); 600 else if (f->prec == 'f' && gen == 'r') 601 genf_rand (lo, hi); 602 else if (f->prec == 'f' && gen == 'l') 603 genf_linear (lo, hi); 604 else if (f->prec == 'f' && gen == 't') 605 genf_trace (0); 606 607 if (gen == 't') 608 hi = trace_size / N; 609 610 if (type == 'b' || type == 't') 611 bench1 (f, 't', lo, hi); 612 613 if (type == 'b' || type == 'l') 614 bench1 (f, 'l', lo, hi); 615 616 for (int i = N; i < trace_size; i += N) 617 { 618 if (f->prec == 'd') 619 gen_trace (i); 620 else 621 genf_trace (i); 622 623 lo = i / N; 624 if (type == 'b' || type == 't') 625 bench1 (f, 't', lo, hi); 626 627 if (type == 'b' || type == 'l') 628 bench1 (f, 'l', lo, hi); 629 } 630 } 631 632 static void 633 readtrace (const char *name) 634 { 635 int n = 0; 636 FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r"); 637 if (!f) 638 { 639 printf ("openning \"%s\" failed: %m\n", name); 640 exit (1); 641 } 642 for (;;) 643 { 644 if (n >= trace_size) 645 { 646 trace_size += N; 647 Trace = realloc (Trace, trace_size * sizeof (Trace[0])); 648 if (Trace == NULL) 649 { 650 printf ("out of memory\n"); 651 exit (1); 652 } 653 } 654 if (fscanf (f, "%lf", Trace + n) != 1) 655 break; 656 n++; 657 } 658 if (ferror (f) || n == 0) 659 { 660 printf ("reading \"%s\" failed: %m\n", name); 661 exit (1); 662 } 663 fclose (f); 664 if (n % N == 0) 665 trace_size = n; 666 for (int i = 0; n < trace_size; n++, i++) 667 Trace[n] = Trace[i]; 668 } 669 670 static void 671 usage (void) 672 { 673 printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] " 674 "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func " 675 "[func2 ..]\n"); 676 printf ("func:\n"); 677 printf ("%7s [run all benchmarks]\n", "all"); 678 for (const struct fun *f = funtab; f->name; f++) 679 printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi); 680 exit (1); 681 } 682 683 int 684 main (int argc, char *argv[]) 685 { 686 int usergen = 0, gen = 'r', type = 'b', all = 0; 687 double lo = 0, hi = 0; 688 const char *tracefile = "-"; 689 690 argv++; 691 argc--; 692 for (;;) 693 { 694 if (argc <= 0) 695 usage (); 696 if (argv[0][0] != '-') 697 break; 698 else if (argc >= 3 && strcmp (argv[0], "-i") == 0) 699 { 700 usergen = 1; 701 lo = strtod (argv[1], 0); 702 hi = strtod (argv[2], 0); 703 argv += 3; 704 argc -= 3; 705 } 706 else if (argc >= 2 && strcmp (argv[0], "-m") == 0) 707 { 708 measurecount = strtol (argv[1], 0, 0); 709 argv += 2; 710 argc -= 2; 711 } 712 else if (argc >= 2 && strcmp (argv[0], "-c") == 0) 713 { 714 itercount = strtol (argv[1], 0, 0); 715 argv += 2; 716 argc -= 2; 717 } 718 else if (argc >= 2 && strcmp (argv[0], "-g") == 0) 719 { 720 gen = argv[1][0]; 721 if (strchr ("rlt", gen) == 0) 722 usage (); 723 argv += 2; 724 argc -= 2; 725 } 726 else if (argc >= 2 && strcmp (argv[0], "-f") == 0) 727 { 728 gen = 't'; /* -f implies -g trace. */ 729 tracefile = argv[1]; 730 argv += 2; 731 argc -= 2; 732 } 733 else if (argc >= 2 && strcmp (argv[0], "-t") == 0) 734 { 735 type = argv[1][0]; 736 if (strchr ("ltb", type) == 0) 737 usage (); 738 argv += 2; 739 argc -= 2; 740 } 741 else 742 usage (); 743 } 744 if (gen == 't') 745 { 746 readtrace (tracefile); 747 lo = hi = 0; 748 usergen = 1; 749 } 750 while (argc > 0) 751 { 752 int found = 0; 753 all = strcmp (argv[0], "all") == 0; 754 for (const struct fun *f = funtab; f->name; f++) 755 if (all || strcmp (argv[0], f->name) == 0) 756 { 757 found = 1; 758 if (!usergen) 759 { 760 lo = f->lo; 761 hi = f->hi; 762 } 763 bench (f, lo, hi, type, gen); 764 if (usergen && !all) 765 break; 766 } 767 if (!found) 768 printf ("unknown function: %s\n", argv[0]); 769 argv++; 770 argc--; 771 } 772 return 0; 773 } 774