1 /* 2 * Microbenchmark for math functions. 3 * 4 * Copyright (c) 2018-2024, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 8 #if WANT_SVE_TESTS 9 # if __aarch64__ && __linux__ 10 # ifdef __clang__ 11 # pragma clang attribute push(__attribute__((target("sve"))), \ 12 apply_to = any(function)) 13 # else 14 # pragma GCC target("+sve") 15 # endif 16 # else 17 # error "SVE not supported - please disable WANT_SVE_TESTS" 18 # endif 19 #endif 20 21 #undef _GNU_SOURCE 22 #define _GNU_SOURCE 1 23 #include <stdint.h> 24 #include <stdlib.h> 25 #include <stdio.h> 26 #include <string.h> 27 #include <time.h> 28 #include <math.h> 29 #include "mathlib.h" 30 31 /* Number of measurements, best result is reported. */ 32 #define MEASURE 60 33 /* Array size. */ 34 #define N 8000 35 /* Iterations over the array. */ 36 #define ITER 125 37 38 static double *Trace; 39 static size_t trace_size; 40 static double A[N]; 41 static float Af[N]; 42 static long measurecount = MEASURE; 43 static long itercount = ITER; 44 45 static double 46 dummy (double x) 47 { 48 return x; 49 } 50 51 static float 52 dummyf (float x) 53 { 54 return x; 55 } 56 #if __aarch64__ && __linux__ 57 __vpcs static float64x2_t 58 __vn_dummy (float64x2_t x) 59 { 60 return x; 61 } 62 63 __vpcs static float32x4_t 64 __vn_dummyf (float32x4_t x) 65 { 66 return x; 67 } 68 #endif 69 #if WANT_SVE_TESTS 70 static svfloat64_t 71 __sv_dummy (svfloat64_t x, svbool_t pg) 72 { 73 return x; 74 } 75 76 static svfloat32_t 77 __sv_dummyf (svfloat32_t x, svbool_t pg) 78 { 79 return x; 80 } 81 82 #endif 83 84 #include "test/mathbench_wrappers.h" 85 86 static const struct fun 87 { 88 const char *name; 89 int prec; 90 int vec; 91 double lo; 92 double hi; 93 union 94 { 95 double (*d) (double); 96 float (*f) (float); 97 #if __aarch64__ && __linux__ 98 __vpcs float64x2_t (*vnd) (float64x2_t); 99 __vpcs float32x4_t (*vnf) (float32x4_t); 100 #endif 101 #if WANT_SVE_TESTS 102 svfloat64_t (*svd) (svfloat64_t, svbool_t); 103 svfloat32_t (*svf) (svfloat32_t, svbool_t); 104 #endif 105 } fun; 106 } funtab[] = { 107 // clang-format off 108 #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, 109 #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, 110 #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, 111 #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, 112 #define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, 113 #define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, 114 D (dummy, 1.0, 2.0) 115 F (dummyf, 1.0, 2.0) 116 #if __aarch64__ && __linux__ 117 VND (__vn_dummy, 1.0, 2.0) 118 VNF (__vn_dummyf, 1.0, 2.0) 119 #endif 120 #if WANT_SVE_TESTS 121 SVD (__sv_dummy, 1.0, 2.0) 122 SVF (__sv_dummyf, 1.0, 2.0) 123 #endif 124 #include "test/mathbench_funcs.h" 125 {0}, 126 #undef F 127 #undef D 128 #undef VNF 129 #undef VND 130 #undef SVF 131 #undef SVD 132 // clang-format on 133 }; 134 135 static void 136 gen_linear (double lo, double hi) 137 { 138 for (int i = 0; i < N; i++) 139 A[i] = (lo * (N - i) + hi * i) / N; 140 } 141 142 static void 143 genf_linear (double lo, double hi) 144 { 145 for (int i = 0; i < N; i++) 146 Af[i] = (float)(lo * (N - i) + hi * i) / N; 147 } 148 149 static inline double 150 asdouble (uint64_t i) 151 { 152 union 153 { 154 uint64_t i; 155 double f; 156 } u = {i}; 157 return u.f; 158 } 159 160 static uint64_t seed = 0x0123456789abcdef; 161 162 static double 163 frand (double lo, double hi) 164 { 165 seed = 6364136223846793005ULL * seed + 1; 166 return lo + (hi - lo) * (asdouble (seed >> 12 | 0x3ffULL << 52) - 1.0); 167 } 168 169 static void 170 gen_rand (double lo, double hi) 171 { 172 for (int i = 0; i < N; i++) 173 A[i] = frand (lo, hi); 174 } 175 176 static void 177 genf_rand (double lo, double hi) 178 { 179 for (int i = 0; i < N; i++) 180 Af[i] = (float)frand (lo, hi); 181 } 182 183 static void 184 gen_trace (int index) 185 { 186 for (int i = 0; i < N; i++) 187 A[i] = Trace[index + i]; 188 } 189 190 static void 191 genf_trace (int index) 192 { 193 for (int i = 0; i < N; i++) 194 Af[i] = (float)Trace[index + i]; 195 } 196 197 static void 198 run_thruput (double f (double)) 199 { 200 for (int i = 0; i < N; i++) 201 f (A[i]); 202 } 203 204 static void 205 runf_thruput (float f (float)) 206 { 207 for (int i = 0; i < N; i++) 208 f (Af[i]); 209 } 210 211 volatile double zero = 0; 212 213 static void 214 run_latency (double f (double)) 215 { 216 double z = zero; 217 double prev = z; 218 for (int i = 0; i < N; i++) 219 prev = f (A[i] + prev * z); 220 } 221 222 static void 223 runf_latency (float f (float)) 224 { 225 float z = (float)zero; 226 float prev = z; 227 for (int i = 0; i < N; i++) 228 prev = f (Af[i] + prev * z); 229 } 230 231 #if __aarch64__ && __linux__ 232 static void 233 run_vn_thruput (__vpcs float64x2_t f (float64x2_t)) 234 { 235 for (int i = 0; i < N; i += 2) 236 f (vld1q_f64 (A + i)); 237 } 238 239 static void 240 runf_vn_thruput (__vpcs float32x4_t f (float32x4_t)) 241 { 242 for (int i = 0; i < N; i += 4) 243 f (vld1q_f32 (Af + i)); 244 } 245 246 static void 247 run_vn_latency (__vpcs float64x2_t f (float64x2_t)) 248 { 249 volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; 250 uint64x2_t sel = vsel; 251 float64x2_t prev = vdupq_n_f64 (0); 252 for (int i = 0; i < N; i += 2) 253 prev = f (vbslq_f64 (sel, prev, vld1q_f64 (A + i))); 254 } 255 256 static void 257 runf_vn_latency (__vpcs float32x4_t f (float32x4_t)) 258 { 259 volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; 260 uint32x4_t sel = vsel; 261 float32x4_t prev = vdupq_n_f32 (0); 262 for (int i = 0; i < N; i += 4) 263 prev = f (vbslq_f32 (sel, prev, vld1q_f32 (Af + i))); 264 } 265 #endif 266 267 #if WANT_SVE_TESTS 268 static void 269 run_sv_thruput (svfloat64_t f (svfloat64_t, svbool_t)) 270 { 271 for (int i = 0; i < N; i += svcntd ()) 272 f (svld1_f64 (svptrue_b64 (), A + i), svptrue_b64 ()); 273 } 274 275 static void 276 runf_sv_thruput (svfloat32_t f (svfloat32_t, svbool_t)) 277 { 278 for (int i = 0; i < N; i += svcntw ()) 279 f (svld1_f32 (svptrue_b32 (), Af + i), svptrue_b32 ()); 280 } 281 282 static void 283 run_sv_latency (svfloat64_t f (svfloat64_t, svbool_t)) 284 { 285 volatile svbool_t vsel = svptrue_b64 (); 286 svbool_t sel = vsel; 287 svfloat64_t prev = svdup_f64 (0); 288 for (int i = 0; i < N; i += svcntd ()) 289 prev = f (svsel_f64 (sel, svld1_f64 (svptrue_b64 (), A + i), prev), 290 svptrue_b64 ()); 291 } 292 293 static void 294 runf_sv_latency (svfloat32_t f (svfloat32_t, svbool_t)) 295 { 296 volatile svbool_t vsel = svptrue_b32 (); 297 svbool_t sel = vsel; 298 svfloat32_t prev = svdup_f32 (0); 299 for (int i = 0; i < N; i += svcntw ()) 300 prev = f (svsel_f32 (sel, svld1_f32 (svptrue_b32 (), Af + i), prev), 301 svptrue_b32 ()); 302 } 303 #endif 304 305 static uint64_t 306 tic (void) 307 { 308 struct timespec ts; 309 #if defined(_MSC_VER) 310 if (!timespec_get (&ts, TIME_UTC)) 311 #else 312 if (clock_gettime (CLOCK_REALTIME, &ts)) 313 #endif 314 abort (); 315 return ts.tv_sec * 1000000000ULL + ts.tv_nsec; 316 } 317 318 #define TIMEIT(run, f) do { \ 319 dt = -1; \ 320 run (f); /* Warm up. */ \ 321 for (int j = 0; j < measurecount; j++) \ 322 { \ 323 uint64_t t0 = tic (); \ 324 for (int i = 0; i < itercount; i++) \ 325 run (f); \ 326 uint64_t t1 = tic (); \ 327 if (t1 - t0 < dt) \ 328 dt = t1 - t0; \ 329 } \ 330 } while (0) 331 332 static void 333 bench1 (const struct fun *f, int type, double lo, double hi) 334 { 335 uint64_t dt = 0; 336 uint64_t ns100; 337 const char *s = type == 't' ? "rthruput" : "latency"; 338 int vlen = 1; 339 340 if (f->vec == 'n') 341 vlen = f->prec == 'd' ? 2 : 4; 342 #if WANT_SVE_TESTS 343 else if (f->vec == 's') 344 vlen = f->prec == 'd' ? svcntd () : svcntw (); 345 #endif 346 347 if (f->prec == 'd' && type == 't' && f->vec == 0) 348 TIMEIT (run_thruput, f->fun.d); 349 else if (f->prec == 'd' && type == 'l' && f->vec == 0) 350 TIMEIT (run_latency, f->fun.d); 351 else if (f->prec == 'f' && type == 't' && f->vec == 0) 352 TIMEIT (runf_thruput, f->fun.f); 353 else if (f->prec == 'f' && type == 'l' && f->vec == 0) 354 TIMEIT (runf_latency, f->fun.f); 355 #if __aarch64__ && __linux__ 356 else if (f->prec == 'd' && type == 't' && f->vec == 'n') 357 TIMEIT (run_vn_thruput, f->fun.vnd); 358 else if (f->prec == 'd' && type == 'l' && f->vec == 'n') 359 TIMEIT (run_vn_latency, f->fun.vnd); 360 else if (f->prec == 'f' && type == 't' && f->vec == 'n') 361 TIMEIT (runf_vn_thruput, f->fun.vnf); 362 else if (f->prec == 'f' && type == 'l' && f->vec == 'n') 363 TIMEIT (runf_vn_latency, f->fun.vnf); 364 #endif 365 #if WANT_SVE_TESTS 366 else if (f->prec == 'd' && type == 't' && f->vec == 's') 367 TIMEIT (run_sv_thruput, f->fun.svd); 368 else if (f->prec == 'd' && type == 'l' && f->vec == 's') 369 TIMEIT (run_sv_latency, f->fun.svd); 370 else if (f->prec == 'f' && type == 't' && f->vec == 's') 371 TIMEIT (runf_sv_thruput, f->fun.svf); 372 else if (f->prec == 'f' && type == 'l' && f->vec == 's') 373 TIMEIT (runf_sv_latency, f->fun.svf); 374 #endif 375 376 if (type == 't') 377 { 378 ns100 = (100 * dt + itercount * N / 2) / (itercount * N); 379 printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", 380 f->name, s, 381 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 382 (unsigned long long) dt, lo, hi, vlen); 383 } 384 else if (type == 'l') 385 { 386 ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); 387 printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", 388 f->name, s, 389 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), 390 (unsigned long long) dt, lo, hi, vlen); 391 } 392 fflush (stdout); 393 } 394 395 static void 396 bench (const struct fun *f, double lo, double hi, int type, int gen) 397 { 398 if (f->prec == 'd' && gen == 'r') 399 gen_rand (lo, hi); 400 else if (f->prec == 'd' && gen == 'l') 401 gen_linear (lo, hi); 402 else if (f->prec == 'd' && gen == 't') 403 gen_trace (0); 404 else if (f->prec == 'f' && gen == 'r') 405 genf_rand (lo, hi); 406 else if (f->prec == 'f' && gen == 'l') 407 genf_linear (lo, hi); 408 else if (f->prec == 'f' && gen == 't') 409 genf_trace (0); 410 411 if (gen == 't') 412 hi = trace_size / N; 413 414 if (type == 'b' || type == 't') 415 bench1 (f, 't', lo, hi); 416 417 if (type == 'b' || type == 'l') 418 bench1 (f, 'l', lo, hi); 419 420 for (int i = N; i < trace_size; i += N) 421 { 422 if (f->prec == 'd') 423 gen_trace (i); 424 else 425 genf_trace (i); 426 427 lo = i / N; 428 if (type == 'b' || type == 't') 429 bench1 (f, 't', lo, hi); 430 431 if (type == 'b' || type == 'l') 432 bench1 (f, 'l', lo, hi); 433 } 434 } 435 436 static void 437 readtrace (const char *name) 438 { 439 int n = 0; 440 FILE *f = strcmp (name, "-") == 0 ? stdin : fopen (name, "r"); 441 if (!f) 442 { 443 printf ("openning \"%s\" failed: %m\n", name); 444 exit (1); 445 } 446 for (;;) 447 { 448 if (n >= trace_size) 449 { 450 trace_size += N; 451 Trace = realloc (Trace, trace_size * sizeof (Trace[0])); 452 if (Trace == NULL) 453 { 454 printf ("out of memory\n"); 455 exit (1); 456 } 457 } 458 if (fscanf (f, "%lf", Trace + n) != 1) 459 break; 460 n++; 461 } 462 if (ferror (f) || n == 0) 463 { 464 printf ("reading \"%s\" failed: %m\n", name); 465 exit (1); 466 } 467 fclose (f); 468 if (n % N == 0) 469 trace_size = n; 470 for (int i = 0; n < trace_size; n++, i++) 471 Trace[n] = Trace[i]; 472 } 473 474 static void 475 usage (void) 476 { 477 printf ("usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] " 478 "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func " 479 "[func2 ..]\n"); 480 printf ("func:\n"); 481 printf ("%7s [run all benchmarks]\n", "all"); 482 for (const struct fun *f = funtab; f->name; f++) 483 printf ("%7s [low: %g high: %g]\n", f->name, f->lo, f->hi); 484 exit (1); 485 } 486 487 int 488 main (int argc, char *argv[]) 489 { 490 int usergen = 0, gen = 'r', type = 'b', all = 0; 491 double lo = 0, hi = 0; 492 const char *tracefile = "-"; 493 494 argv++; 495 argc--; 496 for (;;) 497 { 498 if (argc <= 0) 499 usage (); 500 if (argv[0][0] != '-') 501 break; 502 else if (argc >= 3 && strcmp (argv[0], "-i") == 0) 503 { 504 usergen = 1; 505 lo = strtod (argv[1], 0); 506 hi = strtod (argv[2], 0); 507 argv += 3; 508 argc -= 3; 509 } 510 else if (argc >= 2 && strcmp (argv[0], "-m") == 0) 511 { 512 measurecount = strtol (argv[1], 0, 0); 513 argv += 2; 514 argc -= 2; 515 } 516 else if (argc >= 2 && strcmp (argv[0], "-c") == 0) 517 { 518 itercount = strtol (argv[1], 0, 0); 519 argv += 2; 520 argc -= 2; 521 } 522 else if (argc >= 2 && strcmp (argv[0], "-g") == 0) 523 { 524 gen = argv[1][0]; 525 if (strchr ("rlt", gen) == 0) 526 usage (); 527 argv += 2; 528 argc -= 2; 529 } 530 else if (argc >= 2 && strcmp (argv[0], "-f") == 0) 531 { 532 gen = 't'; /* -f implies -g trace. */ 533 tracefile = argv[1]; 534 argv += 2; 535 argc -= 2; 536 } 537 else if (argc >= 2 && strcmp (argv[0], "-t") == 0) 538 { 539 type = argv[1][0]; 540 if (strchr ("ltb", type) == 0) 541 usage (); 542 argv += 2; 543 argc -= 2; 544 } 545 else 546 usage (); 547 } 548 if (gen == 't') 549 { 550 readtrace (tracefile); 551 lo = hi = 0; 552 usergen = 1; 553 } 554 while (argc > 0) 555 { 556 int found = 0; 557 all = strcmp (argv[0], "all") == 0; 558 for (const struct fun *f = funtab; f->name; f++) 559 if (all || strcmp (argv[0], f->name) == 0) 560 { 561 found = 1; 562 if (!usergen) 563 { 564 lo = f->lo; 565 hi = f->hi; 566 } 567 bench (f, lo, hi, type, gen); 568 if (usergen && !all) 569 break; 570 } 571 if (!found) 572 printf ("unknown function: %s\n", argv[0]); 573 argv++; 574 argc--; 575 } 576 return 0; 577 } 578 579 #if __aarch64__ && __linux__ && WANT_SVE_TESTS && defined(__clang__) 580 # pragma clang attribute pop 581 #endif 582