1 /* 2 * Ones' complement checksum test & benchmark 3 * 4 * Copyright (c) 2016-2020, Arm Limited. 5 * SPDX-License-Identifier: MIT 6 */ 7 8 #define _GNU_SOURCE 9 #include <inttypes.h> 10 #include <stdbool.h> 11 #include <stdint.h> 12 #include <stdio.h> 13 #include <stdlib.h> 14 #include <string.h> 15 #include <sys/mman.h> 16 #include <time.h> 17 #include <unistd.h> 18 #include "../include/networking.h" 19 20 #if WANT_ASSERT 21 #undef NDEBUG 22 #include <assert.h> 23 #define Assert(exp) assert(exp) 24 #else 25 #define Assert(exp) (void) (exp) 26 #endif 27 28 #ifdef __GNUC__ 29 #define may_alias __attribute__((__may_alias__)) 30 #else 31 #define may_alias 32 #endif 33 34 #define CACHE_LINE 64 35 #define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1)) 36 37 /* Reference implementation - do not modify! */ 38 static uint16_t 39 checksum_simple(const void *ptr, uint32_t nbytes) 40 { 41 const uint16_t *may_alias hptr = ptr; 42 uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */ 43 44 /* Sum all halfwords, assume misaligned accesses are handled in HW */ 45 for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--) 46 { 47 sum += *hptr++; 48 } 49 50 /* Add any trailing odd byte */ 51 if ((nbytes & 0x01) != 0) 52 { 53 sum += *(uint8_t *) hptr; 54 } 55 56 /* Fold 64-bit sum to 32 bits */ 57 sum = (sum & 0xffffffff) + (sum >> 32); 58 sum = (sum & 0xffffffff) + (sum >> 32); 59 Assert(sum == (uint32_t) sum); 60 61 /* Fold 32-bit sum to 16 bits */ 62 sum = (sum & 0xffff) + (sum >> 16); 63 sum = (sum & 0xffff) + (sum >> 16); 64 Assert(sum == (uint16_t) sum); 65 66 return (uint16_t) sum; 67 } 68 69 static struct 70 { 71 uint16_t (*cksum_fp)(const void *, uint32_t); 72 const char *name; 73 } implementations[] = 74 { 75 { checksum_simple, "simple"}, 76 { __chksum, "scalar"}, 77 #if __arm__ 78 { __chksum_arm_simd, "simd" }, 79 #elif __aarch64__ 80 { __chksum_aarch64_simd, "simd" }, 81 #endif 82 { NULL, NULL} 83 }; 84 85 static int 86 find_impl(const char *name) 87 { 88 for (int i = 0; implementations[i].name != NULL; i++) 89 { 90 if (strcmp(implementations[i].name, name) == 0) 91 { 92 return i; 93 } 94 } 95 return -1; 96 } 97 98 static uint16_t (*CKSUM_FP)(const void *, uint32_t); 99 static volatile uint16_t SINK; 100 101 static bool 102 verify(const void *data, uint32_t offset, uint32_t size) 103 { 104 105 uint16_t csum_expected = checksum_simple(data, size); 106 uint16_t csum_actual = CKSUM_FP(data, size); 107 if (csum_actual != csum_expected) 108 { 109 fprintf(stderr, "\nInvalid checksum for offset %u size %u: " 110 "actual %04x expected %04x (valid)", 111 offset, size, csum_actual, csum_expected); 112 if (size < 65536) 113 { 114 /* Fatal error */ 115 exit(EXIT_FAILURE); 116 } 117 /* Else some implementations only support sizes up to 2^16 */ 118 return false; 119 } 120 return true; 121 } 122 123 static uint64_t 124 clock_get_ns(void) 125 { 126 struct timespec ts; 127 clock_gettime(CLOCK_MONOTONIC, &ts); 128 return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec; 129 } 130 131 static void 132 benchmark(const uint8_t *base, 133 size_t poolsize, 134 uint32_t blksize, 135 uint32_t numops, 136 uint64_t cpufreq) 137 { 138 printf("%11u ", (unsigned int) blksize); fflush(stdout); 139 140 uint64_t start = clock_get_ns(); 141 for (uint32_t i = 0; i < numops; i ++) 142 { 143 /* Read a random value from the pool */ 144 uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)]; 145 /* Generate a random starting address */ 146 const void *data = &base[random % (poolsize - blksize)]; 147 SINK = CKSUM_FP(data, blksize); 148 } 149 uint64_t end = clock_get_ns(); 150 151 #define MEGABYTE 1000000 /* Decimal megabyte (MB) */ 152 uint64_t elapsed_ns = end - start; 153 uint64_t elapsed_ms = elapsed_ns / 1000000; 154 uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000); 155 uint64_t accbytes = (uint64_t) numops * blksize; 156 printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE); 157 unsigned int cyc_per_blk = cpufreq / blks_per_s; 158 printf("%11u ", cyc_per_blk); 159 if (blksize != 0) 160 { 161 unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize; 162 printf("%7u.%03u ", 163 cyc_per_byte / 1000, cyc_per_byte % 1000); 164 } 165 printf("\n"); 166 } 167 168 int main(int argc, char *argv[]) 169 { 170 int c; 171 bool DUMP = false; 172 uint32_t IMPL = 0;/* Simple implementation */ 173 uint64_t CPUFREQ = 0; 174 uint32_t BLKSIZE = 0; 175 uint32_t NUMOPS = 1000000; 176 uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */ 177 178 setvbuf(stdout, NULL, _IOLBF, 160); 179 while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1) 180 { 181 switch (c) 182 { 183 case 'b' : 184 { 185 int blksize = atoi(optarg); 186 if (blksize < 1 || blksize > POOLSIZE / 2) 187 { 188 fprintf(stderr, "Invalid block size %d\n", blksize); 189 exit(EXIT_FAILURE); 190 } 191 BLKSIZE = (unsigned) blksize; 192 break; 193 } 194 case 'd' : 195 DUMP = true; 196 break; 197 case 'f' : 198 { 199 int64_t cpufreq = atoll(optarg); 200 if (cpufreq < 1) 201 { 202 fprintf(stderr, "Invalid CPU frequency %"PRId64"\n", 203 cpufreq); 204 exit(EXIT_FAILURE); 205 } 206 CPUFREQ = cpufreq; 207 break; 208 } 209 case 'i' : 210 { 211 int impl = find_impl(optarg); 212 if (impl < 0) 213 { 214 fprintf(stderr, "Invalid implementation %s\n", optarg); 215 goto usage; 216 } 217 IMPL = (unsigned) impl; 218 break; 219 } 220 case 'n' : 221 { 222 int numops = atoi(optarg); 223 if (numops < 1) 224 { 225 fprintf(stderr, "Invalid number of operations %d\n", numops); 226 exit(EXIT_FAILURE); 227 } 228 NUMOPS = (unsigned) numops; 229 break; 230 } 231 case 'p' : 232 { 233 int poolsize = atoi(optarg); 234 if (poolsize < 4096) 235 { 236 fprintf(stderr, "Invalid pool size %d\n", poolsize); 237 exit(EXIT_FAILURE); 238 } 239 char c = optarg[strlen(optarg) - 1]; 240 if (c == 'M') 241 { 242 POOLSIZE = (unsigned) poolsize * 1024 * 1024; 243 } 244 else if (c == 'K') 245 { 246 POOLSIZE = (unsigned) poolsize * 1024; 247 } 248 else 249 { 250 POOLSIZE = (unsigned) poolsize; 251 } 252 break; 253 } 254 default : 255 usage : 256 fprintf(stderr, "Usage: checksum <options>\n" 257 "-b <blksize> Block size\n" 258 "-d Dump first 96 bytes of data\n" 259 "-f <cpufreq> CPU frequency (Hz)\n" 260 "-i <impl> Implementation\n" 261 "-n <numops> Number of operations\n" 262 "-p <poolsize> Pool size (K or M suffix)\n" 263 ); 264 printf("Implementations:"); 265 for (int i = 0; implementations[i].name != NULL; i++) 266 { 267 printf(" %s", implementations[i].name); 268 } 269 printf("\n"); 270 exit(EXIT_FAILURE); 271 } 272 } 273 if (optind > argc) 274 { 275 goto usage; 276 } 277 278 CKSUM_FP = implementations[IMPL].cksum_fp; 279 POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE); 280 uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE, 281 MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); 282 if (base == MAP_FAILED) 283 { 284 perror("aligned_alloc"), exit(EXIT_FAILURE); 285 } 286 for (size_t i = 0; i < POOLSIZE / 4; i++) 287 { 288 ((uint32_t *) base)[i] = rand(); 289 } 290 291 printf("Implementation: %s\n", implementations[IMPL].name); 292 printf("numops %u, poolsize ", NUMOPS); 293 if (POOLSIZE % (1024 * 1024) == 0) 294 { 295 printf("%uMiB", POOLSIZE / (1024 * 1024)); 296 } 297 else if (POOLSIZE % 1024 == 0) 298 { 299 printf("%uKiB", POOLSIZE / 1024); 300 } 301 else 302 { 303 printf("%uB", POOLSIZE); 304 } 305 printf(", blocksize %u, CPU frequency %juMHz\n", 306 BLKSIZE, (uintmax_t) (CPUFREQ / 1000000)); 307 #if WANT_ASSERT 308 printf("Warning: assertions are enabled\n"); 309 #endif 310 311 if (DUMP) 312 { 313 /* Print out first 96 bytes of data for human debugging */ 314 for (int i = 0; i < 96; i++) 315 { 316 if (i % 8 == 0) 317 printf("%2u:", i); 318 printf(" %02x", base[i]); 319 if (i % 8 == 7) 320 printf("\n"); 321 } 322 } 323 324 /* Verify that chosen algorithm handles all combinations of offsets and sizes */ 325 printf("Verifying..."); fflush(stdout); 326 bool success = true; 327 /* Check all (relevant) combinations of size and offset */ 328 for (int size = 0; size <= 256; size++) 329 { 330 for (int offset = 0; offset < 255; offset++) 331 { 332 /* Check at start of mapped memory */ 333 success &= verify(&base[offset], offset, size); 334 /* Check at end of mapped memory */ 335 uint8_t *p = base + POOLSIZE - (size + offset); 336 success &= verify(p, (uintptr_t) p % 64, size); 337 } 338 } 339 /* Check increasingly larger sizes */ 340 for (size_t size = 1; size < POOLSIZE; size *= 2) 341 { 342 success &= verify(base, 0, size); 343 } 344 /* Check the full size, this can detect accumulator overflows */ 345 success &= verify(base, 0, POOLSIZE); 346 printf("%s\n", success ? "OK" : "failure"); 347 348 /* Print throughput in decimal megabyte (1000000B) per second */ 349 if (CPUFREQ != 0) 350 { 351 printf("%11s %11s %11s %11s\n", 352 "block size", "MB/s", "cycles/blk", "cycles/byte"); 353 } 354 else 355 { 356 printf("%11s %11s %11s %11s\n", 357 "block size", "MB/s", "ns/blk", "ns/byte"); 358 CPUFREQ = 1000000000; 359 } 360 if (BLKSIZE != 0) 361 { 362 benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ); 363 } 364 else 365 { 366 static const uint16_t sizes[] = 367 { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 }; 368 for (int i = 0; sizes[i] != 0; i++) 369 { 370 uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]); 371 benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ); 372 } 373 } 374 375 if (munmap(base, POOLSIZE) != 0) 376 { 377 perror("munmap"), exit(EXIT_FAILURE); 378 } 379 380 return success ? EXIT_SUCCESS : EXIT_FAILURE; 381 } 382