xref: /freebsd/contrib/arm-optimized-routines/string/bench/memcpy.c (revision f3087bef11543b42e0d69b708f367097a4118d24)
1 /*
2  * memcpy benchmark.
3  *
4  * Copyright (c) 2020-2023, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #define _GNU_SOURCE
9 #include <stdint.h>
10 #include <stdio.h>
11 #include <string.h>
12 #include <assert.h>
13 #include "stringlib.h"
14 #include "benchlib.h"
15 
/* Repetitions of the whole random copy set per working-set size.  */
#define ITERS 5000
/* Calls per size in the medium (8..512 byte) tests.  */
#define ITERS2 20000000
/* Calls per size in the large copy and memmove tests.  */
#define ITERS3 200000
/* Number of random copies generated for one pass of the random test.  */
#define NUM_TESTS 16384
/* Smallest and largest working-set size, in bytes, of the random test.  */
#define MIN_SIZE (32 * 1024)
#define MAX_SIZE (1 << 20)

/* Page-aligned benchmark buffers: a is normally the source and b the
   destination (the memmove tests use a for both).  The 4096 + 64 bytes
   beyond MAX_SIZE are slack so that a copy starting at an offset just
   below MAX_SIZE still ends inside the buffer.  */
static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__ ((__aligned__ (4096)));
static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__ ((__aligned__ (4096)));
25 
/* Print the heading STR, run benchmark TESTFN once for every memcpy
   implementation listed below, then print a blank line.

   The RUN* macros are not defined in this file (presumably they come
   from benchlib.h or stringlib.h and skip variants the target cannot
   run -- confirm there).

   The body is wrapped in do { } while (0) so that DOTEST (...); is a
   single statement and stays correct under an unbraced if/else.  The
   heading is written with fputs so that it is never interpreted as a
   printf format string.  */
#define DOTEST(STR, TESTFN)                             \
  do                                                    \
    {                                                   \
      fputs ((STR), stdout);                            \
      RUN (TESTFN, memcpy);                             \
      RUNA64 (TESTFN, __memcpy_aarch64);                \
      RUNA64 (TESTFN, __memcpy_aarch64_simd);           \
      RUNSVE (TESTFN, __memcpy_aarch64_sve);            \
      RUNMOPS (TESTFN, __memcpy_aarch64_mops);          \
      RUNA32 (TESTFN, __memcpy_arm);                    \
      fputs ("\n", stdout);                             \
    }                                                   \
  while (0)
35 
/* One bucket of the copy-size histogram: a size in bytes and the number
   of table slots given to it.  */
typedef struct
{
  uint16_t size;
  uint16_t freq;
} freq_data_t;

/* One bucket of an alignment histogram: an alignment in bytes and the
   number of table slots given to it.  */
typedef struct
{
  uint8_t align;
  uint16_t freq;
} align_data_t;
38 
/* Number of slots in the expanded size table.  It is a power of two so
   that a random value can be reduced to an index with SIZE_MASK.  */
#define SIZE_NUM 65536
#define SIZE_MASK (SIZE_NUM-1)

/* Expanded copy-size table.  The elements are 16 bits wide because the
   size histogram holds sizes up to 4095; an 8-bit element would silently
   reduce every size above 255 modulo 256.  */
static uint16_t size_arr[SIZE_NUM];
42 
43 /* Frequency data for memcpy of less than 4096 bytes based on SPEC2017.  */
44 static freq_data_t size_freq[] =
45 {
46 {32,22320}, { 16,9554}, {  8,8915}, {152,5327}, {  4,2159}, {292,2035},
47 { 12,1608}, { 24,1343}, {1152,895}, {144, 813}, {884, 733}, {284, 721},
48 {120, 661}, {  2, 649}, {882, 550}, {  5, 475}, {  7, 461}, {108, 460},
49 { 10, 361}, {  9, 361}, {  6, 334}, {  3, 326}, {464, 308}, {2048,303},
50 {  1, 298}, { 64, 250}, { 11, 197}, {296, 194}, { 68, 187}, { 15, 185},
51 {192, 184}, {1764,183}, { 13, 173}, {560, 126}, {160, 115}, {288,  96},
52 {104,  96}, {1144, 83}, { 18,  80}, { 23,  78}, { 40,  77}, { 19,  68},
53 { 48,  63}, { 17,  57}, { 72,  54}, {1280, 51}, { 20,  49}, { 28,  47},
54 { 22,  46}, {640,  45}, { 25,  41}, { 14,  40}, { 56,  37}, { 27,  35},
55 { 35,  33}, {384,  33}, { 29,  32}, { 80,  30}, {4095, 22}, {232,  22},
56 { 36,  19}, {184,  17}, { 21,  17}, {256,  16}, { 44,  15}, { 26,  15},
57 { 31,  14}, { 88,  14}, {176,  13}, { 33,  12}, {1024, 12}, {208,  11},
58 { 62,  11}, {128,  10}, {704,  10}, {324,  10}, { 96,  10}, { 60,   9},
59 {136,   9}, {124,   9}, { 34,   8}, { 30,   8}, {480,   8}, {1344,  8},
60 {273,   7}, {520,   7}, {112,   6}, { 52,   6}, {344,   6}, {336,   6},
61 {504,   5}, {168,   5}, {424,   5}, {  0,   4}, { 76,   3}, {200,   3},
62 {512,   3}, {312,   3}, {240,   3}, {960,   3}, {264,   2}, {672,   2},
63 { 38,   2}, {328,   2}, { 84,   2}, { 39,   2}, {216,   2}, { 42,   2},
64 { 37,   2}, {1608,  2}, { 70,   2}, { 46,   2}, {536,   2}, {280,   1},
65 {248,   1}, { 47,   1}, {1088,  1}, {1288,  1}, {224,   1}, { 41,   1},
66 { 50,   1}, { 49,   1}, {808,   1}, {360,   1}, {440,   1}, { 43,   1},
67 { 45,   1}, { 78,   1}, {968,   1}, {392,   1}, { 54,   1}, { 53,   1},
68 { 59,   1}, {376,   1}, {664,   1}, { 58,   1}, {272,   1}, { 66,   1},
69 {2688,  1}, {472,   1}, {568,   1}, {720,   1}, { 51,   1}, { 63,   1},
70 { 86,   1}, {496,   1}, {776,   1}, { 57,   1}, {680,   1}, {792,   1},
71 {122,   1}, {760,   1}, {824,   1}, {552,   1}, { 67,   1}, {456,   1},
72 {984,   1}, { 74,   1}, {408,   1}, { 75,   1}, { 92,   1}, {576,   1},
73 {116,   1}, { 65,   1}, {117,   1}, { 82,   1}, {352,   1}, { 55,   1},
74 {100,   1}, { 90,   1}, {696,   1}, {111,   1}, {880,   1}, { 79,   1},
75 {488,   1}, { 61,   1}, {114,   1}, { 94,   1}, {1032,  1}, { 98,   1},
76 { 87,   1}, {584,   1}, { 85,   1}, {648,   1}, {0, 0}
77 };
78 
/* Number of slots in each expanded alignment table.  It is a power of
   two so that a random value can be reduced to an index with
   ALIGN_MASK.  */
#define ALIGN_NUM 1024
#define ALIGN_MASK (ALIGN_NUM - 1)

/* Expanded alignment tables for the source and destination pointers.
   Each slot holds an (alignment - 1) bit mask.  */
static uint8_t src_align_arr[ALIGN_NUM], dst_align_arr[ALIGN_NUM];
83 
84 /* Source alignment frequency for memcpy based on SPEC2017.  */
85 static align_data_t src_align_freq[] =
86 {
87   {8, 300}, {16, 292}, {32, 168}, {64, 153}, {4, 79}, {2, 14}, {1, 18}, {0, 0}
88 };
89 
90 static align_data_t dst_align_freq[] =
91 {
92   {8, 265}, {16, 263}, {64, 209}, {32, 174}, {4, 90}, {2, 10}, {1, 13}, {0, 0}
93 };
94 
/* Description of one benchmark copy, packed into a single 64-bit word.  */
typedef struct
{
  uint64_t src : 24;	/* Byte offset of the source within buffer a.  */
  uint64_t dst : 24;	/* Byte offset of the destination within buffer b.  */
  uint64_t len : 16;	/* Number of bytes to copy.  */
} copy_t;
101 
102 static copy_t test_arr[NUM_TESTS];
103 
104 typedef char *(*proto_t) (char *, const char *, size_t);
105 
106 static void
init_copy_distribution(void)107 init_copy_distribution (void)
108 {
109   int i, j, freq, size, n;
110 
111   for (n = i = 0; (freq = size_freq[i].freq) != 0; i++)
112     for (j = 0, size = size_freq[i].size; j < freq; j++)
113       size_arr[n++] = size;
114   assert (n == SIZE_NUM);
115 
116   for (n = i = 0; (freq = src_align_freq[i].freq) != 0; i++)
117     for (j = 0, size = src_align_freq[i].align; j < freq; j++)
118       src_align_arr[n++] = size - 1;
119   assert (n == ALIGN_NUM);
120 
121   for (n = i = 0; (freq = dst_align_freq[i].freq) != 0; i++)
122     for (j = 0, size = dst_align_freq[i].align; j < freq; j++)
123       dst_align_arr[n++] = size - 1;
124   assert (n == ALIGN_NUM);
125 }
126 
127 static size_t
init_copies(size_t max_size)128 init_copies (size_t max_size)
129 {
130   size_t total = 0;
131   /* Create a random set of copies with the given size and alignment
132      distributions.  */
133   for (int i = 0; i < NUM_TESTS; i++)
134     {
135       test_arr[i].dst = (rand32 (0) & (max_size - 1));
136       test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK];
137       test_arr[i].src = (rand32 (0) & (max_size - 1));
138       test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK];
139       test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK];
140       total += test_arr[i].len;
141     }
142 
143   return total;
144 }
145 
146 static void inline __attribute ((always_inline))
memcpy_random(const char * name,void * (* fn)(void *,const void *,size_t))147 memcpy_random (const char *name, void *(*fn)(void *, const void *, size_t))
148 {
149   printf ("%22s ", name);
150   uint64_t total = 0, tsum = 0;
151   for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2)
152     {
153       uint64_t copy_size = init_copies (size) * ITERS;
154 
155       for (int c = 0; c < NUM_TESTS; c++)
156 	fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
157 
158       uint64_t t = clock_get_ns ();
159       for (int i = 0; i < ITERS; i++)
160 	for (int c = 0; c < NUM_TESTS; c++)
161 	  fn (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len);
162       t = clock_get_ns () - t;
163       total += copy_size;
164       tsum += t;
165       printf ("%dK: %5.2f ", size / 1024, (double)copy_size / t);
166     }
167   printf( "avg %5.2f\n", (double)total / tsum);
168 }
169 
170 static void inline __attribute ((always_inline))
memcpy_medium_aligned(const char * name,void * (* fn)(void *,const void *,size_t))171 memcpy_medium_aligned (const char *name, void *(*fn)(void *, const void *, size_t))
172 {
173   printf ("%22s ", name);
174 
175   for (int size = 8; size <= 512; size *= 2)
176     {
177       uint64_t t = clock_get_ns ();
178       for (int i = 0; i < ITERS2; i++)
179 	fn (b, a, size);
180       t = clock_get_ns () - t;
181       printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
182     }
183   printf ("\n");
184 }
185 
186 static void inline __attribute ((always_inline))
memcpy_medium_unaligned(const char * name,void * (* fn)(void *,const void *,size_t))187 memcpy_medium_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
188 {
189   printf ("%22s ", name);
190 
191   for (int size = 8; size <= 512; size *= 2)
192     {
193       uint64_t t = clock_get_ns ();
194       for (int i = 0; i < ITERS2; i++)
195 	fn (b + 3, a + 1, size);
196       t = clock_get_ns () - t;
197       printf ("%dB: %5.2f ", size, (double)size * ITERS2 / t);
198     }
199   printf ("\n");
200 }
201 
202 static void inline __attribute ((always_inline))
memcpy_large(const char * name,void * (* fn)(void *,const void *,size_t))203 memcpy_large (const char *name, void *(*fn)(void *, const void *, size_t))
204 {
205   printf ("%22s ", name);
206 
207   for (int size = 1024; size <= 65536; size *= 2)
208     {
209       uint64_t t = clock_get_ns ();
210       for (int i = 0; i < ITERS3; i++)
211 	fn (b, a, size);
212       t = clock_get_ns () - t;
213       printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
214     }
215   printf ("\n");
216 }
217 
218 static void inline __attribute ((always_inline))
memmove_forward_unaligned(const char * name,void * (* fn)(void *,const void *,size_t))219 memmove_forward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
220 {
221   printf ("%22s ", name);
222 
223   for (int size = 1024; size <= 65536; size *= 2)
224     {
225       uint64_t t = clock_get_ns ();
226       for (int i = 0; i < ITERS3; i++)
227         fn (a, a + 256 + (i & 31), size);
228       t = clock_get_ns () - t;
229       printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
230     }
231 
232   printf ("\n");
233 }
234 
235 static void inline __attribute ((always_inline))
memmove_backward_unaligned(const char * name,void * (* fn)(void *,const void *,size_t))236 memmove_backward_unaligned (const char *name, void *(*fn)(void *, const void *, size_t))
237 {
238   printf ("%22s ", name);
239 
240   for (int size = 1024; size <= 65536; size *= 2)
241     {
242       uint64_t t = clock_get_ns ();
243       for (int i = 0; i < ITERS3; i++)
244 	fn (a + 256 + (i & 31), a, size);
245       t = clock_get_ns () - t;
246       printf ("%dK: %5.2f ", size / 1024, (double)size * ITERS3 / t);
247     }
248 
249   printf ("\n");
250 }
251 
main(void)252 int main (void)
253 {
254   init_copy_distribution ();
255 
256   memset (a, 1, sizeof (a));
257   memset (b, 2, sizeof (b));
258 
259   DOTEST ("Random memcpy (bytes/ns):\n", memcpy_random);
260   DOTEST ("Medium memcpy aligned (bytes/ns):\n", memcpy_medium_aligned);
261   DOTEST ("Medium memcpy unaligned (bytes/ns):\n", memcpy_medium_unaligned);
262   DOTEST ("Large memcpy (bytes/ns):\n", memcpy_large);
263   DOTEST ("Forwards memmove unaligned (bytes/ns):\n", memmove_forward_unaligned);
264   DOTEST ("Backwards memmove unaligned (bytes/ns):\n", memmove_backward_unaligned);
265 
266   return 0;
267 }
268