// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
 */

#include <crypto/curve25519.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/string.h>

#include <asm/cpufeature.h>
#include <asm/processor.h>

static __always_inline u64 eq_mask(u64 a, u64 b)
{
	u64 x = a ^ b;
	u64 minus_x = ~x + (u64)1U;
	u64 x_or_minus_x = x | minus_x;
	u64 xnx = x_or_minus_x >> (u32)63U;
	return xnx - (u64)1U;
}

static __always_inline u64 gte_mask(u64 a, u64 b)
{
	u64 x = a;
	u64 y = b;
	u64 x_xor_y = x ^ y;
	u64 x_sub_y = x - y;
	u64 x_sub_y_xor_y = x_sub_y ^ y;
	u64 q = x_xor_y | x_sub_y_xor_y;
	u64 x_xor_q = x ^ q;
	u64 x_xor_q_ = x_xor_q >> (u32)63U;
	return x_xor_q_ - (u64)1U;
}
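
/*
 * Both helpers above produce branchless masks: eq_mask(a, b) is all-ones
 * iff a == b, and gte_mask(a, b) is all-ones iff a >= b. Worked example
 * for eq_mask: if a == b then x and -x are both 0, so (x | -x) >> 63 is
 * 0 and the result is 0 - 1 = 0xffffffffffffffff; if a != b then one of
 * x and -x has its top bit set, the shift yields 1, and the result is 0.
 * A typical constant-time select built from such a mask (illustrative
 * only, not used directly in this file) looks like:
 *
 *	u64 mask = eq_mask(a, b);
 *	r = (x & mask) | (y & ~mask);	// r = (a == b) ? x : y
 */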

/* Computes the addition of the four-element bignum f1 with the value f2
 * and returns the carry (if any) */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
	u64 carry_r;

	asm volatile(
		/* Clear registers to propagate the carry bit */
		" xor %%r8d, %%r8d;"
		" xor %%r9d, %%r9d;"
		" xor %%r10d, %%r10d;"
		" xor %%r11d, %%r11d;"
		" xor %k1, %k1;"

		/* Begin addition chain */
		" addq 0(%3), %0;"
		" movq %0, 0(%2);"
		" adcxq 8(%3), %%r8;"
		" movq %%r8, 8(%2);"
		" adcxq 16(%3), %%r9;"
		" movq %%r9, 16(%2);"
		" adcxq 24(%3), %%r10;"
		" movq %%r10, 24(%2);"

		/* Return the carry bit in a register */
		" adcx %%r11, %1;"
		: "+&r"(f2), "=&r"(carry_r)
		: "r"(out), "r"(f1)
		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");

	return carry_r;
}

/* Computes the field addition of two field elements */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw addition of f1 + f2 */
		" movq 0(%0), %%r8;"
		" addq 0(%2), %%r8;"
		" movq 8(%0), %%r9;"
		" adcxq 8(%2), %%r9;"
		" movq 16(%0), %%r10;"
		" adcxq 16(%2), %%r10;"
		" movq 24(%0), %%r11;"
		" adcxq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		" mov $0, %%rax;"
		" mov $38, %0;"
		" cmovc %0, %%rax;"

		/* Step 2: Add carry*38 to the original sum */
		" xor %%ecx, %%ecx;"
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %0, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"
		: "+&r"(f2)
		: "r"(out), "r"(f1)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
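
/*
 * The wrap-around in fadd (and in fsub/fmul/fsqr below) relies on
 * 2^256 = 2 * (2^255 - 19) + 38, i.e. 2^256 == 38 (mod p), so a carry
 * out of the 256-bit result can be folded back in as carry * 38 on the
 * low limb:
 *
 *	a + b = c + carry * 2^256 == c + carry * 38 (mod p)
 *
 * The cmovc in step 3 catches the rare carry produced by the fold
 * itself. Results stay below 2^256 but are not fully reduced; the
 * canonical representative is only computed at the end, in
 * store_felem().
 */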

/* Computes the field subtraction of two field elements */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw subtraction of f1-f2 */
		" movq 0(%1), %%r8;"
		" subq 0(%2), %%r8;"
		" movq 8(%1), %%r9;"
		" sbbq 8(%2), %%r9;"
		" movq 16(%1), %%r10;"
		" sbbq 16(%2), %%r10;"
		" movq 24(%1), %%r11;"
		" sbbq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		" mov $0, %%rax;"
		" mov $38, %%rcx;"
		" cmovc %%rcx, %%rax;"

		/* Step 2: Subtract carry*38 from the original difference */
		" sub %%rax, %%r8;"
		" sbb $0, %%r9;"
		" sbb $0, %%r10;"
		" sbb $0, %%r11;"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rcx, %%rax;"
		" sub %%rax, %%r8;"

		/* Store the result */
		" movq %%r8, 0(%0);"
		" movq %%r9, 8(%0);"
		" movq %%r10, 16(%0);"
		" movq %%r11, 24(%0);"
		:
		: "r"(out), "r"(f1), "r"(f2)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

/* Computes a field multiplication: out <- f1 * f2
 * Uses the 8-element buffer tmp for intermediate results */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication: tmp <- src1 * src2 */

		/* Compute src1[0] * src2 */
		" movq 0(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" movq %%r8, 0(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" movq %%r10, 8(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		" movq 8(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 8(%2), %%r8;"
		" movq %%r8, 8(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 16(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		" movq 16(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 16(%2), %%r8;"
		" movq %%r8, 16(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 24(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		" movq 24(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 24(%2), %%r8;"
		" movq %%r8, 24(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 32(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" movq %%rbx, 40(%2);"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" movq %%r14, 48(%2);"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"
		" movq %%rax, 56(%2);"

		/* Line up pointers */
		" mov %2, %0;"
		" mov %3, %2;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;"
		" xor %k1, %k1;"
		" adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 24(%0), %%r11;"
		" adcx %1, %%rax;"
		" adox %1, %%rax;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %1, %%r9;"
		" movq %%r9, 8(%2);"
		" adcx %1, %%r10;"
		" movq %%r10, 16(%2);"
		" adcx %1, %%r11;"
		" movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}
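
/*
 * fmul is a 4x4-limb schoolbook multiplication that keeps two
 * independent carry chains in flight: adcx only reads and writes CF,
 * and adox only reads and writes OF, so each row can accumulate into
 * the running sum on one chain while the high halves of the mulx
 * results propagate on the other. The "xor %r10d, %r10d" opening each
 * row clears both CF and OF at once, which is what makes the
 * interleaving sound. This pairing is why the code requires both BMI2
 * (mulx) and ADX (adcx/adox), as probed in curve25519_mod_init_arch()
 * at the bottom of this file.
 */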

/* Computes two field multiplications:
 * out[0] <- f1[0] * f2[0]
 * out[1] <- f1[1] * f2[1]
 * Uses the 16-element buffer tmp for intermediate results. */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

		/* Compute src1[0] * src2 */
		" movq 0(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" movq %%r8, 0(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" movq %%r10, 8(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		" movq 8(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 8(%2), %%r8;"
		" movq %%r8, 8(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 16(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		" movq 16(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 16(%2), %%r8;"
		" movq %%r8, 16(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 24(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		" movq 24(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 24(%2), %%r8;"
		" movq %%r8, 24(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 32(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" movq %%rbx, 40(%2);"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" movq %%r14, 48(%2);"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"
		" movq %%rax, 56(%2);"

		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

		/* Compute src1[0] * src2 */
		" movq 32(%0), %%rdx;"
		" mulxq 32(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" movq %%r8, 64(%2);"
		" mulxq 40(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" movq %%r10, 72(%2);"
		" mulxq 48(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" mulxq 56(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		" movq 40(%0), %%rdx;"
		" mulxq 32(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 72(%2), %%r8;"
		" movq %%r8, 72(%2);"
		" mulxq 40(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 80(%2);"
		" mulxq 48(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 56(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		" movq 48(%0), %%rdx;"
		" mulxq 32(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 80(%2), %%r8;"
		" movq %%r8, 80(%2);"
		" mulxq 40(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 88(%2);"
		" mulxq 48(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 56(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		" movq 56(%0), %%rdx;"
		" mulxq 32(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 88(%2), %%r8;"
		" movq %%r8, 88(%2);"
		" mulxq 40(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 96(%2);"
		" mulxq 48(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" movq %%rbx, 104(%2);"
		" mov $0, %%r8;"
		" mulxq 56(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" movq %%r14, 112(%2);"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"
		" movq %%rax, 120(%2);"

		/* Line up pointers */
		" mov %2, %0;"
		" mov %3, %2;"

		/* Wrap the results back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;"
		" xor %k1, %k1;"
		" adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 24(%0), %%r11;"
		" adcx %1, %%rax;"
		" adox %1, %%rax;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %1, %%r9;"
		" movq %%r9, 8(%2);"
		" adcx %1, %%r10;"
		" movq %%r10, 16(%2);"
		" adcx %1, %%r11;"
		" movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%2);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		" mov $38, %%rdx;"
		" mulxq 96(%0), %%r8, %%r13;"
		" xor %k1, %k1;"
		" adoxq 64(%0), %%r8;"
		" mulxq 104(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 72(%0), %%r9;"
		" mulxq 112(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 80(%0), %%r10;"
		" mulxq 120(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 88(%0), %%r11;"
		" adcx %1, %%rax;"
		" adox %1, %%rax;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %1, %%r9;"
		" movq %%r9, 40(%2);"
		" adcx %1, %%r10;"
		" movq %%r10, 48(%2);"
		" adcx %1, %%r11;"
		" movq %%r11, 56(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 32(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}

/* Computes the field multiplication of the four-element bignum f1 by the
 * scalar f2. Requires f2 to be smaller than 2^17. */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
	register u64 f2_r asm("rdx") = f2;

	asm volatile(
		/* Compute the raw multiplication of f1*f2 */
		" mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
		" mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
		" add %%rcx, %%r9;"
		" mov $0, %%rcx;"
		" mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
		" adcx %%rbx, %%r10;"
		" mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
		" adcx %%r13, %%r11;"
		" adcx %%rcx, %%rax;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		" mov $38, %%rdx;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"
		: "+&r"(f2_r)
		: "r"(out), "r"(f1)
		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "memory", "cc");
}
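
/*
 * The f2 < 2^17 bound keeps the reduction sound: the carry limb in rax
 * is the high word of a 4-limb-by-scalar product, so rax < f2 < 2^17
 * and carry * 38 < 2^23, which the single imul and the following
 * addition chain absorb without overflow. The only value ever passed
 * here is 121665 = (486662 - 2) / 4, the curve constant a24, which is
 * comfortably below 2^17.
 */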

/* Conditionally swaps the 8-element arrays p1 and p2 in constant time:
 * (p1, p2) <- bit ? (p2, p1) : (p1, p2) */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
	asm volatile(
		/* Transfer bit into CF flag */
		" add $18446744073709551615, %0;"

		/* cswap p1[0], p2[0] */
		" movq 0(%1), %%r8;"
		" movq 0(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 0(%1);"
		" movq %%r9, 0(%2);"

		/* cswap p1[1], p2[1] */
		" movq 8(%1), %%r8;"
		" movq 8(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 8(%1);"
		" movq %%r9, 8(%2);"

		/* cswap p1[2], p2[2] */
		" movq 16(%1), %%r8;"
		" movq 16(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 16(%1);"
		" movq %%r9, 16(%2);"

		/* cswap p1[3], p2[3] */
		" movq 24(%1), %%r8;"
		" movq 24(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 24(%1);"
		" movq %%r9, 24(%2);"

		/* cswap p1[4], p2[4] */
		" movq 32(%1), %%r8;"
		" movq 32(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 32(%1);"
		" movq %%r9, 32(%2);"

		/* cswap p1[5], p2[5] */
		" movq 40(%1), %%r8;"
		" movq 40(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 40(%1);"
		" movq %%r9, 40(%2);"

		/* cswap p1[6], p2[6] */
		" movq 48(%1), %%r8;"
		" movq 48(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 48(%1);"
		" movq %%r9, 48(%2);"

		/* cswap p1[7], p2[7] */
		" movq 56(%1), %%r8;"
		" movq 56(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 56(%1);"
		" movq %%r9, 56(%2);"
		: "+&r"(bit)
		: "r"(p1), "r"(p2)
		: "%r8", "%r9", "%r10", "memory", "cc");
}
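
/*
 * The leading "add $18446744073709551615" computes bit + (2^64 - 1),
 * which sets CF exactly when bit is non-zero; every cmovc pair that
 * follows then exchanges one pair of limbs with no data-dependent
 * branch or memory access pattern. Illustrative usage:
 *
 *	cswap2(1, a, b);	// a and b (8 limbs each) are exchanged
 *	cswap2(0, a, b);	// a and b are left untouched
 *
 * Note that although p1 and p2 are const-qualified, both arrays are
 * written through the asm's memory clobber.
 */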

/* Computes the square of a field element: out <- f * f
 * Uses the 8-element buffer tmp for intermediate results */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- f * f */

		/* Step 1: Compute all partial products */
		" movq 0(%0), %%rdx;"                                  /* f[0] */
		" mulxq 8(%0), %%r8, %%r14;"    " xor %%r15d, %%r15d;" /* f[1]*f[0] */
		" mulxq 16(%0), %%r9, %%r10;"   " adcx %%r14, %%r9;"   /* f[2]*f[0] */
		" mulxq 24(%0), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  /* f[3]*f[0] */
		" movq 24(%0), %%rdx;"                                 /* f[3] */
		" mulxq 8(%0), %%r11, %%rbx;"   " adcx %%rcx, %%r11;"  /* f[1]*f[3] */
		" mulxq 16(%0), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  /* f[2]*f[3] */
		" movq 8(%0), %%rdx;"           " adcx %%r15, %%r13;"  /* f[1] */
		" mulxq 16(%0), %%rax, %%rcx;"  " mov $0, %%r14;"      /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		" xor %%r15d, %%r15d;"
		" adox %%rax, %%r10;"
		" adcx %%r8, %%r8;"
		" adox %%rcx, %%r11;"
		" adcx %%r9, %%r9;"
		" adox %%r15, %%rbx;"
		" adcx %%r10, %%r10;"
		" adox %%r15, %%r13;"
		" adcx %%r11, %%r11;"
		" adox %%r15, %%r14;"
		" adcx %%rbx, %%rbx;"
		" adcx %%r13, %%r13;"
		" adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		" movq 0(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		" movq %%rax, 0(%1);"
		" add %%rcx, %%r8;"
		" movq %%r8, 8(%1);"
		" movq 8(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		" adcx %%rax, %%r9;"
		" movq %%r9, 16(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 24(%1);"
		" movq 16(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		" adcx %%rax, %%r11;"
		" movq %%r11, 32(%1);"
		" adcx %%rcx, %%rbx;"
		" movq %%rbx, 40(%1);"
		" movq 24(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		" adcx %%rax, %%r13;"
		" movq %%r13, 48(%1);"
		" adcx %%rcx, %%r14;"
		" movq %%r14, 56(%1);"

		/* Line up pointers */
		" mov %1, %0;"
		" mov %2, %1;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;"
		" xor %%ecx, %%ecx;"
		" adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 24(%0), %%r11;"
		" adcx %%rcx, %%rax;"
		" adox %%rcx, %%rax;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}
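
/*
 * fsqr exploits the symmetry of squaring: with f = f0 + f1*2^64 +
 * f2*2^128 + f3*2^192,
 *
 *	f^2 = sum_i (fi^2 * 2^(128*i))
 *	    + 2 * sum_{i<j} (fi * fj * 2^(64*(i+j)))
 *
 * Step 1 computes each of the six cross products once, step 2 doubles
 * them (the self-additions like "adcx %r8, %r8" shift the whole chain
 * left by one bit), and step 3 adds the four diagonal squares. That is
 * 10 mulx instructions instead of the 16 a generic
 * fmul(out, f, f, tmp) would issue.
 */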

/* Computes two field squarings:
 * out[0] <- f[0] * f[0]
 * out[1] <- f[1] * f[1]
 * Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Step 1: Compute all partial products */
		" movq 0(%0), %%rdx;"                                  /* f[0] */
		" mulxq 8(%0), %%r8, %%r14;"    " xor %%r15d, %%r15d;" /* f[1]*f[0] */
		" mulxq 16(%0), %%r9, %%r10;"   " adcx %%r14, %%r9;"   /* f[2]*f[0] */
		" mulxq 24(%0), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  /* f[3]*f[0] */
		" movq 24(%0), %%rdx;"                                 /* f[3] */
		" mulxq 8(%0), %%r11, %%rbx;"   " adcx %%rcx, %%r11;"  /* f[1]*f[3] */
		" mulxq 16(%0), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  /* f[2]*f[3] */
		" movq 8(%0), %%rdx;"           " adcx %%r15, %%r13;"  /* f[1] */
		" mulxq 16(%0), %%rax, %%rcx;"  " mov $0, %%r14;"      /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		" xor %%r15d, %%r15d;"
		" adox %%rax, %%r10;"
		" adcx %%r8, %%r8;"
		" adox %%rcx, %%r11;"
		" adcx %%r9, %%r9;"
		" adox %%r15, %%rbx;"
		" adcx %%r10, %%r10;"
		" adox %%r15, %%r13;"
		" adcx %%r11, %%r11;"
		" adox %%r15, %%r14;"
		" adcx %%rbx, %%rbx;"
		" adcx %%r13, %%r13;"
		" adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		" movq 0(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		" movq %%rax, 0(%1);"
		" add %%rcx, %%r8;"
		" movq %%r8, 8(%1);"
		" movq 8(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		" adcx %%rax, %%r9;"
		" movq %%r9, 16(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 24(%1);"
		" movq 16(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		" adcx %%rax, %%r11;"
		" movq %%r11, 32(%1);"
		" adcx %%rcx, %%rbx;"
		" movq %%rbx, 40(%1);"
		" movq 24(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		" adcx %%rax, %%r13;"
		" movq %%r13, 48(%1);"
		" adcx %%rcx, %%r14;"
		" movq %%r14, 56(%1);"

		/* Step 1: Compute all partial products */
		" movq 32(%0), %%rdx;"                                 /* f[0] */
		" mulxq 40(%0), %%r8, %%r14;"   " xor %%r15d, %%r15d;" /* f[1]*f[0] */
		" mulxq 48(%0), %%r9, %%r10;"   " adcx %%r14, %%r9;"   /* f[2]*f[0] */
		" mulxq 56(%0), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  /* f[3]*f[0] */
		" movq 56(%0), %%rdx;"                                 /* f[3] */
		" mulxq 40(%0), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  /* f[1]*f[3] */
		" mulxq 48(%0), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  /* f[2]*f[3] */
		" movq 40(%0), %%rdx;"          " adcx %%r15, %%r13;"  /* f[1] */
		" mulxq 48(%0), %%rax, %%rcx;"  " mov $0, %%r14;"      /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		" xor %%r15d, %%r15d;"
		" adox %%rax, %%r10;"
		" adcx %%r8, %%r8;"
		" adox %%rcx, %%r11;"
		" adcx %%r9, %%r9;"
		" adox %%r15, %%rbx;"
		" adcx %%r10, %%r10;"
		" adox %%r15, %%r13;"
		" adcx %%r11, %%r11;"
		" adox %%r15, %%r14;"
		" adcx %%rbx, %%rbx;"
		" adcx %%r13, %%r13;"
		" adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		" movq 32(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		" movq %%rax, 64(%1);"
		" add %%rcx, %%r8;"
		" movq %%r8, 72(%1);"
		" movq 40(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		" adcx %%rax, %%r9;"
		" movq %%r9, 80(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 88(%1);"
		" movq 48(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		" adcx %%rax, %%r11;"
		" movq %%r11, 96(%1);"
		" adcx %%rcx, %%rbx;"
		" movq %%rbx, 104(%1);"
		" movq 56(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		" adcx %%rax, %%r13;"
		" movq %%r13, 112(%1);"
		" adcx %%rcx, %%r14;"
		" movq %%r14, 120(%1);"

		/* Line up pointers */
		" mov %1, %0;"
		" mov %2, %1;"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;"
		" xor %%ecx, %%ecx;"
		" adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 24(%0), %%r11;"
		" adcx %%rcx, %%rax;"
		" adox %%rcx, %%rax;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		" mov $38, %%rdx;"
		" mulxq 96(%0), %%r8, %%r13;"
		" xor %%ecx, %%ecx;"
		" adoxq 64(%0), %%r8;"
		" mulxq 104(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 72(%0), %%r9;"
		" mulxq 112(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 80(%0), %%r10;"
		" mulxq 120(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 88(%0), %%r11;"
		" adcx %%rcx, %%rax;"
		" adox %%rcx, %%rax;"
		" imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 40(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 48(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 56(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 32(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}

static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
	u64 *nq = p01_tmp1;
	u64 *nq_p1 = p01_tmp1 + (u32)8U;
	u64 *tmp1 = p01_tmp1 + (u32)16U;
	u64 *x1 = q;
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *z3 = nq_p1 + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *ab = tmp1;
	u64 *dc = tmp1 + (u32)8U;
	u64 *x3;
	u64 *z31;
	u64 *d0;
	u64 *c0;
	u64 *a1;
	u64 *b1;
	u64 *d;
	u64 *c;
	u64 *ab1;
	u64 *dc1;
	fadd(a, x2, z2);
	fsub(b, x2, z2);
	x3 = nq_p1;
	z31 = nq_p1 + (u32)4U;
	d0 = dc;
	c0 = dc + (u32)4U;
	fadd(c0, x3, z31);
	fsub(d0, x3, z31);
	fmul2(dc, dc, ab, tmp2);
	fadd(x3, d0, c0);
	fsub(z31, d0, c0);
	a1 = tmp1;
	b1 = tmp1 + (u32)4U;
	d = tmp1 + (u32)8U;
	c = tmp1 + (u32)12U;
	ab1 = tmp1;
	dc1 = tmp1 + (u32)8U;
	fsqr2(dc1, ab1, tmp2);
	fsqr2(nq_p1, nq_p1, tmp2);
	a1[0U] = c[0U];
	a1[1U] = c[1U];
	a1[2U] = c[2U];
	a1[3U] = c[3U];
	fsub(c, d, c);
	fmul_scalar(b1, c, (u64)121665U);
	fadd(b1, b1, d);
	fmul2(nq, dc1, ab1, tmp2);
	fmul(z3, z3, x1, tmp2);
}
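
/*
 * point_add_and_double performs one combined step of the x-only
 * Montgomery ladder: given the base-point x-coordinate x1 and the
 * running pair (x2:z2) = [n]P, (x3:z3) = [n+1]P, it produces [2n]P and
 * [2n+1]P via the standard differential formulas. With A = x2 + z2,
 * B = x2 - z2, C = x3 + z3, D = x3 - z3, AA = A^2, BB = B^2,
 * E = AA - BB and a24 = 121665:
 *
 *	x2' = AA * BB
 *	z2' = E * (AA + a24 * E)
 *	x3' = (D*A + C*B)^2
 *	z3' = x1 * (D*A - C*B)^2
 *
 * The fmul2()/fsqr2() calls batch the two independent multiplications
 * or squarings of each stage so both ADX carry chains stay busy.
 */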

static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
{
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *d = tmp1 + (u32)8U;
	u64 *c = tmp1 + (u32)12U;
	u64 *ab = tmp1;
	u64 *dc = tmp1 + (u32)8U;
	fadd(a, x2, z2);
	fsub(b, x2, z2);
	fsqr2(dc, ab, tmp2);
	a[0U] = c[0U];
	a[1U] = c[1U];
	a[2U] = c[2U];
	a[3U] = c[3U];
	fsub(c, d, c);
	fmul_scalar(b, c, (u64)121665U);
	fadd(b, b, d);
	fmul2(nq, dc, ab, tmp2);
}

static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
	u64 tmp2[16U] = { 0U };
	u64 p01_tmp1_swap[33U] = { 0U };
	u64 *p0 = p01_tmp1_swap;
	u64 *p01 = p01_tmp1_swap;
	u64 *p03 = p01;
	u64 *p11 = p01 + (u32)8U;
	u64 *x0;
	u64 *z0;
	u64 *p01_tmp1;
	u64 *p01_tmp11;
	u64 *nq10;
	u64 *nq_p11;
	u64 *swap1;
	u64 sw0;
	u64 *nq1;
	u64 *tmp1;
	memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
	x0 = p03;
	z0 = p03 + (u32)4U;
	x0[0U] = (u64)1U;
	x0[1U] = (u64)0U;
	x0[2U] = (u64)0U;
	x0[3U] = (u64)0U;
	z0[0U] = (u64)0U;
	z0[1U] = (u64)0U;
	z0[2U] = (u64)0U;
	z0[3U] = (u64)0U;
	p01_tmp1 = p01_tmp1_swap;
	p01_tmp11 = p01_tmp1_swap;
	nq10 = p01_tmp1_swap;
	nq_p11 = p01_tmp1_swap + (u32)8U;
	swap1 = p01_tmp1_swap + (u32)32U;
	cswap2((u64)1U, nq10, nq_p11);
	point_add_and_double(init1, p01_tmp11, tmp2);
	swap1[0U] = (u64)1U;
	{
		u32 i;
		for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
			u64 *p01_tmp12 = p01_tmp1_swap;
			u64 *swap2 = p01_tmp1_swap + (u32)32U;
			u64 *nq2 = p01_tmp12;
			u64 *nq_p12 = p01_tmp12 + (u32)8U;
			u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
			u64 sw = swap2[0U] ^ bit;
			cswap2(sw, nq2, nq_p12);
			point_add_and_double(init1, p01_tmp12, tmp2);
			swap2[0U] = bit;
		}
	}
	sw0 = swap1[0U];
	cswap2(sw0, nq10, nq_p11);
	nq1 = p01_tmp1;
	tmp1 = p01_tmp1 + (u32)16U;
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));

	memzero_explicit(tmp2, sizeof(tmp2));
	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}
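
/*
 * The ladder above consumes scalar bits 254 down to 3: the initial
 * unconditional cswap2()/point_add_and_double() pair corresponds to
 * bit 254 being treated as 1, the 251-iteration loop covers bits
 * 253..3, and the three point_double() calls at the end stand in for
 * bits 2..0 being treated as 0. Together with never reading bit 255,
 * this implicitly applies the standard X25519 clamping to whatever key
 * bytes are passed in. The swap flag lives in p01_tmp1_swap[32] so the
 * entire ladder state can be wiped with a single memzero_explicit().
 */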

static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
	u32 i;
	fsqr(o, inp, tmp);
	for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
		fsqr(o, o, tmp);
}

static void finv(u64 *o, const u64 *i, u64 *tmp)
{
	u64 t1[16U] = { 0U };
	u64 *a0 = t1;
	u64 *b = t1 + (u32)4U;
	u64 *c = t1 + (u32)8U;
	u64 *t00 = t1 + (u32)12U;
	u64 *tmp1 = tmp;
	u64 *a;
	u64 *t0;
	fsquare_times(a0, i, tmp1, (u32)1U);
	fsquare_times(t00, a0, tmp1, (u32)2U);
	fmul(b, t00, i, tmp);
	fmul(a0, b, a0, tmp);
	fsquare_times(t00, a0, tmp1, (u32)1U);
	fmul(b, t00, b, tmp);
	fsquare_times(t00, b, tmp1, (u32)5U);
	fmul(b, t00, b, tmp);
	fsquare_times(t00, b, tmp1, (u32)10U);
	fmul(c, t00, b, tmp);
	fsquare_times(t00, c, tmp1, (u32)20U);
	fmul(t00, t00, c, tmp);
	fsquare_times(t00, t00, tmp1, (u32)10U);
	fmul(b, t00, b, tmp);
	fsquare_times(t00, b, tmp1, (u32)50U);
	fmul(c, t00, b, tmp);
	fsquare_times(t00, c, tmp1, (u32)100U);
	fmul(t00, t00, c, tmp);
	fsquare_times(t00, t00, tmp1, (u32)50U);
	fmul(t00, t00, b, tmp);
	fsquare_times(t00, t00, tmp1, (u32)5U);
	a = t1;
	t0 = t1 + (u32)12U;
	fmul(o, t0, a, tmp);
}
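
/*
 * finv computes o = i^(p - 2) = i^(2^255 - 21) (mod p), which by
 * Fermat's little theorem is the multiplicative inverse of i. The
 * exponent is built with the classic Curve25519 addition chain: after
 * the first few steps b = i^(2^5 - 1), then repeated square-and-
 * multiply blocks of 5, 10, 20, 50 and 100 squarings assemble the full
 * exponent, for a total of 254 squarings and 11 multiplications. Being
 * a fixed chain, it runs in time independent of the value inverted.
 */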

static void store_felem(u64 *b, u64 *f)
{
	u64 f30 = f[3U];
	u64 top_bit0 = f30 >> (u32)63U;
	u64 f31;
	u64 top_bit;
	u64 f0;
	u64 f1;
	u64 f2;
	u64 f3;
	u64 m0;
	u64 m1;
	u64 m2;
	u64 m3;
	u64 mask;
	u64 f0_;
	u64 f1_;
	u64 f2_;
	u64 f3_;
	u64 o0;
	u64 o1;
	u64 o2;
	u64 o3;
	f[3U] = f30 & (u64)0x7fffffffffffffffU;
	add_scalar(f, f, (u64)19U * top_bit0);
	f31 = f[3U];
	top_bit = f31 >> (u32)63U;
	f[3U] = f31 & (u64)0x7fffffffffffffffU;
	add_scalar(f, f, (u64)19U * top_bit);
	f0 = f[0U];
	f1 = f[1U];
	f2 = f[2U];
	f3 = f[3U];
	m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
	m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
	m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
	m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
	mask = ((m0 & m1) & m2) & m3;
	f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
	f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
	f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
	f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
	o0 = f0_;
	o1 = f1_;
	o2 = f2_;
	o3 = f3_;
	b[0U] = o0;
	b[1U] = o1;
	b[2U] = o2;
	b[3U] = o3;
}
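
/*
 * store_felem converts the loose (sub-2^256) representation into the
 * unique canonical value in [0, p). The two top-bit folds use
 * 2^255 == 19 (mod p) to bring the value below 2^255, and the mask
 * assembled from gte_mask/eq_mask then subtracts p exactly when
 * f >= p = 2^255 - 19, all without branching. For example, if f == p,
 * then m0..m3 are all-ones and f - p == 0 is stored.
 */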

static void encode_point(u8 *o, const u64 *i)
{
	const u64 *x = i;
	const u64 *z = i + (u32)4U;
	u64 tmp[4U] = { 0U };
	u64 tmp_w[16U] = { 0U };
	finv(tmp, z, tmp_w);
	fmul(tmp, tmp, x, tmp_w);
	store_felem((u64 *)o, tmp);
}

static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
{
	u64 init1[8U] = { 0U };
	u64 tmp[4U] = { 0U };
	u64 tmp3;
	u64 *x;
	u64 *z;
	{
		u32 i;
		for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
			u64 *os = tmp;
			const u8 *bj = pub + i * (u32)8U;
			u64 u = *(u64 *)bj;
			u64 r = u;
			u64 x0 = r;
			os[i] = x0;
		}
	}
	tmp3 = tmp[3U];
	tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
	x = init1;
	z = init1 + (u32)4U;
	z[0U] = (u64)1U;
	z[1U] = (u64)0U;
	z[2U] = (u64)0U;
	z[3U] = (u64)0U;
	x[0U] = tmp[0U];
	x[1U] = tmp[1U];
	x[2U] = tmp[2U];
	x[3U] = tmp[3U];
	montgomery_ladder(init1, priv, init1);
	encode_point(out, init1);
}

/* The below constants were generated using this sage script:
 *
 * #!/usr/bin/env sage
 * import sys
 * from sage.all import *
 * def limbs(n):
 *	n = int(n)
 *	l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
 *	return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
 * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
 * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
 * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
 * print("static const u64 table_ladder[] = {")
 * p = ec.lift_x(9)
 * for i in range(252):
 *	l = (p[0] + p[2]) / (p[0] - p[2])
 *	print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
 *	p = p * 2
 * print("};")
 *
 */

static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };

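/*
 * table_ladder[4*i .. 4*i+3] holds the limbs of (x + 1) / (x - 1),
 * where x is the x-coordinate of 2^i * G for the base point
 * G = lift_x(9), and p_minus_s holds the x-coordinate of
 * lift_x(9) - lift_x(1), both exactly as emitted by the sage script
 * above. curve25519_ever64_base() below seeds its second ladder
 * register with p_minus_s and then multiplies by these precomputed
 * ratios instead of performing full differential additions against a
 * variable point.
 */
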
static const u64 table_ladder[] = {
	0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
	0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
	0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
	0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
	0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
	0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
	0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
	0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
	0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
	0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
	0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
	0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
	0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
	0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
	0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
	0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
	0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
	0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
	0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
	0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
	0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
	0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
	0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
	0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
	0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
	0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
	0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
	0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
	0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
	0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
	0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
	0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
	0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
	0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
	0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
	0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
	0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
	0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
	0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
	0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
	0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
	0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
	0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
	0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
	0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
	0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
	0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
	0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
	0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
	0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
	0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
	0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
	0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
	0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
	0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
	0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
	0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
	0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
	0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
	0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
	0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
	0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
	0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
	0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
	0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
	0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
	0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
	0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
	0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
	0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
	0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
	0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
	0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
	0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
	0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
	0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
	0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
	0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
	0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
	0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
	0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
	0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
	0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
	0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
	0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
	0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
	0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
	0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
	0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
	0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
	0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
	0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
	0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
	0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
	0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
	0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
	0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
	0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
	0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
	0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
	0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
	0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
	0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
	0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
	0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
	0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
	0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
	0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
	0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
	0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
	0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
	0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
	0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
	0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
	0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
	0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
	0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
	0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
	0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
	0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
	0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
	0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
	0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
	0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
	0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
	0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
	0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
	0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
	0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
	0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
	0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
	0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
	0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
	0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
	0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
	0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
	0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
	0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
	0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
	0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
	0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
	0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
	0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
	0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
	0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
	0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
	0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
	0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
	0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
	0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
	0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
	0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
	0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
	0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
	0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
	0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
	0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
	0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
	0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
	0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
	0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
	0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
	0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
	0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
	0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
	0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
	0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
	0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
	0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
	0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
	0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
	0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
	0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
	0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
	0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
	0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
	0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
	0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
	0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
	0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
	0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
	0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
	0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
	0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
	0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
	0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
	0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
	0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
	0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
	0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
	0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
	0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
	0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
	0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
	0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
	0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
	0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
	0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
	0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
	0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
	0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
	0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
	0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
	0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
	0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
	0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
	0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
	0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
	0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
	0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
	0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
	0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
	0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
	0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
	0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
	0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
	0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
	0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
	0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
	0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
	0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
	0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
	0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
	0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
	0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
	0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
	0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
	0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
	0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
	0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
	0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
	0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
	0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
	0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
	0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
	0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
	0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
	0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
	0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
	0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
	0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
	0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
	0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
	0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
	0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
	0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
	0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
	0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
	0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
	0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
	0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
	0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
};

static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
	u64 swap = 1;
	int i, j, k;
	u64 tmp[16 + 32 + 4];
	u64 *x1 = &tmp[0];
	u64 *z1 = &tmp[4];
	u64 *x2 = &tmp[8];
	u64 *z2 = &tmp[12];
	u64 *xz1 = &tmp[0];
	u64 *xz2 = &tmp[8];
	u64 *a = &tmp[0 + 16];
	u64 *b = &tmp[4 + 16];
	u64 *c = &tmp[8 + 16];
	u64 *ab = &tmp[0 + 16];
	u64 *abcd = &tmp[0 + 16];
	u64 *ef = &tmp[16 + 16];
	u64 *efgh = &tmp[16 + 16];
	u64 *key = &tmp[0 + 16 + 32];

	memcpy(key, priv, 32);
	((u8 *)key)[0] &= 248;
	((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;

	x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
	z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
	z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
	memcpy(x2, p_minus_s, sizeof(p_minus_s));

	j = 3;
	for (i = 0; i < 4; ++i) {
		while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
			u64 bit = (key[i] >> j) & 1;
			k = (64 * i + j - 3);
			swap = swap ^ bit;
			cswap2(swap, xz1, xz2);
			swap = bit;
			fsub(b, x1, z1);
			fadd(a, x1, z1);
			fmul(c, &table_ladder[4 * k], b, ef);
			fsub(b, a, c);
			fadd(a, a, c);
			fsqr2(ab, ab, efgh);
			fmul2(xz1, xz2, ab, efgh);
			++j;
		}
		j = 0;
	}

	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	encode_point(out, xz1);

	memzero_explicit(tmp, sizeof(tmp));
}
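
/*
 * Each iteration of the loop above is a compressed ladder step against
 * the precomputed point 2^k * G: with t = table_ladder[4*k .. 4*k+3],
 * it computes
 *
 *	b = x1 - z1;  a = x1 + z1;  c = t * b;
 *	x1' = x2 * (a + c)^2;
 *	z1' = z2 * (a - c)^2;
 *
 * i.e. 3 field multiplications and 2 squarings per scalar bit, versus
 * the 5 multiplications and 4 squarings of the generic
 * point_add_and_double() step. The loop walks bits 3..254 of the
 * clamped key (k runs over all 252 table entries); as in
 * montgomery_ladder(), the three trailing point_double() calls stand
 * in for the cleared low bits 2..0.
 */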

static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);

static void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
			    const u8 secret[CURVE25519_KEY_SIZE],
			    const u8 basepoint[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&curve25519_use_bmi2_adx))
		curve25519_ever64(mypublic, secret, basepoint);
	else
		curve25519_generic(mypublic, secret, basepoint);
}

static void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
				 const u8 secret[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&curve25519_use_bmi2_adx))
		curve25519_ever64_base(pub, secret);
	else
		curve25519_generic(pub, secret, curve25519_base_point);
}

#define curve25519_mod_init_arch curve25519_mod_init_arch
static void curve25519_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
		static_branch_enable(&curve25519_use_bmi2_adx);
}