xref: /linux/arch/x86/crypto/curve25519-x86_64.c (revision 2b703bbda2713fd2a7d98029ea6c44f9c3159f34)
1 // SPDX-License-Identifier: GPL-2.0 OR MIT
2 /*
3  * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
4  * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
5  */
6 
7 #include <crypto/curve25519.h>
8 #include <crypto/internal/kpp.h>
9 
10 #include <linux/types.h>
11 #include <linux/jump_label.h>
12 #include <linux/kernel.h>
13 #include <linux/module.h>
14 
15 #include <asm/cpufeature.h>
16 #include <asm/processor.h>
17 
18 static __always_inline u64 eq_mask(u64 a, u64 b)
19 {
20 	u64 x = a ^ b;
21 	u64 minus_x = ~x + (u64)1U;
22 	u64 x_or_minus_x = x | minus_x;
23 	u64 xnx = x_or_minus_x >> (u32)63U;
24 	return xnx - (u64)1U;
25 }
26 
27 static __always_inline u64 gte_mask(u64 a, u64 b)
28 {
29 	u64 x = a;
30 	u64 y = b;
31 	u64 x_xor_y = x ^ y;
32 	u64 x_sub_y = x - y;
33 	u64 x_sub_y_xor_y = x_sub_y ^ y;
34 	u64 q = x_xor_y | x_sub_y_xor_y;
35 	u64 x_xor_q = x ^ q;
36 	u64 x_xor_q_ = x_xor_q >> (u32)63U;
37 	return x_xor_q_ - (u64)1U;
38 }
39 
/*
 * Computes the addition of the four-limb (256-bit, little-endian)
 * integer f1 with the single 64-bit value f2, stores the four result
 * limbs in out, and returns the final carry bit (0 or 1).
 * Operand map: %0 = f2 (read-write accumulator), %1 = carry_r,
 * %2 = out, %3 = f1. Requires ADX (adcx).
 */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
	u64 carry_r;

	asm volatile(
		/* Clear registers to propagate the carry bit */
		"  xor %%r8, %%r8;"
		"  xor %%r9, %%r9;"
		"  xor %%r10, %%r10;"
		"  xor %%r11, %%r11;"
		"  xor %1, %1;"

		/* Begin addition chain */
		"  addq 0(%3), %0;"
		"  movq %0, 0(%2);"
		"  adcxq 8(%3), %%r8;"
		"  movq %%r8, 8(%2);"
		"  adcxq 16(%3), %%r9;"
		"  movq %%r9, 16(%2);"
		"  adcxq 24(%3), %%r10;"
		"  movq %%r10, 24(%2);"

		/* Return the carry bit in a register */
		"  adcx %%r11, %1;"
	: "+&r" (f2), "=&r" (carry_r)
	: "r" (out), "r" (f1)
	: "%r8", "%r9", "%r10", "%r11", "memory", "cc"
	);

	return carry_r;
}
73 
/*
 * Computes the field addition of two field elements:
 * out <- f1 + f2 mod 2^255 - 19 (four 64-bit limbs each).
 * The carry out of the raw 256-bit addition is folded back in as
 * carry * 38, since 2^256 ≡ 38 (mod 2^255 - 19). Requires ADX (adcx).
 */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw addition of f1 + f2 */
		"  movq 0(%0), %%r8;"
		"  addq 0(%2), %%r8;"
		"  movq 8(%0), %%r9;"
		"  adcxq 8(%2), %%r9;"
		"  movq 16(%0), %%r10;"
		"  adcxq 16(%2), %%r10;"
		"  movq 24(%0), %%r11;"
		"  adcxq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %0;"
		"  cmovc %0, %%rax;"

		/* Step 2: Add carry*38 to the original sum */
		"  xor %%rcx, %%rcx;"
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %0, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
	: "+&r" (f2)
	: "r" (out), "r" (f1)
	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
	);
}
115 
/*
 * Computes the field subtraction of two field elements:
 * out <- f1 - f2 mod 2^255 - 19 (four 64-bit limbs each).
 * A borrow out of the raw 256-bit subtraction is folded back in by
 * subtracting borrow * 38, since 2^256 ≡ 38 (mod 2^255 - 19).
 */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw subtraction of f1-f2 */
		"  movq 0(%1), %%r8;"
		"  subq 0(%2), %%r8;"
		"  movq 8(%1), %%r9;"
		"  sbbq 8(%2), %%r9;"
		"  movq 16(%1), %%r10;"
		"  sbbq 16(%2), %%r10;"
		"  movq 24(%1), %%r11;"
		"  sbbq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %%rcx;"
		"  cmovc %%rcx, %%rax;"

		/* Step 2: Subtract carry*38 from the original difference */
		"  sub %%rax, %%r8;"
		"  sbb $0, %%r9;"
		"  sbb $0, %%r10;"
		"  sbb $0, %%r11;"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rcx, %%rax;"
		"  sub %%rax, %%r8;"

		/* Store the result */
		"  movq %%r8, 0(%0);"
		"  movq %%r9, 8(%0);"
		"  movq %%r10, 16(%0);"
		"  movq %%r11, 24(%0);"
	:
	: "r" (out), "r" (f1), "r" (f2)
	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
	);
}
158 
/*
 * Computes a field multiplication: out <- f1 * f2 mod 2^255 - 19.
 * Uses the 8-element buffer tmp to hold the raw 512-bit schoolbook
 * product before reduction; the high 256 bits are folded in by
 * multiplying with 38 (2^256 ≡ 38 mod 2^255 - 19).
 * Operand map at entry: %0 = tmp, %1 = f1, %2 = out, %3 = f2; all four
 * are read-write because the asm repurposes the registers mid-stream
 * ("Line up pointers"). Requires BMI2 (mulx) and ADX (adcx/adox).
 */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- src1 * src2 */

		/* Compute src1[0] * src2 */
		"  movq 0(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"
		/* Compute src1[1] * src2 */
		"  movq 8(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[2] * src2 */
		"  movq 16(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[3] * src2 */
		"  movq 24(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
		/* Line up pointers */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %3, %3;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %3, %%rax;"
		"  adox %3, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %3, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %3, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %3, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"
	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
	:
	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
	);
}
237 
/*
 * Computes two independent field multiplications:
 *   out[0..3] <- f1[0..3] * f2[0..3]
 *   out[4..7] <- f1[4..7] * f2[4..7]
 * (each mod 2^255 - 19).
 * Uses the 16-element buffer tmp for the two raw 512-bit products.
 * Operand map at entry: %0 = tmp, %1 = f1, %2 = out, %3 = f2; all four
 * are read-write because the asm repurposes the registers mid-stream.
 * Requires BMI2 (mulx) and ADX (adcx/adox).
 */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

		/* Compute src1[0] * src2 */
		"  movq 0(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"
		/* Compute src1[1] * src2 */
		"  movq 8(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[2] * src2 */
		"  movq 16(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[3] * src2 */
		"  movq 24(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"

		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

		/* Compute src1[0] * src2 */
		"  movq 32(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"
		/* Compute src1[1] * src2 */
		"  movq 40(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[2] * src2 */
		"  movq 48(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[3] * src2 */
		"  movq 56(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
		/* Line up pointers */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Wrap the results back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %3, %3;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %3, %%rax;"
		"  adox %3, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %3, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %3, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %3, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%1), %%r8, %%r13;"
		"  xor %3, %3;"
		"  adoxq 64(%1), %%r8;"
		"  mulxq 104(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%1), %%r9;"
		"  mulxq 112(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%1), %%r10;"
		"  mulxq 120(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%1), %%r11;"
		"  adcx %3, %%rax;"
		"  adox %3, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %3, %%r9;"
		"  movq %%r9, 40(%0);"
		"  adcx %3, %%r10;"
		"  movq %%r10, 48(%0);"
		"  adcx %3, %%r11;"
		"  movq %%r11, 56(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%0);"
	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
	:
	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
	);
}
382 
/*
 * Computes the field multiplication of the four-limb element f1 with
 * the 64-bit value f2: out <- f1 * f2 mod 2^255 - 19.
 * f2 is pinned to %rdx because mulx takes one source operand
 * implicitly from rdx. Requires BMI2 (mulx) and ADX (adcx).
 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
	register u64 f2_r asm("rdx") = f2;

	asm volatile(
		/* Compute the raw multiplication of f1*f2 */
		"  mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
		"  mulxq 8(%2), %%r9, %%rbx;"      /* f1[1]*f2 */
		"  add %%rcx, %%r9;"
		"  mov $0, %%rcx;"
		"  mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
		"  adcx %%rbx, %%r10;"
		"  mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
		"  adcx %%r13, %%r11;"
		"  adcx %%rcx, %%rax;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $38, %%rdx;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
	: "+&r" (f2_r)
	: "r" (out), "r" (f1)
	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
	);
}
425 
/*
 * Conditionally swaps the two 8-limb buffers p1 and p2 in constant
 * time: when bit == 1 their contents are exchanged, when bit == 0 both
 * are left unchanged. Only cmov is used, so no data-dependent branch
 * or memory access pattern leaks the value of bit.
 *
 * NOTE(review): both buffers are written through despite the
 * const-qualified parameters (the "memory" clobber makes that visible
 * to the compiler); the prototype is misleading — consider dropping
 * const.
 */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
	asm volatile(
		/* Invert the polarity of bit to match cmov expectations:
		 * adding ~0 sets CF iff bit was 1 */
		"  add $18446744073709551615, %0;"

		/* cswap p1[0], p2[0] */
		"  movq 0(%1), %%r8;"
		"  movq 0(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 0(%1);"
		"  movq %%r9, 0(%2);"

		/* cswap p1[1], p2[1] */
		"  movq 8(%1), %%r8;"
		"  movq 8(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 8(%1);"
		"  movq %%r9, 8(%2);"

		/* cswap p1[2], p2[2] */
		"  movq 16(%1), %%r8;"
		"  movq 16(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 16(%1);"
		"  movq %%r9, 16(%2);"

		/* cswap p1[3], p2[3] */
		"  movq 24(%1), %%r8;"
		"  movq 24(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 24(%1);"
		"  movq %%r9, 24(%2);"

		/* cswap p1[4], p2[4] */
		"  movq 32(%1), %%r8;"
		"  movq 32(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 32(%1);"
		"  movq %%r9, 32(%2);"

		/* cswap p1[5], p2[5] */
		"  movq 40(%1), %%r8;"
		"  movq 40(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 40(%1);"
		"  movq %%r9, 40(%2);"

		/* cswap p1[6], p2[6] */
		"  movq 48(%1), %%r8;"
		"  movq 48(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 48(%1);"
		"  movq %%r9, 48(%2);"

		/* cswap p1[7], p2[7] */
		"  movq 56(%1), %%r8;"
		"  movq 56(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 56(%1);"
		"  movq %%r9, 56(%2);"
	: "+&r" (bit)
	: "r" (p1), "r" (p2)
	: "%r8", "%r9", "%r10", "memory", "cc"
	);
}
509 
/*
 * Computes the square of a field element:
 * out <- f * f mod 2^255 - 19.
 * Uses the 8-element buffer tmp for the raw 512-bit square; the high
 * half is folded in by multiplying with 38 (2^256 ≡ 38 mod 2^255-19).
 * Operand map at entry: %0 = tmp, %1 = f, %2 = out; all three are
 * read-write because the asm repurposes the registers mid-stream.
 * Requires BMI2 (mulx) and ADX (adcx/adox).
 */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- f * f */

		/* Step 1: Compute all partial products */
		"  movq 0(%1), %%rdx;"                                       /* f[0] */
		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
		"  movq 24(%1), %%rdx;"                                      /* f[3] */
		"  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f[1] */
		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15, %%r15;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
		                           "  movq %%rax, 0(%0);"
		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"

		/* Line up pointers */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %%rcx, %%rcx;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"
	: "+&r" (tmp), "+&r" (f), "+&r" (out)
	:
	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
	);
}
600 
/*
 * Computes two field squarings:
 *   out[0..3] <- f[0..3]^2
 *   out[4..7] <- f[4..7]^2
 * (each mod 2^255 - 19).
 * Uses the 16-element buffer tmp for the two raw 512-bit squares.
 * Operand map at entry: %0 = tmp, %1 = f, %2 = out; all three are
 * read-write because the asm repurposes the registers mid-stream.
 * Requires BMI2 (mulx) and ADX (adcx/adox).
 */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Step 1: Compute all partial products */
		"  movq 0(%1), %%rdx;"                                       /* f[0] */
		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
		"  movq 24(%1), %%rdx;"                                      /* f[3] */
		"  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f[1] */
		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15, %%r15;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
		                           "  movq %%rax, 0(%0);"
		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"

		/* Step 1: Compute all partial products (second element, f[4..7]) */
		"  movq 32(%1), %%rdx;"                                       /* f[4] */
		"  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[5]*f[4] */
		"  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[6]*f[4] */
		"  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[7]*f[4] */
		"  movq 56(%1), %%rdx;"                                      /* f[7] */
		"  mulxq 40(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[5]*f[7] */
		"  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[6]*f[7] */
		"  movq 40(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f[5] */
		"  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[6]*f[5] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15, %%r15;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 32(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[4]^2 */
		                           "  movq %%rax, 64(%0);"
		"  add %%rcx, %%r8;"       "  movq %%r8, 72(%0);"
		"  movq 40(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[5]^2 */
		"  adcx %%rax, %%r9;"      "  movq %%r9, 80(%0);"
		"  adcx %%rcx, %%r10;"     "  movq %%r10, 88(%0);"
		"  movq 48(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[6]^2 */
		"  adcx %%rax, %%r11;"     "  movq %%r11, 96(%0);"
		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 104(%0);"
		"  movq 56(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[7]^2 */
		"  adcx %%rax, %%r13;"     "  movq %%r13, 112(%0);"
		"  adcx %%rcx, %%r14;"     "  movq %%r14, 120(%0);"

		/* Line up pointers */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %%rcx, %%rcx;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%1), %%r8, %%r13;"
		"  xor %%rcx, %%rcx;"
		"  adoxq 64(%1), %%r8;"
		"  mulxq 104(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%1), %%r9;"
		"  mulxq 112(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%1), %%r10;"
		"  mulxq 120(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%1), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 40(%0);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 48(%0);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 56(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%0);"
	: "+&r" (tmp), "+&r" (f), "+&r" (out)
	:
	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
	);
}
762 
/*
 * One combined Montgomery-ladder step: differential addition plus
 * doubling in projective (X:Z) coordinates.
 *
 * @q:        base-point x-coordinate x1 (4 limbs), read only
 * @p01_tmp1: layout nq (8 limbs) | nq_p1 (8 limbs) | tmp1 (16 limbs);
 *            nq and nq_p1 are updated in place, tmp1 is scratch
 * @tmp2:     16-limb scratch passed through to fmul/fmul2/fsqr2
 *
 * The many aliased pointers below (a/ab, d/dc, ...) are views into the
 * same tmp1 scratch; the call order is significant because several
 * operations overwrite their own inputs.
 */
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
	u64 *nq = p01_tmp1;
	u64 *nq_p1 = p01_tmp1 + (u32)8U;
	u64 *tmp1 = p01_tmp1 + (u32)16U;
	u64 *x1 = q;
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *z3 = nq_p1 + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *ab = tmp1;		/* (a, b) viewed as one 8-limb pair */
	u64 *dc = tmp1 + (u32)8U;
	u64 *x3;
	u64 *z31;
	u64 *d0;
	u64 *c0;
	u64 *a1;
	u64 *b1;
	u64 *d;
	u64 *c;
	u64 *ab1;
	u64 *dc1;
	fadd(a, x2, z2);	/* a = x2 + z2 */
	fsub(b, x2, z2);	/* b = x2 - z2 */
	x3 = nq_p1;
	z31 = nq_p1 + (u32)4U;
	d0 = dc;
	c0 = dc + (u32)4U;
	fadd(c0, x3, z31);	/* c = x3 + z3 */
	fsub(d0, x3, z31);	/* d = x3 - z3 */
	fmul2(dc, dc, ab, tmp2);	/* d <- d*a, c <- c*b (in place) */
	fadd(x3, d0, c0);
	fsub(z31, d0, c0);
	a1 = tmp1;
	b1 = tmp1 + (u32)4U;
	d = tmp1 + (u32)8U;
	c = tmp1 + (u32)12U;
	ab1 = tmp1;
	dc1 = tmp1 + (u32)8U;
	fsqr2(dc1, ab1, tmp2);	/* d = a^2, c = b^2 */
	fsqr2(nq_p1, nq_p1, tmp2);
	/* a <- c (limb-wise copy) */
	a1[0U] = c[0U];
	a1[1U] = c[1U];
	a1[2U] = c[2U];
	a1[3U] = c[3U];
	fsub(c, d, c);		/* c = d - c = a^2 - b^2 */
	fmul_scalar(b1, c, (u64)121665U);	/* (A-2)/4 constant for curve25519 */
	fadd(b1, b1, d);
	fmul2(nq, dc1, ab1, tmp2);
	fmul(z3, z3, x1, tmp2);
}
815 
816 static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
817 {
818 	u64 *x2 = nq;
819 	u64 *z2 = nq + (u32)4U;
820 	u64 *a = tmp1;
821 	u64 *b = tmp1 + (u32)4U;
822 	u64 *d = tmp1 + (u32)8U;
823 	u64 *c = tmp1 + (u32)12U;
824 	u64 *ab = tmp1;
825 	u64 *dc = tmp1 + (u32)8U;
826 	fadd(a, x2, z2);
827 	fsub(b, x2, z2);
828 	fsqr2(dc, ab, tmp2);
829 	a[0U] = c[0U];
830 	a[1U] = c[1U];
831 	a[2U] = c[2U];
832 	a[3U] = c[3U];
833 	fsub(c, d, c);
834 	fmul_scalar(b, c, (u64)121665U);
835 	fadd(b, b, d);
836 	fmul2(nq, dc, ab, tmp2);
837 }
838 
/*
 * Montgomery ladder scalar multiplication: out <- [key] * init1.
 *
 * @out:   result, 8 limbs (projective X:Z)
 * @key:   32-byte scalar; the loop walks bits 253 down to 3, the step
 *         before the loop plus the three trailing doublings cover the
 *         remaining bits (NOTE(review): this relies on the scalar being
 *         clamped — bit 254 set, bits 0-2 clear — confirm at caller)
 * @init1: input point, 8 limbs; also used as scratch by the ladder step
 *
 * Constant time: every iteration performs the same operations and
 * cswap2() selects between the two ladder points with a
 * data-independent mask. Secret-dependent scratch is wiped with
 * memzero_explicit() before returning.
 */
static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
	u64 tmp2[16U] = { 0U };
	/* Layout: p0 (8 limbs) | p1 (8 limbs) | tmp1 (16 limbs) | swap flag (1) */
	u64 p01_tmp1_swap[33U] = { 0U };
	u64 *p0 = p01_tmp1_swap;
	u64 *p01 = p01_tmp1_swap;
	u64 *p03 = p01;
	u64 *p11 = p01 + (u32)8U;
	u64 *x0;
	u64 *z0;
	u64 *p01_tmp1;
	u64 *p01_tmp11;
	u64 *nq10;
	u64 *nq_p11;
	u64 *swap1;
	u64 sw0;
	u64 *nq1;
	u64 *tmp1;
	memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
	x0 = p03;
	z0 = p03 + (u32)4U;
	/* p0 <- the neutral element (1 : 0) */
	x0[0U] = (u64)1U;
	x0[1U] = (u64)0U;
	x0[2U] = (u64)0U;
	x0[3U] = (u64)0U;
	z0[0U] = (u64)0U;
	z0[1U] = (u64)0U;
	z0[2U] = (u64)0U;
	z0[3U] = (u64)0U;
	p01_tmp1 = p01_tmp1_swap;
	p01_tmp11 = p01_tmp1_swap;
	nq10 = p01_tmp1_swap;
	nq_p11 = p01_tmp1_swap + (u32)8U;
	swap1 = p01_tmp1_swap + (u32)32U;
	/* First ladder step with the swap hard-wired to 1 */
	cswap2((u64)1U, nq10, nq_p11);
	point_add_and_double(init1, p01_tmp11, tmp2);
	swap1[0U] = (u64)1U;
	{
		u32 i;
		for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
			u64 *p01_tmp12 = p01_tmp1_swap;
			u64 *swap2 = p01_tmp1_swap + (u32)32U;
			u64 *nq2 = p01_tmp12;
			u64 *nq_p12 = p01_tmp12 + (u32)8U;
			/* Scalar bit (253 - i), taken MSB-first */
			u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
			/* Swap only when the bit differs from the previous one */
			u64 sw = swap2[0U] ^ bit;
			cswap2(sw, nq2, nq_p12);
			point_add_and_double(init1, p01_tmp12, tmp2);
			swap2[0U] = bit;
		}
	}
	/* Undo any outstanding conditional swap */
	sw0 = swap1[0U];
	cswap2(sw0, nq10, nq_p11);
	nq1 = p01_tmp1;
	tmp1 = p01_tmp1 + (u32)16U;
	/* Three doublings for the three cleared low bits of the scalar */
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));

	/* Wipe secret-dependent intermediate state */
	memzero_explicit(tmp2, sizeof(tmp2));
	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}
902 
903 static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
904 {
905 	u32 i;
906 	fsqr(o, inp, tmp);
907 	for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
908 		fsqr(o, o, tmp);
909 }
910 
/*
 * Field inversion o = i^(p - 2) mod p with p = 2^255 - 19 (Fermat's
 * little theorem), using a fixed square-and-multiply addition chain:
 * 254 squarings and 11 multiplications regardless of input, so the
 * operation sequence is constant.  t1 holds the chain's intermediate
 * powers; tmp is scratch for the arithmetic helpers.
 */
static void finv(u64 *o, const u64 *i, u64 *tmp)
{
	u64 t1[16U] = { 0U };
	u64 *a0 = t1;
	u64 *b = t1 + (u32)4U;
	u64 *c = t1 + (u32)8U;
	u64 *t00 = t1 + (u32)12U;
	u64 *tmp1 = tmp;
	u64 *a;
	u64 *t0;
	fsquare_times(a0, i, tmp1, (u32)1U);	/* a0 = i^2 */
	fsquare_times(t00, a0, tmp1, (u32)2U);	/* t00 = i^8 */
	fmul(b, t00, i, tmp);			/* b = i^9 */
	fmul(a0, b, a0, tmp);			/* a0 = i^11 (reused at the end) */
	fsquare_times(t00, a0, tmp1, (u32)1U);	/* t00 = i^22 */
	fmul(b, t00, b, tmp);			/* b = i^31 = i^(2^5 - 1) */
	fsquare_times(t00, b, tmp1, (u32)5U);
	fmul(b, t00, b, tmp);			/* b = i^(2^10 - 1) */
	fsquare_times(t00, b, tmp1, (u32)10U);
	fmul(c, t00, b, tmp);			/* c = i^(2^20 - 1) */
	fsquare_times(t00, c, tmp1, (u32)20U);
	fmul(t00, t00, c, tmp);			/* t00 = i^(2^40 - 1) */
	fsquare_times(t00, t00, tmp1, (u32)10U);
	fmul(b, t00, b, tmp);			/* b = i^(2^50 - 1) */
	fsquare_times(t00, b, tmp1, (u32)50U);
	fmul(c, t00, b, tmp);			/* c = i^(2^100 - 1) */
	fsquare_times(t00, c, tmp1, (u32)100U);
	fmul(t00, t00, c, tmp);			/* t00 = i^(2^200 - 1) */
	fsquare_times(t00, t00, tmp1, (u32)50U);
	fmul(t00, t00, b, tmp);			/* t00 = i^(2^250 - 1) */
	fsquare_times(t00, t00, tmp1, (u32)5U);	/* t00 = i^(2^255 - 2^5) */
	a = t1;
	t0 = t1 + (u32)12U;
	/* o = i^(2^255 - 32) * i^11 = i^(2^255 - 21) = i^(p - 2) */
	fmul(o, t0, a, tmp);
}
946 
947 static void store_felem(u64 *b, u64 *f)
948 {
949 	u64 f30 = f[3U];
950 	u64 top_bit0 = f30 >> (u32)63U;
951 	u64 carry0;
952 	u64 f31;
953 	u64 top_bit;
954 	u64 carry;
955 	u64 f0;
956 	u64 f1;
957 	u64 f2;
958 	u64 f3;
959 	u64 m0;
960 	u64 m1;
961 	u64 m2;
962 	u64 m3;
963 	u64 mask;
964 	u64 f0_;
965 	u64 f1_;
966 	u64 f2_;
967 	u64 f3_;
968 	u64 o0;
969 	u64 o1;
970 	u64 o2;
971 	u64 o3;
972 	f[3U] = f30 & (u64)0x7fffffffffffffffU;
973 	carry0 = add_scalar(f, f, (u64)19U * top_bit0);
974 	f31 = f[3U];
975 	top_bit = f31 >> (u32)63U;
976 	f[3U] = f31 & (u64)0x7fffffffffffffffU;
977 	carry = add_scalar(f, f, (u64)19U * top_bit);
978 	f0 = f[0U];
979 	f1 = f[1U];
980 	f2 = f[2U];
981 	f3 = f[3U];
982 	m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
983 	m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
984 	m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
985 	m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
986 	mask = ((m0 & m1) & m2) & m3;
987 	f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
988 	f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
989 	f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
990 	f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
991 	o0 = f0_;
992 	o1 = f1_;
993 	o2 = f2_;
994 	o3 = f3_;
995 	b[0U] = o0;
996 	b[1U] = o1;
997 	b[2U] = o2;
998 	b[3U] = o3;
999 }
1000 
1001 static void encode_point(u8 *o, const u64 *i)
1002 {
1003 	const u64 *x = i;
1004 	const u64 *z = i + (u32)4U;
1005 	u64 tmp[4U] = { 0U };
1006 	u64 tmp_w[16U] = { 0U };
1007 	finv(tmp, z, tmp_w);
1008 	fmul(tmp, tmp, x, tmp_w);
1009 	store_felem((u64 *)o, tmp);
1010 }
1011 
/*
 * out = X25519(priv, pub): variable-base scalar multiplication of the
 * peer's u-coordinate pub by the scalar priv, via the Montgomery
 * ladder.
 */
static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
{
	u64 init1[8U] = { 0U };
	u64 tmp[4U] = { 0U };
	u64 tmp3;
	u64 *x;
	u64 *z;
	{
		/* Load pub as four little-endian u64 limbs.  The direct
		 * u64 dereference assumes x86 semantics (little-endian,
		 * unaligned loads permitted). */
		u32 i;
		for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
			u64 *os = tmp;
			const u8 *bj = pub + i * (u32)8U;
			u64 u = *(u64 *)bj;
			u64 r = u;
			u64 x0 = r;
			os[i] = x0;
		}
	}
	/* Clear bit 255 of the incoming u-coordinate. */
	tmp3 = tmp[3U];
	tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
	/* Set up the point in projective form (x : 1). */
	x = init1;
	z = init1 + (u32)4U;
	z[0U] = (u64)1U;
	z[1U] = (u64)0U;
	z[2U] = (u64)0U;
	z[3U] = (u64)0U;
	x[0U] = tmp[0U];
	x[1U] = tmp[1U];
	x[2U] = tmp[2U];
	x[3U] = tmp[3U];
	montgomery_ladder(init1, priv, init1);
	encode_point(out, init1);
}
1045 
1046 /* The below constants were generated using this sage script:
1047  *
1048  * #!/usr/bin/env sage
1049  * import sys
1050  * from sage.all import *
1051  * def limbs(n):
1052  * 	n = int(n)
1053  * 	l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
1054  * 	return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
1055  * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
1056  * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
1057  * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
1058  * print("static const u64 table_ladder[] = {")
1059  * p = ec.lift_x(9)
1060  * for i in range(252):
1061  * 	l = (p[0] + p[2]) / (p[0] - p[2])
1062  * 	print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
1063  * 	p = p * 2
1064  * print("};")
1065  *
1066  */
1067 
/* x-coordinate of lift_x(9) - lift_x(1) (see the generator script
 * above); seeds x2 in curve25519_ever64_base(). */
static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
1069 
1070 static const u64 table_ladder[] = {
1071 	0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
1072 	0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
1073 	0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
1074 	0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
1075 	0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
1076 	0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
1077 	0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
1078 	0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
1079 	0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
1080 	0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
1081 	0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
1082 	0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
1083 	0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
1084 	0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
1085 	0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
1086 	0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
1087 	0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
1088 	0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
1089 	0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
1090 	0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
1091 	0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
1092 	0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
1093 	0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
1094 	0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
1095 	0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
1096 	0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
1097 	0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
1098 	0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
1099 	0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
1100 	0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
1101 	0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
1102 	0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
1103 	0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
1104 	0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
1105 	0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
1106 	0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
1107 	0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
1108 	0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
1109 	0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
1110 	0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
1111 	0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
1112 	0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
1113 	0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
1114 	0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
1115 	0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
1116 	0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
1117 	0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
1118 	0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
1119 	0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
1120 	0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
1121 	0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
1122 	0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
1123 	0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
1124 	0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
1125 	0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
1126 	0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
1127 	0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
1128 	0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
1129 	0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
1130 	0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
1131 	0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
1132 	0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
1133 	0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
1134 	0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
1135 	0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
1136 	0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
1137 	0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
1138 	0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
1139 	0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
1140 	0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
1141 	0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
1142 	0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
1143 	0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
1144 	0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
1145 	0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
1146 	0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
1147 	0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
1148 	0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
1149 	0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
1150 	0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
1151 	0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
1152 	0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
1153 	0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
1154 	0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
1155 	0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
1156 	0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
1157 	0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
1158 	0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
1159 	0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
1160 	0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
1161 	0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
1162 	0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
1163 	0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
1164 	0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
1165 	0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
1166 	0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
1167 	0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
1168 	0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
1169 	0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
1170 	0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
1171 	0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
1172 	0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
1173 	0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
1174 	0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
1175 	0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
1176 	0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
1177 	0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
1178 	0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
1179 	0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
1180 	0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
1181 	0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
1182 	0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
1183 	0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
1184 	0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
1185 	0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
1186 	0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
1187 	0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
1188 	0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
1189 	0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
1190 	0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
1191 	0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
1192 	0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
1193 	0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
1194 	0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
1195 	0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
1196 	0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
1197 	0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
1198 	0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
1199 	0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
1200 	0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
1201 	0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
1202 	0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
1203 	0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
1204 	0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
1205 	0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
1206 	0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
1207 	0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
1208 	0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
1209 	0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
1210 	0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
1211 	0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
1212 	0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
1213 	0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
1214 	0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
1215 	0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
1216 	0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
1217 	0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
1218 	0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
1219 	0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
1220 	0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
1221 	0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
1222 	0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
1223 	0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
1224 	0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
1225 	0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
1226 	0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
1227 	0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
1228 	0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
1229 	0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
1230 	0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
1231 	0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
1232 	0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
1233 	0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
1234 	0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
1235 	0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
1236 	0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
1237 	0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
1238 	0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
1239 	0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
1240 	0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
1241 	0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
1242 	0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
1243 	0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
1244 	0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
1245 	0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
1246 	0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
1247 	0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
1248 	0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
1249 	0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
1250 	0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
1251 	0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
1252 	0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
1253 	0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
1254 	0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
1255 	0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
1256 	0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
1257 	0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
1258 	0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
1259 	0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
1260 	0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
1261 	0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
1262 	0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
1263 	0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
1264 	0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
1265 	0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
1266 	0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
1267 	0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
1268 	0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
1269 	0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
1270 	0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
1271 	0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
1272 	0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
1273 	0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
1274 	0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
1275 	0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
1276 	0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
1277 	0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
1278 	0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
1279 	0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
1280 	0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
1281 	0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
1282 	0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
1283 	0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
1284 	0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
1285 	0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
1286 	0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
1287 	0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
1288 	0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
1289 	0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
1290 	0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
1291 	0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
1292 	0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
1293 	0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
1294 	0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
1295 	0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
1296 	0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
1297 	0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
1298 	0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
1299 	0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
1300 	0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
1301 	0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
1302 	0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
1303 	0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
1304 	0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
1305 	0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
1306 	0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
1307 	0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
1308 	0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
1309 	0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
1310 	0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
1311 	0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
1312 	0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
1313 	0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
1314 	0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
1315 	0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
1316 	0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
1317 	0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
1318 	0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
1319 	0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
1320 	0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
1321 	0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
1322 	0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
1323 };
1324 
/*
 * out = X25519(priv, 9): fixed-base scalar multiplication, accelerated
 * with the precomputed table_ladder constants (generated by the sage
 * script above, one 4-limb entry per ladder step).
 *
 * The key is clamped here (clear the low 3 bits, clear bit 255, set
 * bit 254).  The loop walks bits 3..63 of key word 0, bits 0..63 of
 * words 1-2, and bits 0..62 of word 3 — scalar bits 3 through 254,
 * 252 steps matching the 252 table entries.  The three trailing
 * point_double() calls account for the cleared low bits
 * (multiplication by 8) before the result is encoded from xz1.
 */
static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
	u64 swap = 1;
	int i, j, k;
	/* 16 limbs of ladder state + 32 limbs of scratch + 4 limbs of
	 * clamped key, all wiped together at the end. */
	u64 tmp[16 + 32 + 4];
	u64 *x1 = &tmp[0];
	u64 *z1 = &tmp[4];
	u64 *x2 = &tmp[8];
	u64 *z2 = &tmp[12];
	u64 *xz1 = &tmp[0];
	u64 *xz2 = &tmp[8];
	u64 *a = &tmp[0 + 16];
	u64 *b = &tmp[4 + 16];
	u64 *c = &tmp[8 + 16];
	u64 *ab = &tmp[0 + 16];
	u64 *abcd = &tmp[0 + 16];
	u64 *ef = &tmp[16 + 16];
	u64 *efgh = &tmp[16 + 16];
	u64 *key = &tmp[0 + 16 + 32];

	/* RFC 7748-style clamp of the private scalar. */
	memcpy(key, priv, 32);
	((u8 *)key)[0] &= 248;
	((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;

	x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
	z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
	z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
	memcpy(x2, p_minus_s, sizeof(p_minus_s));

	j = 3;	/* start at scalar bit 3; bits 0-2 were cleared */
	for (i = 0; i < 4; ++i) {
		while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
			u64 bit = (key[i] >> j) & 1;
			k = (64 * i + j - 3);	/* table entry index */
			/* Swap only when the bit differs from the last. */
			swap = swap ^ bit;
			cswap2(swap, xz1, xz2);
			swap = bit;
			fsub(b, x1, z1);
			fadd(a, x1, z1);
			/* Fold in the precomputed per-step constant. */
			fmul(c, &table_ladder[4 * k], b, ef);
			fsub(b, a, c);
			fadd(a, a, c);
			fsqr2(ab, ab, efgh);
			fmul2(xz1, xz2, ab, efgh);
			++j;
		}
		j = 0;
	}

	/* Account for the three cleared low bits: multiply by 8. */
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	encode_point(out, xz1);

	/* Wipe the clamped key and all secret-dependent state. */
	memzero_explicit(tmp, sizeof(tmp));
}
1381 
/* Enabled once at boot (curve25519_mod_init) when the CPU has both BMI2
 * and ADX; selects the ever64 implementation over the generic one. */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
1383 
1384 void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
1385 		     const u8 secret[CURVE25519_KEY_SIZE],
1386 		     const u8 basepoint[CURVE25519_KEY_SIZE])
1387 {
1388 	if (static_branch_likely(&curve25519_use_bmi2_adx))
1389 		curve25519_ever64(mypublic, secret, basepoint);
1390 	else
1391 		curve25519_generic(mypublic, secret, basepoint);
1392 }
1393 EXPORT_SYMBOL(curve25519_arch);
1394 
1395 void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
1396 			  const u8 secret[CURVE25519_KEY_SIZE])
1397 {
1398 	if (static_branch_likely(&curve25519_use_bmi2_adx))
1399 		curve25519_ever64_base(pub, secret);
1400 	else
1401 		curve25519_generic(pub, secret, curve25519_base_point);
1402 }
1403 EXPORT_SYMBOL(curve25519_base_arch);
1404 
1405 static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
1406 				 unsigned int len)
1407 {
1408 	u8 *secret = kpp_tfm_ctx(tfm);
1409 
1410 	if (!len)
1411 		curve25519_generate_secret(secret);
1412 	else if (len == CURVE25519_KEY_SIZE &&
1413 		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
1414 		memcpy(secret, buf, CURVE25519_KEY_SIZE);
1415 	else
1416 		return -EINVAL;
1417 	return 0;
1418 }
1419 
1420 static int curve25519_generate_public_key(struct kpp_request *req)
1421 {
1422 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1423 	const u8 *secret = kpp_tfm_ctx(tfm);
1424 	u8 buf[CURVE25519_KEY_SIZE];
1425 	int copied, nbytes;
1426 
1427 	if (req->src)
1428 		return -EINVAL;
1429 
1430 	curve25519_base_arch(buf, secret);
1431 
1432 	/* might want less than we've got */
1433 	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1434 	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1435 								nbytes),
1436 				     buf, nbytes);
1437 	if (copied != nbytes)
1438 		return -EINVAL;
1439 	return 0;
1440 }
1441 
1442 static int curve25519_compute_shared_secret(struct kpp_request *req)
1443 {
1444 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1445 	const u8 *secret = kpp_tfm_ctx(tfm);
1446 	u8 public_key[CURVE25519_KEY_SIZE];
1447 	u8 buf[CURVE25519_KEY_SIZE];
1448 	int copied, nbytes;
1449 
1450 	if (!req->src)
1451 		return -EINVAL;
1452 
1453 	copied = sg_copy_to_buffer(req->src,
1454 				   sg_nents_for_len(req->src,
1455 						    CURVE25519_KEY_SIZE),
1456 				   public_key, CURVE25519_KEY_SIZE);
1457 	if (copied != CURVE25519_KEY_SIZE)
1458 		return -EINVAL;
1459 
1460 	curve25519_arch(buf, secret, public_key);
1461 
1462 	/* might want less than we've got */
1463 	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1464 	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1465 								nbytes),
1466 				     buf, nbytes);
1467 	if (copied != nbytes)
1468 		return -EINVAL;
1469 	return 0;
1470 }
1471 
/* kpp ->max_size: both public keys and shared secrets are 32 bytes. */
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
	return CURVE25519_KEY_SIZE;
}
1476 
/* kpp algorithm descriptor; registered from curve25519_mod_init only
 * when the CPU supports BMI2 and ADX. */
static struct kpp_alg curve25519_alg = {
	.base.cra_name		= "curve25519",
	.base.cra_driver_name	= "curve25519-x86",
	.base.cra_priority	= 200,
	.base.cra_module	= THIS_MODULE,
	/* Per-tfm context is just the raw 32-byte private key. */
	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,

	.set_secret		= curve25519_set_secret,
	.generate_public_key	= curve25519_generate_public_key,
	.compute_shared_secret	= curve25519_compute_shared_secret,
	.max_size		= curve25519_max_size,
};
1489 
1490 
1491 static int __init curve25519_mod_init(void)
1492 {
1493 	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
1494 		static_branch_enable(&curve25519_use_bmi2_adx);
1495 	else
1496 		return 0;
1497 	return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
1498 		crypto_register_kpp(&curve25519_alg) : 0;
1499 }
1500 
1501 static void __exit curve25519_mod_exit(void)
1502 {
1503 	if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
1504 	    (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
1505 		crypto_unregister_kpp(&curve25519_alg);
1506 }
1507 
1508 module_init(curve25519_mod_init);
1509 module_exit(curve25519_mod_exit);
1510 
1511 MODULE_ALIAS_CRYPTO("curve25519");
1512 MODULE_ALIAS_CRYPTO("curve25519-x86");
1513 MODULE_LICENSE("GPL v2");
1514 MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
1515