xref: /freebsd/sys/crypto/armv8/armv8_crypto_wrap.c (revision e9d419a05357036ea2fd37218d853d2c713d55cc)
/*-
 * Copyright (c) 2016 The FreeBSD Foundation
 * Copyright (c) 2020 Ampere Computing
 * All rights reserved.
 *
 * This software was developed by Andrew Turner under
 * sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This code is built with floating-point enabled. Callers must have
 * entered a floating-point context before calling any of these
 * functions.
 */
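
/*
 * Illustrative calling pattern only (an assumption about the consumer,
 * e.g. armv8_crypto.c, not part of this file): calls are expected to be
 * bracketed with fpu_kern_enter()/fpu_kern_leave(); the exact context
 * and flag handling is up to the caller.
 *
 *	fpu_kern_enter(curthread, ctx, FPU_KERN_NORMAL);
 *	armv8_aes_encrypt_cbc(key, len, from, to, iv);
 *	fpu_kern_leave(curthread, ctx);
 */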

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <crypto/rijndael/rijndael.h>
#include <crypto/armv8/armv8_crypto.h>

#include <arm_neon.h>

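/*
 * Encrypt one 16-byte block using the ARMv8 Crypto Extensions
 * (AESE/AESMC).  keysched points at the expanded round keys, one
 * uint8x16_t per round key; callers in this file pass aes_rounds - 1
 * as the rounds argument.
 */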
static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaeseq_u8(tmp, keysched[i]);
		tmp = vaesmcq_u8(tmp);
		tmp = vaeseq_u8(tmp, keysched[i + 1]);
		tmp = vaesmcq_u8(tmp);
	}

	tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesmcq_u8(tmp);
	tmp = vaeseq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

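/*
 * Decrypt one 16-byte block using the ARMv8 Crypto Extensions
 * (AESD/AESIMC) and the inverse (decryption) key schedule.
 */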
static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaesdq_u8(tmp, keysched[i]);
		tmp = vaesimcq_u8(tmp);
		tmp = vaesdq_u8(tmp, keysched[i + 1]);
		tmp = vaesimcq_u8(tmp);
	}

	tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesimcq_u8(tmp);
	tmp = vaesdq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

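/*
 * CBC encrypt len bytes (truncated to a multiple of AES_BLOCK_LEN) from
 * 'from' into 'to': each plaintext block is XORed with the previous
 * ciphertext block (the IV for the first block) before encryption.
 */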
void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    const uint8_t *from, uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t tot, ivreg, tmp;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = vld1q_u8(iv);
	for (i = 0; i < len; i++) {
		tmp = vld1q_u8(from);
		tot = armv8_aes_enc(key->aes_rounds - 1,
		    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
		ivreg = tot;
		vst1q_u8(to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}

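/*
 * CBC decrypt len bytes (truncated to a multiple of AES_BLOCK_LEN) in
 * place: each decrypted block is XORed with the previous ciphertext
 * block (the IV for the first block).
 */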
void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    uint8_t *buf, const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t ivreg, nextiv, tmp;
	size_t i;

	len /= AES_BLOCK_LEN;
	ivreg = vld1q_u8(iv);
	for (i = 0; i < len; i++) {
		nextiv = vld1q_u8(buf);
		tmp = armv8_aes_dec(key->aes_rounds - 1,
		    (const void *)key->aes_key, nextiv);
		vst1q_u8(buf, veorq_u8(tmp, ivreg));
		ivreg = nextiv;
		buf += AES_BLOCK_LEN;
	}
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

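/*
 * Advance the XTS tweak: multiply the 128-bit tweak by alpha (x) in
 * GF(2^128).  Each 32-bit lane is shifted left by one bit, carries are
 * propagated into the next lane, and a carry out of the top bit is
 * folded back in as the 0x87 reduction constant.
 */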
static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
	const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
	int32x4_t xtweak, ret;

	/* set up xor mask */
	xtweak = vextq_s32(inp, inp, 3);
	xtweak = vshrq_n_s32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = vshlq_n_s32(inp, 1);
	ret ^= xtweak;

	return (ret);
}

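/*
 * Encrypt or decrypt one XTS block: XOR with the tweak, run the block
 * cipher, XOR with the tweak again, then advance the tweak for the
 * next block.
 */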
static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
	uint8x16_t block;

	block = vld1q_u8(from) ^ *tweak;

	if (do_encrypt)
		block = armv8_aes_enc(rounds - 1, key_schedule, block);
	else
		block = armv8_aes_dec(rounds - 1, key_schedule, block);

	vst1q_u8(to, block ^ *tweak);

	*tweak = vreinterpretq_u8_s32(xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
}

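/*
 * XTS encrypt or decrypt len bytes from 'from' into 'to'.  The tweak is
 * derived by encrypting the IV (a little-endian 64-bit block number,
 * zero-extended to 128 bits) with the tweak key.  Only whole blocks are
 * processed; there is no ciphertext stealing.
 */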
static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len, const uint8_t *from,
    uint8_t *to, const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	uint8x16_t tweakreg;
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	size_t i, cnt;

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = vld1q_u8(tweak);
	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);

	cnt = len / AES_XTS_BLOCKSIZE;
	for (i = 0; i < cnt; i++) {
		armv8_aes_crypt_xts_block(rounds, data_schedule, &tweakreg,
		    from, to, do_encrypt);
		from += AES_XTS_BLOCKSIZE;
		to += AES_XTS_BLOCKSIZE;
	}
}

void
armv8_aes_encrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
    const uint8_t iv[static AES_BLOCK_LEN])
{

	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, from,
	    to, iv, 1);
}

216 
217 void
218 armv8_aes_decrypt_xts(AES_key_t *data_schedule,
219     const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to,
220     const uint8_t iv[static AES_BLOCK_LEN])
221 {
222 
223 	armv8_aes_crypt_xts(data_schedule->aes_rounds,
224 	    (const void *)&data_schedule->aes_key, tweak_schedule, len, from,
225 	    to,iv, 0);
226 
227 }
228 
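/*
 * Increment the big-endian CTR-mode counter in place, propagating the
 * carry from the least-significant (last) byte upward.
 */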
#define	AES_INC_COUNTER(counter)				\
	do {							\
		for (int pos = AES_BLOCK_LEN - 1;		\
		     pos >= 0; pos--)				\
			if (++(counter)[pos])			\
				break;				\
	} while (0)

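/*
 * Per-call GCM working state: EK0 is E_k(J0) used to mask the tag, EKi
 * holds the current keystream block, Xi is the GHASH accumulator,
 * lenblock is the final AAD/payload length block, and aes_counter is
 * the CTR-mode counter block.
 */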
struct armv8_gcm_state {
	__uint128_val_t EK0;
	__uint128_val_t EKi;
	__uint128_val_t Xi;
	__uint128_val_t lenblock;
	uint8_t aes_counter[AES_BLOCK_LEN];
};

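/*
 * AES-GCM encrypt: build J0 from the 96-bit IV with a counter of 1,
 * GHASH the AAD (zero-padding any partial block), CTR-encrypt the
 * payload while GHASHing the ciphertext, fold in the length block and
 * mask with E_k(J0) to produce the tag.  The working state is wiped
 * before returning.
 */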
void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    const uint8_t *from, uint8_t *to,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	const uint64_t *from64;
	uint64_t *to64;
	uint8_t block[AES_BLOCK_LEN];
	size_t i, trailer;

	bzero(&s.aes_counter, AES_BLOCK_LEN);
	memcpy(s.aes_counter, iv, AES_GCM_IV_LEN);

	/* Setup the counter */
	s.aes_counter[AES_BLOCK_LEN - 1] = 1;

	/* EK0 for a final GMAC round */
	aes_v8_encrypt(s.aes_counter, s.EK0.c, aes_key);

	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
	s.aes_counter[AES_BLOCK_LEN - 1] = 2;

	memset(s.Xi.c, 0, sizeof(s.Xi.c));
	trailer = authdatalen % AES_BLOCK_LEN;
	if (authdatalen - trailer > 0) {
		gcm_ghash_v8(s.Xi.u, Htable, authdata, authdatalen - trailer);
		authdata += authdatalen - trailer;
	}
	if (trailer > 0 || authdatalen == 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, authdata, trailer);
		gcm_ghash_v8(s.Xi.u, Htable, block, AES_BLOCK_LEN);
	}

	from64 = (const uint64_t *)from;
	to64 = (uint64_t *)to;
	trailer = len % AES_BLOCK_LEN;

	for (i = 0; i < (len - trailer); i += AES_BLOCK_LEN) {
		aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
		AES_INC_COUNTER(s.aes_counter);
		to64[0] = from64[0] ^ s.EKi.u[0];
		to64[1] = from64[1] ^ s.EKi.u[1];
		gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64, AES_BLOCK_LEN);

		to64 += 2;
		from64 += 2;
	}

	to += (len - trailer);
	from += (len - trailer);

	if (trailer) {
		aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
		AES_INC_COUNTER(s.aes_counter);
		memset(block, 0, sizeof(block));
		for (i = 0; i < trailer; i++) {
			block[i] = to[i] = from[i] ^ s.EKi.c[i];
		}

		gcm_ghash_v8(s.Xi.u, Htable, block, AES_BLOCK_LEN);
	}

	/* Lengths block */
	s.lenblock.u[0] = s.lenblock.u[1] = 0;
	s.lenblock.d[1] = htobe32(authdatalen * 8);
	s.lenblock.d[3] = htobe32(len * 8);
	gcm_ghash_v8(s.Xi.u, Htable, s.lenblock.c, AES_BLOCK_LEN);

	s.Xi.u[0] ^= s.EK0.u[0];
	s.Xi.u[1] ^= s.EK0.u[1];
	memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);

	explicit_bzero(&s, sizeof(s));
}

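/*
 * AES-GCM decrypt: GHASH the AAD and the ciphertext, recompute the tag
 * and compare it in constant time against the supplied tag.  The
 * payload is CTR-decrypted only when the tags match; otherwise EBADMSG
 * is returned and the output is left untouched.  The working state is
 * wiped in either case.
 */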
int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    const uint8_t *from, uint8_t *to,
    size_t authdatalen, const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	const uint64_t *from64;
	uint64_t *to64;
	uint8_t block[AES_BLOCK_LEN];
	size_t i, trailer;
	int error;

	error = 0;
	bzero(&s.aes_counter, AES_BLOCK_LEN);
	memcpy(s.aes_counter, iv, AES_GCM_IV_LEN);

	/* Setup the counter */
	s.aes_counter[AES_BLOCK_LEN - 1] = 1;

	/* EK0 for a final GMAC round */
	aes_v8_encrypt(s.aes_counter, s.EK0.c, aes_key);

	memset(s.Xi.c, 0, sizeof(s.Xi.c));
	trailer = authdatalen % AES_BLOCK_LEN;
	if (authdatalen - trailer > 0) {
		gcm_ghash_v8(s.Xi.u, Htable, authdata, authdatalen - trailer);
		authdata += authdatalen - trailer;
	}
	if (trailer > 0 || authdatalen == 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, authdata, trailer);
		gcm_ghash_v8(s.Xi.u, Htable, block, AES_BLOCK_LEN);
	}

	trailer = len % AES_BLOCK_LEN;
	if (len - trailer > 0)
		gcm_ghash_v8(s.Xi.u, Htable, from, len - trailer);
	if (trailer > 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, from + len - trailer, trailer);
		gcm_ghash_v8(s.Xi.u, Htable, block, AES_BLOCK_LEN);
	}

	/* Lengths block */
	s.lenblock.u[0] = s.lenblock.u[1] = 0;
	s.lenblock.d[1] = htobe32(authdatalen * 8);
	s.lenblock.d[3] = htobe32(len * 8);
	gcm_ghash_v8(s.Xi.u, Htable, s.lenblock.c, AES_BLOCK_LEN);

	s.Xi.u[0] ^= s.EK0.u[0];
	s.Xi.u[1] ^= s.EK0.u[1];
	if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
		error = EBADMSG;
		goto out;
	}

	/* GCM starts with 2 as counter, 1 is used for final xor of tag. */
	s.aes_counter[AES_BLOCK_LEN - 1] = 2;

	from64 = (const uint64_t *)from;
	to64 = (uint64_t *)to;

	for (i = 0; i < (len - trailer); i += AES_BLOCK_LEN) {
		aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
		AES_INC_COUNTER(s.aes_counter);
		to64[0] = from64[0] ^ s.EKi.u[0];
		to64[1] = from64[1] ^ s.EKi.u[1];
		to64 += 2;
		from64 += 2;
	}

	to += (len - trailer);
	from += (len - trailer);

	if (trailer) {
		aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
		AES_INC_COUNTER(s.aes_counter);
		for (i = 0; i < trailer; i++)
			to[i] = from[i] ^ s.EKi.c[i];
	}

out:
	explicit_bzero(&s, sizeof(s));
	return (error);
}