xref: /illumos-gate/usr/src/common/crypto/arcfour/sun4v/arcfour_crypt.c (revision c2b09db8b5b01162dadf9205ddd83ccf4f7d5535)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include "../arcfour.h"
27 
28 /* Initialize the key stream 'key' using the key value */
29 void
30 arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
31 {
32 	uchar_t ext_keyval[256];
33 	uchar_t tmp;
34 	int i, j;
35 
36 	for (i = j = 0; i < 256; i++, j++) {
37 		if (j == keyvallen)
38 			j = 0;
39 
40 		ext_keyval[i] = keyval[j];
41 	}
42 	for (i = 0; i < 256; i++)
43 		key->arr[i] = (uchar_t)i;
44 
45 	j = 0;
46 	for (i = 0; i < 256; i++) {
47 		j = (j + key->arr[i] + ext_keyval[i]) % 256;
48 		tmp = key->arr[i];
49 		key->arr[i] = key->arr[j];
50 		key->arr[j] = tmp;
51 	}
52 	key->i = 0;
53 	key->j = 0;
54 }
55 
56 
/*
 * Encipher 'in' using 'key'.
 *
 *	key - RC4 key stream state; advanced by 'len' positions
 *	in  - input buffer, 'len' bytes
 *	out - output buffer; may point to the same location as 'in'
 *	len - number of bytes to process
 *
 * Each output byte is the corresponding input byte XORed with the next
 * RC4 key-stream byte.  On sun4v an unrolled path processes 8 bytes per
 * iteration when both buffers can be brought to 8-byte alignment.
 */
void
arcfour_crypt(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
{
	size_t ii;
	unsigned long long in0, merge = 0, merge0 = 0, merge1, mask = 0;
	uchar_t i, j, *base, jj, *base1, tmp;
	unsigned int tmp0, tmp1, i_accum, shift = 0, i1;

	int index;

	base = key->arr;

	index = (((uintptr_t)in) & 0x7);

	/*
	 * Get 'in' onto an 8-byte alignment by consuming up to 7 leading
	 * bytes with the plain byte-at-a-time algorithm.  i and j wrap
	 * modulo 256 for free since they are uchar_t.
	 */
	if (index > 0) {
		i = key->i;
		j = key->j;

		for (index = 8 - index; (index-- > 0) && len > 0;
		    len--, in++, out++) {

			i = i + 1;
			j = j + key->arr[i];
			tmp = key->arr[i];
			key->arr[i] = key->arr[j];
			key->arr[j] = tmp;
			tmp = key->arr[i] + key->arr[j];
			*out = *in ^ key->arr[tmp];
		}
		key->i = i;
		key->j = j;

	}
	if (len == 0)
		return;

	/* See if we're fortunate and 'out' got aligned as well */


	/*
	 * Niagara-optimized version for the case where the input and
	 * output buffers are both aligned on a multiple-of-8-byte
	 * boundary.  On non-sun4v builds only the generic loop below is
	 * compiled in.
	 */
#ifdef	sun4v
	if ((((uintptr_t)out) & 7) != 0) {
#endif	/* sun4v */
		/* Generic byte-at-a-time RC4 loop */
		i = key->i;
		j = key->j;
		for (ii = 0; ii < len; ii++) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];
			base[i] = (uchar_t)tmp1;
			base[j] = (uchar_t)tmp0;
			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;
			out[ii] = in[ii] ^ base[tmp0];
		}
		key->i = i;
		key->j = j;
#ifdef	sun4v
	} else {
		i = key->i;
		j = key->j;

		/*
		 * Want to align base[i] on a 2B boundary -- allows updates
		 * via [i] to be performed in 2B chunks (reducing # of stores).
		 * Requires appropriate alias detection.
		 */

		if (((i+1) % 2) != 0) {
			/* Consume one byte so i becomes even for the loop */
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];

			base[i] = (uchar_t)tmp1;
			base[j] = (uchar_t)tmp0;

			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;

			/*
			 * Stage this key-stream byte in merge0; the main
			 * loop shifts subsequent bytes by 'shift' so the
			 * 8-byte output stores stay aligned.
			 */
			merge0 = (unsigned long long)(base[tmp0]) << 56;
			shift = 8; mask = 0xff;
		}

		/*
		 * Note - in and out may now be misaligned (by the one byte
		 * above) - updating [out] in 8B chunks needs to handle this
		 * possibility.  Could also have a 1B overrun, so drop out
		 * of the loop one chunk early as a result.
		 */

		for (ii = 0, i1 = i; ii < ((len-1)  & (~7));
		    ii += 8, i1 = i1&0xff) {

			/*
			 * If i1 is less than 248, we know it won't wrap
			 * around (i % 256) within this 8-byte group, so we
			 * don't need to bother with masking i1 after each
			 * increment.
			 */
			if (i1 < 248) {

				/* BYTE 0 */
				i1 = (i1 + 1);

				/*
				 * Creating this base pointer reduces
				 * subsequent arithmetic ops required to
				 * load [i]
				 *
				 * N.B. don't need to check if [j] aliases.
				 * [i] and [j] end up with the same values
				 * anyway.
				 */
				base1 = &base[i1];

				tmp0 = base1[0];
				j = j + tmp0;

				tmp1 = base[j];
				/*
				 * Don't store [i] yet; keep the value in
				 * i_accum so we can merge it with the next
				 * byte into one 2B store.
				 */
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				/*
				 * Check [tmp0] doesn't alias with [i]
				 * (whose store is still pending in i_accum)
				 */

				/*
				 * Updating [out] in 8B chunks
				 */
				if (i1 == tmp0) {
					merge =
					    (unsigned long long)(i_accum) << 56;
				} else {
					merge =
					    (unsigned long long)(base[tmp0]) <<
					    56;
				}

				/* BYTE 1 */
				tmp0 = base1[1];

				j = j + tmp0;

				/*
				 * [j] can now alias with [i] and [i-1].
				 * If they alias, abort the speculation and
				 * fall back to individual byte stores.
				 */
				if ((i1 ^ j) < 2) {
					base1[0] = (uchar_t)i_accum;

					tmp1 = base[j];

					base1[1] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |= (unsigned long long)
					    (base[tmp0]) << 48;
				} else {

					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/*
					 * Speculation succeeded!  Update [i]
					 * in a 2B chunk.
					 */
					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[i1]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    48;
				}


				/*
				 * Too expensive to perform [i] speculation for
				 * every byte. Just need to reduce frequency
				 * of stores until store buffer full stalls
				 * are not the bottleneck.
				 */

				/* BYTE 2 */
				tmp0 = base1[2];
				j = j + tmp0;
				tmp1 = base[j];
				base1[2] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp1 += tmp0;
				tmp1 = tmp1 & 0xff;
				merge |= (unsigned long long)(base[tmp1]) << 40;

				/* BYTE 3 */
				tmp0 = base1[3];
				j = j + tmp0;
				tmp1 = base[j];
				base1[3] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 32;

				/* BYTE 4 */
				tmp0 = base1[4];
				j = j + tmp0;
				tmp1 = base[j];
				base1[4] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 24;

				/* BYTE 5 */
				tmp0 = base1[5];
				j = j + tmp0;
				tmp1 = base[j];
				base1[5] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 16;

				/* BYTE 6 */
				i1 = (i1+6);
				tmp0 = base1[6];
				j = j + tmp0;
				tmp1 = base[j];
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge |=
					    (unsigned long long)(i_accum) << 8;
				} else {
					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    8;
				}

				/* BYTE 7 */
				tmp0 = base1[7];

				/*
				 * Perform [i] speculation again.  Identical
				 * to that performed for BYTE0 and BYTE1.
				 */
				j = j + tmp0;
				if ((i1 ^ j) < 2) {
					base1[6] = (uchar_t)i_accum;
					tmp1 = base[j];

					base1[7] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |=
					    (unsigned long long)(base[tmp0]);

				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[i1]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]);
				}
				i1++;
			} else {
				/*
				 * i is too close to wrap-around to allow
				 * masking to be disregarded
				 */

				/*
				 * Same old speculation for BYTE 0 and BYTE 1
				 */

				/* BYTE 0 */
				i1 = (i1 + 1) & 0xff;
				jj = (uchar_t)i1;

				tmp0 = base[i1];
				j = j + tmp0;

				tmp1 = base[j];
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge =
					    (unsigned long long)(i_accum) << 56;
				} else {
					merge =
					    (unsigned long long)(base[tmp0]) <<
					    56;
				}

				/* BYTE 1 */
				tmp0 = base[i1+1];

				j = j + tmp0;

				if ((jj ^ j) < 2) {
					base[jj] = (uchar_t)i_accum;

					tmp1 = base[j];

					base[i1+1] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    48;
				} else {

					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[jj]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    48;
				}

				/* BYTE 2 */
				/*
				 * Since i must be even when we enter the loop
				 * (to satisfy alignment), it can only wrap
				 * around on the even bytes.  So we just need
				 * to perform the mask on every 2nd byte.
				 */
				i1 = (i1 + 2) & 0xff;
				tmp0 = base[i1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 40;

				/* BYTE 3 */
				tmp0 = base[i1+1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1+1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 32;

				/* BYTE 4 */
				i1 = (i1 + 2) & 0xff;
				tmp0 = base[i1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 24;

				/* BYTE 5 */
				tmp0 = base[i1+1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1+1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 16;

				/* BYTE 6 */
				i1 = (i1+2) &0xff;
				jj = (uchar_t)i1;
				tmp0 = base[i1];

				j = j + tmp0;

				tmp1 = base[j];
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;


				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge |=
					    (unsigned long long)(i_accum) << 8;
				} else {
					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    8;
				}

				/* BYTE 7 */
				i1++;
				tmp0 = base[i1];

				j = j + tmp0;
				if ((jj ^ j) < 2) {
					base[jj] = (uchar_t)i_accum;
					tmp1 = base[j];

					base[i1] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |=
					    (unsigned long long)(base[tmp0]);

				} else {

					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[jj]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]);
				}
			}

			/*
			 * Perform the update to [out].
			 * Remember there could be alignment issues: merge0
			 * carries the byte left over from the previous
			 * iteration when shift != 0.
			 */
			/* LINTED E_BAD_PTR_CAST_ALIGN */
			in0 = *((unsigned long long *) (&in[ii]));

			merge1 = merge0 | (merge >> shift);

			merge0 = (merge & mask) << 56;

			in0 = in0 ^ merge1;

			/* LINTED E_BAD_PTR_CAST_ALIGN */
			*((unsigned long long *) (&out[ii])) = in0;
		}

		i = (uchar_t)i1;

		/*
		 * Handle any overrun: if we staged an extra key-stream byte
		 * in merge0 (shift != 0), emit it now.
		 */
		if (shift) {
			out[ii] = in[ii] ^ (merge0 >> 56);
			ii++;
		}

		/*
		 * Handle the final few bytes with the byte-at-a-time loop.
		 */
		for (; ii < len; ii++) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];

			base[i] = (uchar_t)tmp1;
			base[j] = (uchar_t)tmp0;

			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;
			out[ii] = in[ii] ^ base[tmp0];
		}
		key->i = i;
		key->j = j;
	}
#endif /* sun4v */
}
597