xref: /illumos-gate/usr/src/common/crypto/arcfour/sun4v/arcfour_crypt.c (revision 35a5a3587fd94b666239c157d3722745250ccbd7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "../arcfour.h"
27 
28 /* Initialize the key stream 'key' using the key value */
29 void
30 arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
31 {
32 /* EXPORT DELETE START */
33 
34 	uchar_t ext_keyval[256];
35 	uchar_t tmp;
36 	int i, j;
37 
38 	for (i = j = 0; i < 256; i++, j++) {
39 		if (j == keyvallen)
40 			j = 0;
41 
42 		ext_keyval[i] = keyval[j];
43 	}
44 	for (i = 0; i < 256; i++)
45 		key->arr[i] = (uchar_t)i;
46 
47 	j = 0;
48 	for (i = 0; i < 256; i++) {
49 		j = (j + key->arr[i] + ext_keyval[i]) % 256;
50 		tmp = key->arr[i];
51 		key->arr[i] = key->arr[j];
52 		key->arr[j] = tmp;
53 	}
54 	key->i = 0;
55 	key->j = 0;
56 
57 /* EXPORT DELETE END */
58 }
59 
60 
/*
 * Encipher 'in' using 'key', writing the result to 'out'.
 * 'in' and 'out' can point to the same location.
 *
 * key	- cipher state (permutation array 'arr' plus the indices 'i' and
 *	  'j').  It is updated in place, so a later call continues the key
 *	  stream where this one stopped.
 * in	- input bytes.
 * out	- output buffer; receives 'len' bytes.
 * len	- number of bytes to process.
 *
 * Bytes are first processed one at a time until 'in' is 8-byte aligned.
 * The remainder is handled by a simple byte-at-a-time loop, except on
 * sun4v when 'out' is 8-byte aligned as well: an unrolled loop then
 * generates eight key stream bytes per iteration and applies them with a
 * single 8-byte load, XOR and store.
 */
void
arcfour_crypt(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
{
	size_t ii;
	unsigned long long in0, merge = 0, merge0 = 0, merge1, mask = 0;
	uchar_t i, j, *base, jj, *base1, tmp;
	unsigned int tmp0, tmp1, i_accum, shift = 0, i1;


/* EXPORT DELETE START */
	int index;

	base = key->arr;

	/* Offset of 'in' within its 8-byte word */
	index = (((uintptr_t)in) & 0x7);

	/* Get the 'in' on an 8-byte alignment */
	if (index > 0) {
		i = key->i;
		j = key->j;

		/*
		 * Process (8 - index) bytes one at a time, or fewer if the
		 * data runs out first.  'in', 'out' and 'len' are advanced
		 * so the code below only sees the remaining data.
		 */
		for (index = 8 - index; (index-- > 0) && len > 0;
		    len--, in++, out++) {

			i = i + 1;
			j = j + key->arr[i];
			/* swap arr[i] and arr[j] */
			tmp = key->arr[i];
			key->arr[i] = key->arr[j];
			key->arr[j] = tmp;
			/* tmp is a uchar_t, so the sum wraps modulo 256 */
			tmp = key->arr[i] + key->arr[j];
			*out = *in ^ key->arr[tmp];
		}
		key->i = i;
		key->j = j;

	}
	/* Everything was consumed while aligning 'in' */
	if (len == 0)
		return;

	/* See if we're fortunate and 'out' got aligned as well */


	/*
	 * On sun4v the byte-at-a-time loop directly below is used only when
	 * 'out' is not on an 8-byte boundary; otherwise the Niagara
	 * optimized version in the else branch is used, which requires the
	 * input and output buffers to be aligned on a multiple of 8 bytes.
	 * On other platforms the byte-at-a-time loop is always used.
	 */
#ifdef	sun4v
	if ((((uintptr_t)out) & 7) != 0) {
#endif	/* sun4v */
		i = key->i;
		j = key->j;
		for (ii = 0; ii < len; ii++) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];
			/* swap [i] and [j] */
			base[i] = tmp1;
			base[j] = tmp0;
			/* key stream byte is base[([i] + [j]) % 256] */
			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;
			out[ii] = in[ii] ^ base[tmp0];
		}
		key->i = i;
		key->j = j;
#ifdef	sun4v
	} else {
		i = key->i;
		j = key->j;

		/*
		 * Want to align base[i] on a 2B boundary -- allows updates
		 * via [i] to be performed in 2B chunks (reducing # of stores).
		 * Requires appropriate alias detection.
		 *
		 * When i is even, one byte is generated here so that the
		 * first index used inside the loop (i + 1) is even.  Its key
		 * stream byte is parked in the top byte of merge0, and
		 * shift/mask are set so that every 8-byte chunk below is
		 * emitted one byte late.
		 *
		 * NOTE(review): the 2B stores below assume that an even
		 * index into key->arr is 2-byte aligned -- confirm against
		 * the ARCFour_key definition and how keys are allocated.
		 */

		if (((i+1) % 2) != 0) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];

			base[i] = tmp1;
			base[j] = tmp0;

			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;

			merge0 = (unsigned long long)(base[tmp0]) << 56;
			shift = 8; mask = 0xff;
		}

		/*
		 * Note - the key stream may now be one byte ahead of in and
		 * out - as [out] is updated in 8B chunks the loop needs to
		 * handle this possibility.  Also could have a 1B overrun.
		 * Need to drop out of the loop early as a result: the bound
		 * below always leaves at least one byte for the code after
		 * the loop.
		 *
		 * i1 is the working copy of i; it is masked back into the
		 * range 0..255 at the end of every iteration.
		 */

		for (ii = 0, i1 = i; ii < ((len-1)  & (~7));
		    ii += 8, i1 = i1&0xff) {

			/*
			 * If i1 is less than 248, the eight increments below
			 * cannot wrap around (i % 256), so don't need to
			 * bother with masking i after each increment
			 */
			if (i1 < 248) {

				/* BYTE 0 */
				i1 = (i1 + 1);

				/*
				 * Creating this base pointer reduces subsequent
				 * arithmetic ops required to load [i]
				 *
				 * N.B. don't need to check if [j] aliases.
				 * [i] and [j] end up with the same values
				 * anyway.
				 */
				base1 = &base[i1];

				tmp0 = base1[0];
				j = j + tmp0;

				tmp1 = base[j];
				/*
				 * Don't store [i] yet; its new value is kept
				 * in i_accum until BYTE 1 has been examined.
				 */
				i_accum = tmp1;
				base[j] = tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				/*
				 * Check [tmp0] doesn't alias with [i]: as
				 * [i] has not been stored yet, its pending
				 * value must be taken from i_accum instead.
				 */

				/*
				 * Updating [out] in 8B chunks: the key
				 * stream bytes are collected in merge, with
				 * BYTE 0 in the most significant byte.
				 */
				if (i1 == tmp0) {
					merge =
					    (unsigned long long)(i_accum) << 56;
				} else {
					merge =
					    (unsigned long long)(base[tmp0]) <<
					    56;
				}

				/* BYTE 1 */
				tmp0 = base1[1];

				j = j + tmp0;

				/*
				 * [j] can now alias with [i] and [i-1]
				 * If alias abort speculation
				 *
				 * i1 is even here, so (i1 ^ j) < 2 is true
				 * exactly when j is i1 or i1 + 1.
				 */
				if ((i1 ^ j) < 2) {
					/* Store the pending BYTE 0 value */
					base1[0] = i_accum;

					tmp1 = base[j];

					base1[1] = tmp1;
					base[j] = tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |= (unsigned long long)
					    (base[tmp0]) << 48;
				} else {

					tmp1 = base[j];

					/*
					 * Combine the new values for the
					 * BYTE 0 and BYTE 1 slots.
					 */
					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/*
					 * Speculation succeeded! Update [i]
					 * in 2B chunk
					 *
					 * NOTE(review): placing the BYTE 0
					 * value in the high half relies on
					 * big-endian byte order -- confirm.
					 */
					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[i1]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    48;
				}


				/*
				 * Too expensive to perform [i] speculation for
				 * every byte. Just need to reduce frequency
				 * of stores until store buffer full stalls
				 * are not the bottleneck.
				 */

				/* BYTE 2 */
				tmp0 = base1[2];
				j = j + tmp0;
				tmp1 = base[j];
				base1[2] = tmp1;
				base[j] = tmp0;
				tmp1 += tmp0;
				tmp1 = tmp1 & 0xff;
				merge |= (unsigned long long)(base[tmp1]) << 40;

				/* BYTE 3 */
				tmp0 = base1[3];
				j = j + tmp0;
				tmp1 = base[j];
				base1[3] = tmp1;
				base[j] = tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 32;

				/* BYTE 4 */
				tmp0 = base1[4];
				j = j + tmp0;
				tmp1 = base[j];
				base1[4] = tmp1;
				base[j] = tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 24;

				/* BYTE 5 */
				tmp0 = base1[5];
				j = j + tmp0;
				tmp1 = base[j];
				base1[5] = tmp1;
				base[j] = tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 16;

				/* BYTE 6 */
				/* i1 now indexes the BYTE 6 slot, base1[6] */
				i1 = (i1+6);
				tmp0 = base1[6];
				j = j + tmp0;
				tmp1 = base[j];
				/* Store to [i] is deferred again */
				i_accum = tmp1;
				base[j] = tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				/* Same [tmp0]/[i] alias check as for BYTE 0 */
				if (i1 == tmp0) {
					merge |=
					    (unsigned long long)(i_accum) << 8;
				} else {
					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    8;
				}

				/* BYTE 7 */
				tmp0 = base1[7];

				/*
				 * Perform [i] speculation again. Identical
				 * to that performed for BYTE0 and BYTE1.
				 */
				j = j + tmp0;
				if ((i1 ^ j) < 2) {
					base1[6] = i_accum;
					tmp1 = base[j];

					base1[7] = tmp1;
					base[j] = tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |=
					    (unsigned long long)(base[tmp0]);

				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[i1]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]);
				}
				/* Leave i1 at the index used for BYTE 7 */
				i1++;
			} else {
				/*
				 * i is too close to wrap-around to allow
				 * masking to be disregarded
				 */

				/*
				 * Same old speculation for BYTE 0 and BYTE 1
				 */

				/* BYTE 0 */
				i1 = (i1 + 1) & 0xff;
				/* jj remembers the index of the deferred store */
				jj = i1;

				tmp0 = base[i1];
				j = j + tmp0;

				tmp1 = base[j];
				i_accum = tmp1;
				base[j] = tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge =
					    (unsigned long long)(i_accum) << 56;
				} else {
					merge =
					    (unsigned long long)(base[tmp0]) <<
					    56;
				}

				/* BYTE 1 */
				tmp0 = base[i1+1];

				j = j + tmp0;

				if ((jj ^ j) < 2) {
					/* Alias: store the two bytes singly */
					base[jj] = i_accum;

					tmp1 = base[j];

					base[i1+1] = tmp1;
					base[j] = tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    48;
				} else {

					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[jj]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    48;
				}

				/* BYTE 2 */
				/*
				 * The index used for BYTE 0 is even (to
				 * satisfy alignment), so i can only wrap
				 * around when stepping to an even-numbered
				 * byte. So just need to perform the mask
				 * every 2nd byte
				 */
				i1 = (i1 + 2) & 0xff;
				tmp0 = base[i1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1] = tmp1;
				base[j] = tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 40;

				/* BYTE 3 */
				tmp0 = base[i1+1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1+1] = tmp1;
				base[j] = tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 32;

				/* BYTE 4 */
				i1 = (i1 + 2) & 0xff;
				tmp0 = base[i1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1] = tmp1;
				base[j] = tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 24;

				/* BYTE 5 */
				tmp0 = base[i1+1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1+1] = tmp1;
				base[j] = tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 16;

				/* BYTE 6 */
				i1 = (i1+2) &0xff;
				jj = i1;
				tmp0 = base[i1];

				j = j + tmp0;

				tmp1 = base[j];
				/* Deferred store, as for BYTE 0 */
				i_accum = tmp1;
				base[j] = tmp0;


				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge |=
					    (unsigned long long)(i_accum) << 8;
				} else {
					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    8;
				}

				/* BYTE 7 */
				i1++;
				tmp0 = base[i1];

				j = j + tmp0;
				if ((jj ^ j) < 2) {
					base[jj] = i_accum;
					tmp1 = base[j];

					base[i1] = tmp1;
					base[j] = tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |=
					    (unsigned long long)(base[tmp0]);

				} else {

					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[jj]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]);
				}
			}

			/*
			 * Perform update to [out]
			 * Remember could be alignment issues
			 *
			 * merge1 is the byte carried over from the previous
			 * chunk (merge0) followed by the leading bytes of
			 * this chunk; the byte selected by mask is carried
			 * over to the next chunk in merge0.  With shift and
			 * mask both 0, merge1 is simply merge.
			 *
			 * NOTE(review): lining merge1 up with the data read
			 * from in[] relies on big-endian byte order --
			 * confirm.
			 */
			/* LINTED E_BAD_PTR_CAST_ALIGN */
			in0 = *((unsigned long long *) (&in[ii]));

			merge1 = merge0 | (merge >> shift);

			merge0 = (merge & mask) << 56;

			in0 = in0 ^ merge1;

			/* LINTED E_BAD_PTR_CAST_ALIGN */
			*((unsigned long long *) (&out[ii])) = in0;
		}

		i = i1;

		/*
		 * Handle any overrun: if a byte was generated ahead of the
		 * loop, one key stream byte is still pending in the top
		 * byte of merge0 and is applied here.
		 */
		if (shift) {
			out[ii] = in[ii] ^ (merge0 >> 56);
			ii++;
		}

		/*
		 * Handle final few bytes, one at a time
		 */
		for (; ii < len; ii++) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];

			base[i] = tmp1;
			base[j] = tmp0;

			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;
			out[ii] = in[ii] ^ base[tmp0];
		}
		key->i = i;
		key->j = j;
	}
#endif /* sun4v */

/* EXPORT DELETE END */
}
605