xref: /titanic_44/usr/src/common/crypto/arcfour/sun4v/arcfour_crypt.c (revision 21bf64a78855d076f09716ea1c06175d954e934c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include "../arcfour.h"
29 
30 /* Initialize the key stream 'key' using the key value */
31 void
32 arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
33 {
34 /* EXPORT DELETE START */
35 
36 	uchar_t ext_keyval[256];
37 	uchar_t tmp;
38 	int i, j;
39 
40 	for (i = j = 0; i < 256; i++, j++) {
41 		if (j == keyvallen)
42 			j = 0;
43 
44 		ext_keyval[i] = keyval[j];
45 	}
46 	for (i = 0; i < 256; i++)
47 		key->arr[i] = (uchar_t)i;
48 
49 	j = 0;
50 	for (i = 0; i < 256; i++) {
51 		j = (j + key->arr[i] + ext_keyval[i]) % 256;
52 		tmp = key->arr[i];
53 		key->arr[i] = key->arr[j];
54 		key->arr[j] = tmp;
55 	}
56 	key->i = 0;
57 	key->j = 0;
58 
59 /* EXPORT DELETE END */
60 }
61 
62 
63 /*
64  * Encipher 'in' using 'key.
65  * in and out can point to the same location
66  */
67 void
68 arcfour_crypt(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
69 {
70 	size_t ii, it;
71 	unsigned long long in0, out0, merge = 0, merge0 = 0, merge1, mask = 0;
72 	uchar_t i, j, *base, jj, *base1, tmp;
73 	unsigned int tmp0, tmp1, i_accum, count = 0, shift = 0, i1;
74 
75 
76 /* EXPORT DELETE START */
77 	int index;
78 
79 	base = key->arr;
80 
81 	index = (((uintptr_t)in) & 0x7);
82 
83 	/* Get the 'in' on an 8-byte alignment */
84 	if (index > 0) {
85 		i = key->i;
86 		j = key->j;
87 
88 		for (index = 8 - index; (index-- > 0) && len > 0;
89 		    len--, in++, out++) {
90 
91 			i = i + 1;
92 			j = j + key->arr[i];
93 			tmp = key->arr[i];
94 			key->arr[i] = key->arr[j];
95 			key->arr[j] = tmp;
96 			tmp = key->arr[i] + key->arr[j];
97 			*out = *in ^ key->arr[tmp];
98 		}
99 		key->i = i;
100 		key->j = j;
101 
102 	}
103 	if (len == 0)
104 		return;
105 
106 	/* See if we're fortunate and 'out' got aligned as well */
107 
108 
109 	/*
110 	 * Niagara optimized version for
111 	 * the cases where the input and output  buffers are aligned on
112 	 * a multiple of 8-byte boundary.
113 	 */
114 #ifdef	sun4v
115 	if ((((uintptr_t)out) & 7) != 0) {
116 #endif	/* sun4v */
117 		i = key->i;
118 		j = key->j;
119 		for (ii = 0; ii < len; ii++) {
120 			i = i + 1;
121 			tmp0 = base[i];
122 			j = j + tmp0;
123 			tmp1 = base[j];
124 			base[i] = tmp1;
125 			base[j] = tmp0;
126 			tmp0 += tmp1;
127 			tmp0 = tmp0 & 0xff;
128 			out[ii] = in[ii] ^ base[tmp0];
129 		}
130 		key->i = i;
131 		key->j = j;
132 #ifdef	sun4v
133 	} else {
134 		i = key->i;
135 		j = key->j;
136 
137 		/*
138 		 * Want to align base[i] on a 2B boundary -- allows updates
139 		 * via [i] to be performed in 2B chunks (reducing # of stores).
140 		 * Requires appropriate alias detection.
141 		 */
142 
143 		if (((i+1) % 2) != 0) {
144 			i = i + 1;
145 			tmp0 = base[i];
146 			j = j + tmp0;
147 			tmp1 = base[j];
148 
149 			base[i] = tmp1;
150 			base[j] = tmp0;
151 
152 			tmp0 += tmp1;
153 			tmp0 = tmp0 & 0xff;
154 
155 			merge0 = (unsigned long long)(base[tmp0]) << 56;
156 			shift = 8; mask = 0xff;
157 		}
158 
159 		/*
160 		 * Note - in and out may now be misaligned -
161 		 * as updating [out] in 8B chunks need to handle this
162 		 * possibility. Also could have a 1B overrun.
163 		 * Need to drop out of loop early as a result.
164 		 */
165 
166 		for (ii = 0, i1 = i; ii < ((len-1)  & (~7));
167 		    ii += 8, i1 = i1&0xff) {
168 
169 			/*
170 			 * If i < less than 248, know wont wrap around
171 			 * (i % 256), so don't need to bother with masking i
172 			 * after each increment
173 			 */
174 			if (i1 < 248) {
175 
176 				/* BYTE 0 */
177 				i1 = (i1 + 1);
178 
179 				/*
180 				 * Creating this base pointer reduces subsequent
181 				 * arihmetic ops required to load [i]
182 				 *
183 				 * N.B. don't need to check if [j] aliases.
184 				 * [i] and [j] end up with the same values
185 				 * anyway.
186 				 */
187 				base1 = &base[i1];
188 
189 				tmp0 = base1[0];
190 				j = j + tmp0;
191 
192 				tmp1 = base[j];
193 				/*
194 				 * Don't store [i] yet
195 				 */
196 				i_accum = tmp1;
197 				base[j] = tmp0;
198 
199 				tmp0 += tmp1;
200 				tmp0 = tmp0 & 0xff;
201 
202 				/*
203 				 * Check [tmp0] doesn't alias with [i]
204 				 */
205 
206 				/*
207 				 * Updating [out] in 8B chunks
208 				 */
209 				if (i1 == tmp0) {
210 					merge =
211 					    (unsigned long long)(i_accum) << 56;
212 				} else {
213 					merge =
214 					    (unsigned long long)(base[tmp0]) <<
215 					    56;
216 				}
217 
218 				/* BYTE 1 */
219 				tmp0 = base1[1];
220 
221 				j = j + tmp0;
222 
223 				/*
224 				 * [j] can now alias with [i] and [i-1]
225 				 * If alias abort speculation
226 				 */
227 				if ((i1 ^ j) < 2) {
228 					base1[0] = i_accum;
229 
230 					tmp1 = base[j];
231 
232 					base1[1] = tmp1;
233 					base[j] = tmp0;
234 
235 					tmp0 += tmp1;
236 					tmp0 = tmp0 & 0xff;
237 
238 					merge |= (unsigned long long)
239 					    (base[tmp0]) << 48;
240 				} else {
241 
242 					tmp1 = base[j];
243 
244 					i_accum = i_accum << 8;
245 					i_accum |= tmp1;
246 
247 					base[j] = tmp0;
248 
249 					tmp0 += tmp1;
250 					tmp0 = tmp0 & 0xff;
251 
252 					/*
253 					 * Speculation suceeded! Update [i]
254 					 * in 2B chunk
255 					 */
256 					*((unsigned short *) &base[i1]) =
257 					    i_accum;
258 
259 					merge |=
260 					    (unsigned long long)(base[tmp0]) <<
261 					    48;
262 				}
263 
264 
265 				/*
266 				 * Too expensive to perform [i] speculation for
267 				 * every byte. Just need to reduce frequency
268 				 * of stores until store buffer full stalls
269 				 * are not the bottleneck.
270 				 */
271 
272 				/* BYTE 2 */
273 				tmp0 = base1[2];
274 				j = j + tmp0;
275 				tmp1 = base[j];
276 				base1[2] = tmp1;
277 				base[j] = tmp0;
278 				tmp1 += tmp0;
279 				tmp1 = tmp1 & 0xff;
280 				merge |= (unsigned long long)(base[tmp1]) << 40;
281 
282 				/* BYTE 3 */
283 				tmp0 = base1[3];
284 				j = j + tmp0;
285 				tmp1 = base[j];
286 				base1[3] = tmp1;
287 				base[j] = tmp0;
288 				tmp0 += tmp1;
289 				tmp0 = tmp0 & 0xff;
290 				merge |= (unsigned long long)(base[tmp0]) << 32;
291 
292 				/* BYTE 4 */
293 				tmp0 = base1[4];
294 				j = j + tmp0;
295 				tmp1 = base[j];
296 				base1[4] = tmp1;
297 				base[j] = tmp0;
298 				tmp0 += tmp1;
299 				tmp0 = tmp0 & 0xff;
300 				merge |= (unsigned long long)(base[tmp0]) << 24;
301 
302 				/* BYTE 5 */
303 				tmp0 = base1[5];
304 				j = j + tmp0;
305 				tmp1 = base[j];
306 				base1[5] = tmp1;
307 				base[j] = tmp0;
308 				tmp0 += tmp1;
309 				tmp0 = tmp0 & 0xff;
310 				merge |= (unsigned long long)(base[tmp0]) << 16;
311 
312 				/* BYTE 6 */
313 				i1 = (i1+6);
314 				tmp0 = base1[6];
315 				j = j + tmp0;
316 				tmp1 = base[j];
317 				i_accum = tmp1;
318 				base[j] = tmp0;
319 
320 				tmp0 += tmp1;
321 				tmp0 = tmp0 & 0xff;
322 
323 				if (i1 == tmp0) {
324 					merge |=
325 					    (unsigned long long)(i_accum) << 8;
326 				} else {
327 					merge |=
328 					    (unsigned long long)(base[tmp0]) <<
329 					    8;
330 				}
331 
332 				/* BYTE 7 */
333 				tmp0 = base1[7];
334 
335 				/*
336 				 * Perform [i] speculation again. Indentical
337 				 * to that performed for BYTE0 and BYTE1.
338 				 */
339 				j = j + tmp0;
340 				if ((i1 ^ j) < 2) {
341 					base1[6] = i_accum;
342 					tmp1 = base[j];
343 
344 					base1[7] = tmp1;
345 					base[j] = tmp0;
346 
347 					tmp0 += tmp1;
348 					tmp0 = tmp0 & 0xff;
349 
350 					merge |=
351 					    (unsigned long long)(base[tmp0]);
352 
353 				} else {
354 					tmp1 = base[j];
355 
356 					i_accum = i_accum << 8;
357 					i_accum |= tmp1;
358 
359 					base[j] = tmp0;
360 
361 					tmp0 += tmp1;
362 					tmp0 = tmp0 & 0xff;
363 
364 					*((unsigned short *) &base[i1]) =
365 					    i_accum;
366 
367 					merge |=
368 					    (unsigned long long)(base[tmp0]);
369 				}
370 				i1++;
371 			} else {
372 				/*
373 				 * i is too close to wrap-around to allow
374 				 * masking to be disregarded
375 				 */
376 
377 				/*
378 				 * Same old speculation for BYTE 0 and BYTE 1
379 				 */
380 
381 				/* BYTE 0 */
382 				i1 = (i1 + 1) & 0xff;
383 				jj = i1;
384 
385 				tmp0 = base[i1];
386 				j = j + tmp0;
387 
388 				tmp1 = base[j];
389 				i_accum = tmp1;
390 				base[j] = tmp0;
391 
392 				tmp0 += tmp1;
393 				tmp0 = tmp0 & 0xff;
394 
395 				if (i1 == tmp0) {
396 					merge =
397 					    (unsigned long long)(i_accum) << 56;
398 				} else {
399 					merge =
400 					    (unsigned long long)(base[tmp0]) <<
401 					    56;
402 				}
403 
404 				/* BYTE 1 */
405 				tmp0 = base[i1+1];
406 
407 				j = j + tmp0;
408 
409 				if ((jj ^ j) < 2) {
410 					base[jj] = i_accum;
411 
412 					tmp1 = base[j];
413 
414 					base[i1+1] = tmp1;
415 					base[j] = tmp0;
416 
417 					tmp0 += tmp1;
418 					tmp0 = tmp0 & 0xff;
419 
420 					merge |=
421 					    (unsigned long long)(base[tmp0]) <<
422 					    48;
423 				} else {
424 
425 					tmp1 = base[j];
426 
427 					i_accum = i_accum << 8;
428 					i_accum |= tmp1;
429 
430 					base[j] = tmp0;
431 
432 					tmp0 += tmp1;
433 					tmp0 = tmp0 & 0xff;
434 
435 					*((unsigned short *) &base[jj]) =
436 					    i_accum;
437 
438 					merge |=
439 					    (unsigned long long)(base[tmp0]) <<
440 					    48;
441 				}
442 
443 				/* BYTE 2 */
444 				/*
445 				 * As know i must be even when enter loop (to
446 				 * satisfy alignment), can only wrap around
447 				 * on the even bytes. So just need to perform
448 				 * mask every 2nd byte
449 				 */
450 				i1 = (i1 + 2) & 0xff;
451 				tmp0 = base[i1];
452 				j = j + tmp0;
453 				tmp1 = base[j];
454 				base[i1] = tmp1;
455 				base[j] = tmp0;
456 				tmp0 += tmp1;
457 				tmp0 = tmp0 & 0xff;
458 				merge |= (unsigned long long)(base[tmp0]) << 40;
459 
460 				/* BYTE 3 */
461 				tmp0 = base[i1+1];
462 				j = j + tmp0;
463 				tmp1 = base[j];
464 				base[i1+1] = tmp1;
465 				base[j] = tmp0;
466 				tmp0 += tmp1;
467 				tmp0 = tmp0 & 0xff;
468 				merge |= (unsigned long long)(base[tmp0]) << 32;
469 
470 				/* BYTE 4 */
471 				i1 = (i1 + 2) & 0xff;
472 				tmp0 = base[i1];
473 				j = j + tmp0;
474 				tmp1 = base[j];
475 				base[i1] = tmp1;
476 				base[j] = tmp0;
477 				tmp0 += tmp1;
478 				tmp0 = tmp0 & 0xff;
479 				merge |= (unsigned long long)(base[tmp0]) << 24;
480 
481 				/* BYTE 5 */
482 				tmp0 = base[i1+1];
483 				j = j + tmp0;
484 				tmp1 = base[j];
485 				base[i1+1] = tmp1;
486 				base[j] = tmp0;
487 				tmp0 += tmp1;
488 				tmp0 = tmp0 & 0xff;
489 				merge |= (unsigned long long)(base[tmp0]) << 16;
490 
491 				/* BYTE 6 */
492 				i1 = (i1+2) &0xff;
493 				jj = i1;
494 				tmp0 = base[i1];
495 
496 				j = j + tmp0;
497 
498 				tmp1 = base[j];
499 				i_accum = tmp1;
500 				base[j] = tmp0;
501 
502 
503 				tmp0 += tmp1;
504 				tmp0 = tmp0 & 0xff;
505 
506 				if (i1 == tmp0) {
507 					merge |=
508 					    (unsigned long long)(i_accum) << 8;
509 				} else {
510 					merge |=
511 					    (unsigned long long)(base[tmp0]) <<
512 					    8;
513 				}
514 
515 				/* BYTE 7 */
516 				i1++;
517 				tmp0 = base[i1];
518 
519 				j = j + tmp0;
520 				if ((jj ^ j) < 2) {
521 					base[jj] = i_accum;
522 					tmp1 = base[j];
523 
524 					base[i1] = tmp1;
525 					base[j] = tmp0;
526 
527 					tmp0 += tmp1;
528 					tmp0 = tmp0 & 0xff;
529 
530 					merge |=
531 					    (unsigned long long)(base[tmp0]);
532 
533 				} else {
534 					tmp1 = base[j];
535 
536 					i_accum = i_accum << 8;
537 					i_accum |= tmp1;
538 
539 					base[j] = tmp0;
540 
541 					tmp0 += tmp1;
542 					tmp0 = tmp0 & 0xff;
543 
544 					*((unsigned short *) &base[jj]) =
545 					    i_accum;
546 
547 					merge |=
548 					    (unsigned long long)(base[tmp0]);
549 				}
550 			}
551 
552 			/*
553 			 * Perform update to [out]
554 			 * Remember could be alignment issues
555 			 */
556 			in0 = *((unsigned long long *) (&in[ii]));
557 
558 			merge1 = merge0 | (merge >> shift);
559 
560 			merge0 = (merge & mask) << 56;
561 
562 			in0 = in0 ^ merge1;
563 
564 			*((unsigned long long *) (&out[ii])) = in0;
565 		}
566 
567 		i = i1;
568 
569 		/*
570 		 * Handle any overrun
571 		 */
572 		if (shift) {
573 			out[ii] = in[ii] ^ (merge0 >> 56);
574 			ii++;
575 		}
576 
577 		/*
578 		 * Handle final few bytes
579 		 */
580 		for (; ii < len; ii++) {
581 			i = i + 1;
582 			tmp0 = base[i];
583 			j = j + tmp0;
584 			tmp1 = base[j];
585 
586 			base[i] = tmp1;
587 			base[j] = tmp0;
588 
589 			tmp0 += tmp1;
590 			tmp0 = tmp0 & 0xff;
591 			out[ii] = in[ii] ^ base[tmp0];
592 		}
593 		key->i = i;
594 		key->j = j;
595 	}
596 #endif /* sun4v */
597 
598 /* EXPORT DELETE END */
599 }
600