xref: /illumos-gate/usr/src/common/crypto/arcfour/sun4v/arcfour_crypt.c (revision 814a60b13c0ad90e5d2edfd29a7a84bbf416cc1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "../arcfour.h"
30 
31 /* Initialize the key stream 'key' using the key value */
32 void
33 arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
34 {
35 /* EXPORT DELETE START */
36 
37 	uchar_t ext_keyval[256];
38 	uchar_t tmp;
39 	int i, j;
40 
41 	for (i = j = 0; i < 256; i++, j++) {
42 		if (j == keyvallen)
43 			j = 0;
44 
45 		ext_keyval[i] = keyval[j];
46 	}
47 	for (i = 0; i < 256; i++)
48 		key->arr[i] = (uchar_t)i;
49 
50 	j = 0;
51 	for (i = 0; i < 256; i++) {
52 		j = (j + key->arr[i] + ext_keyval[i]) % 256;
53 		tmp = key->arr[i];
54 		key->arr[i] = key->arr[j];
55 		key->arr[j] = tmp;
56 	}
57 	key->i = 0;
58 	key->j = 0;
59 
60 /* EXPORT DELETE END */
61 }
62 
63 
64 /*
65  * Encipher 'in' using 'key.
66  * in and out can point to the same location
67  */
68 void
69 arcfour_crypt(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
70 {
71 	size_t ii, it;
72 	unsigned long long in0, out0, merge = 0, merge0 = 0, merge1, mask = 0;
73 	uchar_t i, j, *base, jj, *base1, tmp;
74 	unsigned int tmp0, tmp1, i_accum, count = 0, shift = 0, i1;
75 
76 
77 /* EXPORT DELETE START */
78 	int index;
79 
80 	base = key->arr;
81 
82 	index = (((uint64_t)in) & 0x7);
83 
84 	/* Get the 'in' on an 8-byte alignment */
85 	if (index > 0) {
86 		i = key->i;
87 		j = key->j;
88 		for (index = 8 - (uint64_t)in & 0x7; (index-- > 0) && len > 0;
89 		    len--, in++, out++) {
90 			i = i + 1;
91 			j = j + key->arr[i];
92 			tmp = key->arr[i];
93 			key->arr[i] = key->arr[j];
94 			key->arr[j] = tmp;
95 			tmp = key->arr[i] + key->arr[j];
96 			*out = *in ^ key->arr[tmp];
97 		}
98 		key->i = i;
99 		key->j = j;
100 
101 	}
102 	if (len == 0)
103 		return;
104 
105 	/* See if we're fortunate and 'out' got aligned as well */
106 
107 
108 	/*
109 	 * Niagara optimized version for
110 	 * the cases where the input and output  buffers are aligned on
111 	 * a multiple of 8-byte boundary.
112 	 */
113 #ifdef	sun4v
114 	if ((((uint64_t)out) & 7) != 0) {
115 #endif	/* sun4v */
116 		i = key->i;
117 		j = key->j;
118 		for (ii = 0; ii < len; ii++) {
119 			i = i + 1;
120 			tmp0 = base[i];
121 			j = j + tmp0;
122 			tmp1 = base[j];
123 			base[i] = tmp1;
124 			base[j] = tmp0;
125 			tmp0 += tmp1;
126 			tmp0 = tmp0 & 0xff;
127 			out[ii] = in[ii] ^ base[tmp0];
128 		}
129 		key->i = i;
130 		key->j = j;
131 #ifdef	sun4v
132 	} else {
133 		i = key->i;
134 		j = key->j;
135 
136 		/*
137 		 * Want to align base[i] on a 2B boundary -- allows updates
138 		 * via [i] to be performed in 2B chunks (reducing # of stores).
139 		 * Requires appropriate alias detection.
140 		 */
141 
142 		if (((i+1) % 2) != 0) {
143 			i = i + 1;
144 			tmp0 = base[i];
145 			j = j + tmp0;
146 			tmp1 = base[j];
147 
148 			base[i] = tmp1;
149 			base[j] = tmp0;
150 
151 			tmp0 += tmp1;
152 			tmp0 = tmp0 & 0xff;
153 
154 			merge0 = (unsigned long long)(base[tmp0]) << 56;
155 			shift = 8; mask = 0xff;
156 		}
157 
158 		/*
159 		 * Note - in and out may now be misaligned -
160 		 * as updating [out] in 8B chunks need to handle this
161 		 * possibility. Also could have a 1B overrun.
162 		 * Need to drop out of loop early as a result.
163 		 */
164 
165 		for (ii = 0, i1 = i; ii < ((len-1)  & (~7));
166 		    ii += 8, i1 = i1&0xff) {
167 
168 			/*
169 			 * If i < less than 248, know wont wrap around
170 			 * (i % 256), so don't need to bother with masking i
171 			 * after each increment
172 			 */
173 			if (i1 < 248) {
174 
175 				/* BYTE 0 */
176 				i1 = (i1 + 1);
177 
178 				/*
179 				 * Creating this base pointer reduces subsequent
180 				 * arihmetic ops required to load [i]
181 				 *
182 				 * N.B. don't need to check if [j] aliases.
183 				 * [i] and [j] end up with the same values
184 				 * anyway.
185 				 */
186 				base1 = &base[i1];
187 
188 				tmp0 = base1[0];
189 				j = j + tmp0;
190 
191 				tmp1 = base[j];
192 				/*
193 				 * Don't store [i] yet
194 				 */
195 				i_accum = tmp1;
196 				base[j] = tmp0;
197 
198 				tmp0 += tmp1;
199 				tmp0 = tmp0 & 0xff;
200 
201 				/*
202 				 * Check [tmp0] doesn't alias with [i]
203 				 */
204 
205 				/*
206 				 * Updating [out] in 8B chunks
207 				 */
208 				if (i1 == tmp0) {
209 					merge =
210 					    (unsigned long long)(i_accum) << 56;
211 				} else {
212 					merge =
213 					    (unsigned long long)(base[tmp0]) <<
214 					    56;
215 				}
216 
217 				/* BYTE 1 */
218 				tmp0 = base1[1];
219 
220 				j = j + tmp0;
221 
222 				/*
223 				 * [j] can now alias with [i] and [i-1]
224 				 * If alias abort speculation
225 				 */
226 				if ((i1 ^ j) < 2) {
227 					base1[0] = i_accum;
228 
229 					tmp1 = base[j];
230 
231 					base1[1] = tmp1;
232 					base[j] = tmp0;
233 
234 					tmp0 += tmp1;
235 					tmp0 = tmp0 & 0xff;
236 
237 					merge |= (unsigned long long)
238 					    (base[tmp0]) << 48;
239 				} else {
240 
241 					tmp1 = base[j];
242 
243 					i_accum = i_accum << 8;
244 					i_accum |= tmp1;
245 
246 					base[j] = tmp0;
247 
248 					tmp0 += tmp1;
249 					tmp0 = tmp0 & 0xff;
250 
251 					/*
252 					 * Speculation suceeded! Update [i]
253 					 * in 2B chunk
254 					 */
255 					*((unsigned short *) &base[i1]) =
256 					    i_accum;
257 
258 					merge |=
259 					    (unsigned long long)(base[tmp0]) <<
260 					    48;
261 				}
262 
263 
264 				/*
265 				 * Too expensive to perform [i] speculation for
266 				 * every byte. Just need to reduce frequency
267 				 * of stores until store buffer full stalls
268 				 * are not the bottleneck.
269 				 */
270 
271 				/* BYTE 2 */
272 				tmp0 = base1[2];
273 				j = j + tmp0;
274 				tmp1 = base[j];
275 				base1[2] = tmp1;
276 				base[j] = tmp0;
277 				tmp1 += tmp0;
278 				tmp1 = tmp1 & 0xff;
279 				merge |= (unsigned long long)(base[tmp1]) << 40;
280 
281 				/* BYTE 3 */
282 				tmp0 = base1[3];
283 				j = j + tmp0;
284 				tmp1 = base[j];
285 				base1[3] = tmp1;
286 				base[j] = tmp0;
287 				tmp0 += tmp1;
288 				tmp0 = tmp0 & 0xff;
289 				merge |= (unsigned long long)(base[tmp0]) << 32;
290 
291 				/* BYTE 4 */
292 				tmp0 = base1[4];
293 				j = j + tmp0;
294 				tmp1 = base[j];
295 				base1[4] = tmp1;
296 				base[j] = tmp0;
297 				tmp0 += tmp1;
298 				tmp0 = tmp0 & 0xff;
299 				merge |= (unsigned long long)(base[tmp0]) << 24;
300 
301 				/* BYTE 5 */
302 				tmp0 = base1[5];
303 				j = j + tmp0;
304 				tmp1 = base[j];
305 				base1[5] = tmp1;
306 				base[j] = tmp0;
307 				tmp0 += tmp1;
308 				tmp0 = tmp0 & 0xff;
309 				merge |= (unsigned long long)(base[tmp0]) << 16;
310 
311 				/* BYTE 6 */
312 				i1 = (i1+6);
313 				tmp0 = base1[6];
314 				j = j + tmp0;
315 				tmp1 = base[j];
316 				i_accum = tmp1;
317 				base[j] = tmp0;
318 
319 				tmp0 += tmp1;
320 				tmp0 = tmp0 & 0xff;
321 
322 				if (i1 == tmp0) {
323 					merge |=
324 					    (unsigned long long)(i_accum) << 8;
325 				} else {
326 					merge |=
327 					    (unsigned long long)(base[tmp0]) <<
328 					    8;
329 				}
330 
331 				/* BYTE 7 */
332 				tmp0 = base1[7];
333 
334 				/*
335 				 * Perform [i] speculation again. Indentical
336 				 * to that performed for BYTE0 and BYTE1.
337 				 */
338 				j = j + tmp0;
339 				if ((i1 ^ j) < 2) {
340 					base1[6] = i_accum;
341 					tmp1 = base[j];
342 
343 					base1[7] = tmp1;
344 					base[j] = tmp0;
345 
346 					tmp0 += tmp1;
347 					tmp0 = tmp0 & 0xff;
348 
349 					merge |=
350 					    (unsigned long long)(base[tmp0]);
351 
352 				} else {
353 					tmp1 = base[j];
354 
355 					i_accum = i_accum << 8;
356 					i_accum |= tmp1;
357 
358 					base[j] = tmp0;
359 
360 					tmp0 += tmp1;
361 					tmp0 = tmp0 & 0xff;
362 
363 					*((unsigned short *) &base[i1]) =
364 					    i_accum;
365 
366 					merge |=
367 					    (unsigned long long)(base[tmp0]);
368 				}
369 				i1++;
370 			} else {
371 				/*
372 				 * i is too close to wrap-around to allow
373 				 * masking to be disregarded
374 				 */
375 
376 				/*
377 				 * Same old speculation for BYTE 0 and BYTE 1
378 				 */
379 
380 				/* BYTE 0 */
381 				i1 = (i1 + 1) & 0xff;
382 				jj = i1;
383 
384 				tmp0 = base[i1];
385 				j = j + tmp0;
386 
387 				tmp1 = base[j];
388 				i_accum = tmp1;
389 				base[j] = tmp0;
390 
391 				tmp0 += tmp1;
392 				tmp0 = tmp0 & 0xff;
393 
394 				if (i1 == tmp0) {
395 					merge =
396 					    (unsigned long long)(i_accum) << 56;
397 				} else {
398 					merge =
399 					    (unsigned long long)(base[tmp0]) <<
400 					    56;
401 				}
402 
403 				/* BYTE 1 */
404 				tmp0 = base[i1+1];
405 
406 				j = j + tmp0;
407 
408 				if ((jj ^ j) < 2) {
409 					base[jj] = i_accum;
410 
411 					tmp1 = base[j];
412 
413 					base[i1+1] = tmp1;
414 					base[j] = tmp0;
415 
416 					tmp0 += tmp1;
417 					tmp0 = tmp0 & 0xff;
418 
419 					merge |=
420 					    (unsigned long long)(base[tmp0]) <<
421 					    48;
422 				} else {
423 
424 					tmp1 = base[j];
425 
426 					i_accum = i_accum << 8;
427 					i_accum |= tmp1;
428 
429 					base[j] = tmp0;
430 
431 					tmp0 += tmp1;
432 					tmp0 = tmp0 & 0xff;
433 
434 					*((unsigned short *) &base[jj]) =
435 					    i_accum;
436 
437 					merge |=
438 					    (unsigned long long)(base[tmp0]) <<
439 					    48;
440 				}
441 
442 				/* BYTE 2 */
443 				/*
444 				 * As know i must be even when enter loop (to
445 				 * satisfy alignment), can only wrap around
446 				 * on the even bytes. So just need to perform
447 				 * mask every 2nd byte
448 				 */
449 				i1 = (i1 + 2) & 0xff;
450 				tmp0 = base[i1];
451 				j = j + tmp0;
452 				tmp1 = base[j];
453 				base[i1] = tmp1;
454 				base[j] = tmp0;
455 				tmp0 += tmp1;
456 				tmp0 = tmp0 & 0xff;
457 				merge |= (unsigned long long)(base[tmp0]) << 40;
458 
459 				/* BYTE 3 */
460 				tmp0 = base[i1+1];
461 				j = j + tmp0;
462 				tmp1 = base[j];
463 				base[i1+1] = tmp1;
464 				base[j] = tmp0;
465 				tmp0 += tmp1;
466 				tmp0 = tmp0 & 0xff;
467 				merge |= (unsigned long long)(base[tmp0]) << 32;
468 
469 				/* BYTE 4 */
470 				i1 = (i1 + 2) & 0xff;
471 				tmp0 = base[i1];
472 				j = j + tmp0;
473 				tmp1 = base[j];
474 				base[i1] = tmp1;
475 				base[j] = tmp0;
476 				tmp0 += tmp1;
477 				tmp0 = tmp0 & 0xff;
478 				merge |= (unsigned long long)(base[tmp0]) << 24;
479 
480 				/* BYTE 5 */
481 				tmp0 = base[i1+1];
482 				j = j + tmp0;
483 				tmp1 = base[j];
484 				base[i1+1] = tmp1;
485 				base[j] = tmp0;
486 				tmp0 += tmp1;
487 				tmp0 = tmp0 & 0xff;
488 				merge |= (unsigned long long)(base[tmp0]) << 16;
489 
490 				/* BYTE 6 */
491 				i1 = (i1+2) &0xff;
492 				jj = i1;
493 				tmp0 = base[i1];
494 
495 				j = j + tmp0;
496 
497 				tmp1 = base[j];
498 				i_accum = tmp1;
499 				base[j] = tmp0;
500 
501 
502 				tmp0 += tmp1;
503 				tmp0 = tmp0 & 0xff;
504 
505 				if (i1 == tmp0) {
506 					merge |=
507 					    (unsigned long long)(i_accum) << 8;
508 				} else {
509 					merge |=
510 					    (unsigned long long)(base[tmp0]) <<
511 					    8;
512 				}
513 
514 				/* BYTE 7 */
515 				i1++;
516 				tmp0 = base[i1];
517 
518 				j = j + tmp0;
519 				if ((jj ^ j) < 2) {
520 					base[jj] = i_accum;
521 					tmp1 = base[j];
522 
523 					base[i1] = tmp1;
524 					base[j] = tmp0;
525 
526 					tmp0 += tmp1;
527 					tmp0 = tmp0 & 0xff;
528 
529 					merge |=
530 					    (unsigned long long)(base[tmp0]);
531 
532 				} else {
533 					tmp1 = base[j];
534 
535 					i_accum = i_accum << 8;
536 					i_accum |= tmp1;
537 
538 					base[j] = tmp0;
539 
540 					tmp0 += tmp1;
541 					tmp0 = tmp0 & 0xff;
542 
543 					*((unsigned short *) &base[jj]) =
544 					    i_accum;
545 
546 					merge |=
547 					    (unsigned long long)(base[tmp0]);
548 				}
549 			}
550 
551 			/*
552 			 * Perform update to [out]
553 			 * Remember could be alignment issues
554 			 */
555 			in0 = *((unsigned long long *) (&in[ii]));
556 
557 			merge1 = merge0 | (merge >> shift);
558 
559 			merge0 = (merge & mask) << 56;
560 
561 			in0 = in0 ^ merge1;
562 
563 			*((unsigned long long *) (&out[ii])) = in0;
564 		}
565 
566 		i = i1;
567 
568 		/*
569 		 * Handle any overrun
570 		 */
571 		if (shift) {
572 			out[ii] = in[ii] ^ (merge0 >> 56);
573 			ii++;
574 		}
575 
576 		/*
577 		 * Handle final few bytes
578 		 */
579 		for (; ii < len; ii++) {
580 			i = i + 1;
581 			tmp0 = base[i];
582 			j = j + tmp0;
583 			tmp1 = base[j];
584 
585 			base[i] = tmp1;
586 			base[j] = tmp0;
587 
588 			tmp0 += tmp1;
589 			tmp0 = tmp0 & 0xff;
590 			out[ii] = in[ii] ^ base[tmp0];
591 		}
592 		key->i = i;
593 		key->j = j;
594 	}
595 #endif /* sun4v */
596 
597 /* EXPORT DELETE END */
598 }
599