xref: /titanic_41/usr/src/common/crypto/md5/md5.c (revision 0e42dee69ed771bf604dd1789fca9d77b5bbe302)
1 /*
2  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * Cleaned-up and optimized version of MD5, based on the reference
8  * implementation provided in RFC 1321.  See RSA Copyright information
9  * below.
10  */
11 
12 #pragma ident	"%Z%%M%	%I%	%E% SMI"
13 
14 /*
15  * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
16  */
17 
18 /*
19  * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
20  * rights reserved.
21  *
22  * License to copy and use this software is granted provided that it
23  * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
24  * Algorithm" in all material mentioning or referencing this software
25  * or this function.
26  *
27  * License is also granted to make and use derivative works provided
28  * that such works are identified as "derived from the RSA Data
29  * Security, Inc. MD5 Message-Digest Algorithm" in all material
30  * mentioning or referencing the derived work.
31  *
32  * RSA Data Security, Inc. makes no representations concerning either
33  * the merchantability of this software or the suitability of this
34  * software for any particular purpose. It is provided "as is"
35  * without express or implied warranty of any kind.
36  *
37  * These notices must be retained in any copies of any part of this
38  * documentation and/or software.
39  */
40 
41 #include <sys/types.h>
42 #include <sys/md5.h>
43 #include <sys/md5_consts.h>	/* MD5_CONST() optimization */
44 #include "md5_byteswap.h"
45 #if	!defined(_KERNEL) || defined(_BOOT)
46 #include <strings.h>
47 #endif /* !_KERNEL || _BOOT */
48 
49 #ifdef _KERNEL
50 #include <sys/systm.h>
51 #endif /* _KERNEL */
52 
53 static void Encode(uint8_t *, const uint32_t *, size_t);
54 static void MD5Transform(uint32_t, uint32_t, uint32_t, uint32_t, MD5_CTX *,
55     const uint8_t [64]);
56 
/*
 * Padding source used by MD5Final(): a single 0x80 byte followed by zeros.
 * The table is only ever read (it is handed to MD5Update() as a
 * `const void *'), so it is declared const.
 */
static const uint8_t PADDING[64] = { 0x80, /* all zeros */ };
58 
/*
 * F, G, H and I are the basic MD5 functions.
 *
 * Every use of a macro parameter is parenthesized, including the operand
 * of `~', so the macros remain correct when an argument is a compound
 * expression (e.g. `x | y').
 */
#define	F(b, c, d)	(((b) & (c)) | ((~(b)) & (d)))
#define	G(b, c, d)	(((b) & (d)) | ((c) & (~(d))))
#define	H(b, c, d)	((b) ^ (c) ^ (d))
#define	I(b, c, d)	((c) ^ ((b) | (~(d))))

/*
 * ROTATE_LEFT rotates x left n bits.  The width of the rotation is taken
 * from sizeof (x).  The right-shift count is (width - n), so n must be
 * greater than zero and smaller than the width of x.
 */
#define	ROTATE_LEFT(x, n)	\
	(((x) << (n)) | ((x) >> ((sizeof (x) << 3) - (n))))

/*
 * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
 * Rotation is separate from addition to prevent recomputation.
 *
 * `ac' is widened to unsigned long long before the addition; the sum is
 * truncated back to the width of `a' by the compound assignment.  (The
 * sun4v MD5_CONST_o() hands in a 64-bit value whose upper half is not
 * masked off.)
 *
 * Each macro is wrapped in do { } while (0) so that it expands to exactly
 * one statement and can be used safely in an unbraced if/else.
 */

#define	FF(a, b, c, d, x, s, ac) do { \
	(a) += F((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
	(a) = ROTATE_LEFT((a), (s)); \
	(a) += (b); \
	} while (0)

#define	GG(a, b, c, d, x, s, ac) do { \
	(a) += G((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
	(a) = ROTATE_LEFT((a), (s)); \
	(a) += (b); \
	} while (0)

#define	HH(a, b, c, d, x, s, ac) do { \
	(a) += H((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
	(a) = ROTATE_LEFT((a), (s)); \
	(a) += (b); \
	} while (0)

#define	II(a, b, c, d, x, s, ac) do { \
	(a) += I((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \
	(a) = ROTATE_LEFT((a), (s)); \
	(a) += (b); \
	} while (0)
101 
102 /*
103  * Loading 32-bit constants on a RISC is expensive since it involves both a
104  * `sethi' and an `or'.  thus, we instead have the compiler generate `ld's to
105  * load the constants from an array called `md5_consts'.  however, on intel
106  * (and other CISC processors), it is cheaper to load the constant
107  * directly.  thus, the c code in MD5Transform() uses the macro MD5_CONST()
108  * which either expands to a constant or an array reference, depending on the
109  * architecture the code is being compiled for.
110  *
111  * Right now, i386 and amd64 are the CISC exceptions.
112  * If we get another CISC ISA, we'll have to change the ifdef.
113  */
114 
#if defined(__i386) || defined(__amd64)

/*
 * CISC: paste the index onto `MD5_CONST_' so that the named constant is
 * used directly in the expression.
 */
#define	MD5_CONST(x)		(MD5_CONST_ ## x)
#define	MD5_CONST_e(x)		MD5_CONST(x)
#define	MD5_CONST_o(x)		MD5_CONST(x)

#else
/*
 * sparc/RISC optimization:
 *
 * while it is somewhat counter-intuitive, on sparc (and presumably other RISC
 * machines), it is more efficient to place all the constants used in this
 * function in an array and load the values out of the array than to manually
 * load the constants.  this is because setting a register to a 32-bit value
 * takes two ops in most cases: a `sethi' and an `or', but loading a 32-bit
 * value from memory only takes one `ld' (or `lduw' on v9).  while this
 * increases memory usage, the compiler can find enough other things to do
 * while waiting so that the pipeline does not stall.  additionally, it is
 * likely that many of these constants are cached so that later accesses do
 * not even go out to the bus.
 *
 * this array is declared `static' to keep the compiler from having to
 * bcopy() this array onto the stack frame of MD5Transform() each time it is
 * called -- which is unacceptably expensive.
 *
 * the `const' is to ensure that callers are good citizens and do not try to
 * munge the array.  since these routines are going to be called from inside
 * multithreaded kernelland, this is a good safety check. -- `constants' will
 * end up in .rodata.
 *
 * unfortunately, loading from an array in this manner hurts performance under
 * intel (and presumably other CISC machines).  so, there is a macro,
 * MD5_CONST(), used in MD5Transform(), that either expands to a reference to
 * this array, or to the actual constant, depending on what platform this code
 * is compiled for.
 */

#ifdef sun4v

/*
 * Going to load these consts in 8B chunks, so the array needs (at least)
 * 8B alignment.  The pragma below requests 64B alignment.
 */

/* CSTYLED */
#pragma align 64 (md5_consts)

#endif /* sun4v */

static const uint32_t md5_consts[] = {
	MD5_CONST_0,	MD5_CONST_1,	MD5_CONST_2,	MD5_CONST_3,
	MD5_CONST_4,	MD5_CONST_5,	MD5_CONST_6,	MD5_CONST_7,
	MD5_CONST_8,	MD5_CONST_9,	MD5_CONST_10,	MD5_CONST_11,
	MD5_CONST_12,	MD5_CONST_13,	MD5_CONST_14,	MD5_CONST_15,
	MD5_CONST_16,	MD5_CONST_17,	MD5_CONST_18,	MD5_CONST_19,
	MD5_CONST_20,	MD5_CONST_21,	MD5_CONST_22,	MD5_CONST_23,
	MD5_CONST_24,	MD5_CONST_25,	MD5_CONST_26,	MD5_CONST_27,
	MD5_CONST_28,	MD5_CONST_29,	MD5_CONST_30,	MD5_CONST_31,
	MD5_CONST_32,	MD5_CONST_33,	MD5_CONST_34,	MD5_CONST_35,
	MD5_CONST_36,	MD5_CONST_37,	MD5_CONST_38,	MD5_CONST_39,
	MD5_CONST_40,	MD5_CONST_41,	MD5_CONST_42,	MD5_CONST_43,
	MD5_CONST_44,	MD5_CONST_45,	MD5_CONST_46,	MD5_CONST_47,
	MD5_CONST_48,	MD5_CONST_49,	MD5_CONST_50,	MD5_CONST_51,
	MD5_CONST_52,	MD5_CONST_53,	MD5_CONST_54,	MD5_CONST_55,
	MD5_CONST_56,	MD5_CONST_57,	MD5_CONST_58,	MD5_CONST_59,
	MD5_CONST_60,	MD5_CONST_61,	MD5_CONST_62,	MD5_CONST_63
};


#ifdef sun4v
/*
 * To reduce the number of loads, load consts in 64-bit
 * chunks and then split.
 *
 * No need to mask upper 32-bits, as just interested in
 * low 32-bits (saves an & operation and means that this
 * optimization doesn't increase the icount).
 *
 * `md5_consts64' is a local of MD5Transform() that aliases md5_consts.
 * The macro parameter is parenthesized so that an expression argument
 * is divided as a whole.
 */
#define	MD5_CONST_e(x)		(md5_consts64[(x) / 2] >> 32)
#define	MD5_CONST_o(x)		(md5_consts64[(x) / 2])

#else

#define	MD5_CONST_e(x)		(md5_consts[(x)])
#define	MD5_CONST_o(x)		(md5_consts[(x)])

#endif /* sun4v */

#endif /* __i386 || __amd64 */
203 
204 /*
205  * MD5Init()
206  *
207  * purpose: initializes the md5 context and begins and md5 digest operation
208  *   input: MD5_CTX *	: the context to initialize.
209  *  output: void
210  */
211 
212 void
213 MD5Init(MD5_CTX *ctx)
214 {
215 	ctx->count[0] = ctx->count[1] = 0;
216 
217 	/* load magic initialization constants */
218 	ctx->state[0] = MD5_INIT_CONST_1;
219 	ctx->state[1] = MD5_INIT_CONST_2;
220 	ctx->state[2] = MD5_INIT_CONST_3;
221 	ctx->state[3] = MD5_INIT_CONST_4;
222 }
223 
224 /*
225  * MD5Update()
226  *
227  * purpose: continues an md5 digest operation, using the message block
228  *          to update the context.
229  *   input: MD5_CTX *	: the context to update
230  *          uint8_t *	: the message block
231  *          uint32_t    : the length of the message block in bytes
232  *  output: void
233  *
234  * MD5 crunches in 64-byte blocks.  All numeric constants here are related to
235  * that property of MD5.
236  */
237 
238 void
239 MD5Update(MD5_CTX *ctx, const void *inpp, unsigned int input_len)
240 {
241 	uint32_t		i, buf_index, buf_len;
242 #ifdef	sun4v
243 	uint32_t		old_asi;
244 #endif	/* sun4v */
245 	const unsigned char 	*input = (const unsigned char *)inpp;
246 
247 	/* compute (number of bytes computed so far) mod 64 */
248 	buf_index = (ctx->count[0] >> 3) & 0x3F;
249 
250 	/* update number of bits hashed into this MD5 computation so far */
251 	if ((ctx->count[0] += (input_len << 3)) < (input_len << 3))
252 	    ctx->count[1]++;
253 	ctx->count[1] += (input_len >> 29);
254 
255 	buf_len = 64 - buf_index;
256 
257 	/* transform as many times as possible */
258 	i = 0;
259 	if (input_len >= buf_len) {
260 
261 		/*
262 		 * general optimization:
263 		 *
264 		 * only do initial bcopy() and MD5Transform() if
265 		 * buf_index != 0.  if buf_index == 0, we're just
266 		 * wasting our time doing the bcopy() since there
267 		 * wasn't any data left over from a previous call to
268 		 * MD5Update().
269 		 */
270 
271 #ifdef sun4v
272 		/*
273 		 * For N1 use %asi register. However, costly to repeatedly set
274 		 * in MD5Transform. Therefore, set once here.
275 		 * Should probably restore the old value afterwards...
276 		 */
277 		old_asi = get_little();
278 		set_little(0x88);
279 #endif /* sun4v */
280 
281 		if (buf_index) {
282 			bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
283 
284 			MD5Transform(ctx->state[0], ctx->state[1],
285 			    ctx->state[2], ctx->state[3], ctx,
286 			    ctx->buf_un.buf8);
287 
288 			i = buf_len;
289 		}
290 
291 		for (; i + 63 < input_len; i += 64)
292 			MD5Transform(ctx->state[0], ctx->state[1],
293 			    ctx->state[2], ctx->state[3], ctx, &input[i]);
294 
295 
296 #ifdef sun4v
297 		/*
298 		 * Restore old %ASI value
299 		 */
300 		set_little(old_asi);
301 #endif /* sun4v */
302 
303 		/*
304 		 * general optimization:
305 		 *
306 		 * if i and input_len are the same, return now instead
307 		 * of calling bcopy(), since the bcopy() in this
308 		 * case will be an expensive nop.
309 		 */
310 
311 		if (input_len == i)
312 			return;
313 
314 		buf_index = 0;
315 	}
316 
317 	/* buffer remaining input */
318 	bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
319 }
320 
321 /*
322  * MD5Final()
323  *
324  * purpose: ends an md5 digest operation, finalizing the message digest and
325  *          zeroing the context.
326  *   input: uint8_t *	: a buffer to store the digest in
327  *          MD5_CTX *   : the context to finalize, save, and zero
328  *  output: void
329  */
330 
331 void
332 MD5Final(unsigned char *digest, MD5_CTX *ctx)
333 {
334 	uint8_t		bitcount_le[sizeof (ctx->count)];
335 	uint32_t	index = (ctx->count[0] >> 3) & 0x3f;
336 
337 	/* store bit count, little endian */
338 	Encode(bitcount_le, ctx->count, sizeof (bitcount_le));
339 
340 	/* pad out to 56 mod 64 */
341 	MD5Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
342 
343 	/* append length (before padding) */
344 	MD5Update(ctx, bitcount_le, sizeof (bitcount_le));
345 
346 	/* store state in digest */
347 	Encode(digest, ctx->state, sizeof (ctx->state));
348 
349 	/* zeroize sensitive information */
350 	bzero(ctx, sizeof (*ctx));
351 }
352 
353 #ifndef	_KERNEL
354 
355 void
356 md5_calc(unsigned char *output, unsigned char *input, unsigned int inlen)
357 {
358 	MD5_CTX context;
359 
360 	MD5Init(&context);
361 	MD5Update(&context, input, inlen);
362 	MD5Final(output, &context);
363 }
364 
365 #endif	/* !_KERNEL */
366 
367 /*
368  * sparc register window optimization:
369  *
370  * `a', `b', `c', and `d' are passed into MD5Transform explicitly
371  * since it increases the number of registers available to the
372  * compiler.  under this scheme, these variables can be held in
373  * %i0 - %i3, which leaves more local and out registers available.
374  */
375 
/*
 * MD5Transform()
 *
 * purpose: md5 transformation -- updates the digest based on `block'
 *   input: uint32_t	: bytes  1 -  4 of the digest
 *          uint32_t	: bytes  5 -  8 of the digest
 *          uint32_t	: bytes  9 - 12 of the digest
 *          uint32_t	: bytes 13 - 16 of the digest
 *          MD5_CTX *   : the context to update
 *          uint8_t [64]: the block to use to update the digest
 *  output: void
 *
 * MD5Update() passes ctx->state[0..3] as the four uint32_t arguments.
 * They are used as the working variables of the 64 steps and are added
 * back into ctx->state[] at the end.
 */

static void
MD5Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
    MD5_CTX *ctx, const uint8_t block[64])
{
	/*
	 * general optimization:
	 *
	 * use individual integers instead of using an array.  this is a
	 * win, although the amount it wins by seems to vary quite a bit.
	 */

	register uint32_t	x_0, x_1, x_2,  x_3,  x_4,  x_5,  x_6,  x_7;
	register uint32_t	x_8, x_9, x_10, x_11, x_12, x_13, x_14, x_15;
#ifdef sun4v
	/* 64-bit view of md5_consts, used by MD5_CONST_e()/MD5_CONST_o() */
	unsigned long long 	*md5_consts64;

		/* LINTED E_BAD_PTR_CAST_ALIGN */
	md5_consts64 = (unsigned long long *) md5_consts;
#endif	/* sun4v */

	/*
	 * general optimization:
	 *
	 * the compiler (at least SC4.2/5.x) generates better code if
	 * variable use is localized.  in this case, swapping the integers in
	 * this order allows `x_0' to be swapped nearest to its first use in
	 * FF(), and likewise for `x_1' and up.  note that the compiler
	 * prefers this to doing each swap right before the FF() that
	 * uses it.
	 */

	/*
	 * sparc v9/v8plus optimization:
	 *
	 * if `block' is already aligned on a 4-byte boundary, use the
	 * optimized load_little_32() directly.  otherwise, bcopy()
	 * into a buffer that *is* aligned on a 4-byte boundary and
	 * then do the load_little_32() on that buffer.  benchmarks
	 * have shown that using the bcopy() is better than loading
	 * the bytes individually and doing the endian-swap by hand.
	 *
	 * even though it's quite tempting to do:
	 *
	 * blk = bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
	 *
	 * and only have one set of LOAD_LITTLE_32()'s, the compiler (at least
	 * SC4.2/5.x) *does not* like that, so please resist the urge.
	 */

#ifdef _MD5_CHECK_ALIGNMENT
	if ((uintptr_t)block & 0x3) {		/* not 4-byte aligned? */
		/* stage the block in the context's 32-bit buffer */
		bcopy(block, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));

#ifdef sun4v
		x_15 = LOAD_LITTLE_32_f(ctx->buf_un.buf32);
		x_14 = LOAD_LITTLE_32_e(ctx->buf_un.buf32);
		x_13 = LOAD_LITTLE_32_d(ctx->buf_un.buf32);
		x_12 = LOAD_LITTLE_32_c(ctx->buf_un.buf32);
		x_11 = LOAD_LITTLE_32_b(ctx->buf_un.buf32);
		x_10 = LOAD_LITTLE_32_a(ctx->buf_un.buf32);
		x_9  = LOAD_LITTLE_32_9(ctx->buf_un.buf32);
		x_8  = LOAD_LITTLE_32_8(ctx->buf_un.buf32);
		x_7  = LOAD_LITTLE_32_7(ctx->buf_un.buf32);
		x_6  = LOAD_LITTLE_32_6(ctx->buf_un.buf32);
		x_5  = LOAD_LITTLE_32_5(ctx->buf_un.buf32);
		x_4  = LOAD_LITTLE_32_4(ctx->buf_un.buf32);
		x_3  = LOAD_LITTLE_32_3(ctx->buf_un.buf32);
		x_2  = LOAD_LITTLE_32_2(ctx->buf_un.buf32);
		x_1  = LOAD_LITTLE_32_1(ctx->buf_un.buf32);
		x_0  = LOAD_LITTLE_32_0(ctx->buf_un.buf32);
#else
		x_15 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 15);
		x_14 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 14);
		x_13 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 13);
		x_12 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 12);
		x_11 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 11);
		x_10 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 10);
		x_9  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  9);
		x_8  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  8);
		x_7  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  7);
		x_6  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  6);
		x_5  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  5);
		x_4  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  4);
		x_3  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  3);
		x_2  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  2);
		x_1  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  1);
		x_0  = LOAD_LITTLE_32(ctx->buf_un.buf32 +  0);
#endif /* sun4v */
	} else
#endif /* _MD5_CHECK_ALIGNMENT */
	/*
	 * load straight from `block'.  when _MD5_CHECK_ALIGNMENT is not
	 * defined, the braces below form a plain, unconditional block.
	 */
	{

#ifdef sun4v
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_15 = LOAD_LITTLE_32_f(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_14 = LOAD_LITTLE_32_e(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_13 = LOAD_LITTLE_32_d(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_12 = LOAD_LITTLE_32_c(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_11 = LOAD_LITTLE_32_b(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_10 = LOAD_LITTLE_32_a(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_9  = LOAD_LITTLE_32_9(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_8  = LOAD_LITTLE_32_8(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_7  = LOAD_LITTLE_32_7(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_6  = LOAD_LITTLE_32_6(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_5  = LOAD_LITTLE_32_5(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_4  = LOAD_LITTLE_32_4(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_3  = LOAD_LITTLE_32_3(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_2  = LOAD_LITTLE_32_2(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_1  = LOAD_LITTLE_32_1(block);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_0  = LOAD_LITTLE_32_0(block);
#else
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_15 = LOAD_LITTLE_32(block + 60);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_14 = LOAD_LITTLE_32(block + 56);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_13 = LOAD_LITTLE_32(block + 52);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_12 = LOAD_LITTLE_32(block + 48);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_11 = LOAD_LITTLE_32(block + 44);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_10 = LOAD_LITTLE_32(block + 40);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_9  = LOAD_LITTLE_32(block + 36);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_8  = LOAD_LITTLE_32(block + 32);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_7  = LOAD_LITTLE_32(block + 28);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_6  = LOAD_LITTLE_32(block + 24);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_5  = LOAD_LITTLE_32(block + 20);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_4  = LOAD_LITTLE_32(block + 16);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_3  = LOAD_LITTLE_32(block + 12);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_2  = LOAD_LITTLE_32(block +  8);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_1  = LOAD_LITTLE_32(block +  4);
		/* LINTED E_BAD_PTR_CAST_ALIGN */
		x_0  = LOAD_LITTLE_32(block +  0);
#endif /* sun4v */
	}

	/*
	 * the 64 steps follow.  the trailing comment on each line is the
	 * step number; the constant index passed to MD5_CONST_e() (even)
	 * or MD5_CONST_o() (odd) is the step number minus one.
	 */

	/* round 1 */
	FF(a, b, c, d, 	x_0, MD5_SHIFT_11, MD5_CONST_e(0));  /* 1 */
	FF(d, a, b, c, 	x_1, MD5_SHIFT_12, MD5_CONST_o(1));  /* 2 */
	FF(c, d, a, b, 	x_2, MD5_SHIFT_13, MD5_CONST_e(2));  /* 3 */
	FF(b, c, d, a, 	x_3, MD5_SHIFT_14, MD5_CONST_o(3));  /* 4 */
	FF(a, b, c, d, 	x_4, MD5_SHIFT_11, MD5_CONST_e(4));  /* 5 */
	FF(d, a, b, c, 	x_5, MD5_SHIFT_12, MD5_CONST_o(5));  /* 6 */
	FF(c, d, a, b, 	x_6, MD5_SHIFT_13, MD5_CONST_e(6));  /* 7 */
	FF(b, c, d, a, 	x_7, MD5_SHIFT_14, MD5_CONST_o(7));  /* 8 */
	FF(a, b, c, d, 	x_8, MD5_SHIFT_11, MD5_CONST_e(8));  /* 9 */
	FF(d, a, b, c, 	x_9, MD5_SHIFT_12, MD5_CONST_o(9));  /* 10 */
	FF(c, d, a, b, x_10, MD5_SHIFT_13, MD5_CONST_e(10)); /* 11 */
	FF(b, c, d, a, x_11, MD5_SHIFT_14, MD5_CONST_o(11)); /* 12 */
	FF(a, b, c, d, x_12, MD5_SHIFT_11, MD5_CONST_e(12)); /* 13 */
	FF(d, a, b, c, x_13, MD5_SHIFT_12, MD5_CONST_o(13)); /* 14 */
	FF(c, d, a, b, x_14, MD5_SHIFT_13, MD5_CONST_e(14)); /* 15 */
	FF(b, c, d, a, x_15, MD5_SHIFT_14, MD5_CONST_o(15)); /* 16 */

	/* round 2 */
	GG(a, b, c, d,  x_1, MD5_SHIFT_21, MD5_CONST_e(16)); /* 17 */
	GG(d, a, b, c,  x_6, MD5_SHIFT_22, MD5_CONST_o(17)); /* 18 */
	GG(c, d, a, b, x_11, MD5_SHIFT_23, MD5_CONST_e(18)); /* 19 */
	GG(b, c, d, a,  x_0, MD5_SHIFT_24, MD5_CONST_o(19)); /* 20 */
	GG(a, b, c, d,  x_5, MD5_SHIFT_21, MD5_CONST_e(20)); /* 21 */
	GG(d, a, b, c, x_10, MD5_SHIFT_22, MD5_CONST_o(21)); /* 22 */
	GG(c, d, a, b, x_15, MD5_SHIFT_23, MD5_CONST_e(22)); /* 23 */
	GG(b, c, d, a,  x_4, MD5_SHIFT_24, MD5_CONST_o(23)); /* 24 */
	GG(a, b, c, d,  x_9, MD5_SHIFT_21, MD5_CONST_e(24)); /* 25 */
	GG(d, a, b, c, x_14, MD5_SHIFT_22, MD5_CONST_o(25)); /* 26 */
	GG(c, d, a, b,  x_3, MD5_SHIFT_23, MD5_CONST_e(26)); /* 27 */
	GG(b, c, d, a,  x_8, MD5_SHIFT_24, MD5_CONST_o(27)); /* 28 */
	GG(a, b, c, d, x_13, MD5_SHIFT_21, MD5_CONST_e(28)); /* 29 */
	GG(d, a, b, c,  x_2, MD5_SHIFT_22, MD5_CONST_o(29)); /* 30 */
	GG(c, d, a, b,  x_7, MD5_SHIFT_23, MD5_CONST_e(30)); /* 31 */
	GG(b, c, d, a, x_12, MD5_SHIFT_24, MD5_CONST_o(31)); /* 32 */

	/* round 3 */
	HH(a, b, c, d,  x_5, MD5_SHIFT_31, MD5_CONST_e(32)); /* 33 */
	HH(d, a, b, c,  x_8, MD5_SHIFT_32, MD5_CONST_o(33)); /* 34 */
	HH(c, d, a, b, x_11, MD5_SHIFT_33, MD5_CONST_e(34)); /* 35 */
	HH(b, c, d, a, x_14, MD5_SHIFT_34, MD5_CONST_o(35)); /* 36 */
	HH(a, b, c, d,  x_1, MD5_SHIFT_31, MD5_CONST_e(36)); /* 37 */
	HH(d, a, b, c,  x_4, MD5_SHIFT_32, MD5_CONST_o(37)); /* 38 */
	HH(c, d, a, b,  x_7, MD5_SHIFT_33, MD5_CONST_e(38)); /* 39 */
	HH(b, c, d, a, x_10, MD5_SHIFT_34, MD5_CONST_o(39)); /* 40 */
	HH(a, b, c, d, x_13, MD5_SHIFT_31, MD5_CONST_e(40)); /* 41 */
	HH(d, a, b, c,  x_0, MD5_SHIFT_32, MD5_CONST_o(41)); /* 42 */
	HH(c, d, a, b,  x_3, MD5_SHIFT_33, MD5_CONST_e(42)); /* 43 */
	HH(b, c, d, a,  x_6, MD5_SHIFT_34, MD5_CONST_o(43)); /* 44 */
	HH(a, b, c, d,  x_9, MD5_SHIFT_31, MD5_CONST_e(44)); /* 45 */
	HH(d, a, b, c, x_12, MD5_SHIFT_32, MD5_CONST_o(45)); /* 46 */
	HH(c, d, a, b, x_15, MD5_SHIFT_33, MD5_CONST_e(46)); /* 47 */
	HH(b, c, d, a,  x_2, MD5_SHIFT_34, MD5_CONST_o(47)); /* 48 */

	/* round 4 */
	II(a, b, c, d,  x_0, MD5_SHIFT_41, MD5_CONST_e(48)); /* 49 */
	II(d, a, b, c,  x_7, MD5_SHIFT_42, MD5_CONST_o(49)); /* 50 */
	II(c, d, a, b, x_14, MD5_SHIFT_43, MD5_CONST_e(50)); /* 51 */
	II(b, c, d, a,  x_5, MD5_SHIFT_44, MD5_CONST_o(51)); /* 52 */
	II(a, b, c, d, x_12, MD5_SHIFT_41, MD5_CONST_e(52)); /* 53 */
	II(d, a, b, c,  x_3, MD5_SHIFT_42, MD5_CONST_o(53)); /* 54 */
	II(c, d, a, b, x_10, MD5_SHIFT_43, MD5_CONST_e(54)); /* 55 */
	II(b, c, d, a,  x_1, MD5_SHIFT_44, MD5_CONST_o(55)); /* 56 */
	II(a, b, c, d,  x_8, MD5_SHIFT_41, MD5_CONST_e(56)); /* 57 */
	II(d, a, b, c, x_15, MD5_SHIFT_42, MD5_CONST_o(57)); /* 58 */
	II(c, d, a, b,  x_6, MD5_SHIFT_43, MD5_CONST_e(58)); /* 59 */
	II(b, c, d, a, x_13, MD5_SHIFT_44, MD5_CONST_o(59)); /* 60 */
	II(a, b, c, d,  x_4, MD5_SHIFT_41, MD5_CONST_e(60)); /* 61 */
	II(d, a, b, c, x_11, MD5_SHIFT_42, MD5_CONST_o(61)); /* 62 */
	II(c, d, a, b,  x_2, MD5_SHIFT_43, MD5_CONST_e(62)); /* 63 */
	II(b, c, d, a,  x_9, MD5_SHIFT_44, MD5_CONST_o(63)); /* 64 */

	/* fold the working variables back into the chaining state */
	ctx->state[0] += a;
	ctx->state[1] += b;
	ctx->state[2] += c;
	ctx->state[3] += d;

	/*
	 * zeroize sensitive information -- compiler will optimize
	 * this out if everything is kept in registers
	 */

	x_0 = x_1  = x_2  = x_3  = x_4  = x_5  = x_6  = x_7 = x_8 = 0;
	x_9 = x_10 = x_11 = x_12 = x_13 = x_14 = x_15 = 0;
}
635 
636 /*
637  * Encode()
638  *
639  * purpose: to convert a list of numbers from big endian to little endian
640  *   input: uint8_t *	: place to store the converted little endian numbers
641  *	    uint32_t *	: place to get numbers to convert from
642  *          size_t	: the length of the input in bytes
643  *  output: void
644  */
645 
646 static void
647 Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
648     size_t input_len)
649 {
650 	size_t		i, j;
651 
652 	for (i = 0, j = 0; j < input_len; i++, j += sizeof (uint32_t)) {
653 
654 #ifdef _LITTLE_ENDIAN
655 
656 #ifdef _MD5_CHECK_ALIGNMENT
657 		if ((uintptr_t)output & 0x3)	/* Not 4-byte aligned */
658 			bcopy(input + i, output + j, 4);
659 		else *(uint32_t *)(output + j) = input[i];
660 #else
661 		/*LINTED E_BAD_PTR_CAST_ALIGN*/
662 		*(uint32_t *)(output + j) = input[i];
663 #endif /* _MD5_CHECK_ALIGNMENT */
664 
665 #else	/* big endian -- will work on little endian, but slowly */
666 
667 		output[j] = input[i] & 0xff;
668 		output[j + 1] = (input[i] >> 8)  & 0xff;
669 		output[j + 2] = (input[i] >> 16) & 0xff;
670 		output[j + 3] = (input[i] >> 24) & 0xff;
671 #endif
672 	}
673 }
674