xref: /linux/arch/powerpc/crypto/crc32-vpmsum_core.S (revision 4d5e3b06e1fc1428be14cd4ebe3b37c1bb34f95d)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Core of the accelerated CRC algorithm.
4 * In your file, define the constants and CRC_FUNCTION_NAME
5 * Then include this file.
6 *
7 * Calculate the checksum of data that is 16 byte aligned and a multiple of
8 * 16 bytes.
9 *
10 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
11 * chunks in order to mask the latency of the vpmsum instructions. If we
12 * have more than 32 kB of data to checksum we repeat this step multiple
13 * times, passing in the previous 1024 bits.
14 *
15 * The next step is to reduce the 1024 bits to 64 bits. This step adds
16 * 32 bits of 0s to the end - this matches what a CRC does. We just
17 * calculate constants that land the data in this 32 bits.
18 *
19 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
20 * for n = CRC using POWER8 instructions. We use x = 32.
21 *
22 * https://en.wikipedia.org/wiki/Barrett_reduction
23 *
24 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
25*/
26
27#include <asm/ppc_asm.h>
28#include <asm/ppc-opcode.h>
29
/*
 * Upper bound on the number of bytes handled by one pass of the block loop
 * in CRC_FUNCTION_NAME; longer inputs are processed in several passes.
 */
30#define MAX_SIZE	32768
31
32	.text
33
/*
 * BYTESWAP_DATA is defined for big-endian builds with REFLECT and for
 * little-endian builds without REFLECT, and explicitly undefined in every
 * other combination. It selects whether VPERM() below emits an instruction
 * and whether the byteswap permute control is loaded.
 */
34#if defined(__BIG_ENDIAN__) && defined(REFLECT)
35#define BYTESWAP_DATA
36#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
37#define BYTESWAP_DATA
38#else
39#undef BYTESWAP_DATA
40#endif
41
/*
 * GPR aliases. CRC_FUNCTION_NAME loads each with the byte offset in its name
 * (16 .. 112) and uses it as the index operand of lvx/stvx.
 */
42#define off16		r25
43#define off32		r26
44#define off48		r27
45#define off64		r28
46#define off80		r29
47#define off96		r30
48#define off112		r31
49
/* Vector registers that receive entries loaded from the constant tables. */
50#define const1		v24
51#define const2		v25
52
/*
 * byteswap   - permute control loaded from .byteswap_constant (only when
 *              BYTESWAP_DATA is defined)
 * mask_32bit - zeroes:all-ones shifted left by 4 bytes with vsldoi
 * mask_64bit - zeroes:all-ones shifted left by 8 bytes with vsldoi
 * zeroes     - cleared with vxor at function entry
 */
53#define byteswap	v26
54#define	mask_32bit	v27
55#define	mask_64bit	v28
56#define zeroes		v29
57
/*
 * VPERM() expands to a vperm instruction when BYTESWAP_DATA is defined and to
 * nothing otherwise, so call sites need no #ifdef of their own.
 */
58#ifdef BYTESWAP_DATA
59#define VPERM(A, B, C, D) vperm	A, B, C, D
60#else
61#define VPERM(A, B, C, D)
62#endif
63
64/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
/*
 * Register use, as read from the code below:
 *   r3    - in: crc. Later reused as the pointer into the constant tables.
 *           out: the computed checksum (or the incoming crc when len == 0).
 *   r4    - p, advanced by 128 after every 128-byte block that is loaded
 *   r5    - len; reduced to len & 127 before the tail is processed
 *   r10   - copy of the incoming crc, handed back at .Lzero
 *   r0    - 0 during the first pass over the data, 1 on every later pass
 *   r6-r9 - scratch: byte counts, block count and constant-table offsets
 *   r25-r31 (off16 .. off112) and v20-v29 are saved on entry and restored at
 *   .Lout. They are stored at negative offsets from r1; r1 is never modified.
 *   NOTE(review): this relies on the ABI permitting stores below r1 in a
 *   routine that makes no calls - confirm for the target ABI.
 *
 * Control flow:
 *   len == 0  -> .Lzero: return the incoming crc unchanged
 *   len < 256 -> .Lshort: up to 16 chunks of 16 bytes, then Barrett reduction
 *   otherwise -> block loop (label 1) in passes of at most MAX_SIZE bytes,
 *                reduction of the remaining 1024 bits plus the 0 - 112 byte
 *                tail, then .Lbarrett_reduction
 */
65FUNC_START(CRC_FUNCTION_NAME)
	/* Save r25-r31, which are used as the off16 .. off112 aliases */
66	std	r31,-8(r1)
67	std	r30,-16(r1)
68	std	r29,-24(r1)
69	std	r28,-32(r1)
70	std	r27,-40(r1)
71	std	r26,-48(r1)
72	std	r25,-56(r1)
73
	/* Constant byte offsets used as the index operand of lvx/stvx */
74	li	off16,16
75	li	off32,32
76	li	off48,48
77	li	off64,64
78	li	off80,80
79	li	off96,96
80	li	off112,112
81	li	r0,0	/* first pass: v16-v23 have not been loaded yet */
82
83	/* Enough room for saving 10 non volatile VMX registers */
84	subi	r6,r1,56+10*16	/* r6 -> save area for v20-v27, below the GPRs */
85	subi	r7,r1,56+2*16	/* r7 -> save area for v28-v29 */
86
87	stvx	v20,0,r6
88	stvx	v21,off16,r6
89	stvx	v22,off32,r6
90	stvx	v23,off48,r6
91	stvx	v24,off64,r6
92	stvx	v25,off80,r6
93	stvx	v26,off96,r6
94	stvx	v27,off112,r6
95	stvx	v28,0,r7
96	stvx	v29,off16,r7
97
98	mr	r10,r3	/* keep the incoming crc; r3 is reused as a table pointer */
99
100	vxor	zeroes,zeroes,zeroes
101	vspltisw v0,-1	/* v0 = all ones */
102
	/* Shift zeroes:v0 left by 4 / 8 bytes to leave ones in the low 32 / 64 bits */
103	vsldoi	mask_32bit,zeroes,v0,4
104	vsldoi	mask_64bit,zeroes,v0,8
105
106	/* Get the initial value into v8 */
107	vxor	v8,v8,v8
108	MTVRD(v8, R3)
109#ifdef REFLECT
110	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
111#else
112	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
113#endif
114
115#ifdef BYTESWAP_DATA
	/* Load the permute control consumed by every VPERM() below */
116	addis	r3,r2,.byteswap_constant@toc@ha
117	addi	r3,r3,.byteswap_constant@toc@l
118
119	lvx	byteswap,0,r3
120	addi	r3,r3,16
121#endif
122
	/* Inputs shorter than 256 bytes (including len == 0) take the short path */
123	cmpdi	r5,256
124	blt	.Lshort
125
	/* r6 = len with the low 7 bits cleared: bytes covered by whole 128-byte blocks */
126	rldicr	r6,r5,0,56
127
128	/* Checksum in blocks of MAX_SIZE */
	/*
	 * On entry to each pass r6 holds the bytes still to be folded. r7 becomes
	 * min(r6, MAX_SIZE) and r6 is reduced by that amount; r9 keeps MAX_SIZE
	 * for the table-offset computation below.
	 */
1291:	lis	r7,MAX_SIZE@h
130	ori	r7,r7,MAX_SIZE@l
131	mr	r9,r7
132	cmpd	r6,r7
133	bgt	2f
134	mr	r7,r6
1352:	subf	r6,r7,r6
136
137	/* our main loop does 128 bytes at a time */
138	srdi	r7,r7,7	/* r7 = number of 128-byte blocks in this pass */
139
140	/*
141	 * Work out the offset into the constants table to start at. Each
142	 * constant is 16 bytes, and it is used against 128 bytes of input
143	 * data - 128 / 16 = 8
144	 */
145	sldi	r8,r7,4	/* r8 = blocks * 16 */
146	srdi	r9,r9,3	/* r9 = MAX_SIZE / 8 */
147	subf	r8,r8,r9	/* r8 = MAX_SIZE / 8 - blocks * 16 */
148
149	/* We reduce our final 128 bytes in a separate step */
150	addi	r7,r7,-1
151	mtctr	r7	/* CTR = blocks - 1; consumed by the bdz/bdnz below */
152
153	addis	r3,r2,.constants@toc@ha
154	addi	r3,r3,.constants@toc@l
155
156	/* Find the start of our constants */
157	add	r3,r3,r8
158
159	/* zero v0-v7 which will contain our checksums */
160	vxor	v0,v0,v0
161	vxor	v1,v1,v1
162	vxor	v2,v2,v2
163	vxor	v3,v3,v3
164	vxor	v4,v4,v4
165	vxor	v5,v5,v5
166	vxor	v6,v6,v6
167	vxor	v7,v7,v7
168
169	lvx	const1,0,r3
170
171	/*
172	 * If we are looping back to consume more data we use the values
173	 * already in v16-v23.
174	 */
175	cmpdi	r0,1
176	beq	2f
177
178	/* First warm up pass */
179	lvx	v16,0,r4
180	lvx	v17,off16,r4
181	VPERM(v16,v16,v16,byteswap)
182	VPERM(v17,v17,v17,byteswap)
183	lvx	v18,off32,r4
184	lvx	v19,off48,r4
185	VPERM(v18,v18,v18,byteswap)
186	VPERM(v19,v19,v19,byteswap)
187	lvx	v20,off64,r4
188	lvx	v21,off80,r4
189	VPERM(v20,v20,v20,byteswap)
190	VPERM(v21,v21,v21,byteswap)
191	lvx	v22,off96,r4
192	lvx	v23,off112,r4
193	VPERM(v22,v22,v22,byteswap)
194	VPERM(v23,v23,v23,byteswap)
195	addi	r4,r4,8*16
196
197	/* xor in initial value */
198	vxor	v16,v16,v8
199
	/*
	 * If CTR is exhausted here, v16-v23 need only a single multiply (done at
	 * .Lfirst_warm_up_done) before the final block of this pass is xored in.
	 */
2002:	bdz	.Lfirst_warm_up_done
201
202	addi	r3,r3,16
203	lvx	const2,0,r3
204
205	/* Second warm up pass */
	/*
	 * Each group below multiplies the block loaded earlier and immediately
	 * reloads the same register with the next block.
	 * "ori r2,r2,0" leaves r2 unchanged. NOTE(review): presumably a
	 * scheduling no-op that separates the groups - confirm against the
	 * POWER8 tuning documentation.
	 */
206	VPMSUMD(v8,v16,const1)
207	lvx	v16,0,r4
208	VPERM(v16,v16,v16,byteswap)
209	ori	r2,r2,0
210
211	VPMSUMD(v9,v17,const1)
212	lvx	v17,off16,r4
213	VPERM(v17,v17,v17,byteswap)
214	ori	r2,r2,0
215
216	VPMSUMD(v10,v18,const1)
217	lvx	v18,off32,r4
218	VPERM(v18,v18,v18,byteswap)
219	ori	r2,r2,0
220
221	VPMSUMD(v11,v19,const1)
222	lvx	v19,off48,r4
223	VPERM(v19,v19,v19,byteswap)
224	ori	r2,r2,0
225
226	VPMSUMD(v12,v20,const1)
227	lvx	v20,off64,r4
228	VPERM(v20,v20,v20,byteswap)
229	ori	r2,r2,0
230
231	VPMSUMD(v13,v21,const1)
232	lvx	v21,off80,r4
233	VPERM(v21,v21,v21,byteswap)
234	ori	r2,r2,0
235
236	VPMSUMD(v14,v22,const1)
237	lvx	v22,off96,r4
238	VPERM(v22,v22,v22,byteswap)
239	ori	r2,r2,0
240
241	VPMSUMD(v15,v23,const1)
242	lvx	v23,off112,r4
243	VPERM(v23,v23,v23,byteswap)
244
245	addi	r4,r4,8*16
246
247	bdz	.Lfirst_cool_down
248
249	/*
250	 * main loop. We modulo schedule it such that it takes three iterations
251	 * to complete - first iteration load, second iteration vpmsum, third
252	 * iteration xor.
253	 */
	/*
	 * const1 and const2 hold the same table entry within one iteration:
	 * const1 is reloaded at the top from the address const2 was last loaded
	 * from, the first four multiplies use const2, const2 is then reloaded
	 * with the following entry, and the last four multiplies use const1.
	 */
254	.balign	16
2554:	lvx	const1,0,r3
256	addi	r3,r3,16
257	ori	r2,r2,0
258
259	vxor	v0,v0,v8
260	VPMSUMD(v8,v16,const2)
261	lvx	v16,0,r4
262	VPERM(v16,v16,v16,byteswap)
263	ori	r2,r2,0
264
265	vxor	v1,v1,v9
266	VPMSUMD(v9,v17,const2)
267	lvx	v17,off16,r4
268	VPERM(v17,v17,v17,byteswap)
269	ori	r2,r2,0
270
271	vxor	v2,v2,v10
272	VPMSUMD(v10,v18,const2)
273	lvx	v18,off32,r4
274	VPERM(v18,v18,v18,byteswap)
275	ori	r2,r2,0
276
277	vxor	v3,v3,v11
278	VPMSUMD(v11,v19,const2)
279	lvx	v19,off48,r4
280	VPERM(v19,v19,v19,byteswap)
281	lvx	const2,0,r3
282	ori	r2,r2,0
283
284	vxor	v4,v4,v12
285	VPMSUMD(v12,v20,const1)
286	lvx	v20,off64,r4
287	VPERM(v20,v20,v20,byteswap)
288	ori	r2,r2,0
289
290	vxor	v5,v5,v13
291	VPMSUMD(v13,v21,const1)
292	lvx	v21,off80,r4
293	VPERM(v21,v21,v21,byteswap)
294	ori	r2,r2,0
295
296	vxor	v6,v6,v14
297	VPMSUMD(v14,v22,const1)
298	lvx	v22,off96,r4
299	VPERM(v22,v22,v22,byteswap)
300	ori	r2,r2,0
301
302	vxor	v7,v7,v15
303	VPMSUMD(v15,v23,const1)
304	lvx	v23,off112,r4
305	VPERM(v23,v23,v23,byteswap)
306
307	addi	r4,r4,8*16
308
309	bdnz	4b
310
311.Lfirst_cool_down:
312	/* First cool down pass */
	/* No more loads: fold the pending products and multiply the last loaded block */
313	lvx	const1,0,r3
314	addi	r3,r3,16
315
316	vxor	v0,v0,v8
317	VPMSUMD(v8,v16,const1)
318	ori	r2,r2,0
319
320	vxor	v1,v1,v9
321	VPMSUMD(v9,v17,const1)
322	ori	r2,r2,0
323
324	vxor	v2,v2,v10
325	VPMSUMD(v10,v18,const1)
326	ori	r2,r2,0
327
328	vxor	v3,v3,v11
329	VPMSUMD(v11,v19,const1)
330	ori	r2,r2,0
331
332	vxor	v4,v4,v12
333	VPMSUMD(v12,v20,const1)
334	ori	r2,r2,0
335
336	vxor	v5,v5,v13
337	VPMSUMD(v13,v21,const1)
338	ori	r2,r2,0
339
340	vxor	v6,v6,v14
341	VPMSUMD(v14,v22,const1)
342	ori	r2,r2,0
343
344	vxor	v7,v7,v15
345	VPMSUMD(v15,v23,const1)
346	ori	r2,r2,0
347
348.Lsecond_cool_down:
349	/* Second cool down pass */
	/* Fold the last outstanding products into the accumulators v0-v7 */
350	vxor	v0,v0,v8
351	vxor	v1,v1,v9
352	vxor	v2,v2,v10
353	vxor	v3,v3,v11
354	vxor	v4,v4,v12
355	vxor	v5,v5,v13
356	vxor	v6,v6,v14
357	vxor	v7,v7,v15
358
359#ifdef REFLECT
360	/*
361	 * vpmsumd produces a 96 bit result in the least significant bits
362	 * of the register. Since we are bit reflected we have to shift it
363	 * left 32 bits so it occupies the least significant bits in the
364	 * bit reflected domain.
365	 */
366	vsldoi	v0,v0,zeroes,4
367	vsldoi	v1,v1,zeroes,4
368	vsldoi	v2,v2,zeroes,4
369	vsldoi	v3,v3,zeroes,4
370	vsldoi	v4,v4,zeroes,4
371	vsldoi	v5,v5,zeroes,4
372	vsldoi	v6,v6,zeroes,4
373	vsldoi	v7,v7,zeroes,4
374#endif
375
376	/* xor with last 1024 bits */
377	lvx	v8,0,r4
378	lvx	v9,off16,r4
379	VPERM(v8,v8,v8,byteswap)
380	VPERM(v9,v9,v9,byteswap)
381	lvx	v10,off32,r4
382	lvx	v11,off48,r4
383	VPERM(v10,v10,v10,byteswap)
384	VPERM(v11,v11,v11,byteswap)
385	lvx	v12,off64,r4
386	lvx	v13,off80,r4
387	VPERM(v12,v12,v12,byteswap)
388	VPERM(v13,v13,v13,byteswap)
389	lvx	v14,off96,r4
390	lvx	v15,off112,r4
391	VPERM(v14,v14,v14,byteswap)
392	VPERM(v15,v15,v15,byteswap)
393
394	addi	r4,r4,8*16
395
	/* Results land in v16-v23, where the next pass (or the tail code) expects them */
396	vxor	v16,v0,v8
397	vxor	v17,v1,v9
398	vxor	v18,v2,v10
399	vxor	v19,v3,v11
400	vxor	v20,v4,v12
401	vxor	v21,v5,v13
402	vxor	v22,v6,v14
403	vxor	v23,v7,v15
404
	/*
	 * Loop while bytes remain. The compare is done before the addi, which
	 * does not change the condition register: the extra 128 accounts for the
	 * block already sitting in v16-v23, which the next pass uses instead of
	 * loading a fresh first block.
	 */
405	li	r0,1
406	cmpdi	r6,0
407	addi	r6,r6,128
408	bne	1b
409
410	/* Work out how many bytes we have left */
411	andi.	r5,r5,127
412
413	/* Calculate where in the constant table we need to start */
414	subfic	r6,r5,128	/* r6 = 128 - leftover bytes */
415	add	r3,r3,r6
416
417	/* How many 16 byte chunks are in the tail */
418	srdi	r7,r5,4
419	mtctr	r7
420
421	/*
422	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
423	 * 32 bits to include the trailing 32 bits of zeros
424	 */
425	lvx	v0,0,r3
426	lvx	v1,off16,r3
427	lvx	v2,off32,r3
428	lvx	v3,off48,r3
429	lvx	v4,off64,r3
430	lvx	v5,off80,r3
431	lvx	v6,off96,r3
432	lvx	v7,off112,r3
433	addi	r3,r3,8*16
434
435	VPMSUMW(v0,v16,v0)
436	VPMSUMW(v1,v17,v1)
437	VPMSUMW(v2,v18,v2)
438	VPMSUMW(v3,v19,v3)
439	VPMSUMW(v4,v20,v4)
440	VPMSUMW(v5,v21,v5)
441	VPMSUMW(v6,v22,v6)
442	VPMSUMW(v7,v23,v7)
443
444	/* Now reduce the tail (0 - 112 bytes) */
	/*
	 * Each step multiplies one 16-byte chunk by the next table entry and
	 * xors the product into v0; bdz leaves the chain once CTR chunks are done.
	 */
445	cmpdi	r7,0
446	beq	1f
447
448	lvx	v16,0,r4
449	lvx	v17,0,r3
450	VPERM(v16,v16,v16,byteswap)
451	VPMSUMW(v16,v16,v17)
452	vxor	v0,v0,v16
453	bdz	1f
454
455	lvx	v16,off16,r4
456	lvx	v17,off16,r3
457	VPERM(v16,v16,v16,byteswap)
458	VPMSUMW(v16,v16,v17)
459	vxor	v0,v0,v16
460	bdz	1f
461
462	lvx	v16,off32,r4
463	lvx	v17,off32,r3
464	VPERM(v16,v16,v16,byteswap)
465	VPMSUMW(v16,v16,v17)
466	vxor	v0,v0,v16
467	bdz	1f
468
469	lvx	v16,off48,r4
470	lvx	v17,off48,r3
471	VPERM(v16,v16,v16,byteswap)
472	VPMSUMW(v16,v16,v17)
473	vxor	v0,v0,v16
474	bdz	1f
475
476	lvx	v16,off64,r4
477	lvx	v17,off64,r3
478	VPERM(v16,v16,v16,byteswap)
479	VPMSUMW(v16,v16,v17)
480	vxor	v0,v0,v16
481	bdz	1f
482
483	lvx	v16,off80,r4
484	lvx	v17,off80,r3
485	VPERM(v16,v16,v16,byteswap)
486	VPMSUMW(v16,v16,v17)
487	vxor	v0,v0,v16
488	bdz	1f
489
490	lvx	v16,off96,r4
491	lvx	v17,off96,r3
492	VPERM(v16,v16,v16,byteswap)
493	VPMSUMW(v16,v16,v17)
494	vxor	v0,v0,v16
495
496	/* Now xor all the parallel chunks together */
4971:	vxor	v0,v0,v1
498	vxor	v2,v2,v3
499	vxor	v4,v4,v5
500	vxor	v6,v6,v7
501
502	vxor	v0,v0,v2
503	vxor	v4,v4,v6
504
505	vxor	v0,v0,v4
506
	/* Shared by the long path above and by .Lshort; expects the folded value in v0 */
507.Lbarrett_reduction:
508	/* Barrett constants */
509	addis	r3,r2,.barrett_constants@toc@ha
510	addi	r3,r3,.barrett_constants@toc@l
511
512	lvx	const1,0,r3
513	lvx	const2,off16,r3
514
515	vsldoi	v1,v0,v0,8
516	vxor	v0,v0,v1		/* xor two 64 bit results together */
517
518#ifdef REFLECT
519	/* shift left one bit */
520	vspltisb v1,1
521	vsl	v0,v0,v1
522#endif
523
524	vand	v0,v0,mask_64bit
525#ifndef REFLECT
526	/*
527	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
528	 * the multiple of our polynomial that we need to subtract. By
529	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
530	 * result back down 2x bits, we round down to the nearest multiple.
531	 */
532	VPMSUMD(v1,v0,const1)	/* ma */
533	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
534	VPMSUMD(v1,v1,const2)	/* qn */
535	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
536
537	/*
538	 * Get the result into r3. We need to shift it left 8 bytes:
539	 * V0 [ 0 1 2 X ]
540	 * V0 [ 0 X 2 3 ]
541	 */
542	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
543#else
544	/*
545	 * The reflected version of Barrett reduction. Instead of bit
546	 * reflecting our data (which is expensive to do), we bit reflect our
547	 * constants and our algorithm, which means the intermediate data in
548	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
549	 * the algorithm because we don't carry in mod 2 arithmetic.
550	 */
551	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
552	VPMSUMD(v1,v1,const1)		/* ma */
553	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
554	VPMSUMD(v1,v1,const2)		/* qn */
555	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
556
557	/*
558	 * Since we are bit reflected, the result (ie the low 32 bits) is in
559	 * the high 32 bits. We just need to shift it left 4 bytes
560	 * V0 [ 0 1 X 3 ]
561	 * V0 [ 0 X 2 3 ]
562	 */
563	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
564#endif
565
566	/* Get it into r3 */
567	MFVRD(R3, v0)
568
	/* Common exit: restore v20-v29 and r25-r31 from the slots written on entry */
569.Lout:
570	subi	r6,r1,56+10*16
571	subi	r7,r1,56+2*16
572
573	lvx	v20,0,r6
574	lvx	v21,off16,r6
575	lvx	v22,off32,r6
576	lvx	v23,off48,r6
577	lvx	v24,off64,r6
578	lvx	v25,off80,r6
579	lvx	v26,off96,r6
580	lvx	v27,off112,r6
581	lvx	v28,0,r7
582	lvx	v29,off16,r7
583
	/* The off* registers were still needed as lvx indices above, so restore them last */
584	ld	r31,-8(r1)
585	ld	r30,-16(r1)
586	ld	r29,-24(r1)
587	ld	r28,-32(r1)
588	ld	r27,-40(r1)
589	ld	r26,-48(r1)
590	ld	r25,-56(r1)
591
592	blr
593
	/*
	 * Reached from the first bdz when CTR is exhausted right after the first
	 * warm up pass: multiply v16-v23 once, without loading further data, and
	 * join the common code at .Lsecond_cool_down.
	 */
594.Lfirst_warm_up_done:
595	lvx	const1,0,r3
596	addi	r3,r3,16
597
598	VPMSUMD(v8,v16,const1)
599	VPMSUMD(v9,v17,const1)
600	VPMSUMD(v10,v18,const1)
601	VPMSUMD(v11,v19,const1)
602	VPMSUMD(v12,v20,const1)
603	VPMSUMD(v13,v21,const1)
604	VPMSUMD(v14,v22,const1)
605	VPMSUMD(v15,v23,const1)
606
607	b	.Lsecond_cool_down
608
	/*
	 * Short path, len < 256. Chunk i (16 bytes) is multiplied by table entry i
	 * into v<i>; after CTR chunks the matching bdz jumps to .Lv<i>, and the
	 * fall-through xor chain from there down to .Lv0 folds v<i> .. v0 into
	 * v19 (odd registers) and v20 (even registers).
	 *
	 * NOTE(review): assumes len is a multiple of 16, as the file header
	 * states. For 0 < len < 16 CTR is loaded with 0, so the first bdz would
	 * not see CTR reach zero and the chain would keep loading chunks -
	 * confirm that callers guarantee the length.
	 * NOTE(review): the VPERM()s here pass the constant register as the
	 * second vperm source, unlike the other paths which pass the data
	 * register twice; presumably harmless if the byteswap control only
	 * selects bytes from the first source - confirm against
	 * .byteswap_constant.
	 */
609.Lshort:
610	cmpdi	r5,0
611	beq	.Lzero
612
613	addis	r3,r2,.short_constants@toc@ha
614	addi	r3,r3,.short_constants@toc@l
615
616	/* Calculate where in the constant table we need to start */
617	subfic	r6,r5,256	/* r6 = 256 - len */
618	add	r3,r3,r6
619
620	/* How many 16 byte chunks? */
621	srdi	r7,r5,4
622	mtctr	r7
623
	/* Clear the two accumulators used by the .Lv<i> chain */
624	vxor	v19,v19,v19
625	vxor	v20,v20,v20
626
627	lvx	v0,0,r4
628	lvx	v16,0,r3
629	VPERM(v0,v0,v16,byteswap)
630	vxor	v0,v0,v8	/* xor in initial value */
631	VPMSUMW(v0,v0,v16)
632	bdz	.Lv0
633
634	lvx	v1,off16,r4
635	lvx	v17,off16,r3
636	VPERM(v1,v1,v17,byteswap)
637	VPMSUMW(v1,v1,v17)
638	bdz	.Lv1
639
640	lvx	v2,off32,r4
641	lvx	v16,off32,r3
642	VPERM(v2,v2,v16,byteswap)
643	VPMSUMW(v2,v2,v16)
644	bdz	.Lv2
645
646	lvx	v3,off48,r4
647	lvx	v17,off48,r3
648	VPERM(v3,v3,v17,byteswap)
649	VPMSUMW(v3,v3,v17)
650	bdz	.Lv3
651
652	lvx	v4,off64,r4
653	lvx	v16,off64,r3
654	VPERM(v4,v4,v16,byteswap)
655	VPMSUMW(v4,v4,v16)
656	bdz	.Lv4
657
658	lvx	v5,off80,r4
659	lvx	v17,off80,r3
660	VPERM(v5,v5,v17,byteswap)
661	VPMSUMW(v5,v5,v17)
662	bdz	.Lv5
663
664	lvx	v6,off96,r4
665	lvx	v16,off96,r3
666	VPERM(v6,v6,v16,byteswap)
667	VPMSUMW(v6,v6,v16)
668	bdz	.Lv6
669
670	lvx	v7,off112,r4
671	lvx	v17,off112,r3
672	VPERM(v7,v7,v17,byteswap)
673	VPMSUMW(v7,v7,v17)
674	bdz	.Lv7
675
	/* The offset registers only reach 112, so step both pointers for chunks 8 - 15 */
676	addi	r3,r3,128
677	addi	r4,r4,128
678
	/* v8 held the initial value; it was consumed above and is reused as a product */
679	lvx	v8,0,r4
680	lvx	v16,0,r3
681	VPERM(v8,v8,v16,byteswap)
682	VPMSUMW(v8,v8,v16)
683	bdz	.Lv8
684
685	lvx	v9,off16,r4
686	lvx	v17,off16,r3
687	VPERM(v9,v9,v17,byteswap)
688	VPMSUMW(v9,v9,v17)
689	bdz	.Lv9
690
691	lvx	v10,off32,r4
692	lvx	v16,off32,r3
693	VPERM(v10,v10,v16,byteswap)
694	VPMSUMW(v10,v10,v16)
695	bdz	.Lv10
696
697	lvx	v11,off48,r4
698	lvx	v17,off48,r3
699	VPERM(v11,v11,v17,byteswap)
700	VPMSUMW(v11,v11,v17)
701	bdz	.Lv11
702
703	lvx	v12,off64,r4
704	lvx	v16,off64,r3
705	VPERM(v12,v12,v16,byteswap)
706	VPMSUMW(v12,v12,v16)
707	bdz	.Lv12
708
709	lvx	v13,off80,r4
710	lvx	v17,off80,r3
711	VPERM(v13,v13,v17,byteswap)
712	VPMSUMW(v13,v13,v17)
713	bdz	.Lv13
714
715	lvx	v14,off96,r4
716	lvx	v16,off96,r3
717	VPERM(v14,v14,v16,byteswap)
718	VPMSUMW(v14,v14,v16)
719	bdz	.Lv14
720
721	lvx	v15,off112,r4
722	lvx	v17,off112,r3
723	VPERM(v15,v15,v17,byteswap)
724	VPMSUMW(v15,v15,v17)
725
	/* Entry chain: control enters at .Lv<i> and falls through to .Lv0 */
726.Lv15:	vxor	v19,v19,v15
727.Lv14:	vxor	v20,v20,v14
728.Lv13:	vxor	v19,v19,v13
729.Lv12:	vxor	v20,v20,v12
730.Lv11:	vxor	v19,v19,v11
731.Lv10:	vxor	v20,v20,v10
732.Lv9:	vxor	v19,v19,v9
733.Lv8:	vxor	v20,v20,v8
734.Lv7:	vxor	v19,v19,v7
735.Lv6:	vxor	v20,v20,v6
736.Lv5:	vxor	v19,v19,v5
737.Lv4:	vxor	v20,v20,v4
738.Lv3:	vxor	v19,v19,v3
739.Lv2:	vxor	v20,v20,v2
740.Lv1:	vxor	v19,v19,v1
741.Lv0:	vxor	v20,v20,v0
742
	/* Combine the two accumulators into v0, as .Lbarrett_reduction expects */
743	vxor	v0,v19,v20
744
745	b	.Lbarrett_reduction
746
	/* len == 0: return the crc that was passed in (saved in r10 at entry) */
747.Lzero:
748	mr	r3,r10
749	b	.Lout
750
751FUNC_END(CRC_FUNCTION_NAME)
752