xref: /linux/arch/powerpc/lib/crc32-vpmsum_core.S (revision e814f3fd16acfb7f9966773953de8f740a1e3202)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME,
 * then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
*/

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

/* The main loop checksums the input in blocks of at most this many bytes */
#define MAX_SIZE	32768

	.text

/*
 * The input is byte swapped as it is loaded (see VPERM) when the build's
 * endianness and the CRC bit order disagree: big endian with a reflected
 * CRC, or little endian with a non-reflected CRC.
 */
#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

/*
 * Index registers holding the byte offsets 16..112. They are used as the
 * index operand of lvx/stvx to address eight consecutive 16 byte vectors.
 */
#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

/* Folding constants currently loaded from the constants table */
#define const1		v24
#define const2		v25

/*
 * byteswap:   permute control vector used by VPERM
 * mask_32bit: all ones in the bottom 32 bits, zero elsewhere
 * mask_64bit: all ones in the bottom 64 bits, zero elsewhere
 * zeroes:     all zero
 */
#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

/*
 * Byte swap a vector right after it has been loaded. Expands to nothing
 * when BYTESWAP_DATA is not defined.
 */
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/*
 * unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len)
 *
 * In:	r3 = crc	initial CRC value
 *	r4 = p		input data
 *	r5 = len	number of bytes
 * Out:	r3 = updated CRC (the unmodified crc when len is 0)
 *
 * As stated at the top of this file, the data is expected to be 16 byte
 * aligned and len a multiple of 16 bytes.
 *
 * The including file has to provide the tables .constants,
 * .short_constants and .barrett_constants, plus .byteswap_constant when
 * BYTESWAP_DATA is defined.
 *
 * Register usage:
 *	r0		0 on the first pass over the data, 1 on later passes
 *	r3		address into the constants tables, result on return
 *	r6		bytes still to be handled by the main loop
 *	r7		size of the current block, then its iteration count
 *	r8, r9		scratch for the constants table offset
 *	r10		copy of the incoming crc for the len == 0 case
 *	r25-r31		off16..off112
 *	v0-v7		the eight parallel checksums
 *	v8-v15		vpmsum products (v8 first carries the initial value)
 *	v16-v23		input data in flight
 *	v24-v29		const1, const2, byteswap, masks and zeroes
 *
 * r25-r31 and v20-v29 are saved below the stack pointer and restored at
 * .Lout; no stack frame is allocated and r1 is never changed.
 *
 * NOTE(review): the "ori r2,r2,0" instructions leave r2 unchanged and are
 * presumably scheduling nops for the modulo scheduled loop - keep them and
 * the instruction order as they are.
 */
FUNC_START(CRC_FUNCTION_NAME)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0

	/*
	 * Enough room for saving 10 non volatile VMX registers. The save
	 * area sits below the 56 bytes used for r25-r31 above.
	 */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	/* Keep the incoming crc, r3 is reused as a table pointer below */
	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	LOAD_REG_ADDR(r3, .byteswap_constant)
	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	/*
	 * Less than 256 bytes is handled by the straight line code at
	 * .Lshort. len is an unsigned long, so compare it unsigned.
	 */
	cmpldi	r5,256
	blt	.Lshort

	/* r6 = len rounded down to a multiple of 128 */
	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	/* r7 = min(r6, MAX_SIZE), both are unsigned byte counts */
	cmpld	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	LOAD_REG_ADDR(r3, .constants)

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/*
	 * Go around again while there is data left for the main loop. The
	 * next pass skips its first warm up loads (r0 = 1) and starts from
	 * the 128 bytes now held in v16-v23, so they are added back to the
	 * byte count. addi does not alter cr0, the branch tests the value
	 * of r6 from before the addition.
	 */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/*
	 * Now reduce the tail (0 - 112 bytes). Each 16 byte chunk is
	 * multiplied by its own constant and folded into v0; ctr holds the
	 * number of chunks.
	 */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

	/* Both the long and the short path arrive here with the sum in v0 */
.Lbarrett_reduction:
	/* Barrett constants */
	LOAD_REG_ADDR(r3, .barrett_constants)

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

	/* Common exit: restore everything saved on entry, r3 is the result */
.Lout:
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

	/*
	 * Reached from the warm up code when the block has no iterations
	 * left for the second warm up pass and the main loop: multiply the
	 * data already in v16-v23 and go straight to the final xor.
	 */
.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

	/*
	 * Less than 256 bytes: every 16 byte chunk is multiplied by its own
	 * entry of .short_constants in straight line code. ctr counts the
	 * chunks and each bdz leaves for the xor chain below at the entry
	 * that matches the number of chunks processed.
	 */
.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	LOAD_REG_ADDR(r3, .short_constants)

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	/* v19 and v20 collect the products, see the xor chain below */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	/* Step both pointers past the first eight chunks */
	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

	/*
	 * Fall through xor chain: entering at .LvN folds vN down to v0,
	 * alternating between the two accumulators v19 and v20.
	 */
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

	/* Nothing to checksum: hand back the crc we were given */
.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)
