xref: /linux/lib/raid6/neon.uc (revision 0e833e697bcf4c2f3f7fb9fce39d08cd4439e5d7)
/* -----------------------------------------------------------------------
 *
 *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
 *
 *   Copyright (C) 2012 Rob Herring
 *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 *   Based on altivec.uc:
 *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * neon$#.c
 *
 * $#-way unrolled NEON intrinsics math RAID-6 instruction set
 *
 * This file is postprocessed using unroll.awk
 */

#include <arm_neon.h>

/* Native vector type: sixteen unsigned 8-bit lanes (a 128-bit NEON vector). */
typedef uint8x16_t unative_t;

/* Build a vector whose sixteen byte lanes all hold the value x. */
#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
/* Size in bytes of one native vector. */
#define NSIZE	sizeof(unative_t)

/*
 * The SHLBYTE() operation shifts each byte of v left by 1 bit. The bit
 * shifted out of a byte is discarded; it does *not* roll over into the
 * next byte. Returns the shifted vector.
 */
static inline unative_t SHLBYTE(unative_t v)
{
	return vshlq_n_u8(v, 1);
}

/*
 * The MASK() operation returns 0xFF in any byte for which the high
 * bit is 1, 0x00 for any byte for which the high bit is 0.
 *
 * Each byte is reinterpreted as signed and arithmetically shifted right
 * by 7, which copies its high (sign) bit into every bit of the byte.
 */
static inline unative_t MASK(unative_t v)
{
	return (unative_t)vshrq_n_s8((int8x16_t)v, 7);
}

/*
 * Compute the P (XOR parity) and Q (RS syndrome) blocks from scratch.
 *
 * disks: number of entries in ptrs. Entries 0 .. disks-3 are read as data
 *        blocks, entry disks-2 is written with P and entry disks-1 with Q.
 * bytes: number of bytes to process in every block. The loop advances in
 *        whole steps of NSIZE times the unroll factor and has no tail
 *        handling, so bytes is presumably a multiple of that step
 *        (confirm against the callers).
 * ptrs:  array of block pointers laid out as described above.
 *
 * This is template code: the unroll markers are substituted by unroll.awk
 * (see the top of the file) to produce the N-way unrolled variants.
 */
void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	/* Byte offset into each block; unsigned long to match bytes */
	unsigned long d;
	int z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = NBYTES(0x1d);

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		/* Seed both accumulators with the highest data disk */
		wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		/* Fold in the remaining data disks, highest index first */
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			/* P accumulates the plain XOR of all data */
			wp$$ = veorq_u8(wp$$, wd$$);
			/*
			 * Q: multiply the running value by 2 in GF(2^8)
			 * (shift each byte left, then XOR 0x1d into every
			 * byte whose high bit was set) and add the new data.
			 */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}

/*
 * Update existing P and Q blocks in place from data disks start .. stop.
 *
 * The blocks of disks start .. stop are XORed into the current contents of
 * P, and their syndrome contribution is XORed into the current contents of
 * Q. Data disks outside that range are never read.
 * NOTE(review): the blocks in start .. stop presumably hold old-XOR-new
 * data deltas; confirm against the callers.
 *
 * disks: number of entries in ptrs; only used to locate P (entry disks-2)
 *        and Q (entry disks-1).
 * start: lowest data disk index that is read.
 * stop:  highest data disk index that is read.
 * bytes: number of bytes to process in every block. The loop advances in
 *        whole steps of NSIZE times the unroll factor and has no tail
 *        handling, so bytes is presumably a multiple of that step
 *        (confirm against the callers).
 * ptrs:  array of block pointers: data blocks first, then P, then Q.
 *
 * This is template code: the unroll markers are substituted by unroll.awk
 * (see the top of the file) to produce the N-way unrolled variants.
 */
void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
				    unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	/* Byte offset into each block; unsigned long to match bytes */
	unsigned long d;
	int z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = NBYTES(0x1d);

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		/* Start Q from disk 'stop'; start P from old P XOR that disk */
		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);

		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			wp$$ = veorq_u8(wp$$, wd$$);
			/* Q = Q * 2 in GF(2^8), then add the new data */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		/*
		 * P/Q left side optimization: disks below 'start' are not
		 * read, but Q still takes one multiply-by-2 per skipped
		 * position.
		 */
		for ( z = start-1 ; z >= 0 ; z-- ) {
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
		}
		/* Merge the computed contribution into the existing Q */
		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
		wq$$ = veorq_u8(wq$$, w1$$);

		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}