xref: /linux/lib/raid6/neon.uc (revision cdd5b5a9761fd66d17586e4f4ba6588c70e640ea)
/* -----------------------------------------------------------------------
 *
 *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
 *
 *   Copyright (C) 2012 Rob Herring
 *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 *   Based on altivec.uc:
 *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
187d11965dSArd Biesheuvel
/*
 * neon$#.c
 *
 * $#-way unrolled NEON intrinsics math RAID-6 instruction set
 *
 * This file is postprocessed using unroll.awk
 */
267d11965dSArd Biesheuvel
277d11965dSArd Biesheuvel#include <arm_neon.h>
28*3de13550SArnd Bergmann#include "neon.h"
297d11965dSArd Biesheuvel
/* Native working unit: one vector of sixteen unsigned 8-bit lanes. */
typedef uint8x16_t unative_t;

/* Number of bytes covered by one unative_t vector. */
#define NSIZE	sizeof(unative_t)
337d11965dSArd Biesheuvel
/*
 * SHLBYTE() moves every byte lane one bit to the left independently;
 * the bit shifted out of a lane is dropped, it does *not* carry into
 * the neighbouring lane.
 */
static inline unative_t SHLBYTE(unative_t v)
{
	const unative_t doubled = vshlq_n_u8(v, 1);

	return doubled;
}
427d11965dSArd Biesheuvel
/*
 * MASK() yields 0xFF in every byte lane whose top bit is set and 0x00 in
 * every lane whose top bit is clear.
 */
static inline unative_t MASK(unative_t v)
{
	/* An arithmetic right shift by 7 smears the sign bit over the lane. */
	const int8x16_t as_signed = vreinterpretq_s8_u8(v);
	const int8x16_t smeared = vshrq_n_s8(as_signed, 7);

	return vreinterpretq_u8_s8(smeared);
}
5135129ddeSArd Biesheuvel
/*
 * PMUL() multiplies the corresponding byte lanes of v and u as
 * polynomials (vmulq_p8) and hands the result back as plain unsigned
 * bytes.
 */
static inline unative_t PMUL(unative_t v, unative_t u)
{
	const poly8x16_t lhs = vreinterpretq_p8_u8(v);
	const poly8x16_t rhs = vreinterpretq_p8_u8(u);

	return vreinterpretq_u8_p8(vmulq_p8(lhs, rhs));
}
567d11965dSArd Biesheuvel
/*
 * Compute the P (XOR parity) and Q (RS syndrome) blocks from scratch.
 *
 * disks: number of block pointers in ptrs.  The last two are the P and Q
 *        destinations; everything before them is a data block.
 * bytes: number of bytes to process in each block.  Every loop iteration
 *        handles one full unrolled step of vectors, so the final
 *        iteration is not shortened if bytes is not a multiple of that
 *        step; callers presumably always pass a multiple.
 * ptrs:  block pointers, data blocks first, then P, then Q.
 */
void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	/* Same type as bytes, so the loop bound is not a mixed-sign compare. */
	unsigned long d;
	int z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = vdupq_n_u8(0x1d);

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		/* Seed both accumulators with the highest data block. */
		wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			wp$$ = veorq_u8(wp$$, wd$$);

			/*
			 * Multiply the running Q by 2: shift each byte left
			 * and XOR 0x1d into the bytes whose top bit was set.
			 */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);

			/* Then add (XOR) this block's data into Q. */
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}
860e833e69SArd Biesheuvel
/*
 * Fold the contribution of data blocks start..stop (inclusive) into the
 * existing P and Q blocks: the freshly computed partial parity and
 * syndrome are XORed onto what P and Q already hold.
 *
 * disks: number of block pointers in ptrs; ptrs[disks-2] is P and
 *        ptrs[disks-1] is Q.
 * start: lowest data block index that contributes.
 * stop:  highest data block index that contributes.
 * bytes: number of bytes to process in each block.  Every loop iteration
 *        handles one full unrolled step of vectors; callers presumably
 *        always pass a multiple of that step.
 * ptrs:  block pointers.
 */
void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
				    unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	/* Same type as bytes, so the loop bound is not a mixed-sign compare. */
	unsigned long d;
	int z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = vdupq_n_u8(0x1d);

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);

		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			wp$$ = veorq_u8(wp$$, wd$$);

			/* Q = Q * 2, then add this block's data. */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}

		/*
		 * P/Q left side optimization
		 *
		 * Blocks below start add no data, so Q only needs one more
		 * multiplication by 2 per remaining index.  Four of them are
		 * done per pass: the byte is shifted left by 4 and the four
		 * bits that fell off the top are folded back in as a
		 * polynomial product with 0x1d.
		 */
		for ( z = start-1 ; z >= 3 ; z -= 4 ) {
			w2$$ = vshrq_n_u8(wq$$, 4);
			w1$$ = vshlq_n_u8(wq$$, 4);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
		}

		/* Finish the 0..3 multiplications by 2 that are still owed. */
		switch (z) {
		case 2:
			/* Three left: shift by 3, fold the top 3 bits back. */
			w2$$ = vshrq_n_u8(wq$$, 5);
			w1$$ = vshlq_n_u8(wq$$, 3);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 1:
			/* Two left: shift by 2, fold the top 2 bits back. */
			w2$$ = vshrq_n_u8(wq$$, 6);
			w1$$ = vshlq_n_u8(wq$$, 2);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 0:
			/* One left: the plain shift-and-mask doubling. */
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		default:
			/* Nothing left to multiply. */
			break;
		}

		/* XOR the partial syndrome onto the Q already in memory. */
		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
		wq$$ = veorq_u8(wq$$, w1$$);

		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}
154