// SPDX-License-Identifier: GPL-2.0-only
/*
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/cache.h>
#include <asm/neon-intrinsics.h>
#include "xor_impl.h"
#include "xor_arch.h"
#include "xor-neon.h"

static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
			 const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
			 const unsigned long * __restrict p2,
			 const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
			 const unsigned long * __restrict p2,
			 const unsigned long * __restrict p3,
			 const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
			 const unsigned long * __restrict p2,
			 const unsigned long * __restrict p3,
			 const unsigned long * __restrict p4,
			 const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
		__xor_neon_5);

static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}

static void __xor_eor3_3(unsigned long bytes, unsigned long * __restrict p1,
			 const unsigned long * __restrict p2,
			 const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void __xor_eor3_4(unsigned long bytes, unsigned long * __restrict p1,
			 const unsigned long * __restrict p2,
			 const unsigned long * __restrict p3,
			 const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void __xor_eor3_5(unsigned long bytes, unsigned long * __restrict p1,
			 const unsigned long * __restrict p2,
			 const unsigned long * __restrict p3,
			 const unsigned long * __restrict p4,
			 const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

__DO_XOR_BLOCKS(eor3_inner, __xor_neon_2, __xor_eor3_3, __xor_eor3_4,
		__xor_eor3_5);
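
/*
 * Note on runtime selection: the eor3()-based variants rely on the FEAT_SHA3
 * EOR3 instruction, which folds two vector XORs into a single op, so they
 * must only be used when the CPU advertises SHA3. How __DO_XOR_BLOCKS()
 * wires these inner loops into the xor templates is defined in xor_impl.h
 * and is not visible from this file; purely as a reference sketch, mainline
 * arch/arm64/lib/xor-neon.c performs the equivalent selection roughly like
 * this (using mainline names such as xor_block_inner_neon and
 * xor_arm64_eor3_3, not the names defined here):
 *
 *	static int __init xor_neon_init(void)
 *	{
 *		if (IS_ENABLED(CONFIG_AS_HAS_SHA3) &&
 *		    cpu_have_named_feature(SHA3)) {
 *			xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
 *			xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
 *			xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
 *		}
 *		return 0;
 *	}
 *	module_init(xor_neon_init);
 */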