1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (C) 2016 Romain Dolbeau. All rights reserved. 23 */ 24 25 #include <sys/types.h> 26 #include <sys/simd.h> 27 28 #ifdef __linux__ 29 #define __asm __asm__ __volatile__ 30 #endif 31 32 #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N 33 #define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) 34 35 #define VR0_(REG, ...) "%[w"#REG"]" 36 #define VR1_(_1, REG, ...) "%[w"#REG"]" 37 #define VR2_(_1, _2, REG, ...) "%[w"#REG"]" 38 #define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]" 39 #define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]" 40 #define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]" 41 #define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]" 42 #define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]" 43 44 /* 45 * Here we need registers not used otherwise. 46 * They will be used in unused ASM for the case 47 * with more registers than required... but GCC 48 * will still need to make sure the constraints 49 * are correct, and duplicate constraints are illegal 50 * ... and we use the "register" number as a name 51 */ 52 53 #define VR0(r...) VR0_(r) 54 #define VR1(r...) VR1_(r) 55 #define VR2(r...) VR2_(r, 36) 56 #define VR3(r...) VR3_(r, 36, 35) 57 #define VR4(r...) VR4_(r, 36, 35, 34, 33) 58 #define VR5(r...) VR5_(r, 36, 35, 34, 33, 32) 59 #define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31) 60 #define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30) 61 62 #define VR(X) "%[w"#X"]" 63 64 #define RVR0_(REG, ...) [w##REG] "w" (w##REG) 65 #define RVR1_(_1, REG, ...) [w##REG] "w" (w##REG) 66 #define RVR2_(_1, _2, REG, ...) [w##REG] "w" (w##REG) 67 #define RVR3_(_1, _2, _3, REG, ...) [w##REG] "w" (w##REG) 68 #define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "w" (w##REG) 69 #define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "w" (w##REG) 70 #define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "w" (w##REG) 71 #define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "w" (w##REG) 72 73 #define RVR0(r...) RVR0_(r) 74 #define RVR1(r...) RVR1_(r) 75 #define RVR2(r...) RVR2_(r, 36) 76 #define RVR3(r...) RVR3_(r, 36, 35) 77 #define RVR4(r...) RVR4_(r, 36, 35, 34, 33) 78 #define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32) 79 #define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31) 80 #define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30) 81 82 #define RVR(X) [w##X] "w" (w##X) 83 84 #define WVR0_(REG, ...) [w##REG] "=w" (w##REG) 85 #define WVR1_(_1, REG, ...) [w##REG] "=w" (w##REG) 86 #define WVR2_(_1, _2, REG, ...) [w##REG] "=w" (w##REG) 87 #define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=w" (w##REG) 88 #define WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=w" (w##REG) 89 #define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=w" (w##REG) 90 #define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=w" (w##REG) 91 #define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=w" (w##REG) 92 93 #define WVR0(r...) WVR0_(r) 94 #define WVR1(r...) WVR1_(r) 95 #define WVR2(r...) WVR2_(r, 36) 96 #define WVR3(r...) WVR3_(r, 36, 35) 97 #define WVR4(r...) WVR4_(r, 36, 35, 34, 33) 98 #define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32) 99 #define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31) 100 #define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30) 101 102 #define WVR(X) [w##X] "=w" (w##X) 103 104 #define UVR0_(REG, ...) [w##REG] "+&w" (w##REG) 105 #define UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG) 106 #define UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG) 107 #define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG) 108 #define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG) 109 #define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG) 110 #define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG) 111 #define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG) 112 113 #define UVR0(r...) UVR0_(r) 114 #define UVR1(r...) UVR1_(r) 115 #define UVR2(r...) UVR2_(r, 36) 116 #define UVR3(r...) UVR3_(r, 36, 35) 117 #define UVR4(r...) UVR4_(r, 36, 35, 34, 33) 118 #define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32) 119 #define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31) 120 #define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30) 121 122 #define UVR(X) [w##X] "+&w" (w##X) 123 124 #define R_01(REG1, REG2, ...) REG1, REG2 125 #define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 126 #define R_23(REG...) _R_23(REG, 1, 2, 3) 127 128 #define ZFS_ASM_BUG() ASSERT(0) 129 130 #define OFFSET(ptr, val) (((unsigned char *)(ptr))+val) 131 132 extern const uint8_t gf_clmul_mod_lt[4*256][16]; 133 134 #define ELEM_SIZE 16 135 136 typedef struct v { 137 uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); 138 } v_t; 139 140 #define XOR_ACC(src, r...) \ 141 { \ 142 switch (REG_CNT(r)) { \ 143 case 8: \ 144 __asm( \ 145 "ld1 { v21.4s },%[SRC0]\n" \ 146 "ld1 { v20.4s },%[SRC1]\n" \ 147 "ld1 { v19.4s },%[SRC2]\n" \ 148 "ld1 { v18.4s },%[SRC3]\n" \ 149 "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \ 150 "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \ 151 "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \ 152 "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \ 153 "ld1 { v21.4s },%[SRC4]\n" \ 154 "ld1 { v20.4s },%[SRC5]\n" \ 155 "ld1 { v19.4s },%[SRC6]\n" \ 156 "ld1 { v18.4s },%[SRC7]\n" \ 157 "eor " VR4(r) ".16b," VR4(r) ".16b,v21.16b\n" \ 158 "eor " VR5(r) ".16b," VR5(r) ".16b,v20.16b\n" \ 159 "eor " VR6(r) ".16b," VR6(r) ".16b,v19.16b\n" \ 160 "eor " VR7(r) ".16b," VR7(r) ".16b,v18.16b\n" \ 161 : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \ 162 UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ 163 : [SRC0] "Q" (*(OFFSET(src, 0))), \ 164 [SRC1] "Q" (*(OFFSET(src, 16))), \ 165 [SRC2] "Q" (*(OFFSET(src, 32))), \ 166 [SRC3] "Q" (*(OFFSET(src, 48))), \ 167 [SRC4] "Q" (*(OFFSET(src, 64))), \ 168 [SRC5] "Q" (*(OFFSET(src, 80))), \ 169 [SRC6] "Q" (*(OFFSET(src, 96))), \ 170 [SRC7] "Q" (*(OFFSET(src, 112))) \ 171 : "v18", "v19", "v20", "v21"); \ 172 break; \ 173 case 4: \ 174 __asm( \ 175 "ld1 { v21.4s },%[SRC0]\n" \ 176 "ld1 { v20.4s },%[SRC1]\n" \ 177 "ld1 { v19.4s },%[SRC2]\n" \ 178 "ld1 { v18.4s },%[SRC3]\n" \ 179 "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \ 180 "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \ 181 "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \ 182 "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \ 183 : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ 184 : [SRC0] "Q" (*(OFFSET(src, 0))), \ 185 [SRC1] "Q" (*(OFFSET(src, 16))), \ 186 [SRC2] "Q" (*(OFFSET(src, 32))), \ 187 [SRC3] "Q" (*(OFFSET(src, 48))) \ 188 : "v18", "v19", "v20", "v21"); \ 189 break; \ 190 case 2: \ 191 __asm( \ 192 "ld1 { v21.4s },%[SRC0]\n" \ 193 "ld1 { v20.4s },%[SRC1]\n" \ 194 "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \ 195 "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \ 196 : UVR0(r), UVR1(r) \ 197 : [SRC0] "Q" (*(OFFSET(src, 0))), \ 198 [SRC1] "Q" (*(OFFSET(src, 16))) \ 199 : "v20", "v21"); \ 200 break; \ 201 default: \ 202 ZFS_ASM_BUG(); \ 203 } \ 204 } 205 206 #define XOR(r...) \ 207 { \ 208 switch (REG_CNT(r)) { \ 209 case 8: \ 210 __asm( \ 211 "eor " VR4(r) ".16b," VR4(r) ".16b," VR0(r) ".16b\n" \ 212 "eor " VR5(r) ".16b," VR5(r) ".16b," VR1(r) ".16b\n" \ 213 "eor " VR6(r) ".16b," VR6(r) ".16b," VR2(r) ".16b\n" \ 214 "eor " VR7(r) ".16b," VR7(r) ".16b," VR3(r) ".16b\n" \ 215 : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ 216 : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ 217 break; \ 218 case 4: \ 219 __asm( \ 220 "eor " VR2(r) ".16b," VR2(r) ".16b," VR0(r) ".16b\n" \ 221 "eor " VR3(r) ".16b," VR3(r) ".16b," VR1(r) ".16b\n" \ 222 : UVR2(r), UVR3(r) \ 223 : RVR0(r), RVR1(r)); \ 224 break; \ 225 default: \ 226 ZFS_ASM_BUG(); \ 227 } \ 228 } 229 230 #define ZERO(r...) \ 231 { \ 232 switch (REG_CNT(r)) { \ 233 case 8: \ 234 __asm( \ 235 "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ 236 "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ 237 "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \ 238 "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \ 239 "eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n" \ 240 "eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n" \ 241 "eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n" \ 242 "eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n" \ 243 : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ 244 WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \ 245 break; \ 246 case 4: \ 247 __asm( \ 248 "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ 249 "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ 250 "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \ 251 "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \ 252 : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \ 253 break; \ 254 case 2: \ 255 __asm( \ 256 "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ 257 "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ 258 : WVR0(r), WVR1(r)); \ 259 break; \ 260 default: \ 261 ZFS_ASM_BUG(); \ 262 } \ 263 } 264 265 #define COPY(r...) \ 266 { \ 267 switch (REG_CNT(r)) { \ 268 case 8: \ 269 __asm( \ 270 "mov " VR4(r) ".16b," VR0(r) ".16b\n" \ 271 "mov " VR5(r) ".16b," VR1(r) ".16b\n" \ 272 "mov " VR6(r) ".16b," VR2(r) ".16b\n" \ 273 "mov " VR7(r) ".16b," VR3(r) ".16b\n" \ 274 : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ 275 : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ 276 break; \ 277 case 4: \ 278 __asm( \ 279 "mov " VR2(r) ".16b," VR0(r) ".16b\n" \ 280 "mov " VR3(r) ".16b," VR1(r) ".16b\n" \ 281 : WVR2(r), WVR3(r) \ 282 : RVR0(r), RVR1(r)); \ 283 break; \ 284 default: \ 285 ZFS_ASM_BUG(); \ 286 } \ 287 } 288 289 #define LOAD(src, r...) \ 290 { \ 291 switch (REG_CNT(r)) { \ 292 case 8: \ 293 __asm( \ 294 "ld1 { " VR0(r) ".4s },%[SRC0]\n" \ 295 "ld1 { " VR1(r) ".4s },%[SRC1]\n" \ 296 "ld1 { " VR2(r) ".4s },%[SRC2]\n" \ 297 "ld1 { " VR3(r) ".4s },%[SRC3]\n" \ 298 "ld1 { " VR4(r) ".4s },%[SRC4]\n" \ 299 "ld1 { " VR5(r) ".4s },%[SRC5]\n" \ 300 "ld1 { " VR6(r) ".4s },%[SRC6]\n" \ 301 "ld1 { " VR7(r) ".4s },%[SRC7]\n" \ 302 : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ 303 WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ 304 : [SRC0] "Q" (*(OFFSET(src, 0))), \ 305 [SRC1] "Q" (*(OFFSET(src, 16))), \ 306 [SRC2] "Q" (*(OFFSET(src, 32))), \ 307 [SRC3] "Q" (*(OFFSET(src, 48))), \ 308 [SRC4] "Q" (*(OFFSET(src, 64))), \ 309 [SRC5] "Q" (*(OFFSET(src, 80))), \ 310 [SRC6] "Q" (*(OFFSET(src, 96))), \ 311 [SRC7] "Q" (*(OFFSET(src, 112)))); \ 312 break; \ 313 case 4: \ 314 __asm( \ 315 "ld1 { " VR0(r) ".4s },%[SRC0]\n" \ 316 "ld1 { " VR1(r) ".4s },%[SRC1]\n" \ 317 "ld1 { " VR2(r) ".4s },%[SRC2]\n" \ 318 "ld1 { " VR3(r) ".4s },%[SRC3]\n" \ 319 : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \ 320 : [SRC0] "Q" (*(OFFSET(src, 0))), \ 321 [SRC1] "Q" (*(OFFSET(src, 16))), \ 322 [SRC2] "Q" (*(OFFSET(src, 32))), \ 323 [SRC3] "Q" (*(OFFSET(src, 48)))); \ 324 break; \ 325 case 2: \ 326 __asm( \ 327 "ld1 { " VR0(r) ".4s },%[SRC0]\n" \ 328 "ld1 { " VR1(r) ".4s },%[SRC1]\n" \ 329 : WVR0(r), WVR1(r) \ 330 : [SRC0] "Q" (*(OFFSET(src, 0))), \ 331 [SRC1] "Q" (*(OFFSET(src, 16)))); \ 332 break; \ 333 default: \ 334 ZFS_ASM_BUG(); \ 335 } \ 336 } 337 338 #define STORE(dst, r...) \ 339 { \ 340 switch (REG_CNT(r)) { \ 341 case 8: \ 342 __asm( \ 343 "st1 { " VR0(r) ".4s },%[DST0]\n" \ 344 "st1 { " VR1(r) ".4s },%[DST1]\n" \ 345 "st1 { " VR2(r) ".4s },%[DST2]\n" \ 346 "st1 { " VR3(r) ".4s },%[DST3]\n" \ 347 "st1 { " VR4(r) ".4s },%[DST4]\n" \ 348 "st1 { " VR5(r) ".4s },%[DST5]\n" \ 349 "st1 { " VR6(r) ".4s },%[DST6]\n" \ 350 "st1 { " VR7(r) ".4s },%[DST7]\n" \ 351 : [DST0] "=Q" (*(OFFSET(dst, 0))), \ 352 [DST1] "=Q" (*(OFFSET(dst, 16))), \ 353 [DST2] "=Q" (*(OFFSET(dst, 32))), \ 354 [DST3] "=Q" (*(OFFSET(dst, 48))), \ 355 [DST4] "=Q" (*(OFFSET(dst, 64))), \ 356 [DST5] "=Q" (*(OFFSET(dst, 80))), \ 357 [DST6] "=Q" (*(OFFSET(dst, 96))), \ 358 [DST7] "=Q" (*(OFFSET(dst, 112))) \ 359 : RVR0(r), RVR1(r), RVR2(r), RVR3(r), \ 360 RVR4(r), RVR5(r), RVR6(r), RVR7(r)); \ 361 break; \ 362 case 4: \ 363 __asm( \ 364 "st1 { " VR0(r) ".4s },%[DST0]\n" \ 365 "st1 { " VR1(r) ".4s },%[DST1]\n" \ 366 "st1 { " VR2(r) ".4s },%[DST2]\n" \ 367 "st1 { " VR3(r) ".4s },%[DST3]\n" \ 368 : [DST0] "=Q" (*(OFFSET(dst, 0))), \ 369 [DST1] "=Q" (*(OFFSET(dst, 16))), \ 370 [DST2] "=Q" (*(OFFSET(dst, 32))), \ 371 [DST3] "=Q" (*(OFFSET(dst, 48))) \ 372 : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ 373 break; \ 374 case 2: \ 375 __asm( \ 376 "st1 { " VR0(r) ".4s },%[DST0]\n" \ 377 "st1 { " VR1(r) ".4s },%[DST1]\n" \ 378 : [DST0] "=Q" (*(OFFSET(dst, 0))), \ 379 [DST1] "=Q" (*(OFFSET(dst, 16))) \ 380 : RVR0(r), RVR1(r)); \ 381 break; \ 382 default: \ 383 ZFS_ASM_BUG(); \ 384 } \ 385 } 386 387 /* 388 * Unfortunately cannot use the macro, because GCC 389 * will try to use the macro name and not value 390 * later on... 391 * Kept as a reference to what a numbered variable is 392 */ 393 #define _00 "v17" 394 #define _1d "v16" 395 #define _temp0 "v19" 396 #define _temp1 "v18" 397 398 #define MUL2_SETUP() \ 399 { \ 400 __asm( \ 401 "eor " VR(17) ".16b," VR(17) ".16b," VR(17) ".16b\n" \ 402 "movi " VR(16) ".16b,#0x1d\n" \ 403 : WVR(16), WVR(17)); \ 404 } 405 406 #define MUL2(r...) \ 407 { \ 408 switch (REG_CNT(r)) { \ 409 case 4: \ 410 __asm( \ 411 "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \ 412 "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \ 413 "cmgt v21.16b," VR(17) ".16b," VR2(r) ".16b\n" \ 414 "cmgt v20.16b," VR(17) ".16b," VR3(r) ".16b\n" \ 415 "and v19.16b,v19.16b," VR(16) ".16b\n" \ 416 "and v18.16b,v18.16b," VR(16) ".16b\n" \ 417 "and v21.16b,v21.16b," VR(16) ".16b\n" \ 418 "and v20.16b,v20.16b," VR(16) ".16b\n" \ 419 "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \ 420 "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \ 421 "shl " VR2(r) ".16b," VR2(r) ".16b,#1\n" \ 422 "shl " VR3(r) ".16b," VR3(r) ".16b,#1\n" \ 423 "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \ 424 "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \ 425 "eor " VR2(r) ".16b,v21.16b," VR2(r) ".16b\n" \ 426 "eor " VR3(r) ".16b,v20.16b," VR3(r) ".16b\n" \ 427 : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ 428 : RVR(17), RVR(16) \ 429 : "v18", "v19", "v20", "v21"); \ 430 break; \ 431 case 2: \ 432 __asm( \ 433 "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \ 434 "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \ 435 "and v19.16b,v19.16b," VR(16) ".16b\n" \ 436 "and v18.16b,v18.16b," VR(16) ".16b\n" \ 437 "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \ 438 "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \ 439 "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \ 440 "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \ 441 : UVR0(r), UVR1(r) \ 442 : RVR(17), RVR(16) \ 443 : "v18", "v19"); \ 444 break; \ 445 default: \ 446 ZFS_ASM_BUG(); \ 447 } \ 448 } 449 450 #define MUL4(r...) \ 451 { \ 452 MUL2(r); \ 453 MUL2(r); \ 454 } 455 456 /* 457 * Unfortunately cannot use the macro, because GCC 458 * will try to use the macro name and not value 459 * later on... 460 * Kept as a reference to what a register is 461 * (here we're using actual registers for the 462 * clobbered ones) 463 */ 464 #define _0f "v15" 465 #define _a_save "v14" 466 #define _b_save "v13" 467 #define _lt_mod_a "v12" 468 #define _lt_clmul_a "v11" 469 #define _lt_mod_b "v10" 470 #define _lt_clmul_b "v15" 471 472 #define _MULx2(c, r...) \ 473 { \ 474 switch (REG_CNT(r)) { \ 475 case 2: \ 476 __asm( \ 477 /* lts for upper part */ \ 478 "movi v15.16b,#0x0f\n" \ 479 "ld1 { v10.4s },%[lt0]\n" \ 480 "ld1 { v11.4s },%[lt1]\n" \ 481 /* upper part */ \ 482 "and v14.16b," VR0(r) ".16b,v15.16b\n" \ 483 "and v13.16b," VR1(r) ".16b,v15.16b\n" \ 484 "ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n" \ 485 "ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n" \ 486 \ 487 "tbl v12.16b,{v10.16b}," VR0(r) ".16b\n" \ 488 "tbl v10.16b,{v10.16b}," VR1(r) ".16b\n" \ 489 "tbl v15.16b,{v11.16b}," VR0(r) ".16b\n" \ 490 "tbl v11.16b,{v11.16b}," VR1(r) ".16b\n" \ 491 \ 492 "eor " VR0(r) ".16b,v15.16b,v12.16b\n" \ 493 "eor " VR1(r) ".16b,v11.16b,v10.16b\n" \ 494 /* lts for lower part */ \ 495 "ld1 { v10.4s },%[lt2]\n" \ 496 "ld1 { v15.4s },%[lt3]\n" \ 497 /* lower part */ \ 498 "tbl v12.16b,{v10.16b},v14.16b\n" \ 499 "tbl v10.16b,{v10.16b},v13.16b\n" \ 500 "tbl v11.16b,{v15.16b},v14.16b\n" \ 501 "tbl v15.16b,{v15.16b},v13.16b\n" \ 502 \ 503 "eor " VR0(r) ".16b," VR0(r) ".16b,v12.16b\n" \ 504 "eor " VR1(r) ".16b," VR1(r) ".16b,v10.16b\n" \ 505 "eor " VR0(r) ".16b," VR0(r) ".16b,v11.16b\n" \ 506 "eor " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n" \ 507 : UVR0(r), UVR1(r) \ 508 : [lt0] "Q" ((gf_clmul_mod_lt[4*(c)+0][0])), \ 509 [lt1] "Q" ((gf_clmul_mod_lt[4*(c)+1][0])), \ 510 [lt2] "Q" ((gf_clmul_mod_lt[4*(c)+2][0])), \ 511 [lt3] "Q" ((gf_clmul_mod_lt[4*(c)+3][0])) \ 512 : "v10", "v11", "v12", "v13", "v14", "v15"); \ 513 break; \ 514 default: \ 515 ZFS_ASM_BUG(); \ 516 } \ 517 } 518 519 #define MUL(c, r...) \ 520 { \ 521 switch (REG_CNT(r)) { \ 522 case 4: \ 523 _MULx2(c, R_23(r)); \ 524 _MULx2(c, R_01(r)); \ 525 break; \ 526 case 2: \ 527 _MULx2(c, R_01(r)); \ 528 break; \ 529 default: \ 530 ZFS_ASM_BUG(); \ 531 } \ 532 } 533 534 #define raidz_math_begin() kfpu_begin() 535 #define raidz_math_end() kfpu_end() 536 537 /* Overkill... */ 538 #if defined(_KERNEL) 539 #define GEN_X_DEFINE_0_3() \ 540 register unsigned char w0 asm("v0") __attribute__((vector_size(16))); \ 541 register unsigned char w1 asm("v1") __attribute__((vector_size(16))); \ 542 register unsigned char w2 asm("v2") __attribute__((vector_size(16))); \ 543 register unsigned char w3 asm("v3") __attribute__((vector_size(16))); 544 #define GEN_X_DEFINE_4_5() \ 545 register unsigned char w4 asm("v4") __attribute__((vector_size(16))); \ 546 register unsigned char w5 asm("v5") __attribute__((vector_size(16))); 547 #define GEN_X_DEFINE_6_7() \ 548 register unsigned char w6 asm("v6") __attribute__((vector_size(16))); \ 549 register unsigned char w7 asm("v7") __attribute__((vector_size(16))); 550 #define GEN_X_DEFINE_8_9() \ 551 register unsigned char w8 asm("v8") __attribute__((vector_size(16))); \ 552 register unsigned char w9 asm("v9") __attribute__((vector_size(16))); 553 #define GEN_X_DEFINE_10_11() \ 554 register unsigned char w10 asm("v10") __attribute__((vector_size(16))); \ 555 register unsigned char w11 asm("v11") __attribute__((vector_size(16))); 556 #define GEN_X_DEFINE_12_15() \ 557 register unsigned char w12 asm("v12") __attribute__((vector_size(16))); \ 558 register unsigned char w13 asm("v13") __attribute__((vector_size(16))); \ 559 register unsigned char w14 asm("v14") __attribute__((vector_size(16))); \ 560 register unsigned char w15 asm("v15") __attribute__((vector_size(16))); 561 #define GEN_X_DEFINE_16() \ 562 register unsigned char w16 asm("v16") __attribute__((vector_size(16))); 563 #define GEN_X_DEFINE_17() \ 564 register unsigned char w17 asm("v17") __attribute__((vector_size(16))); 565 #define GEN_X_DEFINE_18_21() \ 566 register unsigned char w18 asm("v18") __attribute__((vector_size(16))); \ 567 register unsigned char w19 asm("v19") __attribute__((vector_size(16))); \ 568 register unsigned char w20 asm("v20") __attribute__((vector_size(16))); \ 569 register unsigned char w21 asm("v21") __attribute__((vector_size(16))); 570 #define GEN_X_DEFINE_22_23() \ 571 register unsigned char w22 asm("v22") __attribute__((vector_size(16))); \ 572 register unsigned char w23 asm("v23") __attribute__((vector_size(16))); 573 #define GEN_X_DEFINE_24_27() \ 574 register unsigned char w24 asm("v24") __attribute__((vector_size(16))); \ 575 register unsigned char w25 asm("v25") __attribute__((vector_size(16))); \ 576 register unsigned char w26 asm("v26") __attribute__((vector_size(16))); \ 577 register unsigned char w27 asm("v27") __attribute__((vector_size(16))); 578 #define GEN_X_DEFINE_28_30() \ 579 register unsigned char w28 asm("v28") __attribute__((vector_size(16))); \ 580 register unsigned char w29 asm("v29") __attribute__((vector_size(16))); \ 581 register unsigned char w30 asm("v30") __attribute__((vector_size(16))); 582 #define GEN_X_DEFINE_31() \ 583 register unsigned char w31 asm("v31") __attribute__((vector_size(16))); 584 #define GEN_X_DEFINE_32() \ 585 register unsigned char w32 asm("v31") __attribute__((vector_size(16))); 586 #define GEN_X_DEFINE_33_36() \ 587 register unsigned char w33 asm("v31") __attribute__((vector_size(16))); \ 588 register unsigned char w34 asm("v31") __attribute__((vector_size(16))); \ 589 register unsigned char w35 asm("v31") __attribute__((vector_size(16))); \ 590 register unsigned char w36 asm("v31") __attribute__((vector_size(16))); 591 #define GEN_X_DEFINE_37_38() \ 592 register unsigned char w37 asm("v31") __attribute__((vector_size(16))); \ 593 register unsigned char w38 asm("v31") __attribute__((vector_size(16))); 594 #define GEN_X_DEFINE_ALL() \ 595 GEN_X_DEFINE_0_3() \ 596 GEN_X_DEFINE_4_5() \ 597 GEN_X_DEFINE_6_7() \ 598 GEN_X_DEFINE_8_9() \ 599 GEN_X_DEFINE_10_11() \ 600 GEN_X_DEFINE_12_15() \ 601 GEN_X_DEFINE_16() \ 602 GEN_X_DEFINE_17() \ 603 GEN_X_DEFINE_18_21() \ 604 GEN_X_DEFINE_22_23() \ 605 GEN_X_DEFINE_24_27() \ 606 GEN_X_DEFINE_28_30() \ 607 GEN_X_DEFINE_31() \ 608 GEN_X_DEFINE_32() \ 609 GEN_X_DEFINE_33_36() \ 610 GEN_X_DEFINE_37_38() 611 #else 612 #define GEN_X_DEFINE_0_3() \ 613 unsigned char w0 __attribute__((vector_size(16))); \ 614 unsigned char w1 __attribute__((vector_size(16))); \ 615 unsigned char w2 __attribute__((vector_size(16))); \ 616 unsigned char w3 __attribute__((vector_size(16))); 617 #define GEN_X_DEFINE_4_5() \ 618 unsigned char w4 __attribute__((vector_size(16))); \ 619 unsigned char w5 __attribute__((vector_size(16))); 620 #define GEN_X_DEFINE_6_7() \ 621 unsigned char w6 __attribute__((vector_size(16))); \ 622 unsigned char w7 __attribute__((vector_size(16))); 623 #define GEN_X_DEFINE_8_9() \ 624 unsigned char w8 __attribute__((vector_size(16))); \ 625 unsigned char w9 __attribute__((vector_size(16))); 626 #define GEN_X_DEFINE_10_11() \ 627 unsigned char w10 __attribute__((vector_size(16))); \ 628 unsigned char w11 __attribute__((vector_size(16))); 629 #define GEN_X_DEFINE_12_15() \ 630 unsigned char w12 __attribute__((vector_size(16))); \ 631 unsigned char w13 __attribute__((vector_size(16))); \ 632 unsigned char w14 __attribute__((vector_size(16))); \ 633 unsigned char w15 __attribute__((vector_size(16))); 634 #define GEN_X_DEFINE_16() \ 635 unsigned char w16 __attribute__((vector_size(16))); 636 #define GEN_X_DEFINE_17() \ 637 unsigned char w17 __attribute__((vector_size(16))); 638 #define GEN_X_DEFINE_18_21() \ 639 unsigned char w18 __attribute__((vector_size(16))); \ 640 unsigned char w19 __attribute__((vector_size(16))); \ 641 unsigned char w20 __attribute__((vector_size(16))); \ 642 unsigned char w21 __attribute__((vector_size(16))); 643 #define GEN_X_DEFINE_22_23() \ 644 unsigned char w22 __attribute__((vector_size(16))); \ 645 unsigned char w23 __attribute__((vector_size(16))); 646 #define GEN_X_DEFINE_24_27() \ 647 unsigned char w24 __attribute__((vector_size(16))); \ 648 unsigned char w25 __attribute__((vector_size(16))); \ 649 unsigned char w26 __attribute__((vector_size(16))); \ 650 unsigned char w27 __attribute__((vector_size(16))); 651 #define GEN_X_DEFINE_28_30() \ 652 unsigned char w28 __attribute__((vector_size(16))); \ 653 unsigned char w29 __attribute__((vector_size(16))); \ 654 unsigned char w30 __attribute__((vector_size(16))); 655 #define GEN_X_DEFINE_31() \ 656 unsigned char w31 __attribute__((vector_size(16))); 657 #define GEN_X_DEFINE_32() \ 658 unsigned char w32 __attribute__((vector_size(16))); 659 #define GEN_X_DEFINE_33_36() \ 660 unsigned char w33 __attribute__((vector_size(16))); \ 661 unsigned char w34 __attribute__((vector_size(16))); \ 662 unsigned char w35 __attribute__((vector_size(16))); \ 663 unsigned char w36 __attribute__((vector_size(16))); 664 #define GEN_X_DEFINE_37_38() \ 665 unsigned char w37 __attribute__((vector_size(16))); \ 666 unsigned char w38 __attribute__((vector_size(16))); 667 #define GEN_X_DEFINE_ALL() \ 668 GEN_X_DEFINE_0_3() \ 669 GEN_X_DEFINE_4_5() \ 670 GEN_X_DEFINE_6_7() \ 671 GEN_X_DEFINE_8_9() \ 672 GEN_X_DEFINE_10_11() \ 673 GEN_X_DEFINE_12_15() \ 674 GEN_X_DEFINE_16() \ 675 GEN_X_DEFINE_17() \ 676 GEN_X_DEFINE_18_21() \ 677 GEN_X_DEFINE_22_23() \ 678 GEN_X_DEFINE_24_27() \ 679 GEN_X_DEFINE_28_30() \ 680 GEN_X_DEFINE_31() \ 681 GEN_X_DEFINE_32() \ 682 GEN_X_DEFINE_33_36() \ 683 GEN_X_DEFINE_37_38() 684 #endif 685