/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 ASIMD instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This is the "p8" fallback path for CPUs that lack the 64-bit PMULL
 * (Crypto Extensions) instruction: each 64x64 -> 128 bit carry-less
 * multiplication is synthesized from 8-bit polynomial multiplies
 * (pmull on .8b/.16b elements), using byte-rotated copies of both
 * operands and mask/shift fix-up of the overflowing partial products.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH	.req	v0		// hash key H, loaded from arg x3
	SHASH2	.req	v1		// H.lo ^ H.hi, for the Karatsuba middle term
	T1	.req	v2		// scratch
	T2	.req	v3		// scratch
	XM	.req	v5		// middle Karatsuba partial product
	XL	.req	v6		// low partial product / running digest
	XH	.req	v7		// high partial product
	IN1	.req	v7		// byte-rotated input block; aliases XH

	k00_16	.req	v8		// mask constants for the p8 fix-up
	k32_48	.req	v9		//  (see __pmull_pre_p8 for the values)

	t3	.req	v10		// temporaries used inside the
	t4	.req	v11		//  __pmull_p8_* multiply macros
	t5	.req	v12
	t6	.req	v13
	t7	.req	v14
	t8	.req	v15
	t9	.req	v16

	perm1	.req	v17		// tbl permutations producing the input
	perm2	.req	v18		//  rotated left by 1, 2 and 3 bytes
	perm3	.req	v19		//  (16-byte variant of ext #1/#2/#3)

	sh1	.req	v20		// SHASH rotated by 1..4 bytes,
	sh2	.req	v21		//  precomputed once per invocation
	sh3	.req	v22
	sh4	.req	v23

	ss1	.req	v24		// SHASH2 rotated by 1..4 bytes,
	ss2	.req	v25		//  precomputed once per invocation
	ss3	.req	v26
	ss4	.req	v27

	.text

	/*
	 * Lower-half 64x64 carry-less multiply: \rq = \ad * \bd, where
	 * \bd selects the precomputed rotated-operand set (SHASH/SHASH2).
	 * The rotated copies of \ad (A1..A3) are built with ext here.
	 */
	.macro	__pmull_p8, rq, ad, bd
	ext	t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext	t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext	t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	/*
	 * Upper-half variant (operates on the high 64 bits via pmull2);
	 * the byte rotations are done with tbl on the full 16-byte
	 * register, using the perm1..perm3 tables set up in __pmull_pre_p8.
	 */
	.macro	__pmull2_p8, rq, ad, bd
	tbl	t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl	t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl	t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	// Dispatch stubs: pick the rotated-B register set matching \bd.
	.macro	__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro	__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro	__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	/*
	 * Shared tail of the 8-bit-pmull based 64x64 multiply.
	 *   \rq       result
	 *   \ad, \bd  operands (t3/t5/t7 already hold \ad rotated by 1..3)
	 *   \b1..\b4  \bd rotated by 1..4 bytes (precomputed)
	 *   \nb, \t   ".8b"/empty for the low half, ".16b"/"2" to select
	 *             the pmull2 (upper half) forms
	 *
	 * The eight 8-bit pmulls produce partial products D..K; the
	 * uzp/zip + mask (k00_16/k32_48) + ext sequence aligns each
	 * partial sum to its byte offset and cancels the bits that
	 * overflowed past the 64-bit lane, then everything is XORed
	 * into D to form the exact 128-bit product.
	 */
	.macro	__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t	t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t	t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t	t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t	t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t	t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t	t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t	t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t	\rq\().8h, \ad, \bd			// D = A*B

	eor	t3.16b, t3.16b, t4.16b			// L = E + F
	eor	t5.16b, t5.16b, t6.16b			// M = G + H
	eor	t7.16b, t7.16b, t8.16b			// N = I + J

	// Interleave so that each d-lane pairs a partial sum with its
	// overflow half, ready for the mask-and-fold below.
	uzp1	t4.2d, t3.2d, t5.2d
	uzp2	t3.2d, t3.2d, t5.2d
	uzp1	t6.2d, t7.2d, t9.2d
	uzp2	t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor	t4.16b, t4.16b, t3.16b
	and	t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor	t6.16b, t6.16b, t7.16b
	and	t7.16b, t7.16b, k00_16.16b

	eor	t4.16b, t4.16b, t3.16b
	eor	t6.16b, t6.16b, t7.16b

	zip2	t5.2d, t4.2d, t3.2d
	zip1	t3.2d, t4.2d, t3.2d
	zip2	t9.2d, t6.2d, t7.2d
	zip1	t7.2d, t6.2d, t7.2d

	// Shift each corrected partial sum to its byte position
	// (<< 8, << 16, << 24, << 32 respectively).
	ext	t3.16b, t3.16b, t3.16b, #15
	ext	t5.16b, t5.16b, t5.16b, #14
	ext	t7.16b, t7.16b, t7.16b, #13
	ext	t9.16b, t9.16b, t9.16b, #12

	eor	t3.16b, t3.16b, t5.16b
	eor	t7.16b, t7.16b, t9.16b
	eor	\rq\().16b, \rq\().16b, t3.16b
	eor	\rq\().16b, \rq\().16b, t7.16b
	.endm

	/*
	 * One-time setup: derive SHASH2 = H.lo ^ H.hi, build the fix-up
	 * mask constants, the tbl rotation tables, and the rotated copies
	 * of SHASH/SHASH2 that are loop-invariant across all blocks.
	 * Clobbers x5 and T1.
	 */
	.macro	__pmull_pre_p8
	ext	SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor	SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi	k32_48.2d, #0xffffffff
	mov	k32_48.h[2], k32_48.h[0]
	ushr	k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q	x5, 0x080f0e0d0c0b0a09
	movi	T1.8b, #8
	dup	perm1.2d, x5
	eor	perm1.16b, perm1.16b, T1.16b
	ushr	perm2.2d, perm1.2d, #8
	ushr	perm3.2d, perm1.2d, #16
	ushr	T1.2d, perm1.2d, #24
	sli	perm2.2d, perm1.2d, #56
	sli	perm3.2d, perm1.2d, #48
	sli	T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl	sh1.16b, {SHASH.16b}, perm1.16b
	tbl	sh2.16b, {SHASH.16b}, perm2.16b
	tbl	sh3.16b, {SHASH.16b}, perm3.16b
	tbl	sh4.16b, {SHASH.16b}, T1.16b
	ext	ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext	ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext	ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext	ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	/*
	 * Fold the 256-bit Karatsuba result (XH:XM:XL with T1 holding the
	 * middle-term correction) back to 128 bits modulo the GHASH field
	 * polynomial, using shifts instead of a 64-bit pmull.  The shift
	 * amounts 63/62/57 (= 64-1, 64-2, 64-7) and 1/6 correspond to the
	 * x, x^2 and x^7 terms of x^128 + x^7 + x^2 + x + 1.
	 * On exit the caller still has to XOR T2^XH into XL (see the
	 * function body below).  Clobbers T1, T2.
	 */
	.macro	__pmull_reduce_p8
	eor	XM.16b, XM.16b, T1.16b

	mov	XL.d[1], XM.d[0]
	mov	XH.d[0], XM.d[1]

	shl	T1.2d, XL.2d, #57
	shl	T2.2d, XL.2d, #62
	eor	T2.16b, T2.16b, T1.16b
	shl	T1.2d, XL.2d, #63
	eor	T2.16b, T2.16b, T1.16b
	ext	T1.16b, XL.16b, XH.16b, #8
	eor	T2.16b, T2.16b, T1.16b

	mov	XL.d[1], T2.d[0]
	mov	XH.d[0], T2.d[1]

	ushr	T2.2d, XL.2d, #1
	eor	XH.16b, XH.16b, XL.16b
	eor	XL.16b, XL.16b, T2.16b
	ushr	T2.2d, T2.2d, #6
	ushr	XL.2d, XL.2d, #1
	.endm

	/*
	 * void pmull_ghash_update_p8(size_t blocks, struct polyval_elem *dg,
	 *			      const u8 *src,
	 *			      const struct polyval_elem *h)
	 *
	 * In:	x0 = number of 16-byte blocks to process
	 *	x1 = digest, read on entry and written back on exit
	 *	x2 = source data, advanced by 16 per block
	 *	x3 = hash key H
	 * Clobbers: x0, x2, x5, v0-v3, v5-v27, no stack use.
	 *
	 * NOTE(review): the loop tests x0 only after processing a block
	 * (do/while shape), so callers are assumed to pass blocks > 0 —
	 * confirm against the C-side caller.  Callee-saved v8-v15 are
	 * clobbered; presumably callers bracket this with
	 * kernel_neon_begin()/kernel_neon_end() — verify.
	 */
SYM_FUNC_START(pmull_ghash_update_p8)
	ld1	{SHASH.2d}, [x3]
	ld1	{XL.2d}, [x1]

	__pmull_pre_p8

0:	ld1	{T1.2d}, [x2], #16		// next plaintext block
	sub	x0, x0, #1

	/* multiply XL by SHASH in GF(2^128) */
	rev64	T1.16b, T1.16b			// GHASH is big-endian bit-reflected

	// Karatsuba pre-combine: XL ^= byte-swapped block (the GHASH
	// XOR of data into the digest), T1 = lo^hi for the middle term.
	ext	T2.16b, XL.16b, XL.16b, #8
	ext	IN1.16b, T1.16b, T1.16b, #8
	eor	T1.16b, T1.16b, T2.16b
	eor	XL.16b, XL.16b, IN1.16b

	__pmull2_p8	XH, XL, SHASH		// a1 * b1
	eor	T1.16b, T1.16b, XL.16b
	__pmull_p8	XL, XL, SHASH		// a0 * b0
	__pmull_p8	XM, T1, SHASH2		// (a1 + a0)(b1 + b0)

	// Karatsuba fix-up of the middle term, then modular reduction.
	eor	T2.16b, XL.16b, XH.16b
	ext	T1.16b, XL.16b, XH.16b, #8
	eor	XM.16b, XM.16b, T2.16b

	__pmull_reduce_p8

	eor	T2.16b, T2.16b, XH.16b
	eor	XL.16b, XL.16b, T2.16b

	cbnz	x0, 0b				// more blocks?

	st1	{XL.2d}, [x1]			// store updated digest
	ret
SYM_FUNC_END(pmull_ghash_update_p8)