/* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
#include "arm_arch.h"

.section .rodata

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	// mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:		// sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv:	// inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Decryption stuff
//
.Lk_dipt:	// decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
// Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	// rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2
.size	_vpaes_consts,.-_vpaes_consts
.align	6

.text

//
// _aes_preheat
//
// Fills register %r10 -> .aes_consts (so you can -fPIC)
// and %xmm9-%xmm15 as specified below.
//
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, #:lo12:.Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d,v19.2d}, [x10],#32		// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]	// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

//
// _aes_encrypt_core
//
// AES-encrypt %xmm0.
//
// Inputs:
// %xmm0 = input
// %xmm9-%xmm15 as in _vpaes_preheat
// (%rdx) = scheduled keys
//
// Output in %xmm0
// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
// Preserves %xmm6 - %xmm8 so you get some local vectors
//
//
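//
// A minimal C sketch of the nibble-split lookup idiom used throughout
// this file (illustrative only; the identifiers below are not part of
// the generated code):
//
//   uint8_t lo = b & 0x0f;             // low nibble selects from the "lo" table
//   uint8_t hi = b >> 4;               // high nibble selects from the "hi" table
//   uint8_t r  = tbl_lo[lo] ^ tbl_hi[hi];
//
// Each tbl instruction below performs sixteen such lookups at once,
// matching the pshufb sequence shown in the per-line x86_64 comments.
//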
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, #:lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Lenc_entry

.align	4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and	$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

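//
// Note: vpaes_encrypt is intended as a drop-in for AES_encrypt; its C
// prototype is expected to be of the form (assumption, for reference only)
//
//   void vpaes_encrypt(const unsigned char *in, unsigned char *out,
//                      const AES_KEY *key);
//
// i.e. x0 = input block, x1 = output block, x2 = expanded key, with the
// round count read from offset 240 of the key structure.
//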
.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt

.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, #:lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	tbl	v9.16b, {v20.16b}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Lenc_2x_entry

.align	4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	tbl	v8.16b, {v24.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)		// and	$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1			// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	tbl	v8.16b, {v23.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b		// vpshufb	%xmm1,	%xmm0,	%xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, #:lo12:.Lk_inv
	movi	v17.16b, #0x0f
	adrp	x11, .Lk_dipt
	add	x11, x11, #:lo12:.Lk_dipt
	ld1	{v18.2d,v19.2d}, [x10],#32		// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]	// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

//
// Decryption core
//
// Same API as encryption core.
//
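//
// Rough scalar sketch of one pass of .Ldec_loop below (illustrative only;
// sbXu/sbXt stand for the .Lk_dsb9/.Lk_dsbd/.Lk_dsbb/.Lk_dsbe halves):
//
//   ch = sb9u[lo] ^ sb9t[hi] ^ round_key;
//   ch = MC(ch) ^ sbdu[lo] ^ sbdt[hi];
//   ch = MC(ch) ^ sbbu[lo] ^ sbbt[hi];
//   ch = MC(ch) ^ sbeu[lo] ^ sbet[hi];
//
// where MC() is the byte permutation applied via "tbl ..., v5.16b" and v5
// is rotated with "ext #12" once per round, folding InvMixColumns into
// the table lookups.
//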
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor	$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, #:lo12:.Lk_sr
	and	x11, x11, #0x30			// and	$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, #:lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Ldec_entry

.align	4
.Ldec_loop:
//
// Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0	# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 4 = ch
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0	# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 4 = ch
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0	# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr	$12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
	sub	w8, w8, #1			// sub	$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor	$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, #:lo12:.Lk_sr
	and	x11, x11, #0x30			// and	$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, #:lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	tbl	v10.16b, {v20.16b},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	tbl	v8.16b, {v21.16b},v8.16b
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Ldec_2x_entry

.align	4
.Ldec_2x_loop:
//
// Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sb9u
	tbl	v12.16b, {v24.16b}, v10.16b
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb9t
	tbl	v9.16b, {v25.16b}, v11.16b
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
	eor	v8.16b, v12.16b, v16.16b
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b		// vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbdu
	tbl	v12.16b, {v26.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5,	%xmm0,	%xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sbdt
	tbl	v9.16b, {v27.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbbu
	tbl	v12.16b, {v28.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5,	%xmm0,	%xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sbbt
	tbl	v9.16b, {v29.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbeu
	tbl	v12.16b, {v30.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5,	%xmm0,	%xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sbet
	tbl	v9.16b, {v31.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr	$12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1			// sub	$1,%rax			# nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	tbl	v10.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	tbl	v9.16b, {v23.16b}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b		// vpshufb	%xmm2,	%xmm0,	%xmm0
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
////////////////////////////////////////////////////////
//                                                    //
//                  AES key schedule                  //
//                                                    //
////////////////////////////////////////////////////////
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, #:lo12:.Lk_inv
	movi	v16.16b, #0x5b			// .Lk_s63
	adrp	x11, .Lk_sb1
	add	x11, x11, #:lo12:.Lk_sb1
	movi	v17.16b, #0x0f			// .Lk_s0F
	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]	// .Lk_inv, .Lk_ipt
	adrp	x10, .Lk_dksd
	add	x10, x10, #:lo12:.Lk_dksd
	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
	adrp	x11, .Lk_mc_forward
	add	x11, x11, #:lo12:.Lk_mc_forward
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat		// load the tables

	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7

	adrp	x10, .Lk_sr
	add	x10, x10, #:lo12:.Lk_sr
	add	x8, x8, x10
	cbnz	w3, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
	eor	x8, x8, #0x30			// xor	$0x30,	%r8

.Lschedule_go:
	cmp	w1, #192			// cmp	$192,	%esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

//
// .schedule_128
//
// 128-bit specific part of key schedule.
//
// This schedule is really simple, because all its parts
// are accomplished by the subroutines.
//
.Lschedule_128:
	mov	x0, #10				// mov	$10, %esi

.Loop_schedule_128:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// write output
	b	.Loop_schedule_128

//
// .aes_schedule_192
//
// 192-bit specific part of key schedule.
//
// The main body of this schedule is the same as the 128-bit
// schedule, but with more smearing. The long, high side is
// stored in %xmm7 as before, and the short, low side is in
// the high bits of %xmm6.
//
// This schedule is somewhat nastier, however, because each
// round produces 192 bits of key material, or 1.5 round keys.
// Therefore, on each cycle we do 2 rounds and produce 3 round
// keys.
//
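//
// Bookkeeping (for reference): AES-192 needs 13 round keys. Key 0 is
// written before .Lschedule_go; the first three passes of
// .Loop_schedule_192 write three keys each via _vpaes_schedule_mangle,
// and the final pass writes two before branching to
// .Lschedule_mangle_last for the thirteenth.
//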
.align	4
.Lschedule_192:
	sub	x0, x0, #8
	ld1	{v0.16b}, [x0]			// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6	# clobber low side with zeros
	mov	x0, #4				// mov	$4,	%esi

.Loop_schedule_192:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

//
// .aes_schedule_256
//
// 256-bit specific part of key schedule.
//
// The structure here is very similar to the 128-bit
// schedule, but with an additional "low side" in
// %xmm6. The low side's rounds are the same as the
// high side's, except no rcon and no rotation.
//
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [x0]			// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	x0, #7				// mov	$7, %esi

.Loop_schedule_256:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7

	b	.Loop_schedule_256

//
// .aes_schedule_mangle_last
//
// Mangler for last round of key schedule
// Mangles %xmm0
// when encrypting, outputs out(%xmm0) ^ 63
// when decrypting, outputs unskew(%xmm0)
//
// Always called right before return... jumps to cleanup and exits
//
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adrp	x11, .Lk_deskew
	add	x11, x11, #:lo12:.Lk_deskew
	cbnz	w3, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
	adrp	x11, .Lk_opt
	add	x11, x11, #:lo12:.Lk_opt
	add	x2, x2, #32			// add	$32,	%rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0	# output permute

.Lschedule_mangle_last_dec:
	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
	sub	x2, x2, #16			// add	$-16,	%rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
	ldp	x29, x30, [sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

//
// .aes_schedule_192_smear
//
// Smear the short, low side in the 192-bit key schedule.
//
// Inputs:
// %xmm7: high side, b a x y
// %xmm6: low side, d c 0 0
// %xmm13: 0
//
// Outputs:
// %xmm6: b+c+d b+c 0 0
// %xmm0: b+c+d b+c b a
//
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]		// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]		// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b		// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v6.16b, v6.16b, v0.16b		// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b			// vmovdqa	%xmm6,	%xmm0
	ins	v6.d[0], v1.d[0]		// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

//
// .aes_schedule_round
//
// Runs one main round of the key schedule on %xmm0, %xmm7
//
// Specifically, runs subbytes on the high dword of %xmm0
// then rotates it by one byte and xors into the low dword of
// %xmm7.
//
// Adds rcon from low byte of %xmm8, then rotates %xmm8 for
// next rcon.
//
// Smears the dwords of %xmm7 by xoring the low into the
// second low, result into third, result into highest.
//
// Returns results in %xmm7 = %xmm0.
// Clobbers %xmm1-%xmm4, %r11.
//
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3	# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2	# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3	# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2	# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4	# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1	# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1	# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

//
// .aes_schedule_transform
//
// Linear-transform %xmm0 according to tables at (%r11)
//
// Requires that %xmm9 = 0x0F0F... as in preheat
// Output in %xmm0
// Clobbers %xmm1, %xmm2
//
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
						// vmovdqa	(%r11),	%xmm2 	# lo
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
						// vmovdqa	16(%r11),	%xmm1 # hi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

//
// .aes_schedule_mangle
//
// Mangle xmm0 from (basis-transformed) standard version
// to our version.
//
// On encrypt,
// xor with 0x63
// multiply by circulant 0,1,1,1
// apply shiftrows transform
//
// On decrypt,
// xor with 0x63
// multiply by "inverse mixcolumns" circulant E,B,D,9
// deskew
// apply shiftrows transform
//
//
// Writes out to (%rdx), and increments or decrements it
// Keeps track of round number mod 4 in %r8
// Preserves xmm0
// Clobbers xmm1-xmm5
//
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	w3, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
	add	x2, x2, #16			// add	$16,	%rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns
						// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo

						// vmovdqa	0x00(%r11),	%xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
						// vmovdqa	0x10(%r11),	%xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x20(%r11),	%xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x30(%r11),	%xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x40(%r11),	%xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x50(%r11),	%xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3

						// vmovdqa	0x60(%r11),	%xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
						// vmovdqa	0x70(%r11),	%xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3

	sub	x2, x2, #16			// add	$-16,	%rdx

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	add	x8, x8, #64-16			// add	$-16,	%r8
	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

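//
// The two public key-setup routines below follow the AES_set_*_key
// convention; their C prototypes are expected to be of the form
// (assumption, for reference only)
//
//   int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
//                             AES_KEY *key);
//   int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
//                             AES_KEY *key);
//
// with x0 = user key, w1 = key length in bits, x2 = AES_KEY to fill.
//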
.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	mov	w3, #0			// mov	$0,%ecx
	mov	x8, #0x30		// mov	$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl	$4,%eax
	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
	add	x2, x2, x9

	mov	w3, #1			// mov	$1,%ecx
	lsr	w8, w1, #1		// shr	$1,%r8d
	and	x8, x8, #32		// and	$32,%r8d
	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	cbz	x2, .Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, x2		// reassign
	mov	x2,  x3		// reassign

	ld1	{v0.16b}, [x4]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [x0],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [x4]	// write ivec

	ldp	x29,x30,[sp],#16
.Lcbc_abort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

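//
// vpaes_cbc_decrypt below is reached from vpaes_cbc_encrypt when the
// direction argument (w5) is zero. The shared C prototype is expected to
// match AES_cbc_encrypt (assumption, for reference only):
//
//   void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
//                          size_t length, const AES_KEY *key,
//                          unsigned char *ivec, const int enc);
//
// The decrypt path processes two blocks per iteration via
// _vpaes_decrypt_2x, with a single-block prologue when the length is not
// a multiple of 32 bytes.
//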
.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
	// only from vpaes_cbc_encrypt which has already signed the return address.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2		// reassign
	mov	x2,  x3		// reassign
	ld1	{v6.16b}, [x4]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x

	ld1	{v7.16b}, [x0], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done

.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [x4]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
vpaes_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2
	mov	x2,  x3
	bl	_vpaes_encrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_enc_loop

	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16
	subs	x17, x17, #16
	b.ls	.Lecb_enc_done

.align	4
.Lecb_enc_loop:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt

.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
vpaes_ecb_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2
	mov	x2,  x3
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_dec_loop

	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_decrypt_core	// single leading block must use the decrypt core
	st1	{v0.16b}, [x1],#16
	subs	x17, x17, #16
	b.ls	.Lecb_dec_done

.align	4
.Lecb_dec_loop:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt