/* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
#include "arm_arch.h"

.text

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
.quad	0x080B0A0904070605, 0x000302010C0F0E0D
.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	// mc_backward
.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
.quad	0x020100030E0D0C0F, 0x0A09080B06050407
.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:	// sr
.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad	0x030E09040F0A0500, 0x0B06010C07020D08
.quad	0x0F060D040B020900, 0x070E050C030A0108
.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv:	// inv, inva
.quad	0x0E05060F0D080180, 0x040703090A0B0C02
.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Decryption stuff
//
.Lk_dipt:	// decryption input transform
.quad	0x0F505B040B545F00, 0x154A411E114E451A
.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
.quad	0xD022649296B44200, 0x602646F6B0F2D404
.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
// Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	// rcon
.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

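// The .byte string below is the NUL-terminated identification string
// "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)".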
.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2
.size	_vpaes_consts,.-_vpaes_consts
.align	6
//
// _aes_preheat
//
// Fills register %r10 -> .aes_consts (so you can -fPIC)
// and %xmm9-%xmm15 as specified below.
//
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d,v19.2d}, [x10],#32		// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]	// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

//
// _aes_encrypt_core
//
// AES-encrypt %xmm0.
//
// Inputs:
// %xmm0 = input
// %xmm9-%xmm15 as in _vpaes_preheat
// (%rdx) = scheduled keys
//
// Output in %xmm0
// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
// Preserves %xmm6 - %xmm8 so you get some local vectors
//
//
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Lenc_entry

.align 4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and	$0x30,	%r11		# ... mod 4
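	// (note: _vpaes_consts is 128-byte aligned, so clearing bit 6 of x11
	// wraps the pointer back to the first row of .Lk_mc_forward every
	// four rounds, i.e. the "mod 4" above)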
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

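//
// vpaes_encrypt: single-block wrapper.
// x0 = input block, x1 = output block, x2 = expanded key schedule;
// the block travels through the core in v7 and comes back in v0.
//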
.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt

.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	tbl	v9.16b, {v20.16b}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Lenc_2x_entry

.align 4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	tbl	v8.16b, {v24.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)		// and	$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1			// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	tbl	v8.16b, {v23.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b		// vpshufb	%xmm1,	%xmm0,	%xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	adr	x11, .Lk_dipt
	ld1	{v18.2d,v19.2d}, [x10],#32		// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]	// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

//
// Decryption core
//
// Same API as encryption core.
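// (On AArch64 the cores likewise take the block in v7 and the key
// schedule pointer in x2, and return the result in v0; the lookup
// tables sit in v17 and up as loaded by the preheat routines, while
// round keys stream through v16.)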
//
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor	$0x30,	%r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and	$0x30,	%r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Ldec_entry

.align 4
.Ldec_loop:
//
// Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	sub	w8, w8, #1			// sub	$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor	$0x30,	%r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and	$0x30,	%r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	tbl	v10.16b, {v20.16b},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	tbl	v8.16b, {v21.16b},v8.16b
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Ldec_2x_entry

.align 4
.Ldec_2x_loop:
//
// Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v12.16b, {v24.16b}, v10.16b
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	tbl	v9.16b, {v25.16b}, v11.16b
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
	eor	v8.16b, v12.16b, v16.16b
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b, v8.16b, v9.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v12.16b, {v26.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	tbl	v9.16b, {v27.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v12.16b, {v28.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	tbl	v9.16b, {v29.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v12.16b, {v30.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	tbl	v9.16b, {v31.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1			// sub	$1,%rax			# nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	tbl	v10.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	tbl	v9.16b, {v23.16b}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b		// vpshufb	%xmm2,	%xmm0,	%xmm0
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
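// (The CBC and ECB drivers below feed the bulk of the data through the
// 2x cores: two blocks go in via v14/v15 and come back in v0/v1, which
// is why those wrappers save d8-d15 in their prologues.)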
////////////////////////////////////////////////////////
//                                                      //
//                  AES key schedule                    //
//                                                      //
////////////////////////////////////////////////////////
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adr	x10, .Lk_inv
	movi	v16.16b, #0x5b			// .Lk_s63
	adr	x11, .Lk_sb1
	movi	v17.16b, #0x0f			// .Lk_s0F
	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
	adr	x10, .Lk_dksd
	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
	adr	x11, .Lk_mc_forward
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat		// load the tables

	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7

	adr	x10, .Lk_sr			// lea	.Lk_sr(%rip),%r10
	add	x8, x8, x10
	cbnz	w3, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
	eor	x8, x8, #0x30			// xor	$0x30, %r8

.Lschedule_go:
	cmp	w1, #192			// cmp	$192,	%esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

//
// .schedule_128
//
// 128-bit specific part of key schedule.
//
// This schedule is really simple, because all its parts
// are accomplished by the subroutines.
//
.Lschedule_128:
	mov	x0, #10			// mov	$10, %esi

.Loop_schedule_128:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// write output
	b	.Loop_schedule_128

//
// .aes_schedule_192
//
// 192-bit specific part of key schedule.
//
// The main body of this schedule is the same as the 128-bit
// schedule, but with more smearing. The long, high side is
// stored in %xmm7 as before, and the short, low side is in
// the high bits of %xmm6.
//
// This schedule is somewhat nastier, however, because each
// round produces 192 bits of key material, or 1.5 round keys.
// Therefore, on each cycle we do 2 rounds and produce 3 round
// keys.
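// (Four passes of .Loop_schedule_192, counting the final store done by
// .Lschedule_mangle_last, write 12 round keys; together with the
// round-0 key stored before .Lschedule_go that gives the 13 round keys
// AES-192 needs.)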
//
.align	4
.Lschedule_192:
	sub	x0, x0, #8
	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b		// vmovdqa	%xmm0,	%xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4	# clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6	# clobber low side with zeros
	mov	x0, #4			// mov	$4,	%esi

.Loop_schedule_192:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

//
// .aes_schedule_256
//
// 256-bit specific part of key schedule.
//
// The structure here is very similar to the 128-bit
// schedule, but with an additional "low side" in
// %xmm6. The low side's rounds are the same as the
// high side's, except no rcon and no rotation.
//
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	x0, #7			// mov	$7, %esi

.Loop_schedule_256:
	sub	x0, x0, #1			// dec	%esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]		// vpshufd	$0xFF,	%xmm0,	%xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7

	b	.Loop_schedule_256

//
// .aes_schedule_mangle_last
//
// Mangler for last round of key schedule
// Mangles %xmm0
// when encrypting, outputs out(%xmm0) ^ 63
// when decrypting, outputs unskew(%xmm0)
//
// Always called right before return... jumps to cleanup and exits
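// (On AArch64 the output/deskew tables are reloaded into v20/v21 from
// .Lk_opt or .Lk_deskew before the final _vpaes_schedule_transform.)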
//
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adr	x11, .Lk_deskew			// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	cbnz	w3, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
	adr	x11, .Lk_opt			// lea	.Lk_opt(%rip),	%r11	# prepare to output transform
	add	x2, x2, #32			// add	$32,	%rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0	# output permute

.Lschedule_mangle_last_dec:
	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
	sub	x2, x2, #16			// add	$-16,	%rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
	ldp	x29, x30, [sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

//
// .aes_schedule_192_smear
//
// Smear the short, low side in the 192-bit key schedule.
//
// Inputs:
// %xmm7: high side, b a x y
// %xmm6: low side, d c 0 0
// %xmm13: 0
//
// Outputs:
// %xmm6: b+c+d b+c 0 0
// %xmm0: b+c+d b+c b a
//
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

//
// .aes_schedule_round
//
// Runs one main round of the key schedule on %xmm0, %xmm7
//
// Specifically, runs subbytes on the high dword of %xmm0
// then rotates it by one byte and xors into the low dword of
// %xmm7.
//
// Adds rcon from low byte of %xmm8, then rotates %xmm8 for
// next rcon.
//
// Smears the dwords of %xmm7 by xoring the low into the
// second low, result into third, result into highest.
//
// Returns results in %xmm7 = %xmm0.
// Clobbers %xmm1-%xmm4, %r11.
//
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0

	// fall through...
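	// (_vpaes_schedule_low_round below is also entered directly via bl
	// from the 256-bit schedule's low rounds)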

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3		# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

//
// .aes_schedule_transform
//
// Linear-transform %xmm0 according to tables at (%r11)
//
// Requires that %xmm9 = 0x0F0F... as in preheat
// Output in %xmm0
// Clobbers %xmm1, %xmm2
//
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
						// vmovdqa	(%r11),	%xmm2	# lo
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
						// vmovdqa	16(%r11),	%xmm1	# hi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

//
// .aes_schedule_mangle
//
// Mangle xmm0 from (basis-transformed) standard version
// to our version.
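// (On AArch64, .Lk_s63 is the splatted 0x5b constant in v16 and the
// .Lk_mc_forward[0] permutation used below lives in v9; both are set up
// by _vpaes_key_preheat.)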
//
// On encrypt,
// xor with 0x63
// multiply by circulant 0,1,1,1
// apply shiftrows transform
//
// On decrypt,
// xor with 0x63
// multiply by "inverse mixcolumns" circulant E,B,D,9
// deskew
// apply shiftrows transform
//
//
// Writes out to (%rdx), and increments or decrements it
// Keeps track of round number mod 4 in %r8
// Preserves xmm0
// Clobbers xmm1-xmm5
//
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	w3, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
	add	x2, x2, #16			// add	$16,	%rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns
						// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo

						// vmovdqa	0x00(%r11),	%xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
						// vmovdqa	0x10(%r11),	%xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x20(%r11),	%xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x30(%r11),	%xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x40(%r11),	%xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x50(%r11),	%xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3

						// vmovdqa	0x60(%r11),	%xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
						// vmovdqa	0x70(%r11),	%xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3

	sub	x2, x2, #16			// add	$-16,	%rdx

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	add	x8, x8, #64-16			// add	$-16,	%r8
	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
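	// (w9 below becomes nbits/32 + 5, i.e. 9/11/13 for 128/192/256-bit
	// keys; the encrypt/decrypt cores run that many middle rounds plus
	// one final round)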

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	mov	w3, #0		// mov	$0,%ecx
	mov	x8, #0x30		// mov	$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl	$4,%eax
	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
	add	x2, x2, x9

	mov	w3, #1		// mov	$1,%ecx
	lsr	w8, w1, #1		// shr	$1,%r8d
	and	x8, x8, #32		// and	$32,%r8d
	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	cbz	x2, .Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, x2		// reassign
	mov	x2, x3		// reassign

	ld1	{v0.16b}, [x4]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [x0],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [x4]	// write ivec

	ldp	x29,x30,[sp],#16
.Lcbc_abort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
	// only from vpaes_cbc_encrypt which has already signed the return address.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!
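	// (the 2x decrypt path clobbers v8-v15, whose low halves d8-d15 are
	// callee-saved under AAPCS64, hence the saves above)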

	mov	x17, x2		// reassign
	mov	x2, x3		// reassign
	ld1	{v6.16b}, [x4]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x

	ld1	{v7.16b}, [x0], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done

.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [x4]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
vpaes_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2
	mov	x2, x3
	bl	_vpaes_encrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_enc_loop

	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16
	subs	x17, x17, #16
	b.ls	.Lecb_enc_done

.align	4
.Lecb_enc_loop:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt

.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
vpaes_ecb_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2
	mov	x2, x3
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_dec_loop

	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1],#16
	subs	x17, x17, #16
	b.ls	.Lecb_dec_done

.align	4
.Lecb_dec_loop:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt