/* Do not modify. This file is auto-generated from sm3-armv8.pl. */
// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// This module implements support for Armv8 SM3 instructions

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
//
// NOTE(review): this file is generated.  The block-counter changes below
// (64-bit counter in x2, early exit for a zero count) must be mirrored in
// sm3-armv8.pl or they will be lost on regeneration.
#include "arm_arch.h"
.text
.globl	ossl_hwsm3_block_data_order
.type	ossl_hwsm3_block_data_order,%function
.align	5
//
// ossl_hwsm3_block_data_order - SM3 compression of consecutive 64-byte blocks
// using the Armv8 SM3 instructions (emitted as .inst words).
//
// NOTE(review): presumably declared by the C callers as
//     void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
// the prototype is not visible in this file - confirm that num is 64-bit.
//
// In:    x0 = pointer to the eight 32-bit chaining words (loaded, updated and
//             stored back)
//        x1 = pointer to the input; advanced by 64 bytes per block
//        x2 = number of 64-byte blocks; 0 returns without touching memory
// Roles: v5,v6    working state (word order reversed relative to memory)
//        v18,v19  copy of the state at block entry, XORed back in at the end
//        v0-v4    rotating window of message-schedule words W[j]
//        v16,v17  round constants loaded from .Tj
//        v20,v21  current round constant, rotated left by one bit per round
//        v22,v23  temporaries
//        x8       address of .Tj
// All of the registers written are caller-saved under AAPCS64; no stack use.
//
ossl_hwsm3_block_data_order:
	AARCH64_VALID_CALL_TARGET
	// An empty request must not enter the loop: the loop body runs
	// before the counter is tested.
	cbz	x2, .Lsm3_done

	// load state; rev64 + ext #8 together reverse the order of the four
	// 32-bit words inside each vector
	ld1	{v5.4s,v6.4s}, [x0]
	rev64	v5.4s, v5.4s
	rev64	v6.4s, v6.4s
	ext	v5.16b, v5.16b, v5.16b, #8
	ext	v6.16b, v6.16b, v6.16b, #8
	adrp	x8, .Tj
	add	x8, x8, #:lo12:.Tj
	// s16 = constant for rounds 0-15, s17 = constant for rounds 16-63
	ldp	s16, s17, [x8]

.Loop:
	// load input
	ld1	{v0.4s,v1.4s,v2.4s,v3.4s}, [x1], #64
	// The counter is kept in the full 64-bit register so that counts
	// which do not fit in 32 bits are not truncated.
	sub	x2, x2, #1

	// keep the incoming state for the final feed-forward XOR
	mov	v18.16b, v5.16b
	mov	v19.16b, v6.16b

#ifndef __AARCH64EB__
	// little-endian build: byte-swap every 32-bit message word
	rev32	v0.16b, v0.16b
	rev32	v1.16b, v1.16b
	rev32	v2.16b, v2.16b
	rev32	v3.16b, v3.16b
#endif

	// move the rounds 0-15 constant from lane 0 of v16 to lane 3 of v20
	ext	v20.16b, v16.16b, v16.16b, #4

	// ---- rounds 0-3; message expansion: v4 = W[16..19] ----
	// s4 = w7 | w8 | w9 | w10
	ext	v4.16b, v1.16b, v2.16b, #12
	// vtmp1 = w3 | w4 | w5 | w6
	ext	v22.16b, v0.16b, v1.16b, #12
	// vtmp2 = w10 | w11 | w12 | w13
	ext	v23.16b, v2.16b, v3.16b, #8
	// (the later expansions follow the same pattern, shifted by four
	// words each time)
.inst	0xce63c004	//sm3partw1 v4.4s, v0.4s, v3.4s
.inst	0xce76c6e4	//sm3partw2 v4.4s, v23.4s, v22.4s
	// v22 = W[j] ^ W[j+4] for the four rounds of this group
	eor	v22.16b, v0.16b, v1.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	// shl #1 followed by sri #31 = rotate each lane left by one bit
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst	0xce408ae6	//sm3tt2a v6.4s, v23.4s, v0.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst	0xce409ae6	//sm3tt2a v6.4s, v23.4s, v0.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst	0xce40aae6	//sm3tt2a v6.4s, v23.4s, v0.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst	0xce40bae6	//sm3tt2a v6.4s, v23.4s, v0.4s[3]

	// ---- rounds 4-7; message expansion: v0 = W[20..23] ----
	ext	v0.16b, v2.16b, v3.16b, #12
	ext	v22.16b, v1.16b, v2.16b, #12
	ext	v23.16b, v3.16b, v4.16b, #8
.inst	0xce64c020	//sm3partw1 v0.4s, v1.4s, v4.4s
.inst	0xce76c6e0	//sm3partw2 v0.4s, v23.4s, v22.4s
	eor	v22.16b, v1.16b, v2.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst	0xce418ae6	//sm3tt2a v6.4s, v23.4s, v1.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst	0xce419ae6	//sm3tt2a v6.4s, v23.4s, v1.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst	0xce41aae6	//sm3tt2a v6.4s, v23.4s, v1.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst	0xce41bae6	//sm3tt2a v6.4s, v23.4s, v1.4s[3]

	// ---- rounds 8-11; message expansion: v1 = W[24..27] ----
	ext	v1.16b, v3.16b, v4.16b, #12
	ext	v22.16b, v2.16b, v3.16b, #12
	ext	v23.16b, v4.16b, v0.16b, #8
.inst	0xce60c041	//sm3partw1 v1.4s, v2.4s, v0.4s
.inst	0xce76c6e1	//sm3partw2 v1.4s, v23.4s, v22.4s
	eor	v22.16b, v2.16b, v3.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst	0xce428ae6	//sm3tt2a v6.4s, v23.4s, v2.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst	0xce429ae6	//sm3tt2a v6.4s, v23.4s, v2.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst	0xce42aae6	//sm3tt2a v6.4s, v23.4s, v2.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst	0xce42bae6	//sm3tt2a v6.4s, v23.4s, v2.4s[3]

	// ---- rounds 12-15; message expansion: v2 = W[28..31] ----
	ext	v2.16b, v4.16b, v0.16b, #12
	ext	v22.16b, v3.16b, v4.16b, #12
	ext	v23.16b, v0.16b, v1.16b, #8
.inst	0xce61c062	//sm3partw1 v2.4s, v3.4s, v1.4s
.inst	0xce76c6e2	//sm3partw2 v2.4s, v23.4s, v22.4s
	eor	v22.16b, v3.16b, v4.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst	0xce438ae6	//sm3tt2a v6.4s, v23.4s, v3.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst	0xce439ae6	//sm3tt2a v6.4s, v23.4s, v3.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst	0xce43aae6	//sm3tt2a v6.4s, v23.4s, v3.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst	0xce43bae6	//sm3tt2a v6.4s, v23.4s, v3.4s[3]

	// From round 16 on the second constant is used (lane 0 of v17 moved
	// to lane 3 of v20) together with the "b" forms of sm3tt1/sm3tt2.
	ext	v20.16b, v17.16b, v17.16b, #4

	// ---- rounds 16-19; message expansion: v3 = W[32..35] ----
	ext	v3.16b, v0.16b, v1.16b, #12
	ext	v22.16b, v4.16b, v0.16b, #12
	ext	v23.16b, v1.16b, v2.16b, #8
.inst	0xce62c083	//sm3partw1 v3.4s, v4.4s, v2.4s
.inst	0xce76c6e3	//sm3partw2 v3.4s, v23.4s, v22.4s
	eor	v22.16b, v4.16b, v0.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce448ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce449ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce44aee6	//sm3tt2b v6.4s, v23.4s, v4.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce44bee6	//sm3tt2b v6.4s, v23.4s, v4.4s[3]

	// ---- rounds 20-23; message expansion: v4 = W[36..39] ----
	ext	v4.16b, v1.16b, v2.16b, #12
	ext	v22.16b, v0.16b, v1.16b, #12
	ext	v23.16b, v2.16b, v3.16b, #8
.inst	0xce63c004	//sm3partw1 v4.4s, v0.4s, v3.4s
.inst	0xce76c6e4	//sm3partw2 v4.4s, v23.4s, v22.4s
	eor	v22.16b, v0.16b, v1.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce408ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce409ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce40aee6	//sm3tt2b v6.4s, v23.4s, v0.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce40bee6	//sm3tt2b v6.4s, v23.4s, v0.4s[3]

	// ---- rounds 24-27; message expansion: v0 = W[40..43] ----
	ext	v0.16b, v2.16b, v3.16b, #12
	ext	v22.16b, v1.16b, v2.16b, #12
	ext	v23.16b, v3.16b, v4.16b, #8
.inst	0xce64c020	//sm3partw1 v0.4s, v1.4s, v4.4s
.inst	0xce76c6e0	//sm3partw2 v0.4s, v23.4s, v22.4s
	eor	v22.16b, v1.16b, v2.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce418ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce419ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce41aee6	//sm3tt2b v6.4s, v23.4s, v1.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce41bee6	//sm3tt2b v6.4s, v23.4s, v1.4s[3]

	// ---- rounds 28-31; message expansion: v1 = W[44..47] ----
	ext	v1.16b, v3.16b, v4.16b, #12
	ext	v22.16b, v2.16b, v3.16b, #12
	ext	v23.16b, v4.16b, v0.16b, #8
.inst	0xce60c041	//sm3partw1 v1.4s, v2.4s, v0.4s
.inst	0xce76c6e1	//sm3partw2 v1.4s, v23.4s, v22.4s
	eor	v22.16b, v2.16b, v3.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce428ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce429ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce42aee6	//sm3tt2b v6.4s, v23.4s, v2.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce42bee6	//sm3tt2b v6.4s, v23.4s, v2.4s[3]

	// ---- rounds 32-35; message expansion: v2 = W[48..51] ----
	ext	v2.16b, v4.16b, v0.16b, #12
	ext	v22.16b, v3.16b, v4.16b, #12
	ext	v23.16b, v0.16b, v1.16b, #8
.inst	0xce61c062	//sm3partw1 v2.4s, v3.4s, v1.4s
.inst	0xce76c6e2	//sm3partw2 v2.4s, v23.4s, v22.4s
	eor	v22.16b, v3.16b, v4.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce438ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce439ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce43aee6	//sm3tt2b v6.4s, v23.4s, v3.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce43bee6	//sm3tt2b v6.4s, v23.4s, v3.4s[3]

	// ---- rounds 36-39; message expansion: v3 = W[52..55] ----
	ext	v3.16b, v0.16b, v1.16b, #12
	ext	v22.16b, v4.16b, v0.16b, #12
	ext	v23.16b, v1.16b, v2.16b, #8
.inst	0xce62c083	//sm3partw1 v3.4s, v4.4s, v2.4s
.inst	0xce76c6e3	//sm3partw2 v3.4s, v23.4s, v22.4s
	eor	v22.16b, v4.16b, v0.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce448ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce449ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce44aee6	//sm3tt2b v6.4s, v23.4s, v4.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce44bee6	//sm3tt2b v6.4s, v23.4s, v4.4s[3]

	// ---- rounds 40-43; message expansion: v4 = W[56..59] ----
	ext	v4.16b, v1.16b, v2.16b, #12
	ext	v22.16b, v0.16b, v1.16b, #12
	ext	v23.16b, v2.16b, v3.16b, #8
.inst	0xce63c004	//sm3partw1 v4.4s, v0.4s, v3.4s
.inst	0xce76c6e4	//sm3partw2 v4.4s, v23.4s, v22.4s
	eor	v22.16b, v0.16b, v1.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce408ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce409ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce40aee6	//sm3tt2b v6.4s, v23.4s, v0.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce40bee6	//sm3tt2b v6.4s, v23.4s, v0.4s[3]

	// ---- rounds 44-47; message expansion: v0 = W[60..63] ----
	ext	v0.16b, v2.16b, v3.16b, #12
	ext	v22.16b, v1.16b, v2.16b, #12
	ext	v23.16b, v3.16b, v4.16b, #8
.inst	0xce64c020	//sm3partw1 v0.4s, v1.4s, v4.4s
.inst	0xce76c6e0	//sm3partw2 v0.4s, v23.4s, v22.4s
	eor	v22.16b, v1.16b, v2.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce418ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce419ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce41aee6	//sm3tt2b v6.4s, v23.4s, v1.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce41bee6	//sm3tt2b v6.4s, v23.4s, v1.4s[3]

	// ---- rounds 48-51; message expansion: v1 = W[64..67] (last one) ----
	ext	v1.16b, v3.16b, v4.16b, #12
	ext	v22.16b, v2.16b, v3.16b, #12
	ext	v23.16b, v4.16b, v0.16b, #8
.inst	0xce60c041	//sm3partw1 v1.4s, v2.4s, v0.4s
.inst	0xce76c6e1	//sm3partw2 v1.4s, v23.4s, v22.4s
	eor	v22.16b, v2.16b, v3.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce428ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce429ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce42aee6	//sm3tt2b v6.4s, v23.4s, v2.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce42bee6	//sm3tt2b v6.4s, v23.4s, v2.4s[3]

	// ---- rounds 52-55; no further message expansion needed ----
	eor	v22.16b, v3.16b, v4.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce438ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce439ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce43aee6	//sm3tt2b v6.4s, v23.4s, v3.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce43bee6	//sm3tt2b v6.4s, v23.4s, v3.4s[3]

	// ---- rounds 56-59 ----
	eor	v22.16b, v4.16b, v0.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce448ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce449ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce44aee6	//sm3tt2b v6.4s, v23.4s, v4.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce44bee6	//sm3tt2b v6.4s, v23.4s, v4.4s[3]

	// ---- rounds 60-63 ----
	eor	v22.16b, v0.16b, v1.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce408ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce409ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce40aee6	//sm3tt2b v6.4s, v23.4s, v0.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce40bee6	//sm3tt2b v6.4s, v23.4s, v0.4s[3]

	// feed-forward: XOR the state saved at block entry into the result
	eor	v5.16b, v5.16b, v18.16b
	eor	v6.16b, v6.16b, v19.16b

	// any remaining blocks?
	cbnz	x2, .Loop

	// save state (undo the word reversal applied when it was loaded)
	rev64	v5.4s, v5.4s
	rev64	v6.4s, v6.4s
	ext	v5.16b, v5.16b, v5.16b, #8
	ext	v6.16b, v6.16b, v6.16b, #8
	st1	{v5.4s,v6.4s}, [x0]
.Lsm3_done:
	ret
.size	ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
.section	.rodata

.type	_sm3_consts,%object
.align	3
_sm3_consts:
.Tj:
	// first word: constant used for rounds 0-15;
	// second word: constant used from round 16 on
	// (0x7a879d8a rotated left by 16 bits)
.word	0x79cc4519, 0x9d8a7a87
.size	_sm3_consts,.-_sm3_consts
.previous