xref: /freebsd/sys/crypto/openssl/aarch64/sm3-armv8.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from sm3-armv8.pl. */
2// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
3//
4// Licensed under the Apache License 2.0 (the "License").  You may not use
5// this file except in compliance with the License.  You can obtain a copy
6// in the file LICENSE in the source distribution or at
7// https://www.openssl.org/source/license.html
8//
9// This module implements support for Armv8 SM3 instructions
10
11// $output is the last argument if it looks like a file (it has an extension)
12// $flavour is the first argument if it doesn't look like a file
13#include "arm_arch.h"
14.text
// ----------------------------------------------------------------------
// ossl_hwsm3_block_data_order
//
// Runs the SM3 compression function over consecutive 64-byte input blocks
// using the Armv8 SM3 instructions, updating a 32-byte state in place.
// The SM3 instructions are emitted as raw .inst words; the mnemonic each
// word stands for is given in the comment that follows it on the same line.
//
// In:
//   x0 = address of the state: 32 bytes (eight 32-bit words) are loaded
//        on entry and stored back on exit.
//   x1 = address of the input; post-incremented by 64 for every block.
//   w2 = number of 64-byte blocks to process.
//        NOTE(review): only the low 32 bits (w2) are consulted, and the
//        count is tested after a block has been processed, so the loop
//        body always runs at least once; a count of 0 on entry would wrap
//        to 2^32 iterations.  Presumably callers never pass 0 -- confirm.
// Out:
//   updated state stored at [x0]; no value is returned in a register.
// Modifies: x1, w2, x8, v0-v6, v16-v23.  Does not touch sp and makes no
//   calls.
//
// Register roles inside the loop:
//   v5, v6    working state (v5 = first 16 bytes of the state, v6 = last
//             16 bytes), with the four 32-bit lanes of each in reverse
//             order relative to memory.
//   v18, v19  copy of v5/v6 taken at the top of each block and XORed back
//             in once the block's rounds are done.
//   v0-v4     rolling window of message words.  v0-v3 are loaded from the
//             input; each sm3partw1/sm3partw2 pair then produces four
//             further words, cycling through v4, v0, v1, v2, v3, ...
//   v16, v17  the two constants from .Tj (in lane 0, other lanes zero).
//   v20, v21  current per-round constant in lane 3; the two registers
//             alternate, each being the other rotated left by one bit.
//   v22       scratch for sm3partw2, then the XOR of two adjacent message
//             vectors that is fed to sm3tt1a/sm3tt1b.
//   v23       scratch for sm3partw2, then the sm3ss1 result.
// ----------------------------------------------------------------------
15.globl	ossl_hwsm3_block_data_order
16.type	ossl_hwsm3_block_data_order,%function
	// .align 5 = 2^5 = 32-byte alignment of the entry point.
17.align	5
18ossl_hwsm3_block_data_order:
	// Macro supplied by arm_arch.h (presumably a BTI landing pad when
	// branch protection is enabled -- confirm in arm_arch.h).
19	AARCH64_VALID_CALL_TARGET
20	// load state
21	ld1	{v5.4s,v6.4s}, [x0]
	// rev64 swaps the two 32-bit words inside each 64-bit half and
	// ext #8 swaps the two halves: together they reverse the order of
	// the four 32-bit lanes.  The same sequence is applied again before
	// the final store, restoring memory order.
22	rev64	v5.4s, v5.4s
23	rev64	v6.4s, v6.4s
24	ext	v5.16b, v5.16b, v5.16b, #8
25	ext	v6.16b, v6.16b, v6.16b, #8
	// x8 = address of .Tj (PC-relative page + low 12 bits), then load
	// its two 32-bit words into s16 and s17.
26	adrp	x8, .Tj
27	add	x8, x8, #:lo12:.Tj
28	ldp	s16, s17, [x8]
29
30.Loop:
31	// load input
32	ld1	{v0.4s,v1.4s,v2.4s,v3.4s}, [x1], #64
	// Count this block now; the result is tested by cbnz at the bottom.
33	sub	w2, w2, #1
34
	// Keep the incoming state for the XOR at the end of the block.
35	mov	v18.16b, v5.16b
36	mov	v19.16b, v6.16b
37
	// Little-endian builds only: byte-swap every 32-bit word, i.e. the
	// message words are taken as big-endian values.
38#ifndef __AARCH64EB__
39	rev32	v0.16b, v0.16b
40	rev32	v1.16b, v1.16b
41	rev32	v2.16b, v2.16b
42	rev32	v3.16b, v3.16b
43#endif
44
	// ---- Phase A: 16 rounds using sm3tt1a/sm3tt2a, constant from v16 ----
	// Rotate v16 by 4 bytes so the constant moves from lane 0 to lane 3.
45	ext	v20.16b, v16.16b, v16.16b, #4
	// Each step below has the same shape:
	//   1. three ext + sm3partw1 + sm3partw2 : derive four new message
	//      words (absent from the last three steps of the block);
	//   2. eor v22 = current message vector XOR the next one;
	//   3. four rounds, one per lane index [0]..[3]:
	//        sm3ss1, rotate the constant left by 1 (shl #1 + sri #31),
	//        sm3tt1x on v5, sm3tt2x on v6.
	// NOTE: the generator repeats the three "s4/vtmp1/vtmp2" comments
	// verbatim in every step; their w-indices are exact for this first
	// step only and advance by four words in each later step.
	// Step 1: rounds 0-3 consume v0 (w0-w3); new words w16-w19 -> v4.
46	// s4 = w7  | w8  | w9  | w10
47	ext	v4.16b, v1.16b, v2.16b, #12
48	// vtmp1 = w3  | w4  | w5  | w6
49	ext	v22.16b, v0.16b, v1.16b, #12
50	// vtmp2 = w10 | w11 | w12 | w13
51	ext	v23.16b, v2.16b, v3.16b, #8
52.inst	0xce63c004	//sm3partw1 v4.4s, v0.4s, v3.4s
53.inst	0xce76c6e4	//sm3partw2 v4.4s, v23.4s, v22.4s
54	eor	v22.16b, v0.16b, v1.16b
55.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	// v21 = v20 rotated left by 1 in every 32-bit lane.
56	shl	v21.4s, v20.4s, #1
57	sri	v21.4s, v20.4s, #31
58.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
59.inst	0xce408ae6	//sm3tt2a v6.4s, v23.4s, v0.4s[0]
60.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	// v20 = v21 rotated left by 1; v20 and v21 keep alternating like this.
61	shl	v20.4s, v21.4s, #1
62	sri	v20.4s, v21.4s, #31
63.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
64.inst	0xce409ae6	//sm3tt2a v6.4s, v23.4s, v0.4s[1]
65.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
66	shl	v21.4s, v20.4s, #1
67	sri	v21.4s, v20.4s, #31
68.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
69.inst	0xce40aae6	//sm3tt2a v6.4s, v23.4s, v0.4s[2]
70.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
71	shl	v20.4s, v21.4s, #1
72	sri	v20.4s, v21.4s, #31
73.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
74.inst	0xce40bae6	//sm3tt2a v6.4s, v23.4s, v0.4s[3]
	// Step 2: rounds 4-7 consume v1 (w4-w7); new words w20-w23 -> v0.
75	// s4 = w7  | w8  | w9  | w10
76	ext	v0.16b, v2.16b, v3.16b, #12
77	// vtmp1 = w3  | w4  | w5  | w6
78	ext	v22.16b, v1.16b, v2.16b, #12
79	// vtmp2 = w10 | w11 | w12 | w13
80	ext	v23.16b, v3.16b, v4.16b, #8
81.inst	0xce64c020	//sm3partw1 v0.4s, v1.4s, v4.4s
82.inst	0xce76c6e0	//sm3partw2 v0.4s, v23.4s, v22.4s
83	eor	v22.16b, v1.16b, v2.16b
84.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
85	shl	v21.4s, v20.4s, #1
86	sri	v21.4s, v20.4s, #31
87.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
88.inst	0xce418ae6	//sm3tt2a v6.4s, v23.4s, v1.4s[0]
89.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
90	shl	v20.4s, v21.4s, #1
91	sri	v20.4s, v21.4s, #31
92.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
93.inst	0xce419ae6	//sm3tt2a v6.4s, v23.4s, v1.4s[1]
94.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
95	shl	v21.4s, v20.4s, #1
96	sri	v21.4s, v20.4s, #31
97.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
98.inst	0xce41aae6	//sm3tt2a v6.4s, v23.4s, v1.4s[2]
99.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
100	shl	v20.4s, v21.4s, #1
101	sri	v20.4s, v21.4s, #31
102.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
103.inst	0xce41bae6	//sm3tt2a v6.4s, v23.4s, v1.4s[3]
	// Step 3: rounds 8-11 consume v2 (w8-w11); new words w24-w27 -> v1.
104	// s4 = w7  | w8  | w9  | w10
105	ext	v1.16b, v3.16b, v4.16b, #12
106	// vtmp1 = w3  | w4  | w5  | w6
107	ext	v22.16b, v2.16b, v3.16b, #12
108	// vtmp2 = w10 | w11 | w12 | w13
109	ext	v23.16b, v4.16b, v0.16b, #8
110.inst	0xce60c041	//sm3partw1 v1.4s, v2.4s, v0.4s
111.inst	0xce76c6e1	//sm3partw2 v1.4s, v23.4s, v22.4s
112	eor	v22.16b, v2.16b, v3.16b
113.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
114	shl	v21.4s, v20.4s, #1
115	sri	v21.4s, v20.4s, #31
116.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
117.inst	0xce428ae6	//sm3tt2a v6.4s, v23.4s, v2.4s[0]
118.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
119	shl	v20.4s, v21.4s, #1
120	sri	v20.4s, v21.4s, #31
121.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
122.inst	0xce429ae6	//sm3tt2a v6.4s, v23.4s, v2.4s[1]
123.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
124	shl	v21.4s, v20.4s, #1
125	sri	v21.4s, v20.4s, #31
126.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
127.inst	0xce42aae6	//sm3tt2a v6.4s, v23.4s, v2.4s[2]
128.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
129	shl	v20.4s, v21.4s, #1
130	sri	v20.4s, v21.4s, #31
131.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
132.inst	0xce42bae6	//sm3tt2a v6.4s, v23.4s, v2.4s[3]
	// Step 4: rounds 12-15 consume v3 (w12-w15); new words w28-w31 -> v2.
133	// s4 = w7  | w8  | w9  | w10
134	ext	v2.16b, v4.16b, v0.16b, #12
135	// vtmp1 = w3  | w4  | w5  | w6
136	ext	v22.16b, v3.16b, v4.16b, #12
137	// vtmp2 = w10 | w11 | w12 | w13
138	ext	v23.16b, v0.16b, v1.16b, #8
139.inst	0xce61c062	//sm3partw1 v2.4s, v3.4s, v1.4s
140.inst	0xce76c6e2	//sm3partw2 v2.4s, v23.4s, v22.4s
141	eor	v22.16b, v3.16b, v4.16b
142.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
143	shl	v21.4s, v20.4s, #1
144	sri	v21.4s, v20.4s, #31
145.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
146.inst	0xce438ae6	//sm3tt2a v6.4s, v23.4s, v3.4s[0]
147.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
148	shl	v20.4s, v21.4s, #1
149	sri	v20.4s, v21.4s, #31
150.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
151.inst	0xce439ae6	//sm3tt2a v6.4s, v23.4s, v3.4s[1]
152.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
153	shl	v21.4s, v20.4s, #1
154	sri	v21.4s, v20.4s, #31
155.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
156.inst	0xce43aae6	//sm3tt2a v6.4s, v23.4s, v3.4s[2]
157.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
158	shl	v20.4s, v21.4s, #1
159	sri	v20.4s, v21.4s, #31
160.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
161.inst	0xce43bae6	//sm3tt2a v6.4s, v23.4s, v3.4s[3]
	// ---- Phase B: 48 rounds using sm3tt1b/sm3tt2b, constant from v17 ----
	// Restart the rotating constant from the second .Tj word (lane 0 of
	// v17 moved to lane 3 of v20).
162	ext	v20.16b, v17.16b, v17.16b, #4
	// Step 5: rounds 16-19 consume v4 (w16-w19); new words w32-w35 -> v3.
163	// s4 = w7  | w8  | w9  | w10
164	ext	v3.16b, v0.16b, v1.16b, #12
165	// vtmp1 = w3  | w4  | w5  | w6
166	ext	v22.16b, v4.16b, v0.16b, #12
167	// vtmp2 = w10 | w11 | w12 | w13
168	ext	v23.16b, v1.16b, v2.16b, #8
169.inst	0xce62c083	//sm3partw1 v3.4s, v4.4s, v2.4s
170.inst	0xce76c6e3	//sm3partw2 v3.4s, v23.4s, v22.4s
171	eor	v22.16b, v4.16b, v0.16b
172.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
173	shl	v21.4s, v20.4s, #1
174	sri	v21.4s, v20.4s, #31
175.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
176.inst	0xce448ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[0]
177.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
178	shl	v20.4s, v21.4s, #1
179	sri	v20.4s, v21.4s, #31
180.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
181.inst	0xce449ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[1]
182.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
183	shl	v21.4s, v20.4s, #1
184	sri	v21.4s, v20.4s, #31
185.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
186.inst	0xce44aee6	//sm3tt2b v6.4s, v23.4s, v4.4s[2]
187.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
188	shl	v20.4s, v21.4s, #1
189	sri	v20.4s, v21.4s, #31
190.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
191.inst	0xce44bee6	//sm3tt2b v6.4s, v23.4s, v4.4s[3]
	// Step 6: rounds 20-23 consume v0 (w20-w23); new words w36-w39 -> v4.
192	// s4 = w7  | w8  | w9  | w10
193	ext	v4.16b, v1.16b, v2.16b, #12
194	// vtmp1 = w3  | w4  | w5  | w6
195	ext	v22.16b, v0.16b, v1.16b, #12
196	// vtmp2 = w10 | w11 | w12 | w13
197	ext	v23.16b, v2.16b, v3.16b, #8
198.inst	0xce63c004	//sm3partw1 v4.4s, v0.4s, v3.4s
199.inst	0xce76c6e4	//sm3partw2 v4.4s, v23.4s, v22.4s
200	eor	v22.16b, v0.16b, v1.16b
201.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
202	shl	v21.4s, v20.4s, #1
203	sri	v21.4s, v20.4s, #31
204.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
205.inst	0xce408ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[0]
206.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
207	shl	v20.4s, v21.4s, #1
208	sri	v20.4s, v21.4s, #31
209.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
210.inst	0xce409ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[1]
211.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
212	shl	v21.4s, v20.4s, #1
213	sri	v21.4s, v20.4s, #31
214.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
215.inst	0xce40aee6	//sm3tt2b v6.4s, v23.4s, v0.4s[2]
216.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
217	shl	v20.4s, v21.4s, #1
218	sri	v20.4s, v21.4s, #31
219.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
220.inst	0xce40bee6	//sm3tt2b v6.4s, v23.4s, v0.4s[3]
	// Step 7: rounds 24-27 consume v1 (w24-w27); new words w40-w43 -> v0.
221	// s4 = w7  | w8  | w9  | w10
222	ext	v0.16b, v2.16b, v3.16b, #12
223	// vtmp1 = w3  | w4  | w5  | w6
224	ext	v22.16b, v1.16b, v2.16b, #12
225	// vtmp2 = w10 | w11 | w12 | w13
226	ext	v23.16b, v3.16b, v4.16b, #8
227.inst	0xce64c020	//sm3partw1 v0.4s, v1.4s, v4.4s
228.inst	0xce76c6e0	//sm3partw2 v0.4s, v23.4s, v22.4s
229	eor	v22.16b, v1.16b, v2.16b
230.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
231	shl	v21.4s, v20.4s, #1
232	sri	v21.4s, v20.4s, #31
233.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
234.inst	0xce418ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[0]
235.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
236	shl	v20.4s, v21.4s, #1
237	sri	v20.4s, v21.4s, #31
238.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
239.inst	0xce419ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[1]
240.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
241	shl	v21.4s, v20.4s, #1
242	sri	v21.4s, v20.4s, #31
243.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
244.inst	0xce41aee6	//sm3tt2b v6.4s, v23.4s, v1.4s[2]
245.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
246	shl	v20.4s, v21.4s, #1
247	sri	v20.4s, v21.4s, #31
248.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
249.inst	0xce41bee6	//sm3tt2b v6.4s, v23.4s, v1.4s[3]
	// Step 8: rounds 28-31 consume v2 (w28-w31); new words w44-w47 -> v1.
250	// s4 = w7  | w8  | w9  | w10
251	ext	v1.16b, v3.16b, v4.16b, #12
252	// vtmp1 = w3  | w4  | w5  | w6
253	ext	v22.16b, v2.16b, v3.16b, #12
254	// vtmp2 = w10 | w11 | w12 | w13
255	ext	v23.16b, v4.16b, v0.16b, #8
256.inst	0xce60c041	//sm3partw1 v1.4s, v2.4s, v0.4s
257.inst	0xce76c6e1	//sm3partw2 v1.4s, v23.4s, v22.4s
258	eor	v22.16b, v2.16b, v3.16b
259.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
260	shl	v21.4s, v20.4s, #1
261	sri	v21.4s, v20.4s, #31
262.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
263.inst	0xce428ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[0]
264.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
265	shl	v20.4s, v21.4s, #1
266	sri	v20.4s, v21.4s, #31
267.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
268.inst	0xce429ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[1]
269.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
270	shl	v21.4s, v20.4s, #1
271	sri	v21.4s, v20.4s, #31
272.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
273.inst	0xce42aee6	//sm3tt2b v6.4s, v23.4s, v2.4s[2]
274.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
275	shl	v20.4s, v21.4s, #1
276	sri	v20.4s, v21.4s, #31
277.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
278.inst	0xce42bee6	//sm3tt2b v6.4s, v23.4s, v2.4s[3]
	// Step 9: rounds 32-35 consume v3 (w32-w35); new words w48-w51 -> v2.
279	// s4 = w7  | w8  | w9  | w10
280	ext	v2.16b, v4.16b, v0.16b, #12
281	// vtmp1 = w3  | w4  | w5  | w6
282	ext	v22.16b, v3.16b, v4.16b, #12
283	// vtmp2 = w10 | w11 | w12 | w13
284	ext	v23.16b, v0.16b, v1.16b, #8
285.inst	0xce61c062	//sm3partw1 v2.4s, v3.4s, v1.4s
286.inst	0xce76c6e2	//sm3partw2 v2.4s, v23.4s, v22.4s
287	eor	v22.16b, v3.16b, v4.16b
288.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
289	shl	v21.4s, v20.4s, #1
290	sri	v21.4s, v20.4s, #31
291.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
292.inst	0xce438ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[0]
293.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
294	shl	v20.4s, v21.4s, #1
295	sri	v20.4s, v21.4s, #31
296.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
297.inst	0xce439ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[1]
298.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
299	shl	v21.4s, v20.4s, #1
300	sri	v21.4s, v20.4s, #31
301.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
302.inst	0xce43aee6	//sm3tt2b v6.4s, v23.4s, v3.4s[2]
303.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
304	shl	v20.4s, v21.4s, #1
305	sri	v20.4s, v21.4s, #31
306.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
307.inst	0xce43bee6	//sm3tt2b v6.4s, v23.4s, v3.4s[3]
	// Step 10: rounds 36-39 consume v4 (w36-w39); new words w52-w55 -> v3.
308	// s4 = w7  | w8  | w9  | w10
309	ext	v3.16b, v0.16b, v1.16b, #12
310	// vtmp1 = w3  | w4  | w5  | w6
311	ext	v22.16b, v4.16b, v0.16b, #12
312	// vtmp2 = w10 | w11 | w12 | w13
313	ext	v23.16b, v1.16b, v2.16b, #8
314.inst	0xce62c083	//sm3partw1 v3.4s, v4.4s, v2.4s
315.inst	0xce76c6e3	//sm3partw2 v3.4s, v23.4s, v22.4s
316	eor	v22.16b, v4.16b, v0.16b
317.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
318	shl	v21.4s, v20.4s, #1
319	sri	v21.4s, v20.4s, #31
320.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
321.inst	0xce448ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[0]
322.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
323	shl	v20.4s, v21.4s, #1
324	sri	v20.4s, v21.4s, #31
325.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
326.inst	0xce449ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[1]
327.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
328	shl	v21.4s, v20.4s, #1
329	sri	v21.4s, v20.4s, #31
330.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
331.inst	0xce44aee6	//sm3tt2b v6.4s, v23.4s, v4.4s[2]
332.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
333	shl	v20.4s, v21.4s, #1
334	sri	v20.4s, v21.4s, #31
335.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
336.inst	0xce44bee6	//sm3tt2b v6.4s, v23.4s, v4.4s[3]
	// Step 11: rounds 40-43 consume v0 (w40-w43); new words w56-w59 -> v4.
337	// s4 = w7  | w8  | w9  | w10
338	ext	v4.16b, v1.16b, v2.16b, #12
339	// vtmp1 = w3  | w4  | w5  | w6
340	ext	v22.16b, v0.16b, v1.16b, #12
341	// vtmp2 = w10 | w11 | w12 | w13
342	ext	v23.16b, v2.16b, v3.16b, #8
343.inst	0xce63c004	//sm3partw1 v4.4s, v0.4s, v3.4s
344.inst	0xce76c6e4	//sm3partw2 v4.4s, v23.4s, v22.4s
345	eor	v22.16b, v0.16b, v1.16b
346.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
347	shl	v21.4s, v20.4s, #1
348	sri	v21.4s, v20.4s, #31
349.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
350.inst	0xce408ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[0]
351.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
352	shl	v20.4s, v21.4s, #1
353	sri	v20.4s, v21.4s, #31
354.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
355.inst	0xce409ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[1]
356.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
357	shl	v21.4s, v20.4s, #1
358	sri	v21.4s, v20.4s, #31
359.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
360.inst	0xce40aee6	//sm3tt2b v6.4s, v23.4s, v0.4s[2]
361.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
362	shl	v20.4s, v21.4s, #1
363	sri	v20.4s, v21.4s, #31
364.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
365.inst	0xce40bee6	//sm3tt2b v6.4s, v23.4s, v0.4s[3]
	// Step 12: rounds 44-47 consume v1 (w44-w47); new words w60-w63 -> v0.
366	// s4 = w7  | w8  | w9  | w10
367	ext	v0.16b, v2.16b, v3.16b, #12
368	// vtmp1 = w3  | w4  | w5  | w6
369	ext	v22.16b, v1.16b, v2.16b, #12
370	// vtmp2 = w10 | w11 | w12 | w13
371	ext	v23.16b, v3.16b, v4.16b, #8
372.inst	0xce64c020	//sm3partw1 v0.4s, v1.4s, v4.4s
373.inst	0xce76c6e0	//sm3partw2 v0.4s, v23.4s, v22.4s
374	eor	v22.16b, v1.16b, v2.16b
375.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
376	shl	v21.4s, v20.4s, #1
377	sri	v21.4s, v20.4s, #31
378.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
379.inst	0xce418ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[0]
380.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
381	shl	v20.4s, v21.4s, #1
382	sri	v20.4s, v21.4s, #31
383.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
384.inst	0xce419ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[1]
385.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
386	shl	v21.4s, v20.4s, #1
387	sri	v21.4s, v20.4s, #31
388.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
389.inst	0xce41aee6	//sm3tt2b v6.4s, v23.4s, v1.4s[2]
390.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
391	shl	v20.4s, v21.4s, #1
392	sri	v20.4s, v21.4s, #31
393.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
394.inst	0xce41bee6	//sm3tt2b v6.4s, v23.4s, v1.4s[3]
	// Step 13: rounds 48-51 consume v2 (w48-w51); new words w64-w67 -> v1.
	// This is the last sm3partw1/sm3partw2 pair of the block.
395	// s4 = w7  | w8  | w9  | w10
396	ext	v1.16b, v3.16b, v4.16b, #12
397	// vtmp1 = w3  | w4  | w5  | w6
398	ext	v22.16b, v2.16b, v3.16b, #12
399	// vtmp2 = w10 | w11 | w12 | w13
400	ext	v23.16b, v4.16b, v0.16b, #8
401.inst	0xce60c041	//sm3partw1 v1.4s, v2.4s, v0.4s
402.inst	0xce76c6e1	//sm3partw2 v1.4s, v23.4s, v22.4s
403	eor	v22.16b, v2.16b, v3.16b
404.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
405	shl	v21.4s, v20.4s, #1
406	sri	v21.4s, v20.4s, #31
407.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
408.inst	0xce428ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[0]
409.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
410	shl	v20.4s, v21.4s, #1
411	sri	v20.4s, v21.4s, #31
412.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
413.inst	0xce429ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[1]
414.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
415	shl	v21.4s, v20.4s, #1
416	sri	v21.4s, v20.4s, #31
417.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
418.inst	0xce42aee6	//sm3tt2b v6.4s, v23.4s, v2.4s[2]
419.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
420	shl	v20.4s, v21.4s, #1
421	sri	v20.4s, v21.4s, #31
422.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
423.inst	0xce42bee6	//sm3tt2b v6.4s, v23.4s, v2.4s[3]
	// Step 14: rounds 52-55 consume v3 (w52-w55).  No new message words
	// are derived in this or the two following steps.
424	eor	v22.16b, v3.16b, v4.16b
425.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
426	shl	v21.4s, v20.4s, #1
427	sri	v21.4s, v20.4s, #31
428.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
429.inst	0xce438ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[0]
430.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
431	shl	v20.4s, v21.4s, #1
432	sri	v20.4s, v21.4s, #31
433.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
434.inst	0xce439ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[1]
435.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
436	shl	v21.4s, v20.4s, #1
437	sri	v21.4s, v20.4s, #31
438.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
439.inst	0xce43aee6	//sm3tt2b v6.4s, v23.4s, v3.4s[2]
440.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
441	shl	v20.4s, v21.4s, #1
442	sri	v20.4s, v21.4s, #31
443.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
444.inst	0xce43bee6	//sm3tt2b v6.4s, v23.4s, v3.4s[3]
	// Step 15: rounds 56-59 consume v4 (w56-w59).
445	eor	v22.16b, v4.16b, v0.16b
446.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
447	shl	v21.4s, v20.4s, #1
448	sri	v21.4s, v20.4s, #31
449.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
450.inst	0xce448ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[0]
451.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
452	shl	v20.4s, v21.4s, #1
453	sri	v20.4s, v21.4s, #31
454.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
455.inst	0xce449ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[1]
456.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
457	shl	v21.4s, v20.4s, #1
458	sri	v21.4s, v20.4s, #31
459.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
460.inst	0xce44aee6	//sm3tt2b v6.4s, v23.4s, v4.4s[2]
461.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
462	shl	v20.4s, v21.4s, #1
463	sri	v20.4s, v21.4s, #31
464.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
465.inst	0xce44bee6	//sm3tt2b v6.4s, v23.4s, v4.4s[3]
	// Step 16: rounds 60-63 consume v0 (w60-w63), paired with v1 (w64-w67).
466	eor	v22.16b, v0.16b, v1.16b
467.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
468	shl	v21.4s, v20.4s, #1
469	sri	v21.4s, v20.4s, #31
470.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
471.inst	0xce408ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[0]
472.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
473	shl	v20.4s, v21.4s, #1
474	sri	v20.4s, v21.4s, #31
475.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
476.inst	0xce409ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[1]
477.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
478	shl	v21.4s, v20.4s, #1
479	sri	v21.4s, v20.4s, #31
480.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
481.inst	0xce40aee6	//sm3tt2b v6.4s, v23.4s, v0.4s[2]
482.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
483	shl	v20.4s, v21.4s, #1
484	sri	v20.4s, v21.4s, #31
485.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
486.inst	0xce40bee6	//sm3tt2b v6.4s, v23.4s, v0.4s[3]
	// Fold the state saved at the top of the block into the new state.
487	eor	v5.16b, v5.16b, v18.16b
488	eor	v6.16b, v6.16b, v19.16b
489
490	// any remained blocks?
	// w2 was already decremented at the top of this iteration.
491	cbnz	w2, .Loop
492
493	// save state
	// Undo the lane reversal applied at load time, then write back.
494	rev64	v5.4s, v5.4s
495	rev64	v6.4s, v6.4s
496	ext	v5.16b, v5.16b, v5.16b, #8
497	ext	v6.16b, v6.16b, v6.16b, #8
498	st1	{v5.4s,v6.4s}, [x0]
499	ret
500.size	ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
// ----------------------------------------------------------------------
// _sm3_consts / .Tj
//
// Read-only table of two adjacent 32-bit constants.  The function
// ossl_hwsm3_block_data_order takes the address of .Tj with adrp/add and
// loads both words at once with "ldp s16, s17", so the two values must
// stay contiguous and in this order:
//   word 0 (0x79cc4519) seeds the rotating constant for the rounds that
//          use sm3tt1a/sm3tt2a;
//   word 1 (0x9d8a7a87) seeds it for the rounds that use sm3tt1b/sm3tt2b.
//          Numerically this is 0x7a879d8a rotated by 16 bits.
// NOTE(review): these are presumably the SM3 T_j round constants (the
// second pre-rotated for round 16) -- confirm against the SM3 standard.
// ----------------------------------------------------------------------
501.section	.rodata
502
503.type	_sm3_consts,%object
	// .align 3 = 2^3 = 8-byte alignment, so the 8-byte pair is naturally
	// aligned.
504.align	3
505_sm3_consts:
506.Tj:
507.word	0x79cc4519, 0x9d8a7a87
508.size	_sm3_consts,.-_sm3_consts
	// Return to the section that was active before ".section .rodata".
509.previous
510