xref: /freebsd/crypto/openssl/crypto/sm3/asm/sm3-armv8.pl (revision e7be843b4a162e68651d3911f0357ed464915629)
1#! /usr/bin/env perl
2# Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# This module implements support for Armv8 SM3 instructions
10
# Command-line handling:
#   $output  - the last argument, if it looks like a file (it has an extension)
#   $flavour - the first argument, if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator: first in this script's own directory,
# then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The translator path is quoted in the same way as the interpreter and output
# paths, so that a script directory containing spaces does not split the
# command line into the wrong arguments.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Prefix used when naming the constant-table symbol in the generated code.
$prefix="sm3";
# Message expansion:
#	Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
# Input: s0, s1, s2, s3
#	s0 = w0  | w1  | w2  | w3
#	s1 = w4  | w5  | w6  | w7
#	s2 = w8  | w9  | w10 | w11
#	s3 = w12 | w13 | w14 | w15
# Output: s4
# Scratch: vtmp1, vtmp2 (both are overwritten)
#
# All seven arguments are vector register names such as "v0".  The generated
# instructions are appended to the global $code; nothing useful is returned.
#
# No prototype is declared on purpose: an empty "()" prototype would claim the
# routine takes no arguments and would only work when called with the "&"
# sigil, which bypasses prototype checking.
sub msg_exp {
	my ($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2) = @_;

	$code.=<<___;
	// s4 = w7  | w8  | w9  | w10
	ext     $s4.16b, $s1.16b, $s2.16b, #12
	// vtmp1 = w3  | w4  | w5  | w6
	ext	$vtmp1.16b, $s0.16b, $s1.16b, #12
	// vtmp2 = w10 | w11 | w12 | w13
	ext     $vtmp2.16b, $s2.16b, $s3.16b, #8
	sm3partw1       $s4.4s, $s0.4s, $s3.4s
	sm3partw2       $s4.4s, $vtmp2.4s, $vtmp1.4s
___
}
53
# A round of the compression function
# Input:
# 	ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
# 	vstate0 - vstate1, store digest status(A - H)
# 	vconst0 - vconst1, interleaved used to store Tj <<< j
# 	vtmp - temporary register
# 	vw - for sm3tt1ab, vw = s0 eor s1
# 	s0 - for sm3tt2ab, just be s0
# 	i, choose wj' or wj from vw
#
# All register arguments are vector register names such as "v5"; i is the
# lane index (0..3).  The generated instructions are appended to the global
# $code.
#
# No prototype is declared on purpose: an empty "()" prototype would claim the
# routine takes no arguments and would only work when called with the "&"
# sigil, which bypasses prototype checking.
sub round {
	my ($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp,
	    $vw, $s0, $i) = @_;

	# The shl/sri pair leaves vconst0 rotated left by one bit in vconst1,
	# which the caller passes as vconst0 of the following round.
	$code.=<<___;
	sm3ss1  $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
	shl     $vconst1.4s, $vconst0.4s, #1
	sri     $vconst1.4s, $vconst0.4s, #31
	sm3tt1$ab       $vstate0.4s, $vtmp.4s, $vw.4s[$i]
	sm3tt2$ab       $vstate1.4s, $vtmp.4s, $s0.4s[$i]
___
}
81
# Four consecutive rounds of the compression function, optionally preceded by
# one message expansion step.
# Input:
# 	ab - "a" or "b", passed through to round() to select the sm3tt variant
# 	vstate0 - vstate1, digest state registers
# 	vconst0 - vconst1, round constant registers; they swap roles every
# 	          round, so after the four rounds the current constant is
# 	          back in vconst0
# 	vtmp1 - vtmp2, temporary registers (both are overwritten)
# 	s0 - message words consumed by these four rounds
# 	s1 - eor-ed with s0 to form the second round operand
# 	s2, s3 - further message words, only used for the expansion
# 	s4 - optional; when given, msg_exp() is emitted first with s4 as its
# 	     output register.  When omitted, no expansion is emitted and
# 	     s2/s3 may be omitted as well.
#
# The generated instructions are appended to the global $code.
#
# No prototype is declared on purpose: an empty "()" prototype would claim the
# routine takes no arguments and would only work when called with the "&"
# sigil, which bypasses prototype checking.
sub qround {
	my ($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp1, $vtmp2,
	    $s0, $s1, $s2, $s3, $s4) = @_;

	if ($s4) {
		&msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
	}

	# vtmp1 is free again after the expansion and now holds s0 ^ s1.
	$code.=<<___;
	eor     $vtmp1.16b, $s0.16b, $s1.16b
___

	# One round per 32-bit lane.  Even lanes read vconst0 and write the
	# rotated constant to vconst1; odd lanes do the opposite.
	foreach my $lane (0 .. 3) {
		my ($const_in, $const_out) = ($lane % 2)
		    ? ($vconst1, $vconst0)
		    : ($vconst0, $vconst1);
		&round($ab, $vstate0, $vstate1, $const_in, $const_out, $vtmp2,
		       $vtmp1, $s0, $lane);
	}
}
110
# Start of the generated file: architecture header and text section.
$code=<<___;
#include "arm_arch.h"
.text
___

{{{
# Register allocation for the generated function.
my ($pstate, $pdata, $num) = ("x0", "x1", "w2");	# arguments
my ($state1, $state2) = ("v5", "v6");			# working digest state
my ($sconst1, $sconst2) = ("s16", "s17");		# constants as loaded
my ($vconst1, $vconst2) = ("v16", "v17");		# same registers, vector view
my ($s0, $s1, $s2, $s3, $s4) = map { 'v' . $_ } (0 .. 4);	# message words
my ($bkstate1, $bkstate2) = ("v18", "v19");		# state saved per block
my ($vconst_tmp1, $vconst_tmp2) = ("v20", "v21");	# rotating round constant
my ($vtmp1, $vtmp2) = ("v22", "v23");			# scratch
my $constaddr = "x8";					# address of .Tj

# The constant table lives in .rodata for linux64 flavours and is then
# addressed with adrp/add; otherwise it follows the code and a plain adr
# is used.
my $is_linux64 = ($flavour =~ /linux64/);

# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
$code.=<<___;
.globl	ossl_hwsm3_block_data_order
.type	ossl_hwsm3_block_data_order,%function
.align	5
ossl_hwsm3_block_data_order:
	AARCH64_VALID_CALL_TARGET
	// load state
	ld1     {$state1.4s-$state2.4s}, [$pstate]
	rev64   $state1.4s, $state1.4s
	rev64   $state2.4s, $state2.4s
	ext     $state1.16b, $state1.16b, $state1.16b, #8
	ext     $state2.16b, $state2.16b, $state2.16b, #8
___

if ($is_linux64) {
	$code.=<<___;
	adrp    $constaddr, .Tj
	add     $constaddr, $constaddr, #:lo12:.Tj
___
}
else {
	$code.=<<___;
	adr     $constaddr, .Tj
___
}

# NOTE(review): the loop below decrements the block counter before testing
# it, so a block count of zero is not handled here - presumably callers never
# pass 0; confirm against the C caller.
$code.=<<___;
	ldp     $sconst1, $sconst2, [$constaddr]

.Loop:
	// load input
	ld1     {$s0.4s-$s3.4s}, [$pdata], #64
	sub     $num, $num, #1

	mov     $bkstate1.16b, $state1.16b
	mov     $bkstate2.16b, $state2.16b

#ifndef __AARCH64EB__
	rev32   $s0.16b, $s0.16b
	rev32   $s1.16b, $s1.16b
	rev32   $s2.16b, $s2.16b
	rev32   $s3.16b, $s3.16b
#endif

	ext     $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
___

# Sixteen quad-rounds.  The five message registers are used as a ring: quad-
# round q consumes the ring rotated left by q positions.  Quad-rounds 0-3 use
# the "a" instruction variants and the first constant, quad-rounds 4-15 the
# "b" variants and the second constant.  The last three quad-rounds pass only
# the two registers they still read, which also suppresses the expansion.
my @ring = ($s0, $s1, $s2, $s3, $s4);
my @fixed = ($state1, $state2, $vconst_tmp1, $vconst_tmp2, $vtmp1, $vtmp2);

foreach my $q (0 .. 15) {
	if ($q == 4) {
		# Switch to the second round constant before the "b" rounds.
		$code.=<<___;
	ext     $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
___
	}

	my @words = map { $ring[($q + $_) % 5] } (0 .. 4);
	splice(@words, 2) if ($q >= 13);

	&qround(($q < 4 ? "a" : "b"), @fixed, @words);
}

$code.=<<___;
	eor     $state1.16b, $state1.16b, $bkstate1.16b
	eor     $state2.16b, $state2.16b, $bkstate2.16b

	// any remained blocks?
	cbnz    $num, .Loop

	// save state
	rev64   $state1.4s, $state1.4s
	rev64   $state2.4s, $state2.4s
	ext     $state1.16b, $state1.16b, $state1.16b, #8
	ext     $state2.16b, $state2.16b, $state2.16b, #8
	st1     {$state1.4s-$state2.4s}, [$pstate]
	ret
.size	ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
___

$code.=".rodata\n"  if ($is_linux64);

# Constant table: two 32-bit words, the first loaded into $sconst1 (used for
# the "a" rounds) and the second into $sconst2 (used for the "b" rounds).
$code.=<<___;

.type	_${prefix}_consts,%object
.align	3
_${prefix}_consts:
.Tj:
.word	0x79cc4519, 0x9d8a7a87
.size _${prefix}_consts,.-_${prefix}_consts
___

$code.=".previous\n"  if ($is_linux64);

}}}
241
242#########################################
# Base opcodes of the SM3 instructions, keyed by mnemonic.  The register (and,
# for sm3tt*, lane) fields are zero here and are OR-ed in by unsm3part(),
# unsm3ss1() and unsm3tt() respectively.
my %sm3partopcode = (
	sm3partw1 => 0xce60c000,
	sm3partw2 => 0xce60c400,
);

my %sm3ss1opcode = (
	sm3ss1 => 0xce400000,
);

my %sm3ttopcode = (
	sm3tt1a => 0xce408000,
	sm3tt1b => 0xce408400,
	sm3tt2a => 0xce408800,
	sm3tt2b => 0xce408c00,
);
255
# Encode an sm3partw1/sm3partw2 instruction as a raw ".inst" word.
#   $mnemonic - instruction name, used as the key into %sm3partopcode
#   $arg      - operand text "Vd, Vn, Vm"; anything between a register number
#               and the following comma (e.g. ".4s") is ignored
# Returns the ".inst" line, with the original instruction appended as a "//"
# comment.  Vd goes to bits 0-4, Vn to bits 5-9 and Vm to bits 16-20.
#
# If the operands cannot be parsed, the instruction text is returned as-is
# rather than a false value: the caller substitutes the return value for the
# instruction, so a false value would silently delete it from the output.
sub unsm3part {
	my ($mnemonic,$arg)=@_;

	return "$mnemonic\t$arg"
	    unless $arg =~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/;

	return sprintf ".inst\t0x%08x\t//%s %s",
			$sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
			$mnemonic,$arg;
}
265
# Encode an sm3ss1 instruction as a raw ".inst" word.
#   $mnemonic - instruction name, used as the key into %sm3ss1opcode
#   $arg      - operand text with four registers; anything between a register
#               number and the following comma (e.g. ".4s") is ignored
# Returns the ".inst" line, with the original instruction appended as a "//"
# comment.  The four registers go, in operand order, to bits 0-4, 5-9, 16-20
# and 10-14.
#
# If the operands cannot be parsed, the instruction text is returned as-is
# rather than a false value: the caller substitutes the return value for the
# instruction, so a false value would silently delete it from the output.
sub unsm3ss1 {
	my ($mnemonic,$arg)=@_;

	return "$mnemonic\t$arg"
	    unless $arg =~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/;

	return sprintf ".inst\t0x%08x\t//%s %s",
			$sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
			$mnemonic,$arg;
}
275
# Encode an sm3tt1a/sm3tt1b/sm3tt2a/sm3tt2b instruction as a raw ".inst" word.
#   $mnemonic - instruction name, used as the key into %sm3ttopcode
#   $arg      - operand text with three registers, the last one followed by a
#               lane index "[0]".."[3]"; anything between a register number
#               and the following comma (e.g. ".4s") is ignored
# Returns the ".inst" line, with the original instruction appended as a "//"
# comment.  The three registers go, in operand order, to bits 0-4, 5-9 and
# 16-20; the lane index goes to bits 12-13.
#
# If the operands cannot be parsed (including a lane index outside 0..3), the
# instruction text is returned as-is rather than a false value: the caller
# substitutes the return value for the instruction, so a false value would
# silently delete it from the output.
sub unsm3tt {
	my ($mnemonic,$arg)=@_;

	return "$mnemonic\t$arg"
	    unless $arg =~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/;

	return sprintf ".inst\t0x%08x\t//%s %s",
			$sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
			$mnemonic,$arg;
}
285
# Copy this script's leading comment block into the output, turning the "#"
# comment markers into "//".  The "#!" line is skipped and copying stops at
# the first line that is neither a comment nor empty.
#
# A three-argument open with a lexical handle is used so that the script path
# can never be interpreted as an open mode, and a failed open is reported
# instead of silently producing output without the header.
if (open my $self, '<', $0) {
	while(<$self>) {
		next if (/^#!/);
		last if (!s/^#/\/\// and !/^$/);
		print;
	}
	close $self;
} else {
	warn "can't open $0 to copy its header comment: $!";
}

# Post-process the generated code line by line:
#   - evaluate any `...` expression embedded in the line, and
#   - replace the SM3 instructions by their ".inst" encodings.
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
	s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
	s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
	print $_,"\n";
}

# STDOUT is a pipe to the translator; closing it surfaces write errors.
close STDOUT or die "error closing STDOUT: $!";
304