xref: /linux/arch/x86/crypto/sha256-avx2-asm.S (revision e3b9f1e81de2083f359bacd2a94bf1c024f2ede0)
1########################################################################
2# Implement fast SHA-256 with AVX2 instructions. (x86_64)
3#
4# Copyright (C) 2013 Intel Corporation.
5#
6# Authors:
7#     James Guilford <james.guilford@intel.com>
8#     Kirk Yap <kirk.s.yap@intel.com>
9#     Tim Chen <tim.c.chen@linux.intel.com>
10#
11# This software is available to you under a choice of one of two
12# licenses.  You may choose to be licensed under the terms of the GNU
13# General Public License (GPL) Version 2, available from the file
14# COPYING in the main directory of this source tree, or the
15# OpenIB.org BSD license below:
16#
17#     Redistribution and use in source and binary forms, with or
18#     without modification, are permitted provided that the following
19#     conditions are met:
20#
21#      - Redistributions of source code must retain the above
22#        copyright notice, this list of conditions and the following
23#        disclaimer.
24#
25#      - Redistributions in binary form must reproduce the above
26#        copyright notice, this list of conditions and the following
27#        disclaimer in the documentation and/or other materials
28#        provided with the distribution.
29#
30# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37# SOFTWARE.
38#
39########################################################################
40#
41# This code is described in an Intel White-Paper:
42# "Fast SHA-256 Implementations on Intel Architecture Processors"
43#
44# To find it, surf to http://www.intel.com/p/en_US/embedded
45# and search for that title.
46#
47########################################################################
48# This code schedules 2 blocks at a time, with 4 lanes per block
49########################################################################
50
51#ifdef CONFIG_AS_AVX2
52#include <linux/linkage.h>
53
54## assume buffers not aligned
55#define	VMOVDQ vmovdqu
56
57################################ Define Macros
58
# addm mem, reg
# reg += mem, then mem = reg.  Folds a 32-bit working variable into its
# digest word in memory and leaves the sum in both places.
# Clobbers: flags.
.macro addm p1 p2
	addl	\p1, \p2		# reg = reg + [mem]
	movl	\p2, \p1		# [mem] = reg
.endm
65
66################################
67
# ---- Incoming arguments ----------------------------------------------
CTX		= %rdi		# 1st arg
INP		= %rsi		# 2nd arg
NUM_BLKS	= %rdx		# 3rd arg

SRND		= CTX		# SRND is same register as CTX

# ---- SHA-256 working variables (renamed every round by ROTATE_ARGS) ---
a	= %eax
b	= %ebx
c	= %ecx
d	= %r8d
e	= %edx			# clobbers NUM_BLKS
f	= %r9d
g	= %r10d
h	= %r11d
old_h	= %r11d

# ---- Scalar scratch ---------------------------------------------------
T1	= %r12d
y0	= %r13d
y1	= %r14d
y2	= %r15d
y3	= %esi			# clobbers INP

# ---- Message schedule vectors (renamed by rotate_Xs) ------------------
X0	= %ymm4
X1	= %ymm5
X2	= %ymm6
X3	= %ymm7

# XMM versions of above
XWORD0	= %xmm4
XWORD1	= %xmm5
XWORD2	= %xmm6
XWORD3	= %xmm7

# ---- Vector scratch ---------------------------------------------------
XTMP0	= %ymm0
XTMP1	= %ymm1
XTMP2	= %ymm2
XTMP3	= %ymm3
XTMP4	= %ymm8
XTMP5	= %ymm11
XFER	= %ymm9

# ---- Constant vectors -------------------------------------------------
SHUF_00BA	= %ymm10	# shuffle xBxA -> 00BA
SHUF_DC00	= %ymm12	# shuffle xDxC -> DC00
BYTE_FLIP_MASK	= %ymm13
X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK
114
115
# Stack frame layout: each slot's offset from %rsp, followed by its size.
_XFER		= 0				# K+W values for the rounds
_XFER_SIZE	= 2*64*4			# 2 blocks, 64 rounds, 4 bytes/round

_XMM_SAVE	= _XFER     + _XFER_SIZE
_XMM_SAVE_SIZE	= 0				# empty slot

_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE	# pointer to last input block
_INP_END_SIZE	= 8

_INP		= _INP_END  + _INP_END_SIZE	# saved input pointer
_INP_SIZE	= 8

_CTX		= _INP      + _INP_SIZE		# saved digest pointer
_CTX_SIZE	= 8

_RSP		= _CTX      + _CTX_SIZE		# %rsp value before the frame was carved
_RSP_SIZE	= 8

STACK_SIZE	= _RSP      + _RSP_SIZE
130
# rotate_Xs
# Rename the schedule vectors one step: X0 <- X1 <- X2 <- X3 <- (old X0).
# Assembly-time only; emits no instructions.
.macro rotate_Xs
	X_FIRST_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_FIRST_
.endm
140
# ROTATE_ARGS
# Rename the working variables one step: the register that was h becomes
# a, a becomes b, ... g becomes h.  old_h keeps naming the register that
# was h before the rotation.  Assembly-time only; emits no instructions.
.macro ROTATE_ARGS
	old_h = h		# also serves as the wrap-around value for a
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = old_h
.endm
155
# FOUR_ROUNDS_AND_SCHED disp
#
# Run four SHA-256 rounds on the scalar working variables a..h and, in
# the gaps between the scalar instructions, compute the next four
# message-schedule words from X0..X3 in every vector lane.  The new words
# are written to X0 and the X symbols are then rotated, so the caller
# always finds the newest-but-three words in X0 again.
#
# \disp : displacement, relative to (%rsp, SRND), of the four K+W dwords
#         consumed by these rounds.
#
# Reads   : X0..X3, SHUF_00BA, SHUF_DC00, K+W dwords at \disp .. \disp+12
# Writes  : a..h (renamed four times via ROTATE_ARGS), X0 (then rotate_Xs)
# Clobbers: y0, y1, y2, y3, T1, XTMP0..XTMP5, flags
#
# Comment notation: "ror" is a 32-bit rotate right (rorx), W[-n] is the
# schedule word n positions before the first word being produced,
# S1/S0/CH/MAJ are the SHA-256 round functions, t1 = h + S1 + CH + k + w.
# The scalar/vector interleaving is deliberate; keep the order as is.
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e ror 25				# S1A
	rorx	$11, e, y1	# y1 = e ror 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a ror 13				# S0B

	xor	y1, y0		# y0 = (e ror 25) ^ (e ror 11)		# S1
	xor	g, y2		# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = e ror 6				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = S1 = (e ror 25)^(e ror 11)^(e ror 6)
	rorx	$22, a, y1	# y1 = a ror 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a ror 22) ^ (a ror 13)		# S0
	rorx	$2, a, T1	# T1 = a ror 2				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2	# XTMP2 = W[-15] >> 7
	xor	T1, y1		# y1 = S0 = (a ror 22)^(a ror 13)^(a ror 2)
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3	# XTMP3 = W[-15] << 25
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e ror 25				# S1A
	rorx	$11, e, y1	# y1 = e ror 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a ror 13				# S0B
	xor	y1, y0		# y0 = (e ror 25) ^ (e ror 11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	rorx	$6, e, y1	# y1 = e ror 6				# S1
	xor	y1, y0		# y0 = S1 = (e ror 25)^(e ror 11)^(e ror 6)
	rorx	$22, a, y1	# y1 = a ror 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1	# XTMP1 = W[-15] << 14
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a ror 22) ^ (a ror 13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3	# XTMP3 = (W[-15] ror 7) ^ (W[-15] << 14)
	rorx	$2, a, T1	# T1 = a ror 2				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = S0 = (a ror 22)^(a ror 13)^(a ror 2)
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e ror 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e ror 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA
	mov	f, y2		# y2 = f				# CH
	xor	g, y2		# y2 = f^g				# CH

	rorx	$13, a, T1	# T1 = a ror 13				# S0B
	xor	y1, y0		# y0 = (e ror 25) ^ (e ror 11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = e ror 6				# S1
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xBxA}
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = S1 = (e ror 25)^(e ror 11)^(e ror 6)
	rorx	$22, a, y1	# y1 = a ror 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a ror 22) ^ (a ror 13)		# S0
	rorx	$2, a, T1	# T1 = a ror 2				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = S0 = (a ror 22)^(a ror 13)^(a ror 2)
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0

	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e ror 25				# S1A
	rorx	$11, e, y1	# y1 = e ror 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a ror 13				# S0B
	xor	y1, y0		# y0 = (e ror 25) ^ (e ror 11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = e ror 6				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = S1 = (e ror 25)^(e ror 11)^(e ror 6)
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 ^ W[-2] ror 19 {xDxC}
	rorx	$22, a, y1	# y1 = a ror 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a ror 22) ^ (a ror 13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = a ror 2				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = S0 = (a ror 22)^(a ror 13)^(a ror 2)
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm
357
# DO_ROUND_NOSCHED disp, rnd        (private helper of DO_4ROUNDS)
#
# One SHA-256 round without message scheduling.
#
# \disp : displacement, relative to (%rsp, SRND), of the first of the
#         four K+W dwords used by the enclosing DO_4ROUNDS.
# \rnd  : position of this round inside the group of four (0..3).
#
# To shorten the dependency chain, the last two additions into h
# (S1 + CH, then MAJ) are not done at the end of the round.  They are
# issued at the start of the next round instead, against old_h, which
# after ROTATE_ARGS names the register that was h.  Hence:
#   rnd 0    : nothing pending from a previous round
#   rnd 1..3 : first complete the previous round's h through old_h
#   rnd 3    : additionally complete its own h before returning
#
# Clobbers: y0, y1, y2, y3, T1, flags.  "ror" is a 32-bit rotate right.
.macro DO_ROUND_NOSCHED disp, rnd
.if \rnd
	add	y2, old_h	# prev h = k + w + h + S0 + S1 + CH = t1 + S0
.endif
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e ror 25				# S1A
	rorx	$11, e, y1	# y1 = e ror 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e ror 25) ^ (e ror 11)		# S1
	rorx	$6, e, y1	# y1 = e ror 6				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
.if \rnd
	add	y3, old_h	# prev h = t1 + S0 + MAJ
.endif

	xor	y1, y0		# y0 = S1 = (e ror 25)^(e ror 11)^(e ror 6)
	rorx	$13, a, T1	# T1 = a ror 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a ror 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a ror 22) ^ (a ror 13)		# S0
	rorx	$2, a, T1	# T1 = a ror 2				# S0
	offset = \disp + 4*\rnd
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = S0 = (a ror 22)^(a ror 13)^(a ror 2)
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
.if \rnd == 3
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h		# h = t1 + S0 + MAJ			# --
.endif

	ROTATE_ARGS
.endm

# DO_4ROUNDS disp
#
# Run four SHA-256 rounds on a..h using K+W dwords that are already on
# the stack at \disp .. \disp+12 relative to (%rsp, SRND).  No message
# scheduling is performed.  On exit all of a..h are fully updated.
#
# Clobbers: y0, y1, y2, y3, T1, flags.
.macro DO_4ROUNDS disp
	DO_ROUND_NOSCHED \disp, 0
	DO_ROUND_NOSCHED \disp, 1
	DO_ROUND_NOSCHED \disp, 2
	DO_ROUND_NOSCHED \disp, 3
.endm
521
########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 (%rdi) : pointer to digest (eight 32-bit words, updated in place)
## arg 2 (%rsi) : pointer to input data
## arg 3 (%rdx) : Num blocks (64 bytes each); 0 returns immediately
##
## Blocks are consumed two at a time, one per 128-bit lane: the first
## block of a pair is hashed while the schedule for both is computed,
## then the second block is hashed from the saved K+W values.  A single
## block, or the odd block left at the end, is loaded into the low lanes
## only (.Ldo_last_block).
##
## Saves/restores %rbx, %r12-%r15 and %rsp.  Uses a 32-byte aligned
## stack frame of STACK_SIZE bytes.  Clobbers %rax, %rcx, %rdx, %rsi,
## %rdi, %r8-%r11, %ymm0-%ymm13 and flags.
########################################################################
.text
.align 32			# align before the label so the entry point is the first instruction
ENTRY(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)	# remember the unaligned rsp for the epilogue


	shl	$6, NUM_BLKS	# convert to bytes
	jz	.Ldone_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	.Lonly_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)	# CTX's register is reused as SRND below

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	## (low lane = first block, high lane = second block)
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)	# INP's register is reused as y3 by the rounds

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
.Lloop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Lloop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	## add the working variables into the digest
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash	# the block just hashed was the last one

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Lloop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	## add the working variables into the digest
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	.Lloop0		# at least two blocks left
	ja	.Ldone_hash	# nothing left
				# equal: exactly one block left, fall through

.Ldo_last_block:
	## load one block into the low 128-bit lanes only
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx

	# Clear the upper halves of the ymm registers so that SSE code run
	# after we return does not pay AVX/SSE state transition penalties.
	vzeroupper
	ret
ENDPROC(sha256_transform_rorx)
717
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64

# Emit four consecutive round constants twice, so that one 32-byte load
# at K256 + 32*i gets the same four constants in both 128-bit lanes.
.macro K256_QUAD_X2 k0, k1, k2, k3
	.long	\k0, \k1, \k2, \k3
	.long	\k0, \k1, \k2, \k3
.endm

# SHA-256 round constants, 16 groups of four, each group duplicated.
K256:
	K256_QUAD_X2	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	K256_QUAD_X2	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	K256_QUAD_X2	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	K256_QUAD_X2	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	K256_QUAD_X2	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	K256_QUAD_X2	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	K256_QUAD_X2	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	K256_QUAD_X2	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	K256_QUAD_X2	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	K256_QUAD_X2	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	K256_QUAD_X2	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	K256_QUAD_X2	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	K256_QUAD_X2	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	K256_QUAD_X2	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	K256_QUAD_X2	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	K256_QUAD_X2	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
753
# vpshufb control that reverses the byte order of every dword; the same
# 16-byte pattern is emitted once per 128-bit lane.
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.rept 2
	.octa 0x0c0d0e0f08090a0b0405060700010203
.endr
758
# shuffle xBxA -> 00BA
# (dwords 0 and 2 are packed into the low qword, the high qword is
# zeroed; one copy of the pattern per 128-bit lane)
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
.rept 2
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
.endr
764
# shuffle xDxC -> DC00
# (dwords 0 and 2 are packed into the high qword, the low qword is
# zeroed; one copy of the pattern per 128-bit lane)
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
.rept 2
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
.endr
770
771#endif
772