########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
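#
# For reference, the message-schedule recurrence that the
# FOUR_ROUNDS_AND_SCHED macro below evaluates, as a scalar C sketch
# (not part of the build; W, s0, s1 and ror32c are illustrative names
# only).  The AVX2 code computes four dwords of W at a time, for two
# independent 64-byte blocks interleaved in the low/high 128-bit lanes
# of each ymm register:
#
#	static inline u32 ror32c(u32 x, int n)
#	{
#		return (x >> n) | (x << (32 - n));
#	}
#
#	/* for t = 16 .. 63 */
#	u32 s0 = ror32c(W[t-15], 7) ^ ror32c(W[t-15], 18) ^ (W[t-15] >> 3);
#	u32 s1 = ror32c(W[t-2], 17) ^ ror32c(W[t-2], 19) ^ (W[t-2] >> 10);
#	W[t] = W[t-16] + s0 + W[t-7] + s1;
#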

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
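#
# In C terms (sketch only, with p1 a memory operand and p2 a register
# as used above), both operands end up holding the sum:
#
#	reg += *mem;	/* add \p1, \p2 */
#	*mem = reg;	/* mov \p2, \p1 */
#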

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi  # 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e       = %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP


TBL	= %rbp
SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE
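#
# The frame built below (after %rsp is aligned to 32 bytes) looks
# roughly like the C struct sketched here; the struct and its field
# names are for orientation only, nothing in the build uses them:
#
#	struct frame {
#		u32 xfer[2 * 64];	/* _XFER: K[t]+W[t] for 2 blocks   */
#		u64 inp_end;		/* _INP_END: pointer to last block */
#		u64 inp;		/* _INP: current input pointer     */
#		u64 ctx;		/* _CTX: saved digest pointer      */
#		u64 rsp;		/* _RSP: original %rsp to restore  */
#	};
#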

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
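#
# ROTATE_ARGS (like rotate_Xs) renames assembler symbols instead of
# moving data between registers: the register that accumulated the new
# working value is simply relabelled "a" for the next round, the old
# "d" (updated in place) becomes "e", and so on.  In scalar C terms
# (sketch only, names illustrative):
#
#	t1 = h + K[t] + W[t] + S1(e) + Ch(e, f, g);
#	t2 = S0(a) + Maj(a, b, c);
#	d += t1;	/* becomes next round's e, no register move */
#	h  = t1 + t2;	/* becomes next round's a, no register move */
#	/* the remaining symbols (b, c, f, g, ...) are just renamed */
#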

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --

	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm
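#
# For reference, one scalar SHA-256 round as specified in FIPS 180-4,
# which the interleaved integer instructions above implement (C sketch
# only; ror32c is a 32-bit rotate right, K[t] is the round constant
# from K256, W[t] the scheduled word).  The code uses the equivalent
# forms Ch = ((f^g)&e)^g and Maj = ((a|c)&b)|(a&c):
#
#	u32 S1  = ror32c(e, 6) ^ ror32c(e, 11) ^ ror32c(e, 25);
#	u32 ch  = (e & f) ^ (~e & g);
#	u32 t1  = h + S1 + ch + K[t] + W[t];
#	u32 S0  = ror32c(a, 2) ^ ror32c(a, 13) ^ ror32c(a, 22);
#	u32 maj = (a & b) ^ (a & c) ^ (b & c);
#	u32 t2  = S0 + maj;
#	h = g; g = f; f = e; e = d + t1;
#	d = c; c = b; b = a; a = t1 + t2;
#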

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
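##
## A C-side sketch of the calling convention implied by the register
## assignments above (CTX = %rdi, INP = %rsi, NUM_BLKS = %rdx).  The
## declaration and call site below are illustrative only; the kernel
## glue code carries the authoritative prototype:
##
##	asmlinkage void sha256_transform_rorx(u32 *digest,
##					      const char *data,
##					      u64 num_blks);
##
##	/* hypothetical call site, processing complete 64-byte blocks: */
##	sha256_transform_rorx(state, data, len / SHA256_BLOCK_SIZE);
##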
.text
ENTRY(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	lea     K256(%rip), TBL

	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	1*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	2*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	3*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32
	vpaddd	1*32(TBL, SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	#### do last block
	lea	K256(%rip), TBL

	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	ret
ENDPROC(sha256_transform_rorx)

.data
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
#endif
