########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
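#
# Two blocks are interleaved in the ymm registers: after the load and
# transpose below, the low 128-bit lane of each of X0..X3 holds four
# message dwords of the first block and the high lane holds the
# corresponding dwords of the second block.  The scalar rounds consume
# the first block while the vector unit extends the schedule for both
# blocks; the second block is then replayed from the K+W values saved
# on the stack (the +16 displacements used in .Lloop3).
########################################################################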

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
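# For example, "addm (4*0)(CTX), a" below performs state[0] += a,
# folding a working variable back into the digest word in memory.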

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi  # 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e       = %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX
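# Since e and y3 reuse the argument registers (NUM_BLKS and INP) and
# SRND reuses CTX, the end-of-input pointer, the input pointer and the
# context pointer are saved to the stack (_INP_END, _INP, _CTX) before
# the rounds start.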

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE
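
# Resulting stack frame (%rsp is 32-byte aligned by the prologue):
#	[%rsp + _XFER .. +511]	precomputed K[t]+W[t], 64 rounds x 2 blocks
#	[%rsp + _INP_END]	pointer to the last input block
#	[%rsp + _INP]		current input pointer
#	[%rsp + _CTX]		digest (state) pointer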

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

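# For reference, one SHA-256 round (FIPS 180-4) in C-like pseudocode,
# using the S0/S1/CH/MAJ names that appear in the comments below:
#
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#	CH  = (e & f) ^ (~e & g)		# computed as ((f^g)&e)^g
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#	MAJ = (a & b) ^ (a & c) ^ (b & c)	# computed as ((a|c)&b)|(a&c)
#	t1  = h + S1 + CH + K[t] + W[t]
#	h = g; g = f; f = e; e = d + t1
#	d = c; c = b; b = a; a = t1 + S0 + MAJ
#
# ROTATE_ARGS and rotate_Xs only rename assembler symbols; they emit no
# instructions, so the register shuffle implied by the rotation above
# never happens at run time.
#
# FOUR_ROUNDS_AND_SCHED runs four such rounds on the first block while
# extending the message schedule for both blocks:
#
#	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
#	sigma0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#	sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#
# On entry X0..X3 hold W[t-16]..W[t-1] (four dwords per block per
# register); on exit the oldest register holds W[t]..W[t+3] and
# rotate_Xs renames the registers.  AVX2 has no packed dword rotate, so
# the rotates are built from shift pairs combined with vpor/vpxor, and
# sigma1 is computed in two halves ({xBxA}, then {xDxC}) using vpshufd,
# vpsrlq and the SHUF_00BA/SHUF_DC00 masks.  \disp, together with SRND,
# indexes the precomputed K[t]+W[t] dwords for these four rounds in the
# _XFER area.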
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --

	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1,h		# h = k + w + h + S0                    # --
	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3,h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm

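# DO_4ROUNDS is the same four-round computation without any message
# scheduling.  It is used for the final 16 rounds of a block (where no
# further W[t] are needed) and, in .Lloop3, to replay all 64 rounds of
# the second block from the K[t]+W[t] values already saved in _XFER.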
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h		# h = k + w + h # --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_block_state *state,
##			      const u8 *data, size_t nblocks);
########################################################################
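#
# state points to eight u32 state words H0..H7 and is updated in place;
# data points to nblocks 64-byte blocks.  Per the x86-64 SysV ABI,
# CTX = %rdi, INP = %rsi and NUM_BLKS = %rdx.  Since ymm registers are
# used, in-kernel callers are expected to wrap the call in an FPU
# section, roughly:
#
#	kernel_fpu_begin();
#	sha256_transform_rorx(state, data, nblocks);
#	kernel_fpu_end();
#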
.text
SYM_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
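	# The _XFER area is written with vmovdqa, which requires 32-byte
	# alignment; %rbp preserves the original stack pointer for the
	# epilogue.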

	shl	$6, NUM_BLKS	# convert to bytes
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	.Lonly_one_block
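
	# Blocks are processed two at a time.  _INP_END points at the
	# start of the final block: if INP already equals it, only one
	# block remains and it is handled through the 128-bit loads at
	# .Lonly_one_block/.Ldo_last_block.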

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
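
	# XTMP0/XTMP1 held words 0-7 and 8-15 of block 1, XTMP2/XTMP3 the
	# same words of block 2.  The vperm2i128 pairs recombine them so
	# X0..X3 hold words 0-3, 4-7, 8-11 and 12-15, with block 1 in the
	# low 128-bit lane and block 2 in the high lane.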

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND
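	# SRND advances by 4*32 bytes per .Lloop1 iteration and doubles
	# as the index into both the K256 table and the _XFER save area.
	# Each iteration adds the duplicated round constants to the
	# current schedule registers, stores the K+W sums for both
	# blocks, and runs 16 rounds of the first block while producing
	# the next 16 schedule dwords per block.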

.align 16
.Lloop1:
	leaq	K256+0*32(%rip), INP		## reuse INP as scratch reg
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 1*32)

	leaq	K256+2*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 2*32)

	leaq	K256+3*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 3*32)

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	leaq	K256+0*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 1*32)
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Lloop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash

	#### Do second block using previously scheduled results
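	# a..h now hold the digest after block 1, which is the input
	# state for block 2.  The rounds below reuse the K+W dwords
	# stored during the first pass; the +16 byte displacement selects
	# the high-lane (second block) half of each 32-byte _XFER row.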
	xor	SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS	(_XFER + 0*32 + 16)
	DO_4ROUNDS	(_XFER + 1*32 + 16)
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Lloop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	.Lloop0
	ja	.Ldone_hash

.Ldo_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	vzeroupper
	RET
SYM_FUNC_END(sha256_transform_rorx)

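# The round constants: each row of four .long values appears twice so a
# single 256-bit vpaddd adds the same K[t..t+3] to the schedule words of
# both blocks (low and high 128-bit lanes).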
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

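# vpshufb mask that reverses the bytes within each 32-bit word,
# converting the big-endian message words to the little-endian form the
# arithmetic uses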
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
