Lines Matching +full:4 +full:b
30 mov dg2v.16b, dg0v.16b
32 add t1.4s, v\s0\().4s, \rc\().4s
33 sha256h dg0q, dg1q, t0.4s
34 sha256h2 dg1q, dg2q, t0.4s
37 add t0.4s, v\s0\().4s, \rc\().4s
39 sha256h dg0q, dg1q, t1.4s
40 sha256h2 dg1q, dg2q, t1.4s
45 sha256su0 v\s0\().4s, v\s1\().4s
47 sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s
54 .align 4
75 ld1 { v0.4s- v3.4s}, [\tmp], #64
76 ld1 { v4.4s- v7.4s}, [\tmp], #64
77 ld1 { v8.4s-v11.4s}, [\tmp], #64
78 ld1 {v12.4s-v15.4s}, [\tmp]
91 ld1 {dgav.4s, dgbv.4s}, [x0]
94 0: ld1 {v16.4s-v19.4s}, [x1], #64
97 CPU_LE( rev32 v16.16b, v16.16b )
98 CPU_LE( rev32 v17.16b, v17.16b )
99 CPU_LE( rev32 v18.16b, v18.16b )
100 CPU_LE( rev32 v19.16b, v19.16b )
102 add t0.4s, v16.4s, v0.4s
103 mov dg0v.16b, dgav.16b
104 mov dg1v.16b, dgbv.16b
127 add dgav.4s, dgav.4s, dg0v.4s
128 add dgbv.4s, dgbv.4s, dg1v.4s
134 cbnz x2, 0b
137 1: st1 {dgav.4s, dgbv.4s}, [x0]
193 // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
194 // and m0_b contain the current 4 message schedule words for the first
198 // this also computes 4 more message schedule words for each message.
199 // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
208 add t0_a\().4s, \m0_a\().4s, \k\().4s
209 add t0_b\().4s, \m0_b\().4s, \k\().4s
211 sha256su0 \m0_a\().4s, \m1_a\().4s
212 sha256su0 \m0_b\().4s, \m1_b\().4s
213 sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
214 sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
216 mov t1_a.16b, state0_a.16b
217 mov t1_b.16b, state0_b.16b
218 sha256h state0_a_q, state1_a_q, t0_a\().4s
219 sha256h state0_b_q, state1_b_q, t0_b\().4s
220 sha256h2 state1_a_q, t1_a_q, t0_a\().4s
221 sha256h2 state1_b_q, t1_b_q, t0_b\().4s
226 do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
251 ld1 {state0_a.4s-state1_a.4s}, [ctx]
266 ld1 {v16.16b-v19.16b}, [x9]
267 st1 {v16.16b-v19.16b}, [sp]
269 ld1 {v16.16b-v19.16b}, [data1], #64
271 st1 {v16.16b-v19.16b}, [x9]
272 ld1 {v16.4s-v19.4s}, [sp]
274 ld1 {v20.16b-v23.16b}, [data2], #64
275 st1 {v20.16b-v23.16b}, [x9]
276 ld1 {v20.4s-v23.4s}, [sp]
282 mov state0_b.16b, state0_a.16b
283 mov state1_b.16b, state1_a.16b
284 b .Lfinup2x_loop_have_data
288 mov state0_b.16b, state0_a.16b
289 mov state1_b.16b, state1_a.16b
292 ld1 {v16.4s-v19.4s}, [data1], #64
293 ld1 {v20.4s-v23.4s}, [data2], #64
296 CPU_LE( rev32 v16.16b, v16.16b )
297 CPU_LE( rev32 v17.16b, v17.16b )
298 CPU_LE( rev32 v18.16b, v18.16b )
299 CPU_LE( rev32 v19.16b, v19.16b )
300 CPU_LE( rev32 v20.16b, v20.16b )
301 CPU_LE( rev32 v21.16b, v21.16b )
302 CPU_LE( rev32 v22.16b, v22.16b )
303 CPU_LE( rev32 v23.16b, v23.16b )
307 st1 {state0_a.4s-state1_b.4s}, [sp]
316 ld1 {v16.4s-v19.4s}, [sp]
317 add state0_a.4s, state0_a.4s, v16.4s
318 add state1_a.4s, state1_a.4s, v17.4s
319 add state0_b.4s, state0_b.4s, v18.4s
320 add state1_b.4s, state1_b.4s, v19.4s
345 CPU_BE( movi v16.16b, #0 )
348 movi v17.16b, #0
353 b.ge 1f // will count spill into its own block?
358 b 2f
362 ld1 {v16.16b-v19.16b}, [data1]
363 st1 {v16.16b-v19.16b}, [sp]
364 ld1 {v16.4s-v19.4s}, [x9]
365 ld1 {v20.16b-v23.16b}, [data2]
366 st1 {v20.16b-v23.16b}, [sp]
367 ld1 {v20.4s-v23.4s}, [x9]
368 b .Lfinup2x_loop_have_data
381 b 1f
391 mov v20.16b, v16.16b
394 mov v23.16b, v19.16b
396 b .Lfinup2x_loop_have_bswapped_data
400 CPU_LE( rev32 state0_a.16b, state0_a.16b )
401 CPU_LE( rev32 state1_a.16b, state1_a.16b )
402 CPU_LE( rev32 state0_b.16b, state0_b.16b )
403 CPU_LE( rev32 state1_b.16b, state1_b.16b )
404 st1 {state0_a.4s-state1_a.4s}, [out1]
405 st1 {state0_b.4s-state1_b.4s}, [out2]