Lines Matching +full:2 +full:- +full:8

1 /* Do not modify. This file is auto-generated from armv8-mont.pl. */
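For orientation: armv8-mont.pl is OpenSSL's ARMv8 Montgomery-multiplication module, and the matched lines below fall inside bn_mul_mont, the NEON bn_mul8x_mont_neon path, and the __bn_sqr8x_mont / __bn_mul4x_mont helpers (see the .size directives further down). Below is a minimal word-serial C sketch of the Montgomery multiplication these routines implement. It is illustrative only: the function name is made up, it is not the OpenSSL bn_mul_mont prototype, it ignores constant-time and scheduling concerns, and it relies on the GCC/Clang unsigned __int128 extension.

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch only: rp[] = ap[]*bp[]*2^(-64*num) mod np[] for little-endian
     * num-word operands, with n0 = -np[0]^(-1) mod 2^64 (np[0] odd) and
     * tp[] a scratch area of num+1 words. */
    static void mont_mul_sketch(uint64_t *rp, const uint64_t *ap,
                                const uint64_t *bp, const uint64_t *np,
                                uint64_t n0, size_t num, uint64_t *tp)
    {
        for (size_t j = 0; j <= num; j++) tp[j] = 0;

        for (size_t i = 0; i < num; i++) {
            unsigned __int128 c = 0;
            for (size_t j = 0; j < num; j++) {       /* t += bp[i]*a */
                c += (unsigned __int128)bp[i] * ap[j] + tp[j];
                tp[j] = (uint64_t)c;
                c >>= 64;
            }
            c += tp[num];
            tp[num] = (uint64_t)c;
            uint64_t top = (uint64_t)(c >> 64);

            uint64_t m = tp[0] * n0;                 /* makes t + m*n == 0 mod 2^64 */
            c = 0;
            for (size_t j = 0; j < num; j++) {       /* t += m*n */
                c += (unsigned __int128)m * np[j] + tp[j];
                tp[j] = (uint64_t)c;
                c >>= 64;
            }
            c += tp[num];
            tp[num] = (uint64_t)c;
            top += (uint64_t)(c >> 64);

            for (size_t j = 0; j < num; j++)         /* t >>= 64; tp[0] is now 0 */
                tp[j] = tp[j + 1];
            tp[num] = top;
        }

        /* t < 2n here; one conditional subtraction, which the assembly
         * does branch-free with its sbcs/csel tail. */
        uint64_t borrow = 0;
        for (size_t j = 0; j < num; j++) {
            uint64_t s = tp[j] - np[j] - borrow;
            borrow = (tp[j] < np[j]) || (borrow && tp[j] == np[j]);
            rp[j] = s;
        }
        if (borrow > tp[num])                        /* borrowed: t was < n, keep t */
            for (size_t j = 0; j < num; j++) rp[j] = tp[j];
    }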
34 stp x29,x30,[sp,#-64]!
40 ldr x9,[x2],#8 // bp[0]
45 and x22,x22,#-16 // ABI says so
49 sub x21,x5,#16 // j=num-2
68 // x6 being non-zero. So that carry can be calculated
69 // by adding -1 to x6. That's what next instruction does.
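The comment excerpted at file lines 68-69 uses a standard trick: when a 64-bit sum is known to wrap to zero, the carry out of that addition is simply (addend != 0), and adding -1 to the addend leaves exactly that bit in the AArch64 carry flag. A tiny self-contained check of the identity, with made-up values:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        /* If x + y wraps to 0 mod 2^64, the sum was either exactly 0
         * (x == y == 0, no carry) or exactly 2^64 (carry).  So the
         * carry out equals (x != 0), which is the bit that "adding -1
         * to x" reports in the AArch64 carry flag. */
        uint64_t x = 0xdeadbeefcafef00dULL;   /* arbitrary example value */
        uint64_t y = (uint64_t)0 - x;         /* chosen so x + y == 0 mod 2^64 */
        uint64_t sum = x + y;
        uint64_t carry = (sum < x);           /* carry out of the 64-bit add */
        assert(sum == 0);
        assert(carry == (uint64_t)(x != 0));
        return 0;
    }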
76 ldr x8,[x1],#8
78 sub x21,x21,#8 // j--
81 ldr x14,[x3],#8
91 str x12,[x22],#8 // tp[j-1]
104 sub x20,x5,#8 // i=num-1
111 ldr x9,[x2],#8 // bp[i]
114 add x22,sp,#8
117 sub x21,x5,#16 // j=num-2
126 sub x20,x20,#8 // i--
137 ldr x8,[x1],#8
139 ldr x23,[x22],#8 // tp[j]
141 sub x21,x21,#8 // j--
145 ldr x14,[x3],#8
156 stur x12,[x22,#-16] // tp[j-1]
160 ldr x23,[x22],#8 // tp[j]
177 stp x12,x13,[x22,#-16]
186 add x22,sp,#8
187 ldr x14,[x3],#8 // np[0]
188 subs x21,x5,#8 // j=num-1 and clear borrow
191 sbcs x8,x23,x14 // tp[j]-np[j]
192 ldr x23,[x22],#8
193 sub x21,x21,#8 // j--
194 ldr x14,[x3],#8
195 str x8,[x1],#8 // rp[j]=tp[j]-np[j]
200 str x8,[x1],#8 // rp[num-1]
203 add x22,sp,#8
204 ldr x8,[x0],#8 // rp[0]
205 sub x5,x5,#8 // num--
208 sub x5,x5,#8 // num--
210 ldr x23,[x22],#8
211 ldr x8,[x0],#8
212 stur xzr,[x22,#-16] // wipe tp
213 stur x14,[x0,#-16]
217 stur xzr,[x22,#-8] // wipe tp
218 stur x14,[x0,#-8]
228 .size bn_mul_mont,.-bn_mul_mont
234 stp x29,x30,[sp,#-80]!
250 and x7,x7,#-64
256 sub x8,x5,#8
261 st1 {v6.2d,v7.2d},[x7],#32
262 subs x8,x8,#8
263 st1 {v8.2d,v9.2d},[x7],#32
264 st1 {v10.2d,v11.2d},[x7],#32
265 st1 {v12.2d,v13.2d},[x7],#32
270 add x10,sp,#8
282 umlal v6.2d,v28.2s,v0.s[0]
283 umlal v7.2d,v28.2s,v0.s[1]
284 umlal v8.2d,v28.2s,v0.s[2]
285 shl v29.2d,v6.2d,#16
286 ext v29.16b,v29.16b,v29.16b,#8
287 umlal v9.2d,v28.2s,v0.s[3]
288 add v29.2d,v29.2d,v6.2d
289 umlal v10.2d,v28.2s,v1.s[0]
290 mul v29.2s,v29.2s,v30.2s
291 umlal v11.2d,v28.2s,v1.s[1]
292 st1 {v28.2s},[sp] // put aside smashed b[8*i+0]
293 umlal v12.2d,v28.2s,v1.s[2]
295 umlal v13.2d,v28.2s,v1.s[3]
297 umlal v6.2d,v29.2s,v2.s[0]
298 umlal v7.2d,v29.2s,v2.s[1]
300 umlal v8.2d,v29.2s,v2.s[2]
301 ushr v15.2d,v6.2d,#16
302 umlal v9.2d,v29.2s,v2.s[3]
303 umlal v10.2d,v29.2s,v3.s[0]
304 ext v6.16b,v6.16b,v6.16b,#8
305 add v6.2d,v6.2d,v15.2d
306 umlal v11.2d,v29.2s,v3.s[1]
307 ushr v6.2d,v6.2d,#16
308 umlal v12.2d,v29.2s,v3.s[2]
309 umlal v13.2d,v29.2s,v3.s[3]
310 add v16.2d,v7.2d,v6.2d
312 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0]
313 umlal v7.2d,v28.2s,v0.s[0]
314 ld1 {v6.2d},[x6],#16
315 umlal v8.2d,v28.2s,v0.s[1]
316 umlal v9.2d,v28.2s,v0.s[2]
317 shl v29.2d,v7.2d,#16
318 ext v29.16b,v29.16b,v29.16b,#8
319 umlal v10.2d,v28.2s,v0.s[3]
320 add v29.2d,v29.2d,v7.2d
321 umlal v11.2d,v28.2s,v1.s[0]
322 mul v29.2s,v29.2s,v30.2s
323 umlal v12.2d,v28.2s,v1.s[1]
324 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1]
325 umlal v13.2d,v28.2s,v1.s[2]
327 umlal v6.2d,v28.2s,v1.s[3]
329 umlal v7.2d,v29.2s,v2.s[0]
330 umlal v8.2d,v29.2s,v2.s[1]
332 umlal v9.2d,v29.2s,v2.s[2]
333 ushr v15.2d,v7.2d,#16
334 umlal v10.2d,v29.2s,v2.s[3]
335 umlal v11.2d,v29.2s,v3.s[0]
336 ext v7.16b,v7.16b,v7.16b,#8
337 add v7.2d,v7.2d,v15.2d
338 umlal v12.2d,v29.2s,v3.s[1]
339 ushr v7.2d,v7.2d,#16
340 umlal v13.2d,v29.2s,v3.s[2]
341 umlal v6.2d,v29.2s,v3.s[3]
342 add v16.2d,v8.2d,v7.2d
344 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1]
345 umlal v8.2d,v28.2s,v0.s[0]
346 ld1 {v7.2d},[x6],#16
347 umlal v9.2d,v28.2s,v0.s[1]
348 umlal v10.2d,v28.2s,v0.s[2]
349 shl v29.2d,v8.2d,#16
350 ext v29.16b,v29.16b,v29.16b,#8
351 umlal v11.2d,v28.2s,v0.s[3]
352 add v29.2d,v29.2d,v8.2d
353 umlal v12.2d,v28.2s,v1.s[0]
354 mul v29.2s,v29.2s,v30.2s
355 umlal v13.2d,v28.2s,v1.s[1]
356 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2]
357 umlal v6.2d,v28.2s,v1.s[2]
359 umlal v7.2d,v28.2s,v1.s[3]
361 umlal v8.2d,v29.2s,v2.s[0]
362 umlal v9.2d,v29.2s,v2.s[1]
364 umlal v10.2d,v29.2s,v2.s[2]
365 ushr v15.2d,v8.2d,#16
366 umlal v11.2d,v29.2s,v2.s[3]
367 umlal v12.2d,v29.2s,v3.s[0]
368 ext v8.16b,v8.16b,v8.16b,#8
369 add v8.2d,v8.2d,v15.2d
370 umlal v13.2d,v29.2s,v3.s[1]
371 ushr v8.2d,v8.2d,#16
372 umlal v6.2d,v29.2s,v3.s[2]
373 umlal v7.2d,v29.2s,v3.s[3]
374 add v16.2d,v9.2d,v8.2d
376 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2]
377 umlal v9.2d,v28.2s,v0.s[0]
378 ld1 {v8.2d},[x6],#16
379 umlal v10.2d,v28.2s,v0.s[1]
380 umlal v11.2d,v28.2s,v0.s[2]
381 shl v29.2d,v9.2d,#16
382 ext v29.16b,v29.16b,v29.16b,#8
383 umlal v12.2d,v28.2s,v0.s[3]
384 add v29.2d,v29.2d,v9.2d
385 umlal v13.2d,v28.2s,v1.s[0]
386 mul v29.2s,v29.2s,v30.2s
387 umlal v6.2d,v28.2s,v1.s[1]
388 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3]
389 umlal v7.2d,v28.2s,v1.s[2]
391 umlal v8.2d,v28.2s,v1.s[3]
393 umlal v9.2d,v29.2s,v2.s[0]
394 umlal v10.2d,v29.2s,v2.s[1]
396 umlal v11.2d,v29.2s,v2.s[2]
397 ushr v15.2d,v9.2d,#16
398 umlal v12.2d,v29.2s,v2.s[3]
399 umlal v13.2d,v29.2s,v3.s[0]
400 ext v9.16b,v9.16b,v9.16b,#8
401 add v9.2d,v9.2d,v15.2d
402 umlal v6.2d,v29.2s,v3.s[1]
403 ushr v9.2d,v9.2d,#16
404 umlal v7.2d,v29.2s,v3.s[2]
405 umlal v8.2d,v29.2s,v3.s[3]
406 add v16.2d,v10.2d,v9.2d
408 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3]
409 umlal v10.2d,v28.2s,v0.s[0]
410 ld1 {v9.2d},[x6],#16
411 umlal v11.2d,v28.2s,v0.s[1]
412 umlal v12.2d,v28.2s,v0.s[2]
413 shl v29.2d,v10.2d,#16
414 ext v29.16b,v29.16b,v29.16b,#8
415 umlal v13.2d,v28.2s,v0.s[3]
416 add v29.2d,v29.2d,v10.2d
417 umlal v6.2d,v28.2s,v1.s[0]
418 mul v29.2s,v29.2s,v30.2s
419 umlal v7.2d,v28.2s,v1.s[1]
420 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4]
421 umlal v8.2d,v28.2s,v1.s[2]
423 umlal v9.2d,v28.2s,v1.s[3]
425 umlal v10.2d,v29.2s,v2.s[0]
426 umlal v11.2d,v29.2s,v2.s[1]
428 umlal v12.2d,v29.2s,v2.s[2]
429 ushr v15.2d,v10.2d,#16
430 umlal v13.2d,v29.2s,v2.s[3]
431 umlal v6.2d,v29.2s,v3.s[0]
432 ext v10.16b,v10.16b,v10.16b,#8
433 add v10.2d,v10.2d,v15.2d
434 umlal v7.2d,v29.2s,v3.s[1]
435 ushr v10.2d,v10.2d,#16
436 umlal v8.2d,v29.2s,v3.s[2]
437 umlal v9.2d,v29.2s,v3.s[3]
438 add v16.2d,v11.2d,v10.2d
440 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4]
441 umlal v11.2d,v28.2s,v0.s[0]
442 ld1 {v10.2d},[x6],#16
443 umlal v12.2d,v28.2s,v0.s[1]
444 umlal v13.2d,v28.2s,v0.s[2]
445 shl v29.2d,v11.2d,#16
446 ext v29.16b,v29.16b,v29.16b,#8
447 umlal v6.2d,v28.2s,v0.s[3]
448 add v29.2d,v29.2d,v11.2d
449 umlal v7.2d,v28.2s,v1.s[0]
450 mul v29.2s,v29.2s,v30.2s
451 umlal v8.2d,v28.2s,v1.s[1]
452 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5]
453 umlal v9.2d,v28.2s,v1.s[2]
455 umlal v10.2d,v28.2s,v1.s[3]
457 umlal v11.2d,v29.2s,v2.s[0]
458 umlal v12.2d,v29.2s,v2.s[1]
460 umlal v13.2d,v29.2s,v2.s[2]
461 ushr v15.2d,v11.2d,#16
462 umlal v6.2d,v29.2s,v2.s[3]
463 umlal v7.2d,v29.2s,v3.s[0]
464 ext v11.16b,v11.16b,v11.16b,#8
465 add v11.2d,v11.2d,v15.2d
466 umlal v8.2d,v29.2s,v3.s[1]
467 ushr v11.2d,v11.2d,#16
468 umlal v9.2d,v29.2s,v3.s[2]
469 umlal v10.2d,v29.2s,v3.s[3]
470 add v16.2d,v12.2d,v11.2d
472 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5]
473 umlal v12.2d,v28.2s,v0.s[0]
474 ld1 {v11.2d},[x6],#16
475 umlal v13.2d,v28.2s,v0.s[1]
476 umlal v6.2d,v28.2s,v0.s[2]
477 shl v29.2d,v12.2d,#16
478 ext v29.16b,v29.16b,v29.16b,#8
479 umlal v7.2d,v28.2s,v0.s[3]
480 add v29.2d,v29.2d,v12.2d
481 umlal v8.2d,v28.2s,v1.s[0]
482 mul v29.2s,v29.2s,v30.2s
483 umlal v9.2d,v28.2s,v1.s[1]
484 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6]
485 umlal v10.2d,v28.2s,v1.s[2]
487 umlal v11.2d,v28.2s,v1.s[3]
489 umlal v12.2d,v29.2s,v2.s[0]
490 umlal v13.2d,v29.2s,v2.s[1]
492 umlal v6.2d,v29.2s,v2.s[2]
493 ushr v15.2d,v12.2d,#16
494 umlal v7.2d,v29.2s,v2.s[3]
495 umlal v8.2d,v29.2s,v3.s[0]
496 ext v12.16b,v12.16b,v12.16b,#8
497 add v12.2d,v12.2d,v15.2d
498 umlal v9.2d,v29.2s,v3.s[1]
499 ushr v12.2d,v12.2d,#16
500 umlal v10.2d,v29.2s,v3.s[2]
501 umlal v11.2d,v29.2s,v3.s[3]
502 add v16.2d,v13.2d,v12.2d
504 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6]
505 umlal v13.2d,v28.2s,v0.s[0]
506 ld1 {v12.2d},[x6],#16
507 umlal v6.2d,v28.2s,v0.s[1]
508 umlal v7.2d,v28.2s,v0.s[2]
509 shl v29.2d,v13.2d,#16
510 ext v29.16b,v29.16b,v29.16b,#8
511 umlal v8.2d,v28.2s,v0.s[3]
512 add v29.2d,v29.2d,v13.2d
513 umlal v9.2d,v28.2s,v1.s[0]
514 mul v29.2s,v29.2s,v30.2s
515 umlal v10.2d,v28.2s,v1.s[1]
516 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7]
517 umlal v11.2d,v28.2s,v1.s[2]
519 umlal v12.2d,v28.2s,v1.s[3]
520 ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
521 umlal v13.2d,v29.2s,v2.s[0]
523 umlal v6.2d,v29.2s,v2.s[1]
524 umlal v7.2d,v29.2s,v2.s[2]
526 ushr v5.2d,v5.2d,#16
527 ext v13.16b,v13.16b,v13.16b,#8
528 umlal v8.2d,v29.2s,v2.s[3]
529 umlal v9.2d,v29.2s,v3.s[0]
530 add v13.2d,v13.2d,v5.2d
531 umlal v10.2d,v29.2s,v3.s[1]
532 ushr v13.2d,v13.2d,#16
535 umlal v11.2d,v29.2s,v3.s[2]
536 umlal v12.2d,v29.2s,v3.s[3]
537 add v6.2d,v6.2d,v13.2d
538 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7]
539 add x10,sp,#8 // rewind
540 sub x8,x5,#8
545 subs x8,x8,#8
546 umlal v6.2d,v28.2s,v0.s[0]
547 ld1 {v13.2d},[x6]
548 umlal v7.2d,v28.2s,v0.s[1]
549 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0]
550 umlal v8.2d,v28.2s,v0.s[2]
552 umlal v9.2d,v28.2s,v0.s[3]
556 umlal v10.2d,v28.2s,v1.s[0]
557 umlal v11.2d,v28.2s,v1.s[1]
558 umlal v12.2d,v28.2s,v1.s[2]
559 umlal v13.2d,v28.2s,v1.s[3]
560 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1]
561 umlal v6.2d,v29.2s,v2.s[0]
562 umlal v7.2d,v29.2s,v2.s[1]
563 umlal v8.2d,v29.2s,v2.s[2]
564 umlal v9.2d,v29.2s,v2.s[3]
565 umlal v10.2d,v29.2s,v3.s[0]
566 umlal v11.2d,v29.2s,v3.s[1]
567 umlal v12.2d,v29.2s,v3.s[2]
568 umlal v13.2d,v29.2s,v3.s[3]
569 st1 {v6.2d},[x7],#16
570 umlal v7.2d,v28.2s,v0.s[0]
571 ld1 {v6.2d},[x6]
572 umlal v8.2d,v28.2s,v0.s[1]
573 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1]
574 umlal v9.2d,v28.2s,v0.s[2]
578 umlal v10.2d,v28.2s,v0.s[3]
579 umlal v11.2d,v28.2s,v1.s[0]
580 umlal v12.2d,v28.2s,v1.s[1]
581 umlal v13.2d,v28.2s,v1.s[2]
582 umlal v6.2d,v28.2s,v1.s[3]
583 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2]
584 umlal v7.2d,v29.2s,v2.s[0]
585 umlal v8.2d,v29.2s,v2.s[1]
586 umlal v9.2d,v29.2s,v2.s[2]
587 umlal v10.2d,v29.2s,v2.s[3]
588 umlal v11.2d,v29.2s,v3.s[0]
589 umlal v12.2d,v29.2s,v3.s[1]
590 umlal v13.2d,v29.2s,v3.s[2]
591 umlal v6.2d,v29.2s,v3.s[3]
592 st1 {v7.2d},[x7],#16
593 umlal v8.2d,v28.2s,v0.s[0]
594 ld1 {v7.2d},[x6]
595 umlal v9.2d,v28.2s,v0.s[1]
596 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2]
597 umlal v10.2d,v28.2s,v0.s[2]
601 umlal v11.2d,v28.2s,v0.s[3]
602 umlal v12.2d,v28.2s,v1.s[0]
603 umlal v13.2d,v28.2s,v1.s[1]
604 umlal v6.2d,v28.2s,v1.s[2]
605 umlal v7.2d,v28.2s,v1.s[3]
606 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3]
607 umlal v8.2d,v29.2s,v2.s[0]
608 umlal v9.2d,v29.2s,v2.s[1]
609 umlal v10.2d,v29.2s,v2.s[2]
610 umlal v11.2d,v29.2s,v2.s[3]
611 umlal v12.2d,v29.2s,v3.s[0]
612 umlal v13.2d,v29.2s,v3.s[1]
613 umlal v6.2d,v29.2s,v3.s[2]
614 umlal v7.2d,v29.2s,v3.s[3]
615 st1 {v8.2d},[x7],#16
616 umlal v9.2d,v28.2s,v0.s[0]
617 ld1 {v8.2d},[x6]
618 umlal v10.2d,v28.2s,v0.s[1]
619 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3]
620 umlal v11.2d,v28.2s,v0.s[2]
624 umlal v12.2d,v28.2s,v0.s[3]
625 umlal v13.2d,v28.2s,v1.s[0]
626 umlal v6.2d,v28.2s,v1.s[1]
627 umlal v7.2d,v28.2s,v1.s[2]
628 umlal v8.2d,v28.2s,v1.s[3]
629 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4]
630 umlal v9.2d,v29.2s,v2.s[0]
631 umlal v10.2d,v29.2s,v2.s[1]
632 umlal v11.2d,v29.2s,v2.s[2]
633 umlal v12.2d,v29.2s,v2.s[3]
634 umlal v13.2d,v29.2s,v3.s[0]
635 umlal v6.2d,v29.2s,v3.s[1]
636 umlal v7.2d,v29.2s,v3.s[2]
637 umlal v8.2d,v29.2s,v3.s[3]
638 st1 {v9.2d},[x7],#16
639 umlal v10.2d,v28.2s,v0.s[0]
640 ld1 {v9.2d},[x6]
641 umlal v11.2d,v28.2s,v0.s[1]
642 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4]
643 umlal v12.2d,v28.2s,v0.s[2]
647 umlal v13.2d,v28.2s,v0.s[3]
648 umlal v6.2d,v28.2s,v1.s[0]
649 umlal v7.2d,v28.2s,v1.s[1]
650 umlal v8.2d,v28.2s,v1.s[2]
651 umlal v9.2d,v28.2s,v1.s[3]
652 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5]
653 umlal v10.2d,v29.2s,v2.s[0]
654 umlal v11.2d,v29.2s,v2.s[1]
655 umlal v12.2d,v29.2s,v2.s[2]
656 umlal v13.2d,v29.2s,v2.s[3]
657 umlal v6.2d,v29.2s,v3.s[0]
658 umlal v7.2d,v29.2s,v3.s[1]
659 umlal v8.2d,v29.2s,v3.s[2]
660 umlal v9.2d,v29.2s,v3.s[3]
661 st1 {v10.2d},[x7],#16
662 umlal v11.2d,v28.2s,v0.s[0]
663 ld1 {v10.2d},[x6]
664 umlal v12.2d,v28.2s,v0.s[1]
665 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5]
666 umlal v13.2d,v28.2s,v0.s[2]
670 umlal v6.2d,v28.2s,v0.s[3]
671 umlal v7.2d,v28.2s,v1.s[0]
672 umlal v8.2d,v28.2s,v1.s[1]
673 umlal v9.2d,v28.2s,v1.s[2]
674 umlal v10.2d,v28.2s,v1.s[3]
675 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6]
676 umlal v11.2d,v29.2s,v2.s[0]
677 umlal v12.2d,v29.2s,v2.s[1]
678 umlal v13.2d,v29.2s,v2.s[2]
679 umlal v6.2d,v29.2s,v2.s[3]
680 umlal v7.2d,v29.2s,v3.s[0]
681 umlal v8.2d,v29.2s,v3.s[1]
682 umlal v9.2d,v29.2s,v3.s[2]
683 umlal v10.2d,v29.2s,v3.s[3]
684 st1 {v11.2d},[x7],#16
685 umlal v12.2d,v28.2s,v0.s[0]
686 ld1 {v11.2d},[x6]
687 umlal v13.2d,v28.2s,v0.s[1]
688 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6]
689 umlal v6.2d,v28.2s,v0.s[2]
693 umlal v7.2d,v28.2s,v0.s[3]
694 umlal v8.2d,v28.2s,v1.s[0]
695 umlal v9.2d,v28.2s,v1.s[1]
696 umlal v10.2d,v28.2s,v1.s[2]
697 umlal v11.2d,v28.2s,v1.s[3]
698 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7]
699 umlal v12.2d,v29.2s,v2.s[0]
700 umlal v13.2d,v29.2s,v2.s[1]
701 umlal v6.2d,v29.2s,v2.s[2]
702 umlal v7.2d,v29.2s,v2.s[3]
703 umlal v8.2d,v29.2s,v3.s[0]
704 umlal v9.2d,v29.2s,v3.s[1]
705 umlal v10.2d,v29.2s,v3.s[2]
706 umlal v11.2d,v29.2s,v3.s[3]
707 st1 {v12.2d},[x7],#16
708 umlal v13.2d,v28.2s,v0.s[0]
709 ld1 {v12.2d},[x6]
710 umlal v6.2d,v28.2s,v0.s[1]
711 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7]
712 umlal v7.2d,v28.2s,v0.s[2]
716 umlal v8.2d,v28.2s,v0.s[3]
717 umlal v9.2d,v28.2s,v1.s[0]
718 umlal v10.2d,v28.2s,v1.s[1]
719 umlal v11.2d,v28.2s,v1.s[2]
720 umlal v12.2d,v28.2s,v1.s[3]
722 sub x1,x1,x5,lsl#2 // rewind
724 umlal v13.2d,v29.2s,v2.s[0]
725 ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
726 umlal v6.2d,v29.2s,v2.s[1]
728 umlal v7.2d,v29.2s,v2.s[2]
729 add x10,sp,#8 // rewind
730 umlal v8.2d,v29.2s,v2.s[3]
731 umlal v9.2d,v29.2s,v3.s[0]
732 umlal v10.2d,v29.2s,v3.s[1]
733 umlal v11.2d,v29.2s,v3.s[2]
734 st1 {v13.2d},[x7],#16
735 umlal v12.2d,v29.2s,v3.s[3]
739 st1 {v6.2d,v7.2d},[x7],#32
741 st1 {v8.2d,v9.2d},[x7],#32
743 st1 {v10.2d,v11.2d},[x7],#32
744 st1 {v12.2d},[x7]
746 subs x9,x9,#8
747 ld1 {v6.2d,v7.2d},[x6],#32
748 ld1 {v8.2d,v9.2d},[x6],#32
749 ld1 {v10.2d,v11.2d},[x6],#32
750 ld1 {v12.2d,v13.2d},[x6],#32
753 sub x3,x3,x5,lsl#2 // rewind
758 st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame
760 ushr v15.2d,v6.2d,#16
761 ext v6.16b,v6.16b,v6.16b,#8
762 st1 {v2.2d,v3.2d}, [sp],#32
763 add v6.2d,v6.2d,v15.2d
764 st1 {v2.2d,v3.2d}, [sp],#32
765 ushr v15.2d,v6.2d,#16
766 st1 {v2.2d,v3.2d}, [sp],#32
775 add v6.2d,v6.2d,v15.2d
777 ushr v15.2d,v6.2d,#16
778 ext v6.16b,v6.16b,v6.16b,#8
779 ld1 {v8.2d,v9.2d}, [x6],#32
780 add v6.2d,v6.2d,v15.2d
781 ld1 {v10.2d,v11.2d}, [x6],#32
782 ushr v15.2d,v6.2d,#16
783 ld1 {v12.2d,v13.2d}, [x6],#32
788 add v7.2d,v7.2d,v15.2d
790 ushr v15.2d,v7.2d,#16
792 ext v7.16b,v7.16b,v7.16b,#8
793 add v7.2d,v7.2d,v15.2d
794 ushr v15.2d,v7.2d,#16
797 add v8.2d,v8.2d,v15.2d
799 ushr v15.2d,v8.2d,#16
801 ext v8.16b,v8.16b,v8.16b,#8
802 add v8.2d,v8.2d,v15.2d
803 ushr v15.2d,v8.2d,#16
806 add v9.2d,v9.2d,v15.2d
808 ushr v15.2d,v9.2d,#16
810 ext v9.16b,v9.16b,v9.16b,#8
811 add v9.2d,v9.2d,v15.2d
812 ushr v15.2d,v9.2d,#16
815 add v10.2d,v10.2d,v15.2d
817 ushr v15.2d,v10.2d,#16
819 ext v10.16b,v10.16b,v10.16b,#8
820 add v10.2d,v10.2d,v15.2d
821 ushr v15.2d,v10.2d,#16
824 add v11.2d,v11.2d,v15.2d
826 ushr v15.2d,v11.2d,#16
828 ext v11.16b,v11.16b,v11.16b,#8
829 add v11.2d,v11.2d,v15.2d
830 ushr v15.2d,v11.2d,#16
833 add v12.2d,v12.2d,v15.2d
835 ushr v15.2d,v12.2d,#16
837 ext v12.16b,v12.16b,v12.16b,#8
838 add v12.2d,v12.2d,v15.2d
839 ushr v15.2d,v12.2d,#16
842 add v13.2d,v13.2d,v15.2d
844 ushr v15.2d,v13.2d,#16
846 ext v13.16b,v13.16b,v13.16b,#8
847 add v13.2d,v13.2d,v15.2d
848 ushr v15.2d,v13.2d,#16
851 ld1 {v6.2d,v7.2d}, [x6],#32
852 subs x8,x8,#8
856 st1 {v15.s}[0], [x7],#4 // top-most bit
857 sub x3,x3,x5,lsl#2 // rewind x3
859 add x2,sp,x5,lsl#2
862 ldp w4,w5,[x1],#8
863 ldp w6,w7,[x1],#8
864 ldp w8,w9,[x3],#8
865 ldp w10,w11,[x3],#8
871 stp w8,w9,[x0],#8
872 stp w10,w11,[x0],#8
875 ldr w10, [x1] // load top-most bit
886 ldp w4,w5,[x1],#8
887 ldp w6,w7,[x1],#8
888 ldp w8,w9,[x0],#8
890 sub x0,x0,#8
897 st1 {v0.2d,v1.2d}, [x3],#32 // wipe
898 st1 {v0.2d,v1.2d}, [x3],#32 // wipe
899 ldp w4,w5,[x1],#8
900 ldp w6,w7,[x1],#8
901 stp w8,w9,[x0],#8
902 stp w10,w11,[x0],#8
904 ldp w8,w9,[x0],#8
906 sub x0,x0,#8
913 st1 {v0.2d,v1.2d}, [x1],#32 // wipe
914 st1 {v0.2d,v1.2d}, [x3],#32 // wipe
916 stp w8,w9,[x0],#8
917 stp w10,w11,[x0],#8
929 .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
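The NEON path that ends here, bn_mul8x_mont_neon, works on 32-bit half-words of the 64-bit limbs (note the w-register ldp/stp pairs and the .2s arrangements above) and accumulates 32x32->64 partial products in vector lanes with umlal; the "smashed" b[8*i+j] / m[8*i+j] values in its comments appear to be those 32-bit pieces. The underlying identity is just the schoolbook split of one 64x64 product into four 32x32 products. A small check, with made-up values and the unsigned __int128 extension as the 128-bit reference:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
        uint32_t a0 = (uint32_t)a, a1 = (uint32_t)(a >> 32);
        uint32_t b0 = (uint32_t)b, b1 = (uint32_t)(b >> 32);

        /* Four 32x32->64 partial products, the kind of term a single
         * umlal lane accumulates, placed at bit offsets 0, 32, 32, 64. */
        unsigned __int128 full =
              (unsigned __int128)((uint64_t)a0 * b0)
            + ((unsigned __int128)((uint64_t)a0 * b1) << 32)
            + ((unsigned __int128)((uint64_t)a1 * b0) << 32)
            + ((unsigned __int128)((uint64_t)a1 * b1) << 64);
        assert(full == (unsigned __int128)a * b);
        return 0;
    }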
938 stp x29,x30,[sp,#-128]!
947 ldp x6,x7,[x1,#8*0]
948 ldp x8,x9,[x1,#8*2]
949 ldp x10,x11,[x1,#8*4]
950 ldp x12,x13,[x1,#8*6]
956 sub x27,x5,#8*8
960 sub x27,x27,#8*8
961 stp xzr,xzr,[x2,#8*0]
962 stp xzr,xzr,[x2,#8*2]
963 stp xzr,xzr,[x2,#8*4]
964 stp xzr,xzr,[x2,#8*6]
966 stp xzr,xzr,[x2,#8*8]
967 stp xzr,xzr,[x2,#8*10]
968 stp xzr,xzr,[x2,#8*12]
969 stp xzr,xzr,[x2,#8*14]
970 add x2,x2,#8*16
974 add x1,x1,#8*8
990 // a[2]a[0]
996 // a[2]a[1] (ii)
1002 // a[3]a[2] (iii)
1003 // a[4]a[2]
1004 // a[5]a[2]
1005 // a[6]a[2]
1006 // a[7]a[2]
1036 stp x19,x20,[x2],#8*2 // t[0..1]
1037 adc x19,xzr,xzr // t[8]
1038 adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
1045 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
1058 umulh x14,x8,x7 // hi(a[2..7]*a[1])
1065 stp x21,x22,[x2],#8*2 // t[2..3]
1072 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
1083 umulh x17,x9,x8 // hi(a[3..7]*a[2])
1090 stp x23,x24,[x2],#8*2 // t[4..5]
1111 stp x25,x26,[x2],#8*2 // t[6..7]
1153 ldp x6,x7,[x2,#8*0]
1154 ldp x8,x9,[x2,#8*2]
1155 ldp x10,x11,[x2,#8*4]
1156 ldp x12,x13,[x2,#8*6]
1159 ldp x6,x7,[x1,#8*0]
1162 ldp x8,x9,[x1,#8*2]
1165 ldp x10,x11,[x1,#8*4]
1169 ldp x12,x13,[x1,#8*6]
1170 add x1,x1,#8*8
1172 mov x27,#-8*8
1174 // a[8]a[0]
1182 // a[8]a[1]
1184 // a[8]a[2]
1185 // a[f]a[2]........................
1186 // a[8]a[3]
1188 // a[8]a[4]
1190 // a[8]a[5]
1192 // a[8]a[6]
1194 // a[8]a[7]
1198 adc x28,xzr,xzr // carry bit, modulo-scheduled
1200 add x27,x27,#8
1220 str x19,[x2],#8
1241 ldp x6,x7,[x2,#8*0]
1242 ldp x8,x9,[x2,#8*2]
1243 ldp x10,x11,[x2,#8*4]
1244 ldp x12,x13,[x2,#8*6]
1246 ldur x4,[x0,#-8*8]
1248 ldp x6,x7,[x1,#8*0]
1251 ldp x8,x9,[x1,#8*2]
1254 ldp x10,x11,[x1,#8*4]
1256 mov x27,#-8*8
1258 ldp x12,x13,[x1,#8*6]
1259 add x1,x1,#8*8
1265 ldp x6,x7,[x0,#8*0]
1266 add x1,x0,#8*8
1267 ldp x8,x9,[x0,#8*2]
1269 ldp x10,x11,[x0,#8*4]
1271 ldp x12,x13,[x0,#8*6]
1274 stp x19,x20,[x2,#8*0]
1275 ldp x19,x20,[x15,#8*0]
1276 stp x21,x22,[x2,#8*2]
1277 ldp x21,x22,[x15,#8*2]
1278 stp x23,x24,[x2,#8*4]
1279 ldp x23,x24,[x15,#8*4]
1280 stp x25,x26,[x2,#8*6]
1282 ldp x25,x26,[x15,#8*6]
1287 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1288 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
1289 ldp x15,x16,[sp,#8*1]
1290 ldp x11,x13,[x14,#8*2]
1291 add x1,x14,#8*4
1292 ldp x17,x14,[sp,#8*3]
1294 stp x19,x20,[x2,#8*0]
1296 stp x21,x22,[x2,#8*2]
1298 stp x23,x24,[x2,#8*4]
1300 stp x25,x26,[x2,#8*6]
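The comment at file line 1287 above describes the usual squaring shortcut: accumulate each off-diagonal product a[i]*a[j] (i > j) only once, double the whole accumulator, then add the diagonal squares a[i]*a[i]. A toy two-limb check of that identity, with made-up 16-bit limbs:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        uint64_t a0 = 0xbeef, a1 = 0x1234;      /* two 16-bit "limbs" */
        uint64_t a  = (a1 << 16) | a0;

        uint64_t cross = (a1 * a0) << 16;       /* a[1]*a[0] at limb position 1 */
        uint64_t sq = 2 * cross                 /* "multiply above result by 2"... */
                    + ((a1 * a1) << 32)         /* ...and add a[1]*a[1] */
                    + (a0 * a0);                /* ...| a[0]*a[0] */
        assert(sq == a * a);
        return 0;
    }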
1305 sub x27,x5,#8*4
1310 sub x27,x27,#8*4
1312 ldp x15,x16,[x2,#8*5]
1314 ldp x7,x9,[x1],#8*2
1319 stp x19,x20,[x2,#8*0]
1322 stp x21,x22,[x2,#8*2]
1324 ldp x17,x14,[x2,#8*7]
1329 ldp x15,x16,[x2,#8*9]
1331 ldp x11,x13,[x1],#8*2
1335 stp x23,x24,[x2,#8*4]
1337 stp x25,x26,[x2,#8*6]
1338 add x2,x2,#8*8
1342 ldp x17,x14,[x2,#8*3]
1350 ldp x15,x16,[x2,#8*5]
1353 stp x19,x20,[x2,#8*0]
1356 stp x21,x22,[x2,#8*2]
1360 ldp x19,x20,[sp,#8*0]
1363 ldp x6,x7,[x1,#8*0]
1366 ldp x8,x9,[x1,#8*2]
1368 ldp x10,x11,[x1,#8*4]
1372 ldp x12,x13,[x1,#8*6]
1374 ldp x21,x22,[sp,#8*2]
1375 stp x23,x24,[x2,#8*4]
1376 ldp x23,x24,[sp,#8*4]
1377 stp x25,x26,[x2,#8*6]
1378 ldp x25,x26,[sp,#8*6]
1379 add x1,x1,#8*8
1380 mov x30,xzr // initial top-most carry
1382 mov x27,#8
1385 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
1389 str x28,[x2],#8 // put aside t[0]*n0 for tail processing
1401 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
1424 ldp x14,x15,[x2,#8*0]
1425 ldp x16,x17,[x2,#8*2]
1430 ldp x14,x15,[x2,#8*4]
1433 ldp x16,x17,[x2,#8*6]
1441 ldur x4,[x2,#-8*8]
1442 ldp x6,x7,[x1,#8*0]
1443 ldp x8,x9,[x1,#8*2]
1444 ldp x10,x11,[x1,#8*4]
1445 mov x27,#-8*8
1446 ldp x12,x13,[x1,#8*6]
1447 add x1,x1,#8*8
1451 adc x28,xzr,xzr // carry bit, modulo-scheduled
1453 add x27,x27,#8
1473 str x19,[x2],#8
1491 ldp x6,x7,[x2,#8*0]
1494 ldp x8,x9,[x2,#8*2]
1495 ldp x10,x11,[x2,#8*4]
1496 ldp x12,x13,[x2,#8*6]
1499 ldur x4,[x0,#-8*8]
1502 ldp x6,x7,[x1,#8*0]
1505 ldp x8,x9,[x1,#8*2]
1508 ldp x10,x11,[x1,#8*4]
1510 mov x27,#-8*8
1512 ldp x12,x13,[x1,#8*6]
1513 add x1,x1,#8*8
1520 add x27,x2,#8*8 // end of current t[num] window
1522 subs xzr,x30,#1 // "move" top-most carry to carry bit
1525 ldp x19,x20,[x0,#8*0]
1527 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
1529 ldp x8,x9,[x16,#8*2]
1532 ldp x10,x11,[x16,#8*4]
1535 ldp x12,x13,[x16,#8*6]
1536 add x1,x16,#8*8
1537 adc x30,xzr,xzr // top-most carry
1539 stp x14,x15,[x2,#8*0]
1540 stp x21,x22,[x2,#8*2]
1541 ldp x21,x22,[x0,#8*2]
1542 stp x23,x24,[x2,#8*4]
1543 ldp x23,x24,[x0,#8*4]
1545 stp x25,x26,[x2,#8*6]
1547 ldp x25,x26,[x0,#8*6]
1548 mov x27,#8
1556 add x2,x2,#8*8
1559 sub x27,x5,#8*8
1564 ldp x6,x7,[x1,#8*0]
1566 stp x14,x15,[x0,#8*0]
1568 ldp x8,x9,[x1,#8*2]
1570 stp x16,x17,[x0,#8*2]
1572 ldp x10,x11,[x1,#8*4]
1574 ldp x12,x13,[x1,#8*6]
1575 add x1,x1,#8*8
1576 ldp x19,x20,[x2,#8*0]
1577 sub x27,x27,#8*8
1578 ldp x21,x22,[x2,#8*2]
1579 ldp x23,x24,[x2,#8*4]
1580 ldp x25,x26,[x2,#8*6]
1581 add x2,x2,#8*8
1582 stp x14,x15,[x0,#8*4]
1584 stp x16,x17,[x0,#8*6]
1585 add x0,x0,#8*8
1592 ldp x6,x7,[x3,#8*0]
1594 stp x14,x15,[x0,#8*0]
1596 ldp x8,x9,[x3,#8*2]
1598 stp x16,x17,[x0,#8*2]
1600 ldp x19,x20,[x1,#8*0]
1602 ldp x21,x22,[x1,#8*2]
1604 ldr x30,[x29,#8] // pull return address
1605 stp x14,x15,[x0,#8*4]
1606 stp x16,x17,[x0,#8*6]
1608 sub x27,x5,#8*4
1610 sub x27,x27,#8*4
1612 stp xzr,xzr,[x2,#8*0]
1614 ldp x6,x7,[x3,#8*4]
1615 ldp x19,x20,[x1,#8*4]
1617 stp xzr,xzr,[x2,#8*2]
1618 add x2,x2,#8*4
1620 ldp x8,x9,[x3,#8*6]
1621 ldp x21,x22,[x1,#8*6]
1622 add x1,x1,#8*4
1623 stp x14,x15,[x3,#8*0]
1624 stp x16,x17,[x3,#8*2]
1625 add x3,x3,#8*4
1626 stp xzr,xzr,[x1,#8*0]
1627 stp xzr,xzr,[x1,#8*2]
1631 stp xzr,xzr,[x2,#8*0]
1633 stp xzr,xzr,[x2,#8*2]
1636 stp x14,x15,[x3,#8*0]
1637 stp x16,x17,[x3,#8*2]
1644 ldr x30,[x29,#8] // pull return address
1645 // x19-7,x28 hold result, x6-7 hold modulus
1649 stp xzr,xzr,[sp,#8*0]
1651 stp xzr,xzr,[sp,#8*2]
1653 stp xzr,xzr,[sp,#8*4]
1655 stp xzr,xzr,[sp,#8*6]
1657 stp xzr,xzr,[sp,#8*8]
1659 stp xzr,xzr,[sp,#8*10]
1661 stp xzr,xzr,[sp,#8*12]
1663 stp xzr,xzr,[sp,#8*14]
1665 // x6-7 hold result-modulus
1670 stp x6,x7,[x1,#8*0]
1673 stp x8,x9,[x1,#8*2]
1676 stp x10,x11,[x1,#8*4]
1677 stp x12,x13,[x1,#8*6]
1691 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1697 stp x29,x30,[sp,#-128]!
1708 sub sp,x26,#8*4 // alloca
1714 ldr x24,[x2,#8*0] // b[0]
1715 ldp x6,x7,[x1,#8*0] // a[0..3]
1716 ldp x8,x9,[x1,#8*2]
1717 add x1,x1,#8*4
1722 ldp x14,x15,[x3,#8*0] // n[0..3]
1723 ldp x16,x17,[x3,#8*2]
1724 adds x3,x3,#8*4 // clear carry bit
1731 adc x0,x0,xzr // modulo-scheduled
1733 add x28,x28,#8
1750 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1778 ldp x6,x7,[x1,#8*0] // a[4..7]
1779 ldp x8,x9,[x1,#8*2]
1780 add x1,x1,#8*4
1782 ldp x14,x15,[x3,#8*0] // n[4..7]
1783 ldp x16,x17,[x3,#8*2]
1784 add x3,x3,#8*4
1788 adc x0,x0,xzr // modulo-scheduled
1790 add x28,x28,#8
1823 str x19,[x26],#8 // result!!!
1835 ldp x6,x7,[x1,#8*0]
1836 ldp x8,x9,[x1,#8*2]
1837 add x1,x1,#8*4
1838 ldp x14,x15,[x3,#8*0]
1839 ldp x16,x17,[x3,#8*2]
1840 add x3,x3,#8*4
1845 ldr x24,[x2,#8*4]! // *++b
1847 ldp x6,x7,[x11,#8*0] // a[0..3]
1849 ldp x8,x9,[x11,#8*2]
1850 add x1,x11,#8*4
1852 stp x19,x20,[x26,#8*0] // result!!!
1853 ldp x19,x20,[sp,#8*4] // t[0..3]
1854 stp x21,x22,[x26,#8*2] // result!!!
1855 ldp x21,x22,[sp,#8*6]
1857 ldp x14,x15,[x3,#8*0] // n[0..3]
1859 ldp x16,x17,[x3,#8*2]
1860 adds x3,x3,#8*4 // clear carry bit
1866 adc x0,x0,xzr // modulo-scheduled
1868 add x28,x28,#8
1885 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1911 ldp x10,x11,[x26,#8*4] // t[4..7]
1912 ldp x12,x13,[x26,#8*6]
1913 ldp x6,x7,[x1,#8*0] // a[4..7]
1914 ldp x8,x9,[x1,#8*2]
1915 add x1,x1,#8*4
1923 ldp x14,x15,[x3,#8*0] // n[4..7]
1924 ldp x16,x17,[x3,#8*2]
1925 add x3,x3,#8*4
1930 adc x0,x0,xzr // modulo-scheduled
1932 add x28,x28,#8
1965 str x19,[x26],#8 // result!!!
1978 ldp x10,x11,[x26,#8*4]
1979 ldp x12,x13,[x26,#8*6]
1980 ldp x6,x7,[x1,#8*0]
1981 ldp x8,x9,[x1,#8*2]
1982 add x1,x1,#8*4
1988 ldp x14,x15,[x3,#8*0]
1989 ldp x16,x17,[x3,#8*2]
1990 add x3,x3,#8*4
1997 add x2,x2,#8*4 // bp++
2001 stp x19,x20,[x26,#8*0] // result!!!
2003 ldp x19,x20,[sp,#8*4] // t[0..3]
2005 stp x21,x22,[x26,#8*2] // result!!!
2007 ldp x21,x22,[sp,#8*6]
2008 ldp x14,x15,[x11,#8*0] // n[0..3]
2009 ldp x16,x17,[x11,#8*2]
2010 add x3,x11,#8*4
2014 ldp x6,x7,[x1,#8*0] // a[0..3]
2015 ldp x8,x9,[x1,#8*2]
2016 adds x1,x1,#8*4 // clear carry bit
2030 add x26,sp,#8*8
2032 sub x28,x5,#8*4
2036 ldp x14,x15,[x3,#8*0]
2037 sub x28,x28,#8*4
2038 ldp x19,x20,[x26,#8*0]
2040 ldp x16,x17,[x3,#8*2]
2041 add x3,x3,#8*4
2042 ldp x21,x22,[x26,#8*2]
2043 add x26,x26,#8*4
2044 stp x10,x11,[x0,#8*0]
2046 stp x12,x13,[x0,#8*2]
2047 add x0,x0,#8*4
2053 add x1,sp,#8*4
2054 ldp x6,x7,[x27,#8*0]
2056 stp x10,x11,[x0,#8*0]
2057 ldp x8,x9,[x27,#8*2]
2058 stp x12,x13,[x0,#8*2]
2059 ldp x19,x20,[x1,#8*0]
2060 ldp x21,x22,[x1,#8*2]
2062 ldr x30,[x29,#8] // pull return address
2064 sub x28,x5,#8*4
2066 sub x28,x28,#8*4
2068 stp xzr,xzr,[x26,#8*0]
2070 ldp x6,x7,[x27,#8*4]
2071 ldp x19,x20,[x1,#8*4]
2073 stp xzr,xzr,[x26,#8*2]
2074 add x26,x26,#8*4
2076 ldp x8,x9,[x27,#8*6]
2077 ldp x21,x22,[x1,#8*6]
2078 add x1,x1,#8*4
2079 stp x10,x11,[x27,#8*0]
2080 stp x12,x13,[x27,#8*2]
2081 add x27,x27,#8*4
2085 stp xzr,xzr,[x26,#8*0]
2087 stp xzr,xzr,[x26,#8*2]
2089 stp xzr,xzr,[x26,#8*3]
2091 stp xzr,xzr,[x26,#8*4]
2092 stp x10,x11,[x27,#8*0]
2093 stp x12,x13,[x27,#8*2]
2101 // x19-3,x0 hold result, x14-7 hold modulus
2103 ldr x30,[x29,#8] // pull return address
2105 stp xzr,xzr,[sp,#8*0]
2107 stp xzr,xzr,[sp,#8*2]
2109 stp xzr,xzr,[sp,#8*4]
2111 stp xzr,xzr,[sp,#8*6]
2113 // x6-3 hold result-modulus
2118 stp x6,x7,[x1,#8*0]
2119 stp x8,x9,[x1,#8*2]
2133 .size __bn_mul4x_mont,.-__bn_mul4x_mont
2136 .align 2