Lines Matching +full:pre +full:- +full:multiply
2 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
4 # Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
5 # Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
6 # Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
35 # Skylake-X system performance. Since we are likely to suppress
36 # AVX512F capability flag [at least on Skylake-X], conversion serves
43 # IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
44 # P4 4.46/+120% -
45 # Core 2 2.41/+90% -
46 # Westmere 1.88/+120% -
49 # Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
50 # Silvermont 2.83/+95% -
52 # Goldmont 1.70/+180% -
53 # VIA Nano 1.82/+150% -
54 # Sledgehammer 1.38/+160% -
61 # (**) SSE2 implementation was attempted, but among non-AVX processors
62 # it was faster than integer-only code only on older Intel P4 and
63 # Core processors, by 50-30% (the newer the processor, the smaller the gain), but slower on
78 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
79 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
80 die "can't locate x86_64-xlate.pl";
85 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
86 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
91 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
97 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
101 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
139 $code .= ".size $name,.-$name\n";
196 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
197 # output: $h0-$h2 *= $r0-$r1
226 mov \$-4,%rax # mask value
277 bt \$`60-32`,%r9 # AVX?
364 dec %r15 # len-=16
385 .cfi_adjust_cfa_offset -48
405 shr \$2,%r10 # did 130-bit value overflow?
436 .type __poly1305_block,\@abi-omnipotent
445 .size __poly1305_block,.-__poly1305_block
447 .type __poly1305_init_avx,\@abi-omnipotent
467 mov %eax,`16*0+0-64`($ctx)
469 mov %edx,`16*0+4-64`($ctx)
476 mov %eax,`16*1+0-64`($ctx)
478 mov %edx,`16*1+4-64`($ctx)
480 mov %eax,`16*2+0-64`($ctx)
482 mov %edx,`16*2+4-64`($ctx)
493 mov %eax,`16*3+0-64`($ctx)
495 mov %edx,`16*3+4-64`($ctx)
497 mov %eax,`16*4+0-64`($ctx)
499 mov %edx,`16*4+4-64`($ctx)
508 mov %eax,`16*5+0-64`($ctx)
510 mov %edx,`16*5+4-64`($ctx)
512 mov %eax,`16*6+0-64`($ctx)
514 mov %edx,`16*6+4-64`($ctx)
520 mov $d1#d,`16*7+0-64`($ctx)
522 mov $d2#d,`16*7+4-64`($ctx)
524 mov $d1#d,`16*8+0-64`($ctx)
525 mov $d2#d,`16*8+4-64`($ctx)
534 mov %eax,`16*0+12-64`($ctx)
538 mov %edx,`16*1+12-64`($ctx)
541 mov %edx,`16*2+12-64`($ctx)
547 mov %eax,`16*3+12-64`($ctx)
550 mov %eax,`16*4+12-64`($ctx)
555 mov %edx,`16*5+12-64`($ctx)
558 mov %edx,`16*6+12-64`($ctx)
563 mov $d1#d,`16*7+12-64`($ctx)
565 mov $d1#d,`16*8+12-64`($ctx)
574 mov %eax,`16*0+8-64`($ctx)
578 mov %edx,`16*1+8-64`($ctx)
581 mov %edx,`16*2+8-64`($ctx)
587 mov %eax,`16*3+8-64`($ctx)
590 mov %eax,`16*4+8-64`($ctx)
595 mov %edx,`16*5+8-64`($ctx)
598 mov %edx,`16*6+8-64`($ctx)
603 mov $d1#d,`16*7+8-64`($ctx)
605 mov $d1#d,`16*8+8-64`($ctx)
607 lea -48-64($ctx),$ctx # size [de-]optimization
610 .size __poly1305_init_avx,.-__poly1305_init_avx
623 and \$-16,$len
658 ################################# base 2^26 -> base 2^64
660 and \$`-1*(1<<31)`,$d1
663 and \$`-1*(1<<31)`,$d2
679 mov \$-4,$d2 # ... so reduce
704 ################################# base 2^64 -> base 2^26
809 ################################# base 2^64 -> base 2^26
868 and \$-32,%rsp
869 sub \$-8,%rsp
870 lea -0x58(%rsp),%r11
874 lea -0xf8(%rsp),%r11
890 lea -32($inp),%rax
921 # expand and copy pre-calculated table to stack
922 vmovdqu `16*1-64`($ctx),$D1
923 vmovdqu `16*2-64`($ctx),$D2
924 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
925 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
926 vmovdqa $D3,-0x90(%r11)
929 vmovdqu `16*3-64`($ctx),$D0
931 vmovdqa $D4,-0x80(%r11)
934 vmovdqu `16*4-64`($ctx),$D1
936 vmovdqa $D3,-0x70(%r11)
939 vmovdqu `16*5-64`($ctx),$D2
941 vmovdqa $D4,-0x60(%r11)
944 vmovdqu `16*6-64`($ctx),$D0
946 vmovdqa $D3,-0x50(%r11)
949 vmovdqu `16*7-64`($ctx),$D1
951 vmovdqa $D4,-0x40(%r11)
954 vmovdqu `16*8-64`($ctx),$D2
956 vmovdqa $D3,-0x30(%r11)
960 vmovdqa $D4,-0x20(%r11)
965 vmovdqa $D3,-0x10(%r11)
1056 vmovdqa -0x90(%r11),$T4 # r0^4
1098 vmovdqa -0x80(%r11),$T2 # r1^4
1104 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1111 vmovdqa -0x60(%r11),$T3 # r2^4
1118 vmovdqa -0x50(%r11),$T4 # s2^4
1123 vmovdqa -0x40(%r11),$T2 # r3^4
1128 vmovdqa -0x30(%r11),$T3 # s3^4
1133 vmovdqa -0x10(%r11),$T4 # s4^4
1154 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1182 vpaddq $D3,$H4,$H4 # h3 -> h4
1186 vpaddq $D0,$D1,$H1 # h0 -> h1
1193 vpaddq $D1,$H2,$H2 # h1 -> h2
1197 vpaddq $D0,$H0,$H0 # h4 -> h0
1201 vpaddq $D2,$H3,$H3 # h2 -> h3
1205 vpaddq $D0,$H1,$H1 # h0 -> h1
1209 vpaddq $D3,$H4,$H4 # h3 -> h4
1215 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1217 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1242 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1249 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1252 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1260 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1265 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1270 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1278 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1281 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1319 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1327 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1335 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1343 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1346 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1354 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1359 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1364 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1372 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1375 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1412 vpaddq $H3,$D4,$D4 # h3 -> h4
1416 vpaddq $H0,$D1,$D1 # h0 -> h1
1423 vpaddq $H1,$D2,$D2 # h1 -> h2
1427 vpaddq $H4,$D0,$D0 # h4 -> h0
1431 vpaddq $H2,$D3,$D3 # h2 -> h3
1435 vpaddq $H0,$D1,$D1 # h0 -> h1
1439 vpaddq $H3,$D4,$D4 # h3 -> h4
1441 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1442 vmovd $D1,`4*1-48-64`($ctx)
1443 vmovd $D2,`4*2-48-64`($ctx)
1444 vmovd $D3,`4*3-48-64`($ctx)
1445 vmovd $D4,`4*4-48-64`($ctx)
1462 lea -8(%r10),%rsp
1483 shl \$26,%rcx # base 2^26 -> base 2^64
1503 and \$-4,%rcx
1514 shr \$2,%r10 # did 130-bit value overflow?
1545 and \$-16,$len
1580 ################################# base 2^26 -> base 2^64
1582 and \$`-1*(1<<31)`,$d1
1585 and \$`-1*(1<<31)`,$d2
1601 mov \$-4,$d2 # ... so reduce
1632 ################################# base 2^64 -> base 2^26
1742 ################################# base 2^64 -> base 2^26
1829 vmovdqa %xmm6,-0xb0(%r10)
1830 vmovdqa %xmm7,-0xa0(%r10)
1831 vmovdqa %xmm8,-0x90(%r10)
1832 vmovdqa %xmm9,-0x80(%r10)
1833 vmovdqa %xmm10,-0x70(%r10)
1834 vmovdqa %xmm11,-0x60(%r10)
1835 vmovdqa %xmm12,-0x50(%r10)
1836 vmovdqa %xmm13,-0x40(%r10)
1837 vmovdqa %xmm14,-0x30(%r10)
1838 vmovdqa %xmm15,-0x20(%r10)
1846 # expand and copy pre-calculated table to stack
1847 vmovdqu `16*0-64`($ctx),%x#$T2
1848 and \$-512,%rsp
1849 vmovdqu `16*1-64`($ctx),%x#$T3
1850 vmovdqu `16*2-64`($ctx),%x#$T4
1851 vmovdqu `16*3-64`($ctx),%x#$D0
1852 vmovdqu `16*4-64`($ctx),%x#$D1
1853 vmovdqu `16*5-64`($ctx),%x#$D2
1855 vmovdqu `16*6-64`($ctx),%x#$D3
1856 vpermd $T2,$T0,$T2 # 00003412 -> 14243444
1857 vmovdqu `16*7-64`($ctx),%x#$D4
1859 vmovdqu `16*8-64`($ctx),%x#$MASK
1863 vmovdqa $T3,0x20-0x90(%rax)
1865 vmovdqa $T4,0x40-0x90(%rax)
1867 vmovdqa $D0,0x60-0x90(%rax)
1869 vmovdqa $D1,0x80-0x90(%rax)
1871 vmovdqa $D2,0xa0-0x90(%rax)
1873 vmovdqa $D3,0xc0-0x90(%rax)
1874 vmovdqa $D4,0xe0-0x90(%rax)
1875 vmovdqa $MASK,0x100-0x90(%rax)
1924 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1925 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1956 vmovdqa `32*4-0x90`(%rax),$T1 # s2
1974 vmovdqa `32*5-0x90`(%rax),$H2 # r3
2000 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
2011 vpaddq $D3,$H4,$H4 # h3 -> h4
2015 vpaddq $D0,$D1,$H1 # h0 -> h1
2024 vpaddq $D1,$H2,$H2 # h1 -> h2
2028 vpaddq $D4,$H0,$H0 # h4 -> h0
2035 vpaddq $D2,$H3,$H3 # h2 -> h3
2037 vpaddq $T2,$H2,$H2 # modulo-scheduled
2042 vpaddq $D0,$H1,$H1 # h0 -> h1
2048 vpaddq $D3,$H4,$H4 # h3 -> h4
2062 # iteration we multiply least significant lane by r^4 and most
2074 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
2075 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
2095 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
2106 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2125 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2161 vpaddq $D3,$H4,$H4 # h3 -> h4
2165 vpaddq $D0,$D1,$H1 # h0 -> h1
2172 vpaddq $D1,$H2,$H2 # h1 -> h2
2176 vpaddq $D4,$H0,$H0 # h4 -> h0
2180 vpaddq $D2,$H3,$H3 # h2 -> h3
2184 vpaddq $D0,$H1,$H1 # h0 -> h1
2188 vpaddq $D3,$H4,$H4 # h3 -> h4
2190 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2191 vmovd %x#$H1,`4*1-48-64`($ctx)
2192 vmovd %x#$H2,`4*2-48-64`($ctx)
2193 vmovd %x#$H3,`4*3-48-64`($ctx)
2194 vmovd %x#$H4,`4*4-48-64`($ctx)
2197 vmovdqa -0xb0(%r10),%xmm6
2198 vmovdqa -0xa0(%r10),%xmm7
2199 vmovdqa -0x90(%r10),%xmm8
2200 vmovdqa -0x80(%r10),%xmm9
2201 vmovdqa -0x70(%r10),%xmm10
2202 vmovdqa -0x60(%r10),%xmm11
2203 vmovdqa -0x50(%r10),%xmm12
2204 vmovdqa -0x40(%r10),%xmm13
2205 vmovdqa -0x30(%r10),%xmm14
2206 vmovdqa -0x20(%r10),%xmm15
2207 lea -8(%r10),%rsp
2211 lea -8(%r10),%rsp
2243 vmovdqa %xmm6,-0xb0(%r10)
2244 vmovdqa %xmm7,-0xa0(%r10)
2245 vmovdqa %xmm8,-0x90(%r10)
2246 vmovdqa %xmm9,-0x80(%r10)
2247 vmovdqa %xmm10,-0x70(%r10)
2248 vmovdqa %xmm11,-0x60(%r10)
2249 vmovdqa %xmm12,-0x50(%r10)
2250 vmovdqa %xmm13,-0x40(%r10)
2251 vmovdqa %xmm14,-0x30(%r10)
2252 vmovdqa %xmm15,-0x20(%r10)
2260 # expand pre-calculated table
2261 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
2262 and \$-512,%rsp
2263 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2265 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2266 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2267 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2268 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2269 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2270 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2271 vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2272 vpermd $D0,$T2,$R0 # 00003412 -> 14243444
2278 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2367 vpaddq $M3,$D4,$D4 # d3 -> d4
2371 vpaddq $M0,$D1,$D1 # d0 -> d1
2378 vpaddq $M1,$D2,$D2 # d1 -> d2
2382 vpaddq $M4,$D0,$D0 # d4 -> d0
2386 vpaddq $M2,$D3,$D3 # d2 -> d3
2390 vpaddq $M0,$D1,$D1 # d0 -> d1
2394 vpaddq $M3,$D4,$D4 # d3 -> d4
2397 # at this point we have 14243444 in $R0-$S4 and 05060708 in
2398 # $D0-$D4, ...
2403 # ... since input 64-bit lanes are ordered as 73625140, we could
2405 # we could just flow along, hence the goal for $R0-$S4 is
2412 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2418 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2554 vpaddq $H3,$D4,$H4 # h3 -> h4
2560 vpaddq $D0,$H1,$H1 # h0 -> h1
2569 vpaddq $D1,$H2,$H2 # h1 -> h2
2573 vpaddq $D4,$H0,$H0 # h4 -> h0
2575 vpaddq $T2,$H2,$H2 # modulo-scheduled
2580 vpaddq $D2,$D3,$H3 # h2 -> h3
2586 vpaddq $D0,$H1,$H1 # h0 -> h1
2592 vpaddq $D3,$H4,$H4 # h3 -> h4
2605 # iteration we multiply least significant lane by r^8 and most
2735 vpaddq $D3,$H4,$H4 # h3 -> h4
2741 vpaddq $D0,$H1,$H1 # h0 -> h1
2750 vpaddq $D1,$H2,$H2 # h1 -> h2
2756 vpaddq $D4,$H0,$H0 # h4 -> h0
2762 vpaddq $D2,$H3,$H3 # h2 -> h3
2768 vpaddq $D0,$H1,$H1 # h0 -> h1
2774 vpaddq $D3,$H4,$H4 # h3 -> h4
2781 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2782 vmovd %x#$H1,`4*1-48-64`($ctx)
2783 vmovd %x#$H2,`4*2-48-64`($ctx)
2784 vmovd %x#$H3,`4*3-48-64`($ctx)
2785 vmovd %x#$H4,`4*4-48-64`($ctx)
2789 movdqa -0xb0(%r10),%xmm6
2790 movdqa -0xa0(%r10),%xmm7
2791 movdqa -0x90(%r10),%xmm8
2792 movdqa -0x80(%r10),%xmm9
2793 movdqa -0x70(%r10),%xmm10
2794 movdqa -0x60(%r10),%xmm11
2795 movdqa -0x50(%r10),%xmm12
2796 movdqa -0x40(%r10),%xmm13
2797 movdqa -0x30(%r10),%xmm14
2798 movdqa -0x20(%r10),%xmm15
2799 lea -8(%r10),%rsp
2803 lea -8(%r10),%rsp
2838 # at amount of multiply-n-accumulate operations. Secondly, it makes it
2839 # impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2888 movq \$-1,64($ctx) # write impossible value
2901 .size poly1305_init_base2_44,.-poly1305_init_base2_44
2919 # blocks with this single-block subroutine, otherwise ensure that
2958 vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2961 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2992 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
3008 dec %rax # len-=16
3018 .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
3025 # and is handled in 256-bit %ymm registers.
3080 # at this point 64-bit lanes are ordered as 3-1-2-0
3225 vpsrldq \$8,$R0,$R0 # 0-1-0-2
3250 # at this point 64-bit lanes are ordered as x-1-x-0
3341 sub \$4,$len # len-=64
3449 sub \$2,$len # len-=32
3459 .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3497 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3569 # At this point Rx holds 1324 powers, RRx - 5768, and the goal
3614 # at this point 64-bit lanes are ordered as 73625140
3706 sub \$8,$len # len-=128
3832 .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3859 shr \$2,%r10 # did 130-bit value overflow?
3869 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
3875 { # chacha20-poly1305 helpers
3880 .type xor128_encrypt_n_pad,\@abi-omnipotent
3924 .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3927 .type xor128_decrypt_n_pad,\@abi-omnipotent
3975 .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3989 .type se_handler,\@abi-omnipotent
4003 mov 120($context),%rax # pull context->Rax
4004 mov 248($context),%rbx # pull context->Rip
4006 mov 8($disp),%rsi # disp->ImageBase
4007 mov 56($disp),%r11 # disp->HandlerData
4011 cmp %r10,%rbx # context->Rip<.Lprologue
4014 mov 152($context),%rax # pull context->Rsp
4018 cmp %r10,%rbx # context->Rip>=.Lepilogue
4023 mov -8(%rax),%rbx
4024 mov -16(%rax),%rbp
4025 mov -24(%rax),%r12
4026 mov -32(%rax),%r13
4027 mov -40(%rax),%r14
4028 mov -48(%rax),%r15
4029 mov %rbx,144($context) # restore context->Rbx
4030 mov %rbp,160($context) # restore context->Rbp
4031 mov %r12,216($context) # restore context->R12
4032 mov %r13,224($context) # restore context->R13
4033 mov %r14,232($context) # restore context->R14
4034 mov %r15,240($context) # restore context->R15
4037 .size se_handler,.-se_handler
4039 .type avx_handler,\@abi-omnipotent
4053 mov 120($context),%rax # pull context->Rax
4054 mov 248($context),%rbx # pull context->Rip
4056 mov 8($disp),%rsi # disp->ImageBase
4057 mov 56($disp),%r11 # disp->HandlerData
4061 cmp %r10,%rbx # context->Rip<prologue label
4064 mov 152($context),%rax # pull context->Rsp
4068 cmp %r10,%rbx # context->Rip>=epilogue label
4071 mov 208($context),%rax # pull context->R11
4082 mov %rax,152($context) # restore context->Rsp
4083 mov %rsi,168($context) # restore context->Rsi
4084 mov %rdi,176($context) # restore context->Rdi
4086 mov 40($disp),%rdi # disp->ContextRecord
4093 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4094 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4095 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4096 mov 40(%rsi),%r10 # disp->ContextRecord
4097 lea 56(%rsi),%r11 # &disp->HandlerData
4098 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4117 .size avx_handler,.-avx_handler
4241 s/%r([a-z]+)#d/%e$1/g;
4242 s/%r([0-9]+)#d/%r$1d/g;
4246 s/(^\.type.*),[0-9]+$/\1/;
4247 s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;