Lines Matching +full:- +full:16 +full:g

1 /* Do not modify. This file is auto-generated from sha512-armv8.pl. */
2 // Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
23 // SHA256-hw SHA256(*) SHA512
24 // Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
25 // Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
26 // Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
28 // X-Gene 20.0 (+100%) 12.8 (+300%(***))
35 // (**) The result is a trade-off: it's possible to improve it by
37 // on Cortex-A53 (or by 4 cycles per round).
38 // (***) Super-impressive coefficients over gcc-generated code are
40 // generated with -mgeneral-regs-only is significantly faster
41 // and the gap is only 40-90%.
46 // version of SHA256 for 64-bit processors. This is because performance
47 // improvement on most widespread Cortex-A5x processors was observed
48 // to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49 // observed that 32-bit NEON SHA256 performs significantly better than
50 // 64-bit scalar version on *some* of the more recent processors. As
51 // a result, a 64-bit NEON version of SHA256 was added to provide the best
52 // all-round performance. For example it executes ~30% faster on X-Gene
54 // deliver much less improvement, likely *negative* on Cortex-A5x.
81 stp x29,x30,[sp,#-128]!
84 stp x19,x20,[sp,#16]
113 orr w17,w17,w19 // Ch(e,f,g)
117 add w27,w27,w17 // h+=Ch(e,f,g)
138 orr w17,w17,w28 // Ch(e,f,g)
142 add w26,w26,w17 // h+=Ch(e,f,g)
162 orr w17,w17,w19 // Ch(e,f,g)
166 add w25,w25,w17 // h+=Ch(e,f,g)
187 orr w17,w17,w28 // Ch(e,f,g)
191 add w24,w24,w17 // h+=Ch(e,f,g)
211 orr w17,w17,w19 // Ch(e,f,g)
215 add w23,w23,w17 // h+=Ch(e,f,g)
236 orr w17,w17,w28 // Ch(e,f,g)
240 add w22,w22,w17 // h+=Ch(e,f,g)
260 orr w17,w17,w19 // Ch(e,f,g)
264 add w21,w21,w17 // h+=Ch(e,f,g)
285 orr w17,w17,w28 // Ch(e,f,g)
289 add w20,w20,w17 // h+=Ch(e,f,g)
309 orr w17,w17,w19 // Ch(e,f,g)
313 add w27,w27,w17 // h+=Ch(e,f,g)
334 orr w17,w17,w28 // Ch(e,f,g)
338 add w26,w26,w17 // h+=Ch(e,f,g)
358 orr w17,w17,w19 // Ch(e,f,g)
362 add w25,w25,w17 // h+=Ch(e,f,g)
384 orr w17,w17,w28 // Ch(e,f,g)
388 add w24,w24,w17 // h+=Ch(e,f,g)
409 orr w17,w17,w19 // Ch(e,f,g)
413 add w23,w23,w17 // h+=Ch(e,f,g)
435 orr w17,w17,w28 // Ch(e,f,g)
439 add w22,w22,w17 // h+=Ch(e,f,g)
461 orr w17,w17,w19 // Ch(e,f,g)
465 add w21,w21,w17 // h+=Ch(e,f,g)
491 orr w17,w17,w28 // Ch(e,f,g)
495 add w20,w20,w17 // h+=Ch(e,f,g)
523 orr w17,w17,w19 // Ch(e,f,g)
527 add w27,w27,w17 // h+=Ch(e,f,g)
554 orr w17,w17,w28 // Ch(e,f,g)
558 add w26,w26,w17 // h+=Ch(e,f,g)
585 orr w17,w17,w19 // Ch(e,f,g)
589 add w25,w25,w17 // h+=Ch(e,f,g)
616 orr w17,w17,w28 // Ch(e,f,g)
620 add w24,w24,w17 // h+=Ch(e,f,g)
647 orr w17,w17,w19 // Ch(e,f,g)
651 add w23,w23,w17 // h+=Ch(e,f,g)
678 orr w17,w17,w28 // Ch(e,f,g)
682 add w22,w22,w17 // h+=Ch(e,f,g)
709 orr w17,w17,w19 // Ch(e,f,g)
713 add w21,w21,w17 // h+=Ch(e,f,g)
740 orr w17,w17,w28 // Ch(e,f,g)
744 add w20,w20,w17 // h+=Ch(e,f,g)
771 orr w17,w17,w19 // Ch(e,f,g)
775 add w27,w27,w17 // h+=Ch(e,f,g)
802 orr w17,w17,w28 // Ch(e,f,g)
806 add w26,w26,w17 // h+=Ch(e,f,g)
833 orr w17,w17,w19 // Ch(e,f,g)
837 add w25,w25,w17 // h+=Ch(e,f,g)
864 orr w17,w17,w28 // Ch(e,f,g)
868 add w24,w24,w17 // h+=Ch(e,f,g)
895 orr w17,w17,w19 // Ch(e,f,g)
899 add w23,w23,w17 // h+=Ch(e,f,g)
926 orr w17,w17,w28 // Ch(e,f,g)
930 add w22,w22,w17 // h+=Ch(e,f,g)
957 orr w17,w17,w19 // Ch(e,f,g)
961 add w21,w21,w17 // h+=Ch(e,f,g)
988 orr w17,w17,w28 // Ch(e,f,g)
992 add w20,w20,w17 // h+=Ch(e,f,g)
1033 ldp x19,x20,[x29,#16]
1042 .size sha256_block_data_order,.-sha256_block_data_order
1064 .size .LK256,.-.LK256
1073 // Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
1074 stp x29,x30,[sp,#-16]!
1081 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1083 ld1 {v16.4s},[x3],#16
1084 rev32 v4.16b,v4.16b
1085 rev32 v5.16b,v5.16b
1086 rev32 v6.16b,v6.16b
1087 rev32 v7.16b,v7.16b
1088 orr v18.16b,v0.16b,v0.16b // offload
1089 orr v19.16b,v1.16b,v1.16b
1090 ld1 {v17.4s},[x3],#16
1092 .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
1093 orr v2.16b,v0.16b,v0.16b
1094 .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1095 .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1096 .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
1097 ld1 {v16.4s},[x3],#16
1099 .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
1100 orr v2.16b,v0.16b,v0.16b
1101 .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1102 .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1103 .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
1104 ld1 {v17.4s},[x3],#16
1106 .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
1107 orr v2.16b,v0.16b,v0.16b
1108 .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1109 .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1110 .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
1111 ld1 {v16.4s},[x3],#16
1113 .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
1114 orr v2.16b,v0.16b,v0.16b
1115 .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1116 .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1117 .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
1118 ld1 {v17.4s},[x3],#16
1120 .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
1121 orr v2.16b,v0.16b,v0.16b
1122 .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1123 .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1124 .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
1125 ld1 {v16.4s},[x3],#16
1127 .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
1128 orr v2.16b,v0.16b,v0.16b
1129 .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1130 .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1131 .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
1132 ld1 {v17.4s},[x3],#16
1134 .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
1135 orr v2.16b,v0.16b,v0.16b
1136 .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1137 .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1138 .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
1139 ld1 {v16.4s},[x3],#16
1141 .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
1142 orr v2.16b,v0.16b,v0.16b
1143 .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1144 .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1145 .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
1146 ld1 {v17.4s},[x3],#16
1148 .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
1149 orr v2.16b,v0.16b,v0.16b
1150 .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1151 .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1152 .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
1153 ld1 {v16.4s},[x3],#16
1155 .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
1156 orr v2.16b,v0.16b,v0.16b
1157 .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1158 .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1159 .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
1160 ld1 {v17.4s},[x3],#16
1162 .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
1163 orr v2.16b,v0.16b,v0.16b
1164 .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1165 .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1166 .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
1167 ld1 {v16.4s},[x3],#16
1169 .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
1170 orr v2.16b,v0.16b,v0.16b
1171 .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1172 .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1173 .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
1174 ld1 {v17.4s},[x3],#16
1176 orr v2.16b,v0.16b,v0.16b
1177 .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1178 .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1180 ld1 {v16.4s},[x3],#16
1182 orr v2.16b,v0.16b,v0.16b
1183 .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1184 .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1188 sub x3,x3,#64*4-16 // rewind
1189 orr v2.16b,v0.16b,v0.16b
1190 .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
1191 .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
1194 orr v2.16b,v0.16b,v0.16b
1195 .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
1196 .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
1205 ldr x29,[sp],#16
1207 .size sha256_block_armv8,.-sha256_block_armv8
1217 // Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
1218 stp x29, x30, [sp, #-16]!
1220 sub sp,sp,#16*4
1225 ld1 {v0.16b},[x1], #16
1226 ld1 {v1.16b},[x1], #16
1227 ld1 {v2.16b},[x1], #16
1228 ld1 {v3.16b},[x1], #16
1229 ld1 {v4.4s},[x16], #16
1230 ld1 {v5.4s},[x16], #16
1231 ld1 {v6.4s},[x16], #16
1232 ld1 {v7.4s},[x16], #16
1233 rev32 v0.16b,v0.16b // yes, even on
1234 rev32 v1.16b,v1.16b // big-endian
1235 rev32 v2.16b,v2.16b
1236 rev32 v3.16b,v3.16b
1248 ldp w7,w8,[x0,#16]
1258 ext v4.16b,v0.16b,v1.16b,#4
1263 ext v7.16b,v2.16b,v3.16b,#4
1282 eor v5.16b,v5.16b,v6.16b
1291 eor v5.16b,v5.16b,v7.16b
1310 eor v17.16b,v17.16b,v16.16b
1314 eor v17.16b,v17.16b,v7.16b
1332 eor v19.16b,v19.16b,v18.16b
1339 ld1 {v4.4s},[x16], #16
1341 eor v19.16b,v19.16b,v17.16b
1343 eor v17.16b,v17.16b,v17.16b
1361 ldr w12,[sp,#16]
1366 st1 {v4.4s},[x17], #16
1367 ext v4.16b,v1.16b,v2.16b,#4
1372 ext v7.16b,v3.16b,v0.16b,#4
1391 eor v5.16b,v5.16b,v6.16b
1400 eor v5.16b,v5.16b,v7.16b
1419 eor v17.16b,v17.16b,v16.16b
1423 eor v17.16b,v17.16b,v7.16b
1441 eor v19.16b,v19.16b,v18.16b
1448 ld1 {v4.4s},[x16], #16
1450 eor v19.16b,v19.16b,v17.16b
1452 eor v17.16b,v17.16b,v17.16b
1475 st1 {v4.4s},[x17], #16
1476 ext v4.16b,v2.16b,v3.16b,#4
1481 ext v7.16b,v0.16b,v1.16b,#4
1500 eor v5.16b,v5.16b,v6.16b
1509 eor v5.16b,v5.16b,v7.16b
1528 eor v17.16b,v17.16b,v16.16b
1532 eor v17.16b,v17.16b,v7.16b
1550 eor v19.16b,v19.16b,v18.16b
1557 ld1 {v4.4s},[x16], #16
1559 eor v19.16b,v19.16b,v17.16b
1561 eor v17.16b,v17.16b,v17.16b
1584 st1 {v4.4s},[x17], #16
1585 ext v4.16b,v3.16b,v0.16b,#4
1590 ext v7.16b,v1.16b,v2.16b,#4
1609 eor v5.16b,v5.16b,v6.16b
1618 eor v5.16b,v5.16b,v7.16b
1637 eor v17.16b,v17.16b,v16.16b
1641 eor v17.16b,v17.16b,v7.16b
1659 eor v19.16b,v19.16b,v18.16b
1666 ld1 {v4.4s},[x16], #16
1668 eor v19.16b,v19.16b,v17.16b
1670 eor v17.16b,v17.16b,v17.16b
1693 st1 {v4.4s},[x17], #16
1708 ld1 {v0.16b},[x1],#16
1711 ld1 {v4.4s},[x16],#16
1716 rev32 v0.16b,v0.16b
1780 ldr w12,[sp,#16]
1785 st1 {v4.4s},[x17], #16
1789 ld1 {v1.16b},[x1],#16
1792 ld1 {v4.4s},[x16],#16
1797 rev32 v1.16b,v1.16b
1866 st1 {v4.4s},[x17], #16
1870 ld1 {v2.16b},[x1],#16
1873 ld1 {v4.4s},[x16],#16
1878 rev32 v2.16b,v2.16b
1947 st1 {v4.4s},[x17], #16
1951 ld1 {v3.16b},[x1],#16
1954 ld1 {v4.4s},[x16],#16
1959 rev32 v3.16b,v3.16b
2027 st1 {v4.4s},[x17], #16
2034 ldp w11,w12,[x0,#16]
2046 stp w7,w8,[x0,#16]
2054 add sp,sp,#16*4+16
2056 .size sha256_block_neon,.-sha256_block_neon