/*-
 * Copyright (c) 2024, 2025 Robert Clausecker <fuz@FreeBSD.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <machine/asm.h>

/* apply the round keys to the four round functions */
.macro	allrounds rfn0, rfn1, rfn2, rfn3
	\rfn0	 0, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
	\rfn0	 4, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
	\rfn0	 8, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
	\rfn0	12, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821

	\rfn1	16, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
	\rfn1	20, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
	\rfn1	24, 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
	\rfn1	28, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a

	\rfn2	32, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
	\rfn2	36, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
	\rfn2	40, 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
	\rfn2	44, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665

	\rfn3	48, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
	\rfn3	52, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
	\rfn3	56, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
	\rfn3	60, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
.endm

	// md5block(MD5_CTX, buf, len)
ENTRY(_libmd_md5block_baseline)
.macro	round a, b, c, d, f, k, m, s
	\f	%ebp, \b, \c, \d
	add	$\k, \a			// a + k[i]
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	add	%ebp, \a		// a + k[i] + m[g] + f
	rol	$\s, \a
	add	\b, \a
.endm

	// f = b ? c : d
.macro	f0 f, b, c, d
	mov	\c, \f
	xor	\d, \f
	and	\b, \f
	xor	\d, \f
.endm

	// f = d ? b : c
.macro	f1 f, b, c, d
	mov	\c, \f
	xor	\b, \f
	and	\d, \f
	xor	\c, \f
.endm

	// f = b ^ c ^ d
.macro	f2 f, b, c, d
	mov	\c, \f
	xor	\d, \f
	xor	\b, \f
.endm

	// f = c ^ (b | ~d)
.macro	f3 f, b, c, d
	mov	$-1, \f
	xor	\d, \f
	or	\b, \f
	xor	\c, \f
.endm

	// do 4 rounds
.macro	rounds f, p, q, s0, s1, s2, s3, k0, k1, k2, k3
	round	%eax, %ebx, %ecx, %edx, \f, \k0, \p*0+\q, \s0
	round	%edx, %eax, %ebx, %ecx, \f, \k1, \p*1+\q, \s1
	round	%ecx, %edx, %eax, %ebx, \f, \k2, \p*2+\q, \s2
	round	%ebx, %ecx, %edx, %eax, \f, \k3, \p*3+\q, \s3
.endm

	// do 4 rounds with f0, f1, f2, f3
.macro	rounds0 i, k0, k1, k2, k3
	rounds	f0, 1, \i, 7, 12, 17, 22, \k0, \k1, \k2, \k3
.endm

.macro	rounds1 i, k0, k1, k2, k3
	rounds	f1, 5, 5*\i+1, 5, 9, 14, 20, \k0, \k1, \k2, \k3
.endm

.macro	rounds2 i, k0, k1, k2, k3
	rounds	f2, 3, 3*\i+5, 4, 11, 16, 23, \k0, \k1, \k2, \k3
.endm

.macro	rounds3 i, k0, k1, k2, k3
	rounds	f3, 7, 7*\i, 6, 10, 15, 21, \k0, \k1, \k2, \k3
.endm
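
	/*
	 * Reference sketch (C-like, illustrative only, not built): each
	 * expansion of the round macro performs one scalar MD5 step,
	 *
	 *	f = fN(b, c, d);		// one of f0..f3 above
	 *	a += k[i] + m[g] + f;		// g = (p*n + q) % 16
	 *	a = a << s | a >> (32 - s);	// rol $s
	 *	a += b;
	 *
	 * with the roles of a, b, c, d rotating from one step to the next.
	 */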

	push	%rbx
	push	%rbp
	push	%r12

	and	$~63, %rdx		// length in blocks
	lea	(%rsi, %rdx, 1), %r12	// end pointer

	mov	(%rdi), %eax		// a
	mov	4(%rdi), %ebx		// b
	mov	8(%rdi), %ecx		// c
	mov	12(%rdi), %edx		// d

	cmp	%rsi, %r12		// any data to process?
	je	.Lend

	.balign	16
.Lloop:	mov	%eax, %r8d
	mov	%ebx, %r9d
	mov	%ecx, %r10d
	mov	%edx, %r11d

	allrounds rounds0, rounds1, rounds2, rounds3

	add	%r8d, %eax
	add	%r9d, %ebx
	add	%r10d, %ecx
	add	%r11d, %edx

	add	$64, %rsi
	cmp	%rsi, %r12
	jne	.Lloop

	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)

.Lend:	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
END(_libmd_md5block_baseline)

	/*
	 * An implementation leveraging the ANDN instruction
	 * from BMI1 to shorten some dependency chains.
	 */
ENTRY(_libmd_md5block_bmi1)
	// special-cased round 1
	// f1 = d ? b : c = (d & b) + (~d & c)
.macro	round1 a, b, c, d, k, m, s
	andn	\c, \d, %edi		// ~d & c
	add	$\k, \a			// a + k[i]
	mov	\d, %ebp
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	and	\b, %ebp		// d & b
	add	%edi, \a		// a + k[i] + m[g] + (~d & c)
	add	%ebp, \a		// a + k[i] + m[g] + (~d & c) + (d & b)
	rol	$\s, \a
	add	\b, \a
.endm

	// special-cased round 3
	// f3 = c ^ (b | ~d) = ~(c ^ ~b & d) = -1 - (c ^ ~b & d)
.macro	round3 a, b, c, d, k, m, s
	andn	\d, \b, %ebp		// ~b & d
	add	$\k - 1, \a		// a + k[i] - 1
	add	((\m)%16*4)(%rsi), \a	// a + k[i] + m[g]
	xor	\c, %ebp		// c ^ (~b & d)
	sub	%ebp, \a		// a + k[i] + m[g] + f
	rol	$\s, \a
	add	\b, \a
.endm

	.purgem	rounds1
.macro	rounds1 i, k0, k1, k2, k3
	round1	%eax, %ebx, %ecx, %edx, \k0, 5*\i+ 1, 5
	round1	%edx, %eax, %ebx, %ecx, \k1, 5*\i+ 6, 9
	round1	%ecx, %edx, %eax, %ebx, \k2, 5*\i+11, 14
	round1	%ebx, %ecx, %edx, %eax, \k3, 5*\i+16, 20
.endm

	.purgem	rounds3
.macro	rounds3 i, k0, k1, k2, k3
	round3	%eax, %ebx, %ecx, %edx, \k0, 7*\i+ 0, 6
	round3	%edx, %eax, %ebx, %ecx, \k1, 7*\i+ 7, 10
	round3	%ecx, %edx, %eax, %ebx, \k2, 7*\i+14, 15
	round3	%ebx, %ecx, %edx, %eax, \k3, 7*\i+21, 21
.endm

	push	%rbx
	push	%rbp
	push	%r12

	and	$~63, %rdx		// length in blocks
	lea	(%rsi, %rdx, 1), %r12	// end pointer

	mov	(%rdi), %eax		// a
	mov	4(%rdi), %ebx		// b
	mov	8(%rdi), %ecx		// c
	mov	12(%rdi), %edx		// d

	cmp	%rsi, %r12		// any data to process?
	je	0f

	push	%rdi

	.balign	16
1:	mov	%eax, %r8d
	mov	%ebx, %r9d
	mov	%ecx, %r10d
	mov	%edx, %r11d

	allrounds rounds0, rounds1, rounds2, rounds3

	add	%r8d, %eax
	add	%r9d, %ebx
	add	%r10d, %ecx
	add	%r11d, %edx

	add	$64, %rsi
	cmp	%rsi, %r12
	jne	1b

	pop	%rdi
	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)

0:	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
END(_libmd_md5block_bmi1)
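
	/*
	 * Reasoning sketch for the special-cased rounds above: the two
	 * terms of f1 = (d & b) | (~d & c) are masked by d and ~d and so
	 * never share set bits, letting ADD stand in for OR; and
	 * f3 = c ^ (b | ~d) = ~(c ^ (~b & d)) = -1 - (c ^ (~b & d)),
	 * so round3 folds the -1 into the round constant and subtracts
	 * the remaining ANDN/XOR result.
	 */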

#ifndef _KERNEL
	/*
	 * An implementation leveraging AVX-512 for its VPTERNLOGD
	 * instruction. We're using only XMM registers here,
	 * avoiding costly thermal licensing.
	 */
ENTRY(_libmd_md5block_avx512)
.macro	vround a, b, c, d, f, i, m, mi, s
	vmovdqa	\b, %xmm4
	vpternlogd $\f, \d, \c, %xmm4	// f(b, c, d)
	vpaddd	4*(\i)(%rax){1to4}, \m, %xmm5 // m[g] + k[i]
.if \mi != 0
	vpshufd	$0x55 * \mi, %xmm5, %xmm5 // broadcast to each dword
.endif
	vpaddd	%xmm5, \a, \a		// a + k[i] + m[g]
	vpaddd	%xmm4, \a, \a		// a + k[i] + m[g] + f
	vprold	$\s, \a, \a
	vpaddd	\b, \a, \a
.endm

.macro	vrounds f, i, m0, i0, m1, i1, m2, i2, m3, i3, s0, s1, s2, s3
	vround	%xmm0, %xmm1, %xmm2, %xmm3, \f, \i+0, \m0, \i0, \s0
	vround	%xmm3, %xmm0, %xmm1, %xmm2, \f, \i+1, \m1, \i1, \s1
	vround	%xmm2, %xmm3, %xmm0, %xmm1, \f, \i+2, \m2, \i2, \s2
	vround	%xmm1, %xmm2, %xmm3, %xmm0, \f, \i+3, \m3, \i3, \s3
.endm

/*
 * Truth table of the four round functions.  VPTERNLOGD selects imm8
 * bit b*4 + c*2 + d here (the destination register holds b, the
 * sources c and d), so reading each column from the bottom row up
 * gives the immediates 0xca, 0xe4, 0x96, and 0x39 used below.
 *
 * d c b	f0	f1	f2	f3
 * 0 0 0	0	0	0	1
 * 1 0 0	1	0	1	0
 * 0 1 0	0	1	1	0
 * 1 1 0	1	0	0	1
 * 0 0 1	0	0	1	1
 * 1 0 1	0	1	0	1
 * 0 1 1	1	1	0	0
 * 1 1 1	1	1	1	0
 */

.macro	vrounds0 i, m
	vrounds	0xca, \i, \m, 0, \m, 1, \m, 2, \m, 3, 7, 12, 17, 22
.endm

.macro	vrounds1 i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds	0xe4, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 5, 9, 14, 20
.endm

.macro	vrounds2 i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds	0x96, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 4, 11, 16, 23
.endm

.macro	vrounds3 i, m0, i0, m1, i1, m2, i2, m3, i3
	vrounds	0x39, \i, \m0, \i0, \m1, \i1, \m2, \i2, \m3, \i3, 6, 10, 15, 21
.endm

	and	$~63, %rdx		// length in blocks
	add	%rsi, %rdx		// end pointer

	vmovd	(%rdi), %xmm0		// a
	vmovd	4(%rdi), %xmm1		// b
	vmovd	8(%rdi), %xmm2		// c
	vmovd	12(%rdi), %xmm3		// d

	lea	keys(%rip), %rax

	cmp	%rsi, %rdx		// any data to process?
	je	0f

	.balign	16
1:	vmovdqu	0*4(%rsi), %xmm8	// message words
	vmovdqu	4*4(%rsi), %xmm9
	vmovdqu	8*4(%rsi), %xmm10
	vmovdqu	12*4(%rsi), %xmm11

	vmovdqa	%xmm0, %xmm12		// stash old state variables
	vmovdqa	%xmm1, %xmm13
	vmovdqa	%xmm2, %xmm14
	vmovdqa	%xmm3, %xmm15

	vrounds0 0, %xmm8
	vrounds0 4, %xmm9
	vrounds0 8, %xmm10
	vrounds0 12, %xmm11

	vrounds1 16, %xmm8, 1, %xmm9, 2, %xmm10, 3, %xmm8, 0
	vrounds1 20, %xmm9, 1, %xmm10, 2, %xmm11, 3, %xmm9, 0
	vrounds1 24, %xmm10, 1, %xmm11, 2, %xmm8, 3, %xmm10, 0
	vrounds1 28, %xmm11, 1, %xmm8, 2, %xmm9, 3, %xmm11, 0

	vrounds2 32, %xmm9, 1, %xmm10, 0, %xmm10, 3, %xmm11, 2
	vrounds2 36, %xmm8, 1, %xmm9, 0, %xmm9, 3, %xmm10, 2
	vrounds2 40, %xmm11, 1, %xmm8, 0, %xmm8, 3, %xmm9, 2
	vrounds2 44, %xmm10, 1, %xmm11, 0, %xmm11, 3, %xmm8, 2

	vrounds3 48, %xmm8, 0, %xmm9, 3, %xmm11, 2, %xmm9, 1
	vrounds3 52, %xmm11, 0, %xmm8, 3, %xmm10, 2, %xmm8, 1
	vrounds3 56, %xmm10, 0, %xmm11, 3, %xmm9, 2, %xmm11, 1
	vrounds3 60, %xmm9, 0, %xmm10, 3, %xmm8, 2, %xmm10, 1

	vpaddd	%xmm12, %xmm0, %xmm0
	vpaddd	%xmm13, %xmm1, %xmm1
	vpaddd	%xmm14, %xmm2, %xmm2
	vpaddd	%xmm15, %xmm3, %xmm3

	add	$64, %rsi
	cmp	%rsi, %rdx
	jne	1b

	vmovd	%xmm0, (%rdi)
	vmovd	%xmm1, 4(%rdi)
	vmovd	%xmm2, 8(%rdi)
	vmovd	%xmm3, 12(%rdi)

0:	ret
END(_libmd_md5block_avx512)

	// round keys, for use in md5block_avx512
	.section .rodata
	.balign	16

.macro	putkeys i, a, b, c, d
	.4byte	\a, \b, \c, \d
.endm

keys:	allrounds putkeys, putkeys, putkeys, putkeys
	.size	keys, .-keys
#endif /* !defined(_KERNEL) */

	.section .note.GNU-stack,"",%progbits
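
/*
 * C-level view of the three entry points above (a reference sketch of
 * the md5block(MD5_CTX, buf, len) convention noted earlier; the
 * identifiers below are illustrative, not the libmd API):
 *
 *	len &= ~(size_t)63;			// whole 64-byte blocks only
 *	for (; len > 0; len -= 64, buf += 64)
 *		compress(state, buf);		// the 64 rounds above
 *
 * where state is the four words a, b, c, d that each routine loads
 * from and stores back to the start of the context.
 */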