1/* 2 * Copyright (c) Facebook, Inc. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11#include "../common/portability_macros.h" 12 13/* Stack marking 14 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart 15 */ 16#if defined(__ELF__) && defined(__GNUC__) 17.section .note.GNU-stack,"",%progbits 18#endif 19 20#if ZSTD_ENABLE_ASM_X86_64_BMI2 21 22/* Calling convention: 23 * 24 * %rdi contains the first argument: HUF_DecompressAsmArgs*. 25 * %rbp isn't maintained (no frame pointer). 26 * %rsp contains the stack pointer that grows down. 27 * No red-zone is assumed, only addresses >= %rsp are used. 28 * All register contents are preserved. 29 * 30 * TODO: Support Windows calling convention. 31 */ 32 33ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) 34ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) 35ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) 36ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) 37.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop 38.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop 39.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop 40.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop 41.text 42 43/* Sets up register mappings for clarity. 44 * op[], bits[], dtable & ip[0] each get their own register. 45 * ip[1,2,3] & olimit alias var[]. 46 * %rax is a scratch register. 47 */ 48 49#define op0 rsi 50#define op1 rbx 51#define op2 rcx 52#define op3 rdi 53 54#define ip0 r8 55#define ip1 r9 56#define ip2 r10 57#define ip3 r11 58 59#define bits0 rbp 60#define bits1 rdx 61#define bits2 r12 62#define bits3 r13 63#define dtable r14 64#define olimit r15 65 66/* var[] aliases ip[1,2,3] & olimit 67 * ip[1,2,3] are saved every iteration. 68 * olimit is only used in compute_olimit. 69 */ 70#define var0 r15 71#define var1 r9 72#define var2 r10 73#define var3 r11 74 75/* 32-bit var registers */ 76#define vard0 r15d 77#define vard1 r9d 78#define vard2 r10d 79#define vard3 r11d 80 81/* Calls X(N) for each stream 0, 1, 2, 3. */ 82#define FOR_EACH_STREAM(X) \ 83 X(0); \ 84 X(1); \ 85 X(2); \ 86 X(3) 87 88/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ 89#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ 90 X(0, idx); \ 91 X(1, idx); \ 92 X(2, idx); \ 93 X(3, idx) 94 95/* Define both _HUF_* & HUF_* symbols because MacOS 96 * C symbols are prefixed with '_' & Linux symbols aren't. 97 */ 98_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: 99HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: 100 /* Save all registers - even if they are callee saved for simplicity. */ 101 push %rax 102 push %rbx 103 push %rcx 104 push %rdx 105 push %rbp 106 push %rsi 107 push %rdi 108 push %r8 109 push %r9 110 push %r10 111 push %r11 112 push %r12 113 push %r13 114 push %r14 115 push %r15 116 117 /* Read HUF_DecompressAsmArgs* args from %rax */ 118 movq %rdi, %rax 119 movq 0(%rax), %ip0 120 movq 8(%rax), %ip1 121 movq 16(%rax), %ip2 122 movq 24(%rax), %ip3 123 movq 32(%rax), %op0 124 movq 40(%rax), %op1 125 movq 48(%rax), %op2 126 movq 56(%rax), %op3 127 movq 64(%rax), %bits0 128 movq 72(%rax), %bits1 129 movq 80(%rax), %bits2 130 movq 88(%rax), %bits3 131 movq 96(%rax), %dtable 132 push %rax /* argument */ 133 push 104(%rax) /* ilimit */ 134 push 112(%rax) /* oend */ 135 push %olimit /* olimit space */ 136 137 subq $24, %rsp 138 139.L_4X1_compute_olimit: 140 /* Computes how many iterations we can do safely 141 * %r15, %rax may be clobbered 142 * rbx, rdx must be saved 143 * op3 & ip0 mustn't be clobbered 144 */ 145 movq %rbx, 0(%rsp) 146 movq %rdx, 8(%rsp) 147 148 movq 32(%rsp), %rax /* rax = oend */ 149 subq %op3, %rax /* rax = oend - op3 */ 150 151 /* r15 = (oend - op3) / 5 */ 152 movabsq $-3689348814741910323, %rdx 153 mulq %rdx 154 movq %rdx, %r15 155 shrq $2, %r15 156 157 movq %ip0, %rax /* rax = ip0 */ 158 movq 40(%rsp), %rdx /* rdx = ilimit */ 159 subq %rdx, %rax /* rax = ip0 - ilimit */ 160 movq %rax, %rbx /* rbx = ip0 - ilimit */ 161 162 /* rdx = (ip0 - ilimit) / 7 */ 163 movabsq $2635249153387078803, %rdx 164 mulq %rdx 165 subq %rdx, %rbx 166 shrq %rbx 167 addq %rbx, %rdx 168 shrq $2, %rdx 169 170 /* r15 = min(%rdx, %r15) */ 171 cmpq %rdx, %r15 172 cmova %rdx, %r15 173 174 /* r15 = r15 * 5 */ 175 leaq (%r15, %r15, 4), %r15 176 177 /* olimit = op3 + r15 */ 178 addq %op3, %olimit 179 180 movq 8(%rsp), %rdx 181 movq 0(%rsp), %rbx 182 183 /* If (op3 + 20 > olimit) */ 184 movq %op3, %rax /* rax = op3 */ 185 addq $20, %rax /* rax = op3 + 20 */ 186 cmpq %rax, %olimit /* op3 + 20 > olimit */ 187 jb .L_4X1_exit 188 189 /* If (ip1 < ip0) go to exit */ 190 cmpq %ip0, %ip1 191 jb .L_4X1_exit 192 193 /* If (ip2 < ip1) go to exit */ 194 cmpq %ip1, %ip2 195 jb .L_4X1_exit 196 197 /* If (ip3 < ip2) go to exit */ 198 cmpq %ip2, %ip3 199 jb .L_4X1_exit 200 201/* Reads top 11 bits from bits[n] 202 * Loads dt[bits[n]] into var[n] 203 */ 204#define GET_NEXT_DELT(n) \ 205 movq $53, %var##n; \ 206 shrxq %var##n, %bits##n, %var##n; \ 207 movzwl (%dtable,%var##n,2),%vard##n 208 209/* var[n] must contain the DTable entry computed with GET_NEXT_DELT 210 * Moves var[n] to %rax 211 * bits[n] <<= var[n] & 63 212 * op[n][idx] = %rax >> 8 213 * %ah is a way to access bits [8, 16) of %rax 214 */ 215#define DECODE_FROM_DELT(n, idx) \ 216 movq %var##n, %rax; \ 217 shlxq %var##n, %bits##n, %bits##n; \ 218 movb %ah, idx(%op##n) 219 220/* Assumes GET_NEXT_DELT has been called. 221 * Calls DECODE_FROM_DELT then GET_NEXT_DELT 222 */ 223#define DECODE_AND_GET_NEXT(n, idx) \ 224 DECODE_FROM_DELT(n, idx); \ 225 GET_NEXT_DELT(n) \ 226 227/* // ctz & nbBytes is stored in bits[n] 228 * // nbBits is stored in %rax 229 * ctz = CTZ[bits[n]] 230 * nbBits = ctz & 7 231 * nbBytes = ctz >> 3 232 * op[n] += 5 233 * ip[n] -= nbBytes 234 * // Note: x86-64 is little-endian ==> no bswap 235 * bits[n] = MEM_readST(ip[n]) | 1 236 * bits[n] <<= nbBits 237 */ 238#define RELOAD_BITS(n) \ 239 bsfq %bits##n, %bits##n; \ 240 movq %bits##n, %rax; \ 241 andq $7, %rax; \ 242 shrq $3, %bits##n; \ 243 leaq 5(%op##n), %op##n; \ 244 subq %bits##n, %ip##n; \ 245 movq (%ip##n), %bits##n; \ 246 orq $1, %bits##n; \ 247 shlx %rax, %bits##n, %bits##n 248 249 /* Store clobbered variables on the stack */ 250 movq %olimit, 24(%rsp) 251 movq %ip1, 0(%rsp) 252 movq %ip2, 8(%rsp) 253 movq %ip3, 16(%rsp) 254 255 /* Call GET_NEXT_DELT for each stream */ 256 FOR_EACH_STREAM(GET_NEXT_DELT) 257 258 .p2align 6 259 260.L_4X1_loop_body: 261 /* Decode 5 symbols in each of the 4 streams (20 total) 262 * Must have called GET_NEXT_DELT for each stream 263 */ 264 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) 265 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) 266 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) 267 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) 268 FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) 269 270 /* Load ip[1,2,3] from stack (var[] aliases them) 271 * ip[] is needed for RELOAD_BITS 272 * Each will be stored back to the stack after RELOAD 273 */ 274 movq 0(%rsp), %ip1 275 movq 8(%rsp), %ip2 276 movq 16(%rsp), %ip3 277 278 /* Reload each stream & fetch the next table entry 279 * to prepare for the next iteration 280 */ 281 RELOAD_BITS(0) 282 GET_NEXT_DELT(0) 283 284 RELOAD_BITS(1) 285 movq %ip1, 0(%rsp) 286 GET_NEXT_DELT(1) 287 288 RELOAD_BITS(2) 289 movq %ip2, 8(%rsp) 290 GET_NEXT_DELT(2) 291 292 RELOAD_BITS(3) 293 movq %ip3, 16(%rsp) 294 GET_NEXT_DELT(3) 295 296 /* If op3 < olimit: continue the loop */ 297 cmp %op3, 24(%rsp) 298 ja .L_4X1_loop_body 299 300 /* Reload ip[1,2,3] from stack */ 301 movq 0(%rsp), %ip1 302 movq 8(%rsp), %ip2 303 movq 16(%rsp), %ip3 304 305 /* Re-compute olimit */ 306 jmp .L_4X1_compute_olimit 307 308#undef GET_NEXT_DELT 309#undef DECODE_FROM_DELT 310#undef DECODE 311#undef RELOAD_BITS 312.L_4X1_exit: 313 addq $24, %rsp 314 315 /* Restore stack (oend & olimit) */ 316 pop %rax /* olimit */ 317 pop %rax /* oend */ 318 pop %rax /* ilimit */ 319 pop %rax /* arg */ 320 321 /* Save ip / op / bits */ 322 movq %ip0, 0(%rax) 323 movq %ip1, 8(%rax) 324 movq %ip2, 16(%rax) 325 movq %ip3, 24(%rax) 326 movq %op0, 32(%rax) 327 movq %op1, 40(%rax) 328 movq %op2, 48(%rax) 329 movq %op3, 56(%rax) 330 movq %bits0, 64(%rax) 331 movq %bits1, 72(%rax) 332 movq %bits2, 80(%rax) 333 movq %bits3, 88(%rax) 334 335 /* Restore registers */ 336 pop %r15 337 pop %r14 338 pop %r13 339 pop %r12 340 pop %r11 341 pop %r10 342 pop %r9 343 pop %r8 344 pop %rdi 345 pop %rsi 346 pop %rbp 347 pop %rdx 348 pop %rcx 349 pop %rbx 350 pop %rax 351 ret 352 353_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: 354HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: 355 /* Save all registers - even if they are callee saved for simplicity. */ 356 push %rax 357 push %rbx 358 push %rcx 359 push %rdx 360 push %rbp 361 push %rsi 362 push %rdi 363 push %r8 364 push %r9 365 push %r10 366 push %r11 367 push %r12 368 push %r13 369 push %r14 370 push %r15 371 372 movq %rdi, %rax 373 movq 0(%rax), %ip0 374 movq 8(%rax), %ip1 375 movq 16(%rax), %ip2 376 movq 24(%rax), %ip3 377 movq 32(%rax), %op0 378 movq 40(%rax), %op1 379 movq 48(%rax), %op2 380 movq 56(%rax), %op3 381 movq 64(%rax), %bits0 382 movq 72(%rax), %bits1 383 movq 80(%rax), %bits2 384 movq 88(%rax), %bits3 385 movq 96(%rax), %dtable 386 push %rax /* argument */ 387 push %rax /* olimit */ 388 push 104(%rax) /* ilimit */ 389 390 movq 112(%rax), %rax 391 push %rax /* oend3 */ 392 393 movq %op3, %rax 394 push %rax /* oend2 */ 395 396 movq %op2, %rax 397 push %rax /* oend1 */ 398 399 movq %op1, %rax 400 push %rax /* oend0 */ 401 402 /* Scratch space */ 403 subq $8, %rsp 404 405.L_4X2_compute_olimit: 406 /* Computes how many iterations we can do safely 407 * %r15, %rax may be clobbered 408 * rdx must be saved 409 * op[1,2,3,4] & ip0 mustn't be clobbered 410 */ 411 movq %rdx, 0(%rsp) 412 413 /* We can consume up to 7 input bytes each iteration. */ 414 movq %ip0, %rax /* rax = ip0 */ 415 movq 40(%rsp), %rdx /* rdx = ilimit */ 416 subq %rdx, %rax /* rax = ip0 - ilimit */ 417 movq %rax, %r15 /* r15 = ip0 - ilimit */ 418 419 /* rdx = rax / 7 */ 420 movabsq $2635249153387078803, %rdx 421 mulq %rdx 422 subq %rdx, %r15 423 shrq %r15 424 addq %r15, %rdx 425 shrq $2, %rdx 426 427 /* r15 = (ip0 - ilimit) / 7 */ 428 movq %rdx, %r15 429 430 movabsq $-3689348814741910323, %rdx 431 movq 8(%rsp), %rax /* rax = oend0 */ 432 subq %op0, %rax /* rax = oend0 - op0 */ 433 mulq %rdx 434 shrq $3, %rdx /* rdx = rax / 10 */ 435 436 /* r15 = min(%rdx, %r15) */ 437 cmpq %rdx, %r15 438 cmova %rdx, %r15 439 440 movabsq $-3689348814741910323, %rdx 441 movq 16(%rsp), %rax /* rax = oend1 */ 442 subq %op1, %rax /* rax = oend1 - op1 */ 443 mulq %rdx 444 shrq $3, %rdx /* rdx = rax / 10 */ 445 446 /* r15 = min(%rdx, %r15) */ 447 cmpq %rdx, %r15 448 cmova %rdx, %r15 449 450 movabsq $-3689348814741910323, %rdx 451 movq 24(%rsp), %rax /* rax = oend2 */ 452 subq %op2, %rax /* rax = oend2 - op2 */ 453 mulq %rdx 454 shrq $3, %rdx /* rdx = rax / 10 */ 455 456 /* r15 = min(%rdx, %r15) */ 457 cmpq %rdx, %r15 458 cmova %rdx, %r15 459 460 movabsq $-3689348814741910323, %rdx 461 movq 32(%rsp), %rax /* rax = oend3 */ 462 subq %op3, %rax /* rax = oend3 - op3 */ 463 mulq %rdx 464 shrq $3, %rdx /* rdx = rax / 10 */ 465 466 /* r15 = min(%rdx, %r15) */ 467 cmpq %rdx, %r15 468 cmova %rdx, %r15 469 470 /* olimit = op3 + 5 * r15 */ 471 movq %r15, %rax 472 leaq (%op3, %rax, 4), %olimit 473 addq %rax, %olimit 474 475 movq 0(%rsp), %rdx 476 477 /* If (op3 + 10 > olimit) */ 478 movq %op3, %rax /* rax = op3 */ 479 addq $10, %rax /* rax = op3 + 10 */ 480 cmpq %rax, %olimit /* op3 + 10 > olimit */ 481 jb .L_4X2_exit 482 483 /* If (ip1 < ip0) go to exit */ 484 cmpq %ip0, %ip1 485 jb .L_4X2_exit 486 487 /* If (ip2 < ip1) go to exit */ 488 cmpq %ip1, %ip2 489 jb .L_4X2_exit 490 491 /* If (ip3 < ip2) go to exit */ 492 cmpq %ip2, %ip3 493 jb .L_4X2_exit 494 495#define DECODE(n, idx) \ 496 movq %bits##n, %rax; \ 497 shrq $53, %rax; \ 498 movzwl 0(%dtable,%rax,4),%r8d; \ 499 movzbl 2(%dtable,%rax,4),%r15d; \ 500 movzbl 3(%dtable,%rax,4),%eax; \ 501 movw %r8w, (%op##n); \ 502 shlxq %r15, %bits##n, %bits##n; \ 503 addq %rax, %op##n 504 505#define RELOAD_BITS(n) \ 506 bsfq %bits##n, %bits##n; \ 507 movq %bits##n, %rax; \ 508 shrq $3, %bits##n; \ 509 andq $7, %rax; \ 510 subq %bits##n, %ip##n; \ 511 movq (%ip##n), %bits##n; \ 512 orq $1, %bits##n; \ 513 shlxq %rax, %bits##n, %bits##n 514 515 516 movq %olimit, 48(%rsp) 517 518 .p2align 6 519 520.L_4X2_loop_body: 521 /* We clobber r8, so store it on the stack */ 522 movq %r8, 0(%rsp) 523 524 /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ 525 FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) 526 FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) 527 FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) 528 FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) 529 FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) 530 531 /* Reload r8 */ 532 movq 0(%rsp), %r8 533 534 FOR_EACH_STREAM(RELOAD_BITS) 535 536 cmp %op3, 48(%rsp) 537 ja .L_4X2_loop_body 538 jmp .L_4X2_compute_olimit 539 540#undef DECODE 541#undef RELOAD_BITS 542.L_4X2_exit: 543 addq $8, %rsp 544 /* Restore stack (oend & olimit) */ 545 pop %rax /* oend0 */ 546 pop %rax /* oend1 */ 547 pop %rax /* oend2 */ 548 pop %rax /* oend3 */ 549 pop %rax /* ilimit */ 550 pop %rax /* olimit */ 551 pop %rax /* arg */ 552 553 /* Save ip / op / bits */ 554 movq %ip0, 0(%rax) 555 movq %ip1, 8(%rax) 556 movq %ip2, 16(%rax) 557 movq %ip3, 24(%rax) 558 movq %op0, 32(%rax) 559 movq %op1, 40(%rax) 560 movq %op2, 48(%rax) 561 movq %op3, 56(%rax) 562 movq %bits0, 64(%rax) 563 movq %bits1, 72(%rax) 564 movq %bits2, 80(%rax) 565 movq %bits3, 88(%rax) 566 567 /* Restore registers */ 568 pop %r15 569 pop %r14 570 pop %r13 571 pop %r12 572 pop %r11 573 pop %r10 574 pop %r9 575 pop %r8 576 pop %rdi 577 pop %rsi 578 pop %rbp 579 pop %rdx 580 pop %rcx 581 pop %rbx 582 pop %rax 583 ret 584 585#endif 586