xref: /freebsd/sys/contrib/zstd/lib/decompress/huf_decompress_amd64.S (revision b37d1deb045d7bc7877fb1d9afdb39d43130dcc4)
1/*
2 * Copyright (c) Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11#include "../common/portability_macros.h"
12
13/* Stack marking
14 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
15 */
16#if defined(__ELF__) && defined(__GNUC__)
17.section .note.GNU-stack,"",%progbits
18#endif
19
20#if ZSTD_ENABLE_ASM_X86_64_BMI2
21
22/* Calling convention:
23 *
24 * %rdi contains the first argument: HUF_DecompressAsmArgs*.
25 * %rbp isn't maintained (no frame pointer).
26 * %rsp contains the stack pointer that grows down.
27 *      No red-zone is assumed, only addresses >= %rsp are used.
28 * All register contents are preserved.
29 *
30 * TODO: Support Windows calling convention.
31 */
32
33ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
34ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
35ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
36ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
37.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
38.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
39.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
40.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
41.text
42
43/* Sets up register mappings for clarity.
44 * op[], bits[], dtable & ip[0] each get their own register.
45 * ip[1,2,3] & olimit alias var[].
46 * %rax is a scratch register.
47 */
48
49#define op0    rsi
50#define op1    rbx
51#define op2    rcx
52#define op3    rdi
53
54#define ip0    r8
55#define ip1    r9
56#define ip2    r10
57#define ip3    r11
58
59#define bits0  rbp
60#define bits1  rdx
61#define bits2  r12
62#define bits3  r13
63#define dtable r14
64#define olimit r15
65
66/* var[] aliases ip[1,2,3] & olimit
67 * ip[1,2,3] are saved every iteration.
68 * olimit is only used in compute_olimit.
69 */
70#define var0   r15
71#define var1   r9
72#define var2   r10
73#define var3   r11
74
75/* 32-bit var registers */
76#define vard0  r15d
77#define vard1  r9d
78#define vard2  r10d
79#define vard3  r11d
80
81/* Calls X(N) for each stream 0, 1, 2, 3. */
82#define FOR_EACH_STREAM(X) \
83    X(0);                  \
84    X(1);                  \
85    X(2);                  \
86    X(3)
87
88/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
89#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
90    X(0, idx);                             \
91    X(1, idx);                             \
92    X(2, idx);                             \
93    X(3, idx)
94
95/* Define both _HUF_* & HUF_* symbols because MacOS
96 * C symbols are prefixed with '_' & Linux symbols aren't.
97 */
98_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
99HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
100    /* Save all registers - even if they are callee saved for simplicity. */
101    push %rax
102    push %rbx
103    push %rcx
104    push %rdx
105    push %rbp
106    push %rsi
107    push %rdi
108    push %r8
109    push %r9
110    push %r10
111    push %r11
112    push %r12
113    push %r13
114    push %r14
115    push %r15
116
117    /* Read HUF_DecompressAsmArgs* args from %rax */
118    movq %rdi, %rax
119    movq  0(%rax), %ip0
120    movq  8(%rax), %ip1
121    movq 16(%rax), %ip2
122    movq 24(%rax), %ip3
123    movq 32(%rax), %op0
124    movq 40(%rax), %op1
125    movq 48(%rax), %op2
126    movq 56(%rax), %op3
127    movq 64(%rax), %bits0
128    movq 72(%rax), %bits1
129    movq 80(%rax), %bits2
130    movq 88(%rax), %bits3
131    movq 96(%rax), %dtable
132    push %rax      /* argument */
133    push 104(%rax) /* ilimit */
134    push 112(%rax) /* oend */
135    push %olimit   /* olimit space */
136
137    subq $24, %rsp
138
139.L_4X1_compute_olimit:
140    /* Computes how many iterations we can do safely
141     * %r15, %rax may be clobbered
142     * rbx, rdx must be saved
143     * op3 & ip0 mustn't be clobbered
144     */
145    movq %rbx, 0(%rsp)
146    movq %rdx, 8(%rsp)
147
148    movq 32(%rsp), %rax /* rax = oend */
149    subq %op3,    %rax  /* rax = oend - op3 */
150
151    /* r15 = (oend - op3) / 5 */
152    movabsq $-3689348814741910323, %rdx
153    mulq %rdx
154    movq %rdx, %r15
155    shrq $2, %r15
156
157    movq %ip0,     %rax /* rax = ip0 */
158    movq 40(%rsp), %rdx /* rdx = ilimit */
159    subq %rdx,     %rax /* rax = ip0 - ilimit */
160    movq %rax,     %rbx /* rbx = ip0 - ilimit */
161
162    /* rdx = (ip0 - ilimit) / 7 */
163    movabsq $2635249153387078803, %rdx
164    mulq %rdx
165    subq %rdx, %rbx
166    shrq %rbx
167    addq %rbx, %rdx
168    shrq $2, %rdx
169
170    /* r15 = min(%rdx, %r15) */
171    cmpq %rdx, %r15
172    cmova %rdx, %r15
173
174    /* r15 = r15 * 5 */
175    leaq (%r15, %r15, 4), %r15
176
177    /* olimit = op3 + r15 */
178    addq %op3, %olimit
179
180    movq 8(%rsp), %rdx
181    movq 0(%rsp), %rbx
182
183    /* If (op3 + 20 > olimit) */
184    movq %op3, %rax    /* rax = op3 */
185    addq $20,  %rax    /* rax = op3 + 20 */
186    cmpq %rax, %olimit /* op3 + 20 > olimit */
187    jb .L_4X1_exit
188
189    /* If (ip1 < ip0) go to exit */
190    cmpq %ip0, %ip1
191    jb .L_4X1_exit
192
193    /* If (ip2 < ip1) go to exit */
194    cmpq %ip1, %ip2
195    jb .L_4X1_exit
196
197    /* If (ip3 < ip2) go to exit */
198    cmpq %ip2, %ip3
199    jb .L_4X1_exit
200
201/* Reads top 11 bits from bits[n]
202 * Loads dt[bits[n]] into var[n]
203 */
204#define GET_NEXT_DELT(n)                \
205    movq $53, %var##n;                  \
206    shrxq %var##n, %bits##n, %var##n;   \
207    movzwl (%dtable,%var##n,2),%vard##n
208
209/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
210 * Moves var[n] to %rax
211 * bits[n] <<= var[n] & 63
212 * op[n][idx] = %rax >> 8
213 * %ah is a way to access bits [8, 16) of %rax
214 */
215#define DECODE_FROM_DELT(n, idx)       \
216    movq %var##n, %rax;                \
217    shlxq %var##n, %bits##n, %bits##n; \
218    movb %ah, idx(%op##n)
219
220/* Assumes GET_NEXT_DELT has been called.
221 * Calls DECODE_FROM_DELT then GET_NEXT_DELT
222 */
223#define DECODE_AND_GET_NEXT(n, idx) \
224    DECODE_FROM_DELT(n, idx);       \
225    GET_NEXT_DELT(n)                \
226
227/* // ctz & nbBytes is stored in bits[n]
228 * // nbBits is stored in %rax
229 * ctz  = CTZ[bits[n]]
230 * nbBits  = ctz & 7
231 * nbBytes = ctz >> 3
232 * op[n]  += 5
233 * ip[n]  -= nbBytes
234 * // Note: x86-64 is little-endian ==> no bswap
235 * bits[n] = MEM_readST(ip[n]) | 1
236 * bits[n] <<= nbBits
237 */
238#define RELOAD_BITS(n)             \
239    bsfq %bits##n, %bits##n;       \
240    movq %bits##n, %rax;           \
241    andq $7, %rax;                 \
242    shrq $3, %bits##n;             \
243    leaq 5(%op##n), %op##n;        \
244    subq %bits##n, %ip##n;         \
245    movq (%ip##n), %bits##n;       \
246    orq $1, %bits##n;              \
247    shlx %rax, %bits##n, %bits##n
248
249    /* Store clobbered variables on the stack */
250    movq %olimit, 24(%rsp)
251    movq %ip1, 0(%rsp)
252    movq %ip2, 8(%rsp)
253    movq %ip3, 16(%rsp)
254
255    /* Call GET_NEXT_DELT for each stream */
256    FOR_EACH_STREAM(GET_NEXT_DELT)
257
258    .p2align 6
259
260.L_4X1_loop_body:
261    /* Decode 5 symbols in each of the 4 streams (20 total)
262     * Must have called GET_NEXT_DELT for each stream
263     */
264    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
265    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
266    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
267    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
268    FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
269
270    /* Load ip[1,2,3] from stack (var[] aliases them)
271     * ip[] is needed for RELOAD_BITS
272     * Each will be stored back to the stack after RELOAD
273     */
274    movq 0(%rsp), %ip1
275    movq 8(%rsp), %ip2
276    movq 16(%rsp), %ip3
277
278    /* Reload each stream & fetch the next table entry
279     * to prepare for the next iteration
280     */
281    RELOAD_BITS(0)
282    GET_NEXT_DELT(0)
283
284    RELOAD_BITS(1)
285    movq %ip1, 0(%rsp)
286    GET_NEXT_DELT(1)
287
288    RELOAD_BITS(2)
289    movq %ip2, 8(%rsp)
290    GET_NEXT_DELT(2)
291
292    RELOAD_BITS(3)
293    movq %ip3, 16(%rsp)
294    GET_NEXT_DELT(3)
295
296    /* If op3 < olimit: continue the loop */
297    cmp %op3, 24(%rsp)
298    ja .L_4X1_loop_body
299
300    /* Reload ip[1,2,3] from stack */
301    movq 0(%rsp), %ip1
302    movq 8(%rsp), %ip2
303    movq 16(%rsp), %ip3
304
305    /* Re-compute olimit */
306    jmp .L_4X1_compute_olimit
307
308#undef GET_NEXT_DELT
309#undef DECODE_FROM_DELT
310#undef DECODE
311#undef RELOAD_BITS
312.L_4X1_exit:
313    addq $24, %rsp
314
315    /* Restore stack (oend & olimit) */
316    pop %rax /* olimit */
317    pop %rax /* oend */
318    pop %rax /* ilimit */
319    pop %rax /* arg */
320
321    /* Save ip / op / bits */
322    movq %ip0,  0(%rax)
323    movq %ip1,  8(%rax)
324    movq %ip2, 16(%rax)
325    movq %ip3, 24(%rax)
326    movq %op0, 32(%rax)
327    movq %op1, 40(%rax)
328    movq %op2, 48(%rax)
329    movq %op3, 56(%rax)
330    movq %bits0, 64(%rax)
331    movq %bits1, 72(%rax)
332    movq %bits2, 80(%rax)
333    movq %bits3, 88(%rax)
334
335    /* Restore registers */
336    pop %r15
337    pop %r14
338    pop %r13
339    pop %r12
340    pop %r11
341    pop %r10
342    pop %r9
343    pop %r8
344    pop %rdi
345    pop %rsi
346    pop %rbp
347    pop %rdx
348    pop %rcx
349    pop %rbx
350    pop %rax
351    ret
352
353_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
354HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
355    /* Save all registers - even if they are callee saved for simplicity. */
356    push %rax
357    push %rbx
358    push %rcx
359    push %rdx
360    push %rbp
361    push %rsi
362    push %rdi
363    push %r8
364    push %r9
365    push %r10
366    push %r11
367    push %r12
368    push %r13
369    push %r14
370    push %r15
371
372    movq %rdi, %rax
373    movq  0(%rax), %ip0
374    movq  8(%rax), %ip1
375    movq 16(%rax), %ip2
376    movq 24(%rax), %ip3
377    movq 32(%rax), %op0
378    movq 40(%rax), %op1
379    movq 48(%rax), %op2
380    movq 56(%rax), %op3
381    movq 64(%rax), %bits0
382    movq 72(%rax), %bits1
383    movq 80(%rax), %bits2
384    movq 88(%rax), %bits3
385    movq 96(%rax), %dtable
386    push %rax      /* argument */
387    push %rax      /* olimit */
388    push 104(%rax) /* ilimit */
389
390    movq 112(%rax), %rax
391    push %rax /* oend3 */
392
393    movq %op3, %rax
394    push %rax /* oend2 */
395
396    movq %op2, %rax
397    push %rax /* oend1 */
398
399    movq %op1, %rax
400    push %rax /* oend0 */
401
402    /* Scratch space */
403    subq $8, %rsp
404
405.L_4X2_compute_olimit:
406    /* Computes how many iterations we can do safely
407     * %r15, %rax may be clobbered
408     * rdx must be saved
409     * op[1,2,3,4] & ip0 mustn't be clobbered
410     */
411    movq %rdx, 0(%rsp)
412
413    /* We can consume up to 7 input bytes each iteration. */
414    movq %ip0,     %rax  /* rax = ip0 */
415    movq 40(%rsp), %rdx  /* rdx = ilimit */
416    subq %rdx,     %rax  /* rax = ip0 - ilimit */
417    movq %rax,    %r15   /* r15 = ip0 - ilimit */
418
419    /* rdx = rax / 7 */
420    movabsq $2635249153387078803, %rdx
421    mulq %rdx
422    subq %rdx, %r15
423    shrq %r15
424    addq %r15, %rdx
425    shrq $2, %rdx
426
427    /* r15 = (ip0 - ilimit) / 7 */
428    movq %rdx, %r15
429
430    movabsq $-3689348814741910323, %rdx
431    movq 8(%rsp), %rax /* rax = oend0 */
432    subq %op0,    %rax /* rax = oend0 - op0 */
433    mulq %rdx
434    shrq $3,      %rdx /* rdx = rax / 10 */
435
436    /* r15 = min(%rdx, %r15) */
437    cmpq  %rdx, %r15
438    cmova %rdx, %r15
439
440    movabsq $-3689348814741910323, %rdx
441    movq 16(%rsp), %rax /* rax = oend1 */
442    subq %op1,     %rax /* rax = oend1 - op1 */
443    mulq %rdx
444    shrq $3,       %rdx /* rdx = rax / 10 */
445
446    /* r15 = min(%rdx, %r15) */
447    cmpq  %rdx, %r15
448    cmova %rdx, %r15
449
450    movabsq $-3689348814741910323, %rdx
451    movq 24(%rsp), %rax /* rax = oend2 */
452    subq %op2,     %rax /* rax = oend2 - op2 */
453    mulq %rdx
454    shrq $3,       %rdx /* rdx = rax / 10 */
455
456    /* r15 = min(%rdx, %r15) */
457    cmpq  %rdx, %r15
458    cmova %rdx, %r15
459
460    movabsq $-3689348814741910323, %rdx
461    movq 32(%rsp), %rax /* rax = oend3 */
462    subq %op3,     %rax /* rax = oend3 - op3 */
463    mulq %rdx
464    shrq $3,       %rdx /* rdx = rax / 10 */
465
466    /* r15 = min(%rdx, %r15) */
467    cmpq  %rdx, %r15
468    cmova %rdx, %r15
469
470    /* olimit = op3 + 5 * r15 */
471    movq %r15, %rax
472    leaq (%op3, %rax, 4), %olimit
473    addq %rax, %olimit
474
475    movq 0(%rsp), %rdx
476
477    /* If (op3 + 10 > olimit) */
478    movq %op3, %rax    /* rax = op3 */
479    addq $10,  %rax    /* rax = op3 + 10 */
480    cmpq %rax, %olimit /* op3 + 10 > olimit */
481    jb .L_4X2_exit
482
483    /* If (ip1 < ip0) go to exit */
484    cmpq %ip0, %ip1
485    jb .L_4X2_exit
486
487    /* If (ip2 < ip1) go to exit */
488    cmpq %ip1, %ip2
489    jb .L_4X2_exit
490
491    /* If (ip3 < ip2) go to exit */
492    cmpq %ip2, %ip3
493    jb .L_4X2_exit
494
495#define DECODE(n, idx)              \
496    movq %bits##n, %rax;            \
497    shrq $53, %rax;                 \
498    movzwl 0(%dtable,%rax,4),%r8d;  \
499    movzbl 2(%dtable,%rax,4),%r15d; \
500    movzbl 3(%dtable,%rax,4),%eax;  \
501    movw %r8w, (%op##n);            \
502    shlxq %r15, %bits##n, %bits##n; \
503    addq %rax, %op##n
504
505#define RELOAD_BITS(n)              \
506    bsfq %bits##n, %bits##n;        \
507    movq %bits##n, %rax;            \
508    shrq $3, %bits##n;              \
509    andq $7, %rax;                  \
510    subq %bits##n, %ip##n;          \
511    movq (%ip##n), %bits##n;        \
512    orq $1, %bits##n;               \
513    shlxq %rax, %bits##n, %bits##n
514
515
516    movq %olimit, 48(%rsp)
517
518    .p2align 6
519
520.L_4X2_loop_body:
521    /* We clobber r8, so store it on the stack */
522    movq %r8, 0(%rsp)
523
524    /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
525    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
526    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
527    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
528    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
529    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
530
531    /* Reload r8 */
532    movq 0(%rsp), %r8
533
534    FOR_EACH_STREAM(RELOAD_BITS)
535
536    cmp %op3, 48(%rsp)
537    ja .L_4X2_loop_body
538    jmp .L_4X2_compute_olimit
539
540#undef DECODE
541#undef RELOAD_BITS
542.L_4X2_exit:
543    addq $8, %rsp
544    /* Restore stack (oend & olimit) */
545    pop %rax /* oend0 */
546    pop %rax /* oend1 */
547    pop %rax /* oend2 */
548    pop %rax /* oend3 */
549    pop %rax /* ilimit */
550    pop %rax /* olimit */
551    pop %rax /* arg */
552
553    /* Save ip / op / bits */
554    movq %ip0,  0(%rax)
555    movq %ip1,  8(%rax)
556    movq %ip2, 16(%rax)
557    movq %ip3, 24(%rax)
558    movq %op0, 32(%rax)
559    movq %op1, 40(%rax)
560    movq %op2, 48(%rax)
561    movq %op3, 56(%rax)
562    movq %bits0, 64(%rax)
563    movq %bits1, 72(%rax)
564    movq %bits2, 80(%rax)
565    movq %bits3, 88(%rax)
566
567    /* Restore registers */
568    pop %r15
569    pop %r14
570    pop %r13
571    pop %r12
572    pop %r11
573    pop %r10
574    pop %r9
575    pop %r8
576    pop %rdi
577    pop %rsi
578    pop %rbp
579    pop %rdx
580    pop %rcx
581    pop %rbx
582    pop %rax
583    ret
584
585#endif
586