/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.

 * Neither the name of Intel Corporation nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
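
/* memmove/memcpy implemented with SSE2.

   Arguments follow the System V AMD64 ABI: %rdi = dst, %rsi = src,
   %rdx = len; dst is returned in %rax.  %rbx is callee-saved and is
   preserved by ENTRANCE/RETURN because the copy code uses it as scratch.

   Rough outline of the strategy (informal sketch):
     - dst == src: return immediately;
     - dst > src: copy backward, from the end of the buffers, so that
       overlapping regions are handled safely; otherwise copy forward;
     - lengths up to 128 bytes use a few possibly overlapping unaligned
       16-byte loads and stores;
     - longer copies align the destination to 64 bytes and loop;
     - copies of at least SHARED_CACHE_SIZE_HALF bytes (from cache.h)
       use non-temporal stores.

   memcpy is exported below as an alias of this memmove.  */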

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
	.globl alias; \
	.equ alias, original
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)
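
/* The small-length cases below share one technique: the head and the tail
	of the region are loaded into registers first and only then stored.
	Because every load happens before any store, the accesses may overlap
	without corrupting data, so one code path covers a whole range of
	lengths with no byte-by-byte loop.  */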

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)
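
/* Forward copies longer than 128 bytes round the destination up to a
	64-byte boundary and run the main loop with aligned stores; %rsi is
	rewritten as the (src - dst) difference so that the aligned
	destination cursor %r8 can address both buffers.  */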

L(mm_len_128_or_more_forward):
/* Aligning the address of destination.  */
/* save first unaligned 64 bytes */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8		/* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi		/* rsi = src - dst = diff */

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)
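
/* Fewer than 64 bytes now remain past %r8, the aligned destination cursor.
	The code below recomputes that residual count in %rdx, points %r9 at
	the matching source position, and finishes with head/tail copies.  */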

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* We copied everything up to the %r8 position in the dst.
	In %rdx now is how many bytes are left to copy.
	Now we need %r9 to point at the matching source position.  */
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)
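
/* Lengths of at most 16 bytes are dispatched by testing bits of the length:
	bit 3 or 4 set means 8..16 bytes, bit 2 means 4..7, bit 1 means 2..3,
	otherwise 0 or 1 byte.  Each case again copies a possibly overlapping
	head and tail.  */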

L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)
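
/* Backward copies longer than 128 bytes mirror the forward path: the end of
	the destination is rounded down to a 64-byte boundary and the main
	loop walks downward with aligned stores, using %r8 = src - dst to
	address the source.  */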

L(mm_len_128_or_more_backward):
/* Aligning the address of destination. We need to save
	the last 64 bytes from the source in order not to overwrite them.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9		/* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8		/* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN
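
/* For very large copies (at least SHARED_CACHE_SIZE_HALF bytes) the loops
	below use non-temporal movntdq stores, which bypass the cache so the
	copy does not evict useful data; the sfence afterwards orders the
	weakly-ordered stores before returning.  */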

/* Big length copy forward part.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)

ALIAS_SYMBOL(memcpy, MEMMOVE)