/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, Intel Corporation
 * All rights reserved.
 */

/*
 * memcpy.s - copies two blocks of memory
 *	Implements memcpy() and memmove() libc primitives.
 */
	.ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>
	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "synonyms.h"
#include "cache.h"
#include "proc64_id.h"

	ANSI_PRAGMA_WEAK2(_private_memcpy,memcpy,function)

#define	L(s) .memcpy/**/s

/*
 * memcpy algorithm overview:
 *
 * Thresholds used below were determined experimentally.
 *
 * Pseudo code:
 *
 * If (size <= 128 bytes) {
 *	do unrolled code (primarily 8-byte loads/stores) regardless of
 *	alignment.
 * } else {
 *	Align destination to 16-byte boundary
 *
 *	if (NO_SSE) {
 *		If (size > half of the largest level cache) {
 *			Use 8-byte non-temporal stores (64-bytes/loop)
 *		} else {
 *			if (size > 4K && size <= half l1 cache size) {
 *				Use rep movsq
 *			} else {
 *				Use 8-byte loads/stores (64 bytes per loop)
 *			}
 *		}
 *
 *	} else { **USE SSE**
 *		If (size > half of the largest level cache) {
 *			Use 16-byte non-temporal stores (128-bytes per loop)
 *		} else {
 *			If (both source and destination are aligned) {
 *				Use 16-byte aligned loads and stores (128 bytes/loop)
 *			} else {
 *				use pairs of xmm registers with SSE2 or SSSE3
 *				instructions to concatenate and shift appropriately
 *				to account for source unalignment. This enables
 *				16-byte aligned loads to be done.
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via unrolled code above.
 * }
 *
 * memmove overview:
 *	memmove is the same as memcpy except one case where copy needs to be
 *	done backwards. The copy backwards code is done in a similar manner.
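 *
 * As a rough C sketch only (illustrative names, not the code in this
 * file), the forward/backward decision made at the memmove entry point
 * below amounts to:
 *
 *	void *
 *	memmove_sketch(void *dst, const void *src, size_t len)
 *	{
 *		char *d = dst;
 *		const char *s = src;
 *		size_t i;
 *
 *		if (d <= s || d >= s + len) {
 *			for (i = 0; i < len; i++)
 *				d[i] = s[i];
 *		} else {
 *			for (i = len; i != 0; i--)
 *				d[i - 1] = s[i - 1];
 *		}
 *		return (dst);
 *	}
 *
 * The first branch is the forward copy shared with memcpy; the second is
 * the backward copy used only when dst falls inside [src, src + len).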
 */

	ENTRY(memmove)
	cmp	%rsi,%rdi	# if dst <= src
	jbe	L(CopyForward)	# then do copy forward
	mov	%rsi,%r9	# move src to r9
	add	%rdx,%r9	# add len to get addr of end of src
	cmp	%r9,%rdi	# if dst < end of src
	jb	L(CopyBackwards)	# then do copy backwards
	jmp	L(CopyForward)

	ENTRY (memcpy)
L(CopyForward):
	mov	%rdx,%r8
	mov	%rdi,%rcx
	mov	%rsi,%rdx
	mov	%rdi,%rax
	lea	L(fwdPxQx)(%rip),%r11
	cmp	$0x80,%r8	# 128
	jg	L(ck_use_sse2)
	add	%r8,%rcx
	add	%r8,%rdx

	movslq	(%r11,%r8,4),%r10
	lea	(%r10,%r11,1),%r11
	jmpq	*%r11

	.balign 16
L(ShrtAlignNew):
	lea	L(AliPxQx)(%rip),%r11
	mov	%rcx,%r9
	and	$0xf,%r9

	movslq	(%r11,%r9,4),%r10
	lea	(%r10,%r11,1),%r11
	jmpq	*%r11

	.balign 16
L(fwdPxQx):	.int	L(P0Q0)-L(fwdPxQx)
	.int	L(P1Q0)-L(fwdPxQx)
	.int	L(P2Q0)-L(fwdPxQx)
	.int	L(P3Q0)-L(fwdPxQx)
	.int	L(P4Q0)-L(fwdPxQx)
	.int	L(P5Q0)-L(fwdPxQx)
	.int	L(P6Q0)-L(fwdPxQx)
	.int	L(P7Q0)-L(fwdPxQx)

	.int	L(P0Q1)-L(fwdPxQx)
	.int	L(P1Q1)-L(fwdPxQx)
	.int	L(P2Q1)-L(fwdPxQx)
	.int	L(P3Q1)-L(fwdPxQx)
	.int	L(P4Q1)-L(fwdPxQx)
	.int	L(P5Q1)-L(fwdPxQx)
	.int	L(P6Q1)-L(fwdPxQx)
	.int	L(P7Q1)-L(fwdPxQx)

	.int	L(P0Q2)-L(fwdPxQx)
	.int	L(P1Q2)-L(fwdPxQx)
	.int	L(P2Q2)-L(fwdPxQx)
	.int	L(P3Q2)-L(fwdPxQx)
	.int	L(P4Q2)-L(fwdPxQx)
	.int	L(P5Q2)-L(fwdPxQx)
	.int	L(P6Q2)-L(fwdPxQx)
	.int	L(P7Q2)-L(fwdPxQx)

	.int	L(P0Q3)-L(fwdPxQx)
	.int	L(P1Q3)-L(fwdPxQx)
	.int	L(P2Q3)-L(fwdPxQx)
	.int	L(P3Q3)-L(fwdPxQx)
	.int	L(P4Q3)-L(fwdPxQx)
	.int	L(P5Q3)-L(fwdPxQx)
	.int	L(P6Q3)-L(fwdPxQx)
	.int	L(P7Q3)-L(fwdPxQx)

	.int	L(P0Q4)-L(fwdPxQx)
	.int	L(P1Q4)-L(fwdPxQx)
	.int	L(P2Q4)-L(fwdPxQx)
	.int	L(P3Q4)-L(fwdPxQx)
	.int	L(P4Q4)-L(fwdPxQx)
	.int	L(P5Q4)-L(fwdPxQx)
	.int	L(P6Q4)-L(fwdPxQx)
	.int	L(P7Q4)-L(fwdPxQx)

	.int	L(P0Q5)-L(fwdPxQx)
	.int	L(P1Q5)-L(fwdPxQx)
	.int	L(P2Q5)-L(fwdPxQx)
	.int	L(P3Q5)-L(fwdPxQx)
	.int	L(P4Q5)-L(fwdPxQx)
	.int	L(P5Q5)-L(fwdPxQx)
	.int	L(P6Q5)-L(fwdPxQx)
	.int	L(P7Q5)-L(fwdPxQx)

	.int	L(P0Q6)-L(fwdPxQx)
	.int	L(P1Q6)-L(fwdPxQx)
	.int	L(P2Q6)-L(fwdPxQx)
	.int	L(P3Q6)-L(fwdPxQx)
	.int	L(P4Q6)-L(fwdPxQx)
	.int	L(P5Q6)-L(fwdPxQx)
	.int	L(P6Q6)-L(fwdPxQx)
	.int	L(P7Q6)-L(fwdPxQx)

	.int	L(P0Q7)-L(fwdPxQx)
	.int	L(P1Q7)-L(fwdPxQx)
	.int	L(P2Q7)-L(fwdPxQx)
	.int	L(P3Q7)-L(fwdPxQx)
	.int	L(P4Q7)-L(fwdPxQx)
	.int	L(P5Q7)-L(fwdPxQx)
	.int	L(P6Q7)-L(fwdPxQx)
	.int	L(P7Q7)-L(fwdPxQx)

	.int	L(P0Q8)-L(fwdPxQx)
	.int	L(P1Q8)-L(fwdPxQx)
	.int	L(P2Q8)-L(fwdPxQx)
	.int	L(P3Q8)-L(fwdPxQx)
	.int	L(P4Q8)-L(fwdPxQx)
	.int	L(P5Q8)-L(fwdPxQx)
	.int	L(P6Q8)-L(fwdPxQx)
	.int	L(P7Q8)-L(fwdPxQx)

	.int	L(P0Q9)-L(fwdPxQx)
	.int	L(P1Q9)-L(fwdPxQx)
	.int	L(P2Q9)-L(fwdPxQx)
	.int	L(P3Q9)-L(fwdPxQx)
	.int	L(P4Q9)-L(fwdPxQx)
	.int	L(P5Q9)-L(fwdPxQx)
	.int	L(P6Q9)-L(fwdPxQx)
	.int	L(P7Q9)-L(fwdPxQx)

	.int	L(P0QA)-L(fwdPxQx)
	.int	L(P1QA)-L(fwdPxQx)
	.int	L(P2QA)-L(fwdPxQx)
	.int	L(P3QA)-L(fwdPxQx)
	.int	L(P4QA)-L(fwdPxQx)
	.int	L(P5QA)-L(fwdPxQx)
	.int	L(P6QA)-L(fwdPxQx)
	.int	L(P7QA)-L(fwdPxQx)

	.int	L(P0QB)-L(fwdPxQx)
	.int	L(P1QB)-L(fwdPxQx)
	.int	L(P2QB)-L(fwdPxQx)
	.int	L(P3QB)-L(fwdPxQx)
	.int	L(P4QB)-L(fwdPxQx)
	.int	L(P5QB)-L(fwdPxQx)
	.int	L(P6QB)-L(fwdPxQx)
	.int	L(P7QB)-L(fwdPxQx)

	.int
L(P0QC)-L(fwdPxQx) 239 .int L(P1QC)-L(fwdPxQx) 240 .int L(P2QC)-L(fwdPxQx) 241 .int L(P3QC)-L(fwdPxQx) 242 .int L(P4QC)-L(fwdPxQx) 243 .int L(P5QC)-L(fwdPxQx) 244 .int L(P6QC)-L(fwdPxQx) 245 .int L(P7QC)-L(fwdPxQx) 246 247 .int L(P0QD)-L(fwdPxQx) 248 .int L(P1QD)-L(fwdPxQx) 249 .int L(P2QD)-L(fwdPxQx) 250 .int L(P3QD)-L(fwdPxQx) 251 .int L(P4QD)-L(fwdPxQx) 252 .int L(P5QD)-L(fwdPxQx) 253 .int L(P6QD)-L(fwdPxQx) 254 .int L(P7QD)-L(fwdPxQx) 255 256 .int L(P0QE)-L(fwdPxQx) 257 .int L(P1QE)-L(fwdPxQx) 258 .int L(P2QE)-L(fwdPxQx) 259 .int L(P3QE)-L(fwdPxQx) 260 .int L(P4QE)-L(fwdPxQx) 261 .int L(P5QE)-L(fwdPxQx) 262 .int L(P6QE)-L(fwdPxQx) 263 .int L(P7QE)-L(fwdPxQx) 264 265 .int L(P0QF)-L(fwdPxQx) 266 .int L(P1QF)-L(fwdPxQx) 267 .int L(P2QF)-L(fwdPxQx) 268 .int L(P3QF)-L(fwdPxQx) 269 .int L(P4QF)-L(fwdPxQx) 270 .int L(P5QF)-L(fwdPxQx) 271 .int L(P6QF)-L(fwdPxQx) 272 .int L(P7QF)-L(fwdPxQx) 273 274 .int L(P0QG)-L(fwdPxQx) # 0x80 275 276 .balign 16 277L(AliPxQx): .int L(now_qw_aligned)-L(AliPxQx) 278 .int L(A1Q0)-L(AliPxQx) 279 .int L(A2Q0)-L(AliPxQx) 280 .int L(A3Q0)-L(AliPxQx) 281 .int L(A4Q0)-L(AliPxQx) 282 .int L(A5Q0)-L(AliPxQx) 283 .int L(A6Q0)-L(AliPxQx) 284 .int L(A7Q0)-L(AliPxQx) 285 .int L(A0Q1)-L(AliPxQx) 286 .int L(A1Q1)-L(AliPxQx) 287 .int L(A2Q1)-L(AliPxQx) 288 .int L(A3Q1)-L(AliPxQx) 289 .int L(A4Q1)-L(AliPxQx) 290 .int L(A5Q1)-L(AliPxQx) 291 .int L(A6Q1)-L(AliPxQx) 292 .int L(A7Q1)-L(AliPxQx) 293 294 .balign 16 295L(A1Q0): # ; need to move 8+ 7=1+2+4 bytes 296 movzbq (%rdx),%r11 297 sub $0xf,%r8 298 mov %r11b,(%rcx) 299 300 movzwq 0x1(%rdx),%r10 301 mov %r10w,0x1(%rcx) 302 303 mov 0x3(%rdx),%r9d 304 mov %r9d,0x3(%rcx) 305 306 mov 0x7(%rdx),%r11 307 add $0xf,%rdx 308 mov %r11,0x7(%rcx) 309 310 add $0xf,%rcx 311 jmp L(now_qw_aligned) 312 313 .balign 16 314L(A2Q0): # ; need to move 8+ 6=2+4 bytes 315 movzwq (%rdx),%r10 316 sub $0xe,%r8 317 mov %r10w,(%rcx) 318 319 mov 0x2(%rdx),%r9d 320 mov %r9d,0x2(%rcx) 321 322 mov 0x6(%rdx),%r11 323 add $0xe,%rdx 324 mov %r11,0x6(%rcx) 325 add $0xe,%rcx 326 jmp L(now_qw_aligned) 327 328 .balign 16 329L(A3Q0): # ; need to move 8+ 5=1+4 bytes 330 movzbq (%rdx),%r11 331 sub $0xd,%r8 332 mov %r11b,(%rcx) 333 334 mov 0x1(%rdx),%r9d 335 mov %r9d,0x1(%rcx) 336 337 mov 0x5(%rdx),%r10 338 add $0xd,%rdx 339 mov %r10,0x5(%rcx) 340 341 add $0xd,%rcx 342 jmp L(now_qw_aligned) 343 344 .balign 16 345L(A4Q0): # ; need to move 8+4 bytes 346 mov (%rdx),%r9d 347 sub $0xc,%r8 348 mov %r9d,(%rcx) 349 350 mov 0x4(%rdx),%r10 351 add $0xc,%rdx 352 mov %r10,0x4(%rcx) 353 354 add $0xc,%rcx 355 jmp L(now_qw_aligned) 356 357 .balign 16 358L(A5Q0): # ; need to move 8+ 3=1+2 bytes 359 movzbq (%rdx),%r11 360 sub $0xb,%r8 361 mov %r11b,(%rcx) 362 363 movzwq 0x1(%rdx),%r10 364 mov %r10w,0x1(%rcx) 365 366 mov 0x3(%rdx),%r9 367 add $0xb,%rdx 368 mov %r9,0x3(%rcx) 369 370 add $0xb,%rcx 371 jmp L(now_qw_aligned) 372 373 .balign 16 374L(A6Q0): # ; need to move 8+2 bytes 375 movzwq (%rdx),%r10 376 sub $0xa,%r8 377 mov %r10w,(%rcx) 378 379 mov 0x2(%rdx),%r9 380 add $0xa,%rdx 381 mov %r9,0x2(%rcx) 382 383 add $0xa,%rcx 384 jmp L(now_qw_aligned) 385 386 .balign 16 387L(A7Q0): # ; need to move 8+1 byte 388 movzbq (%rdx),%r11 389 sub $0x9,%r8 390 mov %r11b,(%rcx) 391 392 mov 0x1(%rdx),%r10 393 add $0x9,%rdx 394 mov %r10,0x1(%rcx) 395 396 add $0x9,%rcx 397 jmp L(now_qw_aligned) 398 399 .balign 16 400L(A0Q1): # ; need to move 8 bytes 401 402 mov (%rdx),%r10 403 add $0x8,%rdx 404 sub $0x8,%r8 405 mov %r10,(%rcx) 406 407 add $0x8,%rcx 408 jmp L(now_qw_aligned) 409 410 .balign 16 411L(A1Q1): # 
; need to move 7=1+2+4 bytes 412 movzbq (%rdx),%r11 413 sub $0x7,%r8 414 mov %r11b,(%rcx) 415 416 movzwq 0x1(%rdx),%r10 417 mov %r10w,0x1(%rcx) 418 419 mov 0x3(%rdx),%r9d 420 add $0x7,%rdx 421 mov %r9d,0x3(%rcx) 422 add $0x7,%rcx 423 jmp L(now_qw_aligned) 424 425 .balign 16 426L(A2Q1): # ; need to move 6=2+4 bytes 427 movzwq (%rdx),%r10 428 sub $0x6,%r8 429 mov %r10w,(%rcx) 430 mov 0x2(%rdx),%r9d 431 add $0x6,%rdx 432 mov %r9d,0x2(%rcx) 433 add $0x6,%rcx 434 jmp L(now_qw_aligned) 435 436 .balign 16 437L(A3Q1): # ; need to move 5=1+4 bytes 438 movzbq (%rdx),%r11 439 sub $0x5,%r8 440 mov %r11b,(%rcx) 441 mov 0x1(%rdx),%r9d 442 add $0x5,%rdx 443 mov %r9d,0x1(%rcx) 444 add $0x5,%rcx 445 jmp L(now_qw_aligned) 446 447 .balign 16 448L(A4Q1): # ; need to move 4 bytes 449 mov (%rdx),%r9d 450 sub $0x4,%r8 451 add $0x4,%rdx 452 mov %r9d,(%rcx) 453 add $0x4,%rcx 454 jmp L(now_qw_aligned) 455 456 .balign 16 457L(A5Q1): # ; need to move 3=1+2 bytes 458 movzbq (%rdx),%r11 459 sub $0x3,%r8 460 mov %r11b,(%rcx) 461 462 movzwq 0x1(%rdx),%r10 463 add $0x3,%rdx 464 mov %r10w,0x1(%rcx) 465 466 add $0x3,%rcx 467 jmp L(now_qw_aligned) 468 469 .balign 16 470L(A6Q1): # ; need to move 2 bytes 471 movzwq (%rdx),%r10 472 sub $0x2,%r8 473 add $0x2,%rdx 474 mov %r10w,(%rcx) 475 add $0x2,%rcx 476 jmp L(now_qw_aligned) 477 478 .balign 16 479L(A7Q1): # ; need to move 1 byte 480 movzbq (%rdx),%r11 481 dec %r8 482 inc %rdx 483 mov %r11b,(%rcx) 484 inc %rcx 485 jmp L(now_qw_aligned) 486 487 488 .balign 16 489L(P0QG): 490 mov -0x80(%rdx),%r9 491 mov %r9,-0x80(%rcx) 492L(P0QF): 493 mov -0x78(%rdx),%r10 494 mov %r10,-0x78(%rcx) 495L(P0QE): 496 mov -0x70(%rdx),%r9 497 mov %r9,-0x70(%rcx) 498L(P0QD): 499 mov -0x68(%rdx),%r10 500 mov %r10,-0x68(%rcx) 501L(P0QC): 502 mov -0x60(%rdx),%r9 503 mov %r9,-0x60(%rcx) 504L(P0QB): 505 mov -0x58(%rdx),%r10 506 mov %r10,-0x58(%rcx) 507L(P0QA): 508 mov -0x50(%rdx),%r9 509 mov %r9,-0x50(%rcx) 510L(P0Q9): 511 mov -0x48(%rdx),%r10 512 mov %r10,-0x48(%rcx) 513L(P0Q8): 514 mov -0x40(%rdx),%r9 515 mov %r9,-0x40(%rcx) 516L(P0Q7): 517 mov -0x38(%rdx),%r10 518 mov %r10,-0x38(%rcx) 519L(P0Q6): 520 mov -0x30(%rdx),%r9 521 mov %r9,-0x30(%rcx) 522L(P0Q5): 523 mov -0x28(%rdx),%r10 524 mov %r10,-0x28(%rcx) 525L(P0Q4): 526 mov -0x20(%rdx),%r9 527 mov %r9,-0x20(%rcx) 528L(P0Q3): 529 mov -0x18(%rdx),%r10 530 mov %r10,-0x18(%rcx) 531L(P0Q2): 532 mov -0x10(%rdx),%r9 533 mov %r9,-0x10(%rcx) 534L(P0Q1): 535 mov -0x8(%rdx),%r10 536 mov %r10,-0x8(%rcx) 537L(P0Q0): 538 ret 539 540 .balign 16 541L(P1QF): 542 mov -0x79(%rdx),%r9 543 mov %r9,-0x79(%rcx) 544L(P1QE): 545 mov -0x71(%rdx),%r11 546 mov %r11,-0x71(%rcx) 547L(P1QD): 548 mov -0x69(%rdx),%r10 549 mov %r10,-0x69(%rcx) 550L(P1QC): 551 mov -0x61(%rdx),%r9 552 mov %r9,-0x61(%rcx) 553L(P1QB): 554 mov -0x59(%rdx),%r11 555 mov %r11,-0x59(%rcx) 556L(P1QA): 557 mov -0x51(%rdx),%r10 558 mov %r10,-0x51(%rcx) 559L(P1Q9): 560 mov -0x49(%rdx),%r9 561 mov %r9,-0x49(%rcx) 562L(P1Q8): 563 mov -0x41(%rdx),%r11 564 mov %r11,-0x41(%rcx) 565L(P1Q7): 566 mov -0x39(%rdx),%r10 567 mov %r10,-0x39(%rcx) 568L(P1Q6): 569 mov -0x31(%rdx),%r9 570 mov %r9,-0x31(%rcx) 571L(P1Q5): 572 mov -0x29(%rdx),%r11 573 mov %r11,-0x29(%rcx) 574L(P1Q4): 575 mov -0x21(%rdx),%r10 576 mov %r10,-0x21(%rcx) 577L(P1Q3): 578 mov -0x19(%rdx),%r9 579 mov %r9,-0x19(%rcx) 580L(P1Q2): 581 mov -0x11(%rdx),%r11 582 mov %r11,-0x11(%rcx) 583L(P1Q1): 584 mov -0x9(%rdx),%r10 585 mov %r10,-0x9(%rcx) 586L(P1Q0): 587 movzbq -0x1(%rdx),%r9 588 mov %r9b,-0x1(%rcx) 589 ret 590 591 .balign 16 592L(P2QF): 593 mov -0x7a(%rdx),%r9 594 
mov %r9,-0x7a(%rcx) 595L(P2QE): 596 mov -0x72(%rdx),%r11 597 mov %r11,-0x72(%rcx) 598L(P2QD): 599 mov -0x6a(%rdx),%r10 600 mov %r10,-0x6a(%rcx) 601L(P2QC): 602 mov -0x62(%rdx),%r9 603 mov %r9,-0x62(%rcx) 604L(P2QB): 605 mov -0x5a(%rdx),%r11 606 mov %r11,-0x5a(%rcx) 607L(P2QA): 608 mov -0x52(%rdx),%r10 609 mov %r10,-0x52(%rcx) 610L(P2Q9): 611 mov -0x4a(%rdx),%r9 612 mov %r9,-0x4a(%rcx) 613L(P2Q8): 614 mov -0x42(%rdx),%r11 615 mov %r11,-0x42(%rcx) 616L(P2Q7): 617 mov -0x3a(%rdx),%r10 618 mov %r10,-0x3a(%rcx) 619L(P2Q6): 620 mov -0x32(%rdx),%r9 621 mov %r9,-0x32(%rcx) 622L(P2Q5): 623 mov -0x2a(%rdx),%r11 624 mov %r11,-0x2a(%rcx) 625L(P2Q4): 626 mov -0x22(%rdx),%r10 627 mov %r10,-0x22(%rcx) 628L(P2Q3): 629 mov -0x1a(%rdx),%r9 630 mov %r9,-0x1a(%rcx) 631L(P2Q2): 632 mov -0x12(%rdx),%r11 633 mov %r11,-0x12(%rcx) 634L(P2Q1): 635 mov -0xa(%rdx),%r10 636 mov %r10,-0xa(%rcx) 637L(P2Q0): 638 movzwq -0x2(%rdx),%r9 639 mov %r9w,-0x2(%rcx) 640 ret 641 642 .balign 16 643L(P3QF): 644 mov -0x7b(%rdx),%r9 645 mov %r9,-0x7b(%rcx) 646L(P3QE): 647 mov -0x73(%rdx),%r11 648 mov %r11,-0x73(%rcx) 649L(P3QD): 650 mov -0x6b(%rdx),%r10 651 mov %r10,-0x6b(%rcx) 652L(P3QC): 653 mov -0x63(%rdx),%r9 654 mov %r9,-0x63(%rcx) 655L(P3QB): 656 mov -0x5b(%rdx),%r11 657 mov %r11,-0x5b(%rcx) 658L(P3QA): 659 mov -0x53(%rdx),%r10 660 mov %r10,-0x53(%rcx) 661L(P3Q9): 662 mov -0x4b(%rdx),%r9 663 mov %r9,-0x4b(%rcx) 664L(P3Q8): 665 mov -0x43(%rdx),%r11 666 mov %r11,-0x43(%rcx) 667L(P3Q7): 668 mov -0x3b(%rdx),%r10 669 mov %r10,-0x3b(%rcx) 670L(P3Q6): 671 mov -0x33(%rdx),%r9 672 mov %r9,-0x33(%rcx) 673L(P3Q5): 674 mov -0x2b(%rdx),%r11 675 mov %r11,-0x2b(%rcx) 676L(P3Q4): 677 mov -0x23(%rdx),%r10 678 mov %r10,-0x23(%rcx) 679L(P3Q3): 680 mov -0x1b(%rdx),%r9 681 mov %r9,-0x1b(%rcx) 682L(P3Q2): 683 mov -0x13(%rdx),%r11 684 mov %r11,-0x13(%rcx) 685L(P3Q1): 686 mov -0xb(%rdx),%r10 687 mov %r10,-0xb(%rcx) 688 /* 689 * These trailing loads/stores have to do all their loads 1st, 690 * then do the stores. 
691 */ 692L(P3Q0): 693 movzwq -0x3(%rdx),%r9 694 movzbq -0x1(%rdx),%r10 695 mov %r9w,-0x3(%rcx) 696 mov %r10b,-0x1(%rcx) 697 ret 698 699 .balign 16 700L(P4QF): 701 mov -0x7c(%rdx),%r9 702 mov %r9,-0x7c(%rcx) 703L(P4QE): 704 mov -0x74(%rdx),%r11 705 mov %r11,-0x74(%rcx) 706L(P4QD): 707 mov -0x6c(%rdx),%r10 708 mov %r10,-0x6c(%rcx) 709L(P4QC): 710 mov -0x64(%rdx),%r9 711 mov %r9,-0x64(%rcx) 712L(P4QB): 713 mov -0x5c(%rdx),%r11 714 mov %r11,-0x5c(%rcx) 715L(P4QA): 716 mov -0x54(%rdx),%r10 717 mov %r10,-0x54(%rcx) 718L(P4Q9): 719 mov -0x4c(%rdx),%r9 720 mov %r9,-0x4c(%rcx) 721L(P4Q8): 722 mov -0x44(%rdx),%r11 723 mov %r11,-0x44(%rcx) 724L(P4Q7): 725 mov -0x3c(%rdx),%r10 726 mov %r10,-0x3c(%rcx) 727L(P4Q6): 728 mov -0x34(%rdx),%r9 729 mov %r9,-0x34(%rcx) 730L(P4Q5): 731 mov -0x2c(%rdx),%r11 732 mov %r11,-0x2c(%rcx) 733L(P4Q4): 734 mov -0x24(%rdx),%r10 735 mov %r10,-0x24(%rcx) 736L(P4Q3): 737 mov -0x1c(%rdx),%r9 738 mov %r9,-0x1c(%rcx) 739L(P4Q2): 740 mov -0x14(%rdx),%r11 741 mov %r11,-0x14(%rcx) 742L(P4Q1): 743 mov -0xc(%rdx),%r10 744 mov %r10,-0xc(%rcx) 745L(P4Q0): 746 mov -0x4(%rdx),%r9d 747 mov %r9d,-0x4(%rcx) 748 ret 749 750 .balign 16 751L(P5QF): 752 mov -0x7d(%rdx),%r9 753 mov %r9,-0x7d(%rcx) 754L(P5QE): 755 mov -0x75(%rdx),%r11 756 mov %r11,-0x75(%rcx) 757L(P5QD): 758 mov -0x6d(%rdx),%r10 759 mov %r10,-0x6d(%rcx) 760L(P5QC): 761 mov -0x65(%rdx),%r9 762 mov %r9,-0x65(%rcx) 763L(P5QB): 764 mov -0x5d(%rdx),%r11 765 mov %r11,-0x5d(%rcx) 766L(P5QA): 767 mov -0x55(%rdx),%r10 768 mov %r10,-0x55(%rcx) 769L(P5Q9): 770 mov -0x4d(%rdx),%r9 771 mov %r9,-0x4d(%rcx) 772L(P5Q8): 773 mov -0x45(%rdx),%r11 774 mov %r11,-0x45(%rcx) 775L(P5Q7): 776 mov -0x3d(%rdx),%r10 777 mov %r10,-0x3d(%rcx) 778L(P5Q6): 779 mov -0x35(%rdx),%r9 780 mov %r9,-0x35(%rcx) 781L(P5Q5): 782 mov -0x2d(%rdx),%r11 783 mov %r11,-0x2d(%rcx) 784L(P5Q4): 785 mov -0x25(%rdx),%r10 786 mov %r10,-0x25(%rcx) 787L(P5Q3): 788 mov -0x1d(%rdx),%r9 789 mov %r9,-0x1d(%rcx) 790L(P5Q2): 791 mov -0x15(%rdx),%r11 792 mov %r11,-0x15(%rcx) 793L(P5Q1): 794 mov -0xd(%rdx),%r10 795 mov %r10,-0xd(%rcx) 796 /* 797 * These trailing loads/stores have to do all their loads 1st, 798 * then do the stores. 799 */ 800L(P5Q0): 801 mov -0x5(%rdx),%r9d 802 movzbq -0x1(%rdx),%r10 803 mov %r9d,-0x5(%rcx) 804 mov %r10b,-0x1(%rcx) 805 ret 806 807 .balign 16 808L(P6QF): 809 mov -0x7e(%rdx),%r9 810 mov %r9,-0x7e(%rcx) 811L(P6QE): 812 mov -0x76(%rdx),%r11 813 mov %r11,-0x76(%rcx) 814L(P6QD): 815 mov -0x6e(%rdx),%r10 816 mov %r10,-0x6e(%rcx) 817L(P6QC): 818 mov -0x66(%rdx),%r9 819 mov %r9,-0x66(%rcx) 820L(P6QB): 821 mov -0x5e(%rdx),%r11 822 mov %r11,-0x5e(%rcx) 823L(P6QA): 824 mov -0x56(%rdx),%r10 825 mov %r10,-0x56(%rcx) 826L(P6Q9): 827 mov -0x4e(%rdx),%r9 828 mov %r9,-0x4e(%rcx) 829L(P6Q8): 830 mov -0x46(%rdx),%r11 831 mov %r11,-0x46(%rcx) 832L(P6Q7): 833 mov -0x3e(%rdx),%r10 834 mov %r10,-0x3e(%rcx) 835L(P6Q6): 836 mov -0x36(%rdx),%r9 837 mov %r9,-0x36(%rcx) 838L(P6Q5): 839 mov -0x2e(%rdx),%r11 840 mov %r11,-0x2e(%rcx) 841L(P6Q4): 842 mov -0x26(%rdx),%r10 843 mov %r10,-0x26(%rcx) 844L(P6Q3): 845 mov -0x1e(%rdx),%r9 846 mov %r9,-0x1e(%rcx) 847L(P6Q2): 848 mov -0x16(%rdx),%r11 849 mov %r11,-0x16(%rcx) 850L(P6Q1): 851 mov -0xe(%rdx),%r10 852 mov %r10,-0xe(%rcx) 853 /* 854 * These trailing loads/stores have to do all their loads 1st, 855 * then do the stores. 
856 */ 857L(P6Q0): 858 mov -0x6(%rdx),%r9d 859 movzwq -0x2(%rdx),%r10 860 mov %r9d,-0x6(%rcx) 861 mov %r10w,-0x2(%rcx) 862 ret 863 864 .balign 16 865L(P7QF): 866 mov -0x7f(%rdx),%r9 867 mov %r9,-0x7f(%rcx) 868L(P7QE): 869 mov -0x77(%rdx),%r11 870 mov %r11,-0x77(%rcx) 871L(P7QD): 872 mov -0x6f(%rdx),%r10 873 mov %r10,-0x6f(%rcx) 874L(P7QC): 875 mov -0x67(%rdx),%r9 876 mov %r9,-0x67(%rcx) 877L(P7QB): 878 mov -0x5f(%rdx),%r11 879 mov %r11,-0x5f(%rcx) 880L(P7QA): 881 mov -0x57(%rdx),%r10 882 mov %r10,-0x57(%rcx) 883L(P7Q9): 884 mov -0x4f(%rdx),%r9 885 mov %r9,-0x4f(%rcx) 886L(P7Q8): 887 mov -0x47(%rdx),%r11 888 mov %r11,-0x47(%rcx) 889L(P7Q7): 890 mov -0x3f(%rdx),%r10 891 mov %r10,-0x3f(%rcx) 892L(P7Q6): 893 mov -0x37(%rdx),%r9 894 mov %r9,-0x37(%rcx) 895L(P7Q5): 896 mov -0x2f(%rdx),%r11 897 mov %r11,-0x2f(%rcx) 898L(P7Q4): 899 mov -0x27(%rdx),%r10 900 mov %r10,-0x27(%rcx) 901L(P7Q3): 902 mov -0x1f(%rdx),%r9 903 mov %r9,-0x1f(%rcx) 904L(P7Q2): 905 mov -0x17(%rdx),%r11 906 mov %r11,-0x17(%rcx) 907L(P7Q1): 908 mov -0xf(%rdx),%r10 909 mov %r10,-0xf(%rcx) 910 /* 911 * These trailing loads/stores have to do all their loads 1st, 912 * then do the stores. 913 */ 914L(P7Q0): 915 mov -0x7(%rdx),%r9d 916 movzwq -0x3(%rdx),%r10 917 movzbq -0x1(%rdx),%r11 918 mov %r9d,-0x7(%rcx) 919 mov %r10w,-0x3(%rcx) 920 mov %r11b,-0x1(%rcx) 921 ret 922 923 .balign 16 924L(ck_use_sse2): 925 /* 926 * Align dest to 16 byte boundary. 927 */ 928 test $0xf,%rcx 929 jnz L(ShrtAlignNew) 930 931L(now_qw_aligned): 932 cmpl $NO_SSE,.memops_method(%rip) 933 je L(Loop8byte_pre) 934 935 /* 936 * The fall-through path is to do SSE2 16-byte load/stores 937 */ 938 939 /* 940 * If current move size is larger than half of the highest level cache 941 * size, then do non-temporal moves. 942 */ 943 mov .largest_level_cache_size(%rip),%r9d 944 shr %r9 # take half of it 945 cmp %r9,%r8 946 jg L(sse2_nt_move) 947 948 /* 949 * If both the source and dest are aligned, then use the both aligned 950 * logic. Well aligned data should reap the rewards. 951 */ 952 test $0xf,%rdx 953 jz L(pre_both_aligned) 954 955 lea L(SSE_src)(%rip),%r10 # SSE2 (default) 956 testl $USE_SSSE3,.memops_method(%rip) 957 jz 1f 958 lea L(SSSE3_src)(%rip),%r10 # SSSE3 959 9601: 961 /* 962 * if the src is not 16 byte aligned... 
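 *
 * Illustration only, a hedged intrinsics sketch (assuming <tmmintrin.h>,
 * with p a const char pointer to the source rounded down to a 16-byte
 * boundary); it is not the code below.  With the source, say, 5 bytes
 * past that boundary, each 16-byte chunk can be built from two aligned
 * loads:
 *
 *	__m128i lo  = _mm_load_si128((const __m128i *)p);
 *	__m128i hi  = _mm_load_si128((const __m128i *)(p + 16));
 *	__m128i out = _mm_alignr_epi8(hi, lo, 5);
 *
 * "out" then holds source bytes p+5 through p+20.  Because the palignr
 * shift count is an immediate, the code below keeps one loop per possible
 * source offset (1 through 15); the SSE2-only loops build the same value
 * with psrldq/pslldq/por instead of palignr.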
963 */ 964 mov %rdx,%r11 965 and $0xf,%r11 966 movdqu (%rdx),%xmm0 967 movdqa %xmm0,(%rcx) 968 add $0x10,%rdx 969 sub %r11,%rdx 970 add $0x10,%rcx 971 sub $0x10,%r8 972 movdqa (%rdx),%xmm1 973 974 movslq (%r10,%r11,4),%r9 975 lea (%r9,%r10,1),%r10 976 jmpq *%r10 977 978 .balign 16 979L(SSSE3_src): .int L(pre_both_aligned)-L(SSSE3_src) 980 .int L(mov3dqa1) -L(SSSE3_src) 981 .int L(mov3dqa2) -L(SSSE3_src) 982 .int L(mov3dqa3) -L(SSSE3_src) 983 .int L(mov3dqa4) -L(SSSE3_src) 984 .int L(mov3dqa5) -L(SSSE3_src) 985 .int L(mov3dqa6) -L(SSSE3_src) 986 .int L(mov3dqa7) -L(SSSE3_src) 987 .int L(movdqa8) -L(SSSE3_src) 988 .int L(mov3dqa9) -L(SSSE3_src) 989 .int L(mov3dqa10)-L(SSSE3_src) 990 .int L(mov3dqa11)-L(SSSE3_src) 991 .int L(mov3dqa12)-L(SSSE3_src) 992 .int L(mov3dqa13)-L(SSSE3_src) 993 .int L(mov3dqa14)-L(SSSE3_src) 994 .int L(mov3dqa15)-L(SSSE3_src) 995L(SSE_src): .int L(pre_both_aligned)-L(SSE_src) 996 .int L(movdqa1) -L(SSE_src) 997 .int L(movdqa2) -L(SSE_src) 998 .int L(movdqa3) -L(SSE_src) 999 .int L(movdqa4) -L(SSE_src) 1000 .int L(movdqa5) -L(SSE_src) 1001 .int L(movdqa6) -L(SSE_src) 1002 .int L(movdqa7) -L(SSE_src) 1003 .int L(movdqa8) -L(SSE_src) 1004 .int L(movdqa9) -L(SSE_src) 1005 .int L(movdqa10)-L(SSE_src) 1006 .int L(movdqa11)-L(SSE_src) 1007 .int L(movdqa12)-L(SSE_src) 1008 .int L(movdqa13)-L(SSE_src) 1009 .int L(movdqa14)-L(SSE_src) 1010 .int L(movdqa15)-L(SSE_src) 1011 1012 .balign 16 1013L(movdqa1): 1014 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1015 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1016 lea 0x20(%rdx),%rdx 1017 lea -0x20(%r8),%r8 1018 1019 psrldq $0x1,%xmm1 # shift right prev buffer (saved from last iteration) 1020 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1021 pslldq $0xf,%xmm3 # shift the current buffer left (shift in zeros) 1022 por %xmm1,%xmm3 # OR them together 1023 cmp $0x20,%r8 1024 1025 psrldq $0x1,%xmm2 # shift right prev buffer (saved from last iteration) 1026 movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration 1027 pslldq $0xf,%xmm0 # shift the current buffer left (shift in zeros) 1028 por %xmm2,%xmm0 # OR them together 1029 movdqa %xmm3,(%rcx) # store it 1030 movdqa %xmm0,0x10(%rcx) # store it 1031 lea 0x20(%rcx),%rcx 1032 1033 jge L(movdqa1) 1034 jmp L(movdqa_epi) 1035 1036 .balign 16 1037L(movdqa2): 1038 sub $0x20,%r8 1039 movdqa 0x10(%rdx),%xmm3 1040 movdqa 0x20(%rdx),%xmm0 1041 add $0x20,%rdx 1042 1043 psrldq $0x2,%xmm1 1044 movdqa %xmm3,%xmm2 1045 pslldq $0xe,%xmm3 1046 por %xmm1,%xmm3 1047 1048 psrldq $0x2,%xmm2 1049 movdqa %xmm0,%xmm1 1050 pslldq $0xe,%xmm0 1051 por %xmm2,%xmm0 1052 movdqa %xmm3,(%rcx) 1053 movdqa %xmm0,0x10(%rcx) 1054 1055 add $0x20,%rcx 1056 cmp $0x20,%r8 1057 jge L(movdqa2) 1058 jmp L(movdqa_epi) 1059 1060 .balign 16 1061L(movdqa3): 1062 sub $0x20,%r8 1063 movdqa 0x10(%rdx),%xmm3 1064 movdqa 0x20(%rdx),%xmm0 1065 add $0x20,%rdx 1066 1067 psrldq $0x3,%xmm1 1068 movdqa %xmm3,%xmm2 1069 pslldq $0xd,%xmm3 1070 por %xmm1,%xmm3 1071 1072 psrldq $0x3,%xmm2 1073 movdqa %xmm0,%xmm1 1074 pslldq $0xd,%xmm0 1075 por %xmm2,%xmm0 1076 movdqa %xmm3,(%rcx) 1077 movdqa %xmm0,0x10(%rcx) 1078 1079 add $0x20,%rcx 1080 cmp $0x20,%r8 1081 jge L(movdqa3) 1082 jmp L(movdqa_epi) 1083 1084 .balign 16 1085L(movdqa4): 1086 sub $0x20,%r8 1087 movdqa 0x10(%rdx),%xmm3 1088 movdqa 0x20(%rdx),%xmm0 1089 add $0x20,%rdx 1090 1091 psrldq $0x4,%xmm1 1092 movdqa %xmm3,%xmm2 1093 pslldq $0xc,%xmm3 1094 por %xmm1,%xmm3 1095 1096 psrldq $0x4,%xmm2 1097 movdqa %xmm0,%xmm1 1098 pslldq $0xc,%xmm0 1099 por 
%xmm2,%xmm0 1100 1101 movdqa %xmm3,(%rcx) 1102 movdqa %xmm0,0x10(%rcx) 1103 1104 add $0x20,%rcx 1105 cmp $0x20,%r8 1106 jge L(movdqa4) 1107 jmp L(movdqa_epi) 1108 1109 .balign 16 1110L(movdqa5): 1111 sub $0x20,%r8 1112 movdqa 0x10(%rdx),%xmm3 1113 movdqa 0x20(%rdx),%xmm0 1114 add $0x20,%rdx 1115 1116 psrldq $0x5,%xmm1 1117 movdqa %xmm3,%xmm2 1118 pslldq $0xb,%xmm3 1119 por %xmm1,%xmm3 1120 1121 psrldq $0x5,%xmm2 1122 movdqa %xmm0,%xmm1 1123 pslldq $0xb,%xmm0 1124 por %xmm2,%xmm0 1125 1126 movdqa %xmm3,(%rcx) 1127 movdqa %xmm0,0x10(%rcx) 1128 1129 add $0x20,%rcx 1130 cmp $0x20,%r8 1131 jge L(movdqa5) 1132 jmp L(movdqa_epi) 1133 1134 .balign 16 1135L(movdqa6): 1136 sub $0x20,%r8 1137 movdqa 0x10(%rdx),%xmm3 1138 movdqa 0x20(%rdx),%xmm0 1139 add $0x20,%rdx 1140 1141 psrldq $0x6,%xmm1 1142 movdqa %xmm3,%xmm2 1143 pslldq $0xa,%xmm3 1144 por %xmm1,%xmm3 1145 1146 psrldq $0x6,%xmm2 1147 movdqa %xmm0,%xmm1 1148 pslldq $0xa,%xmm0 1149 por %xmm2,%xmm0 1150 movdqa %xmm3,(%rcx) 1151 movdqa %xmm0,0x10(%rcx) 1152 1153 add $0x20,%rcx 1154 cmp $0x20,%r8 1155 jge L(movdqa6) 1156 jmp L(movdqa_epi) 1157 1158 .balign 16 1159L(movdqa7): 1160 sub $0x20,%r8 1161 movdqa 0x10(%rdx),%xmm3 1162 movdqa 0x20(%rdx),%xmm0 1163 add $0x20,%rdx 1164 1165 psrldq $0x7,%xmm1 1166 movdqa %xmm3,%xmm2 1167 pslldq $0x9,%xmm3 1168 por %xmm1,%xmm3 1169 1170 psrldq $0x7,%xmm2 1171 movdqa %xmm0,%xmm1 1172 pslldq $0x9,%xmm0 1173 por %xmm2,%xmm0 1174 movdqa %xmm3,(%rcx) 1175 movdqa %xmm0,0x10(%rcx) 1176 1177 add $0x20,%rcx 1178 cmp $0x20,%r8 1179 jge L(movdqa7) 1180 jmp L(movdqa_epi) 1181 1182 .balign 16 1183L(movdqa8): 1184 movdqa 0x10(%rdx),%xmm3 1185 sub $0x30,%r8 1186 movdqa 0x20(%rdx),%xmm0 1187 movdqa 0x30(%rdx),%xmm5 1188 lea 0x30(%rdx),%rdx 1189 1190 shufpd $0x1,%xmm3,%xmm1 1191 movdqa %xmm1,(%rcx) 1192 1193 cmp $0x30,%r8 1194 1195 shufpd $0x1,%xmm0,%xmm3 1196 movdqa %xmm3,0x10(%rcx) 1197 1198 movdqa %xmm5,%xmm1 1199 shufpd $0x1,%xmm5,%xmm0 1200 movdqa %xmm0,0x20(%rcx) 1201 1202 lea 0x30(%rcx),%rcx 1203 1204 jge L(movdqa8) 1205 jmp L(movdqa_epi) 1206 1207 .balign 16 1208L(movdqa9): 1209 sub $0x20,%r8 1210 movdqa 0x10(%rdx),%xmm3 1211 movdqa 0x20(%rdx),%xmm0 1212 add $0x20,%rdx 1213 1214 psrldq $0x9,%xmm1 1215 movdqa %xmm3,%xmm2 1216 pslldq $0x7,%xmm3 1217 por %xmm1,%xmm3 1218 1219 psrldq $0x9,%xmm2 1220 movdqa %xmm0,%xmm1 1221 pslldq $0x7,%xmm0 1222 por %xmm2,%xmm0 1223 movdqa %xmm3,(%rcx) 1224 movdqa %xmm0,0x10(%rcx) 1225 1226 add $0x20,%rcx 1227 cmp $0x20,%r8 1228 jge L(movdqa9) 1229 jmp L(movdqa_epi) 1230 1231 .balign 16 1232L(movdqa10): 1233 sub $0x20,%r8 1234 movdqa 0x10(%rdx),%xmm3 1235 movdqa 0x20(%rdx),%xmm0 1236 add $0x20,%rdx 1237 1238 psrldq $0xa,%xmm1 1239 movdqa %xmm3,%xmm2 1240 pslldq $0x6,%xmm3 1241 por %xmm1,%xmm3 1242 1243 psrldq $0xa,%xmm2 1244 movdqa %xmm0,%xmm1 1245 pslldq $0x6,%xmm0 1246 por %xmm2,%xmm0 1247 movdqa %xmm3,(%rcx) 1248 movdqa %xmm0,0x10(%rcx) 1249 1250 add $0x20,%rcx 1251 cmp $0x20,%r8 1252 jge L(movdqa10) 1253 jmp L(movdqa_epi) 1254 1255 .balign 16 1256L(movdqa11): 1257 sub $0x20,%r8 1258 movdqa 0x10(%rdx),%xmm3 1259 movdqa 0x20(%rdx),%xmm0 1260 add $0x20,%rdx 1261 1262 psrldq $0xb,%xmm1 1263 movdqa %xmm3,%xmm2 1264 pslldq $0x5,%xmm3 1265 por %xmm1,%xmm3 1266 1267 psrldq $0xb,%xmm2 1268 movdqa %xmm0,%xmm1 1269 pslldq $0x5,%xmm0 1270 por %xmm2,%xmm0 1271 movdqa %xmm3,(%rcx) 1272 movdqa %xmm0,0x10(%rcx) 1273 1274 add $0x20,%rcx 1275 cmp $0x20,%r8 1276 jge L(movdqa11) 1277 jmp L(movdqa_epi) 1278 1279 .balign 16 1280L(movdqa12): 1281 sub $0x20,%r8 1282 movdqa 0x10(%rdx),%xmm3 1283 movdqa 
0x20(%rdx),%xmm0 1284 add $0x20,%rdx 1285 1286 psrldq $0xc,%xmm1 1287 movdqa %xmm3,%xmm2 1288 pslldq $0x4,%xmm3 1289 por %xmm1,%xmm3 1290 1291 psrldq $0xc,%xmm2 1292 movdqa %xmm0,%xmm1 1293 pslldq $0x4,%xmm0 1294 por %xmm2,%xmm0 1295 movdqa %xmm3,(%rcx) 1296 movdqa %xmm0,0x10(%rcx) 1297 1298 add $0x20,%rcx 1299 cmp $0x20,%r8 1300 jge L(movdqa12) 1301 jmp L(movdqa_epi) 1302 1303 .balign 16 1304L(movdqa13): 1305 sub $0x20,%r8 1306 movdqa 0x10(%rdx),%xmm3 1307 movdqa 0x20(%rdx),%xmm0 1308 add $0x20,%rdx 1309 1310 psrldq $0xd,%xmm1 1311 movdqa %xmm3,%xmm2 1312 pslldq $0x3,%xmm3 1313 por %xmm1,%xmm3 1314 1315 psrldq $0xd,%xmm2 1316 movdqa %xmm0,%xmm1 1317 pslldq $0x3,%xmm0 1318 por %xmm2,%xmm0 1319 movdqa %xmm3,(%rcx) 1320 movdqa %xmm0,0x10(%rcx) 1321 1322 add $0x20,%rcx 1323 cmp $0x20,%r8 1324 jge L(movdqa13) 1325 jmp L(movdqa_epi) 1326 1327 .balign 16 1328L(movdqa14): 1329 sub $0x20,%r8 1330 movdqa 0x10(%rdx),%xmm3 1331 movdqa 0x20(%rdx),%xmm0 1332 add $0x20,%rdx 1333 1334 psrldq $0xe,%xmm1 1335 movdqa %xmm3,%xmm2 1336 pslldq $0x2,%xmm3 1337 por %xmm1,%xmm3 1338 1339 psrldq $0xe,%xmm2 1340 movdqa %xmm0,%xmm1 1341 pslldq $0x2,%xmm0 1342 por %xmm2,%xmm0 1343 movdqa %xmm3,(%rcx) 1344 movdqa %xmm0,0x10(%rcx) 1345 1346 add $0x20,%rcx 1347 cmp $0x20,%r8 1348 jge L(movdqa14) 1349 jmp L(movdqa_epi) 1350 1351 .balign 16 1352L(movdqa15): 1353 sub $0x20,%r8 1354 movdqa 0x10(%rdx),%xmm3 1355 movdqa 0x20(%rdx),%xmm0 1356 add $0x20,%rdx 1357 1358 psrldq $0xf,%xmm1 1359 movdqa %xmm3,%xmm2 1360 pslldq $0x1,%xmm3 1361 por %xmm1,%xmm3 1362 1363 psrldq $0xf,%xmm2 1364 movdqa %xmm0,%xmm1 1365 pslldq $0x1,%xmm0 1366 por %xmm2,%xmm0 1367 movdqa %xmm3,(%rcx) 1368 movdqa %xmm0,0x10(%rcx) 1369 1370 add $0x20,%rcx 1371 cmp $0x20,%r8 1372 jge L(movdqa15) 1373 #jmp L(movdqa_epi) 1374 1375 .balign 16 1376L(movdqa_epi): 1377 lea L(fwdPxQx)(%rip),%r10 1378 add %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop) 1379 add %r8,%rcx 1380 add %r8,%rdx 1381 1382 movslq (%r10,%r8,4),%r9 1383 lea (%r9,%r10,1),%r10 1384 jmpq *%r10 1385 1386 .balign 16 1387L(mov3dqa1): 1388 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1389 sub $0x30,%r8 1390 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1391 movdqa 0x30(%rdx),%xmm5 # load the upper source buffer 1392 lea 0x30(%rdx),%rdx 1393 cmp $0x30,%r8 1394 1395 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1396 #palignr $0x1,%xmm1,%xmm3 1397 .byte 0x66,0x0f,0x3a,0x0f 1398 .byte 0xd9,0x01 1399 movdqa %xmm3,(%rcx) # store it 1400 1401 movdqa %xmm0,%xmm4 # store off xmm reg for use next iteration 1402 #palignr $0x1,%xmm2,%xmm0 1403 .byte 0x66,0x0f,0x3a,0x0f 1404 .byte 0xc2,0x01 1405 movdqa %xmm0,0x10(%rcx) # store it 1406 1407 movdqa %xmm5,%xmm1 # store off xmm reg for use next iteration 1408 #palignr $0x1,%xmm4,%xmm5 1409 .byte 0x66,0x0f,0x3a,0x0f 1410 .byte 0xec,0x01 1411 movdqa %xmm5,0x20(%rcx) # store it 1412 1413 lea 0x30(%rcx),%rcx 1414 jge L(mov3dqa1) 1415 1416 cmp $0x10,%r8 1417 jl L(movdqa_epi) 1418 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1419 sub $0x10,%r8 1420 lea 0x10(%rdx),%rdx 1421 movdqa %xmm3,%xmm2 # save for use next concat 1422 #palignr $0x1,%xmm1,%xmm3 1423 .byte 0x66,0x0f,0x3a,0x0f 1424 .byte 0xd9,0x01 1425 1426 cmp $0x10,%r8 1427 movdqa %xmm3,(%rcx) # store it 1428 lea 0x10(%rcx),%rcx 1429 jl L(movdqa_epi) 1430 1431 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1432 sub $0x10,%r8 1433 lea 0x10(%rdx),%rdx 1434 #palignr $0x1,%xmm2,%xmm0 1435 .byte 0x66,0x0f,0x3a,0x0f 1436 .byte 0xc2,0x01 1437 
movdqa %xmm0,(%rcx) # store it 1438 lea 0x10(%rcx),%rcx 1439 jmp L(movdqa_epi) 1440 1441 .balign 16 1442L(mov3dqa2): 1443 movdqa 0x10(%rdx),%xmm3 1444 sub $0x30,%r8 1445 movdqa 0x20(%rdx),%xmm0 1446 movdqa 0x30(%rdx),%xmm5 1447 lea 0x30(%rdx),%rdx 1448 cmp $0x30,%r8 1449 1450 movdqa %xmm3,%xmm2 1451 #palignr $0x2,%xmm1,%xmm3 1452 .byte 0x66,0x0f,0x3a,0x0f 1453 .byte 0xd9,0x02 1454 movdqa %xmm3,(%rcx) 1455 1456 movdqa %xmm0,%xmm4 1457 #palignr $0x2,%xmm2,%xmm0 1458 .byte 0x66,0x0f,0x3a,0x0f 1459 .byte 0xc2,0x02 1460 movdqa %xmm0,0x10(%rcx) 1461 1462 movdqa %xmm5,%xmm1 1463 #palignr $0x2,%xmm4,%xmm5 1464 .byte 0x66,0x0f,0x3a,0x0f 1465 .byte 0xec,0x02 1466 movdqa %xmm5,0x20(%rcx) 1467 1468 lea 0x30(%rcx),%rcx 1469 jge L(mov3dqa2) 1470 1471 cmp $0x10,%r8 1472 jl L(movdqa_epi) 1473 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1474 sub $0x10,%r8 1475 lea 0x10(%rdx),%rdx 1476 movdqa %xmm3,%xmm2 # save for use next concat 1477 #palignr $0x2,%xmm1,%xmm3 1478 .byte 0x66,0x0f,0x3a,0x0f 1479 .byte 0xd9,0x02 1480 1481 cmp $0x10,%r8 1482 movdqa %xmm3,(%rcx) # store it 1483 lea 0x10(%rcx),%rcx 1484 jl L(movdqa_epi) 1485 1486 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1487 sub $0x10,%r8 1488 lea 0x10(%rdx),%rdx 1489 #palignr $0x2,%xmm2,%xmm0 1490 .byte 0x66,0x0f,0x3a,0x0f 1491 .byte 0xc2,0x02 1492 movdqa %xmm0,(%rcx) # store it 1493 lea 0x10(%rcx),%rcx 1494 jmp L(movdqa_epi) 1495 1496 .balign 16 1497L(mov3dqa3): 1498 movdqa 0x10(%rdx),%xmm3 1499 sub $0x30,%r8 1500 movdqa 0x20(%rdx),%xmm0 1501 movdqa 0x30(%rdx),%xmm5 1502 lea 0x30(%rdx),%rdx 1503 cmp $0x30,%r8 1504 1505 movdqa %xmm3,%xmm2 1506 #palignr $0x3,%xmm1,%xmm3 1507 .byte 0x66,0x0f,0x3a,0x0f 1508 .byte 0xd9,0x03 1509 movdqa %xmm3,(%rcx) 1510 1511 movdqa %xmm0,%xmm4 1512 #palignr $0x3,%xmm2,%xmm0 1513 .byte 0x66,0x0f,0x3a,0x0f 1514 .byte 0xc2,0x03 1515 movdqa %xmm0,0x10(%rcx) 1516 1517 movdqa %xmm5,%xmm1 1518 #palignr $0x3,%xmm4,%xmm5 1519 .byte 0x66,0x0f,0x3a,0x0f 1520 .byte 0xec,0x03 1521 movdqa %xmm5,0x20(%rcx) 1522 1523 lea 0x30(%rcx),%rcx 1524 jge L(mov3dqa3) 1525 1526 cmp $0x10,%r8 1527 jl L(movdqa_epi) 1528 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1529 sub $0x10,%r8 1530 lea 0x10(%rdx),%rdx 1531 movdqa %xmm3,%xmm2 # save for use next concat 1532 #palignr $0x3,%xmm1,%xmm3 1533 .byte 0x66,0x0f,0x3a,0x0f 1534 .byte 0xd9,0x03 1535 1536 cmp $0x10,%r8 1537 movdqa %xmm3,(%rcx) # store it 1538 lea 0x10(%rcx),%rcx 1539 jl L(movdqa_epi) 1540 1541 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1542 sub $0x10,%r8 1543 lea 0x10(%rdx),%rdx 1544 #palignr $0x3,%xmm2,%xmm0 1545 .byte 0x66,0x0f,0x3a,0x0f 1546 .byte 0xc2,0x03 1547 movdqa %xmm0,(%rcx) # store it 1548 lea 0x10(%rcx),%rcx 1549 jmp L(movdqa_epi) 1550 1551 .balign 16 1552L(mov3dqa4): 1553 movdqa 0x10(%rdx),%xmm3 1554 sub $0x30,%r8 1555 movdqa 0x20(%rdx),%xmm0 1556 movdqa 0x30(%rdx),%xmm5 1557 lea 0x30(%rdx),%rdx 1558 cmp $0x30,%r8 1559 1560 movdqa %xmm3,%xmm2 1561 #palignr $0x4,%xmm1,%xmm3 1562 .byte 0x66,0x0f,0x3a,0x0f 1563 .byte 0xd9,0x04 1564 movdqa %xmm3,(%rcx) 1565 1566 movdqa %xmm0,%xmm4 1567 #palignr $0x4,%xmm2,%xmm0 1568 .byte 0x66,0x0f,0x3a,0x0f 1569 .byte 0xc2,0x04 1570 movdqa %xmm0,0x10(%rcx) 1571 1572 movdqa %xmm5,%xmm1 1573 #palignr $0x4,%xmm4,%xmm5 1574 .byte 0x66,0x0f,0x3a,0x0f 1575 .byte 0xec,0x04 1576 movdqa %xmm5,0x20(%rcx) 1577 1578 lea 0x30(%rcx),%rcx 1579 jge L(mov3dqa4) 1580 1581 cmp $0x10,%r8 1582 jl L(movdqa_epi) 1583 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1584 sub $0x10,%r8 1585 lea 0x10(%rdx),%rdx 1586 movdqa 
%xmm3,%xmm2 # save for use next concat 1587 #palignr $0x4,%xmm1,%xmm3 1588 .byte 0x66,0x0f,0x3a,0x0f 1589 .byte 0xd9,0x04 1590 1591 cmp $0x10,%r8 1592 movdqa %xmm3,(%rcx) # store it 1593 lea 0x10(%rcx),%rcx 1594 jl L(movdqa_epi) 1595 1596 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1597 sub $0x10,%r8 1598 lea 0x10(%rdx),%rdx 1599 #palignr $0x4,%xmm2,%xmm0 1600 .byte 0x66,0x0f,0x3a,0x0f 1601 .byte 0xc2,0x04 1602 movdqa %xmm0,(%rcx) # store it 1603 lea 0x10(%rcx),%rcx 1604 jmp L(movdqa_epi) 1605 1606 .balign 16 1607L(mov3dqa5): 1608 movdqa 0x10(%rdx),%xmm3 1609 sub $0x30,%r8 1610 movdqa 0x20(%rdx),%xmm0 1611 movdqa 0x30(%rdx),%xmm5 1612 lea 0x30(%rdx),%rdx 1613 cmp $0x30,%r8 1614 1615 movdqa %xmm3,%xmm2 1616 #palignr $0x5,%xmm1,%xmm3 1617 .byte 0x66,0x0f,0x3a,0x0f 1618 .byte 0xd9,0x05 1619 movdqa %xmm3,(%rcx) 1620 1621 movdqa %xmm0,%xmm4 1622 #palignr $0x5,%xmm2,%xmm0 1623 .byte 0x66,0x0f,0x3a,0x0f 1624 .byte 0xc2,0x05 1625 movdqa %xmm0,0x10(%rcx) 1626 1627 movdqa %xmm5,%xmm1 1628 #palignr $0x5,%xmm4,%xmm5 1629 .byte 0x66,0x0f,0x3a,0x0f 1630 .byte 0xec,0x05 1631 movdqa %xmm5,0x20(%rcx) 1632 1633 lea 0x30(%rcx),%rcx 1634 jge L(mov3dqa5) 1635 1636 cmp $0x10,%r8 1637 jl L(movdqa_epi) 1638 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1639 sub $0x10,%r8 1640 lea 0x10(%rdx),%rdx 1641 movdqa %xmm3,%xmm2 # save for use next concat 1642 #palignr $0x5,%xmm1,%xmm3 1643 .byte 0x66,0x0f,0x3a,0x0f 1644 .byte 0xd9,0x05 1645 1646 cmp $0x10,%r8 1647 movdqa %xmm3,(%rcx) # store it 1648 lea 0x10(%rcx),%rcx 1649 jl L(movdqa_epi) 1650 1651 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1652 sub $0x10,%r8 1653 lea 0x10(%rdx),%rdx 1654 #palignr $0x5,%xmm2,%xmm0 1655 .byte 0x66,0x0f,0x3a,0x0f 1656 .byte 0xc2,0x05 1657 movdqa %xmm0,(%rcx) # store it 1658 lea 0x10(%rcx),%rcx 1659 jmp L(movdqa_epi) 1660 1661 .balign 16 1662L(mov3dqa6): 1663 movdqa 0x10(%rdx),%xmm3 1664 sub $0x30,%r8 1665 movdqa 0x20(%rdx),%xmm0 1666 movdqa 0x30(%rdx),%xmm5 1667 lea 0x30(%rdx),%rdx 1668 cmp $0x30,%r8 1669 1670 movdqa %xmm3,%xmm2 1671 #palignr $0x6,%xmm1,%xmm3 1672 .byte 0x66,0x0f,0x3a,0x0f 1673 .byte 0xd9,0x06 1674 movdqa %xmm3,(%rcx) 1675 1676 movdqa %xmm0,%xmm4 1677 #palignr $0x6,%xmm2,%xmm0 1678 .byte 0x66,0x0f,0x3a,0x0f 1679 .byte 0xc2,0x06 1680 movdqa %xmm0,0x10(%rcx) 1681 1682 movdqa %xmm5,%xmm1 1683 #palignr $0x6,%xmm4,%xmm5 1684 .byte 0x66,0x0f,0x3a,0x0f 1685 .byte 0xec,0x06 1686 movdqa %xmm5,0x20(%rcx) 1687 1688 lea 0x30(%rcx),%rcx 1689 jge L(mov3dqa6) 1690 1691 cmp $0x10,%r8 1692 jl L(movdqa_epi) 1693 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1694 sub $0x10,%r8 1695 lea 0x10(%rdx),%rdx 1696 movdqa %xmm3,%xmm2 # save for use next concat 1697 #palignr $0x6,%xmm1,%xmm3 1698 .byte 0x66,0x0f,0x3a,0x0f 1699 .byte 0xd9,0x06 1700 1701 cmp $0x10,%r8 1702 movdqa %xmm3,(%rcx) # store it 1703 lea 0x10(%rcx),%rcx 1704 jl L(movdqa_epi) 1705 1706 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1707 sub $0x10,%r8 1708 lea 0x10(%rdx),%rdx 1709 #palignr $0x6,%xmm2,%xmm0 1710 .byte 0x66,0x0f,0x3a,0x0f 1711 .byte 0xc2,0x06 1712 movdqa %xmm0,(%rcx) # store it 1713 lea 0x10(%rcx),%rcx 1714 jmp L(movdqa_epi) 1715 1716 .balign 16 1717L(mov3dqa7): 1718 movdqa 0x10(%rdx),%xmm3 1719 sub $0x30,%r8 1720 movdqa 0x20(%rdx),%xmm0 1721 movdqa 0x30(%rdx),%xmm5 1722 lea 0x30(%rdx),%rdx 1723 cmp $0x30,%r8 1724 1725 movdqa %xmm3,%xmm2 1726 #palignr $0x7,%xmm1,%xmm3 1727 .byte 0x66,0x0f,0x3a,0x0f 1728 .byte 0xd9,0x07 1729 movdqa %xmm3,(%rcx) 1730 1731 movdqa %xmm0,%xmm4 1732 #palignr $0x7,%xmm2,%xmm0 1733 .byte 
0x66,0x0f,0x3a,0x0f 1734 .byte 0xc2,0x07 1735 movdqa %xmm0,0x10(%rcx) 1736 1737 movdqa %xmm5,%xmm1 1738 #palignr $0x7,%xmm4,%xmm5 1739 .byte 0x66,0x0f,0x3a,0x0f 1740 .byte 0xec,0x07 1741 movdqa %xmm5,0x20(%rcx) 1742 1743 lea 0x30(%rcx),%rcx 1744 jge L(mov3dqa7) 1745 1746 cmp $0x10,%r8 1747 jl L(movdqa_epi) 1748 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1749 sub $0x10,%r8 1750 lea 0x10(%rdx),%rdx 1751 movdqa %xmm3,%xmm2 # save for use next concat 1752 #palignr $0x7,%xmm1,%xmm3 1753 .byte 0x66,0x0f,0x3a,0x0f 1754 .byte 0xd9,0x07 1755 1756 cmp $0x10,%r8 1757 movdqa %xmm3,(%rcx) # store it 1758 lea 0x10(%rcx),%rcx 1759 jl L(movdqa_epi) 1760 1761 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1762 sub $0x10,%r8 1763 lea 0x10(%rdx),%rdx 1764 #palignr $0x7,%xmm2,%xmm0 1765 .byte 0x66,0x0f,0x3a,0x0f 1766 .byte 0xc2,0x07 1767 movdqa %xmm0,(%rcx) # store it 1768 lea 0x10(%rcx),%rcx 1769 jmp L(movdqa_epi) 1770 1771 .balign 16 1772L(mov3dqa9): 1773 movdqa 0x10(%rdx),%xmm3 1774 sub $0x30,%r8 1775 movdqa 0x20(%rdx),%xmm0 1776 movdqa 0x30(%rdx),%xmm5 1777 lea 0x30(%rdx),%rdx 1778 cmp $0x30,%r8 1779 1780 movdqa %xmm3,%xmm2 1781 #palignr $0x9,%xmm1,%xmm3 1782 .byte 0x66,0x0f,0x3a,0x0f 1783 .byte 0xd9,0x09 1784 movdqa %xmm3,(%rcx) 1785 1786 movdqa %xmm0,%xmm4 1787 #palignr $0x9,%xmm2,%xmm0 1788 .byte 0x66,0x0f,0x3a,0x0f 1789 .byte 0xc2,0x09 1790 movdqa %xmm0,0x10(%rcx) 1791 1792 movdqa %xmm5,%xmm1 1793 #palignr $0x9,%xmm4,%xmm5 1794 .byte 0x66,0x0f,0x3a,0x0f 1795 .byte 0xec,0x09 1796 movdqa %xmm5,0x20(%rcx) 1797 1798 lea 0x30(%rcx),%rcx 1799 jge L(mov3dqa9) 1800 1801 cmp $0x10,%r8 1802 jl L(movdqa_epi) 1803 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1804 sub $0x10,%r8 1805 lea 0x10(%rdx),%rdx 1806 movdqa %xmm3,%xmm2 # save for use next concat 1807 #palignr $0x9,%xmm1,%xmm3 1808 .byte 0x66,0x0f,0x3a,0x0f 1809 .byte 0xd9,0x09 1810 1811 cmp $0x10,%r8 1812 movdqa %xmm3,(%rcx) # store it 1813 lea 0x10(%rcx),%rcx 1814 jl L(movdqa_epi) 1815 1816 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1817 sub $0x10,%r8 1818 lea 0x10(%rdx),%rdx 1819 #palignr $0x9,%xmm2,%xmm0 1820 .byte 0x66,0x0f,0x3a,0x0f 1821 .byte 0xc2,0x09 1822 movdqa %xmm0,(%rcx) # store it 1823 lea 0x10(%rcx),%rcx 1824 jmp L(movdqa_epi) 1825 1826 .balign 16 1827L(mov3dqa10): 1828 movdqa 0x10(%rdx),%xmm3 1829 sub $0x30,%r8 1830 movdqa 0x20(%rdx),%xmm0 1831 movdqa 0x30(%rdx),%xmm5 1832 lea 0x30(%rdx),%rdx 1833 cmp $0x30,%r8 1834 1835 movdqa %xmm3,%xmm2 1836 #palignr $0xa,%xmm1,%xmm3 1837 .byte 0x66,0x0f,0x3a,0x0f 1838 .byte 0xd9,0x0a 1839 movdqa %xmm3,(%rcx) 1840 1841 movdqa %xmm0,%xmm4 1842 #palignr $0xa,%xmm2,%xmm0 1843 .byte 0x66,0x0f,0x3a,0x0f 1844 .byte 0xc2,0x0a 1845 movdqa %xmm0,0x10(%rcx) 1846 1847 movdqa %xmm5,%xmm1 1848 #palignr $0xa,%xmm4,%xmm5 1849 .byte 0x66,0x0f,0x3a,0x0f 1850 .byte 0xec,0x0a 1851 movdqa %xmm5,0x20(%rcx) 1852 1853 lea 0x30(%rcx),%rcx 1854 jge L(mov3dqa10) 1855 1856 cmp $0x10,%r8 1857 jl L(movdqa_epi) 1858 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1859 sub $0x10,%r8 1860 lea 0x10(%rdx),%rdx 1861 movdqa %xmm3,%xmm2 # save for use next concat 1862 #palignr $0xa,%xmm1,%xmm3 1863 .byte 0x66,0x0f,0x3a,0x0f 1864 .byte 0xd9,0x0a 1865 1866 cmp $0x10,%r8 1867 movdqa %xmm3,(%rcx) # store it 1868 lea 0x10(%rcx),%rcx 1869 jl L(movdqa_epi) 1870 1871 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1872 sub $0x10,%r8 1873 lea 0x10(%rdx),%rdx 1874 #palignr $0xa,%xmm2,%xmm0 1875 .byte 0x66,0x0f,0x3a,0x0f 1876 .byte 0xc2,0x0a 1877 movdqa %xmm0,(%rcx) # store it 1878 lea 
0x10(%rcx),%rcx 1879 jmp L(movdqa_epi) 1880 1881 .balign 16 1882L(mov3dqa11): 1883 movdqa 0x10(%rdx),%xmm3 1884 sub $0x30,%r8 1885 movdqa 0x20(%rdx),%xmm0 1886 movdqa 0x30(%rdx),%xmm5 1887 lea 0x30(%rdx),%rdx 1888 cmp $0x30,%r8 1889 1890 movdqa %xmm3,%xmm2 1891 #palignr $0xb,%xmm1,%xmm3 1892 .byte 0x66,0x0f,0x3a,0x0f 1893 .byte 0xd9,0x0b 1894 movdqa %xmm3,(%rcx) 1895 1896 movdqa %xmm0,%xmm4 1897 #palignr $0xb,%xmm2,%xmm0 1898 .byte 0x66,0x0f,0x3a,0x0f 1899 .byte 0xc2,0x0b 1900 movdqa %xmm0,0x10(%rcx) 1901 1902 movdqa %xmm5,%xmm1 1903 #palignr $0xb,%xmm4,%xmm5 1904 .byte 0x66,0x0f,0x3a,0x0f 1905 .byte 0xec,0x0b 1906 movdqa %xmm5,0x20(%rcx) 1907 1908 lea 0x30(%rcx),%rcx 1909 jge L(mov3dqa11) 1910 1911 cmp $0x10,%r8 1912 jl L(movdqa_epi) 1913 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1914 sub $0x10,%r8 1915 lea 0x10(%rdx),%rdx 1916 movdqa %xmm3,%xmm2 # save for use next concat 1917 #palignr $0xb,%xmm1,%xmm3 1918 .byte 0x66,0x0f,0x3a,0x0f 1919 .byte 0xd9,0x0b 1920 1921 cmp $0x10,%r8 1922 movdqa %xmm3,(%rcx) # store it 1923 lea 0x10(%rcx),%rcx 1924 jl L(movdqa_epi) 1925 1926 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1927 sub $0x10,%r8 1928 lea 0x10(%rdx),%rdx 1929 #palignr $0xb,%xmm2,%xmm0 1930 .byte 0x66,0x0f,0x3a,0x0f 1931 .byte 0xc2,0x0b 1932 movdqa %xmm0,(%rcx) # store it 1933 lea 0x10(%rcx),%rcx 1934 jmp L(movdqa_epi) 1935 1936 .balign 16 1937L(mov3dqa12): 1938 movdqa 0x10(%rdx),%xmm3 1939 sub $0x30,%r8 1940 movdqa 0x20(%rdx),%xmm0 1941 movdqa 0x30(%rdx),%xmm5 1942 lea 0x30(%rdx),%rdx 1943 cmp $0x30,%r8 1944 1945 movdqa %xmm3,%xmm2 1946 #palignr $0xc,%xmm1,%xmm3 1947 .byte 0x66,0x0f,0x3a,0x0f 1948 .byte 0xd9,0x0c 1949 movdqa %xmm3,(%rcx) 1950 1951 movdqa %xmm0,%xmm4 1952 #palignr $0xc,%xmm2,%xmm0 1953 .byte 0x66,0x0f,0x3a,0x0f 1954 .byte 0xc2,0x0c 1955 movdqa %xmm0,0x10(%rcx) 1956 1957 movdqa %xmm5,%xmm1 1958 #palignr $0xc,%xmm4,%xmm5 1959 .byte 0x66,0x0f,0x3a,0x0f 1960 .byte 0xec,0x0c 1961 movdqa %xmm5,0x20(%rcx) 1962 1963 lea 0x30(%rcx),%rcx 1964 jge L(mov3dqa12) 1965 1966 cmp $0x10,%r8 1967 jl L(movdqa_epi) 1968 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1969 sub $0x10,%r8 1970 lea 0x10(%rdx),%rdx 1971 movdqa %xmm3,%xmm2 # save for use next concat 1972 #palignr $0xc,%xmm1,%xmm3 1973 .byte 0x66,0x0f,0x3a,0x0f 1974 .byte 0xd9,0x0c 1975 1976 cmp $0x10,%r8 1977 movdqa %xmm3,(%rcx) # store it 1978 lea 0x10(%rcx),%rcx 1979 jl L(movdqa_epi) 1980 1981 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1982 sub $0x10,%r8 1983 lea 0x10(%rdx),%rdx 1984 #palignr $0xc,%xmm2,%xmm0 1985 .byte 0x66,0x0f,0x3a,0x0f 1986 .byte 0xc2,0x0c 1987 movdqa %xmm0,(%rcx) # store it 1988 lea 0x10(%rcx),%rcx 1989 jmp L(movdqa_epi) 1990 1991 .balign 16 1992L(mov3dqa13): 1993 movdqa 0x10(%rdx),%xmm3 1994 sub $0x30,%r8 1995 movdqa 0x20(%rdx),%xmm0 1996 movdqa 0x30(%rdx),%xmm5 1997 lea 0x30(%rdx),%rdx 1998 cmp $0x30,%r8 1999 2000 movdqa %xmm3,%xmm2 2001 #palignr $0xd,%xmm1,%xmm3 2002 .byte 0x66,0x0f,0x3a,0x0f 2003 .byte 0xd9,0x0d 2004 movdqa %xmm3,(%rcx) 2005 2006 movdqa %xmm0,%xmm4 2007 #palignr $0xd,%xmm2,%xmm0 2008 .byte 0x66,0x0f,0x3a,0x0f 2009 .byte 0xc2,0x0d 2010 movdqa %xmm0,0x10(%rcx) 2011 2012 movdqa %xmm5,%xmm1 2013 #palignr $0xd,%xmm4,%xmm5 2014 .byte 0x66,0x0f,0x3a,0x0f 2015 .byte 0xec,0x0d 2016 movdqa %xmm5,0x20(%rcx) 2017 2018 lea 0x30(%rcx),%rcx 2019 jge L(mov3dqa13) 2020 2021 cmp $0x10,%r8 2022 jl L(movdqa_epi) 2023 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2024 sub $0x10,%r8 2025 lea 0x10(%rdx),%rdx 2026 movdqa %xmm3,%xmm2 # save for use next concat 
2027 #palignr $0xd,%xmm1,%xmm3 2028 .byte 0x66,0x0f,0x3a,0x0f 2029 .byte 0xd9,0x0d 2030 2031 cmp $0x10,%r8 2032 movdqa %xmm3,(%rcx) # store it 2033 lea 0x10(%rcx),%rcx 2034 jl L(movdqa_epi) 2035 2036 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2037 sub $0x10,%r8 2038 lea 0x10(%rdx),%rdx 2039 #palignr $0xd,%xmm2,%xmm0 2040 .byte 0x66,0x0f,0x3a,0x0f 2041 .byte 0xc2,0x0d 2042 movdqa %xmm0,(%rcx) # store it 2043 lea 0x10(%rcx),%rcx 2044 jmp L(movdqa_epi) 2045 2046 .balign 16 2047L(mov3dqa14): 2048 movdqa 0x10(%rdx),%xmm3 2049 sub $0x30,%r8 2050 movdqa 0x20(%rdx),%xmm0 2051 movdqa 0x30(%rdx),%xmm5 2052 lea 0x30(%rdx),%rdx 2053 cmp $0x30,%r8 2054 2055 movdqa %xmm3,%xmm2 2056 #palignr $0xe,%xmm1,%xmm3 2057 .byte 0x66,0x0f,0x3a,0x0f 2058 .byte 0xd9,0x0e 2059 movdqa %xmm3,(%rcx) 2060 2061 movdqa %xmm0,%xmm4 2062 #palignr $0xe,%xmm2,%xmm0 2063 .byte 0x66,0x0f,0x3a,0x0f 2064 .byte 0xc2,0x0e 2065 movdqa %xmm0,0x10(%rcx) 2066 2067 movdqa %xmm5,%xmm1 2068 #palignr $0xe,%xmm4,%xmm5 2069 .byte 0x66,0x0f,0x3a,0x0f 2070 .byte 0xec,0x0e 2071 movdqa %xmm5,0x20(%rcx) 2072 2073 lea 0x30(%rcx),%rcx 2074 jge L(mov3dqa14) 2075 2076 cmp $0x10,%r8 2077 jl L(movdqa_epi) 2078 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2079 sub $0x10,%r8 2080 lea 0x10(%rdx),%rdx 2081 movdqa %xmm3,%xmm2 # save for use next concat 2082 #palignr $0xe,%xmm1,%xmm3 2083 .byte 0x66,0x0f,0x3a,0x0f 2084 .byte 0xd9,0x0e 2085 2086 cmp $0x10,%r8 2087 movdqa %xmm3,(%rcx) # store it 2088 lea 0x10(%rcx),%rcx 2089 jl L(movdqa_epi) 2090 2091 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2092 sub $0x10,%r8 2093 lea 0x10(%rdx),%rdx 2094 #palignr $0xe,%xmm2,%xmm0 2095 .byte 0x66,0x0f,0x3a,0x0f 2096 .byte 0xc2,0x0e 2097 movdqa %xmm0,(%rcx) # store it 2098 lea 0x10(%rcx),%rcx 2099 jmp L(movdqa_epi) 2100 2101 .balign 16 2102L(mov3dqa15): 2103 movdqa 0x10(%rdx),%xmm3 2104 sub $0x30,%r8 2105 movdqa 0x20(%rdx),%xmm0 2106 movdqa 0x30(%rdx),%xmm5 2107 lea 0x30(%rdx),%rdx 2108 cmp $0x30,%r8 2109 2110 movdqa %xmm3,%xmm2 2111 #palignr $0xf,%xmm1,%xmm3 2112 .byte 0x66,0x0f,0x3a,0x0f 2113 .byte 0xd9,0x0f 2114 movdqa %xmm3,(%rcx) 2115 2116 movdqa %xmm0,%xmm4 2117 #palignr $0xf,%xmm2,%xmm0 2118 .byte 0x66,0x0f,0x3a,0x0f 2119 .byte 0xc2,0x0f 2120 movdqa %xmm0,0x10(%rcx) 2121 2122 movdqa %xmm5,%xmm1 2123 #palignr $0xf,%xmm4,%xmm5 2124 .byte 0x66,0x0f,0x3a,0x0f 2125 .byte 0xec,0x0f 2126 movdqa %xmm5,0x20(%rcx) 2127 2128 lea 0x30(%rcx),%rcx 2129 jge L(mov3dqa15) 2130 2131 cmp $0x10,%r8 2132 jl L(movdqa_epi) 2133 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2134 sub $0x10,%r8 2135 lea 0x10(%rdx),%rdx 2136 movdqa %xmm3,%xmm2 # save for use next concat 2137 #palignr $0xf,%xmm1,%xmm3 2138 .byte 0x66,0x0f,0x3a,0x0f 2139 .byte 0xd9,0x0f 2140 2141 cmp $0x10,%r8 2142 movdqa %xmm3,(%rcx) # store it 2143 lea 0x10(%rcx),%rcx 2144 jl L(movdqa_epi) 2145 2146 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2147 sub $0x10,%r8 2148 lea 0x10(%rdx),%rdx 2149 #palignr $0xf,%xmm2,%xmm0 2150 .byte 0x66,0x0f,0x3a,0x0f 2151 .byte 0xc2,0x0f 2152 movdqa %xmm0,(%rcx) # store it 2153 lea 0x10(%rcx),%rcx 2154 jmp L(movdqa_epi) 2155 2156 .balign 16 2157L(sse2_nt_move): 2158 lea 0x40(%rcx),%rcx 2159 lea 0x40(%rdx),%rdx 2160 lea -0x40(%r8),%r8 2161 2162 /* 2163 * doesn't matter if source is aligned for stuff out of cache. 2164 * the mis-aligned penalty is masked by the slowness of main memory. 
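 *
 * Illustration only, a hedged intrinsics sketch of the idea (assuming
 * <emmintrin.h>, dst/src as char pointers with dst 16-byte aligned, and
 * i/j as size_t); it is not the loop below:
 *
 *	for (i = 0; i + 64 <= len; i += 64) {
 *		_mm_prefetch((const char *)src + i + 0x180, _MM_HINT_NTA);
 *		for (j = 0; j < 64; j += 16)
 *			_mm_stream_si128((__m128i *)(dst + i + j),
 *			    _mm_loadu_si128((const __m128i *)(src + i + j)));
 *	}
 *	_mm_sfence();
 *
 * Unaligned loads feed streaming stores that bypass the caches, with a
 * prefetch issued 0x180 bytes ahead as in the loop below; the sfence
 * after the loop orders the non-temporal stores before control moves on.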
2165 */ 2166 prefetchnta 0x180(%rdx) 2167 movdqu -0x40(%rdx),%xmm0 2168 movdqu -0x30(%rdx),%xmm1 2169 2170 cmp $0x40,%r8 2171 movntdq %xmm0,-0x40(%rcx) 2172 movntdq %xmm1,-0x30(%rcx) 2173 2174 movdqu -0x20(%rdx),%xmm2 2175 movdqu -0x10(%rdx),%xmm3 2176 2177 movntdq %xmm2,-0x20(%rcx) 2178 movntdq %xmm3,-0x10(%rcx) 2179 2180 jge L(sse2_nt_move) 2181 2182 lea L(Fix16EndTable)(%rip),%r10 2183 mov %r8,%r9 2184 and $0xFFFFFFFFFFFFFFF0,%r9 2185 add %r9,%rcx 2186 add %r9,%rdx 2187 sub %r9,%r8 2188 shr $0x4,%r9 2189 sfence 2190 2191 movslq (%r10,%r9,4),%r11 2192 lea (%r11,%r10,1),%r10 2193 jmpq *%r10 2194 2195 .balign 16 2196L(Fix16EndTable): 2197 .int L(fix16_0)-L(Fix16EndTable) 2198 .int L(fix16_1)-L(Fix16EndTable) 2199 .int L(fix16_2)-L(Fix16EndTable) 2200 .int L(fix16_3)-L(Fix16EndTable) 2201 2202 .balign 16 2203L(fix16_3): 2204 movdqu -0x30(%rdx),%xmm1 2205 movdqa %xmm1,-0x30(%rcx) 2206L(fix16_2): 2207 movdqu -0x20(%rdx),%xmm2 2208 movdqa %xmm2,-0x20(%rcx) 2209L(fix16_1): 2210 movdqu -0x10(%rdx),%xmm3 2211 movdqa %xmm3,-0x10(%rcx) 2212L(fix16_0): 2213 lea L(fwdPxQx)(%rip),%r10 2214 add %r8,%rdx 2215 add %r8,%rcx 2216 2217 movslq (%r10,%r8,4),%r9 2218 lea (%r9,%r10,1),%r10 2219 jmpq *%r10 2220 2221 .balign 16 2222L(pre_both_aligned): 2223 cmp $0x80,%r8 2224 jl L(fix_16b) 2225 2226 .balign 16 2227L(both_aligned): 2228 2229 /* 2230 * this 'paired' load/load/store/store seems to do best. 2231 */ 2232 movdqa (%rdx),%xmm0 2233 movdqa 0x10(%rdx),%xmm1 2234 2235 movdqa %xmm0,(%rcx) 2236 movdqa %xmm1,0x10(%rcx) 2237 lea -0x80(%r8),%r8 2238 2239 movdqa 0x20(%rdx),%xmm2 2240 movdqa 0x30(%rdx),%xmm3 2241 2242 movdqa %xmm2,0x20(%rcx) 2243 movdqa %xmm3,0x30(%rcx) 2244 2245 movdqa 0x40(%rdx),%xmm0 2246 movdqa 0x50(%rdx),%xmm1 2247 cmp $0x80,%r8 2248 2249 movdqa %xmm0,0x40(%rcx) 2250 movdqa %xmm1,0x50(%rcx) 2251 2252 movdqa 0x60(%rdx),%xmm2 2253 movdqa 0x70(%rdx),%xmm3 2254 lea 0x80(%rdx),%rdx 2255 movdqa %xmm2,0x60(%rcx) 2256 movdqa %xmm3,0x70(%rcx) 2257 lea 0x80(%rcx),%rcx 2258 jge L(both_aligned) 2259 2260L(fix_16b): 2261 add %r8,%rcx 2262 lea L(fwdPxQx)(%rip),%r10 2263 add %r8,%rdx 2264 2265 movslq (%r10,%r8,4),%r9 2266 lea (%r9,%r10,1),%r10 2267 jmpq *%r10 2268 2269 .balign 16 2270L(Loop8byte_pre): 2271 # Use 8-byte moves 2272 mov .largest_level_cache_size(%rip),%r9d 2273 shr %r9 # take half of it 2274 cmp %r9,%r8 2275 jg L(byte8_nt_top) 2276 # Find out whether to use rep movsq 2277 cmp $4096,%r8 2278 jle L(byte8_top) 2279 mov .amd64cache1half(%rip),%r9d # half of l1 cache 2280 cmp %r9,%r8 2281 jle L(use_rep) 2282 2283 .balign 16 2284L(byte8_top): 2285 mov (%rdx),%r9 2286 mov 0x8(%rdx),%r10 2287 lea -0x40(%r8),%r8 2288 mov %r9,(%rcx) 2289 mov %r10,0x8(%rcx) 2290 mov 0x10(%rdx),%r11 2291 mov 0x18(%rdx),%r9 2292 mov %r11,0x10(%rcx) 2293 mov %r9,0x18(%rcx) 2294 2295 cmp $0x40,%r8 2296 mov 0x20(%rdx),%r10 2297 mov 0x28(%rdx),%r11 2298 mov %r10,0x20(%rcx) 2299 mov %r11,0x28(%rcx) 2300 mov 0x30(%rdx),%r9 2301 mov 0x38(%rdx),%r10 2302 lea 0x40(%rdx),%rdx 2303 mov %r9,0x30(%rcx) 2304 mov %r10,0x38(%rcx) 2305 lea 0x40(%rcx),%rcx 2306 jg L(byte8_top) 2307 2308L(byte8_end): 2309 lea L(fwdPxQx)(%rip),%r10 2310 lea (%rdx,%r8,1),%rdx 2311 lea (%rcx,%r8,1),%rcx 2312 2313 movslq (%r10,%r8,4),%r9 2314 lea (%r9,%r10,1),%r10 2315 jmpq *%r10 2316 2317 .balign 16 2318L(use_rep): 2319 mov %rdx,%rsi # %rsi = source 2320 mov %rcx,%rdi # %rdi = destination 2321 mov %r8,%rcx # %rcx = count 2322 shrq $3,%rcx # 8-byte word count 2323 rep 2324 movsq 2325 mov %rsi,%rdx # source 2326 mov %rdi,%rcx # destination 2327 andq $7,%r8 # 
remainder 2328 jnz L(byte8_end) 2329 ret 2330 2331 .balign 16 2332L(byte8_nt_top): 2333 sub $0x40,%r8 2334 prefetchnta 0x180(%rdx) 2335 mov (%rdx),%r9 2336 movnti %r9,(%rcx) 2337 mov 0x8(%rdx),%r10 2338 movnti %r10,0x8(%rcx) 2339 mov 0x10(%rdx),%r11 2340 movnti %r11,0x10(%rcx) 2341 mov 0x18(%rdx),%r9 2342 movnti %r9,0x18(%rcx) 2343 mov 0x20(%rdx),%r10 2344 movnti %r10,0x20(%rcx) 2345 mov 0x28(%rdx),%r11 2346 movnti %r11,0x28(%rcx) 2347 mov 0x30(%rdx),%r9 2348 movnti %r9,0x30(%rcx) 2349 mov 0x38(%rdx),%r10 2350 movnti %r10,0x38(%rcx) 2351 2352 lea 0x40(%rdx),%rdx 2353 lea 0x40(%rcx),%rcx 2354 cmp $0x40,%r8 2355 jge L(byte8_nt_top) 2356 sfence 2357 jmp L(byte8_end) 2358 2359 SET_SIZE(memcpy) 2360 2361 .balign 16 2362L(CopyBackwards): 2363 mov %rdx,%r8 2364 mov %rdi,%rcx 2365 mov %rsi,%rdx 2366 mov %rdi,%rax # return value 2367 2368 # ck alignment of last byte 2369 lea (%rcx,%r8,1),%rcx 2370 test $0x7,%rcx 2371 lea (%rdx,%r8,1),%rdx 2372 jne L(bk_align) 2373 2374L(bk_qw_aligned): 2375 lea L(bkPxQx)(%rip),%r10 2376 2377 cmp $0x90,%r8 # 144 2378 jg L(bk_ck_sse2_alignment) 2379 2380 sub %r8,%rcx 2381 sub %r8,%rdx 2382 2383 movslq (%r10,%r8,4),%r9 2384 lea (%r9,%r10,1),%r10 2385 jmpq *%r10 2386 2387 .balign 16 2388L(bk_align): 2389 # only align if len > 8 2390 cmp $8,%r8 2391 jle L(bk_qw_aligned) 2392 test $0x1,%rcx 2393 je L(bk_tst2) 2394 dec %rcx 2395 dec %rdx 2396 dec %r8 2397 mov (%rdx),%r9b 2398 mov %r9b,(%rcx) 2399 2400L(bk_tst2): 2401 test $0x2,%rcx 2402 je L(bk_tst3) 2403 2404L(bk_got2): 2405 sub $0x2,%rcx 2406 sub $0x2,%rdx 2407 sub $0x2,%r8 2408 movzwq (%rdx),%r9 2409 mov %r9w,(%rcx) 2410 2411L(bk_tst3): 2412 test $0x4,%rcx 2413 je L(bk_qw_aligned) 2414 2415L(bk_got3): 2416 sub $0x4,%rcx 2417 sub $0x4,%rdx 2418 sub $0x4,%r8 2419 mov (%rdx),%r9d 2420 mov %r9d,(%rcx) 2421 jmp L(bk_qw_aligned) 2422 2423 .balign 16 2424L(bk_ck_sse2_alignment): 2425 cmpl $NO_SSE,.memops_method(%rip) 2426 je L(bk_use_rep) 2427 # check alignment of last byte 2428 test $0xf,%rcx 2429 jz L(bk_sse2_cpy) 2430 2431L(bk_sse2_align): 2432 # only here if already aligned on at least a qword bndry 2433 sub $0x8,%rcx 2434 sub $0x8,%rdx 2435 sub $0x8,%r8 2436 mov (%rdx),%r9 2437 mov %r9,(%rcx) 2438 #jmp L(bk_sse2_cpy) 2439 2440 .balign 16 2441L(bk_sse2_cpy): 2442 sub $0x80,%rcx # 128 2443 sub $0x80,%rdx 2444 movdqu 0x70(%rdx),%xmm3 2445 movdqu 0x60(%rdx),%xmm2 2446 movdqa %xmm3,0x70(%rcx) 2447 movdqa %xmm2,0x60(%rcx) 2448 sub $0x80,%r8 2449 movdqu 0x50(%rdx),%xmm1 2450 movdqu 0x40(%rdx),%xmm0 2451 movdqa %xmm1,0x50(%rcx) 2452 movdqa %xmm0,0x40(%rcx) 2453 2454 cmp $0x80,%r8 2455 movdqu 0x30(%rdx),%xmm3 2456 movdqu 0x20(%rdx),%xmm2 2457 movdqa %xmm3,0x30(%rcx) 2458 movdqa %xmm2,0x20(%rcx) 2459 movdqu 0x10(%rdx),%xmm1 2460 movdqu (%rdx),%xmm0 2461 movdqa %xmm1,0x10(%rcx) 2462 movdqa %xmm0,(%rcx) 2463 jge L(bk_sse2_cpy) 2464 2465L(bk_sse2_cpy_end): 2466 lea L(bkPxQx)(%rip),%r10 2467 sub %r8,%rdx 2468 sub %r8,%rcx 2469 movslq (%r10,%r8,4),%r9 2470 lea (%r9,%r10,1),%r10 2471 jmpq *%r10 2472 2473 .balign 16 2474L(bk_use_rep): 2475 xchg %rcx,%r9 2476 mov %rdx,%rsi # source 2477 mov %r9,%rdi # destination 2478 mov %r8,%rcx # count 2479 sub $8,%rsi 2480 sub $8,%rdi 2481 shr $3,%rcx 2482 std # reverse direction 2483 rep 2484 movsq 2485 cld # reset direction flag 2486 2487 xchg %rcx,%r9 2488 lea L(bkPxQx)(%rip),%r10 2489 sub %r8,%rdx 2490 sub %r8,%rcx 2491 andq $7,%r8 # remainder 2492 jz 2f 2493 movslq (%r10,%r8,4),%r9 2494 lea (%r9,%r10,1),%r10 2495 jmpq *%r10 24962: 2497 ret 2498 2499 .balign 16 2500L(bkP0QI): 2501 mov 
0x88(%rdx),%r10 2502 mov %r10,0x88(%rcx) 2503L(bkP0QH): 2504 mov 0x80(%rdx),%r10 2505 mov %r10,0x80(%rcx) 2506L(bkP0QG): 2507 mov 0x78(%rdx),%r9 2508 mov %r9,0x78(%rcx) 2509L(bkP0QF): 2510 mov 0x70(%rdx),%r11 2511 mov %r11,0x70(%rcx) 2512L(bkP0QE): 2513 mov 0x68(%rdx),%r10 2514 mov %r10,0x68(%rcx) 2515L(bkP0QD): 2516 mov 0x60(%rdx),%r9 2517 mov %r9,0x60(%rcx) 2518L(bkP0QC): 2519 mov 0x58(%rdx),%r11 2520 mov %r11,0x58(%rcx) 2521L(bkP0QB): 2522 mov 0x50(%rdx),%r10 2523 mov %r10,0x50(%rcx) 2524L(bkP0QA): 2525 mov 0x48(%rdx),%r9 2526 mov %r9,0x48(%rcx) 2527L(bkP0Q9): 2528 mov 0x40(%rdx),%r11 2529 mov %r11,0x40(%rcx) 2530L(bkP0Q8): 2531 mov 0x38(%rdx),%r10 2532 mov %r10,0x38(%rcx) 2533L(bkP0Q7): 2534 mov 0x30(%rdx),%r9 2535 mov %r9,0x30(%rcx) 2536L(bkP0Q6): 2537 mov 0x28(%rdx),%r11 2538 mov %r11,0x28(%rcx) 2539L(bkP0Q5): 2540 mov 0x20(%rdx),%r10 2541 mov %r10,0x20(%rcx) 2542L(bkP0Q4): 2543 mov 0x18(%rdx),%r9 2544 mov %r9,0x18(%rcx) 2545L(bkP0Q3): 2546 mov 0x10(%rdx),%r11 2547 mov %r11,0x10(%rcx) 2548L(bkP0Q2): 2549 mov 0x8(%rdx),%r10 2550 mov %r10,0x8(%rcx) 2551L(bkP0Q1): 2552 mov (%rdx),%r9 2553 mov %r9,(%rcx) 2554L(bkP0Q0): 2555 ret 2556 2557 .balign 16 2558L(bkP1QI): 2559 mov 0x89(%rdx),%r10 2560 mov %r10,0x89(%rcx) 2561L(bkP1QH): 2562 mov 0x81(%rdx),%r11 2563 mov %r11,0x81(%rcx) 2564L(bkP1QG): 2565 mov 0x79(%rdx),%r10 2566 mov %r10,0x79(%rcx) 2567L(bkP1QF): 2568 mov 0x71(%rdx),%r9 2569 mov %r9,0x71(%rcx) 2570L(bkP1QE): 2571 mov 0x69(%rdx),%r11 2572 mov %r11,0x69(%rcx) 2573L(bkP1QD): 2574 mov 0x61(%rdx),%r10 2575 mov %r10,0x61(%rcx) 2576L(bkP1QC): 2577 mov 0x59(%rdx),%r9 2578 mov %r9,0x59(%rcx) 2579L(bkP1QB): 2580 mov 0x51(%rdx),%r11 2581 mov %r11,0x51(%rcx) 2582L(bkP1QA): 2583 mov 0x49(%rdx),%r10 2584 mov %r10,0x49(%rcx) 2585L(bkP1Q9): 2586 mov 0x41(%rdx),%r9 2587 mov %r9,0x41(%rcx) 2588L(bkP1Q8): 2589 mov 0x39(%rdx),%r11 2590 mov %r11,0x39(%rcx) 2591L(bkP1Q7): 2592 mov 0x31(%rdx),%r10 2593 mov %r10,0x31(%rcx) 2594L(bkP1Q6): 2595 mov 0x29(%rdx),%r9 2596 mov %r9,0x29(%rcx) 2597L(bkP1Q5): 2598 mov 0x21(%rdx),%r11 2599 mov %r11,0x21(%rcx) 2600L(bkP1Q4): 2601 mov 0x19(%rdx),%r10 2602 mov %r10,0x19(%rcx) 2603L(bkP1Q3): 2604 mov 0x11(%rdx),%r9 2605 mov %r9,0x11(%rcx) 2606L(bkP1Q2): 2607 mov 0x9(%rdx),%r11 2608 mov %r11,0x9(%rcx) 2609L(bkP1Q1): 2610 mov 0x1(%rdx),%r10 2611 mov %r10,0x1(%rcx) 2612L(bkP1Q0): 2613 mov (%rdx),%r9b 2614 mov %r9b,(%rcx) 2615 ret 2616 2617 .balign 16 2618L(bkP2QI): 2619 mov 0x8a(%rdx),%r10 2620 mov %r10,0x8a(%rcx) 2621L(bkP2QH): 2622 mov 0x82(%rdx),%r11 2623 mov %r11,0x82(%rcx) 2624L(bkP2QG): 2625 mov 0x7a(%rdx),%r10 2626 mov %r10,0x7a(%rcx) 2627L(bkP2QF): 2628 mov 0x72(%rdx),%r9 2629 mov %r9,0x72(%rcx) 2630L(bkP2QE): 2631 mov 0x6a(%rdx),%r11 2632 mov %r11,0x6a(%rcx) 2633L(bkP2QD): 2634 mov 0x62(%rdx),%r10 2635 mov %r10,0x62(%rcx) 2636L(bkP2QC): 2637 mov 0x5a(%rdx),%r9 2638 mov %r9,0x5a(%rcx) 2639L(bkP2QB): 2640 mov 0x52(%rdx),%r11 2641 mov %r11,0x52(%rcx) 2642L(bkP2QA): 2643 mov 0x4a(%rdx),%r10 2644 mov %r10,0x4a(%rcx) 2645L(bkP2Q9): 2646 mov 0x42(%rdx),%r9 2647 mov %r9,0x42(%rcx) 2648L(bkP2Q8): 2649 mov 0x3a(%rdx),%r11 2650 mov %r11,0x3a(%rcx) 2651L(bkP2Q7): 2652 mov 0x32(%rdx),%r10 2653 mov %r10,0x32(%rcx) 2654L(bkP2Q6): 2655 mov 0x2a(%rdx),%r9 2656 mov %r9,0x2a(%rcx) 2657L(bkP2Q5): 2658 mov 0x22(%rdx),%r11 2659 mov %r11,0x22(%rcx) 2660L(bkP2Q4): 2661 mov 0x1a(%rdx),%r10 2662 mov %r10,0x1a(%rcx) 2663L(bkP2Q3): 2664 mov 0x12(%rdx),%r9 2665 mov %r9,0x12(%rcx) 2666L(bkP2Q2): 2667 mov 0xa(%rdx),%r11 2668 mov %r11,0xa(%rcx) 2669L(bkP2Q1): 2670 mov 0x2(%rdx),%r10 2671 mov 
	.balign 16
L(bkP3QI):
	mov	0x8b(%rdx),%r10
	mov	%r10,0x8b(%rcx)
L(bkP3QH):
	mov	0x83(%rdx),%r11
	mov	%r11,0x83(%rcx)
L(bkP3QG):
	mov	0x7b(%rdx),%r10
	mov	%r10,0x7b(%rcx)
L(bkP3QF):
	mov	0x73(%rdx),%r9
	mov	%r9,0x73(%rcx)
L(bkP3QE):
	mov	0x6b(%rdx),%r11
	mov	%r11,0x6b(%rcx)
L(bkP3QD):
	mov	0x63(%rdx),%r10
	mov	%r10,0x63(%rcx)
L(bkP3QC):
	mov	0x5b(%rdx),%r9
	mov	%r9,0x5b(%rcx)
L(bkP3QB):
	mov	0x53(%rdx),%r11
	mov	%r11,0x53(%rcx)
L(bkP3QA):
	mov	0x4b(%rdx),%r10
	mov	%r10,0x4b(%rcx)
L(bkP3Q9):
	mov	0x43(%rdx),%r9
	mov	%r9,0x43(%rcx)
L(bkP3Q8):
	mov	0x3b(%rdx),%r11
	mov	%r11,0x3b(%rcx)
L(bkP3Q7):
	mov	0x33(%rdx),%r10
	mov	%r10,0x33(%rcx)
L(bkP3Q6):
	mov	0x2b(%rdx),%r9
	mov	%r9,0x2b(%rcx)
L(bkP3Q5):
	mov	0x23(%rdx),%r11
	mov	%r11,0x23(%rcx)
L(bkP3Q4):
	mov	0x1b(%rdx),%r10
	mov	%r10,0x1b(%rcx)
L(bkP3Q3):
	mov	0x13(%rdx),%r9
	mov	%r9,0x13(%rcx)
L(bkP3Q2):
	mov	0xb(%rdx),%r11
	mov	%r11,0xb(%rcx)
L(bkP3Q1):
	mov	0x3(%rdx),%r10
	mov	%r10,0x3(%rcx)
L(bkP3Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9w
	mov	%r9w,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret

	.balign 16
L(bkP4QI):
	mov	0x8c(%rdx),%r10
	mov	%r10,0x8c(%rcx)
L(bkP4QH):
	mov	0x84(%rdx),%r11
	mov	%r11,0x84(%rcx)
L(bkP4QG):
	mov	0x7c(%rdx),%r10
	mov	%r10,0x7c(%rcx)
L(bkP4QF):
	mov	0x74(%rdx),%r9
	mov	%r9,0x74(%rcx)
L(bkP4QE):
	mov	0x6c(%rdx),%r11
	mov	%r11,0x6c(%rcx)
L(bkP4QD):
	mov	0x64(%rdx),%r10
	mov	%r10,0x64(%rcx)
L(bkP4QC):
	mov	0x5c(%rdx),%r9
	mov	%r9,0x5c(%rcx)
L(bkP4QB):
	mov	0x54(%rdx),%r11
	mov	%r11,0x54(%rcx)
L(bkP4QA):
	mov	0x4c(%rdx),%r10
	mov	%r10,0x4c(%rcx)
L(bkP4Q9):
	mov	0x44(%rdx),%r9
	mov	%r9,0x44(%rcx)
L(bkP4Q8):
	mov	0x3c(%rdx),%r11
	mov	%r11,0x3c(%rcx)
L(bkP4Q7):
	mov	0x34(%rdx),%r10
	mov	%r10,0x34(%rcx)
L(bkP4Q6):
	mov	0x2c(%rdx),%r9
	mov	%r9,0x2c(%rcx)
L(bkP4Q5):
	mov	0x24(%rdx),%r11
	mov	%r11,0x24(%rcx)
L(bkP4Q4):
	mov	0x1c(%rdx),%r10
	mov	%r10,0x1c(%rcx)
L(bkP4Q3):
	mov	0x14(%rdx),%r9
	mov	%r9,0x14(%rcx)
L(bkP4Q2):
	mov	0xc(%rdx),%r11
	mov	%r11,0xc(%rcx)
L(bkP4Q1):
	mov	0x4(%rdx),%r10
	mov	%r10,0x4(%rcx)
L(bkP4Q0):
	mov	(%rdx),%r9d
	mov	%r9d,(%rcx)
	ret

	.balign 16
L(bkP5QI):
	mov	0x8d(%rdx),%r10
	mov	%r10,0x8d(%rcx)
L(bkP5QH):
	mov	0x85(%rdx),%r9
	mov	%r9,0x85(%rcx)
L(bkP5QG):
	mov	0x7d(%rdx),%r11
	mov	%r11,0x7d(%rcx)
L(bkP5QF):
	mov	0x75(%rdx),%r10
	mov	%r10,0x75(%rcx)
L(bkP5QE):
	mov	0x6d(%rdx),%r9
	mov	%r9,0x6d(%rcx)
L(bkP5QD):
	mov	0x65(%rdx),%r11
	mov	%r11,0x65(%rcx)
L(bkP5QC):
	mov	0x5d(%rdx),%r10
	mov	%r10,0x5d(%rcx)
L(bkP5QB):
	mov	0x55(%rdx),%r9
	mov	%r9,0x55(%rcx)
L(bkP5QA):
	mov	0x4d(%rdx),%r11
	mov	%r11,0x4d(%rcx)
L(bkP5Q9):
	mov	0x45(%rdx),%r10
	mov	%r10,0x45(%rcx)
L(bkP5Q8):
	mov	0x3d(%rdx),%r9
	mov	%r9,0x3d(%rcx)
L(bkP5Q7):
	mov	0x35(%rdx),%r11
	mov	%r11,0x35(%rcx)
L(bkP5Q6):
	mov	0x2d(%rdx),%r10
	mov	%r10,0x2d(%rcx)
L(bkP5Q5):
	mov	0x25(%rdx),%r9
	mov	%r9,0x25(%rcx)
L(bkP5Q4):
	mov	0x1d(%rdx),%r11
	mov	%r11,0x1d(%rcx)
L(bkP5Q3):
	mov	0x15(%rdx),%r10
	mov	%r10,0x15(%rcx)
L(bkP5Q2):
	mov	0xd(%rdx),%r9
	mov	%r9,0xd(%rcx)
L(bkP5Q1):
	mov	0x5(%rdx),%r11
	mov	%r11,0x5(%rcx)
L(bkP5Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9d
	mov	%r9d,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret
	.balign 16
L(bkP6QI):
	mov	0x8e(%rdx),%r10
	mov	%r10,0x8e(%rcx)
L(bkP6QH):
	mov	0x86(%rdx),%r11
	mov	%r11,0x86(%rcx)
L(bkP6QG):
	mov	0x7e(%rdx),%r10
	mov	%r10,0x7e(%rcx)
L(bkP6QF):
	mov	0x76(%rdx),%r9
	mov	%r9,0x76(%rcx)
L(bkP6QE):
	mov	0x6e(%rdx),%r11
	mov	%r11,0x6e(%rcx)
L(bkP6QD):
	mov	0x66(%rdx),%r10
	mov	%r10,0x66(%rcx)
L(bkP6QC):
	mov	0x5e(%rdx),%r9
	mov	%r9,0x5e(%rcx)
L(bkP6QB):
	mov	0x56(%rdx),%r11
	mov	%r11,0x56(%rcx)
L(bkP6QA):
	mov	0x4e(%rdx),%r10
	mov	%r10,0x4e(%rcx)
L(bkP6Q9):
	mov	0x46(%rdx),%r9
	mov	%r9,0x46(%rcx)
L(bkP6Q8):
	mov	0x3e(%rdx),%r11
	mov	%r11,0x3e(%rcx)
L(bkP6Q7):
	mov	0x36(%rdx),%r10
	mov	%r10,0x36(%rcx)
L(bkP6Q6):
	mov	0x2e(%rdx),%r9
	mov	%r9,0x2e(%rcx)
L(bkP6Q5):
	mov	0x26(%rdx),%r11
	mov	%r11,0x26(%rcx)
L(bkP6Q4):
	mov	0x1e(%rdx),%r10
	mov	%r10,0x1e(%rcx)
L(bkP6Q3):
	mov	0x16(%rdx),%r9
	mov	%r9,0x16(%rcx)
L(bkP6Q2):
	mov	0xe(%rdx),%r11
	mov	%r11,0xe(%rcx)
L(bkP6Q1):
	mov	0x6(%rdx),%r10
	mov	%r10,0x6(%rcx)
L(bkP6Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x2(%rdx),%r9d
	mov	%r9d,0x2(%rcx)
	mov	(%rdx),%r10w
	mov	%r10w,(%rcx)
	ret

	.balign 16
L(bkP7QI):
	mov	0x8f(%rdx),%r10
	mov	%r10,0x8f(%rcx)
L(bkP7QH):
	mov	0x87(%rdx),%r11
	mov	%r11,0x87(%rcx)
L(bkP7QG):
	mov	0x7f(%rdx),%r10
	mov	%r10,0x7f(%rcx)
L(bkP7QF):
	mov	0x77(%rdx),%r9
	mov	%r9,0x77(%rcx)
L(bkP7QE):
	mov	0x6f(%rdx),%r11
	mov	%r11,0x6f(%rcx)
L(bkP7QD):
	mov	0x67(%rdx),%r10
	mov	%r10,0x67(%rcx)
L(bkP7QC):
	mov	0x5f(%rdx),%r9
	mov	%r9,0x5f(%rcx)
L(bkP7QB):
	mov	0x57(%rdx),%r11
	mov	%r11,0x57(%rcx)
L(bkP7QA):
	mov	0x4f(%rdx),%r10
	mov	%r10,0x4f(%rcx)
L(bkP7Q9):
	mov	0x47(%rdx),%r9
	mov	%r9,0x47(%rcx)
L(bkP7Q8):
	mov	0x3f(%rdx),%r11
	mov	%r11,0x3f(%rcx)
L(bkP7Q7):
	mov	0x37(%rdx),%r10
	mov	%r10,0x37(%rcx)
L(bkP7Q6):
	mov	0x2f(%rdx),%r9
	mov	%r9,0x2f(%rcx)
L(bkP7Q5):
	mov	0x27(%rdx),%r11
	mov	%r11,0x27(%rcx)
L(bkP7Q4):
	mov	0x1f(%rdx),%r10
	mov	%r10,0x1f(%rcx)
L(bkP7Q3):
	mov	0x17(%rdx),%r9
	mov	%r9,0x17(%rcx)
L(bkP7Q2):
	mov	0xf(%rdx),%r11
	mov	%r11,0xf(%rcx)
L(bkP7Q1):
	mov	0x7(%rdx),%r10
	mov	%r10,0x7(%rcx)
L(bkP7Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x3(%rdx),%r9d
	mov	%r9d,0x3(%rcx)
	mov	0x1(%rdx),%r10w
	mov	%r10w,0x1(%rcx)
	mov	(%rdx),%r11b
	mov	%r11b,(%rcx)
	ret
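
/*
 * Backward-copy dispatch table.  Entry n (n = remaining byte count,
 * 0 .. 0x90) holds the 32-bit offset of L(bkP[n mod 8]Q[n / 8]) relative
 * to L(bkPxQx); the dispatch code sign-extends the entry, adds the table
 * address, and jumps through it.  Roughly (illustrative sketch only,
 * using a computed-goto style; off and n are names invented for this
 * comment):
 *
 *	int32_t off = bkPxQx[n];
 *	goto *((char *)bkPxQx + off);
 */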
	.balign 16
L(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
	.int	L(bkP1Q0)-L(bkPxQx)
	.int	L(bkP2Q0)-L(bkPxQx)
	.int	L(bkP3Q0)-L(bkPxQx)
	.int	L(bkP4Q0)-L(bkPxQx)
	.int	L(bkP5Q0)-L(bkPxQx)
	.int	L(bkP6Q0)-L(bkPxQx)
	.int	L(bkP7Q0)-L(bkPxQx)

	.int	L(bkP0Q1)-L(bkPxQx)
	.int	L(bkP1Q1)-L(bkPxQx)
	.int	L(bkP2Q1)-L(bkPxQx)
	.int	L(bkP3Q1)-L(bkPxQx)
	.int	L(bkP4Q1)-L(bkPxQx)
	.int	L(bkP5Q1)-L(bkPxQx)
	.int	L(bkP6Q1)-L(bkPxQx)
	.int	L(bkP7Q1)-L(bkPxQx)

	.int	L(bkP0Q2)-L(bkPxQx)
	.int	L(bkP1Q2)-L(bkPxQx)
	.int	L(bkP2Q2)-L(bkPxQx)
	.int	L(bkP3Q2)-L(bkPxQx)
	.int	L(bkP4Q2)-L(bkPxQx)
	.int	L(bkP5Q2)-L(bkPxQx)
	.int	L(bkP6Q2)-L(bkPxQx)
	.int	L(bkP7Q2)-L(bkPxQx)

	.int	L(bkP0Q3)-L(bkPxQx)
	.int	L(bkP1Q3)-L(bkPxQx)
	.int	L(bkP2Q3)-L(bkPxQx)
	.int	L(bkP3Q3)-L(bkPxQx)
	.int	L(bkP4Q3)-L(bkPxQx)
	.int	L(bkP5Q3)-L(bkPxQx)
	.int	L(bkP6Q3)-L(bkPxQx)
	.int	L(bkP7Q3)-L(bkPxQx)

	.int	L(bkP0Q4)-L(bkPxQx)
	.int	L(bkP1Q4)-L(bkPxQx)
	.int	L(bkP2Q4)-L(bkPxQx)
	.int	L(bkP3Q4)-L(bkPxQx)
	.int	L(bkP4Q4)-L(bkPxQx)
	.int	L(bkP5Q4)-L(bkPxQx)
	.int	L(bkP6Q4)-L(bkPxQx)
	.int	L(bkP7Q4)-L(bkPxQx)

	.int	L(bkP0Q5)-L(bkPxQx)
	.int	L(bkP1Q5)-L(bkPxQx)
	.int	L(bkP2Q5)-L(bkPxQx)
	.int	L(bkP3Q5)-L(bkPxQx)
	.int	L(bkP4Q5)-L(bkPxQx)
	.int	L(bkP5Q5)-L(bkPxQx)
	.int	L(bkP6Q5)-L(bkPxQx)
	.int	L(bkP7Q5)-L(bkPxQx)

	.int	L(bkP0Q6)-L(bkPxQx)
	.int	L(bkP1Q6)-L(bkPxQx)
	.int	L(bkP2Q6)-L(bkPxQx)
	.int	L(bkP3Q6)-L(bkPxQx)
	.int	L(bkP4Q6)-L(bkPxQx)
	.int	L(bkP5Q6)-L(bkPxQx)
	.int	L(bkP6Q6)-L(bkPxQx)
	.int	L(bkP7Q6)-L(bkPxQx)

	.int	L(bkP0Q7)-L(bkPxQx)
	.int	L(bkP1Q7)-L(bkPxQx)
	.int	L(bkP2Q7)-L(bkPxQx)
	.int	L(bkP3Q7)-L(bkPxQx)
	.int	L(bkP4Q7)-L(bkPxQx)
	.int	L(bkP5Q7)-L(bkPxQx)
	.int	L(bkP6Q7)-L(bkPxQx)
	.int	L(bkP7Q7)-L(bkPxQx)

	.int	L(bkP0Q8)-L(bkPxQx)
	.int	L(bkP1Q8)-L(bkPxQx)
	.int	L(bkP2Q8)-L(bkPxQx)
	.int	L(bkP3Q8)-L(bkPxQx)
	.int	L(bkP4Q8)-L(bkPxQx)
	.int	L(bkP5Q8)-L(bkPxQx)
	.int	L(bkP6Q8)-L(bkPxQx)
	.int	L(bkP7Q8)-L(bkPxQx)

	.int	L(bkP0Q9)-L(bkPxQx)
	.int	L(bkP1Q9)-L(bkPxQx)
	.int	L(bkP2Q9)-L(bkPxQx)
	.int	L(bkP3Q9)-L(bkPxQx)
	.int	L(bkP4Q9)-L(bkPxQx)
	.int	L(bkP5Q9)-L(bkPxQx)
	.int	L(bkP6Q9)-L(bkPxQx)
	.int	L(bkP7Q9)-L(bkPxQx)

	.int	L(bkP0QA)-L(bkPxQx)
	.int	L(bkP1QA)-L(bkPxQx)
	.int	L(bkP2QA)-L(bkPxQx)
	.int	L(bkP3QA)-L(bkPxQx)
	.int	L(bkP4QA)-L(bkPxQx)
	.int	L(bkP5QA)-L(bkPxQx)
	.int	L(bkP6QA)-L(bkPxQx)
	.int	L(bkP7QA)-L(bkPxQx)

	.int	L(bkP0QB)-L(bkPxQx)
	.int	L(bkP1QB)-L(bkPxQx)
	.int	L(bkP2QB)-L(bkPxQx)
	.int	L(bkP3QB)-L(bkPxQx)
	.int	L(bkP4QB)-L(bkPxQx)
	.int	L(bkP5QB)-L(bkPxQx)
	.int	L(bkP6QB)-L(bkPxQx)
	.int	L(bkP7QB)-L(bkPxQx)

	.int	L(bkP0QC)-L(bkPxQx)
	.int	L(bkP1QC)-L(bkPxQx)
	.int	L(bkP2QC)-L(bkPxQx)
	.int	L(bkP3QC)-L(bkPxQx)
	.int	L(bkP4QC)-L(bkPxQx)
	.int	L(bkP5QC)-L(bkPxQx)
	.int	L(bkP6QC)-L(bkPxQx)
	.int	L(bkP7QC)-L(bkPxQx)

	.int	L(bkP0QD)-L(bkPxQx)
	.int	L(bkP1QD)-L(bkPxQx)
	.int	L(bkP2QD)-L(bkPxQx)
	.int	L(bkP3QD)-L(bkPxQx)
	.int	L(bkP4QD)-L(bkPxQx)
	.int	L(bkP5QD)-L(bkPxQx)
	.int	L(bkP6QD)-L(bkPxQx)
	.int	L(bkP7QD)-L(bkPxQx)

	.int	L(bkP0QE)-L(bkPxQx)
	.int	L(bkP1QE)-L(bkPxQx)
	.int	L(bkP2QE)-L(bkPxQx)
	.int	L(bkP3QE)-L(bkPxQx)
	.int	L(bkP4QE)-L(bkPxQx)
	.int	L(bkP5QE)-L(bkPxQx)
	.int	L(bkP6QE)-L(bkPxQx)
	.int	L(bkP7QE)-L(bkPxQx)

	.int	L(bkP0QF)-L(bkPxQx)
	.int	L(bkP1QF)-L(bkPxQx)
	.int	L(bkP2QF)-L(bkPxQx)
	.int	L(bkP3QF)-L(bkPxQx)
	.int	L(bkP4QF)-L(bkPxQx)
	.int	L(bkP5QF)-L(bkPxQx)
	.int	L(bkP6QF)-L(bkPxQx)
	.int	L(bkP7QF)-L(bkPxQx)

	.int	L(bkP0QG)-L(bkPxQx)
	.int	L(bkP1QG)-L(bkPxQx)
	.int	L(bkP2QG)-L(bkPxQx)
	.int	L(bkP3QG)-L(bkPxQx)
	.int	L(bkP4QG)-L(bkPxQx)
	.int	L(bkP5QG)-L(bkPxQx)
	.int	L(bkP6QG)-L(bkPxQx)
	.int	L(bkP7QG)-L(bkPxQx)

	.int	L(bkP0QH)-L(bkPxQx)
	.int	L(bkP1QH)-L(bkPxQx)
	.int	L(bkP2QH)-L(bkPxQx)
	.int	L(bkP3QH)-L(bkPxQx)
	.int	L(bkP4QH)-L(bkPxQx)
	.int	L(bkP5QH)-L(bkPxQx)
	.int	L(bkP6QH)-L(bkPxQx)
	.int	L(bkP7QH)-L(bkPxQx)

	.int	L(bkP0QI)-L(bkPxQx)
	.int	L(bkP1QI)-L(bkPxQx)
	.int	L(bkP2QI)-L(bkPxQx)
	.int	L(bkP3QI)-L(bkPxQx)
	.int	L(bkP4QI)-L(bkPxQx)
	.int	L(bkP5QI)-L(bkPxQx)
	.int	L(bkP6QI)-L(bkPxQx)
	.int	L(bkP7QI)-L(bkPxQx)

	SET_SIZE(memmove)