/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2008, Intel Corporation
 * All rights reserved.
 */

/*
 * memcpy.s - copies two blocks of memory
 *	Implements memcpy() and memmove() libc primitives.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "cache.h"
#include "proc64_id.h"

#define	L(s) .memcpy/**/s

/*
 * memcpy algorithm overview:
 *
 * Thresholds used below were determined experimentally.
 *
 * Pseudo code:
 *
 * If (size <= 128 bytes) {
 *	do unrolled code (primarily 8-byte loads/stores) regardless of
 *	alignment.
 * } else {
 *	Align destination to 16-byte boundary
 *
 *	if (NO_SSE) {
 *		If (size > half of the largest level cache) {
 *			Use 8-byte non-temporal stores (64-bytes/loop)
 *		} else {
 *			if (size > 4K && size <= half l1 cache size) {
 *				Use rep movsq
 *			} else {
 *				Use 8-byte loads/stores (64 bytes per loop)
 *			}
 *		}
 *
 *	} else { **USE SSE**
 *		If (size > half of the largest level cache) {
 *			Use 16-byte non-temporal stores (128-bytes per loop)
 *		} else {
 *			If (both source and destination are aligned) {
 *				Use 16-byte aligned loads and stores
 *				(128 bytes/loop)
 *			} else {
 *				use pairs of xmm registers with SSE2 or SSSE3
 *				instructions to concatenate and shift
 *				appropriately to account for source
 *				unalignment. This enables 16-byte aligned
 *				loads to be done.
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via unrolled code above.
 * }
 *
 * memmove overview:
 *	memmove is the same as memcpy except one case where copy needs to be
 *	done backwards. The copy backwards code is done in a similar manner.
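 *
 * As a rough illustration of the dispatch above, here is a hedged C sketch.
 * The names use_sse, largest_level_cache_size, l1_cache_half, aligned16 and
 * the copy_* helpers are hypothetical stand-ins, not symbols from this file;
 * the thresholds are the ones quoted in the pseudo code, and each helper is
 * assumed to finish its own tail via the unrolled small-copy code:
 *
 *	void
 *	memcpy_sketch(void *dst, const void *src, size_t n)
 *	{
 *		if (n <= 128) {
 *			copy_small_unrolled(dst, src, n);
 *			return;
 *		}
 *		align_dst_to_16(&dst, &src, &n);
 *		if (!use_sse) {
 *			if (n > largest_level_cache_size / 2)
 *				copy_8byte_nontemporal(dst, src, n);
 *			else if (n > 4096 && n <= l1_cache_half)
 *				copy_rep_movsq(dst, src, n);
 *			else
 *				copy_8byte_loop(dst, src, n);
 *		} else if (n > largest_level_cache_size / 2) {
 *			copy_16byte_nontemporal(dst, src, n);
 *		} else if (aligned16(src) && aligned16(dst)) {
 *			copy_16byte_both_aligned(dst, src, n);
 *		} else {
 *			copy_16byte_shift_merge(dst, src, n);
 *		}
 *	}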
 */

	ENTRY(memmove)
	cmp	%rsi,%rdi	# if dst <= src
	jbe	L(CopyForward)	# then do copy forward
	mov	%rsi,%r9	# move src to r9
	add	%rdx,%r9	# add len to get addr of end of src
	cmp	%r9,%rdi	# if dst < end of src
	jb	L(CopyBackwards)	# then do copy backwards
	jmp	L(CopyForward)

	ENTRY (memcpy)
L(CopyForward):
	mov	%rdx,%r8
	mov	%rdi,%rcx
	mov	%rsi,%rdx
	mov	%rdi,%rax
	lea	L(fwdPxQx)(%rip),%r11
	cmp	$0x80,%r8	# 128
	jg	L(ck_use_sse2)
	add	%r8,%rcx
	add	%r8,%rdx

	movslq	(%r11,%r8,4),%r10
	lea	(%r10,%r11,1),%r11
	jmpq	*%r11

	.balign 16
L(ShrtAlignNew):
	lea	L(AliPxQx)(%rip),%r11
	mov	%rcx,%r9
	and	$0xf,%r9

	movslq	(%r11,%r9,4),%r10
	lea	(%r10,%r11,1),%r11
	jmpq	*%r11

	.balign 16
L(fwdPxQx):	.int	L(P0Q0)-L(fwdPxQx)
	.int	L(P1Q0)-L(fwdPxQx)
	.int	L(P2Q0)-L(fwdPxQx)
	.int	L(P3Q0)-L(fwdPxQx)
	.int	L(P4Q0)-L(fwdPxQx)
	.int	L(P5Q0)-L(fwdPxQx)
	.int	L(P6Q0)-L(fwdPxQx)
	.int	L(P7Q0)-L(fwdPxQx)

	.int	L(P0Q1)-L(fwdPxQx)
	.int	L(P1Q1)-L(fwdPxQx)
	.int	L(P2Q1)-L(fwdPxQx)
	.int	L(P3Q1)-L(fwdPxQx)
	.int	L(P4Q1)-L(fwdPxQx)
	.int	L(P5Q1)-L(fwdPxQx)
	.int	L(P6Q1)-L(fwdPxQx)
	.int	L(P7Q1)-L(fwdPxQx)

	.int	L(P0Q2)-L(fwdPxQx)
	.int	L(P1Q2)-L(fwdPxQx)
	.int	L(P2Q2)-L(fwdPxQx)
	.int	L(P3Q2)-L(fwdPxQx)
	.int	L(P4Q2)-L(fwdPxQx)
	.int	L(P5Q2)-L(fwdPxQx)
	.int	L(P6Q2)-L(fwdPxQx)
	.int	L(P7Q2)-L(fwdPxQx)

	.int	L(P0Q3)-L(fwdPxQx)
	.int	L(P1Q3)-L(fwdPxQx)
	.int	L(P2Q3)-L(fwdPxQx)
	.int	L(P3Q3)-L(fwdPxQx)
	.int	L(P4Q3)-L(fwdPxQx)
	.int	L(P5Q3)-L(fwdPxQx)
	.int	L(P6Q3)-L(fwdPxQx)
	.int	L(P7Q3)-L(fwdPxQx)

	.int	L(P0Q4)-L(fwdPxQx)
	.int	L(P1Q4)-L(fwdPxQx)
	.int	L(P2Q4)-L(fwdPxQx)
	.int	L(P3Q4)-L(fwdPxQx)
	.int	L(P4Q4)-L(fwdPxQx)
	.int	L(P5Q4)-L(fwdPxQx)
	.int	L(P6Q4)-L(fwdPxQx)
	.int	L(P7Q4)-L(fwdPxQx)

	.int	L(P0Q5)-L(fwdPxQx)
	.int	L(P1Q5)-L(fwdPxQx)
	.int	L(P2Q5)-L(fwdPxQx)
	.int	L(P3Q5)-L(fwdPxQx)
	.int	L(P4Q5)-L(fwdPxQx)
	.int	L(P5Q5)-L(fwdPxQx)
	.int	L(P6Q5)-L(fwdPxQx)
	.int	L(P7Q5)-L(fwdPxQx)

	.int	L(P0Q6)-L(fwdPxQx)
	.int	L(P1Q6)-L(fwdPxQx)
	.int	L(P2Q6)-L(fwdPxQx)
	.int	L(P3Q6)-L(fwdPxQx)
	.int	L(P4Q6)-L(fwdPxQx)
	.int	L(P5Q6)-L(fwdPxQx)
	.int	L(P6Q6)-L(fwdPxQx)
	.int	L(P7Q6)-L(fwdPxQx)

	.int	L(P0Q7)-L(fwdPxQx)
	.int	L(P1Q7)-L(fwdPxQx)
	.int	L(P2Q7)-L(fwdPxQx)
	.int	L(P3Q7)-L(fwdPxQx)
	.int	L(P4Q7)-L(fwdPxQx)
	.int	L(P5Q7)-L(fwdPxQx)
	.int	L(P6Q7)-L(fwdPxQx)
	.int	L(P7Q7)-L(fwdPxQx)

	.int	L(P0Q8)-L(fwdPxQx)
	.int	L(P1Q8)-L(fwdPxQx)
	.int	L(P2Q8)-L(fwdPxQx)
	.int	L(P3Q8)-L(fwdPxQx)
	.int	L(P4Q8)-L(fwdPxQx)
	.int	L(P5Q8)-L(fwdPxQx)
	.int	L(P6Q8)-L(fwdPxQx)
	.int	L(P7Q8)-L(fwdPxQx)

	.int	L(P0Q9)-L(fwdPxQx)
	.int	L(P1Q9)-L(fwdPxQx)
	.int	L(P2Q9)-L(fwdPxQx)
	.int	L(P3Q9)-L(fwdPxQx)
	.int	L(P4Q9)-L(fwdPxQx)
	.int	L(P5Q9)-L(fwdPxQx)
	.int	L(P6Q9)-L(fwdPxQx)
	.int	L(P7Q9)-L(fwdPxQx)

	.int	L(P0QA)-L(fwdPxQx)
	.int	L(P1QA)-L(fwdPxQx)
	.int	L(P2QA)-L(fwdPxQx)
	.int	L(P3QA)-L(fwdPxQx)
	.int	L(P4QA)-L(fwdPxQx)
	.int	L(P5QA)-L(fwdPxQx)
	.int	L(P6QA)-L(fwdPxQx)
	.int	L(P7QA)-L(fwdPxQx)

	.int	L(P0QB)-L(fwdPxQx)
	.int	L(P1QB)-L(fwdPxQx)
	.int	L(P2QB)-L(fwdPxQx)
	.int	L(P3QB)-L(fwdPxQx)
	.int	L(P4QB)-L(fwdPxQx)
	.int	L(P5QB)-L(fwdPxQx)
	.int	L(P6QB)-L(fwdPxQx)
	.int	L(P7QB)-L(fwdPxQx)

	.int
L(P0QC)-L(fwdPxQx) 241 .int L(P1QC)-L(fwdPxQx) 242 .int L(P2QC)-L(fwdPxQx) 243 .int L(P3QC)-L(fwdPxQx) 244 .int L(P4QC)-L(fwdPxQx) 245 .int L(P5QC)-L(fwdPxQx) 246 .int L(P6QC)-L(fwdPxQx) 247 .int L(P7QC)-L(fwdPxQx) 248 249 .int L(P0QD)-L(fwdPxQx) 250 .int L(P1QD)-L(fwdPxQx) 251 .int L(P2QD)-L(fwdPxQx) 252 .int L(P3QD)-L(fwdPxQx) 253 .int L(P4QD)-L(fwdPxQx) 254 .int L(P5QD)-L(fwdPxQx) 255 .int L(P6QD)-L(fwdPxQx) 256 .int L(P7QD)-L(fwdPxQx) 257 258 .int L(P0QE)-L(fwdPxQx) 259 .int L(P1QE)-L(fwdPxQx) 260 .int L(P2QE)-L(fwdPxQx) 261 .int L(P3QE)-L(fwdPxQx) 262 .int L(P4QE)-L(fwdPxQx) 263 .int L(P5QE)-L(fwdPxQx) 264 .int L(P6QE)-L(fwdPxQx) 265 .int L(P7QE)-L(fwdPxQx) 266 267 .int L(P0QF)-L(fwdPxQx) 268 .int L(P1QF)-L(fwdPxQx) 269 .int L(P2QF)-L(fwdPxQx) 270 .int L(P3QF)-L(fwdPxQx) 271 .int L(P4QF)-L(fwdPxQx) 272 .int L(P5QF)-L(fwdPxQx) 273 .int L(P6QF)-L(fwdPxQx) 274 .int L(P7QF)-L(fwdPxQx) 275 276 .int L(P0QG)-L(fwdPxQx) # 0x80 277 278 .balign 16 279L(AliPxQx): .int L(now_qw_aligned)-L(AliPxQx) 280 .int L(A1Q0)-L(AliPxQx) 281 .int L(A2Q0)-L(AliPxQx) 282 .int L(A3Q0)-L(AliPxQx) 283 .int L(A4Q0)-L(AliPxQx) 284 .int L(A5Q0)-L(AliPxQx) 285 .int L(A6Q0)-L(AliPxQx) 286 .int L(A7Q0)-L(AliPxQx) 287 .int L(A0Q1)-L(AliPxQx) 288 .int L(A1Q1)-L(AliPxQx) 289 .int L(A2Q1)-L(AliPxQx) 290 .int L(A3Q1)-L(AliPxQx) 291 .int L(A4Q1)-L(AliPxQx) 292 .int L(A5Q1)-L(AliPxQx) 293 .int L(A6Q1)-L(AliPxQx) 294 .int L(A7Q1)-L(AliPxQx) 295 296 .balign 16 297L(A1Q0): # ; need to move 8+ 7=1+2+4 bytes 298 movzbq (%rdx),%r11 299 sub $0xf,%r8 300 mov %r11b,(%rcx) 301 302 movzwq 0x1(%rdx),%r10 303 mov %r10w,0x1(%rcx) 304 305 mov 0x3(%rdx),%r9d 306 mov %r9d,0x3(%rcx) 307 308 mov 0x7(%rdx),%r11 309 add $0xf,%rdx 310 mov %r11,0x7(%rcx) 311 312 add $0xf,%rcx 313 jmp L(now_qw_aligned) 314 315 .balign 16 316L(A2Q0): # ; need to move 8+ 6=2+4 bytes 317 movzwq (%rdx),%r10 318 sub $0xe,%r8 319 mov %r10w,(%rcx) 320 321 mov 0x2(%rdx),%r9d 322 mov %r9d,0x2(%rcx) 323 324 mov 0x6(%rdx),%r11 325 add $0xe,%rdx 326 mov %r11,0x6(%rcx) 327 add $0xe,%rcx 328 jmp L(now_qw_aligned) 329 330 .balign 16 331L(A3Q0): # ; need to move 8+ 5=1+4 bytes 332 movzbq (%rdx),%r11 333 sub $0xd,%r8 334 mov %r11b,(%rcx) 335 336 mov 0x1(%rdx),%r9d 337 mov %r9d,0x1(%rcx) 338 339 mov 0x5(%rdx),%r10 340 add $0xd,%rdx 341 mov %r10,0x5(%rcx) 342 343 add $0xd,%rcx 344 jmp L(now_qw_aligned) 345 346 .balign 16 347L(A4Q0): # ; need to move 8+4 bytes 348 mov (%rdx),%r9d 349 sub $0xc,%r8 350 mov %r9d,(%rcx) 351 352 mov 0x4(%rdx),%r10 353 add $0xc,%rdx 354 mov %r10,0x4(%rcx) 355 356 add $0xc,%rcx 357 jmp L(now_qw_aligned) 358 359 .balign 16 360L(A5Q0): # ; need to move 8+ 3=1+2 bytes 361 movzbq (%rdx),%r11 362 sub $0xb,%r8 363 mov %r11b,(%rcx) 364 365 movzwq 0x1(%rdx),%r10 366 mov %r10w,0x1(%rcx) 367 368 mov 0x3(%rdx),%r9 369 add $0xb,%rdx 370 mov %r9,0x3(%rcx) 371 372 add $0xb,%rcx 373 jmp L(now_qw_aligned) 374 375 .balign 16 376L(A6Q0): # ; need to move 8+2 bytes 377 movzwq (%rdx),%r10 378 sub $0xa,%r8 379 mov %r10w,(%rcx) 380 381 mov 0x2(%rdx),%r9 382 add $0xa,%rdx 383 mov %r9,0x2(%rcx) 384 385 add $0xa,%rcx 386 jmp L(now_qw_aligned) 387 388 .balign 16 389L(A7Q0): # ; need to move 8+1 byte 390 movzbq (%rdx),%r11 391 sub $0x9,%r8 392 mov %r11b,(%rcx) 393 394 mov 0x1(%rdx),%r10 395 add $0x9,%rdx 396 mov %r10,0x1(%rcx) 397 398 add $0x9,%rcx 399 jmp L(now_qw_aligned) 400 401 .balign 16 402L(A0Q1): # ; need to move 8 bytes 403 404 mov (%rdx),%r10 405 add $0x8,%rdx 406 sub $0x8,%r8 407 mov %r10,(%rcx) 408 409 add $0x8,%rcx 410 jmp L(now_qw_aligned) 411 412 .balign 16 413L(A1Q1): # 
; need to move 7=1+2+4 bytes 414 movzbq (%rdx),%r11 415 sub $0x7,%r8 416 mov %r11b,(%rcx) 417 418 movzwq 0x1(%rdx),%r10 419 mov %r10w,0x1(%rcx) 420 421 mov 0x3(%rdx),%r9d 422 add $0x7,%rdx 423 mov %r9d,0x3(%rcx) 424 add $0x7,%rcx 425 jmp L(now_qw_aligned) 426 427 .balign 16 428L(A2Q1): # ; need to move 6=2+4 bytes 429 movzwq (%rdx),%r10 430 sub $0x6,%r8 431 mov %r10w,(%rcx) 432 mov 0x2(%rdx),%r9d 433 add $0x6,%rdx 434 mov %r9d,0x2(%rcx) 435 add $0x6,%rcx 436 jmp L(now_qw_aligned) 437 438 .balign 16 439L(A3Q1): # ; need to move 5=1+4 bytes 440 movzbq (%rdx),%r11 441 sub $0x5,%r8 442 mov %r11b,(%rcx) 443 mov 0x1(%rdx),%r9d 444 add $0x5,%rdx 445 mov %r9d,0x1(%rcx) 446 add $0x5,%rcx 447 jmp L(now_qw_aligned) 448 449 .balign 16 450L(A4Q1): # ; need to move 4 bytes 451 mov (%rdx),%r9d 452 sub $0x4,%r8 453 add $0x4,%rdx 454 mov %r9d,(%rcx) 455 add $0x4,%rcx 456 jmp L(now_qw_aligned) 457 458 .balign 16 459L(A5Q1): # ; need to move 3=1+2 bytes 460 movzbq (%rdx),%r11 461 sub $0x3,%r8 462 mov %r11b,(%rcx) 463 464 movzwq 0x1(%rdx),%r10 465 add $0x3,%rdx 466 mov %r10w,0x1(%rcx) 467 468 add $0x3,%rcx 469 jmp L(now_qw_aligned) 470 471 .balign 16 472L(A6Q1): # ; need to move 2 bytes 473 movzwq (%rdx),%r10 474 sub $0x2,%r8 475 add $0x2,%rdx 476 mov %r10w,(%rcx) 477 add $0x2,%rcx 478 jmp L(now_qw_aligned) 479 480 .balign 16 481L(A7Q1): # ; need to move 1 byte 482 movzbq (%rdx),%r11 483 dec %r8 484 inc %rdx 485 mov %r11b,(%rcx) 486 inc %rcx 487 jmp L(now_qw_aligned) 488 489 490 .balign 16 491L(P0QG): 492 mov -0x80(%rdx),%r9 493 mov %r9,-0x80(%rcx) 494L(P0QF): 495 mov -0x78(%rdx),%r10 496 mov %r10,-0x78(%rcx) 497L(P0QE): 498 mov -0x70(%rdx),%r9 499 mov %r9,-0x70(%rcx) 500L(P0QD): 501 mov -0x68(%rdx),%r10 502 mov %r10,-0x68(%rcx) 503L(P0QC): 504 mov -0x60(%rdx),%r9 505 mov %r9,-0x60(%rcx) 506L(P0QB): 507 mov -0x58(%rdx),%r10 508 mov %r10,-0x58(%rcx) 509L(P0QA): 510 mov -0x50(%rdx),%r9 511 mov %r9,-0x50(%rcx) 512L(P0Q9): 513 mov -0x48(%rdx),%r10 514 mov %r10,-0x48(%rcx) 515L(P0Q8): 516 mov -0x40(%rdx),%r9 517 mov %r9,-0x40(%rcx) 518L(P0Q7): 519 mov -0x38(%rdx),%r10 520 mov %r10,-0x38(%rcx) 521L(P0Q6): 522 mov -0x30(%rdx),%r9 523 mov %r9,-0x30(%rcx) 524L(P0Q5): 525 mov -0x28(%rdx),%r10 526 mov %r10,-0x28(%rcx) 527L(P0Q4): 528 mov -0x20(%rdx),%r9 529 mov %r9,-0x20(%rcx) 530L(P0Q3): 531 mov -0x18(%rdx),%r10 532 mov %r10,-0x18(%rcx) 533L(P0Q2): 534 mov -0x10(%rdx),%r9 535 mov %r9,-0x10(%rcx) 536L(P0Q1): 537 mov -0x8(%rdx),%r10 538 mov %r10,-0x8(%rcx) 539L(P0Q0): 540 ret 541 542 .balign 16 543L(P1QF): 544 mov -0x79(%rdx),%r9 545 mov %r9,-0x79(%rcx) 546L(P1QE): 547 mov -0x71(%rdx),%r11 548 mov %r11,-0x71(%rcx) 549L(P1QD): 550 mov -0x69(%rdx),%r10 551 mov %r10,-0x69(%rcx) 552L(P1QC): 553 mov -0x61(%rdx),%r9 554 mov %r9,-0x61(%rcx) 555L(P1QB): 556 mov -0x59(%rdx),%r11 557 mov %r11,-0x59(%rcx) 558L(P1QA): 559 mov -0x51(%rdx),%r10 560 mov %r10,-0x51(%rcx) 561L(P1Q9): 562 mov -0x49(%rdx),%r9 563 mov %r9,-0x49(%rcx) 564L(P1Q8): 565 mov -0x41(%rdx),%r11 566 mov %r11,-0x41(%rcx) 567L(P1Q7): 568 mov -0x39(%rdx),%r10 569 mov %r10,-0x39(%rcx) 570L(P1Q6): 571 mov -0x31(%rdx),%r9 572 mov %r9,-0x31(%rcx) 573L(P1Q5): 574 mov -0x29(%rdx),%r11 575 mov %r11,-0x29(%rcx) 576L(P1Q4): 577 mov -0x21(%rdx),%r10 578 mov %r10,-0x21(%rcx) 579L(P1Q3): 580 mov -0x19(%rdx),%r9 581 mov %r9,-0x19(%rcx) 582L(P1Q2): 583 mov -0x11(%rdx),%r11 584 mov %r11,-0x11(%rcx) 585L(P1Q1): 586 mov -0x9(%rdx),%r10 587 mov %r10,-0x9(%rcx) 588L(P1Q0): 589 movzbq -0x1(%rdx),%r9 590 mov %r9b,-0x1(%rcx) 591 ret 592 593 .balign 16 594L(P2QF): 595 mov -0x7a(%rdx),%r9 596 
mov %r9,-0x7a(%rcx) 597L(P2QE): 598 mov -0x72(%rdx),%r11 599 mov %r11,-0x72(%rcx) 600L(P2QD): 601 mov -0x6a(%rdx),%r10 602 mov %r10,-0x6a(%rcx) 603L(P2QC): 604 mov -0x62(%rdx),%r9 605 mov %r9,-0x62(%rcx) 606L(P2QB): 607 mov -0x5a(%rdx),%r11 608 mov %r11,-0x5a(%rcx) 609L(P2QA): 610 mov -0x52(%rdx),%r10 611 mov %r10,-0x52(%rcx) 612L(P2Q9): 613 mov -0x4a(%rdx),%r9 614 mov %r9,-0x4a(%rcx) 615L(P2Q8): 616 mov -0x42(%rdx),%r11 617 mov %r11,-0x42(%rcx) 618L(P2Q7): 619 mov -0x3a(%rdx),%r10 620 mov %r10,-0x3a(%rcx) 621L(P2Q6): 622 mov -0x32(%rdx),%r9 623 mov %r9,-0x32(%rcx) 624L(P2Q5): 625 mov -0x2a(%rdx),%r11 626 mov %r11,-0x2a(%rcx) 627L(P2Q4): 628 mov -0x22(%rdx),%r10 629 mov %r10,-0x22(%rcx) 630L(P2Q3): 631 mov -0x1a(%rdx),%r9 632 mov %r9,-0x1a(%rcx) 633L(P2Q2): 634 mov -0x12(%rdx),%r11 635 mov %r11,-0x12(%rcx) 636L(P2Q1): 637 mov -0xa(%rdx),%r10 638 mov %r10,-0xa(%rcx) 639L(P2Q0): 640 movzwq -0x2(%rdx),%r9 641 mov %r9w,-0x2(%rcx) 642 ret 643 644 .balign 16 645L(P3QF): 646 mov -0x7b(%rdx),%r9 647 mov %r9,-0x7b(%rcx) 648L(P3QE): 649 mov -0x73(%rdx),%r11 650 mov %r11,-0x73(%rcx) 651L(P3QD): 652 mov -0x6b(%rdx),%r10 653 mov %r10,-0x6b(%rcx) 654L(P3QC): 655 mov -0x63(%rdx),%r9 656 mov %r9,-0x63(%rcx) 657L(P3QB): 658 mov -0x5b(%rdx),%r11 659 mov %r11,-0x5b(%rcx) 660L(P3QA): 661 mov -0x53(%rdx),%r10 662 mov %r10,-0x53(%rcx) 663L(P3Q9): 664 mov -0x4b(%rdx),%r9 665 mov %r9,-0x4b(%rcx) 666L(P3Q8): 667 mov -0x43(%rdx),%r11 668 mov %r11,-0x43(%rcx) 669L(P3Q7): 670 mov -0x3b(%rdx),%r10 671 mov %r10,-0x3b(%rcx) 672L(P3Q6): 673 mov -0x33(%rdx),%r9 674 mov %r9,-0x33(%rcx) 675L(P3Q5): 676 mov -0x2b(%rdx),%r11 677 mov %r11,-0x2b(%rcx) 678L(P3Q4): 679 mov -0x23(%rdx),%r10 680 mov %r10,-0x23(%rcx) 681L(P3Q3): 682 mov -0x1b(%rdx),%r9 683 mov %r9,-0x1b(%rcx) 684L(P3Q2): 685 mov -0x13(%rdx),%r11 686 mov %r11,-0x13(%rcx) 687L(P3Q1): 688 mov -0xb(%rdx),%r10 689 mov %r10,-0xb(%rcx) 690 /* 691 * These trailing loads/stores have to do all their loads 1st, 692 * then do the stores. 
693 */ 694L(P3Q0): 695 movzwq -0x3(%rdx),%r9 696 movzbq -0x1(%rdx),%r10 697 mov %r9w,-0x3(%rcx) 698 mov %r10b,-0x1(%rcx) 699 ret 700 701 .balign 16 702L(P4QF): 703 mov -0x7c(%rdx),%r9 704 mov %r9,-0x7c(%rcx) 705L(P4QE): 706 mov -0x74(%rdx),%r11 707 mov %r11,-0x74(%rcx) 708L(P4QD): 709 mov -0x6c(%rdx),%r10 710 mov %r10,-0x6c(%rcx) 711L(P4QC): 712 mov -0x64(%rdx),%r9 713 mov %r9,-0x64(%rcx) 714L(P4QB): 715 mov -0x5c(%rdx),%r11 716 mov %r11,-0x5c(%rcx) 717L(P4QA): 718 mov -0x54(%rdx),%r10 719 mov %r10,-0x54(%rcx) 720L(P4Q9): 721 mov -0x4c(%rdx),%r9 722 mov %r9,-0x4c(%rcx) 723L(P4Q8): 724 mov -0x44(%rdx),%r11 725 mov %r11,-0x44(%rcx) 726L(P4Q7): 727 mov -0x3c(%rdx),%r10 728 mov %r10,-0x3c(%rcx) 729L(P4Q6): 730 mov -0x34(%rdx),%r9 731 mov %r9,-0x34(%rcx) 732L(P4Q5): 733 mov -0x2c(%rdx),%r11 734 mov %r11,-0x2c(%rcx) 735L(P4Q4): 736 mov -0x24(%rdx),%r10 737 mov %r10,-0x24(%rcx) 738L(P4Q3): 739 mov -0x1c(%rdx),%r9 740 mov %r9,-0x1c(%rcx) 741L(P4Q2): 742 mov -0x14(%rdx),%r11 743 mov %r11,-0x14(%rcx) 744L(P4Q1): 745 mov -0xc(%rdx),%r10 746 mov %r10,-0xc(%rcx) 747L(P4Q0): 748 mov -0x4(%rdx),%r9d 749 mov %r9d,-0x4(%rcx) 750 ret 751 752 .balign 16 753L(P5QF): 754 mov -0x7d(%rdx),%r9 755 mov %r9,-0x7d(%rcx) 756L(P5QE): 757 mov -0x75(%rdx),%r11 758 mov %r11,-0x75(%rcx) 759L(P5QD): 760 mov -0x6d(%rdx),%r10 761 mov %r10,-0x6d(%rcx) 762L(P5QC): 763 mov -0x65(%rdx),%r9 764 mov %r9,-0x65(%rcx) 765L(P5QB): 766 mov -0x5d(%rdx),%r11 767 mov %r11,-0x5d(%rcx) 768L(P5QA): 769 mov -0x55(%rdx),%r10 770 mov %r10,-0x55(%rcx) 771L(P5Q9): 772 mov -0x4d(%rdx),%r9 773 mov %r9,-0x4d(%rcx) 774L(P5Q8): 775 mov -0x45(%rdx),%r11 776 mov %r11,-0x45(%rcx) 777L(P5Q7): 778 mov -0x3d(%rdx),%r10 779 mov %r10,-0x3d(%rcx) 780L(P5Q6): 781 mov -0x35(%rdx),%r9 782 mov %r9,-0x35(%rcx) 783L(P5Q5): 784 mov -0x2d(%rdx),%r11 785 mov %r11,-0x2d(%rcx) 786L(P5Q4): 787 mov -0x25(%rdx),%r10 788 mov %r10,-0x25(%rcx) 789L(P5Q3): 790 mov -0x1d(%rdx),%r9 791 mov %r9,-0x1d(%rcx) 792L(P5Q2): 793 mov -0x15(%rdx),%r11 794 mov %r11,-0x15(%rcx) 795L(P5Q1): 796 mov -0xd(%rdx),%r10 797 mov %r10,-0xd(%rcx) 798 /* 799 * These trailing loads/stores have to do all their loads 1st, 800 * then do the stores. 801 */ 802L(P5Q0): 803 mov -0x5(%rdx),%r9d 804 movzbq -0x1(%rdx),%r10 805 mov %r9d,-0x5(%rcx) 806 mov %r10b,-0x1(%rcx) 807 ret 808 809 .balign 16 810L(P6QF): 811 mov -0x7e(%rdx),%r9 812 mov %r9,-0x7e(%rcx) 813L(P6QE): 814 mov -0x76(%rdx),%r11 815 mov %r11,-0x76(%rcx) 816L(P6QD): 817 mov -0x6e(%rdx),%r10 818 mov %r10,-0x6e(%rcx) 819L(P6QC): 820 mov -0x66(%rdx),%r9 821 mov %r9,-0x66(%rcx) 822L(P6QB): 823 mov -0x5e(%rdx),%r11 824 mov %r11,-0x5e(%rcx) 825L(P6QA): 826 mov -0x56(%rdx),%r10 827 mov %r10,-0x56(%rcx) 828L(P6Q9): 829 mov -0x4e(%rdx),%r9 830 mov %r9,-0x4e(%rcx) 831L(P6Q8): 832 mov -0x46(%rdx),%r11 833 mov %r11,-0x46(%rcx) 834L(P6Q7): 835 mov -0x3e(%rdx),%r10 836 mov %r10,-0x3e(%rcx) 837L(P6Q6): 838 mov -0x36(%rdx),%r9 839 mov %r9,-0x36(%rcx) 840L(P6Q5): 841 mov -0x2e(%rdx),%r11 842 mov %r11,-0x2e(%rcx) 843L(P6Q4): 844 mov -0x26(%rdx),%r10 845 mov %r10,-0x26(%rcx) 846L(P6Q3): 847 mov -0x1e(%rdx),%r9 848 mov %r9,-0x1e(%rcx) 849L(P6Q2): 850 mov -0x16(%rdx),%r11 851 mov %r11,-0x16(%rcx) 852L(P6Q1): 853 mov -0xe(%rdx),%r10 854 mov %r10,-0xe(%rcx) 855 /* 856 * These trailing loads/stores have to do all their loads 1st, 857 * then do the stores. 
858 */ 859L(P6Q0): 860 mov -0x6(%rdx),%r9d 861 movzwq -0x2(%rdx),%r10 862 mov %r9d,-0x6(%rcx) 863 mov %r10w,-0x2(%rcx) 864 ret 865 866 .balign 16 867L(P7QF): 868 mov -0x7f(%rdx),%r9 869 mov %r9,-0x7f(%rcx) 870L(P7QE): 871 mov -0x77(%rdx),%r11 872 mov %r11,-0x77(%rcx) 873L(P7QD): 874 mov -0x6f(%rdx),%r10 875 mov %r10,-0x6f(%rcx) 876L(P7QC): 877 mov -0x67(%rdx),%r9 878 mov %r9,-0x67(%rcx) 879L(P7QB): 880 mov -0x5f(%rdx),%r11 881 mov %r11,-0x5f(%rcx) 882L(P7QA): 883 mov -0x57(%rdx),%r10 884 mov %r10,-0x57(%rcx) 885L(P7Q9): 886 mov -0x4f(%rdx),%r9 887 mov %r9,-0x4f(%rcx) 888L(P7Q8): 889 mov -0x47(%rdx),%r11 890 mov %r11,-0x47(%rcx) 891L(P7Q7): 892 mov -0x3f(%rdx),%r10 893 mov %r10,-0x3f(%rcx) 894L(P7Q6): 895 mov -0x37(%rdx),%r9 896 mov %r9,-0x37(%rcx) 897L(P7Q5): 898 mov -0x2f(%rdx),%r11 899 mov %r11,-0x2f(%rcx) 900L(P7Q4): 901 mov -0x27(%rdx),%r10 902 mov %r10,-0x27(%rcx) 903L(P7Q3): 904 mov -0x1f(%rdx),%r9 905 mov %r9,-0x1f(%rcx) 906L(P7Q2): 907 mov -0x17(%rdx),%r11 908 mov %r11,-0x17(%rcx) 909L(P7Q1): 910 mov -0xf(%rdx),%r10 911 mov %r10,-0xf(%rcx) 912 /* 913 * These trailing loads/stores have to do all their loads 1st, 914 * then do the stores. 915 */ 916L(P7Q0): 917 mov -0x7(%rdx),%r9d 918 movzwq -0x3(%rdx),%r10 919 movzbq -0x1(%rdx),%r11 920 mov %r9d,-0x7(%rcx) 921 mov %r10w,-0x3(%rcx) 922 mov %r11b,-0x1(%rcx) 923 ret 924 925 .balign 16 926L(ck_use_sse2): 927 /* 928 * Align dest to 16 byte boundary. 929 */ 930 test $0xf,%rcx 931 jnz L(ShrtAlignNew) 932 933L(now_qw_aligned): 934 cmpl $NO_SSE,.memops_method(%rip) 935 je L(Loop8byte_pre) 936 937 /* 938 * The fall-through path is to do SSE2 16-byte load/stores 939 */ 940 941 /* 942 * If current move size is larger than half of the highest level cache 943 * size, then do non-temporal moves. 944 */ 945 mov .largest_level_cache_size(%rip),%r9d 946 shr %r9 # take half of it 947 cmp %r9,%r8 948 jg L(sse2_nt_move) 949 950 /* 951 * If both the source and dest are aligned, then use the both aligned 952 * logic. Well aligned data should reap the rewards. 953 */ 954 test $0xf,%rdx 955 jz L(pre_both_aligned) 956 957 lea L(SSE_src)(%rip),%r10 # SSE2 (default) 958 testl $USE_SSSE3,.memops_method(%rip) 959 jz 1f 960 lea L(SSSE3_src)(%rip),%r10 # SSSE3 961 9621: 963 /* 964 * if the src is not 16 byte aligned... 
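 * the code below copies the first 16 bytes with an unaligned load, advances
 * the source past them and rounds it down to a 16-byte boundary, and then
 * uses the misalignment (1-15) to pick one of the shift-and-merge loops that
 * follow, so that all further loads can be 16-byte aligned.
 *
 * As a hedged illustration (not code from this file; the function name is
 * made up), producing one 16-byte output chunk for a source misalignment of
 * 1 byte - the L(movdqa1) case - looks roughly like this in C intrinsics:
 *
 *	#include <emmintrin.h>
 *
 *	__m128i
 *	merge_chunk_shift1(const __m128i *src16)
 *	{
 *		__m128i prev = _mm_load_si128(src16);
 *		__m128i next = _mm_load_si128(src16 + 1);
 *		prev = _mm_srli_si128(prev, 1);
 *		next = _mm_slli_si128(next, 15);
 *		return (_mm_or_si128(prev, next));
 *	}
 *
 * For a misalignment of k the shift counts are k and 16 - k; because these
 * must be immediates, there is one loop per misalignment value below, and
 * the SSSE3 variants do the same merge with a single palignr per chunk.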
965 */ 966 mov %rdx,%r11 967 and $0xf,%r11 968 movdqu (%rdx),%xmm0 969 movdqa %xmm0,(%rcx) 970 add $0x10,%rdx 971 sub %r11,%rdx 972 add $0x10,%rcx 973 sub $0x10,%r8 974 movdqa (%rdx),%xmm1 975 976 movslq (%r10,%r11,4),%r9 977 lea (%r9,%r10,1),%r10 978 jmpq *%r10 979 980 .balign 16 981L(SSSE3_src): .int L(pre_both_aligned)-L(SSSE3_src) 982 .int L(mov3dqa1) -L(SSSE3_src) 983 .int L(mov3dqa2) -L(SSSE3_src) 984 .int L(mov3dqa3) -L(SSSE3_src) 985 .int L(mov3dqa4) -L(SSSE3_src) 986 .int L(mov3dqa5) -L(SSSE3_src) 987 .int L(mov3dqa6) -L(SSSE3_src) 988 .int L(mov3dqa7) -L(SSSE3_src) 989 .int L(movdqa8) -L(SSSE3_src) 990 .int L(mov3dqa9) -L(SSSE3_src) 991 .int L(mov3dqa10)-L(SSSE3_src) 992 .int L(mov3dqa11)-L(SSSE3_src) 993 .int L(mov3dqa12)-L(SSSE3_src) 994 .int L(mov3dqa13)-L(SSSE3_src) 995 .int L(mov3dqa14)-L(SSSE3_src) 996 .int L(mov3dqa15)-L(SSSE3_src) 997L(SSE_src): .int L(pre_both_aligned)-L(SSE_src) 998 .int L(movdqa1) -L(SSE_src) 999 .int L(movdqa2) -L(SSE_src) 1000 .int L(movdqa3) -L(SSE_src) 1001 .int L(movdqa4) -L(SSE_src) 1002 .int L(movdqa5) -L(SSE_src) 1003 .int L(movdqa6) -L(SSE_src) 1004 .int L(movdqa7) -L(SSE_src) 1005 .int L(movdqa8) -L(SSE_src) 1006 .int L(movdqa9) -L(SSE_src) 1007 .int L(movdqa10)-L(SSE_src) 1008 .int L(movdqa11)-L(SSE_src) 1009 .int L(movdqa12)-L(SSE_src) 1010 .int L(movdqa13)-L(SSE_src) 1011 .int L(movdqa14)-L(SSE_src) 1012 .int L(movdqa15)-L(SSE_src) 1013 1014 .balign 16 1015L(movdqa1): 1016 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1017 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1018 lea 0x20(%rdx),%rdx 1019 lea -0x20(%r8),%r8 1020 1021 psrldq $0x1,%xmm1 # shift right prev buffer (saved from last iteration) 1022 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1023 pslldq $0xf,%xmm3 # shift the current buffer left (shift in zeros) 1024 por %xmm1,%xmm3 # OR them together 1025 cmp $0x20,%r8 1026 1027 psrldq $0x1,%xmm2 # shift right prev buffer (saved from last iteration) 1028 movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration 1029 pslldq $0xf,%xmm0 # shift the current buffer left (shift in zeros) 1030 por %xmm2,%xmm0 # OR them together 1031 movdqa %xmm3,(%rcx) # store it 1032 movdqa %xmm0,0x10(%rcx) # store it 1033 lea 0x20(%rcx),%rcx 1034 1035 jge L(movdqa1) 1036 jmp L(movdqa_epi) 1037 1038 .balign 16 1039L(movdqa2): 1040 sub $0x20,%r8 1041 movdqa 0x10(%rdx),%xmm3 1042 movdqa 0x20(%rdx),%xmm0 1043 add $0x20,%rdx 1044 1045 psrldq $0x2,%xmm1 1046 movdqa %xmm3,%xmm2 1047 pslldq $0xe,%xmm3 1048 por %xmm1,%xmm3 1049 1050 psrldq $0x2,%xmm2 1051 movdqa %xmm0,%xmm1 1052 pslldq $0xe,%xmm0 1053 por %xmm2,%xmm0 1054 movdqa %xmm3,(%rcx) 1055 movdqa %xmm0,0x10(%rcx) 1056 1057 add $0x20,%rcx 1058 cmp $0x20,%r8 1059 jge L(movdqa2) 1060 jmp L(movdqa_epi) 1061 1062 .balign 16 1063L(movdqa3): 1064 sub $0x20,%r8 1065 movdqa 0x10(%rdx),%xmm3 1066 movdqa 0x20(%rdx),%xmm0 1067 add $0x20,%rdx 1068 1069 psrldq $0x3,%xmm1 1070 movdqa %xmm3,%xmm2 1071 pslldq $0xd,%xmm3 1072 por %xmm1,%xmm3 1073 1074 psrldq $0x3,%xmm2 1075 movdqa %xmm0,%xmm1 1076 pslldq $0xd,%xmm0 1077 por %xmm2,%xmm0 1078 movdqa %xmm3,(%rcx) 1079 movdqa %xmm0,0x10(%rcx) 1080 1081 add $0x20,%rcx 1082 cmp $0x20,%r8 1083 jge L(movdqa3) 1084 jmp L(movdqa_epi) 1085 1086 .balign 16 1087L(movdqa4): 1088 sub $0x20,%r8 1089 movdqa 0x10(%rdx),%xmm3 1090 movdqa 0x20(%rdx),%xmm0 1091 add $0x20,%rdx 1092 1093 psrldq $0x4,%xmm1 1094 movdqa %xmm3,%xmm2 1095 pslldq $0xc,%xmm3 1096 por %xmm1,%xmm3 1097 1098 psrldq $0x4,%xmm2 1099 movdqa %xmm0,%xmm1 1100 pslldq $0xc,%xmm0 1101 por 
%xmm2,%xmm0 1102 1103 movdqa %xmm3,(%rcx) 1104 movdqa %xmm0,0x10(%rcx) 1105 1106 add $0x20,%rcx 1107 cmp $0x20,%r8 1108 jge L(movdqa4) 1109 jmp L(movdqa_epi) 1110 1111 .balign 16 1112L(movdqa5): 1113 sub $0x20,%r8 1114 movdqa 0x10(%rdx),%xmm3 1115 movdqa 0x20(%rdx),%xmm0 1116 add $0x20,%rdx 1117 1118 psrldq $0x5,%xmm1 1119 movdqa %xmm3,%xmm2 1120 pslldq $0xb,%xmm3 1121 por %xmm1,%xmm3 1122 1123 psrldq $0x5,%xmm2 1124 movdqa %xmm0,%xmm1 1125 pslldq $0xb,%xmm0 1126 por %xmm2,%xmm0 1127 1128 movdqa %xmm3,(%rcx) 1129 movdqa %xmm0,0x10(%rcx) 1130 1131 add $0x20,%rcx 1132 cmp $0x20,%r8 1133 jge L(movdqa5) 1134 jmp L(movdqa_epi) 1135 1136 .balign 16 1137L(movdqa6): 1138 sub $0x20,%r8 1139 movdqa 0x10(%rdx),%xmm3 1140 movdqa 0x20(%rdx),%xmm0 1141 add $0x20,%rdx 1142 1143 psrldq $0x6,%xmm1 1144 movdqa %xmm3,%xmm2 1145 pslldq $0xa,%xmm3 1146 por %xmm1,%xmm3 1147 1148 psrldq $0x6,%xmm2 1149 movdqa %xmm0,%xmm1 1150 pslldq $0xa,%xmm0 1151 por %xmm2,%xmm0 1152 movdqa %xmm3,(%rcx) 1153 movdqa %xmm0,0x10(%rcx) 1154 1155 add $0x20,%rcx 1156 cmp $0x20,%r8 1157 jge L(movdqa6) 1158 jmp L(movdqa_epi) 1159 1160 .balign 16 1161L(movdqa7): 1162 sub $0x20,%r8 1163 movdqa 0x10(%rdx),%xmm3 1164 movdqa 0x20(%rdx),%xmm0 1165 add $0x20,%rdx 1166 1167 psrldq $0x7,%xmm1 1168 movdqa %xmm3,%xmm2 1169 pslldq $0x9,%xmm3 1170 por %xmm1,%xmm3 1171 1172 psrldq $0x7,%xmm2 1173 movdqa %xmm0,%xmm1 1174 pslldq $0x9,%xmm0 1175 por %xmm2,%xmm0 1176 movdqa %xmm3,(%rcx) 1177 movdqa %xmm0,0x10(%rcx) 1178 1179 add $0x20,%rcx 1180 cmp $0x20,%r8 1181 jge L(movdqa7) 1182 jmp L(movdqa_epi) 1183 1184 .balign 16 1185L(movdqa8): 1186 movdqa 0x10(%rdx),%xmm3 1187 sub $0x30,%r8 1188 movdqa 0x20(%rdx),%xmm0 1189 movdqa 0x30(%rdx),%xmm5 1190 lea 0x30(%rdx),%rdx 1191 1192 shufpd $0x1,%xmm3,%xmm1 1193 movdqa %xmm1,(%rcx) 1194 1195 cmp $0x30,%r8 1196 1197 shufpd $0x1,%xmm0,%xmm3 1198 movdqa %xmm3,0x10(%rcx) 1199 1200 movdqa %xmm5,%xmm1 1201 shufpd $0x1,%xmm5,%xmm0 1202 movdqa %xmm0,0x20(%rcx) 1203 1204 lea 0x30(%rcx),%rcx 1205 1206 jge L(movdqa8) 1207 jmp L(movdqa_epi) 1208 1209 .balign 16 1210L(movdqa9): 1211 sub $0x20,%r8 1212 movdqa 0x10(%rdx),%xmm3 1213 movdqa 0x20(%rdx),%xmm0 1214 add $0x20,%rdx 1215 1216 psrldq $0x9,%xmm1 1217 movdqa %xmm3,%xmm2 1218 pslldq $0x7,%xmm3 1219 por %xmm1,%xmm3 1220 1221 psrldq $0x9,%xmm2 1222 movdqa %xmm0,%xmm1 1223 pslldq $0x7,%xmm0 1224 por %xmm2,%xmm0 1225 movdqa %xmm3,(%rcx) 1226 movdqa %xmm0,0x10(%rcx) 1227 1228 add $0x20,%rcx 1229 cmp $0x20,%r8 1230 jge L(movdqa9) 1231 jmp L(movdqa_epi) 1232 1233 .balign 16 1234L(movdqa10): 1235 sub $0x20,%r8 1236 movdqa 0x10(%rdx),%xmm3 1237 movdqa 0x20(%rdx),%xmm0 1238 add $0x20,%rdx 1239 1240 psrldq $0xa,%xmm1 1241 movdqa %xmm3,%xmm2 1242 pslldq $0x6,%xmm3 1243 por %xmm1,%xmm3 1244 1245 psrldq $0xa,%xmm2 1246 movdqa %xmm0,%xmm1 1247 pslldq $0x6,%xmm0 1248 por %xmm2,%xmm0 1249 movdqa %xmm3,(%rcx) 1250 movdqa %xmm0,0x10(%rcx) 1251 1252 add $0x20,%rcx 1253 cmp $0x20,%r8 1254 jge L(movdqa10) 1255 jmp L(movdqa_epi) 1256 1257 .balign 16 1258L(movdqa11): 1259 sub $0x20,%r8 1260 movdqa 0x10(%rdx),%xmm3 1261 movdqa 0x20(%rdx),%xmm0 1262 add $0x20,%rdx 1263 1264 psrldq $0xb,%xmm1 1265 movdqa %xmm3,%xmm2 1266 pslldq $0x5,%xmm3 1267 por %xmm1,%xmm3 1268 1269 psrldq $0xb,%xmm2 1270 movdqa %xmm0,%xmm1 1271 pslldq $0x5,%xmm0 1272 por %xmm2,%xmm0 1273 movdqa %xmm3,(%rcx) 1274 movdqa %xmm0,0x10(%rcx) 1275 1276 add $0x20,%rcx 1277 cmp $0x20,%r8 1278 jge L(movdqa11) 1279 jmp L(movdqa_epi) 1280 1281 .balign 16 1282L(movdqa12): 1283 sub $0x20,%r8 1284 movdqa 0x10(%rdx),%xmm3 1285 movdqa 
0x20(%rdx),%xmm0 1286 add $0x20,%rdx 1287 1288 psrldq $0xc,%xmm1 1289 movdqa %xmm3,%xmm2 1290 pslldq $0x4,%xmm3 1291 por %xmm1,%xmm3 1292 1293 psrldq $0xc,%xmm2 1294 movdqa %xmm0,%xmm1 1295 pslldq $0x4,%xmm0 1296 por %xmm2,%xmm0 1297 movdqa %xmm3,(%rcx) 1298 movdqa %xmm0,0x10(%rcx) 1299 1300 add $0x20,%rcx 1301 cmp $0x20,%r8 1302 jge L(movdqa12) 1303 jmp L(movdqa_epi) 1304 1305 .balign 16 1306L(movdqa13): 1307 sub $0x20,%r8 1308 movdqa 0x10(%rdx),%xmm3 1309 movdqa 0x20(%rdx),%xmm0 1310 add $0x20,%rdx 1311 1312 psrldq $0xd,%xmm1 1313 movdqa %xmm3,%xmm2 1314 pslldq $0x3,%xmm3 1315 por %xmm1,%xmm3 1316 1317 psrldq $0xd,%xmm2 1318 movdqa %xmm0,%xmm1 1319 pslldq $0x3,%xmm0 1320 por %xmm2,%xmm0 1321 movdqa %xmm3,(%rcx) 1322 movdqa %xmm0,0x10(%rcx) 1323 1324 add $0x20,%rcx 1325 cmp $0x20,%r8 1326 jge L(movdqa13) 1327 jmp L(movdqa_epi) 1328 1329 .balign 16 1330L(movdqa14): 1331 sub $0x20,%r8 1332 movdqa 0x10(%rdx),%xmm3 1333 movdqa 0x20(%rdx),%xmm0 1334 add $0x20,%rdx 1335 1336 psrldq $0xe,%xmm1 1337 movdqa %xmm3,%xmm2 1338 pslldq $0x2,%xmm3 1339 por %xmm1,%xmm3 1340 1341 psrldq $0xe,%xmm2 1342 movdqa %xmm0,%xmm1 1343 pslldq $0x2,%xmm0 1344 por %xmm2,%xmm0 1345 movdqa %xmm3,(%rcx) 1346 movdqa %xmm0,0x10(%rcx) 1347 1348 add $0x20,%rcx 1349 cmp $0x20,%r8 1350 jge L(movdqa14) 1351 jmp L(movdqa_epi) 1352 1353 .balign 16 1354L(movdqa15): 1355 sub $0x20,%r8 1356 movdqa 0x10(%rdx),%xmm3 1357 movdqa 0x20(%rdx),%xmm0 1358 add $0x20,%rdx 1359 1360 psrldq $0xf,%xmm1 1361 movdqa %xmm3,%xmm2 1362 pslldq $0x1,%xmm3 1363 por %xmm1,%xmm3 1364 1365 psrldq $0xf,%xmm2 1366 movdqa %xmm0,%xmm1 1367 pslldq $0x1,%xmm0 1368 por %xmm2,%xmm0 1369 movdqa %xmm3,(%rcx) 1370 movdqa %xmm0,0x10(%rcx) 1371 1372 add $0x20,%rcx 1373 cmp $0x20,%r8 1374 jge L(movdqa15) 1375 #jmp L(movdqa_epi) 1376 1377 .balign 16 1378L(movdqa_epi): 1379 lea L(fwdPxQx)(%rip),%r10 1380 add %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop) 1381 add %r8,%rcx 1382 add %r8,%rdx 1383 1384 movslq (%r10,%r8,4),%r9 1385 lea (%r9,%r10,1),%r10 1386 jmpq *%r10 1387 1388 .balign 16 1389L(mov3dqa1): 1390 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1391 sub $0x30,%r8 1392 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1393 movdqa 0x30(%rdx),%xmm5 # load the upper source buffer 1394 lea 0x30(%rdx),%rdx 1395 cmp $0x30,%r8 1396 1397 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1398 #palignr $0x1,%xmm1,%xmm3 1399 .byte 0x66,0x0f,0x3a,0x0f 1400 .byte 0xd9,0x01 1401 movdqa %xmm3,(%rcx) # store it 1402 1403 movdqa %xmm0,%xmm4 # store off xmm reg for use next iteration 1404 #palignr $0x1,%xmm2,%xmm0 1405 .byte 0x66,0x0f,0x3a,0x0f 1406 .byte 0xc2,0x01 1407 movdqa %xmm0,0x10(%rcx) # store it 1408 1409 movdqa %xmm5,%xmm1 # store off xmm reg for use next iteration 1410 #palignr $0x1,%xmm4,%xmm5 1411 .byte 0x66,0x0f,0x3a,0x0f 1412 .byte 0xec,0x01 1413 movdqa %xmm5,0x20(%rcx) # store it 1414 1415 lea 0x30(%rcx),%rcx 1416 jge L(mov3dqa1) 1417 1418 cmp $0x10,%r8 1419 jl L(movdqa_epi) 1420 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1421 sub $0x10,%r8 1422 lea 0x10(%rdx),%rdx 1423 movdqa %xmm3,%xmm2 # save for use next concat 1424 #palignr $0x1,%xmm1,%xmm3 1425 .byte 0x66,0x0f,0x3a,0x0f 1426 .byte 0xd9,0x01 1427 1428 cmp $0x10,%r8 1429 movdqa %xmm3,(%rcx) # store it 1430 lea 0x10(%rcx),%rcx 1431 jl L(movdqa_epi) 1432 1433 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1434 sub $0x10,%r8 1435 lea 0x10(%rdx),%rdx 1436 #palignr $0x1,%xmm2,%xmm0 1437 .byte 0x66,0x0f,0x3a,0x0f 1438 .byte 0xc2,0x01 1439 
movdqa %xmm0,(%rcx) # store it 1440 lea 0x10(%rcx),%rcx 1441 jmp L(movdqa_epi) 1442 1443 .balign 16 1444L(mov3dqa2): 1445 movdqa 0x10(%rdx),%xmm3 1446 sub $0x30,%r8 1447 movdqa 0x20(%rdx),%xmm0 1448 movdqa 0x30(%rdx),%xmm5 1449 lea 0x30(%rdx),%rdx 1450 cmp $0x30,%r8 1451 1452 movdqa %xmm3,%xmm2 1453 #palignr $0x2,%xmm1,%xmm3 1454 .byte 0x66,0x0f,0x3a,0x0f 1455 .byte 0xd9,0x02 1456 movdqa %xmm3,(%rcx) 1457 1458 movdqa %xmm0,%xmm4 1459 #palignr $0x2,%xmm2,%xmm0 1460 .byte 0x66,0x0f,0x3a,0x0f 1461 .byte 0xc2,0x02 1462 movdqa %xmm0,0x10(%rcx) 1463 1464 movdqa %xmm5,%xmm1 1465 #palignr $0x2,%xmm4,%xmm5 1466 .byte 0x66,0x0f,0x3a,0x0f 1467 .byte 0xec,0x02 1468 movdqa %xmm5,0x20(%rcx) 1469 1470 lea 0x30(%rcx),%rcx 1471 jge L(mov3dqa2) 1472 1473 cmp $0x10,%r8 1474 jl L(movdqa_epi) 1475 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1476 sub $0x10,%r8 1477 lea 0x10(%rdx),%rdx 1478 movdqa %xmm3,%xmm2 # save for use next concat 1479 #palignr $0x2,%xmm1,%xmm3 1480 .byte 0x66,0x0f,0x3a,0x0f 1481 .byte 0xd9,0x02 1482 1483 cmp $0x10,%r8 1484 movdqa %xmm3,(%rcx) # store it 1485 lea 0x10(%rcx),%rcx 1486 jl L(movdqa_epi) 1487 1488 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1489 sub $0x10,%r8 1490 lea 0x10(%rdx),%rdx 1491 #palignr $0x2,%xmm2,%xmm0 1492 .byte 0x66,0x0f,0x3a,0x0f 1493 .byte 0xc2,0x02 1494 movdqa %xmm0,(%rcx) # store it 1495 lea 0x10(%rcx),%rcx 1496 jmp L(movdqa_epi) 1497 1498 .balign 16 1499L(mov3dqa3): 1500 movdqa 0x10(%rdx),%xmm3 1501 sub $0x30,%r8 1502 movdqa 0x20(%rdx),%xmm0 1503 movdqa 0x30(%rdx),%xmm5 1504 lea 0x30(%rdx),%rdx 1505 cmp $0x30,%r8 1506 1507 movdqa %xmm3,%xmm2 1508 #palignr $0x3,%xmm1,%xmm3 1509 .byte 0x66,0x0f,0x3a,0x0f 1510 .byte 0xd9,0x03 1511 movdqa %xmm3,(%rcx) 1512 1513 movdqa %xmm0,%xmm4 1514 #palignr $0x3,%xmm2,%xmm0 1515 .byte 0x66,0x0f,0x3a,0x0f 1516 .byte 0xc2,0x03 1517 movdqa %xmm0,0x10(%rcx) 1518 1519 movdqa %xmm5,%xmm1 1520 #palignr $0x3,%xmm4,%xmm5 1521 .byte 0x66,0x0f,0x3a,0x0f 1522 .byte 0xec,0x03 1523 movdqa %xmm5,0x20(%rcx) 1524 1525 lea 0x30(%rcx),%rcx 1526 jge L(mov3dqa3) 1527 1528 cmp $0x10,%r8 1529 jl L(movdqa_epi) 1530 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1531 sub $0x10,%r8 1532 lea 0x10(%rdx),%rdx 1533 movdqa %xmm3,%xmm2 # save for use next concat 1534 #palignr $0x3,%xmm1,%xmm3 1535 .byte 0x66,0x0f,0x3a,0x0f 1536 .byte 0xd9,0x03 1537 1538 cmp $0x10,%r8 1539 movdqa %xmm3,(%rcx) # store it 1540 lea 0x10(%rcx),%rcx 1541 jl L(movdqa_epi) 1542 1543 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1544 sub $0x10,%r8 1545 lea 0x10(%rdx),%rdx 1546 #palignr $0x3,%xmm2,%xmm0 1547 .byte 0x66,0x0f,0x3a,0x0f 1548 .byte 0xc2,0x03 1549 movdqa %xmm0,(%rcx) # store it 1550 lea 0x10(%rcx),%rcx 1551 jmp L(movdqa_epi) 1552 1553 .balign 16 1554L(mov3dqa4): 1555 movdqa 0x10(%rdx),%xmm3 1556 sub $0x30,%r8 1557 movdqa 0x20(%rdx),%xmm0 1558 movdqa 0x30(%rdx),%xmm5 1559 lea 0x30(%rdx),%rdx 1560 cmp $0x30,%r8 1561 1562 movdqa %xmm3,%xmm2 1563 #palignr $0x4,%xmm1,%xmm3 1564 .byte 0x66,0x0f,0x3a,0x0f 1565 .byte 0xd9,0x04 1566 movdqa %xmm3,(%rcx) 1567 1568 movdqa %xmm0,%xmm4 1569 #palignr $0x4,%xmm2,%xmm0 1570 .byte 0x66,0x0f,0x3a,0x0f 1571 .byte 0xc2,0x04 1572 movdqa %xmm0,0x10(%rcx) 1573 1574 movdqa %xmm5,%xmm1 1575 #palignr $0x4,%xmm4,%xmm5 1576 .byte 0x66,0x0f,0x3a,0x0f 1577 .byte 0xec,0x04 1578 movdqa %xmm5,0x20(%rcx) 1579 1580 lea 0x30(%rcx),%rcx 1581 jge L(mov3dqa4) 1582 1583 cmp $0x10,%r8 1584 jl L(movdqa_epi) 1585 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1586 sub $0x10,%r8 1587 lea 0x10(%rdx),%rdx 1588 movdqa 
%xmm3,%xmm2 # save for use next concat 1589 #palignr $0x4,%xmm1,%xmm3 1590 .byte 0x66,0x0f,0x3a,0x0f 1591 .byte 0xd9,0x04 1592 1593 cmp $0x10,%r8 1594 movdqa %xmm3,(%rcx) # store it 1595 lea 0x10(%rcx),%rcx 1596 jl L(movdqa_epi) 1597 1598 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1599 sub $0x10,%r8 1600 lea 0x10(%rdx),%rdx 1601 #palignr $0x4,%xmm2,%xmm0 1602 .byte 0x66,0x0f,0x3a,0x0f 1603 .byte 0xc2,0x04 1604 movdqa %xmm0,(%rcx) # store it 1605 lea 0x10(%rcx),%rcx 1606 jmp L(movdqa_epi) 1607 1608 .balign 16 1609L(mov3dqa5): 1610 movdqa 0x10(%rdx),%xmm3 1611 sub $0x30,%r8 1612 movdqa 0x20(%rdx),%xmm0 1613 movdqa 0x30(%rdx),%xmm5 1614 lea 0x30(%rdx),%rdx 1615 cmp $0x30,%r8 1616 1617 movdqa %xmm3,%xmm2 1618 #palignr $0x5,%xmm1,%xmm3 1619 .byte 0x66,0x0f,0x3a,0x0f 1620 .byte 0xd9,0x05 1621 movdqa %xmm3,(%rcx) 1622 1623 movdqa %xmm0,%xmm4 1624 #palignr $0x5,%xmm2,%xmm0 1625 .byte 0x66,0x0f,0x3a,0x0f 1626 .byte 0xc2,0x05 1627 movdqa %xmm0,0x10(%rcx) 1628 1629 movdqa %xmm5,%xmm1 1630 #palignr $0x5,%xmm4,%xmm5 1631 .byte 0x66,0x0f,0x3a,0x0f 1632 .byte 0xec,0x05 1633 movdqa %xmm5,0x20(%rcx) 1634 1635 lea 0x30(%rcx),%rcx 1636 jge L(mov3dqa5) 1637 1638 cmp $0x10,%r8 1639 jl L(movdqa_epi) 1640 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1641 sub $0x10,%r8 1642 lea 0x10(%rdx),%rdx 1643 movdqa %xmm3,%xmm2 # save for use next concat 1644 #palignr $0x5,%xmm1,%xmm3 1645 .byte 0x66,0x0f,0x3a,0x0f 1646 .byte 0xd9,0x05 1647 1648 cmp $0x10,%r8 1649 movdqa %xmm3,(%rcx) # store it 1650 lea 0x10(%rcx),%rcx 1651 jl L(movdqa_epi) 1652 1653 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1654 sub $0x10,%r8 1655 lea 0x10(%rdx),%rdx 1656 #palignr $0x5,%xmm2,%xmm0 1657 .byte 0x66,0x0f,0x3a,0x0f 1658 .byte 0xc2,0x05 1659 movdqa %xmm0,(%rcx) # store it 1660 lea 0x10(%rcx),%rcx 1661 jmp L(movdqa_epi) 1662 1663 .balign 16 1664L(mov3dqa6): 1665 movdqa 0x10(%rdx),%xmm3 1666 sub $0x30,%r8 1667 movdqa 0x20(%rdx),%xmm0 1668 movdqa 0x30(%rdx),%xmm5 1669 lea 0x30(%rdx),%rdx 1670 cmp $0x30,%r8 1671 1672 movdqa %xmm3,%xmm2 1673 #palignr $0x6,%xmm1,%xmm3 1674 .byte 0x66,0x0f,0x3a,0x0f 1675 .byte 0xd9,0x06 1676 movdqa %xmm3,(%rcx) 1677 1678 movdqa %xmm0,%xmm4 1679 #palignr $0x6,%xmm2,%xmm0 1680 .byte 0x66,0x0f,0x3a,0x0f 1681 .byte 0xc2,0x06 1682 movdqa %xmm0,0x10(%rcx) 1683 1684 movdqa %xmm5,%xmm1 1685 #palignr $0x6,%xmm4,%xmm5 1686 .byte 0x66,0x0f,0x3a,0x0f 1687 .byte 0xec,0x06 1688 movdqa %xmm5,0x20(%rcx) 1689 1690 lea 0x30(%rcx),%rcx 1691 jge L(mov3dqa6) 1692 1693 cmp $0x10,%r8 1694 jl L(movdqa_epi) 1695 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1696 sub $0x10,%r8 1697 lea 0x10(%rdx),%rdx 1698 movdqa %xmm3,%xmm2 # save for use next concat 1699 #palignr $0x6,%xmm1,%xmm3 1700 .byte 0x66,0x0f,0x3a,0x0f 1701 .byte 0xd9,0x06 1702 1703 cmp $0x10,%r8 1704 movdqa %xmm3,(%rcx) # store it 1705 lea 0x10(%rcx),%rcx 1706 jl L(movdqa_epi) 1707 1708 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1709 sub $0x10,%r8 1710 lea 0x10(%rdx),%rdx 1711 #palignr $0x6,%xmm2,%xmm0 1712 .byte 0x66,0x0f,0x3a,0x0f 1713 .byte 0xc2,0x06 1714 movdqa %xmm0,(%rcx) # store it 1715 lea 0x10(%rcx),%rcx 1716 jmp L(movdqa_epi) 1717 1718 .balign 16 1719L(mov3dqa7): 1720 movdqa 0x10(%rdx),%xmm3 1721 sub $0x30,%r8 1722 movdqa 0x20(%rdx),%xmm0 1723 movdqa 0x30(%rdx),%xmm5 1724 lea 0x30(%rdx),%rdx 1725 cmp $0x30,%r8 1726 1727 movdqa %xmm3,%xmm2 1728 #palignr $0x7,%xmm1,%xmm3 1729 .byte 0x66,0x0f,0x3a,0x0f 1730 .byte 0xd9,0x07 1731 movdqa %xmm3,(%rcx) 1732 1733 movdqa %xmm0,%xmm4 1734 #palignr $0x7,%xmm2,%xmm0 1735 .byte 
0x66,0x0f,0x3a,0x0f 1736 .byte 0xc2,0x07 1737 movdqa %xmm0,0x10(%rcx) 1738 1739 movdqa %xmm5,%xmm1 1740 #palignr $0x7,%xmm4,%xmm5 1741 .byte 0x66,0x0f,0x3a,0x0f 1742 .byte 0xec,0x07 1743 movdqa %xmm5,0x20(%rcx) 1744 1745 lea 0x30(%rcx),%rcx 1746 jge L(mov3dqa7) 1747 1748 cmp $0x10,%r8 1749 jl L(movdqa_epi) 1750 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1751 sub $0x10,%r8 1752 lea 0x10(%rdx),%rdx 1753 movdqa %xmm3,%xmm2 # save for use next concat 1754 #palignr $0x7,%xmm1,%xmm3 1755 .byte 0x66,0x0f,0x3a,0x0f 1756 .byte 0xd9,0x07 1757 1758 cmp $0x10,%r8 1759 movdqa %xmm3,(%rcx) # store it 1760 lea 0x10(%rcx),%rcx 1761 jl L(movdqa_epi) 1762 1763 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1764 sub $0x10,%r8 1765 lea 0x10(%rdx),%rdx 1766 #palignr $0x7,%xmm2,%xmm0 1767 .byte 0x66,0x0f,0x3a,0x0f 1768 .byte 0xc2,0x07 1769 movdqa %xmm0,(%rcx) # store it 1770 lea 0x10(%rcx),%rcx 1771 jmp L(movdqa_epi) 1772 1773 .balign 16 1774L(mov3dqa9): 1775 movdqa 0x10(%rdx),%xmm3 1776 sub $0x30,%r8 1777 movdqa 0x20(%rdx),%xmm0 1778 movdqa 0x30(%rdx),%xmm5 1779 lea 0x30(%rdx),%rdx 1780 cmp $0x30,%r8 1781 1782 movdqa %xmm3,%xmm2 1783 #palignr $0x9,%xmm1,%xmm3 1784 .byte 0x66,0x0f,0x3a,0x0f 1785 .byte 0xd9,0x09 1786 movdqa %xmm3,(%rcx) 1787 1788 movdqa %xmm0,%xmm4 1789 #palignr $0x9,%xmm2,%xmm0 1790 .byte 0x66,0x0f,0x3a,0x0f 1791 .byte 0xc2,0x09 1792 movdqa %xmm0,0x10(%rcx) 1793 1794 movdqa %xmm5,%xmm1 1795 #palignr $0x9,%xmm4,%xmm5 1796 .byte 0x66,0x0f,0x3a,0x0f 1797 .byte 0xec,0x09 1798 movdqa %xmm5,0x20(%rcx) 1799 1800 lea 0x30(%rcx),%rcx 1801 jge L(mov3dqa9) 1802 1803 cmp $0x10,%r8 1804 jl L(movdqa_epi) 1805 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1806 sub $0x10,%r8 1807 lea 0x10(%rdx),%rdx 1808 movdqa %xmm3,%xmm2 # save for use next concat 1809 #palignr $0x9,%xmm1,%xmm3 1810 .byte 0x66,0x0f,0x3a,0x0f 1811 .byte 0xd9,0x09 1812 1813 cmp $0x10,%r8 1814 movdqa %xmm3,(%rcx) # store it 1815 lea 0x10(%rcx),%rcx 1816 jl L(movdqa_epi) 1817 1818 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1819 sub $0x10,%r8 1820 lea 0x10(%rdx),%rdx 1821 #palignr $0x9,%xmm2,%xmm0 1822 .byte 0x66,0x0f,0x3a,0x0f 1823 .byte 0xc2,0x09 1824 movdqa %xmm0,(%rcx) # store it 1825 lea 0x10(%rcx),%rcx 1826 jmp L(movdqa_epi) 1827 1828 .balign 16 1829L(mov3dqa10): 1830 movdqa 0x10(%rdx),%xmm3 1831 sub $0x30,%r8 1832 movdqa 0x20(%rdx),%xmm0 1833 movdqa 0x30(%rdx),%xmm5 1834 lea 0x30(%rdx),%rdx 1835 cmp $0x30,%r8 1836 1837 movdqa %xmm3,%xmm2 1838 #palignr $0xa,%xmm1,%xmm3 1839 .byte 0x66,0x0f,0x3a,0x0f 1840 .byte 0xd9,0x0a 1841 movdqa %xmm3,(%rcx) 1842 1843 movdqa %xmm0,%xmm4 1844 #palignr $0xa,%xmm2,%xmm0 1845 .byte 0x66,0x0f,0x3a,0x0f 1846 .byte 0xc2,0x0a 1847 movdqa %xmm0,0x10(%rcx) 1848 1849 movdqa %xmm5,%xmm1 1850 #palignr $0xa,%xmm4,%xmm5 1851 .byte 0x66,0x0f,0x3a,0x0f 1852 .byte 0xec,0x0a 1853 movdqa %xmm5,0x20(%rcx) 1854 1855 lea 0x30(%rcx),%rcx 1856 jge L(mov3dqa10) 1857 1858 cmp $0x10,%r8 1859 jl L(movdqa_epi) 1860 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1861 sub $0x10,%r8 1862 lea 0x10(%rdx),%rdx 1863 movdqa %xmm3,%xmm2 # save for use next concat 1864 #palignr $0xa,%xmm1,%xmm3 1865 .byte 0x66,0x0f,0x3a,0x0f 1866 .byte 0xd9,0x0a 1867 1868 cmp $0x10,%r8 1869 movdqa %xmm3,(%rcx) # store it 1870 lea 0x10(%rcx),%rcx 1871 jl L(movdqa_epi) 1872 1873 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1874 sub $0x10,%r8 1875 lea 0x10(%rdx),%rdx 1876 #palignr $0xa,%xmm2,%xmm0 1877 .byte 0x66,0x0f,0x3a,0x0f 1878 .byte 0xc2,0x0a 1879 movdqa %xmm0,(%rcx) # store it 1880 lea 
0x10(%rcx),%rcx 1881 jmp L(movdqa_epi) 1882 1883 .balign 16 1884L(mov3dqa11): 1885 movdqa 0x10(%rdx),%xmm3 1886 sub $0x30,%r8 1887 movdqa 0x20(%rdx),%xmm0 1888 movdqa 0x30(%rdx),%xmm5 1889 lea 0x30(%rdx),%rdx 1890 cmp $0x30,%r8 1891 1892 movdqa %xmm3,%xmm2 1893 #palignr $0xb,%xmm1,%xmm3 1894 .byte 0x66,0x0f,0x3a,0x0f 1895 .byte 0xd9,0x0b 1896 movdqa %xmm3,(%rcx) 1897 1898 movdqa %xmm0,%xmm4 1899 #palignr $0xb,%xmm2,%xmm0 1900 .byte 0x66,0x0f,0x3a,0x0f 1901 .byte 0xc2,0x0b 1902 movdqa %xmm0,0x10(%rcx) 1903 1904 movdqa %xmm5,%xmm1 1905 #palignr $0xb,%xmm4,%xmm5 1906 .byte 0x66,0x0f,0x3a,0x0f 1907 .byte 0xec,0x0b 1908 movdqa %xmm5,0x20(%rcx) 1909 1910 lea 0x30(%rcx),%rcx 1911 jge L(mov3dqa11) 1912 1913 cmp $0x10,%r8 1914 jl L(movdqa_epi) 1915 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1916 sub $0x10,%r8 1917 lea 0x10(%rdx),%rdx 1918 movdqa %xmm3,%xmm2 # save for use next concat 1919 #palignr $0xb,%xmm1,%xmm3 1920 .byte 0x66,0x0f,0x3a,0x0f 1921 .byte 0xd9,0x0b 1922 1923 cmp $0x10,%r8 1924 movdqa %xmm3,(%rcx) # store it 1925 lea 0x10(%rcx),%rcx 1926 jl L(movdqa_epi) 1927 1928 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1929 sub $0x10,%r8 1930 lea 0x10(%rdx),%rdx 1931 #palignr $0xb,%xmm2,%xmm0 1932 .byte 0x66,0x0f,0x3a,0x0f 1933 .byte 0xc2,0x0b 1934 movdqa %xmm0,(%rcx) # store it 1935 lea 0x10(%rcx),%rcx 1936 jmp L(movdqa_epi) 1937 1938 .balign 16 1939L(mov3dqa12): 1940 movdqa 0x10(%rdx),%xmm3 1941 sub $0x30,%r8 1942 movdqa 0x20(%rdx),%xmm0 1943 movdqa 0x30(%rdx),%xmm5 1944 lea 0x30(%rdx),%rdx 1945 cmp $0x30,%r8 1946 1947 movdqa %xmm3,%xmm2 1948 #palignr $0xc,%xmm1,%xmm3 1949 .byte 0x66,0x0f,0x3a,0x0f 1950 .byte 0xd9,0x0c 1951 movdqa %xmm3,(%rcx) 1952 1953 movdqa %xmm0,%xmm4 1954 #palignr $0xc,%xmm2,%xmm0 1955 .byte 0x66,0x0f,0x3a,0x0f 1956 .byte 0xc2,0x0c 1957 movdqa %xmm0,0x10(%rcx) 1958 1959 movdqa %xmm5,%xmm1 1960 #palignr $0xc,%xmm4,%xmm5 1961 .byte 0x66,0x0f,0x3a,0x0f 1962 .byte 0xec,0x0c 1963 movdqa %xmm5,0x20(%rcx) 1964 1965 lea 0x30(%rcx),%rcx 1966 jge L(mov3dqa12) 1967 1968 cmp $0x10,%r8 1969 jl L(movdqa_epi) 1970 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1971 sub $0x10,%r8 1972 lea 0x10(%rdx),%rdx 1973 movdqa %xmm3,%xmm2 # save for use next concat 1974 #palignr $0xc,%xmm1,%xmm3 1975 .byte 0x66,0x0f,0x3a,0x0f 1976 .byte 0xd9,0x0c 1977 1978 cmp $0x10,%r8 1979 movdqa %xmm3,(%rcx) # store it 1980 lea 0x10(%rcx),%rcx 1981 jl L(movdqa_epi) 1982 1983 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1984 sub $0x10,%r8 1985 lea 0x10(%rdx),%rdx 1986 #palignr $0xc,%xmm2,%xmm0 1987 .byte 0x66,0x0f,0x3a,0x0f 1988 .byte 0xc2,0x0c 1989 movdqa %xmm0,(%rcx) # store it 1990 lea 0x10(%rcx),%rcx 1991 jmp L(movdqa_epi) 1992 1993 .balign 16 1994L(mov3dqa13): 1995 movdqa 0x10(%rdx),%xmm3 1996 sub $0x30,%r8 1997 movdqa 0x20(%rdx),%xmm0 1998 movdqa 0x30(%rdx),%xmm5 1999 lea 0x30(%rdx),%rdx 2000 cmp $0x30,%r8 2001 2002 movdqa %xmm3,%xmm2 2003 #palignr $0xd,%xmm1,%xmm3 2004 .byte 0x66,0x0f,0x3a,0x0f 2005 .byte 0xd9,0x0d 2006 movdqa %xmm3,(%rcx) 2007 2008 movdqa %xmm0,%xmm4 2009 #palignr $0xd,%xmm2,%xmm0 2010 .byte 0x66,0x0f,0x3a,0x0f 2011 .byte 0xc2,0x0d 2012 movdqa %xmm0,0x10(%rcx) 2013 2014 movdqa %xmm5,%xmm1 2015 #palignr $0xd,%xmm4,%xmm5 2016 .byte 0x66,0x0f,0x3a,0x0f 2017 .byte 0xec,0x0d 2018 movdqa %xmm5,0x20(%rcx) 2019 2020 lea 0x30(%rcx),%rcx 2021 jge L(mov3dqa13) 2022 2023 cmp $0x10,%r8 2024 jl L(movdqa_epi) 2025 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2026 sub $0x10,%r8 2027 lea 0x10(%rdx),%rdx 2028 movdqa %xmm3,%xmm2 # save for use next concat 
2029 #palignr $0xd,%xmm1,%xmm3 2030 .byte 0x66,0x0f,0x3a,0x0f 2031 .byte 0xd9,0x0d 2032 2033 cmp $0x10,%r8 2034 movdqa %xmm3,(%rcx) # store it 2035 lea 0x10(%rcx),%rcx 2036 jl L(movdqa_epi) 2037 2038 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2039 sub $0x10,%r8 2040 lea 0x10(%rdx),%rdx 2041 #palignr $0xd,%xmm2,%xmm0 2042 .byte 0x66,0x0f,0x3a,0x0f 2043 .byte 0xc2,0x0d 2044 movdqa %xmm0,(%rcx) # store it 2045 lea 0x10(%rcx),%rcx 2046 jmp L(movdqa_epi) 2047 2048 .balign 16 2049L(mov3dqa14): 2050 movdqa 0x10(%rdx),%xmm3 2051 sub $0x30,%r8 2052 movdqa 0x20(%rdx),%xmm0 2053 movdqa 0x30(%rdx),%xmm5 2054 lea 0x30(%rdx),%rdx 2055 cmp $0x30,%r8 2056 2057 movdqa %xmm3,%xmm2 2058 #palignr $0xe,%xmm1,%xmm3 2059 .byte 0x66,0x0f,0x3a,0x0f 2060 .byte 0xd9,0x0e 2061 movdqa %xmm3,(%rcx) 2062 2063 movdqa %xmm0,%xmm4 2064 #palignr $0xe,%xmm2,%xmm0 2065 .byte 0x66,0x0f,0x3a,0x0f 2066 .byte 0xc2,0x0e 2067 movdqa %xmm0,0x10(%rcx) 2068 2069 movdqa %xmm5,%xmm1 2070 #palignr $0xe,%xmm4,%xmm5 2071 .byte 0x66,0x0f,0x3a,0x0f 2072 .byte 0xec,0x0e 2073 movdqa %xmm5,0x20(%rcx) 2074 2075 lea 0x30(%rcx),%rcx 2076 jge L(mov3dqa14) 2077 2078 cmp $0x10,%r8 2079 jl L(movdqa_epi) 2080 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2081 sub $0x10,%r8 2082 lea 0x10(%rdx),%rdx 2083 movdqa %xmm3,%xmm2 # save for use next concat 2084 #palignr $0xe,%xmm1,%xmm3 2085 .byte 0x66,0x0f,0x3a,0x0f 2086 .byte 0xd9,0x0e 2087 2088 cmp $0x10,%r8 2089 movdqa %xmm3,(%rcx) # store it 2090 lea 0x10(%rcx),%rcx 2091 jl L(movdqa_epi) 2092 2093 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2094 sub $0x10,%r8 2095 lea 0x10(%rdx),%rdx 2096 #palignr $0xe,%xmm2,%xmm0 2097 .byte 0x66,0x0f,0x3a,0x0f 2098 .byte 0xc2,0x0e 2099 movdqa %xmm0,(%rcx) # store it 2100 lea 0x10(%rcx),%rcx 2101 jmp L(movdqa_epi) 2102 2103 .balign 16 2104L(mov3dqa15): 2105 movdqa 0x10(%rdx),%xmm3 2106 sub $0x30,%r8 2107 movdqa 0x20(%rdx),%xmm0 2108 movdqa 0x30(%rdx),%xmm5 2109 lea 0x30(%rdx),%rdx 2110 cmp $0x30,%r8 2111 2112 movdqa %xmm3,%xmm2 2113 #palignr $0xf,%xmm1,%xmm3 2114 .byte 0x66,0x0f,0x3a,0x0f 2115 .byte 0xd9,0x0f 2116 movdqa %xmm3,(%rcx) 2117 2118 movdqa %xmm0,%xmm4 2119 #palignr $0xf,%xmm2,%xmm0 2120 .byte 0x66,0x0f,0x3a,0x0f 2121 .byte 0xc2,0x0f 2122 movdqa %xmm0,0x10(%rcx) 2123 2124 movdqa %xmm5,%xmm1 2125 #palignr $0xf,%xmm4,%xmm5 2126 .byte 0x66,0x0f,0x3a,0x0f 2127 .byte 0xec,0x0f 2128 movdqa %xmm5,0x20(%rcx) 2129 2130 lea 0x30(%rcx),%rcx 2131 jge L(mov3dqa15) 2132 2133 cmp $0x10,%r8 2134 jl L(movdqa_epi) 2135 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2136 sub $0x10,%r8 2137 lea 0x10(%rdx),%rdx 2138 movdqa %xmm3,%xmm2 # save for use next concat 2139 #palignr $0xf,%xmm1,%xmm3 2140 .byte 0x66,0x0f,0x3a,0x0f 2141 .byte 0xd9,0x0f 2142 2143 cmp $0x10,%r8 2144 movdqa %xmm3,(%rcx) # store it 2145 lea 0x10(%rcx),%rcx 2146 jl L(movdqa_epi) 2147 2148 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2149 sub $0x10,%r8 2150 lea 0x10(%rdx),%rdx 2151 #palignr $0xf,%xmm2,%xmm0 2152 .byte 0x66,0x0f,0x3a,0x0f 2153 .byte 0xc2,0x0f 2154 movdqa %xmm0,(%rcx) # store it 2155 lea 0x10(%rcx),%rcx 2156 jmp L(movdqa_epi) 2157 2158 .balign 16 2159L(sse2_nt_move): 2160 lea 0x40(%rcx),%rcx 2161 lea 0x40(%rdx),%rdx 2162 lea -0x40(%r8),%r8 2163 2164 /* 2165 * doesn't matter if source is aligned for stuff out of cache. 2166 * the mis-aligned penalty is masked by the slowness of main memory. 
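 *
 * Each iteration of L(sse2_nt_move) below copies 64 bytes; roughly, it
 * behaves like this hedged C intrinsics sketch (illustrative only; the
 * function name is made up and does not appear in libc):
 *
 *	#include <emmintrin.h>
 *
 *	void
 *	nt_copy_64(char *dst, const char *src)
 *	{
 *		_mm_prefetch(src + 0x180, _MM_HINT_NTA);
 *		for (int i = 0; i < 4; i++) {
 *			__m128i v =
 *			    _mm_loadu_si128((const __m128i *)(src + 16 * i));
 *			_mm_stream_si128((__m128i *)(dst + 16 * i), v);
 *		}
 *	}
 *
 * The movntdq stores are weakly ordered and bypass the caches, which is why
 * an sfence is executed after the loop before the trailing bytes are copied.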
2167 */ 2168 prefetchnta 0x180(%rdx) 2169 movdqu -0x40(%rdx),%xmm0 2170 movdqu -0x30(%rdx),%xmm1 2171 2172 cmp $0x40,%r8 2173 movntdq %xmm0,-0x40(%rcx) 2174 movntdq %xmm1,-0x30(%rcx) 2175 2176 movdqu -0x20(%rdx),%xmm2 2177 movdqu -0x10(%rdx),%xmm3 2178 2179 movntdq %xmm2,-0x20(%rcx) 2180 movntdq %xmm3,-0x10(%rcx) 2181 2182 jge L(sse2_nt_move) 2183 2184 lea L(Fix16EndTable)(%rip),%r10 2185 mov %r8,%r9 2186 and $0xFFFFFFFFFFFFFFF0,%r9 2187 add %r9,%rcx 2188 add %r9,%rdx 2189 sub %r9,%r8 2190 shr $0x4,%r9 2191 sfence 2192 2193 movslq (%r10,%r9,4),%r11 2194 lea (%r11,%r10,1),%r10 2195 jmpq *%r10 2196 2197 .balign 16 2198L(Fix16EndTable): 2199 .int L(fix16_0)-L(Fix16EndTable) 2200 .int L(fix16_1)-L(Fix16EndTable) 2201 .int L(fix16_2)-L(Fix16EndTable) 2202 .int L(fix16_3)-L(Fix16EndTable) 2203 2204 .balign 16 2205L(fix16_3): 2206 movdqu -0x30(%rdx),%xmm1 2207 movdqa %xmm1,-0x30(%rcx) 2208L(fix16_2): 2209 movdqu -0x20(%rdx),%xmm2 2210 movdqa %xmm2,-0x20(%rcx) 2211L(fix16_1): 2212 movdqu -0x10(%rdx),%xmm3 2213 movdqa %xmm3,-0x10(%rcx) 2214L(fix16_0): 2215 lea L(fwdPxQx)(%rip),%r10 2216 add %r8,%rdx 2217 add %r8,%rcx 2218 2219 movslq (%r10,%r8,4),%r9 2220 lea (%r9,%r10,1),%r10 2221 jmpq *%r10 2222 2223 .balign 16 2224L(pre_both_aligned): 2225 cmp $0x80,%r8 2226 jl L(fix_16b) 2227 2228 .balign 16 2229L(both_aligned): 2230 2231 /* 2232 * this 'paired' load/load/store/store seems to do best. 2233 */ 2234 movdqa (%rdx),%xmm0 2235 movdqa 0x10(%rdx),%xmm1 2236 2237 movdqa %xmm0,(%rcx) 2238 movdqa %xmm1,0x10(%rcx) 2239 lea -0x80(%r8),%r8 2240 2241 movdqa 0x20(%rdx),%xmm2 2242 movdqa 0x30(%rdx),%xmm3 2243 2244 movdqa %xmm2,0x20(%rcx) 2245 movdqa %xmm3,0x30(%rcx) 2246 2247 movdqa 0x40(%rdx),%xmm0 2248 movdqa 0x50(%rdx),%xmm1 2249 cmp $0x80,%r8 2250 2251 movdqa %xmm0,0x40(%rcx) 2252 movdqa %xmm1,0x50(%rcx) 2253 2254 movdqa 0x60(%rdx),%xmm2 2255 movdqa 0x70(%rdx),%xmm3 2256 lea 0x80(%rdx),%rdx 2257 movdqa %xmm2,0x60(%rcx) 2258 movdqa %xmm3,0x70(%rcx) 2259 lea 0x80(%rcx),%rcx 2260 jge L(both_aligned) 2261 2262L(fix_16b): 2263 add %r8,%rcx 2264 lea L(fwdPxQx)(%rip),%r10 2265 add %r8,%rdx 2266 2267 movslq (%r10,%r8,4),%r9 2268 lea (%r9,%r10,1),%r10 2269 jmpq *%r10 2270 2271 .balign 16 2272L(Loop8byte_pre): 2273 # Use 8-byte moves 2274 mov .largest_level_cache_size(%rip),%r9d 2275 shr %r9 # take half of it 2276 cmp %r9,%r8 2277 jg L(byte8_nt_top) 2278 # Find out whether to use rep movsq 2279 cmp $4096,%r8 2280 jle L(byte8_top) 2281 mov .amd64cache1half(%rip),%r9d # half of l1 cache 2282 cmp %r9,%r8 2283 jle L(use_rep) 2284 2285 .balign 16 2286L(byte8_top): 2287 mov (%rdx),%r9 2288 mov 0x8(%rdx),%r10 2289 lea -0x40(%r8),%r8 2290 mov %r9,(%rcx) 2291 mov %r10,0x8(%rcx) 2292 mov 0x10(%rdx),%r11 2293 mov 0x18(%rdx),%r9 2294 mov %r11,0x10(%rcx) 2295 mov %r9,0x18(%rcx) 2296 2297 cmp $0x40,%r8 2298 mov 0x20(%rdx),%r10 2299 mov 0x28(%rdx),%r11 2300 mov %r10,0x20(%rcx) 2301 mov %r11,0x28(%rcx) 2302 mov 0x30(%rdx),%r9 2303 mov 0x38(%rdx),%r10 2304 lea 0x40(%rdx),%rdx 2305 mov %r9,0x30(%rcx) 2306 mov %r10,0x38(%rcx) 2307 lea 0x40(%rcx),%rcx 2308 jg L(byte8_top) 2309 2310L(byte8_end): 2311 lea L(fwdPxQx)(%rip),%r10 2312 lea (%rdx,%r8,1),%rdx 2313 lea (%rcx,%r8,1),%rcx 2314 2315 movslq (%r10,%r8,4),%r9 2316 lea (%r9,%r10,1),%r10 2317 jmpq *%r10 2318 2319 .balign 16 2320L(use_rep): 2321 mov %rdx,%rsi # %rsi = source 2322 mov %rcx,%rdi # %rdi = destination 2323 mov %r8,%rcx # %rcx = count 2324 shrq $3,%rcx # 8-byte word count 2325 rep 2326 movsq 2327 mov %rsi,%rdx # source 2328 mov %rdi,%rcx # destination 2329 andq $7,%r8 # 
remainder 2330 jnz L(byte8_end) 2331 ret 2332 2333 .balign 16 2334L(byte8_nt_top): 2335 sub $0x40,%r8 2336 prefetchnta 0x180(%rdx) 2337 mov (%rdx),%r9 2338 movnti %r9,(%rcx) 2339 mov 0x8(%rdx),%r10 2340 movnti %r10,0x8(%rcx) 2341 mov 0x10(%rdx),%r11 2342 movnti %r11,0x10(%rcx) 2343 mov 0x18(%rdx),%r9 2344 movnti %r9,0x18(%rcx) 2345 mov 0x20(%rdx),%r10 2346 movnti %r10,0x20(%rcx) 2347 mov 0x28(%rdx),%r11 2348 movnti %r11,0x28(%rcx) 2349 mov 0x30(%rdx),%r9 2350 movnti %r9,0x30(%rcx) 2351 mov 0x38(%rdx),%r10 2352 movnti %r10,0x38(%rcx) 2353 2354 lea 0x40(%rdx),%rdx 2355 lea 0x40(%rcx),%rcx 2356 cmp $0x40,%r8 2357 jge L(byte8_nt_top) 2358 sfence 2359 jmp L(byte8_end) 2360 2361 SET_SIZE(memcpy) 2362 2363 .balign 16 2364L(CopyBackwards): 2365 mov %rdx,%r8 2366 mov %rdi,%rcx 2367 mov %rsi,%rdx 2368 mov %rdi,%rax # return value 2369 2370 # ck alignment of last byte 2371 lea (%rcx,%r8,1),%rcx 2372 test $0x7,%rcx 2373 lea (%rdx,%r8,1),%rdx 2374 jne L(bk_align) 2375 2376L(bk_qw_aligned): 2377 lea L(bkPxQx)(%rip),%r10 2378 2379 cmp $0x90,%r8 # 144 2380 jg L(bk_ck_sse2_alignment) 2381 2382 sub %r8,%rcx 2383 sub %r8,%rdx 2384 2385 movslq (%r10,%r8,4),%r9 2386 lea (%r9,%r10,1),%r10 2387 jmpq *%r10 2388 2389 .balign 16 2390L(bk_align): 2391 # only align if len > 8 2392 cmp $8,%r8 2393 jle L(bk_qw_aligned) 2394 test $0x1,%rcx 2395 je L(bk_tst2) 2396 dec %rcx 2397 dec %rdx 2398 dec %r8 2399 mov (%rdx),%r9b 2400 mov %r9b,(%rcx) 2401 2402L(bk_tst2): 2403 test $0x2,%rcx 2404 je L(bk_tst3) 2405 2406L(bk_got2): 2407 sub $0x2,%rcx 2408 sub $0x2,%rdx 2409 sub $0x2,%r8 2410 movzwq (%rdx),%r9 2411 mov %r9w,(%rcx) 2412 2413L(bk_tst3): 2414 test $0x4,%rcx 2415 je L(bk_qw_aligned) 2416 2417L(bk_got3): 2418 sub $0x4,%rcx 2419 sub $0x4,%rdx 2420 sub $0x4,%r8 2421 mov (%rdx),%r9d 2422 mov %r9d,(%rcx) 2423 jmp L(bk_qw_aligned) 2424 2425 .balign 16 2426L(bk_ck_sse2_alignment): 2427 cmpl $NO_SSE,.memops_method(%rip) 2428 je L(bk_use_rep) 2429 # check alignment of last byte 2430 test $0xf,%rcx 2431 jz L(bk_sse2_cpy) 2432 2433L(bk_sse2_align): 2434 # only here if already aligned on at least a qword bndry 2435 sub $0x8,%rcx 2436 sub $0x8,%rdx 2437 sub $0x8,%r8 2438 mov (%rdx),%r9 2439 mov %r9,(%rcx) 2440 #jmp L(bk_sse2_cpy) 2441 2442 .balign 16 2443L(bk_sse2_cpy): 2444 sub $0x80,%rcx # 128 2445 sub $0x80,%rdx 2446 movdqu 0x70(%rdx),%xmm3 2447 movdqu 0x60(%rdx),%xmm2 2448 movdqa %xmm3,0x70(%rcx) 2449 movdqa %xmm2,0x60(%rcx) 2450 sub $0x80,%r8 2451 movdqu 0x50(%rdx),%xmm1 2452 movdqu 0x40(%rdx),%xmm0 2453 movdqa %xmm1,0x50(%rcx) 2454 movdqa %xmm0,0x40(%rcx) 2455 2456 cmp $0x80,%r8 2457 movdqu 0x30(%rdx),%xmm3 2458 movdqu 0x20(%rdx),%xmm2 2459 movdqa %xmm3,0x30(%rcx) 2460 movdqa %xmm2,0x20(%rcx) 2461 movdqu 0x10(%rdx),%xmm1 2462 movdqu (%rdx),%xmm0 2463 movdqa %xmm1,0x10(%rcx) 2464 movdqa %xmm0,(%rcx) 2465 jge L(bk_sse2_cpy) 2466 2467L(bk_sse2_cpy_end): 2468 lea L(bkPxQx)(%rip),%r10 2469 sub %r8,%rdx 2470 sub %r8,%rcx 2471 movslq (%r10,%r8,4),%r9 2472 lea (%r9,%r10,1),%r10 2473 jmpq *%r10 2474 2475 .balign 16 2476L(bk_use_rep): 2477 xchg %rcx,%r9 2478 mov %rdx,%rsi # source 2479 mov %r9,%rdi # destination 2480 mov %r8,%rcx # count 2481 sub $8,%rsi 2482 sub $8,%rdi 2483 shr $3,%rcx 2484 std # reverse direction 2485 rep 2486 movsq 2487 cld # reset direction flag 2488 2489 xchg %rcx,%r9 2490 lea L(bkPxQx)(%rip),%r10 2491 sub %r8,%rdx 2492 sub %r8,%rcx 2493 andq $7,%r8 # remainder 2494 jz 2f 2495 movslq (%r10,%r8,4),%r9 2496 lea (%r9,%r10,1),%r10 2497 jmpq *%r10 24982: 2499 ret 2500 2501 .balign 16 2502L(bkP0QI): 2503 mov 
0x88(%rdx),%r10 2504 mov %r10,0x88(%rcx) 2505L(bkP0QH): 2506 mov 0x80(%rdx),%r10 2507 mov %r10,0x80(%rcx) 2508L(bkP0QG): 2509 mov 0x78(%rdx),%r9 2510 mov %r9,0x78(%rcx) 2511L(bkP0QF): 2512 mov 0x70(%rdx),%r11 2513 mov %r11,0x70(%rcx) 2514L(bkP0QE): 2515 mov 0x68(%rdx),%r10 2516 mov %r10,0x68(%rcx) 2517L(bkP0QD): 2518 mov 0x60(%rdx),%r9 2519 mov %r9,0x60(%rcx) 2520L(bkP0QC): 2521 mov 0x58(%rdx),%r11 2522 mov %r11,0x58(%rcx) 2523L(bkP0QB): 2524 mov 0x50(%rdx),%r10 2525 mov %r10,0x50(%rcx) 2526L(bkP0QA): 2527 mov 0x48(%rdx),%r9 2528 mov %r9,0x48(%rcx) 2529L(bkP0Q9): 2530 mov 0x40(%rdx),%r11 2531 mov %r11,0x40(%rcx) 2532L(bkP0Q8): 2533 mov 0x38(%rdx),%r10 2534 mov %r10,0x38(%rcx) 2535L(bkP0Q7): 2536 mov 0x30(%rdx),%r9 2537 mov %r9,0x30(%rcx) 2538L(bkP0Q6): 2539 mov 0x28(%rdx),%r11 2540 mov %r11,0x28(%rcx) 2541L(bkP0Q5): 2542 mov 0x20(%rdx),%r10 2543 mov %r10,0x20(%rcx) 2544L(bkP0Q4): 2545 mov 0x18(%rdx),%r9 2546 mov %r9,0x18(%rcx) 2547L(bkP0Q3): 2548 mov 0x10(%rdx),%r11 2549 mov %r11,0x10(%rcx) 2550L(bkP0Q2): 2551 mov 0x8(%rdx),%r10 2552 mov %r10,0x8(%rcx) 2553L(bkP0Q1): 2554 mov (%rdx),%r9 2555 mov %r9,(%rcx) 2556L(bkP0Q0): 2557 ret 2558 2559 .balign 16 2560L(bkP1QI): 2561 mov 0x89(%rdx),%r10 2562 mov %r10,0x89(%rcx) 2563L(bkP1QH): 2564 mov 0x81(%rdx),%r11 2565 mov %r11,0x81(%rcx) 2566L(bkP1QG): 2567 mov 0x79(%rdx),%r10 2568 mov %r10,0x79(%rcx) 2569L(bkP1QF): 2570 mov 0x71(%rdx),%r9 2571 mov %r9,0x71(%rcx) 2572L(bkP1QE): 2573 mov 0x69(%rdx),%r11 2574 mov %r11,0x69(%rcx) 2575L(bkP1QD): 2576 mov 0x61(%rdx),%r10 2577 mov %r10,0x61(%rcx) 2578L(bkP1QC): 2579 mov 0x59(%rdx),%r9 2580 mov %r9,0x59(%rcx) 2581L(bkP1QB): 2582 mov 0x51(%rdx),%r11 2583 mov %r11,0x51(%rcx) 2584L(bkP1QA): 2585 mov 0x49(%rdx),%r10 2586 mov %r10,0x49(%rcx) 2587L(bkP1Q9): 2588 mov 0x41(%rdx),%r9 2589 mov %r9,0x41(%rcx) 2590L(bkP1Q8): 2591 mov 0x39(%rdx),%r11 2592 mov %r11,0x39(%rcx) 2593L(bkP1Q7): 2594 mov 0x31(%rdx),%r10 2595 mov %r10,0x31(%rcx) 2596L(bkP1Q6): 2597 mov 0x29(%rdx),%r9 2598 mov %r9,0x29(%rcx) 2599L(bkP1Q5): 2600 mov 0x21(%rdx),%r11 2601 mov %r11,0x21(%rcx) 2602L(bkP1Q4): 2603 mov 0x19(%rdx),%r10 2604 mov %r10,0x19(%rcx) 2605L(bkP1Q3): 2606 mov 0x11(%rdx),%r9 2607 mov %r9,0x11(%rcx) 2608L(bkP1Q2): 2609 mov 0x9(%rdx),%r11 2610 mov %r11,0x9(%rcx) 2611L(bkP1Q1): 2612 mov 0x1(%rdx),%r10 2613 mov %r10,0x1(%rcx) 2614L(bkP1Q0): 2615 mov (%rdx),%r9b 2616 mov %r9b,(%rcx) 2617 ret 2618 2619 .balign 16 2620L(bkP2QI): 2621 mov 0x8a(%rdx),%r10 2622 mov %r10,0x8a(%rcx) 2623L(bkP2QH): 2624 mov 0x82(%rdx),%r11 2625 mov %r11,0x82(%rcx) 2626L(bkP2QG): 2627 mov 0x7a(%rdx),%r10 2628 mov %r10,0x7a(%rcx) 2629L(bkP2QF): 2630 mov 0x72(%rdx),%r9 2631 mov %r9,0x72(%rcx) 2632L(bkP2QE): 2633 mov 0x6a(%rdx),%r11 2634 mov %r11,0x6a(%rcx) 2635L(bkP2QD): 2636 mov 0x62(%rdx),%r10 2637 mov %r10,0x62(%rcx) 2638L(bkP2QC): 2639 mov 0x5a(%rdx),%r9 2640 mov %r9,0x5a(%rcx) 2641L(bkP2QB): 2642 mov 0x52(%rdx),%r11 2643 mov %r11,0x52(%rcx) 2644L(bkP2QA): 2645 mov 0x4a(%rdx),%r10 2646 mov %r10,0x4a(%rcx) 2647L(bkP2Q9): 2648 mov 0x42(%rdx),%r9 2649 mov %r9,0x42(%rcx) 2650L(bkP2Q8): 2651 mov 0x3a(%rdx),%r11 2652 mov %r11,0x3a(%rcx) 2653L(bkP2Q7): 2654 mov 0x32(%rdx),%r10 2655 mov %r10,0x32(%rcx) 2656L(bkP2Q6): 2657 mov 0x2a(%rdx),%r9 2658 mov %r9,0x2a(%rcx) 2659L(bkP2Q5): 2660 mov 0x22(%rdx),%r11 2661 mov %r11,0x22(%rcx) 2662L(bkP2Q4): 2663 mov 0x1a(%rdx),%r10 2664 mov %r10,0x1a(%rcx) 2665L(bkP2Q3): 2666 mov 0x12(%rdx),%r9 2667 mov %r9,0x12(%rcx) 2668L(bkP2Q2): 2669 mov 0xa(%rdx),%r11 2670 mov %r11,0xa(%rcx) 2671L(bkP2Q1): 2672 mov 0x2(%rdx),%r10 2673 mov 
	mov	%r10,0x2(%rcx)
L(bkP2Q0):
	mov	(%rdx),%r9w
	mov	%r9w,(%rcx)
	ret

	.balign 16
L(bkP3QI):
	mov	0x8b(%rdx),%r10
	mov	%r10,0x8b(%rcx)
L(bkP3QH):
	mov	0x83(%rdx),%r11
	mov	%r11,0x83(%rcx)
L(bkP3QG):
	mov	0x7b(%rdx),%r10
	mov	%r10,0x7b(%rcx)
L(bkP3QF):
	mov	0x73(%rdx),%r9
	mov	%r9,0x73(%rcx)
L(bkP3QE):
	mov	0x6b(%rdx),%r11
	mov	%r11,0x6b(%rcx)
L(bkP3QD):
	mov	0x63(%rdx),%r10
	mov	%r10,0x63(%rcx)
L(bkP3QC):
	mov	0x5b(%rdx),%r9
	mov	%r9,0x5b(%rcx)
L(bkP3QB):
	mov	0x53(%rdx),%r11
	mov	%r11,0x53(%rcx)
L(bkP3QA):
	mov	0x4b(%rdx),%r10
	mov	%r10,0x4b(%rcx)
L(bkP3Q9):
	mov	0x43(%rdx),%r9
	mov	%r9,0x43(%rcx)
L(bkP3Q8):
	mov	0x3b(%rdx),%r11
	mov	%r11,0x3b(%rcx)
L(bkP3Q7):
	mov	0x33(%rdx),%r10
	mov	%r10,0x33(%rcx)
L(bkP3Q6):
	mov	0x2b(%rdx),%r9
	mov	%r9,0x2b(%rcx)
L(bkP3Q5):
	mov	0x23(%rdx),%r11
	mov	%r11,0x23(%rcx)
L(bkP3Q4):
	mov	0x1b(%rdx),%r10
	mov	%r10,0x1b(%rcx)
L(bkP3Q3):
	mov	0x13(%rdx),%r9
	mov	%r9,0x13(%rcx)
L(bkP3Q2):
	mov	0xb(%rdx),%r11
	mov	%r11,0xb(%rcx)
L(bkP3Q1):
	mov	0x3(%rdx),%r10
	mov	%r10,0x3(%rcx)
L(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9w
	mov	%r9w,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret

	.balign 16
L(bkP4QI):
	mov	0x8c(%rdx),%r10
	mov	%r10,0x8c(%rcx)
L(bkP4QH):
	mov	0x84(%rdx),%r11
	mov	%r11,0x84(%rcx)
L(bkP4QG):
	mov	0x7c(%rdx),%r10
	mov	%r10,0x7c(%rcx)
L(bkP4QF):
	mov	0x74(%rdx),%r9
	mov	%r9,0x74(%rcx)
L(bkP4QE):
	mov	0x6c(%rdx),%r11
	mov	%r11,0x6c(%rcx)
L(bkP4QD):
	mov	0x64(%rdx),%r10
	mov	%r10,0x64(%rcx)
L(bkP4QC):
	mov	0x5c(%rdx),%r9
	mov	%r9,0x5c(%rcx)
L(bkP4QB):
	mov	0x54(%rdx),%r11
	mov	%r11,0x54(%rcx)
L(bkP4QA):
	mov	0x4c(%rdx),%r10
	mov	%r10,0x4c(%rcx)
L(bkP4Q9):
	mov	0x44(%rdx),%r9
	mov	%r9,0x44(%rcx)
L(bkP4Q8):
	mov	0x3c(%rdx),%r11
	mov	%r11,0x3c(%rcx)
L(bkP4Q7):
	mov	0x34(%rdx),%r10
	mov	%r10,0x34(%rcx)
L(bkP4Q6):
	mov	0x2c(%rdx),%r9
	mov	%r9,0x2c(%rcx)
L(bkP4Q5):
	mov	0x24(%rdx),%r11
	mov	%r11,0x24(%rcx)
L(bkP4Q4):
	mov	0x1c(%rdx),%r10
	mov	%r10,0x1c(%rcx)
L(bkP4Q3):
	mov	0x14(%rdx),%r9
	mov	%r9,0x14(%rcx)
L(bkP4Q2):
	mov	0xc(%rdx),%r11
	mov	%r11,0xc(%rcx)
L(bkP4Q1):
	mov	0x4(%rdx),%r10
	mov	%r10,0x4(%rcx)
L(bkP4Q0):
	mov	(%rdx),%r9d
	mov	%r9d,(%rcx)
	ret

	.balign 16
L(bkP5QI):
	mov	0x8d(%rdx),%r10
	mov	%r10,0x8d(%rcx)
L(bkP5QH):
	mov	0x85(%rdx),%r9
	mov	%r9,0x85(%rcx)
L(bkP5QG):
	mov	0x7d(%rdx),%r11
	mov	%r11,0x7d(%rcx)
L(bkP5QF):
	mov	0x75(%rdx),%r10
	mov	%r10,0x75(%rcx)
L(bkP5QE):
	mov	0x6d(%rdx),%r9
	mov	%r9,0x6d(%rcx)
L(bkP5QD):
	mov	0x65(%rdx),%r11
	mov	%r11,0x65(%rcx)
L(bkP5QC):
	mov	0x5d(%rdx),%r10
	mov	%r10,0x5d(%rcx)
L(bkP5QB):
	mov	0x55(%rdx),%r9
	mov	%r9,0x55(%rcx)
L(bkP5QA):
	mov	0x4d(%rdx),%r11
	mov	%r11,0x4d(%rcx)
L(bkP5Q9):
	mov	0x45(%rdx),%r10
	mov	%r10,0x45(%rcx)
L(bkP5Q8):
	mov	0x3d(%rdx),%r9
	mov	%r9,0x3d(%rcx)
L(bkP5Q7):
	mov	0x35(%rdx),%r11
	mov	%r11,0x35(%rcx)
L(bkP5Q6):
	mov	0x2d(%rdx),%r10
	mov	%r10,0x2d(%rcx)
L(bkP5Q5):
	mov	0x25(%rdx),%r9
	mov	%r9,0x25(%rcx)
L(bkP5Q4):
	mov	0x1d(%rdx),%r11
	mov	%r11,0x1d(%rcx)
L(bkP5Q3):
	mov	0x15(%rdx),%r10
	mov	%r10,0x15(%rcx)
L(bkP5Q2):
	mov	0xd(%rdx),%r9
	mov	%r9,0xd(%rcx)
L(bkP5Q1):
	mov	0x5(%rdx),%r11
	mov	%r11,0x5(%rcx)
L(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9d
	mov	%r9d,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret

	.balign 16
L(bkP6QI):
	mov	0x8e(%rdx),%r10
	mov	%r10,0x8e(%rcx)
L(bkP6QH):
	mov	0x86(%rdx),%r11
	mov	%r11,0x86(%rcx)
L(bkP6QG):
	mov	0x7e(%rdx),%r10
	mov	%r10,0x7e(%rcx)
L(bkP6QF):
	mov	0x76(%rdx),%r9
	mov	%r9,0x76(%rcx)
L(bkP6QE):
	mov	0x6e(%rdx),%r11
	mov	%r11,0x6e(%rcx)
L(bkP6QD):
	mov	0x66(%rdx),%r10
	mov	%r10,0x66(%rcx)
L(bkP6QC):
	mov	0x5e(%rdx),%r9
	mov	%r9,0x5e(%rcx)
L(bkP6QB):
	mov	0x56(%rdx),%r11
	mov	%r11,0x56(%rcx)
L(bkP6QA):
	mov	0x4e(%rdx),%r10
	mov	%r10,0x4e(%rcx)
L(bkP6Q9):
	mov	0x46(%rdx),%r9
	mov	%r9,0x46(%rcx)
L(bkP6Q8):
	mov	0x3e(%rdx),%r11
	mov	%r11,0x3e(%rcx)
L(bkP6Q7):
	mov	0x36(%rdx),%r10
	mov	%r10,0x36(%rcx)
L(bkP6Q6):
	mov	0x2e(%rdx),%r9
	mov	%r9,0x2e(%rcx)
L(bkP6Q5):
	mov	0x26(%rdx),%r11
	mov	%r11,0x26(%rcx)
L(bkP6Q4):
	mov	0x1e(%rdx),%r10
	mov	%r10,0x1e(%rcx)
L(bkP6Q3):
	mov	0x16(%rdx),%r9
	mov	%r9,0x16(%rcx)
L(bkP6Q2):
	mov	0xe(%rdx),%r11
	mov	%r11,0xe(%rcx)
L(bkP6Q1):
	mov	0x6(%rdx),%r10
	mov	%r10,0x6(%rcx)
L(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov	0x2(%rdx),%r9d
	mov	%r9d,0x2(%rcx)
	mov	(%rdx),%r10w
	mov	%r10w,(%rcx)
	ret

	.balign 16
L(bkP7QI):
	mov	0x8f(%rdx),%r10
	mov	%r10,0x8f(%rcx)
L(bkP7QH):
	mov	0x87(%rdx),%r11
	mov	%r11,0x87(%rcx)
L(bkP7QG):
	mov	0x7f(%rdx),%r10
	mov	%r10,0x7f(%rcx)
L(bkP7QF):
	mov	0x77(%rdx),%r9
	mov	%r9,0x77(%rcx)
L(bkP7QE):
	mov	0x6f(%rdx),%r11
	mov	%r11,0x6f(%rcx)
L(bkP7QD):
	mov	0x67(%rdx),%r10
	mov	%r10,0x67(%rcx)
L(bkP7QC):
	mov	0x5f(%rdx),%r9
	mov	%r9,0x5f(%rcx)
L(bkP7QB):
	mov	0x57(%rdx),%r11
	mov	%r11,0x57(%rcx)
L(bkP7QA):
	mov	0x4f(%rdx),%r10
	mov	%r10,0x4f(%rcx)
L(bkP7Q9):
	mov	0x47(%rdx),%r9
	mov	%r9,0x47(%rcx)
L(bkP7Q8):
	mov	0x3f(%rdx),%r11
	mov	%r11,0x3f(%rcx)
L(bkP7Q7):
	mov	0x37(%rdx),%r10
	mov	%r10,0x37(%rcx)
L(bkP7Q6):
	mov	0x2f(%rdx),%r9
	mov	%r9,0x2f(%rcx)
L(bkP7Q5):
	mov	0x27(%rdx),%r11
	mov	%r11,0x27(%rcx)
L(bkP7Q4):
	mov	0x1f(%rdx),%r10
	mov	%r10,0x1f(%rcx)
L(bkP7Q3):
	mov	0x17(%rdx),%r9
	mov	%r9,0x17(%rcx)
L(bkP7Q2):
	mov	0xf(%rdx),%r11
	mov	%r11,0xf(%rcx)
L(bkP7Q1):
	mov	0x7(%rdx),%r10
	mov	%r10,0x7(%rcx)
L(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov	0x3(%rdx),%r9d
	mov	%r9d,0x3(%rcx)
	mov	0x1(%rdx),%r10w
	mov	%r10w,0x1(%rcx)
	mov	(%rdx),%r11b
	mov	%r11b,(%rcx)
	ret
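
/*
 * L(bkPxQx) below is the dispatch table for the trailing copies above:
 * 0x98 signed 32-bit entries, where entry n holds the offset
 * L(bkP<n mod 8>Q<n / 8>) - L(bkPxQx).  Storing offsets from the table
 * base rather than absolute addresses keeps the table position
 * independent; the callers above rebuild the target before jumping:
 *
 *	movslq	(%r10,%r8,4),%r9	# %r10 = &L(bkPxQx), %r8 = count
 *	lea	(%r9,%r10,1),%r10	# add the offset back to the base
 *	jmpq	*%r10
 */
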
	.balign 16
L(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
	.int L(bkP1Q0)-L(bkPxQx)
	.int L(bkP2Q0)-L(bkPxQx)
	.int L(bkP3Q0)-L(bkPxQx)
	.int L(bkP4Q0)-L(bkPxQx)
	.int L(bkP5Q0)-L(bkPxQx)
	.int L(bkP6Q0)-L(bkPxQx)
	.int L(bkP7Q0)-L(bkPxQx)

	.int L(bkP0Q1)-L(bkPxQx)
	.int L(bkP1Q1)-L(bkPxQx)
	.int L(bkP2Q1)-L(bkPxQx)
	.int L(bkP3Q1)-L(bkPxQx)
	.int L(bkP4Q1)-L(bkPxQx)
	.int L(bkP5Q1)-L(bkPxQx)
	.int L(bkP6Q1)-L(bkPxQx)
	.int L(bkP7Q1)-L(bkPxQx)

	.int L(bkP0Q2)-L(bkPxQx)
	.int L(bkP1Q2)-L(bkPxQx)
	.int L(bkP2Q2)-L(bkPxQx)
	.int L(bkP3Q2)-L(bkPxQx)
	.int L(bkP4Q2)-L(bkPxQx)
	.int L(bkP5Q2)-L(bkPxQx)
	.int L(bkP6Q2)-L(bkPxQx)
	.int L(bkP7Q2)-L(bkPxQx)

	.int L(bkP0Q3)-L(bkPxQx)
	.int L(bkP1Q3)-L(bkPxQx)
	.int L(bkP2Q3)-L(bkPxQx)
	.int L(bkP3Q3)-L(bkPxQx)
	.int L(bkP4Q3)-L(bkPxQx)
	.int L(bkP5Q3)-L(bkPxQx)
	.int L(bkP6Q3)-L(bkPxQx)
	.int L(bkP7Q3)-L(bkPxQx)

	.int L(bkP0Q4)-L(bkPxQx)
	.int L(bkP1Q4)-L(bkPxQx)
	.int L(bkP2Q4)-L(bkPxQx)
	.int L(bkP3Q4)-L(bkPxQx)
	.int L(bkP4Q4)-L(bkPxQx)
	.int L(bkP5Q4)-L(bkPxQx)
	.int L(bkP6Q4)-L(bkPxQx)
	.int L(bkP7Q4)-L(bkPxQx)

	.int L(bkP0Q5)-L(bkPxQx)
	.int L(bkP1Q5)-L(bkPxQx)
	.int L(bkP2Q5)-L(bkPxQx)
	.int L(bkP3Q5)-L(bkPxQx)
	.int L(bkP4Q5)-L(bkPxQx)
	.int L(bkP5Q5)-L(bkPxQx)
	.int L(bkP6Q5)-L(bkPxQx)
	.int L(bkP7Q5)-L(bkPxQx)

	.int L(bkP0Q6)-L(bkPxQx)
	.int L(bkP1Q6)-L(bkPxQx)
	.int L(bkP2Q6)-L(bkPxQx)
	.int L(bkP3Q6)-L(bkPxQx)
	.int L(bkP4Q6)-L(bkPxQx)
	.int L(bkP5Q6)-L(bkPxQx)
	.int L(bkP6Q6)-L(bkPxQx)
	.int L(bkP7Q6)-L(bkPxQx)

	.int L(bkP0Q7)-L(bkPxQx)
	.int L(bkP1Q7)-L(bkPxQx)
	.int L(bkP2Q7)-L(bkPxQx)
	.int L(bkP3Q7)-L(bkPxQx)
	.int L(bkP4Q7)-L(bkPxQx)
	.int L(bkP5Q7)-L(bkPxQx)
	.int L(bkP6Q7)-L(bkPxQx)
	.int L(bkP7Q7)-L(bkPxQx)

	.int L(bkP0Q8)-L(bkPxQx)
	.int L(bkP1Q8)-L(bkPxQx)
	.int L(bkP2Q8)-L(bkPxQx)
	.int L(bkP3Q8)-L(bkPxQx)
	.int L(bkP4Q8)-L(bkPxQx)
	.int L(bkP5Q8)-L(bkPxQx)
	.int L(bkP6Q8)-L(bkPxQx)
	.int L(bkP7Q8)-L(bkPxQx)

	.int L(bkP0Q9)-L(bkPxQx)
	.int L(bkP1Q9)-L(bkPxQx)
	.int L(bkP2Q9)-L(bkPxQx)
	.int L(bkP3Q9)-L(bkPxQx)
	.int L(bkP4Q9)-L(bkPxQx)
	.int L(bkP5Q9)-L(bkPxQx)
	.int L(bkP6Q9)-L(bkPxQx)
	.int L(bkP7Q9)-L(bkPxQx)

	.int L(bkP0QA)-L(bkPxQx)
	.int L(bkP1QA)-L(bkPxQx)
	.int L(bkP2QA)-L(bkPxQx)
	.int L(bkP3QA)-L(bkPxQx)
	.int L(bkP4QA)-L(bkPxQx)
	.int L(bkP5QA)-L(bkPxQx)
	.int L(bkP6QA)-L(bkPxQx)
	.int L(bkP7QA)-L(bkPxQx)

	.int L(bkP0QB)-L(bkPxQx)
	.int L(bkP1QB)-L(bkPxQx)
	.int L(bkP2QB)-L(bkPxQx)
	.int L(bkP3QB)-L(bkPxQx)
	.int L(bkP4QB)-L(bkPxQx)
	.int L(bkP5QB)-L(bkPxQx)
	.int L(bkP6QB)-L(bkPxQx)
	.int L(bkP7QB)-L(bkPxQx)

	.int L(bkP0QC)-L(bkPxQx)
	.int L(bkP1QC)-L(bkPxQx)
	.int L(bkP2QC)-L(bkPxQx)
	.int L(bkP3QC)-L(bkPxQx)
	.int L(bkP4QC)-L(bkPxQx)
	.int L(bkP5QC)-L(bkPxQx)
	.int L(bkP6QC)-L(bkPxQx)
	.int L(bkP7QC)-L(bkPxQx)

	.int L(bkP0QD)-L(bkPxQx)
	.int L(bkP1QD)-L(bkPxQx)
	.int L(bkP2QD)-L(bkPxQx)
	.int L(bkP3QD)-L(bkPxQx)
	.int L(bkP4QD)-L(bkPxQx)
	.int L(bkP5QD)-L(bkPxQx)
	.int L(bkP6QD)-L(bkPxQx)
	.int L(bkP7QD)-L(bkPxQx)

	.int L(bkP0QE)-L(bkPxQx)
	.int L(bkP1QE)-L(bkPxQx)
	.int L(bkP2QE)-L(bkPxQx)
	.int L(bkP3QE)-L(bkPxQx)
	.int L(bkP4QE)-L(bkPxQx)
	.int L(bkP5QE)-L(bkPxQx)
	.int L(bkP6QE)-L(bkPxQx)
	.int L(bkP7QE)-L(bkPxQx)

	.int L(bkP0QF)-L(bkPxQx)
	.int L(bkP1QF)-L(bkPxQx)
	.int L(bkP2QF)-L(bkPxQx)
	.int L(bkP3QF)-L(bkPxQx)
	.int L(bkP4QF)-L(bkPxQx)
	.int L(bkP5QF)-L(bkPxQx)
	.int L(bkP6QF)-L(bkPxQx)
	.int L(bkP7QF)-L(bkPxQx)

	.int L(bkP0QG)-L(bkPxQx)
	.int L(bkP1QG)-L(bkPxQx)
	.int L(bkP2QG)-L(bkPxQx)
	.int L(bkP3QG)-L(bkPxQx)
	.int L(bkP4QG)-L(bkPxQx)
	.int L(bkP5QG)-L(bkPxQx)
	.int L(bkP6QG)-L(bkPxQx)
	.int L(bkP7QG)-L(bkPxQx)

	.int L(bkP0QH)-L(bkPxQx)
	.int L(bkP1QH)-L(bkPxQx)
	.int L(bkP2QH)-L(bkPxQx)
	.int L(bkP3QH)-L(bkPxQx)
	.int L(bkP4QH)-L(bkPxQx)
	.int L(bkP5QH)-L(bkPxQx)
	.int L(bkP6QH)-L(bkPxQx)
	.int L(bkP7QH)-L(bkPxQx)

	.int L(bkP0QI)-L(bkPxQx)
	.int L(bkP1QI)-L(bkPxQx)
	.int L(bkP2QI)-L(bkPxQx)
	.int L(bkP3QI)-L(bkPxQx)
	.int L(bkP4QI)-L(bkPxQx)
	.int L(bkP5QI)-L(bkPxQx)
	.int L(bkP6QI)-L(bkPxQx)
	.int L(bkP7QI)-L(bkPxQx)

	SET_SIZE(memmove)
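
/*
 * L(CopyBackwards) and the routines above form the overlap-safe half of
 * memmove.  Ignoring the alignment and length special cases, the
 * backward path behaves like this rough C model (illustration only,
 * not part of this file):
 *
 *	#include <stddef.h>
 *
 *	static void copy_backward(char *dst, const char *src, size_t n)
 *	{
 *		// Walk from the last byte down, so a destination that
 *		// overlaps the top of the source is never overwritten
 *		// before it has been read.
 *		while (n-- > 0)
 *			dst[n] = src[n];
 *	}
 */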