/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, Intel Corporation
 * All rights reserved.
 */

/*
 * memcpy.s - copies two blocks of memory
 *	Implements memcpy() and memmove() libc primitives.
 */
	.ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "synonyms.h"
#include "cache.h"
#include "proc64_id.h"

#define	L(s) .memcpy/**/s

/*
 * memcpy algorithm overview:
 *
 * Thresholds used below were determined experimentally.
 *
 * Pseudo code:
 *
 *	If (size <= 128 bytes) {
 *		do unrolled code (primarily 8-byte loads/stores) regardless of
 *		alignment.
 *	} else {
 *		Align destination to 16-byte boundary
 *
 *		if (NO_SSE) {
 *			If (size > half of the largest level cache) {
 *				Use 8-byte non-temporal stores (64-bytes/loop)
 *			} else {
 *				if (size > 4K && size <= half l1 cache size) {
 *					Use rep movsq
 *				} else {
 *					Use 8-byte loads/stores (64 bytes per loop)
 *				}
 *			}
 *
 *		} else { **USE SSE**
 *			If (size > half of the largest level cache) {
 *				Use 16-byte non-temporal stores (128-bytes per loop)
 *			} else {
 *				If (both source and destination are aligned) {
 *					Use 16-byte aligned loads and stores (128 bytes/loop)
 *				} else {
 *					use pairs of xmm registers with SSE2 or SSSE3
 *					instructions to concatenate and shift appropriately
 *					to account for source unalignment. This enables
 *					16-byte aligned loads to be done.
 *				}
 *			}
 *		}
 *
 *		Finish any remaining bytes via unrolled code above.
 *	}
 *
 * memmove overview:
 *	memmove is the same as memcpy except one case where copy needs to be
 *	done backwards. The copy backwards code is done in a similar manner.
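 *
 *	Direction-selection sketch (annotation added here; it simply restates
 *	the compares at ENTRY(memmove) below):
 *
 *	if (dst <= src || dst >= src + len)
 *		copy forward (same path as memcpy)
 *	else
 *		copy backwards from the last byte (src < dst < src + len)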
90 */ 91 92 ENTRY(memmove) 93 cmp %rsi,%rdi # if dst <= src 94 jbe L(CopyForward) # then do copy forward 95 mov %rsi,%r9 # move src to r9 96 add %rdx,%r9 # add len to get addr of end of src 97 cmp %r9,%rdi # if dst < end of src 98 jb L(CopyBackwards) # then do copy backwards 99 jmp L(CopyForward) 100 101 ENTRY (memcpy) 102L(CopyForward): 103 mov %rdx,%r8 104 mov %rdi,%rcx 105 mov %rsi,%rdx 106 mov %rdi,%rax 107 lea L(fwdPxQx)(%rip),%r11 108 cmp $0x80,%r8 # 128 109 jg L(ck_use_sse2) 110 add %r8,%rcx 111 add %r8,%rdx 112 113 movslq (%r11,%r8,4),%r10 114 lea (%r10,%r11,1),%r11 115 jmpq *%r11 116 117 .balign 16 118L(ShrtAlignNew): 119 lea L(AliPxQx)(%rip),%r11 120 mov %rcx,%r9 121 and $0xf,%r9 122 123 movslq (%r11,%r9,4),%r10 124 lea (%r10,%r11,1),%r11 125 jmpq *%r11 126 127 .balign 16 128L(fwdPxQx): .int L(P0Q0)-L(fwdPxQx) 129 .int L(P1Q0)-L(fwdPxQx) 130 .int L(P2Q0)-L(fwdPxQx) 131 .int L(P3Q0)-L(fwdPxQx) 132 .int L(P4Q0)-L(fwdPxQx) 133 .int L(P5Q0)-L(fwdPxQx) 134 .int L(P6Q0)-L(fwdPxQx) 135 .int L(P7Q0)-L(fwdPxQx) 136 137 .int L(P0Q1)-L(fwdPxQx) 138 .int L(P1Q1)-L(fwdPxQx) 139 .int L(P2Q1)-L(fwdPxQx) 140 .int L(P3Q1)-L(fwdPxQx) 141 .int L(P4Q1)-L(fwdPxQx) 142 .int L(P5Q1)-L(fwdPxQx) 143 .int L(P6Q1)-L(fwdPxQx) 144 .int L(P7Q1)-L(fwdPxQx) 145 146 .int L(P0Q2)-L(fwdPxQx) 147 .int L(P1Q2)-L(fwdPxQx) 148 .int L(P2Q2)-L(fwdPxQx) 149 .int L(P3Q2)-L(fwdPxQx) 150 .int L(P4Q2)-L(fwdPxQx) 151 .int L(P5Q2)-L(fwdPxQx) 152 .int L(P6Q2)-L(fwdPxQx) 153 .int L(P7Q2)-L(fwdPxQx) 154 155 .int L(P0Q3)-L(fwdPxQx) 156 .int L(P1Q3)-L(fwdPxQx) 157 .int L(P2Q3)-L(fwdPxQx) 158 .int L(P3Q3)-L(fwdPxQx) 159 .int L(P4Q3)-L(fwdPxQx) 160 .int L(P5Q3)-L(fwdPxQx) 161 .int L(P6Q3)-L(fwdPxQx) 162 .int L(P7Q3)-L(fwdPxQx) 163 164 .int L(P0Q4)-L(fwdPxQx) 165 .int L(P1Q4)-L(fwdPxQx) 166 .int L(P2Q4)-L(fwdPxQx) 167 .int L(P3Q4)-L(fwdPxQx) 168 .int L(P4Q4)-L(fwdPxQx) 169 .int L(P5Q4)-L(fwdPxQx) 170 .int L(P6Q4)-L(fwdPxQx) 171 .int L(P7Q4)-L(fwdPxQx) 172 173 .int L(P0Q5)-L(fwdPxQx) 174 .int L(P1Q5)-L(fwdPxQx) 175 .int L(P2Q5)-L(fwdPxQx) 176 .int L(P3Q5)-L(fwdPxQx) 177 .int L(P4Q5)-L(fwdPxQx) 178 .int L(P5Q5)-L(fwdPxQx) 179 .int L(P6Q5)-L(fwdPxQx) 180 .int L(P7Q5)-L(fwdPxQx) 181 182 .int L(P0Q6)-L(fwdPxQx) 183 .int L(P1Q6)-L(fwdPxQx) 184 .int L(P2Q6)-L(fwdPxQx) 185 .int L(P3Q6)-L(fwdPxQx) 186 .int L(P4Q6)-L(fwdPxQx) 187 .int L(P5Q6)-L(fwdPxQx) 188 .int L(P6Q6)-L(fwdPxQx) 189 .int L(P7Q6)-L(fwdPxQx) 190 191 .int L(P0Q7)-L(fwdPxQx) 192 .int L(P1Q7)-L(fwdPxQx) 193 .int L(P2Q7)-L(fwdPxQx) 194 .int L(P3Q7)-L(fwdPxQx) 195 .int L(P4Q7)-L(fwdPxQx) 196 .int L(P5Q7)-L(fwdPxQx) 197 .int L(P6Q7)-L(fwdPxQx) 198 .int L(P7Q7)-L(fwdPxQx) 199 200 .int L(P0Q8)-L(fwdPxQx) 201 .int L(P1Q8)-L(fwdPxQx) 202 .int L(P2Q8)-L(fwdPxQx) 203 .int L(P3Q8)-L(fwdPxQx) 204 .int L(P4Q8)-L(fwdPxQx) 205 .int L(P5Q8)-L(fwdPxQx) 206 .int L(P6Q8)-L(fwdPxQx) 207 .int L(P7Q8)-L(fwdPxQx) 208 209 .int L(P0Q9)-L(fwdPxQx) 210 .int L(P1Q9)-L(fwdPxQx) 211 .int L(P2Q9)-L(fwdPxQx) 212 .int L(P3Q9)-L(fwdPxQx) 213 .int L(P4Q9)-L(fwdPxQx) 214 .int L(P5Q9)-L(fwdPxQx) 215 .int L(P6Q9)-L(fwdPxQx) 216 .int L(P7Q9)-L(fwdPxQx) 217 218 .int L(P0QA)-L(fwdPxQx) 219 .int L(P1QA)-L(fwdPxQx) 220 .int L(P2QA)-L(fwdPxQx) 221 .int L(P3QA)-L(fwdPxQx) 222 .int L(P4QA)-L(fwdPxQx) 223 .int L(P5QA)-L(fwdPxQx) 224 .int L(P6QA)-L(fwdPxQx) 225 .int L(P7QA)-L(fwdPxQx) 226 227 .int L(P0QB)-L(fwdPxQx) 228 .int L(P1QB)-L(fwdPxQx) 229 .int L(P2QB)-L(fwdPxQx) 230 .int L(P3QB)-L(fwdPxQx) 231 .int L(P4QB)-L(fwdPxQx) 232 .int L(P5QB)-L(fwdPxQx) 233 .int L(P6QB)-L(fwdPxQx) 234 .int L(P7QB)-L(fwdPxQx) 235 236 .int 
L(P0QC)-L(fwdPxQx) 237 .int L(P1QC)-L(fwdPxQx) 238 .int L(P2QC)-L(fwdPxQx) 239 .int L(P3QC)-L(fwdPxQx) 240 .int L(P4QC)-L(fwdPxQx) 241 .int L(P5QC)-L(fwdPxQx) 242 .int L(P6QC)-L(fwdPxQx) 243 .int L(P7QC)-L(fwdPxQx) 244 245 .int L(P0QD)-L(fwdPxQx) 246 .int L(P1QD)-L(fwdPxQx) 247 .int L(P2QD)-L(fwdPxQx) 248 .int L(P3QD)-L(fwdPxQx) 249 .int L(P4QD)-L(fwdPxQx) 250 .int L(P5QD)-L(fwdPxQx) 251 .int L(P6QD)-L(fwdPxQx) 252 .int L(P7QD)-L(fwdPxQx) 253 254 .int L(P0QE)-L(fwdPxQx) 255 .int L(P1QE)-L(fwdPxQx) 256 .int L(P2QE)-L(fwdPxQx) 257 .int L(P3QE)-L(fwdPxQx) 258 .int L(P4QE)-L(fwdPxQx) 259 .int L(P5QE)-L(fwdPxQx) 260 .int L(P6QE)-L(fwdPxQx) 261 .int L(P7QE)-L(fwdPxQx) 262 263 .int L(P0QF)-L(fwdPxQx) 264 .int L(P1QF)-L(fwdPxQx) 265 .int L(P2QF)-L(fwdPxQx) 266 .int L(P3QF)-L(fwdPxQx) 267 .int L(P4QF)-L(fwdPxQx) 268 .int L(P5QF)-L(fwdPxQx) 269 .int L(P6QF)-L(fwdPxQx) 270 .int L(P7QF)-L(fwdPxQx) 271 272 .int L(P0QG)-L(fwdPxQx) # 0x80 273 274 .balign 16 275L(AliPxQx): .int L(now_qw_aligned)-L(AliPxQx) 276 .int L(A1Q0)-L(AliPxQx) 277 .int L(A2Q0)-L(AliPxQx) 278 .int L(A3Q0)-L(AliPxQx) 279 .int L(A4Q0)-L(AliPxQx) 280 .int L(A5Q0)-L(AliPxQx) 281 .int L(A6Q0)-L(AliPxQx) 282 .int L(A7Q0)-L(AliPxQx) 283 .int L(A0Q1)-L(AliPxQx) 284 .int L(A1Q1)-L(AliPxQx) 285 .int L(A2Q1)-L(AliPxQx) 286 .int L(A3Q1)-L(AliPxQx) 287 .int L(A4Q1)-L(AliPxQx) 288 .int L(A5Q1)-L(AliPxQx) 289 .int L(A6Q1)-L(AliPxQx) 290 .int L(A7Q1)-L(AliPxQx) 291 292 .balign 16 293L(A1Q0): # ; need to move 8+ 7=1+2+4 bytes 294 movzbq (%rdx),%r11 295 sub $0xf,%r8 296 mov %r11b,(%rcx) 297 298 movzwq 0x1(%rdx),%r10 299 mov %r10w,0x1(%rcx) 300 301 mov 0x3(%rdx),%r9d 302 mov %r9d,0x3(%rcx) 303 304 mov 0x7(%rdx),%r11 305 add $0xf,%rdx 306 mov %r11,0x7(%rcx) 307 308 add $0xf,%rcx 309 jmp L(now_qw_aligned) 310 311 .balign 16 312L(A2Q0): # ; need to move 8+ 6=2+4 bytes 313 movzwq (%rdx),%r10 314 sub $0xe,%r8 315 mov %r10w,(%rcx) 316 317 mov 0x2(%rdx),%r9d 318 mov %r9d,0x2(%rcx) 319 320 mov 0x6(%rdx),%r11 321 add $0xe,%rdx 322 mov %r11,0x6(%rcx) 323 add $0xe,%rcx 324 jmp L(now_qw_aligned) 325 326 .balign 16 327L(A3Q0): # ; need to move 8+ 5=1+4 bytes 328 movzbq (%rdx),%r11 329 sub $0xd,%r8 330 mov %r11b,(%rcx) 331 332 mov 0x1(%rdx),%r9d 333 mov %r9d,0x1(%rcx) 334 335 mov 0x5(%rdx),%r10 336 add $0xd,%rdx 337 mov %r10,0x5(%rcx) 338 339 add $0xd,%rcx 340 jmp L(now_qw_aligned) 341 342 .balign 16 343L(A4Q0): # ; need to move 8+4 bytes 344 mov (%rdx),%r9d 345 sub $0xc,%r8 346 mov %r9d,(%rcx) 347 348 mov 0x4(%rdx),%r10 349 add $0xc,%rdx 350 mov %r10,0x4(%rcx) 351 352 add $0xc,%rcx 353 jmp L(now_qw_aligned) 354 355 .balign 16 356L(A5Q0): # ; need to move 8+ 3=1+2 bytes 357 movzbq (%rdx),%r11 358 sub $0xb,%r8 359 mov %r11b,(%rcx) 360 361 movzwq 0x1(%rdx),%r10 362 mov %r10w,0x1(%rcx) 363 364 mov 0x3(%rdx),%r9 365 add $0xb,%rdx 366 mov %r9,0x3(%rcx) 367 368 add $0xb,%rcx 369 jmp L(now_qw_aligned) 370 371 .balign 16 372L(A6Q0): # ; need to move 8+2 bytes 373 movzwq (%rdx),%r10 374 sub $0xa,%r8 375 mov %r10w,(%rcx) 376 377 mov 0x2(%rdx),%r9 378 add $0xa,%rdx 379 mov %r9,0x2(%rcx) 380 381 add $0xa,%rcx 382 jmp L(now_qw_aligned) 383 384 .balign 16 385L(A7Q0): # ; need to move 8+1 byte 386 movzbq (%rdx),%r11 387 sub $0x9,%r8 388 mov %r11b,(%rcx) 389 390 mov 0x1(%rdx),%r10 391 add $0x9,%rdx 392 mov %r10,0x1(%rcx) 393 394 add $0x9,%rcx 395 jmp L(now_qw_aligned) 396 397 .balign 16 398L(A0Q1): # ; need to move 8 bytes 399 400 mov (%rdx),%r10 401 add $0x8,%rdx 402 sub $0x8,%r8 403 mov %r10,(%rcx) 404 405 add $0x8,%rcx 406 jmp L(now_qw_aligned) 407 408 .balign 16 409L(A1Q1): # 
; need to move 7=1+2+4 bytes 410 movzbq (%rdx),%r11 411 sub $0x7,%r8 412 mov %r11b,(%rcx) 413 414 movzwq 0x1(%rdx),%r10 415 mov %r10w,0x1(%rcx) 416 417 mov 0x3(%rdx),%r9d 418 add $0x7,%rdx 419 mov %r9d,0x3(%rcx) 420 add $0x7,%rcx 421 jmp L(now_qw_aligned) 422 423 .balign 16 424L(A2Q1): # ; need to move 6=2+4 bytes 425 movzwq (%rdx),%r10 426 sub $0x6,%r8 427 mov %r10w,(%rcx) 428 mov 0x2(%rdx),%r9d 429 add $0x6,%rdx 430 mov %r9d,0x2(%rcx) 431 add $0x6,%rcx 432 jmp L(now_qw_aligned) 433 434 .balign 16 435L(A3Q1): # ; need to move 5=1+4 bytes 436 movzbq (%rdx),%r11 437 sub $0x5,%r8 438 mov %r11b,(%rcx) 439 mov 0x1(%rdx),%r9d 440 add $0x5,%rdx 441 mov %r9d,0x1(%rcx) 442 add $0x5,%rcx 443 jmp L(now_qw_aligned) 444 445 .balign 16 446L(A4Q1): # ; need to move 4 bytes 447 mov (%rdx),%r9d 448 sub $0x4,%r8 449 add $0x4,%rdx 450 mov %r9d,(%rcx) 451 add $0x4,%rcx 452 jmp L(now_qw_aligned) 453 454 .balign 16 455L(A5Q1): # ; need to move 3=1+2 bytes 456 movzbq (%rdx),%r11 457 sub $0x3,%r8 458 mov %r11b,(%rcx) 459 460 movzwq 0x1(%rdx),%r10 461 add $0x3,%rdx 462 mov %r10w,0x1(%rcx) 463 464 add $0x3,%rcx 465 jmp L(now_qw_aligned) 466 467 .balign 16 468L(A6Q1): # ; need to move 2 bytes 469 movzwq (%rdx),%r10 470 sub $0x2,%r8 471 add $0x2,%rdx 472 mov %r10w,(%rcx) 473 add $0x2,%rcx 474 jmp L(now_qw_aligned) 475 476 .balign 16 477L(A7Q1): # ; need to move 1 byte 478 movzbq (%rdx),%r11 479 dec %r8 480 inc %rdx 481 mov %r11b,(%rcx) 482 inc %rcx 483 jmp L(now_qw_aligned) 484 485 486 .balign 16 487L(P0QG): 488 mov -0x80(%rdx),%r9 489 mov %r9,-0x80(%rcx) 490L(P0QF): 491 mov -0x78(%rdx),%r10 492 mov %r10,-0x78(%rcx) 493L(P0QE): 494 mov -0x70(%rdx),%r9 495 mov %r9,-0x70(%rcx) 496L(P0QD): 497 mov -0x68(%rdx),%r10 498 mov %r10,-0x68(%rcx) 499L(P0QC): 500 mov -0x60(%rdx),%r9 501 mov %r9,-0x60(%rcx) 502L(P0QB): 503 mov -0x58(%rdx),%r10 504 mov %r10,-0x58(%rcx) 505L(P0QA): 506 mov -0x50(%rdx),%r9 507 mov %r9,-0x50(%rcx) 508L(P0Q9): 509 mov -0x48(%rdx),%r10 510 mov %r10,-0x48(%rcx) 511L(P0Q8): 512 mov -0x40(%rdx),%r9 513 mov %r9,-0x40(%rcx) 514L(P0Q7): 515 mov -0x38(%rdx),%r10 516 mov %r10,-0x38(%rcx) 517L(P0Q6): 518 mov -0x30(%rdx),%r9 519 mov %r9,-0x30(%rcx) 520L(P0Q5): 521 mov -0x28(%rdx),%r10 522 mov %r10,-0x28(%rcx) 523L(P0Q4): 524 mov -0x20(%rdx),%r9 525 mov %r9,-0x20(%rcx) 526L(P0Q3): 527 mov -0x18(%rdx),%r10 528 mov %r10,-0x18(%rcx) 529L(P0Q2): 530 mov -0x10(%rdx),%r9 531 mov %r9,-0x10(%rcx) 532L(P0Q1): 533 mov -0x8(%rdx),%r10 534 mov %r10,-0x8(%rcx) 535L(P0Q0): 536 ret 537 538 .balign 16 539L(P1QF): 540 mov -0x79(%rdx),%r9 541 mov %r9,-0x79(%rcx) 542L(P1QE): 543 mov -0x71(%rdx),%r11 544 mov %r11,-0x71(%rcx) 545L(P1QD): 546 mov -0x69(%rdx),%r10 547 mov %r10,-0x69(%rcx) 548L(P1QC): 549 mov -0x61(%rdx),%r9 550 mov %r9,-0x61(%rcx) 551L(P1QB): 552 mov -0x59(%rdx),%r11 553 mov %r11,-0x59(%rcx) 554L(P1QA): 555 mov -0x51(%rdx),%r10 556 mov %r10,-0x51(%rcx) 557L(P1Q9): 558 mov -0x49(%rdx),%r9 559 mov %r9,-0x49(%rcx) 560L(P1Q8): 561 mov -0x41(%rdx),%r11 562 mov %r11,-0x41(%rcx) 563L(P1Q7): 564 mov -0x39(%rdx),%r10 565 mov %r10,-0x39(%rcx) 566L(P1Q6): 567 mov -0x31(%rdx),%r9 568 mov %r9,-0x31(%rcx) 569L(P1Q5): 570 mov -0x29(%rdx),%r11 571 mov %r11,-0x29(%rcx) 572L(P1Q4): 573 mov -0x21(%rdx),%r10 574 mov %r10,-0x21(%rcx) 575L(P1Q3): 576 mov -0x19(%rdx),%r9 577 mov %r9,-0x19(%rcx) 578L(P1Q2): 579 mov -0x11(%rdx),%r11 580 mov %r11,-0x11(%rcx) 581L(P1Q1): 582 mov -0x9(%rdx),%r10 583 mov %r10,-0x9(%rcx) 584L(P1Q0): 585 movzbq -0x1(%rdx),%r9 586 mov %r9b,-0x1(%rcx) 587 ret 588 589 .balign 16 590L(P2QF): 591 mov -0x7a(%rdx),%r9 592 
mov %r9,-0x7a(%rcx) 593L(P2QE): 594 mov -0x72(%rdx),%r11 595 mov %r11,-0x72(%rcx) 596L(P2QD): 597 mov -0x6a(%rdx),%r10 598 mov %r10,-0x6a(%rcx) 599L(P2QC): 600 mov -0x62(%rdx),%r9 601 mov %r9,-0x62(%rcx) 602L(P2QB): 603 mov -0x5a(%rdx),%r11 604 mov %r11,-0x5a(%rcx) 605L(P2QA): 606 mov -0x52(%rdx),%r10 607 mov %r10,-0x52(%rcx) 608L(P2Q9): 609 mov -0x4a(%rdx),%r9 610 mov %r9,-0x4a(%rcx) 611L(P2Q8): 612 mov -0x42(%rdx),%r11 613 mov %r11,-0x42(%rcx) 614L(P2Q7): 615 mov -0x3a(%rdx),%r10 616 mov %r10,-0x3a(%rcx) 617L(P2Q6): 618 mov -0x32(%rdx),%r9 619 mov %r9,-0x32(%rcx) 620L(P2Q5): 621 mov -0x2a(%rdx),%r11 622 mov %r11,-0x2a(%rcx) 623L(P2Q4): 624 mov -0x22(%rdx),%r10 625 mov %r10,-0x22(%rcx) 626L(P2Q3): 627 mov -0x1a(%rdx),%r9 628 mov %r9,-0x1a(%rcx) 629L(P2Q2): 630 mov -0x12(%rdx),%r11 631 mov %r11,-0x12(%rcx) 632L(P2Q1): 633 mov -0xa(%rdx),%r10 634 mov %r10,-0xa(%rcx) 635L(P2Q0): 636 movzwq -0x2(%rdx),%r9 637 mov %r9w,-0x2(%rcx) 638 ret 639 640 .balign 16 641L(P3QF): 642 mov -0x7b(%rdx),%r9 643 mov %r9,-0x7b(%rcx) 644L(P3QE): 645 mov -0x73(%rdx),%r11 646 mov %r11,-0x73(%rcx) 647L(P3QD): 648 mov -0x6b(%rdx),%r10 649 mov %r10,-0x6b(%rcx) 650L(P3QC): 651 mov -0x63(%rdx),%r9 652 mov %r9,-0x63(%rcx) 653L(P3QB): 654 mov -0x5b(%rdx),%r11 655 mov %r11,-0x5b(%rcx) 656L(P3QA): 657 mov -0x53(%rdx),%r10 658 mov %r10,-0x53(%rcx) 659L(P3Q9): 660 mov -0x4b(%rdx),%r9 661 mov %r9,-0x4b(%rcx) 662L(P3Q8): 663 mov -0x43(%rdx),%r11 664 mov %r11,-0x43(%rcx) 665L(P3Q7): 666 mov -0x3b(%rdx),%r10 667 mov %r10,-0x3b(%rcx) 668L(P3Q6): 669 mov -0x33(%rdx),%r9 670 mov %r9,-0x33(%rcx) 671L(P3Q5): 672 mov -0x2b(%rdx),%r11 673 mov %r11,-0x2b(%rcx) 674L(P3Q4): 675 mov -0x23(%rdx),%r10 676 mov %r10,-0x23(%rcx) 677L(P3Q3): 678 mov -0x1b(%rdx),%r9 679 mov %r9,-0x1b(%rcx) 680L(P3Q2): 681 mov -0x13(%rdx),%r11 682 mov %r11,-0x13(%rcx) 683L(P3Q1): 684 mov -0xb(%rdx),%r10 685 mov %r10,-0xb(%rcx) 686 /* 687 * These trailing loads/stores have to do all their loads 1st, 688 * then do the stores. 
689 */ 690L(P3Q0): 691 movzwq -0x3(%rdx),%r9 692 movzbq -0x1(%rdx),%r10 693 mov %r9w,-0x3(%rcx) 694 mov %r10b,-0x1(%rcx) 695 ret 696 697 .balign 16 698L(P4QF): 699 mov -0x7c(%rdx),%r9 700 mov %r9,-0x7c(%rcx) 701L(P4QE): 702 mov -0x74(%rdx),%r11 703 mov %r11,-0x74(%rcx) 704L(P4QD): 705 mov -0x6c(%rdx),%r10 706 mov %r10,-0x6c(%rcx) 707L(P4QC): 708 mov -0x64(%rdx),%r9 709 mov %r9,-0x64(%rcx) 710L(P4QB): 711 mov -0x5c(%rdx),%r11 712 mov %r11,-0x5c(%rcx) 713L(P4QA): 714 mov -0x54(%rdx),%r10 715 mov %r10,-0x54(%rcx) 716L(P4Q9): 717 mov -0x4c(%rdx),%r9 718 mov %r9,-0x4c(%rcx) 719L(P4Q8): 720 mov -0x44(%rdx),%r11 721 mov %r11,-0x44(%rcx) 722L(P4Q7): 723 mov -0x3c(%rdx),%r10 724 mov %r10,-0x3c(%rcx) 725L(P4Q6): 726 mov -0x34(%rdx),%r9 727 mov %r9,-0x34(%rcx) 728L(P4Q5): 729 mov -0x2c(%rdx),%r11 730 mov %r11,-0x2c(%rcx) 731L(P4Q4): 732 mov -0x24(%rdx),%r10 733 mov %r10,-0x24(%rcx) 734L(P4Q3): 735 mov -0x1c(%rdx),%r9 736 mov %r9,-0x1c(%rcx) 737L(P4Q2): 738 mov -0x14(%rdx),%r11 739 mov %r11,-0x14(%rcx) 740L(P4Q1): 741 mov -0xc(%rdx),%r10 742 mov %r10,-0xc(%rcx) 743L(P4Q0): 744 mov -0x4(%rdx),%r9d 745 mov %r9d,-0x4(%rcx) 746 ret 747 748 .balign 16 749L(P5QF): 750 mov -0x7d(%rdx),%r9 751 mov %r9,-0x7d(%rcx) 752L(P5QE): 753 mov -0x75(%rdx),%r11 754 mov %r11,-0x75(%rcx) 755L(P5QD): 756 mov -0x6d(%rdx),%r10 757 mov %r10,-0x6d(%rcx) 758L(P5QC): 759 mov -0x65(%rdx),%r9 760 mov %r9,-0x65(%rcx) 761L(P5QB): 762 mov -0x5d(%rdx),%r11 763 mov %r11,-0x5d(%rcx) 764L(P5QA): 765 mov -0x55(%rdx),%r10 766 mov %r10,-0x55(%rcx) 767L(P5Q9): 768 mov -0x4d(%rdx),%r9 769 mov %r9,-0x4d(%rcx) 770L(P5Q8): 771 mov -0x45(%rdx),%r11 772 mov %r11,-0x45(%rcx) 773L(P5Q7): 774 mov -0x3d(%rdx),%r10 775 mov %r10,-0x3d(%rcx) 776L(P5Q6): 777 mov -0x35(%rdx),%r9 778 mov %r9,-0x35(%rcx) 779L(P5Q5): 780 mov -0x2d(%rdx),%r11 781 mov %r11,-0x2d(%rcx) 782L(P5Q4): 783 mov -0x25(%rdx),%r10 784 mov %r10,-0x25(%rcx) 785L(P5Q3): 786 mov -0x1d(%rdx),%r9 787 mov %r9,-0x1d(%rcx) 788L(P5Q2): 789 mov -0x15(%rdx),%r11 790 mov %r11,-0x15(%rcx) 791L(P5Q1): 792 mov -0xd(%rdx),%r10 793 mov %r10,-0xd(%rcx) 794 /* 795 * These trailing loads/stores have to do all their loads 1st, 796 * then do the stores. 797 */ 798L(P5Q0): 799 mov -0x5(%rdx),%r9d 800 movzbq -0x1(%rdx),%r10 801 mov %r9d,-0x5(%rcx) 802 mov %r10b,-0x1(%rcx) 803 ret 804 805 .balign 16 806L(P6QF): 807 mov -0x7e(%rdx),%r9 808 mov %r9,-0x7e(%rcx) 809L(P6QE): 810 mov -0x76(%rdx),%r11 811 mov %r11,-0x76(%rcx) 812L(P6QD): 813 mov -0x6e(%rdx),%r10 814 mov %r10,-0x6e(%rcx) 815L(P6QC): 816 mov -0x66(%rdx),%r9 817 mov %r9,-0x66(%rcx) 818L(P6QB): 819 mov -0x5e(%rdx),%r11 820 mov %r11,-0x5e(%rcx) 821L(P6QA): 822 mov -0x56(%rdx),%r10 823 mov %r10,-0x56(%rcx) 824L(P6Q9): 825 mov -0x4e(%rdx),%r9 826 mov %r9,-0x4e(%rcx) 827L(P6Q8): 828 mov -0x46(%rdx),%r11 829 mov %r11,-0x46(%rcx) 830L(P6Q7): 831 mov -0x3e(%rdx),%r10 832 mov %r10,-0x3e(%rcx) 833L(P6Q6): 834 mov -0x36(%rdx),%r9 835 mov %r9,-0x36(%rcx) 836L(P6Q5): 837 mov -0x2e(%rdx),%r11 838 mov %r11,-0x2e(%rcx) 839L(P6Q4): 840 mov -0x26(%rdx),%r10 841 mov %r10,-0x26(%rcx) 842L(P6Q3): 843 mov -0x1e(%rdx),%r9 844 mov %r9,-0x1e(%rcx) 845L(P6Q2): 846 mov -0x16(%rdx),%r11 847 mov %r11,-0x16(%rcx) 848L(P6Q1): 849 mov -0xe(%rdx),%r10 850 mov %r10,-0xe(%rcx) 851 /* 852 * These trailing loads/stores have to do all their loads 1st, 853 * then do the stores. 
854 */ 855L(P6Q0): 856 mov -0x6(%rdx),%r9d 857 movzwq -0x2(%rdx),%r10 858 mov %r9d,-0x6(%rcx) 859 mov %r10w,-0x2(%rcx) 860 ret 861 862 .balign 16 863L(P7QF): 864 mov -0x7f(%rdx),%r9 865 mov %r9,-0x7f(%rcx) 866L(P7QE): 867 mov -0x77(%rdx),%r11 868 mov %r11,-0x77(%rcx) 869L(P7QD): 870 mov -0x6f(%rdx),%r10 871 mov %r10,-0x6f(%rcx) 872L(P7QC): 873 mov -0x67(%rdx),%r9 874 mov %r9,-0x67(%rcx) 875L(P7QB): 876 mov -0x5f(%rdx),%r11 877 mov %r11,-0x5f(%rcx) 878L(P7QA): 879 mov -0x57(%rdx),%r10 880 mov %r10,-0x57(%rcx) 881L(P7Q9): 882 mov -0x4f(%rdx),%r9 883 mov %r9,-0x4f(%rcx) 884L(P7Q8): 885 mov -0x47(%rdx),%r11 886 mov %r11,-0x47(%rcx) 887L(P7Q7): 888 mov -0x3f(%rdx),%r10 889 mov %r10,-0x3f(%rcx) 890L(P7Q6): 891 mov -0x37(%rdx),%r9 892 mov %r9,-0x37(%rcx) 893L(P7Q5): 894 mov -0x2f(%rdx),%r11 895 mov %r11,-0x2f(%rcx) 896L(P7Q4): 897 mov -0x27(%rdx),%r10 898 mov %r10,-0x27(%rcx) 899L(P7Q3): 900 mov -0x1f(%rdx),%r9 901 mov %r9,-0x1f(%rcx) 902L(P7Q2): 903 mov -0x17(%rdx),%r11 904 mov %r11,-0x17(%rcx) 905L(P7Q1): 906 mov -0xf(%rdx),%r10 907 mov %r10,-0xf(%rcx) 908 /* 909 * These trailing loads/stores have to do all their loads 1st, 910 * then do the stores. 911 */ 912L(P7Q0): 913 mov -0x7(%rdx),%r9d 914 movzwq -0x3(%rdx),%r10 915 movzbq -0x1(%rdx),%r11 916 mov %r9d,-0x7(%rcx) 917 mov %r10w,-0x3(%rcx) 918 mov %r11b,-0x1(%rcx) 919 ret 920 921 .balign 16 922L(ck_use_sse2): 923 /* 924 * Align dest to 16 byte boundary. 925 */ 926 test $0xf,%rcx 927 jnz L(ShrtAlignNew) 928 929L(now_qw_aligned): 930 cmpl $NO_SSE,.memops_method(%rip) 931 je L(Loop8byte_pre) 932 933 /* 934 * The fall-through path is to do SSE2 16-byte load/stores 935 */ 936 937 /* 938 * If current move size is larger than half of the highest level cache 939 * size, then do non-temporal moves. 940 */ 941 mov .largest_level_cache_size(%rip),%r9d 942 shr %r9 # take half of it 943 cmp %r9,%r8 944 jg L(sse2_nt_move) 945 946 /* 947 * If both the source and dest are aligned, then use the both aligned 948 * logic. Well aligned data should reap the rewards. 949 */ 950 test $0xf,%rdx 951 jz L(pre_both_aligned) 952 953 lea L(SSE_src)(%rip),%r10 # SSE2 (default) 954 testl $USE_SSSE3,.memops_method(%rip) 955 jz 1f 956 lea L(SSSE3_src)(%rip),%r10 # SSSE3 957 9581: 959 /* 960 * if the src is not 16 byte aligned... 
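 *
 * (Added note summarizing the code below:) the first 16 bytes are copied
 * with an unaligned load and an aligned store (the destination is already
 * 16-byte aligned here), then src is advanced to the next 16-byte boundary,
 * so it lags the destination by the misalignment amount (1..15, kept in
 * %r11) and the first aligned chunk is preloaded into %xmm1.  The loops
 * dispatched through L(SSE_src)/L(SSSE3_src) compensate by combining each
 * pair of consecutive aligned 16-byte loads -- psrldq/pslldq/por for SSE2,
 * palignr for SSSE3 -- to extract the 16 bytes starting at that offset
 * within the concatenated pair.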
961 */ 962 mov %rdx,%r11 963 and $0xf,%r11 964 movdqu (%rdx),%xmm0 965 movdqa %xmm0,(%rcx) 966 add $0x10,%rdx 967 sub %r11,%rdx 968 add $0x10,%rcx 969 sub $0x10,%r8 970 movdqa (%rdx),%xmm1 971 972 movslq (%r10,%r11,4),%r9 973 lea (%r9,%r10,1),%r10 974 jmpq *%r10 975 976 .balign 16 977L(SSSE3_src): .int L(pre_both_aligned)-L(SSSE3_src) 978 .int L(mov3dqa1) -L(SSSE3_src) 979 .int L(mov3dqa2) -L(SSSE3_src) 980 .int L(mov3dqa3) -L(SSSE3_src) 981 .int L(mov3dqa4) -L(SSSE3_src) 982 .int L(mov3dqa5) -L(SSSE3_src) 983 .int L(mov3dqa6) -L(SSSE3_src) 984 .int L(mov3dqa7) -L(SSSE3_src) 985 .int L(movdqa8) -L(SSSE3_src) 986 .int L(mov3dqa9) -L(SSSE3_src) 987 .int L(mov3dqa10)-L(SSSE3_src) 988 .int L(mov3dqa11)-L(SSSE3_src) 989 .int L(mov3dqa12)-L(SSSE3_src) 990 .int L(mov3dqa13)-L(SSSE3_src) 991 .int L(mov3dqa14)-L(SSSE3_src) 992 .int L(mov3dqa15)-L(SSSE3_src) 993L(SSE_src): .int L(pre_both_aligned)-L(SSE_src) 994 .int L(movdqa1) -L(SSE_src) 995 .int L(movdqa2) -L(SSE_src) 996 .int L(movdqa3) -L(SSE_src) 997 .int L(movdqa4) -L(SSE_src) 998 .int L(movdqa5) -L(SSE_src) 999 .int L(movdqa6) -L(SSE_src) 1000 .int L(movdqa7) -L(SSE_src) 1001 .int L(movdqa8) -L(SSE_src) 1002 .int L(movdqa9) -L(SSE_src) 1003 .int L(movdqa10)-L(SSE_src) 1004 .int L(movdqa11)-L(SSE_src) 1005 .int L(movdqa12)-L(SSE_src) 1006 .int L(movdqa13)-L(SSE_src) 1007 .int L(movdqa14)-L(SSE_src) 1008 .int L(movdqa15)-L(SSE_src) 1009 1010 .balign 16 1011L(movdqa1): 1012 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1013 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1014 lea 0x20(%rdx),%rdx 1015 lea -0x20(%r8),%r8 1016 1017 psrldq $0x1,%xmm1 # shift right prev buffer (saved from last iteration) 1018 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1019 pslldq $0xf,%xmm3 # shift the current buffer left (shift in zeros) 1020 por %xmm1,%xmm3 # OR them together 1021 cmp $0x20,%r8 1022 1023 psrldq $0x1,%xmm2 # shift right prev buffer (saved from last iteration) 1024 movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration 1025 pslldq $0xf,%xmm0 # shift the current buffer left (shift in zeros) 1026 por %xmm2,%xmm0 # OR them together 1027 movdqa %xmm3,(%rcx) # store it 1028 movdqa %xmm0,0x10(%rcx) # store it 1029 lea 0x20(%rcx),%rcx 1030 1031 jge L(movdqa1) 1032 jmp L(movdqa_epi) 1033 1034 .balign 16 1035L(movdqa2): 1036 sub $0x20,%r8 1037 movdqa 0x10(%rdx),%xmm3 1038 movdqa 0x20(%rdx),%xmm0 1039 add $0x20,%rdx 1040 1041 psrldq $0x2,%xmm1 1042 movdqa %xmm3,%xmm2 1043 pslldq $0xe,%xmm3 1044 por %xmm1,%xmm3 1045 1046 psrldq $0x2,%xmm2 1047 movdqa %xmm0,%xmm1 1048 pslldq $0xe,%xmm0 1049 por %xmm2,%xmm0 1050 movdqa %xmm3,(%rcx) 1051 movdqa %xmm0,0x10(%rcx) 1052 1053 add $0x20,%rcx 1054 cmp $0x20,%r8 1055 jge L(movdqa2) 1056 jmp L(movdqa_epi) 1057 1058 .balign 16 1059L(movdqa3): 1060 sub $0x20,%r8 1061 movdqa 0x10(%rdx),%xmm3 1062 movdqa 0x20(%rdx),%xmm0 1063 add $0x20,%rdx 1064 1065 psrldq $0x3,%xmm1 1066 movdqa %xmm3,%xmm2 1067 pslldq $0xd,%xmm3 1068 por %xmm1,%xmm3 1069 1070 psrldq $0x3,%xmm2 1071 movdqa %xmm0,%xmm1 1072 pslldq $0xd,%xmm0 1073 por %xmm2,%xmm0 1074 movdqa %xmm3,(%rcx) 1075 movdqa %xmm0,0x10(%rcx) 1076 1077 add $0x20,%rcx 1078 cmp $0x20,%r8 1079 jge L(movdqa3) 1080 jmp L(movdqa_epi) 1081 1082 .balign 16 1083L(movdqa4): 1084 sub $0x20,%r8 1085 movdqa 0x10(%rdx),%xmm3 1086 movdqa 0x20(%rdx),%xmm0 1087 add $0x20,%rdx 1088 1089 psrldq $0x4,%xmm1 1090 movdqa %xmm3,%xmm2 1091 pslldq $0xc,%xmm3 1092 por %xmm1,%xmm3 1093 1094 psrldq $0x4,%xmm2 1095 movdqa %xmm0,%xmm1 1096 pslldq $0xc,%xmm0 1097 por 
%xmm2,%xmm0 1098 1099 movdqa %xmm3,(%rcx) 1100 movdqa %xmm0,0x10(%rcx) 1101 1102 add $0x20,%rcx 1103 cmp $0x20,%r8 1104 jge L(movdqa4) 1105 jmp L(movdqa_epi) 1106 1107 .balign 16 1108L(movdqa5): 1109 sub $0x20,%r8 1110 movdqa 0x10(%rdx),%xmm3 1111 movdqa 0x20(%rdx),%xmm0 1112 add $0x20,%rdx 1113 1114 psrldq $0x5,%xmm1 1115 movdqa %xmm3,%xmm2 1116 pslldq $0xb,%xmm3 1117 por %xmm1,%xmm3 1118 1119 psrldq $0x5,%xmm2 1120 movdqa %xmm0,%xmm1 1121 pslldq $0xb,%xmm0 1122 por %xmm2,%xmm0 1123 1124 movdqa %xmm3,(%rcx) 1125 movdqa %xmm0,0x10(%rcx) 1126 1127 add $0x20,%rcx 1128 cmp $0x20,%r8 1129 jge L(movdqa5) 1130 jmp L(movdqa_epi) 1131 1132 .balign 16 1133L(movdqa6): 1134 sub $0x20,%r8 1135 movdqa 0x10(%rdx),%xmm3 1136 movdqa 0x20(%rdx),%xmm0 1137 add $0x20,%rdx 1138 1139 psrldq $0x6,%xmm1 1140 movdqa %xmm3,%xmm2 1141 pslldq $0xa,%xmm3 1142 por %xmm1,%xmm3 1143 1144 psrldq $0x6,%xmm2 1145 movdqa %xmm0,%xmm1 1146 pslldq $0xa,%xmm0 1147 por %xmm2,%xmm0 1148 movdqa %xmm3,(%rcx) 1149 movdqa %xmm0,0x10(%rcx) 1150 1151 add $0x20,%rcx 1152 cmp $0x20,%r8 1153 jge L(movdqa6) 1154 jmp L(movdqa_epi) 1155 1156 .balign 16 1157L(movdqa7): 1158 sub $0x20,%r8 1159 movdqa 0x10(%rdx),%xmm3 1160 movdqa 0x20(%rdx),%xmm0 1161 add $0x20,%rdx 1162 1163 psrldq $0x7,%xmm1 1164 movdqa %xmm3,%xmm2 1165 pslldq $0x9,%xmm3 1166 por %xmm1,%xmm3 1167 1168 psrldq $0x7,%xmm2 1169 movdqa %xmm0,%xmm1 1170 pslldq $0x9,%xmm0 1171 por %xmm2,%xmm0 1172 movdqa %xmm3,(%rcx) 1173 movdqa %xmm0,0x10(%rcx) 1174 1175 add $0x20,%rcx 1176 cmp $0x20,%r8 1177 jge L(movdqa7) 1178 jmp L(movdqa_epi) 1179 1180 .balign 16 1181L(movdqa8): 1182 movdqa 0x10(%rdx),%xmm3 1183 sub $0x30,%r8 1184 movdqa 0x20(%rdx),%xmm0 1185 movdqa 0x30(%rdx),%xmm5 1186 lea 0x30(%rdx),%rdx 1187 1188 shufpd $0x1,%xmm3,%xmm1 1189 movdqa %xmm1,(%rcx) 1190 1191 cmp $0x30,%r8 1192 1193 shufpd $0x1,%xmm0,%xmm3 1194 movdqa %xmm3,0x10(%rcx) 1195 1196 movdqa %xmm5,%xmm1 1197 shufpd $0x1,%xmm5,%xmm0 1198 movdqa %xmm0,0x20(%rcx) 1199 1200 lea 0x30(%rcx),%rcx 1201 1202 jge L(movdqa8) 1203 jmp L(movdqa_epi) 1204 1205 .balign 16 1206L(movdqa9): 1207 sub $0x20,%r8 1208 movdqa 0x10(%rdx),%xmm3 1209 movdqa 0x20(%rdx),%xmm0 1210 add $0x20,%rdx 1211 1212 psrldq $0x9,%xmm1 1213 movdqa %xmm3,%xmm2 1214 pslldq $0x7,%xmm3 1215 por %xmm1,%xmm3 1216 1217 psrldq $0x9,%xmm2 1218 movdqa %xmm0,%xmm1 1219 pslldq $0x7,%xmm0 1220 por %xmm2,%xmm0 1221 movdqa %xmm3,(%rcx) 1222 movdqa %xmm0,0x10(%rcx) 1223 1224 add $0x20,%rcx 1225 cmp $0x20,%r8 1226 jge L(movdqa9) 1227 jmp L(movdqa_epi) 1228 1229 .balign 16 1230L(movdqa10): 1231 sub $0x20,%r8 1232 movdqa 0x10(%rdx),%xmm3 1233 movdqa 0x20(%rdx),%xmm0 1234 add $0x20,%rdx 1235 1236 psrldq $0xa,%xmm1 1237 movdqa %xmm3,%xmm2 1238 pslldq $0x6,%xmm3 1239 por %xmm1,%xmm3 1240 1241 psrldq $0xa,%xmm2 1242 movdqa %xmm0,%xmm1 1243 pslldq $0x6,%xmm0 1244 por %xmm2,%xmm0 1245 movdqa %xmm3,(%rcx) 1246 movdqa %xmm0,0x10(%rcx) 1247 1248 add $0x20,%rcx 1249 cmp $0x20,%r8 1250 jge L(movdqa10) 1251 jmp L(movdqa_epi) 1252 1253 .balign 16 1254L(movdqa11): 1255 sub $0x20,%r8 1256 movdqa 0x10(%rdx),%xmm3 1257 movdqa 0x20(%rdx),%xmm0 1258 add $0x20,%rdx 1259 1260 psrldq $0xb,%xmm1 1261 movdqa %xmm3,%xmm2 1262 pslldq $0x5,%xmm3 1263 por %xmm1,%xmm3 1264 1265 psrldq $0xb,%xmm2 1266 movdqa %xmm0,%xmm1 1267 pslldq $0x5,%xmm0 1268 por %xmm2,%xmm0 1269 movdqa %xmm3,(%rcx) 1270 movdqa %xmm0,0x10(%rcx) 1271 1272 add $0x20,%rcx 1273 cmp $0x20,%r8 1274 jge L(movdqa11) 1275 jmp L(movdqa_epi) 1276 1277 .balign 16 1278L(movdqa12): 1279 sub $0x20,%r8 1280 movdqa 0x10(%rdx),%xmm3 1281 movdqa 
0x20(%rdx),%xmm0 1282 add $0x20,%rdx 1283 1284 psrldq $0xc,%xmm1 1285 movdqa %xmm3,%xmm2 1286 pslldq $0x4,%xmm3 1287 por %xmm1,%xmm3 1288 1289 psrldq $0xc,%xmm2 1290 movdqa %xmm0,%xmm1 1291 pslldq $0x4,%xmm0 1292 por %xmm2,%xmm0 1293 movdqa %xmm3,(%rcx) 1294 movdqa %xmm0,0x10(%rcx) 1295 1296 add $0x20,%rcx 1297 cmp $0x20,%r8 1298 jge L(movdqa12) 1299 jmp L(movdqa_epi) 1300 1301 .balign 16 1302L(movdqa13): 1303 sub $0x20,%r8 1304 movdqa 0x10(%rdx),%xmm3 1305 movdqa 0x20(%rdx),%xmm0 1306 add $0x20,%rdx 1307 1308 psrldq $0xd,%xmm1 1309 movdqa %xmm3,%xmm2 1310 pslldq $0x3,%xmm3 1311 por %xmm1,%xmm3 1312 1313 psrldq $0xd,%xmm2 1314 movdqa %xmm0,%xmm1 1315 pslldq $0x3,%xmm0 1316 por %xmm2,%xmm0 1317 movdqa %xmm3,(%rcx) 1318 movdqa %xmm0,0x10(%rcx) 1319 1320 add $0x20,%rcx 1321 cmp $0x20,%r8 1322 jge L(movdqa13) 1323 jmp L(movdqa_epi) 1324 1325 .balign 16 1326L(movdqa14): 1327 sub $0x20,%r8 1328 movdqa 0x10(%rdx),%xmm3 1329 movdqa 0x20(%rdx),%xmm0 1330 add $0x20,%rdx 1331 1332 psrldq $0xe,%xmm1 1333 movdqa %xmm3,%xmm2 1334 pslldq $0x2,%xmm3 1335 por %xmm1,%xmm3 1336 1337 psrldq $0xe,%xmm2 1338 movdqa %xmm0,%xmm1 1339 pslldq $0x2,%xmm0 1340 por %xmm2,%xmm0 1341 movdqa %xmm3,(%rcx) 1342 movdqa %xmm0,0x10(%rcx) 1343 1344 add $0x20,%rcx 1345 cmp $0x20,%r8 1346 jge L(movdqa14) 1347 jmp L(movdqa_epi) 1348 1349 .balign 16 1350L(movdqa15): 1351 sub $0x20,%r8 1352 movdqa 0x10(%rdx),%xmm3 1353 movdqa 0x20(%rdx),%xmm0 1354 add $0x20,%rdx 1355 1356 psrldq $0xf,%xmm1 1357 movdqa %xmm3,%xmm2 1358 pslldq $0x1,%xmm3 1359 por %xmm1,%xmm3 1360 1361 psrldq $0xf,%xmm2 1362 movdqa %xmm0,%xmm1 1363 pslldq $0x1,%xmm0 1364 por %xmm2,%xmm0 1365 movdqa %xmm3,(%rcx) 1366 movdqa %xmm0,0x10(%rcx) 1367 1368 add $0x20,%rcx 1369 cmp $0x20,%r8 1370 jge L(movdqa15) 1371 #jmp L(movdqa_epi) 1372 1373 .balign 16 1374L(movdqa_epi): 1375 lea L(fwdPxQx)(%rip),%r10 1376 add %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop) 1377 add %r8,%rcx 1378 add %r8,%rdx 1379 1380 movslq (%r10,%r8,4),%r9 1381 lea (%r9,%r10,1),%r10 1382 jmpq *%r10 1383 1384 .balign 16 1385L(mov3dqa1): 1386 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1387 sub $0x30,%r8 1388 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1389 movdqa 0x30(%rdx),%xmm5 # load the upper source buffer 1390 lea 0x30(%rdx),%rdx 1391 cmp $0x30,%r8 1392 1393 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1394 #palignr $0x1,%xmm1,%xmm3 1395 .byte 0x66,0x0f,0x3a,0x0f 1396 .byte 0xd9,0x01 1397 movdqa %xmm3,(%rcx) # store it 1398 1399 movdqa %xmm0,%xmm4 # store off xmm reg for use next iteration 1400 #palignr $0x1,%xmm2,%xmm0 1401 .byte 0x66,0x0f,0x3a,0x0f 1402 .byte 0xc2,0x01 1403 movdqa %xmm0,0x10(%rcx) # store it 1404 1405 movdqa %xmm5,%xmm1 # store off xmm reg for use next iteration 1406 #palignr $0x1,%xmm4,%xmm5 1407 .byte 0x66,0x0f,0x3a,0x0f 1408 .byte 0xec,0x01 1409 movdqa %xmm5,0x20(%rcx) # store it 1410 1411 lea 0x30(%rcx),%rcx 1412 jge L(mov3dqa1) 1413 1414 cmp $0x10,%r8 1415 jl L(movdqa_epi) 1416 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1417 sub $0x10,%r8 1418 lea 0x10(%rdx),%rdx 1419 movdqa %xmm3,%xmm2 # save for use next concat 1420 #palignr $0x1,%xmm1,%xmm3 1421 .byte 0x66,0x0f,0x3a,0x0f 1422 .byte 0xd9,0x01 1423 1424 cmp $0x10,%r8 1425 movdqa %xmm3,(%rcx) # store it 1426 lea 0x10(%rcx),%rcx 1427 jl L(movdqa_epi) 1428 1429 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1430 sub $0x10,%r8 1431 lea 0x10(%rdx),%rdx 1432 #palignr $0x1,%xmm2,%xmm0 1433 .byte 0x66,0x0f,0x3a,0x0f 1434 .byte 0xc2,0x01 1435 
movdqa %xmm0,(%rcx) # store it 1436 lea 0x10(%rcx),%rcx 1437 jmp L(movdqa_epi) 1438 1439 .balign 16 1440L(mov3dqa2): 1441 movdqa 0x10(%rdx),%xmm3 1442 sub $0x30,%r8 1443 movdqa 0x20(%rdx),%xmm0 1444 movdqa 0x30(%rdx),%xmm5 1445 lea 0x30(%rdx),%rdx 1446 cmp $0x30,%r8 1447 1448 movdqa %xmm3,%xmm2 1449 #palignr $0x2,%xmm1,%xmm3 1450 .byte 0x66,0x0f,0x3a,0x0f 1451 .byte 0xd9,0x02 1452 movdqa %xmm3,(%rcx) 1453 1454 movdqa %xmm0,%xmm4 1455 #palignr $0x2,%xmm2,%xmm0 1456 .byte 0x66,0x0f,0x3a,0x0f 1457 .byte 0xc2,0x02 1458 movdqa %xmm0,0x10(%rcx) 1459 1460 movdqa %xmm5,%xmm1 1461 #palignr $0x2,%xmm4,%xmm5 1462 .byte 0x66,0x0f,0x3a,0x0f 1463 .byte 0xec,0x02 1464 movdqa %xmm5,0x20(%rcx) 1465 1466 lea 0x30(%rcx),%rcx 1467 jge L(mov3dqa2) 1468 1469 cmp $0x10,%r8 1470 jl L(movdqa_epi) 1471 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1472 sub $0x10,%r8 1473 lea 0x10(%rdx),%rdx 1474 movdqa %xmm3,%xmm2 # save for use next concat 1475 #palignr $0x2,%xmm1,%xmm3 1476 .byte 0x66,0x0f,0x3a,0x0f 1477 .byte 0xd9,0x02 1478 1479 cmp $0x10,%r8 1480 movdqa %xmm3,(%rcx) # store it 1481 lea 0x10(%rcx),%rcx 1482 jl L(movdqa_epi) 1483 1484 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1485 sub $0x10,%r8 1486 lea 0x10(%rdx),%rdx 1487 #palignr $0x2,%xmm2,%xmm0 1488 .byte 0x66,0x0f,0x3a,0x0f 1489 .byte 0xc2,0x02 1490 movdqa %xmm0,(%rcx) # store it 1491 lea 0x10(%rcx),%rcx 1492 jmp L(movdqa_epi) 1493 1494 .balign 16 1495L(mov3dqa3): 1496 movdqa 0x10(%rdx),%xmm3 1497 sub $0x30,%r8 1498 movdqa 0x20(%rdx),%xmm0 1499 movdqa 0x30(%rdx),%xmm5 1500 lea 0x30(%rdx),%rdx 1501 cmp $0x30,%r8 1502 1503 movdqa %xmm3,%xmm2 1504 #palignr $0x3,%xmm1,%xmm3 1505 .byte 0x66,0x0f,0x3a,0x0f 1506 .byte 0xd9,0x03 1507 movdqa %xmm3,(%rcx) 1508 1509 movdqa %xmm0,%xmm4 1510 #palignr $0x3,%xmm2,%xmm0 1511 .byte 0x66,0x0f,0x3a,0x0f 1512 .byte 0xc2,0x03 1513 movdqa %xmm0,0x10(%rcx) 1514 1515 movdqa %xmm5,%xmm1 1516 #palignr $0x3,%xmm4,%xmm5 1517 .byte 0x66,0x0f,0x3a,0x0f 1518 .byte 0xec,0x03 1519 movdqa %xmm5,0x20(%rcx) 1520 1521 lea 0x30(%rcx),%rcx 1522 jge L(mov3dqa3) 1523 1524 cmp $0x10,%r8 1525 jl L(movdqa_epi) 1526 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1527 sub $0x10,%r8 1528 lea 0x10(%rdx),%rdx 1529 movdqa %xmm3,%xmm2 # save for use next concat 1530 #palignr $0x3,%xmm1,%xmm3 1531 .byte 0x66,0x0f,0x3a,0x0f 1532 .byte 0xd9,0x03 1533 1534 cmp $0x10,%r8 1535 movdqa %xmm3,(%rcx) # store it 1536 lea 0x10(%rcx),%rcx 1537 jl L(movdqa_epi) 1538 1539 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1540 sub $0x10,%r8 1541 lea 0x10(%rdx),%rdx 1542 #palignr $0x3,%xmm2,%xmm0 1543 .byte 0x66,0x0f,0x3a,0x0f 1544 .byte 0xc2,0x03 1545 movdqa %xmm0,(%rcx) # store it 1546 lea 0x10(%rcx),%rcx 1547 jmp L(movdqa_epi) 1548 1549 .balign 16 1550L(mov3dqa4): 1551 movdqa 0x10(%rdx),%xmm3 1552 sub $0x30,%r8 1553 movdqa 0x20(%rdx),%xmm0 1554 movdqa 0x30(%rdx),%xmm5 1555 lea 0x30(%rdx),%rdx 1556 cmp $0x30,%r8 1557 1558 movdqa %xmm3,%xmm2 1559 #palignr $0x4,%xmm1,%xmm3 1560 .byte 0x66,0x0f,0x3a,0x0f 1561 .byte 0xd9,0x04 1562 movdqa %xmm3,(%rcx) 1563 1564 movdqa %xmm0,%xmm4 1565 #palignr $0x4,%xmm2,%xmm0 1566 .byte 0x66,0x0f,0x3a,0x0f 1567 .byte 0xc2,0x04 1568 movdqa %xmm0,0x10(%rcx) 1569 1570 movdqa %xmm5,%xmm1 1571 #palignr $0x4,%xmm4,%xmm5 1572 .byte 0x66,0x0f,0x3a,0x0f 1573 .byte 0xec,0x04 1574 movdqa %xmm5,0x20(%rcx) 1575 1576 lea 0x30(%rcx),%rcx 1577 jge L(mov3dqa4) 1578 1579 cmp $0x10,%r8 1580 jl L(movdqa_epi) 1581 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1582 sub $0x10,%r8 1583 lea 0x10(%rdx),%rdx 1584 movdqa 
%xmm3,%xmm2 # save for use next concat 1585 #palignr $0x4,%xmm1,%xmm3 1586 .byte 0x66,0x0f,0x3a,0x0f 1587 .byte 0xd9,0x04 1588 1589 cmp $0x10,%r8 1590 movdqa %xmm3,(%rcx) # store it 1591 lea 0x10(%rcx),%rcx 1592 jl L(movdqa_epi) 1593 1594 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1595 sub $0x10,%r8 1596 lea 0x10(%rdx),%rdx 1597 #palignr $0x4,%xmm2,%xmm0 1598 .byte 0x66,0x0f,0x3a,0x0f 1599 .byte 0xc2,0x04 1600 movdqa %xmm0,(%rcx) # store it 1601 lea 0x10(%rcx),%rcx 1602 jmp L(movdqa_epi) 1603 1604 .balign 16 1605L(mov3dqa5): 1606 movdqa 0x10(%rdx),%xmm3 1607 sub $0x30,%r8 1608 movdqa 0x20(%rdx),%xmm0 1609 movdqa 0x30(%rdx),%xmm5 1610 lea 0x30(%rdx),%rdx 1611 cmp $0x30,%r8 1612 1613 movdqa %xmm3,%xmm2 1614 #palignr $0x5,%xmm1,%xmm3 1615 .byte 0x66,0x0f,0x3a,0x0f 1616 .byte 0xd9,0x05 1617 movdqa %xmm3,(%rcx) 1618 1619 movdqa %xmm0,%xmm4 1620 #palignr $0x5,%xmm2,%xmm0 1621 .byte 0x66,0x0f,0x3a,0x0f 1622 .byte 0xc2,0x05 1623 movdqa %xmm0,0x10(%rcx) 1624 1625 movdqa %xmm5,%xmm1 1626 #palignr $0x5,%xmm4,%xmm5 1627 .byte 0x66,0x0f,0x3a,0x0f 1628 .byte 0xec,0x05 1629 movdqa %xmm5,0x20(%rcx) 1630 1631 lea 0x30(%rcx),%rcx 1632 jge L(mov3dqa5) 1633 1634 cmp $0x10,%r8 1635 jl L(movdqa_epi) 1636 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1637 sub $0x10,%r8 1638 lea 0x10(%rdx),%rdx 1639 movdqa %xmm3,%xmm2 # save for use next concat 1640 #palignr $0x5,%xmm1,%xmm3 1641 .byte 0x66,0x0f,0x3a,0x0f 1642 .byte 0xd9,0x05 1643 1644 cmp $0x10,%r8 1645 movdqa %xmm3,(%rcx) # store it 1646 lea 0x10(%rcx),%rcx 1647 jl L(movdqa_epi) 1648 1649 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1650 sub $0x10,%r8 1651 lea 0x10(%rdx),%rdx 1652 #palignr $0x5,%xmm2,%xmm0 1653 .byte 0x66,0x0f,0x3a,0x0f 1654 .byte 0xc2,0x05 1655 movdqa %xmm0,(%rcx) # store it 1656 lea 0x10(%rcx),%rcx 1657 jmp L(movdqa_epi) 1658 1659 .balign 16 1660L(mov3dqa6): 1661 movdqa 0x10(%rdx),%xmm3 1662 sub $0x30,%r8 1663 movdqa 0x20(%rdx),%xmm0 1664 movdqa 0x30(%rdx),%xmm5 1665 lea 0x30(%rdx),%rdx 1666 cmp $0x30,%r8 1667 1668 movdqa %xmm3,%xmm2 1669 #palignr $0x6,%xmm1,%xmm3 1670 .byte 0x66,0x0f,0x3a,0x0f 1671 .byte 0xd9,0x06 1672 movdqa %xmm3,(%rcx) 1673 1674 movdqa %xmm0,%xmm4 1675 #palignr $0x6,%xmm2,%xmm0 1676 .byte 0x66,0x0f,0x3a,0x0f 1677 .byte 0xc2,0x06 1678 movdqa %xmm0,0x10(%rcx) 1679 1680 movdqa %xmm5,%xmm1 1681 #palignr $0x6,%xmm4,%xmm5 1682 .byte 0x66,0x0f,0x3a,0x0f 1683 .byte 0xec,0x06 1684 movdqa %xmm5,0x20(%rcx) 1685 1686 lea 0x30(%rcx),%rcx 1687 jge L(mov3dqa6) 1688 1689 cmp $0x10,%r8 1690 jl L(movdqa_epi) 1691 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1692 sub $0x10,%r8 1693 lea 0x10(%rdx),%rdx 1694 movdqa %xmm3,%xmm2 # save for use next concat 1695 #palignr $0x6,%xmm1,%xmm3 1696 .byte 0x66,0x0f,0x3a,0x0f 1697 .byte 0xd9,0x06 1698 1699 cmp $0x10,%r8 1700 movdqa %xmm3,(%rcx) # store it 1701 lea 0x10(%rcx),%rcx 1702 jl L(movdqa_epi) 1703 1704 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1705 sub $0x10,%r8 1706 lea 0x10(%rdx),%rdx 1707 #palignr $0x6,%xmm2,%xmm0 1708 .byte 0x66,0x0f,0x3a,0x0f 1709 .byte 0xc2,0x06 1710 movdqa %xmm0,(%rcx) # store it 1711 lea 0x10(%rcx),%rcx 1712 jmp L(movdqa_epi) 1713 1714 .balign 16 1715L(mov3dqa7): 1716 movdqa 0x10(%rdx),%xmm3 1717 sub $0x30,%r8 1718 movdqa 0x20(%rdx),%xmm0 1719 movdqa 0x30(%rdx),%xmm5 1720 lea 0x30(%rdx),%rdx 1721 cmp $0x30,%r8 1722 1723 movdqa %xmm3,%xmm2 1724 #palignr $0x7,%xmm1,%xmm3 1725 .byte 0x66,0x0f,0x3a,0x0f 1726 .byte 0xd9,0x07 1727 movdqa %xmm3,(%rcx) 1728 1729 movdqa %xmm0,%xmm4 1730 #palignr $0x7,%xmm2,%xmm0 1731 .byte 
0x66,0x0f,0x3a,0x0f 1732 .byte 0xc2,0x07 1733 movdqa %xmm0,0x10(%rcx) 1734 1735 movdqa %xmm5,%xmm1 1736 #palignr $0x7,%xmm4,%xmm5 1737 .byte 0x66,0x0f,0x3a,0x0f 1738 .byte 0xec,0x07 1739 movdqa %xmm5,0x20(%rcx) 1740 1741 lea 0x30(%rcx),%rcx 1742 jge L(mov3dqa7) 1743 1744 cmp $0x10,%r8 1745 jl L(movdqa_epi) 1746 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1747 sub $0x10,%r8 1748 lea 0x10(%rdx),%rdx 1749 movdqa %xmm3,%xmm2 # save for use next concat 1750 #palignr $0x7,%xmm1,%xmm3 1751 .byte 0x66,0x0f,0x3a,0x0f 1752 .byte 0xd9,0x07 1753 1754 cmp $0x10,%r8 1755 movdqa %xmm3,(%rcx) # store it 1756 lea 0x10(%rcx),%rcx 1757 jl L(movdqa_epi) 1758 1759 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1760 sub $0x10,%r8 1761 lea 0x10(%rdx),%rdx 1762 #palignr $0x7,%xmm2,%xmm0 1763 .byte 0x66,0x0f,0x3a,0x0f 1764 .byte 0xc2,0x07 1765 movdqa %xmm0,(%rcx) # store it 1766 lea 0x10(%rcx),%rcx 1767 jmp L(movdqa_epi) 1768 1769 .balign 16 1770L(mov3dqa9): 1771 movdqa 0x10(%rdx),%xmm3 1772 sub $0x30,%r8 1773 movdqa 0x20(%rdx),%xmm0 1774 movdqa 0x30(%rdx),%xmm5 1775 lea 0x30(%rdx),%rdx 1776 cmp $0x30,%r8 1777 1778 movdqa %xmm3,%xmm2 1779 #palignr $0x9,%xmm1,%xmm3 1780 .byte 0x66,0x0f,0x3a,0x0f 1781 .byte 0xd9,0x09 1782 movdqa %xmm3,(%rcx) 1783 1784 movdqa %xmm0,%xmm4 1785 #palignr $0x9,%xmm2,%xmm0 1786 .byte 0x66,0x0f,0x3a,0x0f 1787 .byte 0xc2,0x09 1788 movdqa %xmm0,0x10(%rcx) 1789 1790 movdqa %xmm5,%xmm1 1791 #palignr $0x9,%xmm4,%xmm5 1792 .byte 0x66,0x0f,0x3a,0x0f 1793 .byte 0xec,0x09 1794 movdqa %xmm5,0x20(%rcx) 1795 1796 lea 0x30(%rcx),%rcx 1797 jge L(mov3dqa9) 1798 1799 cmp $0x10,%r8 1800 jl L(movdqa_epi) 1801 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1802 sub $0x10,%r8 1803 lea 0x10(%rdx),%rdx 1804 movdqa %xmm3,%xmm2 # save for use next concat 1805 #palignr $0x9,%xmm1,%xmm3 1806 .byte 0x66,0x0f,0x3a,0x0f 1807 .byte 0xd9,0x09 1808 1809 cmp $0x10,%r8 1810 movdqa %xmm3,(%rcx) # store it 1811 lea 0x10(%rcx),%rcx 1812 jl L(movdqa_epi) 1813 1814 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1815 sub $0x10,%r8 1816 lea 0x10(%rdx),%rdx 1817 #palignr $0x9,%xmm2,%xmm0 1818 .byte 0x66,0x0f,0x3a,0x0f 1819 .byte 0xc2,0x09 1820 movdqa %xmm0,(%rcx) # store it 1821 lea 0x10(%rcx),%rcx 1822 jmp L(movdqa_epi) 1823 1824 .balign 16 1825L(mov3dqa10): 1826 movdqa 0x10(%rdx),%xmm3 1827 sub $0x30,%r8 1828 movdqa 0x20(%rdx),%xmm0 1829 movdqa 0x30(%rdx),%xmm5 1830 lea 0x30(%rdx),%rdx 1831 cmp $0x30,%r8 1832 1833 movdqa %xmm3,%xmm2 1834 #palignr $0xa,%xmm1,%xmm3 1835 .byte 0x66,0x0f,0x3a,0x0f 1836 .byte 0xd9,0x0a 1837 movdqa %xmm3,(%rcx) 1838 1839 movdqa %xmm0,%xmm4 1840 #palignr $0xa,%xmm2,%xmm0 1841 .byte 0x66,0x0f,0x3a,0x0f 1842 .byte 0xc2,0x0a 1843 movdqa %xmm0,0x10(%rcx) 1844 1845 movdqa %xmm5,%xmm1 1846 #palignr $0xa,%xmm4,%xmm5 1847 .byte 0x66,0x0f,0x3a,0x0f 1848 .byte 0xec,0x0a 1849 movdqa %xmm5,0x20(%rcx) 1850 1851 lea 0x30(%rcx),%rcx 1852 jge L(mov3dqa10) 1853 1854 cmp $0x10,%r8 1855 jl L(movdqa_epi) 1856 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1857 sub $0x10,%r8 1858 lea 0x10(%rdx),%rdx 1859 movdqa %xmm3,%xmm2 # save for use next concat 1860 #palignr $0xa,%xmm1,%xmm3 1861 .byte 0x66,0x0f,0x3a,0x0f 1862 .byte 0xd9,0x0a 1863 1864 cmp $0x10,%r8 1865 movdqa %xmm3,(%rcx) # store it 1866 lea 0x10(%rcx),%rcx 1867 jl L(movdqa_epi) 1868 1869 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1870 sub $0x10,%r8 1871 lea 0x10(%rdx),%rdx 1872 #palignr $0xa,%xmm2,%xmm0 1873 .byte 0x66,0x0f,0x3a,0x0f 1874 .byte 0xc2,0x0a 1875 movdqa %xmm0,(%rcx) # store it 1876 lea 
0x10(%rcx),%rcx 1877 jmp L(movdqa_epi) 1878 1879 .balign 16 1880L(mov3dqa11): 1881 movdqa 0x10(%rdx),%xmm3 1882 sub $0x30,%r8 1883 movdqa 0x20(%rdx),%xmm0 1884 movdqa 0x30(%rdx),%xmm5 1885 lea 0x30(%rdx),%rdx 1886 cmp $0x30,%r8 1887 1888 movdqa %xmm3,%xmm2 1889 #palignr $0xb,%xmm1,%xmm3 1890 .byte 0x66,0x0f,0x3a,0x0f 1891 .byte 0xd9,0x0b 1892 movdqa %xmm3,(%rcx) 1893 1894 movdqa %xmm0,%xmm4 1895 #palignr $0xb,%xmm2,%xmm0 1896 .byte 0x66,0x0f,0x3a,0x0f 1897 .byte 0xc2,0x0b 1898 movdqa %xmm0,0x10(%rcx) 1899 1900 movdqa %xmm5,%xmm1 1901 #palignr $0xb,%xmm4,%xmm5 1902 .byte 0x66,0x0f,0x3a,0x0f 1903 .byte 0xec,0x0b 1904 movdqa %xmm5,0x20(%rcx) 1905 1906 lea 0x30(%rcx),%rcx 1907 jge L(mov3dqa11) 1908 1909 cmp $0x10,%r8 1910 jl L(movdqa_epi) 1911 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1912 sub $0x10,%r8 1913 lea 0x10(%rdx),%rdx 1914 movdqa %xmm3,%xmm2 # save for use next concat 1915 #palignr $0xb,%xmm1,%xmm3 1916 .byte 0x66,0x0f,0x3a,0x0f 1917 .byte 0xd9,0x0b 1918 1919 cmp $0x10,%r8 1920 movdqa %xmm3,(%rcx) # store it 1921 lea 0x10(%rcx),%rcx 1922 jl L(movdqa_epi) 1923 1924 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1925 sub $0x10,%r8 1926 lea 0x10(%rdx),%rdx 1927 #palignr $0xb,%xmm2,%xmm0 1928 .byte 0x66,0x0f,0x3a,0x0f 1929 .byte 0xc2,0x0b 1930 movdqa %xmm0,(%rcx) # store it 1931 lea 0x10(%rcx),%rcx 1932 jmp L(movdqa_epi) 1933 1934 .balign 16 1935L(mov3dqa12): 1936 movdqa 0x10(%rdx),%xmm3 1937 sub $0x30,%r8 1938 movdqa 0x20(%rdx),%xmm0 1939 movdqa 0x30(%rdx),%xmm5 1940 lea 0x30(%rdx),%rdx 1941 cmp $0x30,%r8 1942 1943 movdqa %xmm3,%xmm2 1944 #palignr $0xc,%xmm1,%xmm3 1945 .byte 0x66,0x0f,0x3a,0x0f 1946 .byte 0xd9,0x0c 1947 movdqa %xmm3,(%rcx) 1948 1949 movdqa %xmm0,%xmm4 1950 #palignr $0xc,%xmm2,%xmm0 1951 .byte 0x66,0x0f,0x3a,0x0f 1952 .byte 0xc2,0x0c 1953 movdqa %xmm0,0x10(%rcx) 1954 1955 movdqa %xmm5,%xmm1 1956 #palignr $0xc,%xmm4,%xmm5 1957 .byte 0x66,0x0f,0x3a,0x0f 1958 .byte 0xec,0x0c 1959 movdqa %xmm5,0x20(%rcx) 1960 1961 lea 0x30(%rcx),%rcx 1962 jge L(mov3dqa12) 1963 1964 cmp $0x10,%r8 1965 jl L(movdqa_epi) 1966 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1967 sub $0x10,%r8 1968 lea 0x10(%rdx),%rdx 1969 movdqa %xmm3,%xmm2 # save for use next concat 1970 #palignr $0xc,%xmm1,%xmm3 1971 .byte 0x66,0x0f,0x3a,0x0f 1972 .byte 0xd9,0x0c 1973 1974 cmp $0x10,%r8 1975 movdqa %xmm3,(%rcx) # store it 1976 lea 0x10(%rcx),%rcx 1977 jl L(movdqa_epi) 1978 1979 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1980 sub $0x10,%r8 1981 lea 0x10(%rdx),%rdx 1982 #palignr $0xc,%xmm2,%xmm0 1983 .byte 0x66,0x0f,0x3a,0x0f 1984 .byte 0xc2,0x0c 1985 movdqa %xmm0,(%rcx) # store it 1986 lea 0x10(%rcx),%rcx 1987 jmp L(movdqa_epi) 1988 1989 .balign 16 1990L(mov3dqa13): 1991 movdqa 0x10(%rdx),%xmm3 1992 sub $0x30,%r8 1993 movdqa 0x20(%rdx),%xmm0 1994 movdqa 0x30(%rdx),%xmm5 1995 lea 0x30(%rdx),%rdx 1996 cmp $0x30,%r8 1997 1998 movdqa %xmm3,%xmm2 1999 #palignr $0xd,%xmm1,%xmm3 2000 .byte 0x66,0x0f,0x3a,0x0f 2001 .byte 0xd9,0x0d 2002 movdqa %xmm3,(%rcx) 2003 2004 movdqa %xmm0,%xmm4 2005 #palignr $0xd,%xmm2,%xmm0 2006 .byte 0x66,0x0f,0x3a,0x0f 2007 .byte 0xc2,0x0d 2008 movdqa %xmm0,0x10(%rcx) 2009 2010 movdqa %xmm5,%xmm1 2011 #palignr $0xd,%xmm4,%xmm5 2012 .byte 0x66,0x0f,0x3a,0x0f 2013 .byte 0xec,0x0d 2014 movdqa %xmm5,0x20(%rcx) 2015 2016 lea 0x30(%rcx),%rcx 2017 jge L(mov3dqa13) 2018 2019 cmp $0x10,%r8 2020 jl L(movdqa_epi) 2021 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2022 sub $0x10,%r8 2023 lea 0x10(%rdx),%rdx 2024 movdqa %xmm3,%xmm2 # save for use next concat 
2025 #palignr $0xd,%xmm1,%xmm3 2026 .byte 0x66,0x0f,0x3a,0x0f 2027 .byte 0xd9,0x0d 2028 2029 cmp $0x10,%r8 2030 movdqa %xmm3,(%rcx) # store it 2031 lea 0x10(%rcx),%rcx 2032 jl L(movdqa_epi) 2033 2034 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2035 sub $0x10,%r8 2036 lea 0x10(%rdx),%rdx 2037 #palignr $0xd,%xmm2,%xmm0 2038 .byte 0x66,0x0f,0x3a,0x0f 2039 .byte 0xc2,0x0d 2040 movdqa %xmm0,(%rcx) # store it 2041 lea 0x10(%rcx),%rcx 2042 jmp L(movdqa_epi) 2043 2044 .balign 16 2045L(mov3dqa14): 2046 movdqa 0x10(%rdx),%xmm3 2047 sub $0x30,%r8 2048 movdqa 0x20(%rdx),%xmm0 2049 movdqa 0x30(%rdx),%xmm5 2050 lea 0x30(%rdx),%rdx 2051 cmp $0x30,%r8 2052 2053 movdqa %xmm3,%xmm2 2054 #palignr $0xe,%xmm1,%xmm3 2055 .byte 0x66,0x0f,0x3a,0x0f 2056 .byte 0xd9,0x0e 2057 movdqa %xmm3,(%rcx) 2058 2059 movdqa %xmm0,%xmm4 2060 #palignr $0xe,%xmm2,%xmm0 2061 .byte 0x66,0x0f,0x3a,0x0f 2062 .byte 0xc2,0x0e 2063 movdqa %xmm0,0x10(%rcx) 2064 2065 movdqa %xmm5,%xmm1 2066 #palignr $0xe,%xmm4,%xmm5 2067 .byte 0x66,0x0f,0x3a,0x0f 2068 .byte 0xec,0x0e 2069 movdqa %xmm5,0x20(%rcx) 2070 2071 lea 0x30(%rcx),%rcx 2072 jge L(mov3dqa14) 2073 2074 cmp $0x10,%r8 2075 jl L(movdqa_epi) 2076 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2077 sub $0x10,%r8 2078 lea 0x10(%rdx),%rdx 2079 movdqa %xmm3,%xmm2 # save for use next concat 2080 #palignr $0xe,%xmm1,%xmm3 2081 .byte 0x66,0x0f,0x3a,0x0f 2082 .byte 0xd9,0x0e 2083 2084 cmp $0x10,%r8 2085 movdqa %xmm3,(%rcx) # store it 2086 lea 0x10(%rcx),%rcx 2087 jl L(movdqa_epi) 2088 2089 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2090 sub $0x10,%r8 2091 lea 0x10(%rdx),%rdx 2092 #palignr $0xe,%xmm2,%xmm0 2093 .byte 0x66,0x0f,0x3a,0x0f 2094 .byte 0xc2,0x0e 2095 movdqa %xmm0,(%rcx) # store it 2096 lea 0x10(%rcx),%rcx 2097 jmp L(movdqa_epi) 2098 2099 .balign 16 2100L(mov3dqa15): 2101 movdqa 0x10(%rdx),%xmm3 2102 sub $0x30,%r8 2103 movdqa 0x20(%rdx),%xmm0 2104 movdqa 0x30(%rdx),%xmm5 2105 lea 0x30(%rdx),%rdx 2106 cmp $0x30,%r8 2107 2108 movdqa %xmm3,%xmm2 2109 #palignr $0xf,%xmm1,%xmm3 2110 .byte 0x66,0x0f,0x3a,0x0f 2111 .byte 0xd9,0x0f 2112 movdqa %xmm3,(%rcx) 2113 2114 movdqa %xmm0,%xmm4 2115 #palignr $0xf,%xmm2,%xmm0 2116 .byte 0x66,0x0f,0x3a,0x0f 2117 .byte 0xc2,0x0f 2118 movdqa %xmm0,0x10(%rcx) 2119 2120 movdqa %xmm5,%xmm1 2121 #palignr $0xf,%xmm4,%xmm5 2122 .byte 0x66,0x0f,0x3a,0x0f 2123 .byte 0xec,0x0f 2124 movdqa %xmm5,0x20(%rcx) 2125 2126 lea 0x30(%rcx),%rcx 2127 jge L(mov3dqa15) 2128 2129 cmp $0x10,%r8 2130 jl L(movdqa_epi) 2131 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2132 sub $0x10,%r8 2133 lea 0x10(%rdx),%rdx 2134 movdqa %xmm3,%xmm2 # save for use next concat 2135 #palignr $0xf,%xmm1,%xmm3 2136 .byte 0x66,0x0f,0x3a,0x0f 2137 .byte 0xd9,0x0f 2138 2139 cmp $0x10,%r8 2140 movdqa %xmm3,(%rcx) # store it 2141 lea 0x10(%rcx),%rcx 2142 jl L(movdqa_epi) 2143 2144 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2145 sub $0x10,%r8 2146 lea 0x10(%rdx),%rdx 2147 #palignr $0xf,%xmm2,%xmm0 2148 .byte 0x66,0x0f,0x3a,0x0f 2149 .byte 0xc2,0x0f 2150 movdqa %xmm0,(%rcx) # store it 2151 lea 0x10(%rcx),%rcx 2152 jmp L(movdqa_epi) 2153 2154 .balign 16 2155L(sse2_nt_move): 2156 lea 0x40(%rcx),%rcx 2157 lea 0x40(%rdx),%rdx 2158 lea -0x40(%r8),%r8 2159 2160 /* 2161 * doesn't matter if source is aligned for stuff out of cache. 2162 * the mis-aligned penalty is masked by the slowness of main memory. 
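 *
 * (Added note:) each iteration below copies 64 bytes using unaligned movdqu
 * loads and movntdq non-temporal stores, with a prefetchnta issued 0x180
 * bytes ahead of the source.  The sfence after the loop makes the
 * non-temporal stores globally visible before the ordinary stores that
 * finish the tail.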
2163 */ 2164 prefetchnta 0x180(%rdx) 2165 movdqu -0x40(%rdx),%xmm0 2166 movdqu -0x30(%rdx),%xmm1 2167 2168 cmp $0x40,%r8 2169 movntdq %xmm0,-0x40(%rcx) 2170 movntdq %xmm1,-0x30(%rcx) 2171 2172 movdqu -0x20(%rdx),%xmm2 2173 movdqu -0x10(%rdx),%xmm3 2174 2175 movntdq %xmm2,-0x20(%rcx) 2176 movntdq %xmm3,-0x10(%rcx) 2177 2178 jge L(sse2_nt_move) 2179 2180 lea L(Fix16EndTable)(%rip),%r10 2181 mov %r8,%r9 2182 and $0xFFFFFFFFFFFFFFF0,%r9 2183 add %r9,%rcx 2184 add %r9,%rdx 2185 sub %r9,%r8 2186 shr $0x4,%r9 2187 sfence 2188 2189 movslq (%r10,%r9,4),%r11 2190 lea (%r11,%r10,1),%r10 2191 jmpq *%r10 2192 2193 .balign 16 2194L(Fix16EndTable): 2195 .int L(fix16_0)-L(Fix16EndTable) 2196 .int L(fix16_1)-L(Fix16EndTable) 2197 .int L(fix16_2)-L(Fix16EndTable) 2198 .int L(fix16_3)-L(Fix16EndTable) 2199 2200 .balign 16 2201L(fix16_3): 2202 movdqu -0x30(%rdx),%xmm1 2203 movdqa %xmm1,-0x30(%rcx) 2204L(fix16_2): 2205 movdqu -0x20(%rdx),%xmm2 2206 movdqa %xmm2,-0x20(%rcx) 2207L(fix16_1): 2208 movdqu -0x10(%rdx),%xmm3 2209 movdqa %xmm3,-0x10(%rcx) 2210L(fix16_0): 2211 lea L(fwdPxQx)(%rip),%r10 2212 add %r8,%rdx 2213 add %r8,%rcx 2214 2215 movslq (%r10,%r8,4),%r9 2216 lea (%r9,%r10,1),%r10 2217 jmpq *%r10 2218 2219 .balign 16 2220L(pre_both_aligned): 2221 cmp $0x80,%r8 2222 jl L(fix_16b) 2223 2224 .balign 16 2225L(both_aligned): 2226 2227 /* 2228 * this 'paired' load/load/store/store seems to do best. 2229 */ 2230 movdqa (%rdx),%xmm0 2231 movdqa 0x10(%rdx),%xmm1 2232 2233 movdqa %xmm0,(%rcx) 2234 movdqa %xmm1,0x10(%rcx) 2235 lea -0x80(%r8),%r8 2236 2237 movdqa 0x20(%rdx),%xmm2 2238 movdqa 0x30(%rdx),%xmm3 2239 2240 movdqa %xmm2,0x20(%rcx) 2241 movdqa %xmm3,0x30(%rcx) 2242 2243 movdqa 0x40(%rdx),%xmm0 2244 movdqa 0x50(%rdx),%xmm1 2245 cmp $0x80,%r8 2246 2247 movdqa %xmm0,0x40(%rcx) 2248 movdqa %xmm1,0x50(%rcx) 2249 2250 movdqa 0x60(%rdx),%xmm2 2251 movdqa 0x70(%rdx),%xmm3 2252 lea 0x80(%rdx),%rdx 2253 movdqa %xmm2,0x60(%rcx) 2254 movdqa %xmm3,0x70(%rcx) 2255 lea 0x80(%rcx),%rcx 2256 jge L(both_aligned) 2257 2258L(fix_16b): 2259 add %r8,%rcx 2260 lea L(fwdPxQx)(%rip),%r10 2261 add %r8,%rdx 2262 2263 movslq (%r10,%r8,4),%r9 2264 lea (%r9,%r10,1),%r10 2265 jmpq *%r10 2266 2267 .balign 16 2268L(Loop8byte_pre): 2269 # Use 8-byte moves 2270 mov .largest_level_cache_size(%rip),%r9d 2271 shr %r9 # take half of it 2272 cmp %r9,%r8 2273 jg L(byte8_nt_top) 2274 # Find out whether to use rep movsq 2275 cmp $4096,%r8 2276 jle L(byte8_top) 2277 mov .amd64cache1half(%rip),%r9d # half of l1 cache 2278 cmp %r9,%r8 2279 jle L(use_rep) 2280 2281 .balign 16 2282L(byte8_top): 2283 mov (%rdx),%r9 2284 mov 0x8(%rdx),%r10 2285 lea -0x40(%r8),%r8 2286 mov %r9,(%rcx) 2287 mov %r10,0x8(%rcx) 2288 mov 0x10(%rdx),%r11 2289 mov 0x18(%rdx),%r9 2290 mov %r11,0x10(%rcx) 2291 mov %r9,0x18(%rcx) 2292 2293 cmp $0x40,%r8 2294 mov 0x20(%rdx),%r10 2295 mov 0x28(%rdx),%r11 2296 mov %r10,0x20(%rcx) 2297 mov %r11,0x28(%rcx) 2298 mov 0x30(%rdx),%r9 2299 mov 0x38(%rdx),%r10 2300 lea 0x40(%rdx),%rdx 2301 mov %r9,0x30(%rcx) 2302 mov %r10,0x38(%rcx) 2303 lea 0x40(%rcx),%rcx 2304 jg L(byte8_top) 2305 2306L(byte8_end): 2307 lea L(fwdPxQx)(%rip),%r10 2308 lea (%rdx,%r8,1),%rdx 2309 lea (%rcx,%r8,1),%rcx 2310 2311 movslq (%r10,%r8,4),%r9 2312 lea (%r9,%r10,1),%r10 2313 jmpq *%r10 2314 2315 .balign 16 2316L(use_rep): 2317 mov %rdx,%rsi # %rsi = source 2318 mov %rcx,%rdi # %rdi = destination 2319 mov %r8,%rcx # %rcx = count 2320 shrq $3,%rcx # 8-byte word count 2321 rep 2322 movsq 2323 mov %rsi,%rdx # source 2324 mov %rdi,%rcx # destination 2325 andq $7,%r8 # 
remainder 2326 jnz L(byte8_end) 2327 ret 2328 2329 .balign 16 2330L(byte8_nt_top): 2331 sub $0x40,%r8 2332 prefetchnta 0x180(%rdx) 2333 mov (%rdx),%r9 2334 movnti %r9,(%rcx) 2335 mov 0x8(%rdx),%r10 2336 movnti %r10,0x8(%rcx) 2337 mov 0x10(%rdx),%r11 2338 movnti %r11,0x10(%rcx) 2339 mov 0x18(%rdx),%r9 2340 movnti %r9,0x18(%rcx) 2341 mov 0x20(%rdx),%r10 2342 movnti %r10,0x20(%rcx) 2343 mov 0x28(%rdx),%r11 2344 movnti %r11,0x28(%rcx) 2345 mov 0x30(%rdx),%r9 2346 movnti %r9,0x30(%rcx) 2347 mov 0x38(%rdx),%r10 2348 movnti %r10,0x38(%rcx) 2349 2350 lea 0x40(%rdx),%rdx 2351 lea 0x40(%rcx),%rcx 2352 cmp $0x40,%r8 2353 jge L(byte8_nt_top) 2354 sfence 2355 jmp L(byte8_end) 2356 2357 SET_SIZE(memcpy) 2358 2359 .balign 16 2360L(CopyBackwards): 2361 mov %rdx,%r8 2362 mov %rdi,%rcx 2363 mov %rsi,%rdx 2364 mov %rdi,%rax # return value 2365 2366 # ck alignment of last byte 2367 lea (%rcx,%r8,1),%rcx 2368 test $0x7,%rcx 2369 lea (%rdx,%r8,1),%rdx 2370 jne L(bk_align) 2371 2372L(bk_qw_aligned): 2373 lea L(bkPxQx)(%rip),%r10 2374 2375 cmp $0x90,%r8 # 144 2376 jg L(bk_ck_sse2_alignment) 2377 2378 sub %r8,%rcx 2379 sub %r8,%rdx 2380 2381 movslq (%r10,%r8,4),%r9 2382 lea (%r9,%r10,1),%r10 2383 jmpq *%r10 2384 2385 .balign 16 2386L(bk_align): 2387 # only align if len > 8 2388 cmp $8,%r8 2389 jle L(bk_qw_aligned) 2390 test $0x1,%rcx 2391 je L(bk_tst2) 2392 dec %rcx 2393 dec %rdx 2394 dec %r8 2395 mov (%rdx),%r9b 2396 mov %r9b,(%rcx) 2397 2398L(bk_tst2): 2399 test $0x2,%rcx 2400 je L(bk_tst3) 2401 2402L(bk_got2): 2403 sub $0x2,%rcx 2404 sub $0x2,%rdx 2405 sub $0x2,%r8 2406 movzwq (%rdx),%r9 2407 mov %r9w,(%rcx) 2408 2409L(bk_tst3): 2410 test $0x4,%rcx 2411 je L(bk_qw_aligned) 2412 2413L(bk_got3): 2414 sub $0x4,%rcx 2415 sub $0x4,%rdx 2416 sub $0x4,%r8 2417 mov (%rdx),%r9d 2418 mov %r9d,(%rcx) 2419 jmp L(bk_qw_aligned) 2420 2421 .balign 16 2422L(bk_ck_sse2_alignment): 2423 cmpl $NO_SSE,.memops_method(%rip) 2424 je L(bk_use_rep) 2425 # check alignment of last byte 2426 test $0xf,%rcx 2427 jz L(bk_sse2_cpy) 2428 2429L(bk_sse2_align): 2430 # only here if already aligned on at least a qword bndry 2431 sub $0x8,%rcx 2432 sub $0x8,%rdx 2433 sub $0x8,%r8 2434 mov (%rdx),%r9 2435 mov %r9,(%rcx) 2436 #jmp L(bk_sse2_cpy) 2437 2438 .balign 16 2439L(bk_sse2_cpy): 2440 sub $0x80,%rcx # 128 2441 sub $0x80,%rdx 2442 movdqu 0x70(%rdx),%xmm3 2443 movdqu 0x60(%rdx),%xmm2 2444 movdqa %xmm3,0x70(%rcx) 2445 movdqa %xmm2,0x60(%rcx) 2446 sub $0x80,%r8 2447 movdqu 0x50(%rdx),%xmm1 2448 movdqu 0x40(%rdx),%xmm0 2449 movdqa %xmm1,0x50(%rcx) 2450 movdqa %xmm0,0x40(%rcx) 2451 2452 cmp $0x80,%r8 2453 movdqu 0x30(%rdx),%xmm3 2454 movdqu 0x20(%rdx),%xmm2 2455 movdqa %xmm3,0x30(%rcx) 2456 movdqa %xmm2,0x20(%rcx) 2457 movdqu 0x10(%rdx),%xmm1 2458 movdqu (%rdx),%xmm0 2459 movdqa %xmm1,0x10(%rcx) 2460 movdqa %xmm0,(%rcx) 2461 jge L(bk_sse2_cpy) 2462 2463L(bk_sse2_cpy_end): 2464 lea L(bkPxQx)(%rip),%r10 2465 sub %r8,%rdx 2466 sub %r8,%rcx 2467 movslq (%r10,%r8,4),%r9 2468 lea (%r9,%r10,1),%r10 2469 jmpq *%r10 2470 2471 .balign 16 2472L(bk_use_rep): 2473 xchg %rcx,%r9 2474 mov %rdx,%rsi # source 2475 mov %r9,%rdi # destination 2476 mov %r8,%rcx # count 2477 sub $8,%rsi 2478 sub $8,%rdi 2479 shr $3,%rcx 2480 std # reverse direction 2481 rep 2482 movsq 2483 cld # reset direction flag 2484 2485 xchg %rcx,%r9 2486 lea L(bkPxQx)(%rip),%r10 2487 sub %r8,%rdx 2488 sub %r8,%rcx 2489 andq $7,%r8 # remainder 2490 jz 2f 2491 movslq (%r10,%r8,4),%r9 2492 lea (%r9,%r10,1),%r10 2493 jmpq *%r10 24942: 2495 ret 2496 2497 .balign 16 2498L(bkP0QI): 2499 mov 
0x88(%rdx),%r10 2500 mov %r10,0x88(%rcx) 2501L(bkP0QH): 2502 mov 0x80(%rdx),%r10 2503 mov %r10,0x80(%rcx) 2504L(bkP0QG): 2505 mov 0x78(%rdx),%r9 2506 mov %r9,0x78(%rcx) 2507L(bkP0QF): 2508 mov 0x70(%rdx),%r11 2509 mov %r11,0x70(%rcx) 2510L(bkP0QE): 2511 mov 0x68(%rdx),%r10 2512 mov %r10,0x68(%rcx) 2513L(bkP0QD): 2514 mov 0x60(%rdx),%r9 2515 mov %r9,0x60(%rcx) 2516L(bkP0QC): 2517 mov 0x58(%rdx),%r11 2518 mov %r11,0x58(%rcx) 2519L(bkP0QB): 2520 mov 0x50(%rdx),%r10 2521 mov %r10,0x50(%rcx) 2522L(bkP0QA): 2523 mov 0x48(%rdx),%r9 2524 mov %r9,0x48(%rcx) 2525L(bkP0Q9): 2526 mov 0x40(%rdx),%r11 2527 mov %r11,0x40(%rcx) 2528L(bkP0Q8): 2529 mov 0x38(%rdx),%r10 2530 mov %r10,0x38(%rcx) 2531L(bkP0Q7): 2532 mov 0x30(%rdx),%r9 2533 mov %r9,0x30(%rcx) 2534L(bkP0Q6): 2535 mov 0x28(%rdx),%r11 2536 mov %r11,0x28(%rcx) 2537L(bkP0Q5): 2538 mov 0x20(%rdx),%r10 2539 mov %r10,0x20(%rcx) 2540L(bkP0Q4): 2541 mov 0x18(%rdx),%r9 2542 mov %r9,0x18(%rcx) 2543L(bkP0Q3): 2544 mov 0x10(%rdx),%r11 2545 mov %r11,0x10(%rcx) 2546L(bkP0Q2): 2547 mov 0x8(%rdx),%r10 2548 mov %r10,0x8(%rcx) 2549L(bkP0Q1): 2550 mov (%rdx),%r9 2551 mov %r9,(%rcx) 2552L(bkP0Q0): 2553 ret 2554 2555 .balign 16 2556L(bkP1QI): 2557 mov 0x89(%rdx),%r10 2558 mov %r10,0x89(%rcx) 2559L(bkP1QH): 2560 mov 0x81(%rdx),%r11 2561 mov %r11,0x81(%rcx) 2562L(bkP1QG): 2563 mov 0x79(%rdx),%r10 2564 mov %r10,0x79(%rcx) 2565L(bkP1QF): 2566 mov 0x71(%rdx),%r9 2567 mov %r9,0x71(%rcx) 2568L(bkP1QE): 2569 mov 0x69(%rdx),%r11 2570 mov %r11,0x69(%rcx) 2571L(bkP1QD): 2572 mov 0x61(%rdx),%r10 2573 mov %r10,0x61(%rcx) 2574L(bkP1QC): 2575 mov 0x59(%rdx),%r9 2576 mov %r9,0x59(%rcx) 2577L(bkP1QB): 2578 mov 0x51(%rdx),%r11 2579 mov %r11,0x51(%rcx) 2580L(bkP1QA): 2581 mov 0x49(%rdx),%r10 2582 mov %r10,0x49(%rcx) 2583L(bkP1Q9): 2584 mov 0x41(%rdx),%r9 2585 mov %r9,0x41(%rcx) 2586L(bkP1Q8): 2587 mov 0x39(%rdx),%r11 2588 mov %r11,0x39(%rcx) 2589L(bkP1Q7): 2590 mov 0x31(%rdx),%r10 2591 mov %r10,0x31(%rcx) 2592L(bkP1Q6): 2593 mov 0x29(%rdx),%r9 2594 mov %r9,0x29(%rcx) 2595L(bkP1Q5): 2596 mov 0x21(%rdx),%r11 2597 mov %r11,0x21(%rcx) 2598L(bkP1Q4): 2599 mov 0x19(%rdx),%r10 2600 mov %r10,0x19(%rcx) 2601L(bkP1Q3): 2602 mov 0x11(%rdx),%r9 2603 mov %r9,0x11(%rcx) 2604L(bkP1Q2): 2605 mov 0x9(%rdx),%r11 2606 mov %r11,0x9(%rcx) 2607L(bkP1Q1): 2608 mov 0x1(%rdx),%r10 2609 mov %r10,0x1(%rcx) 2610L(bkP1Q0): 2611 mov (%rdx),%r9b 2612 mov %r9b,(%rcx) 2613 ret 2614 2615 .balign 16 2616L(bkP2QI): 2617 mov 0x8a(%rdx),%r10 2618 mov %r10,0x8a(%rcx) 2619L(bkP2QH): 2620 mov 0x82(%rdx),%r11 2621 mov %r11,0x82(%rcx) 2622L(bkP2QG): 2623 mov 0x7a(%rdx),%r10 2624 mov %r10,0x7a(%rcx) 2625L(bkP2QF): 2626 mov 0x72(%rdx),%r9 2627 mov %r9,0x72(%rcx) 2628L(bkP2QE): 2629 mov 0x6a(%rdx),%r11 2630 mov %r11,0x6a(%rcx) 2631L(bkP2QD): 2632 mov 0x62(%rdx),%r10 2633 mov %r10,0x62(%rcx) 2634L(bkP2QC): 2635 mov 0x5a(%rdx),%r9 2636 mov %r9,0x5a(%rcx) 2637L(bkP2QB): 2638 mov 0x52(%rdx),%r11 2639 mov %r11,0x52(%rcx) 2640L(bkP2QA): 2641 mov 0x4a(%rdx),%r10 2642 mov %r10,0x4a(%rcx) 2643L(bkP2Q9): 2644 mov 0x42(%rdx),%r9 2645 mov %r9,0x42(%rcx) 2646L(bkP2Q8): 2647 mov 0x3a(%rdx),%r11 2648 mov %r11,0x3a(%rcx) 2649L(bkP2Q7): 2650 mov 0x32(%rdx),%r10 2651 mov %r10,0x32(%rcx) 2652L(bkP2Q6): 2653 mov 0x2a(%rdx),%r9 2654 mov %r9,0x2a(%rcx) 2655L(bkP2Q5): 2656 mov 0x22(%rdx),%r11 2657 mov %r11,0x22(%rcx) 2658L(bkP2Q4): 2659 mov 0x1a(%rdx),%r10 2660 mov %r10,0x1a(%rcx) 2661L(bkP2Q3): 2662 mov 0x12(%rdx),%r9 2663 mov %r9,0x12(%rcx) 2664L(bkP2Q2): 2665 mov 0xa(%rdx),%r11 2666 mov %r11,0xa(%rcx) 2667L(bkP2Q1): 2668 mov 0x2(%rdx),%r10 2669 mov 
L(bkP2Q0):
	mov	(%rdx),%r9w
	mov	%r9w,(%rcx)
	ret

	.balign 16
L(bkP3QI):
	mov	0x8b(%rdx),%r10
	mov	%r10,0x8b(%rcx)
L(bkP3QH):
	mov	0x83(%rdx),%r11
	mov	%r11,0x83(%rcx)
L(bkP3QG):
	mov	0x7b(%rdx),%r10
	mov	%r10,0x7b(%rcx)
L(bkP3QF):
	mov	0x73(%rdx),%r9
	mov	%r9,0x73(%rcx)
L(bkP3QE):
	mov	0x6b(%rdx),%r11
	mov	%r11,0x6b(%rcx)
L(bkP3QD):
	mov	0x63(%rdx),%r10
	mov	%r10,0x63(%rcx)
L(bkP3QC):
	mov	0x5b(%rdx),%r9
	mov	%r9,0x5b(%rcx)
L(bkP3QB):
	mov	0x53(%rdx),%r11
	mov	%r11,0x53(%rcx)
L(bkP3QA):
	mov	0x4b(%rdx),%r10
	mov	%r10,0x4b(%rcx)
L(bkP3Q9):
	mov	0x43(%rdx),%r9
	mov	%r9,0x43(%rcx)
L(bkP3Q8):
	mov	0x3b(%rdx),%r11
	mov	%r11,0x3b(%rcx)
L(bkP3Q7):
	mov	0x33(%rdx),%r10
	mov	%r10,0x33(%rcx)
L(bkP3Q6):
	mov	0x2b(%rdx),%r9
	mov	%r9,0x2b(%rcx)
L(bkP3Q5):
	mov	0x23(%rdx),%r11
	mov	%r11,0x23(%rcx)
L(bkP3Q4):
	mov	0x1b(%rdx),%r10
	mov	%r10,0x1b(%rcx)
L(bkP3Q3):
	mov	0x13(%rdx),%r9
	mov	%r9,0x13(%rcx)
L(bkP3Q2):
	mov	0xb(%rdx),%r11
	mov	%r11,0xb(%rcx)
L(bkP3Q1):
	mov	0x3(%rdx),%r10
	mov	%r10,0x3(%rcx)
L(bkP3Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9w
	mov	%r9w,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret

	.balign 16
L(bkP4QI):
	mov	0x8c(%rdx),%r10
	mov	%r10,0x8c(%rcx)
L(bkP4QH):
	mov	0x84(%rdx),%r11
	mov	%r11,0x84(%rcx)
L(bkP4QG):
	mov	0x7c(%rdx),%r10
	mov	%r10,0x7c(%rcx)
L(bkP4QF):
	mov	0x74(%rdx),%r9
	mov	%r9,0x74(%rcx)
L(bkP4QE):
	mov	0x6c(%rdx),%r11
	mov	%r11,0x6c(%rcx)
L(bkP4QD):
	mov	0x64(%rdx),%r10
	mov	%r10,0x64(%rcx)
L(bkP4QC):
	mov	0x5c(%rdx),%r9
	mov	%r9,0x5c(%rcx)
L(bkP4QB):
	mov	0x54(%rdx),%r11
	mov	%r11,0x54(%rcx)
L(bkP4QA):
	mov	0x4c(%rdx),%r10
	mov	%r10,0x4c(%rcx)
L(bkP4Q9):
	mov	0x44(%rdx),%r9
	mov	%r9,0x44(%rcx)
L(bkP4Q8):
	mov	0x3c(%rdx),%r11
	mov	%r11,0x3c(%rcx)
L(bkP4Q7):
	mov	0x34(%rdx),%r10
	mov	%r10,0x34(%rcx)
L(bkP4Q6):
	mov	0x2c(%rdx),%r9
	mov	%r9,0x2c(%rcx)
L(bkP4Q5):
	mov	0x24(%rdx),%r11
	mov	%r11,0x24(%rcx)
L(bkP4Q4):
	mov	0x1c(%rdx),%r10
	mov	%r10,0x1c(%rcx)
L(bkP4Q3):
	mov	0x14(%rdx),%r9
	mov	%r9,0x14(%rcx)
L(bkP4Q2):
	mov	0xc(%rdx),%r11
	mov	%r11,0xc(%rcx)
L(bkP4Q1):
	mov	0x4(%rdx),%r10
	mov	%r10,0x4(%rcx)
L(bkP4Q0):
	mov	(%rdx),%r9d
	mov	%r9d,(%rcx)
	ret

	.balign 16
L(bkP5QI):
	mov	0x8d(%rdx),%r10
	mov	%r10,0x8d(%rcx)
L(bkP5QH):
	mov	0x85(%rdx),%r9
	mov	%r9,0x85(%rcx)
L(bkP5QG):
	mov	0x7d(%rdx),%r11
	mov	%r11,0x7d(%rcx)
L(bkP5QF):
	mov	0x75(%rdx),%r10
	mov	%r10,0x75(%rcx)
L(bkP5QE):
	mov	0x6d(%rdx),%r9
	mov	%r9,0x6d(%rcx)
L(bkP5QD):
	mov	0x65(%rdx),%r11
	mov	%r11,0x65(%rcx)
L(bkP5QC):
	mov	0x5d(%rdx),%r10
	mov	%r10,0x5d(%rcx)
L(bkP5QB):
	mov	0x55(%rdx),%r9
	mov	%r9,0x55(%rcx)
L(bkP5QA):
	mov	0x4d(%rdx),%r11
	mov	%r11,0x4d(%rcx)
L(bkP5Q9):
	mov	0x45(%rdx),%r10
	mov	%r10,0x45(%rcx)
L(bkP5Q8):
	mov	0x3d(%rdx),%r9
	mov	%r9,0x3d(%rcx)
L(bkP5Q7):
	mov	0x35(%rdx),%r11
	mov	%r11,0x35(%rcx)
L(bkP5Q6):
	mov	0x2d(%rdx),%r10
	mov	%r10,0x2d(%rcx)
L(bkP5Q5):
	mov	0x25(%rdx),%r9
	mov	%r9,0x25(%rcx)
L(bkP5Q4):
	mov	0x1d(%rdx),%r11
	mov	%r11,0x1d(%rcx)
L(bkP5Q3):
	mov	0x15(%rdx),%r10
	mov	%r10,0x15(%rcx)
L(bkP5Q2):
	mov	0xd(%rdx),%r9
	mov	%r9,0xd(%rcx)
L(bkP5Q1):
	mov	0x5(%rdx),%r11
	mov	%r11,0x5(%rcx)
L(bkP5Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9d
	mov	%r9d,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret

	.balign 16
L(bkP6QI):
	mov	0x8e(%rdx),%r10
	mov	%r10,0x8e(%rcx)
L(bkP6QH):
	mov	0x86(%rdx),%r11
	mov	%r11,0x86(%rcx)
L(bkP6QG):
	mov	0x7e(%rdx),%r10
	mov	%r10,0x7e(%rcx)
L(bkP6QF):
	mov	0x76(%rdx),%r9
	mov	%r9,0x76(%rcx)
L(bkP6QE):
	mov	0x6e(%rdx),%r11
	mov	%r11,0x6e(%rcx)
L(bkP6QD):
	mov	0x66(%rdx),%r10
	mov	%r10,0x66(%rcx)
L(bkP6QC):
	mov	0x5e(%rdx),%r9
	mov	%r9,0x5e(%rcx)
L(bkP6QB):
	mov	0x56(%rdx),%r11
	mov	%r11,0x56(%rcx)
L(bkP6QA):
	mov	0x4e(%rdx),%r10
	mov	%r10,0x4e(%rcx)
L(bkP6Q9):
	mov	0x46(%rdx),%r9
	mov	%r9,0x46(%rcx)
L(bkP6Q8):
	mov	0x3e(%rdx),%r11
	mov	%r11,0x3e(%rcx)
L(bkP6Q7):
	mov	0x36(%rdx),%r10
	mov	%r10,0x36(%rcx)
L(bkP6Q6):
	mov	0x2e(%rdx),%r9
	mov	%r9,0x2e(%rcx)
L(bkP6Q5):
	mov	0x26(%rdx),%r11
	mov	%r11,0x26(%rcx)
L(bkP6Q4):
	mov	0x1e(%rdx),%r10
	mov	%r10,0x1e(%rcx)
L(bkP6Q3):
	mov	0x16(%rdx),%r9
	mov	%r9,0x16(%rcx)
L(bkP6Q2):
	mov	0xe(%rdx),%r11
	mov	%r11,0xe(%rcx)
L(bkP6Q1):
	mov	0x6(%rdx),%r10
	mov	%r10,0x6(%rcx)
L(bkP6Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x2(%rdx),%r9d
	mov	%r9d,0x2(%rcx)
	mov	(%rdx),%r10w
	mov	%r10w,(%rcx)
	ret

	.balign 16
L(bkP7QI):
	mov	0x8f(%rdx),%r10
	mov	%r10,0x8f(%rcx)
L(bkP7QH):
	mov	0x87(%rdx),%r11
	mov	%r11,0x87(%rcx)
L(bkP7QG):
	mov	0x7f(%rdx),%r10
	mov	%r10,0x7f(%rcx)
L(bkP7QF):
	mov	0x77(%rdx),%r9
	mov	%r9,0x77(%rcx)
L(bkP7QE):
	mov	0x6f(%rdx),%r11
	mov	%r11,0x6f(%rcx)
L(bkP7QD):
	mov	0x67(%rdx),%r10
	mov	%r10,0x67(%rcx)
L(bkP7QC):
	mov	0x5f(%rdx),%r9
	mov	%r9,0x5f(%rcx)
L(bkP7QB):
	mov	0x57(%rdx),%r11
	mov	%r11,0x57(%rcx)
L(bkP7QA):
	mov	0x4f(%rdx),%r10
	mov	%r10,0x4f(%rcx)
L(bkP7Q9):
	mov	0x47(%rdx),%r9
	mov	%r9,0x47(%rcx)
L(bkP7Q8):
	mov	0x3f(%rdx),%r11
	mov	%r11,0x3f(%rcx)
L(bkP7Q7):
	mov	0x37(%rdx),%r10
	mov	%r10,0x37(%rcx)
L(bkP7Q6):
	mov	0x2f(%rdx),%r9
	mov	%r9,0x2f(%rcx)
L(bkP7Q5):
	mov	0x27(%rdx),%r11
	mov	%r11,0x27(%rcx)
L(bkP7Q4):
	mov	0x1f(%rdx),%r10
	mov	%r10,0x1f(%rcx)
L(bkP7Q3):
	mov	0x17(%rdx),%r9
	mov	%r9,0x17(%rcx)
L(bkP7Q2):
	mov	0xf(%rdx),%r11
	mov	%r11,0xf(%rcx)
L(bkP7Q1):
	mov	0x7(%rdx),%r10
	mov	%r10,0x7(%rcx)
L(bkP7Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x3(%rdx),%r9d
	mov	%r9d,0x3(%rcx)
	mov	0x1(%rdx),%r10w
	mov	%r10w,0x1(%rcx)
	mov	(%rdx),%r11b
	mov	%r11b,(%rcx)
	ret

	.balign 16
L(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
	.int L(bkP1Q0)-L(bkPxQx)
	.int L(bkP2Q0)-L(bkPxQx)
	.int L(bkP3Q0)-L(bkPxQx)
	.int L(bkP4Q0)-L(bkPxQx)
	.int L(bkP5Q0)-L(bkPxQx)
	.int L(bkP6Q0)-L(bkPxQx)
	.int L(bkP7Q0)-L(bkPxQx)

	.int L(bkP0Q1)-L(bkPxQx)
	.int L(bkP1Q1)-L(bkPxQx)
	.int L(bkP2Q1)-L(bkPxQx)
	.int L(bkP3Q1)-L(bkPxQx)
	.int L(bkP4Q1)-L(bkPxQx)
	.int L(bkP5Q1)-L(bkPxQx)
	.int L(bkP6Q1)-L(bkPxQx)
	.int L(bkP7Q1)-L(bkPxQx)

	.int L(bkP0Q2)-L(bkPxQx)
	.int L(bkP1Q2)-L(bkPxQx)
	.int L(bkP2Q2)-L(bkPxQx)
	.int L(bkP3Q2)-L(bkPxQx)
	.int L(bkP4Q2)-L(bkPxQx)
	.int L(bkP5Q2)-L(bkPxQx)
	.int L(bkP6Q2)-L(bkPxQx)
	.int L(bkP7Q2)-L(bkPxQx)

	.int L(bkP0Q3)-L(bkPxQx)
	.int L(bkP1Q3)-L(bkPxQx)
	.int L(bkP2Q3)-L(bkPxQx)
	.int L(bkP3Q3)-L(bkPxQx)
	.int L(bkP4Q3)-L(bkPxQx)
	.int L(bkP5Q3)-L(bkPxQx)
	.int L(bkP6Q3)-L(bkPxQx)
	.int L(bkP7Q3)-L(bkPxQx)

	.int L(bkP0Q4)-L(bkPxQx)
	.int L(bkP1Q4)-L(bkPxQx)
	.int L(bkP2Q4)-L(bkPxQx)
	.int L(bkP3Q4)-L(bkPxQx)
	.int L(bkP4Q4)-L(bkPxQx)
	.int L(bkP5Q4)-L(bkPxQx)
	.int L(bkP6Q4)-L(bkPxQx)
	.int L(bkP7Q4)-L(bkPxQx)

	.int L(bkP0Q5)-L(bkPxQx)
	.int L(bkP1Q5)-L(bkPxQx)
	.int L(bkP2Q5)-L(bkPxQx)
	.int L(bkP3Q5)-L(bkPxQx)
	.int L(bkP4Q5)-L(bkPxQx)
	.int L(bkP5Q5)-L(bkPxQx)
	.int L(bkP6Q5)-L(bkPxQx)
	.int L(bkP7Q5)-L(bkPxQx)

	.int L(bkP0Q6)-L(bkPxQx)
	.int L(bkP1Q6)-L(bkPxQx)
	.int L(bkP2Q6)-L(bkPxQx)
	.int L(bkP3Q6)-L(bkPxQx)
	.int L(bkP4Q6)-L(bkPxQx)
	.int L(bkP5Q6)-L(bkPxQx)
	.int L(bkP6Q6)-L(bkPxQx)
	.int L(bkP7Q6)-L(bkPxQx)

	.int L(bkP0Q7)-L(bkPxQx)
	.int L(bkP1Q7)-L(bkPxQx)
	.int L(bkP2Q7)-L(bkPxQx)
	.int L(bkP3Q7)-L(bkPxQx)
	.int L(bkP4Q7)-L(bkPxQx)
	.int L(bkP5Q7)-L(bkPxQx)
	.int L(bkP6Q7)-L(bkPxQx)
	.int L(bkP7Q7)-L(bkPxQx)

	.int L(bkP0Q8)-L(bkPxQx)
	.int L(bkP1Q8)-L(bkPxQx)
	.int L(bkP2Q8)-L(bkPxQx)
	.int L(bkP3Q8)-L(bkPxQx)
	.int L(bkP4Q8)-L(bkPxQx)
	.int L(bkP5Q8)-L(bkPxQx)
	.int L(bkP6Q8)-L(bkPxQx)
	.int L(bkP7Q8)-L(bkPxQx)

	.int L(bkP0Q9)-L(bkPxQx)
	.int L(bkP1Q9)-L(bkPxQx)
	.int L(bkP2Q9)-L(bkPxQx)
	.int L(bkP3Q9)-L(bkPxQx)
	.int L(bkP4Q9)-L(bkPxQx)
	.int L(bkP5Q9)-L(bkPxQx)
	.int L(bkP6Q9)-L(bkPxQx)
	.int L(bkP7Q9)-L(bkPxQx)

	.int L(bkP0QA)-L(bkPxQx)
	.int L(bkP1QA)-L(bkPxQx)
	.int L(bkP2QA)-L(bkPxQx)
	.int L(bkP3QA)-L(bkPxQx)
	.int L(bkP4QA)-L(bkPxQx)
	.int L(bkP5QA)-L(bkPxQx)
	.int L(bkP6QA)-L(bkPxQx)
	.int L(bkP7QA)-L(bkPxQx)

	.int L(bkP0QB)-L(bkPxQx)
	.int L(bkP1QB)-L(bkPxQx)
	.int L(bkP2QB)-L(bkPxQx)
	.int L(bkP3QB)-L(bkPxQx)
	.int L(bkP4QB)-L(bkPxQx)
	.int L(bkP5QB)-L(bkPxQx)
	.int L(bkP6QB)-L(bkPxQx)
	.int L(bkP7QB)-L(bkPxQx)

	.int L(bkP0QC)-L(bkPxQx)
	.int L(bkP1QC)-L(bkPxQx)
	.int L(bkP2QC)-L(bkPxQx)
	.int L(bkP3QC)-L(bkPxQx)
	.int L(bkP4QC)-L(bkPxQx)
	.int L(bkP5QC)-L(bkPxQx)
	.int L(bkP6QC)-L(bkPxQx)
	.int L(bkP7QC)-L(bkPxQx)

	.int L(bkP0QD)-L(bkPxQx)
	.int L(bkP1QD)-L(bkPxQx)
	.int L(bkP2QD)-L(bkPxQx)
	.int L(bkP3QD)-L(bkPxQx)
	.int L(bkP4QD)-L(bkPxQx)
	.int L(bkP5QD)-L(bkPxQx)
	.int L(bkP6QD)-L(bkPxQx)
	.int L(bkP7QD)-L(bkPxQx)

	.int L(bkP0QE)-L(bkPxQx)
	.int L(bkP1QE)-L(bkPxQx)
	.int L(bkP2QE)-L(bkPxQx)
	.int L(bkP3QE)-L(bkPxQx)
	.int L(bkP4QE)-L(bkPxQx)
	.int L(bkP5QE)-L(bkPxQx)
	.int L(bkP6QE)-L(bkPxQx)
	.int L(bkP7QE)-L(bkPxQx)

	.int L(bkP0QF)-L(bkPxQx)
	.int L(bkP1QF)-L(bkPxQx)
	.int L(bkP2QF)-L(bkPxQx)
	.int L(bkP3QF)-L(bkPxQx)
	.int L(bkP4QF)-L(bkPxQx)
	.int L(bkP5QF)-L(bkPxQx)
	.int L(bkP6QF)-L(bkPxQx)
	.int L(bkP7QF)-L(bkPxQx)

	.int L(bkP0QG)-L(bkPxQx)
	.int L(bkP1QG)-L(bkPxQx)
	.int L(bkP2QG)-L(bkPxQx)
	.int L(bkP3QG)-L(bkPxQx)
	.int L(bkP4QG)-L(bkPxQx)
	.int L(bkP5QG)-L(bkPxQx)
	.int L(bkP6QG)-L(bkPxQx)
	.int L(bkP7QG)-L(bkPxQx)

	.int L(bkP0QH)-L(bkPxQx)
	.int L(bkP1QH)-L(bkPxQx)
	.int L(bkP2QH)-L(bkPxQx)
	.int L(bkP3QH)-L(bkPxQx)
	.int L(bkP4QH)-L(bkPxQx)
	.int L(bkP5QH)-L(bkPxQx)
	.int L(bkP6QH)-L(bkPxQx)
	.int L(bkP7QH)-L(bkPxQx)

	.int L(bkP0QI)-L(bkPxQx)
	.int L(bkP1QI)-L(bkPxQx)
	.int L(bkP2QI)-L(bkPxQx)
	.int L(bkP3QI)-L(bkPxQx)
	.int L(bkP4QI)-L(bkPxQx)
	.int L(bkP5QI)-L(bkPxQx)
	.int L(bkP6QI)-L(bkPxQx)
	.int L(bkP7QI)-L(bkPxQx)

	SET_SIZE(memmove)