/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2008, Intel Corporation
 * All rights reserved.
 */

/*
 * memcpy.s - copies two blocks of memory
 *	Implements memcpy() and memmove() libc primitives.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "cache.h"
#include "proc64_id.h"

#define	L(s) .memcpy/**/s

/*
 * memcpy algorithm overview:
 *
 * Thresholds used below were determined experimentally.
 *
 * Pseudo code:
 *
 * If (size <= 128 bytes) {
 *	do unrolled code (primarily 8-byte loads/stores) regardless of
 *	alignment.
 * } else {
 *	Align destination to 16-byte boundary
 *
 *	if (NO_SSE) {
 *		If (size > half of the largest level cache) {
 *			Use 8-byte non-temporal stores (64-bytes/loop)
 *		} else {
 *			if (size > 4K && size <= half l1 cache size) {
 *				Use rep movsq
 *			} else {
 *				Use 8-byte loads/stores (64 bytes per loop)
 *			}
 *		}
 *
 *	} else { **USE SSE**
 *		If (size > half of the largest level cache) {
 *			Use 16-byte non-temporal stores (128-bytes per loop)
 *		} else {
 *			If (both source and destination are aligned) {
 *				Use 16-byte aligned loads and stores
 *				    (128 bytes/loop)
 *			} else {
 *				use pairs of xmm registers with SSE2 or SSSE3
 *				instructions to concatenate and shift
 *				appropriately to account for source
 *				unalignment. This enables 16-byte aligned
 *				loads to be done.
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via unrolled code above.
 * }
 *
 * memmove overview:
 *	memmove is the same as memcpy except one case where copy needs to be
 *	done backwards. The copy backwards code is done in a similar manner.
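 *
 * Naming convention for the unrolled tails: the L(fwdPxQx) jump table
 * (and L(bkPxQx) for the backward copy) is indexed by the remaining byte
 * count, and entry n lands on L(PpQq) with n = 8*q + p, i.e. Q counts
 * whole 8-byte words and P the 1-7 trailing bytes (the letters A-I
 * continue the Q count past 9).
 *
 * A C-level sketch of the size dispatch above (purely illustrative; the
 * have_sse/cache-size inputs and the enum names are assumptions standing
 * in for the values read via cache.h/proc64_id.h, and the initial
 * destination-alignment step is ignored):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	enum path { SMALL, NT8, REP_MOVSQ, LOOP8, NT16, ALIGNED16, SHIFTED16 };
 *
 *	static enum path
 *	pick_path(const void *dst, const void *src, size_t len,
 *	    int have_sse, size_t half_big_cache, size_t half_l1)
 *	{
 *		if (len <= 128)
 *			return (SMALL);		// unrolled 8-byte moves
 *		if (!have_sse) {
 *			if (len > half_big_cache)
 *				return (NT8);	// 8-byte non-temporal stores
 *			if (len > 4096 && len <= half_l1)
 *				return (REP_MOVSQ);
 *			return (LOOP8);		// 8-byte loads/stores
 *		}
 *		if (len > half_big_cache)
 *			return (NT16);		// 16-byte non-temporal stores
 *		if ((((uintptr_t)dst | (uintptr_t)src) & 0xf) == 0)
 *			return (ALIGNED16);	// movdqa loads and stores
 *		return (SHIFTED16);		// shift/concatenate source blocks
 *	}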
 */

	ENTRY(memmove)
	cmp	%rsi,%rdi		# if dst <= src
	jbe	L(CopyForward)		# then do copy forward
	mov	%rsi,%r9		# move src to r9
	add	%rdx,%r9		# add len to get addr of end of src
	cmp	%r9,%rdi		# if dst < end of src
	jb	L(CopyBackwards)	# then do copy backwards
	jmp	L(CopyForward)

	ENTRY (memcpy)
L(CopyForward):
	mov	%rdx,%r8
	mov	%rdi,%rcx
	mov	%rsi,%rdx
	mov	%rdi,%rax
	lea	L(fwdPxQx)(%rip),%r11
	cmp	$0x80,%r8		# 128
	jg	L(ck_use_sse2)
	add	%r8,%rcx
	add	%r8,%rdx

	movslq	(%r11,%r8,4),%r10
	lea	(%r10,%r11,1),%r11
	jmpq	*%r11

	.balign 16
L(ShrtAlignNew):
	lea	L(AliPxQx)(%rip),%r11
	mov	%rcx,%r9
	and	$0xf,%r9

	movslq	(%r11,%r9,4),%r10
	lea	(%r10,%r11,1),%r11
	jmpq	*%r11

	.balign 16
L(fwdPxQx):	.int	L(P0Q0)-L(fwdPxQx)
	.int	L(P1Q0)-L(fwdPxQx)
	.int	L(P2Q0)-L(fwdPxQx)
	.int	L(P3Q0)-L(fwdPxQx)
	.int	L(P4Q0)-L(fwdPxQx)
	.int	L(P5Q0)-L(fwdPxQx)
	.int	L(P6Q0)-L(fwdPxQx)
	.int	L(P7Q0)-L(fwdPxQx)

	.int	L(P0Q1)-L(fwdPxQx)
	.int	L(P1Q1)-L(fwdPxQx)
	.int	L(P2Q1)-L(fwdPxQx)
	.int	L(P3Q1)-L(fwdPxQx)
	.int	L(P4Q1)-L(fwdPxQx)
	.int	L(P5Q1)-L(fwdPxQx)
	.int	L(P6Q1)-L(fwdPxQx)
	.int	L(P7Q1)-L(fwdPxQx)

	.int	L(P0Q2)-L(fwdPxQx)
	.int	L(P1Q2)-L(fwdPxQx)
	.int	L(P2Q2)-L(fwdPxQx)
	.int	L(P3Q2)-L(fwdPxQx)
	.int	L(P4Q2)-L(fwdPxQx)
	.int	L(P5Q2)-L(fwdPxQx)
	.int	L(P6Q2)-L(fwdPxQx)
	.int	L(P7Q2)-L(fwdPxQx)

	.int	L(P0Q3)-L(fwdPxQx)
	.int	L(P1Q3)-L(fwdPxQx)
	.int	L(P2Q3)-L(fwdPxQx)
	.int	L(P3Q3)-L(fwdPxQx)
	.int	L(P4Q3)-L(fwdPxQx)
	.int	L(P5Q3)-L(fwdPxQx)
	.int	L(P6Q3)-L(fwdPxQx)
	.int	L(P7Q3)-L(fwdPxQx)

	.int	L(P0Q4)-L(fwdPxQx)
	.int	L(P1Q4)-L(fwdPxQx)
	.int	L(P2Q4)-L(fwdPxQx)
	.int	L(P3Q4)-L(fwdPxQx)
	.int	L(P4Q4)-L(fwdPxQx)
	.int	L(P5Q4)-L(fwdPxQx)
	.int	L(P6Q4)-L(fwdPxQx)
	.int	L(P7Q4)-L(fwdPxQx)

	.int	L(P0Q5)-L(fwdPxQx)
	.int	L(P1Q5)-L(fwdPxQx)
	.int	L(P2Q5)-L(fwdPxQx)
	.int	L(P3Q5)-L(fwdPxQx)
	.int	L(P4Q5)-L(fwdPxQx)
	.int	L(P5Q5)-L(fwdPxQx)
	.int	L(P6Q5)-L(fwdPxQx)
	.int	L(P7Q5)-L(fwdPxQx)

	.int	L(P0Q6)-L(fwdPxQx)
	.int	L(P1Q6)-L(fwdPxQx)
	.int	L(P2Q6)-L(fwdPxQx)
	.int	L(P3Q6)-L(fwdPxQx)
	.int	L(P4Q6)-L(fwdPxQx)
	.int	L(P5Q6)-L(fwdPxQx)
	.int	L(P6Q6)-L(fwdPxQx)
	.int	L(P7Q6)-L(fwdPxQx)

	.int	L(P0Q7)-L(fwdPxQx)
	.int	L(P1Q7)-L(fwdPxQx)
	.int	L(P2Q7)-L(fwdPxQx)
	.int	L(P3Q7)-L(fwdPxQx)
	.int	L(P4Q7)-L(fwdPxQx)
	.int	L(P5Q7)-L(fwdPxQx)
	.int	L(P6Q7)-L(fwdPxQx)
	.int	L(P7Q7)-L(fwdPxQx)

	.int	L(P0Q8)-L(fwdPxQx)
	.int	L(P1Q8)-L(fwdPxQx)
	.int	L(P2Q8)-L(fwdPxQx)
	.int	L(P3Q8)-L(fwdPxQx)
	.int	L(P4Q8)-L(fwdPxQx)
	.int	L(P5Q8)-L(fwdPxQx)
	.int	L(P6Q8)-L(fwdPxQx)
	.int	L(P7Q8)-L(fwdPxQx)

	.int	L(P0Q9)-L(fwdPxQx)
	.int	L(P1Q9)-L(fwdPxQx)
	.int	L(P2Q9)-L(fwdPxQx)
	.int	L(P3Q9)-L(fwdPxQx)
	.int	L(P4Q9)-L(fwdPxQx)
	.int	L(P5Q9)-L(fwdPxQx)
	.int	L(P6Q9)-L(fwdPxQx)
	.int	L(P7Q9)-L(fwdPxQx)

	.int	L(P0QA)-L(fwdPxQx)
	.int	L(P1QA)-L(fwdPxQx)
	.int	L(P2QA)-L(fwdPxQx)
	.int	L(P3QA)-L(fwdPxQx)
	.int	L(P4QA)-L(fwdPxQx)
	.int	L(P5QA)-L(fwdPxQx)
	.int	L(P6QA)-L(fwdPxQx)
	.int	L(P7QA)-L(fwdPxQx)

	.int	L(P0QB)-L(fwdPxQx)
	.int	L(P1QB)-L(fwdPxQx)
	.int	L(P2QB)-L(fwdPxQx)
	.int	L(P3QB)-L(fwdPxQx)
	.int	L(P4QB)-L(fwdPxQx)
	.int	L(P5QB)-L(fwdPxQx)
	.int	L(P6QB)-L(fwdPxQx)
	.int	L(P7QB)-L(fwdPxQx)

	.int
L(P0QC)-L(fwdPxQx) 243 .int L(P1QC)-L(fwdPxQx) 244 .int L(P2QC)-L(fwdPxQx) 245 .int L(P3QC)-L(fwdPxQx) 246 .int L(P4QC)-L(fwdPxQx) 247 .int L(P5QC)-L(fwdPxQx) 248 .int L(P6QC)-L(fwdPxQx) 249 .int L(P7QC)-L(fwdPxQx) 250 251 .int L(P0QD)-L(fwdPxQx) 252 .int L(P1QD)-L(fwdPxQx) 253 .int L(P2QD)-L(fwdPxQx) 254 .int L(P3QD)-L(fwdPxQx) 255 .int L(P4QD)-L(fwdPxQx) 256 .int L(P5QD)-L(fwdPxQx) 257 .int L(P6QD)-L(fwdPxQx) 258 .int L(P7QD)-L(fwdPxQx) 259 260 .int L(P0QE)-L(fwdPxQx) 261 .int L(P1QE)-L(fwdPxQx) 262 .int L(P2QE)-L(fwdPxQx) 263 .int L(P3QE)-L(fwdPxQx) 264 .int L(P4QE)-L(fwdPxQx) 265 .int L(P5QE)-L(fwdPxQx) 266 .int L(P6QE)-L(fwdPxQx) 267 .int L(P7QE)-L(fwdPxQx) 268 269 .int L(P0QF)-L(fwdPxQx) 270 .int L(P1QF)-L(fwdPxQx) 271 .int L(P2QF)-L(fwdPxQx) 272 .int L(P3QF)-L(fwdPxQx) 273 .int L(P4QF)-L(fwdPxQx) 274 .int L(P5QF)-L(fwdPxQx) 275 .int L(P6QF)-L(fwdPxQx) 276 .int L(P7QF)-L(fwdPxQx) 277 278 .int L(P0QG)-L(fwdPxQx) # 0x80 279 280 .balign 16 281L(AliPxQx): .int L(now_qw_aligned)-L(AliPxQx) 282 .int L(A1Q0)-L(AliPxQx) 283 .int L(A2Q0)-L(AliPxQx) 284 .int L(A3Q0)-L(AliPxQx) 285 .int L(A4Q0)-L(AliPxQx) 286 .int L(A5Q0)-L(AliPxQx) 287 .int L(A6Q0)-L(AliPxQx) 288 .int L(A7Q0)-L(AliPxQx) 289 .int L(A0Q1)-L(AliPxQx) 290 .int L(A1Q1)-L(AliPxQx) 291 .int L(A2Q1)-L(AliPxQx) 292 .int L(A3Q1)-L(AliPxQx) 293 .int L(A4Q1)-L(AliPxQx) 294 .int L(A5Q1)-L(AliPxQx) 295 .int L(A6Q1)-L(AliPxQx) 296 .int L(A7Q1)-L(AliPxQx) 297 298 .balign 16 299L(A1Q0): # ; need to move 8+ 7=1+2+4 bytes 300 movzbq (%rdx),%r11 301 sub $0xf,%r8 302 mov %r11b,(%rcx) 303 304 movzwq 0x1(%rdx),%r10 305 mov %r10w,0x1(%rcx) 306 307 mov 0x3(%rdx),%r9d 308 mov %r9d,0x3(%rcx) 309 310 mov 0x7(%rdx),%r11 311 add $0xf,%rdx 312 mov %r11,0x7(%rcx) 313 314 add $0xf,%rcx 315 jmp L(now_qw_aligned) 316 317 .balign 16 318L(A2Q0): # ; need to move 8+ 6=2+4 bytes 319 movzwq (%rdx),%r10 320 sub $0xe,%r8 321 mov %r10w,(%rcx) 322 323 mov 0x2(%rdx),%r9d 324 mov %r9d,0x2(%rcx) 325 326 mov 0x6(%rdx),%r11 327 add $0xe,%rdx 328 mov %r11,0x6(%rcx) 329 add $0xe,%rcx 330 jmp L(now_qw_aligned) 331 332 .balign 16 333L(A3Q0): # ; need to move 8+ 5=1+4 bytes 334 movzbq (%rdx),%r11 335 sub $0xd,%r8 336 mov %r11b,(%rcx) 337 338 mov 0x1(%rdx),%r9d 339 mov %r9d,0x1(%rcx) 340 341 mov 0x5(%rdx),%r10 342 add $0xd,%rdx 343 mov %r10,0x5(%rcx) 344 345 add $0xd,%rcx 346 jmp L(now_qw_aligned) 347 348 .balign 16 349L(A4Q0): # ; need to move 8+4 bytes 350 mov (%rdx),%r9d 351 sub $0xc,%r8 352 mov %r9d,(%rcx) 353 354 mov 0x4(%rdx),%r10 355 add $0xc,%rdx 356 mov %r10,0x4(%rcx) 357 358 add $0xc,%rcx 359 jmp L(now_qw_aligned) 360 361 .balign 16 362L(A5Q0): # ; need to move 8+ 3=1+2 bytes 363 movzbq (%rdx),%r11 364 sub $0xb,%r8 365 mov %r11b,(%rcx) 366 367 movzwq 0x1(%rdx),%r10 368 mov %r10w,0x1(%rcx) 369 370 mov 0x3(%rdx),%r9 371 add $0xb,%rdx 372 mov %r9,0x3(%rcx) 373 374 add $0xb,%rcx 375 jmp L(now_qw_aligned) 376 377 .balign 16 378L(A6Q0): # ; need to move 8+2 bytes 379 movzwq (%rdx),%r10 380 sub $0xa,%r8 381 mov %r10w,(%rcx) 382 383 mov 0x2(%rdx),%r9 384 add $0xa,%rdx 385 mov %r9,0x2(%rcx) 386 387 add $0xa,%rcx 388 jmp L(now_qw_aligned) 389 390 .balign 16 391L(A7Q0): # ; need to move 8+1 byte 392 movzbq (%rdx),%r11 393 sub $0x9,%r8 394 mov %r11b,(%rcx) 395 396 mov 0x1(%rdx),%r10 397 add $0x9,%rdx 398 mov %r10,0x1(%rcx) 399 400 add $0x9,%rcx 401 jmp L(now_qw_aligned) 402 403 .balign 16 404L(A0Q1): # ; need to move 8 bytes 405 406 mov (%rdx),%r10 407 add $0x8,%rdx 408 sub $0x8,%r8 409 mov %r10,(%rcx) 410 411 add $0x8,%rcx 412 jmp L(now_qw_aligned) 413 414 .balign 16 415L(A1Q1): # 
; need to move 7=1+2+4 bytes 416 movzbq (%rdx),%r11 417 sub $0x7,%r8 418 mov %r11b,(%rcx) 419 420 movzwq 0x1(%rdx),%r10 421 mov %r10w,0x1(%rcx) 422 423 mov 0x3(%rdx),%r9d 424 add $0x7,%rdx 425 mov %r9d,0x3(%rcx) 426 add $0x7,%rcx 427 jmp L(now_qw_aligned) 428 429 .balign 16 430L(A2Q1): # ; need to move 6=2+4 bytes 431 movzwq (%rdx),%r10 432 sub $0x6,%r8 433 mov %r10w,(%rcx) 434 mov 0x2(%rdx),%r9d 435 add $0x6,%rdx 436 mov %r9d,0x2(%rcx) 437 add $0x6,%rcx 438 jmp L(now_qw_aligned) 439 440 .balign 16 441L(A3Q1): # ; need to move 5=1+4 bytes 442 movzbq (%rdx),%r11 443 sub $0x5,%r8 444 mov %r11b,(%rcx) 445 mov 0x1(%rdx),%r9d 446 add $0x5,%rdx 447 mov %r9d,0x1(%rcx) 448 add $0x5,%rcx 449 jmp L(now_qw_aligned) 450 451 .balign 16 452L(A4Q1): # ; need to move 4 bytes 453 mov (%rdx),%r9d 454 sub $0x4,%r8 455 add $0x4,%rdx 456 mov %r9d,(%rcx) 457 add $0x4,%rcx 458 jmp L(now_qw_aligned) 459 460 .balign 16 461L(A5Q1): # ; need to move 3=1+2 bytes 462 movzbq (%rdx),%r11 463 sub $0x3,%r8 464 mov %r11b,(%rcx) 465 466 movzwq 0x1(%rdx),%r10 467 add $0x3,%rdx 468 mov %r10w,0x1(%rcx) 469 470 add $0x3,%rcx 471 jmp L(now_qw_aligned) 472 473 .balign 16 474L(A6Q1): # ; need to move 2 bytes 475 movzwq (%rdx),%r10 476 sub $0x2,%r8 477 add $0x2,%rdx 478 mov %r10w,(%rcx) 479 add $0x2,%rcx 480 jmp L(now_qw_aligned) 481 482 .balign 16 483L(A7Q1): # ; need to move 1 byte 484 movzbq (%rdx),%r11 485 dec %r8 486 inc %rdx 487 mov %r11b,(%rcx) 488 inc %rcx 489 jmp L(now_qw_aligned) 490 491 492 .balign 16 493L(P0QG): 494 mov -0x80(%rdx),%r9 495 mov %r9,-0x80(%rcx) 496L(P0QF): 497 mov -0x78(%rdx),%r10 498 mov %r10,-0x78(%rcx) 499L(P0QE): 500 mov -0x70(%rdx),%r9 501 mov %r9,-0x70(%rcx) 502L(P0QD): 503 mov -0x68(%rdx),%r10 504 mov %r10,-0x68(%rcx) 505L(P0QC): 506 mov -0x60(%rdx),%r9 507 mov %r9,-0x60(%rcx) 508L(P0QB): 509 mov -0x58(%rdx),%r10 510 mov %r10,-0x58(%rcx) 511L(P0QA): 512 mov -0x50(%rdx),%r9 513 mov %r9,-0x50(%rcx) 514L(P0Q9): 515 mov -0x48(%rdx),%r10 516 mov %r10,-0x48(%rcx) 517L(P0Q8): 518 mov -0x40(%rdx),%r9 519 mov %r9,-0x40(%rcx) 520L(P0Q7): 521 mov -0x38(%rdx),%r10 522 mov %r10,-0x38(%rcx) 523L(P0Q6): 524 mov -0x30(%rdx),%r9 525 mov %r9,-0x30(%rcx) 526L(P0Q5): 527 mov -0x28(%rdx),%r10 528 mov %r10,-0x28(%rcx) 529L(P0Q4): 530 mov -0x20(%rdx),%r9 531 mov %r9,-0x20(%rcx) 532L(P0Q3): 533 mov -0x18(%rdx),%r10 534 mov %r10,-0x18(%rcx) 535L(P0Q2): 536 mov -0x10(%rdx),%r9 537 mov %r9,-0x10(%rcx) 538L(P0Q1): 539 mov -0x8(%rdx),%r10 540 mov %r10,-0x8(%rcx) 541L(P0Q0): 542 ret 543 544 .balign 16 545L(P1QF): 546 mov -0x79(%rdx),%r9 547 mov %r9,-0x79(%rcx) 548L(P1QE): 549 mov -0x71(%rdx),%r11 550 mov %r11,-0x71(%rcx) 551L(P1QD): 552 mov -0x69(%rdx),%r10 553 mov %r10,-0x69(%rcx) 554L(P1QC): 555 mov -0x61(%rdx),%r9 556 mov %r9,-0x61(%rcx) 557L(P1QB): 558 mov -0x59(%rdx),%r11 559 mov %r11,-0x59(%rcx) 560L(P1QA): 561 mov -0x51(%rdx),%r10 562 mov %r10,-0x51(%rcx) 563L(P1Q9): 564 mov -0x49(%rdx),%r9 565 mov %r9,-0x49(%rcx) 566L(P1Q8): 567 mov -0x41(%rdx),%r11 568 mov %r11,-0x41(%rcx) 569L(P1Q7): 570 mov -0x39(%rdx),%r10 571 mov %r10,-0x39(%rcx) 572L(P1Q6): 573 mov -0x31(%rdx),%r9 574 mov %r9,-0x31(%rcx) 575L(P1Q5): 576 mov -0x29(%rdx),%r11 577 mov %r11,-0x29(%rcx) 578L(P1Q4): 579 mov -0x21(%rdx),%r10 580 mov %r10,-0x21(%rcx) 581L(P1Q3): 582 mov -0x19(%rdx),%r9 583 mov %r9,-0x19(%rcx) 584L(P1Q2): 585 mov -0x11(%rdx),%r11 586 mov %r11,-0x11(%rcx) 587L(P1Q1): 588 mov -0x9(%rdx),%r10 589 mov %r10,-0x9(%rcx) 590L(P1Q0): 591 movzbq -0x1(%rdx),%r9 592 mov %r9b,-0x1(%rcx) 593 ret 594 595 .balign 16 596L(P2QF): 597 mov -0x7a(%rdx),%r9 598 
mov %r9,-0x7a(%rcx) 599L(P2QE): 600 mov -0x72(%rdx),%r11 601 mov %r11,-0x72(%rcx) 602L(P2QD): 603 mov -0x6a(%rdx),%r10 604 mov %r10,-0x6a(%rcx) 605L(P2QC): 606 mov -0x62(%rdx),%r9 607 mov %r9,-0x62(%rcx) 608L(P2QB): 609 mov -0x5a(%rdx),%r11 610 mov %r11,-0x5a(%rcx) 611L(P2QA): 612 mov -0x52(%rdx),%r10 613 mov %r10,-0x52(%rcx) 614L(P2Q9): 615 mov -0x4a(%rdx),%r9 616 mov %r9,-0x4a(%rcx) 617L(P2Q8): 618 mov -0x42(%rdx),%r11 619 mov %r11,-0x42(%rcx) 620L(P2Q7): 621 mov -0x3a(%rdx),%r10 622 mov %r10,-0x3a(%rcx) 623L(P2Q6): 624 mov -0x32(%rdx),%r9 625 mov %r9,-0x32(%rcx) 626L(P2Q5): 627 mov -0x2a(%rdx),%r11 628 mov %r11,-0x2a(%rcx) 629L(P2Q4): 630 mov -0x22(%rdx),%r10 631 mov %r10,-0x22(%rcx) 632L(P2Q3): 633 mov -0x1a(%rdx),%r9 634 mov %r9,-0x1a(%rcx) 635L(P2Q2): 636 mov -0x12(%rdx),%r11 637 mov %r11,-0x12(%rcx) 638L(P2Q1): 639 mov -0xa(%rdx),%r10 640 mov %r10,-0xa(%rcx) 641L(P2Q0): 642 movzwq -0x2(%rdx),%r9 643 mov %r9w,-0x2(%rcx) 644 ret 645 646 .balign 16 647L(P3QF): 648 mov -0x7b(%rdx),%r9 649 mov %r9,-0x7b(%rcx) 650L(P3QE): 651 mov -0x73(%rdx),%r11 652 mov %r11,-0x73(%rcx) 653L(P3QD): 654 mov -0x6b(%rdx),%r10 655 mov %r10,-0x6b(%rcx) 656L(P3QC): 657 mov -0x63(%rdx),%r9 658 mov %r9,-0x63(%rcx) 659L(P3QB): 660 mov -0x5b(%rdx),%r11 661 mov %r11,-0x5b(%rcx) 662L(P3QA): 663 mov -0x53(%rdx),%r10 664 mov %r10,-0x53(%rcx) 665L(P3Q9): 666 mov -0x4b(%rdx),%r9 667 mov %r9,-0x4b(%rcx) 668L(P3Q8): 669 mov -0x43(%rdx),%r11 670 mov %r11,-0x43(%rcx) 671L(P3Q7): 672 mov -0x3b(%rdx),%r10 673 mov %r10,-0x3b(%rcx) 674L(P3Q6): 675 mov -0x33(%rdx),%r9 676 mov %r9,-0x33(%rcx) 677L(P3Q5): 678 mov -0x2b(%rdx),%r11 679 mov %r11,-0x2b(%rcx) 680L(P3Q4): 681 mov -0x23(%rdx),%r10 682 mov %r10,-0x23(%rcx) 683L(P3Q3): 684 mov -0x1b(%rdx),%r9 685 mov %r9,-0x1b(%rcx) 686L(P3Q2): 687 mov -0x13(%rdx),%r11 688 mov %r11,-0x13(%rcx) 689L(P3Q1): 690 mov -0xb(%rdx),%r10 691 mov %r10,-0xb(%rcx) 692 /* 693 * These trailing loads/stores have to do all their loads 1st, 694 * then do the stores. 
695 */ 696L(P3Q0): 697 movzwq -0x3(%rdx),%r9 698 movzbq -0x1(%rdx),%r10 699 mov %r9w,-0x3(%rcx) 700 mov %r10b,-0x1(%rcx) 701 ret 702 703 .balign 16 704L(P4QF): 705 mov -0x7c(%rdx),%r9 706 mov %r9,-0x7c(%rcx) 707L(P4QE): 708 mov -0x74(%rdx),%r11 709 mov %r11,-0x74(%rcx) 710L(P4QD): 711 mov -0x6c(%rdx),%r10 712 mov %r10,-0x6c(%rcx) 713L(P4QC): 714 mov -0x64(%rdx),%r9 715 mov %r9,-0x64(%rcx) 716L(P4QB): 717 mov -0x5c(%rdx),%r11 718 mov %r11,-0x5c(%rcx) 719L(P4QA): 720 mov -0x54(%rdx),%r10 721 mov %r10,-0x54(%rcx) 722L(P4Q9): 723 mov -0x4c(%rdx),%r9 724 mov %r9,-0x4c(%rcx) 725L(P4Q8): 726 mov -0x44(%rdx),%r11 727 mov %r11,-0x44(%rcx) 728L(P4Q7): 729 mov -0x3c(%rdx),%r10 730 mov %r10,-0x3c(%rcx) 731L(P4Q6): 732 mov -0x34(%rdx),%r9 733 mov %r9,-0x34(%rcx) 734L(P4Q5): 735 mov -0x2c(%rdx),%r11 736 mov %r11,-0x2c(%rcx) 737L(P4Q4): 738 mov -0x24(%rdx),%r10 739 mov %r10,-0x24(%rcx) 740L(P4Q3): 741 mov -0x1c(%rdx),%r9 742 mov %r9,-0x1c(%rcx) 743L(P4Q2): 744 mov -0x14(%rdx),%r11 745 mov %r11,-0x14(%rcx) 746L(P4Q1): 747 mov -0xc(%rdx),%r10 748 mov %r10,-0xc(%rcx) 749L(P4Q0): 750 mov -0x4(%rdx),%r9d 751 mov %r9d,-0x4(%rcx) 752 ret 753 754 .balign 16 755L(P5QF): 756 mov -0x7d(%rdx),%r9 757 mov %r9,-0x7d(%rcx) 758L(P5QE): 759 mov -0x75(%rdx),%r11 760 mov %r11,-0x75(%rcx) 761L(P5QD): 762 mov -0x6d(%rdx),%r10 763 mov %r10,-0x6d(%rcx) 764L(P5QC): 765 mov -0x65(%rdx),%r9 766 mov %r9,-0x65(%rcx) 767L(P5QB): 768 mov -0x5d(%rdx),%r11 769 mov %r11,-0x5d(%rcx) 770L(P5QA): 771 mov -0x55(%rdx),%r10 772 mov %r10,-0x55(%rcx) 773L(P5Q9): 774 mov -0x4d(%rdx),%r9 775 mov %r9,-0x4d(%rcx) 776L(P5Q8): 777 mov -0x45(%rdx),%r11 778 mov %r11,-0x45(%rcx) 779L(P5Q7): 780 mov -0x3d(%rdx),%r10 781 mov %r10,-0x3d(%rcx) 782L(P5Q6): 783 mov -0x35(%rdx),%r9 784 mov %r9,-0x35(%rcx) 785L(P5Q5): 786 mov -0x2d(%rdx),%r11 787 mov %r11,-0x2d(%rcx) 788L(P5Q4): 789 mov -0x25(%rdx),%r10 790 mov %r10,-0x25(%rcx) 791L(P5Q3): 792 mov -0x1d(%rdx),%r9 793 mov %r9,-0x1d(%rcx) 794L(P5Q2): 795 mov -0x15(%rdx),%r11 796 mov %r11,-0x15(%rcx) 797L(P5Q1): 798 mov -0xd(%rdx),%r10 799 mov %r10,-0xd(%rcx) 800 /* 801 * These trailing loads/stores have to do all their loads 1st, 802 * then do the stores. 803 */ 804L(P5Q0): 805 mov -0x5(%rdx),%r9d 806 movzbq -0x1(%rdx),%r10 807 mov %r9d,-0x5(%rcx) 808 mov %r10b,-0x1(%rcx) 809 ret 810 811 .balign 16 812L(P6QF): 813 mov -0x7e(%rdx),%r9 814 mov %r9,-0x7e(%rcx) 815L(P6QE): 816 mov -0x76(%rdx),%r11 817 mov %r11,-0x76(%rcx) 818L(P6QD): 819 mov -0x6e(%rdx),%r10 820 mov %r10,-0x6e(%rcx) 821L(P6QC): 822 mov -0x66(%rdx),%r9 823 mov %r9,-0x66(%rcx) 824L(P6QB): 825 mov -0x5e(%rdx),%r11 826 mov %r11,-0x5e(%rcx) 827L(P6QA): 828 mov -0x56(%rdx),%r10 829 mov %r10,-0x56(%rcx) 830L(P6Q9): 831 mov -0x4e(%rdx),%r9 832 mov %r9,-0x4e(%rcx) 833L(P6Q8): 834 mov -0x46(%rdx),%r11 835 mov %r11,-0x46(%rcx) 836L(P6Q7): 837 mov -0x3e(%rdx),%r10 838 mov %r10,-0x3e(%rcx) 839L(P6Q6): 840 mov -0x36(%rdx),%r9 841 mov %r9,-0x36(%rcx) 842L(P6Q5): 843 mov -0x2e(%rdx),%r11 844 mov %r11,-0x2e(%rcx) 845L(P6Q4): 846 mov -0x26(%rdx),%r10 847 mov %r10,-0x26(%rcx) 848L(P6Q3): 849 mov -0x1e(%rdx),%r9 850 mov %r9,-0x1e(%rcx) 851L(P6Q2): 852 mov -0x16(%rdx),%r11 853 mov %r11,-0x16(%rcx) 854L(P6Q1): 855 mov -0xe(%rdx),%r10 856 mov %r10,-0xe(%rcx) 857 /* 858 * These trailing loads/stores have to do all their loads 1st, 859 * then do the stores. 
860 */ 861L(P6Q0): 862 mov -0x6(%rdx),%r9d 863 movzwq -0x2(%rdx),%r10 864 mov %r9d,-0x6(%rcx) 865 mov %r10w,-0x2(%rcx) 866 ret 867 868 .balign 16 869L(P7QF): 870 mov -0x7f(%rdx),%r9 871 mov %r9,-0x7f(%rcx) 872L(P7QE): 873 mov -0x77(%rdx),%r11 874 mov %r11,-0x77(%rcx) 875L(P7QD): 876 mov -0x6f(%rdx),%r10 877 mov %r10,-0x6f(%rcx) 878L(P7QC): 879 mov -0x67(%rdx),%r9 880 mov %r9,-0x67(%rcx) 881L(P7QB): 882 mov -0x5f(%rdx),%r11 883 mov %r11,-0x5f(%rcx) 884L(P7QA): 885 mov -0x57(%rdx),%r10 886 mov %r10,-0x57(%rcx) 887L(P7Q9): 888 mov -0x4f(%rdx),%r9 889 mov %r9,-0x4f(%rcx) 890L(P7Q8): 891 mov -0x47(%rdx),%r11 892 mov %r11,-0x47(%rcx) 893L(P7Q7): 894 mov -0x3f(%rdx),%r10 895 mov %r10,-0x3f(%rcx) 896L(P7Q6): 897 mov -0x37(%rdx),%r9 898 mov %r9,-0x37(%rcx) 899L(P7Q5): 900 mov -0x2f(%rdx),%r11 901 mov %r11,-0x2f(%rcx) 902L(P7Q4): 903 mov -0x27(%rdx),%r10 904 mov %r10,-0x27(%rcx) 905L(P7Q3): 906 mov -0x1f(%rdx),%r9 907 mov %r9,-0x1f(%rcx) 908L(P7Q2): 909 mov -0x17(%rdx),%r11 910 mov %r11,-0x17(%rcx) 911L(P7Q1): 912 mov -0xf(%rdx),%r10 913 mov %r10,-0xf(%rcx) 914 /* 915 * These trailing loads/stores have to do all their loads 1st, 916 * then do the stores. 917 */ 918L(P7Q0): 919 mov -0x7(%rdx),%r9d 920 movzwq -0x3(%rdx),%r10 921 movzbq -0x1(%rdx),%r11 922 mov %r9d,-0x7(%rcx) 923 mov %r10w,-0x3(%rcx) 924 mov %r11b,-0x1(%rcx) 925 ret 926 927 .balign 16 928L(ck_use_sse2): 929 /* 930 * Align dest to 16 byte boundary. 931 */ 932 test $0xf,%rcx 933 jnz L(ShrtAlignNew) 934 935L(now_qw_aligned): 936 cmpl $NO_SSE,.memops_method(%rip) 937 je L(Loop8byte_pre) 938 939 /* 940 * The fall-through path is to do SSE2 16-byte load/stores 941 */ 942 943 /* 944 * If current move size is larger than half of the highest level cache 945 * size, then do non-temporal moves. 946 */ 947 mov .largest_level_cache_size(%rip),%r9d 948 shr %r9 # take half of it 949 cmp %r9,%r8 950 jg L(sse2_nt_move) 951 952 /* 953 * If both the source and dest are aligned, then use the both aligned 954 * logic. Well aligned data should reap the rewards. 955 */ 956 test $0xf,%rdx 957 jz L(pre_both_aligned) 958 959 lea L(SSE_src)(%rip),%r10 # SSE2 (default) 960 testl $USE_SSSE3,.memops_method(%rip) 961 jz 1f 962 lea L(SSSE3_src)(%rip),%r10 # SSSE3 963 9641: 965 /* 966 * if the src is not 16 byte aligned... 
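 * the code below first copies one unaligned 16-byte chunk with movdqu,
 * advances %rdx to the next 16-byte boundary (the misalignment amount is
 * kept in %r11), and then jumps through L(SSE_src)/L(SSSE3_src) to a
 * per-offset loop that performs only aligned loads, rebuilding each
 * output block from two adjacent source blocks: with psrldq/pslldq/por
 * on SSE2, or with palignr on SSSE3 (emitted below as raw .byte
 * sequences).
 *
 * A small intrinsics sketch of that reconstruction for a source that is
 * one byte past an alignment boundary (illustrative only, not part of
 * this file):
 *
 *	#include <emmintrin.h>		// SSE2
 *	#include <tmmintrin.h>		// SSSE3 (palignr)
 *
 *	// prev and cur are consecutive aligned 16-byte source blocks; the
 *	// 16 bytes wanted here are prev[1..15] followed by cur[0].
 *	static __m128i
 *	merge_off1_sse2(__m128i prev, __m128i cur)
 *	{
 *		return (_mm_or_si128(_mm_srli_si128(prev, 1),
 *		    _mm_slli_si128(cur, 15)));
 *	}
 *
 *	static __m128i
 *	merge_off1_ssse3(__m128i prev, __m128i cur)
 *	{
 *		return (_mm_alignr_epi8(cur, prev, 1));	// one instruction
 *	}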
967 */ 968 mov %rdx,%r11 969 and $0xf,%r11 970 movdqu (%rdx),%xmm0 971 movdqa %xmm0,(%rcx) 972 add $0x10,%rdx 973 sub %r11,%rdx 974 add $0x10,%rcx 975 sub $0x10,%r8 976 movdqa (%rdx),%xmm1 977 978 movslq (%r10,%r11,4),%r9 979 lea (%r9,%r10,1),%r10 980 jmpq *%r10 981 982 .balign 16 983L(SSSE3_src): .int L(pre_both_aligned)-L(SSSE3_src) 984 .int L(mov3dqa1) -L(SSSE3_src) 985 .int L(mov3dqa2) -L(SSSE3_src) 986 .int L(mov3dqa3) -L(SSSE3_src) 987 .int L(mov3dqa4) -L(SSSE3_src) 988 .int L(mov3dqa5) -L(SSSE3_src) 989 .int L(mov3dqa6) -L(SSSE3_src) 990 .int L(mov3dqa7) -L(SSSE3_src) 991 .int L(movdqa8) -L(SSSE3_src) 992 .int L(mov3dqa9) -L(SSSE3_src) 993 .int L(mov3dqa10)-L(SSSE3_src) 994 .int L(mov3dqa11)-L(SSSE3_src) 995 .int L(mov3dqa12)-L(SSSE3_src) 996 .int L(mov3dqa13)-L(SSSE3_src) 997 .int L(mov3dqa14)-L(SSSE3_src) 998 .int L(mov3dqa15)-L(SSSE3_src) 999L(SSE_src): .int L(pre_both_aligned)-L(SSE_src) 1000 .int L(movdqa1) -L(SSE_src) 1001 .int L(movdqa2) -L(SSE_src) 1002 .int L(movdqa3) -L(SSE_src) 1003 .int L(movdqa4) -L(SSE_src) 1004 .int L(movdqa5) -L(SSE_src) 1005 .int L(movdqa6) -L(SSE_src) 1006 .int L(movdqa7) -L(SSE_src) 1007 .int L(movdqa8) -L(SSE_src) 1008 .int L(movdqa9) -L(SSE_src) 1009 .int L(movdqa10)-L(SSE_src) 1010 .int L(movdqa11)-L(SSE_src) 1011 .int L(movdqa12)-L(SSE_src) 1012 .int L(movdqa13)-L(SSE_src) 1013 .int L(movdqa14)-L(SSE_src) 1014 .int L(movdqa15)-L(SSE_src) 1015 1016 .balign 16 1017L(movdqa1): 1018 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1019 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1020 lea 0x20(%rdx),%rdx 1021 lea -0x20(%r8),%r8 1022 1023 psrldq $0x1,%xmm1 # shift right prev buffer (saved from last iteration) 1024 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1025 pslldq $0xf,%xmm3 # shift the current buffer left (shift in zeros) 1026 por %xmm1,%xmm3 # OR them together 1027 cmp $0x20,%r8 1028 1029 psrldq $0x1,%xmm2 # shift right prev buffer (saved from last iteration) 1030 movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration 1031 pslldq $0xf,%xmm0 # shift the current buffer left (shift in zeros) 1032 por %xmm2,%xmm0 # OR them together 1033 movdqa %xmm3,(%rcx) # store it 1034 movdqa %xmm0,0x10(%rcx) # store it 1035 lea 0x20(%rcx),%rcx 1036 1037 jge L(movdqa1) 1038 jmp L(movdqa_epi) 1039 1040 .balign 16 1041L(movdqa2): 1042 sub $0x20,%r8 1043 movdqa 0x10(%rdx),%xmm3 1044 movdqa 0x20(%rdx),%xmm0 1045 add $0x20,%rdx 1046 1047 psrldq $0x2,%xmm1 1048 movdqa %xmm3,%xmm2 1049 pslldq $0xe,%xmm3 1050 por %xmm1,%xmm3 1051 1052 psrldq $0x2,%xmm2 1053 movdqa %xmm0,%xmm1 1054 pslldq $0xe,%xmm0 1055 por %xmm2,%xmm0 1056 movdqa %xmm3,(%rcx) 1057 movdqa %xmm0,0x10(%rcx) 1058 1059 add $0x20,%rcx 1060 cmp $0x20,%r8 1061 jge L(movdqa2) 1062 jmp L(movdqa_epi) 1063 1064 .balign 16 1065L(movdqa3): 1066 sub $0x20,%r8 1067 movdqa 0x10(%rdx),%xmm3 1068 movdqa 0x20(%rdx),%xmm0 1069 add $0x20,%rdx 1070 1071 psrldq $0x3,%xmm1 1072 movdqa %xmm3,%xmm2 1073 pslldq $0xd,%xmm3 1074 por %xmm1,%xmm3 1075 1076 psrldq $0x3,%xmm2 1077 movdqa %xmm0,%xmm1 1078 pslldq $0xd,%xmm0 1079 por %xmm2,%xmm0 1080 movdqa %xmm3,(%rcx) 1081 movdqa %xmm0,0x10(%rcx) 1082 1083 add $0x20,%rcx 1084 cmp $0x20,%r8 1085 jge L(movdqa3) 1086 jmp L(movdqa_epi) 1087 1088 .balign 16 1089L(movdqa4): 1090 sub $0x20,%r8 1091 movdqa 0x10(%rdx),%xmm3 1092 movdqa 0x20(%rdx),%xmm0 1093 add $0x20,%rdx 1094 1095 psrldq $0x4,%xmm1 1096 movdqa %xmm3,%xmm2 1097 pslldq $0xc,%xmm3 1098 por %xmm1,%xmm3 1099 1100 psrldq $0x4,%xmm2 1101 movdqa %xmm0,%xmm1 1102 pslldq $0xc,%xmm0 1103 por 
%xmm2,%xmm0 1104 1105 movdqa %xmm3,(%rcx) 1106 movdqa %xmm0,0x10(%rcx) 1107 1108 add $0x20,%rcx 1109 cmp $0x20,%r8 1110 jge L(movdqa4) 1111 jmp L(movdqa_epi) 1112 1113 .balign 16 1114L(movdqa5): 1115 sub $0x20,%r8 1116 movdqa 0x10(%rdx),%xmm3 1117 movdqa 0x20(%rdx),%xmm0 1118 add $0x20,%rdx 1119 1120 psrldq $0x5,%xmm1 1121 movdqa %xmm3,%xmm2 1122 pslldq $0xb,%xmm3 1123 por %xmm1,%xmm3 1124 1125 psrldq $0x5,%xmm2 1126 movdqa %xmm0,%xmm1 1127 pslldq $0xb,%xmm0 1128 por %xmm2,%xmm0 1129 1130 movdqa %xmm3,(%rcx) 1131 movdqa %xmm0,0x10(%rcx) 1132 1133 add $0x20,%rcx 1134 cmp $0x20,%r8 1135 jge L(movdqa5) 1136 jmp L(movdqa_epi) 1137 1138 .balign 16 1139L(movdqa6): 1140 sub $0x20,%r8 1141 movdqa 0x10(%rdx),%xmm3 1142 movdqa 0x20(%rdx),%xmm0 1143 add $0x20,%rdx 1144 1145 psrldq $0x6,%xmm1 1146 movdqa %xmm3,%xmm2 1147 pslldq $0xa,%xmm3 1148 por %xmm1,%xmm3 1149 1150 psrldq $0x6,%xmm2 1151 movdqa %xmm0,%xmm1 1152 pslldq $0xa,%xmm0 1153 por %xmm2,%xmm0 1154 movdqa %xmm3,(%rcx) 1155 movdqa %xmm0,0x10(%rcx) 1156 1157 add $0x20,%rcx 1158 cmp $0x20,%r8 1159 jge L(movdqa6) 1160 jmp L(movdqa_epi) 1161 1162 .balign 16 1163L(movdqa7): 1164 sub $0x20,%r8 1165 movdqa 0x10(%rdx),%xmm3 1166 movdqa 0x20(%rdx),%xmm0 1167 add $0x20,%rdx 1168 1169 psrldq $0x7,%xmm1 1170 movdqa %xmm3,%xmm2 1171 pslldq $0x9,%xmm3 1172 por %xmm1,%xmm3 1173 1174 psrldq $0x7,%xmm2 1175 movdqa %xmm0,%xmm1 1176 pslldq $0x9,%xmm0 1177 por %xmm2,%xmm0 1178 movdqa %xmm3,(%rcx) 1179 movdqa %xmm0,0x10(%rcx) 1180 1181 add $0x20,%rcx 1182 cmp $0x20,%r8 1183 jge L(movdqa7) 1184 jmp L(movdqa_epi) 1185 1186 .balign 16 1187L(movdqa8): 1188 movdqa 0x10(%rdx),%xmm3 1189 sub $0x30,%r8 1190 movdqa 0x20(%rdx),%xmm0 1191 movdqa 0x30(%rdx),%xmm5 1192 lea 0x30(%rdx),%rdx 1193 1194 shufpd $0x1,%xmm3,%xmm1 1195 movdqa %xmm1,(%rcx) 1196 1197 cmp $0x30,%r8 1198 1199 shufpd $0x1,%xmm0,%xmm3 1200 movdqa %xmm3,0x10(%rcx) 1201 1202 movdqa %xmm5,%xmm1 1203 shufpd $0x1,%xmm5,%xmm0 1204 movdqa %xmm0,0x20(%rcx) 1205 1206 lea 0x30(%rcx),%rcx 1207 1208 jge L(movdqa8) 1209 jmp L(movdqa_epi) 1210 1211 .balign 16 1212L(movdqa9): 1213 sub $0x20,%r8 1214 movdqa 0x10(%rdx),%xmm3 1215 movdqa 0x20(%rdx),%xmm0 1216 add $0x20,%rdx 1217 1218 psrldq $0x9,%xmm1 1219 movdqa %xmm3,%xmm2 1220 pslldq $0x7,%xmm3 1221 por %xmm1,%xmm3 1222 1223 psrldq $0x9,%xmm2 1224 movdqa %xmm0,%xmm1 1225 pslldq $0x7,%xmm0 1226 por %xmm2,%xmm0 1227 movdqa %xmm3,(%rcx) 1228 movdqa %xmm0,0x10(%rcx) 1229 1230 add $0x20,%rcx 1231 cmp $0x20,%r8 1232 jge L(movdqa9) 1233 jmp L(movdqa_epi) 1234 1235 .balign 16 1236L(movdqa10): 1237 sub $0x20,%r8 1238 movdqa 0x10(%rdx),%xmm3 1239 movdqa 0x20(%rdx),%xmm0 1240 add $0x20,%rdx 1241 1242 psrldq $0xa,%xmm1 1243 movdqa %xmm3,%xmm2 1244 pslldq $0x6,%xmm3 1245 por %xmm1,%xmm3 1246 1247 psrldq $0xa,%xmm2 1248 movdqa %xmm0,%xmm1 1249 pslldq $0x6,%xmm0 1250 por %xmm2,%xmm0 1251 movdqa %xmm3,(%rcx) 1252 movdqa %xmm0,0x10(%rcx) 1253 1254 add $0x20,%rcx 1255 cmp $0x20,%r8 1256 jge L(movdqa10) 1257 jmp L(movdqa_epi) 1258 1259 .balign 16 1260L(movdqa11): 1261 sub $0x20,%r8 1262 movdqa 0x10(%rdx),%xmm3 1263 movdqa 0x20(%rdx),%xmm0 1264 add $0x20,%rdx 1265 1266 psrldq $0xb,%xmm1 1267 movdqa %xmm3,%xmm2 1268 pslldq $0x5,%xmm3 1269 por %xmm1,%xmm3 1270 1271 psrldq $0xb,%xmm2 1272 movdqa %xmm0,%xmm1 1273 pslldq $0x5,%xmm0 1274 por %xmm2,%xmm0 1275 movdqa %xmm3,(%rcx) 1276 movdqa %xmm0,0x10(%rcx) 1277 1278 add $0x20,%rcx 1279 cmp $0x20,%r8 1280 jge L(movdqa11) 1281 jmp L(movdqa_epi) 1282 1283 .balign 16 1284L(movdqa12): 1285 sub $0x20,%r8 1286 movdqa 0x10(%rdx),%xmm3 1287 movdqa 
0x20(%rdx),%xmm0 1288 add $0x20,%rdx 1289 1290 psrldq $0xc,%xmm1 1291 movdqa %xmm3,%xmm2 1292 pslldq $0x4,%xmm3 1293 por %xmm1,%xmm3 1294 1295 psrldq $0xc,%xmm2 1296 movdqa %xmm0,%xmm1 1297 pslldq $0x4,%xmm0 1298 por %xmm2,%xmm0 1299 movdqa %xmm3,(%rcx) 1300 movdqa %xmm0,0x10(%rcx) 1301 1302 add $0x20,%rcx 1303 cmp $0x20,%r8 1304 jge L(movdqa12) 1305 jmp L(movdqa_epi) 1306 1307 .balign 16 1308L(movdqa13): 1309 sub $0x20,%r8 1310 movdqa 0x10(%rdx),%xmm3 1311 movdqa 0x20(%rdx),%xmm0 1312 add $0x20,%rdx 1313 1314 psrldq $0xd,%xmm1 1315 movdqa %xmm3,%xmm2 1316 pslldq $0x3,%xmm3 1317 por %xmm1,%xmm3 1318 1319 psrldq $0xd,%xmm2 1320 movdqa %xmm0,%xmm1 1321 pslldq $0x3,%xmm0 1322 por %xmm2,%xmm0 1323 movdqa %xmm3,(%rcx) 1324 movdqa %xmm0,0x10(%rcx) 1325 1326 add $0x20,%rcx 1327 cmp $0x20,%r8 1328 jge L(movdqa13) 1329 jmp L(movdqa_epi) 1330 1331 .balign 16 1332L(movdqa14): 1333 sub $0x20,%r8 1334 movdqa 0x10(%rdx),%xmm3 1335 movdqa 0x20(%rdx),%xmm0 1336 add $0x20,%rdx 1337 1338 psrldq $0xe,%xmm1 1339 movdqa %xmm3,%xmm2 1340 pslldq $0x2,%xmm3 1341 por %xmm1,%xmm3 1342 1343 psrldq $0xe,%xmm2 1344 movdqa %xmm0,%xmm1 1345 pslldq $0x2,%xmm0 1346 por %xmm2,%xmm0 1347 movdqa %xmm3,(%rcx) 1348 movdqa %xmm0,0x10(%rcx) 1349 1350 add $0x20,%rcx 1351 cmp $0x20,%r8 1352 jge L(movdqa14) 1353 jmp L(movdqa_epi) 1354 1355 .balign 16 1356L(movdqa15): 1357 sub $0x20,%r8 1358 movdqa 0x10(%rdx),%xmm3 1359 movdqa 0x20(%rdx),%xmm0 1360 add $0x20,%rdx 1361 1362 psrldq $0xf,%xmm1 1363 movdqa %xmm3,%xmm2 1364 pslldq $0x1,%xmm3 1365 por %xmm1,%xmm3 1366 1367 psrldq $0xf,%xmm2 1368 movdqa %xmm0,%xmm1 1369 pslldq $0x1,%xmm0 1370 por %xmm2,%xmm0 1371 movdqa %xmm3,(%rcx) 1372 movdqa %xmm0,0x10(%rcx) 1373 1374 add $0x20,%rcx 1375 cmp $0x20,%r8 1376 jge L(movdqa15) 1377 #jmp L(movdqa_epi) 1378 1379 .balign 16 1380L(movdqa_epi): 1381 lea L(fwdPxQx)(%rip),%r10 1382 add %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop) 1383 add %r8,%rcx 1384 add %r8,%rdx 1385 1386 movslq (%r10,%r8,4),%r9 1387 lea (%r9,%r10,1),%r10 1388 jmpq *%r10 1389 1390 .balign 16 1391L(mov3dqa1): 1392 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1393 sub $0x30,%r8 1394 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1395 movdqa 0x30(%rdx),%xmm5 # load the upper source buffer 1396 lea 0x30(%rdx),%rdx 1397 cmp $0x30,%r8 1398 1399 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1400 #palignr $0x1,%xmm1,%xmm3 1401 .byte 0x66,0x0f,0x3a,0x0f 1402 .byte 0xd9,0x01 1403 movdqa %xmm3,(%rcx) # store it 1404 1405 movdqa %xmm0,%xmm4 # store off xmm reg for use next iteration 1406 #palignr $0x1,%xmm2,%xmm0 1407 .byte 0x66,0x0f,0x3a,0x0f 1408 .byte 0xc2,0x01 1409 movdqa %xmm0,0x10(%rcx) # store it 1410 1411 movdqa %xmm5,%xmm1 # store off xmm reg for use next iteration 1412 #palignr $0x1,%xmm4,%xmm5 1413 .byte 0x66,0x0f,0x3a,0x0f 1414 .byte 0xec,0x01 1415 movdqa %xmm5,0x20(%rcx) # store it 1416 1417 lea 0x30(%rcx),%rcx 1418 jge L(mov3dqa1) 1419 1420 cmp $0x10,%r8 1421 jl L(movdqa_epi) 1422 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1423 sub $0x10,%r8 1424 lea 0x10(%rdx),%rdx 1425 movdqa %xmm3,%xmm2 # save for use next concat 1426 #palignr $0x1,%xmm1,%xmm3 1427 .byte 0x66,0x0f,0x3a,0x0f 1428 .byte 0xd9,0x01 1429 1430 cmp $0x10,%r8 1431 movdqa %xmm3,(%rcx) # store it 1432 lea 0x10(%rcx),%rcx 1433 jl L(movdqa_epi) 1434 1435 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1436 sub $0x10,%r8 1437 lea 0x10(%rdx),%rdx 1438 #palignr $0x1,%xmm2,%xmm0 1439 .byte 0x66,0x0f,0x3a,0x0f 1440 .byte 0xc2,0x01 1441 
movdqa %xmm0,(%rcx) # store it 1442 lea 0x10(%rcx),%rcx 1443 jmp L(movdqa_epi) 1444 1445 .balign 16 1446L(mov3dqa2): 1447 movdqa 0x10(%rdx),%xmm3 1448 sub $0x30,%r8 1449 movdqa 0x20(%rdx),%xmm0 1450 movdqa 0x30(%rdx),%xmm5 1451 lea 0x30(%rdx),%rdx 1452 cmp $0x30,%r8 1453 1454 movdqa %xmm3,%xmm2 1455 #palignr $0x2,%xmm1,%xmm3 1456 .byte 0x66,0x0f,0x3a,0x0f 1457 .byte 0xd9,0x02 1458 movdqa %xmm3,(%rcx) 1459 1460 movdqa %xmm0,%xmm4 1461 #palignr $0x2,%xmm2,%xmm0 1462 .byte 0x66,0x0f,0x3a,0x0f 1463 .byte 0xc2,0x02 1464 movdqa %xmm0,0x10(%rcx) 1465 1466 movdqa %xmm5,%xmm1 1467 #palignr $0x2,%xmm4,%xmm5 1468 .byte 0x66,0x0f,0x3a,0x0f 1469 .byte 0xec,0x02 1470 movdqa %xmm5,0x20(%rcx) 1471 1472 lea 0x30(%rcx),%rcx 1473 jge L(mov3dqa2) 1474 1475 cmp $0x10,%r8 1476 jl L(movdqa_epi) 1477 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1478 sub $0x10,%r8 1479 lea 0x10(%rdx),%rdx 1480 movdqa %xmm3,%xmm2 # save for use next concat 1481 #palignr $0x2,%xmm1,%xmm3 1482 .byte 0x66,0x0f,0x3a,0x0f 1483 .byte 0xd9,0x02 1484 1485 cmp $0x10,%r8 1486 movdqa %xmm3,(%rcx) # store it 1487 lea 0x10(%rcx),%rcx 1488 jl L(movdqa_epi) 1489 1490 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1491 sub $0x10,%r8 1492 lea 0x10(%rdx),%rdx 1493 #palignr $0x2,%xmm2,%xmm0 1494 .byte 0x66,0x0f,0x3a,0x0f 1495 .byte 0xc2,0x02 1496 movdqa %xmm0,(%rcx) # store it 1497 lea 0x10(%rcx),%rcx 1498 jmp L(movdqa_epi) 1499 1500 .balign 16 1501L(mov3dqa3): 1502 movdqa 0x10(%rdx),%xmm3 1503 sub $0x30,%r8 1504 movdqa 0x20(%rdx),%xmm0 1505 movdqa 0x30(%rdx),%xmm5 1506 lea 0x30(%rdx),%rdx 1507 cmp $0x30,%r8 1508 1509 movdqa %xmm3,%xmm2 1510 #palignr $0x3,%xmm1,%xmm3 1511 .byte 0x66,0x0f,0x3a,0x0f 1512 .byte 0xd9,0x03 1513 movdqa %xmm3,(%rcx) 1514 1515 movdqa %xmm0,%xmm4 1516 #palignr $0x3,%xmm2,%xmm0 1517 .byte 0x66,0x0f,0x3a,0x0f 1518 .byte 0xc2,0x03 1519 movdqa %xmm0,0x10(%rcx) 1520 1521 movdqa %xmm5,%xmm1 1522 #palignr $0x3,%xmm4,%xmm5 1523 .byte 0x66,0x0f,0x3a,0x0f 1524 .byte 0xec,0x03 1525 movdqa %xmm5,0x20(%rcx) 1526 1527 lea 0x30(%rcx),%rcx 1528 jge L(mov3dqa3) 1529 1530 cmp $0x10,%r8 1531 jl L(movdqa_epi) 1532 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1533 sub $0x10,%r8 1534 lea 0x10(%rdx),%rdx 1535 movdqa %xmm3,%xmm2 # save for use next concat 1536 #palignr $0x3,%xmm1,%xmm3 1537 .byte 0x66,0x0f,0x3a,0x0f 1538 .byte 0xd9,0x03 1539 1540 cmp $0x10,%r8 1541 movdqa %xmm3,(%rcx) # store it 1542 lea 0x10(%rcx),%rcx 1543 jl L(movdqa_epi) 1544 1545 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1546 sub $0x10,%r8 1547 lea 0x10(%rdx),%rdx 1548 #palignr $0x3,%xmm2,%xmm0 1549 .byte 0x66,0x0f,0x3a,0x0f 1550 .byte 0xc2,0x03 1551 movdqa %xmm0,(%rcx) # store it 1552 lea 0x10(%rcx),%rcx 1553 jmp L(movdqa_epi) 1554 1555 .balign 16 1556L(mov3dqa4): 1557 movdqa 0x10(%rdx),%xmm3 1558 sub $0x30,%r8 1559 movdqa 0x20(%rdx),%xmm0 1560 movdqa 0x30(%rdx),%xmm5 1561 lea 0x30(%rdx),%rdx 1562 cmp $0x30,%r8 1563 1564 movdqa %xmm3,%xmm2 1565 #palignr $0x4,%xmm1,%xmm3 1566 .byte 0x66,0x0f,0x3a,0x0f 1567 .byte 0xd9,0x04 1568 movdqa %xmm3,(%rcx) 1569 1570 movdqa %xmm0,%xmm4 1571 #palignr $0x4,%xmm2,%xmm0 1572 .byte 0x66,0x0f,0x3a,0x0f 1573 .byte 0xc2,0x04 1574 movdqa %xmm0,0x10(%rcx) 1575 1576 movdqa %xmm5,%xmm1 1577 #palignr $0x4,%xmm4,%xmm5 1578 .byte 0x66,0x0f,0x3a,0x0f 1579 .byte 0xec,0x04 1580 movdqa %xmm5,0x20(%rcx) 1581 1582 lea 0x30(%rcx),%rcx 1583 jge L(mov3dqa4) 1584 1585 cmp $0x10,%r8 1586 jl L(movdqa_epi) 1587 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1588 sub $0x10,%r8 1589 lea 0x10(%rdx),%rdx 1590 movdqa 
%xmm3,%xmm2 # save for use next concat 1591 #palignr $0x4,%xmm1,%xmm3 1592 .byte 0x66,0x0f,0x3a,0x0f 1593 .byte 0xd9,0x04 1594 1595 cmp $0x10,%r8 1596 movdqa %xmm3,(%rcx) # store it 1597 lea 0x10(%rcx),%rcx 1598 jl L(movdqa_epi) 1599 1600 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1601 sub $0x10,%r8 1602 lea 0x10(%rdx),%rdx 1603 #palignr $0x4,%xmm2,%xmm0 1604 .byte 0x66,0x0f,0x3a,0x0f 1605 .byte 0xc2,0x04 1606 movdqa %xmm0,(%rcx) # store it 1607 lea 0x10(%rcx),%rcx 1608 jmp L(movdqa_epi) 1609 1610 .balign 16 1611L(mov3dqa5): 1612 movdqa 0x10(%rdx),%xmm3 1613 sub $0x30,%r8 1614 movdqa 0x20(%rdx),%xmm0 1615 movdqa 0x30(%rdx),%xmm5 1616 lea 0x30(%rdx),%rdx 1617 cmp $0x30,%r8 1618 1619 movdqa %xmm3,%xmm2 1620 #palignr $0x5,%xmm1,%xmm3 1621 .byte 0x66,0x0f,0x3a,0x0f 1622 .byte 0xd9,0x05 1623 movdqa %xmm3,(%rcx) 1624 1625 movdqa %xmm0,%xmm4 1626 #palignr $0x5,%xmm2,%xmm0 1627 .byte 0x66,0x0f,0x3a,0x0f 1628 .byte 0xc2,0x05 1629 movdqa %xmm0,0x10(%rcx) 1630 1631 movdqa %xmm5,%xmm1 1632 #palignr $0x5,%xmm4,%xmm5 1633 .byte 0x66,0x0f,0x3a,0x0f 1634 .byte 0xec,0x05 1635 movdqa %xmm5,0x20(%rcx) 1636 1637 lea 0x30(%rcx),%rcx 1638 jge L(mov3dqa5) 1639 1640 cmp $0x10,%r8 1641 jl L(movdqa_epi) 1642 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1643 sub $0x10,%r8 1644 lea 0x10(%rdx),%rdx 1645 movdqa %xmm3,%xmm2 # save for use next concat 1646 #palignr $0x5,%xmm1,%xmm3 1647 .byte 0x66,0x0f,0x3a,0x0f 1648 .byte 0xd9,0x05 1649 1650 cmp $0x10,%r8 1651 movdqa %xmm3,(%rcx) # store it 1652 lea 0x10(%rcx),%rcx 1653 jl L(movdqa_epi) 1654 1655 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1656 sub $0x10,%r8 1657 lea 0x10(%rdx),%rdx 1658 #palignr $0x5,%xmm2,%xmm0 1659 .byte 0x66,0x0f,0x3a,0x0f 1660 .byte 0xc2,0x05 1661 movdqa %xmm0,(%rcx) # store it 1662 lea 0x10(%rcx),%rcx 1663 jmp L(movdqa_epi) 1664 1665 .balign 16 1666L(mov3dqa6): 1667 movdqa 0x10(%rdx),%xmm3 1668 sub $0x30,%r8 1669 movdqa 0x20(%rdx),%xmm0 1670 movdqa 0x30(%rdx),%xmm5 1671 lea 0x30(%rdx),%rdx 1672 cmp $0x30,%r8 1673 1674 movdqa %xmm3,%xmm2 1675 #palignr $0x6,%xmm1,%xmm3 1676 .byte 0x66,0x0f,0x3a,0x0f 1677 .byte 0xd9,0x06 1678 movdqa %xmm3,(%rcx) 1679 1680 movdqa %xmm0,%xmm4 1681 #palignr $0x6,%xmm2,%xmm0 1682 .byte 0x66,0x0f,0x3a,0x0f 1683 .byte 0xc2,0x06 1684 movdqa %xmm0,0x10(%rcx) 1685 1686 movdqa %xmm5,%xmm1 1687 #palignr $0x6,%xmm4,%xmm5 1688 .byte 0x66,0x0f,0x3a,0x0f 1689 .byte 0xec,0x06 1690 movdqa %xmm5,0x20(%rcx) 1691 1692 lea 0x30(%rcx),%rcx 1693 jge L(mov3dqa6) 1694 1695 cmp $0x10,%r8 1696 jl L(movdqa_epi) 1697 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1698 sub $0x10,%r8 1699 lea 0x10(%rdx),%rdx 1700 movdqa %xmm3,%xmm2 # save for use next concat 1701 #palignr $0x6,%xmm1,%xmm3 1702 .byte 0x66,0x0f,0x3a,0x0f 1703 .byte 0xd9,0x06 1704 1705 cmp $0x10,%r8 1706 movdqa %xmm3,(%rcx) # store it 1707 lea 0x10(%rcx),%rcx 1708 jl L(movdqa_epi) 1709 1710 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1711 sub $0x10,%r8 1712 lea 0x10(%rdx),%rdx 1713 #palignr $0x6,%xmm2,%xmm0 1714 .byte 0x66,0x0f,0x3a,0x0f 1715 .byte 0xc2,0x06 1716 movdqa %xmm0,(%rcx) # store it 1717 lea 0x10(%rcx),%rcx 1718 jmp L(movdqa_epi) 1719 1720 .balign 16 1721L(mov3dqa7): 1722 movdqa 0x10(%rdx),%xmm3 1723 sub $0x30,%r8 1724 movdqa 0x20(%rdx),%xmm0 1725 movdqa 0x30(%rdx),%xmm5 1726 lea 0x30(%rdx),%rdx 1727 cmp $0x30,%r8 1728 1729 movdqa %xmm3,%xmm2 1730 #palignr $0x7,%xmm1,%xmm3 1731 .byte 0x66,0x0f,0x3a,0x0f 1732 .byte 0xd9,0x07 1733 movdqa %xmm3,(%rcx) 1734 1735 movdqa %xmm0,%xmm4 1736 #palignr $0x7,%xmm2,%xmm0 1737 .byte 
0x66,0x0f,0x3a,0x0f 1738 .byte 0xc2,0x07 1739 movdqa %xmm0,0x10(%rcx) 1740 1741 movdqa %xmm5,%xmm1 1742 #palignr $0x7,%xmm4,%xmm5 1743 .byte 0x66,0x0f,0x3a,0x0f 1744 .byte 0xec,0x07 1745 movdqa %xmm5,0x20(%rcx) 1746 1747 lea 0x30(%rcx),%rcx 1748 jge L(mov3dqa7) 1749 1750 cmp $0x10,%r8 1751 jl L(movdqa_epi) 1752 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1753 sub $0x10,%r8 1754 lea 0x10(%rdx),%rdx 1755 movdqa %xmm3,%xmm2 # save for use next concat 1756 #palignr $0x7,%xmm1,%xmm3 1757 .byte 0x66,0x0f,0x3a,0x0f 1758 .byte 0xd9,0x07 1759 1760 cmp $0x10,%r8 1761 movdqa %xmm3,(%rcx) # store it 1762 lea 0x10(%rcx),%rcx 1763 jl L(movdqa_epi) 1764 1765 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1766 sub $0x10,%r8 1767 lea 0x10(%rdx),%rdx 1768 #palignr $0x7,%xmm2,%xmm0 1769 .byte 0x66,0x0f,0x3a,0x0f 1770 .byte 0xc2,0x07 1771 movdqa %xmm0,(%rcx) # store it 1772 lea 0x10(%rcx),%rcx 1773 jmp L(movdqa_epi) 1774 1775 .balign 16 1776L(mov3dqa9): 1777 movdqa 0x10(%rdx),%xmm3 1778 sub $0x30,%r8 1779 movdqa 0x20(%rdx),%xmm0 1780 movdqa 0x30(%rdx),%xmm5 1781 lea 0x30(%rdx),%rdx 1782 cmp $0x30,%r8 1783 1784 movdqa %xmm3,%xmm2 1785 #palignr $0x9,%xmm1,%xmm3 1786 .byte 0x66,0x0f,0x3a,0x0f 1787 .byte 0xd9,0x09 1788 movdqa %xmm3,(%rcx) 1789 1790 movdqa %xmm0,%xmm4 1791 #palignr $0x9,%xmm2,%xmm0 1792 .byte 0x66,0x0f,0x3a,0x0f 1793 .byte 0xc2,0x09 1794 movdqa %xmm0,0x10(%rcx) 1795 1796 movdqa %xmm5,%xmm1 1797 #palignr $0x9,%xmm4,%xmm5 1798 .byte 0x66,0x0f,0x3a,0x0f 1799 .byte 0xec,0x09 1800 movdqa %xmm5,0x20(%rcx) 1801 1802 lea 0x30(%rcx),%rcx 1803 jge L(mov3dqa9) 1804 1805 cmp $0x10,%r8 1806 jl L(movdqa_epi) 1807 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1808 sub $0x10,%r8 1809 lea 0x10(%rdx),%rdx 1810 movdqa %xmm3,%xmm2 # save for use next concat 1811 #palignr $0x9,%xmm1,%xmm3 1812 .byte 0x66,0x0f,0x3a,0x0f 1813 .byte 0xd9,0x09 1814 1815 cmp $0x10,%r8 1816 movdqa %xmm3,(%rcx) # store it 1817 lea 0x10(%rcx),%rcx 1818 jl L(movdqa_epi) 1819 1820 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1821 sub $0x10,%r8 1822 lea 0x10(%rdx),%rdx 1823 #palignr $0x9,%xmm2,%xmm0 1824 .byte 0x66,0x0f,0x3a,0x0f 1825 .byte 0xc2,0x09 1826 movdqa %xmm0,(%rcx) # store it 1827 lea 0x10(%rcx),%rcx 1828 jmp L(movdqa_epi) 1829 1830 .balign 16 1831L(mov3dqa10): 1832 movdqa 0x10(%rdx),%xmm3 1833 sub $0x30,%r8 1834 movdqa 0x20(%rdx),%xmm0 1835 movdqa 0x30(%rdx),%xmm5 1836 lea 0x30(%rdx),%rdx 1837 cmp $0x30,%r8 1838 1839 movdqa %xmm3,%xmm2 1840 #palignr $0xa,%xmm1,%xmm3 1841 .byte 0x66,0x0f,0x3a,0x0f 1842 .byte 0xd9,0x0a 1843 movdqa %xmm3,(%rcx) 1844 1845 movdqa %xmm0,%xmm4 1846 #palignr $0xa,%xmm2,%xmm0 1847 .byte 0x66,0x0f,0x3a,0x0f 1848 .byte 0xc2,0x0a 1849 movdqa %xmm0,0x10(%rcx) 1850 1851 movdqa %xmm5,%xmm1 1852 #palignr $0xa,%xmm4,%xmm5 1853 .byte 0x66,0x0f,0x3a,0x0f 1854 .byte 0xec,0x0a 1855 movdqa %xmm5,0x20(%rcx) 1856 1857 lea 0x30(%rcx),%rcx 1858 jge L(mov3dqa10) 1859 1860 cmp $0x10,%r8 1861 jl L(movdqa_epi) 1862 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1863 sub $0x10,%r8 1864 lea 0x10(%rdx),%rdx 1865 movdqa %xmm3,%xmm2 # save for use next concat 1866 #palignr $0xa,%xmm1,%xmm3 1867 .byte 0x66,0x0f,0x3a,0x0f 1868 .byte 0xd9,0x0a 1869 1870 cmp $0x10,%r8 1871 movdqa %xmm3,(%rcx) # store it 1872 lea 0x10(%rcx),%rcx 1873 jl L(movdqa_epi) 1874 1875 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1876 sub $0x10,%r8 1877 lea 0x10(%rdx),%rdx 1878 #palignr $0xa,%xmm2,%xmm0 1879 .byte 0x66,0x0f,0x3a,0x0f 1880 .byte 0xc2,0x0a 1881 movdqa %xmm0,(%rcx) # store it 1882 lea 
0x10(%rcx),%rcx 1883 jmp L(movdqa_epi) 1884 1885 .balign 16 1886L(mov3dqa11): 1887 movdqa 0x10(%rdx),%xmm3 1888 sub $0x30,%r8 1889 movdqa 0x20(%rdx),%xmm0 1890 movdqa 0x30(%rdx),%xmm5 1891 lea 0x30(%rdx),%rdx 1892 cmp $0x30,%r8 1893 1894 movdqa %xmm3,%xmm2 1895 #palignr $0xb,%xmm1,%xmm3 1896 .byte 0x66,0x0f,0x3a,0x0f 1897 .byte 0xd9,0x0b 1898 movdqa %xmm3,(%rcx) 1899 1900 movdqa %xmm0,%xmm4 1901 #palignr $0xb,%xmm2,%xmm0 1902 .byte 0x66,0x0f,0x3a,0x0f 1903 .byte 0xc2,0x0b 1904 movdqa %xmm0,0x10(%rcx) 1905 1906 movdqa %xmm5,%xmm1 1907 #palignr $0xb,%xmm4,%xmm5 1908 .byte 0x66,0x0f,0x3a,0x0f 1909 .byte 0xec,0x0b 1910 movdqa %xmm5,0x20(%rcx) 1911 1912 lea 0x30(%rcx),%rcx 1913 jge L(mov3dqa11) 1914 1915 cmp $0x10,%r8 1916 jl L(movdqa_epi) 1917 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1918 sub $0x10,%r8 1919 lea 0x10(%rdx),%rdx 1920 movdqa %xmm3,%xmm2 # save for use next concat 1921 #palignr $0xb,%xmm1,%xmm3 1922 .byte 0x66,0x0f,0x3a,0x0f 1923 .byte 0xd9,0x0b 1924 1925 cmp $0x10,%r8 1926 movdqa %xmm3,(%rcx) # store it 1927 lea 0x10(%rcx),%rcx 1928 jl L(movdqa_epi) 1929 1930 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1931 sub $0x10,%r8 1932 lea 0x10(%rdx),%rdx 1933 #palignr $0xb,%xmm2,%xmm0 1934 .byte 0x66,0x0f,0x3a,0x0f 1935 .byte 0xc2,0x0b 1936 movdqa %xmm0,(%rcx) # store it 1937 lea 0x10(%rcx),%rcx 1938 jmp L(movdqa_epi) 1939 1940 .balign 16 1941L(mov3dqa12): 1942 movdqa 0x10(%rdx),%xmm3 1943 sub $0x30,%r8 1944 movdqa 0x20(%rdx),%xmm0 1945 movdqa 0x30(%rdx),%xmm5 1946 lea 0x30(%rdx),%rdx 1947 cmp $0x30,%r8 1948 1949 movdqa %xmm3,%xmm2 1950 #palignr $0xc,%xmm1,%xmm3 1951 .byte 0x66,0x0f,0x3a,0x0f 1952 .byte 0xd9,0x0c 1953 movdqa %xmm3,(%rcx) 1954 1955 movdqa %xmm0,%xmm4 1956 #palignr $0xc,%xmm2,%xmm0 1957 .byte 0x66,0x0f,0x3a,0x0f 1958 .byte 0xc2,0x0c 1959 movdqa %xmm0,0x10(%rcx) 1960 1961 movdqa %xmm5,%xmm1 1962 #palignr $0xc,%xmm4,%xmm5 1963 .byte 0x66,0x0f,0x3a,0x0f 1964 .byte 0xec,0x0c 1965 movdqa %xmm5,0x20(%rcx) 1966 1967 lea 0x30(%rcx),%rcx 1968 jge L(mov3dqa12) 1969 1970 cmp $0x10,%r8 1971 jl L(movdqa_epi) 1972 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1973 sub $0x10,%r8 1974 lea 0x10(%rdx),%rdx 1975 movdqa %xmm3,%xmm2 # save for use next concat 1976 #palignr $0xc,%xmm1,%xmm3 1977 .byte 0x66,0x0f,0x3a,0x0f 1978 .byte 0xd9,0x0c 1979 1980 cmp $0x10,%r8 1981 movdqa %xmm3,(%rcx) # store it 1982 lea 0x10(%rcx),%rcx 1983 jl L(movdqa_epi) 1984 1985 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1986 sub $0x10,%r8 1987 lea 0x10(%rdx),%rdx 1988 #palignr $0xc,%xmm2,%xmm0 1989 .byte 0x66,0x0f,0x3a,0x0f 1990 .byte 0xc2,0x0c 1991 movdqa %xmm0,(%rcx) # store it 1992 lea 0x10(%rcx),%rcx 1993 jmp L(movdqa_epi) 1994 1995 .balign 16 1996L(mov3dqa13): 1997 movdqa 0x10(%rdx),%xmm3 1998 sub $0x30,%r8 1999 movdqa 0x20(%rdx),%xmm0 2000 movdqa 0x30(%rdx),%xmm5 2001 lea 0x30(%rdx),%rdx 2002 cmp $0x30,%r8 2003 2004 movdqa %xmm3,%xmm2 2005 #palignr $0xd,%xmm1,%xmm3 2006 .byte 0x66,0x0f,0x3a,0x0f 2007 .byte 0xd9,0x0d 2008 movdqa %xmm3,(%rcx) 2009 2010 movdqa %xmm0,%xmm4 2011 #palignr $0xd,%xmm2,%xmm0 2012 .byte 0x66,0x0f,0x3a,0x0f 2013 .byte 0xc2,0x0d 2014 movdqa %xmm0,0x10(%rcx) 2015 2016 movdqa %xmm5,%xmm1 2017 #palignr $0xd,%xmm4,%xmm5 2018 .byte 0x66,0x0f,0x3a,0x0f 2019 .byte 0xec,0x0d 2020 movdqa %xmm5,0x20(%rcx) 2021 2022 lea 0x30(%rcx),%rcx 2023 jge L(mov3dqa13) 2024 2025 cmp $0x10,%r8 2026 jl L(movdqa_epi) 2027 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2028 sub $0x10,%r8 2029 lea 0x10(%rdx),%rdx 2030 movdqa %xmm3,%xmm2 # save for use next concat 
2031 #palignr $0xd,%xmm1,%xmm3 2032 .byte 0x66,0x0f,0x3a,0x0f 2033 .byte 0xd9,0x0d 2034 2035 cmp $0x10,%r8 2036 movdqa %xmm3,(%rcx) # store it 2037 lea 0x10(%rcx),%rcx 2038 jl L(movdqa_epi) 2039 2040 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2041 sub $0x10,%r8 2042 lea 0x10(%rdx),%rdx 2043 #palignr $0xd,%xmm2,%xmm0 2044 .byte 0x66,0x0f,0x3a,0x0f 2045 .byte 0xc2,0x0d 2046 movdqa %xmm0,(%rcx) # store it 2047 lea 0x10(%rcx),%rcx 2048 jmp L(movdqa_epi) 2049 2050 .balign 16 2051L(mov3dqa14): 2052 movdqa 0x10(%rdx),%xmm3 2053 sub $0x30,%r8 2054 movdqa 0x20(%rdx),%xmm0 2055 movdqa 0x30(%rdx),%xmm5 2056 lea 0x30(%rdx),%rdx 2057 cmp $0x30,%r8 2058 2059 movdqa %xmm3,%xmm2 2060 #palignr $0xe,%xmm1,%xmm3 2061 .byte 0x66,0x0f,0x3a,0x0f 2062 .byte 0xd9,0x0e 2063 movdqa %xmm3,(%rcx) 2064 2065 movdqa %xmm0,%xmm4 2066 #palignr $0xe,%xmm2,%xmm0 2067 .byte 0x66,0x0f,0x3a,0x0f 2068 .byte 0xc2,0x0e 2069 movdqa %xmm0,0x10(%rcx) 2070 2071 movdqa %xmm5,%xmm1 2072 #palignr $0xe,%xmm4,%xmm5 2073 .byte 0x66,0x0f,0x3a,0x0f 2074 .byte 0xec,0x0e 2075 movdqa %xmm5,0x20(%rcx) 2076 2077 lea 0x30(%rcx),%rcx 2078 jge L(mov3dqa14) 2079 2080 cmp $0x10,%r8 2081 jl L(movdqa_epi) 2082 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2083 sub $0x10,%r8 2084 lea 0x10(%rdx),%rdx 2085 movdqa %xmm3,%xmm2 # save for use next concat 2086 #palignr $0xe,%xmm1,%xmm3 2087 .byte 0x66,0x0f,0x3a,0x0f 2088 .byte 0xd9,0x0e 2089 2090 cmp $0x10,%r8 2091 movdqa %xmm3,(%rcx) # store it 2092 lea 0x10(%rcx),%rcx 2093 jl L(movdqa_epi) 2094 2095 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2096 sub $0x10,%r8 2097 lea 0x10(%rdx),%rdx 2098 #palignr $0xe,%xmm2,%xmm0 2099 .byte 0x66,0x0f,0x3a,0x0f 2100 .byte 0xc2,0x0e 2101 movdqa %xmm0,(%rcx) # store it 2102 lea 0x10(%rcx),%rcx 2103 jmp L(movdqa_epi) 2104 2105 .balign 16 2106L(mov3dqa15): 2107 movdqa 0x10(%rdx),%xmm3 2108 sub $0x30,%r8 2109 movdqa 0x20(%rdx),%xmm0 2110 movdqa 0x30(%rdx),%xmm5 2111 lea 0x30(%rdx),%rdx 2112 cmp $0x30,%r8 2113 2114 movdqa %xmm3,%xmm2 2115 #palignr $0xf,%xmm1,%xmm3 2116 .byte 0x66,0x0f,0x3a,0x0f 2117 .byte 0xd9,0x0f 2118 movdqa %xmm3,(%rcx) 2119 2120 movdqa %xmm0,%xmm4 2121 #palignr $0xf,%xmm2,%xmm0 2122 .byte 0x66,0x0f,0x3a,0x0f 2123 .byte 0xc2,0x0f 2124 movdqa %xmm0,0x10(%rcx) 2125 2126 movdqa %xmm5,%xmm1 2127 #palignr $0xf,%xmm4,%xmm5 2128 .byte 0x66,0x0f,0x3a,0x0f 2129 .byte 0xec,0x0f 2130 movdqa %xmm5,0x20(%rcx) 2131 2132 lea 0x30(%rcx),%rcx 2133 jge L(mov3dqa15) 2134 2135 cmp $0x10,%r8 2136 jl L(movdqa_epi) 2137 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2138 sub $0x10,%r8 2139 lea 0x10(%rdx),%rdx 2140 movdqa %xmm3,%xmm2 # save for use next concat 2141 #palignr $0xf,%xmm1,%xmm3 2142 .byte 0x66,0x0f,0x3a,0x0f 2143 .byte 0xd9,0x0f 2144 2145 cmp $0x10,%r8 2146 movdqa %xmm3,(%rcx) # store it 2147 lea 0x10(%rcx),%rcx 2148 jl L(movdqa_epi) 2149 2150 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2151 sub $0x10,%r8 2152 lea 0x10(%rdx),%rdx 2153 #palignr $0xf,%xmm2,%xmm0 2154 .byte 0x66,0x0f,0x3a,0x0f 2155 .byte 0xc2,0x0f 2156 movdqa %xmm0,(%rcx) # store it 2157 lea 0x10(%rcx),%rcx 2158 jmp L(movdqa_epi) 2159 2160 .balign 16 2161L(sse2_nt_move): 2162 lea 0x40(%rcx),%rcx 2163 lea 0x40(%rdx),%rdx 2164 lea -0x40(%r8),%r8 2165 2166 /* 2167 * doesn't matter if source is aligned for stuff out of cache. 2168 * the mis-aligned penalty is masked by the slowness of main memory. 
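 *
 * A minimal intrinsics sketch of this non-temporal loop (illustrative
 * only; assumes the destination is 16-byte aligned, as the code above
 * guarantees, and leaves the sub-64-byte tail to the caller):
 *
 *	#include <stddef.h>
 *	#include <emmintrin.h>
 *
 *	static void
 *	nt_copy(char *dst, const char *src, size_t len)
 *	{
 *		for (; len >= 64; len -= 64, src += 64, dst += 64) {
 *			_mm_prefetch(src + 0x180, _MM_HINT_NTA);
 *			_mm_stream_si128((__m128i *)(dst +  0),
 *			    _mm_loadu_si128((const __m128i *)(src +  0)));
 *			_mm_stream_si128((__m128i *)(dst + 16),
 *			    _mm_loadu_si128((const __m128i *)(src + 16)));
 *			_mm_stream_si128((__m128i *)(dst + 32),
 *			    _mm_loadu_si128((const __m128i *)(src + 32)));
 *			_mm_stream_si128((__m128i *)(dst + 48),
 *			    _mm_loadu_si128((const __m128i *)(src + 48)));
 *		}
 *		_mm_sfence();	// order the streaming stores before returning
 *	}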
2169 */ 2170 prefetchnta 0x180(%rdx) 2171 movdqu -0x40(%rdx),%xmm0 2172 movdqu -0x30(%rdx),%xmm1 2173 2174 cmp $0x40,%r8 2175 movntdq %xmm0,-0x40(%rcx) 2176 movntdq %xmm1,-0x30(%rcx) 2177 2178 movdqu -0x20(%rdx),%xmm2 2179 movdqu -0x10(%rdx),%xmm3 2180 2181 movntdq %xmm2,-0x20(%rcx) 2182 movntdq %xmm3,-0x10(%rcx) 2183 2184 jge L(sse2_nt_move) 2185 2186 lea L(Fix16EndTable)(%rip),%r10 2187 mov %r8,%r9 2188 and $0xFFFFFFFFFFFFFFF0,%r9 2189 add %r9,%rcx 2190 add %r9,%rdx 2191 sub %r9,%r8 2192 shr $0x4,%r9 2193 sfence 2194 2195 movslq (%r10,%r9,4),%r11 2196 lea (%r11,%r10,1),%r10 2197 jmpq *%r10 2198 2199 .balign 16 2200L(Fix16EndTable): 2201 .int L(fix16_0)-L(Fix16EndTable) 2202 .int L(fix16_1)-L(Fix16EndTable) 2203 .int L(fix16_2)-L(Fix16EndTable) 2204 .int L(fix16_3)-L(Fix16EndTable) 2205 2206 .balign 16 2207L(fix16_3): 2208 movdqu -0x30(%rdx),%xmm1 2209 movdqa %xmm1,-0x30(%rcx) 2210L(fix16_2): 2211 movdqu -0x20(%rdx),%xmm2 2212 movdqa %xmm2,-0x20(%rcx) 2213L(fix16_1): 2214 movdqu -0x10(%rdx),%xmm3 2215 movdqa %xmm3,-0x10(%rcx) 2216L(fix16_0): 2217 lea L(fwdPxQx)(%rip),%r10 2218 add %r8,%rdx 2219 add %r8,%rcx 2220 2221 movslq (%r10,%r8,4),%r9 2222 lea (%r9,%r10,1),%r10 2223 jmpq *%r10 2224 2225 .balign 16 2226L(pre_both_aligned): 2227 cmp $0x80,%r8 2228 jl L(fix_16b) 2229 2230 .balign 16 2231L(both_aligned): 2232 2233 /* 2234 * this 'paired' load/load/store/store seems to do best. 2235 */ 2236 movdqa (%rdx),%xmm0 2237 movdqa 0x10(%rdx),%xmm1 2238 2239 movdqa %xmm0,(%rcx) 2240 movdqa %xmm1,0x10(%rcx) 2241 lea -0x80(%r8),%r8 2242 2243 movdqa 0x20(%rdx),%xmm2 2244 movdqa 0x30(%rdx),%xmm3 2245 2246 movdqa %xmm2,0x20(%rcx) 2247 movdqa %xmm3,0x30(%rcx) 2248 2249 movdqa 0x40(%rdx),%xmm0 2250 movdqa 0x50(%rdx),%xmm1 2251 cmp $0x80,%r8 2252 2253 movdqa %xmm0,0x40(%rcx) 2254 movdqa %xmm1,0x50(%rcx) 2255 2256 movdqa 0x60(%rdx),%xmm2 2257 movdqa 0x70(%rdx),%xmm3 2258 lea 0x80(%rdx),%rdx 2259 movdqa %xmm2,0x60(%rcx) 2260 movdqa %xmm3,0x70(%rcx) 2261 lea 0x80(%rcx),%rcx 2262 jge L(both_aligned) 2263 2264L(fix_16b): 2265 add %r8,%rcx 2266 lea L(fwdPxQx)(%rip),%r10 2267 add %r8,%rdx 2268 2269 movslq (%r10,%r8,4),%r9 2270 lea (%r9,%r10,1),%r10 2271 jmpq *%r10 2272 2273 .balign 16 2274L(Loop8byte_pre): 2275 # Use 8-byte moves 2276 mov .largest_level_cache_size(%rip),%r9d 2277 shr %r9 # take half of it 2278 cmp %r9,%r8 2279 jg L(byte8_nt_top) 2280 # Find out whether to use rep movsq 2281 cmp $4096,%r8 2282 jle L(byte8_top) 2283 mov .amd64cache1half(%rip),%r9d # half of l1 cache 2284 cmp %r9,%r8 2285 jle L(use_rep) 2286 2287 .balign 16 2288L(byte8_top): 2289 mov (%rdx),%r9 2290 mov 0x8(%rdx),%r10 2291 lea -0x40(%r8),%r8 2292 mov %r9,(%rcx) 2293 mov %r10,0x8(%rcx) 2294 mov 0x10(%rdx),%r11 2295 mov 0x18(%rdx),%r9 2296 mov %r11,0x10(%rcx) 2297 mov %r9,0x18(%rcx) 2298 2299 cmp $0x40,%r8 2300 mov 0x20(%rdx),%r10 2301 mov 0x28(%rdx),%r11 2302 mov %r10,0x20(%rcx) 2303 mov %r11,0x28(%rcx) 2304 mov 0x30(%rdx),%r9 2305 mov 0x38(%rdx),%r10 2306 lea 0x40(%rdx),%rdx 2307 mov %r9,0x30(%rcx) 2308 mov %r10,0x38(%rcx) 2309 lea 0x40(%rcx),%rcx 2310 jg L(byte8_top) 2311 2312L(byte8_end): 2313 lea L(fwdPxQx)(%rip),%r10 2314 lea (%rdx,%r8,1),%rdx 2315 lea (%rcx,%r8,1),%rcx 2316 2317 movslq (%r10,%r8,4),%r9 2318 lea (%r9,%r10,1),%r10 2319 jmpq *%r10 2320 2321 .balign 16 2322L(use_rep): 2323 mov %rdx,%rsi # %rsi = source 2324 mov %rcx,%rdi # %rdi = destination 2325 mov %r8,%rcx # %rcx = count 2326 shrq $3,%rcx # 8-byte word count 2327 rep 2328 movsq 2329 mov %rsi,%rdx # source 2330 mov %rdi,%rcx # destination 2331 andq $7,%r8 # 
remainder 2332 jnz L(byte8_end) 2333 ret 2334 2335 .balign 16 2336L(byte8_nt_top): 2337 sub $0x40,%r8 2338 prefetchnta 0x180(%rdx) 2339 mov (%rdx),%r9 2340 movnti %r9,(%rcx) 2341 mov 0x8(%rdx),%r10 2342 movnti %r10,0x8(%rcx) 2343 mov 0x10(%rdx),%r11 2344 movnti %r11,0x10(%rcx) 2345 mov 0x18(%rdx),%r9 2346 movnti %r9,0x18(%rcx) 2347 mov 0x20(%rdx),%r10 2348 movnti %r10,0x20(%rcx) 2349 mov 0x28(%rdx),%r11 2350 movnti %r11,0x28(%rcx) 2351 mov 0x30(%rdx),%r9 2352 movnti %r9,0x30(%rcx) 2353 mov 0x38(%rdx),%r10 2354 movnti %r10,0x38(%rcx) 2355 2356 lea 0x40(%rdx),%rdx 2357 lea 0x40(%rcx),%rcx 2358 cmp $0x40,%r8 2359 jge L(byte8_nt_top) 2360 sfence 2361 jmp L(byte8_end) 2362 2363 SET_SIZE(memcpy) 2364 2365 .balign 16 2366L(CopyBackwards): 2367 mov %rdx,%r8 2368 mov %rdi,%rcx 2369 mov %rsi,%rdx 2370 mov %rdi,%rax # return value 2371 2372 # ck alignment of last byte 2373 lea (%rcx,%r8,1),%rcx 2374 test $0x7,%rcx 2375 lea (%rdx,%r8,1),%rdx 2376 jne L(bk_align) 2377 2378L(bk_qw_aligned): 2379 lea L(bkPxQx)(%rip),%r10 2380 2381 cmp $0x90,%r8 # 144 2382 jg L(bk_ck_sse2_alignment) 2383 2384 sub %r8,%rcx 2385 sub %r8,%rdx 2386 2387 movslq (%r10,%r8,4),%r9 2388 lea (%r9,%r10,1),%r10 2389 jmpq *%r10 2390 2391 .balign 16 2392L(bk_align): 2393 # only align if len > 8 2394 cmp $8,%r8 2395 jle L(bk_qw_aligned) 2396 test $0x1,%rcx 2397 je L(bk_tst2) 2398 dec %rcx 2399 dec %rdx 2400 dec %r8 2401 mov (%rdx),%r9b 2402 mov %r9b,(%rcx) 2403 2404L(bk_tst2): 2405 test $0x2,%rcx 2406 je L(bk_tst3) 2407 2408L(bk_got2): 2409 sub $0x2,%rcx 2410 sub $0x2,%rdx 2411 sub $0x2,%r8 2412 movzwq (%rdx),%r9 2413 mov %r9w,(%rcx) 2414 2415L(bk_tst3): 2416 test $0x4,%rcx 2417 je L(bk_qw_aligned) 2418 2419L(bk_got3): 2420 sub $0x4,%rcx 2421 sub $0x4,%rdx 2422 sub $0x4,%r8 2423 mov (%rdx),%r9d 2424 mov %r9d,(%rcx) 2425 jmp L(bk_qw_aligned) 2426 2427 .balign 16 2428L(bk_ck_sse2_alignment): 2429 cmpl $NO_SSE,.memops_method(%rip) 2430 je L(bk_use_rep) 2431 # check alignment of last byte 2432 test $0xf,%rcx 2433 jz L(bk_sse2_cpy) 2434 2435L(bk_sse2_align): 2436 # only here if already aligned on at least a qword bndry 2437 sub $0x8,%rcx 2438 sub $0x8,%rdx 2439 sub $0x8,%r8 2440 mov (%rdx),%r9 2441 mov %r9,(%rcx) 2442 #jmp L(bk_sse2_cpy) 2443 2444 .balign 16 2445L(bk_sse2_cpy): 2446 sub $0x80,%rcx # 128 2447 sub $0x80,%rdx 2448 movdqu 0x70(%rdx),%xmm3 2449 movdqu 0x60(%rdx),%xmm2 2450 movdqa %xmm3,0x70(%rcx) 2451 movdqa %xmm2,0x60(%rcx) 2452 sub $0x80,%r8 2453 movdqu 0x50(%rdx),%xmm1 2454 movdqu 0x40(%rdx),%xmm0 2455 movdqa %xmm1,0x50(%rcx) 2456 movdqa %xmm0,0x40(%rcx) 2457 2458 cmp $0x80,%r8 2459 movdqu 0x30(%rdx),%xmm3 2460 movdqu 0x20(%rdx),%xmm2 2461 movdqa %xmm3,0x30(%rcx) 2462 movdqa %xmm2,0x20(%rcx) 2463 movdqu 0x10(%rdx),%xmm1 2464 movdqu (%rdx),%xmm0 2465 movdqa %xmm1,0x10(%rcx) 2466 movdqa %xmm0,(%rcx) 2467 jge L(bk_sse2_cpy) 2468 2469L(bk_sse2_cpy_end): 2470 lea L(bkPxQx)(%rip),%r10 2471 sub %r8,%rdx 2472 sub %r8,%rcx 2473 movslq (%r10,%r8,4),%r9 2474 lea (%r9,%r10,1),%r10 2475 jmpq *%r10 2476 2477 .balign 16 2478L(bk_use_rep): 2479 xchg %rcx,%r9 2480 mov %rdx,%rsi # source 2481 mov %r9,%rdi # destination 2482 mov %r8,%rcx # count 2483 sub $8,%rsi 2484 sub $8,%rdi 2485 shr $3,%rcx 2486 std # reverse direction 2487 rep 2488 movsq 2489 cld # reset direction flag 2490 2491 xchg %rcx,%r9 2492 lea L(bkPxQx)(%rip),%r10 2493 sub %r8,%rdx 2494 sub %r8,%rcx 2495 andq $7,%r8 # remainder 2496 jz 2f 2497 movslq (%r10,%r8,4),%r9 2498 lea (%r9,%r10,1),%r10 2499 jmpq *%r10 25002: 2501 ret 2502 2503 .balign 16 2504L(bkP0QI): 2505 mov 
0x88(%rdx),%r10 2506 mov %r10,0x88(%rcx) 2507L(bkP0QH): 2508 mov 0x80(%rdx),%r10 2509 mov %r10,0x80(%rcx) 2510L(bkP0QG): 2511 mov 0x78(%rdx),%r9 2512 mov %r9,0x78(%rcx) 2513L(bkP0QF): 2514 mov 0x70(%rdx),%r11 2515 mov %r11,0x70(%rcx) 2516L(bkP0QE): 2517 mov 0x68(%rdx),%r10 2518 mov %r10,0x68(%rcx) 2519L(bkP0QD): 2520 mov 0x60(%rdx),%r9 2521 mov %r9,0x60(%rcx) 2522L(bkP0QC): 2523 mov 0x58(%rdx),%r11 2524 mov %r11,0x58(%rcx) 2525L(bkP0QB): 2526 mov 0x50(%rdx),%r10 2527 mov %r10,0x50(%rcx) 2528L(bkP0QA): 2529 mov 0x48(%rdx),%r9 2530 mov %r9,0x48(%rcx) 2531L(bkP0Q9): 2532 mov 0x40(%rdx),%r11 2533 mov %r11,0x40(%rcx) 2534L(bkP0Q8): 2535 mov 0x38(%rdx),%r10 2536 mov %r10,0x38(%rcx) 2537L(bkP0Q7): 2538 mov 0x30(%rdx),%r9 2539 mov %r9,0x30(%rcx) 2540L(bkP0Q6): 2541 mov 0x28(%rdx),%r11 2542 mov %r11,0x28(%rcx) 2543L(bkP0Q5): 2544 mov 0x20(%rdx),%r10 2545 mov %r10,0x20(%rcx) 2546L(bkP0Q4): 2547 mov 0x18(%rdx),%r9 2548 mov %r9,0x18(%rcx) 2549L(bkP0Q3): 2550 mov 0x10(%rdx),%r11 2551 mov %r11,0x10(%rcx) 2552L(bkP0Q2): 2553 mov 0x8(%rdx),%r10 2554 mov %r10,0x8(%rcx) 2555L(bkP0Q1): 2556 mov (%rdx),%r9 2557 mov %r9,(%rcx) 2558L(bkP0Q0): 2559 ret 2560 2561 .balign 16 2562L(bkP1QI): 2563 mov 0x89(%rdx),%r10 2564 mov %r10,0x89(%rcx) 2565L(bkP1QH): 2566 mov 0x81(%rdx),%r11 2567 mov %r11,0x81(%rcx) 2568L(bkP1QG): 2569 mov 0x79(%rdx),%r10 2570 mov %r10,0x79(%rcx) 2571L(bkP1QF): 2572 mov 0x71(%rdx),%r9 2573 mov %r9,0x71(%rcx) 2574L(bkP1QE): 2575 mov 0x69(%rdx),%r11 2576 mov %r11,0x69(%rcx) 2577L(bkP1QD): 2578 mov 0x61(%rdx),%r10 2579 mov %r10,0x61(%rcx) 2580L(bkP1QC): 2581 mov 0x59(%rdx),%r9 2582 mov %r9,0x59(%rcx) 2583L(bkP1QB): 2584 mov 0x51(%rdx),%r11 2585 mov %r11,0x51(%rcx) 2586L(bkP1QA): 2587 mov 0x49(%rdx),%r10 2588 mov %r10,0x49(%rcx) 2589L(bkP1Q9): 2590 mov 0x41(%rdx),%r9 2591 mov %r9,0x41(%rcx) 2592L(bkP1Q8): 2593 mov 0x39(%rdx),%r11 2594 mov %r11,0x39(%rcx) 2595L(bkP1Q7): 2596 mov 0x31(%rdx),%r10 2597 mov %r10,0x31(%rcx) 2598L(bkP1Q6): 2599 mov 0x29(%rdx),%r9 2600 mov %r9,0x29(%rcx) 2601L(bkP1Q5): 2602 mov 0x21(%rdx),%r11 2603 mov %r11,0x21(%rcx) 2604L(bkP1Q4): 2605 mov 0x19(%rdx),%r10 2606 mov %r10,0x19(%rcx) 2607L(bkP1Q3): 2608 mov 0x11(%rdx),%r9 2609 mov %r9,0x11(%rcx) 2610L(bkP1Q2): 2611 mov 0x9(%rdx),%r11 2612 mov %r11,0x9(%rcx) 2613L(bkP1Q1): 2614 mov 0x1(%rdx),%r10 2615 mov %r10,0x1(%rcx) 2616L(bkP1Q0): 2617 mov (%rdx),%r9b 2618 mov %r9b,(%rcx) 2619 ret 2620 2621 .balign 16 2622L(bkP2QI): 2623 mov 0x8a(%rdx),%r10 2624 mov %r10,0x8a(%rcx) 2625L(bkP2QH): 2626 mov 0x82(%rdx),%r11 2627 mov %r11,0x82(%rcx) 2628L(bkP2QG): 2629 mov 0x7a(%rdx),%r10 2630 mov %r10,0x7a(%rcx) 2631L(bkP2QF): 2632 mov 0x72(%rdx),%r9 2633 mov %r9,0x72(%rcx) 2634L(bkP2QE): 2635 mov 0x6a(%rdx),%r11 2636 mov %r11,0x6a(%rcx) 2637L(bkP2QD): 2638 mov 0x62(%rdx),%r10 2639 mov %r10,0x62(%rcx) 2640L(bkP2QC): 2641 mov 0x5a(%rdx),%r9 2642 mov %r9,0x5a(%rcx) 2643L(bkP2QB): 2644 mov 0x52(%rdx),%r11 2645 mov %r11,0x52(%rcx) 2646L(bkP2QA): 2647 mov 0x4a(%rdx),%r10 2648 mov %r10,0x4a(%rcx) 2649L(bkP2Q9): 2650 mov 0x42(%rdx),%r9 2651 mov %r9,0x42(%rcx) 2652L(bkP2Q8): 2653 mov 0x3a(%rdx),%r11 2654 mov %r11,0x3a(%rcx) 2655L(bkP2Q7): 2656 mov 0x32(%rdx),%r10 2657 mov %r10,0x32(%rcx) 2658L(bkP2Q6): 2659 mov 0x2a(%rdx),%r9 2660 mov %r9,0x2a(%rcx) 2661L(bkP2Q5): 2662 mov 0x22(%rdx),%r11 2663 mov %r11,0x22(%rcx) 2664L(bkP2Q4): 2665 mov 0x1a(%rdx),%r10 2666 mov %r10,0x1a(%rcx) 2667L(bkP2Q3): 2668 mov 0x12(%rdx),%r9 2669 mov %r9,0x12(%rcx) 2670L(bkP2Q2): 2671 mov 0xa(%rdx),%r11 2672 mov %r11,0xa(%rcx) 2673L(bkP2Q1): 2674 mov 0x2(%rdx),%r10 2675 mov 
	.balign 16
L(bkP0QI):
	mov	0x88(%rdx),%r10
	mov	%r10,0x88(%rcx)
L(bkP0QH):
	mov	0x80(%rdx),%r10
	mov	%r10,0x80(%rcx)
L(bkP0QG):
	mov	0x78(%rdx),%r9
	mov	%r9,0x78(%rcx)
L(bkP0QF):
	mov	0x70(%rdx),%r11
	mov	%r11,0x70(%rcx)
L(bkP0QE):
	mov	0x68(%rdx),%r10
	mov	%r10,0x68(%rcx)
L(bkP0QD):
	mov	0x60(%rdx),%r9
	mov	%r9,0x60(%rcx)
L(bkP0QC):
	mov	0x58(%rdx),%r11
	mov	%r11,0x58(%rcx)
L(bkP0QB):
	mov	0x50(%rdx),%r10
	mov	%r10,0x50(%rcx)
L(bkP0QA):
	mov	0x48(%rdx),%r9
	mov	%r9,0x48(%rcx)
L(bkP0Q9):
	mov	0x40(%rdx),%r11
	mov	%r11,0x40(%rcx)
L(bkP0Q8):
	mov	0x38(%rdx),%r10
	mov	%r10,0x38(%rcx)
L(bkP0Q7):
	mov	0x30(%rdx),%r9
	mov	%r9,0x30(%rcx)
L(bkP0Q6):
	mov	0x28(%rdx),%r11
	mov	%r11,0x28(%rcx)
L(bkP0Q5):
	mov	0x20(%rdx),%r10
	mov	%r10,0x20(%rcx)
L(bkP0Q4):
	mov	0x18(%rdx),%r9
	mov	%r9,0x18(%rcx)
L(bkP0Q3):
	mov	0x10(%rdx),%r11
	mov	%r11,0x10(%rcx)
L(bkP0Q2):
	mov	0x8(%rdx),%r10
	mov	%r10,0x8(%rcx)
L(bkP0Q1):
	mov	(%rdx),%r9
	mov	%r9,(%rcx)
L(bkP0Q0):
	ret

	.balign 16
L(bkP1QI):
	mov	0x89(%rdx),%r10
	mov	%r10,0x89(%rcx)
L(bkP1QH):
	mov	0x81(%rdx),%r11
	mov	%r11,0x81(%rcx)
L(bkP1QG):
	mov	0x79(%rdx),%r10
	mov	%r10,0x79(%rcx)
L(bkP1QF):
	mov	0x71(%rdx),%r9
	mov	%r9,0x71(%rcx)
L(bkP1QE):
	mov	0x69(%rdx),%r11
	mov	%r11,0x69(%rcx)
L(bkP1QD):
	mov	0x61(%rdx),%r10
	mov	%r10,0x61(%rcx)
L(bkP1QC):
	mov	0x59(%rdx),%r9
	mov	%r9,0x59(%rcx)
L(bkP1QB):
	mov	0x51(%rdx),%r11
	mov	%r11,0x51(%rcx)
L(bkP1QA):
	mov	0x49(%rdx),%r10
	mov	%r10,0x49(%rcx)
L(bkP1Q9):
	mov	0x41(%rdx),%r9
	mov	%r9,0x41(%rcx)
L(bkP1Q8):
	mov	0x39(%rdx),%r11
	mov	%r11,0x39(%rcx)
L(bkP1Q7):
	mov	0x31(%rdx),%r10
	mov	%r10,0x31(%rcx)
L(bkP1Q6):
	mov	0x29(%rdx),%r9
	mov	%r9,0x29(%rcx)
L(bkP1Q5):
	mov	0x21(%rdx),%r11
	mov	%r11,0x21(%rcx)
L(bkP1Q4):
	mov	0x19(%rdx),%r10
	mov	%r10,0x19(%rcx)
L(bkP1Q3):
	mov	0x11(%rdx),%r9
	mov	%r9,0x11(%rcx)
L(bkP1Q2):
	mov	0x9(%rdx),%r11
	mov	%r11,0x9(%rcx)
L(bkP1Q1):
	mov	0x1(%rdx),%r10
	mov	%r10,0x1(%rcx)
L(bkP1Q0):
	mov	(%rdx),%r9b
	mov	%r9b,(%rcx)
	ret

	.balign 16
L(bkP2QI):
	mov	0x8a(%rdx),%r10
	mov	%r10,0x8a(%rcx)
L(bkP2QH):
	mov	0x82(%rdx),%r11
	mov	%r11,0x82(%rcx)
L(bkP2QG):
	mov	0x7a(%rdx),%r10
	mov	%r10,0x7a(%rcx)
L(bkP2QF):
	mov	0x72(%rdx),%r9
	mov	%r9,0x72(%rcx)
L(bkP2QE):
	mov	0x6a(%rdx),%r11
	mov	%r11,0x6a(%rcx)
L(bkP2QD):
	mov	0x62(%rdx),%r10
	mov	%r10,0x62(%rcx)
L(bkP2QC):
	mov	0x5a(%rdx),%r9
	mov	%r9,0x5a(%rcx)
L(bkP2QB):
	mov	0x52(%rdx),%r11
	mov	%r11,0x52(%rcx)
L(bkP2QA):
	mov	0x4a(%rdx),%r10
	mov	%r10,0x4a(%rcx)
L(bkP2Q9):
	mov	0x42(%rdx),%r9
	mov	%r9,0x42(%rcx)
L(bkP2Q8):
	mov	0x3a(%rdx),%r11
	mov	%r11,0x3a(%rcx)
L(bkP2Q7):
	mov	0x32(%rdx),%r10
	mov	%r10,0x32(%rcx)
L(bkP2Q6):
	mov	0x2a(%rdx),%r9
	mov	%r9,0x2a(%rcx)
L(bkP2Q5):
	mov	0x22(%rdx),%r11
	mov	%r11,0x22(%rcx)
L(bkP2Q4):
	mov	0x1a(%rdx),%r10
	mov	%r10,0x1a(%rcx)
L(bkP2Q3):
	mov	0x12(%rdx),%r9
	mov	%r9,0x12(%rcx)
L(bkP2Q2):
	mov	0xa(%rdx),%r11
	mov	%r11,0xa(%rcx)
L(bkP2Q1):
	mov	0x2(%rdx),%r10
	mov	%r10,0x2(%rcx)
L(bkP2Q0):
	mov	(%rdx),%r9w
	mov	%r9w,(%rcx)
	ret

	.balign 16
L(bkP3QI):
	mov	0x8b(%rdx),%r10
	mov	%r10,0x8b(%rcx)
L(bkP3QH):
	mov	0x83(%rdx),%r11
	mov	%r11,0x83(%rcx)
L(bkP3QG):
	mov	0x7b(%rdx),%r10
	mov	%r10,0x7b(%rcx)
L(bkP3QF):
	mov	0x73(%rdx),%r9
	mov	%r9,0x73(%rcx)
L(bkP3QE):
	mov	0x6b(%rdx),%r11
	mov	%r11,0x6b(%rcx)
L(bkP3QD):
	mov	0x63(%rdx),%r10
	mov	%r10,0x63(%rcx)
L(bkP3QC):
	mov	0x5b(%rdx),%r9
	mov	%r9,0x5b(%rcx)
L(bkP3QB):
	mov	0x53(%rdx),%r11
	mov	%r11,0x53(%rcx)
L(bkP3QA):
	mov	0x4b(%rdx),%r10
	mov	%r10,0x4b(%rcx)
L(bkP3Q9):
	mov	0x43(%rdx),%r9
	mov	%r9,0x43(%rcx)
L(bkP3Q8):
	mov	0x3b(%rdx),%r11
	mov	%r11,0x3b(%rcx)
L(bkP3Q7):
	mov	0x33(%rdx),%r10
	mov	%r10,0x33(%rcx)
L(bkP3Q6):
	mov	0x2b(%rdx),%r9
	mov	%r9,0x2b(%rcx)
L(bkP3Q5):
	mov	0x23(%rdx),%r11
	mov	%r11,0x23(%rcx)
L(bkP3Q4):
	mov	0x1b(%rdx),%r10
	mov	%r10,0x1b(%rcx)
L(bkP3Q3):
	mov	0x13(%rdx),%r9
	mov	%r9,0x13(%rcx)
L(bkP3Q2):
	mov	0xb(%rdx),%r11
	mov	%r11,0xb(%rcx)
L(bkP3Q1):
	mov	0x3(%rdx),%r10
	mov	%r10,0x3(%rcx)
L(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9w
	mov	%r9w,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret

	.balign 16
L(bkP4QI):
	mov	0x8c(%rdx),%r10
	mov	%r10,0x8c(%rcx)
L(bkP4QH):
	mov	0x84(%rdx),%r11
	mov	%r11,0x84(%rcx)
L(bkP4QG):
	mov	0x7c(%rdx),%r10
	mov	%r10,0x7c(%rcx)
L(bkP4QF):
	mov	0x74(%rdx),%r9
	mov	%r9,0x74(%rcx)
L(bkP4QE):
	mov	0x6c(%rdx),%r11
	mov	%r11,0x6c(%rcx)
L(bkP4QD):
	mov	0x64(%rdx),%r10
	mov	%r10,0x64(%rcx)
L(bkP4QC):
	mov	0x5c(%rdx),%r9
	mov	%r9,0x5c(%rcx)
L(bkP4QB):
	mov	0x54(%rdx),%r11
	mov	%r11,0x54(%rcx)
L(bkP4QA):
	mov	0x4c(%rdx),%r10
	mov	%r10,0x4c(%rcx)
L(bkP4Q9):
	mov	0x44(%rdx),%r9
	mov	%r9,0x44(%rcx)
L(bkP4Q8):
	mov	0x3c(%rdx),%r11
	mov	%r11,0x3c(%rcx)
L(bkP4Q7):
	mov	0x34(%rdx),%r10
	mov	%r10,0x34(%rcx)
L(bkP4Q6):
	mov	0x2c(%rdx),%r9
	mov	%r9,0x2c(%rcx)
L(bkP4Q5):
	mov	0x24(%rdx),%r11
	mov	%r11,0x24(%rcx)
L(bkP4Q4):
	mov	0x1c(%rdx),%r10
	mov	%r10,0x1c(%rcx)
L(bkP4Q3):
	mov	0x14(%rdx),%r9
	mov	%r9,0x14(%rcx)
L(bkP4Q2):
	mov	0xc(%rdx),%r11
	mov	%r11,0xc(%rcx)
L(bkP4Q1):
	mov	0x4(%rdx),%r10
	mov	%r10,0x4(%rcx)
L(bkP4Q0):
	mov	(%rdx),%r9d
	mov	%r9d,(%rcx)
	ret

	.balign 16
L(bkP5QI):
	mov	0x8d(%rdx),%r10
	mov	%r10,0x8d(%rcx)
L(bkP5QH):
	mov	0x85(%rdx),%r9
	mov	%r9,0x85(%rcx)
L(bkP5QG):
	mov	0x7d(%rdx),%r11
	mov	%r11,0x7d(%rcx)
L(bkP5QF):
	mov	0x75(%rdx),%r10
	mov	%r10,0x75(%rcx)
L(bkP5QE):
	mov	0x6d(%rdx),%r9
	mov	%r9,0x6d(%rcx)
L(bkP5QD):
	mov	0x65(%rdx),%r11
	mov	%r11,0x65(%rcx)
L(bkP5QC):
	mov	0x5d(%rdx),%r10
	mov	%r10,0x5d(%rcx)
L(bkP5QB):
	mov	0x55(%rdx),%r9
	mov	%r9,0x55(%rcx)
L(bkP5QA):
	mov	0x4d(%rdx),%r11
	mov	%r11,0x4d(%rcx)
L(bkP5Q9):
	mov	0x45(%rdx),%r10
	mov	%r10,0x45(%rcx)
L(bkP5Q8):
	mov	0x3d(%rdx),%r9
	mov	%r9,0x3d(%rcx)
L(bkP5Q7):
	mov	0x35(%rdx),%r11
	mov	%r11,0x35(%rcx)
L(bkP5Q6):
	mov	0x2d(%rdx),%r10
	mov	%r10,0x2d(%rcx)
L(bkP5Q5):
	mov	0x25(%rdx),%r9
	mov	%r9,0x25(%rcx)
L(bkP5Q4):
	mov	0x1d(%rdx),%r11
	mov	%r11,0x1d(%rcx)
L(bkP5Q3):
	mov	0x15(%rdx),%r10
	mov	%r10,0x15(%rcx)
L(bkP5Q2):
	mov	0xd(%rdx),%r9
	mov	%r9,0xd(%rcx)
L(bkP5Q1):
	mov	0x5(%rdx),%r11
	mov	%r11,0x5(%rcx)
L(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9d
	mov	%r9d,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret

	.balign 16
L(bkP6QI):
	mov	0x8e(%rdx),%r10
	mov	%r10,0x8e(%rcx)
L(bkP6QH):
	mov	0x86(%rdx),%r11
	mov	%r11,0x86(%rcx)
L(bkP6QG):
	mov	0x7e(%rdx),%r10
	mov	%r10,0x7e(%rcx)
L(bkP6QF):
	mov	0x76(%rdx),%r9
	mov	%r9,0x76(%rcx)
L(bkP6QE):
	mov	0x6e(%rdx),%r11
	mov	%r11,0x6e(%rcx)
L(bkP6QD):
	mov	0x66(%rdx),%r10
	mov	%r10,0x66(%rcx)
L(bkP6QC):
	mov	0x5e(%rdx),%r9
	mov	%r9,0x5e(%rcx)
L(bkP6QB):
	mov	0x56(%rdx),%r11
	mov	%r11,0x56(%rcx)
L(bkP6QA):
	mov	0x4e(%rdx),%r10
	mov	%r10,0x4e(%rcx)
L(bkP6Q9):
	mov	0x46(%rdx),%r9
	mov	%r9,0x46(%rcx)
L(bkP6Q8):
	mov	0x3e(%rdx),%r11
	mov	%r11,0x3e(%rcx)
L(bkP6Q7):
	mov	0x36(%rdx),%r10
	mov	%r10,0x36(%rcx)
L(bkP6Q6):
	mov	0x2e(%rdx),%r9
	mov	%r9,0x2e(%rcx)
L(bkP6Q5):
	mov	0x26(%rdx),%r11
	mov	%r11,0x26(%rcx)
L(bkP6Q4):
	mov	0x1e(%rdx),%r10
	mov	%r10,0x1e(%rcx)
L(bkP6Q3):
	mov	0x16(%rdx),%r9
	mov	%r9,0x16(%rcx)
L(bkP6Q2):
	mov	0xe(%rdx),%r11
	mov	%r11,0xe(%rcx)
L(bkP6Q1):
	mov	0x6(%rdx),%r10
	mov	%r10,0x6(%rcx)
L(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov	0x2(%rdx),%r9d
	mov	%r9d,0x2(%rcx)
	mov	(%rdx),%r10w
	mov	%r10w,(%rcx)
	ret

	.balign 16
L(bkP7QI):
	mov	0x8f(%rdx),%r10
	mov	%r10,0x8f(%rcx)
L(bkP7QH):
	mov	0x87(%rdx),%r11
	mov	%r11,0x87(%rcx)
L(bkP7QG):
	mov	0x7f(%rdx),%r10
	mov	%r10,0x7f(%rcx)
L(bkP7QF):
	mov	0x77(%rdx),%r9
	mov	%r9,0x77(%rcx)
L(bkP7QE):
	mov	0x6f(%rdx),%r11
	mov	%r11,0x6f(%rcx)
L(bkP7QD):
	mov	0x67(%rdx),%r10
	mov	%r10,0x67(%rcx)
L(bkP7QC):
	mov	0x5f(%rdx),%r9
	mov	%r9,0x5f(%rcx)
L(bkP7QB):
	mov	0x57(%rdx),%r11
	mov	%r11,0x57(%rcx)
L(bkP7QA):
	mov	0x4f(%rdx),%r10
	mov	%r10,0x4f(%rcx)
L(bkP7Q9):
	mov	0x47(%rdx),%r9
	mov	%r9,0x47(%rcx)
L(bkP7Q8):
	mov	0x3f(%rdx),%r11
	mov	%r11,0x3f(%rcx)
L(bkP7Q7):
	mov	0x37(%rdx),%r10
	mov	%r10,0x37(%rcx)
L(bkP7Q6):
	mov	0x2f(%rdx),%r9
	mov	%r9,0x2f(%rcx)
L(bkP7Q5):
	mov	0x27(%rdx),%r11
	mov	%r11,0x27(%rcx)
L(bkP7Q4):
	mov	0x1f(%rdx),%r10
	mov	%r10,0x1f(%rcx)
L(bkP7Q3):
	mov	0x17(%rdx),%r9
	mov	%r9,0x17(%rcx)
L(bkP7Q2):
	mov	0xf(%rdx),%r11
	mov	%r11,0xf(%rcx)
L(bkP7Q1):
	mov	0x7(%rdx),%r10
	mov	%r10,0x7(%rcx)
L(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov	0x3(%rdx),%r9d
	mov	%r9d,0x3(%rcx)
	mov	0x1(%rdx),%r10w
	mov	%r10w,0x1(%rcx)
	mov	(%rdx),%r11b
	mov	%r11b,(%rcx)
	ret

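/*
 * L(bkPxQx) is the dispatch table for the entry points above.  Each entry
 * is a 4-byte signed offset of an L(bkPxQy) label relative to the start
 * of the table, and the table is indexed by the number of bytes left to
 * copy, so slot n reaches the routine that moves n >> 3 quadwords plus
 * n & 7 low-order bytes.  The dispatch sequence used by the callers,
 *
 *	movslq	(%r10,%r8,4),%r9	# r9 = table[count]
 *	lea	(%r9,%r10,1),%r10	# r10 = &table + table[count]
 *	jmpq	*%r10
 *
 * is roughly the C expression (illustrative only):
 *
 *	target = (char *)bkPxQx + bkPxQx[count];
 *
 * Self-relative 32-bit offsets keep the table position-independent and
 * half the size of a table of 64-bit pointers.
 */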
	.balign 16
L(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
	.int L(bkP1Q0)-L(bkPxQx)
	.int L(bkP2Q0)-L(bkPxQx)
	.int L(bkP3Q0)-L(bkPxQx)
	.int L(bkP4Q0)-L(bkPxQx)
	.int L(bkP5Q0)-L(bkPxQx)
	.int L(bkP6Q0)-L(bkPxQx)
	.int L(bkP7Q0)-L(bkPxQx)

	.int L(bkP0Q1)-L(bkPxQx)
	.int L(bkP1Q1)-L(bkPxQx)
	.int L(bkP2Q1)-L(bkPxQx)
	.int L(bkP3Q1)-L(bkPxQx)
	.int L(bkP4Q1)-L(bkPxQx)
	.int L(bkP5Q1)-L(bkPxQx)
	.int L(bkP6Q1)-L(bkPxQx)
	.int L(bkP7Q1)-L(bkPxQx)

	.int L(bkP0Q2)-L(bkPxQx)
	.int L(bkP1Q2)-L(bkPxQx)
	.int L(bkP2Q2)-L(bkPxQx)
	.int L(bkP3Q2)-L(bkPxQx)
	.int L(bkP4Q2)-L(bkPxQx)
	.int L(bkP5Q2)-L(bkPxQx)
	.int L(bkP6Q2)-L(bkPxQx)
	.int L(bkP7Q2)-L(bkPxQx)

	.int L(bkP0Q3)-L(bkPxQx)
	.int L(bkP1Q3)-L(bkPxQx)
	.int L(bkP2Q3)-L(bkPxQx)
	.int L(bkP3Q3)-L(bkPxQx)
	.int L(bkP4Q3)-L(bkPxQx)
	.int L(bkP5Q3)-L(bkPxQx)
	.int L(bkP6Q3)-L(bkPxQx)
	.int L(bkP7Q3)-L(bkPxQx)

	.int L(bkP0Q4)-L(bkPxQx)
	.int L(bkP1Q4)-L(bkPxQx)
	.int L(bkP2Q4)-L(bkPxQx)
	.int L(bkP3Q4)-L(bkPxQx)
	.int L(bkP4Q4)-L(bkPxQx)
	.int L(bkP5Q4)-L(bkPxQx)
	.int L(bkP6Q4)-L(bkPxQx)
	.int L(bkP7Q4)-L(bkPxQx)

	.int L(bkP0Q5)-L(bkPxQx)
	.int L(bkP1Q5)-L(bkPxQx)
	.int L(bkP2Q5)-L(bkPxQx)
	.int L(bkP3Q5)-L(bkPxQx)
	.int L(bkP4Q5)-L(bkPxQx)
	.int L(bkP5Q5)-L(bkPxQx)
	.int L(bkP6Q5)-L(bkPxQx)
	.int L(bkP7Q5)-L(bkPxQx)

	.int L(bkP0Q6)-L(bkPxQx)
	.int L(bkP1Q6)-L(bkPxQx)
	.int L(bkP2Q6)-L(bkPxQx)
	.int L(bkP3Q6)-L(bkPxQx)
	.int L(bkP4Q6)-L(bkPxQx)
	.int L(bkP5Q6)-L(bkPxQx)
	.int L(bkP6Q6)-L(bkPxQx)
	.int L(bkP7Q6)-L(bkPxQx)

	.int L(bkP0Q7)-L(bkPxQx)
	.int L(bkP1Q7)-L(bkPxQx)
	.int L(bkP2Q7)-L(bkPxQx)
	.int L(bkP3Q7)-L(bkPxQx)
	.int L(bkP4Q7)-L(bkPxQx)
	.int L(bkP5Q7)-L(bkPxQx)
	.int L(bkP6Q7)-L(bkPxQx)
	.int L(bkP7Q7)-L(bkPxQx)

	.int L(bkP0Q8)-L(bkPxQx)
	.int L(bkP1Q8)-L(bkPxQx)
	.int L(bkP2Q8)-L(bkPxQx)
	.int L(bkP3Q8)-L(bkPxQx)
	.int L(bkP4Q8)-L(bkPxQx)
	.int L(bkP5Q8)-L(bkPxQx)
	.int L(bkP6Q8)-L(bkPxQx)
	.int L(bkP7Q8)-L(bkPxQx)

	.int L(bkP0Q9)-L(bkPxQx)
	.int L(bkP1Q9)-L(bkPxQx)
	.int L(bkP2Q9)-L(bkPxQx)
	.int L(bkP3Q9)-L(bkPxQx)
	.int L(bkP4Q9)-L(bkPxQx)
	.int L(bkP5Q9)-L(bkPxQx)
	.int L(bkP6Q9)-L(bkPxQx)
	.int L(bkP7Q9)-L(bkPxQx)

	.int L(bkP0QA)-L(bkPxQx)
	.int L(bkP1QA)-L(bkPxQx)
	.int L(bkP2QA)-L(bkPxQx)
	.int L(bkP3QA)-L(bkPxQx)
	.int L(bkP4QA)-L(bkPxQx)
	.int L(bkP5QA)-L(bkPxQx)
	.int L(bkP6QA)-L(bkPxQx)
	.int L(bkP7QA)-L(bkPxQx)

	.int L(bkP0QB)-L(bkPxQx)
	.int L(bkP1QB)-L(bkPxQx)
	.int L(bkP2QB)-L(bkPxQx)
	.int L(bkP3QB)-L(bkPxQx)
	.int L(bkP4QB)-L(bkPxQx)
	.int L(bkP5QB)-L(bkPxQx)
	.int L(bkP6QB)-L(bkPxQx)
	.int L(bkP7QB)-L(bkPxQx)

	.int L(bkP0QC)-L(bkPxQx)
	.int L(bkP1QC)-L(bkPxQx)
	.int L(bkP2QC)-L(bkPxQx)
	.int L(bkP3QC)-L(bkPxQx)
	.int L(bkP4QC)-L(bkPxQx)
	.int L(bkP5QC)-L(bkPxQx)
	.int L(bkP6QC)-L(bkPxQx)
	.int L(bkP7QC)-L(bkPxQx)

	.int L(bkP0QD)-L(bkPxQx)
	.int L(bkP1QD)-L(bkPxQx)
	.int L(bkP2QD)-L(bkPxQx)
	.int L(bkP3QD)-L(bkPxQx)
	.int L(bkP4QD)-L(bkPxQx)
	.int L(bkP5QD)-L(bkPxQx)
	.int L(bkP6QD)-L(bkPxQx)
	.int L(bkP7QD)-L(bkPxQx)

	.int L(bkP0QE)-L(bkPxQx)
	.int L(bkP1QE)-L(bkPxQx)
	.int L(bkP2QE)-L(bkPxQx)
	.int L(bkP3QE)-L(bkPxQx)
	.int L(bkP4QE)-L(bkPxQx)
	.int L(bkP5QE)-L(bkPxQx)
	.int L(bkP6QE)-L(bkPxQx)
	.int L(bkP7QE)-L(bkPxQx)

	.int L(bkP0QF)-L(bkPxQx)
	.int L(bkP1QF)-L(bkPxQx)
	.int L(bkP2QF)-L(bkPxQx)
	.int L(bkP3QF)-L(bkPxQx)
	.int L(bkP4QF)-L(bkPxQx)
	.int L(bkP5QF)-L(bkPxQx)
	.int L(bkP6QF)-L(bkPxQx)
	.int L(bkP7QF)-L(bkPxQx)

	.int L(bkP0QG)-L(bkPxQx)
	.int L(bkP1QG)-L(bkPxQx)
	.int L(bkP2QG)-L(bkPxQx)
	.int L(bkP3QG)-L(bkPxQx)
	.int L(bkP4QG)-L(bkPxQx)
	.int L(bkP5QG)-L(bkPxQx)
	.int L(bkP6QG)-L(bkPxQx)
	.int L(bkP7QG)-L(bkPxQx)

	.int L(bkP0QH)-L(bkPxQx)
	.int L(bkP1QH)-L(bkPxQx)
	.int L(bkP2QH)-L(bkPxQx)
	.int L(bkP3QH)-L(bkPxQx)
	.int L(bkP4QH)-L(bkPxQx)
	.int L(bkP5QH)-L(bkPxQx)
	.int L(bkP6QH)-L(bkPxQx)
	.int L(bkP7QH)-L(bkPxQx)

	.int L(bkP0QI)-L(bkPxQx)
	.int L(bkP1QI)-L(bkPxQx)
	.int L(bkP2QI)-L(bkPxQx)
	.int L(bkP3QI)-L(bkPxQx)
	.int L(bkP4QI)-L(bkPxQx)
	.int L(bkP5QI)-L(bkPxQx)
	.int L(bkP6QI)-L(bkPxQx)
	.int L(bkP7QI)-L(bkPxQx)

	SET_SIZE(memmove)