/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2008, Intel Corporation
 * All rights reserved.
 */

/*
 * memcpy.s - copies a block of memory
 *	Implements the memcpy() and memmove() libc primitives.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "cache.h"
#include "proc64_id.h"

#define L(s) .memcpy/**/s

/*
 * memcpy algorithm overview:
 *
 * Thresholds used below were determined experimentally.
 *
 * Pseudo code:
 *
 * NOTE: On AMD, NO_SSE is always set.  Performance on Opteron did not improve
 * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
 * future AMD processors.
 *
 *
 * If (size <= 128 bytes) {
 *	do unrolled code (primarily 8-byte loads/stores) regardless of
 *	alignment.
 * } else {
 *	Align destination to 16-byte boundary
 *
 *	if (NO_SSE) {
 *		If (size > half of the largest level cache) {
 *			Use 8-byte non-temporal stores (64 bytes/loop)
 *		} else {
 *			if (size > 4K && size <= half l1 cache size) {
 *				Use rep movsq
 *			} else {
 *				Use 8-byte loads/stores (64 bytes per loop)
 *			}
 *		}
 *
 *	} else { **USE SSE**
 *		If (size > half of the largest level cache) {
 *			Use 16-byte non-temporal stores (128 bytes per loop)
 *		} else {
 *			If (both source and destination are aligned) {
 *				Use 16-byte aligned loads and stores
 *				(128 bytes/loop)
 *			} else {
 *				Use pairs of xmm registers with SSE2 or SSSE3
 *				instructions to concatenate and shift
 *				appropriately to account for source
 *				unalignment.  This enables 16-byte aligned
 *				loads to be done.
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via the unrolled code above.
 * }
 *
 * memmove overview:
 *	memmove is the same as memcpy except for the one case where the copy
 *	must be done backwards, i.e. when the buffers overlap and the
 *	destination starts above the source.  The backwards copy is structured
 *	in a similar manner.
 */
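
/*
 * Register usage (both entry points):
 *	on entry (AMD64 ABI)	%rdi = dst, %rsi = src, %rdx = len
 *	after the prologues	%rcx = dst, %rdx = src, %r8 = byte count,
 *				%rax = dst (the value to return)
 *	%r9, %r10 and %r11 are used as scratch registers.  The backwards-copy
 *	path additionally advances %rcx and %rdx to point past the end of
 *	each buffer.
 */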

	ENTRY(memmove)
	cmp	%rsi,%rdi		# if dst <= src
	jbe	L(CopyForward)		# then do copy forward
	mov	%rsi,%r9		# move src to r9
	add	%rdx,%r9		# add len to get addr of end of src
	cmp	%r9,%rdi		# if dst < end of src
	jb	L(CopyBackwards)	# then do copy backwards
	jmp	L(CopyForward)

	ENTRY (memcpy)
L(CopyForward):
	mov	%rdx,%r8
	mov	%rdi,%rcx
	mov	%rsi,%rdx
	mov	%rdi,%rax
	lea	L(fwdPxQx)(%rip),%r11
	cmp	$0x80,%r8		# 128
	jg	L(ck_use_sse2)
	add	%r8,%rcx
	add	%r8,%rdx

	movslq	(%r11,%r8,4),%r10
	lea	(%r10,%r11,1),%r11
	jmpq	*%r11

	.balign 16
L(ShrtAlignNew):
	lea	L(AliPxQx)(%rip),%r11
	mov	%rcx,%r9
	and	$0xf,%r9

	movslq	(%r11,%r9,4),%r10
	lea	(%r10,%r11,1),%r11
	jmpq	*%r11
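
/*
 * L(fwdPxQx) and L(AliPxQx) below are jump tables: each entry is the signed
 * 32-bit offset from the start of the table to the code fragment handling
 * one residual byte count (fwdPxQx, indexed by length 0..0x80) or one
 * destination alignment (AliPxQx, indexed by dst & 0xf).  The dispatch
 * sequences above load an entry with movslq, add the table base back in
 * with lea, and jump through the resulting address, which keeps the tables
 * position-independent.
 */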

	.balign 16
L(fwdPxQx):	.int	L(P0Q0)-L(fwdPxQx)
	.int	L(P1Q0)-L(fwdPxQx)
	.int	L(P2Q0)-L(fwdPxQx)
	.int	L(P3Q0)-L(fwdPxQx)
	.int	L(P4Q0)-L(fwdPxQx)
	.int	L(P5Q0)-L(fwdPxQx)
	.int	L(P6Q0)-L(fwdPxQx)
	.int	L(P7Q0)-L(fwdPxQx)

	.int	L(P0Q1)-L(fwdPxQx)
	.int	L(P1Q1)-L(fwdPxQx)
	.int	L(P2Q1)-L(fwdPxQx)
	.int	L(P3Q1)-L(fwdPxQx)
	.int	L(P4Q1)-L(fwdPxQx)
	.int	L(P5Q1)-L(fwdPxQx)
	.int	L(P6Q1)-L(fwdPxQx)
	.int	L(P7Q1)-L(fwdPxQx)

	.int	L(P0Q2)-L(fwdPxQx)
	.int	L(P1Q2)-L(fwdPxQx)
	.int	L(P2Q2)-L(fwdPxQx)
	.int	L(P3Q2)-L(fwdPxQx)
	.int	L(P4Q2)-L(fwdPxQx)
	.int	L(P5Q2)-L(fwdPxQx)
	.int	L(P6Q2)-L(fwdPxQx)
	.int	L(P7Q2)-L(fwdPxQx)

	.int	L(P0Q3)-L(fwdPxQx)
	.int	L(P1Q3)-L(fwdPxQx)
	.int	L(P2Q3)-L(fwdPxQx)
	.int	L(P3Q3)-L(fwdPxQx)
	.int	L(P4Q3)-L(fwdPxQx)
	.int	L(P5Q3)-L(fwdPxQx)
	.int	L(P6Q3)-L(fwdPxQx)
	.int	L(P7Q3)-L(fwdPxQx)

	.int	L(P0Q4)-L(fwdPxQx)
	.int	L(P1Q4)-L(fwdPxQx)
	.int	L(P2Q4)-L(fwdPxQx)
	.int	L(P3Q4)-L(fwdPxQx)
	.int	L(P4Q4)-L(fwdPxQx)
	.int	L(P5Q4)-L(fwdPxQx)
	.int	L(P6Q4)-L(fwdPxQx)
	.int	L(P7Q4)-L(fwdPxQx)

	.int	L(P0Q5)-L(fwdPxQx)
	.int	L(P1Q5)-L(fwdPxQx)
	.int	L(P2Q5)-L(fwdPxQx)
	.int	L(P3Q5)-L(fwdPxQx)
	.int	L(P4Q5)-L(fwdPxQx)
	.int	L(P5Q5)-L(fwdPxQx)
	.int	L(P6Q5)-L(fwdPxQx)
	.int	L(P7Q5)-L(fwdPxQx)

	.int	L(P0Q6)-L(fwdPxQx)
	.int	L(P1Q6)-L(fwdPxQx)
	.int	L(P2Q6)-L(fwdPxQx)
	.int	L(P3Q6)-L(fwdPxQx)
	.int	L(P4Q6)-L(fwdPxQx)
	.int	L(P5Q6)-L(fwdPxQx)
	.int	L(P6Q6)-L(fwdPxQx)
	.int	L(P7Q6)-L(fwdPxQx)

	.int	L(P0Q7)-L(fwdPxQx)
	.int	L(P1Q7)-L(fwdPxQx)
	.int	L(P2Q7)-L(fwdPxQx)
	.int	L(P3Q7)-L(fwdPxQx)
	.int	L(P4Q7)-L(fwdPxQx)
	.int	L(P5Q7)-L(fwdPxQx)
	.int	L(P6Q7)-L(fwdPxQx)
	.int	L(P7Q7)-L(fwdPxQx)

	.int	L(P0Q8)-L(fwdPxQx)
	.int	L(P1Q8)-L(fwdPxQx)
	.int	L(P2Q8)-L(fwdPxQx)
	.int	L(P3Q8)-L(fwdPxQx)
	.int	L(P4Q8)-L(fwdPxQx)
	.int	L(P5Q8)-L(fwdPxQx)
	.int	L(P6Q8)-L(fwdPxQx)
	.int	L(P7Q8)-L(fwdPxQx)

	.int	L(P0Q9)-L(fwdPxQx)
	.int	L(P1Q9)-L(fwdPxQx)
	.int	L(P2Q9)-L(fwdPxQx)
	.int	L(P3Q9)-L(fwdPxQx)
	.int	L(P4Q9)-L(fwdPxQx)
	.int	L(P5Q9)-L(fwdPxQx)
	.int	L(P6Q9)-L(fwdPxQx)
	.int	L(P7Q9)-L(fwdPxQx)

	.int	L(P0QA)-L(fwdPxQx)
	.int	L(P1QA)-L(fwdPxQx)
	.int	L(P2QA)-L(fwdPxQx)
	.int	L(P3QA)-L(fwdPxQx)
	.int	L(P4QA)-L(fwdPxQx)
	.int	L(P5QA)-L(fwdPxQx)
	.int	L(P6QA)-L(fwdPxQx)
	.int	L(P7QA)-L(fwdPxQx)

	.int	L(P0QB)-L(fwdPxQx)
	.int	L(P1QB)-L(fwdPxQx)
	.int	L(P2QB)-L(fwdPxQx)
	.int	L(P3QB)-L(fwdPxQx)
	.int	L(P4QB)-L(fwdPxQx)
	.int	L(P5QB)-L(fwdPxQx)
	.int	L(P6QB)-L(fwdPxQx)
	.int	L(P7QB)-L(fwdPxQx)

	.int	L(P0QC)-L(fwdPxQx)
	.int	L(P1QC)-L(fwdPxQx)
	.int	L(P2QC)-L(fwdPxQx)
	.int	L(P3QC)-L(fwdPxQx)
	.int	L(P4QC)-L(fwdPxQx)
	.int	L(P5QC)-L(fwdPxQx)
	.int	L(P6QC)-L(fwdPxQx)
	.int	L(P7QC)-L(fwdPxQx)

	.int	L(P0QD)-L(fwdPxQx)
	.int	L(P1QD)-L(fwdPxQx)
	.int	L(P2QD)-L(fwdPxQx)
	.int	L(P3QD)-L(fwdPxQx)
	.int	L(P4QD)-L(fwdPxQx)
	.int	L(P5QD)-L(fwdPxQx)
	.int	L(P6QD)-L(fwdPxQx)
	.int	L(P7QD)-L(fwdPxQx)

	.int	L(P0QE)-L(fwdPxQx)
	.int	L(P1QE)-L(fwdPxQx)
	.int	L(P2QE)-L(fwdPxQx)
	.int	L(P3QE)-L(fwdPxQx)
	.int	L(P4QE)-L(fwdPxQx)
	.int	L(P5QE)-L(fwdPxQx)
	.int	L(P6QE)-L(fwdPxQx)
	.int	L(P7QE)-L(fwdPxQx)

	.int	L(P0QF)-L(fwdPxQx)
	.int	L(P1QF)-L(fwdPxQx)
	.int	L(P2QF)-L(fwdPxQx)
	.int	L(P3QF)-L(fwdPxQx)
	.int	L(P4QF)-L(fwdPxQx)
	.int	L(P5QF)-L(fwdPxQx)
	.int	L(P6QF)-L(fwdPxQx)
	.int	L(P7QF)-L(fwdPxQx)

	.int	L(P0QG)-L(fwdPxQx)	# 0x80

	.balign 16
L(AliPxQx):	.int	L(now_qw_aligned)-L(AliPxQx)
	.int	L(A1Q0)-L(AliPxQx)
	.int	L(A2Q0)-L(AliPxQx)
	.int	L(A3Q0)-L(AliPxQx)
	.int	L(A4Q0)-L(AliPxQx)
	.int	L(A5Q0)-L(AliPxQx)
	.int	L(A6Q0)-L(AliPxQx)
	.int	L(A7Q0)-L(AliPxQx)
	.int	L(A0Q1)-L(AliPxQx)
	.int	L(A1Q1)-L(AliPxQx)
	.int	L(A2Q1)-L(AliPxQx)
	.int	L(A3Q1)-L(AliPxQx)
	.int	L(A4Q1)-L(AliPxQx)
	.int	L(A5Q1)-L(AliPxQx)
	.int	L(A6Q1)-L(AliPxQx)
	.int	L(A7Q1)-L(AliPxQx)

	.balign 16
L(A1Q0):			# ; need to move 8+ 7=1+2+4 bytes
	movzbq	(%rdx),%r11
	sub	$0xf,%r8
	mov	%r11b,(%rcx)

	movzwq	0x1(%rdx),%r10
	mov	%r10w,0x1(%rcx)

	mov	0x3(%rdx),%r9d
	mov	%r9d,0x3(%rcx)

	mov	0x7(%rdx),%r11
	add	$0xf,%rdx
	mov	%r11,0x7(%rcx)

	add	$0xf,%rcx
	jmp	L(now_qw_aligned)

	.balign 16
L(A2Q0):			# ; need to move 8+ 6=2+4 bytes
	movzwq	(%rdx),%r10
	sub	$0xe,%r8
	mov	%r10w,(%rcx)

	mov	0x2(%rdx),%r9d
	mov	%r9d,0x2(%rcx)

	mov	0x6(%rdx),%r11
	add	$0xe,%rdx
	mov	%r11,0x6(%rcx)
	add	$0xe,%rcx
	jmp	L(now_qw_aligned)

	.balign 16
L(A3Q0):			# ; need to move 8+ 5=1+4 bytes
	movzbq	(%rdx),%r11
	sub	$0xd,%r8
	mov	%r11b,(%rcx)

	mov	0x1(%rdx),%r9d
	mov	%r9d,0x1(%rcx)

	mov	0x5(%rdx),%r10
	add	$0xd,%rdx
	mov	%r10,0x5(%rcx)

	add	$0xd,%rcx
	jmp	L(now_qw_aligned)

	.balign 16
L(A4Q0):			# ; need to move 8+4 bytes
	mov	(%rdx),%r9d
	sub	$0xc,%r8
	mov	%r9d,(%rcx)

	mov	0x4(%rdx),%r10
	add	$0xc,%rdx
	mov	%r10,0x4(%rcx)

	add	$0xc,%rcx
	jmp	L(now_qw_aligned)

	.balign 16
L(A5Q0):			# ; need to move 8+ 3=1+2 bytes
	movzbq	(%rdx),%r11
	sub	$0xb,%r8
	mov	%r11b,(%rcx)

	movzwq	0x1(%rdx),%r10
	mov	%r10w,0x1(%rcx)

	mov	0x3(%rdx),%r9
	add	$0xb,%rdx
	mov	%r9,0x3(%rcx)

	add	$0xb,%rcx
	jmp	L(now_qw_aligned)

	.balign 16
L(A6Q0):			# ; need to move 8+2 bytes
	movzwq	(%rdx),%r10
	sub	$0xa,%r8
	mov	%r10w,(%rcx)

	mov	0x2(%rdx),%r9
	add	$0xa,%rdx
	mov	%r9,0x2(%rcx)

	add	$0xa,%rcx
	jmp	L(now_qw_aligned)

	.balign 16
L(A7Q0):			# ; need to move 8+1 byte
	movzbq	(%rdx),%r11
	sub	$0x9,%r8
	mov	%r11b,(%rcx)

	mov	0x1(%rdx),%r10
	add	$0x9,%rdx
	mov	%r10,0x1(%rcx)

	add	$0x9,%rcx
	jmp	L(now_qw_aligned)

	.balign 16
L(A0Q1):			# ; need to move 8 bytes

	mov	(%rdx),%r10
	add	$0x8,%rdx
	sub	$0x8,%r8
	mov	%r10,(%rcx)

	add	$0x8,%rcx
	jmp	L(now_qw_aligned)

	.balign 16
418L(A1Q1): # ; need to move 7=1+2+4 bytes 419 movzbq (%rdx),%r11 420 sub $0x7,%r8 421 mov %r11b,(%rcx) 422 423 movzwq 0x1(%rdx),%r10 424 mov %r10w,0x1(%rcx) 425 426 mov 0x3(%rdx),%r9d 427 add $0x7,%rdx 428 mov %r9d,0x3(%rcx) 429 add $0x7,%rcx 430 jmp L(now_qw_aligned) 431 432 .balign 16 433L(A2Q1): # ; need to move 6=2+4 bytes 434 movzwq (%rdx),%r10 435 sub $0x6,%r8 436 mov %r10w,(%rcx) 437 mov 0x2(%rdx),%r9d 438 add $0x6,%rdx 439 mov %r9d,0x2(%rcx) 440 add $0x6,%rcx 441 jmp L(now_qw_aligned) 442 443 .balign 16 444L(A3Q1): # ; need to move 5=1+4 bytes 445 movzbq (%rdx),%r11 446 sub $0x5,%r8 447 mov %r11b,(%rcx) 448 mov 0x1(%rdx),%r9d 449 add $0x5,%rdx 450 mov %r9d,0x1(%rcx) 451 add $0x5,%rcx 452 jmp L(now_qw_aligned) 453 454 .balign 16 455L(A4Q1): # ; need to move 4 bytes 456 mov (%rdx),%r9d 457 sub $0x4,%r8 458 add $0x4,%rdx 459 mov %r9d,(%rcx) 460 add $0x4,%rcx 461 jmp L(now_qw_aligned) 462 463 .balign 16 464L(A5Q1): # ; need to move 3=1+2 bytes 465 movzbq (%rdx),%r11 466 sub $0x3,%r8 467 mov %r11b,(%rcx) 468 469 movzwq 0x1(%rdx),%r10 470 add $0x3,%rdx 471 mov %r10w,0x1(%rcx) 472 473 add $0x3,%rcx 474 jmp L(now_qw_aligned) 475 476 .balign 16 477L(A6Q1): # ; need to move 2 bytes 478 movzwq (%rdx),%r10 479 sub $0x2,%r8 480 add $0x2,%rdx 481 mov %r10w,(%rcx) 482 add $0x2,%rcx 483 jmp L(now_qw_aligned) 484 485 .balign 16 486L(A7Q1): # ; need to move 1 byte 487 movzbq (%rdx),%r11 488 dec %r8 489 inc %rdx 490 mov %r11b,(%rcx) 491 inc %rcx 492 jmp L(now_qw_aligned) 493 494 495 .balign 16 496L(P0QG): 497 mov -0x80(%rdx),%r9 498 mov %r9,-0x80(%rcx) 499L(P0QF): 500 mov -0x78(%rdx),%r10 501 mov %r10,-0x78(%rcx) 502L(P0QE): 503 mov -0x70(%rdx),%r9 504 mov %r9,-0x70(%rcx) 505L(P0QD): 506 mov -0x68(%rdx),%r10 507 mov %r10,-0x68(%rcx) 508L(P0QC): 509 mov -0x60(%rdx),%r9 510 mov %r9,-0x60(%rcx) 511L(P0QB): 512 mov -0x58(%rdx),%r10 513 mov %r10,-0x58(%rcx) 514L(P0QA): 515 mov -0x50(%rdx),%r9 516 mov %r9,-0x50(%rcx) 517L(P0Q9): 518 mov -0x48(%rdx),%r10 519 mov %r10,-0x48(%rcx) 520L(P0Q8): 521 mov -0x40(%rdx),%r9 522 mov %r9,-0x40(%rcx) 523L(P0Q7): 524 mov -0x38(%rdx),%r10 525 mov %r10,-0x38(%rcx) 526L(P0Q6): 527 mov -0x30(%rdx),%r9 528 mov %r9,-0x30(%rcx) 529L(P0Q5): 530 mov -0x28(%rdx),%r10 531 mov %r10,-0x28(%rcx) 532L(P0Q4): 533 mov -0x20(%rdx),%r9 534 mov %r9,-0x20(%rcx) 535L(P0Q3): 536 mov -0x18(%rdx),%r10 537 mov %r10,-0x18(%rcx) 538L(P0Q2): 539 mov -0x10(%rdx),%r9 540 mov %r9,-0x10(%rcx) 541L(P0Q1): 542 mov -0x8(%rdx),%r10 543 mov %r10,-0x8(%rcx) 544L(P0Q0): 545 ret 546 547 .balign 16 548L(P1QF): 549 mov -0x79(%rdx),%r9 550 mov %r9,-0x79(%rcx) 551L(P1QE): 552 mov -0x71(%rdx),%r11 553 mov %r11,-0x71(%rcx) 554L(P1QD): 555 mov -0x69(%rdx),%r10 556 mov %r10,-0x69(%rcx) 557L(P1QC): 558 mov -0x61(%rdx),%r9 559 mov %r9,-0x61(%rcx) 560L(P1QB): 561 mov -0x59(%rdx),%r11 562 mov %r11,-0x59(%rcx) 563L(P1QA): 564 mov -0x51(%rdx),%r10 565 mov %r10,-0x51(%rcx) 566L(P1Q9): 567 mov -0x49(%rdx),%r9 568 mov %r9,-0x49(%rcx) 569L(P1Q8): 570 mov -0x41(%rdx),%r11 571 mov %r11,-0x41(%rcx) 572L(P1Q7): 573 mov -0x39(%rdx),%r10 574 mov %r10,-0x39(%rcx) 575L(P1Q6): 576 mov -0x31(%rdx),%r9 577 mov %r9,-0x31(%rcx) 578L(P1Q5): 579 mov -0x29(%rdx),%r11 580 mov %r11,-0x29(%rcx) 581L(P1Q4): 582 mov -0x21(%rdx),%r10 583 mov %r10,-0x21(%rcx) 584L(P1Q3): 585 mov -0x19(%rdx),%r9 586 mov %r9,-0x19(%rcx) 587L(P1Q2): 588 mov -0x11(%rdx),%r11 589 mov %r11,-0x11(%rcx) 590L(P1Q1): 591 mov -0x9(%rdx),%r10 592 mov %r10,-0x9(%rcx) 593L(P1Q0): 594 movzbq -0x1(%rdx),%r9 595 mov %r9b,-0x1(%rcx) 596 ret 597 598 .balign 16 599L(P2QF): 600 mov 
-0x7a(%rdx),%r9 601 mov %r9,-0x7a(%rcx) 602L(P2QE): 603 mov -0x72(%rdx),%r11 604 mov %r11,-0x72(%rcx) 605L(P2QD): 606 mov -0x6a(%rdx),%r10 607 mov %r10,-0x6a(%rcx) 608L(P2QC): 609 mov -0x62(%rdx),%r9 610 mov %r9,-0x62(%rcx) 611L(P2QB): 612 mov -0x5a(%rdx),%r11 613 mov %r11,-0x5a(%rcx) 614L(P2QA): 615 mov -0x52(%rdx),%r10 616 mov %r10,-0x52(%rcx) 617L(P2Q9): 618 mov -0x4a(%rdx),%r9 619 mov %r9,-0x4a(%rcx) 620L(P2Q8): 621 mov -0x42(%rdx),%r11 622 mov %r11,-0x42(%rcx) 623L(P2Q7): 624 mov -0x3a(%rdx),%r10 625 mov %r10,-0x3a(%rcx) 626L(P2Q6): 627 mov -0x32(%rdx),%r9 628 mov %r9,-0x32(%rcx) 629L(P2Q5): 630 mov -0x2a(%rdx),%r11 631 mov %r11,-0x2a(%rcx) 632L(P2Q4): 633 mov -0x22(%rdx),%r10 634 mov %r10,-0x22(%rcx) 635L(P2Q3): 636 mov -0x1a(%rdx),%r9 637 mov %r9,-0x1a(%rcx) 638L(P2Q2): 639 mov -0x12(%rdx),%r11 640 mov %r11,-0x12(%rcx) 641L(P2Q1): 642 mov -0xa(%rdx),%r10 643 mov %r10,-0xa(%rcx) 644L(P2Q0): 645 movzwq -0x2(%rdx),%r9 646 mov %r9w,-0x2(%rcx) 647 ret 648 649 .balign 16 650L(P3QF): 651 mov -0x7b(%rdx),%r9 652 mov %r9,-0x7b(%rcx) 653L(P3QE): 654 mov -0x73(%rdx),%r11 655 mov %r11,-0x73(%rcx) 656L(P3QD): 657 mov -0x6b(%rdx),%r10 658 mov %r10,-0x6b(%rcx) 659L(P3QC): 660 mov -0x63(%rdx),%r9 661 mov %r9,-0x63(%rcx) 662L(P3QB): 663 mov -0x5b(%rdx),%r11 664 mov %r11,-0x5b(%rcx) 665L(P3QA): 666 mov -0x53(%rdx),%r10 667 mov %r10,-0x53(%rcx) 668L(P3Q9): 669 mov -0x4b(%rdx),%r9 670 mov %r9,-0x4b(%rcx) 671L(P3Q8): 672 mov -0x43(%rdx),%r11 673 mov %r11,-0x43(%rcx) 674L(P3Q7): 675 mov -0x3b(%rdx),%r10 676 mov %r10,-0x3b(%rcx) 677L(P3Q6): 678 mov -0x33(%rdx),%r9 679 mov %r9,-0x33(%rcx) 680L(P3Q5): 681 mov -0x2b(%rdx),%r11 682 mov %r11,-0x2b(%rcx) 683L(P3Q4): 684 mov -0x23(%rdx),%r10 685 mov %r10,-0x23(%rcx) 686L(P3Q3): 687 mov -0x1b(%rdx),%r9 688 mov %r9,-0x1b(%rcx) 689L(P3Q2): 690 mov -0x13(%rdx),%r11 691 mov %r11,-0x13(%rcx) 692L(P3Q1): 693 mov -0xb(%rdx),%r10 694 mov %r10,-0xb(%rcx) 695 /* 696 * These trailing loads/stores have to do all their loads 1st, 697 * then do the stores. 
698 */ 699L(P3Q0): 700 movzwq -0x3(%rdx),%r9 701 movzbq -0x1(%rdx),%r10 702 mov %r9w,-0x3(%rcx) 703 mov %r10b,-0x1(%rcx) 704 ret 705 706 .balign 16 707L(P4QF): 708 mov -0x7c(%rdx),%r9 709 mov %r9,-0x7c(%rcx) 710L(P4QE): 711 mov -0x74(%rdx),%r11 712 mov %r11,-0x74(%rcx) 713L(P4QD): 714 mov -0x6c(%rdx),%r10 715 mov %r10,-0x6c(%rcx) 716L(P4QC): 717 mov -0x64(%rdx),%r9 718 mov %r9,-0x64(%rcx) 719L(P4QB): 720 mov -0x5c(%rdx),%r11 721 mov %r11,-0x5c(%rcx) 722L(P4QA): 723 mov -0x54(%rdx),%r10 724 mov %r10,-0x54(%rcx) 725L(P4Q9): 726 mov -0x4c(%rdx),%r9 727 mov %r9,-0x4c(%rcx) 728L(P4Q8): 729 mov -0x44(%rdx),%r11 730 mov %r11,-0x44(%rcx) 731L(P4Q7): 732 mov -0x3c(%rdx),%r10 733 mov %r10,-0x3c(%rcx) 734L(P4Q6): 735 mov -0x34(%rdx),%r9 736 mov %r9,-0x34(%rcx) 737L(P4Q5): 738 mov -0x2c(%rdx),%r11 739 mov %r11,-0x2c(%rcx) 740L(P4Q4): 741 mov -0x24(%rdx),%r10 742 mov %r10,-0x24(%rcx) 743L(P4Q3): 744 mov -0x1c(%rdx),%r9 745 mov %r9,-0x1c(%rcx) 746L(P4Q2): 747 mov -0x14(%rdx),%r11 748 mov %r11,-0x14(%rcx) 749L(P4Q1): 750 mov -0xc(%rdx),%r10 751 mov %r10,-0xc(%rcx) 752L(P4Q0): 753 mov -0x4(%rdx),%r9d 754 mov %r9d,-0x4(%rcx) 755 ret 756 757 .balign 16 758L(P5QF): 759 mov -0x7d(%rdx),%r9 760 mov %r9,-0x7d(%rcx) 761L(P5QE): 762 mov -0x75(%rdx),%r11 763 mov %r11,-0x75(%rcx) 764L(P5QD): 765 mov -0x6d(%rdx),%r10 766 mov %r10,-0x6d(%rcx) 767L(P5QC): 768 mov -0x65(%rdx),%r9 769 mov %r9,-0x65(%rcx) 770L(P5QB): 771 mov -0x5d(%rdx),%r11 772 mov %r11,-0x5d(%rcx) 773L(P5QA): 774 mov -0x55(%rdx),%r10 775 mov %r10,-0x55(%rcx) 776L(P5Q9): 777 mov -0x4d(%rdx),%r9 778 mov %r9,-0x4d(%rcx) 779L(P5Q8): 780 mov -0x45(%rdx),%r11 781 mov %r11,-0x45(%rcx) 782L(P5Q7): 783 mov -0x3d(%rdx),%r10 784 mov %r10,-0x3d(%rcx) 785L(P5Q6): 786 mov -0x35(%rdx),%r9 787 mov %r9,-0x35(%rcx) 788L(P5Q5): 789 mov -0x2d(%rdx),%r11 790 mov %r11,-0x2d(%rcx) 791L(P5Q4): 792 mov -0x25(%rdx),%r10 793 mov %r10,-0x25(%rcx) 794L(P5Q3): 795 mov -0x1d(%rdx),%r9 796 mov %r9,-0x1d(%rcx) 797L(P5Q2): 798 mov -0x15(%rdx),%r11 799 mov %r11,-0x15(%rcx) 800L(P5Q1): 801 mov -0xd(%rdx),%r10 802 mov %r10,-0xd(%rcx) 803 /* 804 * These trailing loads/stores have to do all their loads 1st, 805 * then do the stores. 806 */ 807L(P5Q0): 808 mov -0x5(%rdx),%r9d 809 movzbq -0x1(%rdx),%r10 810 mov %r9d,-0x5(%rcx) 811 mov %r10b,-0x1(%rcx) 812 ret 813 814 .balign 16 815L(P6QF): 816 mov -0x7e(%rdx),%r9 817 mov %r9,-0x7e(%rcx) 818L(P6QE): 819 mov -0x76(%rdx),%r11 820 mov %r11,-0x76(%rcx) 821L(P6QD): 822 mov -0x6e(%rdx),%r10 823 mov %r10,-0x6e(%rcx) 824L(P6QC): 825 mov -0x66(%rdx),%r9 826 mov %r9,-0x66(%rcx) 827L(P6QB): 828 mov -0x5e(%rdx),%r11 829 mov %r11,-0x5e(%rcx) 830L(P6QA): 831 mov -0x56(%rdx),%r10 832 mov %r10,-0x56(%rcx) 833L(P6Q9): 834 mov -0x4e(%rdx),%r9 835 mov %r9,-0x4e(%rcx) 836L(P6Q8): 837 mov -0x46(%rdx),%r11 838 mov %r11,-0x46(%rcx) 839L(P6Q7): 840 mov -0x3e(%rdx),%r10 841 mov %r10,-0x3e(%rcx) 842L(P6Q6): 843 mov -0x36(%rdx),%r9 844 mov %r9,-0x36(%rcx) 845L(P6Q5): 846 mov -0x2e(%rdx),%r11 847 mov %r11,-0x2e(%rcx) 848L(P6Q4): 849 mov -0x26(%rdx),%r10 850 mov %r10,-0x26(%rcx) 851L(P6Q3): 852 mov -0x1e(%rdx),%r9 853 mov %r9,-0x1e(%rcx) 854L(P6Q2): 855 mov -0x16(%rdx),%r11 856 mov %r11,-0x16(%rcx) 857L(P6Q1): 858 mov -0xe(%rdx),%r10 859 mov %r10,-0xe(%rcx) 860 /* 861 * These trailing loads/stores have to do all their loads 1st, 862 * then do the stores. 
863 */ 864L(P6Q0): 865 mov -0x6(%rdx),%r9d 866 movzwq -0x2(%rdx),%r10 867 mov %r9d,-0x6(%rcx) 868 mov %r10w,-0x2(%rcx) 869 ret 870 871 .balign 16 872L(P7QF): 873 mov -0x7f(%rdx),%r9 874 mov %r9,-0x7f(%rcx) 875L(P7QE): 876 mov -0x77(%rdx),%r11 877 mov %r11,-0x77(%rcx) 878L(P7QD): 879 mov -0x6f(%rdx),%r10 880 mov %r10,-0x6f(%rcx) 881L(P7QC): 882 mov -0x67(%rdx),%r9 883 mov %r9,-0x67(%rcx) 884L(P7QB): 885 mov -0x5f(%rdx),%r11 886 mov %r11,-0x5f(%rcx) 887L(P7QA): 888 mov -0x57(%rdx),%r10 889 mov %r10,-0x57(%rcx) 890L(P7Q9): 891 mov -0x4f(%rdx),%r9 892 mov %r9,-0x4f(%rcx) 893L(P7Q8): 894 mov -0x47(%rdx),%r11 895 mov %r11,-0x47(%rcx) 896L(P7Q7): 897 mov -0x3f(%rdx),%r10 898 mov %r10,-0x3f(%rcx) 899L(P7Q6): 900 mov -0x37(%rdx),%r9 901 mov %r9,-0x37(%rcx) 902L(P7Q5): 903 mov -0x2f(%rdx),%r11 904 mov %r11,-0x2f(%rcx) 905L(P7Q4): 906 mov -0x27(%rdx),%r10 907 mov %r10,-0x27(%rcx) 908L(P7Q3): 909 mov -0x1f(%rdx),%r9 910 mov %r9,-0x1f(%rcx) 911L(P7Q2): 912 mov -0x17(%rdx),%r11 913 mov %r11,-0x17(%rcx) 914L(P7Q1): 915 mov -0xf(%rdx),%r10 916 mov %r10,-0xf(%rcx) 917 /* 918 * These trailing loads/stores have to do all their loads 1st, 919 * then do the stores. 920 */ 921L(P7Q0): 922 mov -0x7(%rdx),%r9d 923 movzwq -0x3(%rdx),%r10 924 movzbq -0x1(%rdx),%r11 925 mov %r9d,-0x7(%rcx) 926 mov %r10w,-0x3(%rcx) 927 mov %r11b,-0x1(%rcx) 928 ret 929 930 .balign 16 931L(ck_use_sse2): 932 /* 933 * Align dest to 16 byte boundary. 934 */ 935 test $0xf,%rcx 936 jnz L(ShrtAlignNew) 937 938L(now_qw_aligned): 939 cmpl $NO_SSE,.memops_method(%rip) 940 je L(Loop8byte_pre) 941 942 /* 943 * The fall-through path is to do SSE2 16-byte load/stores 944 */ 945 946 /* 947 * If current move size is larger than half of the highest level cache 948 * size, then do non-temporal moves. 949 */ 950 mov .largest_level_cache_size(%rip),%r9d 951 shr %r9 # take half of it 952 cmp %r9,%r8 953 jg L(sse2_nt_move) 954 955 /* 956 * If both the source and dest are aligned, then use the both aligned 957 * logic. Well aligned data should reap the rewards. 958 */ 959 test $0xf,%rdx 960 jz L(pre_both_aligned) 961 962 lea L(SSE_src)(%rip),%r10 # SSE2 (default) 963 testl $USE_SSSE3,.memops_method(%rip) 964 jz 1f 965 lea L(SSSE3_src)(%rip),%r10 # SSSE3 966 9671: 968 /* 969 * if the src is not 16 byte aligned... 
970 */ 971 mov %rdx,%r11 972 and $0xf,%r11 973 movdqu (%rdx),%xmm0 974 movdqa %xmm0,(%rcx) 975 add $0x10,%rdx 976 sub %r11,%rdx 977 add $0x10,%rcx 978 sub $0x10,%r8 979 movdqa (%rdx),%xmm1 980 981 movslq (%r10,%r11,4),%r9 982 lea (%r9,%r10,1),%r10 983 jmpq *%r10 984 985 .balign 16 986L(SSSE3_src): .int L(pre_both_aligned)-L(SSSE3_src) 987 .int L(mov3dqa1) -L(SSSE3_src) 988 .int L(mov3dqa2) -L(SSSE3_src) 989 .int L(mov3dqa3) -L(SSSE3_src) 990 .int L(mov3dqa4) -L(SSSE3_src) 991 .int L(mov3dqa5) -L(SSSE3_src) 992 .int L(mov3dqa6) -L(SSSE3_src) 993 .int L(mov3dqa7) -L(SSSE3_src) 994 .int L(movdqa8) -L(SSSE3_src) 995 .int L(mov3dqa9) -L(SSSE3_src) 996 .int L(mov3dqa10)-L(SSSE3_src) 997 .int L(mov3dqa11)-L(SSSE3_src) 998 .int L(mov3dqa12)-L(SSSE3_src) 999 .int L(mov3dqa13)-L(SSSE3_src) 1000 .int L(mov3dqa14)-L(SSSE3_src) 1001 .int L(mov3dqa15)-L(SSSE3_src) 1002L(SSE_src): .int L(pre_both_aligned)-L(SSE_src) 1003 .int L(movdqa1) -L(SSE_src) 1004 .int L(movdqa2) -L(SSE_src) 1005 .int L(movdqa3) -L(SSE_src) 1006 .int L(movdqa4) -L(SSE_src) 1007 .int L(movdqa5) -L(SSE_src) 1008 .int L(movdqa6) -L(SSE_src) 1009 .int L(movdqa7) -L(SSE_src) 1010 .int L(movdqa8) -L(SSE_src) 1011 .int L(movdqa9) -L(SSE_src) 1012 .int L(movdqa10)-L(SSE_src) 1013 .int L(movdqa11)-L(SSE_src) 1014 .int L(movdqa12)-L(SSE_src) 1015 .int L(movdqa13)-L(SSE_src) 1016 .int L(movdqa14)-L(SSE_src) 1017 .int L(movdqa15)-L(SSE_src) 1018 1019 .balign 16 1020L(movdqa1): 1021 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1022 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1023 lea 0x20(%rdx),%rdx 1024 lea -0x20(%r8),%r8 1025 1026 psrldq $0x1,%xmm1 # shift right prev buffer (saved from last iteration) 1027 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1028 pslldq $0xf,%xmm3 # shift the current buffer left (shift in zeros) 1029 por %xmm1,%xmm3 # OR them together 1030 cmp $0x20,%r8 1031 1032 psrldq $0x1,%xmm2 # shift right prev buffer (saved from last iteration) 1033 movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration 1034 pslldq $0xf,%xmm0 # shift the current buffer left (shift in zeros) 1035 por %xmm2,%xmm0 # OR them together 1036 movdqa %xmm3,(%rcx) # store it 1037 movdqa %xmm0,0x10(%rcx) # store it 1038 lea 0x20(%rcx),%rcx 1039 1040 jge L(movdqa1) 1041 jmp L(movdqa_epi) 1042 1043 .balign 16 1044L(movdqa2): 1045 sub $0x20,%r8 1046 movdqa 0x10(%rdx),%xmm3 1047 movdqa 0x20(%rdx),%xmm0 1048 add $0x20,%rdx 1049 1050 psrldq $0x2,%xmm1 1051 movdqa %xmm3,%xmm2 1052 pslldq $0xe,%xmm3 1053 por %xmm1,%xmm3 1054 1055 psrldq $0x2,%xmm2 1056 movdqa %xmm0,%xmm1 1057 pslldq $0xe,%xmm0 1058 por %xmm2,%xmm0 1059 movdqa %xmm3,(%rcx) 1060 movdqa %xmm0,0x10(%rcx) 1061 1062 add $0x20,%rcx 1063 cmp $0x20,%r8 1064 jge L(movdqa2) 1065 jmp L(movdqa_epi) 1066 1067 .balign 16 1068L(movdqa3): 1069 sub $0x20,%r8 1070 movdqa 0x10(%rdx),%xmm3 1071 movdqa 0x20(%rdx),%xmm0 1072 add $0x20,%rdx 1073 1074 psrldq $0x3,%xmm1 1075 movdqa %xmm3,%xmm2 1076 pslldq $0xd,%xmm3 1077 por %xmm1,%xmm3 1078 1079 psrldq $0x3,%xmm2 1080 movdqa %xmm0,%xmm1 1081 pslldq $0xd,%xmm0 1082 por %xmm2,%xmm0 1083 movdqa %xmm3,(%rcx) 1084 movdqa %xmm0,0x10(%rcx) 1085 1086 add $0x20,%rcx 1087 cmp $0x20,%r8 1088 jge L(movdqa3) 1089 jmp L(movdqa_epi) 1090 1091 .balign 16 1092L(movdqa4): 1093 sub $0x20,%r8 1094 movdqa 0x10(%rdx),%xmm3 1095 movdqa 0x20(%rdx),%xmm0 1096 add $0x20,%rdx 1097 1098 psrldq $0x4,%xmm1 1099 movdqa %xmm3,%xmm2 1100 pslldq $0xc,%xmm3 1101 por %xmm1,%xmm3 1102 1103 psrldq $0x4,%xmm2 1104 movdqa %xmm0,%xmm1 1105 pslldq $0xc,%xmm0 1106 por 
%xmm2,%xmm0 1107 1108 movdqa %xmm3,(%rcx) 1109 movdqa %xmm0,0x10(%rcx) 1110 1111 add $0x20,%rcx 1112 cmp $0x20,%r8 1113 jge L(movdqa4) 1114 jmp L(movdqa_epi) 1115 1116 .balign 16 1117L(movdqa5): 1118 sub $0x20,%r8 1119 movdqa 0x10(%rdx),%xmm3 1120 movdqa 0x20(%rdx),%xmm0 1121 add $0x20,%rdx 1122 1123 psrldq $0x5,%xmm1 1124 movdqa %xmm3,%xmm2 1125 pslldq $0xb,%xmm3 1126 por %xmm1,%xmm3 1127 1128 psrldq $0x5,%xmm2 1129 movdqa %xmm0,%xmm1 1130 pslldq $0xb,%xmm0 1131 por %xmm2,%xmm0 1132 1133 movdqa %xmm3,(%rcx) 1134 movdqa %xmm0,0x10(%rcx) 1135 1136 add $0x20,%rcx 1137 cmp $0x20,%r8 1138 jge L(movdqa5) 1139 jmp L(movdqa_epi) 1140 1141 .balign 16 1142L(movdqa6): 1143 sub $0x20,%r8 1144 movdqa 0x10(%rdx),%xmm3 1145 movdqa 0x20(%rdx),%xmm0 1146 add $0x20,%rdx 1147 1148 psrldq $0x6,%xmm1 1149 movdqa %xmm3,%xmm2 1150 pslldq $0xa,%xmm3 1151 por %xmm1,%xmm3 1152 1153 psrldq $0x6,%xmm2 1154 movdqa %xmm0,%xmm1 1155 pslldq $0xa,%xmm0 1156 por %xmm2,%xmm0 1157 movdqa %xmm3,(%rcx) 1158 movdqa %xmm0,0x10(%rcx) 1159 1160 add $0x20,%rcx 1161 cmp $0x20,%r8 1162 jge L(movdqa6) 1163 jmp L(movdqa_epi) 1164 1165 .balign 16 1166L(movdqa7): 1167 sub $0x20,%r8 1168 movdqa 0x10(%rdx),%xmm3 1169 movdqa 0x20(%rdx),%xmm0 1170 add $0x20,%rdx 1171 1172 psrldq $0x7,%xmm1 1173 movdqa %xmm3,%xmm2 1174 pslldq $0x9,%xmm3 1175 por %xmm1,%xmm3 1176 1177 psrldq $0x7,%xmm2 1178 movdqa %xmm0,%xmm1 1179 pslldq $0x9,%xmm0 1180 por %xmm2,%xmm0 1181 movdqa %xmm3,(%rcx) 1182 movdqa %xmm0,0x10(%rcx) 1183 1184 add $0x20,%rcx 1185 cmp $0x20,%r8 1186 jge L(movdqa7) 1187 jmp L(movdqa_epi) 1188 1189 .balign 16 1190L(movdqa8): 1191 movdqa 0x10(%rdx),%xmm3 1192 sub $0x30,%r8 1193 movdqa 0x20(%rdx),%xmm0 1194 movdqa 0x30(%rdx),%xmm5 1195 lea 0x30(%rdx),%rdx 1196 1197 shufpd $0x1,%xmm3,%xmm1 1198 movdqa %xmm1,(%rcx) 1199 1200 cmp $0x30,%r8 1201 1202 shufpd $0x1,%xmm0,%xmm3 1203 movdqa %xmm3,0x10(%rcx) 1204 1205 movdqa %xmm5,%xmm1 1206 shufpd $0x1,%xmm5,%xmm0 1207 movdqa %xmm0,0x20(%rcx) 1208 1209 lea 0x30(%rcx),%rcx 1210 1211 jge L(movdqa8) 1212 jmp L(movdqa_epi) 1213 1214 .balign 16 1215L(movdqa9): 1216 sub $0x20,%r8 1217 movdqa 0x10(%rdx),%xmm3 1218 movdqa 0x20(%rdx),%xmm0 1219 add $0x20,%rdx 1220 1221 psrldq $0x9,%xmm1 1222 movdqa %xmm3,%xmm2 1223 pslldq $0x7,%xmm3 1224 por %xmm1,%xmm3 1225 1226 psrldq $0x9,%xmm2 1227 movdqa %xmm0,%xmm1 1228 pslldq $0x7,%xmm0 1229 por %xmm2,%xmm0 1230 movdqa %xmm3,(%rcx) 1231 movdqa %xmm0,0x10(%rcx) 1232 1233 add $0x20,%rcx 1234 cmp $0x20,%r8 1235 jge L(movdqa9) 1236 jmp L(movdqa_epi) 1237 1238 .balign 16 1239L(movdqa10): 1240 sub $0x20,%r8 1241 movdqa 0x10(%rdx),%xmm3 1242 movdqa 0x20(%rdx),%xmm0 1243 add $0x20,%rdx 1244 1245 psrldq $0xa,%xmm1 1246 movdqa %xmm3,%xmm2 1247 pslldq $0x6,%xmm3 1248 por %xmm1,%xmm3 1249 1250 psrldq $0xa,%xmm2 1251 movdqa %xmm0,%xmm1 1252 pslldq $0x6,%xmm0 1253 por %xmm2,%xmm0 1254 movdqa %xmm3,(%rcx) 1255 movdqa %xmm0,0x10(%rcx) 1256 1257 add $0x20,%rcx 1258 cmp $0x20,%r8 1259 jge L(movdqa10) 1260 jmp L(movdqa_epi) 1261 1262 .balign 16 1263L(movdqa11): 1264 sub $0x20,%r8 1265 movdqa 0x10(%rdx),%xmm3 1266 movdqa 0x20(%rdx),%xmm0 1267 add $0x20,%rdx 1268 1269 psrldq $0xb,%xmm1 1270 movdqa %xmm3,%xmm2 1271 pslldq $0x5,%xmm3 1272 por %xmm1,%xmm3 1273 1274 psrldq $0xb,%xmm2 1275 movdqa %xmm0,%xmm1 1276 pslldq $0x5,%xmm0 1277 por %xmm2,%xmm0 1278 movdqa %xmm3,(%rcx) 1279 movdqa %xmm0,0x10(%rcx) 1280 1281 add $0x20,%rcx 1282 cmp $0x20,%r8 1283 jge L(movdqa11) 1284 jmp L(movdqa_epi) 1285 1286 .balign 16 1287L(movdqa12): 1288 sub $0x20,%r8 1289 movdqa 0x10(%rdx),%xmm3 1290 movdqa 
0x20(%rdx),%xmm0 1291 add $0x20,%rdx 1292 1293 psrldq $0xc,%xmm1 1294 movdqa %xmm3,%xmm2 1295 pslldq $0x4,%xmm3 1296 por %xmm1,%xmm3 1297 1298 psrldq $0xc,%xmm2 1299 movdqa %xmm0,%xmm1 1300 pslldq $0x4,%xmm0 1301 por %xmm2,%xmm0 1302 movdqa %xmm3,(%rcx) 1303 movdqa %xmm0,0x10(%rcx) 1304 1305 add $0x20,%rcx 1306 cmp $0x20,%r8 1307 jge L(movdqa12) 1308 jmp L(movdqa_epi) 1309 1310 .balign 16 1311L(movdqa13): 1312 sub $0x20,%r8 1313 movdqa 0x10(%rdx),%xmm3 1314 movdqa 0x20(%rdx),%xmm0 1315 add $0x20,%rdx 1316 1317 psrldq $0xd,%xmm1 1318 movdqa %xmm3,%xmm2 1319 pslldq $0x3,%xmm3 1320 por %xmm1,%xmm3 1321 1322 psrldq $0xd,%xmm2 1323 movdqa %xmm0,%xmm1 1324 pslldq $0x3,%xmm0 1325 por %xmm2,%xmm0 1326 movdqa %xmm3,(%rcx) 1327 movdqa %xmm0,0x10(%rcx) 1328 1329 add $0x20,%rcx 1330 cmp $0x20,%r8 1331 jge L(movdqa13) 1332 jmp L(movdqa_epi) 1333 1334 .balign 16 1335L(movdqa14): 1336 sub $0x20,%r8 1337 movdqa 0x10(%rdx),%xmm3 1338 movdqa 0x20(%rdx),%xmm0 1339 add $0x20,%rdx 1340 1341 psrldq $0xe,%xmm1 1342 movdqa %xmm3,%xmm2 1343 pslldq $0x2,%xmm3 1344 por %xmm1,%xmm3 1345 1346 psrldq $0xe,%xmm2 1347 movdqa %xmm0,%xmm1 1348 pslldq $0x2,%xmm0 1349 por %xmm2,%xmm0 1350 movdqa %xmm3,(%rcx) 1351 movdqa %xmm0,0x10(%rcx) 1352 1353 add $0x20,%rcx 1354 cmp $0x20,%r8 1355 jge L(movdqa14) 1356 jmp L(movdqa_epi) 1357 1358 .balign 16 1359L(movdqa15): 1360 sub $0x20,%r8 1361 movdqa 0x10(%rdx),%xmm3 1362 movdqa 0x20(%rdx),%xmm0 1363 add $0x20,%rdx 1364 1365 psrldq $0xf,%xmm1 1366 movdqa %xmm3,%xmm2 1367 pslldq $0x1,%xmm3 1368 por %xmm1,%xmm3 1369 1370 psrldq $0xf,%xmm2 1371 movdqa %xmm0,%xmm1 1372 pslldq $0x1,%xmm0 1373 por %xmm2,%xmm0 1374 movdqa %xmm3,(%rcx) 1375 movdqa %xmm0,0x10(%rcx) 1376 1377 add $0x20,%rcx 1378 cmp $0x20,%r8 1379 jge L(movdqa15) 1380 #jmp L(movdqa_epi) 1381 1382 .balign 16 1383L(movdqa_epi): 1384 lea L(fwdPxQx)(%rip),%r10 1385 add %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop) 1386 add %r8,%rcx 1387 add %r8,%rdx 1388 1389 movslq (%r10,%r8,4),%r9 1390 lea (%r9,%r10,1),%r10 1391 jmpq *%r10 1392 1393 .balign 16 1394L(mov3dqa1): 1395 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1396 sub $0x30,%r8 1397 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1398 movdqa 0x30(%rdx),%xmm5 # load the upper source buffer 1399 lea 0x30(%rdx),%rdx 1400 cmp $0x30,%r8 1401 1402 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1403 #palignr $0x1,%xmm1,%xmm3 1404 .byte 0x66,0x0f,0x3a,0x0f 1405 .byte 0xd9,0x01 1406 movdqa %xmm3,(%rcx) # store it 1407 1408 movdqa %xmm0,%xmm4 # store off xmm reg for use next iteration 1409 #palignr $0x1,%xmm2,%xmm0 1410 .byte 0x66,0x0f,0x3a,0x0f 1411 .byte 0xc2,0x01 1412 movdqa %xmm0,0x10(%rcx) # store it 1413 1414 movdqa %xmm5,%xmm1 # store off xmm reg for use next iteration 1415 #palignr $0x1,%xmm4,%xmm5 1416 .byte 0x66,0x0f,0x3a,0x0f 1417 .byte 0xec,0x01 1418 movdqa %xmm5,0x20(%rcx) # store it 1419 1420 lea 0x30(%rcx),%rcx 1421 jge L(mov3dqa1) 1422 1423 cmp $0x10,%r8 1424 jl L(movdqa_epi) 1425 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1426 sub $0x10,%r8 1427 lea 0x10(%rdx),%rdx 1428 movdqa %xmm3,%xmm2 # save for use next concat 1429 #palignr $0x1,%xmm1,%xmm3 1430 .byte 0x66,0x0f,0x3a,0x0f 1431 .byte 0xd9,0x01 1432 1433 cmp $0x10,%r8 1434 movdqa %xmm3,(%rcx) # store it 1435 lea 0x10(%rcx),%rcx 1436 jl L(movdqa_epi) 1437 1438 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1439 sub $0x10,%r8 1440 lea 0x10(%rdx),%rdx 1441 #palignr $0x1,%xmm2,%xmm0 1442 .byte 0x66,0x0f,0x3a,0x0f 1443 .byte 0xc2,0x01 1444 
movdqa %xmm0,(%rcx) # store it 1445 lea 0x10(%rcx),%rcx 1446 jmp L(movdqa_epi) 1447 1448 .balign 16 1449L(mov3dqa2): 1450 movdqa 0x10(%rdx),%xmm3 1451 sub $0x30,%r8 1452 movdqa 0x20(%rdx),%xmm0 1453 movdqa 0x30(%rdx),%xmm5 1454 lea 0x30(%rdx),%rdx 1455 cmp $0x30,%r8 1456 1457 movdqa %xmm3,%xmm2 1458 #palignr $0x2,%xmm1,%xmm3 1459 .byte 0x66,0x0f,0x3a,0x0f 1460 .byte 0xd9,0x02 1461 movdqa %xmm3,(%rcx) 1462 1463 movdqa %xmm0,%xmm4 1464 #palignr $0x2,%xmm2,%xmm0 1465 .byte 0x66,0x0f,0x3a,0x0f 1466 .byte 0xc2,0x02 1467 movdqa %xmm0,0x10(%rcx) 1468 1469 movdqa %xmm5,%xmm1 1470 #palignr $0x2,%xmm4,%xmm5 1471 .byte 0x66,0x0f,0x3a,0x0f 1472 .byte 0xec,0x02 1473 movdqa %xmm5,0x20(%rcx) 1474 1475 lea 0x30(%rcx),%rcx 1476 jge L(mov3dqa2) 1477 1478 cmp $0x10,%r8 1479 jl L(movdqa_epi) 1480 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1481 sub $0x10,%r8 1482 lea 0x10(%rdx),%rdx 1483 movdqa %xmm3,%xmm2 # save for use next concat 1484 #palignr $0x2,%xmm1,%xmm3 1485 .byte 0x66,0x0f,0x3a,0x0f 1486 .byte 0xd9,0x02 1487 1488 cmp $0x10,%r8 1489 movdqa %xmm3,(%rcx) # store it 1490 lea 0x10(%rcx),%rcx 1491 jl L(movdqa_epi) 1492 1493 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1494 sub $0x10,%r8 1495 lea 0x10(%rdx),%rdx 1496 #palignr $0x2,%xmm2,%xmm0 1497 .byte 0x66,0x0f,0x3a,0x0f 1498 .byte 0xc2,0x02 1499 movdqa %xmm0,(%rcx) # store it 1500 lea 0x10(%rcx),%rcx 1501 jmp L(movdqa_epi) 1502 1503 .balign 16 1504L(mov3dqa3): 1505 movdqa 0x10(%rdx),%xmm3 1506 sub $0x30,%r8 1507 movdqa 0x20(%rdx),%xmm0 1508 movdqa 0x30(%rdx),%xmm5 1509 lea 0x30(%rdx),%rdx 1510 cmp $0x30,%r8 1511 1512 movdqa %xmm3,%xmm2 1513 #palignr $0x3,%xmm1,%xmm3 1514 .byte 0x66,0x0f,0x3a,0x0f 1515 .byte 0xd9,0x03 1516 movdqa %xmm3,(%rcx) 1517 1518 movdqa %xmm0,%xmm4 1519 #palignr $0x3,%xmm2,%xmm0 1520 .byte 0x66,0x0f,0x3a,0x0f 1521 .byte 0xc2,0x03 1522 movdqa %xmm0,0x10(%rcx) 1523 1524 movdqa %xmm5,%xmm1 1525 #palignr $0x3,%xmm4,%xmm5 1526 .byte 0x66,0x0f,0x3a,0x0f 1527 .byte 0xec,0x03 1528 movdqa %xmm5,0x20(%rcx) 1529 1530 lea 0x30(%rcx),%rcx 1531 jge L(mov3dqa3) 1532 1533 cmp $0x10,%r8 1534 jl L(movdqa_epi) 1535 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1536 sub $0x10,%r8 1537 lea 0x10(%rdx),%rdx 1538 movdqa %xmm3,%xmm2 # save for use next concat 1539 #palignr $0x3,%xmm1,%xmm3 1540 .byte 0x66,0x0f,0x3a,0x0f 1541 .byte 0xd9,0x03 1542 1543 cmp $0x10,%r8 1544 movdqa %xmm3,(%rcx) # store it 1545 lea 0x10(%rcx),%rcx 1546 jl L(movdqa_epi) 1547 1548 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1549 sub $0x10,%r8 1550 lea 0x10(%rdx),%rdx 1551 #palignr $0x3,%xmm2,%xmm0 1552 .byte 0x66,0x0f,0x3a,0x0f 1553 .byte 0xc2,0x03 1554 movdqa %xmm0,(%rcx) # store it 1555 lea 0x10(%rcx),%rcx 1556 jmp L(movdqa_epi) 1557 1558 .balign 16 1559L(mov3dqa4): 1560 movdqa 0x10(%rdx),%xmm3 1561 sub $0x30,%r8 1562 movdqa 0x20(%rdx),%xmm0 1563 movdqa 0x30(%rdx),%xmm5 1564 lea 0x30(%rdx),%rdx 1565 cmp $0x30,%r8 1566 1567 movdqa %xmm3,%xmm2 1568 #palignr $0x4,%xmm1,%xmm3 1569 .byte 0x66,0x0f,0x3a,0x0f 1570 .byte 0xd9,0x04 1571 movdqa %xmm3,(%rcx) 1572 1573 movdqa %xmm0,%xmm4 1574 #palignr $0x4,%xmm2,%xmm0 1575 .byte 0x66,0x0f,0x3a,0x0f 1576 .byte 0xc2,0x04 1577 movdqa %xmm0,0x10(%rcx) 1578 1579 movdqa %xmm5,%xmm1 1580 #palignr $0x4,%xmm4,%xmm5 1581 .byte 0x66,0x0f,0x3a,0x0f 1582 .byte 0xec,0x04 1583 movdqa %xmm5,0x20(%rcx) 1584 1585 lea 0x30(%rcx),%rcx 1586 jge L(mov3dqa4) 1587 1588 cmp $0x10,%r8 1589 jl L(movdqa_epi) 1590 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1591 sub $0x10,%r8 1592 lea 0x10(%rdx),%rdx 1593 movdqa 
%xmm3,%xmm2 # save for use next concat 1594 #palignr $0x4,%xmm1,%xmm3 1595 .byte 0x66,0x0f,0x3a,0x0f 1596 .byte 0xd9,0x04 1597 1598 cmp $0x10,%r8 1599 movdqa %xmm3,(%rcx) # store it 1600 lea 0x10(%rcx),%rcx 1601 jl L(movdqa_epi) 1602 1603 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1604 sub $0x10,%r8 1605 lea 0x10(%rdx),%rdx 1606 #palignr $0x4,%xmm2,%xmm0 1607 .byte 0x66,0x0f,0x3a,0x0f 1608 .byte 0xc2,0x04 1609 movdqa %xmm0,(%rcx) # store it 1610 lea 0x10(%rcx),%rcx 1611 jmp L(movdqa_epi) 1612 1613 .balign 16 1614L(mov3dqa5): 1615 movdqa 0x10(%rdx),%xmm3 1616 sub $0x30,%r8 1617 movdqa 0x20(%rdx),%xmm0 1618 movdqa 0x30(%rdx),%xmm5 1619 lea 0x30(%rdx),%rdx 1620 cmp $0x30,%r8 1621 1622 movdqa %xmm3,%xmm2 1623 #palignr $0x5,%xmm1,%xmm3 1624 .byte 0x66,0x0f,0x3a,0x0f 1625 .byte 0xd9,0x05 1626 movdqa %xmm3,(%rcx) 1627 1628 movdqa %xmm0,%xmm4 1629 #palignr $0x5,%xmm2,%xmm0 1630 .byte 0x66,0x0f,0x3a,0x0f 1631 .byte 0xc2,0x05 1632 movdqa %xmm0,0x10(%rcx) 1633 1634 movdqa %xmm5,%xmm1 1635 #palignr $0x5,%xmm4,%xmm5 1636 .byte 0x66,0x0f,0x3a,0x0f 1637 .byte 0xec,0x05 1638 movdqa %xmm5,0x20(%rcx) 1639 1640 lea 0x30(%rcx),%rcx 1641 jge L(mov3dqa5) 1642 1643 cmp $0x10,%r8 1644 jl L(movdqa_epi) 1645 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1646 sub $0x10,%r8 1647 lea 0x10(%rdx),%rdx 1648 movdqa %xmm3,%xmm2 # save for use next concat 1649 #palignr $0x5,%xmm1,%xmm3 1650 .byte 0x66,0x0f,0x3a,0x0f 1651 .byte 0xd9,0x05 1652 1653 cmp $0x10,%r8 1654 movdqa %xmm3,(%rcx) # store it 1655 lea 0x10(%rcx),%rcx 1656 jl L(movdqa_epi) 1657 1658 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1659 sub $0x10,%r8 1660 lea 0x10(%rdx),%rdx 1661 #palignr $0x5,%xmm2,%xmm0 1662 .byte 0x66,0x0f,0x3a,0x0f 1663 .byte 0xc2,0x05 1664 movdqa %xmm0,(%rcx) # store it 1665 lea 0x10(%rcx),%rcx 1666 jmp L(movdqa_epi) 1667 1668 .balign 16 1669L(mov3dqa6): 1670 movdqa 0x10(%rdx),%xmm3 1671 sub $0x30,%r8 1672 movdqa 0x20(%rdx),%xmm0 1673 movdqa 0x30(%rdx),%xmm5 1674 lea 0x30(%rdx),%rdx 1675 cmp $0x30,%r8 1676 1677 movdqa %xmm3,%xmm2 1678 #palignr $0x6,%xmm1,%xmm3 1679 .byte 0x66,0x0f,0x3a,0x0f 1680 .byte 0xd9,0x06 1681 movdqa %xmm3,(%rcx) 1682 1683 movdqa %xmm0,%xmm4 1684 #palignr $0x6,%xmm2,%xmm0 1685 .byte 0x66,0x0f,0x3a,0x0f 1686 .byte 0xc2,0x06 1687 movdqa %xmm0,0x10(%rcx) 1688 1689 movdqa %xmm5,%xmm1 1690 #palignr $0x6,%xmm4,%xmm5 1691 .byte 0x66,0x0f,0x3a,0x0f 1692 .byte 0xec,0x06 1693 movdqa %xmm5,0x20(%rcx) 1694 1695 lea 0x30(%rcx),%rcx 1696 jge L(mov3dqa6) 1697 1698 cmp $0x10,%r8 1699 jl L(movdqa_epi) 1700 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1701 sub $0x10,%r8 1702 lea 0x10(%rdx),%rdx 1703 movdqa %xmm3,%xmm2 # save for use next concat 1704 #palignr $0x6,%xmm1,%xmm3 1705 .byte 0x66,0x0f,0x3a,0x0f 1706 .byte 0xd9,0x06 1707 1708 cmp $0x10,%r8 1709 movdqa %xmm3,(%rcx) # store it 1710 lea 0x10(%rcx),%rcx 1711 jl L(movdqa_epi) 1712 1713 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1714 sub $0x10,%r8 1715 lea 0x10(%rdx),%rdx 1716 #palignr $0x6,%xmm2,%xmm0 1717 .byte 0x66,0x0f,0x3a,0x0f 1718 .byte 0xc2,0x06 1719 movdqa %xmm0,(%rcx) # store it 1720 lea 0x10(%rcx),%rcx 1721 jmp L(movdqa_epi) 1722 1723 .balign 16 1724L(mov3dqa7): 1725 movdqa 0x10(%rdx),%xmm3 1726 sub $0x30,%r8 1727 movdqa 0x20(%rdx),%xmm0 1728 movdqa 0x30(%rdx),%xmm5 1729 lea 0x30(%rdx),%rdx 1730 cmp $0x30,%r8 1731 1732 movdqa %xmm3,%xmm2 1733 #palignr $0x7,%xmm1,%xmm3 1734 .byte 0x66,0x0f,0x3a,0x0f 1735 .byte 0xd9,0x07 1736 movdqa %xmm3,(%rcx) 1737 1738 movdqa %xmm0,%xmm4 1739 #palignr $0x7,%xmm2,%xmm0 1740 .byte 
0x66,0x0f,0x3a,0x0f 1741 .byte 0xc2,0x07 1742 movdqa %xmm0,0x10(%rcx) 1743 1744 movdqa %xmm5,%xmm1 1745 #palignr $0x7,%xmm4,%xmm5 1746 .byte 0x66,0x0f,0x3a,0x0f 1747 .byte 0xec,0x07 1748 movdqa %xmm5,0x20(%rcx) 1749 1750 lea 0x30(%rcx),%rcx 1751 jge L(mov3dqa7) 1752 1753 cmp $0x10,%r8 1754 jl L(movdqa_epi) 1755 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1756 sub $0x10,%r8 1757 lea 0x10(%rdx),%rdx 1758 movdqa %xmm3,%xmm2 # save for use next concat 1759 #palignr $0x7,%xmm1,%xmm3 1760 .byte 0x66,0x0f,0x3a,0x0f 1761 .byte 0xd9,0x07 1762 1763 cmp $0x10,%r8 1764 movdqa %xmm3,(%rcx) # store it 1765 lea 0x10(%rcx),%rcx 1766 jl L(movdqa_epi) 1767 1768 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1769 sub $0x10,%r8 1770 lea 0x10(%rdx),%rdx 1771 #palignr $0x7,%xmm2,%xmm0 1772 .byte 0x66,0x0f,0x3a,0x0f 1773 .byte 0xc2,0x07 1774 movdqa %xmm0,(%rcx) # store it 1775 lea 0x10(%rcx),%rcx 1776 jmp L(movdqa_epi) 1777 1778 .balign 16 1779L(mov3dqa9): 1780 movdqa 0x10(%rdx),%xmm3 1781 sub $0x30,%r8 1782 movdqa 0x20(%rdx),%xmm0 1783 movdqa 0x30(%rdx),%xmm5 1784 lea 0x30(%rdx),%rdx 1785 cmp $0x30,%r8 1786 1787 movdqa %xmm3,%xmm2 1788 #palignr $0x9,%xmm1,%xmm3 1789 .byte 0x66,0x0f,0x3a,0x0f 1790 .byte 0xd9,0x09 1791 movdqa %xmm3,(%rcx) 1792 1793 movdqa %xmm0,%xmm4 1794 #palignr $0x9,%xmm2,%xmm0 1795 .byte 0x66,0x0f,0x3a,0x0f 1796 .byte 0xc2,0x09 1797 movdqa %xmm0,0x10(%rcx) 1798 1799 movdqa %xmm5,%xmm1 1800 #palignr $0x9,%xmm4,%xmm5 1801 .byte 0x66,0x0f,0x3a,0x0f 1802 .byte 0xec,0x09 1803 movdqa %xmm5,0x20(%rcx) 1804 1805 lea 0x30(%rcx),%rcx 1806 jge L(mov3dqa9) 1807 1808 cmp $0x10,%r8 1809 jl L(movdqa_epi) 1810 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1811 sub $0x10,%r8 1812 lea 0x10(%rdx),%rdx 1813 movdqa %xmm3,%xmm2 # save for use next concat 1814 #palignr $0x9,%xmm1,%xmm3 1815 .byte 0x66,0x0f,0x3a,0x0f 1816 .byte 0xd9,0x09 1817 1818 cmp $0x10,%r8 1819 movdqa %xmm3,(%rcx) # store it 1820 lea 0x10(%rcx),%rcx 1821 jl L(movdqa_epi) 1822 1823 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1824 sub $0x10,%r8 1825 lea 0x10(%rdx),%rdx 1826 #palignr $0x9,%xmm2,%xmm0 1827 .byte 0x66,0x0f,0x3a,0x0f 1828 .byte 0xc2,0x09 1829 movdqa %xmm0,(%rcx) # store it 1830 lea 0x10(%rcx),%rcx 1831 jmp L(movdqa_epi) 1832 1833 .balign 16 1834L(mov3dqa10): 1835 movdqa 0x10(%rdx),%xmm3 1836 sub $0x30,%r8 1837 movdqa 0x20(%rdx),%xmm0 1838 movdqa 0x30(%rdx),%xmm5 1839 lea 0x30(%rdx),%rdx 1840 cmp $0x30,%r8 1841 1842 movdqa %xmm3,%xmm2 1843 #palignr $0xa,%xmm1,%xmm3 1844 .byte 0x66,0x0f,0x3a,0x0f 1845 .byte 0xd9,0x0a 1846 movdqa %xmm3,(%rcx) 1847 1848 movdqa %xmm0,%xmm4 1849 #palignr $0xa,%xmm2,%xmm0 1850 .byte 0x66,0x0f,0x3a,0x0f 1851 .byte 0xc2,0x0a 1852 movdqa %xmm0,0x10(%rcx) 1853 1854 movdqa %xmm5,%xmm1 1855 #palignr $0xa,%xmm4,%xmm5 1856 .byte 0x66,0x0f,0x3a,0x0f 1857 .byte 0xec,0x0a 1858 movdqa %xmm5,0x20(%rcx) 1859 1860 lea 0x30(%rcx),%rcx 1861 jge L(mov3dqa10) 1862 1863 cmp $0x10,%r8 1864 jl L(movdqa_epi) 1865 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1866 sub $0x10,%r8 1867 lea 0x10(%rdx),%rdx 1868 movdqa %xmm3,%xmm2 # save for use next concat 1869 #palignr $0xa,%xmm1,%xmm3 1870 .byte 0x66,0x0f,0x3a,0x0f 1871 .byte 0xd9,0x0a 1872 1873 cmp $0x10,%r8 1874 movdqa %xmm3,(%rcx) # store it 1875 lea 0x10(%rcx),%rcx 1876 jl L(movdqa_epi) 1877 1878 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1879 sub $0x10,%r8 1880 lea 0x10(%rdx),%rdx 1881 #palignr $0xa,%xmm2,%xmm0 1882 .byte 0x66,0x0f,0x3a,0x0f 1883 .byte 0xc2,0x0a 1884 movdqa %xmm0,(%rcx) # store it 1885 lea 
0x10(%rcx),%rcx 1886 jmp L(movdqa_epi) 1887 1888 .balign 16 1889L(mov3dqa11): 1890 movdqa 0x10(%rdx),%xmm3 1891 sub $0x30,%r8 1892 movdqa 0x20(%rdx),%xmm0 1893 movdqa 0x30(%rdx),%xmm5 1894 lea 0x30(%rdx),%rdx 1895 cmp $0x30,%r8 1896 1897 movdqa %xmm3,%xmm2 1898 #palignr $0xb,%xmm1,%xmm3 1899 .byte 0x66,0x0f,0x3a,0x0f 1900 .byte 0xd9,0x0b 1901 movdqa %xmm3,(%rcx) 1902 1903 movdqa %xmm0,%xmm4 1904 #palignr $0xb,%xmm2,%xmm0 1905 .byte 0x66,0x0f,0x3a,0x0f 1906 .byte 0xc2,0x0b 1907 movdqa %xmm0,0x10(%rcx) 1908 1909 movdqa %xmm5,%xmm1 1910 #palignr $0xb,%xmm4,%xmm5 1911 .byte 0x66,0x0f,0x3a,0x0f 1912 .byte 0xec,0x0b 1913 movdqa %xmm5,0x20(%rcx) 1914 1915 lea 0x30(%rcx),%rcx 1916 jge L(mov3dqa11) 1917 1918 cmp $0x10,%r8 1919 jl L(movdqa_epi) 1920 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1921 sub $0x10,%r8 1922 lea 0x10(%rdx),%rdx 1923 movdqa %xmm3,%xmm2 # save for use next concat 1924 #palignr $0xb,%xmm1,%xmm3 1925 .byte 0x66,0x0f,0x3a,0x0f 1926 .byte 0xd9,0x0b 1927 1928 cmp $0x10,%r8 1929 movdqa %xmm3,(%rcx) # store it 1930 lea 0x10(%rcx),%rcx 1931 jl L(movdqa_epi) 1932 1933 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1934 sub $0x10,%r8 1935 lea 0x10(%rdx),%rdx 1936 #palignr $0xb,%xmm2,%xmm0 1937 .byte 0x66,0x0f,0x3a,0x0f 1938 .byte 0xc2,0x0b 1939 movdqa %xmm0,(%rcx) # store it 1940 lea 0x10(%rcx),%rcx 1941 jmp L(movdqa_epi) 1942 1943 .balign 16 1944L(mov3dqa12): 1945 movdqa 0x10(%rdx),%xmm3 1946 sub $0x30,%r8 1947 movdqa 0x20(%rdx),%xmm0 1948 movdqa 0x30(%rdx),%xmm5 1949 lea 0x30(%rdx),%rdx 1950 cmp $0x30,%r8 1951 1952 movdqa %xmm3,%xmm2 1953 #palignr $0xc,%xmm1,%xmm3 1954 .byte 0x66,0x0f,0x3a,0x0f 1955 .byte 0xd9,0x0c 1956 movdqa %xmm3,(%rcx) 1957 1958 movdqa %xmm0,%xmm4 1959 #palignr $0xc,%xmm2,%xmm0 1960 .byte 0x66,0x0f,0x3a,0x0f 1961 .byte 0xc2,0x0c 1962 movdqa %xmm0,0x10(%rcx) 1963 1964 movdqa %xmm5,%xmm1 1965 #palignr $0xc,%xmm4,%xmm5 1966 .byte 0x66,0x0f,0x3a,0x0f 1967 .byte 0xec,0x0c 1968 movdqa %xmm5,0x20(%rcx) 1969 1970 lea 0x30(%rcx),%rcx 1971 jge L(mov3dqa12) 1972 1973 cmp $0x10,%r8 1974 jl L(movdqa_epi) 1975 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1976 sub $0x10,%r8 1977 lea 0x10(%rdx),%rdx 1978 movdqa %xmm3,%xmm2 # save for use next concat 1979 #palignr $0xc,%xmm1,%xmm3 1980 .byte 0x66,0x0f,0x3a,0x0f 1981 .byte 0xd9,0x0c 1982 1983 cmp $0x10,%r8 1984 movdqa %xmm3,(%rcx) # store it 1985 lea 0x10(%rcx),%rcx 1986 jl L(movdqa_epi) 1987 1988 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1989 sub $0x10,%r8 1990 lea 0x10(%rdx),%rdx 1991 #palignr $0xc,%xmm2,%xmm0 1992 .byte 0x66,0x0f,0x3a,0x0f 1993 .byte 0xc2,0x0c 1994 movdqa %xmm0,(%rcx) # store it 1995 lea 0x10(%rcx),%rcx 1996 jmp L(movdqa_epi) 1997 1998 .balign 16 1999L(mov3dqa13): 2000 movdqa 0x10(%rdx),%xmm3 2001 sub $0x30,%r8 2002 movdqa 0x20(%rdx),%xmm0 2003 movdqa 0x30(%rdx),%xmm5 2004 lea 0x30(%rdx),%rdx 2005 cmp $0x30,%r8 2006 2007 movdqa %xmm3,%xmm2 2008 #palignr $0xd,%xmm1,%xmm3 2009 .byte 0x66,0x0f,0x3a,0x0f 2010 .byte 0xd9,0x0d 2011 movdqa %xmm3,(%rcx) 2012 2013 movdqa %xmm0,%xmm4 2014 #palignr $0xd,%xmm2,%xmm0 2015 .byte 0x66,0x0f,0x3a,0x0f 2016 .byte 0xc2,0x0d 2017 movdqa %xmm0,0x10(%rcx) 2018 2019 movdqa %xmm5,%xmm1 2020 #palignr $0xd,%xmm4,%xmm5 2021 .byte 0x66,0x0f,0x3a,0x0f 2022 .byte 0xec,0x0d 2023 movdqa %xmm5,0x20(%rcx) 2024 2025 lea 0x30(%rcx),%rcx 2026 jge L(mov3dqa13) 2027 2028 cmp $0x10,%r8 2029 jl L(movdqa_epi) 2030 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2031 sub $0x10,%r8 2032 lea 0x10(%rdx),%rdx 2033 movdqa %xmm3,%xmm2 # save for use next concat 
2034 #palignr $0xd,%xmm1,%xmm3 2035 .byte 0x66,0x0f,0x3a,0x0f 2036 .byte 0xd9,0x0d 2037 2038 cmp $0x10,%r8 2039 movdqa %xmm3,(%rcx) # store it 2040 lea 0x10(%rcx),%rcx 2041 jl L(movdqa_epi) 2042 2043 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2044 sub $0x10,%r8 2045 lea 0x10(%rdx),%rdx 2046 #palignr $0xd,%xmm2,%xmm0 2047 .byte 0x66,0x0f,0x3a,0x0f 2048 .byte 0xc2,0x0d 2049 movdqa %xmm0,(%rcx) # store it 2050 lea 0x10(%rcx),%rcx 2051 jmp L(movdqa_epi) 2052 2053 .balign 16 2054L(mov3dqa14): 2055 movdqa 0x10(%rdx),%xmm3 2056 sub $0x30,%r8 2057 movdqa 0x20(%rdx),%xmm0 2058 movdqa 0x30(%rdx),%xmm5 2059 lea 0x30(%rdx),%rdx 2060 cmp $0x30,%r8 2061 2062 movdqa %xmm3,%xmm2 2063 #palignr $0xe,%xmm1,%xmm3 2064 .byte 0x66,0x0f,0x3a,0x0f 2065 .byte 0xd9,0x0e 2066 movdqa %xmm3,(%rcx) 2067 2068 movdqa %xmm0,%xmm4 2069 #palignr $0xe,%xmm2,%xmm0 2070 .byte 0x66,0x0f,0x3a,0x0f 2071 .byte 0xc2,0x0e 2072 movdqa %xmm0,0x10(%rcx) 2073 2074 movdqa %xmm5,%xmm1 2075 #palignr $0xe,%xmm4,%xmm5 2076 .byte 0x66,0x0f,0x3a,0x0f 2077 .byte 0xec,0x0e 2078 movdqa %xmm5,0x20(%rcx) 2079 2080 lea 0x30(%rcx),%rcx 2081 jge L(mov3dqa14) 2082 2083 cmp $0x10,%r8 2084 jl L(movdqa_epi) 2085 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2086 sub $0x10,%r8 2087 lea 0x10(%rdx),%rdx 2088 movdqa %xmm3,%xmm2 # save for use next concat 2089 #palignr $0xe,%xmm1,%xmm3 2090 .byte 0x66,0x0f,0x3a,0x0f 2091 .byte 0xd9,0x0e 2092 2093 cmp $0x10,%r8 2094 movdqa %xmm3,(%rcx) # store it 2095 lea 0x10(%rcx),%rcx 2096 jl L(movdqa_epi) 2097 2098 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2099 sub $0x10,%r8 2100 lea 0x10(%rdx),%rdx 2101 #palignr $0xe,%xmm2,%xmm0 2102 .byte 0x66,0x0f,0x3a,0x0f 2103 .byte 0xc2,0x0e 2104 movdqa %xmm0,(%rcx) # store it 2105 lea 0x10(%rcx),%rcx 2106 jmp L(movdqa_epi) 2107 2108 .balign 16 2109L(mov3dqa15): 2110 movdqa 0x10(%rdx),%xmm3 2111 sub $0x30,%r8 2112 movdqa 0x20(%rdx),%xmm0 2113 movdqa 0x30(%rdx),%xmm5 2114 lea 0x30(%rdx),%rdx 2115 cmp $0x30,%r8 2116 2117 movdqa %xmm3,%xmm2 2118 #palignr $0xf,%xmm1,%xmm3 2119 .byte 0x66,0x0f,0x3a,0x0f 2120 .byte 0xd9,0x0f 2121 movdqa %xmm3,(%rcx) 2122 2123 movdqa %xmm0,%xmm4 2124 #palignr $0xf,%xmm2,%xmm0 2125 .byte 0x66,0x0f,0x3a,0x0f 2126 .byte 0xc2,0x0f 2127 movdqa %xmm0,0x10(%rcx) 2128 2129 movdqa %xmm5,%xmm1 2130 #palignr $0xf,%xmm4,%xmm5 2131 .byte 0x66,0x0f,0x3a,0x0f 2132 .byte 0xec,0x0f 2133 movdqa %xmm5,0x20(%rcx) 2134 2135 lea 0x30(%rcx),%rcx 2136 jge L(mov3dqa15) 2137 2138 cmp $0x10,%r8 2139 jl L(movdqa_epi) 2140 movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2141 sub $0x10,%r8 2142 lea 0x10(%rdx),%rdx 2143 movdqa %xmm3,%xmm2 # save for use next concat 2144 #palignr $0xf,%xmm1,%xmm3 2145 .byte 0x66,0x0f,0x3a,0x0f 2146 .byte 0xd9,0x0f 2147 2148 cmp $0x10,%r8 2149 movdqa %xmm3,(%rcx) # store it 2150 lea 0x10(%rcx),%rcx 2151 jl L(movdqa_epi) 2152 2153 movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2154 sub $0x10,%r8 2155 lea 0x10(%rdx),%rdx 2156 #palignr $0xf,%xmm2,%xmm0 2157 .byte 0x66,0x0f,0x3a,0x0f 2158 .byte 0xc2,0x0f 2159 movdqa %xmm0,(%rcx) # store it 2160 lea 0x10(%rcx),%rcx 2161 jmp L(movdqa_epi) 2162 2163 .balign 16 2164L(sse2_nt_move): 2165 lea 0x40(%rcx),%rcx 2166 lea 0x40(%rdx),%rdx 2167 lea -0x40(%r8),%r8 2168 2169 /* 2170 * doesn't matter if source is aligned for stuff out of cache. 2171 * the mis-aligned penalty is masked by the slowness of main memory. 
2172 */ 2173 prefetchnta 0x180(%rdx) 2174 movdqu -0x40(%rdx),%xmm0 2175 movdqu -0x30(%rdx),%xmm1 2176 2177 cmp $0x40,%r8 2178 movntdq %xmm0,-0x40(%rcx) 2179 movntdq %xmm1,-0x30(%rcx) 2180 2181 movdqu -0x20(%rdx),%xmm2 2182 movdqu -0x10(%rdx),%xmm3 2183 2184 movntdq %xmm2,-0x20(%rcx) 2185 movntdq %xmm3,-0x10(%rcx) 2186 2187 jge L(sse2_nt_move) 2188 2189 lea L(Fix16EndTable)(%rip),%r10 2190 mov %r8,%r9 2191 and $0xFFFFFFFFFFFFFFF0,%r9 2192 add %r9,%rcx 2193 add %r9,%rdx 2194 sub %r9,%r8 2195 shr $0x4,%r9 2196 sfence 2197 2198 movslq (%r10,%r9,4),%r11 2199 lea (%r11,%r10,1),%r10 2200 jmpq *%r10 2201 2202 .balign 16 2203L(Fix16EndTable): 2204 .int L(fix16_0)-L(Fix16EndTable) 2205 .int L(fix16_1)-L(Fix16EndTable) 2206 .int L(fix16_2)-L(Fix16EndTable) 2207 .int L(fix16_3)-L(Fix16EndTable) 2208 2209 .balign 16 2210L(fix16_3): 2211 movdqu -0x30(%rdx),%xmm1 2212 movdqa %xmm1,-0x30(%rcx) 2213L(fix16_2): 2214 movdqu -0x20(%rdx),%xmm2 2215 movdqa %xmm2,-0x20(%rcx) 2216L(fix16_1): 2217 movdqu -0x10(%rdx),%xmm3 2218 movdqa %xmm3,-0x10(%rcx) 2219L(fix16_0): 2220 lea L(fwdPxQx)(%rip),%r10 2221 add %r8,%rdx 2222 add %r8,%rcx 2223 2224 movslq (%r10,%r8,4),%r9 2225 lea (%r9,%r10,1),%r10 2226 jmpq *%r10 2227 2228 .balign 16 2229L(pre_both_aligned): 2230 cmp $0x80,%r8 2231 jl L(fix_16b) 2232 2233 .balign 16 2234L(both_aligned): 2235 2236 /* 2237 * this 'paired' load/load/store/store seems to do best. 2238 */ 2239 movdqa (%rdx),%xmm0 2240 movdqa 0x10(%rdx),%xmm1 2241 2242 movdqa %xmm0,(%rcx) 2243 movdqa %xmm1,0x10(%rcx) 2244 lea -0x80(%r8),%r8 2245 2246 movdqa 0x20(%rdx),%xmm2 2247 movdqa 0x30(%rdx),%xmm3 2248 2249 movdqa %xmm2,0x20(%rcx) 2250 movdqa %xmm3,0x30(%rcx) 2251 2252 movdqa 0x40(%rdx),%xmm0 2253 movdqa 0x50(%rdx),%xmm1 2254 cmp $0x80,%r8 2255 2256 movdqa %xmm0,0x40(%rcx) 2257 movdqa %xmm1,0x50(%rcx) 2258 2259 movdqa 0x60(%rdx),%xmm2 2260 movdqa 0x70(%rdx),%xmm3 2261 lea 0x80(%rdx),%rdx 2262 movdqa %xmm2,0x60(%rcx) 2263 movdqa %xmm3,0x70(%rcx) 2264 lea 0x80(%rcx),%rcx 2265 jge L(both_aligned) 2266 2267L(fix_16b): 2268 add %r8,%rcx 2269 lea L(fwdPxQx)(%rip),%r10 2270 add %r8,%rdx 2271 2272 movslq (%r10,%r8,4),%r9 2273 lea (%r9,%r10,1),%r10 2274 jmpq *%r10 2275 2276 .balign 16 2277L(Loop8byte_pre): 2278 # Use 8-byte moves 2279 mov .largest_level_cache_size(%rip),%r9d 2280 shr %r9 # take half of it 2281 cmp %r9,%r8 2282 jge L(byte8_nt_top) 2283 # Find out whether to use rep movsq 2284 cmp $4096,%r8 2285 jle L(byte8_top) 2286 mov .amd64cache1half(%rip),%r9d # half of l1 cache 2287 cmp %r9,%r8 2288 jle L(use_rep) 2289 2290 .balign 16 2291L(byte8_top): 2292 mov (%rdx),%r9 2293 mov 0x8(%rdx),%r10 2294 lea -0x40(%r8),%r8 2295 mov %r9,(%rcx) 2296 mov %r10,0x8(%rcx) 2297 mov 0x10(%rdx),%r11 2298 mov 0x18(%rdx),%r9 2299 mov %r11,0x10(%rcx) 2300 mov %r9,0x18(%rcx) 2301 2302 cmp $0x40,%r8 2303 mov 0x20(%rdx),%r10 2304 mov 0x28(%rdx),%r11 2305 mov %r10,0x20(%rcx) 2306 mov %r11,0x28(%rcx) 2307 mov 0x30(%rdx),%r9 2308 mov 0x38(%rdx),%r10 2309 lea 0x40(%rdx),%rdx 2310 mov %r9,0x30(%rcx) 2311 mov %r10,0x38(%rcx) 2312 lea 0x40(%rcx),%rcx 2313 jg L(byte8_top) 2314 2315L(byte8_end): 2316 lea L(fwdPxQx)(%rip),%r10 2317 lea (%rdx,%r8,1),%rdx 2318 lea (%rcx,%r8,1),%rcx 2319 2320 movslq (%r10,%r8,4),%r9 2321 lea (%r9,%r10,1),%r10 2322 jmpq *%r10 2323 2324 .balign 16 2325L(use_rep): 2326 mov %rdx,%rsi # %rsi = source 2327 mov %rcx,%rdi # %rdi = destination 2328 mov %r8,%rcx # %rcx = count 2329 shrq $3,%rcx # 8-byte word count 2330 rep 2331 movsq 2332 mov %rsi,%rdx # source 2333 mov %rdi,%rcx # destination 2334 andq $7,%r8 # 
remainder 2335 jnz L(byte8_end) 2336 ret 2337 2338 .balign 16 2339L(byte8_nt_top): 2340 sub $0x40,%r8 2341 prefetchnta 0x180(%rdx) 2342 mov (%rdx),%r9 2343 movnti %r9,(%rcx) 2344 mov 0x8(%rdx),%r10 2345 movnti %r10,0x8(%rcx) 2346 mov 0x10(%rdx),%r11 2347 movnti %r11,0x10(%rcx) 2348 mov 0x18(%rdx),%r9 2349 movnti %r9,0x18(%rcx) 2350 mov 0x20(%rdx),%r10 2351 movnti %r10,0x20(%rcx) 2352 mov 0x28(%rdx),%r11 2353 movnti %r11,0x28(%rcx) 2354 mov 0x30(%rdx),%r9 2355 movnti %r9,0x30(%rcx) 2356 mov 0x38(%rdx),%r10 2357 movnti %r10,0x38(%rcx) 2358 2359 lea 0x40(%rdx),%rdx 2360 lea 0x40(%rcx),%rcx 2361 cmp $0x40,%r8 2362 jge L(byte8_nt_top) 2363 sfence 2364 jmp L(byte8_end) 2365 2366 SET_SIZE(memcpy) 2367 2368 .balign 16 2369L(CopyBackwards): 2370 mov %rdx,%r8 2371 mov %rdi,%rcx 2372 mov %rsi,%rdx 2373 mov %rdi,%rax # return value 2374 2375 # ck alignment of last byte 2376 lea (%rcx,%r8,1),%rcx 2377 test $0x7,%rcx 2378 lea (%rdx,%r8,1),%rdx 2379 jne L(bk_align) 2380 2381L(bk_qw_aligned): 2382 lea L(bkPxQx)(%rip),%r10 2383 2384 cmp $0x90,%r8 # 144 2385 jg L(bk_ck_sse2_alignment) 2386 2387 sub %r8,%rcx 2388 sub %r8,%rdx 2389 2390 movslq (%r10,%r8,4),%r9 2391 lea (%r9,%r10,1),%r10 2392 jmpq *%r10 2393 2394 .balign 16 2395L(bk_align): 2396 # only align if len > 8 2397 cmp $8,%r8 2398 jle L(bk_qw_aligned) 2399 test $0x1,%rcx 2400 je L(bk_tst2) 2401 dec %rcx 2402 dec %rdx 2403 dec %r8 2404 mov (%rdx),%r9b 2405 mov %r9b,(%rcx) 2406 2407L(bk_tst2): 2408 test $0x2,%rcx 2409 je L(bk_tst3) 2410 2411L(bk_got2): 2412 sub $0x2,%rcx 2413 sub $0x2,%rdx 2414 sub $0x2,%r8 2415 movzwq (%rdx),%r9 2416 mov %r9w,(%rcx) 2417 2418L(bk_tst3): 2419 test $0x4,%rcx 2420 je L(bk_qw_aligned) 2421 2422L(bk_got3): 2423 sub $0x4,%rcx 2424 sub $0x4,%rdx 2425 sub $0x4,%r8 2426 mov (%rdx),%r9d 2427 mov %r9d,(%rcx) 2428 jmp L(bk_qw_aligned) 2429 2430 .balign 16 2431L(bk_ck_sse2_alignment): 2432 cmpl $NO_SSE,.memops_method(%rip) 2433 je L(bk_use_rep) 2434 # check alignment of last byte 2435 test $0xf,%rcx 2436 jz L(bk_sse2_cpy) 2437 2438L(bk_sse2_align): 2439 # only here if already aligned on at least a qword bndry 2440 sub $0x8,%rcx 2441 sub $0x8,%rdx 2442 sub $0x8,%r8 2443 mov (%rdx),%r9 2444 mov %r9,(%rcx) 2445 #jmp L(bk_sse2_cpy) 2446 2447 .balign 16 2448L(bk_sse2_cpy): 2449 sub $0x80,%rcx # 128 2450 sub $0x80,%rdx 2451 movdqu 0x70(%rdx),%xmm3 2452 movdqu 0x60(%rdx),%xmm2 2453 movdqa %xmm3,0x70(%rcx) 2454 movdqa %xmm2,0x60(%rcx) 2455 sub $0x80,%r8 2456 movdqu 0x50(%rdx),%xmm1 2457 movdqu 0x40(%rdx),%xmm0 2458 movdqa %xmm1,0x50(%rcx) 2459 movdqa %xmm0,0x40(%rcx) 2460 2461 cmp $0x80,%r8 2462 movdqu 0x30(%rdx),%xmm3 2463 movdqu 0x20(%rdx),%xmm2 2464 movdqa %xmm3,0x30(%rcx) 2465 movdqa %xmm2,0x20(%rcx) 2466 movdqu 0x10(%rdx),%xmm1 2467 movdqu (%rdx),%xmm0 2468 movdqa %xmm1,0x10(%rcx) 2469 movdqa %xmm0,(%rcx) 2470 jge L(bk_sse2_cpy) 2471 2472L(bk_sse2_cpy_end): 2473 lea L(bkPxQx)(%rip),%r10 2474 sub %r8,%rdx 2475 sub %r8,%rcx 2476 movslq (%r10,%r8,4),%r9 2477 lea (%r9,%r10,1),%r10 2478 jmpq *%r10 2479 2480 .balign 16 2481L(bk_use_rep): 2482 xchg %rcx,%r9 2483 mov %rdx,%rsi # source 2484 mov %r9,%rdi # destination 2485 mov %r8,%rcx # count 2486 sub $8,%rsi 2487 sub $8,%rdi 2488 shr $3,%rcx 2489 std # reverse direction 2490 rep 2491 movsq 2492 cld # reset direction flag 2493 2494 xchg %rcx,%r9 2495 lea L(bkPxQx)(%rip),%r10 2496 sub %r8,%rdx 2497 sub %r8,%rcx 2498 andq $7,%r8 # remainder 2499 jz 2f 2500 movslq (%r10,%r8,4),%r9 2501 lea (%r9,%r10,1),%r10 2502 jmpq *%r10 25032: 2504 ret 2505 2506 .balign 16 2507L(bkP0QI): 2508 mov 
0x88(%rdx),%r10 2509 mov %r10,0x88(%rcx) 2510L(bkP0QH): 2511 mov 0x80(%rdx),%r10 2512 mov %r10,0x80(%rcx) 2513L(bkP0QG): 2514 mov 0x78(%rdx),%r9 2515 mov %r9,0x78(%rcx) 2516L(bkP0QF): 2517 mov 0x70(%rdx),%r11 2518 mov %r11,0x70(%rcx) 2519L(bkP0QE): 2520 mov 0x68(%rdx),%r10 2521 mov %r10,0x68(%rcx) 2522L(bkP0QD): 2523 mov 0x60(%rdx),%r9 2524 mov %r9,0x60(%rcx) 2525L(bkP0QC): 2526 mov 0x58(%rdx),%r11 2527 mov %r11,0x58(%rcx) 2528L(bkP0QB): 2529 mov 0x50(%rdx),%r10 2530 mov %r10,0x50(%rcx) 2531L(bkP0QA): 2532 mov 0x48(%rdx),%r9 2533 mov %r9,0x48(%rcx) 2534L(bkP0Q9): 2535 mov 0x40(%rdx),%r11 2536 mov %r11,0x40(%rcx) 2537L(bkP0Q8): 2538 mov 0x38(%rdx),%r10 2539 mov %r10,0x38(%rcx) 2540L(bkP0Q7): 2541 mov 0x30(%rdx),%r9 2542 mov %r9,0x30(%rcx) 2543L(bkP0Q6): 2544 mov 0x28(%rdx),%r11 2545 mov %r11,0x28(%rcx) 2546L(bkP0Q5): 2547 mov 0x20(%rdx),%r10 2548 mov %r10,0x20(%rcx) 2549L(bkP0Q4): 2550 mov 0x18(%rdx),%r9 2551 mov %r9,0x18(%rcx) 2552L(bkP0Q3): 2553 mov 0x10(%rdx),%r11 2554 mov %r11,0x10(%rcx) 2555L(bkP0Q2): 2556 mov 0x8(%rdx),%r10 2557 mov %r10,0x8(%rcx) 2558L(bkP0Q1): 2559 mov (%rdx),%r9 2560 mov %r9,(%rcx) 2561L(bkP0Q0): 2562 ret 2563 2564 .balign 16 2565L(bkP1QI): 2566 mov 0x89(%rdx),%r10 2567 mov %r10,0x89(%rcx) 2568L(bkP1QH): 2569 mov 0x81(%rdx),%r11 2570 mov %r11,0x81(%rcx) 2571L(bkP1QG): 2572 mov 0x79(%rdx),%r10 2573 mov %r10,0x79(%rcx) 2574L(bkP1QF): 2575 mov 0x71(%rdx),%r9 2576 mov %r9,0x71(%rcx) 2577L(bkP1QE): 2578 mov 0x69(%rdx),%r11 2579 mov %r11,0x69(%rcx) 2580L(bkP1QD): 2581 mov 0x61(%rdx),%r10 2582 mov %r10,0x61(%rcx) 2583L(bkP1QC): 2584 mov 0x59(%rdx),%r9 2585 mov %r9,0x59(%rcx) 2586L(bkP1QB): 2587 mov 0x51(%rdx),%r11 2588 mov %r11,0x51(%rcx) 2589L(bkP1QA): 2590 mov 0x49(%rdx),%r10 2591 mov %r10,0x49(%rcx) 2592L(bkP1Q9): 2593 mov 0x41(%rdx),%r9 2594 mov %r9,0x41(%rcx) 2595L(bkP1Q8): 2596 mov 0x39(%rdx),%r11 2597 mov %r11,0x39(%rcx) 2598L(bkP1Q7): 2599 mov 0x31(%rdx),%r10 2600 mov %r10,0x31(%rcx) 2601L(bkP1Q6): 2602 mov 0x29(%rdx),%r9 2603 mov %r9,0x29(%rcx) 2604L(bkP1Q5): 2605 mov 0x21(%rdx),%r11 2606 mov %r11,0x21(%rcx) 2607L(bkP1Q4): 2608 mov 0x19(%rdx),%r10 2609 mov %r10,0x19(%rcx) 2610L(bkP1Q3): 2611 mov 0x11(%rdx),%r9 2612 mov %r9,0x11(%rcx) 2613L(bkP1Q2): 2614 mov 0x9(%rdx),%r11 2615 mov %r11,0x9(%rcx) 2616L(bkP1Q1): 2617 mov 0x1(%rdx),%r10 2618 mov %r10,0x1(%rcx) 2619L(bkP1Q0): 2620 mov (%rdx),%r9b 2621 mov %r9b,(%rcx) 2622 ret 2623 2624 .balign 16 2625L(bkP2QI): 2626 mov 0x8a(%rdx),%r10 2627 mov %r10,0x8a(%rcx) 2628L(bkP2QH): 2629 mov 0x82(%rdx),%r11 2630 mov %r11,0x82(%rcx) 2631L(bkP2QG): 2632 mov 0x7a(%rdx),%r10 2633 mov %r10,0x7a(%rcx) 2634L(bkP2QF): 2635 mov 0x72(%rdx),%r9 2636 mov %r9,0x72(%rcx) 2637L(bkP2QE): 2638 mov 0x6a(%rdx),%r11 2639 mov %r11,0x6a(%rcx) 2640L(bkP2QD): 2641 mov 0x62(%rdx),%r10 2642 mov %r10,0x62(%rcx) 2643L(bkP2QC): 2644 mov 0x5a(%rdx),%r9 2645 mov %r9,0x5a(%rcx) 2646L(bkP2QB): 2647 mov 0x52(%rdx),%r11 2648 mov %r11,0x52(%rcx) 2649L(bkP2QA): 2650 mov 0x4a(%rdx),%r10 2651 mov %r10,0x4a(%rcx) 2652L(bkP2Q9): 2653 mov 0x42(%rdx),%r9 2654 mov %r9,0x42(%rcx) 2655L(bkP2Q8): 2656 mov 0x3a(%rdx),%r11 2657 mov %r11,0x3a(%rcx) 2658L(bkP2Q7): 2659 mov 0x32(%rdx),%r10 2660 mov %r10,0x32(%rcx) 2661L(bkP2Q6): 2662 mov 0x2a(%rdx),%r9 2663 mov %r9,0x2a(%rcx) 2664L(bkP2Q5): 2665 mov 0x22(%rdx),%r11 2666 mov %r11,0x22(%rcx) 2667L(bkP2Q4): 2668 mov 0x1a(%rdx),%r10 2669 mov %r10,0x1a(%rcx) 2670L(bkP2Q3): 2671 mov 0x12(%rdx),%r9 2672 mov %r9,0x12(%rcx) 2673L(bkP2Q2): 2674 mov 0xa(%rdx),%r11 2675 mov %r11,0xa(%rcx) 2676L(bkP2Q1): 2677 mov 0x2(%rdx),%r10 2678 mov 
L(bkP2Q0):
	mov	(%rdx),%r9w
	mov	%r9w,(%rcx)
	ret

	.balign 16
L(bkP3QI):
	mov	0x8b(%rdx),%r10
	mov	%r10,0x8b(%rcx)
L(bkP3QH):
	mov	0x83(%rdx),%r11
	mov	%r11,0x83(%rcx)
L(bkP3QG):
	mov	0x7b(%rdx),%r10
	mov	%r10,0x7b(%rcx)
L(bkP3QF):
	mov	0x73(%rdx),%r9
	mov	%r9,0x73(%rcx)
L(bkP3QE):
	mov	0x6b(%rdx),%r11
	mov	%r11,0x6b(%rcx)
L(bkP3QD):
	mov	0x63(%rdx),%r10
	mov	%r10,0x63(%rcx)
L(bkP3QC):
	mov	0x5b(%rdx),%r9
	mov	%r9,0x5b(%rcx)
L(bkP3QB):
	mov	0x53(%rdx),%r11
	mov	%r11,0x53(%rcx)
L(bkP3QA):
	mov	0x4b(%rdx),%r10
	mov	%r10,0x4b(%rcx)
L(bkP3Q9):
	mov	0x43(%rdx),%r9
	mov	%r9,0x43(%rcx)
L(bkP3Q8):
	mov	0x3b(%rdx),%r11
	mov	%r11,0x3b(%rcx)
L(bkP3Q7):
	mov	0x33(%rdx),%r10
	mov	%r10,0x33(%rcx)
L(bkP3Q6):
	mov	0x2b(%rdx),%r9
	mov	%r9,0x2b(%rcx)
L(bkP3Q5):
	mov	0x23(%rdx),%r11
	mov	%r11,0x23(%rcx)
L(bkP3Q4):
	mov	0x1b(%rdx),%r10
	mov	%r10,0x1b(%rcx)
L(bkP3Q3):
	mov	0x13(%rdx),%r9
	mov	%r9,0x13(%rcx)
L(bkP3Q2):
	mov	0xb(%rdx),%r11
	mov	%r11,0xb(%rcx)
L(bkP3Q1):
	mov	0x3(%rdx),%r10
	mov	%r10,0x3(%rcx)
L(bkP3Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9w
	mov	%r9w,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret

	.balign 16
L(bkP4QI):
	mov	0x8c(%rdx),%r10
	mov	%r10,0x8c(%rcx)
L(bkP4QH):
	mov	0x84(%rdx),%r11
	mov	%r11,0x84(%rcx)
L(bkP4QG):
	mov	0x7c(%rdx),%r10
	mov	%r10,0x7c(%rcx)
L(bkP4QF):
	mov	0x74(%rdx),%r9
	mov	%r9,0x74(%rcx)
L(bkP4QE):
	mov	0x6c(%rdx),%r11
	mov	%r11,0x6c(%rcx)
L(bkP4QD):
	mov	0x64(%rdx),%r10
	mov	%r10,0x64(%rcx)
L(bkP4QC):
	mov	0x5c(%rdx),%r9
	mov	%r9,0x5c(%rcx)
L(bkP4QB):
	mov	0x54(%rdx),%r11
	mov	%r11,0x54(%rcx)
L(bkP4QA):
	mov	0x4c(%rdx),%r10
	mov	%r10,0x4c(%rcx)
L(bkP4Q9):
	mov	0x44(%rdx),%r9
	mov	%r9,0x44(%rcx)
L(bkP4Q8):
	mov	0x3c(%rdx),%r11
	mov	%r11,0x3c(%rcx)
L(bkP4Q7):
	mov	0x34(%rdx),%r10
	mov	%r10,0x34(%rcx)
L(bkP4Q6):
	mov	0x2c(%rdx),%r9
	mov	%r9,0x2c(%rcx)
L(bkP4Q5):
	mov	0x24(%rdx),%r11
	mov	%r11,0x24(%rcx)
L(bkP4Q4):
	mov	0x1c(%rdx),%r10
	mov	%r10,0x1c(%rcx)
L(bkP4Q3):
	mov	0x14(%rdx),%r9
	mov	%r9,0x14(%rcx)
L(bkP4Q2):
	mov	0xc(%rdx),%r11
	mov	%r11,0xc(%rcx)
L(bkP4Q1):
	mov	0x4(%rdx),%r10
	mov	%r10,0x4(%rcx)
L(bkP4Q0):
	mov	(%rdx),%r9d
	mov	%r9d,(%rcx)
	ret

	.balign 16
L(bkP5QI):
	mov	0x8d(%rdx),%r10
	mov	%r10,0x8d(%rcx)
L(bkP5QH):
	mov	0x85(%rdx),%r9
	mov	%r9,0x85(%rcx)
L(bkP5QG):
	mov	0x7d(%rdx),%r11
	mov	%r11,0x7d(%rcx)
L(bkP5QF):
	mov	0x75(%rdx),%r10
	mov	%r10,0x75(%rcx)
L(bkP5QE):
	mov	0x6d(%rdx),%r9
	mov	%r9,0x6d(%rcx)
L(bkP5QD):
	mov	0x65(%rdx),%r11
	mov	%r11,0x65(%rcx)
L(bkP5QC):
	mov	0x5d(%rdx),%r10
	mov	%r10,0x5d(%rcx)
L(bkP5QB):
	mov	0x55(%rdx),%r9
	mov	%r9,0x55(%rcx)
L(bkP5QA):
	mov	0x4d(%rdx),%r11
	mov	%r11,0x4d(%rcx)
L(bkP5Q9):
	mov	0x45(%rdx),%r10
	mov	%r10,0x45(%rcx)
L(bkP5Q8):
	mov	0x3d(%rdx),%r9
	mov	%r9,0x3d(%rcx)
L(bkP5Q7):
	mov	0x35(%rdx),%r11
	mov	%r11,0x35(%rcx)
L(bkP5Q6):
	mov	0x2d(%rdx),%r10
	mov	%r10,0x2d(%rcx)
L(bkP5Q5):
	mov	0x25(%rdx),%r9
	mov	%r9,0x25(%rcx)
L(bkP5Q4):
	mov	0x1d(%rdx),%r11
	mov	%r11,0x1d(%rcx)
L(bkP5Q3):
	mov	0x15(%rdx),%r10
	mov	%r10,0x15(%rcx)
L(bkP5Q2):
	mov	0xd(%rdx),%r9
	mov	%r9,0xd(%rcx)
L(bkP5Q1):
	mov	0x5(%rdx),%r11
	mov	%r11,0x5(%rcx)
L(bkP5Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x1(%rdx),%r9d
	mov	%r9d,0x1(%rcx)
	mov	(%rdx),%r10b
	mov	%r10b,(%rcx)
	ret

	.balign 16
L(bkP6QI):
	mov	0x8e(%rdx),%r10
	mov	%r10,0x8e(%rcx)
L(bkP6QH):
	mov	0x86(%rdx),%r11
	mov	%r11,0x86(%rcx)
L(bkP6QG):
	mov	0x7e(%rdx),%r10
	mov	%r10,0x7e(%rcx)
L(bkP6QF):
	mov	0x76(%rdx),%r9
	mov	%r9,0x76(%rcx)
L(bkP6QE):
	mov	0x6e(%rdx),%r11
	mov	%r11,0x6e(%rcx)
L(bkP6QD):
	mov	0x66(%rdx),%r10
	mov	%r10,0x66(%rcx)
L(bkP6QC):
	mov	0x5e(%rdx),%r9
	mov	%r9,0x5e(%rcx)
L(bkP6QB):
	mov	0x56(%rdx),%r11
	mov	%r11,0x56(%rcx)
L(bkP6QA):
	mov	0x4e(%rdx),%r10
	mov	%r10,0x4e(%rcx)
L(bkP6Q9):
	mov	0x46(%rdx),%r9
	mov	%r9,0x46(%rcx)
L(bkP6Q8):
	mov	0x3e(%rdx),%r11
	mov	%r11,0x3e(%rcx)
L(bkP6Q7):
	mov	0x36(%rdx),%r10
	mov	%r10,0x36(%rcx)
L(bkP6Q6):
	mov	0x2e(%rdx),%r9
	mov	%r9,0x2e(%rcx)
L(bkP6Q5):
	mov	0x26(%rdx),%r11
	mov	%r11,0x26(%rcx)
L(bkP6Q4):
	mov	0x1e(%rdx),%r10
	mov	%r10,0x1e(%rcx)
L(bkP6Q3):
	mov	0x16(%rdx),%r9
	mov	%r9,0x16(%rcx)
L(bkP6Q2):
	mov	0xe(%rdx),%r11
	mov	%r11,0xe(%rcx)
L(bkP6Q1):
	mov	0x6(%rdx),%r10
	mov	%r10,0x6(%rcx)
L(bkP6Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x2(%rdx),%r9d
	mov	%r9d,0x2(%rcx)
	mov	(%rdx),%r10w
	mov	%r10w,(%rcx)
	ret

	.balign 16
L(bkP7QI):
	mov	0x8f(%rdx),%r10
	mov	%r10,0x8f(%rcx)
L(bkP7QH):
	mov	0x87(%rdx),%r11
	mov	%r11,0x87(%rcx)
L(bkP7QG):
	mov	0x7f(%rdx),%r10
	mov	%r10,0x7f(%rcx)
L(bkP7QF):
	mov	0x77(%rdx),%r9
	mov	%r9,0x77(%rcx)
L(bkP7QE):
	mov	0x6f(%rdx),%r11
	mov	%r11,0x6f(%rcx)
L(bkP7QD):
	mov	0x67(%rdx),%r10
	mov	%r10,0x67(%rcx)
L(bkP7QC):
	mov	0x5f(%rdx),%r9
	mov	%r9,0x5f(%rcx)
L(bkP7QB):
	mov	0x57(%rdx),%r11
	mov	%r11,0x57(%rcx)
L(bkP7QA):
	mov	0x4f(%rdx),%r10
	mov	%r10,0x4f(%rcx)
L(bkP7Q9):
	mov	0x47(%rdx),%r9
	mov	%r9,0x47(%rcx)
L(bkP7Q8):
	mov	0x3f(%rdx),%r11
	mov	%r11,0x3f(%rcx)
L(bkP7Q7):
	mov	0x37(%rdx),%r10
	mov	%r10,0x37(%rcx)
L(bkP7Q6):
	mov	0x2f(%rdx),%r9
	mov	%r9,0x2f(%rcx)
L(bkP7Q5):
	mov	0x27(%rdx),%r11
	mov	%r11,0x27(%rcx)
L(bkP7Q4):
	mov	0x1f(%rdx),%r10
	mov	%r10,0x1f(%rcx)
L(bkP7Q3):
	mov	0x17(%rdx),%r9
	mov	%r9,0x17(%rcx)
L(bkP7Q2):
	mov	0xf(%rdx),%r11
	mov	%r11,0xf(%rcx)
L(bkP7Q1):
	mov	0x7(%rdx),%r10
	mov	%r10,0x7(%rcx)
L(bkP7Q0):	# trailing loads/stores do all their loads 1st, then do the stores
	mov	0x3(%rdx),%r9d
	mov	%r9d,0x3(%rcx)
	mov	0x1(%rdx),%r10w
	mov	%r10w,0x1(%rcx)
	mov	(%rdx),%r11b
	mov	%r11b,(%rcx)
	ret

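/*
 * Dispatch table for the backward trailing copies.  Entry N holds the
 * offset, relative to L(bkPxQx), of the block that copies the remaining
 * N bytes (0 <= N <= 0x90).  The dispatch sites above (L(bk_qw_aligned),
 * L(bk_sse2_cpy_end), L(bk_use_rep)) index it with the residual count:
 *
 *	movslq	(%r10,%r8,4),%r9	# %r10 = &L(bkPxQx), %r8 = count
 *	lea	(%r9,%r10,1),%r10
 *	jmpq	*%r10
 */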
	.balign 16
L(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
	.int	L(bkP1Q0)-L(bkPxQx)
	.int	L(bkP2Q0)-L(bkPxQx)
	.int	L(bkP3Q0)-L(bkPxQx)
	.int	L(bkP4Q0)-L(bkPxQx)
	.int	L(bkP5Q0)-L(bkPxQx)
	.int	L(bkP6Q0)-L(bkPxQx)
	.int	L(bkP7Q0)-L(bkPxQx)

	.int	L(bkP0Q1)-L(bkPxQx)
	.int	L(bkP1Q1)-L(bkPxQx)
	.int	L(bkP2Q1)-L(bkPxQx)
	.int	L(bkP3Q1)-L(bkPxQx)
	.int	L(bkP4Q1)-L(bkPxQx)
	.int	L(bkP5Q1)-L(bkPxQx)
	.int	L(bkP6Q1)-L(bkPxQx)
	.int	L(bkP7Q1)-L(bkPxQx)

	.int	L(bkP0Q2)-L(bkPxQx)
	.int	L(bkP1Q2)-L(bkPxQx)
	.int	L(bkP2Q2)-L(bkPxQx)
	.int	L(bkP3Q2)-L(bkPxQx)
	.int	L(bkP4Q2)-L(bkPxQx)
	.int	L(bkP5Q2)-L(bkPxQx)
	.int	L(bkP6Q2)-L(bkPxQx)
	.int	L(bkP7Q2)-L(bkPxQx)

	.int	L(bkP0Q3)-L(bkPxQx)
	.int	L(bkP1Q3)-L(bkPxQx)
	.int	L(bkP2Q3)-L(bkPxQx)
	.int	L(bkP3Q3)-L(bkPxQx)
	.int	L(bkP4Q3)-L(bkPxQx)
	.int	L(bkP5Q3)-L(bkPxQx)
	.int	L(bkP6Q3)-L(bkPxQx)
	.int	L(bkP7Q3)-L(bkPxQx)

	.int	L(bkP0Q4)-L(bkPxQx)
	.int	L(bkP1Q4)-L(bkPxQx)
	.int	L(bkP2Q4)-L(bkPxQx)
	.int	L(bkP3Q4)-L(bkPxQx)
	.int	L(bkP4Q4)-L(bkPxQx)
	.int	L(bkP5Q4)-L(bkPxQx)
	.int	L(bkP6Q4)-L(bkPxQx)
	.int	L(bkP7Q4)-L(bkPxQx)

	.int	L(bkP0Q5)-L(bkPxQx)
	.int	L(bkP1Q5)-L(bkPxQx)
	.int	L(bkP2Q5)-L(bkPxQx)
	.int	L(bkP3Q5)-L(bkPxQx)
	.int	L(bkP4Q5)-L(bkPxQx)
	.int	L(bkP5Q5)-L(bkPxQx)
	.int	L(bkP6Q5)-L(bkPxQx)
	.int	L(bkP7Q5)-L(bkPxQx)

	.int	L(bkP0Q6)-L(bkPxQx)
	.int	L(bkP1Q6)-L(bkPxQx)
	.int	L(bkP2Q6)-L(bkPxQx)
	.int	L(bkP3Q6)-L(bkPxQx)
	.int	L(bkP4Q6)-L(bkPxQx)
	.int	L(bkP5Q6)-L(bkPxQx)
	.int	L(bkP6Q6)-L(bkPxQx)
	.int	L(bkP7Q6)-L(bkPxQx)

	.int	L(bkP0Q7)-L(bkPxQx)
	.int	L(bkP1Q7)-L(bkPxQx)
	.int	L(bkP2Q7)-L(bkPxQx)
	.int	L(bkP3Q7)-L(bkPxQx)
	.int	L(bkP4Q7)-L(bkPxQx)
	.int	L(bkP5Q7)-L(bkPxQx)
	.int	L(bkP6Q7)-L(bkPxQx)
	.int	L(bkP7Q7)-L(bkPxQx)

	.int	L(bkP0Q8)-L(bkPxQx)
	.int	L(bkP1Q8)-L(bkPxQx)
	.int	L(bkP2Q8)-L(bkPxQx)
	.int	L(bkP3Q8)-L(bkPxQx)
	.int	L(bkP4Q8)-L(bkPxQx)
	.int	L(bkP5Q8)-L(bkPxQx)
	.int	L(bkP6Q8)-L(bkPxQx)
	.int	L(bkP7Q8)-L(bkPxQx)

	.int	L(bkP0Q9)-L(bkPxQx)
	.int	L(bkP1Q9)-L(bkPxQx)
	.int	L(bkP2Q9)-L(bkPxQx)
	.int	L(bkP3Q9)-L(bkPxQx)
	.int	L(bkP4Q9)-L(bkPxQx)
	.int	L(bkP5Q9)-L(bkPxQx)
	.int	L(bkP6Q9)-L(bkPxQx)
	.int	L(bkP7Q9)-L(bkPxQx)

	.int	L(bkP0QA)-L(bkPxQx)
	.int	L(bkP1QA)-L(bkPxQx)
	.int	L(bkP2QA)-L(bkPxQx)
	.int	L(bkP3QA)-L(bkPxQx)
	.int	L(bkP4QA)-L(bkPxQx)
	.int	L(bkP5QA)-L(bkPxQx)
	.int	L(bkP6QA)-L(bkPxQx)
	.int	L(bkP7QA)-L(bkPxQx)

	.int	L(bkP0QB)-L(bkPxQx)
	.int	L(bkP1QB)-L(bkPxQx)
	.int	L(bkP2QB)-L(bkPxQx)
	.int	L(bkP3QB)-L(bkPxQx)
	.int	L(bkP4QB)-L(bkPxQx)
	.int	L(bkP5QB)-L(bkPxQx)
	.int	L(bkP6QB)-L(bkPxQx)
	.int	L(bkP7QB)-L(bkPxQx)

	.int	L(bkP0QC)-L(bkPxQx)
	.int	L(bkP1QC)-L(bkPxQx)
	.int	L(bkP2QC)-L(bkPxQx)
	.int	L(bkP3QC)-L(bkPxQx)
	.int	L(bkP4QC)-L(bkPxQx)
	.int	L(bkP5QC)-L(bkPxQx)
	.int	L(bkP6QC)-L(bkPxQx)
	.int	L(bkP7QC)-L(bkPxQx)

	.int	L(bkP0QD)-L(bkPxQx)
	.int	L(bkP1QD)-L(bkPxQx)
	.int	L(bkP2QD)-L(bkPxQx)
	.int	L(bkP3QD)-L(bkPxQx)
	.int	L(bkP4QD)-L(bkPxQx)
	.int	L(bkP5QD)-L(bkPxQx)
	.int	L(bkP6QD)-L(bkPxQx)
	.int	L(bkP7QD)-L(bkPxQx)

	.int	L(bkP0QE)-L(bkPxQx)
	.int	L(bkP1QE)-L(bkPxQx)
	.int	L(bkP2QE)-L(bkPxQx)
	.int	L(bkP3QE)-L(bkPxQx)
	.int	L(bkP4QE)-L(bkPxQx)
	.int	L(bkP5QE)-L(bkPxQx)
	.int	L(bkP6QE)-L(bkPxQx)
	.int	L(bkP7QE)-L(bkPxQx)

	.int	L(bkP0QF)-L(bkPxQx)
	.int	L(bkP1QF)-L(bkPxQx)
	.int	L(bkP2QF)-L(bkPxQx)
	.int	L(bkP3QF)-L(bkPxQx)
	.int	L(bkP4QF)-L(bkPxQx)
	.int	L(bkP5QF)-L(bkPxQx)
	.int	L(bkP6QF)-L(bkPxQx)
	.int	L(bkP7QF)-L(bkPxQx)

	.int	L(bkP0QG)-L(bkPxQx)
	.int	L(bkP1QG)-L(bkPxQx)
	.int	L(bkP2QG)-L(bkPxQx)
	.int	L(bkP3QG)-L(bkPxQx)
	.int	L(bkP4QG)-L(bkPxQx)
	.int	L(bkP5QG)-L(bkPxQx)
	.int	L(bkP6QG)-L(bkPxQx)
	.int	L(bkP7QG)-L(bkPxQx)

	.int	L(bkP0QH)-L(bkPxQx)
	.int	L(bkP1QH)-L(bkPxQx)
	.int	L(bkP2QH)-L(bkPxQx)
	.int	L(bkP3QH)-L(bkPxQx)
	.int	L(bkP4QH)-L(bkPxQx)
	.int	L(bkP5QH)-L(bkPxQx)
	.int	L(bkP6QH)-L(bkPxQx)
	.int	L(bkP7QH)-L(bkPxQx)

	.int	L(bkP0QI)-L(bkPxQx)
	.int	L(bkP1QI)-L(bkPxQx)
	.int	L(bkP2QI)-L(bkPxQx)
	.int	L(bkP3QI)-L(bkPxQx)
	.int	L(bkP4QI)-L(bkPxQx)
	.int	L(bkP5QI)-L(bkPxQx)
	.int	L(bkP6QI)-L(bkPxQx)
	.int	L(bkP7QI)-L(bkPxQx)

	SET_SIZE(memmove)