/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/asm_linkage.h>
#include <sys/regset.h>
#include <sys/privregs.h>

#if defined(__lint)
#include <sys/types.h>
#include <sys/archsystm.h>
#else
#include "assym.h"
#endif

/*
 * Do block operations using Streaming SIMD extensions
 */

#if defined(DEBUG)
#if defined(__amd64)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movq	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushq	%rbp;				\
	movq	%rsp, %rbp;			\
	leaq	msg(%rip), %rdi;		\
	xorl	%eax, %eax;			\
	call	panic;				\
5:
#elif defined(__i386)
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)	\
	movl	%gs:CPU_THREAD, t;		\
	movsbl	T_PREEMPT(t), r32;		\
	testl	r32, r32;			\
	jne	5f;				\
	pushl	%ebp;				\
	movl	%esp, %ebp;			\
	pushl	$msg;				\
	call	panic;				\
5:
#endif	/* __i386 */
#else	/* DEBUG */
#define	ASSERT_KPREEMPT_DISABLED(t, r32, msg)
#endif	/* DEBUG */

#define	BLOCKSHIFT	6
#define	BLOCKSIZE	64	/* (1 << BLOCKSHIFT) */
#define	BLOCKMASK	63	/* (BLOCKSIZE - 1) */

#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1)
#error	"mucked up constants"
#endif

#if defined(__lint)

/*ARGSUSED*/
void
hwblkclr(void *addr, size_t size)
{}

#else	/* __lint */

#if defined(__amd64)
#define	ADD	addq
#define	SUB	subq
#else
#define	ADD	addl
#define	SUB	subl
#endif

#define	SAVE_XMM0(r)				\
	SAVE_XMM_PROLOG(r, 1);			\
	movdqa	%xmm0, (r)

#define	ZERO_LOOP_INIT_XMM(dst)			\
	pxor	%xmm0, %xmm0

#define	ZERO_LOOP_BODY_XMM(dst, cnt)		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm0, 0x10(dst);		\
	movntdq	%xmm0, 0x20(dst);		\
	movntdq	%xmm0, 0x30(dst);		\
	ADD	$BLOCKSIZE, dst;		\
	SUB	$1, cnt

#define	ZERO_LOOP_FINI_XMM(dst)			\
	mfence

#define	RSTOR_XMM0(r)				\
	movdqa	0x0(r), %xmm0;			\
	RSTOR_XMM_EPILOG(r, 1)

#if defined(__amd64)

	/*
	 * %rdi		dst
	 * %rsi		size
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	pushq	%rbp
	movq	%rsp, %rbp
	testl	$BLOCKMASK, %edi	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpq	$BLOCKSIZE, %rsi	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testq	$BLOCKMASK, %rsi	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrq	$BLOCKSHIFT, %rsi

	ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	1f

	SAVE_XMM0(%r8)
1:	ZERO_LOOP_INIT_XMM(%rdi)
9:	ZERO_LOOP_BODY_XMM(%rdi, %rsi)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%rdi)

	testl	$CR0_TS, %eax
	jnz	2f
	RSTOR_XMM0(%r8)
2:	movq	%rax, %cr0
	leave
	ret
.dobzero:
	leave
	jmp	bzero
	SET_SIZE(hwblkclr)

#elif defined(__i386)

	/*
	 * %eax		dst
	 * %ecx		size in bytes, loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 */
	ENTRY(hwblkclr)
	movl	4(%esp), %eax
	movl	8(%esp), %ecx
	testl	$BLOCKMASK, %eax	/* address must be BLOCKSIZE aligned */
	jne	.dobzero
	cmpl	$BLOCKSIZE, %ecx	/* size must be at least BLOCKSIZE */
	jl	.dobzero
	testl	$BLOCKMASK, %ecx	/* .. and be a multiple of BLOCKSIZE */
	jne	.dobzero
	shrl	$BLOCKSHIFT, %ecx
	movl	0xc(%esp), %edx
	pushl	%ebx

	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	1f

	pushl	%edi
	SAVE_XMM0(%edi)
1:	ZERO_LOOP_INIT_XMM(%eax)
9:	ZERO_LOOP_BODY_XMM(%eax, %ecx)
	jnz	9b
	ZERO_LOOP_FINI_XMM(%eax)

	testl	$CR0_TS, %ebx
	jnz	2f
	RSTOR_XMM0(%edi)
	popl	%edi
2:	movl	%ebx, %cr0
	popl	%ebx
	ret
.dobzero:
	jmp	bzero
	SET_SIZE(hwblkclr)

#endif	/* __i386 */
#endif	/* __lint */

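/*
 * Illustrative caller sketch (hypothetical, for documentation only;
 * kpreempt_disable()/kpreempt_enable() are the assumed preemption
 * interfaces):  hwblkclr() quietly falls back to bzero() for unaligned
 * or short requests, and the DEBUG assertion above expects preemption
 * to already be disabled, e.g.
 *
 *	kpreempt_disable();
 *	hwblkclr(buf, len);	(buf BLOCKSIZE-aligned, len a multiple of it)
 *	kpreempt_enable();
 */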

#if defined(__lint)

/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{}

#else	/* __lint */

#define	PREFETCH_START(src)			\
	prefetchnta	0x0(src);		\
	prefetchnta	0x40(src)

#define	SAVE_XMMS(r)				\
	SAVE_XMM_PROLOG(r, 8);			\
	movdqa	%xmm0, (r);			\
	movdqa	%xmm1, 0x10(r);			\
	movdqa	%xmm2, 0x20(r);			\
	movdqa	%xmm3, 0x30(r);			\
	movdqa	%xmm4, 0x40(r);			\
	movdqa	%xmm5, 0x50(r);			\
	movdqa	%xmm6, 0x60(r);			\
	movdqa	%xmm7, 0x70(r)

#define	COPY_LOOP_INIT_XMM(src)			\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src

#define	COPY_LOOP_BODY_XMM(src, dst, cnt)	\
	prefetchnta	0x80(src);		\
	prefetchnta	0xc0(src);		\
	prefetchnta	0x100(src);		\
	prefetchnta	0x140(src);		\
	movntdq	%xmm0, (dst);			\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movdqa	0x0(src), %xmm0;		\
	movdqa	0x10(src), %xmm1;		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movdqa	0x20(src), %xmm2;		\
	movdqa	0x30(src), %xmm3;		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst);		\
	movdqa	0x40(src), %xmm4;		\
	movdqa	0x50(src), %xmm5;		\
	ADD	$0x80, dst;			\
	movdqa	0x60(src), %xmm6;		\
	movdqa	0x70(src), %xmm7;		\
	ADD	$0x80, src;			\
	subl	$1, cnt

#define	COPY_LOOP_FINI_XMM(dst)			\
	movntdq	%xmm0, 0x0(dst);		\
	movntdq	%xmm1, 0x10(dst);		\
	movntdq	%xmm2, 0x20(dst);		\
	movntdq	%xmm3, 0x30(dst);		\
	movntdq	%xmm4, 0x40(dst);		\
	movntdq	%xmm5, 0x50(dst);		\
	movntdq	%xmm6, 0x60(dst);		\
	movntdq	%xmm7, 0x70(dst)

#define	RSTOR_XMMS(r)				\
	movdqa	0x0(r), %xmm0;			\
	movdqa	0x10(r), %xmm1;			\
	movdqa	0x20(r), %xmm2;			\
	movdqa	0x30(r), %xmm3;			\
	movdqa	0x40(r), %xmm4;			\
	movdqa	0x50(r), %xmm5;			\
	movdqa	0x60(r), %xmm6;			\
	movdqa	0x70(r), %xmm7;			\
	RSTOR_XMM_EPILOG(r, 8)

#if defined(__amd64)

	/*
	 * %rdi		src
	 * %rsi		dst
	 * %rdx		#if DEBUG then curthread
	 * %ecx		loop count
	 * %rax		saved %cr0 (#if DEBUG then %eax is t->t_preempt)
	 * %r8		pointer to %xmm register save area
	 */
	ENTRY(hwblkpagecopy)
	pushq	%rbp
	movq	%rsp, %rbp
	PREFETCH_START(%rdi)
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 */
	movl	$_CONST(32 - 1), %ecx
	ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled)
	movq	%cr0, %rax
	clts
	testl	$CR0_TS, %eax
	jnz	3f
	SAVE_XMMS(%r8)
3:	COPY_LOOP_INIT_XMM(%rdi)
4:	COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%rsi)
	testl	$CR0_TS, %eax
	jnz	5f
	RSTOR_XMMS(%r8)
5:	movq	%rax, %cr0
	mfence
	leave
	ret
	SET_SIZE(hwblkpagecopy)

#elif defined(__i386)

	/*
	 * %eax		src
	 * %edx		dst
	 * %ecx		loop count
	 * %ebx		saved %cr0 (#if DEBUG then t->t_preempt)
	 * %edi		pointer to %xmm register save area
	 * %esi		#if DEBUG temporary thread pointer
	 */
	ENTRY(hwblkpagecopy)
	movl	4(%esp), %eax
	movl	8(%esp), %edx
	PREFETCH_START(%eax)
	pushl	%ebx
	/*
	 * PAGESIZE is 4096, each loop moves 128 bytes, but the initial
	 * load and final store save us one loop count
	 */
	movl	$_CONST(32 - 1), %ecx
	pushl	%esi
	ASSERT_KPREEMPT_DISABLED(%esi, %ebx, .not_disabled)
	popl	%esi
	movl	%cr0, %ebx
	clts
	testl	$CR0_TS, %ebx
	jnz	3f
	pushl	%edi
	SAVE_XMMS(%edi)
3:	COPY_LOOP_INIT_XMM(%eax)
4:	COPY_LOOP_BODY_XMM(%eax, %edx, %ecx)
	jnz	4b
	COPY_LOOP_FINI_XMM(%edx)
	testl	$CR0_TS, %ebx
	jnz	5f
	RSTOR_XMMS(%edi)
	popl	%edi
5:	movl	%ebx, %cr0
	popl	%ebx
	mfence
	ret
	SET_SIZE(hwblkpagecopy)

#endif	/* __i386 */
#endif	/* __lint */

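/*
 * Loop count arithmetic:  PAGESIZE is 4096 and each pass of
 * COPY_LOOP_BODY_XMM moves 128 bytes, so a page takes 4096 / 128 = 32
 * passes; COPY_LOOP_INIT_XMM issues the first load and COPY_LOOP_FINI_XMM
 * the final store, hence the count is loaded as 32 - 1.
 *
 * Hypothetical caller sketch (assumes the usual kpreempt_disable()/
 * kpreempt_enable() preemption interfaces and page-aligned kernel
 * addresses):
 *
 *	kpreempt_disable();
 *	hwblkpagecopy(srcva, dstva);
 *	kpreempt_enable();
 */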

#if defined(__lint)

/*
 * Version of hwblkclr which doesn't use XMM registers.
 * Note that it requires aligned dst and len.
 *
 * XXPV	This needs to be performance tuned at some point.
 *	Is 4 the best number of iterations to unroll?
 */
/*ARGSUSED*/
void
block_zero_no_xmm(void *dst, int len)
{}

#else	/* __lint */

#if defined(__amd64)

	ENTRY(block_zero_no_xmm)
	pushq	%rbp
	movq	%rsp, %rbp
	xorl	%eax, %eax
	addq	%rsi, %rdi
	negq	%rsi
1:
	movnti	%rax, (%rdi, %rsi)
	movnti	%rax, 8(%rdi, %rsi)
	movnti	%rax, 16(%rdi, %rsi)
	movnti	%rax, 24(%rdi, %rsi)
	addq	$32, %rsi
	jnz	1b
	mfence
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#elif defined(__i386)

	ENTRY(block_zero_no_xmm)
	pushl	%ebp
	movl	%esp, %ebp
	xorl	%eax, %eax
	movl	8(%ebp), %edx
	movl	12(%ebp), %ecx
	addl	%ecx, %edx
	negl	%ecx
1:
	movnti	%eax, (%edx, %ecx)
	movnti	%eax, 4(%edx, %ecx)
	movnti	%eax, 8(%edx, %ecx)
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence
	leave
	ret
	SET_SIZE(block_zero_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */

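/*
 * Note on the "aligned dst and len" requirement above:  each pass of the
 * unrolled loop issues four movnti stores, i.e. 32 bytes on amd64 and 16
 * bytes on i386, so dst and len must be multiples of that chunk size and
 * there is no bzero() fallback.  A hypothetical amd64 call:
 *
 *	block_zero_no_xmm(buf, len);	(buf and len both multiples of 32)
 *
 * Because no %xmm state is touched, the %cr0 TS save/restore done by
 * hwblkclr is not needed here.
 */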

#if defined(__lint)

/*
 * Version of page copy which doesn't use XMM registers.
 *
 * XXPV	This needs to be performance tuned at some point.
 *	Is 4 the right number of iterations to unroll?
 *	Is the load/store order optimal? Should it use prefetch?
 */
/*ARGSUSED*/
void
page_copy_no_xmm(void *dst, void *src)
{}

#else	/* __lint */

#if defined(__amd64)

	ENTRY(page_copy_no_xmm)
	movq	$MMU_STD_PAGESIZE, %rcx
	addq	%rcx, %rdi
	addq	%rcx, %rsi
	negq	%rcx
1:
	movq	(%rsi, %rcx), %rax
	movnti	%rax, (%rdi, %rcx)
	movq	8(%rsi, %rcx), %rax
	movnti	%rax, 8(%rdi, %rcx)
	movq	16(%rsi, %rcx), %rax
	movnti	%rax, 16(%rdi, %rcx)
	movq	24(%rsi, %rcx), %rax
	movnti	%rax, 24(%rdi, %rcx)
	addq	$32, %rcx
	jnz	1b
	mfence
	ret
	SET_SIZE(page_copy_no_xmm)

#elif defined(__i386)

	ENTRY(page_copy_no_xmm)
	pushl	%esi
	movl	$MMU_STD_PAGESIZE, %ecx
	movl	8(%esp), %edx
	movl	12(%esp), %esi
	addl	%ecx, %edx
	addl	%ecx, %esi
	negl	%ecx
1:
	movl	(%esi, %ecx), %eax
	movnti	%eax, (%edx, %ecx)
	movl	4(%esi, %ecx), %eax
	movnti	%eax, 4(%edx, %ecx)
	movl	8(%esi, %ecx), %eax
	movnti	%eax, 8(%edx, %ecx)
	movl	12(%esi, %ecx), %eax
	movnti	%eax, 12(%edx, %ecx)
	addl	$16, %ecx
	jnz	1b
	mfence
	popl	%esi
	ret
	SET_SIZE(page_copy_no_xmm)

#endif	/* __i386 */
#endif	/* __lint */

#if defined(DEBUG) && !defined(__lint)
	.text
.not_disabled:
	.string	"sseblk: preemption not disabled!"
#endif
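
/*
 * Loop count arithmetic for page_copy_no_xmm:  MMU_STD_PAGESIZE is 4096,
 * the amd64 loop copies 32 bytes per pass and the i386 loop 16 bytes,
 * giving 4096 / 32 = 128 and 4096 / 16 = 256 iterations respectively.
 * The trailing mfence in each of these routines is needed because
 * movnti/movntdq are weakly ordered non-temporal stores; the fence makes
 * the zeroed or copied data globally visible before returning.
 */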