/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2020 Joyent, Inc.
 * Copyright 2024 MNX Cloud, Inc.
 */

/*
 * Process switching routines.
 */

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/stack.h>
#include <sys/segments.h>
#include <sys/psw.h>

#include "assym.h"

/*
 * resume(thread_id_t t);
 *
 * a thread can only run on one processor at a time. there
 * exists a window on MPs where the current thread on one
 * processor is capable of being dispatched by another processor.
 * some overlap between outgoing and incoming threads can happen
 * when they are the same thread. in this case where the threads
 * are the same, resume() on one processor will spin on the incoming
 * thread until resume() on the other processor has finished with
 * the outgoing thread.
 *
 * The MMU context changes when the resuming thread resides in a different
 * process.  Kernel threads are known by resume to reside in process 0.
 * The MMU context, therefore, only changes when resuming a thread in
 * a process different from curproc.
 *
 * resume_from_intr() is called when the thread being resumed was not
 * passivated by resume (e.g. was interrupted).  This means that the
 * resume lock is already held and that a restore context is not needed.
 * Also, the MMU context is not changed on the resume in this case.
 *
 * resume_from_zombie() is the same as resume except the calling thread
 * is a zombie and must be put on the deathrow list after the CPU is
 * off the stack.
 */
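
/*
 * For orientation, a dispatcher-side caller uses these entry points
 * roughly as follows (a hedged C-level sketch only, not the actual
 * disp.c code; locking and affinity details omitted):
 *
 *	kthread_t *next = disp();	// pick the next runnable thread
 *	if (next != curthread)
 *		resume(next);		// "returns" here only when the
 *					// caller is itself resumed later
 */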

#if LWP_PCB_FPU != 0
#error LWP_PCB_FPU MUST be defined as 0 for code in swtch.s to work
#endif	/* LWP_PCB_FPU != 0 */

/*
 * Save non-volatile regs other than %rsp (%rbx, %rbp, and %r12 - %r15)
 *
 * The stack frame must be created before the save of %rsp so that tracebacks
 * of swtch()ed-out processes show the process as having last called swtch().
 */
#define	SAVE_REGS(thread_t, retaddr)			\
	movq	%rbp, T_RBP(thread_t);			\
	movq	%rbx, T_RBX(thread_t);			\
	movq	%r12, T_R12(thread_t);			\
	movq	%r13, T_R13(thread_t);			\
	movq	%r14, T_R14(thread_t);			\
	movq	%r15, T_R15(thread_t);			\
	pushq	%rbp;					\
	movq	%rsp, %rbp;				\
	movq	%rsp, T_SP(thread_t);			\
	movq	retaddr, T_PC(thread_t);		\
	movq	%rdi, %r12;				\
	call	__dtrace_probe___sched_off__cpu

/*
 * Restore non-volatile regs other than %rsp (%rbx, %rbp, and %r12 - %r15)
 *
 * We load up %rsp from the label_t as part of the context switch, so
 * we don't repeat that here.
 *
 * We don't do a 'leave,' because reloading %rsp/%rbp from the label_t
 * already has the effect of putting the stack back the way it was when
 * we came in.
 */
#define	RESTORE_REGS(scratch_reg)			\
	movq	%gs:CPU_THREAD, scratch_reg;		\
	movq	T_RBP(scratch_reg), %rbp;		\
	movq	T_RBX(scratch_reg), %rbx;		\
	movq	T_R12(scratch_reg), %r12;		\
	movq	T_R13(scratch_reg), %r13;		\
	movq	T_R14(scratch_reg), %r14;		\
	movq	T_R15(scratch_reg), %r15

/*
 * Get pointer to a thread's hat structure
 */
#define	GET_THREAD_HATP(hatp, thread_t, scratch_reg)	\
	movq	T_PROCP(thread_t), hatp;		\
	movq	P_AS(hatp), scratch_reg;		\
	movq	A_HAT(scratch_reg), hatp

#define	TSC_READ()					\
	call	tsc_read;				\
	movq	%rax, %r14;

/*
 * If we are resuming an interrupt thread, store a timestamp in the thread
 * structure.  If an interrupt occurs between tsc_read() and its subsequent
 * store, the timestamp will be stale by the time it is stored.  We can detect
 * this by doing a compare-and-swap on the thread's timestamp, since any
 * interrupt occurring in this window will put a new timestamp in the thread's
 * t_intr_start field.
 */
#define	STORE_INTR_START(thread_t)			\
	testw	$T_INTR_THREAD, T_FLAGS(thread_t);	\
	jz	1f;					\
0:							\
	TSC_READ();					\
	movq	T_INTR_START(thread_t), %rax;		\
	cmpxchgq %r14, T_INTR_START(thread_t);		\
	jnz	0b;					\
1:
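
/*
 * In C terms, STORE_INTR_START() behaves roughly like the sketch below
 * (a hedged illustration only; atomic_cas_64() stands in for the cmpxchg):
 *
 *	if (t->t_flags & T_INTR_THREAD) {
 *		hrtime_t new, old;
 *		do {
 *			new = tsc_read();
 *			old = t->t_intr_start;
 *		} while (atomic_cas_64((uint64_t *)&t->t_intr_start,
 *		    old, new) != old);
 *	}
 *
 * The cas fails only if an interrupt landed between the tsc_read() and
 * the store and left a fresher timestamp behind, in which case we retry.
 */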

	.global kpti_enable

	ENTRY(resume)
	movq	%gs:CPU_THREAD, %rax
	leaq	resume_return(%rip), %r11

	/*
	 * Deal with SMAP here. A thread may be switched out at any point while
	 * it is executing. The thread could be under on_fault() or it could be
	 * pre-empted in the middle of a copy operation. If this happens and
	 * we're not in the context of an interrupt which happens to handle
	 * saving and restoring rflags correctly, we may lose our SMAP related
	 * state.
	 *
	 * To handle this, as part of being switched out, we first save whether
	 * or not userland access is allowed ($PS_ACHK in rflags) and store that
	 * in t_useracc on the kthread_t and unconditionally enable SMAP to
	 * protect the system.
	 *
	 * Later, when the thread finishes resuming, we potentially disable smap
	 * if PS_ACHK was present in rflags. See uts/intel/ml/copy.s for
	 * more information on rflags and SMAP.
	 */
	pushfq
	popq	%rsi
	andq	$PS_ACHK, %rsi
	movq	%rsi, T_USERACC(%rax)
	call	smap_enable

	/*
	 * Take a moment to potentially clear the RSB buffer. This is done to
	 * prevent various Spectre variant 2 and SpectreRSB attacks. This may
	 * not be sufficient. Please see uts/intel/ml/retpoline.S for more
	 * information about this.
	 */
	call	x86_rsb_stuff

	/*
	 * Take another moment to potentially clear the branch history buffer
	 * (BHB). This mitigates recently discovered attacks in which branch
	 * history can be trained to exploit certain compiler-generated
	 * instruction sequences (known as "gadgets") to leak data
	 * speculatively. As with x86_rsb_stuff, see retpoline.S, and this
	 * may not be sufficient.
	 */
	call	x86_bhb_clear

	/*
	 * Save non-volatile registers, and set return address for current
	 * thread to resume_return.
	 *
	 * %r12 = t (new thread) when done
	 */
	SAVE_REGS(%rax, %r11)

	LOADCPU(%r15)				/* %r15 = CPU */
	movq	CPU_THREAD(%r15), %r13		/* %r13 = curthread */

	/*
	 * Call savectx if thread has installed context ops.
	 *
	 * Note that if we have floating point context, the save op
	 * (either fpsave_begin or fpxsave_begin) will issue the
	 * async save instruction (fnsave or fxsave respectively)
	 * that we fwait for below.
	 */
	cmpq	$0, T_CTX(%r13)		/* should current thread savectx? */
	je	.nosavectx		/* skip call when zero */

	movq	%r13, %rdi		/* arg = thread pointer */
	call	savectx			/* call ctx ops */
.nosavectx:

	/*
	 * Check that the curthread is not using the FPU while in the kernel.
	 */
	call	kernel_fpu_no_swtch

	/*
	 * Call savepctx if process has installed context ops.
	 */
	movq	T_PROCP(%r13), %r14	/* %r14 = proc */
	cmpq	$0, P_PCTX(%r14)	/* should current thread savepctx? */
	je	.nosavepctx		/* skip call when zero */

	movq	%r14, %rdi		/* arg = proc pointer */
	call	savepctx		/* call ctx ops */
.nosavepctx:

	/*
	 * Temporarily switch to the idle thread's stack
	 */
	movq	CPU_IDLE_THREAD(%r15), %rax	/* idle thread pointer */

	/*
	 * Set the idle thread as the current thread
	 */
	movq	T_SP(%rax), %rsp	/* It is safe to set rsp */
	movq	%rax, CPU_THREAD(%r15)

	/*
	 * Switch in the hat context for the new thread
	 */
	GET_THREAD_HATP(%rdi, %r12, %r11)
	call	hat_switch

	/*
	 * Clear and unlock previous thread's t_lock
	 * to allow it to be dispatched by another processor.
	 */
	movb	$0, T_LOCK(%r13)

	/*
	 * IMPORTANT: Registers at this point must be:
	 *	%r12 = new thread
	 *
	 * Here we are in the idle thread, have dropped the old thread.
	 */
	ALTENTRY(_resume_from_idle)
	/*
	 * spin until dispatched thread's mutex has
	 * been unlocked. this mutex is unlocked when
	 * it becomes safe for the thread to run.
	 */
.lock_thread_mutex:
	lock
	btsl	$0, T_LOCK(%r12)	/* attempt to lock new thread's mutex */
	jnc	.thread_mutex_locked	/* got it */

.spin_thread_mutex:
	pause
	cmpb	$0, T_LOCK(%r12)	/* check mutex status */
	jz	.lock_thread_mutex	/* clear, retry lock */
	jmp	.spin_thread_mutex	/* still locked, spin... */
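
	/*
	 * The acquisition above is conceptually just a spinning lock_try()
	 * on t_lock (a hedged C-level sketch, not shared dispatcher code):
	 *
	 *	while (!lock_try(&t->t_lock)) {
	 *		while (t->t_lock != 0)
	 *			SMT_PAUSE();
	 *	}
	 */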

.thread_mutex_locked:
	/*
	 * Fix CPU structure to indicate new running thread.
	 * Set pointer in new thread to the CPU structure.
	 */
	LOADCPU(%r13)			/* load current CPU pointer */
	cmpq	%r13, T_CPU(%r12)
	je	.setup_cpu

	/* cp->cpu_stats.sys.cpumigrate++ */
	incq	CPU_STATS_SYS_CPUMIGRATE(%r13)
	movq	%r13, T_CPU(%r12)	/* set new thread's CPU pointer */

.setup_cpu:
	/*
	 * Setup rsp0 (kernel stack) in TSS to curthread's saved regs
	 * structure.  If this thread doesn't have a regs structure above
	 * the stack -- that is, if lwp_stk_init() was never called for the
	 * thread -- this will set rsp0 to the wrong value, but it's harmless
	 * as it's a kernel thread, and it won't actually attempt to implicitly
	 * use the rsp0 via a privilege change.
	 *
	 * Note that when we have KPTI enabled on amd64, we never use this
	 * value at all (since all the interrupts have an IST set).
	 */
	movq	CPU_TSS(%r13), %r14
#if !defined(__xpv)
	cmpq	$1, kpti_enable
	jne	1f
	leaq	CPU_KPTI_TR_RSP(%r13), %rax
	jmp	2f
1:
	movq	T_STACK(%r12), %rax
	addq	$REGSIZE+MINFRAME, %rax	/* to the bottom of thread stack */
2:
	movq	%rax, TSS_RSP0(%r14)
#else
	movq	T_STACK(%r12), %rax
	addq	$REGSIZE+MINFRAME, %rax	/* to the bottom of thread stack */
	movl	$KDS_SEL, %edi
	movq	%rax, %rsi
	call	HYPERVISOR_stack_switch
#endif	/* __xpv */
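
	/*
	 * In C terms, the rsp0 selection above (native, non-__xpv case) is
	 * roughly the sketch below.  This is a hedged illustration only; the
	 * field spellings behind the CPU_KPTI_TR_RSP, T_STACK and TSS_RSP0
	 * assym offsets are assumed here, not quoted from the headers.
	 *
	 *	if (kpti_enable == 1)
	 *		tss->tss_rsp0 = (uint64_t)&cpu->cpu_m.mcpu_kpti.kf_tr_rsp;
	 *	else
	 *		tss->tss_rsp0 = (uint64_t)t->t_stk + REGSIZE + MINFRAME;
	 */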

	movq	%r12, CPU_THREAD(%r13)	/* set CPU's thread pointer */
	mfence				/* synchronize with mutex_exit() */
	xorl	%ebp, %ebp		/* make $<threadlist behave better */
	movq	T_LWP(%r12), %rax	/* set associated lwp to */
	movq	%rax, CPU_LWP(%r13)	/* CPU's lwp ptr */

	movq	T_SP(%r12), %rsp	/* switch to resuming thread's stack */
	movq	T_PC(%r12), %r13	/* saved return addr */

	/*
	 * Call restorectx if context ops have been installed.
	 */
	cmpq	$0, T_CTX(%r12)		/* should resumed thread restorectx? */
	jz	.norestorectx		/* skip call when zero */
	movq	%r12, %rdi		/* arg = thread pointer */
	call	restorectx		/* call ctx ops */
.norestorectx:

	/*
	 * Call restorepctx if context ops have been installed for the proc.
	 */
	movq	T_PROCP(%r12), %rcx
	cmpq	$0, P_PCTX(%rcx)
	jz	.norestorepctx
	movq	%rcx, %rdi
	call	restorepctx
.norestorepctx:

	STORE_INTR_START(%r12)

	/*
	 * If we came into swtch with the ability to access userland pages, go
	 * ahead and restore that fact by disabling SMAP.  Clear the indicator
	 * flag out of paranoia.
	 */
	movq	T_USERACC(%r12), %rax	/* should we disable smap? */
	cmpq	$0, %rax		/* skip call when zero */
	jz	.nosmap
	xorq	%rax, %rax
	movq	%rax, T_USERACC(%r12)
	call	smap_disable
.nosmap:

	call	smt_mark

	/*
	 * Restore non-volatile registers, then have spl0 return to the
	 * resuming thread's PC after first setting the priority as low as
	 * possible and blocking all interrupt threads that may be active.
	 */
	movq	%r13, %rax		/* save return address */
	RESTORE_REGS(%r11)
	pushq	%rax			/* push return address for spl0() */
	call	__dtrace_probe___sched_on__cpu
	jmp	spl0

resume_return:
	/*
	 * Remove stack frame created in SAVE_REGS()
	 */
	addq	$CLONGSIZE, %rsp
	ret
	SET_SIZE(_resume_from_idle)
	SET_SIZE(resume)

	ENTRY(resume_from_zombie)
	movq	%gs:CPU_THREAD, %rax
	leaq	resume_from_zombie_return(%rip), %r11

	/*
	 * Save non-volatile registers, and set return address for current
	 * thread to resume_from_zombie_return.
	 *
	 * %r12 = t (new thread) when done
	 */
	SAVE_REGS(%rax, %r11)

	movq	%gs:CPU_THREAD, %r13	/* %r13 = curthread */

	/* clean up the fp unit. It might be left enabled */

#if defined(__xpv)		/* XXPV XXtclayton */
	/*
	 * Remove this after bringup.
	 * (Too many #gp's for an instrumented hypervisor.)
	 */
	STTS(%rax)
#else
	movq	%cr0, %rax
	testq	$CR0_TS, %rax
	jnz	.zfpu_disabled		/* if TS already set, nothing to do */
	fninit				/* init fpu & discard pending error */
	orq	$CR0_TS, %rax
	movq	%rax, %cr0
.zfpu_disabled:

#endif	/* __xpv */

	/*
	 * Temporarily switch to the idle thread's stack so that the zombie
	 * thread's stack can be reclaimed by the reaper.
	 */
	movq	%gs:CPU_IDLE_THREAD, %rax	/* idle thread pointer */
	movq	T_SP(%rax), %rsp	/* get onto idle thread stack */

	/*
	 * Sigh. If the idle thread has never run thread_start()
	 * then t_sp is mis-aligned by thread_load().
	 */
	andq	$_BITNOT(STACK_ALIGN-1), %rsp

	/*
	 * Set the idle thread as the current thread.
	 */
	movq	%rax, %gs:CPU_THREAD

	/* switch in the hat context for the new thread */
	GET_THREAD_HATP(%rdi, %r12, %r11)
	call	hat_switch

	/*
	 * Put the zombie on death-row.
	 */
	movq	%r13, %rdi
	call	reapq_add

	jmp	_resume_from_idle	/* finish job of resume */

resume_from_zombie_return:
	RESTORE_REGS(%r11)		/* restore non-volatile registers */
	call	__dtrace_probe___sched_on__cpu

	/*
	 * Remove stack frame created in SAVE_REGS()
	 */
	addq	$CLONGSIZE, %rsp
	ret
	SET_SIZE(resume_from_zombie)

	ENTRY(resume_from_intr)
	movq	%gs:CPU_THREAD, %rax
	leaq	resume_from_intr_return(%rip), %r11

	/*
	 * Save non-volatile registers, and set return address for current
	 * thread to resume_from_intr_return.
	 *
	 * %r12 = t (new thread) when done
	 */
	SAVE_REGS(%rax, %r11)

	movq	%gs:CPU_THREAD, %r13	/* %r13 = curthread */
	movq	%r12, %gs:CPU_THREAD	/* set CPU's thread pointer */
	mfence				/* synchronize with mutex_exit() */
	movq	T_SP(%r12), %rsp	/* restore resuming thread's sp */
	xorl	%ebp, %ebp		/* make $<threadlist behave better */

	/*
	 * Unlock the outgoing thread's mutex so that it can be dispatched
	 * by another processor.
	 */
	xorl	%eax, %eax
	xchgb	%al, T_LOCK(%r13)

	STORE_INTR_START(%r12)

	call	smt_mark

	/*
	 * Restore non-volatile registers, then have spl0 return to the
	 * resuming thread's PC after first setting the priority as low as
	 * possible and blocking all interrupt threads that may be active.
	 */
	movq	T_PC(%r12), %rax	/* saved return addr */
	RESTORE_REGS(%r11);
	pushq	%rax			/* push return address for spl0() */
	call	__dtrace_probe___sched_on__cpu
	jmp	spl0

resume_from_intr_return:
	/*
	 * Remove stack frame created in SAVE_REGS()
	 */
	addq	$CLONGSIZE, %rsp
	ret
	SET_SIZE(resume_from_intr)

	ENTRY(thread_start)
	popq	%rax		/* start() */
	popq	%rdi		/* arg */
	popq	%rsi		/* len */
	movq	%rsp, %rbp
	INDIRECT_CALL_REG(rax)
	call	thread_exit	/* destroy thread if it returns. */
	/*NOTREACHED*/
	SET_SIZE(thread_start)
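
/*
 * A note on thread_start(): a newly created thread is first brought onto
 * a CPU via resume(), whose T_PC/RESTORE_REGS path (via spl0) "returns"
 * into thread_start above.  The three pops rely on thread_load() having
 * laid out the initial stack with the start routine, its argument, and
 * the argument length on top, in that order -- roughly this hedged,
 * C-level sketch of the setup (see the machine-dependent thread_load()
 * for the real code; field names are illustrative):
 *
 *	t->t_pc = (uintptr_t)thread_start;
 *	*--sp = len;
 *	*--sp = (uintptr_t)arg;
 *	*--sp = (uintptr_t)start;
 *	t->t_sp = (uintptr_t)sp;
 */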