/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 * Copyright 2022 Oxide Computer Company
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>

/*
 * cpu0 and default tables and structures.
 */
user_desc_t	*gdt0;
#if !defined(__xpv)
desctbr_t	gdt0_default_r;
#endif

gate_desc_t	*idt0;		/* interrupt descriptor table */

tss_t		*ktss0;		/* kernel task state structure */


user_desc_t	zero_udesc;	/* base zero user desc native procs */
user_desc_t	null_udesc;	/* null user descriptor */
system_desc_t	null_sdesc;	/* null system descriptor */

user_desc_t	zero_u32desc;	/* 32-bit compatibility procs */

user_desc_t	ucs_on;
user_desc_t	ucs_off;
user_desc_t	ucs32_on;
user_desc_t	ucs32_off;

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void	fast_null(void);
extern hrtime_t	get_hrtime(void);
extern hrtime_t	gethrvtime(void);
extern hrtime_t	get_hrestime(void);
extern uint64_t	getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())(uintptr_t)get_hrtime,	/* T_GETHRTIME */
	(void (*)())(uintptr_t)gethrvtime,	/* T_GETHRVTIME */
	(void (*)())(uintptr_t)get_hrestime,	/* T_GETHRESTIME */
	(void (*)())(uintptr_t)getlgrp		/* T_GETLGRP */
};
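
/*
 * For illustration (a sketch, not an interface of this file): userland
 * typically reaches these handlers through the fast trap gate installed at
 * T_FASTTRAP in init_idt_common() below, passing the fasttable[] index in
 * %eax, along the lines of:
 *
 *	movl	$T_GETHRTIME, %eax
 *	int	$T_FASTTRAP
 */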

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in a format the hardware
 * can understand.
 */

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with descriptor priority and present bits. Default operand size must
 * be zero when in long mode. In 32-bit compatibility mode all fields
 * are treated as in legacy mode. For data segments while in long mode
 * only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
	/* This should never be a "system" segment. */
	ASSERT3U(type & SDT_S, !=, 0);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;		/* 32-bit operands only */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	ASSERT3U(type & SDT_A, !=, 0);

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;	/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}
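
/*
 * For illustration, the 64-bit user code segment that init_gdt_common()
 * builds below is:
 *
 *	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
 *	    SDP_PAGES, SDP_OP32);
 *
 * SDT_MEMERA is an execute/read type with both SDT_S and SDT_A set, so the
 * ASSERTs above hold; base and limit are ignored by hardware in long mode.
 * The split base (16 + 8 + 8 bits) and limit (16 + 4 bits) assignments
 * mirror how the hardware scatters those fields across the descriptor.
 */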

/*
 * Install system segment descriptor for LDT and TSS segments.
 */

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t	base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}

/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64-bit native, if we have KPTI enabled, we use the IST stack mechanism
 * on all interrupts. We have different ISTs for each class of exceptions that
 * are most likely to occur while handling an existing exception; while many
 * of these are just going to panic, it's nice not to trample on the existing
 * exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
 * we do a read from KMDB that causes another #PF. Without its own IST, this
 * would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
	_NOTE(ARGUNUSED(vector));
	return (IST_NONE);
#else
	switch (vector) {
	/* These should always use IST even without KPTI enabled. */
	case T_DBLFLT:
		return (IST_DF);
	case T_NMIFLT:
		return (IST_NMI);
	case T_MCE:
		return (IST_MCE);

	case T_BPTFLT:
	case T_SGLSTP:
		if (kpti_enable == 1) {
			return (IST_DBG);
		}
		return (IST_NONE);
	case T_STKFLT:
	case T_GPFLT:
	case T_PGFLT:
		if (kpti_enable == 1) {
			return (IST_NESTABLE);
		}
		return (IST_NONE);
	default:
		if (kpti_enable == 1) {
			return (IST_DEFAULT);
		}
		return (IST_NONE);
	}
#endif
}

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_ist = ist;
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}
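
/*
 * For illustration, with KPTI enabled the #PF entry built in
 * init_idt_common() below is:
 *
 *	set_gatesegd(&idt[T_PGFLT], &tr_pftrap, KCS_SEL, SDT_SYSIGT,
 *	    TRP_KPL, idt_vector_to_ist(T_PGFLT));
 *
 * i.e. an interrupt gate through the kernel code selector at DPL 0, using
 * the nestable IST stack chosen by idt_vector_to_ist() above.
 */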

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the GDT whenever we change segment registers around.
	 * With KPTI on, the GDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
		panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else	/* __xpv */
	CPU->cpu_gdt[sidx] = *udp;
#endif	/* __xpv */
}

/*
 * Writes the single descriptor pointed to by udp into a process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(DEBUG)
	/* This should never be a "system" segment, but it might be null. */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_S, !=, 0);
	}
	/*
	 * We should always set the "accessed" bit (SDT_A), otherwise the CPU
	 * will write to the LDT whenever we change segment registers around.
	 * With KPTI on, the LDT is read-only in the user page table, which
	 * causes crashes if we don't set this.
	 */
	if (udp->usd_p != 0 || udp->usd_type != 0) {
		ASSERT3U(udp->usd_type & SDT_A, !=, 0);
	}
#endif

#if defined(__xpv)
	uint64_t dpa;

	dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
	    ((uintptr_t)ldp & PAGEOFFSET);

	/*
	 * The hypervisor is a little more restrictive about what it
	 * supports in the LDT.
	 */
	if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
		return (EINVAL);

#else	/* __xpv */
	*ldp = *udp;

#endif	/* __xpv */
	return (0);
}
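
/*
 * For illustration, a caller updating an lwp's %fs descriptor might pin
 * itself to the current CPU for the duration (hypothetical sketch):
 *
 *	kpreempt_disable();
 *	gdt_update_usegd(GDT_LWPFS, &udesc);
 *	kpreempt_enable();
 */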

#if defined(__xpv)

/*
 * Converts a hw format gate descriptor into pseudo-IDT format for the
 * hypervisor.
 * Returns true if a valid entry was written.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
	trap_info_t *ti = ti_arg;	/* XXPV Aargh - segments.h comment */

	/*
	 * skip holes in the IDT
	 */
	if (GATESEG_GETOFFSET(sgd) == 0)
		return (0);

	ASSERT(sgd->sgd_type == SDT_SYSIGT);
	ti->vector = vec;
	TI_SET_DPL(ti, sgd->sgd_dpl);

	/*
	 * Is this an interrupt gate?
	 */
	if (sgd->sgd_type == SDT_SYSIGT) {
		/* LINTED */
		TI_SET_IF(ti, 1);
	}
	ti->cs = sgd->sgd_selector;
	ti->cs |= SEL_KPL;	/* force into ring 3. see KCS_SEL */
	ti->address = GATESEG_GETOFFSET(sgd);
	return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual
 * IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
	trap_info_t trapinfo[2];

	bzero(trapinfo, sizeof (trapinfo));
	if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
		return;
	if (xen_set_trap_table(trapinfo) != 0)
		panic("xen_idt_write: xen_set_trap_table() failed");
}

#endif	/* __xpv */


/*
 * Build kernel GDT.
 */

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in
	 * 64-bit mode, but we set it here to 0xFFFF so that we can use the
	 * SYSRET instruction to return from system calls back to 32-bit
	 * applications. SYSRET doesn't update the base, limit, or attributes
	 * of %ss or %ds descriptors. We therefore must ensure that the kernel
	 * uses something, though it will be ignored by hardware, that is
	 * compatible with 32-bit apps. For the same reason we must set the
	 * default op size of this descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32 and 64 bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is 0.
	 */

	/*
	 * Kernel TSS
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif	/* !__xpv */

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}
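
/*
 * Recall that a selector value is simply (index << 3) | TI | RPL, so the
 * entries built above are reached through the selectors defined in
 * segments.h; e.g. KCS_SEL names GDT_KCODE at RPL 0, and UDS_SEL names
 * GDT_UDATA at RPL 3.
 */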

#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t	gdtpa;
	ulong_t		ma[1];		/* XXPV should be a memory_t */
	ulong_t		addr;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * selectors?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL,
	    KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */

static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

#if !defined(__lint)
	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase".  So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
	return (gdt0);
}

#endif	/* __xpv */


/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We use interrupt gates for i386 as well, even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV],
	    (kpti_enable == 1) ? &tr_div0trap : &div0trap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
	set_gatesegd(&idt[T_SGLSTP],
	    (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
	set_gatesegd(&idt[T_NMIFLT],
	    (kpti_enable == 1) ? &tr_nmiint : &nmiint,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
	set_gatesegd(&idt[T_BPTFLT],
	    (kpti_enable == 1) ? &tr_brktrap : &brktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
	set_gatesegd(&idt[T_OVFLW],
	    (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
	set_gatesegd(&idt[T_BOUNDFLT],
	    (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
	set_gatesegd(&idt[T_ILLINST],
	    (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
	set_gatesegd(&idt[T_NOEXTFLT],
	    (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
#if !defined(__xpv)
	set_gatesegd(&idt[T_DBLFLT],
	    (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif	/* !__xpv */

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */
	set_gatesegd(&idt[T_TSSFLT],
	    (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
	set_gatesegd(&idt[T_SEGFLT],
	    (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
	set_gatesegd(&idt[T_STKFLT],
	    (kpti_enable == 1) ? &tr_stktrap : &stktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
	set_gatesegd(&idt[T_GPFLT],
	    (kpti_enable == 1) ? &tr_gptrap : &gptrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
	set_gatesegd(&idt[T_PGFLT],
	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
	set_gatesegd(&idt[T_EXTERRFLT],
	    (kpti_enable == 1) ? &tr_ndperr : &ndperr,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
	set_gatesegd(&idt[T_ALIGNMENT],
	    (kpti_enable == 1) ? &tr_achktrap : &achktrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
	set_gatesegd(&idt[T_MCE],
	    (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
	set_gatesegd(&idt[T_SIMDFPE],
	    (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP],
	    (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

	/*
	 * System call handler.
	 */
	set_gatesegd(&idt[T_SYSCALLINT],
	    (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET],
	    (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

	/*
	 * Prepare interposing descriptor for the syscall handler
	 * and cache copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];

	set_gatesegd(&(brand_tbl[0].ih_interp_desc),
	    (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
	    &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    idt_vector_to_ist(T_SYSCALLINT));

	brand_tbl[1].ih_inum = 0;
}

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
	init_idt_common(idt);
}

#else	/* __xpv */

static void
init_idt(gate_desc_t *idt)
{
	char	ivctname[80];
	void	(*ivctptr)(void);
	int	i;

	/*
	 * Initialize entire table with 'reserved' trap and then overwrite
	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
	 * since it can only be generated on a 386 processor. 15 is also
	 * unsupported and reserved.
	 */
	for (i = 0; i < NIDT; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_RESVTRAP));
	}

	/*
	 * 20-31 reserved
	 */
	for (i = 20; i < 32; i++) {
		set_gatesegd(&idt[i],
		    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(T_INVALTRAP));
	}

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
		(void) snprintf(ivctname, sizeof (ivctname),
		    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);

		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    idt_vector_to_ist(i));
	}

	/*
	 * Now install the common ones. Note that it will overlay some
	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
	 */
	init_idt_common(idt);
}

#endif	/* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	wr_ldtr(0);
#endif
}

#if !defined(__xpv)

static void
init_tss(void)
{
	extern struct cpu cpus[];

	/*
	 * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
	 * context switch but it'll be overwritten with this same value anyway.
	 */
	if (kpti_enable == 1) {
		ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set up the IST stacks for double fault, NMI, MCE. Each pointer
	 * references the high end of its stack, since stacks grow down.
	 */
	ktss0->tss_ist1 =
	    (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
	ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

	/*
	 * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
	 * enabled), and also for KDI (always).
	 */
	ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

	if (kpti_enable == 1) {
		/* This IST stack is used for #GP,#PF,#SS (fault) intrs. */
		ktss0->tss_ist5 =
		    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

		/* This IST stack is used for all other intrs (for KPTI). */
		ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
	}

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}

#endif	/* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
	uint_t vec;
	user_desc_t *gdt;

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();

	/*
	 * Store static pa of gdt to speed up pa_to_ma() translations
	 * on lwp context switches.
	 */
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;
	CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

	/*
	 * Setup and install our IDT.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	init_idt(idt0);
	for (vec = 0; vec < NIDT; vec++)
		xen_idt_write(&idt0[vec], vec);

	CPU->cpu_idt = idt0;

	/*
	 * set default kernel stack
	 */
	xen_stack_switch(KDS_SEL,
	    (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

	xen_init_callbacks();

	init_ldt();
}

#else	/* __xpv */

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
#if !defined(__lint)
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
#if !defined(__lint)
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);


	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Initialize this CPU's LDT.
	 */
	CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
	    LDT_CPU_SIZE, PAGESIZE);
	bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
	CPU->cpu_m.mcpu_ldt_len = 0;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;


	init_tss();
	CPU->cpu_tss = ktss0;
	init_ldt();

	/* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
	kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif	/* __xpv */

#ifndef	__xpv
/*
 * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 64Kb on a VM exit,
 * so we have to manually fix it up ourselves.
 *
 * The caller may still need to make sure that it can't go off-CPU with the
 * incorrect limit before calling this (such as by disabling pre-emption).
 */
void
reset_gdtr_limit(void)
{
	ulong_t flags = intr_clear();
	desctbr_t gdtr;

	rd_gdtr(&gdtr);
	gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1;
	wr_gdtr(&gdtr);

	intr_restore(flags);
}
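
/*
 * For illustration, a caller might use this as (hypothetical sketch):
 *
 *	kpreempt_disable();
 *	reset_gdtr_limit();
 *	... code that must run with the correct limit ...
 *	kpreempt_enable();
 */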
#endif	/* !__xpv */

/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too?  See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void *arg __unused)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via the
	 * syscall instruction. The 32-bit syscalls are handled by the
	 * interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR,
			    (uintptr_t)tr_brand_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
		}
	}

#endif

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP,
			    (uintptr_t)tr_brand_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
		}
	}
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void *arg __unused)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
		xen_idt_write(&idt[brand_tbl[i].ih_inum],
		    brand_tbl[i].ih_inum);
#endif
	}

#if defined(__xpv)

	/*
	 * See comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
		} else {
			wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
			wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
		}
	}

#endif

	if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
		if (kpti_enable == 1) {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
		} else {
			wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
		}
	}
}