1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* derived from netbsd's xen_machdep.c 1.1.2.1 */ 28 29 /* 30 * 31 * Copyright (c) 2004 Christian Limpach. 32 * All rights reserved. 33 * 34 * Redistribution and use in source and binary forms, with or without 35 * modification, are permitted provided that the following conditions 36 * are met: 37 * 1. Redistributions of source code must retain the above copyright 38 * notice, this list of conditions and the following disclaimer. 39 * 2. Redistributions in binary form must reproduce the above copyright 40 * notice, this list of conditions and the following disclaimer in the 41 * documentation and/or other materials provided with the distribution. 42 * 3. This section intentionally left blank. 43 * 4. The name of the author may not be used to endorse or promote products 44 * derived from this software without specific prior written permission. 45 * 46 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 47 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 48 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 49 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 50 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 51 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 52 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 53 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 54 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 55 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 56 */ 57 /* 58 * Section 3 of the above license was updated in response to bug 6379571. 59 */ 60 61 #include <sys/ctype.h> 62 #include <sys/types.h> 63 #include <sys/cmn_err.h> 64 #include <sys/trap.h> 65 #include <sys/segments.h> 66 #include <sys/hypervisor.h> 67 #include <sys/xen_mmu.h> 68 #include <sys/machsystm.h> 69 #include <sys/promif.h> 70 #include <sys/bootconf.h> 71 #include <sys/bootinfo.h> 72 #include <sys/cpr.h> 73 #include <sys/taskq.h> 74 #include <sys/uadmin.h> 75 #include <sys/evtchn_impl.h> 76 #include <sys/archsystm.h> 77 #include <xen/sys/xenbus_impl.h> 78 #include <sys/mach_mmu.h> 79 #include <vm/hat_i86.h> 80 #include <sys/gnttab.h> 81 #include <sys/reboot.h> 82 #include <sys/stack.h> 83 #include <sys/clock.h> 84 #include <sys/bitmap.h> 85 #include <sys/processor.h> 86 #include <sys/xen_errno.h> 87 #include <sys/xpv_panic.h> 88 #include <sys/smp_impldefs.h> 89 #include <sys/cpu.h> 90 #include <sys/balloon_impl.h> 91 #include <sys/ddi.h> 92 93 #ifdef DEBUG 94 #define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf 95 #else 96 #define SUSPEND_DEBUG(...) 97 #endif 98 99 int cpr_debug; 100 cpuset_t cpu_suspend_lost_set; 101 static int xen_suspend_debug; 102 103 uint_t xen_phys_ncpus; 104 xen_mc_logical_cpu_t *xen_phys_cpus; 105 int xen_physinfo_debug = 0; 106 107 /* 108 * Determine helpful version information. 109 * 110 * (And leave copies in the data segment so we can look at them later 111 * with e.g. kmdb.) 112 */ 113 114 typedef enum xen_version { 115 XENVER_BOOT_IDX, 116 XENVER_CURRENT_IDX 117 } xen_version_t; 118 119 struct xenver { 120 ulong_t xv_major; 121 ulong_t xv_minor; 122 ulong_t xv_revision; 123 xen_extraversion_t xv_ver; 124 ulong_t xv_is_xvm; 125 xen_changeset_info_t xv_chgset; 126 xen_compile_info_t xv_build; 127 xen_capabilities_info_t xv_caps; 128 } xenver[2]; 129 130 #define XENVER_BOOT(m) (xenver[XENVER_BOOT_IDX].m) 131 #define XENVER_CURRENT(m) (xenver[XENVER_CURRENT_IDX].m) 132 133 /* 134 * Update the xenver data. We maintain two copies, boot and 135 * current. If we are setting the boot, then also set current. 136 */ 137 static void 138 xen_set_version(xen_version_t idx) 139 { 140 ulong_t ver; 141 142 bzero(&xenver[idx], sizeof (xenver[idx])); 143 144 ver = HYPERVISOR_xen_version(XENVER_version, 0); 145 146 xenver[idx].xv_major = BITX(ver, 31, 16); 147 xenver[idx].xv_minor = BITX(ver, 15, 0); 148 149 (void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver); 150 151 /* 152 * The revision is buried in the extraversion information that is 153 * maintained by the hypervisor. For our purposes we expect that 154 * the revision number is: 155 * - the second character in the extraversion information 156 * - one character long 157 * - numeric digit 158 * If it isn't then we can't extract the revision and we leave it 159 * set to 0. 160 */ 161 if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1])) 162 xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0'; 163 else 164 cmn_err(CE_WARN, "Cannot extract revision on this hypervisor " 165 "version: v%s, unexpected version format", 166 xenver[idx].xv_ver); 167 168 xenver[idx].xv_is_xvm = 0; 169 170 if (strlen(xenver[idx].xv_ver) >= 4 && 171 strncmp(xenver[idx].xv_ver + strlen(xenver[idx].xv_ver) - 4, 172 "-xvm", 4) == 0) 173 xenver[idx].xv_is_xvm = 1; 174 175 (void) HYPERVISOR_xen_version(XENVER_changeset, 176 &xenver[idx].xv_chgset); 177 178 (void) HYPERVISOR_xen_version(XENVER_compile_info, 179 &xenver[idx].xv_build); 180 /* 181 * Capabilities are a set of space separated ascii strings 182 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64' 183 */ 184 (void) HYPERVISOR_xen_version(XENVER_capabilities, 185 &xenver[idx].xv_caps); 186 187 cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major, 188 xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset); 189 190 if (idx == XENVER_BOOT_IDX) 191 bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX], 192 sizeof (xenver[XENVER_BOOT_IDX])); 193 } 194 195 typedef enum xen_hypervisor_check { 196 XEN_RUN_CHECK, 197 XEN_SUSPEND_CHECK 198 } xen_hypervisor_check_t; 199 200 /* 201 * To run the hypervisor must be 3.0.4 or better. To suspend/resume 202 * we need 3.0.4 or better and if it is 3.0.4. then it must be provided 203 * by the Solaris xVM project. 204 * Checking can be disabled for testing purposes by setting the 205 * xen_suspend_debug variable. 206 */ 207 static int 208 xen_hypervisor_supports_solaris(xen_hypervisor_check_t check) 209 { 210 if (xen_suspend_debug == 1) 211 return (1); 212 if (XENVER_CURRENT(xv_major) < 3) 213 return (0); 214 if (XENVER_CURRENT(xv_major) > 3) 215 return (1); 216 if (XENVER_CURRENT(xv_minor) > 0) 217 return (1); 218 if (XENVER_CURRENT(xv_revision) < 4) 219 return (0); 220 if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 && 221 !XENVER_CURRENT(xv_is_xvm)) 222 return (0); 223 224 return (1); 225 } 226 227 /* 228 * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the 229 * workaround. 230 */ 231 static void 232 xen_pte_workaround(void) 233 { 234 #if defined(__amd64) 235 extern int pt_kern; 236 237 if (XENVER_CURRENT(xv_major) != 3) 238 return; 239 if (XENVER_CURRENT(xv_minor) > 1) 240 return; 241 if (XENVER_CURRENT(xv_minor) == 1 && 242 XENVER_CURRENT(xv_revision) > 1) 243 return; 244 if (XENVER_CURRENT(xv_is_xvm)) 245 return; 246 247 pt_kern = PT_USER; 248 #endif 249 } 250 251 void 252 xen_set_callback(void (*func)(void), uint_t type, uint_t flags) 253 { 254 struct callback_register cb; 255 256 bzero(&cb, sizeof (cb)); 257 #if defined(__amd64) 258 cb.address = (ulong_t)func; 259 #elif defined(__i386) 260 cb.address.cs = KCS_SEL; 261 cb.address.eip = (ulong_t)func; 262 #endif 263 cb.type = type; 264 cb.flags = flags; 265 266 /* 267 * XXPV always ignore return value for NMI 268 */ 269 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 && 270 type != CALLBACKTYPE_nmi) 271 panic("HYPERVISOR_callback_op failed"); 272 } 273 274 void 275 xen_init_callbacks(void) 276 { 277 /* 278 * register event (interrupt) handler. 279 */ 280 xen_set_callback(xen_callback, CALLBACKTYPE_event, 0); 281 282 /* 283 * failsafe handler. 284 */ 285 xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe, 286 CALLBACKF_mask_events); 287 288 /* 289 * NMI handler. 290 */ 291 xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0); 292 293 /* 294 * system call handler 295 * XXPV move to init_cpu_syscall? 296 */ 297 #if defined(__amd64) 298 xen_set_callback(sys_syscall, CALLBACKTYPE_syscall, 299 CALLBACKF_mask_events); 300 #endif /* __amd64 */ 301 } 302 303 304 /* 305 * cmn_err() followed by a 1/4 second delay; this gives the 306 * logging service a chance to flush messages and helps avoid 307 * intermixing output from prom_printf(). 308 * XXPV: doesn't exactly help us on UP though. 309 */ 310 /*PRINTFLIKE2*/ 311 void 312 cpr_err(int ce, const char *fmt, ...) 313 { 314 va_list adx; 315 316 va_start(adx, fmt); 317 vcmn_err(ce, fmt, adx); 318 va_end(adx); 319 drv_usecwait(MICROSEC >> 2); 320 } 321 322 void 323 xen_suspend_devices(void) 324 { 325 int rc; 326 327 SUSPEND_DEBUG("xen_suspend_devices\n"); 328 329 if ((rc = cpr_suspend_devices(ddi_root_node())) != 0) 330 panic("failed to suspend devices: %d", rc); 331 } 332 333 void 334 xen_resume_devices(void) 335 { 336 int rc; 337 338 SUSPEND_DEBUG("xen_resume_devices\n"); 339 340 if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0) 341 panic("failed to resume devices: %d", rc); 342 } 343 344 /* 345 * The list of mfn pages is out of date. Recompute it. 346 */ 347 static void 348 rebuild_mfn_list(void) 349 { 350 int i = 0; 351 size_t sz; 352 size_t off; 353 pfn_t pfn; 354 355 SUSPEND_DEBUG("rebuild_mfn_list\n"); 356 357 sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK; 358 359 for (off = 0; off < sz; off += MMU_PAGESIZE) { 360 size_t j = mmu_btop(off); 361 if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) { 362 pfn = hat_getpfnum(kas.a_hat, 363 (caddr_t)&mfn_list_pages[j]); 364 mfn_list_pages_page[i++] = pfn_to_mfn(pfn); 365 } 366 367 pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off); 368 mfn_list_pages[j] = pfn_to_mfn(pfn); 369 } 370 371 pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page); 372 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list 373 = pfn_to_mfn(pfn); 374 } 375 376 static void 377 suspend_cpus(void) 378 { 379 int i; 380 381 SUSPEND_DEBUG("suspend_cpus\n"); 382 383 mp_enter_barrier(); 384 385 for (i = 1; i < ncpus; i++) { 386 if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { 387 SUSPEND_DEBUG("xen_vcpu_down %d\n", i); 388 (void) xen_vcpu_down(i); 389 } 390 391 mach_cpucontext_reset(cpu[i]); 392 } 393 } 394 395 static void 396 resume_cpus(void) 397 { 398 int i; 399 400 for (i = 1; i < ncpus; i++) { 401 if (cpu[i] == NULL) 402 continue; 403 404 if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { 405 SUSPEND_DEBUG("xen_vcpu_up %d\n", i); 406 mach_cpucontext_restore(cpu[i]); 407 (void) xen_vcpu_up(i); 408 } 409 } 410 411 mp_leave_barrier(); 412 } 413 414 /* 415 * Top level routine to direct suspend/resume of a domain. 416 */ 417 void 418 xen_suspend_domain(void) 419 { 420 extern void rtcsync(void); 421 extern hrtime_t hres_last_tick; 422 mfn_t start_info_mfn; 423 ulong_t flags; 424 pfn_t pfn; 425 int i; 426 427 /* 428 * Check that we are happy to suspend on this hypervisor. 429 */ 430 if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) { 431 cpr_err(CE_WARN, "Cannot suspend on this hypervisor " 432 "version: v%lu.%lu%s, need at least version v3.0.4 or " 433 "-xvm based hypervisor", XENVER_CURRENT(xv_major), 434 XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver)); 435 return; 436 } 437 438 /* 439 * XXPV - Are we definitely OK to suspend by the time we've connected 440 * the handler? 441 */ 442 443 cpr_err(CE_NOTE, "Domain suspending for save/migrate"); 444 445 SUSPEND_DEBUG("xen_suspend_domain\n"); 446 447 /* 448 * suspend interrupts and devices 449 * XXPV - we use suspend/resume for both save/restore domains (like sun 450 * cpr) and for migration. Would be nice to know the difference if 451 * possible. For save/restore where down time may be a long time, we 452 * may want to do more of the things that cpr does. (i.e. notify user 453 * processes, shrink memory footprint for faster restore, etc.) 454 */ 455 xen_suspend_devices(); 456 SUSPEND_DEBUG("xenbus_suspend\n"); 457 xenbus_suspend(); 458 459 pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info); 460 start_info_mfn = pfn_to_mfn(pfn); 461 462 /* 463 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe 464 * wrt xenbus being suspended here? 465 */ 466 mutex_enter(&cpu_lock); 467 468 /* 469 * Suspend must be done on vcpu 0, as no context for other CPUs is 470 * saved. 471 * 472 * XXPV - add to taskq API ? 473 */ 474 thread_affinity_set(curthread, 0); 475 kpreempt_disable(); 476 477 SUSPEND_DEBUG("xen_start_migrate\n"); 478 xen_start_migrate(); 479 if (ncpus > 1) 480 suspend_cpus(); 481 482 /* 483 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence 484 * any holder would have dropped it to get through suspend_cpus(). 485 */ 486 mutex_enter(&ec_lock); 487 488 /* 489 * From here on in, we can't take locks. 490 */ 491 SUSPEND_DEBUG("ec_suspend\n"); 492 ec_suspend(); 493 SUSPEND_DEBUG("gnttab_suspend\n"); 494 gnttab_suspend(); 495 496 flags = intr_clear(); 497 498 xpv_time_suspend(); 499 500 /* 501 * Currently, the hypervisor incorrectly fails to bring back 502 * powered-down VCPUs. Thus we need to record any powered-down VCPUs 503 * to prevent any attempts to operate on them. But we have to do this 504 * *after* the very first time we do ec_suspend(). 505 */ 506 for (i = 1; i < ncpus; i++) { 507 if (cpu[i] == NULL) 508 continue; 509 510 if (cpu_get_state(cpu[i]) == P_POWEROFF) 511 CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i); 512 } 513 514 /* 515 * The dom0 save/migrate code doesn't automatically translate 516 * these into PFNs, but expects them to be, so we do it here. 517 * We don't use mfn_to_pfn() because so many OS services have 518 * been disabled at this point. 519 */ 520 xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn]; 521 xen_info->console.domU.mfn = 522 mfn_to_pfn_mapping[xen_info->console.domU.mfn]; 523 524 if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) { 525 prom_printf("xen_suspend_domain(): " 526 "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n"); 527 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 528 } 529 530 if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, 531 0, UVMF_INVLPG)) { 532 prom_printf("xen_suspend_domain(): " 533 "HYPERVISOR_update_va_mapping() failed\n"); 534 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 535 } 536 537 SUSPEND_DEBUG("HYPERVISOR_suspend\n"); 538 539 /* 540 * At this point we suspend and sometime later resume. 541 */ 542 if (HYPERVISOR_suspend(start_info_mfn)) { 543 prom_printf("xen_suspend_domain(): " 544 "HYPERVISOR_suspend() failed\n"); 545 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 546 } 547 548 /* 549 * Point HYPERVISOR_shared_info to its new value. 550 */ 551 if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, 552 xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE, 553 UVMF_INVLPG)) 554 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 555 556 if (xen_info->nr_pages != mfn_count) { 557 prom_printf("xen_suspend_domain(): number of pages" 558 " changed, was 0x%lx, now 0x%lx\n", mfn_count, 559 xen_info->nr_pages); 560 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 561 } 562 563 xpv_time_resume(); 564 565 cached_max_mfn = 0; 566 567 SUSPEND_DEBUG("gnttab_resume\n"); 568 gnttab_resume(); 569 570 /* XXPV: add a note that this must be lockless. */ 571 SUSPEND_DEBUG("ec_resume\n"); 572 ec_resume(); 573 574 intr_restore(flags); 575 576 if (ncpus > 1) 577 resume_cpus(); 578 579 mutex_exit(&ec_lock); 580 xen_end_migrate(); 581 mutex_exit(&cpu_lock); 582 583 /* 584 * Now we can take locks again. 585 */ 586 587 /* 588 * Force the tick value used for tv_nsec in hres_tick() to be up to 589 * date. rtcsync() will reset the hrestime value appropriately. 590 */ 591 hres_last_tick = xpv_gethrtime(); 592 593 /* 594 * XXPV: we need to have resumed the CPUs since this takes locks, but 595 * can remote CPUs see bad state? Presumably yes. Should probably nest 596 * taking of todlock inside of cpu_lock, or vice versa, then provide an 597 * unlocked version. Probably need to call clkinitf to reset cpu freq 598 * and re-calibrate if we migrated to a different speed cpu. Also need 599 * to make a (re)init_cpu_info call to update processor info structs 600 * and device tree info. That remains to be written at the moment. 601 */ 602 rtcsync(); 603 604 rebuild_mfn_list(); 605 606 SUSPEND_DEBUG("xenbus_resume\n"); 607 xenbus_resume(); 608 SUSPEND_DEBUG("xenbus_resume_devices\n"); 609 xen_resume_devices(); 610 611 thread_affinity_clear(curthread); 612 kpreempt_enable(); 613 614 SUSPEND_DEBUG("finished xen_suspend_domain\n"); 615 616 /* 617 * We have restarted our suspended domain, update the hypervisor 618 * details. NB: This must be done at the end of this function, 619 * since we need the domain to be completely resumed before 620 * these functions will work correctly. 621 */ 622 xen_set_version(XENVER_CURRENT_IDX); 623 624 /* 625 * We can check and report a warning, but we don't stop the 626 * process. 627 */ 628 if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) 629 cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " 630 "but need at least version v3.0.4", 631 XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), 632 XENVER_CURRENT(xv_ver)); 633 634 cmn_err(CE_NOTE, "domain restore/migrate completed"); 635 } 636 637 /*ARGSUSED*/ 638 int 639 xen_debug_handler(void *arg) 640 { 641 debug_enter("External debug event received"); 642 643 /* 644 * If we've not got KMDB loaded, output some stuff difficult to capture 645 * from a domain core. 646 */ 647 if (!(boothowto & RB_DEBUG)) { 648 shared_info_t *si = HYPERVISOR_shared_info; 649 int i; 650 651 prom_printf("evtchn_pending [ "); 652 for (i = 0; i < 8; i++) 653 prom_printf("%lx ", si->evtchn_pending[i]); 654 prom_printf("]\nevtchn_mask [ "); 655 for (i = 0; i < 8; i++) 656 prom_printf("%lx ", si->evtchn_mask[i]); 657 prom_printf("]\n"); 658 659 for (i = 0; i < ncpus; i++) { 660 vcpu_info_t *vcpu = &si->vcpu_info[i]; 661 if (cpu[i] == NULL) 662 continue; 663 prom_printf("CPU%d pending %d mask %d sel %lx\n", 664 i, vcpu->evtchn_upcall_pending, 665 vcpu->evtchn_upcall_mask, 666 vcpu->evtchn_pending_sel); 667 } 668 } 669 670 return (0); 671 } 672 673 /*ARGSUSED*/ 674 static void 675 xen_sysrq_handler(struct xenbus_watch *watch, const char **vec, 676 unsigned int len) 677 { 678 xenbus_transaction_t xbt; 679 char key = '\0'; 680 int ret; 681 682 retry: 683 if (xenbus_transaction_start(&xbt)) { 684 cmn_err(CE_WARN, "failed to start sysrq transaction"); 685 return; 686 } 687 688 if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) { 689 /* 690 * ENOENT happens in response to our own xenbus_rm. 691 * XXPV - this happens spuriously on boot? 692 */ 693 if (ret != ENOENT) 694 cmn_err(CE_WARN, "failed to read sysrq: %d", ret); 695 goto out; 696 } 697 698 if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) { 699 cmn_err(CE_WARN, "failed to reset sysrq: %d", ret); 700 goto out; 701 } 702 703 if (xenbus_transaction_end(xbt, 0) == EAGAIN) 704 goto retry; 705 706 /* 707 * Somewhat arbitrary - on Linux this means 'reboot'. We could just 708 * accept any key, but this might increase the risk of sending a 709 * harmless sysrq to the wrong domain... 710 */ 711 if (key == 'b') 712 (void) xen_debug_handler(NULL); 713 else 714 cmn_err(CE_WARN, "Ignored sysrq %c", key); 715 return; 716 717 out: 718 (void) xenbus_transaction_end(xbt, 1); 719 } 720 721 taskq_t *xen_shutdown_tq; 722 723 #define SHUTDOWN_INVALID -1 724 #define SHUTDOWN_POWEROFF 0 725 #define SHUTDOWN_REBOOT 1 726 #define SHUTDOWN_SUSPEND 2 727 #define SHUTDOWN_HALT 3 728 #define SHUTDOWN_MAX 4 729 730 #define SHUTDOWN_TIMEOUT_SECS (60 * 5) 731 732 static const char *cmd_strings[SHUTDOWN_MAX] = { 733 "poweroff", 734 "reboot", 735 "suspend", 736 "halt" 737 }; 738 739 static void 740 xen_dirty_shutdown(void *arg) 741 { 742 int cmd = (uintptr_t)arg; 743 744 cmn_err(CE_WARN, "Externally requested shutdown failed or " 745 "timed out.\nShutting down.\n"); 746 747 switch (cmd) { 748 case SHUTDOWN_HALT: 749 case SHUTDOWN_POWEROFF: 750 (void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred); 751 break; 752 case SHUTDOWN_REBOOT: 753 (void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred); 754 break; 755 } 756 } 757 758 static void 759 xen_shutdown(void *arg) 760 { 761 int cmd = (uintptr_t)arg; 762 proc_t *initpp; 763 764 ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX); 765 766 if (cmd == SHUTDOWN_SUSPEND) { 767 xen_suspend_domain(); 768 return; 769 } 770 771 switch (cmd) { 772 case SHUTDOWN_POWEROFF: 773 force_shutdown_method = AD_POWEROFF; 774 break; 775 case SHUTDOWN_HALT: 776 force_shutdown_method = AD_HALT; 777 break; 778 case SHUTDOWN_REBOOT: 779 force_shutdown_method = AD_BOOT; 780 break; 781 } 782 783 /* 784 * If we're still booting and init(1) isn't set up yet, simply halt. 785 */ 786 mutex_enter(&pidlock); 787 initpp = prfind(P_INITPID); 788 mutex_exit(&pidlock); 789 if (initpp == NULL) { 790 extern void halt(char *); 791 halt("Power off the System"); /* just in case */ 792 } 793 794 /* 795 * else, graceful shutdown with inittab and all getting involved 796 */ 797 psignal(initpp, SIGPWR); 798 799 (void) timeout(xen_dirty_shutdown, arg, 800 SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC)); 801 } 802 803 /*ARGSUSED*/ 804 static void 805 xen_shutdown_handler(struct xenbus_watch *watch, const char **vec, 806 unsigned int len) 807 { 808 char *str; 809 xenbus_transaction_t xbt; 810 int err, shutdown_code = SHUTDOWN_INVALID; 811 unsigned int slen; 812 813 again: 814 err = xenbus_transaction_start(&xbt); 815 if (err) 816 return; 817 if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) { 818 (void) xenbus_transaction_end(xbt, 1); 819 return; 820 } 821 822 SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str); 823 824 /* 825 * If this is a watch fired from our write below, check out early to 826 * avoid an infinite loop. 827 */ 828 if (strcmp(str, "") == 0) { 829 (void) xenbus_transaction_end(xbt, 0); 830 kmem_free(str, slen); 831 return; 832 } else if (strcmp(str, "poweroff") == 0) { 833 shutdown_code = SHUTDOWN_POWEROFF; 834 } else if (strcmp(str, "reboot") == 0) { 835 shutdown_code = SHUTDOWN_REBOOT; 836 } else if (strcmp(str, "suspend") == 0) { 837 shutdown_code = SHUTDOWN_SUSPEND; 838 } else if (strcmp(str, "halt") == 0) { 839 shutdown_code = SHUTDOWN_HALT; 840 } else { 841 printf("Ignoring shutdown request: %s\n", str); 842 } 843 844 /* 845 * XXPV Should we check the value of xenbus_write() too, or are all 846 * errors automatically folded into xenbus_transaction_end() ?? 847 */ 848 (void) xenbus_write(xbt, "control", "shutdown", ""); 849 err = xenbus_transaction_end(xbt, 0); 850 if (err == EAGAIN) { 851 SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id); 852 kmem_free(str, slen); 853 goto again; 854 } 855 856 kmem_free(str, slen); 857 if (shutdown_code != SHUTDOWN_INVALID) { 858 (void) taskq_dispatch(xen_shutdown_tq, xen_shutdown, 859 (void *)(intptr_t)shutdown_code, 0); 860 } 861 } 862 863 static struct xenbus_watch shutdown_watch; 864 static struct xenbus_watch sysrq_watch; 865 866 void 867 xen_late_startup(void) 868 { 869 if (!DOMAIN_IS_INITDOMAIN(xen_info)) { 870 xen_shutdown_tq = taskq_create("shutdown_taskq", 1, 871 maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE); 872 shutdown_watch.node = "control/shutdown"; 873 shutdown_watch.callback = xen_shutdown_handler; 874 if (register_xenbus_watch(&shutdown_watch)) 875 cmn_err(CE_WARN, "Failed to set shutdown watcher"); 876 877 sysrq_watch.node = "control/sysrq"; 878 sysrq_watch.callback = xen_sysrq_handler; 879 if (register_xenbus_watch(&sysrq_watch)) 880 cmn_err(CE_WARN, "Failed to set sysrq watcher"); 881 } 882 balloon_init(xen_info->nr_pages); 883 } 884 885 #ifdef DEBUG 886 #define XEN_PRINTF_BUFSIZE 1024 887 888 char xen_printf_buffer[XEN_PRINTF_BUFSIZE]; 889 890 /* 891 * Printf function that calls hypervisor directly. For DomU it only 892 * works when running on a xen hypervisor built with debug on. Works 893 * always since no I/O ring interaction is needed. 894 */ 895 /*PRINTFLIKE1*/ 896 void 897 xen_printf(const char *fmt, ...) 898 { 899 va_list ap; 900 901 va_start(ap, fmt); 902 (void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap); 903 va_end(ap); 904 905 (void) HYPERVISOR_console_io(CONSOLEIO_write, 906 strlen(xen_printf_buffer), xen_printf_buffer); 907 } 908 #else 909 void 910 xen_printf(const char *fmt, ...) 911 { 912 } 913 #endif /* DEBUG */ 914 915 void 916 startup_xen_version(void) 917 { 918 xen_set_version(XENVER_BOOT_IDX); 919 if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0) 920 cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " 921 "but need at least version v3.0.4", 922 XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), 923 XENVER_CURRENT(xv_ver)); 924 xen_pte_workaround(); 925 } 926 927 int xen_mca_simulate_mc_physinfo_failure = 0; 928 929 void 930 startup_xen_mca(void) 931 { 932 if (!DOMAIN_IS_INITDOMAIN(xen_info)) 933 return; 934 935 xen_phys_ncpus = 0; 936 xen_phys_cpus = NULL; 937 938 if (xen_mca_simulate_mc_physinfo_failure || 939 xen_get_mc_physcpuinfo(NULL, &xen_phys_ncpus) != 0) { 940 cmn_err(CE_WARN, 941 "%sxen_get_mc_physinfo failure during xen MCA startup: " 942 "there will be no machine check support", 943 xen_mca_simulate_mc_physinfo_failure ? "(simulated) " : ""); 944 return; 945 } 946 947 xen_phys_cpus = kmem_alloc(xen_phys_ncpus * 948 sizeof (xen_mc_logical_cpu_t), KM_NOSLEEP); 949 950 if (xen_phys_cpus == NULL) { 951 cmn_err(CE_WARN, 952 "xen_get_physinfo failure: can't allocate CPU array"); 953 return; 954 } 955 956 if (xen_get_mc_physcpuinfo(xen_phys_cpus, &xen_phys_ncpus) != 0) { 957 cmn_err(CE_WARN, "xen_get_mc_physinfo failure: no " 958 "physical CPU info"); 959 kmem_free(xen_phys_cpus, 960 xen_phys_ncpus * sizeof (xen_mc_logical_cpu_t)); 961 xen_phys_ncpus = 0; 962 xen_phys_cpus = NULL; 963 } 964 965 if (xen_physinfo_debug) { 966 xen_mc_logical_cpu_t *xcp; 967 unsigned i; 968 969 cmn_err(CE_NOTE, "xvm mca: %u physical cpus:\n", 970 xen_phys_ncpus); 971 for (i = 0; i < xen_phys_ncpus; i++) { 972 xcp = &xen_phys_cpus[i]; 973 cmn_err(CE_NOTE, "cpu%u: (%u, %u, %u) apid %u", 974 xcp->mc_cpunr, xcp->mc_chipid, xcp->mc_coreid, 975 xcp->mc_threadid, xcp->mc_apicid); 976 } 977 } 978 } 979 980 /* 981 * Miscellaneous hypercall wrappers with slightly more verbose diagnostics. 982 */ 983 984 void 985 xen_set_gdt(ulong_t *frame_list, int entries) 986 { 987 int err; 988 if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) { 989 /* 990 * X_EINVAL: reserved entry or bad frames 991 * X_EFAULT: bad address 992 */ 993 panic("xen_set_gdt(%p, %d): error %d", 994 (void *)frame_list, entries, -(int)err); 995 } 996 } 997 998 void 999 xen_set_ldt(user_desc_t *ldt, uint_t nsels) 1000 { 1001 struct mmuext_op op; 1002 long err; 1003 1004 op.cmd = MMUEXT_SET_LDT; 1005 op.arg1.linear_addr = (uintptr_t)ldt; 1006 op.arg2.nr_ents = nsels; 1007 1008 if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) { 1009 panic("xen_set_ldt(%p, %d): error %d", 1010 (void *)ldt, nsels, -(int)err); 1011 } 1012 } 1013 1014 void 1015 xen_stack_switch(ulong_t ss, ulong_t esp) 1016 { 1017 long err; 1018 1019 if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) { 1020 /* 1021 * X_EPERM: bad selector 1022 */ 1023 panic("xen_stack_switch(%lx, %lx): error %d", ss, esp, 1024 -(int)err); 1025 } 1026 } 1027 1028 long 1029 xen_set_trap_table(trap_info_t *table) 1030 { 1031 long err; 1032 1033 if ((err = HYPERVISOR_set_trap_table(table)) != 0) { 1034 /* 1035 * X_EFAULT: bad address 1036 * X_EPERM: bad selector 1037 */ 1038 panic("xen_set_trap_table(%p): error %d", (void *)table, 1039 -(int)err); 1040 } 1041 return (err); 1042 } 1043 1044 #if defined(__amd64) 1045 void 1046 xen_set_segment_base(int reg, ulong_t value) 1047 { 1048 long err; 1049 1050 if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) { 1051 /* 1052 * X_EFAULT: bad address 1053 * X_EINVAL: bad type 1054 */ 1055 panic("xen_set_segment_base(%d, %lx): error %d", 1056 reg, value, -(int)err); 1057 } 1058 } 1059 #endif /* __amd64 */ 1060 1061 /* 1062 * Translate a hypervisor errcode to a Solaris error code. 1063 */ 1064 int 1065 xen_xlate_errcode(int error) 1066 { 1067 switch (-error) { 1068 1069 /* 1070 * Translate hypervisor errno's into native errno's 1071 */ 1072 1073 #define CASE(num) case X_##num: error = num; break 1074 1075 CASE(EPERM); CASE(ENOENT); CASE(ESRCH); 1076 CASE(EINTR); CASE(EIO); CASE(ENXIO); 1077 CASE(E2BIG); CASE(ENOMEM); CASE(EACCES); 1078 CASE(EFAULT); CASE(EBUSY); CASE(EEXIST); 1079 CASE(ENODEV); CASE(EISDIR); CASE(EINVAL); 1080 CASE(ENOSPC); CASE(ESPIPE); CASE(EROFS); 1081 CASE(ENOSYS); CASE(ENOTEMPTY); CASE(EISCONN); 1082 CASE(ENODATA); 1083 1084 #undef CASE 1085 1086 default: 1087 panic("xen_xlate_errcode: unknown error %d", error); 1088 } 1089 1090 return (error); 1091 } 1092 1093 /* 1094 * Raise PS_IOPL on current vcpu to user level. 1095 * Caller responsible for preventing kernel preemption. 1096 */ 1097 void 1098 xen_enable_user_iopl(void) 1099 { 1100 physdev_set_iopl_t set_iopl; 1101 set_iopl.iopl = 3; /* user ring 3 */ 1102 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1103 } 1104 1105 /* 1106 * Drop PS_IOPL on current vcpu to kernel level 1107 */ 1108 void 1109 xen_disable_user_iopl(void) 1110 { 1111 physdev_set_iopl_t set_iopl; 1112 set_iopl.iopl = 1; /* kernel pseudo ring 1 */ 1113 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1114 } 1115 1116 int 1117 xen_gdt_setprot(cpu_t *cp, uint_t prot) 1118 { 1119 int err; 1120 #if defined(__amd64) 1121 int pt_bits = PT_VALID; 1122 if (prot & PROT_WRITE) 1123 pt_bits |= PT_WRITABLE; 1124 #endif 1125 1126 if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt, 1127 MMU_PAGESIZE, prot)) != 0) 1128 goto done; 1129 1130 #if defined(__amd64) 1131 err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits); 1132 #endif 1133 1134 done: 1135 if (err) { 1136 cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d", 1137 cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only", 1138 err); 1139 } 1140 1141 return (err); 1142 } 1143 1144 int 1145 xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot) 1146 { 1147 int err; 1148 caddr_t lva = (caddr_t)ldt; 1149 #if defined(__amd64) 1150 int pt_bits = PT_VALID; 1151 pgcnt_t npgs; 1152 if (prot & PROT_WRITE) 1153 pt_bits |= PT_WRITABLE; 1154 #endif /* __amd64 */ 1155 1156 if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0) 1157 goto done; 1158 1159 #if defined(__amd64) 1160 1161 ASSERT(IS_P2ALIGNED(lsize, PAGESIZE)); 1162 npgs = mmu_btop(lsize); 1163 while (npgs--) { 1164 if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva), 1165 pt_bits)) != 0) 1166 break; 1167 lva += PAGESIZE; 1168 } 1169 #endif /* __amd64 */ 1170 1171 done: 1172 if (err) { 1173 cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d", 1174 (void *)lva, 1175 (prot & PROT_WRITE) ? "writable" : "read-only", err); 1176 } 1177 1178 return (err); 1179 } 1180 1181 int 1182 xen_get_physinfo(xen_sysctl_physinfo_t *pi) 1183 { 1184 xen_sysctl_t op; 1185 int ret; 1186 1187 bzero(&op, sizeof (op)); 1188 op.cmd = XEN_SYSCTL_physinfo; 1189 op.interface_version = XEN_SYSCTL_INTERFACE_VERSION; 1190 1191 ret = HYPERVISOR_sysctl(&op); 1192 1193 if (ret != 0) 1194 return (ret); 1195 1196 bcopy(&op.u.physinfo, pi, sizeof (op.u.physinfo)); 1197 return (0); 1198 } 1199 1200 int 1201 xen_get_mc_physcpuinfo(xen_mc_logical_cpu_t *log_cpus, uint_t *ncpus) 1202 { 1203 struct xen_mc_physcpuinfo cpi; 1204 1205 cpi.ncpus = *ncpus; 1206 /*LINTED: constant in conditional context*/ 1207 set_xen_guest_handle(cpi.info, log_cpus); 1208 1209 if (HYPERVISOR_mca(XEN_MC_CMD_physcpuinfo, (xen_mc_arg_t *)&cpi) != 1210 XEN_MC_HCALL_SUCCESS) 1211 return (-1); 1212 1213 *ncpus = cpi.ncpus; 1214 return (0); 1215 } 1216 1217 void 1218 print_panic(const char *str) 1219 { 1220 xen_printf(str); 1221 } 1222 1223 /* 1224 * Interfaces to iterate over real cpu information, but only that info 1225 * which we choose to expose here. These are of interest to dom0 1226 * only (and the backing hypercall should not work for domu). 1227 */ 1228 1229 xen_mc_lcpu_cookie_t 1230 xen_physcpu_next(xen_mc_lcpu_cookie_t cookie) 1231 { 1232 xen_mc_logical_cpu_t *xcp = (xen_mc_logical_cpu_t *)cookie; 1233 1234 if (!DOMAIN_IS_INITDOMAIN(xen_info)) 1235 return (NULL); 1236 1237 if (cookie == NULL) 1238 return ((xen_mc_lcpu_cookie_t)xen_phys_cpus); 1239 1240 if (xcp == xen_phys_cpus + xen_phys_ncpus - 1) 1241 return (NULL); 1242 else 1243 return ((xen_mc_lcpu_cookie_t)++xcp); 1244 } 1245 1246 #define COOKIE2XCP(c) ((xen_mc_logical_cpu_t *)(c)) 1247 1248 const char * 1249 xen_physcpu_vendorstr(xen_mc_lcpu_cookie_t cookie) 1250 { 1251 xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie); 1252 1253 return ((const char *)&xcp->mc_vendorid[0]); 1254 } 1255 1256 int 1257 xen_physcpu_family(xen_mc_lcpu_cookie_t cookie) 1258 { 1259 return (COOKIE2XCP(cookie)->mc_family); 1260 } 1261 1262 int 1263 xen_physcpu_model(xen_mc_lcpu_cookie_t cookie) 1264 { 1265 return (COOKIE2XCP(cookie)->mc_model); 1266 } 1267 1268 int 1269 xen_physcpu_stepping(xen_mc_lcpu_cookie_t cookie) 1270 { 1271 return (COOKIE2XCP(cookie)->mc_step); 1272 } 1273 1274 id_t 1275 xen_physcpu_chipid(xen_mc_lcpu_cookie_t cookie) 1276 { 1277 return (COOKIE2XCP(cookie)->mc_chipid); 1278 } 1279 1280 id_t 1281 xen_physcpu_coreid(xen_mc_lcpu_cookie_t cookie) 1282 { 1283 return (COOKIE2XCP(cookie)->mc_coreid); 1284 } 1285 1286 id_t 1287 xen_physcpu_strandid(xen_mc_lcpu_cookie_t cookie) 1288 { 1289 return (COOKIE2XCP(cookie)->mc_threadid); 1290 } 1291 1292 id_t 1293 xen_physcpu_logical_id(xen_mc_lcpu_cookie_t cookie) 1294 { 1295 return (COOKIE2XCP(cookie)->mc_cpunr); 1296 } 1297 1298 boolean_t 1299 xen_physcpu_is_cmt(xen_mc_lcpu_cookie_t cookie) 1300 { 1301 return (COOKIE2XCP(cookie)->mc_nthreads > 1); 1302 } 1303 1304 uint64_t 1305 xen_physcpu_mcg_cap(xen_mc_lcpu_cookie_t cookie) 1306 { 1307 xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie); 1308 1309 /* 1310 * Need to #define the indices, or search through the array. 1311 */ 1312 return (xcp->mc_msrvalues[0].value); 1313 } 1314 1315 int 1316 xen_map_gref(uint_t cmd, gnttab_map_grant_ref_t *mapop, uint_t count, 1317 boolean_t uvaddr) 1318 { 1319 long rc; 1320 1321 ASSERT(cmd == GNTTABOP_map_grant_ref); 1322 rc = HYPERVISOR_grant_table_op(cmd, mapop, count); 1323 1324 #if !defined(_BOOT) 1325 /* 1326 * XXPV -- 1327 * The map_grant_ref call suffers a poor design flaw. 1328 * It's the only hypervisor interface that creates page table mappings 1329 * that doesn't take an entire PTE. Hence we can't create the 1330 * mapping with a particular setting of the software PTE bits, NX, etc. 1331 * 1332 * Until the interface is fixed, we need to minimize the possiblity 1333 * of dtrace or kmdb blowing up on a foreign mapping that doesn't 1334 * have a correct setting for the soft bits. We'll force them here. 1335 */ 1336 if ((rc == 0) && (uvaddr == B_FALSE)) { 1337 extern void xen_fix_foreign(struct hat *, uint64_t); 1338 uint_t i; 1339 for (i = 0; i < count; ++i) { 1340 if (mapop[i].status == GNTST_okay) { 1341 xen_fix_foreign(kas.a_hat, mapop[i].host_addr); 1342 } 1343 } 1344 } 1345 #endif 1346 1347 return (rc); 1348 } 1349