1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* derived from netbsd's xen_machdep.c 1.1.2.1 */ 30 31 /* 32 * 33 * Copyright (c) 2004 Christian Limpach. 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. This section intentionally left blank. 45 * 4. The name of the author may not be used to endorse or promote products 46 * derived from this software without specific prior written permission. 
47 * 48 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 49 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 50 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 51 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 52 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 53 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 54 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 55 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 56 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 57 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 58 */ 59 /* 60 * Section 3 of the above license was updated in response to bug 6379571. 61 */ 62 63 #include <sys/ctype.h> 64 #include <sys/types.h> 65 #include <sys/cmn_err.h> 66 #include <sys/trap.h> 67 #include <sys/segments.h> 68 #include <sys/hypervisor.h> 69 #include <sys/xen_mmu.h> 70 #include <sys/machsystm.h> 71 #include <sys/promif.h> 72 #include <sys/bootconf.h> 73 #include <sys/bootinfo.h> 74 #include <sys/cpr.h> 75 #include <sys/taskq.h> 76 #include <sys/uadmin.h> 77 #include <sys/evtchn_impl.h> 78 #include <sys/archsystm.h> 79 #include <xen/sys/xenbus_impl.h> 80 #include <sys/mach_mmu.h> 81 #include <vm/hat_i86.h> 82 #include <sys/gnttab.h> 83 #include <sys/reboot.h> 84 #include <sys/stack.h> 85 #include <sys/clock.h> 86 #include <sys/bitmap.h> 87 #include <sys/processor.h> 88 #include <sys/xen_errno.h> 89 #include <sys/xpv_panic.h> 90 #include <sys/smp_impldefs.h> 91 #include <sys/cpu.h> 92 #include <sys/balloon_impl.h> 93 #include <sys/ddi.h> 94 95 #ifdef DEBUG 96 #define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf 97 #else 98 #define SUSPEND_DEBUG(...) 
#endif

/* Set to enable verbose CPR (suspend/resume) debug messages. */
int cpr_debug;
/* CPUs found powered off at suspend time; they must not be operated on. */
cpuset_t cpu_suspend_lost_set;
/* Non-zero disables the hypervisor version checks (testing only). */
static int xen_suspend_debug;

/*
 * Determine helpful version information.
 *
 * (And leave copies in the data segment so we can look at them later
 * with e.g. kmdb.)
 */

typedef enum xen_version {
        XENVER_BOOT_IDX,        /* hypervisor we booted under */
        XENVER_CURRENT_IDX      /* hypervisor we are running under now */
} xen_version_t;

struct xenver {
        ulong_t xv_major;                       /* major version number */
        ulong_t xv_minor;                       /* minor version number */
        ulong_t xv_revision;                    /* extracted from xv_ver */
        xen_extraversion_t xv_ver;              /* raw extraversion string */
        ulong_t xv_is_xvm;                      /* 1 if a "-xvm" hypervisor */
        xen_changeset_info_t xv_chgset;         /* changeset identifier */
        xen_compile_info_t xv_build;            /* compiler/build info */
        xen_capabilities_info_t xv_caps;        /* capabilities string */
} xenver[2];

#define XENVER_BOOT(m)          (xenver[XENVER_BOOT_IDX].m)
#define XENVER_CURRENT(m)       (xenver[XENVER_CURRENT_IDX].m)

/*
 * Update the xenver data. We maintain two copies, boot and
 * current. If we are setting the boot, then also set current.
 *
 * All data is fetched directly from the hypervisor via
 * HYPERVISOR_xen_version() sub-commands.
 */
static void
xen_set_version(xen_version_t idx)
{
        ulong_t ver;

        bzero(&xenver[idx], sizeof (xenver[idx]));

        ver = HYPERVISOR_xen_version(XENVER_version, 0);

        /* major is in the high 16 bits, minor in the low 16 bits */
        xenver[idx].xv_major = BITX(ver, 31, 16);
        xenver[idx].xv_minor = BITX(ver, 15, 0);

        (void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver);

        /*
         * The revision is buried in the extraversion information that is
         * maintained by the hypervisor. For our purposes we expect that
         * the revision number is:
         *      - the second character in the extraversion information
         *      - one character long
         *      - numeric digit
         * If it isn't then we can't extract the revision and we leave it
         * set to 0.
         */
        if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1]))
                xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0';
        else
                cmn_err(CE_WARN, "Cannot extract revision on this hypervisor "
                    "version: v%s, unexpected version format",
                    xenver[idx].xv_ver);

        xenver[idx].xv_is_xvm = 0;

        /* A Solaris xVM hypervisor identifies itself by a "-xvm" suffix. */
        if (strlen(xenver[idx].xv_ver) >= 4 &&
            strncmp(xenver[idx].xv_ver + strlen(xenver[idx].xv_ver) - 4,
            "-xvm", 4) == 0)
                xenver[idx].xv_is_xvm = 1;

        (void) HYPERVISOR_xen_version(XENVER_changeset,
            &xenver[idx].xv_chgset);

        (void) HYPERVISOR_xen_version(XENVER_compile_info,
            &xenver[idx].xv_build);
        /*
         * Capabilities are a set of space separated ascii strings
         * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'
         */
        (void) HYPERVISOR_xen_version(XENVER_capabilities,
            &xenver[idx].xv_caps);

        cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major,
            xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset);

        /* Setting the boot version implies setting the current one too. */
        if (idx == XENVER_BOOT_IDX)
                bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX],
                    sizeof (xenver[XENVER_BOOT_IDX]));
}

typedef enum xen_hypervisor_check {
        XEN_RUN_CHECK,          /* check: can we run at all? */
        XEN_SUSPEND_CHECK       /* check: can we suspend/resume? */
} xen_hypervisor_check_t;

/*
 * To run the hypervisor must be 3.0.4 or better. To suspend/resume
 * we need 3.0.4 or better and if it is 3.0.4, then it must be provided
 * by the Solaris xVM project.
 * Checking can be disabled for testing purposes by setting the
 * xen_suspend_debug variable.
204 */ 205 static int 206 xen_hypervisor_supports_solaris(xen_hypervisor_check_t check) 207 { 208 if (xen_suspend_debug == 1) 209 return (1); 210 if (XENVER_CURRENT(xv_major) < 3) 211 return (0); 212 if (XENVER_CURRENT(xv_major) > 3) 213 return (1); 214 if (XENVER_CURRENT(xv_minor) > 0) 215 return (1); 216 if (XENVER_CURRENT(xv_revision) < 4) 217 return (0); 218 if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 && 219 !XENVER_CURRENT(xv_is_xvm)) 220 return (0); 221 222 return (1); 223 } 224 225 /* 226 * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the 227 * workaround. 228 */ 229 static void 230 xen_pte_workaround(void) 231 { 232 #if defined(__amd64) 233 extern int pt_kern; 234 235 if (XENVER_CURRENT(xv_major) != 3) 236 return; 237 if (XENVER_CURRENT(xv_minor) > 1) 238 return; 239 if (XENVER_CURRENT(xv_minor) == 1 && 240 XENVER_CURRENT(xv_revision) > 1) 241 return; 242 if (XENVER_CURRENT(xv_is_xvm)) 243 return; 244 245 pt_kern = PT_USER; 246 #endif 247 } 248 249 void 250 xen_set_callback(void (*func)(void), uint_t type, uint_t flags) 251 { 252 struct callback_register cb; 253 254 bzero(&cb, sizeof (cb)); 255 #if defined(__amd64) 256 cb.address = (ulong_t)func; 257 #elif defined(__i386) 258 cb.address.cs = KCS_SEL; 259 cb.address.eip = (ulong_t)func; 260 #endif 261 cb.type = type; 262 cb.flags = flags; 263 264 /* 265 * XXPV always ignore return value for NMI 266 */ 267 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 && 268 type != CALLBACKTYPE_nmi) 269 panic("HYPERVISOR_callback_op failed"); 270 } 271 272 void 273 xen_init_callbacks(void) 274 { 275 /* 276 * register event (interrupt) handler. 277 */ 278 xen_set_callback(xen_callback, CALLBACKTYPE_event, 0); 279 280 /* 281 * failsafe handler. 282 */ 283 xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe, 284 CALLBACKF_mask_events); 285 286 /* 287 * NMI handler. 
288 */ 289 xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0); 290 291 /* 292 * system call handler 293 * XXPV move to init_cpu_syscall? 294 */ 295 #if defined(__amd64) 296 xen_set_callback(sys_syscall, CALLBACKTYPE_syscall, 297 CALLBACKF_mask_events); 298 #endif /* __amd64 */ 299 } 300 301 302 /* 303 * cmn_err() followed by a 1/4 second delay; this gives the 304 * logging service a chance to flush messages and helps avoid 305 * intermixing output from prom_printf(). 306 * XXPV: doesn't exactly help us on UP though. 307 */ 308 /*PRINTFLIKE2*/ 309 void 310 cpr_err(int ce, const char *fmt, ...) 311 { 312 va_list adx; 313 314 va_start(adx, fmt); 315 vcmn_err(ce, fmt, adx); 316 va_end(adx); 317 drv_usecwait(MICROSEC >> 2); 318 } 319 320 void 321 xen_suspend_devices(void) 322 { 323 int rc; 324 325 SUSPEND_DEBUG("xen_suspend_devices\n"); 326 327 if ((rc = cpr_suspend_devices(ddi_root_node())) != 0) 328 panic("failed to suspend devices: %d", rc); 329 } 330 331 void 332 xen_resume_devices(void) 333 { 334 int rc; 335 336 SUSPEND_DEBUG("xen_resume_devices\n"); 337 338 if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0) 339 panic("failed to resume devices: %d", rc); 340 } 341 342 /* 343 * The list of mfn pages is out of date. Recompute it. 
 */
static void
rebuild_mfn_list(void)
{
        int i = 0;
        size_t sz;
        size_t off;
        pfn_t pfn;

        SUSPEND_DEBUG("rebuild_mfn_list\n");

        /* Round the mfn list size up to a whole number of pages. */
        sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;

        for (off = 0; off < sz; off += MMU_PAGESIZE) {
                size_t j = mmu_btop(off);
                /* On each new page of mfn_list_pages, record its own mfn. */
                if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
                        pfn = hat_getpfnum(kas.a_hat,
                            (caddr_t)&mfn_list_pages[j]);
                        mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
                }

                pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
                mfn_list_pages[j] = pfn_to_mfn(pfn);
        }

        /* Publish the top-level frame list to the hypervisor. */
        pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
        HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
            = pfn_to_mfn(pfn);
}

/*
 * Park all CPUs other than CPU 0 prior to suspending the domain.
 * CPUs already recorded in cpu_suspend_lost_set are left alone.
 */
static void
suspend_cpus(void)
{
        int i;

        SUSPEND_DEBUG("suspend_cpus\n");

        mp_enter_barrier();

        for (i = 1; i < ncpus; i++) {
                if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
                        SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
                        (void) xen_vcpu_down(i);
                }

                mach_cpucontext_reset(cpu[i]);
        }
}

/*
 * Bring the non-boot CPUs back after resume; inverse of suspend_cpus().
 */
static void
resume_cpus(void)
{
        int i;

        for (i = 1; i < ncpus; i++) {
                if (cpu[i] == NULL)
                        continue;

                if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
                        SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
                        mach_cpucontext_restore(cpu[i]);
                        (void) xen_vcpu_up(i);
                }
        }

        mp_leave_barrier();
}

/*
 * Top level routine to direct suspend/resume of a domain.
 */
void
xen_suspend_domain(void)
{
        extern void rtcsync(void);
        extern hrtime_t hres_last_tick;
        mfn_t start_info_mfn;
        ulong_t flags;
        pfn_t pfn;
        int i;

        /*
         * Check that we are happy to suspend on this hypervisor.
         */
        if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) {
                cpr_err(CE_WARN, "Cannot suspend on this hypervisor "
                    "version: v%lu.%lu%s, need at least version v3.0.4 or "
                    "-xvm based hypervisor", XENVER_CURRENT(xv_major),
                    XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver));
                return;
        }

        /*
         * XXPV - Are we definitely OK to suspend by the time we've connected
         * the handler?
         */

        cpr_err(CE_NOTE, "Domain suspending for save/migrate");

        SUSPEND_DEBUG("xen_suspend_domain\n");

        /*
         * suspend interrupts and devices
         * XXPV - we use suspend/resume for both save/restore domains (like sun
         * cpr) and for migration.  Would be nice to know the difference if
         * possible.  For save/restore where down time may be a long time, we
         * may want to do more of the things that cpr does. (i.e. notify user
         * processes, shrink memory footprint for faster restore, etc.)
         */
        xen_suspend_devices();
        SUSPEND_DEBUG("xenbus_suspend\n");
        xenbus_suspend();

        /* The hypervisor needs the mfn of the start_info page on resume. */
        pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
        start_info_mfn = pfn_to_mfn(pfn);

        /*
         * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
         * wrt xenbus being suspended here?
         */
        mutex_enter(&cpu_lock);

        /*
         * Suspend must be done on vcpu 0, as no context for other CPUs is
         * saved.
         *
         * XXPV - add to taskq API ?
         */
        thread_affinity_set(curthread, 0);
        kpreempt_disable();

        SUSPEND_DEBUG("xen_start_migrate\n");
        xen_start_migrate();
        if (ncpus > 1)
                suspend_cpus();

        /*
         * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
         * any holder would have dropped it to get through suspend_cpus().
         */
        mutex_enter(&ec_lock);

        /*
         * From here on in, we can't take locks.
         */
        SUSPEND_DEBUG("ec_suspend\n");
        ec_suspend();
        SUSPEND_DEBUG("gnttab_suspend\n");
        gnttab_suspend();

        flags = intr_clear();

        xpv_time_suspend();

        /*
         * Currently, the hypervisor incorrectly fails to bring back
         * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
         * to prevent any attempts to operate on them.  But we have to do this
         * *after* the very first time we do ec_suspend().
         */
        for (i = 1; i < ncpus; i++) {
                if (cpu[i] == NULL)
                        continue;

                if (cpu_get_state(cpu[i]) == P_POWEROFF)
                        CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
        }

        /*
         * The dom0 save/migrate code doesn't automatically translate
         * these into PFNs, but expects them to be, so we do it here.
         * We don't use mfn_to_pfn() because so many OS services have
         * been disabled at this point.
         */
        xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
        xen_info->console.domU.mfn =
            mfn_to_pfn_mapping[xen_info->console.domU.mfn];

        /* Upcalls must be masked here; crash the domain if they are not. */
        if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
                prom_printf("xen_suspend_domain(): "
                    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
                (void) HYPERVISOR_shutdown(SHUTDOWN_crash);
        }

        /* Unmap shared_info; its mfn will change across save/restore. */
        if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
            0, UVMF_INVLPG)) {
                prom_printf("xen_suspend_domain(): "
                    "HYPERVISOR_update_va_mapping() failed\n");
                (void) HYPERVISOR_shutdown(SHUTDOWN_crash);
        }

        SUSPEND_DEBUG("HYPERVISOR_suspend\n");

        /*
         * At this point we suspend and sometime later resume.
         */
        if (HYPERVISOR_suspend(start_info_mfn)) {
                prom_printf("xen_suspend_domain(): "
                    "HYPERVISOR_suspend() failed\n");
                (void) HYPERVISOR_shutdown(SHUTDOWN_crash);
        }

        /*
         * Point HYPERVISOR_shared_info to its new value.
         */
        if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
            xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
            UVMF_INVLPG))
                (void) HYPERVISOR_shutdown(SHUTDOWN_crash);

        /* We cannot cope with the domain's memory size having changed. */
        if (xen_info->nr_pages != mfn_count) {
                prom_printf("xen_suspend_domain(): number of pages"
                    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
                    xen_info->nr_pages);
                (void) HYPERVISOR_shutdown(SHUTDOWN_crash);
        }

        xpv_time_resume();

        /* Cached mfn limit is stale after migration; force recomputation. */
        cached_max_mfn = 0;

        SUSPEND_DEBUG("gnttab_resume\n");
        gnttab_resume();

        /* XXPV: add a note that this must be lockless. */
        SUSPEND_DEBUG("ec_resume\n");
        ec_resume();

        intr_restore(flags);

        if (ncpus > 1)
                resume_cpus();

        mutex_exit(&ec_lock);
        xen_end_migrate();
        mutex_exit(&cpu_lock);

        /*
         * Now we can take locks again.
         */

        /*
         * Force the tick value used for tv_nsec in hres_tick() to be up to
         * date. rtcsync() will reset the hrestime value appropriately.
         */
        hres_last_tick = xpv_gethrtime();

        /*
         * XXPV: we need to have resumed the CPUs since this takes locks, but
         * can remote CPUs see bad state? Presumably yes. Should probably nest
         * taking of todlock inside of cpu_lock, or vice versa, then provide an
         * unlocked version.  Probably need to call clkinitf to reset cpu freq
         * and re-calibrate if we migrated to a different speed cpu.  Also need
         * to make a (re)init_cpu_info call to update processor info structs
         * and device tree info.  That remains to be written at the moment.
         */
        rtcsync();

        rebuild_mfn_list();

        SUSPEND_DEBUG("xenbus_resume\n");
        xenbus_resume();
        SUSPEND_DEBUG("xenbus_resume_devices\n");
        xen_resume_devices();

        thread_affinity_clear(curthread);
        kpreempt_enable();

        SUSPEND_DEBUG("finished xen_suspend_domain\n");

        /*
         * We have restarted our suspended domain, update the hypervisor
         * details. NB: This must be done at the end of this function,
         * since we need the domain to be completely resumed before
         * these functions will work correctly.
         */
        xen_set_version(XENVER_CURRENT_IDX);

        /*
         * We can check and report a warning, but we don't stop the
         * process.
         */
        if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0)
                cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
                    "but need at least version v3.0.4",
                    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
                    XENVER_CURRENT(xv_ver));

        cmn_err(CE_NOTE, "domain restore/migrate completed");
}

/*
 * Handle an externally-requested debug event. Either enter the debugger
 * or, if kmdb is not loaded, dump the event channel state to the console.
 */
/*ARGSUSED*/
int
xen_debug_handler(void *arg)
{
        debug_enter("External debug event received");

        /*
         * If we've not got KMDB loaded, output some stuff difficult to capture
         * from a domain core.
         */
        if (!(boothowto & RB_DEBUG)) {
                shared_info_t *si = HYPERVISOR_shared_info;
                int i;

                prom_printf("evtchn_pending [ ");
                for (i = 0; i < 8; i++)
                        prom_printf("%lx ", si->evtchn_pending[i]);
                prom_printf("]\nevtchn_mask [ ");
                for (i = 0; i < 8; i++)
                        prom_printf("%lx ", si->evtchn_mask[i]);
                prom_printf("]\n");

                for (i = 0; i < ncpus; i++) {
                        vcpu_info_t *vcpu = &si->vcpu_info[i];
                        if (cpu[i] == NULL)
                                continue;
                        prom_printf("CPU%d pending %d mask %d sel %lx\n",
                            i, vcpu->evtchn_upcall_pending,
                            vcpu->evtchn_upcall_mask,
                            vcpu->evtchn_pending_sel);
                }
        }

        return (0);
}

/*
 * xenbus watch callback: a sysrq key was written to control/sysrq.
 * Read and clear the key within one transaction, retrying on EAGAIN.
 */
/*ARGSUSED*/
static void
xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
    unsigned int len)
{
        xenbus_transaction_t xbt;
        char key = '\0';
        int ret;

retry:
        if (xenbus_transaction_start(&xbt)) {
                cmn_err(CE_WARN, "failed to start sysrq transaction");
                return;
        }

        if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
                /*
                 * ENOENT happens in response to our own xenbus_rm.
                 * XXPV - this happens spuriously on boot?
                 */
                if (ret != ENOENT)
                        cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
                goto out;
        }

        if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
                cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
                goto out;
        }

        if (xenbus_transaction_end(xbt, 0) == EAGAIN)
                goto retry;

        /*
         * Somewhat arbitrary - on Linux this means 'reboot'. We could just
         * accept any key, but this might increase the risk of sending a
         * harmless sysrq to the wrong domain...
         */
        if (key == 'b')
                (void) xen_debug_handler(NULL);
        else
                cmn_err(CE_WARN, "Ignored sysrq %c", key);
        return;

out:
        (void) xenbus_transaction_end(xbt, 1);
}

/* Single-threaded taskq used to run externally requested shutdowns. */
taskq_t *xen_shutdown_tq;

#define SHUTDOWN_INVALID        -1
#define SHUTDOWN_POWEROFF       0
#define SHUTDOWN_REBOOT         1
#define SHUTDOWN_SUSPEND        2
#define SHUTDOWN_HALT           3
#define SHUTDOWN_MAX            4

#define SHUTDOWN_TIMEOUT_SECS   (60 * 5)

/* Indexed by the SHUTDOWN_* codes above. */
static const char *cmd_strings[SHUTDOWN_MAX] = {
        "poweroff",
        "reboot",
        "suspend",
        "halt"
};

/*
 * Timeout handler: the graceful shutdown requested from the control
 * domain did not complete in time, so force it from the kernel.
 */
static void
xen_dirty_shutdown(void *arg)
{
        int cmd = (uintptr_t)arg;

        cmn_err(CE_WARN, "Externally requested shutdown failed or "
            "timed out.\nShutting down.\n");

        switch (cmd) {
        case SHUTDOWN_HALT:
        case SHUTDOWN_POWEROFF:
                (void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
                break;
        case SHUTDOWN_REBOOT:
                (void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
                break;
        }
}

/*
 * Taskq callback carrying out an externally requested shutdown.
 * Suspend is handled directly; the other commands are forwarded to
 * userland via a sysevent, with a dirty-shutdown timeout as backstop.
 */
static void
xen_shutdown(void *arg)
{
        nvlist_t *attr_list = NULL;
        sysevent_t *event = NULL;
        sysevent_id_t eid;
        int cmd = (uintptr_t)arg;
        int err;

        ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);

        if (cmd == SHUTDOWN_SUSPEND) {
                xen_suspend_domain();
                return;
        }

        err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP);
        if (err != DDI_SUCCESS)
                goto failure;

        err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]);
        if (err != DDI_SUCCESS)
                goto failure;

        if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv",
            SE_SLEEP)) == NULL)
                goto failure;
        (void) sysevent_attach_attributes(event,
            (sysevent_attr_list_t *)attr_list);

        err = log_sysevent(event, SE_SLEEP, &eid);

        sysevent_detach_attributes(event);
        sysevent_free(event);

        if (err != 0)
                goto failure;

        /* If userland doesn't act within the timeout, shut down anyway. */
        (void) timeout(xen_dirty_shutdown, arg,
            SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));

        nvlist_free(attr_list);
        return;

failure:
        if (attr_list != NULL)
                nvlist_free(attr_list);
        xen_dirty_shutdown(arg);
}

/*
 * xenbus watch callback: the control domain wrote a shutdown request
 * into control/shutdown.  Parse it, clear the node, and dispatch the
 * request to the shutdown taskq.
 */
/*ARGSUSED*/
static void
xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
    unsigned int len)
{
        char *str;
        xenbus_transaction_t xbt;
        int err, shutdown_code = SHUTDOWN_INVALID;
        unsigned int slen;

again:
        err = xenbus_transaction_start(&xbt);
        if (err)
                return;
        if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
                (void) xenbus_transaction_end(xbt, 1);
                return;
        }

        SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);

        /*
         * If this is a watch fired from our write below, check out early to
         * avoid an infinite loop.
         */
        if (strcmp(str, "") == 0) {
                (void) xenbus_transaction_end(xbt, 0);
                kmem_free(str, slen);
                return;
        } else if (strcmp(str, "poweroff") == 0) {
                shutdown_code = SHUTDOWN_POWEROFF;
        } else if (strcmp(str, "reboot") == 0) {
                shutdown_code = SHUTDOWN_REBOOT;
        } else if (strcmp(str, "suspend") == 0) {
                shutdown_code = SHUTDOWN_SUSPEND;
        } else if (strcmp(str, "halt") == 0) {
                shutdown_code = SHUTDOWN_HALT;
        } else {
                printf("Ignoring shutdown request: %s\n", str);
        }

        /*
         * XXPV Should we check the value of xenbus_write() too, or are all
         * errors automatically folded into xenbus_transaction_end() ??
         */
        (void) xenbus_write(xbt, "control", "shutdown", "");
        err = xenbus_transaction_end(xbt, 0);
        if (err == EAGAIN) {
                SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
                kmem_free(str, slen);
                goto again;
        }

        kmem_free(str, slen);
        if (shutdown_code != SHUTDOWN_INVALID) {
                (void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
                    (void *)(intptr_t)shutdown_code, 0);
        }
}

static struct xenbus_watch shutdown_watch;
static struct xenbus_watch sysrq_watch;

/*
 * Late-boot xen setup: for domU, create the shutdown taskq and register
 * the control/shutdown and control/sysrq watches; always initialize the
 * balloon driver.
 */
void
xen_late_startup(void)
{
        if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
                xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
                    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
                shutdown_watch.node = "control/shutdown";
                shutdown_watch.callback = xen_shutdown_handler;
                if (register_xenbus_watch(&shutdown_watch))
                        cmn_err(CE_WARN, "Failed to set shutdown watcher");

                sysrq_watch.node = "control/sysrq";
                sysrq_watch.callback = xen_sysrq_handler;
                if (register_xenbus_watch(&sysrq_watch))
                        cmn_err(CE_WARN, "Failed to set sysrq watcher");
        }
        balloon_init(xen_info->nr_pages);
}

#ifdef DEBUG
#define XEN_PRINTF_BUFSIZE      1024

char xen_printf_buffer[XEN_PRINTF_BUFSIZE];

/*
 * Printf function that calls hypervisor directly.
For DomU it only 895 * works when running on a xen hypervisor built with debug on. Works 896 * always since no I/O ring interaction is needed. 897 */ 898 /*PRINTFLIKE1*/ 899 void 900 xen_printf(const char *fmt, ...) 901 { 902 va_list ap; 903 904 va_start(ap, fmt); 905 (void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap); 906 va_end(ap); 907 908 (void) HYPERVISOR_console_io(CONSOLEIO_write, 909 strlen(xen_printf_buffer), xen_printf_buffer); 910 } 911 #else 912 void 913 xen_printf(const char *fmt, ...) 914 { 915 } 916 #endif /* DEBUG */ 917 918 void 919 startup_xen_version(void) 920 { 921 xen_set_version(XENVER_BOOT_IDX); 922 if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0) 923 cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " 924 "but need at least version v3.0.4", 925 XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), 926 XENVER_CURRENT(xv_ver)); 927 xen_pte_workaround(); 928 } 929 930 /* 931 * Miscellaneous hypercall wrappers with slightly more verbose diagnostics. 
932 */ 933 934 void 935 xen_set_gdt(ulong_t *frame_list, int entries) 936 { 937 int err; 938 if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) { 939 /* 940 * X_EINVAL: reserved entry or bad frames 941 * X_EFAULT: bad address 942 */ 943 panic("xen_set_gdt(%p, %d): error %d", 944 (void *)frame_list, entries, -(int)err); 945 } 946 } 947 948 void 949 xen_set_ldt(user_desc_t *ldt, uint_t nsels) 950 { 951 struct mmuext_op op; 952 long err; 953 954 op.cmd = MMUEXT_SET_LDT; 955 op.arg1.linear_addr = (uintptr_t)ldt; 956 op.arg2.nr_ents = nsels; 957 958 if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) { 959 panic("xen_set_ldt(%p, %d): error %d", 960 (void *)ldt, nsels, -(int)err); 961 } 962 } 963 964 void 965 xen_stack_switch(ulong_t ss, ulong_t esp) 966 { 967 long err; 968 969 if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) { 970 /* 971 * X_EPERM: bad selector 972 */ 973 panic("xen_stack_switch(%lx, %lx): error %d", ss, esp, 974 -(int)err); 975 } 976 } 977 978 long 979 xen_set_trap_table(trap_info_t *table) 980 { 981 long err; 982 983 if ((err = HYPERVISOR_set_trap_table(table)) != 0) { 984 /* 985 * X_EFAULT: bad address 986 * X_EPERM: bad selector 987 */ 988 panic("xen_set_trap_table(%p): error %d", (void *)table, 989 -(int)err); 990 } 991 return (err); 992 } 993 994 #if defined(__amd64) 995 void 996 xen_set_segment_base(int reg, ulong_t value) 997 { 998 long err; 999 1000 if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) { 1001 /* 1002 * X_EFAULT: bad address 1003 * X_EINVAL: bad type 1004 */ 1005 panic("xen_set_segment_base(%d, %lx): error %d", 1006 reg, value, -(int)err); 1007 } 1008 } 1009 #endif /* __amd64 */ 1010 1011 /* 1012 * Translate a hypervisor errcode to a Solaris error code. 
 */
int
xen_xlate_errcode(int error)
{
        /* Hypervisor errors arrive negated; switch on the positive value. */
        switch (-error) {

        /*
         * Translate hypervisor errno's into native errno's
         */

#define CASE(num)       case X_##num: error = num; break

        CASE(EPERM);    CASE(ENOENT);   CASE(ESRCH);
        CASE(EINTR);    CASE(EIO);      CASE(ENXIO);
        CASE(E2BIG);    CASE(ENOMEM);   CASE(EACCES);
        CASE(EFAULT);   CASE(EBUSY);    CASE(EEXIST);
        CASE(ENODEV);   CASE(EISDIR);   CASE(EINVAL);
        CASE(ENOSPC);   CASE(ESPIPE);   CASE(EROFS);
        CASE(ENOSYS);   CASE(ENOTEMPTY); CASE(EISCONN);
        CASE(ENODATA);

#undef CASE

        default:
                panic("xen_xlate_errcode: unknown error %d", error);
        }

        return (error);
}

/*
 * Raise PS_IOPL on current vcpu to user level.
 * Caller responsible for preventing kernel preemption.
 */
void
xen_enable_user_iopl(void)
{
        physdev_set_iopl_t set_iopl;
        set_iopl.iopl = 3;      /* user ring 3 */
        (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

/*
 * Drop PS_IOPL on current vcpu to kernel level
 */
void
xen_disable_user_iopl(void)
{
        physdev_set_iopl_t set_iopl;
        set_iopl.iopl = 1;      /* kernel pseudo ring 1 */
        (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

/*
 * Change protections on a CPU's GDT page, both in the kernel address
 * space and (on amd64) in the hypervisor's view of the page.
 * Returns 0 on success, otherwise an error code (also warned about).
 */
int
xen_gdt_setprot(cpu_t *cp, uint_t prot)
{
        int err;
#if defined(__amd64)
        int pt_bits = PT_VALID;
        if (prot & PROT_WRITE)
                pt_bits |= PT_WRITABLE;
#endif

        if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
            MMU_PAGESIZE, prot)) != 0)
                goto done;

#if defined(__amd64)
        err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
#endif

done:
        if (err) {
                cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
                    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
                    err);
        }

        return (err);
}

/*
 * Change protections on an LDT of lsize bytes (must be page aligned),
 * page by page on amd64 so the hypervisor's mappings stay consistent.
 * Returns 0 on success, otherwise an error code (also warned about).
 */
int
xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
{
        int err;
        caddr_t lva = (caddr_t)ldt;
#if defined(__amd64)
        int pt_bits = PT_VALID;
        pgcnt_t npgs;
        if (prot & PROT_WRITE)
                pt_bits |= PT_WRITABLE;
#endif  /* __amd64 */

        if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
                goto done;

#if defined(__amd64)

        ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
        npgs = mmu_btop(lsize);
        while (npgs--) {
                if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
                    pt_bits)) != 0)
                        break;
                lva += PAGESIZE;
        }
#endif  /* __amd64 */

done:
        if (err) {
                /* lva is the page that failed, not necessarily the base */
                cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
                    (void *)lva,
                    (prot & PROT_WRITE) ? "writable" : "read-only", err);
        }

        return (err);
}