1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* derived from netbsd's xen_machdep.c 1.1.2.1 */ 30 31 /* 32 * 33 * Copyright (c) 2004 Christian Limpach. 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. This section intentionally left blank. 45 * 4. The name of the author may not be used to endorse or promote products 46 * derived from this software without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 49 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 50 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 51 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 52 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 53 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 54 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 55 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 56 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 57 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 58 */ 59 /* 60 * Section 3 of the above license was updated in response to bug 6379571. 61 */ 62 63 #include <sys/ctype.h> 64 #include <sys/types.h> 65 #include <sys/cmn_err.h> 66 #include <sys/trap.h> 67 #include <sys/segments.h> 68 #include <sys/hypervisor.h> 69 #include <sys/xen_mmu.h> 70 #include <sys/machsystm.h> 71 #include <sys/promif.h> 72 #include <sys/bootconf.h> 73 #include <sys/bootinfo.h> 74 #include <sys/cpr.h> 75 #include <sys/taskq.h> 76 #include <sys/uadmin.h> 77 #include <sys/evtchn_impl.h> 78 #include <sys/archsystm.h> 79 #include <xen/sys/xenbus_impl.h> 80 #include <sys/mach_mmu.h> 81 #include <vm/hat_i86.h> 82 #include <sys/gnttab.h> 83 #include <sys/reboot.h> 84 #include <sys/stack.h> 85 #include <sys/clock.h> 86 #include <sys/bitmap.h> 87 #include <sys/processor.h> 88 #include <sys/xen_errno.h> 89 #include <sys/xpv_panic.h> 90 #include <sys/smp_impldefs.h> 91 #include <sys/cpu.h> 92 #include <sys/balloon_impl.h> 93 #include <sys/ddi.h> 94 95 #ifdef DEBUG 96 #define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf 97 #else 98 #define SUSPEND_DEBUG(...) 99 #endif 100 101 int cpr_debug; 102 cpuset_t cpu_suspend_lost_set; 103 static int xen_suspend_debug; 104 105 /* 106 * Determine helpful version information. 107 * 108 * (And leave copies in the data segment so we can look at them later 109 * with e.g. kmdb.) 110 */ 111 112 typedef enum xen_version { 113 XENVER_BOOT_IDX, 114 XENVER_CURRENT_IDX 115 } xen_version_t; 116 117 struct xenver { 118 ulong_t xv_major; 119 ulong_t xv_minor; 120 ulong_t xv_revision; 121 xen_extraversion_t xv_ver; 122 xen_changeset_info_t xv_chgset; 123 xen_compile_info_t xv_build; 124 xen_capabilities_info_t xv_caps; 125 } xenver[2]; 126 127 #define XENVER_BOOT(m) (xenver[XENVER_BOOT_IDX].m) 128 #define XENVER_CURRENT(m) (xenver[XENVER_CURRENT_IDX].m) 129 130 /* 131 * Update the xenver data. We maintain two copies, boot and 132 * current. If we are setting the boot, then also set current. 133 */ 134 static void 135 xen_set_version(xen_version_t idx) 136 { 137 ulong_t ver; 138 139 bzero(&xenver[idx], sizeof (xenver[idx])); 140 141 ver = HYPERVISOR_xen_version(XENVER_version, 0); 142 143 xenver[idx].xv_major = BITX(ver, 31, 16); 144 xenver[idx].xv_minor = BITX(ver, 15, 0); 145 146 (void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver); 147 148 /* 149 * The revision is buried in the extraversion information that is 150 * maintained by the hypervisor. For our purposes we expect that 151 * the revision number is: 152 * - the second character in the extraversion information 153 * - one character long 154 * - numeric digit 155 * If it isn't then we can't extract the revision and we leave it 156 * set to 0. 157 */ 158 if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1])) 159 xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0'; 160 else 161 cmn_err(CE_WARN, "Cannot extract revision on this hypervisor " 162 "version: v%s, unexpected version format", 163 xenver[idx].xv_ver); 164 165 (void) HYPERVISOR_xen_version(XENVER_changeset, 166 &xenver[idx].xv_chgset); 167 168 (void) HYPERVISOR_xen_version(XENVER_compile_info, 169 &xenver[idx].xv_build); 170 /* 171 * Capabilities are a set of space separated ascii strings 172 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64' 173 */ 174 (void) HYPERVISOR_xen_version(XENVER_capabilities, 175 &xenver[idx].xv_caps); 176 177 cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major, 178 xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset); 179 180 if (idx == XENVER_BOOT_IDX) 181 bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX], 182 sizeof (xenver[XENVER_BOOT_IDX])); 183 } 184 185 typedef enum xen_hypervisor_check { 186 XEN_RUN_CHECK, 187 XEN_SUSPEND_CHECK 188 } xen_hypervisor_check_t; 189 190 /* 191 * To run the hypervisor must be 3.0.4 or better. To suspend/resume 192 * we need 3.0.4 or better and if it is 3.0.4. then it must be provided 193 * by the Solaris xVM project. 194 * Checking can be disabled for testing purposes by setting the 195 * xen_suspend_debug variable. 196 */ 197 static int 198 xen_hypervisor_supports_solaris(xen_hypervisor_check_t check) 199 { 200 if (xen_suspend_debug == 1) 201 return (1); 202 if (XENVER_CURRENT(xv_major) < 3) 203 return (0); 204 if (XENVER_CURRENT(xv_major) > 3) 205 return (1); 206 if (XENVER_CURRENT(xv_minor) > 0) 207 return (1); 208 if (XENVER_CURRENT(xv_revision) < 4) 209 return (0); 210 if (XENVER_CURRENT(xv_revision) == 4 && check == XEN_SUSPEND_CHECK) { 211 if (strlen(XENVER_CURRENT(xv_ver)) < 4) 212 return (0); 213 if (strncmp(XENVER_CURRENT(xv_ver) + 214 strlen(XENVER_CURRENT(xv_ver)) - 4, "-xvm", 4)) 215 return (0); 216 } 217 return (1); 218 } 219 220 void 221 xen_set_callback(void (*func)(void), uint_t type, uint_t flags) 222 { 223 struct callback_register cb; 224 225 bzero(&cb, sizeof (cb)); 226 #if defined(__amd64) 227 cb.address = (ulong_t)func; 228 #elif defined(__i386) 229 cb.address.cs = KCS_SEL; 230 cb.address.eip = (ulong_t)func; 231 #endif 232 cb.type = type; 233 cb.flags = flags; 234 235 /* 236 * XXPV always ignore return value for NMI 237 */ 238 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 && 239 type != CALLBACKTYPE_nmi) 240 panic("HYPERVISOR_callback_op failed"); 241 } 242 243 void 244 xen_init_callbacks(void) 245 { 246 /* 247 * register event (interrupt) handler. 248 */ 249 xen_set_callback(xen_callback, CALLBACKTYPE_event, 0); 250 251 /* 252 * failsafe handler. 253 */ 254 xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe, 255 CALLBACKF_mask_events); 256 257 /* 258 * NMI handler. 259 */ 260 xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0); 261 262 /* 263 * system call handler 264 * XXPV move to init_cpu_syscall? 265 */ 266 #if defined(__amd64) 267 xen_set_callback(sys_syscall, CALLBACKTYPE_syscall, 268 CALLBACKF_mask_events); 269 #endif /* __amd64 */ 270 } 271 272 273 /* 274 * cmn_err() followed by a 1/4 second delay; this gives the 275 * logging service a chance to flush messages and helps avoid 276 * intermixing output from prom_printf(). 277 * XXPV: doesn't exactly help us on UP though. 278 */ 279 /*PRINTFLIKE2*/ 280 void 281 cpr_err(int ce, const char *fmt, ...) 282 { 283 va_list adx; 284 285 va_start(adx, fmt); 286 vcmn_err(ce, fmt, adx); 287 va_end(adx); 288 drv_usecwait(MICROSEC >> 2); 289 } 290 291 void 292 xen_suspend_devices(void) 293 { 294 int rc; 295 296 SUSPEND_DEBUG("xen_suspend_devices\n"); 297 298 if ((rc = cpr_suspend_devices(ddi_root_node())) != 0) 299 panic("failed to suspend devices: %d", rc); 300 } 301 302 void 303 xen_resume_devices(void) 304 { 305 int rc; 306 307 SUSPEND_DEBUG("xen_resume_devices\n"); 308 309 if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0) 310 panic("failed to resume devices: %d", rc); 311 } 312 313 /* 314 * The list of mfn pages is out of date. Recompute it. 315 */ 316 static void 317 rebuild_mfn_list(void) 318 { 319 int i = 0; 320 size_t sz; 321 size_t off; 322 pfn_t pfn; 323 324 SUSPEND_DEBUG("rebuild_mfn_list\n"); 325 326 sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK; 327 328 for (off = 0; off < sz; off += MMU_PAGESIZE) { 329 size_t j = mmu_btop(off); 330 if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) { 331 pfn = hat_getpfnum(kas.a_hat, 332 (caddr_t)&mfn_list_pages[j]); 333 mfn_list_pages_page[i++] = pfn_to_mfn(pfn); 334 } 335 336 pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off); 337 mfn_list_pages[j] = pfn_to_mfn(pfn); 338 } 339 340 pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page); 341 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list 342 = pfn_to_mfn(pfn); 343 } 344 345 static void 346 suspend_cpus(void) 347 { 348 int i; 349 350 SUSPEND_DEBUG("suspend_cpus\n"); 351 352 mp_enter_barrier(); 353 354 for (i = 1; i < ncpus; i++) { 355 if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { 356 SUSPEND_DEBUG("xen_vcpu_down %d\n", i); 357 (void) xen_vcpu_down(i); 358 } 359 360 mach_cpucontext_reset(cpu[i]); 361 } 362 } 363 364 static void 365 resume_cpus(void) 366 { 367 int i; 368 369 for (i = 1; i < ncpus; i++) { 370 if (cpu[i] == NULL) 371 continue; 372 373 if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { 374 SUSPEND_DEBUG("xen_vcpu_up %d\n", i); 375 mach_cpucontext_restore(cpu[i]); 376 (void) xen_vcpu_up(i); 377 } 378 } 379 380 mp_leave_barrier(); 381 } 382 383 /* 384 * Top level routine to direct suspend/resume of a domain. 385 */ 386 void 387 xen_suspend_domain(void) 388 { 389 extern void rtcsync(void); 390 extern hrtime_t hres_last_tick; 391 mfn_t start_info_mfn; 392 ulong_t flags; 393 pfn_t pfn; 394 int i; 395 396 /* 397 * Check that we are happy to suspend on this hypervisor. 398 */ 399 if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) { 400 cpr_err(CE_WARN, "Cannot suspend on this hypervisor " 401 "version: v%lu.%lu%s, need at least version v3.0.4 or " 402 "-xvm based hypervisor", XENVER_CURRENT(xv_major), 403 XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver)); 404 return; 405 } 406 407 /* 408 * XXPV - Are we definitely OK to suspend by the time we've connected 409 * the handler? 410 */ 411 412 cpr_err(CE_NOTE, "Domain suspending for save/migrate"); 413 414 SUSPEND_DEBUG("xen_suspend_domain\n"); 415 416 /* 417 * suspend interrupts and devices 418 * XXPV - we use suspend/resume for both save/restore domains (like sun 419 * cpr) and for migration. Would be nice to know the difference if 420 * possible. For save/restore where down time may be a long time, we 421 * may want to do more of the things that cpr does. (i.e. notify user 422 * processes, shrink memory footprint for faster restore, etc.) 423 */ 424 xen_suspend_devices(); 425 SUSPEND_DEBUG("xenbus_suspend\n"); 426 xenbus_suspend(); 427 428 pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info); 429 start_info_mfn = pfn_to_mfn(pfn); 430 431 /* 432 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe 433 * wrt xenbus being suspended here? 434 */ 435 mutex_enter(&cpu_lock); 436 437 /* 438 * Suspend must be done on vcpu 0, as no context for other CPUs is 439 * saved. 440 * 441 * XXPV - add to taskq API ? 442 */ 443 thread_affinity_set(curthread, 0); 444 kpreempt_disable(); 445 446 SUSPEND_DEBUG("xen_start_migrate\n"); 447 xen_start_migrate(); 448 if (ncpus > 1) 449 suspend_cpus(); 450 451 /* 452 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence 453 * any holder would have dropped it to get through suspend_cpus(). 454 */ 455 mutex_enter(&ec_lock); 456 457 /* 458 * From here on in, we can't take locks. 459 */ 460 SUSPEND_DEBUG("ec_suspend\n"); 461 ec_suspend(); 462 SUSPEND_DEBUG("gnttab_suspend\n"); 463 gnttab_suspend(); 464 465 flags = intr_clear(); 466 467 xpv_time_suspend(); 468 469 /* 470 * Currently, the hypervisor incorrectly fails to bring back 471 * powered-down VCPUs. Thus we need to record any powered-down VCPUs 472 * to prevent any attempts to operate on them. But we have to do this 473 * *after* the very first time we do ec_suspend(). 474 */ 475 for (i = 1; i < ncpus; i++) { 476 if (cpu[i] == NULL) 477 continue; 478 479 if (cpu_get_state(cpu[i]) == P_POWEROFF) 480 CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i); 481 } 482 483 /* 484 * The dom0 save/migrate code doesn't automatically translate 485 * these into PFNs, but expects them to be, so we do it here. 486 * We don't use mfn_to_pfn() because so many OS services have 487 * been disabled at this point. 488 */ 489 xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn]; 490 xen_info->console.domU.mfn = 491 mfn_to_pfn_mapping[xen_info->console.domU.mfn]; 492 493 if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) { 494 prom_printf("xen_suspend_domain(): " 495 "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n"); 496 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 497 } 498 499 if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, 500 0, UVMF_INVLPG)) { 501 prom_printf("xen_suspend_domain(): " 502 "HYPERVISOR_update_va_mapping() failed\n"); 503 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 504 } 505 506 SUSPEND_DEBUG("HYPERVISOR_suspend\n"); 507 508 /* 509 * At this point we suspend and sometime later resume. 510 */ 511 if (HYPERVISOR_suspend(start_info_mfn)) { 512 prom_printf("xen_suspend_domain(): " 513 "HYPERVISOR_suspend() failed\n"); 514 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 515 } 516 517 /* 518 * Point HYPERVISOR_shared_info to its new value. 519 */ 520 if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, 521 xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE, 522 UVMF_INVLPG)) 523 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 524 525 if (xen_info->nr_pages != mfn_count) { 526 prom_printf("xen_suspend_domain(): number of pages" 527 " changed, was 0x%lx, now 0x%lx\n", mfn_count, 528 xen_info->nr_pages); 529 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 530 } 531 532 xpv_time_resume(); 533 534 cached_max_mfn = 0; 535 536 SUSPEND_DEBUG("gnttab_resume\n"); 537 gnttab_resume(); 538 539 /* XXPV: add a note that this must be lockless. */ 540 SUSPEND_DEBUG("ec_resume\n"); 541 ec_resume(); 542 543 intr_restore(flags); 544 545 if (ncpus > 1) 546 resume_cpus(); 547 548 mutex_exit(&ec_lock); 549 xen_end_migrate(); 550 mutex_exit(&cpu_lock); 551 552 /* 553 * Now we can take locks again. 554 */ 555 556 /* 557 * Force the tick value used for tv_nsec in hres_tick() to be up to 558 * date. rtcsync() will reset the hrestime value appropriately. 559 */ 560 hres_last_tick = xpv_gethrtime(); 561 562 /* 563 * XXPV: we need to have resumed the CPUs since this takes locks, but 564 * can remote CPUs see bad state? Presumably yes. Should probably nest 565 * taking of todlock inside of cpu_lock, or vice versa, then provide an 566 * unlocked version. Probably need to call clkinitf to reset cpu freq 567 * and re-calibrate if we migrated to a different speed cpu. Also need 568 * to make a (re)init_cpu_info call to update processor info structs 569 * and device tree info. That remains to be written at the moment. 570 */ 571 rtcsync(); 572 573 rebuild_mfn_list(); 574 575 SUSPEND_DEBUG("xenbus_resume\n"); 576 xenbus_resume(); 577 SUSPEND_DEBUG("xenbus_resume_devices\n"); 578 xen_resume_devices(); 579 580 thread_affinity_clear(curthread); 581 kpreempt_enable(); 582 583 SUSPEND_DEBUG("finished xen_suspend_domain\n"); 584 585 /* 586 * We have restarted our suspended domain, update the hypervisor 587 * details. NB: This must be done at the end of this function, 588 * since we need the domain to be completely resumed before 589 * these functions will work correctly. 590 */ 591 xen_set_version(XENVER_CURRENT_IDX); 592 593 /* 594 * We can check and report a warning, but we don't stop the 595 * process. 596 */ 597 if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) 598 cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " 599 "but need at least version v3.0.4", 600 XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), 601 XENVER_CURRENT(xv_ver)); 602 603 cmn_err(CE_NOTE, "domain restore/migrate completed"); 604 } 605 606 /*ARGSUSED*/ 607 int 608 xen_debug_handler(void *arg) 609 { 610 debug_enter("External debug event received"); 611 612 /* 613 * If we've not got KMDB loaded, output some stuff difficult to capture 614 * from a domain core. 615 */ 616 if (!(boothowto & RB_DEBUG)) { 617 shared_info_t *si = HYPERVISOR_shared_info; 618 int i; 619 620 prom_printf("evtchn_pending [ "); 621 for (i = 0; i < 8; i++) 622 prom_printf("%lx ", si->evtchn_pending[i]); 623 prom_printf("]\nevtchn_mask [ "); 624 for (i = 0; i < 8; i++) 625 prom_printf("%lx ", si->evtchn_mask[i]); 626 prom_printf("]\n"); 627 628 for (i = 0; i < ncpus; i++) { 629 vcpu_info_t *vcpu = &si->vcpu_info[i]; 630 if (cpu[i] == NULL) 631 continue; 632 prom_printf("CPU%d pending %d mask %d sel %lx\n", 633 i, vcpu->evtchn_upcall_pending, 634 vcpu->evtchn_upcall_mask, 635 vcpu->evtchn_pending_sel); 636 } 637 } 638 639 return (0); 640 } 641 642 /*ARGSUSED*/ 643 static void 644 xen_sysrq_handler(struct xenbus_watch *watch, const char **vec, 645 unsigned int len) 646 { 647 xenbus_transaction_t xbt; 648 char key = '\0'; 649 int ret; 650 651 retry: 652 if (xenbus_transaction_start(&xbt)) { 653 cmn_err(CE_WARN, "failed to start sysrq transaction"); 654 return; 655 } 656 657 if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) { 658 /* 659 * ENOENT happens in response to our own xenbus_rm. 660 * XXPV - this happens spuriously on boot? 661 */ 662 if (ret != ENOENT) 663 cmn_err(CE_WARN, "failed to read sysrq: %d", ret); 664 goto out; 665 } 666 667 if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) { 668 cmn_err(CE_WARN, "failed to reset sysrq: %d", ret); 669 goto out; 670 } 671 672 if (xenbus_transaction_end(xbt, 0) == EAGAIN) 673 goto retry; 674 675 /* 676 * Somewhat arbitrary - on Linux this means 'reboot'. We could just 677 * accept any key, but this might increase the risk of sending a 678 * harmless sysrq to the wrong domain... 679 */ 680 if (key == 'b') 681 (void) xen_debug_handler(NULL); 682 else 683 cmn_err(CE_WARN, "Ignored sysrq %c", key); 684 return; 685 686 out: 687 (void) xenbus_transaction_end(xbt, 1); 688 } 689 690 taskq_t *xen_shutdown_tq; 691 692 #define SHUTDOWN_INVALID -1 693 #define SHUTDOWN_POWEROFF 0 694 #define SHUTDOWN_REBOOT 1 695 #define SHUTDOWN_SUSPEND 2 696 #define SHUTDOWN_HALT 3 697 #define SHUTDOWN_MAX 4 698 699 #define SHUTDOWN_TIMEOUT_SECS (60 * 5) 700 701 static const char *cmd_strings[SHUTDOWN_MAX] = { 702 "poweroff", 703 "reboot", 704 "suspend", 705 "halt" 706 }; 707 708 static void 709 xen_dirty_shutdown(void *arg) 710 { 711 int cmd = (uintptr_t)arg; 712 713 cmn_err(CE_WARN, "Externally requested shutdown failed or " 714 "timed out.\nShutting down.\n"); 715 716 switch (cmd) { 717 case SHUTDOWN_HALT: 718 case SHUTDOWN_POWEROFF: 719 (void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred); 720 break; 721 case SHUTDOWN_REBOOT: 722 (void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred); 723 break; 724 } 725 } 726 727 static void 728 xen_shutdown(void *arg) 729 { 730 nvlist_t *attr_list = NULL; 731 sysevent_t *event = NULL; 732 sysevent_id_t eid; 733 int cmd = (uintptr_t)arg; 734 int err; 735 736 ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX); 737 738 if (cmd == SHUTDOWN_SUSPEND) { 739 xen_suspend_domain(); 740 return; 741 } 742 743 err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP); 744 if (err != DDI_SUCCESS) 745 goto failure; 746 747 err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]); 748 if (err != DDI_SUCCESS) 749 goto failure; 750 751 if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv", 752 SE_SLEEP)) == NULL) 753 goto failure; 754 (void) sysevent_attach_attributes(event, 755 (sysevent_attr_list_t *)attr_list); 756 757 err = log_sysevent(event, SE_SLEEP, &eid); 758 759 sysevent_detach_attributes(event); 760 sysevent_free(event); 761 762 if (err != 0) 763 goto failure; 764 765 (void) timeout(xen_dirty_shutdown, arg, 766 SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC)); 767 768 nvlist_free(attr_list); 769 return; 770 771 failure: 772 if (attr_list != NULL) 773 nvlist_free(attr_list); 774 xen_dirty_shutdown(arg); 775 } 776 777 /*ARGSUSED*/ 778 static void 779 xen_shutdown_handler(struct xenbus_watch *watch, const char **vec, 780 unsigned int len) 781 { 782 char *str; 783 xenbus_transaction_t xbt; 784 int err, shutdown_code = SHUTDOWN_INVALID; 785 unsigned int slen; 786 787 again: 788 err = xenbus_transaction_start(&xbt); 789 if (err) 790 return; 791 if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) { 792 (void) xenbus_transaction_end(xbt, 1); 793 return; 794 } 795 796 SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str); 797 798 /* 799 * If this is a watch fired from our write below, check out early to 800 * avoid an infinite loop. 801 */ 802 if (strcmp(str, "") == 0) { 803 (void) xenbus_transaction_end(xbt, 0); 804 kmem_free(str, slen); 805 return; 806 } else if (strcmp(str, "poweroff") == 0) { 807 shutdown_code = SHUTDOWN_POWEROFF; 808 } else if (strcmp(str, "reboot") == 0) { 809 shutdown_code = SHUTDOWN_REBOOT; 810 } else if (strcmp(str, "suspend") == 0) { 811 shutdown_code = SHUTDOWN_SUSPEND; 812 } else if (strcmp(str, "halt") == 0) { 813 shutdown_code = SHUTDOWN_HALT; 814 } else { 815 printf("Ignoring shutdown request: %s\n", str); 816 } 817 818 /* 819 * XXPV Should we check the value of xenbus_write() too, or are all 820 * errors automatically folded into xenbus_transaction_end() ?? 821 */ 822 (void) xenbus_write(xbt, "control", "shutdown", ""); 823 err = xenbus_transaction_end(xbt, 0); 824 if (err == EAGAIN) { 825 SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id); 826 kmem_free(str, slen); 827 goto again; 828 } 829 830 kmem_free(str, slen); 831 if (shutdown_code != SHUTDOWN_INVALID) { 832 (void) taskq_dispatch(xen_shutdown_tq, xen_shutdown, 833 (void *)(intptr_t)shutdown_code, 0); 834 } 835 } 836 837 static struct xenbus_watch shutdown_watch; 838 static struct xenbus_watch sysrq_watch; 839 840 void 841 xen_late_startup(void) 842 { 843 if (!DOMAIN_IS_INITDOMAIN(xen_info)) { 844 xen_shutdown_tq = taskq_create("shutdown_taskq", 1, 845 maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE); 846 shutdown_watch.node = "control/shutdown"; 847 shutdown_watch.callback = xen_shutdown_handler; 848 if (register_xenbus_watch(&shutdown_watch)) 849 cmn_err(CE_WARN, "Failed to set shutdown watcher"); 850 851 sysrq_watch.node = "control/sysrq"; 852 sysrq_watch.callback = xen_sysrq_handler; 853 if (register_xenbus_watch(&sysrq_watch)) 854 cmn_err(CE_WARN, "Failed to set sysrq watcher"); 855 } 856 balloon_init(xen_info->nr_pages); 857 } 858 859 #ifdef DEBUG 860 #define XEN_PRINTF_BUFSIZE 1024 861 862 char xen_printf_buffer[XEN_PRINTF_BUFSIZE]; 863 864 /* 865 * Printf function that calls hypervisor directly. For DomU it only 866 * works when running on a xen hypervisor built with debug on. Works 867 * always since no I/O ring interaction is needed. 868 */ 869 /*PRINTFLIKE1*/ 870 void 871 xen_printf(const char *fmt, ...) 872 { 873 va_list ap; 874 875 va_start(ap, fmt); 876 (void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap); 877 va_end(ap); 878 879 (void) HYPERVISOR_console_io(CONSOLEIO_write, 880 strlen(xen_printf_buffer), xen_printf_buffer); 881 } 882 #else 883 void 884 xen_printf(const char *fmt, ...) 885 { 886 } 887 #endif /* DEBUG */ 888 889 void 890 xen_version(void) 891 { 892 xen_set_version(XENVER_BOOT_IDX); 893 if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0) 894 cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s " 895 "but need at least version v3.0.4", 896 XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor), 897 XENVER_CURRENT(xv_ver)); 898 } 899 900 /* 901 * Miscellaneous hypercall wrappers with slightly more verbose diagnostics. 902 */ 903 904 void 905 xen_set_gdt(ulong_t *frame_list, int entries) 906 { 907 int err; 908 if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) { 909 /* 910 * X_EINVAL: reserved entry or bad frames 911 * X_EFAULT: bad address 912 */ 913 panic("xen_set_gdt(%p, %d): error %d", 914 (void *)frame_list, entries, -(int)err); 915 } 916 } 917 918 void 919 xen_set_ldt(user_desc_t *ldt, uint_t nsels) 920 { 921 struct mmuext_op op; 922 long err; 923 924 op.cmd = MMUEXT_SET_LDT; 925 op.arg1.linear_addr = (uintptr_t)ldt; 926 op.arg2.nr_ents = nsels; 927 928 if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) { 929 panic("xen_set_ldt(%p, %d): error %d", 930 (void *)ldt, nsels, -(int)err); 931 } 932 } 933 934 void 935 xen_stack_switch(ulong_t ss, ulong_t esp) 936 { 937 long err; 938 939 if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) { 940 /* 941 * X_EPERM: bad selector 942 */ 943 panic("xen_stack_switch(%lx, %lx): error %d", ss, esp, 944 -(int)err); 945 } 946 } 947 948 long 949 xen_set_trap_table(trap_info_t *table) 950 { 951 long err; 952 953 if ((err = HYPERVISOR_set_trap_table(table)) != 0) { 954 /* 955 * X_EFAULT: bad address 956 * X_EPERM: bad selector 957 */ 958 panic("xen_set_trap_table(%p): error %d", (void *)table, 959 -(int)err); 960 } 961 return (err); 962 } 963 964 #if defined(__amd64) 965 void 966 xen_set_segment_base(int reg, ulong_t value) 967 { 968 long err; 969 970 if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) { 971 /* 972 * X_EFAULT: bad address 973 * X_EINVAL: bad type 974 */ 975 panic("xen_set_segment_base(%d, %lx): error %d", 976 reg, value, -(int)err); 977 } 978 } 979 #endif /* __amd64 */ 980 981 /* 982 * Translate a hypervisor errcode to a Solaris error code. 983 */ 984 int 985 xen_xlate_errcode(int error) 986 { 987 switch (-error) { 988 989 /* 990 * Translate hypervisor errno's into native errno's 991 */ 992 993 #define CASE(num) case X_##num: error = num; break 994 995 CASE(EPERM); CASE(ENOENT); CASE(ESRCH); 996 CASE(EINTR); CASE(EIO); CASE(ENXIO); 997 CASE(E2BIG); CASE(ENOMEM); CASE(EACCES); 998 CASE(EFAULT); CASE(EBUSY); CASE(EEXIST); 999 CASE(ENODEV); CASE(EISDIR); CASE(EINVAL); 1000 CASE(ENOSPC); CASE(ESPIPE); CASE(EROFS); 1001 CASE(ENOSYS); CASE(ENOTEMPTY); CASE(EISCONN); 1002 CASE(ENODATA); 1003 1004 #undef CASE 1005 1006 default: 1007 panic("xen_xlate_errcode: unknown error %d", error); 1008 } 1009 1010 return (error); 1011 } 1012 1013 /* 1014 * Raise PS_IOPL on current vcpu to user level. 1015 * Caller responsible for preventing kernel preemption. 1016 */ 1017 void 1018 xen_enable_user_iopl(void) 1019 { 1020 physdev_set_iopl_t set_iopl; 1021 set_iopl.iopl = 3; /* user ring 3 */ 1022 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1023 } 1024 1025 /* 1026 * Drop PS_IOPL on current vcpu to kernel level 1027 */ 1028 void 1029 xen_disable_user_iopl(void) 1030 { 1031 physdev_set_iopl_t set_iopl; 1032 set_iopl.iopl = 1; /* kernel pseudo ring 1 */ 1033 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1034 } 1035 1036 int 1037 xen_gdt_setprot(cpu_t *cp, uint_t prot) 1038 { 1039 int err; 1040 #if defined(__amd64) 1041 int pt_bits = PT_VALID; 1042 if (prot & PROT_WRITE) 1043 pt_bits |= PT_WRITABLE; 1044 #endif 1045 1046 if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt, 1047 MMU_PAGESIZE, prot)) != 0) 1048 goto done; 1049 1050 #if defined(__amd64) 1051 err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits); 1052 #endif 1053 1054 done: 1055 if (err) { 1056 cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d", 1057 cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only", 1058 err); 1059 } 1060 1061 return (err); 1062 } 1063 1064 int 1065 xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot) 1066 { 1067 int err; 1068 caddr_t lva = (caddr_t)ldt; 1069 #if defined(__amd64) 1070 int pt_bits = PT_VALID; 1071 pgcnt_t npgs; 1072 if (prot & PROT_WRITE) 1073 pt_bits |= PT_WRITABLE; 1074 #endif /* __amd64 */ 1075 1076 if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0) 1077 goto done; 1078 1079 #if defined(__amd64) 1080 1081 ASSERT(IS_P2ALIGNED(lsize, PAGESIZE)); 1082 npgs = mmu_btop(lsize); 1083 while (npgs--) { 1084 if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva), 1085 pt_bits)) != 0) 1086 break; 1087 lva += PAGESIZE; 1088 } 1089 #endif /* __amd64 */ 1090 1091 done: 1092 if (err) { 1093 cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d", 1094 (void *)lva, 1095 (prot & PROT_WRITE) ? "writable" : "read-only", err); 1096 } 1097 1098 return (err); 1099 } 1100