1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* derived from netbsd's xen_machdep.c 1.1.2.1 */ 30 31 /* 32 * 33 * Copyright (c) 2004 Christian Limpach. 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. This section intentionally left blank. 45 * 4. The name of the author may not be used to endorse or promote products 46 * derived from this software without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 49 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 50 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 51 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 52 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 53 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 54 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 55 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 56 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 57 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 58 */ 59 /* 60 * Section 3 of the above license was updated in response to bug 6379571. 61 */ 62 63 #include <sys/types.h> 64 #include <sys/cmn_err.h> 65 #include <sys/trap.h> 66 #include <sys/segments.h> 67 #include <sys/sunddi.h> /* for ddi_strtoul */ 68 #include <sys/hypervisor.h> 69 #include <sys/xen_mmu.h> 70 #include <sys/machsystm.h> 71 #include <sys/promif.h> 72 #include <sys/bootconf.h> 73 #include <sys/bootinfo.h> 74 #include <sys/cpr.h> 75 #include <sys/taskq.h> 76 #include <sys/uadmin.h> 77 #include <sys/evtchn_impl.h> 78 #include <sys/archsystm.h> 79 #include <xen/sys/xenbus_impl.h> 80 #include <sys/mach_mmu.h> 81 #include <vm/hat_i86.h> 82 #include <sys/gnttab.h> 83 #include <sys/reboot.h> 84 #include <sys/stack.h> 85 #include <sys/clock.h> 86 #include <sys/bitmap.h> 87 #include <sys/processor.h> 88 #include <sys/xen_errno.h> 89 #include <sys/xpv_panic.h> 90 #include <sys/smp_impldefs.h> 91 #include <sys/cpu.h> 92 #include <sys/balloon_impl.h> 93 #include <sys/ddi.h> 94 95 /* 96 * Hypervisor-specific utility routines - these can be invoked from the 97 * normal control flow. It might be useful to partition these into 98 * different files, but let's see how it looks before we get too 99 * carried away with that idea. 100 */ 101 102 /* 103 * In the current absence of any useful way to debug domains that are hung 104 * whilst suspending, we have a more clumsy approach... 105 */ 106 #ifdef DEBUG 107 #define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf 108 #else 109 #define SUSPEND_DEBUG(...) 110 #endif 111 112 int cpr_debug; 113 cpuset_t cpu_suspend_set; 114 cpuset_t cpu_suspend_lost_set; 115 volatile int xen_suspending_cpus; 116 static int xen_suspend_debug; 117 118 void 119 xen_set_callback(void (*func)(void), uint_t type, uint_t flags) 120 { 121 struct callback_register cb; 122 123 bzero(&cb, sizeof (cb)); 124 #if defined(__amd64) 125 cb.address = (ulong_t)func; 126 #elif defined(__i386) 127 cb.address.cs = KCS_SEL; 128 cb.address.eip = (ulong_t)func; 129 #endif 130 cb.type = type; 131 cb.flags = flags; 132 133 /* 134 * XXPV always ignore return value for NMI 135 */ 136 if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 && 137 type != CALLBACKTYPE_nmi) 138 panic("HYPERVISOR_callback_op failed"); 139 } 140 141 void 142 xen_init_callbacks(void) 143 { 144 /* 145 * register event (interrupt) handler. 146 */ 147 xen_set_callback(xen_callback, CALLBACKTYPE_event, 0); 148 149 /* 150 * failsafe handler. 151 */ 152 xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe, 153 CALLBACKF_mask_events); 154 155 /* 156 * NMI handler. 157 */ 158 xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0); 159 160 /* 161 * system call handler 162 * XXPV move to init_cpu_syscall? 163 */ 164 #if defined(__amd64) 165 xen_set_callback(sys_syscall, CALLBACKTYPE_syscall, 166 CALLBACKF_mask_events); 167 #endif /* __amd64 */ 168 } 169 170 171 /* 172 * cmn_err() followed by a 1/4 second delay; this gives the 173 * logging service a chance to flush messages and helps avoid 174 * intermixing output from prom_printf(). 175 * XXPV: doesn't exactly help us on UP though. 176 */ 177 /*PRINTFLIKE2*/ 178 void 179 cpr_err(int ce, const char *fmt, ...) 180 { 181 va_list adx; 182 183 va_start(adx, fmt); 184 vcmn_err(ce, fmt, adx); 185 va_end(adx); 186 drv_usecwait(MICROSEC >> 2); 187 } 188 189 void 190 xen_suspend_devices(void) 191 { 192 int rc; 193 194 SUSPEND_DEBUG("xen_suspend_devices\n"); 195 196 if ((rc = cpr_suspend_devices(ddi_root_node())) != 0) 197 panic("failed to suspend devices: %d", rc); 198 } 199 200 void 201 xen_resume_devices(void) 202 { 203 int rc; 204 205 SUSPEND_DEBUG("xen_resume_devices\n"); 206 207 if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0) 208 panic("failed to resume devices: %d", rc); 209 } 210 211 /* 212 * The list of mfn pages is out of date. Recompute it. 213 * XXPV: can we race against another suspend call? Think not. 214 */ 215 static void 216 rebuild_mfn_list(void) 217 { 218 int i = 0; 219 size_t sz; 220 size_t off; 221 pfn_t pfn; 222 223 SUSPEND_DEBUG("rebuild_mfn_list\n"); 224 225 sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK; 226 227 for (off = 0; off < sz; off += MMU_PAGESIZE) { 228 size_t j = mmu_btop(off); 229 if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) { 230 pfn = hat_getpfnum(kas.a_hat, 231 (caddr_t)&mfn_list_pages[j]); 232 mfn_list_pages_page[i++] = pfn_to_mfn(pfn); 233 } 234 235 pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off); 236 mfn_list_pages[j] = pfn_to_mfn(pfn); 237 } 238 239 pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page); 240 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list 241 = pfn_to_mfn(pfn); 242 } 243 244 static void 245 suspend_cpus(void) 246 { 247 int i; 248 249 SUSPEND_DEBUG("suspend_cpus\n"); 250 251 xen_suspending_cpus = 1; 252 253 pause_cpus(NULL); 254 255 SUSPEND_DEBUG("waiting for offline CPUs\n"); 256 257 /* 258 * For us to proceed safely, all CPUs except the current one must be 259 * present in cpu_suspend_set. Running CPUs will participate in 260 * pause_cpus(), and eventually reach mach_cpu_pause(). Powered-off 261 * VCPUs will already be in the set, again in mach_cpu_pause(). 262 * Finally, offline CPUs will be sitting in mach_cpu_idle(). 263 */ 264 while (!CPUSET_ISEQUAL(mp_cpus, cpu_suspend_set)) 265 SMT_PAUSE(); 266 267 for (i = 1; i < ncpus; i++) { 268 if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { 269 SUSPEND_DEBUG("xen_vcpu_down %d\n", i); 270 (void) xen_vcpu_down(i); 271 } 272 273 mach_cpucontext_reset(cpu[i]); 274 } 275 } 276 277 static void 278 resume_cpus(void) 279 { 280 int i; 281 282 xen_suspending_cpus = 0; 283 284 for (i = 1; i < ncpus; i++) { 285 if (cpu[i] == NULL) 286 continue; 287 288 if (!CPU_IN_SET(cpu_suspend_lost_set, i)) { 289 SUSPEND_DEBUG("xen_vcpu_up %d\n", i); 290 mach_cpucontext_restore(cpu[i]); 291 (void) xen_vcpu_up(i); 292 } 293 } 294 295 start_cpus(); 296 } 297 298 /* 299 * Top level routine to direct suspend/resume of a domain. 300 */ 301 void 302 xen_suspend_domain(void) 303 { 304 extern void rtcsync(void); 305 extern hrtime_t hres_last_tick; 306 mfn_t start_info_mfn; 307 ulong_t flags; 308 pfn_t pfn; 309 int i; 310 311 /* 312 * XXPV - Are we definitely OK to suspend by the time we've connected 313 * the handler? 314 */ 315 316 cpr_err(CE_NOTE, "Domain suspending for save/migrate"); 317 318 SUSPEND_DEBUG("xen_suspend_domain\n"); 319 320 /* 321 * suspend interrupts and devices 322 * XXPV - we use suspend/resume for both save/restore domains (like sun 323 * cpr) and for migration. Would be nice to know the difference if 324 * possible. For save/restore where down time may be a long time, we 325 * may want to do more of the things that cpr does. (i.e. notify user 326 * processes, shrink memory footprint for faster restore, etc.) 327 */ 328 xen_suspend_devices(); 329 SUSPEND_DEBUG("xenbus_suspend\n"); 330 xenbus_suspend(); 331 332 pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info); 333 start_info_mfn = pfn_to_mfn(pfn); 334 335 /* 336 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe 337 * wrt xenbus being suspended here? 338 */ 339 mutex_enter(&cpu_lock); 340 341 /* 342 * Suspend must be done on vcpu 0, as no context for other CPUs is 343 * saved. 344 * 345 * XXPV - add to taskq API ? 346 */ 347 thread_affinity_set(curthread, 0); 348 kpreempt_disable(); 349 350 SUSPEND_DEBUG("xen_start_migrate\n"); 351 xen_start_migrate(); 352 if (ncpus > 1) 353 suspend_cpus(); 354 355 /* 356 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence 357 * any holder would have dropped it to get through suspend_cpus(). 358 */ 359 mutex_enter(&ec_lock); 360 361 /* 362 * From here on in, we can't take locks. 363 */ 364 SUSPEND_DEBUG("ec_suspend\n"); 365 ec_suspend(); 366 SUSPEND_DEBUG("gnttab_suspend\n"); 367 gnttab_suspend(); 368 369 flags = intr_clear(); 370 371 xpv_time_suspend(); 372 373 /* 374 * Currently, the hypervisor incorrectly fails to bring back 375 * powered-down VCPUs. Thus we need to record any powered-down VCPUs 376 * to prevent any attempts to operate on them. But we have to do this 377 * *after* the very first time we do ec_suspend(). 378 */ 379 for (i = 1; i < ncpus; i++) { 380 if (cpu[i] == NULL) 381 continue; 382 383 if (cpu_get_state(cpu[i]) == P_POWEROFF) 384 CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i); 385 } 386 387 /* 388 * The dom0 save/migrate code doesn't automatically translate 389 * these into PFNs, but expects them to be, so we do it here. 390 * We don't use mfn_to_pfn() because so many OS services have 391 * been disabled at this point. 392 */ 393 xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn]; 394 xen_info->console.domU.mfn = 395 mfn_to_pfn_mapping[xen_info->console.domU.mfn]; 396 397 if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) { 398 prom_printf("xen_suspend_domain(): " 399 "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n"); 400 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 401 } 402 403 if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, 404 0, UVMF_INVLPG)) { 405 prom_printf("xen_suspend_domain(): " 406 "HYPERVISOR_update_va_mapping() failed\n"); 407 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 408 } 409 410 SUSPEND_DEBUG("HYPERVISOR_suspend\n"); 411 412 /* 413 * At this point we suspend and sometime later resume. 414 */ 415 if (HYPERVISOR_suspend(start_info_mfn)) { 416 prom_printf("xen_suspend_domain(): " 417 "HYPERVISOR_suspend() failed\n"); 418 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 419 } 420 421 /* 422 * Point HYPERVISOR_shared_info to its new value. 423 */ 424 if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info, 425 xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE, 426 UVMF_INVLPG)) 427 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 428 429 if (xen_info->nr_pages != mfn_count) { 430 prom_printf("xen_suspend_domain(): number of pages" 431 " changed, was 0x%lx, now 0x%lx\n", mfn_count, 432 xen_info->nr_pages); 433 (void) HYPERVISOR_shutdown(SHUTDOWN_crash); 434 } 435 436 xpv_time_resume(); 437 438 cached_max_mfn = 0; 439 440 SUSPEND_DEBUG("gnttab_resume\n"); 441 gnttab_resume(); 442 443 /* XXPV: add a note that this must be lockless. */ 444 SUSPEND_DEBUG("ec_resume\n"); 445 ec_resume(); 446 447 intr_restore(flags); 448 449 if (ncpus > 1) 450 resume_cpus(); 451 452 mutex_exit(&ec_lock); 453 xen_end_migrate(); 454 mutex_exit(&cpu_lock); 455 456 /* 457 * Now we can take locks again. 458 */ 459 460 /* 461 * Force the tick value used for tv_nsec in hres_tick() to be up to 462 * date. rtcsync() will reset the hrestime value appropriately. 463 */ 464 hres_last_tick = xpv_gethrtime(); 465 466 /* 467 * XXPV: we need to have resumed the CPUs since this takes locks, but 468 * can remote CPUs see bad state? Presumably yes. Should probably nest 469 * taking of todlock inside of cpu_lock, or vice versa, then provide an 470 * unlocked version. Probably need to call clkinitf to reset cpu freq 471 * and re-calibrate if we migrated to a different speed cpu. Also need 472 * to make a (re)init_cpu_info call to update processor info structs 473 * and device tree info. That remains to be written at the moment. 474 */ 475 rtcsync(); 476 477 rebuild_mfn_list(); 478 479 SUSPEND_DEBUG("xenbus_resume\n"); 480 xenbus_resume(); 481 SUSPEND_DEBUG("xenbus_resume_devices\n"); 482 xen_resume_devices(); 483 484 thread_affinity_clear(curthread); 485 kpreempt_enable(); 486 487 SUSPEND_DEBUG("finished xen_suspend_domain\n"); 488 cmn_err(CE_NOTE, "domain restore/migrate completed"); 489 } 490 491 /*ARGSUSED*/ 492 int 493 xen_debug_handler(void *arg) 494 { 495 debug_enter("External debug event received"); 496 497 /* 498 * If we've not got KMDB loaded, output some stuff difficult to capture 499 * from a domain core. 500 */ 501 if (!(boothowto & RB_DEBUG)) { 502 shared_info_t *si = HYPERVISOR_shared_info; 503 int i; 504 505 prom_printf("evtchn_pending [ "); 506 for (i = 0; i < 8; i++) 507 prom_printf("%lx ", si->evtchn_pending[i]); 508 prom_printf("]\nevtchn_mask [ "); 509 for (i = 0; i < 8; i++) 510 prom_printf("%lx ", si->evtchn_mask[i]); 511 prom_printf("]\n"); 512 513 for (i = 0; i < ncpus; i++) { 514 vcpu_info_t *vcpu = &si->vcpu_info[i]; 515 if (cpu[i] == NULL) 516 continue; 517 prom_printf("CPU%d pending %d mask %d sel %lx\n", 518 i, vcpu->evtchn_upcall_pending, 519 vcpu->evtchn_upcall_mask, 520 vcpu->evtchn_pending_sel); 521 } 522 } 523 524 return (0); 525 } 526 527 /*ARGSUSED*/ 528 static void 529 xen_sysrq_handler(struct xenbus_watch *watch, const char **vec, 530 unsigned int len) 531 { 532 xenbus_transaction_t xbt; 533 char key = '\0'; 534 int ret; 535 536 retry: 537 if (xenbus_transaction_start(&xbt)) { 538 cmn_err(CE_WARN, "failed to start sysrq transaction"); 539 return; 540 } 541 542 if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) { 543 /* 544 * ENOENT happens in response to our own xenbus_rm. 545 * XXPV - this happens spuriously on boot? 546 */ 547 if (ret != ENOENT) 548 cmn_err(CE_WARN, "failed to read sysrq: %d", ret); 549 goto out; 550 } 551 552 if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) { 553 cmn_err(CE_WARN, "failed to reset sysrq: %d", ret); 554 goto out; 555 } 556 557 if (xenbus_transaction_end(xbt, 0) == EAGAIN) 558 goto retry; 559 560 /* 561 * Somewhat arbitrary - on Linux this means 'reboot'. We could just 562 * accept any key, but this might increase the risk of sending a 563 * harmless sysrq to the wrong domain... 564 */ 565 if (key == 'b') 566 (void) xen_debug_handler(NULL); 567 else 568 cmn_err(CE_WARN, "Ignored sysrq %c", key); 569 return; 570 571 out: 572 (void) xenbus_transaction_end(xbt, 1); 573 } 574 575 taskq_t *xen_shutdown_tq; 576 volatile int shutdown_req_active; 577 578 #define SHUTDOWN_INVALID -1 579 #define SHUTDOWN_POWEROFF 0 580 #define SHUTDOWN_REBOOT 1 581 #define SHUTDOWN_SUSPEND 2 582 #define SHUTDOWN_HALT 3 583 #define SHUTDOWN_MAX 4 584 585 #define SHUTDOWN_TIMEOUT_SECS (60 * 5) 586 587 static const char *cmd_strings[SHUTDOWN_MAX] = { 588 "poweroff", 589 "reboot", 590 "suspend", 591 "halt" 592 }; 593 594 static void 595 xen_dirty_shutdown(void *arg) 596 { 597 int cmd = (uintptr_t)arg; 598 599 cmn_err(CE_WARN, "Externally requested shutdown failed or " 600 "timed out.\nShutting down.\n"); 601 602 switch (cmd) { 603 case SHUTDOWN_HALT: 604 case SHUTDOWN_POWEROFF: 605 (void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred); 606 break; 607 case SHUTDOWN_REBOOT: 608 (void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred); 609 break; 610 } 611 } 612 613 static void 614 xen_shutdown(void *arg) 615 { 616 nvlist_t *attr_list = NULL; 617 sysevent_t *event = NULL; 618 sysevent_id_t eid; 619 int cmd = (uintptr_t)arg; 620 int err; 621 622 ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX); 623 624 if (cmd == SHUTDOWN_SUSPEND) { 625 xen_suspend_domain(); 626 shutdown_req_active = 0; 627 return; 628 } 629 630 err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP); 631 if (err != DDI_SUCCESS) 632 goto failure; 633 634 err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]); 635 if (err != DDI_SUCCESS) 636 goto failure; 637 638 if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv", 639 SE_SLEEP)) == NULL) 640 goto failure; 641 (void) sysevent_attach_attributes(event, 642 (sysevent_attr_list_t *)attr_list); 643 644 err = log_sysevent(event, SE_SLEEP, &eid); 645 646 sysevent_detach_attributes(event); 647 sysevent_free(event); 648 649 if (err != 0) 650 goto failure; 651 652 (void) timeout(xen_dirty_shutdown, arg, 653 SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC)); 654 655 nvlist_free(attr_list); 656 return; 657 658 failure: 659 if (attr_list != NULL) 660 nvlist_free(attr_list); 661 xen_dirty_shutdown(arg); 662 } 663 664 /*ARGSUSED*/ 665 static void 666 xen_shutdown_handler(struct xenbus_watch *watch, const char **vec, 667 unsigned int len) 668 { 669 char *str; 670 xenbus_transaction_t xbt; 671 int err, shutdown_code = SHUTDOWN_INVALID; 672 unsigned int slen; 673 674 again: 675 err = xenbus_transaction_start(&xbt); 676 if (err) 677 return; 678 if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) { 679 (void) xenbus_transaction_end(xbt, 1); 680 return; 681 } 682 683 SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str); 684 685 /* 686 * If this is a watch fired from our write below, check out early to 687 * avoid an infinite loop. 688 */ 689 if (strcmp(str, "") == 0) { 690 (void) xenbus_transaction_end(xbt, 0); 691 kmem_free(str, slen); 692 return; 693 } else if (strcmp(str, "poweroff") == 0) { 694 shutdown_code = SHUTDOWN_POWEROFF; 695 } else if (strcmp(str, "reboot") == 0) { 696 shutdown_code = SHUTDOWN_REBOOT; 697 } else if (strcmp(str, "suspend") == 0) { 698 shutdown_code = SHUTDOWN_SUSPEND; 699 } else if (strcmp(str, "halt") == 0) { 700 shutdown_code = SHUTDOWN_HALT; 701 } else { 702 printf("Ignoring shutdown request: %s\n", str); 703 } 704 705 /* 706 * XXPV Should we check the value of xenbus_write() too, or are all 707 * errors automatically folded into xenbus_transaction_end() ?? 708 */ 709 (void) xenbus_write(xbt, "control", "shutdown", ""); 710 err = xenbus_transaction_end(xbt, 0); 711 if (err == EAGAIN) { 712 SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id); 713 kmem_free(str, slen); 714 goto again; 715 } 716 717 kmem_free(str, slen); 718 if (shutdown_code != SHUTDOWN_INVALID) { 719 if (shutdown_code == SHUTDOWN_SUSPEND) { 720 while (shutdown_req_active) 721 SMT_PAUSE(); 722 } 723 724 shutdown_req_active = 1; 725 (void) taskq_dispatch(xen_shutdown_tq, xen_shutdown, 726 (void *)(intptr_t)shutdown_code, 0); 727 } 728 } 729 730 static struct xenbus_watch shutdown_watch; 731 static struct xenbus_watch sysrq_watch; 732 733 void 734 xen_late_startup(void) 735 { 736 if (!DOMAIN_IS_INITDOMAIN(xen_info)) { 737 xen_shutdown_tq = taskq_create("shutdown_taskq", 1, 738 maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE); 739 shutdown_watch.node = "control/shutdown"; 740 shutdown_watch.callback = xen_shutdown_handler; 741 if (register_xenbus_watch(&shutdown_watch)) 742 cmn_err(CE_WARN, "Failed to set shutdown watcher"); 743 744 sysrq_watch.node = "control/sysrq"; 745 sysrq_watch.callback = xen_sysrq_handler; 746 if (register_xenbus_watch(&sysrq_watch)) 747 cmn_err(CE_WARN, "Failed to set sysrq watcher"); 748 } 749 balloon_init(xen_info->nr_pages); 750 } 751 752 #ifdef DEBUG 753 #define XEN_PRINTF_BUFSIZE 1024 754 755 char xen_printf_buffer[XEN_PRINTF_BUFSIZE]; 756 757 /* 758 * Printf function that calls hypervisor directly. For DomU it only 759 * works when running on a xen hypervisor built with debug on. Works 760 * always since no I/O ring interaction is needed. 761 */ 762 /*PRINTFLIKE1*/ 763 void 764 xen_printf(const char *fmt, ...) 765 { 766 va_list ap; 767 768 va_start(ap, fmt); 769 (void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap); 770 va_end(ap); 771 772 (void) HYPERVISOR_console_io(CONSOLEIO_write, 773 strlen(xen_printf_buffer), xen_printf_buffer); 774 } 775 #else 776 void 777 xen_printf(const char *fmt, ...) 778 { 779 } 780 #endif /* DEBUG */ 781 782 /* 783 * Determine helpful version information. 784 * 785 * (And leave a copy around in the data segment so we can look 786 * at them later with e.g. kmdb.) 787 */ 788 struct xenver { 789 char *xv_ver; 790 char *xv_chgset; 791 char *xv_compiler; 792 char *xv_compile_date; 793 char *xv_compile_by; 794 char *xv_compile_domain; 795 char *xv_caps; 796 } xenver; 797 798 static char * 799 sprintf_alloc(const char *fmt, ...) 800 { 801 va_list ap; 802 size_t len; 803 char *p; 804 805 va_start(ap, fmt); 806 len = 1 + vsnprintf(NULL, 0, fmt, ap); 807 p = kmem_alloc(len, KM_SLEEP); 808 (void) vsnprintf(p, len, fmt, ap); 809 va_end(ap); 810 return (p); 811 } 812 813 void 814 xen_version(void) 815 { 816 static const char strfmt[] = "%s"; 817 static const char xenver_sun[] = "3.0.4-1-xvm"; /* XXPV */ 818 union { 819 xen_extraversion_t xver; 820 xen_changeset_info_t chgset; 821 xen_compile_info_t build; 822 xen_capabilities_info_t caps; 823 } data, *src = &data; 824 825 ulong_t ver = HYPERVISOR_xen_version(XENVER_version, 0); 826 827 if (HYPERVISOR_xen_version(XENVER_extraversion, src) == 0) { 828 ((char *)(src->xver))[sizeof (src->xver) - 1] = '\0'; 829 } else 830 ((char *)(src->xver))[0] = '\0'; 831 832 xenver.xv_ver = sprintf_alloc("%lu.%lu%s", 833 BITX(ver, 31, 16), BITX(ver, 15, 0), src->xver); 834 835 if (HYPERVISOR_xen_version(XENVER_changeset, src) == 0) { 836 ((char *)(src->chgset))[sizeof (src->chgset) - 1] = '\0'; 837 xenver.xv_chgset = sprintf_alloc(strfmt, src->chgset); 838 } 839 840 cmn_err(CE_CONT, "?xen v%s chgset '%s'\n", 841 xenver.xv_ver, xenver.xv_chgset); 842 843 /* 844 * XXPV - Solaris guests currently require special version of 845 * the hypervisor from Sun to function properly called "3.0.4-1-xvm". 846 * This version is based on "3.0.4-1" plus changes from 847 * Sun that are a work-in-progress. 848 * 849 * This version check will disappear after appropriate fixes 850 * are accepted upstream. 851 */ 852 if (strcmp(xenver.xv_ver, xenver_sun) != 0) { 853 cmn_err(CE_WARN, "Found xen v%s but need xen v%s", 854 xenver.xv_ver, xenver_sun); 855 cmn_err(CE_WARN, "The kernel may not function correctly"); 856 } 857 858 if (HYPERVISOR_xen_version(XENVER_compile_info, src) == 0) { 859 xenver.xv_compiler = sprintf_alloc(strfmt, 860 data.build.compiler); 861 xenver.xv_compile_date = sprintf_alloc(strfmt, 862 data.build.compile_date); 863 xenver.xv_compile_by = sprintf_alloc(strfmt, 864 data.build.compile_by); 865 xenver.xv_compile_domain = sprintf_alloc(strfmt, 866 data.build.compile_domain); 867 } 868 869 /* 870 * Capabilities are a set of space separated ascii strings 871 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64' 872 */ 873 if (HYPERVISOR_xen_version(XENVER_capabilities, src) == 0) { 874 ((char *)(src->caps))[sizeof (src->caps) - 1] = '\0'; 875 xenver.xv_caps = sprintf_alloc(strfmt, src->caps); 876 } 877 } 878 879 /* 880 * Miscellaneous hypercall wrappers with slightly more verbose diagnostics. 881 */ 882 883 void 884 xen_set_gdt(ulong_t *frame_list, int entries) 885 { 886 int err; 887 if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) { 888 /* 889 * X_EINVAL: reserved entry or bad frames 890 * X_EFAULT: bad address 891 */ 892 panic("xen_set_gdt(%p, %d): error %d", 893 (void *)frame_list, entries, -(int)err); 894 } 895 } 896 897 void 898 xen_set_ldt(user_desc_t *ldt, uint_t nsels) 899 { 900 struct mmuext_op op; 901 long err; 902 903 op.cmd = MMUEXT_SET_LDT; 904 op.arg1.linear_addr = (uintptr_t)ldt; 905 op.arg2.nr_ents = nsels; 906 907 if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) { 908 panic("xen_set_ldt(%p, %d): error %d", 909 (void *)ldt, nsels, -(int)err); 910 } 911 } 912 913 void 914 xen_stack_switch(ulong_t ss, ulong_t esp) 915 { 916 long err; 917 918 if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) { 919 /* 920 * X_EPERM: bad selector 921 */ 922 panic("xen_stack_switch(%lx, %lx): error %d", ss, esp, 923 -(int)err); 924 } 925 } 926 927 long 928 xen_set_trap_table(trap_info_t *table) 929 { 930 long err; 931 932 if ((err = HYPERVISOR_set_trap_table(table)) != 0) { 933 /* 934 * X_EFAULT: bad address 935 * X_EPERM: bad selector 936 */ 937 panic("xen_set_trap_table(%p): error %d", (void *)table, 938 -(int)err); 939 } 940 return (err); 941 } 942 943 #if defined(__amd64) 944 void 945 xen_set_segment_base(int reg, ulong_t value) 946 { 947 long err; 948 949 if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) { 950 /* 951 * X_EFAULT: bad address 952 * X_EINVAL: bad type 953 */ 954 panic("xen_set_segment_base(%d, %lx): error %d", 955 reg, value, -(int)err); 956 } 957 } 958 #endif /* __amd64 */ 959 960 /* 961 * Translate a hypervisor errcode to a Solaris error code. 962 */ 963 int 964 xen_xlate_errcode(int error) 965 { 966 switch (-error) { 967 968 /* 969 * Translate hypervisor errno's into native errno's 970 */ 971 972 #define CASE(num) case X_##num: error = num; break 973 974 CASE(EPERM); CASE(ENOENT); CASE(ESRCH); 975 CASE(EINTR); CASE(EIO); CASE(ENXIO); 976 CASE(E2BIG); CASE(ENOMEM); CASE(EACCES); 977 CASE(EFAULT); CASE(EBUSY); CASE(EEXIST); 978 CASE(ENODEV); CASE(EISDIR); CASE(EINVAL); 979 CASE(ENOSPC); CASE(ESPIPE); CASE(EROFS); 980 CASE(ENOSYS); CASE(ENOTEMPTY); CASE(EISCONN); 981 CASE(ENODATA); 982 983 #undef CASE 984 985 default: 986 panic("xen_xlate_errcode: unknown error %d", error); 987 } 988 989 return (error); 990 } 991 992 /* 993 * Raise PS_IOPL on current vcpu to user level. 994 * Caller responsible for preventing kernel preemption. 995 */ 996 void 997 xen_enable_user_iopl(void) 998 { 999 physdev_set_iopl_t set_iopl; 1000 set_iopl.iopl = 3; /* user ring 3 */ 1001 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1002 } 1003 1004 /* 1005 * Drop PS_IOPL on current vcpu to kernel level 1006 */ 1007 void 1008 xen_disable_user_iopl(void) 1009 { 1010 physdev_set_iopl_t set_iopl; 1011 set_iopl.iopl = 1; /* kernel pseudo ring 1 */ 1012 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 1013 } 1014 1015 int 1016 xen_gdt_setprot(cpu_t *cp, uint_t prot) 1017 { 1018 int err; 1019 #if defined(__amd64) 1020 int pt_bits = PT_VALID; 1021 if (prot & PROT_WRITE) 1022 pt_bits |= PT_WRITABLE; 1023 #endif 1024 1025 if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt, 1026 MMU_PAGESIZE, prot)) != 0) 1027 goto done; 1028 1029 #if defined(__amd64) 1030 err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits); 1031 #endif 1032 1033 done: 1034 if (err) { 1035 cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d", 1036 cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only", 1037 err); 1038 } 1039 1040 return (err); 1041 } 1042 1043 int 1044 xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot) 1045 { 1046 int err; 1047 caddr_t lva = (caddr_t)ldt; 1048 #if defined(__amd64) 1049 int pt_bits = PT_VALID; 1050 pgcnt_t npgs; 1051 if (prot & PROT_WRITE) 1052 pt_bits |= PT_WRITABLE; 1053 #endif /* __amd64 */ 1054 1055 if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0) 1056 goto done; 1057 1058 #if defined(__amd64) 1059 1060 ASSERT(IS_P2ALIGNED(lsize, PAGESIZE)); 1061 npgs = mmu_btop(lsize); 1062 while (npgs--) { 1063 if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva), 1064 pt_bits)) != 0) 1065 break; 1066 lva += PAGESIZE; 1067 } 1068 #endif /* __amd64 */ 1069 1070 done: 1071 if (err) { 1072 cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d", 1073 (void *)lva, 1074 (prot & PROT_WRITE) ? "writable" : "read-only", err); 1075 } 1076 1077 return (err); 1078 } 1079