/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/* derived from netbsd's xen_machdep.c 1.1.2.1 */

/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. This section intentionally left blank.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Section 3 of the above license was updated in response to bug 6379571.
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/trap.h>
#include <sys/segments.h>
#include <sys/hypervisor.h>
#include <sys/xen_mmu.h>
#include <sys/machsystm.h>
#include <sys/promif.h>
#include <sys/bootconf.h>
#include <sys/bootinfo.h>
#include <sys/cpr.h>
#include <sys/taskq.h>
#include <sys/uadmin.h>
#include <sys/evtchn_impl.h>
#include <sys/archsystm.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/mach_mmu.h>
#include <vm/hat_i86.h>
#include <sys/gnttab.h>
#include <sys/reboot.h>
#include <sys/stack.h>
#include <sys/clock.h>
#include <sys/bitmap.h>
#include <sys/processor.h>
#include <sys/xen_errno.h>
#include <sys/xpv_panic.h>
#include <sys/smp_impldefs.h>
#include <sys/cpu.h>
#include <sys/balloon_impl.h>
#include <sys/ddi.h>

#ifdef DEBUG
#define	SUSPEND_DEBUG if (xen_suspend_debug) xen_printf
#else
#define	SUSPEND_DEBUG(...)
#endif

int cpr_debug;
cpuset_t cpu_suspend_lost_set;
static int xen_suspend_debug;

void
xen_set_callback(void (*func)(void), uint_t type, uint_t flags)
{
	struct callback_register cb;

	bzero(&cb, sizeof (cb));
#if defined(__amd64)
	cb.address = (ulong_t)func;
#elif defined(__i386)
	cb.address.cs = KCS_SEL;
	cb.address.eip = (ulong_t)func;
#endif
	cb.type = type;
	cb.flags = flags;

	/*
	 * XXPV always ignore return value for NMI
	 */
	if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 &&
	    type != CALLBACKTYPE_nmi)
		panic("HYPERVISOR_callback_op failed");
}

void
xen_init_callbacks(void)
{
	/*
	 * register event (interrupt) handler.
	 */
	xen_set_callback(xen_callback, CALLBACKTYPE_event, 0);

	/*
	 * failsafe handler.
	 */
	xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe,
	    CALLBACKF_mask_events);

	/*
	 * NMI handler.
	 */
	xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0);

	/*
	 * system call handler
	 * XXPV move to init_cpu_syscall?
	 */
#if defined(__amd64)
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);
#endif	/* __amd64 */
}


/*
 * cmn_err() followed by a 1/4 second delay; this gives the
 * logging service a chance to flush messages and helps avoid
 * intermixing output from prom_printf().
 * XXPV: doesn't exactly help us on UP though.
 */
/*PRINTFLIKE2*/
void
cpr_err(int ce, const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vcmn_err(ce, fmt, adx);
	va_end(adx);
	drv_usecwait(MICROSEC >> 2);
}

void
xen_suspend_devices(void)
{
	int rc;

	SUSPEND_DEBUG("xen_suspend_devices\n");

	if ((rc = cpr_suspend_devices(ddi_root_node())) != 0)
		panic("failed to suspend devices: %d", rc);
}

void
xen_resume_devices(void)
{
	int rc;

	SUSPEND_DEBUG("xen_resume_devices\n");

	if ((rc = cpr_resume_devices(ddi_root_node(), 0)) != 0)
		panic("failed to resume devices: %d", rc);
}
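
/*
 * A sketch of the structure rebuilt below, derived from how the code
 * walks it: the shared info's pfn_to_mfn_frame_list_list holds the MFN
 * of the page containing mfn_list_pages_page; the entries of that page
 * hold the MFNs of the pages making up mfn_list_pages; those entries
 * in turn hold the MFNs of the pages of mfn_list itself.  After a
 * restore or migration every MFN in the system has changed, so each
 * level must be recomputed from the (unchanged) PFNs.
 */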
/*
 * The list of mfn pages is out of date.  Recompute it.
 */
static void
rebuild_mfn_list(void)
{
	int i = 0;
	size_t sz;
	size_t off;
	pfn_t pfn;

	SUSPEND_DEBUG("rebuild_mfn_list\n");

	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;

	for (off = 0; off < sz; off += MMU_PAGESIZE) {
		size_t j = mmu_btop(off);
		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
			pfn = hat_getpfnum(kas.a_hat,
			    (caddr_t)&mfn_list_pages[j]);
			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
		}

		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
		mfn_list_pages[j] = pfn_to_mfn(pfn);
	}

	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
	    = pfn_to_mfn(pfn);
}

static void
suspend_cpus(void)
{
	int i;

	SUSPEND_DEBUG("suspend_cpus\n");

	mp_enter_barrier();

	for (i = 1; i < ncpus; i++) {
		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
			SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
			(void) xen_vcpu_down(i);
		}

		mach_cpucontext_reset(cpu[i]);
	}
}

static void
resume_cpus(void)
{
	int i;

	for (i = 1; i < ncpus; i++) {
		if (cpu[i] == NULL)
			continue;

		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
			SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
			mach_cpucontext_restore(cpu[i]);
			(void) xen_vcpu_up(i);
		}
	}

	mp_leave_barrier();
}
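
/*
 * A rough sketch of the ordering implemented by xen_suspend_domain()
 * below: suspend devices, then xenbus; park the other VCPUs; quiesce
 * event channels and grant tables; disable interrupts and suspend the
 * time subsystem; finally HYPERVISOR_suspend().  Resume unwinds the
 * same steps in the opposite order.  Once the event channels have been
 * suspended, no locks may be taken until they have been resumed again.
 */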
/*
 * Top level routine to direct suspend/resume of a domain.
 */
void
xen_suspend_domain(void)
{
	extern void rtcsync(void);
	extern hrtime_t hres_last_tick;
	mfn_t start_info_mfn;
	ulong_t flags;
	pfn_t pfn;
	int i;

	/*
	 * XXPV - Are we definitely OK to suspend by the time we've connected
	 * the handler?
	 */

	cpr_err(CE_NOTE, "Domain suspending for save/migrate");

	SUSPEND_DEBUG("xen_suspend_domain\n");

	/*
	 * suspend interrupts and devices
	 * XXPV - we use suspend/resume for both save/restore domains (like
	 * sun cpr) and for migration.  Would be nice to know the difference
	 * if possible.  For save/restore where down time may be a long time,
	 * we may want to do more of the things that cpr does.  (i.e. notify
	 * user processes, shrink memory footprint for faster restore, etc.)
	 */
	xen_suspend_devices();
	SUSPEND_DEBUG("xenbus_suspend\n");
	xenbus_suspend();

	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
	start_info_mfn = pfn_to_mfn(pfn);

	/*
	 * XXPV: cpu hotplug can hold this under a xenbus watch.  Are we safe
	 * wrt xenbus being suspended here?
	 */
	mutex_enter(&cpu_lock);

	/*
	 * Suspend must be done on vcpu 0, as no context for other CPUs is
	 * saved.
	 *
	 * XXPV - add to taskq API ?
	 */
	thread_affinity_set(curthread, 0);
	kpreempt_disable();

	SUSPEND_DEBUG("xen_start_migrate\n");
	xen_start_migrate();
	if (ncpus > 1)
		suspend_cpus();

	/*
	 * We can grab the ec_lock as it's a spinlock with a high SPL.  Hence
	 * any holder would have dropped it to get through suspend_cpus().
	 */
	mutex_enter(&ec_lock);

	/*
	 * From here on in, we can't take locks.
	 */
	SUSPEND_DEBUG("ec_suspend\n");
	ec_suspend();
	SUSPEND_DEBUG("gnttab_suspend\n");
	gnttab_suspend();

	flags = intr_clear();

	xpv_time_suspend();

	/*
	 * Currently, the hypervisor incorrectly fails to bring back
	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
	 * to prevent any attempts to operate on them.  But we have to do
	 * this *after* the very first time we do ec_suspend().
	 */
	for (i = 1; i < ncpus; i++) {
		if (cpu[i] == NULL)
			continue;

		if (cpu_get_state(cpu[i]) == P_POWEROFF)
			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
	}

	/*
	 * The dom0 save/migrate code doesn't automatically translate
	 * these into PFNs, but expects them to be, so we do it here.
	 * We don't use mfn_to_pfn() because so many OS services have
	 * been disabled at this point.
	 */
	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
	xen_info->console.domU.mfn =
	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];

	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
		prom_printf("xen_suspend_domain(): "
		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    0, UVMF_INVLPG)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_update_va_mapping() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	SUSPEND_DEBUG("HYPERVISOR_suspend\n");

	/*
	 * At this point we suspend and sometime later resume.
	 */
	if (HYPERVISOR_suspend(start_info_mfn)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_suspend() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/*
	 * Point HYPERVISOR_shared_info to its new value.
	 */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
	    UVMF_INVLPG))
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);

	if (xen_info->nr_pages != mfn_count) {
		prom_printf("xen_suspend_domain(): number of pages"
		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
		    xen_info->nr_pages);
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	xpv_time_resume();

	cached_max_mfn = 0;

	SUSPEND_DEBUG("gnttab_resume\n");
	gnttab_resume();

	/* XXPV: add a note that this must be lockless. */
	SUSPEND_DEBUG("ec_resume\n");
	ec_resume();

	intr_restore(flags);

	if (ncpus > 1)
		resume_cpus();

	mutex_exit(&ec_lock);
	xen_end_migrate();
	mutex_exit(&cpu_lock);

	/*
	 * Now we can take locks again.
	 */

	/*
	 * Force the tick value used for tv_nsec in hres_tick() to be up to
	 * date.  rtcsync() will reset the hrestime value appropriately.
	 */
	hres_last_tick = xpv_gethrtime();

	/*
	 * XXPV: we need to have resumed the CPUs since this takes locks, but
	 * can remote CPUs see bad state?  Presumably yes.  Should probably
	 * nest taking of todlock inside of cpu_lock, or vice versa, then
	 * provide an unlocked version.  Probably need to call clkinitf to
	 * reset cpu freq and re-calibrate if we migrated to a different
	 * speed cpu.  Also need to make a (re)init_cpu_info call to update
	 * processor info structs and device tree info.  That remains to be
	 * written at the moment.
	 */
	rtcsync();

	rebuild_mfn_list();

	SUSPEND_DEBUG("xenbus_resume\n");
	xenbus_resume();
	SUSPEND_DEBUG("xenbus_resume_devices\n");
	xen_resume_devices();

	thread_affinity_clear(curthread);
	kpreempt_enable();

	SUSPEND_DEBUG("finished xen_suspend_domain\n");
	cmn_err(CE_NOTE, "domain restore/migrate completed");
}

/*ARGSUSED*/
int
xen_debug_handler(void *arg)
{
	debug_enter("External debug event received");

	/*
	 * If we've not got KMDB loaded, output some stuff difficult to
	 * capture from a domain core.
	 */
	if (!(boothowto & RB_DEBUG)) {
		shared_info_t *si = HYPERVISOR_shared_info;
		int i;

		prom_printf("evtchn_pending [ ");
		for (i = 0; i < 8; i++)
			prom_printf("%lx ", si->evtchn_pending[i]);
		prom_printf("]\nevtchn_mask [ ");
		for (i = 0; i < 8; i++)
			prom_printf("%lx ", si->evtchn_mask[i]);
		prom_printf("]\n");

		for (i = 0; i < ncpus; i++) {
			vcpu_info_t *vcpu = &si->vcpu_info[i];
			if (cpu[i] == NULL)
				continue;
			prom_printf("CPU%d pending %d mask %d sel %lx\n",
			    i, vcpu->evtchn_upcall_pending,
			    vcpu->evtchn_upcall_mask,
			    vcpu->evtchn_pending_sel);
		}
	}

	return (0);
}
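
/*
 * Handle a sysrq request for this domain.  The tools deliver a sysrq
 * by writing a single character to the "control/sysrq" xenstore node
 * (e.g. via "xm sysrq <dom> b"); our watch fires, and we consume and
 * remove the key within a transaction so that each request is
 * processed exactly once.
 */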
/*ARGSUSED*/
static void
xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
    unsigned int len)
{
	xenbus_transaction_t xbt;
	char key = '\0';
	int ret;

retry:
	if (xenbus_transaction_start(&xbt)) {
		cmn_err(CE_WARN, "failed to start sysrq transaction");
		return;
	}

	if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
		/*
		 * ENOENT happens in response to our own xenbus_rm.
		 * XXPV - this happens spuriously on boot?
		 */
		if (ret != ENOENT)
			cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
		goto out;
	}

	if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
		cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
		goto out;
	}

	if (xenbus_transaction_end(xbt, 0) == EAGAIN)
		goto retry;

	/*
	 * Somewhat arbitrary - on Linux this means 'reboot'.  We could just
	 * accept any key, but this might increase the risk of sending a
	 * harmless sysrq to the wrong domain...
	 */
	if (key == 'b')
		(void) xen_debug_handler(NULL);
	else
		cmn_err(CE_WARN, "Ignored sysrq %c", key);
	return;

out:
	(void) xenbus_transaction_end(xbt, 1);
}

taskq_t *xen_shutdown_tq;

#define	SHUTDOWN_INVALID	-1
#define	SHUTDOWN_POWEROFF	0
#define	SHUTDOWN_REBOOT		1
#define	SHUTDOWN_SUSPEND	2
#define	SHUTDOWN_HALT		3
#define	SHUTDOWN_MAX		4

#define	SHUTDOWN_TIMEOUT_SECS	(60 * 5)

static const char *cmd_strings[SHUTDOWN_MAX] = {
	"poweroff",
	"reboot",
	"suspend",
	"halt"
};

static void
xen_dirty_shutdown(void *arg)
{
	int cmd = (uintptr_t)arg;

	cmn_err(CE_WARN, "Externally requested shutdown failed or "
	    "timed out.\nShutting down.\n");

	switch (cmd) {
	case SHUTDOWN_HALT:
	case SHUTDOWN_POWEROFF:
		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
		break;
	case SHUTDOWN_REBOOT:
		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
		break;
	}
}

static void
xen_shutdown(void *arg)
{
	nvlist_t *attr_list = NULL;
	sysevent_t *event = NULL;
	sysevent_id_t eid;
	int cmd = (uintptr_t)arg;
	int err;

	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);

	if (cmd == SHUTDOWN_SUSPEND) {
		xen_suspend_domain();
		return;
	}

	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_SLEEP);
	if (err != DDI_SUCCESS)
		goto failure;

	err = nvlist_add_string(attr_list, "shutdown", cmd_strings[cmd]);
	if (err != DDI_SUCCESS)
		goto failure;

	if ((event = sysevent_alloc("EC_xpvsys", "control", "SUNW:kern:xpv",
	    SE_SLEEP)) == NULL)
		goto failure;
	(void) sysevent_attach_attributes(event,
	    (sysevent_attr_list_t *)attr_list);

	err = log_sysevent(event, SE_SLEEP, &eid);

	sysevent_detach_attributes(event);
	sysevent_free(event);

	if (err != 0)
		goto failure;

	(void) timeout(xen_dirty_shutdown, arg,
	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));

	nvlist_free(attr_list);
	return;

failure:
	if (attr_list != NULL)
		nvlist_free(attr_list);
	xen_dirty_shutdown(arg);
}
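
/*
 * Handle a shutdown request from the tools: "xm shutdown" and friends
 * write the request ("poweroff", "reboot", "suspend" or "halt") to the
 * "control/shutdown" xenstore node.  We acknowledge the request by
 * clearing the node, then hand the work off to xen_shutdown_tq so the
 * xenbus watch thread isn't blocked for the duration.  For everything
 * except suspend, xen_shutdown() logs a sysevent for userland to act
 * on and arms xen_dirty_shutdown() as a fallback should that never
 * happen.
 */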
/*ARGSUSED*/
static void
xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
    unsigned int len)
{
	char *str;
	xenbus_transaction_t xbt;
	int err, shutdown_code = SHUTDOWN_INVALID;
	unsigned int slen;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return;
	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
		(void) xenbus_transaction_end(xbt, 1);
		return;
	}

	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);

	/*
	 * If this is a watch fired from our write below, bail out early to
	 * avoid an infinite loop.
	 */
	if (strcmp(str, "") == 0) {
		(void) xenbus_transaction_end(xbt, 0);
		kmem_free(str, slen);
		return;
	} else if (strcmp(str, "poweroff") == 0) {
		shutdown_code = SHUTDOWN_POWEROFF;
	} else if (strcmp(str, "reboot") == 0) {
		shutdown_code = SHUTDOWN_REBOOT;
	} else if (strcmp(str, "suspend") == 0) {
		shutdown_code = SHUTDOWN_SUSPEND;
	} else if (strcmp(str, "halt") == 0) {
		shutdown_code = SHUTDOWN_HALT;
	} else {
		printf("Ignoring shutdown request: %s\n", str);
	}

	/*
	 * XXPV	Should we check the value of xenbus_write() too, or are all
	 *	errors automatically folded into xenbus_transaction_end()?
	 */
	(void) xenbus_write(xbt, "control", "shutdown", "");
	err = xenbus_transaction_end(xbt, 0);
	if (err == EAGAIN) {
		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
		kmem_free(str, slen);
		goto again;
	}

	kmem_free(str, slen);
	if (shutdown_code != SHUTDOWN_INVALID) {
		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
		    (void *)(intptr_t)shutdown_code, 0);
	}
}

static struct xenbus_watch shutdown_watch;
static struct xenbus_watch sysrq_watch;

void
xen_late_startup(void)
{
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
		    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
		shutdown_watch.node = "control/shutdown";
		shutdown_watch.callback = xen_shutdown_handler;
		if (register_xenbus_watch(&shutdown_watch))
			cmn_err(CE_WARN, "Failed to set shutdown watcher");

		sysrq_watch.node = "control/sysrq";
		sysrq_watch.callback = xen_sysrq_handler;
		if (register_xenbus_watch(&sysrq_watch))
			cmn_err(CE_WARN, "Failed to set sysrq watcher");
	}
	balloon_init(xen_info->nr_pages);
}

#ifdef DEBUG
#define	XEN_PRINTF_BUFSIZE	1024

char xen_printf_buffer[XEN_PRINTF_BUFSIZE];

/*
 * Printf function that calls the hypervisor directly.  For DomU it only
 * works when running on a xen hypervisor built with debug on.  It can be
 * used at any point, though, since no I/O ring interaction is needed.
 */
/*PRINTFLIKE1*/
void
xen_printf(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) vsnprintf(xen_printf_buffer, XEN_PRINTF_BUFSIZE, fmt, ap);
	va_end(ap);

	(void) HYPERVISOR_console_io(CONSOLEIO_write,
	    strlen(xen_printf_buffer), xen_printf_buffer);
}
#else
void
xen_printf(const char *fmt, ...)
{
}
#endif	/* DEBUG */

/*
 * Determine helpful version information.
 *
 * (And leave a copy around in the data segment so we can look
 * at it later with e.g. kmdb.)
 */
struct xenver {
	char *xv_ver;
	char *xv_chgset;
	char *xv_compiler;
	char *xv_compile_date;
	char *xv_compile_by;
	char *xv_compile_domain;
	char *xv_caps;
} xenver;
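
/*
 * kmem_alloc() a formatted string of exactly the right length.  The
 * first vsnprintf() pass is given no buffer, so it only computes the
 * required length (plus one for the terminating NUL); the second pass
 * performs the actual formatting.  Note that the va_list must be
 * re-initialized with va_start() between the two traversals.
 */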
static char *
sprintf_alloc(const char *fmt, ...)
{
	va_list ap;
	size_t len;
	char *p;

	va_start(ap, fmt);
	len = 1 + vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);

	p = kmem_alloc(len, KM_SLEEP);

	va_start(ap, fmt);
	(void) vsnprintf(p, len, fmt, ap);
	va_end(ap);
	return (p);
}

void
xen_version(void)
{
	static const char strfmt[] = "%s";
	static const char xenver_sun[] = "3.0.4-1-xvm";	/* XXPV */
	union {
		xen_extraversion_t xver;
		xen_changeset_info_t chgset;
		xen_compile_info_t build;
		xen_capabilities_info_t caps;
	} data, *src = &data;

	ulong_t ver = HYPERVISOR_xen_version(XENVER_version, 0);

	if (HYPERVISOR_xen_version(XENVER_extraversion, src) == 0) {
		((char *)(src->xver))[sizeof (src->xver) - 1] = '\0';
	} else
		((char *)(src->xver))[0] = '\0';

	xenver.xv_ver = sprintf_alloc("%lu.%lu%s",
	    BITX(ver, 31, 16), BITX(ver, 15, 0), src->xver);

	if (HYPERVISOR_xen_version(XENVER_changeset, src) == 0) {
		((char *)(src->chgset))[sizeof (src->chgset) - 1] = '\0';
		xenver.xv_chgset = sprintf_alloc(strfmt, src->chgset);
	}

	cmn_err(CE_CONT, "?xen v%s chgset '%s'\n",
	    xenver.xv_ver, xenver.xv_chgset);

	/*
	 * XXPV - Solaris guests currently require a special version of
	 * the hypervisor from Sun, "3.0.4-1-xvm", to function properly.
	 * This version is based on "3.0.4-1" plus changes from
	 * Sun that are a work-in-progress.
	 *
	 * This version check will disappear after appropriate fixes
	 * are accepted upstream.
	 */
	if (strcmp(xenver.xv_ver, xenver_sun) != 0) {
		cmn_err(CE_WARN, "Found xen v%s but need xen v%s",
		    xenver.xv_ver, xenver_sun);
		cmn_err(CE_WARN, "The kernel may not function correctly");
	}

	if (HYPERVISOR_xen_version(XENVER_compile_info, src) == 0) {
		xenver.xv_compiler = sprintf_alloc(strfmt,
		    data.build.compiler);
		xenver.xv_compile_date = sprintf_alloc(strfmt,
		    data.build.compile_date);
		xenver.xv_compile_by = sprintf_alloc(strfmt,
		    data.build.compile_by);
		xenver.xv_compile_domain = sprintf_alloc(strfmt,
		    data.build.compile_domain);
	}

	/*
	 * Capabilities are a set of space-separated ASCII strings,
	 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'.
	 */
	if (HYPERVISOR_xen_version(XENVER_capabilities, src) == 0) {
		((char *)(src->caps))[sizeof (src->caps) - 1] = '\0';
		xenver.xv_caps = sprintf_alloc(strfmt, src->caps);
	}
}

/*
 * Miscellaneous hypercall wrappers with slightly more verbose diagnostics.
 */
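
/*
 * Hypercalls return 0 on success and a negative Xen errno value on
 * failure, which is why the wrappers below negate the result when
 * constructing their panic messages.  A caller that prefers a Solaris
 * errno to a panic can feed the raw return value to
 * xen_xlate_errcode(), e.g. (hypothetical hypercall):
 *
 *	if ((err = HYPERVISOR_some_op(&arg)) != 0)
 *		return (xen_xlate_errcode(err));
 */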

void
xen_set_gdt(ulong_t *frame_list, int entries)
{
	int err;
	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
		/*
		 * X_EINVAL:	reserved entry or bad frames
		 * X_EFAULT:	bad address
		 */
		panic("xen_set_gdt(%p, %d): error %d",
		    (void *)frame_list, entries, -(int)err);
	}
}

void
xen_set_ldt(user_desc_t *ldt, uint_t nsels)
{
	struct mmuext_op op;
	long err;

	op.cmd = MMUEXT_SET_LDT;
	op.arg1.linear_addr = (uintptr_t)ldt;
	op.arg2.nr_ents = nsels;

	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
		panic("xen_set_ldt(%p, %d): error %d",
		    (void *)ldt, nsels, -(int)err);
	}
}

void
xen_stack_switch(ulong_t ss, ulong_t esp)
{
	long err;

	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
		/*
		 * X_EPERM:	bad selector
		 */
		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
		    -(int)err);
	}
}

long
xen_set_trap_table(trap_info_t *table)
{
	long err;

	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
		/*
		 * X_EFAULT:	bad address
		 * X_EPERM:	bad selector
		 */
		panic("xen_set_trap_table(%p): error %d", (void *)table,
		    -(int)err);
	}
	return (err);
}

#if defined(__amd64)
void
xen_set_segment_base(int reg, ulong_t value)
{
	long err;

	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
		/*
		 * X_EFAULT:	bad address
		 * X_EINVAL:	bad type
		 */
		panic("xen_set_segment_base(%d, %lx): error %d",
		    reg, value, -(int)err);
	}
}
#endif	/* __amd64 */

/*
 * Translate a hypervisor errcode to a Solaris error code.
 */
int
xen_xlate_errcode(int error)
{
	switch (-error) {

	/*
	 * Translate hypervisor errno's into native errno's, e.g.
	 * CASE(EPERM) expands to: case X_EPERM: error = EPERM; break
	 */

#define	CASE(num)	case X_##num: error = num; break

	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
	CASE(ENODATA);

#undef CASE

	default:
		panic("xen_xlate_errcode: unknown error %d", error);
	}

	return (error);
}

/*
 * Raise PS_IOPL on current vcpu to user level.
 * Caller responsible for preventing kernel preemption.
 */
void
xen_enable_user_iopl(void)
{
	physdev_set_iopl_t set_iopl;
	set_iopl.iopl = 3;		/* user ring 3 */
	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

/*
 * Drop PS_IOPL on current vcpu to kernel level.
 */
void
xen_disable_user_iopl(void)
{
	physdev_set_iopl_t set_iopl;
	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}
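
/*
 * Under the hypervisor, pages backing descriptor tables that have been
 * handed to Xen (via set_gdt or MMUEXT_SET_LDT) must be mapped
 * read-only, since Xen validates all updates to them.  The routines
 * below adjust the protections accordingly; on amd64 this applies to
 * the kpm mapping of the underlying physical page as well as to the
 * kernel address used to access it, hence the xen_kpm_page() calls.
 */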
"writable" : "read-only", 998 err); 999 } 1000 1001 return (err); 1002 } 1003 1004 int 1005 xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot) 1006 { 1007 int err; 1008 caddr_t lva = (caddr_t)ldt; 1009 #if defined(__amd64) 1010 int pt_bits = PT_VALID; 1011 pgcnt_t npgs; 1012 if (prot & PROT_WRITE) 1013 pt_bits |= PT_WRITABLE; 1014 #endif /* __amd64 */ 1015 1016 if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0) 1017 goto done; 1018 1019 #if defined(__amd64) 1020 1021 ASSERT(IS_P2ALIGNED(lsize, PAGESIZE)); 1022 npgs = mmu_btop(lsize); 1023 while (npgs--) { 1024 if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva), 1025 pt_bits)) != 0) 1026 break; 1027 lva += PAGESIZE; 1028 } 1029 #endif /* __amd64 */ 1030 1031 done: 1032 if (err) { 1033 cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d", 1034 (void *)lva, 1035 (prot & PROT_WRITE) ? "writable" : "read-only", err); 1036 } 1037 1038 return (err); 1039 } 1040