xref: /titanic_52/usr/src/uts/i86xpv/os/mp_xen.c (revision df4705eab9c873eae60bada4f2138e6f22a76e11)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/param.h>
32 #include <sys/taskq.h>
33 #include <sys/cmn_err.h>
34 #include <sys/archsystm.h>
35 #include <sys/machsystm.h>
36 #include <sys/segments.h>
37 #include <sys/cpuvar.h>
38 #include <sys/psw.h>
39 #include <sys/x86_archext.h>
40 #include <sys/controlregs.h>
41 #include <vm/as.h>
42 #include <vm/hat.h>
43 #include <vm/hat_i86.h>
44 #include <sys/mman.h>
45 #include <sys/hypervisor.h>
46 #include <xen/sys/xenbus_impl.h>
47 #include <sys/xpv_panic.h>
48 #include <util/sscanf.h>
49 #include <sys/cpu.h>
50 #include <asm/cpu.h>
51 
52 #include <xen/public/vcpu.h>
53 #include <xen/public/io/xs_wire.h>
54 
55 struct xen_evt_data cpu0_evt_data;		/* cpu0's pending event data */
56 
57 static taskq_t *cpu_config_tq;
58 static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
59 static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
60 
/*
 * Hook for allocating whatever global state is needed while starting
 * cpus.  Virtual cpus need none, so this always reports success (0).
 */
int
mach_cpucontext_init(void)
{
	const int status = 0;	/* nothing to set up, so nothing can fail */

	return (status);
}
70 
71 void
72 do_cpu_config_watch(int state)
73 {
74 	static struct xenbus_watch cpu_config_watch;
75 
76 	if (state != XENSTORE_UP)
77 		return;
78 	cpu_config_watch.node = "cpu";
79 	cpu_config_watch.callback = vcpu_config_event;
80 	if (register_xenbus_watch(&cpu_config_watch)) {
81 		taskq_destroy(cpu_config_tq);
82 		cmn_err(CE_WARN, "do_cpu_config_watch: "
83 		    "failed to set vcpu config watch");
84 	}
85 
86 }
87 
88 /*
89  * This routine is called after all the "normal" MP startup has
90  * been done; a good place to start watching xen store for virtual
91  * cpu hot plug events.
92  */
93 void
94 mach_cpucontext_fini(void)
95 {
96 
97 	cpu_config_tq = taskq_create("vcpu config taskq", 1,
98 	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
99 
100 	(void) xs_register_xenbus_callback(do_cpu_config_watch);
101 }
102 
103 /*
104  * Fill in the remaining CPU context and initialize it.
105  */
106 static int
107 mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
108 {
109 	uint_t vec, iopl;
110 
111 	vgc->flags = VGCF_IN_KERNEL;
112 
113 	/*
114 	 * fpu_ctx we leave as zero; on first fault we'll store
115 	 * sse_initial into it anyway.
116 	 */
117 
118 #if defined(__amd64)
119 	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
120 #else
121 	vgc->user_regs.cs = KCS_SEL;
122 #endif
123 	vgc->user_regs.ds = KDS_SEL;
124 	vgc->user_regs.es = KDS_SEL;
125 	vgc->user_regs.ss = KDS_SEL;
126 	vgc->kernel_ss = KDS_SEL;
127 
128 	/*
129 	 * Allow I/O privilege level for Dom0 kernel.
130 	 */
131 	if (DOMAIN_IS_INITDOMAIN(xen_info))
132 		iopl = (PS_IOPL & 0x1000); /* ring 1 */
133 	else
134 		iopl = 0;
135 
136 #if defined(__amd64)
137 	vgc->user_regs.fs = 0;
138 	vgc->user_regs.gs = 0;
139 	vgc->user_regs.rflags = F_OFF | iopl;
140 #elif defined(__i386)
141 	vgc->user_regs.fs = KFS_SEL;
142 	vgc->user_regs.gs = KGS_SEL;
143 	vgc->user_regs.eflags = F_OFF | iopl;
144 	vgc->event_callback_cs = vgc->user_regs.cs;
145 	vgc->failsafe_callback_cs = vgc->user_regs.cs;
146 #endif
147 
148 	/*
149 	 * Initialize the trap_info_t from the IDT
150 	 */
151 #if !defined(__lint)
152 	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
153 #endif
154 	for (vec = 0; vec < NIDT; vec++) {
155 		trap_info_t *ti = &vgc->trap_ctxt[vec];
156 
157 		if (xen_idt_to_trap_info(vec,
158 		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
159 			ti->cs = KCS_SEL;
160 			ti->vector = vec;
161 		}
162 	}
163 
164 	/*
165 	 * No LDT
166 	 */
167 
168 	/*
169 	 * (We assert in various places that the GDT is (a) aligned on a
170 	 * page boundary and (b) one page long, so this really should fit..)
171 	 */
172 #ifdef CRASH_XEN
173 	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
174 #else
175 	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
176 #endif
177 	vgc->gdt_ents = NGDT;
178 
179 	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
180 
181 #if defined(__i386)
182 	if (mmu.pae_hat)
183 		vgc->ctrlreg[3] =
184 		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
185 	else
186 #endif
187 		vgc->ctrlreg[3] =
188 		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
189 
190 	vgc->ctrlreg[4] = getcr4();
191 
192 	vgc->event_callback_eip = (uintptr_t)xen_callback;
193 	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
194 	vgc->flags |= VGCF_failsafe_disables_events;
195 
196 #if defined(__amd64)
197 	/*
198 	 * XXPV should this be moved to init_cpu_syscall?
199 	 */
200 	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
201 	vgc->flags |= VGCF_syscall_disables_events;
202 
203 	ASSERT(vgc->user_regs.gs == 0);
204 	vgc->gs_base_kernel = (uintptr_t)cp;
205 #endif
206 
207 	return (xen_vcpu_initialize(cp->cpu_id, vgc));
208 }
209 
210 /*
211  * Create a guest virtual cpu context so that the virtual cpu
212  * springs into life in the domain just about to call mp_startup()
213  *
214  * Virtual CPUs must be initialized once in the lifetime of the domain;
215  * after that subsequent attempts to start them will fail with X_EEXIST.
216  *
217  * Thus 'alloc' -really- creates and initializes the virtual
218  * CPU context just once. Once the initialisation succeeds, we never
219  * free it, nor the regular cpu_t to which it refers.
220  */
221 void *
222 mach_cpucontext_alloc(struct cpu *cp)
223 {
224 	kthread_t *tp = cp->cpu_thread;
225 	vcpu_guest_context_t vgc;
226 
227 	int err = 1;
228 
229 	/*
230 	 * First, augment the incoming cpu structure
231 	 * - vcpu pointer reference
232 	 * - pending event storage area
233 	 * - physical address of GDT
234 	 */
235 	cp->cpu_m.mcpu_vcpu_info =
236 	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
237 	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
238 	    sizeof (struct xen_evt_data), KM_SLEEP);
239 	cp->cpu_m.mcpu_gdtpa =
240 	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
241 
242 	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
243 		goto done;
244 
245 	/*
246 	 * Now set up the vcpu context so that we can start this vcpu
247 	 * in the kernel at tp->t_pc (mp_startup).  Note that the
248 	 * thread will thread_exit() shortly after performing the
249 	 * initialization; in particular, we will *never* take a
250 	 * privilege transition on this thread.
251 	 */
252 
253 	bzero(&vgc, sizeof (vgc));
254 
255 #ifdef __amd64
256 	vgc.user_regs.rip = tp->t_pc;
257 	vgc.user_regs.rsp = tp->t_sp;
258 	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
259 #else
260 	vgc.user_regs.eip = tp->t_pc;
261 	vgc.user_regs.esp = tp->t_sp;
262 	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
263 #endif
264 	/*
265 	 * XXPV	Fix resume, if Russ didn't already fix it.
266 	 *
267 	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
268 	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
269 	 * that only lwps take traps that switch to the kernel stack;
270 	 * part of creating an lwp adjusts the stack by subtracting
271 	 * sizeof (struct regs) off t_stk.
272 	 *
273 	 * The more interesting question is, why do we do all the work
274 	 * of a fully fledged lwp for a plain thread?  In particular
275 	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
276 	 * or futz with the LDT.  This should probably all be done with
277 	 * an lwp context operator to keep pure thread context switch fast.
278 	 */
279 	vgc.kernel_sp = (ulong_t)tp->t_stk;
280 
281 	err = mp_set_cpu_context(&vgc, cp);
282 
283 done:
284 	if (err) {
285 		mach_cpucontext_free(cp, NULL, err);
286 		return (NULL);
287 	}
288 	return (cp);
289 }
290 
291 /*
292  * By the time we are called either we have successfully started
293  * the cpu, or our attempt to start it has failed.
294  */
295 
296 /*ARGSUSED*/
297 void
298 mach_cpucontext_free(struct cpu *cp, void *arg, int err)
299 {
300 	switch (err) {
301 	case 0:
302 		break;
303 	case ETIMEDOUT:
304 		/*
305 		 * The vcpu context is loaded into the hypervisor, and
306 		 * we've tried to start it, but the vcpu has not been set
307 		 * running yet, for whatever reason.  We arrange to -not-
308 		 * free any data structures it may be referencing.  In
309 		 * particular, we've already told the hypervisor about
310 		 * the GDT, and so we can't map it read-write again.
311 		 */
312 		break;
313 	default:
314 		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
315 		kmem_free(cp->cpu_m.mcpu_evt_pend,
316 		    sizeof (struct xen_evt_data));
317 		break;
318 	}
319 }
320 
321 /*
322  * Reset this CPU's context.  Clear out any pending evtchn data, since event
323  * channel numbers will all change when we resume.
324  */
325 void
326 mach_cpucontext_reset(cpu_t *cp)
327 {
328 	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
329 	/* mcpu_intr_pending ? */
330 }
331 
332 static void
333 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
334 {
335 #ifdef __amd64
336 	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
337 	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
338 	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
339 	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
340 	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
341 	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
342 	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
343 	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
344 #else /* __amd64 */
345 	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
346 	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
347 	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
348 	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
349 	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
350 	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
351 #endif /* __amd64 */
352 }
353 
354 /*
355  * Restore the context of a CPU during resume.  The CPU must either
356  * have been blocked in cpu_idle() (running the idle thread), if it was
357  * offline, or inside cpu_pause_thread().  Either way we can restore safely
358  * from the t_pcb.
359  */
360 void
361 mach_cpucontext_restore(cpu_t *cp)
362 {
363 	vcpu_guest_context_t vgc;
364 	int err;
365 
366 	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
367 	    cp->cpu_thread == cp->cpu_idle_thread);
368 
369 	bzero(&vgc, sizeof (vgc));
370 
371 	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
372 
373 	/*
374 	 * We're emulating a longjmp() here: in particular, we need to bump the
375 	 * stack pointer to account for the pop of xIP that returning from
376 	 * longjmp() normally would do, and set the return value in xAX to 1.
377 	 */
378 #ifdef __amd64
379 	vgc.user_regs.rax = 1;
380 	vgc.user_regs.rsp += sizeof (ulong_t);
381 #else
382 	vgc.user_regs.eax = 1;
383 	vgc.user_regs.esp += sizeof (ulong_t);
384 #endif
385 
386 	vgc.kernel_sp = cp->cpu_thread->t_sp;
387 
388 	err = mp_set_cpu_context(&vgc, cp);
389 
390 	ASSERT(err == 0);
391 }
392 
393 void
394 mach_cpu_idle(void)
395 {
396 	if (IN_XPV_PANIC()) {
397 		xpv_panic_halt();
398 	} else  {
399 		(void) setjmp(&curthread->t_pcb);
400 		CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id);
401 		(void) HYPERVISOR_block();
402 		CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);
403 	}
404 }
405 
406 void
407 mach_cpu_halt(char *msg)
408 {
409 	if (msg)
410 		prom_printf("%s\n", msg);
411 	(void) xen_vcpu_down(CPU->cpu_id);
412 }
413 
414 void
415 mach_cpu_pause(volatile char *safe)
416 {
417 	ulong_t flags;
418 
419 	flags = intr_clear();
420 
421 	if (setjmp(&curthread->t_pcb) == 0) {
422 		CPUSET_ATOMIC_ADD(cpu_suspend_set, CPU->cpu_id);
423 		/*
424 		 * This cpu is now safe.
425 		 */
426 		*safe = PAUSE_WAIT;
427 		membar_enter();
428 	}
429 
430 	while (*safe != PAUSE_IDLE)
431 		SMT_PAUSE();
432 
433 	CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);
434 
435 	intr_restore(flags);
436 }
437 
438 /*
439  * Virtual CPU management.
440  *
441  * VCPUs can be controlled in one of two ways; through the domain itself
442  * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
443  * Unfortunately, the terminology is used in different ways; they work out as
444  * follows:
445  *
446  * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
447  *
448  * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
449  * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
450  * receive interrupts, and we require this for offline CPUs in Solaris.
451  *
452  * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
453  * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
454  * if it has run previously, its software state (cpu_t, machcpu structures, IPI
455  * event channels, etc.) will still exist.
456  *
457  * The hypervisor has two notions of CPU states as represented in the store:
458  *
459  * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
460  *
461  * "online": the VCPU is running.  Corresponds to a CPU state other than
462  * P_POWEROFF.
463  *
464  * Currently, only a notification via xenstore can bring a CPU into a
465  * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
466  * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
467  * idempotently, as we'll get 'duplicate' entries when we resume a domain.
468  *
469  * Note that the xenstore configuration is strictly advisory, in that a domain
470  * can choose to ignore it and still power up a VCPU in the offline state. To
471  * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
472  * ENOTSUP from within Solaris.
473  */
474 
/*
 * Powering a CPU on from within Solaris is not supported; always fails
 * with ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	const int rc = ENOTSUP;

	return (rc);
}
481 
/*
 * Powering a CPU off from within Solaris is not supported; always fails
 * with ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	const int rc = ENOTSUP;

	return (rc);
}
488 
489 static int
490 poweron_vcpu(struct cpu *cp)
491 {
492 	int error;
493 
494 	ASSERT(MUTEX_HELD(&cpu_lock));
495 
496 	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
497 		printf("poweron_vcpu: vcpu%d is not available!\n",
498 		    cp->cpu_id);
499 		return (ENXIO);
500 	}
501 
502 	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
503 		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
504 		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
505 		cp->cpu_flags &= ~CPU_POWEROFF;
506 		/*
507 		 * There are some nasty races possible here.
508 		 * Tell the vcpu it's up one more time.
509 		 * XXPV	Is this enough?  Is this safe?
510 		 */
511 		(void) xen_vcpu_up(cp->cpu_id);
512 
513 		cpu_set_state(cp);
514 	}
515 	return (error);
516 }
517 
518 static int
519 poweroff_poke(void)
520 {
521 	CPUSET_ATOMIC_DEL(cpu_suspend_set, CPU->cpu_id);
522 	return (0);
523 }
524 
525 /*
526  * We must ensure that the VCPU reaches a safe state (in the suspend set, and
527  * thus is not going to change) before we can power it off.  The VCPU could
528  * still be in mach_cpu_pause() and about to head back out; so just checking
529  * cpu_suspend_set() isn't sufficient to make sure the VCPU has stopped moving.
530  * Instead, we xcall it to delete itself from the set; whichever way it comes
531  * back from that xcall, it won't mark itself in the set until it's safely back
532  * in mach_cpu_idle().
533  */
534 static int
535 poweroff_vcpu(struct cpu *cp)
536 {
537 	int error;
538 	cpuset_t set;
539 
540 	ASSERT(MUTEX_HELD(&cpu_lock));
541 
542 	ASSERT(CPU->cpu_id != cp->cpu_id);
543 	ASSERT(cp->cpu_flags & CPU_QUIESCED);
544 
545 	CPUSET_ONLY(set, cp->cpu_id);
546 
547 	xc_sync(0, 0, 0, X_CALL_HIPRI, set, (xc_func_t)poweroff_poke);
548 
549 	while (!CPU_IN_SET(cpu_suspend_set, cp->cpu_id))
550 		SMT_PAUSE();
551 
552 	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
553 		ASSERT(CPU_IN_SET(cpu_suspend_set, cp->cpu_id));
554 		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
555 		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
556 		cp->cpu_flags &=
557 		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
558 
559 		cpu_set_state(cp);
560 	}
561 	return (error);
562 }
563 
564 static int
565 vcpu_config_poweroff(processorid_t id)
566 {
567 	int oldstate;
568 	int error;
569 	cpu_t *cp;
570 
571 	mutex_enter(&cpu_lock);
572 
573 	if ((cp = cpu_get(id)) == NULL) {
574 		mutex_exit(&cpu_lock);
575 		return (ESRCH);
576 	}
577 
578 	if (cpu_get_state(cp) == P_POWEROFF) {
579 		mutex_exit(&cpu_lock);
580 		return (0);
581 	}
582 
583 	mutex_exit(&cpu_lock);
584 
585 	do {
586 		error = p_online_internal(id, P_OFFLINE,
587 		    &oldstate);
588 
589 		if (error != 0)
590 			break;
591 
592 		/*
593 		 * So we just changed it to P_OFFLINE.  But then we dropped
594 		 * cpu_lock, so now it is possible for another thread to change
595 		 * the cpu back to a different, non-quiesced state e.g.
596 		 * P_ONLINE.
597 		 */
598 		mutex_enter(&cpu_lock);
599 		if ((cp = cpu_get(id)) == NULL)
600 			error = ESRCH;
601 		else {
602 			if (cp->cpu_flags & CPU_QUIESCED)
603 				error = poweroff_vcpu(cp);
604 			else
605 				error = EBUSY;
606 		}
607 		mutex_exit(&cpu_lock);
608 	} while (error == EBUSY);
609 
610 	return (error);
611 }
612 
613 /*
614  * Add a new virtual cpu to the domain.
615  */
616 static int
617 vcpu_config_new(processorid_t id)
618 {
619 	extern int start_cpu(processorid_t);
620 	int error;
621 
622 	if (ncpus == 1) {
623 		printf("cannot (yet) add cpus to a single-cpu domain\n");
624 		return (ENOTSUP);
625 	}
626 
627 	affinity_set(CPU_CURRENT);
628 	error = start_cpu(id);
629 	affinity_clear();
630 	return (error);
631 }
632 
633 static int
634 vcpu_config_poweron(processorid_t id)
635 {
636 	cpu_t *cp;
637 	int oldstate;
638 	int error;
639 
640 	if (id >= ncpus)
641 		return (vcpu_config_new(id));
642 
643 	mutex_enter(&cpu_lock);
644 
645 	if ((cp = cpu_get(id)) == NULL) {
646 		mutex_exit(&cpu_lock);
647 		return (ESRCH);
648 	}
649 
650 	if (cpu_get_state(cp) != P_POWEROFF) {
651 		mutex_exit(&cpu_lock);
652 		return (0);
653 	}
654 
655 	if ((error = poweron_vcpu(cp)) != 0) {
656 		mutex_exit(&cpu_lock);
657 		return (error);
658 	}
659 
660 	mutex_exit(&cpu_lock);
661 
662 	return (p_online_internal(id, P_ONLINE, &oldstate));
663 }
664 
665 #define	REPORT_LEN	128
666 
667 static void
668 vcpu_config_report(processorid_t id, uint_t newstate, int error)
669 {
670 	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
671 	size_t len;
672 	char *ps;
673 
674 	switch (newstate) {
675 	case P_ONLINE:
676 		ps = PS_ONLINE;
677 		break;
678 	case P_POWEROFF:
679 		ps = PS_POWEROFF;
680 		break;
681 	default:
682 		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
683 		break;
684 	}
685 
686 	len = snprintf(report, REPORT_LEN,
687 	    "cpu%d: externally initiated %s", id, ps);
688 
689 	if (!error) {
690 		cmn_err(CE_CONT, "!%s\n", report);
691 		kmem_free(report, REPORT_LEN);
692 		return;
693 	}
694 
695 	len += snprintf(report + len, REPORT_LEN - len,
696 	    " failed, error %d: ", error);
697 	switch (error) {
698 	case EEXIST:
699 		len += snprintf(report + len, REPORT_LEN - len,
700 		    "cpu already %s", ps ? ps : "?");
701 		break;
702 	case ESRCH:
703 		len += snprintf(report + len, REPORT_LEN - len,
704 		    "cpu not found");
705 		break;
706 	case EINVAL:
707 	case EALREADY:
708 		break;
709 	case EPERM:
710 		len += snprintf(report + len, REPORT_LEN - len,
711 		    "insufficient privilege (0x%x)", id);
712 		break;
713 	case EBUSY:
714 		switch (newstate) {
715 		case P_ONLINE:
716 			/*
717 			 * This return comes from mp_cpu_start -
718 			 * we cannot 'start' the boot CPU.
719 			 */
720 			len += snprintf(report + len, REPORT_LEN - len,
721 			    "already running");
722 			break;
723 		case P_POWEROFF:
724 			len += snprintf(report + len, REPORT_LEN - len,
725 			    "bound lwps?");
726 			break;
727 		default:
728 			break;
729 		}
730 	default:
731 		break;
732 	}
733 
734 	cmn_err(CE_CONT, "%s\n", report);
735 	kmem_free(report, REPORT_LEN);
736 }
737 
738 static void
739 vcpu_config(void *arg)
740 {
741 	int id = (int)(uintptr_t)arg;
742 	int error;
743 	char dir[16];
744 	char *state;
745 
746 	if ((uint_t)id >= max_ncpus) {
747 		cmn_err(CE_WARN,
748 		    "vcpu_config: cpu%d does not fit in this domain", id);
749 		return;
750 	}
751 
752 	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
753 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
754 	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
755 		if (strcmp(state, "online") == 0) {
756 			error = vcpu_config_poweron(id);
757 			vcpu_config_report(id, P_ONLINE, error);
758 		} else if (strcmp(state, "offline") == 0) {
759 			error = vcpu_config_poweroff(id);
760 			vcpu_config_report(id, P_POWEROFF, error);
761 		} else {
762 			cmn_err(CE_WARN,
763 			    "cpu%d: unknown target state '%s'", id, state);
764 		}
765 	} else
766 		cmn_err(CE_WARN,
767 		    "cpu%d: unable to read target state from xenstore", id);
768 
769 	kmem_free(state, MAXPATHLEN);
770 }
771 
772 /*ARGSUSED*/
773 static void
774 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
775 {
776 	const char *path = vec[XS_WATCH_PATH];
777 	processorid_t id;
778 	char *s;
779 
780 	if ((s = strstr(path, "cpu/")) != NULL &&
781 	    sscanf(s, "cpu/%d", &id) == 1) {
782 		/*
783 		 * Run the virtual CPU configuration on a separate thread to
784 		 * avoid blocking on this event for too long (and for now,
785 		 * to ensure configuration requests are serialized.)
786 		 */
787 		(void) taskq_dispatch(cpu_config_tq,
788 		    vcpu_config, (void *)(uintptr_t)id, 0);
789 	}
790 }
791 
792 static int
793 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
794 {
795 	int err;
796 
797 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
798 		char *str;
799 		int level = CE_WARN;
800 
801 		switch (err) {
802 		case -X_EINVAL:
803 			/*
804 			 * This interface squashes multiple error sources
805 			 * to one error code.  In particular, an X_EINVAL
806 			 * code can mean:
807 			 *
808 			 * -	the vcpu id is out of range
809 			 * -	cs or ss are in ring 0
810 			 * -	cr3 is wrong
811 			 * -	an entry in the new gdt is above the
812 			 *	reserved entry
813 			 * -	a frame underneath the new gdt is bad
814 			 */
815 			str = "something is wrong :(";
816 			break;
817 		case -X_ENOENT:
818 			str = "no such cpu";
819 			break;
820 		case -X_ENOMEM:
821 			str = "no mem to copy ctxt";
822 			break;
823 		case -X_EFAULT:
824 			str = "bad address";
825 			break;
826 		case -X_EEXIST:
827 			/*
828 			 * Hmm.  This error is returned if the vcpu has already
829 			 * been initialized once before in the lifetime of this
830 			 * domain.  This is a logic error in the kernel.
831 			 */
832 			level = CE_PANIC;
833 			str = "already initialized";
834 			break;
835 		default:
836 			level = CE_PANIC;
837 			str = "<unexpected>";
838 			break;
839 		}
840 
841 		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
842 		    id, -err, str);
843 	}
844 	return (err);
845 }
846 
847 long
848 xen_vcpu_up(processorid_t id)
849 {
850 	long err;
851 
852 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
853 		char *str;
854 
855 		switch (err) {
856 		case -X_ENOENT:
857 			str = "no such cpu";
858 			break;
859 		case -X_EINVAL:
860 			/*
861 			 * Perhaps this is diagnostic overkill.
862 			 */
863 			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
864 				str = "bad cpuid";
865 			else
866 				str = "not initialized";
867 			break;
868 		default:
869 			str = "<unexpected>";
870 			break;
871 		}
872 
873 		printf("vcpu%d: failed to start: error %d: %s\n",
874 		    id, -(int)err, str);
875 		return (EBFONT);	/* deliberately silly */
876 	}
877 	return (err);
878 }
879 
880 long
881 xen_vcpu_down(processorid_t id)
882 {
883 	long err;
884 
885 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
886 		/*
887 		 * X_ENOENT:	no such cpu
888 		 * X_EINVAL:	bad cpuid
889 		 */
890 		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
891 	}
892 
893 	return (err);
894 }
895