xref: /titanic_44/usr/src/uts/common/xen/dtrace/xdt.c (revision 55f5292c612446ce6f93ddd248c0019b5974618b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Xen event provider for DTrace
29  *
30  * NOTE: This provider is PRIVATE. It is intended as a short-term solution and
31  * may disappear or be re-implemented at anytime.
32  *
33  * This provider isn't suitable as a general-purpose solution for a number of
34  * reasons. First and foremost, we rely on the Xen tracing mechanism and don't
35  * have any way to gather data other than that collected by the Xen trace
36  * buffers. Further, it does not fit into the DTrace model (see "Interacting
37  * with DTrace" below.)
38  *
39  *
40  * Tracing in Xen
41  * --------------
42  *
43  * Xen implements a tracing facility for generating and collecting execution
44  * event traces from the hypervisor. When tracing is enabled, compiled in
45  * probes record events in contiguous per-CPU trace buffers.
46  *
47  *               +---------+
48  * +------+      |         |
49  * | CPUn |----> | BUFFERn |
50  * +------+      |         |
51  *               +---------+- tbuf.va + (tbuf.size * n)
52  *               :         :
53  *               +---------+
54  * +------+      |         |
55  * | CPU1 |----> | BUFFER1 |
56  * +------+      |         |
57  *               +---------+- tbuf.va + tbuf.size
58  * +------+      |         |
59  * | CPU0 |----> | BUFFER0 |
60  * +------+      |         |
61  *               +---------+- tbuf.va
62  *
63  * Each CPU buffer consists of a metadata header followed by the trace records.
64  * The metadata consists of a producer/consumer pair of pointers into the buffer
65  * that point to the next record to be written and the next record to be read
66  * respectively.
67  *
68  * A trace record can be in one of two forms, depending on if the TSC is
69  * included. The record header indicates whether or not the TSC field is
70  * present.
71  *
72  * 1. Trace record without TSC:
73  * +------------------------------------------------------------+
74  * | HEADER(uint32_t) |            DATA FIELDS                  |
75  * +------------------------------------------------------------+
76  *
77  * 2. Trace record with TSC:
78  * +--------------------------------------------------------------------------+
79  * | HEADER(uint32_t) | TSC(uint64_t) |              DATA FIELDS              |
80  * +--------------------------------------------------------------------------+
81  *
82  * Where,
83  *
84  * HEADER bit field:
85  * +--------------------------------------------------------------------------+
86  * | C |  NDATA  |                        EVENT                               |
87  * +--------------------------------------------------------------------------+
88  *  31  30     28 27                                                         0
89  *
90  * EVENT: Event ID.
91  * NDATA: Number of populated data fields.
92  *     C: TSC included.
93  *
94  * DATA FIELDS:
95  * +--------------------------------------------------------------------------+
96  * | D1(uint32_t) | D2(uint32_t) | D3(uint32_t) |     . . .    | D7(uint32_t) |
97  * +--------------------------------------------------------------------------+
98  *
99  *
100  * Interacting with DTrace
101  * -----------------------
102  *
103  * Every xdt_poll_nsec nano-seconds we poll the trace buffers for data and feed
104  * each entry into dtrace_probe() with the corresponding probe ID for the event.
105  * As a result of this periodic collection implementation probe firings are
106  * asynchronous. This is the only sensible way to implement this form of
107  * provider, but because of its asynchronous nature asking things like
108  * "current CPU" and, more importantly, arbitrary questions about the context
109  * surrounding the probe firing are not meaningful. So, consumers should not
110  * attempt to infer anything beyond what is supplied via the probe arguments.
111  */
112 
113 #include <sys/xpv_user.h>
114 
115 #include <sys/types.h>
116 #include <sys/sysmacros.h>
117 #include <sys/modctl.h>
118 #include <sys/sunddi.h>
119 #include <sys/ddi.h>
120 #include <sys/conf.h>
121 #include <sys/devops.h>
122 #include <sys/stat.h>
123 #include <sys/cmn_err.h>
124 #include <sys/dtrace.h>
125 #include <sys/sdt.h>
126 #include <sys/cyclic.h>
127 #include <vm/seg_kmem.h>
128 #include <vm/hat_i86.h>
129 
130 #include <sys/hypervisor.h>
131 #include <xen/public/trace.h>
132 #include <xen/public/sched.h>
133 
134 #define	XDT_POLL_DEFAULT	100000000	/* default poll interval (ns) */
135 #define	XDT_POLL_MIN		10000000	/* min poll interval (ns) */
136 #define	XDT_TBUF_RETRY		50		/* tbuf disable retry count */
137 
/*
 * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 * in the xVM gate.
 *
 * The argument is parenthesized so that an expression containing an
 * operator of lower precedence than == (e.g. a masked value) is compared
 * as a whole.
 */
#define	IS_IDLE_DOM(domid)	((domid) == 0x7FFFU)
143 
/*
 * Macros to extract the domid and cpuid from a HVM trace data field.
 * The domid is taken from the upper 16 bits and the vcpuid from the lower
 * 16 bits. Arguments are parenthesized so that compound expressions are
 * evaluated before the shift/mask is applied.
 */
#define	HVM_DOMID(d)		((d) >> 16)
#define	HVM_VCPUID(d)		((d) & 0xFFFF)
147 
148 /* Flags for shadow page table events */
149 #define	SH_GUEST_32	0x000
150 #define	SH_GUEST_PAE	0x100
151 #define	SH_GUEST_64	0x200
152 
/*
 * Fire the DTrace probe associated with `event', passing up to five
 * arguments. xdt_probemap[] holds the probe ID for each enabled event; an
 * entry of zero means no probe is enabled for the event, in which case
 * nothing is fired.
 *
 * The macro expands to a brace-enclosed block that declares a local named
 * `id', so arguments must not themselves refer to a variable called `id'.
 */
#define	XDT_PROBE5(event, arg0, arg1, arg2, arg3, arg4) {		\
	dtrace_id_t id = xdt_probemap[event];				\
	if (id)								\
		dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);		\
}

/* Convenience wrappers that pad the unused trailing arguments with 0. */
#define	XDT_PROBE4(event, arg0, arg1, arg2, arg3) \
	XDT_PROBE5(event, arg0, arg1, arg2, arg3, 0)

#define	XDT_PROBE3(event, arg0, arg1, arg2) \
	XDT_PROBE5(event, arg0, arg1, arg2, 0, 0)

#define	XDT_PROBE2(event, arg0, arg1) \
	XDT_PROBE5(event, arg0, arg1, 0, 0, 0)

#define	XDT_PROBE1(event, arg0) \
	XDT_PROBE5(event, arg0, 0, 0, 0, 0)

#define	XDT_PROBE0(event) \
	XDT_PROBE5(event, 0, 0, 0, 0, 0)
173 
174 /* Probe classes */
175 #define	XDT_SCHED			0
176 #define	XDT_MEM				1
177 #define	XDT_HVM				2
178 #define	XDT_GEN				3
179 #define	XDT_PV				4
180 #define	XDT_SHADOW			5
181 #define	XDT_PM				6
182 #define	XDT_NCLASSES			7
183 
184 /* Probe events */
185 #define	XDT_EVT_INVALID			(-(int)1)
186 #define	XDT_SCHED_OFF_CPU		0
187 #define	XDT_SCHED_ON_CPU		1
188 #define	XDT_SCHED_IDLE_OFF_CPU		2
189 #define	XDT_SCHED_IDLE_ON_CPU		3
190 #define	XDT_SCHED_BLOCK			4
191 #define	XDT_SCHED_SLEEP			5
192 #define	XDT_SCHED_WAKE			6
193 #define	XDT_SCHED_YIELD			7
194 #define	XDT_SCHED_SHUTDOWN_POWEROFF	8
195 #define	XDT_SCHED_SHUTDOWN_REBOOT	9
196 #define	XDT_SCHED_SHUTDOWN_SUSPEND	10
197 #define	XDT_SCHED_SHUTDOWN_CRASH	11
198 #define	XDT_MEM_PAGE_GRANT_MAP		12
199 #define	XDT_MEM_PAGE_GRANT_UNMAP	13
200 #define	XDT_MEM_PAGE_GRANT_TRANSFER	14
201 #define	XDT_HVM_VMENTRY			15
202 #define	XDT_HVM_VMEXIT			16
203 #define	XDT_TRC_LOST_RECORDS		17
204 #define	XDT_SCHED_ADD_VCPU		18
205 #define	XDT_SCHED_REM_VCPU		19	/* unused */
206 #define	XDT_SCHED_CTL			20	/* unused */
207 #define	XDT_SCHED_ADJDOM		21
208 #define	XDT_SCHED_S_TIMER_FN		22	/* unused */
209 #define	XDT_SCHED_T_TIMER_FN		23	/* unused */
210 #define	XDT_SCHED_DOM_TIMER_FN		24	/* unused */
211 #define	XDT_PV_HYPERCALL		25
212 #define	XDT_PV_TRAP			26
213 #define	XDT_PV_PAGE_FAULT		27
214 #define	XDT_PV_FORCED_INVALID_OP	28
215 #define	XDT_PV_EMULATE_PRIVOP		29
216 #define	XDT_PV_EMULATE_4GB		30	/* unused (32-bit HV only ) */
217 #define	XDT_PV_MATH_STATE_RESTORE	31
218 #define	XDT_PV_PAGING_FIXUP		32
219 #define	XDT_PV_DT_MAPPING_FAULT		33
220 #define	XDT_PV_PTWR_EMULATION		34
221 #define	XDT_HVM_PF_XEN			35
222 #define	XDT_HVM_PF_INJECT		36
223 #define	XDT_HVM_EXC_INJECT		37
224 #define	XDT_HVM_VIRQ_INJECT		38
225 #define	XDT_HVM_VIRQ_REINJECT		39
226 #define	XDT_HVM_IO_READ			40	/* unused */
227 #define	XDT_HVM_IO_WRITE		41	/* unused */
228 #define	XDT_HVM_CR_READ			42
229 #define	XDT_HVM_CR_WRITE		43
230 #define	XDT_HVM_DR_READ			44	/* unused */
231 #define	XDT_HVM_DR_WRITE		45	/* unused */
232 #define	XDT_HVM_MSR_READ		46
233 #define	XDT_HVM_MSR_WRITE		47
234 #define	XDT_HVM_CPUID			48
235 #define	XDT_HVM_INTR			49
236 #define	XDT_HVM_INTR_WINDOW		50
237 #define	XDT_HVM_NMI			51
238 #define	XDT_HVM_SMI			52
239 #define	XDT_HVM_VMMCALL			53
240 #define	XDT_HVM_HLT			54
241 #define	XDT_HVM_INVLPG			55
242 #define	XDT_HVM_MCE			56
243 #define	XDT_HVM_IOPORT_READ		57
244 #define	XDT_HVM_IOPORT_WRITE		58
245 #define	XDT_HVM_CLTS			59
246 #define	XDT_HVM_LMSW			60
247 #define	XDT_HVM_IOMEM_READ		61
248 #define	XDT_HVM_IOMEM_WRITE		62
249 #define	XDT_SHADOW_NOT_SHADOW			63
250 #define	XDT_SHADOW_FAST_PROPAGATE		64
251 #define	XDT_SHADOW_FAST_MMIO			65
252 #define	XDT_SHADOW_FALSE_FAST_PATH		66
253 #define	XDT_SHADOW_MMIO				67
254 #define	XDT_SHADOW_FIXUP			68
255 #define	XDT_SHADOW_DOMF_DYING			69
256 #define	XDT_SHADOW_EMULATE			70
257 #define	XDT_SHADOW_EMULATE_UNSHADOW_USER	71
258 #define	XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ	72
259 #define	XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED	73
260 #define	XDT_SHADOW_WRMAP_BF			74
261 #define	XDT_SHADOW_PREALLOC_UNPIN		75
262 #define	XDT_SHADOW_RESYNC_FULL			76
263 #define	XDT_SHADOW_RESYNC_ONLY			77
264 #define	XDT_PM_FREQ_CHANGE		78
265 #define	XDT_PM_IDLE_ENTRY		79
266 #define	XDT_PM_IDLE_EXIT		80
267 #define	XDT_SCHED_RUNSTATE_CHANGE	81
268 #define	XDT_SCHED_CONTINUE_RUNNING	82
269 #define	XDT_NEVENTS			83
270 
271 typedef struct {
272 	const char	*pr_mod;	/* probe module */
273 	const char	*pr_name;	/* probe name */
274 	int		evt_id;		/* event id */
275 	uint_t		class;		/* probe class */
276 } xdt_probe_t;
277 
278 typedef struct {
279 	uint32_t	trc_mask;	/* trace mask */
280 	uint32_t	cnt;		/* num enabled probes in class */
281 } xdt_classinfo_t;
282 
283 typedef struct {
284 	ulong_t prev_domid;		/* previous dom executed */
285 	ulong_t prev_vcpuid;		/* previous vcpu executed */
286 	ulong_t prev_ctime;		/* time spent on cpu */
287 	ulong_t next_domid;		/* next dom to be scheduled */
288 	ulong_t next_vcpuid;		/* next vcpu to be scheduled */
289 	ulong_t next_wtime;		/* time spent waiting to get on cpu */
290 	ulong_t next_ts;		/* allocated time slice */
291 	ulong_t cur_domid;		/* current dom */
292 	ulong_t cur_vcpuid;		/* current vcpuid */
293 	int curinfo_valid;		/* info is valid */
294 } xdt_schedinfo_t;
295 
296 static struct {
297 	uint_t cnt;			/* total num of trace buffers */
298 	size_t size;			/* size of each cpu buffer */
299 	mfn_t start_mfn;		/* starting mfn of buffers */
300 	caddr_t va;			/* va buffers are mapped into */
301 
302 	/* per-cpu buffers */
303 	struct t_buf **meta;		/* buffer metadata */
304 	struct t_rec **data;		/* buffer data records */
305 
306 	/* statistics */
307 	uint64_t stat_dropped_recs;	/* records dropped */
308 	uint64_t stat_spurious_cpu;	/* recs with garbage cpuids */
309 	uint64_t stat_spurious_switch;	/* inconsistent vcpu switches */
310 	uint64_t stat_unknown_shutdown;	/* unknown shutdown code */
311 	uint64_t stat_unknown_recs;	/* unknown records */
312 } tbuf;
313 
314 static size_t tbuf_data_size;
315 
316 static char *xdt_stats[] = {
317 	"dropped_recs",
318 };
319 
320 /*
321  * Tunable variables
322  *
323  * The following may be tuned by adding a line to /etc/system that
324  * includes both the name of the module ("xdt") and the name of the variable.
325  * For example:
326  *     set xdt:xdt_tbuf_pages = 40
327  */
328 uint_t xdt_tbuf_pages = 20;			/* pages to alloc per-cpu buf */
329 
330 /*
331  * The following may be tuned by adding a line to
332  * /platform/i86xpv/kernel/drv/xdt.conf.
333  * For example:
334  *     xdt_poll_nsec = 200000000;
335  */
336 static hrtime_t xdt_poll_nsec;			/* trace buffer poll interval */
337 
338 /*
339  * Another tunable variable: the maximum number of records to process
340  * in one scan. If it is 0 (e.g. not set in /etc/system), it will
341  * be set to ncpu * (bufsize / max_rec_size).
342  *
343  * Having an upper limit avoids a situation where the scan would loop
344  * endlessly in case the hypervisor adds records quicker than we
345  * can process them. It's better to drop records than to loop, obviously.
346  */
347 uint_t xdt_max_recs = 0;
348 
349 /*
350  * Internal variables
351  */
352 static dev_info_t *xdt_devi;
353 static dtrace_provider_id_t xdt_id;
354 static uint_t xdt_ncpus;			/* total number of phys CPUs */
355 static uint32_t cur_trace_mask;			/* current trace mask */
356 static xdt_schedinfo_t *xdt_cpu_schedinfo;	/* per-cpu sched info */
357 dtrace_id_t xdt_probemap[XDT_NEVENTS];		/* map of enabled probes */
358 dtrace_id_t xdt_prid[XDT_NEVENTS];		/* IDs of registered events */
359 static cyclic_id_t xdt_cyclic = CYCLIC_NONE;
360 static kstat_t *xdt_kstats;
361 static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES];
362 
/*
 * These provide context when probes fire. They can be accessed
 * from an xdt dtrace probe (as `xdt_curdom, etc). It's ok for these
 * to be global, and not per-cpu, as probes are run strictly in sequence
 * as the trace buffers are processed.
 */
369 uint_t xdt_curdom, xdt_curvcpu, xdt_curpcpu;
370 uint64_t xdt_timestamp;
371 
372 static xdt_probe_t xdt_probe[] = {
373 	/* Sched probes */
374 	{ "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
375 	{ "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
376 	{ "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
377 	{ "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
378 	{ "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
379 	{ "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
380 	{ "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
381 	{ "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
382 	{ "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
383 		XDT_SCHED },
384 	{ "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
385 	{ "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
386 	{ "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },
387 	{ "sched", "add", XDT_SCHED_ADD_VCPU, XDT_SCHED },
388 	{ "sched", "runstate-change", XDT_SCHED_RUNSTATE_CHANGE, XDT_SCHED },
389 	{ "sched", "continue-running", XDT_SCHED_CONTINUE_RUNNING, XDT_SCHED },
390 
391 	/* Memory probes */
392 	{ "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
393 	{ "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
394 	{ "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },
395 
396 	{"pv", "hypercall", XDT_PV_HYPERCALL, XDT_PV },
397 	{"pv", "trap", XDT_PV_TRAP, XDT_PV },
398 	{"pv", "page-fault", XDT_PV_PAGE_FAULT, XDT_PV },
399 	{"pv", "forced-invalid-op", XDT_PV_FORCED_INVALID_OP, XDT_PV },
400 	{"pv", "emulate-priv-op", XDT_PV_EMULATE_PRIVOP, XDT_PV },
401 	{"pv", "math-state-restore", XDT_PV_MATH_STATE_RESTORE, XDT_PV },
402 	{"pv", "paging-fixup", XDT_PV_PAGING_FIXUP, XDT_PV },
403 	{"pv", "dt-mapping-fault", XDT_PV_DT_MAPPING_FAULT, XDT_PV },
404 	{"pv", "pte-write-emul", XDT_PV_PTWR_EMULATION, XDT_PV },
405 
406 	/* HVM probes */
407 	{ "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
408 	{ "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },
409 	{ "hvm", "pagefault-xen", XDT_HVM_PF_XEN, XDT_HVM },
410 	{ "hvm", "pagefault-inject", XDT_HVM_PF_INJECT, XDT_HVM },
411 	{ "hvm", "exception-inject", XDT_HVM_EXC_INJECT, XDT_HVM },
412 	{ "hvm", "virq-inject", XDT_HVM_VIRQ_INJECT, XDT_HVM },
413 	{ "hvm", "cr-read", XDT_HVM_CR_READ, XDT_HVM },
414 	{ "hvm", "cr-write", XDT_HVM_CR_WRITE, XDT_HVM },
415 	{ "hvm", "msr-read", XDT_HVM_MSR_READ, XDT_HVM },
416 	{ "hvm", "msr-write", XDT_HVM_MSR_WRITE, XDT_HVM },
417 	{ "hvm", "cpuid", XDT_HVM_CPUID, XDT_HVM },
418 	{ "hvm", "intr", XDT_HVM_INTR, XDT_HVM },
419 	{ "hvm", "intr-window", XDT_HVM_INTR_WINDOW, XDT_HVM },
420 	{ "hvm", "nmi", XDT_HVM_NMI, XDT_HVM },
421 	{ "hvm", "smi", XDT_HVM_SMI, XDT_HVM },
422 	{ "hvm", "vmmcall", XDT_HVM_VMMCALL, XDT_HVM },
423 	{ "hvm", "hlt", XDT_HVM_HLT, XDT_HVM },
424 	{ "hvm", "invlpg", XDT_HVM_INVLPG, XDT_HVM },
425 	{ "hvm", "mce", XDT_HVM_MCE, XDT_HVM },
426 	{ "hvm", "pio-read", XDT_HVM_IOPORT_READ, XDT_HVM },
427 	{ "hvm", "pio-write", XDT_HVM_IOPORT_WRITE, XDT_HVM },
428 	{ "hvm", "mmio-read", XDT_HVM_IOMEM_READ, XDT_HVM },
429 	{ "hvm", "mmio-write", XDT_HVM_IOMEM_WRITE, XDT_HVM },
430 	{ "hvm", "clts", XDT_HVM_CLTS, XDT_HVM },
431 	{ "hvm", "lmsw", XDT_HVM_LMSW, XDT_HVM },
432 
433 	{ "shadow", "fault-not-shadow", XDT_SHADOW_NOT_SHADOW, XDT_SHADOW },
434 	{ "shadow", "fast-propagate", XDT_SHADOW_FAST_PROPAGATE, XDT_SHADOW },
435 	{ "shadow", "fast-mmio", XDT_SHADOW_FAST_MMIO, XDT_SHADOW },
436 	{ "shadow", "false-fast-path", XDT_SHADOW_FALSE_FAST_PATH,
437 	    XDT_SHADOW },
438 	{ "shadow", "mmio", XDT_SHADOW_MMIO, XDT_SHADOW },
439 	{ "shadow", "fixup", XDT_SHADOW_FIXUP, XDT_SHADOW },
440 	{ "shadow", "domf-dying", XDT_SHADOW_DOMF_DYING, XDT_SHADOW },
441 	{ "shadow", "emulate", XDT_SHADOW_EMULATE, XDT_SHADOW },
442 	{ "shadow", "emulate-unshadow-user", XDT_SHADOW_EMULATE_UNSHADOW_USER,
443 	    XDT_SHADOW },
444 	{ "shadow", "emulate-unshadow-evtinj",
445 	    XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, XDT_SHADOW },
446 	{ "shadow", "emulate-unshadow-unhandled",
447 	    XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, XDT_SHADOW },
448 	{ "shadow", "wrmap-bf", XDT_SHADOW_WRMAP_BF, XDT_SHADOW },
449 	{ "shadow", "prealloc-unpin", XDT_SHADOW_PREALLOC_UNPIN, XDT_SHADOW },
450 	{ "shadow", "resync-full", XDT_SHADOW_RESYNC_FULL, XDT_SHADOW },
451 	{ "shadow", "resync-only", XDT_SHADOW_RESYNC_ONLY, XDT_SHADOW },
452 
453 	{ "pm", "freq-change", XDT_PM_FREQ_CHANGE, XDT_PM },
454 	{ "pm", "idle-entry", XDT_PM_IDLE_ENTRY, XDT_PM },
455 	{ "pm", "idle-exit", XDT_PM_IDLE_EXIT, XDT_PM },
456 
457 	/* Trace buffer related probes */
458 	{ "trace", "records-lost", XDT_TRC_LOST_RECORDS, XDT_GEN },
459 
460 	{ NULL }
461 };
462 
463 static inline uint32_t
xdt_nr_active_probes()464 xdt_nr_active_probes()
465 {
466 	int i;
467 	uint32_t tot = 0;
468 
469 	for (i = 0; i < XDT_NCLASSES; i++)
470 		tot += xdt_classinfo[i].cnt;
471 
472 	return (tot);
473 }
474 
475 static void
xdt_init_trace_masks(void)476 xdt_init_trace_masks(void)
477 {
478 	xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
479 	xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
480 	xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
481 	xdt_classinfo[XDT_GEN].trc_mask = TRC_GEN;
482 	xdt_classinfo[XDT_PV].trc_mask = TRC_PV;
483 	xdt_classinfo[XDT_SHADOW].trc_mask = TRC_SHADOW;
484 	xdt_classinfo[XDT_PM].trc_mask = TRC_PM;
485 }
486 
487 static int
xdt_kstat_update(kstat_t * ksp,int flag)488 xdt_kstat_update(kstat_t *ksp, int flag)
489 {
490 	kstat_named_t *knp;
491 
492 	if (flag != KSTAT_READ)
493 		return (EACCES);
494 
495 	knp = ksp->ks_data;
496 
497 	/*
498 	 * Assignment order should match that of the names in
499 	 * xdt_stats.
500 	 */
501 	(knp++)->value.ui64 = tbuf.stat_dropped_recs;
502 
503 	return (0);
504 }
505 
506 static void
xdt_kstat_init(void)507 xdt_kstat_init(void)
508 {
509 	int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
510 	char **cp = xdt_stats;
511 	kstat_named_t *knp;
512 
513 	if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
514 	    KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
515 		return;
516 
517 	xdt_kstats->ks_update = xdt_kstat_update;
518 
519 	knp = xdt_kstats->ks_data;
520 	while (nstats > 0) {
521 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
522 		knp++;
523 		cp++;
524 		nstats--;
525 	}
526 
527 	kstat_install(xdt_kstats);
528 }
529 
530 static int
xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t * tbuf_op)531 xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
532 {
533 	xen_sysctl_t op;
534 	int xerr;
535 
536 	op.cmd = XEN_SYSCTL_tbuf_op;
537 	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
538 	op.u.tbuf_op = *tbuf_op;
539 
540 	if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
541 		return (xen_xlate_errcode(xerr));
542 
543 	*tbuf_op = op.u.tbuf_op;
544 	return (0);
545 }
546 
547 static int
xdt_map_trace_buffers(mfn_t mfn,caddr_t va,size_t len)548 xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
549 {
550 	x86pte_t pte;
551 	caddr_t const sva = va;
552 	caddr_t const eva = va + len;
553 	int xerr;
554 
555 	ASSERT(mfn != MFN_INVALID);
556 	ASSERT(va != NULL);
557 	ASSERT(IS_PAGEALIGNED(len));
558 
559 	for (; va < eva; va += MMU_PAGESIZE) {
560 		/*
561 		 * Ask the HAT to load a throwaway mapping to page zero, then
562 		 * overwrite it with the hypervisor mapping. It gets removed
563 		 * later via hat_unload().
564 		 */
565 		hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
566 		    PROT_READ | HAT_UNORDERED_OK,
567 		    HAT_LOAD_NOCONSIST | HAT_LOAD);
568 
569 		pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
570 		    | PT_FOREIGN | PT_WRITABLE;
571 
572 		xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
573 		    pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);
574 
575 		if (xerr != 0) {
576 			/* unmap pages loaded so far */
577 			size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
578 			    (uintptr_t)sva;
579 			hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
580 			return (xen_xlate_errcode(xerr));
581 		}
582 
583 		mfn++;
584 	}
585 
586 	return (0);
587 }
588 
589 static int
xdt_attach_trace_buffers(void)590 xdt_attach_trace_buffers(void)
591 {
592 	xen_sysctl_tbuf_op_t tbuf_op;
593 	size_t len;
594 	int err;
595 	uint_t i;
596 
597 	/*
598 	 * Xen does not support trace buffer re-sizing. If the buffers
599 	 * have already been allocated we just use them as is.
600 	 */
601 	tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
602 	if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
603 		return (err);
604 
605 	if (tbuf_op.size == 0) {
606 		/* set trace buffer size */
607 		tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_set_size;
608 		tbuf_op.size = xdt_tbuf_pages;
609 		(void) xdt_sysctl_tbuf(&tbuf_op);
610 
611 		/* get trace buffer info */
612 		tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
613 		if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
614 			return (err);
615 
616 		if (tbuf_op.size == 0) {
617 			cmn_err(CE_NOTE, "Couldn't allocate trace buffers.");
618 			return (ENOBUFS);
619 		}
620 	}
621 
622 	tbuf.size = tbuf_op.size;
623 	tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
624 	tbuf.cnt = xdt_ncpus;
625 
626 	ASSERT(tbuf.start_mfn != MFN_INVALID);
627 	ASSERT(tbuf.cnt > 0);
628 
629 	len = tbuf.size * tbuf.cnt;
630 	tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);
631 
632 	if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
633 		vmem_free(heap_arena, tbuf.va, len);
634 		tbuf.va = NULL;
635 		return (err);
636 	}
637 
638 	tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
639 	    KM_SLEEP);
640 	tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
641 	    KM_SLEEP);
642 
643 	for (i = 0; i < tbuf.cnt; i++) {
644 		void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
645 		tbuf.meta[i] = cpu_buf;
646 		tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
647 		    sizeof (struct t_buf));
648 
649 		/* throw away stale trace records */
650 		tbuf.meta[i]->cons = tbuf.meta[i]->prod;
651 	}
652 
653 	tbuf_data_size = tbuf.size - sizeof (struct t_buf);
654 	if (xdt_max_recs == 0)
655 		xdt_max_recs = (xdt_ncpus * tbuf_data_size)
656 		    / sizeof (struct t_rec);
657 
658 	return (0);
659 }
660 
661 static void
xdt_detach_trace_buffers(void)662 xdt_detach_trace_buffers(void)
663 {
664 	size_t len = tbuf.size * tbuf.cnt;
665 
666 	ASSERT(tbuf.va != NULL);
667 
668 	hat_unload(kas.a_hat, tbuf.va, len,
669 	    HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
670 	vmem_free(heap_arena, tbuf.va, len);
671 	kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
672 	kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
673 }
674 
675 static void
xdt_update_sched_context(uint_t cpuid,uint_t dom,uint_t vcpu)676 xdt_update_sched_context(uint_t cpuid, uint_t dom, uint_t vcpu)
677 {
678 	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
679 
680 	sp->cur_domid = dom;
681 	sp->cur_vcpuid = vcpu;
682 	sp->curinfo_valid = 1;
683 }
684 
685 static void
xdt_update_domain_context(uint_t dom,uint_t vcpu)686 xdt_update_domain_context(uint_t dom, uint_t vcpu)
687 {
688 	xdt_curdom = dom;
689 	xdt_curvcpu = vcpu;
690 }
691 
692 static size_t
xdt_process_rec(uint_t cpuid,struct t_rec * rec)693 xdt_process_rec(uint_t cpuid, struct t_rec *rec)
694 {
695 	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
696 	uint_t dom, vcpu;
697 	int eid;
698 	uint32_t *data;
699 	uint64_t tsc, addr64, rip64, val64, pte64;
700 	size_t rec_size;
701 
702 	ASSERT(rec != NULL);
703 	ASSERT(xdt_ncpus == xpv_nr_phys_cpus());
704 
705 	if (cpuid >= xdt_ncpus) {
706 		tbuf.stat_spurious_cpu++;
707 		goto done;
708 	}
709 
710 	/*
711 	 * If our current state isn't valid, and if this is not
712 	 * an event that will update our state, skip it.
713 	 */
714 
715 	if (!sp->curinfo_valid &&
716 	    rec->event != TRC_SCHED_SWITCH &&
717 	    rec->event != TRC_LOST_RECORDS)
718 		goto done;
719 
720 	if (rec->cycles_included) {
721 		data = rec->u.cycles.extra_u32;
722 		tsc = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
723 		    | rec->u.cycles.cycles_lo;
724 	} else {
725 		data = rec->u.nocycles.extra_u32;
726 		tsc = 0;
727 	}
728 
729 	xdt_timestamp = tsc;
730 
731 	switch (rec->event) {
732 	/*
733 	 * Sched probes
734 	 */
735 	case TRC_SCHED_SWITCH_INFPREV:
736 		/*
737 		 * Info on vCPU being de-scheduled
738 		 *
739 		 * data[0] = prev domid
740 		 * data[1] = time spent on pcpu
741 		 */
742 		sp->prev_domid = data[0];
743 		sp->prev_ctime = data[1];
744 		break;
745 
746 	case TRC_SCHED_SWITCH_INFNEXT:
747 		/*
748 		 * Info on next vCPU to be scheduled
749 		 *
750 		 * data[0] = next domid
751 		 * data[1] = time spent waiting to get on cpu
752 		 * data[2] = time slice
753 		 */
754 		sp->next_domid = data[0];
755 		sp->next_wtime = data[1];
756 		sp->next_ts = data[2];
757 		break;
758 
759 	case TRC_SCHED_SWITCH:
760 		/*
761 		 * vCPU switch
762 		 *
763 		 * data[0] = prev domid
764 		 * data[1] = prev vcpuid
765 		 * data[2] = next domid
766 		 * data[3] = next vcpuid
767 		 */
768 
769 		/*
770 		 * Provide valid context for this probe if there
771 		 * wasn't one.
772 		 */
773 		if (!sp->curinfo_valid)
774 			xdt_update_domain_context(data[0], data[1]);
775 
776 		xdt_update_sched_context(cpuid, data[0], data[1]);
777 
778 		if (data[0] != sp->prev_domid &&
779 		    data[2] != sp->next_domid) {
780 			/* prev and next info don't match doms being sched'd */
781 			tbuf.stat_spurious_switch++;
782 			goto switchdone;
783 		}
784 
785 		sp->prev_vcpuid = data[1];
786 		sp->next_vcpuid = data[3];
787 
788 		XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid)?
789 		    XDT_SCHED_IDLE_OFF_CPU:XDT_SCHED_OFF_CPU,
790 		    sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);
791 
792 		XDT_PROBE4(IS_IDLE_DOM(sp->next_domid)?
793 		    XDT_SCHED_IDLE_ON_CPU:XDT_SCHED_ON_CPU,
794 		    sp->next_domid, sp->next_vcpuid, sp->next_wtime,
795 		    sp->next_ts);
796 switchdone:
797 		xdt_update_sched_context(cpuid, data[2], data[3]);
798 		xdt_update_domain_context(data[2], data[3]);
799 
800 		break;
801 
802 	case TRC_SCHED_BLOCK:
803 		/*
804 		 * vCPU blocked
805 		 *
806 		 * data[0] = domid
807 		 * data[1] = vcpuid
808 		 */
809 		XDT_PROBE2(XDT_SCHED_BLOCK, data[0], data[1]);
810 		break;
811 
812 	case TRC_SCHED_SLEEP:
813 		/*
814 		 * Put vCPU to sleep
815 		 *
816 		 * data[0] = domid
817 		 * data[1] = vcpuid
818 		 */
819 		XDT_PROBE2(XDT_SCHED_SLEEP, data[0], data[1]);
820 		break;
821 
822 	case TRC_SCHED_WAKE:
823 		/*
824 		 * Wake up vCPU
825 		 *
826 		 * data[0] = domid
827 		 * data[1] = vcpuid
828 		 */
829 		XDT_PROBE2(XDT_SCHED_WAKE, data[0], data[1]);
830 		break;
831 
832 	case TRC_SCHED_YIELD:
833 		/*
834 		 * vCPU yielded
835 		 *
836 		 * data[0] = domid
837 		 * data[1] = vcpuid
838 		 */
839 		XDT_PROBE2(XDT_SCHED_YIELD, data[0], data[1]);
840 		break;
841 
842 	case TRC_SCHED_SHUTDOWN:
843 		/*
844 		 * Guest shutting down
845 		 *
846 		 * data[0] = domid
847 		 * data[1] = initiating vcpu
848 		 * data[2] = shutdown code
849 		 */
850 		switch (data[2]) {
851 		case SHUTDOWN_poweroff:
852 			eid = XDT_SCHED_SHUTDOWN_POWEROFF;
853 			break;
854 		case SHUTDOWN_reboot:
855 			eid = XDT_SCHED_SHUTDOWN_REBOOT;
856 			break;
857 		case SHUTDOWN_suspend:
858 			eid = XDT_SCHED_SHUTDOWN_SUSPEND;
859 			break;
860 		case SHUTDOWN_crash:
861 			eid = XDT_SCHED_SHUTDOWN_CRASH;
862 			break;
863 		default:
864 			tbuf.stat_unknown_shutdown++;
865 			goto done;
866 		}
867 
868 		XDT_PROBE2(eid, data[0], data[1]);
869 		break;
870 
871 	case TRC_SCHED_DOM_REM:
872 	case TRC_SCHED_CTL:
873 	case TRC_SCHED_S_TIMER_FN:
874 	case TRC_SCHED_T_TIMER_FN:
875 	case TRC_SCHED_DOM_TIMER_FN:
876 		/* unused */
877 		break;
878 	case TRC_SCHED_DOM_ADD:
879 		/*
880 		 * Add vcpu to a guest.
881 		 *
882 		 * data[0] = domid
883 		 * data[1] = vcpu
884 		 */
885 		XDT_PROBE2(XDT_SCHED_ADD_VCPU, data[0], data[1]);
886 		break;
887 	case TRC_SCHED_ADJDOM:
888 		/*
889 		 * Scheduling parameters for a guest
890 		 * were modified.
891 		 *
892 		 * data[0] = domid;
893 		 */
894 		XDT_PROBE1(XDT_SCHED_ADJDOM, data[1]);
895 		break;
896 	case TRC_SCHED_RUNSTATE_CHANGE:
897 		/*
898 		 * Runstate change for a VCPU.
899 		 *
900 		 * data[0] = (domain << 16) | vcpu;
901 		 * data[1] = oldstate;
902 		 * data[2] = newstate;
903 		 */
904 		XDT_PROBE4(XDT_SCHED_RUNSTATE_CHANGE, data[0] >> 16,
905 		    data[0] & 0xffff, data[1], data[2]);
906 		break;
907 	case TRC_SCHED_CONTINUE_RUNNING:
		/*
		 * VCPU is back on a physical CPU that was previously
		 * also running this VCPU.
		 *
		 * data[0] = (domain << 16) | vcpu;
		 */
914 		XDT_PROBE2(XDT_SCHED_CONTINUE_RUNNING, data[0] >> 16,
915 		    data[0] & 0xffff);
916 		break;
917 	/*
918 	 * Mem probes
919 	 */
920 	case TRC_MEM_PAGE_GRANT_MAP:
921 		/*
922 		 * Guest mapped page grant
923 		 *
924 		 * data[0] = target domid
925 		 */
926 		XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, data[0]);
927 		break;
928 
929 	case TRC_MEM_PAGE_GRANT_UNMAP:
930 		/*
931 		 * Guest unmapped page grant
932 		 *
933 		 * data[0] = target domid
934 		 */
935 		XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, data[0]);
936 		break;
937 
938 	case TRC_MEM_PAGE_GRANT_TRANSFER:
939 		/*
940 		 * Page grant is being transferred
941 		 *
942 		 * data[0] = target domid
943 		 */
944 		XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, data[0]);
945 		break;
946 
947 	/*
948 	 * Probes for PV domains.
949 	 */
950 	case TRC_PV_HYPERCALL:
951 		/*
952 		 * Hypercall from a 32-bit PV domain.
953 		 *
954 		 * data[0] = eip
955 		 * data[1] = eax
956 		 */
957 		XDT_PROBE2(XDT_PV_HYPERCALL, data[0], data[1]);
958 		break;
959 	case TRC_PV_HYPERCALL | TRC_64_FLAG:
960 		/*
961 		 * Hypercall from a 64-bit PV domain.
962 		 *
963 		 * data[0] = rip(0:31)
964 		 * data[1] = rip(32:63)
965 		 * data[2] = eax;
966 		 */
967 		rip64 = (((uint64_t)data[1]) << 32) | data[0];
968 		XDT_PROBE2(XDT_PV_HYPERCALL, rip64, data[2]);
969 		break;
970 	case TRC_PV_TRAP:
971 		/*
972 		 * Trap in a 32-bit PV domain.
973 		 *
974 		 * data[0] = eip
975 		 * data[1] = trapnr | (error_code_valid << 15)
976 		 * 	| (error_code << 16);
977 		 */
978 		XDT_PROBE4(XDT_PV_TRAP, data[0], data[1] & 0x7fff,
979 		    (data[1] >> 15) & 1, data[1] >> 16);
980 		break;
981 	case TRC_PV_TRAP | TRC_64_FLAG:
982 		/*
983 		 * Trap in a 64-bit PV domain.
984 		 *
985 		 * data[0] = rip(0:31)
986 		 * data[1] = rip(32:63)
987 		 * data[2] = trapnr | (error_code_valid << 15)
988 		 * 	| (error_code << 16);
989 		 */
990 		rip64 = (((uint64_t)data[1]) << 32) | data[2];
991 		XDT_PROBE4(XDT_PV_TRAP, rip64, data[2] & 0x7fff,
992 		    (data[2] >> 15) & 1, data[2] >> 16);
993 		break;
994 	case TRC_PV_PAGE_FAULT:
995 		/*
996 		 * Page fault in a 32-bit PV domain.
997 		 *
998 		 * data[0] = eip
999 		 * data[1] = vaddr
1000 		 * data[2] = error code
1001 		 */
1002 		XDT_PROBE3(XDT_PV_PAGE_FAULT, data[0], data[1], data[2]);
1003 		break;
1004 	case TRC_PV_PAGE_FAULT | TRC_64_FLAG:
1005 		/*
1006 		 * Page fault in a 64-bit PV domain.
1007 		 *
1008 		 * data[0] = rip(0:31)
1009 		 * data[1] = rip(32:63)
1010 		 * data[2] = vaddr(0:31)
1011 		 * data[3] = vaddr(32:63)
1012 		 * data[4] = error code
1013 		 */
1014 		rip64 = (((uint64_t)data[1]) << 32) | data[0];
1015 		addr64 = (((uint64_t)data[3]) << 32) | data[2];
1016 		XDT_PROBE3(XDT_PV_PAGE_FAULT, rip64, addr64, data[4]);
1017 		break;
1018 	case TRC_PV_FORCED_INVALID_OP:
1019 		/*
1020 		 * Hypervisor emulated a forced invalid op (ud2)
1021 		 * in a 32-bit PV domain.
1022 		 *
1023 		 * data[1] = eip
1024 		 */
1025 		XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, data[1]);
1026 		break;
1027 	case TRC_PV_FORCED_INVALID_OP | TRC_64_FLAG:
1028 		/*
1029 		 * Hypervisor emulated a forced invalid op (ud2)
1030 		 * in a 64-bit PV domain.
1031 		 *
1032 		 * data[1] = rip(0:31)
1033 		 * data[2] = rip(32:63)
1034 		 *
1035 		 */
1036 		rip64 = (((uint64_t)data[2]) << 32) | data[1];
1037 		XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, rip64);
1038 		break;
1039 	case TRC_PV_EMULATE_PRIVOP:
1040 		/*
1041 		 * Hypervisor emulated a privileged operation
1042 		 * in a 32-bit PV domain.
1043 		 *
1044 		 * data[0] = eip
1045 		 */
1046 		XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, data[0]);
1047 		break;
1048 	case TRC_PV_EMULATE_PRIVOP | TRC_64_FLAG:
1049 		/*
1050 		 * Hypervisor emulated a privileged operation
1051 		 * in a 64-bit PV domain.
1052 		 *
1053 		 * data[0] = rip(0:31)
1054 		 * data[1] = rip(32:63)
1055 		 */
1056 		rip64 = (((uint64_t)data[1]) << 32) | data[0];
1057 		XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, rip64);
1058 		break;
1059 	case TRC_PV_EMULATE_4GB:
1060 		/* unused, 32-bit hypervisor only */
1061 		break;
1062 	case TRC_PV_MATH_STATE_RESTORE:
1063 		/*
1064 		 * Hypervisor restores math state after FP DNA trap.
1065 		 *
1066 		 * No arguments.
1067 		 */
1068 		XDT_PROBE0(XDT_PV_MATH_STATE_RESTORE);
1069 		break;
1070 	case TRC_PV_PAGING_FIXUP:
1071 		/*
1072 		 * Hypervisor fixed up a page fault (e.g. it was
1073 		 * a side-effect of hypervisor guest page table
1074 		 * bookkeeping, and not propagated to the guest).
1075 		 *
1076 		 * data[0] = eip
1077 		 * data[1] = vaddr
1078 		 */
1079 		XDT_PROBE2(XDT_PV_PAGING_FIXUP, data[0], data[2]);
1080 		break;
1081 	case TRC_PV_PAGING_FIXUP | TRC_64_FLAG:
1082 		/*
1083 		 * Hypervisor fixed up a page fault (e.g. it was
1084 		 * a side-effect of hypervisor guest page table
1085 		 * bookkeeping, and not propagated to the guest).
1086 		 *
1087 		 * data[0] = eip(0:31)
1088 		 * data[1] = eip(31:63)
1089 		 * data[2] = vaddr(0:31)
1090 		 * data[3] = vaddr(31:63)
1091 		 */
1092 		rip64 = (((uint64_t)data[1]) << 32) | data[0];
1093 		addr64 = (((uint64_t)data[3]) << 32) | data[2];
1094 		XDT_PROBE2(XDT_PV_PAGING_FIXUP, rip64, addr64);
1095 		break;
1096 	case TRC_PV_GDT_LDT_MAPPING_FAULT:
1097 		/*
1098 		 * Descriptor table mapping fault in a 32-bit PV domain.
1099 		 * data[0] = eip
1100 		 * data[1] = offset
1101 		 */
1102 		XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, data[0], data[1]);
1103 		break;
1104 	case TRC_PV_GDT_LDT_MAPPING_FAULT | TRC_64_FLAG:
1105 		/*
1106 		 * Descriptor table mapping fault in a 64-bit PV domain.
1107 		 *
1108 		 * data[0] = eip(0:31)
1109 		 * data[1] = eip(31:63)
1110 		 * data[2] = offset(0:31)
1111 		 * data[3] = offset(31:63)
1112 		 */
1113 		rip64 = (((uint64_t)data[1]) << 32) | data[0];
1114 		val64 = (((uint64_t)data[3]) << 32) | data[2];
1115 		XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, rip64, val64);
1116 		break;
1117 	case TRC_PV_PTWR_EMULATION:
1118 	case TRC_PV_PTWR_EMULATION_PAE | TRC_64_FLAG:
1119 		/*
1120 		 * Should only happen on 32-bit hypervisor; unused.
1121 		 */
1122 		break;
1123 	case TRC_PV_PTWR_EMULATION_PAE:
1124 		/*
1125 		 * PTE write emulation for a 32-bit PV domain.
1126 		 *
1127 		 * data[0] = pte
1128 		 * data[1] = addr
1129 		 * data[2] = eip
1130 		 */
1131 		XDT_PROBE3(XDT_PV_PTWR_EMULATION, data[0], data[1], data[2]);
1132 		break;
1133 	case TRC_PV_PTWR_EMULATION | TRC_64_FLAG:
1134 		/*
1135 		 * PTE write emulation for a 64-bit PV domain.
1136 		 *
1137 		 * data[0] = pte(0:31)
1138 		 * data[1] = pte(32:63)
1139 		 * data[2] = addr(0:31)
1140 		 * data[3] = addr(32:63)
1141 		 * data[4] = rip(0:31)
1142 		 * data[5] = rip(32:63)
1143 		 */
1144 		pte64 = (((uint64_t)data[1]) << 32) | data[0];
1145 		addr64 = (((uint64_t)data[3]) << 32) | data[2];
1146 		rip64 = (((uint64_t)data[5]) << 32) | data[4];
1147 		XDT_PROBE3(XDT_PV_PTWR_EMULATION, pte64, addr64, rip64);
1148 		break;
1149 
1150 	/*
1151 	 * HVM probes
1152 	 */
1153 	case TRC_HVM_VMENTRY:
1154 		/*
1155 		 * Return to guest via vmx_launch/vmrun
1156 		 *
1157 		 */
1158 		XDT_PROBE0(XDT_HVM_VMENTRY);
1159 		break;
1160 
1161 	case TRC_HVM_VMEXIT:
1162 		/*
1163 		 * Entry into VMEXIT handler from 32-bit HVM domain
1164 		 *
1165 		 * data[0] = cpu vendor specific exit code
1166 		 * data[1] = guest eip
1167 		 */
1168 		XDT_PROBE2(XDT_HVM_VMEXIT, data[0], data[1]);
1169 		break;
1170 	case TRC_HVM_VMEXIT64:
1171 		/*
1172 		 * Entry into VMEXIT handler from 64-bit HVM domain
1173 		 *
1174 		 * data[0] = cpu vendor specific exit code
1175 		 * data[1] = guest rip(0:31)
1176 		 * data[2] = guest rip(32:63)
1177 		 */
1178 		rip64 = (((uint64_t)data[2]) << 32) | data[1];
1179 		XDT_PROBE2(XDT_HVM_VMEXIT, data[0], rip64);
1180 		break;
1181 
1182 	case TRC_HVM_PF_XEN64:
1183 		/*
1184 		 * Pagefault in a guest that is a Xen (e.g. shadow)
1185 		 * artifact, and is not injected back into the guest.
1186 		 *
1187 		 * data[0] = error code
1188 		 * data[1] = guest VA(0:31)
1189 		 * data[2] = guest VA(32:64)
1190 		 */
1191 		addr64 = (((uint64_t)data[2]) << 32) | data[1];
1192 		XDT_PROBE2(XDT_HVM_PF_XEN, data[0], addr64);
1193 		break;
1194 
1195 	case TRC_HVM_PF_XEN:
1196 		/*
1197 		 * Same as above, but for a 32-bit HVM domain.
1198 		 *
1199 		 * data[0] = error code
1200 		 * data[1] = guest VA
1201 		 */
1202 		XDT_PROBE2(XDT_HVM_PF_XEN, data[0], data[1]);
1203 		break;
1204 
1205 	case TRC_HVM_PF_INJECT:
1206 		/*
1207 		 * 32-bit Xen only.
1208 		 */
1209 		break;
1210 	case TRC_HVM_PF_INJECT64:
1211 		/*
1212 		 * Pagefault injected back into a guest (e.g. the shadow
1213 		 * code found no mapping).
1214 		 *
1215 		 * data[0] = error code
1216 		 * data[1] = guest VA(0:31)
1217 		 * data[2] = guest VA(32:64)
1218 		 */
1219 		addr64 = (((uint64_t)data[2]) << 32) | data[1];
1220 		XDT_PROBE2(XDT_HVM_PF_INJECT, data[0], addr64);
1221 		break;
1222 
1223 	case TRC_HVM_INJ_EXC:
1224 		/*
1225 		 * Exception injected into an HVM guest.
1226 		 *
1227 		 * data[0] = trap
1228 		 * data[1] = error code
1229 		 */
1230 		XDT_PROBE2(XDT_HVM_EXC_INJECT, data[0], data[1]);
1231 		break;
1232 	case TRC_HVM_INJ_VIRQ:
1233 		/*
1234 		 * Interrupt inject into an HVM guest.
1235 		 *
1236 		 * data[0] = vector
1237 		 */
1238 		XDT_PROBE1(XDT_HVM_VIRQ_INJECT, data[0]);
1239 		break;
1240 	case TRC_HVM_REINJ_VIRQ:
1241 	case TRC_HVM_IO_READ:
1242 	case TRC_HVM_IO_WRITE:
1243 		/* unused */
1244 		break;
1245 	case TRC_HVM_CR_READ64:
1246 		/*
1247 		 * Control register read. Intel VMX only.
1248 		 *
1249 		 * data[0] = control register #
1250 		 * data[1] = value(0:31)
1251 		 * data[2] = value(32:63)
1252 		 */
1253 		val64 = (((uint64_t)data[2]) << 32) | data[1];
1254 		XDT_PROBE2(XDT_HVM_CR_READ, data[0], val64);
1255 		break;
1256 	case TRC_HVM_CR_READ:
1257 		/*
1258 		 * unused (32-bit Xen only)
1259 		 */
1260 		break;
1261 	case TRC_HVM_CR_WRITE64:
1262 		/*
1263 		 * Control register write. Intel VMX only.
1264 		 *
1265 		 * data[0] = control register #
1266 		 * data[1] = value(0:31)
1267 		 * data[2] = value(32:63)
1268 		 */
1269 		val64 = (((uint64_t)data[2]) << 32) | data[1];
1270 		XDT_PROBE2(XDT_HVM_CR_READ, data[0], val64);
1271 		break;
1272 	case TRC_HVM_CR_WRITE:
1273 		/*
1274 		 * unused (32-bit Xen only)
1275 		 */
1276 		break;
1277 	case TRC_HVM_DR_READ:
1278 		/*
1279 		 * unused.
1280 		 *
1281 		 * data[0] = (domid<<16 + vcpuid)
1282 		 */
1283 		break;
1284 	case TRC_HVM_DR_WRITE:
1285 		/*
1286 		 * Debug register write. Not too useful; no values,
1287 		 * so we ignore this.
1288 		 *
1289 		 * data[0] = (domid<<16 + vcpuid)
1290 		 */
1291 		break;
1292 	case TRC_HVM_MSR_READ:
1293 		/*
1294 		 * MSR read.
1295 		 *
1296 		 * data[0] = MSR
1297 		 * data[1] = value(0:31)
1298 		 * data[2] = value(32:63)
1299 		 */
1300 		val64 = (((uint64_t)data[3]) << 32) | data[2];
1301 		XDT_PROBE2(XDT_HVM_MSR_READ, data[0], val64);
1302 		break;
1303 	case TRC_HVM_MSR_WRITE:
1304 		/*
1305 		 * MSR write.
1306 		 *
1307 		 * data[0] = MSR;
1308 		 * data[1] = value(0:31)
1309 		 * data[2] = value(32:63)
1310 		 */
1311 		val64 = (((uint64_t)data[2]) << 32) | data[1];
1312 		XDT_PROBE2(XDT_HVM_MSR_WRITE, data[0], val64);
1313 		break;
1314 	case TRC_HVM_CPUID:
1315 		/*
1316 		 * CPUID insn.
1317 		 *
1318 		 * data[0] = %eax (input)
1319 		 * data[1] = %eax
1320 		 * data[2] = %ebx
1321 		 * data[3] = %ecx
1322 		 * data[4] = %edx
1323 		 */
1324 		XDT_PROBE5(XDT_HVM_CPUID, data[0], data[1], data[2], data[3],
1325 		    data[4]);
1326 		break;
1327 	case TRC_HVM_INTR:
1328 		/*
1329 		 * VMEXIT because of an interrupt.
1330 		 */
1331 		XDT_PROBE0(XDT_HVM_INTR);
1332 		break;
1333 	case TRC_HVM_INTR_WINDOW:
1334 		/*
1335 		 * VMEXIT because of an interrupt window (an interrupt
1336 		 * can't be delivered immediately to a HVM guest and must
1337 		 * be delayed).
1338 		 *
1339 		 * data[0] = vector
1340 		 * data[1] = source
1341 		 * data[2] = info
1342 		 */
1343 		XDT_PROBE3(XDT_HVM_INTR_WINDOW, data[0], data[1], data[2]);
1344 		break;
1345 	case TRC_HVM_NMI:
1346 		/*
1347 		 * VMEXIT because of an NMI.
1348 		 */
1349 		XDT_PROBE0(XDT_HVM_NMI);
1350 		break;
1351 	case TRC_HVM_SMI:
1352 		/*
1353 		 * VMEXIT because of an SMI
1354 		 */
1355 		XDT_PROBE0(XDT_HVM_SMI);
1356 		break;
1357 	case TRC_HVM_VMMCALL:
1358 		/*
1359 		 * VMMCALL insn.
1360 		 *
1361 		 * data[0] = %eax
1362 		 */
1363 		XDT_PROBE1(XDT_HVM_VMMCALL, data[0]);
1364 		break;
1365 	case TRC_HVM_HLT:
1366 		/*
1367 		 * HLT insn.
1368 		 *
1369 		 * data[0] = 1 if VCPU runnable, 0 if not
1370 		 */
1371 		XDT_PROBE1(XDT_HVM_HLT, data[0]);
1372 		break;
1373 	case TRC_HVM_INVLPG64:
1374 		/*
1375 		 *
1376 		 * data[0] = INVLPGA ? 1 : 0
1377 		 * data[1] = vaddr(0:31)
1378 		 * data[2] = vaddr(32:63)
1379 		 */
1380 		addr64 = (((uint64_t)data[2]) << 32) | data[1];
1381 		XDT_PROBE2(XDT_HVM_INVLPG, data[0], addr64);
1382 		break;
1383 	case TRC_HVM_INVLPG:
1384 		/*
1385 		 * unused (32-bit Xen only)
1386 		 *
1387 		 * data[0] = (domid<<16 + vcpuid)
1388 		 */
1389 		break;
1390 	case TRC_HVM_MCE:
1391 		/*
1392 		 * #MCE VMEXIT
1393 		 *
1394 		 */
1395 		XDT_PROBE0(XDT_HVM_MCE);
1396 		break;
1397 	case TRC_HVM_IOPORT_READ:
1398 	case TRC_HVM_IOPORT_WRITE:
1399 	case TRC_HVM_IOMEM_READ:
1400 	case TRC_HVM_IOMEM_WRITE:
1401 		/*
1402 		 * data[0] = addr(0:31)
1403 		 * data[1] = addr(32:63)
1404 		 * data[2] = count
1405 		 * data[3] = size
1406 		 */
1407 		switch (rec->event) {
1408 		case TRC_HVM_IOPORT_READ:
1409 			eid = XDT_HVM_IOPORT_READ;
1410 			break;
1411 		case TRC_HVM_IOPORT_WRITE:
1412 			eid = XDT_HVM_IOPORT_WRITE;
1413 			break;
1414 		case TRC_HVM_IOMEM_READ:
1415 			eid = XDT_HVM_IOMEM_READ;
1416 			break;
1417 		case TRC_HVM_IOMEM_WRITE:
1418 			eid = XDT_HVM_IOMEM_WRITE;
1419 			break;
1420 		}
1421 		addr64 = (((uint64_t)data[1]) << 32) | data[0];
1422 		XDT_PROBE3(eid, addr64, data[2], data[3]);
1423 		break;
1424 	case TRC_HVM_CLTS:
1425 		/*
1426 		 * CLTS insn (Intel VMX only)
1427 		 */
1428 		XDT_PROBE0(XDT_HVM_CLTS);
1429 		break;
1430 	case TRC_HVM_LMSW64:
1431 		/*
1432 		 * LMSW insn.
1433 		 *
1434 		 * data[0] = value(0:31)
1435 		 * data[1] = value(32:63)
1436 		 */
1437 		val64 = (((uint64_t)data[1]) << 32) | data[0];
1438 		XDT_PROBE1(XDT_HVM_LMSW, val64);
1439 		break;
1440 	case TRC_HVM_LMSW:
1441 		/*
1442 		 * unused (32-bit Xen only)
1443 		 */
1444 		break;
1445 
1446 	/*
1447 	 * Shadow page table probes (mainly used for HVM domains
1448 	 * without hardware paging support).
1449 	 */
1450 	case TRC_SHADOW_NOT_SHADOW | SH_GUEST_32:
1451 		/*
1452 		 * data[0] = pte(0:31)
1453 		 * data[1] = pte(32:63)
1454 		 * data[2] = va
1455 		 * data[3] = flags
1456 		 */
1457 		pte64 = ((uint64_t)data[1] << 32) | data[0];
1458 		XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, data[2], data[3]);
1459 		break;
1460 	case TRC_SHADOW_NOT_SHADOW | SH_GUEST_PAE:
1461 	case TRC_SHADOW_NOT_SHADOW | SH_GUEST_64:
1462 		/*
1463 		 * data[0] = pte(0:31)
1464 		 * data[1] = pte(32:63)
1465 		 * data[2] = va(0:31)
1466 		 * data[3] = va(32:63)
1467 		 * data[4] = flags
1468 		 */
1469 		addr64 = ((uint64_t)data[2] << 32) | data[3];
1470 		pte64 = ((uint64_t)data[1] << 32) | data[0];
1471 		XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, addr64, data[4]);
1472 		break;
1473 	case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_32:
1474 		/*
1475 		 * data[0] = va
1476 		 */
1477 		XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, data[0]);
1478 		break;
1479 	case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_PAE:
1480 	case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_64:
1481 		/*
1482 		 * data[0] = va(0:31)
1483 		 * data[1] = va(32:63)
1484 		 */
1485 		addr64 = ((uint64_t)data[1] << 32) | data[0];
1486 		XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, addr64);
1487 		break;
1488 	case TRC_SHADOW_FAST_MMIO | SH_GUEST_32:
1489 		/*
1490 		 * data[0] = va
1491 		 */
1492 		XDT_PROBE1(XDT_SHADOW_FAST_MMIO, data[0]);
1493 		break;
1494 	case TRC_SHADOW_FAST_MMIO | SH_GUEST_PAE:
1495 	case TRC_SHADOW_FAST_MMIO | SH_GUEST_64:
1496 		/*
1497 		 * data[0] = va(0:31)
1498 		 * data[1] = va(32:63)
1499 		 */
1500 		addr64 = ((uint64_t)data[1] << 32) | data[0];
1501 		XDT_PROBE1(XDT_SHADOW_FAST_MMIO, addr64);
1502 		break;
1503 	case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_32:
1504 		/*
1505 		 * data[0] = va
1506 		 */
1507 		XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, data[0]);
1508 		break;
1509 	case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_PAE:
1510 	case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_64:
1511 		/*
1512 		 * data[0] = va(0:31)
1513 		 * data[1] = va(32:63)
1514 		 */
1515 		addr64 = ((uint64_t)data[1] << 32) | data[0];
1516 		XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, addr64);
1517 		break;
1518 	case TRC_SHADOW_MMIO | SH_GUEST_32:
1519 		/*
1520 		 * data[0] = va
1521 		 */
1522 		XDT_PROBE1(XDT_SHADOW_MMIO, data[0]);
1523 		break;
1524 	case TRC_SHADOW_MMIO | SH_GUEST_PAE:
1525 	case TRC_SHADOW_MMIO | SH_GUEST_64:
1526 		/*
1527 		 * data[0] = va(0:31)
1528 		 * data[1] = va(32:63)
1529 		 */
1530 		addr64 = ((uint64_t)data[1] << 32) | data[0];
1531 		XDT_PROBE1(XDT_SHADOW_MMIO, addr64);
1532 		break;
1533 	case TRC_SHADOW_FIXUP | SH_GUEST_32:
1534 		/*
1535 		 * data[0] = pte(0:31)
1536 		 * data[1] = pte(32:63)
1537 		 * data[2] = va
1538 		 * data[3] = flags
1539 		 */
1540 		pte64 = ((uint64_t)data[1] << 32) | data[0];
1541 		XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, data[2], data[3]);
1542 		break;
1543 	case TRC_SHADOW_FIXUP | SH_GUEST_64:
1544 	case TRC_SHADOW_FIXUP | SH_GUEST_PAE:
1545 		/*
1546 		 * data[0] = pte(0:31)
1547 		 * data[1] = pte(32:63)
1548 		 * data[2] = va(0:31)
1549 		 * data[3] = va(32:63)
1550 		 * data[4] = flags
1551 		 */
1552 		addr64 = ((uint64_t)data[2] << 32) | data[3];
1553 		pte64 = ((uint64_t)data[1] << 32) | data[0];
1554 		XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, addr64, data[4]);
1555 		break;
1556 	case TRC_SHADOW_DOMF_DYING | SH_GUEST_32:
1557 		/*
1558 		 * data[0] = va
1559 		 */
1560 		XDT_PROBE1(XDT_SHADOW_DOMF_DYING, data[0]);
1561 		break;
1562 	case TRC_SHADOW_DOMF_DYING | SH_GUEST_PAE:
1563 	case TRC_SHADOW_DOMF_DYING | SH_GUEST_64:
1564 		/*
1565 		 * data[0] = va(0:31)
1566 		 * data[1] = va(32:63)
1567 		 */
1568 		addr64 = ((uint64_t)data[1] << 32) | data[0];
1569 		XDT_PROBE1(XDT_SHADOW_DOMF_DYING, addr64);
1570 		break;
1571 	case TRC_SHADOW_EMULATE | SH_GUEST_32:
1572 		/*
1573 		 * data[0] = pte(0:31)
1574 		 * data[1] = pte(32:63)
1575 		 * data[2] = val(0:31)
1576 		 * data[3] = val(32:63)
1577 		 * data[4] = addr
1578 		 * data[5] = flags
1579 		 */
1580 		pte64 = ((uint64_t)data[1] << 32) | data[0];
1581 		val64 = ((uint64_t)data[3] << 32) | data[2];
1582 		XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, data[4],
1583 		    data[5] & 0x7fffffff, data[5] >> 29);
1584 		break;
1585 	case TRC_SHADOW_EMULATE | SH_GUEST_PAE:
1586 	case TRC_SHADOW_EMULATE | SH_GUEST_64:
1587 		/*
1588 		 * data[0] = pte(0:31)
1589 		 * data[1] = pte(32:63)
1590 		 * data[2] = val(0:31)
1591 		 * data[3] = val(32:63)
1592 		 * data[4] = addr(0:31)
1593 		 * data[5] = addr(32:63)
1594 		 * data[6] = flags
1595 		 */
1596 		pte64 = ((uint64_t)data[1] << 32) | data[0];
1597 		val64 = ((uint64_t)data[3] << 32) | data[2];
1598 		addr64 = ((uint64_t)data[5] << 32) | data[4];
1599 		XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, data[4],
1600 		    data[6] & 0x7fffffff, data[6] >> 29);
1601 		break;
1602 	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_32:
1603 		/*
1604 		 * data[0] = gfn
1605 		 * data[1] = vaddr
1606 		 */
1607 		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, data[0], data[1]);
1608 		break;
1609 	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_PAE:
1610 	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_64:
1611 		/*
1612 		 * data[0] = gfn(0:31)
1613 		 * data[1] = gfn(32:63)
1614 		 * data[2] = vaddr(0:31)
1615 		 * data[3] = vaddr(32:63)
1616 		 */
1617 		val64 = ((uint64_t)data[1] << 32) | data[0];
1618 		addr64 = ((uint64_t)data[3] << 32) | data[2];
1619 		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, val64, addr64);
1620 		break;
1621 	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_32:
1622 		/*
1623 		 * data[0] = gfn
1624 		 * data[1] = vaddr
1625 		 */
1626 		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, data[0],
1627 		    data[1]);
1628 		break;
1629 	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_PAE:
1630 	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_64:
1631 		/*
1632 		 * data[0] = gfn(0:31)
1633 		 * data[1] = gfn(32:63)
1634 		 * data[2] = vaddr(0:31)
1635 		 * data[3] = vaddr(32:63)
1636 		 */
1637 		val64 = ((uint64_t)data[1] << 32) | data[0];
1638 		addr64 = ((uint64_t)data[3] << 32) | data[2];
1639 		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, val64, addr64);
1640 		break;
1641 	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_32:
1642 		/*
1643 		 * data[0] = gfn
1644 		 * data[1] = vaddr
1645 		 */
1646 		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, data[0],
1647 		    data[1]);
1648 		break;
1649 	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_PAE:
1650 	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_64:
1651 		/*
1652 		 * data[0] = gfn(0:31)
1653 		 * data[1] = gfn(32:63)
1654 		 * data[2] = vaddr(0:31)
1655 		 * data[3] = vaddr(32:63)
1656 		 */
1657 		val64 = ((uint64_t)data[1] << 32) | data[0];
1658 		addr64 = ((uint64_t)data[3] << 32) | data[2];
1659 		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, val64,
1660 		    addr64);
1661 		break;
1662 	case TRC_SHADOW_WRMAP_BF:
1663 		/*
1664 		 * data[0] = gfn(0:31)
1665 		 * data[1] = gfn(32:63)
1666 		 */
1667 		val64 = ((uint64_t)data[1] << 32) | data[0];
1668 		XDT_PROBE1(XDT_SHADOW_WRMAP_BF, val64);
1669 		break;
1670 	case TRC_SHADOW_PREALLOC_UNPIN:
1671 		/*
1672 		 * data[0] = gfn(0:31)
1673 		 * data[1] = gfn(32:63)
1674 		 */
1675 		val64 = ((uint64_t)data[1] << 32) | data[0];
1676 		XDT_PROBE1(XDT_SHADOW_PREALLOC_UNPIN, val64);
1677 		break;
1678 	case TRC_SHADOW_RESYNC_FULL:
1679 		/*
1680 		 * data[0] = gmfn(0:31)
1681 		 * data[1] = gmfn(32:63)
1682 		 */
1683 		val64 = ((uint64_t)data[1] << 32) | data[0];
1684 		XDT_PROBE1(XDT_SHADOW_RESYNC_FULL, val64);
1685 		break;
1686 	case TRC_SHADOW_RESYNC_ONLY:
1687 		/*
1688 		 * data[0] = gmfn(0:31)
1689 		 * data[1] = gmfn(32:63)
1690 		 */
1691 		val64 = ((uint64_t)data[1] << 32) | data[0];
1692 		XDT_PROBE1(XDT_SHADOW_RESYNC_ONLY, val64);
1693 		break;
1694 
1695 	/*
1696 	 * Power management probes.
1697 	 */
1698 	case TRC_PM_FREQ_CHANGE:
1699 		/*
1700 		 * data[0] = old freq
1701 		 * data[1] = new freq
1702 		 */
1703 		XDT_PROBE2(XDT_PM_FREQ_CHANGE, data[0], data[1]);
1704 		break;
1705 	case TRC_PM_IDLE_ENTRY:
1706 		/*
1707 		 * data[0] = C-state
1708 		 * data[1] = time
1709 		 */
1710 		XDT_PROBE2(XDT_PM_IDLE_ENTRY, data[0], data[1]);
1711 		break;
1712 	case TRC_PM_IDLE_EXIT:
1713 		/*
1714 		 * data[0] = C-state
1715 		 * data[1] = time
1716 		 */
1717 		XDT_PROBE2(XDT_PM_IDLE_EXIT, data[0], data[1]);
1718 		break;
1719 	case TRC_LOST_RECORDS:
1720 		vcpu = data[1] >> 16;
1721 		dom = data[1] & 0xffff;
1722 		xdt_update_sched_context(cpuid, dom, vcpu);
1723 		xdt_update_domain_context(dom, vcpu);
1724 		XDT_PROBE1(XDT_TRC_LOST_RECORDS, cpuid);
1725 		tbuf.stat_dropped_recs++;
1726 		break;
1727 
1728 	default:
1729 		tbuf.stat_unknown_recs++;
1730 		break;
1731 	}
1732 
1733 done:
1734 	rec_size = 4 + (rec->cycles_included ? 8 : 0) + (rec->extra_u32 * 4);
1735 	return (rec_size);
1736 }
1737 
1738 /*
1739  * Scan all CPU buffers for the record with the lowest timestamp so
1740  * that the probes will fire in order.
1741  */
1742 static int
xdt_get_first_rec(uint_t * cpuidp,struct t_rec ** recp,uint32_t * consp)1743 xdt_get_first_rec(uint_t *cpuidp, struct t_rec **recp, uint32_t *consp)
1744 {
1745 	uint_t cpuid;
1746 	uint32_t prod, cons, offset;
1747 	struct t_rec *rec;
1748 	uint64_t minstamp = ~0ULL, stamp;
1749 	uintptr_t data;
1750 
1751 	for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
1752 		cons = tbuf.meta[cpuid]->cons;
1753 		prod = tbuf.meta[cpuid]->prod;
1754 		membar_consumer();
1755 		if (prod == cons)
1756 			continue;
1757 
1758 		offset = cons % tbuf_data_size;
1759 		data = (uintptr_t)tbuf.data[cpuid] + offset;
1760 		rec = (struct t_rec *)data;
1761 		ASSERT((caddr_t)rec < tbuf.va + (tbuf.size * (cpuid + 1)));
1762 
1763 		/*
1764 		 * All records that we know about have time cycles included.
1765 		 * If this record doesn't have them, assume it's a type
1766 		 * that we don't handle. Use a 0 time value, which will make
1767 		 * it get handled first (it will be thrown away).
1768 		 */
1769 		if (rec->cycles_included)
1770 			stamp = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
1771 			    | rec->u.cycles.cycles_lo;
1772 		else
1773 			stamp = 0;
1774 
1775 		if (stamp < minstamp) {
1776 			minstamp = stamp;
1777 			*cpuidp = cpuid;
1778 			*recp = rec;
1779 			*consp = cons;
1780 		}
1781 	}
1782 
1783 	if (minstamp != ~0ULL)
1784 		return (1);
1785 
1786 	return (0);
1787 }
1788 
1789 /*ARGSUSED*/
1790 static void
xdt_tbuf_scan(void * arg)1791 xdt_tbuf_scan(void *arg)
1792 {
1793 	uint32_t bytes_done, cons;
1794 	struct t_rec *rec;
1795 	xdt_schedinfo_t *sp;
1796 	uint_t nrecs, cpuid;
1797 
1798 	for (nrecs = 0;
1799 	    nrecs < xdt_max_recs && xdt_get_first_rec(&cpuid, &rec, &cons) > 0;
1800 	    nrecs++) {
1801 		xdt_curpcpu = cpuid;
1802 		sp = &xdt_cpu_schedinfo[cpuid];
1803 		if (sp->curinfo_valid)
1804 			xdt_update_domain_context(sp->cur_domid,
1805 			    sp->cur_vcpuid);
1806 
1807 		bytes_done = xdt_process_rec(cpuid, rec);
1808 		cons += bytes_done;
1809 		/*
1810 		 * cons and prod are incremented modulo (2 * tbuf_data_size).
1811 		 * See <xen/public/trace.h>.
1812 		 */
1813 		if (cons >= 2 * tbuf_data_size)
1814 			cons -= 2 * tbuf_data_size;
1815 		membar_exit();
1816 		tbuf.meta[cpuid]->cons = cons;
1817 	}
1818 }
1819 
1820 static void
xdt_cyclic_enable(void)1821 xdt_cyclic_enable(void)
1822 {
1823 	cyc_handler_t hdlr;
1824 	cyc_time_t when;
1825 
1826 	ASSERT(MUTEX_HELD(&cpu_lock));
1827 
1828 	hdlr.cyh_func = xdt_tbuf_scan;
1829 	hdlr.cyh_arg = NULL;
1830 	hdlr.cyh_level = CY_LOW_LEVEL;
1831 
1832 	when.cyt_interval = xdt_poll_nsec;
1833 	when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
1834 
1835 	xdt_cyclic = cyclic_add(&hdlr, &when);
1836 }
1837 
1838 static void
xdt_probe_create(xdt_probe_t * p)1839 xdt_probe_create(xdt_probe_t *p)
1840 {
1841 	ASSERT(p != NULL && p->pr_mod != NULL);
1842 
1843 	if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
1844 		return;
1845 
1846 	xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
1847 	    p->pr_name, dtrace_mach_aframes(), p);
1848 }
1849 
1850 /*ARGSUSED*/
1851 static void
xdt_provide(void * arg,const dtrace_probedesc_t * desc)1852 xdt_provide(void *arg, const dtrace_probedesc_t *desc)
1853 {
1854 	const char *mod, *name;
1855 	int i;
1856 
1857 	if (desc == NULL) {
1858 		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
1859 			xdt_probe_create(&xdt_probe[i]);
1860 		}
1861 	} else {
1862 		mod = desc->dtpd_mod;
1863 		name = desc->dtpd_name;
1864 		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
1865 			int l1 = strlen(xdt_probe[i].pr_name);
1866 			int l2 = strlen(xdt_probe[i].pr_mod);
1867 			if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
1868 			    strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
1869 				break;
1870 		}
1871 
1872 		if (xdt_probe[i].pr_mod == NULL)
1873 			return;
1874 		xdt_probe_create(&xdt_probe[i]);
1875 	}
1876 
1877 }
1878 
1879 /*ARGSUSED*/
1880 static void
xdt_destroy(void * arg,dtrace_id_t id,void * parg)1881 xdt_destroy(void *arg, dtrace_id_t id, void *parg)
1882 {
1883 	xdt_probe_t *p = parg;
1884 	xdt_prid[p->evt_id] = 0;
1885 }
1886 
1887 static void
xdt_set_trace_mask(uint32_t mask)1888 xdt_set_trace_mask(uint32_t mask)
1889 {
1890 	xen_sysctl_tbuf_op_t tbuf_op;
1891 
1892 	/* Always need to trace scheduling, for context */
1893 	if (mask != 0)
1894 		mask |= TRC_SCHED;
1895 	tbuf_op.evt_mask = mask;
1896 	tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_set_evt_mask;
1897 	(void) xdt_sysctl_tbuf(&tbuf_op);
1898 }
1899 
1900 /*ARGSUSED*/
1901 static int
xdt_enable(void * arg,dtrace_id_t id,void * parg)1902 xdt_enable(void *arg, dtrace_id_t id, void *parg)
1903 {
1904 	xdt_probe_t *p = parg;
1905 	xen_sysctl_tbuf_op_t tbuf_op;
1906 
1907 	ASSERT(MUTEX_HELD(&cpu_lock));
1908 	ASSERT(xdt_prid[p->evt_id] != 0);
1909 
1910 	xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
1911 	xdt_classinfo[p->class].cnt++;
1912 
1913 	if (xdt_classinfo[p->class].cnt == 1) {
1914 		/* set the trace mask for this class */
1915 		cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
1916 		xdt_set_trace_mask(cur_trace_mask);
1917 	}
1918 
1919 	if (xdt_cyclic == CYCLIC_NONE) {
1920 		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
1921 		if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
1922 			cmn_err(CE_NOTE, "Couldn't enable hypervisor tracing.");
1923 			return (-1);
1924 		}
1925 
1926 		xdt_cyclic_enable();
1927 	}
1928 	return (0);
1929 }
1930 
1931 /*ARGSUSED*/
1932 static void
xdt_disable(void * arg,dtrace_id_t id,void * parg)1933 xdt_disable(void *arg, dtrace_id_t id, void *parg)
1934 {
1935 	xdt_probe_t *p = parg;
1936 	xen_sysctl_tbuf_op_t tbuf_op;
1937 	int i, err;
1938 
1939 	ASSERT(MUTEX_HELD(&cpu_lock));
1940 	ASSERT(xdt_probemap[p->evt_id] != 0);
1941 	ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
1942 	ASSERT(xdt_classinfo[p->class].cnt > 0);
1943 
1944 	/*
1945 	 * We could be here in the slight window between the cyclic firing and
1946 	 * a call to dtrace_probe() occurring. We need to be careful if we tear
1947 	 * down any shared state.
1948 	 */
1949 
1950 	xdt_probemap[p->evt_id] = 0;
1951 	xdt_classinfo[p->class].cnt--;
1952 
1953 	if (xdt_nr_active_probes() == 0) {
1954 		cur_trace_mask = 0;
1955 
1956 		if (xdt_cyclic == CYCLIC_NONE)
1957 			return;
1958 
1959 		for (i = 0; i < xdt_ncpus; i++)
1960 			xdt_cpu_schedinfo[i].curinfo_valid = 0;
1961 
1962 		/*
1963 		 * We will try to disable the trace buffers. If we fail for some
1964 		 * reason we will try again, up to a count of XDT_TBUF_RETRY.
1965 		 * If we still aren't successful we try to set the trace mask
1966 		 * to 0 in order to prevent trace records from being written.
1967 		 */
1968 		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
1969 		i = 0;
1970 		do {
1971 			err = xdt_sysctl_tbuf(&tbuf_op);
1972 		} while ((err != 0) && (++i < XDT_TBUF_RETRY));
1973 
1974 		if (err != 0) {
1975 			cmn_err(CE_NOTE,
1976 			    "Couldn't disable hypervisor tracing.");
1977 			xdt_set_trace_mask(0);
1978 		} else {
1979 			cyclic_remove(xdt_cyclic);
1980 			xdt_cyclic = CYCLIC_NONE;
1981 			/*
1982 			 * We don't bother making the hypercall to set
1983 			 * the trace mask, since it will be reset when
1984 			 * tracing is re-enabled.
1985 			 */
1986 		}
1987 	} else if (xdt_classinfo[p->class].cnt == 0) {
1988 		cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
1989 		/* other probes are enabled, so add the sub-class mask back */
1990 		cur_trace_mask |= 0xF000;
1991 		xdt_set_trace_mask(cur_trace_mask);
1992 	}
1993 }
1994 
1995 static dtrace_pattr_t xdt_attr = {
1996 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
1997 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
1998 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
1999 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
2000 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
2001 };
2002 
2003 static dtrace_pops_t xdt_pops = {
2004 	xdt_provide,		/* dtps_provide() */
2005 	NULL,			/* dtps_provide_module() */
2006 	xdt_enable,		/* dtps_enable() */
2007 	xdt_disable,		/* dtps_disable() */
2008 	NULL,			/* dtps_suspend() */
2009 	NULL,			/* dtps_resume() */
2010 	NULL,			/* dtps_getargdesc() */
2011 	NULL,			/* dtps_getargval() */
2012 	NULL,			/* dtps_usermode() */
2013 	xdt_destroy		/* dtps_destroy() */
2014 };
2015 
2016 static int
xdt_attach(dev_info_t * devi,ddi_attach_cmd_t cmd)2017 xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
2018 {
2019 	int val;
2020 
2021 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
2022 		return (DDI_FAILURE);
2023 
2024 	switch (cmd) {
2025 	case DDI_ATTACH:
2026 		break;
2027 
2028 	case DDI_RESUME:
2029 		/*
2030 		 * We might support proper suspend/resume in the future, so,
2031 		 * return DDI_FAILURE for now.
2032 		 */
2033 		return (DDI_FAILURE);
2034 
2035 	default:
2036 		return (DDI_FAILURE);
2037 	}
2038 
2039 	xdt_ncpus = xpv_nr_phys_cpus();
2040 	ASSERT(xdt_ncpus > 0);
2041 
2042 	if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
2043 	    DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
2044 	    dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
2045 	    &xdt_pops, NULL, &xdt_id) != 0) {
2046 		if (tbuf.va != NULL)
2047 			xdt_detach_trace_buffers();
2048 		ddi_remove_minor_node(devi, NULL);
2049 		return (DDI_FAILURE);
2050 	}
2051 
2052 	val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
2053 	    "xdt_poll_nsec", XDT_POLL_DEFAULT);
2054 	xdt_poll_nsec = MAX(val, XDT_POLL_MIN);
2055 
2056 	xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_zalloc(xdt_ncpus *
2057 	    sizeof (xdt_schedinfo_t), KM_SLEEP);
2058 	xdt_init_trace_masks();
2059 	xdt_kstat_init();
2060 
2061 	xdt_devi = devi;
2062 	ddi_report_dev(devi);
2063 	return (DDI_SUCCESS);
2064 }
2065 
2066 static int
xdt_detach(dev_info_t * devi,ddi_detach_cmd_t cmd)2067 xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
2068 {
2069 	switch (cmd) {
2070 	case DDI_DETACH:
2071 		break;
2072 
2073 	case DDI_SUSPEND:
2074 		/*
2075 		 * We might support proper suspend/resume in the future. So
2076 		 * return DDI_FAILURE for now.
2077 		 */
2078 		return (DDI_FAILURE);
2079 
2080 	default:
2081 		return (DDI_FAILURE);
2082 	}
2083 
2084 	if (dtrace_unregister(xdt_id) != 0)
2085 		return (DDI_FAILURE);
2086 
2087 	xdt_detach_trace_buffers();
2088 	kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
2089 	if (xdt_cyclic != CYCLIC_NONE)
2090 		cyclic_remove(xdt_cyclic);
2091 	if (xdt_kstats != NULL)
2092 		kstat_delete(xdt_kstats);
2093 	xdt_devi = (void *)0;
2094 	ddi_remove_minor_node(devi, NULL);
2095 
2096 	return (DDI_SUCCESS);
2097 }
2098 
2099 /*ARGSUSED*/
2100 static int
xdt_info(dev_info_t * devi,ddi_info_cmd_t infocmd,void * arg,void ** result)2101 xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
2102 {
2103 	int error;
2104 
2105 	switch (infocmd) {
2106 	case DDI_INFO_DEVT2DEVINFO:
2107 		*result = xdt_devi;
2108 		error = DDI_SUCCESS;
2109 		break;
2110 	case DDI_INFO_DEVT2INSTANCE:
2111 		*result = (void *)0;
2112 		error = DDI_SUCCESS;
2113 		break;
2114 	default:
2115 		error = DDI_FAILURE;
2116 	}
2117 	return (error);
2118 }
2119 
2120 static struct cb_ops xdt_cb_ops = {
2121 	nulldev,		/* open(9E) */
2122 	nodev,			/* close(9E) */
2123 	nodev,			/* strategy(9E) */
2124 	nodev,			/* print(9E) */
2125 	nodev,			/* dump(9E) */
2126 	nodev,			/* read(9E) */
2127 	nodev,			/* write(9E) */
2128 	nodev,			/* ioctl(9E) */
2129 	nodev,			/* devmap(9E) */
2130 	nodev,			/* mmap(9E) */
2131 	nodev,			/* segmap(9E) */
2132 	nochpoll,		/* chpoll(9E) */
2133 	ddi_prop_op,		/* prop_op(9E) */
2134 	NULL,			/* streamtab(9S) */
2135 	D_MP | D_64BIT | D_NEW	/* cb_flag */
2136 };
2137 
2138 static struct dev_ops xdt_ops = {
2139 	DEVO_REV,		/* devo_rev */
2140 	0,			/* devo_refcnt */
2141 	xdt_info,		/* getinfo(9E) */
2142 	nulldev,		/* identify(9E) */
2143 	nulldev,		/* probe(9E) */
2144 	xdt_attach,		/* attach(9E) */
2145 	xdt_detach,		/* detach(9E) */
2146 	nulldev,		/* devo_reset */
2147 	&xdt_cb_ops,		/* devo_cb_ops */
2148 	NULL,			/* devo_bus_ops */
2149 	NULL,			/* power(9E) */
2150 	ddi_quiesce_not_needed,	/* devo_quiesce */
2151 };
2152 
2153 
2154 static struct modldrv modldrv = {
2155 	&mod_driverops,
2156 	"Hypervisor event tracing",
2157 	&xdt_ops
2158 };
2159 
2160 static struct modlinkage modlinkage = {
2161 	MODREV_1,
2162 	&modldrv,
2163 	NULL
2164 };
2165 
2166 int
_init(void)2167 _init(void)
2168 {
2169 	return (mod_install(&modlinkage));
2170 }
2171 
2172 int
_fini(void)2173 _fini(void)
2174 {
2175 	return (mod_remove(&modlinkage));
2176 }
2177 
2178 int
_info(struct modinfo * modinfop)2179 _info(struct modinfo *modinfop)
2180 {
2181 	return (mod_info(&modlinkage, modinfop));
2182 }
2183