/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Xen event provider for DTrace
 *
 * NOTE: This provider is PRIVATE. It is intended as a short-term solution and
 * may disappear or be re-implemented at any time.
 *
 * This provider isn't suitable as a general-purpose solution for a number of
 * reasons. First and foremost, we rely on the Xen tracing mechanism and don't
 * have any way to gather data other than that collected by the Xen trace
 * buffers. Further, it does not fit into the DTrace model (see "Interacting
 * with DTrace" below.)
 *
 *
 * Tracing in Xen
 * --------------
 *
 * Xen implements a tracing facility for generating and collecting execution
 * event traces from the hypervisor. When tracing is enabled, compiled-in
 * probes record events in contiguous per-CPU trace buffers.
 *
 *               +---------+
 * +------+      |         |
 * | CPUn |----> | BUFFERn |
 * +------+      |         |
 *               +---------+- tbuf.va + (tbuf.size * n)
 *               :         :
 *               +---------+
 * +------+      |         |
 * | CPU1 |----> | BUFFER1 |
 * +------+      |         |
 *               +---------+- tbuf.va + tbuf.size
 * +------+      |         |
 * | CPU0 |----> | BUFFER0 |
 * +------+      |         |
 *               +---------+- tbuf.va
 *
 * Each CPU buffer consists of a metadata header followed by the trace records.
 * The metadata consists of a producer/consumer pair of pointers into the buffer
 * that point to the next record to be written and the next record to be read
 * respectively.
 *
 * A trace record can be in one of two forms, depending on whether the TSC is
 * included. The record header indicates whether or not the TSC field is
 * present.
 *
 * 1. Trace record without TSC:
 * +------------------------------------------------------------+
 * | HEADER(uint32_t) |            DATA FIELDS                  |
 * +------------------------------------------------------------+
 *
 * 2. Trace record with TSC:
 * +--------------------------------------------------------------------------+
 * | HEADER(uint32_t) | TSC(uint64_t) |              DATA FIELDS              |
 * +--------------------------------------------------------------------------+
 *
 * Where,
 *
 * HEADER bit field:
 * +--------------------------------------------------------------------------+
 * | C |  NDATA  |                        EVENT                               |
 * +--------------------------------------------------------------------------+
 *  31  30     28 27                                                         0
 *
 * EVENT: Event ID.
 * NDATA: Number of populated data fields.
 *     C: TSC included.
 *
 * DATA FIELDS:
 * +--------------------------------------------------------------------------+
 * | D1(uint32_t) | D2(uint32_t) | D3(uint32_t) |     . . .    | D7(uint32_t) |
 * +--------------------------------------------------------------------------+
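 *
 * For example, a record with C set (TSC present) and NDATA = 3 occupies
 * 4 + 8 + (3 * 4) = 24 bytes (see the rec_size computation in
 * xdt_process_rec() below).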
 *
 *
 * Interacting with DTrace
 * -----------------------
 *
 * Every xdt_poll_nsec nanoseconds we poll the trace buffers for data and feed
 * each entry into dtrace_probe() with the corresponding probe ID for the event.
 * As a result of this periodic collection implementation, probe firings are
 * asynchronous. This is the only sensible way to implement this form of
 * provider, but because of its asynchronous nature, asking things like
 * "current CPU" and, more importantly, arbitrary questions about the context
 * surrounding the probe firing are not meaningful. So, consumers should not
 * attempt to infer anything beyond what is supplied via the probe arguments.
 */
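
/*
 * For example, an illustrative consumer invocation (the aggregation shown
 * is hypothetical) might be:
 *
 *	dtrace -n 'xdt:sched::off-cpu { @[arg0] = count(); }'
 *
 * which counts off-cpu firings by the previous domain ID (arg0).
 */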

#include <sys/xpv_user.h>

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/cmn_err.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/cyclic.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>

#include <sys/hypervisor.h>
#include <xen/public/trace.h>
#include <xen/public/sched.h>

#define	XDT_POLL_DEFAULT	100000000	/* default poll interval (ns) */
#define	XDT_POLL_MIN		10000000	/* min poll interval (ns) */
#define	XDT_TBUF_RETRY		50		/* tbuf disable retry count */

/*
 * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 * in the xVM gate.
 */
#define	IS_IDLE_DOM(domid)	((domid) == 0x7FFFU)

/* Macros to extract the domid and cpuid from an HVM trace data field */
#define	HVM_DOMID(d)		((d) >> 16)
#define	HVM_VCPUID(d)		((d) & 0xFFFF)

/* Flags for shadow page table events */
#define	SH_GUEST_32	0x000
#define	SH_GUEST_PAE	0x100
#define	SH_GUEST_64	0x200

#define	XDT_PROBE5(event, arg0, arg1, arg2, arg3, arg4) {		\
	dtrace_id_t id = xdt_probemap[event];				\
	if (id)								\
		dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);		\
}

#define	XDT_PROBE4(event, arg0, arg1, arg2, arg3) \
	XDT_PROBE5(event, arg0, arg1, arg2, arg3, 0)

#define	XDT_PROBE3(event, arg0, arg1, arg2) \
	XDT_PROBE5(event, arg0, arg1, arg2, 0, 0)

#define	XDT_PROBE2(event, arg0, arg1) \
	XDT_PROBE5(event, arg0, arg1, 0, 0, 0)

#define	XDT_PROBE1(event, arg0) \
	XDT_PROBE5(event, arg0, 0, 0, 0, 0)

#define	XDT_PROBE0(event) \
	XDT_PROBE5(event, 0, 0, 0, 0, 0)
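
/*
 * For example (illustrative), the TRC_SCHED_WAKE handler below fires its
 * probe as:
 *	XDT_PROBE2(XDT_SCHED_WAKE, domid, vcpuid);
 * which expands to a dtrace_probe() call only when the probe is enabled
 * in xdt_probemap[].
 */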

/* Probe classes */
#define	XDT_SCHED			0
#define	XDT_MEM				1
#define	XDT_HVM				2
#define	XDT_GEN				3
#define	XDT_PV				4
#define	XDT_SHADOW			5
#define	XDT_PM				6
#define	XDT_NCLASSES			7

/* Probe events */
#define	XDT_EVT_INVALID			(-(int)1)
#define	XDT_SCHED_OFF_CPU		0
#define	XDT_SCHED_ON_CPU		1
#define	XDT_SCHED_IDLE_OFF_CPU		2
#define	XDT_SCHED_IDLE_ON_CPU		3
#define	XDT_SCHED_BLOCK			4
#define	XDT_SCHED_SLEEP			5
#define	XDT_SCHED_WAKE			6
#define	XDT_SCHED_YIELD			7
#define	XDT_SCHED_SHUTDOWN_POWEROFF	8
#define	XDT_SCHED_SHUTDOWN_REBOOT	9
#define	XDT_SCHED_SHUTDOWN_SUSPEND	10
#define	XDT_SCHED_SHUTDOWN_CRASH	11
#define	XDT_MEM_PAGE_GRANT_MAP		12
#define	XDT_MEM_PAGE_GRANT_UNMAP	13
#define	XDT_MEM_PAGE_GRANT_TRANSFER	14
#define	XDT_HVM_VMENTRY			15
#define	XDT_HVM_VMEXIT			16
#define	XDT_TRC_LOST_RECORDS		17
#define	XDT_SCHED_ADD_VCPU		18
#define	XDT_SCHED_REM_VCPU		19	/* unused */
#define	XDT_SCHED_CTL			20	/* unused */
#define	XDT_SCHED_ADJDOM		21
#define	XDT_SCHED_S_TIMER_FN		22	/* unused */
#define	XDT_SCHED_T_TIMER_FN		23	/* unused */
#define	XDT_SCHED_DOM_TIMER_FN		24	/* unused */
#define	XDT_PV_HYPERCALL		25
#define	XDT_PV_TRAP			26
#define	XDT_PV_PAGE_FAULT		27
#define	XDT_PV_FORCED_INVALID_OP	28
#define	XDT_PV_EMULATE_PRIVOP		29
#define	XDT_PV_EMULATE_4GB		30	/* unused (32-bit HV only) */
#define	XDT_PV_MATH_STATE_RESTORE	31
#define	XDT_PV_PAGING_FIXUP		32
#define	XDT_PV_DT_MAPPING_FAULT		33
#define	XDT_PV_PTWR_EMULATION		34
#define	XDT_HVM_PF_XEN			35
#define	XDT_HVM_PF_INJECT		36
#define	XDT_HVM_EXC_INJECT		37
#define	XDT_HVM_VIRQ_INJECT		38
#define	XDT_HVM_VIRQ_REINJECT		39
#define	XDT_HVM_IO_READ			40	/* unused */
#define	XDT_HVM_IO_WRITE		41	/* unused */
#define	XDT_HVM_CR_READ			42
#define	XDT_HVM_CR_WRITE		43
#define	XDT_HVM_DR_READ			44	/* unused */
#define	XDT_HVM_DR_WRITE		45	/* unused */
#define	XDT_HVM_MSR_READ		46
#define	XDT_HVM_MSR_WRITE		47
#define	XDT_HVM_CPUID			48
#define	XDT_HVM_INTR			49
#define	XDT_HVM_INTR_WINDOW		50
#define	XDT_HVM_NMI			51
#define	XDT_HVM_SMI			52
#define	XDT_HVM_VMMCALL			53
#define	XDT_HVM_HLT			54
#define	XDT_HVM_INVLPG			55
#define	XDT_HVM_MCE			56
#define	XDT_HVM_IOPORT_READ		57
#define	XDT_HVM_IOPORT_WRITE		58
#define	XDT_HVM_CLTS			59
#define	XDT_HVM_LMSW			60
#define	XDT_HVM_IOMEM_READ		61
#define	XDT_HVM_IOMEM_WRITE		62
#define	XDT_SHADOW_NOT_SHADOW			63
#define	XDT_SHADOW_FAST_PROPAGATE		64
#define	XDT_SHADOW_FAST_MMIO			65
#define	XDT_SHADOW_FALSE_FAST_PATH		66
#define	XDT_SHADOW_MMIO				67
#define	XDT_SHADOW_FIXUP			68
#define	XDT_SHADOW_DOMF_DYING			69
#define	XDT_SHADOW_EMULATE			70
#define	XDT_SHADOW_EMULATE_UNSHADOW_USER	71
#define	XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ	72
#define	XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED	73
#define	XDT_SHADOW_WRMAP_BF			74
#define	XDT_SHADOW_PREALLOC_UNPIN		75
#define	XDT_SHADOW_RESYNC_FULL			76
#define	XDT_SHADOW_RESYNC_ONLY			77
#define	XDT_PM_FREQ_CHANGE		78
#define	XDT_PM_IDLE_ENTRY		79
#define	XDT_PM_IDLE_EXIT		80
#define	XDT_SCHED_RUNSTATE_CHANGE	81
#define	XDT_SCHED_CONTINUE_RUNNING	82
#define	XDT_NEVENTS			83

typedef struct {
	const char	*pr_mod;	/* probe module */
	const char	*pr_name;	/* probe name */
	int		evt_id;		/* event id */
	uint_t		class;		/* probe class */
} xdt_probe_t;

typedef struct {
	uint32_t	trc_mask;	/* trace mask */
	uint32_t	cnt;		/* num enabled probes in class */
} xdt_classinfo_t;

typedef struct {
	ulong_t prev_domid;		/* previous dom executed */
	ulong_t prev_vcpuid;		/* previous vcpu executed */
	ulong_t prev_ctime;		/* time spent on cpu */
	ulong_t next_domid;		/* next dom to be scheduled */
	ulong_t next_vcpuid;		/* next vcpu to be scheduled */
	ulong_t next_wtime;		/* time spent waiting to get on cpu */
	ulong_t next_ts;		/* allocated time slice */
	ulong_t cur_domid;		/* current dom */
	ulong_t cur_vcpuid;		/* current vcpuid */
	int curinfo_valid;		/* info is valid */
} xdt_schedinfo_t;

static struct {
	uint_t cnt;			/* total num of trace buffers */
	size_t size;			/* size of each cpu buffer */
	mfn_t start_mfn;		/* starting mfn of buffers */
	caddr_t va;			/* va buffers are mapped into */

	/* per-cpu buffers */
	struct t_buf **meta;		/* buffer metadata */
	struct t_rec **data;		/* buffer data records */

	/* statistics */
	uint64_t stat_dropped_recs;	/* records dropped */
	uint64_t stat_spurious_cpu;	/* recs with garbage cpuids */
	uint64_t stat_spurious_switch;	/* inconsistent vcpu switches */
	uint64_t stat_unknown_shutdown;	/* unknown shutdown code */
	uint64_t stat_unknown_recs;	/* unknown records */
} tbuf;

static size_t tbuf_data_size;

static char *xdt_stats[] = {
	"dropped_recs",
};

/*
 * Tunable variables
 *
 * The following may be tuned by adding a line to /etc/system that
 * includes both the name of the module ("xdt") and the name of the variable.
 * For example:
 *     set xdt:xdt_tbuf_pages = 40
 */
uint_t xdt_tbuf_pages = 20;			/* pages to alloc per-cpu buf */

/*
 * The following may be tuned by adding a line to
 * /platform/i86xpv/kernel/drv/xdt.conf.
 * For example:
 *     xdt_poll_nsec = 200000000;
 */
static hrtime_t xdt_poll_nsec;			/* trace buffer poll interval */

/*
 * Another tunable variable: the maximum number of records to process
 * in one scan. If it is 0 (e.g. not set in /etc/system), it will
 * be set to ncpu * (bufsize / max_rec_size).
 *
 * Having an upper limit avoids a situation where the scan would loop
 * endlessly in case the hypervisor adds records quicker than we
 * can process them. It's better to drop records than to loop, obviously.
 */
uint_t xdt_max_recs = 0;

/*
 * Internal variables
 */
static dev_info_t *xdt_devi;
static dtrace_provider_id_t xdt_id;
static uint_t xdt_ncpus;			/* total number of phys CPUs */
static uint32_t cur_trace_mask;			/* current trace mask */
static xdt_schedinfo_t *xdt_cpu_schedinfo;	/* per-cpu sched info */
dtrace_id_t xdt_probemap[XDT_NEVENTS];		/* map of enabled probes */
dtrace_id_t xdt_prid[XDT_NEVENTS];		/* IDs of registered events */
static cyclic_id_t xdt_cyclic = CYCLIC_NONE;
static kstat_t *xdt_kstats;
static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES];

/*
 * These provide context when probes fire. They can be accessed
 * from an xdt DTrace probe (as `xdt_curdom, etc.). It's ok for these
 * to be global, and not per-cpu, as probes are run strictly in sequence
 * as the trace buffers are processed.
 */
uint_t xdt_curdom, xdt_curvcpu, xdt_curpcpu;
uint64_t xdt_timestamp;

static xdt_probe_t xdt_probe[] = {
	/* Sched probes */
	{ "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
	{ "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
	{ "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
	{ "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
	{ "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
	{ "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
	{ "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
	{ "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
	{ "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
		XDT_SCHED },
	{ "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
	{ "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
	{ "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },
	{ "sched", "add", XDT_SCHED_ADD_VCPU, XDT_SCHED },
	{ "sched", "runstate-change", XDT_SCHED_RUNSTATE_CHANGE, XDT_SCHED },
	{ "sched", "continue-running", XDT_SCHED_CONTINUE_RUNNING, XDT_SCHED },

	/* Memory probes */
	{ "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
	{ "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
	{ "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },

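	/* PV probes */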
	{ "pv", "hypercall", XDT_PV_HYPERCALL, XDT_PV },
	{ "pv", "trap", XDT_PV_TRAP, XDT_PV },
	{ "pv", "page-fault", XDT_PV_PAGE_FAULT, XDT_PV },
	{ "pv", "forced-invalid-op", XDT_PV_FORCED_INVALID_OP, XDT_PV },
	{ "pv", "emulate-priv-op", XDT_PV_EMULATE_PRIVOP, XDT_PV },
	{ "pv", "math-state-restore", XDT_PV_MATH_STATE_RESTORE, XDT_PV },
	{ "pv", "paging-fixup", XDT_PV_PAGING_FIXUP, XDT_PV },
	{ "pv", "dt-mapping-fault", XDT_PV_DT_MAPPING_FAULT, XDT_PV },
	{ "pv", "pte-write-emul", XDT_PV_PTWR_EMULATION, XDT_PV },

	/* HVM probes */
	{ "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
	{ "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },
	{ "hvm", "pagefault-xen", XDT_HVM_PF_XEN, XDT_HVM },
	{ "hvm", "pagefault-inject", XDT_HVM_PF_INJECT, XDT_HVM },
	{ "hvm", "exception-inject", XDT_HVM_EXC_INJECT, XDT_HVM },
	{ "hvm", "virq-inject", XDT_HVM_VIRQ_INJECT, XDT_HVM },
	{ "hvm", "cr-read", XDT_HVM_CR_READ, XDT_HVM },
	{ "hvm", "cr-write", XDT_HVM_CR_WRITE, XDT_HVM },
	{ "hvm", "msr-read", XDT_HVM_MSR_READ, XDT_HVM },
	{ "hvm", "msr-write", XDT_HVM_MSR_WRITE, XDT_HVM },
	{ "hvm", "cpuid", XDT_HVM_CPUID, XDT_HVM },
	{ "hvm", "intr", XDT_HVM_INTR, XDT_HVM },
	{ "hvm", "intr-window", XDT_HVM_INTR_WINDOW, XDT_HVM },
	{ "hvm", "nmi", XDT_HVM_NMI, XDT_HVM },
	{ "hvm", "smi", XDT_HVM_SMI, XDT_HVM },
	{ "hvm", "vmmcall", XDT_HVM_VMMCALL, XDT_HVM },
	{ "hvm", "hlt", XDT_HVM_HLT, XDT_HVM },
	{ "hvm", "invlpg", XDT_HVM_INVLPG, XDT_HVM },
	{ "hvm", "mce", XDT_HVM_MCE, XDT_HVM },
	{ "hvm", "pio-read", XDT_HVM_IOPORT_READ, XDT_HVM },
	{ "hvm", "pio-write", XDT_HVM_IOPORT_WRITE, XDT_HVM },
	{ "hvm", "mmio-read", XDT_HVM_IOMEM_READ, XDT_HVM },
	{ "hvm", "mmio-write", XDT_HVM_IOMEM_WRITE, XDT_HVM },
	{ "hvm", "clts", XDT_HVM_CLTS, XDT_HVM },
	{ "hvm", "lmsw", XDT_HVM_LMSW, XDT_HVM },

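	/* Shadow page table probes */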
	{ "shadow", "fault-not-shadow", XDT_SHADOW_NOT_SHADOW, XDT_SHADOW },
	{ "shadow", "fast-propagate", XDT_SHADOW_FAST_PROPAGATE, XDT_SHADOW },
	{ "shadow", "fast-mmio", XDT_SHADOW_FAST_MMIO, XDT_SHADOW },
	{ "shadow", "false-fast-path", XDT_SHADOW_FALSE_FAST_PATH,
	    XDT_SHADOW },
	{ "shadow", "mmio", XDT_SHADOW_MMIO, XDT_SHADOW },
	{ "shadow", "fixup", XDT_SHADOW_FIXUP, XDT_SHADOW },
	{ "shadow", "domf-dying", XDT_SHADOW_DOMF_DYING, XDT_SHADOW },
	{ "shadow", "emulate", XDT_SHADOW_EMULATE, XDT_SHADOW },
	{ "shadow", "emulate-unshadow-user", XDT_SHADOW_EMULATE_UNSHADOW_USER,
	    XDT_SHADOW },
	{ "shadow", "emulate-unshadow-evtinj",
	    XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, XDT_SHADOW },
	{ "shadow", "emulate-unshadow-unhandled",
	    XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, XDT_SHADOW },
	{ "shadow", "wrmap-bf", XDT_SHADOW_WRMAP_BF, XDT_SHADOW },
	{ "shadow", "prealloc-unpin", XDT_SHADOW_PREALLOC_UNPIN, XDT_SHADOW },
	{ "shadow", "resync-full", XDT_SHADOW_RESYNC_FULL, XDT_SHADOW },
	{ "shadow", "resync-only", XDT_SHADOW_RESYNC_ONLY, XDT_SHADOW },

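	/* Power management probes */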
	{ "pm", "freq-change", XDT_PM_FREQ_CHANGE, XDT_PM },
	{ "pm", "idle-entry", XDT_PM_IDLE_ENTRY, XDT_PM },
	{ "pm", "idle-exit", XDT_PM_IDLE_EXIT, XDT_PM },

	/* Trace buffer related probes */
	{ "trace", "records-lost", XDT_TRC_LOST_RECORDS, XDT_GEN },

	{ NULL }
};

static inline uint32_t
xdt_nr_active_probes()
{
	int i;
	uint32_t tot = 0;

	for (i = 0; i < XDT_NCLASSES; i++)
		tot += xdt_classinfo[i].cnt;

	return (tot);
}

static void
xdt_init_trace_masks(void)
{
	xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
	xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
	xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
	xdt_classinfo[XDT_GEN].trc_mask = TRC_GEN;
	xdt_classinfo[XDT_PV].trc_mask = TRC_PV;
	xdt_classinfo[XDT_SHADOW].trc_mask = TRC_SHADOW;
	xdt_classinfo[XDT_PM].trc_mask = TRC_PM;
}

static int
xdt_kstat_update(kstat_t *ksp, int flag)
{
	kstat_named_t *knp;

	if (flag != KSTAT_READ)
		return (EACCES);

	knp = ksp->ks_data;

	/*
	 * Assignment order should match that of the names in
	 * xdt_stats.
	 */
	(knp++)->value.ui64 = tbuf.stat_dropped_recs;

	return (0);
}

static void
xdt_kstat_init(void)
{
	int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
	char **cp = xdt_stats;
	kstat_named_t *knp;

	if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
	    KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
		return;

	xdt_kstats->ks_update = xdt_kstat_update;

	knp = xdt_kstats->ks_data;
	while (nstats > 0) {
		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
		knp++;
		cp++;
		nstats--;
	}

	kstat_install(xdt_kstats);
}

static int
xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
{
	xen_sysctl_t op;
	int xerr;

	op.cmd = XEN_SYSCTL_tbuf_op;
	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
	op.u.tbuf_op = *tbuf_op;

	if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
		return (xen_xlate_errcode(xerr));

	*tbuf_op = op.u.tbuf_op;
	return (0);
}

static int
xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
{
	x86pte_t pte;
	caddr_t const sva = va;
	caddr_t const eva = va + len;
	int xerr;

	ASSERT(mfn != MFN_INVALID);
	ASSERT(va != NULL);
	ASSERT(IS_PAGEALIGNED(len));

	for (; va < eva; va += MMU_PAGESIZE) {
		/*
		 * Ask the HAT to load a throwaway mapping to page zero, then
		 * overwrite it with the hypervisor mapping. It gets removed
		 * later via hat_unload().
		 */
		hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
		    PROT_READ | HAT_UNORDERED_OK,
		    HAT_LOAD_NOCONSIST | HAT_LOAD);

		pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
		    | PT_FOREIGN | PT_WRITABLE;

		xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
		    pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);

		if (xerr != 0) {
			/* unmap pages loaded so far */
			size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
			    (uintptr_t)sva;
			hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
			return (xen_xlate_errcode(xerr));
		}

		mfn++;
	}

	return (0);
}

static int
xdt_attach_trace_buffers(void)
{
	xen_sysctl_tbuf_op_t tbuf_op;
	size_t len;
	int err;
	uint_t i;

	/*
	 * Xen does not support trace buffer re-sizing. If the buffers
	 * have already been allocated we just use them as is.
	 */
	tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
	if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
		return (err);

	if (tbuf_op.size == 0) {
		/* set trace buffer size */
		tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_set_size;
		tbuf_op.size = xdt_tbuf_pages;
		(void) xdt_sysctl_tbuf(&tbuf_op);

		/* get trace buffer info */
		tbuf_op.cmd  = XEN_SYSCTL_TBUFOP_get_info;
		if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
			return (err);

		if (tbuf_op.size == 0) {
			cmn_err(CE_NOTE, "Couldn't allocate trace buffers.");
			return (ENOBUFS);
		}
	}

	tbuf.size = tbuf_op.size;
	tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
	tbuf.cnt = xdt_ncpus;

	ASSERT(tbuf.start_mfn != MFN_INVALID);
	ASSERT(tbuf.cnt > 0);

	len = tbuf.size * tbuf.cnt;
	tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);

	if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
		vmem_free(heap_arena, tbuf.va, len);
		tbuf.va = NULL;
		return (err);
	}

	tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
	    KM_SLEEP);
	tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
	    KM_SLEEP);

	for (i = 0; i < tbuf.cnt; i++) {
		void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
		tbuf.meta[i] = cpu_buf;
		tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
		    sizeof (struct t_buf));

		/* throw away stale trace records */
		tbuf.meta[i]->cons = tbuf.meta[i]->prod;
	}

	tbuf_data_size = tbuf.size - sizeof (struct t_buf);
	if (xdt_max_recs == 0)
		xdt_max_recs = (xdt_ncpus * tbuf_data_size)
		    / sizeof (struct t_rec);

	return (0);
}

static void
xdt_detach_trace_buffers(void)
{
	size_t len = tbuf.size * tbuf.cnt;

	ASSERT(tbuf.va != NULL);

	hat_unload(kas.a_hat, tbuf.va, len,
	    HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
	vmem_free(heap_arena, tbuf.va, len);
	kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
	kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
}

static void
xdt_update_sched_context(uint_t cpuid, uint_t dom, uint_t vcpu)
{
	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];

	sp->cur_domid = dom;
	sp->cur_vcpuid = vcpu;
	sp->curinfo_valid = 1;
}

static void
xdt_update_domain_context(uint_t dom, uint_t vcpu)
{
	xdt_curdom = dom;
	xdt_curvcpu = vcpu;
}

static size_t
xdt_process_rec(uint_t cpuid, struct t_rec *rec)
{
	xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
	uint_t dom, vcpu;
	int eid;
	uint32_t *data;
	uint64_t tsc, addr64, rip64, val64, pte64;
	size_t rec_size;

	ASSERT(rec != NULL);
	ASSERT(xdt_ncpus == xpv_nr_phys_cpus());

	eid = 0;
	if (cpuid >= xdt_ncpus) {
		tbuf.stat_spurious_cpu++;
		goto done;
	}

	/*
	 * If our current state isn't valid, and if this is not
	 * an event that will update our state, skip it.
	 */

	if (!sp->curinfo_valid &&
	    rec->event != TRC_SCHED_SWITCH &&
	    rec->event != TRC_LOST_RECORDS)
		goto done;

	if (rec->cycles_included) {
		data = rec->u.cycles.extra_u32;
		tsc = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
		    | rec->u.cycles.cycles_lo;
	} else {
		data = rec->u.nocycles.extra_u32;
		tsc = 0;
	}

	xdt_timestamp = tsc;

	switch (rec->event) {
	/*
	 * Sched probes
	 */
	case TRC_SCHED_SWITCH_INFPREV:
		/*
		 * Info on vCPU being de-scheduled
		 *
		 * data[0] = prev domid
		 * data[1] = time spent on pcpu
		 */
		sp->prev_domid = data[0];
		sp->prev_ctime = data[1];
		break;

	case TRC_SCHED_SWITCH_INFNEXT:
		/*
		 * Info on next vCPU to be scheduled
		 *
		 * data[0] = next domid
		 * data[1] = time spent waiting to get on cpu
		 * data[2] = time slice
		 */
		sp->next_domid = data[0];
		sp->next_wtime = data[1];
		sp->next_ts = data[2];
		break;

	case TRC_SCHED_SWITCH:
		/*
		 * vCPU switch
		 *
		 * data[0] = prev domid
		 * data[1] = prev vcpuid
		 * data[2] = next domid
		 * data[3] = next vcpuid
		 */

		/*
		 * Provide valid context for this probe if there
		 * wasn't one.
		 */
		if (!sp->curinfo_valid)
			xdt_update_domain_context(data[0], data[1]);

		xdt_update_sched_context(cpuid, data[0], data[1]);

		if (data[0] != sp->prev_domid &&
		    data[2] != sp->next_domid) {
			/* prev and next info don't match doms being sched'd */
			tbuf.stat_spurious_switch++;
			goto switchdone;
		}

		sp->prev_vcpuid = data[1];
		sp->next_vcpuid = data[3];

		XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid) ?
		    XDT_SCHED_IDLE_OFF_CPU : XDT_SCHED_OFF_CPU,
		    sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);

		XDT_PROBE4(IS_IDLE_DOM(sp->next_domid) ?
		    XDT_SCHED_IDLE_ON_CPU : XDT_SCHED_ON_CPU,
		    sp->next_domid, sp->next_vcpuid, sp->next_wtime,
		    sp->next_ts);
switchdone:
		xdt_update_sched_context(cpuid, data[2], data[3]);
		xdt_update_domain_context(data[2], data[3]);

		break;

	case TRC_SCHED_BLOCK:
		/*
		 * vCPU blocked
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_BLOCK, data[0], data[1]);
		break;

	case TRC_SCHED_SLEEP:
		/*
		 * Put vCPU to sleep
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_SLEEP, data[0], data[1]);
		break;

	case TRC_SCHED_WAKE:
		/*
		 * Wake up vCPU
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_WAKE, data[0], data[1]);
		break;

	case TRC_SCHED_YIELD:
		/*
		 * vCPU yielded
		 *
		 * data[0] = domid
		 * data[1] = vcpuid
		 */
		XDT_PROBE2(XDT_SCHED_YIELD, data[0], data[1]);
		break;

	case TRC_SCHED_SHUTDOWN:
		/*
		 * Guest shutting down
		 *
		 * data[0] = domid
		 * data[1] = initiating vcpu
		 * data[2] = shutdown code
		 */
		switch (data[2]) {
		case SHUTDOWN_poweroff:
			eid = XDT_SCHED_SHUTDOWN_POWEROFF;
			break;
		case SHUTDOWN_reboot:
			eid = XDT_SCHED_SHUTDOWN_REBOOT;
			break;
		case SHUTDOWN_suspend:
			eid = XDT_SCHED_SHUTDOWN_SUSPEND;
			break;
		case SHUTDOWN_crash:
			eid = XDT_SCHED_SHUTDOWN_CRASH;
			break;
		default:
			tbuf.stat_unknown_shutdown++;
			goto done;
		}

		XDT_PROBE2(eid, data[0], data[1]);
		break;

	case TRC_SCHED_DOM_REM:
	case TRC_SCHED_CTL:
	case TRC_SCHED_S_TIMER_FN:
	case TRC_SCHED_T_TIMER_FN:
	case TRC_SCHED_DOM_TIMER_FN:
		/* unused */
		break;
	case TRC_SCHED_DOM_ADD:
		/*
		 * Add vcpu to a guest.
		 *
		 * data[0] = domid
		 * data[1] = vcpu
		 */
		XDT_PROBE2(XDT_SCHED_ADD_VCPU, data[0], data[1]);
		break;
	case TRC_SCHED_ADJDOM:
		/*
		 * Scheduling parameters for a guest
		 * were modified.
		 *
		 * data[0] = domid;
		 */
		XDT_PROBE1(XDT_SCHED_ADJDOM, data[0]);
		break;
	case TRC_SCHED_RUNSTATE_CHANGE:
		/*
		 * Runstate change for a VCPU.
		 *
		 * data[0] = (domain << 16) | vcpu;
		 * data[1] = oldstate;
		 * data[2] = newstate;
		 */
		XDT_PROBE4(XDT_SCHED_RUNSTATE_CHANGE, data[0] >> 16,
		    data[0] & 0xffff, data[1], data[2]);
		break;
	case TRC_SCHED_CONTINUE_RUNNING:
		/*
		 * VCPU is back on the physical CPU that it was
		 * previously running on.
		 *
		 * data[0] = (domain << 16) | vcpu;
		 */
		XDT_PROBE2(XDT_SCHED_CONTINUE_RUNNING, data[0] >> 16,
		    data[0] & 0xffff);
		break;
	/*
	 * Mem probes
	 */
	case TRC_MEM_PAGE_GRANT_MAP:
		/*
		 * Guest mapped page grant
		 *
		 * data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_UNMAP:
		/*
		 * Guest unmapped page grant
		 *
		 * data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, data[0]);
		break;

	case TRC_MEM_PAGE_GRANT_TRANSFER:
		/*
		 * Page grant is being transferred
		 *
		 * data[0] = target domid
		 */
		XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, data[0]);
		break;

	/*
	 * Probes for PV domains.
	 */
	case TRC_PV_HYPERCALL:
		/*
		 * Hypercall from a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = eax
		 */
		XDT_PROBE2(XDT_PV_HYPERCALL, data[0], data[1]);
		break;
	case TRC_PV_HYPERCALL | TRC_64_FLAG:
		/*
		 * Hypercall from a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = eax;
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE2(XDT_PV_HYPERCALL, rip64, data[2]);
		break;
	case TRC_PV_TRAP:
		/*
		 * Trap in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = trapnr | (error_code_valid << 15)
		 *	| (error_code << 16);
		 */
		XDT_PROBE4(XDT_PV_TRAP, data[0], data[1] & 0x7fff,
		    (data[1] >> 15) & 1, data[1] >> 16);
		break;
	case TRC_PV_TRAP | TRC_64_FLAG:
		/*
		 * Trap in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = trapnr | (error_code_valid << 15)
		 *	| (error_code << 16);
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE4(XDT_PV_TRAP, rip64, data[2] & 0x7fff,
		    (data[2] >> 15) & 1, data[2] >> 16);
		break;
	case TRC_PV_PAGE_FAULT:
		/*
		 * Page fault in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 * data[1] = vaddr
		 * data[2] = error code
		 */
		XDT_PROBE3(XDT_PV_PAGE_FAULT, data[0], data[1], data[2]);
		break;
	case TRC_PV_PAGE_FAULT | TRC_64_FLAG:
		/*
		 * Page fault in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 * data[4] = error code
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		addr64 = (((uint64_t)data[3]) << 32) | data[2];
		XDT_PROBE3(XDT_PV_PAGE_FAULT, rip64, addr64, data[4]);
		break;
	case TRC_PV_FORCED_INVALID_OP:
		/*
		 * Hypervisor emulated a forced invalid op (ud2)
		 * in a 32-bit PV domain.
		 *
		 * data[1] = eip
		 */
		XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, data[1]);
		break;
	case TRC_PV_FORCED_INVALID_OP | TRC_64_FLAG:
		/*
		 * Hypervisor emulated a forced invalid op (ud2)
		 * in a 64-bit PV domain.
		 *
		 * data[1] = rip(0:31)
		 * data[2] = rip(32:63)
		 *
		 */
		rip64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE1(XDT_PV_FORCED_INVALID_OP, rip64);
		break;
	case TRC_PV_EMULATE_PRIVOP:
		/*
		 * Hypervisor emulated a privileged operation
		 * in a 32-bit PV domain.
		 *
		 * data[0] = eip
		 */
		XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, data[0]);
		break;
	case TRC_PV_EMULATE_PRIVOP | TRC_64_FLAG:
		/*
		 * Hypervisor emulated a privileged operation
		 * in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE1(XDT_PV_EMULATE_PRIVOP, rip64);
		break;
	case TRC_PV_EMULATE_4GB:
		/* unused, 32-bit hypervisor only */
		break;
	case TRC_PV_MATH_STATE_RESTORE:
		/*
		 * Hypervisor restores math state after FP DNA trap.
		 *
		 * No arguments.
		 */
		XDT_PROBE0(XDT_PV_MATH_STATE_RESTORE);
		break;
	case TRC_PV_PAGING_FIXUP:
		/*
		 * Hypervisor fixed up a page fault (e.g. it was
		 * a side-effect of hypervisor guest page table
		 * bookkeeping, and not propagated to the guest).
		 *
		 * data[0] = eip
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_PV_PAGING_FIXUP, data[0], data[1]);
		break;
	case TRC_PV_PAGING_FIXUP | TRC_64_FLAG:
		/*
		 * Hypervisor fixed up a page fault (e.g. it was
		 * a side-effect of hypervisor guest page table
		 * bookkeeping, and not propagated to the guest).
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		addr64 = (((uint64_t)data[3]) << 32) | data[2];
		XDT_PROBE2(XDT_PV_PAGING_FIXUP, rip64, addr64);
		break;
	case TRC_PV_GDT_LDT_MAPPING_FAULT:
		/*
		 * Descriptor table mapping fault in a 32-bit PV domain.
		 * data[0] = eip
		 * data[1] = offset
		 */
		XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, data[0], data[1]);
		break;
	case TRC_PV_GDT_LDT_MAPPING_FAULT | TRC_64_FLAG:
		/*
		 * Descriptor table mapping fault in a 64-bit PV domain.
		 *
		 * data[0] = rip(0:31)
		 * data[1] = rip(32:63)
		 * data[2] = offset(0:31)
		 * data[3] = offset(32:63)
		 */
		rip64 = (((uint64_t)data[1]) << 32) | data[0];
		val64 = (((uint64_t)data[3]) << 32) | data[2];
		XDT_PROBE2(XDT_PV_DT_MAPPING_FAULT, rip64, val64);
		break;
	case TRC_PV_PTWR_EMULATION:
	case TRC_PV_PTWR_EMULATION_PAE | TRC_64_FLAG:
		/*
		 * Should only happen on 32-bit hypervisor; unused.
		 */
		break;
	case TRC_PV_PTWR_EMULATION_PAE:
		/*
		 * PTE write emulation for a 32-bit PV domain.
		 *
		 * data[0] = pte
		 * data[1] = addr
		 * data[2] = eip
		 */
		XDT_PROBE3(XDT_PV_PTWR_EMULATION, data[0], data[1], data[2]);
		break;
	case TRC_PV_PTWR_EMULATION | TRC_64_FLAG:
		/*
		 * PTE write emulation for a 64-bit PV domain.
		 *
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = addr(0:31)
		 * data[3] = addr(32:63)
		 * data[4] = rip(0:31)
		 * data[5] = rip(32:63)
		 */
		pte64 = (((uint64_t)data[1]) << 32) | data[0];
		addr64 = (((uint64_t)data[3]) << 32) | data[2];
		rip64 = (((uint64_t)data[5]) << 32) | data[4];
		XDT_PROBE3(XDT_PV_PTWR_EMULATION, pte64, addr64, rip64);
		break;

	/*
	 * HVM probes
	 */
	case TRC_HVM_VMENTRY:
		/*
		 * Return to guest via vmx_launch/vmrun
		 *
		 */
		XDT_PROBE0(XDT_HVM_VMENTRY);
		break;

	case TRC_HVM_VMEXIT:
		/*
		 * Entry into VMEXIT handler from 32-bit HVM domain
		 *
		 * data[0] = cpu vendor specific exit code
		 * data[1] = guest eip
		 */
		XDT_PROBE2(XDT_HVM_VMEXIT, data[0], data[1]);
		break;
	case TRC_HVM_VMEXIT64:
		/*
		 * Entry into VMEXIT handler from 64-bit HVM domain
		 *
		 * data[0] = cpu vendor specific exit code
		 * data[1] = guest rip(0:31)
		 * data[2] = guest rip(32:63)
		 */
		rip64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_VMEXIT, data[0], rip64);
		break;

	case TRC_HVM_PF_XEN64:
		/*
		 * Pagefault in a guest that is a Xen (e.g. shadow)
		 * artifact, and is not injected back into the guest.
		 *
		 * data[0] = error code
		 * data[1] = guest VA(0:31)
		 * data[2] = guest VA(32:63)
		 */
		addr64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_PF_XEN, data[0], addr64);
		break;

	case TRC_HVM_PF_XEN:
		/*
		 * Same as above, but for a 32-bit HVM domain.
		 *
		 * data[0] = error code
		 * data[1] = guest VA
		 */
		XDT_PROBE2(XDT_HVM_PF_XEN, data[0], data[1]);
		break;

	case TRC_HVM_PF_INJECT:
		/*
		 * 32-bit Xen only.
		 */
		break;
	case TRC_HVM_PF_INJECT64:
		/*
		 * Pagefault injected back into a guest (e.g. the shadow
		 * code found no mapping).
		 *
		 * data[0] = error code
		 * data[1] = guest VA(0:31)
		 * data[2] = guest VA(32:63)
		 */
		addr64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_PF_INJECT, data[0], addr64);
		break;

	case TRC_HVM_INJ_EXC:
		/*
		 * Exception injected into an HVM guest.
		 *
		 * data[0] = trap
		 * data[1] = error code
		 */
		XDT_PROBE2(XDT_HVM_EXC_INJECT, data[0], data[1]);
		break;
	case TRC_HVM_INJ_VIRQ:
		/*
		 * Interrupt injected into an HVM guest.
		 *
		 * data[0] = vector
		 */
		XDT_PROBE1(XDT_HVM_VIRQ_INJECT, data[0]);
		break;
	case TRC_HVM_REINJ_VIRQ:
	case TRC_HVM_IO_READ:
	case TRC_HVM_IO_WRITE:
		/* unused */
		break;
	case TRC_HVM_CR_READ64:
		/*
		 * Control register read. Intel VMX only.
		 *
		 * data[0] = control register #
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_CR_READ, data[0], val64);
		break;
	case TRC_HVM_CR_READ:
		/*
		 * unused (32-bit Xen only)
		 */
		break;
	case TRC_HVM_CR_WRITE64:
		/*
		 * Control register write. Intel VMX only.
		 *
		 * data[0] = control register #
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_CR_WRITE, data[0], val64);
		break;
	case TRC_HVM_CR_WRITE:
		/*
		 * unused (32-bit Xen only)
		 */
		break;
	case TRC_HVM_DR_READ:
		/*
		 * unused.
		 *
		 * data[0] = (domid<<16 + vcpuid)
		 */
		break;
	case TRC_HVM_DR_WRITE:
		/*
		 * Debug register write. Not too useful; no values,
		 * so we ignore this.
		 *
		 * data[0] = (domid<<16 + vcpuid)
		 */
		break;
	case TRC_HVM_MSR_READ:
		/*
		 * MSR read.
		 *
		 * data[0] = MSR
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_MSR_READ, data[0], val64);
		break;
	case TRC_HVM_MSR_WRITE:
		/*
		 * MSR write.
		 *
		 * data[0] = MSR;
		 * data[1] = value(0:31)
		 * data[2] = value(32:63)
		 */
		val64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_MSR_WRITE, data[0], val64);
		break;
	case TRC_HVM_CPUID:
		/*
		 * CPUID insn.
		 *
		 * data[0] = %eax (input)
		 * data[1] = %eax
		 * data[2] = %ebx
		 * data[3] = %ecx
		 * data[4] = %edx
		 */
		XDT_PROBE5(XDT_HVM_CPUID, data[0], data[1], data[2], data[3],
		    data[4]);
		break;
	case TRC_HVM_INTR:
		/*
		 * VMEXIT because of an interrupt.
		 */
		XDT_PROBE0(XDT_HVM_INTR);
		break;
	case TRC_HVM_INTR_WINDOW:
		/*
		 * VMEXIT because of an interrupt window (an interrupt
		 * can't be delivered immediately to an HVM guest and must
		 * be delayed).
		 *
		 * data[0] = vector
		 * data[1] = source
		 * data[2] = info
		 */
		XDT_PROBE3(XDT_HVM_INTR_WINDOW, data[0], data[1], data[2]);
		break;
	case TRC_HVM_NMI:
		/*
		 * VMEXIT because of an NMI.
		 */
		XDT_PROBE0(XDT_HVM_NMI);
		break;
	case TRC_HVM_SMI:
		/*
		 * VMEXIT because of an SMI
		 */
		XDT_PROBE0(XDT_HVM_SMI);
		break;
	case TRC_HVM_VMMCALL:
		/*
		 * VMMCALL insn.
		 *
		 * data[0] = %eax
		 */
		XDT_PROBE1(XDT_HVM_VMMCALL, data[0]);
		break;
	case TRC_HVM_HLT:
		/*
		 * HLT insn.
		 *
		 * data[0] = 1 if VCPU runnable, 0 if not
		 */
		XDT_PROBE1(XDT_HVM_HLT, data[0]);
		break;
	case TRC_HVM_INVLPG64:
		/*
		 *
		 * data[0] = INVLPGA ? 1 : 0
		 * data[1] = vaddr(0:31)
		 * data[2] = vaddr(32:63)
		 */
		addr64 = (((uint64_t)data[2]) << 32) | data[1];
		XDT_PROBE2(XDT_HVM_INVLPG, data[0], addr64);
		break;
	case TRC_HVM_INVLPG:
		/*
		 * unused (32-bit Xen only)
		 *
		 * data[0] = (domid<<16 + vcpuid)
		 */
		break;
	case TRC_HVM_MCE:
		/*
		 * #MCE VMEXIT
		 *
		 */
		XDT_PROBE0(XDT_HVM_MCE);
		break;
	case TRC_HVM_IOPORT_READ:
	case TRC_HVM_IOPORT_WRITE:
	case TRC_HVM_IOMEM_READ:
	case TRC_HVM_IOMEM_WRITE:
		/*
		 * data[0] = addr(0:31)
		 * data[1] = addr(32:63)
		 * data[2] = count
		 * data[3] = size
		 */
		switch (rec->event) {
		case TRC_HVM_IOPORT_READ:
			eid = XDT_HVM_IOPORT_READ;
			break;
		case TRC_HVM_IOPORT_WRITE:
			eid = XDT_HVM_IOPORT_WRITE;
			break;
		case TRC_HVM_IOMEM_READ:
			eid = XDT_HVM_IOMEM_READ;
			break;
		case TRC_HVM_IOMEM_WRITE:
			eid = XDT_HVM_IOMEM_WRITE;
			break;
		}
		addr64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE3(eid, addr64, data[2], data[3]);
		break;
	case TRC_HVM_CLTS:
		/*
		 * CLTS insn (Intel VMX only)
		 */
		XDT_PROBE0(XDT_HVM_CLTS);
		break;
	case TRC_HVM_LMSW64:
		/*
		 * LMSW insn.
		 *
		 * data[0] = value(0:31)
		 * data[1] = value(32:63)
		 */
		val64 = (((uint64_t)data[1]) << 32) | data[0];
		XDT_PROBE1(XDT_HVM_LMSW, val64);
		break;
	case TRC_HVM_LMSW:
		/*
		 * unused (32-bit Xen only)
		 */
		break;

	/*
	 * Shadow page table probes (mainly used for HVM domains
	 * without hardware paging support).
	 */
	case TRC_SHADOW_NOT_SHADOW | SH_GUEST_32:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = va
		 * data[3] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, data[2], data[3]);
		break;
	case TRC_SHADOW_NOT_SHADOW | SH_GUEST_PAE:
	case TRC_SHADOW_NOT_SHADOW | SH_GUEST_64:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = va(0:31)
		 * data[3] = va(32:63)
		 * data[4] = flags
		 */
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE3(XDT_SHADOW_NOT_SHADOW, pte64, addr64, data[4]);
		break;
	case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, data[0]);
		break;
	case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_PAE:
	case TRC_SHADOW_FAST_PROPAGATE | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_FAST_PROPAGATE, addr64);
		break;
	case TRC_SHADOW_FAST_MMIO | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_FAST_MMIO, data[0]);
		break;
	case TRC_SHADOW_FAST_MMIO | SH_GUEST_PAE:
	case TRC_SHADOW_FAST_MMIO | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_FAST_MMIO, addr64);
		break;
	case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, data[0]);
		break;
	case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_PAE:
	case TRC_SHADOW_FALSE_FAST_PATH | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_FALSE_FAST_PATH, addr64);
		break;
	case TRC_SHADOW_MMIO | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_MMIO, data[0]);
		break;
	case TRC_SHADOW_MMIO | SH_GUEST_PAE:
	case TRC_SHADOW_MMIO | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_MMIO, addr64);
		break;
	case TRC_SHADOW_FIXUP | SH_GUEST_32:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = va
		 * data[3] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, data[2], data[3]);
		break;
	case TRC_SHADOW_FIXUP | SH_GUEST_64:
	case TRC_SHADOW_FIXUP | SH_GUEST_PAE:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = va(0:31)
		 * data[3] = va(32:63)
		 * data[4] = flags
		 */
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE3(XDT_SHADOW_FIXUP, pte64, addr64, data[4]);
		break;
	case TRC_SHADOW_DOMF_DYING | SH_GUEST_32:
		/*
		 * data[0] = va
		 */
		XDT_PROBE1(XDT_SHADOW_DOMF_DYING, data[0]);
		break;
	case TRC_SHADOW_DOMF_DYING | SH_GUEST_PAE:
	case TRC_SHADOW_DOMF_DYING | SH_GUEST_64:
		/*
		 * data[0] = va(0:31)
		 * data[1] = va(32:63)
		 */
		addr64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_DOMF_DYING, addr64);
		break;
	case TRC_SHADOW_EMULATE | SH_GUEST_32:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = val(0:31)
		 * data[3] = val(32:63)
		 * data[4] = addr
		 * data[5] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		val64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, data[4],
		    data[5] & 0x7fffffff, data[5] >> 29);
		break;
	case TRC_SHADOW_EMULATE | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE | SH_GUEST_64:
		/*
		 * data[0] = pte(0:31)
		 * data[1] = pte(32:63)
		 * data[2] = val(0:31)
		 * data[3] = val(32:63)
		 * data[4] = addr(0:31)
		 * data[5] = addr(32:63)
		 * data[6] = flags
		 */
		pte64 = ((uint64_t)data[1] << 32) | data[0];
		val64 = ((uint64_t)data[3] << 32) | data[2];
		addr64 = ((uint64_t)data[5] << 32) | data[4];
		XDT_PROBE5(XDT_SHADOW_EMULATE, pte64, val64, addr64,
		    data[6] & 0x7fffffff, data[6] >> 29);
		break;
	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_32:
		/*
		 * data[0] = gfn
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, data[0], data[1]);
		break;
	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE_UNSHADOW_USER | SH_GUEST_64:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_USER, val64, addr64);
		break;
	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_32:
		/*
		 * data[0] = gfn
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, data[0],
		    data[1]);
		break;
	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ | SH_GUEST_64:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_EVTINJ, val64, addr64);
		break;
	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_32:
		/*
		 * data[0] = gfn
		 * data[1] = vaddr
		 */
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, data[0],
		    data[1]);
		break;
	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_PAE:
	case TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED | SH_GUEST_64:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 * data[2] = vaddr(0:31)
		 * data[3] = vaddr(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		addr64 = ((uint64_t)data[3] << 32) | data[2];
		XDT_PROBE2(XDT_SHADOW_EMULATE_UNSHADOW_UNHANDLED, val64,
		    addr64);
		break;
	case TRC_SHADOW_WRMAP_BF:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_WRMAP_BF, val64);
		break;
	case TRC_SHADOW_PREALLOC_UNPIN:
		/*
		 * data[0] = gfn(0:31)
		 * data[1] = gfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_PREALLOC_UNPIN, val64);
		break;
	case TRC_SHADOW_RESYNC_FULL:
		/*
		 * data[0] = gmfn(0:31)
		 * data[1] = gmfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_RESYNC_FULL, val64);
		break;
	case TRC_SHADOW_RESYNC_ONLY:
		/*
		 * data[0] = gmfn(0:31)
		 * data[1] = gmfn(32:63)
		 */
		val64 = ((uint64_t)data[1] << 32) | data[0];
		XDT_PROBE1(XDT_SHADOW_RESYNC_ONLY, val64);
		break;

	/*
	 * Power management probes.
	 */
	case TRC_PM_FREQ_CHANGE:
		/*
		 * data[0] = old freq
		 * data[1] = new freq
		 */
		XDT_PROBE2(XDT_PM_FREQ_CHANGE, data[0], data[1]);
		break;
	case TRC_PM_IDLE_ENTRY:
		/*
		 * data[0] = C-state
		 * data[1] = time
		 */
		XDT_PROBE2(XDT_PM_IDLE_ENTRY, data[0], data[1]);
		break;
	case TRC_PM_IDLE_EXIT:
		/*
		 * data[0] = C-state
		 * data[1] = time
		 */
		XDT_PROBE2(XDT_PM_IDLE_EXIT, data[0], data[1]);
		break;
	case TRC_LOST_RECORDS:
		vcpu = data[1] >> 16;
		dom = data[1] & 0xffff;
		xdt_update_sched_context(cpuid, dom, vcpu);
		xdt_update_domain_context(dom, vcpu);
		XDT_PROBE1(XDT_TRC_LOST_RECORDS, cpuid);
		tbuf.stat_dropped_recs++;
		break;

	default:
		tbuf.stat_unknown_recs++;
		break;
	}

done:
	rec_size = 4 + (rec->cycles_included ? 8 : 0) + (rec->extra_u32 * 4);
	return (rec_size);
}

/*
 * Scan all CPU buffers for the record with the lowest timestamp so
 * that the probes will fire in order.
 */
static int
xdt_get_first_rec(uint_t *cpuidp, struct t_rec **recp, uint32_t *consp)
{
	uint_t cpuid;
	uint32_t prod, cons, offset;
	struct t_rec *rec;
	uint64_t minstamp = ~0ULL, stamp;
	uintptr_t data;

	for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
		cons = tbuf.meta[cpuid]->cons;
		prod = tbuf.meta[cpuid]->prod;
		membar_consumer();
		if (prod == cons)
			continue;

		offset = cons % tbuf_data_size;
		data = (uintptr_t)tbuf.data[cpuid] + offset;
		rec = (struct t_rec *)data;
		ASSERT((caddr_t)rec < tbuf.va + (tbuf.size * (cpuid + 1)));

		/*
		 * All records that we know about have time cycles included.
		 * If this record doesn't have them, assume it's a type
		 * that we don't handle. Use a 0 time value, which will make
		 * it get handled first (it will be thrown away).
		 */
		if (rec->cycles_included)
			stamp = (((uint64_t)rec->u.cycles.cycles_hi) << 32)
			    | rec->u.cycles.cycles_lo;
		else
			stamp = 0;

		if (stamp < minstamp) {
			minstamp = stamp;
			*cpuidp = cpuid;
			*recp = rec;
			*consp = cons;
		}
	}

	if (minstamp != ~0ULL)
		return (1);

	return (0);
}

/*ARGSUSED*/
static void
xdt_tbuf_scan(void *arg)
{
	uint32_t bytes_done, cons;
	struct t_rec *rec;
	xdt_schedinfo_t *sp;
	uint_t nrecs, cpuid;

	for (nrecs = 0;
	    nrecs < xdt_max_recs && xdt_get_first_rec(&cpuid, &rec, &cons) > 0;
	    nrecs++) {
		xdt_curpcpu = cpuid;
		sp = &xdt_cpu_schedinfo[cpuid];
		if (sp->curinfo_valid)
			xdt_update_domain_context(sp->cur_domid,
			    sp->cur_vcpuid);

		bytes_done = xdt_process_rec(cpuid, rec);
		cons += bytes_done;
		/*
		 * cons and prod are incremented modulo (2 * tbuf_data_size).
		 * See <xen/public/trace.h>.
		 */
		if (cons >= 2 * tbuf_data_size)
			cons -= 2 * tbuf_data_size;
		membar_exit();
		tbuf.meta[cpuid]->cons = cons;
	}
}

1821 static void
xdt_cyclic_enable(void)1822 xdt_cyclic_enable(void)
1823 {
1824 	cyc_handler_t hdlr;
1825 	cyc_time_t when;
1826 
1827 	ASSERT(MUTEX_HELD(&cpu_lock));
1828 
1829 	hdlr.cyh_func = xdt_tbuf_scan;
1830 	hdlr.cyh_arg = NULL;
1831 	hdlr.cyh_level = CY_LOW_LEVEL;
1832 
1833 	when.cyt_interval = xdt_poll_nsec;
1834 	when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
1835 
1836 	xdt_cyclic = cyclic_add(&hdlr, &when);
1837 }
1838 
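/*
 * Create the probe if it doesn't already exist, recording its probe id in
 * xdt_prid, indexed by event type.
 */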
1839 static void
1840 xdt_probe_create(xdt_probe_t *p)
1841 {
1842 	ASSERT(p != NULL && p->pr_mod != NULL);
1843 
1844 	if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
1845 		return;
1846 
1847 	xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
1848 	    p->pr_name, dtrace_mach_aframes(), p);
1849 }
1850 
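/*
 * dtps_provide() entry point: a NULL description means "provide all
 * probes"; otherwise only the probe matching the requested module and
 * name is created.
 */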
1851 /*ARGSUSED*/
1852 static void
1853 xdt_provide(void *arg, const dtrace_probedesc_t *desc)
1854 {
1855 	const char *mod, *name;
1856 	int i;
1857 
1858 	if (desc == NULL) {
1859 		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
1860 			xdt_probe_create(&xdt_probe[i]);
1861 		}
1862 	} else {
1863 		mod = desc->dtpd_mod;
1864 		name = desc->dtpd_name;
1865 		for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
1866 			int l1 = strlen(xdt_probe[i].pr_name);
1867 			int l2 = strlen(xdt_probe[i].pr_mod);
1868 			if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
1869 			    strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
1870 				break;
1871 		}
1872 
1873 		if (xdt_probe[i].pr_mod == NULL)
1874 			return;
1875 		xdt_probe_create(&xdt_probe[i]);
1876 	}
1877 
1879 
1880 /*ARGSUSED*/
1881 static void
1882 xdt_destroy(void *arg, dtrace_id_t id, void *parg)
1883 {
1884 	xdt_probe_t *p = parg;
1885 	xdt_prid[p->evt_id] = 0;
1886 }
1887 
1888 static void
1889 xdt_set_trace_mask(uint32_t mask)
1890 {
1891 	xen_sysctl_tbuf_op_t tbuf_op;
1892 
1893 	/* Always need to trace scheduling, for context */
1894 	if (mask != 0)
1895 		mask |= TRC_SCHED;
1896 	tbuf_op.evt_mask = mask;
1897 	tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_evt_mask;
1898 	(void) xdt_sysctl_tbuf(&tbuf_op);
1899 }
1900 
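/*
 * dtps_enable() entry point: map the probe in and, if this is the first
 * active probe of its class, widen the Xen event mask accordingly. The
 * first enabling overall also turns on hypervisor tracing and starts the
 * polling cyclic.
 */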
1901 /*ARGSUSED*/
1902 static int
1903 xdt_enable(void *arg, dtrace_id_t id, void *parg)
1904 {
1905 	xdt_probe_t *p = parg;
1906 	xen_sysctl_tbuf_op_t tbuf_op;
1907 
1908 	ASSERT(MUTEX_HELD(&cpu_lock));
1909 	ASSERT(xdt_prid[p->evt_id] != 0);
1910 
1911 	xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
1912 	xdt_classinfo[p->class].cnt++;
1913 
1914 	if (xdt_classinfo[p->class].cnt == 1) {
1915 		/* set the trace mask for this class */
1916 		cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
1917 		xdt_set_trace_mask(cur_trace_mask);
1918 	}
1919 
1920 	if (xdt_cyclic == CYCLIC_NONE) {
1921 		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
1922 		if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
1923 			cmn_err(CE_NOTE, "Couldn't enable hypervisor tracing.");
1924 			return (-1);
1925 		}
1926 
1927 		xdt_cyclic_enable();
1928 	}
1929 	return (0);
1930 }
1931 
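/*
 * dtps_disable() entry point: unmap the probe. When the last active probe
 * goes away, the trace buffers are disabled (with retries, falling back
 * to a zero event mask) and the polling cyclic is removed.
 */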
1932 /*ARGSUSED*/
1933 static void
1934 xdt_disable(void *arg, dtrace_id_t id, void *parg)
1935 {
1936 	xdt_probe_t *p = parg;
1937 	xen_sysctl_tbuf_op_t tbuf_op;
1938 	int i, err;
1939 
1940 	ASSERT(MUTEX_HELD(&cpu_lock));
1941 	ASSERT(xdt_probemap[p->evt_id] != 0);
1942 	ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
1943 	ASSERT(xdt_classinfo[p->class].cnt > 0);
1944 
1945 	/*
1946 	 * We could be here in the slight window between the cyclic firing and
1947 	 * a call to dtrace_probe() occurring. We need to be careful if we tear
1948 	 * down any shared state.
1949 	 */
1950 
1951 	xdt_probemap[p->evt_id] = 0;
1952 	xdt_classinfo[p->class].cnt--;
1953 
1954 	if (xdt_nr_active_probes() == 0) {
1955 		cur_trace_mask = 0;
1956 
1957 		if (xdt_cyclic == CYCLIC_NONE)
1958 			return;
1959 
1960 		for (i = 0; i < xdt_ncpus; i++)
1961 			xdt_cpu_schedinfo[i].curinfo_valid = 0;
1962 
1963 		/*
1964 		 * Try to disable the trace buffers, retrying on failure up
1965 		 * to XDT_TBUF_RETRY times. If we still aren't successful
1966 		 * after that, set the trace mask to 0 to prevent further
1967 		 * trace records from being written.
1968 		 */
1969 		tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
1970 		i = 0;
1971 		do {
1972 			err = xdt_sysctl_tbuf(&tbuf_op);
1973 		} while ((err != 0) && (++i < XDT_TBUF_RETRY));
1974 
1975 		if (err != 0) {
1976 			cmn_err(CE_NOTE,
1977 			    "Couldn't disable hypervisor tracing.");
1978 			xdt_set_trace_mask(0);
1979 		} else {
1980 			cyclic_remove(xdt_cyclic);
1981 			xdt_cyclic = CYCLIC_NONE;
1982 			/*
1983 			 * We don't bother making the hypercall to set
1984 			 * the trace mask, since it will be reset when
1985 			 * tracing is re-enabled.
1986 			 */
1987 		}
1988 	} else if (xdt_classinfo[p->class].cnt == 0) {
1989 		cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
1990 		/* other probes are enabled, so add the sub-class mask back */
1991 		cur_trace_mask |= 0xF000;
1992 		xdt_set_trace_mask(cur_trace_mask);
1993 	}
1994 }
1995 
1996 static dtrace_pattr_t xdt_attr = {
1997 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
1998 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
1999 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
2000 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
2001 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
2002 };
2003 
2004 static dtrace_pops_t xdt_pops = {
2005 	xdt_provide,		/* dtps_provide() */
2006 	NULL,			/* dtps_provide_module() */
2007 	xdt_enable,		/* dtps_enable() */
2008 	xdt_disable,		/* dtps_disable() */
2009 	NULL,			/* dtps_suspend() */
2010 	NULL,			/* dtps_resume() */
2011 	NULL,			/* dtps_getargdesc() */
2012 	NULL,			/* dtps_getargval() */
2013 	NULL,			/* dtps_usermode() */
2014 	xdt_destroy		/* dtps_destroy() */
2015 };
2016 
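/*
 * The provider is registered as "xdt" in xdt_attach() below. Once the
 * driver is attached in dom0, its probes can be listed from the command
 * line with, for example:
 *
 *	# dtrace -l -P xdt
 */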
2017 static int
2018 xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
2019 {
2020 	int val;
2021 
2022 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
2023 		return (DDI_FAILURE);
2024 
2025 	switch (cmd) {
2026 	case DDI_ATTACH:
2027 		break;
2028 
2029 	case DDI_RESUME:
2030 		/*
2031 		 * We might support proper suspend/resume in the future, so
2032 		 * return DDI_FAILURE for now.
2033 		 */
2034 		return (DDI_FAILURE);
2035 
2036 	default:
2037 		return (DDI_FAILURE);
2038 	}
2039 
2040 	xdt_ncpus = xpv_nr_phys_cpus();
2041 	ASSERT(xdt_ncpus > 0);
2042 
2043 	if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
2044 	    DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
2045 	    dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
2046 	    &xdt_pops, NULL, &xdt_id) != 0) {
2047 		if (tbuf.va != NULL)
2048 			xdt_detach_trace_buffers();
2049 		ddi_remove_minor_node(devi, NULL);
2050 		return (DDI_FAILURE);
2051 	}
2052 
2053 	val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
2054 	    "xdt_poll_nsec", XDT_POLL_DEFAULT);
2055 	xdt_poll_nsec = MAX(val, XDT_POLL_MIN);
2056 
2057 	xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_zalloc(xdt_ncpus *
2058 	    sizeof (xdt_schedinfo_t), KM_SLEEP);
2059 	xdt_init_trace_masks();
2060 	xdt_kstat_init();
2061 
2062 	xdt_devi = devi;
2063 	ddi_report_dev(devi);
2064 	return (DDI_SUCCESS);
2065 }
2066 
2067 static int
2068 xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
2069 {
2070 	switch (cmd) {
2071 	case DDI_DETACH:
2072 		break;
2073 
2074 	case DDI_SUSPEND:
2075 		/*
2076 		 * We might support proper suspend/resume in the future, so
2077 		 * return DDI_FAILURE for now.
2078 		 */
2079 		return (DDI_FAILURE);
2080 
2081 	default:
2082 		return (DDI_FAILURE);
2083 	}
2084 
2085 	if (dtrace_unregister(xdt_id) != 0)
2086 		return (DDI_FAILURE);
2087 
2088 	xdt_detach_trace_buffers();
2089 	kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
2090 	if (xdt_cyclic != CYCLIC_NONE)
2091 		cyclic_remove(xdt_cyclic);
2092 	if (xdt_kstats != NULL)
2093 		kstat_delete(xdt_kstats);
2094 	xdt_devi = (void *)0;
2095 	ddi_remove_minor_node(devi, NULL);
2096 
2097 	return (DDI_SUCCESS);
2098 }
2099 
2100 /*ARGSUSED*/
2101 static int
2102 xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
2103 {
2104 	int error;
2105 
2106 	switch (infocmd) {
2107 	case DDI_INFO_DEVT2DEVINFO:
2108 		*result = xdt_devi;
2109 		error = DDI_SUCCESS;
2110 		break;
2111 	case DDI_INFO_DEVT2INSTANCE:
2112 		*result = (void *)0;
2113 		error = DDI_SUCCESS;
2114 		break;
2115 	default:
2116 		error = DDI_FAILURE;
2117 	}
2118 	return (error);
2119 }
2120 
2121 static struct cb_ops xdt_cb_ops = {
2122 	nulldev,		/* open(9E) */
2123 	nodev,			/* close(9E) */
2124 	nodev,			/* strategy(9E) */
2125 	nodev,			/* print(9E) */
2126 	nodev,			/* dump(9E) */
2127 	nodev,			/* read(9E) */
2128 	nodev,			/* write(9E) */
2129 	nodev,			/* ioctl(9E) */
2130 	nodev,			/* devmap(9E) */
2131 	nodev,			/* mmap(9E) */
2132 	nodev,			/* segmap(9E) */
2133 	nochpoll,		/* chpoll(9E) */
2134 	ddi_prop_op,		/* prop_op(9E) */
2135 	NULL,			/* streamtab(9S) */
2136 	D_MP | D_64BIT | D_NEW	/* cb_flag */
2137 };
2138 
2139 static struct dev_ops xdt_ops = {
2140 	DEVO_REV,		/* devo_rev */
2141 	0,			/* devo_refcnt */
2142 	xdt_info,		/* getinfo(9E) */
2143 	nulldev,		/* identify(9E) */
2144 	nulldev,		/* probe(9E) */
2145 	xdt_attach,		/* attach(9E) */
2146 	xdt_detach,		/* detach(9E) */
2147 	nulldev,		/* devo_reset */
2148 	&xdt_cb_ops,		/* devo_cb_ops */
2149 	NULL,			/* devo_bus_ops */
2150 	NULL,			/* power(9E) */
2151 	ddi_quiesce_not_needed,	/* devo_quiesce */
2152 };
2153 
2154 
2155 static struct modldrv modldrv = {
2156 	&mod_driverops,
2157 	"Hypervisor event tracing",
2158 	&xdt_ops
2159 };
2160 
2161 static struct modlinkage modlinkage = {
2162 	MODREV_1,
2163 	&modldrv,
2164 	NULL
2165 };
2166 
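/*
 * Standard loadable-module entry points; setup and teardown are handled
 * by xdt_attach() and xdt_detach() above.
 */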
2167 int
2168 _init(void)
2169 {
2170 	return (mod_install(&modlinkage));
2171 }
2172 
2173 int
2174 _fini(void)
2175 {
2176 	return (mod_remove(&modlinkage));
2177 }
2178 
2179 int
2180 _info(struct modinfo *modinfop)
2181 {
2182 	return (mod_info(&modlinkage, modinfop));
2183 }
2184