/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Xen event provider for DTrace
 *
 * NOTE: This provider is PRIVATE. It is intended as a short-term solution and
 * may disappear or be re-implemented at any time.
 *
 * This provider isn't suitable as a general-purpose solution for a number of
 * reasons. First and foremost, we rely on the Xen tracing mechanism and don't
 * have any way to gather data other than that collected by the Xen trace
 * buffers. Further, it does not fit into the DTrace model (see "Interacting
 * with DTrace" below.)
 *
 *
 * Tracing in Xen
 * --------------
 *
 * Xen implements a tracing facility for generating and collecting execution
 * event traces from the hypervisor. When tracing is enabled, compiled-in
 * probes record events in contiguous per-CPU trace buffers.
 *
 *                    +---------+
 * +------+           |         |
 * | CPUn |---------> | BUFFERn |
 * +------+           |         |
 *                    +---------+- tbuf.va + (tbuf.size * n)
 *                    :         :
 *                    +---------+
 * +------+           |         |
 * | CPU1 |---------> | BUFFER1 |
 * +------+           |         |
 *                    +---------+- tbuf.va + tbuf.size
 * +------+           |         |
 * | CPU0 |---------> | BUFFER0 |
 * +------+           |         |
 *                    +---------+- tbuf.va
 *
 * Each CPU buffer consists of a metadata header followed by the trace records.
 * The metadata consists of a producer/consumer pair of pointers into the
 * buffer that point to the next record to be written and the next record to
 * be read respectively.
 *
 * A trace record can be in one of two forms, depending on whether the TSC is
 * included. The record header indicates whether or not the TSC field is
 * present.
 *
 * 1. Trace record without TSC:
 *     +------------------------------------------------------------+
 *     | HEADER(uint32_t) |              DATA FIELDS                |
 *     +------------------------------------------------------------+
 *
 * 2. Trace record with TSC:
 *     +----------------------------------------------------------------------+
 *     | HEADER(uint32_t) | TSC(uint64_t) |          DATA FIELDS              |
 *     +----------------------------------------------------------------------+
 *
 * Where,
 *
 * HEADER bit field:
 *     +----------------------------------------------------------------------+
 *     | C |   NDATA   |                       EVENT                          |
 *     +----------------------------------------------------------------------+
 *      31  30       28 27                                                   0
 *
 * EVENT: Event ID.
 * NDATA: Number of populated data fields.
 * C: TSC included.
 *
 * DATA FIELDS:
 *     +----------------------------------------------------------------------+
 *     | D1(uint32_t) | D2(uint32_t) | D3(uint32_t) |  . . .  | D7(uint32_t)  |
 *     +----------------------------------------------------------------------+
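 *
 * For illustration only (the driver itself decodes records through
 * struct t_rec from <xen/public/trace.h> rather than by picking apart raw
 * words), a header word laid out as above could be unpacked like this, where
 * hdr is the raw uint32_t:
 *
 *     tsc_included = (hdr >> 31) & 0x1;       C bit
 *     ndata        = (hdr >> 28) & 0x7;       number of populated data fields
 *     event        = hdr & 0x0fffffff;        event ID
 *
 * e.g. a (made-up) header word of 0x90000002 would decode to C=1, NDATA=1,
 * EVENT=2.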
 *
 *
 * Interacting with DTrace
 * -----------------------
 *
 * Every xdt_poll_nsec nanoseconds we poll the trace buffers for data and feed
 * each entry into dtrace_probe() with the corresponding probe ID for the
 * event. Because records are collected periodically, probe firings are
 * asynchronous. This is the only sensible way to implement this form of
 * provider, but because of its asynchronous nature, asking things like
 * "current CPU" and, more importantly, arbitrary questions about the context
 * surrounding the probe firing are not meaningful. So, consumers should not
 * attempt to infer anything beyond what is supplied via the probe arguments.
 */

#include <sys/xpv_user.h>

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/cmn_err.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/cyclic.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>

#include <sys/hypervisor.h>
#include <xen/public/trace.h>
#include <xen/public/sched.h>

#define XDT_POLL_DEFAULT 100000000      /* default poll interval (ns) */
#define XDT_POLL_MIN     10000000       /* min poll interval (ns) */
#define XDT_TBUF_RETRY   50             /* tbuf disable retry count */

/*
 * The domid must match IDLE_DOMAIN_ID in xen.hg/xen/include/xen/sched.h
 * in the xVM gate.
 */
#define IS_IDLE_DOM(domid)      (domid == 0x7FFFU)

/* Macros to extract the domid and cpuid from a HVM trace data field */
#define HVM_DOMID(d)    (d >> 16)
#define HVM_VCPUID(d)   (d & 0xFFFF)

#define XDT_PROBE4(event, cpuid, arg0, arg1, arg2, arg3) {              \
        dtrace_id_t id = xdt_probemap[event];                           \
        if (id)                                                         \
                dtrace_probe(id, cpuid, arg0, arg1, arg2, arg3);        \
}                                                                       \

#define XDT_PROBE3(event, cpuid, arg0, arg1, arg2)      \
        XDT_PROBE4(event, cpuid, arg0, arg1, arg2, 0)

#define XDT_PROBE2(event, cpuid, arg0, arg1)    \
        XDT_PROBE4(event, cpuid, arg0, arg1, 0, 0)

#define XDT_PROBE1(event, cpuid, arg0)  \
        XDT_PROBE4(event, cpuid, arg0, 0, 0, 0)

#define XDT_PROBE0(event, cpuid)        \
        XDT_PROBE4(event, cpuid, 0, 0, 0, 0)

/* Probe classes */
#define XDT_SCHED       0
#define XDT_MEM         1
#define XDT_HVM         2
#define XDT_GEN         3
#define XDT_NCLASSES    4

/* Probe events */
#define XDT_EVT_INVALID                 (-(int)1)
#define XDT_SCHED_OFF_CPU               0
#define XDT_SCHED_ON_CPU                1
#define XDT_SCHED_IDLE_OFF_CPU          2
#define XDT_SCHED_IDLE_ON_CPU           3
#define XDT_SCHED_BLOCK                 4
#define XDT_SCHED_SLEEP                 5
#define XDT_SCHED_WAKE                  6
#define XDT_SCHED_YIELD                 7
#define XDT_SCHED_SHUTDOWN_POWEROFF     8
#define XDT_SCHED_SHUTDOWN_REBOOT       9
#define XDT_SCHED_SHUTDOWN_SUSPEND      10
#define XDT_SCHED_SHUTDOWN_CRASH        11
#define XDT_MEM_PAGE_GRANT_MAP          12
#define XDT_MEM_PAGE_GRANT_UNMAP        13
#define XDT_MEM_PAGE_GRANT_TRANSFER     14
#define XDT_HVM_VMENTRY                 15
#define XDT_HVM_VMEXIT                  16
#define XDT_TRC_LOST_RECORDS            17
#define XDT_NEVENTS                     18

typedef struct {
        const char *pr_mod;     /* probe module */
        const char *pr_name;    /* probe name */
        int evt_id;             /* event id */
        uint_t class;           /* probe class */
} xdt_probe_t;

typedef struct {
        uint32_t trc_mask;      /* trace mask */
        uint32_t cnt;           /* num enabled probes in class */
} xdt_classinfo_t;

typedef struct {
        ulong_t prev_domid;     /* previous dom executed */
        ulong_t prev_vcpuid;    /* previous vcpu executed */
        ulong_t prev_ctime;     /* time spent on cpu */
        ulong_t next_domid;     /* next dom to be scheduled */
        ulong_t next_vcpuid;    /* next vcpu to be scheduled */
        ulong_t next_wtime;     /* time spent waiting to get on cpu */
        ulong_t next_ts;        /* allocated time slice */
} xdt_schedinfo_t;

static struct {
        uint_t cnt;             /* total num of trace buffers */
        size_t size;            /* size of each cpu buffer */
        mfn_t start_mfn;        /* starting mfn of buffers */
        caddr_t va;             /* va buffers are mapped into */

        /* per-cpu buffers */
        struct t_buf **meta;    /* buffer metadata */
        struct t_rec **data;    /* buffer data records */

        /* statistics */
        uint64_t stat_dropped_recs;     /* records dropped */
        uint64_t stat_spurious_cpu;     /* recs with garbage cpuids */
        uint64_t stat_spurious_switch;  /* inconsistent vcpu switches */
        uint64_t stat_unknown_shutdown; /* unknown shutdown code */
        uint64_t stat_unknown_recs;     /* unknown records */
} tbuf;

static char *xdt_stats[] = {
        "dropped_recs",
};

/*
 * Tunable variables
 *
 * The following may be tuned by adding a line to /etc/system that
 * includes both the name of the module ("xdt") and the name of the variable.
 * For example:
 *     set xdt:xdt_tbuf_pages = 40
 */
uint_t xdt_tbuf_pages = 20;     /* pages to alloc per-cpu buf */

/*
 * The following may be tuned by adding a line to
 * /platform/i86xpv/kernel/drv/xdt.conf.
 * For example:
 *     xdt_poll_nsec = 200000000;
 */
static hrtime_t xdt_poll_nsec;  /* trace buffer poll interval */

/*
 * Internal variables
 */
static dev_info_t *xdt_devi;
static dtrace_provider_id_t xdt_id;
static uint_t xdt_ncpus;        /* total number of phys CPUs */
static uint32_t cur_trace_mask; /* current trace mask */
static xdt_schedinfo_t *xdt_cpu_schedinfo;      /* per-cpu sched info */
dtrace_id_t xdt_probemap[XDT_NEVENTS];  /* map of enabled probes */
dtrace_id_t xdt_prid[XDT_NEVENTS];      /* IDs of registered events */
static cyclic_id_t xdt_cyclic = CYCLIC_NONE;
static kstat_t *xdt_kstats;
static xdt_classinfo_t xdt_classinfo[XDT_NCLASSES];

static xdt_probe_t xdt_probe[] = {
        /* Sched probes */
        { "sched", "off-cpu", XDT_SCHED_OFF_CPU, XDT_SCHED },
        { "sched", "on-cpu", XDT_SCHED_ON_CPU, XDT_SCHED },
        { "sched", "idle-off-cpu", XDT_SCHED_IDLE_OFF_CPU, XDT_SCHED },
        { "sched", "idle-on-cpu", XDT_SCHED_IDLE_ON_CPU, XDT_SCHED },
        { "sched", "block", XDT_SCHED_BLOCK, XDT_SCHED },
        { "sched", "sleep", XDT_SCHED_SLEEP, XDT_SCHED },
        { "sched", "wake", XDT_SCHED_WAKE, XDT_SCHED },
        { "sched", "yield", XDT_SCHED_YIELD, XDT_SCHED },
        { "sched", "shutdown-poweroff", XDT_SCHED_SHUTDOWN_POWEROFF,
            XDT_SCHED },
        { "sched", "shutdown-reboot", XDT_SCHED_SHUTDOWN_REBOOT, XDT_SCHED },
        { "sched", "shutdown-suspend", XDT_SCHED_SHUTDOWN_SUSPEND, XDT_SCHED },
        { "sched", "shutdown-crash", XDT_SCHED_SHUTDOWN_CRASH, XDT_SCHED },

        /* Memory probes */
        { "mem", "page-grant-map", XDT_MEM_PAGE_GRANT_MAP, XDT_MEM },
        { "mem", "page-grant-unmap", XDT_MEM_PAGE_GRANT_UNMAP, XDT_MEM },
        { "mem", "page-grant-transfer", XDT_MEM_PAGE_GRANT_TRANSFER, XDT_MEM },

        /* HVM probes */
        { "hvm", "vmentry", XDT_HVM_VMENTRY, XDT_HVM },
        { "hvm", "vmexit", XDT_HVM_VMEXIT, XDT_HVM },

        /* Trace buffer related probes */
        { "trace", "records-lost", XDT_TRC_LOST_RECORDS, XDT_GEN },

        { NULL }
};

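/*
 * For illustration only: once registered, the probes above are visible to
 * consumers as xdt:<module>::<name>, and their arguments are laid out by the
 * XDT_PROBE* macros (arg0 is the physical cpuid, followed by the
 * event-specific values described in xdt_process_rec() below). A hypothetical
 * one-liner counting off-cpu events per domain might look like:
 *
 *     dtrace -n 'xdt:sched::off-cpu { @[arg1] = count(); }'
 *
 * where arg1 is the domid being descheduled.
 */
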
static inline uint32_t
xdt_nr_active_probes()
{
        int i;
        uint32_t tot = 0;

        for (i = 0; i < XDT_NCLASSES; i++)
                tot += xdt_classinfo[i].cnt;

        return (tot);
}

static void
xdt_init_trace_masks(void)
{
        xdt_classinfo[XDT_SCHED].trc_mask = TRC_SCHED;
        xdt_classinfo[XDT_MEM].trc_mask = TRC_MEM;
        xdt_classinfo[XDT_HVM].trc_mask = TRC_HVM;
        xdt_classinfo[XDT_GEN].trc_mask = TRC_GEN;
}

static int
xdt_kstat_update(kstat_t *ksp, int flag)
{
        kstat_named_t *knp;

        if (flag != KSTAT_READ)
                return (EACCES);

        knp = ksp->ks_data;

        /*
         * Assignment order should match that of the names in
         * xdt_stats.
         */
        (knp++)->value.ui64 = tbuf.stat_dropped_recs;

        return (0);
}

static void
xdt_kstat_init(void)
{
        int nstats = sizeof (xdt_stats) / sizeof (xdt_stats[0]);
        char **cp = xdt_stats;
        kstat_named_t *knp;

        if ((xdt_kstats = kstat_create("xdt", 0, "trace_statistics", "misc",
            KSTAT_TYPE_NAMED, nstats, 0)) == NULL)
                return;

        xdt_kstats->ks_update = xdt_kstat_update;

        knp = xdt_kstats->ks_data;
        while (nstats > 0) {
                kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
                knp++;
                cp++;
                nstats--;
        }

        kstat_install(xdt_kstats);
}

static int
xdt_sysctl_tbuf(xen_sysctl_tbuf_op_t *tbuf_op)
{
        xen_sysctl_t op;
        int xerr;

        op.cmd = XEN_SYSCTL_tbuf_op;
        op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
        op.u.tbuf_op = *tbuf_op;

        if ((xerr = HYPERVISOR_sysctl(&op)) != 0)
                return (xen_xlate_errcode(xerr));

        *tbuf_op = op.u.tbuf_op;
        return (0);
}

static int
xdt_map_trace_buffers(mfn_t mfn, caddr_t va, size_t len)
{
        x86pte_t pte;
        caddr_t const sva = va;
        caddr_t const eva = va + len;
        int xerr;

        ASSERT(mfn != MFN_INVALID);
        ASSERT(va != NULL);
        ASSERT(IS_PAGEALIGNED(len));

        for (; va < eva; va += MMU_PAGESIZE) {
                /*
                 * Ask the HAT to load a throwaway mapping to page zero, then
                 * overwrite it with the hypervisor mapping. It gets removed
                 * later via hat_unload().
                 */
                hat_devload(kas.a_hat, va, MMU_PAGESIZE, (pfn_t)0,
                    PROT_READ | HAT_UNORDERED_OK,
                    HAT_LOAD_NOCONSIST | HAT_LOAD);

                pte = mmu_ptob((x86pte_t)mfn) | PT_VALID | PT_USER
                    | PT_FOREIGN | PT_WRITABLE;

                xerr = HYPERVISOR_update_va_mapping_otherdomain((ulong_t)va,
                    pte, UVMF_INVLPG | UVMF_LOCAL, DOMID_XEN);

                if (xerr != 0) {
                        /* unmap pages loaded so far */
                        size_t ulen = (uintptr_t)(va + MMU_PAGESIZE) -
                            (uintptr_t)sva;
                        hat_unload(kas.a_hat, sva, ulen, HAT_UNLOAD_UNMAP);
                        return (xen_xlate_errcode(xerr));
                }

                mfn++;
        }

        return (0);
}

static int
xdt_attach_trace_buffers(void)
{
        xen_sysctl_tbuf_op_t tbuf_op;
        size_t len;
        int err;
        uint_t i;

        /*
         * Xen does not support trace buffer re-sizing. If the buffers
         * have already been allocated we just use them as is.
         */
        tbuf_op.cmd = XEN_SYSCTL_TBUFOP_get_info;
        if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
                return (err);

        if (tbuf_op.size == 0) {
                /* set trace buffer size */
                tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_size;
                tbuf_op.size = xdt_tbuf_pages;
                (void) xdt_sysctl_tbuf(&tbuf_op);

                /* get trace buffer info */
                tbuf_op.cmd = XEN_SYSCTL_TBUFOP_get_info;
                if ((err = xdt_sysctl_tbuf(&tbuf_op)) != 0)
                        return (err);

                if (tbuf_op.size == 0) {
                        cmn_err(CE_NOTE, "Couldn't allocate trace buffers.");
                        return (ENOBUFS);
                }
        }

        tbuf.size = tbuf_op.size;
        tbuf.start_mfn = (mfn_t)tbuf_op.buffer_mfn;
        tbuf.cnt = xdt_ncpus;

        ASSERT(tbuf.start_mfn != MFN_INVALID);
        ASSERT(tbuf.cnt > 0);

        len = tbuf.size * tbuf.cnt;
        tbuf.va = vmem_alloc(heap_arena, len, VM_SLEEP);

        if ((err = xdt_map_trace_buffers(tbuf.start_mfn, tbuf.va, len)) != 0) {
                vmem_free(heap_arena, tbuf.va, len);
                tbuf.va = NULL;
                return (err);
        }

        tbuf.meta = (struct t_buf **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.meta),
            KM_SLEEP);
        tbuf.data = (struct t_rec **)kmem_alloc(tbuf.cnt * sizeof (*tbuf.data),
            KM_SLEEP);

        for (i = 0; i < tbuf.cnt; i++) {
                void *cpu_buf = (void *)(tbuf.va + (tbuf.size * i));
                tbuf.meta[i] = cpu_buf;
                tbuf.data[i] = (struct t_rec *)((uintptr_t)cpu_buf +
                    sizeof (struct t_buf));

                /* throw away stale trace records */
                tbuf.meta[i]->cons = tbuf.meta[i]->prod;
        }

        return (0);
}

static void
xdt_detach_trace_buffers(void)
{
        size_t len = tbuf.size * tbuf.cnt;

        ASSERT(tbuf.va != NULL);

        hat_unload(kas.a_hat, tbuf.va, len,
            HAT_UNLOAD_UNMAP | HAT_UNLOAD_UNLOCK);
        vmem_free(heap_arena, tbuf.va, len);
        kmem_free(tbuf.meta, tbuf.cnt * sizeof (*tbuf.meta));
        kmem_free(tbuf.data, tbuf.cnt * sizeof (*tbuf.data));
}

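/*
 * xdt_process_rec() below decodes a single Xen trace record and, when a
 * matching probe is enabled, fires it with the record's data fields. It
 * returns the record's size in bytes so the caller can advance through the
 * buffer. As a worked example (illustrative only; the actual arithmetic is at
 * the end of the function): a record with the TSC present and three data
 * words occupies 4 + 8 + (3 * 4) = 24 bytes.
 */
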
static inline size_t
xdt_process_rec(uint_t cpuid, struct t_rec *rec)
{
        xdt_schedinfo_t *sp = &xdt_cpu_schedinfo[cpuid];
        int eid;
        uint32_t *data;
        size_t rec_size;

        ASSERT(rec != NULL);
        ASSERT(xdt_ncpus == xpv_nr_phys_cpus());

        if (cpuid >= xdt_ncpus) {
                tbuf.stat_spurious_cpu++;
                goto done;
        }

        data = rec->cycles_included ? rec->u.cycles.extra_u32 :
            rec->u.nocycles.extra_u32;

        switch (rec->event) {
        /*
         * Sched probes
         */
        case TRC_SCHED_SWITCH_INFPREV:
                /*
                 * Info on vCPU being de-scheduled
                 *
                 * data[0] = prev domid
                 * data[1] = time spent on pcpu
                 */
                sp->prev_domid = data[0];
                sp->prev_ctime = data[1];
                break;

        case TRC_SCHED_SWITCH_INFNEXT:
                /*
                 * Info on next vCPU to be scheduled
                 *
                 * data[0] = next domid
                 * data[1] = time spent waiting to get on cpu
                 * data[2] = time slice
                 */
                sp->next_domid = data[0];
                sp->next_wtime = data[1];
                sp->next_ts = data[2];
                break;

        case TRC_SCHED_SWITCH:
                /*
                 * vCPU switch
                 *
                 * data[0] = prev domid
                 * data[1] = prev vcpuid
                 * data[2] = next domid
                 * data[3] = next vcpuid
                 */
                if (data[0] != sp->prev_domid &&
                    data[2] != sp->next_domid) {
                        /* prev and next info don't match doms being sched'd */
                        tbuf.stat_spurious_switch++;
                        goto done;
                }

                sp->prev_vcpuid = data[1];
                sp->next_vcpuid = data[3];

                XDT_PROBE3(IS_IDLE_DOM(sp->prev_domid)?
                    XDT_SCHED_IDLE_OFF_CPU:XDT_SCHED_OFF_CPU,
                    cpuid, sp->prev_domid, sp->prev_vcpuid, sp->prev_ctime);

                XDT_PROBE4(IS_IDLE_DOM(sp->next_domid)?
                    XDT_SCHED_IDLE_ON_CPU:XDT_SCHED_ON_CPU,
                    cpuid, sp->next_domid, sp->next_vcpuid, sp->next_wtime,
                    sp->next_ts);
                break;

        case TRC_SCHED_BLOCK:
                /*
                 * vCPU blocked
                 *
                 * data[0] = domid
                 * data[1] = vcpuid
                 */
                XDT_PROBE2(XDT_SCHED_BLOCK, cpuid, data[0], data[1]);
                break;

        case TRC_SCHED_SLEEP:
                /*
                 * Put vCPU to sleep
                 *
                 * data[0] = domid
                 * data[1] = vcpuid
                 */
                XDT_PROBE2(XDT_SCHED_SLEEP, cpuid, data[0], data[1]);
                break;

        case TRC_SCHED_WAKE:
                /*
                 * Wake up vCPU
                 *
                 * data[0] = domid
                 * data[1] = vcpuid
                 */
                XDT_PROBE2(XDT_SCHED_WAKE, cpuid, data[0], data[1]);
                break;

        case TRC_SCHED_YIELD:
                /*
                 * vCPU yielded
                 *
                 * data[0] = domid
                 * data[1] = vcpuid
                 */
                XDT_PROBE2(XDT_SCHED_YIELD, cpuid, data[0], data[1]);
                break;

        case TRC_SCHED_SHUTDOWN:
                /*
                 * Guest shutting down
                 *
                 * data[0] = domid
                 * data[1] = initiating vcpu
                 * data[2] = shutdown code
                 */
                switch (data[2]) {
                case SHUTDOWN_poweroff:
                        eid = XDT_SCHED_SHUTDOWN_POWEROFF;
                        break;
                case SHUTDOWN_reboot:
                        eid = XDT_SCHED_SHUTDOWN_REBOOT;
                        break;
                case SHUTDOWN_suspend:
                        eid = XDT_SCHED_SHUTDOWN_SUSPEND;
                        break;
                case SHUTDOWN_crash:
                        eid = XDT_SCHED_SHUTDOWN_CRASH;
                        break;
                default:
                        tbuf.stat_unknown_shutdown++;
                        goto done;
                }

                XDT_PROBE1(eid, cpuid, data[0]);
                break;

        /*
         * Mem probes
         */
        case TRC_MEM_PAGE_GRANT_MAP:
                /*
                 * Guest mapped page grant
                 *
                 * data[0] = domid
                 */
                XDT_PROBE1(XDT_MEM_PAGE_GRANT_MAP, cpuid, data[0]);
                break;

        case TRC_MEM_PAGE_GRANT_UNMAP:
                /*
                 * Guest unmapped page grant
                 *
                 * data[0] = domid
                 */
                XDT_PROBE1(XDT_MEM_PAGE_GRANT_UNMAP, cpuid, data[0]);
                break;

        case TRC_MEM_PAGE_GRANT_TRANSFER:
                /*
                 * Page grant is being transferred
                 *
                 * data[0] = target domid
                 */
                XDT_PROBE1(XDT_MEM_PAGE_GRANT_TRANSFER, cpuid, data[0]);
                break;

        /*
         * HVM probes
         */
        case TRC_HVM_VMENTRY:
                /*
                 * Return to guest via vmx_launch/vmrun
                 *
                 * data[0] = (domid<<16 + vcpuid)
                 */
                XDT_PROBE2(XDT_HVM_VMENTRY, cpuid, HVM_DOMID(data[0]),
                    HVM_VCPUID(data[0]));
                break;

        case TRC_HVM_VMEXIT64:
                /*
                 * Entry into VMEXIT handler
                 *
                 * data[0] = (domid<<16 + vcpuid)
                 * data[1] = cpu vendor specific exit code
                 * data[2] = guest rip(0:31)
                 * data[3] = guest rip(32:64)
                 */
                XDT_PROBE4(XDT_HVM_VMEXIT, cpuid, HVM_DOMID(data[0]),
                    HVM_VCPUID(data[0]), data[1],
                    ((uint64_t)data[3]<<32) | data[2]);
                break;

        case TRC_LOST_RECORDS:
                XDT_PROBE0(XDT_TRC_LOST_RECORDS, cpuid);
                tbuf.stat_dropped_recs++;
                break;

        default:
                tbuf.stat_unknown_recs++;
                break;
        }

done:
        rec_size = 4 + (rec->cycles_included ? 8 : 0) + (rec->extra_u32 * 4);
        return (rec_size);
}

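/*
 * A note on the consumer/producer arithmetic in xdt_tbuf_scan() below (see
 * <xen/public/trace.h>): cons and prod are kept in the range
 * [0, 2 * data area size), hence the ASSERTs, and are reduced modulo the data
 * area size to get actual buffer offsets, so a scan may have to wrap. As an
 * illustrative example with a 4096-byte data area, cons == 7000 and
 * prod == 200 map to offsets 2904 and 200; the records from offset 2904 up to
 * 4096 are processed first, then the scan wraps to offset 0 and continues up
 * to 200.
 */
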
/*ARGSUSED*/
static void
xdt_tbuf_scan(void *arg)
{
        uint_t cpuid;
        size_t tbuf_data_size;
        struct t_rec *rec;
        uintptr_t data;
        uint32_t prod, cons;
        uint32_t offset, end_offset;

        tbuf_data_size = tbuf.size - sizeof (struct t_buf);

        /* scan all cpu buffers for new records */
        for (cpuid = 0; cpuid < tbuf.cnt; cpuid++) {
                cons = tbuf.meta[cpuid]->cons;
                prod = tbuf.meta[cpuid]->prod;
                membar_consumer(); /* read prod /then/ data */

                /* see <xen/public/trace.h> */
                ASSERT(cons < 2 * tbuf_data_size);
                ASSERT(prod < 2 * tbuf_data_size);

                if (prod == cons)
                        continue;

                offset = cons % tbuf_data_size;
                end_offset = prod % tbuf_data_size;

                if (offset >= end_offset) {
                        /* read up to the end of the buffer */
                        while (offset != tbuf_data_size) {
                                data = (uintptr_t)tbuf.data[cpuid] + offset;
                                rec = (struct t_rec *)data;
                                ASSERT((caddr_t)rec < tbuf.va + (tbuf.size *
                                    (cpuid + 1)));
                                offset += xdt_process_rec(cpuid, rec);
                        }
                        offset = 0; /* wrap around */
                }

                while (offset != end_offset) {
                        data = (uintptr_t)tbuf.data[cpuid] + offset;
                        rec = (struct t_rec *)data;
                        ASSERT((caddr_t)rec < tbuf.va + (tbuf.size *
                            (cpuid + 1)));
                        offset += xdt_process_rec(cpuid, rec);
                }

                membar_exit(); /* read data /then/ update cons */
                tbuf.meta[cpuid]->cons = prod;
        }
}

static void
xdt_cyclic_enable(void)
{
        cyc_handler_t hdlr;
        cyc_time_t when;

        ASSERT(MUTEX_HELD(&cpu_lock));

        hdlr.cyh_func = xdt_tbuf_scan;
        hdlr.cyh_arg = NULL;
        hdlr.cyh_level = CY_LOW_LEVEL;

        when.cyt_interval = xdt_poll_nsec;
        when.cyt_when = dtrace_gethrtime() + when.cyt_interval;

        xdt_cyclic = cyclic_add(&hdlr, &when);
}

static void
xdt_probe_create(xdt_probe_t *p)
{
        ASSERT(p != NULL && p->pr_mod != NULL);

        if (dtrace_probe_lookup(xdt_id, p->pr_mod, NULL, p->pr_name) != 0)
                return;

        xdt_prid[p->evt_id] = dtrace_probe_create(xdt_id, p->pr_mod, NULL,
            p->pr_name, dtrace_mach_aframes(), p);
}

/*ARGSUSED*/
static void
xdt_provide(void *arg, const dtrace_probedesc_t *desc)
{
        const char *mod, *name;
        int i;

        if (desc == NULL) {
                for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
                        xdt_probe_create(&xdt_probe[i]);
                }
        } else {
                mod = desc->dtpd_mod;
                name = desc->dtpd_name;
                for (i = 0; xdt_probe[i].pr_mod != NULL; i++) {
                        int l1 = strlen(xdt_probe[i].pr_name);
                        int l2 = strlen(xdt_probe[i].pr_mod);
                        if (strncmp(name, xdt_probe[i].pr_name, l1) == 0 &&
                            strncmp(mod, xdt_probe[i].pr_mod, l2) == 0)
                                break;
                }

                if (xdt_probe[i].pr_mod == NULL)
                        return;
                xdt_probe_create(&xdt_probe[i]);
        }
}

/*ARGSUSED*/
static void
xdt_destroy(void *arg, dtrace_id_t id, void *parg)
{
        xdt_probe_t *p = parg;
        xdt_prid[p->evt_id] = 0;
}

static void
xdt_set_trace_mask(uint32_t mask)
{
        xen_sysctl_tbuf_op_t tbuf_op;

        tbuf_op.cmd = XEN_SYSCTL_TBUFOP_set_evt_mask;
        tbuf_op.evt_mask = mask;
        (void) xdt_sysctl_tbuf(&tbuf_op);
}

/*ARGSUSED*/
static int
xdt_enable(void *arg, dtrace_id_t id, void *parg)
{
        xdt_probe_t *p = parg;
        xen_sysctl_tbuf_op_t tbuf_op;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(xdt_prid[p->evt_id] != 0);

        xdt_probemap[p->evt_id] = xdt_prid[p->evt_id];
        xdt_classinfo[p->class].cnt++;

        if (xdt_classinfo[p->class].cnt == 1) {
                /* set the trace mask for this class */
                cur_trace_mask |= xdt_classinfo[p->class].trc_mask;
                xdt_set_trace_mask(cur_trace_mask);
        }

        if (xdt_cyclic == CYCLIC_NONE) {
                tbuf_op.cmd = XEN_SYSCTL_TBUFOP_enable;
                if (xdt_sysctl_tbuf(&tbuf_op) != 0) {
                        cmn_err(CE_NOTE,
                            "Couldn't enable hypervisor tracing.");
                        return (-1);
                }

                xdt_cyclic_enable();
        }
        return (0);
}

/*ARGSUSED*/
static void
xdt_disable(void *arg, dtrace_id_t id, void *parg)
{
        xdt_probe_t *p = parg;
        xen_sysctl_tbuf_op_t tbuf_op;
        int i, err;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(xdt_probemap[p->evt_id] != 0);
        ASSERT(xdt_probemap[p->evt_id] == xdt_prid[p->evt_id]);
        ASSERT(xdt_classinfo[p->class].cnt > 0);

        /*
         * We could be here in the slight window between the cyclic firing and
         * a call to dtrace_probe() occurring. We need to be careful if we tear
         * down any shared state.
         */

        xdt_probemap[p->evt_id] = 0;
        xdt_classinfo[p->class].cnt--;

        if (xdt_nr_active_probes() == 0) {
                cur_trace_mask = 0;

                if (xdt_cyclic == CYCLIC_NONE)
                        return;

                /*
                 * We will try to disable the trace buffers. If we fail for
                 * some reason we will try again, up to a count of
                 * XDT_TBUF_RETRY. If we still aren't successful we try to set
                 * the trace mask to 0 in order to prevent trace records from
                 * being written.
                 */
                tbuf_op.cmd = XEN_SYSCTL_TBUFOP_disable;
                i = 0;
                do {
                        err = xdt_sysctl_tbuf(&tbuf_op);
                } while ((err != 0) && (++i < XDT_TBUF_RETRY));

                if (err != 0) {
                        cmn_err(CE_NOTE,
                            "Couldn't disable hypervisor tracing.");
                        xdt_set_trace_mask(0);
                } else {
                        cyclic_remove(xdt_cyclic);
                        xdt_cyclic = CYCLIC_NONE;
                        /*
                         * We don't bother making the hypercall to set
                         * the trace mask, since it will be reset when
                         * tracing is re-enabled.
                         */
                }
        } else if (xdt_classinfo[p->class].cnt == 0) {
                cur_trace_mask ^= xdt_classinfo[p->class].trc_mask;
                /* other probes are enabled, so add the sub-class mask back */
                cur_trace_mask |= 0xF000;
                xdt_set_trace_mask(cur_trace_mask);
        }
}

static dtrace_pattr_t xdt_attr = {
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM },
};

static dtrace_pops_t xdt_pops = {
        xdt_provide,            /* dtps_provide() */
        NULL,                   /* dtps_provide_module() */
        xdt_enable,             /* dtps_enable() */
        xdt_disable,            /* dtps_disable() */
        NULL,                   /* dtps_suspend() */
        NULL,                   /* dtps_resume() */
        NULL,                   /* dtps_getargdesc() */
        NULL,                   /* dtps_getargval() */
        NULL,                   /* dtps_usermode() */
        xdt_destroy             /* dtps_destroy() */
};

static int
xdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        int val;

        if (!DOMAIN_IS_INITDOMAIN(xen_info))
                return (DDI_FAILURE);

        switch (cmd) {
        case DDI_ATTACH:
                break;

        case DDI_RESUME:
                /*
                 * We might support proper suspend/resume in the future, so
                 * return DDI_FAILURE for now.
                 */
                return (DDI_FAILURE);

        default:
                return (DDI_FAILURE);
        }

        xdt_ncpus = xpv_nr_phys_cpus();
        ASSERT(xdt_ncpus > 0);

        if (ddi_create_minor_node(devi, "xdt", S_IFCHR, 0, DDI_PSEUDO, 0) ==
            DDI_FAILURE || xdt_attach_trace_buffers() != 0 ||
            dtrace_register("xdt", &xdt_attr, DTRACE_PRIV_KERNEL, NULL,
            &xdt_pops, NULL, &xdt_id) != 0) {
                if (tbuf.va != NULL)
                        xdt_detach_trace_buffers();
                ddi_remove_minor_node(devi, NULL);
                return (DDI_FAILURE);
        }

        val = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
            "xdt_poll_nsec", XDT_POLL_DEFAULT);
        xdt_poll_nsec = MAX(val, XDT_POLL_MIN);

        xdt_cpu_schedinfo = (xdt_schedinfo_t *)kmem_alloc(xdt_ncpus *
            sizeof (xdt_schedinfo_t), KM_SLEEP);
        xdt_init_trace_masks();
        xdt_kstat_init();

        xdt_devi = devi;
        ddi_report_dev(devi);
        return (DDI_SUCCESS);
}

static int
xdt_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
        switch (cmd) {
        case DDI_DETACH:
                break;

        case DDI_SUSPEND:
                /*
                 * We might support proper suspend/resume in the future. So
                 * return DDI_FAILURE for now.
                 */
                return (DDI_FAILURE);

        default:
                return (DDI_FAILURE);
        }

        if (dtrace_unregister(xdt_id) != 0)
                return (DDI_FAILURE);

        xdt_detach_trace_buffers();
        kmem_free(xdt_cpu_schedinfo, xdt_ncpus * sizeof (xdt_schedinfo_t));
        if (xdt_cyclic != CYCLIC_NONE)
                cyclic_remove(xdt_cyclic);
        if (xdt_kstats != NULL)
                kstat_delete(xdt_kstats);
        xdt_devi = (void *)0;
        ddi_remove_minor_node(devi, NULL);

        return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
xdt_info(dev_info_t *devi, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int error;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = xdt_devi;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)0;
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
        }
        return (error);
}

static struct cb_ops xdt_cb_ops = {
        nulldev,                /* open(9E) */
        nodev,                  /* close(9E) */
        nodev,                  /* strategy(9E) */
        nodev,                  /* print(9E) */
        nodev,                  /* dump(9E) */
        nodev,                  /* read(9E) */
        nodev,                  /* write(9E) */
        nodev,                  /* ioctl(9E) */
        nodev,                  /* devmap(9E) */
        nodev,                  /* mmap(9E) */
        nodev,                  /* segmap(9E) */
        nochpoll,               /* chpoll(9E) */
        ddi_prop_op,            /* prop_op(9E) */
        NULL,                   /* streamtab(9S) */
        D_MP | D_64BIT | D_NEW  /* cb_flag */
};

static struct dev_ops xdt_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* devo_refcnt */
        xdt_info,               /* getinfo(9E) */
        nulldev,                /* identify(9E) */
        nulldev,                /* probe(9E) */
        xdt_attach,             /* attach(9E) */
        xdt_detach,             /* detach(9E) */
        nulldev,                /* devo_reset */
        &xdt_cb_ops,            /* devo_cb_ops */
        NULL,                   /* devo_bus_ops */
        NULL,                   /* power(9E) */
        ddi_quiesce_not_needed, /* devo_quiesce */
};


static struct modldrv modldrv = {
        &mod_driverops,
        "Hypervisor event tracing",
        &xdt_ops
};

static struct modlinkage modlinkage = {
        MODREV_1,
        &modldrv,
        NULL
};

int
_init(void)
{
        return (mod_install(&modlinkage));
}

int
_fini(void)
{
        return (mod_remove(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}