1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 27 #include <sys/systm.h> 28 #include <sys/conf.h> 29 #include <sys/stat.h> 30 #include <sys/ddi.h> 31 #include <sys/sunddi.h> 32 #include <sys/modctl.h> 33 #include <sys/cpu_module.h> 34 #include <vm/hat_sfmmu.h> 35 #include <vm/seg_kmem.h> 36 #include <vm/seg_kpm.h> 37 #include <vm/vm_dep.h> 38 #include <sys/machsystm.h> 39 #include <sys/machasi.h> 40 #include <sys/sysmacros.h> 41 #include <sys/callb.h> 42 #include <sys/archsystm.h> 43 #include <sys/trapstat.h> 44 #ifdef sun4v 45 #include <sys/hypervisor_api.h> 46 #endif 47 #ifndef sun4v 48 #include <sys/pghw.h> 49 #endif 50 51 /* BEGIN CSTYLED */ 52 /* 53 * trapstat: Trap Statistics through Dynamic Trap Table Interposition 54 * ------------------------------------------------------------------- 55 * 56 * Motivation and Overview 57 * 58 * Despite being a fundamental indicator of system behavior, there has 59 * historically been very little insight provided into the frequency and cost 60 * of machine-specific traps. The lack of insight has been especially acute 61 * on UltraSPARC microprocessors: because these microprocessors handle TLB 62 * misses as software traps, the frequency and duration of traps play a 63 * decisive role in the performance of the memory system. As applications have 64 * increasingly outstripped TLB reach, this has become increasingly true. 65 * 66 * Part of the difficulty of observing trap behavior is that the trap handlers 67 * are so frequently called (e.g. millions of times per second) that any 68 * permanently enabled instrumentation would induce an unacceptable performance 69 * degradation. Thus, it is a constraint on any trap observability 70 * infrastructure that it have no probe effect when not explicitly enabled. 71 * 72 * The basic idea, then, is to create an interposing trap table in which each 73 * entry increments a per-trap, in-memory counter and then jumps to the actual, 74 * underlying trap table entry. To enable trapstat, we atomically write to the 75 * trap base address (%tba) register to point to our interposing trap table. 76 * (Note that per-CPU statistics fall out by creating a different trap table 77 * for each CPU.) 78 * 79 * Implementation Details 80 * 81 * While the idea is straight-forward, a nuance of SPARC V9 slightly 82 * complicates the implementation. Unlike its predecessors, SPARC V9 supports 83 * the notion of nested traps. The trap level is kept in the TL register: 84 * during normal operation it is 0; when a trap is taken, the TL register is 85 * incremented by 1. To aid system software, SPARC V9 breaks the trap table 86 * into two halves: the lower half contains the trap handlers for traps taken 87 * when TL is 0; the upper half contains the trap handlers for traps taken 88 * when TL is greater than 0. Each half is further subdivided into two 89 * subsequent halves: the lower half contains the trap handlers for traps 90 * other than those induced by the trap instruction (Tcc variants); the upper 91 * half contains the trap handlers for traps induced by the trap instruction. 92 * This gives a total of four ranges, with each range containing 256 traps: 93 * 94 * +--------------------------------+- 3ff 95 * | | . 96 * | Trap instruction, TL>0 | . 97 * | | . 98 * |- - - - - - - - - - - - - - - - +- 300 99 * |- - - - - - - - - - - - - - - - +- 2ff 100 * | | . 101 * | Non-trap instruction, TL>0 | . 102 * | | . 103 * |- - - - - - - - - - - - - - - - +- 200 104 * |- - - - - - - - - - - - - - - - +- 1ff 105 * | | . 106 * | Trap instruction, TL=0 | . 107 * | | . 108 * |- - - - - - - - - - - - - - - - +- 100 109 * |- - - - - - - - - - - - - - - - +- 0ff 110 * | | . 111 * | Non-trap instruction, TL=0 | . 112 * | | . 113 * +--------------------------------+- 000 114 * 115 * 116 * Solaris, however, doesn't have reason to support trap instructions when 117 * TL>0 (only privileged code may execute at TL>0; not supporting this only 118 * constrains our own implementation). The trap table actually looks like: 119 * 120 * +--------------------------------+- 2ff 121 * | | . 122 * | Non-trap instruction, TL>0 | . 123 * | | . 124 * |- - - - - - - - - - - - - - - - +- 200 125 * |- - - - - - - - - - - - - - - - +- 1ff 126 * | | . 127 * | Trap instruction, TL=0 | . 128 * | | . 129 * |- - - - - - - - - - - - - - - - +- 100 130 * |- - - - - - - - - - - - - - - - +- 0ff 131 * | | . 132 * | Non-trap instruction, TL=0 | . 133 * | | . 134 * +--------------------------------+- 000 135 * 136 * Putatively to aid system software, SPARC V9 has the notion of multiple 137 * sets of global registers. UltraSPARC defines four sets of global 138 * registers: 139 * 140 * Normal Globals 141 * Alternate Globals (AGs) 142 * MMU Globals (MGs) 143 * Interrupt Globals (IGs) 144 * 145 * The set of globals in use is controlled by bits in PSTATE; when TL is 0 146 * (and PSTATE has not been otherwise explicitly modified), the Normal Globals 147 * are in use. When a trap is issued, PSTATE is modified to point to a set of 148 * globals corresponding to the trap type. Most traps correspond to the 149 * Alternate Globals, with a minority corresponding to the MMU Globals, and 150 * only the interrupt-vector trap (vector 0x60) corresponding to the Interrupt 151 * Globals. (The complete mapping can be found in the UltraSPARC I&II User's 152 * Manual.) 153 * 154 * Note that the sets of globals are per trap _type_, not per trap _level_. 155 * Thus, when executing a TL>0 trap handler, one may not have registers 156 * available (for example, both trap-instruction traps and spill traps execute 157 * on the alternate globals; if a trap-instruction trap induces a window spill, 158 * the window spill handler has no available globals). For trapstat, this is 159 * problematic: a register is required to transfer control from one arbitrary 160 * location (in the interposing trap table) to another (in the actual trap 161 * table). 162 * 163 * We solve this problem by exploiting the trap table's location at the bottom 164 * of valid kernel memory (i.e. at KERNELBASE). We locate the interposing trap 165 * tables just below KERNELBASE -- thereby allowing us to use a branch-always 166 * instruction (ba) instead of a jump instruction (jmp) to transfer control 167 * from the TL>0 entries in the interposing trap table to the TL>0 entries in 168 * the actual trap table. (N.B. while this allows trap table interposition to 169 * work, it necessarily limits trapstat to only recording information about 170 * TL=0 traps -- there is no way to increment a counter without using a 171 * register.) Diagrammatically: 172 * 173 * Actual trap table: 174 * 175 * +--------------------------------+- 2ff 176 * | | . 177 * | Non-trap instruction, TL>0 | . <-----------------------+ 178 * | | . <-----------------------|-+ 179 * |- - - - - - - - - - - - - - - - +- 200 <-----------------------|-|-+ 180 * |- - - - - - - - - - - - - - - - +- 1ff | | | 181 * | | . | | | 182 * | Trap instruction, TL=0 | . <-----------------+ | | | 183 * | | . <-----------------|-+ | | | 184 * |- - - - - - - - - - - - - - - - +- 100 <-----------------|-|-+ | | | 185 * |- - - - - - - - - - - - - - - - +- 0ff | | | | | | 186 * | | . | | | | | | 187 * | Non-trap instruction, TL=0 | . <-----------+ | | | | | | 188 * | | . <-----------|-+ | | | | | | 189 * +--------------------------------+- 000 <-----------|-|-+ | | | | | | 190 * KERNELBASE | | | | | | | | | 191 * | | | | | | | | | 192 * | | | | | | | | | 193 * Interposing trap table: | | | | | | | | | 194 * | | | | | | | | | 195 * +--------------------------------+- 2ff | | | | | | | | | 196 * | ... | . | | | | | | | | | 197 * | ... | . | | | | | | | | | 198 * | ... | . | | | | | | | | | 199 * |- - - - - - - - - - - - - - - - +- 203 | | | | | | | | | 200 * | ba,a | -------------|-|-|-|-|-|-+ | | 201 * |- - - - - - - - - - - - - - - - +- 202 | | | | | | | | 202 * | ba,a | -------------|-|-|-|-|-|---+ | 203 * |- - - - - - - - - - - - - - - - +- 201 | | | | | | | 204 * | ba,a | -------------|-|-|-|-|-|-----+ 205 * |- - - - - - - - - - - - - - - - +- 200 | | | | | | 206 * | ... | . | | | | | | 207 * | ... | . | | | | | | 208 * | ... | . | | | | | | 209 * |- - - - - - - - - - - - - - - - +- 103 | | | | | | 210 * | (Increment counter) | | | | | | | 211 * | ba,a | -------------------+ | | 212 * |- - - - - - - - - - - - - - - - +- 102 | | | | | 213 * | (Increment counter) | | | | | | 214 * | ba,a | ---------------------+ | 215 * |- - - - - - - - - - - - - - - - +- 101 | | | | 216 * | (Increment counter) | | | | | 217 * | ba,a | -----------------------+ 218 * |- - - - - - - - - - - - - - - - +- 100 | | | 219 * | ... | . | | | 220 * | ... | . | | | 221 * | ... | . | | | 222 * |- - - - - - - - - - - - - - - - +- 003 | | | 223 * | (Increment counter) | | | | 224 * | ba,a | -------------+ | | 225 * |- - - - - - - - - - - - - - - - +- 002 | | 226 * | (Increment counter) | | | 227 * | ba,a | ---------------+ | 228 * |- - - - - - - - - - - - - - - - +- 001 | 229 * | (Increment counter) | | 230 * | ba,a | -----------------+ 231 * +--------------------------------+- 000 232 * KERNELBASE - tstat_total_size 233 * 234 * tstat_total_size is the number of pages required for each trap table. It 235 * must be true that KERNELBASE - tstat_total_size is less than the maximum 236 * branch displacement; if each CPU were to consume a disjoint virtual range 237 * below KERNELBASE for its trap table, we could support at most 238 * (maximum_branch_displacement / tstat_total_size) CPUs. The maximum branch 239 * displacement for Bicc variants is just under eight megabytes, and (because 240 * the %tba must be 32K aligned), tstat_total_size must be at least 32K; if 241 * each CPU were to consume a disjoint virtual range, we would have an 242 * unacceptably low upper bound of 256 CPUs. 243 * 244 * While there are tricks that one could use to address this constraint (e.g., 245 * creating trampolines every maximum_branch_displacement bytes), we instead 246 * solve this by not permitting each CPU to consume a disjoint virtual range. 247 * Rather, we have each CPU's interposing trap table use the _same_ virtual 248 * range, but we back the trap tables with disjoint physical memory. Normally, 249 * such one-to-many virtual-to-physical mappings are illegal; this is 250 * permissible here only because the pages for the interposing trap table are 251 * necessarily locked in the TLB. (The CPUs thus never have the opportunity to 252 * discover that they have conflicting translations.) 253 * 254 * On CMT architectures in which CPUs can share MMUs, the above trick will not 255 * work: two CPUs that share an MMU cannot have the same virtual address map 256 * to disjoint physical pages. On these architectures, any CPUs sharing the 257 * same MMU must consume a disjoint 32K virtual address range -- limiting the 258 * number of CPUs sharing an MMU on these architectures to 256 due to the 259 * branch displacement limitation described above. On the sun4v architecture, 260 * there is a further limitation: a guest may not have more than eight locked 261 * TLB entries per MMU. To allow operation under this restriction, the 262 * interposing trap table and the trap statistics are each accessed through 263 * a single 4M TLB entry. This limits the footprint to two locked entries 264 * (one for the I-TLB and one for the D-TLB), but further restricts the number 265 * of CPUs to 128 per MMU. However, support for more than 128 CPUs can easily 266 * be added via a hybrid scheme, where the same 4M virtual address is used 267 * on different MMUs. 268 * 269 * On sun4v architecture, we cannot use the hybrid scheme as the architecture 270 * imposes additional restriction on the number of permanent mappings per 271 * guest and it is illegal to use the same virtual address to map different 272 * TTEs on different MMUs. Instead, we increase the number of supported CPUs 273 * by reducing the virtual address space requirements per CPU via shared 274 * interposing trap table as follows: 275 * 276 * Offset (within 4MB page) 277 * +------------------------------------+- 0x400000 278 * | CPU 1015 trap statistics (4KB) | . 279 * |- - - - - - - - - - - - - - - - - - +- 0x3ff000 280 * | | 281 * | ... | 282 * | | 283 * |- - - - - - - - - - - - - - - - - - +- 0x00a000 284 * | CPU 1 trap statistics (4KB) | . 285 * |- - - - - - - - - - - - - - - - - - +- 0x009000 286 * | CPU 0 trap statistics (4KB) | . 287 * |- - - - - - - - - - - - - - - - - - +- 0x008000 288 * | Shared trap handler continuation | . 289 * |- - - - - - - - - - - - - - - - - - +- 0x006000 290 * | Non-trap instruction, TL>0 | . 291 * |- - - - - - - - - - - - - - - - - - +- 0x004000 292 * | Trap instruction, TL=0 | . 293 * |- - - - - - - - - - - - - - - - - - +- 0x002000 294 * | Non-trap instruction, TL=0 | . 295 * +------------------------------------+- 0x000000 296 * 297 * Note that each CPU has its own 4K space for its trap statistics but 298 * shares the same interposing trap handlers. Interposing trap handlers 299 * use the CPU ID to determine the location of per CPU trap statistics 300 * area dynamically. This increases the interposing trap handler overhead, 301 * but is acceptable as it allows us to support up to 1016 CPUs with one 302 * 4MB page on sun4v architecture. Support for additional CPUs can be 303 * added with another 4MB page to 2040 cpus (or 3064 cpus with 2 additional 304 * 4MB pages). With additional 4MB pages, we cannot use displacement branch 305 * (ba instruction) and we have to use jmp instruction instead. Note that 306 * with sun4v, globals are nested (not per-trap type as in sun4u), so it is 307 * ok to use additional global reg to do jmp. This option is not available in 308 * sun4u which mandates the usage of displacement branches since no global reg 309 * is available at TL>1 310 * 311 * TLB Statistics 312 * 313 * Because TLB misses are an important component of system performance, we wish 314 * to know much more about these traps than simply the number received. 315 * Specifically, we wish to know: 316 * 317 * (a) The amount of time spent executing the TLB miss handler 318 * (b) TLB misses versus TSB misses 319 * (c) Kernel-level misses versus user-level misses 320 * (d) Misses per pagesize 321 * 322 * TLB Statistics: Time Spent Executing 323 * 324 * To accurately determine the amount of time spent executing the TLB miss 325 * handler, one must get a timestamp on trap entry and trap exit, subtract the 326 * latter from the former, and add the result to an accumulating count. 327 * Consider flow of control during normal TLB miss processing (where "ldx 328 * [%g2], %g2" is an arbitrary TLB-missing instruction): 329 * 330 * + - - - - - - - -+ 331 * : : 332 * : ldx [%g2], %g2 :<-------------------------------------------------------+ 333 * : : Return from trap: | 334 * + - - - - - - - -+ TL <- TL - 1 (0) | 335 * | %pc <- TSTATE[TL].TPC (address of load) | 336 * | TLB miss: | 337 * | TL <- TL + 1 (1) | 338 * | %pc <- TLB-miss-trap-handler | 339 * | | 340 * v | 341 * + - - - - - - - - - - - - - - - + | 342 * : : | 343 * : Lookup VA in TSB : | 344 * : If (hit) : | 345 * : Fill TLB : | 346 * : Else : | 347 * : Lookup VA (hme hash table : | 348 * : or segkpm) : | 349 * : Fill TLB : | 350 * : Endif : | 351 * : Issue "retry" ---------------------------------------------------------+ 352 * : : 353 * + - - - - - - - - - - - - - - - + 354 * TLB-miss-trap-handler 355 * 356 * 357 * As the above diagram indicates, interposing on the trap table allows one 358 * only to determine a timestamp on trap _entry_: when the TLB miss handler 359 * has completed filling the TLB, a "retry" will be issued, and control will 360 * transfer immediately back to the missing %pc. 361 * 362 * To obtain a timestamp on trap exit, we must then somehow interpose between 363 * the "retry" and the subsequent control transfer to the TLB-missing 364 * instruction. To do this, we _push_ a trap level. The basic idea is to 365 * spoof a TLB miss by raising TL, setting the %tpc to be within text 366 * controlled by trapstat (the "TLB return entry") and branching to the 367 * underlying TLB miss handler. When the TLB miss handler issues its "retry", 368 * control will transfer not to the TLB-missing instruction, but rather to the 369 * TLB return entry. This code can then obtain a timestamp, and issue its own 370 * "retry" -- thereby correctly returning to the TLB-missing instruction. 371 * Here is the above TLB miss flow control diagram modified to reflect 372 * trapstat's operation: 373 * 374 * + - - - - - - - -+ 375 * : : 376 * : ldx [%g2], %g2 :<-------------------------------------------------------+ 377 * : : Return from trap: | 378 * + - - - - - - - -+ TL <- TL - 1 (0) | 379 * | %pc <- TSTATE[TL].TPC (address of load) | 380 * | TLB miss: | 381 * | TL <- TL + 1 (1) | 382 * | %pc <- TLB-miss-trap-handler (trapstat) | 383 * | | 384 * v TLB-return-entry (trapstat) | 385 * + - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - + | 386 * : : : : | 387 * : Record timestamp : : Record timestamp : | 388 * : TL <- 2 : : Take timestamp difference : | 389 * : TSTATE[1].TPC <- TLB-return-entry : : Add to running total : | 390 * : ba,a TLB-miss-trap-handler -----------+ : Issue "retry" --------------+ 391 * : : | : : 392 * + - - - - - - - - - - - - - - - - - - + | + - - - - - - - - - - - - - + 393 * TLB-miss-trap-handler | ^ 394 * (trapstat) | | 395 * | | 396 * | | 397 * +-----------------------+ | 398 * | | 399 * | | 400 * v | 401 * + - - - - - - - - - - - - - - - + | 402 * : : | 403 * : Lookup VA in TSB : | 404 * : If (hit) : | 405 * : Fill TLB : | 406 * : Else : | 407 * : Lookup VA (hme hash table : | 408 * : or segkpm) : | 409 * : Fill TLB : | 410 * : Endif : | 411 * : Issue "retry" ------------------------------------------+ 412 * : : Return from trap: 413 * + - - - - - - - - - - - - - - - + TL <- TL - 1 (1) 414 * TLB-miss-trap-handler %pc <- TSTATE[TL].TPC (TLB-return-entry) 415 * 416 * 417 * A final subterfuge is required to complete our artifice: if we miss in 418 * the TLB, the TSB _and_ the subsequent hash or segkpm lookup (that is, if 419 * there is no valid translation for the TLB-missing address), common system 420 * software will need to accurately determine the %tpc as part of its page 421 * fault handling. We therefore modify the kernel to check the %tpc in this 422 * case: if the %tpc falls within the VA range controlled by trapstat and 423 * the TL is 2, TL is simply lowered back to 1 (this check is implemented 424 * by the TSTAT_CHECK_TL1 macro). Lowering TL to 1 has the effect of 425 * discarding the state pushed by trapstat. 426 * 427 * TLB Statistics: TLB Misses versus TSB Misses 428 * 429 * Distinguishing TLB misses from TSB misses requires further interposition 430 * on the TLB miss handler: we cannot know a priori or a posteriori if a 431 * given VA will or has hit in the TSB. 432 * 433 * We achieve this distinction by adding a second TLB return entry almost 434 * identical to the first -- differing only in the address to which it 435 * stores its results. We then modify the TLB miss handlers of the kernel 436 * such that they check the %tpc when they determine that a TLB miss has 437 * subsequently missed in the TSB: if the %tpc lies within trapstat's VA 438 * range and TL is 2 (that is, if trapstat is running), the TLB miss handler 439 * _increments_ the %tpc by the size of the TLB return entry. The ensuing 440 * "retry" will thus transfer control to the second TLB return entry, and 441 * the time spent in the handler will be accumulated in a memory location 442 * specific to TSB misses. 443 * 444 * N.B.: To minimize the amount of knowledge the kernel must have of trapstat, 445 * we do not allow the kernel to hard-code the size of the TLB return entry. 446 * Rather, the actual tsbmiss handler executes a known instruction at the 447 * corresponding tsbmiss patch points (see the tstat_tsbmiss_patch_table) with 448 * the %tpc in %g7: when trapstat is not running, these points contain the 449 * harmless TSTAT_TSBMISS_INSTR instruction ("add %g7, 0, %g7"). Before 450 * running, trapstat modifies the instructions at these patch points such 451 * that the simm13 equals the size of the TLB return entry. 452 * 453 * TLB Statistics: Kernel-level Misses versus User-level Misses 454 * 455 * Differentiating user-level misses from kernel-level misses employs a 456 * similar technique, but is simplified by the ability to distinguish a 457 * user-level miss from a kernel-level miss a priori by reading the context 458 * register: we implement kernel-/user-level differentiation by again doubling 459 * the number of TLB return entries, and setting the %tpc to the appropriate 460 * TLB return entry in trapstat's TLB miss handler. Together with the doubling 461 * of entries required for TLB-miss/TSB-miss differentiation, this yields a 462 * total of four TLB return entries: 463 * 464 * Level TSB hit? Structure member 465 * ------------------------------------------------------------ 466 * Kernel Yes tstat_tlbret_t.ttlbr_ktlb 467 * Kernel No tstat_tlbret_t.ttlbr_ktsb 468 * User Yes tstat_tlbret_t.ttlbr_utlb 469 * User No tstat_tlbret_t.ttlbr_utsb 470 * 471 * TLB Statistics: Misses per Pagesize 472 * 473 * As with the TLB-/TSB-miss differentiation, we have no way of determining 474 * pagesize a priori. This is therefore implemented by mandating a new rule: 475 * whenever the kernel fills the TLB in its TLB miss handler, the TTE 476 * corresponding to the TLB-missing VA must be in %g5 when the handler 477 * executes its "retry". This allows the TLB return entry to determine 478 * pagesize by simply looking at the pagesize field in the TTE stored in 479 * %g5. 480 * 481 * TLB Statistics: Probe Effect 482 * 483 * As one might imagine, gathering TLB statistics by pushing a trap level 484 * induces significant probe effect. To account for this probe effect, 485 * trapstat attempts to observe it by executing a code sequence with a known 486 * number of TLB misses both before and after interposing on the trap table. 487 * This allows trapstat to determine a per-trap probe effect which can then be 488 * factored into the "%tim" fields of the trapstat command. 489 * 490 * Note that on sun4v platforms, TLB misses are normally handled by the 491 * hypervisor or the hardware TSB walker. Thus no fast MMU miss information 492 * is reported for normal operation. However, when trapstat is invoked 493 * with -t or -T option to collect detailed TLB statistics, kernel takes 494 * over TLB miss handling. This results in significantly more overhead 495 * and TLB statistics may not be as accurate as on sun4u platforms. 496 * On some processors, hypervisor or hardware may provide a low overhead 497 * interface to collect TSB hit statistics. This support is exposed via 498 * a well defined CPU module interface (cpu_trapstat_conf to enable this 499 * interface and cpu_trapstat_data to get detailed TSB hit statistics). 500 * In this scenario, TSB miss statistics is collected by intercepting the 501 * IMMU_miss and DMMU_miss traps using above mentioned trap interposition 502 * approach. 503 * 504 * Locking 505 * 506 * The implementation uses two locks: tstat_lock (a local lock) and the global 507 * cpu_lock. tstat_lock is used to assure trapstat's consistency in the 508 * presence of multithreaded /dev/trapstat consumers (while as of this writing 509 * the only consumer of /dev/trapstat is single threaded, it is obviously 510 * necessary to correctly support multithreaded access). cpu_lock is held 511 * whenever CPUs are being manipulated directly, to prevent them from 512 * disappearing in the process. Because trapstat's DR callback 513 * (trapstat_cpu_setup()) must grab tstat_lock and is called with cpu_lock 514 * held, the lock ordering is necessarily cpu_lock before tstat_lock. 515 * 516 */ 517 /* END CSTYLED */ 518 519 static dev_info_t *tstat_devi; /* saved in xxattach() for xxinfo() */ 520 static int tstat_open; /* set if driver is open */ 521 static kmutex_t tstat_lock; /* serialize access */ 522 static vmem_t *tstat_arena; /* arena for TLB-locked pages */ 523 static tstat_percpu_t *tstat_percpu; /* per-CPU data */ 524 static int tstat_running; /* set if trapstat is running */ 525 static tstat_data_t *tstat_buffer; /* staging buffer for outgoing data */ 526 static int tstat_options; /* bit-wise indication of options */ 527 static int *tstat_enabled; /* map of enabled trap entries */ 528 static int tstat_tsbmiss_patched; /* tsbmiss patch flag */ 529 static callb_id_t tstat_cprcb; /* CPR callback */ 530 static char *tstat_probe_area; /* VA range used for probe effect */ 531 static caddr_t tstat_probe_phys; /* physical to back above VA */ 532 static hrtime_t tstat_probe_time; /* time spent on probe effect */ 533 static hrtime_t tstat_probe_before[TSTAT_PROBE_NLAPS]; 534 static hrtime_t tstat_probe_after[TSTAT_PROBE_NLAPS]; 535 static uint_t tstat_pgszs; /* # of kernel page sizes */ 536 static uint_t tstat_user_pgszs; /* # of user page sizes */ 537 538 /* 539 * sizeof tstat_data_t + pgsz data for the kernel. For simplicity's sake, when 540 * we collect data, we do it based upon szc, but when we report data back to 541 * userland, we have to do it based upon the userszc which may not match. 542 * So, these two variables are for internal use and exported use respectively. 543 */ 544 static size_t tstat_data_t_size; 545 static size_t tstat_data_t_exported_size; 546 547 #ifndef sun4v 548 549 static size_t tstat_data_pages; /* number of pages of tstat data */ 550 static size_t tstat_data_size; /* tstat data size in bytes */ 551 static size_t tstat_total_pages; /* #data pages + #instr pages */ 552 static size_t tstat_total_size; /* tstat data size + instr size */ 553 554 #else /* sun4v */ 555 556 static caddr_t tstat_va[TSTAT_NUM4M_LIMIT]; /* VAs of 4MB pages */ 557 static pfn_t tstat_pfn[TSTAT_NUM4M_LIMIT]; /* PFNs of 4MB pages */ 558 static boolean_t tstat_fast_tlbstat = B_FALSE; 559 static int tstat_traptab_initialized; 560 static int tstat_perm_mapping_failed; 561 static int tstat_hv_nopanic; 562 static int tstat_num4m_mapping; 563 564 #endif /* sun4v */ 565 566 /* 567 * In the above block comment, see "TLB Statistics: TLB Misses versus 568 * TSB Misses" for an explanation of the tsbmiss patch points. 569 */ 570 extern uint32_t tsbmiss_trapstat_patch_point; 571 extern uint32_t tsbmiss_trapstat_patch_point_kpm; 572 extern uint32_t tsbmiss_trapstat_patch_point_kpm_small; 573 574 /* 575 * Trapstat tsbmiss patch table 576 */ 577 tstat_tsbmiss_patch_entry_t tstat_tsbmiss_patch_table[] = { 578 {(uint32_t *)&tsbmiss_trapstat_patch_point, 0}, 579 {(uint32_t *)&tsbmiss_trapstat_patch_point_kpm, 0}, 580 {(uint32_t *)&tsbmiss_trapstat_patch_point_kpm_small, 0}, 581 {(uint32_t *)NULL, 0} 582 }; 583 584 /* 585 * We define some general SPARC-specific constants to allow more readable 586 * relocations. 587 */ 588 #define NOP 0x01000000 589 #define HI22(v) ((uint32_t)(v) >> 10) 590 #define LO10(v) ((uint32_t)(v) & 0x3ff) 591 #define LO12(v) ((uint32_t)(v) & 0xfff) 592 #define DISP22(from, to) \ 593 ((((uintptr_t)(to) - (uintptr_t)(from)) >> 2) & 0x3fffff) 594 #define ASI(asi) ((asi) << 5) 595 596 /* 597 * The interposing trap table must be locked in the I-TLB, and any data 598 * referred to in the interposing trap handler must be locked in the D-TLB. 599 * This function locks these pages in the appropriate TLBs by creating TTEs 600 * from whole cloth, and manually loading them into the TLB. This function is 601 * called from cross call context. 602 * 603 * On sun4v platforms, we use 4M page size mappings to minimize the number 604 * of locked down entries (i.e. permanent mappings). Each CPU uses a 605 * reserved portion of that 4M page for its TBA and data. 606 */ 607 static void 608 trapstat_load_tlb(void) 609 { 610 int i; 611 #ifdef sun4v 612 uint64_t ret; 613 #endif 614 tte_t tte; 615 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id]; 616 caddr_t va = tcpu->tcpu_vabase; 617 618 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED); 619 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED)); 620 621 #ifndef sun4v 622 for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) { 623 tte.tte_inthi = TTE_VALID_INT | TTE_SZ_INT(TTE8K) | 624 TTE_PFN_INTHI(tcpu->tcpu_pfn[i]); 625 if (i < TSTAT_INSTR_PAGES) { 626 tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) | 627 TTE_LCK_INT | TTE_CP_INT | TTE_PRIV_INT; 628 sfmmu_itlb_ld_kva(va, &tte); 629 } else { 630 tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) | 631 TTE_LCK_INT | TTE_CP_INT | TTE_CV_INT | 632 TTE_PRIV_INT | TTE_HWWR_INT; 633 sfmmu_dtlb_ld_kva(va, &tte); 634 } 635 } 636 #else /* sun4v */ 637 for (i = 0; i < tstat_num4m_mapping; i++) { 638 tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(tstat_pfn[i]); 639 tte.tte_intlo = TTE_PFN_INTLO(tstat_pfn[i]) | TTE_CP_INT | 640 TTE_CV_INT | TTE_PRIV_INT | TTE_HWWR_INT | 641 TTE_SZ_INTLO(TTE4M); 642 ret = hv_mmu_map_perm_addr(va, KCONTEXT, *(uint64_t *)&tte, 643 MAP_ITLB | MAP_DTLB); 644 645 if (ret != H_EOK) { 646 if (tstat_hv_nopanic) { 647 int j; 648 /* 649 * The first attempt to create perm mapping 650 * failed. The guest might have exhausted its 651 * perm mapping limit. We don't panic on first 652 * try. 653 */ 654 tstat_perm_mapping_failed = 1; 655 va = tcpu->tcpu_vabase; 656 for (j = 0; j < i; j++) { 657 (void) hv_mmu_unmap_perm_addr(va, 658 KCONTEXT, MAP_ITLB | MAP_DTLB); 659 va += MMU_PAGESIZE4M; 660 } 661 break; 662 } 663 /* 664 * We failed on subsequent cpus trying to 665 * create the same perm mappings. This 666 * should not happen. Panic here. 667 */ 668 cmn_err(CE_PANIC, "trapstat: cannot create " 669 "perm mappings for cpu %d " 670 "(error: 0x%lx)", CPU->cpu_id, ret); 671 } 672 va += MMU_PAGESIZE4M; 673 } 674 #endif /* sun4v */ 675 } 676 677 /* 678 * As mentioned in the "TLB Statistics: TLB Misses versus TSB Misses" section 679 * of the block comment, TLB misses are differentiated from TSB misses in 680 * part by hot-patching the instructions at the tsbmiss patch points (see 681 * tstat_tsbmiss_patch_table). This routine is used both to initially patch 682 * the instructions, and to patch them back to their original values upon 683 * restoring the original trap table. 684 */ 685 static void 686 trapstat_hotpatch() 687 { 688 uint32_t instr; 689 uint32_t simm13; 690 tstat_tsbmiss_patch_entry_t *ep; 691 692 ASSERT(MUTEX_HELD(&tstat_lock)); 693 694 if (!(tstat_options & TSTAT_OPT_TLBDATA)) 695 return; 696 697 if (!tstat_tsbmiss_patched) { 698 /* 699 * We haven't patched the TSB paths; do so now. 700 */ 701 /*CONSTCOND*/ 702 ASSERT(offsetof(tstat_tlbret_t, ttlbr_ktsb) - 703 offsetof(tstat_tlbret_t, ttlbr_ktlb) == 704 offsetof(tstat_tlbret_t, ttlbr_utsb) - 705 offsetof(tstat_tlbret_t, ttlbr_utlb)); 706 707 simm13 = offsetof(tstat_tlbret_t, ttlbr_ktsb) - 708 offsetof(tstat_tlbret_t, ttlbr_ktlb); 709 710 for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) { 711 ASSERT(ep->tpe_instr == 0); 712 instr = ep->tpe_instr = *ep->tpe_addr; 713 714 /* 715 * Assert that the instruction we're about to patch is 716 * "add %g7, 0, %g7" (0x8e01e000). 717 */ 718 ASSERT(instr == TSTAT_TSBMISS_INSTR); 719 720 instr |= simm13; 721 hot_patch_kernel_text((caddr_t)ep->tpe_addr, 722 instr, sizeof (instr)); 723 } 724 725 tstat_tsbmiss_patched = 1; 726 727 } else { 728 /* 729 * Remove patches from the TSB paths. 730 */ 731 for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) { 732 ASSERT(ep->tpe_instr == TSTAT_TSBMISS_INSTR); 733 hot_patch_kernel_text((caddr_t)ep->tpe_addr, 734 ep->tpe_instr, sizeof (instr)); 735 ep->tpe_instr = 0; 736 } 737 738 tstat_tsbmiss_patched = 0; 739 } 740 } 741 742 /* 743 * This is the routine executed to clock the performance of the trap table, 744 * executed both before and after interposing on the trap table to attempt to 745 * determine probe effect. The probe effect is used to adjust the "%tim" 746 * fields of trapstat's -t and -T output; we only use TLB misses to clock the 747 * trap table. We execute the inner loop (which is designed to exceed the 748 * TLB's reach) nlaps times, taking the best time as our time (thereby 749 * factoring out the effects of interrupts, cache misses or other perturbing 750 * events. 751 */ 752 static hrtime_t 753 trapstat_probe_laps(int nlaps, hrtime_t *buf) 754 { 755 int i, j = 0; 756 hrtime_t ts, best = INT64_MAX; 757 758 while (nlaps--) { 759 ts = rdtick(); 760 761 for (i = 0; i < TSTAT_PROBE_SIZE; i += MMU_PAGESIZE) 762 *((volatile char *)&tstat_probe_area[i]); 763 764 if ((ts = rdtick() - ts) < best) 765 best = ts; 766 buf[j++] = ts; 767 } 768 769 return (best); 770 } 771 772 /* 773 * This routine determines the probe effect by calling trapstat_probe_laps() 774 * both without and with the interposing trap table. Note that this is 775 * called from a cross call on the desired CPU, and that it is called on 776 * every CPU (this is necessary because the probe effect may differ from 777 * one CPU to another). 778 */ 779 static void 780 trapstat_probe() 781 { 782 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id]; 783 hrtime_t before, after; 784 785 if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED)) 786 return; 787 788 if (tstat_probe_area == NULL || (tstat_options & TSTAT_OPT_NOGO)) 789 return; 790 791 /* 792 * We very much expect the %tba to be KERNELBASE; this is a 793 * precautionary measure to assure that trapstat doesn't melt the 794 * machine should the %tba point unexpectedly elsewhere. 795 */ 796 if (get_tba() != (caddr_t)KERNELBASE) 797 return; 798 799 /* 800 * Preserve this CPU's data before destroying it by enabling the 801 * interposing trap table. We can safely use tstat_buffer because 802 * the caller of the trapstat_probe() cross call is holding tstat_lock. 803 */ 804 #ifdef sun4v 805 bcopy(tcpu->tcpu_data, tstat_buffer, TSTAT_DATA_SIZE); 806 #else 807 bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size); 808 #endif 809 810 tstat_probe_time = gethrtime(); 811 812 before = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_before); 813 (void) set_tba(tcpu->tcpu_ibase); 814 815 after = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_after); 816 (void) set_tba((caddr_t)KERNELBASE); 817 818 tstat_probe_time = gethrtime() - tstat_probe_time; 819 820 #ifdef sun4v 821 bcopy(tstat_buffer, tcpu->tcpu_data, TSTAT_DATA_SIZE); 822 tcpu->tcpu_tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES; 823 #else 824 bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size); 825 tcpu->tcpu_data->tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES; 826 #endif 827 } 828 829 static void 830 trapstat_probe_alloc() 831 { 832 pfn_t pfn; 833 caddr_t va; 834 int i; 835 836 ASSERT(MUTEX_HELD(&tstat_lock)); 837 ASSERT(tstat_probe_area == NULL); 838 ASSERT(tstat_probe_phys == NULL); 839 840 if (!(tstat_options & TSTAT_OPT_TLBDATA)) 841 return; 842 843 /* 844 * Grab some virtual from the heap arena. 845 */ 846 tstat_probe_area = vmem_alloc(heap_arena, TSTAT_PROBE_SIZE, VM_SLEEP); 847 va = tstat_probe_area; 848 849 /* 850 * Grab a single physical page. 851 */ 852 tstat_probe_phys = vmem_alloc(tstat_arena, MMU_PAGESIZE, VM_SLEEP); 853 pfn = hat_getpfnum(kas.a_hat, tstat_probe_phys); 854 855 /* 856 * Now set the translation for every page in our virtual range 857 * to be our allocated physical page. 858 */ 859 for (i = 0; i < TSTAT_PROBE_NPAGES; i++) { 860 hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn, PROT_READ, 861 HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK); 862 va += MMU_PAGESIZE; 863 } 864 } 865 866 static void 867 trapstat_probe_free() 868 { 869 caddr_t va; 870 int i; 871 872 ASSERT(MUTEX_HELD(&tstat_lock)); 873 874 if ((va = tstat_probe_area) == NULL) 875 return; 876 877 for (i = 0; i < TSTAT_PROBE_NPAGES; i++) { 878 hat_unload(kas.a_hat, va, MMU_PAGESIZE, HAT_UNLOAD_UNLOCK); 879 va += MMU_PAGESIZE; 880 } 881 882 vmem_free(tstat_arena, tstat_probe_phys, MMU_PAGESIZE); 883 vmem_free(heap_arena, tstat_probe_area, TSTAT_PROBE_SIZE); 884 885 tstat_probe_phys = NULL; 886 tstat_probe_area = NULL; 887 } 888 889 /* 890 * This routine actually enables a CPU by setting its %tba to be the 891 * CPU's interposing trap table. It is called out of cross call context. 892 */ 893 static void 894 trapstat_enable() 895 { 896 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id]; 897 898 if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED)) 899 return; 900 901 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED); 902 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED)); 903 904 if (get_tba() != (caddr_t)KERNELBASE) 905 return; 906 907 if (!(tstat_options & TSTAT_OPT_NOGO)) 908 (void) set_tba(tcpu->tcpu_ibase); 909 tcpu->tcpu_flags |= TSTAT_CPU_ENABLED; 910 #ifdef sun4v 911 if ((tstat_options & TSTAT_OPT_TLBDATA) && 912 !(tstat_options & TSTAT_OPT_NOGO)) { 913 if (tstat_fast_tlbstat) { 914 /* 915 * Invoke processor specific interface to enable 916 * collection of TSB hit statistics. 917 */ 918 (void) cpu_trapstat_conf(CPU_TSTATCONF_ENABLE); 919 } else { 920 /* 921 * Collect TLB miss statistics by taking over 922 * TLB miss handling from the hypervisor. This 923 * is done by telling the hypervisor that there 924 * is no TSB configured. Also set TSTAT_TLB_STATS 925 * flag so that no user TSB is configured during 926 * context switch time. 927 */ 928 cpu_t *cp = CPU; 929 930 cp->cpu_m.cpu_tstat_flags |= TSTAT_TLB_STATS; 931 (void) hv_set_ctx0(NULL, NULL); 932 (void) hv_set_ctxnon0(NULL, NULL); 933 } 934 } 935 #endif 936 } 937 938 /* 939 * This routine disables a CPU (vis a vis trapstat) by setting its %tba to be 940 * the actual, underlying trap table. It is called out of cross call context. 941 */ 942 static void 943 trapstat_disable() 944 { 945 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id]; 946 947 if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED)) 948 return; 949 950 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED); 951 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED); 952 953 if (!(tstat_options & TSTAT_OPT_NOGO)) 954 (void) set_tba((caddr_t)KERNELBASE); 955 956 tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED; 957 958 #ifdef sun4v 959 if ((tstat_options & TSTAT_OPT_TLBDATA) && 960 !(tstat_options & TSTAT_OPT_NOGO)) { 961 if (tstat_fast_tlbstat) { 962 /* 963 * Invoke processor specific interface to disable 964 * collection of TSB hit statistics on each processor. 965 */ 966 (void) cpu_trapstat_conf(CPU_TSTATCONF_DISABLE); 967 } else { 968 /* 969 * As part of collecting TLB miss statistics, we took 970 * over TLB miss handling from the hypervisor by 971 * telling the hypervisor that NO TSB is configured. 972 * We need to restore that by communicating proper 973 * kernel/user TSB information so that TLB misses 974 * can be handled by the hypervisor or the hardware 975 * more efficiently. 976 * 977 * We restore kernel TSB information right away. 978 * However, to minimize any locking dependency, we 979 * don't restore user TSB information right away. 980 * Instead, we simply clear the TSTAT_TLB_STATS flag 981 * so that the user TSB information is automatically 982 * restored on next context switch. 983 * 984 * Note that the call to restore kernel TSB information 985 * will normally not fail, unless wrong information is 986 * passed here. In that scenario, system will still 987 * continue to function properly with the exception of 988 * kernel handling all the TLB misses. 989 */ 990 struct hv_tsb_block *hvbp = &ksfmmup->sfmmu_hvblock; 991 cpu_t *cp = CPU; 992 993 cp->cpu_m.cpu_tstat_flags &= ~TSTAT_TLB_STATS; 994 (void) hv_set_ctx0(hvbp->hv_tsb_info_cnt, 995 hvbp->hv_tsb_info_pa); 996 } 997 } 998 #endif 999 } 1000 1001 /* 1002 * We use %tick as the time base when recording the time spent executing 1003 * the trap handler. %tick, however, is not necessarily kept in sync 1004 * across CPUs (indeed, different CPUs may have different %tick frequencies). 1005 * We therefore cross call onto a CPU to get a snapshot of its data to 1006 * copy out; this is the routine executed out of that cross call. 1007 */ 1008 static void 1009 trapstat_snapshot() 1010 { 1011 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id]; 1012 tstat_data_t *data = tcpu->tcpu_data; 1013 1014 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED); 1015 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED); 1016 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ENABLED); 1017 1018 #ifndef sun4v 1019 data->tdata_snapts = gethrtime(); 1020 data->tdata_snaptick = rdtick(); 1021 bcopy(data, tstat_buffer, tstat_data_t_size); 1022 #else 1023 /* 1024 * For sun4v, in order to conserve space in the limited 1025 * per-cpu 4K buffer, we derive certain info somewhere else and 1026 * copy them directly into the tstat_buffer output. 1027 * Note that we either are collecting tlb stats or 1028 * regular trapstats but never both. 1029 */ 1030 tstat_buffer->tdata_cpuid = CPU->cpu_id; 1031 tstat_buffer->tdata_peffect = tcpu->tcpu_tdata_peffect; 1032 tstat_buffer->tdata_snapts = gethrtime(); 1033 tstat_buffer->tdata_snaptick = rdtick(); 1034 1035 if (tstat_options & TSTAT_OPT_TLBDATA) { 1036 /* Copy tlb/tsb stats collected in the per-cpu trapdata */ 1037 tstat_tdata_t *tdata = (tstat_tdata_t *)data; 1038 bcopy(&tdata->tdata_pgsz[0], 1039 &tstat_buffer->tdata_pgsz[0], 1040 tstat_pgszs * sizeof (tstat_pgszdata_t)); 1041 1042 /* 1043 * Invoke processor specific interface to collect TLB stats 1044 * on each processor if enabled. 1045 */ 1046 if (tstat_fast_tlbstat) { 1047 cpu_trapstat_data((void *) tstat_buffer->tdata_pgsz, 1048 tstat_pgszs); 1049 } 1050 } else { 1051 /* 1052 * Normal trapstat collection. 1053 * Copy all the 4K data area into tstat_buffer tdata_trap 1054 * area. 1055 */ 1056 bcopy(data, &tstat_buffer->tdata_traps[0], TSTAT_DATA_SIZE); 1057 } 1058 #endif /* sun4v */ 1059 } 1060 1061 /* 1062 * The TSTAT_RETENT_* constants define offsets in the TLB return entry. 1063 * They are used only in trapstat_tlbretent() (below) and #undef'd 1064 * immediately afterwards. Any change to "retent" in trapstat_tlbretent() 1065 * will likely require changes to these constants. 1066 */ 1067 1068 #ifndef sun4v 1069 #define TSTAT_RETENT_STATHI 1 1070 #define TSTAT_RETENT_STATLO 2 1071 #define TSTAT_RETENT_SHIFT 11 1072 #define TSTAT_RETENT_COUNT_LD 13 1073 #define TSTAT_RETENT_COUNT_ST 15 1074 #define TSTAT_RETENT_TMPTSHI 16 1075 #define TSTAT_RETENT_TMPTSLO 17 1076 #define TSTAT_RETENT_TIME_LD 19 1077 #define TSTAT_RETENT_TIME_ST 21 1078 #else /* sun4v */ 1079 #define TSTAT_RETENT_TDATASHFT 2 1080 #define TSTAT_RETENT_STATHI 4 1081 #define TSTAT_RETENT_STATLO 6 1082 #define TSTAT_RETENT_SHIFT 9 1083 #define TSTAT_RETENT_COUNT_LD 11 1084 #define TSTAT_RETENT_COUNT_ST 13 1085 #define TSTAT_RETENT_TMPTSHI 14 1086 #define TSTAT_RETENT_TMPTSLO 16 1087 #define TSTAT_RETENT_TIME_LD 18 1088 #define TSTAT_RETENT_TIME_ST 20 1089 #endif /* sun4v */ 1090 1091 static void 1092 trapstat_tlbretent(tstat_percpu_t *tcpu, tstat_tlbretent_t *ret, 1093 tstat_missdata_t *data) 1094 { 1095 uint32_t *ent = ret->ttlbrent_instr, shift; 1096 uintptr_t base; 1097 #ifndef sun4v 1098 uintptr_t tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick); 1099 #else 1100 uintptr_t tmptick = TSTAT_CPU0_TLBDATA_OFFS(tcpu, tdata_tmptick); 1101 #endif 1102 1103 /* 1104 * This is the entry executed upon return from the TLB/TSB miss 1105 * handler (i.e. the code interpositioned between the "retry" and 1106 * the actual return to the TLB-missing instruction). Detail on its 1107 * theory of operation can be found in the "TLB Statistics" section 1108 * of the block comment. Note that we expect the TTE just loaded 1109 * into the TLB to be in %g5; all other globals are available as 1110 * scratch. Finally, note that the page size information in sun4v is 1111 * located in the lower bits of the TTE -- requiring us to have a 1112 * different return entry on sun4v. 1113 */ 1114 static const uint32_t retent[TSTAT_TLBRET_NINSTR] = { 1115 #ifndef sun4v 1116 0x87410000, /* rd %tick, %g3 */ 1117 0x03000000, /* sethi %hi(stat), %g1 */ 1118 0x82106000, /* or %g1, %lo(stat), %g1 */ 1119 0x89297001, /* sllx %g5, 1, %g4 */ 1120 0x8931303e, /* srlx %g4, 62, %g4 */ 1121 0x8531702e, /* srlx %g5, 46, %g2 */ 1122 0x8408a004, /* and %g2, 4, %g2 */ 1123 0x88110002, /* or %g4, %g2, %g4 */ 1124 0x80a12005, /* cmp %g4, 5 */ 1125 0x34400002, /* bg,a,pn %icc, +8 */ 1126 0x88102004, /* mov 4, %g4 */ 1127 0x89292000, /* sll %g4, shift, %g4 */ 1128 0x82004004, /* add %g1, %g4, %g1 */ 1129 0xc4586000, /* ldx [%g1 + tmiss_count], %g2 */ 1130 0x8400a001, /* add %g2, 1, %g2 */ 1131 0xc4706000, /* stx %g2, [%g1 + tmiss_count] */ 1132 0x0d000000, /* sethi %hi(tdata_tmptick), %g6 */ 1133 0xc459a000, /* ldx [%g6 + %lo(tdata_tmptick)], %g2 */ 1134 0x8620c002, /* sub %g3, %g2, %g3 */ 1135 0xc4586000, /* ldx [%g1 + tmiss_time], %g2 */ 1136 0x84008003, /* add %g2, %g3, %g2 */ 1137 0xc4706000, /* stx %g2, [%g1 + tmiss_time] */ 1138 0x83f00000 /* retry */ 1139 #else /* sun4v */ 1140 0x82102008, /* mov SCRATCHPAD_CPUID, %g1 */ 1141 0xced84400, /* ldxa [%g1]ASI_SCRATCHPAD, %g7 */ 1142 0x8f29f000, /* sllx %g7, TSTAT_DATA_SHIFT, %g7 */ 1143 0x87410000, /* rd %tick, %g3 */ 1144 0x03000000, /* sethi %hi(stat), %g1 */ 1145 0x82004007, /* add %g1, %g7, %g1 */ 1146 0x82106000, /* or %g1, %lo(stat), %g1 */ 1147 0x8929703d, /* sllx %g5, 61, %g4 */ 1148 0x8931303d, /* srlx %g4, 61, %g4 */ 1149 0x89292000, /* sll %g4, shift, %g4 */ 1150 0x82004004, /* add %g1, %g4, %g1 */ 1151 0xc4586000, /* ldx [%g1 + tmiss_count], %g2 */ 1152 0x8400a001, /* add %g2, 1, %g2 */ 1153 0xc4706000, /* stx %g2, [%g1 + tmiss_count] */ 1154 0x0d000000, /* sethi %hi(tdata_tmptick), %g6 */ 1155 0x8c018007, /* add %g6, %g7, %g6 */ 1156 0xc459a000, /* ldx [%g6 + %lo(tdata_tmptick)], %g2 */ 1157 0x8620c002, /* sub %g3, %g2, %g3 */ 1158 0xc4586000, /* ldx [%g1 + tmiss_time], %g2 */ 1159 0x84008003, /* add %g2, %g3, %g2 */ 1160 0xc4706000, /* stx %g2, [%g1 + tmiss_time] */ 1161 0x83f00000 /* retry */ 1162 #endif /* sun4v */ 1163 }; 1164 1165 ASSERT(MUTEX_HELD(&tstat_lock)); 1166 /*CONSTCOND*/ 1167 ASSERT(offsetof(tstat_missdata_t, tmiss_count) <= LO10(-1)); 1168 /*CONSTCOND*/ 1169 ASSERT(offsetof(tstat_missdata_t, tmiss_time) <= LO10(-1)); 1170 /*CONSTCOND*/ 1171 ASSERT(!((sizeof (tstat_pgszdata_t) - 1) & sizeof (tstat_pgszdata_t))); 1172 1173 for (shift = 1; (1 << shift) != sizeof (tstat_pgszdata_t); shift++) 1174 continue; 1175 1176 base = (uintptr_t)tcpu->tcpu_ibase + TSTAT_INSTR_SIZE + 1177 ((uintptr_t)data - (uintptr_t)tcpu->tcpu_data); 1178 1179 bcopy(retent, ent, sizeof (retent)); 1180 1181 #if defined(sun4v) 1182 ent[TSTAT_RETENT_TDATASHFT] |= LO10((uintptr_t)TSTAT_DATA_SHIFT); 1183 #endif 1184 ent[TSTAT_RETENT_STATHI] |= HI22(base); 1185 ent[TSTAT_RETENT_STATLO] |= LO10(base); 1186 ent[TSTAT_RETENT_SHIFT] |= shift; 1187 /* LINTED E_EXPR_NULL_EFFECT */ 1188 ent[TSTAT_RETENT_COUNT_LD] |= offsetof(tstat_missdata_t, tmiss_count); 1189 /* LINTED E_EXPR_NULL_EFFECT */ 1190 ent[TSTAT_RETENT_COUNT_ST] |= offsetof(tstat_missdata_t, tmiss_count); 1191 ent[TSTAT_RETENT_TMPTSHI] |= HI22(tmptick); 1192 ent[TSTAT_RETENT_TMPTSLO] |= LO10(tmptick); 1193 ent[TSTAT_RETENT_TIME_LD] |= offsetof(tstat_missdata_t, tmiss_time); 1194 ent[TSTAT_RETENT_TIME_ST] |= offsetof(tstat_missdata_t, tmiss_time); 1195 } 1196 1197 #if defined(sun4v) 1198 #undef TSTAT_RETENT_TDATASHFT 1199 #endif 1200 #undef TSTAT_RETENT_STATHI 1201 #undef TSTAT_RETENT_STATLO 1202 #undef TSTAT_RETENT_SHIFT 1203 #undef TSTAT_RETENT_COUNT_LD 1204 #undef TSTAT_RETENT_COUNT_ST 1205 #undef TSTAT_RETENT_TMPTSHI 1206 #undef TSTAT_RETENT_TMPTSLO 1207 #undef TSTAT_RETENT_TIME_LD 1208 #undef TSTAT_RETENT_TIME_ST 1209 1210 /* 1211 * The TSTAT_TLBENT_* constants define offsets in the TLB entry. They are 1212 * used only in trapstat_tlbent() (below) and #undef'd immediately afterwards. 1213 * Any change to "tlbent" in trapstat_tlbent() will likely require changes 1214 * to these constants. 1215 */ 1216 1217 #ifndef sun4v 1218 #define TSTAT_TLBENT_STATHI 0 1219 #define TSTAT_TLBENT_STATLO_LD 1 1220 #define TSTAT_TLBENT_STATLO_ST 3 1221 #define TSTAT_TLBENT_MMUASI 15 1222 #define TSTAT_TLBENT_TPCHI 18 1223 #define TSTAT_TLBENT_TPCLO_USER 19 1224 #define TSTAT_TLBENT_TPCLO_KERN 21 1225 #define TSTAT_TLBENT_TSHI 25 1226 #define TSTAT_TLBENT_TSLO 27 1227 #define TSTAT_TLBENT_BA 28 1228 #else /* sun4v */ 1229 #define TSTAT_TLBENT_TDATASHFT 2 1230 #define TSTAT_TLBENT_STATHI 3 1231 #define TSTAT_TLBENT_STATLO_LD 5 1232 #define TSTAT_TLBENT_STATLO_ST 7 1233 #define TSTAT_TLBENT_TAGTARGET 23 1234 #define TSTAT_TLBENT_TPCHI 25 1235 #define TSTAT_TLBENT_TPCLO_USER 26 1236 #define TSTAT_TLBENT_TPCLO_KERN 28 1237 #define TSTAT_TLBENT_TSHI 32 1238 #define TSTAT_TLBENT_TSLO 35 1239 #define TSTAT_TLBENT_ADDRHI 36 1240 #define TSTAT_TLBENT_ADDRLO 37 1241 #endif /* sun4v */ 1242 1243 static void 1244 trapstat_tlbent(tstat_percpu_t *tcpu, int entno) 1245 { 1246 uint32_t *ent; 1247 uintptr_t orig, va; 1248 #ifndef sun4v 1249 uintptr_t baoffs; 1250 int itlb = entno == TSTAT_ENT_ITLBMISS; 1251 uint32_t asi = itlb ? ASI(ASI_IMMU) : ASI(ASI_DMMU); 1252 #else 1253 int itlb = (entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_ITLBMISS); 1254 uint32_t tagtarget_off = itlb ? MMFSA_I_CTX : MMFSA_D_CTX; 1255 uint32_t *tent; /* MMU trap vector entry */ 1256 uintptr_t tentva; /* MMU trap vector entry va */ 1257 static const uint32_t mmumiss[TSTAT_ENT_NINSTR] = { 1258 0x30800000, /* ba,a addr */ 1259 NOP, NOP, NOP, NOP, NOP, NOP, NOP 1260 }; 1261 #endif 1262 int entoffs = entno << TSTAT_ENT_SHIFT; 1263 uintptr_t tmptick, stat, tpc, utpc; 1264 tstat_pgszdata_t *data; 1265 tstat_tlbdata_t *udata, *kdata; 1266 tstat_tlbret_t *ret; 1267 1268 #ifdef sun4v 1269 data = &((tstat_tdata_t *)tcpu->tcpu_data)->tdata_pgsz[0]; 1270 #else 1271 data = &tcpu->tcpu_data->tdata_pgsz[0]; 1272 #endif /* sun4v */ 1273 1274 /* 1275 * When trapstat is run with TLB statistics, this is the entry for 1276 * both I- and D-TLB misses; this code performs trap level pushing, 1277 * as described in the "TLB Statistics" section of the block comment. 1278 * This code is executing at TL 1; %tstate[0] contains the saved 1279 * state at the time of the TLB miss. Pushing trap level 1 (and thus 1280 * raising TL to 2) requires us to fill in %tstate[1] with our %pstate, 1281 * %cwp and %asi. We leave %tt unchanged, and we set %tpc and %tnpc to 1282 * the appropriate TLB return entry (based on the context of the miss). 1283 * Finally, we sample %tick, and stash it in the tdata_tmptick member 1284 * the per-CPU tstat_data structure. tdata_tmptick will be used in 1285 * the TLB return entry to determine the amount of time spent in the 1286 * TLB miss handler. 1287 * 1288 * Note that on sun4v platforms, we must obtain the context information 1289 * from the MMU fault status area. (The base address of this MMU fault 1290 * status area is kept in the scratchpad register 0.) 1291 */ 1292 static const uint32_t tlbent[] = { 1293 #ifndef sun4v 1294 0x03000000, /* sethi %hi(stat), %g1 */ 1295 0xc4586000, /* ldx [%g1 + %lo(stat)], %g2 */ 1296 0x8400a001, /* add %g2, 1, %g2 */ 1297 0xc4706000, /* stx %g2, [%g1 + %lo(stat)] */ 1298 0x85524000, /* rdpr %cwp, %g2 */ 1299 0x87518000, /* rdpr %pstate, %g3 */ 1300 0x8728f008, /* sllx %g3, 8, %g3 */ 1301 0x84108003, /* or %g2, %g3, %g2 */ 1302 0x8740c000, /* rd %asi, %g3 */ 1303 0x8728f018, /* sllx %g3, 24, %g3 */ 1304 0x84108003, /* or %g2, %g3, %g2 */ 1305 0x8350c000, /* rdpr %tt, %g1 */ 1306 0x8f902002, /* wrpr %g0, 2, %tl */ 1307 0x85908000, /* wrpr %g2, %g0, %tstate */ 1308 0x87904000, /* wrpr %g1, %g0, %tt */ 1309 0xc2d80000, /* ldxa [%g0]ASI_MMU, %g1 */ 1310 0x83307030, /* srlx %g1, CTXSHIFT, %g1 */ 1311 0x02c04004, /* brz,pn %g1, .+0x10 */ 1312 0x03000000, /* sethi %hi(new_tpc), %g1 */ 1313 0x82106000, /* or %g1, %lo(new_tpc), %g1 */ 1314 0x30800002, /* ba,a .+0x8 */ 1315 0x82106000, /* or %g1, %lo(new_tpc), %g1 */ 1316 0x81904000, /* wrpr %g1, %g0, %tpc */ 1317 0x82006004, /* add %g1, 4, %g1 */ 1318 0x83904000, /* wrpr %g1, %g0, %tnpc */ 1319 0x03000000, /* sethi %hi(tmptick), %g1 */ 1320 0x85410000, /* rd %tick, %g2 */ 1321 0xc4706000, /* stx %g2, [%g1 + %lo(tmptick)] */ 1322 0x30800000, /* ba,a addr */ 1323 NOP, NOP, NOP 1324 #else /* sun4v */ 1325 0x82102008, /* mov SCRATCHPAD_CPUID, %g1 */ 1326 0xc8d84400, /* ldxa [%g1]ASI_SCRATCHPAD, %g4 */ 1327 0x89293000, /* sllx %g4, TSTAT_DATA_SHIFT, %g4 */ 1328 0x03000000, /* sethi %hi(stat), %g1 */ 1329 0x82004004, /* add %g1, %g4, %g1 */ 1330 0xc4586000, /* ldx [%g1 + %lo(stat)], %g2 */ 1331 0x8400a001, /* add %g2, 1, %g2 */ 1332 0xc4706000, /* stx %g2, [%g1 + %lo(stat)] */ 1333 0x85524000, /* rdpr %cwp, %g2 */ 1334 0x87518000, /* rdpr %pstate, %g3 */ 1335 0x8728f008, /* sllx %g3, 8, %g3 */ 1336 0x84108003, /* or %g2, %g3, %g2 */ 1337 0x8740c000, /* rd %asi, %g3 */ 1338 0x8728f018, /* sllx %g3, 24, %g3 */ 1339 0x83540000, /* rdpr %gl, %g1 */ 1340 0x83287028, /* sllx %g1, 40, %g1 */ 1341 0x86104003, /* or %g1, %g3, %g3 */ 1342 0x84108003, /* or %g2, %g3, %g2 */ 1343 0x8350c000, /* rdpr %tt, %g1 */ 1344 0x8f902002, /* wrpr %g0, 2, %tl */ 1345 0x85908000, /* wrpr %g2, %g0, %tstate */ 1346 0x87904000, /* wrpr %g1, %g0, %tt */ 1347 0xc2d80400, /* ldxa [%g0]ASI_SCRATCHPAD, %g1 */ 1348 0xc2586000, /* ldx [%g1 + MMFSA_?_CTX], %g1 */ 1349 0x02c04004, /* brz,pn %g1, .+0x10 */ 1350 0x03000000, /* sethi %hi(new_tpc), %g1 */ 1351 0x82106000, /* or %g1, %lo(new_tpc), %g1 */ 1352 0x30800002, /* ba,a .+0x8 */ 1353 0x82106000, /* or %g1, %lo(new_tpc), %g1 */ 1354 0x81904000, /* wrpr %g1, %g0, %tpc */ 1355 0x82006004, /* add %g1, 4, %g1 */ 1356 0x83904000, /* wrpr %g1, %g0, %tnpc */ 1357 0x03000000, /* sethi %hi(tmptick), %g1 */ 1358 0x82004004, /* add %g1, %g4, %g1 */ 1359 0x85410000, /* rd %tick, %g2 */ 1360 0xc4706000, /* stx %g2, [%g1 + %lo(tmptick)] */ 1361 0x05000000, /* sethi %hi(addr), %g2 */ 1362 0x8410a000, /* or %g2, %lo(addr), %g2 */ 1363 0x81c08000, /* jmp %g2 */ 1364 NOP, 1365 #endif /* sun4v */ 1366 }; 1367 1368 ASSERT(MUTEX_HELD(&tstat_lock)); 1369 #ifndef sun4v 1370 ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS); 1371 1372 stat = TSTAT_DATA_OFFS(tcpu, tdata_traps) + entoffs; 1373 tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick); 1374 #else /* sun4v */ 1375 ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS || 1376 entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_DMMUMISS); 1377 1378 stat = TSTAT_CPU0_TLBDATA_OFFS(tcpu, tdata_traps[entno]); 1379 tmptick = TSTAT_CPU0_TLBDATA_OFFS(tcpu, tdata_tmptick); 1380 #endif /* sun4v */ 1381 1382 if (itlb) { 1383 ret = &tcpu->tcpu_instr->tinst_itlbret; 1384 udata = &data->tpgsz_user.tmode_itlb; 1385 kdata = &data->tpgsz_kernel.tmode_itlb; 1386 tpc = TSTAT_INSTR_OFFS(tcpu, tinst_itlbret.ttlbr_ktlb); 1387 } else { 1388 ret = &tcpu->tcpu_instr->tinst_dtlbret; 1389 udata = &data->tpgsz_user.tmode_dtlb; 1390 kdata = &data->tpgsz_kernel.tmode_dtlb; 1391 tpc = TSTAT_INSTR_OFFS(tcpu, tinst_dtlbret.ttlbr_ktlb); 1392 } 1393 1394 utpc = tpc + offsetof(tstat_tlbret_t, ttlbr_utlb) - 1395 offsetof(tstat_tlbret_t, ttlbr_ktlb); 1396 1397 ASSERT(HI22(tpc) == HI22(utpc)); 1398 1399 ent = (uint32_t *)((uintptr_t)tcpu->tcpu_instr + entoffs); 1400 orig = KERNELBASE + entoffs; 1401 va = (uintptr_t)tcpu->tcpu_ibase + entoffs; 1402 1403 #ifdef sun4v 1404 /* 1405 * Because of lack of space, interposing tlbent trap handler 1406 * for TLB and MMU miss traps cannot be placed in-line. Instead, 1407 * we copy it to the space set aside for shared trap handlers 1408 * continuation in the interposing trap table and invoke it by 1409 * placing a branch in the trap table itself. 1410 */ 1411 tent = ent; /* trap vector entry */ 1412 tentva = va; /* trap vector entry va */ 1413 1414 if (itlb) { 1415 ent = (uint32_t *)((uintptr_t) 1416 &tcpu->tcpu_instr->tinst_immumiss); 1417 va = TSTAT_INSTR_OFFS(tcpu, tinst_immumiss); 1418 } else { 1419 ent = (uint32_t *)((uintptr_t) 1420 &tcpu->tcpu_instr->tinst_dmmumiss); 1421 va = TSTAT_INSTR_OFFS(tcpu, tinst_dmmumiss); 1422 } 1423 bcopy(mmumiss, tent, sizeof (mmumiss)); 1424 tent[0] |= DISP22(tentva, va); 1425 #endif /* sun4v */ 1426 1427 bcopy(tlbent, ent, sizeof (tlbent)); 1428 1429 #if defined(sun4v) 1430 ent[TSTAT_TLBENT_TDATASHFT] |= LO10((uintptr_t)TSTAT_DATA_SHIFT); 1431 #endif 1432 ent[TSTAT_TLBENT_STATHI] |= HI22(stat); 1433 ent[TSTAT_TLBENT_STATLO_LD] |= LO10(stat); 1434 ent[TSTAT_TLBENT_STATLO_ST] |= LO10(stat); 1435 #ifndef sun4v 1436 ent[TSTAT_TLBENT_MMUASI] |= asi; 1437 #else 1438 ent[TSTAT_TLBENT_TAGTARGET] |= tagtarget_off; 1439 #endif 1440 ent[TSTAT_TLBENT_TPCHI] |= HI22(tpc); 1441 ent[TSTAT_TLBENT_TPCLO_USER] |= LO10(utpc); 1442 ent[TSTAT_TLBENT_TPCLO_KERN] |= LO10(tpc); 1443 ent[TSTAT_TLBENT_TSHI] |= HI22(tmptick); 1444 ent[TSTAT_TLBENT_TSLO] |= LO10(tmptick); 1445 #ifndef sun4v 1446 baoffs = TSTAT_TLBENT_BA * sizeof (uint32_t); 1447 ent[TSTAT_TLBENT_BA] |= DISP22(va + baoffs, orig); 1448 #else 1449 ent[TSTAT_TLBENT_ADDRHI] |= HI22(orig); 1450 ent[TSTAT_TLBENT_ADDRLO] |= LO10(orig); 1451 #endif /* sun4v */ 1452 1453 /* 1454 * And now set up the TLB return entries. 1455 */ 1456 trapstat_tlbretent(tcpu, &ret->ttlbr_ktlb, &kdata->ttlb_tlb); 1457 trapstat_tlbretent(tcpu, &ret->ttlbr_ktsb, &kdata->ttlb_tsb); 1458 trapstat_tlbretent(tcpu, &ret->ttlbr_utlb, &udata->ttlb_tlb); 1459 trapstat_tlbretent(tcpu, &ret->ttlbr_utsb, &udata->ttlb_tsb); 1460 } 1461 1462 #if defined(sun4v) 1463 #undef TSTAT_TLBENT_TDATASHFT 1464 #endif 1465 #undef TSTAT_TLBENT_STATHI 1466 #undef TSTAT_TLBENT_STATLO_LD 1467 #undef TSTAT_TLBENT_STATLO_ST 1468 #ifndef sun4v 1469 #undef TSTAT_TLBENT_MMUASI 1470 #else 1471 #undef TSTAT_TLBENT_TAGTARGET 1472 #endif 1473 #undef TSTAT_TLBENT_TPCHI 1474 #undef TSTAT_TLBENT_TPCLO_USER 1475 #undef TSTAT_TLBENT_TPCLO_KERN 1476 #undef TSTAT_TLBENT_TSHI 1477 #undef TSTAT_TLBENT_TSLO 1478 #undef TSTAT_TLBENT_BA 1479 1480 /* 1481 * The TSTAT_ENABLED_* constants define offsets in the enabled entry; the 1482 * TSTAT_DISABLED_BA constant defines an offset in the disabled entry. Both 1483 * sets of constants are used only in trapstat_make_traptab() (below) and 1484 * #undef'd immediately afterwards. Any change to "enabled" or "disabled" 1485 * in trapstat_make_traptab() will likely require changes to these constants. 1486 */ 1487 #ifndef sun4v 1488 #define TSTAT_ENABLED_STATHI 0 1489 #define TSTAT_ENABLED_STATLO_LD 1 1490 #define TSTAT_ENABLED_STATLO_ST 3 1491 #define TSTAT_ENABLED_BA 4 1492 #define TSTAT_DISABLED_BA 0 1493 1494 static void 1495 trapstat_make_traptab(tstat_percpu_t *tcpu) 1496 { 1497 uint32_t *ent; 1498 uint64_t *stat; 1499 uintptr_t orig, va, en_baoffs, dis_baoffs; 1500 int nent; 1501 1502 /* 1503 * This is the entry in the interposing trap table for enabled trap 1504 * table entries. It loads a counter, increments it and stores it 1505 * back before branching to the actual trap table entry. 1506 */ 1507 static const uint32_t enabled[TSTAT_ENT_NINSTR] = { 1508 0x03000000, /* sethi %hi(stat), %g1 */ 1509 0xc4586000, /* ldx [%g1 + %lo(stat)], %g2 */ 1510 0x8400a001, /* add %g2, 1, %g2 */ 1511 0xc4706000, /* stx %g2, [%g1 + %lo(stat)] */ 1512 0x30800000, /* ba,a addr */ 1513 NOP, NOP, NOP 1514 }; 1515 1516 /* 1517 * This is the entry in the interposing trap table for disabled trap 1518 * table entries. It simply branches to the actual, underlying trap 1519 * table entry. As explained in the "Implementation Details" section 1520 * of the block comment, all TL>0 traps _must_ use the disabled entry; 1521 * additional entries may be explicitly disabled through the use 1522 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY. 1523 */ 1524 static const uint32_t disabled[TSTAT_ENT_NINSTR] = { 1525 0x30800000, /* ba,a addr */ 1526 NOP, NOP, NOP, NOP, NOP, NOP, NOP, 1527 }; 1528 1529 ASSERT(MUTEX_HELD(&tstat_lock)); 1530 1531 ent = tcpu->tcpu_instr->tinst_traptab; 1532 stat = (uint64_t *)TSTAT_DATA_OFFS(tcpu, tdata_traps); 1533 orig = KERNELBASE; 1534 va = (uintptr_t)tcpu->tcpu_ibase; 1535 en_baoffs = TSTAT_ENABLED_BA * sizeof (uint32_t); 1536 dis_baoffs = TSTAT_DISABLED_BA * sizeof (uint32_t); 1537 1538 for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) { 1539 if (tstat_enabled[nent]) { 1540 bcopy(enabled, ent, sizeof (enabled)); 1541 ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat); 1542 ent[TSTAT_ENABLED_STATLO_LD] |= LO10((uintptr_t)stat); 1543 ent[TSTAT_ENABLED_STATLO_ST] |= LO10((uintptr_t)stat); 1544 ent[TSTAT_ENABLED_BA] |= DISP22(va + en_baoffs, orig); 1545 } else { 1546 bcopy(disabled, ent, sizeof (disabled)); 1547 ent[TSTAT_DISABLED_BA] |= DISP22(va + dis_baoffs, orig); 1548 } 1549 1550 stat++; 1551 orig += sizeof (enabled); 1552 ent += sizeof (enabled) / sizeof (*ent); 1553 va += sizeof (enabled); 1554 } 1555 } 1556 1557 #undef TSTAT_ENABLED_STATHI 1558 #undef TSTAT_ENABLED_STATLO_LD 1559 #undef TSTAT_ENABLED_STATLO_ST 1560 #undef TSTAT_ENABLED_BA 1561 #undef TSTAT_DISABLED_BA 1562 1563 #else /* sun4v */ 1564 1565 #define TSTAT_ENABLED_STATHI 0 1566 #define TSTAT_ENABLED_STATLO 1 1567 #define TSTAT_ENABLED_ADDRHI 2 1568 #define TSTAT_ENABLED_ADDRLO 3 1569 #define TSTAT_ENABLED_CONTBA 6 1570 #define TSTAT_ENABLED_TDATASHFT 7 1571 #define TSTAT_DISABLED_ADDRHI 0 1572 #define TSTAT_DISABLED_ADDRLO 1 1573 1574 static void 1575 trapstat_make_traptab(tstat_percpu_t *tcpu) 1576 { 1577 uint32_t *ent; 1578 uint64_t *stat; 1579 uintptr_t orig, va, en_baoffs; 1580 uintptr_t tstat_cont_va; 1581 int nent; 1582 1583 /* 1584 * This is the entry in the interposing trap table for enabled trap 1585 * table entries. It loads a counter, increments it and stores it 1586 * back before branching to the actual trap table entry. 1587 * 1588 * All CPUs share the same interposing trap entry to count the 1589 * number of traps. Note that the trap counter is kept in per CPU 1590 * trap statistics area. Its address is obtained dynamically by 1591 * adding the offset of that CPU's trap statistics area from CPU 0 1592 * (i.e. cpu_id * TSTAT_DATA_SIZE) to the address of the CPU 0 1593 * trap counter already coded in the interposing trap entry itself. 1594 * 1595 * Since this interposing code sequence to count traps takes more 1596 * than 8 instructions, it's split in two parts as follows: 1597 * 1598 * tstat_trapcnt: 1599 * sethi %hi(stat), %g1 1600 * or %g1, %lo[stat), %g1 ! %g1 = CPU0 trap counter addr 1601 * sethi %hi(addr), %g2 1602 * or %g2, %lo(addr), %g2 ! %g2 = real trap handler addr 1603 * mov ASI_SCRATCHPAD_CPUID, %g3 1604 * ldxa [%g3]ASI_SCRATCHPAD, %g3 ! %g3 = CPU ID 1605 * ba tstat_trapcnt_cont ! branch to tstat_trapcnt_cont 1606 * sllx %g3, TSTAT_DATA_SHIFT, %g3 ! %g3 = CPU trapstat data offset 1607 * 1608 * tstat_trapcnt_cont: 1609 * ldx [%g1 + %g3], %g4 ! get counter value 1610 * add %g4, 1, %g4 ! increment value 1611 * jmp %g2 ! jump to original trap handler 1612 * stx %g4, [%g1 + %g3] ! store counter value 1613 * 1614 * First part, i.e. tstat_trapcnt, is per trap and is kept in-line in 1615 * the interposing trap table. However, the tstat_trapcnt_cont code 1616 * sequence is shared by all traps and is kept right after the 1617 * the interposing trap table. 1618 */ 1619 static const uint32_t enabled[TSTAT_ENT_NINSTR] = { 1620 0x03000000, /* sethi %hi(stat), %g1 */ 1621 0x82106000, /* or %g1, %lo[stat), %g1 */ 1622 0x05000000, /* sethi %hi(addr), %g2 */ 1623 0x8410a000, /* or %g2, %lo(addr), %g2 */ 1624 0x86102008, /* mov ASI_SCRATCHPAD_CPUID, %g3 */ 1625 0xc6d8c400, /* ldxa [%g3]ASI_SCRATCHPAD, %g3 */ 1626 0x10800000, /* ba enabled_cont */ 1627 0x8728f000 /* sllx %g3, TSTAT_DATA_SHIFT, %g3 */ 1628 }; 1629 1630 static const uint32_t enabled_cont[TSTAT_ENT_NINSTR] = { 1631 0xc8584003, /* ldx [%g1 + %g3], %g4 */ 1632 0x88012001, /* add %g4, 1, %g4 */ 1633 0x81c08000, /* jmp %g2 */ 1634 0xc8704003, /* stx %g4, [%g1 + %g3] */ 1635 NOP, NOP, NOP, NOP 1636 }; 1637 1638 /* 1639 * This is the entry in the interposing trap table for disabled trap 1640 * table entries. It simply "jmp" to the actual, underlying trap 1641 * table entry. As explained in the "Implementation Details" section 1642 * of the block comment, all TL>0 traps _must_ use the disabled entry; 1643 * additional entries may be explicitly disabled through the use 1644 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY. 1645 */ 1646 static const uint32_t disabled[TSTAT_ENT_NINSTR] = { 1647 0x05000000, /* sethi %hi(addr), %g2 */ 1648 0x8410a000, /* or %g2, %lo(addr), %g2 */ 1649 0x81c08000, /* jmp %g2 */ 1650 NOP, NOP, NOP, NOP, NOP, 1651 }; 1652 1653 ASSERT(MUTEX_HELD(&tstat_lock)); 1654 ent = tcpu->tcpu_instr->tinst_traptab; 1655 stat = (uint64_t *)TSTAT_CPU0_DATA_OFFS(tcpu, tdata_traps); 1656 orig = KERNELBASE; 1657 va = (uintptr_t)tcpu->tcpu_ibase; 1658 en_baoffs = TSTAT_ENABLED_CONTBA * sizeof (uint32_t); 1659 tstat_cont_va = TSTAT_INSTR_OFFS(tcpu, tinst_trapcnt); 1660 1661 for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) { 1662 /* 1663 * If TSTAT_OPT_TLBDATA option is enabled (-t or -T option) 1664 * we make sure only TSTAT_TLB_NENT traps can be enabled. 1665 * Note that this logic is somewhat moot since trapstat 1666 * cmd actually use TSTATIOC_NOENTRY ioctl to disable all 1667 * traps when performing Tlb stats collection. 1668 */ 1669 if ((!(tstat_options & TSTAT_OPT_TLBDATA) || 1670 nent < TSTAT_TLB_NENT) && tstat_enabled[nent]) { 1671 bcopy(enabled, ent, sizeof (enabled)); 1672 ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat); 1673 ent[TSTAT_ENABLED_STATLO] |= LO10((uintptr_t)stat); 1674 ent[TSTAT_ENABLED_ADDRHI] |= HI22((uintptr_t)orig); 1675 ent[TSTAT_ENABLED_ADDRLO] |= LO10((uintptr_t)orig); 1676 ent[TSTAT_ENABLED_CONTBA] |= 1677 DISP22(va + en_baoffs, tstat_cont_va); 1678 ent[TSTAT_ENABLED_TDATASHFT] |= 1679 LO10((uintptr_t)TSTAT_DATA_SHIFT); 1680 } else { 1681 bcopy(disabled, ent, sizeof (disabled)); 1682 ent[TSTAT_DISABLED_ADDRHI] |= HI22((uintptr_t)orig); 1683 ent[TSTAT_DISABLED_ADDRLO] |= LO10((uintptr_t)orig); 1684 } 1685 1686 stat++; 1687 orig += sizeof (enabled); 1688 ent += sizeof (enabled) / sizeof (*ent); 1689 va += sizeof (enabled); 1690 } 1691 bcopy(enabled_cont, (uint32_t *)tcpu->tcpu_instr->tinst_trapcnt, 1692 sizeof (enabled_cont)); 1693 } 1694 1695 #undef TSTAT_ENABLED_TDATASHFT 1696 #undef TSTAT_ENABLED_STATHI 1697 #undef TSTAT_ENABLED_STATLO 1698 #undef TSTAT_ENABLED_ADDRHI 1699 #undef TSTAT_ENABLED_ADDRLO 1700 #undef TSTAT_ENABLED_CONTBA 1701 #undef TSTAT_DISABLED_BA 1702 1703 #endif /* sun4v */ 1704 1705 #ifndef sun4v 1706 /* 1707 * See Section A.6 in SPARC v9 Manual. 1708 * max branch = 4*((2^21)-1) = 8388604 1709 */ 1710 #define MAX_BICC_BRANCH_DISPLACEMENT (4 * ((1 << 21) - 1)) 1711 #endif 1712 1713 static void 1714 trapstat_setup(processorid_t cpu) 1715 { 1716 tstat_percpu_t *tcpu = &tstat_percpu[cpu]; 1717 #ifndef sun4v 1718 int i; 1719 caddr_t va; 1720 pfn_t *pfn; 1721 cpu_t *cp; 1722 uint_t strand_idx; 1723 size_t tstat_offset; 1724 #else 1725 uint64_t offset; 1726 #endif 1727 1728 ASSERT(tcpu->tcpu_pfn == NULL); 1729 ASSERT(tcpu->tcpu_instr == NULL); 1730 ASSERT(tcpu->tcpu_data == NULL); 1731 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED); 1732 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED)); 1733 ASSERT(MUTEX_HELD(&cpu_lock)); 1734 ASSERT(MUTEX_HELD(&tstat_lock)); 1735 1736 #ifndef sun4v 1737 /* 1738 * The lower fifteen bits of the %tba are always read as zero; we must 1739 * align our instruction base address appropriately. 1740 */ 1741 tstat_offset = tstat_total_size; 1742 1743 cp = cpu_get(cpu); 1744 ASSERT(cp != NULL); 1745 if ((strand_idx = cpu ^ pg_plat_hw_instance_id(cp, PGHW_IPIPE)) != 0) { 1746 /* 1747 * On sun4u platforms with multiple CPUs sharing the MMU 1748 * (Olympus-C has 2 strands per core), each CPU uses a 1749 * disjoint trap table. The indexing is based on the 1750 * strand id, which is obtained by XOR'ing the cpuid with 1751 * the coreid. 1752 */ 1753 tstat_offset += tstat_total_size * strand_idx; 1754 1755 /* 1756 * Offset must be less than the maximum PC-relative branch 1757 * displacement for Bicc variants. See the Implementation 1758 * Details comment. 1759 */ 1760 ASSERT(tstat_offset <= MAX_BICC_BRANCH_DISPLACEMENT); 1761 } 1762 1763 tcpu->tcpu_ibase = (caddr_t)((KERNELBASE - tstat_offset) 1764 & TSTAT_TBA_MASK); 1765 tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE; 1766 tcpu->tcpu_vabase = tcpu->tcpu_ibase; 1767 1768 tcpu->tcpu_pfn = vmem_alloc(tstat_arena, tstat_total_pages, VM_SLEEP); 1769 bzero(tcpu->tcpu_pfn, tstat_total_pages); 1770 pfn = tcpu->tcpu_pfn; 1771 1772 tcpu->tcpu_instr = vmem_alloc(tstat_arena, TSTAT_INSTR_SIZE, VM_SLEEP); 1773 1774 va = (caddr_t)tcpu->tcpu_instr; 1775 for (i = 0; i < TSTAT_INSTR_PAGES; i++, va += MMU_PAGESIZE) 1776 *pfn++ = hat_getpfnum(kas.a_hat, va); 1777 1778 /* 1779 * We must be sure that the pages that we will use to examine the data 1780 * have the same virtual color as the pages to which the data is being 1781 * recorded, hence the alignment and phase constraints on the 1782 * allocation. 1783 */ 1784 tcpu->tcpu_data = vmem_xalloc(tstat_arena, tstat_data_size, 1785 shm_alignment, (uintptr_t)tcpu->tcpu_dbase & (shm_alignment - 1), 1786 0, 0, NULL, VM_SLEEP); 1787 bzero(tcpu->tcpu_data, tstat_data_size); 1788 tcpu->tcpu_data->tdata_cpuid = cpu; 1789 1790 va = (caddr_t)tcpu->tcpu_data; 1791 for (i = 0; i < tstat_data_pages; i++, va += MMU_PAGESIZE) 1792 *pfn++ = hat_getpfnum(kas.a_hat, va); 1793 1794 /* 1795 * Now that we have all of the instruction and data pages allocated, 1796 * make the trap table from scratch. 1797 */ 1798 trapstat_make_traptab(tcpu); 1799 1800 if (tstat_options & TSTAT_OPT_TLBDATA) { 1801 /* 1802 * TLB Statistics have been specified; set up the I- and D-TLB 1803 * entries and corresponding TLB return entries. 1804 */ 1805 trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS); 1806 trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS); 1807 } 1808 1809 #else /* sun4v */ 1810 1811 /* 1812 * The lower fifteen bits of the %tba are always read as zero; hence 1813 * it must be aligned at least on 512K boundary. 1814 */ 1815 tcpu->tcpu_vabase = (caddr_t)(KERNELBASE - 1816 MMU_PAGESIZE4M * tstat_num4m_mapping); 1817 tcpu->tcpu_ibase = tcpu->tcpu_vabase; 1818 tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE + 1819 cpu * TSTAT_DATA_SIZE; 1820 1821 tcpu->tcpu_pfn = &tstat_pfn[0]; 1822 tcpu->tcpu_instr = (tstat_instr_t *)tstat_va[0]; 1823 1824 offset = TSTAT_INSTR_SIZE + cpu * TSTAT_DATA_SIZE; 1825 tcpu->tcpu_data = (tstat_data_t *)(tstat_va[offset >> MMU_PAGESHIFT4M] + 1826 (offset & MMU_PAGEOFFSET4M)); 1827 bzero(tcpu->tcpu_data, TSTAT_DATA_SIZE); 1828 1829 /* 1830 * Now that we have all of the instruction and data pages allocated, 1831 * make the trap table from scratch. It should be done only once 1832 * as it is shared by all CPUs. 1833 */ 1834 if (!tstat_traptab_initialized) 1835 trapstat_make_traptab(tcpu); 1836 1837 if (tstat_options & TSTAT_OPT_TLBDATA) { 1838 /* 1839 * TLB Statistics have been specified; set up the I- and D-TLB 1840 * entries and corresponding TLB return entries. 1841 */ 1842 if (!tstat_traptab_initialized) { 1843 if (tstat_fast_tlbstat) { 1844 trapstat_tlbent(tcpu, TSTAT_ENT_IMMUMISS); 1845 trapstat_tlbent(tcpu, TSTAT_ENT_DMMUMISS); 1846 } else { 1847 trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS); 1848 trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS); 1849 } 1850 } 1851 } 1852 tstat_traptab_initialized = 1; 1853 #endif /* sun4v */ 1854 1855 tcpu->tcpu_flags |= TSTAT_CPU_ALLOCATED; 1856 1857 /* 1858 * Finally, get the target CPU to load the locked pages into its TLBs. 1859 */ 1860 xc_one(cpu, (xcfunc_t *)trapstat_load_tlb, 0, 0); 1861 } 1862 1863 static void 1864 trapstat_teardown(processorid_t cpu) 1865 { 1866 tstat_percpu_t *tcpu = &tstat_percpu[cpu]; 1867 int i; 1868 caddr_t va = tcpu->tcpu_vabase; 1869 1870 ASSERT(tcpu->tcpu_pfn != NULL); 1871 ASSERT(tcpu->tcpu_instr != NULL); 1872 ASSERT(tcpu->tcpu_data != NULL); 1873 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED); 1874 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED); 1875 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED)); 1876 ASSERT(MUTEX_HELD(&cpu_lock)); 1877 ASSERT(MUTEX_HELD(&tstat_lock)); 1878 1879 #ifndef sun4v 1880 vmem_free(tstat_arena, tcpu->tcpu_pfn, tstat_total_pages); 1881 vmem_free(tstat_arena, tcpu->tcpu_instr, TSTAT_INSTR_SIZE); 1882 vmem_free(tstat_arena, tcpu->tcpu_data, tstat_data_size); 1883 1884 for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) { 1885 xt_one(cpu, vtag_flushpage_tl1, (uint64_t)va, 1886 (uint64_t)ksfmmup); 1887 } 1888 #else 1889 for (i = 0; i < tstat_num4m_mapping; i++) { 1890 xt_one(cpu, vtag_unmap_perm_tl1, (uint64_t)va, KCONTEXT); 1891 va += MMU_PAGESIZE4M; 1892 } 1893 #endif 1894 1895 tcpu->tcpu_pfn = NULL; 1896 tcpu->tcpu_instr = NULL; 1897 tcpu->tcpu_data = NULL; 1898 tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED; 1899 } 1900 1901 static int 1902 trapstat_go() 1903 { 1904 cpu_t *cp; 1905 #ifdef sun4v 1906 int i; 1907 #endif /* sun4v */ 1908 1909 mutex_enter(&cpu_lock); 1910 mutex_enter(&tstat_lock); 1911 1912 if (tstat_running) { 1913 mutex_exit(&tstat_lock); 1914 mutex_exit(&cpu_lock); 1915 return (EBUSY); 1916 } 1917 1918 #ifdef sun4v 1919 /* 1920 * Compute the actual number of 4MB mappings 1921 * we need based on the guest's ncpu_guest_max value. 1922 * Note that earlier at compiled time, we did establish 1923 * and check against the sun4v solaris arch limit 1924 * (TSTAT_NUM4M_LIMIT) which is based on NCPU. 1925 */ 1926 tstat_num4m_mapping = TSTAT_NUM4M_MACRO(ncpu_guest_max); 1927 ASSERT(tstat_num4m_mapping <= TSTAT_NUM4M_LIMIT); 1928 1929 /* 1930 * Allocate large pages to hold interposing tables. 1931 */ 1932 for (i = 0; i < tstat_num4m_mapping; i++) { 1933 tstat_va[i] = contig_mem_alloc(MMU_PAGESIZE4M); 1934 tstat_pfn[i] = va_to_pfn(tstat_va[i]); 1935 if (tstat_pfn[i] == PFN_INVALID) { 1936 int j; 1937 for (j = 0; j < i; j++) { 1938 contig_mem_free(tstat_va[j], MMU_PAGESIZE4M); 1939 } 1940 mutex_exit(&tstat_lock); 1941 mutex_exit(&cpu_lock); 1942 return (EAGAIN); 1943 } 1944 } 1945 1946 1947 /* 1948 * For detailed TLB statistics, invoke CPU specific interface 1949 * to see if it supports a low overhead interface to collect 1950 * TSB hit statistics. If so, make set tstat_fast_tlbstat flag 1951 * to reflect that. 1952 */ 1953 if (tstat_options & TSTAT_OPT_TLBDATA) { 1954 int error; 1955 1956 tstat_fast_tlbstat = B_FALSE; 1957 error = cpu_trapstat_conf(CPU_TSTATCONF_INIT); 1958 if (error == 0) 1959 tstat_fast_tlbstat = B_TRUE; 1960 else if (error != ENOTSUP) { 1961 for (i = 0; i < tstat_num4m_mapping; i++) { 1962 contig_mem_free(tstat_va[i], MMU_PAGESIZE4M); 1963 } 1964 mutex_exit(&tstat_lock); 1965 mutex_exit(&cpu_lock); 1966 return (error); 1967 } 1968 } 1969 1970 tstat_hv_nopanic = 1; 1971 tstat_perm_mapping_failed = 0; 1972 #endif /* sun4v */ 1973 1974 /* 1975 * First, perform any necessary hot patching. 1976 */ 1977 trapstat_hotpatch(); 1978 1979 /* 1980 * Allocate the resources we'll need to measure probe effect. 1981 */ 1982 trapstat_probe_alloc(); 1983 1984 cp = cpu_list; 1985 do { 1986 if (!(tstat_percpu[cp->cpu_id].tcpu_flags & TSTAT_CPU_SELECTED)) 1987 continue; 1988 1989 trapstat_setup(cp->cpu_id); 1990 1991 /* 1992 * Note that due to trapstat_probe()'s use of global data, 1993 * we determine the probe effect on each CPU serially instead 1994 * of in parallel with an xc_all(). 1995 */ 1996 xc_one(cp->cpu_id, (xcfunc_t *)trapstat_probe, 0, 0); 1997 1998 #ifdef sun4v 1999 /* 2000 * Check to see if the first cpu's attempt to create 2001 * the perm mappings failed. This might happen if the 2002 * guest somehow exhausted all its limited perm mappings. 2003 * Note that we only check this once for the first 2004 * attempt since it shouldn't fail for subsequent cpus 2005 * mapping the same TTEs if the first attempt was successful. 2006 */ 2007 if (tstat_hv_nopanic && tstat_perm_mapping_failed) { 2008 tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id]; 2009 for (i = 0; i < tstat_num4m_mapping; i++) { 2010 contig_mem_free(tstat_va[i], MMU_PAGESIZE4M); 2011 } 2012 2013 /* 2014 * Do clean up before returning. 2015 * Cleanup is manageable since we 2016 * only need to do it for the first cpu 2017 * iteration that failed. 2018 */ 2019 trapstat_probe_free(); 2020 trapstat_hotpatch(); 2021 tcpu->tcpu_pfn = NULL; 2022 tcpu->tcpu_instr = NULL; 2023 tcpu->tcpu_data = NULL; 2024 tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED; 2025 mutex_exit(&tstat_lock); 2026 mutex_exit(&cpu_lock); 2027 return (EAGAIN); 2028 } 2029 tstat_hv_nopanic = 0; 2030 #endif /* sun4v */ 2031 2032 } while ((cp = cp->cpu_next) != cpu_list); 2033 2034 xc_all((xcfunc_t *)trapstat_enable, 0, 0); 2035 2036 trapstat_probe_free(); 2037 tstat_running = 1; 2038 mutex_exit(&tstat_lock); 2039 mutex_exit(&cpu_lock); 2040 2041 return (0); 2042 } 2043 2044 static int 2045 trapstat_stop() 2046 { 2047 int i; 2048 2049 mutex_enter(&cpu_lock); 2050 mutex_enter(&tstat_lock); 2051 if (!tstat_running) { 2052 mutex_exit(&tstat_lock); 2053 mutex_exit(&cpu_lock); 2054 return (ENXIO); 2055 } 2056 2057 xc_all((xcfunc_t *)trapstat_disable, 0, 0); 2058 2059 for (i = 0; i <= max_cpuid; i++) { 2060 if (tstat_percpu[i].tcpu_flags & TSTAT_CPU_ALLOCATED) 2061 trapstat_teardown(i); 2062 } 2063 2064 #ifdef sun4v 2065 tstat_traptab_initialized = 0; 2066 if (tstat_options & TSTAT_OPT_TLBDATA) 2067 (void) cpu_trapstat_conf(CPU_TSTATCONF_FINI); 2068 for (i = 0; i < tstat_num4m_mapping; i++) 2069 contig_mem_free(tstat_va[i], MMU_PAGESIZE4M); 2070 #endif 2071 trapstat_hotpatch(); 2072 tstat_running = 0; 2073 mutex_exit(&tstat_lock); 2074 mutex_exit(&cpu_lock); 2075 2076 return (0); 2077 } 2078 2079 /* 2080 * This is trapstat's DR CPU configuration callback. It's called (with 2081 * cpu_lock held) to unconfigure a newly powered-off CPU, or to configure a 2082 * powered-off CPU that is to be brought into the system. We need only take 2083 * action in the unconfigure case: because a powered-off CPU will have its 2084 * trap table restored to KERNELBASE if it is ever powered back on, we must 2085 * update the flags to reflect that trapstat is no longer enabled on the 2086 * powered-off CPU. Note that this means that a TSTAT_CPU_ENABLED CPU that 2087 * is unconfigured/powered off and later powered back on/reconfigured will 2088 * _not_ be re-TSTAT_CPU_ENABLED. 2089 */ 2090 static int 2091 trapstat_cpu_setup(cpu_setup_t what, processorid_t cpu) 2092 { 2093 tstat_percpu_t *tcpu = &tstat_percpu[cpu]; 2094 2095 ASSERT(MUTEX_HELD(&cpu_lock)); 2096 mutex_enter(&tstat_lock); 2097 2098 if (!tstat_running) { 2099 mutex_exit(&tstat_lock); 2100 return (0); 2101 } 2102 2103 switch (what) { 2104 case CPU_CONFIG: 2105 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED)); 2106 break; 2107 2108 case CPU_UNCONFIG: 2109 if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED) { 2110 tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED; 2111 #ifdef sun4v 2112 /* 2113 * A power-off, causes the cpu mondo queues to be 2114 * unconfigured on sun4v. Since we can't teardown 2115 * trapstat's mappings on the cpu that is going away, 2116 * we simply mark it as not allocated. This will 2117 * prevent a teardown on a cpu with the same cpu id 2118 * that might have been added while trapstat is running. 2119 */ 2120 if (tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED) { 2121 tcpu->tcpu_pfn = NULL; 2122 tcpu->tcpu_instr = NULL; 2123 tcpu->tcpu_data = NULL; 2124 tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED; 2125 } 2126 #endif 2127 } 2128 break; 2129 2130 default: 2131 break; 2132 } 2133 2134 mutex_exit(&tstat_lock); 2135 return (0); 2136 } 2137 2138 /* 2139 * This is called before a CPR suspend and after a CPR resume. We don't have 2140 * anything to do before a suspend, but after a restart we must restore the 2141 * trap table to be our interposing trap table. However, we don't actually 2142 * know whether or not the CPUs have been powered off -- this routine may be 2143 * called while restoring from a failed CPR suspend. We thus run through each 2144 * TSTAT_CPU_ENABLED CPU, and explicitly destroy and reestablish its 2145 * interposing trap table. This assures that our state is correct regardless 2146 * of whether or not the CPU has been newly powered on. 2147 */ 2148 /*ARGSUSED*/ 2149 static boolean_t 2150 trapstat_cpr(void *arg, int code) 2151 { 2152 cpu_t *cp; 2153 2154 if (code == CB_CODE_CPR_CHKPT) 2155 return (B_TRUE); 2156 2157 ASSERT(code == CB_CODE_CPR_RESUME); 2158 2159 mutex_enter(&cpu_lock); 2160 mutex_enter(&tstat_lock); 2161 2162 if (!tstat_running) { 2163 mutex_exit(&tstat_lock); 2164 mutex_exit(&cpu_lock); 2165 return (B_TRUE); 2166 } 2167 2168 cp = cpu_list; 2169 do { 2170 tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id]; 2171 2172 if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED)) 2173 continue; 2174 2175 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED); 2176 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED); 2177 2178 xc_one(cp->cpu_id, (xcfunc_t *)trapstat_disable, 0, 0); 2179 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED)); 2180 2181 /* 2182 * Preserve this CPU's data in tstat_buffer and rip down its 2183 * interposing trap table. 2184 */ 2185 #ifdef sun4v 2186 bcopy(tcpu->tcpu_data, tstat_buffer, TSTAT_DATA_SIZE); 2187 #else 2188 bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size); 2189 #endif /* sun4v */ 2190 trapstat_teardown(cp->cpu_id); 2191 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED)); 2192 2193 /* 2194 * Reestablish the interposing trap table and restore the old 2195 * data. 2196 */ 2197 trapstat_setup(cp->cpu_id); 2198 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED); 2199 #ifdef sun4v 2200 bcopy(tstat_buffer, tcpu->tcpu_data, TSTAT_DATA_SIZE); 2201 #else 2202 bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size); 2203 #endif /* sun4v */ 2204 2205 xc_one(cp->cpu_id, (xcfunc_t *)trapstat_enable, 0, 0); 2206 } while ((cp = cp->cpu_next) != cpu_list); 2207 2208 mutex_exit(&tstat_lock); 2209 mutex_exit(&cpu_lock); 2210 2211 return (B_TRUE); 2212 } 2213 2214 /*ARGSUSED*/ 2215 static int 2216 trapstat_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) 2217 { 2218 int i; 2219 2220 mutex_enter(&cpu_lock); 2221 mutex_enter(&tstat_lock); 2222 if (tstat_open != 0) { 2223 mutex_exit(&tstat_lock); 2224 mutex_exit(&cpu_lock); 2225 return (EBUSY); 2226 } 2227 2228 /* 2229 * Register this in open() rather than in attach() to prevent deadlock 2230 * with DR code. During attach, I/O device tree locks are grabbed 2231 * before trapstat_attach() is invoked - registering in attach 2232 * will result in the lock order: device tree lock, cpu_lock. 2233 * DR code however requires that cpu_lock be acquired before 2234 * device tree locks. 2235 */ 2236 ASSERT(!tstat_running); 2237 register_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL); 2238 2239 /* 2240 * Clear all options. And until specific CPUs are specified, we'll 2241 * mark all CPUs as selected. 2242 */ 2243 tstat_options = 0; 2244 2245 for (i = 0; i <= max_cpuid; i++) 2246 tstat_percpu[i].tcpu_flags |= TSTAT_CPU_SELECTED; 2247 2248 /* 2249 * By default, all traps at TL=0 are enabled. Traps at TL>0 must 2250 * be disabled. 2251 */ 2252 for (i = 0; i < TSTAT_TOTAL_NENT; i++) 2253 tstat_enabled[i] = i < TSTAT_NENT ? 1 : 0; 2254 2255 tstat_open = 1; 2256 mutex_exit(&tstat_lock); 2257 mutex_exit(&cpu_lock); 2258 2259 return (0); 2260 } 2261 2262 /*ARGSUSED*/ 2263 static int 2264 trapstat_close(dev_t dev, int flag, int otyp, cred_t *cred_p) 2265 { 2266 (void) trapstat_stop(); 2267 2268 ASSERT(!tstat_running); 2269 2270 mutex_enter(&cpu_lock); 2271 unregister_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL); 2272 mutex_exit(&cpu_lock); 2273 2274 tstat_open = 0; 2275 return (DDI_SUCCESS); 2276 } 2277 2278 static int 2279 trapstat_option(int option) 2280 { 2281 mutex_enter(&tstat_lock); 2282 2283 if (tstat_running) { 2284 mutex_exit(&tstat_lock); 2285 return (EBUSY); 2286 } 2287 2288 tstat_options |= option; 2289 mutex_exit(&tstat_lock); 2290 2291 return (0); 2292 } 2293 2294 /*ARGSUSED*/ 2295 static int 2296 trapstat_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *crd, int *rval) 2297 { 2298 int i, j, out; 2299 size_t dsize; 2300 2301 switch (cmd) { 2302 case TSTATIOC_GO: 2303 return (trapstat_go()); 2304 2305 case TSTATIOC_NOGO: 2306 return (trapstat_option(TSTAT_OPT_NOGO)); 2307 2308 case TSTATIOC_STOP: 2309 return (trapstat_stop()); 2310 2311 case TSTATIOC_CPU: 2312 if (arg < 0 || arg > max_cpuid) 2313 return (EINVAL); 2314 /*FALLTHROUGH*/ 2315 2316 case TSTATIOC_NOCPU: 2317 mutex_enter(&tstat_lock); 2318 2319 if (tstat_running) { 2320 mutex_exit(&tstat_lock); 2321 return (EBUSY); 2322 } 2323 2324 /* 2325 * If this is the first CPU to be specified (or if we are 2326 * being asked to explicitly de-select CPUs), disable all CPUs. 2327 */ 2328 if (!(tstat_options & TSTAT_OPT_CPU) || cmd == TSTATIOC_NOCPU) { 2329 tstat_options |= TSTAT_OPT_CPU; 2330 2331 for (i = 0; i <= max_cpuid; i++) { 2332 tstat_percpu_t *tcpu = &tstat_percpu[i]; 2333 2334 ASSERT(cmd == TSTATIOC_NOCPU || 2335 (tcpu->tcpu_flags & TSTAT_CPU_SELECTED)); 2336 tcpu->tcpu_flags &= ~TSTAT_CPU_SELECTED; 2337 } 2338 } 2339 2340 if (cmd == TSTATIOC_CPU) 2341 tstat_percpu[arg].tcpu_flags |= TSTAT_CPU_SELECTED; 2342 2343 mutex_exit(&tstat_lock); 2344 2345 return (0); 2346 2347 case TSTATIOC_ENTRY: 2348 mutex_enter(&tstat_lock); 2349 2350 if (tstat_running) { 2351 mutex_exit(&tstat_lock); 2352 return (EBUSY); 2353 } 2354 2355 if (arg >= TSTAT_NENT || arg < 0) { 2356 mutex_exit(&tstat_lock); 2357 return (EINVAL); 2358 } 2359 2360 if (!(tstat_options & TSTAT_OPT_ENTRY)) { 2361 /* 2362 * If this is the first entry that we are explicitly 2363 * enabling, explicitly disable every TL=0 entry. 2364 */ 2365 for (i = 0; i < TSTAT_NENT; i++) 2366 tstat_enabled[i] = 0; 2367 2368 tstat_options |= TSTAT_OPT_ENTRY; 2369 } 2370 2371 tstat_enabled[arg] = 1; 2372 mutex_exit(&tstat_lock); 2373 return (0); 2374 2375 case TSTATIOC_NOENTRY: 2376 mutex_enter(&tstat_lock); 2377 2378 if (tstat_running) { 2379 mutex_exit(&tstat_lock); 2380 return (EBUSY); 2381 } 2382 2383 for (i = 0; i < TSTAT_NENT; i++) 2384 tstat_enabled[i] = 0; 2385 2386 mutex_exit(&tstat_lock); 2387 return (0); 2388 2389 case TSTATIOC_READ: 2390 mutex_enter(&tstat_lock); 2391 2392 if (tstat_options & TSTAT_OPT_TLBDATA) { 2393 dsize = tstat_data_t_exported_size; 2394 } else { 2395 dsize = sizeof (tstat_data_t); 2396 } 2397 2398 for (i = 0, out = 0; i <= max_cpuid; i++) { 2399 tstat_percpu_t *tcpu = &tstat_percpu[i]; 2400 2401 if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED)) 2402 continue; 2403 2404 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED); 2405 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED); 2406 2407 tstat_buffer->tdata_cpuid = -1; 2408 xc_one(i, (xcfunc_t *)trapstat_snapshot, 0, 0); 2409 2410 if (tstat_buffer->tdata_cpuid == -1) { 2411 /* 2412 * This CPU is not currently responding to 2413 * cross calls; we have caught it while it is 2414 * being unconfigured. We'll drop tstat_lock 2415 * and pick up and drop cpu_lock. By the 2416 * time we acquire cpu_lock, the DR operation 2417 * will appear consistent and we can assert 2418 * that trapstat_cpu_setup() has cleared 2419 * TSTAT_CPU_ENABLED. 2420 */ 2421 mutex_exit(&tstat_lock); 2422 mutex_enter(&cpu_lock); 2423 mutex_exit(&cpu_lock); 2424 mutex_enter(&tstat_lock); 2425 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED)); 2426 continue; 2427 } 2428 2429 /* 2430 * Need to compensate for the difference between page 2431 * sizes exported to users and page sizes available 2432 * within the kernel. 2433 */ 2434 if ((tstat_options & TSTAT_OPT_TLBDATA) && 2435 (tstat_pgszs != tstat_user_pgszs)) { 2436 tstat_pgszdata_t *tp; 2437 uint_t szc; 2438 2439 tp = &tstat_buffer->tdata_pgsz[0]; 2440 for (j = 0; j < tstat_user_pgszs; j++) { 2441 if ((szc = USERSZC_2_SZC(j)) != j) { 2442 bcopy(&tp[szc], &tp[j], 2443 sizeof (tstat_pgszdata_t)); 2444 } 2445 } 2446 } 2447 2448 if (copyout(tstat_buffer, (void *)arg, dsize) != 0) { 2449 mutex_exit(&tstat_lock); 2450 return (EFAULT); 2451 } 2452 2453 out++; 2454 arg += dsize; 2455 } 2456 2457 if (out != max_cpuid + 1) { 2458 processorid_t cpuid = -1; 2459 arg += offsetof(tstat_data_t, tdata_cpuid); 2460 2461 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0) { 2462 mutex_exit(&tstat_lock); 2463 return (EFAULT); 2464 } 2465 } 2466 2467 mutex_exit(&tstat_lock); 2468 2469 return (0); 2470 2471 case TSTATIOC_TLBDATA: 2472 return (trapstat_option(TSTAT_OPT_TLBDATA)); 2473 2474 default: 2475 break; 2476 } 2477 2478 return (ENOTTY); 2479 } 2480 2481 /*ARGSUSED*/ 2482 static int 2483 trapstat_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 2484 { 2485 int error; 2486 2487 switch (infocmd) { 2488 case DDI_INFO_DEVT2DEVINFO: 2489 *result = (void *)tstat_devi; 2490 error = DDI_SUCCESS; 2491 break; 2492 case DDI_INFO_DEVT2INSTANCE: 2493 *result = (void *)0; 2494 error = DDI_SUCCESS; 2495 break; 2496 default: 2497 error = DDI_FAILURE; 2498 } 2499 return (error); 2500 } 2501 2502 static int 2503 trapstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) 2504 { 2505 switch (cmd) { 2506 case DDI_ATTACH: 2507 break; 2508 2509 case DDI_RESUME: 2510 return (DDI_SUCCESS); 2511 2512 default: 2513 return (DDI_FAILURE); 2514 } 2515 2516 if (ddi_create_minor_node(devi, "trapstat", S_IFCHR, 2517 0, DDI_PSEUDO, 0) == DDI_FAILURE) { 2518 ddi_remove_minor_node(devi, NULL); 2519 return (DDI_FAILURE); 2520 } 2521 2522 ddi_report_dev(devi); 2523 tstat_devi = devi; 2524 2525 tstat_pgszs = page_num_pagesizes(); 2526 tstat_user_pgszs = page_num_user_pagesizes(0); 2527 tstat_data_t_size = sizeof (tstat_data_t) + 2528 (tstat_pgszs - 1) * sizeof (tstat_pgszdata_t); 2529 tstat_data_t_exported_size = sizeof (tstat_data_t) + 2530 (tstat_user_pgszs - 1) * sizeof (tstat_pgszdata_t); 2531 #ifndef sun4v 2532 tstat_data_pages = (tstat_data_t_size >> MMU_PAGESHIFT) + 1; 2533 tstat_total_pages = TSTAT_INSTR_PAGES + tstat_data_pages; 2534 tstat_data_size = tstat_data_pages * MMU_PAGESIZE; 2535 tstat_total_size = TSTAT_INSTR_SIZE + tstat_data_size; 2536 #else 2537 /* 2538 * For sun4v, the tstat_data_t_size reflect the tstat_buffer 2539 * output size based on tstat_data_t structure. For tlbstats 2540 * collection, we use the internal tstat_tdata_t structure 2541 * to collect the tlbstats for the pages. Therefore we 2542 * need to adjust the size for the assertion. 2543 */ 2544 ASSERT((tstat_data_t_size - sizeof (tstat_data_t) + 2545 sizeof (tstat_tdata_t)) <= TSTAT_DATA_SIZE); 2546 #endif 2547 2548 tstat_percpu = kmem_zalloc((max_cpuid + 1) * 2549 sizeof (tstat_percpu_t), KM_SLEEP); 2550 2551 /* 2552 * Create our own arena backed by segkmem to assure a source of 2553 * MMU_PAGESIZE-aligned allocations. We allocate out of the 2554 * heap32_arena to assure that we can address the allocated memory with 2555 * a single sethi/simm13 pair in the interposing trap table entries. 2556 */ 2557 tstat_arena = vmem_create("trapstat", NULL, 0, MMU_PAGESIZE, 2558 segkmem_alloc_permanent, segkmem_free, heap32_arena, 0, VM_SLEEP); 2559 2560 tstat_enabled = kmem_alloc(TSTAT_TOTAL_NENT * sizeof (int), KM_SLEEP); 2561 tstat_buffer = kmem_alloc(tstat_data_t_size, KM_SLEEP); 2562 2563 /* 2564 * CB_CL_CPR_POST_USER is the class that executes from cpr_resume() 2565 * after user threads can be restarted. By executing in this class, 2566 * we are assured of the availability of system services needed to 2567 * resume trapstat (specifically, we are assured that all CPUs are 2568 * restarted and responding to cross calls). 2569 */ 2570 tstat_cprcb = 2571 callb_add(trapstat_cpr, NULL, CB_CL_CPR_POST_USER, "trapstat"); 2572 2573 return (DDI_SUCCESS); 2574 } 2575 2576 static int 2577 trapstat_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) 2578 { 2579 int rval; 2580 2581 ASSERT(devi == tstat_devi); 2582 2583 switch (cmd) { 2584 case DDI_DETACH: 2585 break; 2586 2587 case DDI_SUSPEND: 2588 return (DDI_SUCCESS); 2589 2590 default: 2591 return (DDI_FAILURE); 2592 } 2593 2594 ASSERT(!tstat_running); 2595 2596 rval = callb_delete(tstat_cprcb); 2597 ASSERT(rval == 0); 2598 2599 kmem_free(tstat_buffer, tstat_data_t_size); 2600 kmem_free(tstat_enabled, TSTAT_TOTAL_NENT * sizeof (int)); 2601 vmem_destroy(tstat_arena); 2602 kmem_free(tstat_percpu, (max_cpuid + 1) * sizeof (tstat_percpu_t)); 2603 ddi_remove_minor_node(devi, NULL); 2604 2605 return (DDI_SUCCESS); 2606 } 2607 2608 /* 2609 * Configuration data structures 2610 */ 2611 static struct cb_ops trapstat_cb_ops = { 2612 trapstat_open, /* open */ 2613 trapstat_close, /* close */ 2614 nulldev, /* strategy */ 2615 nulldev, /* print */ 2616 nodev, /* dump */ 2617 nodev, /* read */ 2618 nodev, /* write */ 2619 trapstat_ioctl, /* ioctl */ 2620 nodev, /* devmap */ 2621 nodev, /* mmap */ 2622 nodev, /* segmap */ 2623 nochpoll, /* poll */ 2624 ddi_prop_op, /* cb_prop_op */ 2625 0, /* streamtab */ 2626 D_MP | D_NEW /* Driver compatibility flag */ 2627 }; 2628 2629 static struct dev_ops trapstat_ops = { 2630 DEVO_REV, /* devo_rev, */ 2631 0, /* refcnt */ 2632 trapstat_info, /* getinfo */ 2633 nulldev, /* identify */ 2634 nulldev, /* probe */ 2635 trapstat_attach, /* attach */ 2636 trapstat_detach, /* detach */ 2637 nulldev, /* reset */ 2638 &trapstat_cb_ops, /* cb_ops */ 2639 (struct bus_ops *)0, /* bus_ops */ 2640 NULL, /* power */ 2641 ddi_quiesce_not_needed, /* quiesce */ 2642 }; 2643 2644 static struct modldrv modldrv = { 2645 &mod_driverops, /* Type of module. This one is a driver */ 2646 "Trap Statistics 1.1", /* name of module */ 2647 &trapstat_ops, /* driver ops */ 2648 }; 2649 2650 static struct modlinkage modlinkage = { 2651 MODREV_1, (void *)&modldrv, NULL 2652 }; 2653 2654 int 2655 _init(void) 2656 { 2657 return (mod_install(&modlinkage)); 2658 } 2659 2660 int 2661 _fini(void) 2662 { 2663 return (mod_remove(&modlinkage)); 2664 } 2665 2666 int 2667 _info(struct modinfo *modinfop) 2668 { 2669 return (mod_info(&modlinkage, modinfop)); 2670 } 2671