1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26
27 #include <sys/systm.h>
28 #include <sys/conf.h>
29 #include <sys/stat.h>
30 #include <sys/ddi.h>
31 #include <sys/sunddi.h>
32 #include <sys/modctl.h>
33 #include <sys/cpu_module.h>
34 #include <vm/hat_sfmmu.h>
35 #include <vm/seg_kmem.h>
36 #include <vm/seg_kpm.h>
37 #include <vm/vm_dep.h>
38 #include <sys/machsystm.h>
39 #include <sys/machasi.h>
40 #include <sys/sysmacros.h>
41 #include <sys/callb.h>
42 #include <sys/archsystm.h>
43 #include <sys/trapstat.h>
44 #ifdef sun4v
45 #include <sys/hypervisor_api.h>
46 #endif
47 #ifndef sun4v
48 #include <sys/pghw.h>
49 #endif
50
51 /* BEGIN CSTYLED */
52 /*
53 * trapstat: Trap Statistics through Dynamic Trap Table Interposition
54 * -------------------------------------------------------------------
55 *
56 * Motivation and Overview
57 *
58 * Despite being a fundamental indicator of system behavior, there has
59 * historically been very little insight provided into the frequency and cost
60 * of machine-specific traps. The lack of insight has been especially acute
61 * on UltraSPARC microprocessors: because these microprocessors handle TLB
62 * misses as software traps, the frequency and duration of traps play a
63 * decisive role in the performance of the memory system. As applications have
64 * increasingly outstripped TLB reach, this has become increasingly true.
65 *
66 * Part of the difficulty of observing trap behavior is that the trap handlers
67 * are so frequently called (e.g. millions of times per second) that any
68 * permanently enabled instrumentation would induce an unacceptable performance
69 * degradation. Thus, it is a constraint on any trap observability
70 * infrastructure that it have no probe effect when not explicitly enabled.
71 *
72 * The basic idea, then, is to create an interposing trap table in which each
73 * entry increments a per-trap, in-memory counter and then jumps to the actual,
74 * underlying trap table entry. To enable trapstat, we atomically write to the
75 * trap base address (%tba) register to point to our interposing trap table.
76 * (Note that per-CPU statistics fall out by creating a different trap table
77 * for each CPU.)
78 *
79 * Implementation Details
80 *
81 * While the idea is straight-forward, a nuance of SPARC V9 slightly
82 * complicates the implementation. Unlike its predecessors, SPARC V9 supports
83 * the notion of nested traps. The trap level is kept in the TL register:
84 * during normal operation it is 0; when a trap is taken, the TL register is
85 * incremented by 1. To aid system software, SPARC V9 breaks the trap table
86 * into two halves: the lower half contains the trap handlers for traps taken
87 * when TL is 0; the upper half contains the trap handlers for traps taken
88 * when TL is greater than 0. Each half is further subdivided into two
89 * subsequent halves: the lower half contains the trap handlers for traps
90 * other than those induced by the trap instruction (Tcc variants); the upper
91 * half contains the trap handlers for traps induced by the trap instruction.
92 * This gives a total of four ranges, with each range containing 256 traps:
93 *
94 * +--------------------------------+- 3ff
95 * | | .
96 * | Trap instruction, TL>0 | .
97 * | | .
98 * |- - - - - - - - - - - - - - - - +- 300
99 * |- - - - - - - - - - - - - - - - +- 2ff
100 * | | .
101 * | Non-trap instruction, TL>0 | .
102 * | | .
103 * |- - - - - - - - - - - - - - - - +- 200
104 * |- - - - - - - - - - - - - - - - +- 1ff
105 * | | .
106 * | Trap instruction, TL=0 | .
107 * | | .
108 * |- - - - - - - - - - - - - - - - +- 100
109 * |- - - - - - - - - - - - - - - - +- 0ff
110 * | | .
111 * | Non-trap instruction, TL=0 | .
112 * | | .
113 * +--------------------------------+- 000
114 *
115 *
116 * Solaris, however, doesn't have reason to support trap instructions when
117 * TL>0 (only privileged code may execute at TL>0; not supporting this only
118 * constrains our own implementation). The trap table actually looks like:
119 *
120 * +--------------------------------+- 2ff
121 * | | .
122 * | Non-trap instruction, TL>0 | .
123 * | | .
124 * |- - - - - - - - - - - - - - - - +- 200
125 * |- - - - - - - - - - - - - - - - +- 1ff
126 * | | .
127 * | Trap instruction, TL=0 | .
128 * | | .
129 * |- - - - - - - - - - - - - - - - +- 100
130 * |- - - - - - - - - - - - - - - - +- 0ff
131 * | | .
132 * | Non-trap instruction, TL=0 | .
133 * | | .
134 * +--------------------------------+- 000
135 *
136 * Putatively to aid system software, SPARC V9 has the notion of multiple
137 * sets of global registers. UltraSPARC defines four sets of global
138 * registers:
139 *
140 * Normal Globals
141 * Alternate Globals (AGs)
142 * MMU Globals (MGs)
143 * Interrupt Globals (IGs)
144 *
145 * The set of globals in use is controlled by bits in PSTATE; when TL is 0
146 * (and PSTATE has not been otherwise explicitly modified), the Normal Globals
147 * are in use. When a trap is issued, PSTATE is modified to point to a set of
148 * globals corresponding to the trap type. Most traps correspond to the
149 * Alternate Globals, with a minority corresponding to the MMU Globals, and
150 * only the interrupt-vector trap (vector 0x60) corresponding to the Interrupt
151 * Globals. (The complete mapping can be found in the UltraSPARC I&II User's
152 * Manual.)
153 *
154 * Note that the sets of globals are per trap _type_, not per trap _level_.
155 * Thus, when executing a TL>0 trap handler, one may not have registers
156 * available (for example, both trap-instruction traps and spill traps execute
157 * on the alternate globals; if a trap-instruction trap induces a window spill,
158 * the window spill handler has no available globals). For trapstat, this is
159 * problematic: a register is required to transfer control from one arbitrary
160 * location (in the interposing trap table) to another (in the actual trap
161 * table).
162 *
163 * We solve this problem by exploiting the trap table's location at the bottom
164 * of valid kernel memory (i.e. at KERNELBASE). We locate the interposing trap
165 * tables just below KERNELBASE -- thereby allowing us to use a branch-always
166 * instruction (ba) instead of a jump instruction (jmp) to transfer control
167 * from the TL>0 entries in the interposing trap table to the TL>0 entries in
168 * the actual trap table. (N.B. while this allows trap table interposition to
169 * work, it necessarily limits trapstat to only recording information about
170 * TL=0 traps -- there is no way to increment a counter without using a
171 * register.) Diagrammatically:
172 *
173 * Actual trap table:
174 *
175 * +--------------------------------+- 2ff
176 * | | .
177 * | Non-trap instruction, TL>0 | . <-----------------------+
178 * | | . <-----------------------|-+
179 * |- - - - - - - - - - - - - - - - +- 200 <-----------------------|-|-+
180 * |- - - - - - - - - - - - - - - - +- 1ff | | |
181 * | | . | | |
182 * | Trap instruction, TL=0 | . <-----------------+ | | |
183 * | | . <-----------------|-+ | | |
184 * |- - - - - - - - - - - - - - - - +- 100 <-----------------|-|-+ | | |
185 * |- - - - - - - - - - - - - - - - +- 0ff | | | | | |
186 * | | . | | | | | |
187 * | Non-trap instruction, TL=0 | . <-----------+ | | | | | |
188 * | | . <-----------|-+ | | | | | |
189 * +--------------------------------+- 000 <-----------|-|-+ | | | | | |
190 * KERNELBASE | | | | | | | | |
191 * | | | | | | | | |
192 * | | | | | | | | |
193 * Interposing trap table: | | | | | | | | |
194 * | | | | | | | | |
195 * +--------------------------------+- 2ff | | | | | | | | |
196 * | ... | . | | | | | | | | |
197 * | ... | . | | | | | | | | |
198 * | ... | . | | | | | | | | |
199 * |- - - - - - - - - - - - - - - - +- 203 | | | | | | | | |
200 * | ba,a | -------------|-|-|-|-|-|-+ | |
201 * |- - - - - - - - - - - - - - - - +- 202 | | | | | | | |
202 * | ba,a | -------------|-|-|-|-|-|---+ |
203 * |- - - - - - - - - - - - - - - - +- 201 | | | | | | |
204 * | ba,a | -------------|-|-|-|-|-|-----+
205 * |- - - - - - - - - - - - - - - - +- 200 | | | | | |
206 * | ... | . | | | | | |
207 * | ... | . | | | | | |
208 * | ... | . | | | | | |
209 * |- - - - - - - - - - - - - - - - +- 103 | | | | | |
210 * | (Increment counter) | | | | | | |
211 * | ba,a | -------------------+ | |
212 * |- - - - - - - - - - - - - - - - +- 102 | | | | |
213 * | (Increment counter) | | | | | |
214 * | ba,a | ---------------------+ |
215 * |- - - - - - - - - - - - - - - - +- 101 | | | |
216 * | (Increment counter) | | | | |
217 * | ba,a | -----------------------+
218 * |- - - - - - - - - - - - - - - - +- 100 | | |
219 * | ... | . | | |
220 * | ... | . | | |
221 * | ... | . | | |
222 * |- - - - - - - - - - - - - - - - +- 003 | | |
223 * | (Increment counter) | | | |
224 * | ba,a | -------------+ | |
225 * |- - - - - - - - - - - - - - - - +- 002 | |
226 * | (Increment counter) | | |
227 * | ba,a | ---------------+ |
228 * |- - - - - - - - - - - - - - - - +- 001 |
229 * | (Increment counter) | |
230 * | ba,a | -----------------+
231 * +--------------------------------+- 000
232 * KERNELBASE - tstat_total_size
233 *
234 * tstat_total_size is the number of bytes required for each trap table. It
235 * must be true that KERNELBASE - tstat_total_size is less than the maximum
236 * branch displacement; if each CPU were to consume a disjoint virtual range
237 * below KERNELBASE for its trap table, we could support at most
238 * (maximum_branch_displacement / tstat_total_size) CPUs. The maximum branch
239 * displacement for Bicc variants is just under eight megabytes, and (because
240 * the %tba must be 32K aligned), tstat_total_size must be at least 32K; if
241 * each CPU were to consume a disjoint virtual range, we would have an
242 * unacceptably low upper bound of 256 CPUs.
243 *
244 * While there are tricks that one could use to address this constraint (e.g.,
245 * creating trampolines every maximum_branch_displacement bytes), we instead
246 * solve this by not permitting each CPU to consume a disjoint virtual range.
247 * Rather, we have each CPU's interposing trap table use the _same_ virtual
248 * range, but we back the trap tables with disjoint physical memory. Normally,
249 * such one-to-many virtual-to-physical mappings are illegal; this is
250 * permissible here only because the pages for the interposing trap table are
251 * necessarily locked in the TLB. (The CPUs thus never have the opportunity to
252 * discover that they have conflicting translations.)
253 *
254 * On CMT architectures in which CPUs can share MMUs, the above trick will not
255 * work: two CPUs that share an MMU cannot have the same virtual address map
256 * to disjoint physical pages. On these architectures, any CPUs sharing the
257 * same MMU must consume a disjoint 32K virtual address range -- limiting the
258 * number of CPUs sharing an MMU on these architectures to 256 due to the
259 * branch displacement limitation described above. On the sun4v architecture,
260 * there is a further limitation: a guest may not have more than eight locked
261 * TLB entries per MMU. To allow operation under this restriction, the
262 * interposing trap table and the trap statistics are each accessed through
263 * a single 4M TLB entry. This limits the footprint to two locked entries
264 * (one for the I-TLB and one for the D-TLB), but further restricts the number
265 * of CPUs to 128 per MMU. However, support for more than 128 CPUs can easily
266 * be added via a hybrid scheme, where the same 4M virtual address is used
267 * on different MMUs.
268 *
269 * On sun4v architecture, we cannot use the hybrid scheme as the architecture
270 * imposes additional restriction on the number of permanent mappings per
271 * guest and it is illegal to use the same virtual address to map different
272 * TTEs on different MMUs. Instead, we increase the number of supported CPUs
273 * by reducing the virtual address space requirements per CPU via shared
274 * interposing trap table as follows:
275 *
276 * Offset (within 4MB page)
277 * +------------------------------------+- 0x400000
278 * | CPU 1015 trap statistics (4KB) | .
279 * |- - - - - - - - - - - - - - - - - - +- 0x3ff000
280 * | |
281 * | ... |
282 * | |
283 * |- - - - - - - - - - - - - - - - - - +- 0x00a000
284 * | CPU 1 trap statistics (4KB) | .
285 * |- - - - - - - - - - - - - - - - - - +- 0x009000
286 * | CPU 0 trap statistics (4KB) | .
287 * |- - - - - - - - - - - - - - - - - - +- 0x008000
288 * | Shared trap handler continuation | .
289 * |- - - - - - - - - - - - - - - - - - +- 0x006000
290 * | Non-trap instruction, TL>0 | .
291 * |- - - - - - - - - - - - - - - - - - +- 0x004000
292 * | Trap instruction, TL=0 | .
293 * |- - - - - - - - - - - - - - - - - - +- 0x002000
294 * | Non-trap instruction, TL=0 | .
295 * +------------------------------------+- 0x000000
296 *
297 * Note that each CPU has its own 4K space for its trap statistics but
298 * shares the same interposing trap handlers. Interposing trap handlers
299 * use the CPU ID to determine the location of per CPU trap statistics
300 * area dynamically. This increases the interposing trap handler overhead,
301 * but is acceptable as it allows us to support up to 1016 CPUs with one
302 * 4MB page on sun4v architecture. Support for additional CPUs can be
303 * added with another 4MB page to 2040 cpus (or 3064 cpus with 2 additional
304 * 4MB pages). With additional 4MB pages, we cannot use displacement branch
305 * (ba instruction) and we have to use jmp instruction instead. Note that
306 * with sun4v, globals are nested (not per-trap type as in sun4u), so it is
307 * ok to use additional global reg to do jmp. This option is not available in
308 * sun4u which mandates the usage of displacement branches since no global reg
309 * is available at TL>1.
310 *
311 * TLB Statistics
312 *
313 * Because TLB misses are an important component of system performance, we wish
314 * to know much more about these traps than simply the number received.
315 * Specifically, we wish to know:
316 *
317 * (a) The amount of time spent executing the TLB miss handler
318 * (b) TLB misses versus TSB misses
319 * (c) Kernel-level misses versus user-level misses
320 * (d) Misses per pagesize
321 *
322 * TLB Statistics: Time Spent Executing
323 *
324 * To accurately determine the amount of time spent executing the TLB miss
325 * handler, one must get a timestamp on trap entry and trap exit, subtract the
326 * former from the latter, and add the result to an accumulating count.
327 * Consider flow of control during normal TLB miss processing (where "ldx
328 * [%g2], %g2" is an arbitrary TLB-missing instruction):
329 *
330 * + - - - - - - - -+
331 * : :
332 * : ldx [%g2], %g2 :<-------------------------------------------------------+
333 * : : Return from trap: |
334 * + - - - - - - - -+ TL <- TL - 1 (0) |
335 * | %pc <- TSTATE[TL].TPC (address of load) |
336 * | TLB miss: |
337 * | TL <- TL + 1 (1) |
338 * | %pc <- TLB-miss-trap-handler |
339 * | |
340 * v |
341 * + - - - - - - - - - - - - - - - + |
342 * : : |
343 * : Lookup VA in TSB : |
344 * : If (hit) : |
345 * : Fill TLB : |
346 * : Else : |
347 * : Lookup VA (hme hash table : |
348 * : or segkpm) : |
349 * : Fill TLB : |
350 * : Endif : |
351 * : Issue "retry" ---------------------------------------------------------+
352 * : :
353 * + - - - - - - - - - - - - - - - +
354 * TLB-miss-trap-handler
355 *
356 *
357 * As the above diagram indicates, interposing on the trap table allows one
358 * only to determine a timestamp on trap _entry_: when the TLB miss handler
359 * has completed filling the TLB, a "retry" will be issued, and control will
360 * transfer immediately back to the missing %pc.
361 *
362 * To obtain a timestamp on trap exit, we must then somehow interpose between
363 * the "retry" and the subsequent control transfer to the TLB-missing
364 * instruction. To do this, we _push_ a trap level. The basic idea is to
365 * spoof a TLB miss by raising TL, setting the %tpc to be within text
366 * controlled by trapstat (the "TLB return entry") and branching to the
367 * underlying TLB miss handler. When the TLB miss handler issues its "retry",
368 * control will transfer not to the TLB-missing instruction, but rather to the
369 * TLB return entry. This code can then obtain a timestamp, and issue its own
370 * "retry" -- thereby correctly returning to the TLB-missing instruction.
371 * Here is the above TLB miss flow control diagram modified to reflect
372 * trapstat's operation:
373 *
374 * + - - - - - - - -+
375 * : :
376 * : ldx [%g2], %g2 :<-------------------------------------------------------+
377 * : : Return from trap: |
378 * + - - - - - - - -+ TL <- TL - 1 (0) |
379 * | %pc <- TSTATE[TL].TPC (address of load) |
380 * | TLB miss: |
381 * | TL <- TL + 1 (1) |
382 * | %pc <- TLB-miss-trap-handler (trapstat) |
383 * | |
384 * v TLB-return-entry (trapstat) |
385 * + - - - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - + |
386 * : : : : |
387 * : Record timestamp : : Record timestamp : |
388 * : TL <- 2 : : Take timestamp difference : |
389 * : TSTATE[1].TPC <- TLB-return-entry : : Add to running total : |
390 * : ba,a TLB-miss-trap-handler -----------+ : Issue "retry" --------------+
391 * : : | : :
392 * + - - - - - - - - - - - - - - - - - - + | + - - - - - - - - - - - - - +
393 * TLB-miss-trap-handler | ^
394 * (trapstat) | |
395 * | |
396 * | |
397 * +-----------------------+ |
398 * | |
399 * | |
400 * v |
401 * + - - - - - - - - - - - - - - - + |
402 * : : |
403 * : Lookup VA in TSB : |
404 * : If (hit) : |
405 * : Fill TLB : |
406 * : Else : |
407 * : Lookup VA (hme hash table : |
408 * : or segkpm) : |
409 * : Fill TLB : |
410 * : Endif : |
411 * : Issue "retry" ------------------------------------------+
412 * : : Return from trap:
413 * + - - - - - - - - - - - - - - - + TL <- TL - 1 (1)
414 * TLB-miss-trap-handler %pc <- TSTATE[TL].TPC (TLB-return-entry)
415 *
416 *
417 * A final subterfuge is required to complete our artifice: if we miss in
418 * the TLB, the TSB _and_ the subsequent hash or segkpm lookup (that is, if
419 * there is no valid translation for the TLB-missing address), common system
420 * software will need to accurately determine the %tpc as part of its page
421 * fault handling. We therefore modify the kernel to check the %tpc in this
422 * case: if the %tpc falls within the VA range controlled by trapstat and
423 * the TL is 2, TL is simply lowered back to 1 (this check is implemented
424 * by the TSTAT_CHECK_TL1 macro). Lowering TL to 1 has the effect of
425 * discarding the state pushed by trapstat.
426 *
427 * TLB Statistics: TLB Misses versus TSB Misses
428 *
429 * Distinguishing TLB misses from TSB misses requires further interposition
430 * on the TLB miss handler: we cannot know a priori or a posteriori if a
431 * given VA will or has hit in the TSB.
432 *
433 * We achieve this distinction by adding a second TLB return entry almost
434 * identical to the first -- differing only in the address to which it
435 * stores its results. We then modify the TLB miss handlers of the kernel
436 * such that they check the %tpc when they determine that a TLB miss has
437 * subsequently missed in the TSB: if the %tpc lies within trapstat's VA
438 * range and TL is 2 (that is, if trapstat is running), the TLB miss handler
439 * _increments_ the %tpc by the size of the TLB return entry. The ensuing
440 * "retry" will thus transfer control to the second TLB return entry, and
441 * the time spent in the handler will be accumulated in a memory location
442 * specific to TSB misses.
443 *
444 * N.B.: To minimize the amount of knowledge the kernel must have of trapstat,
445 * we do not allow the kernel to hard-code the size of the TLB return entry.
446 * Rather, the actual tsbmiss handler executes a known instruction at the
447 * corresponding tsbmiss patch points (see the tstat_tsbmiss_patch_table) with
448 * the %tpc in %g7: when trapstat is not running, these points contain the
449 * harmless TSTAT_TSBMISS_INSTR instruction ("add %g7, 0, %g7"). Before
450 * running, trapstat modifies the instructions at these patch points such
451 * that the simm13 equals the size of the TLB return entry.
452 *
453 * TLB Statistics: Kernel-level Misses versus User-level Misses
454 *
455 * Differentiating user-level misses from kernel-level misses employs a
456 * similar technique, but is simplified by the ability to distinguish a
457 * user-level miss from a kernel-level miss a priori by reading the context
458 * register: we implement kernel-/user-level differentiation by again doubling
459 * the number of TLB return entries, and setting the %tpc to the appropriate
460 * TLB return entry in trapstat's TLB miss handler. Together with the doubling
461 * of entries required for TLB-miss/TSB-miss differentiation, this yields a
462 * total of four TLB return entries:
463 *
464 * Level TSB hit? Structure member
465 * ------------------------------------------------------------
466 * Kernel Yes tstat_tlbret_t.ttlbr_ktlb
467 * Kernel No tstat_tlbret_t.ttlbr_ktsb
468 * User Yes tstat_tlbret_t.ttlbr_utlb
469 * User No tstat_tlbret_t.ttlbr_utsb
470 *
471 * TLB Statistics: Misses per Pagesize
472 *
473 * As with the TLB-/TSB-miss differentiation, we have no way of determining
474 * pagesize a priori. This is therefore implemented by mandating a new rule:
475 * whenever the kernel fills the TLB in its TLB miss handler, the TTE
476 * corresponding to the TLB-missing VA must be in %g5 when the handler
477 * executes its "retry". This allows the TLB return entry to determine
478 * pagesize by simply looking at the pagesize field in the TTE stored in
479 * %g5.
480 *
481 * TLB Statistics: Probe Effect
482 *
483 * As one might imagine, gathering TLB statistics by pushing a trap level
484 * induces significant probe effect. To account for this probe effect,
485 * trapstat attempts to observe it by executing a code sequence with a known
486 * number of TLB misses both before and after interposing on the trap table.
487 * This allows trapstat to determine a per-trap probe effect which can then be
488 * factored into the "%tim" fields of the trapstat command.
489 *
490 * Note that on sun4v platforms, TLB misses are normally handled by the
491 * hypervisor or the hardware TSB walker. Thus no fast MMU miss information
492 * is reported for normal operation. However, when trapstat is invoked
493 * with -t or -T option to collect detailed TLB statistics, kernel takes
494 * over TLB miss handling. This results in significantly more overhead
495 * and TLB statistics may not be as accurate as on sun4u platforms.
496 * On some processors, hypervisor or hardware may provide a low overhead
497 * interface to collect TSB hit statistics. This support is exposed via
498 * a well defined CPU module interface (cpu_trapstat_conf to enable this
499 * interface and cpu_trapstat_data to get detailed TSB hit statistics).
500 * In this scenario, TSB miss statistics are collected by intercepting the
501 * IMMU_miss and DMMU_miss traps using above mentioned trap interposition
502 * approach.
503 *
504 * Locking
505 *
506 * The implementation uses two locks: tstat_lock (a local lock) and the global
507 * cpu_lock. tstat_lock is used to assure trapstat's consistency in the
508 * presence of multithreaded /dev/trapstat consumers (while as of this writing
509 * the only consumer of /dev/trapstat is single threaded, it is obviously
510 * necessary to correctly support multithreaded access). cpu_lock is held
511 * whenever CPUs are being manipulated directly, to prevent them from
512 * disappearing in the process. Because trapstat's DR callback
513 * (trapstat_cpu_setup()) must grab tstat_lock and is called with cpu_lock
514 * held, the lock ordering is necessarily cpu_lock before tstat_lock.
515 *
516 */
517 /* END CSTYLED */
518
519 static dev_info_t *tstat_devi; /* saved in xxattach() for xxinfo() */
520 static int tstat_open; /* set if driver is open */
521 static kmutex_t tstat_lock; /* serialize access */
522 static vmem_t *tstat_arena; /* arena for TLB-locked pages */
523 static tstat_percpu_t *tstat_percpu; /* per-CPU data */
524 static int tstat_running; /* set if trapstat is running */
525 static tstat_data_t *tstat_buffer; /* staging buffer for outgoing data */
526 static int tstat_options; /* bit-wise indication of options */
527 static int *tstat_enabled; /* map of enabled trap entries */
528 static int tstat_tsbmiss_patched; /* tsbmiss patch flag */
529 static callb_id_t tstat_cprcb; /* CPR callback */
530 static char *tstat_probe_area; /* VA range used for probe effect */
531 static caddr_t tstat_probe_phys; /* physical to back above VA */
532 static hrtime_t tstat_probe_time; /* time spent on probe effect */
533 static hrtime_t tstat_probe_before[TSTAT_PROBE_NLAPS];
534 static hrtime_t tstat_probe_after[TSTAT_PROBE_NLAPS];
535 static uint_t tstat_pgszs; /* # of kernel page sizes */
536 static uint_t tstat_user_pgszs; /* # of user page sizes */
537
538 /*
539 * sizeof tstat_data_t + pgsz data for the kernel. For simplicity's sake, when
540 * we collect data, we do it based upon szc, but when we report data back to
541 * userland, we have to do it based upon the userszc which may not match.
542 * So, these two variables are for internal use and exported use respectively.
543 */
544 static size_t tstat_data_t_size;
545 static size_t tstat_data_t_exported_size;
546
547 #ifndef sun4v
548
549 static size_t tstat_data_pages; /* number of pages of tstat data */
550 static size_t tstat_data_size; /* tstat data size in bytes */
551 static size_t tstat_total_pages; /* #data pages + #instr pages */
552 static size_t tstat_total_size; /* tstat data size + instr size */
553
554 #else /* sun4v */
555
556 static caddr_t tstat_va[TSTAT_NUM4M_LIMIT]; /* VAs of 4MB pages */
557 static pfn_t tstat_pfn[TSTAT_NUM4M_LIMIT]; /* PFNs of 4MB pages */
558 static boolean_t tstat_fast_tlbstat = B_FALSE;
559 static int tstat_traptab_initialized;
560 static int tstat_perm_mapping_failed;
561 static int tstat_hv_nopanic;
562 static int tstat_num4m_mapping;
563
564 #endif /* sun4v */
565
566 /*
567 * In the above block comment, see "TLB Statistics: TLB Misses versus
568 * TSB Misses" for an explanation of the tsbmiss patch points.
569 */
570 extern uint32_t tsbmiss_trapstat_patch_point;
571 extern uint32_t tsbmiss_trapstat_patch_point_kpm;
572 extern uint32_t tsbmiss_trapstat_patch_point_kpm_small;
573
574 /*
575 * Trapstat tsbmiss patch table
576 */
577 tstat_tsbmiss_patch_entry_t tstat_tsbmiss_patch_table[] = {
578 {(uint32_t *)&tsbmiss_trapstat_patch_point, 0},
579 {(uint32_t *)&tsbmiss_trapstat_patch_point_kpm, 0},
580 {(uint32_t *)&tsbmiss_trapstat_patch_point_kpm_small, 0},
581 {(uint32_t *)NULL, 0}
582 };
583
/*
 * General SPARC-specific helpers that make the instruction relocations
 * below easier to read.
 *
 *	NOP		encoding of the nop instruction
 *	HI22(v)		upper 22 bits of v taken as a 32-bit quantity
 *	LO10(v)		lower 10 bits of v taken as a 32-bit quantity
 *	LO12(v)		lower 12 bits of v taken as a 32-bit quantity
 *	DISP22(f, t)	distance from f to t in 4-byte words, truncated to
 *			22 bits (backward distances wrap within the field)
 *	ASI(asi)	asi shifted left by five bits
 */
#define	NOP		0x01000000
#define	HI22(v)		((uint32_t)(v) >> 10)
#define	LO10(v)		((uint32_t)(v) & ((1U << 10) - 1))
#define	LO12(v)		((uint32_t)(v) & ((1U << 12) - 1))
#define	DISP22(from, to)	\
	((((uintptr_t)(to) - (uintptr_t)(from)) >> 2) & ((1U << 22) - 1))
#define	ASI(asi)	((asi) << 5)
595
596 /*
597 * The interposing trap table must be locked in the I-TLB, and any data
598 * referred to in the interposing trap handler must be locked in the D-TLB.
599 * This function locks these pages in the appropriate TLBs by creating TTEs
600 * from whole cloth, and manually loading them into the TLB. This function is
601 * called from cross call context.
602 *
603 * On sun4v platforms, we use 4M page size mappings to minimize the number
604 * of locked down entries (i.e. permanent mappings). Each CPU uses a
605 * reserved portion of that 4M page for its TBA and data.
606 */
607 static void
trapstat_load_tlb(void)608 trapstat_load_tlb(void)
609 {
610 int i;
611 #ifdef sun4v
612 uint64_t ret;
613 #endif
614 tte_t tte;
615 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
616 caddr_t va = tcpu->tcpu_vabase;
617
618 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
619 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
620
621 #ifndef sun4v
622 for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
623 tte.tte_inthi = TTE_VALID_INT | TTE_SZ_INT(TTE8K) |
624 TTE_PFN_INTHI(tcpu->tcpu_pfn[i]);
625 if (i < TSTAT_INSTR_PAGES) {
626 tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
627 TTE_LCK_INT | TTE_CP_INT | TTE_PRIV_INT;
628 sfmmu_itlb_ld_kva(va, &tte);
629 } else {
630 tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
631 TTE_LCK_INT | TTE_CP_INT | TTE_CV_INT |
632 TTE_PRIV_INT | TTE_HWWR_INT;
633 sfmmu_dtlb_ld_kva(va, &tte);
634 }
635 }
636 #else /* sun4v */
637 for (i = 0; i < tstat_num4m_mapping; i++) {
638 tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(tstat_pfn[i]);
639 tte.tte_intlo = TTE_PFN_INTLO(tstat_pfn[i]) | TTE_CP_INT |
640 TTE_CV_INT | TTE_PRIV_INT | TTE_HWWR_INT |
641 TTE_SZ_INTLO(TTE4M);
642 ret = hv_mmu_map_perm_addr(va, KCONTEXT, *(uint64_t *)&tte,
643 MAP_ITLB | MAP_DTLB);
644
645 if (ret != H_EOK) {
646 if (tstat_hv_nopanic) {
647 int j;
648 /*
649 * The first attempt to create perm mapping
650 * failed. The guest might have exhausted its
651 * perm mapping limit. We don't panic on first
652 * try.
653 */
654 tstat_perm_mapping_failed = 1;
655 va = tcpu->tcpu_vabase;
656 for (j = 0; j < i; j++) {
657 (void) hv_mmu_unmap_perm_addr(va,
658 KCONTEXT, MAP_ITLB | MAP_DTLB);
659 va += MMU_PAGESIZE4M;
660 }
661 break;
662 }
663 /*
664 * We failed on subsequent cpus trying to
665 * create the same perm mappings. This
666 * should not happen. Panic here.
667 */
668 cmn_err(CE_PANIC, "trapstat: cannot create "
669 "perm mappings for cpu %d "
670 "(error: 0x%lx)", CPU->cpu_id, ret);
671 }
672 va += MMU_PAGESIZE4M;
673 }
674 #endif /* sun4v */
675 }
676
677 /*
678 * As mentioned in the "TLB Statistics: TLB Misses versus TSB Misses" section
679 * of the block comment, TLB misses are differentiated from TSB misses in
680 * part by hot-patching the instructions at the tsbmiss patch points (see
681 * tstat_tsbmiss_patch_table). This routine is used both to initially patch
682 * the instructions, and to patch them back to their original values upon
683 * restoring the original trap table.
684 */
685 static void
trapstat_hotpatch()686 trapstat_hotpatch()
687 {
688 uint32_t instr;
689 uint32_t simm13;
690 tstat_tsbmiss_patch_entry_t *ep;
691
692 ASSERT(MUTEX_HELD(&tstat_lock));
693
694 if (!(tstat_options & TSTAT_OPT_TLBDATA))
695 return;
696
697 if (!tstat_tsbmiss_patched) {
698 /*
699 * We haven't patched the TSB paths; do so now.
700 */
701 /*CONSTCOND*/
702 ASSERT(offsetof(tstat_tlbret_t, ttlbr_ktsb) -
703 offsetof(tstat_tlbret_t, ttlbr_ktlb) ==
704 offsetof(tstat_tlbret_t, ttlbr_utsb) -
705 offsetof(tstat_tlbret_t, ttlbr_utlb));
706
707 simm13 = offsetof(tstat_tlbret_t, ttlbr_ktsb) -
708 offsetof(tstat_tlbret_t, ttlbr_ktlb);
709
710 for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
711 ASSERT(ep->tpe_instr == 0);
712 instr = ep->tpe_instr = *ep->tpe_addr;
713
714 /*
715 * Assert that the instruction we're about to patch is
716 * "add %g7, 0, %g7" (0x8e01e000).
717 */
718 ASSERT(instr == TSTAT_TSBMISS_INSTR);
719
720 instr |= simm13;
721 hot_patch_kernel_text((caddr_t)ep->tpe_addr,
722 instr, sizeof (instr));
723 }
724
725 tstat_tsbmiss_patched = 1;
726
727 } else {
728 /*
729 * Remove patches from the TSB paths.
730 */
731 for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
732 ASSERT(ep->tpe_instr == TSTAT_TSBMISS_INSTR);
733 hot_patch_kernel_text((caddr_t)ep->tpe_addr,
734 ep->tpe_instr, sizeof (instr));
735 ep->tpe_instr = 0;
736 }
737
738 tstat_tsbmiss_patched = 0;
739 }
740 }
741
742 /*
743 * This is the routine executed to clock the performance of the trap table,
744 * executed both before and after interposing on the trap table to attempt to
745 * determine probe effect. The probe effect is used to adjust the "%tim"
746 * fields of trapstat's -t and -T output; we only use TLB misses to clock the
747 * trap table. We execute the inner loop (which is designed to exceed the
748 * TLB's reach) nlaps times, taking the best time as our time (thereby
749 * factoring out the effects of interrupts, cache misses or other perturbing
 * events).
751 */
752 static hrtime_t
trapstat_probe_laps(int nlaps,hrtime_t * buf)753 trapstat_probe_laps(int nlaps, hrtime_t *buf)
754 {
755 int i, j = 0;
756 hrtime_t ts, best = INT64_MAX;
757
758 while (nlaps--) {
759 ts = rdtick();
760
761 for (i = 0; i < TSTAT_PROBE_SIZE; i += MMU_PAGESIZE)
762 *((volatile char *)&tstat_probe_area[i]);
763
764 if ((ts = rdtick() - ts) < best)
765 best = ts;
766 buf[j++] = ts;
767 }
768
769 return (best);
770 }
771
772 /*
773 * This routine determines the probe effect by calling trapstat_probe_laps()
774 * both without and with the interposing trap table. Note that this is
775 * called from a cross call on the desired CPU, and that it is called on
776 * every CPU (this is necessary because the probe effect may differ from
777 * one CPU to another).
778 */
779 static void
trapstat_probe()780 trapstat_probe()
781 {
782 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
783 hrtime_t before, after;
784
785 if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
786 return;
787
788 if (tstat_probe_area == NULL || (tstat_options & TSTAT_OPT_NOGO))
789 return;
790
791 /*
792 * We very much expect the %tba to be KERNELBASE; this is a
793 * precautionary measure to assure that trapstat doesn't melt the
794 * machine should the %tba point unexpectedly elsewhere.
795 */
796 if (get_tba() != (caddr_t)KERNELBASE)
797 return;
798
799 /*
800 * Preserve this CPU's data before destroying it by enabling the
801 * interposing trap table. We can safely use tstat_buffer because
802 * the caller of the trapstat_probe() cross call is holding tstat_lock.
803 */
804 #ifdef sun4v
805 bcopy(tcpu->tcpu_data, tstat_buffer, TSTAT_DATA_SIZE);
806 #else
807 bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
808 #endif
809
810 tstat_probe_time = gethrtime();
811
812 before = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_before);
813 (void) set_tba(tcpu->tcpu_ibase);
814
815 after = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_after);
816 (void) set_tba((caddr_t)KERNELBASE);
817
818 tstat_probe_time = gethrtime() - tstat_probe_time;
819
820 #ifdef sun4v
821 bcopy(tstat_buffer, tcpu->tcpu_data, TSTAT_DATA_SIZE);
822 tcpu->tcpu_tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES;
823 #else
824 bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
825 tcpu->tcpu_data->tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES;
826 #endif
827 }
828
829 static void
trapstat_probe_alloc()830 trapstat_probe_alloc()
831 {
832 pfn_t pfn;
833 caddr_t va;
834 int i;
835
836 ASSERT(MUTEX_HELD(&tstat_lock));
837 ASSERT(tstat_probe_area == NULL);
838 ASSERT(tstat_probe_phys == NULL);
839
840 if (!(tstat_options & TSTAT_OPT_TLBDATA))
841 return;
842
843 /*
844 * Grab some virtual from the heap arena.
845 */
846 tstat_probe_area = vmem_alloc(heap_arena, TSTAT_PROBE_SIZE, VM_SLEEP);
847 va = tstat_probe_area;
848
849 /*
850 * Grab a single physical page.
851 */
852 tstat_probe_phys = vmem_alloc(tstat_arena, MMU_PAGESIZE, VM_SLEEP);
853 pfn = hat_getpfnum(kas.a_hat, tstat_probe_phys);
854
855 /*
856 * Now set the translation for every page in our virtual range
857 * to be our allocated physical page.
858 */
859 for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
860 hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn, PROT_READ,
861 HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
862 va += MMU_PAGESIZE;
863 }
864 }
865
866 static void
trapstat_probe_free()867 trapstat_probe_free()
868 {
869 caddr_t va;
870 int i;
871
872 ASSERT(MUTEX_HELD(&tstat_lock));
873
874 if ((va = tstat_probe_area) == NULL)
875 return;
876
877 for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
878 hat_unload(kas.a_hat, va, MMU_PAGESIZE, HAT_UNLOAD_UNLOCK);
879 va += MMU_PAGESIZE;
880 }
881
882 vmem_free(tstat_arena, tstat_probe_phys, MMU_PAGESIZE);
883 vmem_free(heap_arena, tstat_probe_area, TSTAT_PROBE_SIZE);
884
885 tstat_probe_phys = NULL;
886 tstat_probe_area = NULL;
887 }
888
889 /*
890 * This routine actually enables a CPU by setting its %tba to be the
891 * CPU's interposing trap table. It is called out of cross call context.
892 */
893 static void
trapstat_enable()894 trapstat_enable()
895 {
896 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
897
898 if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
899 return;
900
901 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
902 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
903
904 if (get_tba() != (caddr_t)KERNELBASE)
905 return;
906
907 if (!(tstat_options & TSTAT_OPT_NOGO))
908 (void) set_tba(tcpu->tcpu_ibase);
909 tcpu->tcpu_flags |= TSTAT_CPU_ENABLED;
910 #ifdef sun4v
911 if ((tstat_options & TSTAT_OPT_TLBDATA) &&
912 !(tstat_options & TSTAT_OPT_NOGO)) {
913 if (tstat_fast_tlbstat) {
914 /*
915 * Invoke processor specific interface to enable
916 * collection of TSB hit statistics.
917 */
918 (void) cpu_trapstat_conf(CPU_TSTATCONF_ENABLE);
919 } else {
920 /*
921 * Collect TLB miss statistics by taking over
922 * TLB miss handling from the hypervisor. This
923 * is done by telling the hypervisor that there
924 * is no TSB configured. Also set TSTAT_TLB_STATS
925 * flag so that no user TSB is configured during
926 * context switch time.
927 */
928 cpu_t *cp = CPU;
929
930 cp->cpu_m.cpu_tstat_flags |= TSTAT_TLB_STATS;
931 (void) hv_set_ctx0(NULL, NULL);
932 (void) hv_set_ctxnon0(NULL, NULL);
933 }
934 }
935 #endif
936 }
937
938 /*
939 * This routine disables a CPU (vis a vis trapstat) by setting its %tba to be
940 * the actual, underlying trap table. It is called out of cross call context.
941 */
942 static void
trapstat_disable()943 trapstat_disable()
944 {
945 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
946
947 if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
948 return;
949
950 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
951 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
952
953 if (!(tstat_options & TSTAT_OPT_NOGO))
954 (void) set_tba((caddr_t)KERNELBASE);
955
956 tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
957
958 #ifdef sun4v
959 if ((tstat_options & TSTAT_OPT_TLBDATA) &&
960 !(tstat_options & TSTAT_OPT_NOGO)) {
961 if (tstat_fast_tlbstat) {
962 /*
963 * Invoke processor specific interface to disable
964 * collection of TSB hit statistics on each processor.
965 */
966 (void) cpu_trapstat_conf(CPU_TSTATCONF_DISABLE);
967 } else {
968 /*
969 * As part of collecting TLB miss statistics, we took
970 * over TLB miss handling from the hypervisor by
971 * telling the hypervisor that NO TSB is configured.
972 * We need to restore that by communicating proper
973 * kernel/user TSB information so that TLB misses
974 * can be handled by the hypervisor or the hardware
975 * more efficiently.
976 *
977 * We restore kernel TSB information right away.
978 * However, to minimize any locking dependency, we
979 * don't restore user TSB information right away.
980 * Instead, we simply clear the TSTAT_TLB_STATS flag
981 * so that the user TSB information is automatically
982 * restored on next context switch.
983 *
984 * Note that the call to restore kernel TSB information
985 * will normally not fail, unless wrong information is
986 * passed here. In that scenario, system will still
987 * continue to function properly with the exception of
988 * kernel handling all the TLB misses.
989 */
990 struct hv_tsb_block *hvbp = &ksfmmup->sfmmu_hvblock;
991 cpu_t *cp = CPU;
992
993 cp->cpu_m.cpu_tstat_flags &= ~TSTAT_TLB_STATS;
994 (void) hv_set_ctx0(hvbp->hv_tsb_info_cnt,
995 hvbp->hv_tsb_info_pa);
996 }
997 }
998 #endif
999 }
1000
1001 /*
1002 * We use %tick as the time base when recording the time spent executing
1003 * the trap handler. %tick, however, is not necessarily kept in sync
1004 * across CPUs (indeed, different CPUs may have different %tick frequencies).
1005 * We therefore cross call onto a CPU to get a snapshot of its data to
1006 * copy out; this is the routine executed out of that cross call.
1007 */
1008 static void
trapstat_snapshot()1009 trapstat_snapshot()
1010 {
1011 tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
1012 tstat_data_t *data = tcpu->tcpu_data;
1013
1014 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1015 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1016 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ENABLED);
1017
1018 #ifndef sun4v
1019 data->tdata_snapts = gethrtime();
1020 data->tdata_snaptick = rdtick();
1021 bcopy(data, tstat_buffer, tstat_data_t_size);
1022 #else
1023 /*
1024 * For sun4v, in order to conserve space in the limited
1025 * per-cpu 4K buffer, we derive certain info somewhere else and
1026 * copy them directly into the tstat_buffer output.
1027 * Note that we either are collecting tlb stats or
1028 * regular trapstats but never both.
1029 */
1030 tstat_buffer->tdata_cpuid = CPU->cpu_id;
1031 tstat_buffer->tdata_peffect = tcpu->tcpu_tdata_peffect;
1032 tstat_buffer->tdata_snapts = gethrtime();
1033 tstat_buffer->tdata_snaptick = rdtick();
1034
1035 if (tstat_options & TSTAT_OPT_TLBDATA) {
1036 /* Copy tlb/tsb stats collected in the per-cpu trapdata */
1037 tstat_tdata_t *tdata = (tstat_tdata_t *)data;
1038 bcopy(&tdata->tdata_pgsz[0],
1039 &tstat_buffer->tdata_pgsz[0],
1040 tstat_pgszs * sizeof (tstat_pgszdata_t));
1041
1042 /*
1043 * Invoke processor specific interface to collect TLB stats
1044 * on each processor if enabled.
1045 */
1046 if (tstat_fast_tlbstat) {
1047 cpu_trapstat_data((void *) tstat_buffer->tdata_pgsz,
1048 tstat_pgszs);
1049 }
1050 } else {
1051 /*
1052 * Normal trapstat collection.
1053 * Copy all the 4K data area into tstat_buffer tdata_trap
1054 * area.
1055 */
1056 bcopy(data, &tstat_buffer->tdata_traps[0], TSTAT_DATA_SIZE);
1057 }
1058 #endif /* sun4v */
1059 }
1060
1061 /*
1062 * The TSTAT_RETENT_* constants define offsets in the TLB return entry.
1063 * They are used only in trapstat_tlbretent() (below) and #undef'd
1064 * immediately afterwards. Any change to "retent" in trapstat_tlbretent()
1065 * will likely require changes to these constants.
1066 */
1067
/*
 * Each value is the index, counted in 32-bit instruction words, of the
 * retent[] instruction in trapstat_tlbretent() that receives the patch; the
 * trailing comment names the instruction found at that index.
 */
#ifndef sun4v
#define	TSTAT_RETENT_STATHI	1	/* sethi %hi(stat), %g1 */
#define	TSTAT_RETENT_STATLO	2	/* or %g1, %lo(stat), %g1 */
#define	TSTAT_RETENT_SHIFT	11	/* sll %g4, shift, %g4 */
#define	TSTAT_RETENT_COUNT_LD	13	/* ldx [%g1 + tmiss_count], %g2 */
#define	TSTAT_RETENT_COUNT_ST	15	/* stx %g2, [%g1 + tmiss_count] */
#define	TSTAT_RETENT_TMPTSHI	16	/* sethi %hi(tdata_tmptick), %g6 */
#define	TSTAT_RETENT_TMPTSLO	17	/* ldx [%g6 + %lo(tdata_tmptick)] */
#define	TSTAT_RETENT_TIME_LD	19	/* ldx [%g1 + tmiss_time], %g2 */
#define	TSTAT_RETENT_TIME_ST	21	/* stx %g2, [%g1 + tmiss_time] */
#else /* sun4v */
#define	TSTAT_RETENT_TDATASHFT	2	/* sllx %g7, TSTAT_DATA_SHIFT, %g7 */
#define	TSTAT_RETENT_STATHI	4	/* sethi %hi(stat), %g1 */
#define	TSTAT_RETENT_STATLO	6	/* or %g1, %lo(stat), %g1 */
#define	TSTAT_RETENT_SHIFT	9	/* sll %g4, shift, %g4 */
#define	TSTAT_RETENT_COUNT_LD	11	/* ldx [%g1 + tmiss_count], %g2 */
#define	TSTAT_RETENT_COUNT_ST	13	/* stx %g2, [%g1 + tmiss_count] */
#define	TSTAT_RETENT_TMPTSHI	14	/* sethi %hi(tdata_tmptick), %g6 */
#define	TSTAT_RETENT_TMPTSLO	16	/* ldx [%g6 + %lo(tdata_tmptick)] */
#define	TSTAT_RETENT_TIME_LD	18	/* ldx [%g1 + tmiss_time], %g2 */
#define	TSTAT_RETENT_TIME_ST	20	/* stx %g2, [%g1 + tmiss_time] */
#endif /* sun4v */
1090
/*
 * Fill in one TLB return entry.  The retent[] instruction template is
 * copied into ret->ttlbrent_instr, and the immediates that depend on this
 * CPU and on this particular entry are then OR'd into the instructions
 * selected by the TSTAT_RETENT_* indices.
 *
 *	tcpu	per-CPU trapstat state; supplies tcpu_ibase and tcpu_data
 *	ret	the return entry whose instructions are written
 *	data	the tstat_missdata_t whose tmiss_count and tmiss_time the
 *		generated code updates
 *
 * The caller must hold tstat_lock.
 */
static void
trapstat_tlbretent(tstat_percpu_t *tcpu, tstat_tlbretent_t *ret,
    tstat_missdata_t *data)
{
	uint32_t *ent = ret->ttlbrent_instr, shift;
	uintptr_t base;
#ifndef sun4v
	uintptr_t tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
#else
	uintptr_t tmptick = TSTAT_CPU0_TLBDATA_OFFS(tcpu, tdata_tmptick);
#endif

	/*
	 * This is the entry executed upon return from the TLB/TSB miss
	 * handler (i.e. the code interpositioned between the "retry" and
	 * the actual return to the TLB-missing instruction).  Detail on its
	 * theory of operation can be found in the "TLB Statistics" section
	 * of the block comment.  Note that we expect the TTE just loaded
	 * into the TLB to be in %g5; all other globals are available as
	 * scratch.  Finally, note that the page size information in sun4v is
	 * located in the lower bits of the TTE -- requiring us to have a
	 * different return entry on sun4v.
	 */
	static const uint32_t retent[TSTAT_TLBRET_NINSTR] = {
#ifndef sun4v
	    0x87410000,		/* rd    %tick, %g3			*/
	    0x03000000,		/* sethi %hi(stat), %g1			*/
	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
	    0x89297001,		/* sllx  %g5, 1, %g4			*/
	    0x8931303e,		/* srlx  %g4, 62, %g4			*/
	    0x8531702e,		/* srlx  %g5, 46, %g2			*/
	    0x8408a004,		/* and   %g2, 4, %g2			*/
	    0x88110002,		/* or    %g4, %g2, %g4			*/
	    0x80a12005,		/* cmp   %g4, 5				*/
	    0x34400002,		/* bg,a,pn %icc, +8			*/
	    0x88102004,		/* mov   4, %g4				*/
	    0x89292000,		/* sll   %g4, shift, %g4		*/
	    0x82004004,		/* add   %g1, %g4, %g1			*/
	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
	    0x8400a001,		/* add   %g2, 1, %g2			*/
	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
	    0x0d000000,		/* sethi %hi(tdata_tmptick), %g6	*/
	    0xc459a000,		/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
	    0x84008003,		/* add   %g2, %g3, %g2			*/
	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
	    0x83f00000		/* retry				*/
#else /* sun4v */
	    0x82102008,		/* mov   SCRATCHPAD_CPUID, %g1		*/
	    0xced84400,		/* ldxa  [%g1]ASI_SCRATCHPAD, %g7	*/
	    0x8f29f000,		/* sllx  %g7, TSTAT_DATA_SHIFT, %g7	*/
	    0x87410000,		/* rd    %tick, %g3			*/
	    0x03000000,		/* sethi %hi(stat), %g1			*/
	    0x82004007,		/* add   %g1, %g7, %g1			*/
	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
	    0x8929703d,		/* sllx  %g5, 61, %g4			*/
	    0x8931303d,		/* srlx  %g4, 61, %g4			*/
	    0x89292000,		/* sll   %g4, shift, %g4		*/
	    0x82004004,		/* add   %g1, %g4, %g1			*/
	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
	    0x8400a001,		/* add   %g2, 1, %g2			*/
	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
	    0x0d000000,		/* sethi %hi(tdata_tmptick), %g6	*/
	    0x8c018007,		/* add   %g6, %g7, %g6			*/
	    0xc459a000,		/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
	    0x84008003,		/* add   %g2, %g3, %g2			*/
	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
	    0x83f00000		/* retry				*/
#endif /* sun4v */
	};

	ASSERT(MUTEX_HELD(&tstat_lock));
	/*
	 * The member offsets are OR'd straight into load/store immediates
	 * below, so neither may exceed LO10(-1).
	 */
	/*CONSTCOND*/
	ASSERT(offsetof(tstat_missdata_t, tmiss_count) <= LO10(-1));
	/*CONSTCOND*/
	ASSERT(offsetof(tstat_missdata_t, tmiss_time) <= LO10(-1));
	/*
	 * sizeof (tstat_pgszdata_t) must be a power of two; the loop that
	 * follows only terminates once (1 << shift) equals it exactly.
	 */
	/*CONSTCOND*/
	ASSERT(!((sizeof (tstat_pgszdata_t) - 1) & sizeof (tstat_pgszdata_t)));

	/* shift = log2(sizeof (tstat_pgszdata_t)) */
	for (shift = 1; (1 << shift) != sizeof (tstat_pgszdata_t); shift++)
		continue;

	/*
	 * Rebase `data' from its offset within tcpu_data to the same offset
	 * past tcpu_ibase + TSTAT_INSTR_SIZE -- presumably the address at
	 * which the data area is mapped behind the interposing instructions
	 * (confirm against the TSTAT_*_OFFS macros).
	 */
	base = (uintptr_t)tcpu->tcpu_ibase + TSTAT_INSTR_SIZE +
	    ((uintptr_t)data - (uintptr_t)tcpu->tcpu_data);

	/* Lay down the template, then patch the immediates into the copy. */
	bcopy(retent, ent, sizeof (retent));

#if defined(sun4v)
	ent[TSTAT_RETENT_TDATASHFT] |= LO10((uintptr_t)TSTAT_DATA_SHIFT);
#endif
	ent[TSTAT_RETENT_STATHI] |= HI22(base);
	ent[TSTAT_RETENT_STATLO] |= LO10(base);
	ent[TSTAT_RETENT_SHIFT] |= shift;
	/* LINTED E_EXPR_NULL_EFFECT */
	ent[TSTAT_RETENT_COUNT_LD] |= offsetof(tstat_missdata_t, tmiss_count);
	/* LINTED E_EXPR_NULL_EFFECT */
	ent[TSTAT_RETENT_COUNT_ST] |= offsetof(tstat_missdata_t, tmiss_count);
	ent[TSTAT_RETENT_TMPTSHI] |= HI22(tmptick);
	ent[TSTAT_RETENT_TMPTSLO] |= LO10(tmptick);
	ent[TSTAT_RETENT_TIME_LD] |= offsetof(tstat_missdata_t, tmiss_time);
	ent[TSTAT_RETENT_TIME_ST] |= offsetof(tstat_missdata_t, tmiss_time);
}
1196
/*
 * Retire the TSTAT_RETENT_* indices now that trapstat_tlbretent() is behind
 * us.  TSTAT_RETENT_TDATASHFT is only ever defined for sun4v.
 */
#if defined(sun4v)
#undef	TSTAT_RETENT_TDATASHFT
#endif
#undef	TSTAT_RETENT_STATHI
#undef	TSTAT_RETENT_STATLO
#undef	TSTAT_RETENT_SHIFT
#undef	TSTAT_RETENT_COUNT_LD
#undef	TSTAT_RETENT_COUNT_ST
#undef	TSTAT_RETENT_TMPTSHI
#undef	TSTAT_RETENT_TMPTSLO
#undef	TSTAT_RETENT_TIME_LD
#undef	TSTAT_RETENT_TIME_ST
1209
1210 /*
1211 * The TSTAT_TLBENT_* constants define offsets in the TLB entry. They are
1212 * used only in trapstat_tlbent() (below) and #undef'd immediately afterwards.
1213 * Any change to "tlbent" in trapstat_tlbent() will likely require changes
1214 * to these constants.
1215 */
1216
/*
 * Each value is the index, counted in 32-bit instruction words, of the
 * tlbent[] instruction in trapstat_tlbent() that receives the patch; the
 * trailing comment names the instruction found at that index.
 */
#ifndef sun4v
#define	TSTAT_TLBENT_STATHI	0	/* sethi %hi(stat), %g1 */
#define	TSTAT_TLBENT_STATLO_LD	1	/* ldx [%g1 + %lo(stat)], %g2 */
#define	TSTAT_TLBENT_STATLO_ST	3	/* stx %g2, [%g1 + %lo(stat)] */
#define	TSTAT_TLBENT_MMUASI	15	/* ldxa [%g0]ASI_MMU, %g1 */
#define	TSTAT_TLBENT_TPCHI	18	/* sethi %hi(new_tpc), %g1 */
#define	TSTAT_TLBENT_TPCLO_USER	19	/* or %g1, %lo(new_tpc), %g1 (user) */
#define	TSTAT_TLBENT_TPCLO_KERN	21	/* or %g1, %lo(new_tpc), %g1 (kern) */
#define	TSTAT_TLBENT_TSHI	25	/* sethi %hi(tmptick), %g1 */
#define	TSTAT_TLBENT_TSLO	27	/* stx %g2, [%g1 + %lo(tmptick)] */
#define	TSTAT_TLBENT_BA		28	/* ba,a addr */
#else /* sun4v */
#define	TSTAT_TLBENT_TDATASHFT	2	/* sllx %g4, TSTAT_DATA_SHIFT, %g4 */
#define	TSTAT_TLBENT_STATHI	3	/* sethi %hi(stat), %g1 */
#define	TSTAT_TLBENT_STATLO_LD	5	/* ldx [%g1 + %lo(stat)], %g2 */
#define	TSTAT_TLBENT_STATLO_ST	7	/* stx %g2, [%g1 + %lo(stat)] */
#define	TSTAT_TLBENT_TAGTARGET	23	/* ldx [%g1 + MMFSA_?_CTX], %g1 */
#define	TSTAT_TLBENT_TPCHI	25	/* sethi %hi(new_tpc), %g1 */
#define	TSTAT_TLBENT_TPCLO_USER	26	/* or %g1, %lo(new_tpc), %g1 (user) */
#define	TSTAT_TLBENT_TPCLO_KERN	28	/* or %g1, %lo(new_tpc), %g1 (kern) */
#define	TSTAT_TLBENT_TSHI	32	/* sethi %hi(tmptick), %g1 */
#define	TSTAT_TLBENT_TSLO	35	/* stx %g2, [%g1 + %lo(tmptick)] */
#define	TSTAT_TLBENT_ADDRHI	36	/* sethi %hi(addr), %g2 */
#define	TSTAT_TLBENT_ADDRLO	37	/* or %g2, %lo(addr), %g2 */
#endif /* sun4v */
1242
/*
 * Build the interposing handler for the TLB (or, on sun4v, MMU) miss trap
 * table entry `entno' on behalf of `tcpu', then build the four TLB return
 * entries (kernel/user by TLB/TSB) that go with it.  The tlbent[]
 * instruction template is copied into place and the immediates that depend
 * on this CPU and entry are OR'd into the instructions selected by the
 * TSTAT_TLBENT_* indices.
 *
 * On sun4v the trap table slot itself only receives a "ba,a" to the handler,
 * which is placed in tinst_immumiss or tinst_dmmumiss instead.
 *
 * The caller must hold tstat_lock.
 */
static void
trapstat_tlbent(tstat_percpu_t *tcpu, int entno)
{
	uint32_t *ent;
	uintptr_t orig, va;
#ifndef sun4v
	uintptr_t baoffs;
	int itlb = entno == TSTAT_ENT_ITLBMISS;
	uint32_t asi = itlb ? ASI(ASI_IMMU) : ASI(ASI_DMMU);
#else
	int itlb = (entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_ITLBMISS);
	uint32_t tagtarget_off = itlb ? MMFSA_I_CTX : MMFSA_D_CTX;
	uint32_t *tent;			/* MMU trap vector entry */
	uintptr_t tentva;		/* MMU trap vector entry va */
	static const uint32_t mmumiss[TSTAT_ENT_NINSTR] = {
	    0x30800000,			/* ba,a addr */
	    NOP, NOP, NOP, NOP, NOP, NOP, NOP
	};
#endif
	int entoffs = entno << TSTAT_ENT_SHIFT;
	uintptr_t tmptick, stat, tpc, utpc;
	tstat_pgszdata_t *data;
	tstat_tlbdata_t *udata, *kdata;
	tstat_tlbret_t *ret;

#ifdef sun4v
	data = &((tstat_tdata_t *)tcpu->tcpu_data)->tdata_pgsz[0];
#else
	data = &tcpu->tcpu_data->tdata_pgsz[0];
#endif /* sun4v */

	/*
	 * When trapstat is run with TLB statistics, this is the entry for
	 * both I- and D-TLB misses; this code performs trap level pushing,
	 * as described in the "TLB Statistics" section of the block comment.
	 * This code is executing at TL 1; %tstate[0] contains the saved
	 * state at the time of the TLB miss.  Pushing trap level 1 (and thus
	 * raising TL to 2) requires us to fill in %tstate[1] with our %pstate,
	 * %cwp and %asi.  We leave %tt unchanged, and we set %tpc and %tnpc to
	 * the appropriate TLB return entry (based on the context of the miss).
	 * Finally, we sample %tick, and stash it in the tdata_tmptick member
	 * of the per-CPU tstat_data structure.  tdata_tmptick will be used in
	 * the TLB return entry to determine the amount of time spent in the
	 * TLB miss handler.
	 *
	 * Note that on sun4v platforms, we must obtain the context information
	 * from the MMU fault status area. (The base address of this MMU fault
	 * status area is kept in the scratchpad register 0.)
	 */
	static const uint32_t tlbent[] = {
#ifndef sun4v
	    0x03000000,			/* sethi %hi(stat), %g1		*/
	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
	    0x8400a001,			/* add   %g2, 1, %g2		*/
	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
	    0x85524000,			/* rdpr  %cwp, %g2		*/
	    0x87518000,			/* rdpr  %pstate, %g3		*/
	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
	    0x84108003,			/* or    %g2, %g3, %g2		*/
	    0x8740c000,			/* rd    %asi, %g3		*/
	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
	    0x84108003,			/* or    %g2, %g3, %g2		*/
	    0x8350c000,			/* rdpr  %tt, %g1		*/
	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
	    0xc2d80000,			/* ldxa  [%g0]ASI_MMU, %g1	*/
	    0x83307030,			/* srlx  %g1, CTXSHIFT, %g1	*/
	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
	    0x03000000,			/* sethi %hi(new_tpc), %g1	*/
	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
	    0x30800002,			/* ba,a  .+0x8			*/
	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
	    0x82006004,			/* add   %g1, 4, %g1		*/
	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
	    0x03000000,			/* sethi %hi(tmptick), %g1	*/
	    0x85410000,			/* rd    %tick, %g2		*/
	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
	    0x30800000,			/* ba,a  addr			*/
	    NOP, NOP, NOP
#else /* sun4v */
	    0x82102008,			/* mov SCRATCHPAD_CPUID, %g1	*/
	    0xc8d84400,			/* ldxa [%g1]ASI_SCRATCHPAD, %g4 */
	    0x89293000,			/* sllx %g4, TSTAT_DATA_SHIFT, %g4 */
	    0x03000000,			/* sethi %hi(stat), %g1		*/
	    0x82004004,			/* add %g1, %g4, %g1		*/
	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
	    0x8400a001,			/* add   %g2, 1, %g2		*/
	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
	    0x85524000,			/* rdpr  %cwp, %g2		*/
	    0x87518000,			/* rdpr  %pstate, %g3		*/
	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
	    0x84108003,			/* or    %g2, %g3, %g2		*/
	    0x8740c000,			/* rd    %asi, %g3		*/
	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
	    0x83540000,			/* rdpr  %gl, %g1		*/
	    0x83287028,			/* sllx  %g1, 40, %g1		*/
	    0x86104003,			/* or    %g1, %g3, %g3		*/
	    0x84108003,			/* or    %g2, %g3, %g2		*/
	    0x8350c000,			/* rdpr  %tt, %g1		*/
	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
	    0xc2d80400,			/* ldxa  [%g0]ASI_SCRATCHPAD, %g1 */
	    0xc2586000,			/* ldx  [%g1 + MMFSA_?_CTX], %g1 */
	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
	    0x03000000,			/* sethi %hi(new_tpc), %g1	*/
	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
	    0x30800002,			/* ba,a  .+0x8			*/
	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
	    0x82006004,			/* add   %g1, 4, %g1		*/
	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
	    0x03000000,			/* sethi %hi(tmptick), %g1	*/
	    0x82004004,			/* add %g1, %g4, %g1		*/
	    0x85410000,			/* rd    %tick, %g2		*/
	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
	    0x05000000,			/* sethi %hi(addr), %g2		*/
	    0x8410a000,			/* or %g2, %lo(addr), %g2	*/
	    0x81c08000,			/* jmp %g2			*/
	    NOP,
#endif /* sun4v */
	};

	ASSERT(MUTEX_HELD(&tstat_lock));
#ifndef sun4v
	ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS);

	stat = TSTAT_DATA_OFFS(tcpu, tdata_traps) + entoffs;
	tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
#else /* sun4v */
	ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS ||
	    entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_DMMUMISS);

	stat = TSTAT_CPU0_TLBDATA_OFFS(tcpu, tdata_traps[entno]);
	tmptick = TSTAT_CPU0_TLBDATA_OFFS(tcpu, tdata_tmptick);
#endif /* sun4v */

	/*
	 * Pick the instruction-side or data-side return entries and
	 * statistics; tpc starts out as the address of the kernel TLB
	 * return entry.
	 */
	if (itlb) {
		ret = &tcpu->tcpu_instr->tinst_itlbret;
		udata = &data->tpgsz_user.tmode_itlb;
		kdata = &data->tpgsz_kernel.tmode_itlb;
		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_itlbret.ttlbr_ktlb);
	} else {
		ret = &tcpu->tcpu_instr->tinst_dtlbret;
		udata = &data->tpgsz_user.tmode_dtlb;
		kdata = &data->tpgsz_kernel.tmode_dtlb;
		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_dtlbret.ttlbr_ktlb);
	}

	/* The user TLB return entry lives in the same tstat_tlbret_t. */
	utpc = tpc + offsetof(tstat_tlbret_t, ttlbr_utlb) -
	    offsetof(tstat_tlbret_t, ttlbr_ktlb);

	/*
	 * The template has a single "sethi %hi(new_tpc)" feeding both the
	 * user and the kernel "or", so the two addresses must agree in
	 * their upper 22 bits.
	 */
	ASSERT(HI22(tpc) == HI22(utpc));

	/*
	 * ent is where the entry gets written, va is entoffs past
	 * tcpu_ibase, and orig is entoffs past KERNELBASE, i.e. the
	 * corresponding entry of the actual trap table.
	 */
	ent = (uint32_t *)((uintptr_t)tcpu->tcpu_instr + entoffs);
	orig = KERNELBASE + entoffs;
	va = (uintptr_t)tcpu->tcpu_ibase + entoffs;

#ifdef sun4v
	/*
	 * Because of lack of space, interposing tlbent trap handler
	 * for TLB and MMU miss traps cannot be placed in-line. Instead,
	 * we copy it to the space set aside for shared trap handlers
	 * continuation in the interposing trap table and invoke it by
	 * placing a branch in the trap table itself.
	 */
	tent = ent;		/* trap vector entry */
	tentva = va;		/* trap vector entry va */

	if (itlb) {
		ent = (uint32_t *)((uintptr_t)
		    &tcpu->tcpu_instr->tinst_immumiss);
		va = TSTAT_INSTR_OFFS(tcpu, tinst_immumiss);
	} else {
		ent = (uint32_t *)((uintptr_t)
		    &tcpu->tcpu_instr->tinst_dmmumiss);
		va = TSTAT_INSTR_OFFS(tcpu, tinst_dmmumiss);
	}
	bcopy(mmumiss, tent, sizeof (mmumiss));
	tent[0] |= DISP22(tentva, va);
#endif /* sun4v */

	/* Lay down the template, then patch the immediates into the copy. */
	bcopy(tlbent, ent, sizeof (tlbent));

#if defined(sun4v)
	ent[TSTAT_TLBENT_TDATASHFT] |= LO10((uintptr_t)TSTAT_DATA_SHIFT);
#endif
	ent[TSTAT_TLBENT_STATHI] |= HI22(stat);
	ent[TSTAT_TLBENT_STATLO_LD] |= LO10(stat);
	ent[TSTAT_TLBENT_STATLO_ST] |= LO10(stat);
#ifndef sun4v
	ent[TSTAT_TLBENT_MMUASI] |= asi;
#else
	ent[TSTAT_TLBENT_TAGTARGET] |= tagtarget_off;
#endif
	ent[TSTAT_TLBENT_TPCHI] |= HI22(tpc);
	ent[TSTAT_TLBENT_TPCLO_USER] |= LO10(utpc);
	ent[TSTAT_TLBENT_TPCLO_KERN] |= LO10(tpc);
	ent[TSTAT_TLBENT_TSHI] |= HI22(tmptick);
	ent[TSTAT_TLBENT_TSLO] |= LO10(tmptick);
#ifndef sun4v
	/*
	 * The trailing "ba,a" sits baoffs bytes into the entry; presumably
	 * DISP22() yields the displacement from that instruction to the
	 * original trap table entry (confirm against its definition).
	 */
	baoffs = TSTAT_TLBENT_BA * sizeof (uint32_t);
	ent[TSTAT_TLBENT_BA] |= DISP22(va + baoffs, orig);
#else
	/* sun4v reaches the original entry with sethi/or/jmp instead. */
	ent[TSTAT_TLBENT_ADDRHI] |= HI22(orig);
	ent[TSTAT_TLBENT_ADDRLO] |= LO10(orig);
#endif /* sun4v */

	/*
	 * And now set up the TLB return entries.
	 */
	trapstat_tlbretent(tcpu, &ret->ttlbr_ktlb, &kdata->ttlb_tlb);
	trapstat_tlbretent(tcpu, &ret->ttlbr_ktsb, &kdata->ttlb_tsb);
	trapstat_tlbretent(tcpu, &ret->ttlbr_utlb, &udata->ttlb_tlb);
	trapstat_tlbretent(tcpu, &ret->ttlbr_utsb, &udata->ttlb_tsb);
}
1461
/*
 * Retire every TSTAT_TLBENT_* index now that trapstat_tlbent() is behind us.
 * Each platform-specific name is undefined in the branch that defined it:
 * TSTAT_TLBENT_MMUASI and TSTAT_TLBENT_BA exist only when sun4v is not
 * defined, while TSTAT_TLBENT_TDATASHFT, TSTAT_TLBENT_TAGTARGET,
 * TSTAT_TLBENT_ADDRHI and TSTAT_TLBENT_ADDRLO exist only for sun4v.
 */
#if defined(sun4v)
#undef	TSTAT_TLBENT_TDATASHFT
#endif
#undef	TSTAT_TLBENT_STATHI
#undef	TSTAT_TLBENT_STATLO_LD
#undef	TSTAT_TLBENT_STATLO_ST
#ifndef sun4v
#undef	TSTAT_TLBENT_MMUASI
#else
#undef	TSTAT_TLBENT_TAGTARGET
#endif
#undef	TSTAT_TLBENT_TPCHI
#undef	TSTAT_TLBENT_TPCLO_USER
#undef	TSTAT_TLBENT_TPCLO_KERN
#undef	TSTAT_TLBENT_TSHI
#undef	TSTAT_TLBENT_TSLO
#ifndef sun4v
#undef	TSTAT_TLBENT_BA
#else
#undef	TSTAT_TLBENT_ADDRHI
#undef	TSTAT_TLBENT_ADDRLO
#endif
1479
1480 /*
1481 * The TSTAT_ENABLED_* constants define offsets in the enabled entry; the
1482 * TSTAT_DISABLED_BA constant defines an offset in the disabled entry. Both
1483 * sets of constants are used only in trapstat_make_traptab() (below) and
1484 * #undef'd immediately afterwards. Any change to "enabled" or "disabled"
1485 * in trapstat_make_traptab() will likely require changes to these constants.
1486 */
1487 #ifndef sun4v
/*
 * Word indices into the enabled[] (TSTAT_ENABLED_*) and disabled[]
 * (TSTAT_DISABLED_BA) templates of trapstat_make_traptab() below; the
 * trailing comment names the instruction found at that index.
 */
#define	TSTAT_ENABLED_STATHI	0	/* sethi %hi(stat), %g1 */
#define	TSTAT_ENABLED_STATLO_LD	1	/* ldx [%g1 + %lo(stat)], %g2 */
#define	TSTAT_ENABLED_STATLO_ST	3	/* stx %g2, [%g1 + %lo(stat)] */
#define	TSTAT_ENABLED_BA	4	/* ba,a addr */
#define	TSTAT_DISABLED_BA	0	/* ba,a addr */
1493
1494 static void
trapstat_make_traptab(tstat_percpu_t * tcpu)1495 trapstat_make_traptab(tstat_percpu_t *tcpu)
1496 {
1497 uint32_t *ent;
1498 uint64_t *stat;
1499 uintptr_t orig, va, en_baoffs, dis_baoffs;
1500 int nent;
1501
1502 /*
1503 * This is the entry in the interposing trap table for enabled trap
1504 * table entries. It loads a counter, increments it and stores it
1505 * back before branching to the actual trap table entry.
1506 */
1507 static const uint32_t enabled[TSTAT_ENT_NINSTR] = {
1508 0x03000000, /* sethi %hi(stat), %g1 */
1509 0xc4586000, /* ldx [%g1 + %lo(stat)], %g2 */
1510 0x8400a001, /* add %g2, 1, %g2 */
1511 0xc4706000, /* stx %g2, [%g1 + %lo(stat)] */
1512 0x30800000, /* ba,a addr */
1513 NOP, NOP, NOP
1514 };
1515
1516 /*
1517 * This is the entry in the interposing trap table for disabled trap
1518 * table entries. It simply branches to the actual, underlying trap
1519 * table entry. As explained in the "Implementation Details" section
1520 * of the block comment, all TL>0 traps _must_ use the disabled entry;
1521 * additional entries may be explicitly disabled through the use
1522 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY.
1523 */
1524 static const uint32_t disabled[TSTAT_ENT_NINSTR] = {
1525 0x30800000, /* ba,a addr */
1526 NOP, NOP, NOP, NOP, NOP, NOP, NOP,
1527 };
1528
1529 ASSERT(MUTEX_HELD(&tstat_lock));
1530
1531 ent = tcpu->tcpu_instr->tinst_traptab;
1532 stat = (uint64_t *)TSTAT_DATA_OFFS(tcpu, tdata_traps);
1533 orig = KERNELBASE;
1534 va = (uintptr_t)tcpu->tcpu_ibase;
1535 en_baoffs = TSTAT_ENABLED_BA * sizeof (uint32_t);
1536 dis_baoffs = TSTAT_DISABLED_BA * sizeof (uint32_t);
1537
1538 for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) {
1539 if (tstat_enabled[nent]) {
1540 bcopy(enabled, ent, sizeof (enabled));
1541 ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat);
1542 ent[TSTAT_ENABLED_STATLO_LD] |= LO10((uintptr_t)stat);
1543 ent[TSTAT_ENABLED_STATLO_ST] |= LO10((uintptr_t)stat);
1544 ent[TSTAT_ENABLED_BA] |= DISP22(va + en_baoffs, orig);
1545 } else {
1546 bcopy(disabled, ent, sizeof (disabled));
1547 ent[TSTAT_DISABLED_BA] |= DISP22(va + dis_baoffs, orig);
1548 }
1549
1550 stat++;
1551 orig += sizeof (enabled);
1552 ent += sizeof (enabled) / sizeof (*ent);
1553 va += sizeof (enabled);
1554 }
1555 }
1556
/* Retire the indices used by the trapstat_make_traptab() above. */
#undef	TSTAT_ENABLED_STATHI
#undef	TSTAT_ENABLED_STATLO_LD
#undef	TSTAT_ENABLED_STATLO_ST
#undef	TSTAT_ENABLED_BA
#undef	TSTAT_DISABLED_BA
1562
1563 #else /* sun4v */
1564
/*
 * Word indices into the sun4v enabled[] (TSTAT_ENABLED_*) and disabled[]
 * (TSTAT_DISABLED_*) templates of trapstat_make_traptab() below; the
 * trailing comment names the instruction found at that index.
 */
#define	TSTAT_ENABLED_STATHI	0	/* sethi %hi(stat), %g1 */
#define	TSTAT_ENABLED_STATLO	1	/* or %g1, %lo(stat), %g1 */
#define	TSTAT_ENABLED_ADDRHI	2	/* sethi %hi(addr), %g2 */
#define	TSTAT_ENABLED_ADDRLO	3	/* or %g2, %lo(addr), %g2 */
#define	TSTAT_ENABLED_CONTBA	6	/* ba enabled_cont */
#define	TSTAT_ENABLED_TDATASHFT	7	/* sllx %g3, TSTAT_DATA_SHIFT, %g3 */
#define	TSTAT_DISABLED_ADDRHI	0	/* sethi %hi(addr), %g2 */
#define	TSTAT_DISABLED_ADDRLO	1	/* or %g2, %lo(addr), %g2 */
1573
1574 static void
trapstat_make_traptab(tstat_percpu_t * tcpu)1575 trapstat_make_traptab(tstat_percpu_t *tcpu)
1576 {
1577 uint32_t *ent;
1578 uint64_t *stat;
1579 uintptr_t orig, va, en_baoffs;
1580 uintptr_t tstat_cont_va;
1581 int nent;
1582
1583 /*
1584 * This is the entry in the interposing trap table for enabled trap
1585 * table entries. It loads a counter, increments it and stores it
1586 * back before branching to the actual trap table entry.
1587 *
1588 * All CPUs share the same interposing trap entry to count the
1589 * number of traps. Note that the trap counter is kept in per CPU
1590 * trap statistics area. Its address is obtained dynamically by
1591 * adding the offset of that CPU's trap statistics area from CPU 0
1592 * (i.e. cpu_id * TSTAT_DATA_SIZE) to the address of the CPU 0
1593 * trap counter already coded in the interposing trap entry itself.
1594 *
1595 * Since this interposing code sequence to count traps takes more
1596 * than 8 instructions, it's split in two parts as follows:
1597 *
1598 * tstat_trapcnt:
1599 * sethi %hi(stat), %g1
1600 * or %g1, %lo[stat), %g1 ! %g1 = CPU0 trap counter addr
1601 * sethi %hi(addr), %g2
1602 * or %g2, %lo(addr), %g2 ! %g2 = real trap handler addr
1603 * mov ASI_SCRATCHPAD_CPUID, %g3
1604 * ldxa [%g3]ASI_SCRATCHPAD, %g3 ! %g3 = CPU ID
1605 * ba tstat_trapcnt_cont ! branch to tstat_trapcnt_cont
1606 * sllx %g3, TSTAT_DATA_SHIFT, %g3 ! %g3 = CPU trapstat data offset
1607 *
1608 * tstat_trapcnt_cont:
1609 * ldx [%g1 + %g3], %g4 ! get counter value
1610 * add %g4, 1, %g4 ! increment value
1611 * jmp %g2 ! jump to original trap handler
1612 * stx %g4, [%g1 + %g3] ! store counter value
1613 *
1614 * First part, i.e. tstat_trapcnt, is per trap and is kept in-line in
1615 * the interposing trap table. However, the tstat_trapcnt_cont code
1616 * sequence is shared by all traps and is kept right after the
1617 * the interposing trap table.
1618 */
1619 static const uint32_t enabled[TSTAT_ENT_NINSTR] = {
1620 0x03000000, /* sethi %hi(stat), %g1 */
1621 0x82106000, /* or %g1, %lo[stat), %g1 */
1622 0x05000000, /* sethi %hi(addr), %g2 */
1623 0x8410a000, /* or %g2, %lo(addr), %g2 */
1624 0x86102008, /* mov ASI_SCRATCHPAD_CPUID, %g3 */
1625 0xc6d8c400, /* ldxa [%g3]ASI_SCRATCHPAD, %g3 */
1626 0x10800000, /* ba enabled_cont */
1627 0x8728f000 /* sllx %g3, TSTAT_DATA_SHIFT, %g3 */
1628 };
1629
1630 static const uint32_t enabled_cont[TSTAT_ENT_NINSTR] = {
1631 0xc8584003, /* ldx [%g1 + %g3], %g4 */
1632 0x88012001, /* add %g4, 1, %g4 */
1633 0x81c08000, /* jmp %g2 */
1634 0xc8704003, /* stx %g4, [%g1 + %g3] */
1635 NOP, NOP, NOP, NOP
1636 };
1637
1638 /*
1639 * This is the entry in the interposing trap table for disabled trap
1640 * table entries. It simply "jmp" to the actual, underlying trap
1641 * table entry. As explained in the "Implementation Details" section
1642 * of the block comment, all TL>0 traps _must_ use the disabled entry;
1643 * additional entries may be explicitly disabled through the use
1644 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY.
1645 */
1646 static const uint32_t disabled[TSTAT_ENT_NINSTR] = {
1647 0x05000000, /* sethi %hi(addr), %g2 */
1648 0x8410a000, /* or %g2, %lo(addr), %g2 */
1649 0x81c08000, /* jmp %g2 */
1650 NOP, NOP, NOP, NOP, NOP,
1651 };
1652
1653 ASSERT(MUTEX_HELD(&tstat_lock));
1654 ent = tcpu->tcpu_instr->tinst_traptab;
1655 stat = (uint64_t *)TSTAT_CPU0_DATA_OFFS(tcpu, tdata_traps);
1656 orig = KERNELBASE;
1657 va = (uintptr_t)tcpu->tcpu_ibase;
1658 en_baoffs = TSTAT_ENABLED_CONTBA * sizeof (uint32_t);
1659 tstat_cont_va = TSTAT_INSTR_OFFS(tcpu, tinst_trapcnt);
1660
1661 for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) {
1662 /*
1663 * If TSTAT_OPT_TLBDATA option is enabled (-t or -T option)
1664 * we make sure only TSTAT_TLB_NENT traps can be enabled.
1665 * Note that this logic is somewhat moot since trapstat
1666 * cmd actually use TSTATIOC_NOENTRY ioctl to disable all
1667 * traps when performing Tlb stats collection.
1668 */
1669 if ((!(tstat_options & TSTAT_OPT_TLBDATA) ||
1670 nent < TSTAT_TLB_NENT) && tstat_enabled[nent]) {
1671 bcopy(enabled, ent, sizeof (enabled));
1672 ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat);
1673 ent[TSTAT_ENABLED_STATLO] |= LO10((uintptr_t)stat);
1674 ent[TSTAT_ENABLED_ADDRHI] |= HI22((uintptr_t)orig);
1675 ent[TSTAT_ENABLED_ADDRLO] |= LO10((uintptr_t)orig);
1676 ent[TSTAT_ENABLED_CONTBA] |=
1677 DISP22(va + en_baoffs, tstat_cont_va);
1678 ent[TSTAT_ENABLED_TDATASHFT] |=
1679 LO10((uintptr_t)TSTAT_DATA_SHIFT);
1680 } else {
1681 bcopy(disabled, ent, sizeof (disabled));
1682 ent[TSTAT_DISABLED_ADDRHI] |= HI22((uintptr_t)orig);
1683 ent[TSTAT_DISABLED_ADDRLO] |= LO10((uintptr_t)orig);
1684 }
1685
1686 stat++;
1687 orig += sizeof (enabled);
1688 ent += sizeof (enabled) / sizeof (*ent);
1689 va += sizeof (enabled);
1690 }
1691 bcopy(enabled_cont, (uint32_t *)tcpu->tcpu_instr->tinst_trapcnt,
1692 sizeof (enabled_cont));
1693 }
1694
/*
 * The sun4v template indices are private to trapstat_make_traptab();
 * retire every one of them.  This branch defines TSTAT_DISABLED_ADDRHI and
 * TSTAT_DISABLED_ADDRLO (not TSTAT_DISABLED_BA), so those are the names
 * that must be undefined here.
 */
#undef	TSTAT_ENABLED_TDATASHFT
#undef	TSTAT_ENABLED_STATHI
#undef	TSTAT_ENABLED_STATLO
#undef	TSTAT_ENABLED_ADDRHI
#undef	TSTAT_ENABLED_ADDRLO
#undef	TSTAT_ENABLED_CONTBA
#undef	TSTAT_DISABLED_ADDRHI
#undef	TSTAT_DISABLED_ADDRLO
1702
1703 #endif /* sun4v */
1704
#ifndef sun4v
/*
 * Maximum branch displacement, in bytes, for the Bicc variants; see
 * Section A.6 in the SPARC v9 Manual:
 *
 *	max branch = 4 * ((2^21) - 1) = 8388604
 */
#define	MAX_BICC_BRANCH_DISPLACEMENT	(((1 << 21) - 1) * 4)
#endif
1712
1713 static void
trapstat_setup(processorid_t cpu)1714 trapstat_setup(processorid_t cpu)
1715 {
1716 tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1717 #ifndef sun4v
1718 int i;
1719 caddr_t va;
1720 pfn_t *pfn;
1721 cpu_t *cp;
1722 uint_t strand_idx;
1723 size_t tstat_offset;
1724 #else
1725 uint64_t offset;
1726 #endif
1727
1728 ASSERT(tcpu->tcpu_pfn == NULL);
1729 ASSERT(tcpu->tcpu_instr == NULL);
1730 ASSERT(tcpu->tcpu_data == NULL);
1731 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1732 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
1733 ASSERT(MUTEX_HELD(&cpu_lock));
1734 ASSERT(MUTEX_HELD(&tstat_lock));
1735
1736 #ifndef sun4v
1737 /*
1738 * The lower fifteen bits of the %tba are always read as zero; we must
1739 * align our instruction base address appropriately.
1740 */
1741 tstat_offset = tstat_total_size;
1742
1743 cp = cpu_get(cpu);
1744 ASSERT(cp != NULL);
1745 if ((strand_idx = cpu ^ pg_plat_hw_instance_id(cp, PGHW_IPIPE)) != 0) {
1746 /*
1747 * On sun4u platforms with multiple CPUs sharing the MMU
1748 * (Olympus-C has 2 strands per core), each CPU uses a
1749 * disjoint trap table. The indexing is based on the
1750 * strand id, which is obtained by XOR'ing the cpuid with
1751 * the coreid.
1752 */
1753 tstat_offset += tstat_total_size * strand_idx;
1754
1755 /*
1756 * Offset must be less than the maximum PC-relative branch
1757 * displacement for Bicc variants. See the Implementation
1758 * Details comment.
1759 */
1760 ASSERT(tstat_offset <= MAX_BICC_BRANCH_DISPLACEMENT);
1761 }
1762
1763 tcpu->tcpu_ibase = (caddr_t)((KERNELBASE - tstat_offset)
1764 & TSTAT_TBA_MASK);
1765 tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE;
1766 tcpu->tcpu_vabase = tcpu->tcpu_ibase;
1767
1768 tcpu->tcpu_pfn = vmem_alloc(tstat_arena, tstat_total_pages, VM_SLEEP);
1769 bzero(tcpu->tcpu_pfn, tstat_total_pages);
1770 pfn = tcpu->tcpu_pfn;
1771
1772 tcpu->tcpu_instr = vmem_alloc(tstat_arena, TSTAT_INSTR_SIZE, VM_SLEEP);
1773
1774 va = (caddr_t)tcpu->tcpu_instr;
1775 for (i = 0; i < TSTAT_INSTR_PAGES; i++, va += MMU_PAGESIZE)
1776 *pfn++ = hat_getpfnum(kas.a_hat, va);
1777
1778 /*
1779 * We must be sure that the pages that we will use to examine the data
1780 * have the same virtual color as the pages to which the data is being
1781 * recorded, hence the alignment and phase constraints on the
1782 * allocation.
1783 */
1784 tcpu->tcpu_data = vmem_xalloc(tstat_arena, tstat_data_size,
1785 shm_alignment, (uintptr_t)tcpu->tcpu_dbase & (shm_alignment - 1),
1786 0, 0, NULL, VM_SLEEP);
1787 bzero(tcpu->tcpu_data, tstat_data_size);
1788 tcpu->tcpu_data->tdata_cpuid = cpu;
1789
1790 va = (caddr_t)tcpu->tcpu_data;
1791 for (i = 0; i < tstat_data_pages; i++, va += MMU_PAGESIZE)
1792 *pfn++ = hat_getpfnum(kas.a_hat, va);
1793
1794 /*
1795 * Now that we have all of the instruction and data pages allocated,
1796 * make the trap table from scratch.
1797 */
1798 trapstat_make_traptab(tcpu);
1799
1800 if (tstat_options & TSTAT_OPT_TLBDATA) {
1801 /*
1802 * TLB Statistics have been specified; set up the I- and D-TLB
1803 * entries and corresponding TLB return entries.
1804 */
1805 trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
1806 trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
1807 }
1808
1809 #else /* sun4v */
1810
1811 /*
1812 * The lower fifteen bits of the %tba are always read as zero; hence
1813 * it must be aligned at least on 512K boundary.
1814 */
1815 tcpu->tcpu_vabase = (caddr_t)(KERNELBASE -
1816 MMU_PAGESIZE4M * tstat_num4m_mapping);
1817 tcpu->tcpu_ibase = tcpu->tcpu_vabase;
1818 tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE +
1819 cpu * TSTAT_DATA_SIZE;
1820
1821 tcpu->tcpu_pfn = &tstat_pfn[0];
1822 tcpu->tcpu_instr = (tstat_instr_t *)tstat_va[0];
1823
1824 offset = TSTAT_INSTR_SIZE + cpu * TSTAT_DATA_SIZE;
1825 tcpu->tcpu_data = (tstat_data_t *)(tstat_va[offset >> MMU_PAGESHIFT4M] +
1826 (offset & MMU_PAGEOFFSET4M));
1827 bzero(tcpu->tcpu_data, TSTAT_DATA_SIZE);
1828
1829 /*
1830 * Now that we have all of the instruction and data pages allocated,
1831 * make the trap table from scratch. It should be done only once
1832 * as it is shared by all CPUs.
1833 */
1834 if (!tstat_traptab_initialized)
1835 trapstat_make_traptab(tcpu);
1836
1837 if (tstat_options & TSTAT_OPT_TLBDATA) {
1838 /*
1839 * TLB Statistics have been specified; set up the I- and D-TLB
1840 * entries and corresponding TLB return entries.
1841 */
1842 if (!tstat_traptab_initialized) {
1843 if (tstat_fast_tlbstat) {
1844 trapstat_tlbent(tcpu, TSTAT_ENT_IMMUMISS);
1845 trapstat_tlbent(tcpu, TSTAT_ENT_DMMUMISS);
1846 } else {
1847 trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
1848 trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
1849 }
1850 }
1851 }
1852 tstat_traptab_initialized = 1;
1853 #endif /* sun4v */
1854
1855 tcpu->tcpu_flags |= TSTAT_CPU_ALLOCATED;
1856
1857 /*
1858 * Finally, get the target CPU to load the locked pages into its TLBs.
1859 */
1860 xc_one(cpu, (xcfunc_t *)trapstat_load_tlb, 0, 0);
1861 }
1862
1863 static void
trapstat_teardown(processorid_t cpu)1864 trapstat_teardown(processorid_t cpu)
1865 {
1866 tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1867 int i;
1868 caddr_t va = tcpu->tcpu_vabase;
1869
1870 ASSERT(tcpu->tcpu_pfn != NULL);
1871 ASSERT(tcpu->tcpu_instr != NULL);
1872 ASSERT(tcpu->tcpu_data != NULL);
1873 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1874 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1875 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1876 ASSERT(MUTEX_HELD(&cpu_lock));
1877 ASSERT(MUTEX_HELD(&tstat_lock));
1878
1879 #ifndef sun4v
1880 vmem_free(tstat_arena, tcpu->tcpu_pfn, tstat_total_pages);
1881 vmem_free(tstat_arena, tcpu->tcpu_instr, TSTAT_INSTR_SIZE);
1882 vmem_free(tstat_arena, tcpu->tcpu_data, tstat_data_size);
1883
1884 for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
1885 xt_one(cpu, vtag_flushpage_tl1, (uint64_t)va,
1886 (uint64_t)ksfmmup);
1887 }
1888 #else
1889 for (i = 0; i < tstat_num4m_mapping; i++) {
1890 xt_one(cpu, vtag_unmap_perm_tl1, (uint64_t)va, KCONTEXT);
1891 va += MMU_PAGESIZE4M;
1892 }
1893 #endif
1894
1895 tcpu->tcpu_pfn = NULL;
1896 tcpu->tcpu_instr = NULL;
1897 tcpu->tcpu_data = NULL;
1898 tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
1899 }
1900
1901 static int
trapstat_go()1902 trapstat_go()
1903 {
1904 cpu_t *cp;
1905 #ifdef sun4v
1906 int i;
1907 #endif /* sun4v */
1908
1909 mutex_enter(&cpu_lock);
1910 mutex_enter(&tstat_lock);
1911
1912 if (tstat_running) {
1913 mutex_exit(&tstat_lock);
1914 mutex_exit(&cpu_lock);
1915 return (EBUSY);
1916 }
1917
1918 #ifdef sun4v
1919 /*
1920 * Compute the actual number of 4MB mappings
1921 * we need based on the guest's ncpu_guest_max value.
1922 * Note that earlier at compiled time, we did establish
1923 * and check against the sun4v solaris arch limit
1924 * (TSTAT_NUM4M_LIMIT) which is based on NCPU.
1925 */
1926 tstat_num4m_mapping = TSTAT_NUM4M_MACRO(ncpu_guest_max);
1927 ASSERT(tstat_num4m_mapping <= TSTAT_NUM4M_LIMIT);
1928
1929 /*
1930 * Allocate large pages to hold interposing tables.
1931 */
1932 for (i = 0; i < tstat_num4m_mapping; i++) {
1933 tstat_va[i] = contig_mem_alloc(MMU_PAGESIZE4M);
1934 tstat_pfn[i] = va_to_pfn(tstat_va[i]);
1935 if (tstat_pfn[i] == PFN_INVALID) {
1936 int j;
1937 for (j = 0; j < i; j++) {
1938 contig_mem_free(tstat_va[j], MMU_PAGESIZE4M);
1939 }
1940 mutex_exit(&tstat_lock);
1941 mutex_exit(&cpu_lock);
1942 return (EAGAIN);
1943 }
1944 }
1945
1946
1947 /*
1948 * For detailed TLB statistics, invoke CPU specific interface
1949 * to see if it supports a low overhead interface to collect
1950 * TSB hit statistics. If so, make set tstat_fast_tlbstat flag
1951 * to reflect that.
1952 */
1953 if (tstat_options & TSTAT_OPT_TLBDATA) {
1954 int error;
1955
1956 tstat_fast_tlbstat = B_FALSE;
1957 error = cpu_trapstat_conf(CPU_TSTATCONF_INIT);
1958 if (error == 0)
1959 tstat_fast_tlbstat = B_TRUE;
1960 else if (error != ENOTSUP) {
1961 for (i = 0; i < tstat_num4m_mapping; i++) {
1962 contig_mem_free(tstat_va[i], MMU_PAGESIZE4M);
1963 }
1964 mutex_exit(&tstat_lock);
1965 mutex_exit(&cpu_lock);
1966 return (error);
1967 }
1968 }
1969
1970 tstat_hv_nopanic = 1;
1971 tstat_perm_mapping_failed = 0;
1972 #endif /* sun4v */
1973
1974 /*
1975 * First, perform any necessary hot patching.
1976 */
1977 trapstat_hotpatch();
1978
1979 /*
1980 * Allocate the resources we'll need to measure probe effect.
1981 */
1982 trapstat_probe_alloc();
1983
1984 cp = cpu_list;
1985 do {
1986 if (!(tstat_percpu[cp->cpu_id].tcpu_flags & TSTAT_CPU_SELECTED))
1987 continue;
1988
1989 trapstat_setup(cp->cpu_id);
1990
1991 /*
1992 * Note that due to trapstat_probe()'s use of global data,
1993 * we determine the probe effect on each CPU serially instead
1994 * of in parallel with an xc_all().
1995 */
1996 xc_one(cp->cpu_id, (xcfunc_t *)trapstat_probe, 0, 0);
1997
1998 #ifdef sun4v
1999 /*
2000 * Check to see if the first cpu's attempt to create
2001 * the perm mappings failed. This might happen if the
2002 * guest somehow exhausted all its limited perm mappings.
2003 * Note that we only check this once for the first
2004 * attempt since it shouldn't fail for subsequent cpus
2005 * mapping the same TTEs if the first attempt was successful.
2006 */
2007 if (tstat_hv_nopanic && tstat_perm_mapping_failed) {
2008 tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id];
2009 for (i = 0; i < tstat_num4m_mapping; i++) {
2010 contig_mem_free(tstat_va[i], MMU_PAGESIZE4M);
2011 }
2012
2013 /*
2014 * Do clean up before returning.
2015 * Cleanup is manageable since we
2016 * only need to do it for the first cpu
2017 * iteration that failed.
2018 */
2019 trapstat_probe_free();
2020 trapstat_hotpatch();
2021 tcpu->tcpu_pfn = NULL;
2022 tcpu->tcpu_instr = NULL;
2023 tcpu->tcpu_data = NULL;
2024 tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
2025 mutex_exit(&tstat_lock);
2026 mutex_exit(&cpu_lock);
2027 return (EAGAIN);
2028 }
2029 tstat_hv_nopanic = 0;
2030 #endif /* sun4v */
2031
2032 } while ((cp = cp->cpu_next) != cpu_list);
2033
2034 xc_all((xcfunc_t *)trapstat_enable, 0, 0);
2035
2036 trapstat_probe_free();
2037 tstat_running = 1;
2038 mutex_exit(&tstat_lock);
2039 mutex_exit(&cpu_lock);
2040
2041 return (0);
2042 }
2043
2044 static int
trapstat_stop()2045 trapstat_stop()
2046 {
2047 int i;
2048
2049 mutex_enter(&cpu_lock);
2050 mutex_enter(&tstat_lock);
2051 if (!tstat_running) {
2052 mutex_exit(&tstat_lock);
2053 mutex_exit(&cpu_lock);
2054 return (ENXIO);
2055 }
2056
2057 xc_all((xcfunc_t *)trapstat_disable, 0, 0);
2058
2059 for (i = 0; i <= max_cpuid; i++) {
2060 if (tstat_percpu[i].tcpu_flags & TSTAT_CPU_ALLOCATED)
2061 trapstat_teardown(i);
2062 }
2063
2064 #ifdef sun4v
2065 tstat_traptab_initialized = 0;
2066 if (tstat_options & TSTAT_OPT_TLBDATA)
2067 (void) cpu_trapstat_conf(CPU_TSTATCONF_FINI);
2068 for (i = 0; i < tstat_num4m_mapping; i++)
2069 contig_mem_free(tstat_va[i], MMU_PAGESIZE4M);
2070 #endif
2071 trapstat_hotpatch();
2072 tstat_running = 0;
2073 mutex_exit(&tstat_lock);
2074 mutex_exit(&cpu_lock);
2075
2076 return (0);
2077 }
2078
2079 /*
2080 * This is trapstat's DR CPU configuration callback. It's called (with
2081 * cpu_lock held) to unconfigure a newly powered-off CPU, or to configure a
2082 * powered-off CPU that is to be brought into the system. We need only take
2083 * action in the unconfigure case: because a powered-off CPU will have its
2084 * trap table restored to KERNELBASE if it is ever powered back on, we must
2085 * update the flags to reflect that trapstat is no longer enabled on the
2086 * powered-off CPU. Note that this means that a TSTAT_CPU_ENABLED CPU that
2087 * is unconfigured/powered off and later powered back on/reconfigured will
2088 * _not_ be re-TSTAT_CPU_ENABLED.
2089 */
2090 static int
trapstat_cpu_setup(cpu_setup_t what,processorid_t cpu)2091 trapstat_cpu_setup(cpu_setup_t what, processorid_t cpu)
2092 {
2093 tstat_percpu_t *tcpu = &tstat_percpu[cpu];
2094
2095 ASSERT(MUTEX_HELD(&cpu_lock));
2096 mutex_enter(&tstat_lock);
2097
2098 if (!tstat_running) {
2099 mutex_exit(&tstat_lock);
2100 return (0);
2101 }
2102
2103 switch (what) {
2104 case CPU_CONFIG:
2105 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
2106 break;
2107
2108 case CPU_UNCONFIG:
2109 if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED) {
2110 tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
2111 #ifdef sun4v
2112 /*
2113 * A power-off, causes the cpu mondo queues to be
2114 * unconfigured on sun4v. Since we can't teardown
2115 * trapstat's mappings on the cpu that is going away,
2116 * we simply mark it as not allocated. This will
2117 * prevent a teardown on a cpu with the same cpu id
2118 * that might have been added while trapstat is running.
2119 */
2120 if (tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED) {
2121 tcpu->tcpu_pfn = NULL;
2122 tcpu->tcpu_instr = NULL;
2123 tcpu->tcpu_data = NULL;
2124 tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
2125 }
2126 #endif
2127 }
2128 break;
2129
2130 default:
2131 break;
2132 }
2133
2134 mutex_exit(&tstat_lock);
2135 return (0);
2136 }
2137
2138 /*
2139 * This is called before a CPR suspend and after a CPR resume. We don't have
2140 * anything to do before a suspend, but after a restart we must restore the
2141 * trap table to be our interposing trap table. However, we don't actually
2142 * know whether or not the CPUs have been powered off -- this routine may be
2143 * called while restoring from a failed CPR suspend. We thus run through each
2144 * TSTAT_CPU_ENABLED CPU, and explicitly destroy and reestablish its
2145 * interposing trap table. This assures that our state is correct regardless
2146 * of whether or not the CPU has been newly powered on.
2147 */
2148 /*ARGSUSED*/
2149 static boolean_t
trapstat_cpr(void * arg,int code)2150 trapstat_cpr(void *arg, int code)
2151 {
2152 cpu_t *cp;
2153
2154 if (code == CB_CODE_CPR_CHKPT)
2155 return (B_TRUE);
2156
2157 ASSERT(code == CB_CODE_CPR_RESUME);
2158
2159 mutex_enter(&cpu_lock);
2160 mutex_enter(&tstat_lock);
2161
2162 if (!tstat_running) {
2163 mutex_exit(&tstat_lock);
2164 mutex_exit(&cpu_lock);
2165 return (B_TRUE);
2166 }
2167
2168 cp = cpu_list;
2169 do {
2170 tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id];
2171
2172 if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
2173 continue;
2174
2175 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
2176 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
2177
2178 xc_one(cp->cpu_id, (xcfunc_t *)trapstat_disable, 0, 0);
2179 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
2180
2181 /*
2182 * Preserve this CPU's data in tstat_buffer and rip down its
2183 * interposing trap table.
2184 */
2185 #ifdef sun4v
2186 bcopy(tcpu->tcpu_data, tstat_buffer, TSTAT_DATA_SIZE);
2187 #else
2188 bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
2189 #endif /* sun4v */
2190 trapstat_teardown(cp->cpu_id);
2191 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
2192
2193 /*
2194 * Reestablish the interposing trap table and restore the old
2195 * data.
2196 */
2197 trapstat_setup(cp->cpu_id);
2198 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
2199 #ifdef sun4v
2200 bcopy(tstat_buffer, tcpu->tcpu_data, TSTAT_DATA_SIZE);
2201 #else
2202 bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
2203 #endif /* sun4v */
2204
2205 xc_one(cp->cpu_id, (xcfunc_t *)trapstat_enable, 0, 0);
2206 } while ((cp = cp->cpu_next) != cpu_list);
2207
2208 mutex_exit(&tstat_lock);
2209 mutex_exit(&cpu_lock);
2210
2211 return (B_TRUE);
2212 }
2213
2214 /*ARGSUSED*/
2215 static int
trapstat_open(dev_t * devp,int flag,int otyp,cred_t * cred_p)2216 trapstat_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
2217 {
2218 int i;
2219
2220 mutex_enter(&cpu_lock);
2221 mutex_enter(&tstat_lock);
2222 if (tstat_open != 0) {
2223 mutex_exit(&tstat_lock);
2224 mutex_exit(&cpu_lock);
2225 return (EBUSY);
2226 }
2227
2228 /*
2229 * Register this in open() rather than in attach() to prevent deadlock
2230 * with DR code. During attach, I/O device tree locks are grabbed
2231 * before trapstat_attach() is invoked - registering in attach
2232 * will result in the lock order: device tree lock, cpu_lock.
2233 * DR code however requires that cpu_lock be acquired before
2234 * device tree locks.
2235 */
2236 ASSERT(!tstat_running);
2237 register_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
2238
2239 /*
2240 * Clear all options. And until specific CPUs are specified, we'll
2241 * mark all CPUs as selected.
2242 */
2243 tstat_options = 0;
2244
2245 for (i = 0; i <= max_cpuid; i++)
2246 tstat_percpu[i].tcpu_flags |= TSTAT_CPU_SELECTED;
2247
2248 /*
2249 * By default, all traps at TL=0 are enabled. Traps at TL>0 must
2250 * be disabled.
2251 */
2252 for (i = 0; i < TSTAT_TOTAL_NENT; i++)
2253 tstat_enabled[i] = i < TSTAT_NENT ? 1 : 0;
2254
2255 tstat_open = 1;
2256 mutex_exit(&tstat_lock);
2257 mutex_exit(&cpu_lock);
2258
2259 return (0);
2260 }
2261
2262 /*ARGSUSED*/
2263 static int
trapstat_close(dev_t dev,int flag,int otyp,cred_t * cred_p)2264 trapstat_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
2265 {
2266 (void) trapstat_stop();
2267
2268 ASSERT(!tstat_running);
2269
2270 mutex_enter(&cpu_lock);
2271 unregister_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
2272 mutex_exit(&cpu_lock);
2273
2274 tstat_open = 0;
2275 return (DDI_SUCCESS);
2276 }
2277
2278 static int
trapstat_option(int option)2279 trapstat_option(int option)
2280 {
2281 mutex_enter(&tstat_lock);
2282
2283 if (tstat_running) {
2284 mutex_exit(&tstat_lock);
2285 return (EBUSY);
2286 }
2287
2288 tstat_options |= option;
2289 mutex_exit(&tstat_lock);
2290
2291 return (0);
2292 }
2293
2294 /*ARGSUSED*/
2295 static int
trapstat_ioctl(dev_t dev,int cmd,intptr_t arg,int md,cred_t * crd,int * rval)2296 trapstat_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *crd, int *rval)
2297 {
2298 int i, j, out;
2299 size_t dsize;
2300
2301 switch (cmd) {
2302 case TSTATIOC_GO:
2303 return (trapstat_go());
2304
2305 case TSTATIOC_NOGO:
2306 return (trapstat_option(TSTAT_OPT_NOGO));
2307
2308 case TSTATIOC_STOP:
2309 return (trapstat_stop());
2310
2311 case TSTATIOC_CPU:
2312 if (arg < 0 || arg > max_cpuid)
2313 return (EINVAL);
2314 /*FALLTHROUGH*/
2315
2316 case TSTATIOC_NOCPU:
2317 mutex_enter(&tstat_lock);
2318
2319 if (tstat_running) {
2320 mutex_exit(&tstat_lock);
2321 return (EBUSY);
2322 }
2323
2324 /*
2325 * If this is the first CPU to be specified (or if we are
2326 * being asked to explicitly de-select CPUs), disable all CPUs.
2327 */
2328 if (!(tstat_options & TSTAT_OPT_CPU) || cmd == TSTATIOC_NOCPU) {
2329 tstat_options |= TSTAT_OPT_CPU;
2330
2331 for (i = 0; i <= max_cpuid; i++) {
2332 tstat_percpu_t *tcpu = &tstat_percpu[i];
2333
2334 ASSERT(cmd == TSTATIOC_NOCPU ||
2335 (tcpu->tcpu_flags & TSTAT_CPU_SELECTED));
2336 tcpu->tcpu_flags &= ~TSTAT_CPU_SELECTED;
2337 }
2338 }
2339
2340 if (cmd == TSTATIOC_CPU)
2341 tstat_percpu[arg].tcpu_flags |= TSTAT_CPU_SELECTED;
2342
2343 mutex_exit(&tstat_lock);
2344
2345 return (0);
2346
2347 case TSTATIOC_ENTRY:
2348 mutex_enter(&tstat_lock);
2349
2350 if (tstat_running) {
2351 mutex_exit(&tstat_lock);
2352 return (EBUSY);
2353 }
2354
2355 if (arg >= TSTAT_NENT || arg < 0) {
2356 mutex_exit(&tstat_lock);
2357 return (EINVAL);
2358 }
2359
2360 if (!(tstat_options & TSTAT_OPT_ENTRY)) {
2361 /*
2362 * If this is the first entry that we are explicitly
2363 * enabling, explicitly disable every TL=0 entry.
2364 */
2365 for (i = 0; i < TSTAT_NENT; i++)
2366 tstat_enabled[i] = 0;
2367
2368 tstat_options |= TSTAT_OPT_ENTRY;
2369 }
2370
2371 tstat_enabled[arg] = 1;
2372 mutex_exit(&tstat_lock);
2373 return (0);
2374
2375 case TSTATIOC_NOENTRY:
2376 mutex_enter(&tstat_lock);
2377
2378 if (tstat_running) {
2379 mutex_exit(&tstat_lock);
2380 return (EBUSY);
2381 }
2382
2383 for (i = 0; i < TSTAT_NENT; i++)
2384 tstat_enabled[i] = 0;
2385
2386 mutex_exit(&tstat_lock);
2387 return (0);
2388
2389 case TSTATIOC_READ:
2390 mutex_enter(&tstat_lock);
2391
2392 if (tstat_options & TSTAT_OPT_TLBDATA) {
2393 dsize = tstat_data_t_exported_size;
2394 } else {
2395 dsize = sizeof (tstat_data_t);
2396 }
2397
2398 for (i = 0, out = 0; i <= max_cpuid; i++) {
2399 tstat_percpu_t *tcpu = &tstat_percpu[i];
2400
2401 if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
2402 continue;
2403
2404 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
2405 ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
2406
2407 tstat_buffer->tdata_cpuid = -1;
2408 xc_one(i, (xcfunc_t *)trapstat_snapshot, 0, 0);
2409
2410 if (tstat_buffer->tdata_cpuid == -1) {
2411 /*
2412 * This CPU is not currently responding to
2413 * cross calls; we have caught it while it is
2414 * being unconfigured. We'll drop tstat_lock
2415 * and pick up and drop cpu_lock. By the
2416 * time we acquire cpu_lock, the DR operation
2417 * will appear consistent and we can assert
2418 * that trapstat_cpu_setup() has cleared
2419 * TSTAT_CPU_ENABLED.
2420 */
2421 mutex_exit(&tstat_lock);
2422 mutex_enter(&cpu_lock);
2423 mutex_exit(&cpu_lock);
2424 mutex_enter(&tstat_lock);
2425 ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
2426 continue;
2427 }
2428
2429 /*
2430 * Need to compensate for the difference between page
2431 * sizes exported to users and page sizes available
2432 * within the kernel.
2433 */
2434 if ((tstat_options & TSTAT_OPT_TLBDATA) &&
2435 (tstat_pgszs != tstat_user_pgszs)) {
2436 tstat_pgszdata_t *tp;
2437 uint_t szc;
2438
2439 tp = &tstat_buffer->tdata_pgsz[0];
2440 for (j = 0; j < tstat_user_pgszs; j++) {
2441 if ((szc = USERSZC_2_SZC(j)) != j) {
2442 bcopy(&tp[szc], &tp[j],
2443 sizeof (tstat_pgszdata_t));
2444 }
2445 }
2446 }
2447
2448 if (copyout(tstat_buffer, (void *)arg, dsize) != 0) {
2449 mutex_exit(&tstat_lock);
2450 return (EFAULT);
2451 }
2452
2453 out++;
2454 arg += dsize;
2455 }
2456
2457 if (out != max_cpuid + 1) {
2458 processorid_t cpuid = -1;
2459 arg += offsetof(tstat_data_t, tdata_cpuid);
2460
2461 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0) {
2462 mutex_exit(&tstat_lock);
2463 return (EFAULT);
2464 }
2465 }
2466
2467 mutex_exit(&tstat_lock);
2468
2469 return (0);
2470
2471 case TSTATIOC_TLBDATA:
2472 return (trapstat_option(TSTAT_OPT_TLBDATA));
2473
2474 default:
2475 break;
2476 }
2477
2478 return (ENOTTY);
2479 }
2480
2481 /*ARGSUSED*/
2482 static int
trapstat_info(dev_info_t * dip,ddi_info_cmd_t infocmd,void * arg,void ** result)2483 trapstat_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
2484 {
2485 int error;
2486
2487 switch (infocmd) {
2488 case DDI_INFO_DEVT2DEVINFO:
2489 *result = (void *)tstat_devi;
2490 error = DDI_SUCCESS;
2491 break;
2492 case DDI_INFO_DEVT2INSTANCE:
2493 *result = (void *)0;
2494 error = DDI_SUCCESS;
2495 break;
2496 default:
2497 error = DDI_FAILURE;
2498 }
2499 return (error);
2500 }
2501
2502 static int
trapstat_attach(dev_info_t * devi,ddi_attach_cmd_t cmd)2503 trapstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
2504 {
2505 switch (cmd) {
2506 case DDI_ATTACH:
2507 break;
2508
2509 case DDI_RESUME:
2510 return (DDI_SUCCESS);
2511
2512 default:
2513 return (DDI_FAILURE);
2514 }
2515
2516 if (ddi_create_minor_node(devi, "trapstat", S_IFCHR,
2517 0, DDI_PSEUDO, 0) == DDI_FAILURE) {
2518 ddi_remove_minor_node(devi, NULL);
2519 return (DDI_FAILURE);
2520 }
2521
2522 ddi_report_dev(devi);
2523 tstat_devi = devi;
2524
2525 tstat_pgszs = page_num_pagesizes();
2526 tstat_user_pgszs = page_num_user_pagesizes(0);
2527 tstat_data_t_size = sizeof (tstat_data_t) +
2528 (tstat_pgszs - 1) * sizeof (tstat_pgszdata_t);
2529 tstat_data_t_exported_size = sizeof (tstat_data_t) +
2530 (tstat_user_pgszs - 1) * sizeof (tstat_pgszdata_t);
2531 #ifndef sun4v
2532 tstat_data_pages = (tstat_data_t_size >> MMU_PAGESHIFT) + 1;
2533 tstat_total_pages = TSTAT_INSTR_PAGES + tstat_data_pages;
2534 tstat_data_size = tstat_data_pages * MMU_PAGESIZE;
2535 tstat_total_size = TSTAT_INSTR_SIZE + tstat_data_size;
2536 #else
2537 /*
2538 * For sun4v, the tstat_data_t_size reflect the tstat_buffer
2539 * output size based on tstat_data_t structure. For tlbstats
2540 * collection, we use the internal tstat_tdata_t structure
2541 * to collect the tlbstats for the pages. Therefore we
2542 * need to adjust the size for the assertion.
2543 */
2544 ASSERT((tstat_data_t_size - sizeof (tstat_data_t) +
2545 sizeof (tstat_tdata_t)) <= TSTAT_DATA_SIZE);
2546 #endif
2547
2548 tstat_percpu = kmem_zalloc((max_cpuid + 1) *
2549 sizeof (tstat_percpu_t), KM_SLEEP);
2550
2551 /*
2552 * Create our own arena backed by segkmem to assure a source of
2553 * MMU_PAGESIZE-aligned allocations. We allocate out of the
2554 * heap32_arena to assure that we can address the allocated memory with
2555 * a single sethi/simm13 pair in the interposing trap table entries.
2556 */
2557 tstat_arena = vmem_create("trapstat", NULL, 0, MMU_PAGESIZE,
2558 segkmem_alloc_permanent, segkmem_free, heap32_arena, 0, VM_SLEEP);
2559
2560 tstat_enabled = kmem_alloc(TSTAT_TOTAL_NENT * sizeof (int), KM_SLEEP);
2561 tstat_buffer = kmem_alloc(tstat_data_t_size, KM_SLEEP);
2562
2563 /*
2564 * CB_CL_CPR_POST_USER is the class that executes from cpr_resume()
2565 * after user threads can be restarted. By executing in this class,
2566 * we are assured of the availability of system services needed to
2567 * resume trapstat (specifically, we are assured that all CPUs are
2568 * restarted and responding to cross calls).
2569 */
2570 tstat_cprcb =
2571 callb_add(trapstat_cpr, NULL, CB_CL_CPR_POST_USER, "trapstat");
2572
2573 return (DDI_SUCCESS);
2574 }
2575
2576 static int
trapstat_detach(dev_info_t * devi,ddi_detach_cmd_t cmd)2577 trapstat_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
2578 {
2579 int rval;
2580
2581 ASSERT(devi == tstat_devi);
2582
2583 switch (cmd) {
2584 case DDI_DETACH:
2585 break;
2586
2587 case DDI_SUSPEND:
2588 return (DDI_SUCCESS);
2589
2590 default:
2591 return (DDI_FAILURE);
2592 }
2593
2594 ASSERT(!tstat_running);
2595
2596 rval = callb_delete(tstat_cprcb);
2597 ASSERT(rval == 0);
2598
2599 kmem_free(tstat_buffer, tstat_data_t_size);
2600 kmem_free(tstat_enabled, TSTAT_TOTAL_NENT * sizeof (int));
2601 vmem_destroy(tstat_arena);
2602 kmem_free(tstat_percpu, (max_cpuid + 1) * sizeof (tstat_percpu_t));
2603 ddi_remove_minor_node(devi, NULL);
2604
2605 return (DDI_SUCCESS);
2606 }
2607
2608 /*
2609 * Configuration data structures
2610 */
2611 static struct cb_ops trapstat_cb_ops = {
2612 trapstat_open, /* open */
2613 trapstat_close, /* close */
2614 nulldev, /* strategy */
2615 nulldev, /* print */
2616 nodev, /* dump */
2617 nodev, /* read */
2618 nodev, /* write */
2619 trapstat_ioctl, /* ioctl */
2620 nodev, /* devmap */
2621 nodev, /* mmap */
2622 nodev, /* segmap */
2623 nochpoll, /* poll */
2624 ddi_prop_op, /* cb_prop_op */
2625 0, /* streamtab */
2626 D_MP | D_NEW /* Driver compatibility flag */
2627 };
2628
2629 static struct dev_ops trapstat_ops = {
2630 DEVO_REV, /* devo_rev, */
2631 0, /* refcnt */
2632 trapstat_info, /* getinfo */
2633 nulldev, /* identify */
2634 nulldev, /* probe */
2635 trapstat_attach, /* attach */
2636 trapstat_detach, /* detach */
2637 nulldev, /* reset */
2638 &trapstat_cb_ops, /* cb_ops */
2639 (struct bus_ops *)0, /* bus_ops */
2640 NULL, /* power */
2641 ddi_quiesce_not_needed, /* quiesce */
2642 };
2643
2644 static struct modldrv modldrv = {
2645 &mod_driverops, /* Type of module. This one is a driver */
2646 "Trap Statistics 1.1", /* name of module */
2647 &trapstat_ops, /* driver ops */
2648 };
2649
2650 static struct modlinkage modlinkage = {
2651 MODREV_1, (void *)&modldrv, NULL
2652 };
2653
2654 int
_init(void)2655 _init(void)
2656 {
2657 return (mod_install(&modlinkage));
2658 }
2659
2660 int
_fini(void)2661 _fini(void)
2662 {
2663 return (mod_remove(&modlinkage));
2664 }
2665
2666 int
_info(struct modinfo * modinfop)2667 _info(struct modinfo *modinfop)
2668 {
2669 return (mod_info(&modlinkage, modinfop));
2670 }
2671