xref: /titanic_44/usr/src/uts/sun4/io/trapstat.c (revision 2b4a78020b9c38d1b95e2f3fefa6d6e4be382d1f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 #include <sys/systm.h>
28 #include <sys/conf.h>
29 #include <sys/stat.h>
30 #include <sys/ddi.h>
31 #include <sys/sunddi.h>
32 #include <sys/modctl.h>
33 #include <sys/cpu_module.h>
34 #include <vm/hat_sfmmu.h>
35 #include <vm/seg_kmem.h>
36 #include <vm/seg_kpm.h>
37 #include <vm/vm_dep.h>
38 #include <sys/machsystm.h>
39 #include <sys/machasi.h>
40 #include <sys/sysmacros.h>
41 #include <sys/callb.h>
42 #include <sys/archsystm.h>
43 #include <sys/trapstat.h>
44 #ifdef sun4v
45 #include <sys/hypervisor_api.h>
46 #endif
47 #ifndef sun4v
48 #include <sys/pghw.h>
49 #endif
50 
51 /* BEGIN CSTYLED */
52 /*
53  * trapstat:  Trap Statistics through Dynamic Trap Table Interposition
54  * -------------------------------------------------------------------
55  *
56  * Motivation and Overview
57  *
58  * Despite being a fundamental indicator of system behavior, there has
59  * historically been very little insight provided into the frequency and cost
60  * of machine-specific traps.  The lack of insight has been especially acute
61  * on UltraSPARC microprocessors:  because these microprocessors handle TLB
62  * misses as software traps, the frequency and duration of traps play a
63  * decisive role in the performance of the memory system.  As applications have
64  * increasingly outstripped TLB reach, this has become increasingly true.
65  *
66  * Part of the difficulty of observing trap behavior is that the trap handlers
67  * are so frequently called (e.g. millions of times per second) that any
68  * permanently enabled instrumentation would induce an unacceptable performance
69  * degradation.  Thus, it is a constraint on any trap observability
70  * infrastructure that it have no probe effect when not explicitly enabled.
71  *
72  * The basic idea, then, is to create an interposing trap table in which each
73  * entry increments a per-trap, in-memory counter and then jumps to the actual,
74  * underlying trap table entry.  To enable trapstat, we atomically write to the
75  * trap base address (%tba) register to point to our interposing trap table.
76  * (Note that per-CPU statistics fall out by creating a different trap table
77  * for each CPU.)
78  *
79  * Implementation Details
80  *
81  * While the idea is straight-forward, a nuance of SPARC V9 slightly
82  * complicates the implementation.  Unlike its predecessors, SPARC V9 supports
83  * the notion of nested traps.  The trap level is kept in the TL register:
84  * during normal operation it is 0; when a trap is taken, the TL register is
85  * incremented by 1.  To aid system software, SPARC V9 breaks the trap table
86  * into two halves:  the lower half contains the trap handlers for traps taken
87  * when TL is 0; the upper half contains the trap handlers for traps taken
88  * when TL is greater than 0.  Each half is further subdivided into two
89  * subsequent halves:  the lower half contains the trap handlers for traps
90  * other than those induced by the trap instruction (Tcc variants); the upper
91  * half contains the trap handlers for traps induced by the trap instruction.
92  * This gives a total of four ranges, with each range containing 256 traps:
93  *
94  *       +--------------------------------+- 3ff
95  *       |                                |   .
96  *       |     Trap instruction, TL>0     |   .
97  *       |                                |   .
98  *       |- - - - - - - - - - - - - - - - +- 300
99  *       |- - - - - - - - - - - - - - - - +- 2ff
100  *       |                                |   .
101  *       |   Non-trap instruction, TL>0   |   .
102  *       |                                |   .
103  *       |- - - - - - - - - - - - - - - - +- 200
104  *       |- - - - - - - - - - - - - - - - +- 1ff
105  *       |                                |   .
106  *       |     Trap instruction, TL=0     |   .
107  *       |                                |   .
108  *       |- - - - - - - - - - - - - - - - +- 100
109  *       |- - - - - - - - - - - - - - - - +- 0ff
110  *       |                                |   .
111  *       |   Non-trap instruction, TL=0   |   .
112  *       |                                |   .
113  *       +--------------------------------+- 000
114  *
115  *
116  * Solaris, however, doesn't have reason to support trap instructions when
117  * TL>0 (only privileged code may execute at TL>0; not supporting this only
118  * constrains our own implementation).  The trap table actually looks like:
119  *
120  *       +--------------------------------+- 2ff
121  *       |                                |   .
122  *       |   Non-trap instruction, TL>0   |   .
123  *       |                                |   .
124  *       |- - - - - - - - - - - - - - - - +- 200
125  *       |- - - - - - - - - - - - - - - - +- 1ff
126  *       |                                |   .
127  *       |     Trap instruction, TL=0     |   .
128  *       |                                |   .
129  *       |- - - - - - - - - - - - - - - - +- 100
130  *       |- - - - - - - - - - - - - - - - +- 0ff
131  *       |                                |   .
132  *       |   Non-trap instruction, TL=0   |   .
133  *       |                                |   .
134  *       +--------------------------------+- 000
135  *
136  * Putatively to aid system software, SPARC V9 has the notion of multiple
137  * sets of global registers.  UltraSPARC defines four sets of global
138  * registers:
139  *
140  *    Normal Globals
141  *    Alternate Globals (AGs)
142  *    MMU Globals (MGs)
143  *    Interrupt Globals (IGs)
144  *
145  * The set of globals in use is controlled by bits in PSTATE; when TL is 0
146  * (and PSTATE has not been otherwise explicitly modified), the Normal Globals
147  * are in use.  When a trap is issued, PSTATE is modified to point to a set of
148  * globals corresponding to the trap type.  Most traps correspond to the
149  * Alternate Globals, with a minority corresponding to the MMU Globals, and
150  * only the interrupt-vector trap (vector 0x60) corresponding to the Interrupt
151  * Globals.  (The complete mapping can be found in the UltraSPARC I&II User's
152  * Manual.)
153  *
154  * Note that the sets of globals are per trap _type_, not per trap _level_.
155  * Thus, when executing a TL>0 trap handler, one may not have registers
156  * available (for example, both trap-instruction traps and spill traps execute
157  * on the alternate globals; if a trap-instruction trap induces a window spill,
158  * the window spill handler has no available globals).  For trapstat, this is
159  * problematic:  a register is required to transfer control from one arbitrary
160  * location (in the interposing trap table) to another (in the actual trap
161  * table).
162  *
163  * We solve this problem by exploiting the trap table's location at the bottom
164  * of valid kernel memory (i.e. at KERNELBASE).  We locate the interposing trap
165  * tables just below KERNELBASE -- thereby allowing us to use a branch-always
166  * instruction (ba) instead of a jump instruction (jmp) to transfer control
167  * from the TL>0 entries in the interposing trap table to the TL>0 entries in
168  * the actual trap table.  (N.B. while this allows trap table interposition to
169  * work, it necessarily limits trapstat to only recording information about
170  * TL=0 traps -- there is no way to increment a counter without using a
171  * register.)  Diagrammatically:
172  *
173  *  Actual trap table:
174  *
175  *       +--------------------------------+- 2ff
176  *       |                                |   .
177  *       |   Non-trap instruction, TL>0   |   .   <-----------------------+
178  *       |                                |   .   <-----------------------|-+
179  *       |- - - - - - - - - - - - - - - - +- 200  <-----------------------|-|-+
180  *       |- - - - - - - - - - - - - - - - +- 1ff                          | | |
181  *       |                                |   .                           | | |
182  *       |     Trap instruction, TL=0     |   .   <-----------------+     | | |
183  *       |                                |   .   <-----------------|-+   | | |
184  *       |- - - - - - - - - - - - - - - - +- 100  <-----------------|-|-+ | | |
185  *       |- - - - - - - - - - - - - - - - +- 0ff                    | | | | | |
186  *       |                                |   .                     | | | | | |
187  *       |   Non-trap instruction, TL=0   |   .   <-----------+     | | | | | |
188  *       |                                |   .   <-----------|-+   | | | | | |
189  *       +--------------------------------+- 000  <-----------|-|-+ | | | | | |
190  *        KERNELBASE                                          | | | | | | | | |
191  *                                                            | | | | | | | | |
192  *                                                            | | | | | | | | |
193  *  Interposing trap table:                                   | | | | | | | | |
194  *                                                            | | | | | | | | |
195  *       +--------------------------------+- 2ff              | | | | | | | | |
196  *       |  ...                           |   .               | | | | | | | | |
197  *       |  ...                           |   .               | | | | | | | | |
198  *       |  ...                           |   .               | | | | | | | | |
199  *       |- - - - - - - - - - - - - - - - +- 203              | | | | | | | | |
200  *       |  ba,a                          |      -------------|-|-|-|-|-|-+ | |
201  *       |- - - - - - - - - - - - - - - - +- 202              | | | | | |   | |
202  *       |  ba,a                          |      -------------|-|-|-|-|-|---+ |
203  *       |- - - - - - - - - - - - - - - - +- 201              | | | | | |     |
204  *       |  ba,a                          |      -------------|-|-|-|-|-|-----+
205  *       |- - - - - - - - - - - - - - - - +- 200              | | | | | |
206  *       |  ...                           |   .               | | | | | |
207  *       |  ...                           |   .               | | | | | |
208  *       |  ...                           |   .               | | | | | |
209  *       |- - - - - - - - - - - - - - - - +- 103              | | | | | |
210  *       |  (Increment counter)           |                   | | | | | |
211  *       |  ba,a                          |      -------------------+ | |
212  *       |- - - - - - - - - - - - - - - - +- 102              | | |   | |
213  *       |  (Increment counter)           |                   | | |   | |
214  *       |  ba,a                          |      ---------------------+ |
215  *       |- - - - - - - - - - - - - - - - +- 101              | | |     |
216  *       |  (Increment counter)           |                   | | |     |
217  *       |  ba,a                          |      -----------------------+
218  *       |- - - - - - - - - - - - - - - - +- 100              | | |
219  *       |  ...                           |   .               | | |
220  *       |  ...                           |   .               | | |
221  *       |  ...                           |   .               | | |
222  *       |- - - - - - - - - - - - - - - - +- 003              | | |
223  *       |  (Increment counter)           |                   | | |
224  *       |  ba,a                          |      -------------+ | |
225  *       |- - - - - - - - - - - - - - - - +- 002                | |
226  *       |  (Increment counter)           |                     | |
227  *       |  ba,a                          |      ---------------+ |
228  *       |- - - - - - - - - - - - - - - - +- 001                  |
229  *       |  (Increment counter)           |                       |
230  *       |  ba,a                          |      -----------------+
231  *       +--------------------------------+- 000
232  *        KERNELBASE - tstat_total_size
233  *
234  * tstat_total_size is the size in bytes of each trap table's pages.  The
235  * distance from an interposing trap table up to KERNELBASE must be less than
236  * the maximum branch displacement; if each CPU were to consume a disjoint
237  * virtual range below KERNELBASE for its trap table, we could support at most
238  * (maximum_branch_displacement / tstat_total_size) CPUs.  The maximum branch
239  * displacement for Bicc variants is just under eight megabytes, and (because
240  * the %tba must be 32K aligned), tstat_total_size must be at least 32K; if
241  * each CPU were to consume a disjoint virtual range, we would have an
242  * unacceptably low upper bound of 256 CPUs.
243  *
244  * While there are tricks that one could use to address this constraint (e.g.,
245  * creating trampolines every maximum_branch_displacement bytes), we instead
246  * solve this by not permitting each CPU to consume a disjoint virtual range.
247  * Rather, we have each CPU's interposing trap table use the _same_ virtual
248  * range, but we back the trap tables with disjoint physical memory.  Normally,
249  * such one-to-many virtual-to-physical mappings are illegal; this is
250  * permissible here only because the pages for the interposing trap table are
251  * necessarily locked in the TLB.  (The CPUs thus never have the opportunity to
252  * discover that they have conflicting translations.)
253  *
254  * On CMT architectures in which CPUs can share MMUs, the above trick will not
255  * work: two CPUs that share an MMU cannot have the same virtual address map
256  * to disjoint physical pages.  On these architectures, any CPUs sharing the
257  * same MMU must consume a disjoint 32K virtual address range -- limiting the
258  * number of CPUs sharing an MMU on these architectures to 256 due to the
259  * branch displacement limitation described above.  On the sun4v architecture,
260  * there is a further limitation: a guest may not have more than eight locked
261  * TLB entries per MMU.  To allow operation under this restriction, the
262  * interposing trap table and the trap statistics are each accessed through
263  * a single 4M TLB entry.  This limits the footprint to two locked entries
264  * (one for the I-TLB and one for the D-TLB), but further restricts the number
265  * of CPUs to 128 per MMU.  However, support for more than 128 CPUs can easily
266  * be added via a hybrid scheme, where the same 4M virtual address is used
267  * on different MMUs.
268  *
269  * On the sun4v architecture, we currently don't use the hybrid scheme as it
270  * imposes additional restrictions on live migration and transparent CPU
271  * replacement.  Instead, we increase the number of supported CPUs by reducing
272  * the virtual address space requirements per CPU via a shared interposing trap
273  * table, as follows:
274  *
275  *                                          Offset (within 4MB page)
276  *       +------------------------------------+- 0x400000
277  *       |  CPU 507 trap statistics (8KB)     |   .
278  *       |- - - - - - - - - - - - - - - - - - +- 0x3fe000
279  *       |                                    |
280  *       |   ...                              |
281  *       |                                    |
282  *       |- - - - - - - - - - - - - - - - - - +- 0x00c000
283  *       |  CPU 1 trap statistics (8KB)       |   .
284  *       |- - - - - - - - - - - - - - - - - - +- 0x00a000
285  *       |  CPU 0 trap statistics (8KB)       |   .
286  *       |- - - - - - - - - - - - - - - - - - +- 0x008000
287  *       |  Shared trap handler continuation  |   .
288  *       |- - - - - - - - - - - - - - - - - - +- 0x006000
289  *       |  Non-trap instruction, TL>0        |   .
290  *       |- - - - - - - - - - - - - - - - - - +- 0x004000
291  *       |  Trap instruction, TL=0            |   .
292  *       |- - - - - - - - - - - - - - - - - - +- 0x002000
293  *       |  Non-trap instruction, TL=0        |   .
294  *       +------------------------------------+- 0x000000
295  *
296  * Note that each CPU has its own 8K space for its trap statistics but
297  * shares the same interposing trap handlers.  Interposing trap handlers
298  * use the CPU ID to determine the location of the per-CPU trap statistics
299  * area dynamically.  This increases the interposing trap handler overhead,
300  * but is acceptable as it allows us to support up to 508 CPUs with one
301  * 4MB page on the sun4v architecture.  Support for additional CPUs can be
302  * added via the hybrid scheme mentioned earlier.
303  *
304  * TLB Statistics
305  *
306  * Because TLB misses are an important component of system performance, we wish
307  * to know much more about these traps than simply the number received.
308  * Specifically, we wish to know:
309  *
310  *  (a)	The amount of time spent executing the TLB miss handler
311  *  (b)	TLB misses versus TSB misses
312  *  (c) Kernel-level misses versus user-level misses
313  *  (d) Misses per pagesize
314  *
315  * TLB Statistics: Time Spent Executing
316  *
317  * To accurately determine the amount of time spent executing the TLB miss
318  * handler, one must get a timestamp on trap entry and trap exit, subtract the
319  * former from the latter, and add the result to an accumulating count.
320  * Consider flow of control during normal TLB miss processing (where "ldx
321  * [%g2], %g2" is an arbitrary TLB-missing instruction):
322  *
323  * + - - - - - - - -+
324  * :                :
325  * : ldx [%g2], %g2 :<-------------------------------------------------------+
326  * :                :              Return from trap:                         |
327  * + - - - - - - - -+                TL <- TL - 1 (0)                        |
328  *	  |                          %pc <- TSTATE[TL].TPC (address of load) |
329  *	  | TLB miss:                                                        |
330  *        |   TL <- TL + 1 (1)                                               |
331  *        |   %pc <- TLB-miss-trap-handler                                   |
332  *        |                                                                  |
333  *        v                                                                  |
334  * + - - - - - - - - - - - - - - - +                                         |
335  * :                               :                                         |
336  * : Lookup VA in TSB              :                                         |
337  * : If (hit)                      :                                         |
338  * :     Fill TLB                  :                                         |
339  * : Else                          :                                         |
340  * :     Lookup VA (hme hash table :                                         |
341  * :                or segkpm)     :                                         |
342  * :     Fill TLB                  :                                         |
343  * : Endif                         :                                         |
344  * : Issue "retry"  ---------------------------------------------------------+
345  * :                               :
346  * + - - - - - - - - - - - - - - - +
347  *  TLB-miss-trap-handler
348  *
349  *
350  * As the above diagram indicates, interposing on the trap table allows one
351  * only to determine a timestamp on trap _entry_:  when the TLB miss handler
352  * has completed filling the TLB, a "retry" will be issued, and control will
353  * transfer immediately back to the missing %pc.
354  *
355  * To obtain a timestamp on trap exit, we must then somehow interpose between
356  * the "retry" and the subsequent control transfer to the TLB-missing
357  * instruction.  To do this, we _push_ a trap level.  The basic idea is to
358  * spoof a TLB miss by raising TL, setting the %tpc to be within text
359  * controlled by trapstat (the "TLB return entry") and branching to the
360  * underlying TLB miss handler.  When the TLB miss handler issues its "retry",
361  * control will transfer not to the TLB-missing instruction, but rather to the
362  * TLB return entry.  This code can then obtain a timestamp, and issue its own
363  * "retry" -- thereby correctly returning to the TLB-missing instruction.
364  * Here is the above TLB miss flow control diagram modified to reflect
365  * trapstat's operation:
366  *
367  * + - - - - - - - -+
368  * :                :
369  * : ldx [%g2], %g2 :<-------------------------------------------------------+
370  * :                :             Return from trap:                          |
371  * + - - - - - - - -+               TL <- TL - 1 (0)                         |
372  *	  |                         %pc <- TSTATE[TL].TPC (address of load)  |
373  *	  | TLB miss:                                                        |
374  *        |   TL <- TL + 1 (1)                                               |
375  *        |   %pc <- TLB-miss-trap-handler (trapstat)                        |
376  *        |                                                                  |
377  *        v                                    TLB-return-entry (trapstat)   |
378  * + - - - - - - - - - - - - - - - - - - +    + - - - - - - - - - - - - - +  |
379  * :                                     :    :                           :  |
380  * : Record timestamp                    :    : Record timestamp          :  |
381  * : TL <- 2                             :    : Take timestamp difference :  |
382  * : TSTATE[1].TPC <- TLB-return-entry   :    : Add to running total      :  |
383  * : ba,a TLB-miss-trap-handler -----------+  : Issue "retry"  --------------+
384  * :                                     : |  :                           :
385  * + - - - - - - - - - - - - - - - - - - + |  + - - - - - - - - - - - - - +
386  *  TLB-miss-trap-handler	           |                  ^
387  *  (trapstat)                             |                  |
388  *                                         |                  |
389  *                                         |                  |
390  *                 +-----------------------+                  |
391  *                 |                                          |
392  *                 |                                          |
393  *                 v                                          |
394  * + - - - - - - - - - - - - - - - +                          |
395  * :                               :                          |
396  * : Lookup VA in TSB              :                          |
397  * : If (hit)                      :                          |
398  * :     Fill TLB                  :                          |
399  * : Else                          :                          |
400  * :     Lookup VA (hme hash table :                          |
401  * :                or segkpm)     :                          |
402  * :     Fill TLB                  :                          |
403  * : Endif                         :                          |
404  * : Issue "retry"  ------------------------------------------+
405  * :                               : Return from trap:
406  * + - - - - - - - - - - - - - - - +   TL <- TL - 1 (1)
407  *  TLB-miss-trap-handler              %pc <- TSTATE[TL].TPC (TLB-return-entry)
408  *
409  *
410  * A final subterfuge is required to complete our artifice:  if we miss in
411  * the TLB, the TSB _and_ the subsequent hash or segkpm lookup (that is, if
412  * there is no valid translation for the TLB-missing address), common system
413  * software will need to accurately determine the %tpc as part of its page
414  * fault handling. We therefore modify the kernel to check the %tpc in this
415  * case: if the %tpc falls within the VA range controlled by trapstat and
416  * the TL is 2, TL is simply lowered back to 1 (this check is implemented
417  * by the TSTAT_CHECK_TL1 macro).  Lowering TL to 1 has the effect of
418  * discarding the state pushed by trapstat.
419  *
420  * TLB Statistics: TLB Misses versus TSB Misses
421  *
422  * Distinguishing TLB misses from TSB misses requires further interposition
423  * on the TLB miss handler:  we cannot know a priori or a posteriori if a
424  * given VA will or has hit in the TSB.
425  *
426  * We achieve this distinction by adding a second TLB return entry almost
427  * identical to the first -- differing only in the address to which it
428  * stores its results.  We then modify the TLB miss handlers of the kernel
429  * such that they check the %tpc when they determine that a TLB miss has
430  * subsequently missed in the TSB:  if the %tpc lies within trapstat's VA
431  * range and TL is 2 (that is, if trapstat is running), the TLB miss handler
432  * _increments_ the %tpc by the size of the TLB return entry.  The ensuing
433  * "retry" will thus transfer control to the second TLB return entry, and
434  * the time spent in the handler will be accumulated in a memory location
435  * specific to TSB misses.
436  *
437  * N.B.:  To minimize the amount of knowledge the kernel must have of trapstat,
438  * we do not allow the kernel to hard-code the size of the TLB return entry.
439  * Rather, the actual tsbmiss handler executes a known instruction at the
440  * corresponding tsbmiss patch points (see the tstat_tsbmiss_patch_table) with
441  * the %tpc in %g7:  when trapstat is not running, these points contain the
442  * harmless TSTAT_TSBMISS_INSTR instruction ("add %g7, 0, %g7"). Before
443  * running, trapstat modifies the instructions at these patch points such
444  * that the simm13 equals the size of the TLB return entry.
445  *
446  * TLB Statistics: Kernel-level Misses versus User-level Misses
447  *
448  * Differentiating user-level misses from kernel-level misses employs a
449  * similar technique, but is simplified by the ability to distinguish a
450  * user-level miss from a kernel-level miss a priori by reading the context
451  * register:  we implement kernel-/user-level differentiation by again doubling
452  * the number of TLB return entries, and setting the %tpc to the appropriate
453  * TLB return entry in trapstat's TLB miss handler.  Together with the doubling
454  * of entries required for TLB-miss/TSB-miss differentiation, this yields a
455  * total of four TLB return entries:
456  *
457  *	Level		TSB hit?	Structure member
458  *	------------------------------------------------------------
459  *	Kernel		Yes		tstat_tlbret_t.ttlbr_ktlb
460  *	Kernel		No		tstat_tlbret_t.ttlbr_ktsb
461  *	User		Yes		tstat_tlbret_t.ttlbr_utlb
462  *	User		No		tstat_tlbret_t.ttlbr_utsb
463  *
464  * TLB Statistics: Misses per Pagesize
465  *
466  * As with the TLB-/TSB-miss differentiation, we have no way of determining
467  * pagesize a priori.  This is therefore implemented by mandating a new rule:
468  * whenever the kernel fills the TLB in its TLB miss handler, the TTE
469  * corresponding to the TLB-missing VA must be in %g5 when the handler
470  * executes its "retry".  This allows the TLB return entry to determine
471  * pagesize by simply looking at the pagesize field in the TTE stored in
472  * %g5.
473  *
474  * TLB Statistics: Probe Effect
475  *
476  * As one might imagine, gathering TLB statistics by pushing a trap level
477  * induces significant probe effect.  To account for this probe effect,
478  * trapstat attempts to observe it by executing a code sequence with a known
479  * number of TLB misses both before and after interposing on the trap table.
480  * This allows trapstat to determine a per-trap probe effect which can then be
481  * factored into the "%tim" fields of the trapstat command.
482  *
483  * Note that on sun4v platforms, TLB misses are normally handled by the
484  * hypervisor or the hardware TSB walker.  Thus no fast MMU miss information
485  * is reported for normal operation.  However, when trapstat is invoked
486  * with the -t or -T option to collect detailed TLB statistics, the kernel
487  * takes over TLB miss handling.  This results in significantly more overhead,
488  * and TLB statistics may not be as accurate as on sun4u platforms.
489  * On some processors, the hypervisor or hardware may provide a low-overhead
490  * interface to collect TSB hit statistics.  This support is exposed via
491  * a well-defined CPU module interface (cpu_trapstat_conf to enable this
492  * interface and cpu_trapstat_data to get detailed TSB hit statistics).
493  * In this scenario, TSB miss statistics are collected by intercepting the
494  * IMMU_miss and DMMU_miss traps using the above-mentioned trap interposition
495  * approach.
496  *
497  * Locking
498  *
499  * The implementation uses two locks:  tstat_lock (a local lock) and the global
500  * cpu_lock.  tstat_lock is used to assure trapstat's consistency in the
501  * presence of multithreaded /dev/trapstat consumers (while as of this writing
502  * the only consumer of /dev/trapstat is single threaded, it is obviously
503  * necessary to correctly support multithreaded access).  cpu_lock is held
504  * whenever CPUs are being manipulated directly, to prevent them from
505  * disappearing in the process.  Because trapstat's DR callback
506  * (trapstat_cpu_setup()) must grab tstat_lock and is called with cpu_lock
507  * held, the lock ordering is necessarily cpu_lock before tstat_lock.
508  *
509  */
510 /* END CSTYLED */
511 
512 static dev_info_t	*tstat_devi;	/* saved in xxattach() for xxinfo() */
513 static int		tstat_open;	/* set if driver is open */
514 static kmutex_t		tstat_lock;	/* serialize access (after cpu_lock) */
515 static vmem_t		*tstat_arena;	/* arena for TLB-locked pages */
516 static tstat_percpu_t	*tstat_percpu;	/* per-CPU data */
517 static int		tstat_running;	/* set if trapstat is running */
518 static tstat_data_t	*tstat_buffer;	/* staging buffer for outgoing data */
519 static int		tstat_options;	/* bit-wise indication of options */
520 static int		*tstat_enabled;	/* map of enabled trap entries */
521 static int		tstat_tsbmiss_patched; /* tsbmiss patch flag */
522 static callb_id_t	tstat_cprcb;	/* CPR callback */
523 static char		*tstat_probe_area; /* VA range used for probe effect */
524 static caddr_t		tstat_probe_phys; /* physical to back above VA */
525 static hrtime_t		tstat_probe_time; /* time spent on probe effect */
/*
 * NOTE(review): presumably the per-lap times measured before and after
 * interposing on the trap table (see "TLB Statistics: Probe Effect" in the
 * block comment above) -- confirm against the code that fills them in.
 */
526 static hrtime_t		tstat_probe_before[TSTAT_PROBE_NLAPS];
527 static hrtime_t		tstat_probe_after[TSTAT_PROBE_NLAPS];
528 static uint_t		tstat_pgszs;		/* # of kernel page sizes */
529 static uint_t		tstat_user_pgszs;	/* # of user page sizes */
530 
531 /*
532  * sizeof tstat_data_t + pgsz data for the kernel.  For simplicity's sake, when
533  * we collect data, we do it based upon szc, but when we report data back to
534  * userland, we have to do it based upon the userszc, which may not match.
535  * So, these two variables are for internal use and exported use respectively.
536  */
537 static size_t		tstat_data_t_size;		/* internal (szc) */
538 static size_t		tstat_data_t_exported_size;	/* exported (userszc) */
539 
540 #ifndef sun4v
541 
542 static size_t		tstat_data_pages;  /* number of pages of tstat data */
543 static size_t		tstat_data_size;   /* tstat data size in bytes */
544 static size_t		tstat_total_pages; /* #data pages + #instr pages */
545 static size_t		tstat_total_size;  /* tstat data size + instr size */
546 
547 #else /* sun4v */
548 
549 static caddr_t		tstat_va;	/* VA of memory reserved for TBA */
550 static pfn_t		tstat_pfn;	/* PFN of memory reserved for TBA */
551 static boolean_t	tstat_fast_tlbstat = B_FALSE;
552 static int		tstat_traptab_initialized;
553 
554 #endif /* sun4v */
555 
556 /*
557  * In the above block comment, see "TLB Statistics: TLB Misses versus
558  * TSB Misses" for an explanation of the tsbmiss patch points.
559  */
560 extern uint32_t		tsbmiss_trapstat_patch_point;
561 extern uint32_t		tsbmiss_trapstat_patch_point_kpm;
562 extern uint32_t		tsbmiss_trapstat_patch_point_kpm_small;
563 
564 /*
565  * Trapstat tsbmiss patch table
566  */
567 tstat_tsbmiss_patch_entry_t tstat_tsbmiss_patch_table[] = {
568 	{(uint32_t *)&tsbmiss_trapstat_patch_point, 0},
569 	{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm, 0},
570 	{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm_small, 0},
571 	{(uint32_t *)NULL, 0}
572 };
573 
/*
 * General SPARC-specific helpers that make the relocations below readable:
 *
 *	NOP		the nop instruction word
 *	HI22(v)		upper 22 bits of the low 32 bits of v
 *	LO10(v)		low 10 bits of v
 *	LO12(v)		low 12 bits of v
 *	DISP22(f, t)	distance from f to t in instruction words, truncated
 *			to 22 bits
 *	ASI(asi)	asi shifted left by 5 bits
 */
#define	NOP		0x01000000
#define	HI22(v)		((uint32_t)(v) >> 10)
#define	LO10(v)		((uint32_t)(v) & ((1U << 10) - 1))
#define	LO12(v)		((uint32_t)(v) & ((1U << 12) - 1))
#define	DISP22(from, to) \
	((((uintptr_t)(to) - (uintptr_t)(from)) / sizeof (uint32_t)) & \
	((1U << 22) - 1))
#define	ASI(asi)	((asi) << 5)
585 
586 /*
587  * The interposing trap table must be locked in the I-TLB, and any data
588  * referred to in the interposing trap handler must be locked in the D-TLB.
589  * This function locks these pages in the appropriate TLBs by creating TTEs
590  * from whole cloth, and manually loading them into the TLB.  This function is
591  * called from cross call context.
592  *
593  * On sun4v platforms, we use 4M page size mappings to minimize the number
594  * of locked down entries (i.e. permanent mappings). Each CPU uses a
595  * reserved portion of that 4M page for its TBA and data.
596  */
597 static void
598 trapstat_load_tlb(void)
599 {
600 #ifndef sun4v
601 	int i;
602 #else
603 	uint64_t ret;
604 #endif
605 	tte_t tte;
606 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
607 	caddr_t va = tcpu->tcpu_vabase;
608 
609 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
610 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
611 
612 #ifndef sun4v
613 	for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
614 		tte.tte_inthi = TTE_VALID_INT | TTE_SZ_INT(TTE8K) |
615 		    TTE_PFN_INTHI(tcpu->tcpu_pfn[i]);
616 		if (i < TSTAT_INSTR_PAGES) {
617 			tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
618 			    TTE_LCK_INT | TTE_CP_INT | TTE_PRIV_INT;
619 			sfmmu_itlb_ld_kva(va, &tte);
620 		} else {
621 			tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
622 			    TTE_LCK_INT | TTE_CP_INT | TTE_CV_INT |
623 			    TTE_PRIV_INT | TTE_HWWR_INT;
624 			sfmmu_dtlb_ld_kva(va, &tte);
625 		}
626 	}
627 #else /* sun4v */
628 	tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(tstat_pfn);
629 	tte.tte_intlo = TTE_PFN_INTLO(tstat_pfn) | TTE_CP_INT |
630 	    TTE_CV_INT | TTE_PRIV_INT | TTE_HWWR_INT |
631 	    TTE_SZ_INTLO(TTE4M);
632 	ret = hv_mmu_map_perm_addr(va, KCONTEXT, *(uint64_t *)&tte,
633 	    MAP_ITLB | MAP_DTLB);
634 
635 	if (ret != H_EOK)
636 		cmn_err(CE_PANIC, "trapstat: cannot map new TBA "
637 		    "for cpu %d  (error: 0x%lx)", CPU->cpu_id, ret);
638 #endif /* sun4v */
639 }
640 
641 /*
642  * As mentioned in the "TLB Statistics: TLB Misses versus TSB Misses" section
643  * of the block comment, TLB misses are differentiated from TSB misses in
644  * part by hot-patching the instructions at the tsbmiss patch points (see
645  * tstat_tsbmiss_patch_table). This routine is used both to initially patch
646  * the instructions, and to patch them back to their original values upon
647  * restoring the original trap table.
648  */
649 static void
650 trapstat_hotpatch()
651 {
652 	uint32_t instr;
653 	uint32_t simm13;
654 	tstat_tsbmiss_patch_entry_t *ep;
655 
656 	ASSERT(MUTEX_HELD(&tstat_lock));
657 
658 	if (!(tstat_options & TSTAT_OPT_TLBDATA))
659 		return;
660 
661 	if (!tstat_tsbmiss_patched) {
662 		/*
663 		 * We haven't patched the TSB paths; do so now.
664 		 */
665 		/*CONSTCOND*/
666 		ASSERT(offsetof(tstat_tlbret_t, ttlbr_ktsb) -
667 		    offsetof(tstat_tlbret_t, ttlbr_ktlb) ==
668 		    offsetof(tstat_tlbret_t, ttlbr_utsb) -
669 		    offsetof(tstat_tlbret_t, ttlbr_utlb));
670 
671 		simm13 = offsetof(tstat_tlbret_t, ttlbr_ktsb) -
672 		    offsetof(tstat_tlbret_t, ttlbr_ktlb);
673 
674 		for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
675 			ASSERT(ep->tpe_instr == 0);
676 			instr = ep->tpe_instr = *ep->tpe_addr;
677 
678 			/*
679 			 * Assert that the instruction we're about to patch is
680 			 * "add %g7, 0, %g7" (0x8e01e000).
681 			 */
682 			ASSERT(instr == TSTAT_TSBMISS_INSTR);
683 
684 			instr |= simm13;
685 			hot_patch_kernel_text((caddr_t)ep->tpe_addr,
686 			    instr, sizeof (instr));
687 		}
688 
689 		tstat_tsbmiss_patched = 1;
690 
691 	} else {
692 		/*
693 		 * Remove patches from the TSB paths.
694 		 */
695 		for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
696 			ASSERT(ep->tpe_instr == TSTAT_TSBMISS_INSTR);
697 			hot_patch_kernel_text((caddr_t)ep->tpe_addr,
698 			    ep->tpe_instr, sizeof (instr));
699 			ep->tpe_instr = 0;
700 		}
701 
702 		tstat_tsbmiss_patched = 0;
703 	}
704 }
705 
706 /*
707  * This is the routine executed to clock the performance of the trap table,
708  * executed both before and after interposing on the trap table to attempt to
709  * determine probe effect.  The probe effect is used to adjust the "%tim"
710  * fields of trapstat's -t and -T output; we only use TLB misses to clock the
711  * trap table.  We execute the inner loop (which is designed to exceed the
712  * TLB's reach) nlaps times, taking the best time as our time (thereby
713  * factoring out the effects of interrupts, cache misses or other perturbing
714  * events.
715  */
716 static hrtime_t
717 trapstat_probe_laps(int nlaps, hrtime_t *buf)
718 {
719 	int i, j = 0;
720 	hrtime_t ts, best = INT64_MAX;
721 
722 	while (nlaps--) {
723 		ts = rdtick();
724 
725 		for (i = 0; i < TSTAT_PROBE_SIZE; i += MMU_PAGESIZE)
726 			*((volatile char *)&tstat_probe_area[i]);
727 
728 		if ((ts = rdtick() - ts) < best)
729 			best = ts;
730 		buf[j++] = ts;
731 	}
732 
733 	return (best);
734 }
735 
736 /*
737  * This routine determines the probe effect by calling trapstat_probe_laps()
738  * both without and with the interposing trap table.  Note that this is
739  * called from a cross call on the desired CPU, and that it is called on
740  * every CPU (this is necessary because the probe effect may differ from
741  * one CPU to another).
742  */
743 static void
744 trapstat_probe()
745 {
746 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
747 	hrtime_t before, after;
748 
749 	if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
750 		return;
751 
752 	if (tstat_probe_area == NULL || (tstat_options & TSTAT_OPT_NOGO))
753 		return;
754 
755 	/*
756 	 * We very much expect the %tba to be KERNELBASE; this is a
757 	 * precautionary measure to assure that trapstat doesn't melt the
758 	 * machine should the %tba point unexpectedly elsewhere.
759 	 */
760 	if (get_tba() != (caddr_t)KERNELBASE)
761 		return;
762 
763 	/*
764 	 * Preserve this CPU's data before destroying it by enabling the
765 	 * interposing trap table.  We can safely use tstat_buffer because
766 	 * the caller of the trapstat_probe() cross call is holding tstat_lock.
767 	 */
768 	bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
769 
770 	tstat_probe_time = gethrtime();
771 
772 	before = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_before);
773 	(void) set_tba(tcpu->tcpu_ibase);
774 
775 	after = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_after);
776 	(void) set_tba((caddr_t)KERNELBASE);
777 
778 	tstat_probe_time = gethrtime() - tstat_probe_time;
779 
780 	bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
781 	tcpu->tcpu_data->tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES;
782 }
783 
784 static void
785 trapstat_probe_alloc()
786 {
787 	pfn_t pfn;
788 	caddr_t va;
789 	int i;
790 
791 	ASSERT(MUTEX_HELD(&tstat_lock));
792 	ASSERT(tstat_probe_area == NULL);
793 	ASSERT(tstat_probe_phys == NULL);
794 
795 	if (!(tstat_options & TSTAT_OPT_TLBDATA))
796 		return;
797 
798 	/*
799 	 * Grab some virtual from the heap arena.
800 	 */
801 	tstat_probe_area = vmem_alloc(heap_arena, TSTAT_PROBE_SIZE, VM_SLEEP);
802 	va = tstat_probe_area;
803 
804 	/*
805 	 * Grab a single physical page.
806 	 */
807 	tstat_probe_phys = vmem_alloc(tstat_arena, MMU_PAGESIZE, VM_SLEEP);
808 	pfn = hat_getpfnum(kas.a_hat, tstat_probe_phys);
809 
810 	/*
811 	 * Now set the translation for every page in our virtual range
812 	 * to be our allocated physical page.
813 	 */
814 	for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
815 		hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn, PROT_READ,
816 		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
817 		va += MMU_PAGESIZE;
818 	}
819 }
820 
821 static void
822 trapstat_probe_free()
823 {
824 	caddr_t va;
825 	int i;
826 
827 	ASSERT(MUTEX_HELD(&tstat_lock));
828 
829 	if ((va = tstat_probe_area) == NULL)
830 		return;
831 
832 	for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
833 		hat_unload(kas.a_hat, va, MMU_PAGESIZE, HAT_UNLOAD_UNLOCK);
834 		va += MMU_PAGESIZE;
835 	}
836 
837 	vmem_free(tstat_arena, tstat_probe_phys, MMU_PAGESIZE);
838 	vmem_free(heap_arena, tstat_probe_area, TSTAT_PROBE_SIZE);
839 
840 	tstat_probe_phys = NULL;
841 	tstat_probe_area = NULL;
842 }
843 
844 /*
845  * This routine actually enables a CPU by setting its %tba to be the
846  * CPU's interposing trap table.  It is called out of cross call context.
847  */
848 static void
849 trapstat_enable()
850 {
851 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
852 
853 	if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
854 		return;
855 
856 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
857 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
858 
859 	if (get_tba() != (caddr_t)KERNELBASE)
860 		return;
861 
862 	if (!(tstat_options & TSTAT_OPT_NOGO))
863 		(void) set_tba(tcpu->tcpu_ibase);
864 	tcpu->tcpu_flags |= TSTAT_CPU_ENABLED;
865 #ifdef sun4v
866 	if ((tstat_options & TSTAT_OPT_TLBDATA) &&
867 	    !(tstat_options & TSTAT_OPT_NOGO)) {
868 		if (tstat_fast_tlbstat) {
869 			/*
870 			 * Invoke processor specific interface to enable
871 			 * collection of TSB hit statistics.
872 			 */
873 			cpu_trapstat_conf(CPU_TSTATCONF_ENABLE);
874 		} else {
875 			/*
876 			 * Collect TLB miss statistics by taking over
877 			 * TLB miss handling from the hypervisor. This
878 			 * is done by telling the hypervisor that there
879 			 * is no TSB configured. Also set TSTAT_TLB_STATS
880 			 * flag so that no user TSB is configured during
881 			 * context switch time.
882 			 */
883 			cpu_t *cp = CPU;
884 
885 			cp->cpu_m.cpu_tstat_flags |= TSTAT_TLB_STATS;
886 			(void) hv_set_ctx0(NULL, NULL);
887 			(void) hv_set_ctxnon0(NULL, NULL);
888 		}
889 	}
890 #endif
891 }
892 
893 /*
894  * This routine disables a CPU (vis a vis trapstat) by setting its %tba to be
895  * the actual, underlying trap table.  It is called out of cross call context.
896  */
897 static void
898 trapstat_disable()
899 {
900 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
901 
902 	if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
903 		return;
904 
905 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
906 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
907 
908 	if (!(tstat_options & TSTAT_OPT_NOGO))
909 		(void) set_tba((caddr_t)KERNELBASE);
910 
911 	tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
912 
913 #ifdef sun4v
914 	if ((tstat_options & TSTAT_OPT_TLBDATA) &&
915 	    !(tstat_options & TSTAT_OPT_NOGO)) {
916 		if (tstat_fast_tlbstat) {
917 			/*
918 			 * Invoke processor specific interface to disable
919 			 * collection of TSB hit statistics on each processor.
920 			 */
921 			cpu_trapstat_conf(CPU_TSTATCONF_DISABLE);
922 		} else {
923 			/*
924 			 * As part of collecting TLB miss statistics, we took
925 			 * over TLB miss handling from the hypervisor by
926 			 * telling the hypervisor that NO TSB is configured.
927 			 * We need to restore that by communicating proper
928 			 * kernel/user TSB information so that TLB misses
929 			 * can be handled by the hypervisor or the hardware
930 			 * more efficiently.
931 			 *
932 			 * We restore kernel TSB information right away.
933 			 * However, to minimize any locking dependency, we
934 			 * don't restore user TSB information right away.
935 			 * Instead, we simply clear the TSTAT_TLB_STATS flag
936 			 * so that the user TSB information is automatically
937 			 * restored on next context switch.
938 			 *
939 			 * Note that the call to restore kernel TSB information
940 			 * will normally not fail, unless wrong information is
941 			 * passed here. In that scenario, system will still
942 			 * continue to function properly with the exception of
943 			 * kernel handling all the TLB misses.
944 			 */
945 			struct hv_tsb_block *hvbp = &ksfmmup->sfmmu_hvblock;
946 			cpu_t *cp = CPU;
947 
948 			cp->cpu_m.cpu_tstat_flags &= ~TSTAT_TLB_STATS;
949 			(void) hv_set_ctx0(hvbp->hv_tsb_info_cnt,
950 			    hvbp->hv_tsb_info_pa);
951 		}
952 	}
953 #endif
954 }
955 
956 /*
957  * We use %tick as the time base when recording the time spent executing
958  * the trap handler.  %tick, however, is not necessarily kept in sync
959  * across CPUs (indeed, different CPUs may have different %tick frequencies).
960  * We therefore cross call onto a CPU to get a snapshot of its data to
961  * copy out; this is the routine executed out of that cross call.
962  */
963 static void
964 trapstat_snapshot()
965 {
966 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
967 	tstat_data_t *data = tcpu->tcpu_data;
968 
969 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
970 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
971 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ENABLED);
972 
973 	data->tdata_snapts = gethrtime();
974 	data->tdata_snaptick = rdtick();
975 	bcopy(data, tstat_buffer, tstat_data_t_size);
976 #ifdef sun4v
977 	/*
978 	 * Invoke processor specific interface to collect TSB hit
979 	 * statistics on each processor.
980 	 */
981 	if ((tstat_options & TSTAT_OPT_TLBDATA) && tstat_fast_tlbstat)
982 		cpu_trapstat_data((void *) tstat_buffer->tdata_pgsz,
983 		    tstat_pgszs);
984 #endif
985 }
986 
987 /*
988  * The TSTAT_RETENT_* constants define offsets in the TLB return entry.
989  * They are used only in trapstat_tlbretent() (below) and #undef'd
990  * immediately afterwards.  Any change to "retent" in trapstat_tlbretent()
991  * will likely require changes to these constants.
992  */
993 
994 #ifndef sun4v
995 #define	TSTAT_RETENT_STATHI	1
996 #define	TSTAT_RETENT_STATLO	2
997 #define	TSTAT_RETENT_SHIFT	11
998 #define	TSTAT_RETENT_COUNT_LD	13
999 #define	TSTAT_RETENT_COUNT_ST	15
1000 #define	TSTAT_RETENT_TMPTSHI	16
1001 #define	TSTAT_RETENT_TMPTSLO	17
1002 #define	TSTAT_RETENT_TIME_LD	19
1003 #define	TSTAT_RETENT_TIME_ST	21
1004 #else /* sun4v */
1005 #define	TSTAT_RETENT_TDATASHFT	2
1006 #define	TSTAT_RETENT_STATHI	4
1007 #define	TSTAT_RETENT_STATLO	6
1008 #define	TSTAT_RETENT_SHIFT	9
1009 #define	TSTAT_RETENT_COUNT_LD	11
1010 #define	TSTAT_RETENT_COUNT_ST	13
1011 #define	TSTAT_RETENT_TMPTSHI	14
1012 #define	TSTAT_RETENT_TMPTSLO	16
1013 #define	TSTAT_RETENT_TIME_LD	18
1014 #define	TSTAT_RETENT_TIME_ST	20
1015 #endif /* sun4v */
1016 
1017 static void
1018 trapstat_tlbretent(tstat_percpu_t *tcpu, tstat_tlbretent_t *ret,
1019     tstat_missdata_t *data)
1020 {
1021 	uint32_t *ent = ret->ttlbrent_instr, shift;
1022 	uintptr_t base;
1023 #ifndef sun4v
1024 	uintptr_t tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
1025 #else
1026 	uintptr_t tmptick = TSTAT_CPU0_DATA_OFFS(tcpu, tdata_tmptick);
1027 #endif
1028 
1029 	/*
1030 	 * This is the entry executed upon return from the TLB/TSB miss
1031 	 * handler (i.e. the code interpositioned between the "retry" and
1032 	 * the actual return to the TLB-missing instruction).  Detail on its
1033 	 * theory of operation can be found in the "TLB Statistics" section
1034 	 * of the block comment.  Note that we expect the TTE just loaded
1035 	 * into the TLB to be in %g5; all other globals are available as
1036 	 * scratch.  Finally, note that the page size information in sun4v is
1037 	 * located in the lower bits of the TTE -- requiring us to have a
1038 	 * different return entry on sun4v.
1039 	 */
1040 	static const uint32_t retent[TSTAT_TLBRET_NINSTR] = {
1041 #ifndef sun4v
1042 	    0x87410000,		/* rd    %tick, %g3			*/
1043 	    0x03000000, 	/* sethi %hi(stat), %g1			*/
1044 	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
1045 	    0x89297001,		/* sllx  %g5, 1, %g4			*/
1046 	    0x8931303e,		/* srlx  %g4, 62, %g4			*/
1047 	    0x8531702e,		/* srlx  %g5, 46, %g2			*/
1048 	    0x8408a004,		/* and   %g2, 4, %g2			*/
1049 	    0x88110002,		/* or    %g4, %g2, %g4			*/
1050 	    0x80a12005,		/* cmp   %g4, 5				*/
1051 	    0x34400002,		/* bg,a,pn %icc, +8			*/
1052 	    0x88102004,		/* mov   4, %g4				*/
1053 	    0x89292000,		/* sll   %g4, shift, %g4		*/
1054 	    0x82004004,		/* add   %g1, %g4, %g1			*/
1055 	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
1056 	    0x8400a001,		/* add   %g2, 1, %g2			*/
1057 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
1058 	    0x0d000000, 	/* sethi %hi(tdata_tmptick), %g6	*/
1059 	    0xc459a000, 	/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
1060 	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
1061 	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
1062 	    0x84008003,		/* add   %g2, %g3, %g2			*/
1063 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
1064 	    0x83f00000		/* retry				*/
1065 #else /* sun4v */
1066 	    0x82102008,		/* mov   SCRATCHPAD_CPUID, %g1 		*/
1067 	    0xced84400,		/* ldxa  [%g1]ASI_SCRATCHPAD, %g7	*/
1068 	    0x8f29f000,		/* sllx  %g7, TSTAT_DATA_SHIFT, %g7	*/
1069 	    0x87410000,		/* rd    %tick, %g3			*/
1070 	    0x03000000, 	/* sethi %hi(stat), %g1			*/
1071 	    0x82004007,		/* add   %g1, %g7, %g1			*/
1072 	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
1073 	    0x8929703d,		/* sllx  %g5, 61, %g4			*/
1074 	    0x8931303d,		/* srlx  %g4, 61, %g4			*/
1075 	    0x89292000,		/* sll   %g4, shift, %g4		*/
1076 	    0x82004004,		/* add   %g1, %g4, %g1			*/
1077 	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
1078 	    0x8400a001,		/* add   %g2, 1, %g2			*/
1079 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
1080 	    0x0d000000, 	/* sethi %hi(tdata_tmptick), %g6	*/
1081 	    0x8c018007,		/* add   %g6, %g7, %g6			*/
1082 	    0xc459a000, 	/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
1083 	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
1084 	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
1085 	    0x84008003,		/* add   %g2, %g3, %g2			*/
1086 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
1087 	    0x83f00000		/* retry				*/
1088 #endif /* sun4v */
1089 	};
1090 
1091 	ASSERT(MUTEX_HELD(&tstat_lock));
1092 	/*CONSTCOND*/
1093 	ASSERT(offsetof(tstat_missdata_t, tmiss_count) <= LO10(-1));
1094 	/*CONSTCOND*/
1095 	ASSERT(offsetof(tstat_missdata_t, tmiss_time) <= LO10(-1));
1096 	/*CONSTCOND*/
1097 	ASSERT(!((sizeof (tstat_pgszdata_t) - 1) & sizeof (tstat_pgszdata_t)));
1098 
1099 	for (shift = 1; (1 << shift) != sizeof (tstat_pgszdata_t); shift++)
1100 		continue;
1101 
1102 	base = (uintptr_t)tcpu->tcpu_ibase + TSTAT_INSTR_SIZE +
1103 	    ((uintptr_t)data - (uintptr_t)tcpu->tcpu_data);
1104 
1105 	bcopy(retent, ent, sizeof (retent));
1106 
1107 #if defined(sun4v)
1108 	ent[TSTAT_RETENT_TDATASHFT] |= LO10((uintptr_t)TSTAT_DATA_SHIFT);
1109 #endif
1110 	ent[TSTAT_RETENT_STATHI] |= HI22(base);
1111 	ent[TSTAT_RETENT_STATLO] |= LO10(base);
1112 	ent[TSTAT_RETENT_SHIFT] |= shift;
1113 	/* LINTED E_EXPR_NULL_EFFECT */
1114 	ent[TSTAT_RETENT_COUNT_LD] |= offsetof(tstat_missdata_t, tmiss_count);
1115 	/* LINTED E_EXPR_NULL_EFFECT */
1116 	ent[TSTAT_RETENT_COUNT_ST] |= offsetof(tstat_missdata_t, tmiss_count);
1117 	ent[TSTAT_RETENT_TMPTSHI] |= HI22(tmptick);
1118 	ent[TSTAT_RETENT_TMPTSLO] |= LO10(tmptick);
1119 	ent[TSTAT_RETENT_TIME_LD] |= offsetof(tstat_missdata_t, tmiss_time);
1120 	ent[TSTAT_RETENT_TIME_ST] |= offsetof(tstat_missdata_t, tmiss_time);
1121 }
1122 
1123 #if defined(sun4v)
1124 #undef TSTAT_RETENT_TDATASHFT
1125 #endif
1126 #undef TSTAT_RETENT_STATHI
1127 #undef TSTAT_RETENT_STATLO
1128 #undef TSTAT_RETENT_SHIFT
1129 #undef TSTAT_RETENT_COUNT_LD
1130 #undef TSTAT_RETENT_COUNT_ST
1131 #undef TSTAT_RETENT_TMPTSHI
1132 #undef TSTAT_RETENT_TMPTSLO
1133 #undef TSTAT_RETENT_TIME_LD
1134 #undef TSTAT_RETENT_TIME_ST
1135 
1136 /*
1137  * The TSTAT_TLBENT_* constants define offsets in the TLB entry.  They are
1138  * used only in trapstat_tlbent() (below) and #undef'd immediately afterwards.
1139  * Any change to "tlbent" in trapstat_tlbent() will likely require changes
1140  * to these constants.
1141  */
1142 
1143 #ifndef sun4v
1144 #define	TSTAT_TLBENT_STATHI	0
1145 #define	TSTAT_TLBENT_STATLO_LD	1
1146 #define	TSTAT_TLBENT_STATLO_ST	3
1147 #define	TSTAT_TLBENT_MMUASI	15
1148 #define	TSTAT_TLBENT_TPCHI	18
1149 #define	TSTAT_TLBENT_TPCLO_USER	19
1150 #define	TSTAT_TLBENT_TPCLO_KERN	21
1151 #define	TSTAT_TLBENT_TSHI	25
1152 #define	TSTAT_TLBENT_TSLO	27
1153 #define	TSTAT_TLBENT_BA		28
1154 #else /* sun4v */
1155 #define	TSTAT_TLBENT_TDATASHFT	2
1156 #define	TSTAT_TLBENT_STATHI	3
1157 #define	TSTAT_TLBENT_STATLO_LD	5
1158 #define	TSTAT_TLBENT_STATLO_ST	7
1159 #define	TSTAT_TLBENT_TAGTARGET	23
1160 #define	TSTAT_TLBENT_TPCHI	25
1161 #define	TSTAT_TLBENT_TPCLO_USER	26
1162 #define	TSTAT_TLBENT_TPCLO_KERN	28
1163 #define	TSTAT_TLBENT_TSHI	32
1164 #define	TSTAT_TLBENT_TSLO	35
1165 #define	TSTAT_TLBENT_BA		36
1166 #endif /* sun4v */
1167 
1168 static void
1169 trapstat_tlbent(tstat_percpu_t *tcpu, int entno)
1170 {
1171 	uint32_t *ent;
1172 	uintptr_t orig, va, baoffs;
1173 #ifndef sun4v
1174 	int itlb = entno == TSTAT_ENT_ITLBMISS;
1175 	uint32_t asi = itlb ? ASI(ASI_IMMU) : ASI(ASI_DMMU);
1176 #else
1177 	int itlb = (entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_ITLBMISS);
1178 	uint32_t tagtarget_off = itlb ? MMFSA_I_CTX : MMFSA_D_CTX;
1179 	uint32_t *tent;			/* MMU trap vector entry */
1180 	uintptr_t tentva;		/* MMU trap vector entry va */
1181 	static const uint32_t mmumiss[TSTAT_ENT_NINSTR] = {
1182 	    0x30800000,			/* ba,a addr */
1183 	    NOP, NOP, NOP, NOP, NOP, NOP, NOP
1184 	};
1185 #endif
1186 	int entoffs = entno << TSTAT_ENT_SHIFT;
1187 	uintptr_t tmptick, stat, tpc, utpc;
1188 	tstat_pgszdata_t *data = &tcpu->tcpu_data->tdata_pgsz[0];
1189 	tstat_tlbdata_t *udata, *kdata;
1190 	tstat_tlbret_t *ret;
1191 
1192 	/*
1193 	 * When trapstat is run with TLB statistics, this is the entry for
1194 	 * both I- and D-TLB misses; this code performs trap level pushing,
1195 	 * as described in the "TLB Statistics" section of the block comment.
1196 	 * This code is executing at TL 1; %tstate[0] contains the saved
1197 	 * state at the time of the TLB miss.  Pushing trap level 1 (and thus
1198 	 * raising TL to 2) requires us to fill in %tstate[1] with our %pstate,
1199 	 * %cwp and %asi.  We leave %tt unchanged, and we set %tpc and %tnpc to
1200 	 * the appropriate TLB return entry (based on the context of the miss).
1201 	 * Finally, we sample %tick, and stash it in the tdata_tmptick member
1202 	 * the per-CPU tstat_data structure.  tdata_tmptick will be used in
1203 	 * the TLB return entry to determine the amount of time spent in the
1204 	 * TLB miss handler.
1205 	 *
1206 	 * Note that on sun4v platforms, we must obtain the context information
1207 	 * from the MMU fault status area. (The base address of this MMU fault
1208 	 * status area is kept in the scratchpad register 0.)
1209 	 */
1210 	static const uint32_t tlbent[] = {
1211 #ifndef sun4v
1212 	    0x03000000, 		/* sethi %hi(stat), %g1		*/
1213 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1214 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1215 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1216 	    0x85524000,			/* rdpr  %cwp, %g2		*/
1217 	    0x87518000,			/* rdpr  %pstate, %g3		*/
1218 	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
1219 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1220 	    0x8740c000,			/* rd    %asi, %g3		*/
1221 	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
1222 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1223 	    0x8350c000,			/* rdpr  %tt, %g1		*/
1224 	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
1225 	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
1226 	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
1227 	    0xc2d80000,			/* ldxa  [%g0]ASI_MMU, %g1	*/
1228 	    0x83307030,			/* srlx  %g1, CTXSHIFT, %g1	*/
1229 	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
1230 	    0x03000000, 		/* sethi %hi(new_tpc), %g1	*/
1231 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1232 	    0x30800002,			/* ba,a  .+0x8			*/
1233 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1234 	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
1235 	    0x82006004,			/* add   %g1, 4, %g1		*/
1236 	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
1237 	    0x03000000, 		/* sethi %hi(tmptick), %g1	*/
1238 	    0x85410000,			/* rd    %tick, %g2		*/
1239 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
1240 	    0x30800000,			/* ba,a  addr			*/
1241 	    NOP, NOP, NOP
1242 #else /* sun4v */
1243 	    0x82102008,			/* mov SCRATCHPAD_CPUID, %g1	*/
1244 	    0xc8d84400,			/* ldxa [%g1]ASI_SCRATCHPAD, %g4 */
1245 	    0x89293000,			/* sllx %g4, TSTAT_DATA_SHIFT, %g4 */
1246 	    0x03000000, 		/* sethi %hi(stat), %g1		*/
1247 	    0x82004004,			/* add %g1, %g4, %g1		*/
1248 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1249 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1250 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1251 	    0x85524000,			/* rdpr  %cwp, %g2		*/
1252 	    0x87518000,			/* rdpr  %pstate, %g3		*/
1253 	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
1254 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1255 	    0x8740c000,			/* rd    %asi, %g3		*/
1256 	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
1257 	    0x83540000,			/* rdpr  %gl, %g1		*/
1258 	    0x83287028,			/* sllx  %g1, 40, %g1		*/
1259 	    0x86104003,			/* or    %g1, %g3, %g3		*/
1260 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1261 	    0x8350c000,			/* rdpr  %tt, %g1		*/
1262 	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
1263 	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
1264 	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
1265 	    0xc2d80400,			/* ldxa  [%g0]ASI_SCRATCHPAD, %g1 */
1266 	    0xc2586000,			/* ldx  [%g1 + MMFSA_?_CTX], %g1 */
1267 	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
1268 	    0x03000000, 		/* sethi %hi(new_tpc), %g1	*/
1269 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1270 	    0x30800002,			/* ba,a  .+0x8			*/
1271 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1272 	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
1273 	    0x82006004,			/* add   %g1, 4, %g1		*/
1274 	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
1275 	    0x03000000, 		/* sethi %hi(tmptick), %g1	*/
1276 	    0x82004004,			/* add %g1, %g4, %g1		*/
1277 	    0x85410000,			/* rd    %tick, %g2		*/
1278 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
1279 	    0x30800000			/* ba,a  addr			*/
1280 #endif /* sun4v */
1281 	};
1282 
1283 	ASSERT(MUTEX_HELD(&tstat_lock));
1284 #ifndef sun4v
1285 	ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS);
1286 
1287 	stat = TSTAT_DATA_OFFS(tcpu, tdata_traps) + entoffs;
1288 	tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
1289 #else /* sun4v */
1290 	ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS ||
1291 	    entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_DMMUMISS);
1292 
1293 	stat = TSTAT_CPU0_DATA_OFFS(tcpu, tdata_traps) + entoffs;
1294 	tmptick = TSTAT_CPU0_DATA_OFFS(tcpu, tdata_tmptick);
1295 #endif /* sun4v */
1296 
1297 	if (itlb) {
1298 		ret = &tcpu->tcpu_instr->tinst_itlbret;
1299 		udata = &data->tpgsz_user.tmode_itlb;
1300 		kdata = &data->tpgsz_kernel.tmode_itlb;
1301 		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_itlbret.ttlbr_ktlb);
1302 	} else {
1303 		ret = &tcpu->tcpu_instr->tinst_dtlbret;
1304 		udata = &data->tpgsz_user.tmode_dtlb;
1305 		kdata = &data->tpgsz_kernel.tmode_dtlb;
1306 		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_dtlbret.ttlbr_ktlb);
1307 	}
1308 
1309 	utpc = tpc + offsetof(tstat_tlbret_t, ttlbr_utlb) -
1310 	    offsetof(tstat_tlbret_t, ttlbr_ktlb);
1311 
1312 	ASSERT(HI22(tpc) == HI22(utpc));
1313 
1314 	ent = (uint32_t *)((uintptr_t)tcpu->tcpu_instr + entoffs);
1315 	orig = KERNELBASE + entoffs;
1316 	va = (uintptr_t)tcpu->tcpu_ibase + entoffs;
1317 	baoffs = TSTAT_TLBENT_BA * sizeof (uint32_t);
1318 
1319 #ifdef sun4v
1320 	/*
1321 	 * Because of lack of space, interposing tlbent trap handler
1322 	 * for TLB and MMU miss traps cannot be placed in-line. Instead,
1323 	 * we copy it to the space set aside for shared trap handlers
1324 	 * continuation in the interposing trap table and invoke it by
1325 	 * placing a branch in the trap table itself.
1326 	 */
1327 	tent = ent;		/* trap vector entry */
1328 	tentva = va;		/* trap vector entry va */
1329 
1330 	if (itlb) {
1331 		ent = (uint32_t *)((uintptr_t)
1332 		    &tcpu->tcpu_instr->tinst_immumiss);
1333 		va = TSTAT_INSTR_OFFS(tcpu, tinst_immumiss);
1334 	} else {
1335 		ent = (uint32_t *)((uintptr_t)
1336 		    &tcpu->tcpu_instr->tinst_dmmumiss);
1337 		va = TSTAT_INSTR_OFFS(tcpu, tinst_dmmumiss);
1338 	}
1339 	bcopy(mmumiss, tent, sizeof (mmumiss));
1340 	tent[0] |= DISP22(tentva, va);
1341 #endif /* sun4v */
1342 
1343 	bcopy(tlbent, ent, sizeof (tlbent));
1344 
1345 #if defined(sun4v)
1346 	ent[TSTAT_TLBENT_TDATASHFT] |= LO10((uintptr_t)TSTAT_DATA_SHIFT);
1347 #endif
1348 	ent[TSTAT_TLBENT_STATHI] |= HI22(stat);
1349 	ent[TSTAT_TLBENT_STATLO_LD] |= LO10(stat);
1350 	ent[TSTAT_TLBENT_STATLO_ST] |= LO10(stat);
1351 #ifndef sun4v
1352 	ent[TSTAT_TLBENT_MMUASI] |= asi;
1353 #else
1354 	ent[TSTAT_TLBENT_TAGTARGET] |= tagtarget_off;
1355 #endif
1356 	ent[TSTAT_TLBENT_TPCHI] |= HI22(tpc);
1357 	ent[TSTAT_TLBENT_TPCLO_USER] |= LO10(utpc);
1358 	ent[TSTAT_TLBENT_TPCLO_KERN] |= LO10(tpc);
1359 	ent[TSTAT_TLBENT_TSHI] |= HI22(tmptick);
1360 	ent[TSTAT_TLBENT_TSLO] |= LO10(tmptick);
1361 	ent[TSTAT_TLBENT_BA] |= DISP22(va + baoffs, orig);
1362 
1363 	/*
1364 	 * And now set up the TLB return entries.
1365 	 */
1366 	trapstat_tlbretent(tcpu, &ret->ttlbr_ktlb, &kdata->ttlb_tlb);
1367 	trapstat_tlbretent(tcpu, &ret->ttlbr_ktsb, &kdata->ttlb_tsb);
1368 	trapstat_tlbretent(tcpu, &ret->ttlbr_utlb, &udata->ttlb_tlb);
1369 	trapstat_tlbretent(tcpu, &ret->ttlbr_utsb, &udata->ttlb_tsb);
1370 }
1371 
1372 #if defined(sun4v)
1373 #undef TSTAT_TLBENT_TDATASHFT
1374 #endif
1375 #undef TSTAT_TLBENT_STATHI
1376 #undef TSTAT_TLBENT_STATLO_LD
1377 #undef TSTAT_TLBENT_STATLO_ST
1378 #ifndef sun4v
1379 #undef TSTAT_TLBENT_MMUASI
1380 #else
1381 #undef TSTAT_TLBENT_TAGTARGET
1382 #endif
1383 #undef TSTAT_TLBENT_TPCHI
1384 #undef TSTAT_TLBENT_TPCLO_USER
1385 #undef TSTAT_TLBENT_TPCLO_KERN
1386 #undef TSTAT_TLBENT_TSHI
1387 #undef TSTAT_TLBENT_TSLO
1388 #undef TSTAT_TLBENT_BA
1389 
1390 /*
1391  * The TSTAT_ENABLED_* constants define offsets in the enabled entry; the
1392  * TSTAT_DISABLED_BA constant defines an offset in the disabled entry.  Both
1393  * sets of constants are used only in trapstat_make_traptab() (below) and
1394  * #undef'd immediately afterwards.  Any change to "enabled" or "disabled"
1395  * in trapstat_make_traptab() will likely require changes to these constants.
1396  */
1397 #ifndef sun4v
1398 #define	TSTAT_ENABLED_STATHI	0
1399 #define	TSTAT_ENABLED_STATLO_LD	1
1400 #define	TSTAT_ENABLED_STATLO_ST 3
1401 #define	TSTAT_ENABLED_BA	4
1402 #define	TSTAT_DISABLED_BA	0
1403 
1404 static void
1405 trapstat_make_traptab(tstat_percpu_t *tcpu)
1406 {
1407 	uint32_t *ent;
1408 	uint64_t *stat;
1409 	uintptr_t orig, va, en_baoffs, dis_baoffs;
1410 	int nent;
1411 
1412 	/*
1413 	 * This is the entry in the interposing trap table for enabled trap
1414 	 * table entries.  It loads a counter, increments it and stores it
1415 	 * back before branching to the actual trap table entry.
1416 	 */
1417 	static const uint32_t enabled[TSTAT_ENT_NINSTR] = {
1418 	    0x03000000, 		/* sethi %hi(stat), %g1		*/
1419 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1420 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1421 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1422 	    0x30800000,			/* ba,a addr			*/
1423 	    NOP, NOP, NOP
1424 	};
1425 
1426 	/*
1427 	 * This is the entry in the interposing trap table for disabled trap
1428 	 * table entries.  It simply branches to the actual, underlying trap
1429 	 * table entry.  As explained in the "Implementation Details" section
1430 	 * of the block comment, all TL>0 traps _must_ use the disabled entry;
1431 	 * additional entries may be explicitly disabled through the use
1432 	 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY.
1433 	 */
1434 	static const uint32_t disabled[TSTAT_ENT_NINSTR] = {
1435 	    0x30800000,			/* ba,a addr			*/
1436 	    NOP, NOP, NOP, NOP, NOP, NOP, NOP,
1437 	};
1438 
1439 	ASSERT(MUTEX_HELD(&tstat_lock));
1440 
1441 	ent = tcpu->tcpu_instr->tinst_traptab;
1442 	stat = (uint64_t *)TSTAT_DATA_OFFS(tcpu, tdata_traps);
1443 	orig = KERNELBASE;
1444 	va = (uintptr_t)tcpu->tcpu_ibase;
1445 	en_baoffs = TSTAT_ENABLED_BA * sizeof (uint32_t);
1446 	dis_baoffs = TSTAT_DISABLED_BA * sizeof (uint32_t);
1447 
1448 	for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) {
1449 		if (tstat_enabled[nent]) {
1450 			bcopy(enabled, ent, sizeof (enabled));
1451 			ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat);
1452 			ent[TSTAT_ENABLED_STATLO_LD] |= LO10((uintptr_t)stat);
1453 			ent[TSTAT_ENABLED_STATLO_ST] |= LO10((uintptr_t)stat);
1454 			ent[TSTAT_ENABLED_BA] |= DISP22(va + en_baoffs, orig);
1455 		} else {
1456 			bcopy(disabled, ent, sizeof (disabled));
1457 			ent[TSTAT_DISABLED_BA] |= DISP22(va + dis_baoffs, orig);
1458 		}
1459 
1460 		stat++;
1461 		orig += sizeof (enabled);
1462 		ent += sizeof (enabled) / sizeof (*ent);
1463 		va += sizeof (enabled);
1464 	}
1465 }
1466 
1467 #undef TSTAT_ENABLED_STATHI
1468 #undef TSTAT_ENABLED_STATLO_LD
1469 #undef TSTAT_ENABLED_STATLO_ST
1470 #undef TSTAT_ENABLED_BA
1471 #undef TSTAT_DISABLED_BA
1472 
1473 #else /* sun4v */
1474 
1475 #define	TSTAT_ENABLED_STATHI	0
1476 #define	TSTAT_ENABLED_STATLO	1
1477 #define	TSTAT_ENABLED_ADDRHI	2
1478 #define	TSTAT_ENABLED_ADDRLO	3
1479 #define	TSTAT_ENABLED_CONTBA	6
1480 #define	TSTAT_ENABLED_TDATASHFT	7
1481 #define	TSTAT_DISABLED_BA	0
1482 
1483 static void
1484 trapstat_make_traptab(tstat_percpu_t *tcpu)
1485 {
1486 	uint32_t *ent;
1487 	uint64_t *stat;
1488 	uintptr_t orig, va, en_baoffs, dis_baoffs;
1489 	uintptr_t tstat_cont_va;
1490 	int nent;
1491 
1492 	/*
1493 	 * This is the entry in the interposing trap table for enabled trap
1494 	 * table entries.  It loads a counter, increments it and stores it
1495 	 * back before branching to the actual trap table entry.
1496 	 *
1497 	 * All CPUs share the same interposing trap entry to count the
1498 	 * number of traps. Note that the trap counter is kept in per CPU
1499 	 * trap statistics area. Its address is obtained dynamically by
1500 	 * adding the offset of that CPU's trap statistics area from CPU 0
1501 	 * (i.e. cpu_id * TSTAT_DATA_SIZE) to the address of the CPU 0
1502 	 * trap counter already coded in the interposing trap entry itself.
1503 	 *
1504 	 * Since this interposing code sequence to count traps takes more
1505 	 * than 8 instructions, it's split in two parts as follows:
1506 	 *
1507 	 *   tstat_trapcnt:
1508 	 *	sethi %hi(stat), %g1
1509 	 *	or    %g1, %lo[stat), %g1	! %g1 = CPU0 trap counter addr
1510 	 *	sethi %hi(addr), %g2
1511 	 *	or    %g2, %lo(addr), %g2	! %g2 = real trap handler addr
1512 	 *	mov   ASI_SCRATCHPAD_CPUID, %g3
1513 	 *	ldxa [%g3]ASI_SCRATCHPAD, %g3	! %g3 = CPU ID
1514 	 *	ba tstat_trapcnt_cont		! branch to tstat_trapcnt_cont
1515 	 *	sllx %g3, TSTAT_DATA_SHIFT, %g3	! %g3 = CPU trapstat data offset
1516 	 *
1517 	 *   tstat_trapcnt_cont:
1518 	 *	ldx [%g1 + %g3], %g4		! get counter value
1519 	 *	add %g4, 1, %g4			! increment value
1520 	 *	jmp %g2				! jump to original trap handler
1521 	 *	stx %g4, [%g1 + %g3]		! store counter value
1522 	 *
1523 	 * First part, i.e. tstat_trapcnt, is per trap and is kept in-line in
1524 	 * the interposing trap table. However, the tstat_trapcnt_cont code
1525 	 * sequence is shared by all traps and is kept right after the
1526 	 * the interposing trap table.
1527 	 */
1528 	static const uint32_t enabled[TSTAT_ENT_NINSTR] = {
1529 	    0x03000000, 		/* sethi %hi(stat), %g1		*/
1530 	    0x82106000,			/* or   %g1, %lo[stat), %g1	*/
1531 	    0x05000000, 		/* sethi %hi(addr), %g2		*/
1532 	    0x8410a000,			/* or   %g2, %lo(addr), %g2	*/
1533 	    0x86102008,			/* mov	ASI_SCRATCHPAD_CPUID, %g3 */
1534 	    0xc6d8c400,			/* ldxa [%g3]ASI_SCRATCHPAD, %g3 */
1535 	    0x10800000,			/* ba enabled_cont		*/
1536 	    0x8728f000			/* sllx %g3, TSTAT_DATA_SHIFT, %g3 */
1537 	};
1538 
1539 	static const uint32_t enabled_cont[TSTAT_ENT_NINSTR] = {
1540 	    0xc8584003, 		/* ldx [%g1 + %g3], %g4		*/
1541 	    0x88012001,			/* add %g4, 1, %g4		*/
1542 	    0x81c08000,			/* jmp %g2			*/
1543 	    0xc8704003,			/* stx %g4, [%g1 + %g3]		*/
1544 	    NOP, NOP, NOP, NOP
1545 	};
1546 
1547 	/*
1548 	 * This is the entry in the interposing trap table for disabled trap
1549 	 * table entries.  It simply branches to the actual, underlying trap
1550 	 * table entry.  As explained in the "Implementation Details" section
1551 	 * of the block comment, all TL>0 traps _must_ use the disabled entry;
1552 	 * additional entries may be explicitly disabled through the use
1553 	 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY.
1554 	 */
1555 	static const uint32_t disabled[TSTAT_ENT_NINSTR] = {
1556 	    0x30800000,			/* ba,a addr			*/
1557 	    NOP, NOP, NOP, NOP, NOP, NOP, NOP,
1558 	};
1559 
1560 	ASSERT(MUTEX_HELD(&tstat_lock));
1561 	ent = tcpu->tcpu_instr->tinst_traptab;
1562 	stat = (uint64_t *)TSTAT_CPU0_DATA_OFFS(tcpu, tdata_traps);
1563 	orig = KERNELBASE;
1564 	va = (uintptr_t)tcpu->tcpu_ibase;
1565 	en_baoffs = TSTAT_ENABLED_CONTBA * sizeof (uint32_t);
1566 	dis_baoffs = TSTAT_DISABLED_BA * sizeof (uint32_t);
1567 	tstat_cont_va = TSTAT_INSTR_OFFS(tcpu, tinst_trapcnt);
1568 
1569 	for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) {
1570 		if (tstat_enabled[nent]) {
1571 			bcopy(enabled, ent, sizeof (enabled));
1572 			ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat);
1573 			ent[TSTAT_ENABLED_STATLO] |= LO10((uintptr_t)stat);
1574 			ent[TSTAT_ENABLED_ADDRHI] |= HI22((uintptr_t)orig);
1575 			ent[TSTAT_ENABLED_ADDRLO] |= LO10((uintptr_t)orig);
1576 			ent[TSTAT_ENABLED_CONTBA] |=
1577 			    DISP22(va + en_baoffs, tstat_cont_va);
1578 			ent[TSTAT_ENABLED_TDATASHFT] |=
1579 			    LO10((uintptr_t)TSTAT_DATA_SHIFT);
1580 		} else {
1581 			bcopy(disabled, ent, sizeof (disabled));
1582 			ent[TSTAT_DISABLED_BA] |= DISP22(va + dis_baoffs, orig);
1583 		}
1584 
1585 		stat++;
1586 		orig += sizeof (enabled);
1587 		ent += sizeof (enabled) / sizeof (*ent);
1588 		va += sizeof (enabled);
1589 	}
1590 	bcopy(enabled_cont, (uint32_t *)tcpu->tcpu_instr->tinst_trapcnt,
1591 	    sizeof (enabled_cont));
1592 }
1593 
1594 #undef	TSTAT_ENABLED_TDATASHFT
1595 #undef	TSTAT_ENABLED_STATHI
1596 #undef	TSTAT_ENABLED_STATLO
1597 #undef	TSTAT_ENABLED_ADDRHI
1598 #undef	TSTAT_ENABLED_ADDRLO
1599 #undef	TSTAT_ENABLED_CONTBA
1600 #undef	TSTAT_DISABLED_BA
1601 
1602 #endif /* sun4v */
1603 
1604 #ifndef sun4v
1605 /*
1606  * See Section A.6 in SPARC v9 Manual.
1607  * max branch = 4*((2^21)-1) = 8388604
1608  */
1609 #define	MAX_BICC_BRANCH_DISPLACEMENT (4 * ((1 << 21) - 1))
1610 #endif
1611 
/*
 * Set up the interposing trap table resources for the given CPU and have
 * that CPU load the corresponding mappings.
 *
 * The caller must hold both cpu_lock and tstat_lock, and the CPU must be
 * TSTAT_CPU_SELECTED with no resources currently allocated.  On return the
 * CPU is marked TSTAT_CPU_ALLOCATED (it is not yet TSTAT_CPU_ENABLED).
 *
 * On sun4u each CPU gets its own instruction and data pages out of
 * tstat_arena.  On sun4v every CPU points at the single region at tstat_va:
 * the instruction area is shared and each CPU's data area is located at
 * cpu * TSTAT_DATA_SIZE past it, so the trap table is built only once.
 */
static void
trapstat_setup(processorid_t cpu)
{
	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
#ifndef sun4v
	int i;
	caddr_t va;
	pfn_t *pfn;
	cpu_t *cp;
	uint_t strand_idx;
	size_t tstat_offset;
#endif

	ASSERT(tcpu->tcpu_pfn == NULL);
	ASSERT(tcpu->tcpu_instr == NULL);
	ASSERT(tcpu->tcpu_data == NULL);
	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&tstat_lock));

#ifndef sun4v
	/*
	 * The lower fifteen bits of the %tba are always read as zero; we must
	 * align our instruction base address appropriately.
	 */
	tstat_offset = tstat_total_size;

	cp = cpu_get(cpu);
	ASSERT(cp != NULL);
	if ((strand_idx = cpu ^ pg_plat_hw_instance_id(cp, PGHW_IPIPE)) != 0) {
		/*
		 * On sun4u platforms with multiple CPUs sharing the MMU
		 * (Olympus-C has 2 strands per core), each CPU uses a
		 * disjoint trap table.  The indexing is based on the
		 * strand id, which is obtained by XOR'ing the cpuid with
		 * the coreid.
		 */
		tstat_offset += tstat_total_size * strand_idx;

		/*
		 * Offset must be less than the maximum PC-relative branch
		 * displacement for Bicc variants.  See the Implementation
		 * Details comment.
		 */
		ASSERT(tstat_offset <= MAX_BICC_BRANCH_DISPLACEMENT);
	}

	/*
	 * The interposing table lives just below KERNELBASE, with the data
	 * area immediately following the instruction area.
	 */
	tcpu->tcpu_ibase = (caddr_t)((KERNELBASE - tstat_offset)
	    & TSTAT_TBA_MASK);
	tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE;
	tcpu->tcpu_vabase = tcpu->tcpu_ibase;

	/*
	 * NOTE(review): the request (and the bzero) is tstat_total_pages
	 * bytes, yet tstat_total_pages pfn_t entries are stored through
	 * 'pfn' below.  Presumably this is safe only because tstat_arena
	 * hands out MMU_PAGESIZE-quantum allocations -- confirm.  The
	 * matching vmem_free() in trapstat_teardown() passes the same size.
	 */
	tcpu->tcpu_pfn = vmem_alloc(tstat_arena, tstat_total_pages, VM_SLEEP);
	bzero(tcpu->tcpu_pfn, tstat_total_pages);
	pfn = tcpu->tcpu_pfn;

	tcpu->tcpu_instr = vmem_alloc(tstat_arena, TSTAT_INSTR_SIZE, VM_SLEEP);

	/* Record the pfn backing each instruction page. */
	va = (caddr_t)tcpu->tcpu_instr;
	for (i = 0; i < TSTAT_INSTR_PAGES; i++, va += MMU_PAGESIZE)
		*pfn++ = hat_getpfnum(kas.a_hat, va);

	/*
	 * We must be sure that the pages that we will use to examine the data
	 * have the same virtual color as the pages to which the data is being
	 * recorded, hence the alignment and phase constraints on the
	 * allocation.
	 */
	tcpu->tcpu_data = vmem_xalloc(tstat_arena, tstat_data_size,
	    shm_alignment, (uintptr_t)tcpu->tcpu_dbase & (shm_alignment - 1),
	    0, 0, NULL, VM_SLEEP);
	bzero(tcpu->tcpu_data, tstat_data_size);
	tcpu->tcpu_data->tdata_cpuid = cpu;

	/* Record the pfn backing each data page, after the instruction pfns. */
	va = (caddr_t)tcpu->tcpu_data;
	for (i = 0; i < tstat_data_pages; i++, va += MMU_PAGESIZE)
		*pfn++ = hat_getpfnum(kas.a_hat, va);

	/*
	 * Now that we have all of the instruction and data pages allocated,
	 * make the trap table from scratch.
	 */
	trapstat_make_traptab(tcpu);

	if (tstat_options & TSTAT_OPT_TLBDATA) {
		/*
		 * TLB Statistics have been specified; set up the I- and D-TLB
		 * entries and corresponding TLB return entries.
		 */
		trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
		trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
	}

#else /* sun4v */

	/*
	 * The lower fifteen bits of the %tba are always read as zero; hence
	 * it must be aligned at least on 512K boundary.
	 *
	 * NOTE(review): fifteen zero bits correspond to 32K alignment, not
	 * 512K; the base chosen here is 4M below KERNELBASE.  Confirm which
	 * figure the sentence above intends.
	 */
	tcpu->tcpu_vabase = (caddr_t)(KERNELBASE - MMU_PAGESIZE4M);
	tcpu->tcpu_ibase = tcpu->tcpu_vabase;
	tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE +
	    cpu * TSTAT_DATA_SIZE;

	/*
	 * All CPUs reference the one region at tstat_va; only the data
	 * area is per CPU.
	 */
	tcpu->tcpu_pfn = &tstat_pfn;
	tcpu->tcpu_instr = (tstat_instr_t *)tstat_va;
	tcpu->tcpu_data = (tstat_data_t *)(tstat_va + TSTAT_INSTR_SIZE +
	    cpu * TSTAT_DATA_SIZE);
	bzero(tcpu->tcpu_data, TSTAT_DATA_SIZE);
	tcpu->tcpu_data->tdata_cpuid = cpu;

	/*
	 * Now that we have all of the instruction and data pages allocated,
	 * make the trap table from scratch. It should be done only once
	 * as it is shared by all CPUs.
	 */
	if (!tstat_traptab_initialized)
		trapstat_make_traptab(tcpu);

	if (tstat_options & TSTAT_OPT_TLBDATA) {
		/*
		 * TLB Statistics have been specified; set up the I- and D-TLB
		 * entries and corresponding TLB return entries.
		 */
		if (!tstat_traptab_initialized) {
			if (tstat_fast_tlbstat) {
				trapstat_tlbent(tcpu, TSTAT_ENT_IMMUMISS);
				trapstat_tlbent(tcpu, TSTAT_ENT_DMMUMISS);
			} else {
				trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
				trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
			}
		}
	}
	/* Later calls skip the shared table; trapstat_stop() resets this. */
	tstat_traptab_initialized = 1;
#endif /* sun4v */

	tcpu->tcpu_flags |= TSTAT_CPU_ALLOCATED;

	/*
	 * Finally, get the target CPU to load the locked pages into its TLBs.
	 */
	xc_one(cpu, (xcfunc_t *)trapstat_load_tlb, 0, 0);
}
1757 
1758 static void
1759 trapstat_teardown(processorid_t cpu)
1760 {
1761 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1762 #ifndef sun4v
1763 	int i;
1764 #endif
1765 	caddr_t va = tcpu->tcpu_vabase;
1766 
1767 	ASSERT(tcpu->tcpu_pfn != NULL);
1768 	ASSERT(tcpu->tcpu_instr != NULL);
1769 	ASSERT(tcpu->tcpu_data != NULL);
1770 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1771 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1772 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1773 	ASSERT(MUTEX_HELD(&cpu_lock));
1774 	ASSERT(MUTEX_HELD(&tstat_lock));
1775 
1776 #ifndef sun4v
1777 	vmem_free(tstat_arena, tcpu->tcpu_pfn, tstat_total_pages);
1778 	vmem_free(tstat_arena, tcpu->tcpu_instr, TSTAT_INSTR_SIZE);
1779 	vmem_free(tstat_arena, tcpu->tcpu_data, tstat_data_size);
1780 
1781 	for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
1782 		xt_one(cpu, vtag_flushpage_tl1, (uint64_t)va,
1783 		    (uint64_t)ksfmmup);
1784 	}
1785 #else
1786 	xt_one(cpu, vtag_unmap_perm_tl1, (uint64_t)va, KCONTEXT);
1787 #endif
1788 
1789 	tcpu->tcpu_pfn = NULL;
1790 	tcpu->tcpu_instr = NULL;
1791 	tcpu->tcpu_data = NULL;
1792 	tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
1793 }
1794 
1795 static int
1796 trapstat_go()
1797 {
1798 	cpu_t *cp;
1799 
1800 	mutex_enter(&cpu_lock);
1801 	mutex_enter(&tstat_lock);
1802 
1803 	if (tstat_running) {
1804 		mutex_exit(&tstat_lock);
1805 		mutex_exit(&cpu_lock);
1806 		return (EBUSY);
1807 	}
1808 
1809 #ifdef sun4v
1810 	/*
1811 	 * Allocate large page to hold interposing tables.
1812 	 */
1813 	tstat_va = contig_mem_alloc(MMU_PAGESIZE4M);
1814 	tstat_pfn = va_to_pfn(tstat_va);
1815 	if (tstat_pfn == PFN_INVALID) {
1816 		mutex_exit(&tstat_lock);
1817 		mutex_exit(&cpu_lock);
1818 		return (EAGAIN);
1819 	}
1820 
1821 	/*
1822 	 * For detailed TLB statistics, invoke CPU specific interface
1823 	 * to see if it supports a low overhead interface to collect
1824 	 * TSB hit statistics. If so, make set tstat_fast_tlbstat flag
1825 	 * to reflect that.
1826 	 */
1827 	if (tstat_options & TSTAT_OPT_TLBDATA) {
1828 		int error;
1829 
1830 		tstat_fast_tlbstat = B_FALSE;
1831 		error = cpu_trapstat_conf(CPU_TSTATCONF_INIT);
1832 		if (error == 0)
1833 			tstat_fast_tlbstat = B_TRUE;
1834 		else if (error != ENOTSUP) {
1835 			contig_mem_free(tstat_va, MMU_PAGESIZE4M);
1836 			mutex_exit(&tstat_lock);
1837 			mutex_exit(&cpu_lock);
1838 			return (error);
1839 		}
1840 	}
1841 #endif /* sun4v */
1842 
1843 	/*
1844 	 * First, perform any necessary hot patching.
1845 	 */
1846 	trapstat_hotpatch();
1847 
1848 	/*
1849 	 * Allocate the resources we'll need to measure probe effect.
1850 	 */
1851 	trapstat_probe_alloc();
1852 
1853 
1854 	cp = cpu_list;
1855 	do {
1856 		if (!(tstat_percpu[cp->cpu_id].tcpu_flags & TSTAT_CPU_SELECTED))
1857 			continue;
1858 
1859 		trapstat_setup(cp->cpu_id);
1860 
1861 		/*
1862 		 * Note that due to trapstat_probe()'s use of global data,
1863 		 * we determine the probe effect on each CPU serially instead
1864 		 * of in parallel with an xc_all().
1865 		 */
1866 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_probe, 0, 0);
1867 	} while ((cp = cp->cpu_next) != cpu_list);
1868 
1869 	xc_all((xcfunc_t *)trapstat_enable, 0, 0);
1870 
1871 	trapstat_probe_free();
1872 	tstat_running = 1;
1873 	mutex_exit(&tstat_lock);
1874 	mutex_exit(&cpu_lock);
1875 
1876 	return (0);
1877 }
1878 
1879 static int
1880 trapstat_stop()
1881 {
1882 	int i;
1883 
1884 	mutex_enter(&cpu_lock);
1885 	mutex_enter(&tstat_lock);
1886 	if (!tstat_running) {
1887 		mutex_exit(&tstat_lock);
1888 		mutex_exit(&cpu_lock);
1889 		return (ENXIO);
1890 	}
1891 
1892 	xc_all((xcfunc_t *)trapstat_disable, 0, 0);
1893 
1894 	for (i = 0; i <= max_cpuid; i++) {
1895 		if (tstat_percpu[i].tcpu_flags & TSTAT_CPU_ALLOCATED)
1896 			trapstat_teardown(i);
1897 	}
1898 
1899 #ifdef sun4v
1900 	tstat_traptab_initialized = 0;
1901 	if (tstat_options & TSTAT_OPT_TLBDATA)
1902 		cpu_trapstat_conf(CPU_TSTATCONF_FINI);
1903 	contig_mem_free(tstat_va, MMU_PAGESIZE4M);
1904 #endif
1905 	trapstat_hotpatch();
1906 	tstat_running = 0;
1907 	mutex_exit(&tstat_lock);
1908 	mutex_exit(&cpu_lock);
1909 
1910 	return (0);
1911 }
1912 
1913 /*
1914  * This is trapstat's DR CPU configuration callback.  It's called (with
1915  * cpu_lock held) to unconfigure a newly powered-off CPU, or to configure a
1916  * powered-off CPU that is to be brought into the system.  We need only take
1917  * action in the unconfigure case:  because a powered-off CPU will have its
1918  * trap table restored to KERNELBASE if it is ever powered back on, we must
1919  * update the flags to reflect that trapstat is no longer enabled on the
1920  * powered-off CPU.  Note that this means that a TSTAT_CPU_ENABLED CPU that
1921  * is unconfigured/powered off and later powered back on/reconfigured will
1922  * _not_ be re-TSTAT_CPU_ENABLED.
1923  */
1924 static int
1925 trapstat_cpu_setup(cpu_setup_t what, processorid_t cpu)
1926 {
1927 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1928 
1929 	ASSERT(MUTEX_HELD(&cpu_lock));
1930 	mutex_enter(&tstat_lock);
1931 
1932 	if (!tstat_running) {
1933 		mutex_exit(&tstat_lock);
1934 		return (0);
1935 	}
1936 
1937 	switch (what) {
1938 	case CPU_CONFIG:
1939 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1940 		break;
1941 
1942 	case CPU_UNCONFIG:
1943 		if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED) {
1944 			tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
1945 #ifdef	sun4v
1946 			/*
1947 			 * A power-off, causes the cpu mondo queues to be
1948 			 * unconfigured on sun4v. Since we can't teardown
1949 			 * trapstat's mappings on the cpu that is going away,
1950 			 * we simply mark it as not allocated. This will
1951 			 * prevent a teardown on a cpu with the same cpu id
1952 			 * that might have been added while trapstat is running.
1953 			 */
1954 			if (tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED) {
1955 				tcpu->tcpu_pfn = NULL;
1956 				tcpu->tcpu_instr = NULL;
1957 				tcpu->tcpu_data = NULL;
1958 				tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
1959 			}
1960 #endif
1961 		}
1962 		break;
1963 
1964 	default:
1965 		break;
1966 	}
1967 
1968 	mutex_exit(&tstat_lock);
1969 	return (0);
1970 }
1971 
1972 /*
1973  * This is called before a CPR suspend and after a CPR resume.  We don't have
1974  * anything to do before a suspend, but after a restart we must restore the
1975  * trap table to be our interposing trap table.  However, we don't actually
1976  * know whether or not the CPUs have been powered off -- this routine may be
1977  * called while restoring from a failed CPR suspend.  We thus run through each
1978  * TSTAT_CPU_ENABLED CPU, and explicitly destroy and reestablish its
1979  * interposing trap table.  This assures that our state is correct regardless
1980  * of whether or not the CPU has been newly powered on.
1981  */
1982 /*ARGSUSED*/
1983 static boolean_t
1984 trapstat_cpr(void *arg, int code)
1985 {
1986 	cpu_t *cp;
1987 
1988 	if (code == CB_CODE_CPR_CHKPT)
1989 		return (B_TRUE);
1990 
1991 	ASSERT(code == CB_CODE_CPR_RESUME);
1992 
1993 	mutex_enter(&cpu_lock);
1994 	mutex_enter(&tstat_lock);
1995 
1996 	if (!tstat_running) {
1997 		mutex_exit(&tstat_lock);
1998 		mutex_exit(&cpu_lock);
1999 		return (B_TRUE);
2000 	}
2001 
2002 	cp = cpu_list;
2003 	do {
2004 		tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id];
2005 
2006 		if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
2007 			continue;
2008 
2009 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
2010 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
2011 
2012 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_disable, 0, 0);
2013 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
2014 
2015 		/*
2016 		 * Preserve this CPU's data in tstat_buffer and rip down its
2017 		 * interposing trap table.
2018 		 */
2019 		bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
2020 		trapstat_teardown(cp->cpu_id);
2021 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
2022 
2023 		/*
2024 		 * Reestablish the interposing trap table and restore the old
2025 		 * data.
2026 		 */
2027 		trapstat_setup(cp->cpu_id);
2028 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
2029 		bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
2030 
2031 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_enable, 0, 0);
2032 	} while ((cp = cp->cpu_next) != cpu_list);
2033 
2034 	mutex_exit(&tstat_lock);
2035 	mutex_exit(&cpu_lock);
2036 
2037 	return (B_TRUE);
2038 }
2039 
2040 /*ARGSUSED*/
2041 static int
2042 trapstat_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
2043 {
2044 	int i;
2045 
2046 	mutex_enter(&cpu_lock);
2047 	mutex_enter(&tstat_lock);
2048 	if (tstat_open != 0) {
2049 		mutex_exit(&tstat_lock);
2050 		mutex_exit(&cpu_lock);
2051 		return (EBUSY);
2052 	}
2053 
2054 	/*
2055 	 * Register this in open() rather than in attach() to prevent deadlock
2056 	 * with DR code. During attach, I/O device tree locks are grabbed
2057 	 * before trapstat_attach() is invoked - registering in attach
2058 	 * will result in the lock order: device tree lock, cpu_lock.
2059 	 * DR code however requires that cpu_lock be acquired before
2060 	 * device tree locks.
2061 	 */
2062 	ASSERT(!tstat_running);
2063 	register_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
2064 
2065 	/*
2066 	 * Clear all options.  And until specific CPUs are specified, we'll
2067 	 * mark all CPUs as selected.
2068 	 */
2069 	tstat_options = 0;
2070 
2071 	for (i = 0; i <= max_cpuid; i++)
2072 		tstat_percpu[i].tcpu_flags |= TSTAT_CPU_SELECTED;
2073 
2074 	/*
2075 	 * By default, all traps at TL=0 are enabled.  Traps at TL>0 must
2076 	 * be disabled.
2077 	 */
2078 	for (i = 0; i < TSTAT_TOTAL_NENT; i++)
2079 		tstat_enabled[i] = i < TSTAT_NENT ? 1 : 0;
2080 
2081 	tstat_open = 1;
2082 	mutex_exit(&tstat_lock);
2083 	mutex_exit(&cpu_lock);
2084 
2085 	return (0);
2086 }
2087 
2088 /*ARGSUSED*/
2089 static int
2090 trapstat_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
2091 {
2092 	(void) trapstat_stop();
2093 
2094 	ASSERT(!tstat_running);
2095 
2096 	mutex_enter(&cpu_lock);
2097 	unregister_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
2098 	mutex_exit(&cpu_lock);
2099 
2100 	tstat_open = 0;
2101 	return (DDI_SUCCESS);
2102 }
2103 
2104 static int
2105 trapstat_option(int option)
2106 {
2107 	mutex_enter(&tstat_lock);
2108 
2109 	if (tstat_running) {
2110 		mutex_exit(&tstat_lock);
2111 		return (EBUSY);
2112 	}
2113 
2114 	tstat_options |= option;
2115 	mutex_exit(&tstat_lock);
2116 
2117 	return (0);
2118 }
2119 
2120 /*ARGSUSED*/
static int
trapstat_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *crd, int *rval)
{
	int i, j, out;
	size_t dsize;

	/*
	 * Commands handled here:
	 *	TSTATIOC_GO		start trapstat
	 *	TSTATIOC_STOP		stop trapstat
	 *	TSTATIOC_NOGO		set TSTAT_OPT_NOGO
	 *	TSTATIOC_TLBDATA	set TSTAT_OPT_TLBDATA
	 *	TSTATIOC_CPU		select the CPU given by arg
	 *	TSTATIOC_NOCPU		de-select all CPUs
	 *	TSTATIOC_ENTRY		enable the TL=0 entry given by arg
	 *	TSTATIOC_NOENTRY	disable every TL=0 entry
	 *	TSTATIOC_READ		copy out data for each enabled CPU
	 * Anything else yields ENOTTY.  The configuration commands fail with
	 * EBUSY while trapstat is running.
	 */
	switch (cmd) {
	case TSTATIOC_GO:
		return (trapstat_go());

	case TSTATIOC_NOGO:
		return (trapstat_option(TSTAT_OPT_NOGO));

	case TSTATIOC_STOP:
		return (trapstat_stop());

	case TSTATIOC_CPU:
		/* arg indexes tstat_percpu[] below, so range-check it. */
		if (arg < 0 || arg > max_cpuid)
			return (EINVAL);
		/*FALLTHROUGH*/

	case TSTATIOC_NOCPU:
		mutex_enter(&tstat_lock);

		if (tstat_running) {
			mutex_exit(&tstat_lock);
			return (EBUSY);
		}

		/*
		 * If this is the first CPU to be specified (or if we are
		 * being asked to explicitly de-select CPUs), disable all CPUs.
		 */
		if (!(tstat_options & TSTAT_OPT_CPU) || cmd == TSTATIOC_NOCPU) {
			tstat_options |= TSTAT_OPT_CPU;

			for (i = 0; i <= max_cpuid; i++) {
				tstat_percpu_t *tcpu = &tstat_percpu[i];

				ASSERT(cmd == TSTATIOC_NOCPU ||
				    (tcpu->tcpu_flags & TSTAT_CPU_SELECTED));
				tcpu->tcpu_flags &= ~TSTAT_CPU_SELECTED;
			}
		}

		if (cmd == TSTATIOC_CPU)
			tstat_percpu[arg].tcpu_flags |= TSTAT_CPU_SELECTED;

		mutex_exit(&tstat_lock);

		return (0);

	case TSTATIOC_ENTRY:
		mutex_enter(&tstat_lock);

		if (tstat_running) {
			mutex_exit(&tstat_lock);
			return (EBUSY);
		}

		/* Only TL=0 entries (the first TSTAT_NENT) may be enabled. */
		if (arg >= TSTAT_NENT || arg < 0) {
			mutex_exit(&tstat_lock);
			return (EINVAL);
		}

		if (!(tstat_options & TSTAT_OPT_ENTRY)) {
			/*
			 * If this is the first entry that we are explicitly
			 * enabling, explicitly disable every TL=0 entry.
			 */
			for (i = 0; i < TSTAT_NENT; i++)
				tstat_enabled[i] = 0;

			tstat_options |= TSTAT_OPT_ENTRY;
		}

		tstat_enabled[arg] = 1;
		mutex_exit(&tstat_lock);
		return (0);

	case TSTATIOC_NOENTRY:
		mutex_enter(&tstat_lock);

		if (tstat_running) {
			mutex_exit(&tstat_lock);
			return (EBUSY);
		}

		for (i = 0; i < TSTAT_NENT; i++)
			tstat_enabled[i] = 0;

		mutex_exit(&tstat_lock);
		return (0);

	case TSTATIOC_READ:
		mutex_enter(&tstat_lock);

		/*
		 * Size of each per-CPU record copied out:  with TLB data the
		 * record carries one tstat_pgszdata_t per user-visible page
		 * size; otherwise it is a bare tstat_data_t.
		 */
		if (tstat_options & TSTAT_OPT_TLBDATA) {
			dsize = tstat_data_t_exported_size;
		} else {
			dsize = sizeof (tstat_data_t);
		}

		/* 'out' counts the records copied out so far. */
		for (i = 0, out = 0; i <= max_cpuid; i++) {
			tstat_percpu_t *tcpu = &tstat_percpu[i];

			if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
				continue;

			ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
			ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);

			/*
			 * Poison the cpuid so that we can tell below whether
			 * the cross call actually filled in tstat_buffer.
			 */
			tstat_buffer->tdata_cpuid = -1;
			xc_one(i, (xcfunc_t *)trapstat_snapshot, 0, 0);

			if (tstat_buffer->tdata_cpuid == -1) {
				/*
				 * This CPU is not currently responding to
				 * cross calls; we have caught it while it is
				 * being unconfigured.  We'll drop tstat_lock
				 * and pick up and drop cpu_lock.  By the
				 * time we acquire cpu_lock, the DR operation
				 * will appear consistent and we can assert
				 * that trapstat_cpu_setup() has cleared
				 * TSTAT_CPU_ENABLED.
				 */
				mutex_exit(&tstat_lock);
				mutex_enter(&cpu_lock);
				mutex_exit(&cpu_lock);
				mutex_enter(&tstat_lock);
				ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
				continue;
			}

			/*
			 * Need to compensate for the difference between page
			 * sizes exported to users and page sizes available
			 * within the kernel.
			 */
			if ((tstat_options & TSTAT_OPT_TLBDATA) &&
			    (tstat_pgszs != tstat_user_pgszs)) {
				tstat_pgszdata_t *tp;
				uint_t szc;

				/*
				 * Move each user-visible size's data from
				 * its kernel size-code slot into user slot j.
				 */
				tp = &tstat_buffer->tdata_pgsz[0];
				for (j = 0; j < tstat_user_pgszs; j++) {
					if ((szc = USERSZC_2_SZC(j)) != j) {
						bcopy(&tp[szc], &tp[j],
						    sizeof (tstat_pgszdata_t));
					}
				}
			}

			if (copyout(tstat_buffer, (void *)arg, dsize) != 0) {
				mutex_exit(&tstat_lock);
				return (EFAULT);
			}

			/* Advance to where the next record goes. */
			out++;
			arg += dsize;
		}

		/*
		 * If fewer than max_cpuid + 1 records were written, terminate
		 * the list by storing a cpuid of -1 where the next record's
		 * tdata_cpuid would be.
		 */
		if (out != max_cpuid + 1) {
			processorid_t cpuid = -1;
			arg += offsetof(tstat_data_t, tdata_cpuid);

			if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0) {
				mutex_exit(&tstat_lock);
				return (EFAULT);
			}
		}

		mutex_exit(&tstat_lock);

		return (0);

	case TSTATIOC_TLBDATA:
		return (trapstat_option(TSTAT_OPT_TLBDATA));

	default:
		break;
	}

	return (ENOTTY);
}
2306 
2307 /*ARGSUSED*/
2308 static int
2309 trapstat_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
2310 {
2311 	int error;
2312 
2313 	switch (infocmd) {
2314 	case DDI_INFO_DEVT2DEVINFO:
2315 		*result = (void *)tstat_devi;
2316 		error = DDI_SUCCESS;
2317 		break;
2318 	case DDI_INFO_DEVT2INSTANCE:
2319 		*result = (void *)0;
2320 		error = DDI_SUCCESS;
2321 		break;
2322 	default:
2323 		error = DDI_FAILURE;
2324 	}
2325 	return (error);
2326 }
2327 
2328 static int
2329 trapstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
2330 {
2331 	switch (cmd) {
2332 	case DDI_ATTACH:
2333 		break;
2334 
2335 	case DDI_RESUME:
2336 		return (DDI_SUCCESS);
2337 
2338 	default:
2339 		return (DDI_FAILURE);
2340 	}
2341 
2342 	if (ddi_create_minor_node(devi, "trapstat", S_IFCHR,
2343 	    0, DDI_PSEUDO, 0) == DDI_FAILURE) {
2344 		ddi_remove_minor_node(devi, NULL);
2345 		return (DDI_FAILURE);
2346 	}
2347 
2348 	ddi_report_dev(devi);
2349 	tstat_devi = devi;
2350 
2351 	tstat_pgszs = page_num_pagesizes();
2352 	tstat_user_pgszs = page_num_user_pagesizes(0);
2353 	tstat_data_t_size = sizeof (tstat_data_t) +
2354 	    (tstat_pgszs - 1) * sizeof (tstat_pgszdata_t);
2355 	tstat_data_t_exported_size = sizeof (tstat_data_t) +
2356 	    (tstat_user_pgszs - 1) * sizeof (tstat_pgszdata_t);
2357 #ifndef sun4v
2358 	tstat_data_pages = (tstat_data_t_size >> MMU_PAGESHIFT) + 1;
2359 	tstat_total_pages = TSTAT_INSTR_PAGES + tstat_data_pages;
2360 	tstat_data_size = tstat_data_pages * MMU_PAGESIZE;
2361 	tstat_total_size = TSTAT_INSTR_SIZE + tstat_data_size;
2362 #else
2363 	ASSERT(tstat_data_t_size <= TSTAT_DATA_SIZE);
2364 #endif
2365 
2366 	tstat_percpu = kmem_zalloc((max_cpuid + 1) *
2367 	    sizeof (tstat_percpu_t), KM_SLEEP);
2368 
2369 	/*
2370 	 * Create our own arena backed by segkmem to assure a source of
2371 	 * MMU_PAGESIZE-aligned allocations.  We allocate out of the
2372 	 * heap32_arena to assure that we can address the allocated memory with
2373 	 * a single sethi/simm13 pair in the interposing trap table entries.
2374 	 */
2375 	tstat_arena = vmem_create("trapstat", NULL, 0, MMU_PAGESIZE,
2376 	    segkmem_alloc_permanent, segkmem_free, heap32_arena, 0, VM_SLEEP);
2377 
2378 	tstat_enabled = kmem_alloc(TSTAT_TOTAL_NENT * sizeof (int), KM_SLEEP);
2379 	tstat_buffer = kmem_alloc(tstat_data_t_size, KM_SLEEP);
2380 
2381 	/*
2382 	 * CB_CL_CPR_POST_USER is the class that executes from cpr_resume()
2383 	 * after user threads can be restarted.  By executing in this class,
2384 	 * we are assured of the availability of system services needed to
2385 	 * resume trapstat (specifically, we are assured that all CPUs are
2386 	 * restarted and responding to cross calls).
2387 	 */
2388 	tstat_cprcb =
2389 	    callb_add(trapstat_cpr, NULL, CB_CL_CPR_POST_USER, "trapstat");
2390 
2391 	return (DDI_SUCCESS);
2392 }
2393 
2394 static int
2395 trapstat_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
2396 {
2397 	int rval;
2398 
2399 	ASSERT(devi == tstat_devi);
2400 
2401 	switch (cmd) {
2402 	case DDI_DETACH:
2403 		break;
2404 
2405 	case DDI_SUSPEND:
2406 		return (DDI_SUCCESS);
2407 
2408 	default:
2409 		return (DDI_FAILURE);
2410 	}
2411 
2412 	ASSERT(!tstat_running);
2413 
2414 	rval = callb_delete(tstat_cprcb);
2415 	ASSERT(rval == 0);
2416 
2417 	kmem_free(tstat_buffer, tstat_data_t_size);
2418 	kmem_free(tstat_enabled, TSTAT_TOTAL_NENT * sizeof (int));
2419 	vmem_destroy(tstat_arena);
2420 	kmem_free(tstat_percpu, (max_cpuid + 1) * sizeof (tstat_percpu_t));
2421 	ddi_remove_minor_node(devi, NULL);
2422 
2423 	return (DDI_SUCCESS);
2424 }
2425 
2426 /*
2427  * Configuration data structures
2428  */
/*
 * cb_ops entry-point table for the trapstat device.  Only the open, close
 * and ioctl slots are routed to trapstat routines; the remaining slots hold
 * nulldev, nodev, nochpoll, ddi_prop_op or 0, as noted on each entry.  The
 * initializer is positional, so the order of the entries must not change.
 */
static struct cb_ops trapstat_cb_ops = {
	trapstat_open,		/* open */
	trapstat_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	trapstat_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_MP | D_NEW		/* Driver compatibility flag */
};
2446 
/*
 * dev_ops vector for trapstat.  It supplies the getinfo, attach and detach
 * routines and points at trapstat_cb_ops; identify, probe and reset are
 * nulldev, the bus_ops and power slots are null, and quiesce is
 * ddi_quiesce_not_needed.  The initializer is positional, so the order of
 * the entries must not change.
 */
static struct dev_ops trapstat_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	trapstat_info,		/* getinfo */
	nulldev,		/* identify */
	nulldev,		/* probe */
	trapstat_attach,	/* attach */
	trapstat_detach,	/* detach */
	nulldev,		/* reset */
	&trapstat_cb_ops,	/* cb_ops */
	(struct bus_ops *)0,	/* bus_ops */
	NULL,			/* power */
	ddi_quiesce_not_needed,		/* quiesce */
};
2461 
/*
 * Driver linkage record: the module type (mod_driverops), the module's
 * name string, and a pointer to trapstat_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	"Trap Statistics",	/* name of module */
	&trapstat_ops,		/* driver ops */
};
2467 
/*
 * Linkage structure passed to mod_install(), mod_remove() and mod_info()
 * by _init(), _fini() and _info().  It lists modldrv followed by NULL.
 */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
2471 
2472 int
2473 _init(void)
2474 {
2475 	return (mod_install(&modlinkage));
2476 }
2477 
2478 int
2479 _fini(void)
2480 {
2481 	return (mod_remove(&modlinkage));
2482 }
2483 
2484 int
2485 _info(struct modinfo *modinfop)
2486 {
2487 	return (mod_info(&modlinkage, modinfop));
2488 }
2489