xref: /illumos-gate/usr/src/uts/sun4/io/trapstat.c (revision 069e6b7e31ba5dcbc5441b98af272714d9a5455c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 #include <sys/systm.h>
28 #include <sys/conf.h>
29 #include <sys/stat.h>
30 #include <sys/ddi.h>
31 #include <sys/sunddi.h>
32 #include <sys/modctl.h>
33 #include <sys/cpu_module.h>
34 #include <vm/hat_sfmmu.h>
35 #include <vm/seg_kmem.h>
36 #include <vm/seg_kpm.h>
37 #include <vm/vm_dep.h>
38 #include <sys/machsystm.h>
39 #include <sys/machasi.h>
40 #include <sys/sysmacros.h>
41 #include <sys/callb.h>
42 #include <sys/archsystm.h>
43 #include <sys/trapstat.h>
44 #ifdef sun4v
45 #include <sys/hypervisor_api.h>
46 #endif
47 #ifndef sun4v
48 #include <sys/pghw.h>
49 #endif
50 
51 /* BEGIN CSTYLED */
52 /*
53  * trapstat:  Trap Statistics through Dynamic Trap Table Interposition
54  * -------------------------------------------------------------------
55  *
56  * Motivation and Overview
57  *
58  * Despite being a fundamental indicator of system behavior, there has
59  * historically been very little insight provided into the frequency and cost
60  * of machine-specific traps.  The lack of insight has been especially acute
61  * on UltraSPARC microprocessors:  because these microprocessors handle TLB
62  * misses as software traps, the frequency and duration of traps play a
63  * decisive role in the performance of the memory system.  As applications have
64  * increasingly outstripped TLB reach, this has become increasingly true.
65  *
66  * Part of the difficulty of observing trap behavior is that the trap handlers
67  * are so frequently called (e.g. millions of times per second) that any
68  * permanently enabled instrumentation would induce an unacceptable performance
69  * degradation.  Thus, it is a constraint on any trap observability
70  * infrastructure that it have no probe effect when not explicitly enabled.
71  *
72  * The basic idea, then, is to create an interposing trap table in which each
73  * entry increments a per-trap, in-memory counter and then jumps to the actual,
74  * underlying trap table entry.  To enable trapstat, we atomically write to the
75  * trap base address (%tba) register to point to our interposing trap table.
76  * (Note that per-CPU statistics fall out by creating a different trap table
77  * for each CPU.)
78  *
79  * Implementation Details
80  *
81  * While the idea is straight-forward, a nuance of SPARC V9 slightly
82  * complicates the implementation.  Unlike its predecessors, SPARC V9 supports
83  * the notion of nested traps.  The trap level is kept in the TL register:
84  * during normal operation it is 0; when a trap is taken, the TL register is
85  * incremented by 1.  To aid system software, SPARC V9 breaks the trap table
86  * into two halves:  the lower half contains the trap handlers for traps taken
87  * when TL is 0; the upper half contains the trap handlers for traps taken
88  * when TL is greater than 0.  Each half is further subdivided into two
89  * subsequent halves:  the lower half contains the trap handlers for traps
90  * other than those induced by the trap instruction (Tcc variants); the upper
91  * half contains the trap handlers for traps induced by the trap instruction.
92  * This gives a total of four ranges, with each range containing 256 traps:
93  *
94  *       +--------------------------------+- 3ff
95  *       |                                |   .
96  *       |     Trap instruction, TL>0     |   .
97  *       |                                |   .
98  *       |- - - - - - - - - - - - - - - - +- 300
99  *       |- - - - - - - - - - - - - - - - +- 2ff
100  *       |                                |   .
101  *       |   Non-trap instruction, TL>0   |   .
102  *       |                                |   .
103  *       |- - - - - - - - - - - - - - - - +- 200
104  *       |- - - - - - - - - - - - - - - - +- 1ff
105  *       |                                |   .
106  *       |     Trap instruction, TL=0     |   .
107  *       |                                |   .
108  *       |- - - - - - - - - - - - - - - - +- 100
109  *       |- - - - - - - - - - - - - - - - +- 0ff
110  *       |                                |   .
111  *       |   Non-trap instruction, TL=0   |   .
112  *       |                                |   .
113  *       +--------------------------------+- 000
114  *
115  *
116  * Solaris, however, doesn't have reason to support trap instructions when
117  * TL>0 (only privileged code may execute at TL>0; not supporting this only
118  * constrains our own implementation).  The trap table actually looks like:
119  *
120  *       +--------------------------------+- 2ff
121  *       |                                |   .
122  *       |   Non-trap instruction, TL>0   |   .
123  *       |                                |   .
124  *       |- - - - - - - - - - - - - - - - +- 200
125  *       |- - - - - - - - - - - - - - - - +- 1ff
126  *       |                                |   .
127  *       |     Trap instruction, TL=0     |   .
128  *       |                                |   .
129  *       |- - - - - - - - - - - - - - - - +- 100
130  *       |- - - - - - - - - - - - - - - - +- 0ff
131  *       |                                |   .
132  *       |   Non-trap instruction, TL=0   |   .
133  *       |                                |   .
134  *       +--------------------------------+- 000
135  *
136  * Putatively to aid system software, SPARC V9 has the notion of multiple
137  * sets of global registers.  UltraSPARC defines four sets of global
138  * registers:
139  *
140  *    Normal Globals
141  *    Alternate Globals (AGs)
142  *    MMU Globals (MGs)
143  *    Interrupt Globals (IGs)
144  *
145  * The set of globals in use is controlled by bits in PSTATE; when TL is 0
146  * (and PSTATE has not been otherwise explicitly modified), the Normal Globals
147  * are in use.  When a trap is issued, PSTATE is modified to point to a set of
148  * globals corresponding to the trap type.  Most traps correspond to the
149  * Alternate Globals, with a minority corresponding to the MMU Globals, and
150  * only the interrupt-vector trap (vector 0x60) corresponding to the Interrupt
151  * Globals.  (The complete mapping can be found in the UltraSPARC I&II User's
152  * Manual.)
153  *
154  * Note that the sets of globals are per trap _type_, not per trap _level_.
155  * Thus, when executing a TL>0 trap handler, one may not have registers
156  * available (for example, both trap-instruction traps and spill traps execute
157  * on the alternate globals; if a trap-instruction trap induces a window spill,
158  * the window spill handler has no available globals).  For trapstat, this is
159  * problematic:  a register is required to transfer control from one arbitrary
160  * location (in the interposing trap table) to another (in the actual trap
161  * table).
162  *
163  * We solve this problem by exploiting the trap table's location at the bottom
164  * of valid kernel memory (i.e. at KERNELBASE).  We locate the interposing trap
165  * tables just below KERNELBASE -- thereby allowing us to use a branch-always
166  * instruction (ba) instead of a jump instruction (jmp) to transfer control
167  * from the TL>0 entries in the interposing trap table to the TL>0 entries in
168  * the actual trap table.  (N.B. while this allows trap table interposition to
169  * work, it necessarily limits trapstat to only recording information about
170  * TL=0 traps -- there is no way to increment a counter without using a
171  * register.)  Diagrammatically:
172  *
173  *  Actual trap table:
174  *
175  *       +--------------------------------+- 2ff
176  *       |                                |   .
177  *       |   Non-trap instruction, TL>0   |   .   <-----------------------+
178  *       |                                |   .   <-----------------------|-+
179  *       |- - - - - - - - - - - - - - - - +- 200  <-----------------------|-|-+
180  *       |- - - - - - - - - - - - - - - - +- 1ff                          | | |
181  *       |                                |   .                           | | |
182  *       |     Trap instruction, TL=0     |   .   <-----------------+     | | |
183  *       |                                |   .   <-----------------|-+   | | |
184  *       |- - - - - - - - - - - - - - - - +- 100  <-----------------|-|-+ | | |
185  *       |- - - - - - - - - - - - - - - - +- 0ff                    | | | | | |
186  *       |                                |   .                     | | | | | |
187  *       |   Non-trap instruction, TL=0   |   .   <-----------+     | | | | | |
188  *       |                                |   .   <-----------|-+   | | | | | |
189  *       +--------------------------------+- 000  <-----------|-|-+ | | | | | |
190  *        KERNELBASE                                          | | | | | | | | |
191  *                                                            | | | | | | | | |
192  *                                                            | | | | | | | | |
193  *  Interposing trap table:                                   | | | | | | | | |
194  *                                                            | | | | | | | | |
195  *       +--------------------------------+- 2ff              | | | | | | | | |
196  *       |  ...                           |   .               | | | | | | | | |
197  *       |  ...                           |   .               | | | | | | | | |
198  *       |  ...                           |   .               | | | | | | | | |
199  *       |- - - - - - - - - - - - - - - - +- 203              | | | | | | | | |
200  *       |  ba,a                          |      -------------|-|-|-|-|-|-+ | |
201  *       |- - - - - - - - - - - - - - - - +- 202              | | | | | |   | |
202  *       |  ba,a                          |      -------------|-|-|-|-|-|---+ |
203  *       |- - - - - - - - - - - - - - - - +- 201              | | | | | |     |
204  *       |  ba,a                          |      -------------|-|-|-|-|-|-----+
205  *       |- - - - - - - - - - - - - - - - +- 200              | | | | | |
206  *       |  ...                           |   .               | | | | | |
207  *       |  ...                           |   .               | | | | | |
208  *       |  ...                           |   .               | | | | | |
209  *       |- - - - - - - - - - - - - - - - +- 103              | | | | | |
210  *       |  (Increment counter)           |                   | | | | | |
211  *       |  ba,a                          |      -------------------+ | |
212  *       |- - - - - - - - - - - - - - - - +- 102              | | |   | |
213  *       |  (Increment counter)           |                   | | |   | |
214  *       |  ba,a                          |      ---------------------+ |
215  *       |- - - - - - - - - - - - - - - - +- 101              | | |     |
216  *       |  (Increment counter)           |                   | | |     |
217  *       |  ba,a                          |      -----------------------+
218  *       |- - - - - - - - - - - - - - - - +- 100              | | |
219  *       |  ...                           |   .               | | |
220  *       |  ...                           |   .               | | |
221  *       |  ...                           |   .               | | |
222  *       |- - - - - - - - - - - - - - - - +- 003              | | |
223  *       |  (Increment counter)           |                   | | |
224  *       |  ba,a                          |      -------------+ | |
225  *       |- - - - - - - - - - - - - - - - +- 002                | |
226  *       |  (Increment counter)           |                     | |
227  *       |  ba,a                          |      ---------------+ |
228  *       |- - - - - - - - - - - - - - - - +- 001                  |
229  *       |  (Increment counter)           |                       |
230  *       |  ba,a                          |      -----------------+
231  *       +--------------------------------+- 000
232  *        KERNELBASE - tstat_total_size
233  *
234  * tstat_total_size is the number of bytes required for each trap table.  It
235  * must be true that tstat_total_size is less than the maximum branch
236  * displacement; if each CPU were to consume a disjoint virtual range
237  * below KERNELBASE for its trap table, we could support at most
238  * (maximum_branch_displacement / tstat_total_size) CPUs.  The maximum branch
239  * displacement for Bicc variants is just under eight megabytes, and (because
240  * the %tba must be 32K aligned), tstat_total_size must be at least 32K; if
241  * each CPU were to consume a disjoint virtual range, we would have an
242  * unacceptably low upper bound of 256 CPUs.
243  *
244  * While there are tricks that one could use to address this constraint (e.g.,
245  * creating trampolines every maximum_branch_displacement bytes), we instead
246  * solve this by not permitting each CPU to consume a disjoint virtual range.
247  * Rather, we have each CPU's interposing trap table use the _same_ virtual
248  * range, but we back the trap tables with disjoint physical memory.  Normally,
249  * such one-to-many virtual-to-physical mappings are illegal; this is
250  * permissible here only because the pages for the interposing trap table are
251  * necessarily locked in the TLB.  (The CPUs thus never have the opportunity to
252  * discover that they have conflicting translations.)
253  *
254  * On CMT architectures in which CPUs can share MMUs, the above trick will not
255  * work: two CPUs that share an MMU cannot have the same virtual address map
256  * to disjoint physical pages.  On these architectures, any CPUs sharing the
257  * same MMU must consume a disjoint 32K virtual address range -- limiting the
258  * number of CPUs sharing an MMU on these architectures to 256 due to the
259  * branch displacement limitation described above.  On the sun4v architecture,
260  * there is a further limitation: a guest may not have more than eight locked
261  * TLB entries per MMU.  To allow operation under this restriction, the
262  * interposing trap table and the trap statistics are each accessed through
263  * a single 4M TLB entry.  This limits the footprint to two locked entries
264  * (one for the I-TLB and one for the D-TLB), but further restricts the number
265  * of CPUs to 128 per MMU.  However, support for more than 128 CPUs can easily
266  * be added via a hybrid scheme, where the same 4M virtual address is used
267  * on different MMUs.
268  *
269  * On sun4v architecture, we cannot use the hybrid scheme as the architecture
270  * imposes an additional restriction on the number of permanent mappings per
271  * guest and it is illegal to use the same virtual address to map different
272  * TTEs on different MMUs. Instead, we increase the number of supported CPUs
273  * by reducing the virtual address space requirements per CPU via shared
274  * interposing trap table as follows:
275  *
276  *                                          Offset (within 4MB page)
277  *       +------------------------------------+- 0x400000
278  *       |  CPU 1015 trap statistics (4KB)    |   .
279  *       |- - - - - - - - - - - - - - - - - - +- 0x3ff000
280  *       |                                    |
281  *       |   ...                              |
282  *       |                                    |
283  *       |- - - - - - - - - - - - - - - - - - +- 0x00a000
284  *       |  CPU 1 trap statistics (4KB)       |   .
285  *       |- - - - - - - - - - - - - - - - - - +- 0x009000
286  *       |  CPU 0 trap statistics (4KB)       |   .
287  *       |- - - - - - - - - - - - - - - - - - +- 0x008000
288  *       |  Shared trap handler continuation  |   .
289  *       |- - - - - - - - - - - - - - - - - - +- 0x006000
290  *       |  Non-trap instruction, TL>0        |   .
291  *       |- - - - - - - - - - - - - - - - - - +- 0x004000
292  *       |  Trap instruction, TL=0            |   .
293  *       |- - - - - - - - - - - - - - - - - - +- 0x002000
294  *       |  Non-trap instruction, TL=0        |   .
295  *       +------------------------------------+- 0x000000
296  *
297  * Note that each CPU has its own 4K space for its trap statistics but
298  * shares the same interposing trap handlers.  Interposing trap handlers
299  * use the CPU ID to determine the location of per CPU trap statistics
300  * area dynamically. This increases the interposing trap handler overhead,
301  * but is acceptable as it allows us to support up to 1016 CPUs with one
302  * 4MB page on sun4v architecture. Support for additional CPUs can be
303  * added with another 4MB page to 2040 cpus (or 3064 cpus with 2 additional
304  * 4MB pages). With additional 4MB pages, we cannot use displacement branch
305  * (ba instruction) and we have to use jmp instruction instead. Note that
306  * with sun4v, globals are nested (not per-trap type as in sun4u), so it is
307  * ok to use additional global reg to do jmp. This option is not available in
308  * sun4u, which mandates the use of displacement branches since no global reg
309  * is available at TL>1.
310  *
311  * TLB Statistics
312  *
313  * Because TLB misses are an important component of system performance, we wish
314  * to know much more about these traps than simply the number received.
315  * Specifically, we wish to know:
316  *
317  *  (a)	The amount of time spent executing the TLB miss handler
318  *  (b)	TLB misses versus TSB misses
319  *  (c) Kernel-level misses versus user-level misses
320  *  (d) Misses per pagesize
321  *
322  * TLB Statistics: Time Spent Executing
323  *
324  * To accurately determine the amount of time spent executing the TLB miss
325  * handler, one must get a timestamp on trap entry and trap exit, subtract the
326  * former from the latter, and add the result to an accumulating count.
327  * Consider the flow of control during normal TLB miss processing (where "ldx
328  * [%g2], %g2" is an arbitrary TLB-missing instruction):
329  *
330  * + - - - - - - - -+
331  * :                :
332  * : ldx [%g2], %g2 :<-------------------------------------------------------+
333  * :                :              Return from trap:                         |
334  * + - - - - - - - -+                TL <- TL - 1 (0)                        |
335  *	  |                          %pc <- TSTATE[TL].TPC (address of load) |
336  *	  | TLB miss:                                                        |
337  *        |   TL <- TL + 1 (1)                                               |
338  *        |   %pc <- TLB-miss-trap-handler                                   |
339  *        |                                                                  |
340  *        v                                                                  |
341  * + - - - - - - - - - - - - - - - +                                         |
342  * :                               :                                         |
343  * : Lookup VA in TSB              :                                         |
344  * : If (hit)                      :                                         |
345  * :     Fill TLB                  :                                         |
346  * : Else                          :                                         |
347  * :     Lookup VA (hme hash table :                                         |
348  * :                or segkpm)     :                                         |
349  * :     Fill TLB                  :                                         |
350  * : Endif                         :                                         |
351  * : Issue "retry"  ---------------------------------------------------------+
352  * :                               :
353  * + - - - - - - - - - - - - - - - +
354  *  TLB-miss-trap-handler
355  *
356  *
357  * As the above diagram indicates, interposing on the trap table allows one
358  * only to determine a timestamp on trap _entry_:  when the TLB miss handler
359  * has completed filling the TLB, a "retry" will be issued, and control will
360  * transfer immediately back to the missing %pc.
361  *
362  * To obtain a timestamp on trap exit, we must then somehow interpose between
363  * the "retry" and the subsequent control transfer to the TLB-missing
364  * instruction.  To do this, we _push_ a trap level.  The basic idea is to
365  * spoof a TLB miss by raising TL, setting the %tpc to be within text
366  * controlled by trapstat (the "TLB return entry") and branching to the
367  * underlying TLB miss handler.  When the TLB miss handler issues its "retry",
368  * control will transfer not to the TLB-missing instruction, but rather to the
369  * TLB return entry.  This code can then obtain a timestamp, and issue its own
370  * "retry" -- thereby correctly returning to the TLB-missing instruction.
371  * Here is the above TLB miss flow control diagram modified to reflect
372  * trapstat's operation:
373  *
374  * + - - - - - - - -+
375  * :                :
376  * : ldx [%g2], %g2 :<-------------------------------------------------------+
377  * :                :             Return from trap:                          |
378  * + - - - - - - - -+               TL <- TL - 1 (0)                         |
379  *	  |                         %pc <- TSTATE[TL].TPC (address of load)  |
380  *	  | TLB miss:                                                        |
381  *        |   TL <- TL + 1 (1)                                               |
382  *        |   %pc <- TLB-miss-trap-handler (trapstat)                        |
383  *        |                                                                  |
384  *        v                                    TLB-return-entry (trapstat)   |
385  * + - - - - - - - - - - - - - - - - - - +    + - - - - - - - - - - - - - +  |
386  * :                                     :    :                           :  |
387  * : Record timestamp                    :    : Record timestamp          :  |
388  * : TL <- 2                             :    : Take timestamp difference :  |
389  * : TSTATE[1].TPC <- TLB-return-entry   :    : Add to running total      :  |
390  * : ba,a TLB-miss-trap-handler -----------+  : Issue "retry"  --------------+
391  * :                                     : |  :                           :
392  * + - - - - - - - - - - - - - - - - - - + |  + - - - - - - - - - - - - - +
393  *  TLB-miss-trap-handler	           |                  ^
394  *  (trapstat)                             |                  |
395  *                                         |                  |
396  *                                         |                  |
397  *                 +-----------------------+                  |
398  *                 |                                          |
399  *                 |                                          |
400  *                 v                                          |
401  * + - - - - - - - - - - - - - - - +                          |
402  * :                               :                          |
403  * : Lookup VA in TSB              :                          |
404  * : If (hit)                      :                          |
405  * :     Fill TLB                  :                          |
406  * : Else                          :                          |
407  * :     Lookup VA (hme hash table :                          |
408  * :                or segkpm)     :                          |
409  * :     Fill TLB                  :                          |
410  * : Endif                         :                          |
411  * : Issue "retry"  ------------------------------------------+
412  * :                               : Return from trap:
413  * + - - - - - - - - - - - - - - - +   TL <- TL - 1 (1)
414  *  TLB-miss-trap-handler              %pc <- TSTATE[TL].TPC (TLB-return-entry)
415  *
416  *
417  * A final subterfuge is required to complete our artifice:  if we miss in
418  * the TLB, the TSB _and_ the subsequent hash or segkpm lookup (that is, if
419  * there is no valid translation for the TLB-missing address), common system
420  * software will need to accurately determine the %tpc as part of its page
421  * fault handling. We therefore modify the kernel to check the %tpc in this
422  * case: if the %tpc falls within the VA range controlled by trapstat and
423  * the TL is 2, TL is simply lowered back to 1 (this check is implemented
424  * by the TSTAT_CHECK_TL1 macro).  Lowering TL to 1 has the effect of
425  * discarding the state pushed by trapstat.
426  *
427  * TLB Statistics: TLB Misses versus TSB Misses
428  *
429  * Distinguishing TLB misses from TSB misses requires further interposition
430  * on the TLB miss handler:  we cannot know a priori or a posteriori if a
431  * given VA will or has hit in the TSB.
432  *
433  * We achieve this distinction by adding a second TLB return entry almost
434  * identical to the first -- differing only in the address to which it
435  * stores its results.  We then modify the TLB miss handlers of the kernel
436  * such that they check the %tpc when they determine that a TLB miss has
437  * subsequently missed in the TSB:  if the %tpc lies within trapstat's VA
438  * range and TL is 2 (that is, if trapstat is running), the TLB miss handler
439  * _increments_ the %tpc by the size of the TLB return entry.  The ensuing
440  * "retry" will thus transfer control to the second TLB return entry, and
441  * the time spent in the handler will be accumulated in a memory location
442  * specific to TSB misses.
443  *
444  * N.B.:  To minimize the amount of knowledge the kernel must have of trapstat,
445  * we do not allow the kernel to hard-code the size of the TLB return entry.
446  * Rather, the actual tsbmiss handler executes a known instruction at the
447  * corresponding tsbmiss patch points (see the tstat_tsbmiss_patch_table) with
448  * the %tpc in %g7:  when trapstat is not running, these points contain the
449  * harmless TSTAT_TSBMISS_INSTR instruction ("add %g7, 0, %g7"). Before
450  * running, trapstat modifies the instructions at these patch points such
451  * that the simm13 equals the size of the TLB return entry.
452  *
453  * TLB Statistics: Kernel-level Misses versus User-level Misses
454  *
455  * Differentiating user-level misses from kernel-level misses employs a
456  * similar technique, but is simplified by the ability to distinguish a
457  * user-level miss from a kernel-level miss a priori by reading the context
458  * register:  we implement kernel-/user-level differentiation by again doubling
459  * the number of TLB return entries, and setting the %tpc to the appropriate
460  * TLB return entry in trapstat's TLB miss handler.  Together with the doubling
461  * of entries required for TLB-miss/TSB-miss differentiation, this yields a
462  * total of four TLB return entries:
463  *
464  *	Level		TSB hit?	Structure member
465  *	------------------------------------------------------------
466  *	Kernel		Yes		tstat_tlbret_t.ttlbr_ktlb
467  *	Kernel		No		tstat_tlbret_t.ttlbr_ktsb
468  *	User		Yes		tstat_tlbret_t.ttlbr_utlb
469  *	User		No		tstat_tlbret_t.ttlbr_utsb
470  *
471  * TLB Statistics: Misses per Pagesize
472  *
473  * As with the TLB-/TSB-miss differentiation, we have no way of determining
474  * pagesize a priori.  This is therefore implemented by mandating a new rule:
475  * whenever the kernel fills the TLB in its TLB miss handler, the TTE
476  * corresponding to the TLB-missing VA must be in %g5 when the handler
477  * executes its "retry".  This allows the TLB return entry to determine
478  * pagesize by simply looking at the pagesize field in the TTE stored in
479  * %g5.
480  *
481  * TLB Statistics: Probe Effect
482  *
483  * As one might imagine, gathering TLB statistics by pushing a trap level
484  * induces significant probe effect.  To account for this probe effect,
485  * trapstat attempts to observe it by executing a code sequence with a known
486  * number of TLB misses both before and after interposing on the trap table.
487  * This allows trapstat to determine a per-trap probe effect which can then be
488  * factored into the "%tim" fields of the trapstat command.
489  *
490  * Note that on sun4v platforms, TLB misses are normally handled by the
491  * hypervisor or the hardware TSB walker. Thus no fast MMU miss information
492  * is reported for normal operation. However, when trapstat is invoked
493  * with -t or -T option to collect detailed TLB statistics, the kernel takes
494  * over TLB miss handling. This results in significantly more overhead
495  * and TLB statistics may not be as accurate as on sun4u platforms.
496  * On some processors, hypervisor or hardware may provide a low overhead
497  * interface to collect TSB hit statistics. This support is exposed via
498  * a well defined CPU module interface (cpu_trapstat_conf to enable this
499  * interface and cpu_trapstat_data to get detailed TSB hit statistics).
500  * In this scenario, TSB miss statistics are collected by intercepting the
501  * IMMU_miss and DMMU_miss traps using the above-mentioned trap interposition
502  * approach.
503  *
504  * Locking
505  *
506  * The implementation uses two locks:  tstat_lock (a local lock) and the global
507  * cpu_lock.  tstat_lock is used to assure trapstat's consistency in the
508  * presence of multithreaded /dev/trapstat consumers (while as of this writing
509  * the only consumer of /dev/trapstat is single threaded, it is obviously
510  * necessary to correctly support multithreaded access).  cpu_lock is held
511  * whenever CPUs are being manipulated directly, to prevent them from
512  * disappearing in the process.  Because trapstat's DR callback
513  * (trapstat_cpu_setup()) must grab tstat_lock and is called with cpu_lock
514  * held, the lock ordering is necessarily cpu_lock before tstat_lock.
515  *
516  */
517 /* END CSTYLED */
518 
519 static dev_info_t	*tstat_devi;	/* devinfo; saved in xxattach() for xxinfo() */
520 static int		tstat_open;	/* set if driver is open */
521 static kmutex_t		tstat_lock;	/* serializes /dev/trapstat consumers; taken after cpu_lock */
522 static vmem_t		*tstat_arena;	/* arena for TLB-locked pages */
523 static tstat_percpu_t	*tstat_percpu;	/* per-CPU data */
524 static int		tstat_running;	/* set if trapstat is running */
525 static tstat_data_t	*tstat_buffer;	/* staging buffer for outgoing data */
526 static int		tstat_options;	/* bit-wise indication of options */
527 static int		*tstat_enabled;	/* map of enabled trap entries */
528 static int		tstat_tsbmiss_patched; /* tsbmiss patch flag */
529 static callb_id_t	tstat_cprcb;	/* CPR callback */
530 static char		*tstat_probe_area; /* VA range used for probe effect */
531 static caddr_t		tstat_probe_phys; /* physical to back above VA */
532 static hrtime_t		tstat_probe_time; /* time spent on probe effect */
533 static hrtime_t		tstat_probe_before[TSTAT_PROBE_NLAPS]; /* NOTE(review): presumably per-lap probe times before interposing -- confirm */
534 static hrtime_t		tstat_probe_after[TSTAT_PROBE_NLAPS]; /* NOTE(review): presumably per-lap probe times after interposing -- confirm */
535 static uint_t		tstat_pgszs;		/* # of kernel page sizes */
536 static uint_t		tstat_user_pgszs;	/* # of user page sizes */
537 
538 /*
539  * sizeof tstat_data_t + pgsz data for the kernel.  For simplicity's sake, when
540  * we collect data, we do it based upon szc, but when we report data back to
541  * userland, we have to do it based upon the userszc which may not match.
542  * So, these two variables are for internal use and exported use respectively.
543  */
544 static size_t		tstat_data_t_size;
545 static size_t		tstat_data_t_exported_size;
546 
547 #ifndef sun4v
548 
549 static size_t		tstat_data_pages;  /* number of pages of tstat data */
550 static size_t		tstat_data_size;   /* tstat data size in bytes */
551 static size_t		tstat_total_pages; /* #data pages + #instr pages */
552 static size_t		tstat_total_size;  /* tstat data size + instr size */
553 
554 #else /* sun4v */
555 
556 static caddr_t		tstat_va[TSTAT_NUM4M_LIMIT]; /* VAs of 4MB pages */
557 static pfn_t		tstat_pfn[TSTAT_NUM4M_LIMIT]; /* PFNs of 4MB pages */
558 static boolean_t	tstat_fast_tlbstat = B_FALSE;
559 static int		tstat_traptab_initialized;
560 static int		tstat_perm_mapping_failed;
561 static int		tstat_hv_nopanic;
562 static int		tstat_num4m_mapping;
563 
564 #endif /* sun4v */
565 
566 /*
567  * In the above block comment, see "TLB Statistics: TLB Misses versus
568  * TSB Misses" for an explanation of the tsbmiss patch points.
569  */
570 extern uint32_t		tsbmiss_trapstat_patch_point;
571 extern uint32_t		tsbmiss_trapstat_patch_point_kpm;
572 extern uint32_t		tsbmiss_trapstat_patch_point_kpm_small;
573 
574 /*
575  * Trapstat tsbmiss patch table
576  */
577 tstat_tsbmiss_patch_entry_t tstat_tsbmiss_patch_table[] = {
578 	{(uint32_t *)&tsbmiss_trapstat_patch_point, 0},
579 	{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm, 0},
580 	{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm_small, 0},
581 	{(uint32_t *)NULL, 0}
582 };
583 
/*
 * General SPARC-specific constants and field extractors, defined to allow
 * more readable relocations of the instruction templates below:
 *
 *	NOP		encoding of the nop instruction
 *	HI22(v)		upper 22 bits of the low 32 bits of v
 *	LO10(v)		low 10 bits of v
 *	LO12(v)		low 12 bits of v
 *	DISP22(f, t)	22-bit word displacement from address f to address t
 *	ASI(asi)	asi shifted left by 5 bits
 */
#define	NOP	0x01000000
#define	HI22(v) (((uint32_t)(v)) >> 10)
#define	LO10(v) (((uint32_t)(v)) & ((1U << 10) - 1))
#define	LO12(v) (((uint32_t)(v)) & ((1U << 12) - 1))
#define	DISP22(from, to) \
	(((((uintptr_t)(to)) - ((uintptr_t)(from))) >> 2) & ((1U << 22) - 1))
#define	ASI(asi)	((asi) << 5)
595 
596 /*
597  * The interposing trap table must be locked in the I-TLB, and any data
598  * referred to in the interposing trap handler must be locked in the D-TLB.
599  * This function locks these pages in the appropriate TLBs by creating TTEs
600  * from whole cloth, and manually loading them into the TLB.  This function is
601  * called from cross call context.
602  *
603  * On sun4v platforms, we use 4M page size mappings to minimize the number
604  * of locked down entries (i.e. permanent mappings). Each CPU uses a
605  * reserved portion of that 4M page for its TBA and data.
606  */
607 static void
608 trapstat_load_tlb(void)
609 {
610 	int i;
611 #ifdef sun4v
612 	uint64_t ret;
613 #endif
614 	tte_t tte;
615 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
616 	caddr_t va = tcpu->tcpu_vabase;
617 
618 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
619 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
620 
621 #ifndef sun4v
622 	for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
623 		tte.tte_inthi = TTE_VALID_INT | TTE_SZ_INT(TTE8K) |
624 		    TTE_PFN_INTHI(tcpu->tcpu_pfn[i]);
625 		if (i < TSTAT_INSTR_PAGES) {
626 			tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
627 			    TTE_LCK_INT | TTE_CP_INT | TTE_PRIV_INT;
628 			sfmmu_itlb_ld_kva(va, &tte);
629 		} else {
630 			tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
631 			    TTE_LCK_INT | TTE_CP_INT | TTE_CV_INT |
632 			    TTE_PRIV_INT | TTE_HWWR_INT;
633 			sfmmu_dtlb_ld_kva(va, &tte);
634 		}
635 	}
636 #else /* sun4v */
637 	for (i = 0; i < tstat_num4m_mapping; i++) {
638 		tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(tstat_pfn[i]);
639 		tte.tte_intlo = TTE_PFN_INTLO(tstat_pfn[i]) | TTE_CP_INT |
640 		    TTE_CV_INT | TTE_PRIV_INT | TTE_HWWR_INT |
641 		    TTE_SZ_INTLO(TTE4M);
642 		ret = hv_mmu_map_perm_addr(va, KCONTEXT, *(uint64_t *)&tte,
643 		    MAP_ITLB | MAP_DTLB);
644 
645 		if (ret != H_EOK) {
646 			if (tstat_hv_nopanic) {
647 				int j;
648 				/*
649 				 * The first attempt to create perm mapping
650 				 * failed. The guest might have exhausted its
651 				 * perm mapping limit. We don't panic on first
652 				 * try.
653 				 */
654 				tstat_perm_mapping_failed = 1;
655 				va = tcpu->tcpu_vabase;
656 				for (j = 0; j < i; j++) {
657 					(void) hv_mmu_unmap_perm_addr(va,
658 					    KCONTEXT, MAP_ITLB | MAP_DTLB);
659 					va += MMU_PAGESIZE4M;
660 				}
661 				break;
662 			}
663 			/*
664 			 * We failed on subsequent cpus trying to
665 			 * create the same perm mappings. This
666 			 * should not happen. Panic here.
667 			 */
668 			cmn_err(CE_PANIC, "trapstat: cannot create "
669 			    "perm mappings for cpu %d "
670 			    "(error: 0x%lx)", CPU->cpu_id, ret);
671 		}
672 		va += MMU_PAGESIZE4M;
673 	}
674 #endif /* sun4v */
675 }
676 
677 /*
678  * As mentioned in the "TLB Statistics: TLB Misses versus TSB Misses" section
679  * of the block comment, TLB misses are differentiated from TSB misses in
680  * part by hot-patching the instructions at the tsbmiss patch points (see
681  * tstat_tsbmiss_patch_table). This routine is used both to initially patch
682  * the instructions, and to patch them back to their original values upon
683  * restoring the original trap table.
684  */
685 static void
686 trapstat_hotpatch()
687 {
688 	uint32_t instr;
689 	uint32_t simm13;
690 	tstat_tsbmiss_patch_entry_t *ep;
691 
692 	ASSERT(MUTEX_HELD(&tstat_lock));
693 
694 	if (!(tstat_options & TSTAT_OPT_TLBDATA))
695 		return;
696 
697 	if (!tstat_tsbmiss_patched) {
698 		/*
699 		 * We haven't patched the TSB paths; do so now.
700 		 */
701 		/*CONSTCOND*/
702 		ASSERT(offsetof(tstat_tlbret_t, ttlbr_ktsb) -
703 		    offsetof(tstat_tlbret_t, ttlbr_ktlb) ==
704 		    offsetof(tstat_tlbret_t, ttlbr_utsb) -
705 		    offsetof(tstat_tlbret_t, ttlbr_utlb));
706 
707 		simm13 = offsetof(tstat_tlbret_t, ttlbr_ktsb) -
708 		    offsetof(tstat_tlbret_t, ttlbr_ktlb);
709 
710 		for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
711 			ASSERT(ep->tpe_instr == 0);
712 			instr = ep->tpe_instr = *ep->tpe_addr;
713 
714 			/*
715 			 * Assert that the instruction we're about to patch is
716 			 * "add %g7, 0, %g7" (0x8e01e000).
717 			 */
718 			ASSERT(instr == TSTAT_TSBMISS_INSTR);
719 
720 			instr |= simm13;
721 			hot_patch_kernel_text((caddr_t)ep->tpe_addr,
722 			    instr, sizeof (instr));
723 		}
724 
725 		tstat_tsbmiss_patched = 1;
726 
727 	} else {
728 		/*
729 		 * Remove patches from the TSB paths.
730 		 */
731 		for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
732 			ASSERT(ep->tpe_instr == TSTAT_TSBMISS_INSTR);
733 			hot_patch_kernel_text((caddr_t)ep->tpe_addr,
734 			    ep->tpe_instr, sizeof (instr));
735 			ep->tpe_instr = 0;
736 		}
737 
738 		tstat_tsbmiss_patched = 0;
739 	}
740 }
741 
742 /*
743  * This is the routine executed to clock the performance of the trap table,
744  * executed both before and after interposing on the trap table to attempt to
745  * determine probe effect.  The probe effect is used to adjust the "%tim"
746  * fields of trapstat's -t and -T output; we only use TLB misses to clock the
747  * trap table.  We execute the inner loop (which is designed to exceed the
748  * TLB's reach) nlaps times, taking the best time as our time (thereby
749  * factoring out the effects of interrupts, cache misses or other perturbing
750  * events.
751  */
752 static hrtime_t
753 trapstat_probe_laps(int nlaps, hrtime_t *buf)
754 {
755 	int i, j = 0;
756 	hrtime_t ts, best = INT64_MAX;
757 
758 	while (nlaps--) {
759 		ts = rdtick();
760 
761 		for (i = 0; i < TSTAT_PROBE_SIZE; i += MMU_PAGESIZE)
762 			*((volatile char *)&tstat_probe_area[i]);
763 
764 		if ((ts = rdtick() - ts) < best)
765 			best = ts;
766 		buf[j++] = ts;
767 	}
768 
769 	return (best);
770 }
771 
772 /*
773  * This routine determines the probe effect by calling trapstat_probe_laps()
774  * both without and with the interposing trap table.  Note that this is
775  * called from a cross call on the desired CPU, and that it is called on
776  * every CPU (this is necessary because the probe effect may differ from
777  * one CPU to another).
778  */
779 static void
780 trapstat_probe()
781 {
782 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
783 	hrtime_t before, after;
784 
785 	if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
786 		return;
787 
788 	if (tstat_probe_area == NULL || (tstat_options & TSTAT_OPT_NOGO))
789 		return;
790 
791 	/*
792 	 * We very much expect the %tba to be KERNELBASE; this is a
793 	 * precautionary measure to assure that trapstat doesn't melt the
794 	 * machine should the %tba point unexpectedly elsewhere.
795 	 */
796 	if (get_tba() != (caddr_t)KERNELBASE)
797 		return;
798 
799 	/*
800 	 * Preserve this CPU's data before destroying it by enabling the
801 	 * interposing trap table.  We can safely use tstat_buffer because
802 	 * the caller of the trapstat_probe() cross call is holding tstat_lock.
803 	 */
804 #ifdef sun4v
805 	bcopy(tcpu->tcpu_data, tstat_buffer, TSTAT_DATA_SIZE);
806 #else
807 	bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
808 #endif
809 
810 	tstat_probe_time = gethrtime();
811 
812 	before = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_before);
813 	(void) set_tba(tcpu->tcpu_ibase);
814 
815 	after = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_after);
816 	(void) set_tba((caddr_t)KERNELBASE);
817 
818 	tstat_probe_time = gethrtime() - tstat_probe_time;
819 
820 #ifdef sun4v
821 	bcopy(tstat_buffer, tcpu->tcpu_data, TSTAT_DATA_SIZE);
822 	tcpu->tcpu_tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES;
823 #else
824 	bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
825 	tcpu->tcpu_data->tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES;
826 #endif
827 }
828 
829 static void
830 trapstat_probe_alloc()
831 {
832 	pfn_t pfn;
833 	caddr_t va;
834 	int i;
835 
836 	ASSERT(MUTEX_HELD(&tstat_lock));
837 	ASSERT(tstat_probe_area == NULL);
838 	ASSERT(tstat_probe_phys == NULL);
839 
840 	if (!(tstat_options & TSTAT_OPT_TLBDATA))
841 		return;
842 
843 	/*
844 	 * Grab some virtual from the heap arena.
845 	 */
846 	tstat_probe_area = vmem_alloc(heap_arena, TSTAT_PROBE_SIZE, VM_SLEEP);
847 	va = tstat_probe_area;
848 
849 	/*
850 	 * Grab a single physical page.
851 	 */
852 	tstat_probe_phys = vmem_alloc(tstat_arena, MMU_PAGESIZE, VM_SLEEP);
853 	pfn = hat_getpfnum(kas.a_hat, tstat_probe_phys);
854 
855 	/*
856 	 * Now set the translation for every page in our virtual range
857 	 * to be our allocated physical page.
858 	 */
859 	for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
860 		hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn, PROT_READ,
861 		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
862 		va += MMU_PAGESIZE;
863 	}
864 }
865 
866 static void
867 trapstat_probe_free()
868 {
869 	caddr_t va;
870 	int i;
871 
872 	ASSERT(MUTEX_HELD(&tstat_lock));
873 
874 	if ((va = tstat_probe_area) == NULL)
875 		return;
876 
877 	for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
878 		hat_unload(kas.a_hat, va, MMU_PAGESIZE, HAT_UNLOAD_UNLOCK);
879 		va += MMU_PAGESIZE;
880 	}
881 
882 	vmem_free(tstat_arena, tstat_probe_phys, MMU_PAGESIZE);
883 	vmem_free(heap_arena, tstat_probe_area, TSTAT_PROBE_SIZE);
884 
885 	tstat_probe_phys = NULL;
886 	tstat_probe_area = NULL;
887 }
888 
889 /*
890  * This routine actually enables a CPU by setting its %tba to be the
891  * CPU's interposing trap table.  It is called out of cross call context.
892  */
893 static void
894 trapstat_enable()
895 {
896 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
897 
898 	if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
899 		return;
900 
901 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
902 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
903 
904 	if (get_tba() != (caddr_t)KERNELBASE)
905 		return;
906 
907 	if (!(tstat_options & TSTAT_OPT_NOGO))
908 		(void) set_tba(tcpu->tcpu_ibase);
909 	tcpu->tcpu_flags |= TSTAT_CPU_ENABLED;
910 #ifdef sun4v
911 	if ((tstat_options & TSTAT_OPT_TLBDATA) &&
912 	    !(tstat_options & TSTAT_OPT_NOGO)) {
913 		if (tstat_fast_tlbstat) {
914 			/*
915 			 * Invoke processor specific interface to enable
916 			 * collection of TSB hit statistics.
917 			 */
918 			(void) cpu_trapstat_conf(CPU_TSTATCONF_ENABLE);
919 		} else {
920 			/*
921 			 * Collect TLB miss statistics by taking over
922 			 * TLB miss handling from the hypervisor. This
923 			 * is done by telling the hypervisor that there
924 			 * is no TSB configured. Also set TSTAT_TLB_STATS
925 			 * flag so that no user TSB is configured during
926 			 * context switch time.
927 			 */
928 			cpu_t *cp = CPU;
929 
930 			cp->cpu_m.cpu_tstat_flags |= TSTAT_TLB_STATS;
931 			(void) hv_set_ctx0(0, 0);
932 			(void) hv_set_ctxnon0(0, 0);
933 		}
934 	}
935 #endif
936 }
937 
938 /*
939  * This routine disables a CPU (vis a vis trapstat) by setting its %tba to be
940  * the actual, underlying trap table.  It is called out of cross call context.
941  */
942 static void
943 trapstat_disable()
944 {
945 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
946 
947 	if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
948 		return;
949 
950 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
951 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
952 
953 	if (!(tstat_options & TSTAT_OPT_NOGO))
954 		(void) set_tba((caddr_t)KERNELBASE);
955 
956 	tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
957 
958 #ifdef sun4v
959 	if ((tstat_options & TSTAT_OPT_TLBDATA) &&
960 	    !(tstat_options & TSTAT_OPT_NOGO)) {
961 		if (tstat_fast_tlbstat) {
962 			/*
963 			 * Invoke processor specific interface to disable
964 			 * collection of TSB hit statistics on each processor.
965 			 */
966 			(void) cpu_trapstat_conf(CPU_TSTATCONF_DISABLE);
967 		} else {
968 			/*
969 			 * As part of collecting TLB miss statistics, we took
970 			 * over TLB miss handling from the hypervisor by
971 			 * telling the hypervisor that NO TSB is configured.
972 			 * We need to restore that by communicating proper
973 			 * kernel/user TSB information so that TLB misses
974 			 * can be handled by the hypervisor or the hardware
975 			 * more efficiently.
976 			 *
977 			 * We restore kernel TSB information right away.
978 			 * However, to minimize any locking dependency, we
979 			 * don't restore user TSB information right away.
980 			 * Instead, we simply clear the TSTAT_TLB_STATS flag
981 			 * so that the user TSB information is automatically
982 			 * restored on next context switch.
983 			 *
984 			 * Note that the call to restore kernel TSB information
985 			 * will normally not fail, unless wrong information is
986 			 * passed here. In that scenario, system will still
987 			 * continue to function properly with the exception of
988 			 * kernel handling all the TLB misses.
989 			 */
990 			struct hv_tsb_block *hvbp = &ksfmmup->sfmmu_hvblock;
991 			cpu_t *cp = CPU;
992 
993 			cp->cpu_m.cpu_tstat_flags &= ~TSTAT_TLB_STATS;
994 			(void) hv_set_ctx0(hvbp->hv_tsb_info_cnt,
995 			    hvbp->hv_tsb_info_pa);
996 		}
997 	}
998 #endif
999 }
1000 
1001 /*
1002  * We use %tick as the time base when recording the time spent executing
1003  * the trap handler.  %tick, however, is not necessarily kept in sync
1004  * across CPUs (indeed, different CPUs may have different %tick frequencies).
1005  * We therefore cross call onto a CPU to get a snapshot of its data to
1006  * copy out; this is the routine executed out of that cross call.
1007  */
1008 static void
1009 trapstat_snapshot()
1010 {
1011 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
1012 	tstat_data_t *data = tcpu->tcpu_data;
1013 
1014 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1015 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1016 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ENABLED);
1017 
1018 #ifndef sun4v
1019 	data->tdata_snapts = gethrtime();
1020 	data->tdata_snaptick = rdtick();
1021 	bcopy(data, tstat_buffer, tstat_data_t_size);
1022 #else
1023 	/*
1024 	 * For sun4v, in order to conserve space in the limited
1025 	 * per-cpu 4K buffer, we derive certain info somewhere else and
1026 	 * copy them directly into the tstat_buffer output.
1027 	 * Note that we either are collecting tlb stats or
1028 	 * regular trapstats but never both.
1029 	 */
1030 	tstat_buffer->tdata_cpuid = CPU->cpu_id;
1031 	tstat_buffer->tdata_peffect = tcpu->tcpu_tdata_peffect;
1032 	tstat_buffer->tdata_snapts = gethrtime();
1033 	tstat_buffer->tdata_snaptick = rdtick();
1034 
1035 	if (tstat_options & TSTAT_OPT_TLBDATA) {
1036 		/* Copy tlb/tsb stats collected in the per-cpu trapdata */
1037 		tstat_tdata_t *tdata = (tstat_tdata_t *)data;
1038 		bcopy(&tdata->tdata_pgsz[0],
1039 		    &tstat_buffer->tdata_pgsz[0],
1040 		    tstat_pgszs * sizeof (tstat_pgszdata_t));
1041 
1042 		/*
1043 		 * Invoke processor specific interface to collect TLB stats
1044 		 * on each processor if enabled.
1045 		 */
1046 		if (tstat_fast_tlbstat) {
1047 			cpu_trapstat_data((void *) tstat_buffer->tdata_pgsz,
1048 			    tstat_pgszs);
1049 		}
1050 	} else {
1051 		/*
1052 		 * Normal trapstat collection.
1053 		 * Copy all the 4K data area into tstat_buffer tdata_trap
1054 		 * area.
1055 		 */
1056 		bcopy(data, &tstat_buffer->tdata_traps[0], TSTAT_DATA_SIZE);
1057 	}
1058 #endif /* sun4v */
1059 }
1060 
1061 /*
1062  * The TSTAT_RETENT_* constants define offsets in the TLB return entry.
1063  * They are used only in trapstat_tlbretent() (below) and #undef'd
1064  * immediately afterwards.  Any change to "retent" in trapstat_tlbretent()
1065  * will likely require changes to these constants.
1066  */
1067 
1068 #ifndef sun4v
1069 #define	TSTAT_RETENT_STATHI	1
1070 #define	TSTAT_RETENT_STATLO	2
1071 #define	TSTAT_RETENT_SHIFT	11
1072 #define	TSTAT_RETENT_COUNT_LD	13
1073 #define	TSTAT_RETENT_COUNT_ST	15
1074 #define	TSTAT_RETENT_TMPTSHI	16
1075 #define	TSTAT_RETENT_TMPTSLO	17
1076 #define	TSTAT_RETENT_TIME_LD	19
1077 #define	TSTAT_RETENT_TIME_ST	21
1078 #else /* sun4v */
1079 #define	TSTAT_RETENT_TDATASHFT	2
1080 #define	TSTAT_RETENT_STATHI	4
1081 #define	TSTAT_RETENT_STATLO	6
1082 #define	TSTAT_RETENT_SHIFT	9
1083 #define	TSTAT_RETENT_COUNT_LD	11
1084 #define	TSTAT_RETENT_COUNT_ST	13
1085 #define	TSTAT_RETENT_TMPTSHI	14
1086 #define	TSTAT_RETENT_TMPTSLO	16
1087 #define	TSTAT_RETENT_TIME_LD	18
1088 #define	TSTAT_RETENT_TIME_ST	20
1089 #endif /* sun4v */
1090 
1091 static void
1092 trapstat_tlbretent(tstat_percpu_t *tcpu, tstat_tlbretent_t *ret,
1093     tstat_missdata_t *data)
1094 {
1095 	uint32_t *ent = ret->ttlbrent_instr, shift;
1096 	uintptr_t base;
1097 #ifndef sun4v
1098 	uintptr_t tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
1099 #else
1100 	uintptr_t tmptick = TSTAT_CPU0_TLBDATA_OFFS(tcpu, tdata_tmptick);
1101 #endif
1102 
1103 	/*
1104 	 * This is the entry executed upon return from the TLB/TSB miss
1105 	 * handler (i.e. the code interpositioned between the "retry" and
1106 	 * the actual return to the TLB-missing instruction).  Detail on its
1107 	 * theory of operation can be found in the "TLB Statistics" section
1108 	 * of the block comment.  Note that we expect the TTE just loaded
1109 	 * into the TLB to be in %g5; all other globals are available as
1110 	 * scratch.  Finally, note that the page size information in sun4v is
1111 	 * located in the lower bits of the TTE -- requiring us to have a
1112 	 * different return entry on sun4v.
1113 	 */
1114 	static const uint32_t retent[TSTAT_TLBRET_NINSTR] = {
1115 #ifndef sun4v
1116 	    0x87410000,		/* rd    %tick, %g3			*/
1117 	    0x03000000,		/* sethi %hi(stat), %g1			*/
1118 	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
1119 	    0x89297001,		/* sllx  %g5, 1, %g4			*/
1120 	    0x8931303e,		/* srlx  %g4, 62, %g4			*/
1121 	    0x8531702e,		/* srlx  %g5, 46, %g2			*/
1122 	    0x8408a004,		/* and   %g2, 4, %g2			*/
1123 	    0x88110002,		/* or    %g4, %g2, %g4			*/
1124 	    0x80a12005,		/* cmp   %g4, 5				*/
1125 	    0x34400002,		/* bg,a,pn %icc, +8			*/
1126 	    0x88102004,		/* mov   4, %g4				*/
1127 	    0x89292000,		/* sll   %g4, shift, %g4		*/
1128 	    0x82004004,		/* add   %g1, %g4, %g1			*/
1129 	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
1130 	    0x8400a001,		/* add   %g2, 1, %g2			*/
1131 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
1132 	    0x0d000000,		/* sethi %hi(tdata_tmptick), %g6	*/
1133 	    0xc459a000,		/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
1134 	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
1135 	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
1136 	    0x84008003,		/* add   %g2, %g3, %g2			*/
1137 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
1138 	    0x83f00000		/* retry				*/
1139 #else /* sun4v */
1140 	    0x82102008,		/* mov   SCRATCHPAD_CPUID, %g1		*/
1141 	    0xced84400,		/* ldxa  [%g1]ASI_SCRATCHPAD, %g7	*/
1142 	    0x8f29f000,		/* sllx  %g7, TSTAT_DATA_SHIFT, %g7	*/
1143 	    0x87410000,		/* rd    %tick, %g3			*/
1144 	    0x03000000,		/* sethi %hi(stat), %g1			*/
1145 	    0x82004007,		/* add   %g1, %g7, %g1			*/
1146 	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
1147 	    0x8929703d,		/* sllx  %g5, 61, %g4			*/
1148 	    0x8931303d,		/* srlx  %g4, 61, %g4			*/
1149 	    0x89292000,		/* sll   %g4, shift, %g4		*/
1150 	    0x82004004,		/* add   %g1, %g4, %g1			*/
1151 	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
1152 	    0x8400a001,		/* add   %g2, 1, %g2			*/
1153 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
1154 	    0x0d000000,		/* sethi %hi(tdata_tmptick), %g6	*/
1155 	    0x8c018007,		/* add   %g6, %g7, %g6			*/
1156 	    0xc459a000,		/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
1157 	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
1158 	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
1159 	    0x84008003,		/* add   %g2, %g3, %g2			*/
1160 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
1161 	    0x83f00000		/* retry				*/
1162 #endif /* sun4v */
1163 	};
1164 
1165 	ASSERT(MUTEX_HELD(&tstat_lock));
1166 	/*CONSTCOND*/
1167 	ASSERT(offsetof(tstat_missdata_t, tmiss_count) <= LO10(-1));
1168 	/*CONSTCOND*/
1169 	ASSERT(offsetof(tstat_missdata_t, tmiss_time) <= LO10(-1));
1170 	/*CONSTCOND*/
1171 	ASSERT(!((sizeof (tstat_pgszdata_t) - 1) & sizeof (tstat_pgszdata_t)));
1172 
1173 	for (shift = 1; (1 << shift) != sizeof (tstat_pgszdata_t); shift++)
1174 		continue;
1175 
1176 	base = (uintptr_t)tcpu->tcpu_ibase + TSTAT_INSTR_SIZE +
1177 	    ((uintptr_t)data - (uintptr_t)tcpu->tcpu_data);
1178 
1179 	bcopy(retent, ent, sizeof (retent));
1180 
1181 #if defined(sun4v)
1182 	ent[TSTAT_RETENT_TDATASHFT] |= LO10((uintptr_t)TSTAT_DATA_SHIFT);
1183 #endif
1184 	ent[TSTAT_RETENT_STATHI] |= HI22(base);
1185 	ent[TSTAT_RETENT_STATLO] |= LO10(base);
1186 	ent[TSTAT_RETENT_SHIFT] |= shift;
1187 	/* LINTED E_EXPR_NULL_EFFECT */
1188 	ent[TSTAT_RETENT_COUNT_LD] |= offsetof(tstat_missdata_t, tmiss_count);
1189 	/* LINTED E_EXPR_NULL_EFFECT */
1190 	ent[TSTAT_RETENT_COUNT_ST] |= offsetof(tstat_missdata_t, tmiss_count);
1191 	ent[TSTAT_RETENT_TMPTSHI] |= HI22(tmptick);
1192 	ent[TSTAT_RETENT_TMPTSLO] |= LO10(tmptick);
1193 	ent[TSTAT_RETENT_TIME_LD] |= offsetof(tstat_missdata_t, tmiss_time);
1194 	ent[TSTAT_RETENT_TIME_ST] |= offsetof(tstat_missdata_t, tmiss_time);
1195 }
1196 
1197 #if defined(sun4v)
1198 #undef TSTAT_RETENT_TDATASHFT
1199 #endif
1200 #undef TSTAT_RETENT_STATHI
1201 #undef TSTAT_RETENT_STATLO
1202 #undef TSTAT_RETENT_SHIFT
1203 #undef TSTAT_RETENT_COUNT_LD
1204 #undef TSTAT_RETENT_COUNT_ST
1205 #undef TSTAT_RETENT_TMPTSHI
1206 #undef TSTAT_RETENT_TMPTSLO
1207 #undef TSTAT_RETENT_TIME_LD
1208 #undef TSTAT_RETENT_TIME_ST
1209 
1210 /*
1211  * The TSTAT_TLBENT_* constants define offsets in the TLB entry.  They are
1212  * used only in trapstat_tlbent() (below) and #undef'd immediately afterwards.
1213  * Any change to "tlbent" in trapstat_tlbent() will likely require changes
1214  * to these constants.
1215  */
1216 
1217 #ifndef sun4v
1218 #define	TSTAT_TLBENT_STATHI	0
1219 #define	TSTAT_TLBENT_STATLO_LD	1
1220 #define	TSTAT_TLBENT_STATLO_ST	3
1221 #define	TSTAT_TLBENT_MMUASI	15
1222 #define	TSTAT_TLBENT_TPCHI	18
1223 #define	TSTAT_TLBENT_TPCLO_USER	19
1224 #define	TSTAT_TLBENT_TPCLO_KERN	21
1225 #define	TSTAT_TLBENT_TSHI	25
1226 #define	TSTAT_TLBENT_TSLO	27
1227 #define	TSTAT_TLBENT_BA		28
1228 #else /* sun4v */
1229 #define	TSTAT_TLBENT_TDATASHFT	2
1230 #define	TSTAT_TLBENT_STATHI	3
1231 #define	TSTAT_TLBENT_STATLO_LD	5
1232 #define	TSTAT_TLBENT_STATLO_ST	7
1233 #define	TSTAT_TLBENT_TAGTARGET	23
1234 #define	TSTAT_TLBENT_TPCHI	25
1235 #define	TSTAT_TLBENT_TPCLO_USER	26
1236 #define	TSTAT_TLBENT_TPCLO_KERN	28
1237 #define	TSTAT_TLBENT_TSHI	32
1238 #define	TSTAT_TLBENT_TSLO	35
1239 #define	TSTAT_TLBENT_ADDRHI	36
1240 #define	TSTAT_TLBENT_ADDRLO	37
1241 #endif /* sun4v */
1242 
1243 static void
1244 trapstat_tlbent(tstat_percpu_t *tcpu, int entno)
1245 {
1246 	uint32_t *ent;
1247 	uintptr_t orig, va;
1248 #ifndef sun4v
1249 	uintptr_t baoffs;
1250 	int itlb = entno == TSTAT_ENT_ITLBMISS;
1251 	uint32_t asi = itlb ? ASI(ASI_IMMU) : ASI(ASI_DMMU);
1252 #else
1253 	int itlb = (entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_ITLBMISS);
1254 	uint32_t tagtarget_off = itlb ? MMFSA_I_CTX : MMFSA_D_CTX;
1255 	uint32_t *tent;			/* MMU trap vector entry */
1256 	uintptr_t tentva;		/* MMU trap vector entry va */
1257 	static const uint32_t mmumiss[TSTAT_ENT_NINSTR] = {
1258 	    0x30800000,			/* ba,a addr */
1259 	    NOP, NOP, NOP, NOP, NOP, NOP, NOP
1260 	};
1261 #endif
1262 	int entoffs = entno << TSTAT_ENT_SHIFT;
1263 	uintptr_t tmptick, stat, tpc, utpc;
1264 	tstat_pgszdata_t *data;
1265 	tstat_tlbdata_t *udata, *kdata;
1266 	tstat_tlbret_t *ret;
1267 
1268 #ifdef sun4v
1269 	data = &((tstat_tdata_t *)tcpu->tcpu_data)->tdata_pgsz[0];
1270 #else
1271 	data = &tcpu->tcpu_data->tdata_pgsz[0];
1272 #endif /* sun4v */
1273 
1274 	/*
1275 	 * When trapstat is run with TLB statistics, this is the entry for
1276 	 * both I- and D-TLB misses; this code performs trap level pushing,
1277 	 * as described in the "TLB Statistics" section of the block comment.
1278 	 * This code is executing at TL 1; %tstate[0] contains the saved
1279 	 * state at the time of the TLB miss.  Pushing trap level 1 (and thus
1280 	 * raising TL to 2) requires us to fill in %tstate[1] with our %pstate,
1281 	 * %cwp and %asi.  We leave %tt unchanged, and we set %tpc and %tnpc to
1282 	 * the appropriate TLB return entry (based on the context of the miss).
1283 	 * Finally, we sample %tick, and stash it in the tdata_tmptick member
1284 	 * the per-CPU tstat_data structure.  tdata_tmptick will be used in
1285 	 * the TLB return entry to determine the amount of time spent in the
1286 	 * TLB miss handler.
1287 	 *
1288 	 * Note that on sun4v platforms, we must obtain the context information
1289 	 * from the MMU fault status area. (The base address of this MMU fault
1290 	 * status area is kept in the scratchpad register 0.)
1291 	 */
1292 	static const uint32_t tlbent[] = {
1293 #ifndef sun4v
1294 	    0x03000000,			/* sethi %hi(stat), %g1		*/
1295 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1296 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1297 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1298 	    0x85524000,			/* rdpr  %cwp, %g2		*/
1299 	    0x87518000,			/* rdpr  %pstate, %g3		*/
1300 	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
1301 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1302 	    0x8740c000,			/* rd    %asi, %g3		*/
1303 	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
1304 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1305 	    0x8350c000,			/* rdpr  %tt, %g1		*/
1306 	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
1307 	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
1308 	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
1309 	    0xc2d80000,			/* ldxa  [%g0]ASI_MMU, %g1	*/
1310 	    0x83307030,			/* srlx  %g1, CTXSHIFT, %g1	*/
1311 	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
1312 	    0x03000000,			/* sethi %hi(new_tpc), %g1	*/
1313 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1314 	    0x30800002,			/* ba,a  .+0x8			*/
1315 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1316 	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
1317 	    0x82006004,			/* add   %g1, 4, %g1		*/
1318 	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
1319 	    0x03000000,			/* sethi %hi(tmptick), %g1	*/
1320 	    0x85410000,			/* rd    %tick, %g2		*/
1321 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
1322 	    0x30800000,			/* ba,a  addr			*/
1323 	    NOP, NOP, NOP
1324 #else /* sun4v */
1325 	    0x82102008,			/* mov SCRATCHPAD_CPUID, %g1	*/
1326 	    0xc8d84400,			/* ldxa [%g1]ASI_SCRATCHPAD, %g4 */
1327 	    0x89293000,			/* sllx %g4, TSTAT_DATA_SHIFT, %g4 */
1328 	    0x03000000,			/* sethi %hi(stat), %g1		*/
1329 	    0x82004004,			/* add %g1, %g4, %g1		*/
1330 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1331 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1332 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1333 	    0x85524000,			/* rdpr  %cwp, %g2		*/
1334 	    0x87518000,			/* rdpr  %pstate, %g3		*/
1335 	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
1336 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1337 	    0x8740c000,			/* rd    %asi, %g3		*/
1338 	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
1339 	    0x83540000,			/* rdpr  %gl, %g1		*/
1340 	    0x83287028,			/* sllx  %g1, 40, %g1		*/
1341 	    0x86104003,			/* or    %g1, %g3, %g3		*/
1342 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1343 	    0x8350c000,			/* rdpr  %tt, %g1		*/
1344 	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
1345 	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
1346 	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
1347 	    0xc2d80400,			/* ldxa  [%g0]ASI_SCRATCHPAD, %g1 */
1348 	    0xc2586000,			/* ldx  [%g1 + MMFSA_?_CTX], %g1 */
1349 	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
1350 	    0x03000000,			/* sethi %hi(new_tpc), %g1	*/
1351 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1352 	    0x30800002,			/* ba,a  .+0x8			*/
1353 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1354 	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
1355 	    0x82006004,			/* add   %g1, 4, %g1		*/
1356 	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
1357 	    0x03000000,			/* sethi %hi(tmptick), %g1	*/
1358 	    0x82004004,			/* add %g1, %g4, %g1		*/
1359 	    0x85410000,			/* rd    %tick, %g2		*/
1360 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
1361 	    0x05000000,			/* sethi %hi(addr), %g2		*/
1362 	    0x8410a000,			/* or %g2, %lo(addr), %g2	*/
1363 	    0x81c08000,			/* jmp %g2			*/
1364 	    NOP,
1365 #endif /* sun4v */
1366 	};
1367 
1368 	ASSERT(MUTEX_HELD(&tstat_lock));
1369 #ifndef sun4v
1370 	ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS);
1371 
1372 	stat = TSTAT_DATA_OFFS(tcpu, tdata_traps) + entoffs;
1373 	tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
1374 #else /* sun4v */
1375 	ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS ||
1376 	    entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_DMMUMISS);
1377 
1378 	stat = TSTAT_CPU0_TLBDATA_OFFS(tcpu, tdata_traps[entno]);
1379 	tmptick = TSTAT_CPU0_TLBDATA_OFFS(tcpu, tdata_tmptick);
1380 #endif /* sun4v */
1381 
1382 	if (itlb) {
1383 		ret = &tcpu->tcpu_instr->tinst_itlbret;
1384 		udata = &data->tpgsz_user.tmode_itlb;
1385 		kdata = &data->tpgsz_kernel.tmode_itlb;
1386 		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_itlbret.ttlbr_ktlb);
1387 	} else {
1388 		ret = &tcpu->tcpu_instr->tinst_dtlbret;
1389 		udata = &data->tpgsz_user.tmode_dtlb;
1390 		kdata = &data->tpgsz_kernel.tmode_dtlb;
1391 		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_dtlbret.ttlbr_ktlb);
1392 	}
1393 
1394 	utpc = tpc + offsetof(tstat_tlbret_t, ttlbr_utlb) -
1395 	    offsetof(tstat_tlbret_t, ttlbr_ktlb);
1396 
1397 	ASSERT(HI22(tpc) == HI22(utpc));
1398 
1399 	ent = (uint32_t *)((uintptr_t)tcpu->tcpu_instr + entoffs);
1400 	orig = KERNELBASE + entoffs;
1401 	va = (uintptr_t)tcpu->tcpu_ibase + entoffs;
1402 
1403 #ifdef sun4v
1404 	/*
1405 	 * Because of lack of space, interposing tlbent trap handler
1406 	 * for TLB and MMU miss traps cannot be placed in-line. Instead,
1407 	 * we copy it to the space set aside for shared trap handlers
1408 	 * continuation in the interposing trap table and invoke it by
1409 	 * placing a branch in the trap table itself.
1410 	 */
1411 	tent = ent;		/* trap vector entry */
1412 	tentva = va;		/* trap vector entry va */
1413 
1414 	if (itlb) {
1415 		ent = (uint32_t *)((uintptr_t)
1416 		    &tcpu->tcpu_instr->tinst_immumiss);
1417 		va = TSTAT_INSTR_OFFS(tcpu, tinst_immumiss);
1418 	} else {
1419 		ent = (uint32_t *)((uintptr_t)
1420 		    &tcpu->tcpu_instr->tinst_dmmumiss);
1421 		va = TSTAT_INSTR_OFFS(tcpu, tinst_dmmumiss);
1422 	}
1423 	bcopy(mmumiss, tent, sizeof (mmumiss));
1424 	tent[0] |= DISP22(tentva, va);
1425 #endif /* sun4v */
1426 
1427 	bcopy(tlbent, ent, sizeof (tlbent));
1428 
1429 #if defined(sun4v)
1430 	ent[TSTAT_TLBENT_TDATASHFT] |= LO10((uintptr_t)TSTAT_DATA_SHIFT);
1431 #endif
1432 	ent[TSTAT_TLBENT_STATHI] |= HI22(stat);
1433 	ent[TSTAT_TLBENT_STATLO_LD] |= LO10(stat);
1434 	ent[TSTAT_TLBENT_STATLO_ST] |= LO10(stat);
1435 #ifndef sun4v
1436 	ent[TSTAT_TLBENT_MMUASI] |= asi;
1437 #else
1438 	ent[TSTAT_TLBENT_TAGTARGET] |= tagtarget_off;
1439 #endif
1440 	ent[TSTAT_TLBENT_TPCHI] |= HI22(tpc);
1441 	ent[TSTAT_TLBENT_TPCLO_USER] |= LO10(utpc);
1442 	ent[TSTAT_TLBENT_TPCLO_KERN] |= LO10(tpc);
1443 	ent[TSTAT_TLBENT_TSHI] |= HI22(tmptick);
1444 	ent[TSTAT_TLBENT_TSLO] |= LO10(tmptick);
1445 #ifndef	sun4v
1446 	baoffs = TSTAT_TLBENT_BA * sizeof (uint32_t);
1447 	ent[TSTAT_TLBENT_BA] |= DISP22(va + baoffs, orig);
1448 #else
1449 	ent[TSTAT_TLBENT_ADDRHI] |= HI22(orig);
1450 	ent[TSTAT_TLBENT_ADDRLO] |= LO10(orig);
1451 #endif /* sun4v */
1452 
1453 	/*
1454 	 * And now set up the TLB return entries.
1455 	 */
1456 	trapstat_tlbretent(tcpu, &ret->ttlbr_ktlb, &kdata->ttlb_tlb);
1457 	trapstat_tlbretent(tcpu, &ret->ttlbr_ktsb, &kdata->ttlb_tsb);
1458 	trapstat_tlbretent(tcpu, &ret->ttlbr_utlb, &udata->ttlb_tlb);
1459 	trapstat_tlbretent(tcpu, &ret->ttlbr_utsb, &udata->ttlb_tsb);
1460 }
1461 
1462 #if defined(sun4v)
1463 #undef TSTAT_TLBENT_TDATASHFT
1464 #endif
1465 #undef TSTAT_TLBENT_STATHI
1466 #undef TSTAT_TLBENT_STATLO_LD
1467 #undef TSTAT_TLBENT_STATLO_ST
1468 #ifndef sun4v
1469 #undef TSTAT_TLBENT_MMUASI
1470 #else
1471 #undef TSTAT_TLBENT_TAGTARGET
1472 #endif
1473 #undef TSTAT_TLBENT_TPCHI
1474 #undef TSTAT_TLBENT_TPCLO_USER
1475 #undef TSTAT_TLBENT_TPCLO_KERN
1476 #undef TSTAT_TLBENT_TSHI
1477 #undef TSTAT_TLBENT_TSLO
1478 #undef TSTAT_TLBENT_BA
1479 
1480 /*
1481  * The TSTAT_ENABLED_* constants define offsets in the enabled entry; the
1482  * TSTAT_DISABLED_BA constant defines an offset in the disabled entry.  Both
1483  * sets of constants are used only in trapstat_make_traptab() (below) and
1484  * #undef'd immediately afterwards.  Any change to "enabled" or "disabled"
1485  * in trapstat_make_traptab() will likely require changes to these constants.
1486  */
1487 #ifndef sun4v
1488 #define	TSTAT_ENABLED_STATHI	0
1489 #define	TSTAT_ENABLED_STATLO_LD	1
1490 #define	TSTAT_ENABLED_STATLO_ST 3
1491 #define	TSTAT_ENABLED_BA	4
1492 #define	TSTAT_DISABLED_BA	0
1493 
1494 static void
1495 trapstat_make_traptab(tstat_percpu_t *tcpu)
1496 {
1497 	uint32_t *ent;
1498 	uint64_t *stat;
1499 	uintptr_t orig, va, en_baoffs, dis_baoffs;
1500 	int nent;
1501 
1502 	/*
1503 	 * This is the entry in the interposing trap table for enabled trap
1504 	 * table entries.  It loads a counter, increments it and stores it
1505 	 * back before branching to the actual trap table entry.
1506 	 */
1507 	static const uint32_t enabled[TSTAT_ENT_NINSTR] = {
1508 	    0x03000000,			/* sethi %hi(stat), %g1		*/
1509 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1510 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1511 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1512 	    0x30800000,			/* ba,a addr			*/
1513 	    NOP, NOP, NOP
1514 	};
1515 
1516 	/*
1517 	 * This is the entry in the interposing trap table for disabled trap
1518 	 * table entries.  It simply branches to the actual, underlying trap
1519 	 * table entry.  As explained in the "Implementation Details" section
1520 	 * of the block comment, all TL>0 traps _must_ use the disabled entry;
1521 	 * additional entries may be explicitly disabled through the use
1522 	 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY.
1523 	 */
1524 	static const uint32_t disabled[TSTAT_ENT_NINSTR] = {
1525 	    0x30800000,			/* ba,a addr			*/
1526 	    NOP, NOP, NOP, NOP, NOP, NOP, NOP,
1527 	};
1528 
1529 	ASSERT(MUTEX_HELD(&tstat_lock));
1530 
1531 	ent = tcpu->tcpu_instr->tinst_traptab;
1532 	stat = (uint64_t *)TSTAT_DATA_OFFS(tcpu, tdata_traps);
1533 	orig = KERNELBASE;
1534 	va = (uintptr_t)tcpu->tcpu_ibase;
1535 	en_baoffs = TSTAT_ENABLED_BA * sizeof (uint32_t);
1536 	dis_baoffs = TSTAT_DISABLED_BA * sizeof (uint32_t);
1537 
1538 	for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) {
1539 		if (tstat_enabled[nent]) {
1540 			bcopy(enabled, ent, sizeof (enabled));
1541 			ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat);
1542 			ent[TSTAT_ENABLED_STATLO_LD] |= LO10((uintptr_t)stat);
1543 			ent[TSTAT_ENABLED_STATLO_ST] |= LO10((uintptr_t)stat);
1544 			ent[TSTAT_ENABLED_BA] |= DISP22(va + en_baoffs, orig);
1545 		} else {
1546 			bcopy(disabled, ent, sizeof (disabled));
1547 			ent[TSTAT_DISABLED_BA] |= DISP22(va + dis_baoffs, orig);
1548 		}
1549 
1550 		stat++;
1551 		orig += sizeof (enabled);
1552 		ent += sizeof (enabled) / sizeof (*ent);
1553 		va += sizeof (enabled);
1554 	}
1555 }
1556 
1557 #undef TSTAT_ENABLED_STATHI
1558 #undef TSTAT_ENABLED_STATLO_LD
1559 #undef TSTAT_ENABLED_STATLO_ST
1560 #undef TSTAT_ENABLED_BA
1561 #undef TSTAT_DISABLED_BA
1562 
1563 #else /* sun4v */
1564 
1565 #define	TSTAT_ENABLED_STATHI	0
1566 #define	TSTAT_ENABLED_STATLO	1
1567 #define	TSTAT_ENABLED_ADDRHI	2
1568 #define	TSTAT_ENABLED_ADDRLO	3
1569 #define	TSTAT_ENABLED_CONTBA	6
1570 #define	TSTAT_ENABLED_TDATASHFT	7
1571 #define	TSTAT_DISABLED_ADDRHI	0
1572 #define	TSTAT_DISABLED_ADDRLO	1
1573 
1574 static void
1575 trapstat_make_traptab(tstat_percpu_t *tcpu)
1576 {
1577 	uint32_t *ent;
1578 	uint64_t *stat;
1579 	uintptr_t orig, va, en_baoffs;
1580 	uintptr_t tstat_cont_va;
1581 	int nent;
1582 
1583 	/*
1584 	 * This is the entry in the interposing trap table for enabled trap
1585 	 * table entries.  It loads a counter, increments it and stores it
1586 	 * back before branching to the actual trap table entry.
1587 	 *
1588 	 * All CPUs share the same interposing trap entry to count the
1589 	 * number of traps. Note that the trap counter is kept in per CPU
1590 	 * trap statistics area. Its address is obtained dynamically by
1591 	 * adding the offset of that CPU's trap statistics area from CPU 0
1592 	 * (i.e. cpu_id * TSTAT_DATA_SIZE) to the address of the CPU 0
1593 	 * trap counter already coded in the interposing trap entry itself.
1594 	 *
1595 	 * Since this interposing code sequence to count traps takes more
1596 	 * than 8 instructions, it's split in two parts as follows:
1597 	 *
1598 	 *   tstat_trapcnt:
1599 	 *	sethi %hi(stat), %g1
1600 	 *	or    %g1, %lo[stat), %g1	! %g1 = CPU0 trap counter addr
1601 	 *	sethi %hi(addr), %g2
1602 	 *	or    %g2, %lo(addr), %g2	! %g2 = real trap handler addr
1603 	 *	mov   ASI_SCRATCHPAD_CPUID, %g3
1604 	 *	ldxa [%g3]ASI_SCRATCHPAD, %g3	! %g3 = CPU ID
1605 	 *	ba tstat_trapcnt_cont		! branch to tstat_trapcnt_cont
1606 	 *	sllx %g3, TSTAT_DATA_SHIFT, %g3	! %g3 = CPU trapstat data offset
1607 	 *
1608 	 *   tstat_trapcnt_cont:
1609 	 *	ldx [%g1 + %g3], %g4		! get counter value
1610 	 *	add %g4, 1, %g4			! increment value
1611 	 *	jmp %g2				! jump to original trap handler
1612 	 *	stx %g4, [%g1 + %g3]		! store counter value
1613 	 *
1614 	 * First part, i.e. tstat_trapcnt, is per trap and is kept in-line in
1615 	 * the interposing trap table. However, the tstat_trapcnt_cont code
1616 	 * sequence is shared by all traps and is kept right after the
1617 	 * the interposing trap table.
1618 	 */
1619 	static const uint32_t enabled[TSTAT_ENT_NINSTR] = {
1620 	    0x03000000,			/* sethi %hi(stat), %g1		*/
1621 	    0x82106000,			/* or   %g1, %lo[stat), %g1	*/
1622 	    0x05000000,			/* sethi %hi(addr), %g2		*/
1623 	    0x8410a000,			/* or   %g2, %lo(addr), %g2	*/
1624 	    0x86102008,			/* mov	ASI_SCRATCHPAD_CPUID, %g3 */
1625 	    0xc6d8c400,			/* ldxa [%g3]ASI_SCRATCHPAD, %g3 */
1626 	    0x10800000,			/* ba enabled_cont		*/
1627 	    0x8728f000			/* sllx %g3, TSTAT_DATA_SHIFT, %g3 */
1628 	};
1629 
1630 	static const uint32_t enabled_cont[TSTAT_ENT_NINSTR] = {
1631 	    0xc8584003,			/* ldx [%g1 + %g3], %g4		*/
1632 	    0x88012001,			/* add %g4, 1, %g4		*/
1633 	    0x81c08000,			/* jmp %g2			*/
1634 	    0xc8704003,			/* stx %g4, [%g1 + %g3]		*/
1635 	    NOP, NOP, NOP, NOP
1636 	};
1637 
1638 	/*
1639 	 * This is the entry in the interposing trap table for disabled trap
1640 	 * table entries.  It simply "jmp" to the actual, underlying trap
1641 	 * table entry.  As explained in the "Implementation Details" section
1642 	 * of the block comment, all TL>0 traps _must_ use the disabled entry;
1643 	 * additional entries may be explicitly disabled through the use
1644 	 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY.
1645 	 */
1646 	static const uint32_t disabled[TSTAT_ENT_NINSTR] = {
1647 	    0x05000000,			/* sethi %hi(addr), %g2		*/
1648 	    0x8410a000,			/* or %g2, %lo(addr), %g2	*/
1649 	    0x81c08000,			/* jmp %g2			*/
1650 	    NOP, NOP, NOP, NOP, NOP,
1651 	};
1652 
1653 	ASSERT(MUTEX_HELD(&tstat_lock));
1654 	ent = tcpu->tcpu_instr->tinst_traptab;
1655 	stat = (uint64_t *)TSTAT_CPU0_DATA_OFFS(tcpu, tdata_traps);
1656 	orig = KERNELBASE;
1657 	va = (uintptr_t)tcpu->tcpu_ibase;
1658 	en_baoffs = TSTAT_ENABLED_CONTBA * sizeof (uint32_t);
1659 	tstat_cont_va = TSTAT_INSTR_OFFS(tcpu, tinst_trapcnt);
1660 
1661 	for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) {
1662 		/*
1663 		 * If TSTAT_OPT_TLBDATA option is enabled (-t or -T option)
1664 		 * we make sure only TSTAT_TLB_NENT traps can be enabled.
1665 		 * Note that this logic is somewhat moot since trapstat
1666 		 * cmd actually use TSTATIOC_NOENTRY ioctl to disable all
1667 		 * traps when performing Tlb stats collection.
1668 		 */
1669 		if ((!(tstat_options & TSTAT_OPT_TLBDATA) ||
1670 		    nent < TSTAT_TLB_NENT) && tstat_enabled[nent]) {
1671 			bcopy(enabled, ent, sizeof (enabled));
1672 			ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat);
1673 			ent[TSTAT_ENABLED_STATLO] |= LO10((uintptr_t)stat);
1674 			ent[TSTAT_ENABLED_ADDRHI] |= HI22((uintptr_t)orig);
1675 			ent[TSTAT_ENABLED_ADDRLO] |= LO10((uintptr_t)orig);
1676 			ent[TSTAT_ENABLED_CONTBA] |=
1677 			    DISP22(va + en_baoffs, tstat_cont_va);
1678 			ent[TSTAT_ENABLED_TDATASHFT] |=
1679 			    LO10((uintptr_t)TSTAT_DATA_SHIFT);
1680 		} else {
1681 			bcopy(disabled, ent, sizeof (disabled));
1682 			ent[TSTAT_DISABLED_ADDRHI] |= HI22((uintptr_t)orig);
1683 			ent[TSTAT_DISABLED_ADDRLO] |= LO10((uintptr_t)orig);
1684 		}
1685 
1686 		stat++;
1687 		orig += sizeof (enabled);
1688 		ent += sizeof (enabled) / sizeof (*ent);
1689 		va += sizeof (enabled);
1690 	}
1691 	bcopy(enabled_cont, (uint32_t *)tcpu->tcpu_instr->tinst_trapcnt,
1692 	    sizeof (enabled_cont));
1693 }
1694 
/*
 * Retire every offset constant defined for the sun4v flavor of
 * trapstat_make_traptab().  The disabled entry of this flavor is patched
 * through TSTAT_DISABLED_ADDRHI/TSTAT_DISABLED_ADDRLO; TSTAT_DISABLED_BA
 * is not defined in this branch.
 */
#undef	TSTAT_ENABLED_TDATASHFT
#undef	TSTAT_ENABLED_STATHI
#undef	TSTAT_ENABLED_STATLO
#undef	TSTAT_ENABLED_ADDRHI
#undef	TSTAT_ENABLED_ADDRLO
#undef	TSTAT_ENABLED_CONTBA
#undef	TSTAT_DISABLED_ADDRHI
#undef	TSTAT_DISABLED_ADDRLO
1702 
1703 #endif /* sun4v */
1704 
1705 #ifndef sun4v
1706 /*
1707  * See Section A.6 in SPARC v9 Manual.
1708  * max branch = 4*((2^21)-1) = 8388604
1709  */
1710 #define	MAX_BICC_BRANCH_DISPLACEMENT (4 * ((1 << 21) - 1))
1711 #endif
1712 
1713 static void
1714 trapstat_setup(processorid_t cpu)
1715 {
1716 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1717 #ifndef sun4v
1718 	int i;
1719 	caddr_t va;
1720 	pfn_t *pfn;
1721 	cpu_t *cp;
1722 	uint_t strand_idx;
1723 	size_t tstat_offset;
1724 #else
1725 	uint64_t offset;
1726 #endif
1727 
1728 	ASSERT(tcpu->tcpu_pfn == NULL);
1729 	ASSERT(tcpu->tcpu_instr == NULL);
1730 	ASSERT(tcpu->tcpu_data == NULL);
1731 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1732 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
1733 	ASSERT(MUTEX_HELD(&cpu_lock));
1734 	ASSERT(MUTEX_HELD(&tstat_lock));
1735 
1736 #ifndef sun4v
1737 	/*
1738 	 * The lower fifteen bits of the %tba are always read as zero; we must
1739 	 * align our instruction base address appropriately.
1740 	 */
1741 	tstat_offset = tstat_total_size;
1742 
1743 	cp = cpu_get(cpu);
1744 	ASSERT(cp != NULL);
1745 	if ((strand_idx = cpu ^ pg_plat_hw_instance_id(cp, PGHW_IPIPE)) != 0) {
1746 		/*
1747 		 * On sun4u platforms with multiple CPUs sharing the MMU
1748 		 * (Olympus-C has 2 strands per core), each CPU uses a
1749 		 * disjoint trap table.  The indexing is based on the
1750 		 * strand id, which is obtained by XOR'ing the cpuid with
1751 		 * the coreid.
1752 		 */
1753 		tstat_offset += tstat_total_size * strand_idx;
1754 
1755 		/*
1756 		 * Offset must be less than the maximum PC-relative branch
1757 		 * displacement for Bicc variants.  See the Implementation
1758 		 * Details comment.
1759 		 */
1760 		ASSERT(tstat_offset <= MAX_BICC_BRANCH_DISPLACEMENT);
1761 	}
1762 
1763 	tcpu->tcpu_ibase = (caddr_t)((KERNELBASE - tstat_offset)
1764 	    & TSTAT_TBA_MASK);
1765 	tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE;
1766 	tcpu->tcpu_vabase = tcpu->tcpu_ibase;
1767 
1768 	tcpu->tcpu_pfn = vmem_alloc(tstat_arena, tstat_total_pages, VM_SLEEP);
1769 	bzero(tcpu->tcpu_pfn, tstat_total_pages);
1770 	pfn = tcpu->tcpu_pfn;
1771 
1772 	tcpu->tcpu_instr = vmem_alloc(tstat_arena, TSTAT_INSTR_SIZE, VM_SLEEP);
1773 
1774 	va = (caddr_t)tcpu->tcpu_instr;
1775 	for (i = 0; i < TSTAT_INSTR_PAGES; i++, va += MMU_PAGESIZE)
1776 		*pfn++ = hat_getpfnum(kas.a_hat, va);
1777 
1778 	/*
1779 	 * We must be sure that the pages that we will use to examine the data
1780 	 * have the same virtual color as the pages to which the data is being
1781 	 * recorded, hence the alignment and phase constraints on the
1782 	 * allocation.
1783 	 */
1784 	tcpu->tcpu_data = vmem_xalloc(tstat_arena, tstat_data_size,
1785 	    shm_alignment, (uintptr_t)tcpu->tcpu_dbase & (shm_alignment - 1),
1786 	    0, 0, NULL, VM_SLEEP);
1787 	bzero(tcpu->tcpu_data, tstat_data_size);
1788 	tcpu->tcpu_data->tdata_cpuid = cpu;
1789 
1790 	va = (caddr_t)tcpu->tcpu_data;
1791 	for (i = 0; i < tstat_data_pages; i++, va += MMU_PAGESIZE)
1792 		*pfn++ = hat_getpfnum(kas.a_hat, va);
1793 
1794 	/*
1795 	 * Now that we have all of the instruction and data pages allocated,
1796 	 * make the trap table from scratch.
1797 	 */
1798 	trapstat_make_traptab(tcpu);
1799 
1800 	if (tstat_options & TSTAT_OPT_TLBDATA) {
1801 		/*
1802 		 * TLB Statistics have been specified; set up the I- and D-TLB
1803 		 * entries and corresponding TLB return entries.
1804 		 */
1805 		trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
1806 		trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
1807 	}
1808 
1809 #else /* sun4v */
1810 
1811 	/*
1812 	 * The lower fifteen bits of the %tba are always read as zero; hence
1813 	 * it must be aligned at least on 512K boundary.
1814 	 */
1815 	tcpu->tcpu_vabase = (caddr_t)(KERNELBASE -
1816 	    MMU_PAGESIZE4M * tstat_num4m_mapping);
1817 	tcpu->tcpu_ibase = tcpu->tcpu_vabase;
1818 	tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE +
1819 	    cpu * TSTAT_DATA_SIZE;
1820 
1821 	tcpu->tcpu_pfn = &tstat_pfn[0];
1822 	tcpu->tcpu_instr = (tstat_instr_t *)tstat_va[0];
1823 
1824 	offset = TSTAT_INSTR_SIZE + cpu * TSTAT_DATA_SIZE;
1825 	tcpu->tcpu_data = (tstat_data_t *)(tstat_va[offset >> MMU_PAGESHIFT4M] +
1826 	    (offset & MMU_PAGEOFFSET4M));
1827 	bzero(tcpu->tcpu_data, TSTAT_DATA_SIZE);
1828 
1829 	/*
1830 	 * Now that we have all of the instruction and data pages allocated,
1831 	 * make the trap table from scratch. It should be done only once
1832 	 * as it is shared by all CPUs.
1833 	 */
1834 	if (!tstat_traptab_initialized)
1835 		trapstat_make_traptab(tcpu);
1836 
1837 	if (tstat_options & TSTAT_OPT_TLBDATA) {
1838 		/*
1839 		 * TLB Statistics have been specified; set up the I- and D-TLB
1840 		 * entries and corresponding TLB return entries.
1841 		 */
1842 		if (!tstat_traptab_initialized) {
1843 			if (tstat_fast_tlbstat) {
1844 				trapstat_tlbent(tcpu, TSTAT_ENT_IMMUMISS);
1845 				trapstat_tlbent(tcpu, TSTAT_ENT_DMMUMISS);
1846 			} else {
1847 				trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
1848 				trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
1849 			}
1850 		}
1851 	}
1852 	tstat_traptab_initialized = 1;
1853 #endif /* sun4v */
1854 
1855 	tcpu->tcpu_flags |= TSTAT_CPU_ALLOCATED;
1856 
1857 	/*
1858 	 * Finally, get the target CPU to load the locked pages into its TLBs.
1859 	 */
1860 	xc_one(cpu, (xcfunc_t *)trapstat_load_tlb, 0, 0);
1861 }
1862 
1863 static void
1864 trapstat_teardown(processorid_t cpu)
1865 {
1866 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1867 	int i;
1868 	caddr_t va = tcpu->tcpu_vabase;
1869 
1870 	ASSERT(tcpu->tcpu_pfn != NULL);
1871 	ASSERT(tcpu->tcpu_instr != NULL);
1872 	ASSERT(tcpu->tcpu_data != NULL);
1873 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1874 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1875 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1876 	ASSERT(MUTEX_HELD(&cpu_lock));
1877 	ASSERT(MUTEX_HELD(&tstat_lock));
1878 
1879 #ifndef sun4v
1880 	vmem_free(tstat_arena, tcpu->tcpu_pfn, tstat_total_pages);
1881 	vmem_free(tstat_arena, tcpu->tcpu_instr, TSTAT_INSTR_SIZE);
1882 	vmem_free(tstat_arena, tcpu->tcpu_data, tstat_data_size);
1883 
1884 	for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
1885 		xt_one(cpu, vtag_flushpage_tl1, (uint64_t)va,
1886 		    (uint64_t)ksfmmup);
1887 	}
1888 #else
1889 	for (i = 0; i < tstat_num4m_mapping; i++) {
1890 		xt_one(cpu, vtag_unmap_perm_tl1, (uint64_t)va, KCONTEXT);
1891 		va += MMU_PAGESIZE4M;
1892 	}
1893 #endif
1894 
1895 	tcpu->tcpu_pfn = NULL;
1896 	tcpu->tcpu_instr = NULL;
1897 	tcpu->tcpu_data = NULL;
1898 	tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
1899 }
1900 
1901 static int
1902 trapstat_go()
1903 {
1904 	cpu_t *cp;
1905 #ifdef sun4v
1906 	int i;
1907 #endif /* sun4v */
1908 
1909 	mutex_enter(&cpu_lock);
1910 	mutex_enter(&tstat_lock);
1911 
1912 	if (tstat_running) {
1913 		mutex_exit(&tstat_lock);
1914 		mutex_exit(&cpu_lock);
1915 		return (EBUSY);
1916 	}
1917 
1918 #ifdef sun4v
1919 	/*
1920 	 * Compute the actual number of 4MB mappings
1921 	 * we need based on the guest's ncpu_guest_max value.
1922 	 * Note that earlier at compiled time, we did establish
1923 	 * and check against the sun4v solaris arch limit
1924 	 * (TSTAT_NUM4M_LIMIT) which is based on NCPU.
1925 	 */
1926 	tstat_num4m_mapping = TSTAT_NUM4M_MACRO(ncpu_guest_max);
1927 	ASSERT(tstat_num4m_mapping <= TSTAT_NUM4M_LIMIT);
1928 
1929 	/*
1930 	 * Allocate large pages to hold interposing tables.
1931 	 */
1932 	for (i = 0; i < tstat_num4m_mapping; i++) {
1933 		tstat_va[i] = contig_mem_alloc(MMU_PAGESIZE4M);
1934 		tstat_pfn[i] = va_to_pfn(tstat_va[i]);
1935 		if (tstat_pfn[i] == PFN_INVALID) {
1936 			int j;
1937 			for (j = 0; j < i; j++) {
1938 				contig_mem_free(tstat_va[j], MMU_PAGESIZE4M);
1939 			}
1940 			mutex_exit(&tstat_lock);
1941 			mutex_exit(&cpu_lock);
1942 			return (EAGAIN);
1943 		}
1944 	}
1945 
1946 
1947 	/*
1948 	 * For detailed TLB statistics, invoke CPU specific interface
1949 	 * to see if it supports a low overhead interface to collect
1950 	 * TSB hit statistics. If so, make set tstat_fast_tlbstat flag
1951 	 * to reflect that.
1952 	 */
1953 	if (tstat_options & TSTAT_OPT_TLBDATA) {
1954 		int error;
1955 
1956 		tstat_fast_tlbstat = B_FALSE;
1957 		error = cpu_trapstat_conf(CPU_TSTATCONF_INIT);
1958 		if (error == 0)
1959 			tstat_fast_tlbstat = B_TRUE;
1960 		else if (error != ENOTSUP) {
1961 			for (i = 0; i < tstat_num4m_mapping; i++) {
1962 				contig_mem_free(tstat_va[i], MMU_PAGESIZE4M);
1963 			}
1964 			mutex_exit(&tstat_lock);
1965 			mutex_exit(&cpu_lock);
1966 			return (error);
1967 		}
1968 	}
1969 
1970 	tstat_hv_nopanic = 1;
1971 	tstat_perm_mapping_failed = 0;
1972 #endif /* sun4v */
1973 
1974 	/*
1975 	 * First, perform any necessary hot patching.
1976 	 */
1977 	trapstat_hotpatch();
1978 
1979 	/*
1980 	 * Allocate the resources we'll need to measure probe effect.
1981 	 */
1982 	trapstat_probe_alloc();
1983 
1984 	cp = cpu_list;
1985 	do {
1986 		if (!(tstat_percpu[cp->cpu_id].tcpu_flags & TSTAT_CPU_SELECTED))
1987 			continue;
1988 
1989 		trapstat_setup(cp->cpu_id);
1990 
1991 		/*
1992 		 * Note that due to trapstat_probe()'s use of global data,
1993 		 * we determine the probe effect on each CPU serially instead
1994 		 * of in parallel with an xc_all().
1995 		 */
1996 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_probe, 0, 0);
1997 
1998 #ifdef sun4v
1999 		/*
2000 		 * Check to see if the first cpu's attempt to create
2001 		 * the perm mappings failed. This might happen if the
2002 		 * guest somehow exhausted all its limited perm mappings.
2003 		 * Note that we only check this once for the first
2004 		 * attempt since it shouldn't fail for subsequent cpus
2005 		 * mapping the same TTEs if the first attempt was successful.
2006 		 */
2007 		if (tstat_hv_nopanic && tstat_perm_mapping_failed) {
2008 			tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id];
2009 			for (i = 0; i < tstat_num4m_mapping; i++) {
2010 				contig_mem_free(tstat_va[i], MMU_PAGESIZE4M);
2011 			}
2012 
2013 			/*
2014 			 * Do clean up before returning.
2015 			 * Cleanup is manageable since we
2016 			 * only need to do it for the first cpu
2017 			 * iteration that failed.
2018 			 */
2019 			trapstat_probe_free();
2020 			trapstat_hotpatch();
2021 			tcpu->tcpu_pfn = NULL;
2022 			tcpu->tcpu_instr = NULL;
2023 			tcpu->tcpu_data = NULL;
2024 			tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
2025 			mutex_exit(&tstat_lock);
2026 			mutex_exit(&cpu_lock);
2027 			return (EAGAIN);
2028 		}
2029 		tstat_hv_nopanic = 0;
2030 #endif /* sun4v */
2031 
2032 	} while ((cp = cp->cpu_next) != cpu_list);
2033 
2034 	xc_all((xcfunc_t *)trapstat_enable, 0, 0);
2035 
2036 	trapstat_probe_free();
2037 	tstat_running = 1;
2038 	mutex_exit(&tstat_lock);
2039 	mutex_exit(&cpu_lock);
2040 
2041 	return (0);
2042 }
2043 
2044 static int
2045 trapstat_stop()
2046 {
2047 	int i;
2048 
2049 	mutex_enter(&cpu_lock);
2050 	mutex_enter(&tstat_lock);
2051 	if (!tstat_running) {
2052 		mutex_exit(&tstat_lock);
2053 		mutex_exit(&cpu_lock);
2054 		return (ENXIO);
2055 	}
2056 
2057 	xc_all((xcfunc_t *)trapstat_disable, 0, 0);
2058 
2059 	for (i = 0; i <= max_cpuid; i++) {
2060 		if (tstat_percpu[i].tcpu_flags & TSTAT_CPU_ALLOCATED)
2061 			trapstat_teardown(i);
2062 	}
2063 
2064 #ifdef sun4v
2065 	tstat_traptab_initialized = 0;
2066 	if (tstat_options & TSTAT_OPT_TLBDATA)
2067 		(void) cpu_trapstat_conf(CPU_TSTATCONF_FINI);
2068 	for (i = 0; i < tstat_num4m_mapping; i++)
2069 		contig_mem_free(tstat_va[i], MMU_PAGESIZE4M);
2070 #endif
2071 	trapstat_hotpatch();
2072 	tstat_running = 0;
2073 	mutex_exit(&tstat_lock);
2074 	mutex_exit(&cpu_lock);
2075 
2076 	return (0);
2077 }
2078 
2079 /*
2080  * This is trapstat's DR CPU configuration callback.  It's called (with
2081  * cpu_lock held) to unconfigure a newly powered-off CPU, or to configure a
2082  * powered-off CPU that is to be brought into the system.  We need only take
2083  * action in the unconfigure case:  because a powered-off CPU will have its
2084  * trap table restored to KERNELBASE if it is ever powered back on, we must
2085  * update the flags to reflect that trapstat is no longer enabled on the
2086  * powered-off CPU.  Note that this means that a TSTAT_CPU_ENABLED CPU that
2087  * is unconfigured/powered off and later powered back on/reconfigured will
2088  * _not_ be re-TSTAT_CPU_ENABLED.
2089  */
2090 static int
2091 trapstat_cpu_setup(cpu_setup_t what, processorid_t cpu)
2092 {
2093 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
2094 
2095 	ASSERT(MUTEX_HELD(&cpu_lock));
2096 	mutex_enter(&tstat_lock);
2097 
2098 	if (!tstat_running) {
2099 		mutex_exit(&tstat_lock);
2100 		return (0);
2101 	}
2102 
2103 	switch (what) {
2104 	case CPU_CONFIG:
2105 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
2106 		break;
2107 
2108 	case CPU_UNCONFIG:
2109 		if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED) {
2110 			tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
2111 #ifdef	sun4v
2112 			/*
2113 			 * A power-off, causes the cpu mondo queues to be
2114 			 * unconfigured on sun4v. Since we can't teardown
2115 			 * trapstat's mappings on the cpu that is going away,
2116 			 * we simply mark it as not allocated. This will
2117 			 * prevent a teardown on a cpu with the same cpu id
2118 			 * that might have been added while trapstat is running.
2119 			 */
2120 			if (tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED) {
2121 				tcpu->tcpu_pfn = NULL;
2122 				tcpu->tcpu_instr = NULL;
2123 				tcpu->tcpu_data = NULL;
2124 				tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
2125 			}
2126 #endif
2127 		}
2128 		break;
2129 
2130 	default:
2131 		break;
2132 	}
2133 
2134 	mutex_exit(&tstat_lock);
2135 	return (0);
2136 }
2137 
2138 /*
2139  * This is called before a CPR suspend and after a CPR resume.  We don't have
2140  * anything to do before a suspend, but after a restart we must restore the
2141  * trap table to be our interposing trap table.  However, we don't actually
2142  * know whether or not the CPUs have been powered off -- this routine may be
2143  * called while restoring from a failed CPR suspend.  We thus run through each
2144  * TSTAT_CPU_ENABLED CPU, and explicitly destroy and reestablish its
2145  * interposing trap table.  This assures that our state is correct regardless
2146  * of whether or not the CPU has been newly powered on.
2147  */
2148 /*ARGSUSED*/
2149 static boolean_t
2150 trapstat_cpr(void *arg, int code)
2151 {
2152 	cpu_t *cp;
2153 
2154 	if (code == CB_CODE_CPR_CHKPT)
2155 		return (B_TRUE);
2156 
2157 	ASSERT(code == CB_CODE_CPR_RESUME);
2158 
2159 	mutex_enter(&cpu_lock);
2160 	mutex_enter(&tstat_lock);
2161 
2162 	if (!tstat_running) {
2163 		mutex_exit(&tstat_lock);
2164 		mutex_exit(&cpu_lock);
2165 		return (B_TRUE);
2166 	}
2167 
2168 	cp = cpu_list;
2169 	do {
2170 		tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id];
2171 
2172 		if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
2173 			continue;
2174 
2175 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
2176 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
2177 
2178 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_disable, 0, 0);
2179 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
2180 
2181 		/*
2182 		 * Preserve this CPU's data in tstat_buffer and rip down its
2183 		 * interposing trap table.
2184 		 */
2185 #ifdef sun4v
2186 		bcopy(tcpu->tcpu_data, tstat_buffer, TSTAT_DATA_SIZE);
2187 #else
2188 		bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
2189 #endif /* sun4v */
2190 		trapstat_teardown(cp->cpu_id);
2191 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
2192 
2193 		/*
2194 		 * Reestablish the interposing trap table and restore the old
2195 		 * data.
2196 		 */
2197 		trapstat_setup(cp->cpu_id);
2198 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
2199 #ifdef sun4v
2200 		bcopy(tstat_buffer, tcpu->tcpu_data, TSTAT_DATA_SIZE);
2201 #else
2202 		bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
2203 #endif /* sun4v */
2204 
2205 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_enable, 0, 0);
2206 	} while ((cp = cp->cpu_next) != cpu_list);
2207 
2208 	mutex_exit(&tstat_lock);
2209 	mutex_exit(&cpu_lock);
2210 
2211 	return (B_TRUE);
2212 }
2213 
2214 /*ARGSUSED*/
2215 static int
2216 trapstat_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
2217 {
2218 	int i;
2219 
2220 	mutex_enter(&cpu_lock);
2221 	mutex_enter(&tstat_lock);
2222 	if (tstat_open != 0) {
2223 		mutex_exit(&tstat_lock);
2224 		mutex_exit(&cpu_lock);
2225 		return (EBUSY);
2226 	}
2227 
2228 	/*
2229 	 * Register this in open() rather than in attach() to prevent deadlock
2230 	 * with DR code. During attach, I/O device tree locks are grabbed
2231 	 * before trapstat_attach() is invoked - registering in attach
2232 	 * will result in the lock order: device tree lock, cpu_lock.
2233 	 * DR code however requires that cpu_lock be acquired before
2234 	 * device tree locks.
2235 	 */
2236 	ASSERT(!tstat_running);
2237 	register_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
2238 
2239 	/*
2240 	 * Clear all options.  And until specific CPUs are specified, we'll
2241 	 * mark all CPUs as selected.
2242 	 */
2243 	tstat_options = 0;
2244 
2245 	for (i = 0; i <= max_cpuid; i++)
2246 		tstat_percpu[i].tcpu_flags |= TSTAT_CPU_SELECTED;
2247 
2248 	/*
2249 	 * By default, all traps at TL=0 are enabled.  Traps at TL>0 must
2250 	 * be disabled.
2251 	 */
2252 	for (i = 0; i < TSTAT_TOTAL_NENT; i++)
2253 		tstat_enabled[i] = i < TSTAT_NENT ? 1 : 0;
2254 
2255 	tstat_open = 1;
2256 	mutex_exit(&tstat_lock);
2257 	mutex_exit(&cpu_lock);
2258 
2259 	return (0);
2260 }
2261 
2262 /*ARGSUSED*/
2263 static int
2264 trapstat_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
2265 {
2266 	(void) trapstat_stop();
2267 
2268 	ASSERT(!tstat_running);
2269 
2270 	mutex_enter(&cpu_lock);
2271 	unregister_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
2272 	mutex_exit(&cpu_lock);
2273 
2274 	tstat_open = 0;
2275 	return (DDI_SUCCESS);
2276 }
2277 
2278 static int
2279 trapstat_option(int option)
2280 {
2281 	mutex_enter(&tstat_lock);
2282 
2283 	if (tstat_running) {
2284 		mutex_exit(&tstat_lock);
2285 		return (EBUSY);
2286 	}
2287 
2288 	tstat_options |= option;
2289 	mutex_exit(&tstat_lock);
2290 
2291 	return (0);
2292 }
2293 
2294 /*ARGSUSED*/
2295 static int
2296 trapstat_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *crd, int *rval)
2297 {
2298 	int i, j, out;
2299 	size_t dsize;
2300 
2301 	switch (cmd) {
2302 	case TSTATIOC_GO:
2303 		return (trapstat_go());
2304 
2305 	case TSTATIOC_NOGO:
2306 		return (trapstat_option(TSTAT_OPT_NOGO));
2307 
2308 	case TSTATIOC_STOP:
2309 		return (trapstat_stop());
2310 
2311 	case TSTATIOC_CPU:
2312 		if (arg < 0 || arg > max_cpuid)
2313 			return (EINVAL);
2314 		/*FALLTHROUGH*/
2315 
2316 	case TSTATIOC_NOCPU:
2317 		mutex_enter(&tstat_lock);
2318 
2319 		if (tstat_running) {
2320 			mutex_exit(&tstat_lock);
2321 			return (EBUSY);
2322 		}
2323 
2324 		/*
2325 		 * If this is the first CPU to be specified (or if we are
2326 		 * being asked to explicitly de-select CPUs), disable all CPUs.
2327 		 */
2328 		if (!(tstat_options & TSTAT_OPT_CPU) || cmd == TSTATIOC_NOCPU) {
2329 			tstat_options |= TSTAT_OPT_CPU;
2330 
2331 			for (i = 0; i <= max_cpuid; i++) {
2332 				tstat_percpu_t *tcpu = &tstat_percpu[i];
2333 
2334 				ASSERT(cmd == TSTATIOC_NOCPU ||
2335 				    (tcpu->tcpu_flags & TSTAT_CPU_SELECTED));
2336 				tcpu->tcpu_flags &= ~TSTAT_CPU_SELECTED;
2337 			}
2338 		}
2339 
2340 		if (cmd == TSTATIOC_CPU)
2341 			tstat_percpu[arg].tcpu_flags |= TSTAT_CPU_SELECTED;
2342 
2343 		mutex_exit(&tstat_lock);
2344 
2345 		return (0);
2346 
2347 	case TSTATIOC_ENTRY:
2348 		mutex_enter(&tstat_lock);
2349 
2350 		if (tstat_running) {
2351 			mutex_exit(&tstat_lock);
2352 			return (EBUSY);
2353 		}
2354 
2355 		if (arg >= TSTAT_NENT || arg < 0) {
2356 			mutex_exit(&tstat_lock);
2357 			return (EINVAL);
2358 		}
2359 
2360 		if (!(tstat_options & TSTAT_OPT_ENTRY)) {
2361 			/*
2362 			 * If this is the first entry that we are explicitly
2363 			 * enabling, explicitly disable every TL=0 entry.
2364 			 */
2365 			for (i = 0; i < TSTAT_NENT; i++)
2366 				tstat_enabled[i] = 0;
2367 
2368 			tstat_options |= TSTAT_OPT_ENTRY;
2369 		}
2370 
2371 		tstat_enabled[arg] = 1;
2372 		mutex_exit(&tstat_lock);
2373 		return (0);
2374 
2375 	case TSTATIOC_NOENTRY:
2376 		mutex_enter(&tstat_lock);
2377 
2378 		if (tstat_running) {
2379 			mutex_exit(&tstat_lock);
2380 			return (EBUSY);
2381 		}
2382 
2383 		for (i = 0; i < TSTAT_NENT; i++)
2384 			tstat_enabled[i] = 0;
2385 
2386 		mutex_exit(&tstat_lock);
2387 		return (0);
2388 
2389 	case TSTATIOC_READ:
2390 		mutex_enter(&tstat_lock);
2391 
2392 		if (tstat_options & TSTAT_OPT_TLBDATA) {
2393 			dsize = tstat_data_t_exported_size;
2394 		} else {
2395 			dsize = sizeof (tstat_data_t);
2396 		}
2397 
2398 		for (i = 0, out = 0; i <= max_cpuid; i++) {
2399 			tstat_percpu_t *tcpu = &tstat_percpu[i];
2400 
2401 			if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
2402 				continue;
2403 
2404 			ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
2405 			ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
2406 
2407 			tstat_buffer->tdata_cpuid = -1;
2408 			xc_one(i, (xcfunc_t *)trapstat_snapshot, 0, 0);
2409 
2410 			if (tstat_buffer->tdata_cpuid == -1) {
2411 				/*
2412 				 * This CPU is not currently responding to
2413 				 * cross calls; we have caught it while it is
2414 				 * being unconfigured.  We'll drop tstat_lock
2415 				 * and pick up and drop cpu_lock.  By the
2416 				 * time we acquire cpu_lock, the DR operation
2417 				 * will appear consistent and we can assert
2418 				 * that trapstat_cpu_setup() has cleared
2419 				 * TSTAT_CPU_ENABLED.
2420 				 */
2421 				mutex_exit(&tstat_lock);
2422 				mutex_enter(&cpu_lock);
2423 				mutex_exit(&cpu_lock);
2424 				mutex_enter(&tstat_lock);
2425 				ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
2426 				continue;
2427 			}
2428 
2429 			/*
2430 			 * Need to compensate for the difference between page
2431 			 * sizes exported to users and page sizes available
2432 			 * within the kernel.
2433 			 */
2434 			if ((tstat_options & TSTAT_OPT_TLBDATA) &&
2435 			    (tstat_pgszs != tstat_user_pgszs)) {
2436 				tstat_pgszdata_t *tp;
2437 				uint_t szc;
2438 
2439 				tp = &tstat_buffer->tdata_pgsz[0];
2440 				for (j = 0; j < tstat_user_pgszs; j++) {
2441 					if ((szc = USERSZC_2_SZC(j)) != j) {
2442 						bcopy(&tp[szc], &tp[j],
2443 						    sizeof (tstat_pgszdata_t));
2444 					}
2445 				}
2446 			}
2447 
2448 			if (copyout(tstat_buffer, (void *)arg, dsize) != 0) {
2449 				mutex_exit(&tstat_lock);
2450 				return (EFAULT);
2451 			}
2452 
2453 			out++;
2454 			arg += dsize;
2455 		}
2456 
2457 		if (out != max_cpuid + 1) {
2458 			processorid_t cpuid = -1;
2459 			arg += offsetof(tstat_data_t, tdata_cpuid);
2460 
2461 			if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0) {
2462 				mutex_exit(&tstat_lock);
2463 				return (EFAULT);
2464 			}
2465 		}
2466 
2467 		mutex_exit(&tstat_lock);
2468 
2469 		return (0);
2470 
2471 	case TSTATIOC_TLBDATA:
2472 		return (trapstat_option(TSTAT_OPT_TLBDATA));
2473 
2474 	default:
2475 		break;
2476 	}
2477 
2478 	return (ENOTTY);
2479 }
2480 
2481 /*ARGSUSED*/
2482 static int
2483 trapstat_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
2484 {
2485 	int error;
2486 
2487 	switch (infocmd) {
2488 	case DDI_INFO_DEVT2DEVINFO:
2489 		*result = (void *)tstat_devi;
2490 		error = DDI_SUCCESS;
2491 		break;
2492 	case DDI_INFO_DEVT2INSTANCE:
2493 		*result = (void *)0;
2494 		error = DDI_SUCCESS;
2495 		break;
2496 	default:
2497 		error = DDI_FAILURE;
2498 	}
2499 	return (error);
2500 }
2501 
2502 static int
2503 trapstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
2504 {
2505 	switch (cmd) {
2506 	case DDI_ATTACH:
2507 		break;
2508 
2509 	case DDI_RESUME:
2510 		return (DDI_SUCCESS);
2511 
2512 	default:
2513 		return (DDI_FAILURE);
2514 	}
2515 
2516 	if (ddi_create_minor_node(devi, "trapstat", S_IFCHR,
2517 	    0, DDI_PSEUDO, 0) == DDI_FAILURE) {
2518 		ddi_remove_minor_node(devi, NULL);
2519 		return (DDI_FAILURE);
2520 	}
2521 
2522 	ddi_report_dev(devi);
2523 	tstat_devi = devi;
2524 
2525 	tstat_pgszs = page_num_pagesizes();
2526 	tstat_user_pgszs = page_num_user_pagesizes(0);
2527 	tstat_data_t_size = sizeof (tstat_data_t) +
2528 	    (tstat_pgszs - 1) * sizeof (tstat_pgszdata_t);
2529 	tstat_data_t_exported_size = sizeof (tstat_data_t) +
2530 	    (tstat_user_pgszs - 1) * sizeof (tstat_pgszdata_t);
2531 #ifndef sun4v
2532 	tstat_data_pages = (tstat_data_t_size >> MMU_PAGESHIFT) + 1;
2533 	tstat_total_pages = TSTAT_INSTR_PAGES + tstat_data_pages;
2534 	tstat_data_size = tstat_data_pages * MMU_PAGESIZE;
2535 	tstat_total_size = TSTAT_INSTR_SIZE + tstat_data_size;
2536 #else
2537 	/*
2538 	 * For sun4v, the tstat_data_t_size reflect the tstat_buffer
2539 	 * output size based on tstat_data_t structure. For tlbstats
2540 	 * collection, we use the internal tstat_tdata_t structure
2541 	 * to collect the tlbstats for the pages. Therefore we
2542 	 * need to adjust the size for the assertion.
2543 	 */
2544 	ASSERT((tstat_data_t_size - sizeof (tstat_data_t) +
2545 	    sizeof (tstat_tdata_t)) <= TSTAT_DATA_SIZE);
2546 #endif
2547 
2548 	tstat_percpu = kmem_zalloc((max_cpuid + 1) *
2549 	    sizeof (tstat_percpu_t), KM_SLEEP);
2550 
2551 	/*
2552 	 * Create our own arena backed by segkmem to assure a source of
2553 	 * MMU_PAGESIZE-aligned allocations.  We allocate out of the
2554 	 * heap32_arena to assure that we can address the allocated memory with
2555 	 * a single sethi/simm13 pair in the interposing trap table entries.
2556 	 */
2557 	tstat_arena = vmem_create("trapstat", NULL, 0, MMU_PAGESIZE,
2558 	    segkmem_alloc_permanent, segkmem_free, heap32_arena, 0, VM_SLEEP);
2559 
2560 	tstat_enabled = kmem_alloc(TSTAT_TOTAL_NENT * sizeof (int), KM_SLEEP);
2561 	tstat_buffer = kmem_alloc(tstat_data_t_size, KM_SLEEP);
2562 
2563 	/*
2564 	 * CB_CL_CPR_POST_USER is the class that executes from cpr_resume()
2565 	 * after user threads can be restarted.  By executing in this class,
2566 	 * we are assured of the availability of system services needed to
2567 	 * resume trapstat (specifically, we are assured that all CPUs are
2568 	 * restarted and responding to cross calls).
2569 	 */
2570 	tstat_cprcb =
2571 	    callb_add(trapstat_cpr, NULL, CB_CL_CPR_POST_USER, "trapstat");
2572 
2573 	return (DDI_SUCCESS);
2574 }
2575 
2576 static int
2577 trapstat_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
2578 {
2579 	int rval;
2580 
2581 	ASSERT(devi == tstat_devi);
2582 
2583 	switch (cmd) {
2584 	case DDI_DETACH:
2585 		break;
2586 
2587 	case DDI_SUSPEND:
2588 		return (DDI_SUCCESS);
2589 
2590 	default:
2591 		return (DDI_FAILURE);
2592 	}
2593 
2594 	ASSERT(!tstat_running);
2595 
2596 	rval = callb_delete(tstat_cprcb);
2597 	ASSERT(rval == 0);
2598 
2599 	kmem_free(tstat_buffer, tstat_data_t_size);
2600 	kmem_free(tstat_enabled, TSTAT_TOTAL_NENT * sizeof (int));
2601 	vmem_destroy(tstat_arena);
2602 	kmem_free(tstat_percpu, (max_cpuid + 1) * sizeof (tstat_percpu_t));
2603 	ddi_remove_minor_node(devi, NULL);
2604 
2605 	return (DDI_SUCCESS);
2606 }
2607 
2608 /*
2609  * Configuration data structures
2610  */
2611 static struct cb_ops trapstat_cb_ops = {
2612 	trapstat_open,		/* open */
2613 	trapstat_close,		/* close */
2614 	nulldev,		/* strategy */
2615 	nulldev,		/* print */
2616 	nodev,			/* dump */
2617 	nodev,			/* read */
2618 	nodev,			/* write */
2619 	trapstat_ioctl,		/* ioctl */
2620 	nodev,			/* devmap */
2621 	nodev,			/* mmap */
2622 	nodev,			/* segmap */
2623 	nochpoll,		/* poll */
2624 	ddi_prop_op,		/* cb_prop_op */
2625 	0,			/* streamtab */
2626 	D_MP | D_NEW		/* Driver compatibility flag */
2627 };
2628 
2629 static struct dev_ops trapstat_ops = {
2630 	DEVO_REV,		/* devo_rev, */
2631 	0,			/* refcnt */
2632 	trapstat_info,		/* getinfo */
2633 	nulldev,		/* identify */
2634 	nulldev,		/* probe */
2635 	trapstat_attach,	/* attach */
2636 	trapstat_detach,	/* detach */
2637 	nulldev,		/* reset */
2638 	&trapstat_cb_ops,	/* cb_ops */
2639 	(struct bus_ops *)0,	/* bus_ops */
2640 	NULL,			/* power */
2641 	ddi_quiesce_not_needed,		/* quiesce */
2642 };
2643 
2644 static struct modldrv modldrv = {
2645 	&mod_driverops,		/* Type of module.  This one is a driver */
2646 	"Trap Statistics 1.1",	/* name of module */
2647 	&trapstat_ops,		/* driver ops */
2648 };
2649 
2650 static struct modlinkage modlinkage = {
2651 	MODREV_1, (void *)&modldrv, NULL
2652 };
2653 
2654 int
2655 _init(void)
2656 {
2657 	return (mod_install(&modlinkage));
2658 }
2659 
2660 int
2661 _fini(void)
2662 {
2663 	return (mod_remove(&modlinkage));
2664 }
2665 
2666 int
2667 _info(struct modinfo *modinfop)
2668 {
2669 	return (mod_info(&modlinkage, modinfop));
2670 }
2671