xref: /titanic_52/usr/src/uts/sun4/io/trapstat.c (revision 6d22b73346a02763769401e9f28b596670cc3d16)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/systm.h>
30 #include <sys/conf.h>
31 #include <sys/stat.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/cpu_module.h>
36 #include <vm/hat_sfmmu.h>
37 #include <vm/seg_kmem.h>
38 #include <vm/seg_kpm.h>
39 #include <vm/vm_dep.h>
40 #include <sys/machsystm.h>
41 #include <sys/machasi.h>
42 #include <sys/sysmacros.h>
43 #include <sys/callb.h>
44 #include <sys/archsystm.h>
45 #include <sys/trapstat.h>
46 #ifdef sun4v
47 #include <sys/hypervisor_api.h>
48 #endif
49 
50 /* BEGIN CSTYLED */
51 /*
52  * trapstat:  Trap Statistics through Dynamic Trap Table Interposition
53  * -------------------------------------------------------------------
54  *
55  * Motivation and Overview
56  *
57  * Despite being a fundamental indicator of system behavior, there has
58  * historically been very little insight provided into the frequency and cost
59  * of machine-specific traps.  The lack of insight has been especially acute
60  * on UltraSPARC microprocessors:  because these microprocessors handle TLB
61  * misses as software traps, the frequency and duration of traps play a
62  * decisive role in the performance of the memory system.  As applications have
63  * increasingly outstripped TLB reach, this has become increasingly true.
64  *
65  * Part of the difficulty of observing trap behavior is that the trap handlers
66  * are so frequently called (e.g. millions of times per second) that any
67  * permanently enabled instrumentation would induce an unacceptable performance
68  * degradation.  Thus, it is a constraint on any trap observability
69  * infrastructure that it have no probe effect when not explicitly enabled.
70  *
71  * The basic idea, then, is to create an interposing trap table in which each
72  * entry increments a per-trap, in-memory counter and then jumps to the actual,
73  * underlying trap table entry.  To enable trapstat, we atomically write to the
74  * trap base address (%tba) register to point to our interposing trap table.
75  * (Note that per-CPU statistics fall out by creating a different trap table
76  * for each CPU.)
77  *
78  * Implementation Details
79  *
80  * While the idea is straight-forward, a nuance of SPARC V9 slightly
81  * complicates the implementation.  Unlike its predecessors, SPARC V9 supports
82  * the notion of nested traps.  The trap level is kept in the TL register:
83  * during normal operation it is 0; when a trap is taken, the TL register is
84  * incremented by 1.  To aid system software, SPARC V9 breaks the trap table
85  * into two halves:  the lower half contains the trap handlers for traps taken
86  * when TL is 0; the upper half contains the trap handlers for traps taken
87  * when TL is greater than 0.  Each half is further subdivided into two
88  * subsequent halves:  the lower half contains the trap handlers for traps
89  * other than those induced by the trap instruction (Tcc variants); the upper
90  * half contains the trap handlers for traps induced by the trap instruction.
91  * This gives a total of four ranges, with each range containing 256 traps:
92  *
93  *       +--------------------------------+- 3ff
94  *       |                                |   .
95  *       |     Trap instruction, TL>0     |   .
96  *       |                                |   .
97  *       |- - - - - - - - - - - - - - - - +- 300
98  *       |- - - - - - - - - - - - - - - - +- 2ff
99  *       |                                |   .
100  *       |   Non-trap instruction, TL>0   |   .
101  *       |                                |   .
102  *       |- - - - - - - - - - - - - - - - +- 200
103  *       |- - - - - - - - - - - - - - - - +- 1ff
104  *       |                                |   .
105  *       |     Trap instruction, TL=0     |   .
106  *       |                                |   .
107  *       |- - - - - - - - - - - - - - - - +- 100
108  *       |- - - - - - - - - - - - - - - - +- 0ff
109  *       |                                |   .
110  *       |   Non-trap instruction, TL=0   |   .
111  *       |                                |   .
112  *       +--------------------------------+- 000
113  *
114  *
115  * Solaris, however, doesn't have reason to support trap instructions when
116  * TL>0 (only privileged code may execute at TL>0; not supporting this only
117  * constrains our own implementation).  The trap table actually looks like:
118  *
119  *       +--------------------------------+- 2ff
120  *       |                                |   .
121  *       |   Non-trap instruction, TL>0   |   .
122  *       |                                |   .
123  *       |- - - - - - - - - - - - - - - - +- 200
124  *       |- - - - - - - - - - - - - - - - +- 1ff
125  *       |                                |   .
126  *       |     Trap instruction, TL=0     |   .
127  *       |                                |   .
128  *       |- - - - - - - - - - - - - - - - +- 100
129  *       |- - - - - - - - - - - - - - - - +- 0ff
130  *       |                                |   .
131  *       |   Non-trap instruction, TL=0   |   .
132  *       |                                |   .
133  *       +--------------------------------+- 000
134  *
135  * Putatively to aid system software, SPARC V9 has the notion of multiple
136  * sets of global registers.  UltraSPARC defines four sets of global
137  * registers:
138  *
139  *    Normal Globals
140  *    Alternate Globals (AGs)
141  *    MMU Globals (MGs)
142  *    Interrupt Globals (IGs)
143  *
144  * The set of globals in use is controlled by bits in PSTATE; when TL is 0
145  * (and PSTATE has not been otherwise explicitly modified), the Normal Globals
146  * are in use.  When a trap is issued, PSTATE is modified to point to a set of
147  * globals corresponding to the trap type.  Most traps correspond to the
148  * Alternate Globals, with a minority corresponding to the MMU Globals, and
149  * only the interrupt-vector trap (vector 0x60) corresponding to the Interrupt
150  * Globals.  (The complete mapping can be found in the UltraSPARC I&II User's
151  * Manual.)
152  *
153  * Note that the sets of globals are per trap _type_, not per trap _level_.
154  * Thus, when executing a TL>0 trap handler, one may not have registers
155  * available (for example, both trap-instruction traps and spill traps execute
156  * on the alternate globals; if a trap-instruction trap induces a window spill,
157  * the window spill handler has no available globals).  For trapstat, this is
158  * problematic:  a register is required to transfer control from one arbitrary
159  * location (in the interposing trap table) to another (in the actual trap
160  * table).
161  *
162  * We solve this problem by exploiting the trap table's location at the bottom
163  * of valid kernel memory (i.e. at KERNELBASE).  We locate the interposing trap
164  * tables just below KERNELBASE -- thereby allowing us to use a branch-always
165  * instruction (ba) instead of a jump instruction (jmp) to transfer control
166  * from the TL>0 entries in the interposing trap table to the TL>0 entries in
167  * the actual trap table.  (N.B. while this allows trap table interposition to
168  * work, it necessarily limits trapstat to only recording information about
169  * TL=0 traps -- there is no way to increment a counter without using a
170  * register.)  Diagrammatically:
171  *
172  *  Actual trap table:
173  *
174  *       +--------------------------------+- 2ff
175  *       |                                |   .
176  *       |   Non-trap instruction, TL>0   |   .   <-----------------------+
177  *       |                                |   .   <-----------------------|-+
178  *       |- - - - - - - - - - - - - - - - +- 200  <-----------------------|-|-+
179  *       |- - - - - - - - - - - - - - - - +- 1ff                          | | |
180  *       |                                |   .                           | | |
181  *       |     Trap instruction, TL=0     |   .   <-----------------+     | | |
182  *       |                                |   .   <-----------------|-+   | | |
183  *       |- - - - - - - - - - - - - - - - +- 100  <-----------------|-|-+ | | |
184  *       |- - - - - - - - - - - - - - - - +- 0ff                    | | | | | |
185  *       |                                |   .                     | | | | | |
186  *       |   Non-trap instruction, TL=0   |   .   <-----------+     | | | | | |
187  *       |                                |   .   <-----------|-+   | | | | | |
188  *       +--------------------------------+- 000  <-----------|-|-+ | | | | | |
189  *        KERNELBASE                                          | | | | | | | | |
190  *                                                            | | | | | | | | |
191  *                                                            | | | | | | | | |
192  *  Interposing trap table:                                   | | | | | | | | |
193  *                                                            | | | | | | | | |
194  *       +--------------------------------+- 2ff              | | | | | | | | |
195  *       |  ...                           |   .               | | | | | | | | |
196  *       |  ...                           |   .               | | | | | | | | |
197  *       |  ...                           |   .               | | | | | | | | |
198  *       |- - - - - - - - - - - - - - - - +- 203              | | | | | | | | |
199  *       |  ba,a                          |      -------------|-|-|-|-|-|-+ | |
200  *       |- - - - - - - - - - - - - - - - +- 202              | | | | | |   | |
201  *       |  ba,a                          |      -------------|-|-|-|-|-|---+ |
202  *       |- - - - - - - - - - - - - - - - +- 201              | | | | | |     |
203  *       |  ba,a                          |      -------------|-|-|-|-|-|-----+
204  *       |- - - - - - - - - - - - - - - - +- 200              | | | | | |
205  *       |  ...                           |   .               | | | | | |
206  *       |  ...                           |   .               | | | | | |
207  *       |  ...                           |   .               | | | | | |
208  *       |- - - - - - - - - - - - - - - - +- 103              | | | | | |
209  *       |  (Increment counter)           |                   | | | | | |
210  *       |  ba,a                          |      -------------------+ | |
211  *       |- - - - - - - - - - - - - - - - +- 102              | | |   | |
212  *       |  (Increment counter)           |                   | | |   | |
213  *       |  ba,a                          |      ---------------------+ |
214  *       |- - - - - - - - - - - - - - - - +- 101              | | |     |
215  *       |  (Increment counter)           |                   | | |     |
216  *       |  ba,a                          |      -----------------------+
217  *       |- - - - - - - - - - - - - - - - +- 100              | | |
218  *       |  ...                           |   .               | | |
219  *       |  ...                           |   .               | | |
220  *       |  ...                           |   .               | | |
221  *       |- - - - - - - - - - - - - - - - +- 003              | | |
222  *       |  (Increment counter)           |                   | | |
223  *       |  ba,a                          |      -------------+ | |
224  *       |- - - - - - - - - - - - - - - - +- 002                | |
225  *       |  (Increment counter)           |                     | |
226  *       |  ba,a                          |      ---------------+ |
227  *       |- - - - - - - - - - - - - - - - +- 001                  |
228  *       |  (Increment counter)           |                       |
229  *       |  ba,a                          |      -----------------+
230  *       +--------------------------------+- 000
231  *        KERNELBASE - tstat_total_size
232  *
233  * tstat_total_size is the number of bytes required for each trap table.  It
234  * must be true that KERNELBASE - tstat_total_size is less than the maximum
235  * branch displacement; if each CPU were to consume a disjoint virtual range
236  * below KERNELBASE for its trap table, we could support at most
237  * (maximum_branch_displacement / tstat_total_size) CPUs.  The maximum branch
238  * displacement for Bicc variants is just under eight megabytes, and (because
239  * the %tba must be 32K aligned), tstat_total_size must be at least 32K; if
240  * each CPU were to consume a disjoint virtual range, we would have an
241  * unacceptably low upper bound of 256 CPUs.
242  *
243  * While there are tricks that one could use to address this constraint (e.g.,
244  * creating trampolines every maximum_branch_displacement bytes), we instead
245  * solve this by not permitting each CPU to consume a disjoint virtual range.
246  * Rather, we have each CPU's interposing trap table use the _same_ virtual
247  * range, but we back the trap tables with disjoint physical memory.  Normally,
248  * such one-to-many virtual-to-physical mappings are illegal; this is
249  * permissible here only because the pages for the interposing trap table are
250  * necessarily locked in the TLB.  (The CPUs thus never have the opportunity to
251  * discover that they have conflicting translations.)
252  *
253  * On CMT architectures in which CPUs can share MMUs, the above trick will not
254  * work: two CPUs that share an MMU cannot have the same virtual address map
255  * to disjoint physical pages.  On these architectures, any CPUs sharing the
256  * same MMU must consume a disjoint 32K virtual address range -- limiting the
257  * number of CPUs sharing an MMU on these architectures to 256 due to the
258  * branch displacement limitation described above.  On the sun4v architecture,
259  * there is a further limitation: a guest may not have more than eight locked
260  * TLB entries per MMU.  To allow operation under this restriction, the
261  * interposing trap table and the trap statistics are each accessed through
262  * a single 4M TLB entry.  This limits the footprint to two locked entries
263  * (one for the I-TLB and one for the D-TLB), but further restricts the number
264  * of CPUs to 128 per MMU.  However, support for more than 128 CPUs can easily
265  * be added via a hybrid scheme, where the same 4M virtual address is used
266  * on different MMUs.
267  *
268  *
269  * TLB Statistics
270  *
271  * Because TLB misses are an important component of system performance, we wish
272  * to know much more about these traps than simply the number received.
273  * Specifically, we wish to know:
274  *
275  *  (a)	The amount of time spent executing the TLB miss handler
276  *  (b)	TLB misses versus TSB misses
277  *  (c) Kernel-level misses versus user-level misses
278  *  (d) Misses per pagesize
279  *
280  * TLB Statistics: Time Spent Executing
281  *
282  * To accurately determine the amount of time spent executing the TLB miss
283  * handler, one must get a timestamp on trap entry and trap exit, subtract the
284  * former from the latter, and add the result to an accumulating count.
285  * Consider the flow of control during normal TLB miss processing (where "ldx
286  * [%g2], %g2" is an arbitrary TLB-missing instruction):
287  *
288  * + - - - - - - - -+
289  * :                :
290  * : ldx [%g2], %g2 :<-------------------------------------------------------+
291  * :                :              Return from trap:                         |
292  * + - - - - - - - -+                TL <- TL - 1 (0)                        |
293  *	  |                          %pc <- TSTATE[TL].TPC (address of load) |
294  *	  | TLB miss:                                                        |
295  *        |   TL <- TL + 1 (1)                                               |
296  *        |   %pc <- TLB-miss-trap-handler                                   |
297  *        |                                                                  |
298  *        v                                                                  |
299  * + - - - - - - - - - - - - - - - +                                         |
300  * :                               :                                         |
301  * : Lookup VA in TSB              :                                         |
302  * : If (hit)                      :                                         |
303  * :     Fill TLB                  :                                         |
304  * : Else                          :                                         |
305  * :     Lookup VA (hme hash table :                                         |
306  * :                or segkpm)     :                                         |
307  * :     Fill TLB                  :                                         |
308  * : Endif                         :                                         |
309  * : Issue "retry"  ---------------------------------------------------------+
310  * :                               :
311  * + - - - - - - - - - - - - - - - +
312  *  TLB-miss-trap-handler
313  *
314  *
315  * As the above diagram indicates, interposing on the trap table allows one
316  * only to determine a timestamp on trap _entry_:  when the TLB miss handler
317  * has completed filling the TLB, a "retry" will be issued, and control will
318  * transfer immediately back to the missing %pc.
319  *
320  * To obtain a timestamp on trap exit, we must then somehow interpose between
321  * the "retry" and the subsequent control transfer to the TLB-missing
322  * instruction.  To do this, we _push_ a trap level.  The basic idea is to
323  * spoof a TLB miss by raising TL, setting the %tpc to be within text
324  * controlled by trapstat (the "TLB return entry") and branching to the
325  * underlying TLB miss handler.  When the TLB miss handler issues its "retry",
326  * control will transfer not to the TLB-missing instruction, but rather to the
327  * TLB return entry.  This code can then obtain a timestamp, and issue its own
328  * "retry" -- thereby correctly returning to the TLB-missing instruction.
329  * Here is the above TLB miss flow control diagram modified to reflect
330  * trapstat's operation:
331  *
332  * + - - - - - - - -+
333  * :                :
334  * : ldx [%g2], %g2 :<-------------------------------------------------------+
335  * :                :             Return from trap:                          |
336  * + - - - - - - - -+               TL <- TL - 1 (0)                         |
337  *	  |                         %pc <- TSTATE[TL].TPC (address of load)  |
338  *	  | TLB miss:                                                        |
339  *        |   TL <- TL + 1 (1)                                               |
340  *        |   %pc <- TLB-miss-trap-handler (trapstat)                        |
341  *        |                                                                  |
342  *        v                                    TLB-return-entry (trapstat)   |
343  * + - - - - - - - - - - - - - - - - - - +    + - - - - - - - - - - - - - +  |
344  * :                                     :    :                           :  |
345  * : Record timestamp                    :    : Record timestamp          :  |
346  * : TL <- 2                             :    : Take timestamp difference :  |
347  * : TSTATE[1].TPC <- TLB-return-entry   :    : Add to running total      :  |
348  * : ba,a TLB-miss-trap-handler -----------+  : Issue "retry"  --------------+
349  * :                                     : |  :                           :
350  * + - - - - - - - - - - - - - - - - - - + |  + - - - - - - - - - - - - - +
351  *  TLB-miss-trap-handler	           |                  ^
352  *  (trapstat)                             |                  |
353  *                                         |                  |
354  *                                         |                  |
355  *                 +-----------------------+                  |
356  *                 |                                          |
357  *                 |                                          |
358  *                 v                                          |
359  * + - - - - - - - - - - - - - - - +                          |
360  * :                               :                          |
361  * : Lookup VA in TSB              :                          |
362  * : If (hit)                      :                          |
363  * :     Fill TLB                  :                          |
364  * : Else                          :                          |
365  * :     Lookup VA (hme hash table :                          |
366  * :                or segkpm)     :                          |
367  * :     Fill TLB                  :                          |
368  * : Endif                         :                          |
369  * : Issue "retry"  ------------------------------------------+
370  * :                               : Return from trap:
371  * + - - - - - - - - - - - - - - - +   TL <- TL - 1 (1)
372  *  TLB-miss-trap-handler              %pc <- TSTATE[TL].TPC (TLB-return-entry)
373  *
374  *
375  * A final subterfuge is required to complete our artifice:  if we miss in
376  * the TLB, the TSB _and_ the subsequent hash or segkpm lookup (that is, if
377  * there is no valid translation for the TLB-missing address), common system
378  * software will need to accurately determine the %tpc as part of its page
379  * fault handling. We therefore modify the kernel to check the %tpc in this
380  * case: if the %tpc falls within the VA range controlled by trapstat and
381  * the TL is 2, TL is simply lowered back to 1 (this check is implemented
382  * by the TSTAT_CHECK_TL1 macro).  Lowering TL to 1 has the effect of
383  * discarding the state pushed by trapstat.
384  *
385  * TLB Statistics: TLB Misses versus TSB Misses
386  *
387  * Distinguishing TLB misses from TSB misses requires further interposition
388  * on the TLB miss handler:  we cannot know a priori or a posteriori if a
389  * given VA will or has hit in the TSB.
390  *
391  * We achieve this distinction by adding a second TLB return entry almost
392  * identical to the first -- differing only in the address to which it
393  * stores its results.  We then modify the TLB miss handlers of the kernel
394  * such that they check the %tpc when they determine that a TLB miss has
395  * subsequently missed in the TSB:  if the %tpc lies within trapstat's VA
396  * range and TL is 2 (that is, if trapstat is running), the TLB miss handler
397  * _increments_ the %tpc by the size of the TLB return entry.  The ensuing
398  * "retry" will thus transfer control to the second TLB return entry, and
399  * the time spent in the handler will be accumulated in a memory location
400  * specific to TSB misses.
401  *
402  * N.B.:  To minimize the amount of knowledge the kernel must have of trapstat,
403  * we do not allow the kernel to hard-code the size of the TLB return entry.
404  * Rather, the actual tsbmiss handler executes a known instruction at the
405  * corresponding tsbmiss patch points (see the tstat_tsbmiss_patch_table) with
406  * the %tpc in %g7:  when trapstat is not running, these points contain the
407  * harmless TSTAT_TSBMISS_INSTR instruction ("add %g7, 0, %g7"). Before
408  * running, trapstat modifies the instructions at these patch points such
409  * that the simm13 equals the size of the TLB return entry.
410  *
411  * TLB Statistics: Kernel-level Misses versus User-level Misses
412  *
413  * Differentiating user-level misses from kernel-level misses employs a
414  * similar technique, but is simplified by the ability to distinguish a
415  * user-level miss from a kernel-level miss a priori by reading the context
416  * register:  we implement kernel-/user-level differentiation by again doubling
417  * the number of TLB return entries, and setting the %tpc to the appropriate
418  * TLB return entry in trapstat's TLB miss handler.  Together with the doubling
419  * of entries required for TLB-miss/TSB-miss differentiation, this yields a
420  * total of four TLB return entries:
421  *
422  *	Level		TSB hit?	Structure member
423  *	------------------------------------------------------------
424  *	Kernel		Yes		tstat_tlbret_t.ttlbr_ktlb
425  *	Kernel		No		tstat_tlbret_t.ttlbr_ktsb
426  *	User		Yes		tstat_tlbret_t.ttlbr_utlb
427  *	User		No		tstat_tlbret_t.ttlbr_utsb
428  *
429  * TLB Statistics: Misses per Pagesize
430  *
431  * As with the TLB-/TSB-miss differentiation, we have no way of determining
432  * pagesize a priori.  This is therefore implemented by mandating a new rule:
433  * whenever the kernel fills the TLB in its TLB miss handler, the TTE
434  * corresponding to the TLB-missing VA must be in %g5 when the handler
435  * executes its "retry".  This allows the TLB return entry to determine
436  * pagesize by simply looking at the pagesize field in the TTE stored in
437  * %g5.
438  *
439  * TLB Statistics: Probe Effect
440  *
441  * As one might imagine, gathering TLB statistics by pushing a trap level
442  * induces significant probe effect.  To account for this probe effect,
443  * trapstat attempts to observe it by executing a code sequence with a known
444  * number of TLB misses both before and after interposing on the trap table.
445  * This allows trapstat to determine a per-trap probe effect which can then be
446  * factored into the "%tim" fields of the trapstat command.
447  *
448  * Note that on sun4v platforms, TLB misses are normally handled by the
449  * hypervisor or the hardware TSB walker. Thus no fast MMU miss information
450  * is reported for normal operation. However, when trapstat is invoked with
451  * the -t or -T option to collect detailed TLB statistics, the kernel takes
452  * over TLB miss handling. This results in significantly more overhead
453  * and TLB statistics may not be as accurate as on sun4u platforms.
454  *
455  * Locking
456  *
457  * The implementation uses two locks:  tstat_lock (a local lock) and the global
458  * cpu_lock.  tstat_lock is used to assure trapstat's consistency in the
459  * presence of multithreaded /dev/trapstat consumers (while as of this writing
460  * the only consumer of /dev/trapstat is single threaded, it is obviously
461  * necessary to correctly support multithreaded access).  cpu_lock is held
462  * whenever CPUs are being manipulated directly, to prevent them from
463  * disappearing in the process.  Because trapstat's DR callback
464  * (trapstat_cpu_setup()) must grab tstat_lock and is called with cpu_lock
465  * held, the lock ordering is necessarily cpu_lock before tstat_lock.
466  *
467  */
468 /* END CSTYLED */
469 
470 static dev_info_t	*tstat_devi;	/* devinfo node, saved at attach time */
471 static int		tstat_open;	/* set if driver is open */
472 static kmutex_t		tstat_lock;	/* serializes /dev/trapstat consumers */
473 static vmem_t		*tstat_arena;	/* arena for TLB-locked pages */
474 static tstat_percpu_t	*tstat_percpu;	/* per-CPU data, indexed by cpu_id */
475 static int		tstat_running;	/* set if trapstat is running */
476 static tstat_data_t	*tstat_buffer;	/* staging buffer for outgoing data */
477 static int		tstat_options;	/* bitmask of TSTAT_OPT_* options */
478 static int		*tstat_enabled;	/* map of enabled trap entries */
479 static int		tstat_tsbmiss_patched; /* set once tsbmiss paths are patched */
480 static callb_id_t	tstat_cprcb;	/* CPR callback */
481 static char		*tstat_probe_area; /* VA range used for probe effect */
482 static caddr_t		tstat_probe_phys; /* physical to back above VA */
483 static hrtime_t		tstat_probe_time; /* time spent on probe effect */
484 static hrtime_t		tstat_probe_before[TSTAT_PROBE_NLAPS]; /* per-lap times; presumably pre-interposition (confirm) */
485 static hrtime_t		tstat_probe_after[TSTAT_PROBE_NLAPS]; /* per-lap times; presumably post-interposition (confirm) */
486 static uint_t		tstat_pgszs;		/* # of kernel page sizes */
487 static uint_t		tstat_user_pgszs;	/* # of user page sizes */
488 
489 /*
490  * sizeof tstat_data_t + pgsz data for the kernel.  For simplicity's sake, when
491  * we collect data, we do it based upon szc, but when we report data back to
492  * userland, we have to do it based upon the userszc which may not match.
493  * So, these two variables are for internal use and exported use respectively.
494  */
495 static size_t		tstat_data_t_size;	/* internal use (szc-based) */
496 static size_t		tstat_data_t_exported_size; /* exported (userszc-based) */
497 
498 static size_t		tstat_data_pages;  /* number of pages of tstat data */
499 static size_t		tstat_data_size;   /* tstat data size in bytes */
500 static size_t		tstat_total_pages; /* #data pages + #instr pages */
501 static size_t		tstat_total_size;  /* tstat data size + instr size */
502 #ifdef sun4v
503 static caddr_t		tstat_va;	/* VA of memory reserved for TBA */
504 static pfn_t		tstat_pfn;	/* PFN of memory reserved for TBA */
505 #endif
506 
507 /*
508  * The tsbmiss patch points, declared extern here.  See "TLB Statistics: TLB
509  * Misses versus TSB Misses" in the block comment above for an explanation.
510  */
511 extern uint32_t		tsbmiss_trapstat_patch_point;
512 extern uint32_t		tsbmiss_trapstat_patch_point_kpm;
513 extern uint32_t		tsbmiss_trapstat_patch_point_kpm_small;
514 
515 /*
516  * Trapstat tsbmiss patch table: one entry per tsbmiss patch point address.
517  */
518 tstat_tsbmiss_patch_entry_t tstat_tsbmiss_patch_table[] = {
519 	{(uint32_t *)&tsbmiss_trapstat_patch_point, 0},
520 	{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm, 0},
521 	{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm_small, 0},
522 	{(uint32_t *)NULL, 0}	/* NULL address; presumably the end-of-table sentinel */
523 };
524 
525 /*
526  * We define some SPARC-specific constants to allow more readable relocations.
527  * DISP22() is the 22-bit word (byte offset >> 2) distance from "from" to "to".
528  */
529 #define	NOP	0x01000000	/* SPARC nop instruction encoding */
530 #define	HI22(v) ((uint32_t)(v) >> 10)	/* bits 31:10 of the low 32 bits of v */
531 #define	LO10(v) ((uint32_t)(v) & 0x3ff)	/* low 10 bits of v */
532 #define	LO12(v) ((uint32_t)(v) & 0xfff)	/* low 12 bits of v */
533 #define	DISP22(from, to) \
534 	((((uintptr_t)(to) - (uintptr_t)(from)) >> 2) & 0x3fffff)
535 #define	ASI(asi)	((asi) << 5)	/* asi shifted up to start at bit 5 */
536 
537 /*
538  * The interposing trap table must be locked in the I-TLB, and any data
539  * referred to in the interposing trap handler must be locked in the D-TLB.
540  * This function locks these pages in the appropriate TLBs by creating TTEs
541  * from whole cloth, and manually loading them into the TLB.  This function is
542  * called from cross call context.
543  *
544  * On sun4v platforms, we use 4M page size mappings to minimize the number
545  * of locked down entries (i.e. permanent mappings). Each CPU uses a
546  * reserved portion of that 4M page for its TBA and data.
547  */
548 static void
549 trapstat_load_tlb(void)
550 {
551 #ifndef sun4v
552 	int i;				/* page index (non-sun4v path only) */
553 #else
554 	uint64_t ret;			/* hv_mmu_map_perm_addr() return value */
555 #endif
556 	tte_t tte;			/* TTE assembled by hand below */
557 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];	/* executing CPU's entry */
558 	caddr_t va = tcpu->tcpu_vabase;	/* VA at which the mapping starts */
559 
560 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);	/* must be flagged allocated */
561 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));	/* and not flagged enabled */
562 
563 #ifndef sun4v
564 	for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) { /* one 8K TTE per page */
565 		tte.tte_inthi = TTE_VALID_INT | TTE_SZ_INT(TTE8K) |
566 			TTE_PFN_INTHI(tcpu->tcpu_pfn[i]);
567 		if (i < TSTAT_INSTR_PAGES) {	/* instruction pages come first */
568 			tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
569 				TTE_LCK_INT | TTE_CP_INT | TTE_PRIV_INT;
570 			sfmmu_itlb_ld(va, KCONTEXT, &tte);	/* TTE_LCK_INT set: I-TLB */
571 		} else {			/* remaining pages hold data */
572 			tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
573 				TTE_LCK_INT | TTE_CP_INT | TTE_CV_INT |
574 				TTE_PRIV_INT | TTE_HWWR_INT;
575 			sfmmu_dtlb_ld(va, KCONTEXT, &tte);	/* TTE_LCK_INT set: D-TLB */
576 		}
577 	}
578 #else /* sun4v */
579 	tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(tstat_pfn);	/* single TTE from tstat_pfn */
580 	tte.tte_intlo = TTE_PFN_INTLO(tstat_pfn) | TTE_CP_INT |
581 		TTE_CV_INT | TTE_PRIV_INT | TTE_HWWR_INT |
582 		TTE_SZ_INTLO(TTE4M);	/* 4M page size */
583 	ret = hv_mmu_map_perm_addr(va, KCONTEXT, *(uint64_t *)&tte,
584 		MAP_ITLB | MAP_DTLB);	/* same TTE requested for both TLBs */
585 
586 	if (ret != H_EOK)		/* a failed mapping is fatal */
587 		cmn_err(CE_PANIC, "trapstat: cannot map new TBA "
588 		    "for cpu %d  (error: 0x%lx)", CPU->cpu_id, ret);
589 #endif /* sun4v */
590 }
591 
592 /*
593  * As mentioned in the "TLB Statistics: TLB Misses versus TSB Misses" section
594  * of the block comment, TLB misses are differentiated from TSB misses in
595  * part by hot-patching the instructions at the tsbmiss patch points (see
596  * tstat_tsbmiss_patch_table). This routine is used both to initially patch
597  * the instructions, and to patch them back to their original values upon
598  * restoring the original trap table.
599  */
600 static void
601 trapstat_hotpatch()
602 {
603 	uint32_t instr;
604 	uint32_t simm13;
605 	tstat_tsbmiss_patch_entry_t *ep;
606 
607 	ASSERT(MUTEX_HELD(&tstat_lock));
608 
609 	if (!(tstat_options & TSTAT_OPT_TLBDATA))
610 		return;
611 
612 	if (!tstat_tsbmiss_patched) {
613 		/*
614 		 * We haven't patched the TSB paths; do so now.
615 		 */
616 		/*CONSTCOND*/
617 		ASSERT(offsetof(tstat_tlbret_t, ttlbr_ktsb) -
618 		    offsetof(tstat_tlbret_t, ttlbr_ktlb) ==
619 		    offsetof(tstat_tlbret_t, ttlbr_utsb) -
620 		    offsetof(tstat_tlbret_t, ttlbr_utlb));
621 
622 		simm13 = offsetof(tstat_tlbret_t, ttlbr_ktsb) -
623 		    offsetof(tstat_tlbret_t, ttlbr_ktlb);
624 
625 		for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
626 			ASSERT(ep->tpe_instr == 0);
627 			instr = ep->tpe_instr = *ep->tpe_addr;
628 
629 			/*
630 			 * Assert that the instruction we're about to patch is
631 			 * "add %g7, 0, %g7" (0x8e01e000).
632 			 */
633 			ASSERT(instr == TSTAT_TSBMISS_INSTR);
634 
635 			instr |= simm13;
636 			hot_patch_kernel_text((caddr_t)ep->tpe_addr,
637 			    instr, sizeof (instr));
638 		}
639 
640 		tstat_tsbmiss_patched = 1;
641 
642 	} else {
643 		/*
644 		 * Remove patches from the TSB paths.
645 		 */
646 		for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
647 			ASSERT(ep->tpe_instr == TSTAT_TSBMISS_INSTR);
648 			hot_patch_kernel_text((caddr_t)ep->tpe_addr,
649 			    ep->tpe_instr, sizeof (instr));
650 			ep->tpe_instr = 0;
651 		}
652 
653 		tstat_tsbmiss_patched = 0;
654 	}
655 }
656 
657 /*
658  * This is the routine executed to clock the performance of the trap table,
659  * executed both before and after interposing on the trap table to attempt to
660  * determine probe effect.  The probe effect is used to adjust the "%tim"
661  * fields of trapstat's -t and -T output; we only use TLB misses to clock the
662  * trap table.  We execute the inner loop (which is designed to exceed the
663  * TLB's reach) nlaps times, taking the best time as our time (thereby
664  * factoring out the effects of interrupts, cache misses or other perturbing
665  * events.
666  */
667 static hrtime_t
668 trapstat_probe_laps(int nlaps, hrtime_t *buf)
669 {
670 	int i, j = 0;
671 	hrtime_t ts, best = INT64_MAX;
672 
673 	while (nlaps--) {
674 		ts = rdtick();
675 
676 		for (i = 0; i < TSTAT_PROBE_SIZE; i += MMU_PAGESIZE)
677 			*((volatile char *)&tstat_probe_area[i]);
678 
679 		if ((ts = rdtick() - ts) < best)
680 			best = ts;
681 		buf[j++] = ts;
682 	}
683 
684 	return (best);
685 }
686 
687 /*
688  * This routine determines the probe effect by calling trapstat_probe_laps()
689  * both without and with the interposing trap table.  Note that this is
690  * called from a cross call on the desired CPU, and that it is called on
691  * every CPU (this is necessary because the probe effect may differ from
692  * one CPU to another).
693  */
694 static void
695 trapstat_probe()
696 {
697 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
698 	hrtime_t before, after;
699 
700 	if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
701 		return;
702 
703 	if (tstat_probe_area == NULL || (tstat_options & TSTAT_OPT_NOGO))
704 		return;
705 
706 	/*
707 	 * We very much expect the %tba to be KERNELBASE; this is a
708 	 * precautionary measure to assure that trapstat doesn't melt the
709 	 * machine should the %tba point unexpectedly elsewhere.
710 	 */
711 	if (get_tba() != (caddr_t)KERNELBASE)
712 		return;
713 
714 	/*
715 	 * Preserve this CPU's data before destroying it by enabling the
716 	 * interposing trap table.  We can safely use tstat_buffer because
717 	 * the caller of the trapstat_probe() cross call is holding tstat_lock.
718 	 */
719 	bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
720 
721 	tstat_probe_time = gethrtime();
722 
723 	before = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_before);
724 	(void) set_tba(tcpu->tcpu_ibase);
725 
726 	after = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_after);
727 	(void) set_tba((caddr_t)KERNELBASE);
728 
729 	tstat_probe_time = gethrtime() - tstat_probe_time;
730 
731 	bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
732 	tcpu->tcpu_data->tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES;
733 }
734 
735 static void
736 trapstat_probe_alloc()
737 {
738 	pfn_t pfn;
739 	caddr_t va;
740 	int i;
741 
742 	ASSERT(MUTEX_HELD(&tstat_lock));
743 	ASSERT(tstat_probe_area == NULL);
744 	ASSERT(tstat_probe_phys == NULL);
745 
746 	if (!(tstat_options & TSTAT_OPT_TLBDATA))
747 		return;
748 
749 	/*
750 	 * Grab some virtual from the heap arena.
751 	 */
752 	tstat_probe_area = vmem_alloc(heap_arena, TSTAT_PROBE_SIZE, VM_SLEEP);
753 	va = tstat_probe_area;
754 
755 	/*
756 	 * Grab a single physical page.
757 	 */
758 	tstat_probe_phys = vmem_alloc(tstat_arena, MMU_PAGESIZE, VM_SLEEP);
759 	pfn = hat_getpfnum(kas.a_hat, tstat_probe_phys);
760 
761 	/*
762 	 * Now set the translation for every page in our virtual range
763 	 * to be our allocated physical page.
764 	 */
765 	for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
766 		hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn, PROT_READ,
767 		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
768 		va += MMU_PAGESIZE;
769 	}
770 }
771 
772 static void
773 trapstat_probe_free()
774 {
775 	caddr_t va;
776 	int i;
777 
778 	ASSERT(MUTEX_HELD(&tstat_lock));
779 
780 	if ((va = tstat_probe_area) == NULL)
781 		return;
782 
783 	for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
784 		hat_unload(kas.a_hat, va, MMU_PAGESIZE, HAT_UNLOAD_UNLOCK);
785 		va += MMU_PAGESIZE;
786 	}
787 
788 	vmem_free(tstat_arena, tstat_probe_phys, MMU_PAGESIZE);
789 	vmem_free(heap_arena, tstat_probe_area, TSTAT_PROBE_SIZE);
790 
791 	tstat_probe_phys = NULL;
792 	tstat_probe_area = NULL;
793 }
794 
795 /*
796  * This routine actually enables a CPU by setting its %tba to be the
797  * CPU's interposing trap table.  It is called out of cross call context.
798  */
799 static void
800 trapstat_enable()
801 {
802 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
803 
804 	if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
805 		return;
806 
807 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
808 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
809 
810 	if (get_tba() != (caddr_t)KERNELBASE)
811 		return;
812 
813 	if (!(tstat_options & TSTAT_OPT_NOGO))
814 		(void) set_tba(tcpu->tcpu_ibase);
815 	tcpu->tcpu_flags |= TSTAT_CPU_ENABLED;
816 #ifdef sun4v
817 	if (tstat_options & (TSTAT_OPT_TLBDATA | TSTAT_OPT_NOGO)) {
818 		/*
819 		 * On sun4v platforms, TLB misses are normally handled by the
820 		 * hypervisor or the hardware -- provided one or more TSBs
821 		 * have been setup and communicated via hv_set_ctx0 and
822 		 * hv_set_nonctx0 API.  However, as part of collecting TLB
823 		 * statistics, we disabled this miss processing by telling the
824 		 * hypervisor that there was not a TSB; we now need to
825 		 * communicate the proper kernel/user TSB information to
826 		 * resume efficient operation.
827 		 *
828 		 * While we restore kernel TSB information immediately, to
829 		 * avoid any locking dependency, we don't restore user TSB
830 		 * information right away.  Rather, we simply clear the
831 		 * TSTAT_TLB_STATS flag so that the user TSB information is
832 		 * automatically restored on the next context switch.
833 		 *
834 		 * Note that the call to restore kernel TSB information is not
835 		 * expected to fail.  Even in the event of failure, the system
836 		 * will still continue to function properly, if in a state of
837 		 * reduced performance due to the guest kernel handling all
838 		 * TLB misses.
839 		 */
840 		cpu_t *cp = CPU;
841 
842 		cp->cpu_m.cpu_tstat_flags |= TSTAT_TLB_STATS;
843 		(void) hv_set_ctx0(NULL, NULL);
844 		(void) hv_set_ctxnon0(NULL, NULL);
845 	}
846 #endif
847 }
848 
849 /*
850  * This routine disables a CPU (vis a vis trapstat) by setting its %tba to be
851  * the actual, underlying trap table.  It is called out of cross call context.
852  */
853 static void
854 trapstat_disable()
855 {
856 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
857 
858 	if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
859 		return;
860 
861 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
862 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
863 
864 	if (!(tstat_options & TSTAT_OPT_NOGO))
865 		(void) set_tba((caddr_t)KERNELBASE);
866 
867 	tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
868 
869 #ifdef sun4v
870 	if (tstat_options & (TSTAT_OPT_TLBDATA | TSTAT_OPT_NOGO)) {
871 		/*
872 		 * On sun4v platforms, TlB misses are normally handled by
873 		 * the hypervisor or the hardware provided one or more TSBs
874 		 * have been setup and communicated via hv_set_ctx0 and
875 		 * hv_set_nonctx0 API. However, as part of collecting TLB
876 		 * statistics, we disabled that by faking NO TSB and we
877 		 * need to communicate proper kernel/user TSB information
878 		 * so that TLB misses can be handled by the hypervisor or
879 		 * the hardware more efficiently.
880 		 *
881 		 * We restore kernel TSB information right away. However,
882 		 * to minimize any locking dependency, we don't restore
883 		 * user TSB information right away. Instead, we simply
884 		 * clear the TSTAT_TLB_STATS flag so that the user TSB
885 		 * information is automatically restored on next context
886 		 * switch.
887 		 *
888 		 * Note that the call to restore kernel TSB information
889 		 * will normally not fail, unless wrong information is
890 		 * passed here. In that scenario, system will still
891 		 * continue to function properly with the exception of
892 		 * kernel handling all the TLB misses.
893 		 */
894 		struct hv_tsb_block *hvbp = &ksfmmup->sfmmu_hvblock;
895 		cpu_t *cp = CPU;
896 
897 		cp->cpu_m.cpu_tstat_flags &= ~TSTAT_TLB_STATS;
898 		(void) hv_set_ctx0(hvbp->hv_tsb_info_cnt, hvbp->hv_tsb_info_pa);
899 	}
900 #endif
901 }
902 
903 /*
904  * We use %tick as the time base when recording the time spent executing
905  * the trap handler.  %tick, however, is not necessarily kept in sync
906  * across CPUs (indeed, different CPUs may have different %tick frequencies).
907  * We therefore cross call onto a CPU to get a snapshot of its data to
908  * copy out; this is the routine executed out of that cross call.
909  */
910 static void
911 trapstat_snapshot()
912 {
913 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
914 	tstat_data_t *data = tcpu->tcpu_data;
915 
916 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
917 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
918 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ENABLED);
919 
920 	data->tdata_snapts = gethrtime();
921 	data->tdata_snaptick = rdtick();
922 	bcopy(data, tstat_buffer, tstat_data_t_size);
923 }
924 
925 /*
926  * The TSTAT_RETENT_* constants define offsets in the TLB return entry.
927  * They are used only in trapstat_tlbretent() (below) and #undef'd
928  * immediately afterwards.  Any change to "retent" in trapstat_tlbretent()
929  * will likely require changes to these constants.
930  */
931 
932 #ifndef	sun4v
933 #define	TSTAT_RETENT_STATHI	1
934 #define	TSTAT_RETENT_STATLO	2
935 #define	TSTAT_RETENT_SHIFT	11
936 #define	TSTAT_RETENT_COUNT_LD	13
937 #define	TSTAT_RETENT_COUNT_ST	15
938 #define	TSTAT_RETENT_TMPTSHI	16
939 #define	TSTAT_RETENT_TMPTSLO	17
940 #define	TSTAT_RETENT_TIME_LD	19
941 #define	TSTAT_RETENT_TIME_ST	21
942 #else /* sun4v */
943 #define	TSTAT_RETENT_STATHI	1
944 #define	TSTAT_RETENT_STATLO	2
945 #define	TSTAT_RETENT_SHIFT	5
946 #define	TSTAT_RETENT_COUNT_LD	7
947 #define	TSTAT_RETENT_COUNT_ST	9
948 #define	TSTAT_RETENT_TMPTSHI	10
949 #define	TSTAT_RETENT_TMPTSLO	11
950 #define	TSTAT_RETENT_TIME_LD	13
951 #define	TSTAT_RETENT_TIME_ST	15
952 #endif /* sun4v */
953 
954 static void
955 trapstat_tlbretent(tstat_percpu_t *tcpu, tstat_tlbretent_t *ret,
956     tstat_missdata_t *data)
957 {
958 	uint32_t *ent = ret->ttlbrent_instr, shift;
959 	uintptr_t base, tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
960 
961 	/*
962 	 * This is the entry executed upon return from the TLB/TSB miss
963 	 * handler (i.e. the code interpositioned between the "retry" and
964 	 * the actual return to the TLB-missing instruction).  Detail on its
965 	 * theory of operation can be found in the "TLB Statistics" section
966 	 * of the block comment.  Note that we expect the TTE just loaded
967 	 * into the TLB to be in %g5; all other globals are available as
968 	 * scratch.  Finally, note that the page size information in sun4v is
969 	 * located in the lower bits of the TTE -- requiring us to have a
970 	 * different return entry on sun4v.
971 	 */
972 	static const uint32_t retent[TSTAT_TLBRET_NINSTR] = {
973 #ifndef sun4v
974 	    0x87410000,		/* rd    %tick, %g3			*/
975 	    0x03000000, 	/* sethi %hi(stat), %g1			*/
976 	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
977 	    0x89297001,		/* sllx  %g5, 1, %g4			*/
978 	    0x8931303e,		/* srlx  %g4, 62, %g4			*/
979 	    0x8531702e,		/* srlx  %g5, 46, %g2			*/
980 	    0x8408a004,		/* and   %g2, 4, %g2			*/
981 	    0x88110002,		/* or    %g4, %g2, %g4			*/
982 	    0x80a12005,		/* cmp   %g4, 5				*/
983 	    0x34400002,		/* bg,a,pn %icc, +8			*/
984 	    0x88102004,		/* mov   4, %g4				*/
985 	    0x89292000,		/* sll   %g4, shift, %g4		*/
986 	    0x82004004,		/* add   %g1, %g4, %g1			*/
987 	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
988 	    0x8400a001,		/* add   %g2, 1, %g2			*/
989 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
990 	    0x0d000000, 	/* sethi %hi(tdata_tmptick), %g6	*/
991 	    0xc459a000, 	/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
992 	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
993 	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
994 	    0x84008003,		/* add   %g2, %g3, %g2			*/
995 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
996 	    0x83f00000		/* retry				*/
997 #else /* sun4v */
998 	    0x87410000,		/* rd    %tick, %g3			*/
999 	    0x03000000, 	/* sethi %hi(stat), %g1			*/
1000 	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
1001 	    0x8929703d,		/* sllx  %g5, 61, %g4			*/
1002 	    0x8931303d,		/* srlx  %g4, 61, %g4			*/
1003 	    0x89292000,		/* sll   %g4, shift, %g4		*/
1004 	    0x82004004,		/* add   %g1, %g4, %g1			*/
1005 	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
1006 	    0x8400a001,		/* add   %g2, 1, %g2			*/
1007 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
1008 	    0x0d000000, 	/* sethi %hi(tdata_tmptick), %g6	*/
1009 	    0xc459a000, 	/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
1010 	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
1011 	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
1012 	    0x84008003,		/* add   %g2, %g3, %g2			*/
1013 	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
1014 	    0x83f00000		/* retry				*/
1015 #endif /* sun4v */
1016 	};
1017 
1018 	ASSERT(MUTEX_HELD(&tstat_lock));
1019 	/*CONSTCOND*/
1020 	ASSERT(offsetof(tstat_missdata_t, tmiss_count) <= LO10(-1));
1021 	/*CONSTCOND*/
1022 	ASSERT(offsetof(tstat_missdata_t, tmiss_time) <= LO10(-1));
1023 	/*CONSTCOND*/
1024 	ASSERT(!((sizeof (tstat_pgszdata_t) - 1) & sizeof (tstat_pgszdata_t)));
1025 
1026 	for (shift = 1; (1 << shift) != sizeof (tstat_pgszdata_t); shift++)
1027 		continue;
1028 
1029 	base = (uintptr_t)tcpu->tcpu_dbase +
1030 	    ((uintptr_t)data - (uintptr_t)tcpu->tcpu_data);
1031 
1032 	bcopy(retent, ent, sizeof (retent));
1033 
1034 	ent[TSTAT_RETENT_STATHI] |= HI22(base);
1035 	ent[TSTAT_RETENT_STATLO] |= LO10(base);
1036 	ent[TSTAT_RETENT_SHIFT] |= shift;
1037 	/* LINTED E_EXPR_NULL_EFFECT */
1038 	ent[TSTAT_RETENT_COUNT_LD] |= offsetof(tstat_missdata_t, tmiss_count);
1039 	/* LINTED E_EXPR_NULL_EFFECT */
1040 	ent[TSTAT_RETENT_COUNT_ST] |= offsetof(tstat_missdata_t, tmiss_count);
1041 	ent[TSTAT_RETENT_TMPTSHI] |= HI22(tmptick);
1042 	ent[TSTAT_RETENT_TMPTSLO] |= LO10(tmptick);
1043 	ent[TSTAT_RETENT_TIME_LD] |= offsetof(tstat_missdata_t, tmiss_time);
1044 	ent[TSTAT_RETENT_TIME_ST] |= offsetof(tstat_missdata_t, tmiss_time);
1045 }
1046 
1047 #undef TSTAT_RETENT_STATHI
1048 #undef TSTAT_RETENT_STATLO
1049 #undef TSTAT_RETENT_SHIFT
1050 #undef TSTAT_RETENT_COUNT_LD
1051 #undef TSTAT_RETENT_COUNT_ST
1052 #undef TSTAT_RETENT_TMPTSHI
1053 #undef TSTAT_RETENT_TMPTSLO
1054 #undef TSTAT_RETENT_TIME_LD
1055 #undef TSTAT_RETENT_TIME_ST
1056 
1057 /*
1058  * The TSTAT_TLBENT_* constants define offsets in the TLB entry.  They are
1059  * used only in trapstat_tlbent() (below) and #undef'd immediately afterwards.
1060  * Any change to "tlbent" in trapstat_tlbent() will likely require changes
1061  * to these constants.
1062  */
1063 
1064 #ifndef sun4v
1065 #define	TSTAT_TLBENT_STATHI	0
1066 #define	TSTAT_TLBENT_STATLO_LD	1
1067 #define	TSTAT_TLBENT_STATLO_ST	3
1068 #define	TSTAT_TLBENT_MMUASI	15
1069 #define	TSTAT_TLBENT_TPCHI	18
1070 #define	TSTAT_TLBENT_TPCLO_USER	19
1071 #define	TSTAT_TLBENT_TPCLO_KERN	21
1072 #define	TSTAT_TLBENT_TSHI	25
1073 #define	TSTAT_TLBENT_TSLO	27
1074 #define	TSTAT_TLBENT_BA		28
1075 #else /* sun4v */
1076 #define	TSTAT_TLBENT_STATHI	0
1077 #define	TSTAT_TLBENT_STATLO_LD	1
1078 #define	TSTAT_TLBENT_STATLO_ST	3
1079 #define	TSTAT_TLBENT_TAGTARGET	19
1080 #define	TSTAT_TLBENT_TPCHI	21
1081 #define	TSTAT_TLBENT_TPCLO_USER	22
1082 #define	TSTAT_TLBENT_TPCLO_KERN	24
1083 #define	TSTAT_TLBENT_TSHI	28
1084 #define	TSTAT_TLBENT_TSLO	30
1085 #define	TSTAT_TLBENT_BA		31
1086 #endif /* sun4v */
1087 
1088 static void
1089 trapstat_tlbent(tstat_percpu_t *tcpu, int entno)
1090 {
1091 	uint32_t *ent;
1092 	uintptr_t orig, va, baoffs;
1093 	int itlb = entno == TSTAT_ENT_ITLBMISS;
1094 	int entoffs = entno << TSTAT_ENT_SHIFT;
1095 	uintptr_t tmptick, stat, tpc, utpc;
1096 	tstat_pgszdata_t *data = &tcpu->tcpu_data->tdata_pgsz[0];
1097 	tstat_tlbdata_t *udata, *kdata;
1098 	tstat_tlbret_t *ret;
1099 #ifndef sun4v
1100 	uint32_t asi = itlb ? ASI(ASI_IMMU) : ASI(ASI_DMMU);
1101 #else
1102 	uint32_t tagtarget_off = itlb ? MMFSA_I_CTX : MMFSA_D_CTX;
1103 #endif
1104 
1105 	/*
1106 	 * When trapstat is run with TLB statistics, this is the entry for
1107 	 * both I- and D-TLB misses; this code performs trap level pushing,
1108 	 * as described in the "TLB Statistics" section of the block comment.
1109 	 * This code is executing at TL 1; %tstate[0] contains the saved
1110 	 * state at the time of the TLB miss.  Pushing trap level 1 (and thus
1111 	 * raising TL to 2) requires us to fill in %tstate[1] with our %pstate,
1112 	 * %cwp and %asi.  We leave %tt unchanged, and we set %tpc and %tnpc to
1113 	 * the appropriate TLB return entry (based on the context of the miss).
1114 	 * Finally, we sample %tick, and stash it in the tdata_tmptick member
1115 	 * the per-CPU tstat_data structure.  tdata_tmptick will be used in
1116 	 * the TLB return entry to determine the amount of time spent in the
1117 	 * TLB miss handler.
1118 	 *
1119 	 * Note that on sun4v platforms, we must obtain the context information
1120 	 * from the MMU fault status area. (The base address of this MMU fault
1121 	 * status area is kept in the scratchpad register 0.)
1122 	 */
1123 	static const uint32_t tlbent[] = {
1124 #ifndef sun4v
1125 	    0x03000000, 		/* sethi %hi(stat), %g1		*/
1126 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1127 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1128 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1129 	    0x85524000,			/* rdpr  %cwp, %g2		*/
1130 	    0x87518000,			/* rdpr  %pstate, %g3		*/
1131 	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
1132 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1133 	    0x8740c000,			/* rd    %asi, %g3		*/
1134 	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
1135 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1136 	    0x8350c000,			/* rdpr  %tt, %g1		*/
1137 	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
1138 	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
1139 	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
1140 	    0xc2d80000,			/* ldxa  [%g0]ASI_MMU, %g1	*/
1141 	    0x83307030,			/* srlx  %g1, CTXSHIFT, %g1	*/
1142 	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
1143 	    0x03000000, 		/* sethi %hi(new_tpc), %g1	*/
1144 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1145 	    0x30800002,			/* ba,a  .+0x8			*/
1146 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1147 	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
1148 	    0x82006004,			/* add   %g1, 4, %g1		*/
1149 	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
1150 	    0x03000000, 		/* sethi %hi(tmptick), %g1	*/
1151 	    0x85410000,			/* rd    %tick, %g2		*/
1152 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
1153 	    0x30800000,			/* ba,a  addr			*/
1154 	    NOP, NOP, NOP
1155 #else /* sun4v */
1156 	    0x03000000, 		/* sethi %hi(stat), %g1		*/
1157 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1158 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1159 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1160 	    0x85524000,			/* rdpr  %cwp, %g2		*/
1161 	    0x87518000,			/* rdpr  %pstate, %g3		*/
1162 	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
1163 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1164 	    0x8740c000,			/* rd    %asi, %g3		*/
1165 	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
1166 	    0x83540000,			/* rdpr  %gl, %g1		*/
1167 	    0x83287028,			/* sllx  %g1, 40, %g1		*/
1168 	    0x86104003,			/* or    %g1, %g3, %g3		*/
1169 	    0x84108003,			/* or    %g2, %g3, %g2		*/
1170 	    0x8350c000,			/* rdpr  %tt, %g1		*/
1171 	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
1172 	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
1173 	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
1174 	    0xc2d80400,			/* ldxa  [%g0]ASI_SCRATCHPAD, %g1 */
1175 	    0xc2586000,			/* ldx  [%g1 + MMFSA_?_CTX], %g1 */
1176 	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
1177 	    0x03000000, 		/* sethi %hi(new_tpc), %g1	*/
1178 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1179 	    0x30800002,			/* ba,a  .+0x8			*/
1180 	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
1181 	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
1182 	    0x82006004,			/* add   %g1, 4, %g1		*/
1183 	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
1184 	    0x03000000, 		/* sethi %hi(tmptick), %g1	*/
1185 	    0x85410000,			/* rd    %tick, %g2		*/
1186 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
1187 	    0x30800000			/* ba,a  addr			*/
1188 #endif /* sun4v */
1189 	};
1190 
1191 	ASSERT(MUTEX_HELD(&tstat_lock));
1192 	ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS);
1193 
1194 	stat = TSTAT_DATA_OFFS(tcpu, tdata_traps) + entoffs;
1195 	tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);
1196 
1197 	if (itlb) {
1198 		ret = &tcpu->tcpu_instr->tinst_itlbret;
1199 		udata = &data->tpgsz_user.tmode_itlb;
1200 		kdata = &data->tpgsz_kernel.tmode_itlb;
1201 		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_itlbret.ttlbr_ktlb);
1202 	} else {
1203 		ret = &tcpu->tcpu_instr->tinst_dtlbret;
1204 		udata = &data->tpgsz_user.tmode_dtlb;
1205 		kdata = &data->tpgsz_kernel.tmode_dtlb;
1206 		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_dtlbret.ttlbr_ktlb);
1207 	}
1208 
1209 	utpc = tpc + offsetof(tstat_tlbret_t, ttlbr_utlb) -
1210 	    offsetof(tstat_tlbret_t, ttlbr_ktlb);
1211 
1212 	ASSERT(HI22(tpc) == HI22(utpc));
1213 
1214 	ent = (uint32_t *)((uintptr_t)tcpu->tcpu_instr + entoffs);
1215 	orig = KERNELBASE + entoffs;
1216 	va = (uintptr_t)tcpu->tcpu_ibase + entoffs;
1217 	baoffs = TSTAT_TLBENT_BA * sizeof (uint32_t);
1218 
1219 	bcopy(tlbent, ent, sizeof (tlbent));
1220 
1221 	ent[TSTAT_TLBENT_STATHI] |= HI22(stat);
1222 	ent[TSTAT_TLBENT_STATLO_LD] |= LO10(stat);
1223 	ent[TSTAT_TLBENT_STATLO_ST] |= LO10(stat);
1224 #ifndef sun4v
1225 	ent[TSTAT_TLBENT_MMUASI] |= asi;
1226 #else
1227 	ent[TSTAT_TLBENT_TAGTARGET] |= tagtarget_off;
1228 #endif
1229 	ent[TSTAT_TLBENT_TPCHI] |= HI22(tpc);
1230 	ent[TSTAT_TLBENT_TPCLO_USER] |= LO10(utpc);
1231 	ent[TSTAT_TLBENT_TPCLO_KERN] |= LO10(tpc);
1232 	ent[TSTAT_TLBENT_TSHI] |= HI22(tmptick);
1233 	ent[TSTAT_TLBENT_TSLO] |= LO10(tmptick);
1234 	ent[TSTAT_TLBENT_BA] |= DISP22(va + baoffs, orig);
1235 
1236 	/*
1237 	 * And now set up the TLB return entries.
1238 	 */
1239 	trapstat_tlbretent(tcpu, &ret->ttlbr_ktlb, &kdata->ttlb_tlb);
1240 	trapstat_tlbretent(tcpu, &ret->ttlbr_ktsb, &kdata->ttlb_tsb);
1241 	trapstat_tlbretent(tcpu, &ret->ttlbr_utlb, &udata->ttlb_tlb);
1242 	trapstat_tlbretent(tcpu, &ret->ttlbr_utsb, &udata->ttlb_tsb);
1243 }
1244 
1245 #undef TSTAT_TLBENT_STATHI
1246 #undef TSTAT_TLBENT_STATLO_LD
1247 #undef TSTAT_TLBENT_STATLO_ST
1248 #ifndef sun4v
1249 #undef TSTAT_TLBENT_MMUASI
1250 #else
1251 #undef TSTAT_TLBENT_TAGTARGET
1252 #endif
1253 #undef TSTAT_TLBENT_TPCHI
1254 #undef TSTAT_TLBENT_TPCLO_USER
1255 #undef TSTAT_TLBENT_TPCLO_KERN
1256 #undef TSTAT_TLBENT_TSHI
1257 #undef TSTAT_TLBENT_TSLO
1258 #undef TSTAT_TLBENT_BA
1259 
1260 /*
1261  * The TSTAT_ENABLED_* constants define offsets in the enabled entry; the
1262  * TSTAT_DISABLED_BA constant defines an offset in the disabled entry.  Both
1263  * sets of constants are used only in trapstat_make_traptab() (below) and
1264  * #undef'd immediately afterwards.  Any change to "enabled" or "disabled"
1265  * in trapstat_make_traptab() will likely require changes to these constants.
1266  */
1267 #define	TSTAT_ENABLED_STATHI	0
1268 #define	TSTAT_ENABLED_STATLO_LD	1
1269 #define	TSTAT_ENABLED_STATLO_ST 3
1270 #define	TSTAT_ENABLED_BA	4
1271 #define	TSTAT_DISABLED_BA	0
1272 
1273 static void
1274 trapstat_make_traptab(tstat_percpu_t *tcpu)
1275 {
1276 	uint32_t *ent;
1277 	uint64_t *stat;
1278 	uintptr_t orig, va, en_baoffs, dis_baoffs;
1279 	int nent;
1280 
1281 	/*
1282 	 * This is the entry in the interposing trap table for enabled trap
1283 	 * table entries.  It loads a counter, increments it and stores it
1284 	 * back before branching to the actual trap table entry.
1285 	 */
1286 	static const uint32_t enabled[TSTAT_ENT_NINSTR] = {
1287 	    0x03000000, 		/* sethi %hi(stat), %g1		*/
1288 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1289 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1290 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1291 	    0x30800000,			/* ba,a addr			*/
1292 	    NOP, NOP, NOP
1293 	};
1294 
1295 	/*
1296 	 * This is the entry in the interposing trap table for disabled trap
1297 	 * table entries.  It simply branches to the actual, underlying trap
1298 	 * table entry.  As explained in the "Implementation Details" section
1299 	 * of the block comment, all TL>0 traps _must_ use the disabled entry;
1300 	 * additional entries may be explicitly disabled through the use
1301 	 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY.
1302 	 */
1303 	static const uint32_t disabled[TSTAT_ENT_NINSTR] = {
1304 	    0x30800000,			/* ba,a addr			*/
1305 	    NOP, NOP, NOP, NOP, NOP, NOP, NOP,
1306 	};
1307 
1308 	ASSERT(MUTEX_HELD(&tstat_lock));
1309 
1310 	ent = tcpu->tcpu_instr->tinst_traptab;
1311 	stat = (uint64_t *)TSTAT_DATA_OFFS(tcpu, tdata_traps);
1312 	orig = KERNELBASE;
1313 	va = (uintptr_t)tcpu->tcpu_ibase;
1314 	en_baoffs = TSTAT_ENABLED_BA * sizeof (uint32_t);
1315 	dis_baoffs = TSTAT_DISABLED_BA * sizeof (uint32_t);
1316 
1317 	for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) {
1318 		if (tstat_enabled[nent]) {
1319 			bcopy(enabled, ent, sizeof (enabled));
1320 			ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat);
1321 			ent[TSTAT_ENABLED_STATLO_LD] |= LO10((uintptr_t)stat);
1322 			ent[TSTAT_ENABLED_STATLO_ST] |= LO10((uintptr_t)stat);
1323 			ent[TSTAT_ENABLED_BA] |= DISP22(va + en_baoffs, orig);
1324 		} else {
1325 			bcopy(disabled, ent, sizeof (disabled));
1326 			ent[TSTAT_DISABLED_BA] |= DISP22(va + dis_baoffs, orig);
1327 		}
1328 
1329 		stat++;
1330 		orig += sizeof (enabled);
1331 		ent += sizeof (enabled) / sizeof (*ent);
1332 		va += sizeof (enabled);
1333 	}
1334 }
1335 
1336 #undef TSTAT_ENABLED_STATHI
1337 #undef TSTAT_ENABLED_STATLO_LD
1338 #undef TSTAT_ENABLED_STATLO_ST
1339 #undef TSTAT_ENABLED_BA
1340 #undef TSTAT_DISABLED_BA
1341 
1342 static void
1343 trapstat_setup(processorid_t cpu)
1344 {
1345 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1346 #ifndef sun4v
1347 	int i;
1348 	caddr_t va;
1349 	pfn_t *pfn;
1350 #endif
1351 
1352 	ASSERT(tcpu->tcpu_pfn == NULL);
1353 	ASSERT(tcpu->tcpu_instr == NULL);
1354 	ASSERT(tcpu->tcpu_data == NULL);
1355 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1356 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
1357 	ASSERT(MUTEX_HELD(&cpu_lock));
1358 	ASSERT(MUTEX_HELD(&tstat_lock));
1359 
1360 	/*
1361 	 * The lower fifteen bits of the %tba are always read as zero; we must
1362 	 * align our instruction base address appropriately.
1363 	 */
1364 #ifndef sun4v
1365 	tcpu->tcpu_ibase = (caddr_t)((KERNELBASE - tstat_total_size)
1366 		& TSTAT_TBA_MASK);
1367 	tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE;
1368 	tcpu->tcpu_vabase = tcpu->tcpu_ibase;
1369 
1370 	tcpu->tcpu_pfn = vmem_alloc(tstat_arena, tstat_total_pages, VM_SLEEP);
1371 	bzero(tcpu->tcpu_pfn, tstat_total_pages);
1372 	pfn = tcpu->tcpu_pfn;
1373 
1374 	tcpu->tcpu_instr = vmem_alloc(tstat_arena, TSTAT_INSTR_SIZE, VM_SLEEP);
1375 
1376 	va = (caddr_t)tcpu->tcpu_instr;
1377 	for (i = 0; i < TSTAT_INSTR_PAGES; i++, va += MMU_PAGESIZE)
1378 		*pfn++ = hat_getpfnum(kas.a_hat, va);
1379 
1380 	/*
1381 	 * We must be sure that the pages that we will use to examine the data
1382 	 * have the same virtual color as the pages to which the data is being
1383 	 * recorded, hence the alignment and phase constraints on the
1384 	 * allocation.
1385 	 */
1386 	tcpu->tcpu_data = vmem_xalloc(tstat_arena, tstat_data_size,
1387 	    shm_alignment, (uintptr_t)tcpu->tcpu_dbase & (shm_alignment - 1),
1388 	    0, 0, NULL, VM_SLEEP);
1389 	bzero(tcpu->tcpu_data, tstat_data_size);
1390 	tcpu->tcpu_data->tdata_cpuid = cpu;
1391 
1392 	va = (caddr_t)tcpu->tcpu_data;
1393 	for (i = 0; i < tstat_data_pages; i++, va += MMU_PAGESIZE)
1394 		*pfn++ = hat_getpfnum(kas.a_hat, va);
1395 #else /* sun4v */
1396 	ASSERT(!(tstat_total_size > (1 + ~TSTAT_TBA_MASK)));
1397 	tcpu->tcpu_vabase = (caddr_t)(KERNELBASE - MMU_PAGESIZE4M);
1398 	tcpu->tcpu_ibase = tcpu->tcpu_vabase + (cpu * (1 + ~TSTAT_TBA_MASK));
1399 	tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE;
1400 
1401 	tcpu->tcpu_pfn = &tstat_pfn;
1402 	tcpu->tcpu_instr = (tstat_instr_t *)(tstat_va + (cpu *
1403 		(1 + ~TSTAT_TBA_MASK)));
1404 	tcpu->tcpu_data = (tstat_data_t *)(tstat_va + (cpu *
1405 		(1 + ~TSTAT_TBA_MASK)) + TSTAT_INSTR_SIZE);
1406 	bzero(tcpu->tcpu_data, tstat_data_size);
1407 	tcpu->tcpu_data->tdata_cpuid = cpu;
1408 #endif /* sun4v */
1409 
1410 	/*
1411 	 * Now that we have all of the instruction and data pages allocated,
1412 	 * make the trap table from scratch.
1413 	 */
1414 	trapstat_make_traptab(tcpu);
1415 
1416 	if (tstat_options & TSTAT_OPT_TLBDATA) {
1417 		/*
1418 		 * TLB Statistics have been specified; set up the I- and D-TLB
1419 		 * entries and corresponding TLB return entries.
1420 		 */
1421 		trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
1422 		trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
1423 	}
1424 
1425 	tcpu->tcpu_flags |= TSTAT_CPU_ALLOCATED;
1426 
1427 	/*
1428 	 * Finally, get the target CPU to load the locked pages into its TLBs.
1429 	 */
1430 	xc_one(cpu, (xcfunc_t *)trapstat_load_tlb, 0, 0);
1431 }
1432 
1433 static void
1434 trapstat_teardown(processorid_t cpu)
1435 {
1436 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1437 #ifndef sun4v
1438 	int i;
1439 #endif
1440 	caddr_t va = tcpu->tcpu_vabase;
1441 
1442 	ASSERT(tcpu->tcpu_pfn != NULL);
1443 	ASSERT(tcpu->tcpu_instr != NULL);
1444 	ASSERT(tcpu->tcpu_data != NULL);
1445 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1446 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1447 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1448 	ASSERT(MUTEX_HELD(&cpu_lock));
1449 	ASSERT(MUTEX_HELD(&tstat_lock));
1450 
1451 #ifndef sun4v
1452 	vmem_free(tstat_arena, tcpu->tcpu_pfn, tstat_total_pages);
1453 	vmem_free(tstat_arena, tcpu->tcpu_instr, TSTAT_INSTR_SIZE);
1454 	vmem_free(tstat_arena, tcpu->tcpu_data, tstat_data_size);
1455 
1456 	for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
1457 		xt_one(cpu, vtag_flushpage_tl1, (uint64_t)va, KCONTEXT);
1458 	}
1459 #else
1460 	xt_one(cpu, vtag_unmap_perm_tl1, (uint64_t)va, KCONTEXT);
1461 #endif
1462 
1463 	tcpu->tcpu_pfn = NULL;
1464 	tcpu->tcpu_instr = NULL;
1465 	tcpu->tcpu_data = NULL;
1466 	tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
1467 }
1468 
1469 static int
1470 trapstat_go()
1471 {
1472 	cpu_t *cp;
1473 
1474 	mutex_enter(&cpu_lock);
1475 	mutex_enter(&tstat_lock);
1476 
1477 	if (tstat_running) {
1478 		mutex_exit(&tstat_lock);
1479 		mutex_exit(&cpu_lock);
1480 		return (EBUSY);
1481 	}
1482 
1483 #ifdef sun4v
1484 	/*
1485 	 * Allocate large page to hold interposing tables
1486 	 */
1487 	tstat_va = contig_mem_alloc(MMU_PAGESIZE4M);
1488 	tstat_pfn = va_to_pfn(tstat_va);
1489 	if (tstat_pfn == PFN_INVALID) {
1490 		contig_mem_free(tstat_va, MMU_PAGESIZE4M);
1491 		return (EAGAIN);
1492 	}
1493 #endif
1494 
1495 	/*
1496 	 * First, perform any necessary hot patching.
1497 	 */
1498 	trapstat_hotpatch();
1499 
1500 	/*
1501 	 * Allocate the resources we'll need to measure probe effect.
1502 	 */
1503 	trapstat_probe_alloc();
1504 
1505 
1506 	cp = cpu_list;
1507 	do {
1508 		if (!(tstat_percpu[cp->cpu_id].tcpu_flags & TSTAT_CPU_SELECTED))
1509 			continue;
1510 
1511 		trapstat_setup(cp->cpu_id);
1512 
1513 		/*
1514 		 * Note that due to trapstat_probe()'s use of global data,
1515 		 * we determine the probe effect on each CPU serially instead
1516 		 * of in parallel with an xc_all().
1517 		 */
1518 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_probe, 0, 0);
1519 	} while ((cp = cp->cpu_next) != cpu_list);
1520 
1521 	xc_all((xcfunc_t *)trapstat_enable, 0, 0);
1522 
1523 	trapstat_probe_free();
1524 	tstat_running = 1;
1525 	mutex_exit(&tstat_lock);
1526 	mutex_exit(&cpu_lock);
1527 
1528 	return (0);
1529 }
1530 
1531 static int
1532 trapstat_stop()
1533 {
1534 	int i;
1535 
1536 	mutex_enter(&cpu_lock);
1537 	mutex_enter(&tstat_lock);
1538 	if (!tstat_running) {
1539 		mutex_exit(&tstat_lock);
1540 		mutex_exit(&cpu_lock);
1541 		return (ENXIO);
1542 	}
1543 
1544 	xc_all((xcfunc_t *)trapstat_disable, 0, 0);
1545 
1546 	for (i = 0; i <= max_cpuid; i++) {
1547 		if (tstat_percpu[i].tcpu_flags & TSTAT_CPU_ALLOCATED)
1548 			trapstat_teardown(i);
1549 	}
1550 
1551 #ifdef sun4v
1552 	contig_mem_free(tstat_va, MMU_PAGESIZE4M);
1553 #endif
1554 	trapstat_hotpatch();
1555 	tstat_running = 0;
1556 	mutex_exit(&tstat_lock);
1557 	mutex_exit(&cpu_lock);
1558 
1559 	return (0);
1560 }
1561 
1562 /*
1563  * This is trapstat's DR CPU configuration callback.  It's called (with
1564  * cpu_lock held) to unconfigure a newly powered-off CPU, or to configure a
1565  * powered-off CPU that is to be brought into the system.  We need only take
1566  * action in the unconfigure case:  because a powered-off CPU will have its
1567  * trap table restored to KERNELBASE if it is ever powered back on, we must
1568  * update the flags to reflect that trapstat is no longer enabled on the
1569  * powered-off CPU.  Note that this means that a TSTAT_CPU_ENABLED CPU that
1570  * is unconfigured/powered off and later powered back on/reconfigured will
1571  * _not_ be re-TSTAT_CPU_ENABLED.
1572  */
1573 static int
1574 trapstat_cpu_setup(cpu_setup_t what, processorid_t cpu)
1575 {
1576 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1577 
1578 	ASSERT(MUTEX_HELD(&cpu_lock));
1579 	mutex_enter(&tstat_lock);
1580 
1581 	if (!tstat_running) {
1582 		mutex_exit(&tstat_lock);
1583 		return (0);
1584 	}
1585 
1586 	switch (what) {
1587 	case CPU_CONFIG:
1588 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1589 		break;
1590 
1591 	case CPU_UNCONFIG:
1592 		if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED)
1593 			tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
1594 		break;
1595 
1596 	default:
1597 		break;
1598 	}
1599 
1600 	mutex_exit(&tstat_lock);
1601 	return (0);
1602 }
1603 
1604 /*
1605  * This is called before a CPR suspend and after a CPR resume.  We don't have
1606  * anything to do before a suspend, but after a restart we must restore the
1607  * trap table to be our interposing trap table.  However, we don't actually
1608  * know whether or not the CPUs have been powered off -- this routine may be
1609  * called while restoring from a failed CPR suspend.  We thus run through each
1610  * TSTAT_CPU_ENABLED CPU, and explicitly destroy and reestablish its
1611  * interposing trap table.  This assures that our state is correct regardless
1612  * of whether or not the CPU has been newly powered on.
1613  */
1614 /*ARGSUSED*/
1615 static boolean_t
1616 trapstat_cpr(void *arg, int code)
1617 {
1618 	cpu_t *cp;
1619 
1620 	if (code == CB_CODE_CPR_CHKPT)
1621 		return (B_TRUE);
1622 
1623 	ASSERT(code == CB_CODE_CPR_RESUME);
1624 
1625 	mutex_enter(&cpu_lock);
1626 	mutex_enter(&tstat_lock);
1627 
1628 	if (!tstat_running) {
1629 		mutex_exit(&tstat_lock);
1630 		mutex_exit(&cpu_lock);
1631 		return (B_TRUE);
1632 	}
1633 
1634 	cp = cpu_list;
1635 	do {
1636 		tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id];
1637 
1638 		if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
1639 			continue;
1640 
1641 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1642 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1643 
1644 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_disable, 0, 0);
1645 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1646 
1647 		/*
1648 		 * Preserve this CPU's data in tstat_buffer and rip down its
1649 		 * interposing trap table.
1650 		 */
1651 		bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
1652 		trapstat_teardown(cp->cpu_id);
1653 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
1654 
1655 		/*
1656 		 * Reestablish the interposing trap table and restore the old
1657 		 * data.
1658 		 */
1659 		trapstat_setup(cp->cpu_id);
1660 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1661 		bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
1662 
1663 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_enable, 0, 0);
1664 	} while ((cp = cp->cpu_next) != cpu_list);
1665 
1666 	mutex_exit(&tstat_lock);
1667 	mutex_exit(&cpu_lock);
1668 
1669 	return (B_TRUE);
1670 }
1671 
1672 /*ARGSUSED*/
1673 static int
1674 trapstat_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
1675 {
1676 	int i;
1677 
1678 	mutex_enter(&cpu_lock);
1679 	mutex_enter(&tstat_lock);
1680 	if (tstat_open != 0) {
1681 		mutex_exit(&tstat_lock);
1682 		mutex_exit(&cpu_lock);
1683 		return (EBUSY);
1684 	}
1685 
1686 	/*
1687 	 * Register this in open() rather than in attach() to prevent deadlock
1688 	 * with DR code. During attach, I/O device tree locks are grabbed
1689 	 * before trapstat_attach() is invoked - registering in attach
1690 	 * will result in the lock order: device tree lock, cpu_lock.
1691 	 * DR code however requires that cpu_lock be acquired before
1692 	 * device tree locks.
1693 	 */
1694 	ASSERT(!tstat_running);
1695 	register_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
1696 
1697 	/*
1698 	 * Clear all options.  And until specific CPUs are specified, we'll
1699 	 * mark all CPUs as selected.
1700 	 */
1701 	tstat_options = 0;
1702 
1703 	for (i = 0; i <= max_cpuid; i++)
1704 		tstat_percpu[i].tcpu_flags |= TSTAT_CPU_SELECTED;
1705 
1706 	/*
1707 	 * By default, all traps at TL=0 are enabled.  Traps at TL>0 must
1708 	 * be disabled.
1709 	 */
1710 	for (i = 0; i < TSTAT_TOTAL_NENT; i++)
1711 		tstat_enabled[i] = i < TSTAT_NENT ? 1 : 0;
1712 
1713 	tstat_open = 1;
1714 	mutex_exit(&tstat_lock);
1715 	mutex_exit(&cpu_lock);
1716 
1717 	return (0);
1718 }
1719 
1720 /*ARGSUSED*/
1721 static int
1722 trapstat_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
1723 {
1724 	(void) trapstat_stop();
1725 
1726 	ASSERT(!tstat_running);
1727 
1728 	mutex_enter(&cpu_lock);
1729 	unregister_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
1730 	mutex_exit(&cpu_lock);
1731 
1732 	tstat_open = 0;
1733 	return (DDI_SUCCESS);
1734 }
1735 
1736 static int
1737 trapstat_option(int option)
1738 {
1739 	mutex_enter(&tstat_lock);
1740 
1741 	if (tstat_running) {
1742 		mutex_exit(&tstat_lock);
1743 		return (EBUSY);
1744 	}
1745 
1746 	tstat_options |= option;
1747 	mutex_exit(&tstat_lock);
1748 
1749 	return (0);
1750 }
1751 
1752 /*ARGSUSED*/
1753 static int
1754 trapstat_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *crd, int *rval)
1755 {
1756 	int i, j, out;
1757 	size_t dsize;
1758 
1759 	switch (cmd) {
1760 	case TSTATIOC_GO:
1761 		return (trapstat_go());
1762 
1763 	case TSTATIOC_NOGO:
1764 		return (trapstat_option(TSTAT_OPT_NOGO));
1765 
1766 	case TSTATIOC_STOP:
1767 		return (trapstat_stop());
1768 
1769 	case TSTATIOC_CPU:
1770 		if (arg < 0 || arg > max_cpuid)
1771 			return (EINVAL);
1772 		/*FALLTHROUGH*/
1773 
1774 	case TSTATIOC_NOCPU:
1775 		mutex_enter(&tstat_lock);
1776 
1777 		if (tstat_running) {
1778 			mutex_exit(&tstat_lock);
1779 			return (EBUSY);
1780 		}
1781 
1782 		/*
1783 		 * If this is the first CPU to be specified (or if we are
1784 		 * being asked to explicitly de-select CPUs), disable all CPUs.
1785 		 */
1786 		if (!(tstat_options & TSTAT_OPT_CPU) || cmd == TSTATIOC_NOCPU) {
1787 			tstat_options |= TSTAT_OPT_CPU;
1788 
1789 			for (i = 0; i <= max_cpuid; i++) {
1790 				tstat_percpu_t *tcpu = &tstat_percpu[i];
1791 
1792 				ASSERT(cmd == TSTATIOC_NOCPU ||
1793 				    (tcpu->tcpu_flags & TSTAT_CPU_SELECTED));
1794 				tcpu->tcpu_flags &= ~TSTAT_CPU_SELECTED;
1795 			}
1796 		}
1797 
1798 		if (cmd == TSTATIOC_CPU)
1799 			tstat_percpu[arg].tcpu_flags |= TSTAT_CPU_SELECTED;
1800 
1801 		mutex_exit(&tstat_lock);
1802 
1803 		return (0);
1804 
1805 	case TSTATIOC_ENTRY:
1806 		mutex_enter(&tstat_lock);
1807 
1808 		if (tstat_running) {
1809 			mutex_exit(&tstat_lock);
1810 			return (EBUSY);
1811 		}
1812 
1813 		if (arg >= TSTAT_NENT || arg < 0) {
1814 			mutex_exit(&tstat_lock);
1815 			return (EINVAL);
1816 		}
1817 
1818 		if (!(tstat_options & TSTAT_OPT_ENTRY)) {
1819 			/*
1820 			 * If this is the first entry that we are explicitly
1821 			 * enabling, explicitly disable every TL=0 entry.
1822 			 */
1823 			for (i = 0; i < TSTAT_NENT; i++)
1824 				tstat_enabled[i] = 0;
1825 
1826 			tstat_options |= TSTAT_OPT_ENTRY;
1827 		}
1828 
1829 		tstat_enabled[arg] = 1;
1830 		mutex_exit(&tstat_lock);
1831 		return (0);
1832 
1833 	case TSTATIOC_NOENTRY:
1834 		mutex_enter(&tstat_lock);
1835 
1836 		if (tstat_running) {
1837 			mutex_exit(&tstat_lock);
1838 			return (EBUSY);
1839 		}
1840 
1841 		for (i = 0; i < TSTAT_NENT; i++)
1842 			tstat_enabled[i] = 0;
1843 
1844 		mutex_exit(&tstat_lock);
1845 		return (0);
1846 
1847 	case TSTATIOC_READ:
1848 		mutex_enter(&tstat_lock);
1849 
1850 		if (tstat_options & TSTAT_OPT_TLBDATA) {
1851 			dsize = tstat_data_t_exported_size;
1852 		} else {
1853 			dsize = sizeof (tstat_data_t);
1854 		}
1855 
1856 		for (i = 0, out = 0; i <= max_cpuid; i++) {
1857 			tstat_percpu_t *tcpu = &tstat_percpu[i];
1858 
1859 			if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
1860 				continue;
1861 
1862 			ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1863 			ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1864 
1865 			tstat_buffer->tdata_cpuid = -1;
1866 			xc_one(i, (xcfunc_t *)trapstat_snapshot, 0, 0);
1867 
1868 			if (tstat_buffer->tdata_cpuid == -1) {
1869 				/*
1870 				 * This CPU is not currently responding to
1871 				 * cross calls; we have caught it while it is
1872 				 * being unconfigured.  We'll drop tstat_lock
1873 				 * and pick up and drop cpu_lock.  By the
1874 				 * time we acquire cpu_lock, the DR operation
1875 				 * will appear consistent and we can assert
1876 				 * that trapstat_cpu_setup() has cleared
1877 				 * TSTAT_CPU_ENABLED.
1878 				 */
1879 				mutex_exit(&tstat_lock);
1880 				mutex_enter(&cpu_lock);
1881 				mutex_exit(&cpu_lock);
1882 				mutex_enter(&tstat_lock);
1883 				ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1884 				continue;
1885 			}
1886 
1887 			/*
1888 			 * Need to compensate for the difference between page
1889 			 * sizes exported to users and page sizes available
1890 			 * within the kernel.
1891 			 */
1892 			if ((tstat_options & TSTAT_OPT_TLBDATA) &&
1893 			    (tstat_pgszs != tstat_user_pgszs)) {
1894 				tstat_pgszdata_t *tp;
1895 				uint_t szc;
1896 
1897 				tp = &tstat_buffer->tdata_pgsz[0];
1898 				for (j = 0; j < tstat_user_pgszs; j++) {
1899 					if ((szc = USERSZC_2_SZC(j)) != j) {
1900 						bcopy(&tp[szc], &tp[j],
1901 						    sizeof (tstat_pgszdata_t));
1902 					}
1903 				}
1904 			}
1905 
1906 			if (copyout(tstat_buffer, (void *)arg, dsize) != 0) {
1907 				mutex_exit(&tstat_lock);
1908 				return (EFAULT);
1909 			}
1910 
1911 			out++;
1912 			arg += dsize;
1913 		}
1914 
1915 		if (out != max_cpuid + 1) {
1916 			processorid_t cpuid = -1;
1917 			arg += offsetof(tstat_data_t, tdata_cpuid);
1918 
1919 			if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0) {
1920 				mutex_exit(&tstat_lock);
1921 				return (EFAULT);
1922 			}
1923 		}
1924 
1925 		mutex_exit(&tstat_lock);
1926 
1927 		return (0);
1928 
1929 	case TSTATIOC_TLBDATA:
1930 		return (trapstat_option(TSTAT_OPT_TLBDATA));
1931 
1932 	default:
1933 		break;
1934 	}
1935 
1936 	return (ENOTTY);
1937 }
1938 
1939 /*ARGSUSED*/
1940 static int
1941 trapstat_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1942 {
1943 	int error;
1944 
1945 	switch (infocmd) {
1946 	case DDI_INFO_DEVT2DEVINFO:
1947 		*result = (void *)tstat_devi;
1948 		error = DDI_SUCCESS;
1949 		break;
1950 	case DDI_INFO_DEVT2INSTANCE:
1951 		*result = (void *)0;
1952 		error = DDI_SUCCESS;
1953 		break;
1954 	default:
1955 		error = DDI_FAILURE;
1956 	}
1957 	return (error);
1958 }
1959 
1960 static int
1961 trapstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
1962 {
1963 	switch (cmd) {
1964 	case DDI_ATTACH:
1965 		break;
1966 
1967 	case DDI_RESUME:
1968 		return (DDI_SUCCESS);
1969 
1970 	default:
1971 		return (DDI_FAILURE);
1972 	}
1973 
1974 	if (ddi_create_minor_node(devi, "trapstat", S_IFCHR,
1975 	    0, DDI_PSEUDO, 0) == DDI_FAILURE) {
1976 		ddi_remove_minor_node(devi, NULL);
1977 		return (DDI_FAILURE);
1978 	}
1979 
1980 	ddi_report_dev(devi);
1981 	tstat_devi = devi;
1982 
1983 	tstat_pgszs = page_num_pagesizes();
1984 	tstat_user_pgszs = page_num_user_pagesizes();
1985 	tstat_data_t_size = sizeof (tstat_data_t) +
1986 	    (tstat_pgszs - 1) * sizeof (tstat_pgszdata_t);
1987 	tstat_data_t_exported_size = sizeof (tstat_data_t) +
1988 	    (tstat_user_pgszs - 1) * sizeof (tstat_pgszdata_t);
1989 #ifndef sun4v
1990 	tstat_data_pages = (tstat_data_t_size >> MMU_PAGESHIFT) + 1;
1991 	tstat_total_pages = TSTAT_INSTR_PAGES + tstat_data_pages;
1992 	tstat_data_size = tstat_data_pages * MMU_PAGESIZE;
1993 	tstat_total_size = TSTAT_INSTR_SIZE + tstat_data_size;
1994 #else
1995 	tstat_data_pages = 0;
1996 	tstat_data_size = tstat_data_t_size;
1997 	tstat_total_pages = ((TSTAT_INSTR_SIZE + tstat_data_size) >>
1998 		MMU_PAGESHIFT) + 1;
1999 	tstat_total_size = tstat_total_pages * MMU_PAGESIZE;
2000 #endif
2001 
2002 	tstat_percpu = kmem_zalloc((max_cpuid + 1) *
2003 	    sizeof (tstat_percpu_t), KM_SLEEP);
2004 
2005 	/*
2006 	 * Create our own arena backed by segkmem to assure a source of
2007 	 * MMU_PAGESIZE-aligned allocations.  We allocate out of the
2008 	 * heap32_arena to assure that we can address the allocated memory with
2009 	 * a single sethi/simm13 pair in the interposing trap table entries.
2010 	 */
2011 	tstat_arena = vmem_create("trapstat", NULL, 0, MMU_PAGESIZE,
2012 	    segkmem_alloc_permanent, segkmem_free, heap32_arena, 0, VM_SLEEP);
2013 
2014 	tstat_enabled = kmem_alloc(TSTAT_TOTAL_NENT * sizeof (int), KM_SLEEP);
2015 	tstat_buffer = kmem_alloc(tstat_data_t_size, KM_SLEEP);
2016 
2017 	/*
2018 	 * CB_CL_CPR_POST_USER is the class that executes from cpr_resume()
2019 	 * after user threads can be restarted.  By executing in this class,
2020 	 * we are assured of the availability of system services needed to
2021 	 * resume trapstat (specifically, we are assured that all CPUs are
2022 	 * restarted and responding to cross calls).
2023 	 */
2024 	tstat_cprcb =
2025 	    callb_add(trapstat_cpr, NULL, CB_CL_CPR_POST_USER, "trapstat");
2026 
2027 	return (DDI_SUCCESS);
2028 }
2029 
2030 static int
2031 trapstat_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
2032 {
2033 	int rval;
2034 
2035 	ASSERT(devi == tstat_devi);
2036 
2037 	switch (cmd) {
2038 	case DDI_DETACH:
2039 		break;
2040 
2041 	case DDI_SUSPEND:
2042 		return (DDI_SUCCESS);
2043 
2044 	default:
2045 		return (DDI_FAILURE);
2046 	}
2047 
2048 	ASSERT(!tstat_running);
2049 
2050 	rval = callb_delete(tstat_cprcb);
2051 	ASSERT(rval == 0);
2052 
2053 	kmem_free(tstat_buffer, tstat_data_t_size);
2054 	kmem_free(tstat_enabled, TSTAT_TOTAL_NENT * sizeof (int));
2055 	vmem_destroy(tstat_arena);
2056 	kmem_free(tstat_percpu, (max_cpuid + 1) * sizeof (tstat_percpu_t));
2057 	ddi_remove_minor_node(devi, NULL);
2058 
2059 	return (DDI_SUCCESS);
2060 }
2061 
2062 /*
2063  * Configuration data structures
2064  */
2065 static struct cb_ops trapstat_cb_ops = {
2066 	trapstat_open,		/* open */
2067 	trapstat_close,		/* close */
2068 	nulldev,		/* strategy */
2069 	nulldev,		/* print */
2070 	nodev,			/* dump */
2071 	nodev,			/* read */
2072 	nodev,			/* write */
2073 	trapstat_ioctl,		/* ioctl */
2074 	nodev,			/* devmap */
2075 	nodev,			/* mmap */
2076 	nodev,			/* segmap */
2077 	nochpoll,		/* poll */
2078 	ddi_prop_op,		/* cb_prop_op */
2079 	0,			/* streamtab */
2080 	D_MP | D_NEW		/* Driver compatibility flag */
2081 };
2082 
2083 static struct dev_ops trapstat_ops = {
2084 	DEVO_REV,		/* devo_rev, */
2085 	0,			/* refcnt */
2086 	trapstat_info,		/* getinfo */
2087 	nulldev,		/* identify */
2088 	nulldev,		/* probe */
2089 	trapstat_attach,	/* attach */
2090 	trapstat_detach,	/* detach */
2091 	nulldev,		/* reset */
2092 	&trapstat_cb_ops,	/* cb_ops */
2093 	(struct bus_ops *)0,	/* bus_ops */
2094 };
2095 
2096 static struct modldrv modldrv = {
2097 	&mod_driverops,		/* Type of module.  This one is a driver */
2098 	"Trap Statistics",	/* name of module */
2099 	&trapstat_ops,		/* driver ops */
2100 };
2101 
2102 static struct modlinkage modlinkage = {
2103 	MODREV_1, (void *)&modldrv, NULL
2104 };
2105 
2106 int
2107 _init(void)
2108 {
2109 	return (mod_install(&modlinkage));
2110 }
2111 
2112 int
2113 _fini(void)
2114 {
2115 	return (mod_remove(&modlinkage));
2116 }
2117 
2118 int
2119 _info(struct modinfo *modinfop)
2120 {
2121 	return (mod_info(&modlinkage, modinfop));
2122 }
2123