xref: /titanic_50/usr/src/uts/sun4/io/trapstat.c (revision 532877c46d04a2d0b254f9b5797720078adcea07)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/systm.h>
29 #include <sys/conf.h>
30 #include <sys/stat.h>
31 #include <sys/ddi.h>
32 #include <sys/sunddi.h>
33 #include <sys/modctl.h>
34 #include <sys/cpu_module.h>
35 #include <vm/hat_sfmmu.h>
36 #include <vm/seg_kmem.h>
37 #include <vm/seg_kpm.h>
38 #include <vm/vm_dep.h>
39 #include <sys/machsystm.h>
40 #include <sys/machasi.h>
41 #include <sys/sysmacros.h>
42 #include <sys/callb.h>
43 #include <sys/archsystm.h>
44 #include <sys/trapstat.h>
45 #ifdef sun4v
46 #include <sys/hypervisor_api.h>
47 #endif
48 #ifndef sun4v
49 #include <sys/pghw.h>
50 #endif
51 
52 /* BEGIN CSTYLED */
53 /*
54  * trapstat:  Trap Statistics through Dynamic Trap Table Interposition
55  * -------------------------------------------------------------------
56  *
57  * Motivation and Overview
58  *
59  * Despite being a fundamental indicator of system behavior, there has
60  * historically been very little insight provided into the frequency and cost
61  * of machine-specific traps.  The lack of insight has been especially acute
62  * on UltraSPARC microprocessors:  because these microprocessors handle TLB
63  * misses as software traps, the frequency and duration of traps play a
64  * decisive role in the performance of the memory system.  As applications have
65  * increasingly outstripped TLB reach, this has become increasingly true.
66  *
67  * Part of the difficulty of observing trap behavior is that the trap handlers
68  * are so frequently called (e.g. millions of times per second) that any
69  * permanently enabled instrumentation would induce an unacceptable performance
70  * degradation.  Thus, it is a constraint on any trap observability
71  * infrastructure that it have no probe effect when not explicitly enabled.
72  *
73  * The basic idea, then, is to create an interposing trap table in which each
74  * entry increments a per-trap, in-memory counter and then jumps to the actual,
75  * underlying trap table entry.  To enable trapstat, we atomically write to the
76  * trap base address (%tba) register to point to our interposing trap table.
77  * (Note that per-CPU statistics fall out by creating a different trap table
78  * for each CPU.)
79  *
80  * Implementation Details
81  *
82  * While the idea is straight-forward, a nuance of SPARC V9 slightly
83  * complicates the implementation.  Unlike its predecessors, SPARC V9 supports
84  * the notion of nested traps.  The trap level is kept in the TL register:
85  * during normal operation it is 0; when a trap is taken, the TL register is
86  * incremented by 1.  To aid system software, SPARC V9 breaks the trap table
87  * into two halves:  the lower half contains the trap handlers for traps taken
88  * when TL is 0; the upper half contains the trap handlers for traps taken
89  * when TL is greater than 0.  Each half is further subdivided into two
90  * subsequent halves:  the lower half contains the trap handlers for traps
91  * other than those induced by the trap instruction (Tcc variants); the upper
92  * half contains the trap handlers for traps induced by the trap instruction.
93  * This gives a total of four ranges, with each range containing 256 traps:
94  *
95  *       +--------------------------------+- 3ff
96  *       |                                |   .
97  *       |     Trap instruction, TL>0     |   .
98  *       |                                |   .
99  *       |- - - - - - - - - - - - - - - - +- 300
100  *       |- - - - - - - - - - - - - - - - +- 2ff
101  *       |                                |   .
102  *       |   Non-trap instruction, TL>0   |   .
103  *       |                                |   .
104  *       |- - - - - - - - - - - - - - - - +- 200
105  *       |- - - - - - - - - - - - - - - - +- 1ff
106  *       |                                |   .
107  *       |     Trap instruction, TL=0     |   .
108  *       |                                |   .
109  *       |- - - - - - - - - - - - - - - - +- 100
110  *       |- - - - - - - - - - - - - - - - +- 0ff
111  *       |                                |   .
112  *       |   Non-trap instruction, TL=0   |   .
113  *       |                                |   .
114  *       +--------------------------------+- 000
115  *
116  *
117  * Solaris, however, doesn't have reason to support trap instructions when
118  * TL>0 (only privileged code may execute at TL>0; not supporting this only
119  * constrains our own implementation).  The trap table actually looks like:
120  *
121  *       +--------------------------------+- 2ff
122  *       |                                |   .
123  *       |   Non-trap instruction, TL>0   |   .
124  *       |                                |   .
125  *       |- - - - - - - - - - - - - - - - +- 200
126  *       |- - - - - - - - - - - - - - - - +- 1ff
127  *       |                                |   .
128  *       |     Trap instruction, TL=0     |   .
129  *       |                                |   .
130  *       |- - - - - - - - - - - - - - - - +- 100
131  *       |- - - - - - - - - - - - - - - - +- 0ff
132  *       |                                |   .
133  *       |   Non-trap instruction, TL=0   |   .
134  *       |                                |   .
135  *       +--------------------------------+- 000
136  *
137  * Putatively to aid system software, SPARC V9 has the notion of multiple
138  * sets of global registers.  UltraSPARC defines four sets of global
139  * registers:
140  *
141  *    Normal Globals
142  *    Alternate Globals (AGs)
143  *    MMU Globals (MGs)
144  *    Interrupt Globals (IGs)
145  *
146  * The set of globals in use is controlled by bits in PSTATE; when TL is 0
147  * (and PSTATE has not been otherwise explicitly modified), the Normal Globals
148  * are in use.  When a trap is issued, PSTATE is modified to point to a set of
149  * globals corresponding to the trap type.  Most traps correspond to the
150  * Alternate Globals, with a minority corresponding to the MMU Globals, and
151  * only the interrupt-vector trap (vector 0x60) corresponding to the Interrupt
152  * Globals.  (The complete mapping can be found in the UltraSPARC I&II User's
153  * Manual.)
154  *
155  * Note that the sets of globals are per trap _type_, not per trap _level_.
156  * Thus, when executing a TL>0 trap handler, one may not have registers
157  * available (for example, both trap-instruction traps and spill traps execute
158  * on the alternate globals; if a trap-instruction trap induces a window spill,
159  * the window spill handler has no available globals).  For trapstat, this is
160  * problematic:  a register is required to transfer control from one arbitrary
161  * location (in the interposing trap table) to another (in the actual trap
162  * table).
163  *
164  * We solve this problem by exploiting the trap table's location at the bottom
165  * of valid kernel memory (i.e. at KERNELBASE).  We locate the interposing trap
166  * tables just below KERNELBASE -- thereby allowing us to use a branch-always
167  * instruction (ba) instead of a jump instruction (jmp) to transfer control
168  * from the TL>0 entries in the interposing trap table to the TL>0 entries in
169  * the actual trap table.  (N.B. while this allows trap table interposition to
170  * work, it necessarily limits trapstat to only recording information about
171  * TL=0 traps -- there is no way to increment a counter without using a
172  * register.)  Diagrammatically:
173  *
174  *  Actual trap table:
175  *
176  *       +--------------------------------+- 2ff
177  *       |                                |   .
178  *       |   Non-trap instruction, TL>0   |   .   <-----------------------+
179  *       |                                |   .   <-----------------------|-+
180  *       |- - - - - - - - - - - - - - - - +- 200  <-----------------------|-|-+
181  *       |- - - - - - - - - - - - - - - - +- 1ff                          | | |
182  *       |                                |   .                           | | |
183  *       |     Trap instruction, TL=0     |   .   <-----------------+     | | |
184  *       |                                |   .   <-----------------|-+   | | |
185  *       |- - - - - - - - - - - - - - - - +- 100  <-----------------|-|-+ | | |
186  *       |- - - - - - - - - - - - - - - - +- 0ff                    | | | | | |
187  *       |                                |   .                     | | | | | |
188  *       |   Non-trap instruction, TL=0   |   .   <-----------+     | | | | | |
189  *       |                                |   .   <-----------|-+   | | | | | |
190  *       +--------------------------------+- 000  <-----------|-|-+ | | | | | |
191  *        KERNELBASE                                          | | | | | | | | |
192  *                                                            | | | | | | | | |
193  *                                                            | | | | | | | | |
194  *  Interposing trap table:                                   | | | | | | | | |
195  *                                                            | | | | | | | | |
196  *       +--------------------------------+- 2ff              | | | | | | | | |
197  *       |  ...                           |   .               | | | | | | | | |
198  *       |  ...                           |   .               | | | | | | | | |
199  *       |  ...                           |   .               | | | | | | | | |
200  *       |- - - - - - - - - - - - - - - - +- 203              | | | | | | | | |
201  *       |  ba,a                          |      -------------|-|-|-|-|-|-+ | |
202  *       |- - - - - - - - - - - - - - - - +- 202              | | | | | |   | |
203  *       |  ba,a                          |      -------------|-|-|-|-|-|---+ |
204  *       |- - - - - - - - - - - - - - - - +- 201              | | | | | |     |
205  *       |  ba,a                          |      -------------|-|-|-|-|-|-----+
206  *       |- - - - - - - - - - - - - - - - +- 200              | | | | | |
207  *       |  ...                           |   .               | | | | | |
208  *       |  ...                           |   .               | | | | | |
209  *       |  ...                           |   .               | | | | | |
210  *       |- - - - - - - - - - - - - - - - +- 103              | | | | | |
211  *       |  (Increment counter)           |                   | | | | | |
212  *       |  ba,a                          |      -------------------+ | |
213  *       |- - - - - - - - - - - - - - - - +- 102              | | |   | |
214  *       |  (Increment counter)           |                   | | |   | |
215  *       |  ba,a                          |      ---------------------+ |
216  *       |- - - - - - - - - - - - - - - - +- 101              | | |     |
217  *       |  (Increment counter)           |                   | | |     |
218  *       |  ba,a                          |      -----------------------+
219  *       |- - - - - - - - - - - - - - - - +- 100              | | |
220  *       |  ...                           |   .               | | |
221  *       |  ...                           |   .               | | |
222  *       |  ...                           |   .               | | |
223  *       |- - - - - - - - - - - - - - - - +- 003              | | |
224  *       |  (Increment counter)           |                   | | |
225  *       |  ba,a                          |      -------------+ | |
226  *       |- - - - - - - - - - - - - - - - +- 002                | |
227  *       |  (Increment counter)           |                     | |
228  *       |  ba,a                          |      ---------------+ |
229  *       |- - - - - - - - - - - - - - - - +- 001                  |
230  *       |  (Increment counter)           |                       |
231  *       |  ba,a                          |      -----------------+
232  *       +--------------------------------+- 000
233  *        KERNELBASE - tstat_total_size
234  *
235  * tstat_total_size is the number of bytes required for each trap table.  An
236  * interposing trap table must lie within the maximum branch displacement of
237  * the actual trap table; if each CPU were to consume a disjoint virtual range
238  * below KERNELBASE for its trap table, we could support at most
239  * (maximum_branch_displacement / tstat_total_size) CPUs.  The maximum branch
240  * displacement for Bicc variants is just under eight megabytes, and (because
241  * the %tba must be 32K aligned), tstat_total_size must be at least 32K; if
242  * each CPU were to consume a disjoint virtual range, we would have an
243  * unacceptably low upper bound of 256 CPUs.
244  *
245  * While there are tricks that one could use to address this constraint (e.g.,
246  * creating trampolines every maximum_branch_displacement bytes), we instead
247  * solve this by not permitting each CPU to consume a disjoint virtual range.
248  * Rather, we have each CPU's interposing trap table use the _same_ virtual
249  * range, but we back the trap tables with disjoint physical memory.  Normally,
250  * such one-to-many virtual-to-physical mappings are illegal; this is
251  * permissible here only because the pages for the interposing trap table are
252  * necessarily locked in the TLB.  (The CPUs thus never have the opportunity to
253  * discover that they have conflicting translations.)
254  *
255  * On CMT architectures in which CPUs can share MMUs, the above trick will not
256  * work: two CPUs that share an MMU cannot have the same virtual address map
257  * to disjoint physical pages.  On these architectures, any CPUs sharing the
258  * same MMU must consume a disjoint 32K virtual address range -- limiting the
259  * number of CPUs sharing an MMU on these architectures to 256 due to the
260  * branch displacement limitation described above.  On the sun4v architecture,
261  * there is a further limitation: a guest may not have more than eight locked
262  * TLB entries per MMU.  To allow operation under this restriction, the
263  * interposing trap table and the trap statistics are each accessed through
264  * a single 4M TLB entry.  This limits the footprint to two locked entries
265  * (one for the I-TLB and one for the D-TLB), but further restricts the number
266  * of CPUs to 128 per MMU.  However, support for more than 128 CPUs can easily
267  * be added via a hybrid scheme, where the same 4M virtual address is used
268  * on different MMUs.
269  *
270  *
271  * TLB Statistics
272  *
273  * Because TLB misses are an important component of system performance, we wish
274  * to know much more about these traps than simply the number received.
275  * Specifically, we wish to know:
276  *
277  *  (a)	The amount of time spent executing the TLB miss handler
278  *  (b)	TLB misses versus TSB misses
279  *  (c) Kernel-level misses versus user-level misses
280  *  (d) Misses per pagesize
281  *
282  * TLB Statistics: Time Spent Executing
283  *
284  * To accurately determine the amount of time spent executing the TLB miss
285  * handler, one must get a timestamp on trap entry and trap exit, subtract the
286  * former from the latter, and add the result to an accumulating count.
287  * Consider the flow of control during normal TLB miss processing (where "ldx
288  * [%g2], %g2" is an arbitrary TLB-missing instruction):
289  *
290  * + - - - - - - - -+
291  * :                :
292  * : ldx [%g2], %g2 :<-------------------------------------------------------+
293  * :                :              Return from trap:                         |
294  * + - - - - - - - -+                TL <- TL - 1 (0)                        |
295  *	  |                          %pc <- TSTATE[TL].TPC (address of load) |
296  *	  | TLB miss:                                                        |
297  *        |   TL <- TL + 1 (1)                                               |
298  *        |   %pc <- TLB-miss-trap-handler                                   |
299  *        |                                                                  |
300  *        v                                                                  |
301  * + - - - - - - - - - - - - - - - +                                         |
302  * :                               :                                         |
303  * : Lookup VA in TSB              :                                         |
304  * : If (hit)                      :                                         |
305  * :     Fill TLB                  :                                         |
306  * : Else                          :                                         |
307  * :     Lookup VA (hme hash table :                                         |
308  * :                or segkpm)     :                                         |
309  * :     Fill TLB                  :                                         |
310  * : Endif                         :                                         |
311  * : Issue "retry"  ---------------------------------------------------------+
312  * :                               :
313  * + - - - - - - - - - - - - - - - +
314  *  TLB-miss-trap-handler
315  *
316  *
317  * As the above diagram indicates, interposing on the trap table allows one
318  * only to determine a timestamp on trap _entry_:  when the TLB miss handler
319  * has completed filling the TLB, a "retry" will be issued, and control will
320  * transfer immediately back to the missing %pc.
321  *
322  * To obtain a timestamp on trap exit, we must then somehow interpose between
323  * the "retry" and the subsequent control transfer to the TLB-missing
324  * instruction.  To do this, we _push_ a trap level.  The basic idea is to
325  * spoof a TLB miss by raising TL, setting the %tpc to be within text
326  * controlled by trapstat (the "TLB return entry") and branching to the
327  * underlying TLB miss handler.  When the TLB miss handler issues its "retry",
328  * control will transfer not to the TLB-missing instruction, but rather to the
329  * TLB return entry.  This code can then obtain a timestamp, and issue its own
330  * "retry" -- thereby correctly returning to the TLB-missing instruction.
331  * Here is the above TLB miss flow control diagram modified to reflect
332  * trapstat's operation:
333  *
334  * + - - - - - - - -+
335  * :                :
336  * : ldx [%g2], %g2 :<-------------------------------------------------------+
337  * :                :             Return from trap:                          |
338  * + - - - - - - - -+               TL <- TL - 1 (0)                         |
339  *	  |                         %pc <- TSTATE[TL].TPC (address of load)  |
340  *	  | TLB miss:                                                        |
341  *        |   TL <- TL + 1 (1)                                               |
342  *        |   %pc <- TLB-miss-trap-handler (trapstat)                        |
343  *        |                                                                  |
344  *        v                                    TLB-return-entry (trapstat)   |
345  * + - - - - - - - - - - - - - - - - - - +    + - - - - - - - - - - - - - +  |
346  * :                                     :    :                           :  |
347  * : Record timestamp                    :    : Record timestamp          :  |
348  * : TL <- 2                             :    : Take timestamp difference :  |
349  * : TSTATE[1].TPC <- TLB-return-entry   :    : Add to running total      :  |
350  * : ba,a TLB-miss-trap-handler -----------+  : Issue "retry"  --------------+
351  * :                                     : |  :                           :
352  * + - - - - - - - - - - - - - - - - - - + |  + - - - - - - - - - - - - - +
353  *  TLB-miss-trap-handler	           |                  ^
354  *  (trapstat)                             |                  |
355  *                                         |                  |
356  *                                         |                  |
357  *                 +-----------------------+                  |
358  *                 |                                          |
359  *                 |                                          |
360  *                 v                                          |
361  * + - - - - - - - - - - - - - - - +                          |
362  * :                               :                          |
363  * : Lookup VA in TSB              :                          |
364  * : If (hit)                      :                          |
365  * :     Fill TLB                  :                          |
366  * : Else                          :                          |
367  * :     Lookup VA (hme hash table :                          |
368  * :                or segkpm)     :                          |
369  * :     Fill TLB                  :                          |
370  * : Endif                         :                          |
371  * : Issue "retry"  ------------------------------------------+
372  * :                               : Return from trap:
373  * + - - - - - - - - - - - - - - - +   TL <- TL - 1 (1)
374  *  TLB-miss-trap-handler              %pc <- TSTATE[TL].TPC (TLB-return-entry)
375  *
376  *
377  * A final subterfuge is required to complete our artifice:  if we miss in
378  * the TLB, the TSB _and_ the subsequent hash or segkpm lookup (that is, if
379  * there is no valid translation for the TLB-missing address), common system
380  * software will need to accurately determine the %tpc as part of its page
381  * fault handling. We therefore modify the kernel to check the %tpc in this
382  * case: if the %tpc falls within the VA range controlled by trapstat and
383  * the TL is 2, TL is simply lowered back to 1 (this check is implemented
384  * by the TSTAT_CHECK_TL1 macro).  Lowering TL to 1 has the effect of
385  * discarding the state pushed by trapstat.
386  *
387  * TLB Statistics: TLB Misses versus TSB Misses
388  *
389  * Distinguishing TLB misses from TSB misses requires further interposition
390  * on the TLB miss handler:  we cannot know a priori or a posteriori if a
391  * given VA will or has hit in the TSB.
392  *
393  * We achieve this distinction by adding a second TLB return entry almost
394  * identical to the first -- differing only in the address to which it
395  * stores its results.  We then modify the TLB miss handlers of the kernel
396  * such that they check the %tpc when they determine that a TLB miss has
397  * subsequently missed in the TSB:  if the %tpc lies within trapstat's VA
398  * range and TL is 2 (that is, if trapstat is running), the TLB miss handler
399  * _increments_ the %tpc by the size of the TLB return entry.  The ensuing
400  * "retry" will thus transfer control to the second TLB return entry, and
401  * the time spent in the handler will be accumulated in a memory location
402  * specific to TSB misses.
403  *
404  * N.B.:  To minimize the amount of knowledge the kernel must have of trapstat,
405  * we do not allow the kernel to hard-code the size of the TLB return entry.
406  * Rather, the actual tsbmiss handler executes a known instruction at the
407  * corresponding tsbmiss patch points (see the tstat_tsbmiss_patch_table) with
408  * the %tpc in %g7:  when trapstat is not running, these points contain the
409  * harmless TSTAT_TSBMISS_INSTR instruction ("add %g7, 0, %g7"). Before
410  * running, trapstat modifies the instructions at these patch points such
411  * that the simm13 equals the size of the TLB return entry.
412  *
413  * TLB Statistics: Kernel-level Misses versus User-level Misses
414  *
415  * Differentiating user-level misses from kernel-level misses employs a
416  * similar technique, but is simplified by the ability to distinguish a
417  * user-level miss from a kernel-level miss a priori by reading the context
418  * register:  we implement kernel-/user-level differentiation by again doubling
419  * the number of TLB return entries, and setting the %tpc to the appropriate
420  * TLB return entry in trapstat's TLB miss handler.  Together with the doubling
421  * of entries required for TLB-miss/TSB-miss differentiation, this yields a
422  * total of four TLB return entries:
423  *
424  *	Level		TSB hit?	Structure member
425  *	------------------------------------------------------------
426  *	Kernel		Yes		tstat_tlbret_t.ttlbr_ktlb
427  *	Kernel		No		tstat_tlbret_t.ttlbr_ktsb
428  *	User		Yes		tstat_tlbret_t.ttlbr_utlb
429  *	User		No		tstat_tlbret_t.ttlbr_utsb
430  *
431  * TLB Statistics: Misses per Pagesize
432  *
433  * As with the TLB-/TSB-miss differentiation, we have no way of determining
434  * pagesize a priori.  This is therefore implemented by mandating a new rule:
435  * whenever the kernel fills the TLB in its TLB miss handler, the TTE
436  * corresponding to the TLB-missing VA must be in %g5 when the handler
437  * executes its "retry".  This allows the TLB return entry to determine
438  * pagesize by simply looking at the pagesize field in the TTE stored in
439  * %g5.
440  *
441  * TLB Statistics: Probe Effect
442  *
443  * As one might imagine, gathering TLB statistics by pushing a trap level
444  * induces significant probe effect.  To account for this probe effect,
445  * trapstat attempts to observe it by executing a code sequence with a known
446  * number of TLB misses both before and after interposing on the trap table.
447  * This allows trapstat to determine a per-trap probe effect which can then be
448  * factored into the "%tim" fields of the trapstat command.
449  *
450  * Note that on sun4v platforms, TLB misses are normally handled by the
451  * hypervisor or the hardware TSB walker. Thus no fast MMU miss information
452  * is reported for normal operation. However, when trapstat is invoked
453  * with the -t or -T option to collect detailed TLB statistics, the kernel
454  * takes over TLB miss handling. This results in significantly more overhead
455  * and TLB statistics may not be as accurate as on sun4u platforms.
456  * On some processors, the hypervisor or hardware may provide a low-overhead
457  * interface to collect TSB hit statistics. This support is exposed via
458  * a well-defined CPU module interface (cpu_trapstat_conf to enable this
459  * interface and cpu_trapstat_data to get detailed TSB hit statistics).
460  * In this scenario, TSB miss statistics are collected by intercepting the
461  * IMMU_miss and DMMU_miss traps using the above-mentioned trap interposition
462  * approach.
463  *
464  * Locking
465  *
466  * The implementation uses two locks:  tstat_lock (a local lock) and the global
467  * cpu_lock.  tstat_lock is used to assure trapstat's consistency in the
468  * presence of multithreaded /dev/trapstat consumers (while as of this writing
469  * the only consumer of /dev/trapstat is single threaded, it is obviously
470  * necessary to correctly support multithreaded access).  cpu_lock is held
471  * whenever CPUs are being manipulated directly, to prevent them from
472  * disappearing in the process.  Because trapstat's DR callback
473  * (trapstat_cpu_setup()) must grab tstat_lock and is called with cpu_lock
474  * held, the lock ordering is necessarily cpu_lock before tstat_lock.
475  *
476  */
477 /* END CSTYLED */
478 
479 static dev_info_t	*tstat_devi;	/* saved at attach for getinfo */
480 static int		tstat_open;	/* set if driver is open */
481 static kmutex_t		tstat_lock;	/* serialize access */
482 static vmem_t		*tstat_arena;	/* arena for TLB-locked pages */
483 static tstat_percpu_t	*tstat_percpu;	/* per-CPU data */
484 static int		tstat_running;	/* set if trapstat is running */
485 static tstat_data_t	*tstat_buffer;	/* staging buffer for outgoing data */
486 static int		tstat_options;	/* bit-wise indication of options */
487 static int		*tstat_enabled;	/* map of enabled trap entries */
488 static int		tstat_tsbmiss_patched; /* tsbmiss patch flag */
489 static callb_id_t	tstat_cprcb;	/* CPR callback */
490 static char		*tstat_probe_area; /* VA range used for probe effect */
491 static caddr_t		tstat_probe_phys; /* physical to back above VA */
492 static hrtime_t		tstat_probe_time; /* time spent on probe effect */
493 static hrtime_t		tstat_probe_before[TSTAT_PROBE_NLAPS];
494 static hrtime_t		tstat_probe_after[TSTAT_PROBE_NLAPS];
495 static uint_t		tstat_pgszs;		/* # of kernel page sizes */
496 static uint_t		tstat_user_pgszs;	/* # of user page sizes */
497 
498 /*
499  * sizeof tstat_data_t + pgsz data for the kernel.  For simplicity's sake, when
500  * we collect data, we do it based upon szc, but when we report data back to
501  * userland, we have to do it based upon the userszc which may not match.
502  * So, these two variables are for internal use and exported use respectively.
503  */
504 static size_t		tstat_data_t_size;	/* internal, szc-based */
505 static size_t		tstat_data_t_exported_size; /* exported to userland */
506 
507 static size_t		tstat_data_pages;  /* number of pages of tstat data */
508 static size_t		tstat_data_size;   /* tstat data size in bytes */
509 static size_t		tstat_total_pages; /* #data pages + #instr pages */
510 static size_t		tstat_total_size;  /* tstat data size + instr size */
511 #ifdef sun4v
512 static caddr_t		tstat_va;	/* VA of memory reserved for TBA */
513 static pfn_t		tstat_pfn;	/* PFN of memory reserved for TBA */
514 static boolean_t	tstat_fast_tlbstat = B_FALSE;
515 #endif
516 
517 /*
518  * In the above block comment, see "TLB Statistics: TLB Misses versus
519  * TSB Misses" for an explanation of the tsbmiss patch points.
520  */
521 extern uint32_t		tsbmiss_trapstat_patch_point;
522 extern uint32_t		tsbmiss_trapstat_patch_point_kpm;
523 extern uint32_t		tsbmiss_trapstat_patch_point_kpm_small;
524 
525 /*
526  * Trapstat tsbmiss patch table:  one entry per patch point, ending in NULL
527  */
528 tstat_tsbmiss_patch_entry_t tstat_tsbmiss_patch_table[] = {
529 	{(uint32_t *)&tsbmiss_trapstat_patch_point, 0},
530 	{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm, 0},
531 	{(uint32_t *)&tsbmiss_trapstat_patch_point_kpm_small, 0},
532 	{(uint32_t *)NULL, 0}
533 };
534 
535 /*
536  * We define some general SPARC-specific constants to allow more readable
537  * relocations.  DISP22() yields a 22-bit displacement in 4-byte words.
538  */
539 #define	NOP	0x01000000		/* SPARC nop (sethi 0, %g0) */
540 #define	HI22(v) ((uint32_t)(v) >> 10)	/* upper 22 bits of low 32 bits */
541 #define	LO10(v) ((uint32_t)(v) & 0x3ff)	/* low 10 bits */
542 #define	LO12(v) ((uint32_t)(v) & 0xfff)	/* low 12 bits */
543 #define	DISP22(from, to) \
544 	((((uintptr_t)(to) - (uintptr_t)(from)) >> 2) & 0x3fffff)
545 #define	ASI(asi)	((asi) << 5)	/* ASI value shifted left 5 bits */
546 
547 /*
548  * The interposing trap table must be locked in the I-TLB, and any data
549  * referred to in the interposing trap handler must be locked in the D-TLB.
550  * This function locks these pages in the appropriate TLBs by creating TTEs
551  * from whole cloth, and manually loading them into the TLB.  This function is
552  * called from cross call context.
553  *
554  * On sun4v platforms, we use a single 4M permanent mapping (I-TLB and D-TLB)
555  * to minimize the number of locked down entries; each CPU uses a reserved
556  * portion of that 4M page for its TBA and data.  Failure to map panics.
557  */
558 static void
559 trapstat_load_tlb(void)
560 {
561 #ifndef sun4v
562 	int i;			/* index into tcpu_pfn[] */
563 #else
564 	uint64_t ret;		/* status from hv_mmu_map_perm_addr() */
565 #endif
566 	tte_t tte;
567 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
568 	caddr_t va = tcpu->tcpu_vabase;
569 
570 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
571 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
572 
573 #ifndef sun4v /* one 8K TTE per page, loaded starting at tcpu_vabase */
574 	for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
575 		tte.tte_inthi = TTE_VALID_INT | TTE_SZ_INT(TTE8K) |
576 			TTE_PFN_INTHI(tcpu->tcpu_pfn[i]);
577 		if (i < TSTAT_INSTR_PAGES) {	/* instruction page: I-TLB */
578 			tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
579 				TTE_LCK_INT | TTE_CP_INT | TTE_PRIV_INT;
580 			sfmmu_itlb_ld_kva(va, &tte);
581 		} else {			/* data page: D-TLB */
582 			tte.tte_intlo = TTE_PFN_INTLO(tcpu->tcpu_pfn[i]) |
583 				TTE_LCK_INT | TTE_CP_INT | TTE_CV_INT |
584 				TTE_PRIV_INT | TTE_HWWR_INT;
585 			sfmmu_dtlb_ld_kva(va, &tte);
586 		}
587 	}
588 #else /* sun4v */
589 	tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(tstat_pfn);
590 	tte.tte_intlo = TTE_PFN_INTLO(tstat_pfn) | TTE_CP_INT |
591 		TTE_CV_INT | TTE_PRIV_INT | TTE_HWWR_INT |
592 		TTE_SZ_INTLO(TTE4M);
593 	ret = hv_mmu_map_perm_addr(va, KCONTEXT, *(uint64_t *)&tte,
594 		MAP_ITLB | MAP_DTLB);
595 
596 	if (ret != H_EOK)	/* no fallback if the mapping fails */
597 		cmn_err(CE_PANIC, "trapstat: cannot map new TBA "
598 		    "for cpu %d  (error: 0x%lx)", CPU->cpu_id, ret);
599 #endif /* sun4v */
600 }
601 
602 /*
603  * As mentioned in the "TLB Statistics: TLB Misses versus TSB Misses" section
604  * of the block comment, TLB misses are differentiated from TSB misses in
605  * part by hot-patching the instructions at the tsbmiss patch points (see
606  * tstat_tsbmiss_patch_table). This routine is used both to initially patch
607  * the instructions, and to patch them back to their original values upon
608  * restoring the original trap table.
609  */
610 static void
611 trapstat_hotpatch()
612 {
613 	uint32_t instr;
614 	uint32_t simm13;
615 	tstat_tsbmiss_patch_entry_t *ep;
616 
617 	ASSERT(MUTEX_HELD(&tstat_lock));
618 
619 	if (!(tstat_options & TSTAT_OPT_TLBDATA))
620 		return;
621 
622 	if (!tstat_tsbmiss_patched) {
623 		/*
624 		 * We haven't patched the TSB paths; do so now.
625 		 */
626 		/*CONSTCOND*/
627 		ASSERT(offsetof(tstat_tlbret_t, ttlbr_ktsb) -
628 		    offsetof(tstat_tlbret_t, ttlbr_ktlb) ==
629 		    offsetof(tstat_tlbret_t, ttlbr_utsb) -
630 		    offsetof(tstat_tlbret_t, ttlbr_utlb));
631 
632 		simm13 = offsetof(tstat_tlbret_t, ttlbr_ktsb) -
633 		    offsetof(tstat_tlbret_t, ttlbr_ktlb);
634 
635 		for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
636 			ASSERT(ep->tpe_instr == 0);
637 			instr = ep->tpe_instr = *ep->tpe_addr;
638 
639 			/*
640 			 * Assert that the instruction we're about to patch is
641 			 * "add %g7, 0, %g7" (0x8e01e000).
642 			 */
643 			ASSERT(instr == TSTAT_TSBMISS_INSTR);
644 
645 			instr |= simm13;
646 			hot_patch_kernel_text((caddr_t)ep->tpe_addr,
647 			    instr, sizeof (instr));
648 		}
649 
650 		tstat_tsbmiss_patched = 1;
651 
652 	} else {
653 		/*
654 		 * Remove patches from the TSB paths.
655 		 */
656 		for (ep = tstat_tsbmiss_patch_table; ep->tpe_addr; ep++) {
657 			ASSERT(ep->tpe_instr == TSTAT_TSBMISS_INSTR);
658 			hot_patch_kernel_text((caddr_t)ep->tpe_addr,
659 			    ep->tpe_instr, sizeof (instr));
660 			ep->tpe_instr = 0;
661 		}
662 
663 		tstat_tsbmiss_patched = 0;
664 	}
665 }
666 
667 /*
668  * This is the routine executed to clock the performance of the trap table,
669  * executed both before and after interposing on the trap table to attempt to
670  * determine probe effect.  The probe effect is used to adjust the "%tim"
671  * fields of trapstat's -t and -T output; we only use TLB misses to clock the
672  * trap table.  We execute the inner loop (which is designed to exceed the
673  * TLB's reach) nlaps times, taking the best time as our time (thereby
674  * factoring out the effects of interrupts, cache misses or other perturbing
675  * events.
676  */
677 static hrtime_t
678 trapstat_probe_laps(int nlaps, hrtime_t *buf)
679 {
680 	int i, j = 0;
681 	hrtime_t ts, best = INT64_MAX;
682 
683 	while (nlaps--) {
684 		ts = rdtick();
685 
686 		for (i = 0; i < TSTAT_PROBE_SIZE; i += MMU_PAGESIZE)
687 			*((volatile char *)&tstat_probe_area[i]);
688 
689 		if ((ts = rdtick() - ts) < best)
690 			best = ts;
691 		buf[j++] = ts;
692 	}
693 
694 	return (best);
695 }
696 
697 /*
698  * This routine determines the probe effect by calling trapstat_probe_laps()
699  * both without and with the interposing trap table.  Note that this is
700  * called from a cross call on the desired CPU, and that it is called on
701  * every CPU (this is necessary because the probe effect may differ from
702  * one CPU to another).
703  */
704 static void
705 trapstat_probe()
706 {
707 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
708 	hrtime_t before, after;
709 
710 	if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
711 		return;
712 
713 	if (tstat_probe_area == NULL || (tstat_options & TSTAT_OPT_NOGO))
714 		return;
715 
716 	/*
717 	 * We very much expect the %tba to be KERNELBASE; this is a
718 	 * precautionary measure to assure that trapstat doesn't melt the
719 	 * machine should the %tba point unexpectedly elsewhere.
720 	 */
721 	if (get_tba() != (caddr_t)KERNELBASE)
722 		return;
723 
724 	/*
725 	 * Preserve this CPU's data before destroying it by enabling the
726 	 * interposing trap table.  We can safely use tstat_buffer because
727 	 * the caller of the trapstat_probe() cross call is holding tstat_lock.
728 	 */
729 	bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
730 
731 	tstat_probe_time = gethrtime();
732 
733 	before = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_before);
734 	(void) set_tba(tcpu->tcpu_ibase);
735 
736 	after = trapstat_probe_laps(TSTAT_PROBE_NLAPS, tstat_probe_after);
737 	(void) set_tba((caddr_t)KERNELBASE);
738 
739 	tstat_probe_time = gethrtime() - tstat_probe_time;
740 
741 	bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
742 	tcpu->tcpu_data->tdata_peffect = (after - before) / TSTAT_PROBE_NPAGES;
743 }
744 
745 static void
746 trapstat_probe_alloc()
747 {
748 	pfn_t pfn;
749 	caddr_t va;
750 	int i;
751 
752 	ASSERT(MUTEX_HELD(&tstat_lock));
753 	ASSERT(tstat_probe_area == NULL);
754 	ASSERT(tstat_probe_phys == NULL);
755 
756 	if (!(tstat_options & TSTAT_OPT_TLBDATA))
757 		return;
758 
759 	/*
760 	 * Grab some virtual from the heap arena.
761 	 */
762 	tstat_probe_area = vmem_alloc(heap_arena, TSTAT_PROBE_SIZE, VM_SLEEP);
763 	va = tstat_probe_area;
764 
765 	/*
766 	 * Grab a single physical page.
767 	 */
768 	tstat_probe_phys = vmem_alloc(tstat_arena, MMU_PAGESIZE, VM_SLEEP);
769 	pfn = hat_getpfnum(kas.a_hat, tstat_probe_phys);
770 
771 	/*
772 	 * Now set the translation for every page in our virtual range
773 	 * to be our allocated physical page.
774 	 */
775 	for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
776 		hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn, PROT_READ,
777 		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
778 		va += MMU_PAGESIZE;
779 	}
780 }
781 
782 static void
783 trapstat_probe_free()
784 {
785 	caddr_t va;
786 	int i;
787 
788 	ASSERT(MUTEX_HELD(&tstat_lock));
789 
790 	if ((va = tstat_probe_area) == NULL)
791 		return;
792 
793 	for (i = 0; i < TSTAT_PROBE_NPAGES; i++) {
794 		hat_unload(kas.a_hat, va, MMU_PAGESIZE, HAT_UNLOAD_UNLOCK);
795 		va += MMU_PAGESIZE;
796 	}
797 
798 	vmem_free(tstat_arena, tstat_probe_phys, MMU_PAGESIZE);
799 	vmem_free(heap_arena, tstat_probe_area, TSTAT_PROBE_SIZE);
800 
801 	tstat_probe_phys = NULL;
802 	tstat_probe_area = NULL;
803 }
804 
805 /*
806  * This routine actually enables a CPU by setting its %tba to be the
807  * CPU's interposing trap table.  It is called out of cross call context.
808  */
809 static void
810 trapstat_enable()
811 {
812 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
813 
814 	if (!(tcpu->tcpu_flags & TSTAT_CPU_SELECTED))
815 		return;
816 
817 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
818 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
819 
820 	if (get_tba() != (caddr_t)KERNELBASE)
821 		return;
822 
823 	if (!(tstat_options & TSTAT_OPT_NOGO))
824 		(void) set_tba(tcpu->tcpu_ibase);
825 	tcpu->tcpu_flags |= TSTAT_CPU_ENABLED;
826 #ifdef sun4v
827 	if ((tstat_options & TSTAT_OPT_TLBDATA) &&
828 	    !(tstat_options & TSTAT_OPT_NOGO)) {
829 		if (tstat_fast_tlbstat) {
830 			/*
831 			 * Invoke processor specific interface to enable
832 			 * collection of TSB hit statistics.
833 			 */
834 			cpu_trapstat_conf(CPU_TSTATCONF_ENABLE);
835 		} else {
836 			/*
837 			 * Collect TLB miss statistics by taking over
838 			 * TLB miss handling from the hypervisor. This
839 			 * is done by telling the hypervisor that there
840 			 * is no TSB configured. Also set TSTAT_TLB_STATS
841 			 * flag so that no user TSB is configured during
842 			 * context switch time.
843 			 */
844 			cpu_t *cp = CPU;
845 
846 			cp->cpu_m.cpu_tstat_flags |= TSTAT_TLB_STATS;
847 			(void) hv_set_ctx0(NULL, NULL);
848 			(void) hv_set_ctxnon0(NULL, NULL);
849 		}
850 	}
851 #endif
852 }
853 
854 /*
855  * This routine disables a CPU (vis a vis trapstat) by setting its %tba to be
856  * the actual, underlying trap table.  It is called out of cross call context.
857  */
858 static void
859 trapstat_disable()
860 {
861 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
862 
863 	if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
864 		return;
865 
866 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
867 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
868 
869 	if (!(tstat_options & TSTAT_OPT_NOGO))
870 		(void) set_tba((caddr_t)KERNELBASE);
871 
872 	tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
873 
874 #ifdef sun4v
875 	if ((tstat_options & TSTAT_OPT_TLBDATA) &&
876 	    !(tstat_options & TSTAT_OPT_NOGO)) {
877 		if (tstat_fast_tlbstat) {
878 			/*
879 			 * Invoke processor specific interface to disable
880 			 * collection of TSB hit statistics on each processor.
881 			 */
882 			cpu_trapstat_conf(CPU_TSTATCONF_DISABLE);
883 		} else {
884 			/*
885 			 * As part of collecting TLB miss statistics, we took
886 			 * over TLB miss handling from the hypervisor by
887 			 * telling the hypervisor that NO TSB is configured.
888 			 * We need to restore that by communicating proper
889 			 * kernel/user TSB information so that TLB misses
890 			 * can be handled by the hypervisor or the hardware
891 			 * more efficiently.
892 			 *
893 			 * We restore kernel TSB information right away.
894 			 * However, to minimize any locking dependency, we
895 			 * don't restore user TSB information right away.
896 			 * Instead, we simply clear the TSTAT_TLB_STATS flag
897 			 * so that the user TSB information is automatically
898 			 * restored on next context switch.
899 			 *
900 			 * Note that the call to restore kernel TSB information
901 			 * will normally not fail, unless wrong information is
902 			 * passed here. In that scenario, system will still
903 			 * continue to function properly with the exception of
904 			 * kernel handling all the TLB misses.
905 			 */
906 			struct hv_tsb_block *hvbp = &ksfmmup->sfmmu_hvblock;
907 			cpu_t *cp = CPU;
908 
909 			cp->cpu_m.cpu_tstat_flags &= ~TSTAT_TLB_STATS;
910 			(void) hv_set_ctx0(hvbp->hv_tsb_info_cnt,
911 			    hvbp->hv_tsb_info_pa);
912 		}
913 	}
914 #endif
915 }
916 
917 /*
918  * We use %tick as the time base when recording the time spent executing
919  * the trap handler.  %tick, however, is not necessarily kept in sync
920  * across CPUs (indeed, different CPUs may have different %tick frequencies).
921  * We therefore cross call onto a CPU to get a snapshot of its data to
922  * copy out; this is the routine executed out of that cross call.
923  */
924 static void
925 trapstat_snapshot()
926 {
927 	tstat_percpu_t *tcpu = &tstat_percpu[CPU->cpu_id];
928 	tstat_data_t *data = tcpu->tcpu_data;
929 
930 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
931 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
932 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ENABLED);
933 
934 	data->tdata_snapts = gethrtime();
935 	data->tdata_snaptick = rdtick();
936 	bcopy(data, tstat_buffer, tstat_data_t_size);
937 #ifdef sun4v
938 	/*
939 	 * Invoke processor specific interface to collect TSB hit
940 	 * statistics on each processor.
941 	 */
942 	if ((tstat_options & TSTAT_OPT_TLBDATA) && tstat_fast_tlbstat)
943 		cpu_trapstat_data((void *) tstat_buffer->tdata_pgsz,
944 		    tstat_pgszs);
945 #endif
946 }
947 
/*
 * The TSTAT_RETENT_* constants define offsets in the TLB return entry.
 * They are used only in trapstat_tlbretent() (below) and #undef'd
 * immediately afterwards.  Any change to "retent" in trapstat_tlbretent()
 * will likely require changes to these constants.
 *
 * Each constant is the index, within "retent", of an instruction that has
 * an address, offset or shift count OR'd into it after the template has
 * been copied.
 */

#ifndef sun4v
#define	TSTAT_RETENT_STATHI	1
#define	TSTAT_RETENT_STATLO	2
#define	TSTAT_RETENT_SHIFT	11
#define	TSTAT_RETENT_COUNT_LD	13
#define	TSTAT_RETENT_COUNT_ST	15
#define	TSTAT_RETENT_TMPTSHI	16
#define	TSTAT_RETENT_TMPTSLO	17
#define	TSTAT_RETENT_TIME_LD	19
#define	TSTAT_RETENT_TIME_ST	21
#else /* sun4v */
#define	TSTAT_RETENT_STATHI	1
#define	TSTAT_RETENT_STATLO	2
#define	TSTAT_RETENT_SHIFT	5
#define	TSTAT_RETENT_COUNT_LD	7
#define	TSTAT_RETENT_COUNT_ST	9
#define	TSTAT_RETENT_TMPTSHI	10
#define	TSTAT_RETENT_TMPTSLO	11
#define	TSTAT_RETENT_TIME_LD	13
#define	TSTAT_RETENT_TIME_ST	15
#endif /* sun4v */

/*
 * Generate one TLB return entry.
 *
 *	tcpu	per-CPU trapstat state of the CPU the entry is built for
 *	ret	the return entry whose instructions (ttlbrent_instr) are
 *		written
 *	data	the miss data (count and time) that the generated code is
 *		to update; it must lie within tcpu->tcpu_data
 *
 * The "retent" template is copied into the entry and then relocated with
 * the address of the miss data, the address of tdata_tmptick, and the
 * offsets of tmiss_count and tmiss_time.  The caller must hold tstat_lock.
 */
static void
trapstat_tlbretent(tstat_percpu_t *tcpu, tstat_tlbretent_t *ret,
    tstat_missdata_t *data)
{
	uint32_t *ent = ret->ttlbrent_instr, shift;
	uintptr_t base, tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);

	/*
	 * This is the entry executed upon return from the TLB/TSB miss
	 * handler (i.e. the code interpositioned between the "retry" and
	 * the actual return to the TLB-missing instruction).  Detail on its
	 * theory of operation can be found in the "TLB Statistics" section
	 * of the block comment.  Note that we expect the TTE just loaded
	 * into the TLB to be in %g5; all other globals are available as
	 * scratch.  Finally, note that the page size information in sun4v is
	 * located in the lower bits of the TTE -- requiring us to have a
	 * different return entry on sun4v.
	 */
	static const uint32_t retent[TSTAT_TLBRET_NINSTR] = {
#ifndef sun4v
	    0x87410000,		/* rd    %tick, %g3			*/
	    0x03000000, 	/* sethi %hi(stat), %g1			*/
	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
	    0x89297001,		/* sllx  %g5, 1, %g4			*/
	    0x8931303e,		/* srlx  %g4, 62, %g4			*/
	    0x8531702e,		/* srlx  %g5, 46, %g2			*/
	    0x8408a004,		/* and   %g2, 4, %g2			*/
	    0x88110002,		/* or    %g4, %g2, %g4			*/
	    0x80a12005,		/* cmp   %g4, 5				*/
	    0x34400002,		/* bg,a,pn %icc, +8			*/
	    0x88102004,		/* mov   4, %g4				*/
	    0x89292000,		/* sll   %g4, shift, %g4		*/
	    0x82004004,		/* add   %g1, %g4, %g1			*/
	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
	    0x8400a001,		/* add   %g2, 1, %g2			*/
	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
	    0x0d000000, 	/* sethi %hi(tdata_tmptick), %g6	*/
	    0xc459a000, 	/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
	    0x84008003,		/* add   %g2, %g3, %g2			*/
	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
	    0x83f00000		/* retry				*/
#else /* sun4v */
	    0x87410000,		/* rd    %tick, %g3			*/
	    0x03000000, 	/* sethi %hi(stat), %g1			*/
	    0x82106000,		/* or    %g1, %lo(stat), %g1		*/
	    0x8929703d,		/* sllx  %g5, 61, %g4			*/
	    0x8931303d,		/* srlx  %g4, 61, %g4			*/
	    0x89292000,		/* sll   %g4, shift, %g4		*/
	    0x82004004,		/* add   %g1, %g4, %g1			*/
	    0xc4586000,		/* ldx   [%g1 + tmiss_count], %g2	*/
	    0x8400a001,		/* add   %g2, 1, %g2			*/
	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_count]	*/
	    0x0d000000, 	/* sethi %hi(tdata_tmptick), %g6	*/
	    0xc459a000, 	/* ldx   [%g6 + %lo(tdata_tmptick)], %g2 */
	    0x8620c002,		/* sub   %g3, %g2, %g3			*/
	    0xc4586000,		/* ldx   [%g1 + tmiss_time], %g2	*/
	    0x84008003,		/* add   %g2, %g3, %g2			*/
	    0xc4706000,		/* stx   %g2, [%g1 + tmiss_time]	*/
	    0x83f00000		/* retry				*/
#endif /* sun4v */
	};

	ASSERT(MUTEX_HELD(&tstat_lock));
	/*
	 * The tmiss_count and tmiss_time offsets are OR'd directly into the
	 * load and store instructions below, so they must be small enough
	 * (no larger than LO10(-1), i.e. 0x3ff) to do so.
	 */
	/*CONSTCOND*/
	ASSERT(offsetof(tstat_missdata_t, tmiss_count) <= LO10(-1));
	/*CONSTCOND*/
	ASSERT(offsetof(tstat_missdata_t, tmiss_time) <= LO10(-1));
	/*
	 * The generated code indexes the per-page-size data with a shift,
	 * so sizeof (tstat_pgszdata_t) must be a power of two.
	 */
	/*CONSTCOND*/
	ASSERT(!((sizeof (tstat_pgszdata_t) - 1) & sizeof (tstat_pgszdata_t)));

	/*
	 * Determine that shift:  log2(sizeof (tstat_pgszdata_t)).
	 */
	for (shift = 1; (1 << shift) != sizeof (tstat_pgszdata_t); shift++)
		continue;

	/*
	 * "data" points into tcpu_data; the generated code must instead
	 * address the same location relative to tcpu_dbase.
	 */
	base = (uintptr_t)tcpu->tcpu_dbase +
	    ((uintptr_t)data - (uintptr_t)tcpu->tcpu_data);

	bcopy(retent, ent, sizeof (retent));

	ent[TSTAT_RETENT_STATHI] |= HI22(base);
	ent[TSTAT_RETENT_STATLO] |= LO10(base);
	ent[TSTAT_RETENT_SHIFT] |= shift;
	/* LINTED E_EXPR_NULL_EFFECT */
	ent[TSTAT_RETENT_COUNT_LD] |= offsetof(tstat_missdata_t, tmiss_count);
	/* LINTED E_EXPR_NULL_EFFECT */
	ent[TSTAT_RETENT_COUNT_ST] |= offsetof(tstat_missdata_t, tmiss_count);
	ent[TSTAT_RETENT_TMPTSHI] |= HI22(tmptick);
	ent[TSTAT_RETENT_TMPTSLO] |= LO10(tmptick);
	ent[TSTAT_RETENT_TIME_LD] |= offsetof(tstat_missdata_t, tmiss_time);
	ent[TSTAT_RETENT_TIME_ST] |= offsetof(tstat_missdata_t, tmiss_time);
}

#undef TSTAT_RETENT_STATHI
#undef TSTAT_RETENT_STATLO
#undef TSTAT_RETENT_SHIFT
#undef TSTAT_RETENT_COUNT_LD
#undef TSTAT_RETENT_COUNT_ST
#undef TSTAT_RETENT_TMPTSHI
#undef TSTAT_RETENT_TMPTSLO
#undef TSTAT_RETENT_TIME_LD
#undef TSTAT_RETENT_TIME_ST
1079 
/*
 * The TSTAT_TLBENT_* constants define offsets in the TLB entry.  They are
 * used only in trapstat_tlbent() (below) and #undef'd immediately afterwards.
 * Any change to "tlbent" in trapstat_tlbent() will likely require changes
 * to these constants.
 *
 * Each constant is the index, within "tlbent", of an instruction that has
 * an address, offset, ASI or branch displacement OR'd into it after the
 * template has been copied.
 */

#ifndef sun4v
#define	TSTAT_TLBENT_STATHI	0
#define	TSTAT_TLBENT_STATLO_LD	1
#define	TSTAT_TLBENT_STATLO_ST	3
#define	TSTAT_TLBENT_MMUASI	15
#define	TSTAT_TLBENT_TPCHI	18
#define	TSTAT_TLBENT_TPCLO_USER	19
#define	TSTAT_TLBENT_TPCLO_KERN	21
#define	TSTAT_TLBENT_TSHI	25
#define	TSTAT_TLBENT_TSLO	27
#define	TSTAT_TLBENT_BA		28
#else /* sun4v */
#define	TSTAT_TLBENT_STATHI	0
#define	TSTAT_TLBENT_STATLO_LD	1
#define	TSTAT_TLBENT_STATLO_ST	3
#define	TSTAT_TLBENT_TAGTARGET	19
#define	TSTAT_TLBENT_TPCHI	21
#define	TSTAT_TLBENT_TPCLO_USER	22
#define	TSTAT_TLBENT_TPCLO_KERN	24
#define	TSTAT_TLBENT_TSHI	28
#define	TSTAT_TLBENT_TSLO	30
#define	TSTAT_TLBENT_BA		31
#endif /* sun4v */

/*
 * Generate the interposing trap table entry for one TLB miss trap, along
 * with the four TLB return entries (kernel/user x TLB/TSB) that go with it.
 *
 *	tcpu	per-CPU trapstat state of the CPU the entry is built for
 *	entno	the trap table entry number:  TSTAT_ENT_ITLBMISS or
 *		TSTAT_ENT_DTLBMISS (and, on sun4v only, TSTAT_ENT_IMMUMISS
 *		or TSTAT_ENT_DMMUMISS)
 *
 * The caller must hold tstat_lock.
 */
static void
trapstat_tlbent(tstat_percpu_t *tcpu, int entno)
{
	uint32_t *ent;
	uintptr_t orig, va, baoffs;
#ifndef sun4v
	int itlb = entno == TSTAT_ENT_ITLBMISS;
#else
	int itlb = (entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_ITLBMISS);
#endif
	int entoffs = entno << TSTAT_ENT_SHIFT;
	uintptr_t tmptick, stat, tpc, utpc;
	tstat_pgszdata_t *data = &tcpu->tcpu_data->tdata_pgsz[0];
	tstat_tlbdata_t *udata, *kdata;
	tstat_tlbret_t *ret;
#ifndef sun4v
	uint32_t asi = itlb ? ASI(ASI_IMMU) : ASI(ASI_DMMU);
#else
	uint32_t tagtarget_off = itlb ? MMFSA_I_CTX : MMFSA_D_CTX;
#endif

	/*
	 * When trapstat is run with TLB statistics, this is the entry for
	 * both I- and D-TLB misses; this code performs trap level pushing,
	 * as described in the "TLB Statistics" section of the block comment.
	 * This code is executing at TL 1; %tstate[0] contains the saved
	 * state at the time of the TLB miss.  Pushing trap level 1 (and thus
	 * raising TL to 2) requires us to fill in %tstate[1] with our %pstate,
	 * %cwp and %asi.  We leave %tt unchanged, and we set %tpc and %tnpc to
	 * the appropriate TLB return entry (based on the context of the miss).
	 * Finally, we sample %tick, and stash it in the tdata_tmptick member
	 * the per-CPU tstat_data structure.  tdata_tmptick will be used in
	 * the TLB return entry to determine the amount of time spent in the
	 * TLB miss handler.
	 *
	 * Note that on sun4v platforms, we must obtain the context information
	 * from the MMU fault status area. (The base address of this MMU fault
	 * status area is kept in the scratchpad register 0.)
	 *
	 * In both variants, the "brz" on the context selects between the
	 * two "or ..., %lo(new_tpc)" instructions:  a zero context takes the
	 * branch to the second one (patched via TSTAT_TLBENT_TPCLO_KERN),
	 * while a non-zero context falls through to the first one (patched
	 * via TSTAT_TLBENT_TPCLO_USER).  Both share the single preceding
	 * "sethi %hi(new_tpc)".
	 */
	static const uint32_t tlbent[] = {
#ifndef sun4v
	    0x03000000, 		/* sethi %hi(stat), %g1		*/
	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
	    0x8400a001,			/* add   %g2, 1, %g2		*/
	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
	    0x85524000,			/* rdpr  %cwp, %g2		*/
	    0x87518000,			/* rdpr  %pstate, %g3		*/
	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
	    0x84108003,			/* or    %g2, %g3, %g2		*/
	    0x8740c000,			/* rd    %asi, %g3		*/
	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
	    0x84108003,			/* or    %g2, %g3, %g2		*/
	    0x8350c000,			/* rdpr  %tt, %g1		*/
	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
	    0xc2d80000,			/* ldxa  [%g0]ASI_MMU, %g1	*/
	    0x83307030,			/* srlx  %g1, CTXSHIFT, %g1	*/
	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
	    0x03000000, 		/* sethi %hi(new_tpc), %g1	*/
	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
	    0x30800002,			/* ba,a  .+0x8			*/
	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
	    0x82006004,			/* add   %g1, 4, %g1		*/
	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
	    0x03000000, 		/* sethi %hi(tmptick), %g1	*/
	    0x85410000,			/* rd    %tick, %g2		*/
	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
	    0x30800000,			/* ba,a  addr			*/
	    NOP, NOP, NOP
#else /* sun4v */
	    0x03000000, 		/* sethi %hi(stat), %g1		*/
	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
	    0x8400a001,			/* add   %g2, 1, %g2		*/
	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
	    0x85524000,			/* rdpr  %cwp, %g2		*/
	    0x87518000,			/* rdpr  %pstate, %g3		*/
	    0x8728f008,			/* sllx  %g3, 8, %g3		*/
	    0x84108003,			/* or    %g2, %g3, %g2		*/
	    0x8740c000,			/* rd    %asi, %g3		*/
	    0x8728f018,			/* sllx  %g3, 24, %g3		*/
	    0x83540000,			/* rdpr  %gl, %g1		*/
	    0x83287028,			/* sllx  %g1, 40, %g1		*/
	    0x86104003,			/* or    %g1, %g3, %g3		*/
	    0x84108003,			/* or    %g2, %g3, %g2		*/
	    0x8350c000,			/* rdpr  %tt, %g1		*/
	    0x8f902002,			/* wrpr  %g0, 2, %tl		*/
	    0x85908000,			/* wrpr  %g2, %g0, %tstate	*/
	    0x87904000,			/* wrpr  %g1, %g0, %tt		*/
	    0xc2d80400,			/* ldxa  [%g0]ASI_SCRATCHPAD, %g1 */
	    0xc2586000,			/* ldx  [%g1 + MMFSA_?_CTX], %g1 */
	    0x02c04004,			/* brz,pn %g1, .+0x10		*/
	    0x03000000, 		/* sethi %hi(new_tpc), %g1	*/
	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
	    0x30800002,			/* ba,a  .+0x8			*/
	    0x82106000,			/* or    %g1, %lo(new_tpc), %g1	*/
	    0x81904000,			/* wrpr  %g1, %g0, %tpc		*/
	    0x82006004,			/* add   %g1, 4, %g1		*/
	    0x83904000,			/* wrpr  %g1, %g0, %tnpc	*/
	    0x03000000, 		/* sethi %hi(tmptick), %g1	*/
	    0x85410000,			/* rd    %tick, %g2		*/
	    0xc4706000,			/* stx   %g2, [%g1 + %lo(tmptick)] */
	    0x30800000			/* ba,a  addr			*/
#endif /* sun4v */
	};

	ASSERT(MUTEX_HELD(&tstat_lock));
#ifndef sun4v
	ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS);
#else
	ASSERT(entno == TSTAT_ENT_ITLBMISS || entno == TSTAT_ENT_DTLBMISS ||
	    entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_DMMUMISS);
#endif

	/*
	 * Addresses, as used by the generated code, of this trap's counter
	 * and of tdata_tmptick.
	 */
	stat = TSTAT_DATA_OFFS(tcpu, tdata_traps) + entoffs;
	tmptick = TSTAT_DATA_OFFS(tcpu, tdata_tmptick);

	/*
	 * Select the I- or D-side return entries and miss data; tpc is the
	 * address of the kernel TLB return entry.
	 */
	if (itlb) {
		ret = &tcpu->tcpu_instr->tinst_itlbret;
		udata = &data->tpgsz_user.tmode_itlb;
		kdata = &data->tpgsz_kernel.tmode_itlb;
		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_itlbret.ttlbr_ktlb);
	} else {
		ret = &tcpu->tcpu_instr->tinst_dtlbret;
		udata = &data->tpgsz_user.tmode_dtlb;
		kdata = &data->tpgsz_kernel.tmode_dtlb;
		tpc = TSTAT_INSTR_OFFS(tcpu, tinst_dtlbret.ttlbr_ktlb);
	}

	/*
	 * The user TLB return entry lives in the same tstat_tlbret_t.
	 */
	utpc = tpc + offsetof(tstat_tlbret_t, ttlbr_utlb) -
	    offsetof(tstat_tlbret_t, ttlbr_ktlb);

	/*
	 * The template has only one sethi for both return addresses, so
	 * their upper bits must be identical.
	 */
	ASSERT(HI22(tpc) == HI22(utpc));

	/*
	 * ent is where the entry is written, va is the address at which it
	 * will execute, and orig is the corresponding entry in the actual
	 * trap table (which the final "ba,a" branches to).
	 */
	ent = (uint32_t *)((uintptr_t)tcpu->tcpu_instr + entoffs);
	orig = KERNELBASE + entoffs;
	va = (uintptr_t)tcpu->tcpu_ibase + entoffs;
	baoffs = TSTAT_TLBENT_BA * sizeof (uint32_t);

#ifdef sun4v
	if (entno == TSTAT_ENT_IMMUMISS || entno == TSTAT_ENT_DMMUMISS) {
		/*
		 * Because of lack of space, interposing tlbent trap
		 * handler for IMMU_miss and DMMU_miss traps cannot be
		 * placed in-line. Instead, we copy it to the space set
		 * aside for these traps in per CPU trapstat area and
		 * invoke it by placing a branch in the trap table itself.
		 */
		static const uint32_t mmumiss[TSTAT_ENT_NINSTR] = {
		    0x30800000,			/* ba,a addr */
		    NOP, NOP, NOP, NOP, NOP, NOP, NOP
		};
		uint32_t *tent = ent;		/* trap vector entry */
		uintptr_t tentva = va;		/* trap vector entry va */

		/*
		 * Redirect ent and va to the out-of-line space, so that
		 * the handler proper is generated there below.
		 */
		if (itlb) {
			ent = (uint32_t *)((uintptr_t)
				&tcpu->tcpu_instr->tinst_immumiss);
			va = TSTAT_INSTR_OFFS(tcpu, tinst_immumiss);
		} else {
			ent = (uint32_t *)((uintptr_t)
				&tcpu->tcpu_instr->tinst_dmmumiss);
			va = TSTAT_INSTR_OFFS(tcpu, tinst_dmmumiss);
		}
		bcopy(mmumiss, tent, sizeof (mmumiss));
		tent[0] |= DISP22(tentva, va);
	}
#endif /* sun4v */

	bcopy(tlbent, ent, sizeof (tlbent));

	ent[TSTAT_TLBENT_STATHI] |= HI22(stat);
	ent[TSTAT_TLBENT_STATLO_LD] |= LO10(stat);
	ent[TSTAT_TLBENT_STATLO_ST] |= LO10(stat);
#ifndef sun4v
	ent[TSTAT_TLBENT_MMUASI] |= asi;
#else
	ent[TSTAT_TLBENT_TAGTARGET] |= tagtarget_off;
#endif
	ent[TSTAT_TLBENT_TPCHI] |= HI22(tpc);
	ent[TSTAT_TLBENT_TPCLO_USER] |= LO10(utpc);
	ent[TSTAT_TLBENT_TPCLO_KERN] |= LO10(tpc);
	ent[TSTAT_TLBENT_TSHI] |= HI22(tmptick);
	ent[TSTAT_TLBENT_TSLO] |= LO10(tmptick);
	ent[TSTAT_TLBENT_BA] |= DISP22(va + baoffs, orig);

	/*
	 * And now set up the TLB return entries.
	 */
	trapstat_tlbretent(tcpu, &ret->ttlbr_ktlb, &kdata->ttlb_tlb);
	trapstat_tlbretent(tcpu, &ret->ttlbr_ktsb, &kdata->ttlb_tsb);
	trapstat_tlbretent(tcpu, &ret->ttlbr_utlb, &udata->ttlb_tlb);
	trapstat_tlbretent(tcpu, &ret->ttlbr_utsb, &udata->ttlb_tsb);
}

#undef TSTAT_TLBENT_STATHI
#undef TSTAT_TLBENT_STATLO_LD
#undef TSTAT_TLBENT_STATLO_ST
#ifndef sun4v
#undef TSTAT_TLBENT_MMUASI
#else
#undef TSTAT_TLBENT_TAGTARGET
#endif
#undef TSTAT_TLBENT_TPCHI
#undef TSTAT_TLBENT_TPCLO_USER
#undef TSTAT_TLBENT_TPCLO_KERN
#undef TSTAT_TLBENT_TSHI
#undef TSTAT_TLBENT_TSLO
#undef TSTAT_TLBENT_BA
1321 
1322 /*
1323  * The TSTAT_ENABLED_* constants define offsets in the enabled entry; the
1324  * TSTAT_DISABLED_BA constant defines an offset in the disabled entry.  Both
1325  * sets of constants are used only in trapstat_make_traptab() (below) and
1326  * #undef'd immediately afterwards.  Any change to "enabled" or "disabled"
1327  * in trapstat_make_traptab() will likely require changes to these constants.
1328  */
1329 #define	TSTAT_ENABLED_STATHI	0
1330 #define	TSTAT_ENABLED_STATLO_LD	1
1331 #define	TSTAT_ENABLED_STATLO_ST 3
1332 #define	TSTAT_ENABLED_BA	4
1333 #define	TSTAT_DISABLED_BA	0
1334 
1335 static void
1336 trapstat_make_traptab(tstat_percpu_t *tcpu)
1337 {
1338 	uint32_t *ent;
1339 	uint64_t *stat;
1340 	uintptr_t orig, va, en_baoffs, dis_baoffs;
1341 	int nent;
1342 
1343 	/*
1344 	 * This is the entry in the interposing trap table for enabled trap
1345 	 * table entries.  It loads a counter, increments it and stores it
1346 	 * back before branching to the actual trap table entry.
1347 	 */
1348 	static const uint32_t enabled[TSTAT_ENT_NINSTR] = {
1349 	    0x03000000, 		/* sethi %hi(stat), %g1		*/
1350 	    0xc4586000,			/* ldx   [%g1 + %lo(stat)], %g2	*/
1351 	    0x8400a001,			/* add   %g2, 1, %g2		*/
1352 	    0xc4706000,			/* stx   %g2, [%g1 + %lo(stat)]	*/
1353 	    0x30800000,			/* ba,a addr			*/
1354 	    NOP, NOP, NOP
1355 	};
1356 
1357 	/*
1358 	 * This is the entry in the interposing trap table for disabled trap
1359 	 * table entries.  It simply branches to the actual, underlying trap
1360 	 * table entry.  As explained in the "Implementation Details" section
1361 	 * of the block comment, all TL>0 traps _must_ use the disabled entry;
1362 	 * additional entries may be explicitly disabled through the use
1363 	 * of TSTATIOC_ENTRY/TSTATIOC_NOENTRY.
1364 	 */
1365 	static const uint32_t disabled[TSTAT_ENT_NINSTR] = {
1366 	    0x30800000,			/* ba,a addr			*/
1367 	    NOP, NOP, NOP, NOP, NOP, NOP, NOP,
1368 	};
1369 
1370 	ASSERT(MUTEX_HELD(&tstat_lock));
1371 
1372 	ent = tcpu->tcpu_instr->tinst_traptab;
1373 	stat = (uint64_t *)TSTAT_DATA_OFFS(tcpu, tdata_traps);
1374 	orig = KERNELBASE;
1375 	va = (uintptr_t)tcpu->tcpu_ibase;
1376 	en_baoffs = TSTAT_ENABLED_BA * sizeof (uint32_t);
1377 	dis_baoffs = TSTAT_DISABLED_BA * sizeof (uint32_t);
1378 
1379 	for (nent = 0; nent < TSTAT_TOTAL_NENT; nent++) {
1380 		if (tstat_enabled[nent]) {
1381 			bcopy(enabled, ent, sizeof (enabled));
1382 			ent[TSTAT_ENABLED_STATHI] |= HI22((uintptr_t)stat);
1383 			ent[TSTAT_ENABLED_STATLO_LD] |= LO10((uintptr_t)stat);
1384 			ent[TSTAT_ENABLED_STATLO_ST] |= LO10((uintptr_t)stat);
1385 			ent[TSTAT_ENABLED_BA] |= DISP22(va + en_baoffs, orig);
1386 		} else {
1387 			bcopy(disabled, ent, sizeof (disabled));
1388 			ent[TSTAT_DISABLED_BA] |= DISP22(va + dis_baoffs, orig);
1389 		}
1390 
1391 		stat++;
1392 		orig += sizeof (enabled);
1393 		ent += sizeof (enabled) / sizeof (*ent);
1394 		va += sizeof (enabled);
1395 	}
1396 }
1397 
1398 #undef TSTAT_ENABLED_STATHI
1399 #undef TSTAT_ENABLED_STATLO_LD
1400 #undef TSTAT_ENABLED_STATLO_ST
1401 #undef TSTAT_ENABLED_BA
1402 #undef TSTAT_DISABLED_BA
1403 
#ifndef sun4v
/*
 * The maximum PC-relative branch displacement, in bytes, for the Bicc
 * variants (see Section A.6 in the SPARC v9 Manual):
 *
 *	((2^21) - 1) words * 4 bytes per word = 8388604
 */
#define	MAX_BICC_BRANCH_DISPLACEMENT (((1 << 21) - 1) << 2)
#endif
1411 
1412 static void
1413 trapstat_setup(processorid_t cpu)
1414 {
1415 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1416 #ifndef sun4v
1417 	int i;
1418 	caddr_t va;
1419 	pfn_t *pfn;
1420 	cpu_t *cp;
1421 	uint_t strand_idx;
1422 	size_t tstat_offset;
1423 #endif
1424 
1425 	ASSERT(tcpu->tcpu_pfn == NULL);
1426 	ASSERT(tcpu->tcpu_instr == NULL);
1427 	ASSERT(tcpu->tcpu_data == NULL);
1428 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1429 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
1430 	ASSERT(MUTEX_HELD(&cpu_lock));
1431 	ASSERT(MUTEX_HELD(&tstat_lock));
1432 
1433 	/*
1434 	 * The lower fifteen bits of the %tba are always read as zero; we must
1435 	 * align our instruction base address appropriately.
1436 	 */
1437 #ifndef sun4v
1438 	tstat_offset = tstat_total_size;
1439 
1440 	cp = cpu_get(cpu);
1441 	ASSERT(cp != NULL);
1442 	if ((strand_idx = cpu ^ pg_plat_hw_instance_id(cp, PGHW_IPIPE)) != 0) {
1443 		/*
1444 		 * On sun4u platforms with multiple CPUs sharing the MMU
1445 		 * (Olympus-C has 2 strands per core), each CPU uses a
1446 		 * disjoint trap table.  The indexing is based on the
1447 		 * strand id, which is obtained by XOR'ing the cpuid with
1448 		 * the coreid.
1449 		 */
1450 		tstat_offset += tstat_total_size * strand_idx;
1451 
1452 		/*
1453 		 * Offset must be less than the maximum PC-relative branch
1454 		 * displacement for Bicc variants.  See the Implementation
1455 		 * Details comment.
1456 		 */
1457 		ASSERT(tstat_offset <= MAX_BICC_BRANCH_DISPLACEMENT);
1458 	}
1459 
1460 	tcpu->tcpu_ibase = (caddr_t)((KERNELBASE - tstat_offset)
1461 		& TSTAT_TBA_MASK);
1462 	tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE;
1463 	tcpu->tcpu_vabase = tcpu->tcpu_ibase;
1464 
1465 	tcpu->tcpu_pfn = vmem_alloc(tstat_arena, tstat_total_pages, VM_SLEEP);
1466 	bzero(tcpu->tcpu_pfn, tstat_total_pages);
1467 	pfn = tcpu->tcpu_pfn;
1468 
1469 	tcpu->tcpu_instr = vmem_alloc(tstat_arena, TSTAT_INSTR_SIZE, VM_SLEEP);
1470 
1471 	va = (caddr_t)tcpu->tcpu_instr;
1472 	for (i = 0; i < TSTAT_INSTR_PAGES; i++, va += MMU_PAGESIZE)
1473 		*pfn++ = hat_getpfnum(kas.a_hat, va);
1474 
1475 	/*
1476 	 * We must be sure that the pages that we will use to examine the data
1477 	 * have the same virtual color as the pages to which the data is being
1478 	 * recorded, hence the alignment and phase constraints on the
1479 	 * allocation.
1480 	 */
1481 	tcpu->tcpu_data = vmem_xalloc(tstat_arena, tstat_data_size,
1482 	    shm_alignment, (uintptr_t)tcpu->tcpu_dbase & (shm_alignment - 1),
1483 	    0, 0, NULL, VM_SLEEP);
1484 	bzero(tcpu->tcpu_data, tstat_data_size);
1485 	tcpu->tcpu_data->tdata_cpuid = cpu;
1486 
1487 	va = (caddr_t)tcpu->tcpu_data;
1488 	for (i = 0; i < tstat_data_pages; i++, va += MMU_PAGESIZE)
1489 		*pfn++ = hat_getpfnum(kas.a_hat, va);
1490 #else /* sun4v */
1491 	ASSERT(!(tstat_total_size > (1 + ~TSTAT_TBA_MASK)));
1492 	tcpu->tcpu_vabase = (caddr_t)(KERNELBASE - MMU_PAGESIZE4M);
1493 	tcpu->tcpu_ibase = tcpu->tcpu_vabase + (cpu * (1 + ~TSTAT_TBA_MASK));
1494 	tcpu->tcpu_dbase = tcpu->tcpu_ibase + TSTAT_INSTR_SIZE;
1495 
1496 	tcpu->tcpu_pfn = &tstat_pfn;
1497 	tcpu->tcpu_instr = (tstat_instr_t *)(tstat_va + (cpu *
1498 		(1 + ~TSTAT_TBA_MASK)));
1499 	tcpu->tcpu_data = (tstat_data_t *)(tstat_va + (cpu *
1500 		(1 + ~TSTAT_TBA_MASK)) + TSTAT_INSTR_SIZE);
1501 	bzero(tcpu->tcpu_data, tstat_data_size);
1502 	tcpu->tcpu_data->tdata_cpuid = cpu;
1503 #endif /* sun4v */
1504 
1505 	/*
1506 	 * Now that we have all of the instruction and data pages allocated,
1507 	 * make the trap table from scratch.
1508 	 */
1509 	trapstat_make_traptab(tcpu);
1510 
1511 	if (tstat_options & TSTAT_OPT_TLBDATA) {
1512 		/*
1513 		 * TLB Statistics have been specified; set up the I- and D-TLB
1514 		 * entries and corresponding TLB return entries.
1515 		 */
1516 #ifndef sun4v
1517 		trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
1518 		trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
1519 #else
1520 		if (tstat_fast_tlbstat) {
1521 			trapstat_tlbent(tcpu, TSTAT_ENT_IMMUMISS);
1522 			trapstat_tlbent(tcpu, TSTAT_ENT_DMMUMISS);
1523 		} else {
1524 			trapstat_tlbent(tcpu, TSTAT_ENT_ITLBMISS);
1525 			trapstat_tlbent(tcpu, TSTAT_ENT_DTLBMISS);
1526 		}
1527 #endif
1528 	}
1529 
1530 	tcpu->tcpu_flags |= TSTAT_CPU_ALLOCATED;
1531 
1532 	/*
1533 	 * Finally, get the target CPU to load the locked pages into its TLBs.
1534 	 */
1535 	xc_one(cpu, (xcfunc_t *)trapstat_load_tlb, 0, 0);
1536 }
1537 
1538 static void
1539 trapstat_teardown(processorid_t cpu)
1540 {
1541 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1542 #ifndef sun4v
1543 	int i;
1544 #endif
1545 	caddr_t va = tcpu->tcpu_vabase;
1546 
1547 	ASSERT(tcpu->tcpu_pfn != NULL);
1548 	ASSERT(tcpu->tcpu_instr != NULL);
1549 	ASSERT(tcpu->tcpu_data != NULL);
1550 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1551 	ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1552 	ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1553 	ASSERT(MUTEX_HELD(&cpu_lock));
1554 	ASSERT(MUTEX_HELD(&tstat_lock));
1555 
1556 #ifndef sun4v
1557 	vmem_free(tstat_arena, tcpu->tcpu_pfn, tstat_total_pages);
1558 	vmem_free(tstat_arena, tcpu->tcpu_instr, TSTAT_INSTR_SIZE);
1559 	vmem_free(tstat_arena, tcpu->tcpu_data, tstat_data_size);
1560 
1561 	for (i = 0; i < tstat_total_pages; i++, va += MMU_PAGESIZE) {
1562 		xt_one(cpu, vtag_flushpage_tl1, (uint64_t)va,
1563 		    (uint64_t)ksfmmup);
1564 	}
1565 #else
1566 	xt_one(cpu, vtag_unmap_perm_tl1, (uint64_t)va, KCONTEXT);
1567 #endif
1568 
1569 	tcpu->tcpu_pfn = NULL;
1570 	tcpu->tcpu_instr = NULL;
1571 	tcpu->tcpu_data = NULL;
1572 	tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
1573 }
1574 
1575 static int
1576 trapstat_go()
1577 {
1578 	cpu_t *cp;
1579 
1580 	mutex_enter(&cpu_lock);
1581 	mutex_enter(&tstat_lock);
1582 
1583 	if (tstat_running) {
1584 		mutex_exit(&tstat_lock);
1585 		mutex_exit(&cpu_lock);
1586 		return (EBUSY);
1587 	}
1588 
1589 #ifdef sun4v
1590 	/*
1591 	 * Allocate large page to hold interposing tables.
1592 	 */
1593 	tstat_va = contig_mem_alloc(MMU_PAGESIZE4M);
1594 	tstat_pfn = va_to_pfn(tstat_va);
1595 	if (tstat_pfn == PFN_INVALID)
1596 		return (EAGAIN);
1597 
1598 	/*
1599 	 * For detailed TLB statistics, invoke CPU specific interface
1600 	 * to see if it supports a low overhead interface to collect
1601 	 * TSB hit statistics. If so, make set tstat_fast_tlbstat flag
1602 	 * to reflect that.
1603 	 */
1604 	if (tstat_options & TSTAT_OPT_TLBDATA) {
1605 		int error;
1606 
1607 		error = cpu_trapstat_conf(CPU_TSTATCONF_INIT);
1608 		if (error == 0)
1609 			tstat_fast_tlbstat = B_TRUE;
1610 		else if (error != ENOTSUP) {
1611 			contig_mem_free(tstat_va, MMU_PAGESIZE4M);
1612 			return (error);
1613 		}
1614 	}
1615 #endif
1616 
1617 	/*
1618 	 * First, perform any necessary hot patching.
1619 	 */
1620 	trapstat_hotpatch();
1621 
1622 	/*
1623 	 * Allocate the resources we'll need to measure probe effect.
1624 	 */
1625 	trapstat_probe_alloc();
1626 
1627 
1628 	cp = cpu_list;
1629 	do {
1630 		if (!(tstat_percpu[cp->cpu_id].tcpu_flags & TSTAT_CPU_SELECTED))
1631 			continue;
1632 
1633 		trapstat_setup(cp->cpu_id);
1634 
1635 		/*
1636 		 * Note that due to trapstat_probe()'s use of global data,
1637 		 * we determine the probe effect on each CPU serially instead
1638 		 * of in parallel with an xc_all().
1639 		 */
1640 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_probe, 0, 0);
1641 	} while ((cp = cp->cpu_next) != cpu_list);
1642 
1643 	xc_all((xcfunc_t *)trapstat_enable, 0, 0);
1644 
1645 	trapstat_probe_free();
1646 	tstat_running = 1;
1647 	mutex_exit(&tstat_lock);
1648 	mutex_exit(&cpu_lock);
1649 
1650 	return (0);
1651 }
1652 
1653 static int
1654 trapstat_stop()
1655 {
1656 	int i;
1657 
1658 	mutex_enter(&cpu_lock);
1659 	mutex_enter(&tstat_lock);
1660 	if (!tstat_running) {
1661 		mutex_exit(&tstat_lock);
1662 		mutex_exit(&cpu_lock);
1663 		return (ENXIO);
1664 	}
1665 
1666 	xc_all((xcfunc_t *)trapstat_disable, 0, 0);
1667 
1668 	for (i = 0; i <= max_cpuid; i++) {
1669 		if (tstat_percpu[i].tcpu_flags & TSTAT_CPU_ALLOCATED)
1670 			trapstat_teardown(i);
1671 	}
1672 
1673 #ifdef sun4v
1674 	if (tstat_options & TSTAT_OPT_TLBDATA)
1675 		cpu_trapstat_conf(CPU_TSTATCONF_FINI);
1676 	contig_mem_free(tstat_va, MMU_PAGESIZE4M);
1677 #endif
1678 	trapstat_hotpatch();
1679 	tstat_running = 0;
1680 	mutex_exit(&tstat_lock);
1681 	mutex_exit(&cpu_lock);
1682 
1683 	return (0);
1684 }
1685 
1686 /*
1687  * This is trapstat's DR CPU configuration callback.  It's called (with
1688  * cpu_lock held) to unconfigure a newly powered-off CPU, or to configure a
1689  * powered-off CPU that is to be brought into the system.  We need only take
1690  * action in the unconfigure case:  because a powered-off CPU will have its
1691  * trap table restored to KERNELBASE if it is ever powered back on, we must
1692  * update the flags to reflect that trapstat is no longer enabled on the
1693  * powered-off CPU.  Note that this means that a TSTAT_CPU_ENABLED CPU that
1694  * is unconfigured/powered off and later powered back on/reconfigured will
1695  * _not_ be re-TSTAT_CPU_ENABLED.
1696  */
1697 static int
1698 trapstat_cpu_setup(cpu_setup_t what, processorid_t cpu)
1699 {
1700 	tstat_percpu_t *tcpu = &tstat_percpu[cpu];
1701 
1702 	ASSERT(MUTEX_HELD(&cpu_lock));
1703 	mutex_enter(&tstat_lock);
1704 
1705 	if (!tstat_running) {
1706 		mutex_exit(&tstat_lock);
1707 		return (0);
1708 	}
1709 
1710 	switch (what) {
1711 	case CPU_CONFIG:
1712 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1713 		break;
1714 
1715 	case CPU_UNCONFIG:
1716 		if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED) {
1717 			tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
1718 #ifdef	sun4v
1719 			/*
1720 			 * A power-off, causes the cpu mondo queues to be
1721 			 * unconfigured on sun4v. Since we can't teardown
1722 			 * trapstat's mappings on the cpu that is going away,
1723 			 * we simply mark it as not allocated. This will
1724 			 * prevent a teardown on a cpu with the same cpu id
1725 			 * that might have been added while trapstat is running.
1726 			 */
1727 			if (tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED) {
1728 				tcpu->tcpu_pfn = NULL;
1729 				tcpu->tcpu_instr = NULL;
1730 				tcpu->tcpu_data = NULL;
1731 				tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
1732 			}
1733 #endif
1734 		}
1735 		break;
1736 
1737 	default:
1738 		break;
1739 	}
1740 
1741 	mutex_exit(&tstat_lock);
1742 	return (0);
1743 }
1744 
1745 /*
1746  * This is called before a CPR suspend and after a CPR resume.  We don't have
1747  * anything to do before a suspend, but after a restart we must restore the
1748  * trap table to be our interposing trap table.  However, we don't actually
1749  * know whether or not the CPUs have been powered off -- this routine may be
1750  * called while restoring from a failed CPR suspend.  We thus run through each
1751  * TSTAT_CPU_ENABLED CPU, and explicitly destroy and reestablish its
1752  * interposing trap table.  This assures that our state is correct regardless
1753  * of whether or not the CPU has been newly powered on.
1754  */
1755 /*ARGSUSED*/
1756 static boolean_t
1757 trapstat_cpr(void *arg, int code)
1758 {
1759 	cpu_t *cp;
1760 
1761 	if (code == CB_CODE_CPR_CHKPT)
1762 		return (B_TRUE);
1763 
1764 	ASSERT(code == CB_CODE_CPR_RESUME);
1765 
1766 	mutex_enter(&cpu_lock);
1767 	mutex_enter(&tstat_lock);
1768 
1769 	if (!tstat_running) {
1770 		mutex_exit(&tstat_lock);
1771 		mutex_exit(&cpu_lock);
1772 		return (B_TRUE);
1773 	}
1774 
1775 	cp = cpu_list;
1776 	do {
1777 		tstat_percpu_t *tcpu = &tstat_percpu[cp->cpu_id];
1778 
1779 		if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
1780 			continue;
1781 
1782 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
1783 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1784 
1785 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_disable, 0, 0);
1786 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
1787 
1788 		/*
1789 		 * Preserve this CPU's data in tstat_buffer and rip down its
1790 		 * interposing trap table.
1791 		 */
1792 		bcopy(tcpu->tcpu_data, tstat_buffer, tstat_data_t_size);
1793 		trapstat_teardown(cp->cpu_id);
1794 		ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED));
1795 
1796 		/*
1797 		 * Reestablish the interposing trap table and restore the old
1798 		 * data.
1799 		 */
1800 		trapstat_setup(cp->cpu_id);
1801 		ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
1802 		bcopy(tstat_buffer, tcpu->tcpu_data, tstat_data_t_size);
1803 
1804 		xc_one(cp->cpu_id, (xcfunc_t *)trapstat_enable, 0, 0);
1805 	} while ((cp = cp->cpu_next) != cpu_list);
1806 
1807 	mutex_exit(&tstat_lock);
1808 	mutex_exit(&cpu_lock);
1809 
1810 	return (B_TRUE);
1811 }
1812 
1813 /*ARGSUSED*/
1814 static int
1815 trapstat_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
1816 {
1817 	int i;
1818 
1819 	mutex_enter(&cpu_lock);
1820 	mutex_enter(&tstat_lock);
1821 	if (tstat_open != 0) {
1822 		mutex_exit(&tstat_lock);
1823 		mutex_exit(&cpu_lock);
1824 		return (EBUSY);
1825 	}
1826 
1827 	/*
1828 	 * Register this in open() rather than in attach() to prevent deadlock
1829 	 * with DR code. During attach, I/O device tree locks are grabbed
1830 	 * before trapstat_attach() is invoked - registering in attach
1831 	 * will result in the lock order: device tree lock, cpu_lock.
1832 	 * DR code however requires that cpu_lock be acquired before
1833 	 * device tree locks.
1834 	 */
1835 	ASSERT(!tstat_running);
1836 	register_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
1837 
1838 	/*
1839 	 * Clear all options.  And until specific CPUs are specified, we'll
1840 	 * mark all CPUs as selected.
1841 	 */
1842 	tstat_options = 0;
1843 
1844 	for (i = 0; i <= max_cpuid; i++)
1845 		tstat_percpu[i].tcpu_flags |= TSTAT_CPU_SELECTED;
1846 
1847 	/*
1848 	 * By default, all traps at TL=0 are enabled.  Traps at TL>0 must
1849 	 * be disabled.
1850 	 */
1851 	for (i = 0; i < TSTAT_TOTAL_NENT; i++)
1852 		tstat_enabled[i] = i < TSTAT_NENT ? 1 : 0;
1853 
1854 	tstat_open = 1;
1855 	mutex_exit(&tstat_lock);
1856 	mutex_exit(&cpu_lock);
1857 
1858 	return (0);
1859 }
1860 
1861 /*ARGSUSED*/
1862 static int
1863 trapstat_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
1864 {
1865 	(void) trapstat_stop();
1866 
1867 	ASSERT(!tstat_running);
1868 
1869 	mutex_enter(&cpu_lock);
1870 	unregister_cpu_setup_func((cpu_setup_func_t *)trapstat_cpu_setup, NULL);
1871 	mutex_exit(&cpu_lock);
1872 
1873 	tstat_open = 0;
1874 	return (DDI_SUCCESS);
1875 }
1876 
1877 static int
1878 trapstat_option(int option)
1879 {
1880 	mutex_enter(&tstat_lock);
1881 
1882 	if (tstat_running) {
1883 		mutex_exit(&tstat_lock);
1884 		return (EBUSY);
1885 	}
1886 
1887 	tstat_options |= option;
1888 	mutex_exit(&tstat_lock);
1889 
1890 	return (0);
1891 }
1892 
1893 /*ARGSUSED*/
1894 static int
1895 trapstat_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *crd, int *rval)
1896 {
1897 	int i, j, out;
1898 	size_t dsize;
1899 
1900 	switch (cmd) {
1901 	case TSTATIOC_GO:
1902 		return (trapstat_go());
1903 
1904 	case TSTATIOC_NOGO:
1905 		return (trapstat_option(TSTAT_OPT_NOGO));
1906 
1907 	case TSTATIOC_STOP:
1908 		return (trapstat_stop());
1909 
1910 	case TSTATIOC_CPU:
1911 		if (arg < 0 || arg > max_cpuid)
1912 			return (EINVAL);
1913 		/*FALLTHROUGH*/
1914 
1915 	case TSTATIOC_NOCPU:
1916 		mutex_enter(&tstat_lock);
1917 
1918 		if (tstat_running) {
1919 			mutex_exit(&tstat_lock);
1920 			return (EBUSY);
1921 		}
1922 
1923 		/*
1924 		 * If this is the first CPU to be specified (or if we are
1925 		 * being asked to explicitly de-select CPUs), disable all CPUs.
1926 		 */
1927 		if (!(tstat_options & TSTAT_OPT_CPU) || cmd == TSTATIOC_NOCPU) {
1928 			tstat_options |= TSTAT_OPT_CPU;
1929 
1930 			for (i = 0; i <= max_cpuid; i++) {
1931 				tstat_percpu_t *tcpu = &tstat_percpu[i];
1932 
1933 				ASSERT(cmd == TSTATIOC_NOCPU ||
1934 				    (tcpu->tcpu_flags & TSTAT_CPU_SELECTED));
1935 				tcpu->tcpu_flags &= ~TSTAT_CPU_SELECTED;
1936 			}
1937 		}
1938 
1939 		if (cmd == TSTATIOC_CPU)
1940 			tstat_percpu[arg].tcpu_flags |= TSTAT_CPU_SELECTED;
1941 
1942 		mutex_exit(&tstat_lock);
1943 
1944 		return (0);
1945 
1946 	case TSTATIOC_ENTRY:
1947 		mutex_enter(&tstat_lock);
1948 
1949 		if (tstat_running) {
1950 			mutex_exit(&tstat_lock);
1951 			return (EBUSY);
1952 		}
1953 
1954 		if (arg >= TSTAT_NENT || arg < 0) {
1955 			mutex_exit(&tstat_lock);
1956 			return (EINVAL);
1957 		}
1958 
1959 		if (!(tstat_options & TSTAT_OPT_ENTRY)) {
1960 			/*
1961 			 * If this is the first entry that we are explicitly
1962 			 * enabling, explicitly disable every TL=0 entry.
1963 			 */
1964 			for (i = 0; i < TSTAT_NENT; i++)
1965 				tstat_enabled[i] = 0;
1966 
1967 			tstat_options |= TSTAT_OPT_ENTRY;
1968 		}
1969 
1970 		tstat_enabled[arg] = 1;
1971 		mutex_exit(&tstat_lock);
1972 		return (0);
1973 
1974 	case TSTATIOC_NOENTRY:
1975 		mutex_enter(&tstat_lock);
1976 
1977 		if (tstat_running) {
1978 			mutex_exit(&tstat_lock);
1979 			return (EBUSY);
1980 		}
1981 
1982 		for (i = 0; i < TSTAT_NENT; i++)
1983 			tstat_enabled[i] = 0;
1984 
1985 		mutex_exit(&tstat_lock);
1986 		return (0);
1987 
1988 	case TSTATIOC_READ:
1989 		mutex_enter(&tstat_lock);
1990 
1991 		if (tstat_options & TSTAT_OPT_TLBDATA) {
1992 			dsize = tstat_data_t_exported_size;
1993 		} else {
1994 			dsize = sizeof (tstat_data_t);
1995 		}
1996 
1997 		for (i = 0, out = 0; i <= max_cpuid; i++) {
1998 			tstat_percpu_t *tcpu = &tstat_percpu[i];
1999 
2000 			if (!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED))
2001 				continue;
2002 
2003 			ASSERT(tcpu->tcpu_flags & TSTAT_CPU_SELECTED);
2004 			ASSERT(tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED);
2005 
2006 			tstat_buffer->tdata_cpuid = -1;
2007 			xc_one(i, (xcfunc_t *)trapstat_snapshot, 0, 0);
2008 
2009 			if (tstat_buffer->tdata_cpuid == -1) {
2010 				/*
2011 				 * This CPU is not currently responding to
2012 				 * cross calls; we have caught it while it is
2013 				 * being unconfigured.  We'll drop tstat_lock
2014 				 * and pick up and drop cpu_lock.  By the
2015 				 * time we acquire cpu_lock, the DR operation
2016 				 * will appear consistent and we can assert
2017 				 * that trapstat_cpu_setup() has cleared
2018 				 * TSTAT_CPU_ENABLED.
2019 				 */
2020 				mutex_exit(&tstat_lock);
2021 				mutex_enter(&cpu_lock);
2022 				mutex_exit(&cpu_lock);
2023 				mutex_enter(&tstat_lock);
2024 				ASSERT(!(tcpu->tcpu_flags & TSTAT_CPU_ENABLED));
2025 				continue;
2026 			}
2027 
2028 			/*
2029 			 * Need to compensate for the difference between page
2030 			 * sizes exported to users and page sizes available
2031 			 * within the kernel.
2032 			 */
2033 			if ((tstat_options & TSTAT_OPT_TLBDATA) &&
2034 			    (tstat_pgszs != tstat_user_pgszs)) {
2035 				tstat_pgszdata_t *tp;
2036 				uint_t szc;
2037 
2038 				tp = &tstat_buffer->tdata_pgsz[0];
2039 				for (j = 0; j < tstat_user_pgszs; j++) {
2040 					if ((szc = USERSZC_2_SZC(j)) != j) {
2041 						bcopy(&tp[szc], &tp[j],
2042 						    sizeof (tstat_pgszdata_t));
2043 					}
2044 				}
2045 			}
2046 
2047 			if (copyout(tstat_buffer, (void *)arg, dsize) != 0) {
2048 				mutex_exit(&tstat_lock);
2049 				return (EFAULT);
2050 			}
2051 
2052 			out++;
2053 			arg += dsize;
2054 		}
2055 
2056 		if (out != max_cpuid + 1) {
2057 			processorid_t cpuid = -1;
2058 			arg += offsetof(tstat_data_t, tdata_cpuid);
2059 
2060 			if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0) {
2061 				mutex_exit(&tstat_lock);
2062 				return (EFAULT);
2063 			}
2064 		}
2065 
2066 		mutex_exit(&tstat_lock);
2067 
2068 		return (0);
2069 
2070 	case TSTATIOC_TLBDATA:
2071 		return (trapstat_option(TSTAT_OPT_TLBDATA));
2072 
2073 	default:
2074 		break;
2075 	}
2076 
2077 	return (ENOTTY);
2078 }
2079 
2080 /*ARGSUSED*/
2081 static int
2082 trapstat_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
2083 {
2084 	int error;
2085 
2086 	switch (infocmd) {
2087 	case DDI_INFO_DEVT2DEVINFO:
2088 		*result = (void *)tstat_devi;
2089 		error = DDI_SUCCESS;
2090 		break;
2091 	case DDI_INFO_DEVT2INSTANCE:
2092 		*result = (void *)0;
2093 		error = DDI_SUCCESS;
2094 		break;
2095 	default:
2096 		error = DDI_FAILURE;
2097 	}
2098 	return (error);
2099 }
2100 
2101 static int
2102 trapstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
2103 {
2104 	switch (cmd) {
2105 	case DDI_ATTACH:
2106 		break;
2107 
2108 	case DDI_RESUME:
2109 		return (DDI_SUCCESS);
2110 
2111 	default:
2112 		return (DDI_FAILURE);
2113 	}
2114 
2115 	if (ddi_create_minor_node(devi, "trapstat", S_IFCHR,
2116 	    0, DDI_PSEUDO, 0) == DDI_FAILURE) {
2117 		ddi_remove_minor_node(devi, NULL);
2118 		return (DDI_FAILURE);
2119 	}
2120 
2121 	ddi_report_dev(devi);
2122 	tstat_devi = devi;
2123 
2124 	tstat_pgszs = page_num_pagesizes();
2125 	tstat_user_pgszs = page_num_user_pagesizes();
2126 	tstat_data_t_size = sizeof (tstat_data_t) +
2127 	    (tstat_pgszs - 1) * sizeof (tstat_pgszdata_t);
2128 	tstat_data_t_exported_size = sizeof (tstat_data_t) +
2129 	    (tstat_user_pgszs - 1) * sizeof (tstat_pgszdata_t);
2130 #ifndef sun4v
2131 	tstat_data_pages = (tstat_data_t_size >> MMU_PAGESHIFT) + 1;
2132 	tstat_total_pages = TSTAT_INSTR_PAGES + tstat_data_pages;
2133 	tstat_data_size = tstat_data_pages * MMU_PAGESIZE;
2134 	tstat_total_size = TSTAT_INSTR_SIZE + tstat_data_size;
2135 #else
2136 	tstat_data_pages = 0;
2137 	tstat_data_size = tstat_data_t_size;
2138 	tstat_total_pages = ((TSTAT_INSTR_SIZE + tstat_data_size) >>
2139 		MMU_PAGESHIFT) + 1;
2140 	tstat_total_size = tstat_total_pages * MMU_PAGESIZE;
2141 #endif
2142 
2143 	tstat_percpu = kmem_zalloc((max_cpuid + 1) *
2144 	    sizeof (tstat_percpu_t), KM_SLEEP);
2145 
2146 	/*
2147 	 * Create our own arena backed by segkmem to assure a source of
2148 	 * MMU_PAGESIZE-aligned allocations.  We allocate out of the
2149 	 * heap32_arena to assure that we can address the allocated memory with
2150 	 * a single sethi/simm13 pair in the interposing trap table entries.
2151 	 */
2152 	tstat_arena = vmem_create("trapstat", NULL, 0, MMU_PAGESIZE,
2153 	    segkmem_alloc_permanent, segkmem_free, heap32_arena, 0, VM_SLEEP);
2154 
2155 	tstat_enabled = kmem_alloc(TSTAT_TOTAL_NENT * sizeof (int), KM_SLEEP);
2156 	tstat_buffer = kmem_alloc(tstat_data_t_size, KM_SLEEP);
2157 
2158 	/*
2159 	 * CB_CL_CPR_POST_USER is the class that executes from cpr_resume()
2160 	 * after user threads can be restarted.  By executing in this class,
2161 	 * we are assured of the availability of system services needed to
2162 	 * resume trapstat (specifically, we are assured that all CPUs are
2163 	 * restarted and responding to cross calls).
2164 	 */
2165 	tstat_cprcb =
2166 	    callb_add(trapstat_cpr, NULL, CB_CL_CPR_POST_USER, "trapstat");
2167 
2168 	return (DDI_SUCCESS);
2169 }
2170 
2171 static int
2172 trapstat_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
2173 {
2174 	int rval;
2175 
2176 	ASSERT(devi == tstat_devi);
2177 
2178 	switch (cmd) {
2179 	case DDI_DETACH:
2180 		break;
2181 
2182 	case DDI_SUSPEND:
2183 		return (DDI_SUCCESS);
2184 
2185 	default:
2186 		return (DDI_FAILURE);
2187 	}
2188 
2189 	ASSERT(!tstat_running);
2190 
2191 	rval = callb_delete(tstat_cprcb);
2192 	ASSERT(rval == 0);
2193 
2194 	kmem_free(tstat_buffer, tstat_data_t_size);
2195 	kmem_free(tstat_enabled, TSTAT_TOTAL_NENT * sizeof (int));
2196 	vmem_destroy(tstat_arena);
2197 	kmem_free(tstat_percpu, (max_cpuid + 1) * sizeof (tstat_percpu_t));
2198 	ddi_remove_minor_node(devi, NULL);
2199 
2200 	return (DDI_SUCCESS);
2201 }
2202 
2203 /*
2204  * Configuration data structures
2205  */
/*
 * Character/block entry points for the trapstat device.  The driver supplies
 * its own open, close and ioctl routines; every other slot is filled with a
 * generic stub (nulldev, nodev, nochpoll) or the default ddi_prop_op.
 */
static struct cb_ops trapstat_cb_ops = {
	trapstat_open,		/* open */
	trapstat_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	trapstat_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_MP | D_NEW		/* Driver compatibility flag */
};
2223 
/*
 * Device operations for the trapstat driver:  getinfo, attach and detach are
 * implemented by this file, identify/probe/reset use nulldev, and the
 * character/block entry points are supplied through trapstat_cb_ops.  There
 * are no bus operations.
 */
static struct dev_ops trapstat_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	trapstat_info,		/* getinfo */
	nulldev,		/* identify */
	nulldev,		/* probe */
	trapstat_attach,	/* attach */
	trapstat_detach,	/* detach */
	nulldev,		/* reset */
	&trapstat_cb_ops,	/* cb_ops */
	(struct bus_ops *)0,	/* bus_ops */
};
2236 
/*
 * Module linkage information for a device driver:  ties the module name
 * "Trap Statistics" to trapstat_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	"Trap Statistics",	/* name of module */
	&trapstat_ops,		/* driver ops */
};
2242 
/*
 * Top-level linkage structure handed to mod_install(), mod_remove() and
 * mod_info() by _init(), _fini() and _info(); its NULL-terminated list
 * contains only modldrv.
 */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
2246 
2247 int
2248 _init(void)
2249 {
2250 	return (mod_install(&modlinkage));
2251 }
2252 
2253 int
2254 _fini(void)
2255 {
2256 	return (mod_remove(&modlinkage));
2257 }
2258 
2259 int
2260 _info(struct modinfo *modinfop)
2261 {
2262 	return (mod_info(&modlinkage, modinfop));
2263 }
2264