xref: /freebsd/sys/dev/hwpmc/hwpmc_mod.c (revision ebccf1e3a6b11b97cbf5f813dd76636e892a9035)
1 /*-
2  * Copyright (c) 2003-2005 Joseph Koshy
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include <sys/param.h>
32 #include <sys/eventhandler.h>
33 #include <sys/jail.h>
34 #include <sys/kernel.h>
35 #include <sys/limits.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/module.h>
39 #include <sys/mutex.h>
40 #include <sys/pmc.h>
41 #include <sys/pmckern.h>
42 #include <sys/proc.h>
43 #include <sys/queue.h>
44 #include <sys/sched.h>
45 #include <sys/signalvar.h>
46 #include <sys/smp.h>
47 #include <sys/sx.h>
48 #include <sys/sysctl.h>
49 #include <sys/sysent.h>
50 #include <sys/systm.h>
51 
52 #include <machine/md_var.h>
53 #include <machine/pmc_mdep.h>
54 #include <machine/specialreg.h>
55 
56 /*
57  * Types
58  */
59 
60 enum pmc_flags {
61 	PMC_FLAG_NONE	  = 0x00, /* do nothing */
62 	PMC_FLAG_REMOVE   = 0x01, /* atomically remove entry from hash */
63 	PMC_FLAG_ALLOCATE = 0x02, /* add entry to hash if not found */
64 };
65 
66 /*
67  * The offset in sysent where the syscall is allocated.
68  */
69 
70 static int pmc_syscall_num = NO_SYSCALL;
71 struct pmc_cpu		**pmc_pcpu;	 /* per-cpu state */
72 pmc_value_t		*pmc_pcpu_saved; /* saved PMC values: CSW handling */
73 
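/*
 * PMC_PCPU_SAVED(C,R) addresses the saved-value array as a
 * two-dimensional table with one row of 'md->pmd_npmc' entries per
 * CPU: the (CPU C, row R) entry lives at flat index R + pmd_npmc * C.
 */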
74 #define	PMC_PCPU_SAVED(C,R)	pmc_pcpu_saved[(R) + md->pmd_npmc*(C)]
75 
76 struct mtx_pool		*pmc_mtxpool;
77 static int		*pmc_pmcdisp;	 /* PMC row dispositions */
78 
79 #define	PMC_ROW_DISP_IS_FREE(R)		(pmc_pmcdisp[(R)] == 0)
80 #define	PMC_ROW_DISP_IS_THREAD(R)	(pmc_pmcdisp[(R)] > 0)
81 #define	PMC_ROW_DISP_IS_STANDALONE(R)	(pmc_pmcdisp[(R)] < 0)
82 
83 #define	PMC_MARK_ROW_FREE(R) do {					  \
84 	pmc_pmcdisp[(R)] = 0;						  \
85 } while (0)
86 
87 #define	PMC_MARK_ROW_STANDALONE(R) do {					  \
88 	KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
89 		    __LINE__));						  \
90 	atomic_add_int(&pmc_pmcdisp[(R)], -1);				  \
91 	KASSERT(pmc_pmcdisp[(R)] >= (-mp_ncpus), ("[pmc,%d] row "	  \
92 		"disposition error", __LINE__));			  \
93 } while (0)
94 
95 #define	PMC_UNMARK_ROW_STANDALONE(R) do { 				  \
96 	atomic_add_int(&pmc_pmcdisp[(R)], 1);				  \
97 	KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
98 		    __LINE__));						  \
99 } while (0)
100 
101 #define	PMC_MARK_ROW_THREAD(R) do {					  \
102 	KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
103 		    __LINE__));						  \
104 	atomic_add_int(&pmc_pmcdisp[(R)], 1);				  \
105 } while (0)
106 
107 #define	PMC_UNMARK_ROW_THREAD(R) do {					  \
108 	atomic_add_int(&pmc_pmcdisp[(R)], -1);				  \
109 	KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
110 		    __LINE__));						  \
111 } while (0)
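/*
 * Taken together, these macros treat pmc_pmcdisp[R] as a signed
 * reference count on row R: each system-wide (standalone) PMC on the
 * row subtracts one (bounded below by -mp_ncpus) and each process-mode
 * PMC adds one, so the sign of the counter encodes the row's current
 * disposition and the two modes can never share a row.
 */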
112 
113 
114 /* various event handlers */
115 static eventhandler_tag	pmc_exit_tag, pmc_fork_tag;
116 
117 /* Module statistics */
118 struct pmc_op_getdriverstats pmc_stats;
119 
120 /* Machine/processor dependent operations */
121 struct pmc_mdep  *md;
122 
123 /*
124  * Hash tables mapping owner processes and target processes to PMCs.
125  */
126 
127 struct mtx pmc_processhash_mtx;		/* spin mutex */
128 static u_long pmc_processhashmask;
129 static LIST_HEAD(pmc_processhash, pmc_process)	*pmc_processhash;
130 
131 /*
132  * Hash table of PMC owner descriptors.  This table is protected by
133  * the shared PMC "sx" lock.
134  */
135 
136 static u_long pmc_ownerhashmask;
137 static LIST_HEAD(pmc_ownerhash, pmc_owner)	*pmc_ownerhash;
138 
139 /*
140  * Prototypes
141  */
142 
143 #if	DEBUG
144 static int	pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS);
145 static int	pmc_debugflags_parse(char *newstr, char *fence);
146 #endif
147 
148 static int	load(struct module *module, int cmd, void *arg);
149 static int	pmc_syscall_handler(struct thread *td, void *syscall_args);
150 static int	pmc_configure_log(struct pmc_owner *po, int logfd);
151 static void	pmc_log_process_exit(struct pmc *pm, struct pmc_process *pp);
152 static struct pmc *pmc_allocate_pmc_descriptor(void);
153 static struct pmc *pmc_find_pmc_descriptor_in_process(struct pmc_owner *po,
154     pmc_id_t pmc);
155 static void	pmc_release_pmc_descriptor(struct pmc *pmc);
156 static int	pmc_can_allocate_rowindex(struct proc *p, unsigned int ri);
157 static struct pmc_process *pmc_find_process_descriptor(struct proc *p,
158     uint32_t mode);
159 static void	pmc_remove_process_descriptor(struct pmc_process *pp);
160 static struct pmc_owner *pmc_find_owner_descriptor(struct proc *p);
161 static int	pmc_find_pmc(pmc_id_t pmcid, struct pmc **pm);
162 static void	pmc_remove_owner(struct pmc_owner *po);
163 static void	pmc_maybe_remove_owner(struct pmc_owner *po);
164 static void	pmc_unlink_target_process(struct pmc *pmc,
165     struct pmc_process *pp);
166 static void	pmc_link_target_process(struct pmc *pm,
167     struct pmc_process *pp);
168 static void	pmc_unlink_owner(struct pmc *pmc);
169 static void	pmc_cleanup(void);
170 static void	pmc_save_cpu_binding(struct pmc_binding *pb);
171 static void	pmc_restore_cpu_binding(struct pmc_binding *pb);
172 static void	pmc_select_cpu(int cpu);
173 static void	pmc_process_exit(void *arg, struct proc *p);
174 static void	pmc_process_fork(void *arg, struct proc *p1,
175     struct proc *p2, int n);
176 static int	pmc_attach_one_process(struct proc *p, struct pmc *pm);
177 static int	pmc_attach_process(struct proc *p, struct pmc *pm);
178 static int	pmc_detach_one_process(struct proc *p, struct pmc *pm,
179     int flags);
180 static int	pmc_detach_process(struct proc *p, struct pmc *pm);
181 static int	pmc_start(struct pmc *pm);
182 static int	pmc_stop(struct pmc *pm);
183 static int	pmc_can_attach(struct pmc *pm, struct proc *p);
184 
185 /*
186  * Kernel tunables and sysctl(8) interface.
187  */
188 
189 #define PMC_SYSCTL_NAME_PREFIX "kern." PMC_MODULE_NAME "."
190 
191 SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters");
192 
193 #if	DEBUG
194 unsigned int pmc_debugflags = PMC_DEBUG_DEFAULT_FLAGS;
195 char	pmc_debugstr[PMC_DEBUG_STRSIZE];
196 TUNABLE_STR(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr,
197     sizeof(pmc_debugstr));
198 SYSCTL_PROC(_kern_hwpmc, OID_AUTO, debugflags,
199     CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_TUN,
200     0, 0, pmc_debugflags_sysctl_handler, "A", "debug flags");
201 #endif
202 
203 /*
204  * kern.hwpmc.hashsize -- determines the number of rows in the
205  * hash tables used to look up target processes and owners
206  */
207 
208 static int pmc_hashsize = PMC_HASH_SIZE;
209 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "hashsize", &pmc_hashsize);
210 SYSCTL_INT(_kern_hwpmc, OID_AUTO, hashsize, CTLFLAG_TUN|CTLFLAG_RD,
211     &pmc_hashsize, 0, "rows in hash tables");
212 
213 /*
214  * kern.hwpmc.pcpubuffersize -- the size of each per-cpu
215  * area for collecting PC samples.
216  */
217 
218 static int pmc_pcpu_buffer_size = PMC_PCPU_BUFFER_SIZE;
219 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "pcpubuffersize", &pmc_pcpu_buffer_size);
220 SYSCTL_INT(_kern_hwpmc, OID_AUTO, pcpubuffersize, CTLFLAG_TUN|CTLFLAG_RD,
221     &pmc_pcpu_buffer_size, 0, "size of per-cpu buffer in 4K pages");
222 
223 /*
224  * kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool.
225  */
226 
227 static int pmc_mtxpool_size = PMC_MTXPOOL_SIZE;
228 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "mtxpoolsize", &pmc_mtxpool_size);
229 SYSCTL_INT(_kern_hwpmc, OID_AUTO, mtxpoolsize, CTLFLAG_TUN|CTLFLAG_RD,
230     &pmc_mtxpool_size, 0, "size of spin mutex pool");
231 
232 
233 
234 /*
235  * security.bsd.unprivileged_syspmcs -- allow non-root processes to
236  * allocate system-wide PMCs.
237  *
238  * Allowing unprivileged processes to allocate system PMCs is convenient
239  * if system-wide measurements need to be taken concurrently with other
240  * per-process measurements.  This feature is turned off by default.
241  */
242 
243 SYSCTL_DECL(_security_bsd);
244 
245 static int pmc_unprivileged_syspmcs = 0;
246 TUNABLE_INT("security.bsd.unprivileged_syspmcs", &pmc_unprivileged_syspmcs);
247 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_syspmcs, CTLFLAG_RW,
248     &pmc_unprivileged_syspmcs, 0,
249     "allow unprivileged process to allocate system PMCs");
250 
251 #if	PMC_HASH_USE_CRC32
252 
253 #define	PMC_HASH_PTR(P,M)	(crc32(&(P), sizeof((P))) & (M))
254 
255 #else 	/* integer multiplication */
256 
257 #if	LONG_BIT == 64
258 #define	_PMC_HM		11400714819323198486u
259 #elif	LONG_BIT == 32
260 #define	_PMC_HM		2654435769u
261 #else
262 #error 	Must know the size of 'long' to compile
263 #endif
264 
265 /*
266  * Hash function.  Discard the lower 2 bits of the pointer since
267  * these are always zero for our uses.  The hash multiplier is
268  * round((2^LONG_BIT) * ((sqrt(5)-1)/2)).
269  */
270 
271 #define	PMC_HASH_PTR(P,M)	((((unsigned long) (P) >> 2) * _PMC_HM) & (M))
272 
273 #endif
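/*
 * Illustrative sketch (not part of the driver): with LONG_BIT == 32
 * and a 1024-bucket table (mask 0x3ff), a pointer P would be hashed as
 *
 *	bucket = (((unsigned long) P >> 2) * 2654435769u) & 0x3ff;
 *
 * Because the multiplier is odd, multiplication modulo 2^LONG_BIT
 * permutes the shifted pointer values before the mask selects a
 * bucket.
 */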
274 
275 /*
276  * Syscall structures
277  */
278 
279 /* The `sysent' for the new syscall */
280 static struct sysent pmc_sysent = {
281 	2,			/* sy_narg */
282 	pmc_syscall_handler	/* sy_call */
283 };
284 
285 static struct syscall_module_data pmc_syscall_mod = {
286 	load,
287 	NULL,
288 	&pmc_syscall_num,
289 	&pmc_sysent,
290 	{ 0, NULL }
291 };
292 
293 static moduledata_t pmc_mod = {
294 	PMC_MODULE_NAME,
295 	syscall_module_handler,
296 	&pmc_syscall_mod
297 };
298 
299 DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SMP, SI_ORDER_ANY);
300 MODULE_VERSION(pmc, PMC_VERSION);
301 
302 #if	DEBUG
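/*
 * Parse a debug-flags string: space-, tab- or comma-separated
 * three-letter keywords from the tables below, plus an optional
 * "level=<hex>" verbosity setting.  An assumed example of such a
 * string would be "mod,pmc,level=f".
 */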
303 static int
304 pmc_debugflags_parse(char *newstr, char *fence)
305 {
306 	char c, *p, *q;
307 	unsigned int tmpflags;
308 	int level;
309 	char tmpbuf[4];		/* 3 character keyword + '\0' */
310 
311 	tmpflags = 0;
312 	level = 0xF;	/* max verbosity */
313 
314 	p = newstr;
315 
316 	for (; p < fence && (c = *p);) {
317 
318 		/* skip separators */
319 		if (c == ' ' || c == '\t' || c == ',') {
320 			p++; continue;
321 		}
322 
323 		(void) strlcpy(tmpbuf, p, sizeof(tmpbuf));
324 
325 #define	CMP_SET_FLAG_MAJ(S,F)					\
326 		else if (strncmp(tmpbuf, S, 3) == 0)		\
327 			tmpflags |= __PMCDFMAJ(F)
328 
329 #define	CMP_SET_FLAG_MIN(S,F)					\
330 		else if (strncmp(tmpbuf, S, 3) == 0)		\
331 			tmpflags |= __PMCDFMIN(F)
332 
333 		if (fence - p > 6 && strncmp(p, "level=", 6) == 0) {
334 			p += 6;	/* skip over keyword */
335 			level = strtoul(p, &q, 16);
336 		}
337 		CMP_SET_FLAG_MAJ("mod", MOD);
338 		CMP_SET_FLAG_MAJ("pmc", PMC);
339 		CMP_SET_FLAG_MAJ("ctx", CTX);
340 		CMP_SET_FLAG_MAJ("own", OWN);
341 		CMP_SET_FLAG_MAJ("prc", PRC);
342 		CMP_SET_FLAG_MAJ("mdp", MDP);
343 		CMP_SET_FLAG_MAJ("cpu", CPU);
344 
345 		CMP_SET_FLAG_MIN("all", ALL);
346 		CMP_SET_FLAG_MIN("rel", REL);
347 		CMP_SET_FLAG_MIN("ops", OPS);
348 		CMP_SET_FLAG_MIN("ini", INI);
349 		CMP_SET_FLAG_MIN("fnd", FND);
350 		CMP_SET_FLAG_MIN("pmh", PMH);
351 		CMP_SET_FLAG_MIN("pms", PMS);
352 		CMP_SET_FLAG_MIN("orm", ORM);
353 		CMP_SET_FLAG_MIN("omr", OMR);
354 		CMP_SET_FLAG_MIN("tlk", TLK);
355 		CMP_SET_FLAG_MIN("tul", TUL);
356 		CMP_SET_FLAG_MIN("ext", EXT);
357 		CMP_SET_FLAG_MIN("exc", EXC);
358 		CMP_SET_FLAG_MIN("frk", FRK);
359 		CMP_SET_FLAG_MIN("att", ATT);
360 		CMP_SET_FLAG_MIN("swi", SWI);
361 		CMP_SET_FLAG_MIN("swo", SWO);
362 		CMP_SET_FLAG_MIN("reg", REG);
363 		CMP_SET_FLAG_MIN("alr", ALR);
364 		CMP_SET_FLAG_MIN("rea", REA);
365 		CMP_SET_FLAG_MIN("wri", WRI);
366 		CMP_SET_FLAG_MIN("cfg", CFG);
367 		CMP_SET_FLAG_MIN("sta", STA);
368 		CMP_SET_FLAG_MIN("sto", STO);
369 		CMP_SET_FLAG_MIN("bnd", BND);
370 		CMP_SET_FLAG_MIN("sel", SEL);
371 		else	/* unrecognized keyword */
372 			return EINVAL;
373 
374 		p += 4;	/* skip keyword and separator */
375 	}
376 
377 	pmc_debugflags = (tmpflags|level);
378 
379 	return 0;
380 }
381 
382 static int
383 pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS)
384 {
385 	char *fence, *newstr;
386 	int error;
387 	unsigned int n;
388 
389 	(void) arg1; (void) arg2; /* unused parameters */
390 
391 	n = sizeof(pmc_debugstr);
392 	MALLOC(newstr, char *, n, M_PMC, M_ZERO|M_WAITOK);
393 	(void) strlcpy(newstr, pmc_debugstr, sizeof(pmc_debugstr));
394 
395 	error = sysctl_handle_string(oidp, newstr, n, req);
396 
397 	/* if there is a new string, parse and copy it */
398 	if (error == 0 && req->newptr != NULL) {
399 		fence = newstr + (n < req->newlen ? n : req->newlen);
400 		if ((error = pmc_debugflags_parse(newstr, fence)) == 0)
401 			(void) strlcpy(pmc_debugstr, newstr,
402 			    sizeof(pmc_debugstr));
403 	}
404 
405 	FREE(newstr, M_PMC);
406 
407 	return error;
408 }
409 #endif
410 
411 /*
412  * Concurrency Control
413  *
414  * The driver manages the following data structures:
415  *
416  *   - target process descriptors, one per target process
417  *   - owner process descriptors (and attached lists), one per owner process
418  *   - lookup hash tables for owner and target processes
419  *   - PMC descriptors (and attached lists)
420  *   - per-cpu hardware state
421  *   - the 'hook' variable through which the kernel calls into
422  *     this module
423  *   - the machine hardware state (managed by the MD layer)
424  *
425  * These data structures are accessed from:
426  *
427  * - thread context-switch code
428  * - interrupt handlers (possibly on multiple cpus)
429  * - kernel threads on multiple cpus running on behalf of user
430  *   processes doing system calls
431  * - this driver's private kernel threads
432  *
433  * = Locks and Locking strategy =
434  *
435  * The driver uses four locking strategies for its operation:
436  *
437  * - There is a 'global' SX lock "pmc_sx" that is used to protect
438  *   its 'meta-data'.
439  *
440  *   Calls into the module (via syscall() or by the kernel) start with
441  *   this lock being held in exclusive mode.  Depending on the requested
442  *   operation, the lock may be downgraded to 'shared' mode to allow
443  *   more concurrent readers into the module.
444  *
445  *   This SX lock is held in exclusive mode for any operations that
446  *   modify the linkages between the driver's internal data structures.
447  *
448  *   The 'pmc_hook' function pointer is also protected by this lock.
449  *   It is only examined with the sx lock held in exclusive mode.  The
450  *   kernel module is allowed to be unloaded only with the sx lock
451  *   held in exclusive mode.  In normal syscall handling, after
452  *   acquiring the pmc_sx lock we first check that 'pmc_hook' is
453  *   non-null before proceeding.  This prevents races between the
454  *   thread unloading the module and other threads seeking to use the
455  *   module.
456  *
457  * - Lookups of target process structures happen at context switch
458  *   time and in other critical sections where sleeping is not
459  *   allowed, so they cannot take the global "pmc_sx" SX lock.  The
460  *   target process hash table is therefore protected by its own
461  *   private spin-mutex, "pmc_processhash_mtx", while the owner hash
462  *   table is only accessed with the "pmc_sx" lock held.  The
463  *   spin-mutex is a 'leaf' mutex: no other lock is acquired while it is held.
464  *
465  * - Interrupt handlers work in a lock free manner.  At interrupt
466  *   time, handlers look at the PMC pointer (phw->phw_pmc) configured
467  *   when the PMC was started.  If this pointer is NULL, the interrupt
468  *   is ignored after updating driver statistics.  We ensure that this
469  *   pointer is set (using an atomic operation if necessary) before the
470  *   PMC hardware is started.  Conversely, this pointer is unset atomically
471  *   only after the PMC hardware is stopped.
472  *
473  *   We ensure that everything needed for the operation of an
474  *   interrupt handler is available without it needing to acquire any
475  *   locks.  We also ensure that a PMC's software state is destroyed only
476  *   after the PMC is taken off hardware (on all CPUs).
477  *
478  * - Context-switch handling with process-private PMCs needs more
479  *   care.
480  *
481  *   A given process may be the target of multiple PMCs.  For example,
482  *   PMCATTACH and PMCDETACH may be requested by a process on one CPU
483  *   while the target process is running on another.  A PMC could also
484  *   be getting released because its owner is exiting.  We tackle
485  *   these situations in the following manner:
486  *
487  *   - each target process structure 'pmc_process' has an array
488  *     of 'struct pmc *' pointers, one for each hardware PMC.
489  *
490  *   - At context switch IN time, each "target" PMC in RUNNING state
491  *     gets started on hardware and a pointer to each PMC is copied into
492  *     the per-cpu phw array.  The 'runcount' for the PMC is
493  *     incremented.
494  *
495  *   - At context switch OUT time, all process-virtual PMCs are stopped
496  *     on hardware.  The saved value is added to the PMCs value field
497  *     only if the PMC is in a non-deleted state (the PMCs state could
498  *     have changed during the current time slice).
499  *
500  *     Note that in between a switch IN and a switch OUT on a
501  *     processor, the PMC could have been released on another CPU.  Therefore
502  *     context switch OUT always looks at the hardware state to turn
503  *     OFF PMCs and will update a PMC's saved value only if reachable
504  *     from the target process record.
505  *
506  *   - OP PMCRELEASE could be called on a PMC at any time (the PMC could
507  *     be attached to many processes at the time of the call and could
508  *     be active on multiple CPUs).
509  *
510  *     We prevent further scheduling of the PMC by marking it as in
511  *     state 'DELETED'.  If the runcount of the PMC is non-zero then
512  *     this PMC is currently running on a CPU somewhere.  The thread
513  *     doing the PMCRELEASE operation waits by repeatedly doing a
514  *     tsleep() until the runcount comes to zero.
515  *
516  */
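/*
 * A minimal sketch (assumptions, not driver code) of the syscall
 * entry pattern described above:
 *
 *	sx_xlock(&pmc_sx);
 *	if (pmc_hook == NULL) {
 *		sx_xunlock(&pmc_sx);
 *		return ENOSYS;		(module not present)
 *	}
 *	... operate, possibly after sx_downgrade(&pmc_sx) ...
 *	sx_xunlock(&pmc_sx);
 */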
517 
518 /*
519  * save the cpu binding of the current kthread
520  */
521 
522 static void
523 pmc_save_cpu_binding(struct pmc_binding *pb)
524 {
525 	PMCDBG(CPU,BND,2, "%s", "save-cpu");
526 	mtx_lock_spin(&sched_lock);
527 	pb->pb_bound = sched_is_bound(curthread);
528 	pb->pb_cpu   = curthread->td_oncpu;
529 	mtx_unlock_spin(&sched_lock);
530 	PMCDBG(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu);
531 }
532 
533 /*
534  * restore the cpu binding of the current thread
535  */
536 
537 static void
538 pmc_restore_cpu_binding(struct pmc_binding *pb)
539 {
540 	PMCDBG(CPU,BND,2, "restore-cpu curcpu=%d restore=%d",
541 	    curthread->td_oncpu, pb->pb_cpu);
542 	mtx_lock_spin(&sched_lock);
543 	if (pb->pb_bound)
544 		sched_bind(curthread, pb->pb_cpu);
545 	else
546 		sched_unbind(curthread);
547 	mtx_unlock_spin(&sched_lock);
548 	PMCDBG(CPU,BND,2, "%s", "restore-cpu done");
549 }
550 
551 /*
552  * move execution to the specified cpu and bind it there.
553  */
554 
555 static void
556 pmc_select_cpu(int cpu)
557 {
558 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
559 	    ("[pmc,%d] bad cpu number %d", __LINE__, cpu));
560 
561 	/* never move to a disabled CPU */
562 	KASSERT(pmc_cpu_is_disabled(cpu) == 0, ("[pmc,%d] selecting "
563 	    "disabled CPU %d", __LINE__, cpu));
564 
565 	PMCDBG(CPU,SEL,2, "select-cpu cpu=%d", cpu);
566 	mtx_lock_spin(&sched_lock);
567 	sched_bind(curthread, cpu);
568 	mtx_unlock_spin(&sched_lock);
569 
570 	KASSERT(curthread->td_oncpu == cpu,
571 	    ("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__,
572 		cpu, curthread->td_oncpu));
573 
574 	PMCDBG(CPU,SEL,2, "select-cpu cpu=%d ok", cpu);
575 }
576 
577 /*
578  * Update the per-pmc histogram
579  */
580 
581 void
582 pmc_update_histogram(struct pmc_hw *phw, uintptr_t pc)
583 {
584 	(void) phw;
585 	(void) pc;
586 }
587 
588 /*
589  * Send a signal to a process.  This is meant to be invoked from an
590  * interrupt handler.
591  */
592 
593 void
594 pmc_send_signal(struct pmc *pmc)
595 {
596 	(void) pmc;	/* silence gcc */
597 
598 #if	0
599 	struct proc   *proc;
600 	struct thread *td;
601 
602 	KASSERT(pmc->pm_owner != NULL,
603 	    ("[pmc,%d] No owner for PMC", __LINE__));
604 
605 	KASSERT((pmc->pm_owner->po_flags & PMC_FLAG_IS_OWNER) &&
606 	    (pmc->pm_owner->po_flags & PMC_FLAG_HAS_TS_PMC),
607 	    ("[pmc,%d] interrupting PMC owner has wrong flags 0x%x",
608 		__LINE__, pmc->pm_owner->po_flags));
609 
610 	proc = pmc->pm_owner->po_owner;
611 
612 	KASSERT(curthread->td_proc == proc,
613 	    ("[pmc,%d] interrupting the wrong thread (owner %p, "
614 		"cur %p)", __LINE__, (void *) proc, curthread->td_proc));
615 
616 	mtx_lock_spin(&sched_lock);
617 	td = TAILQ_FIRST(&proc->p_threads);
618 	mtx_unlock_spin(&sched_lock);
619 	/* XXX RACE HERE: can 'td' disappear now? */
620 	trapsignal(td, SIGPROF, 0);
621 	/* XXX rework this to use the regular 'psignal' interface from a
622 	   helper thread */
623 #endif
624 
625 }
626 
627 /*
628  * remove a process owning PMCs
629  */
630 
631 static void
632 pmc_remove_owner(struct pmc_owner *po)
633 {
634 	struct pmc_list *pl, *tmp;
635 
636 	sx_assert(&pmc_sx, SX_XLOCKED);
637 
638 	PMCDBG(OWN,ORM,1, "remove-owner po=%p", po);
639 
640 	/* Remove descriptor from the owner hash table */
641 	LIST_REMOVE(po, po_next);
642 
643 	/* pass 1: release all owned PMC descriptors */
644 	LIST_FOREACH_SAFE(pl, &po->po_pmcs, pl_next, tmp) {
645 
646 		PMCDBG(OWN,ORM,2, "pl=%p pmc=%p", pl, pl->pl_pmc);
647 
648 		/* remove the associated PMC descriptor, if present */
649 		if (pl->pl_pmc)
650 			pmc_release_pmc_descriptor(pl->pl_pmc);
651 
652 		/* remove the linked list entry */
653 		LIST_REMOVE(pl, pl_next);
654 		FREE(pl, M_PMC);
655 	}
656 
657 	/* pass 2: delete the pmc_list chain */
658 	LIST_FOREACH_SAFE(pl, &po->po_pmcs, pl_next, tmp) {
659 		KASSERT(pl->pl_pmc == NULL,
660 		    ("[pmc,%d] non-null pmc pointer", __LINE__));
661 		LIST_REMOVE(pl, pl_next);
662 		FREE(pl, M_PMC);
663 	}
664 
665 	KASSERT(LIST_EMPTY(&po->po_pmcs),
666 		("[pmc,%d] PMC list not empty", __LINE__));
667 
668 
669 	/*
670 	 * If this process owns a log file used for system wide logging,
671 	 * remove the log file.
672 	 *
673 	 * XXX rework needed.
674 	 */
675 
676 	if (po->po_flags & PMC_FLAG_OWNS_LOGFILE)
677 		pmc_configure_log(po, -1);
678 
679 }
680 
681 /*
682  * remove an owner process record if all conditions are met.
683  */
684 
685 static void
686 pmc_maybe_remove_owner(struct pmc_owner *po)
687 {
688 
689 	PMCDBG(OWN,OMR,1, "maybe-remove-owner po=%p", po);
690 
691 	/*
692 	 * Remove owner record if
693 	 * - this process does not own any PMCs
694  * - this process does not own a system-wide log file
695 	 */
696 
697 	if (LIST_EMPTY(&po->po_pmcs) &&
698 	    ((po->po_flags & PMC_FLAG_OWNS_LOGFILE) == 0)) {
699 		pmc_remove_owner(po);
700 		FREE(po, M_PMC);
701 	}
702 }
703 
704 /*
705  * Add an association between a target process and a PMC.
706  */
707 
708 static void
709 pmc_link_target_process(struct pmc *pm, struct pmc_process *pp)
710 {
711 	int ri;
712 	struct pmc_target *pt;
713 
714 	sx_assert(&pmc_sx, SX_XLOCKED);
715 
716 	KASSERT(pm != NULL && pp != NULL,
717 	    ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
718 
719 	KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < ((int) md->pmd_npmc - 1),
720 	    ("[pmc,%d] Illegal reference count %d for process record %p",
721 		__LINE__, pp->pp_refcnt, (void *) pp));
722 
723 	ri = pm->pm_rowindex;
724 
725 	PMCDBG(PRC,TLK,1, "link-target pmc=%p ri=%d pmc-process=%p",
726 	    pm, ri, pp);
727 
728 #if	DEBUG
729 	LIST_FOREACH(pt, &pm->pm_targets, pt_next)
730 	    if (pt->pt_process == pp)
731 		    KASSERT(0, ("[pmc,%d] pp %p already in pmc %p targets",
732 				__LINE__, pp, pm));
733 #endif
734 
735 	MALLOC(pt, struct pmc_target *, sizeof(struct pmc_target),
736 	    M_PMC, M_ZERO|M_WAITOK);
737 
738 	pt->pt_process = pp;
739 
740 	LIST_INSERT_HEAD(&pm->pm_targets, pt, pt_next);
741 
742 	atomic_store_rel_ptr(&pp->pp_pmcs[ri].pp_pmc, pm);
743 
744 	pp->pp_refcnt++;
745 
746 }
747 
748 /*
749  * Removes the association between a target process and a PMC.
750  */
751 
752 static void
753 pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp)
754 {
755 	int ri;
756 	struct pmc_target *ptgt;
757 
758 	sx_assert(&pmc_sx, SX_XLOCKED);
759 
760 	KASSERT(pm != NULL && pp != NULL,
761 	    ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
762 
763 	KASSERT(pp->pp_refcnt >= 1 && pp->pp_refcnt < (int) md->pmd_npmc,
764 	    ("[pmc,%d] Illegal ref count %d on process record %p",
765 		__LINE__, pp->pp_refcnt, (void *) pp));
766 
767 	ri = pm->pm_rowindex;
768 
769 	PMCDBG(PRC,TUL,1, "unlink-target pmc=%p ri=%d pmc-process=%p",
770 	    pm, ri, pp);
771 
772 	KASSERT(pp->pp_pmcs[ri].pp_pmc == pm,
773 	    ("[pmc,%d] PMC ri %d mismatch pmc %p pp->[ri] %p", __LINE__,
774 		ri, pm, pp->pp_pmcs[ri].pp_pmc));
775 
776 	pp->pp_pmcs[ri].pp_pmc = NULL;
777 	pp->pp_pmcs[ri].pp_pmcval = (pmc_value_t) 0;
778 
779 	pp->pp_refcnt--;
780 
781 	/* Remove the target process from the PMC structure */
782 	LIST_FOREACH(ptgt, &pm->pm_targets, pt_next)
783 		if (ptgt->pt_process == pp)
784 			break;
785 
786 	KASSERT(ptgt != NULL, ("[pmc,%d] process %p (pp: %p) not found "
787 		    "in pmc %p", __LINE__, pp->pp_proc, pp, pm));
788 
789 	PMCDBG(PRC,TUL,4, "unlink ptgt=%p", ptgt);
790 
791 	LIST_REMOVE(ptgt, pt_next);
792 	FREE(ptgt, M_PMC);
793 }
794 
795 /*
796  * Remove PMC descriptor 'pmc' from the owner descriptor.
797  */
798 
799 static void
800 pmc_unlink_owner(struct pmc *pm)
801 {
802 	struct pmc_list	*pl, *tmp;
803 	struct pmc_owner *po;
804 
805 #if	DEBUG
806 	KASSERT(LIST_EMPTY(&pm->pm_targets),
807 	    ("[pmc,%d] unlinking PMC with targets", __LINE__));
808 #endif
809 
810 	po = pm->pm_owner;
811 
812 	KASSERT(po != NULL, ("[pmc,%d] No owner for PMC", __LINE__));
813 
814 	LIST_FOREACH_SAFE(pl, &po->po_pmcs, pl_next, tmp) {
815 		if (pl->pl_pmc == pm) {
816 			pl->pl_pmc    = NULL;
817 			pm->pm_owner = NULL;
818 			return;
819 		}
820 	}
821 
822 	KASSERT(0, ("[pmc,%d] couldn't find pmc in owner list", __LINE__));
823 }
824 
825 /*
826  * Check if PMC 'pm' may be attached to target process 't'; returns 0 if so.
827  */
828 
829 static int
830 pmc_can_attach(struct pmc *pm, struct proc *t)
831 {
832 	struct proc *o;		/* pmc owner */
833 	struct ucred *oc, *tc;	/* owner, target credentials */
834 	int decline_attach, i;
835 
836 	/*
837 	 * A PMC's owner can always attach that PMC to itself.
838 	 */
839 
840 	if ((o = pm->pm_owner->po_owner) == t)
841 		return 0;
842 
843 	PROC_LOCK(o);
844 	oc = o->p_ucred;
845 	crhold(oc);
846 	PROC_UNLOCK(o);
847 
848 	PROC_LOCK(t);
849 	tc = t->p_ucred;
850 	crhold(tc);
851 	PROC_UNLOCK(t);
852 
853 	/*
854 	 * The effective uid of the PMC owner should match at least one
855 	 * of the {effective,real,saved} uids of the target process.
856 	 */
857 
858 	decline_attach = oc->cr_uid != tc->cr_uid &&
859 	    oc->cr_uid != tc->cr_svuid &&
860 	    oc->cr_uid != tc->cr_ruid;
861 
862 	/*
863 	 * Every one of the target's group ids must be in the owner's
864 	 * group list.
865 	 */
866 	for (i = 0; !decline_attach && i < tc->cr_ngroups; i++)
867 		decline_attach = !groupmember(tc->cr_groups[i], oc);
868 
869 	/* check the real and saved gids too */
870 	if (decline_attach == 0)
871 		decline_attach = !groupmember(tc->cr_rgid, oc) ||
872 		    !groupmember(tc->cr_svgid, oc);
873 
874 	crfree(tc);
875 	crfree(oc);
876 
877 	return decline_attach ? EPERM : 0;
878 }
879 
880 /*
881  * Attach a process to a PMC.
882  */
883 
884 static int
885 pmc_attach_one_process(struct proc *p, struct pmc *pm)
886 {
887 	int ri;
888 	struct pmc_process	*pp;
889 
890 	sx_assert(&pmc_sx, SX_XLOCKED);
891 
892 	PMCDBG(PRC,ATT,2, "attach-one pm=%p ri=%d proc=%p (%d, %s)", pm,
893 	    pm->pm_rowindex, p, p->p_pid, p->p_comm);
894 
895 	/*
896 	 * Locate the process descriptor corresponding to process 'p',
897 	 * allocating space as needed.
898 	 *
899 	 * Verify that rowindex 'pm_rowindex' is free in the process
900 	 * descriptor.
901 	 *
902 	 * If the row is free, link the process descriptor and the PMC
903 	 * together.
904 	 */
905 
906 	ri = pm->pm_rowindex;
907 
908 	if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_ALLOCATE)) == NULL)
909 		return ENOMEM;
910 
911 	if (pp->pp_pmcs[ri].pp_pmc == pm) /* already present at slot [ri] */
912 		return EEXIST;
913 
914 	if (pp->pp_pmcs[ri].pp_pmc != NULL)
915 		return EBUSY;
916 
917 	pmc_link_target_process(pm, pp);
918 
919 	/* mark process as using HWPMCs */
920 	PROC_LOCK(p);
921 	p->p_flag |= P_HWPMC;
922 	PROC_UNLOCK(p);
923 
924 	return 0;
925 }
926 
927 /*
928  * Attach a process and optionally its children
929  */
930 
931 static int
932 pmc_attach_process(struct proc *p, struct pmc *pm)
933 {
934 	int error;
935 	struct proc *top;
936 
937 	sx_assert(&pmc_sx, SX_XLOCKED);
938 
939 	PMCDBG(PRC,ATT,1, "attach pm=%p ri=%d proc=%p (%d, %s)", pm,
940 	    pm->pm_rowindex, p, p->p_pid, p->p_comm);
941 
942 	if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
943 		return pmc_attach_one_process(p, pm);
944 
945 	/*
946 	 * Traverse all child processes, attaching them to
947 	 * this PMC.
948 	 */
949 
950 	sx_slock(&proctree_lock);
951 
952 	top = p;
953 
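	/*
	 * Iterative depth-first walk of the process tree rooted at
	 * 'top': visit a process, descend into its first child when
	 * one exists, otherwise step to the next sibling, climbing
	 * back through p_pptr until a sibling is found or the walk
	 * returns to 'top'.
	 */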
954 	for (;;) {
955 		if ((error = pmc_attach_one_process(p, pm)) != 0)
956 			break;
957 		if (!LIST_EMPTY(&p->p_children))
958 			p = LIST_FIRST(&p->p_children);
959 		else for (;;) {
960 			if (p == top)
961 				goto done;
962 			if (LIST_NEXT(p, p_sibling)) {
963 				p = LIST_NEXT(p, p_sibling);
964 				break;
965 			}
966 			p = p->p_pptr;
967 		}
968 	}
969 
970 	if (error)
971 		(void) pmc_detach_process(top, pm);
972 
973  done:
974 	sx_sunlock(&proctree_lock);
975 	return error;
976 }
977 
978 /*
979  * Detach a process from a PMC.  If there are no other PMCs tracking
980  * this process, remove the process structure from its hash table.  If
981  * 'flags' contains PMC_FLAG_REMOVE, then free the process structure.
982  */
983 
984 static int
985 pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags)
986 {
987 	int ri;
988 	struct pmc_process *pp;
989 
990 	sx_assert(&pmc_sx, SX_XLOCKED);
991 
992 	KASSERT(pm != NULL,
993 	    ("[pmc,%d] null pm pointer", __LINE__));
994 
995 	PMCDBG(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x",
996 	    pm, pm->pm_rowindex, p, p->p_pid, p->p_comm, flags);
997 
998 	ri = pm->pm_rowindex;
999 
1000 	if ((pp = pmc_find_process_descriptor(p, 0)) == NULL)
1001 		return ESRCH;
1002 
1003 	if (pp->pp_pmcs[ri].pp_pmc != pm)
1004 		return EINVAL;
1005 
1006 	pmc_unlink_target_process(pm, pp);
1007 
1008 	/*
1009 	 * If there are no PMCs targeting this process, we remove its
1010 	 * descriptor from the target hash table and unset the P_HWPMC
1011 	 * flag in the struct proc.
1012 	 */
1013 
1014 	KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < (int) md->pmd_npmc,
1015 	    ("[pmc,%d] Illegal refcnt %d for process struct %p",
1016 		__LINE__, pp->pp_refcnt, pp));
1017 
1018 	if (pp->pp_refcnt != 0)	/* still a target of some PMC */
1019 		return 0;
1020 
1021 	pmc_remove_process_descriptor(pp);
1022 
1023 	if (flags & PMC_FLAG_REMOVE)
1024 		FREE(pp, M_PMC);
1025 
1026 	PROC_LOCK(p);
1027 	p->p_flag &= ~P_HWPMC;
1028 	PROC_UNLOCK(p);
1029 
1030 	return 0;
1031 }
1032 
1033 /*
1034  * Detach a process and optionally its descendants from a PMC.
1035  */
1036 
1037 static int
1038 pmc_detach_process(struct proc *p, struct pmc *pm)
1039 {
1040 	struct proc *top;
1041 
1042 	sx_assert(&pmc_sx, SX_XLOCKED);
1043 
1044 	PMCDBG(PRC,ATT,1, "detach pm=%p ri=%d proc=%p (%d, %s)", pm,
1045 	    pm->pm_rowindex, p, p->p_pid, p->p_comm);
1046 
1047 	if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
1048 		return pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
1049 
1050 	/*
1051 	 * Traverse all children, detaching them from this PMC.  We
1052 	 * ignore errors since we could be detaching a PMC from a
1053 	 * partially attached proc tree.
1054 	 */
1055 
1056 	sx_slock(&proctree_lock);
1057 
1058 	top = p;
1059 
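	/*
	 * The same depth-first walk over the process tree as in
	 * pmc_attach_process() above.
	 */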
1060 	for (;;) {
1061 		(void) pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
1062 
1063 		if (!LIST_EMPTY(&p->p_children))
1064 			p = LIST_FIRST(&p->p_children);
1065 		else for (;;) {
1066 			if (p == top)
1067 				goto done;
1068 			if (LIST_NEXT(p, p_sibling)) {
1069 				p = LIST_NEXT(p, p_sibling);
1070 				break;
1071 			}
1072 			p = p->p_pptr;
1073 		}
1074 	}
1075 
1076  done:
1077 	sx_sunlock(&proctree_lock);
1078 	return 0;
1079 }
1080 
1081 /*
1082  * The 'hook' invoked from the kernel proper
1083  */
1084 
1085 
1086 #if	DEBUG
1087 const char *pmc_hooknames[] = {
1088 	"",
1089 	"EXIT",
1090 	"EXEC",
1091 	"FORK",
1092 	"CSW-IN",
1093 	"CSW-OUT"
1094 };
1095 #endif
1096 
1097 static int
1098 pmc_hook_handler(struct thread *td, int function, void *arg)
1099 {
1100 
1101 	KASSERT(td->td_proc->p_flag & P_HWPMC,
1102 	    ("[pmc,%d] unregistered thread called pmc_hook()", __LINE__));
1103 
1104 	PMCDBG(MOD,PMH,1, "hook td=%p func=%d \"%s\" arg=%p", td, function,
1105 	    pmc_hooknames[function], arg);
1106 
1107 	switch (function)
1108 	{
1109 
1110 	/*
1111 	 * Process exit.
1112 	 *
1113 	 * Remove this process from all hash tables.  If this process
1114 	 * owned any PMCs, turn off those PMCs and deallocate them,
1115 	 * removing any associations with target processes.
1116 	 *
1117 	 * This function will be called by the last 'thread' of a
1118 	 * process.
1119 	 *
1120 	 */
1121 
1122 	case PMC_FN_PROCESS_EXIT: /* release PMCs */
1123 	{
1124 		int cpu;
1125 		unsigned int ri;
1126 		struct pmc *pm;
1127 		struct pmc_hw *phw;
1128 		struct pmc_process *pp;
1129 		struct pmc_owner *po;
1130 		struct proc *p;
1131 		pmc_value_t newvalue, tmp;
1132 
1133 		sx_assert(&pmc_sx, SX_XLOCKED);
1134 
1135 		p = (struct proc *) arg;
1136 
1137 		/*
1138 		 * Since this code is invoked by the last thread in an
1139 		 * exiting process, we would have context switched IN
1140 		 * at some prior point.  Kernel mode context switches
1141 		 * may happen any time, so we want to disable a context
1142 		 * switch OUT till we get any PMCs targeting this
1143 		 * process off the hardware.
1144 		 *
1145 		 * We also need to atomically remove this process'
1146 		 * entry from our target process hash table, using
1147 		 * PMC_FLAG_REMOVE.
1148 		 */
1149 
1150 		PMCDBG(PRC,EXT,1, "process-exit proc=%p (%d, %s)", p, p->p_pid,
1151 		    p->p_comm);
1152 
1153 		critical_enter(); /* no preemption */
1154 
1155 		cpu = curthread->td_oncpu;
1156 
1157 		if ((pp = pmc_find_process_descriptor(p,
1158 			 PMC_FLAG_REMOVE)) != NULL) {
1159 
1160 			PMCDBG(PRC,EXT,2,
1161 			    "process-exit proc=%p pmc-process=%p", p, pp);
1162 
1163 			/*
1164 			 * This process could be the target of some PMCs.
1165 			 * Such PMCs will thus be running on the currently
1166 			 * executing CPU at this point in the code
1167 			 * since we've disallowed context switches.
1168 			 * We need to turn these PMCs off like we
1169 			 * would do at context switch OUT time.
1170 			 */
1171 
1172 			for (ri = 0; ri < md->pmd_npmc; ri++) {
1173 
1174 				/*
1175 				 * Pick up the pmc pointer from hardware
1176 				 * state similar to the CSW_OUT code.
1177 				 */
1178 
1179 				phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
1180 				pm  = phw->phw_pmc;
1181 
1182 				PMCDBG(PRC,EXT,2, "ri=%d pm=%p", ri, pm);
1183 
1184 				if (pm == NULL ||
1185 				    !PMC_IS_VIRTUAL_MODE(pm->pm_mode))
1186 					continue;
1187 
1188 				PMCDBG(PRC,EXT,2, "ppmcs[%d]=%p pm=%p "
1189 				    "state=%d", ri, pp->pp_pmcs[ri].pp_pmc,
1190 				    pm, pm->pm_state);
1191 
1192 				KASSERT(pm->pm_rowindex == ri,
1193 				    ("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
1194 					__LINE__, pm->pm_rowindex, ri));
1195 
1196 				KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
1197 				    ("[pmc,%d] pm %p != pp_pmcs[%d] %p",
1198 					__LINE__, pm, ri,
1199 					pp->pp_pmcs[ri].pp_pmc));
1200 
1201 				(void) md->pmd_stop_pmc(cpu, ri);
1202 
1203 				KASSERT(pm->pm_runcount > 0,
1204 				    ("[pmc,%d] bad runcount ri %d rc %d",
1205 					__LINE__, ri, pm->pm_runcount));
1206 
1207 				if (pm->pm_state == PMC_STATE_RUNNING) {
1208 					md->pmd_read_pmc(cpu, ri, &newvalue);
1209 					tmp = newvalue -
1210 					    PMC_PCPU_SAVED(cpu,ri);
1211 
1212 					mtx_pool_lock_spin(pmc_mtxpool, pm);
1213 					pm->pm_gv.pm_savedvalue += tmp;
1214 					pp->pp_pmcs[ri].pp_pmcval += tmp;
1215 					mtx_pool_unlock_spin(pmc_mtxpool, pm);
1216 				}
1217 
1218 				KASSERT((int) pm->pm_runcount >= 0,
1219 				    ("[pmc,%d] runcount is %d", __LINE__, pm->pm_runcount));
1220 
1221 				atomic_subtract_rel_32(&pm->pm_runcount,1);
1222 				(void) md->pmd_config_pmc(cpu, ri, NULL);
1223 			}
1224 			critical_exit(); /* ok to be pre-empted now */
1225 
1226 			/*
1227 			 * Unlink this process from the PMCs that are
1228 			 * targeting it.  Log the value at exit() time if
1229 			 * requested.
1230 			 */
1231 
1232 			for (ri = 0; ri < md->pmd_npmc; ri++)
1233 				if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) {
1234 					if (pm->pm_flags &
1235 					    PMC_F_LOG_TC_PROCEXIT)
1236 						pmc_log_process_exit(pm, pp);
1237 					pmc_unlink_target_process(pm, pp);
1238 				}
1239 
1240 			FREE(pp, M_PMC);
1241 
1242 		} else
1243 			critical_exit(); /* pp == NULL */
1244 
1245 		/*
1246 		 * If the process owned PMCs, free them up and free up
1247 		 * memory.
1248 		 */
1249 
1250 		if ((po = pmc_find_owner_descriptor(p)) != NULL) {
1251 			pmc_remove_owner(po);
1252 			FREE(po, M_PMC);
1253 		}
1254 
1255 	}
1256 	break;
1257 
1258 	/*
1259 	 * Process exec()
1260 	 */
1261 
1262 	case PMC_FN_PROCESS_EXEC:
1263 	{
1264 		int *credentials_changed;
1265 		unsigned int ri;
1266 		struct pmc *pm;
1267 		struct proc *p;
1268 		struct pmc_owner *po;
1269 		struct pmc_process *pp;
1270 
1271 		sx_assert(&pmc_sx, SX_XLOCKED);
1272 
1273 		/*
1274 		 * PMCs are not inherited across an exec():  remove any
1275 		 * PMCs that this process is the owner of.
1276 		 */
1277 
1278 		p = td->td_proc;
1279 
1280 		if ((po = pmc_find_owner_descriptor(p)) != NULL) {
1281 			pmc_remove_owner(po);
1282 			FREE(po, M_PMC);
1283 		}
1284 
1285 		/*
1286 		 * If this process is the target of a PMC, check if the new
1287 		 * credentials are compatible with the owner's permissions.
1288 		 */
1289 
1290 		if ((pp = pmc_find_process_descriptor(p, 0)) == NULL)
1291 			break;
1292 
1293 		credentials_changed = arg;
1294 
1295 		PMCDBG(PRC,EXC,1, "exec proc=%p (%d, %s) cred-changed=%d",
1296 		    p, p->p_pid, p->p_comm, *credentials_changed);
1297 
1298 		if (*credentials_changed == 0) /* credentials didn't change */
1299 			break;
1300 
1301 		/*
1302 		 * If the newly exec()'ed process has a different credential
1303 		 * than before, allow it to be the target of a PMC only if
1304 		 * the PMC's owner has sufficient privilege.
1305 		 */
1306 
1307 		for (ri = 0; ri < md->pmd_npmc; ri++)
1308 			if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL)
1309 				if (pmc_can_attach(pm, td->td_proc) != 0)
1310 					pmc_detach_one_process(td->td_proc,
1311 					    pm, PMC_FLAG_NONE);
1312 
1313 		KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < (int) md->pmd_npmc,
1314 		    ("[pmc,%d] Illegal ref count %d on pp %p", __LINE__,
1315 			pp->pp_refcnt, pp));
1316 
1317 		/*
1318 		 * If this process is no longer the target of any
1319 		 * PMCs, we can remove the process entry and free
1320 		 * up space.
1321 		 */
1322 
1323 		if (pp->pp_refcnt == 0) {
1324 			pmc_remove_process_descriptor(pp);
1325 			FREE(pp, M_PMC);
1326 		}
1327 	}
1328 	break;
1329 
1330 	/*
1331 	 * Process fork()
1332 	 */
1333 
1334 	case PMC_FN_PROCESS_FORK:
1335 	{
1336 		unsigned int ri;
1337 		uint32_t do_descendants;
1338 		struct pmc *pm;
1339 		struct pmc_process *ppnew, *ppold;
1340 		struct proc *newproc;
1341 
1342 		sx_assert(&pmc_sx, SX_XLOCKED);
1343 
1344 		newproc = (struct proc *) arg;
1345 
1346 		PMCDBG(PMC,FRK,2, "process-fork p1=%p p2=%p",
1347 		    curthread->td_proc, newproc);
1348 		/*
1349 		 * If the parent process (curthread->td_proc) is a
1350 		 * target of any PMCs, look for PMCs that are to be
1351 		 * inherited, and link these into the new process
1352 		 * descriptor.
1353 		 */
1354 
1355 		if ((ppold = pmc_find_process_descriptor(
1356 		    curthread->td_proc, PMC_FLAG_NONE)) == NULL)
1357 			break;
1358 
1359 		do_descendants = 0;
1360 		for (ri = 0; ri < md->pmd_npmc; ri++)
1361 			if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL)
1362 				do_descendants |=
1363 				    pm->pm_flags & PMC_F_DESCENDANTS;
1364 		if (do_descendants == 0) /* nothing to do */
1365 			break;
1366 
1367 		if ((ppnew = pmc_find_process_descriptor(newproc,
1368 		    PMC_FLAG_ALLOCATE)) == NULL)
1369 			return ENOMEM;
1370 
1371 		/*
1372 		 * Run through all PMCs targeting the old process and
1373 		 * attach them to the new process.
1374 		 */
1375 
1376 		for (ri = 0; ri < md->pmd_npmc; ri++)
1377 			if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL &&
1378 			    pm->pm_flags & PMC_F_DESCENDANTS)
1379 				pmc_link_target_process(pm, ppnew);
1380 
1381 		/*
1382 		 * Now mark the new process as being tracked by this
1383 		 * driver.
1384 		 */
1385 
1386 		PROC_LOCK(newproc);
1387 		newproc->p_flag |= P_HWPMC;
1388 		PROC_UNLOCK(newproc);
1389 
1390 	}
1391 	break;
1392 
1393 	/*
1394 	 * Thread context switch IN
1395 	 */
1396 
1397 	case PMC_FN_CSW_IN:
1398 	{
1399 		int cpu;
1400 		unsigned int ri;
1401 		struct pmc *pm;
1402 		struct proc *p;
1403 		struct pmc_cpu *pc;
1404 		struct pmc_hw *phw;
1405 		struct pmc_process *pp;
1406 		pmc_value_t newvalue;
1407 
1408 		p = td->td_proc;
1409 
1410 		if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE)) == NULL)
1411 			break;
1412 
1413 		KASSERT(pp->pp_proc == td->td_proc,
1414 		    ("[pmc,%d] not my thread state", __LINE__));
1415 
1416 		critical_enter(); /* no preemption on this CPU */
1417 
1418 		cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
1419 
1420 		PMCDBG(CTX,SWI,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
1421 		    p->p_pid, p->p_comm, pp);
1422 
1423 		KASSERT(cpu >= 0 && cpu < mp_ncpus,
1424 		    ("[pmc,%d] weird CPU id %d", __LINE__, cpu));
1425 
1426 		pc = pmc_pcpu[cpu];
1427 
1428 		for (ri = 0; ri < md->pmd_npmc; ri++) {
1429 
1430 			if ((pm = pp->pp_pmcs[ri].pp_pmc) == NULL)
1431 				continue;
1432 
1433 			KASSERT(PMC_IS_VIRTUAL_MODE(pm->pm_mode),
1434 			    ("[pmc,%d] Target PMC in non-virtual mode (%d)",
1435 				__LINE__, pm->pm_mode));
1436 
1437 			KASSERT(pm->pm_rowindex == ri,
1438 			    ("[pmc,%d] Row index mismatch pmc %d != ri %d",
1439 				__LINE__, pm->pm_rowindex, ri));
1440 
1441 			/*
1442 			 * Only PMCs that are marked as 'RUNNING' need
1443 			 * be placed on hardware.
1444 			 */
1445 
1446 			if (pm->pm_state != PMC_STATE_RUNNING)
1447 				continue;
1448 
1449 			/* increment PMC runcount */
1450 			atomic_add_rel_32(&pm->pm_runcount, 1);
1451 
1452 			/* configure the HWPMC we are going to use. */
1453 			md->pmd_config_pmc(cpu, ri, pm);
1454 
1455 			phw = pc->pc_hwpmcs[ri];
1456 
1457 			KASSERT(phw != NULL,
1458 			    ("[pmc,%d] null hw pointer", __LINE__));
1459 
1460 			KASSERT(phw->phw_pmc == pm,
1461 			    ("[pmc,%d] hw->pmc %p != pmc %p", __LINE__,
1462 				phw->phw_pmc, pm));
1463 
1464 			/* write out saved value and start the PMC */
1465 			mtx_pool_lock_spin(pmc_mtxpool, pm);
1466 			newvalue = PMC_PCPU_SAVED(cpu, ri) =
1467 			    pm->pm_gv.pm_savedvalue;
1468 			mtx_pool_unlock_spin(pmc_mtxpool, pm);
1469 
1470 			md->pmd_write_pmc(cpu, ri, newvalue);
1471 			md->pmd_start_pmc(cpu, ri);
1472 
1473 		}
1474 
1475 		/*
1476 		 * perform any other architecture/cpu dependent thread
1477 		 * switch-in actions.
1478 		 */
1479 
1480 		(void) (*md->pmd_switch_in)(pc);
1481 
1482 		critical_exit();
1483 
1484 	}
1485 	break;
1486 
1487 	/*
1488 	 * Thread context switch OUT.
1489 	 */
1490 
1491 	case PMC_FN_CSW_OUT:
1492 	{
1493 		int cpu;
1494 		unsigned int ri;
1495 		struct pmc *pm;
1496 		struct proc *p;
1497 		struct pmc_cpu *pc;
1498 		struct pmc_hw *phw;
1499 		struct pmc_process *pp;
1500 		pmc_value_t newvalue, tmp;
1501 
1502 		/*
1503 		 * Locate our process descriptor; this may be NULL if
1504 		 * this process is exiting and we have already removed
1505 		 * the process from the target process table.
1506 		 *
1507 		 * Note that due to kernel preemption, multiple
1508 		 * context switches may happen while the process is
1509 		 * exiting.
1510 		 *
1511 		 * Note also that if the target process cannot be
1512 		 * found we still need to deconfigure any PMCs that
1513 		 * are currently running on hardware.
1514 		 */
1515 
1516 		p = td->td_proc;
1517 		pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE);
1518 
1519 		/*
1520 		 * save PMCs
1521 		 */
1522 
1523 		critical_enter();
1524 
1525 		cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
1526 
1527 		PMCDBG(CTX,SWO,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
1528 		    p->p_pid, p->p_comm, pp);
1529 
1530 		KASSERT(cpu >= 0 && cpu < mp_ncpus,
1531 		    ("[pmc,%d] weird CPU id %d", __LINE__, cpu));
1532 
1533 		pc = pmc_pcpu[cpu];
1534 
1535 		/*
1536 		 * When a PMC gets unlinked from a target process, it will
1537 		 * be removed from the target's pp_pmcs[] array.
1538 		 *
1539 		 * However, on a MP system, the target could have been
1540 		 * executing on another CPU at the time of the unlink.
1541 		 * So, at context switch OUT time, we need to look at
1542 		 * the hardware to determine if a PMC is scheduled on
1543 		 * it.
1544 		 */
1545 
1546 		for (ri = 0; ri < md->pmd_npmc; ri++) {
1547 
1548 			phw = pc->pc_hwpmcs[ri];
1549 			pm  = phw->phw_pmc;
1550 
1551 			if (pm == NULL)	/* nothing at this row index */
1552 				continue;
1553 
1554 			if (!PMC_IS_VIRTUAL_MODE(pm->pm_mode))
1555 				continue; /* not a process virtual PMC */
1556 
1557 			KASSERT(pm->pm_rowindex == ri,
1558 			    ("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
1559 				__LINE__, pm->pm_rowindex, ri));
1560 
1561 			/* Stop hardware */
1562 			md->pmd_stop_pmc(cpu, ri);
1563 
1564 			/* reduce this PMC's runcount */
1565 			atomic_subtract_rel_32(&pm->pm_runcount, 1);
1566 
1567 			/*
1568 			 * If this PMC is associated with this process,
1569 			 * save the reading.
1570 			 */
1571 
1572 			if (pp != NULL && pp->pp_pmcs[ri].pp_pmc != NULL) {
1573 
1574 				KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
1575 				    ("[pmc,%d] pm %p != pp_pmcs[%d] %p",
1576 					__LINE__, pm, ri,
1577 					pp->pp_pmcs[ri].pp_pmc));
1578 
1579 				KASSERT(pp->pp_refcnt > 0,
1580 				    ("[pmc,%d] pp refcnt = %d", __LINE__,
1581 					pp->pp_refcnt));
1582 
1583 				md->pmd_read_pmc(cpu, ri, &newvalue);
1584 
1585 				tmp = newvalue - PMC_PCPU_SAVED(cpu,ri);
1586 
1587 				KASSERT((int64_t) tmp >= 0,
1588 				    ("[pmc,%d] negative increment cpu=%d "
1589 					"ri=%d newvalue=%jx saved=%jx "
1590 					"incr=%jx", __LINE__, cpu, ri,
1591 					newvalue, PMC_PCPU_SAVED(cpu,ri),
1592 					tmp));
1593 
1594 				/*
1595 				 * Increment the PMC's count and this
1596 				 * target process's count by the difference
1597 				 * between the current reading and the
1598 				 * saved value at context switch in time.
1599 				 */
1600 
1601 				mtx_pool_lock_spin(pmc_mtxpool, pm);
1602 
1603 				pm->pm_gv.pm_savedvalue += tmp;
1604 				pp->pp_pmcs[ri].pp_pmcval += tmp;
1605 
1606 				mtx_pool_unlock_spin(pmc_mtxpool, pm);
1607 
1608 			}
1609 
1610 			/* mark hardware as free */
1611 			md->pmd_config_pmc(cpu, ri, NULL);
1612 		}
1613 
1614 		/*
1615 		 * perform any other architecture/cpu dependent thread
1616 		 * switch out functions.
1617 		 */
1618 
1619 		(void) (*md->pmd_switch_out)(pc);
1620 
1621 		critical_exit();
1622 
1623 	}
1624 	break;
1625 
1626 	default:
1627 #if DEBUG
1628 		KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function));
1629 #endif
1630 		break;
1631 
1632 	}
1633 
1634 	return 0;
1635 }
1636 
1637 /*
1638  * allocate a 'struct pmc_owner' descriptor in the owner hash table.
1639  */
1640 
1641 static struct pmc_owner *
1642 pmc_allocate_owner_descriptor(struct proc *p)
1643 {
1644 	uint32_t hindex;
1645 	struct pmc_owner *po;
1646 	struct pmc_ownerhash *poh;
1647 
1648 	hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
1649 	poh = &pmc_ownerhash[hindex];
1650 
1651 	/* allocate space for an owner descriptor */
1652 	MALLOC(po, struct pmc_owner *, sizeof(struct pmc_owner),
1653 	    M_PMC, M_WAITOK);
1654 
1655 	po->po_flags = 0;
1656 	po->po_owner = p;
1657 	LIST_INIT(&po->po_pmcs);
1658 	LIST_INSERT_HEAD(poh, po, po_next); /* insert into hash table */
1659 
1660 	PMCDBG(OWN,ALL,1, "allocate-owner proc=%p (%d, %s) pmc-owner=%p",
1661 	    p, p->p_pid, p->p_comm, po);
1662 
1663 	return po;
1664 }
1665 
1666 /*
1667  * find the descriptor corresponding to process 'p', adding or removing it
1668  * as specified by 'mode'.
1669  */
1670 
1671 static struct pmc_process *
1672 pmc_find_process_descriptor(struct proc *p, uint32_t mode)
1673 {
1674 	uint32_t hindex;
1675 	struct pmc_process *pp, *ppnew;
1676 	struct pmc_processhash *pph;
1677 
1678 	hindex = PMC_HASH_PTR(p, pmc_processhashmask);
1679 	pph = &pmc_processhash[hindex];
1680 
1681 	ppnew = NULL;
1682 
1683 	/*
1684 	 * Pre-allocate memory in the PMC_FLAG_ALLOCATE case since we
1685 	 * cannot call malloc(9) once we hold a spin lock.
1686 	 */
1687 
1688 	if (mode & PMC_FLAG_ALLOCATE) {
1689 		/* allocate space for the descriptor plus 'npmc' target states */
1690 		MALLOC(ppnew, struct pmc_process *,
1691 		    sizeof(struct pmc_process) + md->pmd_npmc *
1692 		    sizeof(struct pmc_targetstate), M_PMC, M_ZERO|M_WAITOK);
1693 	}
1694 
1695 	mtx_lock_spin(&pmc_processhash_mtx);
1696 	LIST_FOREACH(pp, pph, pp_next)
1697 	    if (pp->pp_proc == p)
1698 		    break;
1699 
1700 	if ((mode & PMC_FLAG_REMOVE) && pp != NULL)
1701 		LIST_REMOVE(pp, pp_next);
1702 
1703 	if ((mode & PMC_FLAG_ALLOCATE) && pp == NULL &&
1704 	    ppnew != NULL) {
1705 		ppnew->pp_proc = p;
1706 		LIST_INSERT_HEAD(pph, ppnew, pp_next);
1707 		pp = ppnew;
1708 		ppnew = NULL;
1709 	}
1710 	mtx_unlock_spin(&pmc_processhash_mtx);
1711 
1712 	if (pp != NULL && ppnew != NULL)
1713 		FREE(ppnew, M_PMC);
1714 
1715 	return pp;
1716 }
1717 
1718 /*
1719  * remove a process descriptor from the process hash table.
1720  */
1721 
1722 static void
1723 pmc_remove_process_descriptor(struct pmc_process *pp)
1724 {
1725 	KASSERT(pp->pp_refcnt == 0,
1726 	    ("[pmc,%d] Removing process descriptor %p with count %d",
1727 		__LINE__, pp, pp->pp_refcnt));
1728 
1729 	mtx_lock_spin(&pmc_processhash_mtx);
1730 	LIST_REMOVE(pp, pp_next);
1731 	mtx_unlock_spin(&pmc_processhash_mtx);
1732 }
1733 
1734 
1735 /*
1736  * find an owner descriptor corresponding to proc 'p'
1737  */
1738 
1739 static struct pmc_owner *
1740 pmc_find_owner_descriptor(struct proc *p)
1741 {
1742 	uint32_t hindex;
1743 	struct pmc_owner *po;
1744 	struct pmc_ownerhash *poh;
1745 
1746 	hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
1747 	poh = &pmc_ownerhash[hindex];
1748 
1749 	po = NULL;
1750 	LIST_FOREACH(po, poh, po_next)
1751 	    if (po->po_owner == p)
1752 		    break;
1753 
1754 	PMCDBG(OWN,FND,1, "find-owner proc=%p (%d, %s) hindex=0x%x -> "
1755 	    "pmc-owner=%p", p, p->p_pid, p->p_comm, hindex, po);
1756 
1757 	return po;
1758 }
1759 
1760 /*
1761  * pmc_allocate_pmc_descriptor
1762  *
1763  * Allocate a pmc descriptor and initialize its
1764  * fields.
1765  */
1766 
1767 static struct pmc *
1768 pmc_allocate_pmc_descriptor(void)
1769 {
1770 	struct pmc *pmc;
1771 
1772 	MALLOC(pmc, struct pmc *, sizeof(struct pmc), M_PMC, M_ZERO|M_WAITOK);
1773 
1774 	if (pmc != NULL) {
1775 		pmc->pm_owner = NULL;
1776 		LIST_INIT(&pmc->pm_targets);
1777 	}
1778 
1779 	PMCDBG(PMC,ALL,1, "allocate-pmc -> pmc=%p", pmc);
1780 
1781 	return pmc;
1782 }
1783 
1784 /*
1785  * Destroy a pmc descriptor.
1786  */
1787 
1788 static void
1789 pmc_destroy_pmc_descriptor(struct pmc *pm)
1790 {
1791 	(void) pm;
1792 
1793 #if	DEBUG
1794 	KASSERT(pm->pm_state == PMC_STATE_DELETED ||
1795 	    pm->pm_state == PMC_STATE_FREE,
1796 	    ("[pmc,%d] destroying non-deleted PMC", __LINE__));
1797 	KASSERT(LIST_EMPTY(&pm->pm_targets),
1798 	    ("[pmc,%d] destroying pmc with targets", __LINE__));
1799 	KASSERT(pm->pm_owner == NULL,
1800 	    ("[pmc,%d] destroying pmc attached to an owner", __LINE__));
1801 	KASSERT(pm->pm_runcount == 0,
1802 	    ("[pmc,%d] pmc has non-zero run count %d", __LINE__,
1803 		pm->pm_runcount));
1804 #endif
1805 }
1806 
1807 /*
1808  * This function does the following things:
1809  *
1810  *  - detaches the PMC from hardware
1811  *  - unlinks all target processes that were attached to it
1812  *  - removes the PMC from its owner's list
1813  *  - destroys the PMC private mutex
1814  *
1815  * Once this function completes, the given pmc pointer can be safely
1816  * FREE'd by the caller.
1817  */
1818 
1819 static void
1820 pmc_release_pmc_descriptor(struct pmc *pm)
1821 {
1822 #if	DEBUG
1823 	volatile int maxloop;
1824 #endif
1825 	u_int ri, cpu;
1826 	u_char curpri;
1827 	struct pmc_hw *phw;
1828 	struct pmc_process *pp;
1829 	struct pmc_target *ptgt, *tmp;
1830 	struct pmc_binding pb;
1831 
1832 	sx_assert(&pmc_sx, SX_XLOCKED);
1833 
1834 	KASSERT(pm, ("[pmc,%d] null pmc", __LINE__));
1835 
1836 	ri = pm->pm_rowindex;
1837 
1838 	PMCDBG(PMC,REL,1, "release-pmc pmc=%p ri=%d mode=%d", pm, ri,
1839 	    pm->pm_mode);
1840 
1841 	/*
1842 	 * First, we take the PMC off hardware.
1843 	 */
1844 
1845 	if (PMC_IS_SYSTEM_MODE(pm->pm_mode)) {
1846 
1847 		/*
1848 		 * A system mode PMC runs on a specific CPU.  Switch
1849 		 * to this CPU and turn hardware off.
1850 		 */
1851 
1852 		pmc_save_cpu_binding(&pb);
1853 
1854 		cpu = pm->pm_gv.pm_cpu;
1855 
1856 		if (pm->pm_state == PMC_STATE_RUNNING) {
1857 
1858 			pmc_select_cpu(cpu);
1859 
1860 			phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
1861 
1862 			KASSERT(phw->phw_pmc == pm,
1863 			    ("[pmc, %d] pmc ptr ri(%d) hw(%p) pm(%p)",
1864 				__LINE__, ri, phw->phw_pmc, pm));
1865 
1866 			PMCDBG(PMC,REL,2, "stopping cpu=%d ri=%d", cpu, ri);
1867 
1868 			critical_enter();
1869 			md->pmd_stop_pmc(cpu, ri);
1870 			critical_exit();
1871 		}
1872 
1873 		PMCDBG(PMC,REL,2, "decfg cpu=%d ri=%d", cpu, ri);
1874 
1875 		critical_enter();
1876 		md->pmd_config_pmc(cpu, ri, NULL);
1877 		critical_exit();
1878 
1879 		pm->pm_state = PMC_STATE_DELETED;
1880 
1881 		pmc_restore_cpu_binding(&pb);
1882 
1883 	} else if (PMC_IS_VIRTUAL_MODE(pm->pm_mode)) {
1884 
1885 		/*
1886 		 * A virtual PMC could be running on multiple CPUs at
1887 		 * a given instant.
1888 		 *
1889 		 * By marking its state as DELETED, we ensure that
1890 		 * this PMC is never further scheduled on hardware.
1891 		 *
1892 		 * Then we wait till all CPUs are done with this PMC.
1893 		 */
1894 
1895 		pm->pm_state = PMC_STATE_DELETED;
1896 
1897 
1898 		/*
1899 		 * Wait for the PMC's runcount to come to zero.
1900 		 */
1901 
1902 #if	DEBUG
1903 		maxloop = 100 * mp_ncpus;
1904 #endif
1905 
1906 		while (atomic_load_acq_32(&pm->pm_runcount) > 0) {
1907 
1908 #if	DEBUG
1909 			maxloop--;
1910 			KASSERT(maxloop > 0,
1911 			    ("[pmc,%d] (ri%d, rc%d) waiting too long for "
1912 				"pmc to be free", __LINE__, pm->pm_rowindex,
1913 				pm->pm_runcount));
1914 #endif
1915 
1916 			mtx_lock_spin(&sched_lock);
1917 			curpri = curthread->td_priority;
1918 			mtx_unlock_spin(&sched_lock);
1919 
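			/*
			 * No wakeup() is issued on this channel
			 * anywhere in this file; the one-tick timeout
			 * makes tsleep() act as a polling delay before
			 * the runcount is rechecked.
			 */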
1920 			(void) tsleep((void *) pmc_release_pmc_descriptor,
1921 			    curpri, "pmcrel", 1);
1922 
1923 		}
1924 
1925 		/*
1926 		 * At this point the PMC is off all CPUs and cannot be
1927 		 * freshly scheduled onto a CPU.  It is now safe to
1928 		 * unlink all targets from this PMC.  If a
1929 		 * process-record's refcount falls to zero, we remove
1930 		 * it from the hash table.  The module-wide SX lock
1931 		 * protects us from races.
1932 		 */
1933 
1934 		LIST_FOREACH_SAFE(ptgt, &pm->pm_targets, pt_next, tmp) {
1935 			pp = ptgt->pt_process;
1936 			pmc_unlink_target_process(pm, pp); /* frees 'ptgt' */
1937 
1938 			PMCDBG(PMC,REL,3, "pp->refcnt=%d", pp->pp_refcnt);
1939 
1940 			/*
1941 			 * If the target process record shows that no
1942 			 * PMCs are attached to it, reclaim its space.
1943 			 */
1944 
1945 			if (pp->pp_refcnt == 0) {
1946 				pmc_remove_process_descriptor(pp);
1947 				FREE(pp, M_PMC);
1948 			}
1949 		}
1950 
1951 		cpu = curthread->td_oncpu; /* setup cpu for pmd_release() */
1952 
1953 	}
1954 
1955 	/*
1956 	 * Release any MD resources
1957 	 */
1958 
1959 	(void) md->pmd_release_pmc(cpu, ri, pm);
1960 
1961 	/*
1962 	 * Update row disposition
1963 	 */
1964 
1965 	if (PMC_IS_SYSTEM_MODE(pm->pm_mode))
1966 		PMC_UNMARK_ROW_STANDALONE(ri);
1967 	else
1968 		PMC_UNMARK_ROW_THREAD(ri);
1969 
1970 	/* unlink from the owner's list */
1971 	if (pm->pm_owner)
1972 		pmc_unlink_owner(pm);
1973 
1974 	pmc_destroy_pmc_descriptor(pm);
1975 }
1976 
1977 /*
1978  * Register an owner and a pmc.
1979  */
1980 
1981 static int
1982 pmc_register_owner(struct proc *p, struct pmc *pmc)
1983 {
1984 	struct pmc_list	*pl;
1985 	struct pmc_owner *po;
1986 
1987 	sx_assert(&pmc_sx, SX_XLOCKED);
1988 
1989 	MALLOC(pl, struct pmc_list *, sizeof(struct pmc_list), M_PMC,
1990 	    M_WAITOK);
1991 
1992 	if (pl == NULL)
1993 		return ENOMEM;
1994 
1995 	if ((po = pmc_find_owner_descriptor(p)) == NULL) {
1996 		if ((po = pmc_allocate_owner_descriptor(p)) == NULL) {
1997 			FREE(pl, M_PMC);
1998 			return ENOMEM;
1999 		}
2000 		po->po_flags |= PMC_FLAG_IS_OWNER; /* real owner */
2001 	}
2002 
2003 	if (pmc->pm_mode == PMC_MODE_TS) {
2004 		/* can have only one TS mode PMC per process */
2005 		if (po->po_flags & PMC_FLAG_HAS_TS_PMC) {
2006 			FREE(pl, M_PMC);
2007 			return EINVAL;
2008 		}
2009 		po->po_flags |= PMC_FLAG_HAS_TS_PMC;
2010 	}
2011 
2012 	KASSERT(pmc->pm_owner == NULL,
2013 	    ("[pmc,%d] attempting to own an initialized PMC", __LINE__));
2014 	pmc->pm_owner  = po;
2015 
2016 	pl->pl_pmc = pmc;
2017 
2018 	LIST_INSERT_HEAD(&po->po_pmcs, pl, pl_next);
2019 
2020 	PROC_LOCK(p);
2021 	p->p_flag |= P_HWPMC;
2022 	PROC_UNLOCK(p);
2023 
2024 	PMCDBG(PMC,REG,1, "register-owner pmc-owner=%p pl=%p pmc=%p",
2025 	    po, pl, pmc);
2026 
2027 	return 0;
2028 }
2029 
2030 /*
2031  * Return the current row disposition:
2032  * == 0 => FREE
2033  *  > 0 => PROCESS MODE
2034  *  < 0 => SYSTEM MODE
2035  */
2036 
2037 int
2038 pmc_getrowdisp(int ri)
2039 {
2040 	return pmc_pmcdisp[ri];
2041 }
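
/*
 * Illustration only: a minimal sketch of how a caller might decode
 * the disposition value returned by pmc_getrowdisp().  The helper
 * below is hypothetical and is not part of this module.
 */

static const char *
pmc_rowdisp_to_string(int disp)
{
	if (disp == 0)
		return ("FREE");	/* no PMC uses this row */
	else if (disp > 0)
		return ("THREAD");	/* row used by process-virtual PMCs */
	else
		return ("STANDALONE");	/* row used by a system-wide PMC */
}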
2042 
2043 /*
2044  * Check if a PMC at row index 'ri' can be allocated to the current
2045  * process.
2046  *
2047  * Allocation can fail if:
2048  *   - the current process is already being profiled by a PMC at index 'ri',
2049  *     attached to it via OP_PMCATTACH.
2050  *   - the current process has already allocated a PMC at index 'ri'
2051  *     via OP_ALLOCATE.
2052  */
2053 
2054 static int
2055 pmc_can_allocate_rowindex(struct proc *p, unsigned int ri)
2056 {
2057 	struct pmc_list *pl;
2058 	struct pmc_owner *po;
2059 	struct pmc_process *pp;
2060 
2061 	PMCDBG(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d",
2062 	    p, p->p_pid, p->p_comm, ri);
2063 
2064 	/* we shouldn't have allocated a PMC at row index 'ri' */
2065 	if ((po = pmc_find_owner_descriptor(p)) != NULL)
2066 		LIST_FOREACH(pl, &po->po_pmcs, pl_next)
2067 		    if (pl->pl_pmc->pm_rowindex == ri)
2068 			    return EEXIST;
2069 
2070 	/* we shouldn't be the target of any PMC ourselves at this index */
2071 	if ((pp = pmc_find_process_descriptor(p, 0)) != NULL)
2072 		if (pp->pp_pmcs[ri].pp_pmc)
2073 			return EEXIST;
2074 
2075 	PMCDBG(PMC,ALR,2, "can-allocate-rowindex proc=%p (%d, %s) ri=%d ok",
2076 	    p, p->p_pid, p->p_comm, ri);
2077 
2078 	return 0;
2079 }
2080 
2081 /*
2082  * Check if a given PMC at row index 'ri' can be currently used in
2083  * mode 'mode'.
2084  */
2085 
2086 static int
2087 pmc_can_allocate_row(int ri, enum pmc_mode mode)
2088 {
2089 	enum pmc_disp	disp;
2090 
2091 	sx_assert(&pmc_sx, SX_XLOCKED);
2092 
2093 	PMCDBG(PMC,ALR,1, "can-allocate-row ri=%d mode=%d", ri, mode);
2094 
2095 	if (PMC_IS_SYSTEM_MODE(mode))
2096 		disp = PMC_DISP_STANDALONE;
2097 	else
2098 		disp = PMC_DISP_THREAD;
2099 
2100 	/*
2101 	 * check disposition for PMC row 'ri':
2102 	 *
2103 	 * Expected disposition		Row-disposition		Result
2104 	 *
2105 	 * STANDALONE			STANDALONE or FREE	proceed
2106 	 * STANDALONE			THREAD			fail
2107 	 * THREAD			THREAD or FREE		proceed
2108 	 * THREAD			STANDALONE		fail
2109 	 */
2110 
2111 	if (!PMC_ROW_DISP_IS_FREE(ri) &&
2112 	    !(disp == PMC_DISP_THREAD && PMC_ROW_DISP_IS_THREAD(ri)) &&
2113 	    !(disp == PMC_DISP_STANDALONE && PMC_ROW_DISP_IS_STANDALONE(ri)))
2114 		return EBUSY;
2115 
2116 	/*
2117 	 * All OK
2118 	 */
2119 
2120 	PMCDBG(PMC,ALR,2, "can-allocate-row ri=%d mode=%d ok", ri, mode);
2121 
2122 	return 0;
2123 
2124 }
2125 
2126 /*
2127  * Find a PMC descriptor with user handle 'pmcid' for owner 'po'.
2128  */
2129 
2130 static struct pmc *
2131 pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmcid)
2132 {
2133 	struct pmc_list	*pl;
2134 
2135 	KASSERT(pmcid < md->pmd_npmc,
2136 	    ("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__, pmcid,
2137 		md->pmd_npmc));
2138 
2139 	LIST_FOREACH(pl, &po->po_pmcs, pl_next)
2140 	    if (pl->pl_pmc->pm_rowindex == pmcid)
2141 		    return pl->pl_pmc;
2142 
2143 	return NULL;
2144 }
2145 
2146 static int
2147 pmc_find_pmc(pmc_id_t pmcid, struct pmc **pmc)
2148 {
2149 
2150 	struct pmc *pm;
2151 	struct pmc_owner *po;
2152 
2153 	PMCDBG(PMC,FND,1, "find-pmc id=%d", pmcid);
2154 
2155 	if ((po = pmc_find_owner_descriptor(curthread->td_proc)) == NULL)
2156 		return ESRCH;
2157 
2158 	if ((pm = pmc_find_pmc_descriptor_in_process(po, pmcid)) == NULL)
2159 		return EINVAL;
2160 
2161 	PMCDBG(PMC,FND,2, "find-pmc id=%d -> pmc=%p", pmcid, pm);
2162 
2163 	*pmc = pm;
2164 	return 0;
2165 }
2166 
2167 /*
2168  * Start a PMC.
2169  */
2170 
2171 static int
2172 pmc_start(struct pmc *pm)
2173 {
2174 	int error, cpu, ri;
2175 	struct pmc_binding pb;
2176 
2177 	KASSERT(pm != NULL,
2178 	    ("[pmc,%d] null pm", __LINE__));
2179 
2180 	PMCDBG(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, pm->pm_mode,
2181 	    pm->pm_rowindex);
2182 
2183 	pm->pm_state = PMC_STATE_RUNNING;
2184 
2185 	if (PMC_IS_VIRTUAL_MODE(pm->pm_mode)) {
2186 
2187 		/*
2188 		 * If a PMCATTACH has not been done on this
2189 		 * PMC, attach it to its owner process.
2190 		 */
2191 
2192 		if (LIST_EMPTY(&pm->pm_targets))
2193 			return pmc_attach_process(pm->pm_owner->po_owner, pm);
2194 
2195 
2196 		/*
2197 		 * Nothing further to be done; thread context switch code
2198 		 * will start/stop the PMC as appropriate.
2199 		 */
2200 
2201 		return 0;
2202 
2203 	}
2204 
2205 	/*
2206 	 * A system-mode PMC.  Move to the CPU associated with this
2207 	 * PMC, and start the hardware.
2208 	 */
2209 
2210 	pmc_save_cpu_binding(&pb);
2211 
2212 	cpu = pm->pm_gv.pm_cpu;
2213 
2214 	if (pmc_cpu_is_disabled(cpu))
2215 		return ENXIO;
2216 
2217 	ri  = pm->pm_rowindex;
2218 
2219 	pmc_select_cpu(cpu);
2220 
2221 	/*
2222 	 * Global PMCs are configured at allocation time,
2223 	 * so write out the initial value and start the PMC.
2224 	 */
2225 
2226 	if ((error = md->pmd_write_pmc(cpu, ri,
2227 		 PMC_IS_SAMPLING_MODE(pm->pm_mode) ?
2228 		 pm->pm_sc.pm_reloadcount :
2229 		 pm->pm_sc.pm_initial)) == 0)
2230 		error = md->pmd_start_pmc(cpu, ri);
2231 
2232 	pmc_restore_cpu_binding(&pb);
2233 
2234 	return error;
2235 }
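
/*
 * For context, a sketch of the typical sequence of operations that
 * drives pmc_start() and pmc_stop() through the syscall interface
 * below.  This is a simplified illustration; applications would
 * normally use a userland library rather than issue these ops
 * directly:
 *
 *	PMC_OP_PMCALLOCATE	allocate a PMC, obtaining its id
 *	PMC_OP_PMCATTACH	(virtual PMCs) attach a target process
 *	PMC_OP_PMCSETCOUNT	set the initial or reload count
 *	PMC_OP_PMCSTART		-> pmc_start()
 *	PMC_OP_PMCRW		read the current value
 *	PMC_OP_PMCSTOP		-> pmc_stop()
 *	PMC_OP_PMCRELEASE	release the PMC
 */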
2236 
2237 /*
2238  * Stop a PMC.
2239  */
2240 
2241 static int
2242 pmc_stop(struct pmc *pm)
2243 {
2244 	int error, cpu;
2245 	struct pmc_binding pb;
2246 
2247 	KASSERT(pm != NULL, ("[pmc,%d] null pmc", __LINE__));
2248 
2249 	PMCDBG(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm, pm->pm_mode,
2250 	    pm->pm_rowindex);
2251 
2252 	pm->pm_state = PMC_STATE_STOPPED;
2253 
2254 	/*
2255 	 * If the PMC is a virtual mode one, changing the state to
2256 	 * non-RUNNING is enough to ensure that the PMC never gets
2257 	 * scheduled.
2258 	 *
2259 	 * If this PMC is currently running on a CPU, it will be
2260 	 * handled correctly when its target process is context
2261 	 * switched out.
2262 	 */
2263 
2264 	if (PMC_IS_VIRTUAL_MODE(pm->pm_mode))
2265 		return 0;
2266 
2267 	/*
2268 	 * A system-mode PMC.  Move to the CPU associated with
2269 	 * this PMC, and stop the hardware.  We update the
2270 	 * 'initial count' so that a subsequent PMCSTART will
2271 	 * resume counting from the current hardware count.
2272 	 */
2273 
2274 	pmc_save_cpu_binding(&pb);
2275 
2276 	cpu = pm->pm_gv.pm_cpu;
2277 
2278 	if (pmc_cpu_is_disabled(cpu))
2279 		return ENXIO;
2280 
2281 	pmc_select_cpu(cpu);
2282 
2283 	if ((error = md->pmd_stop_pmc(cpu, pm->pm_rowindex)) == 0)
2284 		error = md->pmd_read_pmc(cpu, pm->pm_rowindex,
2285 		    &pm->pm_sc.pm_initial);
2286 
2287 	pmc_restore_cpu_binding(&pb);
2288 
2289 	return error;
2290 }
2291 
2292 
2293 #if	DEBUG
2294 static const char *pmc_op_to_name[] = {
2295 #undef	__PMC_OP
2296 #define	__PMC_OP(N, D)	#N ,
2297 	__PMC_OPS()
2298 	NULL
2299 };
2300 #endif
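
/*
 * The table above relies on the __PMC_OPS() X-macro (from
 * <sys/pmc.h>): redefining __PMC_OP(N, D) to stringify its first
 * argument turns the operation list into an array of names indexed
 * by opcode.  For illustration (with a hypothetical, abbreviated
 * operation list), if __PMC_OPS() expanded to
 *
 *	__PMC_OP(CONFIGURELOG, "Set log file")
 *	__PMC_OP(GETCPUINFO, "Get system CPU information")
 *
 * the declaration above would become
 *
 *	static const char *pmc_op_to_name[] = {
 *		"CONFIGURELOG",
 *		"GETCPUINFO",
 *		NULL
 *	};
 */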
2301 
2302 /*
2303  * The syscall interface
2304  */
2305 
2306 #define	PMC_GET_SX_XLOCK(...) do {		\
2307 	sx_xlock(&pmc_sx);			\
2308 	if (pmc_hook == NULL) {			\
2309 		sx_xunlock(&pmc_sx);		\
2310 		return __VA_ARGS__;		\
2311 	}					\
2312 } while (0)
2313 
2314 #define	PMC_DOWNGRADE_SX() do {			\
2315 	sx_downgrade(&pmc_sx);			\
2316 	is_sx_downgraded = 1;			\
2317 } while (0)
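
/*
 * A sketch of the locking discipline these two macros implement;
 * the real usage is in pmc_syscall_handler() below:
 *
 *	PMC_GET_SX_XLOCK(ENOSYS);	exclusively lock pmc_sx, or bail
 *					out if the module is unloading
 *	...
 *	PMC_DOWNGRADE_SX();		read-only operations downgrade
 *					to a shared lock
 *	...
 *	then release with sx_sunlock() if downgraded, with
 *	sx_xunlock() otherwise.
 */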
2318 
2319 static int
2320 pmc_syscall_handler(struct thread *td, void *syscall_args)
2321 {
2322 	int error, is_sx_downgraded, op;
2323 	struct pmc_syscall_args *c;
2324 	void *arg;
2325 
2326 	PMC_GET_SX_XLOCK(ENOSYS);
2327 
2328 	is_sx_downgraded = 0;
2329 
2330 	c = (struct pmc_syscall_args *) syscall_args;
2331 
2332 	op = c->pmop_code;
2333 	arg = c->pmop_data;
2334 
2335 	PMCDBG(MOD,PMS,1, "syscall op=%d \"%s\" arg=%p", op,
2336 	    pmc_op_to_name[op], arg);
2337 
2338 	error = 0;
2339 	atomic_add_int(&pmc_stats.pm_syscalls, 1);
2340 
2341 	switch(op)
2342 	{
2343 
2344 
2345 	/*
2346 	 * Configure a log file.
2347 	 *
2348 	 * XXX This OP will be reworked.
2349 	 */
2350 
2351 	case PMC_OP_CONFIGURELOG:
2352 	{
2353 		struct pmc_owner *po;
2354 		struct pmc_op_configurelog cl;
2355 		struct proc *p;
2356 
2357 		sx_assert(&pmc_sx, SX_XLOCKED);
2358 
2359 		if ((error = copyin(arg, &cl, sizeof(cl))) != 0)
2360 			break;
2361 
2362 		/* mark this process as owning a log file */
2363 		p = td->td_proc;
2364 		if ((po = pmc_find_owner_descriptor(p)) == NULL)
2365 			if ((po = pmc_allocate_owner_descriptor(p)) == NULL) {
2366 				error = ENOMEM;	/* a 'return' here would leak pmc_sx */
 				break;
 			}
2367 
2368 		if ((error = pmc_configure_log(po, cl.pm_logfd)) != 0)
2369 			break;
2370 
2371 	}
2372 	break;
2373 
2374 
2375 	/*
2376 	 * Retrieve hardware configuration.
2377 	 */
2378 
2379 	case PMC_OP_GETCPUINFO:	/* CPU information */
2380 	{
2381 		struct pmc_op_getcpuinfo gci;
2382 
2383 		gci.pm_cputype = md->pmd_cputype;
2384 		gci.pm_npmc    = md->pmd_npmc;
2385 		gci.pm_nclass  = md->pmd_nclass;
2386 		bcopy(md->pmd_classes, &gci.pm_classes,
2387 		    sizeof(gci.pm_classes));
2388 		gci.pm_ncpu    = mp_ncpus;
2389 		error = copyout(&gci, arg, sizeof(gci));
2390 	}
2391 	break;
2392 
2393 
2394 	/*
2395 	 * Get module statistics
2396 	 */
2397 
2398 	case PMC_OP_GETDRIVERSTATS:
2399 	{
2400 		struct pmc_op_getdriverstats gms;
2401 
2402 		bcopy(&pmc_stats, &gms, sizeof(gms));
2403 		error = copyout(&gms, arg, sizeof(gms));
2404 	}
2405 	break;
2406 
2407 
2408 	/*
2409 	 * Retrieve module version number
2410 	 */
2411 
2412 	case PMC_OP_GETMODULEVERSION:
2413 	{
2414 		error = copyout(&_pmc_version.mv_version, arg, sizeof(int));
2415 	}
2416 	break;
2417 
2418 
2419 	/*
2420 	 * Retrieve the state of all the PMCs on a given
2421 	 * CPU.
2422 	 */
2423 
2424 	case PMC_OP_GETPMCINFO:
2425 	{
2426 		uint32_t cpu, n, npmc;
2427 		size_t pmcinfo_size;
2428 		struct pmc *pm;
2429 		struct pmc_info *p, *pmcinfo;
2430 		struct pmc_op_getpmcinfo *gpi;
2431 		struct pmc_owner *po;
2432 		struct pmc_binding pb;
2433 
2434 		PMC_DOWNGRADE_SX();
2435 
2436 		gpi = (struct pmc_op_getpmcinfo *) arg;
2437 
2438 		if ((error = copyin(&gpi->pm_cpu, &cpu, sizeof(cpu))) != 0)
2439 			break;
2440 
2441 		if (cpu >= (unsigned int) mp_ncpus) {
2442 			error = EINVAL;
2443 			break;
2444 		}
2445 
2446 		if (pmc_cpu_is_disabled(cpu)) {
2447 			error = ENXIO;
2448 			break;
2449 		}
2450 
2451 		/* switch to CPU 'cpu' */
2452 		pmc_save_cpu_binding(&pb);
2453 		pmc_select_cpu(cpu);
2454 
2455 		npmc = md->pmd_npmc;
2456 
2457 		pmcinfo_size = npmc * sizeof(struct pmc_info);
2458 		MALLOC(pmcinfo, struct pmc_info *, pmcinfo_size, M_PMC,
2459 		    M_WAITOK);
2460 
2461 		p = pmcinfo;
2462 
2463 		for (n = 0; n < md->pmd_npmc; n++, p++) {
2464 
2465 			if ((error = md->pmd_describe(cpu, n, p, &pm)) != 0)
2466 				break;
2467 
2468 			if (PMC_ROW_DISP_IS_STANDALONE(n))
2469 				p->pm_rowdisp = PMC_DISP_STANDALONE;
2470 			else if (PMC_ROW_DISP_IS_THREAD(n))
2471 				p->pm_rowdisp = PMC_DISP_THREAD;
2472 			else
2473 				p->pm_rowdisp = PMC_DISP_FREE;
2474 
2475 			p->pm_ownerpid = -1;
2476 
2477 			if (pm == NULL)	/* no PMC associated */
2478 				continue;
2479 
2480 			po = pm->pm_owner;
2481 
2482 			KASSERT(po->po_owner != NULL,
2483 			    ("[pmc,%d] pmc_owner had a null proc pointer",
2484 				__LINE__));
2485 
2486 			p->pm_ownerpid = po->po_owner->p_pid;
2487 			p->pm_mode     = pm->pm_mode;
2488 			p->pm_event    = pm->pm_event;
2489 			p->pm_flags    = pm->pm_flags;
2490 
2491 			if (PMC_IS_SAMPLING_MODE(pm->pm_mode))
2492 				p->pm_reloadcount =
2493 				    pm->pm_sc.pm_reloadcount;
2494 		}
2495 
2496 		pmc_restore_cpu_binding(&pb);
2497 
2498 		/* now copy out the PMC info collected */
2499 		if (error == 0)
2500 			error = copyout(pmcinfo, &gpi->pm_pmcs, pmcinfo_size);
2501 
2502 		FREE(pmcinfo, M_PMC);
2503 	}
2504 	break;
2505 
2506 
2507 	/*
2508 	 * Set the administrative state of a PMC, i.e., whether
2509 	 * the PMC is to be used or not.
2510 	 */
2511 
2512 	case PMC_OP_PMCADMIN:
2513 	{
2514 		int cpu, ri;
2515 		enum pmc_state request;
2516 		struct pmc_cpu *pc;
2517 		struct pmc_hw *phw;
2518 		struct pmc_op_pmcadmin pma;
2519 		struct pmc_binding pb;
2520 
2521 		sx_assert(&pmc_sx, SX_XLOCKED);
2522 
2523 		KASSERT(td == curthread,
2524 		    ("[pmc,%d] td != curthread", __LINE__));
2525 
2526 		if (suser(td) || jailed(td->td_ucred)) {
2527 			error =  EPERM;
2528 			break;
2529 		}
2530 
2531 		if ((error = copyin(arg, &pma, sizeof(pma))) != 0)
2532 			break;
2533 
2534 		cpu = pma.pm_cpu;
2535 
2536 		if (cpu < 0 || cpu >= mp_ncpus) {
2537 			error = EINVAL;
2538 			break;
2539 		}
2540 
2541 		if (pmc_cpu_is_disabled(cpu)) {
2542 			error = ENXIO;
2543 			break;
2544 		}
2545 
2546 		request = pma.pm_state;
2547 
2548 		if (request != PMC_STATE_DISABLED &&
2549 		    request != PMC_STATE_FREE) {
2550 			error = EINVAL;
2551 			break;
2552 		}
2553 
2554 		ri = pma.pm_pmc; /* pmc id == row index */
2555 		if (ri < 0 || ri >= (int) md->pmd_npmc) {
2556 			error = EINVAL;
2557 			break;
2558 		}
2559 
2560 		/*
2561 		 * We can't disable a PMC with a row-index allocated
2562 		 * for process virtual PMCs.
2563 		 */
2564 
2565 		if (PMC_ROW_DISP_IS_THREAD(ri) &&
2566 		    request == PMC_STATE_DISABLED) {
2567 			error = EBUSY;
2568 			break;
2569 		}
2570 
2571 		/*
2572 		 * otherwise, this PMC on this CPU is either free or
2573 		 * in system-wide mode.
2574 		 */
2575 
2576 		pmc_save_cpu_binding(&pb);
2577 		pmc_select_cpu(cpu);
2578 
2579 		pc  = pmc_pcpu[cpu];
2580 		phw = pc->pc_hwpmcs[ri];
2581 
2582 		/*
2583 		 * XXX do we need some kind of 'forced' disable?
2584 		 */
2585 
2586 		if (phw->phw_pmc == NULL) {
2587 			if (request == PMC_STATE_DISABLED &&
2588 			    (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED)) {
2589 				phw->phw_state &= ~PMC_PHW_FLAG_IS_ENABLED;
2590 				PMC_MARK_ROW_STANDALONE(ri);
2591 			} else if (request == PMC_STATE_FREE &&
2592 			    (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0) {
2593 				phw->phw_state |=  PMC_PHW_FLAG_IS_ENABLED;
2594 				PMC_UNMARK_ROW_STANDALONE(ri);
2595 			}
2596 			/* other cases are a no-op */
2597 		} else
2598 			error = EBUSY;
2599 
2600 		pmc_restore_cpu_binding(&pb);
2601 	}
2602 	break;
2603 
2604 
2605 	/*
2606 	 * Allocate a PMC.
2607 	 */
2608 
2609 	case PMC_OP_PMCALLOCATE:
2610 	{
2611 		uint32_t caps;
2612 		u_int cpu;
2613 		int n;
2614 		enum pmc_mode mode;
2615 		struct pmc *pmc;
2616 		struct pmc_op_pmcallocate pa;
2617 		struct pmc_binding pb;
2618 
2619 		if ((error = copyin(arg, &pa, sizeof(pa))) != 0)
2620 			break;
2621 
2622 		caps = pa.pm_caps;
2623 		mode = pa.pm_mode;
2624 		cpu  = pa.pm_cpu;
2625 
2626 		if ((mode != PMC_MODE_SS  &&  mode != PMC_MODE_SC  &&
2627 		     mode != PMC_MODE_TS  &&  mode != PMC_MODE_TC) ||
2628 		    (cpu != (u_int) PMC_CPU_ANY && cpu >= (u_int) mp_ncpus)) {
2629 			error = EINVAL;
2630 			break;
2631 		}
2632 
2633 		/*
2634 		 * Virtual PMCs should only ask for a default CPU.
2635 		 * System mode PMCs need to specify a non-default CPU.
2636 		 */
2637 
2638 		if ((PMC_IS_VIRTUAL_MODE(mode) && cpu != (u_int) PMC_CPU_ANY) ||
2639 		    (PMC_IS_SYSTEM_MODE(mode) && cpu == (u_int) PMC_CPU_ANY)) {
2640 			error = EINVAL;
2641 			break;
2642 		}
2643 
2644 		/*
2645 		 * Check that a disabled CPU is not being asked for.
2646 		 */
2647 
2648 		if (PMC_IS_SYSTEM_MODE(mode) && pmc_cpu_is_disabled(cpu)) {
2649 			error = ENXIO;
2650 			break;
2651 		}
2652 
2653 		/*
2654 		 * Refuse an allocation for a system-wide PMC if this
2655 		 * process has been jailed, or if this process lacks
2656 		 * super-user credentials and the sysctl tunable
2657 		 * 'security.bsd.unprivileged_syspmcs' is zero.
2658 		 */
2659 
2660 		if (PMC_IS_SYSTEM_MODE(mode)) {
2661 			if (jailed(curthread->td_ucred))
2662 				error = EPERM;
2663 			else if (suser(curthread) &&
2664 			    (pmc_unprivileged_syspmcs == 0))
2665 				error = EPERM;
2666 		}
2667 
2668 		if (error)
2669 			break;
2670 
2671 		/*
2672 		 * Look for valid values for 'pm_flags'
2673 		 */
2674 
2675 		if ((pa.pm_flags & ~(PMC_F_DESCENDANTS|PMC_F_LOG_TC_CSW))
2676 		    != 0) {
2677 			error = EINVAL;
2678 			break;
2679 		}
2680 
2681 		/*
2682 		 * All sampling mode PMCs need to be able to interrupt the
2683 		 * CPU.
2684 		 */
2685 
2686 		if (PMC_IS_SAMPLING_MODE(mode)) {
2687 			caps |= PMC_CAP_INTERRUPT;
2688 			error = ENOSYS; /* for snapshot 6 */
2689 			break;
2690 		}
2691 
2692 		PMCDBG(PMC,ALL,2, "event=%d caps=0x%x mode=%d cpu=%d",
2693 		    pa.pm_ev, caps, mode, cpu);
2694 
2695 		pmc = pmc_allocate_pmc_descriptor();
2696 		pmc->pm_event = pa.pm_ev;
2697 		pmc->pm_class = pa.pm_class;
2698 		pmc->pm_state = PMC_STATE_FREE;
2699 		pmc->pm_mode  = mode;
2700 		pmc->pm_caps  = caps;
2701 		pmc->pm_flags = pa.pm_flags;
2702 
2703 		/* switch thread to CPU 'cpu' */
2704 		pmc_save_cpu_binding(&pb);
2705 
2706 #define	PMC_IS_SHAREABLE_PMC(cpu, n)				\
2707 	(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_state &		\
2708 	 PMC_PHW_FLAG_IS_SHAREABLE)
2709 #define	PMC_IS_UNALLOCATED(cpu, n)				\
2710 	(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_pmc == NULL)
2711 
2712 		if (PMC_IS_SYSTEM_MODE(mode)) {
2713 			pmc_select_cpu(cpu);
2714 			for (n = 0; n < (int) md->pmd_npmc; n++)
2715 				if (pmc_can_allocate_row(n, mode) == 0 &&
2716 				    pmc_can_allocate_rowindex(
2717 					    curthread->td_proc, n) == 0 &&
2718 				    (PMC_IS_UNALLOCATED(cpu, n) ||
2719 				     PMC_IS_SHAREABLE_PMC(cpu, n)) &&
2720 				    md->pmd_allocate_pmc(cpu, n, pmc,
2721 					&pa) == 0)
2722 					break;
2723 		} else {
2724 			/* Process virtual mode */
2725 			for (n = 0; n < (int) md->pmd_npmc; n++) {
2726 				if (pmc_can_allocate_row(n, mode) == 0 &&
2727 				    pmc_can_allocate_rowindex(
2728 					    curthread->td_proc, n) == 0 &&
2729 				    md->pmd_allocate_pmc(curthread->td_oncpu,
2730 					n, pmc, &pa) == 0)
2731 					break;
2732 			}
2733 		}
2734 
2735 #undef	PMC_IS_UNALLOCATED
2736 #undef	PMC_IS_SHAREABLE_PMC
2737 
2738 		pmc_restore_cpu_binding(&pb);
2739 
2740 		if (n == (int) md->pmd_npmc) {
2741 			pmc_destroy_pmc_descriptor(pmc);
2742 			FREE(pmc, M_PMC);
2743 			pmc = NULL;
2744 			error = EINVAL;
2745 			break;
2746 		}
2747 
2748 		PMCDBG(PMC,ALL,2, "ev=%d class=%d mode=%d -> n=%d",
2749 		    pmc->pm_event, pmc->pm_class, pmc->pm_mode, n);
2750 
2751 		/*
2752 		 * Configure global pmc's immediately
2753 		 */
2754 
2755 		if (PMC_IS_SYSTEM_MODE(pmc->pm_mode))
2756 			if ((error = md->pmd_config_pmc(cpu, n, pmc)) != 0) {
2757 				(void) md->pmd_release_pmc(cpu, n, pmc);
2758 				pmc_destroy_pmc_descriptor(pmc);
2759 				FREE(pmc, M_PMC);
2760 				pmc = NULL;
2761 				break;
2762 			}
2763 
2764 		/*
2765 		 * Mark the row index allocated.
2766 		 */
2767 
2768 		pmc->pm_rowindex = n;
2769 		pmc->pm_state    = PMC_STATE_ALLOCATED;
2770 
2771 		/*
2772 		 * mark row disposition
2773 		 */
2774 
2775 		if (PMC_IS_SYSTEM_MODE(mode))
2776 			PMC_MARK_ROW_STANDALONE(n);
2777 		else
2778 			PMC_MARK_ROW_THREAD(n);
2779 
2780 		/*
2781 		 * If this is a system-wide PMC, record the CPU it
2782 		 * was allocated on.
2783 		 */
2784 
2785 		if (PMC_IS_SYSTEM_MODE(mode))
2786 			pmc->pm_gv.pm_cpu = cpu;
2787 
2788 		/*
2789 		 * Register this PMC with the current thread as its owner.
2790 		 */
2791 
2792 		if ((error =
2793 		    pmc_register_owner(curthread->td_proc, pmc)) != 0) {
2794 			pmc_release_pmc_descriptor(pmc);
2795 			FREE(pmc, M_PMC);
2796 			pmc = NULL;
2797 			break;
2798 		}
2799 
2800 		/*
2801 		 * Return the allocated index.
2802 		 */
2803 
2804 		pa.pm_pmcid = n;
2805 
2806 		error = copyout(&pa, arg, sizeof(pa));
2807 	}
2808 	break;
2809 
2810 
2811 	/*
2812 	 * Attach a PMC to a process.
2813 	 */
2814 
2815 	case PMC_OP_PMCATTACH:
2816 	{
2817 		struct pmc *pm;
2818 		struct proc *p;
2819 		struct pmc_op_pmcattach a;
2820 
2821 		sx_assert(&pmc_sx, SX_XLOCKED);
2822 
2823 		if ((error = copyin(arg, &a, sizeof(a))) != 0)
2824 			break;
2825 
2826 		if (a.pm_pid < 0) {
2827 			error = EINVAL;
2828 			break;
2829 		} else if (a.pm_pid == 0)
2830 			a.pm_pid = td->td_proc->p_pid;
2831 
2832 		if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
2833 			break;
2834 
2835 		if (PMC_IS_SYSTEM_MODE(pm->pm_mode)) {
2836 			error = EINVAL;
2837 			break;
2838 		}
2839 
2840 		/* PMCs may be (re)attached only when allocated or stopped */
2841 		if (pm->pm_state == PMC_STATE_RUNNING) {
2842 			error = EBUSY;
2843 			break;
2844 		} else if (pm->pm_state != PMC_STATE_ALLOCATED &&
2845 		    pm->pm_state != PMC_STATE_STOPPED) {
2846 			error = EINVAL;
2847 			break;
2848 		}
2849 
2850 		/* lookup pid */
2851 		if ((p = pfind(a.pm_pid)) == NULL) {
2852 			error = ESRCH;
2853 			break;
2854 		}
2855 
2856 		/*
2857 		 * Ignore processes that are in the process of exiting.
2858 		 */
2859 		if (p->p_flag & P_WEXIT) {
2860 			error = ESRCH;
2861 			PROC_UNLOCK(p);	/* pfind() returns a locked process */
2862 			break;
2863 		}
2864 
2865 		/*
2866 		 * we are allowed to attach a PMC to a process if
2867 		 * we can debug it.
2868 		 */
2869 		error = p_candebug(curthread, p);
2870 
2871 		PROC_UNLOCK(p);
2872 
2873 		if (error == 0)
2874 			error = pmc_attach_process(p, pm);
2875 	}
2876 	break;
2877 
2878 
2879 	/*
2880 	 * Detach an attached PMC from a process.
2881 	 */
2882 
2883 	case PMC_OP_PMCDETACH:
2884 	{
2885 		struct pmc *pm;
2886 		struct proc *p;
2887 		struct pmc_op_pmcattach a;
2888 
2889 		if ((error = copyin(arg, &a, sizeof(a))) != 0)
2890 			break;
2891 
2892 		if (a.pm_pid < 0) {
2893 			error = EINVAL;
2894 			break;
2895 		} else if (a.pm_pid == 0)
2896 			a.pm_pid = td->td_proc->p_pid;
2897 
2898 		if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
2899 			break;
2900 
2901 		if ((p = pfind(a.pm_pid)) == NULL) {
2902 			error = ESRCH;
2903 			break;
2904 		}
2905 
2906 		/*
2907 		 * Treat processes that are in the process of exiting
2908 		 * as if they were not present.
2909 		 */
2910 
2911 		if (p->p_flag & P_WEXIT)
2912 			error = ESRCH;
2913 
2914 		PROC_UNLOCK(p);	/* pfind() returns a locked process */
2915 
2916 		if (error == 0)
2917 			error = pmc_detach_process(p, pm);
2918 	}
2919 	break;
2920 
2921 
2922 	/*
2923 	 * Release an allocated PMC
2924 	 */
2925 
2926 	case PMC_OP_PMCRELEASE:
2927 	{
2928 		pmc_id_t pmcid;
2929 		struct pmc *pm;
2930 		struct pmc_owner *po;
2931 		struct pmc_op_simple sp;
2932 
2933 		/*
2934 		 * Find PMC pointer for the named PMC.
2935 		 *
2936 		 * Use pmc_release_pmc_descriptor() to switch off the
2937 		 * PMC, detach all its target processes, and remove the
2938 		 * PMC from its owner's list.
2939 		 *
2940 		 * Remove the owner record if this is the last PMC
2941 		 * owned.
2942 		 *
2943 		 * Free up space.
2944 		 */
2945 
2946 		if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
2947 			break;
2948 
2949 		pmcid = sp.pm_pmcid;
2950 
2951 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
2952 			break;
2953 
2954 		po = pm->pm_owner;
2955 		pmc_release_pmc_descriptor(pm);
2956 		pmc_maybe_remove_owner(po);
2957 
2958 		FREE(pm, M_PMC);
2959 	}
2960 	break;
2961 
2962 
2963 	/*
2964 	 * Read and/or write a PMC.
2965 	 */
2966 
2967 	case PMC_OP_PMCRW:
2968 	{
2969 		uint32_t cpu, ri;
2970 		struct pmc *pm;
2971 		struct pmc_op_pmcrw *pprw;
2972 		struct pmc_op_pmcrw prw;
2973 		struct pmc_binding pb;
2974 		pmc_value_t oldvalue;
2975 
2976 		PMC_DOWNGRADE_SX();
2977 
2978 		if ((error = copyin(arg, &prw, sizeof(prw))) != 0)
2979 			break;
2980 
2981 		PMCDBG(PMC,OPS,1, "rw id=%d flags=0x%x", prw.pm_pmcid,
2982 		    prw.pm_flags);
2983 
2984 		/* must have at least one flag set */
2985 		if ((prw.pm_flags & (PMC_F_OLDVALUE|PMC_F_NEWVALUE)) == 0) {
2986 			error = EINVAL;
2987 			break;
2988 		}
2989 
2990 		/* locate pmc descriptor */
2991 		if ((error = pmc_find_pmc(prw.pm_pmcid, &pm)) != 0)
2992 			break;
2993 
2994 		/* the PMC must be allocated, stopped or running to be read */
2995 		if (pm->pm_state != PMC_STATE_ALLOCATED &&
2996 		    pm->pm_state != PMC_STATE_STOPPED &&
2997 		    pm->pm_state != PMC_STATE_RUNNING) {
2998 			error = EINVAL;
2999 			break;
3000 		}
3001 
3002 		/* writing a new value is not permitted for RUNNING pmcs */
3003 		if (pm->pm_state == PMC_STATE_RUNNING &&
3004 		    (prw.pm_flags & PMC_F_NEWVALUE)) {
3005 			error = EBUSY;
3006 			break;
3007 		}
3008 
3009 		if (PMC_IS_VIRTUAL_MODE(pm->pm_mode)) {
3010 
3011 			/* read/write the saved value in the PMC record */
3012 			mtx_pool_lock_spin(pmc_mtxpool, pm);
3013 			if (prw.pm_flags & PMC_F_OLDVALUE)
3014 				oldvalue = pm->pm_gv.pm_savedvalue;
3015 			if (prw.pm_flags & PMC_F_NEWVALUE)
3016 				pm->pm_gv.pm_savedvalue = prw.pm_value;
3017 			mtx_pool_unlock_spin(pmc_mtxpool, pm);
3018 
3019 		} else { /* System mode PMCs */
3020 			cpu = pm->pm_gv.pm_cpu;
3021 			ri  = pm->pm_rowindex;
3022 
3023 			if (pmc_cpu_is_disabled(cpu)) {
3024 				error = ENXIO;
3025 				break;
3026 			}
3027 
3028 			/* move this thread to CPU 'cpu' */
3029 			pmc_save_cpu_binding(&pb);
3030 			pmc_select_cpu(cpu);
3031 
3032 			/* save old value */
3033 			if (prw.pm_flags & PMC_F_OLDVALUE)
3034 				if ((error = (*md->pmd_read_pmc)(cpu, ri,
3035 					 &oldvalue)))
3036 					goto error;
3037 			/* write out new value */
3038 			if (prw.pm_flags & PMC_F_NEWVALUE)
3039 				error = (*md->pmd_write_pmc)(cpu, ri,
3040 				    prw.pm_value);
3041 		error:
3042 			pmc_restore_cpu_binding(&pb);
3043 			if (error)
3044 				break;
3045 		}
3046 
3047 		pprw = (struct pmc_op_pmcrw *) arg;
3048 
3049 #if	DEBUG
3050 		if (prw.pm_flags & PMC_F_NEWVALUE)
3051 			PMCDBG(PMC,OPS,2, "rw id=%d new %jx -> old %jx",
3052 			    prw.pm_pmcid, prw.pm_value, oldvalue);
3053 		else
3054 			PMCDBG(PMC,OPS,2, "rw id=%d -> old %jx", prw.pm_pmcid, oldvalue);
3055 #endif
3056 
3057 		/* return old value if requested */
3058 		if (prw.pm_flags & PMC_F_OLDVALUE)
3059 			if ((error = copyout(&oldvalue, &pprw->pm_value,
3060 				 sizeof(prw.pm_value))))
3061 				break;
3062 
3063 		/*
3064 		 * send a signal (SIGIO) to the owner if it is trying to read
3065 		 * a PMC with no target processes attached.
3066 		 */
3067 
3068 		if (LIST_EMPTY(&pm->pm_targets) &&
3069 		    (prw.pm_flags & PMC_F_OLDVALUE)) {
3070 			PROC_LOCK(curthread->td_proc);
3071 			psignal(curthread->td_proc, SIGIO);
3072 			PROC_UNLOCK(curthread->td_proc);
3073 		}
3074 	}
3075 	break;
3076 
3077 
3078 	/*
3079 	 * Set the sampling rate for a sampling mode PMC and the
3080 	 * initial count for a counting mode PMC.
3081 	 */
3082 
3083 	case PMC_OP_PMCSETCOUNT:
3084 	{
3085 		struct pmc *pm;
3086 		struct pmc_op_pmcsetcount sc;
3087 
3088 		PMC_DOWNGRADE_SX();
3089 
3090 		if ((error = copyin(arg, &sc, sizeof(sc))) != 0)
3091 			break;
3092 
3093 		if ((error = pmc_find_pmc(sc.pm_pmcid, &pm)) != 0)
3094 			break;
3095 
3096 		if (pm->pm_state == PMC_STATE_RUNNING) {
3097 			error = EBUSY;
3098 			break;
3099 		}
3100 
3101 		if (PMC_IS_SAMPLING_MODE(pm->pm_mode))
3102 			pm->pm_sc.pm_reloadcount = sc.pm_count;
3103 		else
3104 			pm->pm_sc.pm_initial = sc.pm_count;
3105 	}
3106 	break;
3107 
3108 
3109 	/*
3110 	 * Start a PMC.
3111 	 */
3112 
3113 	case PMC_OP_PMCSTART:
3114 	{
3115 		pmc_id_t pmcid;
3116 		struct pmc *pm;
3117 		struct pmc_op_simple sp;
3118 
3119 		sx_assert(&pmc_sx, SX_XLOCKED);
3120 
3121 		if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
3122 			break;
3123 
3124 		pmcid = sp.pm_pmcid;
3125 
3126 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
3127 			break;
3128 
3129 		KASSERT(pmcid == pm->pm_rowindex,
3130 		    ("[pmc,%d] row index %d != id %d", __LINE__,
3131 			pm->pm_rowindex, pmcid));
3132 
3133 		if (pm->pm_state == PMC_STATE_RUNNING) /* already running */
3134 			break;
3135 		else if (pm->pm_state != PMC_STATE_STOPPED &&
3136 		    pm->pm_state != PMC_STATE_ALLOCATED) {
3137 			error = EINVAL;
3138 			break;
3139 		}
3140 
3141 		error = pmc_start(pm);
3142 	}
3143 	break;
3144 
3145 
3146 	/*
3147 	 * Stop a PMC.
3148 	 */
3149 
3150 	case PMC_OP_PMCSTOP:
3151 	{
3152 		pmc_id_t pmcid;
3153 		struct pmc *pm;
3154 		struct pmc_op_simple sp;
3155 
3156 		PMC_DOWNGRADE_SX();
3157 
3158 		if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
3159 			break;
3160 
3161 		pmcid = sp.pm_pmcid;
3162 
3163 		/*
3164 		 * Mark the PMC as inactive and invoke the MD stop
3165 		 * routines if needed.
3166 		 */
3167 
3168 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
3169 			break;
3170 
3171 		KASSERT(pmcid == pm->pm_rowindex,
3172 		    ("[pmc,%d] row index %d != pmcid %d", __LINE__,
3173 			pm->pm_rowindex, pmcid));
3174 
3175 		if (pm->pm_state == PMC_STATE_STOPPED) /* already stopped */
3176 			break;
3177 		else if (pm->pm_state != PMC_STATE_RUNNING) {
3178 			error = EINVAL;
3179 			break;
3180 		}
3181 
3182 		error = pmc_stop(pm);
3183 	}
3184 	break;
3185 
3186 
3187 	/*
3188 	 * Write a user-entry to the log file.
3189 	 */
3190 
3191 	case PMC_OP_WRITELOG:
3192 	{
3193 
3194 		PMC_DOWNGRADE_SX();
3195 
3196 		/*
3197 		 * flush all per-cpu hash tables
3198 		 * append user-log entry
3199 		 */
3200 
3201 		error = ENOSYS;
3202 	}
3203 	break;
3204 
3205 
3206 #if __i386__ || __amd64__
3207 
3208 	/*
3209 	 * Machine dependent operation for i386-class processors.
3210 	 *
3211 	 * Retrieve the MSR number associated with the counter
3212 	 * 'pmc_id'.  This allows processes to directly use RDPMC
3213 	 * instructions to read their PMCs, without the overhead of a
3214 	 * system call.
3215 	 */
3216 
3217 	case PMC_OP_PMCX86GETMSR:
3218 	{
3219 		int ri;
3220 		struct pmc	*pm;
3221 		struct pmc_op_x86_getmsr gm;
3222 
3223 		PMC_DOWNGRADE_SX();
3224 
3225 		/* CPU has no 'GETMSR' support */
3226 		if (md->pmd_get_msr == NULL) {
3227 			error = ENOSYS;
3228 			break;
3229 		}
3230 
3231 		if ((error = copyin(arg, &gm, sizeof(gm))) != 0)
3232 			break;
3233 
3234 		if ((error = pmc_find_pmc(gm.pm_pmcid, &pm)) != 0)
3235 			break;
3236 
3237 		/*
3238 		 * The allocated PMC needs to be a process virtual PMC,
3239 		 * i.e., of type T[CS].
3240 		 *
3241 		 * Global PMCs can only be read using the PMC_OP_PMCRW
3242 		 * operation since they may be allocated on a
3243 		 * different CPU than the one we could be running on
3244 		 * at the time of the read.
3245 		 */
3246 
3247 		if (!PMC_IS_VIRTUAL_MODE(pm->pm_mode)) {
3248 			error = EINVAL;
3249 			break;
3250 		}
3251 
3252 		ri = pm->pm_rowindex;
3253 
3254 		if ((error = (*md->pmd_get_msr)(ri, &gm.pm_msr)) < 0)
3255 			break;
3256 		if ((error = copyout(&gm, arg, sizeof(gm))) < 0)
3257 			break;
3258 	}
3259 	break;
3260 #endif
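
/*
 * A hypothetical userland sketch of what PMC_OP_PMCX86GETMSR makes
 * possible: the 'pm_msr' value copied out above is handed to RDPMC,
 * so the counter can be read without a system call (illustration
 * only; the helper name is made up):
 *
 *	static uint64_t
 *	read_pmc_direct(uint32_t msr)
 *	{
 *		uint32_t lo, hi;
 *
 *		__asm __volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (msr));
 *		return (((uint64_t) hi << 32) | lo);
 *	}
 */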
3261 
3262 	default:
3263 		error = EINVAL;
3264 		break;
3265 	}
3266 
3267 	if (is_sx_downgraded)
3268 		sx_sunlock(&pmc_sx);
3269 	else
3270 		sx_xunlock(&pmc_sx);
3271 
3272 	if (error)
3273 		atomic_add_int(&pmc_stats.pm_syscall_errors, 1);
3274 
3275 	return error;
3276 }
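
/*
 * For reference, a hypothetical userland sketch of invoking one of
 * the operations handled above.  The module allocates a dynamic
 * syscall taking an (opcode, data) pair; 'pmc_syscall' stands in
 * for the syscall number assigned at load time.  Applications would
 * normally use a userland library instead:
 *
 *	struct pmc_op_getcpuinfo gci;
 *
 *	if (syscall(pmc_syscall, PMC_OP_GETCPUINFO, &gci) == 0)
 *		printf("%u cpus, %u pmcs/cpu\n", gci.pm_ncpu, gci.pm_npmc);
 */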
3277 
3278 /*
3279  * Helper functions
3280  */
3281 
3282 /*
3283  * Configure a log file.
3284  */
3285 
3286 static int
3287 pmc_configure_log(struct pmc_owner *po, int logfd)
3288 {
3289 	struct proc *p;
3290 
3291 	return ENOSYS; /* for now */
3292 
3293 	p = po->po_owner;
3294 
3295 	if (po->po_logfd < 0 && logfd < 0) /* nothing to do */
3296 		return 0;
3297 
3298 	if (po->po_logfd >= 0 && logfd < 0) {
3299 		/* deconfigure log */
3300 		/* XXX */
3301 		po->po_flags &= ~PMC_FLAG_OWNS_LOGFILE;
3302 		pmc_maybe_remove_owner(po);
3303 
3304 	} else if (po->po_logfd < 0 && logfd >= 0) {
3305 		/* configure log file */
3306 		/* XXX */
3307 		po->po_flags |= PMC_FLAG_OWNS_LOGFILE;
3308 
3309 		/* mark process as using HWPMCs */
3310 		PROC_LOCK(p);
3311 		p->p_flag |= P_HWPMC;
3312 		PROC_UNLOCK(p);
3313 	} else
3314 		return EBUSY;
3315 
3316 	return 0;
3317 }
3318 
3319 /*
3320  * Log an exit event to the PMC owner's log file.
3321  */
3322 
3323 static void
3324 pmc_log_process_exit(struct pmc *pm, struct pmc_process *pp)
3325 {
3326 	KASSERT(pm->pm_flags & PMC_F_LOG_TC_PROCEXIT,
3327 	    ("[pmc,%d] log-process-exit called gratuitously", __LINE__));
3328 
3329 	(void) pm;
3330 	(void) pp;
3331 
3332 	return;
3333 }
3334 
3335 /*
3336  * Event handlers.
3337  */
3338 
3339 /*
3340  * Handle a process exit.
3341  *
3342  * XXX This eventhandler gets called early in the exit process.
3343  * Consider using a 'hook' invocation from thread_exit() or equivalent
3344  * spot.  Another negative is that kse_exit doesn't seem to call
3345  * exit1() [??].
3346  */
3347 
3348 static void
3349 pmc_process_exit(void *arg __unused, struct proc *p)
3350 {
3351 	int is_using_hwpmcs;
3352 
3353 	PROC_LOCK(p);
3354 	is_using_hwpmcs = p->p_flag & P_HWPMC;
3355 	PROC_UNLOCK(p);
3356 
3357 	if (is_using_hwpmcs) {
3358 		PMCDBG(PRC,EXT,1,"process-exit proc=%p (%d, %s)", p, p->p_pid,
3359 		    p->p_comm);
3360 
3361 		PMC_GET_SX_XLOCK();
3362 		(void) pmc_hook_handler(curthread, PMC_FN_PROCESS_EXIT,
3363 		    (void *) p);
3364 		sx_xunlock(&pmc_sx);
3365 	}
3366 }
3367 
3368 /*
3369  * Handle a process fork.
3370  *
3371  * If the parent process 'p1' is under HWPMC monitoring, then copy
3372  * over any attached PMCs that have 'do_descendants' semantics.
3373  */
3374 
3375 static void
3376 pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *p2,
3377     int flags)
3378 {
3379 	int is_using_hwpmcs;
3380 
3381 	(void) flags;		/* unused parameter */
3382 
3383 	PROC_LOCK(p1);
3384 	is_using_hwpmcs = p1->p_flag & P_HWPMC;
3385 	PROC_UNLOCK(p1);
3386 
3387 	if (is_using_hwpmcs) {
3388 		PMCDBG(PMC,FRK,1, "process-fork proc=%p (%d, %s)", p1,
3389 		    p1->p_pid, p1->p_comm);
3390 		PMC_GET_SX_XLOCK();
3391 		(void) pmc_hook_handler(curthread, PMC_FN_PROCESS_FORK,
3392 		    (void *) p2);
3393 		sx_xunlock(&pmc_sx);
3394 	}
3395 }
3396 
3397 
3398 /*
3399  * initialization
3400  */
3401 
3402 static const char *pmc_name_of_pmcclass[] = {
3403 #undef	__PMC_CLASS
3404 #define	__PMC_CLASS(N) #N ,
3405 	__PMC_CLASSES()
3406 };
3407 
3408 static int
3409 pmc_initialize(void)
3410 {
3411 	int error, cpu, n;
3412 	struct pmc_binding pb;
3413 
3414 	md = NULL;
3415 	error = 0;
3416 
3417 #if	DEBUG
3418 	/* parse debug flags first */
3419 	if (TUNABLE_STR_FETCH(PMC_SYSCTL_NAME_PREFIX "debugflags",
3420 		pmc_debugstr, sizeof(pmc_debugstr)))
3421 		pmc_debugflags_parse(pmc_debugstr,
3422 		    pmc_debugstr+strlen(pmc_debugstr));
3423 #endif
3424 
3425 	PMCDBG(MOD,INI,0, "PMC Initialize (version %x)", PMC_VERSION);
3426 
3427 	/*
3428 	 * check sysctl parameters
3429 	 */
3430 
3431 	if (pmc_hashsize <= 0) {
3432 		(void) printf("pmc: sysctl variable \""
3433 		    PMC_SYSCTL_NAME_PREFIX "hashsize\" must be greater than "
3434 		    "zero\n");
3435 		pmc_hashsize = PMC_HASH_SIZE;
3436 	}
3437 
3438 #if	defined(__i386__)
3439 	/* determine the CPU kind.  This is i386 specific */
3440 	if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
3441 		md = pmc_amd_initialize();
3442 	else if (strcmp(cpu_vendor, "GenuineIntel") == 0)
3443 		md = pmc_intel_initialize();
3444 	/* XXX: what about the other i386 CPU manufacturers? */
3445 #elif	defined(__amd64__)
3446 	if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
3447 		md = pmc_amd_initialize();
3448 #else  /* other architectures */
3449 	md = NULL;
3450 #endif
3451 
3452 	if (md == NULL || md->pmd_init == NULL)
3453 		return ENOSYS;
3454 
3455 	/* allocate space for the per-cpu array */
3456 	MALLOC(pmc_pcpu, struct pmc_cpu **, mp_ncpus * sizeof(struct pmc_cpu *),
3457 	    M_PMC, M_WAITOK|M_ZERO);
3458 
3459 	/* per-cpu 'saved values' for managing process-mode PMCs */
3460 	MALLOC(pmc_pcpu_saved, pmc_value_t *,
3461 	    sizeof(pmc_value_t) * mp_ncpus * md->pmd_npmc, M_PMC, M_WAITOK);
3462 
3463 	/* perform cpu dependent initialization */
3464 	pmc_save_cpu_binding(&pb);
3465 	for (cpu = 0; cpu < mp_ncpus; cpu++) {
3466 		if (pmc_cpu_is_disabled(cpu))
3467 			continue;
3468 		pmc_select_cpu(cpu);
3469 		if ((error = md->pmd_init(cpu)) != 0)
3470 			break;
3471 	}
3472 	pmc_restore_cpu_binding(&pb);
3473 
3474 	if (error != 0)
3475 		return error;
3476 
3477 	/* allocate space for the row disposition array */
3478 	pmc_pmcdisp = malloc(sizeof(enum pmc_mode) * md->pmd_npmc,
3479 	    M_PMC, M_WAITOK|M_ZERO);
3480 
3481 	KASSERT(pmc_pmcdisp != NULL,
3482 	    ("[pmc,%d] pmcdisp allocation returned NULL", __LINE__));
3483 
3484 	/* mark all PMCs as available */
3485 	for (n = 0; n < (int) md->pmd_npmc; n++)
3486 		PMC_MARK_ROW_FREE(n);
3487 
3488 	/* allocate thread hash tables */
3489 	pmc_ownerhash = hashinit(pmc_hashsize, M_PMC,
3490 	    &pmc_ownerhashmask);
3491 
3492 	pmc_processhash = hashinit(pmc_hashsize, M_PMC,
3493 	    &pmc_processhashmask);
3494 	mtx_init(&pmc_processhash_mtx, "pmc-process-hash", "pmc", MTX_SPIN);
3495 
3496 	/* allocate a pool of spin mutexes */
3497 	pmc_mtxpool = mtx_pool_create("pmc", pmc_mtxpool_size, MTX_SPIN);
3498 
3499 	PMCDBG(MOD,INI,1, "pmc_ownerhash=%p, mask=0x%lx "
3500 	    "processhash=%p mask=0x%lx", pmc_ownerhash, pmc_ownerhashmask,
3501 	    pmc_processhash, pmc_processhashmask);
3502 
3503 	/* register process {exit,fork,exec} handlers */
3504 	pmc_exit_tag = EVENTHANDLER_REGISTER(process_exit,
3505 	    pmc_process_exit, NULL, EVENTHANDLER_PRI_ANY);
3506 	pmc_fork_tag = EVENTHANDLER_REGISTER(process_fork,
3507 	    pmc_process_fork, NULL, EVENTHANDLER_PRI_ANY);
3508 
3509 	/* set hook functions */
3510 	pmc_intr = md->pmd_intr;
3511 	pmc_hook = pmc_hook_handler;
3512 
3513 	if (error == 0) {
3514 		printf(PMC_MODULE_NAME ":");
3515 		for (n = 0; n < (int) md->pmd_nclass; n++)
3516 			printf(" %s(%d)",
3517 			    pmc_name_of_pmcclass[md->pmd_classes[n]],
3518 			    md->pmd_nclasspmcs[n]);
3519 		printf("\n");
3520 	}
3521 
3522 	return error;
3523 }
3524 
3525 /* prepare to be unloaded */
3526 static void
3527 pmc_cleanup(void)
3528 {
3529 	int cpu;
3530 	struct pmc_ownerhash *ph;
3531 	struct pmc_owner *po, *tmp;
3532 	struct pmc_binding pb;
3533 #if	DEBUG
3534 	struct pmc_processhash *prh;
3535 #endif
3536 
3537 	PMCDBG(MOD,INI,0, "%s", "cleanup");
3538 
3539 	pmc_intr = NULL;	/* no more interrupts please */
3540 
3541 	sx_xlock(&pmc_sx);
3542 	if (pmc_hook == NULL) {	/* being unloaded already */
3543 		sx_xunlock(&pmc_sx);
3544 		return;
3545 	}
3546 
3547 	pmc_hook = NULL; /* prevent new threads from entering module */
3548 
3549 	/* deregister event handlers */
3550 	EVENTHANDLER_DEREGISTER(process_fork, pmc_fork_tag);
3551 	EVENTHANDLER_DEREGISTER(process_exit, pmc_exit_tag);
3552 
3553 	/* send SIGBUS to all owner threads, free up allocations */
3554 	if (pmc_ownerhash)
3555 		for (ph = pmc_ownerhash;
3556 		     ph <= &pmc_ownerhash[pmc_ownerhashmask];
3557 		     ph++) {
3558 			LIST_FOREACH_SAFE(po, ph, po_next, tmp) {
3559 				pmc_remove_owner(po);
3560 
3561 				/* send SIGBUS to owner processes */
3562 				PMCDBG(MOD,INI,2, "cleanup signal proc=%p "
3563 				    "(%d, %s)", po->po_owner,
3564 				    po->po_owner->p_pid,
3565 				    po->po_owner->p_comm);
3566 
3567 				PROC_LOCK(po->po_owner);
3568 				psignal(po->po_owner, SIGBUS);
3569 				PROC_UNLOCK(po->po_owner);
3570 				FREE(po, M_PMC);
3571 			}
3572 		}
3573 
3574 	/* reclaim allocated data structures */
3575 	if (pmc_mtxpool)
3576 		mtx_pool_destroy(&pmc_mtxpool);
3577 
3578 	mtx_destroy(&pmc_processhash_mtx);
3579 	if (pmc_processhash) {
3580 #if	DEBUG
3581 		struct pmc_process *pp;
3582 
3583 		PMCDBG(MOD,INI,3, "%s", "destroy process hash");
3584 		for (prh = pmc_processhash;
3585 		     prh <= &pmc_processhash[pmc_processhashmask];
3586 		     prh++)
3587 			LIST_FOREACH(pp, prh, pp_next)
3588 			    PMCDBG(MOD,INI,3, "pid=%d", pp->pp_proc->p_pid);
3589 #endif
3590 
3591 		hashdestroy(pmc_processhash, M_PMC, pmc_processhashmask);
3592 		pmc_processhash = NULL;
3593 	}
3594 
3595 	if (pmc_ownerhash) {
3596 		PMCDBG(MOD,INI,3, "%s", "destroy owner hash");
3597 		hashdestroy(pmc_ownerhash, M_PMC, pmc_ownerhashmask);
3598 		pmc_ownerhash = NULL;
3599 	}
3600 
3601  	/* do processor dependent cleanup */
3602 	PMCDBG(MOD,INI,3, "%s", "md cleanup");
3603 	if (md) {
3604 		pmc_save_cpu_binding(&pb);
3605 		for (cpu = 0; cpu < mp_ncpus; cpu++) {
3606 			PMCDBG(MOD,INI,1,"pmc-cleanup cpu=%d pcs=%p",
3607 			    cpu, pmc_pcpu[cpu]);
3608 			if (pmc_cpu_is_disabled(cpu))
3609 				continue;
3610 			pmc_select_cpu(cpu);
3611 			if (pmc_pcpu[cpu])
3612 				(void) md->pmd_cleanup(cpu);
3613 		}
3614 		FREE(md, M_PMC);
3615 		md = NULL;
3616 		pmc_restore_cpu_binding(&pb);
3617 	}
3618 
3619 	/* deallocate per-cpu structures */
3620 	FREE(pmc_pcpu, M_PMC);
3621 	pmc_pcpu = NULL;
3622 
3623 	FREE(pmc_pcpu_saved, M_PMC);
3624 	pmc_pcpu_saved = NULL;
3625 
3626 	if (pmc_pmcdisp) {
3627 		FREE(pmc_pmcdisp, M_PMC);
3628 		pmc_pmcdisp = NULL;
3629 	}
3630 
3631 	sx_xunlock(&pmc_sx); 	/* we are done */
3632 }
3633 
3634 /*
3635  * The function called at load/unload.
3636  */
3637 
3638 static int
3639 load (struct module *module __unused, int cmd, void *arg __unused)
3640 {
3641 	int error;
3642 
3643 	error = 0;
3644 
3645 	switch (cmd) {
3646 	case MOD_LOAD :
3647 		/* initialize the subsystem */
3648 		error = pmc_initialize();
3649 		if (error != 0)
3650 			break;
3651 		PMCDBG(MOD,INI,1, "syscall=%d ncpus=%d",
3652 		    pmc_syscall_num, mp_ncpus);
3653 		break;
3654 
3655 
3656 	case MOD_UNLOAD :
3657 	case MOD_SHUTDOWN:
3658 		pmc_cleanup();
3659 		PMCDBG(MOD,INI,1, "%s", "unloaded");
3660 		break;
3661 
3662 	default :
3663 		error = EINVAL;	/* XXX should panic(9) */
3664 		break;
3665 	}
3666 
3667 	return error;
3668 }
3669 
3670 /* memory pool */
3671 MALLOC_DEFINE(M_PMC, "pmc", "Memory space for the PMC module");
3672