xref: /freebsd/sys/dev/hwpmc/hwpmc_mod.c (revision 91c878a6935c5c2e99866eb267e5bc3028bf6d2f)
1 /*-
2  * Copyright (c) 2003-2006 Joseph Koshy
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include <sys/param.h>
32 #include <sys/eventhandler.h>
33 #include <sys/jail.h>
34 #include <sys/kernel.h>
35 #include <sys/kthread.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/mutex.h>
41 #include <sys/pmc.h>
42 #include <sys/pmckern.h>
43 #include <sys/pmclog.h>
44 #include <sys/proc.h>
45 #include <sys/queue.h>
46 #include <sys/resourcevar.h>
47 #include <sys/sched.h>
48 #include <sys/signalvar.h>
49 #include <sys/smp.h>
50 #include <sys/sx.h>
51 #include <sys/sysctl.h>
52 #include <sys/sysent.h>
53 #include <sys/systm.h>
54 #include <sys/vnode.h>
55 
56 #include <sys/linker.h>		/* needs to be after <sys/malloc.h> */
57 
58 #include <machine/atomic.h>
59 #include <machine/md_var.h>
60 
61 /*
62  * Types
63  */
64 
65 enum pmc_flags {
66 	PMC_FLAG_NONE	  = 0x00, /* do nothing */
67 	PMC_FLAG_REMOVE   = 0x01, /* atomically remove entry from hash */
68 	PMC_FLAG_ALLOCATE = 0x02, /* add entry to hash if not found */
69 };
70 
71 /*
72  * The offset in sysent where the syscall is allocated.
73  */
74 
75 static int pmc_syscall_num = NO_SYSCALL;
76 struct pmc_cpu		**pmc_pcpu;	 /* per-cpu state */
77 pmc_value_t		*pmc_pcpu_saved; /* saved PMC values: CSW handling */
78 
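/*
 * pmc_pcpu_saved is laid out as md->pmd_npmc contiguous values per
 * CPU; the macro below selects the saved value for PMC row index R
 * on CPU C.
 */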
79 #define	PMC_PCPU_SAVED(C,R)	pmc_pcpu_saved[(R) + md->pmd_npmc*(C)]
80 
81 struct mtx_pool		*pmc_mtxpool;
82 static int		*pmc_pmcdisp;	 /* PMC row dispositions */
83 
84 #define	PMC_ROW_DISP_IS_FREE(R)		(pmc_pmcdisp[(R)] == 0)
85 #define	PMC_ROW_DISP_IS_THREAD(R)	(pmc_pmcdisp[(R)] > 0)
86 #define	PMC_ROW_DISP_IS_STANDALONE(R)	(pmc_pmcdisp[(R)] < 0)
87 
88 #define	PMC_MARK_ROW_FREE(R) do {					  \
89 	pmc_pmcdisp[(R)] = 0;						  \
90 } while (0)
91 
92 #define	PMC_MARK_ROW_STANDALONE(R) do {					  \
93 	KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
94 		    __LINE__));						  \
95 	atomic_add_int(&pmc_pmcdisp[(R)], -1);				  \
96 	KASSERT(pmc_pmcdisp[(R)] >= (-mp_ncpus), ("[pmc,%d] row "	  \
97 		"disposition error", __LINE__));			  \
98 } while (0)
99 
100 #define	PMC_UNMARK_ROW_STANDALONE(R) do { 				  \
101 	atomic_add_int(&pmc_pmcdisp[(R)], 1);				  \
102 	KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
103 		    __LINE__));						  \
104 } while (0)
105 
106 #define	PMC_MARK_ROW_THREAD(R) do {					  \
107 	KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
108 		    __LINE__));						  \
109 	atomic_add_int(&pmc_pmcdisp[(R)], 1);				  \
110 } while (0)
111 
112 #define	PMC_UNMARK_ROW_THREAD(R) do {					  \
113 	atomic_add_int(&pmc_pmcdisp[(R)], -1);				  \
114 	KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
115 		    __LINE__));						  \
116 } while (0)
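/*
 * The row disposition macros above use the sign of pmc_pmcdisp[R] as
 * a reference count: zero means row R is free, a positive value
 * counts the process-virtual PMCs using the row, and a negative
 * value counts the CPUs on which a system-wide PMC occupies the row.
 * For example, a row claimed by system-scope PMCs on two CPUs has a
 * disposition of -2.
 */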
117 
118 
119 /* various event handlers */
120 static eventhandler_tag	pmc_exit_tag, pmc_fork_tag;
121 
122 /* Module statistics */
123 struct pmc_op_getdriverstats pmc_stats;
124 
125 /* Machine/processor dependent operations */
126 struct pmc_mdep  *md;
127 
128 /*
129  * Hash tables mapping owner and target processes to PMCs.
130  */
131 
132 struct mtx pmc_processhash_mtx;		/* spin mutex */
133 static u_long pmc_processhashmask;
134 static LIST_HEAD(pmc_processhash, pmc_process)	*pmc_processhash;
135 
136 /*
137  * Hash table of PMC owner descriptors.  This table is protected by
138  * the shared PMC "sx" lock.
139  */
140 
141 static u_long pmc_ownerhashmask;
142 static LIST_HEAD(pmc_ownerhash, pmc_owner)	*pmc_ownerhash;
143 
144 /*
145  * List of PMC owners with system-wide sampling PMCs.
146  */
147 
148 static LIST_HEAD(, pmc_owner)			pmc_ss_owners;
149 
150 
151 /*
152  * Prototypes
153  */
154 
155 #ifdef	DEBUG
156 static int	pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS);
157 static int	pmc_debugflags_parse(char *newstr, char *fence);
158 #endif
159 
160 static int	load(struct module *module, int cmd, void *arg);
161 static int	pmc_attach_process(struct proc *p, struct pmc *pm);
162 static struct pmc *pmc_allocate_pmc_descriptor(void);
163 static struct pmc_owner *pmc_allocate_owner_descriptor(struct proc *p);
164 static int	pmc_attach_one_process(struct proc *p, struct pmc *pm);
165 static int	pmc_can_allocate_rowindex(struct proc *p, unsigned int ri,
166     int cpu);
167 static int	pmc_can_attach(struct pmc *pm, struct proc *p);
168 static void	pmc_cleanup(void);
169 static int	pmc_detach_process(struct proc *p, struct pmc *pm);
170 static int	pmc_detach_one_process(struct proc *p, struct pmc *pm,
171     int flags);
172 static void	pmc_destroy_owner_descriptor(struct pmc_owner *po);
173 static struct pmc_owner *pmc_find_owner_descriptor(struct proc *p);
174 static int	pmc_find_pmc(pmc_id_t pmcid, struct pmc **pm);
175 static struct pmc *pmc_find_pmc_descriptor_in_process(struct pmc_owner *po,
176     pmc_id_t pmc);
177 static struct pmc_process *pmc_find_process_descriptor(struct proc *p,
178     uint32_t mode);
179 static void	pmc_force_context_switch(void);
180 static void	pmc_link_target_process(struct pmc *pm,
181     struct pmc_process *pp);
182 static void	pmc_maybe_remove_owner(struct pmc_owner *po);
183 static void	pmc_process_csw_in(struct thread *td);
184 static void	pmc_process_csw_out(struct thread *td);
185 static void	pmc_process_exit(void *arg, struct proc *p);
186 static void	pmc_process_fork(void *arg, struct proc *p1,
187     struct proc *p2, int n);
188 static void	pmc_process_samples(int cpu);
189 static void	pmc_release_pmc_descriptor(struct pmc *pmc);
190 static void	pmc_remove_owner(struct pmc_owner *po);
191 static void	pmc_remove_process_descriptor(struct pmc_process *pp);
192 static void	pmc_restore_cpu_binding(struct pmc_binding *pb);
193 static void	pmc_save_cpu_binding(struct pmc_binding *pb);
194 static void	pmc_select_cpu(int cpu);
195 static int	pmc_start(struct pmc *pm);
196 static int	pmc_stop(struct pmc *pm);
197 static int	pmc_syscall_handler(struct thread *td, void *syscall_args);
198 static void	pmc_unlink_target_process(struct pmc *pmc,
199     struct pmc_process *pp);
200 
201 /*
202  * Kernel tunables and sysctl(8) interface.
203  */
204 
205 SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters");
206 
207 #ifdef	DEBUG
208 struct pmc_debugflags pmc_debugflags = PMC_DEBUG_DEFAULT_FLAGS;
209 char	pmc_debugstr[PMC_DEBUG_STRSIZE];
210 TUNABLE_STR(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr,
211     sizeof(pmc_debugstr));
212 SYSCTL_PROC(_kern_hwpmc, OID_AUTO, debugflags,
213     CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_TUN,
214     0, 0, pmc_debugflags_sysctl_handler, "A", "debug flags");
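/*
 * The flag string is a whitespace separated list of
 * "group=flag[,flag...]" clauses parsed by pmc_debugflags_parse()
 * below; a '*' selects every flag in a group.  An illustrative
 * setting:
 *
 *	kern.hwpmc.debugflags="pmc=ops,allocate process=exec cpu=*"
 */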
215 #endif
216 
217 /*
218  * kern.hwpmc.hashsize -- determines the number of rows in the
219  * hash tables used to look up target and owner processes
220  */
221 
222 static int pmc_hashsize = PMC_HASH_SIZE;
223 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "hashsize", &pmc_hashsize);
224 SYSCTL_INT(_kern_hwpmc, OID_AUTO, hashsize, CTLFLAG_TUN|CTLFLAG_RD,
225     &pmc_hashsize, 0, "rows in hash tables");
226 
227 /*
228  * kern.hwpmc.nsamples -- number of PC samples per CPU
229  */
230 
231 static int pmc_nsamples = PMC_NSAMPLES;
232 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nsamples", &pmc_nsamples);
233 SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_TUN|CTLFLAG_RD,
234     &pmc_nsamples, 0, "number of PC samples per CPU");
235 
236 /*
237  * kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool.
238  */
239 
240 static int pmc_mtxpool_size = PMC_MTXPOOL_SIZE;
241 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "mtxpoolsize", &pmc_mtxpool_size);
242 SYSCTL_INT(_kern_hwpmc, OID_AUTO, mtxpoolsize, CTLFLAG_TUN|CTLFLAG_RD,
243     &pmc_mtxpool_size, 0, "size of spin mutex pool");
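/*
 * The CTLFLAG_TUN knobs above are read-only once the system is up
 * and are meant to be set at boot time from loader.conf(5), e.g.
 * (illustrative values):
 *
 *	kern.hwpmc.hashsize=64
 *	kern.hwpmc.nsamples=1024
 *	kern.hwpmc.mtxpoolsize=64
 */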
244 
245 
246 /*
247  * security.bsd.unprivileged_syspmcs -- allow non-root processes to
248  * allocate system-wide PMCs.
249  *
250  * Allowing unprivileged processes to allocate system PMCs is convenient
251  * if system-wide measurements need to be taken concurrently with other
252  * per-process measurements.  This feature is turned off by default.
253  */
254 
255 static int pmc_unprivileged_syspmcs = 0;
256 TUNABLE_INT("security.bsd.unprivileged_syspmcs", &pmc_unprivileged_syspmcs);
257 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_syspmcs, CTLFLAG_RW,
258     &pmc_unprivileged_syspmcs, 0,
259     "allow unprivileged process to allocate system PMCs");
260 
261 /*
262  * Hash function.  Discard the lower 2 bits of the pointer since
263  * these are always zero for our uses.  The hash multiplier is
264  * round((2^LONG_BIT) * ((sqrt(5)-1)/2)).
265  */
266 
267 #if	LONG_BIT == 64
268 #define	_PMC_HM		11400714819323198486u
269 #elif	LONG_BIT == 32
270 #define	_PMC_HM		2654435769u
271 #else
272 #error 	Must know the size of 'long' to compile
273 #endif
274 
275 #define	PMC_HASH_PTR(P,M)	((((unsigned long) (P) >> 2) * _PMC_HM) & (M))
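/*
 * For example, with a 16-row hash table (mask 0xf), a pointer value P
 * maps to bucket (((unsigned long) P >> 2) * _PMC_HM) & 0xf; the
 * multiplication by the golden-ratio derived constant scatters
 * nearby pointer values across different buckets.
 */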
276 
277 /*
278  * Syscall structures
279  */
280 
281 /* The `sysent' for the new syscall */
282 static struct sysent pmc_sysent = {
283 	2,			/* sy_narg */
284 	pmc_syscall_handler	/* sy_call */
285 };
286 
287 static struct syscall_module_data pmc_syscall_mod = {
288 	load,
289 	NULL,
290 	&pmc_syscall_num,
291 	&pmc_sysent,
292 	{ 0, NULL }
293 };
294 
295 static moduledata_t pmc_mod = {
296 	PMC_MODULE_NAME,
297 	syscall_module_handler,
298 	&pmc_syscall_mod
299 };
300 
301 DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SMP, SI_ORDER_ANY);
302 MODULE_VERSION(pmc, PMC_VERSION);
303 
304 #ifdef	DEBUG
305 enum pmc_dbgparse_state {
306 	PMCDS_WS,		/* in whitespace */
307 	PMCDS_MAJOR,		/* seen a major keyword */
308 	PMCDS_MINOR		/* seen a minor keyword */
309 };
310 
311 static int
312 pmc_debugflags_parse(char *newstr, char *fence)
313 {
314 	char c, *p, *q;
315 	struct pmc_debugflags *tmpflags;
316 	int error, found, *newbits, tmp;
317 	size_t kwlen;
318 
319 	MALLOC(tmpflags, struct pmc_debugflags *, sizeof(*tmpflags),
320 	    M_PMC, M_WAITOK|M_ZERO);
321 
322 	p = newstr;
323 	error = 0;
324 
325 	for (; p < fence && (c = *p); p++) {
326 
327 		/* skip white space */
328 		if (c == ' ' || c == '\t')
329 			continue;
330 
331 		/* look for a keyword followed by "=" */
332 		for (q = p; p < fence && (c = *p) && c != '='; p++)
333 			;
334 		if (c != '=') {
335 			error = EINVAL;
336 			goto done;
337 		}
338 
339 		kwlen = p - q;
340 		newbits = NULL;
341 
342 		/* lookup flag group name */
343 #define	DBG_SET_FLAG_MAJ(S,F)						\
344 		if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0)	\
345 			newbits = &tmpflags->pdb_ ## F;
346 
347 		DBG_SET_FLAG_MAJ("cpu",		CPU);
348 		DBG_SET_FLAG_MAJ("csw",		CSW);
349 		DBG_SET_FLAG_MAJ("logging",	LOG);
350 		DBG_SET_FLAG_MAJ("module",	MOD);
351 		DBG_SET_FLAG_MAJ("md", 		MDP);
352 		DBG_SET_FLAG_MAJ("owner",	OWN);
353 		DBG_SET_FLAG_MAJ("pmc",		PMC);
354 		DBG_SET_FLAG_MAJ("process",	PRC);
355 		DBG_SET_FLAG_MAJ("sampling", 	SAM);
356 
357 		if (newbits == NULL) {
358 			error = EINVAL;
359 			goto done;
360 		}
361 
362 		p++;		/* skip the '=' */
363 
364 		/* Now parse the individual flags */
365 		tmp = 0;
366 	newflag:
367 		for (q = p; p < fence && (c = *p); p++)
368 			if (c == ' ' || c == '\t' || c == ',')
369 				break;
370 
371 		/* here: p == fence, or c is whitespace, ',', or NUL */
372 
373 		if ((kwlen = p - q) == 0) {
374 			*newbits = tmp;
375 			continue;
376 		}
377 
378 		found = 0;
379 #define	DBG_SET_FLAG_MIN(S,F)						\
380 		if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0)	\
381 			tmp |= found = (1 << PMC_DEBUG_MIN_ ## F)
382 
383 		/* a '*' denotes all possible flags in the group */
384 		if (kwlen == 1 && *q == '*')
385 			tmp = found = ~0;
386 		/* look for individual flag names */
387 		DBG_SET_FLAG_MIN("allocaterow", ALR);
388 		DBG_SET_FLAG_MIN("allocate",	ALL);
389 		DBG_SET_FLAG_MIN("attach",	ATT);
390 		DBG_SET_FLAG_MIN("bind",	BND);
391 		DBG_SET_FLAG_MIN("config",	CFG);
392 		DBG_SET_FLAG_MIN("exec",	EXC);
393 		DBG_SET_FLAG_MIN("exit",	EXT);
394 		DBG_SET_FLAG_MIN("find",	FND);
395 		DBG_SET_FLAG_MIN("flush",	FLS);
396 		DBG_SET_FLAG_MIN("fork",	FRK);
397 		DBG_SET_FLAG_MIN("getbuf",	GTB);
398 		DBG_SET_FLAG_MIN("hook",	PMH);
399 		DBG_SET_FLAG_MIN("init",	INI);
400 		DBG_SET_FLAG_MIN("intr",	INT);
401 		DBG_SET_FLAG_MIN("linktarget",	TLK);
402 		DBG_SET_FLAG_MIN("mayberemove", OMR);
403 		DBG_SET_FLAG_MIN("ops",		OPS);
404 		DBG_SET_FLAG_MIN("read",	REA);
405 		DBG_SET_FLAG_MIN("register",	REG);
406 		DBG_SET_FLAG_MIN("release",	REL);
407 		DBG_SET_FLAG_MIN("remove",	ORM);
408 		DBG_SET_FLAG_MIN("sample",	SAM);
409 		DBG_SET_FLAG_MIN("scheduleio",	SIO);
410 		DBG_SET_FLAG_MIN("select",	SEL);
411 		DBG_SET_FLAG_MIN("signal",	SIG);
412 		DBG_SET_FLAG_MIN("swi",		SWI);
413 		DBG_SET_FLAG_MIN("swo",		SWO);
414 		DBG_SET_FLAG_MIN("start",	STA);
415 		DBG_SET_FLAG_MIN("stop",	STO);
416 		DBG_SET_FLAG_MIN("syscall",	PMS);
417 		DBG_SET_FLAG_MIN("unlinktarget", TUL);
418 		DBG_SET_FLAG_MIN("write",	WRI);
419 		if (found == 0) {
420 			/* unrecognized flag name */
421 			error = EINVAL;
422 			goto done;
423 		}
424 
425 		if (c == 0 || c == ' ' || c == '\t') {	/* end of flag group */
426 			*newbits = tmp;
427 			continue;
428 		}
429 
430 		p++;
431 		goto newflag;
432 	}
433 
434 	/* save the new flag set */
435 	bcopy(tmpflags, &pmc_debugflags, sizeof(pmc_debugflags));
436 
437  done:
438 	FREE(tmpflags, M_PMC);
439 	return error;
440 }
441 
442 static int
443 pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS)
444 {
445 	char *fence, *newstr;
446 	int error;
447 	unsigned int n;
448 
449 	(void) arg1; (void) arg2; /* unused parameters */
450 
451 	n = sizeof(pmc_debugstr);
452 	MALLOC(newstr, char *, n, M_PMC, M_ZERO|M_WAITOK);
453 	(void) strlcpy(newstr, pmc_debugstr, n);
454 
455 	error = sysctl_handle_string(oidp, newstr, n, req);
456 
457 	/* if there is a new string, parse and copy it */
458 	if (error == 0 && req->newptr != NULL) {
459 		fence = newstr + (n < req->newlen ? n : req->newlen + 1);
460 		if ((error = pmc_debugflags_parse(newstr, fence)) == 0)
461 			(void) strlcpy(pmc_debugstr, newstr,
462 			    sizeof(pmc_debugstr));
463 	}
464 
465 	FREE(newstr, M_PMC);
466 
467 	return error;
468 }
469 #endif
470 
471 /*
472  * Concurrency Control
473  *
474  * The driver manages the following data structures:
475  *
476  *   - target process descriptors, one per target process
477  *   - owner process descriptors (and attached lists), one per owner process
478  *   - lookup hash tables for owner and target processes
479  *   - PMC descriptors (and attached lists)
480  *   - per-cpu hardware state
481  *   - the 'hook' variable through which the kernel calls into
482  *     this module
483  *   - the machine hardware state (managed by the MD layer)
484  *
485  * These data structures are accessed from:
486  *
487  * - thread context-switch code
488  * - interrupt handlers (possibly on multiple cpus)
489  * - kernel threads on multiple cpus running on behalf of user
490  *   processes doing system calls
491  * - this driver's private kernel threads
492  *
493  * = Locks and Locking strategy =
494  *
495  * The driver uses four locking strategies for its operation:
496  *
497  * - There is a 'global' SX lock "pmc_sx" that is used to protect
498  *   the driver's 'meta-data'.
499  *
500  *   Calls into the module (via syscall() or by the kernel) start with
501  *   this lock being held in exclusive mode.  Depending on the requested
502  *   operation, the lock may be downgraded to 'shared' mode to allow
503  *   more concurrent readers into the module.
504  *
505  *   This SX lock is held in exclusive mode for any operations that
506  *   modify the linkages between the driver's internal data structures.
507  *
508  *   The 'pmc_hook' function pointer is also protected by this lock.
509  *   It is only examined with the sx lock held in exclusive mode.  The
510  *   kernel module is allowed to be unloaded only with the sx lock
511  *   held in exclusive mode.  In normal syscall handling, after
512  *   acquiring the pmc_sx lock we first check that 'pmc_hook' is
513  *   non-null before proceeding.  This prevents races between the
514  *   thread unloading the module and other threads seeking to use the
515  *   module.
516  *
517  * - Lookups of target process structures cannot use the global
518  *   "pmc_sx" SX lock because these lookups need to happen during
519  *   context switches and in other critical sections where sleeping
520  *   is not allowed.  The target process hash table is therefore
521  *   protected by its own private spin mutex, "pmc_processhash_mtx",
522  *   a 'leaf' mutex: no other lock is acquired while it is held.
523  *   (The owner hash table is protected by the shared "pmc_sx" lock.)
524  *
525  * - Interrupt handlers work in a lock free manner.  At interrupt
526  *   time, handlers look at the PMC pointer (phw->phw_pmc) configured
527  *   when the PMC was started.  If this pointer is NULL, the interrupt
528  *   is ignored after updating driver statistics.  We ensure that this
529  *   pointer is set (using an atomic operation if necessary) before the
530  *   PMC hardware is started.  Conversely, this pointer is unset atomically
531  *   only after the PMC hardware is stopped.
532  *
533  *   We ensure that everything needed for the operation of an
534  *   interrupt handler is available without it needing to acquire any
535  *   locks.  We also ensure that a PMC's software state is destroyed only
536  *   after the PMC is taken off hardware (on all CPUs).
537  *
538  * - Context-switch handling with process-private PMCs needs more
539  *   care.
540  *
541  *   A given process may be the target of multiple PMCs.  For example,
542  *   PMCATTACH and PMCDETACH may be requested by a process on one CPU
543  *   while the target process is running on another.  A PMC could also
544  *   be getting released because its owner is exiting.  We tackle
545  *   these situations in the following manner:
546  *
547  *   - each target process structure 'pmc_process' has an array
548  *     of 'struct pmc *' pointers, one for each hardware PMC.
549  *
550  *   - At context switch IN time, each "target" PMC in RUNNING state
551  *     gets started on hardware and a pointer to each PMC is copied into
552  *     the per-cpu phw array.  The 'runcount' for the PMC is
553  *     incremented.
554  *
555  *   - At context switch OUT time, all process-virtual PMCs are stopped
556  *     on hardware.  The saved value is added to the PMC's value field
557  *     only if the PMC is in a non-deleted state (the PMC's state could
558  *     have changed during the current time slice).
559  *
560  *     Note that in between a switch IN on a processor and a switch
561  *     OUT, the PMC could have been released on another CPU.  Therefore
562  *     context switch OUT always looks at the hardware state to turn
563  *     OFF PMCs and will update a PMC's saved value only if reachable
564  *     from the target process record.
565  *
566  *   - OP PMCRELEASE could be called on a PMC at any time (the PMC could
567  *     be attached to many processes at the time of the call and could
568  *     be active on multiple CPUs).
569  *
570  *     We prevent further scheduling of the PMC by marking it as in
571  *     state 'DELETED'.  If the runcount of the PMC is non-zero then
572  *     this PMC is currently running on a CPU somewhere.  The thread
573  *     doing the PMCRELEASE operation waits by repeatedly doing a
574  *     tsleep() until the runcount comes to zero.
575  *
576  */
577 
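/*
 * A minimal sketch of the syscall-entry locking pattern described
 * above (the shape and the error value are illustrative; the real
 * code is in pmc_syscall_handler() below):
 *
 *	sx_xlock(&pmc_sx);
 *	if (pmc_hook == NULL) {		-- module not present
 *		sx_xunlock(&pmc_sx);
 *		return ENOSYS;
 *	}
 *	... operate on driver metadata, possibly downgrading the
 *	... lock with sx_downgrade() for read-mostly operations
 *	sx_xunlock(&pmc_sx);
 */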
578 /*
579  * save the cpu binding of the current kthread
580  */
581 
582 static void
583 pmc_save_cpu_binding(struct pmc_binding *pb)
584 {
585 	PMCDBG(CPU,BND,2, "%s", "save-cpu");
586 	mtx_lock_spin(&sched_lock);
587 	pb->pb_bound = sched_is_bound(curthread);
588 	pb->pb_cpu   = curthread->td_oncpu;
589 	mtx_unlock_spin(&sched_lock);
590 	PMCDBG(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu);
591 }
592 
593 /*
594  * restore the cpu binding of the current thread
595  */
596 
597 static void
598 pmc_restore_cpu_binding(struct pmc_binding *pb)
599 {
600 	PMCDBG(CPU,BND,2, "restore-cpu curcpu=%d restore=%d",
601 	    curthread->td_oncpu, pb->pb_cpu);
602 	mtx_lock_spin(&sched_lock);
603 	if (pb->pb_bound)
604 		sched_bind(curthread, pb->pb_cpu);
605 	else
606 		sched_unbind(curthread);
607 	mtx_unlock_spin(&sched_lock);
608 	PMCDBG(CPU,BND,2, "%s", "restore-cpu done");
609 }
610 
611 /*
612  * move execution to the specified cpu and bind it there.
613  */
614 
615 static void
616 pmc_select_cpu(int cpu)
617 {
618 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
619 	    ("[pmc,%d] bad cpu number %d", __LINE__, cpu));
620 
621 	/* never move to a disabled CPU */
622 	KASSERT(pmc_cpu_is_disabled(cpu) == 0, ("[pmc,%d] selecting "
623 	    "disabled CPU %d", __LINE__, cpu));
624 
625 	PMCDBG(CPU,SEL,2, "select-cpu cpu=%d", cpu);
626 	mtx_lock_spin(&sched_lock);
627 	sched_bind(curthread, cpu);
628 	mtx_unlock_spin(&sched_lock);
629 
630 	KASSERT(curthread->td_oncpu == cpu,
631 	    ("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__,
632 		cpu, curthread->td_oncpu));
633 
634 	PMCDBG(CPU,SEL,2, "select-cpu cpu=%d ok", cpu);
635 }
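/*
 * The three routines above are used together in a
 * save/select/restore pattern whenever the driver must touch the
 * hardware of a specific CPU; see pmc_release_pmc_descriptor()
 * below for an example:
 *
 *	struct pmc_binding pb;
 *
 *	pmc_save_cpu_binding(&pb);
 *	pmc_select_cpu(cpu);
 *	... program or stop the per-CPU PMC hardware ...
 *	pmc_restore_cpu_binding(&pb);
 */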
636 
637 /*
638  * Force a context switch.
639  *
640  * We do this by tsleep'ing for 1 tick -- invoking mi_switch() is not
641  * guaranteed to force a context switch.
642  */
643 
644 static void
645 pmc_force_context_switch(void)
646 {
647 
648 	(void) tsleep((void *) pmc_force_context_switch, 0, "pmcctx", 1);
649 }
650 
651 /*
652  * Get the file name for an executable.  This is a simple wrapper
653  * around vn_fullpath(9).
654  */
655 
656 static void
657 pmc_getfilename(struct vnode *v, char **fullpath, char **freepath)
658 {
659 	struct thread *td;
660 
661 	td = curthread;
662 	*fullpath = "unknown";
663 	*freepath = NULL;
664 	vn_lock(v, LK_CANRECURSE | LK_EXCLUSIVE | LK_RETRY, td);
665 	vn_fullpath(td, v, fullpath, freepath);
666 	VOP_UNLOCK(v, 0, td);
667 }
668 
669 /*
670  * remove a process owning PMCs
671  */
672 
673 void
674 pmc_remove_owner(struct pmc_owner *po)
675 {
676 	struct pmc *pm, *tmp;
677 
678 	sx_assert(&pmc_sx, SX_XLOCKED);
679 
680 	PMCDBG(OWN,ORM,1, "remove-owner po=%p", po);
681 
682 	/* Remove descriptor from the owner hash table */
683 	LIST_REMOVE(po, po_next);
684 
685 	/* release all owned PMC descriptors */
686 	LIST_FOREACH_SAFE(pm, &po->po_pmcs, pm_next, tmp) {
687 		PMCDBG(OWN,ORM,2, "pmc=%p", pm);
688 		KASSERT(pm->pm_owner == po,
689 		    ("[pmc,%d] owner %p != po %p", __LINE__, pm->pm_owner, po));
690 
691 		pmc_release_pmc_descriptor(pm);	/* will unlink from the list */
692 	}
693 
694 	KASSERT(po->po_sscount == 0,
695 	    ("[pmc,%d] SS count not zero", __LINE__));
696 	KASSERT(LIST_EMPTY(&po->po_pmcs),
697 	    ("[pmc,%d] PMC list not empty", __LINE__));
698 
699 	/* de-configure the log file if present */
700 	if (po->po_flags & PMC_PO_OWNS_LOGFILE)
701 		pmclog_deconfigure_log(po);
702 }
703 
704 /*
705  * remove an owner process record if all conditions are met.
706  */
707 
708 static void
709 pmc_maybe_remove_owner(struct pmc_owner *po)
710 {
711 
712 	PMCDBG(OWN,OMR,1, "maybe-remove-owner po=%p", po);
713 
714 	/*
715 	 * Remove owner record if
716 	 * - this process does not own any PMCs
717 	 * - this process has not allocated a system-wide sampling buffer
718 	 */
719 
720 	if (LIST_EMPTY(&po->po_pmcs) &&
721 	    ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)) {
722 		pmc_remove_owner(po);
723 		pmc_destroy_owner_descriptor(po);
724 	}
725 }
726 
727 /*
728  * Add an association between a target process and a PMC.
729  */
730 
731 static void
732 pmc_link_target_process(struct pmc *pm, struct pmc_process *pp)
733 {
734 	int ri;
735 	struct pmc_target *pt;
736 
737 	sx_assert(&pmc_sx, SX_XLOCKED);
738 
739 	KASSERT(pm != NULL && pp != NULL,
740 	    ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
741 	KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)),
742 	    ("[pmc,%d] Attaching a non-process-virtual pmc=%p to pid=%d",
743 		__LINE__, pm, pp->pp_proc->p_pid));
744 	KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < ((int) md->pmd_npmc - 1),
745 	    ("[pmc,%d] Illegal reference count %d for process record %p",
746 		__LINE__, pp->pp_refcnt, (void *) pp));
747 
748 	ri = PMC_TO_ROWINDEX(pm);
749 
750 	PMCDBG(PRC,TLK,1, "link-target pmc=%p ri=%d pmc-process=%p",
751 	    pm, ri, pp);
752 
753 #ifdef	DEBUG
754 	LIST_FOREACH(pt, &pm->pm_targets, pt_next)
755 	    if (pt->pt_process == pp)
756 		    KASSERT(0, ("[pmc,%d] pp %p already in pmc %p targets",
757 				__LINE__, pp, pm));
758 #endif
759 
760 	MALLOC(pt, struct pmc_target *, sizeof(struct pmc_target),
761 	    M_PMC, M_ZERO|M_WAITOK);
762 
763 	pt->pt_process = pp;
764 
765 	LIST_INSERT_HEAD(&pm->pm_targets, pt, pt_next);
766 
767 	atomic_store_rel_ptr((uintptr_t *)&pp->pp_pmcs[ri].pp_pmc,
768 	    (uintptr_t)pm);
769 
770 	if (pm->pm_owner->po_owner == pp->pp_proc)
771 		pm->pm_flags |= PMC_F_ATTACHED_TO_OWNER;
772 
773 	/*
774 	 * Initialize the per-process values at this row index.
775 	 */
776 	pp->pp_pmcs[ri].pp_pmcval = PMC_TO_MODE(pm) == PMC_MODE_TS ?
777 	    pm->pm_sc.pm_reloadcount : 0;
778 
779 	pp->pp_refcnt++;
780 
781 }
782 
783 /*
784  * Removes the association between a target process and a PMC.
785  */
786 
787 static void
788 pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp)
789 {
790 	int ri;
791 	struct proc *p;
792 	struct pmc_target *ptgt;
793 
794 	sx_assert(&pmc_sx, SX_XLOCKED);
795 
796 	KASSERT(pm != NULL && pp != NULL,
797 	    ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
798 
799 	KASSERT(pp->pp_refcnt >= 1 && pp->pp_refcnt < (int) md->pmd_npmc,
800 	    ("[pmc,%d] Illegal ref count %d on process record %p",
801 		__LINE__, pp->pp_refcnt, (void *) pp));
802 
803 	ri = PMC_TO_ROWINDEX(pm);
804 
805 	PMCDBG(PRC,TUL,1, "unlink-target pmc=%p ri=%d pmc-process=%p",
806 	    pm, ri, pp);
807 
808 	KASSERT(pp->pp_pmcs[ri].pp_pmc == pm,
809 	    ("[pmc,%d] PMC ri %d mismatch pmc %p pp->[ri] %p", __LINE__,
810 		ri, pm, pp->pp_pmcs[ri].pp_pmc));
811 
812 	pp->pp_pmcs[ri].pp_pmc = NULL;
813 	pp->pp_pmcs[ri].pp_pmcval = (pmc_value_t) 0;
814 
815 	/* Remove owner-specific flags */
816 	if (pm->pm_owner->po_owner == pp->pp_proc) {
817 		pp->pp_flags &= ~PMC_PP_ENABLE_MSR_ACCESS;
818 		pm->pm_flags &= ~PMC_F_ATTACHED_TO_OWNER;
819 	}
820 
821 	pp->pp_refcnt--;
822 
823 	/* Remove the target process from the PMC structure */
824 	LIST_FOREACH(ptgt, &pm->pm_targets, pt_next)
825 		if (ptgt->pt_process == pp)
826 			break;
827 
828 	KASSERT(ptgt != NULL, ("[pmc,%d] process %p (pp: %p) not found "
829 		    "in pmc %p", __LINE__, pp->pp_proc, pp, pm));
830 
831 	LIST_REMOVE(ptgt, pt_next);
832 	FREE(ptgt, M_PMC);
833 
834 	/* if the PMC now lacks targets, send the owner a SIGIO */
835 	if (LIST_EMPTY(&pm->pm_targets)) {
836 		p = pm->pm_owner->po_owner;
837 		PROC_LOCK(p);
838 		psignal(p, SIGIO);
839 		PROC_UNLOCK(p);
840 
841 		PMCDBG(PRC,SIG,2, "signalling proc=%p signal=%d", p,
842 		    SIGIO);
843 	}
844 }
845 
846 /*
847  * Check if PMC 'pm' may be attached to target process 't'.
848  */
849 
850 static int
851 pmc_can_attach(struct pmc *pm, struct proc *t)
852 {
853 	struct proc *o;		/* pmc owner */
854 	struct ucred *oc, *tc;	/* owner, target credentials */
855 	int decline_attach, i;
856 
857 	/*
858 	 * A PMC's owner can always attach that PMC to itself.
859 	 */
860 
861 	if ((o = pm->pm_owner->po_owner) == t)
862 		return 0;
863 
864 	PROC_LOCK(o);
865 	oc = o->p_ucred;
866 	crhold(oc);
867 	PROC_UNLOCK(o);
868 
869 	PROC_LOCK(t);
870 	tc = t->p_ucred;
871 	crhold(tc);
872 	PROC_UNLOCK(t);
873 
874 	/*
875 	 * The effective uid of the PMC owner should match at least one
876 	 * of the {effective,real,saved} uids of the target process.
877 	 */
878 
879 	decline_attach = oc->cr_uid != tc->cr_uid &&
880 	    oc->cr_uid != tc->cr_svuid &&
881 	    oc->cr_uid != tc->cr_ruid;
882 
883 	/*
884 	 * Every one of the target's group ids must be in the owner's
885 	 * group list.
886 	 */
887 	for (i = 0; !decline_attach && i < tc->cr_ngroups; i++)
888 		decline_attach = !groupmember(tc->cr_groups[i], oc);
889 
890 	/* check the real and saved gids too */
891 	if (decline_attach == 0)
892 		decline_attach = !groupmember(tc->cr_rgid, oc) ||
893 		    !groupmember(tc->cr_svgid, oc);
894 
895 	crfree(tc);
896 	crfree(oc);
897 
898 	return !decline_attach;
899 }
900 
901 /*
902  * Attach a process to a PMC.
903  */
904 
905 static int
906 pmc_attach_one_process(struct proc *p, struct pmc *pm)
907 {
908 	int ri;
909 	char *fullpath, *freepath;
910 	struct pmc_process	*pp;
911 
912 	sx_assert(&pmc_sx, SX_XLOCKED);
913 
914 	PMCDBG(PRC,ATT,2, "attach-one pm=%p ri=%d proc=%p (%d, %s)", pm,
915 	    PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
916 
917 	/*
918 	 * Locate the process descriptor corresponding to process 'p',
919 	 * allocating space as needed.
920 	 *
921 	 * Verify that rowindex 'pm_rowindex' is free in the process
922 	 * descriptor.
923 	 *
924 	 * If it is free, link the process descriptor and the PMC
925 	 * together.
926 	 */
927 	ri = PMC_TO_ROWINDEX(pm);
928 
929 	if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_ALLOCATE)) == NULL)
930 		return ENOMEM;
931 
932 	if (pp->pp_pmcs[ri].pp_pmc == pm) /* already present at slot [ri] */
933 		return EEXIST;
934 
935 	if (pp->pp_pmcs[ri].pp_pmc != NULL)
936 		return EBUSY;
937 
938 	pmc_link_target_process(pm, pp);
939 
940 	if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) &&
941 	    (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) == 0)
942 		pm->pm_flags |= PMC_F_NEEDS_LOGFILE;
943 
944 	pm->pm_flags |= PMC_F_ATTACH_DONE; /* mark as attached */
945 
946 	/* issue an attach event to a configured log file */
947 	if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) {
948 		pmc_getfilename(p->p_textvp, &fullpath, &freepath);
949 		pmclog_process_pmcattach(pm, p->p_pid, fullpath);
950 		if (freepath)
951 			FREE(freepath, M_TEMP);
952 	}
953 	/* mark process as using HWPMCs */
954 	PROC_LOCK(p);
955 	p->p_flag |= P_HWPMC;
956 	PROC_UNLOCK(p);
957 
958 	return 0;
959 }
960 
961 /*
962  * Attach a process and optionally its children
963  */
964 
965 static int
966 pmc_attach_process(struct proc *p, struct pmc *pm)
967 {
968 	int error;
969 	struct proc *top;
970 
971 	sx_assert(&pmc_sx, SX_XLOCKED);
972 
973 	PMCDBG(PRC,ATT,1, "attach pm=%p ri=%d proc=%p (%d, %s)", pm,
974 	    PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
975 
976 
977 	/*
978 	 * If this PMC successfully allowed a GETMSR operation
979 	 * in the past, disallow further ATTACHes.
980 	 */
981 
982 	if ((pm->pm_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0)
983 		return EPERM;
984 
985 	if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
986 		return pmc_attach_one_process(p, pm);
987 
988 	/*
989 	 * Traverse all child processes, attaching them to
990 	 * this PMC.
991 	 */
992 
993 	sx_slock(&proctree_lock);
994 
995 	top = p;
996 
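	/*
	 * The loop below walks the process tree rooted at 'top' in
	 * pre-order: attach the current node, descend to its first
	 * child if any, and otherwise climb back to the nearest
	 * ancestor with an unvisited sibling.
	 */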
997 	for (;;) {
998 		if ((error = pmc_attach_one_process(p, pm)) != 0)
999 			break;
1000 		if (!LIST_EMPTY(&p->p_children))
1001 			p = LIST_FIRST(&p->p_children);
1002 		else for (;;) {
1003 			if (p == top)
1004 				goto done;
1005 			if (LIST_NEXT(p, p_sibling)) {
1006 				p = LIST_NEXT(p, p_sibling);
1007 				break;
1008 			}
1009 			p = p->p_pptr;
1010 		}
1011 	}
1012 
1013 	if (error)
1014 		(void) pmc_detach_process(top, pm);
1015 
1016  done:
1017 	sx_sunlock(&proctree_lock);
1018 	return error;
1019 }
1020 
1021 /*
1022  * Detach a process from a PMC.  If there are no other PMCs tracking
1023  * this process, remove the process structure from its hash table.  If
1024  * 'flags' contains PMC_FLAG_REMOVE, then free the process structure.
1025  */
1026 
1027 static int
1028 pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags)
1029 {
1030 	int ri;
1031 	struct pmc_process *pp;
1032 
1033 	sx_assert(&pmc_sx, SX_XLOCKED);
1034 
1035 	KASSERT(pm != NULL,
1036 	    ("[pmc,%d] null pm pointer", __LINE__));
1037 
1038 	ri = PMC_TO_ROWINDEX(pm);
1039 
1040 	PMCDBG(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x",
1041 	    pm, ri, p, p->p_pid, p->p_comm, flags);
1042 
1043 	if ((pp = pmc_find_process_descriptor(p, 0)) == NULL)
1044 		return ESRCH;
1045 
1046 	if (pp->pp_pmcs[ri].pp_pmc != pm)
1047 		return EINVAL;
1048 
1049 	pmc_unlink_target_process(pm, pp);
1050 
1051 	/* Issue a detach entry if a log file is configured */
1052 	if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE)
1053 		pmclog_process_pmcdetach(pm, p->p_pid);
1054 
1055 	/*
1056 	 * If there are no PMCs targeting this process, we remove its
1057 	 * descriptor from the target hash table and unset the P_HWPMC
1058 	 * flag in the struct proc.
1059 	 */
1060 	KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < (int) md->pmd_npmc,
1061 	    ("[pmc,%d] Illegal refcnt %d for process struct %p",
1062 		__LINE__, pp->pp_refcnt, pp));
1063 
1064 	if (pp->pp_refcnt != 0)	/* still a target of some PMC */
1065 		return 0;
1066 
1067 	pmc_remove_process_descriptor(pp);
1068 
1069 	if (flags & PMC_FLAG_REMOVE)
1070 		FREE(pp, M_PMC);
1071 
1072 	PROC_LOCK(p);
1073 	p->p_flag &= ~P_HWPMC;
1074 	PROC_UNLOCK(p);
1075 
1076 	return 0;
1077 }
1078 
1079 /*
1080  * Detach a process and optionally its descendants from a PMC.
1081  */
1082 
1083 static int
1084 pmc_detach_process(struct proc *p, struct pmc *pm)
1085 {
1086 	struct proc *top;
1087 
1088 	sx_assert(&pmc_sx, SX_XLOCKED);
1089 
1090 	PMCDBG(PRC,ATT,1, "detach pm=%p ri=%d proc=%p (%d, %s)", pm,
1091 	    PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
1092 
1093 	if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
1094 		return pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
1095 
1096 	/*
1097 	 * Traverse all children, detaching them from this PMC.  We
1098 	 * ignore errors since we could be detaching a PMC from a
1099 	 * partially attached proc tree.
1100 	 */
1101 
1102 	sx_slock(&proctree_lock);
1103 
1104 	top = p;
1105 
1106 	for (;;) {
1107 		(void) pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
1108 
1109 		if (!LIST_EMPTY(&p->p_children))
1110 			p = LIST_FIRST(&p->p_children);
1111 		else for (;;) {
1112 			if (p == top)
1113 				goto done;
1114 			if (LIST_NEXT(p, p_sibling)) {
1115 				p = LIST_NEXT(p, p_sibling);
1116 				break;
1117 			}
1118 			p = p->p_pptr;
1119 		}
1120 	}
1121 
1122  done:
1123 	sx_sunlock(&proctree_lock);
1124 
1125 	if (LIST_EMPTY(&pm->pm_targets))
1126 		pm->pm_flags &= ~PMC_F_ATTACH_DONE;
1127 
1128 	return 0;
1129 }
1130 
1131 
1132 /*
1133  * Thread context switch IN
1134  */
1135 
1136 static void
1137 pmc_process_csw_in(struct thread *td)
1138 {
1139 	int cpu;
1140 	unsigned int ri;
1141 	struct pmc *pm;
1142 	struct proc *p;
1143 	struct pmc_cpu *pc;
1144 	struct pmc_hw *phw;
1145 	struct pmc_process *pp;
1146 	pmc_value_t newvalue;
1147 
1148 	p = td->td_proc;
1149 
1150 	if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE)) == NULL)
1151 		return;
1152 
1153 	KASSERT(pp->pp_proc == td->td_proc,
1154 	    ("[pmc,%d] not my thread state", __LINE__));
1155 
1156 	critical_enter(); /* no preemption from this point */
1157 
1158 	cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
1159 
1160 	PMCDBG(CSW,SWI,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
1161 	    p->p_pid, p->p_comm, pp);
1162 
1163 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
1164 	    ("[pmc,%d] weird CPU id %d", __LINE__, cpu));
1165 
1166 	pc = pmc_pcpu[cpu];
1167 
1168 	for (ri = 0; ri < md->pmd_npmc; ri++) {
1169 
1170 		if ((pm = pp->pp_pmcs[ri].pp_pmc) == NULL)
1171 			continue;
1172 
1173 		KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)),
1174 		    ("[pmc,%d] Target PMC in non-virtual mode (%d)",
1175 			__LINE__, PMC_TO_MODE(pm)));
1176 
1177 		KASSERT(PMC_TO_ROWINDEX(pm) == ri,
1178 		    ("[pmc,%d] Row index mismatch pmc %d != ri %d",
1179 			__LINE__, PMC_TO_ROWINDEX(pm), ri));
1180 
1181 		/*
1182 		 * Only PMCs that are marked as 'RUNNING' need
1183 		 * be placed on hardware.
1184 		 */
1185 
1186 		if (pm->pm_state != PMC_STATE_RUNNING)
1187 			continue;
1188 
1189 		/* increment PMC runcount */
1190 		atomic_add_rel_32(&pm->pm_runcount, 1);
1191 
1192 		/* configure the HWPMC we are going to use. */
1193 		md->pmd_config_pmc(cpu, ri, pm);
1194 
1195 		phw = pc->pc_hwpmcs[ri];
1196 
1197 		KASSERT(phw != NULL,
1198 		    ("[pmc,%d] null hw pointer", __LINE__));
1199 
1200 		KASSERT(phw->phw_pmc == pm,
1201 		    ("[pmc,%d] hw->pmc %p != pmc %p", __LINE__,
1202 			phw->phw_pmc, pm));
1203 
1204 		/*
1205 		 * Write out saved value and start the PMC.
1206 		 *
1207 		 * Sampling PMCs use a per-process value, while
1208 		 * counting mode PMCs use a per-pmc value that is
1209 		 * inherited across descendants.
1210 		 */
1211 		if (PMC_TO_MODE(pm) == PMC_MODE_TS) {
1212 			mtx_pool_lock_spin(pmc_mtxpool, pm);
1213 			newvalue = PMC_PCPU_SAVED(cpu,ri) =
1214 			    pp->pp_pmcs[ri].pp_pmcval;
1215 			mtx_pool_unlock_spin(pmc_mtxpool, pm);
1216 		} else {
1217 			KASSERT(PMC_TO_MODE(pm) == PMC_MODE_TC,
1218 			    ("[pmc,%d] illegal mode=%d", __LINE__,
1219 			    PMC_TO_MODE(pm)));
1220 			mtx_pool_lock_spin(pmc_mtxpool, pm);
1221 			newvalue = PMC_PCPU_SAVED(cpu, ri) =
1222 			    pm->pm_gv.pm_savedvalue;
1223 			mtx_pool_unlock_spin(pmc_mtxpool, pm);
1224 		}
1225 
1226 		PMCDBG(CSW,SWI,1,"cpu=%d ri=%d new=%jd", cpu, ri, newvalue);
1227 
1228 		md->pmd_write_pmc(cpu, ri, newvalue);
1229 		md->pmd_start_pmc(cpu, ri);
1230 	}
1231 
1232 	/*
1233 	 * perform any other architecture/cpu dependent thread
1234 	 * switch-in actions.
1235 	 */
1236 
1237 	(void) (*md->pmd_switch_in)(pc, pp);
1238 
1239 	critical_exit();
1240 
1241 }
1242 
1243 /*
1244  * Thread context switch OUT.
1245  */
1246 
1247 static void
1248 pmc_process_csw_out(struct thread *td)
1249 {
1250 	int cpu;
1251 	enum pmc_mode mode;
1252 	unsigned int ri;
1253 	struct pmc *pm;
1254 	struct proc *p;
1255 	struct pmc_cpu *pc;
1256 	struct pmc_process *pp;
1257 	int64_t tmp;
1258 	pmc_value_t newvalue;
1259 
1260 	/*
1261 	 * Locate our process descriptor; this may be NULL if
1262 	 * this process is exiting and we have already removed
1263 	 * the process from the target process table.
1264 	 *
1265 	 * Note that due to kernel preemption, multiple
1266 	 * context switches may happen while the process is
1267 	 * exiting.
1268 	 *
1269 	 * Note also that if the target process cannot be
1270 	 * found we still need to deconfigure any PMCs that
1271 	 * are currently running on hardware.
1272 	 */
1273 
1274 	p = td->td_proc;
1275 	pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE);
1276 
1277 	/*
1278 	 * save PMCs
1279 	 */
1280 
1281 	critical_enter();
1282 
1283 	cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
1284 
1285 	PMCDBG(CSW,SWO,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
1286 	    p->p_pid, p->p_comm, pp);
1287 
1288 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
1289 	    ("[pmc,%d] weird CPU id %d", __LINE__, cpu));
1290 
1291 	pc = pmc_pcpu[cpu];
1292 
1293 	/*
1294 	 * When a PMC gets unlinked from a target process, it will
1295 	 * be removed from the target's pp_pmcs[] array.
1296 	 *
1297 	 * However, on a MP system, the target could have been
1298 	 * executing on another CPU at the time of the unlink.
1299 	 * So, at context switch OUT time, we need to look at
1300 	 * the hardware to determine if a PMC is scheduled on
1301 	 * it.
1302 	 */
1303 
1304 	for (ri = 0; ri < md->pmd_npmc; ri++) {
1305 
1306 		pm = NULL;
1307 		(void) (*md->pmd_get_config)(cpu, ri, &pm);
1308 
1309 		if (pm == NULL)	/* nothing at this row index */
1310 			continue;
1311 
1312 		mode = PMC_TO_MODE(pm);
1313 		if (!PMC_IS_VIRTUAL_MODE(mode))
1314 			continue; /* not a process virtual PMC */
1315 
1316 		KASSERT(PMC_TO_ROWINDEX(pm) == ri,
1317 		    ("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
1318 			__LINE__, PMC_TO_ROWINDEX(pm), ri));
1319 
1320 		/* Stop hardware if not already stopped */
1321 		if (pm->pm_stalled == 0)
1322 			md->pmd_stop_pmc(cpu, ri);
1323 
1324 		/* reduce this PMC's runcount */
1325 		atomic_subtract_rel_32(&pm->pm_runcount, 1);
1326 
1327 		/*
1328 		 * If this PMC is associated with this process,
1329 		 * save the reading.
1330 		 */
1331 
1332 		if (pp != NULL && pp->pp_pmcs[ri].pp_pmc != NULL) {
1333 
1334 			KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
1335 			    ("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__,
1336 				pm, ri, pp->pp_pmcs[ri].pp_pmc));
1337 
1338 			KASSERT(pp->pp_refcnt > 0,
1339 			    ("[pmc,%d] pp refcnt = %d", __LINE__,
1340 				pp->pp_refcnt));
1341 
1342 			md->pmd_read_pmc(cpu, ri, &newvalue);
1343 
1344 			tmp = newvalue - PMC_PCPU_SAVED(cpu,ri);
1345 
1346 			PMCDBG(CSW,SWO,1,"cpu=%d ri=%d tmp=%jd", cpu, ri,
1347 			    tmp);
1348 
1349 			if (mode == PMC_MODE_TS) {
1350 
1351 				/*
1352 				 * For sampling process-virtual PMCs,
1353 				 * we expect the count to be
1354 				 * decreasing as the 'value'
1355 				 * programmed into the PMC is the
1356 				 * number of events to be seen till
1357 				 * the next sampling interrupt.
1358 				 */
1359 				if (tmp < 0)
1360 					tmp += pm->pm_sc.pm_reloadcount;
1361 				mtx_pool_lock_spin(pmc_mtxpool, pm);
1362 				pp->pp_pmcs[ri].pp_pmcval -= tmp;
1363 				if ((int64_t) pp->pp_pmcs[ri].pp_pmcval < 0)
1364 					pp->pp_pmcs[ri].pp_pmcval +=
1365 					    pm->pm_sc.pm_reloadcount;
1366 				mtx_pool_unlock_spin(pmc_mtxpool, pm);
1367 
1368 			} else {
1369 
1370 				/*
1371 				 * For counting process-virtual PMCs,
1372 				 * we expect the count to be
1373 				 * increasing monotonically, modulo a 64
1374 				 * bit wraparound.
1375 				 */
1376 				KASSERT((int64_t) tmp >= 0,
1377 				    ("[pmc,%d] negative increment cpu=%d "
1378 				     "ri=%d newvalue=%jx saved=%jx "
1379 				     "incr=%jx", __LINE__, cpu, ri,
1380 				     newvalue, PMC_PCPU_SAVED(cpu,ri), tmp));
1381 
1382 				mtx_pool_lock_spin(pmc_mtxpool, pm);
1383 				pm->pm_gv.pm_savedvalue += tmp;
1384 				pp->pp_pmcs[ri].pp_pmcval += tmp;
1385 				mtx_pool_unlock_spin(pmc_mtxpool, pm);
1386 
1387 				if (pm->pm_flags & PMC_F_LOG_PROCCSW)
1388 					pmclog_process_proccsw(pm, pp, tmp);
1389 			}
1390 		}
1391 
1392 		/* mark hardware as free */
1393 		md->pmd_config_pmc(cpu, ri, NULL);
1394 	}
1395 
1396 	/*
1397 	 * perform any other architecture/cpu dependent thread
1398 	 * switch out functions.
1399 	 */
1400 
1401 	(void) (*md->pmd_switch_out)(pc, pp);
1402 
1403 	critical_exit();
1404 }
1405 
1406 /*
1407  * Log a KLD operation.
1408  */
1409 
1410 static void
1411 pmc_process_kld_load(struct pmckern_map_in *pkm)
1412 {
1413 	struct pmc_owner *po;
1414 
1415 	sx_assert(&pmc_sx, SX_LOCKED);
1416 
1417 	/*
1418 	 * Notify owners of system sampling PMCs about KLD operations.
1419 	 */
1420 
1421 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
1422 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
1423 	    	pmclog_process_map_in(po, (pid_t) -1, pkm->pm_address,
1424 		    (char *) pkm->pm_file);
1425 
1426 	/*
1427 	 * TODO: Notify owners of (all) process-sampling PMCs too.
1428 	 */
1429 
1430 	return;
1431 }
1432 
1433 static void
1434 pmc_process_kld_unload(struct pmckern_map_out *pkm)
1435 {
1436 	struct pmc_owner *po;
1437 
1438 	sx_assert(&pmc_sx, SX_LOCKED);
1439 
1440 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
1441 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
1442 		pmclog_process_map_out(po, (pid_t) -1,
1443 		    pkm->pm_address, pkm->pm_address + pkm->pm_size);
1444 
1445 	/*
1446 	 * TODO: Notify owners of process-sampling PMCs.
1447 	 */
1448 }
1449 
1450 /*
1451  * A mapping change for a process.
1452  */
1453 
1454 static void
1455 pmc_process_mmap(struct thread *td, struct pmckern_map_in *pkm)
1456 {
1457 	int ri;
1458 	pid_t pid;
1459 	char *fullpath, *freepath;
1460 	const struct pmc *pm;
1461 	struct pmc_owner *po;
1462 	const struct pmc_process *pp;
1463 
1464 	freepath = fullpath = NULL;
1465 	pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath);
1466 
1467 	pid = td->td_proc->p_pid;
1468 
1469 	/* Inform owners of all system-wide sampling PMCs. */
1470 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
1471 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
1472 		pmclog_process_map_in(po, pid, pkm->pm_address, fullpath);
1473 
1474 	if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
1475 		goto done;
1476 
1477 	/*
1478 	 * Inform sampling PMC owners tracking this process.
1479 	 */
1480 	for (ri = 0; ri < md->pmd_npmc; ri++)
1481 		if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL &&
1482 		    PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
1483 			pmclog_process_map_in(pm->pm_owner,
1484 			    pid, pkm->pm_address, fullpath);
1485 
1486   done:
1487 	if (freepath)
1488 		FREE(freepath, M_TEMP);
1489 }
1490 
1491 
1492 /*
1493  * Log an munmap request.
1494  */
1495 
1496 static void
1497 pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm)
1498 {
1499 	int ri;
1500 	pid_t pid;
1501 	struct pmc_owner *po;
1502 	const struct pmc *pm;
1503 	const struct pmc_process *pp;
1504 
1505 	pid = td->td_proc->p_pid;
1506 
1507 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
1508 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
1509 		pmclog_process_map_out(po, pid, pkm->pm_address,
1510 		    pkm->pm_address + pkm->pm_size);
1511 
1512 	if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
1513 		return;
1514 
1515 	for (ri = 0; ri < md->pmd_npmc; ri++)
1516 		if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL &&
1517 		    PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
1518 			pmclog_process_map_out(pm->pm_owner, pid,
1519 			    pkm->pm_address, pkm->pm_address + pkm->pm_size);
1520 }
1521 
1522 /*
1523  * The 'hook' invoked from the kernel proper
1524  */
1525 
1526 
1527 #ifdef	DEBUG
1528 const char *pmc_hooknames[] = {
1529 	/* these strings correspond to PMC_FN_* in <sys/pmckern.h> */
1530 	"",
1531 	"EXEC",
1532 	"CSW-IN",
1533 	"CSW-OUT",
1534 	"SAMPLE",
1535 	"KLDLOAD",
1536 	"KLDUNLOAD",
1537 	"MMAP",
1538 	"MUNMAP"
1539 };
1540 #endif
1541 
1542 static int
1543 pmc_hook_handler(struct thread *td, int function, void *arg)
1544 {
1545 
1546 	PMCDBG(MOD,PMH,1, "hook td=%p func=%d \"%s\" arg=%p", td, function,
1547 	    pmc_hooknames[function], arg);
1548 
1549 	switch (function)
1550 	{
1551 
1552 	/*
1553 	 * Process exec()
1554 	 */
1555 
1556 	case PMC_FN_PROCESS_EXEC:
1557 	{
1558 		char *fullpath, *freepath;
1559 		unsigned int ri;
1560 		int is_using_hwpmcs;
1561 		struct pmc *pm;
1562 		struct proc *p;
1563 		struct pmc_owner *po;
1564 		struct pmc_process *pp;
1565 		struct pmckern_procexec *pk;
1566 
1567 		sx_assert(&pmc_sx, SX_XLOCKED);
1568 
1569 		p = td->td_proc;
1570 		pmc_getfilename(p->p_textvp, &fullpath, &freepath);
1571 
1572 		pk = (struct pmckern_procexec *) arg;
1573 
1574 		/* Inform owners of SS mode PMCs of the exec event. */
1575 		LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
1576 		    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
1577 			    pmclog_process_procexec(po, PMC_ID_INVALID,
1578 				p->p_pid, pk->pm_entryaddr, fullpath);
1579 
1580 		PROC_LOCK(p);
1581 		is_using_hwpmcs = p->p_flag & P_HWPMC;
1582 		PROC_UNLOCK(p);
1583 
1584 		if (!is_using_hwpmcs) {
1585 			if (freepath)
1586 				FREE(freepath, M_TEMP);
1587 			break;
1588 		}
1589 
1590 		/*
1591 		 * PMCs are not inherited across an exec():  remove any
1592 		 * PMCs that this process is the owner of.
1593 		 */
1594 
1595 		if ((po = pmc_find_owner_descriptor(p)) != NULL) {
1596 			pmc_remove_owner(po);
1597 			pmc_destroy_owner_descriptor(po);
1598 		}
1599 
1600 		/*
1601 		 * If the process being exec'ed is not the target of any
1602 		 * PMC, we are done.
1603 		 */
1604 		if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) {
1605 			if (freepath)
1606 				FREE(freepath, M_TEMP);
1607 			break;
1608 		}
1609 
1610 		/*
1611 		 * Log the exec event to all monitoring owners.  Skip
1612 		 * owners who have already received the event because
1613 		 * they had system sampling PMCs active.
1614 		 */
1615 		for (ri = 0; ri < md->pmd_npmc; ri++)
1616 			if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) {
1617 				po = pm->pm_owner;
1618 				if (po->po_sscount == 0 &&
1619 				    po->po_flags & PMC_PO_OWNS_LOGFILE)
1620 					pmclog_process_procexec(po, pm->pm_id,
1621 					    p->p_pid, pk->pm_entryaddr,
1622 					    fullpath);
1623 			}
1624 
1625 		if (freepath)
1626 			FREE(freepath, M_TEMP);
1627 
1628 
1629 		PMCDBG(PRC,EXC,1, "exec proc=%p (%d, %s) cred-changed=%d",
1630 		    p, p->p_pid, p->p_comm, pk->pm_credentialschanged);
1631 
1632 		if (pk->pm_credentialschanged == 0) /* no change */
1633 			break;
1634 
1635 		/*
1636 		 * If the newly exec()'ed process has a different credential
1637 		 * than before, allow it to be the target of a PMC only if
1638 		 * the PMC's owner has sufficient privilege.
1639 		 */
1640 
1641 		for (ri = 0; ri < md->pmd_npmc; ri++)
1642 			if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL)
1643 				if (pmc_can_attach(pm, td->td_proc) != 0)
1644 					pmc_detach_one_process(td->td_proc,
1645 					    pm, PMC_FLAG_NONE);
1646 
1647 		KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt < (int) md->pmd_npmc,
1648 		    ("[pmc,%d] Illegal ref count %d on pp %p", __LINE__,
1649 			pp->pp_refcnt, pp));
1650 
1651 		/*
1652 		 * If this process is no longer the target of any
1653 		 * PMCs, we can remove the process entry and free
1654 		 * up space.
1655 		 */
1656 
1657 		if (pp->pp_refcnt == 0) {
1658 			pmc_remove_process_descriptor(pp);
1659 			FREE(pp, M_PMC);
1660 			break;
1661 		}
1662 
1663 	}
1664 	break;
1665 
1666 	case PMC_FN_CSW_IN:
1667 		pmc_process_csw_in(td);
1668 		break;
1669 
1670 	case PMC_FN_CSW_OUT:
1671 		pmc_process_csw_out(td);
1672 		break;
1673 
1674 	/*
1675 	 * Process accumulated PC samples.
1676 	 *
1677 	 * This function is expected to be called by hardclock() for
1678 	 * each CPU that has accumulated PC samples.
1679 	 *
1680 	 * This function is to be executed on the CPU whose samples
1681 	 * are being processed.
1682 	 */
1683 	case PMC_FN_DO_SAMPLES:
1684 
1685 		/*
1686 		 * Clear the cpu specific bit in the CPU mask before
1687 		 * doing the rest of the processing.  If the NMI handler
1688 		 * gets invoked after the "atomic_clear_int()" call
1689 		 * below but before "pmc_process_samples()" gets
1690 		 * around to processing the interrupt, then we will
1691 		 * come back here at the next hardclock() tick (and
1692 		 * may find nothing to do if "pmc_process_samples()"
1693 		 * had already processed the interrupt).  We don't
1694 		 * lose the interrupt sample.
1695 		 */
1696 		atomic_clear_int(&pmc_cpumask, (1 << PCPU_GET(cpuid)));
1697 		pmc_process_samples(PCPU_GET(cpuid));
1698 		break;
1699 
1700 
1701 	case PMC_FN_KLD_LOAD:
1702 		sx_assert(&pmc_sx, SX_LOCKED);
1703 		pmc_process_kld_load((struct pmckern_map_in *) arg);
1704 		break;
1705 
1706 	case PMC_FN_KLD_UNLOAD:
1707 		sx_assert(&pmc_sx, SX_LOCKED);
1708 		pmc_process_kld_unload((struct pmckern_map_out *) arg);
1709 		break;
1710 
1711 	case PMC_FN_MMAP:
1712 		sx_assert(&pmc_sx, SX_LOCKED);
1713 		pmc_process_mmap(td, (struct pmckern_map_in *) arg);
1714 		break;
1715 
1716 	case PMC_FN_MUNMAP:
1717 		sx_assert(&pmc_sx, SX_LOCKED);
1718 		pmc_process_munmap(td, (struct pmckern_map_out *) arg);
1719 		break;
1720 
1721 	default:
1722 #ifdef	DEBUG
1723 		KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function));
1724 #endif
1725 		break;
1726 
1727 	}
1728 
1729 	return 0;
1730 }
1731 
1732 /*
1733  * allocate a 'struct pmc_owner' descriptor in the owner hash table.
1734  */
1735 
1736 static struct pmc_owner *
1737 pmc_allocate_owner_descriptor(struct proc *p)
1738 {
1739 	uint32_t hindex;
1740 	struct pmc_owner *po;
1741 	struct pmc_ownerhash *poh;
1742 
1743 	hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
1744 	poh = &pmc_ownerhash[hindex];
1745 
1746 	/* allocate space for N pointers and one descriptor struct */
1747 	MALLOC(po, struct pmc_owner *, sizeof(struct pmc_owner),
1748 	    M_PMC, M_ZERO|M_WAITOK);
1749 
1750 	po->po_sscount = po->po_error = po->po_flags = 0;
1751 	po->po_file  = NULL;
1752 	po->po_owner = p;
1753 	po->po_kthread = NULL;
1754 	LIST_INIT(&po->po_pmcs);
1755 	LIST_INSERT_HEAD(poh, po, po_next); /* insert into hash table */
1756 
1757 	TAILQ_INIT(&po->po_logbuffers);
1758 	mtx_init(&po->po_mtx, "pmc-owner-mtx", "pmc", MTX_SPIN);
1759 
1760 	PMCDBG(OWN,ALL,1, "allocate-owner proc=%p (%d, %s) pmc-owner=%p",
1761 	    p, p->p_pid, p->p_comm, po);
1762 
1763 	return po;
1764 }
1765 
1766 static void
1767 pmc_destroy_owner_descriptor(struct pmc_owner *po)
1768 {
1769 
1770 	PMCDBG(OWN,REL,1, "destroy-owner po=%p proc=%p (%d, %s)",
1771 	    po, po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm);
1772 
1773 	mtx_destroy(&po->po_mtx);
1774 	FREE(po, M_PMC);
1775 }
1776 
1777 /*
1778  * find the descriptor corresponding to process 'p', adding or removing it
1779  * as specified by 'mode'.
1780  */
1781 
1782 static struct pmc_process *
1783 pmc_find_process_descriptor(struct proc *p, uint32_t mode)
1784 {
1785 	uint32_t hindex;
1786 	struct pmc_process *pp, *ppnew;
1787 	struct pmc_processhash *pph;
1788 
1789 	hindex = PMC_HASH_PTR(p, pmc_processhashmask);
1790 	pph = &pmc_processhash[hindex];
1791 
1792 	ppnew = NULL;
1793 
1794 	/*
1795 	 * Pre-allocate memory in the FIND_ALLOCATE case since we
1796 	 * cannot call malloc(9) once we hold a spin lock.
1797 	 */
1798 
1799 	if (mode & PMC_FLAG_ALLOCATE) {
1800 		/* allocate additional space for 'n' pmc pointers */
1801 		MALLOC(ppnew, struct pmc_process *,
1802 		    sizeof(struct pmc_process) + md->pmd_npmc *
1803 		    sizeof(struct pmc_targetstate), M_PMC, M_ZERO|M_WAITOK);
1804 	}
1805 
1806 	mtx_lock_spin(&pmc_processhash_mtx);
1807 	LIST_FOREACH(pp, pph, pp_next)
1808 	    if (pp->pp_proc == p)
1809 		    break;
1810 
1811 	if ((mode & PMC_FLAG_REMOVE) && pp != NULL)
1812 		LIST_REMOVE(pp, pp_next);
1813 
1814 	if ((mode & PMC_FLAG_ALLOCATE) && pp == NULL &&
1815 	    ppnew != NULL) {
1816 		ppnew->pp_proc = p;
1817 		LIST_INSERT_HEAD(pph, ppnew, pp_next);
1818 		pp = ppnew;
1819 		ppnew = NULL;
1820 	}
1821 	mtx_unlock_spin(&pmc_processhash_mtx);
1822 
1823 	if (pp != NULL && ppnew != NULL)
1824 		FREE(ppnew, M_PMC);
1825 
1826 	return pp;
1827 }
1828 
1829 /*
1830  * remove a process descriptor from the process hash table.
1831  */
1832 
1833 static void
1834 pmc_remove_process_descriptor(struct pmc_process *pp)
1835 {
1836 	KASSERT(pp->pp_refcnt == 0,
1837 	    ("[pmc,%d] Removing process descriptor %p with count %d",
1838 		__LINE__, pp, pp->pp_refcnt));
1839 
1840 	mtx_lock_spin(&pmc_processhash_mtx);
1841 	LIST_REMOVE(pp, pp_next);
1842 	mtx_unlock_spin(&pmc_processhash_mtx);
1843 }
1844 
1845 
1846 /*
1847  * find an owner descriptor corresponding to proc 'p'
1848  */
1849 
1850 static struct pmc_owner *
1851 pmc_find_owner_descriptor(struct proc *p)
1852 {
1853 	uint32_t hindex;
1854 	struct pmc_owner *po;
1855 	struct pmc_ownerhash *poh;
1856 
1857 	hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
1858 	poh = &pmc_ownerhash[hindex];
1859 
1860 	po = NULL;
1861 	LIST_FOREACH(po, poh, po_next)
1862 	    if (po->po_owner == p)
1863 		    break;
1864 
1865 	PMCDBG(OWN,FND,1, "find-owner proc=%p (%d, %s) hindex=0x%x -> "
1866 	    "pmc-owner=%p", p, p->p_pid, p->p_comm, hindex, po);
1867 
1868 	return po;
1869 }
1870 
1871 /*
1872  * pmc_allocate_pmc_descriptor
1873  *
1874  * Allocate a pmc descriptor and initialize its
1875  * fields.
1876  */
1877 
1878 static struct pmc *
1879 pmc_allocate_pmc_descriptor(void)
1880 {
1881 	struct pmc *pmc;
1882 
1883 	MALLOC(pmc, struct pmc *, sizeof(struct pmc), M_PMC, M_ZERO|M_WAITOK);
1884 
1885 	if (pmc != NULL) {
1886 		pmc->pm_owner = NULL;
1887 		LIST_INIT(&pmc->pm_targets);
1888 	}
1889 
1890 	PMCDBG(PMC,ALL,1, "allocate-pmc -> pmc=%p", pmc);
1891 
1892 	return pmc;
1893 }
1894 
1895 /*
1896  * Destroy a pmc descriptor.
1897  */
1898 
1899 static void
1900 pmc_destroy_pmc_descriptor(struct pmc *pm)
1901 {
1902 	(void) pm;
1903 
1904 #ifdef	DEBUG
1905 	KASSERT(pm->pm_state == PMC_STATE_DELETED ||
1906 	    pm->pm_state == PMC_STATE_FREE,
1907 	    ("[pmc,%d] destroying non-deleted PMC", __LINE__));
1908 	KASSERT(LIST_EMPTY(&pm->pm_targets),
1909 	    ("[pmc,%d] destroying pmc with targets", __LINE__));
1910 	KASSERT(pm->pm_owner == NULL,
1911 	    ("[pmc,%d] destroying pmc attached to an owner", __LINE__));
1912 	KASSERT(pm->pm_runcount == 0,
1913 	    ("[pmc,%d] pmc has non-zero run count %d", __LINE__,
1914 		pm->pm_runcount));
1915 #endif
1916 }
1917 
1918 static void
1919 pmc_wait_for_pmc_idle(struct pmc *pm)
1920 {
1921 #ifdef	DEBUG
1922 	volatile int maxloop;
1923 
1924 	maxloop = 100 * mp_ncpus;
1925 #endif
1926 
1927 	/*
1928 	 * Loop (with a forced context switch) till the PMC's runcount
1929 	 * comes down to zero.
1930 	 */
1931 	while (atomic_load_acq_32(&pm->pm_runcount) > 0) {
1932 #ifdef	DEBUG
1933 		maxloop--;
1934 		KASSERT(maxloop > 0,
1935 		    ("[pmc,%d] (ri%d, rc%d) waiting too long for "
1936 			"pmc to be free", __LINE__,
1937 			PMC_TO_ROWINDEX(pm), pm->pm_runcount));
1938 #endif
1939 		pmc_force_context_switch();
1940 	}
1941 }
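
/*
 * A sketch of the runcount protocol assumed by the wait loop above
 * (see pmc_process_interrupt() and pmc_process_samples() later in
 * this file): a reference is taken whenever a sample naming this PMC
 * is queued from interrupt context and dropped once that sample has
 * been consumed,
 *
 *	atomic_add_rel_32(&pm->pm_runcount, 1);		(NMI: queued)
 *	...
 *	atomic_subtract_rel_32(&pm->pm_runcount, 1);	(consumed)
 *
 * so the loop spins until every queued sample and every CPU still
 * running this PMC has released its reference.
 */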
1942 
1943 /*
1944  * This function does the following things:
1945  *
1946  *  - detaches the PMC from hardware
1947  *  - unlinks all target processes that were attached to it
1948  *  - removes the PMC from its owner's list
1949  *  - destroys the PMC private mutex
1950  *
1951  * Once this function completes, the given pmc pointer can be safely
1952  * FREE'd by the caller.
1953  */
1954 
1955 static void
1956 pmc_release_pmc_descriptor(struct pmc *pm)
1957 {
1958 	u_int ri, cpu;
1959 	enum pmc_mode mode;
1960 	struct pmc_hw *phw;
1961 	struct pmc_owner *po;
1962 	struct pmc_process *pp;
1963 	struct pmc_target *ptgt, *tmp;
1964 	struct pmc_binding pb;
1965 
1966 	sx_assert(&pmc_sx, SX_XLOCKED);
1967 
1968 	KASSERT(pm, ("[pmc,%d] null pmc", __LINE__));
1969 
1970 	ri   = PMC_TO_ROWINDEX(pm);
1971 	mode = PMC_TO_MODE(pm);
1972 
1973 	PMCDBG(PMC,REL,1, "release-pmc pmc=%p ri=%d mode=%d", pm, ri,
1974 	    mode);
1975 
1976 	/*
1977 	 * First, we take the PMC off hardware.
1978 	 */
1979 	cpu = 0;
1980 	if (PMC_IS_SYSTEM_MODE(mode)) {
1981 
1982 		/*
1983 		 * A system mode PMC runs on a specific CPU.  Switch
1984 		 * to this CPU and turn hardware off.
1985 		 */
1986 		pmc_save_cpu_binding(&pb);
1987 
1988 		cpu = PMC_TO_CPU(pm);
1989 
1990 		pmc_select_cpu(cpu);
1991 
1992 		/* switch off non-stalled CPUs */
1993 		if (pm->pm_state == PMC_STATE_RUNNING &&
1994 		    pm->pm_stalled == 0) {
1995 
1996 			phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
1997 
1998 			KASSERT(phw->phw_pmc == pm,
1999 			    ("[pmc, %d] pmc ptr ri(%d) hw(%p) pm(%p)",
2000 				__LINE__, ri, phw->phw_pmc, pm));
2001 			PMCDBG(PMC,REL,2, "stopping cpu=%d ri=%d", cpu, ri);
2002 
2003 			critical_enter();
2004 			md->pmd_stop_pmc(cpu, ri);
2005 			critical_exit();
2006 		}
2007 
2008 		PMCDBG(PMC,REL,2, "decfg cpu=%d ri=%d", cpu, ri);
2009 
2010 		critical_enter();
2011 		md->pmd_config_pmc(cpu, ri, NULL);
2012 		critical_exit();
2013 
2014 		/* adjust the global and process count of SS mode PMCs */
2015 		if (mode == PMC_MODE_SS && pm->pm_state == PMC_STATE_RUNNING) {
2016 			po = pm->pm_owner;
2017 			po->po_sscount--;
2018 			if (po->po_sscount == 0) {
2019 				atomic_subtract_rel_int(&pmc_ss_count, 1);
2020 				LIST_REMOVE(po, po_ssnext);
2021 			}
2022 		}
2023 
2024 		pm->pm_state = PMC_STATE_DELETED;
2025 
2026 		pmc_restore_cpu_binding(&pb);
2027 
2028 		/*
2029 		 * We could have references to this PMC structure in
2030 		 * the per-cpu sample queues.  Wait for the queue to
2031 		 * drain.
2032 		 */
2033 		pmc_wait_for_pmc_idle(pm);
2034 
2035 	} else if (PMC_IS_VIRTUAL_MODE(mode)) {
2036 
2037 		/*
2038 		 * A virtual PMC could be running on multiple CPUs at
2039 		 * a given instant.
2040 		 *
2041 		 * By marking its state as DELETED, we ensure that
2042 		 * this PMC is never further scheduled on hardware.
2043 		 *
2044 		 * Then we wait till all CPUs are done with this PMC.
2045 		 */
2046 		pm->pm_state = PMC_STATE_DELETED;
2047 
2048 
2049 		/* Wait for the PMC's runcount to come to zero. */
2050 		pmc_wait_for_pmc_idle(pm);
2051 
2052 		/*
2053 		 * At this point the PMC is off all CPUs and cannot be
2054 		 * freshly scheduled onto a CPU.  It is now safe to
2055 		 * unlink all targets from this PMC.  If a
2056 		 * process-record's refcount falls to zero, we remove
2057 		 * it from the hash table.  The module-wide SX lock
2058 		 * protects us from races.
2059 		 */
2060 		LIST_FOREACH_SAFE(ptgt, &pm->pm_targets, pt_next, tmp) {
2061 			pp = ptgt->pt_process;
2062 			pmc_unlink_target_process(pm, pp); /* frees 'ptgt' */
2063 
2064 			PMCDBG(PMC,REL,3, "pp->refcnt=%d", pp->pp_refcnt);
2065 
2066 			/*
2067 			 * If the target process record shows that no
2068 			 * PMCs are attached to it, reclaim its space.
2069 			 */
2070 
2071 			if (pp->pp_refcnt == 0) {
2072 				pmc_remove_process_descriptor(pp);
2073 				FREE(pp, M_PMC);
2074 			}
2075 		}
2076 
2077 		cpu = curthread->td_oncpu; /* setup cpu for pmd_release() */
2078 
2079 	}
2080 
2081 	/*
2082 	 * Release any MD resources
2083 	 */
2084 
2085 	(void) md->pmd_release_pmc(cpu, ri, pm);
2086 
2087 	/*
2088 	 * Update row disposition
2089 	 */
2090 
2091 	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm)))
2092 		PMC_UNMARK_ROW_STANDALONE(ri);
2093 	else
2094 		PMC_UNMARK_ROW_THREAD(ri);
2095 
2096 	/* unlink from the owner's list */
2097 	if (pm->pm_owner) {
2098 		LIST_REMOVE(pm, pm_next);
2099 		pm->pm_owner = NULL;
2100 	}
2101 
2102 	pmc_destroy_pmc_descriptor(pm);
2103 }
2104 
2105 /*
2106  * Register an owner and a pmc.
2107  */
2108 
2109 static int
2110 pmc_register_owner(struct proc *p, struct pmc *pmc)
2111 {
2112 	struct pmc_owner *po;
2113 
2114 	sx_assert(&pmc_sx, SX_XLOCKED);
2115 
2116 	if ((po = pmc_find_owner_descriptor(p)) == NULL)
2117 		if ((po = pmc_allocate_owner_descriptor(p)) == NULL)
2118 			return ENOMEM;
2119 
2120 	KASSERT(pmc->pm_owner == NULL,
2121 	    ("[pmc,%d] attempting to own an initialized PMC", __LINE__));
2122 	pmc->pm_owner  = po;
2123 
2124 	LIST_INSERT_HEAD(&po->po_pmcs, pmc, pm_next);
2125 
2126 	PROC_LOCK(p);
2127 	p->p_flag |= P_HWPMC;
2128 	PROC_UNLOCK(p);
2129 
2130 	if (po->po_flags & PMC_PO_OWNS_LOGFILE)
2131 		pmclog_process_pmcallocate(pmc);
2132 
2133 	PMCDBG(PMC,REG,1, "register-owner pmc-owner=%p pmc=%p",
2134 	    po, pmc);
2135 
2136 	return 0;
2137 }
2138 
2139 /*
2140  * Return the current row disposition:
2141  * == 0 => FREE
2142  *  > 0 => PROCESS MODE
2143  *  < 0 => SYSTEM MODE
2144  */
2145 
2146 int
2147 pmc_getrowdisp(int ri)
2148 {
2149 	return pmc_pmcdisp[ri];
2150 }
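
/*
 * An illustrative reading of the returned value, following the
 * PMC_ROW_DISP_* macros defined earlier:
 *
 *	disp = pmc_getrowdisp(ri);
 *	if (disp == 0)		row 'ri' is free;
 *	else if (disp > 0)	'disp' process-mode PMCs use row 'ri';
 *	else			row 'ri' is reserved for system-wide
 *				(standalone) PMCs.
 */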
2151 
2152 /*
2153  * Check if a PMC at row index 'ri' can be allocated to the current
2154  * process.
2155  *
2156  * Allocation can fail if:
2157  *   - the current process is already being profiled by a PMC at index 'ri',
2158  *     attached to it via OP_PMCATTACH.
2159  *   - the current process has already allocated a PMC at index 'ri'
2160  *     via OP_ALLOCATE.
2161  */
2162 
2163 static int
2164 pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu)
2165 {
2166 	enum pmc_mode mode;
2167 	struct pmc *pm;
2168 	struct pmc_owner *po;
2169 	struct pmc_process *pp;
2170 
2171 	PMCDBG(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d "
2172 	    "cpu=%d", p, p->p_pid, p->p_comm, ri, cpu);
2173 
2174 	/*
2175 	 * We shouldn't have already allocated a process-mode PMC at
2176 	 * row index 'ri'.
2177 	 *
2178 	 * We shouldn't have allocated a system-wide PMC on the same
2179 	 * CPU and same RI.
2180 	 */
2181 	if ((po = pmc_find_owner_descriptor(p)) != NULL)
2182 		LIST_FOREACH(pm, &po->po_pmcs, pm_next) {
2183 		    if (PMC_TO_ROWINDEX(pm) == ri) {
2184 			    mode = PMC_TO_MODE(pm);
2185 			    if (PMC_IS_VIRTUAL_MODE(mode))
2186 				    return EEXIST;
2187 			    if (PMC_IS_SYSTEM_MODE(mode) &&
2188 				(int) PMC_TO_CPU(pm) == cpu)
2189 				    return EEXIST;
2190 		    }
2191 	        }
2192 
2193 	/*
2194 	 * We also shouldn't be the target of any PMC at this index
2195 	 * since otherwise a PMC_ATTACH to ourselves will fail.
2196 	 */
2197 	if ((pp = pmc_find_process_descriptor(p, 0)) != NULL)
2198 		if (pp->pp_pmcs[ri].pp_pmc)
2199 			return EEXIST;
2200 
2201 	PMCDBG(PMC,ALR,2, "can-allocate-rowindex proc=%p (%d, %s) ri=%d ok",
2202 	    p, p->p_pid, p->p_comm, ri);
2203 
2204 	return 0;
2205 }
2206 
2207 /*
2208  * Check if a given PMC at row index 'ri' can be currently used in
2209  * mode 'mode'.
2210  */
2211 
2212 static int
2213 pmc_can_allocate_row(int ri, enum pmc_mode mode)
2214 {
2215 	enum pmc_disp	disp;
2216 
2217 	sx_assert(&pmc_sx, SX_XLOCKED);
2218 
2219 	PMCDBG(PMC,ALR,1, "can-allocate-row ri=%d mode=%d", ri, mode);
2220 
2221 	if (PMC_IS_SYSTEM_MODE(mode))
2222 		disp = PMC_DISP_STANDALONE;
2223 	else
2224 		disp = PMC_DISP_THREAD;
2225 
2226 	/*
2227 	 * check disposition for PMC row 'ri':
2228 	 *
2229 	 * Expected disposition		Row-disposition		Result
2230 	 *
2231 	 * STANDALONE			STANDALONE or FREE	proceed
2232 	 * STANDALONE			THREAD			fail
2233 	 * THREAD			THREAD or FREE		proceed
2234 	 * THREAD			STANDALONE		fail
2235 	 */
2236 
2237 	if (!PMC_ROW_DISP_IS_FREE(ri) &&
2238 	    !(disp == PMC_DISP_THREAD && PMC_ROW_DISP_IS_THREAD(ri)) &&
2239 	    !(disp == PMC_DISP_STANDALONE && PMC_ROW_DISP_IS_STANDALONE(ri)))
2240 		return EBUSY;
2241 
2242 	/*
2243 	 * All OK
2244 	 */
2245 
2246 	PMCDBG(PMC,ALR,2, "can-allocate-row ri=%d mode=%d ok", ri, mode);
2247 
2248 	return 0;
2249 
2250 }
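
/*
 * A minimal sketch of how the two checks above compose when trying
 * to allocate row 'ri' in mode 'mode'; PMC_OP_PMCALLOCATE below uses
 * them exactly this way (the functions return EBUSY and EEXIST
 * respectively on conflict):
 *
 *	if (pmc_can_allocate_row(ri, mode) == 0 &&
 *	    pmc_can_allocate_rowindex(curthread->td_proc, ri, cpu) == 0)
 *		... row 'ri' may be allocated ...
 */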
2251 
2252 /*
2253  * Find the PMC descriptor with user handle 'pmcid' among those owned by 'po'.
2254  */
2255 
2256 static struct pmc *
2257 pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmcid)
2258 {
2259 	struct pmc *pm;
2260 
2261 	KASSERT(PMC_ID_TO_ROWINDEX(pmcid) < md->pmd_npmc,
2262 	    ("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__,
2263 		PMC_ID_TO_ROWINDEX(pmcid), md->pmd_npmc));
2264 
2265 	LIST_FOREACH(pm, &po->po_pmcs, pm_next)
2266 	    if (pm->pm_id == pmcid)
2267 		    return pm;
2268 
2269 	return NULL;
2270 }
2271 
2272 static int
2273 pmc_find_pmc(pmc_id_t pmcid, struct pmc **pmc)
2274 {
2275 
2276 	struct pmc *pm;
2277 	struct pmc_owner *po;
2278 
2279 	PMCDBG(PMC,FND,1, "find-pmc id=%d", pmcid);
2280 
2281 	if ((po = pmc_find_owner_descriptor(curthread->td_proc)) == NULL)
2282 		return ESRCH;
2283 
2284 	if ((pm = pmc_find_pmc_descriptor_in_process(po, pmcid)) == NULL)
2285 		return EINVAL;
2286 
2287 	PMCDBG(PMC,FND,2, "find-pmc id=%d -> pmc=%p", pmcid, pm);
2288 
2289 	*pmc = pm;
2290 	return 0;
2291 }
2292 
2293 /*
2294  * Start a PMC.
2295  */
2296 
2297 static int
2298 pmc_start(struct pmc *pm)
2299 {
2300 	int error, cpu, ri;
2301 	enum pmc_mode mode;
2302 	struct pmc_owner *po;
2303 	struct pmc_binding pb;
2304 
2305 	KASSERT(pm != NULL,
2306 	    ("[pmc,%d] null pm", __LINE__));
2307 
2308 	mode = PMC_TO_MODE(pm);
2309 	ri   = PMC_TO_ROWINDEX(pm);
2310 	error = 0;
2311 
2312 	PMCDBG(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, mode, ri);
2313 
2314 	po = pm->pm_owner;
2315 
2316 	if (PMC_IS_VIRTUAL_MODE(mode)) {
2317 
2318 		/*
2319 		 * If a PMCATTACH has never been done on this PMC,
2320 		 * attach it to its owner process.
2321 		 */
2322 
2323 		if (LIST_EMPTY(&pm->pm_targets))
2324 			error = (pm->pm_flags & PMC_F_ATTACH_DONE) ? ESRCH :
2325 			    pmc_attach_process(po->po_owner, pm);
2326 
2327 		/*
2328 		 * Disallow PMCSTART if a logfile is required but has not
2329 		 * been configured yet.
2330 		 */
2331 
2332 		if (error == 0 && (pm->pm_flags & PMC_F_NEEDS_LOGFILE) &&
2333 		    (po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
2334 			error = EDOOFUS;
2335 
2336 		/*
2337 		 * If the PMC is attached to its owner, then force a context
2338 		 * switch to ensure that the MD state gets set correctly.
2339 		 */
2340 
2341 		if (error == 0) {
2342 			pm->pm_state = PMC_STATE_RUNNING;
2343 			if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER)
2344 				pmc_force_context_switch();
2345 		}
2346 
2347 		return error;
2348 	}
2349 
2350 
2351 	/*
2352 	 * A system-wide PMC.
2353 	 */
2354 
2355 	if ((pm->pm_flags & PMC_F_NEEDS_LOGFILE) &&
2356 	    (po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
2357 		return EDOOFUS;	/* programming error */
2358 
2359 	/*
2360 	 * Add the owner to the global list if this is a system-wide
2361 	 * sampling PMC.
2362 	 */
2363 
2364 	if (mode == PMC_MODE_SS) {
2365 		if (po->po_sscount == 0) {
2366 			LIST_INSERT_HEAD(&pmc_ss_owners, po, po_ssnext);
2367 			atomic_add_rel_int(&pmc_ss_count, 1);
2368 			PMCDBG(PMC,OPS,1, "po=%p in global list", po);
2369 		}
2370 		po->po_sscount++;
2371 	}
2372 
2373 	/* TODO: dump system wide process mappings to the log? */
2374 
2375 	/*
2376 	 * Move to the CPU associated with this
2377 	 * PMC, and start the hardware.
2378 	 */
2379 
2380 	pmc_save_cpu_binding(&pb);
2381 
2382 	cpu = PMC_TO_CPU(pm);
2383 
2384 	if (pmc_cpu_is_disabled(cpu))
2385 		return ENXIO;
2386 
2387 	pmc_select_cpu(cpu);
2388 
2389 	/*
2390 	 * global PMCs are configured at allocation time
2391 	 * so write out the initial value and start the PMC.
2392 	 */
2393 
2394 	pm->pm_state = PMC_STATE_RUNNING;
2395 
2396 	critical_enter();
2397 	if ((error = md->pmd_write_pmc(cpu, ri,
2398 		 PMC_IS_SAMPLING_MODE(mode) ?
2399 		 pm->pm_sc.pm_reloadcount :
2400 		 pm->pm_sc.pm_initial)) == 0)
2401 		error = md->pmd_start_pmc(cpu, ri);
2402 	critical_exit();
2403 
2404 	pmc_restore_cpu_binding(&pb);
2405 
2406 	return error;
2407 }
2408 
2409 /*
2410  * Stop a PMC.
2411  */
2412 
2413 static int
2414 pmc_stop(struct pmc *pm)
2415 {
2416 	int cpu, error, ri;
2417 	struct pmc_owner *po;
2418 	struct pmc_binding pb;
2419 
2420 	KASSERT(pm != NULL, ("[pmc,%d] null pmc", __LINE__));
2421 
2422 	PMCDBG(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm,
2423 	    PMC_TO_MODE(pm), PMC_TO_ROWINDEX(pm));
2424 
2425 	pm->pm_state = PMC_STATE_STOPPED;
2426 
2427 	/*
2428 	 * If the PMC is a virtual mode one, changing the state to
2429 	 * non-RUNNING is enough to ensure that the PMC never gets
2430 	 * scheduled.
2431 	 *
2432 	 * If this PMC is currently running on a CPU, then it will be
2433 	 * handled correctly when its target process is context
2434 	 * switched out.
2435 	 */
2436 
2437 	if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
2438 		return 0;
2439 
2440 	/*
2441 	 * A system-mode PMC.  Move to the CPU associated with
2442 	 * this PMC, and stop the hardware.  We update the
2443 	 * 'initial count' so that a subsequent PMCSTART will
2444 	 * resume counting from the current hardware count.
2445 	 */
2446 
2447 	pmc_save_cpu_binding(&pb);
2448 
2449 	cpu = PMC_TO_CPU(pm);
2450 
2451 	KASSERT(cpu >= 0 && cpu < mp_ncpus,
2452 	    ("[pmc,%d] illegal cpu=%d", __LINE__, cpu));
2453 
2454 	if (pmc_cpu_is_disabled(cpu))
2455 		return ENXIO;
2456 
2457 	pmc_select_cpu(cpu);
2458 
2459 	ri = PMC_TO_ROWINDEX(pm);
2460 
2461 	critical_enter();
2462 	if ((error = md->pmd_stop_pmc(cpu, ri)) == 0)
2463 		error = md->pmd_read_pmc(cpu, ri, &pm->pm_sc.pm_initial);
2464 	critical_exit();
2465 
2466 	pmc_restore_cpu_binding(&pb);
2467 
2468 	po = pm->pm_owner;
2469 
2470 	/* remove this owner from the global list of SS PMC owners */
2471 	if (PMC_TO_MODE(pm) == PMC_MODE_SS) {
2472 		po->po_sscount--;
2473 		if (po->po_sscount == 0) {
2474 			atomic_subtract_rel_int(&pmc_ss_count, 1);
2475 			LIST_REMOVE(po, po_ssnext);
2476 			PMCDBG(PMC,OPS,2,"po=%p removed from global list", po);
2477 		}
2478 	}
2479 
2480 	return error;
2481 }
2482 
2483 
2484 #ifdef	DEBUG
2485 static const char *pmc_op_to_name[] = {
2486 #undef	__PMC_OP
2487 #define	__PMC_OP(N, D)	#N ,
2488 	__PMC_OPS()
2489 	NULL
2490 };
2491 #endif
2492 
2493 /*
2494  * The syscall interface
2495  */
2496 
2497 #define	PMC_GET_SX_XLOCK(...) do {		\
2498 	sx_xlock(&pmc_sx);			\
2499 	if (pmc_hook == NULL) {			\
2500 		sx_xunlock(&pmc_sx);		\
2501 		return __VA_ARGS__;		\
2502 	}					\
2503 } while (0)
2504 
2505 #define	PMC_DOWNGRADE_SX() do {			\
2506 	sx_downgrade(&pmc_sx);			\
2507 	is_sx_downgraded = 1;			\
2508 } while (0)
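
/*
 * Lock discipline for the handler below, as implied by the two macros
 * above: every operation enters with 'pmc_sx' held exclusively (or
 * fails with the supplied error value if the module is being torn
 * down), and read-mostly operations downgrade to a shared hold:
 *
 *	PMC_GET_SX_XLOCK(ENOSYS);	(xlock; bail if no hook)
 *	...
 *	PMC_DOWNGRADE_SX();		(now holding a shared lock)
 *	...
 *	is_sx_downgraded ? sx_sunlock(&pmc_sx) : sx_xunlock(&pmc_sx);
 */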
2509 
2510 static int
2511 pmc_syscall_handler(struct thread *td, void *syscall_args)
2512 {
2513 	int error, is_sx_downgraded, op;
2514 	struct pmc_syscall_args *c;
2515 	void *arg;
2516 
2517 	PMC_GET_SX_XLOCK(ENOSYS);
2518 
2519 	DROP_GIANT();
2520 
2521 	is_sx_downgraded = 0;
2522 
2523 	c = (struct pmc_syscall_args *) syscall_args;
2524 
2525 	op = c->pmop_code;
2526 	arg = c->pmop_data;
2527 
2528 	PMCDBG(MOD,PMS,1, "syscall op=%d \"%s\" arg=%p", op,
2529 	    pmc_op_to_name[op], arg);
2530 
2531 	error = 0;
2532 	atomic_add_int(&pmc_stats.pm_syscalls, 1);
2533 
2534 	switch(op)
2535 	{
2536 
2537 
2538 	/*
2539 	 * Configure a log file.
2540 	 *
2541 	 * XXX This OP will be reworked.
2542 	 */
2543 
2544 	case PMC_OP_CONFIGURELOG:
2545 	{
2546 		struct proc *p;
2547 		struct pmc *pm;
2548 		struct pmc_owner *po;
2549 		struct pmckern_map_in *km, *kmbase;
2550 		struct pmc_op_configurelog cl;
2551 
2552 		sx_assert(&pmc_sx, SX_XLOCKED);
2553 
2554 		if ((error = copyin(arg, &cl, sizeof(cl))) != 0)
2555 			break;
2556 
2557 		/* mark this process as owning a log file */
2558 		p = td->td_proc;
2559 		if ((po = pmc_find_owner_descriptor(p)) == NULL)
2560 			if ((po = pmc_allocate_owner_descriptor(p)) == NULL) {
2561 				error = ENOMEM;
2562 				break;
2563 			}
2564 
2565 		/*
2566 		 * If a valid file descriptor was passed in, try to
2567 		 * configure it as the log file.  Otherwise, if 'fd' is
2568 		 * negative and a log file is currently configured,
2569 		 * flush its buffers and de-configure it.
2570 		 */
2571 		if (cl.pm_logfd >= 0)
2572 			error = pmclog_configure_log(po, cl.pm_logfd);
2573 		else if (po->po_flags & PMC_PO_OWNS_LOGFILE) {
2574 			pmclog_process_closelog(po);
2575 			error = pmclog_flush(po);
2576 			if (error == 0) {
2577 				LIST_FOREACH(pm, &po->po_pmcs, pm_next)
2578 				    if (pm->pm_flags & PMC_F_NEEDS_LOGFILE &&
2579 					pm->pm_state == PMC_STATE_RUNNING)
2580 					    pmc_stop(pm);
2581 				error = pmclog_deconfigure_log(po);
2582 			}
2583 		} else
2584 			error = EINVAL;
2585 
2586 		if (error)
2587 			break;
2588 
2589 		/*
2590 		 * Log the current set of kernel modules.
2591 		 */
2592 		kmbase = linker_hwpmc_list_objects();
2593 		for (km = kmbase; km->pm_file != NULL; km++) {
2594 			PMCDBG(LOG,REG,1,"%s %p", (char *) km->pm_file,
2595 			    (void *) km->pm_address);
2596 			pmclog_process_map_in(po, (pid_t) -1, km->pm_address,
2597 			    km->pm_file);
2598 		}
2599 		FREE(kmbase, M_LINKER);
2600 	}
2601 	break;
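
	/*
	 * A hypothetical userland view of this operation (normally
	 * reached through pmc_configure_logfile(3) in libpmc, with
	 * 'pmc_syscall' standing for the module's dynamically
	 * looked-up syscall number):
	 *
	 *	struct pmc_op_configurelog cl;
	 *
	 *	cl.pm_logfd = fd;	(fd >= 0 attaches, fd < 0 detaches)
	 *	error = syscall(pmc_syscall, PMC_OP_CONFIGURELOG, &cl);
	 */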
2602 
2603 
2604 	/*
2605 	 * Flush a log file.
2606 	 */
2607 
2608 	case PMC_OP_FLUSHLOG:
2609 	{
2610 		struct pmc_owner *po;
2611 
2612 		sx_assert(&pmc_sx, SX_XLOCKED);
2613 
2614 		if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) {
2615 			error = EINVAL;
2616 			break;
2617 		}
2618 
2619 		error = pmclog_flush(po);
2620 	}
2621 	break;
2622 
2623 	/*
2624 	 * Retrieve hardware configuration.
2625 	 */
2626 
2627 	case PMC_OP_GETCPUINFO:	/* CPU information */
2628 	{
2629 		struct pmc_op_getcpuinfo gci;
2630 
2631 		gci.pm_cputype = md->pmd_cputype;
2632 		gci.pm_ncpu    = mp_ncpus;
2633 		gci.pm_npmc    = md->pmd_npmc;
2634 		gci.pm_nclass  = md->pmd_nclass;
2635 		bcopy(md->pmd_classes, &gci.pm_classes,
2636 		    sizeof(gci.pm_classes));
2637 		error = copyout(&gci, arg, sizeof(gci));
2638 	}
2639 	break;
2640 
2641 
2642 	/*
2643 	 * Get module statistics
2644 	 */
2645 
2646 	case PMC_OP_GETDRIVERSTATS:
2647 	{
2648 		struct pmc_op_getdriverstats gms;
2649 
2650 		bcopy(&pmc_stats, &gms, sizeof(gms));
2651 		error = copyout(&gms, arg, sizeof(gms));
2652 	}
2653 	break;
2654 
2655 
2656 	/*
2657 	 * Retrieve module version number
2658 	 */
2659 
2660 	case PMC_OP_GETMODULEVERSION:
2661 	{
2662 		uint32_t cv, modv;
2663 
2664 		/* retrieve the client's idea of the ABI version */
2665 		if ((error = copyin(arg, &cv, sizeof(uint32_t))) != 0)
2666 			break;
2667 		/* don't service clients newer than our driver */
2668 		modv = PMC_VERSION;
2669 		if ((cv & 0xFFFF0000) > (modv & 0xFFFF0000)) {
2670 			error = EPROGMISMATCH;
2671 			break;
2672 		}
2673 		error = copyout(&modv, arg, sizeof(int));
2674 	}
2675 	break;
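
	/*
	 * The 0xFFFF0000 mask above assumes the version layout used by
	 * PMC_VERSION in <sys/pmc.h>, with the major and minor numbers
	 * in the two most significant bytes:
	 *
	 *	version = (major << 24) | (minor << 16) | patch;
	 *
	 * e.g. a 1.1 client (0x0101xxxx) is serviced by a 1.2 module
	 * (0x0102xxxx), while a 1.3 client is refused with
	 * EPROGMISMATCH.
	 */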
2676 
2677 
2678 	/*
2679 	 * Retrieve the state of all the PMCs on a given
2680 	 * CPU.
2681 	 */
2682 
2683 	case PMC_OP_GETPMCINFO:
2684 	{
2685 		uint32_t cpu, n, npmc;
2686 		size_t pmcinfo_size;
2687 		struct pmc *pm;
2688 		struct pmc_info *p, *pmcinfo;
2689 		struct pmc_op_getpmcinfo *gpi;
2690 		struct pmc_owner *po;
2691 		struct pmc_binding pb;
2692 
2693 		PMC_DOWNGRADE_SX();
2694 
2695 		gpi = (struct pmc_op_getpmcinfo *) arg;
2696 
2697 		if ((error = copyin(&gpi->pm_cpu, &cpu, sizeof(cpu))) != 0)
2698 			break;
2699 
2700 		if (cpu >= (unsigned int) mp_ncpus) {
2701 			error = EINVAL;
2702 			break;
2703 		}
2704 
2705 		if (pmc_cpu_is_disabled(cpu)) {
2706 			error = ENXIO;
2707 			break;
2708 		}
2709 
2710 		/* switch to CPU 'cpu' */
2711 		pmc_save_cpu_binding(&pb);
2712 		pmc_select_cpu(cpu);
2713 
2714 		npmc = md->pmd_npmc;
2715 
2716 		pmcinfo_size = npmc * sizeof(struct pmc_info);
2717 		MALLOC(pmcinfo, struct pmc_info *, pmcinfo_size, M_PMC,
2718 		    M_WAITOK);
2719 
2720 		p = pmcinfo;
2721 
2722 		for (n = 0; n < md->pmd_npmc; n++, p++) {
2723 
2724 			if ((error = md->pmd_describe(cpu, n, p, &pm)) != 0)
2725 				break;
2726 
2727 			if (PMC_ROW_DISP_IS_STANDALONE(n))
2728 				p->pm_rowdisp = PMC_DISP_STANDALONE;
2729 			else if (PMC_ROW_DISP_IS_THREAD(n))
2730 				p->pm_rowdisp = PMC_DISP_THREAD;
2731 			else
2732 				p->pm_rowdisp = PMC_DISP_FREE;
2733 
2734 			p->pm_ownerpid = -1;
2735 
2736 			if (pm == NULL)	/* no PMC associated */
2737 				continue;
2738 
2739 			po = pm->pm_owner;
2740 
2741 			KASSERT(po->po_owner != NULL,
2742 			    ("[pmc,%d] pmc_owner had a null proc pointer",
2743 				__LINE__));
2744 
2745 			p->pm_ownerpid = po->po_owner->p_pid;
2746 			p->pm_mode     = PMC_TO_MODE(pm);
2747 			p->pm_event    = pm->pm_event;
2748 			p->pm_flags    = pm->pm_flags;
2749 
2750 			if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
2751 				p->pm_reloadcount =
2752 				    pm->pm_sc.pm_reloadcount;
2753 		}
2754 
2755 		pmc_restore_cpu_binding(&pb);
2756 
2757 		/* now copy out the PMC info collected */
2758 		if (error == 0)
2759 			error = copyout(pmcinfo, &gpi->pm_pmcs, pmcinfo_size);
2760 
2761 		FREE(pmcinfo, M_PMC);
2762 	}
2763 	break;
2764 
2765 
2766 	/*
2767 	 * Set the administrative state of a PMC.  I.e. whether
2768 	 * the PMC is to be used or not.
2769 	 */
2770 
2771 	case PMC_OP_PMCADMIN:
2772 	{
2773 		int cpu, ri;
2774 		enum pmc_state request;
2775 		struct pmc_cpu *pc;
2776 		struct pmc_hw *phw;
2777 		struct pmc_op_pmcadmin pma;
2778 		struct pmc_binding pb;
2779 
2780 		sx_assert(&pmc_sx, SX_XLOCKED);
2781 
2782 		KASSERT(td == curthread,
2783 		    ("[pmc,%d] td != curthread", __LINE__));
2784 
2785 		if (suser(td) || jailed(td->td_ucred)) {
2786 			error =  EPERM;
2787 			break;
2788 		}
2789 
2790 		if ((error = copyin(arg, &pma, sizeof(pma))) != 0)
2791 			break;
2792 
2793 		cpu = pma.pm_cpu;
2794 
2795 		if (cpu < 0 || cpu >= mp_ncpus) {
2796 			error = EINVAL;
2797 			break;
2798 		}
2799 
2800 		if (pmc_cpu_is_disabled(cpu)) {
2801 			error = ENXIO;
2802 			break;
2803 		}
2804 
2805 		request = pma.pm_state;
2806 
2807 		if (request != PMC_STATE_DISABLED &&
2808 		    request != PMC_STATE_FREE) {
2809 			error = EINVAL;
2810 			break;
2811 		}
2812 
2813 		ri = pma.pm_pmc; /* pmc id == row index */
2814 		if (ri < 0 || ri >= (int) md->pmd_npmc) {
2815 			error = EINVAL;
2816 			break;
2817 		}
2818 
2819 		/*
2820 		 * We can't disable a PMC with a row-index allocated
2821 		 * for process virtual PMCs.
2822 		 */
2823 
2824 		if (PMC_ROW_DISP_IS_THREAD(ri) &&
2825 		    request == PMC_STATE_DISABLED) {
2826 			error = EBUSY;
2827 			break;
2828 		}
2829 
2830 		/*
2831 		 * otherwise, this PMC on this CPU is either free or
2832 		 * in system-wide mode.
2833 		 */
2834 
2835 		pmc_save_cpu_binding(&pb);
2836 		pmc_select_cpu(cpu);
2837 
2838 		pc  = pmc_pcpu[cpu];
2839 		phw = pc->pc_hwpmcs[ri];
2840 
2841 		/*
2842 		 * XXX do we need some kind of 'forced' disable?
2843 		 */
2844 
2845 		if (phw->phw_pmc == NULL) {
2846 			if (request == PMC_STATE_DISABLED &&
2847 			    (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED)) {
2848 				phw->phw_state &= ~PMC_PHW_FLAG_IS_ENABLED;
2849 				PMC_MARK_ROW_STANDALONE(ri);
2850 			} else if (request == PMC_STATE_FREE &&
2851 			    (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0) {
2852 				phw->phw_state |=  PMC_PHW_FLAG_IS_ENABLED;
2853 				PMC_UNMARK_ROW_STANDALONE(ri);
2854 			}
2855 			/* other cases are a no-op */
2856 		} else
2857 			error = EBUSY;
2858 
2859 		pmc_restore_cpu_binding(&pb);
2860 	}
2861 	break;
2862 
2863 
2864 	/*
2865 	 * Allocate a PMC.
2866 	 */
2867 
2868 	case PMC_OP_PMCALLOCATE:
2869 	{
2870 		uint32_t caps;
2871 		u_int cpu;
2872 		int n;
2873 		enum pmc_mode mode;
2874 		struct pmc *pmc;
2875 		struct pmc_hw *phw;
2876 		struct pmc_op_pmcallocate pa;
2877 		struct pmc_binding pb;
2878 
2879 		if ((error = copyin(arg, &pa, sizeof(pa))) != 0)
2880 			break;
2881 
2882 		caps = pa.pm_caps;
2883 		mode = pa.pm_mode;
2884 		cpu  = pa.pm_cpu;
2885 
2886 		if ((mode != PMC_MODE_SS  &&  mode != PMC_MODE_SC  &&
2887 		     mode != PMC_MODE_TS  &&  mode != PMC_MODE_TC) ||
2888 		    (cpu != (u_int) PMC_CPU_ANY && cpu >= (u_int) mp_ncpus)) {
2889 			error = EINVAL;
2890 			break;
2891 		}
2892 
2893 		/*
2894 		 * Virtual PMCs should only ask for a default CPU.
2895 		 * System mode PMCs need to specify a non-default CPU.
2896 		 */
2897 
2898 		if ((PMC_IS_VIRTUAL_MODE(mode) && cpu != (u_int) PMC_CPU_ANY) ||
2899 		    (PMC_IS_SYSTEM_MODE(mode) && cpu == (u_int) PMC_CPU_ANY)) {
2900 			error = EINVAL;
2901 			break;
2902 		}
2903 
2904 		/*
2905 		 * Check that a disabled CPU is not being asked for.
2906 		 */
2907 
2908 		if (PMC_IS_SYSTEM_MODE(mode) && pmc_cpu_is_disabled(cpu)) {
2909 			error = ENXIO;
2910 			break;
2911 		}
2912 
2913 		/*
2914 		 * Refuse an allocation for a system-wide PMC if this
2915 		 * process has been jailed, or if this process lacks
2916 		 * super-user credentials and the sysctl tunable
2917 		 * 'security.bsd.unprivileged_syspmcs' is zero.
2918 		 */
2919 
2920 		if (PMC_IS_SYSTEM_MODE(mode)) {
2921 			if (jailed(curthread->td_ucred))
2922 				error = EPERM;
2923 			else if (suser(curthread) &&
2924 			    (pmc_unprivileged_syspmcs == 0))
2925 				error = EPERM;
2926 		}
2927 
2928 		if (error)
2929 			break;
2930 
2931 		/*
2932 		 * Look for valid values for 'pm_flags'
2933 		 */
2934 
2935 		if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW |
2936 		    PMC_F_LOG_PROCEXIT)) != 0) {
2937 			error = EINVAL;
2938 			break;
2939 		}
2940 
2941 		/* process logging options are not allowed for system PMCs */
2942 		if (PMC_IS_SYSTEM_MODE(mode) && (pa.pm_flags &
2943 		    (PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT))) {
2944 			error = EINVAL;
2945 			break;
2946 		}
2947 
2948 		/*
2949 		 * All sampling mode PMCs need to be able to interrupt the
2950 		 * CPU.
2951 		 */
2952 		if (PMC_IS_SAMPLING_MODE(mode))
2953 			caps |= PMC_CAP_INTERRUPT;
2954 
2955 		/* A valid class specifier should have been passed in. */
2956 		for (n = 0; n < md->pmd_nclass; n++)
2957 			if (md->pmd_classes[n].pm_class == pa.pm_class)
2958 				break;
2959 		if (n == md->pmd_nclass) {
2960 			error = EINVAL;
2961 			break;
2962 		}
2963 
2964 		/* The requested PMC capabilities should be feasible. */
2965 		if ((md->pmd_classes[n].pm_caps & caps) != caps) {
2966 			error = EOPNOTSUPP;
2967 			break;
2968 		}
2969 
2970 		PMCDBG(PMC,ALL,2, "event=%d caps=0x%x mode=%d cpu=%d",
2971 		    pa.pm_ev, caps, mode, cpu);
2972 
2973 		pmc = pmc_allocate_pmc_descriptor();
2974 		pmc->pm_id    = PMC_ID_MAKE_ID(cpu,pa.pm_mode,pa.pm_class,
2975 		    PMC_ID_INVALID);
2976 		pmc->pm_event = pa.pm_ev;
2977 		pmc->pm_state = PMC_STATE_FREE;
2978 		pmc->pm_caps  = caps;
2979 		pmc->pm_flags = pa.pm_flags;
2980 
2981 		/* switch thread to CPU 'cpu' */
2982 		pmc_save_cpu_binding(&pb);
2983 
2984 #define	PMC_IS_SHAREABLE_PMC(cpu, n)				\
2985 	(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_state &		\
2986 	 PMC_PHW_FLAG_IS_SHAREABLE)
2987 #define	PMC_IS_UNALLOCATED(cpu, n)				\
2988 	(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_pmc == NULL)
2989 
2990 		if (PMC_IS_SYSTEM_MODE(mode)) {
2991 			pmc_select_cpu(cpu);
2992 			for (n = 0; n < (int) md->pmd_npmc; n++)
2993 				if (pmc_can_allocate_row(n, mode) == 0 &&
2994 				    pmc_can_allocate_rowindex(
2995 					    curthread->td_proc, n, cpu) == 0 &&
2996 				    (PMC_IS_UNALLOCATED(cpu, n) ||
2997 				     PMC_IS_SHAREABLE_PMC(cpu, n)) &&
2998 				    md->pmd_allocate_pmc(cpu, n, pmc,
2999 					&pa) == 0)
3000 					break;
3001 		} else {
3002 			/* Process virtual mode */
3003 			for (n = 0; n < (int) md->pmd_npmc; n++) {
3004 				if (pmc_can_allocate_row(n, mode) == 0 &&
3005 				    pmc_can_allocate_rowindex(
3006 					    curthread->td_proc, n,
3007 					    PMC_CPU_ANY) == 0 &&
3008 				    md->pmd_allocate_pmc(curthread->td_oncpu,
3009 					n, pmc, &pa) == 0)
3010 					break;
3011 			}
3012 		}
3013 
3014 #undef	PMC_IS_UNALLOCATED
3015 #undef	PMC_IS_SHAREABLE_PMC
3016 
3017 		pmc_restore_cpu_binding(&pb);
3018 
3019 		if (n == (int) md->pmd_npmc) {
3020 			pmc_destroy_pmc_descriptor(pmc);
3021 			FREE(pmc, M_PMC);
3022 			pmc = NULL;
3023 			error = EINVAL;
3024 			break;
3025 		}
3026 
3027 		/* Fill in the correct value in the ID field */
3028 		pmc->pm_id = PMC_ID_MAKE_ID(cpu,mode,pa.pm_class,n);
3029 
3030 		PMCDBG(PMC,ALL,2, "ev=%d class=%d mode=%d n=%d -> pmcid=%x",
3031 		    pmc->pm_event, pa.pm_class, mode, n, pmc->pm_id);
3032 
3033 		/* Process mode PMCs with logging enabled need log files */
3034 		if (pmc->pm_flags & (PMC_F_LOG_PROCEXIT | PMC_F_LOG_PROCCSW))
3035 			pmc->pm_flags |= PMC_F_NEEDS_LOGFILE;
3036 
3037 		/* All system mode sampling PMCs require a log file */
3038 		if (PMC_IS_SAMPLING_MODE(mode) && PMC_IS_SYSTEM_MODE(mode))
3039 			pmc->pm_flags |= PMC_F_NEEDS_LOGFILE;
3040 
3041 		/*
3042 		 * Configure system-wide (global) PMCs immediately.
3043 		 */
3044 
3045 		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pmc))) {
3046 
3047 			pmc_save_cpu_binding(&pb);
3048 			pmc_select_cpu(cpu);
3049 
3050 			phw = pmc_pcpu[cpu]->pc_hwpmcs[n];
3051 
3052 			if ((phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0 ||
3053 			    (error = md->pmd_config_pmc(cpu, n, pmc)) != 0) {
3054 				(void) md->pmd_release_pmc(cpu, n, pmc);
3055 				pmc_destroy_pmc_descriptor(pmc);
3056 				FREE(pmc, M_PMC);
3057 				pmc = NULL;
3058 				pmc_restore_cpu_binding(&pb);
3059 				error = EPERM;
3060 				break;
3061 			}
3062 
3063 			pmc_restore_cpu_binding(&pb);
3064 		}
3065 
3066 		pmc->pm_state    = PMC_STATE_ALLOCATED;
3067 
3068 		/*
3069 		 * mark row disposition
3070 		 */
3071 
3072 		if (PMC_IS_SYSTEM_MODE(mode))
3073 			PMC_MARK_ROW_STANDALONE(n);
3074 		else
3075 			PMC_MARK_ROW_THREAD(n);
3076 
3077 		/*
3078 		 * Register this PMC with the current thread as its owner.
3079 		 */
3080 
3081 		if ((error =
3082 		    pmc_register_owner(curthread->td_proc, pmc)) != 0) {
3083 			pmc_release_pmc_descriptor(pmc);
3084 			FREE(pmc, M_PMC);
3085 			pmc = NULL;
3086 			break;
3087 		}
3088 
3089 		/*
3090 		 * Return the allocated index.
3091 		 */
3092 
3093 		pa.pm_pmcid = pmc->pm_id;
3094 
3095 		error = copyout(&pa, arg, sizeof(pa));
3096 	}
3097 	break;
3098 
3099 
3100 	/*
3101 	 * Attach a PMC to a process.
3102 	 */
3103 
3104 	case PMC_OP_PMCATTACH:
3105 	{
3106 		struct pmc *pm;
3107 		struct proc *p;
3108 		struct pmc_op_pmcattach a;
3109 
3110 		sx_assert(&pmc_sx, SX_XLOCKED);
3111 
3112 		if ((error = copyin(arg, &a, sizeof(a))) != 0)
3113 			break;
3114 
3115 		if (a.pm_pid < 0) {
3116 			error = EINVAL;
3117 			break;
3118 		} else if (a.pm_pid == 0)
3119 			a.pm_pid = td->td_proc->p_pid;
3120 
3121 		if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
3122 			break;
3123 
3124 		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
3125 			error = EINVAL;
3126 			break;
3127 		}
3128 
3129 		/* PMCs may be (re)attached only when allocated or stopped */
3130 		if (pm->pm_state == PMC_STATE_RUNNING) {
3131 			error = EBUSY;
3132 			break;
3133 		} else if (pm->pm_state != PMC_STATE_ALLOCATED &&
3134 		    pm->pm_state != PMC_STATE_STOPPED) {
3135 			error = EINVAL;
3136 			break;
3137 		}
3138 
3139 		/* lookup pid */
3140 		if ((p = pfind(a.pm_pid)) == NULL) {
3141 			error = ESRCH;
3142 			break;
3143 		}
3144 
3145 		/*
3146 		 * Ignore processes that are in the middle of exiting.
3147 		 */
3148 		if (p->p_flag & P_WEXIT) {
3149 			error = ESRCH;
3150 			PROC_UNLOCK(p);	/* pfind() returns a locked process */
3151 			break;
3152 		}
3153 
3154 		/*
3155 		 * we are allowed to attach a PMC to a process if
3156 		 * we can debug it.
3157 		 */
3158 		error = p_candebug(curthread, p);
3159 
3160 		PROC_UNLOCK(p);
3161 
3162 		if (error == 0)
3163 			error = pmc_attach_process(p, pm);
3164 	}
3165 	break;
3166 
3167 
3168 	/*
3169 	 * Detach an attached PMC from a process.
3170 	 */
3171 
3172 	case PMC_OP_PMCDETACH:
3173 	{
3174 		struct pmc *pm;
3175 		struct proc *p;
3176 		struct pmc_op_pmcattach a;
3177 
3178 		if ((error = copyin(arg, &a, sizeof(a))) != 0)
3179 			break;
3180 
3181 		if (a.pm_pid < 0) {
3182 			error = EINVAL;
3183 			break;
3184 		} else if (a.pm_pid == 0)
3185 			a.pm_pid = td->td_proc->p_pid;
3186 
3187 		if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
3188 			break;
3189 
3190 		if ((p = pfind(a.pm_pid)) == NULL) {
3191 			error = ESRCH;
3192 			break;
3193 		}
3194 
3195 		/*
3196 		 * Treat processes that are in the process of exiting
3197 		 * as if they were not present.
3198 		 */
3199 
3200 		if (p->p_flag & P_WEXIT)
3201 			error = ESRCH;
3202 
3203 		PROC_UNLOCK(p);	/* pfind() returns a locked process */
3204 
3205 		if (error == 0)
3206 			error = pmc_detach_process(p, pm);
3207 	}
3208 	break;
3209 
3210 
3211 	/*
3212 	 * Retrieve the MSR number associated with the counter
3213 	 * 'pmc_id'.  This allows processes to directly use RDPMC
3214 	 * instructions to read their PMCs, without the overhead of a
3215 	 * system call.
3216 	 */
3217 
3218 	case PMC_OP_PMCGETMSR:
3219 	{
3220 		int ri;
3221 		struct pmc	*pm;
3222 		struct pmc_target *pt;
3223 		struct pmc_op_getmsr gm;
3224 
3225 		PMC_DOWNGRADE_SX();
3226 
3227 		/* CPU has no 'GETMSR' support */
3228 		if (md->pmd_get_msr == NULL) {
3229 			error = ENOSYS;
3230 			break;
3231 		}
3232 
3233 		if ((error = copyin(arg, &gm, sizeof(gm))) != 0)
3234 			break;
3235 
3236 		if ((error = pmc_find_pmc(gm.pm_pmcid, &pm)) != 0)
3237 			break;
3238 
3239 		/*
3240 		 * The allocated PMC has to be a process virtual PMC,
3241 		 * i.e., of type MODE_T[CS].  Global PMCs can only be
3242 		 * read using the PMCREAD operation since they may be
3243 		 * allocated on a different CPU than the one we could
3244 		 * be running on at the time of the RDPMC instruction.
3245 		 *
3246 		 * The GETMSR operation is not allowed for PMCs that
3247 		 * are inherited across processes.
3248 		 */
3249 
3250 		if (!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) ||
3251 		    (pm->pm_flags & PMC_F_DESCENDANTS)) {
3252 			error = EINVAL;
3253 			break;
3254 		}
3255 
3256 		/*
3257 		 * It only makes sense to use a RDPMC (or its
3258 		 * equivalent instruction on non-x86 architectures) on
3259 		 * a process that has allocated and attached a PMC to
3260 		 * itself.  Conversely the PMC is only allowed to have
3261 		 * one process attached to it -- its owner.
3262 		 */
3263 
3264 		if ((pt = LIST_FIRST(&pm->pm_targets)) == NULL ||
3265 		    LIST_NEXT(pt, pt_next) != NULL ||
3266 		    pt->pt_process->pp_proc != pm->pm_owner->po_owner) {
3267 			error = EINVAL;
3268 			break;
3269 		}
3270 
3271 		ri = PMC_TO_ROWINDEX(pm);
3272 
3273 		if ((error = (*md->pmd_get_msr)(ri, &gm.pm_msr)) < 0)
3274 			break;
3275 
3276 		if ((error = copyout(&gm, arg, sizeof(gm))) < 0)
3277 			break;
3278 
3279 		/*
3280 		 * Mark our process as using MSRs.  Update machine
3281 		 * state using a forced context switch.
3282 		 */
3283 
3284 		pt->pt_process->pp_flags |= PMC_PP_ENABLE_MSR_ACCESS;
3285 		pmc_force_context_switch();
3286 
3287 	}
3288 	break;
3289 
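
	/*
	 * A hypothetical userland sequence enabled by this operation
	 * on x86 (in practice wrapped by libpmc; 'pmc_syscall' stands
	 * for the module's dynamically looked-up syscall number):
	 * fetch the MSR number once, then sample the counter directly
	 * with RDPMC, avoiding a system call per read:
	 *
	 *	struct pmc_op_getmsr gm;
	 *	uint32_t hi, lo;
	 *
	 *	gm.pm_pmcid = pmcid;
	 *	error = syscall(pmc_syscall, PMC_OP_PMCGETMSR, &gm);
	 *	__asm __volatile("rdpmc" : "=a" (lo), "=d" (hi)
	 *	    : "c" (gm.pm_msr));
	 */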
3290 	/*
3291 	 * Release an allocated PMC
3292 	 */
3293 
3294 	case PMC_OP_PMCRELEASE:
3295 	{
3296 		pmc_id_t pmcid;
3297 		struct pmc *pm;
3298 		struct pmc_owner *po;
3299 		struct pmc_op_simple sp;
3300 
3301 		/*
3302 		 * Find PMC pointer for the named PMC.
3303 		 *
3304 		 * Use pmc_release_pmc_descriptor() to switch off the
3305 		 * PMC, remove all its target threads, and remove the
3306 		 * PMC from its owner's list.
3307 		 *
3308 		 * Remove the owner record if this is the last PMC
3309 		 * owned.
3310 		 *
3311 		 * Free up space.
3312 		 */
3313 
3314 		if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
3315 			break;
3316 
3317 		pmcid = sp.pm_pmcid;
3318 
3319 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
3320 			break;
3321 
3322 		po = pm->pm_owner;
3323 		pmc_release_pmc_descriptor(pm);
3324 		pmc_maybe_remove_owner(po);
3325 
3326 		FREE(pm, M_PMC);
3327 	}
3328 	break;
3329 
3330 
3331 	/*
3332 	 * Read and/or write a PMC.
3333 	 */
3334 
3335 	case PMC_OP_PMCRW:
3336 	{
3337 		uint32_t cpu, ri;
3338 		struct pmc *pm;
3339 		struct pmc_op_pmcrw *pprw;
3340 		struct pmc_op_pmcrw prw;
3341 		struct pmc_binding pb;
3342 		pmc_value_t oldvalue;
3343 
3344 		PMC_DOWNGRADE_SX();
3345 
3346 		if ((error = copyin(arg, &prw, sizeof(prw))) != 0)
3347 			break;
3348 
3349 		ri = 0;
3350 		PMCDBG(PMC,OPS,1, "rw id=%d flags=0x%x", prw.pm_pmcid,
3351 		    prw.pm_flags);
3352 
3353 		/* must have at least one flag set */
3354 		if ((prw.pm_flags & (PMC_F_OLDVALUE|PMC_F_NEWVALUE)) == 0) {
3355 			error = EINVAL;
3356 			break;
3357 		}
3358 
3359 		/* locate pmc descriptor */
3360 		if ((error = pmc_find_pmc(prw.pm_pmcid, &pm)) != 0)
3361 			break;
3362 
3363 		/* The PMC must be in a readable state: allocated, stopped or running. */
3364 		if (pm->pm_state != PMC_STATE_ALLOCATED &&
3365 		    pm->pm_state != PMC_STATE_STOPPED &&
3366 		    pm->pm_state != PMC_STATE_RUNNING) {
3367 			error = EINVAL;
3368 			break;
3369 		}
3370 
3371 		/* writing a new value is allowed only for 'STOPPED' pmcs */
3372 		if (pm->pm_state == PMC_STATE_RUNNING &&
3373 		    (prw.pm_flags & PMC_F_NEWVALUE)) {
3374 			error = EBUSY;
3375 			break;
3376 		}
3377 
3378 		if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
3379 
3380 			/*
3381 			 * If this PMC is attached to its owner (i.e.,
3382 			 * the process requesting this operation) and
3383 			 * is running, then attempt to get an
3384 			 * up-to-date reading from hardware for a READ.
3385 			 * Writes are only allowed when the PMC is
3386 			 * stopped, so only update the saved value
3387 			 * field.
3388 			 *
3389 			 * If the PMC is not running, or is not
3390 			 * attached to its owner, read/write to the
3391 			 * savedvalue field.
3392 			 */
3393 
3394 			ri = PMC_TO_ROWINDEX(pm);
3395 
3396 			mtx_pool_lock_spin(pmc_mtxpool, pm);
3397 			cpu = curthread->td_oncpu;
3398 
3399 			if (prw.pm_flags & PMC_F_OLDVALUE) {
3400 				if ((pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) &&
3401 				    (pm->pm_state == PMC_STATE_RUNNING))
3402 					error = (*md->pmd_read_pmc)(cpu, ri,
3403 					    &oldvalue);
3404 				else
3405 					oldvalue = pm->pm_gv.pm_savedvalue;
3406 			}
3407 			if (prw.pm_flags & PMC_F_NEWVALUE)
3408 				pm->pm_gv.pm_savedvalue = prw.pm_value;
3409 
3410 			mtx_pool_unlock_spin(pmc_mtxpool, pm);
3411 
3412 		} else { /* System mode PMCs */
3413 			cpu = PMC_TO_CPU(pm);
3414 			ri  = PMC_TO_ROWINDEX(pm);
3415 
3416 			if (pmc_cpu_is_disabled(cpu)) {
3417 				error = ENXIO;
3418 				break;
3419 			}
3420 
3421 			/* move this thread to CPU 'cpu' */
3422 			pmc_save_cpu_binding(&pb);
3423 			pmc_select_cpu(cpu);
3424 
3425 			critical_enter();
3426 			/* save old value */
3427 			if (prw.pm_flags & PMC_F_OLDVALUE)
3428 				if ((error = (*md->pmd_read_pmc)(cpu, ri,
3429 					 &oldvalue)))
3430 					goto error;
3431 			/* write out new value */
3432 			if (prw.pm_flags & PMC_F_NEWVALUE)
3433 				error = (*md->pmd_write_pmc)(cpu, ri,
3434 				    prw.pm_value);
3435 		error:
3436 			critical_exit();
3437 			pmc_restore_cpu_binding(&pb);
3438 			if (error)
3439 				break;
3440 		}
3441 
3442 		pprw = (struct pmc_op_pmcrw *) arg;
3443 
3444 #ifdef	DEBUG
3445 		if (prw.pm_flags & PMC_F_NEWVALUE)
3446 			PMCDBG(PMC,OPS,2, "rw id=%d new %jx -> old %jx",
3447 			    ri, prw.pm_value, oldvalue);
3448 		else if (prw.pm_flags & PMC_F_OLDVALUE)
3449 			PMCDBG(PMC,OPS,2, "rw id=%d -> old %jx", ri, oldvalue);
3450 #endif
3451 
3452 		/* return old value if requested */
3453 		if (prw.pm_flags & PMC_F_OLDVALUE)
3454 			if ((error = copyout(&oldvalue, &pprw->pm_value,
3455 				 sizeof(prw.pm_value))))
3456 				break;
3457 
3458 	}
3459 	break;
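
	/*
	 * Caller's view of the flag combinations handled above
	 * (illustrative only; cf. pmc_read(3) and pmc_write(3) in
	 * libpmc):
	 *
	 *	prw.pm_flags = PMC_F_OLDVALUE;	read the current value
	 *	prw.pm_flags = PMC_F_NEWVALUE;	write (PMC stopped)
	 *	prw.pm_flags = PMC_F_OLDVALUE | PMC_F_NEWVALUE;
	 *					read-and-write atomically
	 */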
3460 
3461 
3462 	/*
3463 	 * Set the sampling rate for a sampling mode PMC and the
3464 	 * initial count for a counting mode PMC.
3465 	 */
3466 
3467 	case PMC_OP_PMCSETCOUNT:
3468 	{
3469 		struct pmc *pm;
3470 		struct pmc_op_pmcsetcount sc;
3471 
3472 		PMC_DOWNGRADE_SX();
3473 
3474 		if ((error = copyin(arg, &sc, sizeof(sc))) != 0)
3475 			break;
3476 
3477 		if ((error = pmc_find_pmc(sc.pm_pmcid, &pm)) != 0)
3478 			break;
3479 
3480 		if (pm->pm_state == PMC_STATE_RUNNING) {
3481 			error = EBUSY;
3482 			break;
3483 		}
3484 
3485 		if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
3486 			pm->pm_sc.pm_reloadcount = sc.pm_count;
3487 		else
3488 			pm->pm_sc.pm_initial = sc.pm_count;
3489 	}
3490 	break;
3491 
3492 
3493 	/*
3494 	 * Start a PMC.
3495 	 */
3496 
3497 	case PMC_OP_PMCSTART:
3498 	{
3499 		pmc_id_t pmcid;
3500 		struct pmc *pm;
3501 		struct pmc_op_simple sp;
3502 
3503 		sx_assert(&pmc_sx, SX_XLOCKED);
3504 
3505 		if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
3506 			break;
3507 
3508 		pmcid = sp.pm_pmcid;
3509 
3510 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
3511 			break;
3512 
3513 		KASSERT(pmcid == pm->pm_id,
3514 		    ("[pmc,%d] pmcid %x != id %x", __LINE__,
3515 			pm->pm_id, pmcid));
3516 
3517 		if (pm->pm_state == PMC_STATE_RUNNING) /* already running */
3518 			break;
3519 		else if (pm->pm_state != PMC_STATE_STOPPED &&
3520 		    pm->pm_state != PMC_STATE_ALLOCATED) {
3521 			error = EINVAL;
3522 			break;
3523 		}
3524 
3525 		error = pmc_start(pm);
3526 	}
3527 	break;
3528 
3529 
3530 	/*
3531 	 * Stop a PMC.
3532 	 */
3533 
3534 	case PMC_OP_PMCSTOP:
3535 	{
3536 		pmc_id_t pmcid;
3537 		struct pmc *pm;
3538 		struct pmc_op_simple sp;
3539 
3540 		PMC_DOWNGRADE_SX();
3541 
3542 		if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
3543 			break;
3544 
3545 		pmcid = sp.pm_pmcid;
3546 
3547 		/*
3548 		 * Mark the PMC as inactive and invoke the MD stop
3549 		 * routines if needed.
3550 		 */
3551 
3552 		if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
3553 			break;
3554 
3555 		KASSERT(pmcid == pm->pm_id,
3556 		    ("[pmc,%d] pmc id %x != pmcid %x", __LINE__,
3557 			pm->pm_id, pmcid));
3558 
3559 		if (pm->pm_state == PMC_STATE_STOPPED) /* already stopped */
3560 			break;
3561 		else if (pm->pm_state != PMC_STATE_RUNNING) {
3562 			error = EINVAL;
3563 			break;
3564 		}
3565 
3566 		error = pmc_stop(pm);
3567 	}
3568 	break;
3569 
3570 
3571 	/*
3572 	 * Write a user supplied value to the log file.
3573 	 */
3574 
3575 	case PMC_OP_WRITELOG:
3576 	{
3577 		struct pmc_op_writelog wl;
3578 		struct pmc_owner *po;
3579 
3580 		PMC_DOWNGRADE_SX();
3581 
3582 		if ((error = copyin(arg, &wl, sizeof(wl))) != 0)
3583 			break;
3584 
3585 		if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) {
3586 			error = EINVAL;
3587 			break;
3588 		}
3589 
3590 		if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) {
3591 			error = EINVAL;
3592 			break;
3593 		}
3594 
3595 		error = pmclog_process_userlog(po, &wl);
3596 	}
3597 	break;
3598 
3599 
3600 	default:
3601 		error = EINVAL;
3602 		break;
3603 	}
3604 
3605 	if (is_sx_downgraded)
3606 		sx_sunlock(&pmc_sx);
3607 	else
3608 		sx_xunlock(&pmc_sx);
3609 
3610 	if (error)
3611 		atomic_add_int(&pmc_stats.pm_syscall_errors, 1);
3612 
3613 	PICKUP_GIANT();
3614 
3615 	return error;
3616 }
3617 
3618 /*
3619  * Helper functions
3620  */
3621 
3622 
3623 /*
3624  * Interrupt processing.
3625  *
3626  * Find a free slot in the per-cpu array of PC samples and write the
3627  * current (PMC,PID,PC) triple to it.  If an event was successfully
3628  * added, a bit is set in mask 'pmc_cpumask' denoting that the
3629  * DO_SAMPLES hook needs to be invoked from the clock handler.
3630  *
3631  * This function is meant to be called from an NMI handler.  It cannot
3632  * use any of the locking primitives supplied by the OS.
3633  */
3634 
3635 int
3636 pmc_process_interrupt(int cpu, struct pmc *pm, uintfptr_t pc, int usermode)
3637 {
3638 	int error, ri;
3639 	struct thread *td;
3640 	struct pmc_sample *ps;
3641 	struct pmc_samplebuffer *psb;
3642 
3643 	error = 0;
3644 	ri = PMC_TO_ROWINDEX(pm);
3645 
3646 	psb = pmc_pcpu[cpu]->pc_sb;
3647 
3648 	ps = psb->ps_write;
3649 	if (ps->ps_pc) {	/* in use, reader hasn't caught up */
3650 		pm->pm_stalled = 1;
3651 		atomic_add_int(&pmc_stats.pm_intr_bufferfull, 1);
3652 		PMCDBG(SAM,INT,1,"(spc) cpu=%d pm=%p pc=%jx um=%d wr=%d rd=%d",
3653 		    cpu, pm, (uint64_t) pc, usermode,
3654 		    (int) (psb->ps_write - psb->ps_samples),
3655 		    (int) (psb->ps_read - psb->ps_samples));
3656 		error = ENOMEM;
3657 		goto done;
3658 	}
3659 
3660 	/* fill in entry */
3661 	PMCDBG(SAM,INT,1,"cpu=%d pm=%p pc=%jx um=%d wr=%d rd=%d", cpu, pm,
3662 	    (uint64_t) pc, usermode,
3663 	    (int) (psb->ps_write - psb->ps_samples),
3664 	    (int) (psb->ps_read - psb->ps_samples));
3665 
3666 	atomic_add_rel_32(&pm->pm_runcount, 1);		/* hold onto PMC */
3667 	ps->ps_pmc = pm;
3668 	if ((td = curthread) && td->td_proc)
3669 		ps->ps_pid = td->td_proc->p_pid;
3670 	else
3671 		ps->ps_pid = -1;
3672 	ps->ps_usermode = usermode;
3673 	ps->ps_pc = pc;		/* mark entry as in use */
3674 
3675 	/* increment write pointer, modulo ring buffer size */
3676 	ps++;
3677 	if (ps == psb->ps_fence)
3678 		psb->ps_write = psb->ps_samples;
3679 	else
3680 		psb->ps_write = ps;
3681 
3682  done:
3683 	/* mark CPU as needing processing */
3684 	atomic_set_rel_int(&pmc_cpumask, (1 << cpu));
3685 
3686 	return error;
3687 }
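
/*
 * A sketch of how the bit set in 'pmc_cpumask' above is consumed: the
 * hardclock path tests the bit for the current CPU, clears it and
 * invokes pmc_process_samples() through the DO_SAMPLES hook:
 *
 *	if (pmc_cpumask & (1 << cpu)) {
 *		atomic_clear_rel_int(&pmc_cpumask, (1 << cpu));
 *		pmc_process_samples(cpu);
 *	}
 *
 * This fragment only illustrates the handshake; the actual dispatch
 * is the PMC_FN_DO_SAMPLES case in pmc_hook_handler().
 */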
3688 
3689 
3690 /*
3691  * Process saved PC samples.
3692  */
3693 
3694 static void
3695 pmc_process_samples(int cpu)
3696 {
3697 	int n, ri;
3698 	struct pmc *pm;
3699 	struct thread *td;
3700 	struct pmc_owner *po;
3701 	struct pmc_sample *ps;
3702 	struct pmc_samplebuffer *psb;
3703 
3704 	KASSERT(PCPU_GET(cpuid) == cpu,
3705 	    ("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__,
3706 		PCPU_GET(cpuid), cpu));
3707 
3708 	psb = pmc_pcpu[cpu]->pc_sb;
3709 
3710 	for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */
3711 
3712 		ps = psb->ps_read;
3713 		if (ps->ps_pc == (uintfptr_t) 0)	/* no data */
3714 			break;
3715 
3716 		pm = ps->ps_pmc;
3717 		po = pm->pm_owner;
3718 
3719 		KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)),
3720 		    ("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__,
3721 			pm, PMC_TO_MODE(pm)));
3722 
3723 		/* Ignore PMCs that have been switched off */
3724 		if (pm->pm_state != PMC_STATE_RUNNING)
3725 			goto entrydone;
3726 
3727 		PMCDBG(SAM,OPS,1,"cpu=%d pm=%p pc=%jx um=%d wr=%d rd=%d", cpu,
3728 		    pm, (uint64_t) ps->ps_pc, ps->ps_usermode,
3729 		    (int) (psb->ps_write - psb->ps_samples),
3730 		    (int) (psb->ps_read - psb->ps_samples));
3731 
3732 		/*
3733 		 * If this is a process-mode PMC that is attached to
3734 		 * its owner, and if the PC is in user mode, update
3735 		 * profiling statistics like timer-based profiling
3736 		 * would have done.
3737 		 */
3738 		if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) {
3739 			if (ps->ps_usermode) {
3740 				td = FIRST_THREAD_IN_PROC(po->po_owner);
3741 				addupc_intr(td, ps->ps_pc, 1);
3742 			}
3743 			goto entrydone;
3744 		}
3745 
3746 		/*
3747 		 * Otherwise, this is either a sampling mode PMC that
3748 		 * is attached to a different process than its owner,
3749 		 * or a system-wide sampling PMC.  Dispatch a log
3750 		 * entry to the PMC's owner process.
3751 		 */
3752 
3753 		pmclog_process_pcsample(pm, ps);
3754 
3755 	entrydone:
3756 		ps->ps_pc = (uintfptr_t) 0;	/* mark entry as free */
3757 		atomic_subtract_rel_32(&pm->pm_runcount, 1);
3758 
3759 		/* increment read pointer, modulo ring buffer size */
3760 		if (++ps == psb->ps_fence)
3761 			psb->ps_read = psb->ps_samples;
3762 		else
3763 			psb->ps_read = ps;
3764 	}
3765 
3766 	atomic_add_int(&pmc_stats.pm_log_sweeps, 1);
3767 
3768 	/* Do not re-enable stalled PMCs if we failed to process any samples */
3769 	if (n == 0)
3770 		return;
3771 
3772 	/*
3773 	 * Restart any stalled sampling PMCs on this CPU.
3774 	 *
3775 	 * If the NMI handler sets the pm_stalled field of a PMC after
3776 	 * the check below, we'll end up processing the stalled PMC at
3777 	 * the next hardclock tick.
3778 	 */
3779 	for (n = 0; n < md->pmd_npmc; n++) {
3780 		(void) (*md->pmd_get_config)(cpu,n,&pm);
3781 		if (pm == NULL ||			 /* !cfg'ed */
3782 		    pm->pm_state != PMC_STATE_RUNNING || /* !active */
3783 		    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) || /* !sampling */
3784 		    pm->pm_stalled == 0) /* !stalled */
3785 			continue;
3786 
3787 		pm->pm_stalled = 0;
3788 		ri = PMC_TO_ROWINDEX(pm);
3789 		(*md->pmd_start_pmc)(cpu, ri);
3790 	}
3791 }
3792 
3793 /*
3794  * Event handlers.
3795  */
3796 
3797 /*
3798  * Handle a process exit.
3799  *
3800  * Remove this process from all hash tables.  If this process
3801  * owned any PMCs, turn off those PMCs and deallocate them,
3802  * removing any associations with target processes.
3803  *
3804  * This function will be called by the last 'thread' of a
3805  * process.
3806  *
3807  * XXX This eventhandler gets called early in the exit process.
3808  * Consider using a 'hook' invocation from thread_exit() or equivalent
3809  * spot.  Another negative is that kse_exit doesn't seem to call
3810  * exit1() [??].
3811  *
3812  */
3813 
3814 static void
3815 pmc_process_exit(void *arg __unused, struct proc *p)
3816 {
3817 	int is_using_hwpmcs;
3818 	int cpu;
3819 	unsigned int ri;
3820 	struct pmc *pm;
3821 	struct pmc_process *pp;
3822 	struct pmc_owner *po;
3823 	pmc_value_t newvalue, tmp;
3824 
3825 	PROC_LOCK(p);
3826 	is_using_hwpmcs = p->p_flag & P_HWPMC;
3827 	PROC_UNLOCK(p);
3828 
3829 	/*
3830 	 * Log a sysexit event to all SS PMC owners.
3831 	 */
3832 	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
3833 	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
3834 		    pmclog_process_sysexit(po, p->p_pid);
3835 
3836 	if (!is_using_hwpmcs)
3837 		return;
3838 
3839 	PMC_GET_SX_XLOCK();
3840 	PMCDBG(PRC,EXT,1,"process-exit proc=%p (%d, %s)", p, p->p_pid,
3841 	    p->p_comm);
3842 
3843 	/*
3844 	 * Since this code is invoked by the last thread in an exiting
3845 	 * process, we would have context switched IN at some prior
3846 	 * point.  However, with PREEMPTION, kernel mode context
3847 	 * switches may happen any time, so we want to disable a
3848 	 * context switch OUT until we get any PMCs targeting this
3849 	 * process off the hardware.
3850 	 *
3851 	 * We also need to atomically remove this process'
3852 	 * entry from our target process hash table, using
3853 	 * PMC_FLAG_REMOVE.
3854 	 */
3857 
3858 	critical_enter(); /* no preemption */
3859 
3860 	cpu = curthread->td_oncpu;
3861 
3862 	if ((pp = pmc_find_process_descriptor(p,
3863 		 PMC_FLAG_REMOVE)) != NULL) {
3864 
3865 		PMCDBG(PRC,EXT,2,
3866 		    "process-exit proc=%p pmc-process=%p", p, pp);
3867 
3868 		/*
3869 		 * The exiting process could be the target of some
3870 		 * PMCs that will be running on the currently
3871 		 * executing CPU.
3872 		 *
3873 		 * We need to turn these PMCs off like we
3874 		 * would do at context switch OUT time.
3875 		 */
3876 		for (ri = 0; ri < md->pmd_npmc; ri++) {
3877 
3878 			/*
3879 			 * Pick up the pmc pointer from hardware
3880 			 * state similar to the CSW_OUT code.
3881 			 */
3882 			pm = NULL;
3883 			(void) (*md->pmd_get_config)(cpu, ri, &pm);
3884 
3885 			PMCDBG(PRC,EXT,2, "ri=%d pm=%p", ri, pm);
3886 
3887 			if (pm == NULL ||
3888 			    !PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
3889 				continue;
3890 
3891 			PMCDBG(PRC,EXT,2, "ppmcs[%d]=%p pm=%p "
3892 			    "state=%d", ri, pp->pp_pmcs[ri].pp_pmc,
3893 			    pm, pm->pm_state);
3894 
3895 			KASSERT(PMC_TO_ROWINDEX(pm) == ri,
3896 			    ("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
3897 				__LINE__, PMC_TO_ROWINDEX(pm), ri));
3898 
3899 			KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
3900 			    ("[pmc,%d] pm %p != pp_pmcs[%d] %p",
3901 				__LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc));
3902 
3903 			(void) md->pmd_stop_pmc(cpu, ri);
3904 
3905 			KASSERT(pm->pm_runcount > 0,
3906 			    ("[pmc,%d] bad runcount ri %d rc %d",
3907 				__LINE__, ri, pm->pm_runcount));
3908 
3909 			/* Stop hardware only if it is actually running */
3910 			if (pm->pm_state == PMC_STATE_RUNNING &&
3911 			    pm->pm_stalled == 0) {
3912 				md->pmd_read_pmc(cpu, ri, &newvalue);
3913 				tmp = newvalue -
3914 				    PMC_PCPU_SAVED(cpu,ri);
3915 
3916 				mtx_pool_lock_spin(pmc_mtxpool, pm);
3917 				pm->pm_gv.pm_savedvalue += tmp;
3918 				pp->pp_pmcs[ri].pp_pmcval += tmp;
3919 				mtx_pool_unlock_spin(pmc_mtxpool, pm);
3920 			}
3921 
3922 			atomic_subtract_rel_32(&pm->pm_runcount,1);
3923 
3924 			KASSERT((int) pm->pm_runcount >= 0,
3925 			    ("[pmc,%d] runcount is %d", __LINE__, (int) pm->pm_runcount));
3926 
3927 			(void) md->pmd_config_pmc(cpu, ri, NULL);
3928 		}
3929 
3930 		/*
3931 		 * Inform the MD layer of this pseudo "context switch
3932 		 * out"
3933 		 */
3934 		(void) md->pmd_switch_out(pmc_pcpu[cpu], pp);
3935 
3936 		critical_exit(); /* ok to be pre-empted now */
3937 
3938 		/*
3939 		 * Unlink this process from the PMCs that are
3940 		 * targeting it.  This will send a signal to
3941 		 * all PMC owners whose PMCs are orphaned.
3942 		 *
3943 		 * Log PMC value at exit time if requested.
3944 		 */
3945 		for (ri = 0; ri < md->pmd_npmc; ri++)
3946 			if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) {
3947 				if (pm->pm_flags & PMC_F_NEEDS_LOGFILE &&
3948 				    PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)))
3949 					pmclog_process_procexit(pm, pp);
3950 				pmc_unlink_target_process(pm, pp);
3951 			}
3952 		FREE(pp, M_PMC);
3953 
3954 	} else
3955 		critical_exit(); /* pp == NULL */
3956 
3957 
3958 	/*
3959 	 * If the process owned PMCs, free them up and free up
3960 	 * memory.
3961 	 */
3962 	if ((po = pmc_find_owner_descriptor(p)) != NULL) {
3963 		pmc_remove_owner(po);
3964 		pmc_destroy_owner_descriptor(po);
3965 	}
3966 
3967 	sx_xunlock(&pmc_sx);
3968 }

/*
 * Handle a process fork.
 *
 * If the parent process 'p1' is under HWPMC monitoring, then copy
 * over any attached PMCs that have 'do_descendants' semantics.
 */

static void
pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *newproc,
    int flags)
{
	int is_using_hwpmcs;
	unsigned int ri;
	uint32_t do_descendants;
	struct pmc *pm;
	struct pmc_owner *po;
	struct pmc_process *ppnew, *ppold;

	(void) flags;		/* unused parameter */

	PROC_LOCK(p1);
	is_using_hwpmcs = p1->p_flag & P_HWPMC;
	PROC_UNLOCK(p1);

	/*
	 * If there are system-wide sampling PMCs active, we need to
	 * log all fork events to their owner's logs.
	 */

	LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
	    if (po->po_flags & PMC_PO_OWNS_LOGFILE)
		    pmclog_process_procfork(po, p1->p_pid, newproc->p_pid);

	if (!is_using_hwpmcs)
		return;

	PMC_GET_SX_XLOCK();
	PMCDBG(PMC,FRK,1, "process-fork proc=%p (%d, %s) -> %p", p1,
	    p1->p_pid, p1->p_comm, newproc);

	/*
	 * If the parent process (curthread->td_proc) is a
	 * target of any PMCs, look for PMCs that are to be
	 * inherited, and link these into the new process
	 * descriptor.
	 */
	if ((ppold = pmc_find_process_descriptor(curthread->td_proc,
		 PMC_FLAG_NONE)) == NULL)
		goto done;		/* nothing to do */

	do_descendants = 0;
	for (ri = 0; ri < md->pmd_npmc; ri++)
		if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL)
			do_descendants |= pm->pm_flags & PMC_F_DESCENDANTS;
	if (do_descendants == 0) /* nothing to do */
		goto done;

	/* allocate a descriptor for the new process */
	if ((ppnew = pmc_find_process_descriptor(newproc,
		 PMC_FLAG_ALLOCATE)) == NULL)
		goto done;

	/*
	 * Run through all PMCs that were targeting the old process
	 * and which specified F_DESCENDANTS and attach them to the
	 * new process.
	 *
	 * Log the fork event to all owners of PMCs attached to this
	 * process, if not already logged.
	 */
	for (ri = 0; ri < md->pmd_npmc; ri++)
		if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL &&
		    (pm->pm_flags & PMC_F_DESCENDANTS)) {
			pmc_link_target_process(pm, ppnew);
			po = pm->pm_owner;
			if (po->po_sscount == 0 &&
			    po->po_flags & PMC_PO_OWNS_LOGFILE)
				pmclog_process_procfork(po, p1->p_pid,
				    newproc->p_pid);
		}

	/*
	 * Now mark the new process as being tracked by this driver.
	 */
	PROC_LOCK(newproc);
	newproc->p_flag |= P_HWPMC;
	PROC_UNLOCK(newproc);

 done:
	sx_xunlock(&pmc_sx);
}
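
/*
 * For reference: the 'do_descendants' semantics used above are
 * requested by a PMC's owner at allocation time via the
 * PMC_F_DESCENDANTS flag.  A minimal userland sketch using pmc(3)
 * could look like this (the event specifier "instructions" and the
 * 'targetpid' variable are illustrative only):
 *
 *	pmc_id_t pmcid;
 *
 *	if (pmc_allocate("instructions", PMC_MODE_TC,
 *	    PMC_F_DESCENDANTS, PMC_CPU_ANY, &pmcid) == 0) {
 *		(void) pmc_attach(pmcid, targetpid);
 *		(void) pmc_start(pmcid);
 *	}
 *
 * PMCs allocated with this flag are linked to each child process at
 * fork time by pmc_process_fork() above.
 */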

/*
 * initialization
 */

static const char *pmc_name_of_pmcclass[] = {
#undef	__PMC_CLASS
#define	__PMC_CLASS(N) #N ,
	__PMC_CLASSES()
};

static int
pmc_initialize(void)
{
	int cpu, error, n;
	struct pmc_binding pb;
	struct pmc_samplebuffer *sb;

	md = NULL;
	error = 0;

#ifdef	DEBUG
	/* parse debug flags first */
	if (TUNABLE_STR_FETCH(PMC_SYSCTL_NAME_PREFIX "debugflags",
		pmc_debugstr, sizeof(pmc_debugstr)))
		pmc_debugflags_parse(pmc_debugstr,
		    pmc_debugstr+strlen(pmc_debugstr));
#endif

	PMCDBG(MOD,INI,0, "PMC Initialize (version %x)", PMC_VERSION);

	/* check kernel version */
	if (pmc_kernel_version != PMC_VERSION) {
		if (pmc_kernel_version == 0)
			printf("hwpmc: this kernel has not been compiled with "
			    "'options HWPMC_HOOKS'.\n");
		else
			printf("hwpmc: kernel version (0x%x) does not match "
			    "module version (0x%x).\n", pmc_kernel_version,
			    PMC_VERSION);
		return EPROGMISMATCH;
	}

	/*
	 * check sysctl parameters
	 */

	if (pmc_hashsize <= 0) {
		(void) printf("hwpmc: tunable hashsize=%d must be greater "
		    "than zero.\n", pmc_hashsize);
		pmc_hashsize = PMC_HASH_SIZE;
	}

	if (pmc_nsamples <= 0 || pmc_nsamples > 65535) {
		(void) printf("hwpmc: tunable nsamples=%d out of range.\n",
		    pmc_nsamples);
		pmc_nsamples = PMC_NSAMPLES;
	}

	md = pmc_md_initialize();

	if (md == NULL || md->pmd_init == NULL)
		return ENOSYS;

	/* allocate space for the per-cpu array */
	MALLOC(pmc_pcpu, struct pmc_cpu **, mp_ncpus * sizeof(struct pmc_cpu *),
	    M_PMC, M_WAITOK|M_ZERO);

	/* per-cpu 'saved values' for managing process-mode PMCs */
	MALLOC(pmc_pcpu_saved, pmc_value_t *,
	    sizeof(pmc_value_t) * mp_ncpus * md->pmd_npmc, M_PMC, M_WAITOK);
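	/*
	 * One slot exists per (cpu, pmc row) pair.  The context
	 * switch code records a counter's value here when a thread
	 * is switched in and computes the delta against it when the
	 * thread is switched out (see the pseudo switch-out handling
	 * in pmc_process_exit() above).
	 */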

	/* perform cpu dependent initialization */
	pmc_save_cpu_binding(&pb);
	for (cpu = 0; cpu < mp_ncpus; cpu++) {
		if (pmc_cpu_is_disabled(cpu))
			continue;
		pmc_select_cpu(cpu);
		if ((error = md->pmd_init(cpu)) != 0)
			break;
	}
	pmc_restore_cpu_binding(&pb);
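	/*
	 * Note: pmc_select_cpu() binds the current thread to the
	 * target CPU so that md->pmd_init() executes on the hardware
	 * it is initializing; the original binding saved in 'pb' is
	 * restored afterwards.
	 */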

	if (error != 0)
		return error;

	/* allocate space for the sample array */
	for (cpu = 0; cpu < mp_ncpus; cpu++) {
		if (pmc_cpu_is_disabled(cpu))
			continue;
		MALLOC(sb, struct pmc_samplebuffer *,
		    sizeof(struct pmc_samplebuffer) +
		    pmc_nsamples * sizeof(struct pmc_sample), M_PMC,
		    M_WAITOK|M_ZERO);

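		/*
		 * Each buffer is used as a ring: ps_write chases
		 * ps_read through the pmc_nsamples entries between
		 * ps_samples[] and the ps_fence sentinel.
		 */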
		sb->ps_read = sb->ps_write = sb->ps_samples;
		sb->ps_fence = sb->ps_samples + pmc_nsamples;
		KASSERT(pmc_pcpu[cpu] != NULL,
		    ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu));

		pmc_pcpu[cpu]->pc_sb = sb;
	}

	/* allocate space for the row disposition array */
	pmc_pmcdisp = malloc(sizeof(int) * md->pmd_npmc,
	    M_PMC, M_WAITOK|M_ZERO);

	KASSERT(pmc_pmcdisp != NULL,
	    ("[pmc,%d] pmcdisp allocation returned NULL", __LINE__));

	/* mark all PMCs as available */
	for (n = 0; n < (int) md->pmd_npmc; n++)
		PMC_MARK_ROW_FREE(n);

	/* allocate owner and process hash tables */
	pmc_ownerhash = hashinit(pmc_hashsize, M_PMC,
	    &pmc_ownerhashmask);

	pmc_processhash = hashinit(pmc_hashsize, M_PMC,
	    &pmc_processhashmask);
	mtx_init(&pmc_processhash_mtx, "pmc-process-hash", "pmc", MTX_SPIN);
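	/*
	 * A spin mutex protects the process hash since descriptor
	 * lookups can occur from context switch hooks, where sleeping
	 * is not allowed.
	 */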

	LIST_INIT(&pmc_ss_owners);
	pmc_ss_count = 0;

	/* allocate a pool of spin mutexes */
	pmc_mtxpool = mtx_pool_create("pmc", pmc_mtxpool_size, MTX_SPIN);

	PMCDBG(MOD,INI,1, "pmc_ownerhash=%p, mask=0x%lx "
	    "targethash=%p mask=0x%lx", pmc_ownerhash, pmc_ownerhashmask,
	    pmc_processhash, pmc_processhashmask);

	/* register process {exit,fork} handlers */
	pmc_exit_tag = EVENTHANDLER_REGISTER(process_exit,
	    pmc_process_exit, NULL, EVENTHANDLER_PRI_ANY);
	pmc_fork_tag = EVENTHANDLER_REGISTER(process_fork,
	    pmc_process_fork, NULL, EVENTHANDLER_PRI_ANY);

	/* initialize logging */
	pmclog_initialize();

	/* set hook functions */
	pmc_intr = md->pmd_intr;
	pmc_hook = pmc_hook_handler;

	if (error == 0) {
		printf(PMC_MODULE_NAME ":");
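		/*
		 * "%b" is the printf(9) bit-field decoder: the leading
		 * "\20" selects hexadecimal output, and each following
		 * "\<bit>NAME" sequence names one pm_caps capability
		 * bit.
		 */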
		for (n = 0; n < (int) md->pmd_nclass; n++) {
			printf(" %s/%d/0x%b",
			    pmc_name_of_pmcclass[md->pmd_classes[n].pm_class],
			    md->pmd_nclasspmcs[n],
			    md->pmd_classes[n].pm_caps,
			    "\20"
			    "\1INT\2USR\3SYS\4EDG\5THR"
			    "\6REA\7WRI\10INV\11QUA\12PRC"
			    "\13TAG\14CSC");
		}
		printf("\n");
	}

	return error;
}

/* prepare to be unloaded */
static void
pmc_cleanup(void)
{
	int cpu;
	struct pmc_ownerhash *ph;
	struct pmc_owner *po, *tmp;
	struct pmc_binding pb;
#ifdef	DEBUG
	struct pmc_processhash *prh;
#endif

	PMCDBG(MOD,INI,0, "%s", "cleanup");

	/* switch off sampling */
	atomic_store_rel_int(&pmc_cpumask, 0);
	pmc_intr = NULL;

	sx_xlock(&pmc_sx);
	if (pmc_hook == NULL) {	/* being unloaded already */
		sx_xunlock(&pmc_sx);
		return;
	}

	pmc_hook = NULL; /* prevent new threads from entering module */

	/* deregister event handlers */
	EVENTHANDLER_DEREGISTER(process_fork, pmc_fork_tag);
	EVENTHANDLER_DEREGISTER(process_exit, pmc_exit_tag);

	/* send SIGBUS to all owner processes, free up allocations */
	if (pmc_ownerhash)
		for (ph = pmc_ownerhash;
		     ph <= &pmc_ownerhash[pmc_ownerhashmask];
		     ph++) {
			LIST_FOREACH_SAFE(po, ph, po_next, tmp) {
				pmc_remove_owner(po);

				/* send SIGBUS to owner processes */
				PMCDBG(MOD,INI,2, "cleanup signal proc=%p "
				    "(%d, %s)", po->po_owner,
				    po->po_owner->p_pid,
				    po->po_owner->p_comm);

				PROC_LOCK(po->po_owner);
				psignal(po->po_owner, SIGBUS);
				PROC_UNLOCK(po->po_owner);

				pmc_destroy_owner_descriptor(po);
			}
		}

	/* reclaim allocated data structures */
	if (pmc_mtxpool)
		mtx_pool_destroy(&pmc_mtxpool);

	mtx_destroy(&pmc_processhash_mtx);
	if (pmc_processhash) {
#ifdef	DEBUG
		struct pmc_process *pp;

		PMCDBG(MOD,INI,3, "%s", "destroy process hash");
		for (prh = pmc_processhash;
		     prh <= &pmc_processhash[pmc_processhashmask];
		     prh++)
			LIST_FOREACH(pp, prh, pp_next)
			    PMCDBG(MOD,INI,3, "pid=%d", pp->pp_proc->p_pid);
#endif

		hashdestroy(pmc_processhash, M_PMC, pmc_processhashmask);
		pmc_processhash = NULL;
	}

	if (pmc_ownerhash) {
		PMCDBG(MOD,INI,3, "%s", "destroy owner hash");
		hashdestroy(pmc_ownerhash, M_PMC, pmc_ownerhashmask);
		pmc_ownerhash = NULL;
	}

	KASSERT(LIST_EMPTY(&pmc_ss_owners),
	    ("[pmc,%d] Global SS owner list not empty", __LINE__));
	KASSERT(pmc_ss_count == 0,
	    ("[pmc,%d] Global SS count not zero", __LINE__));

	/* free the per-cpu sample buffers */
	for (cpu = 0; cpu < mp_ncpus; cpu++) {
		if (pmc_cpu_is_disabled(cpu))
			continue;
		KASSERT(pmc_pcpu[cpu]->pc_sb != NULL,
		    ("[pmc,%d] Null cpu sample buffer cpu=%d", __LINE__,
			cpu));
		FREE(pmc_pcpu[cpu]->pc_sb, M_PMC);
		pmc_pcpu[cpu]->pc_sb = NULL;
	}

	/* do processor dependent cleanup */
	PMCDBG(MOD,INI,3, "%s", "md cleanup");
	if (md) {
		pmc_save_cpu_binding(&pb);
		for (cpu = 0; cpu < mp_ncpus; cpu++) {
			PMCDBG(MOD,INI,1,"pmc-cleanup cpu=%d pcs=%p",
			    cpu, pmc_pcpu[cpu]);
			if (pmc_cpu_is_disabled(cpu))
				continue;
			pmc_select_cpu(cpu);
			if (pmc_pcpu[cpu])
				(void) md->pmd_cleanup(cpu);
		}
		FREE(md, M_PMC);
		md = NULL;
		pmc_restore_cpu_binding(&pb);
	}

	/* deallocate per-cpu structures */
	FREE(pmc_pcpu, M_PMC);
	pmc_pcpu = NULL;

	FREE(pmc_pcpu_saved, M_PMC);
	pmc_pcpu_saved = NULL;

	if (pmc_pmcdisp) {
		FREE(pmc_pmcdisp, M_PMC);
		pmc_pmcdisp = NULL;
	}

	pmclog_shutdown();

	sx_xunlock(&pmc_sx);	/* we are done */
}

/*
 * The function called at load/unload.
 */

static int
load(struct module *module __unused, int cmd, void *arg __unused)
{
	int error;

	error = 0;

	switch (cmd) {
	case MOD_LOAD:
		/* initialize the subsystem */
		error = pmc_initialize();
		if (error != 0)
			break;
		PMCDBG(MOD,INI,1, "syscall=%d ncpus=%d",
		    pmc_syscall_num, mp_ncpus);
		break;

	case MOD_UNLOAD:
	case MOD_SHUTDOWN:
		pmc_cleanup();
		PMCDBG(MOD,INI,1, "%s", "unloaded");
		break;

	default:
		error = EINVAL;	/* XXX should panic(9) */
		break;
	}

	return error;
}

/* memory pool */
MALLOC_DEFINE(M_PMC, "pmc", "Memory space for the PMC module");