xref: /illumos-gate/usr/src/uts/common/os/kcpc.c (revision 0ebf3797ed9aceba2a3b361cf14badb82ac13478)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/param.h>
30 #include <sys/thread.h>
31 #include <sys/cpuvar.h>
32 #include <sys/inttypes.h>
33 #include <sys/cmn_err.h>
34 #include <sys/time.h>
35 #include <sys/ksynch.h>
36 #include <sys/systm.h>
37 #include <sys/kcpc.h>
38 #include <sys/cpc_impl.h>
39 #include <sys/cpc_pcbe.h>
40 #include <sys/atomic.h>
41 #include <sys/sunddi.h>
42 #include <sys/modctl.h>
43 #include <sys/sdt.h>
44 #if defined(__x86)
45 #include <asm/clock.h>
46 #endif
47 
/*
 * Every context handed out by kcpc_ctx_alloc() is linked onto one of these
 * hash chains (the bucket is chosen by CPC_HASH_CTX()) and stays there until
 * kcpc_ctx_free() unlinks it.  Each bucket has its own lock.
 */
kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
kcpc_ctx_t	*kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */


krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
int		kcpc_cpuctx;		/* number of cpu-specific contexts */

int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */

/*
 * These are set when a PCBE module is loaded (see kcpc_register_pcbe()).
 * pcbe_ops stays NULL until then; kcpc_hw_overflow_intr() checks for that.
 */
uint_t		cpc_ncounters = 0;
pcbe_ops_t	*pcbe_ops = NULL;

/*
 * Statistics on (mis)behavior.  Both are bumped with atomic_add_32() from
 * kcpc_overflow_intr().
 */
static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */

/*
 * Is misbehaviour (overflow in a thread with no context) fatal?
 * Defaults to fatal on DEBUG kernels only.
 */
#ifdef DEBUG
static int kcpc_nullctx_panic = 1;
#else
static int kcpc_nullctx_panic = 0;
#endif

/*
 * Forward declarations of the file-local helpers.
 */
static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
static void kcpc_restore(kcpc_ctx_t *ctx);
static void kcpc_save(kcpc_ctx_t *ctx);
static void kcpc_free(kcpc_ctx_t *ctx, int isexec);
static int kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode);
static void kcpc_free_configs(kcpc_set_t *set);
static kcpc_ctx_t *kcpc_ctx_alloc(void);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static void kcpc_ctx_free(kcpc_ctx_t *ctx);
static int kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
90 
91 void
92 kcpc_register_pcbe(pcbe_ops_t *ops)
93 {
94 	pcbe_ops = ops;
95 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
96 }
97 
98 int
99 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
100 {
101 	cpu_t		*cp;
102 	kcpc_ctx_t	*ctx;
103 	int		error;
104 
105 	ctx = kcpc_ctx_alloc();
106 
107 	if (kcpc_assign_reqs(set, ctx) != 0) {
108 		kcpc_ctx_free(ctx);
109 		*subcode = CPC_RESOURCE_UNAVAIL;
110 		return (EINVAL);
111 	}
112 
113 	ctx->kc_cpuid = cpuid;
114 	ctx->kc_thread = curthread;
115 
116 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
117 
118 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
119 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
120 		kcpc_ctx_free(ctx);
121 		return (error);
122 	}
123 
124 	set->ks_ctx = ctx;
125 	ctx->kc_set = set;
126 
127 	/*
128 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
129 	 * we are manipulating the cpu_t and programming the hardware, else the
130 	 * the cpu_t could go away while we're looking at it.
131 	 */
132 	mutex_enter(&cpu_lock);
133 	cp = cpu_get(cpuid);
134 
135 	if (cp == NULL)
136 		/*
137 		 * The CPU could have been DRd out while we were getting set up.
138 		 */
139 		goto unbound;
140 
141 	mutex_enter(&cp->cpu_cpc_ctxlock);
142 
143 	if (cp->cpu_cpc_ctx != NULL) {
144 		/*
145 		 * If this CPU already has a bound set, return an error.
146 		 */
147 		mutex_exit(&cp->cpu_cpc_ctxlock);
148 		goto unbound;
149 	}
150 
151 	if (curthread->t_bind_cpu != cpuid) {
152 		mutex_exit(&cp->cpu_cpc_ctxlock);
153 		goto unbound;
154 	}
155 	cp->cpu_cpc_ctx = ctx;
156 
157 	/*
158 	 * Kernel preemption must be disabled while fiddling with the hardware
159 	 * registers to prevent partial updates.
160 	 */
161 	kpreempt_disable();
162 	ctx->kc_rawtick = KCPC_GET_TICK();
163 	pcbe_ops->pcbe_program(ctx);
164 	kpreempt_enable();
165 
166 	mutex_exit(&cp->cpu_cpc_ctxlock);
167 	mutex_exit(&cpu_lock);
168 
169 	mutex_enter(&set->ks_lock);
170 	set->ks_state |= KCPC_SET_BOUND;
171 	cv_signal(&set->ks_condv);
172 	mutex_exit(&set->ks_lock);
173 
174 	return (0);
175 
176 unbound:
177 	mutex_exit(&cpu_lock);
178 	set->ks_ctx = NULL;
179 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
180 	kcpc_ctx_free(ctx);
181 	return (EAGAIN);
182 }
183 
184 int
185 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
186 {
187 	kcpc_ctx_t	*ctx;
188 	int		error;
189 
190 	/*
191 	 * Only one set is allowed per context, so ensure there is no
192 	 * existing context.
193 	 */
194 
195 	if (t->t_cpc_ctx != NULL)
196 		return (EEXIST);
197 
198 	ctx = kcpc_ctx_alloc();
199 
200 	/*
201 	 * The context must begin life frozen until it has been properly
202 	 * programmed onto the hardware. This prevents the context ops from
203 	 * worrying about it until we're ready.
204 	 */
205 	ctx->kc_flags |= KCPC_CTX_FREEZE;
206 	ctx->kc_hrtime = gethrtime();
207 
208 	if (kcpc_assign_reqs(set, ctx) != 0) {
209 		kcpc_ctx_free(ctx);
210 		*subcode = CPC_RESOURCE_UNAVAIL;
211 		return (EINVAL);
212 	}
213 
214 	ctx->kc_cpuid = -1;
215 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
216 		ctx->kc_flags |= KCPC_CTX_LWPINHERIT;
217 	ctx->kc_thread = t;
218 	t->t_cpc_ctx = ctx;
219 	/*
220 	 * Permit threads to look at their own hardware counters from userland.
221 	 */
222 	ctx->kc_flags |= KCPC_CTX_NONPRIV;
223 
224 	/*
225 	 * Create the data store for this set.
226 	 */
227 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
228 
229 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
230 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
231 		kcpc_ctx_free(ctx);
232 		t->t_cpc_ctx = NULL;
233 		return (error);
234 	}
235 
236 	set->ks_ctx = ctx;
237 	ctx->kc_set = set;
238 
239 	/*
240 	 * Add a device context to the subject thread.
241 	 */
242 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
243 	    kcpc_lwp_create, NULL, kcpc_free);
244 
245 	/*
246 	 * Ask the backend to program the hardware.
247 	 */
248 	if (t == curthread) {
249 		kpreempt_disable();
250 		ctx->kc_rawtick = KCPC_GET_TICK();
251 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
252 		pcbe_ops->pcbe_program(ctx);
253 		kpreempt_enable();
254 	} else
255 		/*
256 		 * Since we are the agent LWP, we know the victim LWP is stopped
257 		 * until we're done here; no need to worry about preemption or
258 		 * migration here. We still use an atomic op to clear the flag
259 		 * to ensure the flags are always self-consistent; they can
260 		 * still be accessed from, for instance, another CPU doing a
261 		 * kcpc_invalidate_all().
262 		 */
263 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
264 
265 	mutex_enter(&set->ks_lock);
266 	set->ks_state |= KCPC_SET_BOUND;
267 	cv_signal(&set->ks_condv);
268 	mutex_exit(&set->ks_lock);
269 
270 	return (0);
271 }
272 
273 /*
274  * Walk through each request in the set and ask the PCBE to configure a
275  * corresponding counter.
276  */
277 static int
278 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
279 {
280 	int		i;
281 	int		ret;
282 	kcpc_request_t	*rp;
283 
284 	for (i = 0; i < set->ks_nreqs; i++) {
285 		int n;
286 		rp = &set->ks_req[i];
287 
288 		n = rp->kr_picnum;
289 
290 		ASSERT(n >= 0 && n < cpc_ncounters);
291 
292 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
293 
294 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
295 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
296 			    == 0) {
297 				*subcode = -1;
298 				return (ENOTSUP);
299 			}
300 			/*
301 			 * If any of the counters have requested overflow
302 			 * notification, we flag the context as being one that
303 			 * cares about overflow.
304 			 */
305 			ctx->kc_flags |= KCPC_CTX_SIGOVF;
306 		}
307 
308 		rp->kr_config = NULL;
309 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
310 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
311 		    &(rp->kr_config), (void *)ctx)) != 0) {
312 			kcpc_free_configs(set);
313 			*subcode = ret;
314 			switch (ret) {
315 			case CPC_ATTR_REQUIRES_PRIVILEGE:
316 			case CPC_HV_NO_ACCESS:
317 				return (EACCES);
318 			default:
319 				return (EINVAL);
320 			}
321 		}
322 
323 		ctx->kc_pics[n].kp_req = rp;
324 		rp->kr_picp = &ctx->kc_pics[n];
325 		rp->kr_data = set->ks_data + rp->kr_index;
326 		*rp->kr_data = rp->kr_preset;
327 	}
328 
329 	return (0);
330 }
331 
332 static void
333 kcpc_free_configs(kcpc_set_t *set)
334 {
335 	int i;
336 
337 	for (i = 0; i < set->ks_nreqs; i++)
338 		if (set->ks_req[i].kr_config != NULL)
339 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
340 }
341 
342 /*
343  * buf points to a user address and the data should be copied out to that
344  * address in the current process.
345  */
346 int
347 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
348 {
349 	kcpc_ctx_t	*ctx = set->ks_ctx;
350 	uint64_t	curtick = KCPC_GET_TICK();
351 
352 	mutex_enter(&set->ks_lock);
353 	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
354 		mutex_exit(&set->ks_lock);
355 		return (EINVAL);
356 	}
357 	mutex_exit(&set->ks_lock);
358 
359 	if (ctx->kc_flags & KCPC_CTX_INVALID)
360 		return (EAGAIN);
361 
362 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
363 		/*
364 		 * Kernel preemption must be disabled while reading the
365 		 * hardware regs, and if this is a CPU-bound context, while
366 		 * checking the CPU binding of the current thread.
367 		 */
368 		kpreempt_disable();
369 
370 		if (ctx->kc_cpuid != -1) {
371 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
372 				kpreempt_enable();
373 				return (EAGAIN);
374 			}
375 		}
376 
377 		if (ctx->kc_thread == curthread) {
378 			ctx->kc_hrtime = gethrtime();
379 			pcbe_ops->pcbe_sample(ctx);
380 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
381 			ctx->kc_rawtick = curtick;
382 		}
383 
384 		kpreempt_enable();
385 
386 		/*
387 		 * The config may have been invalidated by
388 		 * the pcbe_sample op.
389 		 */
390 		if (ctx->kc_flags & KCPC_CTX_INVALID)
391 			return (EAGAIN);
392 	}
393 
394 	if (copyout(set->ks_data, buf,
395 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
396 		return (EFAULT);
397 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
398 		return (EFAULT);
399 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
400 		return (EFAULT);
401 
402 	return (0);
403 }
404 
405 /*
406  * Stop the counters on the CPU this context is bound to.
407  */
408 static void
409 kcpc_stop_hw(kcpc_ctx_t *ctx)
410 {
411 	cpu_t *cp;
412 
413 	ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED))
414 	    == KCPC_CTX_INVALID);
415 
416 	kpreempt_disable();
417 
418 	cp = cpu_get(ctx->kc_cpuid);
419 	ASSERT(cp != NULL);
420 
421 	if (cp == CPU) {
422 		pcbe_ops->pcbe_allstop();
423 		atomic_or_uint(&ctx->kc_flags,
424 		    KCPC_CTX_INVALID_STOPPED);
425 	} else
426 		kcpc_remote_stop(cp);
427 	kpreempt_enable();
428 }
429 
430 int
431 kcpc_unbind(kcpc_set_t *set)
432 {
433 	kcpc_ctx_t	*ctx;
434 	kthread_t	*t;
435 
436 	/*
437 	 * We could be racing with the process's agent thread as it
438 	 * binds the set; we must wait for the set to finish binding
439 	 * before attempting to tear it down.
440 	 */
441 	mutex_enter(&set->ks_lock);
442 	while ((set->ks_state & KCPC_SET_BOUND) == 0)
443 		cv_wait(&set->ks_condv, &set->ks_lock);
444 	mutex_exit(&set->ks_lock);
445 
446 	ctx = set->ks_ctx;
447 
448 	/*
449 	 * Use kc_lock to synchronize with kcpc_restore().
450 	 */
451 	mutex_enter(&ctx->kc_lock);
452 	ctx->kc_flags |= KCPC_CTX_INVALID;
453 	mutex_exit(&ctx->kc_lock);
454 
455 	if (ctx->kc_cpuid == -1) {
456 		t = ctx->kc_thread;
457 		/*
458 		 * The context is thread-bound and therefore has a device
459 		 * context.  It will be freed via removectx() calling
460 		 * freectx() calling kcpc_free().
461 		 */
462 		if (t == curthread &&
463 		    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
464 			kpreempt_disable();
465 			pcbe_ops->pcbe_allstop();
466 			atomic_or_uint(&ctx->kc_flags,
467 			    KCPC_CTX_INVALID_STOPPED);
468 			kpreempt_enable();
469 		}
470 #ifdef DEBUG
471 		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
472 		    kcpc_lwp_create, NULL, kcpc_free) == 0)
473 			panic("kcpc_unbind: context %p not preset on thread %p",
474 			    ctx, t);
475 #else
476 		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
477 		    kcpc_lwp_create, NULL, kcpc_free);
478 #endif /* DEBUG */
479 		t->t_cpc_set = NULL;
480 		t->t_cpc_ctx = NULL;
481 	} else {
482 		/*
483 		 * If we are unbinding a CPU-bound set from a remote CPU, the
484 		 * native CPU's idle thread could be in the midst of programming
485 		 * this context onto the CPU. We grab the context's lock here to
486 		 * ensure that the idle thread is done with it. When we release
487 		 * the lock, the CPU no longer has a context and the idle thread
488 		 * will move on.
489 		 *
490 		 * cpu_lock must be held to prevent the CPU from being DR'd out
491 		 * while we disassociate the context from the cpu_t.
492 		 */
493 		cpu_t *cp;
494 		mutex_enter(&cpu_lock);
495 		cp = cpu_get(ctx->kc_cpuid);
496 		if (cp != NULL) {
497 			/*
498 			 * The CPU may have been DR'd out of the system.
499 			 */
500 			mutex_enter(&cp->cpu_cpc_ctxlock);
501 			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
502 				kcpc_stop_hw(ctx);
503 			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
504 			cp->cpu_cpc_ctx = NULL;
505 			mutex_exit(&cp->cpu_cpc_ctxlock);
506 		}
507 		mutex_exit(&cpu_lock);
508 		if (ctx->kc_thread == curthread) {
509 			kcpc_free(ctx, 0);
510 			curthread->t_cpc_set = NULL;
511 		}
512 	}
513 
514 	return (0);
515 }
516 
517 int
518 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
519 {
520 	int i;
521 
522 	ASSERT(set != NULL);
523 	ASSERT(set->ks_state & KCPC_SET_BOUND);
524 	ASSERT(set->ks_ctx->kc_thread == curthread);
525 	ASSERT(set->ks_ctx->kc_cpuid == -1);
526 
527 	if (index < 0 || index >= set->ks_nreqs)
528 		return (EINVAL);
529 
530 	for (i = 0; i < set->ks_nreqs; i++)
531 		if (set->ks_req[i].kr_index == index)
532 			break;
533 	ASSERT(i != set->ks_nreqs);
534 
535 	set->ks_req[i].kr_preset = preset;
536 	return (0);
537 }
538 
539 int
540 kcpc_restart(kcpc_set_t *set)
541 {
542 	kcpc_ctx_t	*ctx = set->ks_ctx;
543 	int		i;
544 
545 	ASSERT(set->ks_state & KCPC_SET_BOUND);
546 	ASSERT(ctx->kc_thread == curthread);
547 	ASSERT(ctx->kc_cpuid == -1);
548 
549 	kpreempt_disable();
550 
551 	/*
552 	 * If the user is doing this on a running set, make sure the counters
553 	 * are stopped first.
554 	 */
555 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
556 		pcbe_ops->pcbe_allstop();
557 
558 	for (i = 0; i < set->ks_nreqs; i++) {
559 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
560 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
561 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
562 	}
563 
564 	/*
565 	 * Ask the backend to program the hardware.
566 	 */
567 	ctx->kc_rawtick = KCPC_GET_TICK();
568 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
569 	pcbe_ops->pcbe_program(ctx);
570 	kpreempt_enable();
571 
572 	return (0);
573 }
574 
575 /*
576  * Caller must hold kcpc_cpuctx_lock.
577  */
578 int
579 kcpc_enable(kthread_t *t, int cmd, int enable)
580 {
581 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
582 	kcpc_set_t	*set = t->t_cpc_set;
583 	kcpc_set_t	*newset;
584 	int		i;
585 	int		flag;
586 	int		err;
587 
588 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
589 
590 	if (ctx == NULL) {
591 		/*
592 		 * This thread has a set but no context; it must be a
593 		 * CPU-bound set.
594 		 */
595 		ASSERT(t->t_cpc_set != NULL);
596 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
597 		return (EINVAL);
598 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
599 		return (EAGAIN);
600 
601 	if (cmd == CPC_ENABLE) {
602 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
603 			return (EINVAL);
604 		kpreempt_disable();
605 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
606 		kcpc_restore(ctx);
607 		kpreempt_enable();
608 	} else if (cmd == CPC_DISABLE) {
609 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
610 			return (EINVAL);
611 		kpreempt_disable();
612 		kcpc_save(ctx);
613 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
614 		kpreempt_enable();
615 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
616 		/*
617 		 * Strategy for usr/sys: stop counters and update set's presets
618 		 * with current counter values, unbind, update requests with
619 		 * new config, then re-bind.
620 		 */
621 		flag = (cmd == CPC_USR_EVENTS) ?
622 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
623 
624 		kpreempt_disable();
625 		atomic_or_uint(&ctx->kc_flags,
626 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
627 		pcbe_ops->pcbe_allstop();
628 		kpreempt_enable();
629 		for (i = 0; i < set->ks_nreqs; i++) {
630 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
631 			if (enable)
632 				set->ks_req[i].kr_flags |= flag;
633 			else
634 				set->ks_req[i].kr_flags &= ~flag;
635 		}
636 		newset = kcpc_dup_set(set);
637 		if (kcpc_unbind(set) != 0)
638 			return (EINVAL);
639 		t->t_cpc_set = newset;
640 		if (kcpc_bind_thread(newset, t, &err) != 0) {
641 			t->t_cpc_set = NULL;
642 			kcpc_free_set(newset);
643 			return (EINVAL);
644 		}
645 	} else
646 		return (EINVAL);
647 
648 	return (0);
649 }
650 
651 /*
652  * Provide PCBEs with a way of obtaining the configs of every counter which will
653  * be programmed together.
654  *
655  * If current is NULL, provide the first config.
656  *
657  * If data != NULL, caller wants to know where the data store associated with
658  * the config we return is located.
659  */
660 void *
661 kcpc_next_config(void *token, void *current, uint64_t **data)
662 {
663 	int		i;
664 	kcpc_pic_t	*pic;
665 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
666 
667 	if (current == NULL) {
668 		/*
669 		 * Client would like the first config, which may not be in
670 		 * counter 0; we need to search through the counters for the
671 		 * first config.
672 		 */
673 		for (i = 0; i < cpc_ncounters; i++)
674 			if (ctx->kc_pics[i].kp_req != NULL)
675 				break;
676 		/*
677 		 * There are no counters configured for the given context.
678 		 */
679 		if (i == cpc_ncounters)
680 			return (NULL);
681 	} else {
682 		/*
683 		 * There surely is a faster way to do this.
684 		 */
685 		for (i = 0; i < cpc_ncounters; i++) {
686 			pic = &ctx->kc_pics[i];
687 
688 			if (pic->kp_req != NULL &&
689 			    current == pic->kp_req->kr_config)
690 				break;
691 		}
692 
693 		/*
694 		 * We found the current config at picnum i. Now search for the
695 		 * next configured PIC.
696 		 */
697 		for (i++; i < cpc_ncounters; i++) {
698 			pic = &ctx->kc_pics[i];
699 			if (pic->kp_req != NULL)
700 				break;
701 		}
702 
703 		if (i == cpc_ncounters)
704 			return (NULL);
705 	}
706 
707 	if (data != NULL) {
708 		*data = ctx->kc_pics[i].kp_req->kr_data;
709 	}
710 
711 	return (ctx->kc_pics[i].kp_req->kr_config);
712 }
713 
714 
715 static kcpc_ctx_t *
716 kcpc_ctx_alloc(void)
717 {
718 	kcpc_ctx_t	*ctx;
719 	long		hash;
720 
721 	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), KM_SLEEP);
722 
723 	hash = CPC_HASH_CTX(ctx);
724 	mutex_enter(&kcpc_ctx_llock[hash]);
725 	ctx->kc_next = kcpc_ctx_list[hash];
726 	kcpc_ctx_list[hash] = ctx;
727 	mutex_exit(&kcpc_ctx_llock[hash]);
728 
729 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
730 	    cpc_ncounters, KM_SLEEP);
731 
732 	ctx->kc_cpuid = -1;
733 
734 	return (ctx);
735 }
736 
737 /*
738  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
739  * in the flags.
740  */
741 static void
742 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
743 {
744 	kcpc_set_t	*ks = ctx->kc_set, *cks;
745 	int		i, j;
746 	int		code;
747 
748 	ASSERT(ks != NULL);
749 
750 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
751 		return;
752 
753 	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
754 	cks->ks_state &= ~KCPC_SET_BOUND;
755 	cctx->kc_set = cks;
756 	cks->ks_flags = ks->ks_flags;
757 	cks->ks_nreqs = ks->ks_nreqs;
758 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
759 	    sizeof (kcpc_request_t), KM_SLEEP);
760 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
761 	    KM_SLEEP);
762 	cks->ks_ctx = cctx;
763 
764 	for (i = 0; i < cks->ks_nreqs; i++) {
765 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
766 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
767 		(void) strncpy(cks->ks_req[i].kr_event,
768 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
769 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
770 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
771 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
772 		if (ks->ks_req[i].kr_nattrs > 0) {
773 			cks->ks_req[i].kr_attr =
774 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
775 			    sizeof (kcpc_attr_t), KM_SLEEP);
776 		}
777 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
778 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
779 			    ks->ks_req[i].kr_attr[j].ka_name,
780 			    CPC_MAX_ATTR_LEN);
781 			cks->ks_req[i].kr_attr[j].ka_val =
782 			    ks->ks_req[i].kr_attr[j].ka_val;
783 		}
784 	}
785 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
786 		kcpc_invalidate_config(cctx);
787 
788 	mutex_enter(&cks->ks_lock);
789 	cks->ks_state |= KCPC_SET_BOUND;
790 	cv_signal(&cks->ks_condv);
791 	mutex_exit(&cks->ks_lock);
792 }
793 
794 
795 static void
796 kcpc_ctx_free(kcpc_ctx_t *ctx)
797 {
798 	kcpc_ctx_t	**loc;
799 	long		hash = CPC_HASH_CTX(ctx);
800 
801 	mutex_enter(&kcpc_ctx_llock[hash]);
802 	loc = &kcpc_ctx_list[hash];
803 	ASSERT(*loc != NULL);
804 	while (*loc != ctx)
805 		loc = &(*loc)->kc_next;
806 	*loc = ctx->kc_next;
807 	mutex_exit(&kcpc_ctx_llock[hash]);
808 
809 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
810 	cv_destroy(&ctx->kc_condv);
811 	mutex_destroy(&ctx->kc_lock);
812 	kmem_free(ctx, sizeof (*ctx));
813 }
814 
815 /*
816  * Generic interrupt handler used on hardware that generates
817  * overflow interrupts.
818  *
819  * Note: executed at high-level interrupt context!
820  */
821 /*ARGSUSED*/
822 kcpc_ctx_t *
823 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
824 {
825 	kcpc_ctx_t	*ctx;
826 	kthread_t	*t = curthread;
827 	int		i;
828 
829 	/*
830 	 * On both x86 and UltraSPARC, we may deliver the high-level
831 	 * interrupt in kernel mode, just after we've started to run an
832 	 * interrupt thread.  (That's because the hardware helpfully
833 	 * delivers the overflow interrupt some random number of cycles
834 	 * after the instruction that caused the overflow by which time
835 	 * we're in some part of the kernel, not necessarily running on
836 	 * the right thread).
837 	 *
838 	 * Check for this case here -- find the pinned thread
839 	 * that was running when the interrupt went off.
840 	 */
841 	if (t->t_flag & T_INTR_THREAD) {
842 		klwp_t *lwp;
843 
844 		atomic_add_32(&kcpc_intrctx_count, 1);
845 
846 		/*
847 		 * Note that t_lwp is always set to point at the underlying
848 		 * thread, thus this will work in the presence of nested
849 		 * interrupts.
850 		 */
851 		ctx = NULL;
852 		if ((lwp = t->t_lwp) != NULL) {
853 			t = lwptot(lwp);
854 			ctx = t->t_cpc_ctx;
855 		}
856 	} else
857 		ctx = t->t_cpc_ctx;
858 
859 	if (ctx == NULL) {
860 		/*
861 		 * This can easily happen if we're using the counters in
862 		 * "shared" mode, for example, and an overflow interrupt
863 		 * occurs while we are running cpustat.  In that case, the
864 		 * bound thread that has the context that belongs to this
865 		 * CPU is almost certainly sleeping (if it was running on
866 		 * the CPU we'd have found it above), and the actual
867 		 * interrupted thread has no knowledge of performance counters!
868 		 */
869 		ctx = curthread->t_cpu->cpu_cpc_ctx;
870 		if (ctx != NULL) {
871 			/*
872 			 * Return the bound context for this CPU to
873 			 * the interrupt handler so that it can synchronously
874 			 * sample the hardware counters and restart them.
875 			 */
876 			return (ctx);
877 		}
878 
879 		/*
880 		 * As long as the overflow interrupt really is delivered early
881 		 * enough after trapping into the kernel to avoid switching
882 		 * threads, we must always be able to find the cpc context,
883 		 * or something went terribly wrong i.e. we ended up
884 		 * running a passivated interrupt thread, a kernel
885 		 * thread or we interrupted idle, all of which are Very Bad.
886 		 */
887 		if (kcpc_nullctx_panic)
888 			panic("null cpc context, thread %p", (void *)t);
889 		atomic_add_32(&kcpc_nullctx_count, 1);
890 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
891 		/*
892 		 * Schedule an ast to sample the counters, which will
893 		 * propagate any overflow into the virtualized performance
894 		 * counter(s), and may deliver a signal.
895 		 */
896 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
897 		/*
898 		 * If a counter has overflowed which was counting on behalf of
899 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
900 		 * process a signal.
901 		 */
902 		for (i = 0; i < cpc_ncounters; i++) {
903 			if (ctx->kc_pics[i].kp_req != NULL &&
904 			    bitmap & (1 << i) &&
905 			    ctx->kc_pics[i].kp_req->kr_flags &
906 			    CPC_OVF_NOTIFY_EMT) {
907 				/*
908 				 * A signal has been requested for this PIC, so
909 				 * so freeze the context. The interrupt handler
910 				 * has already stopped the counter hardware.
911 				 */
912 				atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
913 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
914 				    KCPC_PIC_OVERFLOWED);
915 			}
916 		}
917 		aston(t);
918 	}
919 	return (NULL);
920 }
921 
922 /*
923  * The current thread context had an overflow interrupt; we're
924  * executing here in high-level interrupt context.
925  */
926 /*ARGSUSED*/
927 uint_t
928 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
929 {
930 	kcpc_ctx_t	*ctx;
931 	uint64_t	bitmap;
932 
933 	if (pcbe_ops == NULL ||
934 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
935 		return (DDI_INTR_UNCLAIMED);
936 
937 	/*
938 	 * Prevent any further interrupts.
939 	 */
940 	pcbe_ops->pcbe_allstop();
941 
942 	/*
943 	 * Invoke the "generic" handler.
944 	 *
945 	 * If the interrupt has occurred in the context of an lwp owning
946 	 * the counters, then the handler posts an AST to the lwp to
947 	 * trigger the actual sampling, and optionally deliver a signal or
948 	 * restart the counters, on the way out of the kernel using
949 	 * kcpc_hw_overflow_ast() (see below).
950 	 *
951 	 * On the other hand, if the handler returns the context to us
952 	 * directly, then it means that there are no other threads in
953 	 * the middle of updating it, no AST has been posted, and so we
954 	 * should sample the counters here, and restart them with no
955 	 * further fuss.
956 	 */
957 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
958 		uint64_t curtick = KCPC_GET_TICK();
959 
960 		ctx->kc_hrtime = gethrtime_waitfree();
961 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
962 		ctx->kc_rawtick = curtick;
963 		pcbe_ops->pcbe_sample(ctx);
964 		pcbe_ops->pcbe_program(ctx);
965 	}
966 
967 	return (DDI_INTR_CLAIMED);
968 }
969 
970 /*
971  * Called from trap() when processing the ast posted by the high-level
972  * interrupt handler.
973  */
974 int
975 kcpc_overflow_ast()
976 {
977 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
978 	int		i;
979 	int		found = 0;
980 	uint64_t	curtick = KCPC_GET_TICK();
981 
982 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
983 
984 	/*
985 	 * An overflow happened: sample the context to ensure that
986 	 * the overflow is propagated into the upper bits of the
987 	 * virtualized 64-bit counter(s).
988 	 */
989 	kpreempt_disable();
990 	ctx->kc_hrtime = gethrtime_waitfree();
991 	pcbe_ops->pcbe_sample(ctx);
992 	kpreempt_enable();
993 
994 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
995 
996 	/*
997 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
998 	 * if that pic generated an overflow and if the request it was counting
999 	 * on behalf of had CPC_OVERFLOW_REQUEST specified. We go through all
1000 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
1001 	 * found any overflowed pics, keep the context frozen and return true
1002 	 * (thus causing a signal to be sent).
1003 	 */
1004 	for (i = 0; i < cpc_ncounters; i++) {
1005 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
1006 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
1007 			    ~KCPC_PIC_OVERFLOWED);
1008 			found = 1;
1009 		}
1010 	}
1011 	if (found)
1012 		return (1);
1013 
1014 	/*
1015 	 * Otherwise, re-enable the counters and continue life as before.
1016 	 */
1017 	kpreempt_disable();
1018 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
1019 	pcbe_ops->pcbe_program(ctx);
1020 	kpreempt_enable();
1021 	return (0);
1022 }
1023 
1024 /*
1025  * Called when switching away from current thread.
1026  */
1027 static void
1028 kcpc_save(kcpc_ctx_t *ctx)
1029 {
1030 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1031 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)
1032 			return;
1033 		/*
1034 		 * This context has been invalidated but the counters have not
1035 		 * been stopped. Stop them here and mark the context stopped.
1036 		 */
1037 		pcbe_ops->pcbe_allstop();
1038 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
1039 		return;
1040 	}
1041 
1042 	pcbe_ops->pcbe_allstop();
1043 	if (ctx->kc_flags & KCPC_CTX_FREEZE)
1044 		return;
1045 
1046 	/*
1047 	 * Need to sample for all reqs into each req's current mpic.
1048 	 */
1049 	ctx->kc_hrtime = gethrtime();
1050 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
1051 	pcbe_ops->pcbe_sample(ctx);
1052 }
1053 
/*
 * Restore context op, installed on a thread by kcpc_bind_thread(); also
 * called directly by kcpc_enable() for CPC_ENABLE.  Programs the context
 * onto the hardware unless it is invalid or frozen.
 *
 * kc_lock is held only around the flag updates, not while the hardware is
 * being programmed; KCPC_CTX_RESTORE marks the window in between.
 */
static void
kcpc_restore(kcpc_ctx_t *ctx)
{
	mutex_enter(&ctx->kc_lock);
	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
	    KCPC_CTX_INVALID)
		/*
		 * The context is invalidated but has not been marked stopped.
		 * We mark it as such here because we will not start the
		 * counters during this context switch.
		 */
		ctx->kc_flags |= KCPC_CTX_INVALID_STOPPED;


	/*
	 * Nothing to program for an invalid or frozen context.
	 */
	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
		mutex_exit(&ctx->kc_lock);
		return;
	}

	/*
	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
	 * ctx & set related memory objects being freed without us knowing.
	 * This can happen if an agent thread is executing a kcpc_unbind(),
	 * with this thread as the target, whilst we're concurrently doing a
	 * restorectx() during, for example, a proc_exit().  Effectively, by
	 * doing this, we're asking kcpc_free() to cv_wait() until
	 * kcpc_restore() has completed.
	 */
	ctx->kc_flags |= KCPC_CTX_RESTORE;
	mutex_exit(&ctx->kc_lock);

	/*
	 * While programming the hardware, the counters should be stopped. We
	 * don't do an explicit pcbe_allstop() here because they should have
	 * been stopped already by the last consumer.
	 */
	ctx->kc_rawtick = KCPC_GET_TICK();
	pcbe_ops->pcbe_program(ctx);

	/*
	 * Wake the agent thread if it's waiting in kcpc_free().
	 */
	mutex_enter(&ctx->kc_lock);
	ctx->kc_flags &= ~KCPC_CTX_RESTORE;
	cv_signal(&ctx->kc_condv);
	mutex_exit(&ctx->kc_lock);
}
1101 
1102 /*
1103  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the the
1104  * following context operators to the idle thread on each CPU. They stop the
1105  * counters when the idle thread is switched on, and they start them again when
1106  * it is switched off.
1107  */
1108 
1109 /*ARGSUSED*/
1110 void
1111 kcpc_idle_save(struct cpu *cp)
1112 {
1113 	/*
1114 	 * The idle thread shouldn't be run anywhere else.
1115 	 */
1116 	ASSERT(CPU == cp);
1117 
1118 	/*
1119 	 * We must hold the CPU's context lock to ensure the context isn't freed
1120 	 * while we're looking at it.
1121 	 */
1122 	mutex_enter(&cp->cpu_cpc_ctxlock);
1123 
1124 	if ((cp->cpu_cpc_ctx == NULL) ||
1125 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1126 		mutex_exit(&cp->cpu_cpc_ctxlock);
1127 		return;
1128 	}
1129 
1130 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
1131 	mutex_exit(&cp->cpu_cpc_ctxlock);
1132 }
1133 
1134 void
1135 kcpc_idle_restore(struct cpu *cp)
1136 {
1137 	/*
1138 	 * The idle thread shouldn't be run anywhere else.
1139 	 */
1140 	ASSERT(CPU == cp);
1141 
1142 	/*
1143 	 * We must hold the CPU's context lock to ensure the context isn't freed
1144 	 * while we're looking at it.
1145 	 */
1146 	mutex_enter(&cp->cpu_cpc_ctxlock);
1147 
1148 	if ((cp->cpu_cpc_ctx == NULL) ||
1149 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1150 		mutex_exit(&cp->cpu_cpc_ctxlock);
1151 		return;
1152 	}
1153 
1154 	pcbe_ops->pcbe_allstop();
1155 	mutex_exit(&cp->cpu_cpc_ctxlock);
1156 }
1157 
1158 /*ARGSUSED*/
1159 static void
1160 kcpc_lwp_create(kthread_t *t, kthread_t *ct)
1161 {
1162 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
1163 	int		i;
1164 
1165 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
1166 		return;
1167 
1168 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
1169 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1170 		rw_exit(&kcpc_cpuctx_lock);
1171 		return;
1172 	}
1173 	cctx = kcpc_ctx_alloc();
1174 	kcpc_ctx_clone(ctx, cctx);
1175 	rw_exit(&kcpc_cpuctx_lock);
1176 
1177 	/*
1178 	 * Copy the parent context's kc_flags field, but don't overwrite
1179 	 * the child's in case it was modified during kcpc_ctx_clone.
1180 	 */
1181 	cctx->kc_flags |= ctx->kc_flags;
1182 	cctx->kc_thread = ct;
1183 	cctx->kc_cpuid = -1;
1184 	ct->t_cpc_set = cctx->kc_set;
1185 	ct->t_cpc_ctx = cctx;
1186 
1187 	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
1188 		kcpc_set_t *ks = cctx->kc_set;
1189 		/*
1190 		 * Our contract with the user requires us to immediately send an
1191 		 * overflow signal to all children if we have the LWPINHERIT
1192 		 * and SIGOVF flags set. In addition, all counters should be
1193 		 * set to UINT64_MAX, and their pic's overflow flag turned on
1194 		 * so that our trap() processing knows to send a signal.
1195 		 */
1196 		atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE);
1197 		for (i = 0; i < ks->ks_nreqs; i++) {
1198 			kcpc_request_t *kr = &ks->ks_req[i];
1199 
1200 			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
1201 				*(kr->kr_data) = UINT64_MAX;
1202 				kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED;
1203 			}
1204 		}
1205 		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1206 		aston(ct);
1207 	}
1208 
1209 	installctx(ct, cctx, kcpc_save, kcpc_restore,
1210 	    NULL, kcpc_lwp_create, NULL, kcpc_free);
1211 }
1212 
1213 /*
1214  * Counter Stoppage Theory
1215  *
1216  * The counters may need to be stopped properly at the following occasions:
1217  *
1218  * 1) An LWP exits.
1219  * 2) A thread exits.
1220  * 3) An LWP performs an exec().
1221  * 4) A bound set is unbound.
1222  *
1223  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
1224  * to be freed as well.
1225  *
1226  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
1227  * when the thread is freed, kcpc_free(), called by freectx(), frees the
1228  * context.
1229  *
1230  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
1231  *
1232  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
1233  * been called from exec. It stops the counters _and_ frees the context.
1234  *
1235  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
1236  *
1237  * CPU-bound counters are always stopped via kcpc_unbind().
1238  */
1239 
1240 /*
1241  * We're being called to delete the context; we ensure that all associated data
1242  * structures are freed, and that the hardware is passivated if this is an exec.
1243  */
1244 
1245 /*ARGSUSED*/
1246 static void
1247 kcpc_free(kcpc_ctx_t *ctx, int isexec)
1248 {
1249 	int		i;
1250 	kcpc_set_t	*set = ctx->kc_set;
1251 
1252 	ASSERT(set != NULL);
1253 
1254 	/*
1255 	 * Wait for kcpc_restore() to finish before we tear things down.
1256 	 */
1257 	mutex_enter(&ctx->kc_lock);
1258 	while (ctx->kc_flags & KCPC_CTX_RESTORE)
1259 		cv_wait(&ctx->kc_condv, &ctx->kc_lock);
1260 	ctx->kc_flags |= KCPC_CTX_INVALID;
1261 	mutex_exit(&ctx->kc_lock);
1262 
1263 	if (isexec) {
1264 		/*
1265 		 * This thread is execing, and after the exec it should not have
1266 		 * any performance counter context. Stop the counters properly
1267 		 * here so the system isn't surprised by an overflow interrupt
1268 		 * later.
1269 		 */
1270 		if (ctx->kc_cpuid != -1) {
1271 			cpu_t *cp;
1272 			/*
1273 			 * CPU-bound context; stop the appropriate CPU's ctrs.
1274 			 * Hold cpu_lock while examining the CPU to ensure it
1275 			 * doesn't go away.
1276 			 */
1277 			mutex_enter(&cpu_lock);
1278 			cp = cpu_get(ctx->kc_cpuid);
1279 			/*
1280 			 * The CPU could have been DR'd out, so only stop the
1281 			 * CPU and clear its context pointer if the CPU still
1282 			 * exists.
1283 			 */
1284 			if (cp != NULL) {
1285 				mutex_enter(&cp->cpu_cpc_ctxlock);
1286 				kcpc_stop_hw(ctx);
1287 				cp->cpu_cpc_ctx = NULL;
1288 				mutex_exit(&cp->cpu_cpc_ctxlock);
1289 			}
1290 			mutex_exit(&cpu_lock);
1291 			ASSERT(curthread->t_cpc_ctx == NULL);
1292 		} else {
1293 			/*
1294 			 * Thread-bound context; stop _this_ CPU's counters.
1295 			 */
1296 			kpreempt_disable();
1297 			pcbe_ops->pcbe_allstop();
1298 			atomic_or_uint(&ctx->kc_flags,
1299 			    KCPC_CTX_INVALID_STOPPED);
1300 			kpreempt_enable();
1301 			curthread->t_cpc_ctx = NULL;
1302 		}
1303 
1304 		/*
1305 		 * Since we are being called from an exec and we know that
1306 		 * exec is not permitted via the agent thread, we should clean
1307 		 * up this thread's CPC state completely, and not leave dangling
1308 		 * CPC pointers behind.
1309 		 */
1310 		ASSERT(ctx->kc_thread == curthread);
1311 		curthread->t_cpc_set = NULL;
1312 	}
1313 
1314 	/*
1315 	 * Walk through each request in this context's set and free the PCBE's
1316 	 * configuration if it exists.
1317 	 */
1318 	for (i = 0; i < set->ks_nreqs; i++) {
1319 		if (set->ks_req[i].kr_config != NULL)
1320 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
1321 	}
1322 
1323 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
1324 	kcpc_ctx_free(ctx);
1325 	kcpc_free_set(set);
1326 }
1327 
1328 /*
1329  * Free the memory associated with a request set.
1330  */
1331 void
1332 kcpc_free_set(kcpc_set_t *set)
1333 {
1334 	int		i;
1335 	kcpc_request_t	*req;
1336 
1337 	ASSERT(set->ks_req != NULL);
1338 
1339 	for (i = 0; i < set->ks_nreqs; i++) {
1340 		req = &set->ks_req[i];
1341 
1342 		if (req->kr_nattrs != 0) {
1343 			kmem_free(req->kr_attr,
1344 			    req->kr_nattrs * sizeof (kcpc_attr_t));
1345 		}
1346 	}
1347 
1348 	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1349 	cv_destroy(&set->ks_condv);
1350 	mutex_destroy(&set->ks_lock);
1351 	kmem_free(set, sizeof (kcpc_set_t));
1352 }
1353 
1354 /*
1355  * Grab every existing context and mark it as invalid.
1356  */
1357 void
1358 kcpc_invalidate_all(void)
1359 {
1360 	kcpc_ctx_t *ctx;
1361 	long hash;
1362 
1363 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1364 		mutex_enter(&kcpc_ctx_llock[hash]);
1365 		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1366 			atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1367 		mutex_exit(&kcpc_ctx_llock[hash]);
1368 	}
1369 }
1370 
1371 /*
1372  * Interface for PCBEs to signal that an existing configuration has suddenly
1373  * become invalid.
1374  */
1375 void
1376 kcpc_invalidate_config(void *token)
1377 {
1378 	kcpc_ctx_t *ctx = token;
1379 
1380 	ASSERT(ctx != NULL);
1381 
1382 	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1383 }
1384 
1385 /*
1386  * Called from lwp_exit() and thread_exit()
1387  */
1388 void
1389 kcpc_passivate(void)
1390 {
1391 	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
1392 	kcpc_set_t *set = curthread->t_cpc_set;
1393 
1394 	if (set == NULL)
1395 		return;
1396 
1397 	/*
1398 	 * We're cleaning up after this thread; ensure there are no dangling
1399 	 * CPC pointers left behind. The context and set will be freed by
1400 	 * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in
1401 	 * the case of a CPU-bound set.
1402 	 */
1403 	curthread->t_cpc_ctx = NULL;
1404 
1405 	if (ctx == NULL) {
1406 		/*
1407 		 * This thread has a set but no context; it must be a CPU-bound
1408 		 * set. The hardware will be stopped via kcpc_unbind() when the
1409 		 * process exits and closes its file descriptors with
1410 		 * kcpc_close(). Our only job here is to clean up this thread's
1411 		 * state; the set will be freed with the unbind().
1412 		 */
1413 		(void) kcpc_unbind(set);
1414 		/*
1415 		 * Unbinding a set belonging to the current thread should clear
1416 		 * its set pointer.
1417 		 */
1418 		ASSERT(curthread->t_cpc_set == NULL);
1419 		return;
1420 	}
1421 
1422 	curthread->t_cpc_set = NULL;
1423 
1424 	/*
1425 	 * This thread/LWP is exiting but context switches will continue to
1426 	 * happen for a bit as the exit proceeds.  Kernel preemption must be
1427 	 * disabled here to prevent a race between checking or setting the
1428 	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
1429 	 * a context switch.
1430 	 */
1431 
1432 	kpreempt_disable();
1433 	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
1434 		pcbe_ops->pcbe_allstop();
1435 		atomic_or_uint(&ctx->kc_flags,
1436 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
1437 	}
1438 	kpreempt_enable();
1439 }
1440 
1441 /*
1442  * Assign the requests in the given set to the PICs in the context.
1443  * Returns 0 if successful, -1 on failure.
1444  */
1445 /*ARGSUSED*/
1446 static int
1447 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1448 {
1449 	int i;
1450 	int *picnum_save;
1451 
1452 	ASSERT(set->ks_nreqs <= cpc_ncounters);
1453 
1454 	/*
1455 	 * Provide kcpc_tryassign() with scratch space to avoid doing an
1456 	 * alloc/free with every invocation.
1457 	 */
1458 	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1459 	/*
1460 	 * kcpc_tryassign() blindly walks through each request in the set,
1461 	 * seeing if a counter can count its event. If yes, it assigns that
1462 	 * counter. However, that counter may have been the only capable counter
1463 	 * for _another_ request's event. The solution is to try every possible
1464 	 * request first. Note that this does not cover all solutions, as
1465 	 * that would require all unique orderings of requests, an n^n operation
1466 	 * which would be unacceptable for architectures with many counters.
1467 	 */
1468 	for (i = 0; i < set->ks_nreqs; i++)
1469 		if (kcpc_tryassign(set, i, picnum_save) == 0)
1470 			break;
1471 
1472 	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1473 	if (i == set->ks_nreqs)
1474 		return (-1);
1475 	return (0);
1476 }
1477 
1478 static int
1479 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1480 {
1481 	int		i;
1482 	int		j;
1483 	uint64_t	bitmap = 0, resmap = 0;
1484 	uint64_t	ctrmap;
1485 
1486 	/*
1487 	 * We are attempting to assign the reqs to pics, but we may fail. If we
1488 	 * fail, we need to restore the state of the requests to what it was
1489 	 * when we found it, as some reqs may have been explicitly assigned to
1490 	 * a specific PIC beforehand. We do this by snapshotting the assignments
1491 	 * now and restoring from it later if we fail.
1492 	 *
1493 	 * Also we note here which counters have already been claimed by
1494 	 * requests with explicit counter assignments.
1495 	 */
1496 	for (i = 0; i < set->ks_nreqs; i++) {
1497 		scratch[i] = set->ks_req[i].kr_picnum;
1498 		if (set->ks_req[i].kr_picnum != -1)
1499 			resmap |= (1 << set->ks_req[i].kr_picnum);
1500 	}
1501 
1502 	/*
1503 	 * Walk through requests assigning them to the first PIC that is
1504 	 * capable.
1505 	 */
1506 	i = starting_req;
1507 	do {
1508 		if (set->ks_req[i].kr_picnum != -1) {
1509 			ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
1510 			bitmap |= (1 << set->ks_req[i].kr_picnum);
1511 			if (++i == set->ks_nreqs)
1512 				i = 0;
1513 			continue;
1514 		}
1515 
1516 		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1517 		for (j = 0; j < cpc_ncounters; j++) {
1518 			if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
1519 			    (resmap & (1 << j)) == 0) {
1520 				/*
1521 				 * We can assign this counter because:
1522 				 *
1523 				 * 1. It can count the event (ctrmap)
1524 				 * 2. It hasn't been assigned yet (bitmap)
1525 				 * 3. It wasn't reserved by a request (resmap)
1526 				 */
1527 				bitmap |= (1 << j);
1528 				break;
1529 			}
1530 		}
1531 		if (j == cpc_ncounters) {
1532 			for (i = 0; i < set->ks_nreqs; i++)
1533 				set->ks_req[i].kr_picnum = scratch[i];
1534 			return (-1);
1535 		}
1536 		set->ks_req[i].kr_picnum = j;
1537 
1538 		if (++i == set->ks_nreqs)
1539 			i = 0;
1540 	} while (i != starting_req);
1541 
1542 	return (0);
1543 }
1544 
1545 kcpc_set_t *
1546 kcpc_dup_set(kcpc_set_t *set)
1547 {
1548 	kcpc_set_t	*new;
1549 	int		i;
1550 	int		j;
1551 
1552 	new = kmem_zalloc(sizeof (*new), KM_SLEEP);
1553 	new->ks_state &= ~KCPC_SET_BOUND;
1554 	new->ks_flags = set->ks_flags;
1555 	new->ks_nreqs = set->ks_nreqs;
1556 	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1557 	    KM_SLEEP);
1558 	new->ks_data = NULL;
1559 	new->ks_ctx = NULL;
1560 
1561 	for (i = 0; i < new->ks_nreqs; i++) {
1562 		new->ks_req[i].kr_config = NULL;
1563 		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1564 		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1565 		new->ks_req[i].kr_picp = NULL;
1566 		new->ks_req[i].kr_data = NULL;
1567 		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1568 		    CPC_MAX_EVENT_LEN);
1569 		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1570 		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1571 		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1572 		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1573 		    sizeof (kcpc_attr_t), KM_SLEEP);
1574 		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1575 			new->ks_req[i].kr_attr[j].ka_val =
1576 			    set->ks_req[i].kr_attr[j].ka_val;
1577 			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1578 			    set->ks_req[i].kr_attr[j].ka_name,
1579 			    CPC_MAX_ATTR_LEN);
1580 		}
1581 	}
1582 
1583 	return (new);
1584 }
1585 
1586 int
1587 kcpc_allow_nonpriv(void *token)
1588 {
1589 	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
1590 }
1591 
1592 void
1593 kcpc_invalidate(kthread_t *t)
1594 {
1595 	kcpc_ctx_t *ctx = t->t_cpc_ctx;
1596 
1597 	if (ctx != NULL)
1598 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1599 }
1600 
1601 /*
1602  * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
1603  * are used to construct PCBE names, starting with the most specific,
1604  * "pcbe.first.second.third.fourth" and ending with the least specific,
1605  * "pcbe.first".
1606  *
1607  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1608  */
1609 int
1610 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1611 {
1612 	uint_t s[3];
1613 
1614 	s[0] = first;
1615 	s[1] = second;
1616 	s[2] = third;
1617 
1618 	return (modload_qualified("pcbe",
1619 	    "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0);
1620 }
1621