xref: /titanic_50/usr/src/uts/common/os/kcpc.c (revision a1e3386ed23bbb4894435257b2006a5f8edcc9ea)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/param.h>
30 #include <sys/thread.h>
31 #include <sys/cpuvar.h>
32 #include <sys/inttypes.h>
33 #include <sys/cmn_err.h>
34 #include <sys/time.h>
35 #include <sys/mutex.h>
36 #include <sys/systm.h>
37 #include <sys/kcpc.h>
38 #include <sys/cpc_impl.h>
39 #include <sys/cpc_pcbe.h>
40 #include <sys/atomic.h>
41 #include <sys/sunddi.h>
42 #include <sys/modctl.h>
43 #include <sys/sdt.h>
44 #if defined(__x86)
45 #include <asm/clock.h>
46 #endif
47 
/*
 * Every context allocated by kcpc_ctx_alloc() is linked onto one of
 * CPC_HASH_BUCKETS lists, selected by CPC_HASH_CTX(); each list has its own
 * mutex.
 */
kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
kcpc_ctx_t	*kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */


krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
int		kcpc_cpuctx;		/* number of cpu-specific contexts */

int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */

/*
 * These are set when a PCBE module is loaded (see kcpc_register_pcbe()).
 */
uint_t		cpc_ncounters = 0;
pcbe_ops_t	*pcbe_ops = NULL;

/*
 * Statistics on (mis)behavior
 */
static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */

/*
 * Is misbehaviour (overflow in a thread with no context) fatal?
 * Defaults to fatal on DEBUG kernels only.
 */
#ifdef DEBUG
static int kcpc_nullctx_panic = 1;
#else
static int kcpc_nullctx_panic = 0;
#endif

/*
 * Forward declarations of file-local routines.
 */
static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
static void kcpc_restore(kcpc_ctx_t *ctx);
static void kcpc_save(kcpc_ctx_t *ctx);
static void kcpc_free(kcpc_ctx_t *ctx, int isexec);
static int kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode);
static void kcpc_free_configs(kcpc_set_t *set);
static kcpc_ctx_t *kcpc_ctx_alloc(void);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static void kcpc_ctx_free(kcpc_ctx_t *ctx);
static int kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
90 
91 void
92 kcpc_register_pcbe(pcbe_ops_t *ops)
93 {
94 	pcbe_ops = ops;
95 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
96 }
97 
98 int
99 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
100 {
101 	cpu_t		*cp;
102 	kcpc_ctx_t	*ctx;
103 	int		error;
104 
105 	ctx = kcpc_ctx_alloc();
106 
107 	if (kcpc_assign_reqs(set, ctx) != 0) {
108 		kcpc_ctx_free(ctx);
109 		*subcode = CPC_RESOURCE_UNAVAIL;
110 		return (EINVAL);
111 	}
112 
113 	ctx->kc_cpuid = cpuid;
114 	ctx->kc_thread = curthread;
115 
116 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
117 
118 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
119 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
120 		kcpc_ctx_free(ctx);
121 		return (error);
122 	}
123 
124 	set->ks_ctx = ctx;
125 	ctx->kc_set = set;
126 
127 	/*
128 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
129 	 * we are manipulating the cpu_t and programming the hardware, else the
130 	 * the cpu_t could go away while we're looking at it.
131 	 */
132 	mutex_enter(&cpu_lock);
133 	cp = cpu_get(cpuid);
134 
135 	if (cp == NULL)
136 		/*
137 		 * The CPU could have been DRd out while we were getting set up.
138 		 */
139 		goto unbound;
140 
141 	mutex_enter(&cp->cpu_cpc_ctxlock);
142 
143 	if (cp->cpu_cpc_ctx != NULL) {
144 		/*
145 		 * If this CPU already has a bound set, return an error.
146 		 */
147 		mutex_exit(&cp->cpu_cpc_ctxlock);
148 		goto unbound;
149 	}
150 
151 	if (curthread->t_bind_cpu != cpuid) {
152 		mutex_exit(&cp->cpu_cpc_ctxlock);
153 		goto unbound;
154 	}
155 	cp->cpu_cpc_ctx = ctx;
156 
157 	/*
158 	 * Kernel preemption must be disabled while fiddling with the hardware
159 	 * registers to prevent partial updates.
160 	 */
161 	kpreempt_disable();
162 	ctx->kc_rawtick = KCPC_GET_TICK();
163 	pcbe_ops->pcbe_program(ctx);
164 	kpreempt_enable();
165 
166 	mutex_exit(&cp->cpu_cpc_ctxlock);
167 	mutex_exit(&cpu_lock);
168 
169 	return (0);
170 
171 unbound:
172 	mutex_exit(&cpu_lock);
173 	set->ks_ctx = NULL;
174 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
175 	kcpc_ctx_free(ctx);
176 	return (EAGAIN);
177 }
178 
179 int
180 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
181 {
182 	kcpc_ctx_t	*ctx;
183 	int		error;
184 
185 	/*
186 	 * Only one set is allowed per context, so ensure there is no
187 	 * existing context.
188 	 */
189 
190 	if (t->t_cpc_ctx != NULL)
191 		return (EEXIST);
192 
193 	ctx = kcpc_ctx_alloc();
194 
195 	/*
196 	 * The context must begin life frozen until it has been properly
197 	 * programmed onto the hardware. This prevents the context ops from
198 	 * worrying about it until we're ready.
199 	 */
200 	ctx->kc_flags |= KCPC_CTX_FREEZE;
201 	ctx->kc_hrtime = gethrtime();
202 
203 	if (kcpc_assign_reqs(set, ctx) != 0) {
204 		kcpc_ctx_free(ctx);
205 		*subcode = CPC_RESOURCE_UNAVAIL;
206 		return (EINVAL);
207 	}
208 
209 	ctx->kc_cpuid = -1;
210 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
211 		ctx->kc_flags |= KCPC_CTX_LWPINHERIT;
212 	ctx->kc_thread = t;
213 	t->t_cpc_ctx = ctx;
214 	/*
215 	 * Permit threads to look at their own hardware counters from userland.
216 	 */
217 	ctx->kc_flags |= KCPC_CTX_NONPRIV;
218 
219 	/*
220 	 * Create the data store for this set.
221 	 */
222 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
223 
224 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
225 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
226 		kcpc_ctx_free(ctx);
227 		t->t_cpc_ctx = NULL;
228 		return (error);
229 	}
230 
231 	set->ks_ctx = ctx;
232 	ctx->kc_set = set;
233 
234 	/*
235 	 * Add a device context to the subject thread.
236 	 */
237 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
238 	    kcpc_lwp_create, NULL, kcpc_free);
239 
240 	/*
241 	 * Ask the backend to program the hardware.
242 	 */
243 	if (t == curthread) {
244 		kpreempt_disable();
245 		ctx->kc_rawtick = KCPC_GET_TICK();
246 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
247 		pcbe_ops->pcbe_program(ctx);
248 		kpreempt_enable();
249 	} else
250 		/*
251 		 * Since we are the agent LWP, we know the victim LWP is stopped
252 		 * until we're done here; no need to worry about preemption or
253 		 * migration here. We still use an atomic op to clear the flag
254 		 * to ensure the flags are always self-consistent; they can
255 		 * still be accessed from, for instance, another CPU doing a
256 		 * kcpc_invalidate_all().
257 		 */
258 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
259 
260 
261 	return (0);
262 }
263 
264 /*
265  * Walk through each request in the set and ask the PCBE to configure a
266  * corresponding counter.
267  */
268 static int
269 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
270 {
271 	int		i;
272 	int		ret;
273 	kcpc_request_t	*rp;
274 
275 	for (i = 0; i < set->ks_nreqs; i++) {
276 		int n;
277 		rp = &set->ks_req[i];
278 
279 		n = rp->kr_picnum;
280 
281 		ASSERT(n >= 0 && n < cpc_ncounters);
282 
283 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
284 
285 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
286 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
287 			    == 0) {
288 				*subcode = -1;
289 				return (ENOTSUP);
290 			}
291 			/*
292 			 * If any of the counters have requested overflow
293 			 * notification, we flag the context as being one that
294 			 * cares about overflow.
295 			 */
296 			ctx->kc_flags |= KCPC_CTX_SIGOVF;
297 		}
298 
299 		rp->kr_config = NULL;
300 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
301 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
302 		    &(rp->kr_config), (void *)ctx)) != 0) {
303 			kcpc_free_configs(set);
304 			*subcode = ret;
305 			switch (ret) {
306 			case CPC_ATTR_REQUIRES_PRIVILEGE:
307 			case CPC_HV_NO_ACCESS:
308 				return (EACCES);
309 			default:
310 				return (EINVAL);
311 			}
312 		}
313 
314 		ctx->kc_pics[n].kp_req = rp;
315 		rp->kr_picp = &ctx->kc_pics[n];
316 		rp->kr_data = set->ks_data + rp->kr_index;
317 		*rp->kr_data = rp->kr_preset;
318 	}
319 
320 	return (0);
321 }
322 
323 static void
324 kcpc_free_configs(kcpc_set_t *set)
325 {
326 	int i;
327 
328 	for (i = 0; i < set->ks_nreqs; i++)
329 		if (set->ks_req[i].kr_config != NULL)
330 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
331 }
332 
333 /*
334  * buf points to a user address and the data should be copied out to that
335  * address in the current process.
336  */
337 int
338 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
339 {
340 	kcpc_ctx_t	*ctx = set->ks_ctx;
341 	uint64_t	curtick = KCPC_GET_TICK();
342 
343 	if (ctx == NULL)
344 		return (EINVAL);
345 	else if (ctx->kc_flags & KCPC_CTX_INVALID)
346 		return (EAGAIN);
347 
348 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
349 		/*
350 		 * Kernel preemption must be disabled while reading the
351 		 * hardware regs, and if this is a CPU-bound context, while
352 		 * checking the CPU binding of the current thread.
353 		 */
354 		kpreempt_disable();
355 
356 		if (ctx->kc_cpuid != -1) {
357 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
358 				kpreempt_enable();
359 				return (EAGAIN);
360 			}
361 		}
362 
363 		if (ctx->kc_thread == curthread) {
364 			ctx->kc_hrtime = gethrtime();
365 			pcbe_ops->pcbe_sample(ctx);
366 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
367 			ctx->kc_rawtick = curtick;
368 		}
369 
370 		kpreempt_enable();
371 
372 		/*
373 		 * The config may have been invalidated by
374 		 * the pcbe_sample op.
375 		 */
376 		if (ctx->kc_flags & KCPC_CTX_INVALID)
377 			return (EAGAIN);
378 	}
379 
380 	if (copyout(set->ks_data, buf,
381 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
382 		return (EFAULT);
383 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
384 		return (EFAULT);
385 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
386 		return (EFAULT);
387 
388 	return (0);
389 }
390 
391 /*
392  * Stop the counters on the CPU this context is bound to.
393  */
394 static void
395 kcpc_stop_hw(kcpc_ctx_t *ctx)
396 {
397 	cpu_t *cp;
398 
399 	ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED))
400 	    == KCPC_CTX_INVALID);
401 
402 	kpreempt_disable();
403 
404 	cp = cpu_get(ctx->kc_cpuid);
405 	ASSERT(cp != NULL);
406 
407 	if (cp == CPU) {
408 		pcbe_ops->pcbe_allstop();
409 		atomic_or_uint(&ctx->kc_flags,
410 		    KCPC_CTX_INVALID_STOPPED);
411 	} else
412 		kcpc_remote_stop(cp);
413 	kpreempt_enable();
414 }
415 
/*
 * Unbind a set from the thread or CPU it is bound to.
 *
 * The set's context is marked invalid and the counters are stopped if they
 * have not been already.  A thread-bound context (kc_cpuid == -1) is then
 * removed from the thread via removectx(); a CPU-bound context is detached
 * from its cpu_t and, if the caller is the thread that created it, freed.
 *
 * Returns EINVAL if the set has no context, 0 otherwise.
 */
int
kcpc_unbind(kcpc_set_t *set)
{
	kcpc_ctx_t	*ctx = set->ks_ctx;
	kthread_t	*t;

	if (ctx == NULL)
		return (EINVAL);

	/*
	 * Invalidate first: kcpc_restore() will not program the hardware for
	 * a context that has KCPC_CTX_INVALID set.
	 */
	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);

	if (ctx->kc_cpuid == -1) {
		t = ctx->kc_thread;
		/*
		 * The context is thread-bound and therefore has a device
		 * context.  It will be freed via removectx() calling
		 * freectx() calling kcpc_free().
		 */
		if (t == curthread &&
			(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
			/*
			 * We are running on the thread's hardware, so the
			 * counters can be stopped right here.
			 */
			kpreempt_disable();
			pcbe_ops->pcbe_allstop();
			atomic_or_uint(&ctx->kc_flags,
			    KCPC_CTX_INVALID_STOPPED);
			kpreempt_enable();
		}
#ifdef DEBUG
		/*
		 * On DEBUG kernels a context that was never installed on the
		 * thread is treated as fatal.
		 */
		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
		    kcpc_lwp_create, NULL, kcpc_free) == 0)
			panic("kcpc_unbind: context %p not preset on thread %p",
			    ctx, t);
#else
		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
		    kcpc_lwp_create, NULL, kcpc_free);
#endif /* DEBUG */
		t->t_cpc_set = NULL;
		t->t_cpc_ctx = NULL;
	} else {
		/*
		 * If we are unbinding a CPU-bound set from a remote CPU, the
		 * native CPU's idle thread could be in the midst of programming
		 * this context onto the CPU. We grab the context's lock here to
		 * ensure that the idle thread is done with it. When we release
		 * the lock, the CPU no longer has a context and the idle thread
		 * will move on.
		 *
		 * cpu_lock must be held to prevent the CPU from being DR'd out
		 * while we disassociate the context from the cpu_t.
		 */
		cpu_t *cp;
		mutex_enter(&cpu_lock);
		cp = cpu_get(ctx->kc_cpuid);
		if (cp != NULL) {
			/*
			 * The CPU may have been DR'd out of the system, in
			 * which case cp is NULL and there is nothing to
			 * detach.
			 */
			mutex_enter(&cp->cpu_cpc_ctxlock);
			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
				kcpc_stop_hw(ctx);
			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
			cp->cpu_cpc_ctx = NULL;
			mutex_exit(&cp->cpu_cpc_ctxlock);
		}
		mutex_exit(&cpu_lock);
		/*
		 * kcpc_bind_cpu() installs no context operators, so the
		 * context is freed directly here, but only when the caller is
		 * the thread recorded in the context.
		 */
		if (ctx->kc_thread == curthread) {
			kcpc_free(ctx, 0);
			curthread->t_cpc_set = NULL;
		}
	}

	return (0);
}
488 
489 int
490 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
491 {
492 	int i;
493 
494 	ASSERT(set != NULL);
495 	ASSERT(set->ks_ctx != NULL);
496 	ASSERT(set->ks_ctx->kc_thread == curthread);
497 	ASSERT(set->ks_ctx->kc_cpuid == -1);
498 
499 	if (index < 0 || index >= set->ks_nreqs)
500 		return (EINVAL);
501 
502 	for (i = 0; i < set->ks_nreqs; i++)
503 		if (set->ks_req[i].kr_index == index)
504 			break;
505 	ASSERT(i != set->ks_nreqs);
506 
507 	set->ks_req[i].kr_preset = preset;
508 	return (0);
509 }
510 
511 int
512 kcpc_restart(kcpc_set_t *set)
513 {
514 	kcpc_ctx_t	*ctx = set->ks_ctx;
515 	int		i;
516 
517 	ASSERT(ctx != NULL);
518 	ASSERT(ctx->kc_thread == curthread);
519 	ASSERT(ctx->kc_cpuid == -1);
520 
521 	kpreempt_disable();
522 
523 	/*
524 	 * If the user is doing this on a running set, make sure the counters
525 	 * are stopped first.
526 	 */
527 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
528 		pcbe_ops->pcbe_allstop();
529 
530 	for (i = 0; i < set->ks_nreqs; i++) {
531 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
532 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
533 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
534 	}
535 
536 	/*
537 	 * Ask the backend to program the hardware.
538 	 */
539 	ctx->kc_rawtick = KCPC_GET_TICK();
540 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
541 	pcbe_ops->pcbe_program(ctx);
542 	kpreempt_enable();
543 
544 	return (0);
545 }
546 
547 /*
548  * Caller must hold kcpc_cpuctx_lock.
549  */
550 int
551 kcpc_enable(kthread_t *t, int cmd, int enable)
552 {
553 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
554 	kcpc_set_t	*set = t->t_cpc_set;
555 	kcpc_set_t	*newset;
556 	int		i;
557 	int		flag;
558 	int		err;
559 
560 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
561 
562 	if (ctx == NULL) {
563 		/*
564 		 * This thread has a set but no context; it must be a
565 		 * CPU-bound set.
566 		 */
567 		ASSERT(t->t_cpc_set != NULL);
568 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
569 		return (EINVAL);
570 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
571 		return (EAGAIN);
572 
573 	if (cmd == CPC_ENABLE) {
574 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
575 			return (EINVAL);
576 		kpreempt_disable();
577 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
578 		kcpc_restore(ctx);
579 		kpreempt_enable();
580 	} else if (cmd == CPC_DISABLE) {
581 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
582 			return (EINVAL);
583 		kpreempt_disable();
584 		kcpc_save(ctx);
585 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
586 		kpreempt_enable();
587 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
588 		/*
589 		 * Strategy for usr/sys: stop counters and update set's presets
590 		 * with current counter values, unbind, update requests with
591 		 * new config, then re-bind.
592 		 */
593 		flag = (cmd == CPC_USR_EVENTS) ?
594 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
595 
596 		kpreempt_disable();
597 		atomic_or_uint(&ctx->kc_flags,
598 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
599 		pcbe_ops->pcbe_allstop();
600 		kpreempt_enable();
601 		for (i = 0; i < set->ks_nreqs; i++) {
602 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
603 			if (enable)
604 				set->ks_req[i].kr_flags |= flag;
605 			else
606 				set->ks_req[i].kr_flags &= ~flag;
607 		}
608 		newset = kcpc_dup_set(set);
609 		if (kcpc_unbind(set) != 0)
610 			return (EINVAL);
611 		t->t_cpc_set = newset;
612 		if (kcpc_bind_thread(newset, t, &err) != 0) {
613 			t->t_cpc_set = NULL;
614 			kcpc_free_set(newset);
615 			return (EINVAL);
616 		}
617 	} else
618 		return (EINVAL);
619 
620 	return (0);
621 }
622 
623 /*
624  * Provide PCBEs with a way of obtaining the configs of every counter which will
625  * be programmed together.
626  *
627  * If current is NULL, provide the first config.
628  *
629  * If data != NULL, caller wants to know where the data store associated with
630  * the config we return is located.
631  */
632 void *
633 kcpc_next_config(void *token, void *current, uint64_t **data)
634 {
635 	int		i;
636 	kcpc_pic_t	*pic;
637 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
638 
639 	if (current == NULL) {
640 		/*
641 		 * Client would like the first config, which may not be in
642 		 * counter 0; we need to search through the counters for the
643 		 * first config.
644 		 */
645 		for (i = 0; i < cpc_ncounters; i++)
646 			if (ctx->kc_pics[i].kp_req != NULL)
647 				break;
648 		/*
649 		 * There are no counters configured for the given context.
650 		 */
651 		if (i == cpc_ncounters)
652 			return (NULL);
653 	} else {
654 		/*
655 		 * There surely is a faster way to do this.
656 		 */
657 		for (i = 0; i < cpc_ncounters; i++) {
658 			pic = &ctx->kc_pics[i];
659 
660 			if (pic->kp_req != NULL &&
661 			    current == pic->kp_req->kr_config)
662 				break;
663 		}
664 
665 		/*
666 		 * We found the current config at picnum i. Now search for the
667 		 * next configured PIC.
668 		 */
669 		for (i++; i < cpc_ncounters; i++) {
670 			pic = &ctx->kc_pics[i];
671 			if (pic->kp_req != NULL)
672 				break;
673 		}
674 
675 		if (i == cpc_ncounters)
676 			return (NULL);
677 	}
678 
679 	if (data != NULL) {
680 		*data = ctx->kc_pics[i].kp_req->kr_data;
681 	}
682 
683 	return (ctx->kc_pics[i].kp_req->kr_config);
684 }
685 
686 
687 static kcpc_ctx_t *
688 kcpc_ctx_alloc(void)
689 {
690 	kcpc_ctx_t	*ctx;
691 	long		hash;
692 
693 	ctx = (kcpc_ctx_t *)kmem_alloc(sizeof (kcpc_ctx_t), KM_SLEEP);
694 
695 	hash = CPC_HASH_CTX(ctx);
696 	mutex_enter(&kcpc_ctx_llock[hash]);
697 	ctx->kc_next = kcpc_ctx_list[hash];
698 	kcpc_ctx_list[hash] = ctx;
699 	mutex_exit(&kcpc_ctx_llock[hash]);
700 
701 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
702 	    cpc_ncounters, KM_SLEEP);
703 
704 	ctx->kc_flags = 0;
705 	ctx->kc_vtick = 0;
706 	ctx->kc_rawtick = 0;
707 	ctx->kc_cpuid = -1;
708 
709 	return (ctx);
710 }
711 
712 /*
713  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
714  * in the flags.
715  */
716 static void
717 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
718 {
719 	kcpc_set_t	*ks = ctx->kc_set, *cks;
720 	int		i, j;
721 	int		code;
722 
723 	ASSERT(ks != NULL);
724 
725 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
726 		return;
727 
728 	cks = kmem_alloc(sizeof (*cks), KM_SLEEP);
729 	cctx->kc_set = cks;
730 	cks->ks_flags = ks->ks_flags;
731 	cks->ks_nreqs = ks->ks_nreqs;
732 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
733 	    sizeof (kcpc_request_t), KM_SLEEP);
734 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
735 	    KM_SLEEP);
736 	cks->ks_ctx = cctx;
737 
738 	for (i = 0; i < cks->ks_nreqs; i++) {
739 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
740 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
741 		(void) strncpy(cks->ks_req[i].kr_event,
742 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
743 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
744 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
745 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
746 		if (ks->ks_req[i].kr_nattrs > 0) {
747 			cks->ks_req[i].kr_attr =
748 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
749 				sizeof (kcpc_attr_t), KM_SLEEP);
750 		}
751 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
752 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
753 			    ks->ks_req[i].kr_attr[j].ka_name,
754 			    CPC_MAX_ATTR_LEN);
755 			cks->ks_req[i].kr_attr[j].ka_val =
756 			    ks->ks_req[i].kr_attr[j].ka_val;
757 		}
758 	}
759 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
760 		kcpc_invalidate_config(cctx);
761 }
762 
763 
764 static void
765 kcpc_ctx_free(kcpc_ctx_t *ctx)
766 {
767 	kcpc_ctx_t	**loc;
768 	long		hash = CPC_HASH_CTX(ctx);
769 
770 	mutex_enter(&kcpc_ctx_llock[hash]);
771 	loc = &kcpc_ctx_list[hash];
772 	ASSERT(*loc != NULL);
773 	while (*loc != ctx)
774 		loc = &(*loc)->kc_next;
775 	*loc = ctx->kc_next;
776 	mutex_exit(&kcpc_ctx_llock[hash]);
777 
778 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
779 	kmem_free(ctx, sizeof (*ctx));
780 }
781 
782 /*
783  * Generic interrupt handler used on hardware that generates
784  * overflow interrupts.
785  *
786  * Note: executed at high-level interrupt context!
787  */
788 /*ARGSUSED*/
789 kcpc_ctx_t *
790 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
791 {
792 	kcpc_ctx_t	*ctx;
793 	kthread_t	*t = curthread;
794 	int		i;
795 
796 	/*
797 	 * On both x86 and UltraSPARC, we may deliver the high-level
798 	 * interrupt in kernel mode, just after we've started to run an
799 	 * interrupt thread.  (That's because the hardware helpfully
800 	 * delivers the overflow interrupt some random number of cycles
801 	 * after the instruction that caused the overflow by which time
802 	 * we're in some part of the kernel, not necessarily running on
803 	 * the right thread).
804 	 *
805 	 * Check for this case here -- find the pinned thread
806 	 * that was running when the interrupt went off.
807 	 */
808 	if (t->t_flag & T_INTR_THREAD) {
809 		klwp_t *lwp;
810 
811 		atomic_add_32(&kcpc_intrctx_count, 1);
812 
813 		/*
814 		 * Note that t_lwp is always set to point at the underlying
815 		 * thread, thus this will work in the presence of nested
816 		 * interrupts.
817 		 */
818 		ctx = NULL;
819 		if ((lwp = t->t_lwp) != NULL) {
820 			t = lwptot(lwp);
821 			ctx = t->t_cpc_ctx;
822 		}
823 	} else
824 		ctx = t->t_cpc_ctx;
825 
826 	if (ctx == NULL) {
827 		/*
828 		 * This can easily happen if we're using the counters in
829 		 * "shared" mode, for example, and an overflow interrupt
830 		 * occurs while we are running cpustat.  In that case, the
831 		 * bound thread that has the context that belongs to this
832 		 * CPU is almost certainly sleeping (if it was running on
833 		 * the CPU we'd have found it above), and the actual
834 		 * interrupted thread has no knowledge of performance counters!
835 		 */
836 		ctx = curthread->t_cpu->cpu_cpc_ctx;
837 		if (ctx != NULL) {
838 			/*
839 			 * Return the bound context for this CPU to
840 			 * the interrupt handler so that it can synchronously
841 			 * sample the hardware counters and restart them.
842 			 */
843 			return (ctx);
844 		}
845 
846 		/*
847 		 * As long as the overflow interrupt really is delivered early
848 		 * enough after trapping into the kernel to avoid switching
849 		 * threads, we must always be able to find the cpc context,
850 		 * or something went terribly wrong i.e. we ended up
851 		 * running a passivated interrupt thread, a kernel
852 		 * thread or we interrupted idle, all of which are Very Bad.
853 		 */
854 		if (kcpc_nullctx_panic)
855 			panic("null cpc context, thread %p", (void *)t);
856 		atomic_add_32(&kcpc_nullctx_count, 1);
857 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
858 		/*
859 		 * Schedule an ast to sample the counters, which will
860 		 * propagate any overflow into the virtualized performance
861 		 * counter(s), and may deliver a signal.
862 		 */
863 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
864 		/*
865 		 * If a counter has overflowed which was counting on behalf of
866 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
867 		 * process a signal.
868 		 */
869 		for (i = 0; i < cpc_ncounters; i++) {
870 			if (ctx->kc_pics[i].kp_req != NULL &&
871 			    bitmap & (1 << i) &&
872 			    ctx->kc_pics[i].kp_req->kr_flags &
873 			    CPC_OVF_NOTIFY_EMT) {
874 				/*
875 				 * A signal has been requested for this PIC, so
876 				 * so freeze the context. The interrupt handler
877 				 * has already stopped the counter hardware.
878 				 */
879 				atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
880 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
881 				    KCPC_PIC_OVERFLOWED);
882 			}
883 		}
884 		aston(t);
885 	}
886 	return (NULL);
887 }
888 
889 /*
890  * The current thread context had an overflow interrupt; we're
891  * executing here in high-level interrupt context.
892  */
893 /*ARGSUSED*/
894 uint_t
895 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
896 {
897 	kcpc_ctx_t	*ctx;
898 	uint64_t	bitmap;
899 
900 	if (pcbe_ops == NULL ||
901 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
902 		return (DDI_INTR_UNCLAIMED);
903 #ifdef N2_1x_CPC_WORKAROUNDS
904 	/*
905 	 * Check if any of the supported counters overflowed. If
906 	 * not, it's a spurious overflow trap (Niagara2 1.x silicon
907 	 * bug). Ignore this trap.
908 	 */
909 	if ((bitmap & ((1 <<cpc_ncounters)-1)) == 0)
910 		return (DDI_INTR_CLAIMED);
911 #endif
912 	/*
913 	 * Prevent any further interrupts.
914 	 */
915 	pcbe_ops->pcbe_allstop();
916 
917 	/*
918 	 * Invoke the "generic" handler.
919 	 *
920 	 * If the interrupt has occurred in the context of an lwp owning
921 	 * the counters, then the handler posts an AST to the lwp to
922 	 * trigger the actual sampling, and optionally deliver a signal or
923 	 * restart the counters, on the way out of the kernel using
924 	 * kcpc_hw_overflow_ast() (see below).
925 	 *
926 	 * On the other hand, if the handler returns the context to us
927 	 * directly, then it means that there are no other threads in
928 	 * the middle of updating it, no AST has been posted, and so we
929 	 * should sample the counters here, and restart them with no
930 	 * further fuss.
931 	 */
932 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
933 		uint64_t curtick = KCPC_GET_TICK();
934 
935 		ctx->kc_hrtime = gethrtime_waitfree();
936 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
937 		ctx->kc_rawtick = curtick;
938 		pcbe_ops->pcbe_sample(ctx);
939 		pcbe_ops->pcbe_program(ctx);
940 	}
941 
942 	return (DDI_INTR_CLAIMED);
943 }
944 
945 /*
946  * Called from trap() when processing the ast posted by the high-level
947  * interrupt handler.
948  */
949 int
950 kcpc_overflow_ast()
951 {
952 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
953 	int		i;
954 	int		found = 0;
955 	uint64_t	curtick = KCPC_GET_TICK();
956 
957 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
958 
959 	/*
960 	 * An overflow happened: sample the context to ensure that
961 	 * the overflow is propagated into the upper bits of the
962 	 * virtualized 64-bit counter(s).
963 	 */
964 	kpreempt_disable();
965 	ctx->kc_hrtime = gethrtime_waitfree();
966 	pcbe_ops->pcbe_sample(ctx);
967 	kpreempt_enable();
968 
969 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
970 
971 	/*
972 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
973 	 * if that pic generated an overflow and if the request it was counting
974 	 * on behalf of had CPC_OVERFLOW_REQUEST specified. We go through all
975 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
976 	 * found any overflowed pics, keep the context frozen and return true
977 	 * (thus causing a signal to be sent).
978 	 */
979 	for (i = 0; i < cpc_ncounters; i++) {
980 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
981 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
982 			    ~KCPC_PIC_OVERFLOWED);
983 			found = 1;
984 		}
985 	}
986 	if (found)
987 		return (1);
988 
989 	/*
990 	 * Otherwise, re-enable the counters and continue life as before.
991 	 */
992 	kpreempt_disable();
993 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
994 	pcbe_ops->pcbe_program(ctx);
995 	kpreempt_enable();
996 	return (0);
997 }
998 
999 /*
1000  * Called when switching away from current thread.
1001  */
1002 static void
1003 kcpc_save(kcpc_ctx_t *ctx)
1004 {
1005 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1006 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)
1007 			return;
1008 		/*
1009 		 * This context has been invalidated but the counters have not
1010 		 * been stopped. Stop them here and mark the context stopped.
1011 		 */
1012 		pcbe_ops->pcbe_allstop();
1013 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
1014 		return;
1015 	}
1016 
1017 	pcbe_ops->pcbe_allstop();
1018 	if (ctx->kc_flags & KCPC_CTX_FREEZE)
1019 		return;
1020 
1021 	/*
1022 	 * Need to sample for all reqs into each req's current mpic.
1023 	 */
1024 	ctx->kc_hrtime = gethrtime();
1025 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
1026 	pcbe_ops->pcbe_sample(ctx);
1027 }
1028 
1029 static void
1030 kcpc_restore(kcpc_ctx_t *ctx)
1031 {
1032 	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
1033 	    KCPC_CTX_INVALID)
1034 		/*
1035 		 * The context is invalidated but has not been marked stopped.
1036 		 * We mark it as such here because we will not start the
1037 		 * counters during this context switch.
1038 		 */
1039 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
1040 
1041 
1042 	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE))
1043 		return;
1044 
1045 	/*
1046 	 * While programming the hardware, the counters should be stopped. We
1047 	 * don't do an explicit pcbe_allstop() here because they should have
1048 	 * been stopped already by the last consumer.
1049 	 */
1050 	ctx->kc_rawtick = KCPC_GET_TICK();
1051 	pcbe_ops->pcbe_program(ctx);
1052 }
1053 
1054 /*
1055  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the the
1056  * following context operators to the idle thread on each CPU. They stop the
1057  * counters when the idle thread is switched on, and they start them again when
1058  * it is switched off.
1059  */
1060 
1061 /*ARGSUSED*/
1062 void
1063 kcpc_idle_save(struct cpu *cp)
1064 {
1065 	/*
1066 	 * The idle thread shouldn't be run anywhere else.
1067 	 */
1068 	ASSERT(CPU == cp);
1069 
1070 	/*
1071 	 * We must hold the CPU's context lock to ensure the context isn't freed
1072 	 * while we're looking at it.
1073 	 */
1074 	mutex_enter(&cp->cpu_cpc_ctxlock);
1075 
1076 	if ((cp->cpu_cpc_ctx == NULL) ||
1077 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1078 		mutex_exit(&cp->cpu_cpc_ctxlock);
1079 		return;
1080 	}
1081 
1082 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
1083 	mutex_exit(&cp->cpu_cpc_ctxlock);
1084 }
1085 
1086 void
1087 kcpc_idle_restore(struct cpu *cp)
1088 {
1089 	/*
1090 	 * The idle thread shouldn't be run anywhere else.
1091 	 */
1092 	ASSERT(CPU == cp);
1093 
1094 	/*
1095 	 * We must hold the CPU's context lock to ensure the context isn't freed
1096 	 * while we're looking at it.
1097 	 */
1098 	mutex_enter(&cp->cpu_cpc_ctxlock);
1099 
1100 	if ((cp->cpu_cpc_ctx == NULL) ||
1101 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1102 		mutex_exit(&cp->cpu_cpc_ctxlock);
1103 		return;
1104 	}
1105 
1106 	pcbe_ops->pcbe_allstop();
1107 	mutex_exit(&cp->cpu_cpc_ctxlock);
1108 }
1109 
1110 /*ARGSUSED*/
1111 static void
1112 kcpc_lwp_create(kthread_t *t, kthread_t *ct)
1113 {
1114 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
1115 	int		i;
1116 
1117 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
1118 		return;
1119 
1120 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
1121 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1122 		rw_exit(&kcpc_cpuctx_lock);
1123 		return;
1124 	}
1125 	cctx = kcpc_ctx_alloc();
1126 	kcpc_ctx_clone(ctx, cctx);
1127 	rw_exit(&kcpc_cpuctx_lock);
1128 
1129 	/*
1130 	 * Copy the parent context's kc_flags field, but don't overwrite
1131 	 * the child's in case it was modified during kcpc_ctx_clone.
1132 	 */
1133 	cctx->kc_flags |= ctx->kc_flags;
1134 	cctx->kc_thread = ct;
1135 	cctx->kc_cpuid = -1;
1136 	ct->t_cpc_set = cctx->kc_set;
1137 	ct->t_cpc_ctx = cctx;
1138 
1139 	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
1140 		kcpc_set_t *ks = cctx->kc_set;
1141 		/*
1142 		 * Our contract with the user requires us to immediately send an
1143 		 * overflow signal to all children if we have the LWPINHERIT
1144 		 * and SIGOVF flags set. In addition, all counters should be
1145 		 * set to UINT64_MAX, and their pic's overflow flag turned on
1146 		 * so that our trap() processing knows to send a signal.
1147 		 */
1148 		atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE);
1149 		for (i = 0; i < ks->ks_nreqs; i++) {
1150 			kcpc_request_t *kr = &ks->ks_req[i];
1151 
1152 			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
1153 				*(kr->kr_data) = UINT64_MAX;
1154 				kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED;
1155 			}
1156 		}
1157 		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1158 		aston(ct);
1159 	}
1160 
1161 	installctx(ct, cctx, kcpc_save, kcpc_restore,
1162 	    NULL, kcpc_lwp_create, NULL, kcpc_free);
1163 }
1164 
1165 /*
1166  * Counter Stoppage Theory
1167  *
1168  * The counters may need to be stopped properly at the following occasions:
1169  *
1170  * 1) An LWP exits.
1171  * 2) A thread exits.
1172  * 3) An LWP performs an exec().
1173  * 4) A bound set is unbound.
1174  *
1175  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
1176  * to be freed as well.
1177  *
1178  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
1179  * when the thread is freed, kcpc_free(), called by freectx(), frees the
1180  * context.
1181  *
1182  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
1183  *
1184  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
1185  * been called from exec. It stops the counters _and_ frees the context.
1186  *
1187  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
1188  *
1189  * CPU-bound counters are always stopped via kcpc_unbind().
1190  */
1191 
1192 /*
1193  * We're being called to delete the context; we ensure that all associated data
1194  * structures are freed, and that the hardware is passivated if this is an exec.
1195  */
1196 
1197 /*ARGSUSED*/
1198 static void
1199 kcpc_free(kcpc_ctx_t *ctx, int isexec)
1200 {
1201 	int		i;
1202 	kcpc_set_t	*set = ctx->kc_set;
1203 
1204 	ASSERT(set != NULL);
1205 
1206 	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1207 
1208 	if (isexec) {
1209 		/*
1210 		 * This thread is execing, and after the exec it should not have
1211 		 * any performance counter context. Stop the counters properly
1212 		 * here so the system isn't surprised by an overflow interrupt
1213 		 * later.
1214 		 */
1215 		if (ctx->kc_cpuid != -1) {
1216 			cpu_t *cp;
1217 			/*
1218 			 * CPU-bound context; stop the appropriate CPU's ctrs.
1219 			 * Hold cpu_lock while examining the CPU to ensure it
1220 			 * doesn't go away.
1221 			 */
1222 			mutex_enter(&cpu_lock);
1223 			cp = cpu_get(ctx->kc_cpuid);
1224 			/*
1225 			 * The CPU could have been DR'd out, so only stop the
1226 			 * CPU and clear its context pointer if the CPU still
1227 			 * exists.
1228 			 */
1229 			if (cp != NULL) {
1230 				mutex_enter(&cp->cpu_cpc_ctxlock);
1231 				kcpc_stop_hw(ctx);
1232 				cp->cpu_cpc_ctx = NULL;
1233 				mutex_exit(&cp->cpu_cpc_ctxlock);
1234 			}
1235 			mutex_exit(&cpu_lock);
1236 			ASSERT(curthread->t_cpc_ctx == NULL);
1237 		} else {
1238 			/*
1239 			 * Thread-bound context; stop _this_ CPU's counters.
1240 			 */
1241 			kpreempt_disable();
1242 			pcbe_ops->pcbe_allstop();
1243 			atomic_or_uint(&ctx->kc_flags,
1244 			    KCPC_CTX_INVALID_STOPPED);
1245 			kpreempt_enable();
1246 			curthread->t_cpc_ctx = NULL;
1247 		}
1248 
1249 		/*
1250 		 * Since we are being called from an exec and we know that
1251 		 * exec is not permitted via the agent thread, we should clean
1252 		 * up this thread's CPC state completely, and not leave dangling
1253 		 * CPC pointers behind.
1254 		 */
1255 		ASSERT(ctx->kc_thread == curthread);
1256 		curthread->t_cpc_set = NULL;
1257 	}
1258 
1259 	/*
1260 	 * Walk through each request in this context's set and free the PCBE's
1261 	 * configuration if it exists.
1262 	 */
1263 	for (i = 0; i < set->ks_nreqs; i++) {
1264 		if (set->ks_req[i].kr_config != NULL)
1265 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
1266 	}
1267 
1268 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
1269 	kcpc_ctx_free(ctx);
1270 	kcpc_free_set(set);
1271 }
1272 
1273 /*
1274  * Free the memory associated with a request set.
1275  */
1276 void
1277 kcpc_free_set(kcpc_set_t *set)
1278 {
1279 	int		i;
1280 	kcpc_request_t	*req;
1281 
1282 	ASSERT(set->ks_req != NULL);
1283 
1284 	for (i = 0; i < set->ks_nreqs; i++) {
1285 		req = &set->ks_req[i];
1286 
1287 		if (req->kr_nattrs != 0) {
1288 			kmem_free(req->kr_attr,
1289 			    req->kr_nattrs * sizeof (kcpc_attr_t));
1290 		}
1291 	}
1292 
1293 	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1294 	kmem_free(set, sizeof (kcpc_set_t));
1295 }
1296 
1297 /*
1298  * Grab every existing context and mark it as invalid.
1299  */
1300 void
1301 kcpc_invalidate_all(void)
1302 {
1303 	kcpc_ctx_t *ctx;
1304 	long hash;
1305 
1306 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1307 		mutex_enter(&kcpc_ctx_llock[hash]);
1308 		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1309 			atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1310 		mutex_exit(&kcpc_ctx_llock[hash]);
1311 	}
1312 }
1313 
1314 /*
1315  * Interface for PCBEs to signal that an existing configuration has suddenly
1316  * become invalid.
1317  */
1318 void
1319 kcpc_invalidate_config(void *token)
1320 {
1321 	kcpc_ctx_t *ctx = token;
1322 
1323 	ASSERT(ctx != NULL);
1324 
1325 	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1326 }
1327 
1328 /*
1329  * Called from lwp_exit() and thread_exit()
1330  */
1331 void
1332 kcpc_passivate(void)
1333 {
1334 	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
1335 	kcpc_set_t *set = curthread->t_cpc_set;
1336 
1337 	if (set == NULL)
1338 		return;
1339 
1340 	/*
1341 	 * We're cleaning up after this thread; ensure there are no dangling
1342 	 * CPC pointers left behind. The context and set will be freed by
1343 	 * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in
1344 	 * the case of a CPU-bound set.
1345 	 */
1346 	curthread->t_cpc_ctx = NULL;
1347 
1348 	if (ctx == NULL) {
1349 		/*
1350 		 * This thread has a set but no context; it must be a CPU-bound
1351 		 * set. The hardware will be stopped via kcpc_unbind() when the
1352 		 * process exits and closes its file descriptors with
1353 		 * kcpc_close(). Our only job here is to clean up this thread's
1354 		 * state; the set will be freed with the unbind().
1355 		 */
1356 		(void) kcpc_unbind(set);
1357 		/*
1358 		 * Unbinding a set belonging to the current thread should clear
1359 		 * its set pointer.
1360 		 */
1361 		ASSERT(curthread->t_cpc_set == NULL);
1362 		return;
1363 	}
1364 
1365 	curthread->t_cpc_set = NULL;
1366 
1367 	/*
1368 	 * This thread/LWP is exiting but context switches will continue to
1369 	 * happen for a bit as the exit proceeds.  Kernel preemption must be
1370 	 * disabled here to prevent a race between checking or setting the
1371 	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
1372 	 * a context switch.
1373 	 */
1374 
1375 	kpreempt_disable();
1376 	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
1377 		pcbe_ops->pcbe_allstop();
1378 		atomic_or_uint(&ctx->kc_flags,
1379 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
1380 	}
1381 	kpreempt_enable();
1382 }
1383 
1384 /*
1385  * Assign the requests in the given set to the PICs in the context.
1386  * Returns 0 if successful, -1 on failure.
1387  */
1388 /*ARGSUSED*/
1389 static int
1390 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1391 {
1392 	int i;
1393 	int *picnum_save;
1394 
1395 	ASSERT(set->ks_nreqs <= cpc_ncounters);
1396 
1397 	/*
1398 	 * Provide kcpc_tryassign() with scratch space to avoid doing an
1399 	 * alloc/free with every invocation.
1400 	 */
1401 	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1402 	/*
1403 	 * kcpc_tryassign() blindly walks through each request in the set,
1404 	 * seeing if a counter can count its event. If yes, it assigns that
1405 	 * counter. However, that counter may have been the only capable counter
1406 	 * for _another_ request's event. The solution is to try every possible
1407 	 * request first. Note that this does not cover all solutions, as
1408 	 * that would require all unique orderings of requests, an n^n operation
1409 	 * which would be unacceptable for architectures with many counters.
1410 	 */
1411 	for (i = 0; i < set->ks_nreqs; i++)
1412 		if (kcpc_tryassign(set, i, picnum_save) == 0)
1413 			break;
1414 
1415 	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1416 	if (i == set->ks_nreqs)
1417 		return (-1);
1418 	return (0);
1419 }
1420 
1421 static int
1422 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1423 {
1424 	int		i;
1425 	int		j;
1426 	uint64_t	bitmap = 0, resmap = 0;
1427 	uint64_t	ctrmap;
1428 
1429 	/*
1430 	 * We are attempting to assign the reqs to pics, but we may fail. If we
1431 	 * fail, we need to restore the state of the requests to what it was
1432 	 * when we found it, as some reqs may have been explicitly assigned to
1433 	 * a specific PIC beforehand. We do this by snapshotting the assignments
1434 	 * now and restoring from it later if we fail.
1435 	 *
1436 	 * Also we note here which counters have already been claimed by
1437 	 * requests with explicit counter assignments.
1438 	 */
1439 	for (i = 0; i < set->ks_nreqs; i++) {
1440 		scratch[i] = set->ks_req[i].kr_picnum;
1441 		if (set->ks_req[i].kr_picnum != -1)
1442 			resmap |= (1 << set->ks_req[i].kr_picnum);
1443 	}
1444 
1445 	/*
1446 	 * Walk through requests assigning them to the first PIC that is
1447 	 * capable.
1448 	 */
1449 	i = starting_req;
1450 	do {
1451 		if (set->ks_req[i].kr_picnum != -1) {
1452 			ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
1453 			bitmap |= (1 << set->ks_req[i].kr_picnum);
1454 			if (++i == set->ks_nreqs)
1455 				i = 0;
1456 			continue;
1457 		}
1458 
1459 		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1460 		for (j = 0; j < cpc_ncounters; j++) {
1461 			if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
1462 			    (resmap & (1 << j)) == 0) {
1463 				/*
1464 				 * We can assign this counter because:
1465 				 *
1466 				 * 1. It can count the event (ctrmap)
1467 				 * 2. It hasn't been assigned yet (bitmap)
1468 				 * 3. It wasn't reserved by a request (resmap)
1469 				 */
1470 				bitmap |= (1 << j);
1471 				break;
1472 			}
1473 		}
1474 		if (j == cpc_ncounters) {
1475 			for (i = 0; i < set->ks_nreqs; i++)
1476 				set->ks_req[i].kr_picnum = scratch[i];
1477 			return (-1);
1478 		}
1479 		set->ks_req[i].kr_picnum = j;
1480 
1481 		if (++i == set->ks_nreqs)
1482 			i = 0;
1483 	} while (i != starting_req);
1484 
1485 	return (0);
1486 }
1487 
1488 kcpc_set_t *
1489 kcpc_dup_set(kcpc_set_t *set)
1490 {
1491 	kcpc_set_t	*new;
1492 	int		i;
1493 	int		j;
1494 
1495 	new = kmem_alloc(sizeof (*new), KM_SLEEP);
1496 	new->ks_flags = set->ks_flags;
1497 	new->ks_nreqs = set->ks_nreqs;
1498 	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1499 	    KM_SLEEP);
1500 	new->ks_data = NULL;
1501 	new->ks_ctx = NULL;
1502 
1503 	for (i = 0; i < new->ks_nreqs; i++) {
1504 		new->ks_req[i].kr_config = NULL;
1505 		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1506 		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1507 		new->ks_req[i].kr_picp = NULL;
1508 		new->ks_req[i].kr_data = NULL;
1509 		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1510 		    CPC_MAX_EVENT_LEN);
1511 		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1512 		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1513 		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1514 		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1515 		    sizeof (kcpc_attr_t), KM_SLEEP);
1516 		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1517 			new->ks_req[i].kr_attr[j].ka_val =
1518 			    set->ks_req[i].kr_attr[j].ka_val;
1519 			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1520 			    set->ks_req[i].kr_attr[j].ka_name,
1521 			    CPC_MAX_ATTR_LEN);
1522 		}
1523 	}
1524 
1525 	return (new);
1526 }
1527 
1528 int
1529 kcpc_allow_nonpriv(void *token)
1530 {
1531 	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
1532 }
1533 
1534 void
1535 kcpc_invalidate(kthread_t *t)
1536 {
1537 	kcpc_ctx_t *ctx = t->t_cpc_ctx;
1538 
1539 	if (ctx != NULL)
1540 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1541 }
1542 
1543 /*
1544  * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
1545  * are used to construct PCBE names, starting with the most specific,
1546  * "pcbe.first.second.third.fourth" and ending with the least specific,
1547  * "pcbe.first".
1548  *
1549  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1550  */
1551 int
1552 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1553 {
1554 	uint_t s[3];
1555 
1556 	s[0] = first;
1557 	s[1] = second;
1558 	s[2] = third;
1559 
1560 	return (modload_qualified("pcbe",
1561 	    "pcbe", prefix, ".", s, 3) < 0 ? -1 : 0);
1562 }
1563