/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/inttypes.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/kcpc.h>
#include <sys/cpc_impl.h>
#include <sys/cpc_pcbe.h>
#include <sys/atomic.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/sdt.h>
#if defined(__x86)
#include <asm/clock.h>
#endif

kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
kcpc_ctx_t	*kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */


krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
int		kcpc_cpuctx;		/* number of cpu-specific contexts */

int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */

/*
 * These are set when a PCBE module is loaded.
 */
uint_t		cpc_ncounters = 0;
pcbe_ops_t	*pcbe_ops = NULL;

/*
 * Statistics on (mis)behavior
 */
static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */

/*
 * Is misbehaviour (overflow in a thread with no context) fatal?
 */
#ifdef DEBUG
static int kcpc_nullctx_panic = 1;
#else
static int kcpc_nullctx_panic = 0;
#endif

static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
static void kcpc_restore(kcpc_ctx_t *ctx);
static void kcpc_save(kcpc_ctx_t *ctx);
static void kcpc_free(kcpc_ctx_t *ctx, int isexec);
static int kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode);
static void kcpc_free_configs(kcpc_set_t *set);
static kcpc_ctx_t *kcpc_ctx_alloc(void);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static void kcpc_ctx_free(kcpc_ctx_t *ctx);
static int kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);

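/*
 * Called when a PCBE (Performance Counter BackEnd) module loads; publishes
 * the backend's ops vector and the number of counters it provides.
 */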
void
kcpc_register_pcbe(pcbe_ops_t *ops)
{
	pcbe_ops = ops;
	cpc_ncounters = pcbe_ops->pcbe_ncounters();
}

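/*
 * Bind a set to the counters of the given CPU. The caller must already be
 * bound to that CPU, and only one set may be bound to a CPU at a time.
 */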
int
kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
{
	cpu_t		*cp;
	kcpc_ctx_t	*ctx;
	int		error;

	ctx = kcpc_ctx_alloc();

	if (kcpc_assign_reqs(set, ctx) != 0) {
		kcpc_ctx_free(ctx);
		*subcode = CPC_RESOURCE_UNAVAIL;
		return (EINVAL);
	}

	ctx->kc_cpuid = cpuid;
	ctx->kc_thread = curthread;

	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);

	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
		kcpc_ctx_free(ctx);
		return (error);
	}

	set->ks_ctx = ctx;
	ctx->kc_set = set;

	/*
	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
	 * we are manipulating the cpu_t and programming the hardware, else
	 * the cpu_t could go away while we're looking at it.
	 */
	mutex_enter(&cpu_lock);
	cp = cpu_get(cpuid);

	if (cp == NULL)
		/*
		 * The CPU could have been DR'd out while we were getting
		 * set up.
		 */
		goto unbound;

	mutex_enter(&cp->cpu_cpc_ctxlock);

	if (cp->cpu_cpc_ctx != NULL) {
		/*
		 * If this CPU already has a bound set, return an error.
		 */
		mutex_exit(&cp->cpu_cpc_ctxlock);
		goto unbound;
	}

	if (curthread->t_bind_cpu != cpuid) {
		mutex_exit(&cp->cpu_cpc_ctxlock);
		goto unbound;
	}
	cp->cpu_cpc_ctx = ctx;

	/*
	 * Kernel preemption must be disabled while fiddling with the hardware
	 * registers to prevent partial updates.
	 */
	kpreempt_disable();
	ctx->kc_rawtick = KCPC_GET_TICK();
	pcbe_ops->pcbe_program(ctx);
	kpreempt_enable();

	mutex_exit(&cp->cpu_cpc_ctxlock);
	mutex_exit(&cpu_lock);

	return (0);

unbound:
	mutex_exit(&cpu_lock);
	set->ks_ctx = NULL;
	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
	kcpc_ctx_free(ctx);
	return (EAGAIN);
}

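/*
 * Bind a set to a thread's virtualized counters. The target is either the
 * current thread or a victim LWP stopped by the agent LWP.
 */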
int
kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
{
	kcpc_ctx_t	*ctx;
	int		error;

	/*
	 * Only one set is allowed per context, so ensure there is no
	 * existing context.
	 */

	if (t->t_cpc_ctx != NULL)
		return (EEXIST);

	ctx = kcpc_ctx_alloc();

	/*
	 * The context must begin life frozen until it has been properly
	 * programmed onto the hardware. This prevents the context ops from
	 * worrying about it until we're ready.
	 */
	ctx->kc_flags |= KCPC_CTX_FREEZE;
	ctx->kc_hrtime = gethrtime();

	if (kcpc_assign_reqs(set, ctx) != 0) {
		kcpc_ctx_free(ctx);
		*subcode = CPC_RESOURCE_UNAVAIL;
		return (EINVAL);
	}

	ctx->kc_cpuid = -1;
	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
		ctx->kc_flags |= KCPC_CTX_LWPINHERIT;
	ctx->kc_thread = t;
	t->t_cpc_ctx = ctx;
	/*
	 * Permit threads to look at their own hardware counters from userland.
	 */
	ctx->kc_flags |= KCPC_CTX_NONPRIV;

	/*
	 * Create the data store for this set.
	 */
	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);

	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
		kcpc_ctx_free(ctx);
		t->t_cpc_ctx = NULL;
		return (error);
	}

	set->ks_ctx = ctx;
	ctx->kc_set = set;

	/*
	 * Add a device context to the subject thread.
	 */
	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
	    kcpc_lwp_create, NULL, kcpc_free);

	/*
	 * Ask the backend to program the hardware.
	 */
	if (t == curthread) {
		kpreempt_disable();
		ctx->kc_rawtick = KCPC_GET_TICK();
		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
		pcbe_ops->pcbe_program(ctx);
		kpreempt_enable();
	} else
		/*
		 * Since we are the agent LWP, we know the victim LWP is stopped
		 * until we're done here; no need to worry about preemption or
		 * migration here. We still use an atomic op to clear the flag
		 * to ensure the flags are always self-consistent; they can
		 * still be accessed from, for instance, another CPU doing a
		 * kcpc_invalidate_all().
		 */
		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);

	return (0);
}

/*
 * Walk through each request in the set and ask the PCBE to configure a
 * corresponding counter.
 */
static int
kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
{
	int		i;
	int		ret;
	kcpc_request_t	*rp;

	for (i = 0; i < set->ks_nreqs; i++) {
		int n;
		rp = &set->ks_req[i];

		n = rp->kr_picnum;

		ASSERT(n >= 0 && n < cpc_ncounters);

		ASSERT(ctx->kc_pics[n].kp_req == NULL);

		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
			    == 0) {
				*subcode = -1;
				return (ENOTSUP);
			}
			/*
			 * If any of the counters have requested overflow
			 * notification, we flag the context as being one that
			 * cares about overflow.
			 */
			ctx->kc_flags |= KCPC_CTX_SIGOVF;
		}

		rp->kr_config = NULL;
		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
		    &(rp->kr_config), (void *)ctx)) != 0) {
			kcpc_free_configs(set);
			*subcode = ret;
			if (ret == CPC_ATTR_REQUIRES_PRIVILEGE)
				return (EACCES);
			return (EINVAL);
		}

		ctx->kc_pics[n].kp_req = rp;
		rp->kr_picp = &ctx->kc_pics[n];
		rp->kr_data = set->ks_data + rp->kr_index;
		*rp->kr_data = rp->kr_preset;
	}

	return (0);
}

static void
kcpc_free_configs(kcpc_set_t *set)
{
	int i;

	for (i = 0; i < set->ks_nreqs; i++)
		if (set->ks_req[i].kr_config != NULL)
			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
}

/*
 * buf points to a user address and the data should be copied out to that
 * address in the current process.
 */
int
kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
{
	kcpc_ctx_t	*ctx = set->ks_ctx;
	uint64_t	curtick = KCPC_GET_TICK();

	if (ctx == NULL)
		return (EINVAL);
	else if (ctx->kc_flags & KCPC_CTX_INVALID)
		return (EAGAIN);

	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
		/*
		 * Kernel preemption must be disabled while reading the
		 * hardware regs, and if this is a CPU-bound context, while
		 * checking the CPU binding of the current thread.
		 */
		kpreempt_disable();

		if (ctx->kc_cpuid != -1) {
			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
				kpreempt_enable();
				return (EAGAIN);
			}
		}

		if (ctx->kc_thread == curthread) {
			ctx->kc_hrtime = gethrtime();
			pcbe_ops->pcbe_sample(ctx);
			ctx->kc_vtick += curtick - ctx->kc_rawtick;
			ctx->kc_rawtick = curtick;
		}

		kpreempt_enable();
	}

	if (copyout(set->ks_data, buf,
	    set->ks_nreqs * sizeof (uint64_t)) == -1)
		return (EFAULT);
	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
		return (EFAULT);
	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
		return (EFAULT);

	return (0);
}

/*
 * Stop the counters on the CPU this context is bound to.
 */
static void
kcpc_stop_hw(kcpc_ctx_t *ctx)
{
	cpu_t *cp;

	ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED))
	    == KCPC_CTX_INVALID);

	kpreempt_disable();

	cp = cpu_get(ctx->kc_cpuid);
	ASSERT(cp != NULL);

	if (cp == CPU) {
		pcbe_ops->pcbe_allstop();
		atomic_or_uint(&ctx->kc_flags,
		    KCPC_CTX_INVALID_STOPPED);
	} else
		kcpc_remote_stop(cp);
	kpreempt_enable();
}

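/*
 * Stop the counters for this set and disassociate its context from the
 * thread or CPU it is bound to.
 */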
int
kcpc_unbind(kcpc_set_t *set)
{
	kcpc_ctx_t	*ctx = set->ks_ctx;
	kthread_t	*t;

	if (ctx == NULL)
		return (EINVAL);

	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);

	if (ctx->kc_cpuid == -1) {
		t = ctx->kc_thread;
		/*
		 * The context is thread-bound and therefore has a device
		 * context.  It will be freed via removectx() calling
		 * freectx() calling kcpc_free().
		 */
		if (t == curthread &&
		    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
			kpreempt_disable();
			pcbe_ops->pcbe_allstop();
			atomic_or_uint(&ctx->kc_flags,
			    KCPC_CTX_INVALID_STOPPED);
			kpreempt_enable();
		}
#ifdef DEBUG
		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
		    kcpc_lwp_create, NULL, kcpc_free) == 0)
			panic("kcpc_unbind: context %p not present on "
			    "thread %p", ctx, t);
#else
		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
		    kcpc_lwp_create, NULL, kcpc_free);
#endif /* DEBUG */
		t->t_cpc_set = NULL;
		t->t_cpc_ctx = NULL;
	} else {
		/*
		 * If we are unbinding a CPU-bound set from a remote CPU, the
		 * native CPU's idle thread could be in the midst of programming
		 * this context onto the CPU. We grab the context's lock here to
		 * ensure that the idle thread is done with it. When we release
		 * the lock, the CPU no longer has a context and the idle thread
		 * will move on.
		 *
		 * cpu_lock must be held to prevent the CPU from being DR'd out
		 * while we disassociate the context from the cpu_t.
		 */
		cpu_t *cp;
		mutex_enter(&cpu_lock);
		cp = cpu_get(ctx->kc_cpuid);
		if (cp != NULL) {
			/*
			 * The CPU may have been DR'd out of the system.
			 */
			mutex_enter(&cp->cpu_cpc_ctxlock);
			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
				kcpc_stop_hw(ctx);
			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
			cp->cpu_cpc_ctx = NULL;
			mutex_exit(&cp->cpu_cpc_ctxlock);
		}
		mutex_exit(&cpu_lock);
		if (ctx->kc_thread == curthread) {
			kcpc_free(ctx, 0);
			curthread->t_cpc_set = NULL;
		}
	}

	return (0);
}

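/*
 * Update the preset (initial value) of the request with the given index in
 * a thread-bound set owned by the current thread.
 */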
int
kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
{
	int i;

	ASSERT(set != NULL);
	ASSERT(set->ks_ctx != NULL);
	ASSERT(set->ks_ctx->kc_thread == curthread);
	ASSERT(set->ks_ctx->kc_cpuid == -1);

	if (index < 0 || index >= set->ks_nreqs)
		return (EINVAL);

	for (i = 0; i < set->ks_nreqs; i++)
		if (set->ks_req[i].kr_index == index)
			break;
	ASSERT(i != set->ks_nreqs);

	set->ks_req[i].kr_preset = preset;
	return (0);
}

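/*
 * Restart the current thread's set from its presets and reprogram the
 * hardware.
 */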
int
kcpc_restart(kcpc_set_t *set)
{
	kcpc_ctx_t	*ctx = set->ks_ctx;
	int		i;

	ASSERT(ctx != NULL);
	ASSERT(ctx->kc_thread == curthread);
	ASSERT(ctx->kc_cpuid == -1);

	kpreempt_disable();

	/*
	 * If the user is doing this on a running set, make sure the counters
	 * are stopped first.
	 */
	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
		pcbe_ops->pcbe_allstop();

	for (i = 0; i < set->ks_nreqs; i++) {
		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
	}

	/*
	 * Ask the backend to program the hardware.
	 */
	ctx->kc_rawtick = KCPC_GET_TICK();
	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
	pcbe_ops->pcbe_program(ctx);
	kpreempt_enable();

	return (0);
}

/*
 * Caller must hold kcpc_cpuctx_lock.
 */
int
kcpc_enable(kthread_t *t, int cmd, int enable)
{
	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
	kcpc_set_t	*set = t->t_cpc_set;
	kcpc_set_t	*newset;
	int		i;
	int		flag;
	int		err;

	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));

	if (ctx == NULL) {
		/*
		 * This thread has a set but no context; it must be a
		 * CPU-bound set.
		 */
		ASSERT(t->t_cpc_set != NULL);
		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
		return (EINVAL);
	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
		return (EAGAIN);

	if (cmd == CPC_ENABLE) {
		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
			return (EINVAL);
		kpreempt_disable();
		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
		kcpc_restore(ctx);
		kpreempt_enable();
	} else if (cmd == CPC_DISABLE) {
		if (ctx->kc_flags & KCPC_CTX_FREEZE)
			return (EINVAL);
		kpreempt_disable();
		kcpc_save(ctx);
		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
		kpreempt_enable();
	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
		/*
		 * Strategy for usr/sys: stop counters and update set's presets
		 * with current counter values, unbind, update requests with
		 * new config, then re-bind.
		 */
		flag = (cmd == CPC_USR_EVENTS) ?
		    CPC_COUNT_USER : CPC_COUNT_SYSTEM;

		kpreempt_disable();
		atomic_or_uint(&ctx->kc_flags,
		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
		pcbe_ops->pcbe_allstop();
		kpreempt_enable();
		for (i = 0; i < set->ks_nreqs; i++) {
			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
			if (enable)
				set->ks_req[i].kr_flags |= flag;
			else
				set->ks_req[i].kr_flags &= ~flag;
		}
		newset = kcpc_dup_set(set);
		if (kcpc_unbind(set) != 0)
			return (EINVAL);
		t->t_cpc_set = newset;
		if (kcpc_bind_thread(newset, t, &err) != 0) {
			t->t_cpc_set = NULL;
			kcpc_free_set(newset);
			return (EINVAL);
		}
	} else
		return (EINVAL);

	return (0);
}

/*
 * Provide PCBEs with a way of obtaining the configs of every counter which
 * will be programmed together.
 *
 * If current is NULL, provide the first config.
 *
 * If data != NULL, caller wants to know where the data store associated with
 * the config we return is located.
 */
void *
kcpc_next_config(void *token, void *current, uint64_t **data)
{
	int		i;
	kcpc_pic_t	*pic;
	kcpc_ctx_t	*ctx = (kcpc_ctx_t *)token;

	if (current == NULL) {
		/*
		 * Client would like the first config, which may not be in
		 * counter 0; we need to search through the counters for the
		 * first config.
		 */
		for (i = 0; i < cpc_ncounters; i++)
			if (ctx->kc_pics[i].kp_req != NULL)
				break;
		/*
		 * There are no counters configured for the given context.
		 */
		if (i == cpc_ncounters)
			return (NULL);
	} else {
		/*
		 * There surely is a faster way to do this.
		 */
		for (i = 0; i < cpc_ncounters; i++) {
			pic = &ctx->kc_pics[i];

			if (pic->kp_req != NULL &&
			    current == pic->kp_req->kr_config)
				break;
		}

		/*
		 * We found the current config at picnum i. Now search for the
		 * next configured PIC.
		 */
		for (i++; i < cpc_ncounters; i++) {
			pic = &ctx->kc_pics[i];
			if (pic->kp_req != NULL)
				break;
		}

		if (i == cpc_ncounters)
			return (NULL);
	}

	if (data != NULL) {
		*data = ctx->kc_pics[i].kp_req->kr_data;
	}

	return (ctx->kc_pics[i].kp_req->kr_config);
}

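/*
 * Allocate a new context and add it to the global hash of contexts so that
 * kcpc_invalidate_all() can find it.
 */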
static kcpc_ctx_t *
kcpc_ctx_alloc(void)
{
	kcpc_ctx_t	*ctx;
	long		hash;

	ctx = (kcpc_ctx_t *)kmem_alloc(sizeof (kcpc_ctx_t), KM_SLEEP);

	hash = CPC_HASH_CTX(ctx);
	mutex_enter(&kcpc_ctx_llock[hash]);
	ctx->kc_next = kcpc_ctx_list[hash];
	kcpc_ctx_list[hash] = ctx;
	mutex_exit(&kcpc_ctx_llock[hash]);

	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
	    cpc_ncounters, KM_SLEEP);

	ctx->kc_flags = 0;
	ctx->kc_vtick = 0;
	ctx->kc_rawtick = 0;
	ctx->kc_cpuid = -1;

	return (ctx);
}

/*
 * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
 * in the flags.
 */
static void
kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
{
	kcpc_set_t	*ks = ctx->kc_set, *cks;
	int		i, j;
	int		code;

	ASSERT(ks != NULL);

	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
		return;

	cks = kmem_alloc(sizeof (*cks), KM_SLEEP);
	cctx->kc_set = cks;
	cks->ks_flags = ks->ks_flags;
	cks->ks_nreqs = ks->ks_nreqs;
	cks->ks_req = kmem_alloc(cks->ks_nreqs *
	    sizeof (kcpc_request_t), KM_SLEEP);
	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
	    KM_SLEEP);
	cks->ks_ctx = cctx;

	for (i = 0; i < cks->ks_nreqs; i++) {
		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
		(void) strncpy(cks->ks_req[i].kr_event,
		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
		if (ks->ks_req[i].kr_nattrs > 0) {
			cks->ks_req[i].kr_attr =
			    kmem_alloc(ks->ks_req[i].kr_nattrs *
			    sizeof (kcpc_attr_t), KM_SLEEP);
		}
		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
			    ks->ks_req[i].kr_attr[j].ka_name,
			    CPC_MAX_ATTR_LEN);
			cks->ks_req[i].kr_attr[j].ka_val =
			    ks->ks_req[i].kr_attr[j].ka_val;
		}
	}
	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
		panic("kcpc_ctx_clone: configure of context %p with "
		    "set %p failed with subcode %d", cctx, cks, code);
}

static void
kcpc_ctx_free(kcpc_ctx_t *ctx)
{
	kcpc_ctx_t	**loc;
	long		hash = CPC_HASH_CTX(ctx);

	mutex_enter(&kcpc_ctx_llock[hash]);
	loc = &kcpc_ctx_list[hash];
	ASSERT(*loc != NULL);
	while (*loc != ctx)
		loc = &(*loc)->kc_next;
	*loc = ctx->kc_next;
	mutex_exit(&kcpc_ctx_llock[hash]);

	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
	kmem_free(ctx, sizeof (*ctx));
}

/*
 * Generic interrupt handler used on hardware that generates
 * overflow interrupts.
 *
 * Note: executed at high-level interrupt context!
 */
/*ARGSUSED*/
kcpc_ctx_t *
kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
{
	kcpc_ctx_t	*ctx;
	kthread_t	*t = curthread;
	int		i;

	/*
	 * On both x86 and UltraSPARC, we may deliver the high-level
	 * interrupt in kernel mode, just after we've started to run an
	 * interrupt thread.  (That's because the hardware helpfully
	 * delivers the overflow interrupt some random number of cycles
	 * after the instruction that caused the overflow by which time
	 * we're in some part of the kernel, not necessarily running on
	 * the right thread).
	 *
	 * Check for this case here -- find the pinned thread
	 * that was running when the interrupt went off.
	 */
	if (t->t_flag & T_INTR_THREAD) {
		klwp_t *lwp;

		atomic_add_32(&kcpc_intrctx_count, 1);

		/*
		 * Note that t_lwp is always set to point at the underlying
		 * thread, thus this will work in the presence of nested
		 * interrupts.
		 */
		ctx = NULL;
		if ((lwp = t->t_lwp) != NULL) {
			t = lwptot(lwp);
			ctx = t->t_cpc_ctx;
		}
	} else
		ctx = t->t_cpc_ctx;

	if (ctx == NULL) {
		/*
		 * This can easily happen if we're using the counters in
		 * "shared" mode, for example, and an overflow interrupt
		 * occurs while we are running cpustat.  In that case, the
		 * bound thread that has the context that belongs to this
		 * CPU is almost certainly sleeping (if it was running on
		 * the CPU we'd have found it above), and the actual
		 * interrupted thread has no knowledge of performance counters!
		 */
		ctx = curthread->t_cpu->cpu_cpc_ctx;
		if (ctx != NULL) {
			/*
			 * Return the bound context for this CPU to
			 * the interrupt handler so that it can synchronously
			 * sample the hardware counters and restart them.
			 */
			return (ctx);
		}

		/*
		 * As long as the overflow interrupt really is delivered early
		 * enough after trapping into the kernel to avoid switching
		 * threads, we must always be able to find the cpc context,
		 * or something went terribly wrong, i.e. we ended up
		 * running a passivated interrupt thread, a kernel
		 * thread or we interrupted idle, all of which are Very Bad.
		 */
		if (kcpc_nullctx_panic)
			panic("null cpc context, thread %p", (void *)t);
		atomic_add_32(&kcpc_nullctx_count, 1);
	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
		/*
		 * Schedule an ast to sample the counters, which will
		 * propagate any overflow into the virtualized performance
		 * counter(s), and may deliver a signal.
		 */
		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
		/*
		 * If a counter has overflowed which was counting on behalf of
		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
		 * process a signal.
		 */
		for (i = 0; i < cpc_ncounters; i++) {
			if (ctx->kc_pics[i].kp_req != NULL &&
			    bitmap & (1ULL << i) &&
			    ctx->kc_pics[i].kp_req->kr_flags &
			    CPC_OVF_NOTIFY_EMT) {
				/*
				 * A signal has been requested for this PIC,
				 * so freeze the context. The interrupt handler
				 * has already stopped the counter hardware.
				 */
				atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
				    KCPC_PIC_OVERFLOWED);
			}
		}
		aston(t);
	}
	return (NULL);
}

/*
 * The current thread context had an overflow interrupt; we're
 * executing here in high-level interrupt context.
 */
/*ARGSUSED*/
uint_t
kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
{
	kcpc_ctx_t	*ctx;
	uint64_t	bitmap;

	if (pcbe_ops == NULL ||
	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
		return (DDI_INTR_UNCLAIMED);

	/*
	 * Prevent any further interrupts.
	 */
	pcbe_ops->pcbe_allstop();

	/*
	 * Invoke the "generic" handler.
	 *
	 * If the interrupt has occurred in the context of an lwp owning
	 * the counters, then the handler posts an AST to the lwp to
	 * trigger the actual sampling, and optionally deliver a signal or
	 * restart the counters, on the way out of the kernel using
	 * kcpc_overflow_ast() (see below).
	 *
	 * On the other hand, if the handler returns the context to us
	 * directly, then it means that there are no other threads in
	 * the middle of updating it, no AST has been posted, and so we
	 * should sample the counters here, and restart them with no
	 * further fuss.
	 */
	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
		uint64_t curtick = KCPC_GET_TICK();

		ctx->kc_hrtime = gethrtime_waitfree();
		ctx->kc_vtick += curtick - ctx->kc_rawtick;
		ctx->kc_rawtick = curtick;
		pcbe_ops->pcbe_sample(ctx);
		pcbe_ops->pcbe_program(ctx);
	}

	return (DDI_INTR_CLAIMED);
}

/*
 * Called from trap() when processing the ast posted by the high-level
 * interrupt handler.
 */
int
kcpc_overflow_ast()
{
	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
	int		i;
	int		found = 0;
	uint64_t	curtick = KCPC_GET_TICK();

	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */

	/*
	 * An overflow happened: sample the context to ensure that
	 * the overflow is propagated into the upper bits of the
	 * virtualized 64-bit counter(s).
	 */
	kpreempt_disable();
	ctx->kc_hrtime = gethrtime_waitfree();
	pcbe_ops->pcbe_sample(ctx);
	kpreempt_enable();

	ctx->kc_vtick += curtick - ctx->kc_rawtick;

	/*
	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
	 * if that pic generated an overflow and if the request it was counting
	 * on behalf of had CPC_OVF_NOTIFY_EMT specified. We go through all
	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
	 * found any overflowed pics, keep the context frozen and return true
	 * (thus causing a signal to be sent).
	 */
	for (i = 0; i < cpc_ncounters; i++) {
		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
			    ~KCPC_PIC_OVERFLOWED);
			found = 1;
		}
	}
	if (found)
		return (1);

	/*
	 * Otherwise, re-enable the counters and continue life as before.
	 */
	kpreempt_disable();
	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
	pcbe_ops->pcbe_program(ctx);
	kpreempt_enable();
	return (0);
}

/*
 * Called when switching away from current thread.
 */
static void
kcpc_save(kcpc_ctx_t *ctx)
{
	if (ctx->kc_flags & KCPC_CTX_INVALID) {
		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)
			return;
		/*
		 * This context has been invalidated but the counters have not
		 * been stopped. Stop them here and mark the context stopped.
		 */
		pcbe_ops->pcbe_allstop();
		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
		return;
	}

	pcbe_ops->pcbe_allstop();
	if (ctx->kc_flags & KCPC_CTX_FREEZE)
		return;

	/*
	 * Need to sample for all reqs into each req's current mpic.
	 */
	ctx->kc_hrtime = gethrtime();
	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
	pcbe_ops->pcbe_sample(ctx);
}

static void
kcpc_restore(kcpc_ctx_t *ctx)
{
	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
	    KCPC_CTX_INVALID)
		/*
		 * The context is invalidated but has not been marked stopped.
		 * We mark it as such here because we will not start the
		 * counters during this context switch.
		 */
		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);

	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE))
		return;

	/*
	 * While programming the hardware, the counters should be stopped. We
	 * don't do an explicit pcbe_allstop() here because they should have
	 * been stopped already by the last consumer.
	 */
	ctx->kc_rawtick = KCPC_GET_TICK();
	pcbe_ops->pcbe_program(ctx);
}

/*
 * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the
 * following context operators to the idle thread on each CPU. They stop the
 * counters when the idle thread is switched on, and they start them again when
 * it is switched off.
 */

/*ARGSUSED*/
void
kcpc_idle_save(struct cpu *cp)
{
	/*
	 * The idle thread shouldn't be run anywhere else.
	 */
	ASSERT(CPU == cp);

	/*
	 * We must hold the CPU's context lock to ensure the context isn't
	 * freed while we're looking at it.
	 */
	mutex_enter(&cp->cpu_cpc_ctxlock);

	if ((cp->cpu_cpc_ctx == NULL) ||
	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
		mutex_exit(&cp->cpu_cpc_ctxlock);
		return;
	}

	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
	mutex_exit(&cp->cpu_cpc_ctxlock);
}

void
kcpc_idle_restore(struct cpu *cp)
{
	/*
	 * The idle thread shouldn't be run anywhere else.
	 */
	ASSERT(CPU == cp);

	/*
	 * We must hold the CPU's context lock to ensure the context isn't
	 * freed while we're looking at it.
	 */
	mutex_enter(&cp->cpu_cpc_ctxlock);

	if ((cp->cpu_cpc_ctx == NULL) ||
	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
		mutex_exit(&cp->cpu_cpc_ctxlock);
		return;
	}

	pcbe_ops->pcbe_allstop();
	mutex_exit(&cp->cpu_cpc_ctxlock);
}

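/*
 * Context op invoked when a thread with an inheritable (LWPINHERIT) context
 * creates an LWP; the child receives a clone of the parent's context.
 */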
/*ARGSUSED*/
static void
kcpc_lwp_create(kthread_t *t, kthread_t *ct)
{
	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
	int		i;

	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
		return;

	rw_enter(&kcpc_cpuctx_lock, RW_READER);
	if (ctx->kc_flags & KCPC_CTX_INVALID) {
		rw_exit(&kcpc_cpuctx_lock);
		return;
	}
	cctx = kcpc_ctx_alloc();
	kcpc_ctx_clone(ctx, cctx);
	rw_exit(&kcpc_cpuctx_lock);

	cctx->kc_flags = ctx->kc_flags;
	cctx->kc_thread = ct;
	cctx->kc_cpuid = -1;
	ct->t_cpc_set = cctx->kc_set;
	ct->t_cpc_ctx = cctx;

	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
		kcpc_set_t *ks = cctx->kc_set;
		/*
		 * Our contract with the user requires us to immediately send
		 * an overflow signal to all children if we have the LWPINHERIT
		 * and SIGOVF flags set. In addition, all counters should be
		 * set to UINT64_MAX, and their pic's overflow flag turned on
		 * so that our trap() processing knows to send a signal.
		 */
		atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE);
		for (i = 0; i < ks->ks_nreqs; i++) {
			kcpc_request_t *kr = &ks->ks_req[i];

			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
				*(kr->kr_data) = UINT64_MAX;
				kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED;
			}
		}
		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
		aston(ct);
	}

	installctx(ct, cctx, kcpc_save, kcpc_restore,
	    NULL, kcpc_lwp_create, NULL, kcpc_free);
}

/*
 * Counter Stoppage Theory
 *
 * The counters may need to be stopped properly at the following occasions:
 *
 * 1) An LWP exits.
 * 2) A thread exits.
 * 3) An LWP performs an exec().
 * 4) A bound set is unbound.
 *
 * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may
 * need to be freed as well.
 *
 * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later
 * on when the thread is freed, kcpc_free(), called by freectx(), frees the
 * context.
 *
 * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
 *
 * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
 * been called from exec. It stops the counters _and_ frees the context.
 *
 * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
 *
 * CPU-bound counters are always stopped via kcpc_unbind().
 */

/*
 * We're being called to delete the context; we ensure that all associated
 * data structures are freed, and that the hardware is passivated if this
 * is an exec.
 */

/*ARGSUSED*/
static void
kcpc_free(kcpc_ctx_t *ctx, int isexec)
{
	int		i;
	kcpc_set_t	*set = ctx->kc_set;

	ASSERT(set != NULL);

	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);

	if (isexec) {
		/*
		 * This thread is execing, and after the exec it should not have
		 * any performance counter context. Stop the counters properly
		 * here so the system isn't surprised by an overflow interrupt
		 * later.
		 */
		if (ctx->kc_cpuid != -1) {
			cpu_t *cp;
			/*
			 * CPU-bound context; stop the appropriate CPU's ctrs.
			 * Hold cpu_lock while examining the CPU to ensure it
			 * doesn't go away.
			 */
			mutex_enter(&cpu_lock);
			cp = cpu_get(ctx->kc_cpuid);
			/*
			 * The CPU could have been DR'd out, so only stop the
			 * CPU and clear its context pointer if the CPU still
			 * exists.
			 */
			if (cp != NULL) {
				mutex_enter(&cp->cpu_cpc_ctxlock);
				kcpc_stop_hw(ctx);
				cp->cpu_cpc_ctx = NULL;
				mutex_exit(&cp->cpu_cpc_ctxlock);
			}
			mutex_exit(&cpu_lock);
			ASSERT(curthread->t_cpc_ctx == NULL);
		} else {
			/*
			 * Thread-bound context; stop _this_ CPU's counters.
			 */
			kpreempt_disable();
			pcbe_ops->pcbe_allstop();
			atomic_or_uint(&ctx->kc_flags,
			    KCPC_CTX_INVALID_STOPPED);
			kpreempt_enable();
			curthread->t_cpc_ctx = NULL;
		}

		/*
		 * Since we are being called from an exec and we know that
		 * exec is not permitted via the agent thread, we should clean
		 * up this thread's CPC state completely, and not leave dangling
		 * CPC pointers behind.
		 */
		ASSERT(ctx->kc_thread == curthread);
		curthread->t_cpc_set = NULL;
	}

	/*
	 * Walk through each request in this context's set and free the PCBE's
	 * configuration if it exists.
	 */
	for (i = 0; i < set->ks_nreqs; i++) {
		if (set->ks_req[i].kr_config != NULL)
			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
	}

	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
	kcpc_ctx_free(ctx);
	kcpc_free_set(set);
}

/*
 * Free the memory associated with a request set.
 */
void
kcpc_free_set(kcpc_set_t *set)
{
	int		i;
	kcpc_request_t	*req;

	ASSERT(set->ks_req != NULL);

	for (i = 0; i < set->ks_nreqs; i++) {
		req = &set->ks_req[i];

		if (req->kr_nattrs != 0) {
			kmem_free(req->kr_attr,
			    req->kr_nattrs * sizeof (kcpc_attr_t));
		}
	}

	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
	kmem_free(set, sizeof (kcpc_set_t));
}

/*
 * Grab every existing context and mark it as invalid.
 */
void
kcpc_invalidate_all(void)
{
	kcpc_ctx_t *ctx;
	long hash;

	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
		mutex_enter(&kcpc_ctx_llock[hash]);
		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
			atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
		mutex_exit(&kcpc_ctx_llock[hash]);
	}
}

/*
 * Called from lwp_exit() and thread_exit()
 */
void
kcpc_passivate(void)
{
	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
	kcpc_set_t *set = curthread->t_cpc_set;

	if (set == NULL)
		return;

	/*
	 * We're cleaning up after this thread; ensure there are no dangling
	 * CPC pointers left behind. The context and set will be freed by
	 * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in
	 * the case of a CPU-bound set.
	 */
	curthread->t_cpc_ctx = NULL;

	if (ctx == NULL) {
		/*
		 * This thread has a set but no context; it must be a CPU-bound
		 * set. The hardware will be stopped via kcpc_unbind() when the
		 * process exits and closes its file descriptors with
		 * kcpc_close(). Our only job here is to clean up this thread's
		 * state; the set will be freed with the unbind().
		 */
		(void) kcpc_unbind(set);
		/*
		 * Unbinding a set belonging to the current thread should clear
		 * its set pointer.
		 */
		ASSERT(curthread->t_cpc_set == NULL);
		return;
	}

	curthread->t_cpc_set = NULL;

	/*
	 * This thread/LWP is exiting but context switches will continue to
	 * happen for a bit as the exit proceeds.  Kernel preemption must be
	 * disabled here to prevent a race between checking or setting the
	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
	 * a context switch.
	 */

	kpreempt_disable();
	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
		pcbe_ops->pcbe_allstop();
		atomic_or_uint(&ctx->kc_flags,
		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
	}
	kpreempt_enable();
}

/*
 * Assign the requests in the given set to the PICs in the context.
 * Returns 0 if successful, -1 on failure.
 */
/*ARGSUSED*/
static int
kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
{
	int i;
	int *picnum_save;

	ASSERT(set->ks_nreqs <= cpc_ncounters);

	/*
	 * Provide kcpc_tryassign() with scratch space to avoid doing an
	 * alloc/free with every invocation.
	 */
	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
	/*
	 * kcpc_tryassign() blindly walks through each request in the set,
	 * seeing if a counter can count its event. If yes, it assigns that
	 * counter. However, that counter may have been the only capable
	 * counter for _another_ request's event. The solution is to try
	 * every possible starting request. Note that this does not cover all
	 * solutions, as that would require trying all unique orderings of
	 * requests, an n! operation which would be unacceptable for
	 * architectures with many counters.
	 */
	for (i = 0; i < set->ks_nreqs; i++)
		if (kcpc_tryassign(set, i, picnum_save) == 0)
			break;

	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
	if (i == set->ks_nreqs)
		return (-1);
	return (0);
}

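/*
 * Attempt to assign a counter to each request in the set, considering the
 * requests in a cyclic order beginning at starting_req. Returns 0 on
 * success, -1 if some request could not be assigned a capable counter.
 */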
static int
kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
{
	int		i;
	int		j;
	uint64_t	bitmap = 0, resmap = 0;
	uint64_t	ctrmap;

	/*
	 * We are attempting to assign the reqs to pics, but we may fail. If we
	 * fail, we need to restore the state of the requests to what it was
	 * when we found it, as some reqs may have been explicitly assigned to
	 * a specific PIC beforehand. We do this by snapshotting the assignments
	 * now and restoring from it later if we fail.
	 *
	 * Also we note here which counters have already been claimed by
	 * requests with explicit counter assignments.
	 */
	for (i = 0; i < set->ks_nreqs; i++) {
		scratch[i] = set->ks_req[i].kr_picnum;
		if (set->ks_req[i].kr_picnum != -1)
			resmap |= (1ULL << set->ks_req[i].kr_picnum);
	}

	/*
	 * Walk through requests assigning them to the first PIC that is
	 * capable.
	 */
	i = starting_req;
	do {
		if (set->ks_req[i].kr_picnum != -1) {
			ASSERT((bitmap &
			    (1ULL << set->ks_req[i].kr_picnum)) == 0);
			bitmap |= (1ULL << set->ks_req[i].kr_picnum);
			if (++i == set->ks_nreqs)
				i = 0;
			continue;
		}

		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
		for (j = 0; j < cpc_ncounters; j++) {
			if (ctrmap & (1ULL << j) &&
			    (bitmap & (1ULL << j)) == 0 &&
			    (resmap & (1ULL << j)) == 0) {
				/*
				 * We can assign this counter because:
				 *
				 * 1. It can count the event (ctrmap)
				 * 2. It hasn't been assigned yet (bitmap)
				 * 3. It wasn't reserved by a request (resmap)
				 */
				bitmap |= (1ULL << j);
				break;
			}
		}
		if (j == cpc_ncounters) {
			for (i = 0; i < set->ks_nreqs; i++)
				set->ks_req[i].kr_picnum = scratch[i];
			return (-1);
		}
		set->ks_req[i].kr_picnum = j;

		if (++i == set->ks_nreqs)
			i = 0;
	} while (i != starting_req);

	return (0);
}

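/*
 * Make a deep, unbound copy of the given set; used by kcpc_enable() when a
 * set must be rebound with updated request flags.
 */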
kcpc_set_t *
kcpc_dup_set(kcpc_set_t *set)
{
	kcpc_set_t	*new;
	int		i;
	int		j;

	new = kmem_alloc(sizeof (*new), KM_SLEEP);
	new->ks_flags = set->ks_flags;
	new->ks_nreqs = set->ks_nreqs;
	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
	    KM_SLEEP);
	new->ks_data = NULL;
	new->ks_ctx = NULL;

	for (i = 0; i < new->ks_nreqs; i++) {
		new->ks_req[i].kr_config = NULL;
		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
		new->ks_req[i].kr_picp = NULL;
		new->ks_req[i].kr_data = NULL;
		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
		    CPC_MAX_EVENT_LEN);
		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
		    sizeof (kcpc_attr_t), KM_SLEEP);
		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
			new->ks_req[i].kr_attr[j].ka_val =
			    set->ks_req[i].kr_attr[j].ka_val;
			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
			    set->ks_req[i].kr_attr[j].ka_name,
			    CPC_MAX_ATTR_LEN);
		}
	}

	return (new);
}

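/*
 * Return nonzero if the context identified by the given token permits
 * non-privileged (userland) access to the counters.
 */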
int
kcpc_allow_nonpriv(void *token)
{
	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
}

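/*
 * Mark the given thread's context, if any, invalid; the counters will be
 * stopped at the next context switch.
 */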
void
kcpc_invalidate(kthread_t *t)
{
	kcpc_ctx_t *ctx = t->t_cpc_ctx;

	if (ctx != NULL)
		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
}

/*
 * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
 * are used to construct PCBE names, starting with the most specific,
 * "pcbe.first.second.third.fourth" and ending with the least specific,
 * "pcbe.first".
 *
 * Returns 0 if a PCBE was successfully loaded and -1 upon error.
 */
int
kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
{
	char	modname[PCBE_NAMELEN];
	char	stub[PCBE_NAMELEN];

	if (prefix != NULL)
		(void) snprintf(stub, PCBE_NAMELEN, "pcbe.%s", prefix);
	else
		(void) snprintf(stub, PCBE_NAMELEN, "pcbe");

	(void) snprintf(modname, PCBE_NAMELEN, "%s.%u.%u.%u",
	    stub, first, second, third);

	DTRACE_PROBE1(kcpc__pcbe__spec, char *, modname);

	if (modload("pcbe", modname) >= 0)
		return (0);

	(void) snprintf(modname, PCBE_NAMELEN, "%s.%u.%u",
	    stub, first, second);
	if (modload("pcbe", modname) >= 0)
		return (0);

	(void) snprintf(modname, PCBE_NAMELEN, "%s.%u", stub, first);
	if (modload("pcbe", modname) >= 0)
		return (0);

	if (prefix == NULL)
		/*
		 * If no prefix was given, we have tried all possible
		 * PCBE names.
		 */
		return (-1);

	(void) snprintf(modname, PCBE_NAMELEN, "%s", stub);
	if (modload("pcbe", modname) >= 0)
		return (0);

	return (-1);
}