xref: /titanic_50/usr/src/uts/common/os/kcpc.c (revision a0de58d66161bc0edbe32c2a250d723f4daf2124)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/thread.h>
29 #include <sys/cpuvar.h>
30 #include <sys/inttypes.h>
31 #include <sys/cmn_err.h>
32 #include <sys/time.h>
33 #include <sys/ksynch.h>
34 #include <sys/systm.h>
35 #include <sys/kcpc.h>
36 #include <sys/cpc_impl.h>
37 #include <sys/cpc_pcbe.h>
38 #include <sys/atomic.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/sdt.h>
42 #if defined(__x86)
43 #include <asm/clock.h>
44 #endif
45 
/*
 * Every context allocated by kcpc_ctx_alloc() is linked onto one of these
 * lists; the bucket is chosen with CPC_HASH_CTX() and each bucket has its own
 * lock.
 */
kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
kcpc_ctx_t	*kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */


krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
int		kcpc_cpuctx;		/* number of cpu-specific contexts */

int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */

/*
 * These are set when a PCBE module is loaded (see kcpc_register_pcbe()).
 */
uint_t		cpc_ncounters = 0;
pcbe_ops_t	*pcbe_ops = NULL;

/*
 * Statistics on (mis)behavior; both are bumped atomically from
 * kcpc_overflow_intr().
 */
static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */

/*
 * Is misbehaviour (overflow in a thread with no context) fatal?
 * Fatal by default on DEBUG kernels only.
 */
#ifdef DEBUG
static int kcpc_nullctx_panic = 1;
#else
static int kcpc_nullctx_panic = 0;
#endif

/*
 * Forward declarations of file-local helpers.
 */
static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
static void kcpc_restore(kcpc_ctx_t *ctx);
static void kcpc_save(kcpc_ctx_t *ctx);
static void kcpc_free(kcpc_ctx_t *ctx, int isexec);
static int kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode);
static void kcpc_free_configs(kcpc_set_t *set);
static kcpc_ctx_t *kcpc_ctx_alloc(void);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static void kcpc_ctx_free(kcpc_ctx_t *ctx);
static int kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
88 
89 void
90 kcpc_register_pcbe(pcbe_ops_t *ops)
91 {
92 	pcbe_ops = ops;
93 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
94 }
95 
96 int
97 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
98 {
99 	cpu_t		*cp;
100 	kcpc_ctx_t	*ctx;
101 	int		error;
102 
103 	ctx = kcpc_ctx_alloc();
104 
105 	if (kcpc_assign_reqs(set, ctx) != 0) {
106 		kcpc_ctx_free(ctx);
107 		*subcode = CPC_RESOURCE_UNAVAIL;
108 		return (EINVAL);
109 	}
110 
111 	ctx->kc_cpuid = cpuid;
112 	ctx->kc_thread = curthread;
113 
114 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
115 
116 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
117 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
118 		kcpc_ctx_free(ctx);
119 		return (error);
120 	}
121 
122 	set->ks_ctx = ctx;
123 	ctx->kc_set = set;
124 
125 	/*
126 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
127 	 * we are manipulating the cpu_t and programming the hardware, else the
128 	 * the cpu_t could go away while we're looking at it.
129 	 */
130 	mutex_enter(&cpu_lock);
131 	cp = cpu_get(cpuid);
132 
133 	if (cp == NULL)
134 		/*
135 		 * The CPU could have been DRd out while we were getting set up.
136 		 */
137 		goto unbound;
138 
139 	mutex_enter(&cp->cpu_cpc_ctxlock);
140 
141 	if (cp->cpu_cpc_ctx != NULL) {
142 		/*
143 		 * If this CPU already has a bound set, return an error.
144 		 */
145 		mutex_exit(&cp->cpu_cpc_ctxlock);
146 		goto unbound;
147 	}
148 
149 	if (curthread->t_bind_cpu != cpuid) {
150 		mutex_exit(&cp->cpu_cpc_ctxlock);
151 		goto unbound;
152 	}
153 	cp->cpu_cpc_ctx = ctx;
154 
155 	/*
156 	 * Kernel preemption must be disabled while fiddling with the hardware
157 	 * registers to prevent partial updates.
158 	 */
159 	kpreempt_disable();
160 	ctx->kc_rawtick = KCPC_GET_TICK();
161 	pcbe_ops->pcbe_program(ctx);
162 	kpreempt_enable();
163 
164 	mutex_exit(&cp->cpu_cpc_ctxlock);
165 	mutex_exit(&cpu_lock);
166 
167 	mutex_enter(&set->ks_lock);
168 	set->ks_state |= KCPC_SET_BOUND;
169 	cv_signal(&set->ks_condv);
170 	mutex_exit(&set->ks_lock);
171 
172 	return (0);
173 
174 unbound:
175 	mutex_exit(&cpu_lock);
176 	set->ks_ctx = NULL;
177 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
178 	kcpc_ctx_free(ctx);
179 	return (EAGAIN);
180 }
181 
182 int
183 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
184 {
185 	kcpc_ctx_t	*ctx;
186 	int		error;
187 
188 	/*
189 	 * Only one set is allowed per context, so ensure there is no
190 	 * existing context.
191 	 */
192 
193 	if (t->t_cpc_ctx != NULL)
194 		return (EEXIST);
195 
196 	ctx = kcpc_ctx_alloc();
197 
198 	/*
199 	 * The context must begin life frozen until it has been properly
200 	 * programmed onto the hardware. This prevents the context ops from
201 	 * worrying about it until we're ready.
202 	 */
203 	ctx->kc_flags |= KCPC_CTX_FREEZE;
204 	ctx->kc_hrtime = gethrtime();
205 
206 	if (kcpc_assign_reqs(set, ctx) != 0) {
207 		kcpc_ctx_free(ctx);
208 		*subcode = CPC_RESOURCE_UNAVAIL;
209 		return (EINVAL);
210 	}
211 
212 	ctx->kc_cpuid = -1;
213 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
214 		ctx->kc_flags |= KCPC_CTX_LWPINHERIT;
215 	ctx->kc_thread = t;
216 	t->t_cpc_ctx = ctx;
217 	/*
218 	 * Permit threads to look at their own hardware counters from userland.
219 	 */
220 	ctx->kc_flags |= KCPC_CTX_NONPRIV;
221 
222 	/*
223 	 * Create the data store for this set.
224 	 */
225 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
226 
227 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
228 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
229 		kcpc_ctx_free(ctx);
230 		t->t_cpc_ctx = NULL;
231 		return (error);
232 	}
233 
234 	set->ks_ctx = ctx;
235 	ctx->kc_set = set;
236 
237 	/*
238 	 * Add a device context to the subject thread.
239 	 */
240 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
241 	    kcpc_lwp_create, NULL, kcpc_free);
242 
243 	/*
244 	 * Ask the backend to program the hardware.
245 	 */
246 	if (t == curthread) {
247 		kpreempt_disable();
248 		ctx->kc_rawtick = KCPC_GET_TICK();
249 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
250 		pcbe_ops->pcbe_program(ctx);
251 		kpreempt_enable();
252 	} else
253 		/*
254 		 * Since we are the agent LWP, we know the victim LWP is stopped
255 		 * until we're done here; no need to worry about preemption or
256 		 * migration here. We still use an atomic op to clear the flag
257 		 * to ensure the flags are always self-consistent; they can
258 		 * still be accessed from, for instance, another CPU doing a
259 		 * kcpc_invalidate_all().
260 		 */
261 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
262 
263 	mutex_enter(&set->ks_lock);
264 	set->ks_state |= KCPC_SET_BOUND;
265 	cv_signal(&set->ks_condv);
266 	mutex_exit(&set->ks_lock);
267 
268 	return (0);
269 }
270 
271 /*
272  * Walk through each request in the set and ask the PCBE to configure a
273  * corresponding counter.
274  */
275 static int
276 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
277 {
278 	int		i;
279 	int		ret;
280 	kcpc_request_t	*rp;
281 
282 	for (i = 0; i < set->ks_nreqs; i++) {
283 		int n;
284 		rp = &set->ks_req[i];
285 
286 		n = rp->kr_picnum;
287 
288 		ASSERT(n >= 0 && n < cpc_ncounters);
289 
290 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
291 
292 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
293 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
294 			    == 0) {
295 				*subcode = -1;
296 				return (ENOTSUP);
297 			}
298 			/*
299 			 * If any of the counters have requested overflow
300 			 * notification, we flag the context as being one that
301 			 * cares about overflow.
302 			 */
303 			ctx->kc_flags |= KCPC_CTX_SIGOVF;
304 		}
305 
306 		rp->kr_config = NULL;
307 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
308 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
309 		    &(rp->kr_config), (void *)ctx)) != 0) {
310 			kcpc_free_configs(set);
311 			*subcode = ret;
312 			switch (ret) {
313 			case CPC_ATTR_REQUIRES_PRIVILEGE:
314 			case CPC_HV_NO_ACCESS:
315 				return (EACCES);
316 			default:
317 				return (EINVAL);
318 			}
319 		}
320 
321 		ctx->kc_pics[n].kp_req = rp;
322 		rp->kr_picp = &ctx->kc_pics[n];
323 		rp->kr_data = set->ks_data + rp->kr_index;
324 		*rp->kr_data = rp->kr_preset;
325 	}
326 
327 	return (0);
328 }
329 
330 static void
331 kcpc_free_configs(kcpc_set_t *set)
332 {
333 	int i;
334 
335 	for (i = 0; i < set->ks_nreqs; i++)
336 		if (set->ks_req[i].kr_config != NULL)
337 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
338 }
339 
340 /*
341  * buf points to a user address and the data should be copied out to that
342  * address in the current process.
343  */
344 int
345 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
346 {
347 	kcpc_ctx_t	*ctx = set->ks_ctx;
348 	uint64_t	curtick = KCPC_GET_TICK();
349 
350 	mutex_enter(&set->ks_lock);
351 	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
352 		mutex_exit(&set->ks_lock);
353 		return (EINVAL);
354 	}
355 	mutex_exit(&set->ks_lock);
356 
357 	if (ctx->kc_flags & KCPC_CTX_INVALID)
358 		return (EAGAIN);
359 
360 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
361 		/*
362 		 * Kernel preemption must be disabled while reading the
363 		 * hardware regs, and if this is a CPU-bound context, while
364 		 * checking the CPU binding of the current thread.
365 		 */
366 		kpreempt_disable();
367 
368 		if (ctx->kc_cpuid != -1) {
369 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
370 				kpreempt_enable();
371 				return (EAGAIN);
372 			}
373 		}
374 
375 		if (ctx->kc_thread == curthread) {
376 			ctx->kc_hrtime = gethrtime();
377 			pcbe_ops->pcbe_sample(ctx);
378 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
379 			ctx->kc_rawtick = curtick;
380 		}
381 
382 		kpreempt_enable();
383 
384 		/*
385 		 * The config may have been invalidated by
386 		 * the pcbe_sample op.
387 		 */
388 		if (ctx->kc_flags & KCPC_CTX_INVALID)
389 			return (EAGAIN);
390 	}
391 
392 	if (copyout(set->ks_data, buf,
393 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
394 		return (EFAULT);
395 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
396 		return (EFAULT);
397 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
398 		return (EFAULT);
399 
400 	return (0);
401 }
402 
403 /*
404  * Stop the counters on the CPU this context is bound to.
405  */
406 static void
407 kcpc_stop_hw(kcpc_ctx_t *ctx)
408 {
409 	cpu_t *cp;
410 
411 	ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED))
412 	    == KCPC_CTX_INVALID);
413 
414 	kpreempt_disable();
415 
416 	cp = cpu_get(ctx->kc_cpuid);
417 	ASSERT(cp != NULL);
418 
419 	if (cp == CPU) {
420 		pcbe_ops->pcbe_allstop();
421 		atomic_or_uint(&ctx->kc_flags,
422 		    KCPC_CTX_INVALID_STOPPED);
423 	} else
424 		kcpc_remote_stop(cp);
425 	kpreempt_enable();
426 }
427 
428 int
429 kcpc_unbind(kcpc_set_t *set)
430 {
431 	kcpc_ctx_t	*ctx;
432 	kthread_t	*t;
433 
434 	/*
435 	 * We could be racing with the process's agent thread as it
436 	 * binds the set; we must wait for the set to finish binding
437 	 * before attempting to tear it down.
438 	 */
439 	mutex_enter(&set->ks_lock);
440 	while ((set->ks_state & KCPC_SET_BOUND) == 0)
441 		cv_wait(&set->ks_condv, &set->ks_lock);
442 	mutex_exit(&set->ks_lock);
443 
444 	ctx = set->ks_ctx;
445 
446 	/*
447 	 * Use kc_lock to synchronize with kcpc_restore().
448 	 */
449 	mutex_enter(&ctx->kc_lock);
450 	ctx->kc_flags |= KCPC_CTX_INVALID;
451 	mutex_exit(&ctx->kc_lock);
452 
453 	if (ctx->kc_cpuid == -1) {
454 		t = ctx->kc_thread;
455 		/*
456 		 * The context is thread-bound and therefore has a device
457 		 * context.  It will be freed via removectx() calling
458 		 * freectx() calling kcpc_free().
459 		 */
460 		if (t == curthread &&
461 		    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
462 			kpreempt_disable();
463 			pcbe_ops->pcbe_allstop();
464 			atomic_or_uint(&ctx->kc_flags,
465 			    KCPC_CTX_INVALID_STOPPED);
466 			kpreempt_enable();
467 		}
468 #ifdef DEBUG
469 		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
470 		    kcpc_lwp_create, NULL, kcpc_free) == 0)
471 			panic("kcpc_unbind: context %p not preset on thread %p",
472 			    (void *)ctx, (void *)t);
473 #else
474 		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
475 		    kcpc_lwp_create, NULL, kcpc_free);
476 #endif /* DEBUG */
477 		t->t_cpc_set = NULL;
478 		t->t_cpc_ctx = NULL;
479 	} else {
480 		/*
481 		 * If we are unbinding a CPU-bound set from a remote CPU, the
482 		 * native CPU's idle thread could be in the midst of programming
483 		 * this context onto the CPU. We grab the context's lock here to
484 		 * ensure that the idle thread is done with it. When we release
485 		 * the lock, the CPU no longer has a context and the idle thread
486 		 * will move on.
487 		 *
488 		 * cpu_lock must be held to prevent the CPU from being DR'd out
489 		 * while we disassociate the context from the cpu_t.
490 		 */
491 		cpu_t *cp;
492 		mutex_enter(&cpu_lock);
493 		cp = cpu_get(ctx->kc_cpuid);
494 		if (cp != NULL) {
495 			/*
496 			 * The CPU may have been DR'd out of the system.
497 			 */
498 			mutex_enter(&cp->cpu_cpc_ctxlock);
499 			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
500 				kcpc_stop_hw(ctx);
501 			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
502 			cp->cpu_cpc_ctx = NULL;
503 			mutex_exit(&cp->cpu_cpc_ctxlock);
504 		}
505 		mutex_exit(&cpu_lock);
506 		if (ctx->kc_thread == curthread) {
507 			kcpc_free(ctx, 0);
508 			curthread->t_cpc_set = NULL;
509 		}
510 	}
511 
512 	return (0);
513 }
514 
515 int
516 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
517 {
518 	int i;
519 
520 	ASSERT(set != NULL);
521 	ASSERT(set->ks_state & KCPC_SET_BOUND);
522 	ASSERT(set->ks_ctx->kc_thread == curthread);
523 	ASSERT(set->ks_ctx->kc_cpuid == -1);
524 
525 	if (index < 0 || index >= set->ks_nreqs)
526 		return (EINVAL);
527 
528 	for (i = 0; i < set->ks_nreqs; i++)
529 		if (set->ks_req[i].kr_index == index)
530 			break;
531 	ASSERT(i != set->ks_nreqs);
532 
533 	set->ks_req[i].kr_preset = preset;
534 	return (0);
535 }
536 
537 int
538 kcpc_restart(kcpc_set_t *set)
539 {
540 	kcpc_ctx_t	*ctx = set->ks_ctx;
541 	int		i;
542 
543 	ASSERT(set->ks_state & KCPC_SET_BOUND);
544 	ASSERT(ctx->kc_thread == curthread);
545 	ASSERT(ctx->kc_cpuid == -1);
546 
547 	kpreempt_disable();
548 
549 	/*
550 	 * If the user is doing this on a running set, make sure the counters
551 	 * are stopped first.
552 	 */
553 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
554 		pcbe_ops->pcbe_allstop();
555 
556 	for (i = 0; i < set->ks_nreqs; i++) {
557 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
558 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
559 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
560 	}
561 
562 	/*
563 	 * Ask the backend to program the hardware.
564 	 */
565 	ctx->kc_rawtick = KCPC_GET_TICK();
566 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
567 	pcbe_ops->pcbe_program(ctx);
568 	kpreempt_enable();
569 
570 	return (0);
571 }
572 
573 /*
574  * Caller must hold kcpc_cpuctx_lock.
575  */
576 int
577 kcpc_enable(kthread_t *t, int cmd, int enable)
578 {
579 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
580 	kcpc_set_t	*set = t->t_cpc_set;
581 	kcpc_set_t	*newset;
582 	int		i;
583 	int		flag;
584 	int		err;
585 
586 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
587 
588 	if (ctx == NULL) {
589 		/*
590 		 * This thread has a set but no context; it must be a
591 		 * CPU-bound set.
592 		 */
593 		ASSERT(t->t_cpc_set != NULL);
594 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
595 		return (EINVAL);
596 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
597 		return (EAGAIN);
598 
599 	if (cmd == CPC_ENABLE) {
600 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
601 			return (EINVAL);
602 		kpreempt_disable();
603 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
604 		kcpc_restore(ctx);
605 		kpreempt_enable();
606 	} else if (cmd == CPC_DISABLE) {
607 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
608 			return (EINVAL);
609 		kpreempt_disable();
610 		kcpc_save(ctx);
611 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
612 		kpreempt_enable();
613 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
614 		/*
615 		 * Strategy for usr/sys: stop counters and update set's presets
616 		 * with current counter values, unbind, update requests with
617 		 * new config, then re-bind.
618 		 */
619 		flag = (cmd == CPC_USR_EVENTS) ?
620 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
621 
622 		kpreempt_disable();
623 		atomic_or_uint(&ctx->kc_flags,
624 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
625 		pcbe_ops->pcbe_allstop();
626 		kpreempt_enable();
627 		for (i = 0; i < set->ks_nreqs; i++) {
628 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
629 			if (enable)
630 				set->ks_req[i].kr_flags |= flag;
631 			else
632 				set->ks_req[i].kr_flags &= ~flag;
633 		}
634 		newset = kcpc_dup_set(set);
635 		if (kcpc_unbind(set) != 0)
636 			return (EINVAL);
637 		t->t_cpc_set = newset;
638 		if (kcpc_bind_thread(newset, t, &err) != 0) {
639 			t->t_cpc_set = NULL;
640 			kcpc_free_set(newset);
641 			return (EINVAL);
642 		}
643 	} else
644 		return (EINVAL);
645 
646 	return (0);
647 }
648 
649 /*
650  * Provide PCBEs with a way of obtaining the configs of every counter which will
651  * be programmed together.
652  *
653  * If current is NULL, provide the first config.
654  *
655  * If data != NULL, caller wants to know where the data store associated with
656  * the config we return is located.
657  */
658 void *
659 kcpc_next_config(void *token, void *current, uint64_t **data)
660 {
661 	int		i;
662 	kcpc_pic_t	*pic;
663 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
664 
665 	if (current == NULL) {
666 		/*
667 		 * Client would like the first config, which may not be in
668 		 * counter 0; we need to search through the counters for the
669 		 * first config.
670 		 */
671 		for (i = 0; i < cpc_ncounters; i++)
672 			if (ctx->kc_pics[i].kp_req != NULL)
673 				break;
674 		/*
675 		 * There are no counters configured for the given context.
676 		 */
677 		if (i == cpc_ncounters)
678 			return (NULL);
679 	} else {
680 		/*
681 		 * There surely is a faster way to do this.
682 		 */
683 		for (i = 0; i < cpc_ncounters; i++) {
684 			pic = &ctx->kc_pics[i];
685 
686 			if (pic->kp_req != NULL &&
687 			    current == pic->kp_req->kr_config)
688 				break;
689 		}
690 
691 		/*
692 		 * We found the current config at picnum i. Now search for the
693 		 * next configured PIC.
694 		 */
695 		for (i++; i < cpc_ncounters; i++) {
696 			pic = &ctx->kc_pics[i];
697 			if (pic->kp_req != NULL)
698 				break;
699 		}
700 
701 		if (i == cpc_ncounters)
702 			return (NULL);
703 	}
704 
705 	if (data != NULL) {
706 		*data = ctx->kc_pics[i].kp_req->kr_data;
707 	}
708 
709 	return (ctx->kc_pics[i].kp_req->kr_config);
710 }
711 
712 
713 static kcpc_ctx_t *
714 kcpc_ctx_alloc(void)
715 {
716 	kcpc_ctx_t	*ctx;
717 	long		hash;
718 
719 	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), KM_SLEEP);
720 
721 	hash = CPC_HASH_CTX(ctx);
722 	mutex_enter(&kcpc_ctx_llock[hash]);
723 	ctx->kc_next = kcpc_ctx_list[hash];
724 	kcpc_ctx_list[hash] = ctx;
725 	mutex_exit(&kcpc_ctx_llock[hash]);
726 
727 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
728 	    cpc_ncounters, KM_SLEEP);
729 
730 	ctx->kc_cpuid = -1;
731 
732 	return (ctx);
733 }
734 
735 /*
736  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
737  * in the flags.
738  */
739 static void
740 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
741 {
742 	kcpc_set_t	*ks = ctx->kc_set, *cks;
743 	int		i, j;
744 	int		code;
745 
746 	ASSERT(ks != NULL);
747 
748 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
749 		return;
750 
751 	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
752 	cks->ks_state &= ~KCPC_SET_BOUND;
753 	cctx->kc_set = cks;
754 	cks->ks_flags = ks->ks_flags;
755 	cks->ks_nreqs = ks->ks_nreqs;
756 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
757 	    sizeof (kcpc_request_t), KM_SLEEP);
758 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
759 	    KM_SLEEP);
760 	cks->ks_ctx = cctx;
761 
762 	for (i = 0; i < cks->ks_nreqs; i++) {
763 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
764 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
765 		(void) strncpy(cks->ks_req[i].kr_event,
766 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
767 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
768 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
769 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
770 		if (ks->ks_req[i].kr_nattrs > 0) {
771 			cks->ks_req[i].kr_attr =
772 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
773 			    sizeof (kcpc_attr_t), KM_SLEEP);
774 		}
775 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
776 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
777 			    ks->ks_req[i].kr_attr[j].ka_name,
778 			    CPC_MAX_ATTR_LEN);
779 			cks->ks_req[i].kr_attr[j].ka_val =
780 			    ks->ks_req[i].kr_attr[j].ka_val;
781 		}
782 	}
783 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
784 		kcpc_invalidate_config(cctx);
785 
786 	mutex_enter(&cks->ks_lock);
787 	cks->ks_state |= KCPC_SET_BOUND;
788 	cv_signal(&cks->ks_condv);
789 	mutex_exit(&cks->ks_lock);
790 }
791 
792 
793 static void
794 kcpc_ctx_free(kcpc_ctx_t *ctx)
795 {
796 	kcpc_ctx_t	**loc;
797 	long		hash = CPC_HASH_CTX(ctx);
798 
799 	mutex_enter(&kcpc_ctx_llock[hash]);
800 	loc = &kcpc_ctx_list[hash];
801 	ASSERT(*loc != NULL);
802 	while (*loc != ctx)
803 		loc = &(*loc)->kc_next;
804 	*loc = ctx->kc_next;
805 	mutex_exit(&kcpc_ctx_llock[hash]);
806 
807 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
808 	cv_destroy(&ctx->kc_condv);
809 	mutex_destroy(&ctx->kc_lock);
810 	kmem_free(ctx, sizeof (*ctx));
811 }
812 
813 /*
814  * Generic interrupt handler used on hardware that generates
815  * overflow interrupts.
816  *
817  * Note: executed at high-level interrupt context!
818  */
819 /*ARGSUSED*/
820 kcpc_ctx_t *
821 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
822 {
823 	kcpc_ctx_t	*ctx;
824 	kthread_t	*t = curthread;
825 	int		i;
826 
827 	/*
828 	 * On both x86 and UltraSPARC, we may deliver the high-level
829 	 * interrupt in kernel mode, just after we've started to run an
830 	 * interrupt thread.  (That's because the hardware helpfully
831 	 * delivers the overflow interrupt some random number of cycles
832 	 * after the instruction that caused the overflow by which time
833 	 * we're in some part of the kernel, not necessarily running on
834 	 * the right thread).
835 	 *
836 	 * Check for this case here -- find the pinned thread
837 	 * that was running when the interrupt went off.
838 	 */
839 	if (t->t_flag & T_INTR_THREAD) {
840 		klwp_t *lwp;
841 
842 		atomic_add_32(&kcpc_intrctx_count, 1);
843 
844 		/*
845 		 * Note that t_lwp is always set to point at the underlying
846 		 * thread, thus this will work in the presence of nested
847 		 * interrupts.
848 		 */
849 		ctx = NULL;
850 		if ((lwp = t->t_lwp) != NULL) {
851 			t = lwptot(lwp);
852 			ctx = t->t_cpc_ctx;
853 		}
854 	} else
855 		ctx = t->t_cpc_ctx;
856 
857 	if (ctx == NULL) {
858 		/*
859 		 * This can easily happen if we're using the counters in
860 		 * "shared" mode, for example, and an overflow interrupt
861 		 * occurs while we are running cpustat.  In that case, the
862 		 * bound thread that has the context that belongs to this
863 		 * CPU is almost certainly sleeping (if it was running on
864 		 * the CPU we'd have found it above), and the actual
865 		 * interrupted thread has no knowledge of performance counters!
866 		 */
867 		ctx = curthread->t_cpu->cpu_cpc_ctx;
868 		if (ctx != NULL) {
869 			/*
870 			 * Return the bound context for this CPU to
871 			 * the interrupt handler so that it can synchronously
872 			 * sample the hardware counters and restart them.
873 			 */
874 			return (ctx);
875 		}
876 
877 		/*
878 		 * As long as the overflow interrupt really is delivered early
879 		 * enough after trapping into the kernel to avoid switching
880 		 * threads, we must always be able to find the cpc context,
881 		 * or something went terribly wrong i.e. we ended up
882 		 * running a passivated interrupt thread, a kernel
883 		 * thread or we interrupted idle, all of which are Very Bad.
884 		 */
885 		if (kcpc_nullctx_panic)
886 			panic("null cpc context, thread %p", (void *)t);
887 		atomic_add_32(&kcpc_nullctx_count, 1);
888 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
889 		/*
890 		 * Schedule an ast to sample the counters, which will
891 		 * propagate any overflow into the virtualized performance
892 		 * counter(s), and may deliver a signal.
893 		 */
894 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
895 		/*
896 		 * If a counter has overflowed which was counting on behalf of
897 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
898 		 * process a signal.
899 		 */
900 		for (i = 0; i < cpc_ncounters; i++) {
901 			if (ctx->kc_pics[i].kp_req != NULL &&
902 			    bitmap & (1 << i) &&
903 			    ctx->kc_pics[i].kp_req->kr_flags &
904 			    CPC_OVF_NOTIFY_EMT) {
905 				/*
906 				 * A signal has been requested for this PIC, so
907 				 * so freeze the context. The interrupt handler
908 				 * has already stopped the counter hardware.
909 				 */
910 				atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
911 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
912 				    KCPC_PIC_OVERFLOWED);
913 			}
914 		}
915 		aston(t);
916 	}
917 	return (NULL);
918 }
919 
920 /*
921  * The current thread context had an overflow interrupt; we're
922  * executing here in high-level interrupt context.
923  */
924 /*ARGSUSED*/
925 uint_t
926 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
927 {
928 	kcpc_ctx_t	*ctx;
929 	uint64_t	bitmap;
930 
931 	if (pcbe_ops == NULL ||
932 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
933 		return (DDI_INTR_UNCLAIMED);
934 
935 	/*
936 	 * Prevent any further interrupts.
937 	 */
938 	pcbe_ops->pcbe_allstop();
939 
940 	/*
941 	 * Invoke the "generic" handler.
942 	 *
943 	 * If the interrupt has occurred in the context of an lwp owning
944 	 * the counters, then the handler posts an AST to the lwp to
945 	 * trigger the actual sampling, and optionally deliver a signal or
946 	 * restart the counters, on the way out of the kernel using
947 	 * kcpc_hw_overflow_ast() (see below).
948 	 *
949 	 * On the other hand, if the handler returns the context to us
950 	 * directly, then it means that there are no other threads in
951 	 * the middle of updating it, no AST has been posted, and so we
952 	 * should sample the counters here, and restart them with no
953 	 * further fuss.
954 	 */
955 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
956 		uint64_t curtick = KCPC_GET_TICK();
957 
958 		ctx->kc_hrtime = gethrtime_waitfree();
959 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
960 		ctx->kc_rawtick = curtick;
961 		pcbe_ops->pcbe_sample(ctx);
962 		pcbe_ops->pcbe_program(ctx);
963 	}
964 
965 	return (DDI_INTR_CLAIMED);
966 }
967 
968 /*
969  * Called from trap() when processing the ast posted by the high-level
970  * interrupt handler.
971  */
972 int
973 kcpc_overflow_ast()
974 {
975 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
976 	int		i;
977 	int		found = 0;
978 	uint64_t	curtick = KCPC_GET_TICK();
979 
980 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
981 
982 	/*
983 	 * An overflow happened: sample the context to ensure that
984 	 * the overflow is propagated into the upper bits of the
985 	 * virtualized 64-bit counter(s).
986 	 */
987 	kpreempt_disable();
988 	ctx->kc_hrtime = gethrtime_waitfree();
989 	pcbe_ops->pcbe_sample(ctx);
990 	kpreempt_enable();
991 
992 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
993 
994 	/*
995 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
996 	 * if that pic generated an overflow and if the request it was counting
997 	 * on behalf of had CPC_OVERFLOW_REQUEST specified. We go through all
998 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
999 	 * found any overflowed pics, keep the context frozen and return true
1000 	 * (thus causing a signal to be sent).
1001 	 */
1002 	for (i = 0; i < cpc_ncounters; i++) {
1003 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
1004 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
1005 			    ~KCPC_PIC_OVERFLOWED);
1006 			found = 1;
1007 		}
1008 	}
1009 	if (found)
1010 		return (1);
1011 
1012 	/*
1013 	 * Otherwise, re-enable the counters and continue life as before.
1014 	 */
1015 	kpreempt_disable();
1016 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
1017 	pcbe_ops->pcbe_program(ctx);
1018 	kpreempt_enable();
1019 	return (0);
1020 }
1021 
1022 /*
1023  * Called when switching away from current thread.
1024  */
1025 static void
1026 kcpc_save(kcpc_ctx_t *ctx)
1027 {
1028 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1029 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)
1030 			return;
1031 		/*
1032 		 * This context has been invalidated but the counters have not
1033 		 * been stopped. Stop them here and mark the context stopped.
1034 		 */
1035 		pcbe_ops->pcbe_allstop();
1036 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
1037 		return;
1038 	}
1039 
1040 	pcbe_ops->pcbe_allstop();
1041 	if (ctx->kc_flags & KCPC_CTX_FREEZE)
1042 		return;
1043 
1044 	/*
1045 	 * Need to sample for all reqs into each req's current mpic.
1046 	 */
1047 	ctx->kc_hrtime = gethrtime();
1048 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
1049 	pcbe_ops->pcbe_sample(ctx);
1050 }
1051 
/*
 * Program the context's counters onto the hardware, unless the context is
 * invalid or frozen.
 *
 * Installed together with kcpc_save() as a thread context operation by
 * kcpc_bind_thread() (presumably run when the owning thread is switched back
 * onto a CPU -- confirm against installctx()), and also called directly from
 * kcpc_enable().
 */
static void
kcpc_restore(kcpc_ctx_t *ctx)
{
	/*
	 * kc_lock serializes the flag checks below against kcpc_unbind()
	 * marking the context invalid.
	 */
	mutex_enter(&ctx->kc_lock);
	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
	    KCPC_CTX_INVALID)
		/*
		 * The context is invalidated but has not been marked stopped.
		 * We mark it as such here because we will not start the
		 * counters during this context switch.
		 */
		ctx->kc_flags |= KCPC_CTX_INVALID_STOPPED;


	/*
	 * Nothing to program for an invalid or frozen context.
	 */
	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
		mutex_exit(&ctx->kc_lock);
		return;
	}

	/*
	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
	 * ctx & set related memory objects being freed without us knowing.
	 * This can happen if an agent thread is executing a kcpc_unbind(),
	 * with this thread as the target, whilst we're concurrently doing a
	 * restorectx() during, for example, a proc_exit().  Effectively, by
	 * doing this, we're asking kcpc_free() to cv_wait() until
	 * kcpc_restore() has completed.
	 */
	ctx->kc_flags |= KCPC_CTX_RESTORE;
	mutex_exit(&ctx->kc_lock);

	/*
	 * While programming the hardware, the counters should be stopped. We
	 * don't do an explicit pcbe_allstop() here because they should have
	 * been stopped already by the last consumer.
	 */
	ctx->kc_rawtick = KCPC_GET_TICK();
	pcbe_ops->pcbe_program(ctx);

	/*
	 * Wake the agent thread if it's waiting in kcpc_free().
	 */
	mutex_enter(&ctx->kc_lock);
	ctx->kc_flags &= ~KCPC_CTX_RESTORE;
	cv_signal(&ctx->kc_condv);
	mutex_exit(&ctx->kc_lock);
}
1099 
/*
 * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the
 * following context operators to the idle thread on each CPU. They stop the
 * counters when the idle thread is switched on, and they start them again when
 * it is switched off.
 */
1106 
1107 /*ARGSUSED*/
1108 void
1109 kcpc_idle_save(struct cpu *cp)
1110 {
1111 	/*
1112 	 * The idle thread shouldn't be run anywhere else.
1113 	 */
1114 	ASSERT(CPU == cp);
1115 
1116 	/*
1117 	 * We must hold the CPU's context lock to ensure the context isn't freed
1118 	 * while we're looking at it.
1119 	 */
1120 	mutex_enter(&cp->cpu_cpc_ctxlock);
1121 
1122 	if ((cp->cpu_cpc_ctx == NULL) ||
1123 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1124 		mutex_exit(&cp->cpu_cpc_ctxlock);
1125 		return;
1126 	}
1127 
1128 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
1129 	mutex_exit(&cp->cpu_cpc_ctxlock);
1130 }
1131 
1132 void
1133 kcpc_idle_restore(struct cpu *cp)
1134 {
1135 	/*
1136 	 * The idle thread shouldn't be run anywhere else.
1137 	 */
1138 	ASSERT(CPU == cp);
1139 
1140 	/*
1141 	 * We must hold the CPU's context lock to ensure the context isn't freed
1142 	 * while we're looking at it.
1143 	 */
1144 	mutex_enter(&cp->cpu_cpc_ctxlock);
1145 
1146 	if ((cp->cpu_cpc_ctx == NULL) ||
1147 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1148 		mutex_exit(&cp->cpu_cpc_ctxlock);
1149 		return;
1150 	}
1151 
1152 	pcbe_ops->pcbe_allstop();
1153 	mutex_exit(&cp->cpu_cpc_ctxlock);
1154 }
1155 
1156 /*ARGSUSED*/
1157 static void
1158 kcpc_lwp_create(kthread_t *t, kthread_t *ct)
1159 {
1160 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
1161 	int		i;
1162 
1163 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
1164 		return;
1165 
1166 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
1167 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1168 		rw_exit(&kcpc_cpuctx_lock);
1169 		return;
1170 	}
1171 	cctx = kcpc_ctx_alloc();
1172 	kcpc_ctx_clone(ctx, cctx);
1173 	rw_exit(&kcpc_cpuctx_lock);
1174 
1175 	/*
1176 	 * Copy the parent context's kc_flags field, but don't overwrite
1177 	 * the child's in case it was modified during kcpc_ctx_clone.
1178 	 */
1179 	cctx->kc_flags |= ctx->kc_flags;
1180 	cctx->kc_thread = ct;
1181 	cctx->kc_cpuid = -1;
1182 	ct->t_cpc_set = cctx->kc_set;
1183 	ct->t_cpc_ctx = cctx;
1184 
1185 	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
1186 		kcpc_set_t *ks = cctx->kc_set;
1187 		/*
1188 		 * Our contract with the user requires us to immediately send an
1189 		 * overflow signal to all children if we have the LWPINHERIT
1190 		 * and SIGOVF flags set. In addition, all counters should be
1191 		 * set to UINT64_MAX, and their pic's overflow flag turned on
1192 		 * so that our trap() processing knows to send a signal.
1193 		 */
1194 		atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE);
1195 		for (i = 0; i < ks->ks_nreqs; i++) {
1196 			kcpc_request_t *kr = &ks->ks_req[i];
1197 
1198 			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
1199 				*(kr->kr_data) = UINT64_MAX;
1200 				kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED;
1201 			}
1202 		}
1203 		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1204 		aston(ct);
1205 	}
1206 
1207 	installctx(ct, cctx, kcpc_save, kcpc_restore,
1208 	    NULL, kcpc_lwp_create, NULL, kcpc_free);
1209 }
1210 
1211 /*
1212  * Counter Stoppage Theory
1213  *
1214  * The counters may need to be stopped properly at the following occasions:
1215  *
1216  * 1) An LWP exits.
1217  * 2) A thread exits.
1218  * 3) An LWP performs an exec().
1219  * 4) A bound set is unbound.
1220  *
1221  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
1222  * to be freed as well.
1223  *
1224  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
1225  * when the thread is freed, kcpc_free(), called by freectx(), frees the
1226  * context.
1227  *
1228  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
1229  *
1230  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
1231  * been called from exec. It stops the counters _and_ frees the context.
1232  *
1233  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
1234  *
1235  * CPU-bound counters are always stopped via kcpc_unbind().
1236  */
1237 
1238 /*
1239  * We're being called to delete the context; we ensure that all associated data
1240  * structures are freed, and that the hardware is passivated if this is an exec.
1241  */
1242 
1243 /*ARGSUSED*/
1244 static void
1245 kcpc_free(kcpc_ctx_t *ctx, int isexec)
1246 {
1247 	int		i;
1248 	kcpc_set_t	*set = ctx->kc_set;
1249 
1250 	ASSERT(set != NULL);
1251 
1252 	/*
1253 	 * Wait for kcpc_restore() to finish before we tear things down.
1254 	 */
1255 	mutex_enter(&ctx->kc_lock);
1256 	while (ctx->kc_flags & KCPC_CTX_RESTORE)
1257 		cv_wait(&ctx->kc_condv, &ctx->kc_lock);
1258 	ctx->kc_flags |= KCPC_CTX_INVALID;
1259 	mutex_exit(&ctx->kc_lock);
1260 
1261 	if (isexec) {
1262 		/*
1263 		 * This thread is execing, and after the exec it should not have
1264 		 * any performance counter context. Stop the counters properly
1265 		 * here so the system isn't surprised by an overflow interrupt
1266 		 * later.
1267 		 */
1268 		if (ctx->kc_cpuid != -1) {
1269 			cpu_t *cp;
1270 			/*
1271 			 * CPU-bound context; stop the appropriate CPU's ctrs.
1272 			 * Hold cpu_lock while examining the CPU to ensure it
1273 			 * doesn't go away.
1274 			 */
1275 			mutex_enter(&cpu_lock);
1276 			cp = cpu_get(ctx->kc_cpuid);
1277 			/*
1278 			 * The CPU could have been DR'd out, so only stop the
1279 			 * CPU and clear its context pointer if the CPU still
1280 			 * exists.
1281 			 */
1282 			if (cp != NULL) {
1283 				mutex_enter(&cp->cpu_cpc_ctxlock);
1284 				kcpc_stop_hw(ctx);
1285 				cp->cpu_cpc_ctx = NULL;
1286 				mutex_exit(&cp->cpu_cpc_ctxlock);
1287 			}
1288 			mutex_exit(&cpu_lock);
1289 			ASSERT(curthread->t_cpc_ctx == NULL);
1290 		} else {
1291 			/*
1292 			 * Thread-bound context; stop _this_ CPU's counters.
1293 			 */
1294 			kpreempt_disable();
1295 			pcbe_ops->pcbe_allstop();
1296 			atomic_or_uint(&ctx->kc_flags,
1297 			    KCPC_CTX_INVALID_STOPPED);
1298 			kpreempt_enable();
1299 			curthread->t_cpc_ctx = NULL;
1300 		}
1301 
1302 		/*
1303 		 * Since we are being called from an exec and we know that
1304 		 * exec is not permitted via the agent thread, we should clean
1305 		 * up this thread's CPC state completely, and not leave dangling
1306 		 * CPC pointers behind.
1307 		 */
1308 		ASSERT(ctx->kc_thread == curthread);
1309 		curthread->t_cpc_set = NULL;
1310 	}
1311 
1312 	/*
1313 	 * Walk through each request in this context's set and free the PCBE's
1314 	 * configuration if it exists.
1315 	 */
1316 	for (i = 0; i < set->ks_nreqs; i++) {
1317 		if (set->ks_req[i].kr_config != NULL)
1318 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
1319 	}
1320 
1321 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
1322 	kcpc_ctx_free(ctx);
1323 	kcpc_free_set(set);
1324 }
1325 
1326 /*
1327  * Free the memory associated with a request set.
1328  */
1329 void
1330 kcpc_free_set(kcpc_set_t *set)
1331 {
1332 	int		i;
1333 	kcpc_request_t	*req;
1334 
1335 	ASSERT(set->ks_req != NULL);
1336 
1337 	for (i = 0; i < set->ks_nreqs; i++) {
1338 		req = &set->ks_req[i];
1339 
1340 		if (req->kr_nattrs != 0) {
1341 			kmem_free(req->kr_attr,
1342 			    req->kr_nattrs * sizeof (kcpc_attr_t));
1343 		}
1344 	}
1345 
1346 	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1347 	cv_destroy(&set->ks_condv);
1348 	mutex_destroy(&set->ks_lock);
1349 	kmem_free(set, sizeof (kcpc_set_t));
1350 }
1351 
1352 /*
1353  * Grab every existing context and mark it as invalid.
1354  */
1355 void
1356 kcpc_invalidate_all(void)
1357 {
1358 	kcpc_ctx_t *ctx;
1359 	long hash;
1360 
1361 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1362 		mutex_enter(&kcpc_ctx_llock[hash]);
1363 		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1364 			atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1365 		mutex_exit(&kcpc_ctx_llock[hash]);
1366 	}
1367 }
1368 
1369 /*
1370  * Interface for PCBEs to signal that an existing configuration has suddenly
1371  * become invalid.
1372  */
1373 void
1374 kcpc_invalidate_config(void *token)
1375 {
1376 	kcpc_ctx_t *ctx = token;
1377 
1378 	ASSERT(ctx != NULL);
1379 
1380 	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1381 }
1382 
1383 /*
1384  * Called from lwp_exit() and thread_exit()
1385  */
1386 void
1387 kcpc_passivate(void)
1388 {
1389 	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
1390 	kcpc_set_t *set = curthread->t_cpc_set;
1391 
1392 	if (set == NULL)
1393 		return;
1394 
1395 	/*
1396 	 * We're cleaning up after this thread; ensure there are no dangling
1397 	 * CPC pointers left behind. The context and set will be freed by
1398 	 * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in
1399 	 * the case of a CPU-bound set.
1400 	 */
1401 	curthread->t_cpc_ctx = NULL;
1402 
1403 	if (ctx == NULL) {
1404 		/*
1405 		 * This thread has a set but no context; it must be a CPU-bound
1406 		 * set. The hardware will be stopped via kcpc_unbind() when the
1407 		 * process exits and closes its file descriptors with
1408 		 * kcpc_close(). Our only job here is to clean up this thread's
1409 		 * state; the set will be freed with the unbind().
1410 		 */
1411 		(void) kcpc_unbind(set);
1412 		/*
1413 		 * Unbinding a set belonging to the current thread should clear
1414 		 * its set pointer.
1415 		 */
1416 		ASSERT(curthread->t_cpc_set == NULL);
1417 		return;
1418 	}
1419 
1420 	curthread->t_cpc_set = NULL;
1421 
1422 	/*
1423 	 * This thread/LWP is exiting but context switches will continue to
1424 	 * happen for a bit as the exit proceeds.  Kernel preemption must be
1425 	 * disabled here to prevent a race between checking or setting the
1426 	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
1427 	 * a context switch.
1428 	 */
1429 
1430 	kpreempt_disable();
1431 	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
1432 		pcbe_ops->pcbe_allstop();
1433 		atomic_or_uint(&ctx->kc_flags,
1434 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
1435 	}
1436 	kpreempt_enable();
1437 }
1438 
1439 /*
1440  * Assign the requests in the given set to the PICs in the context.
1441  * Returns 0 if successful, -1 on failure.
1442  */
1443 /*ARGSUSED*/
1444 static int
1445 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1446 {
1447 	int i;
1448 	int *picnum_save;
1449 
1450 	ASSERT(set->ks_nreqs <= cpc_ncounters);
1451 
1452 	/*
1453 	 * Provide kcpc_tryassign() with scratch space to avoid doing an
1454 	 * alloc/free with every invocation.
1455 	 */
1456 	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1457 	/*
1458 	 * kcpc_tryassign() blindly walks through each request in the set,
1459 	 * seeing if a counter can count its event. If yes, it assigns that
1460 	 * counter. However, that counter may have been the only capable counter
1461 	 * for _another_ request's event. The solution is to try every possible
1462 	 * request first. Note that this does not cover all solutions, as
1463 	 * that would require all unique orderings of requests, an n^n operation
1464 	 * which would be unacceptable for architectures with many counters.
1465 	 */
1466 	for (i = 0; i < set->ks_nreqs; i++)
1467 		if (kcpc_tryassign(set, i, picnum_save) == 0)
1468 			break;
1469 
1470 	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1471 	if (i == set->ks_nreqs)
1472 		return (-1);
1473 	return (0);
1474 }
1475 
1476 static int
1477 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1478 {
1479 	int		i;
1480 	int		j;
1481 	uint64_t	bitmap = 0, resmap = 0;
1482 	uint64_t	ctrmap;
1483 
1484 	/*
1485 	 * We are attempting to assign the reqs to pics, but we may fail. If we
1486 	 * fail, we need to restore the state of the requests to what it was
1487 	 * when we found it, as some reqs may have been explicitly assigned to
1488 	 * a specific PIC beforehand. We do this by snapshotting the assignments
1489 	 * now and restoring from it later if we fail.
1490 	 *
1491 	 * Also we note here which counters have already been claimed by
1492 	 * requests with explicit counter assignments.
1493 	 */
1494 	for (i = 0; i < set->ks_nreqs; i++) {
1495 		scratch[i] = set->ks_req[i].kr_picnum;
1496 		if (set->ks_req[i].kr_picnum != -1)
1497 			resmap |= (1 << set->ks_req[i].kr_picnum);
1498 	}
1499 
1500 	/*
1501 	 * Walk through requests assigning them to the first PIC that is
1502 	 * capable.
1503 	 */
1504 	i = starting_req;
1505 	do {
1506 		if (set->ks_req[i].kr_picnum != -1) {
1507 			ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
1508 			bitmap |= (1 << set->ks_req[i].kr_picnum);
1509 			if (++i == set->ks_nreqs)
1510 				i = 0;
1511 			continue;
1512 		}
1513 
1514 		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1515 		for (j = 0; j < cpc_ncounters; j++) {
1516 			if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
1517 			    (resmap & (1 << j)) == 0) {
1518 				/*
1519 				 * We can assign this counter because:
1520 				 *
1521 				 * 1. It can count the event (ctrmap)
1522 				 * 2. It hasn't been assigned yet (bitmap)
1523 				 * 3. It wasn't reserved by a request (resmap)
1524 				 */
1525 				bitmap |= (1 << j);
1526 				break;
1527 			}
1528 		}
1529 		if (j == cpc_ncounters) {
1530 			for (i = 0; i < set->ks_nreqs; i++)
1531 				set->ks_req[i].kr_picnum = scratch[i];
1532 			return (-1);
1533 		}
1534 		set->ks_req[i].kr_picnum = j;
1535 
1536 		if (++i == set->ks_nreqs)
1537 			i = 0;
1538 	} while (i != starting_req);
1539 
1540 	return (0);
1541 }
1542 
1543 kcpc_set_t *
1544 kcpc_dup_set(kcpc_set_t *set)
1545 {
1546 	kcpc_set_t	*new;
1547 	int		i;
1548 	int		j;
1549 
1550 	new = kmem_zalloc(sizeof (*new), KM_SLEEP);
1551 	new->ks_state &= ~KCPC_SET_BOUND;
1552 	new->ks_flags = set->ks_flags;
1553 	new->ks_nreqs = set->ks_nreqs;
1554 	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1555 	    KM_SLEEP);
1556 	new->ks_data = NULL;
1557 	new->ks_ctx = NULL;
1558 
1559 	for (i = 0; i < new->ks_nreqs; i++) {
1560 		new->ks_req[i].kr_config = NULL;
1561 		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1562 		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1563 		new->ks_req[i].kr_picp = NULL;
1564 		new->ks_req[i].kr_data = NULL;
1565 		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1566 		    CPC_MAX_EVENT_LEN);
1567 		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1568 		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1569 		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1570 		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1571 		    sizeof (kcpc_attr_t), KM_SLEEP);
1572 		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1573 			new->ks_req[i].kr_attr[j].ka_val =
1574 			    set->ks_req[i].kr_attr[j].ka_val;
1575 			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1576 			    set->ks_req[i].kr_attr[j].ka_name,
1577 			    CPC_MAX_ATTR_LEN);
1578 		}
1579 	}
1580 
1581 	return (new);
1582 }
1583 
1584 int
1585 kcpc_allow_nonpriv(void *token)
1586 {
1587 	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
1588 }
1589 
1590 void
1591 kcpc_invalidate(kthread_t *t)
1592 {
1593 	kcpc_ctx_t *ctx = t->t_cpc_ctx;
1594 
1595 	if (ctx != NULL)
1596 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1597 }
1598 
1599 /*
1600  * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
1601  * are used to construct PCBE names, starting with the most specific,
1602  * "pcbe.first.second.third.fourth" and ending with the least specific,
1603  * "pcbe.first".
1604  *
1605  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1606  */
1607 int
1608 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1609 {
1610 	uint_t s[3];
1611 
1612 	s[0] = first;
1613 	s[1] = second;
1614 	s[2] = third;
1615 
1616 	return (modload_qualified("pcbe",
1617 	    "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0);
1618 }
1619