xref: /titanic_44/usr/src/uts/common/os/kcpc.c (revision 1db2880b3a411e3c56e50c7dc42d3b137fcc4e48)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/param.h>
27 #include <sys/thread.h>
28 #include <sys/cpuvar.h>
29 #include <sys/inttypes.h>
30 #include <sys/cmn_err.h>
31 #include <sys/time.h>
32 #include <sys/ksynch.h>
33 #include <sys/systm.h>
34 #include <sys/kcpc.h>
35 #include <sys/cpc_impl.h>
36 #include <sys/cpc_pcbe.h>
37 #include <sys/atomic.h>
38 #include <sys/sunddi.h>
39 #include <sys/modctl.h>
40 #include <sys/sdt.h>
41 #include <sys/archsystm.h>
42 #include <sys/promif.h>
43 #include <sys/x_call.h>
44 #include <sys/cap_util.h>
45 #if defined(__x86)
46 #include <asm/clock.h>
47 #include <sys/xc_levels.h>
48 #endif
49 
50 static kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
51 static kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */
52 
53 
54 krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
55 int		kcpc_cpuctx;		/* number of cpu-specific contexts */
56 
57 int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */
58 
59 /*
60  * These are set when a PCBE module is loaded.
61  */
62 uint_t		cpc_ncounters = 0;
63 pcbe_ops_t	*pcbe_ops = NULL;
64 
65 /*
66  * Statistics on (mis)behavior
67  */
68 static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
69 static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */
70 
71 /*
72  * By setting 'kcpc_nullctx_panic' to 1, any overflow interrupts in a thread
73  * with no valid context will result in a panic.
74  */
75 static int kcpc_nullctx_panic = 0;
76 
77 static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
78 static void kcpc_restore(kcpc_ctx_t *ctx);
79 static void kcpc_save(kcpc_ctx_t *ctx);
80 static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
81 static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
82 static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
83 static kcpc_set_t *kcpc_set_create(kcpc_request_t *reqs, int nreqs,
84     int set_flags, int kmem_flags);
85 
86 /*
87  * Macros to manipulate context flags. All flag updates should use one of these
88  * two macros
89  *
90  * Flags should always be updated atomically since some of the updates are
91  * not protected by locks.
92  */
93 #define	KCPC_CTX_FLAG_SET(ctx, flag) atomic_or_uint(&(ctx)->kc_flags, (flag))
94 #define	KCPC_CTX_FLAG_CLR(ctx, flag) atomic_and_uint(&(ctx)->kc_flags, ~(flag))
95 
96 /*
97  * The IS_HIPIL() macro verifies that the code is executed either from a
98  * cross-call or from high-PIL interrupt
99  */
100 #ifdef DEBUG
101 #define	IS_HIPIL() (getpil() >= XCALL_PIL)
102 #else
103 #define	IS_HIPIL()
104 #endif	/* DEBUG */
105 
106 
107 extern int kcpc_hw_load_pcbe(void);
108 
109 /*
110  * Return value from kcpc_hw_load_pcbe()
111  */
112 static int kcpc_pcbe_error = 0;
113 
114 /*
115  * Perform one-time initialization of kcpc framework.
116  * This function performs the initialization only the first time it is called.
117  * It is safe to call it multiple times.
118  */
119 int
120 kcpc_init(void)
121 {
122 	long hash;
123 	static uint32_t kcpc_initialized = 0;
124 
125 	/*
126 	 * We already tried loading platform pcbe module and failed
127 	 */
128 	if (kcpc_pcbe_error != 0)
129 		return (-1);
130 
131 	/*
132 	 * The kcpc framework should be initialized at most once
133 	 */
134 	if (atomic_cas_32(&kcpc_initialized, 0, 1) != 0)
135 		return (0);
136 
137 	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
138 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
139 		mutex_init(&kcpc_ctx_llock[hash],
140 		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
141 
142 	/*
143 	 * Load platform-specific pcbe module
144 	 */
145 	kcpc_pcbe_error = kcpc_hw_load_pcbe();
146 
147 	return (kcpc_pcbe_error == 0 ? 0 : -1);
148 }
149 
150 void
151 kcpc_register_pcbe(pcbe_ops_t *ops)
152 {
153 	pcbe_ops = ops;
154 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
155 }
156 
157 void
158 kcpc_register_dcpc(void (*func)(uint64_t))
159 {
160 	dtrace_cpc_fire = func;
161 }
162 
163 void
164 kcpc_unregister_dcpc(void)
165 {
166 	dtrace_cpc_fire = NULL;
167 }
168 
169 int
170 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
171 {
172 	cpu_t		*cp;
173 	kcpc_ctx_t	*ctx;
174 	int		error;
175 	int		save_spl;
176 
177 	ctx = kcpc_ctx_alloc(KM_SLEEP);
178 
179 	if (kcpc_assign_reqs(set, ctx) != 0) {
180 		kcpc_ctx_free(ctx);
181 		*subcode = CPC_RESOURCE_UNAVAIL;
182 		return (EINVAL);
183 	}
184 
185 	ctx->kc_cpuid = cpuid;
186 	ctx->kc_thread = curthread;
187 
188 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
189 
190 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
191 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
192 		kcpc_ctx_free(ctx);
193 		return (error);
194 	}
195 
196 	set->ks_ctx = ctx;
197 	ctx->kc_set = set;
198 
199 	/*
200 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
201 	 * we are manipulating the cpu_t and programming the hardware, else the
202 	 * the cpu_t could go away while we're looking at it.
203 	 */
204 	mutex_enter(&cpu_lock);
205 	cp = cpu_get(cpuid);
206 
207 	if (cp == NULL)
208 		/*
209 		 * The CPU could have been DRd out while we were getting set up.
210 		 */
211 		goto unbound;
212 
213 	mutex_enter(&cp->cpu_cpc_ctxlock);
214 	kpreempt_disable();
215 	save_spl = spl_xcall();
216 
217 	/*
218 	 * Check to see whether counters for CPU already being used by someone
219 	 * other than kernel for capacity and utilization (since kernel will
220 	 * let go of counters for user in kcpc_program() below)
221 	 */
222 	if (cp->cpu_cpc_ctx != NULL && !CU_CPC_ON(cp)) {
223 		/*
224 		 * If this CPU already has a bound set, return an error.
225 		 */
226 		splx(save_spl);
227 		kpreempt_enable();
228 		mutex_exit(&cp->cpu_cpc_ctxlock);
229 		goto unbound;
230 	}
231 
232 	if (curthread->t_bind_cpu != cpuid) {
233 		splx(save_spl);
234 		kpreempt_enable();
235 		mutex_exit(&cp->cpu_cpc_ctxlock);
236 		goto unbound;
237 	}
238 
239 	kcpc_program(ctx, B_FALSE, B_TRUE);
240 
241 	splx(save_spl);
242 	kpreempt_enable();
243 
244 	mutex_exit(&cp->cpu_cpc_ctxlock);
245 	mutex_exit(&cpu_lock);
246 
247 	mutex_enter(&set->ks_lock);
248 	set->ks_state |= KCPC_SET_BOUND;
249 	cv_signal(&set->ks_condv);
250 	mutex_exit(&set->ks_lock);
251 
252 	return (0);
253 
254 unbound:
255 	mutex_exit(&cpu_lock);
256 	set->ks_ctx = NULL;
257 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
258 	kcpc_ctx_free(ctx);
259 	return (EAGAIN);
260 }
261 
262 int
263 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
264 {
265 	kcpc_ctx_t	*ctx;
266 	int		error;
267 
268 	/*
269 	 * Only one set is allowed per context, so ensure there is no
270 	 * existing context.
271 	 */
272 
273 	if (t->t_cpc_ctx != NULL)
274 		return (EEXIST);
275 
276 	ctx = kcpc_ctx_alloc(KM_SLEEP);
277 
278 	/*
279 	 * The context must begin life frozen until it has been properly
280 	 * programmed onto the hardware. This prevents the context ops from
281 	 * worrying about it until we're ready.
282 	 */
283 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
284 	ctx->kc_hrtime = gethrtime();
285 
286 	if (kcpc_assign_reqs(set, ctx) != 0) {
287 		kcpc_ctx_free(ctx);
288 		*subcode = CPC_RESOURCE_UNAVAIL;
289 		return (EINVAL);
290 	}
291 
292 	ctx->kc_cpuid = -1;
293 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
294 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_LWPINHERIT);
295 	ctx->kc_thread = t;
296 	t->t_cpc_ctx = ctx;
297 	/*
298 	 * Permit threads to look at their own hardware counters from userland.
299 	 */
300 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_NONPRIV);
301 
302 	/*
303 	 * Create the data store for this set.
304 	 */
305 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
306 
307 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
308 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
309 		kcpc_ctx_free(ctx);
310 		t->t_cpc_ctx = NULL;
311 		return (error);
312 	}
313 
314 	set->ks_ctx = ctx;
315 	ctx->kc_set = set;
316 
317 	/*
318 	 * Add a device context to the subject thread.
319 	 */
320 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
321 	    kcpc_lwp_create, NULL, kcpc_free);
322 
323 	/*
324 	 * Ask the backend to program the hardware.
325 	 */
326 	if (t == curthread) {
327 		int save_spl;
328 
329 		kpreempt_disable();
330 		save_spl = spl_xcall();
331 		kcpc_program(ctx, B_TRUE, B_TRUE);
332 		splx(save_spl);
333 		kpreempt_enable();
334 	} else {
335 		/*
336 		 * Since we are the agent LWP, we know the victim LWP is stopped
337 		 * until we're done here; no need to worry about preemption or
338 		 * migration here. We still use an atomic op to clear the flag
339 		 * to ensure the flags are always self-consistent; they can
340 		 * still be accessed from, for instance, another CPU doing a
341 		 * kcpc_invalidate_all().
342 		 */
343 		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
344 	}
345 
346 	mutex_enter(&set->ks_lock);
347 	set->ks_state |= KCPC_SET_BOUND;
348 	cv_signal(&set->ks_condv);
349 	mutex_exit(&set->ks_lock);
350 
351 	return (0);
352 }
353 
354 /*
355  * Walk through each request in the set and ask the PCBE to configure a
356  * corresponding counter.
357  */
358 int
359 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
360 {
361 	int		i;
362 	int		ret;
363 	kcpc_request_t	*rp;
364 
365 	for (i = 0; i < set->ks_nreqs; i++) {
366 		int n;
367 		rp = &set->ks_req[i];
368 
369 		n = rp->kr_picnum;
370 
371 		ASSERT(n >= 0 && n < cpc_ncounters);
372 
373 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
374 
375 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
376 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
377 			    == 0) {
378 				*subcode = -1;
379 				return (ENOTSUP);
380 			}
381 			/*
382 			 * If any of the counters have requested overflow
383 			 * notification, we flag the context as being one that
384 			 * cares about overflow.
385 			 */
386 			KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_SIGOVF);
387 		}
388 
389 		rp->kr_config = NULL;
390 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
391 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
392 		    &(rp->kr_config), (void *)ctx)) != 0) {
393 			kcpc_free_configs(set);
394 			*subcode = ret;
395 			switch (ret) {
396 			case CPC_ATTR_REQUIRES_PRIVILEGE:
397 			case CPC_HV_NO_ACCESS:
398 				return (EACCES);
399 			default:
400 				return (EINVAL);
401 			}
402 		}
403 
404 		ctx->kc_pics[n].kp_req = rp;
405 		rp->kr_picp = &ctx->kc_pics[n];
406 		rp->kr_data = set->ks_data + rp->kr_index;
407 		*rp->kr_data = rp->kr_preset;
408 	}
409 
410 	return (0);
411 }
412 
413 void
414 kcpc_free_configs(kcpc_set_t *set)
415 {
416 	int i;
417 
418 	for (i = 0; i < set->ks_nreqs; i++)
419 		if (set->ks_req[i].kr_config != NULL)
420 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
421 }
422 
423 /*
424  * buf points to a user address and the data should be copied out to that
425  * address in the current process.
426  */
427 int
428 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
429 {
430 	kcpc_ctx_t	*ctx = set->ks_ctx;
431 	int		save_spl;
432 
433 	mutex_enter(&set->ks_lock);
434 	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
435 		mutex_exit(&set->ks_lock);
436 		return (EINVAL);
437 	}
438 	mutex_exit(&set->ks_lock);
439 
440 	/*
441 	 * Kernel preemption must be disabled while reading the hardware regs,
442 	 * and if this is a CPU-bound context, while checking the CPU binding of
443 	 * the current thread.
444 	 */
445 	kpreempt_disable();
446 	save_spl = spl_xcall();
447 
448 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
449 		splx(save_spl);
450 		kpreempt_enable();
451 		return (EAGAIN);
452 	}
453 
454 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
455 		if (ctx->kc_cpuid != -1) {
456 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
457 				splx(save_spl);
458 				kpreempt_enable();
459 				return (EAGAIN);
460 			}
461 		}
462 
463 		if (ctx->kc_thread == curthread) {
464 			uint64_t curtick = KCPC_GET_TICK();
465 
466 			ctx->kc_hrtime = gethrtime_waitfree();
467 			pcbe_ops->pcbe_sample(ctx);
468 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
469 			ctx->kc_rawtick = curtick;
470 		}
471 
472 		/*
473 		 * The config may have been invalidated by
474 		 * the pcbe_sample op.
475 		 */
476 		if (ctx->kc_flags & KCPC_CTX_INVALID) {
477 			splx(save_spl);
478 			kpreempt_enable();
479 			return (EAGAIN);
480 		}
481 
482 	}
483 
484 	splx(save_spl);
485 	kpreempt_enable();
486 
487 	if (copyout(set->ks_data, buf,
488 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
489 		return (EFAULT);
490 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
491 		return (EFAULT);
492 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
493 		return (EFAULT);
494 
495 	return (0);
496 }
497 
498 /*
499  * Stop the counters on the CPU this context is bound to.
500  */
501 static void
502 kcpc_stop_hw(kcpc_ctx_t *ctx)
503 {
504 	cpu_t *cp;
505 
506 	kpreempt_disable();
507 
508 	if (ctx->kc_cpuid == CPU->cpu_id) {
509 		cp = CPU;
510 	} else {
511 		cp = cpu_get(ctx->kc_cpuid);
512 	}
513 
514 	ASSERT(cp != NULL && cp->cpu_cpc_ctx == ctx);
515 	kcpc_cpu_stop(cp, B_FALSE);
516 
517 	kpreempt_enable();
518 }
519 
520 int
521 kcpc_unbind(kcpc_set_t *set)
522 {
523 	kcpc_ctx_t	*ctx;
524 	kthread_t	*t;
525 
526 	/*
527 	 * We could be racing with the process's agent thread as it
528 	 * binds the set; we must wait for the set to finish binding
529 	 * before attempting to tear it down.
530 	 */
531 	mutex_enter(&set->ks_lock);
532 	while ((set->ks_state & KCPC_SET_BOUND) == 0)
533 		cv_wait(&set->ks_condv, &set->ks_lock);
534 	mutex_exit(&set->ks_lock);
535 
536 	ctx = set->ks_ctx;
537 
538 	/*
539 	 * Use kc_lock to synchronize with kcpc_restore().
540 	 */
541 	mutex_enter(&ctx->kc_lock);
542 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
543 	mutex_exit(&ctx->kc_lock);
544 
545 	if (ctx->kc_cpuid == -1) {
546 		t = ctx->kc_thread;
547 		/*
548 		 * The context is thread-bound and therefore has a device
549 		 * context.  It will be freed via removectx() calling
550 		 * freectx() calling kcpc_free().
551 		 */
552 		if (t == curthread) {
553 			int save_spl;
554 
555 			kpreempt_disable();
556 			save_spl = spl_xcall();
557 			if (!(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED))
558 				kcpc_unprogram(ctx, B_TRUE);
559 			splx(save_spl);
560 			kpreempt_enable();
561 		}
562 #ifdef DEBUG
563 		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
564 		    kcpc_lwp_create, NULL, kcpc_free) == 0)
565 			panic("kcpc_unbind: context %p not preset on thread %p",
566 			    (void *)ctx, (void *)t);
567 #else
568 		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
569 		    kcpc_lwp_create, NULL, kcpc_free);
570 #endif /* DEBUG */
571 		t->t_cpc_set = NULL;
572 		t->t_cpc_ctx = NULL;
573 	} else {
574 		/*
575 		 * If we are unbinding a CPU-bound set from a remote CPU, the
576 		 * native CPU's idle thread could be in the midst of programming
577 		 * this context onto the CPU. We grab the context's lock here to
578 		 * ensure that the idle thread is done with it. When we release
579 		 * the lock, the CPU no longer has a context and the idle thread
580 		 * will move on.
581 		 *
582 		 * cpu_lock must be held to prevent the CPU from being DR'd out
583 		 * while we disassociate the context from the cpu_t.
584 		 */
585 		cpu_t *cp;
586 		mutex_enter(&cpu_lock);
587 		cp = cpu_get(ctx->kc_cpuid);
588 		if (cp != NULL) {
589 			/*
590 			 * The CPU may have been DR'd out of the system.
591 			 */
592 			mutex_enter(&cp->cpu_cpc_ctxlock);
593 			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
594 				kcpc_stop_hw(ctx);
595 			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
596 			mutex_exit(&cp->cpu_cpc_ctxlock);
597 		}
598 		mutex_exit(&cpu_lock);
599 		if (ctx->kc_thread == curthread) {
600 			kcpc_free(ctx, 0);
601 			curthread->t_cpc_set = NULL;
602 		}
603 	}
604 
605 	return (0);
606 }
607 
608 int
609 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
610 {
611 	int i;
612 
613 	ASSERT(set != NULL);
614 	ASSERT(set->ks_state & KCPC_SET_BOUND);
615 	ASSERT(set->ks_ctx->kc_thread == curthread);
616 	ASSERT(set->ks_ctx->kc_cpuid == -1);
617 
618 	if (index < 0 || index >= set->ks_nreqs)
619 		return (EINVAL);
620 
621 	for (i = 0; i < set->ks_nreqs; i++)
622 		if (set->ks_req[i].kr_index == index)
623 			break;
624 	ASSERT(i != set->ks_nreqs);
625 
626 	set->ks_req[i].kr_preset = preset;
627 	return (0);
628 }
629 
630 int
631 kcpc_restart(kcpc_set_t *set)
632 {
633 	kcpc_ctx_t	*ctx = set->ks_ctx;
634 	int		i;
635 	int		save_spl;
636 
637 	ASSERT(set->ks_state & KCPC_SET_BOUND);
638 	ASSERT(ctx->kc_thread == curthread);
639 	ASSERT(ctx->kc_cpuid == -1);
640 
641 	for (i = 0; i < set->ks_nreqs; i++) {
642 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
643 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
644 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
645 	}
646 
647 	kpreempt_disable();
648 	save_spl = spl_xcall();
649 
650 	/*
651 	 * If the user is doing this on a running set, make sure the counters
652 	 * are stopped first.
653 	 */
654 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
655 		pcbe_ops->pcbe_allstop();
656 
657 	/*
658 	 * Ask the backend to program the hardware.
659 	 */
660 	ctx->kc_rawtick = KCPC_GET_TICK();
661 	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
662 	pcbe_ops->pcbe_program(ctx);
663 	splx(save_spl);
664 	kpreempt_enable();
665 
666 	return (0);
667 }
668 
669 /*
670  * Caller must hold kcpc_cpuctx_lock.
671  */
672 int
673 kcpc_enable(kthread_t *t, int cmd, int enable)
674 {
675 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
676 	kcpc_set_t	*set = t->t_cpc_set;
677 	kcpc_set_t	*newset;
678 	int		i;
679 	int		flag;
680 	int		err;
681 
682 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
683 
684 	if (ctx == NULL) {
685 		/*
686 		 * This thread has a set but no context; it must be a
687 		 * CPU-bound set.
688 		 */
689 		ASSERT(t->t_cpc_set != NULL);
690 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
691 		return (EINVAL);
692 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
693 		return (EAGAIN);
694 
695 	if (cmd == CPC_ENABLE) {
696 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
697 			return (EINVAL);
698 		kpreempt_disable();
699 		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
700 		kcpc_restore(ctx);
701 		kpreempt_enable();
702 	} else if (cmd == CPC_DISABLE) {
703 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
704 			return (EINVAL);
705 		kpreempt_disable();
706 		kcpc_save(ctx);
707 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
708 		kpreempt_enable();
709 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
710 		/*
711 		 * Strategy for usr/sys: stop counters and update set's presets
712 		 * with current counter values, unbind, update requests with
713 		 * new config, then re-bind.
714 		 */
715 		flag = (cmd == CPC_USR_EVENTS) ?
716 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
717 
718 		kpreempt_disable();
719 		KCPC_CTX_FLAG_SET(ctx,
720 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
721 		pcbe_ops->pcbe_allstop();
722 		kpreempt_enable();
723 
724 		for (i = 0; i < set->ks_nreqs; i++) {
725 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
726 			if (enable)
727 				set->ks_req[i].kr_flags |= flag;
728 			else
729 				set->ks_req[i].kr_flags &= ~flag;
730 		}
731 		newset = kcpc_dup_set(set);
732 		if (kcpc_unbind(set) != 0)
733 			return (EINVAL);
734 		t->t_cpc_set = newset;
735 		if (kcpc_bind_thread(newset, t, &err) != 0) {
736 			t->t_cpc_set = NULL;
737 			kcpc_free_set(newset);
738 			return (EINVAL);
739 		}
740 	} else
741 		return (EINVAL);
742 
743 	return (0);
744 }
745 
746 /*
747  * Provide PCBEs with a way of obtaining the configs of every counter which will
748  * be programmed together.
749  *
750  * If current is NULL, provide the first config.
751  *
752  * If data != NULL, caller wants to know where the data store associated with
753  * the config we return is located.
754  */
755 void *
756 kcpc_next_config(void *token, void *current, uint64_t **data)
757 {
758 	int		i;
759 	kcpc_pic_t	*pic;
760 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
761 
762 	if (current == NULL) {
763 		/*
764 		 * Client would like the first config, which may not be in
765 		 * counter 0; we need to search through the counters for the
766 		 * first config.
767 		 */
768 		for (i = 0; i < cpc_ncounters; i++)
769 			if (ctx->kc_pics[i].kp_req != NULL)
770 				break;
771 		/*
772 		 * There are no counters configured for the given context.
773 		 */
774 		if (i == cpc_ncounters)
775 			return (NULL);
776 	} else {
777 		/*
778 		 * There surely is a faster way to do this.
779 		 */
780 		for (i = 0; i < cpc_ncounters; i++) {
781 			pic = &ctx->kc_pics[i];
782 
783 			if (pic->kp_req != NULL &&
784 			    current == pic->kp_req->kr_config)
785 				break;
786 		}
787 
788 		/*
789 		 * We found the current config at picnum i. Now search for the
790 		 * next configured PIC.
791 		 */
792 		for (i++; i < cpc_ncounters; i++) {
793 			pic = &ctx->kc_pics[i];
794 			if (pic->kp_req != NULL)
795 				break;
796 		}
797 
798 		if (i == cpc_ncounters)
799 			return (NULL);
800 	}
801 
802 	if (data != NULL) {
803 		*data = ctx->kc_pics[i].kp_req->kr_data;
804 	}
805 
806 	return (ctx->kc_pics[i].kp_req->kr_config);
807 }
808 
809 
810 kcpc_ctx_t *
811 kcpc_ctx_alloc(int kmem_flags)
812 {
813 	kcpc_ctx_t	*ctx;
814 	long		hash;
815 
816 	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), kmem_flags);
817 	if (ctx == NULL)
818 		return (NULL);
819 
820 	hash = CPC_HASH_CTX(ctx);
821 	mutex_enter(&kcpc_ctx_llock[hash]);
822 	ctx->kc_next = kcpc_ctx_list[hash];
823 	kcpc_ctx_list[hash] = ctx;
824 	mutex_exit(&kcpc_ctx_llock[hash]);
825 
826 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
827 	    cpc_ncounters, KM_SLEEP);
828 
829 	ctx->kc_cpuid = -1;
830 
831 	return (ctx);
832 }
833 
834 /*
835  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
836  * in the flags.
837  */
838 static void
839 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
840 {
841 	kcpc_set_t	*ks = ctx->kc_set, *cks;
842 	int		i, j;
843 	int		code;
844 
845 	ASSERT(ks != NULL);
846 
847 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
848 		return;
849 
850 	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
851 	cks->ks_state &= ~KCPC_SET_BOUND;
852 	cctx->kc_set = cks;
853 	cks->ks_flags = ks->ks_flags;
854 	cks->ks_nreqs = ks->ks_nreqs;
855 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
856 	    sizeof (kcpc_request_t), KM_SLEEP);
857 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
858 	    KM_SLEEP);
859 	cks->ks_ctx = cctx;
860 
861 	for (i = 0; i < cks->ks_nreqs; i++) {
862 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
863 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
864 		(void) strncpy(cks->ks_req[i].kr_event,
865 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
866 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
867 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
868 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
869 		if (ks->ks_req[i].kr_nattrs > 0) {
870 			cks->ks_req[i].kr_attr =
871 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
872 			    sizeof (kcpc_attr_t), KM_SLEEP);
873 		}
874 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
875 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
876 			    ks->ks_req[i].kr_attr[j].ka_name,
877 			    CPC_MAX_ATTR_LEN);
878 			cks->ks_req[i].kr_attr[j].ka_val =
879 			    ks->ks_req[i].kr_attr[j].ka_val;
880 		}
881 	}
882 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
883 		kcpc_invalidate_config(cctx);
884 
885 	mutex_enter(&cks->ks_lock);
886 	cks->ks_state |= KCPC_SET_BOUND;
887 	cv_signal(&cks->ks_condv);
888 	mutex_exit(&cks->ks_lock);
889 }
890 
891 
892 void
893 kcpc_ctx_free(kcpc_ctx_t *ctx)
894 {
895 	kcpc_ctx_t	**loc;
896 	long		hash = CPC_HASH_CTX(ctx);
897 
898 	mutex_enter(&kcpc_ctx_llock[hash]);
899 	loc = &kcpc_ctx_list[hash];
900 	ASSERT(*loc != NULL);
901 	while (*loc != ctx)
902 		loc = &(*loc)->kc_next;
903 	*loc = ctx->kc_next;
904 	mutex_exit(&kcpc_ctx_llock[hash]);
905 
906 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
907 	cv_destroy(&ctx->kc_condv);
908 	mutex_destroy(&ctx->kc_lock);
909 	kmem_free(ctx, sizeof (*ctx));
910 }
911 
912 /*
913  * Generic interrupt handler used on hardware that generates
914  * overflow interrupts.
915  *
916  * Note: executed at high-level interrupt context!
917  */
918 /*ARGSUSED*/
919 kcpc_ctx_t *
920 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
921 {
922 	kcpc_ctx_t	*ctx;
923 	kthread_t	*t = curthread;
924 	int		i;
925 
926 	/*
927 	 * On both x86 and UltraSPARC, we may deliver the high-level
928 	 * interrupt in kernel mode, just after we've started to run an
929 	 * interrupt thread.  (That's because the hardware helpfully
930 	 * delivers the overflow interrupt some random number of cycles
931 	 * after the instruction that caused the overflow by which time
932 	 * we're in some part of the kernel, not necessarily running on
933 	 * the right thread).
934 	 *
935 	 * Check for this case here -- find the pinned thread
936 	 * that was running when the interrupt went off.
937 	 */
938 	if (t->t_flag & T_INTR_THREAD) {
939 		klwp_t *lwp;
940 
941 		atomic_inc_32(&kcpc_intrctx_count);
942 
943 		/*
944 		 * Note that t_lwp is always set to point at the underlying
945 		 * thread, thus this will work in the presence of nested
946 		 * interrupts.
947 		 */
948 		ctx = NULL;
949 		if ((lwp = t->t_lwp) != NULL) {
950 			t = lwptot(lwp);
951 			ctx = t->t_cpc_ctx;
952 		}
953 	} else
954 		ctx = t->t_cpc_ctx;
955 
956 	if (ctx == NULL) {
957 		/*
958 		 * This can easily happen if we're using the counters in
959 		 * "shared" mode, for example, and an overflow interrupt
960 		 * occurs while we are running cpustat.  In that case, the
961 		 * bound thread that has the context that belongs to this
962 		 * CPU is almost certainly sleeping (if it was running on
963 		 * the CPU we'd have found it above), and the actual
964 		 * interrupted thread has no knowledge of performance counters!
965 		 */
966 		ctx = curthread->t_cpu->cpu_cpc_ctx;
967 		if (ctx != NULL) {
968 			/*
969 			 * Return the bound context for this CPU to
970 			 * the interrupt handler so that it can synchronously
971 			 * sample the hardware counters and restart them.
972 			 */
973 			return (ctx);
974 		}
975 
976 		/*
977 		 * As long as the overflow interrupt really is delivered early
978 		 * enough after trapping into the kernel to avoid switching
979 		 * threads, we must always be able to find the cpc context,
980 		 * or something went terribly wrong i.e. we ended up
981 		 * running a passivated interrupt thread, a kernel
982 		 * thread or we interrupted idle, all of which are Very Bad.
983 		 *
984 		 * We also could end up here owing to an incredibly unlikely
985 		 * race condition that exists on x86 based architectures when
986 		 * the cpc provider is in use; overflow interrupts are directed
987 		 * to the cpc provider if the 'dtrace_cpc_in_use' variable is
988 		 * set when we enter the handler. This variable is unset after
989 		 * overflow interrupts have been disabled on all CPUs and all
990 		 * contexts have been torn down. To stop interrupts, the cpc
991 		 * provider issues a xcall to the remote CPU before it tears
992 		 * down that CPUs context. As high priority xcalls, on an x86
993 		 * architecture, execute at a higher PIL than this handler, it
994 		 * is possible (though extremely unlikely) that the xcall could
995 		 * interrupt the overflow handler before the handler has
996 		 * checked the 'dtrace_cpc_in_use' variable, stop the counters,
997 		 * return to the cpc provider which could then rip down
998 		 * contexts and unset 'dtrace_cpc_in_use' *before* the CPUs
999 		 * overflow handler has had a chance to check the variable. In
1000 		 * that case, the handler would direct the overflow into this
1001 		 * code and no valid context will be found. The default behavior
1002 		 * when no valid context is found is now to shout a warning to
1003 		 * the console and bump the 'kcpc_nullctx_count' variable.
1004 		 */
1005 		if (kcpc_nullctx_panic)
1006 			panic("null cpc context, thread %p", (void *)t);
1007 #ifdef DEBUG
1008 		cmn_err(CE_NOTE,
1009 		    "null cpc context found in overflow handler!\n");
1010 #endif
1011 		atomic_inc_32(&kcpc_nullctx_count);
1012 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
1013 		/*
1014 		 * Schedule an ast to sample the counters, which will
1015 		 * propagate any overflow into the virtualized performance
1016 		 * counter(s), and may deliver a signal.
1017 		 */
1018 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1019 		/*
1020 		 * If a counter has overflowed which was counting on behalf of
1021 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
1022 		 * process a signal.
1023 		 */
1024 		for (i = 0; i < cpc_ncounters; i++) {
1025 			if (ctx->kc_pics[i].kp_req != NULL &&
1026 			    bitmap & (1 << i) &&
1027 			    ctx->kc_pics[i].kp_req->kr_flags &
1028 			    CPC_OVF_NOTIFY_EMT) {
1029 				/*
1030 				 * A signal has been requested for this PIC, so
1031 				 * so freeze the context. The interrupt handler
1032 				 * has already stopped the counter hardware.
1033 				 */
1034 				KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
1035 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
1036 				    KCPC_PIC_OVERFLOWED);
1037 			}
1038 		}
1039 		aston(t);
1040 	} else if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
1041 		/*
1042 		 * Thread context is no longer valid, but here may be a valid
1043 		 * CPU context.
1044 		 */
1045 		return (curthread->t_cpu->cpu_cpc_ctx);
1046 	}
1047 
1048 	return (NULL);
1049 }
1050 
1051 /*
1052  * The current thread context had an overflow interrupt; we're
1053  * executing here in high-level interrupt context.
1054  */
1055 /*ARGSUSED*/
1056 uint_t
1057 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
1058 {
1059 	kcpc_ctx_t *ctx;
1060 	uint64_t bitmap;
1061 	uint8_t *state;
1062 	int	save_spl;
1063 
1064 	if (pcbe_ops == NULL ||
1065 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
1066 		return (DDI_INTR_UNCLAIMED);
1067 
1068 	/*
1069 	 * Prevent any further interrupts.
1070 	 */
1071 	pcbe_ops->pcbe_allstop();
1072 
1073 	if (dtrace_cpc_in_use) {
1074 		state = &cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state;
1075 
1076 		/*
1077 		 * Set the per-CPU state bit to indicate that we are currently
1078 		 * processing an interrupt if it is currently free. Drop the
1079 		 * interrupt if the state isn't free (i.e. a configuration
1080 		 * event is taking place).
1081 		 */
1082 		if (atomic_cas_8(state, DCPC_INTR_FREE,
1083 		    DCPC_INTR_PROCESSING) == DCPC_INTR_FREE) {
1084 			int i;
1085 			kcpc_request_t req;
1086 
1087 			ASSERT(dtrace_cpc_fire != NULL);
1088 
1089 			(*dtrace_cpc_fire)(bitmap);
1090 
1091 			ctx = curthread->t_cpu->cpu_cpc_ctx;
1092 			if (ctx == NULL) {
1093 #ifdef DEBUG
1094 				cmn_err(CE_NOTE, "null cpc context in"
1095 				    "hardware overflow handler!\n");
1096 #endif
1097 				return (DDI_INTR_CLAIMED);
1098 			}
1099 
1100 			/* Reset any counters that have overflowed */
1101 			for (i = 0; i < ctx->kc_set->ks_nreqs; i++) {
1102 				req = ctx->kc_set->ks_req[i];
1103 
1104 				if (bitmap & (1 << req.kr_picnum)) {
1105 					pcbe_ops->pcbe_configure(req.kr_picnum,
1106 					    req.kr_event, req.kr_preset,
1107 					    req.kr_flags, req.kr_nattrs,
1108 					    req.kr_attr, &(req.kr_config),
1109 					    (void *)ctx);
1110 				}
1111 			}
1112 			pcbe_ops->pcbe_program(ctx);
1113 
1114 			/*
1115 			 * We've finished processing the interrupt so set
1116 			 * the state back to free.
1117 			 */
1118 			cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state =
1119 			    DCPC_INTR_FREE;
1120 			membar_producer();
1121 		}
1122 		return (DDI_INTR_CLAIMED);
1123 	}
1124 
1125 	/*
1126 	 * DTrace isn't involved so pass on accordingly.
1127 	 *
1128 	 * If the interrupt has occurred in the context of an lwp owning
1129 	 * the counters, then the handler posts an AST to the lwp to
1130 	 * trigger the actual sampling, and optionally deliver a signal or
1131 	 * restart the counters, on the way out of the kernel using
1132 	 * kcpc_hw_overflow_ast() (see below).
1133 	 *
1134 	 * On the other hand, if the handler returns the context to us
1135 	 * directly, then it means that there are no other threads in
1136 	 * the middle of updating it, no AST has been posted, and so we
1137 	 * should sample the counters here, and restart them with no
1138 	 * further fuss.
1139 	 *
1140 	 * The CPU's CPC context may disappear as a result of cross-call which
1141 	 * has higher PIL on x86, so protect the context by raising PIL to the
1142 	 * cross-call level.
1143 	 */
1144 	save_spl = spl_xcall();
1145 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
1146 		uint64_t curtick = KCPC_GET_TICK();
1147 
1148 		ctx->kc_hrtime = gethrtime_waitfree();
1149 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
1150 		ctx->kc_rawtick = curtick;
1151 		pcbe_ops->pcbe_sample(ctx);
1152 		pcbe_ops->pcbe_program(ctx);
1153 	}
1154 	splx(save_spl);
1155 
1156 	return (DDI_INTR_CLAIMED);
1157 }
1158 
1159 /*
1160  * Called from trap() when processing the ast posted by the high-level
1161  * interrupt handler.
1162  */
1163 int
1164 kcpc_overflow_ast()
1165 {
1166 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
1167 	int		i;
1168 	int		found = 0;
1169 	uint64_t	curtick = KCPC_GET_TICK();
1170 
1171 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
1172 
1173 	/*
1174 	 * An overflow happened: sample the context to ensure that
1175 	 * the overflow is propagated into the upper bits of the
1176 	 * virtualized 64-bit counter(s).
1177 	 */
1178 	kpreempt_disable();
1179 	ctx->kc_hrtime = gethrtime_waitfree();
1180 	pcbe_ops->pcbe_sample(ctx);
1181 	kpreempt_enable();
1182 
1183 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
1184 
1185 	/*
1186 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
1187 	 * if that pic generated an overflow and if the request it was counting
1188 	 * on behalf of had CPC_OVERFLOW_REQUEST specified. We go through all
1189 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
1190 	 * found any overflowed pics, keep the context frozen and return true
1191 	 * (thus causing a signal to be sent).
1192 	 */
1193 	for (i = 0; i < cpc_ncounters; i++) {
1194 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
1195 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
1196 			    ~KCPC_PIC_OVERFLOWED);
1197 			found = 1;
1198 		}
1199 	}
1200 	if (found)
1201 		return (1);
1202 
1203 	/*
1204 	 * Otherwise, re-enable the counters and continue life as before.
1205 	 */
1206 	kpreempt_disable();
1207 	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
1208 	pcbe_ops->pcbe_program(ctx);
1209 	kpreempt_enable();
1210 	return (0);
1211 }
1212 
1213 /*
1214  * Called when switching away from current thread.
1215  */
1216 static void
1217 kcpc_save(kcpc_ctx_t *ctx)
1218 {
1219 	int err;
1220 	int save_spl;
1221 
1222 	kpreempt_disable();
1223 	save_spl = spl_xcall();
1224 
1225 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1226 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
1227 			splx(save_spl);
1228 			kpreempt_enable();
1229 			return;
1230 		}
1231 		/*
1232 		 * This context has been invalidated but the counters have not
1233 		 * been stopped. Stop them here and mark the context stopped.
1234 		 */
1235 		kcpc_unprogram(ctx, B_TRUE);
1236 		splx(save_spl);
1237 		kpreempt_enable();
1238 		return;
1239 	}
1240 
1241 	pcbe_ops->pcbe_allstop();
1242 	if (ctx->kc_flags & KCPC_CTX_FREEZE) {
1243 		splx(save_spl);
1244 		kpreempt_enable();
1245 		return;
1246 	}
1247 
1248 	/*
1249 	 * Need to sample for all reqs into each req's current mpic.
1250 	 */
1251 	ctx->kc_hrtime = gethrtime_waitfree();
1252 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
1253 	pcbe_ops->pcbe_sample(ctx);
1254 
1255 	/*
1256 	 * Program counter for measuring capacity and utilization since user
1257 	 * thread isn't using counter anymore
1258 	 */
1259 	ASSERT(ctx->kc_cpuid == -1);
1260 	cu_cpc_program(CPU, &err);
1261 	splx(save_spl);
1262 	kpreempt_enable();
1263 }
1264 
/*
 * Context-switch "restore" operator for a thread-bound CPC context (it is
 * registered together with kcpc_save() through installctx(), e.g. in
 * kcpc_lwp_create()).
 *
 * Nothing is programmed when the context is invalid or frozen; an invalid
 * context that has not yet been marked stopped is flagged
 * KCPC_CTX_INVALID_STOPPED on the way out. Otherwise the counters are
 * programmed from ctx while KCPC_CTX_RESTORE is set, and any waiter on
 * kc_condv is signalled once programming has finished.
 */
static void
kcpc_restore(kcpc_ctx_t *ctx)
{
	int save_spl;

	mutex_enter(&ctx->kc_lock);

	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
	    KCPC_CTX_INVALID) {
		/*
		 * The context is invalidated but has not been marked stopped.
		 * We mark it as such here because we will not start the
		 * counters during this context switch.
		 */
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);
	}

	/* Invalid or frozen contexts never touch the hardware here. */
	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
		mutex_exit(&ctx->kc_lock);
		return;
	}

	/*
	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
	 * ctx & set related memory objects being freed without us knowing.
	 * This can happen if an agent thread is executing a kcpc_unbind(),
	 * with this thread as the target, whilst we're concurrently doing a
	 * restorectx() during, for example, a proc_exit().  Effectively, by
	 * doing this, we're asking kcpc_free() to cv_wait() until
	 * kcpc_restore() has completed.
	 */
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_RESTORE);
	mutex_exit(&ctx->kc_lock);

	/*
	 * While programming the hardware, the counters should be stopped. We
	 * don't do an explicit pcbe_allstop() here because they should have
	 * been stopped already by the last consumer.
	 */
	kpreempt_disable();
	save_spl = spl_xcall();
	kcpc_program(ctx, B_TRUE, B_TRUE);
	splx(save_spl);
	kpreempt_enable();

	/*
	 * Wake the agent thread if it's waiting in kcpc_free().
	 */
	mutex_enter(&ctx->kc_lock);
	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_RESTORE);
	cv_signal(&ctx->kc_condv);
	mutex_exit(&ctx->kc_lock);
}
1318 
1319 /*
1320  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the the
1321  * following context operators to the idle thread on each CPU. They stop the
1322  * counters when the idle thread is switched on, and they start them again when
1323  * it is switched off.
1324  */
1325 /*ARGSUSED*/
1326 void
1327 kcpc_idle_save(struct cpu *cp)
1328 {
1329 	/*
1330 	 * The idle thread shouldn't be run anywhere else.
1331 	 */
1332 	ASSERT(CPU == cp);
1333 
1334 	/*
1335 	 * We must hold the CPU's context lock to ensure the context isn't freed
1336 	 * while we're looking at it.
1337 	 */
1338 	mutex_enter(&cp->cpu_cpc_ctxlock);
1339 
1340 	if ((cp->cpu_cpc_ctx == NULL) ||
1341 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1342 		mutex_exit(&cp->cpu_cpc_ctxlock);
1343 		return;
1344 	}
1345 
1346 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
1347 	mutex_exit(&cp->cpu_cpc_ctxlock);
1348 }
1349 
1350 void
1351 kcpc_idle_restore(struct cpu *cp)
1352 {
1353 	/*
1354 	 * The idle thread shouldn't be run anywhere else.
1355 	 */
1356 	ASSERT(CPU == cp);
1357 
1358 	/*
1359 	 * We must hold the CPU's context lock to ensure the context isn't freed
1360 	 * while we're looking at it.
1361 	 */
1362 	mutex_enter(&cp->cpu_cpc_ctxlock);
1363 
1364 	if ((cp->cpu_cpc_ctx == NULL) ||
1365 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1366 		mutex_exit(&cp->cpu_cpc_ctxlock);
1367 		return;
1368 	}
1369 
1370 	pcbe_ops->pcbe_allstop();
1371 	mutex_exit(&cp->cpu_cpc_ctxlock);
1372 }
1373 
1374 /*ARGSUSED*/
1375 static void
1376 kcpc_lwp_create(kthread_t *t, kthread_t *ct)
1377 {
1378 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
1379 	int		i;
1380 
1381 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
1382 		return;
1383 
1384 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
1385 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1386 		rw_exit(&kcpc_cpuctx_lock);
1387 		return;
1388 	}
1389 	cctx = kcpc_ctx_alloc(KM_SLEEP);
1390 	kcpc_ctx_clone(ctx, cctx);
1391 	rw_exit(&kcpc_cpuctx_lock);
1392 
1393 	/*
1394 	 * Copy the parent context's kc_flags field, but don't overwrite
1395 	 * the child's in case it was modified during kcpc_ctx_clone.
1396 	 */
1397 	KCPC_CTX_FLAG_SET(cctx,  ctx->kc_flags);
1398 	cctx->kc_thread = ct;
1399 	cctx->kc_cpuid = -1;
1400 	ct->t_cpc_set = cctx->kc_set;
1401 	ct->t_cpc_ctx = cctx;
1402 
1403 	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
1404 		kcpc_set_t *ks = cctx->kc_set;
1405 		/*
1406 		 * Our contract with the user requires us to immediately send an
1407 		 * overflow signal to all children if we have the LWPINHERIT
1408 		 * and SIGOVF flags set. In addition, all counters should be
1409 		 * set to UINT64_MAX, and their pic's overflow flag turned on
1410 		 * so that our trap() processing knows to send a signal.
1411 		 */
1412 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
1413 		for (i = 0; i < ks->ks_nreqs; i++) {
1414 			kcpc_request_t *kr = &ks->ks_req[i];
1415 
1416 			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
1417 				*(kr->kr_data) = UINT64_MAX;
1418 				atomic_or_uint(&kr->kr_picp->kp_flags,
1419 				    KCPC_PIC_OVERFLOWED);
1420 			}
1421 		}
1422 		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1423 		aston(ct);
1424 	}
1425 
1426 	installctx(ct, cctx, kcpc_save, kcpc_restore,
1427 	    NULL, kcpc_lwp_create, NULL, kcpc_free);
1428 }
1429 
1430 /*
1431  * Counter Stoppage Theory
1432  *
 * The counters may need to be stopped properly on the following occasions:
1434  *
1435  * 1) An LWP exits.
1436  * 2) A thread exits.
1437  * 3) An LWP performs an exec().
1438  * 4) A bound set is unbound.
1439  *
1440  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
1441  * to be freed as well.
1442  *
1443  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
1444  * when the thread is freed, kcpc_free(), called by freectx(), frees the
1445  * context.
1446  *
1447  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
1448  *
1449  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
1450  * been called from exec. It stops the counters _and_ frees the context.
1451  *
1452  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
1453  *
1454  * CPU-bound counters are always stopped via kcpc_unbind().
1455  */
1456 
/*
 * We're being called to delete the context; we ensure that all associated data
 * structures are freed, and that the hardware is passivated if this is an exec.
 *
 * ctx    - context to destroy; its kc_set must be non-NULL.
 * isexec - non-zero when called on behalf of an exec, in which case the
 *          counters are stopped and the current thread's CPC pointers are
 *          cleared before anything is freed.
 *
 * The function first waits for any in-progress kcpc_restore() (flagged by
 * KCPC_CTX_RESTORE) and marks the context invalid. Finally it releases the
 * PCBE configurations, the set's data buffer, the context and the set.
 */

/*ARGSUSED*/
void
kcpc_free(kcpc_ctx_t *ctx, int isexec)
{
	int		i;
	kcpc_set_t	*set = ctx->kc_set;

	ASSERT(set != NULL);

	/*
	 * Wait for kcpc_restore() to finish before we tear things down.
	 */
	mutex_enter(&ctx->kc_lock);
	while (ctx->kc_flags & KCPC_CTX_RESTORE)
		cv_wait(&ctx->kc_condv, &ctx->kc_lock);
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
	mutex_exit(&ctx->kc_lock);

	if (isexec) {
		/*
		 * This thread is execing, and after the exec it should not have
		 * any performance counter context. Stop the counters properly
		 * here so the system isn't surprised by an overflow interrupt
		 * later.
		 */
		if (ctx->kc_cpuid != -1) {
			cpu_t *cp;
			/*
			 * CPU-bound context; stop the appropriate CPU's ctrs.
			 * Hold cpu_lock while examining the CPU to ensure it
			 * doesn't go away.
			 */
			mutex_enter(&cpu_lock);
			cp = cpu_get(ctx->kc_cpuid);
			/*
			 * The CPU could have been DR'd out, so only stop the
			 * CPU and clear its context pointer if the CPU still
			 * exists.
			 */
			if (cp != NULL) {
				mutex_enter(&cp->cpu_cpc_ctxlock);
				kcpc_stop_hw(ctx);
				mutex_exit(&cp->cpu_cpc_ctxlock);
			}
			mutex_exit(&cpu_lock);
			ASSERT(curthread->t_cpc_ctx == NULL);
		} else {
			int save_spl;

			/*
			 * Thread-bound context; stop _this_ CPU's counters.
			 * Preemption is disabled and the PIL is raised to the
			 * cross-call level around the unprogramming.
			 */
			kpreempt_disable();
			save_spl = spl_xcall();
			kcpc_unprogram(ctx, B_TRUE);
			curthread->t_cpc_ctx = NULL;
			splx(save_spl);
			kpreempt_enable();
		}

		/*
		 * Since we are being called from an exec and we know that
		 * exec is not permitted via the agent thread, we should clean
		 * up this thread's CPC state completely, and not leave dangling
		 * CPC pointers behind.
		 */
		ASSERT(ctx->kc_thread == curthread);
		curthread->t_cpc_set = NULL;
	}

	/*
	 * Walk through each request in this context's set and free the PCBE's
	 * configuration if it exists.
	 */
	for (i = 0; i < set->ks_nreqs; i++) {
		if (set->ks_req[i].kr_config != NULL)
			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
	}

	/*
	 * The data buffer is sized by ks_nreqs, so it has to be released
	 * before kcpc_free_set() frees the set itself.
	 */
	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
	kcpc_ctx_free(ctx);
	kcpc_free_set(set);
}
1545 
1546 /*
1547  * Free the memory associated with a request set.
1548  */
1549 void
1550 kcpc_free_set(kcpc_set_t *set)
1551 {
1552 	int		i;
1553 	kcpc_request_t	*req;
1554 
1555 	ASSERT(set->ks_req != NULL);
1556 
1557 	for (i = 0; i < set->ks_nreqs; i++) {
1558 		req = &set->ks_req[i];
1559 
1560 		if (req->kr_nattrs != 0) {
1561 			kmem_free(req->kr_attr,
1562 			    req->kr_nattrs * sizeof (kcpc_attr_t));
1563 		}
1564 	}
1565 
1566 	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1567 	cv_destroy(&set->ks_condv);
1568 	mutex_destroy(&set->ks_lock);
1569 	kmem_free(set, sizeof (kcpc_set_t));
1570 }
1571 
1572 /*
1573  * Grab every existing context and mark it as invalid.
1574  */
1575 void
1576 kcpc_invalidate_all(void)
1577 {
1578 	kcpc_ctx_t *ctx;
1579 	long hash;
1580 
1581 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1582 		mutex_enter(&kcpc_ctx_llock[hash]);
1583 		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1584 			KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1585 		mutex_exit(&kcpc_ctx_llock[hash]);
1586 	}
1587 }
1588 
1589 /*
1590  * Interface for PCBEs to signal that an existing configuration has suddenly
1591  * become invalid.
1592  */
1593 void
1594 kcpc_invalidate_config(void *token)
1595 {
1596 	kcpc_ctx_t *ctx = token;
1597 
1598 	ASSERT(ctx != NULL);
1599 
1600 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1601 }
1602 
/*
 * Called from lwp_exit() and thread_exit()
 *
 * Detaches the current thread from its CPC state. With no set bound this is
 * a no-op. With a set but no context the set is unbound via kcpc_unbind().
 * Otherwise the counters are stopped (unless the context is already marked
 * stopped), the context is marked invalid and stopped, and the thread's set
 * and context pointers are cleared; nothing is freed here.
 */
void
kcpc_passivate(void)
{
	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
	kcpc_set_t *set = curthread->t_cpc_set;
	int	save_spl;

	if (set == NULL)
		return;

	if (ctx == NULL) {
		/*
		 * This thread has a set but no context; it must be a CPU-bound
		 * set. The hardware will be stopped via kcpc_unbind() when the
		 * process exits and closes its file descriptors with
		 * kcpc_close(). Our only job here is to clean up this thread's
		 * state; the set will be freed with the unbind().
		 */
		(void) kcpc_unbind(set);
		/*
		 * Unbinding a set belonging to the current thread should clear
		 * its set pointer.
		 */
		ASSERT(curthread->t_cpc_set == NULL);
		return;
	}

	/*
	 * From here on preemption is disabled and the PIL is raised to the
	 * cross-call level until the thread's CPC pointers have been cleared.
	 */
	kpreempt_disable();
	save_spl = spl_xcall();
	curthread->t_cpc_set = NULL;

	/*
	 * This thread/LWP is exiting but context switches will continue to
	 * happen for a bit as the exit proceeds.  Kernel preemption must be
	 * disabled here to prevent a race between checking or setting the
	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
	 * a context switch.
	 */
	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
		kcpc_unprogram(ctx, B_TRUE);
		KCPC_CTX_FLAG_SET(ctx,
		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
	}

	/*
	 * We're cleaning up after this thread; ensure there are no dangling
	 * CPC pointers left behind. The context and set will be freed by
	 * freectx().
	 */
	curthread->t_cpc_ctx = NULL;

	splx(save_spl);
	kpreempt_enable();
}
1660 
1661 /*
1662  * Assign the requests in the given set to the PICs in the context.
1663  * Returns 0 if successful, -1 on failure.
1664  */
1665 /*ARGSUSED*/
1666 int
1667 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1668 {
1669 	int i;
1670 	int *picnum_save;
1671 
1672 	ASSERT(set->ks_nreqs <= cpc_ncounters);
1673 
1674 	/*
1675 	 * Provide kcpc_tryassign() with scratch space to avoid doing an
1676 	 * alloc/free with every invocation.
1677 	 */
1678 	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1679 	/*
1680 	 * kcpc_tryassign() blindly walks through each request in the set,
1681 	 * seeing if a counter can count its event. If yes, it assigns that
1682 	 * counter. However, that counter may have been the only capable counter
1683 	 * for _another_ request's event. The solution is to try every possible
1684 	 * request first. Note that this does not cover all solutions, as
1685 	 * that would require all unique orderings of requests, an n^n operation
1686 	 * which would be unacceptable for architectures with many counters.
1687 	 */
1688 	for (i = 0; i < set->ks_nreqs; i++)
1689 		if (kcpc_tryassign(set, i, picnum_save) == 0)
1690 			break;
1691 
1692 	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1693 	if (i == set->ks_nreqs)
1694 		return (-1);
1695 	return (0);
1696 }
1697 
1698 static int
1699 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1700 {
1701 	int		i;
1702 	int		j;
1703 	uint64_t	bitmap = 0, resmap = 0;
1704 	uint64_t	ctrmap;
1705 
1706 	/*
1707 	 * We are attempting to assign the reqs to pics, but we may fail. If we
1708 	 * fail, we need to restore the state of the requests to what it was
1709 	 * when we found it, as some reqs may have been explicitly assigned to
1710 	 * a specific PIC beforehand. We do this by snapshotting the assignments
1711 	 * now and restoring from it later if we fail.
1712 	 *
1713 	 * Also we note here which counters have already been claimed by
1714 	 * requests with explicit counter assignments.
1715 	 */
1716 	for (i = 0; i < set->ks_nreqs; i++) {
1717 		scratch[i] = set->ks_req[i].kr_picnum;
1718 		if (set->ks_req[i].kr_picnum != -1)
1719 			resmap |= (1 << set->ks_req[i].kr_picnum);
1720 	}
1721 
1722 	/*
1723 	 * Walk through requests assigning them to the first PIC that is
1724 	 * capable.
1725 	 */
1726 	i = starting_req;
1727 	do {
1728 		if (set->ks_req[i].kr_picnum != -1) {
1729 			ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
1730 			bitmap |= (1 << set->ks_req[i].kr_picnum);
1731 			if (++i == set->ks_nreqs)
1732 				i = 0;
1733 			continue;
1734 		}
1735 
1736 		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1737 		for (j = 0; j < cpc_ncounters; j++) {
1738 			if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
1739 			    (resmap & (1 << j)) == 0) {
1740 				/*
1741 				 * We can assign this counter because:
1742 				 *
1743 				 * 1. It can count the event (ctrmap)
1744 				 * 2. It hasn't been assigned yet (bitmap)
1745 				 * 3. It wasn't reserved by a request (resmap)
1746 				 */
1747 				bitmap |= (1 << j);
1748 				break;
1749 			}
1750 		}
1751 		if (j == cpc_ncounters) {
1752 			for (i = 0; i < set->ks_nreqs; i++)
1753 				set->ks_req[i].kr_picnum = scratch[i];
1754 			return (-1);
1755 		}
1756 		set->ks_req[i].kr_picnum = j;
1757 
1758 		if (++i == set->ks_nreqs)
1759 			i = 0;
1760 	} while (i != starting_req);
1761 
1762 	return (0);
1763 }
1764 
1765 kcpc_set_t *
1766 kcpc_dup_set(kcpc_set_t *set)
1767 {
1768 	kcpc_set_t	*new;
1769 	int		i;
1770 	int		j;
1771 
1772 	new = kmem_zalloc(sizeof (*new), KM_SLEEP);
1773 	new->ks_state &= ~KCPC_SET_BOUND;
1774 	new->ks_flags = set->ks_flags;
1775 	new->ks_nreqs = set->ks_nreqs;
1776 	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1777 	    KM_SLEEP);
1778 	new->ks_data = NULL;
1779 	new->ks_ctx = NULL;
1780 
1781 	for (i = 0; i < new->ks_nreqs; i++) {
1782 		new->ks_req[i].kr_config = NULL;
1783 		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1784 		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1785 		new->ks_req[i].kr_picp = NULL;
1786 		new->ks_req[i].kr_data = NULL;
1787 		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1788 		    CPC_MAX_EVENT_LEN);
1789 		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1790 		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1791 		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1792 		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1793 		    sizeof (kcpc_attr_t), KM_SLEEP);
1794 		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1795 			new->ks_req[i].kr_attr[j].ka_val =
1796 			    set->ks_req[i].kr_attr[j].ka_val;
1797 			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1798 			    set->ks_req[i].kr_attr[j].ka_name,
1799 			    CPC_MAX_ATTR_LEN);
1800 		}
1801 	}
1802 
1803 	return (new);
1804 }
1805 
1806 int
1807 kcpc_allow_nonpriv(void *token)
1808 {
1809 	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
1810 }
1811 
1812 void
1813 kcpc_invalidate(kthread_t *t)
1814 {
1815 	kcpc_ctx_t *ctx = t->t_cpc_ctx;
1816 
1817 	if (ctx != NULL)
1818 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1819 }
1820 
1821 /*
1822  * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
1823  * are used to construct PCBE names, starting with the most specific,
1824  * "pcbe.first.second.third.fourth" and ending with the least specific,
1825  * "pcbe.first".
1826  *
1827  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1828  */
1829 int
1830 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1831 {
1832 	uint_t s[3];
1833 
1834 	s[0] = first;
1835 	s[1] = second;
1836 	s[2] = third;
1837 
1838 	return (modload_qualified("pcbe",
1839 	    "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0);
1840 }
1841 
1842 /*
1843  * Create one or more CPC context for given CPU with specified counter event
1844  * requests
1845  *
1846  * If number of requested counter events is less than or equal number of
1847  * hardware counters on a CPU and can all be assigned to the counters on a CPU
1848  * at the same time, then make one CPC context.
1849  *
1850  * Otherwise, multiple CPC contexts are created to allow multiplexing more
1851  * counter events than existing counters onto the counters by iterating through
1852  * all of the CPC contexts, programming the counters with each CPC context one
1853  * at a time and measuring the resulting counter values.  Each of the resulting
1854  * CPC contexts contains some number of requested counter events less than or
1855  * equal the number of counters on a CPU depending on whether all the counter
1856  * events can be programmed on all the counters at the same time or not.
1857  *
1858  * Flags to kmem_{,z}alloc() are passed in as an argument to allow specifying
1859  * whether memory allocation should be non-blocking or not.  The code will try
1860  * to allocate *whole* CPC contexts if possible.  If there is any memory
1861  * allocation failure during the allocations needed for a given CPC context, it
1862  * will skip allocating that CPC context because it cannot allocate the whole
1863  * thing.  Thus, the only time that it will end up allocating none (ie. no CPC
1864  * contexts whatsoever) is when it cannot even allocate *one* whole CPC context
1865  * without a memory allocation failure occurring.
1866  */
1867 int
1868 kcpc_cpu_ctx_create(cpu_t *cp, kcpc_request_list_t *req_list, int kmem_flags,
1869     kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz)
1870 {
1871 	kcpc_ctx_t	**ctx_ptrs;
1872 	int		nctx;
1873 	int		nctx_ptrs;
1874 	int		nreqs;
1875 	kcpc_request_t	*reqs;
1876 
1877 	if (cp == NULL || ctx_ptr_array == NULL || ctx_ptr_array_sz == NULL ||
1878 	    req_list == NULL || req_list->krl_cnt < 1)
1879 		return (-1);
1880 
1881 	/*
1882 	 * Allocate number of sets assuming that each set contains one and only
1883 	 * one counter event request for each counter on a CPU
1884 	 */
1885 	nreqs = req_list->krl_cnt;
1886 	nctx_ptrs = (nreqs + cpc_ncounters - 1) / cpc_ncounters;
1887 	ctx_ptrs = kmem_zalloc(nctx_ptrs * sizeof (kcpc_ctx_t *), kmem_flags);
1888 	if (ctx_ptrs == NULL)
1889 		return (-2);
1890 
1891 	/*
1892 	 * Fill in sets of requests
1893 	 */
1894 	nctx = 0;
1895 	reqs = req_list->krl_list;
1896 	while (nreqs > 0) {
1897 		kcpc_ctx_t	*ctx;
1898 		kcpc_set_t	*set;
1899 		int		subcode;
1900 
1901 		/*
1902 		 * Allocate CPC context and set for requested counter events
1903 		 */
1904 		ctx = kcpc_ctx_alloc(kmem_flags);
1905 		set = kcpc_set_create(reqs, nreqs, 0, kmem_flags);
1906 		if (set == NULL) {
1907 			kcpc_ctx_free(ctx);
1908 			break;
1909 		}
1910 
1911 		/*
1912 		 * Determine assignment of requested counter events to specific
1913 		 * counters
1914 		 */
1915 		if (kcpc_assign_reqs(set, ctx) != 0) {
1916 			/*
1917 			 * May not be able to assign requested counter events
1918 			 * to all counters since all counters may not be able
1919 			 * to do all events, so only do one counter event in
1920 			 * set of counter requests when this happens since at
1921 			 * least one of the counters must be able to do the
1922 			 * event.
1923 			 */
1924 			kcpc_free_set(set);
1925 			set = kcpc_set_create(reqs, 1, 0, kmem_flags);
1926 			if (set == NULL) {
1927 				kcpc_ctx_free(ctx);
1928 				break;
1929 			}
1930 			if (kcpc_assign_reqs(set, ctx) != 0) {
1931 #ifdef DEBUG
1932 				cmn_err(CE_NOTE, "!kcpc_cpu_ctx_create: can't "
1933 				    "assign counter event %s!\n",
1934 				    set->ks_req->kr_event);
1935 #endif
1936 				kcpc_free_set(set);
1937 				kcpc_ctx_free(ctx);
1938 				reqs++;
1939 				nreqs--;
1940 				continue;
1941 			}
1942 		}
1943 
1944 		/*
1945 		 * Allocate memory needed to hold requested counter event data
1946 		 */
1947 		set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t),
1948 		    kmem_flags);
1949 		if (set->ks_data == NULL) {
1950 			kcpc_free_set(set);
1951 			kcpc_ctx_free(ctx);
1952 			break;
1953 		}
1954 
1955 		/*
1956 		 * Configure requested counter events
1957 		 */
1958 		if (kcpc_configure_reqs(ctx, set, &subcode) != 0) {
1959 #ifdef DEBUG
1960 			cmn_err(CE_NOTE,
1961 			    "!kcpc_cpu_ctx_create: can't configure "
1962 			    "set of counter event requests!\n");
1963 #endif
1964 			reqs += set->ks_nreqs;
1965 			nreqs -= set->ks_nreqs;
1966 			kmem_free(set->ks_data,
1967 			    set->ks_nreqs * sizeof (uint64_t));
1968 			kcpc_free_set(set);
1969 			kcpc_ctx_free(ctx);
1970 			continue;
1971 		}
1972 
1973 		/*
1974 		 * Point set of counter event requests at this context and fill
1975 		 * in CPC context
1976 		 */
1977 		set->ks_ctx = ctx;
1978 		ctx->kc_set = set;
1979 		ctx->kc_cpuid = cp->cpu_id;
1980 		ctx->kc_thread = curthread;
1981 
1982 		ctx_ptrs[nctx] = ctx;
1983 
1984 		/*
1985 		 * Update requests and how many are left to be assigned to sets
1986 		 */
1987 		reqs += set->ks_nreqs;
1988 		nreqs -= set->ks_nreqs;
1989 
1990 		/*
1991 		 * Increment number of CPC contexts and allocate bigger array
1992 		 * for context pointers as needed
1993 		 */
1994 		nctx++;
1995 		if (nctx >= nctx_ptrs) {
1996 			kcpc_ctx_t	**new;
1997 			int		new_cnt;
1998 
1999 			/*
2000 			 * Allocate more CPC contexts based on how many
2001 			 * contexts allocated so far and how many counter
2002 			 * requests left to assign
2003 			 */
2004 			new_cnt = nctx_ptrs +
2005 			    ((nreqs + cpc_ncounters - 1) / cpc_ncounters);
2006 			new = kmem_zalloc(new_cnt * sizeof (kcpc_ctx_t *),
2007 			    kmem_flags);
2008 			if (new == NULL)
2009 				break;
2010 
2011 			/*
2012 			 * Copy contents of old sets into new ones
2013 			 */
2014 			bcopy(ctx_ptrs, new,
2015 			    nctx_ptrs * sizeof (kcpc_ctx_t *));
2016 
2017 			/*
2018 			 * Free old array of context pointers and use newly
2019 			 * allocated one instead now
2020 			 */
2021 			kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
2022 			ctx_ptrs = new;
2023 			nctx_ptrs = new_cnt;
2024 		}
2025 	}
2026 
2027 	/*
2028 	 * Return NULL if no CPC contexts filled in
2029 	 */
2030 	if (nctx == 0) {
2031 		kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
2032 		*ctx_ptr_array = NULL;
2033 		*ctx_ptr_array_sz = 0;
2034 		return (-2);
2035 	}
2036 
2037 	*ctx_ptr_array = ctx_ptrs;
2038 	*ctx_ptr_array_sz = nctx_ptrs * sizeof (kcpc_ctx_t *);
2039 	return (nctx);
2040 }
2041 
2042 /*
2043  * Return whether PCBE supports given counter event
2044  */
2045 boolean_t
2046 kcpc_event_supported(char *event)
2047 {
2048 	if (pcbe_ops == NULL || pcbe_ops->pcbe_event_coverage(event) == 0)
2049 		return (B_FALSE);
2050 
2051 	return (B_TRUE);
2052 }
2053 
/*
 * Program counters on current CPU with given CPC context
 *
 * If kernel is interposing on counters to measure hardware capacity and
 * utilization, then unprogram counters for kernel *before* programming them
 * with specified CPC context.
 *
 * kcpc_{program,unprogram}() may be called either directly by a thread running
 * on the target CPU or from a cross-call from another CPU. To protect
 * programming and unprogramming from being interrupted by cross-calls, callers
 * who execute kcpc_{program,unprogram} should raise PIL to the level used by
 * cross-calls.
 *
 * ctx          - context to program; must be bound to the current CPU or to
 *                a thread (kc_cpuid == -1).
 * for_thread   - B_TRUE: clear KCPC_CTX_FREEZE on ctx after programming;
 *                otherwise record ctx as the current CPU's cpu_cpc_ctx.
 * cu_interpose - B_TRUE: unprogram the capacity/utilization counters first;
 *                otherwise reset every request in ctx's set to its preset
 *                before programming.
 *
 * The call returns without doing anything if the context is NULL, is bound to
 * a different CPU, or preemption is not disabled (these conditions are also
 * ASSERTed).
 */
void
kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, boolean_t cu_interpose)
{
	int	error;

	ASSERT(IS_HIPIL());

	/*
	 * CPC context shouldn't be NULL, its CPU field should specify current
	 * CPU or be -1 to specify any CPU when the context is bound to a
	 * thread, and preemption should be disabled
	 */
	ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
	    ctx->kc_cpuid == -1) && curthread->t_preempt > 0);
	if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
	    ctx->kc_cpuid != -1) || curthread->t_preempt < 1)
		return;

	/*
	 * Unprogram counters for kernel measuring hardware capacity and
	 * utilization
	 */
	if (cu_interpose == B_TRUE) {
		cu_cpc_unprogram(CPU, &error);
	} else {
		kcpc_set_t *set = ctx->kc_set;
		int i;

		ASSERT(set != NULL);

		/*
		 * Since cu_interpose is false, we are programming CU context.
		 * In general, PCBE can continue from the state saved in the
		 * set, but it is not very reliable, so we start again from the
		 * preset value.
		 */
		for (i = 0; i < set->ks_nreqs; i++) {
			/*
			 * Reset the virtual counter value to the preset value.
			 */
			*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;

			/*
			 * Reset PCBE to the preset value.
			 */
			pcbe_ops->pcbe_configure(0, NULL,
			    set->ks_req[i].kr_preset,
			    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
		}
	}

	/*
	 * Program counters with specified CPC context. The raw tick is
	 * recorded immediately before the hardware is started.
	 */
	ctx->kc_rawtick = KCPC_GET_TICK();
	pcbe_ops->pcbe_program(ctx);

	/*
	 * Denote that counters programmed for thread or CPU CPC context
	 * differently
	 */
	if (for_thread == B_TRUE)
		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
	else
		CPU->cpu_cpc_ctx = ctx;
}
2133 
2134 /*
2135  * Unprogram counters with given CPC context on current CPU
2136  *
2137  * If kernel is interposing on counters to measure hardware capacity and
2138  * utilization, then program counters for the kernel capacity and utilization
2139  * *after* unprogramming them for given CPC context.
2140  *
2141  * See the comment for kcpc_program regarding the synchronization with
2142  * cross-calls.
2143  */
2144 void
2145 kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose)
2146 {
2147 	int	error;
2148 
2149 	ASSERT(IS_HIPIL());
2150 
2151 	/*
2152 	 * CPC context shouldn't be NULL, its CPU field should specify current
2153 	 * CPU or be -1 to specify any CPU when the context is bound to a
2154 	 * thread, and preemption should be disabled
2155 	 */
2156 	ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
2157 	    ctx->kc_cpuid == -1) && curthread->t_preempt > 0);
2158 
2159 	if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
2160 	    ctx->kc_cpuid != -1) || curthread->t_preempt < 1 ||
2161 	    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) != 0) {
2162 		return;
2163 	}
2164 
2165 	/*
2166 	 * Specified CPC context to be unprogrammed should be bound to current
2167 	 * CPU or thread
2168 	 */
2169 	ASSERT(CPU->cpu_cpc_ctx == ctx || curthread->t_cpc_ctx == ctx);
2170 
2171 	/*
2172 	 * Stop counters
2173 	 */
2174 	pcbe_ops->pcbe_allstop();
2175 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);
2176 
2177 	/*
2178 	 * Allow kernel to interpose on counters and program them for its own
2179 	 * use to measure hardware capacity and utilization if cu_interpose
2180 	 * argument is true
2181 	 */
2182 	if (cu_interpose == B_TRUE)
2183 		cu_cpc_program(CPU, &error);
2184 }
2185 
2186 /*
2187  * Read CPU Performance Counter (CPC) on current CPU and call specified update
2188  * routine with data for each counter event currently programmed on CPU
2189  */
2190 int
2191 kcpc_read(kcpc_update_func_t update_func)
2192 {
2193 	kcpc_ctx_t	*ctx;
2194 	int		i;
2195 	kcpc_request_t	*req;
2196 	int		retval;
2197 	kcpc_set_t	*set;
2198 
2199 	ASSERT(IS_HIPIL());
2200 
2201 	/*
2202 	 * Can't grab locks or block because may be called inside dispatcher
2203 	 */
2204 	kpreempt_disable();
2205 
2206 	ctx = CPU->cpu_cpc_ctx;
2207 	if (ctx == NULL) {
2208 		kpreempt_enable();
2209 		return (0);
2210 	}
2211 
2212 	/*
2213 	 * Read counter data from current CPU
2214 	 */
2215 	pcbe_ops->pcbe_sample(ctx);
2216 
2217 	set = ctx->kc_set;
2218 	if (set == NULL || set->ks_req == NULL) {
2219 		kpreempt_enable();
2220 		return (0);
2221 	}
2222 
2223 	/*
2224 	 * Call update function with preset pointer and data for each CPC event
2225 	 * request currently programmed on current CPU
2226 	 */
2227 	req = set->ks_req;
2228 	retval = 0;
2229 	for (i = 0; i < set->ks_nreqs; i++) {
2230 		int	ret;
2231 
2232 		if (req[i].kr_data == NULL)
2233 			break;
2234 
2235 		ret = update_func(req[i].kr_ptr, *req[i].kr_data);
2236 		if (ret < 0)
2237 			retval = ret;
2238 	}
2239 
2240 	kpreempt_enable();
2241 
2242 	return (retval);
2243 }
2244 
2245 /*
2246  * Initialize list of counter event requests
2247  */
2248 kcpc_request_list_t *
2249 kcpc_reqs_init(int nreqs, int kmem_flags)
2250 {
2251 	kcpc_request_list_t	*req_list;
2252 	kcpc_request_t		*reqs;
2253 
2254 	if (nreqs < 1)
2255 		return (NULL);
2256 
2257 	req_list = kmem_zalloc(sizeof (kcpc_request_list_t), kmem_flags);
2258 	if (req_list == NULL)
2259 		return (NULL);
2260 
2261 	reqs = kmem_zalloc(nreqs * sizeof (kcpc_request_t), kmem_flags);
2262 	if (reqs == NULL) {
2263 		kmem_free(req_list, sizeof (kcpc_request_list_t));
2264 		return (NULL);
2265 	}
2266 
2267 	req_list->krl_list = reqs;
2268 	req_list->krl_cnt = 0;
2269 	req_list->krl_max = nreqs;
2270 	return (req_list);
2271 }
2272 
2273 
2274 /*
2275  * Add counter event request to given list of counter event requests
2276  */
2277 int
2278 kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, uint64_t preset,
2279     uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, int kmem_flags)
2280 {
2281 	kcpc_request_t	*req;
2282 
2283 	if (req_list == NULL || req_list->krl_list == NULL)
2284 		return (-1);
2285 
2286 	ASSERT(req_list->krl_max != 0);
2287 
2288 	/*
2289 	 * Allocate more space (if needed)
2290 	 */
2291 	if (req_list->krl_cnt > req_list->krl_max) {
2292 		kcpc_request_t	*new;
2293 		kcpc_request_t	*old;
2294 
2295 		old = req_list->krl_list;
2296 		new = kmem_zalloc((req_list->krl_max +
2297 		    cpc_ncounters) * sizeof (kcpc_request_t), kmem_flags);
2298 		if (new == NULL)
2299 			return (-2);
2300 
2301 		req_list->krl_list = new;
2302 		bcopy(old, req_list->krl_list,
2303 		    req_list->krl_cnt * sizeof (kcpc_request_t));
2304 		kmem_free(old, req_list->krl_max * sizeof (kcpc_request_t));
2305 		req_list->krl_cnt = 0;
2306 		req_list->krl_max += cpc_ncounters;
2307 	}
2308 
2309 	/*
2310 	 * Fill in request as much as possible now, but some fields will need
2311 	 * to be set when request is assigned to a set.
2312 	 */
2313 	req = &req_list->krl_list[req_list->krl_cnt];
2314 	req->kr_config = NULL;
2315 	req->kr_picnum = -1;	/* have CPC pick this */
2316 	req->kr_index = -1;	/* set when assigning request to set */
2317 	req->kr_data = NULL;	/* set when configuring request */
2318 	(void) strcpy(req->kr_event, event);
2319 	req->kr_preset = preset;
2320 	req->kr_flags = flags;
2321 	req->kr_nattrs = nattrs;
2322 	req->kr_attr = attr;
2323 	/*
2324 	 * Keep pointer given by caller to give to update function when this
2325 	 * counter event is sampled/read
2326 	 */
2327 	req->kr_ptr = ptr;
2328 
2329 	req_list->krl_cnt++;
2330 
2331 	return (0);
2332 }
2333 
2334 /*
2335  * Reset list of CPC event requests so its space can be used for another set
2336  * of requests
2337  */
2338 int
2339 kcpc_reqs_reset(kcpc_request_list_t *req_list)
2340 {
2341 	/*
2342 	 * Return when pointer to request list structure or request is NULL or
2343 	 * when max requests is less than or equal to 0
2344 	 */
2345 	if (req_list == NULL || req_list->krl_list == NULL ||
2346 	    req_list->krl_max <= 0)
2347 		return (-1);
2348 
2349 	/*
2350 	 * Zero out requests and number of requests used
2351 	 */
2352 	bzero(req_list->krl_list, req_list->krl_max * sizeof (kcpc_request_t));
2353 	req_list->krl_cnt = 0;
2354 	return (0);
2355 }
2356 
2357 /*
2358  * Free given list of counter event requests
2359  */
2360 int
2361 kcpc_reqs_fini(kcpc_request_list_t *req_list)
2362 {
2363 	kmem_free(req_list->krl_list,
2364 	    req_list->krl_max * sizeof (kcpc_request_t));
2365 	kmem_free(req_list, sizeof (kcpc_request_list_t));
2366 	return (0);
2367 }
2368 
2369 /*
2370  * Create set of given counter event requests
2371  */
2372 static kcpc_set_t *
2373 kcpc_set_create(kcpc_request_t *reqs, int nreqs, int set_flags, int kmem_flags)
2374 {
2375 	int		i;
2376 	kcpc_set_t	*set;
2377 
2378 	/*
2379 	 * Allocate set and assign number of requests in set and flags
2380 	 */
2381 	set = kmem_zalloc(sizeof (kcpc_set_t), kmem_flags);
2382 	if (set == NULL)
2383 		return (NULL);
2384 
2385 	if (nreqs < cpc_ncounters)
2386 		set->ks_nreqs = nreqs;
2387 	else
2388 		set->ks_nreqs = cpc_ncounters;
2389 
2390 	set->ks_flags = set_flags;
2391 
2392 	/*
2393 	 * Allocate requests needed, copy requests into set, and set index into
2394 	 * data for each request (which may change when we assign requested
2395 	 * counter events to counters)
2396 	 */
2397 	set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
2398 	    set->ks_nreqs, kmem_flags);
2399 	if (set->ks_req == NULL) {
2400 		kmem_free(set, sizeof (kcpc_set_t));
2401 		return (NULL);
2402 	}
2403 
2404 	bcopy(reqs, set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
2405 
2406 	for (i = 0; i < set->ks_nreqs; i++)
2407 		set->ks_req[i].kr_index = i;
2408 
2409 	return (set);
2410 }
2411 
2412 
2413 /*
2414  * Stop counters on current CPU.
2415  *
2416  * If preserve_context is true, the caller is interested in the CPU's CPC
2417  * context and wants it to be preserved.
2418  *
2419  * If preserve_context is false, the caller does not need the CPU's CPC context
2420  * to be preserved, so it is set to NULL.
2421  */
2422 static void
2423 kcpc_cpustop_func(boolean_t preserve_context)
2424 {
2425 	kpreempt_disable();
2426 
2427 	/*
2428 	 * Someone already stopped this context before us, so there is nothing
2429 	 * to do.
2430 	 */
2431 	if (CPU->cpu_cpc_ctx == NULL) {
2432 		kpreempt_enable();
2433 		return;
2434 	}
2435 
2436 	kcpc_unprogram(CPU->cpu_cpc_ctx, B_TRUE);
2437 	/*
2438 	 * If CU does not use counters, then clear the CPU's CPC context
2439 	 * If the caller requested to preserve context it should disable CU
2440 	 * first, so there should be no CU context now.
2441 	 */
2442 	ASSERT(!preserve_context || !CU_CPC_ON(CPU));
2443 	if (!preserve_context && CPU->cpu_cpc_ctx != NULL && !CU_CPC_ON(CPU))
2444 		CPU->cpu_cpc_ctx = NULL;
2445 
2446 	kpreempt_enable();
2447 }
2448 
2449 /*
2450  * Stop counters on given CPU and set its CPC context to NULL unless
2451  * preserve_context is true.
2452  */
2453 void
2454 kcpc_cpu_stop(cpu_t *cp, boolean_t preserve_context)
2455 {
2456 	cpu_call(cp, (cpu_call_func_t)kcpc_cpustop_func,
2457 	    preserve_context, 0);
2458 }
2459 
2460 /*
2461  * Program the context on the current CPU
2462  */
2463 static void
2464 kcpc_remoteprogram_func(kcpc_ctx_t *ctx, uintptr_t arg)
2465 {
2466 	boolean_t for_thread = (boolean_t)arg;
2467 
2468 	ASSERT(ctx != NULL);
2469 
2470 	kpreempt_disable();
2471 	kcpc_program(ctx, for_thread, B_TRUE);
2472 	kpreempt_enable();
2473 }
2474 
2475 /*
2476  * Program counters on given CPU
2477  */
2478 void
2479 kcpc_cpu_program(cpu_t *cp, kcpc_ctx_t *ctx)
2480 {
2481 	cpu_call(cp, (cpu_call_func_t)kcpc_remoteprogram_func, (uintptr_t)ctx,
2482 	    (uintptr_t)B_FALSE);
2483 }
2484 
2485 char *
2486 kcpc_list_attrs(void)
2487 {
2488 	ASSERT(pcbe_ops != NULL);
2489 
2490 	return (pcbe_ops->pcbe_list_attrs());
2491 }
2492 
2493 char *
2494 kcpc_list_events(uint_t pic)
2495 {
2496 	ASSERT(pcbe_ops != NULL);
2497 
2498 	return (pcbe_ops->pcbe_list_events(pic));
2499 }
2500 
2501 uint_t
2502 kcpc_pcbe_capabilities(void)
2503 {
2504 	ASSERT(pcbe_ops != NULL);
2505 
2506 	return (pcbe_ops->pcbe_caps);
2507 }
2508 
2509 int
2510 kcpc_pcbe_loaded(void)
2511 {
2512 	return (pcbe_ops == NULL ? -1 : 0);
2513 }
2514