xref: /illumos-gate/usr/src/uts/common/os/kcpc.c (revision 9ccfd38e64be4d6a2c01daa15fe7f5627122bdc2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2021 Joyent, Inc.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/thread.h>
29 #include <sys/cpuvar.h>
30 #include <sys/inttypes.h>
31 #include <sys/cmn_err.h>
32 #include <sys/time.h>
33 #include <sys/ksynch.h>
34 #include <sys/systm.h>
35 #include <sys/kcpc.h>
36 #include <sys/cpc_impl.h>
37 #include <sys/cpc_pcbe.h>
38 #include <sys/atomic.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/sdt.h>
42 #include <sys/archsystm.h>
43 #include <sys/promif.h>
44 #include <sys/x_call.h>
45 #include <sys/cap_util.h>
46 #if defined(__x86)
47 #include <asm/clock.h>
48 #include <sys/xc_levels.h>
49 #endif
50 
/*
 * Hash chains of allocated contexts. kcpc_ctx_alloc() links a new context
 * onto the chain selected by CPC_HASH_CTX() and kcpc_ctx_free() unlinks it
 * again, both while holding the kcpc_ctx_llock[] entry for that chain.
 */
static kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
static kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */


/*
 * kcpc_cpuctx_lock is initialized by kcpc_init(); kcpc_enable() requires its
 * caller to hold it as a reader.
 */
krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
int		kcpc_cpuctx;		/* number of cpu-specific contexts */

int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */

/*
 * These are set when a PCBE module is loaded: kcpc_register_pcbe() stores the
 * backend's ops vector and asks it for the number of counters.
 */
uint_t		cpc_ncounters = 0;
pcbe_ops_t	*pcbe_ops = NULL;

/*
 * Statistics on (mis)behavior. Both counters are bumped atomically by
 * kcpc_overflow_intr().
 */
static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */

/*
 * By setting 'kcpc_nullctx_panic' to 1, any overflow interrupts in a thread
 * with no valid context will result in a panic.
 */
static int kcpc_nullctx_panic = 0;

/*
 * Prototypes for the file-local (static) functions used before their
 * definitions.
 */
static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
static void kcpc_restore(kcpc_ctx_t *ctx);
static void kcpc_save(kcpc_ctx_t *ctx);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
static kcpc_set_t *kcpc_set_create(kcpc_request_t *reqs, int nreqs,
    int set_flags, int kmem_flags);
86 
/*
 * Context flag manipulation.
 *
 * Every update of a context's kc_flags word should go through one of the two
 * macros below. Some of those updates are made without holding any lock, so
 * the word must always be modified with an atomic operation.
 */
#define	KCPC_CTX_FLAG_SET(c, bits) atomic_or_uint(&(c)->kc_flags, (bits))
#define	KCPC_CTX_FLAG_CLR(c, bits) atomic_and_uint(&(c)->kc_flags, ~(bits))
96 
/*
 * IS_HIPIL() checks that the caller is running at or above the cross-call
 * PIL, i.e. from a cross-call or from a high-PIL interrupt. The check is only
 * compiled in on DEBUG kernels; otherwise the macro expands to nothing.
 */
#if defined(DEBUG)
#define	IS_HIPIL()	(getpil() >= XCALL_PIL)
#else
#define	IS_HIPIL()
#endif	/* defined(DEBUG) */
106 
107 
108 extern int kcpc_hw_load_pcbe(void);
109 
110 /*
111  * Return value from kcpc_hw_load_pcbe()
112  */
113 static int kcpc_pcbe_error = 0;
114 
115 /*
116  * Perform one-time initialization of kcpc framework.
117  * This function performs the initialization only the first time it is called.
118  * It is safe to call it multiple times.
119  */
120 int
121 kcpc_init(void)
122 {
123 	long hash;
124 	static uint32_t kcpc_initialized = 0;
125 
126 	/*
127 	 * We already tried loading platform pcbe module and failed
128 	 */
129 	if (kcpc_pcbe_error != 0)
130 		return (-1);
131 
132 	/*
133 	 * The kcpc framework should be initialized at most once
134 	 */
135 	if (atomic_cas_32(&kcpc_initialized, 0, 1) != 0)
136 		return (0);
137 
138 	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
139 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
140 		mutex_init(&kcpc_ctx_llock[hash],
141 		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
142 
143 	/*
144 	 * Load platform-specific pcbe module
145 	 */
146 	kcpc_pcbe_error = kcpc_hw_load_pcbe();
147 
148 	return (kcpc_pcbe_error == 0 ? 0 : -1);
149 }
150 
151 void
152 kcpc_register_pcbe(pcbe_ops_t *ops)
153 {
154 	pcbe_ops = ops;
155 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
156 }
157 
158 void
159 kcpc_register_dcpc(void (*func)(uint64_t))
160 {
161 	dtrace_cpc_fire = func;
162 }
163 
164 void
165 kcpc_unregister_dcpc(void)
166 {
167 	dtrace_cpc_fire = NULL;
168 }
169 
170 int
171 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
172 {
173 	cpu_t		*cp;
174 	kcpc_ctx_t	*ctx;
175 	int		error;
176 	int		save_spl;
177 
178 	ctx = kcpc_ctx_alloc(KM_SLEEP);
179 
180 	if (kcpc_assign_reqs(set, ctx) != 0) {
181 		kcpc_ctx_free(ctx);
182 		*subcode = CPC_RESOURCE_UNAVAIL;
183 		return (EINVAL);
184 	}
185 
186 	ctx->kc_cpuid = cpuid;
187 	ctx->kc_thread = curthread;
188 
189 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
190 
191 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
192 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
193 		kcpc_ctx_free(ctx);
194 		return (error);
195 	}
196 
197 	set->ks_ctx = ctx;
198 	ctx->kc_set = set;
199 
200 	/*
201 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
202 	 * we are manipulating the cpu_t and programming the hardware, else the
203 	 * the cpu_t could go away while we're looking at it.
204 	 */
205 	mutex_enter(&cpu_lock);
206 	cp = cpu_get(cpuid);
207 
208 	if (cp == NULL)
209 		/*
210 		 * The CPU could have been DRd out while we were getting set up.
211 		 */
212 		goto unbound;
213 
214 	mutex_enter(&cp->cpu_cpc_ctxlock);
215 	kpreempt_disable();
216 	save_spl = spl_xcall();
217 
218 	/*
219 	 * Check to see whether counters for CPU already being used by someone
220 	 * other than kernel for capacity and utilization (since kernel will
221 	 * let go of counters for user in kcpc_program() below)
222 	 */
223 	if (cp->cpu_cpc_ctx != NULL && !CU_CPC_ON(cp)) {
224 		/*
225 		 * If this CPU already has a bound set, return an error.
226 		 */
227 		splx(save_spl);
228 		kpreempt_enable();
229 		mutex_exit(&cp->cpu_cpc_ctxlock);
230 		goto unbound;
231 	}
232 
233 	if (curthread->t_bind_cpu != cpuid) {
234 		splx(save_spl);
235 		kpreempt_enable();
236 		mutex_exit(&cp->cpu_cpc_ctxlock);
237 		goto unbound;
238 	}
239 
240 	kcpc_program(ctx, B_FALSE, B_TRUE);
241 
242 	splx(save_spl);
243 	kpreempt_enable();
244 
245 	mutex_exit(&cp->cpu_cpc_ctxlock);
246 	mutex_exit(&cpu_lock);
247 
248 	mutex_enter(&set->ks_lock);
249 	set->ks_state |= KCPC_SET_BOUND;
250 	cv_signal(&set->ks_condv);
251 	mutex_exit(&set->ks_lock);
252 
253 	return (0);
254 
255 unbound:
256 	mutex_exit(&cpu_lock);
257 	set->ks_ctx = NULL;
258 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
259 	kcpc_ctx_free(ctx);
260 	return (EAGAIN);
261 }
262 
263 int
264 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
265 {
266 	kcpc_ctx_t	*ctx;
267 	int		error;
268 
269 	/*
270 	 * Only one set is allowed per context, so ensure there is no
271 	 * existing context.
272 	 */
273 
274 	if (t->t_cpc_ctx != NULL)
275 		return (EEXIST);
276 
277 	ctx = kcpc_ctx_alloc(KM_SLEEP);
278 
279 	/*
280 	 * The context must begin life frozen until it has been properly
281 	 * programmed onto the hardware. This prevents the context ops from
282 	 * worrying about it until we're ready.
283 	 */
284 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
285 	ctx->kc_hrtime = gethrtime();
286 
287 	if (kcpc_assign_reqs(set, ctx) != 0) {
288 		kcpc_ctx_free(ctx);
289 		*subcode = CPC_RESOURCE_UNAVAIL;
290 		return (EINVAL);
291 	}
292 
293 	ctx->kc_cpuid = -1;
294 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
295 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_LWPINHERIT);
296 	ctx->kc_thread = t;
297 	t->t_cpc_ctx = ctx;
298 	/*
299 	 * Permit threads to look at their own hardware counters from userland.
300 	 */
301 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_NONPRIV);
302 
303 	/*
304 	 * Create the data store for this set.
305 	 */
306 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
307 
308 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
309 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
310 		kcpc_ctx_free(ctx);
311 		t->t_cpc_ctx = NULL;
312 		return (error);
313 	}
314 
315 	set->ks_ctx = ctx;
316 	ctx->kc_set = set;
317 
318 	/*
319 	 * Add a device context to the subject thread.
320 	 */
321 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
322 	    kcpc_lwp_create, NULL, kcpc_free, NULL);
323 
324 	/*
325 	 * Ask the backend to program the hardware.
326 	 */
327 	if (t == curthread) {
328 		int save_spl;
329 
330 		kpreempt_disable();
331 		save_spl = spl_xcall();
332 		kcpc_program(ctx, B_TRUE, B_TRUE);
333 		splx(save_spl);
334 		kpreempt_enable();
335 	} else {
336 		/*
337 		 * Since we are the agent LWP, we know the victim LWP is stopped
338 		 * until we're done here; no need to worry about preemption or
339 		 * migration here. We still use an atomic op to clear the flag
340 		 * to ensure the flags are always self-consistent; they can
341 		 * still be accessed from, for instance, another CPU doing a
342 		 * kcpc_invalidate_all().
343 		 */
344 		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
345 	}
346 
347 	mutex_enter(&set->ks_lock);
348 	set->ks_state |= KCPC_SET_BOUND;
349 	cv_signal(&set->ks_condv);
350 	mutex_exit(&set->ks_lock);
351 
352 	return (0);
353 }
354 
355 /*
356  * Walk through each request in the set and ask the PCBE to configure a
357  * corresponding counter.
358  */
359 int
360 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
361 {
362 	int		i;
363 	int		ret;
364 	kcpc_request_t	*rp;
365 
366 	for (i = 0; i < set->ks_nreqs; i++) {
367 		int n;
368 		rp = &set->ks_req[i];
369 
370 		n = rp->kr_picnum;
371 
372 		ASSERT(n >= 0 && n < cpc_ncounters);
373 
374 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
375 
376 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
377 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
378 			    == 0) {
379 				*subcode = -1;
380 				return (ENOTSUP);
381 			}
382 			/*
383 			 * If any of the counters have requested overflow
384 			 * notification, we flag the context as being one that
385 			 * cares about overflow.
386 			 */
387 			KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_SIGOVF);
388 		}
389 
390 		rp->kr_config = NULL;
391 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
392 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
393 		    &(rp->kr_config), (void *)ctx)) != 0) {
394 			kcpc_free_configs(set);
395 			*subcode = ret;
396 			switch (ret) {
397 			case CPC_ATTR_REQUIRES_PRIVILEGE:
398 			case CPC_HV_NO_ACCESS:
399 				return (EACCES);
400 			default:
401 				return (EINVAL);
402 			}
403 		}
404 
405 		ctx->kc_pics[n].kp_req = rp;
406 		rp->kr_picp = &ctx->kc_pics[n];
407 		rp->kr_data = set->ks_data + rp->kr_index;
408 		*rp->kr_data = rp->kr_preset;
409 	}
410 
411 	return (0);
412 }
413 
414 void
415 kcpc_free_configs(kcpc_set_t *set)
416 {
417 	int i;
418 
419 	for (i = 0; i < set->ks_nreqs; i++)
420 		if (set->ks_req[i].kr_config != NULL)
421 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
422 }
423 
424 /*
425  * buf points to a user address and the data should be copied out to that
426  * address in the current process.
427  */
428 int
429 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
430 {
431 	kcpc_ctx_t	*ctx = set->ks_ctx;
432 	int		save_spl;
433 
434 	mutex_enter(&set->ks_lock);
435 	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
436 		mutex_exit(&set->ks_lock);
437 		return (EINVAL);
438 	}
439 	mutex_exit(&set->ks_lock);
440 
441 	/*
442 	 * Kernel preemption must be disabled while reading the hardware regs,
443 	 * and if this is a CPU-bound context, while checking the CPU binding of
444 	 * the current thread.
445 	 */
446 	kpreempt_disable();
447 	save_spl = spl_xcall();
448 
449 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
450 		splx(save_spl);
451 		kpreempt_enable();
452 		return (EAGAIN);
453 	}
454 
455 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
456 		if (ctx->kc_cpuid != -1) {
457 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
458 				splx(save_spl);
459 				kpreempt_enable();
460 				return (EAGAIN);
461 			}
462 		}
463 
464 		if (ctx->kc_thread == curthread) {
465 			uint64_t curtick = KCPC_GET_TICK();
466 
467 			ctx->kc_hrtime = gethrtime_waitfree();
468 			pcbe_ops->pcbe_sample(ctx);
469 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
470 			ctx->kc_rawtick = curtick;
471 		}
472 
473 		/*
474 		 * The config may have been invalidated by
475 		 * the pcbe_sample op.
476 		 */
477 		if (ctx->kc_flags & KCPC_CTX_INVALID) {
478 			splx(save_spl);
479 			kpreempt_enable();
480 			return (EAGAIN);
481 		}
482 
483 	}
484 
485 	splx(save_spl);
486 	kpreempt_enable();
487 
488 	if (copyout(set->ks_data, buf,
489 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
490 		return (EFAULT);
491 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
492 		return (EFAULT);
493 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
494 		return (EFAULT);
495 
496 	return (0);
497 }
498 
499 /*
500  * Stop the counters on the CPU this context is bound to.
501  */
502 static void
503 kcpc_stop_hw(kcpc_ctx_t *ctx)
504 {
505 	cpu_t *cp;
506 
507 	kpreempt_disable();
508 
509 	if (ctx->kc_cpuid == CPU->cpu_id) {
510 		cp = CPU;
511 	} else {
512 		cp = cpu_get(ctx->kc_cpuid);
513 	}
514 
515 	ASSERT(cp != NULL && cp->cpu_cpc_ctx == ctx);
516 	kcpc_cpu_stop(cp, B_FALSE);
517 
518 	kpreempt_enable();
519 }
520 
521 int
522 kcpc_unbind(kcpc_set_t *set)
523 {
524 	kcpc_ctx_t	*ctx;
525 	kthread_t	*t;
526 
527 	/*
528 	 * We could be racing with the process's agent thread as it
529 	 * binds the set; we must wait for the set to finish binding
530 	 * before attempting to tear it down.
531 	 */
532 	mutex_enter(&set->ks_lock);
533 	while ((set->ks_state & KCPC_SET_BOUND) == 0)
534 		cv_wait(&set->ks_condv, &set->ks_lock);
535 	mutex_exit(&set->ks_lock);
536 
537 	ctx = set->ks_ctx;
538 
539 	/*
540 	 * Use kc_lock to synchronize with kcpc_restore().
541 	 */
542 	mutex_enter(&ctx->kc_lock);
543 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
544 	mutex_exit(&ctx->kc_lock);
545 
546 	if (ctx->kc_cpuid == -1) {
547 		t = ctx->kc_thread;
548 		/*
549 		 * The context is thread-bound and therefore has a device
550 		 * context.  It will be freed via removectx() calling
551 		 * freectx() calling kcpc_free().
552 		 */
553 		if (t == curthread) {
554 			int save_spl;
555 
556 			kpreempt_disable();
557 			save_spl = spl_xcall();
558 			if (!(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED))
559 				kcpc_unprogram(ctx, B_TRUE);
560 			splx(save_spl);
561 			kpreempt_enable();
562 		}
563 #ifdef DEBUG
564 		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
565 		    kcpc_lwp_create, NULL, kcpc_free) == 0)
566 			panic("kcpc_unbind: context %p not preset on thread %p",
567 			    (void *)ctx, (void *)t);
568 #else
569 		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
570 		    kcpc_lwp_create, NULL, kcpc_free);
571 #endif /* DEBUG */
572 		t->t_cpc_set = NULL;
573 		t->t_cpc_ctx = NULL;
574 	} else {
575 		/*
576 		 * If we are unbinding a CPU-bound set from a remote CPU, the
577 		 * native CPU's idle thread could be in the midst of programming
578 		 * this context onto the CPU. We grab the context's lock here to
579 		 * ensure that the idle thread is done with it. When we release
580 		 * the lock, the CPU no longer has a context and the idle thread
581 		 * will move on.
582 		 *
583 		 * cpu_lock must be held to prevent the CPU from being DR'd out
584 		 * while we disassociate the context from the cpu_t.
585 		 */
586 		cpu_t *cp;
587 		mutex_enter(&cpu_lock);
588 		cp = cpu_get(ctx->kc_cpuid);
589 		if (cp != NULL) {
590 			/*
591 			 * The CPU may have been DR'd out of the system.
592 			 */
593 			mutex_enter(&cp->cpu_cpc_ctxlock);
594 			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
595 				kcpc_stop_hw(ctx);
596 			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
597 			mutex_exit(&cp->cpu_cpc_ctxlock);
598 		}
599 		mutex_exit(&cpu_lock);
600 		if (ctx->kc_thread == curthread) {
601 			kcpc_free(ctx, 0);
602 			curthread->t_cpc_set = NULL;
603 		}
604 	}
605 
606 	return (0);
607 }
608 
609 int
610 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
611 {
612 	int i;
613 
614 	ASSERT(set != NULL);
615 	ASSERT(set->ks_state & KCPC_SET_BOUND);
616 	ASSERT(set->ks_ctx->kc_thread == curthread);
617 	ASSERT(set->ks_ctx->kc_cpuid == -1);
618 
619 	if (index < 0 || index >= set->ks_nreqs)
620 		return (EINVAL);
621 
622 	for (i = 0; i < set->ks_nreqs; i++)
623 		if (set->ks_req[i].kr_index == index)
624 			break;
625 	ASSERT(i != set->ks_nreqs);
626 
627 	set->ks_req[i].kr_preset = preset;
628 	return (0);
629 }
630 
631 int
632 kcpc_restart(kcpc_set_t *set)
633 {
634 	kcpc_ctx_t	*ctx = set->ks_ctx;
635 	int		i;
636 	int		save_spl;
637 
638 	ASSERT(set->ks_state & KCPC_SET_BOUND);
639 	ASSERT(ctx->kc_thread == curthread);
640 	ASSERT(ctx->kc_cpuid == -1);
641 
642 	for (i = 0; i < set->ks_nreqs; i++) {
643 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
644 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
645 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
646 	}
647 
648 	kpreempt_disable();
649 	save_spl = spl_xcall();
650 
651 	/*
652 	 * If the user is doing this on a running set, make sure the counters
653 	 * are stopped first.
654 	 */
655 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
656 		pcbe_ops->pcbe_allstop();
657 
658 	/*
659 	 * Ask the backend to program the hardware.
660 	 */
661 	ctx->kc_rawtick = KCPC_GET_TICK();
662 	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
663 	pcbe_ops->pcbe_program(ctx);
664 	splx(save_spl);
665 	kpreempt_enable();
666 
667 	return (0);
668 }
669 
670 /*
671  * Caller must hold kcpc_cpuctx_lock.
672  */
673 int
674 kcpc_enable(kthread_t *t, int cmd, int enable)
675 {
676 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
677 	kcpc_set_t	*set = t->t_cpc_set;
678 	kcpc_set_t	*newset;
679 	int		i;
680 	int		flag;
681 	int		err;
682 
683 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
684 
685 	if (ctx == NULL) {
686 		/*
687 		 * This thread has a set but no context; it must be a
688 		 * CPU-bound set.
689 		 */
690 		ASSERT(t->t_cpc_set != NULL);
691 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
692 		return (EINVAL);
693 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
694 		return (EAGAIN);
695 
696 	if (cmd == CPC_ENABLE) {
697 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
698 			return (EINVAL);
699 		kpreempt_disable();
700 		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
701 		kcpc_restore(ctx);
702 		kpreempt_enable();
703 	} else if (cmd == CPC_DISABLE) {
704 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
705 			return (EINVAL);
706 		kpreempt_disable();
707 		kcpc_save(ctx);
708 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
709 		kpreempt_enable();
710 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
711 		/*
712 		 * Strategy for usr/sys: stop counters and update set's presets
713 		 * with current counter values, unbind, update requests with
714 		 * new config, then re-bind.
715 		 */
716 		flag = (cmd == CPC_USR_EVENTS) ?
717 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
718 
719 		kpreempt_disable();
720 		KCPC_CTX_FLAG_SET(ctx,
721 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
722 		pcbe_ops->pcbe_allstop();
723 		kpreempt_enable();
724 
725 		for (i = 0; i < set->ks_nreqs; i++) {
726 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
727 			if (enable)
728 				set->ks_req[i].kr_flags |= flag;
729 			else
730 				set->ks_req[i].kr_flags &= ~flag;
731 		}
732 		newset = kcpc_dup_set(set);
733 		if (kcpc_unbind(set) != 0)
734 			return (EINVAL);
735 		t->t_cpc_set = newset;
736 		if (kcpc_bind_thread(newset, t, &err) != 0) {
737 			t->t_cpc_set = NULL;
738 			kcpc_free_set(newset);
739 			return (EINVAL);
740 		}
741 	} else
742 		return (EINVAL);
743 
744 	return (0);
745 }
746 
747 /*
748  * Provide PCBEs with a way of obtaining the configs of every counter which will
749  * be programmed together.
750  *
751  * If current is NULL, provide the first config.
752  *
753  * If data != NULL, caller wants to know where the data store associated with
754  * the config we return is located.
755  */
756 void *
757 kcpc_next_config(void *token, void *current, uint64_t **data)
758 {
759 	int		i;
760 	kcpc_pic_t	*pic;
761 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
762 
763 	if (current == NULL) {
764 		/*
765 		 * Client would like the first config, which may not be in
766 		 * counter 0; we need to search through the counters for the
767 		 * first config.
768 		 */
769 		for (i = 0; i < cpc_ncounters; i++)
770 			if (ctx->kc_pics[i].kp_req != NULL)
771 				break;
772 		/*
773 		 * There are no counters configured for the given context.
774 		 */
775 		if (i == cpc_ncounters)
776 			return (NULL);
777 	} else {
778 		/*
779 		 * There surely is a faster way to do this.
780 		 */
781 		for (i = 0; i < cpc_ncounters; i++) {
782 			pic = &ctx->kc_pics[i];
783 
784 			if (pic->kp_req != NULL &&
785 			    current == pic->kp_req->kr_config)
786 				break;
787 		}
788 
789 		/*
790 		 * We found the current config at picnum i. Now search for the
791 		 * next configured PIC.
792 		 */
793 		for (i++; i < cpc_ncounters; i++) {
794 			pic = &ctx->kc_pics[i];
795 			if (pic->kp_req != NULL)
796 				break;
797 		}
798 
799 		if (i == cpc_ncounters)
800 			return (NULL);
801 	}
802 
803 	if (data != NULL) {
804 		*data = ctx->kc_pics[i].kp_req->kr_data;
805 	}
806 
807 	return (ctx->kc_pics[i].kp_req->kr_config);
808 }
809 
810 
811 kcpc_ctx_t *
812 kcpc_ctx_alloc(int kmem_flags)
813 {
814 	kcpc_ctx_t	*ctx;
815 	long		hash;
816 
817 	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), kmem_flags);
818 	if (ctx == NULL)
819 		return (NULL);
820 
821 	hash = CPC_HASH_CTX(ctx);
822 	mutex_enter(&kcpc_ctx_llock[hash]);
823 	ctx->kc_next = kcpc_ctx_list[hash];
824 	kcpc_ctx_list[hash] = ctx;
825 	mutex_exit(&kcpc_ctx_llock[hash]);
826 
827 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
828 	    cpc_ncounters, KM_SLEEP);
829 
830 	ctx->kc_cpuid = -1;
831 
832 	return (ctx);
833 }
834 
835 /*
836  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
837  * in the flags.
838  */
839 static void
840 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
841 {
842 	kcpc_set_t	*ks = ctx->kc_set, *cks;
843 	int		i, j;
844 	int		code;
845 
846 	ASSERT(ks != NULL);
847 
848 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
849 		return;
850 
851 	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
852 	cks->ks_state &= ~KCPC_SET_BOUND;
853 	cctx->kc_set = cks;
854 	cks->ks_flags = ks->ks_flags;
855 	cks->ks_nreqs = ks->ks_nreqs;
856 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
857 	    sizeof (kcpc_request_t), KM_SLEEP);
858 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
859 	    KM_SLEEP);
860 	cks->ks_ctx = cctx;
861 
862 	for (i = 0; i < cks->ks_nreqs; i++) {
863 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
864 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
865 		(void) strncpy(cks->ks_req[i].kr_event,
866 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
867 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
868 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
869 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
870 		if (ks->ks_req[i].kr_nattrs > 0) {
871 			cks->ks_req[i].kr_attr =
872 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
873 			    sizeof (kcpc_attr_t), KM_SLEEP);
874 		}
875 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
876 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
877 			    ks->ks_req[i].kr_attr[j].ka_name,
878 			    CPC_MAX_ATTR_LEN);
879 			cks->ks_req[i].kr_attr[j].ka_val =
880 			    ks->ks_req[i].kr_attr[j].ka_val;
881 		}
882 	}
883 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
884 		kcpc_invalidate_config(cctx);
885 
886 	mutex_enter(&cks->ks_lock);
887 	cks->ks_state |= KCPC_SET_BOUND;
888 	cv_signal(&cks->ks_condv);
889 	mutex_exit(&cks->ks_lock);
890 }
891 
892 
893 void
894 kcpc_ctx_free(kcpc_ctx_t *ctx)
895 {
896 	kcpc_ctx_t	**loc;
897 	long		hash = CPC_HASH_CTX(ctx);
898 
899 	mutex_enter(&kcpc_ctx_llock[hash]);
900 	loc = &kcpc_ctx_list[hash];
901 	ASSERT(*loc != NULL);
902 	while (*loc != ctx)
903 		loc = &(*loc)->kc_next;
904 	*loc = ctx->kc_next;
905 	mutex_exit(&kcpc_ctx_llock[hash]);
906 
907 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
908 	cv_destroy(&ctx->kc_condv);
909 	mutex_destroy(&ctx->kc_lock);
910 	kmem_free(ctx, sizeof (*ctx));
911 }
912 
913 /*
914  * Generic interrupt handler used on hardware that generates
915  * overflow interrupts.
916  *
917  * Note: executed at high-level interrupt context!
918  */
919 /*ARGSUSED*/
920 kcpc_ctx_t *
921 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
922 {
923 	kcpc_ctx_t	*ctx;
924 	kthread_t	*t = curthread;
925 	int		i;
926 
927 	/*
928 	 * On both x86 and UltraSPARC, we may deliver the high-level
929 	 * interrupt in kernel mode, just after we've started to run an
930 	 * interrupt thread.  (That's because the hardware helpfully
931 	 * delivers the overflow interrupt some random number of cycles
932 	 * after the instruction that caused the overflow by which time
933 	 * we're in some part of the kernel, not necessarily running on
934 	 * the right thread).
935 	 *
936 	 * Check for this case here -- find the pinned thread
937 	 * that was running when the interrupt went off.
938 	 */
939 	if (t->t_flag & T_INTR_THREAD) {
940 		klwp_t *lwp;
941 
942 		atomic_inc_32(&kcpc_intrctx_count);
943 
944 		/*
945 		 * Note that t_lwp is always set to point at the underlying
946 		 * thread, thus this will work in the presence of nested
947 		 * interrupts.
948 		 */
949 		ctx = NULL;
950 		if ((lwp = t->t_lwp) != NULL) {
951 			t = lwptot(lwp);
952 			ctx = t->t_cpc_ctx;
953 		}
954 	} else
955 		ctx = t->t_cpc_ctx;
956 
957 	if (ctx == NULL) {
958 		/*
959 		 * This can easily happen if we're using the counters in
960 		 * "shared" mode, for example, and an overflow interrupt
961 		 * occurs while we are running cpustat.  In that case, the
962 		 * bound thread that has the context that belongs to this
963 		 * CPU is almost certainly sleeping (if it was running on
964 		 * the CPU we'd have found it above), and the actual
965 		 * interrupted thread has no knowledge of performance counters!
966 		 */
967 		ctx = curthread->t_cpu->cpu_cpc_ctx;
968 		if (ctx != NULL) {
969 			/*
970 			 * Return the bound context for this CPU to
971 			 * the interrupt handler so that it can synchronously
972 			 * sample the hardware counters and restart them.
973 			 */
974 			return (ctx);
975 		}
976 
977 		/*
978 		 * As long as the overflow interrupt really is delivered early
979 		 * enough after trapping into the kernel to avoid switching
980 		 * threads, we must always be able to find the cpc context,
981 		 * or something went terribly wrong i.e. we ended up
982 		 * running a passivated interrupt thread, a kernel
983 		 * thread or we interrupted idle, all of which are Very Bad.
984 		 *
985 		 * We also could end up here owing to an incredibly unlikely
986 		 * race condition that exists on x86 based architectures when
987 		 * the cpc provider is in use; overflow interrupts are directed
988 		 * to the cpc provider if the 'dtrace_cpc_in_use' variable is
989 		 * set when we enter the handler. This variable is unset after
990 		 * overflow interrupts have been disabled on all CPUs and all
991 		 * contexts have been torn down. To stop interrupts, the cpc
992 		 * provider issues a xcall to the remote CPU before it tears
993 		 * down that CPUs context. As high priority xcalls, on an x86
994 		 * architecture, execute at a higher PIL than this handler, it
995 		 * is possible (though extremely unlikely) that the xcall could
996 		 * interrupt the overflow handler before the handler has
997 		 * checked the 'dtrace_cpc_in_use' variable, stop the counters,
998 		 * return to the cpc provider which could then rip down
999 		 * contexts and unset 'dtrace_cpc_in_use' *before* the CPUs
1000 		 * overflow handler has had a chance to check the variable. In
1001 		 * that case, the handler would direct the overflow into this
1002 		 * code and no valid context will be found. The default behavior
1003 		 * when no valid context is found is now to shout a warning to
1004 		 * the console and bump the 'kcpc_nullctx_count' variable.
1005 		 */
1006 		if (kcpc_nullctx_panic)
1007 			panic("null cpc context, thread %p", (void *)t);
1008 #ifdef DEBUG
1009 		cmn_err(CE_NOTE,
1010 		    "null cpc context found in overflow handler!\n");
1011 #endif
1012 		atomic_inc_32(&kcpc_nullctx_count);
1013 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
1014 		/*
1015 		 * Schedule an ast to sample the counters, which will
1016 		 * propagate any overflow into the virtualized performance
1017 		 * counter(s), and may deliver a signal.
1018 		 */
1019 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1020 		/*
1021 		 * If a counter has overflowed which was counting on behalf of
1022 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
1023 		 * process a signal.
1024 		 */
1025 		for (i = 0; i < cpc_ncounters; i++) {
1026 			if (ctx->kc_pics[i].kp_req != NULL &&
1027 			    bitmap & (1 << i) &&
1028 			    ctx->kc_pics[i].kp_req->kr_flags &
1029 			    CPC_OVF_NOTIFY_EMT) {
1030 				/*
1031 				 * A signal has been requested for this PIC, so
1032 				 * so freeze the context. The interrupt handler
1033 				 * has already stopped the counter hardware.
1034 				 */
1035 				KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
1036 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
1037 				    KCPC_PIC_OVERFLOWED);
1038 			}
1039 		}
1040 		aston(t);
1041 	} else if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
1042 		/*
1043 		 * Thread context is no longer valid, but here may be a valid
1044 		 * CPU context.
1045 		 */
1046 		return (curthread->t_cpu->cpu_cpc_ctx);
1047 	}
1048 
1049 	return (NULL);
1050 }
1051 
1052 /*
1053  * The current thread context had an overflow interrupt; we're
1054  * executing here in high-level interrupt context.
1055  */
1056 /*ARGSUSED*/
1057 uint_t
1058 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
1059 {
1060 	kcpc_ctx_t *ctx;
1061 	uint64_t bitmap;
1062 	uint8_t *state;
1063 	int	save_spl;
1064 
1065 	if (pcbe_ops == NULL ||
1066 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
1067 		return (DDI_INTR_UNCLAIMED);
1068 
1069 	/*
1070 	 * Prevent any further interrupts.
1071 	 */
1072 	pcbe_ops->pcbe_allstop();
1073 
1074 	if (dtrace_cpc_in_use) {
1075 		state = &cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state;
1076 
1077 		/*
1078 		 * Set the per-CPU state bit to indicate that we are currently
1079 		 * processing an interrupt if it is currently free. Drop the
1080 		 * interrupt if the state isn't free (i.e. a configuration
1081 		 * event is taking place).
1082 		 */
1083 		if (atomic_cas_8(state, DCPC_INTR_FREE,
1084 		    DCPC_INTR_PROCESSING) == DCPC_INTR_FREE) {
1085 			int i;
1086 			kcpc_request_t req;
1087 
1088 			ASSERT(dtrace_cpc_fire != NULL);
1089 
1090 			(*dtrace_cpc_fire)(bitmap);
1091 
1092 			ctx = curthread->t_cpu->cpu_cpc_ctx;
1093 			if (ctx == NULL) {
1094 #ifdef DEBUG
1095 				cmn_err(CE_NOTE, "null cpc context in"
1096 				    "hardware overflow handler!\n");
1097 #endif
1098 				return (DDI_INTR_CLAIMED);
1099 			}
1100 
1101 			/* Reset any counters that have overflowed */
1102 			for (i = 0; i < ctx->kc_set->ks_nreqs; i++) {
1103 				req = ctx->kc_set->ks_req[i];
1104 
1105 				if (bitmap & (1 << req.kr_picnum)) {
1106 					pcbe_ops->pcbe_configure(req.kr_picnum,
1107 					    req.kr_event, req.kr_preset,
1108 					    req.kr_flags, req.kr_nattrs,
1109 					    req.kr_attr, &(req.kr_config),
1110 					    (void *)ctx);
1111 				}
1112 			}
1113 			pcbe_ops->pcbe_program(ctx);
1114 
1115 			/*
1116 			 * We've finished processing the interrupt so set
1117 			 * the state back to free.
1118 			 */
1119 			cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state =
1120 			    DCPC_INTR_FREE;
1121 			membar_producer();
1122 		}
1123 		return (DDI_INTR_CLAIMED);
1124 	}
1125 
1126 	/*
1127 	 * DTrace isn't involved so pass on accordingly.
1128 	 *
1129 	 * If the interrupt has occurred in the context of an lwp owning
1130 	 * the counters, then the handler posts an AST to the lwp to
1131 	 * trigger the actual sampling, and optionally deliver a signal or
1132 	 * restart the counters, on the way out of the kernel using
1133 	 * kcpc_hw_overflow_ast() (see below).
1134 	 *
1135 	 * On the other hand, if the handler returns the context to us
1136 	 * directly, then it means that there are no other threads in
1137 	 * the middle of updating it, no AST has been posted, and so we
1138 	 * should sample the counters here, and restart them with no
1139 	 * further fuss.
1140 	 *
1141 	 * The CPU's CPC context may disappear as a result of cross-call which
1142 	 * has higher PIL on x86, so protect the context by raising PIL to the
1143 	 * cross-call level.
1144 	 */
1145 	save_spl = spl_xcall();
1146 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
1147 		uint64_t curtick = KCPC_GET_TICK();
1148 
1149 		ctx->kc_hrtime = gethrtime_waitfree();
1150 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
1151 		ctx->kc_rawtick = curtick;
1152 		pcbe_ops->pcbe_sample(ctx);
1153 		pcbe_ops->pcbe_program(ctx);
1154 	}
1155 	splx(save_spl);
1156 
1157 	return (DDI_INTR_CLAIMED);
1158 }
1159 
1160 /*
1161  * Called from trap() when processing the ast posted by the high-level
1162  * interrupt handler.
1163  */
1164 int
1165 kcpc_overflow_ast()
1166 {
1167 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
1168 	int		i;
1169 	int		found = 0;
1170 	uint64_t	curtick = KCPC_GET_TICK();
1171 
1172 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
1173 
1174 	/*
1175 	 * An overflow happened: sample the context to ensure that
1176 	 * the overflow is propagated into the upper bits of the
1177 	 * virtualized 64-bit counter(s).
1178 	 */
1179 	kpreempt_disable();
1180 	ctx->kc_hrtime = gethrtime_waitfree();
1181 	pcbe_ops->pcbe_sample(ctx);
1182 	kpreempt_enable();
1183 
1184 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
1185 
1186 	/*
1187 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
1188 	 * if that pic generated an overflow and if the request it was counting
1189 	 * on behalf of had CPC_OVERFLOW_REQUEST specified. We go through all
1190 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
1191 	 * found any overflowed pics, keep the context frozen and return true
1192 	 * (thus causing a signal to be sent).
1193 	 */
1194 	for (i = 0; i < cpc_ncounters; i++) {
1195 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
1196 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
1197 			    ~KCPC_PIC_OVERFLOWED);
1198 			found = 1;
1199 		}
1200 	}
1201 	if (found)
1202 		return (1);
1203 
1204 	/*
1205 	 * Otherwise, re-enable the counters and continue life as before.
1206 	 */
1207 	kpreempt_disable();
1208 	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
1209 	pcbe_ops->pcbe_program(ctx);
1210 	kpreempt_enable();
1211 	return (0);
1212 }
1213 
1214 /*
1215  * Called when switching away from current thread.
1216  */
1217 static void
1218 kcpc_save(kcpc_ctx_t *ctx)
1219 {
1220 	int err;
1221 	int save_spl;
1222 
1223 	kpreempt_disable();
1224 	save_spl = spl_xcall();
1225 
1226 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1227 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
1228 			splx(save_spl);
1229 			kpreempt_enable();
1230 			return;
1231 		}
1232 		/*
1233 		 * This context has been invalidated but the counters have not
1234 		 * been stopped. Stop them here and mark the context stopped.
1235 		 */
1236 		kcpc_unprogram(ctx, B_TRUE);
1237 		splx(save_spl);
1238 		kpreempt_enable();
1239 		return;
1240 	}
1241 
1242 	pcbe_ops->pcbe_allstop();
1243 	if (ctx->kc_flags & KCPC_CTX_FREEZE) {
1244 		splx(save_spl);
1245 		kpreempt_enable();
1246 		return;
1247 	}
1248 
1249 	/*
1250 	 * Need to sample for all reqs into each req's current mpic.
1251 	 */
1252 	ctx->kc_hrtime = gethrtime_waitfree();
1253 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
1254 	pcbe_ops->pcbe_sample(ctx);
1255 
1256 	/*
1257 	 * Program counter for measuring capacity and utilization since user
1258 	 * thread isn't using counter anymore
1259 	 */
1260 	ASSERT(ctx->kc_cpuid == -1);
1261 	cu_cpc_program(CPU, &err);
1262 	splx(save_spl);
1263 	kpreempt_enable();
1264 }
1265 
1266 static void
1267 kcpc_restore(kcpc_ctx_t *ctx)
1268 {
1269 	int save_spl;
1270 
1271 	mutex_enter(&ctx->kc_lock);
1272 
1273 	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
1274 	    KCPC_CTX_INVALID) {
1275 		/*
1276 		 * The context is invalidated but has not been marked stopped.
1277 		 * We mark it as such here because we will not start the
1278 		 * counters during this context switch.
1279 		 */
1280 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);
1281 	}
1282 
1283 	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
1284 		mutex_exit(&ctx->kc_lock);
1285 		return;
1286 	}
1287 
1288 	/*
1289 	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
1290 	 * ctx & set related memory objects being freed without us knowing.
1291 	 * This can happen if an agent thread is executing a kcpc_unbind(),
1292 	 * with this thread as the target, whilst we're concurrently doing a
1293 	 * restorectx() during, for example, a proc_exit().  Effectively, by
1294 	 * doing this, we're asking kcpc_free() to cv_wait() until
1295 	 * kcpc_restore() has completed.
1296 	 */
1297 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_RESTORE);
1298 	mutex_exit(&ctx->kc_lock);
1299 
1300 	/*
1301 	 * While programming the hardware, the counters should be stopped. We
1302 	 * don't do an explicit pcbe_allstop() here because they should have
1303 	 * been stopped already by the last consumer.
1304 	 */
1305 	kpreempt_disable();
1306 	save_spl = spl_xcall();
1307 	kcpc_program(ctx, B_TRUE, B_TRUE);
1308 	splx(save_spl);
1309 	kpreempt_enable();
1310 
1311 	/*
1312 	 * Wake the agent thread if it's waiting in kcpc_free().
1313 	 */
1314 	mutex_enter(&ctx->kc_lock);
1315 	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_RESTORE);
1316 	cv_signal(&ctx->kc_condv);
1317 	mutex_exit(&ctx->kc_lock);
1318 }
1319 
1320 /*
1321  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the the
1322  * following context operators to the idle thread on each CPU. They stop the
1323  * counters when the idle thread is switched on, and they start them again when
1324  * it is switched off.
1325  */
1326 /*ARGSUSED*/
1327 void
1328 kcpc_idle_save(struct cpu *cp)
1329 {
1330 	/*
1331 	 * The idle thread shouldn't be run anywhere else.
1332 	 */
1333 	ASSERT(CPU == cp);
1334 
1335 	/*
1336 	 * We must hold the CPU's context lock to ensure the context isn't freed
1337 	 * while we're looking at it.
1338 	 */
1339 	mutex_enter(&cp->cpu_cpc_ctxlock);
1340 
1341 	if ((cp->cpu_cpc_ctx == NULL) ||
1342 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1343 		mutex_exit(&cp->cpu_cpc_ctxlock);
1344 		return;
1345 	}
1346 
1347 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
1348 	mutex_exit(&cp->cpu_cpc_ctxlock);
1349 }
1350 
1351 void
1352 kcpc_idle_restore(struct cpu *cp)
1353 {
1354 	/*
1355 	 * The idle thread shouldn't be run anywhere else.
1356 	 */
1357 	ASSERT(CPU == cp);
1358 
1359 	/*
1360 	 * We must hold the CPU's context lock to ensure the context isn't freed
1361 	 * while we're looking at it.
1362 	 */
1363 	mutex_enter(&cp->cpu_cpc_ctxlock);
1364 
1365 	if ((cp->cpu_cpc_ctx == NULL) ||
1366 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1367 		mutex_exit(&cp->cpu_cpc_ctxlock);
1368 		return;
1369 	}
1370 
1371 	pcbe_ops->pcbe_allstop();
1372 	mutex_exit(&cp->cpu_cpc_ctxlock);
1373 }
1374 
1375 /*ARGSUSED*/
1376 static void
1377 kcpc_lwp_create(kthread_t *t, kthread_t *ct)
1378 {
1379 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
1380 	int		i;
1381 
1382 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
1383 		return;
1384 
1385 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
1386 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1387 		rw_exit(&kcpc_cpuctx_lock);
1388 		return;
1389 	}
1390 	cctx = kcpc_ctx_alloc(KM_SLEEP);
1391 	kcpc_ctx_clone(ctx, cctx);
1392 	rw_exit(&kcpc_cpuctx_lock);
1393 
1394 	/*
1395 	 * Copy the parent context's kc_flags field, but don't overwrite
1396 	 * the child's in case it was modified during kcpc_ctx_clone.
1397 	 */
1398 	KCPC_CTX_FLAG_SET(cctx,  ctx->kc_flags);
1399 	cctx->kc_thread = ct;
1400 	cctx->kc_cpuid = -1;
1401 	ct->t_cpc_set = cctx->kc_set;
1402 	ct->t_cpc_ctx = cctx;
1403 
1404 	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
1405 		kcpc_set_t *ks = cctx->kc_set;
1406 		/*
1407 		 * Our contract with the user requires us to immediately send an
1408 		 * overflow signal to all children if we have the LWPINHERIT
1409 		 * and SIGOVF flags set. In addition, all counters should be
1410 		 * set to UINT64_MAX, and their pic's overflow flag turned on
1411 		 * so that our trap() processing knows to send a signal.
1412 		 */
1413 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
1414 		for (i = 0; i < ks->ks_nreqs; i++) {
1415 			kcpc_request_t *kr = &ks->ks_req[i];
1416 
1417 			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
1418 				*(kr->kr_data) = UINT64_MAX;
1419 				atomic_or_uint(&kr->kr_picp->kp_flags,
1420 				    KCPC_PIC_OVERFLOWED);
1421 			}
1422 		}
1423 		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1424 		aston(ct);
1425 	}
1426 
1427 	installctx(ct, cctx, kcpc_save, kcpc_restore,
1428 	    NULL, kcpc_lwp_create, NULL, kcpc_free, NULL);
1429 }
1430 
1431 /*
1432  * Counter Stoppage Theory
1433  *
1434  * The counters may need to be stopped properly at the following occasions:
1435  *
1436  * 1) An LWP exits.
1437  * 2) A thread exits.
1438  * 3) An LWP performs an exec().
1439  * 4) A bound set is unbound.
1440  *
1441  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
1442  * to be freed as well.
1443  *
1444  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
1445  * when the thread is freed, kcpc_free(), called by freectx(), frees the
1446  * context.
1447  *
1448  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
1449  *
1450  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
1451  * been called from exec. It stops the counters _and_ frees the context.
1452  *
1453  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
1454  *
1455  * CPU-bound counters are always stopped via kcpc_unbind().
1456  */
1457 
1458 /*
1459  * We're being called to delete the context; we ensure that all associated data
1460  * structures are freed, and that the hardware is passivated if this is an exec.
1461  */
1462 
1463 /*ARGSUSED*/
1464 void
1465 kcpc_free(kcpc_ctx_t *ctx, int isexec)
1466 {
1467 	int		i;
1468 	kcpc_set_t	*set = ctx->kc_set;
1469 
1470 	ASSERT(set != NULL);
1471 
1472 	/*
1473 	 * Wait for kcpc_restore() to finish before we tear things down.
1474 	 */
1475 	mutex_enter(&ctx->kc_lock);
1476 	while (ctx->kc_flags & KCPC_CTX_RESTORE)
1477 		cv_wait(&ctx->kc_condv, &ctx->kc_lock);
1478 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1479 	mutex_exit(&ctx->kc_lock);
1480 
1481 	if (isexec) {
1482 		/*
1483 		 * This thread is execing, and after the exec it should not have
1484 		 * any performance counter context. Stop the counters properly
1485 		 * here so the system isn't surprised by an overflow interrupt
1486 		 * later.
1487 		 */
1488 		if (ctx->kc_cpuid != -1) {
1489 			cpu_t *cp;
1490 			/*
1491 			 * CPU-bound context; stop the appropriate CPU's ctrs.
1492 			 * Hold cpu_lock while examining the CPU to ensure it
1493 			 * doesn't go away.
1494 			 */
1495 			mutex_enter(&cpu_lock);
1496 			cp = cpu_get(ctx->kc_cpuid);
1497 			/*
1498 			 * The CPU could have been DR'd out, so only stop the
1499 			 * CPU and clear its context pointer if the CPU still
1500 			 * exists.
1501 			 */
1502 			if (cp != NULL) {
1503 				mutex_enter(&cp->cpu_cpc_ctxlock);
1504 				kcpc_stop_hw(ctx);
1505 				mutex_exit(&cp->cpu_cpc_ctxlock);
1506 			}
1507 			mutex_exit(&cpu_lock);
1508 			ASSERT(curthread->t_cpc_ctx == NULL);
1509 		} else {
1510 			int save_spl;
1511 
1512 			/*
1513 			 * Thread-bound context; stop _this_ CPU's counters.
1514 			 */
1515 			kpreempt_disable();
1516 			save_spl = spl_xcall();
1517 			kcpc_unprogram(ctx, B_TRUE);
1518 			curthread->t_cpc_ctx = NULL;
1519 			splx(save_spl);
1520 			kpreempt_enable();
1521 		}
1522 
1523 		/*
1524 		 * Since we are being called from an exec and we know that
1525 		 * exec is not permitted via the agent thread, we should clean
1526 		 * up this thread's CPC state completely, and not leave dangling
1527 		 * CPC pointers behind.
1528 		 */
1529 		ASSERT(ctx->kc_thread == curthread);
1530 		curthread->t_cpc_set = NULL;
1531 	}
1532 
1533 	/*
1534 	 * Walk through each request in this context's set and free the PCBE's
1535 	 * configuration if it exists.
1536 	 */
1537 	for (i = 0; i < set->ks_nreqs; i++) {
1538 		if (set->ks_req[i].kr_config != NULL)
1539 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
1540 	}
1541 
1542 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
1543 	kcpc_ctx_free(ctx);
1544 	kcpc_free_set(set);
1545 }
1546 
1547 /*
1548  * Free the memory associated with a request set.
1549  */
1550 void
1551 kcpc_free_set(kcpc_set_t *set)
1552 {
1553 	int		i;
1554 	kcpc_request_t	*req;
1555 
1556 	ASSERT(set->ks_req != NULL);
1557 
1558 	for (i = 0; i < set->ks_nreqs; i++) {
1559 		req = &set->ks_req[i];
1560 
1561 		if (req->kr_nattrs != 0) {
1562 			kmem_free(req->kr_attr,
1563 			    req->kr_nattrs * sizeof (kcpc_attr_t));
1564 		}
1565 	}
1566 
1567 	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1568 	cv_destroy(&set->ks_condv);
1569 	mutex_destroy(&set->ks_lock);
1570 	kmem_free(set, sizeof (kcpc_set_t));
1571 }
1572 
1573 /*
1574  * Grab every existing context and mark it as invalid.
1575  */
1576 void
1577 kcpc_invalidate_all(void)
1578 {
1579 	kcpc_ctx_t *ctx;
1580 	long hash;
1581 
1582 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1583 		mutex_enter(&kcpc_ctx_llock[hash]);
1584 		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1585 			KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1586 		mutex_exit(&kcpc_ctx_llock[hash]);
1587 	}
1588 }
1589 
1590 /*
1591  * Interface for PCBEs to signal that an existing configuration has suddenly
1592  * become invalid.
1593  */
1594 void
1595 kcpc_invalidate_config(void *token)
1596 {
1597 	kcpc_ctx_t *ctx = token;
1598 
1599 	ASSERT(ctx != NULL);
1600 
1601 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1602 }
1603 
1604 /*
1605  * Called from lwp_exit() and thread_exit()
1606  */
1607 void
1608 kcpc_passivate(void)
1609 {
1610 	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
1611 	kcpc_set_t *set = curthread->t_cpc_set;
1612 	int	save_spl;
1613 
1614 	if (set == NULL)
1615 		return;
1616 
1617 	if (ctx == NULL) {
1618 		/*
1619 		 * This thread has a set but no context; it must be a CPU-bound
1620 		 * set. The hardware will be stopped via kcpc_unbind() when the
1621 		 * process exits and closes its file descriptors with
1622 		 * kcpc_close(). Our only job here is to clean up this thread's
1623 		 * state; the set will be freed with the unbind().
1624 		 */
1625 		(void) kcpc_unbind(set);
1626 		/*
1627 		 * Unbinding a set belonging to the current thread should clear
1628 		 * its set pointer.
1629 		 */
1630 		ASSERT(curthread->t_cpc_set == NULL);
1631 		return;
1632 	}
1633 
1634 	kpreempt_disable();
1635 	save_spl = spl_xcall();
1636 	curthread->t_cpc_set = NULL;
1637 
1638 	/*
1639 	 * This thread/LWP is exiting but context switches will continue to
1640 	 * happen for a bit as the exit proceeds.  Kernel preemption must be
1641 	 * disabled here to prevent a race between checking or setting the
1642 	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
1643 	 * a context switch.
1644 	 */
1645 	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
1646 		kcpc_unprogram(ctx, B_TRUE);
1647 		KCPC_CTX_FLAG_SET(ctx,
1648 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
1649 	}
1650 
1651 	/*
1652 	 * We're cleaning up after this thread; ensure there are no dangling
1653 	 * CPC pointers left behind. The context and set will be freed by
1654 	 * freectx().
1655 	 */
1656 	curthread->t_cpc_ctx = NULL;
1657 
1658 	splx(save_spl);
1659 	kpreempt_enable();
1660 }
1661 
1662 /*
1663  * Assign the requests in the given set to the PICs in the context.
1664  * Returns 0 if successful, -1 on failure.
1665  */
1666 /*ARGSUSED*/
1667 int
1668 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1669 {
1670 	int i;
1671 	int *picnum_save;
1672 
1673 	ASSERT(set->ks_nreqs <= cpc_ncounters);
1674 
1675 	/*
1676 	 * Provide kcpc_tryassign() with scratch space to avoid doing an
1677 	 * alloc/free with every invocation.
1678 	 */
1679 	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1680 	/*
1681 	 * kcpc_tryassign() blindly walks through each request in the set,
1682 	 * seeing if a counter can count its event. If yes, it assigns that
1683 	 * counter. However, that counter may have been the only capable counter
1684 	 * for _another_ request's event. The solution is to try every possible
1685 	 * request first. Note that this does not cover all solutions, as
1686 	 * that would require all unique orderings of requests, an n^n operation
1687 	 * which would be unacceptable for architectures with many counters.
1688 	 */
1689 	for (i = 0; i < set->ks_nreqs; i++)
1690 		if (kcpc_tryassign(set, i, picnum_save) == 0)
1691 			break;
1692 
1693 	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1694 	if (i == set->ks_nreqs)
1695 		return (-1);
1696 	return (0);
1697 }
1698 
1699 static int
1700 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1701 {
1702 	int		i;
1703 	int		j;
1704 	uint64_t	bitmap = 0, resmap = 0;
1705 	uint64_t	ctrmap;
1706 
1707 	/*
1708 	 * We are attempting to assign the reqs to pics, but we may fail. If we
1709 	 * fail, we need to restore the state of the requests to what it was
1710 	 * when we found it, as some reqs may have been explicitly assigned to
1711 	 * a specific PIC beforehand. We do this by snapshotting the assignments
1712 	 * now and restoring from it later if we fail.
1713 	 *
1714 	 * Also we note here which counters have already been claimed by
1715 	 * requests with explicit counter assignments.
1716 	 */
1717 	for (i = 0; i < set->ks_nreqs; i++) {
1718 		scratch[i] = set->ks_req[i].kr_picnum;
1719 		if (set->ks_req[i].kr_picnum != -1)
1720 			resmap |= (1 << set->ks_req[i].kr_picnum);
1721 	}
1722 
1723 	/*
1724 	 * Walk through requests assigning them to the first PIC that is
1725 	 * capable.
1726 	 */
1727 	i = starting_req;
1728 	do {
1729 		if (set->ks_req[i].kr_picnum != -1) {
1730 			ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
1731 			bitmap |= (1 << set->ks_req[i].kr_picnum);
1732 			if (++i == set->ks_nreqs)
1733 				i = 0;
1734 			continue;
1735 		}
1736 
1737 		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1738 		for (j = 0; j < cpc_ncounters; j++) {
1739 			if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
1740 			    (resmap & (1 << j)) == 0) {
1741 				/*
1742 				 * We can assign this counter because:
1743 				 *
1744 				 * 1. It can count the event (ctrmap)
1745 				 * 2. It hasn't been assigned yet (bitmap)
1746 				 * 3. It wasn't reserved by a request (resmap)
1747 				 */
1748 				bitmap |= (1 << j);
1749 				break;
1750 			}
1751 		}
1752 		if (j == cpc_ncounters) {
1753 			for (i = 0; i < set->ks_nreqs; i++)
1754 				set->ks_req[i].kr_picnum = scratch[i];
1755 			return (-1);
1756 		}
1757 		set->ks_req[i].kr_picnum = j;
1758 
1759 		if (++i == set->ks_nreqs)
1760 			i = 0;
1761 	} while (i != starting_req);
1762 
1763 	return (0);
1764 }
1765 
1766 kcpc_set_t *
1767 kcpc_dup_set(kcpc_set_t *set)
1768 {
1769 	kcpc_set_t	*new;
1770 	int		i;
1771 	int		j;
1772 
1773 	new = kmem_zalloc(sizeof (*new), KM_SLEEP);
1774 	new->ks_state &= ~KCPC_SET_BOUND;
1775 	new->ks_flags = set->ks_flags;
1776 	new->ks_nreqs = set->ks_nreqs;
1777 	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1778 	    KM_SLEEP);
1779 	new->ks_data = NULL;
1780 	new->ks_ctx = NULL;
1781 
1782 	for (i = 0; i < new->ks_nreqs; i++) {
1783 		new->ks_req[i].kr_config = NULL;
1784 		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1785 		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1786 		new->ks_req[i].kr_picp = NULL;
1787 		new->ks_req[i].kr_data = NULL;
1788 		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1789 		    CPC_MAX_EVENT_LEN);
1790 		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1791 		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1792 		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1793 		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1794 		    sizeof (kcpc_attr_t), KM_SLEEP);
1795 		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1796 			new->ks_req[i].kr_attr[j].ka_val =
1797 			    set->ks_req[i].kr_attr[j].ka_val;
1798 			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1799 			    set->ks_req[i].kr_attr[j].ka_name,
1800 			    CPC_MAX_ATTR_LEN);
1801 		}
1802 	}
1803 
1804 	return (new);
1805 }
1806 
1807 int
1808 kcpc_allow_nonpriv(void *token)
1809 {
1810 	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
1811 }
1812 
1813 void
1814 kcpc_invalidate(kthread_t *t)
1815 {
1816 	kcpc_ctx_t *ctx = t->t_cpc_ctx;
1817 
1818 	if (ctx != NULL)
1819 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1820 }
1821 
1822 /*
1823  * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
1824  * are used to construct PCBE names, starting with the most specific,
1825  * "pcbe.first.second.third.fourth" and ending with the least specific,
1826  * "pcbe.first".
1827  *
1828  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1829  */
1830 int
1831 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1832 {
1833 	uint_t s[3];
1834 
1835 	s[0] = first;
1836 	s[1] = second;
1837 	s[2] = third;
1838 
1839 	return (modload_qualified("pcbe",
1840 	    "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0);
1841 }
1842 
1843 /*
1844  * Create one or more CPC context for given CPU with specified counter event
1845  * requests
1846  *
1847  * If number of requested counter events is less than or equal number of
1848  * hardware counters on a CPU and can all be assigned to the counters on a CPU
1849  * at the same time, then make one CPC context.
1850  *
1851  * Otherwise, multiple CPC contexts are created to allow multiplexing more
1852  * counter events than existing counters onto the counters by iterating through
1853  * all of the CPC contexts, programming the counters with each CPC context one
1854  * at a time and measuring the resulting counter values.  Each of the resulting
1855  * CPC contexts contains some number of requested counter events less than or
1856  * equal the number of counters on a CPU depending on whether all the counter
1857  * events can be programmed on all the counters at the same time or not.
1858  *
1859  * Flags to kmem_{,z}alloc() are passed in as an argument to allow specifying
1860  * whether memory allocation should be non-blocking or not.  The code will try
1861  * to allocate *whole* CPC contexts if possible.  If there is any memory
1862  * allocation failure during the allocations needed for a given CPC context, it
1863  * will skip allocating that CPC context because it cannot allocate the whole
1864  * thing.  Thus, the only time that it will end up allocating none (ie. no CPC
1865  * contexts whatsoever) is when it cannot even allocate *one* whole CPC context
1866  * without a memory allocation failure occurring.
1867  */
1868 int
1869 kcpc_cpu_ctx_create(cpu_t *cp, kcpc_request_list_t *req_list, int kmem_flags,
1870     kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz)
1871 {
1872 	kcpc_ctx_t	**ctx_ptrs;
1873 	int		nctx;
1874 	int		nctx_ptrs;
1875 	int		nreqs;
1876 	kcpc_request_t	*reqs;
1877 
1878 	if (cp == NULL || ctx_ptr_array == NULL || ctx_ptr_array_sz == NULL ||
1879 	    req_list == NULL || req_list->krl_cnt < 1)
1880 		return (-1);
1881 
1882 	/*
1883 	 * Allocate number of sets assuming that each set contains one and only
1884 	 * one counter event request for each counter on a CPU
1885 	 */
1886 	nreqs = req_list->krl_cnt;
1887 	nctx_ptrs = (nreqs + cpc_ncounters - 1) / cpc_ncounters;
1888 	ctx_ptrs = kmem_zalloc(nctx_ptrs * sizeof (kcpc_ctx_t *), kmem_flags);
1889 	if (ctx_ptrs == NULL)
1890 		return (-2);
1891 
1892 	/*
1893 	 * Fill in sets of requests
1894 	 */
1895 	nctx = 0;
1896 	reqs = req_list->krl_list;
1897 	while (nreqs > 0) {
1898 		kcpc_ctx_t	*ctx;
1899 		kcpc_set_t	*set;
1900 		int		subcode;
1901 
1902 		/*
1903 		 * Allocate CPC context and set for requested counter events
1904 		 */
1905 		ctx = kcpc_ctx_alloc(kmem_flags);
1906 		set = kcpc_set_create(reqs, nreqs, 0, kmem_flags);
1907 		if (set == NULL) {
1908 			kcpc_ctx_free(ctx);
1909 			break;
1910 		}
1911 
1912 		/*
1913 		 * Determine assignment of requested counter events to specific
1914 		 * counters
1915 		 */
1916 		if (kcpc_assign_reqs(set, ctx) != 0) {
1917 			/*
1918 			 * May not be able to assign requested counter events
1919 			 * to all counters since all counters may not be able
1920 			 * to do all events, so only do one counter event in
1921 			 * set of counter requests when this happens since at
1922 			 * least one of the counters must be able to do the
1923 			 * event.
1924 			 */
1925 			kcpc_free_set(set);
1926 			set = kcpc_set_create(reqs, 1, 0, kmem_flags);
1927 			if (set == NULL) {
1928 				kcpc_ctx_free(ctx);
1929 				break;
1930 			}
1931 			if (kcpc_assign_reqs(set, ctx) != 0) {
1932 #ifdef DEBUG
1933 				cmn_err(CE_NOTE, "!kcpc_cpu_ctx_create: can't "
1934 				    "assign counter event %s!\n",
1935 				    set->ks_req->kr_event);
1936 #endif
1937 				kcpc_free_set(set);
1938 				kcpc_ctx_free(ctx);
1939 				reqs++;
1940 				nreqs--;
1941 				continue;
1942 			}
1943 		}
1944 
1945 		/*
1946 		 * Allocate memory needed to hold requested counter event data
1947 		 */
1948 		set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t),
1949 		    kmem_flags);
1950 		if (set->ks_data == NULL) {
1951 			kcpc_free_set(set);
1952 			kcpc_ctx_free(ctx);
1953 			break;
1954 		}
1955 
1956 		/*
1957 		 * Configure requested counter events
1958 		 */
1959 		if (kcpc_configure_reqs(ctx, set, &subcode) != 0) {
1960 #ifdef DEBUG
1961 			cmn_err(CE_NOTE,
1962 			    "!kcpc_cpu_ctx_create: can't configure "
1963 			    "set of counter event requests!\n");
1964 #endif
1965 			reqs += set->ks_nreqs;
1966 			nreqs -= set->ks_nreqs;
1967 			kmem_free(set->ks_data,
1968 			    set->ks_nreqs * sizeof (uint64_t));
1969 			kcpc_free_set(set);
1970 			kcpc_ctx_free(ctx);
1971 			continue;
1972 		}
1973 
1974 		/*
1975 		 * Point set of counter event requests at this context and fill
1976 		 * in CPC context
1977 		 */
1978 		set->ks_ctx = ctx;
1979 		ctx->kc_set = set;
1980 		ctx->kc_cpuid = cp->cpu_id;
1981 		ctx->kc_thread = curthread;
1982 
1983 		ctx_ptrs[nctx] = ctx;
1984 
1985 		/*
1986 		 * Update requests and how many are left to be assigned to sets
1987 		 */
1988 		reqs += set->ks_nreqs;
1989 		nreqs -= set->ks_nreqs;
1990 
1991 		/*
1992 		 * Increment number of CPC contexts and allocate bigger array
1993 		 * for context pointers as needed
1994 		 */
1995 		nctx++;
1996 		if (nctx >= nctx_ptrs) {
1997 			kcpc_ctx_t	**new;
1998 			int		new_cnt;
1999 
2000 			/*
2001 			 * Allocate more CPC contexts based on how many
2002 			 * contexts allocated so far and how many counter
2003 			 * requests left to assign
2004 			 */
2005 			new_cnt = nctx_ptrs +
2006 			    ((nreqs + cpc_ncounters - 1) / cpc_ncounters);
2007 			new = kmem_zalloc(new_cnt * sizeof (kcpc_ctx_t *),
2008 			    kmem_flags);
2009 			if (new == NULL)
2010 				break;
2011 
2012 			/*
2013 			 * Copy contents of old sets into new ones
2014 			 */
2015 			bcopy(ctx_ptrs, new,
2016 			    nctx_ptrs * sizeof (kcpc_ctx_t *));
2017 
2018 			/*
2019 			 * Free old array of context pointers and use newly
2020 			 * allocated one instead now
2021 			 */
2022 			kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
2023 			ctx_ptrs = new;
2024 			nctx_ptrs = new_cnt;
2025 		}
2026 	}
2027 
2028 	/*
2029 	 * Return NULL if no CPC contexts filled in
2030 	 */
2031 	if (nctx == 0) {
2032 		kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
2033 		*ctx_ptr_array = NULL;
2034 		*ctx_ptr_array_sz = 0;
2035 		return (-2);
2036 	}
2037 
2038 	*ctx_ptr_array = ctx_ptrs;
2039 	*ctx_ptr_array_sz = nctx_ptrs * sizeof (kcpc_ctx_t *);
2040 	return (nctx);
2041 }
2042 
2043 /*
2044  * Return whether PCBE supports given counter event
2045  */
2046 boolean_t
2047 kcpc_event_supported(char *event)
2048 {
2049 	if (pcbe_ops == NULL || pcbe_ops->pcbe_event_coverage(event) == 0)
2050 		return (B_FALSE);
2051 
2052 	return (B_TRUE);
2053 }
2054 
2055 /*
2056  * Program counters on current CPU with given CPC context
2057  *
2058  * If kernel is interposing on counters to measure hardware capacity and
2059  * utilization, then unprogram counters for kernel *before* programming them
2060  * with specified CPC context.
2061  *
2062  * kcpc_{program,unprogram}() may be called either directly by a thread running
2063  * on the target CPU or from a cross-call from another CPU. To protect
2064  * programming and unprogramming from being interrupted by cross-calls, callers
2065  * who execute kcpc_{program,unprogram} should raise PIL to the level used by
2066  * cross-calls.
2067  */
2068 void
2069 kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, boolean_t cu_interpose)
2070 {
2071 	int	error;
2072 
2073 	ASSERT(IS_HIPIL());
2074 
2075 	/*
2076 	 * CPC context shouldn't be NULL, its CPU field should specify current
2077 	 * CPU or be -1 to specify any CPU when the context is bound to a
2078 	 * thread, and preemption should be disabled
2079 	 */
2080 	ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
2081 	    ctx->kc_cpuid == -1) && curthread->t_preempt > 0);
2082 	if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
2083 	    ctx->kc_cpuid != -1) || curthread->t_preempt < 1)
2084 		return;
2085 
2086 	/*
2087 	 * Unprogram counters for kernel measuring hardware capacity and
2088 	 * utilization
2089 	 */
2090 	if (cu_interpose == B_TRUE) {
2091 		cu_cpc_unprogram(CPU, &error);
2092 	} else {
2093 		kcpc_set_t *set = ctx->kc_set;
2094 		int i;
2095 
2096 		ASSERT(set != NULL);
2097 
2098 		/*
2099 		 * Since cu_interpose is false, we are programming CU context.
2100 		 * In general, PCBE can continue from the state saved in the
2101 		 * set, but it is not very reliable, so we start again from the
2102 		 * preset value.
2103 		 */
2104 		for (i = 0; i < set->ks_nreqs; i++) {
2105 			/*
2106 			 * Reset the virtual counter value to the preset value.
2107 			 */
2108 			*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
2109 
2110 			/*
2111 			 * Reset PCBE to the preset value.
2112 			 */
2113 			pcbe_ops->pcbe_configure(0, NULL,
2114 			    set->ks_req[i].kr_preset,
2115 			    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
2116 		}
2117 	}
2118 
2119 	/*
2120 	 * Program counters with specified CPC context
2121 	 */
2122 	ctx->kc_rawtick = KCPC_GET_TICK();
2123 	pcbe_ops->pcbe_program(ctx);
2124 
2125 	/*
2126 	 * Denote that counters programmed for thread or CPU CPC context
2127 	 * differently
2128 	 */
2129 	if (for_thread == B_TRUE)
2130 		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
2131 	else
2132 		CPU->cpu_cpc_ctx = ctx;
2133 }
2134 
2135 /*
2136  * Unprogram counters with given CPC context on current CPU
2137  *
2138  * If kernel is interposing on counters to measure hardware capacity and
2139  * utilization, then program counters for the kernel capacity and utilization
2140  * *after* unprogramming them for given CPC context.
2141  *
2142  * See the comment for kcpc_program regarding the synchronization with
2143  * cross-calls.
2144  */
2145 void
2146 kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose)
2147 {
2148 	int	error;
2149 
2150 	ASSERT(IS_HIPIL());
2151 
2152 	/*
2153 	 * CPC context shouldn't be NULL, its CPU field should specify current
2154 	 * CPU or be -1 to specify any CPU when the context is bound to a
2155 	 * thread, and preemption should be disabled
2156 	 */
2157 	ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
2158 	    ctx->kc_cpuid == -1) && curthread->t_preempt > 0);
2159 
2160 	if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
2161 	    ctx->kc_cpuid != -1) || curthread->t_preempt < 1 ||
2162 	    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) != 0) {
2163 		return;
2164 	}
2165 
2166 	/*
2167 	 * Specified CPC context to be unprogrammed should be bound to current
2168 	 * CPU or thread
2169 	 */
2170 	ASSERT(CPU->cpu_cpc_ctx == ctx || curthread->t_cpc_ctx == ctx);
2171 
2172 	/*
2173 	 * Stop counters
2174 	 */
2175 	pcbe_ops->pcbe_allstop();
2176 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);
2177 
2178 	/*
2179 	 * Allow kernel to interpose on counters and program them for its own
2180 	 * use to measure hardware capacity and utilization if cu_interpose
2181 	 * argument is true
2182 	 */
2183 	if (cu_interpose == B_TRUE)
2184 		cu_cpc_program(CPU, &error);
2185 }
2186 
2187 /*
2188  * Read CPU Performance Counter (CPC) on current CPU and call specified update
2189  * routine with data for each counter event currently programmed on CPU
2190  */
2191 int
2192 kcpc_read(kcpc_update_func_t update_func)
2193 {
2194 	kcpc_ctx_t	*ctx;
2195 	int		i;
2196 	kcpc_request_t	*req;
2197 	int		retval;
2198 	kcpc_set_t	*set;
2199 
2200 	ASSERT(IS_HIPIL());
2201 
2202 	/*
2203 	 * Can't grab locks or block because may be called inside dispatcher
2204 	 */
2205 	kpreempt_disable();
2206 
2207 	ctx = CPU->cpu_cpc_ctx;
2208 	if (ctx == NULL) {
2209 		kpreempt_enable();
2210 		return (0);
2211 	}
2212 
2213 	/*
2214 	 * Read counter data from current CPU
2215 	 */
2216 	pcbe_ops->pcbe_sample(ctx);
2217 
2218 	set = ctx->kc_set;
2219 	if (set == NULL || set->ks_req == NULL) {
2220 		kpreempt_enable();
2221 		return (0);
2222 	}
2223 
2224 	/*
2225 	 * Call update function with preset pointer and data for each CPC event
2226 	 * request currently programmed on current CPU
2227 	 */
2228 	req = set->ks_req;
2229 	retval = 0;
2230 	for (i = 0; i < set->ks_nreqs; i++) {
2231 		int	ret;
2232 
2233 		if (req[i].kr_data == NULL)
2234 			break;
2235 
2236 		ret = update_func(req[i].kr_ptr, *req[i].kr_data);
2237 		if (ret < 0)
2238 			retval = ret;
2239 	}
2240 
2241 	kpreempt_enable();
2242 
2243 	return (retval);
2244 }
2245 
2246 /*
2247  * Initialize list of counter event requests
2248  */
2249 kcpc_request_list_t *
2250 kcpc_reqs_init(int nreqs, int kmem_flags)
2251 {
2252 	kcpc_request_list_t	*req_list;
2253 	kcpc_request_t		*reqs;
2254 
2255 	if (nreqs < 1)
2256 		return (NULL);
2257 
2258 	req_list = kmem_zalloc(sizeof (kcpc_request_list_t), kmem_flags);
2259 	if (req_list == NULL)
2260 		return (NULL);
2261 
2262 	reqs = kmem_zalloc(nreqs * sizeof (kcpc_request_t), kmem_flags);
2263 	if (reqs == NULL) {
2264 		kmem_free(req_list, sizeof (kcpc_request_list_t));
2265 		return (NULL);
2266 	}
2267 
2268 	req_list->krl_list = reqs;
2269 	req_list->krl_cnt = 0;
2270 	req_list->krl_max = nreqs;
2271 	return (req_list);
2272 }
2273 
2274 
2275 /*
2276  * Add counter event request to given list of counter event requests
2277  */
2278 int
2279 kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, uint64_t preset,
2280     uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, int kmem_flags)
2281 {
2282 	kcpc_request_t	*req;
2283 
2284 	if (req_list == NULL || req_list->krl_list == NULL)
2285 		return (-1);
2286 
2287 	ASSERT(req_list->krl_max != 0);
2288 
2289 	/*
2290 	 * Allocate more space (if needed)
2291 	 */
2292 	if (req_list->krl_cnt > req_list->krl_max) {
2293 		kcpc_request_t	*new;
2294 		kcpc_request_t	*old;
2295 
2296 		old = req_list->krl_list;
2297 		new = kmem_zalloc((req_list->krl_max +
2298 		    cpc_ncounters) * sizeof (kcpc_request_t), kmem_flags);
2299 		if (new == NULL)
2300 			return (-2);
2301 
2302 		req_list->krl_list = new;
2303 		bcopy(old, req_list->krl_list,
2304 		    req_list->krl_cnt * sizeof (kcpc_request_t));
2305 		kmem_free(old, req_list->krl_max * sizeof (kcpc_request_t));
2306 		req_list->krl_cnt = 0;
2307 		req_list->krl_max += cpc_ncounters;
2308 	}
2309 
2310 	/*
2311 	 * Fill in request as much as possible now, but some fields will need
2312 	 * to be set when request is assigned to a set.
2313 	 */
2314 	req = &req_list->krl_list[req_list->krl_cnt];
2315 	req->kr_config = NULL;
2316 	req->kr_picnum = -1;	/* have CPC pick this */
2317 	req->kr_index = -1;	/* set when assigning request to set */
2318 	req->kr_data = NULL;	/* set when configuring request */
2319 	(void) strcpy(req->kr_event, event);
2320 	req->kr_preset = preset;
2321 	req->kr_flags = flags;
2322 	req->kr_nattrs = nattrs;
2323 	req->kr_attr = attr;
2324 	/*
2325 	 * Keep pointer given by caller to give to update function when this
2326 	 * counter event is sampled/read
2327 	 */
2328 	req->kr_ptr = ptr;
2329 
2330 	req_list->krl_cnt++;
2331 
2332 	return (0);
2333 }
2334 
2335 /*
2336  * Reset list of CPC event requests so its space can be used for another set
2337  * of requests
2338  */
2339 int
2340 kcpc_reqs_reset(kcpc_request_list_t *req_list)
2341 {
2342 	/*
2343 	 * Return when pointer to request list structure or request is NULL or
2344 	 * when max requests is less than or equal to 0
2345 	 */
2346 	if (req_list == NULL || req_list->krl_list == NULL ||
2347 	    req_list->krl_max <= 0)
2348 		return (-1);
2349 
2350 	/*
2351 	 * Zero out requests and number of requests used
2352 	 */
2353 	bzero(req_list->krl_list, req_list->krl_max * sizeof (kcpc_request_t));
2354 	req_list->krl_cnt = 0;
2355 	return (0);
2356 }
2357 
2358 /*
2359  * Free given list of counter event requests
2360  */
2361 int
2362 kcpc_reqs_fini(kcpc_request_list_t *req_list)
2363 {
2364 	kmem_free(req_list->krl_list,
2365 	    req_list->krl_max * sizeof (kcpc_request_t));
2366 	kmem_free(req_list, sizeof (kcpc_request_list_t));
2367 	return (0);
2368 }
2369 
2370 /*
2371  * Create set of given counter event requests
2372  */
2373 static kcpc_set_t *
2374 kcpc_set_create(kcpc_request_t *reqs, int nreqs, int set_flags, int kmem_flags)
2375 {
2376 	int		i;
2377 	kcpc_set_t	*set;
2378 
2379 	/*
2380 	 * Allocate set and assign number of requests in set and flags
2381 	 */
2382 	set = kmem_zalloc(sizeof (kcpc_set_t), kmem_flags);
2383 	if (set == NULL)
2384 		return (NULL);
2385 
2386 	if (nreqs < cpc_ncounters)
2387 		set->ks_nreqs = nreqs;
2388 	else
2389 		set->ks_nreqs = cpc_ncounters;
2390 
2391 	set->ks_flags = set_flags;
2392 
2393 	/*
2394 	 * Allocate requests needed, copy requests into set, and set index into
2395 	 * data for each request (which may change when we assign requested
2396 	 * counter events to counters)
2397 	 */
2398 	set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
2399 	    set->ks_nreqs, kmem_flags);
2400 	if (set->ks_req == NULL) {
2401 		kmem_free(set, sizeof (kcpc_set_t));
2402 		return (NULL);
2403 	}
2404 
2405 	bcopy(reqs, set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
2406 
2407 	for (i = 0; i < set->ks_nreqs; i++)
2408 		set->ks_req[i].kr_index = i;
2409 
2410 	return (set);
2411 }
2412 
2413 
2414 /*
2415  * Stop counters on current CPU.
2416  *
2417  * If preserve_context is true, the caller is interested in the CPU's CPC
2418  * context and wants it to be preserved.
2419  *
2420  * If preserve_context is false, the caller does not need the CPU's CPC context
2421  * to be preserved, so it is set to NULL.
2422  */
2423 static void
2424 kcpc_cpustop_func(uintptr_t arg1, uintptr_t arg2 __unused)
2425 {
2426 	boolean_t preserve_context;
2427 	kpreempt_disable();
2428 
2429 	preserve_context = (boolean_t)arg1;
2430 	/*
2431 	 * Someone already stopped this context before us, so there is nothing
2432 	 * to do.
2433 	 */
2434 	if (CPU->cpu_cpc_ctx == NULL) {
2435 		kpreempt_enable();
2436 		return;
2437 	}
2438 
2439 	kcpc_unprogram(CPU->cpu_cpc_ctx, B_TRUE);
2440 	/*
2441 	 * If CU does not use counters, then clear the CPU's CPC context
2442 	 * If the caller requested to preserve context it should disable CU
2443 	 * first, so there should be no CU context now.
2444 	 */
2445 	ASSERT(!preserve_context || !CU_CPC_ON(CPU));
2446 	if (!preserve_context && CPU->cpu_cpc_ctx != NULL && !CU_CPC_ON(CPU))
2447 		CPU->cpu_cpc_ctx = NULL;
2448 
2449 	kpreempt_enable();
2450 }
2451 
2452 /*
2453  * Stop counters on given CPU and set its CPC context to NULL unless
2454  * preserve_context is true.
2455  */
2456 void
2457 kcpc_cpu_stop(cpu_t *cp, boolean_t preserve_context)
2458 {
2459 	cpu_call(cp, kcpc_cpustop_func, preserve_context, 0);
2460 }
2461 
2462 /*
2463  * Program the context on the current CPU
2464  */
2465 static void
2466 kcpc_remoteprogram_func(uintptr_t arg1, uintptr_t arg2)
2467 {
2468 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)arg1;
2469 	boolean_t for_thread = (boolean_t)arg2;
2470 
2471 	ASSERT(ctx != NULL);
2472 
2473 	kpreempt_disable();
2474 	kcpc_program(ctx, for_thread, B_TRUE);
2475 	kpreempt_enable();
2476 }
2477 
2478 /*
2479  * Program counters on given CPU
2480  */
2481 void
2482 kcpc_cpu_program(cpu_t *cp, kcpc_ctx_t *ctx)
2483 {
2484 	cpu_call(cp, kcpc_remoteprogram_func, (uintptr_t)ctx,
2485 	    (uintptr_t)B_FALSE);
2486 }
2487 
2488 char *
2489 kcpc_list_attrs(void)
2490 {
2491 	ASSERT(pcbe_ops != NULL);
2492 
2493 	return (pcbe_ops->pcbe_list_attrs());
2494 }
2495 
2496 char *
2497 kcpc_list_events(uint_t pic)
2498 {
2499 	ASSERT(pcbe_ops != NULL);
2500 
2501 	return (pcbe_ops->pcbe_list_events(pic));
2502 }
2503 
2504 uint_t
2505 kcpc_pcbe_capabilities(void)
2506 {
2507 	ASSERT(pcbe_ops != NULL);
2508 
2509 	return (pcbe_ops->pcbe_caps);
2510 }
2511 
2512 int
2513 kcpc_pcbe_loaded(void)
2514 {
2515 	return (pcbe_ops == NULL ? -1 : 0);
2516 }
2517