xref: /illumos-gate/usr/src/uts/common/os/kcpc.c (revision 8ac8a393f9ba5b2bf3aeabc50511c40334e9f5c8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2021 Joyent, Inc.
25  * Copyright 2021 Oxide Computer Company
26  */
27 
28 #include <sys/param.h>
29 #include <sys/thread.h>
30 #include <sys/cpuvar.h>
31 #include <sys/inttypes.h>
32 #include <sys/cmn_err.h>
33 #include <sys/time.h>
34 #include <sys/ksynch.h>
35 #include <sys/systm.h>
36 #include <sys/kcpc.h>
37 #include <sys/cpc_impl.h>
38 #include <sys/cpc_pcbe.h>
39 #include <sys/atomic.h>
40 #include <sys/sunddi.h>
41 #include <sys/modctl.h>
42 #include <sys/sdt.h>
43 #include <sys/archsystm.h>
44 #include <sys/promif.h>
45 #include <sys/x_call.h>
46 #include <sys/cap_util.h>
47 #if defined(__x86)
48 #include <asm/clock.h>
49 #include <sys/xc_levels.h>
50 #endif
51 
52 static kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
53 static kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */
54 
55 
56 krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
57 int		kcpc_cpuctx;		/* number of cpu-specific contexts */
58 
59 int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */
60 
61 /*
62  * These are set when a PCBE module is loaded.
63  */
64 uint_t		cpc_ncounters = 0;
65 pcbe_ops_t	*pcbe_ops = NULL;
66 
67 /*
68  * Statistics on (mis)behavior
69  */
70 static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
71 static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */
72 
73 /*
74  * By setting 'kcpc_nullctx_panic' to 1, any overflow interrupts in a thread
75  * with no valid context will result in a panic.
76  */
77 static int kcpc_nullctx_panic = 0;
78 
79 static void kcpc_save(void *);
80 static void kcpc_restore(void *);
81 static void kcpc_lwp_create(void *, void *);
82 static void kcpc_free(void *, int);
83 static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
84 static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
85 static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
86 static kcpc_set_t *kcpc_set_create(kcpc_request_t *reqs, int nreqs,
87     int set_flags, int kmem_flags);
88 
89 /*
90  * Macros to manipulate context flags. All flag updates should use one of these
91  * two macros
92  *
 * Flags should always be updated atomically since some of the updates are
94  * not protected by locks.
95  */
96 #define	KCPC_CTX_FLAG_SET(ctx, flag) atomic_or_uint(&(ctx)->kc_flags, (flag))
97 #define	KCPC_CTX_FLAG_CLR(ctx, flag) atomic_and_uint(&(ctx)->kc_flags, ~(flag))
98 
/*
 * The IS_HIPIL() macro verifies that the code is executed either from a
 * cross-call or from high-PIL interrupt.
 *
 * On DEBUG kernels it evaluates to true when the current PIL, as reported by
 * getpil(), is at least XCALL_PIL.  On non-DEBUG kernels it expands to
 * nothing.
 * NOTE(review): because of the empty expansion it is presumably only meant to
 * be used as the argument of ASSERT() -- confirm at the call sites.
 */
#ifdef DEBUG
#define	IS_HIPIL() (getpil() >= XCALL_PIL)
#else
#define	IS_HIPIL()
#endif	/* DEBUG */
108 
109 
110 extern int kcpc_hw_load_pcbe(void);
111 
112 /*
113  * Return value from kcpc_hw_load_pcbe()
114  */
115 static int kcpc_pcbe_error = 0;
116 
117 static const struct ctxop_template kcpc_ctxop_tpl = {
118 	.ct_rev		= CTXOP_TPL_REV,
119 	.ct_save	= kcpc_save,
120 	.ct_restore	= kcpc_restore,
121 	.ct_lwp_create	= kcpc_lwp_create,
122 	.ct_free	= kcpc_free,
123 };
124 
125 /*
126  * Perform one-time initialization of kcpc framework.
127  * This function performs the initialization only the first time it is called.
128  * It is safe to call it multiple times.
129  */
130 int
131 kcpc_init(void)
132 {
133 	long hash;
134 	static uint32_t kcpc_initialized = 0;
135 
136 	/*
137 	 * We already tried loading platform pcbe module and failed
138 	 */
139 	if (kcpc_pcbe_error != 0)
140 		return (-1);
141 
142 	/*
143 	 * The kcpc framework should be initialized at most once
144 	 */
145 	if (atomic_cas_32(&kcpc_initialized, 0, 1) != 0)
146 		return (0);
147 
148 	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
149 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
150 		mutex_init(&kcpc_ctx_llock[hash],
151 		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
152 
153 	/*
154 	 * Load platform-specific pcbe module
155 	 */
156 	kcpc_pcbe_error = kcpc_hw_load_pcbe();
157 
158 	return (kcpc_pcbe_error == 0 ? 0 : -1);
159 }
160 
161 void
162 kcpc_register_pcbe(pcbe_ops_t *ops)
163 {
164 	pcbe_ops = ops;
165 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
166 }
167 
/*
 * Install 'func' as the dtrace_cpc_fire hook.  The hardware overflow handler
 * in this file invokes the hook with the overflow bitmap while
 * dtrace_cpc_in_use is set.
 */
void
kcpc_register_dcpc(void (*func)(uint64_t))
{
	dtrace_cpc_fire = func;
}
173 
/*
 * Remove the dtrace_cpc_fire hook by resetting it to NULL.
 */
void
kcpc_unregister_dcpc(void)
{
	dtrace_cpc_fire = NULL;
}
179 
180 int
181 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
182 {
183 	cpu_t		*cp;
184 	kcpc_ctx_t	*ctx;
185 	int		error;
186 	int		save_spl;
187 
188 	ctx = kcpc_ctx_alloc(KM_SLEEP);
189 
190 	if (kcpc_assign_reqs(set, ctx) != 0) {
191 		kcpc_ctx_free(ctx);
192 		*subcode = CPC_RESOURCE_UNAVAIL;
193 		return (EINVAL);
194 	}
195 
196 	ctx->kc_cpuid = cpuid;
197 	ctx->kc_thread = curthread;
198 
199 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
200 
201 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
202 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
203 		kcpc_ctx_free(ctx);
204 		return (error);
205 	}
206 
207 	set->ks_ctx = ctx;
208 	ctx->kc_set = set;
209 
210 	/*
211 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
212 	 * we are manipulating the cpu_t and programming the hardware, else the
213 	 * the cpu_t could go away while we're looking at it.
214 	 */
215 	mutex_enter(&cpu_lock);
216 	cp = cpu_get(cpuid);
217 
218 	if (cp == NULL)
219 		/*
220 		 * The CPU could have been DRd out while we were getting set up.
221 		 */
222 		goto unbound;
223 
224 	mutex_enter(&cp->cpu_cpc_ctxlock);
225 	kpreempt_disable();
226 	save_spl = spl_xcall();
227 
228 	/*
229 	 * Check to see whether counters for CPU already being used by someone
230 	 * other than kernel for capacity and utilization (since kernel will
231 	 * let go of counters for user in kcpc_program() below)
232 	 */
233 	if (cp->cpu_cpc_ctx != NULL && !CU_CPC_ON(cp)) {
234 		/*
235 		 * If this CPU already has a bound set, return an error.
236 		 */
237 		splx(save_spl);
238 		kpreempt_enable();
239 		mutex_exit(&cp->cpu_cpc_ctxlock);
240 		goto unbound;
241 	}
242 
243 	if (curthread->t_bind_cpu != cpuid) {
244 		splx(save_spl);
245 		kpreempt_enable();
246 		mutex_exit(&cp->cpu_cpc_ctxlock);
247 		goto unbound;
248 	}
249 
250 	kcpc_program(ctx, B_FALSE, B_TRUE);
251 
252 	splx(save_spl);
253 	kpreempt_enable();
254 
255 	mutex_exit(&cp->cpu_cpc_ctxlock);
256 	mutex_exit(&cpu_lock);
257 
258 	mutex_enter(&set->ks_lock);
259 	set->ks_state |= KCPC_SET_BOUND;
260 	cv_signal(&set->ks_condv);
261 	mutex_exit(&set->ks_lock);
262 
263 	return (0);
264 
265 unbound:
266 	mutex_exit(&cpu_lock);
267 	set->ks_ctx = NULL;
268 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
269 	kcpc_ctx_free(ctx);
270 	return (EAGAIN);
271 }
272 
273 int
274 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
275 {
276 	kcpc_ctx_t	*ctx;
277 	int		error;
278 
279 	/*
280 	 * Only one set is allowed per context, so ensure there is no
281 	 * existing context.
282 	 */
283 
284 	if (t->t_cpc_ctx != NULL)
285 		return (EEXIST);
286 
287 	ctx = kcpc_ctx_alloc(KM_SLEEP);
288 
289 	/*
290 	 * The context must begin life frozen until it has been properly
291 	 * programmed onto the hardware. This prevents the context ops from
292 	 * worrying about it until we're ready.
293 	 */
294 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
295 	ctx->kc_hrtime = gethrtime();
296 
297 	if (kcpc_assign_reqs(set, ctx) != 0) {
298 		kcpc_ctx_free(ctx);
299 		*subcode = CPC_RESOURCE_UNAVAIL;
300 		return (EINVAL);
301 	}
302 
303 	ctx->kc_cpuid = -1;
304 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
305 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_LWPINHERIT);
306 	ctx->kc_thread = t;
307 	t->t_cpc_ctx = ctx;
308 	/*
309 	 * Permit threads to look at their own hardware counters from userland.
310 	 */
311 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_NONPRIV);
312 
313 	/*
314 	 * Create the data store for this set.
315 	 */
316 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
317 
318 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
319 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
320 		kcpc_ctx_free(ctx);
321 		t->t_cpc_ctx = NULL;
322 		return (error);
323 	}
324 
325 	set->ks_ctx = ctx;
326 	ctx->kc_set = set;
327 
328 	/*
329 	 * Add a device context to the subject thread.
330 	 */
331 	ctxop_install(t, &kcpc_ctxop_tpl, ctx);
332 
333 	/*
334 	 * Ask the backend to program the hardware.
335 	 */
336 	if (t == curthread) {
337 		int save_spl;
338 
339 		kpreempt_disable();
340 		save_spl = spl_xcall();
341 		kcpc_program(ctx, B_TRUE, B_TRUE);
342 		splx(save_spl);
343 		kpreempt_enable();
344 	} else {
345 		/*
346 		 * Since we are the agent LWP, we know the victim LWP is stopped
347 		 * until we're done here; no need to worry about preemption or
348 		 * migration here. We still use an atomic op to clear the flag
349 		 * to ensure the flags are always self-consistent; they can
350 		 * still be accessed from, for instance, another CPU doing a
351 		 * kcpc_invalidate_all().
352 		 */
353 		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
354 	}
355 
356 	mutex_enter(&set->ks_lock);
357 	set->ks_state |= KCPC_SET_BOUND;
358 	cv_signal(&set->ks_condv);
359 	mutex_exit(&set->ks_lock);
360 
361 	return (0);
362 }
363 
364 /*
365  * Walk through each request in the set and ask the PCBE to configure a
366  * corresponding counter.
367  */
368 int
369 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
370 {
371 	int		i;
372 	int		ret;
373 	kcpc_request_t	*rp;
374 
375 	for (i = 0; i < set->ks_nreqs; i++) {
376 		int n;
377 		rp = &set->ks_req[i];
378 
379 		n = rp->kr_picnum;
380 
381 		ASSERT(n >= 0 && n < cpc_ncounters);
382 
383 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
384 
385 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
386 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
387 			    == 0) {
388 				*subcode = -1;
389 				return (ENOTSUP);
390 			}
391 			/*
392 			 * If any of the counters have requested overflow
393 			 * notification, we flag the context as being one that
394 			 * cares about overflow.
395 			 */
396 			KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_SIGOVF);
397 		}
398 
399 		rp->kr_config = NULL;
400 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
401 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
402 		    &(rp->kr_config), (void *)ctx)) != 0) {
403 			kcpc_free_configs(set);
404 			*subcode = ret;
405 			switch (ret) {
406 			case CPC_ATTR_REQUIRES_PRIVILEGE:
407 			case CPC_HV_NO_ACCESS:
408 				return (EACCES);
409 			default:
410 				return (EINVAL);
411 			}
412 		}
413 
414 		ctx->kc_pics[n].kp_req = rp;
415 		rp->kr_picp = &ctx->kc_pics[n];
416 		rp->kr_data = set->ks_data + rp->kr_index;
417 		*rp->kr_data = rp->kr_preset;
418 	}
419 
420 	return (0);
421 }
422 
423 void
424 kcpc_free_configs(kcpc_set_t *set)
425 {
426 	int i;
427 
428 	for (i = 0; i < set->ks_nreqs; i++)
429 		if (set->ks_req[i].kr_config != NULL)
430 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
431 }
432 
433 /*
434  * buf points to a user address and the data should be copied out to that
435  * address in the current process.
436  */
437 int
438 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
439 {
440 	kcpc_ctx_t	*ctx = set->ks_ctx;
441 	int		save_spl;
442 
443 	mutex_enter(&set->ks_lock);
444 	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
445 		mutex_exit(&set->ks_lock);
446 		return (EINVAL);
447 	}
448 	mutex_exit(&set->ks_lock);
449 
450 	/*
451 	 * Kernel preemption must be disabled while reading the hardware regs,
452 	 * and if this is a CPU-bound context, while checking the CPU binding of
453 	 * the current thread.
454 	 */
455 	kpreempt_disable();
456 	save_spl = spl_xcall();
457 
458 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
459 		splx(save_spl);
460 		kpreempt_enable();
461 		return (EAGAIN);
462 	}
463 
464 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
465 		if (ctx->kc_cpuid != -1) {
466 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
467 				splx(save_spl);
468 				kpreempt_enable();
469 				return (EAGAIN);
470 			}
471 		}
472 
473 		if (ctx->kc_thread == curthread) {
474 			uint64_t curtick = KCPC_GET_TICK();
475 
476 			ctx->kc_hrtime = gethrtime_waitfree();
477 			pcbe_ops->pcbe_sample(ctx);
478 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
479 			ctx->kc_rawtick = curtick;
480 		}
481 
482 		/*
483 		 * The config may have been invalidated by
484 		 * the pcbe_sample op.
485 		 */
486 		if (ctx->kc_flags & KCPC_CTX_INVALID) {
487 			splx(save_spl);
488 			kpreempt_enable();
489 			return (EAGAIN);
490 		}
491 
492 	}
493 
494 	splx(save_spl);
495 	kpreempt_enable();
496 
497 	if (copyout(set->ks_data, buf,
498 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
499 		return (EFAULT);
500 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
501 		return (EFAULT);
502 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
503 		return (EFAULT);
504 
505 	return (0);
506 }
507 
508 /*
509  * Stop the counters on the CPU this context is bound to.
510  */
511 static void
512 kcpc_stop_hw(kcpc_ctx_t *ctx)
513 {
514 	cpu_t *cp;
515 
516 	kpreempt_disable();
517 
518 	if (ctx->kc_cpuid == CPU->cpu_id) {
519 		cp = CPU;
520 	} else {
521 		cp = cpu_get(ctx->kc_cpuid);
522 	}
523 
524 	ASSERT(cp != NULL && cp->cpu_cpc_ctx == ctx);
525 	kcpc_cpu_stop(cp, B_FALSE);
526 
527 	kpreempt_enable();
528 }
529 
/*
 * Tear down the binding of 'set'.
 *
 * Waits until the set has finished binding, marks its context invalid, and
 * then either removes the thread's context ops (thread-bound context, i.e.
 * kc_cpuid == -1) or stops the counters on the bound CPU (CPU-bound context).
 * Always returns 0.
 */
int
kcpc_unbind(kcpc_set_t *set)
{
	kcpc_ctx_t	*ctx;
	kthread_t	*t;

	/*
	 * We could be racing with the process's agent thread as it
	 * binds the set; we must wait for the set to finish binding
	 * before attempting to tear it down.
	 */
	mutex_enter(&set->ks_lock);
	while ((set->ks_state & KCPC_SET_BOUND) == 0)
		cv_wait(&set->ks_condv, &set->ks_lock);
	mutex_exit(&set->ks_lock);

	ctx = set->ks_ctx;

	/*
	 * Use kc_lock to synchronize with kcpc_restore().
	 */
	mutex_enter(&ctx->kc_lock);
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
	mutex_exit(&ctx->kc_lock);

	if (ctx->kc_cpuid == -1) {
		t = ctx->kc_thread;
		/*
		 * The context is thread-bound and therefore has a device
		 * context.  It will be freed via ctxop_remove() calling
		 * freectx() calling kcpc_free().
		 */
		if (t == curthread) {
			int save_spl;

			/*
			 * Only the owning thread unprograms the hardware, and
			 * only if the counters have not already been stopped.
			 */
			kpreempt_disable();
			save_spl = spl_xcall();
			if (!(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED))
				kcpc_unprogram(ctx, B_TRUE);
			splx(save_spl);
			kpreempt_enable();
		}
		VERIFY3U(ctxop_remove(t, &kcpc_ctxop_tpl, ctx), !=, 0);
		t->t_cpc_set = NULL;
		t->t_cpc_ctx = NULL;
	} else {
		/*
		 * If we are unbinding a CPU-bound set from a remote CPU, the
		 * native CPU's idle thread could be in the midst of programming
		 * this context onto the CPU. We grab the context's lock here to
		 * ensure that the idle thread is done with it. When we release
		 * the lock, the CPU no longer has a context and the idle thread
		 * will move on.
		 *
		 * cpu_lock must be held to prevent the CPU from being DR'd out
		 * while we disassociate the context from the cpu_t.
		 */
		cpu_t *cp;
		mutex_enter(&cpu_lock);
		cp = cpu_get(ctx->kc_cpuid);
		if (cp != NULL) {
			/*
			 * The CPU may have been DR'd out of the system.
			 */
			mutex_enter(&cp->cpu_cpc_ctxlock);
			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
				kcpc_stop_hw(ctx);
			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
			mutex_exit(&cp->cpu_cpc_ctxlock);
		}
		mutex_exit(&cpu_lock);
		/*
		 * Only the thread recorded as the context's owner frees the
		 * context and clears its own set pointer.
		 */
		if (ctx->kc_thread == curthread) {
			kcpc_free(ctx, 0);
			curthread->t_cpc_set = NULL;
		}
	}

	return (0);
}
609 
610 int
611 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
612 {
613 	int i;
614 
615 	ASSERT(set != NULL);
616 	ASSERT(set->ks_state & KCPC_SET_BOUND);
617 	ASSERT(set->ks_ctx->kc_thread == curthread);
618 	ASSERT(set->ks_ctx->kc_cpuid == -1);
619 
620 	if (index < 0 || index >= set->ks_nreqs)
621 		return (EINVAL);
622 
623 	for (i = 0; i < set->ks_nreqs; i++)
624 		if (set->ks_req[i].kr_index == index)
625 			break;
626 	ASSERT(i != set->ks_nreqs);
627 
628 	set->ks_req[i].kr_preset = preset;
629 	return (0);
630 }
631 
632 int
633 kcpc_restart(kcpc_set_t *set)
634 {
635 	kcpc_ctx_t	*ctx = set->ks_ctx;
636 	int		i;
637 	int		save_spl;
638 
639 	ASSERT(set->ks_state & KCPC_SET_BOUND);
640 	ASSERT(ctx->kc_thread == curthread);
641 	ASSERT(ctx->kc_cpuid == -1);
642 
643 	for (i = 0; i < set->ks_nreqs; i++) {
644 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
645 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
646 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
647 	}
648 
649 	kpreempt_disable();
650 	save_spl = spl_xcall();
651 
652 	/*
653 	 * If the user is doing this on a running set, make sure the counters
654 	 * are stopped first.
655 	 */
656 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
657 		pcbe_ops->pcbe_allstop();
658 
659 	/*
660 	 * Ask the backend to program the hardware.
661 	 */
662 	ctx->kc_rawtick = KCPC_GET_TICK();
663 	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
664 	pcbe_ops->pcbe_program(ctx);
665 	splx(save_spl);
666 	kpreempt_enable();
667 
668 	return (0);
669 }
670 
671 /*
672  * Caller must hold kcpc_cpuctx_lock.
673  */
674 int
675 kcpc_enable(kthread_t *t, int cmd, int enable)
676 {
677 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
678 	kcpc_set_t	*set = t->t_cpc_set;
679 	kcpc_set_t	*newset;
680 	int		i;
681 	int		flag;
682 	int		err;
683 
684 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
685 
686 	if (ctx == NULL) {
687 		/*
688 		 * This thread has a set but no context; it must be a
689 		 * CPU-bound set.
690 		 */
691 		ASSERT(t->t_cpc_set != NULL);
692 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
693 		return (EINVAL);
694 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
695 		return (EAGAIN);
696 
697 	if (cmd == CPC_ENABLE) {
698 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
699 			return (EINVAL);
700 		kpreempt_disable();
701 		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
702 		kcpc_restore(ctx);
703 		kpreempt_enable();
704 	} else if (cmd == CPC_DISABLE) {
705 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
706 			return (EINVAL);
707 		kpreempt_disable();
708 		kcpc_save(ctx);
709 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
710 		kpreempt_enable();
711 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
712 		/*
713 		 * Strategy for usr/sys: stop counters and update set's presets
714 		 * with current counter values, unbind, update requests with
715 		 * new config, then re-bind.
716 		 */
717 		flag = (cmd == CPC_USR_EVENTS) ?
718 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
719 
720 		kpreempt_disable();
721 		KCPC_CTX_FLAG_SET(ctx,
722 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
723 		pcbe_ops->pcbe_allstop();
724 		kpreempt_enable();
725 
726 		for (i = 0; i < set->ks_nreqs; i++) {
727 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
728 			if (enable)
729 				set->ks_req[i].kr_flags |= flag;
730 			else
731 				set->ks_req[i].kr_flags &= ~flag;
732 		}
733 		newset = kcpc_dup_set(set);
734 		if (kcpc_unbind(set) != 0)
735 			return (EINVAL);
736 		t->t_cpc_set = newset;
737 		if (kcpc_bind_thread(newset, t, &err) != 0) {
738 			t->t_cpc_set = NULL;
739 			kcpc_free_set(newset);
740 			return (EINVAL);
741 		}
742 	} else
743 		return (EINVAL);
744 
745 	return (0);
746 }
747 
748 /*
749  * Provide PCBEs with a way of obtaining the configs of every counter which will
750  * be programmed together.
751  *
752  * If current is NULL, provide the first config.
753  *
754  * If data != NULL, caller wants to know where the data store associated with
755  * the config we return is located.
756  */
757 void *
758 kcpc_next_config(void *token, void *current, uint64_t **data)
759 {
760 	int		i;
761 	kcpc_pic_t	*pic;
762 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
763 
764 	if (current == NULL) {
765 		/*
766 		 * Client would like the first config, which may not be in
767 		 * counter 0; we need to search through the counters for the
768 		 * first config.
769 		 */
770 		for (i = 0; i < cpc_ncounters; i++)
771 			if (ctx->kc_pics[i].kp_req != NULL)
772 				break;
773 		/*
774 		 * There are no counters configured for the given context.
775 		 */
776 		if (i == cpc_ncounters)
777 			return (NULL);
778 	} else {
779 		/*
780 		 * There surely is a faster way to do this.
781 		 */
782 		for (i = 0; i < cpc_ncounters; i++) {
783 			pic = &ctx->kc_pics[i];
784 
785 			if (pic->kp_req != NULL &&
786 			    current == pic->kp_req->kr_config)
787 				break;
788 		}
789 
790 		/*
791 		 * We found the current config at picnum i. Now search for the
792 		 * next configured PIC.
793 		 */
794 		for (i++; i < cpc_ncounters; i++) {
795 			pic = &ctx->kc_pics[i];
796 			if (pic->kp_req != NULL)
797 				break;
798 		}
799 
800 		if (i == cpc_ncounters)
801 			return (NULL);
802 	}
803 
804 	if (data != NULL) {
805 		*data = ctx->kc_pics[i].kp_req->kr_data;
806 	}
807 
808 	return (ctx->kc_pics[i].kp_req->kr_config);
809 }
810 
811 
812 kcpc_ctx_t *
813 kcpc_ctx_alloc(int kmem_flags)
814 {
815 	kcpc_ctx_t	*ctx;
816 	long		hash;
817 
818 	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), kmem_flags);
819 	if (ctx == NULL)
820 		return (NULL);
821 
822 	hash = CPC_HASH_CTX(ctx);
823 	mutex_enter(&kcpc_ctx_llock[hash]);
824 	ctx->kc_next = kcpc_ctx_list[hash];
825 	kcpc_ctx_list[hash] = ctx;
826 	mutex_exit(&kcpc_ctx_llock[hash]);
827 
828 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
829 	    cpc_ncounters, KM_SLEEP);
830 
831 	ctx->kc_cpuid = -1;
832 
833 	return (ctx);
834 }
835 
836 /*
837  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
838  * in the flags.
839  */
840 static void
841 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
842 {
843 	kcpc_set_t	*ks = ctx->kc_set, *cks;
844 	int		i, j;
845 	int		code;
846 
847 	ASSERT(ks != NULL);
848 
849 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
850 		return;
851 
852 	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
853 	cks->ks_state &= ~KCPC_SET_BOUND;
854 	cctx->kc_set = cks;
855 	cks->ks_flags = ks->ks_flags;
856 	cks->ks_nreqs = ks->ks_nreqs;
857 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
858 	    sizeof (kcpc_request_t), KM_SLEEP);
859 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
860 	    KM_SLEEP);
861 	cks->ks_ctx = cctx;
862 
863 	for (i = 0; i < cks->ks_nreqs; i++) {
864 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
865 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
866 		(void) strncpy(cks->ks_req[i].kr_event,
867 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
868 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
869 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
870 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
871 		if (ks->ks_req[i].kr_nattrs > 0) {
872 			cks->ks_req[i].kr_attr =
873 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
874 			    sizeof (kcpc_attr_t), KM_SLEEP);
875 		}
876 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
877 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
878 			    ks->ks_req[i].kr_attr[j].ka_name,
879 			    CPC_MAX_ATTR_LEN);
880 			cks->ks_req[i].kr_attr[j].ka_val =
881 			    ks->ks_req[i].kr_attr[j].ka_val;
882 		}
883 	}
884 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
885 		kcpc_invalidate_config(cctx);
886 
887 	mutex_enter(&cks->ks_lock);
888 	cks->ks_state |= KCPC_SET_BOUND;
889 	cv_signal(&cks->ks_condv);
890 	mutex_exit(&cks->ks_lock);
891 }
892 
893 
894 void
895 kcpc_ctx_free(kcpc_ctx_t *ctx)
896 {
897 	kcpc_ctx_t	**loc;
898 	long		hash = CPC_HASH_CTX(ctx);
899 
900 	mutex_enter(&kcpc_ctx_llock[hash]);
901 	loc = &kcpc_ctx_list[hash];
902 	ASSERT(*loc != NULL);
903 	while (*loc != ctx)
904 		loc = &(*loc)->kc_next;
905 	*loc = ctx->kc_next;
906 	mutex_exit(&kcpc_ctx_llock[hash]);
907 
908 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
909 	cv_destroy(&ctx->kc_condv);
910 	mutex_destroy(&ctx->kc_lock);
911 	kmem_free(ctx, sizeof (*ctx));
912 }
913 
914 /*
915  * Generic interrupt handler used on hardware that generates
916  * overflow interrupts.
917  *
918  * Note: executed at high-level interrupt context!
919  */
920 /*ARGSUSED*/
921 kcpc_ctx_t *
922 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
923 {
924 	kcpc_ctx_t	*ctx;
925 	kthread_t	*t = curthread;
926 	int		i;
927 
928 	/*
929 	 * On both x86 and UltraSPARC, we may deliver the high-level
930 	 * interrupt in kernel mode, just after we've started to run an
931 	 * interrupt thread.  (That's because the hardware helpfully
932 	 * delivers the overflow interrupt some random number of cycles
933 	 * after the instruction that caused the overflow by which time
934 	 * we're in some part of the kernel, not necessarily running on
935 	 * the right thread).
936 	 *
937 	 * Check for this case here -- find the pinned thread
938 	 * that was running when the interrupt went off.
939 	 */
940 	if (t->t_flag & T_INTR_THREAD) {
941 		klwp_t *lwp;
942 
943 		atomic_inc_32(&kcpc_intrctx_count);
944 
945 		/*
946 		 * Note that t_lwp is always set to point at the underlying
947 		 * thread, thus this will work in the presence of nested
948 		 * interrupts.
949 		 */
950 		ctx = NULL;
951 		if ((lwp = t->t_lwp) != NULL) {
952 			t = lwptot(lwp);
953 			ctx = t->t_cpc_ctx;
954 		}
955 	} else
956 		ctx = t->t_cpc_ctx;
957 
958 	if (ctx == NULL) {
959 		/*
960 		 * This can easily happen if we're using the counters in
961 		 * "shared" mode, for example, and an overflow interrupt
962 		 * occurs while we are running cpustat.  In that case, the
963 		 * bound thread that has the context that belongs to this
964 		 * CPU is almost certainly sleeping (if it was running on
965 		 * the CPU we'd have found it above), and the actual
966 		 * interrupted thread has no knowledge of performance counters!
967 		 */
968 		ctx = curthread->t_cpu->cpu_cpc_ctx;
969 		if (ctx != NULL) {
970 			/*
971 			 * Return the bound context for this CPU to
972 			 * the interrupt handler so that it can synchronously
973 			 * sample the hardware counters and restart them.
974 			 */
975 			return (ctx);
976 		}
977 
978 		/*
979 		 * As long as the overflow interrupt really is delivered early
980 		 * enough after trapping into the kernel to avoid switching
981 		 * threads, we must always be able to find the cpc context,
982 		 * or something went terribly wrong i.e. we ended up
983 		 * running a passivated interrupt thread, a kernel
984 		 * thread or we interrupted idle, all of which are Very Bad.
985 		 *
986 		 * We also could end up here owing to an incredibly unlikely
987 		 * race condition that exists on x86 based architectures when
988 		 * the cpc provider is in use; overflow interrupts are directed
989 		 * to the cpc provider if the 'dtrace_cpc_in_use' variable is
990 		 * set when we enter the handler. This variable is unset after
991 		 * overflow interrupts have been disabled on all CPUs and all
992 		 * contexts have been torn down. To stop interrupts, the cpc
993 		 * provider issues a xcall to the remote CPU before it tears
994 		 * down that CPUs context. As high priority xcalls, on an x86
995 		 * architecture, execute at a higher PIL than this handler, it
996 		 * is possible (though extremely unlikely) that the xcall could
997 		 * interrupt the overflow handler before the handler has
998 		 * checked the 'dtrace_cpc_in_use' variable, stop the counters,
999 		 * return to the cpc provider which could then rip down
1000 		 * contexts and unset 'dtrace_cpc_in_use' *before* the CPUs
1001 		 * overflow handler has had a chance to check the variable. In
1002 		 * that case, the handler would direct the overflow into this
1003 		 * code and no valid context will be found. The default behavior
1004 		 * when no valid context is found is now to shout a warning to
1005 		 * the console and bump the 'kcpc_nullctx_count' variable.
1006 		 */
1007 		if (kcpc_nullctx_panic)
1008 			panic("null cpc context, thread %p", (void *)t);
1009 #ifdef DEBUG
1010 		cmn_err(CE_NOTE,
1011 		    "null cpc context found in overflow handler!\n");
1012 #endif
1013 		atomic_inc_32(&kcpc_nullctx_count);
1014 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
1015 		/*
1016 		 * Schedule an ast to sample the counters, which will
1017 		 * propagate any overflow into the virtualized performance
1018 		 * counter(s), and may deliver a signal.
1019 		 */
1020 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1021 		/*
1022 		 * If a counter has overflowed which was counting on behalf of
1023 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
1024 		 * process a signal.
1025 		 */
1026 		for (i = 0; i < cpc_ncounters; i++) {
1027 			if (ctx->kc_pics[i].kp_req != NULL &&
1028 			    bitmap & (1 << i) &&
1029 			    ctx->kc_pics[i].kp_req->kr_flags &
1030 			    CPC_OVF_NOTIFY_EMT) {
1031 				/*
1032 				 * A signal has been requested for this PIC, so
1033 				 * so freeze the context. The interrupt handler
1034 				 * has already stopped the counter hardware.
1035 				 */
1036 				KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
1037 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
1038 				    KCPC_PIC_OVERFLOWED);
1039 			}
1040 		}
1041 		aston(t);
1042 	} else if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
1043 		/*
1044 		 * Thread context is no longer valid, but here may be a valid
1045 		 * CPU context.
1046 		 */
1047 		return (curthread->t_cpu->cpu_cpc_ctx);
1048 	}
1049 
1050 	return (NULL);
1051 }
1052 
1053 /*
1054  * The current thread context had an overflow interrupt; we're
1055  * executing here in high-level interrupt context.
1056  */
1057 /*ARGSUSED*/
1058 uint_t
1059 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
1060 {
1061 	kcpc_ctx_t *ctx;
1062 	uint64_t bitmap;
1063 	uint8_t *state;
1064 	int	save_spl;
1065 
1066 	if (pcbe_ops == NULL ||
1067 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
1068 		return (DDI_INTR_UNCLAIMED);
1069 
1070 	/*
1071 	 * Prevent any further interrupts.
1072 	 */
1073 	pcbe_ops->pcbe_allstop();
1074 
1075 	if (dtrace_cpc_in_use) {
1076 		state = &cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state;
1077 
1078 		/*
1079 		 * Set the per-CPU state bit to indicate that we are currently
1080 		 * processing an interrupt if it is currently free. Drop the
1081 		 * interrupt if the state isn't free (i.e. a configuration
1082 		 * event is taking place).
1083 		 */
1084 		if (atomic_cas_8(state, DCPC_INTR_FREE,
1085 		    DCPC_INTR_PROCESSING) == DCPC_INTR_FREE) {
1086 			int i;
1087 			kcpc_request_t req;
1088 
1089 			ASSERT(dtrace_cpc_fire != NULL);
1090 
1091 			(*dtrace_cpc_fire)(bitmap);
1092 
1093 			ctx = curthread->t_cpu->cpu_cpc_ctx;
1094 			if (ctx == NULL) {
1095 #ifdef DEBUG
1096 				cmn_err(CE_NOTE, "null cpc context in"
1097 				    "hardware overflow handler!\n");
1098 #endif
1099 				return (DDI_INTR_CLAIMED);
1100 			}
1101 
1102 			/* Reset any counters that have overflowed */
1103 			for (i = 0; i < ctx->kc_set->ks_nreqs; i++) {
1104 				req = ctx->kc_set->ks_req[i];
1105 
1106 				if (bitmap & (1 << req.kr_picnum)) {
1107 					pcbe_ops->pcbe_configure(req.kr_picnum,
1108 					    req.kr_event, req.kr_preset,
1109 					    req.kr_flags, req.kr_nattrs,
1110 					    req.kr_attr, &(req.kr_config),
1111 					    (void *)ctx);
1112 				}
1113 			}
1114 			pcbe_ops->pcbe_program(ctx);
1115 
1116 			/*
1117 			 * We've finished processing the interrupt so set
1118 			 * the state back to free.
1119 			 */
1120 			cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state =
1121 			    DCPC_INTR_FREE;
1122 			membar_producer();
1123 		}
1124 		return (DDI_INTR_CLAIMED);
1125 	}
1126 
1127 	/*
1128 	 * DTrace isn't involved so pass on accordingly.
1129 	 *
1130 	 * If the interrupt has occurred in the context of an lwp owning
1131 	 * the counters, then the handler posts an AST to the lwp to
1132 	 * trigger the actual sampling, and optionally deliver a signal or
1133 	 * restart the counters, on the way out of the kernel using
1134 	 * kcpc_hw_overflow_ast() (see below).
1135 	 *
1136 	 * On the other hand, if the handler returns the context to us
1137 	 * directly, then it means that there are no other threads in
1138 	 * the middle of updating it, no AST has been posted, and so we
1139 	 * should sample the counters here, and restart them with no
1140 	 * further fuss.
1141 	 *
1142 	 * The CPU's CPC context may disappear as a result of cross-call which
1143 	 * has higher PIL on x86, so protect the context by raising PIL to the
1144 	 * cross-call level.
1145 	 */
1146 	save_spl = spl_xcall();
1147 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
1148 		uint64_t curtick = KCPC_GET_TICK();
1149 
1150 		ctx->kc_hrtime = gethrtime_waitfree();
1151 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
1152 		ctx->kc_rawtick = curtick;
1153 		pcbe_ops->pcbe_sample(ctx);
1154 		pcbe_ops->pcbe_program(ctx);
1155 	}
1156 	splx(save_spl);
1157 
1158 	return (DDI_INTR_CLAIMED);
1159 }
1160 
1161 /*
1162  * Called from trap() when processing the ast posted by the high-level
1163  * interrupt handler.
1164  */
1165 int
1166 kcpc_overflow_ast()
1167 {
1168 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
1169 	int		i;
1170 	int		found = 0;
1171 	uint64_t	curtick = KCPC_GET_TICK();
1172 
1173 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
1174 
1175 	/*
1176 	 * An overflow happened: sample the context to ensure that
1177 	 * the overflow is propagated into the upper bits of the
1178 	 * virtualized 64-bit counter(s).
1179 	 */
1180 	kpreempt_disable();
1181 	ctx->kc_hrtime = gethrtime_waitfree();
1182 	pcbe_ops->pcbe_sample(ctx);
1183 	kpreempt_enable();
1184 
1185 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
1186 
1187 	/*
1188 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
1189 	 * if that pic generated an overflow and if the request it was counting
1190 	 * on behalf of had CPC_OVERFLOW_REQUEST specified. We go through all
1191 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
1192 	 * found any overflowed pics, keep the context frozen and return true
1193 	 * (thus causing a signal to be sent).
1194 	 */
1195 	for (i = 0; i < cpc_ncounters; i++) {
1196 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
1197 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
1198 			    ~KCPC_PIC_OVERFLOWED);
1199 			found = 1;
1200 		}
1201 	}
1202 	if (found)
1203 		return (1);
1204 
1205 	/*
1206 	 * Otherwise, re-enable the counters and continue life as before.
1207 	 */
1208 	kpreempt_disable();
1209 	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
1210 	pcbe_ops->pcbe_program(ctx);
1211 	kpreempt_enable();
1212 	return (0);
1213 }
1214 
1215 /*
1216  * Called when switching away from current thread.
1217  */
1218 static void
1219 kcpc_save(void *arg)
1220 {
1221 	kcpc_ctx_t *ctx = arg;
1222 	int err;
1223 	int save_spl;
1224 
1225 	kpreempt_disable();
1226 	save_spl = spl_xcall();
1227 
1228 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1229 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
1230 			splx(save_spl);
1231 			kpreempt_enable();
1232 			return;
1233 		}
1234 		/*
1235 		 * This context has been invalidated but the counters have not
1236 		 * been stopped. Stop them here and mark the context stopped.
1237 		 */
1238 		kcpc_unprogram(ctx, B_TRUE);
1239 		splx(save_spl);
1240 		kpreempt_enable();
1241 		return;
1242 	}
1243 
1244 	pcbe_ops->pcbe_allstop();
1245 	if (ctx->kc_flags & KCPC_CTX_FREEZE) {
1246 		splx(save_spl);
1247 		kpreempt_enable();
1248 		return;
1249 	}
1250 
1251 	/*
1252 	 * Need to sample for all reqs into each req's current mpic.
1253 	 */
1254 	ctx->kc_hrtime = gethrtime_waitfree();
1255 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
1256 	pcbe_ops->pcbe_sample(ctx);
1257 
1258 	/*
1259 	 * Program counter for measuring capacity and utilization since user
1260 	 * thread isn't using counter anymore
1261 	 */
1262 	ASSERT(ctx->kc_cpuid == -1);
1263 	cu_cpc_program(CPU, &err);
1264 	splx(save_spl);
1265 	kpreempt_enable();
1266 }
1267 
/*
 * Counterpart of kcpc_save(): reprogram the counters with the given
 * context (arg is a kcpc_ctx_t) unless the context is invalid or frozen.
 *
 * KCPC_CTX_RESTORE is held in kc_flags for the duration of the hardware
 * programming so that kcpc_free() can wait for us to finish.
 */
static void
kcpc_restore(void *arg)
{
	kcpc_ctx_t *ctx = arg;
	int save_spl;

	mutex_enter(&ctx->kc_lock);

	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
	    KCPC_CTX_INVALID) {
		/*
		 * The context is invalidated but has not been marked stopped.
		 * We mark it as such here because we will not start the
		 * counters during this context switch.
		 */
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);
	}

	/* Nothing to program for an invalid or frozen context. */
	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
		mutex_exit(&ctx->kc_lock);
		return;
	}

	/*
	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
	 * ctx & set related memory objects being freed without us knowing.
	 * This can happen if an agent thread is executing a kcpc_unbind(),
	 * with this thread as the target, whilst we're concurrently doing a
	 * restorectx() during, for example, a proc_exit().  Effectively, by
	 * doing this, we're asking kcpc_free() to cv_wait() until
	 * kcpc_restore() has completed.
	 */
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_RESTORE);
	mutex_exit(&ctx->kc_lock);

	/*
	 * While programming the hardware, the counters should be stopped. We
	 * don't do an explicit pcbe_allstop() here because they should have
	 * been stopped already by the last consumer.
	 */
	kpreempt_disable();
	save_spl = spl_xcall();
	kcpc_program(ctx, B_TRUE, B_TRUE);
	splx(save_spl);
	kpreempt_enable();

	/*
	 * Wake the agent thread if it's waiting in kcpc_free().
	 */
	mutex_enter(&ctx->kc_lock);
	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_RESTORE);
	cv_signal(&ctx->kc_condv);
	mutex_exit(&ctx->kc_lock);
}
1322 
1323 /*
1324  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the the
1325  * following context operators to the idle thread on each CPU. They stop the
1326  * counters when the idle thread is switched on, and they start them again when
1327  * it is switched off.
1328  */
1329 /*ARGSUSED*/
1330 static void
1331 kcpc_idle_save(void *arg)
1332 {
1333 	struct cpu *cp = arg;
1334 
1335 	/*
1336 	 * The idle thread shouldn't be run anywhere else.
1337 	 */
1338 	ASSERT(CPU == cp);
1339 
1340 	/*
1341 	 * We must hold the CPU's context lock to ensure the context isn't freed
1342 	 * while we're looking at it.
1343 	 */
1344 	mutex_enter(&cp->cpu_cpc_ctxlock);
1345 
1346 	if ((cp->cpu_cpc_ctx == NULL) ||
1347 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1348 		mutex_exit(&cp->cpu_cpc_ctxlock);
1349 		return;
1350 	}
1351 
1352 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
1353 	mutex_exit(&cp->cpu_cpc_ctxlock);
1354 }
1355 
1356 static void
1357 kcpc_idle_restore(void *arg)
1358 {
1359 	struct cpu *cp = arg;
1360 
1361 	/*
1362 	 * The idle thread shouldn't be run anywhere else.
1363 	 */
1364 	ASSERT(CPU == cp);
1365 
1366 	/*
1367 	 * We must hold the CPU's context lock to ensure the context isn't freed
1368 	 * while we're looking at it.
1369 	 */
1370 	mutex_enter(&cp->cpu_cpc_ctxlock);
1371 
1372 	if ((cp->cpu_cpc_ctx == NULL) ||
1373 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1374 		mutex_exit(&cp->cpu_cpc_ctxlock);
1375 		return;
1376 	}
1377 
1378 	pcbe_ops->pcbe_allstop();
1379 	mutex_exit(&cp->cpu_cpc_ctxlock);
1380 }
1381 
/*
 * Context-operator template for the idle thread: only the save and restore
 * hooks are provided, wired to kcpc_idle_save() and kcpc_idle_restore().
 */
static const struct ctxop_template kcpc_idle_ctxop_tpl = {
	.ct_rev		= CTXOP_TPL_REV,
	.ct_save	= kcpc_idle_save,
	.ct_restore	= kcpc_idle_restore,
};
1387 
/*
 * Install the idle-thread context operators (kcpc_idle_ctxop_tpl) on thread
 * t, with cp as the argument handed to the save/restore hooks.
 */
void
kcpc_idle_ctxop_install(kthread_t *t, struct cpu *cp)
{
	ctxop_install(t, &kcpc_idle_ctxop_tpl, cp);
}
1393 
1394 /*ARGSUSED*/
1395 static void
1396 kcpc_lwp_create(void *parent, void *child)
1397 {
1398 	kthread_t *t = parent, *ct = child;
1399 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
1400 	int		i;
1401 
1402 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
1403 		return;
1404 
1405 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
1406 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1407 		rw_exit(&kcpc_cpuctx_lock);
1408 		return;
1409 	}
1410 	cctx = kcpc_ctx_alloc(KM_SLEEP);
1411 	kcpc_ctx_clone(ctx, cctx);
1412 	rw_exit(&kcpc_cpuctx_lock);
1413 
1414 	/*
1415 	 * Copy the parent context's kc_flags field, but don't overwrite
1416 	 * the child's in case it was modified during kcpc_ctx_clone.
1417 	 */
1418 	KCPC_CTX_FLAG_SET(cctx,  ctx->kc_flags);
1419 	cctx->kc_thread = ct;
1420 	cctx->kc_cpuid = -1;
1421 	ct->t_cpc_set = cctx->kc_set;
1422 	ct->t_cpc_ctx = cctx;
1423 
1424 	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
1425 		kcpc_set_t *ks = cctx->kc_set;
1426 		/*
1427 		 * Our contract with the user requires us to immediately send an
1428 		 * overflow signal to all children if we have the LWPINHERIT
1429 		 * and SIGOVF flags set. In addition, all counters should be
1430 		 * set to UINT64_MAX, and their pic's overflow flag turned on
1431 		 * so that our trap() processing knows to send a signal.
1432 		 */
1433 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
1434 		for (i = 0; i < ks->ks_nreqs; i++) {
1435 			kcpc_request_t *kr = &ks->ks_req[i];
1436 
1437 			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
1438 				*(kr->kr_data) = UINT64_MAX;
1439 				atomic_or_uint(&kr->kr_picp->kp_flags,
1440 				    KCPC_PIC_OVERFLOWED);
1441 			}
1442 		}
1443 		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1444 		aston(ct);
1445 	}
1446 
1447 	ctxop_install(ct, &kcpc_ctxop_tpl, cctx);
1448 }
1449 
1450 /*
1451  * Counter Stoppage Theory
1452  *
1453  * The counters may need to be stopped properly at the following occasions:
1454  *
1455  * 1) An LWP exits.
1456  * 2) A thread exits.
1457  * 3) An LWP performs an exec().
1458  * 4) A bound set is unbound.
1459  *
1460  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
1461  * to be freed as well.
1462  *
1463  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
1464  * when the thread is freed, kcpc_free(), called by freectx(), frees the
1465  * context.
1466  *
1467  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
1468  *
1469  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
1470  * been called from exec. It stops the counters _and_ frees the context.
1471  *
1472  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
1473  *
1474  * CPU-bound counters are always stopped via kcpc_unbind().
1475  */
1476 
1477 /*
1478  * We're being called to delete the context; we ensure that all associated data
1479  * structures are freed, and that the hardware is passivated if this is an exec.
1480  */
1481 
1482 /*ARGSUSED*/
1483 void
1484 kcpc_free(void *arg, int isexec)
1485 {
1486 	kcpc_ctx_t *ctx = arg;
1487 	int		i;
1488 	kcpc_set_t	*set = ctx->kc_set;
1489 
1490 	ASSERT(set != NULL);
1491 
1492 	/*
1493 	 * Wait for kcpc_restore() to finish before we tear things down.
1494 	 */
1495 	mutex_enter(&ctx->kc_lock);
1496 	while (ctx->kc_flags & KCPC_CTX_RESTORE)
1497 		cv_wait(&ctx->kc_condv, &ctx->kc_lock);
1498 	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1499 	mutex_exit(&ctx->kc_lock);
1500 
1501 	if (isexec) {
1502 		/*
1503 		 * This thread is execing, and after the exec it should not have
1504 		 * any performance counter context. Stop the counters properly
1505 		 * here so the system isn't surprised by an overflow interrupt
1506 		 * later.
1507 		 */
1508 		if (ctx->kc_cpuid != -1) {
1509 			cpu_t *cp;
1510 			/*
1511 			 * CPU-bound context; stop the appropriate CPU's ctrs.
1512 			 * Hold cpu_lock while examining the CPU to ensure it
1513 			 * doesn't go away.
1514 			 */
1515 			mutex_enter(&cpu_lock);
1516 			cp = cpu_get(ctx->kc_cpuid);
1517 			/*
1518 			 * The CPU could have been DR'd out, so only stop the
1519 			 * CPU and clear its context pointer if the CPU still
1520 			 * exists.
1521 			 */
1522 			if (cp != NULL) {
1523 				mutex_enter(&cp->cpu_cpc_ctxlock);
1524 				kcpc_stop_hw(ctx);
1525 				mutex_exit(&cp->cpu_cpc_ctxlock);
1526 			}
1527 			mutex_exit(&cpu_lock);
1528 			ASSERT(curthread->t_cpc_ctx == NULL);
1529 		} else {
1530 			int save_spl;
1531 
1532 			/*
1533 			 * Thread-bound context; stop _this_ CPU's counters.
1534 			 */
1535 			kpreempt_disable();
1536 			save_spl = spl_xcall();
1537 			kcpc_unprogram(ctx, B_TRUE);
1538 			curthread->t_cpc_ctx = NULL;
1539 			splx(save_spl);
1540 			kpreempt_enable();
1541 		}
1542 
1543 		/*
1544 		 * Since we are being called from an exec and we know that
1545 		 * exec is not permitted via the agent thread, we should clean
1546 		 * up this thread's CPC state completely, and not leave dangling
1547 		 * CPC pointers behind.
1548 		 */
1549 		ASSERT(ctx->kc_thread == curthread);
1550 		curthread->t_cpc_set = NULL;
1551 	}
1552 
1553 	/*
1554 	 * Walk through each request in this context's set and free the PCBE's
1555 	 * configuration if it exists.
1556 	 */
1557 	for (i = 0; i < set->ks_nreqs; i++) {
1558 		if (set->ks_req[i].kr_config != NULL)
1559 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
1560 	}
1561 
1562 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
1563 	kcpc_ctx_free(ctx);
1564 	kcpc_free_set(set);
1565 }
1566 
/*
 * Free a context via kcpc_free() with isexec == 0, i.e. without the
 * exec-time hardware passivation.
 */
void
kcpc_free_cpu(kcpc_ctx_t *ctx)
{
	kcpc_free(ctx, 0);
}
1572 
1573 /*
1574  * Free the memory associated with a request set.
1575  */
1576 void
1577 kcpc_free_set(kcpc_set_t *set)
1578 {
1579 	int		i;
1580 	kcpc_request_t	*req;
1581 
1582 	ASSERT(set->ks_req != NULL);
1583 
1584 	for (i = 0; i < set->ks_nreqs; i++) {
1585 		req = &set->ks_req[i];
1586 
1587 		if (req->kr_nattrs != 0) {
1588 			kmem_free(req->kr_attr,
1589 			    req->kr_nattrs * sizeof (kcpc_attr_t));
1590 		}
1591 	}
1592 
1593 	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1594 	cv_destroy(&set->ks_condv);
1595 	mutex_destroy(&set->ks_lock);
1596 	kmem_free(set, sizeof (kcpc_set_t));
1597 }
1598 
1599 /*
1600  * Grab every existing context and mark it as invalid.
1601  */
1602 void
1603 kcpc_invalidate_all(void)
1604 {
1605 	kcpc_ctx_t *ctx;
1606 	long hash;
1607 
1608 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1609 		mutex_enter(&kcpc_ctx_llock[hash]);
1610 		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1611 			KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1612 		mutex_exit(&kcpc_ctx_llock[hash]);
1613 	}
1614 }
1615 
/*
 * Interface for PCBEs to signal that an existing configuration has suddenly
 * become invalid.
 *
 * token is the kcpc_ctx_t the configuration belongs to; it must not be
 * NULL. The context is flagged KCPC_CTX_INVALID.
 */
void
kcpc_invalidate_config(void *token)
{
	kcpc_ctx_t *ctx = token;

	ASSERT(ctx != NULL);

	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
}
1629 
1630 /*
1631  * Called from lwp_exit() and thread_exit()
1632  */
1633 void
1634 kcpc_passivate(void)
1635 {
1636 	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
1637 	kcpc_set_t *set = curthread->t_cpc_set;
1638 	int	save_spl;
1639 
1640 	if (set == NULL)
1641 		return;
1642 
1643 	if (ctx == NULL) {
1644 		/*
1645 		 * This thread has a set but no context; it must be a CPU-bound
1646 		 * set. The hardware will be stopped via kcpc_unbind() when the
1647 		 * process exits and closes its file descriptors with
1648 		 * kcpc_close(). Our only job here is to clean up this thread's
1649 		 * state; the set will be freed with the unbind().
1650 		 */
1651 		(void) kcpc_unbind(set);
1652 		/*
1653 		 * Unbinding a set belonging to the current thread should clear
1654 		 * its set pointer.
1655 		 */
1656 		ASSERT(curthread->t_cpc_set == NULL);
1657 		return;
1658 	}
1659 
1660 	kpreempt_disable();
1661 	save_spl = spl_xcall();
1662 	curthread->t_cpc_set = NULL;
1663 
1664 	/*
1665 	 * This thread/LWP is exiting but context switches will continue to
1666 	 * happen for a bit as the exit proceeds.  Kernel preemption must be
1667 	 * disabled here to prevent a race between checking or setting the
1668 	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
1669 	 * a context switch.
1670 	 */
1671 	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
1672 		kcpc_unprogram(ctx, B_TRUE);
1673 		KCPC_CTX_FLAG_SET(ctx,
1674 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
1675 	}
1676 
1677 	/*
1678 	 * We're cleaning up after this thread; ensure there are no dangling
1679 	 * CPC pointers left behind. The context and set will be freed by
1680 	 * freectx().
1681 	 */
1682 	curthread->t_cpc_ctx = NULL;
1683 
1684 	splx(save_spl);
1685 	kpreempt_enable();
1686 }
1687 
1688 /*
1689  * Assign the requests in the given set to the PICs in the context.
1690  * Returns 0 if successful, -1 on failure.
1691  */
1692 /*ARGSUSED*/
1693 int
1694 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1695 {
1696 	int i;
1697 	int *picnum_save;
1698 
1699 	ASSERT(set->ks_nreqs <= cpc_ncounters);
1700 
1701 	/*
1702 	 * Provide kcpc_tryassign() with scratch space to avoid doing an
1703 	 * alloc/free with every invocation.
1704 	 */
1705 	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1706 	/*
1707 	 * kcpc_tryassign() blindly walks through each request in the set,
1708 	 * seeing if a counter can count its event. If yes, it assigns that
1709 	 * counter. However, that counter may have been the only capable counter
1710 	 * for _another_ request's event. The solution is to try every possible
1711 	 * request first. Note that this does not cover all solutions, as
1712 	 * that would require all unique orderings of requests, an n^n operation
1713 	 * which would be unacceptable for architectures with many counters.
1714 	 */
1715 	for (i = 0; i < set->ks_nreqs; i++)
1716 		if (kcpc_tryassign(set, i, picnum_save) == 0)
1717 			break;
1718 
1719 	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1720 	if (i == set->ks_nreqs)
1721 		return (-1);
1722 	return (0);
1723 }
1724 
1725 static int
1726 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1727 {
1728 	int		i;
1729 	int		j;
1730 	uint64_t	bitmap = 0, resmap = 0;
1731 	uint64_t	ctrmap;
1732 
1733 	/*
1734 	 * We are attempting to assign the reqs to pics, but we may fail. If we
1735 	 * fail, we need to restore the state of the requests to what it was
1736 	 * when we found it, as some reqs may have been explicitly assigned to
1737 	 * a specific PIC beforehand. We do this by snapshotting the assignments
1738 	 * now and restoring from it later if we fail.
1739 	 *
1740 	 * Also we note here which counters have already been claimed by
1741 	 * requests with explicit counter assignments.
1742 	 */
1743 	for (i = 0; i < set->ks_nreqs; i++) {
1744 		scratch[i] = set->ks_req[i].kr_picnum;
1745 		if (set->ks_req[i].kr_picnum != -1)
1746 			resmap |= (1 << set->ks_req[i].kr_picnum);
1747 	}
1748 
1749 	/*
1750 	 * Walk through requests assigning them to the first PIC that is
1751 	 * capable.
1752 	 */
1753 	i = starting_req;
1754 	do {
1755 		if (set->ks_req[i].kr_picnum != -1) {
1756 			ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
1757 			bitmap |= (1 << set->ks_req[i].kr_picnum);
1758 			if (++i == set->ks_nreqs)
1759 				i = 0;
1760 			continue;
1761 		}
1762 
1763 		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1764 		for (j = 0; j < cpc_ncounters; j++) {
1765 			if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
1766 			    (resmap & (1 << j)) == 0) {
1767 				/*
1768 				 * We can assign this counter because:
1769 				 *
1770 				 * 1. It can count the event (ctrmap)
1771 				 * 2. It hasn't been assigned yet (bitmap)
1772 				 * 3. It wasn't reserved by a request (resmap)
1773 				 */
1774 				bitmap |= (1 << j);
1775 				break;
1776 			}
1777 		}
1778 		if (j == cpc_ncounters) {
1779 			for (i = 0; i < set->ks_nreqs; i++)
1780 				set->ks_req[i].kr_picnum = scratch[i];
1781 			return (-1);
1782 		}
1783 		set->ks_req[i].kr_picnum = j;
1784 
1785 		if (++i == set->ks_nreqs)
1786 			i = 0;
1787 	} while (i != starting_req);
1788 
1789 	return (0);
1790 }
1791 
1792 kcpc_set_t *
1793 kcpc_dup_set(kcpc_set_t *set)
1794 {
1795 	kcpc_set_t	*new;
1796 	int		i;
1797 	int		j;
1798 
1799 	new = kmem_zalloc(sizeof (*new), KM_SLEEP);
1800 	new->ks_state &= ~KCPC_SET_BOUND;
1801 	new->ks_flags = set->ks_flags;
1802 	new->ks_nreqs = set->ks_nreqs;
1803 	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1804 	    KM_SLEEP);
1805 	new->ks_data = NULL;
1806 	new->ks_ctx = NULL;
1807 
1808 	for (i = 0; i < new->ks_nreqs; i++) {
1809 		new->ks_req[i].kr_config = NULL;
1810 		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1811 		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1812 		new->ks_req[i].kr_picp = NULL;
1813 		new->ks_req[i].kr_data = NULL;
1814 		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1815 		    CPC_MAX_EVENT_LEN);
1816 		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1817 		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1818 		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1819 		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1820 		    sizeof (kcpc_attr_t), KM_SLEEP);
1821 		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1822 			new->ks_req[i].kr_attr[j].ka_val =
1823 			    set->ks_req[i].kr_attr[j].ka_val;
1824 			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1825 			    set->ks_req[i].kr_attr[j].ka_name,
1826 			    CPC_MAX_ATTR_LEN);
1827 		}
1828 	}
1829 
1830 	return (new);
1831 }
1832 
/*
 * Report whether the context identified by token (a kcpc_ctx_t) has
 * KCPC_CTX_NONPRIV set. Returns the masked flag value, i.e. non-zero when
 * the flag is set and 0 otherwise.
 */
int
kcpc_allow_nonpriv(void *token)
{
	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
}
1838 
1839 void
1840 kcpc_invalidate(kthread_t *t)
1841 {
1842 	kcpc_ctx_t *ctx = t->t_cpc_ctx;
1843 
1844 	if (ctx != NULL)
1845 		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1846 }
1847 
1848 /*
1849  * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
1850  * are used to construct PCBE names, starting with the most specific,
1851  * "pcbe.first.second.third.fourth" and ending with the least specific,
1852  * "pcbe.first".
1853  *
1854  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1855  */
1856 int
1857 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1858 {
1859 	uint_t s[3];
1860 
1861 	s[0] = first;
1862 	s[1] = second;
1863 	s[2] = third;
1864 
1865 	return (modload_qualified("pcbe",
1866 	    "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0);
1867 }
1868 
1869 /*
1870  * Create one or more CPC context for given CPU with specified counter event
1871  * requests
1872  *
1873  * If number of requested counter events is less than or equal number of
1874  * hardware counters on a CPU and can all be assigned to the counters on a CPU
1875  * at the same time, then make one CPC context.
1876  *
1877  * Otherwise, multiple CPC contexts are created to allow multiplexing more
1878  * counter events than existing counters onto the counters by iterating through
1879  * all of the CPC contexts, programming the counters with each CPC context one
1880  * at a time and measuring the resulting counter values.  Each of the resulting
1881  * CPC contexts contains some number of requested counter events less than or
1882  * equal the number of counters on a CPU depending on whether all the counter
1883  * events can be programmed on all the counters at the same time or not.
1884  *
1885  * Flags to kmem_{,z}alloc() are passed in as an argument to allow specifying
1886  * whether memory allocation should be non-blocking or not.  The code will try
1887  * to allocate *whole* CPC contexts if possible.  If there is any memory
1888  * allocation failure during the allocations needed for a given CPC context, it
1889  * will skip allocating that CPC context because it cannot allocate the whole
1890  * thing.  Thus, the only time that it will end up allocating none (ie. no CPC
1891  * contexts whatsoever) is when it cannot even allocate *one* whole CPC context
1892  * without a memory allocation failure occurring.
1893  */
1894 int
1895 kcpc_cpu_ctx_create(cpu_t *cp, kcpc_request_list_t *req_list, int kmem_flags,
1896     kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz)
1897 {
1898 	kcpc_ctx_t	**ctx_ptrs;
1899 	int		nctx;
1900 	int		nctx_ptrs;
1901 	int		nreqs;
1902 	kcpc_request_t	*reqs;
1903 
1904 	if (cp == NULL || ctx_ptr_array == NULL || ctx_ptr_array_sz == NULL ||
1905 	    req_list == NULL || req_list->krl_cnt < 1)
1906 		return (-1);
1907 
1908 	/*
1909 	 * Allocate number of sets assuming that each set contains one and only
1910 	 * one counter event request for each counter on a CPU
1911 	 */
1912 	nreqs = req_list->krl_cnt;
1913 	nctx_ptrs = (nreqs + cpc_ncounters - 1) / cpc_ncounters;
1914 	ctx_ptrs = kmem_zalloc(nctx_ptrs * sizeof (kcpc_ctx_t *), kmem_flags);
1915 	if (ctx_ptrs == NULL)
1916 		return (-2);
1917 
1918 	/*
1919 	 * Fill in sets of requests
1920 	 */
1921 	nctx = 0;
1922 	reqs = req_list->krl_list;
1923 	while (nreqs > 0) {
1924 		kcpc_ctx_t	*ctx;
1925 		kcpc_set_t	*set;
1926 		int		subcode;
1927 
1928 		/*
1929 		 * Allocate CPC context and set for requested counter events
1930 		 */
1931 		ctx = kcpc_ctx_alloc(kmem_flags);
1932 		set = kcpc_set_create(reqs, nreqs, 0, kmem_flags);
1933 		if (set == NULL) {
1934 			kcpc_ctx_free(ctx);
1935 			break;
1936 		}
1937 
1938 		/*
1939 		 * Determine assignment of requested counter events to specific
1940 		 * counters
1941 		 */
1942 		if (kcpc_assign_reqs(set, ctx) != 0) {
1943 			/*
1944 			 * May not be able to assign requested counter events
1945 			 * to all counters since all counters may not be able
1946 			 * to do all events, so only do one counter event in
1947 			 * set of counter requests when this happens since at
1948 			 * least one of the counters must be able to do the
1949 			 * event.
1950 			 */
1951 			kcpc_free_set(set);
1952 			set = kcpc_set_create(reqs, 1, 0, kmem_flags);
1953 			if (set == NULL) {
1954 				kcpc_ctx_free(ctx);
1955 				break;
1956 			}
1957 			if (kcpc_assign_reqs(set, ctx) != 0) {
1958 #ifdef DEBUG
1959 				cmn_err(CE_NOTE, "!kcpc_cpu_ctx_create: can't "
1960 				    "assign counter event %s!\n",
1961 				    set->ks_req->kr_event);
1962 #endif
1963 				kcpc_free_set(set);
1964 				kcpc_ctx_free(ctx);
1965 				reqs++;
1966 				nreqs--;
1967 				continue;
1968 			}
1969 		}
1970 
1971 		/*
1972 		 * Allocate memory needed to hold requested counter event data
1973 		 */
1974 		set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t),
1975 		    kmem_flags);
1976 		if (set->ks_data == NULL) {
1977 			kcpc_free_set(set);
1978 			kcpc_ctx_free(ctx);
1979 			break;
1980 		}
1981 
1982 		/*
1983 		 * Configure requested counter events
1984 		 */
1985 		if (kcpc_configure_reqs(ctx, set, &subcode) != 0) {
1986 #ifdef DEBUG
1987 			cmn_err(CE_NOTE,
1988 			    "!kcpc_cpu_ctx_create: can't configure "
1989 			    "set of counter event requests!\n");
1990 #endif
1991 			reqs += set->ks_nreqs;
1992 			nreqs -= set->ks_nreqs;
1993 			kmem_free(set->ks_data,
1994 			    set->ks_nreqs * sizeof (uint64_t));
1995 			kcpc_free_set(set);
1996 			kcpc_ctx_free(ctx);
1997 			continue;
1998 		}
1999 
2000 		/*
2001 		 * Point set of counter event requests at this context and fill
2002 		 * in CPC context
2003 		 */
2004 		set->ks_ctx = ctx;
2005 		ctx->kc_set = set;
2006 		ctx->kc_cpuid = cp->cpu_id;
2007 		ctx->kc_thread = curthread;
2008 
2009 		ctx_ptrs[nctx] = ctx;
2010 
2011 		/*
2012 		 * Update requests and how many are left to be assigned to sets
2013 		 */
2014 		reqs += set->ks_nreqs;
2015 		nreqs -= set->ks_nreqs;
2016 
2017 		/*
2018 		 * Increment number of CPC contexts and allocate bigger array
2019 		 * for context pointers as needed
2020 		 */
2021 		nctx++;
2022 		if (nctx >= nctx_ptrs) {
2023 			kcpc_ctx_t	**new;
2024 			int		new_cnt;
2025 
2026 			/*
2027 			 * Allocate more CPC contexts based on how many
2028 			 * contexts allocated so far and how many counter
2029 			 * requests left to assign
2030 			 */
2031 			new_cnt = nctx_ptrs +
2032 			    ((nreqs + cpc_ncounters - 1) / cpc_ncounters);
2033 			new = kmem_zalloc(new_cnt * sizeof (kcpc_ctx_t *),
2034 			    kmem_flags);
2035 			if (new == NULL)
2036 				break;
2037 
2038 			/*
2039 			 * Copy contents of old sets into new ones
2040 			 */
2041 			bcopy(ctx_ptrs, new,
2042 			    nctx_ptrs * sizeof (kcpc_ctx_t *));
2043 
2044 			/*
2045 			 * Free old array of context pointers and use newly
2046 			 * allocated one instead now
2047 			 */
2048 			kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
2049 			ctx_ptrs = new;
2050 			nctx_ptrs = new_cnt;
2051 		}
2052 	}
2053 
2054 	/*
2055 	 * Return NULL if no CPC contexts filled in
2056 	 */
2057 	if (nctx == 0) {
2058 		kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
2059 		*ctx_ptr_array = NULL;
2060 		*ctx_ptr_array_sz = 0;
2061 		return (-2);
2062 	}
2063 
2064 	*ctx_ptr_array = ctx_ptrs;
2065 	*ctx_ptr_array_sz = nctx_ptrs * sizeof (kcpc_ctx_t *);
2066 	return (nctx);
2067 }
2068 
2069 /*
2070  * Return whether PCBE supports given counter event
2071  */
2072 boolean_t
2073 kcpc_event_supported(char *event)
2074 {
2075 	if (pcbe_ops == NULL || pcbe_ops->pcbe_event_coverage(event) == 0)
2076 		return (B_FALSE);
2077 
2078 	return (B_TRUE);
2079 }
2080 
/*
 * Program counters on current CPU with given CPC context
 *
 * If kernel is interposing on counters to measure hardware capacity and
 * utilization, then unprogram counters for kernel *before* programming them
 * with specified CPC context.
 *
 * kcpc_{program,unprogram}() may be called either directly by a thread running
 * on the target CPU or from a cross-call from another CPU. To protect
 * programming and unprogramming from being interrupted by cross-calls, callers
 * who execute kcpc_{program,unprogram} should raise PIL to the level used by
 * cross-calls.
 *
 * Arguments:
 *	ctx		CPC context to program. Its kc_cpuid must be the
 *			current CPU's id or -1.
 *	for_thread	B_TRUE clears KCPC_CTX_FREEZE on the context after
 *			programming; any other value records the context in
 *			CPU->cpu_cpc_ctx instead.
 *	cu_interpose	B_TRUE calls cu_cpc_unprogram() for the current CPU
 *			first; any other value resets every request in the
 *			context's set to its preset value first.
 *
 * On non-DEBUG kernels (where ASSERT compiles away) the function returns
 * silently, without programming anything, when ctx is NULL, ctx is bound to
 * a different CPU, or the calling thread has not disabled preemption.
 */
void
kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, boolean_t cu_interpose)
{
	/* Passed to cu_cpc_unprogram(); its value is not examined here */
	int	error;

	ASSERT(IS_HIPIL());

	/*
	 * CPC context shouldn't be NULL, its CPU field should specify current
	 * CPU or be -1 to specify any CPU when the context is bound to a
	 * thread, and preemption should be disabled
	 */
	ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
	    ctx->kc_cpuid == -1) && curthread->t_preempt > 0);
	if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
	    ctx->kc_cpuid != -1) || curthread->t_preempt < 1)
		return;

	/*
	 * Unprogram counters for kernel measuring hardware capacity and
	 * utilization
	 */
	if (cu_interpose == B_TRUE) {
		cu_cpc_unprogram(CPU, &error);
	} else {
		kcpc_set_t *set = ctx->kc_set;
		int i;

		ASSERT(set != NULL);

		/*
		 * Since cu_interpose is false, we are programming CU context.
		 * In general, PCBE can continue from the state saved in the
		 * set, but it is not very reliable, so we start again from the
		 * preset value.
		 */
		for (i = 0; i < set->ks_nreqs; i++) {
			/*
			 * Reset the virtual counter value to the preset value.
			 */
			*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;

			/*
			 * Reset PCBE to the preset value.
			 */
			pcbe_ops->pcbe_configure(0, NULL,
			    set->ks_req[i].kr_preset,
			    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
		}
	}

	/*
	 * Program counters with specified CPC context. The tick value is
	 * recorded in the context immediately before the PCBE is asked to
	 * program the hardware.
	 */
	ctx->kc_rawtick = KCPC_GET_TICK();
	pcbe_ops->pcbe_program(ctx);

	/*
	 * Denote that counters programmed for thread or CPU CPC context
	 * differently
	 */
	if (for_thread == B_TRUE)
		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
	else
		CPU->cpu_cpc_ctx = ctx;
}
2160 
/*
 * Unprogram counters with given CPC context on current CPU
 *
 * If kernel is interposing on counters to measure hardware capacity and
 * utilization, then program counters for the kernel capacity and utilization
 * *after* unprogramming them for given CPC context.
 *
 * See the comment for kcpc_program regarding the synchronization with
 * cross-calls.
 *
 * Arguments:
 *	ctx		CPC context to unprogram. Its kc_cpuid must be the
 *			current CPU's id or -1.
 *	cu_interpose	B_TRUE calls cu_cpc_program() for the current CPU
 *			once the counters have been stopped.
 *
 * The function returns without touching the hardware when the context is
 * already marked KCPC_CTX_INVALID_STOPPED. On non-DEBUG kernels (where ASSERT
 * compiles away) it also returns silently when ctx is NULL, ctx is bound to
 * a different CPU, or the calling thread has not disabled preemption.
 */
void
kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose)
{
	/* Passed to cu_cpc_program(); its value is not examined here */
	int	error;

	ASSERT(IS_HIPIL());

	/*
	 * CPC context shouldn't be NULL, its CPU field should specify current
	 * CPU or be -1 to specify any CPU when the context is bound to a
	 * thread, and preemption should be disabled
	 */
	ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
	    ctx->kc_cpuid == -1) && curthread->t_preempt > 0);

	if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
	    ctx->kc_cpuid != -1) || curthread->t_preempt < 1 ||
	    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) != 0) {
		return;
	}

	/*
	 * Specified CPC context to be unprogrammed should be bound to current
	 * CPU or thread
	 */
	ASSERT(CPU->cpu_cpc_ctx == ctx || curthread->t_cpc_ctx == ctx);

	/*
	 * Stop counters, then mark the context so that a later call for the
	 * same context takes the early return above.
	 */
	pcbe_ops->pcbe_allstop();
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);

	/*
	 * Allow kernel to interpose on counters and program them for its own
	 * use to measure hardware capacity and utilization if cu_interpose
	 * argument is true
	 */
	if (cu_interpose == B_TRUE)
		cu_cpc_program(CPU, &error);
}
2212 
2213 /*
2214  * Read CPU Performance Counter (CPC) on current CPU and call specified update
2215  * routine with data for each counter event currently programmed on CPU
2216  */
2217 int
2218 kcpc_read(kcpc_update_func_t update_func)
2219 {
2220 	kcpc_ctx_t	*ctx;
2221 	int		i;
2222 	kcpc_request_t	*req;
2223 	int		retval;
2224 	kcpc_set_t	*set;
2225 
2226 	ASSERT(IS_HIPIL());
2227 
2228 	/*
2229 	 * Can't grab locks or block because may be called inside dispatcher
2230 	 */
2231 	kpreempt_disable();
2232 
2233 	ctx = CPU->cpu_cpc_ctx;
2234 	if (ctx == NULL) {
2235 		kpreempt_enable();
2236 		return (0);
2237 	}
2238 
2239 	/*
2240 	 * Read counter data from current CPU
2241 	 */
2242 	pcbe_ops->pcbe_sample(ctx);
2243 
2244 	set = ctx->kc_set;
2245 	if (set == NULL || set->ks_req == NULL) {
2246 		kpreempt_enable();
2247 		return (0);
2248 	}
2249 
2250 	/*
2251 	 * Call update function with preset pointer and data for each CPC event
2252 	 * request currently programmed on current CPU
2253 	 */
2254 	req = set->ks_req;
2255 	retval = 0;
2256 	for (i = 0; i < set->ks_nreqs; i++) {
2257 		int	ret;
2258 
2259 		if (req[i].kr_data == NULL)
2260 			break;
2261 
2262 		ret = update_func(req[i].kr_ptr, *req[i].kr_data);
2263 		if (ret < 0)
2264 			retval = ret;
2265 	}
2266 
2267 	kpreempt_enable();
2268 
2269 	return (retval);
2270 }
2271 
2272 /*
2273  * Initialize list of counter event requests
2274  */
2275 kcpc_request_list_t *
2276 kcpc_reqs_init(int nreqs, int kmem_flags)
2277 {
2278 	kcpc_request_list_t	*req_list;
2279 	kcpc_request_t		*reqs;
2280 
2281 	if (nreqs < 1)
2282 		return (NULL);
2283 
2284 	req_list = kmem_zalloc(sizeof (kcpc_request_list_t), kmem_flags);
2285 	if (req_list == NULL)
2286 		return (NULL);
2287 
2288 	reqs = kmem_zalloc(nreqs * sizeof (kcpc_request_t), kmem_flags);
2289 	if (reqs == NULL) {
2290 		kmem_free(req_list, sizeof (kcpc_request_list_t));
2291 		return (NULL);
2292 	}
2293 
2294 	req_list->krl_list = reqs;
2295 	req_list->krl_cnt = 0;
2296 	req_list->krl_max = nreqs;
2297 	return (req_list);
2298 }
2299 
2300 
2301 /*
2302  * Add counter event request to given list of counter event requests
2303  */
2304 int
2305 kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, uint64_t preset,
2306     uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, int kmem_flags)
2307 {
2308 	kcpc_request_t	*req;
2309 
2310 	if (req_list == NULL || req_list->krl_list == NULL)
2311 		return (-1);
2312 
2313 	ASSERT(req_list->krl_max != 0);
2314 
2315 	/*
2316 	 * Allocate more space (if needed)
2317 	 */
2318 	if (req_list->krl_cnt > req_list->krl_max) {
2319 		kcpc_request_t	*new;
2320 		kcpc_request_t	*old;
2321 
2322 		old = req_list->krl_list;
2323 		new = kmem_zalloc((req_list->krl_max +
2324 		    cpc_ncounters) * sizeof (kcpc_request_t), kmem_flags);
2325 		if (new == NULL)
2326 			return (-2);
2327 
2328 		req_list->krl_list = new;
2329 		bcopy(old, req_list->krl_list,
2330 		    req_list->krl_cnt * sizeof (kcpc_request_t));
2331 		kmem_free(old, req_list->krl_max * sizeof (kcpc_request_t));
2332 		req_list->krl_cnt = 0;
2333 		req_list->krl_max += cpc_ncounters;
2334 	}
2335 
2336 	/*
2337 	 * Fill in request as much as possible now, but some fields will need
2338 	 * to be set when request is assigned to a set.
2339 	 */
2340 	req = &req_list->krl_list[req_list->krl_cnt];
2341 	req->kr_config = NULL;
2342 	req->kr_picnum = -1;	/* have CPC pick this */
2343 	req->kr_index = -1;	/* set when assigning request to set */
2344 	req->kr_data = NULL;	/* set when configuring request */
2345 	(void) strcpy(req->kr_event, event);
2346 	req->kr_preset = preset;
2347 	req->kr_flags = flags;
2348 	req->kr_nattrs = nattrs;
2349 	req->kr_attr = attr;
2350 	/*
2351 	 * Keep pointer given by caller to give to update function when this
2352 	 * counter event is sampled/read
2353 	 */
2354 	req->kr_ptr = ptr;
2355 
2356 	req_list->krl_cnt++;
2357 
2358 	return (0);
2359 }
2360 
2361 /*
2362  * Reset list of CPC event requests so its space can be used for another set
2363  * of requests
2364  */
2365 int
2366 kcpc_reqs_reset(kcpc_request_list_t *req_list)
2367 {
2368 	/*
2369 	 * Return when pointer to request list structure or request is NULL or
2370 	 * when max requests is less than or equal to 0
2371 	 */
2372 	if (req_list == NULL || req_list->krl_list == NULL ||
2373 	    req_list->krl_max <= 0)
2374 		return (-1);
2375 
2376 	/*
2377 	 * Zero out requests and number of requests used
2378 	 */
2379 	bzero(req_list->krl_list, req_list->krl_max * sizeof (kcpc_request_t));
2380 	req_list->krl_cnt = 0;
2381 	return (0);
2382 }
2383 
2384 /*
2385  * Free given list of counter event requests
2386  */
2387 int
2388 kcpc_reqs_fini(kcpc_request_list_t *req_list)
2389 {
2390 	kmem_free(req_list->krl_list,
2391 	    req_list->krl_max * sizeof (kcpc_request_t));
2392 	kmem_free(req_list, sizeof (kcpc_request_list_t));
2393 	return (0);
2394 }
2395 
2396 /*
2397  * Create set of given counter event requests
2398  */
2399 static kcpc_set_t *
2400 kcpc_set_create(kcpc_request_t *reqs, int nreqs, int set_flags, int kmem_flags)
2401 {
2402 	int		i;
2403 	kcpc_set_t	*set;
2404 
2405 	/*
2406 	 * Allocate set and assign number of requests in set and flags
2407 	 */
2408 	set = kmem_zalloc(sizeof (kcpc_set_t), kmem_flags);
2409 	if (set == NULL)
2410 		return (NULL);
2411 
2412 	if (nreqs < cpc_ncounters)
2413 		set->ks_nreqs = nreqs;
2414 	else
2415 		set->ks_nreqs = cpc_ncounters;
2416 
2417 	set->ks_flags = set_flags;
2418 
2419 	/*
2420 	 * Allocate requests needed, copy requests into set, and set index into
2421 	 * data for each request (which may change when we assign requested
2422 	 * counter events to counters)
2423 	 */
2424 	set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
2425 	    set->ks_nreqs, kmem_flags);
2426 	if (set->ks_req == NULL) {
2427 		kmem_free(set, sizeof (kcpc_set_t));
2428 		return (NULL);
2429 	}
2430 
2431 	bcopy(reqs, set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
2432 
2433 	for (i = 0; i < set->ks_nreqs; i++)
2434 		set->ks_req[i].kr_index = i;
2435 
2436 	return (set);
2437 }
2438 
2439 
2440 /*
2441  * Stop counters on current CPU.
2442  *
2443  * If preserve_context is true, the caller is interested in the CPU's CPC
2444  * context and wants it to be preserved.
2445  *
2446  * If preserve_context is false, the caller does not need the CPU's CPC context
2447  * to be preserved, so it is set to NULL.
2448  */
2449 static void
2450 kcpc_cpustop_func(uintptr_t arg1, uintptr_t arg2 __unused)
2451 {
2452 	boolean_t preserve_context;
2453 	kpreempt_disable();
2454 
2455 	preserve_context = (boolean_t)arg1;
2456 	/*
2457 	 * Someone already stopped this context before us, so there is nothing
2458 	 * to do.
2459 	 */
2460 	if (CPU->cpu_cpc_ctx == NULL) {
2461 		kpreempt_enable();
2462 		return;
2463 	}
2464 
2465 	kcpc_unprogram(CPU->cpu_cpc_ctx, B_TRUE);
2466 	/*
2467 	 * If CU does not use counters, then clear the CPU's CPC context
2468 	 * If the caller requested to preserve context it should disable CU
2469 	 * first, so there should be no CU context now.
2470 	 */
2471 	ASSERT(!preserve_context || !CU_CPC_ON(CPU));
2472 	if (!preserve_context && CPU->cpu_cpc_ctx != NULL && !CU_CPC_ON(CPU))
2473 		CPU->cpu_cpc_ctx = NULL;
2474 
2475 	kpreempt_enable();
2476 }
2477 
2478 /*
2479  * Stop counters on given CPU and set its CPC context to NULL unless
2480  * preserve_context is true.
2481  */
2482 void
2483 kcpc_cpu_stop(cpu_t *cp, boolean_t preserve_context)
2484 {
2485 	cpu_call(cp, kcpc_cpustop_func, preserve_context, 0);
2486 }
2487 
2488 /*
2489  * Program the context on the current CPU
2490  */
2491 static void
2492 kcpc_remoteprogram_func(uintptr_t arg1, uintptr_t arg2)
2493 {
2494 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)arg1;
2495 	boolean_t for_thread = (boolean_t)arg2;
2496 
2497 	ASSERT(ctx != NULL);
2498 
2499 	kpreempt_disable();
2500 	kcpc_program(ctx, for_thread, B_TRUE);
2501 	kpreempt_enable();
2502 }
2503 
2504 /*
2505  * Program counters on given CPU
2506  */
2507 void
2508 kcpc_cpu_program(cpu_t *cp, kcpc_ctx_t *ctx)
2509 {
2510 	cpu_call(cp, kcpc_remoteprogram_func, (uintptr_t)ctx,
2511 	    (uintptr_t)B_FALSE);
2512 }
2513 
/*
 * Return the string produced by the loaded PCBE's pcbe_list_attrs() op.
 *
 * A PCBE must be loaded; this is only checked by ASSERT.
 */
char *
kcpc_list_attrs(void)
{
	ASSERT(pcbe_ops != NULL);

	return (pcbe_ops->pcbe_list_attrs());
}
2521 
/*
 * Return the string produced by the loaded PCBE's pcbe_list_events() op for
 * the given pic.
 *
 * A PCBE must be loaded; this is only checked by ASSERT.
 */
char *
kcpc_list_events(uint_t pic)
{
	ASSERT(pcbe_ops != NULL);

	return (pcbe_ops->pcbe_list_events(pic));
}
2529 
/*
 * Return the pcbe_caps value of the loaded PCBE.
 *
 * A PCBE must be loaded; this is only checked by ASSERT.
 */
uint_t
kcpc_pcbe_capabilities(void)
{
	ASSERT(pcbe_ops != NULL);

	return (pcbe_ops->pcbe_caps);
}
2537 
2538 int
2539 kcpc_pcbe_loaded(void)
2540 {
2541 	return (pcbe_ops == NULL ? -1 : 0);
2542 }
2543