xref: /illumos-gate/usr/src/uts/common/io/cpc.c (revision a6e6969cf9cfe2070eae4cd6071f76b0fa4f539f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * CPU Performance Counter system calls and device driver.
30  *
31  * This module uses a combination of thread context operators, and
32  * thread-specific data to export CPU performance counters
33  * via both a system call and a driver interface.
34  *
35  * There are three access methods exported - the 'shared' device
36  * and the 'private' and 'agent' variants of the system call.
37  *
38  * The shared device treats the performance counter registers as
39  * a processor metric, regardless of the work scheduled on them.
40  * The private system call treats the performance counter registers
41  * as a property of a single lwp.  This is achieved by using the
42  * thread context operators to virtualize the contents of the
43  * performance counter registers between lwps.
44  *
45  * The agent method is like the private method, except that it must
46  * be accessed via /proc's agent lwp to allow the counter context of
47  * other threads to be examined safely.
48  *
49  * The shared usage fundamentally conflicts with the agent and private usage;
50  * almost all of the complexity of the module is needed to allow these two
51  * models to co-exist in a reasonable way.
52  */
53 
54 #include <sys/types.h>
55 #include <sys/file.h>
56 #include <sys/errno.h>
57 #include <sys/open.h>
58 #include <sys/cred.h>
59 #include <sys/conf.h>
60 #include <sys/stat.h>
61 #include <sys/processor.h>
62 #include <sys/cpuvar.h>
63 #include <sys/disp.h>
64 #include <sys/kmem.h>
65 #include <sys/modctl.h>
66 #include <sys/ddi.h>
67 #include <sys/sunddi.h>
68 #include <sys/nvpair.h>
69 #include <sys/policy.h>
70 #include <sys/machsystm.h>
71 #include <sys/cpc_impl.h>
72 #include <sys/cpc_pcbe.h>
73 #include <sys/kcpc.h>
74 
75 static int kcpc_copyin_set(kcpc_set_t **set, void *ubuf, size_t len);
76 static int kcpc_verify_set(kcpc_set_t *set);
77 static uint32_t kcpc_nvlist_npairs(nvlist_t *list);
78 
79 /*
80  * Generic attributes supported regardless of processor.
81  */
82 
83 #define	ATTRLIST "picnum"
84 #define	SEPARATOR ","
85 
86 /*
87  * System call to access CPU performance counters.
88  */
89 static int
90 cpc(int cmd, id_t lwpid, void *udata1, void *udata2, void *udata3)
91 {
92 	kthread_t	*t;
93 	int		error;
94 	int		size;
95 	const char	*str;
96 	int		code;
97 
98 	/*
99 	 * This CPC syscall should only be loaded if it found a PCBE to use.
100 	 */
101 	ASSERT(pcbe_ops != NULL);
102 
103 	if (curproc->p_agenttp == curthread) {
104 		/*
105 		 * Only if /proc is invoking this system call from
106 		 * the agent thread do we allow the caller to examine
107 		 * the contexts of other lwps in the process.  And
108 		 * because we know we're the agent, we know we don't
109 		 * have to grab p_lock because no-one else can change
110 		 * the state of the process.
111 		 */
112 		if ((t = idtot(curproc, lwpid)) == NULL || t == curthread)
113 			return (set_errno(ESRCH));
114 		ASSERT(t->t_tid == lwpid && ttolwp(t) != NULL);
115 	} else
116 		t = curthread;
117 
118 	if (t->t_cpc_set == NULL && (cmd == CPC_SAMPLE || cmd == CPC_RELE))
119 		return (set_errno(EINVAL));
120 
121 	switch (cmd) {
122 	case CPC_BIND:
123 		/*
124 		 * udata1 = pointer to packed nvlist buffer
125 		 * udata2 = size of packed nvlist buffer
126 		 * udata3 = User addr to return error subcode in.
127 		 */
128 
129 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
130 		if (kcpc_cpuctx) {
131 			rw_exit(&kcpc_cpuctx_lock);
132 			return (set_errno(EAGAIN));
133 		}
134 
135 		if (kcpc_hw_lwp_hook() != 0) {
136 			rw_exit(&kcpc_cpuctx_lock);
137 			return (set_errno(EACCES));
138 		}
139 
140 		/*
141 		 * An LWP may only have one set bound to it at a time; if there
142 		 * is a set bound to this LWP already, we unbind it here.
143 		 */
144 		if (t->t_cpc_set != NULL)
145 			(void) kcpc_unbind(t->t_cpc_set);
146 		ASSERT(t->t_cpc_set == NULL);
147 
148 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
149 		    (size_t)udata2)) != 0) {
150 			rw_exit(&kcpc_cpuctx_lock);
151 			return (set_errno(error));
152 		}
153 
154 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
155 			rw_exit(&kcpc_cpuctx_lock);
156 			kcpc_free_set(t->t_cpc_set);
157 			t->t_cpc_set = NULL;
158 			if (copyout(&error, udata3, sizeof (error)) == -1)
159 				return (set_errno(EFAULT));
160 			return (set_errno(EINVAL));
161 		}
162 
163 		if ((error = kcpc_bind_thread(t->t_cpc_set, t, &code)) != 0) {
164 			rw_exit(&kcpc_cpuctx_lock);
165 			kcpc_free_set(t->t_cpc_set);
166 			t->t_cpc_set = NULL;
167 			/*
168 			 * EINVAL and EACCES are the only errors with more
169 			 * specific subcodes.
170 			 */
171 			if ((error == EINVAL || error == EACCES) &&
172 			    copyout(&code, udata3, sizeof (code)) == -1)
173 				return (set_errno(EFAULT));
174 			return (set_errno(error));
175 		}
176 
177 		rw_exit(&kcpc_cpuctx_lock);
178 		return (0);
179 	case CPC_SAMPLE:
180 		/*
181 		 * udata1 = pointer to user's buffer
182 		 * udata2 = pointer to user's hrtime
183 		 * udata3 = pointer to user's tick
184 		 */
185 		/*
186 		 * We only allow thread-bound sets to be sampled via the
187 		 * syscall, so if this set has a CPU-bound context, return an
188 		 * error.
189 		 */
190 		if (t->t_cpc_set->ks_ctx->kc_cpuid != -1)
191 			return (set_errno(EINVAL));
192 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
193 		    udata3)) != 0)
194 			return (set_errno(error));
195 
196 		return (0);
197 	case CPC_PRESET:
198 	case CPC_RESTART:
199 		/*
200 		 * These are valid only if this lwp has a bound set.
201 		 */
202 		if (t->t_cpc_set == NULL)
203 			return (set_errno(EINVAL));
204 		if (cmd == CPC_PRESET) {
205 			/*
206 			 * The preset is shipped up to us from userland in two
207 			 * parts. This lets us handle 64-bit values from 32-bit
208 			 * and 64-bit applications in the same manner.
209 			 *
210 			 * udata1 = index of request to preset
211 			 * udata2 = new 64-bit preset (most sig. 32 bits)
212 			 * udata3 = new 64-bit preset (least sig. 32 bits)
213 			 */
214 			if ((error = kcpc_preset(t->t_cpc_set, (intptr_t)udata1,
215 			    ((uint64_t)(uintptr_t)udata2 << 32ULL) |
216 			    (uint64_t)(uintptr_t)udata3)) != 0)
217 				return (set_errno(error));
218 		} else {
219 			/*
220 			 * udata[1-3] = unused
221 			 */
222 			if ((error = kcpc_restart(t->t_cpc_set)) != 0)
223 				return (set_errno(error));
224 		}
225 		return (0);
226 	case CPC_ENABLE:
227 	case CPC_DISABLE:
228 		udata1 = 0;
229 		/*FALLTHROUGH*/
230 	case CPC_USR_EVENTS:
231 	case CPC_SYS_EVENTS:
232 		if (t != curthread || t->t_cpc_set == NULL)
233 			return (set_errno(EINVAL));
234 		/*
235 		 * Provided for backwards compatibility with CPCv1.
236 		 *
237 		 * Stop the counters and record the current counts. Use the
238 		 * counts as the preset to rebind a new set with the requests
239 		 * reconfigured as requested.
240 		 *
241 		 * udata1: 1 == enable; 0 == disable
242 		 * udata{2,3}: unused
243 		 */
244 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
245 		if ((error = kcpc_enable(t,
246 		    cmd, (int)(uintptr_t)udata1)) != 0) {
247 			rw_exit(&kcpc_cpuctx_lock);
248 			return (set_errno(error));
249 		}
250 		rw_exit(&kcpc_cpuctx_lock);
251 		return (0);
252 	case CPC_NPIC:
253 		return (cpc_ncounters);
254 	case CPC_CAPS:
255 		return (pcbe_ops->pcbe_caps);
256 	case CPC_EVLIST_SIZE:
257 	case CPC_LIST_EVENTS:
258 		/*
259 		 * udata1 = pointer to user's int or buffer
260 		 * udata2 = picnum
261 		 * udata3 = unused
262 		 */
263 		if ((uintptr_t)udata2 >= cpc_ncounters)
264 			return (set_errno(EINVAL));
265 
266 		size = strlen(
267 		    pcbe_ops->pcbe_list_events((uintptr_t)udata2)) + 1;
268 
269 		if (cmd == CPC_EVLIST_SIZE) {
270 			if (suword32(udata1, size) == -1)
271 				return (set_errno(EFAULT));
272 		} else {
273 			if (copyout(
274 			    pcbe_ops->pcbe_list_events((uintptr_t)udata2),
275 			    udata1, size) == -1)
276 				return (set_errno(EFAULT));
277 		}
278 		return (0);
279 	case CPC_ATTRLIST_SIZE:
280 	case CPC_LIST_ATTRS:
281 		/*
282 		 * udata1 = pointer to user's int or buffer
283 		 * udata2 = unused
284 		 * udata3 = unused
285 		 *
286 		 * attrlist size is length of PCBE-supported attributes, plus
287 		 * room for "picnum\0" plus an optional ',' separator char.
288 		 */
289 		str = pcbe_ops->pcbe_list_attrs();
290 		size = strlen(str) + sizeof (SEPARATOR ATTRLIST) + 1;
291 		if (str[0] != '\0')
292 			/*
293 			 * A ',' separator character is necessary.
294 			 */
295 			size += 1;
296 
297 		if (cmd == CPC_ATTRLIST_SIZE) {
298 			if (suword32(udata1, size) == -1)
299 				return (set_errno(EFAULT));
300 		} else {
301 			/*
302 			 * Copyout the PCBE attributes, and then append the
303 			 * generic attribute list (with separator if necessary).
304 			 */
305 			if (copyout(str, udata1, strlen(str)) == -1)
306 				return (set_errno(EFAULT));
307 			if (str[0] != '\0') {
308 				if (copyout(SEPARATOR ATTRLIST,
309 				    ((char *)udata1) + strlen(str),
310 				    strlen(SEPARATOR ATTRLIST) + 1)
311 				    == -1)
312 					return (set_errno(EFAULT));
313 			} else
314 				if (copyout(ATTRLIST,
315 				    (char *)udata1 + strlen(str),
316 				    strlen(ATTRLIST) + 1) == -1)
317 					return (set_errno(EFAULT));
318 		}
319 		return (0);
320 	case CPC_IMPL_NAME:
321 	case CPC_CPUREF:
322 		/*
323 		 * udata1 = pointer to user's buffer
324 		 * udata2 = unused
325 		 * udata3 = unused
326 		 */
327 		if (cmd == CPC_IMPL_NAME) {
328 			str = pcbe_ops->pcbe_impl_name();
329 			ASSERT(strlen(str) < CPC_MAX_IMPL_NAME);
330 		} else {
331 			str = pcbe_ops->pcbe_cpuref();
332 			ASSERT(strlen(str) < CPC_MAX_CPUREF);
333 		}
334 
335 		if (copyout(str, udata1, strlen(str) + 1) != 0)
336 			return (set_errno(EFAULT));
337 		return (0);
338 	case CPC_INVALIDATE:
339 		kcpc_invalidate(t);
340 		return (0);
341 	case CPC_RELE:
342 		if ((error = kcpc_unbind(t->t_cpc_set)) != 0)
343 			return (set_errno(error));
344 		return (0);
345 	default:
346 		return (set_errno(EINVAL));
347 	}
348 }
349 
350 /*
351  * The 'shared' device allows direct access to the
352  * performance counter control register of the current CPU.
353  * The major difference between the contexts created here and those
354  * above is that the context handlers are -not- installed, thus
355  * no context switching behaviour occurs.
356  *
357  * Because they manipulate per-cpu state, these ioctls can
358  * only be invoked from a bound lwp, by a caller with the cpc_cpu privilege
359  * who can open the relevant entry in /devices (the act of holding it open
360  * causes other uses of the counters to be suspended).
361  *
362  * Note that for correct results, the caller -must- ensure that
363  * all existing per-lwp contexts are either inactive or marked invalid;
364  * that's what the open routine does.
365  */
366 /*ARGSUSED*/
367 static int
368 kcpc_ioctl(dev_t dev, int cmd, intptr_t data, int flags, cred_t *cr, int *rvp)
369 {
370 	kthread_t	*t = curthread;
371 	processorid_t	cpuid;
372 	void		*udata1 = NULL;
373 	void		*udata2 = NULL;
374 	void		*udata3 = NULL;
375 	int		error;
376 	int		code;
377 
378 	STRUCT_DECL(__cpc_args, args);
379 
380 	STRUCT_INIT(args, flags);
381 
382 	if (curthread->t_bind_cpu != getminor(dev))
383 		return (EAGAIN);  /* someone unbound it? */
384 
385 	cpuid = getminor(dev);
386 
387 	if (cmd == CPCIO_BIND || cmd == CPCIO_SAMPLE) {
388 		if (copyin((void *)data, STRUCT_BUF(args),
389 		    STRUCT_SIZE(args)) == -1)
390 			return (EFAULT);
391 
392 		udata1 = STRUCT_FGETP(args, udata1);
393 		udata2 = STRUCT_FGETP(args, udata2);
394 		udata3 = STRUCT_FGETP(args, udata3);
395 	}
396 
397 	switch (cmd) {
398 	case CPCIO_BIND:
399 		/*
400 		 * udata1 = pointer to packed nvlist buffer
401 		 * udata2 = size of packed nvlist buffer
402 		 * udata3 = User addr to return error subcode in.
403 		 */
404 		if (t->t_cpc_set != NULL) {
405 			(void) kcpc_unbind(t->t_cpc_set);
406 			ASSERT(t->t_cpc_set == NULL);
407 		}
408 
409 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
410 		    (size_t)udata2)) != 0) {
411 			return (error);
412 		}
413 
414 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
415 			kcpc_free_set(t->t_cpc_set);
416 			t->t_cpc_set = NULL;
417 			if (copyout(&error, udata3, sizeof (error)) == -1)
418 				return (EFAULT);
419 			return (EINVAL);
420 		}
421 
422 		if ((error = kcpc_bind_cpu(t->t_cpc_set, cpuid, &code)) != 0) {
423 			kcpc_free_set(t->t_cpc_set);
424 			t->t_cpc_set = NULL;
425 			/*
426 			 * Subcodes are only returned for EINVAL and EACCESS.
427 			 */
428 			if ((error == EINVAL || error == EACCES) &&
429 			    copyout(&code, udata3, sizeof (code)) == -1)
430 				return (EFAULT);
431 			return (error);
432 		}
433 
434 		return (0);
435 	case CPCIO_SAMPLE:
436 		/*
437 		 * udata1 = pointer to user's buffer
438 		 * udata2 = pointer to user's hrtime
439 		 * udata3 = pointer to user's tick
440 		 */
441 		/*
442 		 * Only CPU-bound sets may be sampled via the ioctl(). If this
443 		 * set has no CPU-bound context, return an error.
444 		 */
445 		if (t->t_cpc_set == NULL)
446 			return (EINVAL);
447 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
448 		    udata3)) != 0)
449 			return (error);
450 		return (0);
451 	case CPCIO_RELE:
452 		if (t->t_cpc_set == NULL)
453 			return (EINVAL);
454 		return (kcpc_unbind(t->t_cpc_set));
455 	default:
456 		return (EINVAL);
457 	}
458 }
459 
460 /*
461  * The device supports multiple opens, but only one open
462  * is allowed per processor.  This is to enable multiple
463  * instances of tools looking at different processors.
464  */
465 #define	KCPC_MINOR_SHARED		((minor_t)0x3fffful)
466 
467 static ulong_t *kcpc_cpumap;		/* bitmap of cpus */
468 
469 /*ARGSUSED1*/
470 static int
471 kcpc_open(dev_t *dev, int flags, int otyp, cred_t *cr)
472 {
473 	processorid_t	cpuid;
474 	int		error;
475 
476 	ASSERT(pcbe_ops != NULL);
477 
478 	if ((error = secpolicy_cpc_cpu(cr)) != 0)
479 		return (error);
480 	if (getminor(*dev) != KCPC_MINOR_SHARED)
481 		return (ENXIO);
482 	if ((cpuid = curthread->t_bind_cpu) == PBIND_NONE)
483 		return (EINVAL);
484 	if (cpuid > max_cpuid)
485 		return (EINVAL);
486 
487 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
488 	if (++kcpc_cpuctx == 1) {
489 		ASSERT(kcpc_cpumap == NULL);
490 		kcpc_cpumap = kmem_zalloc(BT_SIZEOFMAP(max_cpuid + 1),
491 		    KM_SLEEP);
492 		/*
493 		 * When this device is open for processor-based contexts,
494 		 * no further lwp-based contexts can be created.
495 		 *
496 		 * Since this is the first open, ensure that all existing
497 		 * contexts are invalidated.
498 		 */
499 		kcpc_invalidate_all();
500 	} else if (BT_TEST(kcpc_cpumap, cpuid)) {
501 		kcpc_cpuctx--;
502 		rw_exit(&kcpc_cpuctx_lock);
503 		return (EAGAIN);
504 	} else if (kcpc_hw_cpu_hook(cpuid, kcpc_cpumap) != 0) {
505 		kcpc_cpuctx--;
506 		rw_exit(&kcpc_cpuctx_lock);
507 		return (EACCES);
508 	}
509 	BT_SET(kcpc_cpumap, cpuid);
510 	rw_exit(&kcpc_cpuctx_lock);
511 
512 	*dev = makedevice(getmajor(*dev), (minor_t)cpuid);
513 
514 	return (0);
515 }
516 
517 /*ARGSUSED1*/
518 static int
519 kcpc_close(dev_t dev, int flags, int otyp, cred_t *cr)
520 {
521 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
522 	BT_CLEAR(kcpc_cpumap, getminor(dev));
523 	if (--kcpc_cpuctx == 0) {
524 		kmem_free(kcpc_cpumap, BT_SIZEOFMAP(max_cpuid + 1));
525 		kcpc_cpumap = NULL;
526 	}
527 	ASSERT(kcpc_cpuctx >= 0);
528 	rw_exit(&kcpc_cpuctx_lock);
529 
530 	return (0);
531 }
532 
533 /*
534  * Sane boundaries on the size of packed lists. In bytes.
535  */
536 #define	CPC_MIN_PACKSIZE 4
537 #define	CPC_MAX_PACKSIZE 10000
538 
539 /*
540  * Sane boundary on the number of requests a set can contain.
541  */
542 #define	CPC_MAX_NREQS 100
543 
544 /*
545  * Sane boundary on the number of attributes a request can contain.
546  */
547 #define	CPC_MAX_ATTRS 50
548 
549 /*
550  * Copy in a packed nvlist from the user and create a request set out of it.
551  * If successful, return 0 and store a pointer to the set we've created. Returns
552  * error code on error.
553  */
554 int
555 kcpc_copyin_set(kcpc_set_t **inset, void *ubuf, size_t len)
556 {
557 	kcpc_set_t	*set;
558 	int		i;
559 	int		j;
560 	char		*packbuf;
561 
562 	nvlist_t	*nvl;
563 	nvpair_t	*nvp = NULL;
564 
565 	nvlist_t	*attrs;
566 	nvpair_t	*nvp_attr;
567 	kcpc_attr_t	*attrp;
568 
569 	nvlist_t	**reqlist;
570 	uint_t		nreqs;
571 	uint64_t	uint64;
572 	uint32_t	uint32;
573 	uint32_t	setflags = (uint32_t)-1;
574 	char		*string;
575 	char		*name;
576 
577 	if (len < CPC_MIN_PACKSIZE || len > CPC_MAX_PACKSIZE)
578 		return (EINVAL);
579 
580 	packbuf = kmem_alloc(len, KM_SLEEP);
581 
582 	if (copyin(ubuf, packbuf, len) == -1) {
583 		kmem_free(packbuf, len);
584 		return (EFAULT);
585 	}
586 
587 	if (nvlist_unpack(packbuf, len, &nvl, KM_SLEEP) != 0) {
588 		kmem_free(packbuf, len);
589 		return (EINVAL);
590 	}
591 
592 	/*
593 	 * The nvlist has been unpacked so there is no need for the packed
594 	 * representation from this point on.
595 	 */
596 	kmem_free(packbuf, len);
597 
598 	i = 0;
599 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
600 		switch (nvpair_type(nvp)) {
601 		case DATA_TYPE_UINT32:
602 			if (strcmp(nvpair_name(nvp), "flags") != 0 ||
603 			    nvpair_value_uint32(nvp, &setflags) != 0) {
604 				nvlist_free(nvl);
605 				return (EINVAL);
606 			}
607 			break;
608 		case DATA_TYPE_NVLIST_ARRAY:
609 			if (strcmp(nvpair_name(nvp), "reqs") != 0 ||
610 			    nvpair_value_nvlist_array(nvp, &reqlist,
611 				&nreqs) != 0) {
612 				nvlist_free(nvl);
613 				return (EINVAL);
614 			}
615 			break;
616 		default:
617 			nvlist_free(nvl);
618 			return (EINVAL);
619 		}
620 		i++;
621 	}
622 
623 	/*
624 	 * There should be two members in the top-level nvlist:
625 	 * an array of nvlists consisting of the requests, and flags.
626 	 * Anything else is an invalid set.
627 	 */
628 	if (i != 2) {
629 		nvlist_free(nvl);
630 		return (EINVAL);
631 	}
632 
633 	if (nreqs > CPC_MAX_NREQS) {
634 		nvlist_free(nvl);
635 		return (EINVAL);
636 	}
637 
638 	/*
639 	 * The requests are now stored in the nvlist array at reqlist.
640 	 * Note that the use of kmem_zalloc() to alloc the kcpc_set_t means
641 	 * we don't need to call the init routines for ks_lock and ks_condv.
642 	 */
643 	set = kmem_zalloc(sizeof (kcpc_set_t), KM_SLEEP);
644 	set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
645 	    nreqs, KM_SLEEP);
646 	set->ks_nreqs = nreqs;
647 	/*
648 	 * If the nvlist didn't contain a flags member, setflags was initialized
649 	 * with an illegal value and this set will fail sanity checks later on.
650 	 */
651 	set->ks_flags = setflags;
652 	/*
653 	 * Initialize bind/unbind set synchronization.
654 	 */
655 	set->ks_state &= ~KCPC_SET_BOUND;
656 
657 	/*
658 	 * Build the set up one request at a time, always keeping it self-
659 	 * consistent so we can give it to kcpc_free_set() if we need to back
660 	 * out and return and error.
661 	 */
662 	for (i = 0; i < nreqs; i++) {
663 		nvp = NULL;
664 		set->ks_req[i].kr_picnum = -1;
665 		while ((nvp = nvlist_next_nvpair(reqlist[i], nvp)) != NULL) {
666 			name = nvpair_name(nvp);
667 			switch (nvpair_type(nvp)) {
668 			case DATA_TYPE_UINT32:
669 				if (nvpair_value_uint32(nvp, &uint32) == EINVAL)
670 					goto inval;
671 				if (strcmp(name, "cr_flags") == 0)
672 					set->ks_req[i].kr_flags = uint32;
673 				if (strcmp(name, "cr_index") == 0)
674 					set->ks_req[i].kr_index = uint32;
675 				break;
676 			case DATA_TYPE_UINT64:
677 				if (nvpair_value_uint64(nvp, &uint64) == EINVAL)
678 					goto inval;
679 				if (strcmp(name, "cr_preset") == 0)
680 					set->ks_req[i].kr_preset = uint64;
681 				break;
682 			case DATA_TYPE_STRING:
683 				if (nvpair_value_string(nvp, &string) == EINVAL)
684 					goto inval;
685 				if (strcmp(name, "cr_event") == 0)
686 					(void) strncpy(set->ks_req[i].kr_event,
687 					    string, CPC_MAX_EVENT_LEN);
688 				break;
689 			case DATA_TYPE_NVLIST:
690 				if (strcmp(name, "cr_attr") != 0)
691 					goto inval;
692 				if (nvpair_value_nvlist(nvp, &attrs) == EINVAL)
693 					goto inval;
694 				nvp_attr = NULL;
695 				/*
696 				 * If the picnum has been specified as an
697 				 * attribute, consume that attribute here and
698 				 * remove it from the list of attributes.
699 				 */
700 				if (nvlist_lookup_uint64(attrs, "picnum",
701 				    &uint64) == 0) {
702 					if (nvlist_remove(attrs, "picnum",
703 					    DATA_TYPE_UINT64) != 0)
704 						panic("nvlist %p faulty",
705 						    attrs);
706 					set->ks_req[i].kr_picnum = uint64;
707 				}
708 
709 				if ((set->ks_req[i].kr_nattrs =
710 				    kcpc_nvlist_npairs(attrs)) == 0)
711 					break;
712 
713 				if (set->ks_req[i].kr_nattrs > CPC_MAX_ATTRS)
714 					goto inval;
715 
716 				set->ks_req[i].kr_attr =
717 				    kmem_alloc(set->ks_req[i].kr_nattrs *
718 				    sizeof (kcpc_attr_t), KM_SLEEP);
719 				j = 0;
720 
721 				while ((nvp_attr = nvlist_next_nvpair(attrs,
722 				    nvp_attr)) != NULL) {
723 					attrp = &set->ks_req[i].kr_attr[j];
724 
725 					if (nvpair_type(nvp_attr) !=
726 					    DATA_TYPE_UINT64)
727 						goto inval;
728 
729 					(void) strncpy(attrp->ka_name,
730 					    nvpair_name(nvp_attr),
731 					    CPC_MAX_ATTR_LEN);
732 
733 					if (nvpair_value_uint64(nvp_attr,
734 					    &(attrp->ka_val)) == EINVAL)
735 						goto inval;
736 					j++;
737 				}
738 				ASSERT(j == set->ks_req[i].kr_nattrs);
739 			default:
740 				break;
741 			}
742 		}
743 	}
744 
745 	nvlist_free(nvl);
746 	*inset = set;
747 	return (0);
748 
749 inval:
750 	nvlist_free(nvl);
751 	kcpc_free_set(set);
752 	return (EINVAL);
753 }
754 
755 /*
756  * Count the number of nvpairs in the supplied nvlist.
757  */
758 static uint32_t
759 kcpc_nvlist_npairs(nvlist_t *list)
760 {
761 	nvpair_t *nvp = NULL;
762 	uint32_t n = 0;
763 
764 	while ((nvp = nvlist_next_nvpair(list, nvp)) != NULL)
765 		n++;
766 
767 	return (n);
768 }
769 
770 /*
771  * Performs sanity checks on the given set.
772  * Returns 0 if the set checks out OK.
773  * Returns a detailed error subcode, or -1 if there is no applicable subcode.
774  */
775 static int
776 kcpc_verify_set(kcpc_set_t *set)
777 {
778 	kcpc_request_t	*rp;
779 	int		i;
780 	uint64_t	bitmap = 0;
781 	int		n;
782 
783 	if (set->ks_nreqs > cpc_ncounters)
784 		return (-1);
785 
786 	if (CPC_SET_VALID_FLAGS(set->ks_flags) == 0)
787 		return (-1);
788 
789 	for (i = 0; i < set->ks_nreqs; i++) {
790 		rp = &set->ks_req[i];
791 
792 		/*
793 		 * The following comparison must cast cpc_ncounters to an int,
794 		 * because kr_picnum will be -1 if the request didn't explicitly
795 		 * choose a PIC.
796 		 */
797 		if (rp->kr_picnum >= (int)cpc_ncounters)
798 			return (CPC_INVALID_PICNUM);
799 
800 		/*
801 		 * Of the pics whose physical picnum has been specified, make
802 		 * sure each PIC appears only once in set.
803 		 */
804 		if ((n = set->ks_req[i].kr_picnum) != -1) {
805 			if ((bitmap & (1 << n)) != 0)
806 				return (-1);
807 			bitmap |= (1 << n);
808 		}
809 
810 		/*
811 		 * Make sure the requested index falls within the range of all
812 		 * requests.
813 		 */
814 		if (rp->kr_index < 0 || rp->kr_index >= set->ks_nreqs)
815 			return (-1);
816 
817 		/*
818 		 * Make sure there are no unknown flags.
819 		 */
820 		if (KCPC_REQ_VALID_FLAGS(rp->kr_flags) == 0)
821 			return (CPC_REQ_INVALID_FLAGS);
822 	}
823 
824 	return (0);
825 }
826 
/*
 * Character/block entry points for the shared device.  Only open, close and
 * ioctl are implemented by this driver; the remaining entry points are
 * filled with the nodev/nochpoll stubs.
 */
static struct cb_ops cb_ops = {
	kcpc_open,	/* open */
	kcpc_close,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	kcpc_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,	/* prop_op */
	NULL,		/* streamtab: not a STREAMS driver */
	D_NEW | D_MP	/* driver compatibility flags */
};
844 
/*
 * Probe entry point.  The argument is not examined; success is reported
 * unconditionally.
 */
/*ARGSUSED*/
static int
kcpc_probe(dev_info_t *devi)
{
	return (DDI_PROBE_SUCCESS);
}
851 
852 static dev_info_t *kcpc_devi;
853 
854 static int
855 kcpc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
856 {
857 	if (cmd != DDI_ATTACH)
858 		return (DDI_FAILURE);
859 	kcpc_devi = devi;
860 	return (ddi_create_minor_node(devi, "shared", S_IFCHR,
861 	    KCPC_MINOR_SHARED, DDI_PSEUDO, 0));
862 }
863 
864 /*ARGSUSED*/
865 static int
866 kcpc_getinfo(dev_info_t *devi, ddi_info_cmd_t cmd, void *arg, void **result)
867 {
868 	switch (cmd) {
869 	case DDI_INFO_DEVT2DEVINFO:
870 		switch (getminor((dev_t)arg)) {
871 		case KCPC_MINOR_SHARED:
872 			*result = kcpc_devi;
873 			return (DDI_SUCCESS);
874 		default:
875 			break;
876 		}
877 		break;
878 	case DDI_INFO_DEVT2INSTANCE:
879 		*result = 0;
880 		return (DDI_SUCCESS);
881 	default:
882 		break;
883 	}
884 
885 	return (DDI_FAILURE);
886 }
887 
/*
 * Device operations for the cpc pseudo driver.  Detach and reset are left
 * as nodev.
 */
static struct dev_ops dev_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* reference count */
	kcpc_getinfo,		/* getinfo */
	nulldev,		/* identify */
	kcpc_probe,		/* probe */
	kcpc_attach,		/* attach */
	nodev,			/* detach */
	nodev,			/* reset */
	&cb_ops,		/* character/block entry points */
	(struct bus_ops *)0	/* no bus operations */
};
900 
901 static struct modldrv modldrv = {
902 	&mod_driverops,
903 	"cpc sampling driver v%I%",
904 	&dev_ops
905 };
906 
/*
 * System call table entry for cpc().
 */
static struct sysent cpc_sysent = {
	5,			/* cpc() takes five arguments */
	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,	/* system call flags */
	cpc			/* handler */
};
912 
/*
 * Linkage for the system call half of this module.
 */
static struct modlsys modlsys = {
	&mod_syscallops,		/* this is a system call */
	"cpc sampling system call",	/* description */
	&cpc_sysent			/* system call table entry */
};
918 
#ifdef _SYSCALL32_IMPL
/*
 * Linkage for the 32-bit variant of the system call.  It shares the table
 * entry, and therefore the handler, with the native system call.
 */
static struct modlsys modlsys32 = {
	&mod_syscallops32,			/* this is a 32-bit syscall */
	"32-bit cpc sampling system call",	/* description */
	&cpc_sysent				/* system call table entry */
};
#endif
926 
/*
 * Module linkage: this one module provides the device driver and the
 * system call (plus its 32-bit variant where _SYSCALL32_IMPL is defined).
 */
static struct modlinkage modl = {
	MODREV_1,
	&modldrv,	/* device driver */
	&modlsys,	/* system call */
#ifdef _SYSCALL32_IMPL
	&modlsys32,	/* 32-bit system call */
#endif
};
935 
936 static void
937 kcpc_init(void)
938 {
939 	long hash;
940 
941 	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
942 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
943 		mutex_init(&kcpc_ctx_llock[hash],
944 		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
945 }
946 
947 static void
948 kcpc_fini(void)
949 {
950 	long hash;
951 
952 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
953 		mutex_destroy(&kcpc_ctx_llock[hash]);
954 	rw_destroy(&kcpc_cpuctx_lock);
955 }
956 
957 int
958 _init(void)
959 {
960 	int ret;
961 
962 	if (kcpc_hw_load_pcbe() != 0)
963 		return (ENOTSUP);
964 
965 	kcpc_init();
966 	if ((ret = mod_install(&modl)) != 0)
967 		kcpc_fini();
968 	return (ret);
969 }
970 
971 int
972 _fini(void)
973 {
974 	int ret;
975 
976 	if ((ret = mod_remove(&modl)) == 0)
977 		kcpc_fini();
978 	return (ret);
979 }
980 
/*
 * Module information entry point: hands the request to mod_info() together
 * with this module's linkage and returns its result.
 */
int
_info(struct modinfo *mi)
{
	return (mod_info(&modl, mi));
}
986