xref: /illumos-gate/usr/src/uts/common/io/cpc.c (revision 8475e04352e630e4bd0f59a283286ee2475a14ce)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * CPU Performance Counter system calls and device driver.
28  *
29  * This module uses a combination of thread context operators, and
30  * thread-specific data to export CPU performance counters
31  * via both a system call and a driver interface.
32  *
33  * There are three access methods exported - the 'shared' device
34  * and the 'private' and 'agent' variants of the system call.
35  *
36  * The shared device treats the performance counter registers as
37  * a processor metric, regardless of the work scheduled on them.
38  * The private system call treats the performance counter registers
39  * as a property of a single lwp.  This is achieved by using the
40  * thread context operators to virtualize the contents of the
41  * performance counter registers between lwps.
42  *
43  * The agent method is like the private method, except that it must
44  * be accessed via /proc's agent lwp to allow the counter context of
45  * other threads to be examined safely.
46  *
47  * The shared usage fundamentally conflicts with the agent and private usage;
48  * almost all of the complexity of the module is needed to allow these two
49  * models to co-exist in a reasonable way.
50  */
51 
52 #include <sys/types.h>
53 #include <sys/file.h>
54 #include <sys/errno.h>
55 #include <sys/open.h>
56 #include <sys/cred.h>
57 #include <sys/conf.h>
58 #include <sys/stat.h>
59 #include <sys/processor.h>
60 #include <sys/cpuvar.h>
61 #include <sys/disp.h>
62 #include <sys/kmem.h>
63 #include <sys/modctl.h>
64 #include <sys/ddi.h>
65 #include <sys/sunddi.h>
66 #include <sys/nvpair.h>
67 #include <sys/policy.h>
68 #include <sys/machsystm.h>
69 #include <sys/cpc_impl.h>
70 #include <sys/cpc_pcbe.h>
71 #include <sys/kcpc.h>
72 
73 static int kcpc_copyin_set(kcpc_set_t **set, void *ubuf, size_t len);
74 static int kcpc_verify_set(kcpc_set_t *set);
75 static uint32_t kcpc_nvlist_npairs(nvlist_t *list);
76 
77 /*
78  * Generic attributes supported regardless of processor.
79  */
80 
81 #define	ATTRLIST "picnum"
82 #define	SEPARATOR ","
83 
84 /*
85  * System call to access CPU performance counters.
86  */
87 static int
88 cpc(int cmd, id_t lwpid, void *udata1, void *udata2, void *udata3)
89 {
90 	kthread_t	*t;
91 	int		error;
92 	int		size;
93 	const char	*str;
94 	int		code;
95 
96 	/*
97 	 * This CPC syscall should only be loaded if it found a PCBE to use.
98 	 */
99 	ASSERT(pcbe_ops != NULL);
100 
101 	if (curproc->p_agenttp == curthread) {
102 		/*
103 		 * Only if /proc is invoking this system call from
104 		 * the agent thread do we allow the caller to examine
105 		 * the contexts of other lwps in the process.  And
106 		 * because we know we're the agent, we know we don't
107 		 * have to grab p_lock because no-one else can change
108 		 * the state of the process.
109 		 */
110 		if ((t = idtot(curproc, lwpid)) == NULL || t == curthread)
111 			return (set_errno(ESRCH));
112 		ASSERT(t->t_tid == lwpid && ttolwp(t) != NULL);
113 	} else
114 		t = curthread;
115 
116 	if (t->t_cpc_set == NULL && (cmd == CPC_SAMPLE || cmd == CPC_RELE))
117 		return (set_errno(EINVAL));
118 
119 	switch (cmd) {
120 	case CPC_BIND:
121 		/*
122 		 * udata1 = pointer to packed nvlist buffer
123 		 * udata2 = size of packed nvlist buffer
124 		 * udata3 = User addr to return error subcode in.
125 		 */
126 
127 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
128 		if (kcpc_cpuctx || dtrace_cpc_in_use) {
129 			rw_exit(&kcpc_cpuctx_lock);
130 			return (set_errno(EAGAIN));
131 		}
132 
133 		if (kcpc_hw_lwp_hook() != 0) {
134 			rw_exit(&kcpc_cpuctx_lock);
135 			return (set_errno(EACCES));
136 		}
137 
138 		/*
139 		 * An LWP may only have one set bound to it at a time; if there
140 		 * is a set bound to this LWP already, we unbind it here.
141 		 */
142 		if (t->t_cpc_set != NULL)
143 			(void) kcpc_unbind(t->t_cpc_set);
144 		ASSERT(t->t_cpc_set == NULL);
145 
146 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
147 		    (size_t)udata2)) != 0) {
148 			rw_exit(&kcpc_cpuctx_lock);
149 			return (set_errno(error));
150 		}
151 
152 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
153 			rw_exit(&kcpc_cpuctx_lock);
154 			kcpc_free_set(t->t_cpc_set);
155 			t->t_cpc_set = NULL;
156 			if (copyout(&error, udata3, sizeof (error)) == -1)
157 				return (set_errno(EFAULT));
158 			return (set_errno(EINVAL));
159 		}
160 
161 		if ((error = kcpc_bind_thread(t->t_cpc_set, t, &code)) != 0) {
162 			rw_exit(&kcpc_cpuctx_lock);
163 			kcpc_free_set(t->t_cpc_set);
164 			t->t_cpc_set = NULL;
165 			/*
166 			 * EINVAL and EACCES are the only errors with more
167 			 * specific subcodes.
168 			 */
169 			if ((error == EINVAL || error == EACCES) &&
170 			    copyout(&code, udata3, sizeof (code)) == -1)
171 				return (set_errno(EFAULT));
172 			return (set_errno(error));
173 		}
174 
175 		rw_exit(&kcpc_cpuctx_lock);
176 		return (0);
177 	case CPC_SAMPLE:
178 		/*
179 		 * udata1 = pointer to user's buffer
180 		 * udata2 = pointer to user's hrtime
181 		 * udata3 = pointer to user's tick
182 		 */
183 		/*
184 		 * We only allow thread-bound sets to be sampled via the
185 		 * syscall, so if this set has a CPU-bound context, return an
186 		 * error.
187 		 */
188 		if (t->t_cpc_set->ks_ctx->kc_cpuid != -1)
189 			return (set_errno(EINVAL));
190 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
191 		    udata3)) != 0)
192 			return (set_errno(error));
193 
194 		return (0);
195 	case CPC_PRESET:
196 	case CPC_RESTART:
197 		/*
198 		 * These are valid only if this lwp has a bound set.
199 		 */
200 		if (t->t_cpc_set == NULL)
201 			return (set_errno(EINVAL));
202 		if (cmd == CPC_PRESET) {
203 			/*
204 			 * The preset is shipped up to us from userland in two
205 			 * parts. This lets us handle 64-bit values from 32-bit
206 			 * and 64-bit applications in the same manner.
207 			 *
208 			 * udata1 = index of request to preset
209 			 * udata2 = new 64-bit preset (most sig. 32 bits)
210 			 * udata3 = new 64-bit preset (least sig. 32 bits)
211 			 */
212 			if ((error = kcpc_preset(t->t_cpc_set, (intptr_t)udata1,
213 			    ((uint64_t)(uintptr_t)udata2 << 32ULL) |
214 			    (uint64_t)(uintptr_t)udata3)) != 0)
215 				return (set_errno(error));
216 		} else {
217 			/*
218 			 * udata[1-3] = unused
219 			 */
220 			if ((error = kcpc_restart(t->t_cpc_set)) != 0)
221 				return (set_errno(error));
222 		}
223 		return (0);
224 	case CPC_ENABLE:
225 	case CPC_DISABLE:
226 		udata1 = 0;
227 		/*FALLTHROUGH*/
228 	case CPC_USR_EVENTS:
229 	case CPC_SYS_EVENTS:
230 		if (t != curthread || t->t_cpc_set == NULL)
231 			return (set_errno(EINVAL));
232 		/*
233 		 * Provided for backwards compatibility with CPCv1.
234 		 *
235 		 * Stop the counters and record the current counts. Use the
236 		 * counts as the preset to rebind a new set with the requests
237 		 * reconfigured as requested.
238 		 *
239 		 * udata1: 1 == enable; 0 == disable
240 		 * udata{2,3}: unused
241 		 */
242 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
243 		if ((error = kcpc_enable(t,
244 		    cmd, (int)(uintptr_t)udata1)) != 0) {
245 			rw_exit(&kcpc_cpuctx_lock);
246 			return (set_errno(error));
247 		}
248 		rw_exit(&kcpc_cpuctx_lock);
249 		return (0);
250 	case CPC_NPIC:
251 		return (cpc_ncounters);
252 	case CPC_CAPS:
253 		return (pcbe_ops->pcbe_caps);
254 	case CPC_EVLIST_SIZE:
255 	case CPC_LIST_EVENTS:
256 		/*
257 		 * udata1 = pointer to user's int or buffer
258 		 * udata2 = picnum
259 		 * udata3 = unused
260 		 */
261 		if ((uintptr_t)udata2 >= cpc_ncounters)
262 			return (set_errno(EINVAL));
263 
264 		size = strlen(
265 		    pcbe_ops->pcbe_list_events((uintptr_t)udata2)) + 1;
266 
267 		if (cmd == CPC_EVLIST_SIZE) {
268 			if (suword32(udata1, size) == -1)
269 				return (set_errno(EFAULT));
270 		} else {
271 			if (copyout(
272 			    pcbe_ops->pcbe_list_events((uintptr_t)udata2),
273 			    udata1, size) == -1)
274 				return (set_errno(EFAULT));
275 		}
276 		return (0);
277 	case CPC_ATTRLIST_SIZE:
278 	case CPC_LIST_ATTRS:
279 		/*
280 		 * udata1 = pointer to user's int or buffer
281 		 * udata2 = unused
282 		 * udata3 = unused
283 		 *
284 		 * attrlist size is length of PCBE-supported attributes, plus
285 		 * room for "picnum\0" plus an optional ',' separator char.
286 		 */
287 		str = pcbe_ops->pcbe_list_attrs();
288 		size = strlen(str) + sizeof (SEPARATOR ATTRLIST) + 1;
289 		if (str[0] != '\0')
290 			/*
291 			 * A ',' separator character is necessary.
292 			 */
293 			size += 1;
294 
295 		if (cmd == CPC_ATTRLIST_SIZE) {
296 			if (suword32(udata1, size) == -1)
297 				return (set_errno(EFAULT));
298 		} else {
299 			/*
300 			 * Copyout the PCBE attributes, and then append the
301 			 * generic attribute list (with separator if necessary).
302 			 */
303 			if (copyout(str, udata1, strlen(str)) == -1)
304 				return (set_errno(EFAULT));
305 			if (str[0] != '\0') {
306 				if (copyout(SEPARATOR ATTRLIST,
307 				    ((char *)udata1) + strlen(str),
308 				    strlen(SEPARATOR ATTRLIST) + 1)
309 				    == -1)
310 					return (set_errno(EFAULT));
311 			} else
312 				if (copyout(ATTRLIST,
313 				    (char *)udata1 + strlen(str),
314 				    strlen(ATTRLIST) + 1) == -1)
315 					return (set_errno(EFAULT));
316 		}
317 		return (0);
318 	case CPC_IMPL_NAME:
319 	case CPC_CPUREF:
320 		/*
321 		 * udata1 = pointer to user's buffer
322 		 * udata2 = unused
323 		 * udata3 = unused
324 		 */
325 		if (cmd == CPC_IMPL_NAME) {
326 			str = pcbe_ops->pcbe_impl_name();
327 			ASSERT(strlen(str) < CPC_MAX_IMPL_NAME);
328 		} else {
329 			str = pcbe_ops->pcbe_cpuref();
330 			ASSERT(strlen(str) < CPC_MAX_CPUREF);
331 		}
332 
333 		if (copyout(str, udata1, strlen(str) + 1) != 0)
334 			return (set_errno(EFAULT));
335 		return (0);
336 	case CPC_INVALIDATE:
337 		kcpc_invalidate(t);
338 		return (0);
339 	case CPC_RELE:
340 		if ((error = kcpc_unbind(t->t_cpc_set)) != 0)
341 			return (set_errno(error));
342 		return (0);
343 	default:
344 		return (set_errno(EINVAL));
345 	}
346 }
347 
348 /*
349  * The 'shared' device allows direct access to the
350  * performance counter control register of the current CPU.
351  * The major difference between the contexts created here and those
352  * above is that the context handlers are -not- installed, thus
353  * no context switching behaviour occurs.
354  *
355  * Because they manipulate per-cpu state, these ioctls can
356  * only be invoked from a bound lwp, by a caller with the cpc_cpu privilege
357  * who can open the relevant entry in /devices (the act of holding it open
358  * causes other uses of the counters to be suspended).
359  *
360  * Note that for correct results, the caller -must- ensure that
361  * all existing per-lwp contexts are either inactive or marked invalid;
362  * that's what the open routine does.
363  */
364 /*ARGSUSED*/
365 static int
366 kcpc_ioctl(dev_t dev, int cmd, intptr_t data, int flags, cred_t *cr, int *rvp)
367 {
368 	kthread_t	*t = curthread;
369 	processorid_t	cpuid;
370 	void		*udata1 = NULL;
371 	void		*udata2 = NULL;
372 	void		*udata3 = NULL;
373 	int		error;
374 	int		code;
375 
376 	STRUCT_DECL(__cpc_args, args);
377 
378 	STRUCT_INIT(args, flags);
379 
380 	if (curthread->t_bind_cpu != getminor(dev))
381 		return (EAGAIN);  /* someone unbound it? */
382 
383 	cpuid = getminor(dev);
384 
385 	if (cmd == CPCIO_BIND || cmd == CPCIO_SAMPLE) {
386 		if (copyin((void *)data, STRUCT_BUF(args),
387 		    STRUCT_SIZE(args)) == -1)
388 			return (EFAULT);
389 
390 		udata1 = STRUCT_FGETP(args, udata1);
391 		udata2 = STRUCT_FGETP(args, udata2);
392 		udata3 = STRUCT_FGETP(args, udata3);
393 	}
394 
395 	switch (cmd) {
396 	case CPCIO_BIND:
397 		/*
398 		 * udata1 = pointer to packed nvlist buffer
399 		 * udata2 = size of packed nvlist buffer
400 		 * udata3 = User addr to return error subcode in.
401 		 */
402 		if (t->t_cpc_set != NULL) {
403 			(void) kcpc_unbind(t->t_cpc_set);
404 			ASSERT(t->t_cpc_set == NULL);
405 		}
406 
407 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
408 		    (size_t)udata2)) != 0) {
409 			return (error);
410 		}
411 
412 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
413 			kcpc_free_set(t->t_cpc_set);
414 			t->t_cpc_set = NULL;
415 			if (copyout(&error, udata3, sizeof (error)) == -1)
416 				return (EFAULT);
417 			return (EINVAL);
418 		}
419 
420 		if ((error = kcpc_bind_cpu(t->t_cpc_set, cpuid, &code)) != 0) {
421 			kcpc_free_set(t->t_cpc_set);
422 			t->t_cpc_set = NULL;
423 			/*
424 			 * Subcodes are only returned for EINVAL and EACCESS.
425 			 */
426 			if ((error == EINVAL || error == EACCES) &&
427 			    copyout(&code, udata3, sizeof (code)) == -1)
428 				return (EFAULT);
429 			return (error);
430 		}
431 
432 		return (0);
433 	case CPCIO_SAMPLE:
434 		/*
435 		 * udata1 = pointer to user's buffer
436 		 * udata2 = pointer to user's hrtime
437 		 * udata3 = pointer to user's tick
438 		 */
439 		/*
440 		 * Only CPU-bound sets may be sampled via the ioctl(). If this
441 		 * set has no CPU-bound context, return an error.
442 		 */
443 		if (t->t_cpc_set == NULL)
444 			return (EINVAL);
445 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
446 		    udata3)) != 0)
447 			return (error);
448 		return (0);
449 	case CPCIO_RELE:
450 		if (t->t_cpc_set == NULL)
451 			return (EINVAL);
452 		return (kcpc_unbind(t->t_cpc_set));
453 	default:
454 		return (EINVAL);
455 	}
456 }
457 
458 /*
459  * The device supports multiple opens, but only one open
460  * is allowed per processor.  This is to enable multiple
461  * instances of tools looking at different processors.
462  */
463 #define	KCPC_MINOR_SHARED		((minor_t)0x3fffful)
464 
465 static ulong_t *kcpc_cpumap;		/* bitmap of cpus */
466 
467 /*ARGSUSED1*/
468 static int
469 kcpc_open(dev_t *dev, int flags, int otyp, cred_t *cr)
470 {
471 	processorid_t	cpuid;
472 	int		error;
473 
474 	ASSERT(pcbe_ops != NULL);
475 
476 	if ((error = secpolicy_cpc_cpu(cr)) != 0)
477 		return (error);
478 	if (getminor(*dev) != KCPC_MINOR_SHARED)
479 		return (ENXIO);
480 	if ((cpuid = curthread->t_bind_cpu) == PBIND_NONE)
481 		return (EINVAL);
482 	if (cpuid > max_cpuid)
483 		return (EINVAL);
484 
485 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
486 	if (++kcpc_cpuctx == 1) {
487 		ASSERT(kcpc_cpumap == NULL);
488 
489 		/*
490 		 * Bail out if DTrace is already using the counters.
491 		 */
492 		if (dtrace_cpc_in_use) {
493 			kcpc_cpuctx--;
494 			rw_exit(&kcpc_cpuctx_lock);
495 			return (EAGAIN);
496 		}
497 		kcpc_cpumap = kmem_zalloc(BT_SIZEOFMAP(max_cpuid + 1),
498 		    KM_SLEEP);
499 		/*
500 		 * When this device is open for processor-based contexts,
501 		 * no further lwp-based contexts can be created.
502 		 *
503 		 * Since this is the first open, ensure that all existing
504 		 * contexts are invalidated.
505 		 */
506 		kcpc_invalidate_all();
507 	} else if (BT_TEST(kcpc_cpumap, cpuid)) {
508 		kcpc_cpuctx--;
509 		rw_exit(&kcpc_cpuctx_lock);
510 		return (EAGAIN);
511 	} else if (kcpc_hw_cpu_hook(cpuid, kcpc_cpumap) != 0) {
512 		kcpc_cpuctx--;
513 		rw_exit(&kcpc_cpuctx_lock);
514 		return (EACCES);
515 	}
516 	BT_SET(kcpc_cpumap, cpuid);
517 	rw_exit(&kcpc_cpuctx_lock);
518 
519 	*dev = makedevice(getmajor(*dev), (minor_t)cpuid);
520 
521 	return (0);
522 }
523 
524 /*ARGSUSED1*/
525 static int
526 kcpc_close(dev_t dev, int flags, int otyp, cred_t *cr)
527 {
528 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
529 	BT_CLEAR(kcpc_cpumap, getminor(dev));
530 	if (--kcpc_cpuctx == 0) {
531 		kmem_free(kcpc_cpumap, BT_SIZEOFMAP(max_cpuid + 1));
532 		kcpc_cpumap = NULL;
533 	}
534 	ASSERT(kcpc_cpuctx >= 0);
535 	rw_exit(&kcpc_cpuctx_lock);
536 
537 	return (0);
538 }
539 
/*
 * Sanity limits applied to request sets copied in from userland:
 *
 *	CPC_MIN_PACKSIZE, CPC_MAX_PACKSIZE
 *		bounds, in bytes, on the size of a packed nvlist
 *	CPC_MAX_NREQS
 *		largest number of requests a set can contain
 *	CPC_MAX_ATTRS
 *		largest number of attributes a request can contain
 */
#define	CPC_MIN_PACKSIZE 4
#define	CPC_MAX_PACKSIZE 10000

#define	CPC_MAX_NREQS 100

#define	CPC_MAX_ATTRS 50
555 
556 /*
557  * Copy in a packed nvlist from the user and create a request set out of it.
558  * If successful, return 0 and store a pointer to the set we've created. Returns
559  * error code on error.
560  */
561 int
562 kcpc_copyin_set(kcpc_set_t **inset, void *ubuf, size_t len)
563 {
564 	kcpc_set_t	*set;
565 	int		i;
566 	int		j;
567 	char		*packbuf;
568 
569 	nvlist_t	*nvl;
570 	nvpair_t	*nvp = NULL;
571 
572 	nvlist_t	*attrs;
573 	nvpair_t	*nvp_attr;
574 	kcpc_attr_t	*attrp;
575 
576 	nvlist_t	**reqlist;
577 	uint_t		nreqs;
578 	uint64_t	uint64;
579 	uint32_t	uint32;
580 	uint32_t	setflags = (uint32_t)-1;
581 	char		*string;
582 	char		*name;
583 
584 	if (len < CPC_MIN_PACKSIZE || len > CPC_MAX_PACKSIZE)
585 		return (EINVAL);
586 
587 	packbuf = kmem_alloc(len, KM_SLEEP);
588 
589 	if (copyin(ubuf, packbuf, len) == -1) {
590 		kmem_free(packbuf, len);
591 		return (EFAULT);
592 	}
593 
594 	if (nvlist_unpack(packbuf, len, &nvl, KM_SLEEP) != 0) {
595 		kmem_free(packbuf, len);
596 		return (EINVAL);
597 	}
598 
599 	/*
600 	 * The nvlist has been unpacked so there is no need for the packed
601 	 * representation from this point on.
602 	 */
603 	kmem_free(packbuf, len);
604 
605 	i = 0;
606 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
607 		switch (nvpair_type(nvp)) {
608 		case DATA_TYPE_UINT32:
609 			if (strcmp(nvpair_name(nvp), "flags") != 0 ||
610 			    nvpair_value_uint32(nvp, &setflags) != 0) {
611 				nvlist_free(nvl);
612 				return (EINVAL);
613 			}
614 			break;
615 		case DATA_TYPE_NVLIST_ARRAY:
616 			if (strcmp(nvpair_name(nvp), "reqs") != 0 ||
617 			    nvpair_value_nvlist_array(nvp, &reqlist,
618 			    &nreqs) != 0) {
619 				nvlist_free(nvl);
620 				return (EINVAL);
621 			}
622 			break;
623 		default:
624 			nvlist_free(nvl);
625 			return (EINVAL);
626 		}
627 		i++;
628 	}
629 
630 	/*
631 	 * There should be two members in the top-level nvlist:
632 	 * an array of nvlists consisting of the requests, and flags.
633 	 * Anything else is an invalid set.
634 	 */
635 	if (i != 2) {
636 		nvlist_free(nvl);
637 		return (EINVAL);
638 	}
639 
640 	if (nreqs > CPC_MAX_NREQS) {
641 		nvlist_free(nvl);
642 		return (EINVAL);
643 	}
644 
645 	/*
646 	 * The requests are now stored in the nvlist array at reqlist.
647 	 * Note that the use of kmem_zalloc() to alloc the kcpc_set_t means
648 	 * we don't need to call the init routines for ks_lock and ks_condv.
649 	 */
650 	set = kmem_zalloc(sizeof (kcpc_set_t), KM_SLEEP);
651 	set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
652 	    nreqs, KM_SLEEP);
653 	set->ks_nreqs = nreqs;
654 	/*
655 	 * If the nvlist didn't contain a flags member, setflags was initialized
656 	 * with an illegal value and this set will fail sanity checks later on.
657 	 */
658 	set->ks_flags = setflags;
659 	/*
660 	 * Initialize bind/unbind set synchronization.
661 	 */
662 	set->ks_state &= ~KCPC_SET_BOUND;
663 
664 	/*
665 	 * Build the set up one request at a time, always keeping it self-
666 	 * consistent so we can give it to kcpc_free_set() if we need to back
667 	 * out and return and error.
668 	 */
669 	for (i = 0; i < nreqs; i++) {
670 		nvp = NULL;
671 		set->ks_req[i].kr_picnum = -1;
672 		while ((nvp = nvlist_next_nvpair(reqlist[i], nvp)) != NULL) {
673 			name = nvpair_name(nvp);
674 			switch (nvpair_type(nvp)) {
675 			case DATA_TYPE_UINT32:
676 				if (nvpair_value_uint32(nvp, &uint32) == EINVAL)
677 					goto inval;
678 				if (strcmp(name, "cr_flags") == 0)
679 					set->ks_req[i].kr_flags = uint32;
680 				if (strcmp(name, "cr_index") == 0)
681 					set->ks_req[i].kr_index = uint32;
682 				break;
683 			case DATA_TYPE_UINT64:
684 				if (nvpair_value_uint64(nvp, &uint64) == EINVAL)
685 					goto inval;
686 				if (strcmp(name, "cr_preset") == 0)
687 					set->ks_req[i].kr_preset = uint64;
688 				break;
689 			case DATA_TYPE_STRING:
690 				if (nvpair_value_string(nvp, &string) == EINVAL)
691 					goto inval;
692 				if (strcmp(name, "cr_event") == 0)
693 					(void) strncpy(set->ks_req[i].kr_event,
694 					    string, CPC_MAX_EVENT_LEN);
695 				break;
696 			case DATA_TYPE_NVLIST:
697 				if (strcmp(name, "cr_attr") != 0)
698 					goto inval;
699 				if (nvpair_value_nvlist(nvp, &attrs) == EINVAL)
700 					goto inval;
701 				nvp_attr = NULL;
702 				/*
703 				 * If the picnum has been specified as an
704 				 * attribute, consume that attribute here and
705 				 * remove it from the list of attributes.
706 				 */
707 				if (nvlist_lookup_uint64(attrs, "picnum",
708 				    &uint64) == 0) {
709 					if (nvlist_remove(attrs, "picnum",
710 					    DATA_TYPE_UINT64) != 0)
711 						panic("nvlist %p faulty",
712 						    (void *)attrs);
713 					set->ks_req[i].kr_picnum = uint64;
714 				}
715 
716 				if ((set->ks_req[i].kr_nattrs =
717 				    kcpc_nvlist_npairs(attrs)) == 0)
718 					break;
719 
720 				if (set->ks_req[i].kr_nattrs > CPC_MAX_ATTRS)
721 					goto inval;
722 
723 				set->ks_req[i].kr_attr =
724 				    kmem_alloc(set->ks_req[i].kr_nattrs *
725 				    sizeof (kcpc_attr_t), KM_SLEEP);
726 				j = 0;
727 
728 				while ((nvp_attr = nvlist_next_nvpair(attrs,
729 				    nvp_attr)) != NULL) {
730 					attrp = &set->ks_req[i].kr_attr[j];
731 
732 					if (nvpair_type(nvp_attr) !=
733 					    DATA_TYPE_UINT64)
734 						goto inval;
735 
736 					(void) strncpy(attrp->ka_name,
737 					    nvpair_name(nvp_attr),
738 					    CPC_MAX_ATTR_LEN);
739 
740 					if (nvpair_value_uint64(nvp_attr,
741 					    &(attrp->ka_val)) == EINVAL)
742 						goto inval;
743 					j++;
744 				}
745 				ASSERT(j == set->ks_req[i].kr_nattrs);
746 			default:
747 				break;
748 			}
749 		}
750 	}
751 
752 	nvlist_free(nvl);
753 	*inset = set;
754 	return (0);
755 
756 inval:
757 	nvlist_free(nvl);
758 	kcpc_free_set(set);
759 	return (EINVAL);
760 }
761 
762 /*
763  * Count the number of nvpairs in the supplied nvlist.
764  */
765 static uint32_t
766 kcpc_nvlist_npairs(nvlist_t *list)
767 {
768 	nvpair_t *nvp = NULL;
769 	uint32_t n = 0;
770 
771 	while ((nvp = nvlist_next_nvpair(list, nvp)) != NULL)
772 		n++;
773 
774 	return (n);
775 }
776 
777 /*
778  * Performs sanity checks on the given set.
779  * Returns 0 if the set checks out OK.
780  * Returns a detailed error subcode, or -1 if there is no applicable subcode.
781  */
782 static int
783 kcpc_verify_set(kcpc_set_t *set)
784 {
785 	kcpc_request_t	*rp;
786 	int		i;
787 	uint64_t	bitmap = 0;
788 	int		n;
789 
790 	if (set->ks_nreqs > cpc_ncounters)
791 		return (-1);
792 
793 	if (CPC_SET_VALID_FLAGS(set->ks_flags) == 0)
794 		return (-1);
795 
796 	for (i = 0; i < set->ks_nreqs; i++) {
797 		rp = &set->ks_req[i];
798 
799 		/*
800 		 * The following comparison must cast cpc_ncounters to an int,
801 		 * because kr_picnum will be -1 if the request didn't explicitly
802 		 * choose a PIC.
803 		 */
804 		if (rp->kr_picnum >= (int)cpc_ncounters)
805 			return (CPC_INVALID_PICNUM);
806 
807 		/*
808 		 * Of the pics whose physical picnum has been specified, make
809 		 * sure each PIC appears only once in set.
810 		 */
811 		if ((n = set->ks_req[i].kr_picnum) != -1) {
812 			if ((bitmap & (1 << n)) != 0)
813 				return (-1);
814 			bitmap |= (1 << n);
815 		}
816 
817 		/*
818 		 * Make sure the requested index falls within the range of all
819 		 * requests.
820 		 */
821 		if (rp->kr_index < 0 || rp->kr_index >= set->ks_nreqs)
822 			return (-1);
823 
824 		/*
825 		 * Make sure there are no unknown flags.
826 		 */
827 		if (KCPC_REQ_VALID_FLAGS(rp->kr_flags) == 0)
828 			return (CPC_REQ_INVALID_FLAGS);
829 	}
830 
831 	return (0);
832 }
833 
834 static struct cb_ops cb_ops = {
835 	kcpc_open,
836 	kcpc_close,
837 	nodev,		/* strategy */
838 	nodev,		/* print */
839 	nodev,		/* dump */
840 	nodev,		/* read */
841 	nodev,		/* write */
842 	kcpc_ioctl,
843 	nodev,		/* devmap */
844 	nodev,		/* mmap */
845 	nodev,		/* segmap */
846 	nochpoll,	/* poll */
847 	ddi_prop_op,
848 	NULL,
849 	D_NEW | D_MP
850 };
851 
852 /*ARGSUSED*/
853 static int
854 kcpc_probe(dev_info_t *devi)
855 {
856 	return (DDI_PROBE_SUCCESS);
857 }
858 
859 static dev_info_t *kcpc_devi;
860 
861 static int
862 kcpc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
863 {
864 	if (cmd != DDI_ATTACH)
865 		return (DDI_FAILURE);
866 	kcpc_devi = devi;
867 	return (ddi_create_minor_node(devi, "shared", S_IFCHR,
868 	    KCPC_MINOR_SHARED, DDI_PSEUDO, 0));
869 }
870 
871 /*ARGSUSED*/
872 static int
873 kcpc_getinfo(dev_info_t *devi, ddi_info_cmd_t cmd, void *arg, void **result)
874 {
875 	switch (cmd) {
876 	case DDI_INFO_DEVT2DEVINFO:
877 		switch (getminor((dev_t)arg)) {
878 		case KCPC_MINOR_SHARED:
879 			*result = kcpc_devi;
880 			return (DDI_SUCCESS);
881 		default:
882 			break;
883 		}
884 		break;
885 	case DDI_INFO_DEVT2INSTANCE:
886 		*result = 0;
887 		return (DDI_SUCCESS);
888 	default:
889 		break;
890 	}
891 
892 	return (DDI_FAILURE);
893 }
894 
895 static struct dev_ops dev_ops = {
896 	DEVO_REV,
897 	0,
898 	kcpc_getinfo,
899 	nulldev,		/* identify */
900 	kcpc_probe,
901 	kcpc_attach,
902 	nodev,			/* detach */
903 	nodev,			/* reset */
904 	&cb_ops,
905 	(struct bus_ops *)0,
906 	NULL,
907 	ddi_quiesce_not_needed,		/* quiesce */
908 };
909 
910 static struct modldrv modldrv = {
911 	&mod_driverops,
912 	"cpc sampling driver",
913 	&dev_ops
914 };
915 
916 static struct sysent cpc_sysent = {
917 	5,
918 	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
919 	cpc
920 };
921 
922 static struct modlsys modlsys = {
923 	&mod_syscallops,
924 	"cpc sampling system call",
925 	&cpc_sysent
926 };
927 
928 #ifdef _SYSCALL32_IMPL
929 static struct modlsys modlsys32 = {
930 	&mod_syscallops32,
931 	"32-bit cpc sampling system call",
932 	&cpc_sysent
933 };
934 #endif
935 
936 static struct modlinkage modl = {
937 	MODREV_1,
938 	&modldrv,
939 	&modlsys,
940 #ifdef _SYSCALL32_IMPL
941 	&modlsys32,
942 #endif
943 };
944 
945 static void
946 kcpc_init(void)
947 {
948 	long hash;
949 
950 	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
951 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
952 		mutex_init(&kcpc_ctx_llock[hash],
953 		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
954 }
955 
956 static void
957 kcpc_fini(void)
958 {
959 	long hash;
960 
961 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
962 		mutex_destroy(&kcpc_ctx_llock[hash]);
963 	rw_destroy(&kcpc_cpuctx_lock);
964 }
965 
966 int
967 _init(void)
968 {
969 	int ret;
970 
971 	if (kcpc_hw_load_pcbe() != 0)
972 		return (ENOTSUP);
973 
974 	kcpc_init();
975 	if ((ret = mod_install(&modl)) != 0)
976 		kcpc_fini();
977 	return (ret);
978 }
979 
980 int
981 _fini(void)
982 {
983 	int ret;
984 
985 	if ((ret = mod_remove(&modl)) == 0)
986 		kcpc_fini();
987 	return (ret);
988 }
989 
990 int
991 _info(struct modinfo *mi)
992 {
993 	return (mod_info(&modl, mi));
994 }
995