xref: /titanic_41/usr/src/uts/common/io/cpc.c (revision 8b464eb836173b92f2b7a65623cd06c8c3c59289)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * CPU Performance Counter system calls and device driver.
31  *
32  * This module uses a combination of thread context operators, and
33  * thread-specific data to export CPU performance counters
34  * via both a system call and a driver interface.
35  *
36  * There are three access methods exported - the 'shared' device
37  * and the 'private' and 'agent' variants of the system call.
38  *
39  * The shared device treats the performance counter registers as
40  * a processor metric, regardless of the work scheduled on them.
41  * The private system call treats the performance counter registers
42  * as a property of a single lwp.  This is achieved by using the
43  * thread context operators to virtualize the contents of the
44  * performance counter registers between lwps.
45  *
46  * The agent method is like the private method, except that it must
47  * be accessed via /proc's agent lwp to allow the counter context of
48  * other threads to be examined safely.
49  *
50  * The shared usage fundamentally conflicts with the agent and private usage;
51  * almost all of the complexity of the module is needed to allow these two
52  * models to co-exist in a reasonable way.
53  */
54 
55 #include <sys/types.h>
56 #include <sys/file.h>
57 #include <sys/errno.h>
58 #include <sys/open.h>
59 #include <sys/cred.h>
60 #include <sys/conf.h>
61 #include <sys/stat.h>
62 #include <sys/processor.h>
63 #include <sys/cpuvar.h>
64 #include <sys/disp.h>
65 #include <sys/kmem.h>
66 #include <sys/modctl.h>
67 #include <sys/ddi.h>
68 #include <sys/sunddi.h>
69 #include <sys/nvpair.h>
70 #include <sys/policy.h>
71 #include <sys/machsystm.h>
72 #include <sys/cpc_impl.h>
73 #include <sys/cpc_pcbe.h>
74 #include <sys/kcpc.h>
75 
76 static int kcpc_copyin_set(kcpc_set_t **set, void *ubuf, size_t len);
77 static int kcpc_verify_set(kcpc_set_t *set);
78 static uint32_t kcpc_nvlist_npairs(nvlist_t *list);
79 
80 /*
81  * Generic attributes supported regardless of processor.
82  */
83 
84 #define	ATTRLIST "picnum"
85 #define	SEPARATOR ","
86 
87 /*
88  * System call to access CPU performance counters.
89  */
90 static int
91 cpc(int cmd, id_t lwpid, void *udata1, void *udata2, void *udata3)
92 {
93 	kthread_t	*t;
94 	int		error;
95 	int		size;
96 	const char	*str;
97 	int		code;
98 
99 	/*
100 	 * This CPC syscall should only be loaded if it found a PCBE to use.
101 	 */
102 	ASSERT(pcbe_ops != NULL);
103 
104 	if (curproc->p_agenttp == curthread) {
105 		/*
106 		 * Only if /proc is invoking this system call from
107 		 * the agent thread do we allow the caller to examine
108 		 * the contexts of other lwps in the process.  And
109 		 * because we know we're the agent, we know we don't
110 		 * have to grab p_lock because no-one else can change
111 		 * the state of the process.
112 		 */
113 		if ((t = idtot(curproc, lwpid)) == NULL || t == curthread)
114 			return (set_errno(ESRCH));
115 		ASSERT(t->t_tid == lwpid && ttolwp(t) != NULL);
116 	} else
117 		t = curthread;
118 
119 	if (t->t_cpc_set == NULL && (cmd == CPC_SAMPLE || cmd == CPC_RELE))
120 		return (set_errno(EINVAL));
121 
122 	switch (cmd) {
123 	case CPC_BIND:
124 		/*
125 		 * udata1 = pointer to packed nvlist buffer
126 		 * udata2 = size of packed nvlist buffer
127 		 * udata3 = User addr to return error subcode in.
128 		 */
129 
130 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
131 		if (kcpc_cpuctx) {
132 			rw_exit(&kcpc_cpuctx_lock);
133 			return (set_errno(EAGAIN));
134 		}
135 
136 		if (kcpc_hw_lwp_hook() != 0) {
137 			rw_exit(&kcpc_cpuctx_lock);
138 			return (set_errno(EACCES));
139 		}
140 
141 		/*
142 		 * An LWP may only have one set bound to it at a time; if there
143 		 * is a set bound to this LWP already, we unbind it here.
144 		 */
145 		if (t->t_cpc_set != NULL)
146 			(void) kcpc_unbind(t->t_cpc_set);
147 		ASSERT(t->t_cpc_set == NULL);
148 
149 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
150 		    (size_t)udata2)) != 0) {
151 			rw_exit(&kcpc_cpuctx_lock);
152 			return (set_errno(error));
153 		}
154 
155 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
156 			rw_exit(&kcpc_cpuctx_lock);
157 			kcpc_free_set(t->t_cpc_set);
158 			t->t_cpc_set = NULL;
159 			if (copyout(&error, udata3, sizeof (error)) == -1)
160 				return (set_errno(EFAULT));
161 			return (set_errno(EINVAL));
162 		}
163 
164 		if ((error = kcpc_bind_thread(t->t_cpc_set, t, &code)) != 0) {
165 			rw_exit(&kcpc_cpuctx_lock);
166 			kcpc_free_set(t->t_cpc_set);
167 			t->t_cpc_set = NULL;
168 			/*
169 			 * EINVAL and EACCES are the only errors with more
170 			 * specific subcodes.
171 			 */
172 			if ((error == EINVAL || error == EACCES) &&
173 			    copyout(&code, udata3, sizeof (code)) == -1)
174 				return (set_errno(EFAULT));
175 			return (set_errno(error));
176 		}
177 
178 		rw_exit(&kcpc_cpuctx_lock);
179 		return (0);
180 	case CPC_SAMPLE:
181 		/*
182 		 * udata1 = pointer to user's buffer
183 		 * udata2 = pointer to user's hrtime
184 		 * udata3 = pointer to user's tick
185 		 */
186 		/*
187 		 * We only allow thread-bound sets to be sampled via the
188 		 * syscall, so if this set has a CPU-bound context, return an
189 		 * error.
190 		 */
191 		if (t->t_cpc_set->ks_ctx->kc_cpuid != -1)
192 			return (set_errno(EINVAL));
193 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
194 		    udata3)) != 0)
195 			return (set_errno(error));
196 
197 		return (0);
198 	case CPC_PRESET:
199 	case CPC_RESTART:
200 		/*
201 		 * These are valid only if this lwp has a bound set.
202 		 */
203 		if (t->t_cpc_set == NULL)
204 			return (set_errno(EINVAL));
205 		if (cmd == CPC_PRESET) {
206 			/*
207 			 * The preset is shipped up to us from userland in two
208 			 * parts. This lets us handle 64-bit values from 32-bit
209 			 * and 64-bit applications in the same manner.
210 			 *
211 			 * udata1 = index of request to preset
212 			 * udata2 = new 64-bit preset (most sig. 32 bits)
213 			 * udata3 = new 64-bit preset (least sig. 32 bits)
214 			 */
215 			if ((error = kcpc_preset(t->t_cpc_set, (intptr_t)udata1,
216 			    ((uint64_t)(uintptr_t)udata2 << 32ULL) |
217 			    (uint64_t)(uintptr_t)udata3)) != 0)
218 				return (set_errno(error));
219 		} else {
220 			/*
221 			 * udata[1-3] = unused
222 			 */
223 			if ((error = kcpc_restart(t->t_cpc_set)) != 0)
224 				return (set_errno(error));
225 		}
226 		return (0);
227 	case CPC_ENABLE:
228 	case CPC_DISABLE:
229 		udata1 = 0;
230 		/*FALLTHROUGH*/
231 	case CPC_USR_EVENTS:
232 	case CPC_SYS_EVENTS:
233 		if (t != curthread || t->t_cpc_set == NULL)
234 			return (set_errno(EINVAL));
235 		/*
236 		 * Provided for backwards compatibility with CPCv1.
237 		 *
238 		 * Stop the counters and record the current counts. Use the
239 		 * counts as the preset to rebind a new set with the requests
240 		 * reconfigured as requested.
241 		 *
242 		 * udata1: 1 == enable; 0 == disable
243 		 * udata{2,3}: unused
244 		 */
245 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
246 		if ((error = kcpc_enable(t,
247 		    cmd, (int)(uintptr_t)udata1)) != 0) {
248 			rw_exit(&kcpc_cpuctx_lock);
249 			return (set_errno(error));
250 		}
251 		rw_exit(&kcpc_cpuctx_lock);
252 		return (0);
253 	case CPC_NPIC:
254 		return (cpc_ncounters);
255 	case CPC_CAPS:
256 		return (pcbe_ops->pcbe_caps);
257 	case CPC_EVLIST_SIZE:
258 	case CPC_LIST_EVENTS:
259 		/*
260 		 * udata1 = pointer to user's int or buffer
261 		 * udata2 = picnum
262 		 * udata3 = unused
263 		 */
264 		if ((uintptr_t)udata2 >= cpc_ncounters)
265 			return (set_errno(EINVAL));
266 
267 		size = strlen(
268 		    pcbe_ops->pcbe_list_events((uintptr_t)udata2)) + 1;
269 
270 		if (cmd == CPC_EVLIST_SIZE) {
271 			if (suword32(udata1, size) == -1)
272 				return (set_errno(EFAULT));
273 		} else {
274 			if (copyout(
275 			    pcbe_ops->pcbe_list_events((uintptr_t)udata2),
276 			    udata1, size) == -1)
277 				return (set_errno(EFAULT));
278 		}
279 		return (0);
280 	case CPC_ATTRLIST_SIZE:
281 	case CPC_LIST_ATTRS:
282 		/*
283 		 * udata1 = pointer to user's int or buffer
284 		 * udata2 = unused
285 		 * udata3 = unused
286 		 *
287 		 * attrlist size is length of PCBE-supported attributes, plus
288 		 * room for "picnum\0" plus an optional ',' separator char.
289 		 */
290 		str = pcbe_ops->pcbe_list_attrs();
291 		size = strlen(str) + sizeof (SEPARATOR ATTRLIST) + 1;
292 		if (str[0] != '\0')
293 			/*
294 			 * A ',' separator character is necessary.
295 			 */
296 			size += 1;
297 
298 		if (cmd == CPC_ATTRLIST_SIZE) {
299 			if (suword32(udata1, size) == -1)
300 				return (set_errno(EFAULT));
301 		} else {
302 			/*
303 			 * Copyout the PCBE attributes, and then append the
304 			 * generic attribute list (with separator if necessary).
305 			 */
306 			if (copyout(str, udata1, strlen(str)) == -1)
307 				return (set_errno(EFAULT));
308 			if (str[0] != '\0') {
309 				if (copyout(SEPARATOR ATTRLIST,
310 				    ((char *)udata1) + strlen(str),
311 				    strlen(SEPARATOR ATTRLIST) + 1)
312 				    == -1)
313 					return (set_errno(EFAULT));
314 			} else
315 				if (copyout(ATTRLIST,
316 				    (char *)udata1 + strlen(str),
317 				    strlen(ATTRLIST) + 1) == -1)
318 					return (set_errno(EFAULT));
319 		}
320 		return (0);
321 	case CPC_IMPL_NAME:
322 	case CPC_CPUREF:
323 		/*
324 		 * udata1 = pointer to user's buffer
325 		 * udata2 = unused
326 		 * udata3 = unused
327 		 */
328 		if (cmd == CPC_IMPL_NAME) {
329 			str = pcbe_ops->pcbe_impl_name();
330 			ASSERT(strlen(str) < CPC_MAX_IMPL_NAME);
331 		} else {
332 			str = pcbe_ops->pcbe_cpuref();
333 			ASSERT(strlen(str) < CPC_MAX_CPUREF);
334 		}
335 
336 		if (copyout(str, udata1, strlen(str) + 1) != 0)
337 			return (set_errno(EFAULT));
338 		return (0);
339 	case CPC_INVALIDATE:
340 		kcpc_invalidate(t);
341 		return (0);
342 	case CPC_RELE:
343 		if ((error = kcpc_unbind(t->t_cpc_set)) != 0)
344 			return (set_errno(error));
345 		return (0);
346 	default:
347 		return (set_errno(EINVAL));
348 	}
349 }
350 
351 /*
352  * The 'shared' device allows direct access to the
353  * performance counter control register of the current CPU.
354  * The major difference between the contexts created here and those
355  * above is that the context handlers are -not- installed, thus
356  * no context switching behaviour occurs.
357  *
358  * Because they manipulate per-cpu state, these ioctls can
359  * only be invoked from a bound lwp, by a caller with the cpc_cpu privilege
360  * who can open the relevant entry in /devices (the act of holding it open
361  * causes other uses of the counters to be suspended).
362  *
363  * Note that for correct results, the caller -must- ensure that
364  * all existing per-lwp contexts are either inactive or marked invalid;
365  * that's what the open routine does.
366  */
367 /*ARGSUSED*/
368 static int
369 kcpc_ioctl(dev_t dev, int cmd, intptr_t data, int flags, cred_t *cr, int *rvp)
370 {
371 	kthread_t	*t = curthread;
372 	processorid_t	cpuid;
373 	void		*udata1 = NULL;
374 	void		*udata2 = NULL;
375 	void		*udata3 = NULL;
376 	int		error;
377 	int		code;
378 
379 	STRUCT_DECL(__cpc_args, args);
380 
381 	STRUCT_INIT(args, flags);
382 
383 	if (curthread->t_bind_cpu != getminor(dev))
384 		return (EAGAIN);  /* someone unbound it? */
385 
386 	cpuid = getminor(dev);
387 
388 	if (cmd == CPCIO_BIND || cmd == CPCIO_SAMPLE) {
389 		if (copyin((void *)data, STRUCT_BUF(args),
390 		    STRUCT_SIZE(args)) == -1)
391 			return (EFAULT);
392 
393 		udata1 = STRUCT_FGETP(args, udata1);
394 		udata2 = STRUCT_FGETP(args, udata2);
395 		udata3 = STRUCT_FGETP(args, udata3);
396 	}
397 
398 	switch (cmd) {
399 	case CPCIO_BIND:
400 		/*
401 		 * udata1 = pointer to packed nvlist buffer
402 		 * udata2 = size of packed nvlist buffer
403 		 * udata3 = User addr to return error subcode in.
404 		 */
405 		if (t->t_cpc_set != NULL) {
406 			(void) kcpc_unbind(t->t_cpc_set);
407 			ASSERT(t->t_cpc_set == NULL);
408 		}
409 
410 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
411 		    (size_t)udata2)) != 0) {
412 			return (error);
413 		}
414 
415 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
416 			kcpc_free_set(t->t_cpc_set);
417 			t->t_cpc_set = NULL;
418 			if (copyout(&error, udata3, sizeof (error)) == -1)
419 				return (EFAULT);
420 			return (EINVAL);
421 		}
422 
423 		if ((error = kcpc_bind_cpu(t->t_cpc_set, cpuid, &code)) != 0) {
424 			kcpc_free_set(t->t_cpc_set);
425 			t->t_cpc_set = NULL;
426 			/*
427 			 * Subcodes are only returned for EINVAL and EACCESS.
428 			 */
429 			if ((error == EINVAL || error == EACCES) &&
430 			    copyout(&code, udata3, sizeof (code)) == -1)
431 				return (EFAULT);
432 			return (error);
433 		}
434 
435 		return (0);
436 	case CPCIO_SAMPLE:
437 		/*
438 		 * udata1 = pointer to user's buffer
439 		 * udata2 = pointer to user's hrtime
440 		 * udata3 = pointer to user's tick
441 		 */
442 		/*
443 		 * Only CPU-bound sets may be sampled via the ioctl(). If this
444 		 * set has no CPU-bound context, return an error.
445 		 */
446 		if (t->t_cpc_set == NULL)
447 			return (EINVAL);
448 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
449 		    udata3)) != 0)
450 			return (error);
451 		return (0);
452 	case CPCIO_RELE:
453 		if (t->t_cpc_set == NULL)
454 			return (EINVAL);
455 		return (kcpc_unbind(t->t_cpc_set));
456 	default:
457 		return (EINVAL);
458 	}
459 }
460 
/*
 * The device supports multiple opens, but only one open
 * is allowed per processor.  This is to enable multiple
 * instances of tools looking at different processors.
 *
 * KCPC_MINOR_SHARED is the minor number of the "shared" node created by
 * kcpc_attach().  It is the only minor kcpc_open() accepts; a successful
 * open rewrites the dev_t so that its minor is the opener's bound CPU id.
 */
#define	KCPC_MINOR_SHARED		((minor_t)0x3fffful)

/*
 * One bit per CPU id that currently has the device open.  The map is
 * allocated by the first open and freed by the last close; kcpc_open() and
 * kcpc_close() only touch it while holding kcpc_cpuctx_lock as writer.
 */
static ulong_t *kcpc_cpumap;		/* bitmap of cpus */
469 
470 /*ARGSUSED1*/
471 static int
472 kcpc_open(dev_t *dev, int flags, int otyp, cred_t *cr)
473 {
474 	processorid_t	cpuid;
475 	int		error;
476 
477 	ASSERT(pcbe_ops != NULL);
478 
479 	if ((error = secpolicy_cpc_cpu(cr)) != 0)
480 		return (error);
481 	if (getminor(*dev) != KCPC_MINOR_SHARED)
482 		return (ENXIO);
483 	if ((cpuid = curthread->t_bind_cpu) == PBIND_NONE)
484 		return (EINVAL);
485 	if (cpuid > max_cpuid)
486 		return (EINVAL);
487 
488 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
489 	if (++kcpc_cpuctx == 1) {
490 		ASSERT(kcpc_cpumap == NULL);
491 		kcpc_cpumap = kmem_zalloc(BT_SIZEOFMAP(max_cpuid + 1),
492 		    KM_SLEEP);
493 		/*
494 		 * When this device is open for processor-based contexts,
495 		 * no further lwp-based contexts can be created.
496 		 *
497 		 * Since this is the first open, ensure that all existing
498 		 * contexts are invalidated.
499 		 */
500 		kcpc_invalidate_all();
501 	} else if (BT_TEST(kcpc_cpumap, cpuid)) {
502 		kcpc_cpuctx--;
503 		rw_exit(&kcpc_cpuctx_lock);
504 		return (EAGAIN);
505 	} else if (kcpc_hw_cpu_hook(cpuid, kcpc_cpumap) != 0) {
506 		kcpc_cpuctx--;
507 		rw_exit(&kcpc_cpuctx_lock);
508 		return (EACCES);
509 	}
510 	BT_SET(kcpc_cpumap, cpuid);
511 	rw_exit(&kcpc_cpuctx_lock);
512 
513 	*dev = makedevice(getmajor(*dev), (minor_t)cpuid);
514 
515 	return (0);
516 }
517 
518 /*ARGSUSED1*/
519 static int
520 kcpc_close(dev_t dev, int flags, int otyp, cred_t *cr)
521 {
522 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
523 	BT_CLEAR(kcpc_cpumap, getminor(dev));
524 	if (--kcpc_cpuctx == 0) {
525 		kmem_free(kcpc_cpumap, BT_SIZEOFMAP(max_cpuid + 1));
526 		kcpc_cpumap = NULL;
527 	}
528 	ASSERT(kcpc_cpuctx >= 0);
529 	rw_exit(&kcpc_cpuctx_lock);
530 
531 	return (0);
532 }
533 
534 /*
535  * Sane boundaries on the size of packed lists. In bytes.
536  */
537 #define	CPC_MIN_PACKSIZE 4
538 #define	CPC_MAX_PACKSIZE 10000
539 
540 /*
541  * Sane boundary on the number of requests a set can contain.
542  */
543 #define	CPC_MAX_NREQS 100
544 
545 /*
546  * Sane boundary on the number of attributes a request can contain.
547  */
548 #define	CPC_MAX_ATTRS 50
549 
550 /*
551  * Copy in a packed nvlist from the user and create a request set out of it.
552  * If successful, return 0 and store a pointer to the set we've created. Returns
553  * error code on error.
554  */
555 int
556 kcpc_copyin_set(kcpc_set_t **inset, void *ubuf, size_t len)
557 {
558 	kcpc_set_t	*set;
559 	int		i;
560 	int		j;
561 	char		*packbuf;
562 
563 	nvlist_t	*nvl;
564 	nvpair_t	*nvp = NULL;
565 
566 	nvlist_t	*attrs;
567 	nvpair_t	*nvp_attr;
568 	kcpc_attr_t	*attrp;
569 
570 	nvlist_t	**reqlist;
571 	uint_t		nreqs;
572 	uint64_t	uint64;
573 	uint32_t	uint32;
574 	uint32_t	setflags = (uint32_t)-1;
575 	char		*string;
576 	char		*name;
577 
578 	if (len < CPC_MIN_PACKSIZE || len > CPC_MAX_PACKSIZE)
579 		return (EINVAL);
580 
581 	packbuf = kmem_alloc(len, KM_SLEEP);
582 
583 	if (copyin(ubuf, packbuf, len) == -1) {
584 		kmem_free(packbuf, len);
585 		return (EFAULT);
586 	}
587 
588 	if (nvlist_unpack(packbuf, len, &nvl, KM_SLEEP) != 0) {
589 		kmem_free(packbuf, len);
590 		return (EINVAL);
591 	}
592 
593 	/*
594 	 * The nvlist has been unpacked so there is no need for the packed
595 	 * representation from this point on.
596 	 */
597 	kmem_free(packbuf, len);
598 
599 	i = 0;
600 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
601 		switch (nvpair_type(nvp)) {
602 		case DATA_TYPE_UINT32:
603 			if (strcmp(nvpair_name(nvp), "flags") != 0 ||
604 			    nvpair_value_uint32(nvp, &setflags) != 0) {
605 				nvlist_free(nvl);
606 				return (EINVAL);
607 			}
608 			break;
609 		case DATA_TYPE_NVLIST_ARRAY:
610 			if (strcmp(nvpair_name(nvp), "reqs") != 0 ||
611 			    nvpair_value_nvlist_array(nvp, &reqlist,
612 				&nreqs) != 0) {
613 				nvlist_free(nvl);
614 				return (EINVAL);
615 			}
616 			break;
617 		default:
618 			nvlist_free(nvl);
619 			return (EINVAL);
620 		}
621 		i++;
622 	}
623 
624 	/*
625 	 * There should be two members in the top-level nvlist:
626 	 * an array of nvlists consisting of the requests, and flags.
627 	 * Anything else is an invalid set.
628 	 */
629 	if (i != 2) {
630 		nvlist_free(nvl);
631 		return (EINVAL);
632 	}
633 
634 	if (nreqs > CPC_MAX_NREQS) {
635 		nvlist_free(nvl);
636 		return (EINVAL);
637 	}
638 
639 	/*
640 	 * The requests are now stored in the nvlist array at reqlist.
641 	 */
642 	set = kmem_alloc(sizeof (kcpc_set_t), KM_SLEEP);
643 	set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
644 	    nreqs, KM_SLEEP);
645 	set->ks_nreqs = nreqs;
646 	/*
647 	 * If the nvlist didn't contain a flags member, setflags was initialized
648 	 * with an illegal value and this set will fail sanity checks later on.
649 	 */
650 	set->ks_flags = setflags;
651 
652 	/*
653 	 * Build the set up one request at a time, always keeping it self-
654 	 * consistent so we can give it to kcpc_free_set() if we need to back
655 	 * out and return and error.
656 	 */
657 	for (i = 0; i < nreqs; i++) {
658 		nvp = NULL;
659 		set->ks_req[i].kr_picnum = -1;
660 		while ((nvp = nvlist_next_nvpair(reqlist[i], nvp)) != NULL) {
661 			name = nvpair_name(nvp);
662 			switch (nvpair_type(nvp)) {
663 			case DATA_TYPE_UINT32:
664 				if (nvpair_value_uint32(nvp, &uint32) == EINVAL)
665 					goto inval;
666 				if (strcmp(name, "cr_flags") == 0)
667 					set->ks_req[i].kr_flags = uint32;
668 				if (strcmp(name, "cr_index") == 0)
669 					set->ks_req[i].kr_index = uint32;
670 				break;
671 			case DATA_TYPE_UINT64:
672 				if (nvpair_value_uint64(nvp, &uint64) == EINVAL)
673 					goto inval;
674 				if (strcmp(name, "cr_preset") == 0)
675 					set->ks_req[i].kr_preset = uint64;
676 				break;
677 			case DATA_TYPE_STRING:
678 				if (nvpair_value_string(nvp, &string) == EINVAL)
679 					goto inval;
680 				if (strcmp(name, "cr_event") == 0)
681 					(void) strncpy(set->ks_req[i].kr_event,
682 					    string, CPC_MAX_EVENT_LEN);
683 				break;
684 			case DATA_TYPE_NVLIST:
685 				if (strcmp(name, "cr_attr") != 0)
686 					goto inval;
687 				if (nvpair_value_nvlist(nvp, &attrs) == EINVAL)
688 					goto inval;
689 				nvp_attr = NULL;
690 				/*
691 				 * If the picnum has been specified as an
692 				 * attribute, consume that attribute here and
693 				 * remove it from the list of attributes.
694 				 */
695 				if (nvlist_lookup_uint64(attrs, "picnum",
696 				    &uint64) == 0) {
697 					if (nvlist_remove(attrs, "picnum",
698 					    DATA_TYPE_UINT64) != 0)
699 						panic("nvlist %p faulty",
700 						    attrs);
701 					set->ks_req[i].kr_picnum = uint64;
702 				}
703 
704 				if ((set->ks_req[i].kr_nattrs =
705 				    kcpc_nvlist_npairs(attrs)) == 0)
706 					break;
707 
708 				if (set->ks_req[i].kr_nattrs > CPC_MAX_ATTRS)
709 					goto inval;
710 
711 				set->ks_req[i].kr_attr =
712 				    kmem_alloc(set->ks_req[i].kr_nattrs *
713 				    sizeof (kcpc_attr_t), KM_SLEEP);
714 				j = 0;
715 
716 				while ((nvp_attr = nvlist_next_nvpair(attrs,
717 				    nvp_attr)) != NULL) {
718 					attrp = &set->ks_req[i].kr_attr[j];
719 
720 					if (nvpair_type(nvp_attr) !=
721 					    DATA_TYPE_UINT64)
722 						goto inval;
723 
724 					(void) strncpy(attrp->ka_name,
725 					    nvpair_name(nvp_attr),
726 					    CPC_MAX_ATTR_LEN);
727 
728 					if (nvpair_value_uint64(nvp_attr,
729 					    &(attrp->ka_val)) == EINVAL)
730 						goto inval;
731 					j++;
732 				}
733 				ASSERT(j == set->ks_req[i].kr_nattrs);
734 			default:
735 				break;
736 			}
737 		}
738 	}
739 
740 	nvlist_free(nvl);
741 	*inset = set;
742 	return (0);
743 
744 inval:
745 	nvlist_free(nvl);
746 	kcpc_free_set(set);
747 	return (EINVAL);
748 }
749 
750 /*
751  * Count the number of nvpairs in the supplied nvlist.
752  */
753 static uint32_t
754 kcpc_nvlist_npairs(nvlist_t *list)
755 {
756 	nvpair_t *nvp = NULL;
757 	uint32_t n = 0;
758 
759 	while ((nvp = nvlist_next_nvpair(list, nvp)) != NULL)
760 		n++;
761 
762 	return (n);
763 }
764 
765 /*
766  * Performs sanity checks on the given set.
767  * Returns 0 if the set checks out OK.
768  * Returns a detailed error subcode, or -1 if there is no applicable subcode.
769  */
770 static int
771 kcpc_verify_set(kcpc_set_t *set)
772 {
773 	kcpc_request_t	*rp;
774 	int		i;
775 	uint64_t	bitmap = 0;
776 	int		n;
777 
778 	if (set->ks_nreqs > cpc_ncounters)
779 		return (-1);
780 
781 	if (CPC_SET_VALID_FLAGS(set->ks_flags) == 0)
782 		return (-1);
783 
784 	for (i = 0; i < set->ks_nreqs; i++) {
785 		rp = &set->ks_req[i];
786 
787 		/*
788 		 * The following comparison must cast cpc_ncounters to an int,
789 		 * because kr_picnum will be -1 if the request didn't explicitly
790 		 * choose a PIC.
791 		 */
792 		if (rp->kr_picnum >= (int)cpc_ncounters)
793 			return (CPC_INVALID_PICNUM);
794 
795 		/*
796 		 * Of the pics whose physical picnum has been specified, make
797 		 * sure each PIC appears only once in set.
798 		 */
799 		if ((n = set->ks_req[i].kr_picnum) != -1) {
800 			if ((bitmap & (1 << n)) != 0)
801 				return (-1);
802 			bitmap |= (1 << n);
803 		}
804 
805 		/*
806 		 * Make sure the requested index falls within the range of all
807 		 * requests.
808 		 */
809 		if (rp->kr_index < 0 || rp->kr_index >= set->ks_nreqs)
810 			return (-1);
811 
812 		/*
813 		 * Make sure there are no unknown flags.
814 		 */
815 		if (KCPC_REQ_VALID_FLAGS(rp->kr_flags) == 0)
816 			return (CPC_REQ_INVALID_FLAGS);
817 	}
818 
819 	return (0);
820 }
821 
/*
 * Character device entry points for the 'shared' device.  Only open, close
 * and ioctl are implemented by this driver (plus the generic ddi_prop_op);
 * every other slot is filled with the nodev/nochpoll stubs.
 */
static struct cb_ops cb_ops = {
	kcpc_open,
	kcpc_close,
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	kcpc_ioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_NEW | D_MP
};
839 
840 /*ARGSUSED*/
841 static int
842 kcpc_probe(dev_info_t *devi)
843 {
844 	return (DDI_PROBE_SUCCESS);
845 }
846 
847 static dev_info_t *kcpc_devi;
848 
849 static int
850 kcpc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
851 {
852 	if (cmd != DDI_ATTACH)
853 		return (DDI_FAILURE);
854 	kcpc_devi = devi;
855 	return (ddi_create_minor_node(devi, "shared", S_IFCHR,
856 	    KCPC_MINOR_SHARED, DDI_PSEUDO, 0));
857 }
858 
859 /*ARGSUSED*/
860 static int
861 kcpc_getinfo(dev_info_t *devi, ddi_info_cmd_t cmd, void *arg, void **result)
862 {
863 	switch (cmd) {
864 	case DDI_INFO_DEVT2DEVINFO:
865 		switch (getminor((dev_t)arg)) {
866 		case KCPC_MINOR_SHARED:
867 			*result = kcpc_devi;
868 			return (DDI_SUCCESS);
869 		default:
870 			break;
871 		}
872 		break;
873 	case DDI_INFO_DEVT2INSTANCE:
874 		*result = 0;
875 		return (DDI_SUCCESS);
876 	default:
877 		break;
878 	}
879 
880 	return (DDI_FAILURE);
881 }
882 
/*
 * Device operations for the cpc driver.  getinfo, probe and attach are
 * provided by this file; the detach and reset slots are nodev, and the
 * character entry points come from cb_ops.  There are no bus operations.
 */
static struct dev_ops dev_ops = {
	DEVO_REV,
	0,
	kcpc_getinfo,
	nulldev,		/* identify */
	kcpc_probe,
	kcpc_attach,
	nodev,			/* detach */
	nodev,			/* reset */
	&cb_ops,
	(struct bus_ops *)0
};
895 
/*
 * Module linkage for the device-driver half of this module: binds the
 * dev_ops above to the driver module type.
 */
static struct modldrv modldrv = {
	&mod_driverops,
	"cpc sampling driver v%I%",
	&dev_ops
};
901 
/*
 * System call table entry for cpc().  The leading 5 matches the number of
 * arguments cpc() takes.
 */
static struct sysent cpc_sysent = {
	5,
	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
	cpc
};
907 
/*
 * Module linkage for the system-call half of this module.
 */
static struct modlsys modlsys = {
	&mod_syscallops,
	"cpc sampling system call",
	&cpc_sysent
};
913 
#ifdef _SYSCALL32_IMPL
/*
 * Module linkage for the 32-bit system call; it shares cpc_sysent with the
 * native entry.  Only built when _SYSCALL32_IMPL is defined.
 */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit cpc sampling system call",
	&cpc_sysent
};
#endif
921 
/*
 * Top-level linkage handed to mod_install(), mod_remove() and mod_info():
 * the driver, the system call, and (when built) the 32-bit system call.
 */
static struct modlinkage modl = {
	MODREV_1,
	&modldrv,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
};
930 
931 static void
932 kcpc_init(void)
933 {
934 	long hash;
935 
936 	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
937 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
938 		mutex_init(&kcpc_ctx_llock[hash],
939 		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
940 }
941 
942 static void
943 kcpc_fini(void)
944 {
945 	long hash;
946 
947 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
948 		mutex_destroy(&kcpc_ctx_llock[hash]);
949 	rw_destroy(&kcpc_cpuctx_lock);
950 }
951 
952 int
953 _init(void)
954 {
955 	int ret;
956 
957 	if (kcpc_hw_load_pcbe() != 0)
958 		return (ENOTSUP);
959 
960 	kcpc_init();
961 	if ((ret = mod_install(&modl)) != 0)
962 		kcpc_fini();
963 	return (ret);
964 }
965 
966 int
967 _fini(void)
968 {
969 	int ret;
970 
971 	if ((ret = mod_remove(&modl)) == 0)
972 		kcpc_fini();
973 	return (ret);
974 }
975 
976 int
977 _info(struct modinfo *mi)
978 {
979 	return (mod_info(&modl, mi));
980 }
981