xref: /titanic_41/usr/src/uts/common/io/cpc.c (revision e11c3f44f531fdff80941ce57c065d2ae861cefc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 /*
28  * CPU Performance Counter system calls and device driver.
29  *
30  * This module uses a combination of thread context operators, and
31  * thread-specific data to export CPU performance counters
32  * via both a system call and a driver interface.
33  *
34  * There are three access methods exported - the 'shared' device
35  * and the 'private' and 'agent' variants of the system call.
36  *
37  * The shared device treats the performance counter registers as
38  * a processor metric, regardless of the work scheduled on them.
39  * The private system call treats the performance counter registers
40  * as a property of a single lwp.  This is achieved by using the
41  * thread context operators to virtualize the contents of the
42  * performance counter registers between lwps.
43  *
44  * The agent method is like the private method, except that it must
45  * be accessed via /proc's agent lwp to allow the counter context of
46  * other threads to be examined safely.
47  *
48  * The shared usage fundamentally conflicts with the agent and private usage;
49  * almost all of the complexity of the module is needed to allow these two
50  * models to co-exist in a reasonable way.
51  */
52 
53 #include <sys/types.h>
54 #include <sys/file.h>
55 #include <sys/errno.h>
56 #include <sys/open.h>
57 #include <sys/cred.h>
58 #include <sys/conf.h>
59 #include <sys/stat.h>
60 #include <sys/processor.h>
61 #include <sys/cpuvar.h>
62 #include <sys/disp.h>
63 #include <sys/kmem.h>
64 #include <sys/modctl.h>
65 #include <sys/ddi.h>
66 #include <sys/sunddi.h>
67 #include <sys/nvpair.h>
68 #include <sys/policy.h>
69 #include <sys/machsystm.h>
70 #include <sys/cpc_impl.h>
71 #include <sys/cpc_pcbe.h>
72 #include <sys/kcpc.h>
73 
74 static int kcpc_copyin_set(kcpc_set_t **set, void *ubuf, size_t len);
75 static int kcpc_verify_set(kcpc_set_t *set);
76 static uint32_t kcpc_nvlist_npairs(nvlist_t *list);
77 
78 /*
79  * Generic attributes supported regardless of processor.
80  */
81 
82 #define	ATTRLIST "picnum"
83 #define	SEPARATOR ","
84 
85 /*
86  * System call to access CPU performance counters.
87  */
88 static int
89 cpc(int cmd, id_t lwpid, void *udata1, void *udata2, void *udata3)
90 {
91 	kthread_t	*t;
92 	int		error;
93 	int		size;
94 	const char	*str;
95 	int		code;
96 
97 	/*
98 	 * This CPC syscall should only be loaded if it found a PCBE to use.
99 	 */
100 	ASSERT(pcbe_ops != NULL);
101 
102 	if (curproc->p_agenttp == curthread) {
103 		/*
104 		 * Only if /proc is invoking this system call from
105 		 * the agent thread do we allow the caller to examine
106 		 * the contexts of other lwps in the process.  And
107 		 * because we know we're the agent, we know we don't
108 		 * have to grab p_lock because no-one else can change
109 		 * the state of the process.
110 		 */
111 		if ((t = idtot(curproc, lwpid)) == NULL || t == curthread)
112 			return (set_errno(ESRCH));
113 		ASSERT(t->t_tid == lwpid && ttolwp(t) != NULL);
114 	} else
115 		t = curthread;
116 
117 	if (t->t_cpc_set == NULL && (cmd == CPC_SAMPLE || cmd == CPC_RELE))
118 		return (set_errno(EINVAL));
119 
120 	switch (cmd) {
121 	case CPC_BIND:
122 		/*
123 		 * udata1 = pointer to packed nvlist buffer
124 		 * udata2 = size of packed nvlist buffer
125 		 * udata3 = User addr to return error subcode in.
126 		 */
127 
128 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
129 		if (kcpc_cpuctx) {
130 			rw_exit(&kcpc_cpuctx_lock);
131 			return (set_errno(EAGAIN));
132 		}
133 
134 		if (kcpc_hw_lwp_hook() != 0) {
135 			rw_exit(&kcpc_cpuctx_lock);
136 			return (set_errno(EACCES));
137 		}
138 
139 		/*
140 		 * An LWP may only have one set bound to it at a time; if there
141 		 * is a set bound to this LWP already, we unbind it here.
142 		 */
143 		if (t->t_cpc_set != NULL)
144 			(void) kcpc_unbind(t->t_cpc_set);
145 		ASSERT(t->t_cpc_set == NULL);
146 
147 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
148 		    (size_t)udata2)) != 0) {
149 			rw_exit(&kcpc_cpuctx_lock);
150 			return (set_errno(error));
151 		}
152 
153 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
154 			rw_exit(&kcpc_cpuctx_lock);
155 			kcpc_free_set(t->t_cpc_set);
156 			t->t_cpc_set = NULL;
157 			if (copyout(&error, udata3, sizeof (error)) == -1)
158 				return (set_errno(EFAULT));
159 			return (set_errno(EINVAL));
160 		}
161 
162 		if ((error = kcpc_bind_thread(t->t_cpc_set, t, &code)) != 0) {
163 			rw_exit(&kcpc_cpuctx_lock);
164 			kcpc_free_set(t->t_cpc_set);
165 			t->t_cpc_set = NULL;
166 			/*
167 			 * EINVAL and EACCES are the only errors with more
168 			 * specific subcodes.
169 			 */
170 			if ((error == EINVAL || error == EACCES) &&
171 			    copyout(&code, udata3, sizeof (code)) == -1)
172 				return (set_errno(EFAULT));
173 			return (set_errno(error));
174 		}
175 
176 		rw_exit(&kcpc_cpuctx_lock);
177 		return (0);
178 	case CPC_SAMPLE:
179 		/*
180 		 * udata1 = pointer to user's buffer
181 		 * udata2 = pointer to user's hrtime
182 		 * udata3 = pointer to user's tick
183 		 */
184 		/*
185 		 * We only allow thread-bound sets to be sampled via the
186 		 * syscall, so if this set has a CPU-bound context, return an
187 		 * error.
188 		 */
189 		if (t->t_cpc_set->ks_ctx->kc_cpuid != -1)
190 			return (set_errno(EINVAL));
191 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
192 		    udata3)) != 0)
193 			return (set_errno(error));
194 
195 		return (0);
196 	case CPC_PRESET:
197 	case CPC_RESTART:
198 		/*
199 		 * These are valid only if this lwp has a bound set.
200 		 */
201 		if (t->t_cpc_set == NULL)
202 			return (set_errno(EINVAL));
203 		if (cmd == CPC_PRESET) {
204 			/*
205 			 * The preset is shipped up to us from userland in two
206 			 * parts. This lets us handle 64-bit values from 32-bit
207 			 * and 64-bit applications in the same manner.
208 			 *
209 			 * udata1 = index of request to preset
210 			 * udata2 = new 64-bit preset (most sig. 32 bits)
211 			 * udata3 = new 64-bit preset (least sig. 32 bits)
212 			 */
213 			if ((error = kcpc_preset(t->t_cpc_set, (intptr_t)udata1,
214 			    ((uint64_t)(uintptr_t)udata2 << 32ULL) |
215 			    (uint64_t)(uintptr_t)udata3)) != 0)
216 				return (set_errno(error));
217 		} else {
218 			/*
219 			 * udata[1-3] = unused
220 			 */
221 			if ((error = kcpc_restart(t->t_cpc_set)) != 0)
222 				return (set_errno(error));
223 		}
224 		return (0);
225 	case CPC_ENABLE:
226 	case CPC_DISABLE:
227 		udata1 = 0;
228 		/*FALLTHROUGH*/
229 	case CPC_USR_EVENTS:
230 	case CPC_SYS_EVENTS:
231 		if (t != curthread || t->t_cpc_set == NULL)
232 			return (set_errno(EINVAL));
233 		/*
234 		 * Provided for backwards compatibility with CPCv1.
235 		 *
236 		 * Stop the counters and record the current counts. Use the
237 		 * counts as the preset to rebind a new set with the requests
238 		 * reconfigured as requested.
239 		 *
240 		 * udata1: 1 == enable; 0 == disable
241 		 * udata{2,3}: unused
242 		 */
243 		rw_enter(&kcpc_cpuctx_lock, RW_READER);
244 		if ((error = kcpc_enable(t,
245 		    cmd, (int)(uintptr_t)udata1)) != 0) {
246 			rw_exit(&kcpc_cpuctx_lock);
247 			return (set_errno(error));
248 		}
249 		rw_exit(&kcpc_cpuctx_lock);
250 		return (0);
251 	case CPC_NPIC:
252 		return (cpc_ncounters);
253 	case CPC_CAPS:
254 		return (pcbe_ops->pcbe_caps);
255 	case CPC_EVLIST_SIZE:
256 	case CPC_LIST_EVENTS:
257 		/*
258 		 * udata1 = pointer to user's int or buffer
259 		 * udata2 = picnum
260 		 * udata3 = unused
261 		 */
262 		if ((uintptr_t)udata2 >= cpc_ncounters)
263 			return (set_errno(EINVAL));
264 
265 		size = strlen(
266 		    pcbe_ops->pcbe_list_events((uintptr_t)udata2)) + 1;
267 
268 		if (cmd == CPC_EVLIST_SIZE) {
269 			if (suword32(udata1, size) == -1)
270 				return (set_errno(EFAULT));
271 		} else {
272 			if (copyout(
273 			    pcbe_ops->pcbe_list_events((uintptr_t)udata2),
274 			    udata1, size) == -1)
275 				return (set_errno(EFAULT));
276 		}
277 		return (0);
278 	case CPC_ATTRLIST_SIZE:
279 	case CPC_LIST_ATTRS:
280 		/*
281 		 * udata1 = pointer to user's int or buffer
282 		 * udata2 = unused
283 		 * udata3 = unused
284 		 *
285 		 * attrlist size is length of PCBE-supported attributes, plus
286 		 * room for "picnum\0" plus an optional ',' separator char.
287 		 */
288 		str = pcbe_ops->pcbe_list_attrs();
289 		size = strlen(str) + sizeof (SEPARATOR ATTRLIST) + 1;
290 		if (str[0] != '\0')
291 			/*
292 			 * A ',' separator character is necessary.
293 			 */
294 			size += 1;
295 
296 		if (cmd == CPC_ATTRLIST_SIZE) {
297 			if (suword32(udata1, size) == -1)
298 				return (set_errno(EFAULT));
299 		} else {
300 			/*
301 			 * Copyout the PCBE attributes, and then append the
302 			 * generic attribute list (with separator if necessary).
303 			 */
304 			if (copyout(str, udata1, strlen(str)) == -1)
305 				return (set_errno(EFAULT));
306 			if (str[0] != '\0') {
307 				if (copyout(SEPARATOR ATTRLIST,
308 				    ((char *)udata1) + strlen(str),
309 				    strlen(SEPARATOR ATTRLIST) + 1)
310 				    == -1)
311 					return (set_errno(EFAULT));
312 			} else
313 				if (copyout(ATTRLIST,
314 				    (char *)udata1 + strlen(str),
315 				    strlen(ATTRLIST) + 1) == -1)
316 					return (set_errno(EFAULT));
317 		}
318 		return (0);
319 	case CPC_IMPL_NAME:
320 	case CPC_CPUREF:
321 		/*
322 		 * udata1 = pointer to user's buffer
323 		 * udata2 = unused
324 		 * udata3 = unused
325 		 */
326 		if (cmd == CPC_IMPL_NAME) {
327 			str = pcbe_ops->pcbe_impl_name();
328 			ASSERT(strlen(str) < CPC_MAX_IMPL_NAME);
329 		} else {
330 			str = pcbe_ops->pcbe_cpuref();
331 			ASSERT(strlen(str) < CPC_MAX_CPUREF);
332 		}
333 
334 		if (copyout(str, udata1, strlen(str) + 1) != 0)
335 			return (set_errno(EFAULT));
336 		return (0);
337 	case CPC_INVALIDATE:
338 		kcpc_invalidate(t);
339 		return (0);
340 	case CPC_RELE:
341 		if ((error = kcpc_unbind(t->t_cpc_set)) != 0)
342 			return (set_errno(error));
343 		return (0);
344 	default:
345 		return (set_errno(EINVAL));
346 	}
347 }
348 
349 /*
350  * The 'shared' device allows direct access to the
351  * performance counter control register of the current CPU.
352  * The major difference between the contexts created here and those
353  * above is that the context handlers are -not- installed, thus
354  * no context switching behaviour occurs.
355  *
356  * Because they manipulate per-cpu state, these ioctls can
357  * only be invoked from a bound lwp, by a caller with the cpc_cpu privilege
358  * who can open the relevant entry in /devices (the act of holding it open
359  * causes other uses of the counters to be suspended).
360  *
361  * Note that for correct results, the caller -must- ensure that
362  * all existing per-lwp contexts are either inactive or marked invalid;
363  * that's what the open routine does.
364  */
365 /*ARGSUSED*/
366 static int
367 kcpc_ioctl(dev_t dev, int cmd, intptr_t data, int flags, cred_t *cr, int *rvp)
368 {
369 	kthread_t	*t = curthread;
370 	processorid_t	cpuid;
371 	void		*udata1 = NULL;
372 	void		*udata2 = NULL;
373 	void		*udata3 = NULL;
374 	int		error;
375 	int		code;
376 
377 	STRUCT_DECL(__cpc_args, args);
378 
379 	STRUCT_INIT(args, flags);
380 
381 	if (curthread->t_bind_cpu != getminor(dev))
382 		return (EAGAIN);  /* someone unbound it? */
383 
384 	cpuid = getminor(dev);
385 
386 	if (cmd == CPCIO_BIND || cmd == CPCIO_SAMPLE) {
387 		if (copyin((void *)data, STRUCT_BUF(args),
388 		    STRUCT_SIZE(args)) == -1)
389 			return (EFAULT);
390 
391 		udata1 = STRUCT_FGETP(args, udata1);
392 		udata2 = STRUCT_FGETP(args, udata2);
393 		udata3 = STRUCT_FGETP(args, udata3);
394 	}
395 
396 	switch (cmd) {
397 	case CPCIO_BIND:
398 		/*
399 		 * udata1 = pointer to packed nvlist buffer
400 		 * udata2 = size of packed nvlist buffer
401 		 * udata3 = User addr to return error subcode in.
402 		 */
403 		if (t->t_cpc_set != NULL) {
404 			(void) kcpc_unbind(t->t_cpc_set);
405 			ASSERT(t->t_cpc_set == NULL);
406 		}
407 
408 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
409 		    (size_t)udata2)) != 0) {
410 			return (error);
411 		}
412 
413 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
414 			kcpc_free_set(t->t_cpc_set);
415 			t->t_cpc_set = NULL;
416 			if (copyout(&error, udata3, sizeof (error)) == -1)
417 				return (EFAULT);
418 			return (EINVAL);
419 		}
420 
421 		if ((error = kcpc_bind_cpu(t->t_cpc_set, cpuid, &code)) != 0) {
422 			kcpc_free_set(t->t_cpc_set);
423 			t->t_cpc_set = NULL;
424 			/*
425 			 * Subcodes are only returned for EINVAL and EACCESS.
426 			 */
427 			if ((error == EINVAL || error == EACCES) &&
428 			    copyout(&code, udata3, sizeof (code)) == -1)
429 				return (EFAULT);
430 			return (error);
431 		}
432 
433 		return (0);
434 	case CPCIO_SAMPLE:
435 		/*
436 		 * udata1 = pointer to user's buffer
437 		 * udata2 = pointer to user's hrtime
438 		 * udata3 = pointer to user's tick
439 		 */
440 		/*
441 		 * Only CPU-bound sets may be sampled via the ioctl(). If this
442 		 * set has no CPU-bound context, return an error.
443 		 */
444 		if (t->t_cpc_set == NULL)
445 			return (EINVAL);
446 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
447 		    udata3)) != 0)
448 			return (error);
449 		return (0);
450 	case CPCIO_RELE:
451 		if (t->t_cpc_set == NULL)
452 			return (EINVAL);
453 		return (kcpc_unbind(t->t_cpc_set));
454 	default:
455 		return (EINVAL);
456 	}
457 }
458 
459 /*
460  * The device supports multiple opens, but only one open
461  * is allowed per processor.  This is to enable multiple
462  * instances of tools looking at different processors.
463  */
464 #define	KCPC_MINOR_SHARED		((minor_t)0x3fffful)
465 
466 static ulong_t *kcpc_cpumap;		/* bitmap of cpus */
467 
468 /*ARGSUSED1*/
469 static int
470 kcpc_open(dev_t *dev, int flags, int otyp, cred_t *cr)
471 {
472 	processorid_t	cpuid;
473 	int		error;
474 
475 	ASSERT(pcbe_ops != NULL);
476 
477 	if ((error = secpolicy_cpc_cpu(cr)) != 0)
478 		return (error);
479 	if (getminor(*dev) != KCPC_MINOR_SHARED)
480 		return (ENXIO);
481 	if ((cpuid = curthread->t_bind_cpu) == PBIND_NONE)
482 		return (EINVAL);
483 	if (cpuid > max_cpuid)
484 		return (EINVAL);
485 
486 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
487 	if (++kcpc_cpuctx == 1) {
488 		ASSERT(kcpc_cpumap == NULL);
489 		kcpc_cpumap = kmem_zalloc(BT_SIZEOFMAP(max_cpuid + 1),
490 		    KM_SLEEP);
491 		/*
492 		 * When this device is open for processor-based contexts,
493 		 * no further lwp-based contexts can be created.
494 		 *
495 		 * Since this is the first open, ensure that all existing
496 		 * contexts are invalidated.
497 		 */
498 		kcpc_invalidate_all();
499 	} else if (BT_TEST(kcpc_cpumap, cpuid)) {
500 		kcpc_cpuctx--;
501 		rw_exit(&kcpc_cpuctx_lock);
502 		return (EAGAIN);
503 	} else if (kcpc_hw_cpu_hook(cpuid, kcpc_cpumap) != 0) {
504 		kcpc_cpuctx--;
505 		rw_exit(&kcpc_cpuctx_lock);
506 		return (EACCES);
507 	}
508 	BT_SET(kcpc_cpumap, cpuid);
509 	rw_exit(&kcpc_cpuctx_lock);
510 
511 	*dev = makedevice(getmajor(*dev), (minor_t)cpuid);
512 
513 	return (0);
514 }
515 
516 /*ARGSUSED1*/
517 static int
518 kcpc_close(dev_t dev, int flags, int otyp, cred_t *cr)
519 {
520 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
521 	BT_CLEAR(kcpc_cpumap, getminor(dev));
522 	if (--kcpc_cpuctx == 0) {
523 		kmem_free(kcpc_cpumap, BT_SIZEOFMAP(max_cpuid + 1));
524 		kcpc_cpumap = NULL;
525 	}
526 	ASSERT(kcpc_cpuctx >= 0);
527 	rw_exit(&kcpc_cpuctx_lock);
528 
529 	return (0);
530 }
531 
532 /*
533  * Sane boundaries on the size of packed lists. In bytes.
534  */
535 #define	CPC_MIN_PACKSIZE 4
536 #define	CPC_MAX_PACKSIZE 10000
537 
538 /*
539  * Sane boundary on the number of requests a set can contain.
540  */
541 #define	CPC_MAX_NREQS 100
542 
543 /*
544  * Sane boundary on the number of attributes a request can contain.
545  */
546 #define	CPC_MAX_ATTRS 50
547 
548 /*
549  * Copy in a packed nvlist from the user and create a request set out of it.
550  * If successful, return 0 and store a pointer to the set we've created. Returns
551  * error code on error.
552  */
553 int
554 kcpc_copyin_set(kcpc_set_t **inset, void *ubuf, size_t len)
555 {
556 	kcpc_set_t	*set;
557 	int		i;
558 	int		j;
559 	char		*packbuf;
560 
561 	nvlist_t	*nvl;
562 	nvpair_t	*nvp = NULL;
563 
564 	nvlist_t	*attrs;
565 	nvpair_t	*nvp_attr;
566 	kcpc_attr_t	*attrp;
567 
568 	nvlist_t	**reqlist;
569 	uint_t		nreqs;
570 	uint64_t	uint64;
571 	uint32_t	uint32;
572 	uint32_t	setflags = (uint32_t)-1;
573 	char		*string;
574 	char		*name;
575 
576 	if (len < CPC_MIN_PACKSIZE || len > CPC_MAX_PACKSIZE)
577 		return (EINVAL);
578 
579 	packbuf = kmem_alloc(len, KM_SLEEP);
580 
581 	if (copyin(ubuf, packbuf, len) == -1) {
582 		kmem_free(packbuf, len);
583 		return (EFAULT);
584 	}
585 
586 	if (nvlist_unpack(packbuf, len, &nvl, KM_SLEEP) != 0) {
587 		kmem_free(packbuf, len);
588 		return (EINVAL);
589 	}
590 
591 	/*
592 	 * The nvlist has been unpacked so there is no need for the packed
593 	 * representation from this point on.
594 	 */
595 	kmem_free(packbuf, len);
596 
597 	i = 0;
598 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
599 		switch (nvpair_type(nvp)) {
600 		case DATA_TYPE_UINT32:
601 			if (strcmp(nvpair_name(nvp), "flags") != 0 ||
602 			    nvpair_value_uint32(nvp, &setflags) != 0) {
603 				nvlist_free(nvl);
604 				return (EINVAL);
605 			}
606 			break;
607 		case DATA_TYPE_NVLIST_ARRAY:
608 			if (strcmp(nvpair_name(nvp), "reqs") != 0 ||
609 			    nvpair_value_nvlist_array(nvp, &reqlist,
610 			    &nreqs) != 0) {
611 				nvlist_free(nvl);
612 				return (EINVAL);
613 			}
614 			break;
615 		default:
616 			nvlist_free(nvl);
617 			return (EINVAL);
618 		}
619 		i++;
620 	}
621 
622 	/*
623 	 * There should be two members in the top-level nvlist:
624 	 * an array of nvlists consisting of the requests, and flags.
625 	 * Anything else is an invalid set.
626 	 */
627 	if (i != 2) {
628 		nvlist_free(nvl);
629 		return (EINVAL);
630 	}
631 
632 	if (nreqs > CPC_MAX_NREQS) {
633 		nvlist_free(nvl);
634 		return (EINVAL);
635 	}
636 
637 	/*
638 	 * The requests are now stored in the nvlist array at reqlist.
639 	 * Note that the use of kmem_zalloc() to alloc the kcpc_set_t means
640 	 * we don't need to call the init routines for ks_lock and ks_condv.
641 	 */
642 	set = kmem_zalloc(sizeof (kcpc_set_t), KM_SLEEP);
643 	set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
644 	    nreqs, KM_SLEEP);
645 	set->ks_nreqs = nreqs;
646 	/*
647 	 * If the nvlist didn't contain a flags member, setflags was initialized
648 	 * with an illegal value and this set will fail sanity checks later on.
649 	 */
650 	set->ks_flags = setflags;
651 	/*
652 	 * Initialize bind/unbind set synchronization.
653 	 */
654 	set->ks_state &= ~KCPC_SET_BOUND;
655 
656 	/*
657 	 * Build the set up one request at a time, always keeping it self-
658 	 * consistent so we can give it to kcpc_free_set() if we need to back
659 	 * out and return and error.
660 	 */
661 	for (i = 0; i < nreqs; i++) {
662 		nvp = NULL;
663 		set->ks_req[i].kr_picnum = -1;
664 		while ((nvp = nvlist_next_nvpair(reqlist[i], nvp)) != NULL) {
665 			name = nvpair_name(nvp);
666 			switch (nvpair_type(nvp)) {
667 			case DATA_TYPE_UINT32:
668 				if (nvpair_value_uint32(nvp, &uint32) == EINVAL)
669 					goto inval;
670 				if (strcmp(name, "cr_flags") == 0)
671 					set->ks_req[i].kr_flags = uint32;
672 				if (strcmp(name, "cr_index") == 0)
673 					set->ks_req[i].kr_index = uint32;
674 				break;
675 			case DATA_TYPE_UINT64:
676 				if (nvpair_value_uint64(nvp, &uint64) == EINVAL)
677 					goto inval;
678 				if (strcmp(name, "cr_preset") == 0)
679 					set->ks_req[i].kr_preset = uint64;
680 				break;
681 			case DATA_TYPE_STRING:
682 				if (nvpair_value_string(nvp, &string) == EINVAL)
683 					goto inval;
684 				if (strcmp(name, "cr_event") == 0)
685 					(void) strncpy(set->ks_req[i].kr_event,
686 					    string, CPC_MAX_EVENT_LEN);
687 				break;
688 			case DATA_TYPE_NVLIST:
689 				if (strcmp(name, "cr_attr") != 0)
690 					goto inval;
691 				if (nvpair_value_nvlist(nvp, &attrs) == EINVAL)
692 					goto inval;
693 				nvp_attr = NULL;
694 				/*
695 				 * If the picnum has been specified as an
696 				 * attribute, consume that attribute here and
697 				 * remove it from the list of attributes.
698 				 */
699 				if (nvlist_lookup_uint64(attrs, "picnum",
700 				    &uint64) == 0) {
701 					if (nvlist_remove(attrs, "picnum",
702 					    DATA_TYPE_UINT64) != 0)
703 						panic("nvlist %p faulty",
704 						    (void *)attrs);
705 					set->ks_req[i].kr_picnum = uint64;
706 				}
707 
708 				if ((set->ks_req[i].kr_nattrs =
709 				    kcpc_nvlist_npairs(attrs)) == 0)
710 					break;
711 
712 				if (set->ks_req[i].kr_nattrs > CPC_MAX_ATTRS)
713 					goto inval;
714 
715 				set->ks_req[i].kr_attr =
716 				    kmem_alloc(set->ks_req[i].kr_nattrs *
717 				    sizeof (kcpc_attr_t), KM_SLEEP);
718 				j = 0;
719 
720 				while ((nvp_attr = nvlist_next_nvpair(attrs,
721 				    nvp_attr)) != NULL) {
722 					attrp = &set->ks_req[i].kr_attr[j];
723 
724 					if (nvpair_type(nvp_attr) !=
725 					    DATA_TYPE_UINT64)
726 						goto inval;
727 
728 					(void) strncpy(attrp->ka_name,
729 					    nvpair_name(nvp_attr),
730 					    CPC_MAX_ATTR_LEN);
731 
732 					if (nvpair_value_uint64(nvp_attr,
733 					    &(attrp->ka_val)) == EINVAL)
734 						goto inval;
735 					j++;
736 				}
737 				ASSERT(j == set->ks_req[i].kr_nattrs);
738 			default:
739 				break;
740 			}
741 		}
742 	}
743 
744 	nvlist_free(nvl);
745 	*inset = set;
746 	return (0);
747 
748 inval:
749 	nvlist_free(nvl);
750 	kcpc_free_set(set);
751 	return (EINVAL);
752 }
753 
754 /*
755  * Count the number of nvpairs in the supplied nvlist.
756  */
757 static uint32_t
758 kcpc_nvlist_npairs(nvlist_t *list)
759 {
760 	nvpair_t *nvp = NULL;
761 	uint32_t n = 0;
762 
763 	while ((nvp = nvlist_next_nvpair(list, nvp)) != NULL)
764 		n++;
765 
766 	return (n);
767 }
768 
769 /*
770  * Performs sanity checks on the given set.
771  * Returns 0 if the set checks out OK.
772  * Returns a detailed error subcode, or -1 if there is no applicable subcode.
773  */
774 static int
775 kcpc_verify_set(kcpc_set_t *set)
776 {
777 	kcpc_request_t	*rp;
778 	int		i;
779 	uint64_t	bitmap = 0;
780 	int		n;
781 
782 	if (set->ks_nreqs > cpc_ncounters)
783 		return (-1);
784 
785 	if (CPC_SET_VALID_FLAGS(set->ks_flags) == 0)
786 		return (-1);
787 
788 	for (i = 0; i < set->ks_nreqs; i++) {
789 		rp = &set->ks_req[i];
790 
791 		/*
792 		 * The following comparison must cast cpc_ncounters to an int,
793 		 * because kr_picnum will be -1 if the request didn't explicitly
794 		 * choose a PIC.
795 		 */
796 		if (rp->kr_picnum >= (int)cpc_ncounters)
797 			return (CPC_INVALID_PICNUM);
798 
799 		/*
800 		 * Of the pics whose physical picnum has been specified, make
801 		 * sure each PIC appears only once in set.
802 		 */
803 		if ((n = set->ks_req[i].kr_picnum) != -1) {
804 			if ((bitmap & (1 << n)) != 0)
805 				return (-1);
806 			bitmap |= (1 << n);
807 		}
808 
809 		/*
810 		 * Make sure the requested index falls within the range of all
811 		 * requests.
812 		 */
813 		if (rp->kr_index < 0 || rp->kr_index >= set->ks_nreqs)
814 			return (-1);
815 
816 		/*
817 		 * Make sure there are no unknown flags.
818 		 */
819 		if (KCPC_REQ_VALID_FLAGS(rp->kr_flags) == 0)
820 			return (CPC_REQ_INVALID_FLAGS);
821 	}
822 
823 	return (0);
824 }
825 
826 static struct cb_ops cb_ops = {
827 	kcpc_open,
828 	kcpc_close,
829 	nodev,		/* strategy */
830 	nodev,		/* print */
831 	nodev,		/* dump */
832 	nodev,		/* read */
833 	nodev,		/* write */
834 	kcpc_ioctl,
835 	nodev,		/* devmap */
836 	nodev,		/* mmap */
837 	nodev,		/* segmap */
838 	nochpoll,	/* poll */
839 	ddi_prop_op,
840 	NULL,
841 	D_NEW | D_MP
842 };
843 
844 /*ARGSUSED*/
845 static int
846 kcpc_probe(dev_info_t *devi)
847 {
848 	return (DDI_PROBE_SUCCESS);
849 }
850 
851 static dev_info_t *kcpc_devi;
852 
853 static int
854 kcpc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
855 {
856 	if (cmd != DDI_ATTACH)
857 		return (DDI_FAILURE);
858 	kcpc_devi = devi;
859 	return (ddi_create_minor_node(devi, "shared", S_IFCHR,
860 	    KCPC_MINOR_SHARED, DDI_PSEUDO, 0));
861 }
862 
863 /*ARGSUSED*/
864 static int
865 kcpc_getinfo(dev_info_t *devi, ddi_info_cmd_t cmd, void *arg, void **result)
866 {
867 	switch (cmd) {
868 	case DDI_INFO_DEVT2DEVINFO:
869 		switch (getminor((dev_t)arg)) {
870 		case KCPC_MINOR_SHARED:
871 			*result = kcpc_devi;
872 			return (DDI_SUCCESS);
873 		default:
874 			break;
875 		}
876 		break;
877 	case DDI_INFO_DEVT2INSTANCE:
878 		*result = 0;
879 		return (DDI_SUCCESS);
880 	default:
881 		break;
882 	}
883 
884 	return (DDI_FAILURE);
885 }
886 
887 static struct dev_ops dev_ops = {
888 	DEVO_REV,
889 	0,
890 	kcpc_getinfo,
891 	nulldev,		/* identify */
892 	kcpc_probe,
893 	kcpc_attach,
894 	nodev,			/* detach */
895 	nodev,			/* reset */
896 	&cb_ops,
897 	(struct bus_ops *)0,
898 	NULL,
899 	ddi_quiesce_not_needed,		/* quiesce */
900 };
901 
902 static struct modldrv modldrv = {
903 	&mod_driverops,
904 	"cpc sampling driver",
905 	&dev_ops
906 };
907 
908 static struct sysent cpc_sysent = {
909 	5,
910 	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
911 	cpc
912 };
913 
914 static struct modlsys modlsys = {
915 	&mod_syscallops,
916 	"cpc sampling system call",
917 	&cpc_sysent
918 };
919 
#ifdef _SYSCALL32_IMPL
/*
 * Linkage element for the 32-bit system call; it shares the sysent entry
 * used by modlsys.
 */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit cpc sampling system call",
	&cpc_sysent,
};
#endif
927 
928 static struct modlinkage modl = {
929 	MODREV_1,
930 	&modldrv,
931 	&modlsys,
932 #ifdef _SYSCALL32_IMPL
933 	&modlsys32,
934 #endif
935 };
936 
937 static void
938 kcpc_init(void)
939 {
940 	long hash;
941 
942 	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
943 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
944 		mutex_init(&kcpc_ctx_llock[hash],
945 		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
946 }
947 
948 static void
949 kcpc_fini(void)
950 {
951 	long hash;
952 
953 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
954 		mutex_destroy(&kcpc_ctx_llock[hash]);
955 	rw_destroy(&kcpc_cpuctx_lock);
956 }
957 
958 int
959 _init(void)
960 {
961 	int ret;
962 
963 	if (kcpc_hw_load_pcbe() != 0)
964 		return (ENOTSUP);
965 
966 	kcpc_init();
967 	if ((ret = mod_install(&modl)) != 0)
968 		kcpc_fini();
969 	return (ret);
970 }
971 
972 int
973 _fini(void)
974 {
975 	int ret;
976 
977 	if ((ret = mod_remove(&modl)) == 0)
978 		kcpc_fini();
979 	return (ret);
980 }
981 
982 int
983 _info(struct modinfo *mi)
984 {
985 	return (mod_info(&modl, mi));
986 }
987