xref: /titanic_41/usr/src/uts/common/io/devinfo.c (revision 70025d765b044c6d8594bb965a2247a61e991a99)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * driver for accessing kernel devinfo tree.
31  */
32 #include <sys/types.h>
33 #include <sys/pathname.h>
34 #include <sys/debug.h>
35 #include <sys/autoconf.h>
36 #include <sys/conf.h>
37 #include <sys/file.h>
38 #include <sys/kmem.h>
39 #include <sys/modctl.h>
40 #include <sys/stat.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/sunldi_impl.h>
44 #include <sys/sunndi.h>
45 #include <sys/esunddi.h>
46 #include <sys/sunmdi.h>
47 #include <sys/ddi_impldefs.h>
48 #include <sys/ndi_impldefs.h>
49 #include <sys/mdi_impldefs.h>
50 #include <sys/devinfo_impl.h>
51 #include <sys/thread.h>
52 #include <sys/modhash.h>
53 #include <sys/bitmap.h>
54 #include <util/qsort.h>
55 #include <sys/disp.h>
56 #include <sys/kobj.h>
57 #include <sys/crc32.h>
58 
59 
60 #ifdef DEBUG
61 static int di_debug;
62 #define	dcmn_err(args) if (di_debug >= 1) cmn_err args
63 #define	dcmn_err2(args) if (di_debug >= 2) cmn_err args
64 #define	dcmn_err3(args) if (di_debug >= 3) cmn_err args
65 #else
66 #define	dcmn_err(args) /* nothing */
67 #define	dcmn_err2(args) /* nothing */
68 #define	dcmn_err3(args) /* nothing */
69 #endif
70 
71 /*
72  * We partition the space of devinfo minor nodes equally between the full and
73  * unprivileged versions of the driver.  The even-numbered minor nodes are the
74  * full version, while the odd-numbered ones are the read-only version.
75  */
76 static int di_max_opens = 32;
77 
78 #define	DI_FULL_PARENT		0
79 #define	DI_READONLY_PARENT	1
80 #define	DI_NODE_SPECIES		2
81 #define	DI_UNPRIVILEGED_NODE(x)	(((x) % 2) != 0)
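
/*
 * For example, with di_max_opens of 32 the full-access instances use
 * the even minors 0, 2, ..., 30 (DI_UNPRIVILEGED_NODE() == 0), while
 * the read-only instances use the odd minors 1, 3, ..., 31.
 */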
82 
83 #define	IOC_IDLE	0	/* snapshot ioctl states */
84 #define	IOC_SNAP	1	/* snapshot in progress */
85 #define	IOC_DONE	2	/* snapshot done, but not copied out */
86 #define	IOC_COPY	3	/* copyout in progress */
87 
88 /*
89  * Keep max alignment so we can move snapshot to different platforms
90  */
91 #define	DI_ALIGN(addr)	((addr + 7l) & ~7l)
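
/*
 * For example, DI_ALIGN(1) == 8 and DI_ALIGN(9) == 16: every offset is
 * rounded up to the next 8-byte boundary, the strictest natural
 * alignment we expect, so a snapshot written on one platform can be
 * decoded on another.
 */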
92 
93 /*
94  * To avoid wasting memory, make a linked list of memory chunks.
95  * Size of each chunk is buf_size.
96  */
97 struct di_mem {
98 	struct di_mem *next;	/* link to next chunk */
99 	char *buf;		/* contiguous kernel memory */
100 	size_t buf_size;	/* size of buf in bytes */
101 	devmap_cookie_t cook;	/* cookie from ddi_umem_alloc */
102 };
103 
104 /*
105  * This is a stack for walking the tree without using recursion.
106  * When the devinfo tree height is above some small size, one
107  * gets watchdog resets on sun4m.
108  */
109 struct di_stack {
110 	void		*offset[MAX_TREE_DEPTH];
111 	struct dev_info *dip[MAX_TREE_DEPTH];
112 	int		circ[MAX_TREE_DEPTH];
113 	int		depth;	/* depth of current node to be copied */
114 };
115 
116 #define	TOP_OFFSET(stack)	\
117 	((di_off_t *)(stack)->offset[(stack)->depth - 1])
118 #define	TOP_NODE(stack)		\
119 	((stack)->dip[(stack)->depth - 1])
120 #define	PARENT_OFFSET(stack)	\
121 	((di_off_t *)(stack)->offset[(stack)->depth - 2])
122 #define	EMPTY_STACK(stack)	((stack)->depth == 0)
123 #define	POP_STACK(stack)	{ \
124 	ndi_devi_exit((dev_info_t *)TOP_NODE(stack), \
125 		(stack)->circ[(stack)->depth - 1]); \
126 	((stack)->depth--); \
127 }
128 #define	PUSH_STACK(stack, node, offp)	{ \
129 	ASSERT(node != NULL); \
130 	ndi_devi_enter((dev_info_t *)node, &(stack)->circ[(stack)->depth]); \
131 	(stack)->dip[(stack)->depth] = (node); \
132 	(stack)->offset[(stack)->depth] = (void *)(offp); \
133 	((stack)->depth)++; \
134 }
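
/*
 * A minimal sketch of the intended walk (di_copytree() below is the
 * real consumer): PUSH_STACK() holds each node via ndi_devi_enter()
 * and POP_STACK() releases it, so every node on the stack (the path
 * from the root down to the node being copied) remains busy.
 */
#if 0	/* illustrative only */
	struct di_stack *dsp = kmem_zalloc(sizeof (struct di_stack), KM_SLEEP);

	PUSH_STACK(dsp, root, off_p);
	while (!EMPTY_STACK(dsp))
		(void) di_copynode(dsp, st);	/* pushes/pops children */
	kmem_free(dsp, sizeof (struct di_stack));
#endif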
135 
136 #define	DI_ALL_PTR(s)	((struct di_all *)di_mem_addr((s), 0))
137 
138 /*
139  * With devfs, the device tree has no global locks. The device tree is
140  * dynamic and dips may come and go if they are not locked locally. Under
141  * these conditions, pointers are no longer reliable as unique IDs.
142  * Specifically, these pointers cannot be used as keys for hash tables
143  * as the same devinfo structure may be freed in one part of the tree only
144  * to be allocated as the structure for a different device in another
145  * part of the tree. This can happen if DR and the snapshot are
146  * happening concurrently.
147  * The following data structures act as keys for devinfo nodes and
148  * pathinfo nodes.
149  */
150 
151 enum di_ktype {
152 	DI_DKEY = 1,
153 	DI_PKEY = 2
154 };
155 
156 struct di_dkey {
157 	dev_info_t	*dk_dip;
158 	major_t		dk_major;
159 	int		dk_inst;
160 	pnode_t		dk_nodeid;
161 };
162 
163 struct di_pkey {
164 	mdi_pathinfo_t	*pk_pip;
165 	char		*pk_path_addr;
166 	dev_info_t	*pk_client;
167 	dev_info_t	*pk_phci;
168 };
169 
170 struct di_key {
171 	enum di_ktype	k_type;
172 	union {
173 		struct di_dkey dkey;
174 		struct di_pkey pkey;
175 	} k_u;
176 };
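
/*
 * A sketch of how a dkey might be filled in before insertion into the
 * dip hash (di_register_dip() does the real work); the pointer alone
 * is not unique across DR, so the major, instance and PROM nodeid are
 * captured alongside it:
 */
#if 0	/* illustrative only */
	struct di_key *key = kmem_zalloc(sizeof (struct di_key), KM_SLEEP);

	key->k_type = DI_DKEY;
	key->k_u.dkey.dk_dip = dip;
	key->k_u.dkey.dk_major = DEVI(dip)->devi_major;
	key->k_u.dkey.dk_inst = DEVI(dip)->devi_instance;
	key->k_u.dkey.dk_nodeid = DEVI(dip)->devi_nodeid;
#endif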
177 
178 
179 struct i_lnode;
180 
181 typedef struct i_link {
182 	/*
183 	 * If a di_link struct representing this i_link struct makes it
184 	 * into the snapshot, then self will point to the offset of
185 	 * the di_link struct in the snapshot
186 	 */
187 	di_off_t	self;
188 
189 	int		spec_type;	/* block or char access type */
190 	struct i_lnode	*src_lnode;	/* src i_lnode */
191 	struct i_lnode	*tgt_lnode;	/* tgt i_lnode */
192 	struct i_link	*src_link_next;	/* next src i_link w/ same i_lnode */
193 	struct i_link	*tgt_link_next;	/* next tgt i_link w/ same i_lnode */
194 } i_link_t;
195 
196 typedef struct i_lnode {
197 	/*
198 	 * If a di_lnode struct representing this i_lnode struct makes it
199 	 * into the snapshot, then self will point to the offset of
200 	 * the di_lnode struct in the snapshot
201 	 */
202 	di_off_t	self;
203 
204 	/*
205 	 * used for hashing and comparing i_lnodes
206 	 */
207 	int		modid;
208 
209 	/*
210 	 * public information describing a link endpoint
211 	 */
212 	struct di_node	*di_node;	/* di_node in snapshot */
213 	dev_t		devt;		/* devt */
214 
215 	/*
216 	 * i_link ptr to links coming into this i_lnode node
217 	 * (this i_lnode is the target of these i_links)
218 	 */
219 	i_link_t	*link_in;
220 
221 	/*
222 	 * i_link ptr to links going out of this i_lnode node
223 	 * (this i_lnode is the source of these i_links)
224 	 */
225 	i_link_t	*link_out;
226 } i_lnode_t;
227 
228 /*
229  * Soft state associated with each instance of driver open.
230  */
231 static struct di_state {
232 	di_off_t mem_size;	/* total # bytes in memlist	*/
233 	struct di_mem *memlist;	/* head of memlist		*/
234 	uint_t command;		/* command from ioctl		*/
235 	int di_iocstate;	/* snapshot ioctl state		*/
236 	mod_hash_t *reg_dip_hash;
237 	mod_hash_t *reg_pip_hash;
238 	int lnode_count;
239 	int link_count;
240 
241 	mod_hash_t *lnode_hash;
242 	mod_hash_t *link_hash;
243 } **di_states;
244 
245 static kmutex_t di_lock;	/* serialize instance assignment */
246 
247 typedef enum {
248 	DI_QUIET = 0,	/* DI_QUIET must always be 0 */
249 	DI_ERR,
250 	DI_INFO,
251 	DI_TRACE,
252 	DI_TRACE1,
253 	DI_TRACE2
254 } di_cache_debug_t;
255 
256 static uint_t	di_chunk = 32;		/* I/O chunk size in pages */
257 
258 #define	DI_CACHE_LOCK(c)	(mutex_enter(&(c).cache_lock))
259 #define	DI_CACHE_UNLOCK(c)	(mutex_exit(&(c).cache_lock))
260 #define	DI_CACHE_LOCKED(c)	(mutex_owned(&(c).cache_lock))
261 
262 /*
263  * Check that whole device tree is being configured as a pre-condition for
264  * cleaning up /etc/devices files.
265  */
266 #define	DEVICES_FILES_CLEANABLE(st)	\
267 	(((st)->command & DINFOSUBTREE) && ((st)->command & DINFOFORCE) && \
268 	strcmp(DI_ALL_PTR(st)->root_path, "/") == 0)
269 
270 #define	CACHE_DEBUG(args)	\
271 	{ if (di_cache_debug != DI_QUIET) di_cache_print args; }
272 
273 static int di_open(dev_t *, int, int, cred_t *);
274 static int di_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
275 static int di_close(dev_t, int, int, cred_t *);
276 static int di_info(dev_info_t *, ddi_info_cmd_t, void *, void **);
277 static int di_attach(dev_info_t *, ddi_attach_cmd_t);
278 static int di_detach(dev_info_t *, ddi_detach_cmd_t);
279 
280 static di_off_t di_copyformat(di_off_t, struct di_state *, intptr_t, int);
281 static di_off_t di_snapshot_and_clean(struct di_state *);
282 static di_off_t di_copydevnm(di_off_t *, struct di_state *);
283 static di_off_t di_copytree(struct dev_info *, di_off_t *, struct di_state *);
284 static di_off_t di_copynode(struct di_stack *, struct di_state *);
285 static di_off_t di_getmdata(struct ddi_minor_data *, di_off_t *, di_off_t,
286     struct di_state *);
287 static di_off_t di_getppdata(struct dev_info *, di_off_t *, struct di_state *);
288 static di_off_t di_getdpdata(struct dev_info *, di_off_t *, struct di_state *);
289 static di_off_t di_getprop(struct ddi_prop *, di_off_t *,
290     struct di_state *, struct dev_info *, int);
291 static void di_allocmem(struct di_state *, size_t);
292 static void di_freemem(struct di_state *);
293 static void di_copymem(struct di_state *st, caddr_t buf, size_t bufsiz);
294 static di_off_t di_checkmem(struct di_state *, di_off_t, size_t);
295 static caddr_t di_mem_addr(struct di_state *, di_off_t);
296 static int di_setstate(struct di_state *, int);
297 static void di_register_dip(struct di_state *, dev_info_t *, di_off_t);
298 static void di_register_pip(struct di_state *, mdi_pathinfo_t *, di_off_t);
299 static di_off_t di_getpath_data(dev_info_t *, di_off_t *, di_off_t,
300     struct di_state *, int);
301 static di_off_t di_getlink_data(di_off_t, struct di_state *);
302 static int di_dip_find(struct di_state *st, dev_info_t *node, di_off_t *off_p);
303 
304 static int cache_args_valid(struct di_state *st, int *error);
305 static int snapshot_is_cacheable(struct di_state *st);
306 static int di_cache_lookup(struct di_state *st);
307 static int di_cache_update(struct di_state *st);
308 static void di_cache_print(di_cache_debug_t msglevel, char *fmt, ...);
309 
310 static struct cb_ops di_cb_ops = {
311 	di_open,		/* open */
312 	di_close,		/* close */
313 	nodev,			/* strategy */
314 	nodev,			/* print */
315 	nodev,			/* dump */
316 	nodev,			/* read */
317 	nodev,			/* write */
318 	di_ioctl,		/* ioctl */
319 	nodev,			/* devmap */
320 	nodev,			/* mmap */
321 	nodev,			/* segmap */
322 	nochpoll,		/* poll */
323 	ddi_prop_op,		/* prop_op */
324 	NULL,			/* streamtab  */
325 	D_NEW | D_MP		/* Driver compatibility flag */
326 };
327 
328 static struct dev_ops di_ops = {
329 	DEVO_REV,		/* devo_rev, */
330 	0,			/* refcnt  */
331 	di_info,		/* info */
332 	nulldev,		/* identify */
333 	nulldev,		/* probe */
334 	di_attach,		/* attach */
335 	di_detach,		/* detach */
336 	nodev,			/* reset */
337 	&di_cb_ops,		/* driver operations */
338 	NULL			/* bus operations */
339 };
340 
341 /*
342  * Module linkage information for the kernel.
343  */
344 static struct modldrv modldrv = {
345 	&mod_driverops,
346 	"DEVINFO Driver %I%",
347 	&di_ops
348 };
349 
350 static struct modlinkage modlinkage = {
351 	MODREV_1,
352 	&modldrv,
353 	NULL
354 };
355 
356 int
357 _init(void)
358 {
359 	int	error;
360 
361 	mutex_init(&di_lock, NULL, MUTEX_DRIVER, NULL);
362 
363 	error = mod_install(&modlinkage);
364 	if (error != 0) {
365 		mutex_destroy(&di_lock);
366 		return (error);
367 	}
368 
369 	return (0);
370 }
371 
372 int
373 _info(struct modinfo *modinfop)
374 {
375 	return (mod_info(&modlinkage, modinfop));
376 }
377 
378 int
379 _fini(void)
380 {
381 	int	error;
382 
383 	error = mod_remove(&modlinkage);
384 	if (error != 0) {
385 		return (error);
386 	}
387 
388 	mutex_destroy(&di_lock);
389 	return (0);
390 }
391 
392 static dev_info_t *di_dip;
393 
394 /*ARGSUSED*/
395 static int
396 di_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
397 {
398 	int error = DDI_FAILURE;
399 
400 	switch (infocmd) {
401 	case DDI_INFO_DEVT2DEVINFO:
402 		*result = (void *)di_dip;
403 		error = DDI_SUCCESS;
404 		break;
405 	case DDI_INFO_DEVT2INSTANCE:
406 		/*
407 		 * All dev_t's map to the same, single instance.
408 		 */
409 		*result = (void *)0;
410 		error = DDI_SUCCESS;
411 		break;
412 	default:
413 		break;
414 	}
415 
416 	return (error);
417 }
418 
419 static int
420 di_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
421 {
422 	int error = DDI_FAILURE;
423 
424 	switch (cmd) {
425 	case DDI_ATTACH:
426 		di_states = kmem_zalloc(
427 		    di_max_opens * sizeof (struct di_state *), KM_SLEEP);
428 
429 		if (ddi_create_minor_node(dip, "devinfo", S_IFCHR,
430 		    DI_FULL_PARENT, DDI_PSEUDO, NULL) == DDI_FAILURE ||
431 		    ddi_create_minor_node(dip, "devinfo,ro", S_IFCHR,
432 		    DI_READONLY_PARENT, DDI_PSEUDO, NULL) == DDI_FAILURE) {
433 			kmem_free(di_states,
434 			    di_max_opens * sizeof (struct di_state *));
435 			ddi_remove_minor_node(dip, NULL);
436 			error = DDI_FAILURE;
437 		} else {
438 			di_dip = dip;
439 			ddi_report_dev(dip);
440 
441 			error = DDI_SUCCESS;
442 		}
443 		break;
444 	default:
445 		error = DDI_FAILURE;
446 		break;
447 	}
448 
449 	return (error);
450 }
451 
452 static int
453 di_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
454 {
455 	int error = DDI_FAILURE;
456 
457 	switch (cmd) {
458 	case DDI_DETACH:
459 		ddi_remove_minor_node(dip, NULL);
460 		di_dip = NULL;
461 		kmem_free(di_states, di_max_opens * sizeof (struct di_state *));
462 
463 		error = DDI_SUCCESS;
464 		break;
465 	default:
466 		error = DDI_FAILURE;
467 		break;
468 	}
469 
470 	return (error);
471 }
472 
473 /*
474  * Allow multiple opens by tweaking the dev_t such that it looks like each
475  * open is getting a different minor device.  Each minor gets a separate
476  * entry in the di_states[] table.  Based on the original minor number, we
477  * discriminate opens of the full and read-only nodes.  If all of the instances
478  * of the selected minor node are currently open, we return EAGAIN.
479  */
480 /*ARGSUSED*/
481 static int
482 di_open(dev_t *devp, int flag, int otyp, cred_t *credp)
483 {
484 	int m;
485 	minor_t minor_parent = getminor(*devp);
486 
487 	if (minor_parent != DI_FULL_PARENT &&
488 	    minor_parent != DI_READONLY_PARENT)
489 		return (ENXIO);
490 
491 	mutex_enter(&di_lock);
492 
493 	for (m = minor_parent; m < di_max_opens; m += DI_NODE_SPECIES) {
494 		if (di_states[m] != NULL)
495 			continue;
496 
497 		di_states[m] = kmem_zalloc(sizeof (struct di_state), KM_SLEEP);
498 		break;	/* It's ours. */
499 	}
500 
501 	if (m >= di_max_opens) {
502 		/*
503 		 * maximum open instance for device reached
504 		 */
505 		mutex_exit(&di_lock);
506 		dcmn_err((CE_WARN, "devinfo: maximum devinfo open reached"));
507 		return (EAGAIN);
508 	}
509 	mutex_exit(&di_lock);
510 
511 	ASSERT(m < di_max_opens);
512 	*devp = makedevice(getmajor(*devp), (minor_t)(m + DI_NODE_SPECIES));
513 
514 	dcmn_err((CE_CONT, "di_open: thread = %p, assigned minor = %d\n",
515 		(void *)curthread, m + DI_NODE_SPECIES));
516 
517 	return (0);
518 }
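
/*
 * For example, the first open of the full node (minor DI_FULL_PARENT)
 * claims slot 0 and returns a dev_t with minor 2; a second concurrent
 * open claims slot 2 and returns minor 4. Slot parity is preserved,
 * so DI_UNPRIVILEGED_NODE() still distinguishes the two node species
 * in di_ioctl().
 */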
519 
520 /*ARGSUSED*/
521 static int
522 di_close(dev_t dev, int flag, int otype, cred_t *cred_p)
523 {
524 	struct di_state *st;
525 	int m = (int)getminor(dev) - DI_NODE_SPECIES;
526 
527 	if (m < 0) {
528 		cmn_err(CE_WARN, "closing non-existent devinfo minor %d",
529 		    m + DI_NODE_SPECIES);
530 		return (ENXIO);
531 	}
532 
533 	st = di_states[m];
534 	ASSERT(m < di_max_opens && st != NULL);
535 
536 	di_freemem(st);
537 	kmem_free(st, sizeof (struct di_state));
538 
539 	/*
540 	 * empty slot in state table
541 	 */
542 	mutex_enter(&di_lock);
543 	di_states[m] = NULL;
544 	dcmn_err((CE_CONT, "di_close: thread = %p, assigned minor = %d\n",
545 		(void *)curthread, m + DI_NODE_SPECIES));
546 	mutex_exit(&di_lock);
547 
548 	return (0);
549 }
550 
551 
552 /*ARGSUSED*/
553 static int
554 di_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
555 {
556 	int rv, error;
557 	di_off_t off;
558 	struct di_all *all;
559 	struct di_state *st;
560 	int m = (int)getminor(dev) - DI_NODE_SPECIES;
561 
562 	major_t i;
563 	char *drv_name;
564 	size_t map_size, size;
565 	struct di_mem *dcp;
566 	int ndi_flags;
567 
568 	if (m < 0 || m >= di_max_opens) {
569 		return (ENXIO);
570 	}
571 
572 	st = di_states[m];
573 	ASSERT(st != NULL);
574 
575 	dcmn_err2((CE_CONT, "di_ioctl: mode = %x, cmd = %x\n", mode, cmd));
576 
577 	switch (cmd) {
578 	case DINFOIDENT:
579 		/*
580 		 * This is called from di_init to verify that the driver
581 		 * opened is indeed devinfo. The purpose is to guard against
582 		 * sending ioctl to an unknown driver in case of an
583 		 * unresolved major number conflict during bfu.
584 		 */
585 		*rvalp = DI_MAGIC;
586 		return (0);
587 
588 	case DINFOLODRV:
589 		/*
590 		 * Hold an installed driver and return the result
591 		 */
592 		if (DI_UNPRIVILEGED_NODE(m)) {
593 			/*
594 			 * Only the fully enabled instances may issue
595 			 * DINFOLODRV.
596 			 */
597 			return (EACCES);
598 		}
599 
600 		drv_name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
601 		if (ddi_copyin((void *)arg, drv_name, MAXNAMELEN, mode) != 0) {
602 			kmem_free(drv_name, MAXNAMELEN);
603 			return (EFAULT);
604 		}
605 
606 		/*
607 		 * Some third-party drivers walk the device tree in _init(),
608 		 * so we load the driver module before configuring the driver.
609 		 */
610 		i = ddi_name_to_major(drv_name);
611 		if (ddi_hold_driver(i) == NULL) {
612 			kmem_free(drv_name, MAXNAMELEN);
613 			return (ENXIO);
614 		}
615 
616 		ndi_flags = NDI_DEVI_PERSIST | NDI_CONFIG | NDI_NO_EVENT;
617 
618 		/*
619 		 * i_ddi_load_drvconf() below will trigger a reprobe
620 		 * via reset_nexus_flags(). NDI_DRV_CONF_REPROBE isn't
621 		 * needed here.
622 		 */
623 		modunload_disable();
624 		(void) i_ddi_load_drvconf(i);
625 		(void) ndi_devi_config_driver(ddi_root_node(), ndi_flags, i);
626 		kmem_free(drv_name, MAXNAMELEN);
627 		ddi_rele_driver(i);
628 		rv = i_ddi_devs_attached(i);
629 		modunload_enable();
630 
631 		i_ddi_di_cache_invalidate(KM_SLEEP);
632 
633 		return ((rv == DDI_SUCCESS)? 0 : ENXIO);
634 
635 	case DINFOUSRLD:
636 		/*
637 		 * The case for copying snapshot to userland
638 		 */
639 		if (di_setstate(st, IOC_COPY) == -1)
640 			return (EBUSY);
641 
642 		map_size = ((struct di_all *)di_mem_addr(st, 0))->map_size;
643 		if (map_size == 0) {
644 			(void) di_setstate(st, IOC_DONE);
645 			return (EFAULT);
646 		}
647 
648 		/*
649 		 * copyout the snapshot
650 		 */
651 		map_size = (map_size + PAGEOFFSET) & PAGEMASK;
652 
653 		/*
654 		 * Return the map size, so the caller may do a sanity
655 		 * check against the return value of the snapshot ioctl()
656 		 */
657 		*rvalp = (int)map_size;
658 
659 		/*
660 		 * Copy one chunk at a time
661 		 */
662 		off = 0;
663 		dcp = st->memlist;
664 		while (map_size) {
665 			size = dcp->buf_size;
666 			if (map_size <= size) {
667 				size = map_size;
668 			}
669 
670 			if (ddi_copyout(di_mem_addr(st, off),
671 			    (void *)(arg + off), size, mode) != 0) {
672 				(void) di_setstate(st, IOC_DONE);
673 				return (EFAULT);
674 			}
675 
676 			map_size -= size;
677 			off += size;
678 			dcp = dcp->next;
679 		}
680 
681 		di_freemem(st);
682 		(void) di_setstate(st, IOC_IDLE);
683 		return (0);
684 
685 	default:
686 		if ((cmd & ~DIIOC_MASK) != DIIOC) {
687 			/*
688 			 * Invalid ioctl command
689 			 */
690 			return (ENOTTY);
691 		}
692 		/*
693 		 * take a snapshot
694 		 */
695 		st->command = cmd & DIIOC_MASK;
696 		/*FALLTHROUGH*/
697 	}
698 
699 	/*
700 	 * Obtain enough memory to hold header + rootpath.  We prevent kernel
701 	 * memory exhaustion by freeing any previously allocated snapshot and
702 	 * refusing the operation; otherwise we would be allowing ioctl(),
703 	 * ioctl(), ioctl(), ..., panic.
704 	 */
705 	if (di_setstate(st, IOC_SNAP) == -1)
706 		return (EBUSY);
707 
708 	size = sizeof (struct di_all) +
709 	    sizeof (((struct dinfo_io *)(NULL))->root_path);
710 	if (size < PAGESIZE)
711 		size = PAGESIZE;
712 	di_allocmem(st, size);
713 
714 	all = (struct di_all *)di_mem_addr(st, 0);
715 	all->devcnt = devcnt;
716 	all->command = st->command;
717 	all->version = DI_SNAPSHOT_VERSION;
718 
719 	/*
720 	 * Note the endianness in case we need to transport snapshot
721 	 * over the network.
722 	 */
723 #if defined(_LITTLE_ENDIAN)
724 	all->endianness = DI_LITTLE_ENDIAN;
725 #else
726 	all->endianness = DI_BIG_ENDIAN;
727 #endif
728 
729 	/* Copyin ioctl args, store in the snapshot. */
730 	if (copyinstr((void *)arg, all->root_path,
731 	    sizeof (((struct dinfo_io *)(NULL))->root_path), &size) != 0) {
732 		di_freemem(st);
733 		(void) di_setstate(st, IOC_IDLE);
734 		return (EFAULT);
735 	}
736 
737 	if ((st->command & DINFOCLEANUP) && !DEVICES_FILES_CLEANABLE(st)) {
738 		di_freemem(st);
739 		(void) di_setstate(st, IOC_IDLE);
740 		return (EINVAL);
741 	}
742 
743 	error = 0;
744 	if ((st->command & DINFOCACHE) && !cache_args_valid(st, &error)) {
745 		di_freemem(st);
746 		(void) di_setstate(st, IOC_IDLE);
747 		return (error);
748 	}
749 
750 	off = DI_ALIGN(sizeof (struct di_all) + size);
751 
752 	/*
753 	 * Only the fully enabled version may force load drivers or read
754 	 * the parent private data from a driver.
755 	 */
756 	if ((st->command & (DINFOPRIVDATA | DINFOFORCE)) != 0 &&
757 	    DI_UNPRIVILEGED_NODE(m)) {
758 		di_freemem(st);
759 		(void) di_setstate(st, IOC_IDLE);
760 		return (EACCES);
761 	}
762 
763 	/* Do we need private data? */
764 	if (st->command & DINFOPRIVDATA) {
765 		arg += sizeof (((struct dinfo_io *)(NULL))->root_path);
766 
767 #ifdef _MULTI_DATAMODEL
768 		switch (ddi_model_convert_from(mode & FMODELS)) {
769 		case DDI_MODEL_ILP32: {
770 			/*
771 			 * Cannot copy private data from 64-bit kernel
772 			 * to 32-bit app
773 			 */
774 			di_freemem(st);
775 			(void) di_setstate(st, IOC_IDLE);
776 			return (EINVAL);
777 		}
778 		case DDI_MODEL_NONE:
779 			if ((off = di_copyformat(off, st, arg, mode)) == 0) {
780 				di_freemem(st);
781 				(void) di_setstate(st, IOC_IDLE);
782 				return (EFAULT);
783 			}
784 			break;
785 		}
786 #else /* !_MULTI_DATAMODEL */
787 		if ((off = di_copyformat(off, st, arg, mode)) == 0) {
788 			di_freemem(st);
789 			(void) di_setstate(st, IOC_IDLE);
790 			return (EFAULT);
791 		}
792 #endif /* _MULTI_DATAMODEL */
793 	}
794 
795 	all->top_devinfo = DI_ALIGN(off);
796 
797 	/*
798 	 * For cache lookups we reallocate memory from scratch,
799 	 * so the value of "all" is no longer valid.
800 	 */
801 	all = NULL;
802 
803 	if (st->command & DINFOCACHE) {
804 		*rvalp = di_cache_lookup(st);
805 	} else if (snapshot_is_cacheable(st)) {
806 		DI_CACHE_LOCK(di_cache);
807 		*rvalp = di_cache_update(st);
808 		DI_CACHE_UNLOCK(di_cache);
809 	} else
810 		*rvalp = di_snapshot_and_clean(st);
811 
812 	if (*rvalp) {
813 		DI_ALL_PTR(st)->map_size = *rvalp;
814 		(void) di_setstate(st, IOC_DONE);
815 	} else {
816 		di_freemem(st);
817 		(void) di_setstate(st, IOC_IDLE);
818 	}
819 
820 	return (0);
821 }
822 
823 /*
824  * Get a chunk of memory >= size, for the snapshot
825  */
826 static void
827 di_allocmem(struct di_state *st, size_t size)
828 {
829 	struct di_mem *mem = kmem_zalloc(sizeof (struct di_mem),
830 	    KM_SLEEP);
831 	/*
832 	 * Round up size to nearest power of 2. If it is less
833 	 * than st->mem_size, set it to st->mem_size (i.e.,
834 	 * the mem_size is doubled every time) to reduce the
835 	 * number of memory allocations.
836 	 */
837 	size_t tmp = 1;
838 	while (tmp < size) {
839 		tmp <<= 1;
840 	}
841 	size = (tmp > st->mem_size) ? tmp : st->mem_size;
842 
843 	mem->buf = ddi_umem_alloc(size, DDI_UMEM_SLEEP, &mem->cook);
844 	mem->buf_size = size;
845 
846 	dcmn_err2((CE_CONT, "di_allocmem: mem_size=%x\n", st->mem_size));
847 
848 	if (st->mem_size == 0) {	/* first chunk */
849 		st->memlist = mem;
850 	} else {
851 		/*
852 		 * locate end of linked list and add a chunk at the end
853 		 */
854 		struct di_mem *dcp = st->memlist;
855 		while (dcp->next != NULL) {
856 			dcp = dcp->next;
857 		}
858 
859 		dcp->next = mem;
860 	}
861 
862 	st->mem_size += size;
863 }
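
/*
 * For example, a first di_allocmem() of 3000 bytes allocates a
 * 4096-byte chunk; a later request for only 100 bytes still allocates
 * st->mem_size (4096) bytes, doubling the total to 8192. Chunk sizes
 * therefore grow geometrically, and a snapshot of N bytes needs only
 * O(log N) allocations.
 */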
864 
865 /*
866  * Copy up to bufsiz bytes of the memlist to buf
867  */
868 static void
869 di_copymem(struct di_state *st, caddr_t buf, size_t bufsiz)
870 {
871 	struct di_mem *dcp;
872 	size_t copysz;
873 
874 	if (st->mem_size == 0) {
875 		ASSERT(st->memlist == NULL);
876 		return;
877 	}
878 
879 	copysz = 0;
880 	for (dcp = st->memlist; dcp; dcp = dcp->next) {
881 
882 		ASSERT(bufsiz > 0);
883 
884 		if (bufsiz <= dcp->buf_size)
885 			copysz = bufsiz;
886 		else
887 			copysz = dcp->buf_size;
888 
889 		bcopy(dcp->buf, buf, copysz);
890 
891 		buf += copysz;
892 		bufsiz -= copysz;
893 
894 		if (bufsiz == 0)
895 			break;
896 	}
897 }
898 
899 /*
900  * Free all memory for the snapshot
901  */
902 static void
903 di_freemem(struct di_state *st)
904 {
905 	struct di_mem *dcp, *tmp;
906 
907 	dcmn_err2((CE_CONT, "di_freemem\n"));
908 
909 	if (st->mem_size) {
910 		dcp = st->memlist;
911 		while (dcp) {	/* traverse the linked list */
912 			tmp = dcp;
913 			dcp = dcp->next;
914 			ddi_umem_free(tmp->cook);
915 			kmem_free(tmp, sizeof (struct di_mem));
916 		}
917 		st->mem_size = 0;
918 		st->memlist = NULL;
919 	}
920 
921 	ASSERT(st->mem_size == 0);
922 	ASSERT(st->memlist == NULL);
923 }
924 
925 /*
926  * Copies cached data to the di_state structure.
927  * Returns:
928  *	- size of data copied, on SUCCESS
929  *	- 0 on failure
930  */
931 static int
932 di_cache2mem(struct di_cache *cache, struct di_state *st)
933 {
934 	caddr_t	pa;
935 
936 	ASSERT(st->mem_size == 0);
937 	ASSERT(st->memlist == NULL);
938 	ASSERT(!servicing_interrupt());
939 	ASSERT(DI_CACHE_LOCKED(*cache));
940 
941 	if (cache->cache_size == 0) {
942 		ASSERT(cache->cache_data == NULL);
943 		CACHE_DEBUG((DI_ERR, "Empty cache. Skipping copy"));
944 		return (0);
945 	}
946 
947 	ASSERT(cache->cache_data);
948 
949 	di_allocmem(st, cache->cache_size);
950 
951 	pa = di_mem_addr(st, 0);
952 
953 	ASSERT(pa);
954 
955 	/*
956 	 * Verify that di_allocmem() allocates contiguous memory,
957 	 * so that it is safe to do straight bcopy()
958 	 */
959 	ASSERT(st->memlist != NULL);
960 	ASSERT(st->memlist->next == NULL);
961 	bcopy(cache->cache_data, pa, cache->cache_size);
962 
963 	return (cache->cache_size);
964 }
965 
966 /*
967  * Copies a snapshot from di_state to the cache
968  * Returns:
969  *	- 0 on failure
970  *	- size of copied data on success
971  */
972 static int
973 di_mem2cache(struct di_state *st, struct di_cache *cache)
974 {
975 	size_t map_size;
976 
977 	ASSERT(cache->cache_size == 0);
978 	ASSERT(cache->cache_data == NULL);
979 	ASSERT(!servicing_interrupt());
980 	ASSERT(DI_CACHE_LOCKED(*cache));
981 
982 	if (st->mem_size == 0) {
983 		ASSERT(st->memlist == NULL);
984 		CACHE_DEBUG((DI_ERR, "Empty memlist. Skipping copy"));
985 		return (0);
986 	}
987 
988 	ASSERT(st->memlist);
989 
990 	/*
991 	 * The size of the memory list may be much larger than the
992 	 * size of valid data (map_size). Cache only the valid data.
993 	 */
994 	map_size = DI_ALL_PTR(st)->map_size;
995 	if (map_size == 0 || map_size < sizeof (struct di_all) ||
996 	    map_size > st->mem_size) {
997 		CACHE_DEBUG((DI_ERR, "cannot cache: bad size: 0x%x", map_size));
998 		return (0);
999 	}
1000 
1001 	cache->cache_data = kmem_alloc(map_size, KM_SLEEP);
1002 	cache->cache_size = map_size;
1003 	di_copymem(st, cache->cache_data, cache->cache_size);
1004 
1005 	return (map_size);
1006 }
1007 
1008 /*
1009  * Make sure there are at least "size" bytes of memory left before
1010  * going on. Otherwise, start on a new chunk.
1011  */
1012 static di_off_t
1013 di_checkmem(struct di_state *st, di_off_t off, size_t size)
1014 {
1015 	dcmn_err3((CE_CONT, "di_checkmem: off=%x size=%x\n",
1016 			off, (int)size));
1017 
1018 	/*
1019 	 * di_checkmem() shouldn't be called with a size of zero.
1020 	 * But in case it is, we want to make sure we return a valid
1021 	 * offset within the memlist and not an offset that points us
1022 	 * at the end of the memlist.
1023 	 */
1024 	if (size == 0) {
1025 		dcmn_err((CE_WARN, "di_checkmem: invalid zero size used"));
1026 		size = 1;
1027 	}
1028 
1029 	off = DI_ALIGN(off);
1030 	if ((st->mem_size - off) < size) {
1031 		off = st->mem_size;
1032 		di_allocmem(st, size);
1033 	}
1034 
1035 	return (off);
1036 }
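
/*
 * The canonical calling pattern, used throughout the copy routines
 * below, is reserve / record / copy / advance ("name" stands in for
 * whatever string is being copied):
 */
#if 0	/* illustrative only */
	size = strlen(name) + 1;
	off = di_checkmem(st, off, size);
	me->node_name = off;			/* record snapshot offset */
	(void) strcpy(di_mem_addr(st, off), name);
	off += size;
#endif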
1037 
1038 /*
1039  * Copy the private data format from ioctl arg.
1040  * On success, the ending offset is returned. On error 0 is returned.
1041  */
1042 static di_off_t
1043 di_copyformat(di_off_t off, struct di_state *st, intptr_t arg, int mode)
1044 {
1045 	di_off_t size;
1046 	struct di_priv_data *priv;
1047 	struct di_all *all = (struct di_all *)di_mem_addr(st, 0);
1048 
1049 	dcmn_err2((CE_CONT, "di_copyformat: off=%x, arg=%p mode=%x\n",
1050 		off, (void *)arg, mode));
1051 
1052 	/*
1053 	 * Copyin data and check version.
1054 	 * We only handle private data version 0.
1055 	 */
1056 	priv = kmem_alloc(sizeof (struct di_priv_data), KM_SLEEP);
1057 	if ((ddi_copyin((void *)arg, priv, sizeof (struct di_priv_data),
1058 	    mode) != 0) || (priv->version != DI_PRIVDATA_VERSION_0)) {
1059 		kmem_free(priv, sizeof (struct di_priv_data));
1060 		return (0);
1061 	}
1062 
1063 	/*
1064 	 * Save di_priv_data copied from userland in snapshot.
1065 	 */
1066 	all->pd_version = priv->version;
1067 	all->n_ppdata = priv->n_parent;
1068 	all->n_dpdata = priv->n_driver;
1069 
1070 	/*
1071 	 * copyin private data format, modify offset accordingly
1072 	 */
1073 	if (all->n_ppdata) {	/* parent private data format */
1074 		/*
1075 		 * check memory
1076 		 */
1077 		size = all->n_ppdata * sizeof (struct di_priv_format);
1078 		off = di_checkmem(st, off, size);
1079 		all->ppdata_format = off;
1080 		if (ddi_copyin(priv->parent, di_mem_addr(st, off), size,
1081 		    mode) != 0) {
1082 			kmem_free(priv, sizeof (struct di_priv_data));
1083 			return (0);
1084 		}
1085 
1086 		off += size;
1087 	}
1088 
1089 	if (all->n_dpdata) {	/* driver private data format */
1090 		/*
1091 		 * check memory
1092 		 */
1093 		size = all->n_dpdata * sizeof (struct di_priv_format);
1094 		off = di_checkmem(st, off, size);
1095 		all->dpdata_format = off;
1096 		if (ddi_copyin(priv->driver, di_mem_addr(st, off), size,
1097 		    mode) != 0) {
1098 			kmem_free(priv, sizeof (struct di_priv_data));
1099 			return (0);
1100 		}
1101 
1102 		off += size;
1103 	}
1104 
1105 	kmem_free(priv, sizeof (struct di_priv_data));
1106 	return (off);
1107 }
1108 
1109 /*
1110  * Return the real address based on the offset (off) within snapshot
1111  */
1112 static caddr_t
1113 di_mem_addr(struct di_state *st, di_off_t off)
1114 {
1115 	struct di_mem *dcp = st->memlist;
1116 
1117 	dcmn_err3((CE_CONT, "di_mem_addr: dcp=%p off=%x\n",
1118 		(void *)dcp, off));
1119 
1120 	ASSERT(off < st->mem_size);
1121 
1122 	while (off >= dcp->buf_size) {
1123 		off -= dcp->buf_size;
1124 		dcp = dcp->next;
1125 	}
1126 
1127 	dcmn_err3((CE_CONT, "di_mem_addr: new off=%x, return = %p\n",
1128 		off, (void *)(dcp->buf + off)));
1129 
1130 	return (dcp->buf + off);
1131 }
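
/*
 * Worked example: with a memlist of a 4096-byte chunk followed by an
 * 8192-byte chunk, off 5000 skips the first chunk (5000 - 4096 = 904)
 * and resolves to byte 904 of the second chunk's buf.
 */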
1132 
1133 /*
1134  * Ideally we would use the whole key to derive the hash
1135  * value. However, the probability that two keys will
1136  * have the same dip (or pip) is very low, so
1137  * hashing by dip (or pip) pointer should suffice.
1138  */
1139 static uint_t
1140 di_hash_byptr(void *arg, mod_hash_key_t key)
1141 {
1142 	struct di_key *dik = key;
1143 	size_t rshift;
1144 	void *ptr;
1145 
1146 	ASSERT(arg == NULL);
1147 
1148 	switch (dik->k_type) {
1149 	case DI_DKEY:
1150 		ptr = dik->k_u.dkey.dk_dip;
1151 		rshift = highbit(sizeof (struct dev_info));
1152 		break;
1153 	case DI_PKEY:
1154 		ptr = dik->k_u.pkey.pk_pip;
1155 		rshift = highbit(sizeof (struct mdi_pathinfo));
1156 		break;
1157 	default:
1158 		panic("devinfo: unknown key type");
1159 		/*NOTREACHED*/
1160 	}
1161 	return (mod_hash_byptr((void *)rshift, ptr));
1162 }
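
/*
 * The rshift passed to mod_hash_byptr() discards the low-order address
 * bits, which carry little information for heap-allocated structures.
 * If, say, sizeof (struct dev_info) were 0x300, highbit() would return
 * 10 and pointers would be right-shifted 10 bits before hashing (the
 * size here is illustrative only).
 */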
1163 
1164 static void
1165 di_key_dtor(mod_hash_key_t key)
1166 {
1167 	char		*path_addr;
1168 	struct di_key	*dik = key;
1169 
1170 	switch (dik->k_type) {
1171 	case DI_DKEY:
1172 		break;
1173 	case DI_PKEY:
1174 		path_addr = dik->k_u.pkey.pk_path_addr;
1175 		if (path_addr)
1176 			kmem_free(path_addr, strlen(path_addr) + 1);
1177 		break;
1178 	default:
1179 		panic("devinfo: unknown key type");
1180 		/*NOTREACHED*/
1181 	}
1182 
1183 	kmem_free(dik, sizeof (struct di_key));
1184 }
1185 
1186 static int
1187 di_dkey_cmp(struct di_dkey *dk1, struct di_dkey *dk2)
1188 {
1189 	if (dk1->dk_dip !=  dk2->dk_dip)
1190 		return (dk1->dk_dip > dk2->dk_dip ? 1 : -1);
1191 
1192 	if (dk1->dk_major != -1 && dk2->dk_major != -1) {
1193 		if (dk1->dk_major !=  dk2->dk_major)
1194 			return (dk1->dk_major > dk2->dk_major ? 1 : -1);
1195 
1196 		if (dk1->dk_inst !=  dk2->dk_inst)
1197 			return (dk1->dk_inst > dk2->dk_inst ? 1 : -1);
1198 	}
1199 
1200 	if (dk1->dk_nodeid != dk2->dk_nodeid)
1201 		return (dk1->dk_nodeid > dk2->dk_nodeid ? 1 : -1);
1202 
1203 	return (0);
1204 }
1205 
1206 static int
1207 di_pkey_cmp(struct di_pkey *pk1, struct di_pkey *pk2)
1208 {
1209 	char *p1, *p2;
1210 	int rv;
1211 
1212 	if (pk1->pk_pip !=  pk2->pk_pip)
1213 		return (pk1->pk_pip > pk2->pk_pip ? 1 : -1);
1214 
1215 	p1 = pk1->pk_path_addr;
1216 	p2 = pk2->pk_path_addr;
1217 
1218 	p1 = p1 ? p1 : "";
1219 	p2 = p2 ? p2 : "";
1220 
1221 	rv = strcmp(p1, p2);
1222 	if (rv)
1223 		return (rv > 0  ? 1 : -1);
1224 
1225 	if (pk1->pk_client !=  pk2->pk_client)
1226 		return (pk1->pk_client > pk2->pk_client ? 1 : -1);
1227 
1228 	if (pk1->pk_phci !=  pk2->pk_phci)
1229 		return (pk1->pk_phci > pk2->pk_phci ? 1 : -1);
1230 
1231 	return (0);
1232 }
1233 
1234 static int
1235 di_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1236 {
1237 	struct di_key *dik1, *dik2;
1238 
1239 	dik1 = key1;
1240 	dik2 = key2;
1241 
1242 	if (dik1->k_type != dik2->k_type) {
1243 		panic("devinfo: mismatched keys");
1244 		/*NOTREACHED*/
1245 	}
1246 
1247 	switch (dik1->k_type) {
1248 	case DI_DKEY:
1249 		return (di_dkey_cmp(&(dik1->k_u.dkey), &(dik2->k_u.dkey)));
1250 	case DI_PKEY:
1251 		return (di_pkey_cmp(&(dik1->k_u.pkey), &(dik2->k_u.pkey)));
1252 	default:
1253 		panic("devinfo: unknown key type");
1254 		/*NOTREACHED*/
1255 	}
1256 }
1257 
1258 /*
1259  * This is the main function that takes a snapshot
1260  */
1261 static di_off_t
1262 di_snapshot(struct di_state *st)
1263 {
1264 	di_off_t off;
1265 	struct di_all *all;
1266 	dev_info_t *rootnode;
1267 	char buf[80];
1268 	int plen;
1269 	char *path;
1270 	vnode_t *vp;
1271 
1272 	all = (struct di_all *)di_mem_addr(st, 0);
1273 	dcmn_err((CE_CONT, "Taking a snapshot of devinfo tree...\n"));
1274 
1275 	/*
1276 	 * Verify path before entrusting it to e_ddi_hold_devi_by_path because
1277 	 * some platforms have OBP bugs where executing the NDI_PROMNAME code
1278 	 * path against an invalid path results in a panic.  The lookupnameat()
1279 	 * is done relative to rootdir without a leading '/' on "devices/"
1280 	 * to force the lookup to occur in the global zone.
1281 	 */
1282 	plen = strlen("devices/") + strlen(all->root_path) + 1;
1283 	path = kmem_alloc(plen, KM_SLEEP);
1284 	(void) snprintf(path, plen, "devices/%s", all->root_path);
1285 	if (lookupnameat(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, rootdir)) {
1286 		dcmn_err((CE_CONT, "Devinfo node %s not found\n",
1287 		    all->root_path));
1288 		kmem_free(path, plen);
1289 		return (0);
1290 	}
1291 	kmem_free(path, plen);
1292 	VN_RELE(vp);
1293 
1294 	/*
1295 	 * Hold the devinfo node referred by the path.
1296 	 */
1297 	rootnode = e_ddi_hold_devi_by_path(all->root_path, 0);
1298 	if (rootnode == NULL) {
1299 		dcmn_err((CE_CONT, "Devinfo node %s not found\n",
1300 		    all->root_path));
1301 		return (0);
1302 	}
1303 
1304 	(void) snprintf(buf, sizeof (buf),
1305 	    "devinfo registered dips (statep=%p)", (void *)st);
1306 
1307 	st->reg_dip_hash = mod_hash_create_extended(buf, 64,
1308 	    di_key_dtor, mod_hash_null_valdtor, di_hash_byptr,
1309 	    NULL, di_key_cmp, KM_SLEEP);
1310 
1311 
1312 	(void) snprintf(buf, sizeof (buf),
1313 	    "devinfo registered pips (statep=%p)", (void *)st);
1314 
1315 	st->reg_pip_hash = mod_hash_create_extended(buf, 64,
1316 	    di_key_dtor, mod_hash_null_valdtor, di_hash_byptr,
1317 	    NULL, di_key_cmp, KM_SLEEP);
1318 
1319 	/*
1320 	 * copy the device tree
1321 	 */
1322 	off = di_copytree(DEVI(rootnode), &all->top_devinfo, st);
1323 
1324 	ddi_release_devi(rootnode);
1325 
1326 	/*
1327 	 * copy the devnames array
1328 	 */
1329 	all->devnames = off;
1330 	off = di_copydevnm(&all->devnames, st);
1331 
1332 
1333 	/* initialize the hash tables */
1334 	st->lnode_count = 0;
1335 	st->link_count = 0;
1336 
1337 	if (DINFOLYR & st->command) {
1338 		off = di_getlink_data(off, st);
1339 	}
1340 
1341 	/*
1342 	 * Free up hash tables
1343 	 */
1344 	mod_hash_destroy_hash(st->reg_dip_hash);
1345 	mod_hash_destroy_hash(st->reg_pip_hash);
1346 
1347 	/*
1348 	 * Record the timestamp now that we are done with the snapshot.
1349 	 *
1350 	 * We compute the checksum later, and only if we cache
1351 	 * the snapshot, since checksumming adds some overhead.
1352 	 * The checksum is verified later when we read the cache
1353 	 * file back from disk.
1354 	 *
1355 	 * Set checksum field to 0 as CRC is calculated with that
1356 	 * field set to 0.
1357 	 */
1358 	all->snapshot_time = ddi_get_time();
1359 	all->cache_checksum = 0;
1360 
1361 	return (off);
1362 }
1363 
1364 /*
1365  * Take a snapshot and clean /etc/devices files if DINFOCLEANUP is set
1366  */
1367 static di_off_t
1368 di_snapshot_and_clean(struct di_state *st)
1369 {
1370 	di_off_t off;
1371 
1372 	modunload_disable();
1373 	off = di_snapshot(st);
1374 	if (off != 0 && (st->command & DINFOCLEANUP)) {
1375 		ASSERT(DEVICES_FILES_CLEANABLE(st));
1376 		/*
1377 		 * Cleanup /etc/devices files:
1378 		 * In order to accurately account for the system configuration
1379 		 * in /etc/devices files, the appropriate drivers must be
1380 		 * fully configured before the cleanup starts.
1381 		 * So enable modunload only after the cleanup.
1382 		 */
1383 		i_ddi_clean_devices_files();
1384 	}
1385 	modunload_enable();
1386 
1387 	return (off);
1388 }
1389 
1390 /*
1391  * Assumes all devinfo nodes in device tree have been snapshotted
1392  */
1393 static void
1394 snap_driver_list(struct di_state *st, struct devnames *dnp, di_off_t *poff_p)
1395 {
1396 	struct dev_info *node;
1397 	struct di_node *me;
1398 	di_off_t off;
1399 
1400 	ASSERT(mutex_owned(&dnp->dn_lock));
1401 
1402 	node = DEVI(dnp->dn_head);
1403 	for (; node; node = node->devi_next) {
1404 		if (di_dip_find(st, (dev_info_t *)node, &off) != 0)
1405 			continue;
1406 
1407 		ASSERT(off > 0);
1408 		me = (struct di_node *)di_mem_addr(st, off);
1409 		ASSERT(me->next == 0 || me->next == -1);
1410 		/*
1411 		 * Only nodes which were BOUND when they were
1412 		 * snapshotted will be added to the per-driver list.
1413 		 */
1414 		if (me->next != -1)
1415 			continue;
1416 
1417 		*poff_p = off;
1418 		poff_p = &me->next;
1419 	}
1420 
1421 	*poff_p = 0;
1422 }
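
/*
 * The walk above threads the per-driver list directly in the snapshot
 * using a pointer-to-pointer: poff_p always names the di_off_t cell
 * that should receive the offset of the next BOUND node, so appending
 * needs no special case for the list head. For a hypothetical
 * two-node list the result is:
 *
 *	dnp->head == off(A);  A->next == off(B);  B->next == 0;
 */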
1423 
1424 /*
1425  * Copy the devnames array, so we have a list of drivers in the snapshot.
1426  * Also makes it possible to locate the per-driver devinfo nodes.
1427  */
1428 static di_off_t
1429 di_copydevnm(di_off_t *off_p, struct di_state *st)
1430 {
1431 	int i;
1432 	di_off_t off;
1433 	size_t size;
1434 	struct di_devnm *dnp;
1435 
1436 	dcmn_err2((CE_CONT, "di_copydevnm: *off_p = %p\n", (void *)off_p));
1437 
1438 	/*
1439 	 * make sure there is some allocated memory
1440 	 */
1441 	size = devcnt * sizeof (struct di_devnm);
1442 	off = di_checkmem(st, *off_p, size);
1443 	*off_p = off;
1444 
1445 	dcmn_err((CE_CONT, "Start copying devnamesp[%d] at offset 0x%x\n",
1446 		devcnt, off));
1447 
1448 	dnp = (struct di_devnm *)di_mem_addr(st, off);
1449 	off += size;
1450 
1451 	for (i = 0; i < devcnt; i++) {
1452 		if (devnamesp[i].dn_name == NULL) {
1453 			continue;
1454 		}
1455 
1456 		/*
1457 		 * dn_name is not freed during driver unload or removal.
1458 		 *
1459 		 * There is a race condition when make_devname() changes
1460 		 * dn_name during our strcpy. This should be rare since
1461 		 * only add_drv does this. At any rate, we have never had
1462 		 * a problem with ddi_name_to_major(), which is subject to
1463 		 * the same race.
1464 		 */
1465 		dcmn_err2((CE_CONT, "di_copydevnm: %s%d, off=%x\n",
1466 			devnamesp[i].dn_name, devnamesp[i].dn_instance,
1467 			off));
1468 
1469 		off = di_checkmem(st, off, strlen(devnamesp[i].dn_name) + 1);
1470 		dnp[i].name = off;
1471 		(void) strcpy((char *)di_mem_addr(st, off),
1472 			devnamesp[i].dn_name);
1473 		off += DI_ALIGN(strlen(devnamesp[i].dn_name) + 1);
1474 
1475 		mutex_enter(&devnamesp[i].dn_lock);
1476 
1477 		/*
1478 		 * Snapshot per-driver node list
1479 		 */
1480 		snap_driver_list(st, &devnamesp[i], &dnp[i].head);
1481 
1482 		/*
1483 		 * This is not used by libdevinfo, leave it for now
1484 		 */
1485 		dnp[i].flags = devnamesp[i].dn_flags;
1486 		dnp[i].instance = devnamesp[i].dn_instance;
1487 
1488 		/*
1489 		 * get global properties
1490 		 */
1491 		if ((DINFOPROP & st->command) &&
1492 		    devnamesp[i].dn_global_prop_ptr) {
1493 			dnp[i].global_prop = off;
1494 			off = di_getprop(
1495 			    devnamesp[i].dn_global_prop_ptr->prop_list,
1496 			    &dnp[i].global_prop, st, NULL, DI_PROP_GLB_LIST);
1497 		}
1498 
1499 		/*
1500 		 * Bit encode driver ops: & bus_ops, cb_ops, & cb_ops->cb_str
1501 		 */
1502 		if (CB_DRV_INSTALLED(devopsp[i])) {
1503 			if (devopsp[i]->devo_cb_ops) {
1504 				dnp[i].ops |= DI_CB_OPS;
1505 				if (devopsp[i]->devo_cb_ops->cb_str)
1506 					dnp[i].ops |= DI_STREAM_OPS;
1507 			}
1508 			if (NEXUS_DRV(devopsp[i])) {
1509 				dnp[i].ops |= DI_BUS_OPS;
1510 			}
1511 		}
1512 
1513 		mutex_exit(&devnamesp[i].dn_lock);
1514 	}
1515 
1516 	dcmn_err((CE_CONT, "End copying devnamesp at offset 0x%x\n", off));
1517 
1518 	return (off);
1519 }
1520 
1521 /*
1522  * Copy the kernel devinfo tree. The tree and the devnames array form
1523  * the entire snapshot (see also di_copydevnm).
1524  */
1525 static di_off_t
1526 di_copytree(struct dev_info *root, di_off_t *off_p, struct di_state *st)
1527 {
1528 	di_off_t off;
1529 	struct di_stack *dsp = kmem_zalloc(sizeof (struct di_stack), KM_SLEEP);
1530 
1531 	dcmn_err((CE_CONT, "di_copytree: root = %p, *off_p = %x\n",
1532 		(void *)root, *off_p));
1533 
1534 	/* force attach drivers */
1535 	if ((i_ddi_node_state((dev_info_t *)root) == DS_READY) &&
1536 	    (st->command & DINFOSUBTREE) && (st->command & DINFOFORCE)) {
1537 		(void) ndi_devi_config((dev_info_t *)root,
1538 		    NDI_CONFIG | NDI_DEVI_PERSIST | NDI_NO_EVENT |
1539 		    NDI_DRV_CONF_REPROBE);
1540 	}
1541 
1542 	/*
1543 	 * Push top_devinfo onto a stack
1544 	 *
1545 	 * The stack is necessary to avoid recursion, which can overrun
1546 	 * the kernel stack.
1547 	 */
1548 	PUSH_STACK(dsp, root, off_p);
1549 
1550 	/*
1551 	 * As long as there is a node on the stack, copy the node.
1552 	 * di_copynode() is responsible for pushing and popping
1553 	 * child and sibling nodes on the stack.
1554 	 */
1555 	while (!EMPTY_STACK(dsp)) {
1556 		off = di_copynode(dsp, st);
1557 	}
1558 
1559 	/*
1560 	 * Free the stack structure
1561 	 */
1562 	kmem_free(dsp, sizeof (struct di_stack));
1563 
1564 	return (off);
1565 }
1566 
1567 /*
1568  * This is the core function, which copies all data associated with a single
1569  * node into the snapshot. The amount of information is determined by the
1570  * ioctl command.
1571  */
1572 static di_off_t
1573 di_copynode(struct di_stack *dsp, struct di_state *st)
1574 {
1575 	di_off_t off;
1576 	struct di_node *me;
1577 	struct dev_info *node;
1578 
1579 	dcmn_err2((CE_CONT, "di_copynode: depth = %x\n",
1580 			dsp->depth));
1581 
1582 	node = TOP_NODE(dsp);
1583 
1584 	ASSERT(node != NULL);
1585 
1586 	/*
1587 	 * check memory usage, and fix offsets accordingly.
1588 	 */
1589 	off = di_checkmem(st, *(TOP_OFFSET(dsp)), sizeof (struct di_node));
1590 	*(TOP_OFFSET(dsp)) = off;
1591 	me = DI_NODE(di_mem_addr(st, off));
1592 
1593 	dcmn_err((CE_CONT, "copy node %s, instance #%d, at offset 0x%x\n",
1594 			node->devi_node_name, node->devi_instance, off));
1595 
1596 	/*
1597 	 * Node parameters:
1598 	 * self		-- offset of current node within snapshot
1599 	 * nodeid	-- pointer to PROM node (tri-valued)
1600 	 * state	-- hot plugging device state
1601 	 * node_state	-- devinfo node state (CF1, CF2, etc.)
1602 	 */
1603 	me->self = off;
1604 	me->instance = node->devi_instance;
1605 	me->nodeid = node->devi_nodeid;
1606 	me->node_class = node->devi_node_class;
1607 	me->attributes = node->devi_node_attributes;
1608 	me->state = node->devi_state;
1609 	me->node_state = node->devi_node_state;
1610 	me->user_private_data = NULL;
1611 
1612 	/*
1613 	 * Get parent's offset in snapshot from the stack
1614 	 * and store it in the current node
1615 	 */
1616 	if (dsp->depth > 1) {
1617 		me->parent = *(PARENT_OFFSET(dsp));
1618 	}
1619 
1620 	/*
1621 	 * Save the offset of this di_node in a hash table.
1622 	 * This is used later to resolve references to this
1623 	 * dip from other parts of the tree (per-driver list,
1624 	 * multipathing linkages, layered usage linkages).
1625 	 * The key used for the hash table is derived from
1626 	 * information in the dip.
1627 	 */
1628 	di_register_dip(st, (dev_info_t *)node, me->self);
1629 
1630 	/*
1631 	 * increment offset
1632 	 */
1633 	off += sizeof (struct di_node);
1634 
1635 #ifdef	DEVID_COMPATIBILITY
1636 	/* check for devid as property marker */
1637 	if (node->devi_devid) {
1638 		ddi_devid_t	devid;
1639 		char 		*devidstr;
1640 		int		devid_size;
1641 
1642 		/*
1643 		 * The devid is now represented as a property.
1644 		 * For micro-release compatibility with the di_devid interface
1645 		 * in libdevinfo we must return it as a binary structure in
1646 		 * the snapshot.  When di_devid is removed from libdevinfo
1647 		 * in a future release (and devi_devid is deleted) then
1648 		 * code related to DEVID_COMPATIBILITY can be removed.
1649 		 */
1650 		ASSERT(node->devi_devid == DEVID_COMPATIBILITY);
1651 /* XXX should be DDI_DEV_T_NONE! */
1652 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, (dev_info_t *)node,
1653 		    DDI_PROP_DONTPASS, DEVID_PROP_NAME, &devidstr) ==
1654 		    DDI_PROP_SUCCESS) {
1655 			if (ddi_devid_str_decode(devidstr, &devid, NULL) ==
1656 			    DDI_SUCCESS) {
1657 				devid_size = ddi_devid_sizeof(devid);
1658 				off = di_checkmem(st, off, devid_size);
1659 				me->devid = off;
1660 				bcopy(devid,
1661 				    di_mem_addr(st, off), devid_size);
1662 				off += devid_size;
1663 				ddi_devid_free(devid);
1664 			}
1665 			ddi_prop_free(devidstr);
1666 		}
1667 	}
1668 #endif	/* DEVID_COMPATIBILITY */
1669 
1670 	if (node->devi_node_name) {
1671 		off = di_checkmem(st, off, strlen(node->devi_node_name) + 1);
1672 		me->node_name = off;
1673 		(void) strcpy(di_mem_addr(st, off), node->devi_node_name);
1674 		off += strlen(node->devi_node_name) + 1;
1675 	}
1676 
1677 	if (node->devi_compat_names && (node->devi_compat_length > 1)) {
1678 		off = di_checkmem(st, off, node->devi_compat_length);
1679 		me->compat_names = off;
1680 		me->compat_length = node->devi_compat_length;
1681 		bcopy(node->devi_compat_names, di_mem_addr(st, off),
1682 			node->devi_compat_length);
1683 		off += node->devi_compat_length;
1684 	}
1685 
1686 	if (node->devi_addr) {
1687 		off = di_checkmem(st, off, strlen(node->devi_addr) + 1);
1688 		me->address = off;
1689 		(void) strcpy(di_mem_addr(st, off), node->devi_addr);
1690 		off += strlen(node->devi_addr) + 1;
1691 	}
1692 
1693 	if (node->devi_binding_name) {
1694 		off = di_checkmem(st, off, strlen(node->devi_binding_name) + 1);
1695 		me->bind_name = off;
1696 		(void) strcpy(di_mem_addr(st, off), node->devi_binding_name);
1697 		off += strlen(node->devi_binding_name) + 1;
1698 	}
1699 
1700 	me->drv_major = node->devi_major;
1701 
1702 	/*
1703 	 * If the dip is BOUND, set the next pointer of the
1704 	 * per-instance list to -1, indicating that it is yet to be resolved.
1705 	 * This will be resolved later in snap_driver_list().
1706 	 */
1707 	if (me->drv_major != -1) {
1708 		me->next = -1;
1709 	} else {
1710 		me->next = 0;
1711 	}
1712 
1713 	/*
1714 	 * An optimization to skip mutex_enter when not needed.
1715 	 */
1716 	if (!((DINFOMINOR | DINFOPROP | DINFOPATH) & st->command)) {
1717 		goto priv_data;
1718 	}
1719 
1720 	/*
1721 	 * Grab current per dev_info node lock to
1722 	 * get minor data and properties.
1723 	 */
1724 	mutex_enter(&(node->devi_lock));
1725 
1726 	if (!(DINFOMINOR & st->command)) {
1727 		goto path;
1728 	}
1729 
1730 	if (node->devi_minor) {		/* minor data */
1731 		me->minor_data = DI_ALIGN(off);
1732 		off = di_getmdata(node->devi_minor, &me->minor_data,
1733 		    me->self, st);
1734 	}
1735 
1736 path:
1737 	if (!(DINFOPATH & st->command)) {
1738 		goto property;
1739 	}
1740 
1741 	if (MDI_CLIENT(node)) {
1742 		me->multipath_client = DI_ALIGN(off);
1743 		off = di_getpath_data((dev_info_t *)node, &me->multipath_client,
1744 		    me->self, st, 1);
1745 		dcmn_err((CE_WARN, "me->multipath_client = %x for node %p "
1746 		    "component type = %d.  off=%d",
1747 		    me->multipath_client,
1748 		    (void *)node, node->devi_mdi_component, off));
1749 	}
1750 
1751 	if (MDI_PHCI(node)) {
1752 		me->multipath_phci = DI_ALIGN(off);
1753 		off = di_getpath_data((dev_info_t *)node, &me->multipath_phci,
1754 		    me->self, st, 0);
1755 		dcmn_err((CE_WARN, "me->multipath_phci = %x for node %p "
1756 		    "component type = %d.  off=%d",
1757 		    me->multipath_phci,
1758 		    (void *)node, node->devi_mdi_component, off));
1759 	}
1760 
1761 property:
1762 	if (!(DINFOPROP & st->command)) {
1763 		goto unlock;
1764 	}
1765 
1766 	if (node->devi_drv_prop_ptr) {	/* driver property list */
1767 		me->drv_prop = DI_ALIGN(off);
1768 		off = di_getprop(node->devi_drv_prop_ptr, &me->drv_prop, st,
1769 			node, DI_PROP_DRV_LIST);
1770 	}
1771 
1772 	if (node->devi_sys_prop_ptr) {	/* system property list */
1773 		me->sys_prop = DI_ALIGN(off);
1774 		off = di_getprop(node->devi_sys_prop_ptr, &me->sys_prop, st,
1775 			node, DI_PROP_SYS_LIST);
1776 	}
1777 
1778 	if (node->devi_hw_prop_ptr) {	/* hardware property list */
1779 		me->hw_prop = DI_ALIGN(off);
1780 		off = di_getprop(node->devi_hw_prop_ptr, &me->hw_prop, st,
1781 			node, DI_PROP_HW_LIST);
1782 	}
1783 
1784 	if (node->devi_global_prop_list == NULL) {
1785 		me->glob_prop = (di_off_t)-1;	/* not global property */
1786 	} else {
1787 		/*
1788 		 * Make a copy of the global property list if this devinfo
1789 		 * refers to global properties different from those on the
1790 		 * devnames array. This can happen if there has been a forced
1791 		 * driver.conf update. See mod_drv(1M).
1792 		 */
1793 		ASSERT(me->drv_major != -1);
1794 		if (node->devi_global_prop_list !=
1795 		    devnamesp[me->drv_major].dn_global_prop_ptr) {
1796 			me->glob_prop = DI_ALIGN(off);
1797 			off = di_getprop(node->devi_global_prop_list->prop_list,
1798 			    &me->glob_prop, st, node, DI_PROP_GLB_LIST);
1799 		}
1800 	}
1801 
1802 unlock:
1803 	/*
1804 	 * release current per dev_info node lock
1805 	 */
1806 	mutex_exit(&(node->devi_lock));
1807 
1808 priv_data:
1809 	if (!(DINFOPRIVDATA & st->command)) {
1810 		goto pm_info;
1811 	}
1812 
1813 	if (ddi_get_parent_data((dev_info_t *)node) != NULL) {
1814 		me->parent_data = DI_ALIGN(off);
1815 		off = di_getppdata(node, &me->parent_data, st);
1816 	}
1817 
1818 	if (ddi_get_driver_private((dev_info_t *)node) != NULL) {
1819 		me->driver_data = DI_ALIGN(off);
1820 		off = di_getdpdata(node, &me->driver_data, st);
1821 	}
1822 
1823 pm_info: /* NOT implemented */
1824 
1825 subtree:
1826 	if (!(DINFOSUBTREE & st->command)) {
1827 		POP_STACK(dsp);
1828 		return (DI_ALIGN(off));
1829 	}
1830 
1831 child:
1832 	/*
1833 	 * If there is a child--push child onto stack.
1834 	 * Hold the parent busy while doing so.
1835 	 */
1836 	if (node->devi_child) {
1837 		me->child = DI_ALIGN(off);
1838 		PUSH_STACK(dsp, node->devi_child, &me->child);
1839 		return (me->child);
1840 	}
1841 
1842 sibling:
1843 	/*
1844 	 * no child node, unroll the stack till a sibling of
1845 	 * a parent node is found or root node is reached
1846 	 */
1847 	POP_STACK(dsp);
1848 	while (!EMPTY_STACK(dsp) && (node->devi_sibling == NULL)) {
1849 		node = TOP_NODE(dsp);
1850 		me = DI_NODE(di_mem_addr(st, *(TOP_OFFSET(dsp))));
1851 		POP_STACK(dsp);
1852 	}
1853 
1854 	if (!EMPTY_STACK(dsp)) {
1855 		/*
1856 		 * a sibling is found, replace top of stack by its sibling
1857 		 */
1858 		me->sibling = DI_ALIGN(off);
1859 		PUSH_STACK(dsp, node->devi_sibling, &me->sibling);
1860 		return (me->sibling);
1861 	}
1862 
1863 	/*
1864 	 * DONE with all nodes
1865 	 */
1866 	return (DI_ALIGN(off));
1867 }
1868 
1869 static i_lnode_t *
1870 i_lnode_alloc(int modid)
1871 {
1872 	i_lnode_t	*i_lnode;
1873 
1874 	i_lnode = kmem_zalloc(sizeof (i_lnode_t), KM_SLEEP);
1875 
1876 	ASSERT(modid != -1);
1877 	i_lnode->modid = modid;
1878 
1879 	return (i_lnode);
1880 }
1881 
1882 static void
1883 i_lnode_free(i_lnode_t *i_lnode)
1884 {
1885 	kmem_free(i_lnode, sizeof (i_lnode_t));
1886 }
1887 
1888 static void
1889 i_lnode_check_free(i_lnode_t *i_lnode)
1890 {
1891 	/* This lnode and its dip must have been snapshotted */
1892 	ASSERT(i_lnode->self > 0);
1893 	ASSERT(i_lnode->di_node->self > 0);
1894 
1895 	/* at least 1 link (in or out) must exist for this lnode */
1896 	ASSERT(i_lnode->link_in || i_lnode->link_out);
1897 
1898 	i_lnode_free(i_lnode);
1899 }
1900 
1901 static i_link_t *
1902 i_link_alloc(int spec_type)
1903 {
1904 	i_link_t *i_link;
1905 
1906 	i_link = kmem_zalloc(sizeof (i_link_t), KM_SLEEP);
1907 	i_link->spec_type = spec_type;
1908 
1909 	return (i_link);
1910 }
1911 
1912 static void
1913 i_link_check_free(i_link_t *i_link)
1914 {
1915 	/* This link must have been snapshotted */
1916 	ASSERT(i_link->self > 0);
1917 
1918 	/* Both endpoint lnodes must exist for this link */
1919 	ASSERT(i_link->src_lnode);
1920 	ASSERT(i_link->tgt_lnode);
1921 
1922 	kmem_free(i_link, sizeof (i_link_t));
1923 }
1924 
1925 /*ARGSUSED*/
1926 static uint_t
1927 i_lnode_hashfunc(void *arg, mod_hash_key_t key)
1928 {
1929 	i_lnode_t	*i_lnode = (i_lnode_t *)key;
1930 	struct di_node	*ptr;
1931 	dev_t		dev;
1932 
1933 	dev = i_lnode->devt;
1934 	if (dev != DDI_DEV_T_NONE)
1935 		return (i_lnode->modid + getminor(dev) + getmajor(dev));
1936 
1937 	ptr = i_lnode->di_node;
1938 	if (ptr) {
1939 		uintptr_t k = (uintptr_t)ptr;
1940 		ASSERT(ptr->self > 0);
1941 		k >>= (int)highbit(sizeof (struct di_node));
1942 		return ((uint_t)k);
1943 	}
1944 
1945 	return (i_lnode->modid);
1946 }
1947 
1948 static int
1949 i_lnode_cmp(void *arg1, void *arg2)
1950 {
1951 	i_lnode_t	*i_lnode1 = (i_lnode_t *)arg1;
1952 	i_lnode_t	*i_lnode2 = (i_lnode_t *)arg2;
1953 
1954 	if (i_lnode1->modid != i_lnode2->modid) {
1955 		return ((i_lnode1->modid < i_lnode2->modid) ? -1 : 1);
1956 	}
1957 
1958 	if (i_lnode1->di_node != i_lnode2->di_node)
1959 		return ((i_lnode1->di_node < i_lnode2->di_node) ? -1 : 1);
1960 
1961 	if (i_lnode1->devt != i_lnode2->devt)
1962 		return ((i_lnode1->devt < i_lnode2->devt) ? -1 : 1);
1963 
1964 	return (0);
1965 }
1966 
1967 /*
1968  * An lnode represents a {dip, dev_t} tuple. A link represents a
1969  * {src_lnode, tgt_lnode, spec_type} tuple.
1970  * The following callback assumes that the LDI framework ref-counts the
1971  * src_dip and tgt_dip while invoking this callback.
1972  */
1973 static int
1974 di_ldi_callback(const ldi_usage_t *ldi_usage, void *arg)
1975 {
1976 	struct di_state	*st = (struct di_state *)arg;
1977 	i_lnode_t	*src_lnode, *tgt_lnode, *i_lnode;
1978 	i_link_t	**i_link_next, *i_link;
1979 	di_off_t	soff, toff;
1980 	mod_hash_val_t	nodep = NULL;
1981 	int		res;
1982 
1983 	/*
1984 	 * If the source or target of this device usage information doesn't
1985 	 * correspond to a device node, then we don't report it via
1986 	 * libdevinfo, so return.
1987 	 */
1988 	if ((ldi_usage->src_dip == NULL) || (ldi_usage->tgt_dip == NULL))
1989 		return (LDI_USAGE_CONTINUE);
1990 
1991 	ASSERT(e_ddi_devi_holdcnt(ldi_usage->src_dip));
1992 	ASSERT(e_ddi_devi_holdcnt(ldi_usage->tgt_dip));
1993 
1994 	/*
1995 	 * Skip the ldi_usage if either src or tgt dip is not in the
1996 	 * snapshot. This saves us from pruning bad lnodes/links later.
1997 	 */
1998 	if (di_dip_find(st, ldi_usage->src_dip, &soff) != 0)
1999 		return (LDI_USAGE_CONTINUE);
2000 	if (di_dip_find(st, ldi_usage->tgt_dip, &toff) != 0)
2001 		return (LDI_USAGE_CONTINUE);
2002 
2003 	ASSERT(soff > 0);
2004 	ASSERT(toff > 0);
2005 
2006 	/*
2007 	 * allocate an i_lnode and add it to the lnode hash
2008 	 * if it is not already present. For this particular
2009 	 * link the lnode is a source, but it may
2010 	 * participate as tgt or src in any number of layered
2011 	 * operations - so it may already be in the hash.
2012 	 */
2013 	i_lnode = i_lnode_alloc(ldi_usage->src_modid);
2014 	i_lnode->di_node = (struct di_node *)di_mem_addr(st, soff);
2015 	i_lnode->devt = ldi_usage->src_devt;
2016 
2017 	res = mod_hash_find(st->lnode_hash, i_lnode, &nodep);
2018 	if (res == MH_ERR_NOTFOUND) {
2019 		/*
2020 		 * new i_lnode
2021 		 * add it to the hash and increment the lnode count
2022 		 */
2023 		res = mod_hash_insert(st->lnode_hash, i_lnode, i_lnode);
2024 		ASSERT(res == 0);
2025 		st->lnode_count++;
2026 		src_lnode = i_lnode;
2027 	} else {
2028 		/* this i_lnode already exists in the lnode_hash */
2029 		i_lnode_free(i_lnode);
2030 		src_lnode = (i_lnode_t *)nodep;
2031 	}
2032 
2033 	/*
2034 	 * allocate a tgt i_lnode and add it to the lnode hash
2035 	 */
2036 	i_lnode = i_lnode_alloc(ldi_usage->tgt_modid);
2037 	i_lnode->di_node = (struct di_node *)di_mem_addr(st, toff);
2038 	i_lnode->devt = ldi_usage->tgt_devt;
2039 
2040 	res = mod_hash_find(st->lnode_hash, i_lnode, &nodep);
2041 	if (res == MH_ERR_NOTFOUND) {
2042 		/*
2043 		 * new i_lnode
2044 		 * add it to the hash and increment the lnode count
2045 		 */
2046 		res = mod_hash_insert(st->lnode_hash, i_lnode, i_lnode);
2047 		ASSERT(res == 0);
2048 		st->lnode_count++;
2049 		tgt_lnode = i_lnode;
2050 	} else {
2051 		/* this i_lnode already exists in the lnode_hash */
2052 		i_lnode_free(i_lnode);
2053 		tgt_lnode = (i_lnode_t *)nodep;
2054 	}
2055 
2056 	/*
2057 	 * allocate a i_link
2058 	 */
2059 	i_link = i_link_alloc(ldi_usage->tgt_spec_type);
2060 	i_link->src_lnode = src_lnode;
2061 	i_link->tgt_lnode = tgt_lnode;
2062 
2063 	/*
2064 	 * add this link onto the src i_lnodes outbound i_link list
2065 	 */
2066 	i_link_next = &(src_lnode->link_out);
2067 	while (*i_link_next != NULL) {
2068 		if ((i_lnode_cmp(tgt_lnode, (*i_link_next)->tgt_lnode) == 0) &&
2069 		    (i_link->spec_type == (*i_link_next)->spec_type)) {
2070 			/* this link already exists */
2071 			kmem_free(i_link, sizeof (i_link_t));
2072 			return (LDI_USAGE_CONTINUE);
2073 		}
2074 		i_link_next = &((*i_link_next)->src_link_next);
2075 	}
2076 	*i_link_next = i_link;
2077 
2078 	/*
2079 	 * add this link onto the tgt i_lnodes inbound i_link list
2080 	 */
2081 	i_link_next = &(tgt_lnode->link_in);
2082 	while (*i_link_next != NULL) {
2083 		ASSERT(i_lnode_cmp(src_lnode, (*i_link_next)->src_lnode) != 0);
2084 		i_link_next = &((*i_link_next)->tgt_link_next);
2085 	}
2086 	*i_link_next = i_link;
2087 
2088 	/*
2089 	 * add this i_link to the link hash
2090 	 */
2091 	res = mod_hash_insert(st->link_hash, i_link, i_link);
2092 	ASSERT(res == 0);
2093 	st->link_count++;
2094 
2095 	return (LDI_USAGE_CONTINUE);
2096 }
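
/*
 * The lookup-then-insert pattern above (allocate a candidate key,
 * probe the hash, free the candidate on a hit) is applied to both
 * endpoints.  A minimal standalone sketch of the pattern, using the
 * same mod_hash interfaces (names of the locals are illustrative):
 *
 *	i_lnode_t *cand = i_lnode_alloc(modid);
 *	cand->di_node = dnp;
 *	cand->devt = devt;
 *	if (mod_hash_find(hash, cand, &val) == MH_ERR_NOTFOUND) {
 *		(void) mod_hash_insert(hash, cand, cand);
 *		lnode = cand;			// first sighting
 *	} else {
 *		i_lnode_free(cand);		// already hashed
 *		lnode = (i_lnode_t *)val;
 *	}
 */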
2097 
2098 struct i_layer_data {
2099 	struct di_state	*st;
2100 	int		lnode_count;
2101 	int		link_count;
2102 	di_off_t	lnode_off;
2103 	di_off_t	link_off;
2104 };
2105 
2106 /*ARGSUSED*/
2107 static uint_t
2108 i_link_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
2109 {
2110 	i_link_t		*i_link  = (i_link_t *)key;
2111 	struct i_layer_data	*data = arg;
2112 	struct di_link		*me;
2113 	struct di_lnode		*melnode;
2114 	struct di_node		*medinode;
2115 
2116 	ASSERT(i_link->self == 0);
2117 
2118 	i_link->self = data->link_off +
2119 	    (data->link_count * sizeof (struct di_link));
2120 	data->link_count++;
2121 
2122 	ASSERT(data->link_off > 0 && data->link_count > 0);
2123 	ASSERT(data->lnode_count == data->st->lnode_count); /* lnodes done */
2124 	ASSERT(data->link_count <= data->st->link_count);
2125 
2126 	/* fill in fields for the di_link snapshot */
2127 	me = (struct di_link *)di_mem_addr(data->st, i_link->self);
2128 	me->self = i_link->self;
2129 	me->spec_type = i_link->spec_type;
2130 
2131 	/*
2132 	 * The src_lnode and tgt_lnode i_lnode_t for this i_link_t
2133 	 * are created during the LDI table walk. Since we are
2134 	 * walking the link hash, the lnode hash has already been
2135 	 * walked and the lnodes have been snapshotted. Save lnode
2136 	 * offsets.
2137 	 */
2138 	me->src_lnode = i_link->src_lnode->self;
2139 	me->tgt_lnode = i_link->tgt_lnode->self;
2140 
2141 	/*
2142 	 * Save this link's offset in the src_lnode snapshot's link_out
2143 	 * field
2144 	 */
2145 	melnode = (struct di_lnode *)di_mem_addr(data->st, me->src_lnode);
2146 	me->src_link_next = melnode->link_out;
2147 	melnode->link_out = me->self;
2148 
2149 	/*
2150 	 * Put this link on the tgt_lnode's link_in field
2151 	 */
2152 	melnode = (struct di_lnode *)di_mem_addr(data->st, me->tgt_lnode);
2153 	me->tgt_link_next = melnode->link_in;
2154 	melnode->link_in = me->self;
2155 
2156 	/*
2157 	 * An i_lnode_t is only created if the corresponding dip exists
2158 	 * in the snapshot. A pointer to the di_node is saved in the
2159 	 * i_lnode_t when it is allocated. For this link, get the di_node
2160 	 * for the source lnode. Then put the link on the di_node's list
2161 	 * of src links
2162 	 */
2163 	medinode = i_link->src_lnode->di_node;
2164 	me->src_node_next = medinode->src_links;
2165 	medinode->src_links = me->self;
2166 
2167 	/*
2168 	 * Put this link on the tgt_links list of the target
2169 	 * dip.
2170 	 */
2171 	medinode = i_link->tgt_lnode->di_node;
2172 	me->tgt_node_next = medinode->tgt_links;
2173 	medinode->tgt_links = me->self;
2174 
2175 	return (MH_WALK_CONTINUE);
2176 }
2177 
2178 /*ARGSUSED*/
2179 static uint_t
2180 i_lnode_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
2181 {
2182 	i_lnode_t		*i_lnode = (i_lnode_t *)key;
2183 	struct i_layer_data	*data = arg;
2184 	struct di_lnode		*me;
2185 	struct di_node		*medinode;
2186 
2187 	ASSERT(i_lnode->self == 0);
2188 
2189 	i_lnode->self = data->lnode_off +
2190 	    (data->lnode_count * sizeof (struct di_lnode));
2191 	data->lnode_count++;
2192 
2193 	ASSERT(data->lnode_off > 0 && data->lnode_count > 0);
2194 	ASSERT(data->link_count == 0); /* links not done yet */
2195 	ASSERT(data->lnode_count <= data->st->lnode_count);
2196 
2197 	/* fill in fields for the di_lnode snapshot */
2198 	me = (struct di_lnode *)di_mem_addr(data->st, i_lnode->self);
2199 	me->self = i_lnode->self;
2200 
2201 	if (i_lnode->devt == DDI_DEV_T_NONE) {
2202 		me->dev_major = (major_t)-1;
2203 		me->dev_minor = (minor_t)-1;
2204 	} else {
2205 		me->dev_major = getmajor(i_lnode->devt);
2206 		me->dev_minor = getminor(i_lnode->devt);
2207 	}
2208 
2209 	/*
2210 	 * The dip corresponding to this lnode must exist in
2211 	 * the snapshot or we wouldn't have created the i_lnode_t
2212 	 * during LDI walk. Save the offset of the dip.
2213 	 */
2214 	ASSERT(i_lnode->di_node && i_lnode->di_node->self > 0);
2215 	me->node = i_lnode->di_node->self;
2216 
2217 	/*
2218 	 * There must be at least one link in or out of this lnode
2219 	 * or we wouldn't have created it. These fields will be set
2220 	 * during the link hash walk.
2221 	 */
2222 	ASSERT((i_lnode->link_in != NULL) || (i_lnode->link_out != NULL));
2223 
2224 	/*
2225 	 * set the offset of the devinfo node associated with this
2226 	 * lnode. Also update the node_next next pointer.  this pointer
2227 	 * is set if there are multiple lnodes associated with the same
2228 	 * devinfo node.  (could occure when multiple minor nodes
2229 	 * are open for one device, etc.)
2230 	 */
2231 	medinode = i_lnode->di_node;
2232 	me->node_next = medinode->lnodes;
2233 	medinode->lnodes = me->self;
2234 
2235 	return (MH_WALK_CONTINUE);
2236 }
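
/*
 * Both walkers above build singly-linked lists out of di_off_t
 * offsets rather than pointers, so the snapshot remains valid when
 * copied out or mapped at a different address.  Head insertion is
 * the usual idiom, just with offsets.  Sketch of how a consumer
 * (names hypothetical) could follow the per-node lnode list, given
 * the mapped snapshot base address:
 *
 *	struct di_lnode *ln;
 *	di_off_t loff;
 *
 *	for (loff = dnp->lnodes; loff != 0; loff = ln->node_next) {
 *		ln = (struct di_lnode *)(snapshot_base + loff);
 *		// examine ln->dev_major, ln->dev_minor, ln->node ...
 *	}
 */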
2237 
2238 static di_off_t
2239 di_getlink_data(di_off_t off, struct di_state *st)
2240 {
2241 	struct i_layer_data data = {0};
2242 	size_t size;
2243 
2244 	dcmn_err2((CE_CONT, "di_copylyr: off = %x\n", off));
2245 
2246 	st->lnode_hash = mod_hash_create_extended("di_lnode_hash", 32,
2247 	    mod_hash_null_keydtor, (void (*)(mod_hash_val_t))i_lnode_check_free,
2248 	    i_lnode_hashfunc, NULL, i_lnode_cmp, KM_SLEEP);
2249 
2250 	st->link_hash = mod_hash_create_ptrhash("di_link_hash", 32,
2251 	    (void (*)(mod_hash_val_t))i_link_check_free, sizeof (i_link_t));
2252 
2253 	/* get driver layering information */
2254 	(void) ldi_usage_walker(st, di_ldi_callback);
2255 
2256 	/* check if there is any link data to include in the snapshot */
2257 	if (st->lnode_count == 0) {
2258 		ASSERT(st->link_count == 0);
2259 		goto out;
2260 	}
2261 
2262 	ASSERT(st->link_count != 0);
2263 
2264 	/* get a pointer to snapshot memory for all the di_lnodes */
2265 	size = sizeof (struct di_lnode) * st->lnode_count;
2266 	data.lnode_off = off = di_checkmem(st, off, size);
2267 	off += DI_ALIGN(size);
2268 
2269 	/* get a pointer to snapshot memory for all the di_links */
2270 	size = sizeof (struct di_link) * st->link_count;
2271 	data.link_off = off = di_checkmem(st, off, size);
2272 	off += DI_ALIGN(size);
2273 
2274 	data.lnode_count = data.link_count = 0;
2275 	data.st = st;
2276 
2277 	/*
2278 	 * We have lnodes and links that will go into the
2279 	 * snapshot, so let's walk the respective hashes
2280 	 * and snapshot them. The various linkages are
2281 	 * also set up during the walk.
2282 	 */
2283 	mod_hash_walk(st->lnode_hash, i_lnode_walker, (void *)&data);
2284 	ASSERT(data.lnode_count == st->lnode_count);
2285 
2286 	mod_hash_walk(st->link_hash, i_link_walker, (void *)&data);
2287 	ASSERT(data.link_count == st->link_count);
2288 
2289 out:
2290 	/* free up the i_lnodes and i_links used to create the snapshot */
2291 	mod_hash_destroy_hash(st->lnode_hash);
2292 	mod_hash_destroy_hash(st->link_hash);
2293 	st->lnode_count = 0;
2294 	st->link_count = 0;
2295 
2296 	return (off);
2297 }
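
/*
 * For concreteness, the layout di_getlink_data() produces with, say,
 * lnode_count == 3 and link_count == 2 (counts hypothetical, and
 * assuming di_checkmem() needn't start a new memory chunk):
 *
 *	lnode_off = off
 *	link_off  = lnode_off + DI_ALIGN(3 * sizeof (struct di_lnode))
 *	i-th lnode self == lnode_off + i * sizeof (struct di_lnode)
 *	j-th link  self == link_off  + j * sizeof (struct di_link)
 *
 * which matches the ->self assignments in i_lnode_walker() and
 * i_link_walker() above.
 */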
2298 
2299 
2300 /*
2301  * Copy all minor data nodes attached to a devinfo node into the snapshot.
2302  * It is called from di_copynode with devi_lock held.
2303  */
2304 static di_off_t
2305 di_getmdata(struct ddi_minor_data *mnode, di_off_t *off_p, di_off_t node,
2306 	struct di_state *st)
2307 {
2308 	di_off_t off;
2309 	struct di_minor *me;
2310 
2311 	dcmn_err2((CE_CONT, "di_getmdata:\n"));
2312 
2313 	/*
2314 	 * check memory first
2315 	 */
2316 	off = di_checkmem(st, *off_p, sizeof (struct di_minor));
2317 	*off_p = off;
2318 
2319 	do {
2320 		me = (struct di_minor *)di_mem_addr(st, off);
2321 		me->self = off;
2322 		me->type = mnode->type;
2323 		me->node = node;
2324 		me->user_private_data = NULL;
2325 
2326 		off += DI_ALIGN(sizeof (struct di_minor));
2327 
2328 		/*
2329 		 * Split dev_t to major/minor, so it works for
2330 		 * both ILP32 and LP64 model
2331 		 */
2332 		me->dev_major = getmajor(mnode->ddm_dev);
2333 		me->dev_minor = getminor(mnode->ddm_dev);
2334 		me->spec_type = mnode->ddm_spec_type;
2335 
2336 		if (mnode->ddm_name) {
2337 			off = di_checkmem(st, off,
2338 				strlen(mnode->ddm_name) + 1);
2339 			me->name = off;
2340 			(void) strcpy(di_mem_addr(st, off), mnode->ddm_name);
2341 			off += DI_ALIGN(strlen(mnode->ddm_name) + 1);
2342 		}
2343 
2344 		if (mnode->ddm_node_type) {
2345 			off = di_checkmem(st, off,
2346 				strlen(mnode->ddm_node_type) + 1);
2347 			me->node_type = off;
2348 			(void) strcpy(di_mem_addr(st, off),
2349 					mnode->ddm_node_type);
2350 			off += DI_ALIGN(strlen(mnode->ddm_node_type) + 1);
2351 		}
2352 
2353 		off = di_checkmem(st, off, sizeof (struct di_minor));
2354 		me->next = off;
2355 		mnode = mnode->next;
2356 	} while (mnode);
2357 
2358 	me->next = 0;
2359 
2360 	return (off);
2361 }
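
/*
 * The minor records emitted above chain through me->next, with 0
 * terminating the list.  Reader-side sketch (names hypothetical; the
 * driver itself never walks the snapshot this way):
 *
 *	struct di_minor *dm;
 *	di_off_t moff;
 *
 *	for (moff = first_minor_off; moff != 0; moff = dm->next) {
 *		dm = (struct di_minor *)(snapshot_base + moff);
 *		dev_t dev = makedevice(dm->dev_major, dm->dev_minor);
 *		// dm->name, if nonzero, is the offset of the
 *		// NUL-terminated minor name string
 *	}
 */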
2362 
2363 /*
2364  * di_register_dip(), di_dip_find(): The dip must be protected
2365  * from deallocation when using these routines - this can either
2366  * be a reference count, a busy hold or a per-driver lock.
2367  */
2368 
2369 static void
2370 di_register_dip(struct di_state *st, dev_info_t *dip, di_off_t off)
2371 {
2372 	struct dev_info *node = DEVI(dip);
2373 	struct di_key *key = kmem_zalloc(sizeof (*key), KM_SLEEP);
2374 	struct di_dkey *dk;
2375 
2376 	ASSERT(dip);
2377 	ASSERT(off > 0);
2378 
2379 	key->k_type = DI_DKEY;
2380 	dk = &(key->k_u.dkey);
2381 
2382 	dk->dk_dip = dip;
2383 	dk->dk_major = node->devi_major;
2384 	dk->dk_inst = node->devi_instance;
2385 	dk->dk_nodeid = node->devi_nodeid;
2386 
2387 	if (mod_hash_insert(st->reg_dip_hash, (mod_hash_key_t)key,
2388 	    (mod_hash_val_t)(uintptr_t)off) != 0) {
2389 		panic(
2390 		    "duplicate devinfo (%p) registered during device "
2391 		    "tree walk", (void *)dip);
2392 	}
2393 }
2394 
2395 
2396 static int
2397 di_dip_find(struct di_state *st, dev_info_t *dip, di_off_t *off_p)
2398 {
2399 	/*
2400 	 * uintptr_t must be used because it matches the size of void *;
2401 	 * mod_hash expects clients to place results into pointer-size
2402 	 * containers; since di_off_t is always a 32-bit offset, alignment
2403 	 * would otherwise be broken on 64-bit kernels.
2404 	 */
2405 	uintptr_t	offset;
2406 	struct		di_key key = {0};
2407 	struct		di_dkey *dk;
2408 
2409 	ASSERT(st->reg_dip_hash);
2410 	ASSERT(dip);
2411 	ASSERT(off_p);
2412 
2413 
2414 	key.k_type = DI_DKEY;
2415 	dk = &(key.k_u.dkey);
2416 
2417 	dk->dk_dip = dip;
2418 	dk->dk_major = DEVI(dip)->devi_major;
2419 	dk->dk_inst = DEVI(dip)->devi_instance;
2420 	dk->dk_nodeid = DEVI(dip)->devi_nodeid;
2421 
2422 	if (mod_hash_find(st->reg_dip_hash, (mod_hash_key_t)&key,
2423 	    (mod_hash_val_t *)&offset) == 0) {
2424 		*off_p = (di_off_t)offset;
2425 		return (0);
2426 	} else {
2427 		return (-1);
2428 	}
2429 }
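
/*
 * How the pair is used elsewhere in this file: the tree walk
 * registers each dip's snapshot offset up front, and the LDI
 * callback later maps dips back to offsets.  Sketch:
 *
 *	di_register_dip(st, dip, off);		// while snapshotting
 *	...
 *	di_off_t soff;
 *	if (di_dip_find(st, dip, &soff) == 0)	// during LDI walk
 *		dnp = (struct di_node *)di_mem_addr(st, soff);
 *
 * The offset travels through mod_hash as a pointer-sized value,
 * hence the uintptr_t round trip above.
 */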
2430 
2431 /*
2432  * di_register_pip(), di_pip_find(): The pip must be protected from
2433  * deallocation when using these routines. The caller must do this by
2434  * protecting the client (or phci) <-> pip linkage while traversing the
2435  * list and then holding the pip when it is found in the list.
2436  */
2437 
2438 static void
2439 di_register_pip(struct di_state *st, mdi_pathinfo_t *pip, di_off_t off)
2440 {
2441 	struct di_key	*key = kmem_zalloc(sizeof (*key), KM_SLEEP);
2442 	char		*path_addr;
2443 	struct di_pkey	*pk;
2444 
2445 	ASSERT(pip);
2446 	ASSERT(off > 0);
2447 
2448 	key->k_type = DI_PKEY;
2449 	pk = &(key->k_u.pkey);
2450 
2451 	pk->pk_pip = pip;
2452 	path_addr = mdi_pi_get_addr(pip);
2453 	if (path_addr)
2454 		pk->pk_path_addr = i_ddi_strdup(path_addr, KM_SLEEP);
2455 	pk->pk_client = mdi_pi_get_client(pip);
2456 	pk->pk_phci = mdi_pi_get_phci(pip);
2457 
2458 	if (mod_hash_insert(st->reg_pip_hash, (mod_hash_key_t)key,
2459 	    (mod_hash_val_t)(uintptr_t)off) != 0) {
2460 		panic(
2461 		    "duplicate pathinfo (%p) registered during device "
2462 		    "tree walk", (void *)pip);
2463 	}
2464 }
2465 
2466 /*
2467  * As with di_register_pip, the caller must hold or lock the pip
2468  */
2469 static int
2470 di_pip_find(struct di_state *st, mdi_pathinfo_t *pip, di_off_t *off_p)
2471 {
2472 	/*
2473 	 * uintptr_t must be used because it matches the size of void *;
2474 	 * mod_hash expects clients to place results into pointer-size
2475 	 * containers; since di_off_t is always a 32-bit offset, alignment
2476 	 * would otherwise be broken on 64-bit kernels.
2477 	 */
2478 	uintptr_t	offset;
2479 	struct di_key	key = {0};
2480 	struct di_pkey	*pk;
2481 
2482 	ASSERT(st->reg_pip_hash);
2483 	ASSERT(off_p);
2484 
2485 	if (pip == NULL) {
2486 		*off_p = 0;
2487 		return (0);
2488 	}
2489 
2490 	key.k_type = DI_PKEY;
2491 	pk = &(key.k_u.pkey);
2492 
2493 	pk->pk_pip = pip;
2494 	pk->pk_path_addr = mdi_pi_get_addr(pip);
2495 	pk->pk_client = mdi_pi_get_client(pip);
2496 	pk->pk_phci = mdi_pi_get_phci(pip);
2497 
2498 	if (mod_hash_find(st->reg_pip_hash, (mod_hash_key_t)&key,
2499 	    (mod_hash_val_t *)&offset) == 0) {
2500 		*off_p = (di_off_t)offset;
2501 		return (0);
2502 	} else {
2503 		return (-1);
2504 	}
2505 }
2506 
2507 static di_path_state_t
2508 path_state_convert(mdi_pathinfo_state_t st)
2509 {
2510 	switch (st) {
2511 	case MDI_PATHINFO_STATE_ONLINE:
2512 		return (DI_PATH_STATE_ONLINE);
2513 	case MDI_PATHINFO_STATE_STANDBY:
2514 		return (DI_PATH_STATE_STANDBY);
2515 	case MDI_PATHINFO_STATE_OFFLINE:
2516 		return (DI_PATH_STATE_OFFLINE);
2517 	case MDI_PATHINFO_STATE_FAULT:
2518 		return (DI_PATH_STATE_FAULT);
2519 	default:
2520 		return (DI_PATH_STATE_UNKNOWN);
2521 	}
2522 }
2523 
2524 
2525 static di_off_t
2526 di_path_getprop(mdi_pathinfo_t *pip, di_off_t off, di_off_t *off_p,
2527     struct di_state *st)
2528 {
2529 	nvpair_t *prop = NULL;
2530 	struct di_path_prop *me;
2531 
2532 	if (mdi_pi_get_next_prop(pip, NULL) == NULL) {
2533 		*off_p = 0;
2534 		return (off);
2535 	}
2536 
2537 	off = di_checkmem(st, off, sizeof (struct di_path_prop));
2538 	*off_p = off;
2539 
2540 	while (prop = mdi_pi_get_next_prop(pip, prop)) {
2541 		int delta = 0;
2542 
2543 		me = (struct di_path_prop *)di_mem_addr(st, off);
2544 		me->self = off;
2545 		off += sizeof (struct di_path_prop);
2546 
2547 		/*
2548 		 * property name
2549 		 */
2550 		off = di_checkmem(st, off, strlen(nvpair_name(prop)) + 1);
2551 		me->prop_name = off;
2552 		(void) strcpy(di_mem_addr(st, off), nvpair_name(prop));
2553 		off += strlen(nvpair_name(prop)) + 1;
2554 
2555 		switch (nvpair_type(prop)) {
2556 		case DATA_TYPE_BYTE:
2557 		case DATA_TYPE_INT16:
2558 		case DATA_TYPE_UINT16:
2559 		case DATA_TYPE_INT32:
2560 		case DATA_TYPE_UINT32:
2561 			delta = sizeof (int32_t);
2562 			me->prop_type = DDI_PROP_TYPE_INT;
2563 			off = di_checkmem(st, off, delta);
2564 			(void) nvpair_value_int32(prop,
2565 			    (int32_t *)di_mem_addr(st, off));
2566 			break;
2567 
2568 		case DATA_TYPE_INT64:
2569 		case DATA_TYPE_UINT64:
2570 			delta = sizeof (int64_t);
2571 			me->prop_type = DDI_PROP_TYPE_INT64;
2572 			off = di_checkmem(st, off, delta);
2573 			(void) nvpair_value_int64(prop,
2574 			    (int64_t *)di_mem_addr(st, off));
2575 			break;
2576 
2577 		case DATA_TYPE_STRING:
2578 		{
2579 			char *str;
2580 			(void) nvpair_value_string(prop, &str);
2581 			delta = strlen(str) + 1;
2582 			me->prop_type = DDI_PROP_TYPE_STRING;
2583 			off = di_checkmem(st, off, delta);
2584 			(void) strcpy(di_mem_addr(st, off), str);
2585 			break;
2586 		}
2587 		case DATA_TYPE_BYTE_ARRAY:
2588 		case DATA_TYPE_INT16_ARRAY:
2589 		case DATA_TYPE_UINT16_ARRAY:
2590 		case DATA_TYPE_INT32_ARRAY:
2591 		case DATA_TYPE_UINT32_ARRAY:
2592 		case DATA_TYPE_INT64_ARRAY:
2593 		case DATA_TYPE_UINT64_ARRAY:
2594 		{
2595 			uchar_t *buf;
2596 			uint_t nelems;
2597 			(void) nvpair_value_byte_array(prop, &buf, &nelems);
2598 			delta = nelems;
2599 			me->prop_type = DDI_PROP_TYPE_BYTE;
2600 			if (nelems != 0) {
2601 				off = di_checkmem(st, off, delta);
2602 				bcopy(buf, di_mem_addr(st, off), nelems);
2603 			}
2604 			break;
2605 		}
2606 
2607 		default:	/* Unknown or unhandled type; skip it */
2608 			delta = 0;
2609 			break;
2610 		}
2611 
2612 		if (delta > 0) {
2613 			me->prop_data = off;
2614 		}
2615 
2616 		me->prop_len = delta;
2617 		off += delta;
2618 
2619 		off = di_checkmem(st, off, sizeof (struct di_path_prop));
2620 		me->prop_next = off;
2621 	}
2622 
2623 	me->prop_next = 0;
2624 	return (off);
2625 }
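
/*
 * Each nvpair is flattened into a di_path_prop record followed by
 * its name and value bytes, all addressed by offset.  For an INT32
 * property named "path-retries" (hypothetical), the record would
 * read:
 *
 *	me->prop_name	offset of "path-retries\0"
 *	me->prop_type	DDI_PROP_TYPE_INT
 *	me->prop_len	sizeof (int32_t)
 *	me->prop_data	offset of the 4 value bytes written by
 *			nvpair_value_int32()
 *
 * Array types are byte-copied wholesale via nvpair_value_byte_array()
 * and exported as DDI_PROP_TYPE_BYTE.
 */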
2626 
2627 
2628 static void
2629 di_path_one_endpoint(struct di_path *me, di_off_t noff, di_off_t **off_pp,
2630     int get_client)
2631 {
2632 	if (get_client) {
2633 		ASSERT(me->path_client == 0);
2634 		me->path_client = noff;
2635 		ASSERT(me->path_c_link == 0);
2636 		*off_pp = &me->path_c_link;
2637 		me->path_snap_state &=
2638 		    ~(DI_PATH_SNAP_NOCLIENT | DI_PATH_SNAP_NOCLINK);
2639 	} else {
2640 		ASSERT(me->path_phci == 0);
2641 		me->path_phci = noff;
2642 		ASSERT(me->path_p_link == 0);
2643 		*off_pp = &me->path_p_link;
2644 		me->path_snap_state &=
2645 		    ~(DI_PATH_SNAP_NOPHCI | DI_PATH_SNAP_NOPLINK);
2646 	}
2647 }
2648 
2649 /*
2650  * poff_p: pointer to the linkage field. This links pips along the client|phci
2651  *	   linkage list.
2652  * noff  : Offset for the endpoint dip snapshot.
2653  */
2654 static di_off_t
2655 di_getpath_data(dev_info_t *dip, di_off_t *poff_p, di_off_t noff,
2656     struct di_state *st, int get_client)
2657 {
2658 	di_off_t off;
2659 	mdi_pathinfo_t *pip;
2660 	struct di_path *me;
2661 	mdi_pathinfo_t *(*next_pip)(dev_info_t *, mdi_pathinfo_t *);
2662 
2663 	dcmn_err2((CE_WARN, "di_getpath_data: client = %d", get_client));
2664 
2665 	/*
2666 	 * The naming of the following mdi_xyz() is unfortunately
2667 	 * non-intuitive. mdi_get_next_phci_path() follows the
2668 	 * client_link i.e. the list of pip's belonging to the
2669 	 * given client dip.
2670 	 */
2671 	if (get_client)
2672 		next_pip = &mdi_get_next_phci_path;
2673 	else
2674 		next_pip = &mdi_get_next_client_path;
2675 
2676 	off = *poff_p;
2677 
2678 	pip = NULL;
2679 	while (pip = (*next_pip)(dip, pip)) {
2680 		mdi_pathinfo_state_t state;
2681 		di_off_t stored_offset;
2682 
2683 		dcmn_err((CE_WARN, "marshalling pip = %p", (void *)pip));
2684 
2685 		mdi_pi_lock(pip);
2686 
2687 		if (di_pip_find(st, pip, &stored_offset) != -1) {
2688 			/*
2689 			 * We've already seen this pathinfo node, so we take
2690 			 * care not to snap it again. However, one endpoint
2691 			 * and linkage will be set here. The other endpoint
2692 			 * and linkage were already set when the pip was
2693 			 * first snapshotted, i.e. when the other endpoint dip
2694 			 * was snapshotted.
2695 			 */
2696 			me = (struct di_path *)di_mem_addr(st, stored_offset);
2697 
2698 			*poff_p = stored_offset;
2699 
2700 			di_path_one_endpoint(me, noff, &poff_p, get_client);
2701 
2702 			/*
2703 			 * The other endpoint and linkage were set when this
2704 			 * pip was snapshotted. So we are done with both
2705 			 * endpoints and linkages.
2706 			 */
2707 			ASSERT(!(me->path_snap_state &
2708 			    (DI_PATH_SNAP_NOCLIENT|DI_PATH_SNAP_NOPHCI)));
2709 			ASSERT(!(me->path_snap_state &
2710 			    (DI_PATH_SNAP_NOCLINK|DI_PATH_SNAP_NOPLINK)));
2711 
2712 			mdi_pi_unlock(pip);
2713 			continue;
2714 		}
2715 
2716 		/*
2717 		 * Now that we need to snapshot this pip, check memory
2718 		 */
2719 		off = di_checkmem(st, off, sizeof (struct di_path));
2720 		me = (struct di_path *)di_mem_addr(st, off);
2721 		me->self = off;
2722 		*poff_p = off;
2723 		off += sizeof (struct di_path);
2724 
2725 		me->path_snap_state =
2726 		    DI_PATH_SNAP_NOCLINK | DI_PATH_SNAP_NOPLINK;
2727 		me->path_snap_state |=
2728 		    DI_PATH_SNAP_NOCLIENT | DI_PATH_SNAP_NOPHCI;
2729 
2730 		/*
2731 		 * Zero out fields as di_checkmem() doesn't guarantee
2732 		 * zero-filled memory
2733 		 */
2734 		me->path_client = me->path_phci = 0;
2735 		me->path_c_link = me->path_p_link = 0;
2736 
2737 		di_path_one_endpoint(me, noff, &poff_p, get_client);
2738 
2739 		/*
2740 		 * Note the existence of this pathinfo
2741 		 */
2742 		di_register_pip(st, pip, me->self);
2743 
2744 		state = mdi_pi_get_state(pip);
2745 		me->path_state = path_state_convert(state);
2746 
2747 		/*
2748 		 * Get intermediate addressing info.
2749 		 */
2750 		off = di_checkmem(st, off, strlen(mdi_pi_get_addr(pip)) + 1);
2751 		me->path_addr = off;
2752 		(void) strcpy(di_mem_addr(st, off), mdi_pi_get_addr(pip));
2753 		off += strlen(mdi_pi_get_addr(pip)) + 1;
2754 
2755 		/*
2756 		 * Get path properties if props are to be included in the
2757 		 * snapshot
2758 		 */
2759 		if (DINFOPROP & st->command) {
2760 			off = di_path_getprop(pip, off, &me->path_prop, st);
2761 		} else {
2762 			me->path_prop = 0;
2763 		}
2764 
2765 		mdi_pi_unlock(pip);
2766 	}
2767 
2768 	*poff_p = 0;
2769 
2770 	return (off);
2771 }
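
/*
 * Since each pathinfo node is reachable from both its client and its
 * phci dip, di_getpath_data() visits a given pip at most twice.  The
 * first visit snapshots the di_path record and fills in one endpoint;
 * the second, from the other dip, finds the record via di_pip_find()
 * and fills in only the remaining endpoint through
 * di_path_one_endpoint().  After both visits:
 *
 *	me->path_client, me->path_c_link	set from the client side
 *	me->path_phci, me->path_p_link		set from the phci side
 *	me->path_snap_state == 0		all NO* flags cleared
 */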
2772 
2773 /*
2774  * Copy a list of properties attached to a devinfo node. Called from
2775  * di_copynode with devi_lock held. The major number is passed in case
2776  * we need to call driver's prop_op entry. The value of list indicates
2777  * which list we are copying. Possible values are:
2778  * DI_PROP_DRV_LIST, DI_PROP_SYS_LIST, DI_PROP_GLB_LIST, DI_PROP_HW_LIST
2779  */
2780 static di_off_t
2781 di_getprop(struct ddi_prop *prop, di_off_t *off_p, struct di_state *st,
2782 	struct dev_info *dip, int list)
2783 {
2784 	dev_t dev;
2785 	int (*prop_op)();
2786 	int off, need_prop_op = 0;
2787 	int prop_op_fail = 0;
2788 	ddi_prop_t *propp = NULL;
2789 	struct di_prop *pp;
2790 	struct dev_ops *ops = NULL;
2791 	int prop_len;
2792 	caddr_t prop_val;
2793 
2794 
2795 	dcmn_err2((CE_CONT, "di_getprop:\n"));
2796 
2797 	ASSERT(st != NULL);
2798 
2799 	dcmn_err((CE_CONT, "copy property list at addr %p\n", (void *)prop));
2800 
2801 	/*
2802 	 * Figure out if we need to call driver's prop_op entry point.
2803 	 * The conditions are:
2804 	 *	-- driver property list
2805 	 *	-- driver must be attached and held
2806 	 *	-- driver's cb_prop_op != ddi_prop_op
2807 	 *		or parent's bus_prop_op != ddi_bus_prop_op
2808 	 */
2809 
2810 	if (list != DI_PROP_DRV_LIST) {
2811 		goto getprop;
2812 	}
2813 
2814 	/*
2815 	 * If driver is not attached or if major is -1, we ignore
2816 	 * the driver property list. No one should rely on such
2817 	 * properties.
2818 	 */
2819 	if (i_ddi_node_state((dev_info_t *)dip) < DS_ATTACHED) {
2820 		off = *off_p;
2821 		*off_p = 0;
2822 		return (off);
2823 	}
2824 
2825 	/*
2826 	 * Now we have a driver which is held. We can examine entry points
2827 	 * and check the condition listed above.
2828 	 */
2829 	ops = dip->devi_ops;
2830 
2831 	/*
2832 	 * Some nexus drivers incorrectly set cb_prop_op to nodev,
2833 	 * nulldev or even NULL.
2834 	 */
2835 	if (ops && ops->devo_cb_ops &&
2836 	    (ops->devo_cb_ops->cb_prop_op != ddi_prop_op) &&
2837 	    (ops->devo_cb_ops->cb_prop_op != nodev) &&
2838 	    (ops->devo_cb_ops->cb_prop_op != nulldev) &&
2839 	    (ops->devo_cb_ops->cb_prop_op != NULL)) {
2840 		need_prop_op = 1;
2841 	}
2842 
2843 getprop:
2844 	/*
2845 	 * check memory availability
2846 	 */
2847 	off = di_checkmem(st, *off_p, sizeof (struct di_prop));
2848 	*off_p = off;
2849 	/*
2850 	 * Now copy properties
2851 	 */
2852 	do {
2853 		pp = (struct di_prop *)di_mem_addr(st, off);
2854 		pp->self = off;
2855 		/*
2856 		 * Split dev_t to major/minor, so it works for
2857 		 * both ILP32 and LP64 model
2858 		 */
2859 		pp->dev_major = getmajor(prop->prop_dev);
2860 		pp->dev_minor = getminor(prop->prop_dev);
2861 		pp->prop_flags = prop->prop_flags;
2862 		pp->prop_list = list;
2863 
2864 		/*
2865 		 * property name
2866 		 */
2867 		off += sizeof (struct di_prop);
2868 		if (prop->prop_name) {
2869 			off = di_checkmem(st, off, strlen(prop->prop_name)
2870 			    + 1);
2871 			pp->prop_name = off;
2872 			(void) strcpy(di_mem_addr(st, off), prop->prop_name);
2873 			off += strlen(prop->prop_name) + 1;
2874 		}
2875 
2876 		/*
2877 		 * Set prop_len here. This may change later
2878 		 * if cb_prop_op returns a different length.
2879 		 */
2880 		pp->prop_len = prop->prop_len;
2881 		if (!need_prop_op) {
2882 			if (prop->prop_val == NULL) {
2883 				dcmn_err((CE_WARN,
2884 				    "devinfo: property fault at %p",
2885 				    (void *)prop));
2886 				pp->prop_data = -1;
2887 			} else if (prop->prop_len != 0) {
2888 				off = di_checkmem(st, off, prop->prop_len);
2889 				pp->prop_data = off;
2890 				bcopy(prop->prop_val, di_mem_addr(st, off),
2891 				    prop->prop_len);
2892 				off += DI_ALIGN(pp->prop_len);
2893 			}
2894 		}
2895 
2896 		off = di_checkmem(st, off, sizeof (struct di_prop));
2897 		pp->next = off;
2898 		prop = prop->prop_next;
2899 	} while (prop);
2900 
2901 	pp->next = 0;
2902 
2903 	if (!need_prop_op) {
2904 		dcmn_err((CE_CONT, "finished property "
2905 		    "list at offset 0x%x\n", off));
2906 		return (off);
2907 	}
2908 
2909 	/*
2910 	 * If there is a need to call driver's prop_op entry,
2911 	 * we must release driver's devi_lock, because the
2912 	 * cb_prop_op entry point will grab it.
2913 	 *
2914 	 * The snapshot memory has already been allocated above,
2915 	 * which means the length of an active property should
2916 	 * remain fixed for this implementation to work.
2917 	 */
2918 
2919 
2920 	prop_op = ops->devo_cb_ops->cb_prop_op;
2921 	pp = (struct di_prop *)di_mem_addr(st, *off_p);
2922 
2923 	mutex_exit(&dip->devi_lock);
2924 
2925 	do {
2926 		int err;
2927 		struct di_prop *tmp;
2928 
2929 		if (pp->next) {
2930 			tmp = (struct di_prop *)
2931 			    di_mem_addr(st, pp->next);
2932 		} else {
2933 			tmp = NULL;
2934 		}
2935 
2936 		/*
2937 		 * call into driver's prop_op entry point
2938 		 *
2939 		 * Must search DDI_DEV_T_NONE with DDI_DEV_T_ANY
2940 		 */
2941 		dev = makedevice(pp->dev_major, pp->dev_minor);
2942 		if (dev == DDI_DEV_T_NONE)
2943 			dev = DDI_DEV_T_ANY;
2944 
2945 		dcmn_err((CE_CONT, "call prop_op"
2946 		    "(%lx, %p, PROP_LEN_AND_VAL_BUF, "
2947 		    "DDI_PROP_DONTPASS, \"%s\", %p, &%d)\n",
2948 		    dev,
2949 		    (void *)dip,
2950 		    (char *)di_mem_addr(st, pp->prop_name),
2951 		    (void *)di_mem_addr(st, pp->prop_data),
2952 		    pp->prop_len));
2953 
2954 		if ((err = (*prop_op)(dev, (dev_info_t *)dip,
2955 		    PROP_LEN_AND_VAL_ALLOC, DDI_PROP_DONTPASS,
2956 		    (char *)di_mem_addr(st, pp->prop_name),
2957 		    &prop_val, &prop_len)) != DDI_PROP_SUCCESS) {
2958 			if ((propp = i_ddi_prop_search(dev,
2959 			    (char *)di_mem_addr(st, pp->prop_name),
2960 			    (uint_t)pp->prop_flags,
2961 			    &(DEVI(dip)->devi_drv_prop_ptr))) != NULL) {
2962 				pp->prop_len = propp->prop_len;
2963 				if (pp->prop_len != 0) {
2964 					off = di_checkmem(st, off,
2965 					    pp->prop_len);
2966 					pp->prop_data = off;
2967 					bcopy(propp->prop_val, di_mem_addr(st,
2968 					    pp->prop_data), propp->prop_len);
2969 					off += DI_ALIGN(pp->prop_len);
2970 				}
2971 			} else {
2972 				prop_op_fail = 1;
2973 			}
2974 		} else if (prop_len != 0) {
2975 			pp->prop_len = prop_len;
2976 			off = di_checkmem(st, off, prop_len);
2977 			pp->prop_data = off;
2978 			bcopy(prop_val, di_mem_addr(st, off), prop_len);
2979 			off += DI_ALIGN(prop_len);
2980 			kmem_free(prop_val, prop_len);
2981 		}
2982 
2983 		if (prop_op_fail) {
2984 			pp->prop_data = -1;
2985 			dcmn_err((CE_WARN, "devinfo: prop_op failure "
2986 			    "for \"%s\" err %d",
2987 			    di_mem_addr(st, pp->prop_name), err));
2988 		}
2989 
2990 		pp = tmp;
2991 
2992 	} while (pp);
2993 
2994 	mutex_enter(&dip->devi_lock);
2995 	dcmn_err((CE_CONT, "finished property list at offset 0x%x\n", off));
2996 	return (off);
2997 }
2998 
2999 /*
3000  * find private data format attached to a dip
3001  * parent = 1 to match driver name of parent dip (for parent private data)
3002  *	0 to match driver name of current dip (for driver private data)
3003  */
3004 #define	DI_MATCH_DRIVER	0
3005 #define	DI_MATCH_PARENT	1
3006 
3007 struct di_priv_format *
3008 di_match_drv_name(struct dev_info *node, struct di_state *st, int match)
3009 {
3010 	int i, count, len;
3011 	char *drv_name;
3012 	major_t major;
3013 	struct di_all *all;
3014 	struct di_priv_format *form;
3015 
3016 	dcmn_err2((CE_CONT, "di_match_drv_name: node = %s, match = %x\n",
3017 		node->devi_node_name, match));
3018 
3019 	if (match == DI_MATCH_PARENT) {
3020 		node = DEVI(node->devi_parent);
3021 	}
3022 
3023 	if (node == NULL) {
3024 		return (NULL);
3025 	}
3026 
3027 	major = ddi_name_to_major(node->devi_binding_name);
3028 	if (major == (major_t)(-1)) {
3029 		return (NULL);
3030 	}
3031 
3032 	/*
3033 	 * Match the driver name.
3034 	 */
3035 	drv_name = ddi_major_to_name(major);
3036 	if ((drv_name == NULL) || *drv_name == '\0') {
3037 		return (NULL);
3038 	}
3039 
3040 	/* Now get the di_priv_format array */
3041 	all = (struct di_all *)di_mem_addr(st, 0);
3042 
3043 	if (match == DI_MATCH_PARENT) {
3044 		count = all->n_ppdata;
3045 		form = (struct di_priv_format *)
3046 			(di_mem_addr(st, 0) + all->ppdata_format);
3047 	} else {
3048 		count = all->n_dpdata;
3049 		form = (struct di_priv_format *)
3050 			((caddr_t)all + all->dpdata_format);
3051 	}
3052 
3053 	len = strlen(drv_name);
3054 	for (i = 0; i < count; i++) {
3055 		char *tmp;
3056 
3057 		tmp = form[i].drv_name;
3058 		while (tmp && (*tmp != '\0')) {
3059 			if (strncmp(drv_name, tmp, len) == 0) {
3060 				return (&form[i]);
3061 			}
3062 			/*
3063 			 * Move to next driver name, skipping a white space
3064 			 */
3065 			if (tmp = strchr(tmp, ' ')) {
3066 				tmp++;
3067 			}
3068 		}
3069 	}
3070 
3071 	return (NULL);
3072 }
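
/*
 * form[i].drv_name holds a space-separated list of driver names, so
 * one private-data format can cover several drivers.  Matching
 * sketch (format entry hypothetical):
 *
 *	form[i].drv_name = "glm isp fas"
 *	drv_name = "isp"	=> match on the second token
 *
 * Note the test is strncmp(drv_name, tmp, strlen(drv_name)), so a
 * token that merely begins with drv_name (e.g. "fast" vs "fas")
 * would match as well.
 */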
3073 
3074 /*
3075  * The following functions copy data as specified by the format passed in.
3076  * To prevent invalid format from panicing the system, we call on_fault().
3077  * A return value of 0 indicates an error. Otherwise, the total offset
3078  * is returned.
3079  */
3080 #define	DI_MAX_PRIVDATA	(PAGESIZE >> 1)	/* max private data size */
3081 
3082 static di_off_t
3083 di_getprvdata(struct di_priv_format *pdp, void *data, di_off_t *off_p,
3084 	struct di_state *st)
3085 {
3086 	caddr_t pa;
3087 	void *ptr;
3088 	int i, size, repeat;
3089 	di_off_t off, off0, *tmp;
3090 
3091 	label_t ljb;
3092 
3093 	dcmn_err2((CE_CONT, "di_getprvdata:\n"));
3094 
3095 	/*
3096 	 * check memory availability. Private data size is
3097 	 * limited to DI_MAX_PRIVDATA.
3098 	 */
3099 	off = di_checkmem(st, *off_p, DI_MAX_PRIVDATA);
3100 
3101 	if ((pdp->bytes <= 0) || (pdp->bytes > DI_MAX_PRIVDATA)) {
3102 		goto failure;
3103 	}
3104 
3105 	if (!on_fault(&ljb)) {
3106 		/* copy the struct */
3107 		bcopy(data, di_mem_addr(st, off), pdp->bytes);
3108 		off0 = DI_ALIGN(pdp->bytes);
3109 
3110 		/* dereferencing pointers */
3111 		for (i = 0; i < MAX_PTR_IN_PRV; i++) {
3112 
3113 			if (pdp->ptr[i].size == 0) {
3114 				goto success;	/* no more ptrs */
3115 			}
3116 
3117 			/*
3118 			 * first, get the pointer content
3119 			 */
3120 			if ((pdp->ptr[i].offset < 0) ||
3121 				(pdp->ptr[i].offset >
3122 				pdp->bytes - sizeof (char *)))
3123 				goto failure;	/* wrong offset */
3124 
3125 			pa = di_mem_addr(st, off + pdp->ptr[i].offset);
3126 			tmp = (di_off_t *)pa;	/* to store off_t later */
3127 
3128 			ptr = *((void **) pa);	/* get pointer value */
3129 			if (ptr == NULL) {	/* if NULL pointer, go on */
3130 				continue;
3131 			}
3132 
3133 			/*
3134 			 * next, find the repeat count (array dimension)
3135 			 */
3136 			repeat = pdp->ptr[i].len_offset;
3137 
3138 			/*
3139 			 * A negative value indicates a fixed size array;
3140 			 * its absolute value is the repeat count.
3141 			 *
3142 			 * A value of 0 or greater indicates a variable size
3143 			 * array; the count must then be read from an int
3144 			 * member of the structure, located at that offset.
3145 			 */
3146 			if (repeat > pdp->bytes - sizeof (int)) {
3147 				goto failure;	/* wrong offset */
3148 			}
3149 
3150 			if (repeat >= 0) {
3151 				repeat = *((int *)((caddr_t)data + repeat));
3152 			} else {
3153 				repeat = -repeat;
3154 			}
3155 
3156 			/*
3157 			 * next, get the size of the object to be copied
3158 			 */
3159 			size = pdp->ptr[i].size * repeat;
3160 
3161 			/*
3162 			 * Arbitrarily limit the total size of object to be
3163 			 * copied (1 byte to 1/4 page).
3164 			 */
3165 			if ((size <= 0) || (size > (DI_MAX_PRIVDATA - off0))) {
3166 				goto failure;	/* wrong size or too big */
3167 			}
3168 
3169 			/*
3170 			 * Now copy the data
3171 			 */
3172 			*tmp = off0;
3173 			bcopy(ptr, di_mem_addr(st, off + off0), size);
3174 			off0 += DI_ALIGN(size);
3175 		}
3176 	} else {
3177 		goto failure;
3178 	}
3179 
3180 success:
3181 	/*
3182 	 * success if reached here
3183 	 */
3184 	no_fault();
3185 	*off_p = off;
3186 
3187 	return (off + off0);
3188 	/*NOTREACHED*/
3189 
3190 failure:
3191 	/*
3192 	 * fault occurred
3193 	 */
3194 	no_fault();
3195 	cmn_err(CE_WARN, "devinfo: fault in private data at %p", data);
3196 	*off_p = -1;	/* set private data to indicate error */
3197 
3198 	return (off);
3199 }
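
/*
 * Worked example of the pointer flattening above (the struct and
 * format are hypothetical).  Given private data
 *
 *	struct foo {
 *		int	nblk;
 *		char	*blks;		// nblk bytes
 *	};
 *
 * described by bytes == sizeof (struct foo), ptr[0].size == 1,
 * ptr[0].offset == offsetof(struct foo, blks) and ptr[0].len_offset
 * == offsetof(struct foo, nblk), di_getprvdata() copies the struct
 * first, appends the nblk bytes of *blks (DI_ALIGN'ed), and then
 * overwrites the snapshotted blks field with the data's offset
 * relative to the start of the struct, so a consumer can relocate
 * the pointer after mapping the snapshot anywhere.
 */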
3200 
3201 /*
3202  * get parent private data; on error, returns original offset
3203  */
3204 static di_off_t
3205 di_getppdata(struct dev_info *node, di_off_t *off_p, struct di_state *st)
3206 {
3207 	int off;
3208 	struct di_priv_format *ppdp;
3209 
3210 	dcmn_err2((CE_CONT, "di_getppdata:\n"));
3211 
3212 	/* find the parent data format */
3213 	if ((ppdp = di_match_drv_name(node, st, DI_MATCH_PARENT)) == NULL) {
3214 		off = *off_p;
3215 		*off_p = 0;	/* set parent data to none */
3216 		return (off);
3217 	}
3218 
3219 	return (di_getprvdata(ppdp, ddi_get_parent_data((dev_info_t *)node),
3220 	    off_p, st));
3221 }
3222 
3223 /*
3224  * get driver private data; on error, returns original offset
3225  */
3226 static di_off_t
3227 di_getdpdata(struct dev_info *node, di_off_t *off_p, struct di_state *st)
3228 {
3229 	int off;
3230 	struct di_priv_format *dpdp;
3231 
3232 	dcmn_err2((CE_CONT, "di_getdpdata:\n"));
3233 
3234 	/* find the driver data format */
3235 	if ((dpdp = di_match_drv_name(node, st, DI_MATCH_DRIVER)) == NULL) {
3236 		off = *off_p;
3237 		*off_p = 0;	/* set driver data to none */
3238 		return (off);
3239 	}
3240 
3241 	return (di_getprvdata(dpdp, ddi_get_driver_private((dev_info_t *)node),
3242 	    off_p, st));
3243 }
3244 
3245 /*
3246  * The driver is stateful across DINFOCPYALL and DINFOUSRLD.
3247  * This function encapsulates the state machine:
3248  *
3249  *	-> IOC_IDLE -> IOC_SNAP -> IOC_DONE -> IOC_COPY ->
3250  *	|		SNAPSHOT		USRLD	 |
3251  *	--------------------------------------------------
3252  *
3253  * Returns 0 on success and -1 on failure
3254  */
3255 static int
3256 di_setstate(struct di_state *st, int new_state)
3257 {
3258 	int ret = 0;
3259 
3260 	mutex_enter(&di_lock);
3261 	switch (new_state) {
3262 	case IOC_IDLE:
3263 	case IOC_DONE:
3264 		break;
3265 	case IOC_SNAP:
3266 		if (st->di_iocstate != IOC_IDLE)
3267 			ret = -1;
3268 		break;
3269 	case IOC_COPY:
3270 		if (st->di_iocstate != IOC_DONE)
3271 			ret = -1;
3272 		break;
3273 	default:
3274 		ret = -1;
3275 	}
3276 
3277 	if (ret == 0)
3278 		st->di_iocstate = new_state;
3279 	else
3280 		cmn_err(CE_NOTE, "incorrect state transition from %d to %d",
3281 		    st->di_iocstate, new_state);
3282 	mutex_exit(&di_lock);
3283 	return (ret);
3284 }
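
/*
 * Typical use by the ioctl path (editor's sketch; the actual calls
 * are made from di_ioctl() elsewhere in this file):
 *
 *	if (di_setstate(st, IOC_SNAP) == -1)	// reject concurrent
 *		return (EBUSY);			//   snapshot requests
 *	... take the snapshot ...
 *	(void) di_setstate(st, IOC_DONE);	// ready for DINFOUSRLD
 *
 * IOC_IDLE and IOC_DONE are accepted from any state, which lets
 * error paths reset the machine unconditionally.
 */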
3285 
3286 /*
3287  * We cannot assume the presence of the entire
3288  * snapshot in this routine. All we are guaranteed
3289  * is the di_all struct + 1 byte (for root_path)
3290  */
3291 static int
3292 header_plus_one_ok(struct di_all *all)
3293 {
3294 	/*
3295 	 * Refuse to read old versions
3296 	 */
3297 	if (all->version != DI_SNAPSHOT_VERSION) {
3298 		CACHE_DEBUG((DI_ERR, "bad version: 0x%x", all->version));
3299 		return (0);
3300 	}
3301 
3302 	if (all->cache_magic != DI_CACHE_MAGIC) {
3303 		CACHE_DEBUG((DI_ERR, "bad magic #: 0x%x", all->cache_magic));
3304 		return (0);
3305 	}
3306 
3307 	if (all->snapshot_time <= 0) {
3308 		CACHE_DEBUG((DI_ERR, "bad timestamp: %ld", all->snapshot_time));
3309 		return (0);
3310 	}
3311 
3312 	if (all->top_devinfo == 0) {
3313 		CACHE_DEBUG((DI_ERR, "NULL top devinfo"));
3314 		return (0);
3315 	}
3316 
3317 	if (all->map_size < sizeof (*all) + 1) {
3318 		CACHE_DEBUG((DI_ERR, "bad map size: %u", all->map_size));
3319 		return (0);
3320 	}
3321 
3322 	if (all->root_path[0] != '/' || all->root_path[1] != '\0') {
3323 		CACHE_DEBUG((DI_ERR, "bad rootpath: %c%c",
3324 		    all->root_path[0], all->root_path[1]));
3325 		return (0);
3326 	}
3327 
3328 	/*
3329 	 * We can't check checksum here as we just have the header
3330 	 */
3331 
3332 	return (1);
3333 }
3334 
3335 static int
3336 chunk_write(struct vnode *vp, offset_t off, caddr_t buf, size_t len)
3337 {
3338 	rlim64_t	rlimit;
3339 	ssize_t		resid;
3340 	int		error = 0;
3341 
3342 
3343 	rlimit = RLIM64_INFINITY;
3344 
3345 	while (len) {
3346 		resid = 0;
3347 		error = vn_rdwr(UIO_WRITE, vp, buf, len, off,
3348 		    UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
3349 
3350 		if (error || resid < 0) {
3351 			error = error ? error : EIO;
3352 			CACHE_DEBUG((DI_ERR, "write error: %d", error));
3353 			break;
3354 		}
3355 
3356 		/*
3357 		 * Check if we are making progress
3358 		 */
3359 		if (resid >= len) {
3360 			error = ENOSPC;
3361 			break;
3362 		}
3363 		buf += len - resid;
3364 		off += len - resid;
3365 		len = resid;
3366 	}
3367 
3368 	return (error);
3369 }
3370 
3371 extern int modrootloaded;
3372 
3373 static void
3374 di_cache_write(struct di_cache *cache)
3375 {
3376 	struct di_all	*all;
3377 	struct vnode	*vp;
3378 	int		oflags;
3379 	size_t		map_size;
3380 	size_t		chunk;
3381 	offset_t	off;
3382 	int		error;
3383 	char		*buf;
3384 
3385 	ASSERT(DI_CACHE_LOCKED(*cache));
3386 	ASSERT(!servicing_interrupt());
3387 
3388 	if (cache->cache_size == 0) {
3389 		ASSERT(cache->cache_data == NULL);
3390 		CACHE_DEBUG((DI_ERR, "Empty cache. Skipping write"));
3391 		return;
3392 	}
3393 
3394 	ASSERT(cache->cache_size > 0);
3395 	ASSERT(cache->cache_data);
3396 
3397 	if (!modrootloaded || rootvp == NULL || vn_is_readonly(rootvp)) {
3398 		CACHE_DEBUG((DI_ERR, "Can't write to rootFS. Skipping write"));
3399 		return;
3400 	}
3401 
3402 	all = (struct di_all *)cache->cache_data;
3403 
3404 	if (!header_plus_one_ok(all)) {
3405 		CACHE_DEBUG((DI_ERR, "Invalid header. Skipping write"));
3406 		return;
3407 	}
3408 
3409 	ASSERT(strcmp(all->root_path, "/") == 0);
3410 
3411 	/*
3412 	 * The cache_size is the total allocated memory for the cache.
3413 	 * The map_size is the actual size of valid data in the cache.
3414 	 * map_size may be smaller than cache_size but cannot exceed
3415 	 * cache_size.
3416 	 */
3417 	if (all->map_size > cache->cache_size) {
3418 		CACHE_DEBUG((DI_ERR, "map_size (0x%x) > cache_size (0x%x)."
3419 		    " Skipping write", all->map_size, cache->cache_size));
3420 		return;
3421 	}
3422 
3423 	/*
3424 	 * First unlink the temp file
3425 	 */
3426 	error = vn_remove(DI_CACHE_TEMP, UIO_SYSSPACE, RMFILE);
3427 	if (error && error != ENOENT) {
3428 		CACHE_DEBUG((DI_ERR, "%s: unlink failed: %d",
3429 		    DI_CACHE_TEMP, error));
3430 	}
3431 
3432 	if (error == EROFS) {
3433 		CACHE_DEBUG((DI_ERR, "RDONLY FS. Skipping write"));
3434 		return;
3435 	}
3436 
3437 	vp = NULL;
3438 	oflags = (FCREAT|FWRITE);
3439 	if (error = vn_open(DI_CACHE_TEMP, UIO_SYSSPACE, oflags,
3440 	    DI_CACHE_PERMS, &vp, CRCREAT, 0)) {
3441 		CACHE_DEBUG((DI_ERR, "%s: create failed: %d",
3442 		    DI_CACHE_TEMP, error));
3443 		return;
3444 	}
3445 
3446 	ASSERT(vp);
3447 
3448 	/*
3449 	 * Paranoid: Check if the file is on a read-only FS
3450 	 */
3451 	if (vn_is_readonly(vp)) {
3452 		CACHE_DEBUG((DI_ERR, "cannot write: readonly FS"));
3453 		goto fail;
3454 	}
3455 
3456 	/*
3457 	 * Note that we only write map_size bytes to disk - this saves
3458 	 * space as the actual cache size may be larger than size of
3459 	 * valid data in the cache.
3460 	 * Another advantage is that it makes verification of size
3461 	 * easier when the file is read later.
3462 	 */
3463 	map_size = all->map_size;
3464 	off = 0;
3465 	buf = cache->cache_data;
3466 
3467 	while (map_size) {
3468 		ASSERT(map_size > 0);
3469 		/*
3470 		 * Write in chunks so that VM system
3471 		 * is not overwhelmed
3472 		 */
3473 		if (map_size > di_chunk * PAGESIZE)
3474 			chunk = di_chunk * PAGESIZE;
3475 		else
3476 			chunk = map_size;
3477 
3478 		error = chunk_write(vp, off, buf, chunk);
3479 		if (error) {
3480 			CACHE_DEBUG((DI_ERR, "write failed: off=0x%x: %d",
3481 			    off, error));
3482 			goto fail;
3483 		}
3484 
3485 		off += chunk;
3486 		buf += chunk;
3487 		map_size -= chunk;
3488 
3489 		/* Give pageout a chance to run */
3490 		delay(1);
3491 	}
3492 
3493 	/*
3494 	 * Now sync the file and close it
3495 	 */
3496 	if (error = VOP_FSYNC(vp, FSYNC, kcred)) {
3497 		CACHE_DEBUG((DI_ERR, "FSYNC failed: %d", error));
3498 	}
3499 
3500 	if (error = VOP_CLOSE(vp, oflags, 1, (offset_t)0, kcred)) {
3501 		CACHE_DEBUG((DI_ERR, "close() failed: %d", error));
3502 		VN_RELE(vp);
3503 		return;
3504 	}
3505 
3506 	VN_RELE(vp);
3507 
3508 	/*
3509 	 * Now do the rename
3510 	 */
3511 	if (error = vn_rename(DI_CACHE_TEMP, DI_CACHE_FILE, UIO_SYSSPACE)) {
3512 		CACHE_DEBUG((DI_ERR, "rename failed: %d", error));
3513 		return;
3514 	}
3515 
3516 	CACHE_DEBUG((DI_INFO, "Cache write successful."));
3517 
3518 	return;
3519 
3520 fail:
3521 	(void) VOP_CLOSE(vp, oflags, 1, (offset_t)0, kcred);
3522 	VN_RELE(vp);
3523 }
3524 
3525 
3526 /*
3527  * Since we could be called early in boot,
3528  * use kobj_read_file()
3529  */
3530 static void
3531 di_cache_read(struct di_cache *cache)
3532 {
3533 	struct _buf	*file;
3534 	struct di_all	*all;
3535 	int		n;
3536 	size_t		map_size, sz, chunk;
3537 	offset_t	off;
3538 	caddr_t		buf;
3539 	uint32_t	saved_crc, crc;
3540 
3541 	ASSERT(modrootloaded);
3542 	ASSERT(DI_CACHE_LOCKED(*cache));
3543 	ASSERT(cache->cache_data == NULL);
3544 	ASSERT(cache->cache_size == 0);
3545 	ASSERT(!servicing_interrupt());
3546 
3547 	file = kobj_open_file(DI_CACHE_FILE);
3548 	if (file == (struct _buf *)-1) {
3549 		CACHE_DEBUG((DI_ERR, "%s: open failed: %d",
3550 		    DI_CACHE_FILE, ENOENT));
3551 		return;
3552 	}
3553 
3554 	/*
3555 	 * Read in the header+root_path first. The root_path must be "/"
3556 	 */
3557 	all = kmem_zalloc(sizeof (*all) + 1, KM_SLEEP);
3558 	n = kobj_read_file(file, (caddr_t)all, sizeof (*all) + 1, 0);
3559 
3560 	if ((n != sizeof (*all) + 1) || !header_plus_one_ok(all)) {
3561 		kmem_free(all, sizeof (*all) + 1);
3562 		kobj_close_file(file);
3563 		CACHE_DEBUG((DI_ERR, "cache header: read error or invalid"));
3564 		return;
3565 	}
3566 
3567 	map_size = all->map_size;
3568 
3569 	kmem_free(all, sizeof (*all) + 1);
3570 
3571 	ASSERT(map_size >= sizeof (*all) + 1);
3572 
3573 	buf = di_cache.cache_data = kmem_alloc(map_size, KM_SLEEP);
3574 	sz = map_size;
3575 	off = 0;
3576 	while (sz) {
3577 		/* Don't overload VM with large reads */
3578 		chunk = (sz > di_chunk * PAGESIZE) ? di_chunk * PAGESIZE : sz;
3579 		n = kobj_read_file(file, buf, chunk, off);
3580 		if (n != chunk) {
3581 			CACHE_DEBUG((DI_ERR, "%s: read error at offset: %lld",
3582 			    DI_CACHE_FILE, off));
3583 			goto fail;
3584 		}
3585 		off += chunk;
3586 		buf += chunk;
3587 		sz -= chunk;
3588 	}
3589 
3590 	ASSERT(off == map_size);
3591 
3592 	/*
3593 	 * Read past expected EOF to verify size.
3594 	 */
3595 	if (kobj_read_file(file, (caddr_t)&sz, 1, off) > 0) {
3596 		CACHE_DEBUG((DI_ERR, "%s: file size changed", DI_CACHE_FILE));
3597 		goto fail;
3598 	}
3599 
3600 	all = (struct di_all *)di_cache.cache_data;
3601 	if (!header_plus_one_ok(all)) {
3602 		CACHE_DEBUG((DI_ERR, "%s: file header changed", DI_CACHE_FILE));
3603 		goto fail;
3604 	}
3605 
3606 	/*
3607 	 * Compute CRC with checksum field in the cache data set to 0
3608 	 */
3609 	saved_crc = all->cache_checksum;
3610 	all->cache_checksum = 0;
3611 	CRC32(crc, di_cache.cache_data, map_size, -1U, crc32_table);
3612 	all->cache_checksum = saved_crc;
3613 
3614 	if (crc != all->cache_checksum) {
3615 		CACHE_DEBUG((DI_ERR,
3616 		    "%s: checksum error: expected=0x%x actual=0x%x",
3617 		    DI_CACHE_FILE, all->cache_checksum, crc));
3618 		goto fail;
3619 	}
3620 
3621 	if (all->map_size != map_size) {
3622 		CACHE_DEBUG((DI_ERR, "%s: map size changed", DI_CACHE_FILE));
3623 		goto fail;
3624 	}
3625 
3626 	kobj_close_file(file);
3627 
3628 	di_cache.cache_size = map_size;
3629 
3630 	return;
3631 
3632 fail:
3633 	kmem_free(di_cache.cache_data, map_size);
3634 	kobj_close_file(file);
3635 	di_cache.cache_data = NULL;
3636 	di_cache.cache_size = 0;
3637 }
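
/*
 * The checksum handling above is the usual self-describing-checksum
 * technique: the CRC stored in the file was computed with the
 * cache_checksum field itself zeroed, so verification must zero the
 * field, recompute, and restore.  The writer side (see
 * di_cache_update() below) is simply:
 *
 *	all->cache_checksum = 0;
 *	CRC32(crc, di_cache.cache_data, all->map_size, -1U,
 *	    crc32_table);
 *	all->cache_checksum = crc;
 */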
3638 
3639 
3640 /*
3641  * Checks if arguments are valid for using the cache.
3642  */
3643 static int
3644 cache_args_valid(struct di_state *st, int *error)
3645 {
3646 	ASSERT(error);
3647 	ASSERT(st->mem_size > 0);
3648 	ASSERT(st->memlist != NULL);
3649 
3650 	if (!modrootloaded || !i_ddi_io_initialized()) {
3651 		CACHE_DEBUG((DI_ERR,
3652 		    "cache lookup failure: I/O subsystem not inited"));
3653 		*error = ENOTACTIVE;
3654 		return (0);
3655 	}
3656 
3657 	/*
3658 	 * No other flags allowed with DINFOCACHE
3659 	 */
3660 	if (st->command != (DINFOCACHE & DIIOC_MASK)) {
3661 		CACHE_DEBUG((DI_ERR,
3662 		    "cache lookup failure: bad flags: 0x%x",
3663 		    st->command));
3664 		*error = EINVAL;
3665 		return (0);
3666 	}
3667 
3668 	if (strcmp(DI_ALL_PTR(st)->root_path, "/") != 0) {
3669 		CACHE_DEBUG((DI_ERR,
3670 		    "cache lookup failure: bad root: %s",
3671 		    DI_ALL_PTR(st)->root_path));
3672 		*error = EINVAL;
3673 		return (0);
3674 	}
3675 
3676 	CACHE_DEBUG((DI_INFO, "cache lookup args ok: 0x%x", st->command));
3677 
3678 	*error = 0;
3679 
3680 	return (1);
3681 }
3682 
3683 static int
3684 snapshot_is_cacheable(struct di_state *st)
3685 {
3686 	ASSERT(st->mem_size > 0);
3687 	ASSERT(st->memlist != NULL);
3688 
3689 	if ((st->command & DI_CACHE_SNAPSHOT_FLAGS) !=
3690 	    (DI_CACHE_SNAPSHOT_FLAGS & DIIOC_MASK)) {
3691 		CACHE_DEBUG((DI_INFO,
3692 		    "not cacheable: incompatible flags: 0x%x",
3693 		    st->command));
3694 		return (0);
3695 	}
3696 
3697 	if (strcmp(DI_ALL_PTR(st)->root_path, "/") != 0) {
3698 		CACHE_DEBUG((DI_INFO,
3699 		    "not cacheable: incompatible root path: %s",
3700 		    DI_ALL_PTR(st)->root_path));
3701 		return (0);
3702 	}
3703 
3704 	CACHE_DEBUG((DI_INFO, "cacheable snapshot request: 0x%x", st->command));
3705 
3706 	return (1);
3707 }
3708 
3709 static int
3710 di_cache_lookup(struct di_state *st)
3711 {
3712 	size_t	rval;
3713 	int	cache_valid;
3714 
3715 	ASSERT(cache_args_valid(st, &cache_valid));
3716 	ASSERT(modrootloaded);
3717 
3718 	DI_CACHE_LOCK(di_cache);
3719 
3720 	/*
3721 	 * The following assignment determines the validity
3722 	 * of the cache as far as this snapshot is concerned.
3723 	 */
3724 	cache_valid = di_cache.cache_valid;
3725 
3726 	if (cache_valid && di_cache.cache_data == NULL) {
3727 		di_cache_read(&di_cache);
3728 		/* check for read or file error */
3729 		if (di_cache.cache_data == NULL)
3730 			cache_valid = 0;
3731 	}
3732 
3733 	if (cache_valid) {
3734 		/*
3735 		 * Ok, the cache was valid as of this particular
3736 		 * snapshot. Copy the cached snapshot. This is safe
3737 		 * to do as the cache cannot be freed (we hold the
3738 		 * cache lock). Free the memory allocated in di_state
3739 		 * up until this point - we will simply copy everything
3740 		 * in the cache.
3741 		 */
3742 
3743 		ASSERT(di_cache.cache_data != NULL);
3744 		ASSERT(di_cache.cache_size > 0);
3745 
3746 		di_freemem(st);
3747 
3748 		rval = 0;
3749 		if (di_cache2mem(&di_cache, st) > 0) {
3750 
3751 			ASSERT(DI_ALL_PTR(st));
3752 
3753 			/*
3754 			 * map_size is size of valid data in the
3755 			 * cached snapshot and may be less than
3756 			 * size of the cache.
3757 			 */
3758 			rval = DI_ALL_PTR(st)->map_size;
3759 
3760 			ASSERT(rval >= sizeof (struct di_all));
3761 			ASSERT(rval <= di_cache.cache_size);
3762 		}
3763 	} else {
3764 		/*
3765 		 * The cache isn't valid, we need to take a snapshot.
3766 		 * Set the command flags appropriately
3767 		 */
3768 		ASSERT(st->command == (DINFOCACHE & DIIOC_MASK));
3769 		st->command = (DI_CACHE_SNAPSHOT_FLAGS & DIIOC_MASK);
3770 		rval = di_cache_update(st);
3771 		st->command = (DINFOCACHE & DIIOC_MASK);
3772 	}
3773 
3774 	DI_CACHE_UNLOCK(di_cache);
3775 
3776 	/*
3777 	 * For cached snapshots, the devinfo driver always returns
3778 	 * a snapshot rooted at "/".
3779 	 */
3780 	ASSERT(rval == 0 || strcmp(DI_ALL_PTR(st)->root_path, "/") == 0);
3781 
3782 	return (rval);
3783 }
3784 
3785 /*
3786  * This is a forced update of the cache  - the previous state of the cache
3787  * may be:
3788  *	- unpopulated
3789  *	- populated and invalid
3790  *	- populated and valid
3791  */
3792 static int
3793 di_cache_update(struct di_state *st)
3794 {
3795 	int rval;
3796 	uint32_t crc;
3797 	struct di_all *all;
3798 
3799 	ASSERT(DI_CACHE_LOCKED(di_cache));
3800 	ASSERT(snapshot_is_cacheable(st));
3801 
3802 	/*
3803 	 * Free the in-core cache and the on-disk file (if they exist)
3804 	 */
3805 	i_ddi_di_cache_free(&di_cache);
3806 
3807 	/*
3808 	 * Set valid flag before taking the snapshot,
3809 	 * so that any invalidations that arrive
3810 	 * during or after the snapshot are not
3811 	 * removed by us.
3812 	 */
3813 	atomic_or_32(&di_cache.cache_valid, 1);
3814 
3815 	rval = di_snapshot_and_clean(st);
3816 
3817 	if (rval == 0) {
3818 		CACHE_DEBUG((DI_ERR, "can't update cache: bad snapshot"));
3819 		return (0);
3820 	}
3821 
3822 	DI_ALL_PTR(st)->map_size = rval;
3823 
3824 	if (di_mem2cache(st, &di_cache) == 0) {
3825 		CACHE_DEBUG((DI_ERR, "can't update cache: copy failed"));
3826 		return (0);
3827 	}
3828 
3829 	ASSERT(di_cache.cache_data);
3830 	ASSERT(di_cache.cache_size > 0);
3831 
3832 	/*
3833 	 * Now that we have cached the snapshot, compute its checksum.
3834 	 * The checksum is only computed over the valid data in the
3835 	 * cache, not the entire cache.
3836 	 * Also, set all the fields (except checksum) before computing
3837 	 * checksum.
3838 	 */
3839 	all = (struct di_all *)di_cache.cache_data;
3840 	all->cache_magic = DI_CACHE_MAGIC;
3841 	all->map_size = rval;
3842 
3843 	ASSERT(all->cache_checksum == 0);
3844 	CRC32(crc, di_cache.cache_data, all->map_size, -1U, crc32_table);
3845 	all->cache_checksum = crc;
3846 
3847 	di_cache_write(&di_cache);
3848 
3849 	return (rval);
3850 }
3851 
3852 static void
3853 di_cache_print(di_cache_debug_t msglevel, char *fmt, ...)
3854 {
3855 	va_list	ap;
3856 
3857 	if (di_cache_debug <= DI_QUIET)
3858 		return;
3859 
3860 	if (di_cache_debug < msglevel)
3861 		return;
3862 
3863 	switch (msglevel) {
3864 		case DI_ERR:
3865 			msglevel = CE_WARN;
3866 			break;
3867 		case DI_INFO:
3868 		case DI_TRACE:
3869 		default:
3870 			msglevel = CE_NOTE;
3871 			break;
3872 	}
3873 
3874 	va_start(ap, fmt);
3875 	vcmn_err(msglevel, fmt, ap);
3876 	va_end(ap);
3877 }
3878