xref: /illumos-gate/usr/src/uts/common/io/overlay/overlay_target.c (revision 9b9d39d2a32ff806d2431dbcc50968ef1e6d46b2)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2016 Joyent, Inc.
14  * Copyright 2022 MNX Cloud, Inc.
15  */
16 
17 /*
18  * Overlay device target cache management
19  *
20  * For more information, see the big theory statement in
21  * uts/common/io/overlay/overlay.c
22  */
23 
24 #include <sys/types.h>
25 #include <sys/ethernet.h>
26 #include <sys/kmem.h>
27 #include <sys/policy.h>
28 #include <sys/sysmacros.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/strsubr.h>
32 #include <sys/mac_provider.h>
33 #include <sys/mac_client.h>
34 #include <sys/mac_client_priv.h>
35 #include <sys/vlan.h>
36 #include <sys/crc32.h>
37 #include <sys/cred.h>
38 #include <sys/file.h>
39 #include <sys/errno.h>
40 #include <sys/ddi.h>
41 #include <sys/sunddi.h>
42 
43 #include <sys/overlay_impl.h>
44 #include <sys/sdt.h>
45 
/*
 * Size handed to refhash_create() for the MAC lookup table of each dynamic
 * target.
 *
 * This is a total straw man, but at least it's a prime number. We still need
 * to go through and do a lot of evaluation and understanding of how these
 * target caches should grow and shrink, as well as how they should react to
 * memory pressure and evictions. This just gives us a starting point that'll
 * be 'good enough', until it's not.
 */
#define	OVERLAY_HSIZE	823
54 
55 /*
56  * We use this data structure to keep track of what requests have been actively
57  * allocated to a given instance so we know what to put back on the pending
58  * list.
59  */
60 typedef struct overlay_target_hdl {
61 	minor_t oth_minor;		/* RO */
62 	zoneid_t oth_zoneid;		/* RO */
63 	int oth_oflags;			/* RO */
64 	list_node_t oth_link;		/* overlay_target_lock */
65 	kmutex_t oth_lock;
66 	list_t	oth_outstanding;	/* oth_lock */
67 } overlay_target_hdl_t;
68 
69 typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
70 typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
71 typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
72 
73 typedef struct overlay_target_ioctl {
74 	int		oti_cmd;	/* ioctl id */
75 	boolean_t	oti_write;	/* ioctl requires FWRITE */
76 	boolean_t	oti_ncopyout;	/* copyout data? */
77 	overlay_target_copyin_f oti_copyin;	/* copyin func */
78 	overlay_target_ioctl_f oti_func; /* function to call */
79 	overlay_target_copyout_f oti_copyout;	/* copyin func */
80 	size_t		oti_size;	/* size of user level structure */
81 } overlay_target_ioctl_t;
82 
83 static kmem_cache_t *overlay_target_cache;
84 static kmem_cache_t *overlay_entry_cache;
85 static id_space_t *overlay_thdl_idspace;
86 static void *overlay_thdl_state;
87 
88 /*
89  * When we support overlay devices in the NGZ, then all of these need to become
90  * zone aware, by plugging into the netstack engine and becoming per-netstack
91  * data.
92  */
93 static list_t overlay_thdl_list;
94 static kmutex_t overlay_target_lock;
95 static kcondvar_t overlay_target_condvar;
96 static list_t overlay_target_list;
97 static boolean_t overlay_target_excl;
98 
99 /*
100  * Outstanding data per hash table entry.
101  */
102 static int overlay_ent_size = 128 * 1024;
103 
104 /* ARGSUSED */
105 static int
106 overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
107 {
108 	overlay_target_t *ott = buf;
109 
110 	mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL);
111 	cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL);
112 	return (0);
113 }
114 
115 /* ARGSUSED */
116 static void
117 overlay_target_cache_destructor(void *buf, void *arg)
118 {
119 	overlay_target_t *ott = buf;
120 
121 	cv_destroy(&ott->ott_cond);
122 	mutex_destroy(&ott->ott_lock);
123 }
124 
125 /* ARGSUSED */
126 static int
127 overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs)
128 {
129 	overlay_target_entry_t *ote = buf;
130 
131 	bzero(ote, sizeof (overlay_target_entry_t));
132 	mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL);
133 	return (0);
134 }
135 
136 /* ARGSUSED */
137 static void
138 overlay_entry_cache_destructor(void *buf, void *arg)
139 {
140 	overlay_target_entry_t *ote = buf;
141 
142 	mutex_destroy(&ote->ote_lock);
143 }
144 
145 static uint64_t
146 overlay_mac_hash(const void *v)
147 {
148 	uint32_t crc;
149 	CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
150 	return (crc);
151 }
152 
153 static int
154 overlay_mac_cmp(const void *a, const void *b)
155 {
156 	return (bcmp(a, b, ETHERADDRL));
157 }
158 
159 /* ARGSUSED */
160 static void
161 overlay_target_entry_dtor(void *arg)
162 {
163 	overlay_target_entry_t *ote = arg;
164 
165 	ote->ote_flags = 0;
166 	bzero(ote->ote_addr, ETHERADDRL);
167 	ote->ote_ott = NULL;
168 	ote->ote_odd = NULL;
169 	freemsgchain(ote->ote_chead);
170 	ote->ote_chead = ote->ote_ctail = NULL;
171 	ote->ote_mbsize = 0;
172 	ote->ote_vtime = 0;
173 	kmem_cache_free(overlay_entry_cache, ote);
174 }
175 
176 static int
177 overlay_mac_avl(const void *a, const void *b)
178 {
179 	int i;
180 	const overlay_target_entry_t *l, *r;
181 	l = a;
182 	r = b;
183 
184 	for (i = 0; i < ETHERADDRL; i++) {
185 		if (l->ote_addr[i] > r->ote_addr[i])
186 			return (1);
187 		else if (l->ote_addr[i] < r->ote_addr[i])
188 			return (-1);
189 	}
190 
191 	return (0);
192 }
193 
194 void
195 overlay_target_init(void)
196 {
197 	int ret;
198 	ret = ddi_soft_state_init(&overlay_thdl_state,
199 	    sizeof (overlay_target_hdl_t), 1);
200 	VERIFY(ret == 0);
201 	overlay_target_cache = kmem_cache_create("overlay_target",
202 	    sizeof (overlay_target_t), 0, overlay_target_cache_constructor,
203 	    overlay_target_cache_destructor, NULL, NULL, NULL, 0);
204 	overlay_entry_cache = kmem_cache_create("overlay_entry",
205 	    sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor,
206 	    overlay_entry_cache_destructor, NULL, NULL, NULL, 0);
207 	mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL);
208 	cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL);
209 	list_create(&overlay_target_list, sizeof (overlay_target_entry_t),
210 	    offsetof(overlay_target_entry_t, ote_qlink));
211 	list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t),
212 	    offsetof(overlay_target_hdl_t, oth_link));
213 	overlay_thdl_idspace = id_space_create("overlay_target_minors",
214 	    1, INT32_MAX);
215 }
216 
217 void
218 overlay_target_fini(void)
219 {
220 	id_space_destroy(overlay_thdl_idspace);
221 	list_destroy(&overlay_thdl_list);
222 	list_destroy(&overlay_target_list);
223 	cv_destroy(&overlay_target_condvar);
224 	mutex_destroy(&overlay_target_lock);
225 	kmem_cache_destroy(overlay_entry_cache);
226 	kmem_cache_destroy(overlay_target_cache);
227 	ddi_soft_state_fini(&overlay_thdl_state);
228 }
229 
230 void
231 overlay_target_free(overlay_dev_t *odd)
232 {
233 	if (odd->odd_target == NULL)
234 		return;
235 
236 	if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
237 		refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
238 		avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
239 		overlay_target_entry_t *ote;
240 
241 		/*
242 		 * Our AVL tree and hashtable contain the same elements,
243 		 * therefore we should just remove it from the tree, but then
244 		 * delete the entries when we remove them from the hash table
245 		 * (which happens through the refhash dtor).
246 		 */
247 		while ((ote = avl_first(ap)) != NULL)
248 			avl_remove(ap, ote);
249 
250 		avl_destroy(ap);
251 		for (ote = refhash_first(rp); ote != NULL;
252 		    ote = refhash_next(rp, ote)) {
253 			refhash_remove(rp, ote);
254 		}
255 		refhash_destroy(rp);
256 	}
257 
258 	ASSERT(odd->odd_target->ott_ocount == 0);
259 	kmem_cache_free(overlay_target_cache, odd->odd_target);
260 }
261 
262 int
263 overlay_target_busy()
264 {
265 	int ret;
266 
267 	mutex_enter(&overlay_target_lock);
268 	ret = !list_is_empty(&overlay_thdl_list);
269 	mutex_exit(&overlay_target_lock);
270 
271 	return (ret);
272 }
273 
274 static void
275 overlay_target_queue(overlay_target_entry_t *entry)
276 {
277 	mutex_enter(&overlay_target_lock);
278 	mutex_enter(&entry->ote_ott->ott_lock);
279 	if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
280 		mutex_exit(&entry->ote_ott->ott_lock);
281 		mutex_exit(&overlay_target_lock);
282 		return;
283 	}
284 	entry->ote_ott->ott_ocount++;
285 	mutex_exit(&entry->ote_ott->ott_lock);
286 	list_insert_tail(&overlay_target_list, entry);
287 	cv_signal(&overlay_target_condvar);
288 	mutex_exit(&overlay_target_lock);
289 }
290 
291 void
292 overlay_target_quiesce(overlay_target_t *ott)
293 {
294 	if (ott == NULL)
295 		return;
296 	mutex_enter(&ott->ott_lock);
297 	ott->ott_flags |= OVERLAY_T_TEARDOWN;
298 	while (ott->ott_ocount != 0)
299 		cv_wait(&ott->ott_cond, &ott->ott_lock);
300 	mutex_exit(&ott->ott_lock);
301 }
302 
303 /*
304  * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
305  * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
306  * this time, say for NVGRE, we drop all packets that mcuh this.
307  */
308 int
309 overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
310     socklen_t *slenp)
311 {
312 	int ret;
313 	struct sockaddr_in6 *v6;
314 	overlay_target_t *ott;
315 	mac_header_info_t mhi;
316 	overlay_target_entry_t *entry;
317 
318 	ASSERT(odd->odd_target != NULL);
319 
320 	/*
321 	 * At this point, the overlay device is in a mux which means that it's
322 	 * been activated. At this point, parts of the target, such as the mode
323 	 * and the destination are now read-only and we don't have to worry
324 	 * about synchronization for them.
325 	 */
326 	ott = odd->odd_target;
327 	if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
328 		return (OVERLAY_TARGET_DROP);
329 
330 	v6 = (struct sockaddr_in6 *)sock;
331 	bzero(v6, sizeof (struct sockaddr_in6));
332 	v6->sin6_family = AF_INET6;
333 
334 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
335 		mutex_enter(&ott->ott_lock);
336 		bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
337 		    sizeof (struct in6_addr));
338 		v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
339 		mutex_exit(&ott->ott_lock);
340 		*slenp = sizeof (struct sockaddr_in6);
341 
342 		return (OVERLAY_TARGET_OK);
343 	}
344 
345 	ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
346 
347 	/*
348 	 * Note we only want the MAC address here, therefore we won't bother
349 	 * using mac_vlan_header_info(). If any caller needs the vlan info at
350 	 * this point, this should change to a call to mac_vlan_header_info().
351 	 */
352 	if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
353 		return (OVERLAY_TARGET_DROP);
354 	mutex_enter(&ott->ott_lock);
355 	entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
356 	    mhi.mhi_daddr);
357 	if (entry == NULL) {
358 		entry = kmem_cache_alloc(overlay_entry_cache, KM_NOSLEEP_LAZY);
359 		if (entry == NULL) {
360 			mutex_exit(&ott->ott_lock);
361 			return (OVERLAY_TARGET_DROP);
362 		}
363 		bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
364 		entry->ote_chead = entry->ote_ctail = mp;
365 		entry->ote_mbsize = msgsize(mp);
366 		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
367 		entry->ote_ott = ott;
368 		entry->ote_odd = odd;
369 		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
370 		avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
371 		mutex_exit(&ott->ott_lock);
372 		overlay_target_queue(entry);
373 		return (OVERLAY_TARGET_ASYNC);
374 	}
375 	refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
376 	mutex_exit(&ott->ott_lock);
377 
378 	mutex_enter(&entry->ote_lock);
379 	if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
380 		ret = OVERLAY_TARGET_DROP;
381 	} else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
382 		bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
383 		    sizeof (struct in6_addr));
384 		v6->sin6_port = htons(entry->ote_dest.otp_port);
385 		*slenp = sizeof (struct sockaddr_in6);
386 		ret = OVERLAY_TARGET_OK;
387 	} else {
388 		size_t mlen = msgsize(mp);
389 
390 		if (mlen + entry->ote_mbsize > overlay_ent_size) {
391 			ret = OVERLAY_TARGET_DROP;
392 		} else {
393 			if (entry->ote_ctail != NULL) {
394 				ASSERT(entry->ote_ctail->b_next ==
395 				    NULL);
396 				entry->ote_ctail->b_next = mp;
397 				entry->ote_ctail = mp;
398 			} else {
399 				entry->ote_chead = mp;
400 				entry->ote_ctail = mp;
401 			}
402 			entry->ote_mbsize += mlen;
403 			if ((entry->ote_flags &
404 			    OVERLAY_ENTRY_F_PENDING) == 0) {
405 				entry->ote_flags |=
406 				    OVERLAY_ENTRY_F_PENDING;
407 				overlay_target_queue(entry);
408 			}
409 			ret = OVERLAY_TARGET_ASYNC;
410 		}
411 	}
412 	mutex_exit(&entry->ote_lock);
413 
414 	mutex_enter(&ott->ott_lock);
415 	refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
416 	mutex_exit(&ott->ott_lock);
417 
418 	return (ret);
419 }
420 
421 /* ARGSUSED */
422 static int
423 overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
424 {
425 	overlay_dev_t *odd;
426 	overlay_targ_info_t *oti = arg;
427 
428 	odd = overlay_hold_by_dlid(oti->oti_linkid);
429 	if (odd == NULL)
430 		return (ENOENT);
431 
432 	mutex_enter(&odd->odd_lock);
433 	oti->oti_flags = 0;
434 	oti->oti_needs = odd->odd_plugin->ovp_dest;
435 	if (odd->odd_flags & OVERLAY_F_DEGRADED)
436 		oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED;
437 	if (odd->odd_flags & OVERLAY_F_ACTIVATED)
438 		oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
439 	oti->oti_vnetid = odd->odd_vid;
440 	mutex_exit(&odd->odd_lock);
441 	overlay_hold_rele(odd);
442 	return (0);
443 }
444 
445 /* ARGSUSED */
446 static int
447 overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
448 {
449 	overlay_dev_t *odd;
450 	overlay_target_t *ott;
451 	overlay_targ_associate_t *ota = arg;
452 
453 	odd = overlay_hold_by_dlid(ota->ota_linkid);
454 	if (odd == NULL)
455 		return (ENOENT);
456 
457 	if (ota->ota_id == 0) {
458 		overlay_hold_rele(odd);
459 		return (EINVAL);
460 	}
461 
462 	if (ota->ota_mode != OVERLAY_TARGET_POINT &&
463 	    ota->ota_mode != OVERLAY_TARGET_DYNAMIC) {
464 		overlay_hold_rele(odd);
465 		return (EINVAL);
466 	}
467 
468 	if (ota->ota_provides != odd->odd_plugin->ovp_dest) {
469 		overlay_hold_rele(odd);
470 		return (EINVAL);
471 	}
472 
473 	if (ota->ota_mode == OVERLAY_TARGET_POINT) {
474 		if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) {
475 			if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) ||
476 			    IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) ||
477 			    IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) {
478 				overlay_hold_rele(odd);
479 				return (EINVAL);
480 			}
481 		}
482 
483 		if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) {
484 			if (ota->ota_point.otp_port == 0) {
485 				overlay_hold_rele(odd);
486 				return (EINVAL);
487 			}
488 		}
489 	}
490 
491 	ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
492 	ott->ott_flags = 0;
493 	ott->ott_ocount = 0;
494 	ott->ott_mode = ota->ota_mode;
495 	ott->ott_dest = ota->ota_provides;
496 	ott->ott_id = ota->ota_id;
497 
498 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
499 		bcopy(&ota->ota_point, &ott->ott_u.ott_point,
500 		    sizeof (overlay_target_point_t));
501 	} else {
502 		ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
503 		    overlay_mac_hash, overlay_mac_cmp,
504 		    overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
505 		    offsetof(overlay_target_entry_t, ote_reflink),
506 		    offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
507 		avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
508 		    sizeof (overlay_target_entry_t),
509 		    offsetof(overlay_target_entry_t, ote_avllink));
510 	}
511 	mutex_enter(&odd->odd_lock);
512 	if (odd->odd_flags & OVERLAY_F_VARPD) {
513 		mutex_exit(&odd->odd_lock);
514 		kmem_cache_free(overlay_target_cache, ott);
515 		overlay_hold_rele(odd);
516 		return (EEXIST);
517 	}
518 
519 	odd->odd_flags |= OVERLAY_F_VARPD;
520 	odd->odd_target = ott;
521 	mutex_exit(&odd->odd_lock);
522 
523 	overlay_hold_rele(odd);
524 
525 
526 	return (0);
527 }
528 
529 
530 /* ARGSUSED */
531 static int
532 overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
533 {
534 	overlay_dev_t *odd;
535 	overlay_targ_degrade_t *otd = arg;
536 
537 	odd = overlay_hold_by_dlid(otd->otd_linkid);
538 	if (odd == NULL)
539 		return (ENOENT);
540 
541 	overlay_fm_degrade(odd, otd->otd_buf);
542 	overlay_hold_rele(odd);
543 	return (0);
544 }
545 
546 /* ARGSUSED */
547 static int
548 overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
549 {
550 	overlay_dev_t *odd;
551 	overlay_targ_id_t *otid = arg;
552 
553 	odd = overlay_hold_by_dlid(otid->otid_linkid);
554 	if (odd == NULL)
555 		return (ENOENT);
556 
557 	overlay_fm_restore(odd);
558 	overlay_hold_rele(odd);
559 	return (0);
560 }
561 
562 /* ARGSUSED */
563 static int
564 overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
565 {
566 	overlay_dev_t *odd;
567 	overlay_targ_id_t *otid = arg;
568 
569 	odd = overlay_hold_by_dlid(otid->otid_linkid);
570 	if (odd == NULL)
571 		return (ENOENT);
572 
573 	mutex_enter(&odd->odd_lock);
574 	odd->odd_flags &= ~OVERLAY_F_VARPD;
575 	mutex_exit(&odd->odd_lock);
576 
577 	overlay_hold_rele(odd);
578 	return (0);
579 
580 }
581 
582 static int
583 overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
584 {
585 	overlay_targ_lookup_t *otl = arg;
586 	overlay_target_entry_t *entry;
587 	clock_t ret, timeout;
588 	mac_header_info_t mhi;
589 
590 	timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
591 again:
592 	mutex_enter(&overlay_target_lock);
593 	while (list_is_empty(&overlay_target_list)) {
594 		ret = cv_timedwait(&overlay_target_condvar,
595 		    &overlay_target_lock, timeout);
596 		if (ret == -1) {
597 			mutex_exit(&overlay_target_lock);
598 			return (ETIME);
599 		}
600 	}
601 	entry = list_remove_head(&overlay_target_list);
602 	mutex_exit(&overlay_target_lock);
603 	mutex_enter(&entry->ote_lock);
604 	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
605 		ASSERT(entry->ote_chead == NULL);
606 		mutex_exit(&entry->ote_lock);
607 		goto again;
608 	}
609 	ASSERT(entry->ote_chead != NULL);
610 
611 	/*
612 	 * If we have a bogon that doesn't have a valid mac header, drop it and
613 	 * try again.
614 	 */
615 	if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
616 	    &mhi) != 0) {
617 		boolean_t queue = B_FALSE;
618 		mblk_t *mp = entry->ote_chead;
619 		entry->ote_chead = mp->b_next;
620 		mp->b_next = NULL;
621 		if (entry->ote_ctail == mp)
622 			entry->ote_ctail = entry->ote_chead;
623 		entry->ote_mbsize -= msgsize(mp);
624 		if (entry->ote_chead != NULL)
625 			queue = B_TRUE;
626 		mutex_exit(&entry->ote_lock);
627 		if (queue == B_TRUE)
628 			overlay_target_queue(entry);
629 		freemsg(mp);
630 		goto again;
631 	}
632 
633 	otl->otl_dlid = entry->ote_odd->odd_linkid;
634 	otl->otl_reqid = (uintptr_t)entry;
635 	otl->otl_varpdid = entry->ote_ott->ott_id;
636 	otl->otl_vnetid = entry->ote_odd->odd_vid;
637 
638 	otl->otl_hdrsize = mhi.mhi_hdrsize;
639 	otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
640 	bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
641 	bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
642 	otl->otl_dsttype = mhi.mhi_dsttype;
643 	otl->otl_sap = mhi.mhi_bindsap;
644 	otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
645 	mutex_exit(&entry->ote_lock);
646 
647 	mutex_enter(&thdl->oth_lock);
648 	list_insert_tail(&thdl->oth_outstanding, entry);
649 	mutex_exit(&thdl->oth_lock);
650 
651 	return (0);
652 }
653 
654 static int
655 overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
656 {
657 	const overlay_targ_resp_t *otr = arg;
658 	overlay_target_entry_t *entry;
659 	mblk_t *mp;
660 
661 	mutex_enter(&thdl->oth_lock);
662 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
663 	    entry = list_next(&thdl->oth_outstanding, entry)) {
664 		if ((uintptr_t)entry == otr->otr_reqid)
665 			break;
666 	}
667 
668 	if (entry == NULL) {
669 		mutex_exit(&thdl->oth_lock);
670 		return (EINVAL);
671 	}
672 	list_remove(&thdl->oth_outstanding, entry);
673 	mutex_exit(&thdl->oth_lock);
674 
675 	mutex_enter(&entry->ote_lock);
676 	bcopy(&otr->otr_answer, &entry->ote_dest,
677 	    sizeof (overlay_target_point_t));
678 	entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
679 	entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
680 	mp = entry->ote_chead;
681 	entry->ote_chead = NULL;
682 	entry->ote_ctail = NULL;
683 	entry->ote_mbsize = 0;
684 	entry->ote_vtime = gethrtime();
685 	mutex_exit(&entry->ote_lock);
686 
687 	/*
688 	 * For now do an in-situ drain.
689 	 */
690 	mp = overlay_m_tx(entry->ote_odd, mp);
691 	freemsgchain(mp);
692 
693 	mutex_enter(&entry->ote_ott->ott_lock);
694 	entry->ote_ott->ott_ocount--;
695 	cv_signal(&entry->ote_ott->ott_cond);
696 	mutex_exit(&entry->ote_ott->ott_lock);
697 
698 	return (0);
699 }
700 
701 static int
702 overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
703 {
704 	const overlay_targ_resp_t *otr = arg;
705 	overlay_target_entry_t *entry;
706 	mblk_t *mp;
707 	boolean_t queue = B_FALSE;
708 
709 	mutex_enter(&thdl->oth_lock);
710 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
711 	    entry = list_next(&thdl->oth_outstanding, entry)) {
712 		if ((uintptr_t)entry == otr->otr_reqid)
713 			break;
714 	}
715 
716 	if (entry == NULL) {
717 		mutex_exit(&thdl->oth_lock);
718 		return (EINVAL);
719 	}
720 	list_remove(&thdl->oth_outstanding, entry);
721 	mutex_exit(&thdl->oth_lock);
722 
723 	mutex_enter(&entry->ote_lock);
724 
725 	/* Safeguard against a confused varpd */
726 	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
727 		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
728 		DTRACE_PROBE1(overlay__target__valid__drop,
729 		    overlay_target_entry_t *, entry);
730 		mutex_exit(&entry->ote_lock);
731 		goto done;
732 	}
733 
734 	mp = entry->ote_chead;
735 	if (mp != NULL) {
736 		entry->ote_chead = mp->b_next;
737 		mp->b_next = NULL;
738 		if (entry->ote_ctail == mp)
739 			entry->ote_ctail = entry->ote_chead;
740 		entry->ote_mbsize -= msgsize(mp);
741 	}
742 	if (entry->ote_chead != NULL) {
743 		queue = B_TRUE;
744 		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
745 	} else {
746 		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
747 	}
748 	mutex_exit(&entry->ote_lock);
749 
750 	if (queue == B_TRUE)
751 		overlay_target_queue(entry);
752 	freemsg(mp);
753 
754 done:
755 	mutex_enter(&entry->ote_ott->ott_lock);
756 	entry->ote_ott->ott_ocount--;
757 	cv_signal(&entry->ote_ott->ott_cond);
758 	mutex_exit(&entry->ote_ott->ott_lock);
759 
760 	return (0);
761 }
762 
763 /* ARGSUSED */
764 static int
765 overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize,
766     int flags)
767 {
768 	overlay_targ_pkt_t *pkt;
769 	overlay_targ_pkt32_t *pkt32;
770 
771 	pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP);
772 	*outp = pkt;
773 	*bsize = sizeof (overlay_targ_pkt_t);
774 	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
775 		uintptr_t addr;
776 
777 		if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t),
778 		    flags & FKIOCTL) != 0) {
779 			kmem_free(pkt, *bsize);
780 			return (EFAULT);
781 		}
782 		pkt32 = (overlay_targ_pkt32_t *)pkt;
783 		addr = pkt32->otp_buf;
784 		pkt->otp_buf = (void *)addr;
785 	} else {
786 		if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) {
787 			kmem_free(pkt, *bsize);
788 			return (EFAULT);
789 		}
790 	}
791 	return (0);
792 }
793 
794 static int
795 overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize,
796     int flags)
797 {
798 	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
799 		overlay_targ_pkt_t *pkt = buf;
800 		overlay_targ_pkt32_t *pkt32 = buf;
801 		uintptr_t addr = (uintptr_t)pkt->otp_buf;
802 		pkt32->otp_buf = (caddr32_t)addr;
803 		if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t),
804 		    flags & FKIOCTL) != 0)
805 			return (EFAULT);
806 	} else {
807 		if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0)
808 			return (EFAULT);
809 	}
810 	return (0);
811 }
812 
813 static int
814 overlay_target_packet(overlay_target_hdl_t *thdl, void *arg)
815 {
816 	overlay_targ_pkt_t *pkt = arg;
817 	overlay_target_entry_t *entry;
818 	mblk_t *mp;
819 	size_t mlen;
820 	size_t boff;
821 
822 	mutex_enter(&thdl->oth_lock);
823 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
824 	    entry = list_next(&thdl->oth_outstanding, entry)) {
825 		if ((uintptr_t)entry == pkt->otp_reqid)
826 			break;
827 	}
828 
829 	if (entry == NULL) {
830 		mutex_exit(&thdl->oth_lock);
831 		return (EINVAL);
832 	}
833 	mutex_enter(&entry->ote_lock);
834 	mutex_exit(&thdl->oth_lock);
835 	mp = entry->ote_chead;
836 	/* Protect against a rogue varpd */
837 	if (mp == NULL) {
838 		mutex_exit(&entry->ote_lock);
839 		return (EINVAL);
840 	}
841 	mlen = MIN(msgsize(mp), pkt->otp_size);
842 	pkt->otp_size = mlen;
843 	boff = 0;
844 	while (mlen > 0) {
845 		size_t wlen = MIN(MBLKL(mp), mlen);
846 		if (ddi_copyout(mp->b_rptr,
847 		    (void *)((uintptr_t)pkt->otp_buf + boff),
848 		    wlen, 0) != 0) {
849 			mutex_exit(&entry->ote_lock);
850 			return (EFAULT);
851 		}
852 		mlen -= wlen;
853 		boff += wlen;
854 		mp = mp->b_cont;
855 	}
856 	mutex_exit(&entry->ote_lock);
857 	return (0);
858 }
859 
860 static int
861 overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
862 {
863 	overlay_targ_pkt_t *pkt = arg;
864 	overlay_target_entry_t *entry;
865 	overlay_dev_t *odd;
866 	mblk_t *mp;
867 
868 	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
869 		return (EINVAL);
870 
871 	mp = allocb(pkt->otp_size, 0);
872 	if (mp == NULL)
873 		return (ENOMEM);
874 
875 	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
876 		freeb(mp);
877 		return (EFAULT);
878 	}
879 	mp->b_wptr += pkt->otp_size;
880 
881 	if (pkt->otp_linkid != UINT64_MAX) {
882 		odd = overlay_hold_by_dlid(pkt->otp_linkid);
883 		if (odd == NULL) {
884 			freeb(mp);
885 			return (ENOENT);
886 		}
887 	} else {
888 		mutex_enter(&thdl->oth_lock);
889 		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
890 		    entry = list_next(&thdl->oth_outstanding, entry)) {
891 			if ((uintptr_t)entry == pkt->otp_reqid)
892 				break;
893 		}
894 
895 		if (entry == NULL) {
896 			mutex_exit(&thdl->oth_lock);
897 			freeb(mp);
898 			return (ENOENT);
899 		}
900 		odd = entry->ote_odd;
901 		mutex_exit(&thdl->oth_lock);
902 	}
903 
904 	mutex_enter(&odd->odd_lock);
905 	if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
906 	    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
907 		/* Can't do receive... */
908 		mutex_exit(&odd->odd_lock);
909 		OVERLAY_FREEMSG(mp, "dev dropped");
910 		freeb(mp);
911 		return (EBUSY);
912 	}
913 	overlay_io_start(odd, OVERLAY_F_IN_RX);
914 	mutex_exit(&odd->odd_lock);
915 
916 	mac_rx(odd->odd_mh, NULL, mp);
917 
918 	mutex_enter(&odd->odd_lock);
919 	overlay_io_done(odd, OVERLAY_F_IN_RX);
920 	mutex_exit(&odd->odd_lock);
921 
922 	return (0);
923 }
924 
925 static int
926 overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
927 {
928 	overlay_targ_pkt_t *pkt = arg;
929 	overlay_target_entry_t *entry;
930 	overlay_dev_t *odd;
931 	mblk_t *mp;
932 
933 	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
934 		return (EINVAL);
935 
936 	mp = allocb(pkt->otp_size, 0);
937 	if (mp == NULL)
938 		return (ENOMEM);
939 
940 	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
941 		freeb(mp);
942 		return (EFAULT);
943 	}
944 	mp->b_wptr += pkt->otp_size;
945 
946 	if (pkt->otp_linkid != UINT64_MAX) {
947 		odd = overlay_hold_by_dlid(pkt->otp_linkid);
948 		if (odd == NULL) {
949 			freeb(mp);
950 			return (ENOENT);
951 		}
952 	} else {
953 		mutex_enter(&thdl->oth_lock);
954 		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
955 		    entry = list_next(&thdl->oth_outstanding, entry)) {
956 			if ((uintptr_t)entry == pkt->otp_reqid)
957 				break;
958 		}
959 
960 		if (entry == NULL) {
961 			mutex_exit(&thdl->oth_lock);
962 			freeb(mp);
963 			return (ENOENT);
964 		}
965 		odd = entry->ote_odd;
966 		mutex_exit(&thdl->oth_lock);
967 	}
968 
969 	mp = overlay_m_tx(odd, mp);
970 	freemsgchain(mp);
971 
972 	return (0);
973 }
974 
975 typedef struct overlay_targ_list_int {
976 	boolean_t	otli_count;
977 	uint32_t	otli_cur;
978 	uint32_t	otli_nents;
979 	uint32_t	otli_ents[];
980 } overlay_targ_list_int_t;
981 
982 static int
983 overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
984     int flags)
985 {
986 	overlay_targ_list_t n;
987 	overlay_targ_list_int_t *otl;
988 
989 	if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
990 	    flags & FKIOCTL) != 0)
991 		return (EFAULT);
992 
993 	/*
994 	 */
995 	if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
996 		return (EINVAL);
997 	*bsize = sizeof (overlay_targ_list_int_t) +
998 	    sizeof (uint32_t) * n.otl_nents;
999 	otl = kmem_zalloc(*bsize, KM_SLEEP);
1000 	otl->otli_cur = 0;
1001 	otl->otli_nents = n.otl_nents;
1002 	if (otl->otli_nents != 0) {
1003 		otl->otli_count = B_FALSE;
1004 		if (ddi_copyin((void *)((uintptr_t)ubuf +
1005 		    offsetof(overlay_targ_list_t, otl_ents)),
1006 		    otl->otli_ents, n.otl_nents * sizeof (uint32_t),
1007 		    flags & FKIOCTL) != 0) {
1008 			kmem_free(otl, *bsize);
1009 			return (EFAULT);
1010 		}
1011 	} else {
1012 		otl->otli_count = B_TRUE;
1013 	}
1014 
1015 	*outp = otl;
1016 	return (0);
1017 }
1018 
1019 static int
1020 overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg)
1021 {
1022 	overlay_targ_list_int_t *otl = arg;
1023 
1024 	if (otl->otli_cur < otl->otli_nents)
1025 		otl->otli_ents[otl->otli_cur] = odd->odd_linkid;
1026 	otl->otli_cur++;
1027 	return (0);
1028 }
1029 
1030 /* ARGSUSED */
1031 static int
1032 overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg)
1033 {
1034 	overlay_dev_iter(overlay_target_ioctl_list_cb, arg);
1035 	return (0);
1036 }
1037 
1038 /* ARGSUSED */
1039 static int
1040 overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags)
1041 {
1042 	overlay_targ_list_int_t *otl = buf;
1043 
1044 	if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t),
1045 	    flags & FKIOCTL) != 0)
1046 		return (EFAULT);
1047 
1048 	if (otl->otli_count == B_FALSE) {
1049 		if (ddi_copyout(otl->otli_ents,
1050 		    (void *)((uintptr_t)ubuf +
1051 		    offsetof(overlay_targ_list_t, otl_ents)),
1052 		    sizeof (uint32_t) * otl->otli_nents,
1053 		    flags & FKIOCTL) != 0)
1054 			return (EFAULT);
1055 	}
1056 	return (0);
1057 }
1058 
1059 /* ARGSUSED */
1060 static int
1061 overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
1062 {
1063 	int ret = 0;
1064 	overlay_dev_t *odd;
1065 	overlay_target_t *ott;
1066 	overlay_targ_cache_t *otc = arg;
1067 
1068 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1069 	if (odd == NULL)
1070 		return (ENOENT);
1071 
1072 	mutex_enter(&odd->odd_lock);
1073 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1074 		mutex_exit(&odd->odd_lock);
1075 		overlay_hold_rele(odd);
1076 		return (ENXIO);
1077 	}
1078 	ott = odd->odd_target;
1079 	if (ott->ott_mode != OVERLAY_TARGET_POINT &&
1080 	    ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1081 		mutex_exit(&odd->odd_lock);
1082 		overlay_hold_rele(odd);
1083 		return (ENOTSUP);
1084 	}
1085 	mutex_enter(&ott->ott_lock);
1086 	mutex_exit(&odd->odd_lock);
1087 
1088 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
1089 		otc->otc_entry.otce_flags = 0;
1090 		bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest,
1091 		    sizeof (overlay_target_point_t));
1092 	} else {
1093 		overlay_target_entry_t *ote;
1094 		ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1095 		    otc->otc_entry.otce_mac);
1096 		if (ote != NULL) {
1097 			mutex_enter(&ote->ote_lock);
1098 			if ((ote->ote_flags &
1099 			    OVERLAY_ENTRY_F_VALID_MASK) != 0) {
1100 				if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
1101 					otc->otc_entry.otce_flags =
1102 					    OVERLAY_TARGET_CACHE_DROP;
1103 				} else {
1104 					otc->otc_entry.otce_flags = 0;
1105 					bcopy(&ote->ote_dest,
1106 					    &otc->otc_entry.otce_dest,
1107 					    sizeof (overlay_target_point_t));
1108 				}
1109 				ret = 0;
1110 			} else {
1111 				ret = ENOENT;
1112 			}
1113 			mutex_exit(&ote->ote_lock);
1114 		} else {
1115 			ret = ENOENT;
1116 		}
1117 	}
1118 
1119 	mutex_exit(&ott->ott_lock);
1120 	overlay_hold_rele(odd);
1121 
1122 	return (ret);
1123 }
1124 
1125 /* ARGSUSED */
1126 static int
1127 overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
1128 {
1129 	overlay_dev_t *odd;
1130 	overlay_target_t *ott;
1131 	overlay_target_entry_t *ote;
1132 	overlay_targ_cache_t *otc = arg;
1133 	mblk_t *mp = NULL;
1134 
1135 	if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
1136 		return (EINVAL);
1137 
1138 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1139 	if (odd == NULL)
1140 		return (ENOENT);
1141 
1142 	mutex_enter(&odd->odd_lock);
1143 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1144 		mutex_exit(&odd->odd_lock);
1145 		overlay_hold_rele(odd);
1146 		return (ENXIO);
1147 	}
1148 	ott = odd->odd_target;
1149 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1150 		mutex_exit(&odd->odd_lock);
1151 		overlay_hold_rele(odd);
1152 		return (ENOTSUP);
1153 	}
1154 	mutex_enter(&ott->ott_lock);
1155 	mutex_exit(&odd->odd_lock);
1156 
1157 	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1158 	    otc->otc_entry.otce_mac);
1159 	if (ote == NULL) {
1160 		ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
1161 		bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
1162 		ote->ote_chead = ote->ote_ctail = NULL;
1163 		ote->ote_mbsize = 0;
1164 		ote->ote_ott = ott;
1165 		ote->ote_odd = odd;
1166 		mutex_enter(&ote->ote_lock);
1167 		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
1168 		avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
1169 	} else {
1170 		mutex_enter(&ote->ote_lock);
1171 	}
1172 
1173 	if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
1174 		ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
1175 	} else {
1176 		ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
1177 		bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
1178 		    sizeof (overlay_target_point_t));
1179 		mp = ote->ote_chead;
1180 		ote->ote_chead = NULL;
1181 		ote->ote_ctail = NULL;
1182 		ote->ote_mbsize = 0;
1183 		ote->ote_vtime = gethrtime();
1184 	}
1185 
1186 	mutex_exit(&ote->ote_lock);
1187 	mutex_exit(&ott->ott_lock);
1188 
1189 	if (mp != NULL) {
1190 		mp = overlay_m_tx(ote->ote_odd, mp);
1191 		freemsgchain(mp);
1192 	}
1193 
1194 	overlay_hold_rele(odd);
1195 
1196 	return (0);
1197 }
1198 
1199 /* ARGSUSED */
1200 static int
1201 overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
1202 {
1203 	int ret = 0;
1204 	overlay_dev_t *odd;
1205 	overlay_target_t *ott;
1206 	overlay_target_entry_t *ote;
1207 	overlay_targ_cache_t *otc = arg;
1208 
1209 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1210 	if (odd == NULL)
1211 		return (ENOENT);
1212 
1213 	mutex_enter(&odd->odd_lock);
1214 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1215 		mutex_exit(&odd->odd_lock);
1216 		overlay_hold_rele(odd);
1217 		return (ENXIO);
1218 	}
1219 	ott = odd->odd_target;
1220 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1221 		mutex_exit(&odd->odd_lock);
1222 		overlay_hold_rele(odd);
1223 		return (ENOTSUP);
1224 	}
1225 	mutex_enter(&ott->ott_lock);
1226 	mutex_exit(&odd->odd_lock);
1227 
1228 	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1229 	    otc->otc_entry.otce_mac);
1230 	if (ote != NULL) {
1231 		mutex_enter(&ote->ote_lock);
1232 		ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1233 		mutex_exit(&ote->ote_lock);
1234 		ret = 0;
1235 	} else {
1236 		ret = ENOENT;
1237 	}
1238 
1239 	mutex_exit(&ott->ott_lock);
1240 	overlay_hold_rele(odd);
1241 
1242 	return (ret);
1243 }
1244 
1245 /* ARGSUSED */
1246 static int
1247 overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
1248 {
1249 	avl_tree_t *avl;
1250 	overlay_dev_t *odd;
1251 	overlay_target_t *ott;
1252 	overlay_target_entry_t *ote;
1253 	overlay_targ_cache_t *otc = arg;
1254 
1255 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1256 	if (odd == NULL)
1257 		return (ENOENT);
1258 
1259 	mutex_enter(&odd->odd_lock);
1260 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1261 		mutex_exit(&odd->odd_lock);
1262 		overlay_hold_rele(odd);
1263 		return (ENXIO);
1264 	}
1265 	ott = odd->odd_target;
1266 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1267 		mutex_exit(&odd->odd_lock);
1268 		overlay_hold_rele(odd);
1269 		return (ENOTSUP);
1270 	}
1271 	mutex_enter(&ott->ott_lock);
1272 	mutex_exit(&odd->odd_lock);
1273 	avl = &ott->ott_u.ott_dyn.ott_tree;
1274 
1275 	for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
1276 		mutex_enter(&ote->ote_lock);
1277 		ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1278 		mutex_exit(&ote->ote_lock);
1279 	}
1280 	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1281 	    otc->otc_entry.otce_mac);
1282 
1283 	mutex_exit(&ott->ott_lock);
1284 	overlay_hold_rele(odd);
1285 
1286 	return (0);
1287 }
1288 
1289 static int
1290 overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
1291     int flags)
1292 {
1293 	overlay_targ_cache_iter_t base, *iter;
1294 
1295 	if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t),
1296 	    flags & FKIOCTL) != 0)
1297 		return (EFAULT);
1298 
1299 	if (base.otci_count > OVERLAY_TARGET_ITER_MAX)
1300 		return (E2BIG);
1301 
1302 	if (base.otci_count == 0)
1303 		return (EINVAL);
1304 
1305 	*bsize = sizeof (overlay_targ_cache_iter_t) +
1306 	    base.otci_count * sizeof (overlay_targ_cache_entry_t);
1307 	iter = kmem_alloc(*bsize, KM_SLEEP);
1308 	bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t));
1309 	*outp = iter;
1310 
1311 	return (0);
1312 }
1313 
1314 typedef struct overlay_targ_cache_marker {
1315 	uint8_t		otcm_mac[ETHERADDRL];
1316 	uint16_t	otcm_done;
1317 } overlay_targ_cache_marker_t;
1318 
1319 /* ARGSUSED */
1320 static int
1321 overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
1322 {
1323 	overlay_dev_t *odd;
1324 	overlay_target_t *ott;
1325 	overlay_target_entry_t lookup, *ent;
1326 	overlay_targ_cache_marker_t *mark;
1327 	avl_index_t where;
1328 	avl_tree_t *avl;
1329 	uint16_t written = 0;
1330 
1331 	overlay_targ_cache_iter_t *iter = arg;
1332 	mark = (void *)&iter->otci_marker;
1333 
1334 	if (mark->otcm_done != 0) {
1335 		iter->otci_count = 0;
1336 		return (0);
1337 	}
1338 
1339 	odd = overlay_hold_by_dlid(iter->otci_linkid);
1340 	if (odd == NULL)
1341 		return (ENOENT);
1342 
1343 	mutex_enter(&odd->odd_lock);
1344 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1345 		mutex_exit(&odd->odd_lock);
1346 		overlay_hold_rele(odd);
1347 		return (ENXIO);
1348 	}
1349 	ott = odd->odd_target;
1350 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC &&
1351 	    ott->ott_mode != OVERLAY_TARGET_POINT) {
1352 		mutex_exit(&odd->odd_lock);
1353 		overlay_hold_rele(odd);
1354 		return (ENOTSUP);
1355 	}
1356 
1357 	/*
1358 	 * Holding this lock across the entire iteration probably isn't very
1359 	 * good. We should perhaps add an r/w lock for the avl tree. But we'll
1360 	 * wait until we now it's necessary before we do more.
1361 	 */
1362 	mutex_enter(&ott->ott_lock);
1363 	mutex_exit(&odd->odd_lock);
1364 
1365 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
1366 		overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
1367 		bzero(out->otce_mac, ETHERADDRL);
1368 		out->otce_flags = 0;
1369 		bcopy(&ott->ott_u.ott_point, &out->otce_dest,
1370 		    sizeof (overlay_target_point_t));
1371 		written++;
1372 		mark->otcm_done = 1;
1373 	}
1374 
1375 	avl = &ott->ott_u.ott_dyn.ott_tree;
1376 	bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
1377 	ent = avl_find(avl, &lookup, &where);
1378 
1379 	/*
1380 	 * NULL ent means that the entry does not exist, so we want to start
1381 	 * with the closest node in the tree. This means that we implicitly rely
1382 	 * on the tree's order and the first node will be the mac 00:00:00:00:00
1383 	 * and the last will be ff:ff:ff:ff:ff:ff.
1384 	 */
1385 	if (ent == NULL) {
1386 		ent = avl_nearest(avl, where, AVL_AFTER);
1387 		if (ent == NULL) {
1388 			mark->otcm_done = 1;
1389 			goto done;
1390 		}
1391 	}
1392 
1393 	for (; ent != NULL && written < iter->otci_count;
1394 	    ent = AVL_NEXT(avl, ent)) {
1395 		overlay_targ_cache_entry_t *out = &iter->otci_ents[written];
1396 		mutex_enter(&ent->ote_lock);
1397 		if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) {
1398 			mutex_exit(&ent->ote_lock);
1399 			continue;
1400 		}
1401 		bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
1402 		out->otce_flags = 0;
1403 		if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
1404 			out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
1405 		if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
1406 			bcopy(&ent->ote_dest, &out->otce_dest,
1407 			    sizeof (overlay_target_point_t));
1408 		written++;
1409 		mutex_exit(&ent->ote_lock);
1410 	}
1411 
1412 	if (ent != NULL) {
1413 		bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
1414 	} else {
1415 		mark->otcm_done = 1;
1416 	}
1417 
1418 done:
1419 	iter->otci_count = written;
1420 	mutex_exit(&ott->ott_lock);
1421 	overlay_hold_rele(odd);
1422 
1423 	return (0);
1424 }
1425 
1426 /* ARGSUSED */
1427 static int
1428 overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
1429     int flags)
1430 {
1431 	size_t outsize;
1432 	const overlay_targ_cache_iter_t *iter = buf;
1433 
1434 	outsize = sizeof (overlay_targ_cache_iter_t) +
1435 	    iter->otci_count * sizeof (overlay_targ_cache_entry_t);
1436 
1437 	if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0)
1438 		return (EFAULT);
1439 
1440 	return (0);
1441 }
1442 
1443 static overlay_target_ioctl_t overlay_target_ioctab[] = {
1444 	{ OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
1445 		NULL, overlay_target_info,
1446 		NULL, sizeof (overlay_targ_info_t)	},
1447 	{ OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE,
1448 		NULL, overlay_target_associate,
1449 		NULL, sizeof (overlay_targ_associate_t)	},
1450 	{ OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE,
1451 		NULL, overlay_target_disassociate,
1452 		NULL, sizeof (overlay_targ_id_t)	},
1453 	{ OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE,
1454 		NULL, overlay_target_degrade,
1455 		NULL, sizeof (overlay_targ_degrade_t)	},
1456 	{ OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE,
1457 		NULL, overlay_target_restore,
1458 		NULL, sizeof (overlay_targ_id_t)	},
1459 	{ OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE,
1460 		NULL, overlay_target_lookup_request,
1461 		NULL, sizeof (overlay_targ_lookup_t)	},
1462 	{ OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE,
1463 		NULL, overlay_target_lookup_respond,
1464 		NULL, sizeof (overlay_targ_resp_t)	},
1465 	{ OVERLAY_TARG_DROP, B_TRUE, B_FALSE,
1466 		NULL, overlay_target_lookup_drop,
1467 		NULL, sizeof (overlay_targ_resp_t)	},
1468 	{ OVERLAY_TARG_PKT, B_TRUE, B_TRUE,
1469 		overlay_target_pkt_copyin,
1470 		overlay_target_packet,
1471 		overlay_target_pkt_copyout,
1472 		sizeof (overlay_targ_pkt_t)		},
1473 	{ OVERLAY_TARG_INJECT, B_TRUE, B_FALSE,
1474 		overlay_target_pkt_copyin,
1475 		overlay_target_inject,
1476 		NULL, sizeof (overlay_targ_pkt_t)	},
1477 	{ OVERLAY_TARG_RESEND, B_TRUE, B_FALSE,
1478 		overlay_target_pkt_copyin,
1479 		overlay_target_resend,
1480 		NULL, sizeof (overlay_targ_pkt_t)	},
1481 	{ OVERLAY_TARG_LIST, B_FALSE, B_TRUE,
1482 		overlay_target_list_copyin,
1483 		overlay_target_ioctl_list,
1484 		overlay_target_list_copyout,
1485 		sizeof (overlay_targ_list_t)		},
1486 	{ OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE,
1487 		NULL, overlay_target_cache_get,
1488 		NULL, sizeof (overlay_targ_cache_t)	},
1489 	{ OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE,
1490 		NULL, overlay_target_cache_set,
1491 		NULL, sizeof (overlay_targ_cache_t)	},
1492 	{ OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE,
1493 		NULL, overlay_target_cache_remove,
1494 		NULL, sizeof (overlay_targ_cache_t)	},
1495 	{ OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE,
1496 		NULL, overlay_target_cache_flush,
1497 		NULL, sizeof (overlay_targ_cache_t)	},
1498 	{ OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE,
1499 		overlay_target_cache_iter_copyin,
1500 		overlay_target_cache_iter,
1501 		overlay_target_cache_iter_copyout,
1502 		sizeof (overlay_targ_cache_iter_t)		},
1503 	{ 0 }
1504 };
1505 
1506 int
1507 overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp)
1508 {
1509 	minor_t mid;
1510 	overlay_target_hdl_t *thdl;
1511 
1512 	if (secpolicy_dl_config(credp) != 0)
1513 		return (EPERM);
1514 
1515 	if (getminor(*devp) != 0)
1516 		return (ENXIO);
1517 
1518 	if (otype & OTYP_BLK)
1519 		return (EINVAL);
1520 
1521 	if (flags & ~(FREAD | FWRITE | FEXCL))
1522 		return (EINVAL);
1523 
1524 	if ((flags & FWRITE) &&
1525 	    !(flags & FEXCL))
1526 		return (EINVAL);
1527 
1528 	if (!(flags & FREAD) && !(flags & FWRITE))
1529 		return (EINVAL);
1530 
1531 	if (crgetzoneid(credp) != GLOBAL_ZONEID)
1532 		return (EPERM);
1533 
1534 	mid = id_alloc(overlay_thdl_idspace);
1535 	if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) {
1536 		id_free(overlay_thdl_idspace, mid);
1537 		return (ENXIO);
1538 	}
1539 
1540 	thdl = ddi_get_soft_state(overlay_thdl_state, mid);
1541 	VERIFY(thdl != NULL);
1542 	thdl->oth_minor = mid;
1543 	thdl->oth_zoneid = crgetzoneid(credp);
1544 	thdl->oth_oflags = flags;
1545 	mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL);
1546 	list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t),
1547 	    offsetof(overlay_target_entry_t, ote_qlink));
1548 	*devp = makedevice(getmajor(*devp), mid);
1549 
1550 	mutex_enter(&overlay_target_lock);
1551 	if ((flags & FEXCL) && overlay_target_excl == B_TRUE) {
1552 		mutex_exit(&overlay_target_lock);
1553 		list_destroy(&thdl->oth_outstanding);
1554 		mutex_destroy(&thdl->oth_lock);
1555 		ddi_soft_state_free(overlay_thdl_state, mid);
1556 		id_free(overlay_thdl_idspace, mid);
1557 		return (EEXIST);
1558 	} else if ((flags & FEXCL) != 0) {
1559 		VERIFY(overlay_target_excl == B_FALSE);
1560 		overlay_target_excl = B_TRUE;
1561 	}
1562 	list_insert_tail(&overlay_thdl_list, thdl);
1563 	mutex_exit(&overlay_target_lock);
1564 
1565 	return (0);
1566 }
1567 
1568 /* ARGSUSED */
1569 int
1570 overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1571     int *rvalp)
1572 {
1573 	overlay_target_ioctl_t *ioc;
1574 	overlay_target_hdl_t *thdl;
1575 
1576 	if (secpolicy_dl_config(credp) != 0)
1577 		return (EPERM);
1578 
1579 	if ((thdl = ddi_get_soft_state(overlay_thdl_state,
1580 	    getminor(dev))) == NULL)
1581 		return (ENXIO);
1582 
1583 	for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) {
1584 		int ret;
1585 		caddr_t buf;
1586 		size_t bufsize;
1587 
1588 		if (ioc->oti_cmd != cmd)
1589 			continue;
1590 
1591 		if (ioc->oti_write == B_TRUE && !(mode & FWRITE))
1592 			return (EBADF);
1593 
1594 		if (ioc->oti_copyin == NULL) {
1595 			bufsize = ioc->oti_size;
1596 			buf = kmem_alloc(bufsize, KM_SLEEP);
1597 			if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize,
1598 			    mode & FKIOCTL) != 0) {
1599 				kmem_free(buf, bufsize);
1600 				return (EFAULT);
1601 			}
1602 		} else {
1603 			if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg,
1604 			    (void **)&buf, &bufsize, mode)) != 0)
1605 				return (ret);
1606 		}
1607 
1608 		ret = ioc->oti_func(thdl, buf);
1609 		if (ret == 0 && ioc->oti_size != 0 &&
1610 		    ioc->oti_ncopyout == B_TRUE) {
1611 			if (ioc->oti_copyout == NULL) {
1612 				if (ddi_copyout(buf, (void *)(uintptr_t)arg,
1613 				    bufsize, mode & FKIOCTL) != 0)
1614 					ret = EFAULT;
1615 			} else {
1616 				ret = ioc->oti_copyout((void *)(uintptr_t)arg,
1617 				    buf, bufsize, mode);
1618 			}
1619 		}
1620 
1621 		kmem_free(buf, bufsize);
1622 		return (ret);
1623 	}
1624 
1625 	return (ENOTTY);
1626 }
1627 
1628 /* ARGSUSED */
1629 int
1630 overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp)
1631 {
1632 	overlay_target_hdl_t *thdl;
1633 	overlay_target_entry_t *entry;
1634 	minor_t mid = getminor(dev);
1635 
1636 	if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL)
1637 		return (ENXIO);
1638 
1639 	mutex_enter(&overlay_target_lock);
1640 	list_remove(&overlay_thdl_list, thdl);
1641 	mutex_enter(&thdl->oth_lock);
1642 	while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL)
1643 		list_insert_tail(&overlay_target_list, entry);
1644 	cv_signal(&overlay_target_condvar);
1645 	mutex_exit(&thdl->oth_lock);
1646 	if ((thdl->oth_oflags & FEXCL) != 0) {
1647 		VERIFY(overlay_target_excl == B_TRUE);
1648 		overlay_target_excl = B_FALSE;
1649 	}
1650 	mutex_exit(&overlay_target_lock);
1651 
1652 	list_destroy(&thdl->oth_outstanding);
1653 	mutex_destroy(&thdl->oth_lock);
1654 	mid = thdl->oth_minor;
1655 	ddi_soft_state_free(overlay_thdl_state, mid);
1656 	id_free(overlay_thdl_idspace, mid);
1657 
1658 	return (0);
1659 }
1660