xref: /illumos-gate/usr/src/uts/common/io/overlay/overlay_target.c (revision 6446bd46ed1b4e9f69da153665f82181ccaedad5)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2016 Joyent, Inc.
14  */
15 
16 /*
17  * Overlay device target cache management
18  *
19  * For more information, see the big theory statement in
20  * uts/common/io/overlay/overlay.c
21  */
22 
23 #include <sys/types.h>
24 #include <sys/ethernet.h>
25 #include <sys/kmem.h>
26 #include <sys/policy.h>
27 #include <sys/sysmacros.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/strsubr.h>
31 #include <sys/mac_provider.h>
32 #include <sys/mac_client.h>
33 #include <sys/mac_client_priv.h>
34 #include <sys/vlan.h>
35 #include <sys/crc32.h>
36 #include <sys/cred.h>
37 #include <sys/file.h>
38 #include <sys/errno.h>
39 #include <sys/ddi.h>
40 #include <sys/sunddi.h>
41 
42 #include <sys/overlay_impl.h>
43 #include <sys/sdt.h>
44 
45 /*
 * This is a total straw man, but at least it's a prime number. Here we're
 * going to have to go through and do a lot of evaluation and understanding as
 * to how these target caches should grow and shrink, as well as how they
 * should react to memory pressure and evictions. This just gives us a starting
 * point that'll be 'good enough', until it's not.
51  */
52 #define	OVERLAY_HSIZE	823
53 
54 /*
55  * We use this data structure to keep track of what requests have been actively
56  * allocated to a given instance so we know what to put back on the pending
57  * list.
58  */
typedef struct overlay_target_hdl {
	minor_t oth_minor;		/* RO (read-only) */
	zoneid_t oth_zoneid;		/* RO (read-only) */
	int oth_oflags;			/* RO (read-only) */
	/* Linkage on overlay_thdl_list; protected by overlay_target_lock. */
	list_node_t oth_link;		/* overlay_target_lock */
	/* Protects oth_outstanding. */
	kmutex_t oth_lock;
	/*
	 * Entries handed out by a lookup request on this handle that have not
	 * yet been removed again by a respond or drop request.
	 */
	list_t	oth_outstanding;	/* oth_lock */
} overlay_target_hdl_t;
67 
/*
 * Callback signatures used by the ioctl table (overlay_target_ioctl_t):
 *
 *  - copyin:  (user buffer, out: kernel buffer, out: kernel buffer size,
 *    ioctl flags); returns 0 or an errno.
 *  - ioctl:   (handle the ioctl was issued on, kernel buffer); returns 0 or
 *    an errno.
 *  - copyout: (user buffer, kernel buffer, kernel buffer size, ioctl flags);
 *    returns 0 or an errno.
 */
typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
71 
/*
 * Description of a single ioctl: its command id, access requirements, and the
 * callbacks used to copy its argument in, process it, and copy it back out.
 */
typedef struct overlay_target_ioctl {
	int		oti_cmd;	/* ioctl id */
	boolean_t	oti_write;	/* ioctl requires FWRITE */
	boolean_t	oti_ncopyout;	/* copyout data? */
	overlay_target_copyin_f oti_copyin;	/* copyin func */
	overlay_target_ioctl_f oti_func; /* function to call */
	overlay_target_copyout_f oti_copyout;	/* copyout func */
	size_t		oti_size;	/* size of user level structure */
} overlay_target_ioctl_t;
81 
/* kmem cache of overlay_target_t objects ("overlay_target"). */
static kmem_cache_t *overlay_target_cache;
/* kmem cache of overlay_target_entry_t objects ("overlay_entry"). */
static kmem_cache_t *overlay_entry_cache;
/* id space "overlay_target_minors", created with bounds 1 and INT32_MAX. */
static id_space_t *overlay_thdl_idspace;
/* DDI soft state whose items are sized for overlay_target_hdl_t. */
static void *overlay_thdl_state;
86 
/*
 * When we support overlay devices in the NGZ, then all of these need to become
 * zone aware, by plugging into the netstack engine and becoming per-netstack
 * data.
 */
/* List of overlay_target_hdl_t, linked through oth_link. */
static list_t overlay_thdl_list;
/* Held while examining overlay_thdl_list and overlay_target_list. */
static kmutex_t overlay_target_lock;
/* Signalled whenever an entry is appended to overlay_target_list. */
static kcondvar_t overlay_target_condvar;
/* Queue of overlay_target_entry_t awaiting a lookup, linked by ote_qlink. */
static list_t overlay_target_list;
/* NOTE(review): presumably tracks an exclusive open -- confirm with users. */
static boolean_t overlay_target_excl;
97 
/*
 * Outstanding data per hash table entry: the maximum number of bytes of
 * packet data that may be queued on a single entry while it has no valid
 * answer. overlay_target_lookup() drops a packet rather than exceed this.
 */
static int overlay_ent_size = 128 * 1024;
102 
103 /* ARGSUSED */
104 static int
105 overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
106 {
107 	overlay_target_t *ott = buf;
108 
109 	mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL);
110 	cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL);
111 	return (0);
112 }
113 
114 /* ARGSUSED */
115 static void
116 overlay_target_cache_destructor(void *buf, void *arg)
117 {
118 	overlay_target_t *ott = buf;
119 
120 	cv_destroy(&ott->ott_cond);
121 	mutex_destroy(&ott->ott_lock);
122 }
123 
124 /* ARGSUSED */
125 static int
126 overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs)
127 {
128 	overlay_target_entry_t *ote = buf;
129 
130 	bzero(ote, sizeof (overlay_target_entry_t));
131 	mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL);
132 	return (0);
133 }
134 
135 /* ARGSUSED */
136 static void
137 overlay_entry_cache_destructor(void *buf, void *arg)
138 {
139 	overlay_target_entry_t *ote = buf;
140 
141 	mutex_destroy(&ote->ote_lock);
142 }
143 
144 static uint64_t
145 overlay_mac_hash(const void *v)
146 {
147 	uint32_t crc;
148 	CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
149 	return (crc);
150 }
151 
152 static int
153 overlay_mac_cmp(const void *a, const void *b)
154 {
155 	return (bcmp(a, b, ETHERADDRL));
156 }
157 
158 /* ARGSUSED */
159 static void
160 overlay_target_entry_dtor(void *arg)
161 {
162 	overlay_target_entry_t *ote = arg;
163 
164 	ote->ote_flags = 0;
165 	bzero(ote->ote_addr, ETHERADDRL);
166 	ote->ote_ott = NULL;
167 	ote->ote_odd = NULL;
168 	freemsgchain(ote->ote_chead);
169 	ote->ote_chead = ote->ote_ctail = NULL;
170 	ote->ote_mbsize = 0;
171 	ote->ote_vtime = 0;
172 	kmem_cache_free(overlay_entry_cache, ote);
173 }
174 
175 static int
176 overlay_mac_avl(const void *a, const void *b)
177 {
178 	int i;
179 	const overlay_target_entry_t *l, *r;
180 	l = a;
181 	r = b;
182 
183 	for (i = 0; i < ETHERADDRL; i++) {
184 		if (l->ote_addr[i] > r->ote_addr[i])
185 			return (1);
186 		else if (l->ote_addr[i] < r->ote_addr[i])
187 			return (-1);
188 	}
189 
190 	return (0);
191 }
192 
193 void
194 overlay_target_init(void)
195 {
196 	int ret;
197 	ret = ddi_soft_state_init(&overlay_thdl_state,
198 	    sizeof (overlay_target_hdl_t), 1);
199 	VERIFY(ret == 0);
200 	overlay_target_cache = kmem_cache_create("overlay_target",
201 	    sizeof (overlay_target_t), 0, overlay_target_cache_constructor,
202 	    overlay_target_cache_destructor, NULL, NULL, NULL, 0);
203 	overlay_entry_cache = kmem_cache_create("overlay_entry",
204 	    sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor,
205 	    overlay_entry_cache_destructor, NULL, NULL, NULL, 0);
206 	mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL);
207 	cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL);
208 	list_create(&overlay_target_list, sizeof (overlay_target_entry_t),
209 	    offsetof(overlay_target_entry_t, ote_qlink));
210 	list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t),
211 	    offsetof(overlay_target_hdl_t, oth_link));
212 	overlay_thdl_idspace = id_space_create("overlay_target_minors",
213 	    1, INT32_MAX);
214 }
215 
/*
 * Tear down everything created by overlay_target_init(), in the reverse
 * order of its creation.
 */
void
overlay_target_fini(void)
{
	id_space_destroy(overlay_thdl_idspace);
	list_destroy(&overlay_thdl_list);
	list_destroy(&overlay_target_list);
	cv_destroy(&overlay_target_condvar);
	mutex_destroy(&overlay_target_lock);
	kmem_cache_destroy(overlay_entry_cache);
	kmem_cache_destroy(overlay_target_cache);
	ddi_soft_state_fini(&overlay_thdl_state);
}
228 
229 void
230 overlay_target_free(overlay_dev_t *odd)
231 {
232 	if (odd->odd_target == NULL)
233 		return;
234 
235 	if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
236 		refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
237 		avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
238 		overlay_target_entry_t *ote;
239 
240 		/*
241 		 * Our AVL tree and hashtable contain the same elements,
242 		 * therefore we should just remove it from the tree, but then
243 		 * delete the entries when we remove them from the hash table
244 		 * (which happens through the refhash dtor).
245 		 */
246 		while ((ote = avl_first(ap)) != NULL)
247 			avl_remove(ap, ote);
248 
249 		avl_destroy(ap);
250 		for (ote = refhash_first(rp); ote != NULL;
251 		    ote = refhash_next(rp, ote)) {
252 			refhash_remove(rp, ote);
253 		}
254 		refhash_destroy(rp);
255 	}
256 
257 	ASSERT(odd->odd_target->ott_ocount == 0);
258 	kmem_cache_free(overlay_target_cache, odd->odd_target);
259 }
260 
261 int
262 overlay_target_busy()
263 {
264 	int ret;
265 
266 	mutex_enter(&overlay_target_lock);
267 	ret = !list_is_empty(&overlay_thdl_list);
268 	mutex_exit(&overlay_target_lock);
269 
270 	return (ret);
271 }
272 
273 static void
274 overlay_target_queue(overlay_target_entry_t *entry)
275 {
276 	mutex_enter(&overlay_target_lock);
277 	mutex_enter(&entry->ote_ott->ott_lock);
278 	if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
279 		mutex_exit(&entry->ote_ott->ott_lock);
280 		mutex_exit(&overlay_target_lock);
281 		return;
282 	}
283 	entry->ote_ott->ott_ocount++;
284 	mutex_exit(&entry->ote_ott->ott_lock);
285 	list_insert_tail(&overlay_target_list, entry);
286 	cv_signal(&overlay_target_condvar);
287 	mutex_exit(&overlay_target_lock);
288 }
289 
290 void
291 overlay_target_quiesce(overlay_target_t *ott)
292 {
293 	if (ott == NULL)
294 		return;
295 	mutex_enter(&ott->ott_lock);
296 	ott->ott_flags |= OVERLAY_T_TEARDOWN;
297 	while (ott->ott_ocount != 0)
298 		cv_wait(&ott->ott_cond, &ott->ott_lock);
299 	mutex_exit(&ott->ott_lock);
300 }
301 
302 /*
303  * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
304  * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
305  * this time, say for NVGRE, we drop all packets that mcuh this.
306  */
307 int
308 overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
309     socklen_t *slenp)
310 {
311 	int ret;
312 	struct sockaddr_in6 *v6;
313 	overlay_target_t *ott;
314 	mac_header_info_t mhi;
315 	overlay_target_entry_t *entry;
316 
317 	ASSERT(odd->odd_target != NULL);
318 
319 	/*
320 	 * At this point, the overlay device is in a mux which means that it's
321 	 * been activated. At this point, parts of the target, such as the mode
322 	 * and the destination are now read-only and we don't have to worry
323 	 * about synchronization for them.
324 	 */
325 	ott = odd->odd_target;
326 	if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
327 		return (OVERLAY_TARGET_DROP);
328 
329 	v6 = (struct sockaddr_in6 *)sock;
330 	bzero(v6, sizeof (struct sockaddr_in6));
331 	v6->sin6_family = AF_INET6;
332 
333 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
334 		mutex_enter(&ott->ott_lock);
335 		bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
336 		    sizeof (struct in6_addr));
337 		v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
338 		mutex_exit(&ott->ott_lock);
339 		*slenp = sizeof (struct sockaddr_in6);
340 
341 		return (OVERLAY_TARGET_OK);
342 	}
343 
344 	ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
345 
346 	/*
347 	 * Note we only want the MAC address here, therefore we won't bother
348 	 * using mac_vlan_header_info(). If any caller needs the vlan info at
349 	 * this point, this should change to a call to mac_vlan_header_info().
350 	 */
351 	if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
352 		return (OVERLAY_TARGET_DROP);
353 	mutex_enter(&ott->ott_lock);
354 	entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
355 	    mhi.mhi_daddr);
356 	if (entry == NULL) {
357 		entry = kmem_cache_alloc(overlay_entry_cache, KM_NOSLEEP_LAZY);
358 		if (entry == NULL) {
359 			mutex_exit(&ott->ott_lock);
360 			return (OVERLAY_TARGET_DROP);
361 		}
362 		bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
363 		entry->ote_chead = entry->ote_ctail = mp;
364 		entry->ote_mbsize = msgsize(mp);
365 		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
366 		entry->ote_ott = ott;
367 		entry->ote_odd = odd;
368 		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
369 		avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
370 		mutex_exit(&ott->ott_lock);
371 		overlay_target_queue(entry);
372 		return (OVERLAY_TARGET_ASYNC);
373 	}
374 	refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
375 	mutex_exit(&ott->ott_lock);
376 
377 	mutex_enter(&entry->ote_lock);
378 	if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
379 		ret = OVERLAY_TARGET_DROP;
380 	} else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
381 		bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
382 		    sizeof (struct in6_addr));
383 		v6->sin6_port = htons(entry->ote_dest.otp_port);
384 		*slenp = sizeof (struct sockaddr_in6);
385 		ret = OVERLAY_TARGET_OK;
386 	} else {
387 		size_t mlen = msgsize(mp);
388 
389 		if (mlen + entry->ote_mbsize > overlay_ent_size) {
390 			ret = OVERLAY_TARGET_DROP;
391 		} else {
392 			if (entry->ote_ctail != NULL) {
393 				ASSERT(entry->ote_ctail->b_next ==
394 				    NULL);
395 				entry->ote_ctail->b_next = mp;
396 				entry->ote_ctail = mp;
397 			} else {
398 				entry->ote_chead = mp;
399 				entry->ote_ctail = mp;
400 			}
401 			entry->ote_mbsize += mlen;
402 			if ((entry->ote_flags &
403 			    OVERLAY_ENTRY_F_PENDING) == 0) {
404 				entry->ote_flags |=
405 				    OVERLAY_ENTRY_F_PENDING;
406 				overlay_target_queue(entry);
407 			}
408 			ret = OVERLAY_TARGET_ASYNC;
409 		}
410 	}
411 	mutex_exit(&entry->ote_lock);
412 
413 	mutex_enter(&ott->ott_lock);
414 	refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
415 	mutex_exit(&ott->ott_lock);
416 
417 	return (ret);
418 }
419 
420 /* ARGSUSED */
421 static int
422 overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
423 {
424 	overlay_dev_t *odd;
425 	overlay_targ_info_t *oti = arg;
426 
427 	odd = overlay_hold_by_dlid(oti->oti_linkid);
428 	if (odd == NULL)
429 		return (ENOENT);
430 
431 	mutex_enter(&odd->odd_lock);
432 	oti->oti_flags = 0;
433 	oti->oti_needs = odd->odd_plugin->ovp_dest;
434 	if (odd->odd_flags & OVERLAY_F_DEGRADED)
435 		oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED;
436 	if (odd->odd_flags & OVERLAY_F_ACTIVATED)
437 		oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
438 	oti->oti_vnetid = odd->odd_vid;
439 	mutex_exit(&odd->odd_lock);
440 	overlay_hold_rele(odd);
441 	return (0);
442 }
443 
444 /* ARGSUSED */
445 static int
446 overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
447 {
448 	overlay_dev_t *odd;
449 	overlay_target_t *ott;
450 	overlay_targ_associate_t *ota = arg;
451 
452 	odd = overlay_hold_by_dlid(ota->ota_linkid);
453 	if (odd == NULL)
454 		return (ENOENT);
455 
456 	if (ota->ota_id == 0) {
457 		overlay_hold_rele(odd);
458 		return (EINVAL);
459 	}
460 
461 	if (ota->ota_mode != OVERLAY_TARGET_POINT &&
462 	    ota->ota_mode != OVERLAY_TARGET_DYNAMIC) {
463 		overlay_hold_rele(odd);
464 		return (EINVAL);
465 	}
466 
467 	if (ota->ota_provides != odd->odd_plugin->ovp_dest) {
468 		overlay_hold_rele(odd);
469 		return (EINVAL);
470 	}
471 
472 	if (ota->ota_mode == OVERLAY_TARGET_POINT) {
473 		if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) {
474 			if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) ||
475 			    IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) ||
476 			    IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) {
477 				overlay_hold_rele(odd);
478 				return (EINVAL);
479 			}
480 		}
481 
482 		if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) {
483 			if (ota->ota_point.otp_port == 0) {
484 				overlay_hold_rele(odd);
485 				return (EINVAL);
486 			}
487 		}
488 	}
489 
490 	ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
491 	ott->ott_flags = 0;
492 	ott->ott_ocount = 0;
493 	ott->ott_mode = ota->ota_mode;
494 	ott->ott_dest = ota->ota_provides;
495 	ott->ott_id = ota->ota_id;
496 
497 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
498 		bcopy(&ota->ota_point, &ott->ott_u.ott_point,
499 		    sizeof (overlay_target_point_t));
500 	} else {
501 		ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
502 		    overlay_mac_hash, overlay_mac_cmp,
503 		    overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
504 		    offsetof(overlay_target_entry_t, ote_reflink),
505 		    offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
506 		avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
507 		    sizeof (overlay_target_entry_t),
508 		    offsetof(overlay_target_entry_t, ote_avllink));
509 	}
510 	mutex_enter(&odd->odd_lock);
511 	if (odd->odd_flags & OVERLAY_F_VARPD) {
512 		mutex_exit(&odd->odd_lock);
513 		kmem_cache_free(overlay_target_cache, ott);
514 		overlay_hold_rele(odd);
515 		return (EEXIST);
516 	}
517 
518 	odd->odd_flags |= OVERLAY_F_VARPD;
519 	odd->odd_target = ott;
520 	mutex_exit(&odd->odd_lock);
521 
522 	overlay_hold_rele(odd);
523 
524 
525 	return (0);
526 }
527 
528 
529 /* ARGSUSED */
530 static int
531 overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
532 {
533 	overlay_dev_t *odd;
534 	overlay_targ_degrade_t *otd = arg;
535 
536 	odd = overlay_hold_by_dlid(otd->otd_linkid);
537 	if (odd == NULL)
538 		return (ENOENT);
539 
540 	overlay_fm_degrade(odd, otd->otd_buf);
541 	overlay_hold_rele(odd);
542 	return (0);
543 }
544 
545 /* ARGSUSED */
546 static int
547 overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
548 {
549 	overlay_dev_t *odd;
550 	overlay_targ_id_t *otid = arg;
551 
552 	odd = overlay_hold_by_dlid(otid->otid_linkid);
553 	if (odd == NULL)
554 		return (ENOENT);
555 
556 	overlay_fm_restore(odd);
557 	overlay_hold_rele(odd);
558 	return (0);
559 }
560 
561 /* ARGSUSED */
562 static int
563 overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
564 {
565 	overlay_dev_t *odd;
566 	overlay_targ_id_t *otid = arg;
567 
568 	odd = overlay_hold_by_dlid(otid->otid_linkid);
569 	if (odd == NULL)
570 		return (ENOENT);
571 
572 	mutex_enter(&odd->odd_lock);
573 	odd->odd_flags &= ~OVERLAY_F_VARPD;
574 	mutex_exit(&odd->odd_lock);
575 
576 	overlay_hold_rele(odd);
577 	return (0);
578 
579 }
580 
581 static int
582 overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
583 {
584 	overlay_targ_lookup_t *otl = arg;
585 	overlay_target_entry_t *entry;
586 	clock_t ret, timeout;
587 	mac_header_info_t mhi;
588 
589 	timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
590 again:
591 	mutex_enter(&overlay_target_lock);
592 	while (list_is_empty(&overlay_target_list)) {
593 		ret = cv_timedwait(&overlay_target_condvar,
594 		    &overlay_target_lock, timeout);
595 		if (ret == -1) {
596 			mutex_exit(&overlay_target_lock);
597 			return (ETIME);
598 		}
599 	}
600 	entry = list_remove_head(&overlay_target_list);
601 	mutex_exit(&overlay_target_lock);
602 	mutex_enter(&entry->ote_lock);
603 	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
604 		ASSERT(entry->ote_chead == NULL);
605 		mutex_exit(&entry->ote_lock);
606 		goto again;
607 	}
608 	ASSERT(entry->ote_chead != NULL);
609 
610 	/*
611 	 * If we have a bogon that doesn't have a valid mac header, drop it and
612 	 * try again.
613 	 */
614 	if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
615 	    &mhi) != 0) {
616 		boolean_t queue = B_FALSE;
617 		mblk_t *mp = entry->ote_chead;
618 		entry->ote_chead = mp->b_next;
619 		mp->b_next = NULL;
620 		if (entry->ote_ctail == mp)
621 			entry->ote_ctail = entry->ote_chead;
622 		entry->ote_mbsize -= msgsize(mp);
623 		if (entry->ote_chead != NULL)
624 			queue = B_TRUE;
625 		mutex_exit(&entry->ote_lock);
626 		if (queue == B_TRUE)
627 			overlay_target_queue(entry);
628 		freemsg(mp);
629 		goto again;
630 	}
631 
632 	otl->otl_dlid = entry->ote_odd->odd_linkid;
633 	otl->otl_reqid = (uintptr_t)entry;
634 	otl->otl_varpdid = entry->ote_ott->ott_id;
635 	otl->otl_vnetid = entry->ote_odd->odd_vid;
636 
637 	otl->otl_hdrsize = mhi.mhi_hdrsize;
638 	otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
639 	bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
640 	bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
641 	otl->otl_dsttype = mhi.mhi_dsttype;
642 	otl->otl_sap = mhi.mhi_bindsap;
643 	otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
644 	mutex_exit(&entry->ote_lock);
645 
646 	mutex_enter(&thdl->oth_lock);
647 	list_insert_tail(&thdl->oth_outstanding, entry);
648 	mutex_exit(&thdl->oth_lock);
649 
650 	return (0);
651 }
652 
653 static int
654 overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
655 {
656 	const overlay_targ_resp_t *otr = arg;
657 	overlay_target_entry_t *entry;
658 	mblk_t *mp;
659 
660 	mutex_enter(&thdl->oth_lock);
661 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
662 	    entry = list_next(&thdl->oth_outstanding, entry)) {
663 		if ((uintptr_t)entry == otr->otr_reqid)
664 			break;
665 	}
666 
667 	if (entry == NULL) {
668 		mutex_exit(&thdl->oth_lock);
669 		return (EINVAL);
670 	}
671 	list_remove(&thdl->oth_outstanding, entry);
672 	mutex_exit(&thdl->oth_lock);
673 
674 	mutex_enter(&entry->ote_lock);
675 	bcopy(&otr->otr_answer, &entry->ote_dest,
676 	    sizeof (overlay_target_point_t));
677 	entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
678 	entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
679 	mp = entry->ote_chead;
680 	entry->ote_chead = NULL;
681 	entry->ote_ctail = NULL;
682 	entry->ote_mbsize = 0;
683 	entry->ote_vtime = gethrtime();
684 	mutex_exit(&entry->ote_lock);
685 
686 	/*
687 	 * For now do an in-situ drain.
688 	 */
689 	mp = overlay_m_tx(entry->ote_odd, mp);
690 	freemsgchain(mp);
691 
692 	mutex_enter(&entry->ote_ott->ott_lock);
693 	entry->ote_ott->ott_ocount--;
694 	cv_signal(&entry->ote_ott->ott_cond);
695 	mutex_exit(&entry->ote_ott->ott_lock);
696 
697 	return (0);
698 }
699 
700 static int
701 overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
702 {
703 	const overlay_targ_resp_t *otr = arg;
704 	overlay_target_entry_t *entry;
705 	mblk_t *mp;
706 	boolean_t queue = B_FALSE;
707 
708 	mutex_enter(&thdl->oth_lock);
709 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
710 	    entry = list_next(&thdl->oth_outstanding, entry)) {
711 		if ((uintptr_t)entry == otr->otr_reqid)
712 			break;
713 	}
714 
715 	if (entry == NULL) {
716 		mutex_exit(&thdl->oth_lock);
717 		return (EINVAL);
718 	}
719 	list_remove(&thdl->oth_outstanding, entry);
720 	mutex_exit(&thdl->oth_lock);
721 
722 	mutex_enter(&entry->ote_lock);
723 
724 	/* Safeguard against a confused varpd */
725 	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
726 		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
727 		DTRACE_PROBE1(overlay__target__valid__drop,
728 		    overlay_target_entry_t *, entry);
729 		mutex_exit(&entry->ote_lock);
730 		goto done;
731 	}
732 
733 	mp = entry->ote_chead;
734 	if (mp != NULL) {
735 		entry->ote_chead = mp->b_next;
736 		mp->b_next = NULL;
737 		if (entry->ote_ctail == mp)
738 			entry->ote_ctail = entry->ote_chead;
739 		entry->ote_mbsize -= msgsize(mp);
740 	}
741 	if (entry->ote_chead != NULL) {
742 		queue = B_TRUE;
743 		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
744 	} else {
745 		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
746 	}
747 	mutex_exit(&entry->ote_lock);
748 
749 	if (queue == B_TRUE)
750 		overlay_target_queue(entry);
751 	freemsg(mp);
752 
753 done:
754 	mutex_enter(&entry->ote_ott->ott_lock);
755 	entry->ote_ott->ott_ocount--;
756 	cv_signal(&entry->ote_ott->ott_cond);
757 	mutex_exit(&entry->ote_ott->ott_lock);
758 
759 	return (0);
760 }
761 
762 /* ARGSUSED */
763 static int
764 overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize,
765     int flags)
766 {
767 	overlay_targ_pkt_t *pkt;
768 	overlay_targ_pkt32_t *pkt32;
769 
770 	pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP);
771 	*outp = pkt;
772 	*bsize = sizeof (overlay_targ_pkt_t);
773 	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
774 		uintptr_t addr;
775 
776 		if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t),
777 		    flags & FKIOCTL) != 0) {
778 			kmem_free(pkt, *bsize);
779 			return (EFAULT);
780 		}
781 		pkt32 = (overlay_targ_pkt32_t *)pkt;
782 		addr = pkt32->otp_buf;
783 		pkt->otp_buf = (void *)addr;
784 	} else {
785 		if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) {
786 			kmem_free(pkt, *bsize);
787 			return (EFAULT);
788 		}
789 	}
790 	return (0);
791 }
792 
793 static int
794 overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize,
795     int flags)
796 {
797 	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
798 		overlay_targ_pkt_t *pkt = buf;
799 		overlay_targ_pkt32_t *pkt32 = buf;
800 		uintptr_t addr = (uintptr_t)pkt->otp_buf;
801 		pkt32->otp_buf = (caddr32_t)addr;
802 		if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t),
803 		    flags & FKIOCTL) != 0)
804 			return (EFAULT);
805 	} else {
806 		if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0)
807 			return (EFAULT);
808 	}
809 	return (0);
810 }
811 
812 static int
813 overlay_target_packet(overlay_target_hdl_t *thdl, void *arg)
814 {
815 	overlay_targ_pkt_t *pkt = arg;
816 	overlay_target_entry_t *entry;
817 	mblk_t *mp;
818 	size_t mlen;
819 	size_t boff;
820 
821 	mutex_enter(&thdl->oth_lock);
822 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
823 	    entry = list_next(&thdl->oth_outstanding, entry)) {
824 		if ((uintptr_t)entry == pkt->otp_reqid)
825 			break;
826 	}
827 
828 	if (entry == NULL) {
829 		mutex_exit(&thdl->oth_lock);
830 		return (EINVAL);
831 	}
832 	mutex_enter(&entry->ote_lock);
833 	mutex_exit(&thdl->oth_lock);
834 	mp = entry->ote_chead;
835 	/* Protect against a rogue varpd */
836 	if (mp == NULL) {
837 		mutex_exit(&entry->ote_lock);
838 		return (EINVAL);
839 	}
840 	mlen = MIN(msgsize(mp), pkt->otp_size);
841 	pkt->otp_size = mlen;
842 	boff = 0;
843 	while (mlen > 0) {
844 		size_t wlen = MIN(MBLKL(mp), mlen);
845 		if (ddi_copyout(mp->b_rptr,
846 		    (void *)((uintptr_t)pkt->otp_buf + boff),
847 		    wlen, 0) != 0) {
848 			mutex_exit(&entry->ote_lock);
849 			return (EFAULT);
850 		}
851 		mlen -= wlen;
852 		boff += wlen;
853 		mp = mp->b_cont;
854 	}
855 	mutex_exit(&entry->ote_lock);
856 	return (0);
857 }
858 
859 static int
860 overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
861 {
862 	overlay_targ_pkt_t *pkt = arg;
863 	overlay_target_entry_t *entry;
864 	overlay_dev_t *odd;
865 	mblk_t *mp;
866 
867 	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
868 		return (EINVAL);
869 
870 	mp = allocb(pkt->otp_size, 0);
871 	if (mp == NULL)
872 		return (ENOMEM);
873 
874 	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
875 		freeb(mp);
876 		return (EFAULT);
877 	}
878 	mp->b_wptr += pkt->otp_size;
879 
880 	if (pkt->otp_linkid != UINT64_MAX) {
881 		odd = overlay_hold_by_dlid(pkt->otp_linkid);
882 		if (odd == NULL) {
883 			freeb(mp);
884 			return (ENOENT);
885 		}
886 	} else {
887 		mutex_enter(&thdl->oth_lock);
888 		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
889 		    entry = list_next(&thdl->oth_outstanding, entry)) {
890 			if ((uintptr_t)entry == pkt->otp_reqid)
891 				break;
892 		}
893 
894 		if (entry == NULL) {
895 			mutex_exit(&thdl->oth_lock);
896 			freeb(mp);
897 			return (ENOENT);
898 		}
899 		odd = entry->ote_odd;
900 		mutex_exit(&thdl->oth_lock);
901 	}
902 
903 	mutex_enter(&odd->odd_lock);
904 	overlay_io_start(odd, OVERLAY_F_IN_RX);
905 	mutex_exit(&odd->odd_lock);
906 
907 	mac_rx(odd->odd_mh, NULL, mp);
908 
909 	mutex_enter(&odd->odd_lock);
910 	overlay_io_done(odd, OVERLAY_F_IN_RX);
911 	mutex_exit(&odd->odd_lock);
912 
913 	return (0);
914 }
915 
916 static int
917 overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
918 {
919 	overlay_targ_pkt_t *pkt = arg;
920 	overlay_target_entry_t *entry;
921 	overlay_dev_t *odd;
922 	mblk_t *mp;
923 
924 	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
925 		return (EINVAL);
926 
927 	mp = allocb(pkt->otp_size, 0);
928 	if (mp == NULL)
929 		return (ENOMEM);
930 
931 	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
932 		freeb(mp);
933 		return (EFAULT);
934 	}
935 	mp->b_wptr += pkt->otp_size;
936 
937 	if (pkt->otp_linkid != UINT64_MAX) {
938 		odd = overlay_hold_by_dlid(pkt->otp_linkid);
939 		if (odd == NULL) {
940 			freeb(mp);
941 			return (ENOENT);
942 		}
943 	} else {
944 		mutex_enter(&thdl->oth_lock);
945 		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
946 		    entry = list_next(&thdl->oth_outstanding, entry)) {
947 			if ((uintptr_t)entry == pkt->otp_reqid)
948 				break;
949 		}
950 
951 		if (entry == NULL) {
952 			mutex_exit(&thdl->oth_lock);
953 			freeb(mp);
954 			return (ENOENT);
955 		}
956 		odd = entry->ote_odd;
957 		mutex_exit(&thdl->oth_lock);
958 	}
959 
960 	mp = overlay_m_tx(odd, mp);
961 	freemsgchain(mp);
962 
963 	return (0);
964 }
965 
/*
 * Kernel-side working state for the list ioctl, built by
 * overlay_target_list_copyin() and consumed by overlay_target_list_copyout().
 */
typedef struct overlay_targ_list_int {
	/* B_TRUE when the caller supplied zero slots; no ids are copied out */
	boolean_t	otli_count;
	/* number of devices visited so far; may exceed otli_nents */
	uint32_t	otli_cur;
	/* number of slots in otli_ents */
	uint32_t	otli_nents;
	/* link ids of the first otli_nents devices visited */
	uint32_t	otli_ents[];
} overlay_targ_list_int_t;
972 
973 static int
974 overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
975     int flags)
976 {
977 	overlay_targ_list_t n;
978 	overlay_targ_list_int_t *otl;
979 
980 	if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
981 	    flags & FKIOCTL) != 0)
982 		return (EFAULT);
983 
984 	/*
985 	 */
986 	if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
987 		return (EINVAL);
988 	*bsize = sizeof (overlay_targ_list_int_t) +
989 	    sizeof (uint32_t) * n.otl_nents;
990 	otl = kmem_zalloc(*bsize, KM_SLEEP);
991 	otl->otli_cur = 0;
992 	otl->otli_nents = n.otl_nents;
993 	if (otl->otli_nents != 0) {
994 		otl->otli_count = B_FALSE;
995 		if (ddi_copyin((void *)((uintptr_t)ubuf +
996 		    offsetof(overlay_targ_list_t, otl_ents)),
997 		    otl->otli_ents, n.otl_nents * sizeof (uint32_t),
998 		    flags & FKIOCTL) != 0) {
999 			kmem_free(otl, *bsize);
1000 			return (EFAULT);
1001 		}
1002 	} else {
1003 		otl->otli_count = B_TRUE;
1004 	}
1005 
1006 	*outp = otl;
1007 	return (0);
1008 }
1009 
1010 static int
1011 overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg)
1012 {
1013 	overlay_targ_list_int_t *otl = arg;
1014 
1015 	if (otl->otli_cur < otl->otli_nents)
1016 		otl->otli_ents[otl->otli_cur] = odd->odd_linkid;
1017 	otl->otli_cur++;
1018 	return (0);
1019 }
1020 
1021 /* ARGSUSED */
1022 static int
1023 overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg)
1024 {
1025 	overlay_dev_iter(overlay_target_ioctl_list_cb, arg);
1026 	return (0);
1027 }
1028 
1029 /* ARGSUSED */
1030 static int
1031 overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags)
1032 {
1033 	overlay_targ_list_int_t *otl = buf;
1034 
1035 	if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t),
1036 	    flags & FKIOCTL) != 0)
1037 		return (EFAULT);
1038 
1039 	if (otl->otli_count == B_FALSE) {
1040 		if (ddi_copyout(otl->otli_ents,
1041 		    (void *)((uintptr_t)ubuf +
1042 		    offsetof(overlay_targ_list_t, otl_ents)),
1043 		    sizeof (uint32_t) * otl->otli_nents,
1044 		    flags & FKIOCTL) != 0)
1045 			return (EFAULT);
1046 	}
1047 	return (0);
1048 }
1049 
1050 /* ARGSUSED */
1051 static int
1052 overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
1053 {
1054 	int ret = 0;
1055 	overlay_dev_t *odd;
1056 	overlay_target_t *ott;
1057 	overlay_targ_cache_t *otc = arg;
1058 
1059 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1060 	if (odd == NULL)
1061 		return (ENOENT);
1062 
1063 	mutex_enter(&odd->odd_lock);
1064 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1065 		mutex_exit(&odd->odd_lock);
1066 		overlay_hold_rele(odd);
1067 		return (ENXIO);
1068 	}
1069 	ott = odd->odd_target;
1070 	if (ott->ott_mode != OVERLAY_TARGET_POINT &&
1071 	    ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1072 		mutex_exit(&odd->odd_lock);
1073 		overlay_hold_rele(odd);
1074 		return (ENOTSUP);
1075 	}
1076 	mutex_enter(&ott->ott_lock);
1077 	mutex_exit(&odd->odd_lock);
1078 
1079 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
1080 		otc->otc_entry.otce_flags = 0;
1081 		bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest,
1082 		    sizeof (overlay_target_point_t));
1083 	} else {
1084 		overlay_target_entry_t *ote;
1085 		ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1086 		    otc->otc_entry.otce_mac);
1087 		if (ote != NULL) {
1088 			mutex_enter(&ote->ote_lock);
1089 			if ((ote->ote_flags &
1090 			    OVERLAY_ENTRY_F_VALID_MASK) != 0) {
1091 				if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
1092 					otc->otc_entry.otce_flags =
1093 					    OVERLAY_TARGET_CACHE_DROP;
1094 				} else {
1095 					otc->otc_entry.otce_flags = 0;
1096 					bcopy(&ote->ote_dest,
1097 					    &otc->otc_entry.otce_dest,
1098 					    sizeof (overlay_target_point_t));
1099 				}
1100 				ret = 0;
1101 			} else {
1102 				ret = ENOENT;
1103 			}
1104 			mutex_exit(&ote->ote_lock);
1105 		} else {
1106 			ret = ENOENT;
1107 		}
1108 	}
1109 
1110 	mutex_exit(&ott->ott_lock);
1111 	overlay_hold_rele(odd);
1112 
1113 	return (ret);
1114 }
1115 
1116 /* ARGSUSED */
1117 static int
1118 overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
1119 {
1120 	overlay_dev_t *odd;
1121 	overlay_target_t *ott;
1122 	overlay_target_entry_t *ote;
1123 	overlay_targ_cache_t *otc = arg;
1124 	mblk_t *mp = NULL;
1125 
1126 	if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
1127 		return (EINVAL);
1128 
1129 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1130 	if (odd == NULL)
1131 		return (ENOENT);
1132 
1133 	mutex_enter(&odd->odd_lock);
1134 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1135 		mutex_exit(&odd->odd_lock);
1136 		overlay_hold_rele(odd);
1137 		return (ENXIO);
1138 	}
1139 	ott = odd->odd_target;
1140 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1141 		mutex_exit(&odd->odd_lock);
1142 		overlay_hold_rele(odd);
1143 		return (ENOTSUP);
1144 	}
1145 	mutex_enter(&ott->ott_lock);
1146 	mutex_exit(&odd->odd_lock);
1147 
1148 	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1149 	    otc->otc_entry.otce_mac);
1150 	if (ote == NULL) {
1151 		ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
1152 		bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
1153 		ote->ote_chead = ote->ote_ctail = NULL;
1154 		ote->ote_mbsize = 0;
1155 		ote->ote_ott = ott;
1156 		ote->ote_odd = odd;
1157 		mutex_enter(&ote->ote_lock);
1158 		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
1159 		avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
1160 	} else {
1161 		mutex_enter(&ote->ote_lock);
1162 	}
1163 
1164 	if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
1165 		ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
1166 	} else {
1167 		ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
1168 		bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
1169 		    sizeof (overlay_target_point_t));
1170 		mp = ote->ote_chead;
1171 		ote->ote_chead = NULL;
1172 		ote->ote_ctail = NULL;
1173 		ote->ote_mbsize = 0;
1174 		ote->ote_vtime = gethrtime();
1175 	}
1176 
1177 	mutex_exit(&ote->ote_lock);
1178 	mutex_exit(&ott->ott_lock);
1179 
1180 	if (mp != NULL) {
1181 		mp = overlay_m_tx(ote->ote_odd, mp);
1182 		freemsgchain(mp);
1183 	}
1184 
1185 	overlay_hold_rele(odd);
1186 
1187 	return (0);
1188 }
1189 
1190 /* ARGSUSED */
1191 static int
1192 overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
1193 {
1194 	int ret = 0;
1195 	overlay_dev_t *odd;
1196 	overlay_target_t *ott;
1197 	overlay_target_entry_t *ote;
1198 	overlay_targ_cache_t *otc = arg;
1199 
1200 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1201 	if (odd == NULL)
1202 		return (ENOENT);
1203 
1204 	mutex_enter(&odd->odd_lock);
1205 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1206 		mutex_exit(&odd->odd_lock);
1207 		overlay_hold_rele(odd);
1208 		return (ENXIO);
1209 	}
1210 	ott = odd->odd_target;
1211 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1212 		mutex_exit(&odd->odd_lock);
1213 		overlay_hold_rele(odd);
1214 		return (ENOTSUP);
1215 	}
1216 	mutex_enter(&ott->ott_lock);
1217 	mutex_exit(&odd->odd_lock);
1218 
1219 	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1220 	    otc->otc_entry.otce_mac);
1221 	if (ote != NULL) {
1222 		mutex_enter(&ote->ote_lock);
1223 		ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1224 		mutex_exit(&ote->ote_lock);
1225 		ret = 0;
1226 	} else {
1227 		ret = ENOENT;
1228 	}
1229 
1230 	mutex_exit(&ott->ott_lock);
1231 	overlay_hold_rele(odd);
1232 
1233 	return (ret);
1234 }
1235 
1236 /* ARGSUSED */
1237 static int
1238 overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
1239 {
1240 	avl_tree_t *avl;
1241 	overlay_dev_t *odd;
1242 	overlay_target_t *ott;
1243 	overlay_target_entry_t *ote;
1244 	overlay_targ_cache_t *otc = arg;
1245 
1246 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1247 	if (odd == NULL)
1248 		return (ENOENT);
1249 
1250 	mutex_enter(&odd->odd_lock);
1251 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1252 		mutex_exit(&odd->odd_lock);
1253 		overlay_hold_rele(odd);
1254 		return (ENXIO);
1255 	}
1256 	ott = odd->odd_target;
1257 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1258 		mutex_exit(&odd->odd_lock);
1259 		overlay_hold_rele(odd);
1260 		return (ENOTSUP);
1261 	}
1262 	mutex_enter(&ott->ott_lock);
1263 	mutex_exit(&odd->odd_lock);
1264 	avl = &ott->ott_u.ott_dyn.ott_tree;
1265 
1266 	for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
1267 		mutex_enter(&ote->ote_lock);
1268 		ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1269 		mutex_exit(&ote->ote_lock);
1270 	}
1271 	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1272 	    otc->otc_entry.otce_mac);
1273 
1274 	mutex_exit(&ott->ott_lock);
1275 	overlay_hold_rele(odd);
1276 
1277 	return (0);
1278 }
1279 
1280 static int
1281 overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
1282     int flags)
1283 {
1284 	overlay_targ_cache_iter_t base, *iter;
1285 
1286 	if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t),
1287 	    flags & FKIOCTL) != 0)
1288 		return (EFAULT);
1289 
1290 	if (base.otci_count > OVERLAY_TARGET_ITER_MAX)
1291 		return (E2BIG);
1292 
1293 	if (base.otci_count == 0)
1294 		return (EINVAL);
1295 
1296 	*bsize = sizeof (overlay_targ_cache_iter_t) +
1297 	    base.otci_count * sizeof (overlay_targ_cache_entry_t);
1298 	iter = kmem_alloc(*bsize, KM_SLEEP);
1299 	bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t));
1300 	*outp = iter;
1301 
1302 	return (0);
1303 }
1304 
/*
 * Iteration cursor used by overlay_target_cache_iter(). The handler obtains
 * it by casting the address of the otci_marker field of the caller's
 * overlay_targ_cache_iter_t, so the cursor travels to and from userland
 * inside that field.
 *
 * NOTE(review): this relies on otci_marker being at least as large as this
 * structure -- confirm against the definition of overlay_targ_cache_iter_t.
 */
typedef struct overlay_targ_cache_marker {
	/* MAC address at which the next call resumes walking the tree */
	uint8_t		otcm_mac[ETHERADDRL];
	/* non-zero once the walk has finished; later calls return no entries */
	uint16_t	otcm_done;
} overlay_targ_cache_marker_t;
1309 
1310 /* ARGSUSED */
1311 static int
1312 overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
1313 {
1314 	overlay_dev_t *odd;
1315 	overlay_target_t *ott;
1316 	overlay_target_entry_t lookup, *ent;
1317 	overlay_targ_cache_marker_t *mark;
1318 	avl_index_t where;
1319 	avl_tree_t *avl;
1320 	uint16_t written = 0;
1321 
1322 	overlay_targ_cache_iter_t *iter = arg;
1323 	mark = (void *)&iter->otci_marker;
1324 
1325 	if (mark->otcm_done != 0) {
1326 		iter->otci_count = 0;
1327 		return (0);
1328 	}
1329 
1330 	odd = overlay_hold_by_dlid(iter->otci_linkid);
1331 	if (odd == NULL)
1332 		return (ENOENT);
1333 
1334 	mutex_enter(&odd->odd_lock);
1335 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1336 		mutex_exit(&odd->odd_lock);
1337 		overlay_hold_rele(odd);
1338 		return (ENXIO);
1339 	}
1340 	ott = odd->odd_target;
1341 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC &&
1342 	    ott->ott_mode != OVERLAY_TARGET_POINT) {
1343 		mutex_exit(&odd->odd_lock);
1344 		overlay_hold_rele(odd);
1345 		return (ENOTSUP);
1346 	}
1347 
1348 	/*
1349 	 * Holding this lock across the entire iteration probably isn't very
1350 	 * good. We should perhaps add an r/w lock for the avl tree. But we'll
1351 	 * wait until we now it's necessary before we do more.
1352 	 */
1353 	mutex_enter(&ott->ott_lock);
1354 	mutex_exit(&odd->odd_lock);
1355 
1356 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
1357 		overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
1358 		bzero(out->otce_mac, ETHERADDRL);
1359 		out->otce_flags = 0;
1360 		bcopy(&ott->ott_u.ott_point, &out->otce_dest,
1361 		    sizeof (overlay_target_point_t));
1362 		written++;
1363 		mark->otcm_done = 1;
1364 	}
1365 
1366 	avl = &ott->ott_u.ott_dyn.ott_tree;
1367 	bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
1368 	ent = avl_find(avl, &lookup, &where);
1369 
1370 	/*
1371 	 * NULL ent means that the entry does not exist, so we want to start
1372 	 * with the closest node in the tree. This means that we implicitly rely
1373 	 * on the tree's order and the first node will be the mac 00:00:00:00:00
1374 	 * and the last will be ff:ff:ff:ff:ff:ff.
1375 	 */
1376 	if (ent == NULL) {
1377 		ent = avl_nearest(avl, where, AVL_AFTER);
1378 		if (ent == NULL) {
1379 			mark->otcm_done = 1;
1380 			goto done;
1381 		}
1382 	}
1383 
1384 	for (; ent != NULL && written < iter->otci_count;
1385 	    ent = AVL_NEXT(avl, ent)) {
1386 		overlay_targ_cache_entry_t *out = &iter->otci_ents[written];
1387 		mutex_enter(&ent->ote_lock);
1388 		if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) {
1389 			mutex_exit(&ent->ote_lock);
1390 			continue;
1391 		}
1392 		bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
1393 		out->otce_flags = 0;
1394 		if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
1395 			out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
1396 		if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
1397 			bcopy(&ent->ote_dest, &out->otce_dest,
1398 			    sizeof (overlay_target_point_t));
1399 		written++;
1400 		mutex_exit(&ent->ote_lock);
1401 	}
1402 
1403 	if (ent != NULL) {
1404 		bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
1405 	} else {
1406 		mark->otcm_done = 1;
1407 	}
1408 
1409 done:
1410 	iter->otci_count = written;
1411 	mutex_exit(&ott->ott_lock);
1412 	overlay_hold_rele(odd);
1413 
1414 	return (0);
1415 }
1416 
1417 /* ARGSUSED */
1418 static int
1419 overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
1420     int flags)
1421 {
1422 	size_t outsize;
1423 	const overlay_targ_cache_iter_t *iter = buf;
1424 
1425 	outsize = sizeof (overlay_targ_cache_iter_t) +
1426 	    iter->otci_count * sizeof (overlay_targ_cache_entry_t);
1427 
1428 	if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0)
1429 		return (EFAULT);
1430 
1431 	return (0);
1432 }
1433 
/*
 * Dispatch table consulted by overlay_target_ioctl(). The table is searched
 * linearly for an entry whose command matches the ioctl being issued and is
 * terminated by an entry whose command is zero.
 *
 * Each entry supplies, in order:
 *
 *  - the ioctl command number;
 *  - two booleans. NOTE(review): these are presumably oti_write (the ioctl is
 *    refused with EBADF unless the descriptor was opened FWRITE) followed by
 *    oti_ncopyout (the buffer is copied back to userland when the handler
 *    succeeds) -- confirm the member order of overlay_target_ioctl_t;
 *  - an optional custom copy-in routine, the handler itself, and an optional
 *    custom copy-out routine. When a custom routine is NULL,
 *    overlay_target_ioctl() does a plain ddi_copyin()/ddi_copyout() of the
 *    size given in the last column;
 *  - the size of the fixed portion of the ioctl's argument structure.
 */
static overlay_target_ioctl_t overlay_target_ioctab[] = {
	{ OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
		NULL, overlay_target_info,
		NULL, sizeof (overlay_targ_info_t)	},
	{ OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE,
		NULL, overlay_target_associate,
		NULL, sizeof (overlay_targ_associate_t)	},
	{ OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE,
		NULL, overlay_target_disassociate,
		NULL, sizeof (overlay_targ_id_t)	},
	{ OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE,
		NULL, overlay_target_degrade,
		NULL, sizeof (overlay_targ_degrade_t)	},
	{ OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE,
		NULL, overlay_target_restore,
		NULL, sizeof (overlay_targ_id_t)	},
	{ OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE,
		NULL, overlay_target_lookup_request,
		NULL, sizeof (overlay_targ_lookup_t)	},
	{ OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE,
		NULL, overlay_target_lookup_respond,
		NULL, sizeof (overlay_targ_resp_t)	},
	{ OVERLAY_TARG_DROP, B_TRUE, B_FALSE,
		NULL, overlay_target_lookup_drop,
		NULL, sizeof (overlay_targ_resp_t)	},
	/* The packet ioctls carry variable-length data: custom copy-in. */
	{ OVERLAY_TARG_PKT, B_TRUE, B_TRUE,
		overlay_target_pkt_copyin,
		overlay_target_packet,
		overlay_target_pkt_copyout,
		sizeof (overlay_targ_pkt_t)		},
	{ OVERLAY_TARG_INJECT, B_TRUE, B_FALSE,
		overlay_target_pkt_copyin,
		overlay_target_inject,
		NULL, sizeof (overlay_targ_pkt_t)	},
	{ OVERLAY_TARG_RESEND, B_TRUE, B_FALSE,
		overlay_target_pkt_copyin,
		overlay_target_resend,
		NULL, sizeof (overlay_targ_pkt_t)	},
	{ OVERLAY_TARG_LIST, B_FALSE, B_TRUE,
		overlay_target_list_copyin,
		overlay_target_ioctl_list,
		overlay_target_list_copyout,
		sizeof (overlay_targ_list_t)		},
	/* Target cache inspection and manipulation. */
	{ OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE,
		NULL, overlay_target_cache_get,
		NULL, sizeof (overlay_targ_cache_t)	},
	{ OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE,
		NULL, overlay_target_cache_set,
		NULL, sizeof (overlay_targ_cache_t)	},
	{ OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE,
		NULL, overlay_target_cache_remove,
		NULL, sizeof (overlay_targ_cache_t)	},
	{ OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE,
		NULL, overlay_target_cache_flush,
		NULL, sizeof (overlay_targ_cache_t)	},
	{ OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE,
		overlay_target_cache_iter_copyin,
		overlay_target_cache_iter,
		overlay_target_cache_iter_copyout,
		sizeof (overlay_targ_cache_iter_t)		},
	/* Terminator: overlay_target_ioctl() stops at a zero command. */
	{ 0 }
};
1496 
1497 int
1498 overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp)
1499 {
1500 	minor_t mid;
1501 	overlay_target_hdl_t *thdl;
1502 
1503 	if (secpolicy_dl_config(credp) != 0)
1504 		return (EPERM);
1505 
1506 	if (getminor(*devp) != 0)
1507 		return (ENXIO);
1508 
1509 	if (otype & OTYP_BLK)
1510 		return (EINVAL);
1511 
1512 	if (flags & ~(FREAD | FWRITE | FEXCL))
1513 		return (EINVAL);
1514 
1515 	if ((flags & FWRITE) &&
1516 	    !(flags & FEXCL))
1517 		return (EINVAL);
1518 
1519 	if (!(flags & FREAD) && !(flags & FWRITE))
1520 		return (EINVAL);
1521 
1522 	if (crgetzoneid(credp) != GLOBAL_ZONEID)
1523 		return (EPERM);
1524 
1525 	mid = id_alloc(overlay_thdl_idspace);
1526 	if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) {
1527 		id_free(overlay_thdl_idspace, mid);
1528 		return (ENXIO);
1529 	}
1530 
1531 	thdl = ddi_get_soft_state(overlay_thdl_state, mid);
1532 	VERIFY(thdl != NULL);
1533 	thdl->oth_minor = mid;
1534 	thdl->oth_zoneid = crgetzoneid(credp);
1535 	thdl->oth_oflags = flags;
1536 	mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL);
1537 	list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t),
1538 	    offsetof(overlay_target_entry_t, ote_qlink));
1539 	*devp = makedevice(getmajor(*devp), mid);
1540 
1541 	mutex_enter(&overlay_target_lock);
1542 	if ((flags & FEXCL) && overlay_target_excl == B_TRUE) {
1543 		mutex_exit(&overlay_target_lock);
1544 		list_destroy(&thdl->oth_outstanding);
1545 		mutex_destroy(&thdl->oth_lock);
1546 		ddi_soft_state_free(overlay_thdl_state, mid);
1547 		id_free(overlay_thdl_idspace, mid);
1548 		return (EEXIST);
1549 	} else if ((flags & FEXCL) != 0) {
1550 		VERIFY(overlay_target_excl == B_FALSE);
1551 		overlay_target_excl = B_TRUE;
1552 	}
1553 	list_insert_tail(&overlay_thdl_list, thdl);
1554 	mutex_exit(&overlay_target_lock);
1555 
1556 	return (0);
1557 }
1558 
1559 /* ARGSUSED */
1560 int
1561 overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1562     int *rvalp)
1563 {
1564 	overlay_target_ioctl_t *ioc;
1565 	overlay_target_hdl_t *thdl;
1566 
1567 	if (secpolicy_dl_config(credp) != 0)
1568 		return (EPERM);
1569 
1570 	if ((thdl = ddi_get_soft_state(overlay_thdl_state,
1571 	    getminor(dev))) == NULL)
1572 		return (ENXIO);
1573 
1574 	for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) {
1575 		int ret;
1576 		caddr_t buf;
1577 		size_t bufsize;
1578 
1579 		if (ioc->oti_cmd != cmd)
1580 			continue;
1581 
1582 		if (ioc->oti_write == B_TRUE && !(mode & FWRITE))
1583 			return (EBADF);
1584 
1585 		if (ioc->oti_copyin == NULL) {
1586 			bufsize = ioc->oti_size;
1587 			buf = kmem_alloc(bufsize, KM_SLEEP);
1588 			if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize,
1589 			    mode & FKIOCTL) != 0) {
1590 				kmem_free(buf, bufsize);
1591 				return (EFAULT);
1592 			}
1593 		} else {
1594 			if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg,
1595 			    (void **)&buf, &bufsize, mode)) != 0)
1596 				return (ret);
1597 		}
1598 
1599 		ret = ioc->oti_func(thdl, buf);
1600 		if (ret == 0 && ioc->oti_size != 0 &&
1601 		    ioc->oti_ncopyout == B_TRUE) {
1602 			if (ioc->oti_copyout == NULL) {
1603 				if (ddi_copyout(buf, (void *)(uintptr_t)arg,
1604 				    bufsize, mode & FKIOCTL) != 0)
1605 					ret = EFAULT;
1606 			} else {
1607 				ret = ioc->oti_copyout((void *)(uintptr_t)arg,
1608 				    buf, bufsize, mode);
1609 			}
1610 		}
1611 
1612 		kmem_free(buf, bufsize);
1613 		return (ret);
1614 	}
1615 
1616 	return (ENOTTY);
1617 }
1618 
1619 /* ARGSUSED */
1620 int
1621 overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp)
1622 {
1623 	overlay_target_hdl_t *thdl;
1624 	overlay_target_entry_t *entry;
1625 	minor_t mid = getminor(dev);
1626 
1627 	if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL)
1628 		return (ENXIO);
1629 
1630 	mutex_enter(&overlay_target_lock);
1631 	list_remove(&overlay_thdl_list, thdl);
1632 	mutex_enter(&thdl->oth_lock);
1633 	while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL)
1634 		list_insert_tail(&overlay_target_list, entry);
1635 	cv_signal(&overlay_target_condvar);
1636 	mutex_exit(&thdl->oth_lock);
1637 	if ((thdl->oth_oflags & FEXCL) != 0) {
1638 		VERIFY(overlay_target_excl == B_TRUE);
1639 		overlay_target_excl = B_FALSE;
1640 	}
1641 	mutex_exit(&overlay_target_lock);
1642 
1643 	list_destroy(&thdl->oth_outstanding);
1644 	mutex_destroy(&thdl->oth_lock);
1645 	mid = thdl->oth_minor;
1646 	ddi_soft_state_free(overlay_thdl_state, mid);
1647 	id_free(overlay_thdl_idspace, mid);
1648 
1649 	return (0);
1650 }
1651