xref: /illumos-gate/usr/src/uts/common/io/overlay/overlay_target.c (revision 36589d6bb0cdae89e166b57b0d64ae56d53247d9)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2016 Joyent, Inc.
14  */
15 
16 /*
17  * Overlay device target cache management
18  *
19  * For more information, see the big theory statement in
20  * uts/common/io/overlay/overlay.c
21  */
22 
23 #include <sys/types.h>
24 #include <sys/ethernet.h>
25 #include <sys/kmem.h>
26 #include <sys/policy.h>
27 #include <sys/sysmacros.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/strsubr.h>
31 #include <sys/mac_provider.h>
32 #include <sys/mac_client.h>
33 #include <sys/mac_client_priv.h>
34 #include <sys/vlan.h>
35 #include <sys/crc32.h>
36 #include <sys/cred.h>
37 #include <sys/file.h>
38 #include <sys/errno.h>
39 #include <sys/ddi.h>
40 #include <sys/sunddi.h>
41 
42 #include <sys/overlay_impl.h>
43 #include <sys/sdt.h>
44 
/*
 * Size handed to refhash_create() when a dynamic target's MAC address hash
 * table is created. This is a total straw man, but at least it's a prime
 * number. We're going to have to do a lot of evaluation to understand how
 * these target caches should grow and shrink, as well as how they should
 * react to memory pressure and evictions. This just gives us a starting point
 * that'll be 'good enough', until it's not.
 */
#define	OVERLAY_HSIZE	823
53 
/*
 * We use this data structure to keep track of what requests have been actively
 * allocated to a given instance so we know what to put back on the pending
 * list.
 *
 * The trailing comment on each member names the lock that protects it. "RO"
 * presumably means the member is read-only once the handle has been set up
 * (the code that fills these in is not part of this view -- confirm).
 */
typedef struct overlay_target_hdl {
	minor_t oth_minor;		/* RO */
	zoneid_t oth_zoneid;		/* RO */
	int oth_oflags;			/* RO */
	/* Linkage on the global overlay_thdl_list. */
	list_node_t oth_link;		/* overlay_target_lock */
	kmutex_t oth_lock;
	/*
	 * Entries handed to this handle by a lookup request that have not yet
	 * been answered with a respond or drop.
	 */
	list_t	oth_outstanding;	/* oth_lock */
} overlay_target_hdl_t;
67 
/*
 * Signatures of the three stages of a target ioctl, as implemented by the
 * handlers in this file:
 *
 *  - copyin:  (user buffer, out: kernel buffer, out: kernel buffer size,
 *             ioctl mode flags)
 *  - ioctl:   (target handle, kernel buffer)
 *  - copyout: (user buffer, kernel buffer, kernel buffer size, ioctl mode
 *             flags)
 *
 * Each returns 0 on success or an errno value on failure.
 */
typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);

/*
 * Description of a single target ioctl.
 * NOTE(review): the name oti_ncopyout suggests "no copyout" while its comment
 * reads "copyout data?"; confirm the polarity against the dispatch code, which
 * is not part of this view.
 */
typedef struct overlay_target_ioctl {
	int		oti_cmd;	/* ioctl id */
	boolean_t	oti_write;	/* ioctl requires FWRITE */
	boolean_t	oti_ncopyout;	/* copyout data? */
	overlay_target_copyin_f oti_copyin;	/* copyin func */
	overlay_target_ioctl_f oti_func; /* function to call */
	overlay_target_copyout_f oti_copyout;	/* copyout func */
	size_t		oti_size;	/* size of user level structure */
} overlay_target_ioctl_t;
81 
/* kmem cache of overlay_target_t, created in overlay_target_init(). */
static kmem_cache_t *overlay_target_cache;
/* kmem cache of overlay_target_entry_t, created in overlay_target_init(). */
static kmem_cache_t *overlay_entry_cache;
/* id space "overlay_target_minors", covering [1, INT32_MAX). */
static id_space_t *overlay_thdl_idspace;
/* DDI soft state anchor whose items are overlay_target_hdl_t sized. */
static void *overlay_thdl_state;

/*
 * When we support overlay devices in the NGZ, then all of these need to become
 * zone aware, by plugging into the netstack engine and becoming per-netstack
 * data.
 *
 * overlay_target_lock is held around accesses to both overlay_thdl_list and
 * overlay_target_list. overlay_target_condvar is signalled whenever an entry
 * is appended to overlay_target_list.
 */
static list_t overlay_thdl_list;	/* overlay_target_hdl_t, by oth_link */
static kmutex_t overlay_target_lock;
static kcondvar_t overlay_target_condvar;
static list_t overlay_target_list;	/* entries awaiting a lookup request */
static boolean_t overlay_target_excl;

/*
 * Outstanding data per hash table entry: the maximum number of bytes of
 * packets that may be queued on a single entry while its lookup is pending.
 * Packets that would push an entry past this are dropped.
 */
static int overlay_ent_size = 128 * 1024;
102 
103 /* ARGSUSED */
104 static int
105 overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
106 {
107 	overlay_target_t *ott = buf;
108 
109 	mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL);
110 	cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL);
111 	return (0);
112 }
113 
114 /* ARGSUSED */
115 static void
116 overlay_target_cache_destructor(void *buf, void *arg)
117 {
118 	overlay_target_t *ott = buf;
119 
120 	cv_destroy(&ott->ott_cond);
121 	mutex_destroy(&ott->ott_lock);
122 }
123 
124 /* ARGSUSED */
125 static int
126 overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs)
127 {
128 	overlay_target_entry_t *ote = buf;
129 
130 	bzero(ote, sizeof (overlay_target_entry_t));
131 	mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL);
132 	return (0);
133 }
134 
135 /* ARGSUSED */
136 static void
137 overlay_entry_cache_destructor(void *buf, void *arg)
138 {
139 	overlay_target_entry_t *ote = buf;
140 
141 	mutex_destroy(&ote->ote_lock);
142 }
143 
144 static uint64_t
145 overlay_mac_hash(const void *v)
146 {
147 	uint32_t crc;
148 	CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
149 	return (crc);
150 }
151 
152 static int
153 overlay_mac_cmp(const void *a, const void *b)
154 {
155 	return (bcmp(a, b, ETHERADDRL));
156 }
157 
158 /* ARGSUSED */
159 static void
160 overlay_target_entry_dtor(void *arg)
161 {
162 	overlay_target_entry_t *ote = arg;
163 
164 	ote->ote_flags = 0;
165 	bzero(ote->ote_addr, ETHERADDRL);
166 	ote->ote_ott = NULL;
167 	ote->ote_odd = NULL;
168 	freemsgchain(ote->ote_chead);
169 	ote->ote_chead = ote->ote_ctail = NULL;
170 	ote->ote_mbsize = 0;
171 	ote->ote_vtime = 0;
172 	kmem_cache_free(overlay_entry_cache, ote);
173 }
174 
175 static int
176 overlay_mac_avl(const void *a, const void *b)
177 {
178 	int i;
179 	const overlay_target_entry_t *l, *r;
180 	l = a;
181 	r = b;
182 
183 	for (i = 0; i < ETHERADDRL; i++) {
184 		if (l->ote_addr[i] > r->ote_addr[i])
185 			return (1);
186 		else if (l->ote_addr[i] < r->ote_addr[i])
187 			return (-1);
188 	}
189 
190 	return (0);
191 }
192 
193 void
194 overlay_target_init(void)
195 {
196 	int ret;
197 	ret = ddi_soft_state_init(&overlay_thdl_state,
198 	    sizeof (overlay_target_hdl_t), 1);
199 	VERIFY(ret == 0);
200 	overlay_target_cache = kmem_cache_create("overlay_target",
201 	    sizeof (overlay_target_t), 0, overlay_target_cache_constructor,
202 	    overlay_target_cache_destructor, NULL, NULL, NULL, 0);
203 	overlay_entry_cache = kmem_cache_create("overlay_entry",
204 	    sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor,
205 	    overlay_entry_cache_destructor, NULL, NULL, NULL, 0);
206 	mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL);
207 	cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL);
208 	list_create(&overlay_target_list, sizeof (overlay_target_entry_t),
209 	    offsetof(overlay_target_entry_t, ote_qlink));
210 	list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t),
211 	    offsetof(overlay_target_hdl_t, oth_link));
212 	overlay_thdl_idspace = id_space_create("overlay_target_minors",
213 	    1, INT32_MAX);
214 }
215 
/*
 * Tear down everything overlay_target_init() created. The steps run in
 * exactly the reverse order of initialization.
 */
void
overlay_target_fini(void)
{
	id_space_destroy(overlay_thdl_idspace);
	list_destroy(&overlay_thdl_list);
	list_destroy(&overlay_target_list);
	cv_destroy(&overlay_target_condvar);
	mutex_destroy(&overlay_target_lock);
	kmem_cache_destroy(overlay_entry_cache);
	kmem_cache_destroy(overlay_target_cache);
	ddi_soft_state_fini(&overlay_thdl_state);
}
228 
229 void
230 overlay_target_free(overlay_dev_t *odd)
231 {
232 	if (odd->odd_target == NULL)
233 		return;
234 
235 	if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
236 		refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
237 		avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
238 		overlay_target_entry_t *ote;
239 
240 		/*
241 		 * Our AVL tree and hashtable contain the same elements,
242 		 * therefore we should just remove it from the tree, but then
243 		 * delete the entries when we remove them from the hash table
244 		 * (which happens through the refhash dtor).
245 		 */
246 		while ((ote = avl_first(ap)) != NULL)
247 			avl_remove(ap, ote);
248 
249 		avl_destroy(ap);
250 		for (ote = refhash_first(rp); ote != NULL;
251 		    ote = refhash_next(rp, ote)) {
252 			refhash_remove(rp, ote);
253 		}
254 		refhash_destroy(rp);
255 	}
256 
257 	ASSERT(odd->odd_target->ott_ocount == 0);
258 	kmem_cache_free(overlay_target_cache, odd->odd_target);
259 }
260 
261 int
262 overlay_target_busy()
263 {
264 	int ret;
265 
266 	mutex_enter(&overlay_target_lock);
267 	ret = !list_is_empty(&overlay_thdl_list);
268 	mutex_exit(&overlay_target_lock);
269 
270 	return (ret);
271 }
272 
273 static void
274 overlay_target_queue(overlay_target_entry_t *entry)
275 {
276 	mutex_enter(&overlay_target_lock);
277 	mutex_enter(&entry->ote_ott->ott_lock);
278 	if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
279 		mutex_exit(&entry->ote_ott->ott_lock);
280 		mutex_exit(&overlay_target_lock);
281 		return;
282 	}
283 	entry->ote_ott->ott_ocount++;
284 	mutex_exit(&entry->ote_ott->ott_lock);
285 	list_insert_tail(&overlay_target_list, entry);
286 	cv_signal(&overlay_target_condvar);
287 	mutex_exit(&overlay_target_lock);
288 }
289 
290 void
291 overlay_target_quiesce(overlay_target_t *ott)
292 {
293 	if (ott == NULL)
294 		return;
295 	mutex_enter(&ott->ott_lock);
296 	ott->ott_flags |= OVERLAY_T_TEARDOWN;
297 	while (ott->ott_ocount != 0)
298 		cv_wait(&ott->ott_cond, &ott->ott_lock);
299 	mutex_exit(&ott->ott_lock);
300 }
301 
302 /*
303  * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
304  * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
305  * this time, say for NVGRE, we drop all packets that mcuh this.
306  */
307 int
308 overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
309     socklen_t *slenp)
310 {
311 	int ret;
312 	struct sockaddr_in6 *v6;
313 	overlay_target_t *ott;
314 	mac_header_info_t mhi;
315 	overlay_target_entry_t *entry;
316 
317 	ASSERT(odd->odd_target != NULL);
318 
319 	/*
320 	 * At this point, the overlay device is in a mux which means that it's
321 	 * been activated. At this point, parts of the target, such as the mode
322 	 * and the destination are now read-only and we don't have to worry
323 	 * about synchronization for them.
324 	 */
325 	ott = odd->odd_target;
326 	if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
327 		return (OVERLAY_TARGET_DROP);
328 
329 	v6 = (struct sockaddr_in6 *)sock;
330 	bzero(v6, sizeof (struct sockaddr_in6));
331 	v6->sin6_family = AF_INET6;
332 
333 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
334 		mutex_enter(&ott->ott_lock);
335 		bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
336 		    sizeof (struct in6_addr));
337 		v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
338 		mutex_exit(&ott->ott_lock);
339 		*slenp = sizeof (struct sockaddr_in6);
340 
341 		return (OVERLAY_TARGET_OK);
342 	}
343 
344 	ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
345 
346 	/*
347 	 * Note we only want the MAC address here, therefore we won't bother
348 	 * using mac_vlan_header_info(). If any caller needs the vlan info at
349 	 * this point, this should change to a call to mac_vlan_header_info().
350 	 */
351 	if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
352 		return (OVERLAY_TARGET_DROP);
353 	mutex_enter(&ott->ott_lock);
354 	entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
355 	    mhi.mhi_daddr);
356 	if (entry == NULL) {
357 		entry = kmem_cache_alloc(overlay_entry_cache,
358 		    KM_NOSLEEP | KM_NORMALPRI);
359 		if (entry == NULL) {
360 			mutex_exit(&ott->ott_lock);
361 			return (OVERLAY_TARGET_DROP);
362 		}
363 		bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
364 		entry->ote_chead = entry->ote_ctail = mp;
365 		entry->ote_mbsize = msgsize(mp);
366 		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
367 		entry->ote_ott = ott;
368 		entry->ote_odd = odd;
369 		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
370 		avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
371 		mutex_exit(&ott->ott_lock);
372 		overlay_target_queue(entry);
373 		return (OVERLAY_TARGET_ASYNC);
374 	}
375 	refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
376 	mutex_exit(&ott->ott_lock);
377 
378 	mutex_enter(&entry->ote_lock);
379 	if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
380 		ret = OVERLAY_TARGET_DROP;
381 	} else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
382 		bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
383 		    sizeof (struct in6_addr));
384 		v6->sin6_port = htons(entry->ote_dest.otp_port);
385 		*slenp = sizeof (struct sockaddr_in6);
386 		ret = OVERLAY_TARGET_OK;
387 	} else {
388 		size_t mlen = msgsize(mp);
389 
390 		if (mlen + entry->ote_mbsize > overlay_ent_size) {
391 			ret = OVERLAY_TARGET_DROP;
392 		} else {
393 			if (entry->ote_ctail != NULL) {
394 				ASSERT(entry->ote_ctail->b_next ==
395 				    NULL);
396 				entry->ote_ctail->b_next = mp;
397 				entry->ote_ctail = mp;
398 			} else {
399 				entry->ote_chead = mp;
400 				entry->ote_ctail = mp;
401 			}
402 			entry->ote_mbsize += mlen;
403 			if ((entry->ote_flags &
404 			    OVERLAY_ENTRY_F_PENDING) == 0) {
405 				entry->ote_flags |=
406 				    OVERLAY_ENTRY_F_PENDING;
407 				overlay_target_queue(entry);
408 			}
409 			ret = OVERLAY_TARGET_ASYNC;
410 		}
411 	}
412 	mutex_exit(&entry->ote_lock);
413 
414 	mutex_enter(&ott->ott_lock);
415 	refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
416 	mutex_exit(&ott->ott_lock);
417 
418 	return (ret);
419 }
420 
421 /* ARGSUSED */
422 static int
423 overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
424 {
425 	overlay_dev_t *odd;
426 	overlay_targ_info_t *oti = arg;
427 
428 	odd = overlay_hold_by_dlid(oti->oti_linkid);
429 	if (odd == NULL)
430 		return (ENOENT);
431 
432 	mutex_enter(&odd->odd_lock);
433 	oti->oti_flags = 0;
434 	oti->oti_needs = odd->odd_plugin->ovp_dest;
435 	if (odd->odd_flags & OVERLAY_F_DEGRADED)
436 		oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED;
437 	if (odd->odd_flags & OVERLAY_F_ACTIVATED)
438 		oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
439 	oti->oti_vnetid = odd->odd_vid;
440 	mutex_exit(&odd->odd_lock);
441 	overlay_hold_rele(odd);
442 	return (0);
443 }
444 
445 /* ARGSUSED */
446 static int
447 overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
448 {
449 	overlay_dev_t *odd;
450 	overlay_target_t *ott;
451 	overlay_targ_associate_t *ota = arg;
452 
453 	odd = overlay_hold_by_dlid(ota->ota_linkid);
454 	if (odd == NULL)
455 		return (ENOENT);
456 
457 	if (ota->ota_id == 0) {
458 		overlay_hold_rele(odd);
459 		return (EINVAL);
460 	}
461 
462 	if (ota->ota_mode != OVERLAY_TARGET_POINT &&
463 	    ota->ota_mode != OVERLAY_TARGET_DYNAMIC) {
464 		overlay_hold_rele(odd);
465 		return (EINVAL);
466 	}
467 
468 	if (ota->ota_provides != odd->odd_plugin->ovp_dest) {
469 		overlay_hold_rele(odd);
470 		return (EINVAL);
471 	}
472 
473 	if (ota->ota_mode == OVERLAY_TARGET_POINT) {
474 		if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) {
475 			if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) ||
476 			    IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) ||
477 			    IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) {
478 				overlay_hold_rele(odd);
479 				return (EINVAL);
480 			}
481 		}
482 
483 		if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) {
484 			if (ota->ota_point.otp_port == 0) {
485 				overlay_hold_rele(odd);
486 				return (EINVAL);
487 			}
488 		}
489 	}
490 
491 	ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
492 	ott->ott_flags = 0;
493 	ott->ott_ocount = 0;
494 	ott->ott_mode = ota->ota_mode;
495 	ott->ott_dest = ota->ota_provides;
496 	ott->ott_id = ota->ota_id;
497 
498 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
499 		bcopy(&ota->ota_point, &ott->ott_u.ott_point,
500 		    sizeof (overlay_target_point_t));
501 	} else {
502 		ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
503 		    overlay_mac_hash, overlay_mac_cmp,
504 		    overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
505 		    offsetof(overlay_target_entry_t, ote_reflink),
506 		    offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
507 		avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
508 		    sizeof (overlay_target_entry_t),
509 		    offsetof(overlay_target_entry_t, ote_avllink));
510 	}
511 	mutex_enter(&odd->odd_lock);
512 	if (odd->odd_flags & OVERLAY_F_VARPD) {
513 		mutex_exit(&odd->odd_lock);
514 		kmem_cache_free(overlay_target_cache, ott);
515 		overlay_hold_rele(odd);
516 		return (EEXIST);
517 	}
518 
519 	odd->odd_flags |= OVERLAY_F_VARPD;
520 	odd->odd_target = ott;
521 	mutex_exit(&odd->odd_lock);
522 
523 	overlay_hold_rele(odd);
524 
525 
526 	return (0);
527 }
528 
529 
530 /* ARGSUSED */
531 static int
532 overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
533 {
534 	overlay_dev_t *odd;
535 	overlay_targ_degrade_t *otd = arg;
536 
537 	odd = overlay_hold_by_dlid(otd->otd_linkid);
538 	if (odd == NULL)
539 		return (ENOENT);
540 
541 	overlay_fm_degrade(odd, otd->otd_buf);
542 	overlay_hold_rele(odd);
543 	return (0);
544 }
545 
546 /* ARGSUSED */
547 static int
548 overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
549 {
550 	overlay_dev_t *odd;
551 	overlay_targ_id_t *otid = arg;
552 
553 	odd = overlay_hold_by_dlid(otid->otid_linkid);
554 	if (odd == NULL)
555 		return (ENOENT);
556 
557 	overlay_fm_restore(odd);
558 	overlay_hold_rele(odd);
559 	return (0);
560 }
561 
562 /* ARGSUSED */
563 static int
564 overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
565 {
566 	overlay_dev_t *odd;
567 	overlay_targ_id_t *otid = arg;
568 
569 	odd = overlay_hold_by_dlid(otid->otid_linkid);
570 	if (odd == NULL)
571 		return (ENOENT);
572 
573 	mutex_enter(&odd->odd_lock);
574 	odd->odd_flags &= ~OVERLAY_F_VARPD;
575 	mutex_exit(&odd->odd_lock);
576 
577 	overlay_hold_rele(odd);
578 	return (0);
579 
580 }
581 
582 static int
583 overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
584 {
585 	overlay_targ_lookup_t *otl = arg;
586 	overlay_target_entry_t *entry;
587 	clock_t ret, timeout;
588 	mac_header_info_t mhi;
589 
590 	timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
591 again:
592 	mutex_enter(&overlay_target_lock);
593 	while (list_is_empty(&overlay_target_list)) {
594 		ret = cv_timedwait(&overlay_target_condvar,
595 		    &overlay_target_lock, timeout);
596 		if (ret == -1) {
597 			mutex_exit(&overlay_target_lock);
598 			return (ETIME);
599 		}
600 	}
601 	entry = list_remove_head(&overlay_target_list);
602 	mutex_exit(&overlay_target_lock);
603 	mutex_enter(&entry->ote_lock);
604 	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
605 		ASSERT(entry->ote_chead == NULL);
606 		mutex_exit(&entry->ote_lock);
607 		goto again;
608 	}
609 	ASSERT(entry->ote_chead != NULL);
610 
611 	/*
612 	 * If we have a bogon that doesn't have a valid mac header, drop it and
613 	 * try again.
614 	 */
615 	if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
616 	    &mhi) != 0) {
617 		boolean_t queue = B_FALSE;
618 		mblk_t *mp = entry->ote_chead;
619 		entry->ote_chead = mp->b_next;
620 		mp->b_next = NULL;
621 		if (entry->ote_ctail == mp)
622 			entry->ote_ctail = entry->ote_chead;
623 		entry->ote_mbsize -= msgsize(mp);
624 		if (entry->ote_chead != NULL)
625 			queue = B_TRUE;
626 		mutex_exit(&entry->ote_lock);
627 		if (queue == B_TRUE)
628 			overlay_target_queue(entry);
629 		freemsg(mp);
630 		goto again;
631 	}
632 
633 	otl->otl_dlid = entry->ote_odd->odd_linkid;
634 	otl->otl_reqid = (uintptr_t)entry;
635 	otl->otl_varpdid = entry->ote_ott->ott_id;
636 	otl->otl_vnetid = entry->ote_odd->odd_vid;
637 
638 	otl->otl_hdrsize = mhi.mhi_hdrsize;
639 	otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
640 	bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL);
641 	bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL);
642 	otl->otl_dsttype = mhi.mhi_dsttype;
643 	otl->otl_sap = mhi.mhi_bindsap;
644 	otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
645 	mutex_exit(&entry->ote_lock);
646 
647 	mutex_enter(&thdl->oth_lock);
648 	list_insert_tail(&thdl->oth_outstanding, entry);
649 	mutex_exit(&thdl->oth_lock);
650 
651 	return (0);
652 }
653 
654 static int
655 overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
656 {
657 	const overlay_targ_resp_t *otr = arg;
658 	overlay_target_entry_t *entry;
659 	mblk_t *mp;
660 
661 	mutex_enter(&thdl->oth_lock);
662 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
663 	    entry = list_next(&thdl->oth_outstanding, entry)) {
664 		if ((uintptr_t)entry == otr->otr_reqid)
665 			break;
666 	}
667 
668 	if (entry == NULL) {
669 		mutex_exit(&thdl->oth_lock);
670 		return (EINVAL);
671 	}
672 	list_remove(&thdl->oth_outstanding, entry);
673 	mutex_exit(&thdl->oth_lock);
674 
675 	mutex_enter(&entry->ote_lock);
676 	bcopy(&otr->otr_answer, &entry->ote_dest,
677 	    sizeof (overlay_target_point_t));
678 	entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
679 	entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
680 	mp = entry->ote_chead;
681 	entry->ote_chead = NULL;
682 	entry->ote_ctail = NULL;
683 	entry->ote_mbsize = 0;
684 	entry->ote_vtime = gethrtime();
685 	mutex_exit(&entry->ote_lock);
686 
687 	/*
688 	 * For now do an in-situ drain.
689 	 */
690 	mp = overlay_m_tx(entry->ote_odd, mp);
691 	freemsgchain(mp);
692 
693 	mutex_enter(&entry->ote_ott->ott_lock);
694 	entry->ote_ott->ott_ocount--;
695 	cv_signal(&entry->ote_ott->ott_cond);
696 	mutex_exit(&entry->ote_ott->ott_lock);
697 
698 	return (0);
699 }
700 
701 static int
702 overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
703 {
704 	const overlay_targ_resp_t *otr = arg;
705 	overlay_target_entry_t *entry;
706 	mblk_t *mp;
707 	boolean_t queue = B_FALSE;
708 
709 	mutex_enter(&thdl->oth_lock);
710 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
711 	    entry = list_next(&thdl->oth_outstanding, entry)) {
712 		if ((uintptr_t)entry == otr->otr_reqid)
713 			break;
714 	}
715 
716 	if (entry == NULL) {
717 		mutex_exit(&thdl->oth_lock);
718 		return (EINVAL);
719 	}
720 	list_remove(&thdl->oth_outstanding, entry);
721 	mutex_exit(&thdl->oth_lock);
722 
723 	mutex_enter(&entry->ote_lock);
724 
725 	/* Safeguard against a confused varpd */
726 	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
727 		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
728 		DTRACE_PROBE1(overlay__target__valid__drop,
729 		    overlay_target_entry_t *, entry);
730 		mutex_exit(&entry->ote_lock);
731 		goto done;
732 	}
733 
734 	mp = entry->ote_chead;
735 	if (mp != NULL) {
736 		entry->ote_chead = mp->b_next;
737 		mp->b_next = NULL;
738 		if (entry->ote_ctail == mp)
739 			entry->ote_ctail = entry->ote_chead;
740 		entry->ote_mbsize -= msgsize(mp);
741 	}
742 	if (entry->ote_chead != NULL) {
743 		queue = B_TRUE;
744 		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
745 	} else {
746 		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
747 	}
748 	mutex_exit(&entry->ote_lock);
749 
750 	if (queue == B_TRUE)
751 		overlay_target_queue(entry);
752 	freemsg(mp);
753 
754 done:
755 	mutex_enter(&entry->ote_ott->ott_lock);
756 	entry->ote_ott->ott_ocount--;
757 	cv_signal(&entry->ote_ott->ott_cond);
758 	mutex_exit(&entry->ote_ott->ott_lock);
759 
760 	return (0);
761 }
762 
763 /* ARGSUSED */
764 static int
765 overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize,
766     int flags)
767 {
768 	overlay_targ_pkt_t *pkt;
769 	overlay_targ_pkt32_t *pkt32;
770 
771 	pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP);
772 	*outp = pkt;
773 	*bsize = sizeof (overlay_targ_pkt_t);
774 	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
775 		uintptr_t addr;
776 
777 		if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t),
778 		    flags & FKIOCTL) != 0) {
779 			kmem_free(pkt, *bsize);
780 			return (EFAULT);
781 		}
782 		pkt32 = (overlay_targ_pkt32_t *)pkt;
783 		addr = pkt32->otp_buf;
784 		pkt->otp_buf = (void *)addr;
785 	} else {
786 		if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) {
787 			kmem_free(pkt, *bsize);
788 			return (EFAULT);
789 		}
790 	}
791 	return (0);
792 }
793 
794 static int
795 overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize,
796     int flags)
797 {
798 	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
799 		overlay_targ_pkt_t *pkt = buf;
800 		overlay_targ_pkt32_t *pkt32 = buf;
801 		uintptr_t addr = (uintptr_t)pkt->otp_buf;
802 		pkt32->otp_buf = (caddr32_t)addr;
803 		if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t),
804 		    flags & FKIOCTL) != 0)
805 			return (EFAULT);
806 	} else {
807 		if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0)
808 			return (EFAULT);
809 	}
810 	return (0);
811 }
812 
813 static int
814 overlay_target_packet(overlay_target_hdl_t *thdl, void *arg)
815 {
816 	overlay_targ_pkt_t *pkt = arg;
817 	overlay_target_entry_t *entry;
818 	mblk_t *mp;
819 	size_t mlen;
820 	size_t boff;
821 
822 	mutex_enter(&thdl->oth_lock);
823 	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
824 	    entry = list_next(&thdl->oth_outstanding, entry)) {
825 		if ((uintptr_t)entry == pkt->otp_reqid)
826 			break;
827 	}
828 
829 	if (entry == NULL) {
830 		mutex_exit(&thdl->oth_lock);
831 		return (EINVAL);
832 	}
833 	mutex_enter(&entry->ote_lock);
834 	mutex_exit(&thdl->oth_lock);
835 	mp = entry->ote_chead;
836 	/* Protect against a rogue varpd */
837 	if (mp == NULL) {
838 		mutex_exit(&entry->ote_lock);
839 		return (EINVAL);
840 	}
841 	mlen = MIN(msgsize(mp), pkt->otp_size);
842 	pkt->otp_size = mlen;
843 	boff = 0;
844 	while (mlen > 0) {
845 		size_t wlen = MIN(MBLKL(mp), mlen);
846 		if (ddi_copyout(mp->b_rptr,
847 		    (void *)((uintptr_t)pkt->otp_buf + boff),
848 		    wlen, 0) != 0) {
849 			mutex_exit(&entry->ote_lock);
850 			return (EFAULT);
851 		}
852 		mlen -= wlen;
853 		boff += wlen;
854 		mp = mp->b_cont;
855 	}
856 	mutex_exit(&entry->ote_lock);
857 	return (0);
858 }
859 
860 static int
861 overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
862 {
863 	overlay_targ_pkt_t *pkt = arg;
864 	overlay_target_entry_t *entry;
865 	overlay_dev_t *odd;
866 	mblk_t *mp;
867 
868 	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
869 		return (EINVAL);
870 
871 	mp = allocb(pkt->otp_size, 0);
872 	if (mp == NULL)
873 		return (ENOMEM);
874 
875 	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
876 		freeb(mp);
877 		return (EFAULT);
878 	}
879 	mp->b_wptr += pkt->otp_size;
880 
881 	if (pkt->otp_linkid != UINT64_MAX) {
882 		odd = overlay_hold_by_dlid(pkt->otp_linkid);
883 		if (odd == NULL) {
884 			freeb(mp);
885 			return (ENOENT);
886 		}
887 	} else {
888 		mutex_enter(&thdl->oth_lock);
889 		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
890 		    entry = list_next(&thdl->oth_outstanding, entry)) {
891 			if ((uintptr_t)entry == pkt->otp_reqid)
892 				break;
893 		}
894 
895 		if (entry == NULL) {
896 			mutex_exit(&thdl->oth_lock);
897 			freeb(mp);
898 			return (ENOENT);
899 		}
900 		odd = entry->ote_odd;
901 		mutex_exit(&thdl->oth_lock);
902 	}
903 
904 	mutex_enter(&odd->odd_lock);
905 	overlay_io_start(odd, OVERLAY_F_IN_RX);
906 	mutex_exit(&odd->odd_lock);
907 
908 	mac_rx(odd->odd_mh, NULL, mp);
909 
910 	mutex_enter(&odd->odd_lock);
911 	overlay_io_done(odd, OVERLAY_F_IN_RX);
912 	mutex_exit(&odd->odd_lock);
913 
914 	return (0);
915 }
916 
917 static int
918 overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
919 {
920 	overlay_targ_pkt_t *pkt = arg;
921 	overlay_target_entry_t *entry;
922 	overlay_dev_t *odd;
923 	mblk_t *mp;
924 
925 	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
926 		return (EINVAL);
927 
928 	mp = allocb(pkt->otp_size, 0);
929 	if (mp == NULL)
930 		return (ENOMEM);
931 
932 	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
933 		freeb(mp);
934 		return (EFAULT);
935 	}
936 	mp->b_wptr += pkt->otp_size;
937 
938 	if (pkt->otp_linkid != UINT64_MAX) {
939 		odd = overlay_hold_by_dlid(pkt->otp_linkid);
940 		if (odd == NULL) {
941 			freeb(mp);
942 			return (ENOENT);
943 		}
944 	} else {
945 		mutex_enter(&thdl->oth_lock);
946 		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
947 		    entry = list_next(&thdl->oth_outstanding, entry)) {
948 			if ((uintptr_t)entry == pkt->otp_reqid)
949 				break;
950 		}
951 
952 		if (entry == NULL) {
953 			mutex_exit(&thdl->oth_lock);
954 			freeb(mp);
955 			return (ENOENT);
956 		}
957 		odd = entry->ote_odd;
958 		mutex_exit(&thdl->oth_lock);
959 	}
960 
961 	mp = overlay_m_tx(odd, mp);
962 	freemsgchain(mp);
963 
964 	return (0);
965 }
966 
/*
 * Kernel-side working copy of a device list request. It is built by the list
 * copyin routine, filled in by the per-device callback and consumed by the
 * list copyout routine.
 */
typedef struct overlay_targ_list_int {
	/* B_TRUE when the caller supplied room for zero entries */
	boolean_t	otli_count;
	/* number of devices visited so far; may exceed otli_nents */
	uint32_t	otli_cur;
	/* number of slots in otli_ents, as requested by the caller */
	uint32_t	otli_nents;
	/* link ids of the visited devices, up to otli_nents of them */
	uint32_t	otli_ents[];
} overlay_targ_list_int_t;
973 
974 static int
975 overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
976     int flags)
977 {
978 	overlay_targ_list_t n;
979 	overlay_targ_list_int_t *otl;
980 
981 	if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
982 	    flags & FKIOCTL) != 0)
983 		return (EFAULT);
984 
985 	/*
986 	 */
987 	if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
988 		return (EINVAL);
989 	*bsize = sizeof (overlay_targ_list_int_t) +
990 	    sizeof (uint32_t) * n.otl_nents;
991 	otl = kmem_zalloc(*bsize, KM_SLEEP);
992 	otl->otli_cur = 0;
993 	otl->otli_nents = n.otl_nents;
994 	if (otl->otli_nents != 0) {
995 		otl->otli_count = B_FALSE;
996 		if (ddi_copyin((void *)((uintptr_t)ubuf +
997 		    offsetof(overlay_targ_list_t, otl_ents)),
998 		    otl->otli_ents, n.otl_nents * sizeof (uint32_t),
999 		    flags & FKIOCTL) != 0) {
1000 			kmem_free(otl, *bsize);
1001 			return (EFAULT);
1002 		}
1003 	} else {
1004 		otl->otli_count = B_TRUE;
1005 	}
1006 
1007 	*outp = otl;
1008 	return (0);
1009 }
1010 
1011 static int
1012 overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg)
1013 {
1014 	overlay_targ_list_int_t *otl = arg;
1015 
1016 	if (otl->otli_cur < otl->otli_nents)
1017 		otl->otli_ents[otl->otli_cur] = odd->odd_linkid;
1018 	otl->otli_cur++;
1019 	return (0);
1020 }
1021 
1022 /* ARGSUSED */
1023 static int
1024 overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg)
1025 {
1026 	overlay_dev_iter(overlay_target_ioctl_list_cb, arg);
1027 	return (0);
1028 }
1029 
1030 /* ARGSUSED */
1031 static int
1032 overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags)
1033 {
1034 	overlay_targ_list_int_t *otl = buf;
1035 
1036 	if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t),
1037 	    flags & FKIOCTL) != 0)
1038 		return (EFAULT);
1039 
1040 	if (otl->otli_count == B_FALSE) {
1041 		if (ddi_copyout(otl->otli_ents,
1042 		    (void *)((uintptr_t)ubuf +
1043 		    offsetof(overlay_targ_list_t, otl_ents)),
1044 		    sizeof (uint32_t) * otl->otli_nents,
1045 		    flags & FKIOCTL) != 0)
1046 			return (EFAULT);
1047 	}
1048 	return (0);
1049 }
1050 
1051 /* ARGSUSED */
1052 static int
1053 overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
1054 {
1055 	int ret = 0;
1056 	overlay_dev_t *odd;
1057 	overlay_target_t *ott;
1058 	overlay_targ_cache_t *otc = arg;
1059 
1060 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1061 	if (odd == NULL)
1062 		return (ENOENT);
1063 
1064 	mutex_enter(&odd->odd_lock);
1065 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1066 		mutex_exit(&odd->odd_lock);
1067 		overlay_hold_rele(odd);
1068 		return (ENXIO);
1069 	}
1070 	ott = odd->odd_target;
1071 	if (ott->ott_mode != OVERLAY_TARGET_POINT &&
1072 	    ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1073 		mutex_exit(&odd->odd_lock);
1074 		overlay_hold_rele(odd);
1075 		return (ENOTSUP);
1076 	}
1077 	mutex_enter(&ott->ott_lock);
1078 	mutex_exit(&odd->odd_lock);
1079 
1080 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
1081 		otc->otc_entry.otce_flags = 0;
1082 		bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest,
1083 		    sizeof (overlay_target_point_t));
1084 	} else {
1085 		overlay_target_entry_t *ote;
1086 		ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1087 		    otc->otc_entry.otce_mac);
1088 		if (ote != NULL) {
1089 			mutex_enter(&ote->ote_lock);
1090 			if ((ote->ote_flags &
1091 			    OVERLAY_ENTRY_F_VALID_MASK) != 0) {
1092 				if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
1093 					otc->otc_entry.otce_flags =
1094 					    OVERLAY_TARGET_CACHE_DROP;
1095 				} else {
1096 					otc->otc_entry.otce_flags = 0;
1097 					bcopy(&ote->ote_dest,
1098 					    &otc->otc_entry.otce_dest,
1099 					    sizeof (overlay_target_point_t));
1100 				}
1101 				ret = 0;
1102 			} else {
1103 				ret = ENOENT;
1104 			}
1105 			mutex_exit(&ote->ote_lock);
1106 		} else {
1107 			ret = ENOENT;
1108 		}
1109 	}
1110 
1111 	mutex_exit(&ott->ott_lock);
1112 	overlay_hold_rele(odd);
1113 
1114 	return (ret);
1115 }
1116 
1117 /* ARGSUSED */
1118 static int
1119 overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
1120 {
1121 	overlay_dev_t *odd;
1122 	overlay_target_t *ott;
1123 	overlay_target_entry_t *ote;
1124 	overlay_targ_cache_t *otc = arg;
1125 	mblk_t *mp = NULL;
1126 
1127 	if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
1128 		return (EINVAL);
1129 
1130 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1131 	if (odd == NULL)
1132 		return (ENOENT);
1133 
1134 	mutex_enter(&odd->odd_lock);
1135 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1136 		mutex_exit(&odd->odd_lock);
1137 		overlay_hold_rele(odd);
1138 		return (ENXIO);
1139 	}
1140 	ott = odd->odd_target;
1141 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1142 		mutex_exit(&odd->odd_lock);
1143 		overlay_hold_rele(odd);
1144 		return (ENOTSUP);
1145 	}
1146 	mutex_enter(&ott->ott_lock);
1147 	mutex_exit(&odd->odd_lock);
1148 
1149 	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1150 	    otc->otc_entry.otce_mac);
1151 	if (ote == NULL) {
1152 		ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
1153 		bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
1154 		ote->ote_chead = ote->ote_ctail = NULL;
1155 		ote->ote_mbsize = 0;
1156 		ote->ote_ott = ott;
1157 		ote->ote_odd = odd;
1158 		mutex_enter(&ote->ote_lock);
1159 		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
1160 		avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
1161 	} else {
1162 		mutex_enter(&ote->ote_lock);
1163 	}
1164 
1165 	if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
1166 		ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
1167 	} else {
1168 		ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
1169 		bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
1170 		    sizeof (overlay_target_point_t));
1171 		mp = ote->ote_chead;
1172 		ote->ote_chead = NULL;
1173 		ote->ote_ctail = NULL;
1174 		ote->ote_mbsize = 0;
1175 		ote->ote_vtime = gethrtime();
1176 	}
1177 
1178 	mutex_exit(&ote->ote_lock);
1179 	mutex_exit(&ott->ott_lock);
1180 
1181 	if (mp != NULL) {
1182 		mp = overlay_m_tx(ote->ote_odd, mp);
1183 		freemsgchain(mp);
1184 	}
1185 
1186 	overlay_hold_rele(odd);
1187 
1188 	return (0);
1189 }
1190 
1191 /* ARGSUSED */
1192 static int
1193 overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
1194 {
1195 	int ret = 0;
1196 	overlay_dev_t *odd;
1197 	overlay_target_t *ott;
1198 	overlay_target_entry_t *ote;
1199 	overlay_targ_cache_t *otc = arg;
1200 
1201 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1202 	if (odd == NULL)
1203 		return (ENOENT);
1204 
1205 	mutex_enter(&odd->odd_lock);
1206 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1207 		mutex_exit(&odd->odd_lock);
1208 		overlay_hold_rele(odd);
1209 		return (ENXIO);
1210 	}
1211 	ott = odd->odd_target;
1212 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1213 		mutex_exit(&odd->odd_lock);
1214 		overlay_hold_rele(odd);
1215 		return (ENOTSUP);
1216 	}
1217 	mutex_enter(&ott->ott_lock);
1218 	mutex_exit(&odd->odd_lock);
1219 
1220 	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1221 	    otc->otc_entry.otce_mac);
1222 	if (ote != NULL) {
1223 		mutex_enter(&ote->ote_lock);
1224 		ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1225 		mutex_exit(&ote->ote_lock);
1226 		ret = 0;
1227 	} else {
1228 		ret = ENOENT;
1229 	}
1230 
1231 	mutex_exit(&ott->ott_lock);
1232 	overlay_hold_rele(odd);
1233 
1234 	return (ret);
1235 }
1236 
1237 /* ARGSUSED */
1238 static int
1239 overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
1240 {
1241 	avl_tree_t *avl;
1242 	overlay_dev_t *odd;
1243 	overlay_target_t *ott;
1244 	overlay_target_entry_t *ote;
1245 	overlay_targ_cache_t *otc = arg;
1246 
1247 	odd = overlay_hold_by_dlid(otc->otc_linkid);
1248 	if (odd == NULL)
1249 		return (ENOENT);
1250 
1251 	mutex_enter(&odd->odd_lock);
1252 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1253 		mutex_exit(&odd->odd_lock);
1254 		overlay_hold_rele(odd);
1255 		return (ENXIO);
1256 	}
1257 	ott = odd->odd_target;
1258 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1259 		mutex_exit(&odd->odd_lock);
1260 		overlay_hold_rele(odd);
1261 		return (ENOTSUP);
1262 	}
1263 	mutex_enter(&ott->ott_lock);
1264 	mutex_exit(&odd->odd_lock);
1265 	avl = &ott->ott_u.ott_dyn.ott_tree;
1266 
1267 	for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
1268 		mutex_enter(&ote->ote_lock);
1269 		ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1270 		mutex_exit(&ote->ote_lock);
1271 	}
1272 	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1273 	    otc->otc_entry.otce_mac);
1274 
1275 	mutex_exit(&ott->ott_lock);
1276 	overlay_hold_rele(odd);
1277 
1278 	return (0);
1279 }
1280 
1281 static int
1282 overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
1283     int flags)
1284 {
1285 	overlay_targ_cache_iter_t base, *iter;
1286 
1287 	if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t),
1288 	    flags & FKIOCTL) != 0)
1289 		return (EFAULT);
1290 
1291 	if (base.otci_count > OVERLAY_TARGET_ITER_MAX)
1292 		return (E2BIG);
1293 
1294 	if (base.otci_count == 0)
1295 		return (EINVAL);
1296 
1297 	*bsize = sizeof (overlay_targ_cache_iter_t) +
1298 	    base.otci_count * sizeof (overlay_targ_cache_entry_t);
1299 	iter = kmem_alloc(*bsize, KM_SLEEP);
1300 	bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t));
1301 	*outp = iter;
1302 
1303 	return (0);
1304 }
1305 
1306 typedef struct overlay_targ_cache_marker {
1307 	uint8_t		otcm_mac[ETHERADDRL];
1308 	uint16_t	otcm_done;
1309 } overlay_targ_cache_marker_t;
1310 
1311 /* ARGSUSED */
1312 static int
1313 overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
1314 {
1315 	overlay_dev_t *odd;
1316 	overlay_target_t *ott;
1317 	overlay_target_entry_t lookup, *ent;
1318 	overlay_targ_cache_marker_t *mark;
1319 	avl_index_t where;
1320 	avl_tree_t *avl;
1321 	uint16_t written = 0;
1322 
1323 	overlay_targ_cache_iter_t *iter = arg;
1324 	mark = (void *)&iter->otci_marker;
1325 
1326 	if (mark->otcm_done != 0) {
1327 		iter->otci_count = 0;
1328 		return (0);
1329 	}
1330 
1331 	odd = overlay_hold_by_dlid(iter->otci_linkid);
1332 	if (odd == NULL)
1333 		return (ENOENT);
1334 
1335 	mutex_enter(&odd->odd_lock);
1336 	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1337 		mutex_exit(&odd->odd_lock);
1338 		overlay_hold_rele(odd);
1339 		return (ENXIO);
1340 	}
1341 	ott = odd->odd_target;
1342 	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC &&
1343 	    ott->ott_mode != OVERLAY_TARGET_POINT) {
1344 		mutex_exit(&odd->odd_lock);
1345 		overlay_hold_rele(odd);
1346 		return (ENOTSUP);
1347 	}
1348 
1349 	/*
1350 	 * Holding this lock across the entire iteration probably isn't very
1351 	 * good. We should perhaps add an r/w lock for the avl tree. But we'll
1352 	 * wait until we now it's necessary before we do more.
1353 	 */
1354 	mutex_enter(&ott->ott_lock);
1355 	mutex_exit(&odd->odd_lock);
1356 
1357 	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
1358 		overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
1359 		bzero(out->otce_mac, ETHERADDRL);
1360 		out->otce_flags = 0;
1361 		bcopy(&ott->ott_u.ott_point, &out->otce_dest,
1362 		    sizeof (overlay_target_point_t));
1363 		written++;
1364 		mark->otcm_done = 1;
1365 	}
1366 
1367 	avl = &ott->ott_u.ott_dyn.ott_tree;
1368 	bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
1369 	ent = avl_find(avl, &lookup, &where);
1370 
1371 	/*
1372 	 * NULL ent means that the entry does not exist, so we want to start
1373 	 * with the closest node in the tree. This means that we implicitly rely
1374 	 * on the tree's order and the first node will be the mac 00:00:00:00:00
1375 	 * and the last will be ff:ff:ff:ff:ff:ff.
1376 	 */
1377 	if (ent == NULL) {
1378 		ent = avl_nearest(avl, where, AVL_AFTER);
1379 		if (ent == NULL) {
1380 			mark->otcm_done = 1;
1381 			goto done;
1382 		}
1383 	}
1384 
1385 	for (; ent != NULL && written < iter->otci_count;
1386 	    ent = AVL_NEXT(avl, ent)) {
1387 		overlay_targ_cache_entry_t *out = &iter->otci_ents[written];
1388 		mutex_enter(&ent->ote_lock);
1389 		if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) {
1390 			mutex_exit(&ent->ote_lock);
1391 			continue;
1392 		}
1393 		bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
1394 		out->otce_flags = 0;
1395 		if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
1396 			out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
1397 		if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
1398 			bcopy(&ent->ote_dest, &out->otce_dest,
1399 			    sizeof (overlay_target_point_t));
1400 		written++;
1401 		mutex_exit(&ent->ote_lock);
1402 	}
1403 
1404 	if (ent != NULL) {
1405 		bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
1406 	} else {
1407 		mark->otcm_done = 1;
1408 	}
1409 
1410 done:
1411 	iter->otci_count = written;
1412 	mutex_exit(&ott->ott_lock);
1413 	overlay_hold_rele(odd);
1414 
1415 	return (0);
1416 }
1417 
1418 /* ARGSUSED */
1419 static int
1420 overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
1421     int flags)
1422 {
1423 	size_t outsize;
1424 	const overlay_targ_cache_iter_t *iter = buf;
1425 
1426 	outsize = sizeof (overlay_targ_cache_iter_t) +
1427 	    iter->otci_count * sizeof (overlay_targ_cache_entry_t);
1428 
1429 	if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0)
1430 		return (EFAULT);
1431 
1432 	return (0);
1433 }
1434 
1435 static overlay_target_ioctl_t overlay_target_ioctab[] = {
1436 	{ OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
1437 		NULL, overlay_target_info,
1438 		NULL, sizeof (overlay_targ_info_t)	},
1439 	{ OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE,
1440 		NULL, overlay_target_associate,
1441 		NULL, sizeof (overlay_targ_associate_t)	},
1442 	{ OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE,
1443 		NULL, overlay_target_disassociate,
1444 		NULL, sizeof (overlay_targ_id_t)	},
1445 	{ OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE,
1446 		NULL, overlay_target_degrade,
1447 		NULL, sizeof (overlay_targ_degrade_t)	},
1448 	{ OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE,
1449 		NULL, overlay_target_restore,
1450 		NULL, sizeof (overlay_targ_id_t)	},
1451 	{ OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE,
1452 		NULL, overlay_target_lookup_request,
1453 		NULL, sizeof (overlay_targ_lookup_t)	},
1454 	{ OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE,
1455 		NULL, overlay_target_lookup_respond,
1456 		NULL, sizeof (overlay_targ_resp_t)	},
1457 	{ OVERLAY_TARG_DROP, B_TRUE, B_FALSE,
1458 		NULL, overlay_target_lookup_drop,
1459 		NULL, sizeof (overlay_targ_resp_t)	},
1460 	{ OVERLAY_TARG_PKT, B_TRUE, B_TRUE,
1461 		overlay_target_pkt_copyin,
1462 		overlay_target_packet,
1463 		overlay_target_pkt_copyout,
1464 		sizeof (overlay_targ_pkt_t)		},
1465 	{ OVERLAY_TARG_INJECT, B_TRUE, B_FALSE,
1466 		overlay_target_pkt_copyin,
1467 		overlay_target_inject,
1468 		NULL, sizeof (overlay_targ_pkt_t)	},
1469 	{ OVERLAY_TARG_RESEND, B_TRUE, B_FALSE,
1470 		overlay_target_pkt_copyin,
1471 		overlay_target_resend,
1472 		NULL, sizeof (overlay_targ_pkt_t)	},
1473 	{ OVERLAY_TARG_LIST, B_FALSE, B_TRUE,
1474 		overlay_target_list_copyin,
1475 		overlay_target_ioctl_list,
1476 		overlay_target_list_copyout,
1477 		sizeof (overlay_targ_list_t)		},
1478 	{ OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE,
1479 		NULL, overlay_target_cache_get,
1480 		NULL, sizeof (overlay_targ_cache_t)	},
1481 	{ OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE,
1482 		NULL, overlay_target_cache_set,
1483 		NULL, sizeof (overlay_targ_cache_t)	},
1484 	{ OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE,
1485 		NULL, overlay_target_cache_remove,
1486 		NULL, sizeof (overlay_targ_cache_t)	},
1487 	{ OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE,
1488 		NULL, overlay_target_cache_flush,
1489 		NULL, sizeof (overlay_targ_cache_t)	},
1490 	{ OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE,
1491 		overlay_target_cache_iter_copyin,
1492 		overlay_target_cache_iter,
1493 		overlay_target_cache_iter_copyout,
1494 		sizeof (overlay_targ_cache_iter_t)		},
1495 	{ 0 }
1496 };
1497 
1498 int
1499 overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp)
1500 {
1501 	minor_t mid;
1502 	overlay_target_hdl_t *thdl;
1503 
1504 	if (secpolicy_dl_config(credp) != 0)
1505 		return (EPERM);
1506 
1507 	if (getminor(*devp) != 0)
1508 		return (ENXIO);
1509 
1510 	if (otype & OTYP_BLK)
1511 		return (EINVAL);
1512 
1513 	if (flags & ~(FREAD | FWRITE | FEXCL))
1514 		return (EINVAL);
1515 
1516 	if ((flags & FWRITE) &&
1517 	    !(flags & FEXCL))
1518 		return (EINVAL);
1519 
1520 	if (!(flags & FREAD) && !(flags & FWRITE))
1521 		return (EINVAL);
1522 
1523 	if (crgetzoneid(credp) != GLOBAL_ZONEID)
1524 		return (EPERM);
1525 
1526 	mid = id_alloc(overlay_thdl_idspace);
1527 	if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) {
1528 		id_free(overlay_thdl_idspace, mid);
1529 		return (ENXIO);
1530 	}
1531 
1532 	thdl = ddi_get_soft_state(overlay_thdl_state, mid);
1533 	VERIFY(thdl != NULL);
1534 	thdl->oth_minor = mid;
1535 	thdl->oth_zoneid = crgetzoneid(credp);
1536 	thdl->oth_oflags = flags;
1537 	mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL);
1538 	list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t),
1539 	    offsetof(overlay_target_entry_t, ote_qlink));
1540 	*devp = makedevice(getmajor(*devp), mid);
1541 
1542 	mutex_enter(&overlay_target_lock);
1543 	if ((flags & FEXCL) && overlay_target_excl == B_TRUE) {
1544 		mutex_exit(&overlay_target_lock);
1545 		list_destroy(&thdl->oth_outstanding);
1546 		mutex_destroy(&thdl->oth_lock);
1547 		ddi_soft_state_free(overlay_thdl_state, mid);
1548 		id_free(overlay_thdl_idspace, mid);
1549 		return (EEXIST);
1550 	} else if ((flags & FEXCL) != 0) {
1551 		VERIFY(overlay_target_excl == B_FALSE);
1552 		overlay_target_excl = B_TRUE;
1553 	}
1554 	list_insert_tail(&overlay_thdl_list, thdl);
1555 	mutex_exit(&overlay_target_lock);
1556 
1557 	return (0);
1558 }
1559 
1560 /* ARGSUSED */
1561 int
1562 overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1563     int *rvalp)
1564 {
1565 	overlay_target_ioctl_t *ioc;
1566 	overlay_target_hdl_t *thdl;
1567 
1568 	if (secpolicy_dl_config(credp) != 0)
1569 		return (EPERM);
1570 
1571 	if ((thdl = ddi_get_soft_state(overlay_thdl_state,
1572 	    getminor(dev))) == NULL)
1573 		return (ENXIO);
1574 
1575 	for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) {
1576 		int ret;
1577 		caddr_t buf;
1578 		size_t bufsize;
1579 
1580 		if (ioc->oti_cmd != cmd)
1581 			continue;
1582 
1583 		if (ioc->oti_write == B_TRUE && !(mode & FWRITE))
1584 			return (EBADF);
1585 
1586 		if (ioc->oti_copyin == NULL) {
1587 			bufsize = ioc->oti_size;
1588 			buf = kmem_alloc(bufsize, KM_SLEEP);
1589 			if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize,
1590 			    mode & FKIOCTL) != 0) {
1591 				kmem_free(buf, bufsize);
1592 				return (EFAULT);
1593 			}
1594 		} else {
1595 			if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg,
1596 			    (void **)&buf, &bufsize, mode)) != 0)
1597 				return (ret);
1598 		}
1599 
1600 		ret = ioc->oti_func(thdl, buf);
1601 		if (ret == 0 && ioc->oti_size != 0 &&
1602 		    ioc->oti_ncopyout == B_TRUE) {
1603 			if (ioc->oti_copyout == NULL) {
1604 				if (ddi_copyout(buf, (void *)(uintptr_t)arg,
1605 				    bufsize, mode & FKIOCTL) != 0)
1606 					ret = EFAULT;
1607 			} else {
1608 				ret = ioc->oti_copyout((void *)(uintptr_t)arg,
1609 				    buf, bufsize, mode);
1610 			}
1611 		}
1612 
1613 		kmem_free(buf, bufsize);
1614 		return (ret);
1615 	}
1616 
1617 	return (ENOTTY);
1618 }
1619 
1620 /* ARGSUSED */
1621 int
1622 overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp)
1623 {
1624 	overlay_target_hdl_t *thdl;
1625 	overlay_target_entry_t *entry;
1626 	minor_t mid = getminor(dev);
1627 
1628 	if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL)
1629 		return (ENXIO);
1630 
1631 	mutex_enter(&overlay_target_lock);
1632 	list_remove(&overlay_thdl_list, thdl);
1633 	mutex_enter(&thdl->oth_lock);
1634 	while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL)
1635 		list_insert_tail(&overlay_target_list, entry);
1636 	cv_signal(&overlay_target_condvar);
1637 	mutex_exit(&thdl->oth_lock);
1638 	if ((thdl->oth_oflags & FEXCL) != 0) {
1639 		VERIFY(overlay_target_excl == B_TRUE);
1640 		overlay_target_excl = B_FALSE;
1641 	}
1642 	mutex_exit(&overlay_target_lock);
1643 
1644 	list_destroy(&thdl->oth_outstanding);
1645 	mutex_destroy(&thdl->oth_lock);
1646 	mid = thdl->oth_minor;
1647 	ddi_soft_state_free(overlay_thdl_state, mid);
1648 	id_free(overlay_thdl_idspace, mid);
1649 
1650 	return (0);
1651 }
1652