xref: /illumos-gate/usr/src/uts/common/io/overlay/overlay_mux.c (revision 43f863f959a7ec8a6ee3645d33997561ff808c39)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  */
15 
16 /*
17  * Overlay device ksocket multiplexer.
18  *
19  * For more information, see the big theory statement in
20  * uts/common/io/overlay/overlay.c
21  */
22 
23 #include <sys/types.h>
24 #include <sys/socket.h>
25 #include <sys/ksynch.h>
26 #include <sys/ksocket.h>
27 #include <sys/avl.h>
28 #include <sys/list.h>
29 #include <sys/pattr.h>
30 #include <sys/sysmacros.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/tihdr.h>
34 
35 #include <sys/overlay_impl.h>
36 
37 #include <sys/sdt.h>
38 
/*
 * Fire the overlay__freemsg SDT probe, recording the message in question and
 * a human-readable reason string. This macro only traces; it does not free
 * anything itself.
 */
#define	OVERLAY_FREEMSG(mp, reason) \
    DTRACE_PROBE2(overlay__freemsg, mblk_t *, mp, char *, reason)

/*
 * overlay_mux_list holds the overlay_mux_t structures, linked through their
 * omux_lnode member. overlay_mux_lock is held by overlay_mux_open() and
 * overlay_mux_close() while they walk or modify that list.
 */
static list_t overlay_mux_list;
static kmutex_t overlay_mux_lock;
44 
45 void
46 overlay_mux_init(void)
47 {
48 	list_create(&overlay_mux_list, sizeof (overlay_mux_t),
49 	    offsetof(overlay_mux_t, omux_lnode));
50 	mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
51 }
52 
53 void
54 overlay_mux_fini(void)
55 {
56 	mutex_destroy(&overlay_mux_lock);
57 	list_destroy(&overlay_mux_list);
58 }
59 
60 static int
61 overlay_mux_comparator(const void *a, const void *b)
62 {
63 	const overlay_dev_t *odl, *odr;
64 	odl = a;
65 	odr = b;
66 	if (odl->odd_vid > odr->odd_vid)
67 		return (1);
68 	else if (odl->odd_vid < odr->odd_vid)
69 		return (-1);
70 	else
71 		return (0);
72 }
73 
74 /*
75  * This is the central receive data path. We need to decode the packet, if we
76  * can, and then deliver it to the appropriate overlay.
77  */
78 /* ARGSUSED */
79 static boolean_t
80 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
81     void *arg)
82 {
83 	mblk_t *mp, *nmp, *fmp;
84 	overlay_mux_t *mux = arg;
85 
86 	/*
87 	 * We may have a received a chain of messages. Each message in the
88 	 * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
89 	 * If we aren't getting that, we should probably drop that for the
90 	 * moment.
91 	 */
92 	for (mp = mpchain; mp != NULL; mp = nmp) {
93 		struct T_unitdata_ind *tudi;
94 		ovep_encap_info_t infop;
95 		overlay_dev_t od, *odd;
96 		int ret;
97 
98 		nmp = mp->b_next;
99 		mp->b_next = NULL;
100 
101 		if (DB_TYPE(mp) != M_PROTO) {
102 			OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
103 			freemsg(mp);
104 			continue;
105 		}
106 
107 		if (mp->b_cont == NULL) {
108 			OVERLAY_FREEMSG(mp, "missing a b_cont");
109 			freemsg(mp);
110 			continue;
111 		}
112 
113 		tudi = (struct T_unitdata_ind *)mp->b_rptr;
114 		if (tudi->PRIM_type != T_UNITDATA_IND) {
115 			OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
116 			freemsg(mp);
117 			continue;
118 		}
119 
120 		/*
121 		 * In the future, we'll care about the source information
122 		 * for purposes of telling varpd for oob invalidation. But for
123 		 * now, just drop that block.
124 		 */
125 		fmp = mp;
126 		mp = fmp->b_cont;
127 		freeb(fmp);
128 
129 		/*
130 		 * Until we have VXLAN-or-other-decap HW acceleration support
131 		 * (e.g.  we support NICs that reach into VXLAN-encapsulated
132 		 * packets and check the inside-VXLAN IP packets' checksums,
133 		 * or do LSO with VXLAN), we should clear any HW-accelerated-
134 		 * performed bits.
135 		 */
136 		DB_CKSUMFLAGS(mp) = 0;
137 
138 		/*
139 		 * Decap and deliver.
140 		 */
141 		bzero(&infop, sizeof (ovep_encap_info_t));
142 		ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
143 		if (ret != 0) {
144 			OVERLAY_FREEMSG(mp, "decap failed");
145 			freemsg(mp);
146 			continue;
147 		}
148 		if (MBLKL(mp) > infop.ovdi_hdr_size) {
149 			mp->b_rptr += infop.ovdi_hdr_size;
150 		} else {
151 			while (infop.ovdi_hdr_size != 0) {
152 				size_t rem, blkl;
153 
154 				if (mp == NULL)
155 					break;
156 
157 				blkl = MBLKL(mp);
158 				rem = MIN(infop.ovdi_hdr_size, blkl);
159 				infop.ovdi_hdr_size -= rem;
160 				mp->b_rptr += rem;
161 				if (rem == blkl) {
162 					fmp = mp;
163 					mp = fmp->b_cont;
164 					fmp->b_cont = NULL;
165 					OVERLAY_FREEMSG(mp,
166 					    "freed a fmp block");
167 					freemsg(fmp);
168 				}
169 			}
170 			if (mp == NULL) {
171 				OVERLAY_FREEMSG(mp, "freed it all...");
172 				continue;
173 			}
174 		}
175 
176 
177 		od.odd_vid = infop.ovdi_id;
178 		mutex_enter(&mux->omux_lock);
179 		odd = avl_find(&mux->omux_devices, &od, NULL);
180 		if (odd == NULL) {
181 			mutex_exit(&mux->omux_lock);
182 			OVERLAY_FREEMSG(mp, "no matching vid");
183 			freemsg(mp);
184 			continue;
185 		}
186 		mutex_enter(&odd->odd_lock);
187 		if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
188 		    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
189 			mutex_exit(&odd->odd_lock);
190 			mutex_exit(&mux->omux_lock);
191 			OVERLAY_FREEMSG(mp, "dev dropped");
192 			freemsg(mp);
193 			continue;
194 		}
195 		overlay_io_start(odd, OVERLAY_F_IN_RX);
196 		mutex_exit(&odd->odd_lock);
197 		mutex_exit(&mux->omux_lock);
198 
199 		mac_rx(odd->odd_mh, NULL, mp);
200 
201 		mutex_enter(&odd->odd_lock);
202 		overlay_io_done(odd, OVERLAY_F_IN_RX);
203 		mutex_exit(&odd->odd_lock);
204 	}
205 
206 	return (B_TRUE);
207 }
208 
209 /*
210  * Register a given device with a socket backend. If no such device socket
211  * exists, create a new one.
212  */
213 overlay_mux_t *
214 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
215     struct sockaddr *addr, socklen_t len, int *errp)
216 {
217 	int err;
218 	overlay_mux_t *mux;
219 	ksocket_t ksock;
220 
221 	if (errp == NULL)
222 		errp = &err;
223 
224 	mutex_enter(&overlay_mux_lock);
225 	for (mux = list_head(&overlay_mux_list); mux != NULL;
226 	    mux = list_next(&overlay_mux_list, mux)) {
227 		if (domain == mux->omux_domain &&
228 		    family == mux->omux_family &&
229 		    protocol == mux->omux_protocol &&
230 		    len == mux->omux_alen &&
231 		    bcmp(addr, mux->omux_addr, len) == 0) {
232 
233 			if (opp != mux->omux_plugin) {
234 				*errp = EEXIST;
235 				return (NULL);
236 			}
237 
238 			mutex_enter(&mux->omux_lock);
239 			mux->omux_count++;
240 			mutex_exit(&mux->omux_lock);
241 			mutex_exit(&overlay_mux_lock);
242 			*errp = 0;
243 			return (mux);
244 		}
245 	}
246 
247 	/*
248 	 * Today we aren't zone-aware and only exist in the global zone. When we
249 	 * allow for things to exist in the non-global zone, we'll want to use a
250 	 * credential that's actually specific to the zone.
251 	 */
252 	*errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
253 	    kcred);
254 	if (*errp != 0) {
255 		mutex_exit(&overlay_mux_lock);
256 		return (NULL);
257 	}
258 
259 	*errp = ksocket_bind(ksock, addr, len, kcred);
260 	if (*errp != 0) {
261 		mutex_exit(&overlay_mux_lock);
262 		ksocket_close(ksock, kcred);
263 		return (NULL);
264 	}
265 
266 	/*
267 	 * Ask our lower layer to optionally toggle anything they need on this
268 	 * socket. Because a socket is owned by a single type of plugin, we can
269 	 * then ask it to perform any additional socket set up it'd like to do.
270 	 */
271 	if (opp->ovp_ops->ovpo_sockopt != NULL &&
272 	    (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
273 		mutex_exit(&overlay_mux_lock);
274 		ksocket_close(ksock, kcred);
275 		return (NULL);
276 	}
277 
278 	mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
279 	list_link_init(&mux->omux_lnode);
280 	mux->omux_ksock = ksock;
281 	mux->omux_plugin = opp;
282 	mux->omux_domain = domain;
283 	mux->omux_family = family;
284 	mux->omux_protocol = protocol;
285 	mux->omux_addr = kmem_alloc(len, KM_SLEEP);
286 	bcopy(addr, mux->omux_addr, len);
287 	mux->omux_alen = len;
288 	mux->omux_count = 1;
289 	avl_create(&mux->omux_devices, overlay_mux_comparator,
290 	    sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
291 	mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
292 
293 
294 	/* Once this is called, we need to expect to rx data */
295 	*errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
296 	if (*errp != 0) {
297 		ksocket_close(ksock, kcred);
298 		mutex_destroy(&mux->omux_lock);
299 		avl_destroy(&mux->omux_devices);
300 		kmem_free(mux->omux_addr, len);
301 		kmem_free(mux, sizeof (overlay_mux_t));
302 		return (NULL);
303 	}
304 
305 	list_insert_tail(&overlay_mux_list, mux);
306 	mutex_exit(&overlay_mux_lock);
307 
308 	*errp = 0;
309 	return (mux);
310 }
311 
312 void
313 overlay_mux_close(overlay_mux_t *mux)
314 {
315 	mutex_enter(&overlay_mux_lock);
316 	mutex_enter(&mux->omux_lock);
317 	mux->omux_count--;
318 	if (mux->omux_count != 0) {
319 		mutex_exit(&mux->omux_lock);
320 		mutex_exit(&overlay_mux_lock);
321 		return;
322 	}
323 	list_remove(&overlay_mux_list, mux);
324 	mutex_exit(&mux->omux_lock);
325 	mutex_exit(&overlay_mux_lock);
326 
327 	ksocket_close(mux->omux_ksock, kcred);
328 	avl_destroy(&mux->omux_devices);
329 	kmem_free(mux->omux_addr, mux->omux_alen);
330 	kmem_free(mux, sizeof (overlay_mux_t));
331 }
332 
333 void
334 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
335 {
336 	mutex_enter(&mux->omux_lock);
337 	avl_add(&mux->omux_devices, odd);
338 	mutex_exit(&mux->omux_lock);
339 }
340 
341 void
342 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
343 {
344 	mutex_enter(&mux->omux_lock);
345 	avl_remove(&mux->omux_devices, odd);
346 	mutex_exit(&mux->omux_lock);
347 }
348 
349 int
350 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
351 {
352 	int ret;
353 
354 	/*
355 	 * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
356 	 * that isn't actually supported by UDP at this time.
357 	 */
358 	ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
359 	if (ret != 0)
360 		freemsg(mp);
361 
362 	return (ret);
363 }
364