1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2019 Joyent, Inc.
14 */
15
16 /*
17 * Overlay device ksocket multiplexer.
18 *
19 * For more information, see the big theory statement in
20 * uts/common/io/overlay/overlay.c
21 */
22
23 #include <sys/types.h>
24 #include <sys/socket.h>
25 #include <sys/ksynch.h>
26 #include <sys/ksocket.h>
27 #include <sys/avl.h>
28 #include <sys/list.h>
29 #include <sys/pattr.h>
30 #include <sys/sysmacros.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/tihdr.h>
34
35 #include <sys/overlay_impl.h>
36
37 #include <sys/sdt.h>
38
/*
 * overlay_mux_list holds every open overlay_mux_t, linked through its
 * omux_lnode member. overlay_mux_lock is held while the list is searched,
 * appended to or has an entry removed.
 */
static list_t overlay_mux_list;
static kmutex_t overlay_mux_lock;
41
42 void
overlay_mux_init(void)43 overlay_mux_init(void)
44 {
45 list_create(&overlay_mux_list, sizeof (overlay_mux_t),
46 offsetof(overlay_mux_t, omux_lnode));
47 mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
48 }
49
50 void
overlay_mux_fini(void)51 overlay_mux_fini(void)
52 {
53 mutex_destroy(&overlay_mux_lock);
54 list_destroy(&overlay_mux_list);
55 }
56
57 static int
overlay_mux_comparator(const void * a,const void * b)58 overlay_mux_comparator(const void *a, const void *b)
59 {
60 const overlay_dev_t *odl, *odr;
61 odl = a;
62 odr = b;
63 if (odl->odd_vid > odr->odd_vid)
64 return (1);
65 else if (odl->odd_vid < odr->odd_vid)
66 return (-1);
67 else
68 return (0);
69 }
70
71 /*
72 * This is the central receive data path. We need to decode the packet, if we
73 * can, and then deliver it to the appropriate overlay.
74 */
75 /* ARGSUSED */
76 static boolean_t
overlay_mux_recv(ksocket_t ks,mblk_t * mpchain,size_t msgsize,int oob,void * arg)77 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
78 void *arg)
79 {
80 mblk_t *mp, *nmp, *fmp;
81 overlay_mux_t *mux = arg;
82
83 /*
84 * We may have a received a chain of messages. Each message in the
85 * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
86 * If we aren't getting that, we should probably drop that for the
87 * moment.
88 */
89 for (mp = mpchain; mp != NULL; mp = nmp) {
90 struct T_unitdata_ind *tudi;
91 ovep_encap_info_t infop;
92 overlay_dev_t od, *odd;
93 int ret;
94
95 nmp = mp->b_next;
96 mp->b_next = NULL;
97
98 if (DB_TYPE(mp) != M_PROTO) {
99 OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
100 freemsg(mp);
101 continue;
102 }
103
104 if (mp->b_cont == NULL) {
105 OVERLAY_FREEMSG(mp, "missing a b_cont");
106 freemsg(mp);
107 continue;
108 }
109
110 tudi = (struct T_unitdata_ind *)mp->b_rptr;
111 if (tudi->PRIM_type != T_UNITDATA_IND) {
112 OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
113 freemsg(mp);
114 continue;
115 }
116
117 /*
118 * In the future, we'll care about the source information
119 * for purposes of telling varpd for oob invalidation. But for
120 * now, just drop that block.
121 */
122 fmp = mp;
123 mp = fmp->b_cont;
124 freeb(fmp);
125
126 /*
127 * Until we have VXLAN-or-other-decap HW acceleration support
128 * (e.g. we support NICs that reach into VXLAN-encapsulated
129 * packets and check the inside-VXLAN IP packets' checksums,
130 * or do LSO with VXLAN), we should clear any HW-accelerated-
131 * performed bits.
132 */
133 DB_CKSUMFLAGS(mp) = 0;
134
135 /*
136 * Decap and deliver.
137 */
138 bzero(&infop, sizeof (ovep_encap_info_t));
139 ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
140 if (ret != 0) {
141 OVERLAY_FREEMSG(mp, "decap failed");
142 freemsg(mp);
143 continue;
144 }
145 if (MBLKL(mp) > infop.ovdi_hdr_size) {
146 mp->b_rptr += infop.ovdi_hdr_size;
147 } else {
148 while (infop.ovdi_hdr_size != 0) {
149 size_t rem, blkl;
150
151 if (mp == NULL)
152 break;
153
154 blkl = MBLKL(mp);
155 rem = MIN(infop.ovdi_hdr_size, blkl);
156 infop.ovdi_hdr_size -= rem;
157 mp->b_rptr += rem;
158 if (rem == blkl) {
159 fmp = mp;
160 mp = fmp->b_cont;
161 fmp->b_cont = NULL;
162 OVERLAY_FREEMSG(mp,
163 "freed a fmp block");
164 freemsg(fmp);
165 }
166 }
167 if (mp == NULL) {
168 OVERLAY_FREEMSG(mp, "freed it all...");
169 continue;
170 }
171 }
172
173
174 od.odd_vid = infop.ovdi_id;
175 mutex_enter(&mux->omux_lock);
176 odd = avl_find(&mux->omux_devices, &od, NULL);
177 if (odd == NULL) {
178 mutex_exit(&mux->omux_lock);
179 OVERLAY_FREEMSG(mp, "no matching vid");
180 freemsg(mp);
181 continue;
182 }
183 mutex_enter(&odd->odd_lock);
184 if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
185 !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
186 mutex_exit(&odd->odd_lock);
187 mutex_exit(&mux->omux_lock);
188 OVERLAY_FREEMSG(mp, "dev dropped");
189 freemsg(mp);
190 continue;
191 }
192 overlay_io_start(odd, OVERLAY_F_IN_RX);
193 mutex_exit(&odd->odd_lock);
194 mutex_exit(&mux->omux_lock);
195
196 mac_rx(odd->odd_mh, NULL, mp);
197
198 mutex_enter(&odd->odd_lock);
199 overlay_io_done(odd, OVERLAY_F_IN_RX);
200 mutex_exit(&odd->odd_lock);
201 }
202
203 return (B_TRUE);
204 }
205
206 /*
207 * Register a given device with a socket backend. If no such device socket
208 * exists, create a new one.
209 */
210 overlay_mux_t *
overlay_mux_open(overlay_plugin_t * opp,int domain,int family,int protocol,struct sockaddr * addr,socklen_t len,int * errp)211 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
212 struct sockaddr *addr, socklen_t len, int *errp)
213 {
214 int err;
215 overlay_mux_t *mux;
216 ksocket_t ksock;
217
218 if (errp == NULL)
219 errp = &err;
220
221 mutex_enter(&overlay_mux_lock);
222 for (mux = list_head(&overlay_mux_list); mux != NULL;
223 mux = list_next(&overlay_mux_list, mux)) {
224 if (domain == mux->omux_domain &&
225 family == mux->omux_family &&
226 protocol == mux->omux_protocol &&
227 len == mux->omux_alen &&
228 bcmp(addr, mux->omux_addr, len) == 0) {
229
230 if (opp != mux->omux_plugin) {
231 *errp = EEXIST;
232 return (NULL);
233 }
234
235 mutex_enter(&mux->omux_lock);
236 mux->omux_count++;
237 mutex_exit(&mux->omux_lock);
238 mutex_exit(&overlay_mux_lock);
239 *errp = 0;
240 return (mux);
241 }
242 }
243
244 /*
245 * Today we aren't zone-aware and only exist in the global zone. When we
246 * allow for things to exist in the non-global zone, we'll want to use a
247 * credential that's actually specific to the zone.
248 */
249 *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
250 kcred);
251 if (*errp != 0) {
252 mutex_exit(&overlay_mux_lock);
253 return (NULL);
254 }
255
256 *errp = ksocket_bind(ksock, addr, len, kcred);
257 if (*errp != 0) {
258 mutex_exit(&overlay_mux_lock);
259 ksocket_close(ksock, kcred);
260 return (NULL);
261 }
262
263 /*
264 * Ask our lower layer to optionally toggle anything they need on this
265 * socket. Because a socket is owned by a single type of plugin, we can
266 * then ask it to perform any additional socket set up it'd like to do.
267 */
268 if (opp->ovp_ops->ovpo_sockopt != NULL &&
269 (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
270 mutex_exit(&overlay_mux_lock);
271 ksocket_close(ksock, kcred);
272 return (NULL);
273 }
274
275 mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
276 list_link_init(&mux->omux_lnode);
277 mux->omux_ksock = ksock;
278 mux->omux_plugin = opp;
279 mux->omux_domain = domain;
280 mux->omux_family = family;
281 mux->omux_protocol = protocol;
282 mux->omux_addr = kmem_alloc(len, KM_SLEEP);
283 bcopy(addr, mux->omux_addr, len);
284 mux->omux_alen = len;
285 mux->omux_count = 1;
286 avl_create(&mux->omux_devices, overlay_mux_comparator,
287 sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
288 mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
289
290
291 /* Once this is called, we need to expect to rx data */
292 *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
293 if (*errp != 0) {
294 ksocket_close(ksock, kcred);
295 mutex_destroy(&mux->omux_lock);
296 avl_destroy(&mux->omux_devices);
297 kmem_free(mux->omux_addr, len);
298 kmem_free(mux, sizeof (overlay_mux_t));
299 return (NULL);
300 }
301
302 list_insert_tail(&overlay_mux_list, mux);
303 mutex_exit(&overlay_mux_lock);
304
305 *errp = 0;
306 return (mux);
307 }
308
309 void
overlay_mux_close(overlay_mux_t * mux)310 overlay_mux_close(overlay_mux_t *mux)
311 {
312 mutex_enter(&overlay_mux_lock);
313 mutex_enter(&mux->omux_lock);
314 mux->omux_count--;
315 if (mux->omux_count != 0) {
316 mutex_exit(&mux->omux_lock);
317 mutex_exit(&overlay_mux_lock);
318 return;
319 }
320 list_remove(&overlay_mux_list, mux);
321 mutex_exit(&mux->omux_lock);
322 mutex_exit(&overlay_mux_lock);
323
324 ksocket_close(mux->omux_ksock, kcred);
325 avl_destroy(&mux->omux_devices);
326 kmem_free(mux->omux_addr, mux->omux_alen);
327 kmem_free(mux, sizeof (overlay_mux_t));
328 }
329
330 void
overlay_mux_add_dev(overlay_mux_t * mux,overlay_dev_t * odd)331 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
332 {
333 mutex_enter(&mux->omux_lock);
334 avl_add(&mux->omux_devices, odd);
335 mutex_exit(&mux->omux_lock);
336 }
337
338 void
overlay_mux_remove_dev(overlay_mux_t * mux,overlay_dev_t * odd)339 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
340 {
341 mutex_enter(&mux->omux_lock);
342 avl_remove(&mux->omux_devices, odd);
343 mutex_exit(&mux->omux_lock);
344 }
345
346 int
overlay_mux_tx(overlay_mux_t * mux,struct msghdr * hdr,mblk_t * mp)347 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
348 {
349 int ret;
350
351 /*
352 * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
353 * that isn't actually supported by UDP at this time.
354 */
355 ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
356 if (ret != 0)
357 freemsg(mp);
358
359 return (ret);
360 }
361