1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019 Joyent, Inc. 14 */ 15 16 /* 17 * Overlay device ksocket multiplexer. 18 * 19 * For more information, see the big theory statement in 20 * uts/common/io/overlay/overlay.c 21 */ 22 23 #include <sys/types.h> 24 #include <sys/socket.h> 25 #include <sys/ksynch.h> 26 #include <sys/ksocket.h> 27 #include <sys/avl.h> 28 #include <sys/list.h> 29 #include <sys/pattr.h> 30 #include <sys/sysmacros.h> 31 #include <sys/strsubr.h> 32 #include <sys/strsun.h> 33 #include <sys/tihdr.h> 34 35 #include <sys/overlay_impl.h> 36 37 #include <sys/sdt.h> 38 39 #define OVERLAY_FREEMSG(mp, reason) \ 40 DTRACE_PROBE2(overlay__freemsg, mblk_t *, mp, char *, reason) 41 42 static list_t overlay_mux_list; 43 static kmutex_t overlay_mux_lock; 44 45 void 46 overlay_mux_init(void) 47 { 48 list_create(&overlay_mux_list, sizeof (overlay_mux_t), 49 offsetof(overlay_mux_t, omux_lnode)); 50 mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL); 51 } 52 53 void 54 overlay_mux_fini(void) 55 { 56 mutex_destroy(&overlay_mux_lock); 57 list_destroy(&overlay_mux_list); 58 } 59 60 static int 61 overlay_mux_comparator(const void *a, const void *b) 62 { 63 const overlay_dev_t *odl, *odr; 64 odl = a; 65 odr = b; 66 if (odl->odd_vid > odr->odd_vid) 67 return (1); 68 else if (odl->odd_vid < odr->odd_vid) 69 return (-1); 70 else 71 return (0); 72 } 73 74 /* 75 * This is the central receive data path. We need to decode the packet, if we 76 * can, and then deliver it to the appropriate overlay. 77 */ 78 /* ARGSUSED */ 79 static boolean_t 80 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, 81 void *arg) 82 { 83 mblk_t *mp, *nmp, *fmp; 84 overlay_mux_t *mux = arg; 85 86 /* 87 * We may have a received a chain of messages. Each message in the 88 * chain will likely have a T_unitdata_ind attached to it as an M_PROTO. 89 * If we aren't getting that, we should probably drop that for the 90 * moment. 91 */ 92 for (mp = mpchain; mp != NULL; mp = nmp) { 93 struct T_unitdata_ind *tudi; 94 ovep_encap_info_t infop; 95 overlay_dev_t od, *odd; 96 int ret; 97 98 nmp = mp->b_next; 99 mp->b_next = NULL; 100 101 if (DB_TYPE(mp) != M_PROTO) { 102 OVERLAY_FREEMSG(mp, "first one isn't M_PROTO"); 103 freemsg(mp); 104 continue; 105 } 106 107 if (mp->b_cont == NULL) { 108 OVERLAY_FREEMSG(mp, "missing a b_cont"); 109 freemsg(mp); 110 continue; 111 } 112 113 tudi = (struct T_unitdata_ind *)mp->b_rptr; 114 if (tudi->PRIM_type != T_UNITDATA_IND) { 115 OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *"); 116 freemsg(mp); 117 continue; 118 } 119 120 /* 121 * In the future, we'll care about the source information 122 * for purposes of telling varpd for oob invalidation. But for 123 * now, just drop that block. 124 */ 125 fmp = mp; 126 mp = fmp->b_cont; 127 freeb(fmp); 128 129 /* 130 * Until we have VXLAN-or-other-decap HW acceleration support 131 * (e.g. we support NICs that reach into VXLAN-encapsulated 132 * packets and check the inside-VXLAN IP packets' checksums, 133 * or do LSO with VXLAN), we should clear any HW-accelerated- 134 * performed bits. 135 */ 136 DB_CKSUMFLAGS(mp) = 0; 137 138 /* 139 * Decap and deliver. 140 */ 141 bzero(&infop, sizeof (ovep_encap_info_t)); 142 ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop); 143 if (ret != 0) { 144 OVERLAY_FREEMSG(mp, "decap failed"); 145 freemsg(mp); 146 continue; 147 } 148 if (MBLKL(mp) > infop.ovdi_hdr_size) { 149 mp->b_rptr += infop.ovdi_hdr_size; 150 } else { 151 while (infop.ovdi_hdr_size != 0) { 152 size_t rem, blkl; 153 154 if (mp == NULL) 155 break; 156 157 blkl = MBLKL(mp); 158 rem = MIN(infop.ovdi_hdr_size, blkl); 159 infop.ovdi_hdr_size -= rem; 160 mp->b_rptr += rem; 161 if (rem == blkl) { 162 fmp = mp; 163 mp = fmp->b_cont; 164 fmp->b_cont = NULL; 165 OVERLAY_FREEMSG(mp, 166 "freed a fmp block"); 167 freemsg(fmp); 168 } 169 } 170 if (mp == NULL) { 171 OVERLAY_FREEMSG(mp, "freed it all..."); 172 continue; 173 } 174 } 175 176 177 od.odd_vid = infop.ovdi_id; 178 mutex_enter(&mux->omux_lock); 179 odd = avl_find(&mux->omux_devices, &od, NULL); 180 if (odd == NULL) { 181 mutex_exit(&mux->omux_lock); 182 OVERLAY_FREEMSG(mp, "no matching vid"); 183 freemsg(mp); 184 continue; 185 } 186 mutex_enter(&odd->odd_lock); 187 if ((odd->odd_flags & OVERLAY_F_MDDROP) || 188 !(odd->odd_flags & OVERLAY_F_IN_MUX)) { 189 mutex_exit(&odd->odd_lock); 190 mutex_exit(&mux->omux_lock); 191 OVERLAY_FREEMSG(mp, "dev dropped"); 192 freemsg(mp); 193 continue; 194 } 195 overlay_io_start(odd, OVERLAY_F_IN_RX); 196 mutex_exit(&odd->odd_lock); 197 mutex_exit(&mux->omux_lock); 198 199 mac_rx(odd->odd_mh, NULL, mp); 200 201 mutex_enter(&odd->odd_lock); 202 overlay_io_done(odd, OVERLAY_F_IN_RX); 203 mutex_exit(&odd->odd_lock); 204 } 205 206 return (B_TRUE); 207 } 208 209 /* 210 * Register a given device with a socket backend. If no such device socket 211 * exists, create a new one. 212 */ 213 overlay_mux_t * 214 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol, 215 struct sockaddr *addr, socklen_t len, int *errp) 216 { 217 int err; 218 overlay_mux_t *mux; 219 ksocket_t ksock; 220 221 if (errp == NULL) 222 errp = &err; 223 224 mutex_enter(&overlay_mux_lock); 225 for (mux = list_head(&overlay_mux_list); mux != NULL; 226 mux = list_next(&overlay_mux_list, mux)) { 227 if (domain == mux->omux_domain && 228 family == mux->omux_family && 229 protocol == mux->omux_protocol && 230 len == mux->omux_alen && 231 bcmp(addr, mux->omux_addr, len) == 0) { 232 233 if (opp != mux->omux_plugin) { 234 *errp = EEXIST; 235 return (NULL); 236 } 237 238 mutex_enter(&mux->omux_lock); 239 mux->omux_count++; 240 mutex_exit(&mux->omux_lock); 241 mutex_exit(&overlay_mux_lock); 242 *errp = 0; 243 return (mux); 244 } 245 } 246 247 /* 248 * Today we aren't zone-aware and only exist in the global zone. When we 249 * allow for things to exist in the non-global zone, we'll want to use a 250 * credential that's actually specific to the zone. 251 */ 252 *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP, 253 kcred); 254 if (*errp != 0) { 255 mutex_exit(&overlay_mux_lock); 256 return (NULL); 257 } 258 259 *errp = ksocket_bind(ksock, addr, len, kcred); 260 if (*errp != 0) { 261 mutex_exit(&overlay_mux_lock); 262 ksocket_close(ksock, kcred); 263 return (NULL); 264 } 265 266 /* 267 * Ask our lower layer to optionally toggle anything they need on this 268 * socket. Because a socket is owned by a single type of plugin, we can 269 * then ask it to perform any additional socket set up it'd like to do. 270 */ 271 if (opp->ovp_ops->ovpo_sockopt != NULL && 272 (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) { 273 mutex_exit(&overlay_mux_lock); 274 ksocket_close(ksock, kcred); 275 return (NULL); 276 } 277 278 mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP); 279 list_link_init(&mux->omux_lnode); 280 mux->omux_ksock = ksock; 281 mux->omux_plugin = opp; 282 mux->omux_domain = domain; 283 mux->omux_family = family; 284 mux->omux_protocol = protocol; 285 mux->omux_addr = kmem_alloc(len, KM_SLEEP); 286 bcopy(addr, mux->omux_addr, len); 287 mux->omux_alen = len; 288 mux->omux_count = 1; 289 avl_create(&mux->omux_devices, overlay_mux_comparator, 290 sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode)); 291 mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL); 292 293 294 /* Once this is called, we need to expect to rx data */ 295 *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux); 296 if (*errp != 0) { 297 ksocket_close(ksock, kcred); 298 mutex_destroy(&mux->omux_lock); 299 avl_destroy(&mux->omux_devices); 300 kmem_free(mux->omux_addr, len); 301 kmem_free(mux, sizeof (overlay_mux_t)); 302 return (NULL); 303 } 304 305 list_insert_tail(&overlay_mux_list, mux); 306 mutex_exit(&overlay_mux_lock); 307 308 *errp = 0; 309 return (mux); 310 } 311 312 void 313 overlay_mux_close(overlay_mux_t *mux) 314 { 315 mutex_enter(&overlay_mux_lock); 316 mutex_enter(&mux->omux_lock); 317 mux->omux_count--; 318 if (mux->omux_count != 0) { 319 mutex_exit(&mux->omux_lock); 320 mutex_exit(&overlay_mux_lock); 321 return; 322 } 323 list_remove(&overlay_mux_list, mux); 324 mutex_exit(&mux->omux_lock); 325 mutex_exit(&overlay_mux_lock); 326 327 ksocket_close(mux->omux_ksock, kcred); 328 avl_destroy(&mux->omux_devices); 329 kmem_free(mux->omux_addr, mux->omux_alen); 330 kmem_free(mux, sizeof (overlay_mux_t)); 331 } 332 333 void 334 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd) 335 { 336 mutex_enter(&mux->omux_lock); 337 avl_add(&mux->omux_devices, odd); 338 mutex_exit(&mux->omux_lock); 339 } 340 341 void 342 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd) 343 { 344 mutex_enter(&mux->omux_lock); 345 avl_remove(&mux->omux_devices, odd); 346 mutex_exit(&mux->omux_lock); 347 } 348 349 int 350 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp) 351 { 352 int ret; 353 354 /* 355 * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately, 356 * that isn't actually supported by UDP at this time. 357 */ 358 ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred); 359 if (ret != 0) 360 freemsg(mp); 361 362 return (ret); 363 } 364