1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019 Joyent, Inc. 14 */ 15 16 /* 17 * Overlay device ksocket multiplexer. 18 * 19 * For more information, see the big theory statement in 20 * uts/common/io/overlay/overlay.c 21 */ 22 23 #include <sys/types.h> 24 #include <sys/socket.h> 25 #include <sys/ksynch.h> 26 #include <sys/ksocket.h> 27 #include <sys/avl.h> 28 #include <sys/list.h> 29 #include <sys/pattr.h> 30 #include <sys/sysmacros.h> 31 #include <sys/strsubr.h> 32 #include <sys/strsun.h> 33 #include <sys/tihdr.h> 34 35 #include <sys/overlay_impl.h> 36 37 #include <sys/sdt.h> 38 39 static list_t overlay_mux_list; 40 static kmutex_t overlay_mux_lock; 41 42 void 43 overlay_mux_init(void) 44 { 45 list_create(&overlay_mux_list, sizeof (overlay_mux_t), 46 offsetof(overlay_mux_t, omux_lnode)); 47 mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL); 48 } 49 50 void 51 overlay_mux_fini(void) 52 { 53 mutex_destroy(&overlay_mux_lock); 54 list_destroy(&overlay_mux_list); 55 } 56 57 static int 58 overlay_mux_comparator(const void *a, const void *b) 59 { 60 const overlay_dev_t *odl, *odr; 61 odl = a; 62 odr = b; 63 if (odl->odd_vid > odr->odd_vid) 64 return (1); 65 else if (odl->odd_vid < odr->odd_vid) 66 return (-1); 67 else 68 return (0); 69 } 70 71 /* 72 * This is the central receive data path. We need to decode the packet, if we 73 * can, and then deliver it to the appropriate overlay. 74 */ 75 /* ARGSUSED */ 76 static boolean_t 77 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, 78 void *arg) 79 { 80 mblk_t *mp, *nmp, *fmp; 81 overlay_mux_t *mux = arg; 82 83 /* 84 * We may have a received a chain of messages. Each message in the 85 * chain will likely have a T_unitdata_ind attached to it as an M_PROTO. 86 * If we aren't getting that, we should probably drop that for the 87 * moment. 88 */ 89 for (mp = mpchain; mp != NULL; mp = nmp) { 90 struct T_unitdata_ind *tudi; 91 ovep_encap_info_t infop; 92 overlay_dev_t od, *odd; 93 int ret; 94 95 nmp = mp->b_next; 96 mp->b_next = NULL; 97 98 if (DB_TYPE(mp) != M_PROTO) { 99 OVERLAY_FREEMSG(mp, "first one isn't M_PROTO"); 100 freemsg(mp); 101 continue; 102 } 103 104 if (mp->b_cont == NULL) { 105 OVERLAY_FREEMSG(mp, "missing a b_cont"); 106 freemsg(mp); 107 continue; 108 } 109 110 tudi = (struct T_unitdata_ind *)mp->b_rptr; 111 if (tudi->PRIM_type != T_UNITDATA_IND) { 112 OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *"); 113 freemsg(mp); 114 continue; 115 } 116 117 /* 118 * In the future, we'll care about the source information 119 * for purposes of telling varpd for oob invalidation. But for 120 * now, just drop that block. 121 */ 122 fmp = mp; 123 mp = fmp->b_cont; 124 freeb(fmp); 125 126 /* 127 * Until we have VXLAN-or-other-decap HW acceleration support 128 * (e.g. we support NICs that reach into VXLAN-encapsulated 129 * packets and check the inside-VXLAN IP packets' checksums, 130 * or do LSO with VXLAN), we should clear any HW-accelerated- 131 * performed bits. 132 */ 133 DB_CKSUMFLAGS(mp) = 0; 134 135 /* 136 * Decap and deliver. 137 */ 138 bzero(&infop, sizeof (ovep_encap_info_t)); 139 ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop); 140 if (ret != 0) { 141 OVERLAY_FREEMSG(mp, "decap failed"); 142 freemsg(mp); 143 continue; 144 } 145 if (MBLKL(mp) > infop.ovdi_hdr_size) { 146 mp->b_rptr += infop.ovdi_hdr_size; 147 } else { 148 while (infop.ovdi_hdr_size != 0) { 149 size_t rem, blkl; 150 151 if (mp == NULL) 152 break; 153 154 blkl = MBLKL(mp); 155 rem = MIN(infop.ovdi_hdr_size, blkl); 156 infop.ovdi_hdr_size -= rem; 157 mp->b_rptr += rem; 158 if (rem == blkl) { 159 fmp = mp; 160 mp = fmp->b_cont; 161 fmp->b_cont = NULL; 162 OVERLAY_FREEMSG(mp, 163 "freed a fmp block"); 164 freemsg(fmp); 165 } 166 } 167 if (mp == NULL) { 168 OVERLAY_FREEMSG(mp, "freed it all..."); 169 continue; 170 } 171 } 172 173 174 od.odd_vid = infop.ovdi_id; 175 mutex_enter(&mux->omux_lock); 176 odd = avl_find(&mux->omux_devices, &od, NULL); 177 if (odd == NULL) { 178 mutex_exit(&mux->omux_lock); 179 OVERLAY_FREEMSG(mp, "no matching vid"); 180 freemsg(mp); 181 continue; 182 } 183 mutex_enter(&odd->odd_lock); 184 if ((odd->odd_flags & OVERLAY_F_MDDROP) || 185 !(odd->odd_flags & OVERLAY_F_IN_MUX)) { 186 mutex_exit(&odd->odd_lock); 187 mutex_exit(&mux->omux_lock); 188 OVERLAY_FREEMSG(mp, "dev dropped"); 189 freemsg(mp); 190 continue; 191 } 192 overlay_io_start(odd, OVERLAY_F_IN_RX); 193 mutex_exit(&odd->odd_lock); 194 mutex_exit(&mux->omux_lock); 195 196 mac_rx(odd->odd_mh, NULL, mp); 197 198 mutex_enter(&odd->odd_lock); 199 overlay_io_done(odd, OVERLAY_F_IN_RX); 200 mutex_exit(&odd->odd_lock); 201 } 202 203 return (B_TRUE); 204 } 205 206 /* 207 * Register a given device with a socket backend. If no such device socket 208 * exists, create a new one. 209 */ 210 overlay_mux_t * 211 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol, 212 struct sockaddr *addr, socklen_t len, int *errp) 213 { 214 int err; 215 overlay_mux_t *mux; 216 ksocket_t ksock; 217 218 if (errp == NULL) 219 errp = &err; 220 221 mutex_enter(&overlay_mux_lock); 222 for (mux = list_head(&overlay_mux_list); mux != NULL; 223 mux = list_next(&overlay_mux_list, mux)) { 224 if (domain == mux->omux_domain && 225 family == mux->omux_family && 226 protocol == mux->omux_protocol && 227 len == mux->omux_alen && 228 bcmp(addr, mux->omux_addr, len) == 0) { 229 230 if (opp != mux->omux_plugin) { 231 *errp = EEXIST; 232 return (NULL); 233 } 234 235 mutex_enter(&mux->omux_lock); 236 mux->omux_count++; 237 mutex_exit(&mux->omux_lock); 238 mutex_exit(&overlay_mux_lock); 239 *errp = 0; 240 return (mux); 241 } 242 } 243 244 /* 245 * Today we aren't zone-aware and only exist in the global zone. When we 246 * allow for things to exist in the non-global zone, we'll want to use a 247 * credential that's actually specific to the zone. 248 */ 249 *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP, 250 kcred); 251 if (*errp != 0) { 252 mutex_exit(&overlay_mux_lock); 253 return (NULL); 254 } 255 256 *errp = ksocket_bind(ksock, addr, len, kcred); 257 if (*errp != 0) { 258 mutex_exit(&overlay_mux_lock); 259 ksocket_close(ksock, kcred); 260 return (NULL); 261 } 262 263 /* 264 * Ask our lower layer to optionally toggle anything they need on this 265 * socket. Because a socket is owned by a single type of plugin, we can 266 * then ask it to perform any additional socket set up it'd like to do. 267 */ 268 if (opp->ovp_ops->ovpo_sockopt != NULL && 269 (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) { 270 mutex_exit(&overlay_mux_lock); 271 ksocket_close(ksock, kcred); 272 return (NULL); 273 } 274 275 mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP); 276 list_link_init(&mux->omux_lnode); 277 mux->omux_ksock = ksock; 278 mux->omux_plugin = opp; 279 mux->omux_domain = domain; 280 mux->omux_family = family; 281 mux->omux_protocol = protocol; 282 mux->omux_addr = kmem_alloc(len, KM_SLEEP); 283 bcopy(addr, mux->omux_addr, len); 284 mux->omux_alen = len; 285 mux->omux_count = 1; 286 avl_create(&mux->omux_devices, overlay_mux_comparator, 287 sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode)); 288 mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL); 289 290 291 /* Once this is called, we need to expect to rx data */ 292 *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux); 293 if (*errp != 0) { 294 ksocket_close(ksock, kcred); 295 mutex_destroy(&mux->omux_lock); 296 avl_destroy(&mux->omux_devices); 297 kmem_free(mux->omux_addr, len); 298 kmem_free(mux, sizeof (overlay_mux_t)); 299 return (NULL); 300 } 301 302 list_insert_tail(&overlay_mux_list, mux); 303 mutex_exit(&overlay_mux_lock); 304 305 *errp = 0; 306 return (mux); 307 } 308 309 void 310 overlay_mux_close(overlay_mux_t *mux) 311 { 312 mutex_enter(&overlay_mux_lock); 313 mutex_enter(&mux->omux_lock); 314 mux->omux_count--; 315 if (mux->omux_count != 0) { 316 mutex_exit(&mux->omux_lock); 317 mutex_exit(&overlay_mux_lock); 318 return; 319 } 320 list_remove(&overlay_mux_list, mux); 321 mutex_exit(&mux->omux_lock); 322 mutex_exit(&overlay_mux_lock); 323 324 ksocket_close(mux->omux_ksock, kcred); 325 avl_destroy(&mux->omux_devices); 326 kmem_free(mux->omux_addr, mux->omux_alen); 327 kmem_free(mux, sizeof (overlay_mux_t)); 328 } 329 330 void 331 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd) 332 { 333 mutex_enter(&mux->omux_lock); 334 avl_add(&mux->omux_devices, odd); 335 mutex_exit(&mux->omux_lock); 336 } 337 338 void 339 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd) 340 { 341 mutex_enter(&mux->omux_lock); 342 avl_remove(&mux->omux_devices, odd); 343 mutex_exit(&mux->omux_lock); 344 } 345 346 int 347 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp) 348 { 349 int ret; 350 351 /* 352 * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately, 353 * that isn't actually supported by UDP at this time. 354 */ 355 ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred); 356 if (ret != 0) 357 freemsg(mp); 358 359 return (ret); 360 } 361