1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2018 Joyent, Inc. 14 * Copyright 2022 MNX Cloud, Inc. 15 */ 16 17 /* 18 * VXLAN encapsulation module 19 * 20 * 21 * The VXLAN header looks as follows in network byte order: 22 * 23 * |0 3| 4 |5 31| 24 * +----------+---+------------------------+ 25 * | Reserved | I | Reserved | 26 * +---------------------------------------+ 27 * | Virtual Network ID | Reserved | 28 * +----------------------------+----------+ 29 * |0 23|24 31| 30 * 31 * All reserved values must be 0. The I bit must be 1. We call the top 32 * word the VXLAN magic field for the time being. The second word is 33 * definitely not the most friendly way to operate. Specifically, the ID 34 * is a 24-bit big endian value, but we have to make sure not to use the 35 * reserved byte. 36 * 37 * For us, VXLAN encapsulation is a fairly straightforward implementation. It 38 * only has two properties, a listen_ip and a listen_port. These determine on 39 * what address we should be listening on. While we do not have a default 40 * address to listen upon, we do have a default port, which is the IANA assigned 41 * port for VXLAN -- 4789. 42 */ 43 44 #include <sys/overlay_plugin.h> 45 #include <sys/modctl.h> 46 #include <sys/errno.h> 47 #include <sys/byteorder.h> 48 #include <sys/vxlan.h> 49 #include <inet/ip.h> 50 #include <netinet/in.h> 51 #include <sys/strsun.h> 52 #include <netinet/udp.h> 53 54 static const char *vxlan_ident = "vxlan"; 55 static uint16_t vxlan_defport = IPPORT_VXLAN; 56 57 /* 58 * Should we enable UDP source port hashing for fanout. 59 */ 60 boolean_t vxlan_fanout = B_TRUE; 61 62 /* 63 * This represents the size in bytes that we want to allocate when allocating a 64 * vxlan header block. This is intended such that lower levels can try and use 65 * the message block that we allocate for the IP and UPD header. The hope is 66 * that even if this is tunneled, that this is enough space. 67 * 68 * The vxlan_noalloc_min value represents the minimum amount of space we need to 69 * consider not allocating a message block and just passing it down the stack in 70 * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet 71 * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header. 72 */ 73 uint_t vxlan_alloc_size = 128; 74 uint_t vxlan_noalloc_min = 54; 75 76 static const char *vxlan_props[] = { 77 "vxlan/listen_ip", 78 "vxlan/listen_port", 79 NULL 80 }; 81 82 typedef struct vxlan { 83 kmutex_t vxl_lock; 84 overlay_handle_t vxl_oh; 85 uint16_t vxl_lport; 86 boolean_t vxl_hladdr; 87 struct in6_addr vxl_laddr; 88 } vxlan_t; 89 90 static int 91 vxlan_o_init(overlay_handle_t oh, void **outp) 92 { 93 vxlan_t *vxl; 94 95 vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP); 96 *outp = vxl; 97 mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL); 98 vxl->vxl_oh = oh; 99 vxl->vxl_lport = vxlan_defport; 100 vxl->vxl_hladdr = B_FALSE; 101 102 return (0); 103 } 104 105 static void 106 vxlan_o_fini(void *arg) 107 { 108 vxlan_t *vxl = arg; 109 110 mutex_destroy(&vxl->vxl_lock); 111 kmem_free(arg, sizeof (vxlan_t)); 112 } 113 114 static int 115 vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr, 116 socklen_t *slenp) 117 { 118 vxlan_t *vxl = arg; 119 struct sockaddr_in6 *in; 120 121 in = (struct sockaddr_in6 *)addr; 122 *dp = AF_INET6; 123 *fp = SOCK_DGRAM; 124 *pp = 0; 125 bzero(in, sizeof (struct sockaddr_in6)); 126 in->sin6_family = AF_INET6; 127 128 /* 129 * We should consider a more expressive private errno set that 130 * provider's can use. 131 */ 132 mutex_enter(&vxl->vxl_lock); 133 if (vxl->vxl_hladdr == B_FALSE) { 134 mutex_exit(&vxl->vxl_lock); 135 return (EINVAL); 136 } 137 in->sin6_port = htons(vxl->vxl_lport); 138 in->sin6_addr = vxl->vxl_laddr; 139 mutex_exit(&vxl->vxl_lock); 140 *slenp = sizeof (struct sockaddr_in6); 141 142 return (0); 143 } 144 145 static int 146 vxlan_o_sockopt(ksocket_t ksock) 147 { 148 int val, err; 149 if (vxlan_fanout == B_FALSE) 150 return (0); 151 152 val = UDP_HASH_VXLAN; 153 err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val, 154 sizeof (val), kcred); 155 return (err); 156 } 157 158 /* ARGSUSED */ 159 static int 160 vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop, 161 mblk_t **outp) 162 { 163 mblk_t *ob; 164 vxlan_hdr_t *vxh; 165 166 ASSERT(einfop->ovdi_id < (1 << 24)); 167 168 if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) { 169 /* 170 * This allocation could get hot. We may want to have a good 171 * way to cache and handle this allocation the same way that IP 172 * does with keeping around a message block per entry, or 173 * basically treating this as an immutable message block in the 174 * system. Basically freemsg() will be a nop, but we'll do the 175 * right thing with respect to the rest of the chain. 176 */ 177 ob = allocb(vxlan_alloc_size, 0); 178 if (ob == NULL) 179 return (ENOMEM); 180 181 ob->b_wptr = DB_LIM(ob); 182 ob->b_rptr = ob->b_wptr; 183 ob->b_cont = mp; 184 } else { 185 ob = mp; 186 } 187 ob->b_rptr -= VXLAN_HDR_LEN; 188 189 vxh = (vxlan_hdr_t *)ob->b_rptr; 190 vxh->vxlan_flags = ntohl(VXLAN_F_VDI); 191 vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT); 192 *outp = ob; 193 194 return (0); 195 } 196 197 /* ARGSUSED */ 198 static int 199 vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop) 200 { 201 vxlan_hdr_t *vxh; 202 203 if (MBLKL(mp) < sizeof (vxlan_hdr_t)) 204 return (EINVAL); 205 vxh = (vxlan_hdr_t *)mp->b_rptr; 206 if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0) 207 return (EINVAL); 208 209 dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT; 210 dinfop->ovdi_hdr_size = VXLAN_HDR_LEN; 211 212 return (0); 213 } 214 215 static int 216 vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize) 217 { 218 vxlan_t *vxl = arg; 219 220 /* vxlan/listen_ip */ 221 if (strcmp(pr_name, vxlan_props[0]) == 0) { 222 if (*bufsize < sizeof (struct in6_addr)) 223 return (EOVERFLOW); 224 225 mutex_enter(&vxl->vxl_lock); 226 if (vxl->vxl_hladdr == B_FALSE) { 227 *bufsize = 0; 228 } else { 229 bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr)); 230 *bufsize = sizeof (struct in6_addr); 231 } 232 mutex_exit(&vxl->vxl_lock); 233 return (0); 234 } 235 236 /* vxlan/listen_port */ 237 if (strcmp(pr_name, vxlan_props[1]) == 0) { 238 uint64_t val; 239 if (*bufsize < sizeof (uint64_t)) 240 return (EOVERFLOW); 241 242 mutex_enter(&vxl->vxl_lock); 243 val = vxl->vxl_lport; 244 bcopy(&val, buf, sizeof (uint64_t)); 245 *bufsize = sizeof (uint64_t); 246 mutex_exit(&vxl->vxl_lock); 247 return (0); 248 } 249 250 return (EINVAL); 251 } 252 253 static int 254 vxlan_o_setprop(void *arg, const char *pr_name, const void *buf, 255 uint32_t bufsize) 256 { 257 vxlan_t *vxl = arg; 258 259 /* vxlan/listen_ip */ 260 if (strcmp(pr_name, vxlan_props[0]) == 0) { 261 const struct in6_addr *ipv6 = buf; 262 if (bufsize != sizeof (struct in6_addr)) 263 return (EINVAL); 264 265 if (IN6_IS_ADDR_V4COMPAT(ipv6)) 266 return (EINVAL); 267 268 if (IN6_IS_ADDR_MULTICAST(ipv6)) 269 return (EINVAL); 270 271 if (IN6_IS_ADDR_6TO4(ipv6)) 272 return (EINVAL); 273 274 if (IN6_IS_ADDR_V4MAPPED(ipv6)) { 275 ipaddr_t v4; 276 IN6_V4MAPPED_TO_IPADDR(ipv6, v4); 277 if (IN_MULTICAST(ntohl(v4))) 278 return (EINVAL); 279 } 280 281 mutex_enter(&vxl->vxl_lock); 282 vxl->vxl_hladdr = B_TRUE; 283 bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr)); 284 mutex_exit(&vxl->vxl_lock); 285 286 return (0); 287 } 288 289 /* vxlan/listen_port */ 290 if (strcmp(pr_name, vxlan_props[1]) == 0) { 291 const uint64_t *valp = buf; 292 if (bufsize != 8) 293 return (EINVAL); 294 295 if (*valp == 0 || *valp > UINT16_MAX) 296 return (EINVAL); 297 298 mutex_enter(&vxl->vxl_lock); 299 vxl->vxl_lport = *valp; 300 mutex_exit(&vxl->vxl_lock); 301 return (0); 302 } 303 return (EINVAL); 304 } 305 306 static int 307 vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl) 308 { 309 /* vxlan/listen_ip */ 310 if (strcmp(pr_name, vxlan_props[0]) == 0) { 311 overlay_prop_set_name(phdl, vxlan_props[0]); 312 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); 313 overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP); 314 overlay_prop_set_nodefault(phdl); 315 return (0); 316 } 317 318 if (strcmp(pr_name, vxlan_props[1]) == 0) { 319 overlay_prop_set_name(phdl, vxlan_props[1]); 320 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); 321 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); 322 (void) overlay_prop_set_default(phdl, &vxlan_defport, 323 sizeof (vxlan_defport)); 324 overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX); 325 return (0); 326 } 327 328 return (EINVAL); 329 } 330 331 static struct overlay_plugin_ops vxlan_o_ops = { 332 0, 333 vxlan_o_init, 334 vxlan_o_fini, 335 vxlan_o_encap, 336 vxlan_o_decap, 337 vxlan_o_socket, 338 vxlan_o_sockopt, 339 vxlan_o_getprop, 340 vxlan_o_setprop, 341 vxlan_o_propinfo 342 }; 343 344 static struct modlmisc vxlan_modlmisc = { 345 &mod_miscops, 346 "VXLAN encap plugin" 347 }; 348 349 static struct modlinkage vxlan_modlinkage = { 350 MODREV_1, 351 &vxlan_modlmisc 352 }; 353 354 int 355 _init(void) 356 { 357 int err; 358 overlay_plugin_register_t *ovrp; 359 360 ovrp = overlay_plugin_alloc(OVEP_VERSION); 361 if (ovrp == NULL) 362 return (ENOTSUP); 363 ovrp->ovep_name = vxlan_ident; 364 ovrp->ovep_ops = &vxlan_o_ops; 365 ovrp->ovep_id_size = VXLAN_ID_LEN; 366 ovrp->ovep_flags = OVEP_F_VLAN_TAG; 367 ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT; 368 ovrp->ovep_props = vxlan_props; 369 370 if ((err = overlay_plugin_register(ovrp)) == 0) { 371 if ((err = mod_install(&vxlan_modlinkage)) != 0) { 372 (void) overlay_plugin_unregister(vxlan_ident); 373 } 374 } 375 376 overlay_plugin_free(ovrp); 377 return (err); 378 } 379 380 int 381 _info(struct modinfo *modinfop) 382 { 383 return (mod_info(&vxlan_modlinkage, modinfop)); 384 } 385 386 int 387 _fini(void) 388 { 389 int err; 390 391 if ((err = overlay_plugin_unregister(vxlan_ident)) != 0) 392 return (err); 393 394 return (mod_remove(&vxlan_modlinkage)); 395 } 396