1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2018 Joyent, Inc. 14 */ 15 16 /* 17 * VXLAN encapsulation module 18 * 19 * 20 * The VXLAN header looks as follows in network byte order: 21 * 22 * |0 3| 4 |5 31| 23 * +----------+---+------------------------+ 24 * | Reserved | I | Reserved | 25 * +---------------------------------------+ 26 * | Virtual Network ID | Reserved | 27 * +----------------------------+----------+ 28 * |0 23|24 31| 29 * 30 * All reserved values must be 0. The I bit must be 1. We call the top 31 * word the VXLAN magic field for the time being. The second word is 32 * definitely not the most friendly way to operate. Specifically, the ID 33 * is a 24-bit big endian value, but we have to make sure not to use the 34 * reserved byte. 35 * 36 * For us, VXLAN encapsulation is a fairly straightforward implementation. It 37 * only has two properties, a listen_ip and a listen_port. These determine on 38 * what address we should be listening on. While we do not have a default 39 * address to listen upon, we do have a default port, which is the IANA assigned 40 * port for VXLAN -- 4789. 41 */ 42 43 #include <sys/overlay_plugin.h> 44 #include <sys/modctl.h> 45 #include <sys/errno.h> 46 #include <sys/byteorder.h> 47 #include <sys/vxlan.h> 48 #include <inet/ip.h> 49 #include <netinet/in.h> 50 #include <sys/strsun.h> 51 #include <netinet/udp.h> 52 53 static const char *vxlan_ident = "vxlan"; 54 static uint16_t vxlan_defport = IPPORT_VXLAN; 55 56 /* 57 * Should we enable UDP source port hashing for fanout. 58 */ 59 boolean_t vxlan_fanout = B_TRUE; 60 61 /* 62 * This represents the size in bytes that we want to allocate when allocating a 63 * vxlan header block. This is intended such that lower levels can try and use 64 * the message block that we allocate for the IP and UPD header. The hope is 65 * that even if this is tunneled, that this is enough space. 66 * 67 * The vxlan_noalloc_min value represents the minimum amount of space we need to 68 * consider not allocating a message block and just passing it down the stack in 69 * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet 70 * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header. 71 */ 72 uint_t vxlan_alloc_size = 128; 73 uint_t vxlan_noalloc_min = 54; 74 75 static const char *vxlan_props[] = { 76 "vxlan/listen_ip", 77 "vxlan/listen_port", 78 NULL 79 }; 80 81 typedef struct vxlan { 82 kmutex_t vxl_lock; 83 overlay_handle_t vxl_oh; 84 uint16_t vxl_lport; 85 boolean_t vxl_hladdr; 86 struct in6_addr vxl_laddr; 87 } vxlan_t; 88 89 static int 90 vxlan_o_init(overlay_handle_t oh, void **outp) 91 { 92 vxlan_t *vxl; 93 94 vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP); 95 *outp = vxl; 96 mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL); 97 vxl->vxl_oh = oh; 98 vxl->vxl_lport = vxlan_defport; 99 vxl->vxl_hladdr = B_FALSE; 100 101 return (0); 102 } 103 104 static void 105 vxlan_o_fini(void *arg) 106 { 107 vxlan_t *vxl = arg; 108 109 mutex_destroy(&vxl->vxl_lock); 110 kmem_free(arg, sizeof (vxlan_t)); 111 } 112 113 static int 114 vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr, 115 socklen_t *slenp) 116 { 117 vxlan_t *vxl = arg; 118 struct sockaddr_in6 *in; 119 120 in = (struct sockaddr_in6 *)addr; 121 *dp = AF_INET6; 122 *fp = SOCK_DGRAM; 123 *pp = 0; 124 bzero(in, sizeof (struct sockaddr_in6)); 125 in->sin6_family = AF_INET6; 126 127 /* 128 * We should consider a more expressive private errno set that 129 * provider's can use. 130 */ 131 mutex_enter(&vxl->vxl_lock); 132 if (vxl->vxl_hladdr == B_FALSE) { 133 mutex_exit(&vxl->vxl_lock); 134 return (EINVAL); 135 } 136 in->sin6_port = htons(vxl->vxl_lport); 137 in->sin6_addr = vxl->vxl_laddr; 138 mutex_exit(&vxl->vxl_lock); 139 *slenp = sizeof (struct sockaddr_in6); 140 141 return (0); 142 } 143 144 static int 145 vxlan_o_sockopt(ksocket_t ksock) 146 { 147 int val, err; 148 if (vxlan_fanout == B_FALSE) 149 return (0); 150 151 val = UDP_HASH_VXLAN; 152 err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val, 153 sizeof (val), kcred); 154 return (err); 155 } 156 157 /* ARGSUSED */ 158 static int 159 vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop, 160 mblk_t **outp) 161 { 162 mblk_t *ob; 163 vxlan_hdr_t *vxh; 164 165 ASSERT(einfop->ovdi_id < (1 << 24)); 166 167 if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) { 168 /* 169 * This allocation could get hot. We may want to have a good 170 * way to cache and handle this allocation the same way that IP 171 * does with keeping around a message block per entry, or 172 * basically treating this as an immutable message block in the 173 * system. Basically freemsg() will be a nop, but we'll do the 174 * right thing with respect to the rest of the chain. 175 */ 176 ob = allocb(vxlan_alloc_size, 0); 177 if (ob == NULL) 178 return (ENOMEM); 179 180 ob->b_wptr = DB_LIM(ob); 181 ob->b_rptr = ob->b_wptr; 182 ob->b_cont = mp; 183 } else { 184 ob = mp; 185 } 186 ob->b_rptr -= VXLAN_HDR_LEN; 187 188 vxh = (vxlan_hdr_t *)ob->b_rptr; 189 vxh->vxlan_flags = ntohl(VXLAN_F_VDI); 190 vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT); 191 *outp = ob; 192 193 return (0); 194 } 195 196 /* ARGSUSED */ 197 static int 198 vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop) 199 { 200 vxlan_hdr_t *vxh; 201 202 if (MBLKL(mp) < sizeof (vxlan_hdr_t)) 203 return (EINVAL); 204 vxh = (vxlan_hdr_t *)mp->b_rptr; 205 if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0) 206 return (EINVAL); 207 208 dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT; 209 dinfop->ovdi_hdr_size = VXLAN_HDR_LEN; 210 211 return (0); 212 } 213 214 static int 215 vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize) 216 { 217 vxlan_t *vxl = arg; 218 219 /* vxlan/listen_ip */ 220 if (strcmp(pr_name, vxlan_props[0]) == 0) { 221 if (*bufsize < sizeof (struct in6_addr)) 222 return (EOVERFLOW); 223 224 mutex_enter(&vxl->vxl_lock); 225 if (vxl->vxl_hladdr == B_FALSE) { 226 *bufsize = 0; 227 } else { 228 bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr)); 229 *bufsize = sizeof (struct in6_addr); 230 } 231 mutex_exit(&vxl->vxl_lock); 232 return (0); 233 } 234 235 /* vxlan/listen_port */ 236 if (strcmp(pr_name, vxlan_props[1]) == 0) { 237 uint64_t val; 238 if (*bufsize < sizeof (uint64_t)) 239 return (EOVERFLOW); 240 241 mutex_enter(&vxl->vxl_lock); 242 val = vxl->vxl_lport; 243 bcopy(&val, buf, sizeof (uint64_t)); 244 *bufsize = sizeof (uint64_t); 245 mutex_exit(&vxl->vxl_lock); 246 return (0); 247 } 248 249 return (EINVAL); 250 } 251 252 static int 253 vxlan_o_setprop(void *arg, const char *pr_name, const void *buf, 254 uint32_t bufsize) 255 { 256 vxlan_t *vxl = arg; 257 258 /* vxlan/listen_ip */ 259 if (strcmp(pr_name, vxlan_props[0]) == 0) { 260 const struct in6_addr *ipv6 = buf; 261 if (bufsize != sizeof (struct in6_addr)) 262 return (EINVAL); 263 264 if (IN6_IS_ADDR_V4COMPAT(ipv6)) 265 return (EINVAL); 266 267 if (IN6_IS_ADDR_MULTICAST(ipv6)) 268 return (EINVAL); 269 270 if (IN6_IS_ADDR_6TO4(ipv6)) 271 return (EINVAL); 272 273 if (IN6_IS_ADDR_V4MAPPED(ipv6)) { 274 ipaddr_t v4; 275 IN6_V4MAPPED_TO_IPADDR(ipv6, v4); 276 if (IN_MULTICAST(v4)) 277 return (EINVAL); 278 } 279 280 mutex_enter(&vxl->vxl_lock); 281 vxl->vxl_hladdr = B_TRUE; 282 bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr)); 283 mutex_exit(&vxl->vxl_lock); 284 285 return (0); 286 } 287 288 /* vxlan/listen_port */ 289 if (strcmp(pr_name, vxlan_props[1]) == 0) { 290 const uint64_t *valp = buf; 291 if (bufsize != 8) 292 return (EINVAL); 293 294 if (*valp == 0 || *valp > UINT16_MAX) 295 return (EINVAL); 296 297 mutex_enter(&vxl->vxl_lock); 298 vxl->vxl_lport = *valp; 299 mutex_exit(&vxl->vxl_lock); 300 return (0); 301 } 302 return (EINVAL); 303 } 304 305 static int 306 vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl) 307 { 308 /* vxlan/listen_ip */ 309 if (strcmp(pr_name, vxlan_props[0]) == 0) { 310 overlay_prop_set_name(phdl, vxlan_props[0]); 311 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); 312 overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP); 313 overlay_prop_set_nodefault(phdl); 314 return (0); 315 } 316 317 if (strcmp(pr_name, vxlan_props[1]) == 0) { 318 overlay_prop_set_name(phdl, vxlan_props[1]); 319 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); 320 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); 321 (void) overlay_prop_set_default(phdl, &vxlan_defport, 322 sizeof (vxlan_defport)); 323 overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX); 324 return (0); 325 } 326 327 return (EINVAL); 328 } 329 330 static struct overlay_plugin_ops vxlan_o_ops = { 331 0, 332 vxlan_o_init, 333 vxlan_o_fini, 334 vxlan_o_encap, 335 vxlan_o_decap, 336 vxlan_o_socket, 337 vxlan_o_sockopt, 338 vxlan_o_getprop, 339 vxlan_o_setprop, 340 vxlan_o_propinfo 341 }; 342 343 static struct modlmisc vxlan_modlmisc = { 344 &mod_miscops, 345 "VXLAN encap plugin" 346 }; 347 348 static struct modlinkage vxlan_modlinkage = { 349 MODREV_1, 350 &vxlan_modlmisc 351 }; 352 353 int 354 _init(void) 355 { 356 int err; 357 overlay_plugin_register_t *ovrp; 358 359 ovrp = overlay_plugin_alloc(OVEP_VERSION); 360 if (ovrp == NULL) 361 return (ENOTSUP); 362 ovrp->ovep_name = vxlan_ident; 363 ovrp->ovep_ops = &vxlan_o_ops; 364 ovrp->ovep_id_size = VXLAN_ID_LEN; 365 ovrp->ovep_flags = OVEP_F_VLAN_TAG; 366 ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT; 367 ovrp->ovep_props = vxlan_props; 368 369 if ((err = overlay_plugin_register(ovrp)) == 0) { 370 if ((err = mod_install(&vxlan_modlinkage)) != 0) { 371 (void) overlay_plugin_unregister(vxlan_ident); 372 } 373 } 374 375 overlay_plugin_free(ovrp); 376 return (err); 377 } 378 379 int 380 _info(struct modinfo *modinfop) 381 { 382 return (mod_info(&vxlan_modlinkage, modinfop)); 383 } 384 385 int 386 _fini(void) 387 { 388 int err; 389 390 if ((err = overlay_plugin_unregister(vxlan_ident)) != 0) 391 return (err); 392 393 return (mod_remove(&vxlan_modlinkage)); 394 } 395