xref: /illumos-gate/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c (revision 15f90b02bdacbf0ae47fa105944f15b6596f9748)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2018 Joyent, Inc.
14  */
15 
16 /*
17  * VXLAN encapsulation module
18  *
19  *
20  * The VXLAN header looks as follows in network byte order:
21  *
22  * |0        3| 4 |5                     31|
23  * +----------+---+------------------------+
24  * | Reserved | I | Reserved               |
25  * +---------------------------------------+
26  * | Virtual Network ID         | Reserved |
27  * +----------------------------+----------+
28  * |0                         23|24      31|
29  *
30  * All reserved values must be 0. The I bit must be 1. We call the top
31  * word the VXLAN magic field for the time being. The second word is
32  * definitely not the most friendly way to operate. Specifically, the ID
33  * is a 24-bit big endian value, but we have to make sure not to use the
34  * reserved byte.
35  *
 * For us, VXLAN encapsulation is a fairly straightforward implementation. It
 * only has two properties, a listen_ip and a listen_port. These determine the
 * address and port on which we listen. While we do not have a default address
 * to listen upon, we do have a default port, which is the IANA assigned port
 * for VXLAN -- 4789.
41  */
42 
43 #include <sys/overlay_plugin.h>
44 #include <sys/modctl.h>
45 #include <sys/errno.h>
46 #include <sys/byteorder.h>
47 #include <sys/vxlan.h>
48 #include <inet/ip.h>
49 #include <netinet/in.h>
50 #include <sys/strsun.h>
51 #include <netinet/udp.h>
52 
53 static const char *vxlan_ident = "vxlan";
54 static uint16_t vxlan_defport = IPPORT_VXLAN;
55 
56 /*
57  * Should we enable UDP source port hashing for fanout.
58  */
59 boolean_t vxlan_fanout = B_TRUE;
60 
61 /*
62  * This represents the size in bytes that we want to allocate when allocating a
63  * vxlan header block. This is intended such that lower levels can try and use
64  * the message block that we allocate for the IP and UPD header. The hope is
65  * that even if this is tunneled, that this is enough space.
66  *
67  * The vxlan_noalloc_min value represents the minimum amount of space we need to
68  * consider not allocating a message block and just passing it down the stack in
69  * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet
70  * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header.
71  */
72 uint_t vxlan_alloc_size = 128;
73 uint_t vxlan_noalloc_min = 54;
74 
/*
 * NULL-terminated list of the properties this plugin exposes. The getprop,
 * setprop and propinfo entry points match names by index into this array, so
 * the order here matters: [0] is the listen address, [1] is the listen port.
 */
static const char *vxlan_props[] = {
	"vxlan/listen_ip",	/* [0] */
	"vxlan/listen_port",	/* [1] */
	NULL
};
80 
81 typedef struct vxlan {
82 	kmutex_t vxl_lock;
83 	overlay_handle_t vxl_oh;
84 	uint16_t vxl_lport;
85 	boolean_t vxl_hladdr;
86 	struct in6_addr vxl_laddr;
87 } vxlan_t;
88 
89 static int
90 vxlan_o_init(overlay_handle_t oh, void **outp)
91 {
92 	vxlan_t *vxl;
93 
94 	vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP);
95 	*outp = vxl;
96 	mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL);
97 	vxl->vxl_oh = oh;
98 	vxl->vxl_lport = vxlan_defport;
99 	vxl->vxl_hladdr = B_FALSE;
100 
101 	return (0);
102 }
103 
104 static void
105 vxlan_o_fini(void *arg)
106 {
107 	vxlan_t *vxl = arg;
108 
109 	mutex_destroy(&vxl->vxl_lock);
110 	kmem_free(arg, sizeof (vxlan_t));
111 }
112 
113 static int
114 vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr,
115     socklen_t *slenp)
116 {
117 	vxlan_t *vxl = arg;
118 	struct sockaddr_in6 *in;
119 
120 	in = (struct sockaddr_in6 *)addr;
121 	*dp = AF_INET6;
122 	*fp = SOCK_DGRAM;
123 	*pp = 0;
124 	bzero(in, sizeof (struct sockaddr_in6));
125 	in->sin6_family = AF_INET6;
126 
127 	/*
128 	 * We should consider a more expressive private errno set that
129 	 * provider's can use.
130 	 */
131 	mutex_enter(&vxl->vxl_lock);
132 	if (vxl->vxl_hladdr == B_FALSE) {
133 		mutex_exit(&vxl->vxl_lock);
134 		return (EINVAL);
135 	}
136 	in->sin6_port = htons(vxl->vxl_lport);
137 	in->sin6_addr = vxl->vxl_laddr;
138 	mutex_exit(&vxl->vxl_lock);
139 	*slenp = sizeof (struct sockaddr_in6);
140 
141 	return (0);
142 }
143 
144 static int
145 vxlan_o_sockopt(ksocket_t ksock)
146 {
147 	int val, err;
148 	if (vxlan_fanout == B_FALSE)
149 		return (0);
150 
151 	val = UDP_HASH_VXLAN;
152 	err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val,
153 	    sizeof (val), kcred);
154 	return (err);
155 }
156 
157 /* ARGSUSED */
158 static int
159 vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop,
160     mblk_t **outp)
161 {
162 	mblk_t *ob;
163 	vxlan_hdr_t *vxh;
164 
165 	ASSERT(einfop->ovdi_id < (1 << 24));
166 
167 	if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
168 		/*
169 		 * This allocation could get hot. We may want to have a good
170 		 * way to cache and handle this allocation the same way that IP
171 		 * does with keeping around a message block per entry, or
172 		 * basically treating this as an immutable message block in the
173 		 * system. Basically freemsg() will be a nop, but we'll do the
174 		 * right thing with respect to the rest of the chain.
175 		 */
176 		ob = allocb(vxlan_alloc_size, 0);
177 		if (ob == NULL)
178 			return (ENOMEM);
179 
180 		ob->b_wptr = DB_LIM(ob);
181 		ob->b_rptr = ob->b_wptr;
182 		ob->b_cont = mp;
183 	} else {
184 		ob = mp;
185 	}
186 	ob->b_rptr -= VXLAN_HDR_LEN;
187 
188 	vxh = (vxlan_hdr_t *)ob->b_rptr;
189 	vxh->vxlan_flags = ntohl(VXLAN_F_VDI);
190 	vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT);
191 	*outp = ob;
192 
193 	return (0);
194 }
195 
196 /* ARGSUSED */
197 static int
198 vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop)
199 {
200 	vxlan_hdr_t *vxh;
201 
202 	if (MBLKL(mp) < sizeof (vxlan_hdr_t))
203 		return (EINVAL);
204 	vxh = (vxlan_hdr_t *)mp->b_rptr;
205 	if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0)
206 		return (EINVAL);
207 
208 	dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT;
209 	dinfop->ovdi_hdr_size = VXLAN_HDR_LEN;
210 
211 	return (0);
212 }
213 
214 static int
215 vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize)
216 {
217 	vxlan_t *vxl = arg;
218 
219 	/* vxlan/listen_ip */
220 	if (strcmp(pr_name, vxlan_props[0]) == 0) {
221 		if (*bufsize < sizeof (struct in6_addr))
222 			return (EOVERFLOW);
223 
224 		mutex_enter(&vxl->vxl_lock);
225 		if (vxl->vxl_hladdr == B_FALSE) {
226 			*bufsize = 0;
227 		} else {
228 			bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr));
229 			*bufsize = sizeof (struct in6_addr);
230 		}
231 		mutex_exit(&vxl->vxl_lock);
232 		return (0);
233 	}
234 
235 	/* vxlan/listen_port */
236 	if (strcmp(pr_name, vxlan_props[1]) == 0) {
237 		uint64_t val;
238 		if (*bufsize < sizeof (uint64_t))
239 			return (EOVERFLOW);
240 
241 		mutex_enter(&vxl->vxl_lock);
242 		val = vxl->vxl_lport;
243 		bcopy(&val, buf, sizeof (uint64_t));
244 		*bufsize = sizeof (uint64_t);
245 		mutex_exit(&vxl->vxl_lock);
246 		return (0);
247 	}
248 
249 	return (EINVAL);
250 }
251 
252 static int
253 vxlan_o_setprop(void *arg, const char *pr_name, const void *buf,
254     uint32_t bufsize)
255 {
256 	vxlan_t *vxl = arg;
257 
258 	/* vxlan/listen_ip */
259 	if (strcmp(pr_name, vxlan_props[0]) == 0) {
260 		const struct in6_addr *ipv6 = buf;
261 		if (bufsize != sizeof (struct in6_addr))
262 			return (EINVAL);
263 
264 		if (IN6_IS_ADDR_V4COMPAT(ipv6))
265 			return (EINVAL);
266 
267 		if (IN6_IS_ADDR_MULTICAST(ipv6))
268 			return (EINVAL);
269 
270 		if (IN6_IS_ADDR_6TO4(ipv6))
271 			return (EINVAL);
272 
273 		if (IN6_IS_ADDR_V4MAPPED(ipv6)) {
274 			ipaddr_t v4;
275 			IN6_V4MAPPED_TO_IPADDR(ipv6, v4);
276 			if (IN_MULTICAST(v4))
277 				return (EINVAL);
278 		}
279 
280 		mutex_enter(&vxl->vxl_lock);
281 		vxl->vxl_hladdr = B_TRUE;
282 		bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr));
283 		mutex_exit(&vxl->vxl_lock);
284 
285 		return (0);
286 	}
287 
288 	/* vxlan/listen_port */
289 	if (strcmp(pr_name, vxlan_props[1]) == 0) {
290 		const uint64_t *valp = buf;
291 		if (bufsize != 8)
292 			return (EINVAL);
293 
294 		if (*valp == 0 || *valp > UINT16_MAX)
295 			return (EINVAL);
296 
297 		mutex_enter(&vxl->vxl_lock);
298 		vxl->vxl_lport = *valp;
299 		mutex_exit(&vxl->vxl_lock);
300 		return (0);
301 	}
302 	return (EINVAL);
303 }
304 
305 static int
306 vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl)
307 {
308 	/* vxlan/listen_ip */
309 	if (strcmp(pr_name, vxlan_props[0]) == 0) {
310 		overlay_prop_set_name(phdl, vxlan_props[0]);
311 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
312 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP);
313 		overlay_prop_set_nodefault(phdl);
314 		return (0);
315 	}
316 
317 	if (strcmp(pr_name, vxlan_props[1]) == 0) {
318 		overlay_prop_set_name(phdl, vxlan_props[1]);
319 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
320 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
321 		(void) overlay_prop_set_default(phdl, &vxlan_defport,
322 		    sizeof (vxlan_defport));
323 		overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX);
324 		return (0);
325 	}
326 
327 	return (EINVAL);
328 }
329 
330 static struct overlay_plugin_ops vxlan_o_ops = {
331 	0,
332 	vxlan_o_init,
333 	vxlan_o_fini,
334 	vxlan_o_encap,
335 	vxlan_o_decap,
336 	vxlan_o_socket,
337 	vxlan_o_sockopt,
338 	vxlan_o_getprop,
339 	vxlan_o_setprop,
340 	vxlan_o_propinfo
341 };
342 
343 static struct modlmisc vxlan_modlmisc = {
344 	&mod_miscops,
345 	"VXLAN encap plugin"
346 };
347 
348 static struct modlinkage vxlan_modlinkage = {
349 	MODREV_1,
350 	&vxlan_modlmisc
351 };
352 
353 int
354 _init(void)
355 {
356 	int err;
357 	overlay_plugin_register_t *ovrp;
358 
359 	ovrp = overlay_plugin_alloc(OVEP_VERSION);
360 	if (ovrp == NULL)
361 		return (ENOTSUP);
362 	ovrp->ovep_name = vxlan_ident;
363 	ovrp->ovep_ops = &vxlan_o_ops;
364 	ovrp->ovep_id_size = VXLAN_ID_LEN;
365 	ovrp->ovep_flags = OVEP_F_VLAN_TAG;
366 	ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;
367 	ovrp->ovep_props = vxlan_props;
368 
369 	if ((err = overlay_plugin_register(ovrp)) == 0) {
370 		if ((err = mod_install(&vxlan_modlinkage)) != 0) {
371 			(void) overlay_plugin_unregister(vxlan_ident);
372 		}
373 	}
374 
375 	overlay_plugin_free(ovrp);
376 	return (err);
377 }
378 
379 int
380 _info(struct modinfo *modinfop)
381 {
382 	return (mod_info(&vxlan_modlinkage, modinfop));
383 }
384 
385 int
386 _fini(void)
387 {
388 	int err;
389 
390 	if ((err = overlay_plugin_unregister(vxlan_ident)) != 0)
391 		return (err);
392 
393 	return (mod_remove(&vxlan_modlinkage));
394 }
395