xref: /illumos-gate/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c (revision 76c08ae9d10f4e0b653a6ea98c06a7868246164b)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2018 Joyent, Inc.
14  * Copyright 2022 MNX Cloud, Inc.
15  */
16 
17 /*
18  * VXLAN encapsulation module
19  *
20  *
21  * The VXLAN header looks as follows in network byte order:
22  *
23  * |0        3| 4 |5                     31|
24  * +----------+---+------------------------+
25  * | Reserved | I | Reserved               |
26  * +---------------------------------------+
27  * | Virtual Network ID         | Reserved |
28  * +----------------------------+----------+
29  * |0                         23|24      31|
30  *
31  * All reserved values must be 0. The I bit must be 1. We call the top
32  * word the VXLAN magic field for the time being. The second word is
33  * definitely not the most friendly way to operate. Specifically, the ID
34  * is a 24-bit big endian value, but we have to make sure not to use the
35  * reserved byte.
36  *
 * For us, VXLAN encapsulation is a fairly straightforward implementation. It
 * only has two properties, a listen_ip and a listen_port. These determine the
 * address and port on which we listen. While we do not have a default address
 * to listen upon, we do have a default port, which is the IANA assigned port
 * for VXLAN -- 4789.
42  */
43 
44 #include <sys/overlay_plugin.h>
45 #include <sys/modctl.h>
46 #include <sys/errno.h>
47 #include <sys/byteorder.h>
48 #include <sys/vxlan.h>
49 #include <inet/ip.h>
50 #include <netinet/in.h>
51 #include <sys/strsun.h>
52 #include <netinet/udp.h>
53 
/*
 * Name under which this plugin registers with (and later unregisters from)
 * the overlay framework in _init() and _fini().
 */
static const char *vxlan_ident = "vxlan";
/*
 * Default listen port, kept in host byte order. New instances start with this
 * value and it is advertised as the default of vxlan/listen_port.
 */
static uint16_t vxlan_defport = IPPORT_VXLAN;
56 
/*
 * Tunable: should we enable UDP source port hashing for fanout? When this is
 * B_FALSE, vxlan_o_sockopt() leaves the socket untouched; otherwise it sets
 * the UDP_SRCPORT_HASH socket option to UDP_HASH_VXLAN.
 */
boolean_t vxlan_fanout = B_TRUE;
61 
/*
 * vxlan_alloc_size is the size in bytes that we allocate when we need a new
 * message block for the VXLAN header. It is deliberately larger than the
 * header itself so that lower levels can try to use the same message block
 * for the IP and UDP headers. The hope is that even if this is tunneled, this
 * is enough space.
 *
 * The vxlan_noalloc_min value represents the minimum amount of space that must
 * be available in front of the packet data for us to consider not allocating a
 * message block and just passing it down the stack in this form. This number
 * assumes that we have a VLAN tag, so 18 byte Ethernet header, 20 byte IP
 * header, 8 byte UDP header, and 8 byte VXLAN header.
 */
uint_t vxlan_alloc_size = 128;
uint_t vxlan_noalloc_min = 54;
75 
/*
 * NULL-terminated list of the property names this plugin supports; it is
 * handed to the overlay framework through ovep_props in _init().
 *
 * The order matters: the getprop, setprop and propinfo entry points identify
 * properties by index, with [0] being the listen address and [1] being the
 * listen port.
 */
static const char *vxlan_props[] = {
	"vxlan/listen_ip",
	"vxlan/listen_port",
	NULL
};
81 
/*
 * Per-instance plugin state, created by vxlan_o_init() and released by
 * vxlan_o_fini(). After initialization, vxl_lport, vxl_hladdr and vxl_laddr
 * are only accessed while holding vxl_lock.
 */
typedef struct vxlan {
	kmutex_t vxl_lock;		/* protects the listen address/port */
	overlay_handle_t vxl_oh;	/* handle supplied to vxlan_o_init() */
	uint16_t vxl_lport;		/* listen port, host byte order */
	boolean_t vxl_hladdr;		/* B_TRUE once vxl_laddr has been set */
	struct in6_addr vxl_laddr;	/* listen address; valid iff vxl_hladdr */
} vxlan_t;
89 
90 static int
91 vxlan_o_init(overlay_handle_t oh, void **outp)
92 {
93 	vxlan_t *vxl;
94 
95 	vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP);
96 	*outp = vxl;
97 	mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL);
98 	vxl->vxl_oh = oh;
99 	vxl->vxl_lport = vxlan_defport;
100 	vxl->vxl_hladdr = B_FALSE;
101 
102 	return (0);
103 }
104 
105 static void
106 vxlan_o_fini(void *arg)
107 {
108 	vxlan_t *vxl = arg;
109 
110 	mutex_destroy(&vxl->vxl_lock);
111 	kmem_free(arg, sizeof (vxlan_t));
112 }
113 
114 static int
115 vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr,
116     socklen_t *slenp)
117 {
118 	vxlan_t *vxl = arg;
119 	struct sockaddr_in6 *in;
120 
121 	in = (struct sockaddr_in6 *)addr;
122 	*dp = AF_INET6;
123 	*fp = SOCK_DGRAM;
124 	*pp = 0;
125 	bzero(in, sizeof (struct sockaddr_in6));
126 	in->sin6_family = AF_INET6;
127 
128 	/*
129 	 * We should consider a more expressive private errno set that
130 	 * provider's can use.
131 	 */
132 	mutex_enter(&vxl->vxl_lock);
133 	if (vxl->vxl_hladdr == B_FALSE) {
134 		mutex_exit(&vxl->vxl_lock);
135 		return (EINVAL);
136 	}
137 	in->sin6_port = htons(vxl->vxl_lport);
138 	in->sin6_addr = vxl->vxl_laddr;
139 	mutex_exit(&vxl->vxl_lock);
140 	*slenp = sizeof (struct sockaddr_in6);
141 
142 	return (0);
143 }
144 
145 static int
146 vxlan_o_sockopt(ksocket_t ksock)
147 {
148 	int val, err;
149 	if (vxlan_fanout == B_FALSE)
150 		return (0);
151 
152 	val = UDP_HASH_VXLAN;
153 	err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val,
154 	    sizeof (val), kcred);
155 	return (err);
156 }
157 
158 /* ARGSUSED */
159 static int
160 vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop,
161     mblk_t **outp)
162 {
163 	mblk_t *ob;
164 	vxlan_hdr_t *vxh;
165 
166 	ASSERT(einfop->ovdi_id < (1 << 24));
167 
168 	if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
169 		/*
170 		 * This allocation could get hot. We may want to have a good
171 		 * way to cache and handle this allocation the same way that IP
172 		 * does with keeping around a message block per entry, or
173 		 * basically treating this as an immutable message block in the
174 		 * system. Basically freemsg() will be a nop, but we'll do the
175 		 * right thing with respect to the rest of the chain.
176 		 */
177 		ob = allocb(vxlan_alloc_size, 0);
178 		if (ob == NULL)
179 			return (ENOMEM);
180 
181 		ob->b_wptr = DB_LIM(ob);
182 		ob->b_rptr = ob->b_wptr;
183 		ob->b_cont = mp;
184 	} else {
185 		ob = mp;
186 	}
187 	ob->b_rptr -= VXLAN_HDR_LEN;
188 
189 	vxh = (vxlan_hdr_t *)ob->b_rptr;
190 	vxh->vxlan_flags = ntohl(VXLAN_F_VDI);
191 	vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT);
192 	*outp = ob;
193 
194 	return (0);
195 }
196 
197 /* ARGSUSED */
198 static int
199 vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop)
200 {
201 	vxlan_hdr_t *vxh;
202 
203 	if (MBLKL(mp) < sizeof (vxlan_hdr_t))
204 		return (EINVAL);
205 	vxh = (vxlan_hdr_t *)mp->b_rptr;
206 	if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0)
207 		return (EINVAL);
208 
209 	dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT;
210 	dinfop->ovdi_hdr_size = VXLAN_HDR_LEN;
211 
212 	return (0);
213 }
214 
215 static int
216 vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize)
217 {
218 	vxlan_t *vxl = arg;
219 
220 	/* vxlan/listen_ip */
221 	if (strcmp(pr_name, vxlan_props[0]) == 0) {
222 		if (*bufsize < sizeof (struct in6_addr))
223 			return (EOVERFLOW);
224 
225 		mutex_enter(&vxl->vxl_lock);
226 		if (vxl->vxl_hladdr == B_FALSE) {
227 			*bufsize = 0;
228 		} else {
229 			bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr));
230 			*bufsize = sizeof (struct in6_addr);
231 		}
232 		mutex_exit(&vxl->vxl_lock);
233 		return (0);
234 	}
235 
236 	/* vxlan/listen_port */
237 	if (strcmp(pr_name, vxlan_props[1]) == 0) {
238 		uint64_t val;
239 		if (*bufsize < sizeof (uint64_t))
240 			return (EOVERFLOW);
241 
242 		mutex_enter(&vxl->vxl_lock);
243 		val = vxl->vxl_lport;
244 		bcopy(&val, buf, sizeof (uint64_t));
245 		*bufsize = sizeof (uint64_t);
246 		mutex_exit(&vxl->vxl_lock);
247 		return (0);
248 	}
249 
250 	return (EINVAL);
251 }
252 
253 static int
254 vxlan_o_setprop(void *arg, const char *pr_name, const void *buf,
255     uint32_t bufsize)
256 {
257 	vxlan_t *vxl = arg;
258 
259 	/* vxlan/listen_ip */
260 	if (strcmp(pr_name, vxlan_props[0]) == 0) {
261 		const struct in6_addr *ipv6 = buf;
262 		if (bufsize != sizeof (struct in6_addr))
263 			return (EINVAL);
264 
265 		if (IN6_IS_ADDR_V4COMPAT(ipv6))
266 			return (EINVAL);
267 
268 		if (IN6_IS_ADDR_MULTICAST(ipv6))
269 			return (EINVAL);
270 
271 		if (IN6_IS_ADDR_6TO4(ipv6))
272 			return (EINVAL);
273 
274 		if (IN6_IS_ADDR_V4MAPPED(ipv6)) {
275 			ipaddr_t v4;
276 			IN6_V4MAPPED_TO_IPADDR(ipv6, v4);
277 			if (IN_MULTICAST(ntohl(v4)))
278 				return (EINVAL);
279 		}
280 
281 		mutex_enter(&vxl->vxl_lock);
282 		vxl->vxl_hladdr = B_TRUE;
283 		bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr));
284 		mutex_exit(&vxl->vxl_lock);
285 
286 		return (0);
287 	}
288 
289 	/* vxlan/listen_port */
290 	if (strcmp(pr_name, vxlan_props[1]) == 0) {
291 		const uint64_t *valp = buf;
292 		if (bufsize != 8)
293 			return (EINVAL);
294 
295 		if (*valp == 0 || *valp > UINT16_MAX)
296 			return (EINVAL);
297 
298 		mutex_enter(&vxl->vxl_lock);
299 		vxl->vxl_lport = *valp;
300 		mutex_exit(&vxl->vxl_lock);
301 		return (0);
302 	}
303 	return (EINVAL);
304 }
305 
306 static int
307 vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl)
308 {
309 	/* vxlan/listen_ip */
310 	if (strcmp(pr_name, vxlan_props[0]) == 0) {
311 		overlay_prop_set_name(phdl, vxlan_props[0]);
312 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
313 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP);
314 		overlay_prop_set_nodefault(phdl);
315 		return (0);
316 	}
317 
318 	if (strcmp(pr_name, vxlan_props[1]) == 0) {
319 		overlay_prop_set_name(phdl, vxlan_props[1]);
320 		overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
321 		overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
322 		(void) overlay_prop_set_default(phdl, &vxlan_defport,
323 		    sizeof (vxlan_defport));
324 		overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX);
325 		return (0);
326 	}
327 
328 	return (EINVAL);
329 }
330 
/*
 * Operations vector handed to the overlay framework through ovep_ops in
 * _init(). The initializers are positional, so their order has to match the
 * member order of struct overlay_plugin_ops.
 *
 * NOTE(review): the leading 0 initializes the first member of the structure,
 * presumably a flags/callbacks word -- confirm against the definition in
 * <sys/overlay_plugin.h>.
 */
static struct overlay_plugin_ops vxlan_o_ops = {
	0,
	vxlan_o_init,
	vxlan_o_fini,
	vxlan_o_encap,
	vxlan_o_decap,
	vxlan_o_socket,
	vxlan_o_sockopt,
	vxlan_o_getprop,
	vxlan_o_setprop,
	vxlan_o_propinfo
};
343 
/*
 * Module linkage: this is a miscellaneous ("misc") module, described by the
 * string below.
 */
static struct modlmisc vxlan_modlmisc = {
	&mod_miscops,
	"VXLAN encap plugin"
};

/*
 * Linkage structure passed to mod_install(), mod_info() and mod_remove() by
 * the _init(), _info() and _fini() entry points.
 */
static struct modlinkage vxlan_modlinkage = {
	MODREV_1,
	&vxlan_modlmisc
};
353 
354 int
355 _init(void)
356 {
357 	int err;
358 	overlay_plugin_register_t *ovrp;
359 
360 	ovrp = overlay_plugin_alloc(OVEP_VERSION);
361 	if (ovrp == NULL)
362 		return (ENOTSUP);
363 	ovrp->ovep_name = vxlan_ident;
364 	ovrp->ovep_ops = &vxlan_o_ops;
365 	ovrp->ovep_id_size = VXLAN_ID_LEN;
366 	ovrp->ovep_flags = OVEP_F_VLAN_TAG;
367 	ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;
368 	ovrp->ovep_props = vxlan_props;
369 
370 	if ((err = overlay_plugin_register(ovrp)) == 0) {
371 		if ((err = mod_install(&vxlan_modlinkage)) != 0) {
372 			(void) overlay_plugin_unregister(vxlan_ident);
373 		}
374 	}
375 
376 	overlay_plugin_free(ovrp);
377 	return (err);
378 }
379 
/*
 * Module information entry point: passes modinfop, together with this
 * module's linkage structure, to mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&vxlan_modlinkage, modinfop));
}
385 
386 int
387 _fini(void)
388 {
389 	int err;
390 
391 	if ((err = overlay_plugin_unregister(vxlan_ident)) != 0)
392 		return (err);
393 
394 	return (mod_remove(&vxlan_modlinkage));
395 }
396