1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018 Joyent, Inc.
14 * Copyright 2022 MNX Cloud, Inc.
15 */
16
17 /*
18 * VXLAN encapsulation module
19 *
20 *
21 * The VXLAN header looks as follows in network byte order:
22 *
 *	|0        3| 4 |5                     31|
 *	+----------+---+------------------------+
 *	| Reserved | I | Reserved               |
 *	+---------------------------------------+
 *	| Virtual Network ID         | Reserved |
 *	+----------------------------+----------+
 *	|0                         23|24      31|
30 *
31 * All reserved values must be 0. The I bit must be 1. We call the top
32 * word the VXLAN magic field for the time being. The second word is
33 * definitely not the most friendly way to operate. Specifically, the ID
34 * is a 24-bit big endian value, but we have to make sure not to use the
35 * reserved byte.
36 *
37 * For us, VXLAN encapsulation is a fairly straightforward implementation. It
38 * only has two properties, a listen_ip and a listen_port. These determine on
39 * what address we should be listening on. While we do not have a default
40 * address to listen upon, we do have a default port, which is the IANA assigned
41 * port for VXLAN -- 4789.
42 */
43
44 #include <sys/overlay_plugin.h>
45 #include <sys/modctl.h>
46 #include <sys/errno.h>
47 #include <sys/byteorder.h>
48 #include <sys/vxlan.h>
49 #include <inet/ip.h>
50 #include <netinet/in.h>
51 #include <sys/strsun.h>
52 #include <netinet/udp.h>
53
54 static const char *vxlan_ident = "vxlan";
55 static uint16_t vxlan_defport = IPPORT_VXLAN;
56
57 /*
58 * Should we enable UDP source port hashing for fanout.
59 */
60 boolean_t vxlan_fanout = B_TRUE;
61
62 /*
63 * This represents the size in bytes that we want to allocate when allocating a
64 * vxlan header block. This is intended such that lower levels can try and use
 * the message block that we allocate for the IP and UDP header. The hope is
66 * that even if this is tunneled, that this is enough space.
67 *
68 * The vxlan_noalloc_min value represents the minimum amount of space we need to
69 * consider not allocating a message block and just passing it down the stack in
70 * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet
71 * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header.
72 */
73 uint_t vxlan_alloc_size = 128;
74 uint_t vxlan_noalloc_min = 54;
75
76 static const char *vxlan_props[] = {
77 "vxlan/listen_ip",
78 "vxlan/listen_port",
79 NULL
80 };
81
82 typedef struct vxlan {
83 kmutex_t vxl_lock;
84 overlay_handle_t vxl_oh;
85 uint16_t vxl_lport;
86 boolean_t vxl_hladdr;
87 struct in6_addr vxl_laddr;
88 } vxlan_t;
89
90 static int
vxlan_o_init(overlay_handle_t oh,void ** outp)91 vxlan_o_init(overlay_handle_t oh, void **outp)
92 {
93 vxlan_t *vxl;
94
95 vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP);
96 *outp = vxl;
97 mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL);
98 vxl->vxl_oh = oh;
99 vxl->vxl_lport = vxlan_defport;
100 vxl->vxl_hladdr = B_FALSE;
101
102 return (0);
103 }
104
105 static void
vxlan_o_fini(void * arg)106 vxlan_o_fini(void *arg)
107 {
108 vxlan_t *vxl = arg;
109
110 mutex_destroy(&vxl->vxl_lock);
111 kmem_free(arg, sizeof (vxlan_t));
112 }
113
114 static int
vxlan_o_socket(void * arg,int * dp,int * fp,int * pp,struct sockaddr * addr,socklen_t * slenp)115 vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr,
116 socklen_t *slenp)
117 {
118 vxlan_t *vxl = arg;
119 struct sockaddr_in6 *in;
120
121 in = (struct sockaddr_in6 *)addr;
122 *dp = AF_INET6;
123 *fp = SOCK_DGRAM;
124 *pp = 0;
125 bzero(in, sizeof (struct sockaddr_in6));
126 in->sin6_family = AF_INET6;
127
128 /*
129 * We should consider a more expressive private errno set that
130 * provider's can use.
131 */
132 mutex_enter(&vxl->vxl_lock);
133 if (vxl->vxl_hladdr == B_FALSE) {
134 mutex_exit(&vxl->vxl_lock);
135 return (EINVAL);
136 }
137 in->sin6_port = htons(vxl->vxl_lport);
138 in->sin6_addr = vxl->vxl_laddr;
139 mutex_exit(&vxl->vxl_lock);
140 *slenp = sizeof (struct sockaddr_in6);
141
142 return (0);
143 }
144
145 static int
vxlan_o_sockopt(ksocket_t ksock)146 vxlan_o_sockopt(ksocket_t ksock)
147 {
148 int val, err;
149 if (vxlan_fanout == B_FALSE)
150 return (0);
151
152 val = UDP_HASH_VXLAN;
153 err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val,
154 sizeof (val), kcred);
155 return (err);
156 }
157
158 /* ARGSUSED */
159 static int
vxlan_o_encap(void * arg,mblk_t * mp,ovep_encap_info_t * einfop,mblk_t ** outp)160 vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop,
161 mblk_t **outp)
162 {
163 mblk_t *ob;
164 vxlan_hdr_t *vxh;
165
166 ASSERT(einfop->ovdi_id < (1 << 24));
167
168 if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) {
169 /*
170 * This allocation could get hot. We may want to have a good
171 * way to cache and handle this allocation the same way that IP
172 * does with keeping around a message block per entry, or
173 * basically treating this as an immutable message block in the
174 * system. Basically freemsg() will be a nop, but we'll do the
175 * right thing with respect to the rest of the chain.
176 */
177 ob = allocb(vxlan_alloc_size, 0);
178 if (ob == NULL)
179 return (ENOMEM);
180
181 ob->b_wptr = DB_LIM(ob);
182 ob->b_rptr = ob->b_wptr;
183 ob->b_cont = mp;
184 } else {
185 ob = mp;
186 }
187 ob->b_rptr -= VXLAN_HDR_LEN;
188
189 vxh = (vxlan_hdr_t *)ob->b_rptr;
190 vxh->vxlan_flags = ntohl(VXLAN_F_VDI);
191 vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT);
192 *outp = ob;
193
194 return (0);
195 }
196
197 /* ARGSUSED */
198 static int
vxlan_o_decap(void * arg,mblk_t * mp,ovep_encap_info_t * dinfop)199 vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop)
200 {
201 vxlan_hdr_t *vxh;
202
203 if (MBLKL(mp) < sizeof (vxlan_hdr_t))
204 return (EINVAL);
205 vxh = (vxlan_hdr_t *)mp->b_rptr;
206 if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0)
207 return (EINVAL);
208
209 dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT;
210 dinfop->ovdi_hdr_size = VXLAN_HDR_LEN;
211
212 return (0);
213 }
214
215 static int
vxlan_o_getprop(void * arg,const char * pr_name,void * buf,uint32_t * bufsize)216 vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize)
217 {
218 vxlan_t *vxl = arg;
219
220 /* vxlan/listen_ip */
221 if (strcmp(pr_name, vxlan_props[0]) == 0) {
222 if (*bufsize < sizeof (struct in6_addr))
223 return (EOVERFLOW);
224
225 mutex_enter(&vxl->vxl_lock);
226 if (vxl->vxl_hladdr == B_FALSE) {
227 *bufsize = 0;
228 } else {
229 bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr));
230 *bufsize = sizeof (struct in6_addr);
231 }
232 mutex_exit(&vxl->vxl_lock);
233 return (0);
234 }
235
236 /* vxlan/listen_port */
237 if (strcmp(pr_name, vxlan_props[1]) == 0) {
238 uint64_t val;
239 if (*bufsize < sizeof (uint64_t))
240 return (EOVERFLOW);
241
242 mutex_enter(&vxl->vxl_lock);
243 val = vxl->vxl_lport;
244 bcopy(&val, buf, sizeof (uint64_t));
245 *bufsize = sizeof (uint64_t);
246 mutex_exit(&vxl->vxl_lock);
247 return (0);
248 }
249
250 return (EINVAL);
251 }
252
253 static int
vxlan_o_setprop(void * arg,const char * pr_name,const void * buf,uint32_t bufsize)254 vxlan_o_setprop(void *arg, const char *pr_name, const void *buf,
255 uint32_t bufsize)
256 {
257 vxlan_t *vxl = arg;
258
259 /* vxlan/listen_ip */
260 if (strcmp(pr_name, vxlan_props[0]) == 0) {
261 const struct in6_addr *ipv6 = buf;
262 if (bufsize != sizeof (struct in6_addr))
263 return (EINVAL);
264
265 if (IN6_IS_ADDR_V4COMPAT(ipv6))
266 return (EINVAL);
267
268 if (IN6_IS_ADDR_MULTICAST(ipv6))
269 return (EINVAL);
270
271 if (IN6_IS_ADDR_6TO4(ipv6))
272 return (EINVAL);
273
274 if (IN6_IS_ADDR_V4MAPPED(ipv6)) {
275 ipaddr_t v4;
276 IN6_V4MAPPED_TO_IPADDR(ipv6, v4);
277 if (IN_MULTICAST(ntohl(v4)))
278 return (EINVAL);
279 }
280
281 mutex_enter(&vxl->vxl_lock);
282 vxl->vxl_hladdr = B_TRUE;
283 bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr));
284 mutex_exit(&vxl->vxl_lock);
285
286 return (0);
287 }
288
289 /* vxlan/listen_port */
290 if (strcmp(pr_name, vxlan_props[1]) == 0) {
291 const uint64_t *valp = buf;
292 if (bufsize != 8)
293 return (EINVAL);
294
295 if (*valp == 0 || *valp > UINT16_MAX)
296 return (EINVAL);
297
298 mutex_enter(&vxl->vxl_lock);
299 vxl->vxl_lport = *valp;
300 mutex_exit(&vxl->vxl_lock);
301 return (0);
302 }
303 return (EINVAL);
304 }
305
306 static int
vxlan_o_propinfo(const char * pr_name,overlay_prop_handle_t phdl)307 vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl)
308 {
309 /* vxlan/listen_ip */
310 if (strcmp(pr_name, vxlan_props[0]) == 0) {
311 overlay_prop_set_name(phdl, vxlan_props[0]);
312 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
313 overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP);
314 overlay_prop_set_nodefault(phdl);
315 return (0);
316 }
317
318 if (strcmp(pr_name, vxlan_props[1]) == 0) {
319 overlay_prop_set_name(phdl, vxlan_props[1]);
320 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW);
321 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
322 (void) overlay_prop_set_default(phdl, &vxlan_defport,
323 sizeof (vxlan_defport));
324 overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX);
325 return (0);
326 }
327
328 return (EINVAL);
329 }
330
331 static struct overlay_plugin_ops vxlan_o_ops = {
332 0,
333 vxlan_o_init,
334 vxlan_o_fini,
335 vxlan_o_encap,
336 vxlan_o_decap,
337 vxlan_o_socket,
338 vxlan_o_sockopt,
339 vxlan_o_getprop,
340 vxlan_o_setprop,
341 vxlan_o_propinfo
342 };
343
344 static struct modlmisc vxlan_modlmisc = {
345 &mod_miscops,
346 "VXLAN encap plugin"
347 };
348
349 static struct modlinkage vxlan_modlinkage = {
350 MODREV_1,
351 &vxlan_modlmisc
352 };
353
354 int
_init(void)355 _init(void)
356 {
357 int err;
358 overlay_plugin_register_t *ovrp;
359
360 ovrp = overlay_plugin_alloc(OVEP_VERSION);
361 if (ovrp == NULL)
362 return (ENOTSUP);
363 ovrp->ovep_name = vxlan_ident;
364 ovrp->ovep_ops = &vxlan_o_ops;
365 ovrp->ovep_id_size = VXLAN_ID_LEN;
366 ovrp->ovep_flags = OVEP_F_VLAN_TAG;
367 ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;
368 ovrp->ovep_props = vxlan_props;
369
370 if ((err = overlay_plugin_register(ovrp)) == 0) {
371 if ((err = mod_install(&vxlan_modlinkage)) != 0) {
372 (void) overlay_plugin_unregister(vxlan_ident);
373 }
374 }
375
376 overlay_plugin_free(ovrp);
377 return (err);
378 }
379
380 int
_info(struct modinfo * modinfop)381 _info(struct modinfo *modinfop)
382 {
383 return (mod_info(&vxlan_modlinkage, modinfop));
384 }
385
386 int
_fini(void)387 _fini(void)
388 {
389 int err;
390
391 if ((err = overlay_plugin_unregister(vxlan_ident)) != 0)
392 return (err);
393
394 return (mod_remove(&vxlan_modlinkage));
395 }
396