1 /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */
2 /*-
3 * SPDX-License-Identifier: BSD-2-Clause
4 *
5 * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com>
6 * All rights reserved.
7 * Copyright (c) 2019 Kyle Evans <kevans@FreeBSD.org>
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 * BASED ON:
32 * -------------------------------------------------------------------------
33 *
34 * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk>
35 * Nottingham University 1987.
36 *
37 * This source may be freely distributed, however I would be interested
38 * in any changes that are made.
39 *
40 * This driver takes packets off the IP i/f and hands them up to a
41  * user process to have its wicked way with. This driver has its
42 * roots in a similar driver written by Phil Cockcroft (formerly) at
43 * UCL. This driver is based much more on read/write/poll mode of
44 * operation though.
45 */
46
47 #include "opt_inet.h"
48 #include "opt_inet6.h"
49
50 #include <sys/param.h>
51 #include <sys/lock.h>
52 #include <sys/priv.h>
53 #include <sys/proc.h>
54 #include <sys/systm.h>
55 #include <sys/jail.h>
56 #include <sys/mbuf.h>
57 #include <sys/module.h>
58 #include <sys/socket.h>
59 #include <sys/eventhandler.h>
60 #include <sys/fcntl.h>
61 #include <sys/filio.h>
62 #include <sys/sockio.h>
63 #include <sys/sx.h>
64 #include <sys/syslog.h>
65 #include <sys/ttycom.h>
66 #include <sys/poll.h>
67 #include <sys/selinfo.h>
68 #include <sys/signalvar.h>
69 #include <sys/filedesc.h>
70 #include <sys/kernel.h>
71 #include <sys/sysctl.h>
72 #include <sys/conf.h>
73 #include <sys/uio.h>
74 #include <sys/malloc.h>
75 #include <sys/random.h>
76 #include <sys/ctype.h>
77 #include <sys/osd.h>
78
79 #include <net/ethernet.h>
80 #include <net/if.h>
81 #include <net/if_var.h>
82 #include <net/if_clone.h>
83 #include <net/if_dl.h>
84 #include <net/if_media.h>
85 #include <net/if_private.h>
86 #include <net/if_types.h>
87 #include <net/if_vlan_var.h>
88 #include <net/netisr.h>
89 #include <net/route.h>
90 #include <net/vnet.h>
91 #include <netinet/in.h>
92 #ifdef INET
93 #include <netinet/ip.h>
94 #endif
95 #ifdef INET6
96 #include <netinet/ip6.h>
97 #include <netinet6/ip6_var.h>
98 #endif
99 #include <netinet/udp.h>
100 #include <netinet/tcp.h>
101 #include <netinet/tcp_lro.h>
102 #include <net/bpf.h>
103 #include <net/if_tap.h>
104 #include <net/if_tun.h>
105
106 #include <dev/virtio/network/virtio_net.h>
107
108 #include <sys/queue.h>
109 #include <sys/condvar.h>
110 #include <security/mac/mac_framework.h>
111
112 struct tuntap_driver;
113
114 /*
115 * tun_list is protected by global tunmtx. Other mutable fields are
116 * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is
117 * static for the duration of a tunnel interface.
118 */
119 struct tuntap_softc {
120 TAILQ_ENTRY(tuntap_softc) tun_list;
121 struct cdev *tun_alias;
122 struct cdev *tun_dev;
123 u_short tun_flags; /* misc flags */
124 #define TUN_OPEN 0x0001
125 #define TUN_INITED 0x0002
126 #define TUN_UNUSED1 0x0008
127 #define TUN_UNUSED2 0x0010
128 #define TUN_LMODE 0x0020
129 #define TUN_RWAIT 0x0040
130 #define TUN_ASYNC 0x0080
131 #define TUN_IFHEAD 0x0100
132 #define TUN_DYING 0x0200
133 #define TUN_L2 0x0400
134 #define TUN_VMNET 0x0800
135 #define TUN_TRANSIENT 0x1000
136
137 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET)
138 #define TUN_READY (TUN_OPEN | TUN_INITED)
139
140 pid_t tun_pid; /* owning pid */
141 struct epoch_context tun_epoch_ctx;
142 struct ifnet *tun_ifp; /* the interface */
143 struct sigio *tun_sigio; /* async I/O info */
144 struct tuntap_driver *tun_drv; /* appropriate driver */
145 struct selinfo tun_rsel; /* read select */
146 struct mtx tun_mtx; /* softc field mutex */
147 struct cv tun_cv; /* for ref'd dev destroy */
148 struct ether_addr tun_ether; /* remote address */
149 int tun_busy; /* busy count */
150 int tun_vhdrlen; /* virtio-net header length */
151 struct lro_ctrl tun_lro; /* for TCP LRO */
152 bool tun_lro_ready; /* TCP LRO initialized */
153 };
154 #define TUN2IFP(sc) ((sc)->tun_ifp)
155
156 #define TUNDEBUG if (tundebug) if_printf
157
158 #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx)
159 #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx)
160 #define TUN_LOCK_ASSERT(tp) mtx_assert(&(tp)->tun_mtx, MA_OWNED);
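
/*
 * Illustrative sketch, not part of the driver: mutable softc fields are
 * touched under the softc mutex via these wrappers, e.g.:
 *
 *	TUN_LOCK(tp);
 *	tp->tun_flags |= TUN_RWAIT;
 *	TUN_UNLOCK(tp);
 */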
161
162 #define TUN_VMIO_FLAG_MASK 0x0fff
163
164 /*
165 * Interface capabilities of a tap device that supports the virtio-net
166 * header.
167 */
168 #define TAP_VNET_HDR_CAPS (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 \
169 | IFCAP_VLAN_HWCSUM \
170 | IFCAP_TSO | IFCAP_LRO \
171 | IFCAP_VLAN_HWTSO)
172
173 #define TAP_ALL_OFFLOAD (CSUM_TSO | CSUM_TCP | CSUM_UDP |\
174 CSUM_TCP_IPV6 | CSUM_UDP_IPV6)
175
176 /*
177 * All mutable global variables in if_tun are locked using tunmtx, with
178 * the exception of tundebug, which is used unlocked, and the drivers' *clones,
179 * which are static after setup.
180 */
181 static struct mtx tunmtx;
182 static eventhandler_tag arrival_tag;
183 static eventhandler_tag clone_tag;
184 static int tuntap_osd_jail_slot;
185 static const char tunname[] = "tun";
186 static const char tapname[] = "tap";
187 static const char vmnetname[] = "vmnet";
188 static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface");
189 static int tundebug = 0;
190 static int tundclone = 1;
191 static int tap_allow_uopen = 0; /* allow user devfs cloning */
192 static int tapuponopen = 0; /* IFF_UP on open() */
193 static int tapdclone = 1; /* enable devfs cloning */
194
195 static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead);
196 SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, "");
197
198 static struct sx tun_ioctl_sx;
199 SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl");
200
201 SYSCTL_DECL(_net_link);
202 /* tun */
203 static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
204 "IP tunnel software network interface");
205 SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0,
206 "Enable legacy devfs interface creation");
207
208 /* tap */
209 static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
210 "Ethernet tunnel software network interface");
211 SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0,
212 "Enable legacy devfs interface creation for all users");
213 SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0,
214 "Bring interface up when /dev/tap is opened");
215 SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0,
216 "Enable legacy devfs interface creation");
217 SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, "");
218
219 static int tun_create_device(struct tuntap_driver *drv, int unit,
220 struct ucred *cr, struct cdev **dev, const char *name);
221 static int tun_busy_locked(struct tuntap_softc *tp);
222 static void tun_unbusy_locked(struct tuntap_softc *tp);
223 static int tun_busy(struct tuntap_softc *tp);
224 static void tun_unbusy(struct tuntap_softc *tp);
225
226 static int tuntap_name2info(const char *name, int *unit, int *flags);
227 static void tunclone(void *arg, struct ucred *cred, char *name,
228 int namelen, struct cdev **dev);
229 static void tuncreate(struct cdev *dev);
230 static void tundtor(void *data);
231 static void tunrename(void *arg, struct ifnet *ifp);
232 static int tunifioctl(struct ifnet *, u_long, caddr_t);
233 static void tuninit(struct ifnet *);
234 static void tunifinit(void *xtp);
235 static int tuntapmodevent(module_t, int, void *);
236 static int tunoutput(struct ifnet *, struct mbuf *,
237 const struct sockaddr *, struct route *ro);
238 static void tunstart(struct ifnet *);
239 static void tunstart_l2(struct ifnet *);
240
241 static int tun_clone_match(struct if_clone *ifc, const char *name);
242 static int tap_clone_match(struct if_clone *ifc, const char *name);
243 static int vmnet_clone_match(struct if_clone *ifc, const char *name);
244 static int tun_clone_create(struct if_clone *, char *, size_t,
245 struct ifc_data *, struct ifnet **);
246 static int tun_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);
247 static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen);
248
249 static d_open_t tunopen;
250 static d_read_t tunread;
251 static d_write_t tunwrite;
252 static d_ioctl_t tunioctl;
253 static d_poll_t tunpoll;
254 static d_kqfilter_t tunkqfilter;
255
256 static int tunkqread(struct knote *, long);
257 static int tunkqwrite(struct knote *, long);
258 static void tunkqdetach(struct knote *);
259
260 static const struct filterops tun_read_filterops = {
261 .f_isfd = 1,
262 .f_attach = NULL,
263 .f_detach = tunkqdetach,
264 .f_event = tunkqread,
265 .f_copy = knote_triv_copy,
266 };
267
268 static const struct filterops tun_write_filterops = {
269 .f_isfd = 1,
270 .f_attach = NULL,
271 .f_detach = tunkqdetach,
272 .f_event = tunkqwrite,
273 .f_copy = knote_triv_copy,
274 };
275
276 static struct tuntap_driver {
277 struct cdevsw cdevsw;
278 int ident_flags;
279 struct unrhdr *unrhdr;
280 struct clonedevs *clones;
281 ifc_match_f *clone_match_fn;
282 ifc_create_f *clone_create_fn;
283 ifc_destroy_f *clone_destroy_fn;
284 } tuntap_drivers[] = {
285 {
286 .ident_flags = 0,
287 .cdevsw = {
288 .d_version = D_VERSION,
289 .d_flags = D_NEEDMINOR,
290 .d_open = tunopen,
291 .d_read = tunread,
292 .d_write = tunwrite,
293 .d_ioctl = tunioctl,
294 .d_poll = tunpoll,
295 .d_kqfilter = tunkqfilter,
296 .d_name = tunname,
297 },
298 .clone_match_fn = tun_clone_match,
299 .clone_create_fn = tun_clone_create,
300 .clone_destroy_fn = tun_clone_destroy,
301 },
302 {
303 .ident_flags = TUN_L2,
304 .cdevsw = {
305 .d_version = D_VERSION,
306 .d_flags = D_NEEDMINOR,
307 .d_open = tunopen,
308 .d_read = tunread,
309 .d_write = tunwrite,
310 .d_ioctl = tunioctl,
311 .d_poll = tunpoll,
312 .d_kqfilter = tunkqfilter,
313 .d_name = tapname,
314 },
315 .clone_match_fn = tap_clone_match,
316 .clone_create_fn = tun_clone_create,
317 .clone_destroy_fn = tun_clone_destroy,
318 },
319 {
320 .ident_flags = TUN_L2 | TUN_VMNET,
321 .cdevsw = {
322 .d_version = D_VERSION,
323 .d_flags = D_NEEDMINOR,
324 .d_open = tunopen,
325 .d_read = tunread,
326 .d_write = tunwrite,
327 .d_ioctl = tunioctl,
328 .d_poll = tunpoll,
329 .d_kqfilter = tunkqfilter,
330 .d_name = vmnetname,
331 },
332 .clone_match_fn = vmnet_clone_match,
333 .clone_create_fn = tun_clone_create,
334 .clone_destroy_fn = tun_clone_destroy,
335 },
336 };
337 #define NDRV nitems(tuntap_drivers)
338
339 VNET_DEFINE_STATIC(struct if_clone *, tuntap_driver_cloners[NDRV]);
340 #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners)
341
342 /*
343 * Mechanism for marking a tunnel device as busy so that we can safely do some
344 * orthogonal operations (such as operations on devices) without racing against
345 * tun_destroy. tun_destroy will wait on the condvar if we're at all busy or
346 * open, to be woken up when the condition is alleviated.
347 */
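/*
 * Illustrative sketch, not part of the driver: a caller performing such an
 * orthogonal operation brackets it with the busy count, e.g.:
 *
 *	if (tun_busy(tp) == 0) {
 *		... operate on tp->tun_dev without racing tun_destroy() ...
 *		tun_unbusy(tp);
 *	}
 */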
348 static int
349 tun_busy_locked(struct tuntap_softc *tp)
350 {
351
352 TUN_LOCK_ASSERT(tp);
353 if ((tp->tun_flags & TUN_DYING) != 0) {
354 /*
355 * Perhaps unintuitive, but the device is busy going away.
356 * Other interpretations of EBUSY from tun_busy make little
357 * sense, since making a busy device even more busy doesn't
358 * sound like a problem.
359 */
360 return (EBUSY);
361 }
362
363 ++tp->tun_busy;
364 return (0);
365 }
366
367 static void
368 tun_unbusy_locked(struct tuntap_softc *tp)
369 {
370
371 TUN_LOCK_ASSERT(tp);
372 KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel"));
373
374 --tp->tun_busy;
375 /* Wake up anything that may be waiting on our busy tunnel. */
376 if (tp->tun_busy == 0)
377 cv_broadcast(&tp->tun_cv);
378 }
379
380 static int
381 tun_busy(struct tuntap_softc *tp)
382 {
383 int ret;
384
385 TUN_LOCK(tp);
386 ret = tun_busy_locked(tp);
387 TUN_UNLOCK(tp);
388 return (ret);
389 }
390
391 static void
392 tun_unbusy(struct tuntap_softc *tp)
393 {
394
395 TUN_LOCK(tp);
396 tun_unbusy_locked(tp);
397 TUN_UNLOCK(tp);
398 }
399
400 /*
401 * Sets unit and/or flags given the device name. Must be called with correct
402 * vnet context.
403 */
404 static int
405 tuntap_name2info(const char *name, int *outunit, int *outflags)
406 {
407 struct tuntap_driver *drv;
408 char *dname;
409 int flags, unit;
410 bool found;
411
412 if (name == NULL)
413 return (EINVAL);
414
415 /*
416  * dev_stdclone() wants a non-const pointer so that it can hand back a
417  * char * through its second parameter, but it will not modify the
418  * string.  We always pass NULL for that parameter, so the cast is safe.
419 */
420 dname = __DECONST(char *, name);
421 found = false;
422
423 for (u_int i = 0; i < NDRV; i++) {
424 drv = &tuntap_drivers[i];
425
426 if (strcmp(name, drv->cdevsw.d_name) == 0) {
427 found = true;
428 unit = -1;
429 flags = drv->ident_flags;
430 break;
431 }
432
433 if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) {
434 found = true;
435 flags = drv->ident_flags;
436 break;
437 }
438 }
439
440 if (!found)
441 return (ENXIO);
442
443 if (outunit != NULL)
444 *outunit = unit;
445 if (outflags != NULL)
446 *outflags = flags;
447 return (0);
448 }
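
/*
 * Illustrative examples: "tap3" yields unit 3 with TUN_L2 set, "vmnet0"
 * yields unit 0 with TUN_L2 | TUN_VMNET, and a bare "tun" yields unit -1,
 * meaning "any unit".
 */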
449
450 static struct if_clone *
451 tuntap_cloner_from_flags(int tun_flags)
452 {
453
454 for (u_int i = 0; i < NDRV; i++)
455 if ((tun_flags & TUN_DRIVER_IDENT_MASK) ==
456 tuntap_drivers[i].ident_flags)
457 return (V_tuntap_driver_cloners[i]);
458
459 return (NULL);
460 }
461
462 /*
463 * Get driver information from a set of flags specified. Masks the identifying
464 * part of the flags and compares it against all of the available
465 * tuntap_drivers.
466 */
467 static struct tuntap_driver *
468 tuntap_driver_from_flags(int tun_flags)
469 {
470
471 for (u_int i = 0; i < NDRV; i++)
472 if ((tun_flags & TUN_DRIVER_IDENT_MASK) ==
473 tuntap_drivers[i].ident_flags)
474 return (&tuntap_drivers[i]);
475
476 return (NULL);
477 }
478
479 static int
480 tun_clone_match(struct if_clone *ifc, const char *name)
481 {
482 int tunflags;
483
484 if (tuntap_name2info(name, NULL, &tunflags) == 0) {
485 if ((tunflags & TUN_L2) == 0)
486 return (1);
487 }
488
489 return (0);
490 }
491
492 static int
493 tap_clone_match(struct if_clone *ifc, const char *name)
494 {
495 int tunflags;
496
497 if (tuntap_name2info(name, NULL, &tunflags) == 0) {
498 if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2)
499 return (1);
500 }
501
502 return (0);
503 }
504
505 static int
506 vmnet_clone_match(struct if_clone *ifc, const char *name)
507 {
508 int tunflags;
509
510 if (tuntap_name2info(name, NULL, &tunflags) == 0) {
511 if ((tunflags & TUN_VMNET) != 0)
512 return (1);
513 }
514
515 return (0);
516 }
517
518 /*
519 * Create a clone via the ifnet cloning mechanism. Note that this is invoked
520 * indirectly by tunclone() below.
521 */
522 static int
523 tun_clone_create(struct if_clone *ifc, char *name, size_t len,
524 struct ifc_data *ifd, struct ifnet **ifpp)
525 {
526 struct tuntap_driver *drv;
527 struct cdev *dev;
528 int err, i, tunflags, unit;
529
530 tunflags = 0;
531 /* The name here tells us exactly what we're creating */
532 err = tuntap_name2info(name, &unit, &tunflags);
533 if (err != 0)
534 return (err);
535
536 drv = tuntap_driver_from_flags(tunflags);
537 if (drv == NULL)
538 return (ENXIO);
539
540 if (unit != -1) {
541 /* If this unit number is still available that's okay. */
542 if (alloc_unr_specific(drv->unrhdr, unit) == -1)
543 return (EEXIST);
544 } else {
545 unit = alloc_unr(drv->unrhdr);
546 }
547
548 snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit);
549
550 /* find any existing device, or allocate new unit number */
551 dev = NULL;
552 i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0);
553 /* No preexisting struct cdev *, create one */
554 if (i != 0)
555 i = tun_create_device(drv, unit, NULL, &dev, name);
556 if (i == 0) {
557 struct tuntap_softc *tp;
558
559 tuncreate(dev);
560 tp = dev->si_drv1;
561 *ifpp = tp->tun_ifp;
562 }
563
564 return (i);
565 }
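
/*
 * Illustrative userland sketch, assuming s is any open socket descriptor:
 * this path is exercised by the generic interface-cloning ioctl, e.g.:
 *
 *	struct ifreq ifr;
 *
 *	strlcpy(ifr.ifr_name, "tun0", sizeof(ifr.ifr_name));
 *	ioctl(s, SIOCIFCREATE, &ifr);
 */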
566
567 /*
568 * Create a clone via devfs access.
569 */
570 static void
571 tunclone(void *arg, struct ucred *cred, char *name, int namelen,
572 struct cdev **dev)
573 {
574 char devname[SPECNAMELEN + 1];
575 struct tuntap_driver *drv;
576 int append_unit, i, u, tunflags;
577 bool mayclone;
578
579 if (*dev != NULL)
580 return;
581
582 tunflags = 0;
583 CURVNET_SET(CRED_TO_VNET(cred));
584 if (tuntap_name2info(name, &u, &tunflags) != 0)
585 goto out; /* Not recognized */
586
587 if (u != -1 && u > IF_MAXUNIT)
588 goto out; /* Unit number too high */
589
590 mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0;
591 if ((tunflags & TUN_L2) != 0) {
592 /* tap/vmnet allow user open with a sysctl */
593 mayclone = (mayclone || tap_allow_uopen) && tapdclone;
594 } else {
595 mayclone = mayclone && tundclone;
596 }
597
598 /*
599  * Even with devfs cloning enabled, only privileged users may create a
600  * tun interface; tap/vmnet also honor net.link.tap.user_open above.
601 */
602 if (!mayclone)
603 goto out;
604
605 if (u == -1)
606 append_unit = 1;
607 else
608 append_unit = 0;
609
610 drv = tuntap_driver_from_flags(tunflags);
611 if (drv == NULL)
612 goto out;
613
614 /* find any existing device, or allocate new unit number */
615 i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0);
616 if (i) {
617 if (append_unit) {
618 namelen = snprintf(devname, sizeof(devname), "%s%d",
619 name, u);
620 name = devname;
621 }
622
623 i = tun_create_device(drv, u, cred, dev, name);
624 } else {
625 /* Consumed by the dev_clone invoker. */
626 dev_ref(*dev);
627 }
628 if (i == 0)
629 if_clone_create(name, namelen, NULL);
630 out:
631 CURVNET_RESTORE();
632 }
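
/*
 * Illustrative sketch: with devfs cloning enabled, a plain
 * open("/dev/tun", O_RDWR) from userland arrives here through the
 * dev_clone eventhandler and creates the next free tun<N> device, while
 * open("/dev/tap2", O_RDWR) asks for that specific unit.
 */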
633
634 static void
635 tunfree(struct epoch_context *ctx)
636 {
637 struct tuntap_softc *tp;
638
639 tp = __containerof(ctx, struct tuntap_softc, tun_epoch_ctx);
640
641 	/* Free the resources that a concurrent open could still be touching. */
642 mtx_destroy(&tp->tun_mtx);
643 free(tp, M_TUN);
644 }
645
646 static int
647 tun_destroy(struct tuntap_softc *tp, bool may_intr)
648 {
649 int error;
650
651 TUN_LOCK(tp);
652
653 /*
654 * Transient tunnels may have set TUN_DYING if we're being destroyed as
655 * a result of the last close, which we'll allow.
656 */
657 MPASS((tp->tun_flags & (TUN_DYING | TUN_TRANSIENT)) != TUN_DYING);
658 tp->tun_flags |= TUN_DYING;
659 error = 0;
660 while (tp->tun_busy != 0) {
661 if (may_intr)
662 error = cv_wait_sig(&tp->tun_cv, &tp->tun_mtx);
663 else
664 cv_wait(&tp->tun_cv, &tp->tun_mtx);
665 if (error != 0 && tp->tun_busy != 0) {
666 tp->tun_flags &= ~TUN_DYING;
667 TUN_UNLOCK(tp);
668 return (error);
669 }
670 }
671 TUN_UNLOCK(tp);
672
673 CURVNET_SET(TUN2IFP(tp)->if_vnet);
674
675 mtx_lock(&tunmtx);
676 TAILQ_REMOVE(&tunhead, tp, tun_list);
677 mtx_unlock(&tunmtx);
678
679 /*
680 * destroy_dev will take care of any alias. For transient tunnels,
681 * we're being called from close(2) so we can't destroy it ourselves
682 * without deadlocking, but we already know that we can cleanup
683 * everything else and just continue to prevent it from being reopened.
684 */
685 if ((tp->tun_flags & TUN_TRANSIENT) != 0) {
686 atomic_store_ptr(&tp->tun_dev->si_drv1, tp->tun_dev);
687 destroy_dev_sched(tp->tun_dev);
688 } else {
689 destroy_dev(tp->tun_dev);
690 }
691 seldrain(&tp->tun_rsel);
692 knlist_clear(&tp->tun_rsel.si_note, 0);
693 knlist_destroy(&tp->tun_rsel.si_note);
694 if ((tp->tun_flags & TUN_L2) != 0) {
695 ether_ifdetach(TUN2IFP(tp));
696 } else {
697 bpfdetach(TUN2IFP(tp));
698 if_detach(TUN2IFP(tp));
699 }
700 sx_xlock(&tun_ioctl_sx);
701 TUN2IFP(tp)->if_softc = NULL;
702 sx_xunlock(&tun_ioctl_sx);
703 free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit);
704 if_free(TUN2IFP(tp));
705 cv_destroy(&tp->tun_cv);
706 NET_EPOCH_CALL(tunfree, &tp->tun_epoch_ctx);
707 CURVNET_RESTORE();
708
709 return (0);
710 }
711
712 static int
713 tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp, uint32_t flags)
714 {
715 struct tuntap_softc *tp = ifp->if_softc;
716
717 return (tun_destroy(tp, true));
718 }
719
720 static void
721 vnet_tun_init(const void *unused __unused)
722 {
723
724 for (u_int i = 0; i < NDRV; ++i) {
725 struct if_clone_addreq req = {
726 .match_f = tuntap_drivers[i].clone_match_fn,
727 .create_f = tuntap_drivers[i].clone_create_fn,
728 .destroy_f = tuntap_drivers[i].clone_destroy_fn,
729 };
730 V_tuntap_driver_cloners[i] =
731 ifc_attach_cloner(tuntap_drivers[i].cdevsw.d_name, &req);
732 	}
733 }
734 VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
735 vnet_tun_init, NULL);
736
737 static void
738 tun_uninit(const void *unused __unused)
739 {
740 struct tuntap_driver *drv;
741 struct tuntap_softc *tp;
742 int i;
743
744 EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag);
745 EVENTHANDLER_DEREGISTER(dev_clone, clone_tag);
746
747 CURVNET_SET(vnet0);
748 for (u_int i = 0; i < NDRV; i++) {
749 if_clone_detach(V_tuntap_driver_cloners[i]);
750 V_tuntap_driver_cloners[i] = NULL;
751 }
752 CURVNET_RESTORE();
753
754 if (tuntap_osd_jail_slot != 0)
755 osd_jail_deregister(tuntap_osd_jail_slot);
756
757 mtx_lock(&tunmtx);
758 while ((tp = TAILQ_FIRST(&tunhead)) != NULL) {
759 mtx_unlock(&tunmtx);
760 /* tun_destroy() will remove it from the tailq. */
761 tun_destroy(tp, false);
762 mtx_lock(&tunmtx);
763 }
764 mtx_unlock(&tunmtx);
765 for (i = 0; i < nitems(tuntap_drivers); ++i) {
766 drv = &tuntap_drivers[i];
767 destroy_dev_drain(&drv->cdevsw);
768 delete_unrhdr(drv->unrhdr);
769 clone_cleanup(&drv->clones);
770 }
771 NET_EPOCH_DRAIN_CALLBACKS();
772 mtx_destroy(&tunmtx);
773 }
774 SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL);
775
776 static struct tuntap_driver *
777 tuntap_driver_from_ifnet(const struct ifnet *ifp)
778 {
779 struct tuntap_driver *drv;
780 int i;
781
782 if (ifp == NULL)
783 return (NULL);
784
785 for (i = 0; i < nitems(tuntap_drivers); ++i) {
786 drv = &tuntap_drivers[i];
787 if (strcmp(ifp->if_dname, drv->cdevsw.d_name) == 0)
788 return (drv);
789 }
790
791 return (NULL);
792 }
793
794 /*
795 * Remove devices that were created by devfs cloning, as they hold references
796 * which prevent the prison from collapsing, in which state VNET sysuninits will
797 * not be invoked.
798 */
799 static int
800 tuntap_prison_remove(void *obj, void *data __unused)
801 {
802 #ifdef VIMAGE
803 struct prison *pr;
804
805 pr = obj;
806 if (prison_owns_vnet(pr)) {
807 CURVNET_SET(pr->pr_vnet);
808 for (u_int i = 0; i < NDRV; i++) {
809 if_clone_detach(V_tuntap_driver_cloners[i]);
810 V_tuntap_driver_cloners[i] = NULL;
811 }
812 CURVNET_RESTORE();
813 }
814 #endif
815 return (0);
816 }
817
818 static int
819 tuntapmodevent(module_t mod, int type, void *data)
820 {
821 struct tuntap_driver *drv;
822 int i;
823
824 switch (type) {
825 case MOD_LOAD:
826 mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF);
827 for (i = 0; i < nitems(tuntap_drivers); ++i) {
828 drv = &tuntap_drivers[i];
829 clone_setup(&drv->clones);
830 drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx);
831 }
832 osd_method_t methods[PR_MAXMETHOD] = {
833 [PR_METHOD_REMOVE] = tuntap_prison_remove,
834 };
835 tuntap_osd_jail_slot = osd_jail_register(NULL, methods);
836 arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event,
837 tunrename, 0, 1000);
838 if (arrival_tag == NULL)
839 return (ENOMEM);
840 clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000);
841 if (clone_tag == NULL)
842 return (ENOMEM);
843 break;
844 case MOD_UNLOAD:
845 /* See tun_uninit(). */
846 break;
847 default:
848 		return (EOPNOTSUPP);
849 }
850 	return (0);
851 }
852
853 static moduledata_t tuntap_mod = {
854 "if_tuntap",
855 tuntapmodevent,
856 0
857 };
858
859 /* We'll only ever have these two, so no need for a macro. */
860 static moduledata_t tun_mod = { "if_tun", NULL, 0 };
861 static moduledata_t tap_mod = { "if_tap", NULL, 0 };
862
863 DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
864 MODULE_VERSION(if_tuntap, 1);
865 DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
866 MODULE_VERSION(if_tun, 1);
867 DECLARE_MODULE(if_tap, tap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
868 MODULE_VERSION(if_tap, 1);
869
870 static int
871 tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr,
872 struct cdev **dev, const char *name)
873 {
874 struct make_dev_args args;
875 struct tuntap_softc *tp;
876 int error;
877
878 tp = malloc(sizeof(*tp), M_TUN, M_WAITOK | M_ZERO);
879 mtx_init(&tp->tun_mtx, "tun_mtx", NULL, MTX_DEF);
880 cv_init(&tp->tun_cv, "tun_condvar");
881 tp->tun_flags = drv->ident_flags;
882 tp->tun_drv = drv;
883
884 make_dev_args_init(&args);
885 if (cr != NULL)
886 args.mda_flags = MAKEDEV_REF | MAKEDEV_CHECKNAME;
887 args.mda_devsw = &drv->cdevsw;
888 args.mda_cr = cr;
889 args.mda_uid = UID_UUCP;
890 args.mda_gid = GID_DIALER;
891 args.mda_mode = 0600;
892 args.mda_unit = unit;
893 args.mda_si_drv1 = tp;
894 error = make_dev_s(&args, dev, "%s", name);
895 if (error != 0) {
896 mtx_destroy(&tp->tun_mtx);
897 cv_destroy(&tp->tun_cv);
898 free(tp, M_TUN);
899 return (error);
900 }
901
902 KASSERT((*dev)->si_drv1 != NULL,
903 ("Failed to set si_drv1 at %s creation", name));
904 tp->tun_dev = *dev;
905 knlist_init_mtx(&tp->tun_rsel.si_note, &tp->tun_mtx);
906 mtx_lock(&tunmtx);
907 TAILQ_INSERT_TAIL(&tunhead, tp, tun_list);
908 mtx_unlock(&tunmtx);
909 return (0);
910 }
911
912 static void
913 tunstart(struct ifnet *ifp)
914 {
915 struct tuntap_softc *tp = ifp->if_softc;
916 struct mbuf *m;
917
918 TUNDEBUG(ifp, "starting\n");
919 if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
920 IFQ_LOCK(&ifp->if_snd);
921 IFQ_POLL_NOLOCK(&ifp->if_snd, m);
922 if (m == NULL) {
923 IFQ_UNLOCK(&ifp->if_snd);
924 return;
925 }
926 IFQ_UNLOCK(&ifp->if_snd);
927 }
928
929 TUN_LOCK(tp);
930 if (tp->tun_flags & TUN_RWAIT) {
931 tp->tun_flags &= ~TUN_RWAIT;
932 wakeup(tp);
933 }
934 selwakeuppri(&tp->tun_rsel, PZERO);
935 KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
936 if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) {
937 TUN_UNLOCK(tp);
938 pgsigio(&tp->tun_sigio, SIGIO, 0);
939 } else
940 TUN_UNLOCK(tp);
941 }
942
943 /*
944 * tunstart_l2
945 *
946  * make packets queued by the higher layers available to readers
947 */
948 static void
949 tunstart_l2(struct ifnet *ifp)
950 {
951 struct tuntap_softc *tp = ifp->if_softc;
952
953 TUNDEBUG(ifp, "starting\n");
954
955 /*
956 	 * Do not junk pending output if we are in VMnet mode.
957 * XXX: can this do any harm because of queue overflow?
958 */
959
960 TUN_LOCK(tp);
961 if (((tp->tun_flags & TUN_VMNET) == 0) &&
962 ((tp->tun_flags & TUN_READY) != TUN_READY)) {
963 struct mbuf *m;
964
965 /* Unlocked read. */
966 TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags);
967
968 for (;;) {
969 IF_DEQUEUE(&ifp->if_snd, m);
970 if (m != NULL) {
971 m_freem(m);
972 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
973 } else
974 break;
975 }
976 TUN_UNLOCK(tp);
977
978 return;
979 }
980
981 ifp->if_drv_flags |= IFF_DRV_OACTIVE;
982
983 if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
984 if (tp->tun_flags & TUN_RWAIT) {
985 tp->tun_flags &= ~TUN_RWAIT;
986 wakeup(tp);
987 }
988
989 if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) {
990 TUN_UNLOCK(tp);
991 pgsigio(&tp->tun_sigio, SIGIO, 0);
992 TUN_LOCK(tp);
993 }
994
995 selwakeuppri(&tp->tun_rsel, PZERO);
996 KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
997 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */
998 }
999
1000 ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
1001 TUN_UNLOCK(tp);
1002 } /* tunstart_l2 */
1003
1004 static int
1005 tap_transmit(struct ifnet *ifp, struct mbuf *m)
1006 {
1007 int error;
1008
1009 BPF_MTAP(ifp, m);
1010 IFQ_HANDOFF(ifp, m, error);
1011 return (error);
1012 }
1013
1014 static void
1015 tuncreate(struct cdev *dev)
1016 {
1017 struct tuntap_driver *drv;
1018 struct tuntap_softc *tp;
1019 struct ifnet *ifp;
1020 struct ether_addr eaddr;
1021 int iflags;
1022 u_char type;
1023
1024 tp = dev->si_drv1;
1025 KASSERT(tp != NULL,
1026 ("si_drv1 should have been initialized at creation"));
1027
1028 drv = tp->tun_drv;
1029 iflags = IFF_MULTICAST;
1030 if ((tp->tun_flags & TUN_L2) != 0) {
1031 type = IFT_ETHER;
1032 iflags |= IFF_BROADCAST | IFF_SIMPLEX;
1033 } else {
1034 type = IFT_PPP;
1035 iflags |= IFF_POINTOPOINT;
1036 }
1037 ifp = tp->tun_ifp = if_alloc(type);
1038 ifp->if_softc = tp;
1039 if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev));
1040 ifp->if_ioctl = tunifioctl;
1041 ifp->if_flags = iflags;
1042 IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
1043 ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_MEXTPG;
1044 if ((tp->tun_flags & TUN_L2) != 0)
1045 ifp->if_capabilities |=
1046 IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO;
1047 ifp->if_capenable |= IFCAP_LINKSTATE | IFCAP_MEXTPG;
1048
1049 if ((tp->tun_flags & TUN_L2) != 0) {
1050 ifp->if_init = tunifinit;
1051 ifp->if_start = tunstart_l2;
1052 ifp->if_transmit = tap_transmit;
1053 ifp->if_qflush = if_qflush;
1054
1055 ether_gen_addr(ifp, &eaddr);
1056 ether_ifattach(ifp, eaddr.octet);
1057 } else {
1058 ifp->if_mtu = TUNMTU;
1059 ifp->if_start = tunstart;
1060 ifp->if_output = tunoutput;
1061
1062 ifp->if_snd.ifq_drv_maxlen = 0;
1063 IFQ_SET_READY(&ifp->if_snd);
1064
1065 if_attach(ifp);
1066 bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
1067 }
1068
1069 TUN_LOCK(tp);
1070 tp->tun_flags |= TUN_INITED;
1071 TUN_UNLOCK(tp);
1072
1073 TUNDEBUG(ifp, "interface %s is created, minor = %#x\n",
1074 ifp->if_xname, dev2unit(dev));
1075 }
1076
1077 static void
1078 tunrename(void *arg __unused, struct ifnet *ifp)
1079 {
1080 struct tuntap_softc *tp;
1081 int error;
1082
1083 if ((ifp->if_flags & IFF_RENAMING) == 0)
1084 return;
1085
1086 if (tuntap_driver_from_ifnet(ifp) == NULL)
1087 return;
1088
1089 /*
1090 * We need to grab the ioctl sx long enough to make sure the softc is
1091 * still there. If it is, we can safely try to busy the tun device.
1092 * The busy may fail if the device is currently dying, in which case
1093 * we do nothing. If it doesn't fail, the busy count stops the device
1094 * from dying until we've created the alias (that will then be
1095 * subsequently destroyed).
1096 */
1097 sx_xlock(&tun_ioctl_sx);
1098 tp = ifp->if_softc;
1099 if (tp == NULL) {
1100 sx_xunlock(&tun_ioctl_sx);
1101 return;
1102 }
1103 error = tun_busy(tp);
1104 sx_xunlock(&tun_ioctl_sx);
1105 if (error != 0)
1106 return;
1107 if (tp->tun_alias != NULL) {
1108 destroy_dev(tp->tun_alias);
1109 tp->tun_alias = NULL;
1110 }
1111
1112 if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0)
1113 goto out;
1114
1115 /*
1116 	 * Failure is OK; aliases are created on a best-effort basis. If a
1117 * tun user/consumer decides to rename the interface to conflict with
1118 * another device (non-ifnet) on the system, we will assume they know
1119 * what they are doing. make_dev_alias_p won't touch tun_alias on
1120 * failure, so we use it but ignore the return value.
1121 */
1122 make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s",
1123 ifp->if_xname);
1124 out:
1125 tun_unbusy(tp);
1126 }
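
/*
 * Illustrative example: after `ifconfig tun0 name gw0', this handler
 * creates a /dev/gw0 alias for the original /dev/tun0 node, assuming the
 * new name does not collide with an existing device.
 */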
1127
1128 static int
1129 tunopen(struct cdev *dev, int flag, int mode, struct thread *td)
1130 {
1131 struct epoch_tracker et;
1132 struct ifnet *ifp;
1133 struct tuntap_softc *tp;
1134 void *p;
1135 int error __diagused, tunflags;
1136
1137 /*
1138 * Transient tunnels do deferred destroy of the tun device but want
1139 	 * to immediately clean up state, so they clobber si_drv1 to avoid a
1140 * use-after-free in case someone does happen to open it in the interim.
1141 * We avoid using NULL to be able to distinguish from an uninitialized
1142 * cdev.
1143 *
1144 * We use the net epoch here to let a concurrent tun_destroy() schedule
1145 * freeing our tuntap_softc, in case we entered here and loaded si_drv1
1146 * before it was swapped out. If we managed to load this while it was
1147 * still a softc, then the concurrent tun_destroy() hasn't yet scheduled
1148 	 * it to be freed; that will take place sometime after the epoch we just
1149 * entered, so we can safely use it.
1150 */
1151 NET_EPOCH_ENTER(et);
1152 p = atomic_load_ptr(&dev->si_drv1);
1153 if (p == dev) {
1154 NET_EPOCH_EXIT(et);
1155 return (ENXIO);
1156 }
1157
1158 tunflags = 0;
1159 CURVNET_SET(TD_TO_VNET(td));
1160 error = tuntap_name2info(dev->si_name, NULL, &tunflags);
1161 if (error != 0) {
1162 CURVNET_RESTORE();
1163 NET_EPOCH_EXIT(et);
1164 return (error); /* Shouldn't happen */
1165 }
1166
1167 tp = p;
1168 KASSERT(tp != NULL,
1169 ("si_drv1 should have been initialized at creation"));
1170
1171 TUN_LOCK(tp);
1172 if ((tp->tun_flags & TUN_INITED) == 0) {
1173 TUN_UNLOCK(tp);
1174 CURVNET_RESTORE();
1175 NET_EPOCH_EXIT(et);
1176 return (ENXIO);
1177 }
1178 if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) {
1179 TUN_UNLOCK(tp);
1180 CURVNET_RESTORE();
1181 NET_EPOCH_EXIT(et);
1182 return (EBUSY);
1183 }
1184
1185 NET_EPOCH_EXIT(et);
1186 error = tun_busy_locked(tp);
1187 KASSERT(error == 0, ("Must be able to busy an unopen tunnel"));
1188 ifp = TUN2IFP(tp);
1189
1190 if ((tp->tun_flags & TUN_L2) != 0) {
1191 bcopy(IF_LLADDR(ifp), tp->tun_ether.octet,
1192 sizeof(tp->tun_ether.octet));
1193
1194 ifp->if_drv_flags |= IFF_DRV_RUNNING;
1195 ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
1196
1197 if (tapuponopen)
1198 ifp->if_flags |= IFF_UP;
1199 }
1200
1201 tp->tun_pid = td->td_proc->p_pid;
1202 tp->tun_flags |= TUN_OPEN;
1203
1204 if_link_state_change(ifp, LINK_STATE_UP);
1205 TUNDEBUG(ifp, "open\n");
1206 TUN_UNLOCK(tp);
1207
1208 /*
1209 * This can fail with either ENOENT or EBUSY. This is in the middle of
1210 * d_open, so ENOENT should not be possible. EBUSY is possible, but
1211 * the only cdevpriv dtor being set will be tundtor and the softc being
1212 * passed is constant for a given cdev. We ignore the possible error
1213 * because of this as either "unlikely" or "not actually a problem."
1214 */
1215 (void)devfs_set_cdevpriv(tp, tundtor);
1216 CURVNET_RESTORE();
1217 return (0);
1218 }
1219
1220 /*
1221 * tundtor - tear down the device - mark i/f down & delete
1222 * routing info
1223 */
1224 static void
1225 tundtor(void *data)
1226 {
1227 struct proc *p;
1228 struct tuntap_softc *tp;
1229 struct ifnet *ifp;
1230 bool l2tun;
1231
1232 tp = data;
1233 p = curproc;
1234 ifp = TUN2IFP(tp);
1235
1236 TUN_LOCK(tp);
1237
1238 /*
1239 * Realistically, we can't be obstinate here. This only means that the
1240 * tuntap device was closed out of order, and the last closer wasn't the
1241 * controller. These are still good to know about, though, as software
1242 * should avoid multiple processes with a tuntap device open and
1243 * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in
1244 * parent).
1245 */
1246 if (p->p_pid != tp->tun_pid) {
1247 log(LOG_INFO,
1248 "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n",
1249 p->p_pid, p->p_comm, tp->tun_dev->si_name);
1250 }
1251
1252 /*
1253 * junk all pending output
1254 */
1255 CURVNET_SET(ifp->if_vnet);
1256
1257 l2tun = false;
1258 if ((tp->tun_flags & TUN_L2) != 0) {
1259 l2tun = true;
1260 IF_DRAIN(&ifp->if_snd);
1261 } else {
1262 IFQ_PURGE(&ifp->if_snd);
1263 }
1264
1265 /* For vmnet, we won't do most of the address/route bits */
1266 if ((tp->tun_flags & TUN_VMNET) != 0 ||
1267 (l2tun && (ifp->if_flags & IFF_LINK0) != 0))
1268 goto out;
1269 #if defined(INET) || defined(INET6)
1270 if (l2tun && tp->tun_lro_ready) {
1271 		TUNDEBUG(ifp, "LRO disabled\n");
1272 tcp_lro_free(&tp->tun_lro);
1273 tp->tun_lro_ready = false;
1274 }
1275 #endif
1276 if (ifp->if_flags & IFF_UP) {
1277 TUN_UNLOCK(tp);
1278 if_down(ifp);
1279 TUN_LOCK(tp);
1280 }
1281
1282 /* Delete all addresses and routes which reference this interface. */
1283 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1284 ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1285 TUN_UNLOCK(tp);
1286 if_purgeaddrs(ifp);
1287 TUN_LOCK(tp);
1288 }
1289
1290 out:
1291 if_link_state_change(ifp, LINK_STATE_DOWN);
1292 CURVNET_RESTORE();
1293
1294 funsetown(&tp->tun_sigio);
1295 selwakeuppri(&tp->tun_rsel, PZERO);
1296 KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
1297 	TUNDEBUG(ifp, "closed\n");
1298 tp->tun_flags &= ~TUN_OPEN;
1299 tp->tun_pid = 0;
1300 tun_vnethdr_set(ifp, 0);
1301
1302 tun_unbusy_locked(tp);
1303 if ((tp->tun_flags & TUN_TRANSIENT) != 0) {
1304 struct if_clone *cloner;
1305 int error __diagused;
1306
1307 /* Mark it busy so that nothing can re-open it. */
1308 tp->tun_flags |= TUN_DYING;
1309 TUN_UNLOCK(tp);
1310
1311 CURVNET_SET_QUIET(ifp->if_home_vnet);
1312 cloner = tuntap_cloner_from_flags(tp->tun_flags);
1313 CURVNET_RESTORE();
1314
1315 error = if_clone_destroyif(cloner, ifp);
1316 MPASS(error == 0 || error == EINTR || error == ERESTART);
1317 return;
1318 }
1319
1320 TUN_UNLOCK(tp);
1321 }
1322
1323 static void
1324 tuninit(struct ifnet *ifp)
1325 {
1326 struct tuntap_softc *tp = ifp->if_softc;
1327
1328 TUNDEBUG(ifp, "tuninit\n");
1329
1330 TUN_LOCK(tp);
1331 ifp->if_drv_flags |= IFF_DRV_RUNNING;
1332 if ((tp->tun_flags & TUN_L2) == 0) {
1333 ifp->if_flags |= IFF_UP;
1334 getmicrotime(&ifp->if_lastchange);
1335 TUN_UNLOCK(tp);
1336 } else {
1337 #if defined(INET) || defined(INET6)
1338 if (tcp_lro_init(&tp->tun_lro) == 0) {
1339 TUNDEBUG(ifp, "LRO enabled\n");
1340 tp->tun_lro.ifp = ifp;
1341 tp->tun_lro_ready = true;
1342 } else {
1343 TUNDEBUG(ifp, "Could not enable LRO\n");
1344 tp->tun_lro_ready = false;
1345 }
1346 #endif
1347 ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
1348 TUN_UNLOCK(tp);
1349 /* attempt to start output */
1350 tunstart_l2(ifp);
1351 }
1352
1353 }
1354
1355 /*
1356 * Used only for l2 tunnel.
1357 */
1358 static void
1359 tunifinit(void *xtp)
1360 {
1361 struct tuntap_softc *tp;
1362
1363 tp = (struct tuntap_softc *)xtp;
1364 tuninit(tp->tun_ifp);
1365 }
1366
1367 /*
1368 * To be called under TUN_LOCK. Update ifp->if_hwassist according to the
1369 * current value of ifp->if_capenable.
1370 */
1371 static void
1372 tun_caps_changed(struct ifnet *ifp)
1373 {
1374 uint64_t hwassist = 0;
1375
1376 TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc);
1377 if (ifp->if_capenable & IFCAP_TXCSUM)
1378 hwassist |= CSUM_TCP | CSUM_UDP;
1379 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1380 hwassist |= CSUM_TCP_IPV6
1381 | CSUM_UDP_IPV6;
1382 if (ifp->if_capenable & IFCAP_TSO4)
1383 hwassist |= CSUM_IP_TSO;
1384 if (ifp->if_capenable & IFCAP_TSO6)
1385 hwassist |= CSUM_IP6_TSO;
1386 ifp->if_hwassist = hwassist;
1387 }
1388
1389 /*
1390 * To be called under TUN_LOCK. Update tp->tun_vhdrlen and adjust
1391 * if_capabilities and if_capenable as needed.
1392 */
1393 static void
1394 tun_vnethdr_set(struct ifnet *ifp, int vhdrlen)
1395 {
1396 struct tuntap_softc *tp = ifp->if_softc;
1397
1398 TUN_LOCK_ASSERT(tp);
1399
1400 if (tp->tun_vhdrlen == vhdrlen)
1401 return;
1402
1403 /*
1404 * Update if_capabilities to reflect the
1405 * functionalities offered by the virtio-net
1406 * header.
1407 */
1408 if (vhdrlen != 0)
1409 ifp->if_capabilities |=
1410 TAP_VNET_HDR_CAPS;
1411 else
1412 ifp->if_capabilities &=
1413 ~TAP_VNET_HDR_CAPS;
1414 /*
1415 * Disable any capabilities that we don't
1416 * support anymore.
1417 */
1418 ifp->if_capenable &= ifp->if_capabilities;
1419 tun_caps_changed(ifp);
1420 tp->tun_vhdrlen = vhdrlen;
1421
1422 TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n",
1423 vhdrlen, ifp->if_capabilities);
1424 }
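
/*
 * Illustrative userland sketch, assuming a tap descriptor fd: the header
 * length is negotiated with the TAPSVNETHDR ioctl, which flips the
 * capabilities as described above, e.g.:
 *
 *	int vhdrlen = sizeof(struct virtio_net_hdr);
 *
 *	ioctl(fd, TAPSVNETHDR, &vhdrlen);
 */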
1425
1426 /*
1427 * Process an ioctl request.
1428 */
1429 static int
1430 tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1431 {
1432 struct ifreq *ifr = (struct ifreq *)data;
1433 struct tuntap_softc *tp;
1434 struct ifstat *ifs;
1435 struct ifmediareq *ifmr;
1436 int dummy, error = 0;
1437 bool l2tun;
1438
1439 ifmr = NULL;
1440 sx_xlock(&tun_ioctl_sx);
1441 tp = ifp->if_softc;
1442 if (tp == NULL) {
1443 error = ENXIO;
1444 goto bad;
1445 }
1446 l2tun = (tp->tun_flags & TUN_L2) != 0;
1447 	switch (cmd) {
1448 case SIOCGIFSTATUS:
1449 ifs = (struct ifstat *)data;
1450 TUN_LOCK(tp);
1451 if (tp->tun_pid)
1452 snprintf(ifs->ascii, sizeof(ifs->ascii),
1453 "\tOpened by PID %d\n", tp->tun_pid);
1454 else
1455 ifs->ascii[0] = '\0';
1456 TUN_UNLOCK(tp);
1457 break;
1458 case SIOCSIFADDR:
1459 if (l2tun)
1460 error = ether_ioctl(ifp, cmd, data);
1461 else
1462 tuninit(ifp);
1463 if (error == 0)
1464 TUNDEBUG(ifp, "address set\n");
1465 break;
1466 case SIOCSIFMTU:
1467 ifp->if_mtu = ifr->ifr_mtu;
1468 TUNDEBUG(ifp, "mtu set\n");
1469 break;
1470 case SIOCSIFFLAGS:
1471 case SIOCADDMULTI:
1472 case SIOCDELMULTI:
1473 break;
1474 case SIOCGIFMEDIA:
1475 if (!l2tun) {
1476 error = EINVAL;
1477 break;
1478 }
1479
1480 ifmr = (struct ifmediareq *)data;
1481 dummy = ifmr->ifm_count;
1482 ifmr->ifm_count = 1;
1483 ifmr->ifm_status = IFM_AVALID;
1484 ifmr->ifm_active = IFM_ETHER | IFM_FDX | IFM_1000_T;
1485 if (tp->tun_flags & TUN_OPEN)
1486 ifmr->ifm_status |= IFM_ACTIVE;
1487 ifmr->ifm_current = ifmr->ifm_active;
1488 if (dummy >= 1) {
1489 int media = IFM_ETHER;
1490 error = copyout(&media, ifmr->ifm_ulist, sizeof(int));
1491 }
1492 break;
1493 case SIOCSIFCAP:
1494 TUN_LOCK(tp);
1495 ifp->if_capenable = ifr->ifr_reqcap;
1496 tun_caps_changed(ifp);
1497 TUN_UNLOCK(tp);
1498 VLAN_CAPABILITIES(ifp);
1499 break;
1500 default:
1501 if (l2tun) {
1502 error = ether_ioctl(ifp, cmd, data);
1503 } else {
1504 error = EINVAL;
1505 }
1506 }
1507 bad:
1508 sx_xunlock(&tun_ioctl_sx);
1509 return (error);
1510 }
1511
1512 /*
1513 * tunoutput - queue packets from higher level ready to put out.
1514 */
1515 static int
1516 tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst,
1517 struct route *ro)
1518 {
1519 struct tuntap_softc *tp = ifp->if_softc;
1520 u_short cached_tun_flags;
1521 int error;
1522 u_int32_t af;
1523
1524 	TUNDEBUG(ifp, "tunoutput\n");
1525
1526 #ifdef MAC
1527 error = mac_ifnet_check_transmit(ifp, m0);
1528 if (error) {
1529 m_freem(m0);
1530 return (error);
1531 }
1532 #endif
1533
1534 /* Could be unlocked read? */
1535 TUN_LOCK(tp);
1536 cached_tun_flags = tp->tun_flags;
1537 TUN_UNLOCK(tp);
1538 if ((cached_tun_flags & TUN_READY) != TUN_READY) {
1539 		TUNDEBUG(ifp, "not ready 0%o\n", tp->tun_flags);
1540 		m_freem(m0);
1541 return (EHOSTDOWN);
1542 }
1543
1544 if ((ifp->if_flags & IFF_UP) != IFF_UP) {
1545 		m_freem(m0);
1546 return (EHOSTDOWN);
1547 }
1548
1549 /* BPF writes need to be handled specially. */
1550 if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT)
1551 bcopy(dst->sa_data, &af, sizeof(af));
1552 else
1553 af = RO_GET_FAMILY(ro, dst);
1554
1555 BPF_MTAP2(ifp, &af, sizeof(af), m0);
1556
1557 	/* Prepend the sockaddr if requested; drop the packet if allocation fails. */
1558 if (cached_tun_flags & TUN_LMODE) {
1559 /* allocate space for sockaddr */
1560 M_PREPEND(m0, dst->sa_len, M_NOWAIT);
1561
1562 /* if allocation failed drop packet */
1563 if (m0 == NULL) {
1564 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1565 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1566 return (ENOBUFS);
1567 } else {
1568 bcopy(dst, m0->m_data, dst->sa_len);
1569 }
1570 }
1571
1572 if (cached_tun_flags & TUN_IFHEAD) {
1573 /* Prepend the address family */
1574 M_PREPEND(m0, 4, M_NOWAIT);
1575
1576 /* if allocation failed drop packet */
1577 if (m0 == NULL) {
1578 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
1579 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1580 return (ENOBUFS);
1581 } else
1582 *(u_int32_t *)m0->m_data = htonl(af);
1583 } else {
1584 #ifdef INET
1585 if (af != AF_INET)
1586 #endif
1587 {
1588 m_freem(m0);
1589 return (EAFNOSUPPORT);
1590 }
1591 }
1592
1593 error = (ifp->if_transmit)(ifp, m0);
1594 if (error)
1595 return (ENOBUFS);
1596 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
1597 return (0);
1598 }
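
/*
 * Illustrative summary of the per-packet framing a reader then sees,
 * depending on the mode selected via ioctl:
 *	default:	the raw (IPv4) packet only;
 *	TUNSIFHEAD:	a 4-byte address family in network order, then the
 *			packet;
 *	TUNSLMODE:	the destination struct sockaddr, then the packet.
 */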
1599
1600 /*
1601 * the cdevsw interface is now pretty minimal.
1602 */
1603 static int
1604 tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
1605 struct thread *td)
1606 {
1607 struct ifreq ifr, *ifrp;
1608 struct tuntap_softc *tp = dev->si_drv1;
1609 struct ifnet *ifp = TUN2IFP(tp);
1610 struct tuninfo *tunp;
1611 int error, iflags, ival;
1612 bool l2tun;
1613
1614 l2tun = (tp->tun_flags & TUN_L2) != 0;
1615 if (l2tun) {
1616 /* tap specific ioctls */
1617 		switch (cmd) {
1618 		/* VMware/VMnet port ioctls */
1619 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
1620 defined(COMPAT_FREEBSD4)
1621 case _IO('V', 0):
1622 ival = IOCPARM_IVAL(data);
1623 data = (caddr_t)&ival;
1624 /* FALLTHROUGH */
1625 #endif
1626 case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */
1627 iflags = *(int *)data;
1628 iflags &= TUN_VMIO_FLAG_MASK;
1629 iflags &= ~IFF_CANTCHANGE;
1630 iflags |= IFF_UP;
1631
1632 TUN_LOCK(tp);
1633 ifp->if_flags = iflags |
1634 (ifp->if_flags & IFF_CANTCHANGE);
1635 TUN_UNLOCK(tp);
1636
1637 return (0);
1638 case SIOCGIFADDR: /* get MAC address of the remote side */
1639 TUN_LOCK(tp);
1640 bcopy(&tp->tun_ether.octet, data,
1641 sizeof(tp->tun_ether.octet));
1642 TUN_UNLOCK(tp);
1643
1644 return (0);
1645 case SIOCSIFADDR: /* set MAC address of the remote side */
1646 TUN_LOCK(tp);
1647 bcopy(data, &tp->tun_ether.octet,
1648 sizeof(tp->tun_ether.octet));
1649 TUN_UNLOCK(tp);
1650
1651 return (0);
1652 case TAPSVNETHDR:
1653 ival = *(int *)data;
1654 if (ival != 0 &&
1655 ival != sizeof(struct virtio_net_hdr) &&
1656 ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
1657 return (EINVAL);
1658 }
1659 TUN_LOCK(tp);
1660 tun_vnethdr_set(ifp, ival);
1661 TUN_UNLOCK(tp);
1662
1663 return (0);
1664 case TAPGVNETHDR:
1665 TUN_LOCK(tp);
1666 *(int *)data = tp->tun_vhdrlen;
1667 TUN_UNLOCK(tp);
1668
1669 return (0);
1670 }
1671
1672 /* Fall through to the common ioctls if unhandled */
1673 } else {
1674 switch (cmd) {
1675 case TUNSLMODE:
1676 TUN_LOCK(tp);
1677 if (*(int *)data) {
1678 tp->tun_flags |= TUN_LMODE;
1679 tp->tun_flags &= ~TUN_IFHEAD;
1680 } else
1681 tp->tun_flags &= ~TUN_LMODE;
1682 TUN_UNLOCK(tp);
1683
1684 return (0);
1685 case TUNSIFHEAD:
1686 TUN_LOCK(tp);
1687 if (*(int *)data) {
1688 tp->tun_flags |= TUN_IFHEAD;
1689 tp->tun_flags &= ~TUN_LMODE;
1690 } else
1691 tp->tun_flags &= ~TUN_IFHEAD;
1692 TUN_UNLOCK(tp);
1693
1694 return (0);
1695 case TUNGIFHEAD:
1696 TUN_LOCK(tp);
1697 *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0;
1698 TUN_UNLOCK(tp);
1699
1700 return (0);
1701 case TUNSIFMODE:
1702 /* deny this if UP */
1703 if (TUN2IFP(tp)->if_flags & IFF_UP)
1704 return (EBUSY);
1705
1706 switch (*(int *)data & ~IFF_MULTICAST) {
1707 case IFF_POINTOPOINT:
1708 case IFF_BROADCAST:
1709 TUN_LOCK(tp);
1710 TUN2IFP(tp)->if_flags &=
1711 ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST);
1712 TUN2IFP(tp)->if_flags |= *(int *)data;
1713 TUN_UNLOCK(tp);
1714
1715 break;
1716 default:
1717 return (EINVAL);
1718 }
1719
1720 return (0);
1721 case TUNSIFPID:
1722 TUN_LOCK(tp);
1723 tp->tun_pid = curthread->td_proc->p_pid;
1724 TUN_UNLOCK(tp);
1725
1726 return (0);
1727 }
1728 /* Fall through to the common ioctls if unhandled */
1729 }
1730
1731 switch (cmd) {
1732 case TUNGIFNAME:
1733 ifrp = (struct ifreq *)data;
1734 strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ);
1735
1736 return (0);
1737 case TUNSIFINFO:
1738 tunp = (struct tuninfo *)data;
1739 if (TUN2IFP(tp)->if_type != tunp->type)
1740 return (EPROTOTYPE);
1741 TUN_LOCK(tp);
1742 if (TUN2IFP(tp)->if_mtu != tunp->mtu) {
1743 strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ);
1744 ifr.ifr_mtu = tunp->mtu;
1745 CURVNET_SET(TUN2IFP(tp)->if_vnet);
1746 error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp),
1747 (caddr_t)&ifr, td);
1748 CURVNET_RESTORE();
1749 if (error) {
1750 TUN_UNLOCK(tp);
1751 return (error);
1752 }
1753 }
1754 TUN2IFP(tp)->if_baudrate = tunp->baudrate;
1755 TUN_UNLOCK(tp);
1756 break;
1757 case TUNGIFINFO:
1758 tunp = (struct tuninfo *)data;
1759 TUN_LOCK(tp);
1760 tunp->mtu = TUN2IFP(tp)->if_mtu;
1761 tunp->type = TUN2IFP(tp)->if_type;
1762 tunp->baudrate = TUN2IFP(tp)->if_baudrate;
1763 TUN_UNLOCK(tp);
1764 break;
1765 case TUNSDEBUG:
1766 tundebug = *(int *)data;
1767 break;
1768 case TUNGDEBUG:
1769 *(int *)data = tundebug;
1770 break;
1771 case TUNSTRANSIENT:
1772 TUN_LOCK(tp);
1773 if (*(int *)data)
1774 tp->tun_flags |= TUN_TRANSIENT;
1775 else
1776 tp->tun_flags &= ~TUN_TRANSIENT;
1777 TUN_UNLOCK(tp);
1778 break;
1779 case TUNGTRANSIENT:
1780 TUN_LOCK(tp);
1781 *(int *)data = (tp->tun_flags & TUN_TRANSIENT) != 0;
1782 TUN_UNLOCK(tp);
1783 break;
1784 case FIONBIO:
1785 break;
1786 case FIOASYNC:
1787 TUN_LOCK(tp);
1788 if (*(int *)data)
1789 tp->tun_flags |= TUN_ASYNC;
1790 else
1791 tp->tun_flags &= ~TUN_ASYNC;
1792 TUN_UNLOCK(tp);
1793 break;
1794 case FIONREAD:
1795 if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) {
1796 struct mbuf *mb;
1797 IFQ_LOCK(&TUN2IFP(tp)->if_snd);
1798 IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb);
1799 for (*(int *)data = 0; mb != NULL; mb = mb->m_next)
1800 *(int *)data += mb->m_len;
1801 IFQ_UNLOCK(&TUN2IFP(tp)->if_snd);
1802 } else
1803 *(int *)data = 0;
1804 break;
1805 case FIOSETOWN:
1806 return (fsetown(*(int *)data, &tp->tun_sigio));
1807
1808 case FIOGETOWN:
1809 *(int *)data = fgetown(&tp->tun_sigio);
1810 return (0);
1811
1812 /* This is deprecated, FIOSETOWN should be used instead. */
1813 case TIOCSPGRP:
1814 return (fsetown(-(*(int *)data), &tp->tun_sigio));
1815
1816 /* This is deprecated, FIOGETOWN should be used instead. */
1817 case TIOCGPGRP:
1818 *(int *)data = -fgetown(&tp->tun_sigio);
1819 return (0);
1820
1821 default:
1822 return (ENOTTY);
1823 }
1824 return (0);
1825 }
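
/*
 * Illustrative userland sketch: selecting the address-family-prefixed
 * framing handled above, e.g.:
 *
 *	int one = 1;
 *
 *	ioctl(fd, TUNSIFHEAD, &one);
 */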
1826
1827 /*
1828 * The cdevsw read interface - reads a packet at a time, or at
1829 * least as much of a packet as can be read.
1830 */
1831 static int
1832 tunread(struct cdev *dev, struct uio *uio, int flag)
1833 {
1834 struct tuntap_softc *tp = dev->si_drv1;
1835 struct ifnet *ifp = TUN2IFP(tp);
1836 struct mbuf *m;
1837 size_t len;
1838 int error = 0;
1839
1840 	TUNDEBUG(ifp, "read\n");
1841 TUN_LOCK(tp);
1842 if ((tp->tun_flags & TUN_READY) != TUN_READY) {
1843 TUN_UNLOCK(tp);
1844 		TUNDEBUG(ifp, "not ready 0%o\n", tp->tun_flags);
1845 return (EHOSTDOWN);
1846 }
1847
1848 tp->tun_flags &= ~TUN_RWAIT;
1849
1850 for (;;) {
1851 IFQ_DEQUEUE(&ifp->if_snd, m);
1852 if (m != NULL)
1853 break;
1854 if (flag & O_NONBLOCK) {
1855 TUN_UNLOCK(tp);
1856 return (EWOULDBLOCK);
1857 }
1858 tp->tun_flags |= TUN_RWAIT;
1859 error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | PZERO,
1860 "tunread", 0);
1861 if (error != 0) {
1862 TUN_UNLOCK(tp);
1863 return (error);
1864 }
1865 }
1866 TUN_UNLOCK(tp);
1867
1868 len = min(tp->tun_vhdrlen, uio->uio_resid);
1869 if (len > 0) {
1870 struct virtio_net_hdr_mrg_rxbuf vhdr;
1871
1872 bzero(&vhdr, sizeof(vhdr));
1873 if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) {
1874 m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr);
1875 }
1876
1877 TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
1878 "gs %u, cs %u, co %u\n", vhdr.hdr.flags,
1879 vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
1880 vhdr.hdr.gso_size, vhdr.hdr.csum_start,
1881 vhdr.hdr.csum_offset);
1882 error = uiomove(&vhdr, len, uio);
1883 }
1884 if (error == 0)
1885 error = m_mbuftouio(uio, m, 0);
1886 m_freem(m);
1887 return (error);
1888 }
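
/*
 * Illustrative userland sketch: each read() returns at most one packet,
 * so the buffer should cover the MTU plus any configured virtio-net
 * header, e.g.:
 *
 *	char buf[TUNMRU];
 *	ssize_t n;
 *
 *	n = read(fd, buf, sizeof(buf));
 */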
1889
1890 static int
1891 tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m,
1892 struct virtio_net_hdr_mrg_rxbuf *vhdr)
1893 {
1894 struct epoch_tracker et;
1895 struct ether_header *eh;
1896 struct ifnet *ifp;
1897
1898 ifp = TUN2IFP(tp);
1899
1900 /*
1901 	 * Only pass a unicast frame to ether_input() if it would
1902 * actually have been received by non-virtual hardware.
1903 */
1904 if (m->m_len < sizeof(struct ether_header)) {
1905 m_freem(m);
1906 return (0);
1907 }
1908
1909 eh = mtod(m, struct ether_header *);
1910
1911 if ((ifp->if_flags & IFF_PROMISC) == 0 &&
1912 !ETHER_IS_MULTICAST(eh->ether_dhost) &&
1913 bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) {
1914 m_freem(m);
1915 return (0);
1916 }
1917
1918 if (vhdr != NULL) {
1919 if (virtio_net_rx_csum(m, &vhdr->hdr)) {
1920 m_freem(m);
1921 return (0);
1922 }
1923 } else {
1924 switch (ntohs(eh->ether_type)) {
1925 #ifdef INET
1926 case ETHERTYPE_IP:
1927 if (ifp->if_capenable & IFCAP_RXCSUM) {
1928 m->m_pkthdr.csum_flags |=
1929 CSUM_IP_CHECKED | CSUM_IP_VALID |
1930 CSUM_DATA_VALID | CSUM_SCTP_VALID |
1931 CSUM_PSEUDO_HDR;
1932 m->m_pkthdr.csum_data = 0xffff;
1933 }
1934 break;
1935 #endif
1936 #ifdef INET6
1937 case ETHERTYPE_IPV6:
1938 if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
1939 m->m_pkthdr.csum_flags |=
1940 CSUM_DATA_VALID_IPV6 | CSUM_SCTP_VALID |
1941 CSUM_PSEUDO_HDR;
1942 m->m_pkthdr.csum_data = 0xffff;
1943 }
1944 break;
1945 #endif
1946 }
1947 }
1948
1949 /* Pass packet up to parent. */
1950 CURVNET_SET(ifp->if_vnet);
1951 NET_EPOCH_ENTER(et);
1952 #if defined(INET) || defined(INET6)
1953 if (tp->tun_lro_ready && ifp->if_capenable & IFCAP_LRO &&
1954 tcp_lro_rx(&tp->tun_lro, m, 0) == 0)
1955 tcp_lro_flush_all(&tp->tun_lro);
1956 else
1957 #endif
1958 (*ifp->if_input)(ifp, m);
1959 NET_EPOCH_EXIT(et);
1960 CURVNET_RESTORE();
1961 /* ibytes are counted in parent */
1962 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
1963 return (0);
1964 }
1965
1966 static int
1967 tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m)
1968 {
1969 struct epoch_tracker et;
1970 struct ifnet *ifp;
1971 int family, isr;
1972
1973 ifp = TUN2IFP(tp);
1974 /* Could be unlocked read? */
1975 TUN_LOCK(tp);
1976 if (tp->tun_flags & TUN_IFHEAD) {
1977 TUN_UNLOCK(tp);
1978 if (m->m_len < sizeof(family) &&
1979 (m = m_pullup(m, sizeof(family))) == NULL)
1980 return (ENOBUFS);
1981 family = ntohl(*mtod(m, u_int32_t *));
1982 m_adj(m, sizeof(family));
1983 } else {
1984 TUN_UNLOCK(tp);
1985 family = AF_INET;
1986 }
1987
1988 BPF_MTAP2(ifp, &family, sizeof(family), m);
1989
1990 switch (family) {
1991 #ifdef INET
1992 case AF_INET:
1993 isr = NETISR_IP;
1994 break;
1995 #endif
1996 #ifdef INET6
1997 case AF_INET6:
1998 isr = NETISR_IPV6;
1999 break;
2000 #endif
2001 default:
2002 m_freem(m);
2003 return (EAFNOSUPPORT);
2004 }
2005 random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN);
2006 if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
2007 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2008 CURVNET_SET(ifp->if_vnet);
2009 M_SETFIB(m, ifp->if_fib);
2010 NET_EPOCH_ENTER(et);
2011 netisr_dispatch(isr, m);
2012 NET_EPOCH_EXIT(et);
2013 CURVNET_RESTORE();
2014 return (0);
2015 }
2016
2017 /*
2018 * the cdevsw write interface - an atomic write is a packet - or else!
2019 */
2020 static int
2021 tunwrite(struct cdev *dev, struct uio *uio, int flag)
2022 {
2023 struct virtio_net_hdr_mrg_rxbuf vhdr;
2024 struct tuntap_softc *tp;
2025 struct ifnet *ifp;
2026 struct mbuf *m;
2027 uint32_t mru;
2028 int align, vhdrlen, error;
2029 bool l2tun;
2030
2031 tp = dev->si_drv1;
2032 ifp = TUN2IFP(tp);
2033 TUNDEBUG(ifp, "tunwrite\n");
2034 if ((ifp->if_flags & IFF_UP) != IFF_UP)
2035 /* ignore silently */
2036 return (0);
2037
2038 if (uio->uio_resid == 0)
2039 return (0);
2040
2041 l2tun = (tp->tun_flags & TUN_L2) != 0;
2042 mru = l2tun ? TAPMRU : TUNMRU;
2043 vhdrlen = tp->tun_vhdrlen;
2044 align = 0;
2045 if (l2tun) {
2046 align = ETHER_ALIGN;
2047 mru += vhdrlen;
2048 } else if ((tp->tun_flags & TUN_IFHEAD) != 0)
2049 mru += sizeof(uint32_t); /* family */
2050 if (uio->uio_resid < 0 || uio->uio_resid > mru) {
2051 TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid);
2052 return (EIO);
2053 }
2054
2055 if (vhdrlen > 0) {
2056 error = uiomove(&vhdr, vhdrlen, uio);
2057 if (error != 0)
2058 return (error);
2059 TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, "
2060 "gs %u, cs %u, co %u\n", vhdr.hdr.flags,
2061 vhdr.hdr.gso_type, vhdr.hdr.hdr_len,
2062 vhdr.hdr.gso_size, vhdr.hdr.csum_start,
2063 vhdr.hdr.csum_offset);
2064 }
2065
2066 if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) {
2067 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2068 return (ENOBUFS);
2069 }
2070
2071 m->m_pkthdr.rcvif = ifp;
2072 #ifdef MAC
2073 mac_ifnet_create_mbuf(ifp, m);
2074 #endif
2075
2076 if (l2tun)
2077 return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL));
2078
2079 return (tunwrite_l3(tp, m));
2080 }
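
/*
 * Illustrative userland sketch: a write() must carry exactly one packet;
 * with TUNSIFHEAD enabled, the address family is prepended (pkt/pktlen
 * below are assumed to hold the payload), e.g.:
 *
 *	uint32_t af = htonl(AF_INET6);
 *	struct iovec iov[2] = {
 *		{ .iov_base = &af, .iov_len = sizeof(af) },
 *		{ .iov_base = pkt, .iov_len = pktlen },
 *	};
 *
 *	writev(fd, iov, 2);
 */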
2081
2082 /*
2083  * tunpoll - the poll interface; this is really only useful for reads.
2084  * The write detect always returns true: writes never block, since a
2085  * packet is either accepted or dropped.
2086 */
2087 static int
2088 tunpoll(struct cdev *dev, int events, struct thread *td)
2089 {
2090 struct tuntap_softc *tp = dev->si_drv1;
2091 struct ifnet *ifp = TUN2IFP(tp);
2092 int revents = 0;
2093
2094 TUNDEBUG(ifp, "tunpoll\n");
2095
2096 if (events & (POLLIN | POLLRDNORM)) {
2097 IFQ_LOCK(&ifp->if_snd);
2098 if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
2099 TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len);
2100 revents |= events & (POLLIN | POLLRDNORM);
2101 } else {
2102 TUNDEBUG(ifp, "tunpoll waiting\n");
2103 selrecord(td, &tp->tun_rsel);
2104 }
2105 IFQ_UNLOCK(&ifp->if_snd);
2106 }
2107 revents |= events & (POLLOUT | POLLWRNORM);
2108
2109 return (revents);
2110 }
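
/*
 * Illustrative userland sketch: waiting for the next packet with
 * poll(2), e.g.:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	poll(&pfd, 1, -1);
 */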
2111
2112 /*
2113 * tunkqfilter - support for the kevent() system call.
2114 */
2115 static int
2116 tunkqfilter(struct cdev *dev, struct knote *kn)
2117 {
2118 struct tuntap_softc *tp = dev->si_drv1;
2119 struct ifnet *ifp = TUN2IFP(tp);
2120
2121 	switch (kn->kn_filter) {
2122 case EVFILT_READ:
2123 TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n",
2124 ifp->if_xname, dev2unit(dev));
2125 kn->kn_fop = &tun_read_filterops;
2126 break;
2127
2128 case EVFILT_WRITE:
2129 TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n",
2130 ifp->if_xname, dev2unit(dev));
2131 kn->kn_fop = &tun_write_filterops;
2132 break;
2133
2134 default:
2135 TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n",
2136 ifp->if_xname, dev2unit(dev));
2137 		return (EINVAL);
2138 }
2139
2140 kn->kn_hook = tp;
2141 knlist_add(&tp->tun_rsel.si_note, kn, 0);
2142
2143 return (0);
2144 }
2145
2146 /*
2147  * Return true if there is data in the interface queue.
2148 */
2149 static int
2150 tunkqread(struct knote *kn, long hint)
2151 {
2152 int ret;
2153 struct tuntap_softc *tp = kn->kn_hook;
2154 struct cdev *dev = tp->tun_dev;
2155 struct ifnet *ifp = TUN2IFP(tp);
2156
2157 if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) {
2158 TUNDEBUG(ifp,
2159 "%s have data in the queue. Len = %d, minor = %#x\n",
2160 ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev));
2161 ret = 1;
2162 } else {
2163 TUNDEBUG(ifp,
2164 "%s waiting for data, minor = %#x\n", ifp->if_xname,
2165 dev2unit(dev));
2166 ret = 0;
2167 }
2168
2169 return (ret);
2170 }
2171
2172 /*
2173  * Writing is always possible; always return the MTU in kn->kn_data.
2174 */
2175 static int
2176 tunkqwrite(struct knote *kn, long hint)
2177 {
2178 struct tuntap_softc *tp = kn->kn_hook;
2179 struct ifnet *ifp = TUN2IFP(tp);
2180
2181 kn->kn_data = ifp->if_mtu;
2182
2183 return (1);
2184 }
2185
2186 static void
2187 tunkqdetach(struct knote *kn)
2188 {
2189 struct tuntap_softc *tp = kn->kn_hook;
2190
2191 knlist_remove(&tp->tun_rsel.si_note, kn, 0);
2192 }
2193