/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * $FreeBSD$
 *
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    a select()able file descriptor on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *    (A minimal usage sketch is given below, after the SYNCHRONIZATION
 *    notes.)
 *

 SYNCHRONIZATION (USER)

 The netmap rings and data structures may be shared among multiple
 user threads or even independent processes.
 Any synchronization among those threads/processes is delegated
 to the threads themselves. Only one thread at a time can be in
 a system call on the same netmap ring. The OS does not enforce
 this and only guarantees against system crashes in case of
 invalid usage.
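
 A minimal sketch of the access sequence above, using the netmap(4)
 userspace API from <net/netmap_user.h> (error handling omitted; the
 interface name "em0" is only an example):

	struct nmreq req;
	struct netmap_if *nifp;
	struct netmap_ring *txring;
	char *mem;
	int fd;

	fd = open("/dev/netmap", O_RDWR);
	bzero(&req, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	ioctl(fd, NIOCREGIF, &req);			(step 2)
	mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE,
		MAP_SHARED, fd, 0);			(step 3)
	nifp = NETMAP_IF(mem, req.nr_offset);
	txring = NETMAP_TXRING(nifp, 0);		(step 4)
	... fill tx slots, advance txring->head and txring->cur ...
	ioctl(fd, NIOCTXSYNC, NULL);			(step 5)
	poll() or select() on fd to wait for more room	(step 6)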

 LOCKING (INTERNAL)

 Within the kernel, access to the netmap rings is protected as follows:

 - a spinlock on each ring, to handle producer/consumer races on
   RX rings attached to the host stack (against multiple host
   threads writing from the host stack to the same ring),
   and on 'destination' rings attached to a VALE switch
   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
   protecting multiple active senders for the same destination.

 - an atomic variable to guarantee that there is at most one
   instance of *_*xsync() on the ring at any time.
   For rings connected to user file descriptors, an
   atomic_test_and_set() protects this, and the lock on the ring
   is not actually used.
   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
   is also used to prevent multiple executions (the driver might indeed
   already guarantee this).
   For NIC TX rings connected to a VALE switch, the lock arbitrates
   access to the queue (both when allocating buffers and when pushing
   them out).

 - *xsync() should be protected against initializations of the card.
   On FreeBSD most devices have the reset routine protected by
   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
   the RING protection on rx_reset(); this should be added.

   On linux there is an external lock on the tx path, which probably
   also arbitrates access to the reset routine. XXX to be revised

 - a per-interface core_lock protecting access from the host stack
   while interfaces may be detached from netmap mode.
   XXX there should be no need for this lock if we detach the interfaces
   only while they are down.


 --- VALE SWITCH ---

 NMG_LOCK() serializes all modifications to switches and ports.
 A switch cannot be deleted until all ports are gone.

 For each switch, an SX lock (RW lock on linux) protects
 deletion of ports. When configuring or deleting a port, the
 lock is acquired in exclusive mode (after holding NMG_LOCK).
 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
 The lock is held throughout the entire forwarding cycle,
 during which the thread may incur a page fault.
 Hence it is important that sleepable shared locks are used.

 On the rx ring, the per-port lock is grabbed initially to reserve
 a number of slots in the ring, then the lock is released,
 packets are copied from source to destination, and then
 the lock is acquired again and the receive ring is updated.
 (A similar thing is done on the tx ring for NIC and host stack
 ports attached to the switch.)

 */

/*
 * OS-specific code that is used only within this file.
129 * Other OS-specific code that must be accessed by drivers 130 * is present in netmap_kern.h 131 */ 132 133 #if defined(__FreeBSD__) 134 #include <sys/cdefs.h> /* prerequisite */ 135 #include <sys/types.h> 136 #include <sys/errno.h> 137 #include <sys/param.h> /* defines used in kernel.h */ 138 #include <sys/kernel.h> /* types used in module initialization */ 139 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 140 #include <sys/sockio.h> 141 #include <sys/socketvar.h> /* struct socket */ 142 #include <sys/malloc.h> 143 #include <sys/poll.h> 144 #include <sys/rwlock.h> 145 #include <sys/socket.h> /* sockaddrs */ 146 #include <sys/selinfo.h> 147 #include <sys/sysctl.h> 148 #include <sys/jail.h> 149 #include <net/vnet.h> 150 #include <net/if.h> 151 #include <net/if_var.h> 152 #include <net/bpf.h> /* BIOCIMMEDIATE */ 153 #include <machine/bus.h> /* bus_dmamap_* */ 154 #include <sys/endian.h> 155 #include <sys/refcount.h> 156 157 158 /* reduce conditional code */ 159 #define init_waitqueue_head(x) // only needed in linux 160 161 162 163 #elif defined(linux) 164 165 #include "bsd_glue.h" 166 167 168 169 #elif defined(__APPLE__) 170 171 #warning OSX support is only partial 172 #include "osx_glue.h" 173 174 #else 175 176 #error Unsupported platform 177 178 #endif /* unsupported */ 179 180 /* 181 * common headers 182 */ 183 #include <net/netmap.h> 184 #include <dev/netmap/netmap_kern.h> 185 #include <dev/netmap/netmap_mem2.h> 186 187 188 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 189 190 /* 191 * The following variables are used by the drivers and replicate 192 * fields in the global memory pool. They only refer to buffers 193 * used by physical interfaces. 194 */ 195 u_int netmap_total_buffers; 196 u_int netmap_buf_size; 197 char *netmap_buffer_base; /* also address of an invalid buffer */ 198 199 /* user-controlled variables */ 200 int netmap_verbose; 201 202 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 203 204 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 205 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 206 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 207 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 208 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 209 int netmap_mitigate = 1; 210 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 211 int netmap_no_pendintr = 1; 212 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 213 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 214 int netmap_txsync_retry = 2; 215 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 216 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 217 218 int netmap_flags = 0; /* debug flags */ 219 int netmap_fwd = 0; /* force transparent mode */ 220 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 221 222 /* 223 * netmap_admode selects the netmap mode to use. 224 * Invalid values are reset to NETMAP_ADMODE_BEST 225 */ 226 enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ 227 NETMAP_ADMODE_NATIVE, /* either native or none */ 228 NETMAP_ADMODE_GENERIC, /* force generic */ 229 NETMAP_ADMODE_LAST }; 230 static int netmap_admode = NETMAP_ADMODE_BEST; 231 232 int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ 233 int netmap_generic_ringsize = 1024; /* Generic ringsize. 
*/ 234 235 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 236 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); 237 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); 238 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); 239 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); 240 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); 241 242 NMG_LOCK_T netmap_global_lock; 243 244 245 static void 246 nm_kr_get(struct netmap_kring *kr) 247 { 248 while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) 249 tsleep(kr, 0, "NM_KR_GET", 4); 250 } 251 252 253 /* 254 * mark the ring as stopped, and run through the locks 255 * to make sure other users get to see it. 256 */ 257 void 258 netmap_disable_ring(struct netmap_kring *kr) 259 { 260 kr->nkr_stopped = 1; 261 nm_kr_get(kr); 262 mtx_lock(&kr->q_lock); 263 mtx_unlock(&kr->q_lock); 264 nm_kr_put(kr); 265 } 266 267 268 static void 269 netmap_set_all_rings(struct ifnet *ifp, int stopped) 270 { 271 struct netmap_adapter *na; 272 int i; 273 274 if (!(ifp->if_capenable & IFCAP_NETMAP)) 275 return; 276 277 na = NA(ifp); 278 279 for (i = 0; i <= na->num_tx_rings; i++) { 280 if (stopped) 281 netmap_disable_ring(na->tx_rings + i); 282 else 283 na->tx_rings[i].nkr_stopped = 0; 284 na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY | 285 (i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0)); 286 } 287 288 for (i = 0; i <= na->num_rx_rings; i++) { 289 if (stopped) 290 netmap_disable_ring(na->rx_rings + i); 291 else 292 na->rx_rings[i].nkr_stopped = 0; 293 na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY | 294 (i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0)); 295 } 296 } 297 298 299 void 300 netmap_disable_all_rings(struct ifnet *ifp) 301 { 302 netmap_set_all_rings(ifp, 1 /* stopped */); 303 } 304 305 306 void 307 netmap_enable_all_rings(struct ifnet *ifp) 308 { 309 netmap_set_all_rings(ifp, 0 /* enabled */); 310 } 311 312 313 /* 314 * generic bound_checking function 315 */ 316 u_int 317 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 318 { 319 u_int oldv = *v; 320 const char *op = NULL; 321 322 if (dflt < lo) 323 dflt = lo; 324 if (dflt > hi) 325 dflt = hi; 326 if (oldv < lo) { 327 *v = dflt; 328 op = "Bump"; 329 } else if (oldv > hi) { 330 *v = hi; 331 op = "Clamp"; 332 } 333 if (op && msg) 334 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 335 return *v; 336 } 337 338 339 /* 340 * packet-dump function, user-supplied or static buffer. 341 * The destination buffer must be at least 30+4*len 342 */ 343 const char * 344 nm_dump_buf(char *p, int len, int lim, char *dst) 345 { 346 static char _dst[8192]; 347 int i, j, i0; 348 static char hex[] ="0123456789abcdef"; 349 char *o; /* output position */ 350 351 #define P_HI(x) hex[((x) & 0xf0)>>4] 352 #define P_LO(x) hex[((x) & 0xf)] 353 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? 
(x) : '.') 354 if (!dst) 355 dst = _dst; 356 if (lim <= 0 || lim > len) 357 lim = len; 358 o = dst; 359 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 360 o += strlen(o); 361 /* hexdump routine */ 362 for (i = 0; i < lim; ) { 363 sprintf(o, "%5d: ", i); 364 o += strlen(o); 365 memset(o, ' ', 48); 366 i0 = i; 367 for (j=0; j < 16 && i < lim; i++, j++) { 368 o[j*3] = P_HI(p[i]); 369 o[j*3+1] = P_LO(p[i]); 370 } 371 i = i0; 372 for (j=0; j < 16 && i < lim; i++, j++) 373 o[j + 48] = P_C(p[i]); 374 o[j+48] = '\n'; 375 o += j+49; 376 } 377 *o = '\0'; 378 #undef P_HI 379 #undef P_LO 380 #undef P_C 381 return dst; 382 } 383 384 385 /* 386 * Fetch configuration from the device, to cope with dynamic 387 * reconfigurations after loading the module. 388 */ 389 int 390 netmap_update_config(struct netmap_adapter *na) 391 { 392 struct ifnet *ifp = na->ifp; 393 u_int txr, txd, rxr, rxd; 394 395 txr = txd = rxr = rxd = 0; 396 if (na->nm_config) { 397 na->nm_config(na, &txr, &txd, &rxr, &rxd); 398 } else { 399 /* take whatever we had at init time */ 400 txr = na->num_tx_rings; 401 txd = na->num_tx_desc; 402 rxr = na->num_rx_rings; 403 rxd = na->num_rx_desc; 404 } 405 406 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 407 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 408 return 0; /* nothing changed */ 409 if (netmap_verbose || na->active_fds > 0) { 410 D("stored config %s: txring %d x %d, rxring %d x %d", 411 NM_IFPNAME(ifp), 412 na->num_tx_rings, na->num_tx_desc, 413 na->num_rx_rings, na->num_rx_desc); 414 D("new config %s: txring %d x %d, rxring %d x %d", 415 NM_IFPNAME(ifp), txr, txd, rxr, rxd); 416 } 417 if (na->active_fds == 0) { 418 D("configuration changed (but fine)"); 419 na->num_tx_rings = txr; 420 na->num_tx_desc = txd; 421 na->num_rx_rings = rxr; 422 na->num_rx_desc = rxd; 423 return 0; 424 } 425 D("configuration changed while active, this is bad..."); 426 return 1; 427 } 428 429 430 int 431 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom) 432 { 433 u_int i, len, ndesc; 434 struct netmap_kring *kring; 435 436 // XXX additional space for extra rings ? 437 len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; 438 439 na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); 440 if (na->tx_rings == NULL) { 441 D("Cannot allocate krings"); 442 return ENOMEM; 443 } 444 na->rx_rings = na->tx_rings + ntx; 445 446 /* 447 * All fields in krings are 0 except the one initialized below. 448 * but better be explicit on important kring fields. 449 */ 450 ndesc = na->num_tx_desc; 451 for (i = 0; i < ntx; i++) { /* Transmit rings */ 452 kring = &na->tx_rings[i]; 453 bzero(kring, sizeof(*kring)); 454 kring->na = na; 455 kring->ring_id = i; 456 kring->nkr_num_slots = ndesc; 457 /* 458 * IMPORTANT: Always keep one slot empty. 
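	 * (With nkr_num_slots == N, at most N-1 slots are ever in use:
	 * the ring is initialized just below with head == cur == hwcur == 0
	 * and tail == hwtail == N-1, i.e. N-1 slots offered to userspace.)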
459 */ 460 kring->rhead = kring->rcur = kring->nr_hwcur = 0; 461 kring->rtail = kring->nr_hwtail = ndesc - 1; 462 snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i); 463 mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); 464 init_waitqueue_head(&kring->si); 465 } 466 467 ndesc = na->num_rx_desc; 468 for (i = 0; i < nrx; i++) { /* Receive rings */ 469 kring = &na->rx_rings[i]; 470 bzero(kring, sizeof(*kring)); 471 kring->na = na; 472 kring->ring_id = i; 473 kring->nkr_num_slots = ndesc; 474 kring->rhead = kring->rcur = kring->nr_hwcur = 0; 475 kring->rtail = kring->nr_hwtail = 0; 476 snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i); 477 mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); 478 init_waitqueue_head(&kring->si); 479 } 480 init_waitqueue_head(&na->tx_si); 481 init_waitqueue_head(&na->rx_si); 482 483 na->tailroom = na->rx_rings + nrx; 484 485 return 0; 486 } 487 488 489 /* XXX check boundaries */ 490 void 491 netmap_krings_delete(struct netmap_adapter *na) 492 { 493 int i; 494 495 for (i = 0; i < na->num_tx_rings + 1; i++) { 496 mtx_destroy(&na->tx_rings[i].q_lock); 497 } 498 for (i = 0; i < na->num_rx_rings + 1; i++) { 499 mtx_destroy(&na->rx_rings[i].q_lock); 500 } 501 free(na->tx_rings, M_DEVBUF); 502 na->tx_rings = na->rx_rings = na->tailroom = NULL; 503 } 504 505 506 /* 507 * Destructor for NIC ports. They also have an mbuf queue 508 * on the rings connected to the host so we need to purge 509 * them first. 510 */ 511 static void 512 netmap_hw_krings_delete(struct netmap_adapter *na) 513 { 514 struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; 515 516 ND("destroy sw mbq with len %d", mbq_len(q)); 517 mbq_purge(q); 518 mbq_safe_destroy(q); 519 netmap_krings_delete(na); 520 } 521 522 523 static struct netmap_if* 524 netmap_if_new(const char *ifname, struct netmap_adapter *na) 525 { 526 struct netmap_if *nifp; 527 528 if (netmap_update_config(na)) { 529 /* configuration mismatch, report and fail */ 530 return NULL; 531 } 532 533 if (na->active_fds) 534 goto final; 535 536 if (na->nm_krings_create(na)) 537 goto cleanup; 538 539 if (netmap_mem_rings_create(na)) 540 goto cleanup; 541 542 final: 543 544 nifp = netmap_mem_if_new(ifname, na); 545 if (nifp == NULL) 546 goto cleanup; 547 548 return (nifp); 549 550 cleanup: 551 552 if (na->active_fds == 0) { 553 netmap_mem_rings_delete(na); 554 na->nm_krings_delete(na); 555 } 556 557 return NULL; 558 } 559 560 561 /* grab a reference to the memory allocator, if we don't have one already. The 562 * reference is taken from the netmap_adapter registered with the priv. 563 * 564 */ 565 static int 566 netmap_get_memory_locked(struct netmap_priv_d* p) 567 { 568 struct netmap_mem_d *nmd; 569 int error = 0; 570 571 if (p->np_na == NULL) { 572 if (!netmap_mmap_unreg) 573 return ENODEV; 574 /* for compatibility with older versions of the API 575 * we use the global allocator when no interface has been 576 * registered 577 */ 578 nmd = &nm_mem; 579 } else { 580 nmd = p->np_na->nm_mem; 581 } 582 if (p->np_mref == NULL) { 583 error = netmap_mem_finalize(nmd); 584 if (!error) 585 p->np_mref = nmd; 586 } else if (p->np_mref != nmd) { 587 /* a virtual port has been registered, but previous 588 * syscalls already used the global allocator. 
589 * We cannot continue 590 */ 591 error = ENODEV; 592 } 593 return error; 594 } 595 596 597 int 598 netmap_get_memory(struct netmap_priv_d* p) 599 { 600 int error; 601 NMG_LOCK(); 602 error = netmap_get_memory_locked(p); 603 NMG_UNLOCK(); 604 return error; 605 } 606 607 608 static int 609 netmap_have_memory_locked(struct netmap_priv_d* p) 610 { 611 return p->np_mref != NULL; 612 } 613 614 615 static void 616 netmap_drop_memory_locked(struct netmap_priv_d* p) 617 { 618 if (p->np_mref) { 619 netmap_mem_deref(p->np_mref); 620 p->np_mref = NULL; 621 } 622 } 623 624 625 /* 626 * File descriptor's private data destructor. 627 * 628 * Call nm_register(ifp,0) to stop netmap mode on the interface and 629 * revert to normal operation. We expect that np_na->ifp has not gone. 630 * The second argument is the nifp to work on. In some cases it is 631 * not attached yet to the netmap_priv_d so we need to pass it as 632 * a separate argument. 633 */ 634 /* call with NMG_LOCK held */ 635 static void 636 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 637 { 638 struct netmap_adapter *na = priv->np_na; 639 struct ifnet *ifp = na->ifp; 640 641 NMG_LOCK_ASSERT(); 642 na->active_fds--; 643 if (na->active_fds <= 0) { /* last instance */ 644 645 if (netmap_verbose) 646 D("deleting last instance for %s", NM_IFPNAME(ifp)); 647 /* 648 * (TO CHECK) This function is only called 649 * when the last reference to this file descriptor goes 650 * away. This means we cannot have any pending poll() 651 * or interrupt routine operating on the structure. 652 * XXX The file may be closed in a thread while 653 * another thread is using it. 654 * Linux keeps the file opened until the last reference 655 * by any outstanding ioctl/poll or mmap is gone. 656 * FreeBSD does not track mmap()s (but we do) and 657 * wakes up any sleeping poll(). Need to check what 658 * happens if the close() occurs while a concurrent 659 * syscall is running. 660 */ 661 if (ifp) 662 na->nm_register(na, 0); /* off, clear flags */ 663 /* Wake up any sleeping threads. netmap_poll will 664 * then return POLLERR 665 * XXX The wake up now must happen during *_down(), when 666 * we order all activities to stop. -gl 667 */ 668 /* XXX kqueue(9) needed; these will mirror knlist_init. */ 669 /* knlist_destroy(&na->tx_si.si_note); */ 670 /* knlist_destroy(&na->rx_si.si_note); */ 671 672 /* delete rings and buffers */ 673 netmap_mem_rings_delete(na); 674 na->nm_krings_delete(na); 675 } 676 /* delete the nifp */ 677 netmap_mem_if_delete(na, nifp); 678 } 679 680 681 /* 682 * returns 1 if this is the last instance and we can free priv 683 */ 684 int 685 netmap_dtor_locked(struct netmap_priv_d *priv) 686 { 687 struct netmap_adapter *na = priv->np_na; 688 689 #ifdef __FreeBSD__ 690 /* 691 * np_refcount is the number of active mmaps on 692 * this file descriptor 693 */ 694 if (--priv->np_refcount > 0) { 695 return 0; 696 } 697 #endif /* __FreeBSD__ */ 698 if (!na) { 699 return 1; //XXX is it correct? 
700 } 701 netmap_do_unregif(priv, priv->np_nifp); 702 priv->np_nifp = NULL; 703 netmap_drop_memory_locked(priv); 704 if (priv->np_na) { 705 netmap_adapter_put(na); 706 priv->np_na = NULL; 707 } 708 return 1; 709 } 710 711 712 void 713 netmap_dtor(void *data) 714 { 715 struct netmap_priv_d *priv = data; 716 int last_instance; 717 718 NMG_LOCK(); 719 last_instance = netmap_dtor_locked(priv); 720 NMG_UNLOCK(); 721 if (last_instance) { 722 bzero(priv, sizeof(*priv)); /* for safety */ 723 free(priv, M_DEVBUF); 724 } 725 } 726 727 728 729 730 /* 731 * Handlers for synchronization of the queues from/to the host. 732 * Netmap has two operating modes: 733 * - in the default mode, the rings connected to the host stack are 734 * just another ring pair managed by userspace; 735 * - in transparent mode (XXX to be defined) incoming packets 736 * (from the host or the NIC) are marked as NS_FORWARD upon 737 * arrival, and the user application has a chance to reset the 738 * flag for packets that should be dropped. 739 * On the RXSYNC or poll(), packets in RX rings between 740 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 741 * to the other side. 742 * The transfer NIC --> host is relatively easy, just encapsulate 743 * into mbufs and we are done. The host --> NIC side is slightly 744 * harder because there might not be room in the tx ring so it 745 * might take a while before releasing the buffer. 746 */ 747 748 749 /* 750 * pass a chain of buffers to the host stack as coming from 'dst' 751 * We do not need to lock because the queue is private. 752 */ 753 static void 754 netmap_send_up(struct ifnet *dst, struct mbq *q) 755 { 756 struct mbuf *m; 757 758 /* send packets up, outside the lock */ 759 while ((m = mbq_dequeue(q)) != NULL) { 760 if (netmap_verbose & NM_VERB_HOST) 761 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 762 NM_SEND_UP(dst, m); 763 } 764 mbq_destroy(q); 765 } 766 767 768 /* 769 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 770 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) 771 * and pass them up. Drop remaining packets in the unlikely event 772 * of an mbuf shortage. 773 */ 774 static void 775 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 776 { 777 u_int const lim = kring->nkr_num_slots - 1; 778 u_int const head = kring->ring->head; 779 u_int n; 780 struct netmap_adapter *na = kring->na; 781 782 for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { 783 struct mbuf *m; 784 struct netmap_slot *slot = &kring->ring->slot[n]; 785 786 if ((slot->flags & NS_FORWARD) == 0 && !force) 787 continue; 788 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { 789 RD(5, "bad pkt at %d len %d", n, slot->len); 790 continue; 791 } 792 slot->flags &= ~NS_FORWARD; // XXX needed ? 
793 /* XXX TODO: adapt to the case of a multisegment packet */ 794 m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); 795 796 if (m == NULL) 797 break; 798 mbq_enqueue(q, m); 799 } 800 } 801 802 803 /* 804 * Send to the NIC rings packets marked NS_FORWARD between 805 * kring->nr_hwcur and kring->rhead 806 * Called under kring->rx_queue.lock on the sw rx ring, 807 */ 808 static u_int 809 netmap_sw_to_nic(struct netmap_adapter *na) 810 { 811 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 812 struct netmap_slot *rxslot = kring->ring->slot; 813 u_int i, rxcur = kring->nr_hwcur; 814 u_int const head = kring->rhead; 815 u_int const src_lim = kring->nkr_num_slots - 1; 816 u_int sent = 0; 817 818 /* scan rings to find space, then fill as much as possible */ 819 for (i = 0; i < na->num_tx_rings; i++) { 820 struct netmap_kring *kdst = &na->tx_rings[i]; 821 struct netmap_ring *rdst = kdst->ring; 822 u_int const dst_lim = kdst->nkr_num_slots - 1; 823 824 /* XXX do we trust ring or kring->rcur,rtail ? */ 825 for (; rxcur != head && !nm_ring_empty(rdst); 826 rxcur = nm_next(rxcur, src_lim) ) { 827 struct netmap_slot *src, *dst, tmp; 828 u_int dst_cur = rdst->cur; 829 830 src = &rxslot[rxcur]; 831 if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) 832 continue; 833 834 sent++; 835 836 dst = &rdst->slot[dst_cur]; 837 838 tmp = *src; 839 840 src->buf_idx = dst->buf_idx; 841 src->flags = NS_BUF_CHANGED; 842 843 dst->buf_idx = tmp.buf_idx; 844 dst->len = tmp.len; 845 dst->flags = NS_BUF_CHANGED; 846 847 rdst->cur = nm_next(dst_cur, dst_lim); 848 } 849 /* if (sent) XXX txsync ? */ 850 } 851 return sent; 852 } 853 854 855 /* 856 * netmap_txsync_to_host() passes packets up. We are called from a 857 * system call in user process context, and the only contention 858 * can be among multiple user threads erroneously calling 859 * this routine concurrently. 860 */ 861 void 862 netmap_txsync_to_host(struct netmap_adapter *na) 863 { 864 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 865 struct netmap_ring *ring = kring->ring; 866 u_int const lim = kring->nkr_num_slots - 1; 867 u_int const head = nm_txsync_prologue(kring); 868 struct mbq q; 869 int error; 870 871 error = nm_kr_tryget(kring); 872 if (error) { 873 if (error == NM_KR_BUSY) 874 D("ring %p busy (user error)", kring); 875 return; 876 } 877 if (head > lim) { 878 D("invalid ring index in stack TX kring %p", kring); 879 netmap_ring_reinit(kring); 880 nm_kr_put(kring); 881 return; 882 } 883 884 /* Take packets from hwcur to head and pass them up. 885 * force head = cur since netmap_grab_packets() stops at head 886 * In case of no buffers we give up. At the end of the loop, 887 * the queue is drained in all cases. 888 */ 889 mbq_init(&q); 890 ring->cur = head; 891 netmap_grab_packets(kring, &q, 1 /* force */); 892 ND("have %d pkts in queue", mbq_len(&q)); 893 kring->nr_hwcur = head; 894 kring->nr_hwtail = head + lim; 895 if (kring->nr_hwtail > lim) 896 kring->nr_hwtail -= lim + 1; 897 nm_txsync_finalize(kring); 898 899 nm_kr_put(kring); 900 netmap_send_up(na->ifp, &q); 901 } 902 903 904 /* 905 * rxsync backend for packets coming from the host stack. 906 * They have been put in kring->rx_queue by netmap_transmit(). 907 * We protect access to the kring using kring->rx_queue.lock 908 * 909 * This routine also does the selrecord if called from the poll handler 910 * (we know because td != NULL). 911 * 912 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 913 * as an additional hidden argument. 
914 * returns the number of packets delivered to tx queues in 915 * transparent mode, or a negative value if error 916 */ 917 int 918 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 919 { 920 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 921 struct netmap_ring *ring = kring->ring; 922 u_int nm_i, n; 923 u_int const lim = kring->nkr_num_slots - 1; 924 u_int const head = nm_rxsync_prologue(kring); 925 int ret = 0; 926 struct mbq *q = &kring->rx_queue; 927 928 (void)pwait; /* disable unused warnings */ 929 930 if (head > lim) { 931 netmap_ring_reinit(kring); 932 return EINVAL; 933 } 934 935 if (kring->nkr_stopped) /* check a first time without lock */ 936 return EBUSY; 937 938 mtx_lock(&q->lock); 939 940 if (kring->nkr_stopped) { /* check again with lock held */ 941 ret = EBUSY; 942 goto unlock_out; 943 } 944 945 /* First part: import newly received packets */ 946 n = mbq_len(q); 947 if (n) { /* grab packets from the queue */ 948 struct mbuf *m; 949 uint32_t stop_i; 950 951 nm_i = kring->nr_hwtail; 952 stop_i = nm_prev(nm_i, lim); 953 while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { 954 int len = MBUF_LEN(m); 955 struct netmap_slot *slot = &ring->slot[nm_i]; 956 957 m_copydata(m, 0, len, BDG_NMB(na, slot)); 958 ND("nm %d len %d", nm_i, len); 959 if (netmap_verbose) 960 D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL)); 961 962 slot->len = len; 963 slot->flags = kring->nkr_slot_flags; 964 nm_i = nm_next(nm_i, lim); 965 } 966 kring->nr_hwtail = nm_i; 967 } 968 969 /* 970 * Second part: skip past packets that userspace has released. 971 */ 972 nm_i = kring->nr_hwcur; 973 if (nm_i != head) { /* something was released */ 974 if (netmap_fwd || kring->ring->flags & NR_FORWARD) 975 ret = netmap_sw_to_nic(na); 976 kring->nr_hwcur = head; 977 } 978 979 nm_rxsync_finalize(kring); 980 981 /* access copies of cur,tail in the kring */ 982 if (kring->rcur == kring->rtail && td) /* no bufs available */ 983 selrecord(td, &kring->si); 984 985 unlock_out: 986 987 mtx_unlock(&q->lock); 988 return ret; 989 } 990 991 992 /* Get a netmap adapter for the port. 993 * 994 * If it is possible to satisfy the request, return 0 995 * with *na containing the netmap adapter found. 996 * Otherwise return an error code, with *na containing NULL. 997 * 998 * When the port is attached to a bridge, we always return 999 * EBUSY. 1000 * Otherwise, if the port is already bound to a file descriptor, 1001 * then we unconditionally return the existing adapter into *na. 1002 * In all the other cases, we return (into *na) either native, 1003 * generic or NULL, according to the following table: 1004 * 1005 * native_support 1006 * active_fds dev.netmap.admode YES NO 1007 * ------------------------------------------------------- 1008 * >0 * NA(ifp) NA(ifp) 1009 * 1010 * 0 NETMAP_ADMODE_BEST NATIVE GENERIC 1011 * 0 NETMAP_ADMODE_NATIVE NATIVE NULL 1012 * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC 1013 * 1014 */ 1015 1016 int 1017 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) 1018 { 1019 /* generic support */ 1020 int i = netmap_admode; /* Take a snapshot. 
*/ 1021 int error = 0; 1022 struct netmap_adapter *prev_na; 1023 struct netmap_generic_adapter *gna; 1024 1025 *na = NULL; /* default */ 1026 1027 /* reset in case of invalid value */ 1028 if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) 1029 i = netmap_admode = NETMAP_ADMODE_BEST; 1030 1031 if (NETMAP_CAPABLE(ifp)) { 1032 /* If an adapter already exists, but is 1033 * attached to a vale port, we report that the 1034 * port is busy. 1035 */ 1036 if (NETMAP_OWNED_BY_KERN(NA(ifp))) 1037 return EBUSY; 1038 1039 /* If an adapter already exists, return it if 1040 * there are active file descriptors or if 1041 * netmap is not forced to use generic 1042 * adapters. 1043 */ 1044 if (NA(ifp)->active_fds > 0 || 1045 i != NETMAP_ADMODE_GENERIC) { 1046 *na = NA(ifp); 1047 return 0; 1048 } 1049 } 1050 1051 /* If there isn't native support and netmap is not allowed 1052 * to use generic adapters, we cannot satisfy the request. 1053 */ 1054 if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) 1055 return EINVAL; 1056 1057 /* Otherwise, create a generic adapter and return it, 1058 * saving the previously used netmap adapter, if any. 1059 * 1060 * Note that here 'prev_na', if not NULL, MUST be a 1061 * native adapter, and CANNOT be a generic one. This is 1062 * true because generic adapters are created on demand, and 1063 * destroyed when not used anymore. Therefore, if the adapter 1064 * currently attached to an interface 'ifp' is generic, it 1065 * must be that 1066 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). 1067 * Consequently, if NA(ifp) is generic, we will enter one of 1068 * the branches above. This ensures that we never override 1069 * a generic adapter with another generic adapter. 1070 */ 1071 prev_na = NA(ifp); 1072 error = generic_netmap_attach(ifp); 1073 if (error) 1074 return error; 1075 1076 *na = NA(ifp); 1077 gna = (struct netmap_generic_adapter*)NA(ifp); 1078 gna->prev = prev_na; /* save old na */ 1079 if (prev_na != NULL) { 1080 ifunit_ref(ifp->if_xname); 1081 // XXX add a refcount ? 1082 netmap_adapter_get(prev_na); 1083 } 1084 ND("Created generic NA %p (prev %p)", gna, gna->prev); 1085 1086 return 0; 1087 } 1088 1089 1090 /* 1091 * MUST BE CALLED UNDER NMG_LOCK() 1092 * 1093 * get a refcounted reference to an interface. 1094 * This is always called in the execution of an ioctl(). 1095 * 1096 * Return ENXIO if the interface does not exist, EINVAL if netmap 1097 * is not supported by the interface. 1098 * If successful, hold a reference. 1099 * 1100 * When the NIC is attached to a bridge, reference is managed 1101 * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as 1102 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC 1103 * is detached from the bridge, then ifp's refcount is dropped (this 1104 * is equivalent to that ifp is destroyed in case of virtual ports. 1105 * 1106 * This function uses if_rele() when we want to prevent the NIC from 1107 * being detached from the bridge in error handling. But once refcount 1108 * is acquired by this function, it must be released using nm_if_rele(). 1109 */ 1110 int 1111 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) 1112 { 1113 struct ifnet *ifp; 1114 int error = 0; 1115 struct netmap_adapter *ret; 1116 1117 *na = NULL; /* default return value */ 1118 1119 /* first try to see if this is a bridge port. 
*/ 1120 NMG_LOCK_ASSERT(); 1121 1122 error = netmap_get_bdg_na(nmr, na, create); 1123 if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */ 1124 return error; 1125 1126 ifp = ifunit_ref(nmr->nr_name); 1127 if (ifp == NULL) { 1128 return ENXIO; 1129 } 1130 1131 error = netmap_get_hw_na(ifp, &ret); 1132 if (error) 1133 goto out; 1134 1135 if (ret != NULL) { 1136 /* Users cannot use the NIC attached to a bridge directly */ 1137 if (NETMAP_OWNED_BY_KERN(ret)) { 1138 error = EINVAL; 1139 goto out; 1140 } 1141 error = 0; 1142 *na = ret; 1143 netmap_adapter_get(ret); 1144 } 1145 out: 1146 if_rele(ifp); 1147 1148 return error; 1149 } 1150 1151 1152 /* 1153 * validate parameters on entry for *_txsync() 1154 * Returns ring->cur if ok, or something >= kring->nkr_num_slots 1155 * in case of error. 1156 * 1157 * rhead, rcur and rtail=hwtail are stored from previous round. 1158 * hwcur is the next packet to send to the ring. 1159 * 1160 * We want 1161 * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail 1162 * 1163 * hwcur, rhead, rtail and hwtail are reliable 1164 */ 1165 u_int 1166 nm_txsync_prologue(struct netmap_kring *kring) 1167 { 1168 struct netmap_ring *ring = kring->ring; 1169 u_int head = ring->head; /* read only once */ 1170 u_int cur = ring->cur; /* read only once */ 1171 u_int n = kring->nkr_num_slots; 1172 1173 ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", 1174 kring->name, 1175 kring->nr_hwcur, kring->nr_hwtail, 1176 ring->head, ring->cur, ring->tail); 1177 #if 1 /* kernel sanity checks; but we can trust the kring. */ 1178 if (kring->nr_hwcur >= n || kring->rhead >= n || 1179 kring->rtail >= n || kring->nr_hwtail >= n) 1180 goto error; 1181 #endif /* kernel sanity checks */ 1182 /* 1183 * user sanity checks. We only use 'cur', 1184 * A, B, ... are possible positions for cur: 1185 * 1186 * 0 A cur B tail C n-1 1187 * 0 D tail E cur F n-1 1188 * 1189 * B, F, D are valid. A, C, E are wrong 1190 */ 1191 if (kring->rtail >= kring->rhead) { 1192 /* want rhead <= head <= rtail */ 1193 if (head < kring->rhead || head > kring->rtail) 1194 goto error; 1195 /* and also head <= cur <= rtail */ 1196 if (cur < head || cur > kring->rtail) 1197 goto error; 1198 } else { /* here rtail < rhead */ 1199 /* we need head outside rtail .. rhead */ 1200 if (head > kring->rtail && head < kring->rhead) 1201 goto error; 1202 1203 /* two cases now: head <= rtail or head >= rhead */ 1204 if (head <= kring->rtail) { 1205 /* want head <= cur <= rtail */ 1206 if (cur < head || cur > kring->rtail) 1207 goto error; 1208 } else { /* head >= rhead */ 1209 /* cur must be outside rtail..head */ 1210 if (cur > kring->rtail && cur < head) 1211 goto error; 1212 } 1213 } 1214 if (ring->tail != kring->rtail) { 1215 RD(5, "tail overwritten was %d need %d", 1216 ring->tail, kring->rtail); 1217 ring->tail = kring->rtail; 1218 } 1219 kring->rhead = head; 1220 kring->rcur = cur; 1221 return head; 1222 1223 error: 1224 RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d", 1225 kring->name, 1226 kring->nr_hwcur, 1227 kring->rcur, kring->nr_hwtail, 1228 cur, ring->tail); 1229 return n; 1230 } 1231 1232 1233 /* 1234 * validate parameters on entry for *_rxsync() 1235 * Returns ring->head if ok, kring->nkr_num_slots on error. 1236 * 1237 * For a valid configuration, 1238 * hwcur <= head <= cur <= tail <= hwtail 1239 * 1240 * We only consider head and cur. 1241 * hwcur and hwtail are reliable. 
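 *
 * For example, with nkr_num_slots == 8, hwcur == 2 and hwtail == 6,
 * any head/cur with 2 <= head <= cur <= 6 is accepted; when the ring
 * has wrapped (hwtail < hwcur), head and cur must instead lie outside
 * the interval hwtail..hwcur, as checked case by case below.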
1242 * 1243 */ 1244 u_int 1245 nm_rxsync_prologue(struct netmap_kring *kring) 1246 { 1247 struct netmap_ring *ring = kring->ring; 1248 uint32_t const n = kring->nkr_num_slots; 1249 uint32_t head, cur; 1250 1251 ND("%s kc %d kt %d h %d c %d t %d", 1252 kring->name, 1253 kring->nr_hwcur, kring->nr_hwtail, 1254 ring->head, ring->cur, ring->tail); 1255 /* 1256 * Before storing the new values, we should check they do not 1257 * move backwards. However: 1258 * - head is not an issue because the previous value is hwcur; 1259 * - cur could in principle go back, however it does not matter 1260 * because we are processing a brand new rxsync() 1261 */ 1262 cur = kring->rcur = ring->cur; /* read only once */ 1263 head = kring->rhead = ring->head; /* read only once */ 1264 #if 1 /* kernel sanity checks */ 1265 if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) 1266 goto error; 1267 #endif /* kernel sanity checks */ 1268 /* user sanity checks */ 1269 if (kring->nr_hwtail >= kring->nr_hwcur) { 1270 /* want hwcur <= rhead <= hwtail */ 1271 if (head < kring->nr_hwcur || head > kring->nr_hwtail) 1272 goto error; 1273 /* and also rhead <= rcur <= hwtail */ 1274 if (cur < head || cur > kring->nr_hwtail) 1275 goto error; 1276 } else { 1277 /* we need rhead outside hwtail..hwcur */ 1278 if (head < kring->nr_hwcur && head > kring->nr_hwtail) 1279 goto error; 1280 /* two cases now: head <= hwtail or head >= hwcur */ 1281 if (head <= kring->nr_hwtail) { 1282 /* want head <= cur <= hwtail */ 1283 if (cur < head || cur > kring->nr_hwtail) 1284 goto error; 1285 } else { 1286 /* cur must be outside hwtail..head */ 1287 if (cur < head && cur > kring->nr_hwtail) 1288 goto error; 1289 } 1290 } 1291 if (ring->tail != kring->rtail) { 1292 RD(5, "%s tail overwritten was %d need %d", 1293 kring->name, 1294 ring->tail, kring->rtail); 1295 ring->tail = kring->rtail; 1296 } 1297 return head; 1298 1299 error: 1300 RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", 1301 kring->nr_hwcur, 1302 kring->rcur, kring->nr_hwtail, 1303 kring->rhead, kring->rcur, ring->tail); 1304 return n; 1305 } 1306 1307 1308 /* 1309 * Error routine called when txsync/rxsync detects an error. 1310 * Can't do much more than resetting head =cur = hwcur, tail = hwtail 1311 * Return 1 on reinit. 1312 * 1313 * This routine is only called by the upper half of the kernel. 1314 * It only reads hwcur (which is changed only by the upper half, too) 1315 * and hwtail (which may be changed by the lower half, but only on 1316 * a tx ring and only to increase it, so any error will be recovered 1317 * on the next call). For the above, we don't strictly need to call 1318 * it under lock. 
1319 */ 1320 int 1321 netmap_ring_reinit(struct netmap_kring *kring) 1322 { 1323 struct netmap_ring *ring = kring->ring; 1324 u_int i, lim = kring->nkr_num_slots - 1; 1325 int errors = 0; 1326 1327 // XXX KASSERT nm_kr_tryget 1328 RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); 1329 // XXX probably wrong to trust userspace 1330 kring->rhead = ring->head; 1331 kring->rcur = ring->cur; 1332 kring->rtail = ring->tail; 1333 1334 if (ring->cur > lim) 1335 errors++; 1336 if (ring->head > lim) 1337 errors++; 1338 if (ring->tail > lim) 1339 errors++; 1340 for (i = 0; i <= lim; i++) { 1341 u_int idx = ring->slot[i].buf_idx; 1342 u_int len = ring->slot[i].len; 1343 if (idx < 2 || idx >= netmap_total_buffers) { 1344 RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); 1345 ring->slot[i].buf_idx = 0; 1346 ring->slot[i].len = 0; 1347 } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { 1348 ring->slot[i].len = 0; 1349 RD(5, "bad len at slot %d idx %d len %d", i, idx, len); 1350 } 1351 } 1352 if (errors) { 1353 RD(10, "total %d errors", errors); 1354 RD(10, "%s reinit, cur %d -> %d tail %d -> %d", 1355 kring->name, 1356 ring->cur, kring->nr_hwcur, 1357 ring->tail, kring->nr_hwtail); 1358 ring->head = kring->rhead = kring->nr_hwcur; 1359 ring->cur = kring->rcur = kring->nr_hwcur; 1360 ring->tail = kring->rtail = kring->nr_hwtail; 1361 } 1362 return (errors ? 1 : 0); 1363 } 1364 1365 1366 /* 1367 * Set the ring ID. For devices with a single queue, a request 1368 * for all rings is the same as a single ring. 1369 */ 1370 static int 1371 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) 1372 { 1373 struct netmap_adapter *na = priv->np_na; 1374 struct ifnet *ifp = na->ifp; 1375 u_int i = ringid & NETMAP_RING_MASK; 1376 /* initially (np_qfirst == np_qlast) we don't want to lock */ 1377 u_int lim = na->num_rx_rings; 1378 1379 if (na->num_tx_rings > lim) 1380 lim = na->num_tx_rings; 1381 if ( (ringid & NETMAP_HW_RING) && i >= lim) { 1382 D("invalid ring id %d", i); 1383 return (EINVAL); 1384 } 1385 priv->np_ringid = ringid; 1386 if (ringid & NETMAP_SW_RING) { 1387 priv->np_qfirst = NETMAP_SW_RING; 1388 priv->np_qlast = 0; 1389 } else if (ringid & NETMAP_HW_RING) { 1390 priv->np_qfirst = i; 1391 priv->np_qlast = i + 1; 1392 } else { 1393 priv->np_qfirst = 0; 1394 priv->np_qlast = NETMAP_HW_RING ; 1395 } 1396 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 1397 if (netmap_verbose) { 1398 if (ringid & NETMAP_SW_RING) 1399 D("ringid %s set to SW RING", NM_IFPNAME(ifp)); 1400 else if (ringid & NETMAP_HW_RING) 1401 D("ringid %s set to HW RING %d", NM_IFPNAME(ifp), 1402 priv->np_qfirst); 1403 else 1404 D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim); 1405 } 1406 return 0; 1407 } 1408 1409 1410 /* 1411 * possibly move the interface to netmap-mode. 1412 * If success it returns a pointer to netmap_if, otherwise NULL. 1413 * This must be called with NMG_LOCK held. 
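 *
 * A sketch of the typical caller pattern (this is what the NIOCREGIF
 * branch of netmap_ioctl() below does; error handling omitted):
 *
 *	NMG_LOCK();
 *	error = netmap_get_na(nmr, &na, 1);
 *	if (error == 0)
 *		nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
 *	NMG_UNLOCK();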
1414 */ 1415 struct netmap_if * 1416 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, 1417 uint16_t ringid, int *err) 1418 { 1419 struct ifnet *ifp = na->ifp; 1420 struct netmap_if *nifp = NULL; 1421 int error, need_mem = 0; 1422 1423 NMG_LOCK_ASSERT(); 1424 /* ring configuration may have changed, fetch from the card */ 1425 netmap_update_config(na); 1426 priv->np_na = na; /* store the reference */ 1427 error = netmap_set_ringid(priv, ringid); 1428 if (error) 1429 goto out; 1430 /* ensure allocators are ready */ 1431 need_mem = !netmap_have_memory_locked(priv); 1432 if (need_mem) { 1433 error = netmap_get_memory_locked(priv); 1434 ND("get_memory returned %d", error); 1435 if (error) 1436 goto out; 1437 } 1438 nifp = netmap_if_new(NM_IFPNAME(ifp), na); 1439 if (nifp == NULL) { /* allocation failed */ 1440 /* we should drop the allocator, but only 1441 * if we were the ones who grabbed it 1442 */ 1443 error = ENOMEM; 1444 goto out; 1445 } 1446 na->active_fds++; 1447 if (ifp->if_capenable & IFCAP_NETMAP) { 1448 /* was already set */ 1449 } else { 1450 /* Otherwise set the card in netmap mode 1451 * and make it use the shared buffers. 1452 * 1453 * do not core lock because the race is harmless here, 1454 * there cannot be any traffic to netmap_transmit() 1455 */ 1456 na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; 1457 ND("%p->na_lut == %p", na, na->na_lut); 1458 na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; 1459 error = na->nm_register(na, 1); /* mode on */ 1460 if (error) { 1461 netmap_do_unregif(priv, nifp); 1462 nifp = NULL; 1463 } 1464 } 1465 out: 1466 *err = error; 1467 if (error) { 1468 priv->np_na = NULL; 1469 if (need_mem) 1470 netmap_drop_memory_locked(priv); 1471 } 1472 if (nifp != NULL) { 1473 /* 1474 * advertise that the interface is ready bt setting ni_nifp. 1475 * The barrier is needed because readers (poll and *SYNC) 1476 * check for priv->np_nifp != NULL without locking 1477 */ 1478 wmb(); /* make sure previous writes are visible to all CPUs */ 1479 priv->np_nifp = nifp; 1480 } 1481 return nifp; 1482 } 1483 1484 1485 1486 /* 1487 * ioctl(2) support for the "netmap" device. 1488 * 1489 * Following a list of accepted commands: 1490 * - NIOCGINFO 1491 * - SIOCGIFADDR just for convenience 1492 * - NIOCREGIF 1493 * - NIOCTXSYNC 1494 * - NIOCRXSYNC 1495 * 1496 * Return 0 on success, errno otherwise. 1497 */ 1498 int 1499 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 1500 int fflag, struct thread *td) 1501 { 1502 struct netmap_priv_d *priv = NULL; 1503 struct ifnet *ifp = NULL; 1504 struct nmreq *nmr = (struct nmreq *) data; 1505 struct netmap_adapter *na = NULL; 1506 int error; 1507 u_int i, lim; 1508 struct netmap_if *nifp; 1509 struct netmap_kring *krings; 1510 1511 (void)dev; /* UNUSED */ 1512 (void)fflag; /* UNUSED */ 1513 #ifdef linux 1514 #define devfs_get_cdevpriv(pp) \ 1515 ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ 1516 (*pp ? 0 : ENOENT); }) 1517 1518 /* devfs_set_cdevpriv cannot fail on linux */ 1519 #define devfs_set_cdevpriv(p, fn) \ 1520 ({ ((struct file *)td)->private_data = p; (p ? 
0 : EINVAL); }) 1521 1522 1523 #define devfs_clear_cdevpriv() do { \ 1524 netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ 1525 } while (0) 1526 #endif /* linux */ 1527 1528 if (cmd == NIOCGINFO || cmd == NIOCREGIF) { 1529 /* truncate name */ 1530 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; 1531 if (nmr->nr_version != NETMAP_API) { 1532 D("API mismatch for %s got %d need %d", 1533 nmr->nr_name, 1534 nmr->nr_version, NETMAP_API); 1535 nmr->nr_version = NETMAP_API; 1536 return EINVAL; 1537 } 1538 } 1539 CURVNET_SET(TD_TO_VNET(td)); 1540 1541 error = devfs_get_cdevpriv((void **)&priv); 1542 if (error) { 1543 CURVNET_RESTORE(); 1544 /* XXX ENOENT should be impossible, since the priv 1545 * is now created in the open */ 1546 return (error == ENOENT ? ENXIO : error); 1547 } 1548 1549 switch (cmd) { 1550 case NIOCGINFO: /* return capabilities etc */ 1551 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 1552 error = netmap_bdg_ctl(nmr, NULL); 1553 break; 1554 } 1555 1556 NMG_LOCK(); 1557 do { 1558 /* memsize is always valid */ 1559 struct netmap_mem_d *nmd = &nm_mem; 1560 u_int memflags; 1561 1562 if (nmr->nr_name[0] != '\0') { 1563 /* get a refcount */ 1564 error = netmap_get_na(nmr, &na, 1 /* create */); 1565 if (error) 1566 break; 1567 nmd = na->nm_mem; /* get memory allocator */ 1568 } 1569 1570 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 1571 if (error) 1572 break; 1573 if (na == NULL) /* only memory info */ 1574 break; 1575 nmr->nr_offset = 0; 1576 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 1577 netmap_update_config(na); 1578 nmr->nr_rx_rings = na->num_rx_rings; 1579 nmr->nr_tx_rings = na->num_tx_rings; 1580 nmr->nr_rx_slots = na->num_rx_desc; 1581 nmr->nr_tx_slots = na->num_tx_desc; 1582 if (memflags & NETMAP_MEM_PRIVATE) 1583 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1584 netmap_adapter_put(na); 1585 } while (0); 1586 NMG_UNLOCK(); 1587 break; 1588 1589 case NIOCREGIF: 1590 /* possibly attach/detach NIC and VALE switch */ 1591 i = nmr->nr_cmd; 1592 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH 1593 || i == NETMAP_BDG_OFFSET) { 1594 error = netmap_bdg_ctl(nmr, NULL); 1595 break; 1596 } else if (i != 0) { 1597 D("nr_cmd must be 0 not %d", i); 1598 error = EINVAL; 1599 break; 1600 } 1601 1602 /* protect access to priv from concurrent NIOCREGIF */ 1603 NMG_LOCK(); 1604 do { 1605 u_int memflags; 1606 1607 if (priv->np_na != NULL) { /* thread already registered */ 1608 error = netmap_set_ringid(priv, nmr->nr_ringid); 1609 break; 1610 } 1611 /* find the interface and a reference */ 1612 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 1613 if (error) 1614 break; 1615 ifp = na->ifp; 1616 if (NETMAP_OWNED_BY_KERN(na)) { 1617 netmap_adapter_put(na); 1618 error = EBUSY; 1619 break; 1620 } 1621 nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error); 1622 if (!nifp) { /* reg. 
failed, release priv and ref */ 1623 netmap_adapter_put(na); 1624 priv->np_nifp = NULL; 1625 break; 1626 } 1627 1628 /* return the offset of the netmap_if object */ 1629 nmr->nr_rx_rings = na->num_rx_rings; 1630 nmr->nr_tx_rings = na->num_tx_rings; 1631 nmr->nr_rx_slots = na->num_rx_desc; 1632 nmr->nr_tx_slots = na->num_tx_desc; 1633 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 1634 if (error) { 1635 netmap_adapter_put(na); 1636 break; 1637 } 1638 if (memflags & NETMAP_MEM_PRIVATE) { 1639 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1640 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 1641 } 1642 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 1643 } while (0); 1644 NMG_UNLOCK(); 1645 break; 1646 1647 case NIOCTXSYNC: 1648 case NIOCRXSYNC: 1649 nifp = priv->np_nifp; 1650 1651 if (nifp == NULL) { 1652 error = ENXIO; 1653 break; 1654 } 1655 rmb(); /* make sure following reads are not from cache */ 1656 1657 na = priv->np_na; /* we have a reference */ 1658 1659 if (na == NULL) { 1660 D("Internal error: nifp != NULL && na == NULL"); 1661 error = ENXIO; 1662 break; 1663 } 1664 1665 ifp = na->ifp; 1666 if (ifp == NULL) { 1667 RD(1, "the ifp is gone"); 1668 error = ENXIO; 1669 break; 1670 } 1671 1672 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 1673 if (cmd == NIOCTXSYNC) 1674 netmap_txsync_to_host(na); 1675 else 1676 netmap_rxsync_from_host(na, NULL, NULL); 1677 break; 1678 } 1679 /* find the last ring to scan */ 1680 lim = priv->np_qlast; 1681 if (lim == NETMAP_HW_RING) 1682 lim = (cmd == NIOCTXSYNC) ? 1683 na->num_tx_rings : na->num_rx_rings; 1684 1685 krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; 1686 for (i = priv->np_qfirst; i < lim; i++) { 1687 struct netmap_kring *kring = krings + i; 1688 if (nm_kr_tryget(kring)) { 1689 error = EBUSY; 1690 goto out; 1691 } 1692 if (cmd == NIOCTXSYNC) { 1693 if (netmap_verbose & NM_VERB_TXSYNC) 1694 D("pre txsync ring %d cur %d hwcur %d", 1695 i, kring->ring->cur, 1696 kring->nr_hwcur); 1697 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 1698 netmap_ring_reinit(kring); 1699 } else { 1700 na->nm_txsync(na, i, NAF_FORCE_RECLAIM); 1701 } 1702 if (netmap_verbose & NM_VERB_TXSYNC) 1703 D("post txsync ring %d cur %d hwcur %d", 1704 i, kring->ring->cur, 1705 kring->nr_hwcur); 1706 } else { 1707 na->nm_rxsync(na, i, NAF_FORCE_READ); 1708 microtime(&na->rx_rings[i].ring->ts); 1709 } 1710 nm_kr_put(kring); 1711 } 1712 1713 break; 1714 1715 #ifdef __FreeBSD__ 1716 case BIOCIMMEDIATE: 1717 case BIOCGHDRCMPLT: 1718 case BIOCSHDRCMPLT: 1719 case BIOCSSEESENT: 1720 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 1721 break; 1722 1723 default: /* allow device-specific ioctls */ 1724 { 1725 struct socket so; 1726 1727 bzero(&so, sizeof(so)); 1728 NMG_LOCK(); 1729 error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ 1730 if (error) { 1731 netmap_adapter_put(na); 1732 NMG_UNLOCK(); 1733 break; 1734 } 1735 ifp = na->ifp; 1736 so.so_vnet = ifp->if_vnet; 1737 // so->so_proto not null. 1738 error = ifioctl(&so, cmd, data, td); 1739 netmap_adapter_put(na); 1740 NMG_UNLOCK(); 1741 break; 1742 } 1743 1744 #else /* linux */ 1745 default: 1746 error = EOPNOTSUPP; 1747 #endif /* linux */ 1748 } 1749 out: 1750 1751 CURVNET_RESTORE(); 1752 return (error); 1753 } 1754 1755 1756 /* 1757 * select(2) and poll(2) handlers for the "netmap" device. 1758 * 1759 * Can be called for one or more queues. 1760 * Return true the event mask corresponding to ready events. 
1761 * If there are no ready events, do a selrecord on either individual 1762 * selinfo or on the global one. 1763 * Device-dependent parts (locking and sync of tx/rx rings) 1764 * are done through callbacks. 1765 * 1766 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 1767 * The first one is remapped to pwait as selrecord() uses the name as an 1768 * hidden argument. 1769 */ 1770 int 1771 netmap_poll(struct cdev *dev, int events, struct thread *td) 1772 { 1773 struct netmap_priv_d *priv = NULL; 1774 struct netmap_adapter *na; 1775 struct ifnet *ifp; 1776 struct netmap_kring *kring; 1777 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 1778 u_int lim_tx, lim_rx; 1779 struct mbq q; /* packets from hw queues to host stack */ 1780 void *pwait = dev; /* linux compatibility */ 1781 1782 /* 1783 * In order to avoid nested locks, we need to "double check" 1784 * txsync and rxsync if we decide to do a selrecord(). 1785 * retry_tx (and retry_rx, later) prevent looping forever. 1786 */ 1787 int retry_tx = 1, retry_rx = 1; 1788 1789 (void)pwait; 1790 mbq_init(&q); 1791 1792 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) 1793 return POLLERR; 1794 1795 if (priv->np_nifp == NULL) { 1796 D("No if registered"); 1797 return POLLERR; 1798 } 1799 rmb(); /* make sure following reads are not from cache */ 1800 1801 na = priv->np_na; 1802 ifp = na->ifp; 1803 // check for deleted 1804 if (ifp == NULL) { 1805 RD(1, "the ifp is gone"); 1806 return POLLERR; 1807 } 1808 1809 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 1810 return POLLERR; 1811 1812 if (netmap_verbose & 0x8000) 1813 D("device %s events 0x%x", NM_IFPNAME(ifp), events); 1814 want_tx = events & (POLLOUT | POLLWRNORM); 1815 want_rx = events & (POLLIN | POLLRDNORM); 1816 1817 lim_tx = na->num_tx_rings; 1818 lim_rx = na->num_rx_rings; 1819 1820 if (priv->np_qfirst == NETMAP_SW_RING) { 1821 // XXX locking ? 1822 /* handle the host stack ring */ 1823 if (priv->np_txpoll || want_tx) { 1824 /* push any packets up, then we are always ready */ 1825 netmap_txsync_to_host(na); 1826 revents |= want_tx; 1827 } 1828 if (want_rx) { 1829 kring = &na->rx_rings[lim_rx]; 1830 /* XXX replace with rxprologue etc. */ 1831 if (nm_ring_empty(kring->ring)) 1832 netmap_rxsync_from_host(na, td, dev); 1833 if (!nm_ring_empty(kring->ring)) 1834 revents |= want_rx; 1835 } 1836 return (revents); 1837 } 1838 1839 1840 /* 1841 * check_all_{tx|rx} are set if the card has more than one queue AND 1842 * the file descriptor is bound to all of them. If so, we sleep on 1843 * the "global" selinfo, otherwise we sleep on individual selinfo 1844 * (FreeBSD only allows two selinfo's per file descriptor). 1845 * The interrupt routine in the driver wake one or the other 1846 * (or both) depending on which clients are active. 1847 * 1848 * rxsync() is only called if we run out of buffers on a POLLIN. 1849 * txsync() is called if we run out of buffers on POLLOUT, or 1850 * there are pending packets to send. The latter can be disabled 1851 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 1852 */ 1853 check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); 1854 check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); 1855 1856 if (priv->np_qlast != NETMAP_HW_RING) { 1857 lim_tx = lim_rx = priv->np_qlast; 1858 } 1859 1860 /* 1861 * We start with a lock free round which is cheap if we have 1862 * slots available. If this fails, then lock and call the sync 1863 * routines. 
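	 * (The lock free round only inspects the ring pointers via
	 * nm_ring_empty(); the locked rounds below take nm_kr_tryget()
	 * on each kring and invoke the nm_txsync()/nm_rxsync() callbacks.)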
1864 */ 1865 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 1866 kring = &na->rx_rings[i]; 1867 /* XXX compare ring->cur and kring->tail */ 1868 if (!nm_ring_empty(kring->ring)) { 1869 revents |= want_rx; 1870 want_rx = 0; /* also breaks the loop */ 1871 } 1872 } 1873 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 1874 kring = &na->tx_rings[i]; 1875 /* XXX compare ring->cur and kring->tail */ 1876 if (!nm_ring_empty(kring->ring)) { 1877 revents |= want_tx; 1878 want_tx = 0; /* also breaks the loop */ 1879 } 1880 } 1881 1882 /* 1883 * If we want to push packets out (priv->np_txpoll) or 1884 * want_tx is still set, we must issue txsync calls 1885 * (on all rings, to avoid that the tx rings stall). 1886 * XXX should also check cur != hwcur on the tx rings. 1887 * Fortunately, normal tx mode has np_txpoll set. 1888 */ 1889 if (priv->np_txpoll || want_tx) { 1890 /* 1891 * The first round checks if anyone is ready, if not 1892 * do a selrecord and another round to handle races. 1893 * want_tx goes to 0 if any space is found, and is 1894 * used to skip rings with no pending transmissions. 1895 */ 1896 flush_tx: 1897 for (i = priv->np_qfirst; i < lim_tx; i++) { 1898 int found = 0; 1899 1900 kring = &na->tx_rings[i]; 1901 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 1902 continue; 1903 /* only one thread does txsync */ 1904 if (nm_kr_tryget(kring)) { 1905 D("%p lost race on txring %d, ok", priv, i); 1906 continue; 1907 } 1908 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 1909 netmap_ring_reinit(kring); 1910 revents |= POLLERR; 1911 } else { 1912 if (na->nm_txsync(na, i, 0)) 1913 revents |= POLLERR; 1914 } 1915 1916 /* 1917 * If we found new slots, notify potential 1918 * listeners on the same ring. 1919 * Since we just did a txsync, look at the copies 1920 * of cur,tail in the kring. 1921 */ 1922 found = kring->rcur != kring->rtail; 1923 nm_kr_put(kring); 1924 if (found) { /* notify other listeners */ 1925 revents |= want_tx; 1926 want_tx = 0; 1927 na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY); 1928 } 1929 } 1930 if (want_tx && retry_tx) { 1931 selrecord(td, check_all_tx ? 1932 &na->tx_si : &na->tx_rings[priv->np_qfirst].si); 1933 retry_tx = 0; 1934 goto flush_tx; 1935 } 1936 } 1937 1938 /* 1939 * If want_rx is still set scan receive rings. 1940 * Do it on all rings because otherwise we starve. 1941 */ 1942 if (want_rx) { 1943 int send_down = 0; /* transparent mode */ 1944 /* two rounds here to for race avoidance */ 1945 do_retry_rx: 1946 for (i = priv->np_qfirst; i < lim_rx; i++) { 1947 int found = 0; 1948 1949 kring = &na->rx_rings[i]; 1950 1951 if (nm_kr_tryget(kring)) { 1952 D("%p lost race on rxring %d, ok", priv, i); 1953 continue; 1954 } 1955 1956 /* 1957 * transparent mode support: collect packets 1958 * from the rxring(s). 

	/*
	 * If want_rx is still set scan receive rings.
	 * Do it on all rings because otherwise we starve.
	 */
	if (want_rx) {
		int send_down = 0; /* transparent mode */
		/* two rounds here for race avoidance */
do_retry_rx:
		for (i = priv->np_qfirst; i < lim_rx; i++) {
			int found = 0;

			kring = &na->rx_rings[i];

			if (nm_kr_tryget(kring)) {
				D("%p lost race on rxring %d, ok", priv, i);
				continue;
			}

			/*
			 * transparent mode support: collect packets
			 * from the rxring(s).
			 * XXX NR_FORWARD should only be read on
			 * physical or NIC ports
			 */
			if (netmap_fwd || (kring->ring->flags & NR_FORWARD)) {
				ND(10, "forwarding some buffers up %d to %d",
				    kring->nr_hwcur, kring->ring->cur);
				netmap_grab_packets(kring, &q, netmap_fwd);
			}

			if (na->nm_rxsync(na, i, 0))
				revents |= POLLERR;
			if (netmap_no_timestamp == 0 ||
			    kring->ring->flags & NR_TIMESTAMP) {
				microtime(&kring->ring->ts);
			}
			/* after an rxsync we can use kring->rcur, rtail */
			found = kring->rcur != kring->rtail;
			nm_kr_put(kring);
			if (found) {
				revents |= want_rx;
				retry_rx = 0;
				na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY);
			}
		}

		/* transparent mode XXX only during first pass ? */
		kring = &na->rx_rings[lim_rx];
		if (check_all_rx
		    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
			/* XXX fix to use kring fields */
			if (nm_ring_empty(kring->ring))
				send_down = netmap_rxsync_from_host(na, td, dev);
			if (!nm_ring_empty(kring->ring))
				revents |= want_rx;
		}

		if (retry_rx)
			selrecord(td, check_all_rx ?
			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
		if (send_down > 0 || retry_rx) {
			retry_rx = 0;
			if (send_down)
				goto flush_tx; /* and retry_rx */
			else
				goto do_retry_rx;
		}
	}

	/*
	 * Transparent mode: marked bufs on the rx rings between
	 * kring->nr_hwcur and ring->head are passed to the other endpoint.
	 *
	 * In this mode we also scan the sw rxring, which in
	 * turn passes packets up.
	 *
	 * XXX Transparent mode at the moment requires binding all
	 * rings to a single file descriptor.
	 */

	if (q.head)
		netmap_send_up(na->ifp, &q);

	return (revents);
}
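
/*
 * Illustrative sketch (userspace, not compiled here) of the receive
 * path that ends up in netmap_poll() above, using the netmap(4)
 * userspace API from net/netmap_user.h. 'fd' and 'nifp' come from
 * the usual open/NIOCREGIF/mmap sequence; consume() is a placeholder
 * for application code.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
 *
 *	for (;;) {
 *		poll(&pfd, 1, -1);		// sleeps via selrecord()
 *		while (!nm_ring_empty(ring)) {
 *			struct netmap_slot *slot = &ring->slot[ring->cur];
 *			char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *
 *			consume(buf, slot->len);
 *			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 *		}
 *	}
 */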


/*-------------------- driver support routines -------------------*/

static int netmap_hw_krings_create(struct netmap_adapter *);

static int
netmap_notify(struct netmap_adapter *na, u_int n_ring,
	enum txrx tx, int flags)
{
	struct netmap_kring *kring;

	if (tx == NR_TX) {
		kring = na->tx_rings + n_ring;
		selwakeuppri(&kring->si, PI_NET);
		if (flags & NAF_GLOBAL_NOTIFY)
			selwakeuppri(&na->tx_si, PI_NET);
	} else {
		kring = na->rx_rings + n_ring;
		selwakeuppri(&kring->si, PI_NET);
		if (flags & NAF_GLOBAL_NOTIFY)
			selwakeuppri(&na->rx_si, PI_NET);
	}
	return 0;
}


// XXX check handling of failures
int
netmap_attach_common(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
		D("%s: invalid rings tx %d rx %d",
		    ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
		return EINVAL;
	}
	WNA(ifp) = na;

	/* The following is only needed for adapters that use the host port.
	 * XXX do we have something similar for linux ?
	 */
#ifdef __FreeBSD__
	na->if_input = ifp->if_input; /* for netmap_send_up */
#endif /* __FreeBSD__ */

	NETMAP_SET_CAPABLE(ifp);
	if (na->nm_krings_create == NULL) {
		na->nm_krings_create = netmap_hw_krings_create;
		na->nm_krings_delete = netmap_hw_krings_delete;
	}
	if (na->nm_notify == NULL)
		na->nm_notify = netmap_notify;
	na->active_fds = 0;

	if (na->nm_mem == NULL)
		na->nm_mem = &nm_mem;
	return 0;
}


void
netmap_detach_common(struct netmap_adapter *na)
{
	if (na->ifp)
		WNA(na->ifp) = NULL; /* XXX do we need this? */

	if (na->tx_rings) { /* XXX should not happen */
		D("freeing leftover tx_rings");
		na->nm_krings_delete(na);
	}
	if (na->na_flags & NAF_MEM_OWNER)
		netmap_mem_private_delete(na->nm_mem);
	bzero(na, sizeof(*na));
	free(na, M_DEVBUF);
}


/*
 * Initialize a ``netmap_adapter`` object created by a driver on attach.
 * We allocate a block of memory with room for a struct netmap_adapter
 * plus two sets of N+2 struct netmap_kring (where N is the number
 * of hardware rings):
 *   krings 0..N-1 are for the hardware queues,
 *   kring N is for the host stack queue,
 *   kring N+1 is only used for the selinfo for all queues. // XXX still true ?
 * Return 0 on success, ENOMEM otherwise.
 */
int
netmap_attach(struct netmap_adapter *arg)
{
	struct netmap_hw_adapter *hwna = NULL;
	// XXX when is arg == NULL ?
	struct ifnet *ifp = arg ? arg->ifp : NULL;

	if (arg == NULL || ifp == NULL)
		goto fail;
	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (hwna == NULL)
		goto fail;
	hwna->up = *arg;
	if (netmap_attach_common(&hwna->up)) {
		free(hwna, M_DEVBUF);
		goto fail;
	}
	netmap_adapter_get(&hwna->up);

#ifdef linux
	if (ifp->netdev_ops) {
		/* prepare a clone of the netdev ops */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
#else
		hwna->nm_ndo = *ifp->netdev_ops;
#endif
	}
	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
#endif /* linux */

	D("success for %s", NM_IFPNAME(ifp));
	return 0;

fail:
	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
	if (ifp)	/* ifp may be NULL if we got here on a bad argument */
		netmap_detach(ifp);
	return (hwna ? EINVAL : ENOMEM);
}
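

/*
 * Illustrative sketch (not compiled here) of how a NIC driver is
 * expected to fill in a struct netmap_adapter and call netmap_attach()
 * at the end of its own attach routine. The foo_* names and the
 * fields read from 'adapter' are placeholders for driver-specific code.
 *
 *	static void
 *	foo_netmap_attach(struct foo_adapter *adapter)
 *	{
 *		struct netmap_adapter na;
 *
 *		bzero(&na, sizeof(na));
 *		na.ifp = adapter->ifp;
 *		na.num_tx_desc = adapter->num_tx_desc;
 *		na.num_rx_desc = adapter->num_rx_desc;
 *		na.nm_register = foo_netmap_reg;
 *		na.nm_txsync = foo_netmap_txsync;
 *		na.nm_rxsync = foo_netmap_rxsync;
 *		na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
 *		netmap_attach(&na);
 *	}
 *
 * The matching cleanup is a call to netmap_detach(ifp) from the
 * driver's detach routine (see netmap_detach() below).
 */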


void
NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
{
	if (!na) {
		return;
	}

	refcount_acquire(&na->na_refcount);
}


/* returns 1 iff the netmap_adapter is destroyed */
int
NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
{
	if (!na)
		return 1;

	if (!refcount_release(&na->na_refcount))
		return 0;

	if (na->nm_dtor)
		na->nm_dtor(na);

	netmap_detach_common(na);

	return 1;
}


int
netmap_hw_krings_create(struct netmap_adapter *na)
{
	int ret = netmap_krings_create(na,
		na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
	if (ret == 0) {
		/* initialize the mbq for the sw rx ring */
		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
		ND("initialized sw rx queue %d", na->num_rx_rings);
	}
	return ret;
}



/*
 * Free the allocated memory linked to the given ``netmap_adapter``
 * object.
 */
void
netmap_detach(struct ifnet *ifp)
{
	struct netmap_adapter *na = NA(ifp);

	if (!na)
		return;

	NMG_LOCK();
	netmap_disable_all_rings(ifp);
	if (!netmap_adapter_put(na)) {
		/* someone is still using the adapter,
		 * tell them that the interface is gone
		 */
		na->ifp = NULL;
		/* give them a chance to notice */
		netmap_enable_all_rings(ifp);
	}
	NMG_UNLOCK();
}


/*
 * Intercept packets from the network stack and pass them
 * to netmap as incoming packets on the 'software' ring.
 *
 * We only store packets in a bounded mbq and then copy them
 * in the relevant rxsync routine.
 *
 * We rely on the OS to make sure that the ifp and na do not go
 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
 * In nm_register() or whenever there is a reinitialization,
 * we make sure to make the mode change visible here.
 */
int
netmap_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;
	u_int len = MBUF_LEN(m);
	u_int error = ENOBUFS;
	struct mbq *q;
	int space;

	// XXX [Linux] we do not need this lock
	// if we follow the down/configure/up protocol -gl
	// mtx_lock(&na->core_lock);

	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
		D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
		error = ENXIO;
		goto done;
	}

	kring = &na->rx_rings[na->num_rx_rings];
	q = &kring->rx_queue;

	// XXX reconsider long packets if we handle fragments
	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
		goto done;
	}

	/* protect against rxsync_from_host(), netmap_sw_to_nic()
	 * and maybe other instances of netmap_transmit (the latter
	 * not possible on Linux).
	 * Also avoid overflowing the queue.
	 */
	mtx_lock(&q->lock);

	space = kring->nr_hwtail - kring->nr_hwcur;
	if (space < 0)
		space += kring->nkr_num_slots;
	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
			NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail,
			mbq_len(q), len, m);
	} else {
		mbq_enqueue(q, m);
		ND(10, "%s %d bufs in queue len %d m %p",
			NM_IFPNAME(ifp), mbq_len(q), len, m);
		/* notify outside the lock */
		m = NULL;
		error = 0;
	}
	mtx_unlock(&q->lock);

done:
	if (m)
		m_freem(m);
	/* unconditionally wake up listeners */
	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);

	return (error);
}
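

/*
 * Illustrative sketch (userspace, not compiled here): a process that
 * wants to read the packets queued by netmap_transmit() above attaches
 * to the host ("software") rings by passing NETMAP_SW_RING in the
 * registration request:
 *
 *	struct nmreq req;
 *
 *	bzero(&req, sizeof(req));
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));	// example NIC
 *	req.nr_version = NETMAP_API;
 *	req.nr_ringid = NETMAP_SW_RING;
 *	ioctl(fd, NIOCREGIF, &req);
 *
 * After mmap()ing the shared region, packets from the host stack show
 * up in the extra rx ring (index num_rx_rings), served by the
 * priv->np_qfirst == NETMAP_SW_RING branch of netmap_poll() above.
 */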


/*
 * netmap_reset() is called by the driver routines when reinitializing
 * a ring. The driver is in charge of locking to protect the kring.
 * If native netmap mode is not set, just return NULL.
 */
struct netmap_slot *
netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
	u_int new_cur)
{
	struct netmap_kring *kring;
	int new_hwofs, lim;

	if (na == NULL) {
		D("NULL na, should not happen");
		return NULL;	/* no netmap support here */
	}
	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
		ND("interface not in netmap mode");
		return NULL;	/* nothing to reinitialize */
	}

	/* XXX note: in the new scheme, we are not guaranteed to be
	 * under lock (e.g. when called on a device reset).
	 * In this case, we should set a flag and not trust the
	 * values too much. In practice: TODO
	 * - set a RESET flag somewhere in the kring
	 * - do the processing in a conservative way
	 * - let *sync() fix things up at the end.
	 */
	if (tx == NR_TX) {
		if (n >= na->num_tx_rings)
			return NULL;
		kring = na->tx_rings + n;
		// XXX check whether we should use hwcur or rcur
		new_hwofs = kring->nr_hwcur - new_cur;
	} else {
		if (n >= na->num_rx_rings)
			return NULL;
		kring = na->rx_rings + n;
		new_hwofs = kring->nr_hwtail - new_cur;
	}
	lim = kring->nkr_num_slots - 1;
	if (new_hwofs > lim)
		new_hwofs -= lim + 1;

	/* Always set the new offset value and realign the ring. */
	if (netmap_verbose)
		D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
			NM_IFPNAME(na->ifp),
			tx == NR_TX ? "TX" : "RX", n,
			kring->nkr_hwofs, new_hwofs,
			kring->nr_hwtail,
			tx == NR_TX ? lim : kring->nr_hwtail);
	kring->nkr_hwofs = new_hwofs;
	if (tx == NR_TX) {
		kring->nr_hwtail = kring->nr_hwcur + lim;
		if (kring->nr_hwtail > lim)
			kring->nr_hwtail -= lim + 1;
	}

#if 0 // def linux
	/* XXX check that the mappings are correct */
	/* need ring_nr, adapter->pdev, direction */
	buffer_info->dma = dma_map_single(&pdev->dev, addr,
		adapter->rx_buffer_len, DMA_FROM_DEVICE);
	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
		D("error mapping rx netmap buffer %d", i);
		// XXX fix error handling
	}

#endif /* linux */
	/*
	 * Wakeup on the individual and global selwait.
	 * We do the wakeup here, but the ring is not yet reconfigured.
	 * However, we are under lock so there are no races.
	 */
	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
	return kring->ring->slot;
}
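

/*
 * Illustrative sketch (not compiled here) of the typical use of
 * netmap_reset() from a driver's ring-initialization routine. The
 * foo_/txr-> names and the descriptor-programming step are
 * placeholders for driver-specific code; NMB() and netmap_idx_n2k()
 * are the usual helpers from netmap_kern.h.
 *
 *	// in foo_setup_transmit_ring(), for tx ring txr->me
 *	struct netmap_adapter *na = NA(adapter->ifp);
 *	struct netmap_slot *slot;
 *	int j, sj;
 *
 *	slot = netmap_reset(na, NR_TX, txr->me, 0);
 *	if (slot != NULL) {	// only non-NULL in native netmap mode
 *		for (j = 0; j < na->num_tx_desc; j++) {
 *			sj = netmap_idx_n2k(&na->tx_rings[txr->me], j);
 *			void *addr = NMB(slot + sj);
 *
 *			// point the j-th NIC descriptor at 'addr'
 *		}
 *	}
 */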


/*
 * Dispatch rx/tx interrupts to the netmap rings.
 *
 * "work_done" is non-null on the RX path, NULL for the TX path.
 * We rely on the OS to make sure that there is only one active
 * instance per queue, and that there is appropriate locking.
 *
 * The 'notify' routine depends on what the ring is attached to.
 * - for a netmap file descriptor, do a selwakeup on the individual
 *   waitqueue, plus one on the global one if needed
 * - for a switch, call the proper forwarding routine
 * - XXX more ?
 */
void
netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;

	q &= NETMAP_RING_MASK;

	if (netmap_verbose) {
		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
	}

	if (work_done) { /* RX path */
		if (q >= na->num_rx_rings)
			return;	// not a physical queue
		kring = na->rx_rings + q;
		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
		na->nm_notify(na, q, NR_RX,
			(na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
		*work_done = 1; /* do not fire napi again */
	} else { /* TX path */
		if (q >= na->num_tx_rings)
			return;	// not a physical queue
		kring = na->tx_rings + q;
		na->nm_notify(na, q, NR_TX,
			(na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
	}
}


/*
 * Default functions to handle rx/tx interrupts from a physical device.
 * "work_done" is non-null on the RX path, NULL for the TX path.
 *
 * If the card is not in netmap mode, simply return 0,
 * so that the caller proceeds with regular processing.
 * Otherwise call netmap_common_irq() and return 1.
 *
 * If the card is connected to a netmap file descriptor,
 * do a selwakeup on the individual queue, plus one on the global one
 * if needed (multiqueue card _and_ there are multiqueue listeners),
 * and return 1.
 *
 * Finally, if called on rx from an interface connected to a switch,
 * call the proper forwarding routine and return 1.
 */
int
netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	// XXX could we check NAF_NATIVE_ON ?
	if (!(ifp->if_capenable & IFCAP_NETMAP))
		return 0;

	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
		ND("use regular interrupt");
		return 0;
	}

	netmap_common_irq(ifp, q, work_done);
	return 1;
}


/*
 * Module loader and unloader
 *
 * netmap_init() creates the /dev/netmap device and initializes
 * all global variables. Returns 0 on success, errno on failure
 * (though failure is not really expected).
 *
 * netmap_fini() destroys everything.
 */

static struct cdev *netmap_dev; /* /dev/netmap character device. */
extern struct cdevsw netmap_cdevsw;


void
netmap_fini(void)
{
	// XXX destroy_bridges() ?
	if (netmap_dev)
		destroy_dev(netmap_dev);
	netmap_mem_fini();
	NMG_LOCK_DESTROY();
	printf("netmap: unloaded module.\n");
}


int
netmap_init(void)
{
	int error;

	NMG_LOCK_INIT();

	error = netmap_mem_init();
	if (error != 0)
		goto fail;
	/* XXX could use make_dev_credv() to get error number */
	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
		"netmap");
	if (!netmap_dev)
		goto fail;

	netmap_init_bridges();
	printf("netmap: loaded module\n");
	return (0);
fail:
	netmap_fini();
	return (EINVAL); /* may be incorrect */
}
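

/*
 * Illustrative sketch (not compiled here) of the kind of FreeBSD
 * module glue that invokes netmap_init()/netmap_fini() above. The
 * actual glue is OS-specific and lives outside this file; this is
 * only a reminder of the expected shape.
 *
 *	static int
 *	netmap_loader(__unused struct module *module,
 *	    int event, __unused void *arg)
 *	{
 *		int error = 0;
 *
 *		switch (event) {
 *		case MOD_LOAD:
 *			error = netmap_init();
 *			break;
 *		case MOD_UNLOAD:
 *			netmap_fini();
 *			break;
 *		default:
 *			error = EOPNOTSUPP;
 *			break;
 *		}
 *		return (error);
 *	}
 *
 *	DEV_MODULE(netmap, netmap_loader, NULL);
 */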