/*
 * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * $FreeBSD$
 *
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    a select()able file descriptor on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *

SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.
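
As a concrete illustration of steps 1-6 above, the sketch below shows a
minimal receive loop written against the userspace API of this netmap
version (see netmap(4) and net/netmap_user.h). It is only an example:
error handling is omitted, "em0" is an arbitrary interface name, and the
exact macro names should be checked against the installed headers.

	int fd = open("/dev/netmap", O_RDWR);
	struct nmreq req;

	bzero(&req, sizeof(req));
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	req.nr_version = NETMAP_API;
	ioctl(fd, NIOCREGIF, &req);		// bind all hw rings of em0
	char *mem = mmap(NULL, req.nr_memsize,
		PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
	struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);

	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		poll(&pfd, 1, -1);		// wait for packets (step 6)
		while (ring->avail > 0) {	// consume ready slots (step 4)
			struct netmap_slot *slot = &ring->slot[ring->cur];
			char *buf = NETMAP_BUF(ring, slot->buf_idx);

			// ... process slot->len bytes starting at buf ...
			ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
			ring->avail--;
		}
	}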

LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initialization/reset of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(), this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */

/*
 * OS-specific code that is used only within this file.
129 * Other OS-specific code that must be accessed by drivers 130 * is present in netmap_kern.h 131 */ 132 133 #if defined(__FreeBSD__) 134 #include <sys/cdefs.h> /* prerequisite */ 135 #include <sys/types.h> 136 #include <sys/errno.h> 137 #include <sys/param.h> /* defines used in kernel.h */ 138 #include <sys/kernel.h> /* types used in module initialization */ 139 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 140 #include <sys/sockio.h> 141 #include <sys/socketvar.h> /* struct socket */ 142 #include <sys/malloc.h> 143 #include <sys/poll.h> 144 #include <sys/rwlock.h> 145 #include <sys/socket.h> /* sockaddrs */ 146 #include <sys/selinfo.h> 147 #include <sys/sysctl.h> 148 #include <net/if.h> 149 #include <net/if_var.h> 150 #include <net/bpf.h> /* BIOCIMMEDIATE */ 151 #include <machine/bus.h> /* bus_dmamap_* */ 152 #include <sys/endian.h> 153 #include <sys/refcount.h> 154 #include <sys/jail.h> 155 156 157 /* reduce conditional code */ 158 #define init_waitqueue_head(x) // only needed in linux 159 160 161 162 #elif defined(linux) 163 164 #include "bsd_glue.h" 165 166 167 168 #elif defined(__APPLE__) 169 170 #warning OSX support is only partial 171 #include "osx_glue.h" 172 173 #else 174 175 #error Unsupported platform 176 177 #endif /* unsupported */ 178 179 /* 180 * common headers 181 */ 182 #include <net/netmap.h> 183 #include <dev/netmap/netmap_kern.h> 184 #include <dev/netmap/netmap_mem2.h> 185 186 187 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 188 189 /* 190 * The following variables are used by the drivers and replicate 191 * fields in the global memory pool. They only refer to buffers 192 * used by physical interfaces. 193 */ 194 u_int netmap_total_buffers; 195 u_int netmap_buf_size; 196 char *netmap_buffer_base; /* also address of an invalid buffer */ 197 198 /* user-controlled variables */ 199 int netmap_verbose; 200 201 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 202 203 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 204 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 205 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 206 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 207 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 208 int netmap_mitigate = 1; 209 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 210 int netmap_no_pendintr = 1; 211 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 212 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 213 int netmap_txsync_retry = 2; 214 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 215 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 216 217 int netmap_flags = 0; /* debug flags */ 218 int netmap_fwd = 0; /* force transparent mode */ 219 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 220 221 /* 222 * netmap_admode selects the netmap mode to use. 223 * Invalid values are reset to NETMAP_ADMODE_BEST 224 */ 225 enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ 226 NETMAP_ADMODE_NATIVE, /* either native or none */ 227 NETMAP_ADMODE_GENERIC, /* force generic */ 228 NETMAP_ADMODE_LAST }; 229 #define NETMAP_ADMODE_NATIVE 1 /* Force native netmap adapter. */ 230 #define NETMAP_ADMODE_GENERIC 2 /* Force generic netmap adapter. */ 231 #define NETMAP_ADMODE_BEST 0 /* Priority to native netmap adapter. */ 232 static int netmap_admode = NETMAP_ADMODE_BEST; 233 234 int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. 
*/ 235 int netmap_generic_ringsize = 1024; /* Generic ringsize. */ 236 237 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 238 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); 239 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); 240 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); 241 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); 242 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); 243 244 NMG_LOCK_T netmap_global_lock; 245 246 247 static void 248 nm_kr_get(struct netmap_kring *kr) 249 { 250 while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) 251 tsleep(kr, 0, "NM_KR_GET", 4); 252 } 253 254 255 void 256 netmap_disable_ring(struct netmap_kring *kr) 257 { 258 kr->nkr_stopped = 1; 259 nm_kr_get(kr); 260 mtx_lock(&kr->q_lock); 261 mtx_unlock(&kr->q_lock); 262 nm_kr_put(kr); 263 } 264 265 266 static void 267 netmap_set_all_rings(struct ifnet *ifp, int stopped) 268 { 269 struct netmap_adapter *na; 270 int i; 271 272 if (!(ifp->if_capenable & IFCAP_NETMAP)) 273 return; 274 275 na = NA(ifp); 276 277 for (i = 0; i <= na->num_tx_rings; i++) { 278 if (stopped) 279 netmap_disable_ring(na->tx_rings + i); 280 else 281 na->tx_rings[i].nkr_stopped = 0; 282 na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY | 283 (i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0)); 284 } 285 286 for (i = 0; i <= na->num_rx_rings; i++) { 287 if (stopped) 288 netmap_disable_ring(na->rx_rings + i); 289 else 290 na->rx_rings[i].nkr_stopped = 0; 291 na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY | 292 (i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0)); 293 } 294 } 295 296 297 void 298 netmap_disable_all_rings(struct ifnet *ifp) 299 { 300 netmap_set_all_rings(ifp, 1 /* stopped */); 301 } 302 303 304 void 305 netmap_enable_all_rings(struct ifnet *ifp) 306 { 307 netmap_set_all_rings(ifp, 0 /* enabled */); 308 } 309 310 311 /* 312 * generic bound_checking function 313 */ 314 u_int 315 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 316 { 317 u_int oldv = *v; 318 const char *op = NULL; 319 320 if (dflt < lo) 321 dflt = lo; 322 if (dflt > hi) 323 dflt = hi; 324 if (oldv < lo) { 325 *v = dflt; 326 op = "Bump"; 327 } else if (oldv > hi) { 328 *v = hi; 329 op = "Clamp"; 330 } 331 if (op && msg) 332 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 333 return *v; 334 } 335 336 337 /* 338 * packet-dump function, user-supplied or static buffer. 339 * The destination buffer must be at least 30+4*len 340 */ 341 const char * 342 nm_dump_buf(char *p, int len, int lim, char *dst) 343 { 344 static char _dst[8192]; 345 int i, j, i0; 346 static char hex[] ="0123456789abcdef"; 347 char *o; /* output position */ 348 349 #define P_HI(x) hex[((x) & 0xf0)>>4] 350 #define P_LO(x) hex[((x) & 0xf)] 351 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? 
(x) : '.') 352 if (!dst) 353 dst = _dst; 354 if (lim <= 0 || lim > len) 355 lim = len; 356 o = dst; 357 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 358 o += strlen(o); 359 /* hexdump routine */ 360 for (i = 0; i < lim; ) { 361 sprintf(o, "%5d: ", i); 362 o += strlen(o); 363 memset(o, ' ', 48); 364 i0 = i; 365 for (j=0; j < 16 && i < lim; i++, j++) { 366 o[j*3] = P_HI(p[i]); 367 o[j*3+1] = P_LO(p[i]); 368 } 369 i = i0; 370 for (j=0; j < 16 && i < lim; i++, j++) 371 o[j + 48] = P_C(p[i]); 372 o[j+48] = '\n'; 373 o += j+49; 374 } 375 *o = '\0'; 376 #undef P_HI 377 #undef P_LO 378 #undef P_C 379 return dst; 380 } 381 382 383 384 /* 385 * Fetch configuration from the device, to cope with dynamic 386 * reconfigurations after loading the module. 387 */ 388 int 389 netmap_update_config(struct netmap_adapter *na) 390 { 391 struct ifnet *ifp = na->ifp; 392 u_int txr, txd, rxr, rxd; 393 394 txr = txd = rxr = rxd = 0; 395 if (na->nm_config) { 396 na->nm_config(na, &txr, &txd, &rxr, &rxd); 397 } else { 398 /* take whatever we had at init time */ 399 txr = na->num_tx_rings; 400 txd = na->num_tx_desc; 401 rxr = na->num_rx_rings; 402 rxd = na->num_rx_desc; 403 } 404 405 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 406 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 407 return 0; /* nothing changed */ 408 if (netmap_verbose || na->active_fds > 0) { 409 D("stored config %s: txring %d x %d, rxring %d x %d", 410 NM_IFPNAME(ifp), 411 na->num_tx_rings, na->num_tx_desc, 412 na->num_rx_rings, na->num_rx_desc); 413 D("new config %s: txring %d x %d, rxring %d x %d", 414 NM_IFPNAME(ifp), txr, txd, rxr, rxd); 415 } 416 if (na->active_fds == 0) { 417 D("configuration changed (but fine)"); 418 na->num_tx_rings = txr; 419 na->num_tx_desc = txd; 420 na->num_rx_rings = rxr; 421 na->num_rx_desc = rxd; 422 return 0; 423 } 424 D("configuration changed while active, this is bad..."); 425 return 1; 426 } 427 428 429 int 430 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom) 431 { 432 u_int i, len, ndesc; 433 struct netmap_kring *kring; 434 435 len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; 436 437 na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); 438 if (na->tx_rings == NULL) { 439 D("Cannot allocate krings"); 440 return ENOMEM; 441 } 442 na->rx_rings = na->tx_rings + ntx; 443 444 ndesc = na->num_tx_desc; 445 for (i = 0; i < ntx; i++) { /* Transmit rings */ 446 kring = &na->tx_rings[i]; 447 bzero(kring, sizeof(*kring)); 448 kring->na = na; 449 kring->nkr_num_slots = ndesc; 450 /* 451 * IMPORTANT: 452 * Always keep one slot empty, so we can detect new 453 * transmissions comparing cur and nr_hwcur (they are 454 * the same only if there are no new transmissions). 
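 *
 * As an illustration (a sketch, not part of the API): with this
 * one-empty-slot convention, the number of slots the application has
 * filled but the kernel has not yet consumed is
 *
 *	busy = ring->cur >= kring->nr_hwcur ?
 *		ring->cur - kring->nr_hwcur :
 *		ring->cur + kring->nkr_num_slots - kring->nr_hwcur;
 *
 * (where ring is kring->ring). busy can never reach nkr_num_slots,
 * so busy == 0, i.e. cur == nr_hwcur, unambiguously means
 * "no new transmissions" rather than "ring completely full".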
455 */ 456 kring->nr_hwavail = ndesc - 1; 457 mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); 458 init_waitqueue_head(&kring->si); 459 } 460 461 ndesc = na->num_rx_desc; 462 for (i = 0; i < nrx; i++) { /* Receive rings */ 463 kring = &na->rx_rings[i]; 464 bzero(kring, sizeof(*kring)); 465 kring->na = na; 466 kring->nkr_num_slots = ndesc; 467 mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); 468 init_waitqueue_head(&kring->si); 469 } 470 init_waitqueue_head(&na->tx_si); 471 init_waitqueue_head(&na->rx_si); 472 473 na->tailroom = na->rx_rings + nrx; 474 475 return 0; 476 477 } 478 479 480 void 481 netmap_krings_delete(struct netmap_adapter *na) 482 { 483 int i; 484 485 for (i = 0; i < na->num_tx_rings + 1; i++) { 486 mtx_destroy(&na->tx_rings[i].q_lock); 487 } 488 for (i = 0; i < na->num_rx_rings + 1; i++) { 489 mtx_destroy(&na->rx_rings[i].q_lock); 490 } 491 free(na->tx_rings, M_DEVBUF); 492 na->tx_rings = na->rx_rings = na->tailroom = NULL; 493 } 494 495 496 static struct netmap_if* 497 netmap_if_new(const char *ifname, struct netmap_adapter *na) 498 { 499 struct netmap_if *nifp; 500 501 if (netmap_update_config(na)) { 502 /* configuration mismatch, report and fail */ 503 return NULL; 504 } 505 506 if (na->active_fds) 507 goto final; 508 509 if (na->nm_krings_create(na)) 510 goto cleanup; 511 512 if (netmap_mem_rings_create(na)) 513 goto cleanup; 514 515 final: 516 517 nifp = netmap_mem_if_new(ifname, na); 518 if (nifp == NULL) 519 goto cleanup; 520 521 return (nifp); 522 523 cleanup: 524 525 if (na->active_fds == 0) { 526 netmap_mem_rings_delete(na); 527 na->nm_krings_delete(na); 528 } 529 530 return NULL; 531 } 532 533 534 /* grab a reference to the memory allocator, if we don't have one already. The 535 * reference is taken from the netmap_adapter registered with the priv. 536 * 537 */ 538 static int 539 netmap_get_memory_locked(struct netmap_priv_d* p) 540 { 541 struct netmap_mem_d *nmd; 542 int error = 0; 543 544 if (p->np_na == NULL) { 545 if (!netmap_mmap_unreg) 546 return ENODEV; 547 /* for compatibility with older versions of the API 548 * we use the global allocator when no interface has been 549 * registered 550 */ 551 nmd = &nm_mem; 552 } else { 553 nmd = p->np_na->nm_mem; 554 } 555 if (p->np_mref == NULL) { 556 error = netmap_mem_finalize(nmd); 557 if (!error) 558 p->np_mref = nmd; 559 } else if (p->np_mref != nmd) { 560 /* a virtual port has been registered, but previous 561 * syscalls already used the global allocator. 562 * We cannot continue 563 */ 564 error = ENODEV; 565 } 566 return error; 567 } 568 569 570 int 571 netmap_get_memory(struct netmap_priv_d* p) 572 { 573 int error; 574 NMG_LOCK(); 575 error = netmap_get_memory_locked(p); 576 NMG_UNLOCK(); 577 return error; 578 } 579 580 581 static int 582 netmap_have_memory_locked(struct netmap_priv_d* p) 583 { 584 return p->np_mref != NULL; 585 } 586 587 588 static void 589 netmap_drop_memory_locked(struct netmap_priv_d* p) 590 { 591 if (p->np_mref) { 592 netmap_mem_deref(p->np_mref); 593 p->np_mref = NULL; 594 } 595 } 596 597 598 /* 599 * File descriptor's private data destructor. 600 * 601 * Call nm_register(ifp,0) to stop netmap mode on the interface and 602 * revert to normal operation. We expect that np_na->ifp has not gone. 603 * The second argument is the nifp to work on. In some cases it is 604 * not attached yet to the netmap_priv_d so we need to pass it as 605 * a separate argument. 
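 *
 * For reference, the nm_register() callback invoked below is supplied by
 * the driver; a sketch of its usual shape is given here (hypothetical
 * driver "foo", details vary per NIC and are not dictated by this file):
 *
 *	static int
 *	foo_netmap_reg(struct netmap_adapter *na, int onoff)
 *	{
 *		struct ifnet *ifp = na->ifp;
 *
 *		foo_disable_intr(ifp);		// quiesce the NIC
 *		if (onoff)
 *			ifp->if_capenable |= IFCAP_NETMAP;
 *		else
 *			ifp->if_capenable &= ~IFCAP_NETMAP;
 *		foo_init_rings(ifp);	// rebind rings to netmap or mbuf buffers
 *		foo_enable_intr(ifp);
 *		return 0;
 *	}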
606 */ 607 /* call with NMG_LOCK held */ 608 static void 609 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 610 { 611 struct netmap_adapter *na = priv->np_na; 612 struct ifnet *ifp = na->ifp; 613 614 NMG_LOCK_ASSERT(); 615 na->active_fds--; 616 if (na->active_fds <= 0) { /* last instance */ 617 618 if (netmap_verbose) 619 D("deleting last instance for %s", NM_IFPNAME(ifp)); 620 /* 621 * (TO CHECK) This function is only called 622 * when the last reference to this file descriptor goes 623 * away. This means we cannot have any pending poll() 624 * or interrupt routine operating on the structure. 625 * XXX The file may be closed in a thread while 626 * another thread is using it. 627 * Linux keeps the file opened until the last reference 628 * by any outstanding ioctl/poll or mmap is gone. 629 * FreeBSD does not track mmap()s (but we do) and 630 * wakes up any sleeping poll(). Need to check what 631 * happens if the close() occurs while a concurrent 632 * syscall is running. 633 */ 634 if (ifp) 635 na->nm_register(na, 0); /* off, clear flags */ 636 /* Wake up any sleeping threads. netmap_poll will 637 * then return POLLERR 638 * XXX The wake up now must happen during *_down(), when 639 * we order all activities to stop. -gl 640 */ 641 /* XXX kqueue(9) needed; these will mirror knlist_init. */ 642 /* knlist_destroy(&na->tx_si.si_note); */ 643 /* knlist_destroy(&na->rx_si.si_note); */ 644 645 /* delete rings and buffers */ 646 netmap_mem_rings_delete(na); 647 na->nm_krings_delete(na); 648 } 649 /* delete the nifp */ 650 netmap_mem_if_delete(na, nifp); 651 } 652 653 654 /* 655 * returns 1 if this is the last instance and we can free priv 656 */ 657 int 658 netmap_dtor_locked(struct netmap_priv_d *priv) 659 { 660 struct netmap_adapter *na = priv->np_na; 661 662 #ifdef __FreeBSD__ 663 /* 664 * np_refcount is the number of active mmaps on 665 * this file descriptor 666 */ 667 if (--priv->np_refcount > 0) { 668 return 0; 669 } 670 #endif /* __FreeBSD__ */ 671 if (!na) { 672 return 1; //XXX is it correct? 673 } 674 netmap_do_unregif(priv, priv->np_nifp); 675 priv->np_nifp = NULL; 676 netmap_drop_memory_locked(priv); 677 if (priv->np_na) { 678 netmap_adapter_put(na); 679 priv->np_na = NULL; 680 } 681 return 1; 682 } 683 684 685 void 686 netmap_dtor(void *data) 687 { 688 struct netmap_priv_d *priv = data; 689 int last_instance; 690 691 NMG_LOCK(); 692 last_instance = netmap_dtor_locked(priv); 693 NMG_UNLOCK(); 694 if (last_instance) { 695 bzero(priv, sizeof(*priv)); /* for safety */ 696 free(priv, M_DEVBUF); 697 } 698 } 699 700 701 702 703 /* 704 * Handlers for synchronization of the queues from/to the host. 705 * Netmap has two operating modes: 706 * - in the default mode, the rings connected to the host stack are 707 * just another ring pair managed by userspace; 708 * - in transparent mode (XXX to be defined) incoming packets 709 * (from the host or the NIC) are marked as NS_FORWARD upon 710 * arrival, and the user application has a chance to reset the 711 * flag for packets that should be dropped. 712 * On the RXSYNC or poll(), packets in RX rings between 713 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 714 * to the other side. 715 * The transfer NIC --> host is relatively easy, just encapsulate 716 * into mbufs and we are done. The host --> NIC side is slightly 717 * harder because there might not be room in the tx ring so it 718 * might take a while before releasing the buffer. 
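 *
 * From the application's point of view, transparent mode looks roughly
 * like the sketch below (names from the userspace API of this version;
 * should_drop() is a hypothetical filter):
 *
 *	ring->flags |= NR_FORWARD;		// enable forwarding on this ring
 *	while (ring->avail > 0) {
 *		struct netmap_slot *slot = &ring->slot[ring->cur];
 *
 *		if (should_drop(slot))
 *			slot->flags &= ~NS_FORWARD;	// do not pass this one on
 *		ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
 *		ring->avail--;
 *	}
 *	ioctl(fd, NIOCRXSYNC, NULL);	// slots left with NS_FORWARD are moved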
719 */ 720 721 722 /* 723 * pass a chain of buffers to the host stack as coming from 'dst' 724 */ 725 static void 726 netmap_send_up(struct ifnet *dst, struct mbq *q) 727 { 728 struct mbuf *m; 729 730 /* send packets up, outside the lock */ 731 while ((m = mbq_dequeue(q)) != NULL) { 732 if (netmap_verbose & NM_VERB_HOST) 733 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 734 NM_SEND_UP(dst, m); 735 } 736 mbq_destroy(q); 737 } 738 739 740 /* 741 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 742 * Run from hwcur to cur - reserved 743 */ 744 static void 745 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 746 { 747 /* Take packets from hwcur to cur-reserved and pass them up. 748 * In case of no buffers we give up. At the end of the loop, 749 * the queue is drained in all cases. 750 * XXX handle reserved 751 */ 752 u_int lim = kring->nkr_num_slots - 1; 753 struct mbuf *m; 754 u_int k = kring->ring->cur, n = kring->ring->reserved; 755 struct netmap_adapter *na = kring->na; 756 757 /* compute the final position, ring->cur - ring->reserved */ 758 if (n > 0) { 759 if (k < n) 760 k += kring->nkr_num_slots; 761 k += n; 762 } 763 for (n = kring->nr_hwcur; n != k;) { 764 struct netmap_slot *slot = &kring->ring->slot[n]; 765 766 n = nm_next(n, lim); 767 if ((slot->flags & NS_FORWARD) == 0 && !force) 768 continue; 769 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { 770 D("bad pkt at %d len %d", n, slot->len); 771 continue; 772 } 773 slot->flags &= ~NS_FORWARD; // XXX needed ? 774 /* XXX adapt to the case of a multisegment packet */ 775 m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); 776 777 if (m == NULL) 778 break; 779 mbq_enqueue(q, m); 780 } 781 } 782 783 784 /* 785 * The host ring has packets from nr_hwcur to (cur - reserved) 786 * to be sent down to the NIC. 787 * We need to use the queue lock on the source (host RX ring) 788 * to protect against netmap_transmit. 789 * If the user is well behaved we do not need to acquire locks 790 * on the destination(s), 791 * so we only need to make sure that there are no panics because 792 * of user errors. 793 * XXX verify 794 * 795 * We scan the tx rings, which have just been 796 * flushed so nr_hwcur == cur. Pushing packets down means 797 * increment cur and decrement avail. 
798 * XXX to be verified 799 */ 800 static void 801 netmap_sw_to_nic(struct netmap_adapter *na) 802 { 803 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 804 struct netmap_kring *k1 = &na->tx_rings[0]; 805 u_int i, howmany, src_lim, dst_lim; 806 807 /* XXX we should also check that the carrier is on */ 808 if (kring->nkr_stopped) 809 return; 810 811 mtx_lock(&kring->q_lock); 812 813 if (kring->nkr_stopped) 814 goto out; 815 816 howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ 817 818 src_lim = kring->nkr_num_slots - 1; 819 for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { 820 ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); 821 dst_lim = k1->nkr_num_slots - 1; 822 while (howmany > 0 && k1->ring->avail > 0) { 823 struct netmap_slot *src, *dst, tmp; 824 src = &kring->ring->slot[kring->nr_hwcur]; 825 dst = &k1->ring->slot[k1->ring->cur]; 826 tmp = *src; 827 src->buf_idx = dst->buf_idx; 828 src->flags = NS_BUF_CHANGED; 829 830 dst->buf_idx = tmp.buf_idx; 831 dst->len = tmp.len; 832 dst->flags = NS_BUF_CHANGED; 833 ND("out len %d buf %d from %d to %d", 834 dst->len, dst->buf_idx, 835 kring->nr_hwcur, k1->ring->cur); 836 837 kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); 838 howmany--; 839 kring->nr_hwavail--; 840 k1->ring->cur = nm_next(k1->ring->cur, dst_lim); 841 k1->ring->avail--; 842 } 843 kring->ring->cur = kring->nr_hwcur; // XXX 844 k1++; // XXX why? 845 } 846 out: 847 mtx_unlock(&kring->q_lock); 848 } 849 850 851 /* 852 * netmap_txsync_to_host() passes packets up. We are called from a 853 * system call in user process context, and the only contention 854 * can be among multiple user threads erroneously calling 855 * this routine concurrently. 856 */ 857 void 858 netmap_txsync_to_host(struct netmap_adapter *na) 859 { 860 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 861 struct netmap_ring *ring = kring->ring; 862 u_int k, lim = kring->nkr_num_slots - 1; 863 struct mbq q; 864 int error; 865 866 error = nm_kr_tryget(kring); 867 if (error) { 868 if (error == NM_KR_BUSY) 869 D("ring %p busy (user error)", kring); 870 return; 871 } 872 k = ring->cur; 873 if (k > lim) { 874 D("invalid ring index in stack TX kring %p", kring); 875 netmap_ring_reinit(kring); 876 nm_kr_put(kring); 877 return; 878 } 879 880 /* Take packets from hwcur to cur and pass them up. 881 * In case of no buffers we give up. At the end of the loop, 882 * the queue is drained in all cases. 883 */ 884 mbq_init(&q); 885 netmap_grab_packets(kring, &q, 1); 886 kring->nr_hwcur = k; 887 kring->nr_hwavail = ring->avail = lim; 888 889 nm_kr_put(kring); 890 netmap_send_up(na->ifp, &q); 891 } 892 893 894 /* 895 * rxsync backend for packets coming from the host stack. 896 * They have been put in the queue by netmap_transmit() so we 897 * need to protect access to the kring using a lock. 898 * 899 * This routine also does the selrecord if called from the poll handler 900 * (we know because td != NULL). 901 * 902 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 903 * as an additional hidden argument. 
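 *
 * From userspace this path is reached by binding the host rings, e.g.
 * (sketch, names from the API of this version):
 *
 *	req.nr_ringid = NETMAP_SW_RING;		// host stack rings only
 *	ioctl(fd, NIOCREGIF, &req);
 *	...
 *	ioctl(fd, NIOCRXSYNC, NULL);		// ends up in this function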
904 */ 905 static void 906 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 907 { 908 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 909 struct netmap_ring *ring = kring->ring; 910 u_int j, n, lim = kring->nkr_num_slots; 911 u_int k = ring->cur, resvd = ring->reserved; 912 913 (void)pwait; /* disable unused warnings */ 914 915 if (kring->nkr_stopped) /* check a first time without lock */ 916 return; 917 918 mtx_lock(&kring->q_lock); 919 920 if (kring->nkr_stopped) /* check again with lock held */ 921 goto unlock_out; 922 923 if (k >= lim) { 924 netmap_ring_reinit(kring); 925 goto unlock_out; 926 } 927 /* new packets are already set in nr_hwavail */ 928 /* skip past packets that userspace has released */ 929 j = kring->nr_hwcur; 930 if (resvd > 0) { 931 if (resvd + ring->avail >= lim + 1) { 932 D("XXX invalid reserve/avail %d %d", resvd, ring->avail); 933 ring->reserved = resvd = 0; // XXX panic... 934 } 935 k = (k >= resvd) ? k - resvd : k + lim - resvd; 936 } 937 if (j != k) { 938 n = k >= j ? k - j : k + lim - j; 939 kring->nr_hwavail -= n; 940 kring->nr_hwcur = k; 941 } 942 k = ring->avail = kring->nr_hwavail - resvd; 943 if (k == 0 && td) 944 selrecord(td, &kring->si); 945 if (k && (netmap_verbose & NM_VERB_HOST)) 946 D("%d pkts from stack", k); 947 unlock_out: 948 949 mtx_unlock(&kring->q_lock); 950 } 951 952 953 /* Get a netmap adapter for the port. 954 * 955 * If it is possible to satisfy the request, return 0 956 * with *na containing the netmap adapter found. 957 * Otherwise return an error code, with *na containing NULL. 958 * 959 * When the port is attached to a bridge, we always return 960 * EBUSY. 961 * Otherwise, if the port is already bound to a file descriptor, 962 * then we unconditionally return the existing adapter into *na. 963 * In all the other cases, we return (into *na) either native, 964 * generic or NULL, according to the following table: 965 * 966 * native_support 967 * active_fds dev.netmap.admode YES NO 968 * ------------------------------------------------------- 969 * >0 * NA(ifp) NA(ifp) 970 * 971 * 0 NETMAP_ADMODE_BEST NATIVE GENERIC 972 * 0 NETMAP_ADMODE_NATIVE NATIVE NULL 973 * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC 974 * 975 */ 976 977 int 978 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) 979 { 980 /* generic support */ 981 int i = netmap_admode; /* Take a snapshot. */ 982 int error = 0; 983 struct netmap_adapter *prev_na; 984 struct netmap_generic_adapter *gna; 985 986 *na = NULL; /* default */ 987 988 /* reset in case of invalid value */ 989 if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) 990 i = netmap_admode = NETMAP_ADMODE_BEST; 991 992 if (NETMAP_CAPABLE(ifp)) { 993 /* If an adapter already exists, but is 994 * attached to a vale port, we report that the 995 * port is busy. 996 */ 997 if (NETMAP_OWNED_BY_KERN(NA(ifp))) 998 return EBUSY; 999 1000 /* If an adapter already exists, return it if 1001 * there are active file descriptors or if 1002 * netmap is not forced to use generic 1003 * adapters. 1004 */ 1005 if (NA(ifp)->active_fds > 0 || 1006 i != NETMAP_ADMODE_GENERIC) { 1007 *na = NA(ifp); 1008 return 0; 1009 } 1010 } 1011 1012 /* If there isn't native support and netmap is not allowed 1013 * to use generic adapters, we cannot satisfy the request. 1014 */ 1015 if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) 1016 return EINVAL; 1017 1018 /* Otherwise, create a generic adapter and return it, 1019 * saving the previously used netmap adapter, if any. 
1020 * 1021 * Note that here 'prev_na', if not NULL, MUST be a 1022 * native adapter, and CANNOT be a generic one. This is 1023 * true because generic adapters are created on demand, and 1024 * destroyed when not used anymore. Therefore, if the adapter 1025 * currently attached to an interface 'ifp' is generic, it 1026 * must be that 1027 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). 1028 * Consequently, if NA(ifp) is generic, we will enter one of 1029 * the branches above. This ensures that we never override 1030 * a generic adapter with another generic adapter. 1031 */ 1032 prev_na = NA(ifp); 1033 error = generic_netmap_attach(ifp); 1034 if (error) 1035 return error; 1036 1037 *na = NA(ifp); 1038 gna = (struct netmap_generic_adapter*)NA(ifp); 1039 gna->prev = prev_na; /* save old na */ 1040 if (prev_na != NULL) { 1041 ifunit_ref(ifp->if_xname); 1042 // XXX add a refcount ? 1043 netmap_adapter_get(prev_na); 1044 } 1045 D("Created generic NA %p (prev %p)", gna, gna->prev); 1046 1047 return 0; 1048 } 1049 1050 1051 /* 1052 * MUST BE CALLED UNDER NMG_LOCK() 1053 * 1054 * get a refcounted reference to an interface. 1055 * This is always called in the execution of an ioctl(). 1056 * 1057 * Return ENXIO if the interface does not exist, EINVAL if netmap 1058 * is not supported by the interface. 1059 * If successful, hold a reference. 1060 * 1061 * When the NIC is attached to a bridge, reference is managed 1062 * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as 1063 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC 1064 * is detached from the bridge, then ifp's refcount is dropped (this 1065 * is equivalent to that ifp is destroyed in case of virtual ports. 1066 * 1067 * This function uses if_rele() when we want to prevent the NIC from 1068 * being detached from the bridge in error handling. But once refcount 1069 * is acquired by this function, it must be released using nm_if_rele(). 1070 */ 1071 int 1072 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) 1073 { 1074 struct ifnet *ifp; 1075 int error = 0; 1076 struct netmap_adapter *ret; 1077 1078 *na = NULL; /* default return value */ 1079 1080 /* first try to see if this is a bridge port. */ 1081 NMG_LOCK_ASSERT(); 1082 1083 error = netmap_get_bdg_na(nmr, na, create); 1084 if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */ 1085 return error; 1086 1087 ifp = ifunit_ref(nmr->nr_name); 1088 if (ifp == NULL) { 1089 return ENXIO; 1090 } 1091 1092 error = netmap_get_hw_na(ifp, &ret); 1093 if (error) 1094 goto out; 1095 1096 if (ret != NULL) { 1097 /* Users cannot use the NIC attached to a bridge directly */ 1098 if (NETMAP_OWNED_BY_KERN(ret)) { 1099 error = EINVAL; 1100 goto out; 1101 } 1102 error = 0; 1103 *na = ret; 1104 netmap_adapter_get(ret); 1105 } 1106 out: 1107 if_rele(ifp); 1108 1109 return error; 1110 } 1111 1112 1113 /* 1114 * validate parameters on entry for *_txsync() 1115 * Returns ring->cur if ok, or something >= kring->nkr_num_slots 1116 * in case of error. The extra argument is a pointer to 1117 * 'new_bufs'. XXX this may be deprecated at some point. 1118 * 1119 * Below is a correct configuration on input. ring->cur 1120 * must be in the region covered by kring->hwavail, 1121 * and ring->avail and kring->avail should end at the same slot. 
1122 * 1123 * +-hwcur 1124 * | 1125 * v<--hwres-->|<-----hwavail----> 1126 * ------+------------------------------+-------- ring 1127 * | 1128 * |<---avail---> 1129 * +--cur 1130 * 1131 */ 1132 u_int 1133 nm_txsync_prologue(struct netmap_kring *kring, u_int *new_slots) 1134 { 1135 struct netmap_ring *ring = kring->ring; 1136 u_int cur = ring->cur; /* read only once */ 1137 u_int avail = ring->avail; /* read only once */ 1138 u_int n = kring->nkr_num_slots; 1139 u_int kstart, kend, a; 1140 1141 #if 1 /* kernel sanity checks */ 1142 if (kring->nr_hwcur >= n || 1143 kring->nr_hwreserved >= n || kring->nr_hwavail >= n || 1144 kring->nr_hwreserved + kring->nr_hwavail >= n) 1145 goto error; 1146 #endif /* kernel sanity checks */ 1147 kstart = kring->nr_hwcur + kring->nr_hwreserved; 1148 if (kstart >= n) 1149 kstart -= n; 1150 kend = kstart + kring->nr_hwavail; 1151 /* user sanity checks. a is the expected avail */ 1152 if (cur < kstart) { 1153 /* too low, but maybe wraparound */ 1154 if (cur + n > kend) 1155 goto error; 1156 *new_slots = cur + n - kstart; 1157 a = kend - cur - n; 1158 } else { 1159 if (cur > kend) 1160 goto error; 1161 *new_slots = cur - kstart; 1162 a = kend - cur; 1163 } 1164 if (a != avail) { 1165 RD(5, "wrong but fixable avail have %d need %d", 1166 avail, a); 1167 ring->avail = avail = a; 1168 } 1169 return cur; 1170 1171 error: 1172 RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d", 1173 kring->nr_hwcur, 1174 kring->nr_hwreserved, kring->nr_hwavail, 1175 cur, avail); 1176 return n; 1177 } 1178 1179 1180 /* 1181 * validate parameters on entry for *_rxsync() 1182 * Returns ring->cur - ring->reserved if ok, 1183 * or something >= kring->nkr_num_slots 1184 * in case of error. The extra argument is a pointer to 1185 * 'resvd'. XXX this may be deprecated at some point. 1186 * 1187 * Below is a correct configuration on input. ring->cur and 1188 * ring->reserved must be in the region covered by kring->hwavail, 1189 * and ring->avail and kring->avail should end at the same slot. 1190 * 1191 * +-hwcur 1192 * | 1193 * v<-------hwavail----------> 1194 * ---------+--------------------------+-------- ring 1195 * |<--res-->| 1196 * |<---avail---> 1197 * +--cur 1198 * 1199 */ 1200 u_int 1201 nm_rxsync_prologue(struct netmap_kring *kring, u_int *resvd) 1202 { 1203 struct netmap_ring *ring = kring->ring; 1204 u_int cur = ring->cur; /* read only once */ 1205 u_int avail = ring->avail; /* read only once */ 1206 u_int res = ring->reserved; /* read only once */ 1207 u_int n = kring->nkr_num_slots; 1208 u_int kend = kring->nr_hwcur + kring->nr_hwavail; 1209 u_int a; 1210 1211 #if 1 /* kernel sanity checks */ 1212 if (kring->nr_hwcur >= n || kring->nr_hwavail >= n) 1213 goto error; 1214 #endif /* kernel sanity checks */ 1215 /* user sanity checks */ 1216 if (res >= n) 1217 goto error; 1218 /* check that cur is valid, a is the expected value of avail */ 1219 if (cur < kring->nr_hwcur) { 1220 /* too low, but maybe wraparound */ 1221 if (cur + n > kend) 1222 goto error; 1223 a = kend - (cur + n); 1224 } else { 1225 if (cur > kend) 1226 goto error; 1227 a = kend - cur; 1228 } 1229 if (a != avail) { 1230 RD(5, "wrong but fixable avail have %d need %d", 1231 avail, a); 1232 ring->avail = avail = a; 1233 } 1234 if (res != 0) { 1235 /* then repeat the check for cur + res */ 1236 cur = (cur >= res) ? 
cur - res : n + cur - res; 1237 if (cur < kring->nr_hwcur) { 1238 /* too low, but maybe wraparound */ 1239 if (cur + n > kend) 1240 goto error; 1241 } else if (cur > kend) { 1242 goto error; 1243 } 1244 } 1245 *resvd = res; 1246 return cur; 1247 1248 error: 1249 RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d res %d", 1250 kring->nr_hwcur, 1251 kring->nr_hwreserved, kring->nr_hwavail, 1252 ring->cur, avail, res); 1253 return n; 1254 } 1255 1256 /* 1257 * Error routine called when txsync/rxsync detects an error. 1258 * Can't do much more than resetting cur = hwcur, avail = hwavail. 1259 * Return 1 on reinit. 1260 * 1261 * This routine is only called by the upper half of the kernel. 1262 * It only reads hwcur (which is changed only by the upper half, too) 1263 * and hwavail (which may be changed by the lower half, but only on 1264 * a tx ring and only to increase it, so any error will be recovered 1265 * on the next call). For the above, we don't strictly need to call 1266 * it under lock. 1267 */ 1268 int 1269 netmap_ring_reinit(struct netmap_kring *kring) 1270 { 1271 struct netmap_ring *ring = kring->ring; 1272 u_int i, lim = kring->nkr_num_slots - 1; 1273 int errors = 0; 1274 1275 // XXX KASSERT nm_kr_tryget 1276 RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); 1277 if (ring->cur > lim) 1278 errors++; 1279 for (i = 0; i <= lim; i++) { 1280 u_int idx = ring->slot[i].buf_idx; 1281 u_int len = ring->slot[i].len; 1282 if (idx < 2 || idx >= netmap_total_buffers) { 1283 if (!errors++) 1284 D("bad buffer at slot %d idx %d len %d ", i, idx, len); 1285 ring->slot[i].buf_idx = 0; 1286 ring->slot[i].len = 0; 1287 } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { 1288 ring->slot[i].len = 0; 1289 if (!errors++) 1290 D("bad len %d at slot %d idx %d", 1291 len, i, idx); 1292 } 1293 } 1294 if (errors) { 1295 int pos = kring - kring->na->tx_rings; 1296 int n = kring->na->num_tx_rings + 1; 1297 1298 RD(10, "total %d errors", errors); 1299 errors++; 1300 RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", 1301 NM_IFPNAME(kring->na->ifp), 1302 pos < n ? "TX" : "RX", pos < n ? pos : pos - n, 1303 ring->cur, kring->nr_hwcur, 1304 ring->avail, kring->nr_hwavail); 1305 ring->cur = kring->nr_hwcur; 1306 ring->avail = kring->nr_hwavail; 1307 } 1308 return (errors ? 1 : 0); 1309 } 1310 1311 1312 /* 1313 * Set the ring ID. For devices with a single queue, a request 1314 * for all rings is the same as a single ring. 1315 */ 1316 static int 1317 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) 1318 { 1319 struct netmap_adapter *na = priv->np_na; 1320 struct ifnet *ifp = na->ifp; 1321 u_int i = ringid & NETMAP_RING_MASK; 1322 /* initially (np_qfirst == np_qlast) we don't want to lock */ 1323 u_int lim = na->num_rx_rings; 1324 1325 if (na->num_tx_rings > lim) 1326 lim = na->num_tx_rings; 1327 if ( (ringid & NETMAP_HW_RING) && i >= lim) { 1328 D("invalid ring id %d", i); 1329 return (EINVAL); 1330 } 1331 priv->np_ringid = ringid; 1332 if (ringid & NETMAP_SW_RING) { 1333 priv->np_qfirst = NETMAP_SW_RING; 1334 priv->np_qlast = 0; 1335 } else if (ringid & NETMAP_HW_RING) { 1336 priv->np_qfirst = i; 1337 priv->np_qlast = i + 1; 1338 } else { 1339 priv->np_qfirst = 0; 1340 priv->np_qlast = NETMAP_HW_RING ; 1341 } 1342 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 
0 : 1; 1343 if (netmap_verbose) { 1344 if (ringid & NETMAP_SW_RING) 1345 D("ringid %s set to SW RING", NM_IFPNAME(ifp)); 1346 else if (ringid & NETMAP_HW_RING) 1347 D("ringid %s set to HW RING %d", NM_IFPNAME(ifp), 1348 priv->np_qfirst); 1349 else 1350 D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim); 1351 } 1352 return 0; 1353 } 1354 1355 1356 /* 1357 * possibly move the interface to netmap-mode. 1358 * If success it returns a pointer to netmap_if, otherwise NULL. 1359 * This must be called with NMG_LOCK held. 1360 */ 1361 struct netmap_if * 1362 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, 1363 uint16_t ringid, int *err) 1364 { 1365 struct ifnet *ifp = na->ifp; 1366 struct netmap_if *nifp = NULL; 1367 int error, need_mem = 0; 1368 1369 NMG_LOCK_ASSERT(); 1370 /* ring configuration may have changed, fetch from the card */ 1371 netmap_update_config(na); 1372 priv->np_na = na; /* store the reference */ 1373 error = netmap_set_ringid(priv, ringid); 1374 if (error) 1375 goto out; 1376 /* ensure allocators are ready */ 1377 need_mem = !netmap_have_memory_locked(priv); 1378 if (need_mem) { 1379 error = netmap_get_memory_locked(priv); 1380 ND("get_memory returned %d", error); 1381 if (error) 1382 goto out; 1383 } 1384 nifp = netmap_if_new(NM_IFPNAME(ifp), na); 1385 if (nifp == NULL) { /* allocation failed */ 1386 /* we should drop the allocator, but only 1387 * if we were the ones who grabbed it 1388 */ 1389 error = ENOMEM; 1390 goto out; 1391 } 1392 na->active_fds++; 1393 if (ifp->if_capenable & IFCAP_NETMAP) { 1394 /* was already set */ 1395 } else { 1396 /* Otherwise set the card in netmap mode 1397 * and make it use the shared buffers. 1398 * 1399 * do not core lock because the race is harmless here, 1400 * there cannot be any traffic to netmap_transmit() 1401 */ 1402 na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; 1403 ND("%p->na_lut == %p", na, na->na_lut); 1404 na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; 1405 error = na->nm_register(na, 1); /* mode on */ 1406 if (error) { 1407 netmap_do_unregif(priv, nifp); 1408 nifp = NULL; 1409 } 1410 } 1411 out: 1412 *err = error; 1413 if (error) { 1414 priv->np_na = NULL; 1415 if (need_mem) 1416 netmap_drop_memory_locked(priv); 1417 } 1418 if (nifp != NULL) { 1419 /* 1420 * advertise that the interface is ready bt setting ni_nifp. 1421 * The barrier is needed because readers (poll and *SYNC) 1422 * check for priv->np_nifp != NULL without locking 1423 */ 1424 wmb(); /* make sure previous writes are visible to all CPUs */ 1425 priv->np_nifp = nifp; 1426 } 1427 return nifp; 1428 } 1429 1430 1431 1432 /* 1433 * ioctl(2) support for the "netmap" device. 1434 * 1435 * Following a list of accepted commands: 1436 * - NIOCGINFO 1437 * - SIOCGIFADDR just for convenience 1438 * - NIOCREGIF 1439 * - NIOCUNREGIF 1440 * - NIOCTXSYNC 1441 * - NIOCRXSYNC 1442 * 1443 * Return 0 on success, errno otherwise. 1444 */ 1445 int 1446 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 1447 int fflag, struct thread *td) 1448 { 1449 struct netmap_priv_d *priv = NULL; 1450 struct ifnet *ifp = NULL; 1451 struct nmreq *nmr = (struct nmreq *) data; 1452 struct netmap_adapter *na = NULL; 1453 int error; 1454 u_int i, lim; 1455 struct netmap_if *nifp; 1456 struct netmap_kring *krings; 1457 1458 (void)dev; /* UNUSED */ 1459 (void)fflag; /* UNUSED */ 1460 #ifdef linux 1461 #define devfs_get_cdevpriv(pp) \ 1462 ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ 1463 (*pp ? 
0 : ENOENT); }) 1464 1465 /* devfs_set_cdevpriv cannot fail on linux */ 1466 #define devfs_set_cdevpriv(p, fn) \ 1467 ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) 1468 1469 1470 #define devfs_clear_cdevpriv() do { \ 1471 netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ 1472 } while (0) 1473 #endif /* linux */ 1474 1475 CURVNET_SET(TD_TO_VNET(td)); 1476 1477 error = devfs_get_cdevpriv((void **)&priv); 1478 if (error) { 1479 CURVNET_RESTORE(); 1480 /* XXX ENOENT should be impossible, since the priv 1481 * is now created in the open */ 1482 return (error == ENOENT ? ENXIO : error); 1483 } 1484 1485 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ 1486 switch (cmd) { 1487 case NIOCGINFO: /* return capabilities etc */ 1488 if (nmr->nr_version != NETMAP_API) { 1489 D("API mismatch got %d have %d", 1490 nmr->nr_version, NETMAP_API); 1491 nmr->nr_version = NETMAP_API; 1492 error = EINVAL; 1493 break; 1494 } 1495 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 1496 error = netmap_bdg_ctl(nmr, NULL); 1497 break; 1498 } 1499 1500 NMG_LOCK(); 1501 do { 1502 /* memsize is always valid */ 1503 struct netmap_mem_d *nmd = &nm_mem; 1504 u_int memflags; 1505 1506 if (nmr->nr_name[0] != '\0') { 1507 /* get a refcount */ 1508 error = netmap_get_na(nmr, &na, 1 /* create */); 1509 if (error) 1510 break; 1511 nmd = na->nm_mem; /* get memory allocator */ 1512 } 1513 1514 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 1515 if (error) 1516 break; 1517 if (na == NULL) /* only memory info */ 1518 break; 1519 nmr->nr_offset = 0; 1520 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 1521 netmap_update_config(na); 1522 nmr->nr_rx_rings = na->num_rx_rings; 1523 nmr->nr_tx_rings = na->num_tx_rings; 1524 nmr->nr_rx_slots = na->num_rx_desc; 1525 nmr->nr_tx_slots = na->num_tx_desc; 1526 if (memflags & NETMAP_MEM_PRIVATE) 1527 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1528 netmap_adapter_put(na); 1529 } while (0); 1530 NMG_UNLOCK(); 1531 break; 1532 1533 case NIOCREGIF: 1534 if (nmr->nr_version != NETMAP_API) { 1535 nmr->nr_version = NETMAP_API; 1536 error = EINVAL; 1537 break; 1538 } 1539 /* possibly attach/detach NIC and VALE switch */ 1540 i = nmr->nr_cmd; 1541 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH 1542 || i == NETMAP_BDG_OFFSET) { 1543 error = netmap_bdg_ctl(nmr, NULL); 1544 break; 1545 } else if (i != 0) { 1546 D("nr_cmd must be 0 not %d", i); 1547 error = EINVAL; 1548 break; 1549 } 1550 1551 /* protect access to priv from concurrent NIOCREGIF */ 1552 NMG_LOCK(); 1553 do { 1554 u_int memflags; 1555 1556 if (priv->np_na != NULL) { /* thread already registered */ 1557 error = netmap_set_ringid(priv, nmr->nr_ringid); 1558 break; 1559 } 1560 /* find the interface and a reference */ 1561 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 1562 if (error) 1563 break; 1564 ifp = na->ifp; 1565 if (NETMAP_OWNED_BY_KERN(na)) { 1566 netmap_adapter_put(na); 1567 error = EBUSY; 1568 break; 1569 } 1570 nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error); 1571 if (!nifp) { /* reg. 
failed, release priv and ref */ 1572 netmap_adapter_put(na); 1573 priv->np_nifp = NULL; 1574 break; 1575 } 1576 1577 /* return the offset of the netmap_if object */ 1578 nmr->nr_rx_rings = na->num_rx_rings; 1579 nmr->nr_tx_rings = na->num_tx_rings; 1580 nmr->nr_rx_slots = na->num_rx_desc; 1581 nmr->nr_tx_slots = na->num_tx_desc; 1582 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 1583 if (error) { 1584 netmap_adapter_put(na); 1585 break; 1586 } 1587 if (memflags & NETMAP_MEM_PRIVATE) { 1588 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1589 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 1590 } 1591 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 1592 } while (0); 1593 NMG_UNLOCK(); 1594 break; 1595 1596 case NIOCUNREGIF: 1597 // XXX we have no data here ? 1598 D("deprecated, data is %p", nmr); 1599 error = EINVAL; 1600 break; 1601 1602 case NIOCTXSYNC: 1603 case NIOCRXSYNC: 1604 nifp = priv->np_nifp; 1605 1606 if (nifp == NULL) { 1607 error = ENXIO; 1608 break; 1609 } 1610 rmb(); /* make sure following reads are not from cache */ 1611 1612 na = priv->np_na; /* we have a reference */ 1613 1614 if (na == NULL) { 1615 D("Internal error: nifp != NULL && na == NULL"); 1616 error = ENXIO; 1617 break; 1618 } 1619 1620 ifp = na->ifp; 1621 if (ifp == NULL) { 1622 RD(1, "the ifp is gone"); 1623 error = ENXIO; 1624 break; 1625 } 1626 1627 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 1628 if (cmd == NIOCTXSYNC) 1629 netmap_txsync_to_host(na); 1630 else 1631 netmap_rxsync_from_host(na, NULL, NULL); 1632 break; 1633 } 1634 /* find the last ring to scan */ 1635 lim = priv->np_qlast; 1636 if (lim == NETMAP_HW_RING) 1637 lim = (cmd == NIOCTXSYNC) ? 1638 na->num_tx_rings : na->num_rx_rings; 1639 1640 krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; 1641 for (i = priv->np_qfirst; i < lim; i++) { 1642 struct netmap_kring *kring = krings + i; 1643 if (nm_kr_tryget(kring)) { 1644 error = EBUSY; 1645 goto out; 1646 } 1647 if (cmd == NIOCTXSYNC) { 1648 if (netmap_verbose & NM_VERB_TXSYNC) 1649 D("pre txsync ring %d cur %d hwcur %d", 1650 i, kring->ring->cur, 1651 kring->nr_hwcur); 1652 na->nm_txsync(na, i, NAF_FORCE_RECLAIM); 1653 if (netmap_verbose & NM_VERB_TXSYNC) 1654 D("post txsync ring %d cur %d hwcur %d", 1655 i, kring->ring->cur, 1656 kring->nr_hwcur); 1657 } else { 1658 na->nm_rxsync(na, i, NAF_FORCE_READ); 1659 microtime(&na->rx_rings[i].ring->ts); 1660 } 1661 nm_kr_put(kring); 1662 } 1663 1664 break; 1665 1666 #ifdef __FreeBSD__ 1667 case BIOCIMMEDIATE: 1668 case BIOCGHDRCMPLT: 1669 case BIOCSHDRCMPLT: 1670 case BIOCSSEESENT: 1671 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 1672 break; 1673 1674 default: /* allow device-specific ioctls */ 1675 { 1676 struct socket so; 1677 1678 bzero(&so, sizeof(so)); 1679 NMG_LOCK(); 1680 error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ 1681 if (error) { 1682 netmap_adapter_put(na); 1683 NMG_UNLOCK(); 1684 break; 1685 } 1686 ifp = na->ifp; 1687 so.so_vnet = ifp->if_vnet; 1688 // so->so_proto not null. 1689 error = ifioctl(&so, cmd, data, td); 1690 netmap_adapter_put(na); 1691 NMG_UNLOCK(); 1692 break; 1693 } 1694 1695 #else /* linux */ 1696 default: 1697 error = EOPNOTSUPP; 1698 #endif /* linux */ 1699 } 1700 out: 1701 1702 CURVNET_RESTORE(); 1703 return (error); 1704 } 1705 1706 1707 /* 1708 * select(2) and poll(2) handlers for the "netmap" device. 1709 * 1710 * Can be called for one or more queues. 1711 * Return true the event mask corresponding to ready events. 
1712 * If there are no ready events, do a selrecord on either individual 1713 * selinfo or on the global one. 1714 * Device-dependent parts (locking and sync of tx/rx rings) 1715 * are done through callbacks. 1716 * 1717 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 1718 * The first one is remapped to pwait as selrecord() uses the name as an 1719 * hidden argument. 1720 */ 1721 int 1722 netmap_poll(struct cdev *dev, int events, struct thread *td) 1723 { 1724 struct netmap_priv_d *priv = NULL; 1725 struct netmap_adapter *na; 1726 struct ifnet *ifp; 1727 struct netmap_kring *kring; 1728 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 1729 u_int lim_tx, lim_rx, host_forwarded = 0; 1730 struct mbq q; 1731 void *pwait = dev; /* linux compatibility */ 1732 1733 /* 1734 * In order to avoid nested locks, we need to "double check" 1735 * txsync and rxsync if we decide to do a selrecord(). 1736 * retry_tx (and retry_rx, later) prevent looping forever. 1737 */ 1738 int retry_tx = 1; 1739 1740 (void)pwait; 1741 mbq_init(&q); 1742 1743 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) 1744 return POLLERR; 1745 1746 if (priv->np_nifp == NULL) { 1747 D("No if registered"); 1748 return POLLERR; 1749 } 1750 rmb(); /* make sure following reads are not from cache */ 1751 1752 na = priv->np_na; 1753 ifp = na->ifp; 1754 // check for deleted 1755 if (ifp == NULL) { 1756 RD(1, "the ifp is gone"); 1757 return POLLERR; 1758 } 1759 1760 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 1761 return POLLERR; 1762 1763 if (netmap_verbose & 0x8000) 1764 D("device %s events 0x%x", NM_IFPNAME(ifp), events); 1765 want_tx = events & (POLLOUT | POLLWRNORM); 1766 want_rx = events & (POLLIN | POLLRDNORM); 1767 1768 lim_tx = na->num_tx_rings; 1769 lim_rx = na->num_rx_rings; 1770 1771 if (priv->np_qfirst == NETMAP_SW_RING) { 1772 /* handle the host stack ring */ 1773 if (priv->np_txpoll || want_tx) { 1774 /* push any packets up, then we are always ready */ 1775 netmap_txsync_to_host(na); 1776 revents |= want_tx; 1777 } 1778 if (want_rx) { 1779 kring = &na->rx_rings[lim_rx]; 1780 if (kring->ring->avail == 0) 1781 netmap_rxsync_from_host(na, td, dev); 1782 if (kring->ring->avail > 0) { 1783 revents |= want_rx; 1784 } 1785 } 1786 return (revents); 1787 } 1788 1789 /* 1790 * If we are in transparent mode, check also the host rx ring 1791 * XXX Transparent mode at the moment requires to bind all 1792 * rings to a single file descriptor. 1793 */ 1794 kring = &na->rx_rings[lim_rx]; 1795 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 1796 && want_rx 1797 && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { 1798 if (kring->ring->avail == 0) 1799 netmap_rxsync_from_host(na, td, dev); 1800 if (kring->ring->avail > 0) 1801 revents |= want_rx; 1802 } 1803 1804 /* 1805 * check_all_{tx|rx} are set if the card has more than one queue AND 1806 * the file descriptor is bound to all of them. If so, we sleep on 1807 * the "global" selinfo, otherwise we sleep on individual selinfo 1808 * (FreeBSD only allows two selinfo's per file descriptor). 1809 * The interrupt routine in the driver wake one or the other 1810 * (or both) depending on which clients are active. 1811 * 1812 * rxsync() is only called if we run out of buffers on a POLLIN. 1813 * txsync() is called if we run out of buffers on POLLOUT, or 1814 * there are pending packets to send. The latter can be disabled 1815 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 
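	 *
	 * For example, a receive-only application would typically register
	 * with (sketch):
	 *
	 *	req.nr_ringid = NETMAP_NO_TX_POLL;	// all hw rings, no tx work
	 *	ioctl(fd, NIOCREGIF, &req);
	 *
	 * and then poll() with POLLIN only, so no txsync is run on its
	 * behalf; a forwarding application polls with POLLIN | POLLOUT.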
1816 */ 1817 check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); 1818 check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); 1819 1820 if (priv->np_qlast != NETMAP_HW_RING) { 1821 lim_tx = lim_rx = priv->np_qlast; 1822 } 1823 1824 /* 1825 * We start with a lock free round which is cheap if we have 1826 * slots available. If this fails, then lock and call the sync 1827 * routines. 1828 * XXX rather than ring->avail >0 should check that 1829 * ring->cur has not reached hwcur+hwavail 1830 */ 1831 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 1832 kring = &na->rx_rings[i]; 1833 if (kring->ring->avail > 0) { 1834 revents |= want_rx; 1835 want_rx = 0; /* also breaks the loop */ 1836 } 1837 } 1838 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 1839 kring = &na->tx_rings[i]; 1840 if (kring->ring->avail > 0) { 1841 revents |= want_tx; 1842 want_tx = 0; /* also breaks the loop */ 1843 } 1844 } 1845 1846 /* 1847 * If we to push packets out (priv->np_txpoll) or want_tx is 1848 * still set, we do need to run the txsync calls (on all rings, 1849 * to avoid that the tx rings stall). 1850 * XXX should also check cur != hwcur on the tx rings. 1851 * Fortunately, normal tx mode has np_txpoll set. 1852 */ 1853 if (priv->np_txpoll || want_tx) { 1854 /* If we really want to be woken up (want_tx), 1855 * do a selrecord, either on the global or on 1856 * the private structure. Then issue the txsync 1857 * so there is no race in the selrecord/selwait 1858 */ 1859 flush_tx: 1860 for (i = priv->np_qfirst; i < lim_tx; i++) { 1861 kring = &na->tx_rings[i]; 1862 /* 1863 * Skip this ring if want_tx == 0 1864 * (we have already done a successful sync on 1865 * a previous ring) AND kring->cur == kring->hwcur 1866 * (there are no pending transmissions for this ring). 1867 */ 1868 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 1869 continue; 1870 /* make sure only one user thread is doing this */ 1871 if (nm_kr_tryget(kring)) { 1872 ND("ring %p busy is %d", 1873 kring, (int)kring->nr_busy); 1874 revents |= POLLERR; 1875 goto out; 1876 } 1877 1878 if (netmap_verbose & NM_VERB_TXSYNC) 1879 D("send %d on %s %d", 1880 kring->ring->cur, NM_IFPNAME(ifp), i); 1881 if (na->nm_txsync(na, i, 0)) 1882 revents |= POLLERR; 1883 1884 /* Check avail and call selrecord only if 1885 * called with POLLOUT and run out of bufs. 1886 * XXX Note, we cannot trust much ring->avail 1887 * as it is exposed to userspace (even though 1888 * just updated by txsync). We should really 1889 * check kring->nr_hwavail or better have 1890 * txsync set a flag telling if we need 1891 * to do a selrecord(). 1892 */ 1893 if (want_tx) { 1894 if (kring->ring->avail > 0) { 1895 /* stop at the first ring. We don't risk 1896 * starvation. 1897 */ 1898 revents |= want_tx; 1899 want_tx = 0; 1900 } 1901 } 1902 nm_kr_put(kring); 1903 } 1904 if (want_tx && retry_tx) { 1905 selrecord(td, check_all_tx ? 1906 &na->tx_si : &na->tx_rings[priv->np_qfirst].si); 1907 retry_tx = 0; 1908 goto flush_tx; 1909 } 1910 } 1911 1912 /* 1913 * now if want_rx is still set we need to lock and rxsync. 1914 * Do it on all rings because otherwise we starve. 
         */
        if (want_rx) {
                int retry_rx = 1;
do_retry_rx:
                for (i = priv->np_qfirst; i < lim_rx; i++) {
                        kring = &na->rx_rings[i];

                        if (nm_kr_tryget(kring)) {
                                revents |= POLLERR;
                                goto out;
                        }

                        /* XXX NR_FORWARD should only be read on
                         * physical or NIC ports
                         */
                        if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
                                ND(10, "forwarding some buffers up %d to %d",
                                    kring->nr_hwcur, kring->ring->cur);
                                netmap_grab_packets(kring, &q, netmap_fwd);
                        }

                        if (na->nm_rxsync(na, i, 0))
                                revents |= POLLERR;
                        if (netmap_no_timestamp == 0 ||
                            kring->ring->flags & NR_TIMESTAMP) {
                                microtime(&kring->ring->ts);
                        }

                        if (kring->ring->avail > 0) {
                                revents |= want_rx;
                                retry_rx = 0;
                        }
                        nm_kr_put(kring);
                }
                if (retry_rx) {
                        retry_rx = 0;
                        selrecord(td, check_all_rx ?
                            &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
                        goto do_retry_rx;
                }
        }

        /* forward host to the netmap ring.
         * I am accessing nr_hwavail without lock, but netmap_transmit
         * can only increment it, so the operation is safe.
         */
        kring = &na->rx_rings[lim_rx];
        if ((priv->np_qlast == NETMAP_HW_RING) // XXX check_all
            && (netmap_fwd || kring->ring->flags & NR_FORWARD)
            && kring->nr_hwavail > 0 && !host_forwarded) {
                netmap_sw_to_nic(na);
                host_forwarded = 1; /* prevent another pass */
                want_rx = 0;
                goto flush_tx;
        }

        if (q.head)
                netmap_send_up(na->ifp, &q);

out:

        return (revents);
}

/*------- driver support routines ------*/

static int netmap_hw_krings_create(struct netmap_adapter *);

static int
netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags)
{
        struct netmap_kring *kring;

        if (tx == NR_TX) {
                kring = na->tx_rings + n_ring;
                selwakeuppri(&kring->si, PI_NET);
                if (flags & NAF_GLOBAL_NOTIFY)
                        selwakeuppri(&na->tx_si, PI_NET);
        } else {
                kring = na->rx_rings + n_ring;
                selwakeuppri(&kring->si, PI_NET);
                if (flags & NAF_GLOBAL_NOTIFY)
                        selwakeuppri(&na->rx_si, PI_NET);
        }
        return 0;
}


// XXX check handling of failures
int
netmap_attach_common(struct netmap_adapter *na)
{
        struct ifnet *ifp = na->ifp;

        if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
                D("%s: invalid rings tx %d rx %d",
                    ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
                return EINVAL;
        }
        WNA(ifp) = na;
        NETMAP_SET_CAPABLE(ifp);
        if (na->nm_krings_create == NULL) {
                na->nm_krings_create = netmap_hw_krings_create;
                na->nm_krings_delete = netmap_krings_delete;
        }
        if (na->nm_notify == NULL)
                na->nm_notify = netmap_notify;
        na->active_fds = 0;

        if (na->nm_mem == NULL)
                na->nm_mem = &nm_mem;
        return 0;
}


void
netmap_detach_common(struct netmap_adapter *na)
{
        if (na->ifp)
                WNA(na->ifp) = NULL; /* XXX do we need this? */

        if (na->tx_rings) { /* XXX should not happen */
                D("freeing leftover tx_rings");
                na->nm_krings_delete(na);
        }
        if (na->na_flags & NAF_MEM_OWNER)
                netmap_mem_private_delete(na->nm_mem);
        bzero(na, sizeof(*na));
        free(na, M_DEVBUF);
}


/*
 * Initialize a ``netmap_adapter`` object created by driver on attach.
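 * (Typical driver-side usage, as a sketch: the attach routine fills a
 * zeroed struct netmap_adapter with at least ifp, num_tx_rings,
 * num_rx_rings and the nm_txsync/nm_rxsync callbacks, then passes its
 * address here; fields left NULL, e.g. nm_krings_create, nm_notify or
 * nm_mem, get defaults in netmap_attach_common() above.)
 *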
 * We allocate a block of memory with room for a struct netmap_adapter
 * plus two sets of N+2 struct netmap_kring (where N is the number
 * of hardware rings):
 * krings 0..N-1 are for the hardware queues.
 * kring N is for the host stack queue
 * kring N+1 is only used for the selinfo for all queues.
 * Return 0 on success, ENOMEM otherwise.
 *
 * By default the receive and transmit adapter ring counts are both
 * initialized to num_queues. na->num_tx_rings can be set for cards with
 * different tx/rx setups.
 */
int
netmap_attach(struct netmap_adapter *arg)
{
        struct netmap_hw_adapter *hwna = NULL;
        // XXX when is arg == NULL ?
        struct ifnet *ifp = arg ? arg->ifp : NULL;

        if (arg == NULL || ifp == NULL)
                goto fail;
        hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
        if (hwna == NULL)
                goto fail;
        hwna->up = *arg;
        if (netmap_attach_common(&hwna->up)) {
                free(hwna, M_DEVBUF);
                goto fail;
        }
        netmap_adapter_get(&hwna->up);

#ifdef linux
        if (ifp->netdev_ops) {
                /* prepare a clone of the netdev ops */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
                hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
#else
                hwna->nm_ndo = *ifp->netdev_ops;
#endif
        }
        hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
#endif /* linux */

        D("success for %s", NM_IFPNAME(ifp));
        return 0;

fail:
        D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
        netmap_detach(ifp);
        return (hwna ? EINVAL : ENOMEM);
}


void
NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
{
        if (!na) {
                return;
        }

        refcount_acquire(&na->na_refcount);
}


/* returns 1 iff the netmap_adapter is destroyed */
int
NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
{
        if (!na)
                return 1;

        if (!refcount_release(&na->na_refcount))
                return 0;

        if (na->nm_dtor)
                na->nm_dtor(na);

        netmap_detach_common(na);

        return 1;
}


int
netmap_hw_krings_create(struct netmap_adapter *na)
{
        return netmap_krings_create(na,
            na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
}


/*
 * Free the allocated memory linked to the given ``netmap_adapter``
 * object.
 */
void
netmap_detach(struct ifnet *ifp)
{
        struct netmap_adapter *na = NA(ifp);

        if (!na)
                return;

        NMG_LOCK();
        netmap_disable_all_rings(ifp);
        netmap_adapter_put(na);
        na->ifp = NULL;
        netmap_enable_all_rings(ifp);
        NMG_UNLOCK();
}


/*
 * Intercept packets from the network stack and pass them
 * to netmap as incoming packets on the 'software' ring.
 * We rely on the OS to make sure that the ifp and na do not go
 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
 * In nm_register() or whenever there is a reinitialization,
 * we make sure to make the mode change visible here.
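 *
 * Illustrative flow (a sketch, assuming the netmap(4) userspace API):
 * the mbuf payload is copied into the next free slot of the host rx
 * kring (index na->num_rx_rings); a process that bound the host ring
 * (NETMAP_SW_RING) sees the packet after its next rxsync, i.e. a
 * poll(POLLIN) or NIOCRXSYNC on that file descriptor.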
 */
int
netmap_transmit(struct ifnet *ifp, struct mbuf *m)
{
        struct netmap_adapter *na = NA(ifp);
        struct netmap_kring *kring;
        u_int i, len = MBUF_LEN(m);
        u_int error = EBUSY, lim;
        struct netmap_slot *slot;

        // XXX [Linux] we do not need this lock
        // if we follow the down/configure/up protocol -gl
        // mtx_lock(&na->core_lock);
        if ((ifp->if_capenable & IFCAP_NETMAP) == 0) {
                /* interface not in netmap mode anymore */
                error = ENXIO;
                goto done;
        }

        kring = &na->rx_rings[na->num_rx_rings];
        lim = kring->nkr_num_slots - 1;
        if (netmap_verbose & NM_VERB_HOST)
                D("%s packet %d len %d from the stack", NM_IFPNAME(ifp),
                    kring->nr_hwcur + kring->nr_hwavail, len);
        // XXX reconsider long packets if we handle fragments
        if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
                D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
                    len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
                goto done;
        }
        /* protect against other instances of netmap_transmit,
         * and userspace invocations of rxsync().
         */
        // XXX [Linux] there can be no other instances of netmap_transmit
        // on this same ring, but we still need this lock to protect
        // concurrent access from netmap_sw_to_nic() -gl
        mtx_lock(&kring->q_lock);
        if (kring->nr_hwavail >= lim) {
                if (netmap_verbose)
                        D("stack ring %s full\n", NM_IFPNAME(ifp));
        } else {
                /* compute the insert position */
                i = nm_kr_rxpos(kring);
                slot = &kring->ring->slot[i];
                m_copydata(m, 0, (int)len, BDG_NMB(na, slot));
                slot->len = len;
                slot->flags = kring->nkr_slot_flags;
                kring->nr_hwavail++;
                if (netmap_verbose & NM_VERB_HOST)
                        D("wake up host ring %s %d",
                            NM_IFPNAME(na->ifp), na->num_rx_rings);
                na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
                error = 0;
        }
        mtx_unlock(&kring->q_lock);

done:
        // mtx_unlock(&na->core_lock);

        /* release the mbuf in both cases, success or failure. As an
         * alternative, put the mbuf in a free list and free the list
         * only when really necessary.
         */
        m_freem(m);

        return (error);
}


/*
 * netmap_reset() is called by the driver routines when reinitializing
 * a ring. The driver is in charge of locking to protect the kring.
 * If native netmap mode is not set just return NULL.
 */
struct netmap_slot *
netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
        u_int new_cur)
{
        struct netmap_kring *kring;
        int new_hwofs, lim;

        if (na == NULL) {
                D("NULL na, should not happen");
                return NULL;    /* no netmap support here */
        }
        if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
                ND("interface not in netmap mode");
                return NULL;    /* nothing to reinitialize */
        }

        /* XXX note: in the new scheme, we are not guaranteed to be
         * under lock (e.g. when called on a device reset).
         * In this case, we should set a flag and not trust the
         * values too much. In practice: TODO
         * - set a RESET flag somewhere in the kring
         * - do the processing in a conservative way
         * - let the *sync() fix things up at the end.
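         *
         * Illustrative caller-side sketch (hypothetical driver code,
         * not part of this file): when reinitializing rx ring
         * 'ring_nr' a driver typically does
         *
         *      slot = netmap_reset(na, NR_RX, ring_nr, 0);
         *      if (slot != NULL) {
         *              // netmap mode: program the NIC descriptors
         *              // with the netmap buffers behind these slots
         *      }
         *
         * and falls back to its normal buffers when NULL is returned.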
         */
        if (tx == NR_TX) {
                if (n >= na->num_tx_rings)
                        return NULL;
                kring = na->tx_rings + n;
                new_hwofs = kring->nr_hwcur - new_cur;
        } else {
                if (n >= na->num_rx_rings)
                        return NULL;
                kring = na->rx_rings + n;
                new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
        }
        lim = kring->nkr_num_slots - 1;
        if (new_hwofs > lim)
                new_hwofs -= lim + 1;

        /* Always set the new offset value and realign the ring. */
        D("%s hwofs %d -> %d, hwavail %d -> %d",
            tx == NR_TX ? "TX" : "RX",
            kring->nkr_hwofs, new_hwofs,
            kring->nr_hwavail,
            tx == NR_TX ? lim : kring->nr_hwavail);
        kring->nkr_hwofs = new_hwofs;
        if (tx == NR_TX)
                kring->nr_hwavail = lim;
        kring->nr_hwreserved = 0;

#if 0 // def linux
        /* XXX check that the mappings are correct */
        /* need ring_nr, adapter->pdev, direction */
        buffer_info->dma = dma_map_single(&pdev->dev, addr,
            adapter->rx_buffer_len, DMA_FROM_DEVICE);
        if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
                D("error mapping rx netmap buffer %d", i);
                // XXX fix error handling
        }

#endif /* linux */
        /*
         * Wakeup on the individual and global selwait
         * We do the wakeup here, but the ring is not yet reconfigured.
         * However, we are under lock so there are no races.
         */
        na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
        return kring->ring->slot;
}


/*
 * Dispatch rx/tx interrupts to the netmap rings.
 *
 * "work_done" is non-null on the RX path, NULL for the TX path.
 * We rely on the OS to make sure that there is only one active
 * instance per queue, and that there is appropriate locking.
 *
 * The 'notify' routine depends on what the ring is attached to.
 * - for a netmap file descriptor, do a selwakeup on the individual
 *   waitqueue, plus one on the global one if needed
 * - for a switch, call the proper forwarding routine
 * - XXX more ?
 */
void
netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
        struct netmap_adapter *na = NA(ifp);
        struct netmap_kring *kring;

        q &= NETMAP_RING_MASK;

        if (netmap_verbose) {
                RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
        }

        if (work_done) { /* RX path */
                if (q >= na->num_rx_rings)
                        return; // not a physical queue
                kring = na->rx_rings + q;
                kring->nr_kflags |= NKR_PENDINTR;       // XXX atomic ?
                na->nm_notify(na, q, NR_RX,
                    (na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
                *work_done = 1; /* do not fire napi again */
        } else { /* TX path */
                if (q >= na->num_tx_rings)
                        return; // not a physical queue
                kring = na->tx_rings + q;
                na->nm_notify(na, q, NR_TX,
                    (na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
        }
}

/*
 * Default functions to handle rx/tx interrupts from a physical device.
 * "work_done" is non-null on the RX path, NULL for the TX path.
 *
 * If the card is not in netmap mode, simply return 0,
 * so that the caller proceeds with regular processing.
 * Otherwise call netmap_common_irq() and return 1.
 *
 * If the card is connected to a netmap file descriptor,
 * do a selwakeup on the individual queue, plus one on the global one
 * if needed (multiqueue card _and_ there are multiqueue listeners),
 * and return 1.
 *
 * Finally, if called on rx from an interface connected to a switch,
 * it calls the proper forwarding routine and returns 1.
 */
int
netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
        // XXX could we check NAF_NATIVE_ON ?
        if (!(ifp->if_capenable & IFCAP_NETMAP))
                return 0;

        if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
                ND("use regular interrupt");
                return 0;
        }

        netmap_common_irq(ifp, q, work_done);
        return 1;
}


/*
 * Module loader and unloader
 *
 * netmap_init() creates the /dev/netmap device and initializes
 * all global variables. Returns 0 on success, errno on failure
 * (failure is not expected in practice).
 *
 * netmap_fini() destroys everything.
 */

static struct cdev *netmap_dev; /* /dev/netmap character device. */
extern struct cdevsw netmap_cdevsw;

void
netmap_fini(void)
{
        // XXX destroy_bridges() ?
        if (netmap_dev)
                destroy_dev(netmap_dev);
        netmap_mem_fini();
        NMG_LOCK_DESTROY();
        printf("netmap: unloaded module.\n");
}

int
netmap_init(void)
{
        int error;

        NMG_LOCK_INIT();

        error = netmap_mem_init();
        if (error != 0)
                goto fail;
        /* XXX could use make_dev_credv() to get error number */
        netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
            "netmap");
        if (!netmap_dev)
                goto fail;

        netmap_init_bridges();
        printf("netmap: loaded module\n");
        return (0);
fail:
        netmap_fini();
        return (EINVAL); /* may be incorrect */
}
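
/*
 * Illustrative (hypothetical) driver-side use of netmap_rx_irq(), not
 * part of this module: in a NIC rx interrupt or NAPI handler a driver
 * typically does something like
 *
 *      u_int work_done = 0;    // rx path: pass a non-NULL pointer
 *
 *      if (netmap_rx_irq(ifp, ring_nr, &work_done))
 *              return;         // netmap consumed the event, skip the
 *                              // regular rx processing for this queue
 *
 * On the tx path the driver passes work_done == NULL, as described in
 * the comment above netmap_rx_irq().
 */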