/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * $FreeBSD$
 *
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    a select()able file descriptor on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *    (A hedged userspace sketch of steps 1-6 follows this comment block.)
 *

		SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.

		LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(), this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */
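/*
 * The following is a hedged, minimal userspace sketch of steps 1-6
 * described above (open, NIOCREGIF, mmap, then poll plus the ring
 * accessors from net/netmap_user.h). It is illustrative only and is
 * kept inside a comment so it is not compiled with this kernel module;
 * error handling is omitted and the interface name "em0" is just an
 * example.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <poll.h>
 *	#include <string.h>
 *	#include <net/netmap.h>
 *	#include <net/netmap_user.h>
 *
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	struct nmreq nmr;
 *	bzero(&nmr, sizeof(nmr));
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_version = NETMAP_API;
 *	ioctl(fd, NIOCREGIF, &nmr);				// step 2
 *	void *mem = mmap(NULL, nmr.nr_memsize,
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);		// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	struct netmap_ring *txring = NETMAP_TXRING(nifp, 0);	// step 4
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	for (;;) {
 *		poll(&pfd, 1, -1);				// step 6
 *		while (!nm_ring_empty(txring)) {
 *			uint32_t i = txring->cur;
 *			char *buf = NETMAP_BUF(txring, txring->slot[i].buf_idx);
 *			// ... fill buf and set txring->slot[i].len here ...
 *			txring->head = txring->cur = nm_ring_next(txring, i);
 *		}
 *		ioctl(fd, NIOCTXSYNC, NULL);			// step 5
 *	}
 */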
/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h>		/* prerequisite */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>		/* defines used in kernel.h */
#include <sys/kernel.h>		/* types used in module initialization */
#include <sys/conf.h>		/* cdevsw struct, UID, GID */
#include <sys/filio.h>		/* FIONBIO */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h>		/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <sys/jail.h>
#include <net/vnet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


/* reduce conditional code */
// linux API, use for the knlist in FreeBSD
#define init_waitqueue_head(x)	knlist_init_mtx(&(x)->si_note, NULL)

void freebsd_selwakeup(struct selinfo *si, int pri);
#define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)

#elif defined(linux)

#include "bsd_glue.h"



#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>


MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");

/*
 * The following variables are used by the drivers and replicate
 * fields in the global memory pool. They only refer to buffers
 * used by physical interfaces.
 */
u_int netmap_total_buffers;
u_int netmap_buf_size;
char *netmap_buffer_base;	/* also address of an invalid buffer */

/* user-controlled variables */
int netmap_verbose;

static int netmap_no_timestamp; /* don't timestamp on rxsync */

SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
    CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
    CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
int netmap_mitigate = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
int netmap_no_pendintr = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
    CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
int netmap_txsync_retry = 2;
SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
    &netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");

int netmap_flags = 0;	/* debug flags */
int netmap_fwd = 0;	/* force transparent mode */
int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */

/*
 * netmap_admode selects the netmap mode to use.
 * Invalid values are reset to NETMAP_ADMODE_BEST
 */
enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
	NETMAP_ADMODE_NATIVE,	/* either native or none */
	NETMAP_ADMODE_GENERIC,	/* force generic */
	NETMAP_ADMODE_LAST };
static int netmap_admode = NETMAP_ADMODE_BEST;

int netmap_generic_mit = 100*1000;	/* Generic mitigation interval in nanoseconds. */
int netmap_generic_ringsize = 1024;	/* Generic ringsize. */
int netmap_generic_rings = 1;		/* number of queues in generic. */

SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0, "");

NMG_LOCK_T	netmap_global_lock;


static void
nm_kr_get(struct netmap_kring *kr)
{
	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
		tsleep(kr, 0, "NM_KR_GET", 4);
}


/*
 * mark the ring as stopped, and run through the locks
 * to make sure other users get to see it.
 */
void
netmap_disable_ring(struct netmap_kring *kr)
{
	kr->nkr_stopped = 1;
	nm_kr_get(kr);
	mtx_lock(&kr->q_lock);
	mtx_unlock(&kr->q_lock);
	nm_kr_put(kr);
}


static void
netmap_set_all_rings(struct ifnet *ifp, int stopped)
{
	struct netmap_adapter *na;
	int i;
	u_int ntx, nrx;

	if (!(ifp->if_capenable & IFCAP_NETMAP))
		return;

	na = NA(ifp);

	ntx = netmap_real_tx_rings(na);
	nrx = netmap_real_rx_rings(na);

	for (i = 0; i < ntx; i++) {
		if (stopped)
			netmap_disable_ring(na->tx_rings + i);
		else
			na->tx_rings[i].nkr_stopped = 0;
		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY);
	}

	for (i = 0; i < nrx; i++) {
		if (stopped)
			netmap_disable_ring(na->rx_rings + i);
		else
			na->rx_rings[i].nkr_stopped = 0;
		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY);
	}
}


void
netmap_disable_all_rings(struct ifnet *ifp)
{
	netmap_set_all_rings(ifp, 1 /* stopped */);
}


void
netmap_enable_all_rings(struct ifnet *ifp)
{
	netmap_set_all_rings(ifp, 0 /* enabled */);
}


/*
 * generic bound_checking function
 */
u_int
nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
{
	u_int oldv = *v;
	const char *op = NULL;

	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;
	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = hi;
		op = "Clamp";
	}
	if (op && msg)
		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
	return *v;
}


/*
 * packet-dump function, user-supplied or static buffer.
 * The destination buffer must be at least 30+4*len
 */
const char *
nm_dump_buf(char *p, int len, int lim, char *dst)
{
	static char _dst[8192];
	int i, j, i0;
	static char hex[] ="0123456789abcdef";
	char *o;	/* output position */

#define P_HI(x)	hex[((x) & 0xf0)>>4]
#define P_LO(x)	hex[((x) & 0xf)]
#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
	if (!dst)
		dst = _dst;
	if (lim <= 0 || lim > len)
		lim = len;
	o = dst;
	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
	o += strlen(o);
	/* hexdump routine */
	for (i = 0; i < lim; ) {
		sprintf(o, "%5d: ", i);
		o += strlen(o);
		memset(o, ' ', 48);
		i0 = i;
		for (j=0; j < 16 && i < lim; i++, j++) {
			o[j*3] = P_HI(p[i]);
			o[j*3+1] = P_LO(p[i]);
		}
		i = i0;
		for (j=0; j < 16 && i < lim; i++, j++)
			o[j + 48] = P_C(p[i]);
		o[j+48] = '\n';
		o += j+49;
	}
	*o = '\0';
#undef P_HI
#undef P_LO
#undef P_C
	return dst;
}


/*
 * Fetch configuration from the device, to cope with dynamic
 * reconfigurations after loading the module.
 */
int
netmap_update_config(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;
	u_int txr, txd, rxr, rxd;

	txr = txd = rxr = rxd = 0;
	if (na->nm_config) {
		na->nm_config(na, &txr, &txd, &rxr, &rxd);
	} else {
		/* take whatever we had at init time */
		txr = na->num_tx_rings;
		txd = na->num_tx_desc;
		rxr = na->num_rx_rings;
		rxd = na->num_rx_desc;
	}

	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
		return 0; /* nothing changed */
	if (netmap_verbose || na->active_fds > 0) {
		D("stored config %s: txring %d x %d, rxring %d x %d",
			NM_IFPNAME(ifp),
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc);
		D("new config %s: txring %d x %d, rxring %d x %d",
			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
	}
	if (na->active_fds == 0) {
		D("configuration changed (but fine)");
		na->num_tx_rings = txr;
		na->num_tx_desc = txd;
		na->num_rx_rings = rxr;
		na->num_rx_desc = rxd;
		return 0;
	}
	D("configuration changed while active, this is bad...");
	return 1;
}

static int
netmap_txsync_compat(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	return na->nm_txsync(na, kring->ring_id, flags);
}

static int
netmap_rxsync_compat(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	return na->nm_rxsync(na, kring->ring_id, flags);
}

static int
netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
{
	(void)flags;
	netmap_txsync_to_host(kring->na);
	return 0;
}

static int
netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
{
	(void)flags;
	netmap_rxsync_from_host(kring->na, NULL, NULL);
	return 0;
}



/* create the krings array and initialize the fields common to all adapters.
 * The array layout is this:
 *
 *                    +----------+
 * na->tx_rings ----->|          | \
 *                    |          |  } na->num_tx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host tx kring
 * na->rx_rings ----> +----------+
 *                    |          | \
 *                    |          |  } na->num_rx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host rx kring
 *                    +----------+
 * na->tailroom ----->|          | \
 *                    |          |  } tailroom bytes
 *                    |          | /
 *                    +----------+
 *
 * Note: for compatibility, host krings are created even when not needed.
 * The tailroom space is currently used by vale ports for allocating leases.
 * (A hedged indexing sketch based on this layout follows this comment.)
 */
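/*
 * Hedged illustration of the layout above (not compiled, local names are
 * hypothetical): once netmap_krings_create() below has run, the host
 * krings and the tailroom area can be located as follows.
 *
 *	struct netmap_kring *host_tx_kring = &na->tx_rings[na->num_tx_rings];
 *	struct netmap_kring *host_rx_kring = &na->rx_rings[na->num_rx_rings];
 *	char *tailroom_base = (char *)na->tailroom;	// 'tailroom' bytes, used by VALE leases
 */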
int
netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
{
	u_int i, len, ndesc;
	struct netmap_kring *kring;
	u_int ntx, nrx;

	/* account for the (possibly fake) host rings */
	ntx = na->num_tx_rings + 1;
	nrx = na->num_rx_rings + 1;

	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;

	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (na->tx_rings == NULL) {
		D("Cannot allocate krings");
		return ENOMEM;
	}
	na->rx_rings = na->tx_rings + ntx;

	/*
	 * All fields in krings are 0 except the ones initialized below,
	 * but better be explicit on important kring fields.
	 */
	ndesc = na->num_tx_desc;
	for (i = 0; i < ntx; i++) { /* Transmit rings */
		kring = &na->tx_rings[i];
		bzero(kring, sizeof(*kring));
		kring->na = na;
		kring->ring_id = i;
		kring->nkr_num_slots = ndesc;
		if (i < na->num_tx_rings) {
			kring->nm_sync = netmap_txsync_compat; // XXX
		} else if (i == na->num_tx_rings) {
			kring->nm_sync = netmap_txsync_to_host_compat;
		}
		/*
		 * IMPORTANT: Always keep one slot empty.
		 */
		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
		kring->rtail = kring->nr_hwtail = ndesc - 1;
		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
		ND("ktx %s h %d c %d t %d",
			kring->name, kring->rhead, kring->rcur, kring->rtail);
		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
		init_waitqueue_head(&kring->si);
	}

	ndesc = na->num_rx_desc;
	for (i = 0; i < nrx; i++) { /* Receive rings */
		kring = &na->rx_rings[i];
		bzero(kring, sizeof(*kring));
		kring->na = na;
		kring->ring_id = i;
		kring->nkr_num_slots = ndesc;
		if (i < na->num_rx_rings) {
			kring->nm_sync = netmap_rxsync_compat; // XXX
		} else if (i == na->num_rx_rings) {
			kring->nm_sync = netmap_rxsync_from_host_compat;
		}
		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
		kring->rtail = kring->nr_hwtail = 0;
		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
		ND("krx %s h %d c %d t %d",
			kring->name, kring->rhead, kring->rcur, kring->rtail);
		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
		init_waitqueue_head(&kring->si);
	}
	init_waitqueue_head(&na->tx_si);
	init_waitqueue_head(&na->rx_si);

	na->tailroom = na->rx_rings + nrx;

	return 0;
}


/* undo the actions performed by netmap_krings_create */
void
netmap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_kring *kring = na->tx_rings;

	/* we rely on the krings layout described above */
	for ( ; kring != na->tailroom; kring++) {
		mtx_destroy(&kring->q_lock);
	}
	free(na->tx_rings, M_DEVBUF);
	na->tx_rings = na->rx_rings = na->tailroom = NULL;
}


/*
 * Destructor for NIC ports. They also have an mbuf queue
 * on the rings connected to the host so we need to purge
 * them first.
 */
static void
netmap_hw_krings_delete(struct netmap_adapter *na)
{
	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;

	ND("destroy sw mbq with len %d", mbq_len(q));
	mbq_purge(q);
	mbq_safe_destroy(q);
	netmap_krings_delete(na);
}


static struct netmap_if*
netmap_if_new(const char *ifname, struct netmap_adapter *na)
{
	struct netmap_if *nifp;

	if (netmap_update_config(na)) {
		/* configuration mismatch, report and fail */
		return NULL;
	}

	if (na->active_fds)
		goto final;

	if (na->nm_krings_create(na))
		goto cleanup;

	if (netmap_mem_rings_create(na))
		goto cleanup;

final:

	nifp = netmap_mem_if_new(ifname, na);
	if (nifp == NULL)
		goto cleanup;

	return (nifp);

cleanup:

	if (na->active_fds == 0) {
		netmap_mem_rings_delete(na);
		na->nm_krings_delete(na);
	}

	return NULL;
}


/* grab a reference to the memory allocator, if we don't have one already. The
 * reference is taken from the netmap_adapter registered with the priv.
 */
static int
netmap_get_memory_locked(struct netmap_priv_d* p)
{
	struct netmap_mem_d *nmd;
	int error = 0;

	if (p->np_na == NULL) {
		if (!netmap_mmap_unreg)
			return ENODEV;
		/* for compatibility with older versions of the API
		 * we use the global allocator when no interface has been
		 * registered
		 */
		nmd = &nm_mem;
	} else {
		nmd = p->np_na->nm_mem;
	}
	if (p->np_mref == NULL) {
		error = netmap_mem_finalize(nmd);
		if (!error)
			p->np_mref = nmd;
	} else if (p->np_mref != nmd) {
		/* a virtual port has been registered, but previous
		 * syscalls already used the global allocator.
		 * We cannot continue
		 */
		error = ENODEV;
	}
	return error;
}


int
netmap_get_memory(struct netmap_priv_d* p)
{
	int error;
	NMG_LOCK();
	error = netmap_get_memory_locked(p);
	NMG_UNLOCK();
	return error;
}


static int
netmap_have_memory_locked(struct netmap_priv_d* p)
{
	return p->np_mref != NULL;
}


static void
netmap_drop_memory_locked(struct netmap_priv_d* p)
{
	if (p->np_mref) {
		netmap_mem_deref(p->np_mref);
		p->np_mref = NULL;
	}
}


/*
 * File descriptor's private data destructor.
 *
 * Call nm_register(ifp,0) to stop netmap mode on the interface and
 * revert to normal operation. We expect that np_na->ifp has not gone away.
 * The second argument is the nifp to work on. In some cases it is
 * not attached yet to the netmap_priv_d so we need to pass it as
 * a separate argument.
 */
/* call with NMG_LOCK held */
static void
netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
{
	struct netmap_adapter *na = priv->np_na;
	struct ifnet *ifp = na->ifp;

	NMG_LOCK_ASSERT();
	na->active_fds--;
	if (na->active_fds <= 0) {	/* last instance */

		if (netmap_verbose)
			D("deleting last instance for %s", NM_IFPNAME(ifp));
		/*
		 * (TO CHECK) This function is only called
		 * when the last reference to this file descriptor goes
		 * away. This means we cannot have any pending poll()
		 * or interrupt routine operating on the structure.
		 * XXX The file may be closed in a thread while
		 * another thread is using it.
		 * Linux keeps the file opened until the last reference
		 * by any outstanding ioctl/poll or mmap is gone.
		 * FreeBSD does not track mmap()s (but we do) and
		 * wakes up any sleeping poll(). Need to check what
		 * happens if the close() occurs while a concurrent
		 * syscall is running.
		 */
		if (ifp)
			na->nm_register(na, 0); /* off, clear flags */
		/* Wake up any sleeping threads. netmap_poll will
		 * then return POLLERR
		 * XXX The wake up now must happen during *_down(), when
		 * we order all activities to stop. -gl
		 */
		/* XXX kqueue(9) needed; these will mirror knlist_init. */
		/* knlist_destroy(&na->tx_si.si_note); */
		/* knlist_destroy(&na->rx_si.si_note); */

		/* delete rings and buffers */
		netmap_mem_rings_delete(na);
		na->nm_krings_delete(na);
	}
	/* delete the nifp */
	netmap_mem_if_delete(na, nifp);
}

static __inline int
nm_tx_si_user(struct netmap_priv_d *priv)
{
	return (priv->np_na != NULL &&
		(priv->np_txqlast - priv->np_txqfirst > 1));
}

static __inline int
nm_rx_si_user(struct netmap_priv_d *priv)
{
	return (priv->np_na != NULL &&
		(priv->np_rxqlast - priv->np_rxqfirst > 1));
}


/*
 * returns 1 if this is the last instance and we can free priv
 */
int
netmap_dtor_locked(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;

#ifdef __FreeBSD__
	/*
	 * np_refcount is the number of active mmaps on
	 * this file descriptor
	 */
	if (--priv->np_refcount > 0) {
		return 0;
	}
#endif /* __FreeBSD__ */
	if (!na) {
		return 1; //XXX is it correct?
	}
	netmap_do_unregif(priv, priv->np_nifp);
	priv->np_nifp = NULL;
	netmap_drop_memory_locked(priv);
	if (priv->np_na) {
		if (nm_tx_si_user(priv))
			na->tx_si_users--;
		if (nm_rx_si_user(priv))
			na->rx_si_users--;
		netmap_adapter_put(na);
		priv->np_na = NULL;
	}
	return 1;
}


void
netmap_dtor(void *data)
{
	struct netmap_priv_d *priv = data;
	int last_instance;

	NMG_LOCK();
	last_instance = netmap_dtor_locked(priv);
	NMG_UNLOCK();
	if (last_instance) {
		bzero(priv, sizeof(*priv));	/* for safety */
		free(priv, M_DEVBUF);
	}
}




/*
 * Handlers for synchronization of the queues from/to the host.
 * Netmap has two operating modes:
 * - in the default mode, the rings connected to the host stack are
 *   just another ring pair managed by userspace;
 * - in transparent mode (XXX to be defined) incoming packets
 *   (from the host or the NIC) are marked as NS_FORWARD upon
 *   arrival, and the user application has a chance to reset the
 *   flag for packets that should be dropped.
 *   On the RXSYNC or poll(), packets in RX rings between
 *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
 *   to the other side.
 *   (A hedged userspace sketch of this handshake follows this comment.)
 * The transfer NIC --> host is relatively easy, just encapsulate
 * into mbufs and we are done. The host --> NIC side is slightly
 * harder because there might not be room in the tx ring so it
 * might take a while before releasing the buffer.
 */
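/*
 * Hedged userspace sketch of the transparent-mode handshake described
 * above (illustrative only, kept inside a comment). "rxring" is assumed
 * to be a receive ring obtained with NETMAP_RXRING(), and should_drop()
 * is a hypothetical application filter; forwarding must be enabled via
 * the dev.netmap.fwd sysctl or the NR_FORWARD ring flag.
 *
 *	rxring->flags |= NR_FORWARD;	// request forwarding on this ring
 *	while (!nm_ring_empty(rxring)) {
 *		uint32_t i = rxring->cur;
 *		struct netmap_slot *slot = &rxring->slot[i];
 *		char *buf = NETMAP_BUF(rxring, slot->buf_idx);
 *		if (should_drop(buf, slot->len))
 *			slot->flags &= ~NS_FORWARD;	// kernel will not forward it
 *		rxring->head = rxring->cur = nm_ring_next(rxring, i);
 *	}
 *	// on the next rxsync/poll, slots still marked NS_FORWARD are
 *	// moved to the other side (host stack or NIC).
 */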
/*
 * pass a chain of buffers to the host stack as coming from 'dst'
 * We do not need to lock because the queue is private.
 */
static void
netmap_send_up(struct ifnet *dst, struct mbq *q)
{
	struct mbuf *m;

	/* send packets up, outside the lock */
	while ((m = mbq_dequeue(q)) != NULL) {
		if (netmap_verbose & NM_VERB_HOST)
			D("sending up pkt %p size %d", m, MBUF_LEN(m));
		NM_SEND_UP(dst, m);
	}
	mbq_destroy(q);
}


/*
 * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
 * and pass them up. Drop remaining packets in the unlikely event
 * of an mbuf shortage.
 */
static void
netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
{
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->ring->head;
	u_int n;
	struct netmap_adapter *na = kring->na;

	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
		struct mbuf *m;
		struct netmap_slot *slot = &kring->ring->slot[n];

		if ((slot->flags & NS_FORWARD) == 0 && !force)
			continue;
		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
			RD(5, "bad pkt at %d len %d", n, slot->len);
			continue;
		}
		slot->flags &= ~NS_FORWARD; // XXX needed ?
		/* XXX TODO: adapt to the case of a multisegment packet */
		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);

		if (m == NULL)
			break;
		mbq_enqueue(q, m);
	}
}


/*
 * Send to the NIC rings packets marked NS_FORWARD between
 * kring->nr_hwcur and kring->rhead
 * Called under kring->rx_queue.lock on the sw rx ring,
 */
static u_int
netmap_sw_to_nic(struct netmap_adapter *na)
{
	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
	struct netmap_slot *rxslot = kring->ring->slot;
	u_int i, rxcur = kring->nr_hwcur;
	u_int const head = kring->rhead;
	u_int const src_lim = kring->nkr_num_slots - 1;
	u_int sent = 0;

	/* scan rings to find space, then fill as much as possible */
	for (i = 0; i < na->num_tx_rings; i++) {
		struct netmap_kring *kdst = &na->tx_rings[i];
		struct netmap_ring *rdst = kdst->ring;
		u_int const dst_lim = kdst->nkr_num_slots - 1;

		/* XXX do we trust ring or kring->rcur,rtail ? */
		for (; rxcur != head && !nm_ring_empty(rdst);
		     rxcur = nm_next(rxcur, src_lim) ) {
			struct netmap_slot *src, *dst, tmp;
			u_int dst_cur = rdst->cur;

			src = &rxslot[rxcur];
			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
				continue;

			sent++;

			dst = &rdst->slot[dst_cur];

			tmp = *src;

			src->buf_idx = dst->buf_idx;
			src->flags = NS_BUF_CHANGED;

			dst->buf_idx = tmp.buf_idx;
			dst->len = tmp.len;
			dst->flags = NS_BUF_CHANGED;

			rdst->cur = nm_next(dst_cur, dst_lim);
		}
		/* if (sent) XXX txsync ? */
	}
	return sent;
}


/*
 * netmap_txsync_to_host() passes packets up. We are called from a
 * system call in user process context, and the only contention
 * can be among multiple user threads erroneously calling
 * this routine concurrently.
 */
void
netmap_txsync_to_host(struct netmap_adapter *na)
{
	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
	struct netmap_ring *ring = kring->ring;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	struct mbq q;

	/* Take packets from hwcur to head and pass them up.
	 * force head = cur since netmap_grab_packets() stops at head
	 * In case of no buffers we give up. At the end of the loop,
	 * the queue is drained in all cases.
	 */
	mbq_init(&q);
	ring->cur = head;
	netmap_grab_packets(kring, &q, 1 /* force */);
	ND("have %d pkts in queue", mbq_len(&q));
	kring->nr_hwcur = head;
	kring->nr_hwtail = head + lim;
	if (kring->nr_hwtail > lim)
		kring->nr_hwtail -= lim + 1;
	nm_txsync_finalize(kring);

	netmap_send_up(na->ifp, &q);
}


/*
 * rxsync backend for packets coming from the host stack.
 * They have been put in kring->rx_queue by netmap_transmit().
 * We protect access to the kring using kring->rx_queue.lock
 *
 * This routine also does the selrecord if called from the poll handler
 * (we know because td != NULL).
 *
 * NOTE: on linux, selrecord() is defined as a macro and uses pwait
 * as an additional hidden argument.
 * returns the number of packets delivered to tx queues in
 * transparent mode, or a negative value if error
 */
int
netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
{
	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, n;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	int ret = 0;
	struct mbq *q = &kring->rx_queue;

	(void)pwait;	/* disable unused warnings */
	(void)td;

	mtx_lock(&q->lock);

	/* First part: import newly received packets */
	n = mbq_len(q);
	if (n) { /* grab packets from the queue */
		struct mbuf *m;
		uint32_t stop_i;

		nm_i = kring->nr_hwtail;
		stop_i = nm_prev(nm_i, lim);
		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
			int len = MBUF_LEN(m);
			struct netmap_slot *slot = &ring->slot[nm_i];

			m_copydata(m, 0, len, BDG_NMB(na, slot));
			ND("nm %d len %d", nm_i, len);
			if (netmap_verbose)
				D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));

			slot->len = len;
			slot->flags = kring->nkr_slot_flags;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwtail = nm_i;
	}

	/*
	 * Second part: skip past packets that userspace has released.
	 */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) { /* something was released */
		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
			ret = netmap_sw_to_nic(na);
		kring->nr_hwcur = head;
	}

	nm_rxsync_finalize(kring);

	/* access copies of cur,tail in the kring */
	if (kring->rcur == kring->rtail && td) /* no bufs available */
		selrecord(td, &kring->si);

	mtx_unlock(&q->lock);
	return ret;
}


/* Get a netmap adapter for the port.
 *
 * If it is possible to satisfy the request, return 0
 * with *na containing the netmap adapter found.
 * Otherwise return an error code, with *na containing NULL.
 *
 * When the port is attached to a bridge, we always return
 * EBUSY.
 * Otherwise, if the port is already bound to a file descriptor,
 * then we unconditionally return the existing adapter into *na.
 * In all the other cases, we return (into *na) either native,
 * generic or NULL, according to the following table:
 *
 *                                      native_support
 * active_fds   dev.netmap.admode         YES     NO
 * -------------------------------------------------------
 *    >0              *                 NA(ifp) NA(ifp)
 *
 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
 *
 */

int
netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
{
	/* generic support */
	int i = netmap_admode;	/* Take a snapshot. */
	int error = 0;
	struct netmap_adapter *prev_na;
	struct netmap_generic_adapter *gna;

	*na = NULL; /* default */

	/* reset in case of invalid value */
	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
		i = netmap_admode = NETMAP_ADMODE_BEST;

	if (NETMAP_CAPABLE(ifp)) {
		/* If an adapter already exists, but is
		 * attached to a vale port, we report that the
		 * port is busy.
		 */
		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
			return EBUSY;

		/* If an adapter already exists, return it if
		 * there are active file descriptors or if
		 * netmap is not forced to use generic
		 * adapters.
		 */
		if (NA(ifp)->active_fds > 0 ||
				i != NETMAP_ADMODE_GENERIC) {
			*na = NA(ifp);
			return 0;
		}
	}

	/* If there isn't native support and netmap is not allowed
	 * to use generic adapters, we cannot satisfy the request.
	 */
	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
		return EOPNOTSUPP;

	/* Otherwise, create a generic adapter and return it,
	 * saving the previously used netmap adapter, if any.
	 *
	 * Note that here 'prev_na', if not NULL, MUST be a
	 * native adapter, and CANNOT be a generic one. This is
	 * true because generic adapters are created on demand, and
	 * destroyed when not used anymore. Therefore, if the adapter
	 * currently attached to an interface 'ifp' is generic, it
	 * must be that
	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
	 * Consequently, if NA(ifp) is generic, we will enter one of
	 * the branches above. This ensures that we never override
	 * a generic adapter with another generic adapter.
	 */
	prev_na = NA(ifp);
	error = generic_netmap_attach(ifp);
	if (error)
		return error;

	*na = NA(ifp);
	gna = (struct netmap_generic_adapter*)NA(ifp);
	gna->prev = prev_na; /* save old na */
	if (prev_na != NULL) {
		ifunit_ref(ifp->if_xname);
		// XXX add a refcount ?
		netmap_adapter_get(prev_na);
	}
	ND("Created generic NA %p (prev %p)", gna, gna->prev);

	return 0;
}


/*
 * MUST BE CALLED UNDER NMG_LOCK()
 *
 * Get a refcounted reference to a netmap adapter attached
 * to the interface specified by nmr.
 * This is always called in the execution of an ioctl().
 *
 * Return ENXIO if the interface specified by the request does
 * not exist, ENOTSUP if netmap is not supported by the interface,
 * EBUSY if the interface is already attached to a bridge,
 * EINVAL if parameters are invalid, ENOMEM if needed resources
 * could not be allocated.
 * If successful, hold a reference to the netmap adapter.
 *
 * No reference is kept on the real interface, which may then
 * disappear at any time.
 */
int
netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	struct ifnet *ifp = NULL;
	int error = 0;
	struct netmap_adapter *ret = NULL;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();

	error = netmap_get_pipe_na(nmr, na, create);
	if (error || *na != NULL)
		return error;

	error = netmap_get_bdg_na(nmr, na, create);
	if (error)
		return error;

	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
		goto pipes;

	ifp = ifunit_ref(nmr->nr_name);
	if (ifp == NULL) {
		return ENXIO;
	}

	error = netmap_get_hw_na(ifp, &ret);
	if (error)
		goto out;

	/* Users cannot use the NIC attached to a bridge directly */
	if (NETMAP_OWNED_BY_KERN(ret)) {
		error = EBUSY;
		goto out;
	}
	*na = ret;
	netmap_adapter_get(ret);

pipes:
	error = netmap_pipe_alloc(*na, nmr);

out:
	if (error && ret != NULL)
		netmap_adapter_put(ret);

	if (ifp)
		if_rele(ifp);

	return error;
}


/*
 * validate parameters on entry for *_txsync()
 * Returns ring->head if ok, or something >= kring->nkr_num_slots
 * in case of error.
 *
 * rhead, rcur and rtail=hwtail are stored from previous round.
 * hwcur is the next packet to send to the ring.
 *
 * We want
 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
 *
 * hwcur, rhead, rtail and hwtail are reliable
 */
u_int
nm_txsync_prologue(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	u_int head = ring->head; /* read only once */
	u_int cur = ring->cur; /* read only once */
	u_int n = kring->nkr_num_slots;

	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
#if 1 /* kernel sanity checks; but we can trust the kring. */
	if (kring->nr_hwcur >= n || kring->rhead >= n ||
	    kring->rtail >= n || kring->nr_hwtail >= n)
		goto error;
#endif /* kernel sanity checks */
	/*
	 * user sanity checks. We only use 'cur',
	 * A, B, ... are possible positions for cur:
	 *
	 *  0    A  cur   B  tail  C  n-1
	 *  0    D  tail  E  cur   F  n-1
	 *
	 * B, F, D are valid. A, C, E are wrong
	 */
	if (kring->rtail >= kring->rhead) {
		/* want rhead <= head <= rtail */
		if (head < kring->rhead || head > kring->rtail)
			goto error;
		/* and also head <= cur <= rtail */
		if (cur < head || cur > kring->rtail)
			goto error;
	} else { /* here rtail < rhead */
		/* we need head outside rtail .. rhead */
		if (head > kring->rtail && head < kring->rhead)
			goto error;

		/* two cases now: head <= rtail or head >= rhead  */
		if (head <= kring->rtail) {
			/* want head <= cur <= rtail */
			if (cur < head || cur > kring->rtail)
				goto error;
		} else { /* head >= rhead */
			/* cur must be outside rtail..head */
			if (cur > kring->rtail && cur < head)
				goto error;
		}
	}
	if (ring->tail != kring->rtail) {
		RD(5, "tail overwritten was %d need %d",
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	kring->rhead = head;
	kring->rcur = cur;
	return head;

error:
	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
		kring->name,
		kring->nr_hwcur,
		kring->rcur, kring->nr_hwtail,
		cur, ring->tail);
	return n;
}


/*
 * validate parameters on entry for *_rxsync()
 * Returns ring->head if ok, kring->nkr_num_slots on error.
 *
 * For a valid configuration,
 *     hwcur <= head <= cur <= tail <= hwtail
 *
 * We only consider head and cur.
 * hwcur and hwtail are reliable.
 *
 */
u_int
nm_rxsync_prologue(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	uint32_t const n = kring->nkr_num_slots;
	uint32_t head, cur;

	ND("%s kc %d kt %d h %d c %d t %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However:
	 * - head is not an issue because the previous value is hwcur;
	 * - cur could in principle go back, however it does not matter
	 *   because we are processing a brand new rxsync()
	 */
	cur = kring->rcur = ring->cur;	/* read only once */
	head = kring->rhead = ring->head;	/* read only once */
#if 1 /* kernel sanity checks */
	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
		goto error;
#endif /* kernel sanity checks */
	/* user sanity checks */
	if (kring->nr_hwtail >= kring->nr_hwcur) {
		/* want hwcur <= rhead <= hwtail */
		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
			goto error;
		/* and also rhead <= rcur <= hwtail */
		if (cur < head || cur > kring->nr_hwtail)
			goto error;
	} else {
		/* we need rhead outside hwtail..hwcur */
		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
			goto error;
		/* two cases now: head <= hwtail or head >= hwcur  */
		if (head <= kring->nr_hwtail) {
			/* want head <= cur <= hwtail */
			if (cur < head || cur > kring->nr_hwtail)
				goto error;
		} else {
			/* cur must be outside hwtail..head */
			if (cur < head && cur > kring->nr_hwtail)
				goto error;
		}
	}
	if (ring->tail != kring->rtail) {
		RD(5, "%s tail overwritten was %d need %d",
			kring->name,
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	return head;

error:
	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
		kring->nr_hwcur,
		kring->rcur, kring->nr_hwtail,
		kring->rhead, kring->rcur, ring->tail);
	return n;
}


/*
 * Error routine called when txsync/rxsync detects an error.
 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
 * Return 1 on reinit.
 *
 * This routine is only called by the upper half of the kernel.
 * It only reads hwcur (which is changed only by the upper half, too)
 * and hwtail (which may be changed by the lower half, but only on
 * a tx ring and only to increase it, so any error will be recovered
 * on the next call). For the above, we don't strictly need to call
 * it under lock.
 */
int
netmap_ring_reinit(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	u_int i, lim = kring->nkr_num_slots - 1;
	int errors = 0;

	// XXX KASSERT nm_kr_tryget
	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
	// XXX probably wrong to trust userspace
	kring->rhead = ring->head;
	kring->rcur  = ring->cur;
	kring->rtail = ring->tail;

	if (ring->cur > lim)
		errors++;
	if (ring->head > lim)
		errors++;
	if (ring->tail > lim)
		errors++;
	for (i = 0; i <= lim; i++) {
		u_int idx = ring->slot[i].buf_idx;
		u_int len = ring->slot[i].len;
		if (idx < 2 || idx >= netmap_total_buffers) {
			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
			ring->slot[i].buf_idx = 0;
			ring->slot[i].len = 0;
		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
			ring->slot[i].len = 0;
			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
		}
	}
	if (errors) {
		RD(10, "total %d errors", errors);
		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
			kring->name,
			ring->cur, kring->nr_hwcur,
			ring->tail, kring->nr_hwtail);
		ring->head = kring->rhead = kring->nr_hwcur;
		ring->cur  = kring->rcur  = kring->nr_hwcur;
		ring->tail = kring->rtail = kring->nr_hwtail;
	}
	return (errors ? 1 : 0);
}


/*
 * Set the ring ID. For devices with a single queue, a request
 * for all rings is the same as a single ring.
 */
static int
netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
{
	struct netmap_adapter *na = priv->np_na;
	u_int j, i = ringid & NETMAP_RING_MASK;
	u_int reg = flags & NR_REG_MASK;

	if (reg == NR_REG_DEFAULT) {
		/* convert from old ringid to flags */
		if (ringid & NETMAP_SW_RING) {
			reg = NR_REG_SW;
		} else if (ringid & NETMAP_HW_RING) {
			reg = NR_REG_ONE_NIC;
		} else {
			reg = NR_REG_ALL_NIC;
		}
		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
	}
	switch (reg) {
	case NR_REG_ALL_NIC:
	case NR_REG_PIPE_MASTER:
	case NR_REG_PIPE_SLAVE:
		priv->np_txqfirst = 0;
		priv->np_txqlast = na->num_tx_rings;
		priv->np_rxqfirst = 0;
		priv->np_rxqlast = na->num_rx_rings;
		ND("%s %d %d", "ALL/PIPE",
			priv->np_rxqfirst, priv->np_rxqlast);
		break;
	case NR_REG_SW:
	case NR_REG_NIC_SW:
		if (!(na->na_flags & NAF_HOST_RINGS)) {
			D("host rings not supported");
			return EINVAL;
		}
		priv->np_txqfirst = (reg == NR_REG_SW ?
			na->num_tx_rings : 0);
		priv->np_txqlast = na->num_tx_rings + 1;
		priv->np_rxqfirst = (reg == NR_REG_SW ?
			na->num_rx_rings : 0);
		priv->np_rxqlast = na->num_rx_rings + 1;
		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
			priv->np_rxqfirst, priv->np_rxqlast);
"SW" : "NIC+SW", 1488 priv->np_rxqfirst, priv->np_rxqlast); 1489 break; 1490 case NR_REG_ONE_NIC: 1491 if (i >= na->num_tx_rings && i >= na->num_rx_rings) { 1492 D("invalid ring id %d", i); 1493 return EINVAL; 1494 } 1495 /* if not enough rings, use the first one */ 1496 j = i; 1497 if (j >= na->num_tx_rings) 1498 j = 0; 1499 priv->np_txqfirst = j; 1500 priv->np_txqlast = j + 1; 1501 j = i; 1502 if (j >= na->num_rx_rings) 1503 j = 0; 1504 priv->np_rxqfirst = j; 1505 priv->np_rxqlast = j + 1; 1506 break; 1507 default: 1508 D("invalid regif type %d", reg); 1509 return EINVAL; 1510 } 1511 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 1512 priv->np_flags = (flags & ~NR_REG_MASK) | reg; 1513 if (nm_tx_si_user(priv)) 1514 na->tx_si_users++; 1515 if (nm_rx_si_user(priv)) 1516 na->rx_si_users++; 1517 if (netmap_verbose) { 1518 D("%s: tx [%d,%d) rx [%d,%d) id %d", 1519 NM_IFPNAME(na->ifp), 1520 priv->np_txqfirst, 1521 priv->np_txqlast, 1522 priv->np_rxqfirst, 1523 priv->np_rxqlast, 1524 i); 1525 } 1526 return 0; 1527 } 1528 1529 /* 1530 * possibly move the interface to netmap-mode. 1531 * If success it returns a pointer to netmap_if, otherwise NULL. 1532 * This must be called with NMG_LOCK held. 1533 */ 1534 struct netmap_if * 1535 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, 1536 uint16_t ringid, uint32_t flags, int *err) 1537 { 1538 struct ifnet *ifp = na->ifp; 1539 struct netmap_if *nifp = NULL; 1540 int error, need_mem = 0; 1541 1542 NMG_LOCK_ASSERT(); 1543 /* ring configuration may have changed, fetch from the card */ 1544 netmap_update_config(na); 1545 priv->np_na = na; /* store the reference */ 1546 error = netmap_set_ringid(priv, ringid, flags); 1547 if (error) 1548 goto out; 1549 /* ensure allocators are ready */ 1550 need_mem = !netmap_have_memory_locked(priv); 1551 if (need_mem) { 1552 error = netmap_get_memory_locked(priv); 1553 ND("get_memory returned %d", error); 1554 if (error) 1555 goto out; 1556 } 1557 nifp = netmap_if_new(NM_IFPNAME(ifp), na); 1558 if (nifp == NULL) { /* allocation failed */ 1559 /* we should drop the allocator, but only 1560 * if we were the ones who grabbed it 1561 */ 1562 error = ENOMEM; 1563 goto out; 1564 } 1565 na->active_fds++; 1566 if (ifp->if_capenable & IFCAP_NETMAP) { 1567 /* was already set */ 1568 } else { 1569 /* Otherwise set the card in netmap mode 1570 * and make it use the shared buffers. 1571 * 1572 * do not core lock because the race is harmless here, 1573 * there cannot be any traffic to netmap_transmit() 1574 */ 1575 na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; 1576 ND("%p->na_lut == %p", na, na->na_lut); 1577 na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; 1578 error = na->nm_register(na, 1); /* mode on */ 1579 if (error) { 1580 netmap_do_unregif(priv, nifp); 1581 nifp = NULL; 1582 } 1583 } 1584 out: 1585 *err = error; 1586 if (error) { 1587 priv->np_na = NULL; 1588 if (need_mem) 1589 netmap_drop_memory_locked(priv); 1590 } 1591 if (nifp != NULL) { 1592 /* 1593 * advertise that the interface is ready bt setting ni_nifp. 1594 * The barrier is needed because readers (poll and *SYNC) 1595 * check for priv->np_nifp != NULL without locking 1596 */ 1597 wmb(); /* make sure previous writes are visible to all CPUs */ 1598 priv->np_nifp = nifp; 1599 } 1600 return nifp; 1601 } 1602 1603 1604 1605 /* 1606 * ioctl(2) support for the "netmap" device. 
 *
 * Following a list of accepted commands:
 * - NIOCGINFO
 * - SIOCGIFADDR	just for convenience
 * - NIOCREGIF
 * - NIOCTXSYNC
 * - NIOCRXSYNC
 *
 * Return 0 on success, errno otherwise.
 */
int
netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
	int fflag, struct thread *td)
{
	struct netmap_priv_d *priv = NULL;
	struct ifnet *ifp = NULL;
	struct nmreq *nmr = (struct nmreq *) data;
	struct netmap_adapter *na = NULL;
	int error;
	u_int i, qfirst, qlast;
	struct netmap_if *nifp;
	struct netmap_kring *krings;

	(void)dev;	/* UNUSED */
	(void)fflag;	/* UNUSED */

	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
		/* truncate name */
		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
		if (nmr->nr_version != NETMAP_API) {
			D("API mismatch for %s got %d need %d",
				nmr->nr_name,
				nmr->nr_version, NETMAP_API);
			nmr->nr_version = NETMAP_API;
		}
		if (nmr->nr_version < NETMAP_MIN_API ||
		    nmr->nr_version > NETMAP_MAX_API) {
			return EINVAL;
		}
	}
	CURVNET_SET(TD_TO_VNET(td));

	error = devfs_get_cdevpriv((void **)&priv);
	if (error) {
		CURVNET_RESTORE();
		/* XXX ENOENT should be impossible, since the priv
		 * is now created in the open */
		return (error == ENOENT ? ENXIO : error);
	}

	switch (cmd) {
	case NIOCGINFO:		/* return capabilities etc */
		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
			error = netmap_bdg_ctl(nmr, NULL);
			break;
		}

		NMG_LOCK();
		do {
			/* memsize is always valid */
			struct netmap_mem_d *nmd = &nm_mem;
			u_int memflags;

			if (nmr->nr_name[0] != '\0') {
				/* get a refcount */
				error = netmap_get_na(nmr, &na, 1 /* create */);
				if (error)
					break;
				nmd = na->nm_mem; /* get memory allocator */
			}

			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
				&nmr->nr_arg2);
			if (error)
				break;
			if (na == NULL) /* only memory info */
				break;
			nmr->nr_offset = 0;
			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
			netmap_update_config(na);
			nmr->nr_rx_rings = na->num_rx_rings;
			nmr->nr_tx_rings = na->num_tx_rings;
			nmr->nr_rx_slots = na->num_rx_desc;
			nmr->nr_tx_slots = na->num_tx_desc;
			netmap_adapter_put(na);
		} while (0);
		NMG_UNLOCK();
		break;

	case NIOCREGIF:
		/* possibly attach/detach NIC and VALE switch */
		i = nmr->nr_cmd;
		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
				|| i == NETMAP_BDG_VNET_HDR) {
			error = netmap_bdg_ctl(nmr, NULL);
			break;
		} else if (i != 0) {
			D("nr_cmd must be 0 not %d", i);
			error = EINVAL;
			break;
		}

		/* protect access to priv from concurrent NIOCREGIF */
		NMG_LOCK();
		do {
			u_int memflags;

			if (priv->np_na != NULL) {	/* thread already registered */
				error = EBUSY;
				break;
			}
			/* find the interface and a reference */
			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
			if (error)
				break;
			ifp = na->ifp;
			if (NETMAP_OWNED_BY_KERN(na)) {
				netmap_adapter_put(na);
				error = EBUSY;
				break;
			}
			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
			if (!nifp) {	/* reg. failed, release priv and ref */
				netmap_adapter_put(na);
				priv->np_nifp = NULL;
				break;
			}
			priv->np_td = td; // XXX kqueue, debugging only

			/* return the offset of the netmap_if object */
			nmr->nr_rx_rings = na->num_rx_rings;
			nmr->nr_tx_rings = na->num_tx_rings;
			nmr->nr_rx_slots = na->num_rx_desc;
			nmr->nr_tx_slots = na->num_tx_desc;
			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
				&nmr->nr_arg2);
			if (error) {
				netmap_adapter_put(na);
				break;
			}
			if (memflags & NETMAP_MEM_PRIVATE) {
				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
			}
			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;

			if (nmr->nr_arg3) {
				D("requested %d extra buffers", nmr->nr_arg3);
				nmr->nr_arg3 = netmap_extra_alloc(na,
					&nifp->ni_bufs_head, nmr->nr_arg3);
				D("got %d extra buffers", nmr->nr_arg3);
			}
			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
		} while (0);
		NMG_UNLOCK();
		break;

	case NIOCTXSYNC:
	case NIOCRXSYNC:
		nifp = priv->np_nifp;

		if (nifp == NULL) {
			error = ENXIO;
			break;
		}
		rmb(); /* make sure following reads are not from cache */

		na = priv->np_na;      /* we have a reference */

		if (na == NULL) {
			D("Internal error: nifp != NULL && na == NULL");
			error = ENXIO;
			break;
		}

		ifp = na->ifp;
		if (ifp == NULL) {
			RD(1, "the ifp is gone");
			error = ENXIO;
			break;
		}

		if (cmd == NIOCTXSYNC) {
			krings = na->tx_rings;
			qfirst = priv->np_txqfirst;
			qlast = priv->np_txqlast;
		} else {
			krings = na->rx_rings;
			qfirst = priv->np_rxqfirst;
			qlast = priv->np_rxqlast;
		}

		for (i = qfirst; i < qlast; i++) {
			struct netmap_kring *kring = krings + i;
			if (nm_kr_tryget(kring)) {
				error = EBUSY;
				goto out;
			}
			if (cmd == NIOCTXSYNC) {
				if (netmap_verbose & NM_VERB_TXSYNC)
					D("pre txsync ring %d cur %d hwcur %d",
					    i, kring->ring->cur,
					    kring->nr_hwcur);
				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
					netmap_ring_reinit(kring);
				} else {
					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
				}
				if (netmap_verbose & NM_VERB_TXSYNC)
					D("post txsync ring %d cur %d hwcur %d",
					    i, kring->ring->cur,
					    kring->nr_hwcur);
			} else {
				kring->nm_sync(kring, NAF_FORCE_READ);
				microtime(&na->rx_rings[i].ring->ts);
			}
			nm_kr_put(kring);
		}

		break;

#ifdef __FreeBSD__
	case FIONBIO:
	case FIOASYNC:
		ND("FIONBIO/FIOASYNC are no-ops");
		break;

	case BIOCIMMEDIATE:
	case BIOCGHDRCMPLT:
	case BIOCSHDRCMPLT:
	case BIOCSSEESENT:
		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
		break;

	default:	/* allow device-specific ioctls */
	    {
		struct socket so;

		bzero(&so, sizeof(so));
		NMG_LOCK();
		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
		if (error) {
			netmap_adapter_put(na);
			NMG_UNLOCK();
			break;
		}
		ifp = na->ifp;
		so.so_vnet = ifp->if_vnet;
		// so->so_proto not null.
		error = ifioctl(&so, cmd, data, td);
		netmap_adapter_put(na);
		NMG_UNLOCK();
		break;
	    }

#else /* linux */
	default:
		error = EOPNOTSUPP;
#endif /* linux */
	}
out:

	CURVNET_RESTORE();
	return (error);
}


/*
 * select(2) and poll(2) handlers for the "netmap" device.
 *
 * Can be called for one or more queues.
 * Return the event mask corresponding to ready events.
 * If there are no ready events, do a selrecord on either individual
 * selinfo or on the global one.
 * Device-dependent parts (locking and sync of tx/rx rings)
 * are done through callbacks.
 *
 * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
 * The first one is remapped to pwait as selrecord() uses the name as a
 * hidden argument.
 */
int
netmap_poll(struct cdev *dev, int events, struct thread *td)
{
	struct netmap_priv_d *priv = NULL;
	struct netmap_adapter *na;
	struct ifnet *ifp;
	struct netmap_kring *kring;
	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
	struct mbq q;	/* packets from hw queues to host stack */
	void *pwait = dev;	/* linux compatibility */
	int is_kevent = 0;

	/*
	 * In order to avoid nested locks, we need to "double check"
	 * txsync and rxsync if we decide to do a selrecord().
	 * retry_tx (and retry_rx, later) prevent looping forever.
	 */
	int retry_tx = 1, retry_rx = 1;

	(void)pwait;
	mbq_init(&q);

	/*
	 * XXX kevent has curthread->td_fpop == NULL,
	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
	 * priv as the first argument, which is also useful to avoid
	 * the selrecord() which are not necessary in that case.
	 */
	if (devfs_get_cdevpriv((void **)&priv) != 0) {
		is_kevent = 1;
		if (netmap_verbose)
			D("called from kevent");
		priv = (struct netmap_priv_d *)dev;
	}
	if (priv == NULL)
		return POLLERR;

	if (priv->np_nifp == NULL) {
		D("No if registered");
		return POLLERR;
	}
	rmb(); /* make sure following reads are not from cache */

	na = priv->np_na;
	ifp = na->ifp;
	// check for deleted
	if (ifp == NULL) {
		RD(1, "the ifp is gone");
		return POLLERR;
	}

	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
		return POLLERR;

	if (netmap_verbose & 0x8000)
		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
	want_tx = events & (POLLOUT | POLLWRNORM);
	want_rx = events & (POLLIN | POLLRDNORM);


	/*
	 * check_all_{tx|rx} are set if the card has more than one queue AND
	 * the file descriptor is bound to all of them. If so, we sleep on
	 * the "global" selinfo, otherwise we sleep on individual selinfo
	 * (FreeBSD only allows two selinfo's per file descriptor).
	 * The interrupt routine in the driver wakes one or the other
	 * (or both) depending on which clients are active.
	 *
	 * rxsync() is only called if we run out of buffers on a POLLIN.
	 * txsync() is called if we run out of buffers on POLLOUT, or
	 * there are pending packets to send. The latter can be disabled
	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
	 */
	check_all_tx = nm_tx_si_user(priv);
	check_all_rx = nm_rx_si_user(priv);

	/*
	 * We start with a lock free round which is cheap if we have
	 * slots available. If this fails, then lock and call the sync
	 * routines.
	 */
	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
		kring = &na->rx_rings[i];
		/* XXX compare ring->cur and kring->tail */
		if (!nm_ring_empty(kring->ring)) {
			revents |= want_rx;
			want_rx = 0;	/* also breaks the loop */
		}
	}
	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
		kring = &na->tx_rings[i];
		/* XXX compare ring->cur and kring->tail */
		if (!nm_ring_empty(kring->ring)) {
			revents |= want_tx;
			want_tx = 0;	/* also breaks the loop */
		}
	}

	/*
	 * If we want to push packets out (priv->np_txpoll) or
	 * want_tx is still set, we must issue txsync calls
	 * (on all rings, to avoid that the tx rings stall).
	 * XXX should also check cur != hwcur on the tx rings.
	 * Fortunately, normal tx mode has np_txpoll set.
	 */
	if (priv->np_txpoll || want_tx) {
		/*
		 * The first round checks if anyone is ready, if not
		 * do a selrecord and another round to handle races.
		 * want_tx goes to 0 if any space is found, and is
		 * used to skip rings with no pending transmissions.
		 */
flush_tx:
		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
			int found = 0;

			kring = &na->tx_rings[i];
			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
				continue;
			/* only one thread does txsync */
			if (nm_kr_tryget(kring)) {
				if (netmap_verbose)
					RD(2, "%p lost race on txring %d, ok",
					    priv, i);
				continue;
			}
			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
				netmap_ring_reinit(kring);
				revents |= POLLERR;
			} else {
				if (kring->nm_sync(kring, 0))
					revents |= POLLERR;
			}

			/*
			 * If we found new slots, notify potential
			 * listeners on the same ring.
			 * Since we just did a txsync, look at the copies
			 * of cur,tail in the kring.
			 */
			found = kring->rcur != kring->rtail;
			nm_kr_put(kring);
			if (found) { /* notify other listeners */
				revents |= want_tx;
				want_tx = 0;
				na->nm_notify(na, i, NR_TX, 0);
			}
		}
		if (want_tx && retry_tx && !is_kevent) {
			selrecord(td, check_all_tx ?
			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
			retry_tx = 0;
			goto flush_tx;
		}
	}

	/*
	 * If want_rx is still set scan receive rings.
	 * Do it on all rings because otherwise we starve.
	 */
	if (want_rx) {
		int send_down = 0; /* transparent mode */
		/* two rounds here for race avoidance */
do_retry_rx:
		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
			int found = 0;

			kring = &na->rx_rings[i];

			if (nm_kr_tryget(kring)) {
				if (netmap_verbose)
					RD(2, "%p lost race on rxring %d, ok",
					    priv, i);
				continue;
			}

			/*
			 * transparent mode support: collect packets
			 * from the rxring(s).
2069 * XXX NR_FORWARD should only be read on 2070 * physical or NIC ports 2071 */ 2072 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 2073 ND(10, "forwarding some buffers up %d to %d", 2074 kring->nr_hwcur, kring->ring->cur); 2075 netmap_grab_packets(kring, &q, netmap_fwd); 2076 } 2077 2078 if (kring->nm_sync(kring, 0)) 2079 revents |= POLLERR; 2080 if (netmap_no_timestamp == 0 || 2081 kring->ring->flags & NR_TIMESTAMP) { 2082 microtime(&kring->ring->ts); 2083 } 2084 /* after an rxsync we can use kring->rcur, rtail */ 2085 found = kring->rcur != kring->rtail; 2086 nm_kr_put(kring); 2087 if (found) { 2088 revents |= want_rx; 2089 retry_rx = 0; 2090 na->nm_notify(na, i, NR_RX, 0); 2091 } 2092 } 2093 2094 /* transparent mode XXX only during first pass ? */ 2095 if (na->na_flags & NAF_HOST_RINGS) { 2096 kring = &na->rx_rings[na->num_rx_rings]; 2097 if (check_all_rx 2098 && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { 2099 /* XXX fix to use kring fields */ 2100 if (nm_ring_empty(kring->ring)) 2101 send_down = netmap_rxsync_from_host(na, td, dev); 2102 if (!nm_ring_empty(kring->ring)) 2103 revents |= want_rx; 2104 } 2105 } 2106 2107 if (retry_rx && !is_kevent) 2108 selrecord(td, check_all_rx ? 2109 &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si); 2110 if (send_down > 0 || retry_rx) { 2111 retry_rx = 0; 2112 if (send_down) 2113 goto flush_tx; /* and retry_rx */ 2114 else 2115 goto do_retry_rx; 2116 } 2117 } 2118 2119 /* 2120 * Transparent mode: marked bufs on rx rings between 2121 * kring->nr_hwcur and ring->head 2122 * are passed to the other endpoint. 2123 * 2124 * In this mode we also scan the sw rxring, which in 2125 * turn passes packets up. 2126 * 2127 * XXX Transparent mode at the moment requires to bind all 2128 * rings to a single file descriptor. 2129 */ 2130 2131 if (q.head) 2132 netmap_send_up(na->ifp, &q); 2133 2134 return (revents); 2135 } 2136 2137 2138 /*-------------------- driver support routines -------------------*/ 2139 2140 static int netmap_hw_krings_create(struct netmap_adapter *); 2141 2142 static int 2143 netmap_notify(struct netmap_adapter *na, u_int n_ring, 2144 enum txrx tx, int flags) 2145 { 2146 struct netmap_kring *kring; 2147 2148 if (tx == NR_TX) { 2149 kring = na->tx_rings + n_ring; 2150 OS_selwakeup(&kring->si, PI_NET); 2151 if (na->tx_si_users > 0) 2152 OS_selwakeup(&na->tx_si, PI_NET); 2153 } else { 2154 kring = na->rx_rings + n_ring; 2155 OS_selwakeup(&kring->si, PI_NET); 2156 if (na->rx_si_users > 0) 2157 OS_selwakeup(&na->rx_si, PI_NET); 2158 } 2159 return 0; 2160 } 2161 2162 2163 // XXX check handling of failures 2164 int 2165 netmap_attach_common(struct netmap_adapter *na) 2166 { 2167 struct ifnet *ifp = na->ifp; 2168 2169 if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { 2170 D("%s: invalid rings tx %d rx %d", 2171 ifp->if_xname, na->num_tx_rings, na->num_rx_rings); 2172 return EINVAL; 2173 } 2174 WNA(ifp) = na; 2175 2176 /* the following is only needed for na that use the host port. 2177 * XXX do we have something similar for linux ? 
2178 */ 2179 #ifdef __FreeBSD__ 2180 na->if_input = ifp->if_input; /* for netmap_send_up */ 2181 #endif /* __FreeBSD__ */ 2182 2183 NETMAP_SET_CAPABLE(ifp); 2184 if (na->nm_krings_create == NULL) { 2185 na->nm_krings_create = netmap_hw_krings_create; 2186 na->nm_krings_delete = netmap_hw_krings_delete; 2187 } 2188 if (na->nm_notify == NULL) 2189 na->nm_notify = netmap_notify; 2190 na->active_fds = 0; 2191 2192 if (na->nm_mem == NULL) 2193 na->nm_mem = &nm_mem; 2194 return 0; 2195 } 2196 2197 2198 void 2199 netmap_detach_common(struct netmap_adapter *na) 2200 { 2201 if (na->ifp) 2202 WNA(na->ifp) = NULL; /* XXX do we need this? */ 2203 2204 if (na->tx_rings) { /* XXX should not happen */ 2205 D("freeing leftover tx_rings"); 2206 na->nm_krings_delete(na); 2207 } 2208 netmap_pipe_dealloc(na); 2209 if (na->na_flags & NAF_MEM_OWNER) 2210 netmap_mem_private_delete(na->nm_mem); 2211 bzero(na, sizeof(*na)); 2212 free(na, M_DEVBUF); 2213 } 2214 2215 2216 /* 2217 * Initialize a ``netmap_adapter`` object created by driver on attach. 2218 * We allocate a block of memory with room for a struct netmap_adapter 2219 * plus two sets of N+2 struct netmap_kring (where N is the number 2220 * of hardware rings): 2221 * krings 0..N-1 are for the hardware queues. 2222 * kring N is for the host stack queue 2223 * kring N+1 is only used for the selinfo for all queues. // XXX still true ? 2224 * Return 0 on success, ENOMEM otherwise. 2225 */ 2226 int 2227 netmap_attach(struct netmap_adapter *arg) 2228 { 2229 struct netmap_hw_adapter *hwna = NULL; 2230 // XXX when is arg == NULL ? 2231 struct ifnet *ifp = arg ? arg->ifp : NULL; 2232 2233 if (arg == NULL || ifp == NULL) 2234 goto fail; 2235 hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); 2236 if (hwna == NULL) 2237 goto fail; 2238 hwna->up = *arg; 2239 hwna->up.na_flags |= NAF_HOST_RINGS; 2240 if (netmap_attach_common(&hwna->up)) { 2241 free(hwna, M_DEVBUF); 2242 goto fail; 2243 } 2244 netmap_adapter_get(&hwna->up); 2245 2246 #ifdef linux 2247 if (ifp->netdev_ops) { 2248 /* prepare a clone of the netdev ops */ 2249 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) 2250 hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; 2251 #else 2252 hwna->nm_ndo = *ifp->netdev_ops; 2253 #endif 2254 } 2255 hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; 2256 #endif /* linux */ 2257 2258 D("success for %s", NM_IFPNAME(ifp)); 2259 return 0; 2260 2261 fail: 2262 D("fail, arg %p ifp %p na %p", arg, ifp, hwna); 2263 netmap_detach(ifp); 2264 return (hwna ? EINVAL : ENOMEM); 2265 } 2266 2267 2268 void 2269 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) 2270 { 2271 if (!na) { 2272 return; 2273 } 2274 2275 refcount_acquire(&na->na_refcount); 2276 } 2277 2278 2279 /* returns 1 iff the netmap_adapter is destroyed */ 2280 int 2281 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) 2282 { 2283 if (!na) 2284 return 1; 2285 2286 if (!refcount_release(&na->na_refcount)) 2287 return 0; 2288 2289 if (na->nm_dtor) 2290 na->nm_dtor(na); 2291 2292 netmap_detach_common(na); 2293 2294 return 1; 2295 } 2296 2297 int 2298 netmap_hw_krings_create(struct netmap_adapter *na) 2299 { 2300 int ret = netmap_krings_create(na, 0); 2301 if (ret == 0) { 2302 /* initialize the mbq for the sw rx ring */ 2303 mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); 2304 ND("initialized sw rx queue %d", na->num_rx_rings); 2305 } 2306 return ret; 2307 } 2308 2309 2310 2311 /* 2312 * Free the allocated memory linked to the given ``netmap_adapter`` 2313 * object. 
2314 */ 2315 void 2316 netmap_detach(struct ifnet *ifp) 2317 { 2318 struct netmap_adapter *na = NA(ifp); 2319 2320 if (!na) 2321 return; 2322 2323 NMG_LOCK(); 2324 netmap_disable_all_rings(ifp); 2325 if (!netmap_adapter_put(na)) { 2326 /* someone is still using the adapter, 2327 * tell them that the interface is gone 2328 */ 2329 na->ifp = NULL; 2330 /* give them a chance to notice */ 2331 netmap_enable_all_rings(ifp); 2332 } 2333 NMG_UNLOCK(); 2334 } 2335 2336 2337 /* 2338 * Intercept packets from the network stack and pass them 2339 * to netmap as incoming packets on the 'software' ring. 2340 * 2341 * We only store packets in a bounded mbq and then copy them 2342 * in the relevant rxsync routine. 2343 * 2344 * We rely on the OS to make sure that the ifp and na do not go 2345 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 2346 * In nm_register() or whenever there is a reinitialization, 2347 * we make sure to make the mode change visible here. 2348 */ 2349 int 2350 netmap_transmit(struct ifnet *ifp, struct mbuf *m) 2351 { 2352 struct netmap_adapter *na = NA(ifp); 2353 struct netmap_kring *kring; 2354 u_int len = MBUF_LEN(m); 2355 u_int error = ENOBUFS; 2356 struct mbq *q; 2357 int space; 2358 2359 // XXX [Linux] we do not need this lock 2360 // if we follow the down/configure/up protocol -gl 2361 // mtx_lock(&na->core_lock); 2362 2363 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { 2364 D("%s not in netmap mode anymore", NM_IFPNAME(ifp)); 2365 error = ENXIO; 2366 goto done; 2367 } 2368 2369 kring = &na->rx_rings[na->num_rx_rings]; 2370 q = &kring->rx_queue; 2371 2372 // XXX reconsider long packets if we handle fragments 2373 if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ 2374 D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp), 2375 len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); 2376 goto done; 2377 } 2378 2379 /* protect against rxsync_from_host(), netmap_sw_to_nic() 2380 * and maybe other instances of netmap_transmit (the latter 2381 * not possible on Linux). 2382 * Also avoid overflowing the queue. 2383 */ 2384 mtx_lock(&q->lock); 2385 2386 space = kring->nr_hwtail - kring->nr_hwcur; 2387 if (space < 0) 2388 space += kring->nkr_num_slots; 2389 if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX 2390 RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", 2391 NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), 2392 len, m); 2393 } else { 2394 mbq_enqueue(q, m); 2395 ND(10, "%s %d bufs in queue len %d m %p", 2396 NM_IFPNAME(ifp), mbq_len(q), len, m); 2397 /* notify outside the lock */ 2398 m = NULL; 2399 error = 0; 2400 } 2401 mtx_unlock(&q->lock); 2402 2403 done: 2404 if (m) 2405 m_freem(m); 2406 /* unconditionally wake up listeners */ 2407 na->nm_notify(na, na->num_rx_rings, NR_RX, 0); 2408 2409 return (error); 2410 } 2411 2412 2413 /* 2414 * netmap_reset() is called by the driver routines when reinitializing 2415 * a ring. The driver is in charge of locking to protect the kring. 2416 * If native netmap mode is not set just return NULL. 
2417 */ 2418 struct netmap_slot * 2419 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, 2420 u_int new_cur) 2421 { 2422 struct netmap_kring *kring; 2423 int new_hwofs, lim; 2424 2425 if (na == NULL) { 2426 D("NULL na, should not happen"); 2427 return NULL; /* no netmap support here */ 2428 } 2429 if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { 2430 ND("interface not in netmap mode"); 2431 return NULL; /* nothing to reinitialize */ 2432 } 2433 2434 /* XXX note- in the new scheme, we are not guaranteed to be 2435 * under lock (e.g. when called on a device reset). 2436 * In this case, we should set a flag and do not trust too 2437 * much the values. In practice: TODO 2438 * - set a RESET flag somewhere in the kring 2439 * - do the processing in a conservative way 2440 * - let the *sync() fixup at the end. 2441 */ 2442 if (tx == NR_TX) { 2443 if (n >= na->num_tx_rings) 2444 return NULL; 2445 kring = na->tx_rings + n; 2446 // XXX check whether we should use hwcur or rcur 2447 new_hwofs = kring->nr_hwcur - new_cur; 2448 } else { 2449 if (n >= na->num_rx_rings) 2450 return NULL; 2451 kring = na->rx_rings + n; 2452 new_hwofs = kring->nr_hwtail - new_cur; 2453 } 2454 lim = kring->nkr_num_slots - 1; 2455 if (new_hwofs > lim) 2456 new_hwofs -= lim + 1; 2457 2458 /* Always set the new offset value and realign the ring. */ 2459 if (netmap_verbose) 2460 D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", 2461 NM_IFPNAME(na->ifp), 2462 tx == NR_TX ? "TX" : "RX", n, 2463 kring->nkr_hwofs, new_hwofs, 2464 kring->nr_hwtail, 2465 tx == NR_TX ? lim : kring->nr_hwtail); 2466 kring->nkr_hwofs = new_hwofs; 2467 if (tx == NR_TX) { 2468 kring->nr_hwtail = kring->nr_hwcur + lim; 2469 if (kring->nr_hwtail > lim) 2470 kring->nr_hwtail -= lim + 1; 2471 } 2472 2473 #if 0 // def linux 2474 /* XXX check that the mappings are correct */ 2475 /* need ring_nr, adapter->pdev, direction */ 2476 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 2477 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 2478 D("error mapping rx netmap buffer %d", i); 2479 // XXX fix error handling 2480 } 2481 2482 #endif /* linux */ 2483 /* 2484 * Wakeup on the individual and global selwait 2485 * We do the wakeup here, but the ring is not yet reconfigured. 2486 * However, we are under lock so there are no races. 2487 */ 2488 na->nm_notify(na, n, tx, 0); 2489 return kring->ring->slot; 2490 } 2491 2492 2493 /* 2494 * Dispatch rx/tx interrupts to the netmap rings. 2495 * 2496 * "work_done" is non-null on the RX path, NULL for the TX path. 2497 * We rely on the OS to make sure that there is only one active 2498 * instance per queue, and that there is appropriate locking. 2499 * 2500 * The 'notify' routine depends on what the ring is attached to. 2501 * - for a netmap file descriptor, do a selwakeup on the individual 2502 * waitqueue, plus one on the global one if needed 2503 * - for a switch, call the proper forwarding routine 2504 * - XXX more ? 2505 */ 2506 void 2507 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) 2508 { 2509 struct netmap_adapter *na = NA(ifp); 2510 struct netmap_kring *kring; 2511 2512 q &= NETMAP_RING_MASK; 2513 2514 if (netmap_verbose) { 2515 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); 2516 } 2517 2518 if (work_done) { /* RX path */ 2519 if (q >= na->num_rx_rings) 2520 return; // not a physical queue 2521 kring = na->rx_rings + q; 2522 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 
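		/* NKR_PENDINTR records that the NIC reported new packets;
		 * it is typically examined (and cleared) by the driver's
		 * next rxsync on this ring. The notify below then wakes up
		 * any thread sleeping on this ring, or on the global
		 * selinfo, see netmap_notify() above.
		 */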
2523 na->nm_notify(na, q, NR_RX, 0); 2524 *work_done = 1; /* do not fire napi again */ 2525 } else { /* TX path */ 2526 if (q >= na->num_tx_rings) 2527 return; // not a physical queue 2528 kring = na->tx_rings + q; 2529 na->nm_notify(na, q, NR_TX, 0); 2530 } 2531 } 2532 2533 2534 /* 2535 * Default functions to handle rx/tx interrupts from a physical device. 2536 * "work_done" is non-null on the RX path, NULL for the TX path. 2537 * 2538 * If the card is not in netmap mode, simply return 0, 2539 * so that the caller proceeds with regular processing. 2540 * Otherwise call netmap_common_irq() and return 1. 2541 * 2542 * If the card is connected to a netmap file descriptor, 2543 * do a selwakeup on the individual queue, plus one on the global one 2544 * if needed (multiqueue card _and_ there are multiqueue listeners), 2545 * and return 1. 2546 * 2547 * Finally, if called on rx from an interface connected to a switch, 2548 * calls the proper forwarding routine, and return 1. 2549 */ 2550 int 2551 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) 2552 { 2553 // XXX could we check NAF_NATIVE_ON ? 2554 if (!(ifp->if_capenable & IFCAP_NETMAP)) 2555 return 0; 2556 2557 if (NA(ifp)->na_flags & NAF_SKIP_INTR) { 2558 ND("use regular interrupt"); 2559 return 0; 2560 } 2561 2562 netmap_common_irq(ifp, q, work_done); 2563 return 1; 2564 } 2565 2566 2567 /* 2568 * Module loader and unloader 2569 * 2570 * netmap_init() creates the /dev/netmap device and initializes 2571 * all global variables. Returns 0 on success, errno on failure 2572 * (but there is no chance) 2573 * 2574 * netmap_fini() destroys everything. 2575 */ 2576 2577 static struct cdev *netmap_dev; /* /dev/netmap character device. */ 2578 extern struct cdevsw netmap_cdevsw; 2579 2580 2581 void 2582 netmap_fini(void) 2583 { 2584 // XXX destroy_bridges() ? 2585 if (netmap_dev) 2586 destroy_dev(netmap_dev); 2587 netmap_mem_fini(); 2588 NMG_LOCK_DESTROY(); 2589 printf("netmap: unloaded module.\n"); 2590 } 2591 2592 2593 int 2594 netmap_init(void) 2595 { 2596 int error; 2597 2598 NMG_LOCK_INIT(); 2599 2600 error = netmap_mem_init(); 2601 if (error != 0) 2602 goto fail; 2603 /* XXX could use make_dev_credv() to get error number */ 2604 netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, 2605 "netmap"); 2606 if (!netmap_dev) 2607 goto fail; 2608 2609 netmap_init_bridges(); 2610 printf("netmap: loaded module\n"); 2611 return (0); 2612 fail: 2613 netmap_fini(); 2614 return (EINVAL); /* may be incorrect */ 2615 } 2616
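
/*
 * The block below is an illustration only (hence the #if 0): a rough
 * sketch of how a hypothetical NIC driver could hook into the support
 * routines above, calling netmap_attach() at attach time and giving
 * netmap_rx_irq() the first shot at each rx interrupt. All foo_* names
 * are placeholders, and the exact set of adapter callbacks to fill in
 * (nm_txsync, nm_rxsync, nm_register, ...) should be checked against
 * struct netmap_adapter in netmap_kern.h; the real glue lives in the
 * per-driver netmap files.
 */
#if 0	/* example, not compiled */
static void
foo_netmap_attach(struct foo_softc *sc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;
	na.num_tx_desc = sc->num_tx_desc;
	na.num_rx_desc = sc->num_rx_desc;
	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
	na.nm_txsync = foo_netmap_txsync;	/* driver-specific sync routines */
	na.nm_rxsync = foo_netmap_rxsync;
	na.nm_register = foo_netmap_reg;	/* enter/leave netmap mode */
	netmap_attach(&na);	/* allocates and links the hw adapter, see above */
}

/* rx interrupt handler: let netmap consume the event first */
static void
foo_rxeof(struct foo_rxring *rxr)
{
	u_int work_done;

	if (netmap_rx_irq(rxr->sc->ifp, rxr->ring_nr, &work_done))
		return;		/* handled by netmap_common_irq() */
	/* ... regular mbuf processing otherwise ... */
}
#endif	/* example, not compiled */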
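
/*
 * For reference, this is roughly what the userspace side of the
 * NIOCREGIF / mmap / poll / NIOCTXSYNC handlers above looks like.
 * It is a minimal sketch only (error checking omitted), written
 * against the helper macros in net/netmap_user.h; the interface
 * name and payload are caller-supplied.
 */
#if 0	/* example, not compiled */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

static void
tx_one(const char *ifname, const char *payload, size_t len)
{
	struct nmreq req;
	struct netmap_if *nifp;
	struct netmap_ring *ring;
	struct pollfd pfd;
	char *mem;
	int fd;

	fd = open("/dev/netmap", O_RDWR);
	memset(&req, 0, sizeof(req));
	strlcpy(req.nr_name, ifname, sizeof(req.nr_name));
	req.nr_version = NETMAP_API;
	ioctl(fd, NIOCREGIF, &req);		/* bind fd to the interface */
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	nifp = NETMAP_IF(mem, req.nr_offset);	/* nr_offset set by NIOCREGIF */
	ring = NETMAP_TXRING(nifp, 0);

	pfd.fd = fd;
	pfd.events = POLLOUT;
	poll(&pfd, 1, -1);			/* serviced by netmap_poll() */
	if (!nm_ring_empty(ring)) {
		u_int i = ring->cur;

		memcpy(NETMAP_BUF(ring, ring->slot[i].buf_idx), payload, len);
		ring->slot[i].len = len;
		ring->head = ring->cur = nm_ring_next(ring, i);
		ioctl(fd, NIOCTXSYNC, NULL);	/* NIOCTXSYNC case above */
	}
}
#endif	/* example, not compiled */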