1 /* 2 * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26 27 /* 28 * $FreeBSD$ 29 * 30 * This module supports memory mapped access to network devices, 31 * see netmap(4). 32 * 33 * The module uses a large memory pool allocated by the kernel 34 * and accessible as mmapped memory by multiple userspace threads/processes. 35 * The memory pool contains packet buffers and "netmap rings", 36 * i.e. user-accessible copies of the interface's queues. 37 * 38 * Access to the network card works like this: 39 * 1. a process/thread issues one or more open() on /dev/netmap, to create 40 * a select()able file descriptor on which events are reported. 41 * 2. on each descriptor, the process issues an ioctl() to identify 42 * the interface that should report events to the file descriptor. 43 * 3. on each descriptor, the process issues an mmap() request to 44 * map the shared memory region within the process' address space. 45 * The list of interesting queues is indicated by a location in 46 * the shared memory region. 47 * 4. using the functions in the netmap(4) userspace API, a process 48 * can look up the occupation state of a queue, access memory buffers, 49 * and retrieve received packets or enqueue packets to transmit. 50 * 5. using some ioctl()s the process can synchronize the userspace view 51 * of the queue with the actual status in the kernel. This includes both 52 * receiving the notification of new packets, and transmitting new 53 * packets on the output interface. 54 * 6. select() or poll() can be used to wait for events on individual 55 * transmit or receive queues (or all queues for a given interface). 56 * 57 58 SYNCHRONIZATION (USER) 59 60 The netmap rings and data structures may be shared among multiple 61 user threads or even independent processes. 62 Any synchronization among those threads/processes is delegated 63 to the threads themselves. Only one thread at a time can be in 64 a system call on the same netmap ring. The OS does not enforce 65 this and only guarantees against system crashes in case of 66 invalid usage.
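As an informal illustration of steps 1-6 above, a minimal userspace
sketch (not part of this module) might look as follows. Error handling
and most #includes are omitted; "em0" is just an example interface
name, and the NETMAP_IF / NETMAP_TXRING helpers come from
net/netmap_user.h:

	struct nmreq req = { .nr_version = NETMAP_API };
	struct netmap_if *nifp;
	struct netmap_ring *txring;
	struct pollfd pfd;
	char *mem;

	int fd = open("/dev/netmap", O_RDWR);		// 1. create a descriptor
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	ioctl(fd, NIOCREGIF, &req);			// 2. bind it to an interface
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);				// 3. map the shared region
	nifp = NETMAP_IF(mem, req.nr_offset);
	txring = NETMAP_TXRING(nifp, 0);		// 4. reach rings and buffers
	// ... fill at most txring->avail slots starting at txring->cur ...
	ioctl(fd, NIOCTXSYNC, NULL);			// 5. sync with the kernel
	pfd.fd = fd;
	pfd.events = POLLOUT;
	poll(&pfd, 1, -1);				// 6. wait for free tx slots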
67 68 LOCKING (INTERNAL) 69 70 Within the kernel, access to the netmap rings is protected as follows: 71 72 - a spinlock on each ring, to handle producer/consumer races on 73 RX rings attached to the host stack (against multiple host 74 threads writing from the host stack to the same ring), 75 and on 'destination' rings attached to a VALE switch 76 (i.e. RX rings in VALE ports, and TX rings in NIC/host ports), 77 protecting multiple active senders for the same destination. 78 79 - an atomic variable to guarantee that there is at most one 80 instance of *_*xsync() on the ring at any time. 81 For rings connected to user file 82 descriptors, an atomic_test_and_set() protects this, and the 83 lock on the ring is not actually used. 84 For NIC RX rings connected to a VALE switch, an atomic_test_and_set() 85 is also used to prevent multiple executions (the driver might indeed 86 already guarantee this). 87 For NIC TX rings connected to a VALE switch, the lock arbitrates 88 access to the queue (both when allocating buffers and when pushing 89 them out). 90 91 - *xsync() should be protected against initializations of the card. 92 On FreeBSD most devices have the reset routine protected by 93 a RING lock (ixgbe, igb, em) or core lock (re). lem is missing 94 the RING protection on rx_reset(); this should be added. 95 96 On linux there is an external lock on the tx path, which probably 97 also arbitrates access to the reset routine. XXX to be revised 98 99 - a per-interface core_lock protecting access from the host stack 100 while the interface is being detached from netmap mode. 101 XXX there should be no need for this lock if we detach the interfaces 102 only while they are down. 103 104 105 --- VALE SWITCH --- 106 107 NMG_LOCK() serializes all modifications to switches and ports. 108 A switch cannot be deleted until all ports are gone. 109 110 For each switch, an SX lock (RWlock on linux) protects 111 deletion of ports. When configuring a new port or deleting a port, the 112 lock is acquired in exclusive mode (after holding NMG_LOCK). 113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK). 114 The lock is held throughout the entire forwarding cycle, 115 during which the thread may incur a page fault. 116 Hence it is important that sleepable shared locks are used. 117 118 On the rx ring, the per-port lock is grabbed initially to reserve 119 a number of slots in the ring, then the lock is released, 120 packets are copied from source to destination, and then 121 the lock is acquired again and the receive ring is updated. 122 (A similar thing is done on the tx ring for NIC and host stack 123 ports attached to the switch.) 124 125 */ 126 127 /* 128 * OS-specific code that is used only within this file.
129 * Other OS-specific code that must be accessed by drivers 130 * is present in netmap_kern.h 131 */ 132 133 #if defined(__FreeBSD__) 134 #include <sys/cdefs.h> /* prerequisite */ 135 #include <sys/types.h> 136 #include <sys/errno.h> 137 #include <sys/param.h> /* defines used in kernel.h */ 138 #include <sys/kernel.h> /* types used in module initialization */ 139 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 140 #include <sys/sockio.h> 141 #include <sys/socketvar.h> /* struct socket */ 142 #include <sys/malloc.h> 143 #include <sys/poll.h> 144 #include <sys/rwlock.h> 145 #include <sys/socket.h> /* sockaddrs */ 146 #include <sys/selinfo.h> 147 #include <sys/sysctl.h> 148 #include <net/if.h> 149 #include <net/if_var.h> 150 #include <net/bpf.h> /* BIOCIMMEDIATE */ 151 #include <machine/bus.h> /* bus_dmamap_* */ 152 #include <sys/endian.h> 153 #include <sys/refcount.h> 154 155 156 /* reduce conditional code */ 157 #define init_waitqueue_head(x) // only needed in linux 158 159 160 161 #elif defined(linux) 162 163 #include "bsd_glue.h" 164 165 166 167 #elif defined(__APPLE__) 168 169 #warning OSX support is only partial 170 #include "osx_glue.h" 171 172 #else 173 174 #error Unsupported platform 175 176 #endif /* unsupported */ 177 178 /* 179 * common headers 180 */ 181 #include <net/netmap.h> 182 #include <dev/netmap/netmap_kern.h> 183 #include <dev/netmap/netmap_mem2.h> 184 185 186 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 187 188 /* 189 * The following variables are used by the drivers and replicate 190 * fields in the global memory pool. They only refer to buffers 191 * used by physical interfaces. 192 */ 193 u_int netmap_total_buffers; 194 u_int netmap_buf_size; 195 char *netmap_buffer_base; /* also address of an invalid buffer */ 196 197 /* user-controlled variables */ 198 int netmap_verbose; 199 200 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 201 202 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 203 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 204 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 205 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 206 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 207 int netmap_mitigate = 1; 208 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 209 int netmap_no_pendintr = 1; 210 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 211 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 212 int netmap_txsync_retry = 2; 213 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 214 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 215 216 int netmap_flags = 0; /* debug flags */ 217 int netmap_fwd = 0; /* force transparent mode */ 218 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 219 220 /* 221 * netmap_admode selects the netmap mode to use. 222 * Invalid values are reset to NETMAP_ADMODE_BEST 223 */ 224 enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ 225 NETMAP_ADMODE_NATIVE, /* either native or none */ 226 NETMAP_ADMODE_GENERIC, /* force generic */ 227 NETMAP_ADMODE_LAST }; 228 #define NETMAP_ADMODE_NATIVE 1 /* Force native netmap adapter. */ 229 #define NETMAP_ADMODE_GENERIC 2 /* Force generic netmap adapter. */ 230 #define NETMAP_ADMODE_BEST 0 /* Priority to native netmap adapter. */ 231 static int netmap_admode = NETMAP_ADMODE_BEST; 232 233 int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. 
*/ 234 int netmap_generic_ringsize = 1024; /* Generic ringsize. */ 235 236 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 237 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); 238 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); 239 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); 240 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); 241 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); 242 243 NMG_LOCK_T netmap_global_lock; 244 245 246 static void 247 nm_kr_get(struct netmap_kring *kr) 248 { 249 while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) 250 tsleep(kr, 0, "NM_KR_GET", 4); 251 } 252 253 254 void 255 netmap_disable_ring(struct netmap_kring *kr) 256 { 257 kr->nkr_stopped = 1; 258 nm_kr_get(kr); 259 mtx_lock(&kr->q_lock); 260 mtx_unlock(&kr->q_lock); 261 nm_kr_put(kr); 262 } 263 264 265 static void 266 netmap_set_all_rings(struct ifnet *ifp, int stopped) 267 { 268 struct netmap_adapter *na; 269 int i; 270 271 if (!(ifp->if_capenable & IFCAP_NETMAP)) 272 return; 273 274 na = NA(ifp); 275 276 for (i = 0; i <= na->num_tx_rings; i++) { 277 if (stopped) 278 netmap_disable_ring(na->tx_rings + i); 279 else 280 na->tx_rings[i].nkr_stopped = 0; 281 na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY | 282 (i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0)); 283 } 284 285 for (i = 0; i <= na->num_rx_rings; i++) { 286 if (stopped) 287 netmap_disable_ring(na->rx_rings + i); 288 else 289 na->rx_rings[i].nkr_stopped = 0; 290 na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY | 291 (i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0)); 292 } 293 } 294 295 296 void 297 netmap_disable_all_rings(struct ifnet *ifp) 298 { 299 netmap_set_all_rings(ifp, 1 /* stopped */); 300 } 301 302 303 void 304 netmap_enable_all_rings(struct ifnet *ifp) 305 { 306 netmap_set_all_rings(ifp, 0 /* enabled */); 307 } 308 309 310 /* 311 * generic bound_checking function 312 */ 313 u_int 314 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 315 { 316 u_int oldv = *v; 317 const char *op = NULL; 318 319 if (dflt < lo) 320 dflt = lo; 321 if (dflt > hi) 322 dflt = hi; 323 if (oldv < lo) { 324 *v = dflt; 325 op = "Bump"; 326 } else if (oldv > hi) { 327 *v = hi; 328 op = "Clamp"; 329 } 330 if (op && msg) 331 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 332 return *v; 333 } 334 335 336 /* 337 * packet-dump function, user-supplied or static buffer. 338 * The destination buffer must be at least 30+4*len 339 */ 340 const char * 341 nm_dump_buf(char *p, int len, int lim, char *dst) 342 { 343 static char _dst[8192]; 344 int i, j, i0; 345 static char hex[] ="0123456789abcdef"; 346 char *o; /* output position */ 347 348 #define P_HI(x) hex[((x) & 0xf0)>>4] 349 #define P_LO(x) hex[((x) & 0xf)] 350 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? 
(x) : '.') 351 if (!dst) 352 dst = _dst; 353 if (lim <= 0 || lim > len) 354 lim = len; 355 o = dst; 356 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 357 o += strlen(o); 358 /* hexdump routine */ 359 for (i = 0; i < lim; ) { 360 sprintf(o, "%5d: ", i); 361 o += strlen(o); 362 memset(o, ' ', 48); 363 i0 = i; 364 for (j=0; j < 16 && i < lim; i++, j++) { 365 o[j*3] = P_HI(p[i]); 366 o[j*3+1] = P_LO(p[i]); 367 } 368 i = i0; 369 for (j=0; j < 16 && i < lim; i++, j++) 370 o[j + 48] = P_C(p[i]); 371 o[j+48] = '\n'; 372 o += j+49; 373 } 374 *o = '\0'; 375 #undef P_HI 376 #undef P_LO 377 #undef P_C 378 return dst; 379 } 380 381 382 383 /* 384 * Fetch configuration from the device, to cope with dynamic 385 * reconfigurations after loading the module. 386 */ 387 int 388 netmap_update_config(struct netmap_adapter *na) 389 { 390 struct ifnet *ifp = na->ifp; 391 u_int txr, txd, rxr, rxd; 392 393 txr = txd = rxr = rxd = 0; 394 if (na->nm_config) { 395 na->nm_config(na, &txr, &txd, &rxr, &rxd); 396 } else { 397 /* take whatever we had at init time */ 398 txr = na->num_tx_rings; 399 txd = na->num_tx_desc; 400 rxr = na->num_rx_rings; 401 rxd = na->num_rx_desc; 402 } 403 404 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 405 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 406 return 0; /* nothing changed */ 407 if (netmap_verbose || na->active_fds > 0) { 408 D("stored config %s: txring %d x %d, rxring %d x %d", 409 NM_IFPNAME(ifp), 410 na->num_tx_rings, na->num_tx_desc, 411 na->num_rx_rings, na->num_rx_desc); 412 D("new config %s: txring %d x %d, rxring %d x %d", 413 NM_IFPNAME(ifp), txr, txd, rxr, rxd); 414 } 415 if (na->active_fds == 0) { 416 D("configuration changed (but fine)"); 417 na->num_tx_rings = txr; 418 na->num_tx_desc = txd; 419 na->num_rx_rings = rxr; 420 na->num_rx_desc = rxd; 421 return 0; 422 } 423 D("configuration changed while active, this is bad..."); 424 return 1; 425 } 426 427 428 int 429 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom) 430 { 431 u_int i, len, ndesc; 432 struct netmap_kring *kring; 433 434 len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; 435 436 na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); 437 if (na->tx_rings == NULL) { 438 D("Cannot allocate krings"); 439 return ENOMEM; 440 } 441 na->rx_rings = na->tx_rings + ntx; 442 443 ndesc = na->num_tx_desc; 444 for (i = 0; i < ntx; i++) { /* Transmit rings */ 445 kring = &na->tx_rings[i]; 446 bzero(kring, sizeof(*kring)); 447 kring->na = na; 448 kring->nkr_num_slots = ndesc; 449 /* 450 * IMPORTANT: 451 * Always keep one slot empty, so we can detect new 452 * transmissions comparing cur and nr_hwcur (they are 453 * the same only if there are no new transmissions). 
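 * (Worked example, with an arbitrary size not taken from the code:
 * if ndesc == 256, at most 255 slots can hold pending packets;
 * cur == nr_hwcur means there are no new packets to transmit, and
 * the ring is completely full when cur is the slot just before
 * nr_hwcur. This is why nr_hwavail is initialized to ndesc - 1
 * right below.)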
454 */ 455 kring->nr_hwavail = ndesc - 1; 456 mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); 457 init_waitqueue_head(&kring->si); 458 } 459 460 ndesc = na->num_rx_desc; 461 for (i = 0; i < nrx; i++) { /* Receive rings */ 462 kring = &na->rx_rings[i]; 463 bzero(kring, sizeof(*kring)); 464 kring->na = na; 465 kring->nkr_num_slots = ndesc; 466 mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); 467 init_waitqueue_head(&kring->si); 468 } 469 init_waitqueue_head(&na->tx_si); 470 init_waitqueue_head(&na->rx_si); 471 472 na->tailroom = na->rx_rings + nrx; 473 474 return 0; 475 476 } 477 478 479 void 480 netmap_krings_delete(struct netmap_adapter *na) 481 { 482 int i; 483 484 for (i = 0; i < na->num_tx_rings + 1; i++) { 485 mtx_destroy(&na->tx_rings[i].q_lock); 486 } 487 for (i = 0; i < na->num_rx_rings + 1; i++) { 488 mtx_destroy(&na->rx_rings[i].q_lock); 489 } 490 free(na->tx_rings, M_DEVBUF); 491 na->tx_rings = na->rx_rings = na->tailroom = NULL; 492 } 493 494 495 static struct netmap_if* 496 netmap_if_new(const char *ifname, struct netmap_adapter *na) 497 { 498 struct netmap_if *nifp; 499 500 if (netmap_update_config(na)) { 501 /* configuration mismatch, report and fail */ 502 return NULL; 503 } 504 505 if (na->active_fds) 506 goto final; 507 508 if (na->nm_krings_create(na)) 509 goto cleanup; 510 511 if (netmap_mem_rings_create(na)) 512 goto cleanup; 513 514 final: 515 516 nifp = netmap_mem_if_new(ifname, na); 517 if (nifp == NULL) 518 goto cleanup; 519 520 return (nifp); 521 522 cleanup: 523 524 if (na->active_fds == 0) { 525 netmap_mem_rings_delete(na); 526 na->nm_krings_delete(na); 527 } 528 529 return NULL; 530 } 531 532 533 /* grab a reference to the memory allocator, if we don't have one already. The 534 * reference is taken from the netmap_adapter registered with the priv. 535 * 536 */ 537 static int 538 netmap_get_memory_locked(struct netmap_priv_d* p) 539 { 540 struct netmap_mem_d *nmd; 541 int error = 0; 542 543 if (p->np_na == NULL) { 544 if (!netmap_mmap_unreg) 545 return ENODEV; 546 /* for compatibility with older versions of the API 547 * we use the global allocator when no interface has been 548 * registered 549 */ 550 nmd = &nm_mem; 551 } else { 552 nmd = p->np_na->nm_mem; 553 } 554 if (p->np_mref == NULL) { 555 error = netmap_mem_finalize(nmd); 556 if (!error) 557 p->np_mref = nmd; 558 } else if (p->np_mref != nmd) { 559 /* a virtual port has been registered, but previous 560 * syscalls already used the global allocator. 561 * We cannot continue 562 */ 563 error = ENODEV; 564 } 565 return error; 566 } 567 568 569 int 570 netmap_get_memory(struct netmap_priv_d* p) 571 { 572 int error; 573 NMG_LOCK(); 574 error = netmap_get_memory_locked(p); 575 NMG_UNLOCK(); 576 return error; 577 } 578 579 580 static int 581 netmap_have_memory_locked(struct netmap_priv_d* p) 582 { 583 return p->np_mref != NULL; 584 } 585 586 587 static void 588 netmap_drop_memory_locked(struct netmap_priv_d* p) 589 { 590 if (p->np_mref) { 591 netmap_mem_deref(p->np_mref); 592 p->np_mref = NULL; 593 } 594 } 595 596 597 /* 598 * File descriptor's private data destructor. 599 * 600 * Call nm_register(ifp,0) to stop netmap mode on the interface and 601 * revert to normal operation. We expect that np_na->ifp has not gone. 602 * The second argument is the nifp to work on. In some cases it is 603 * not attached yet to the netmap_priv_d so we need to pass it as 604 * a separate argument. 
605 */ 606 /* call with NMG_LOCK held */ 607 static void 608 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 609 { 610 struct netmap_adapter *na = priv->np_na; 611 struct ifnet *ifp = na->ifp; 612 613 NMG_LOCK_ASSERT(); 614 na->active_fds--; 615 if (na->active_fds <= 0) { /* last instance */ 616 617 if (netmap_verbose) 618 D("deleting last instance for %s", NM_IFPNAME(ifp)); 619 /* 620 * (TO CHECK) This function is only called 621 * when the last reference to this file descriptor goes 622 * away. This means we cannot have any pending poll() 623 * or interrupt routine operating on the structure. 624 * XXX The file may be closed in a thread while 625 * another thread is using it. 626 * Linux keeps the file opened until the last reference 627 * by any outstanding ioctl/poll or mmap is gone. 628 * FreeBSD does not track mmap()s (but we do) and 629 * wakes up any sleeping poll(). Need to check what 630 * happens if the close() occurs while a concurrent 631 * syscall is running. 632 */ 633 if (ifp) 634 na->nm_register(na, 0); /* off, clear flags */ 635 /* Wake up any sleeping threads. netmap_poll will 636 * then return POLLERR 637 * XXX The wake up now must happen during *_down(), when 638 * we order all activities to stop. -gl 639 */ 640 /* XXX kqueue(9) needed; these will mirror knlist_init. */ 641 /* knlist_destroy(&na->tx_si.si_note); */ 642 /* knlist_destroy(&na->rx_si.si_note); */ 643 644 /* delete rings and buffers */ 645 netmap_mem_rings_delete(na); 646 na->nm_krings_delete(na); 647 } 648 /* delete the nifp */ 649 netmap_mem_if_delete(na, nifp); 650 } 651 652 653 /* 654 * returns 1 if this is the last instance and we can free priv 655 */ 656 int 657 netmap_dtor_locked(struct netmap_priv_d *priv) 658 { 659 struct netmap_adapter *na = priv->np_na; 660 661 #ifdef __FreeBSD__ 662 /* 663 * np_refcount is the number of active mmaps on 664 * this file descriptor 665 */ 666 if (--priv->np_refcount > 0) { 667 return 0; 668 } 669 #endif /* __FreeBSD__ */ 670 if (!na) { 671 return 1; //XXX is it correct? 672 } 673 netmap_do_unregif(priv, priv->np_nifp); 674 priv->np_nifp = NULL; 675 netmap_drop_memory_locked(priv); 676 if (priv->np_na) { 677 netmap_adapter_put(na); 678 priv->np_na = NULL; 679 } 680 return 1; 681 } 682 683 684 void 685 netmap_dtor(void *data) 686 { 687 struct netmap_priv_d *priv = data; 688 int last_instance; 689 690 NMG_LOCK(); 691 last_instance = netmap_dtor_locked(priv); 692 NMG_UNLOCK(); 693 if (last_instance) { 694 bzero(priv, sizeof(*priv)); /* for safety */ 695 free(priv, M_DEVBUF); 696 } 697 } 698 699 700 701 702 /* 703 * Handlers for synchronization of the queues from/to the host. 704 * Netmap has two operating modes: 705 * - in the default mode, the rings connected to the host stack are 706 * just another ring pair managed by userspace; 707 * - in transparent mode (XXX to be defined) incoming packets 708 * (from the host or the NIC) are marked as NS_FORWARD upon 709 * arrival, and the user application has a chance to reset the 710 * flag for packets that should be dropped. 711 * On the RXSYNC or poll(), packets in RX rings between 712 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 713 * to the other side. 714 * The transfer NIC --> host is relatively easy, just encapsulate 715 * into mbufs and we are done. The host --> NIC side is slightly 716 * harder because there might not be room in the tx ring so it 717 * might take a while before releasing the buffer. 
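 *
 * As a sketch of the user-side loop implied by the description above
 * (is_interesting() is a made-up predicate; NETMAP_BUF() and
 * NETMAP_RING_NEXT() are the usual netmap_user.h helpers), an
 * application scanning an RX ring in transparent mode might do:
 *
 *	for (; ring->avail > 0; ring->avail--,
 *	    ring->cur = NETMAP_RING_NEXT(ring, ring->cur)) {
 *		struct netmap_slot *slot = &ring->slot[ring->cur];
 *		char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *
 *		if (is_interesting(buf, slot->len))
 *			slot->flags &= ~NS_FORWARD;	// consume it here
 *		// else leave NS_FORWARD set: the next rxsync/poll will
 *		// pass the packet to the other side
 *	}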
718 */ 719 720 721 /* 722 * pass a chain of buffers to the host stack as coming from 'dst' 723 */ 724 static void 725 netmap_send_up(struct ifnet *dst, struct mbq *q) 726 { 727 struct mbuf *m; 728 729 /* send packets up, outside the lock */ 730 while ((m = mbq_dequeue(q)) != NULL) { 731 if (netmap_verbose & NM_VERB_HOST) 732 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 733 NM_SEND_UP(dst, m); 734 } 735 mbq_destroy(q); 736 } 737 738 739 /* 740 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 741 * Run from hwcur to cur - reserved 742 */ 743 static void 744 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 745 { 746 /* Take packets from hwcur to cur-reserved and pass them up. 747 * In case of no buffers we give up. At the end of the loop, 748 * the queue is drained in all cases. 749 * XXX handle reserved 750 */ 751 u_int lim = kring->nkr_num_slots - 1; 752 struct mbuf *m; 753 u_int k = kring->ring->cur, n = kring->ring->reserved; 754 struct netmap_adapter *na = kring->na; 755 756 /* compute the final position, ring->cur - ring->reserved */ 757 if (n > 0) { 758 if (k < n) 759 k += kring->nkr_num_slots; 760 k += n; 761 } 762 for (n = kring->nr_hwcur; n != k;) { 763 struct netmap_slot *slot = &kring->ring->slot[n]; 764 765 n = nm_next(n, lim); 766 if ((slot->flags & NS_FORWARD) == 0 && !force) 767 continue; 768 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { 769 D("bad pkt at %d len %d", n, slot->len); 770 continue; 771 } 772 slot->flags &= ~NS_FORWARD; // XXX needed ? 773 /* XXX adapt to the case of a multisegment packet */ 774 m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); 775 776 if (m == NULL) 777 break; 778 mbq_enqueue(q, m); 779 } 780 } 781 782 783 /* 784 * The host ring has packets from nr_hwcur to (cur - reserved) 785 * to be sent down to the NIC. 786 * We need to use the queue lock on the source (host RX ring) 787 * to protect against netmap_transmit. 788 * If the user is well behaved we do not need to acquire locks 789 * on the destination(s), 790 * so we only need to make sure that there are no panics because 791 * of user errors. 792 * XXX verify 793 * 794 * We scan the tx rings, which have just been 795 * flushed so nr_hwcur == cur. Pushing packets down means 796 * increment cur and decrement avail. 
797 * XXX to be verified 798 */ 799 static void 800 netmap_sw_to_nic(struct netmap_adapter *na) 801 { 802 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 803 struct netmap_kring *k1 = &na->tx_rings[0]; 804 u_int i, howmany, src_lim, dst_lim; 805 806 /* XXX we should also check that the carrier is on */ 807 if (kring->nkr_stopped) 808 return; 809 810 mtx_lock(&kring->q_lock); 811 812 if (kring->nkr_stopped) 813 goto out; 814 815 howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ 816 817 src_lim = kring->nkr_num_slots - 1; 818 for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { 819 ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); 820 dst_lim = k1->nkr_num_slots - 1; 821 while (howmany > 0 && k1->ring->avail > 0) { 822 struct netmap_slot *src, *dst, tmp; 823 src = &kring->ring->slot[kring->nr_hwcur]; 824 dst = &k1->ring->slot[k1->ring->cur]; 825 tmp = *src; 826 src->buf_idx = dst->buf_idx; 827 src->flags = NS_BUF_CHANGED; 828 829 dst->buf_idx = tmp.buf_idx; 830 dst->len = tmp.len; 831 dst->flags = NS_BUF_CHANGED; 832 ND("out len %d buf %d from %d to %d", 833 dst->len, dst->buf_idx, 834 kring->nr_hwcur, k1->ring->cur); 835 836 kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); 837 howmany--; 838 kring->nr_hwavail--; 839 k1->ring->cur = nm_next(k1->ring->cur, dst_lim); 840 k1->ring->avail--; 841 } 842 kring->ring->cur = kring->nr_hwcur; // XXX 843 k1++; // XXX why? 844 } 845 out: 846 mtx_unlock(&kring->q_lock); 847 } 848 849 850 /* 851 * netmap_txsync_to_host() passes packets up. We are called from a 852 * system call in user process context, and the only contention 853 * can be among multiple user threads erroneously calling 854 * this routine concurrently. 855 */ 856 void 857 netmap_txsync_to_host(struct netmap_adapter *na) 858 { 859 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 860 struct netmap_ring *ring = kring->ring; 861 u_int k, lim = kring->nkr_num_slots - 1; 862 struct mbq q; 863 int error; 864 865 error = nm_kr_tryget(kring); 866 if (error) { 867 if (error == NM_KR_BUSY) 868 D("ring %p busy (user error)", kring); 869 return; 870 } 871 k = ring->cur; 872 if (k > lim) { 873 D("invalid ring index in stack TX kring %p", kring); 874 netmap_ring_reinit(kring); 875 nm_kr_put(kring); 876 return; 877 } 878 879 /* Take packets from hwcur to cur and pass them up. 880 * In case of no buffers we give up. At the end of the loop, 881 * the queue is drained in all cases. 882 */ 883 mbq_init(&q); 884 netmap_grab_packets(kring, &q, 1); 885 kring->nr_hwcur = k; 886 kring->nr_hwavail = ring->avail = lim; 887 888 nm_kr_put(kring); 889 netmap_send_up(na->ifp, &q); 890 } 891 892 893 /* 894 * rxsync backend for packets coming from the host stack. 895 * They have been put in the queue by netmap_transmit() so we 896 * need to protect access to the kring using a lock. 897 * 898 * This routine also does the selrecord if called from the poll handler 899 * (we know because td != NULL). 900 * 901 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 902 * as an additional hidden argument. 
903 */ 904 static void 905 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 906 { 907 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 908 struct netmap_ring *ring = kring->ring; 909 u_int j, n, lim = kring->nkr_num_slots; 910 u_int k = ring->cur, resvd = ring->reserved; 911 912 (void)pwait; /* disable unused warnings */ 913 914 if (kring->nkr_stopped) /* check a first time without lock */ 915 return; 916 917 mtx_lock(&kring->q_lock); 918 919 if (kring->nkr_stopped) /* check again with lock held */ 920 goto unlock_out; 921 922 if (k >= lim) { 923 netmap_ring_reinit(kring); 924 goto unlock_out; 925 } 926 /* new packets are already set in nr_hwavail */ 927 /* skip past packets that userspace has released */ 928 j = kring->nr_hwcur; 929 if (resvd > 0) { 930 if (resvd + ring->avail >= lim + 1) { 931 D("XXX invalid reserve/avail %d %d", resvd, ring->avail); 932 ring->reserved = resvd = 0; // XXX panic... 933 } 934 k = (k >= resvd) ? k - resvd : k + lim - resvd; 935 } 936 if (j != k) { 937 n = k >= j ? k - j : k + lim - j; 938 kring->nr_hwavail -= n; 939 kring->nr_hwcur = k; 940 } 941 k = ring->avail = kring->nr_hwavail - resvd; 942 if (k == 0 && td) 943 selrecord(td, &kring->si); 944 if (k && (netmap_verbose & NM_VERB_HOST)) 945 D("%d pkts from stack", k); 946 unlock_out: 947 948 mtx_unlock(&kring->q_lock); 949 } 950 951 952 /* Get a netmap adapter for the port. 953 * 954 * If it is possible to satisfy the request, return 0 955 * with *na containing the netmap adapter found. 956 * Otherwise return an error code, with *na containing NULL. 957 * 958 * When the port is attached to a bridge, we always return 959 * EBUSY. 960 * Otherwise, if the port is already bound to a file descriptor, 961 * then we unconditionally return the existing adapter into *na. 962 * In all the other cases, we return (into *na) either native, 963 * generic or NULL, according to the following table: 964 * 965 * native_support 966 * active_fds dev.netmap.admode YES NO 967 * ------------------------------------------------------- 968 * >0 * NA(ifp) NA(ifp) 969 * 970 * 0 NETMAP_ADMODE_BEST NATIVE GENERIC 971 * 0 NETMAP_ADMODE_NATIVE NATIVE NULL 972 * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC 973 * 974 */ 975 976 int 977 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) 978 { 979 /* generic support */ 980 int i = netmap_admode; /* Take a snapshot. */ 981 int error = 0; 982 struct netmap_adapter *prev_na; 983 struct netmap_generic_adapter *gna; 984 985 *na = NULL; /* default */ 986 987 /* reset in case of invalid value */ 988 if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) 989 i = netmap_admode = NETMAP_ADMODE_BEST; 990 991 if (NETMAP_CAPABLE(ifp)) { 992 /* If an adapter already exists, but is 993 * attached to a vale port, we report that the 994 * port is busy. 995 */ 996 if (NETMAP_OWNED_BY_KERN(NA(ifp))) 997 return EBUSY; 998 999 /* If an adapter already exists, return it if 1000 * there are active file descriptors or if 1001 * netmap is not forced to use generic 1002 * adapters. 1003 */ 1004 if (NA(ifp)->active_fds > 0 || 1005 i != NETMAP_ADMODE_GENERIC) { 1006 *na = NA(ifp); 1007 return 0; 1008 } 1009 } 1010 1011 /* If there isn't native support and netmap is not allowed 1012 * to use generic adapters, we cannot satisfy the request. 1013 */ 1014 if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) 1015 return EINVAL; 1016 1017 /* Otherwise, create a generic adapter and return it, 1018 * saving the previously used netmap adapter, if any. 
1019 * 1020 * Note that here 'prev_na', if not NULL, MUST be a 1021 * native adapter, and CANNOT be a generic one. This is 1022 * true because generic adapters are created on demand, and 1023 * destroyed when not used anymore. Therefore, if the adapter 1024 * currently attached to an interface 'ifp' is generic, it 1025 * must be that 1026 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). 1027 * Consequently, if NA(ifp) is generic, we will enter one of 1028 * the branches above. This ensures that we never override 1029 * a generic adapter with another generic adapter. 1030 */ 1031 prev_na = NA(ifp); 1032 error = generic_netmap_attach(ifp); 1033 if (error) 1034 return error; 1035 1036 *na = NA(ifp); 1037 gna = (struct netmap_generic_adapter*)NA(ifp); 1038 gna->prev = prev_na; /* save old na */ 1039 if (prev_na != NULL) { 1040 ifunit_ref(ifp->if_xname); 1041 // XXX add a refcount ? 1042 netmap_adapter_get(prev_na); 1043 } 1044 D("Created generic NA %p (prev %p)", gna, gna->prev); 1045 1046 return 0; 1047 } 1048 1049 1050 /* 1051 * MUST BE CALLED UNDER NMG_LOCK() 1052 * 1053 * get a refcounted reference to an interface. 1054 * This is always called in the execution of an ioctl(). 1055 * 1056 * Return ENXIO if the interface does not exist, EINVAL if netmap 1057 * is not supported by the interface. 1058 * If successful, hold a reference. 1059 * 1060 * When the NIC is attached to a bridge, reference is managed 1061 * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as 1062 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC 1063 * is detached from the bridge, then ifp's refcount is dropped (this 1064 * is equivalent to that ifp is destroyed in case of virtual ports. 1065 * 1066 * This function uses if_rele() when we want to prevent the NIC from 1067 * being detached from the bridge in error handling. But once refcount 1068 * is acquired by this function, it must be released using nm_if_rele(). 1069 */ 1070 int 1071 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) 1072 { 1073 struct ifnet *ifp; 1074 int error = 0; 1075 struct netmap_adapter *ret; 1076 1077 *na = NULL; /* default return value */ 1078 1079 /* first try to see if this is a bridge port. */ 1080 NMG_LOCK_ASSERT(); 1081 1082 error = netmap_get_bdg_na(nmr, na, create); 1083 if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */ 1084 return error; 1085 1086 ifp = ifunit_ref(nmr->nr_name); 1087 if (ifp == NULL) { 1088 return ENXIO; 1089 } 1090 1091 error = netmap_get_hw_na(ifp, &ret); 1092 if (error) 1093 goto out; 1094 1095 if (ret != NULL) { 1096 /* Users cannot use the NIC attached to a bridge directly */ 1097 if (NETMAP_OWNED_BY_KERN(ret)) { 1098 error = EINVAL; 1099 goto out; 1100 } 1101 error = 0; 1102 *na = ret; 1103 netmap_adapter_get(ret); 1104 } 1105 out: 1106 if_rele(ifp); 1107 1108 return error; 1109 } 1110 1111 1112 /* 1113 * validate parameters on entry for *_txsync() 1114 * Returns ring->cur if ok, or something >= kring->nkr_num_slots 1115 * in case of error. The extra argument is a pointer to 1116 * 'new_bufs'. XXX this may be deprecated at some point. 1117 * 1118 * Below is a correct configuration on input. ring->cur 1119 * must be in the region covered by kring->hwavail, 1120 * and ring->avail and kring->avail should end at the same slot. 
1121 * 1122 * +-hwcur 1123 * | 1124 * v<--hwres-->|<-----hwavail----> 1125 * ------+------------------------------+-------- ring 1126 * | 1127 * |<---avail---> 1128 * +--cur 1129 * 1130 */ 1131 u_int 1132 nm_txsync_prologue(struct netmap_kring *kring, u_int *new_slots) 1133 { 1134 struct netmap_ring *ring = kring->ring; 1135 u_int cur = ring->cur; /* read only once */ 1136 u_int avail = ring->avail; /* read only once */ 1137 u_int n = kring->nkr_num_slots; 1138 u_int kstart, kend, a; 1139 1140 #if 1 /* kernel sanity checks */ 1141 if (kring->nr_hwcur >= n || 1142 kring->nr_hwreserved >= n || kring->nr_hwavail >= n || 1143 kring->nr_hwreserved + kring->nr_hwavail >= n) 1144 goto error; 1145 #endif /* kernel sanity checks */ 1146 kstart = kring->nr_hwcur + kring->nr_hwreserved; 1147 if (kstart >= n) 1148 kstart -= n; 1149 kend = kstart + kring->nr_hwavail; 1150 /* user sanity checks. a is the expected avail */ 1151 if (cur < kstart) { 1152 /* too low, but maybe wraparound */ 1153 if (cur + n > kend) 1154 goto error; 1155 *new_slots = cur + n - kstart; 1156 a = kend - cur - n; 1157 } else { 1158 if (cur > kend) 1159 goto error; 1160 *new_slots = cur - kstart; 1161 a = kend - cur; 1162 } 1163 if (a != avail) { 1164 RD(5, "wrong but fixable avail have %d need %d", 1165 avail, a); 1166 ring->avail = avail = a; 1167 } 1168 return cur; 1169 1170 error: 1171 RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d", 1172 kring->nr_hwcur, 1173 kring->nr_hwreserved, kring->nr_hwavail, 1174 cur, avail); 1175 return n; 1176 } 1177 1178 1179 /* 1180 * validate parameters on entry for *_rxsync() 1181 * Returns ring->cur - ring->reserved if ok, 1182 * or something >= kring->nkr_num_slots 1183 * in case of error. The extra argument is a pointer to 1184 * 'resvd'. XXX this may be deprecated at some point. 1185 * 1186 * Below is a correct configuration on input. ring->cur and 1187 * ring->reserved must be in the region covered by kring->hwavail, 1188 * and ring->avail and kring->avail should end at the same slot. 1189 * 1190 * +-hwcur 1191 * | 1192 * v<-------hwavail----------> 1193 * ---------+--------------------------+-------- ring 1194 * |<--res-->| 1195 * |<---avail---> 1196 * +--cur 1197 * 1198 */ 1199 u_int 1200 nm_rxsync_prologue(struct netmap_kring *kring, u_int *resvd) 1201 { 1202 struct netmap_ring *ring = kring->ring; 1203 u_int cur = ring->cur; /* read only once */ 1204 u_int avail = ring->avail; /* read only once */ 1205 u_int res = ring->reserved; /* read only once */ 1206 u_int n = kring->nkr_num_slots; 1207 u_int kend = kring->nr_hwcur + kring->nr_hwavail; 1208 u_int a; 1209 1210 #if 1 /* kernel sanity checks */ 1211 if (kring->nr_hwcur >= n || kring->nr_hwavail >= n) 1212 goto error; 1213 #endif /* kernel sanity checks */ 1214 /* user sanity checks */ 1215 if (res >= n) 1216 goto error; 1217 /* check that cur is valid, a is the expected value of avail */ 1218 if (cur < kring->nr_hwcur) { 1219 /* too low, but maybe wraparound */ 1220 if (cur + n > kend) 1221 goto error; 1222 a = kend - (cur + n); 1223 } else { 1224 if (cur > kend) 1225 goto error; 1226 a = kend - cur; 1227 } 1228 if (a != avail) { 1229 RD(5, "wrong but fixable avail have %d need %d", 1230 avail, a); 1231 ring->avail = avail = a; 1232 } 1233 if (res != 0) { 1234 /* then repeat the check for cur + res */ 1235 cur = (cur >= res) ? 
cur - res : n + cur - res; 1236 if (cur < kring->nr_hwcur) { 1237 /* too low, but maybe wraparound */ 1238 if (cur + n > kend) 1239 goto error; 1240 } else if (cur > kend) { 1241 goto error; 1242 } 1243 } 1244 *resvd = res; 1245 return cur; 1246 1247 error: 1248 RD(5, "kring error: hwcur %d hwres %d hwavail %d cur %d av %d res %d", 1249 kring->nr_hwcur, 1250 kring->nr_hwreserved, kring->nr_hwavail, 1251 ring->cur, avail, res); 1252 return n; 1253 } 1254 1255 /* 1256 * Error routine called when txsync/rxsync detects an error. 1257 * Can't do much more than resetting cur = hwcur, avail = hwavail. 1258 * Return 1 on reinit. 1259 * 1260 * This routine is only called by the upper half of the kernel. 1261 * It only reads hwcur (which is changed only by the upper half, too) 1262 * and hwavail (which may be changed by the lower half, but only on 1263 * a tx ring and only to increase it, so any error will be recovered 1264 * on the next call). For the above, we don't strictly need to call 1265 * it under lock. 1266 */ 1267 int 1268 netmap_ring_reinit(struct netmap_kring *kring) 1269 { 1270 struct netmap_ring *ring = kring->ring; 1271 u_int i, lim = kring->nkr_num_slots - 1; 1272 int errors = 0; 1273 1274 // XXX KASSERT nm_kr_tryget 1275 RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); 1276 if (ring->cur > lim) 1277 errors++; 1278 for (i = 0; i <= lim; i++) { 1279 u_int idx = ring->slot[i].buf_idx; 1280 u_int len = ring->slot[i].len; 1281 if (idx < 2 || idx >= netmap_total_buffers) { 1282 if (!errors++) 1283 D("bad buffer at slot %d idx %d len %d ", i, idx, len); 1284 ring->slot[i].buf_idx = 0; 1285 ring->slot[i].len = 0; 1286 } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { 1287 ring->slot[i].len = 0; 1288 if (!errors++) 1289 D("bad len %d at slot %d idx %d", 1290 len, i, idx); 1291 } 1292 } 1293 if (errors) { 1294 int pos = kring - kring->na->tx_rings; 1295 int n = kring->na->num_tx_rings + 1; 1296 1297 RD(10, "total %d errors", errors); 1298 errors++; 1299 RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", 1300 NM_IFPNAME(kring->na->ifp), 1301 pos < n ? "TX" : "RX", pos < n ? pos : pos - n, 1302 ring->cur, kring->nr_hwcur, 1303 ring->avail, kring->nr_hwavail); 1304 ring->cur = kring->nr_hwcur; 1305 ring->avail = kring->nr_hwavail; 1306 } 1307 return (errors ? 1 : 0); 1308 } 1309 1310 1311 /* 1312 * Set the ring ID. For devices with a single queue, a request 1313 * for all rings is the same as a single ring. 1314 */ 1315 static int 1316 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) 1317 { 1318 struct netmap_adapter *na = priv->np_na; 1319 struct ifnet *ifp = na->ifp; 1320 u_int i = ringid & NETMAP_RING_MASK; 1321 /* initially (np_qfirst == np_qlast) we don't want to lock */ 1322 u_int lim = na->num_rx_rings; 1323 1324 if (na->num_tx_rings > lim) 1325 lim = na->num_tx_rings; 1326 if ( (ringid & NETMAP_HW_RING) && i >= lim) { 1327 D("invalid ring id %d", i); 1328 return (EINVAL); 1329 } 1330 priv->np_ringid = ringid; 1331 if (ringid & NETMAP_SW_RING) { 1332 priv->np_qfirst = NETMAP_SW_RING; 1333 priv->np_qlast = 0; 1334 } else if (ringid & NETMAP_HW_RING) { 1335 priv->np_qfirst = i; 1336 priv->np_qlast = i + 1; 1337 } else { 1338 priv->np_qfirst = 0; 1339 priv->np_qlast = NETMAP_HW_RING ; 1340 } 1341 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 
0 : 1; 1342 if (netmap_verbose) { 1343 if (ringid & NETMAP_SW_RING) 1344 D("ringid %s set to SW RING", NM_IFPNAME(ifp)); 1345 else if (ringid & NETMAP_HW_RING) 1346 D("ringid %s set to HW RING %d", NM_IFPNAME(ifp), 1347 priv->np_qfirst); 1348 else 1349 D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim); 1350 } 1351 return 0; 1352 } 1353 1354 1355 /* 1356 * possibly move the interface to netmap-mode. 1357 * If success it returns a pointer to netmap_if, otherwise NULL. 1358 * This must be called with NMG_LOCK held. 1359 */ 1360 struct netmap_if * 1361 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, 1362 uint16_t ringid, int *err) 1363 { 1364 struct ifnet *ifp = na->ifp; 1365 struct netmap_if *nifp = NULL; 1366 int error, need_mem = 0; 1367 1368 NMG_LOCK_ASSERT(); 1369 /* ring configuration may have changed, fetch from the card */ 1370 netmap_update_config(na); 1371 priv->np_na = na; /* store the reference */ 1372 error = netmap_set_ringid(priv, ringid); 1373 if (error) 1374 goto out; 1375 /* ensure allocators are ready */ 1376 need_mem = !netmap_have_memory_locked(priv); 1377 if (need_mem) { 1378 error = netmap_get_memory_locked(priv); 1379 ND("get_memory returned %d", error); 1380 if (error) 1381 goto out; 1382 } 1383 nifp = netmap_if_new(NM_IFPNAME(ifp), na); 1384 if (nifp == NULL) { /* allocation failed */ 1385 /* we should drop the allocator, but only 1386 * if we were the ones who grabbed it 1387 */ 1388 error = ENOMEM; 1389 goto out; 1390 } 1391 na->active_fds++; 1392 if (ifp->if_capenable & IFCAP_NETMAP) { 1393 /* was already set */ 1394 } else { 1395 /* Otherwise set the card in netmap mode 1396 * and make it use the shared buffers. 1397 * 1398 * do not core lock because the race is harmless here, 1399 * there cannot be any traffic to netmap_transmit() 1400 */ 1401 na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; 1402 ND("%p->na_lut == %p", na, na->na_lut); 1403 na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; 1404 error = na->nm_register(na, 1); /* mode on */ 1405 if (error) { 1406 netmap_do_unregif(priv, nifp); 1407 nifp = NULL; 1408 } 1409 } 1410 out: 1411 *err = error; 1412 if (error) { 1413 priv->np_na = NULL; 1414 if (need_mem) 1415 netmap_drop_memory_locked(priv); 1416 } 1417 if (nifp != NULL) { 1418 /* 1419 * advertise that the interface is ready bt setting ni_nifp. 1420 * The barrier is needed because readers (poll and *SYNC) 1421 * check for priv->np_nifp != NULL without locking 1422 */ 1423 wmb(); /* make sure previous writes are visible to all CPUs */ 1424 priv->np_nifp = nifp; 1425 } 1426 return nifp; 1427 } 1428 1429 1430 1431 /* 1432 * ioctl(2) support for the "netmap" device. 1433 * 1434 * Following a list of accepted commands: 1435 * - NIOCGINFO 1436 * - SIOCGIFADDR just for convenience 1437 * - NIOCREGIF 1438 * - NIOCUNREGIF 1439 * - NIOCTXSYNC 1440 * - NIOCRXSYNC 1441 * 1442 * Return 0 on success, errno otherwise. 1443 */ 1444 int 1445 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 1446 int fflag, struct thread *td) 1447 { 1448 struct netmap_priv_d *priv = NULL; 1449 struct ifnet *ifp = NULL; 1450 struct nmreq *nmr = (struct nmreq *) data; 1451 struct netmap_adapter *na = NULL; 1452 int error; 1453 u_int i, lim; 1454 struct netmap_if *nifp; 1455 struct netmap_kring *krings; 1456 1457 (void)dev; /* UNUSED */ 1458 (void)fflag; /* UNUSED */ 1459 #ifdef linux 1460 #define devfs_get_cdevpriv(pp) \ 1461 ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ 1462 (*pp ? 
0 : ENOENT); }) 1463 1464 /* devfs_set_cdevpriv cannot fail on linux */ 1465 #define devfs_set_cdevpriv(p, fn) \ 1466 ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) 1467 1468 1469 #define devfs_clear_cdevpriv() do { \ 1470 netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ 1471 } while (0) 1472 #endif /* linux */ 1473 1474 CURVNET_SET(TD_TO_VNET(td)); 1475 1476 error = devfs_get_cdevpriv((void **)&priv); 1477 if (error) { 1478 CURVNET_RESTORE(); 1479 /* XXX ENOENT should be impossible, since the priv 1480 * is now created in the open */ 1481 return (error == ENOENT ? ENXIO : error); 1482 } 1483 1484 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ 1485 switch (cmd) { 1486 case NIOCGINFO: /* return capabilities etc */ 1487 if (nmr->nr_version != NETMAP_API) { 1488 D("API mismatch got %d have %d", 1489 nmr->nr_version, NETMAP_API); 1490 nmr->nr_version = NETMAP_API; 1491 error = EINVAL; 1492 break; 1493 } 1494 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 1495 error = netmap_bdg_ctl(nmr, NULL); 1496 break; 1497 } 1498 1499 NMG_LOCK(); 1500 do { 1501 /* memsize is always valid */ 1502 struct netmap_mem_d *nmd = &nm_mem; 1503 u_int memflags; 1504 1505 if (nmr->nr_name[0] != '\0') { 1506 /* get a refcount */ 1507 error = netmap_get_na(nmr, &na, 1 /* create */); 1508 if (error) 1509 break; 1510 nmd = na->nm_mem; /* get memory allocator */ 1511 } 1512 1513 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 1514 if (error) 1515 break; 1516 if (na == NULL) /* only memory info */ 1517 break; 1518 nmr->nr_offset = 0; 1519 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 1520 netmap_update_config(na); 1521 nmr->nr_rx_rings = na->num_rx_rings; 1522 nmr->nr_tx_rings = na->num_tx_rings; 1523 nmr->nr_rx_slots = na->num_rx_desc; 1524 nmr->nr_tx_slots = na->num_tx_desc; 1525 if (memflags & NETMAP_MEM_PRIVATE) 1526 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1527 netmap_adapter_put(na); 1528 } while (0); 1529 NMG_UNLOCK(); 1530 break; 1531 1532 case NIOCREGIF: 1533 if (nmr->nr_version != NETMAP_API) { 1534 nmr->nr_version = NETMAP_API; 1535 error = EINVAL; 1536 break; 1537 } 1538 /* possibly attach/detach NIC and VALE switch */ 1539 i = nmr->nr_cmd; 1540 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH 1541 || i == NETMAP_BDG_OFFSET) { 1542 error = netmap_bdg_ctl(nmr, NULL); 1543 break; 1544 } else if (i != 0) { 1545 D("nr_cmd must be 0 not %d", i); 1546 error = EINVAL; 1547 break; 1548 } 1549 1550 /* protect access to priv from concurrent NIOCREGIF */ 1551 NMG_LOCK(); 1552 do { 1553 u_int memflags; 1554 1555 if (priv->np_na != NULL) { /* thread already registered */ 1556 error = netmap_set_ringid(priv, nmr->nr_ringid); 1557 break; 1558 } 1559 /* find the interface and a reference */ 1560 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 1561 if (error) 1562 break; 1563 ifp = na->ifp; 1564 if (NETMAP_OWNED_BY_KERN(na)) { 1565 netmap_adapter_put(na); 1566 error = EBUSY; 1567 break; 1568 } 1569 nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error); 1570 if (!nifp) { /* reg. 
failed, release priv and ref */ 1571 netmap_adapter_put(na); 1572 priv->np_nifp = NULL; 1573 break; 1574 } 1575 1576 /* return the offset of the netmap_if object */ 1577 nmr->nr_rx_rings = na->num_rx_rings; 1578 nmr->nr_tx_rings = na->num_tx_rings; 1579 nmr->nr_rx_slots = na->num_rx_desc; 1580 nmr->nr_tx_slots = na->num_tx_desc; 1581 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 1582 if (error) { 1583 netmap_adapter_put(na); 1584 break; 1585 } 1586 if (memflags & NETMAP_MEM_PRIVATE) { 1587 nmr->nr_ringid |= NETMAP_PRIV_MEM; 1588 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 1589 } 1590 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 1591 } while (0); 1592 NMG_UNLOCK(); 1593 break; 1594 1595 case NIOCUNREGIF: 1596 // XXX we have no data here ? 1597 D("deprecated, data is %p", nmr); 1598 error = EINVAL; 1599 break; 1600 1601 case NIOCTXSYNC: 1602 case NIOCRXSYNC: 1603 nifp = priv->np_nifp; 1604 1605 if (nifp == NULL) { 1606 error = ENXIO; 1607 break; 1608 } 1609 rmb(); /* make sure following reads are not from cache */ 1610 1611 na = priv->np_na; /* we have a reference */ 1612 1613 if (na == NULL) { 1614 D("Internal error: nifp != NULL && na == NULL"); 1615 error = ENXIO; 1616 break; 1617 } 1618 1619 ifp = na->ifp; 1620 if (ifp == NULL) { 1621 RD(1, "the ifp is gone"); 1622 error = ENXIO; 1623 break; 1624 } 1625 1626 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 1627 if (cmd == NIOCTXSYNC) 1628 netmap_txsync_to_host(na); 1629 else 1630 netmap_rxsync_from_host(na, NULL, NULL); 1631 break; 1632 } 1633 /* find the last ring to scan */ 1634 lim = priv->np_qlast; 1635 if (lim == NETMAP_HW_RING) 1636 lim = (cmd == NIOCTXSYNC) ? 1637 na->num_tx_rings : na->num_rx_rings; 1638 1639 krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; 1640 for (i = priv->np_qfirst; i < lim; i++) { 1641 struct netmap_kring *kring = krings + i; 1642 if (nm_kr_tryget(kring)) { 1643 error = EBUSY; 1644 goto out; 1645 } 1646 if (cmd == NIOCTXSYNC) { 1647 if (netmap_verbose & NM_VERB_TXSYNC) 1648 D("pre txsync ring %d cur %d hwcur %d", 1649 i, kring->ring->cur, 1650 kring->nr_hwcur); 1651 na->nm_txsync(na, i, NAF_FORCE_RECLAIM); 1652 if (netmap_verbose & NM_VERB_TXSYNC) 1653 D("post txsync ring %d cur %d hwcur %d", 1654 i, kring->ring->cur, 1655 kring->nr_hwcur); 1656 } else { 1657 na->nm_rxsync(na, i, NAF_FORCE_READ); 1658 microtime(&na->rx_rings[i].ring->ts); 1659 } 1660 nm_kr_put(kring); 1661 } 1662 1663 break; 1664 1665 #ifdef __FreeBSD__ 1666 case BIOCIMMEDIATE: 1667 case BIOCGHDRCMPLT: 1668 case BIOCSHDRCMPLT: 1669 case BIOCSSEESENT: 1670 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 1671 break; 1672 1673 default: /* allow device-specific ioctls */ 1674 { 1675 struct socket so; 1676 1677 bzero(&so, sizeof(so)); 1678 NMG_LOCK(); 1679 error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ 1680 if (error) { 1681 netmap_adapter_put(na); 1682 NMG_UNLOCK(); 1683 break; 1684 } 1685 ifp = na->ifp; 1686 so.so_vnet = ifp->if_vnet; 1687 // so->so_proto not null. 1688 error = ifioctl(&so, cmd, data, td); 1689 netmap_adapter_put(na); 1690 NMG_UNLOCK(); 1691 break; 1692 } 1693 1694 #else /* linux */ 1695 default: 1696 error = EOPNOTSUPP; 1697 #endif /* linux */ 1698 } 1699 out: 1700 1701 CURVNET_RESTORE(); 1702 return (error); 1703 } 1704 1705 1706 /* 1707 * select(2) and poll(2) handlers for the "netmap" device. 1708 * 1709 * Can be called for one or more queues. 1710 * Return true the event mask corresponding to ready events. 
1711 * If there are no ready events, do a selrecord on either individual 1712 * selinfo or on the global one. 1713 * Device-dependent parts (locking and sync of tx/rx rings) 1714 * are done through callbacks. 1715 * 1716 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 1717 * The first one is remapped to pwait as selrecord() uses the name as an 1718 * hidden argument. 1719 */ 1720 int 1721 netmap_poll(struct cdev *dev, int events, struct thread *td) 1722 { 1723 struct netmap_priv_d *priv = NULL; 1724 struct netmap_adapter *na; 1725 struct ifnet *ifp; 1726 struct netmap_kring *kring; 1727 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 1728 u_int lim_tx, lim_rx, host_forwarded = 0; 1729 struct mbq q; 1730 void *pwait = dev; /* linux compatibility */ 1731 1732 /* 1733 * In order to avoid nested locks, we need to "double check" 1734 * txsync and rxsync if we decide to do a selrecord(). 1735 * retry_tx (and retry_rx, later) prevent looping forever. 1736 */ 1737 int retry_tx = 1; 1738 1739 (void)pwait; 1740 mbq_init(&q); 1741 1742 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) 1743 return POLLERR; 1744 1745 if (priv->np_nifp == NULL) { 1746 D("No if registered"); 1747 return POLLERR; 1748 } 1749 rmb(); /* make sure following reads are not from cache */ 1750 1751 na = priv->np_na; 1752 ifp = na->ifp; 1753 // check for deleted 1754 if (ifp == NULL) { 1755 RD(1, "the ifp is gone"); 1756 return POLLERR; 1757 } 1758 1759 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 1760 return POLLERR; 1761 1762 if (netmap_verbose & 0x8000) 1763 D("device %s events 0x%x", NM_IFPNAME(ifp), events); 1764 want_tx = events & (POLLOUT | POLLWRNORM); 1765 want_rx = events & (POLLIN | POLLRDNORM); 1766 1767 lim_tx = na->num_tx_rings; 1768 lim_rx = na->num_rx_rings; 1769 1770 if (priv->np_qfirst == NETMAP_SW_RING) { 1771 /* handle the host stack ring */ 1772 if (priv->np_txpoll || want_tx) { 1773 /* push any packets up, then we are always ready */ 1774 netmap_txsync_to_host(na); 1775 revents |= want_tx; 1776 } 1777 if (want_rx) { 1778 kring = &na->rx_rings[lim_rx]; 1779 if (kring->ring->avail == 0) 1780 netmap_rxsync_from_host(na, td, dev); 1781 if (kring->ring->avail > 0) { 1782 revents |= want_rx; 1783 } 1784 } 1785 return (revents); 1786 } 1787 1788 /* 1789 * If we are in transparent mode, check also the host rx ring 1790 * XXX Transparent mode at the moment requires to bind all 1791 * rings to a single file descriptor. 1792 */ 1793 kring = &na->rx_rings[lim_rx]; 1794 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 1795 && want_rx 1796 && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { 1797 if (kring->ring->avail == 0) 1798 netmap_rxsync_from_host(na, td, dev); 1799 if (kring->ring->avail > 0) 1800 revents |= want_rx; 1801 } 1802 1803 /* 1804 * check_all_{tx|rx} are set if the card has more than one queue AND 1805 * the file descriptor is bound to all of them. If so, we sleep on 1806 * the "global" selinfo, otherwise we sleep on individual selinfo 1807 * (FreeBSD only allows two selinfo's per file descriptor). 1808 * The interrupt routine in the driver wake one or the other 1809 * (or both) depending on which clients are active. 1810 * 1811 * rxsync() is only called if we run out of buffers on a POLLIN. 1812 * txsync() is called if we run out of buffers on POLLOUT, or 1813 * there are pending packets to send. The latter can be disabled 1814 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 
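 * (Concretely, a user disables tx polling by or-ing NETMAP_NO_TX_POLL
 * into nmr.nr_ringid before issuing NIOCREGIF; netmap_set_ringid()
 * above then clears priv->np_txpoll.)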
1815 */ 1816 check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); 1817 check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); 1818 1819 if (priv->np_qlast != NETMAP_HW_RING) { 1820 lim_tx = lim_rx = priv->np_qlast; 1821 } 1822 1823 /* 1824 * We start with a lock free round which is cheap if we have 1825 * slots available. If this fails, then lock and call the sync 1826 * routines. 1827 * XXX rather than ring->avail >0 should check that 1828 * ring->cur has not reached hwcur+hwavail 1829 */ 1830 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 1831 kring = &na->rx_rings[i]; 1832 if (kring->ring->avail > 0) { 1833 revents |= want_rx; 1834 want_rx = 0; /* also breaks the loop */ 1835 } 1836 } 1837 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 1838 kring = &na->tx_rings[i]; 1839 if (kring->ring->avail > 0) { 1840 revents |= want_tx; 1841 want_tx = 0; /* also breaks the loop */ 1842 } 1843 } 1844 1845 /* 1846 * If we to push packets out (priv->np_txpoll) or want_tx is 1847 * still set, we do need to run the txsync calls (on all rings, 1848 * to avoid that the tx rings stall). 1849 * XXX should also check cur != hwcur on the tx rings. 1850 * Fortunately, normal tx mode has np_txpoll set. 1851 */ 1852 if (priv->np_txpoll || want_tx) { 1853 /* If we really want to be woken up (want_tx), 1854 * do a selrecord, either on the global or on 1855 * the private structure. Then issue the txsync 1856 * so there is no race in the selrecord/selwait 1857 */ 1858 flush_tx: 1859 for (i = priv->np_qfirst; i < lim_tx; i++) { 1860 kring = &na->tx_rings[i]; 1861 /* 1862 * Skip this ring if want_tx == 0 1863 * (we have already done a successful sync on 1864 * a previous ring) AND kring->cur == kring->hwcur 1865 * (there are no pending transmissions for this ring). 1866 */ 1867 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 1868 continue; 1869 /* make sure only one user thread is doing this */ 1870 if (nm_kr_tryget(kring)) { 1871 ND("ring %p busy is %d", 1872 kring, (int)kring->nr_busy); 1873 revents |= POLLERR; 1874 goto out; 1875 } 1876 1877 if (netmap_verbose & NM_VERB_TXSYNC) 1878 D("send %d on %s %d", 1879 kring->ring->cur, NM_IFPNAME(ifp), i); 1880 if (na->nm_txsync(na, i, 0)) 1881 revents |= POLLERR; 1882 1883 /* Check avail and call selrecord only if 1884 * called with POLLOUT and run out of bufs. 1885 * XXX Note, we cannot trust much ring->avail 1886 * as it is exposed to userspace (even though 1887 * just updated by txsync). We should really 1888 * check kring->nr_hwavail or better have 1889 * txsync set a flag telling if we need 1890 * to do a selrecord(). 1891 */ 1892 if (want_tx) { 1893 if (kring->ring->avail > 0) { 1894 /* stop at the first ring. We don't risk 1895 * starvation. 1896 */ 1897 revents |= want_tx; 1898 want_tx = 0; 1899 } 1900 } 1901 nm_kr_put(kring); 1902 } 1903 if (want_tx && retry_tx) { 1904 selrecord(td, check_all_tx ? 1905 &na->tx_si : &na->tx_rings[priv->np_qfirst].si); 1906 retry_tx = 0; 1907 goto flush_tx; 1908 } 1909 } 1910 1911 /* 1912 * now if want_rx is still set we need to lock and rxsync. 1913 * Do it on all rings because otherwise we starve. 

/*------- driver support routines ------*/

static int netmap_hw_krings_create(struct netmap_adapter *);

static int
netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags)
{
	struct netmap_kring *kring;

	if (tx == NR_TX) {
		kring = na->tx_rings + n_ring;
		selwakeuppri(&kring->si, PI_NET);
		if (flags & NAF_GLOBAL_NOTIFY)
			selwakeuppri(&na->tx_si, PI_NET);
	} else {
		kring = na->rx_rings + n_ring;
		selwakeuppri(&kring->si, PI_NET);
		if (flags & NAF_GLOBAL_NOTIFY)
			selwakeuppri(&na->rx_si, PI_NET);
	}
	return 0;
}


// XXX check handling of failures
int
netmap_attach_common(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
		D("%s: invalid rings tx %d rx %d",
		    ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
		return EINVAL;
	}
	WNA(ifp) = na;
	NETMAP_SET_CAPABLE(ifp);
	if (na->nm_krings_create == NULL) {
		na->nm_krings_create = netmap_hw_krings_create;
		na->nm_krings_delete = netmap_krings_delete;
	}
	if (na->nm_notify == NULL)
		na->nm_notify = netmap_notify;
	na->active_fds = 0;

	if (na->nm_mem == NULL)
		na->nm_mem = &nm_mem;
	return 0;
}


void
netmap_detach_common(struct netmap_adapter *na)
{
	if (na->ifp)
		WNA(na->ifp) = NULL; /* XXX do we need this? */

	if (na->tx_rings) { /* XXX should not happen */
		D("freeing leftover tx_rings");
		na->nm_krings_delete(na);
	}
	if (na->na_flags & NAF_MEM_OWNER)
		netmap_mem_private_delete(na->nm_mem);
	bzero(na, sizeof(*na));
	free(na, M_DEVBUF);
}


/*
 * Initialize a ``netmap_adapter`` object created by a driver on attach.
 * We allocate a block of memory with room for a struct netmap_adapter
 * plus two sets of N+2 struct netmap_kring (where N is the number
 * of hardware rings):
 *   krings 0..N-1 are for the hardware queues,
 *   kring N is for the host stack queue,
 *   kring N+1 is only used for the selinfo for all queues.
 * Return 0 on success, ENOMEM otherwise.
 *
 * By default the receive and transmit adapter ring counts are both
 * initialized to num_queues. na->num_tx_rings can be set for cards
 * with different tx/rx setups.
 */
int
netmap_attach(struct netmap_adapter *arg)
{
	struct netmap_hw_adapter *hwna = NULL;
	// XXX when is arg == NULL ?
	struct ifnet *ifp = arg ? arg->ifp : NULL;

	if (arg == NULL || ifp == NULL)
		goto fail;
	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (hwna == NULL)
		goto fail;
	hwna->up = *arg;
	if (netmap_attach_common(&hwna->up)) {
		free(hwna, M_DEVBUF);
		goto fail;
	}
	netmap_adapter_get(&hwna->up);

#ifdef linux
	if (ifp->netdev_ops) {
		/* prepare a clone of the netdev ops */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
#else
		hwna->nm_ndo = *ifp->netdev_ops;
#endif
	}
	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
#endif /* linux */

	D("success for %s", NM_IFPNAME(ifp));
	return 0;

fail:
	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
	netmap_detach(ifp);
	return (hwna ? EINVAL : ENOMEM);
}
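
/*
 * Illustrative sketch (assumptions, not part of this file): how a NIC
 * driver would typically fill in a struct netmap_adapter and hand it to
 * netmap_attach() from its own attach routine. The foo_* names and the
 * 'adapter' softc layout are hypothetical; only the netmap_adapter fields
 * and netmap_attach() itself come from this API.
 */
#if 0	/* driver-side example, never compiled here */
static void
foo_netmap_attach(struct foo_adapter *adapter)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = adapter->ifp;
	na.num_tx_desc = adapter->num_tx_desc;
	na.num_rx_desc = adapter->num_rx_desc;
	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
	na.nm_register = foo_netmap_reg;	/* switch the NIC in/out of netmap mode */
	na.nm_txsync = foo_netmap_txsync;	/* called from netmap_poll()/NIOCTXSYNC */
	na.nm_rxsync = foo_netmap_rxsync;	/* called from netmap_poll()/NIOCRXSYNC */
	netmap_attach(&na);			/* allocates and registers the hw adapter */
}
#endif /* driver-side example */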


void
NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
{
	if (!na) {
		return;
	}

	refcount_acquire(&na->na_refcount);
}


/* returns 1 iff the netmap_adapter is destroyed */
int
NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
{
	if (!na)
		return 1;

	if (!refcount_release(&na->na_refcount))
		return 0;

	if (na->nm_dtor)
		na->nm_dtor(na);

	netmap_detach_common(na);

	return 1;
}


int
netmap_hw_krings_create(struct netmap_adapter *na)
{
	return netmap_krings_create(na,
	    na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
}


/*
 * Free the allocated memory linked to the given ``netmap_adapter``
 * object.
 */
void
netmap_detach(struct ifnet *ifp)
{
	struct netmap_adapter *na = NA(ifp);

	if (!na)
		return;

	NMG_LOCK();
	netmap_disable_all_rings(ifp);
	netmap_adapter_put(na);
	na->ifp = NULL;
	netmap_enable_all_rings(ifp);
	NMG_UNLOCK();
}


/*
 * Intercept packets from the network stack and pass them
 * to netmap as incoming packets on the 'software' ring.
 * We rely on the OS to make sure that the ifp and na do not go
 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
 * In nm_register() or whenever there is a reinitialization,
 * we make sure to make the mode change visible here.
 */
int
netmap_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;
	u_int i, len = MBUF_LEN(m);
	u_int error = EBUSY, lim;
	struct netmap_slot *slot;

	// XXX [Linux] we do not need this lock
	// if we follow the down/configure/up protocol -gl
	// mtx_lock(&na->core_lock);
	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
		/* interface not in netmap mode anymore */
		error = ENXIO;
		goto done;
	}

	kring = &na->rx_rings[na->num_rx_rings];
	lim = kring->nkr_num_slots - 1;
	if (netmap_verbose & NM_VERB_HOST)
		D("%s packet %d len %d from the stack", NM_IFPNAME(ifp),
		    kring->nr_hwcur + kring->nr_hwavail, len);
	// XXX reconsider long packets if we handle fragments
	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
		    len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
		goto done;
	}
	/* protect against other instances of netmap_transmit,
	 * and userspace invocations of rxsync().
	 */
	// XXX [Linux] there can be no other instances of netmap_transmit
	// on this same ring, but we still need this lock to protect
	// concurrent access from netmap_sw_to_nic() -gl
	mtx_lock(&kring->q_lock);
	if (kring->nr_hwavail >= lim) {
		if (netmap_verbose)
			D("stack ring %s full\n", NM_IFPNAME(ifp));
	} else {
		/* compute the insert position */
		i = nm_kr_rxpos(kring);
		slot = &kring->ring->slot[i];
		m_copydata(m, 0, (int)len, BDG_NMB(na, slot));
		slot->len = len;
		slot->flags = kring->nkr_slot_flags;
		kring->nr_hwavail++;
		if (netmap_verbose & NM_VERB_HOST)
			D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings);
		na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
		error = 0;
	}
	mtx_unlock(&kring->q_lock);

done:
	// mtx_unlock(&na->core_lock);

	/* Release the mbuf in either case (success or failure). As an
	 * alternative, put the mbuf in a free list and free the list
	 * only when really necessary.
	 */
	m_freem(m);

	return (error);
}
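
/*
 * Illustrative sketch (not part of this module): the packets queued on the
 * host RX ring by netmap_transmit() are consumed by a userspace process
 * that binds its descriptor to the software ring only
 * (nr_ringid = NETMAP_SW_RING). The fragment reuses the headers from the
 * receive-loop sketch after netmap_poll(); field and macro names follow
 * the netmap(4) userspace API of this version and may differ elsewhere.
 */
#if 0	/* userspace example, never compiled here */
static void
bind_host_ring(int fd, const char *ifname)
{
	struct nmreq req;

	memset(&req, 0, sizeof(req));
	strncpy(req.nr_name, ifname, sizeof(req.nr_name));
	req.nr_version = NETMAP_API;
	req.nr_ringid = NETMAP_SW_RING;	/* bind the host stack ring only */
	ioctl(fd, NIOCREGIF, &req);
	/*
	 * After mmap(), the host RX ring is NETMAP_RXRING(nifp, req.nr_rx_rings).
	 * netmap_transmit() above fills its slots and bumps nr_hwavail; a later
	 * poll()/NIOCRXSYNC (netmap_rxsync_from_host()) makes the new slots
	 * visible to the process through ring->avail.
	 */
}
#endif /* userspace example */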


/*
 * netmap_reset() is called by the driver routines when reinitializing
 * a ring. The driver is in charge of locking to protect the kring.
 * If native netmap mode is not set, just return NULL.
 */
struct netmap_slot *
netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
	u_int new_cur)
{
	struct netmap_kring *kring;
	int new_hwofs, lim;

	if (na == NULL) {
		D("NULL na, should not happen");
		return NULL;	/* no netmap support here */
	}
	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
		ND("interface not in netmap mode");
		return NULL;	/* nothing to reinitialize */
	}

	/* XXX note - in the new scheme, we are not guaranteed to be
	 * under lock (e.g. when called on a device reset).
	 * In this case, we should set a flag and not trust the values
	 * too much. In practice: TODO
	 * - set a RESET flag somewhere in the kring
	 * - do the processing in a conservative way
	 * - let the *sync() routines fix things up at the end.
	 */
	if (tx == NR_TX) {
		if (n >= na->num_tx_rings)
			return NULL;
		kring = na->tx_rings + n;
		new_hwofs = kring->nr_hwcur - new_cur;
	} else {
		if (n >= na->num_rx_rings)
			return NULL;
		kring = na->rx_rings + n;
		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
	}
	lim = kring->nkr_num_slots - 1;
	if (new_hwofs > lim)
		new_hwofs -= lim + 1;

	/* Always set the new offset value and realign the ring. */
	D("%s hwofs %d -> %d, hwavail %d -> %d",
	    tx == NR_TX ? "TX" : "RX",
	    kring->nkr_hwofs, new_hwofs,
	    kring->nr_hwavail,
	    tx == NR_TX ? lim : kring->nr_hwavail);
	kring->nkr_hwofs = new_hwofs;
	if (tx == NR_TX)
		kring->nr_hwavail = lim;
	kring->nr_hwreserved = 0;

#if 0 // def linux
	/* XXX check that the mappings are correct */
	/* need ring_nr, adapter->pdev, direction */
	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
		D("error mapping rx netmap buffer %d", i);
		// XXX fix error handling
	}

#endif /* linux */
	/*
	 * Wakeup on the individual and global selwait.
	 * We do the wakeup here, but the ring is not yet reconfigured.
	 * However, we are under lock so there are no races.
	 */
	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
	return kring->ring->slot;
}
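
/*
 * Illustrative sketch (assumptions, not part of this file): how a driver
 * RX-ring init routine would typically use netmap_reset(). The foo_* names
 * and descriptor layout are hypothetical; netmap_reset(), netmap_idx_n2k()
 * and PNMB() are the helpers this netmap version exports to drivers, but
 * their signatures may differ in other versions.
 */
#if 0	/* driver-side example, never compiled here */
static void
foo_netmap_init_rx_ring(struct foo_adapter *adapter, int ring_nr)
{
	struct netmap_adapter *na = NA(adapter->ifp);
	struct netmap_slot *slot;
	int j;

	slot = netmap_reset(na, NR_RX, ring_nr, 0 /* new_cur */);
	if (slot == NULL)	/* ring not in native netmap mode */
		return;
	for (j = 0; j < adapter->num_rx_desc; j++) {
		/* translate the NIC index into the netmap ring index */
		int sj = netmap_idx_n2k(&na->rx_rings[ring_nr], j);
		uint64_t paddr;
		void *vaddr = PNMB(slot + sj, &paddr);

		/* program descriptor j with the netmap buffer */
		foo_write_rx_desc(adapter, ring_nr, j, paddr, vaddr);
	}
}
#endif /* driver-side example */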


/*
 * Dispatch rx/tx interrupts to the netmap rings.
 *
 * "work_done" is non-null on the RX path, NULL for the TX path.
 * We rely on the OS to make sure that there is only one active
 * instance per queue, and that there is appropriate locking.
 *
 * The 'notify' routine depends on what the ring is attached to.
 * - for a netmap file descriptor, do a selwakeup on the individual
 *   waitqueue, plus one on the global one if needed
 * - for a switch, call the proper forwarding routine
 * - XXX more ?
 */
void
netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	struct netmap_adapter *na = NA(ifp);
	struct netmap_kring *kring;

	q &= NETMAP_RING_MASK;

	if (netmap_verbose) {
		RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
	}

	if (work_done) { /* RX path */
		if (q >= na->num_rx_rings)
			return;	// not a physical queue
		kring = na->rx_rings + q;
		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
		na->nm_notify(na, q, NR_RX,
		    (na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
		*work_done = 1; /* do not fire napi again */
	} else { /* TX path */
		if (q >= na->num_tx_rings)
			return;	// not a physical queue
		kring = na->tx_rings + q;
		na->nm_notify(na, q, NR_TX,
		    (na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
	}
}

/*
 * Default functions to handle rx/tx interrupts from a physical device.
 * "work_done" is non-null on the RX path, NULL for the TX path.
 *
 * If the card is not in netmap mode, simply return 0,
 * so that the caller proceeds with regular processing.
 * Otherwise call netmap_common_irq() and return 1.
 *
 * If the card is connected to a netmap file descriptor,
 * do a selwakeup on the individual queue, plus one on the global one
 * if needed (multiqueue card _and_ there are multiqueue listeners),
 * and return 1.
 *
 * Finally, if called on rx from an interface connected to a switch,
 * call the proper forwarding routine and return 1.
 */
int
netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	// XXX could we check NAF_NATIVE_ON ?
	if (!(ifp->if_capenable & IFCAP_NETMAP))
		return 0;

	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
		ND("use regular interrupt");
		return 0;
	}

	netmap_common_irq(ifp, q, work_done);
	return 1;
}
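
/*
 * Illustrative sketch (assumptions, not part of this file): the typical way
 * a driver interrupt/cleanup routine hands control to netmap. The foo_*
 * names are hypothetical; only netmap_rx_irq() and the meaning of
 * 'work_done' (NULL on the TX path) come from the code above.
 */
#if 0	/* driver-side example, never compiled here */
static void
foo_rxeof(struct foo_rx_ring *rxr)
{
	u_int work_done = 0;

	/* If the queue is in netmap mode, netmap does the work and we stop. */
	if (netmap_rx_irq(rxr->adapter->ifp, rxr->ring_id, &work_done))
		return;

	/* ... regular mbuf-based RX processing ... */
}

static void
foo_txeof(struct foo_tx_ring *txr)
{
	/* TX completion: pass a NULL work_done, as described above. */
	if (netmap_rx_irq(txr->adapter->ifp, txr->ring_id, NULL))
		return;

	/* ... regular TX completion processing ... */
}
#endif /* driver-side example */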


/*
 * Module loader and unloader
 *
 * netmap_init() creates the /dev/netmap device and initializes
 * all global variables. Returns 0 on success, errno on failure
 * (which is unlikely in practice).
 *
 * netmap_fini() destroys everything.
 */

static struct cdev *netmap_dev; /* /dev/netmap character device. */
extern struct cdevsw netmap_cdevsw;

void
netmap_fini(void)
{
	// XXX destroy_bridges() ?
	if (netmap_dev)
		destroy_dev(netmap_dev);
	netmap_mem_fini();
	NMG_LOCK_DESTROY();
	printf("netmap: unloaded module.\n");
}

int
netmap_init(void)
{
	int error;

	NMG_LOCK_INIT();

	error = netmap_mem_init();
	if (error != 0)
		goto fail;
	/* XXX could use make_dev_credv() to get error number */
	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
	    "netmap");
	if (!netmap_dev)
		goto fail;

	netmap_init_bridges();
	printf("netmap: loaded module\n");
	return (0);
fail:
	netmap_fini();
	return (EINVAL); /* may be incorrect */
}
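
/*
 * Illustrative sketch (assumptions, not part of this file): netmap_init()
 * and netmap_fini() are invoked from an OS-specific module event handler.
 * On FreeBSD that glue looks roughly like the fragment below; the handler
 * name is hypothetical and the real one lives in the OS-specific code.
 */
#if 0	/* module glue example, never compiled here */
static int
netmap_module_handler(module_t mod, int event, void *arg)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
		error = netmap_init();
		break;
	case MOD_UNLOAD:
		netmap_fini();
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

DEV_MODULE(netmap, netmap_module_handler, NULL);
#endif /* module glue example */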