1 /* 2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26 27 /* 28 * $FreeBSD$ 29 * 30 * This module supports memory mapped access to network devices, 31 * see netmap(4). 32 * 33 * The module uses a large memory pool allocated by the kernel 34 * and accessible as mmapped memory by multiple userspace threads/processes. 35 * The memory pool contains packet buffers and "netmap rings", 36 * i.e. user-accessible copies of the interface's queues. 37 * 38 * Access to the network card works like this: 39 * 1. a process/thread issues one or more open() on /dev/netmap, to create 40 * a select()able file descriptor on which events are reported. 41 * 2. on each descriptor, the process issues an ioctl() to identify 42 * the interface that should report events to the file descriptor. 43 * 3. on each descriptor, the process issues an mmap() request to 44 * map the shared memory region within the process' address space. 45 * The list of interesting queues is indicated by a location in 46 * the shared memory region. 47 * 4. using the functions in the netmap(4) userspace API, a process 48 * can look up the occupation state of a queue, access memory buffers, 49 * and retrieve received packets or enqueue packets to transmit. 50 * 5. using some ioctl()s the process can synchronize the userspace view 51 * of the queue with the actual status in the kernel. This includes both 52 * receiving the notification of new packets, and transmitting new 53 * packets on the output interface. 54 * 6. select() or poll() can be used to wait for events on individual 55 * transmit or receive queues (or all queues for a given interface). 56 * 57 58 SYNCHRONIZATION (USER) 59 60 The netmap rings and data structures may be shared among multiple 61 user threads or even independent processes. 62 Any synchronization among those threads/processes is delegated 63 to the threads themselves. Only one thread at a time can be in 64 a system call on the same netmap ring. The OS does not enforce 65 this and only guarantees against system crashes in case of 66 invalid usage.
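 As an illustration only (this is not kernel code), the sequence of
 steps 1-5 above typically looks like the following userspace sketch,
 using the documented nmreq/NIOC* interface and the netmap_user.h
 macros; error handling is omitted and "em0" is just an example name:

	struct nmreq nmr;
	struct netmap_if *nifp;
	struct netmap_ring *txring;
	void *mem;
	int fd;

	fd = open("/dev/netmap", O_RDWR);		// step 1
	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
	ioctl(fd, NIOCREGIF, &nmr);			// step 2: bind to em0
	mem = mmap(NULL, nmr.nr_memsize, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);			// step 3
	nifp = NETMAP_IF(mem, nmr.nr_offset);
	txring = NETMAP_TXRING(nifp, 0);		// step 4: first tx ring
	// ... fill slots between txring->head and txring->tail ...
	ioctl(fd, NIOCTXSYNC, NULL);			// step 5
	// step 6: poll() on fd with POLLIN/POLLOUT to wait for events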
67 68 LOCKING (INTERNAL) 69 70 Within the kernel, access to the netmap rings is protected as follows: 71 72 - a spinlock on each ring, to handle producer/consumer races on 73 RX rings attached to the host stack (against multiple host 74 threads writing from the host stack to the same ring), 75 and on 'destination' rings attached to a VALE switch 76 (i.e. RX rings in VALE ports, and TX rings in NIC/host ports) 77 protecting multiple active senders for the same destination. 78 79 - an atomic variable to guarantee that there is at most one 80 instance of *_*xsync() on the ring at any time. 81 For rings connected to user file 82 descriptors, an atomic_test_and_set() protects this, and the 83 lock on the ring is not actually used. 84 For NIC RX rings connected to a VALE switch, an atomic_test_and_set() 85 is also used to prevent multiple executions (the driver might indeed 86 already guarantee this). 87 For NIC TX rings connected to a VALE switch, the lock arbitrates 88 access to the queue (both when allocating buffers and when pushing 89 them out). 90 91 - *xsync() should be protected against initializations of the card. 92 On FreeBSD most devices have the reset routine protected by 93 a RING lock (ixgbe, igb, em) or core lock (re). lem is missing 94 the RING protection on rx_reset(); this should be added. 95 96 On linux there is an external lock on the tx path, which probably 97 also arbitrates access to the reset routine. XXX to be revised 98 99 - a per-interface core_lock protecting access from the host stack 100 while interfaces may be detached from netmap mode. 101 XXX there should be no need for this lock if we detach the interfaces 102 only while they are down. 103 104 105 --- VALE SWITCH --- 106 107 NMG_LOCK() serializes all modifications to switches and ports. 108 A switch cannot be deleted until all ports are gone. 109 110 For each switch, an SX lock (RWlock on linux) protects 111 deletion of ports. When configuring or deleting a port, the 112 lock is acquired in exclusive mode (after holding NMG_LOCK). 113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK). 114 The lock is held throughout the entire forwarding cycle, 115 during which the thread may incur a page fault. 116 Hence it is important that sleepable shared locks are used. 117 118 On the rx ring, the per-port lock is grabbed initially to reserve 119 a number of slots in the ring, then the lock is released, 120 packets are copied from source to destination, and then 121 the lock is acquired again and the receive ring is updated. 122 (A similar thing is done on the tx ring for NIC and host stack 123 ports attached to the switch) 124 125 */ 126 127 128 /* --- internals ---- 129 * 130 * Roadmap to the code that implements the above. 131 * 132 * > 1. a process/thread issues one or more open() on /dev/netmap, to create 133 * > a select()able file descriptor on which events are reported. 134 * 135 * Internally, we allocate a netmap_priv_d structure that will be 136 * initialized on ioctl(NIOCREGIF). 137 * 138 * os-specific: 139 * FreeBSD: netmap_open (netmap_freebsd.c). The priv is 140 * per-thread. 141 * linux: linux_netmap_open (netmap_linux.c). The priv is 142 * per-open. 143 * 144 * > 2. on each descriptor, the process issues an ioctl() to identify 145 * > the interface that should report events to the file descriptor. 146 * 147 * Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0. 148 * Most important things happen in netmap_get_na() and 149 * netmap_do_regif(), called from there.
Additional details can be 150 * found in the comments above those functions. 151 * 152 * In all cases, this action creates/takes-a-reference-to a 153 * netmap_*_adapter describing the port, and allocates a netmap_if 154 * and all necessary netmap rings, filling them with netmap buffers. 155 * 156 * In this phase, the sync callbacks for each ring are set (these are used 157 * in steps 5 and 6 below). The callbacks depend on the type of adapter. 158 * The adapter creation/initialization code puts them in the 159 * netmap_adapter (fields na->nm_txsync and na->nm_rxsync). Then, they 160 * are copied from there to the netmap_kring's during netmap_do_regif(), by 161 * the nm_krings_create() callback. All the nm_krings_create callbacks 162 * actually call netmap_krings_create() to perform this and the other 163 * common stuff. netmap_krings_create() also takes care of the host rings, 164 * if needed, by setting their sync callbacks appropriately. 165 * 166 * Additional actions depend on the kind of netmap_adapter that has been 167 * registered: 168 * 169 * - netmap_hw_adapter: [netmap.c] 170 * This is a system netdev/ifp with native netmap support. 171 * The ifp is detached from the host stack by redirecting: 172 * - transmissions (from the network stack) to netmap_transmit() 173 * - receive notifications to the nm_notify() callback for 174 * this adapter. The callback is normally netmap_notify(), unless 175 * the ifp is attached to a bridge using bwrap, in which case it 176 * is netmap_bwrap_intr_notify(). 177 * 178 * - netmap_generic_adapter: [netmap_generic.c] 179 * A system netdev/ifp without native netmap support. 180 * 181 * (the decision about native/non-native support is taken in 182 * netmap_get_hw_na(), called by netmap_get_na()) 183 * 184 * - netmap_vp_adapter [netmap_vale.c] 185 * Returned by netmap_get_bdg_na(). 186 * This is a persistent or ephemeral VALE port. Ephemeral ports 187 * are created on the fly if they don't already exist, and are 188 * always attached to a bridge. 189 * Persistent VALE ports must be created separately, and 190 * then attached like normal NICs. The NIOCREGIF we are examining 191 * will find them only if they had previously been created and 192 * attached (see VALE_CTL below). 193 * 194 * - netmap_pipe_adapter [netmap_pipe.c] 195 * Returned by netmap_get_pipe_na(). 196 * Both pipe ends are created if they didn't already exist. 197 * 198 * - netmap_monitor_adapter [netmap_monitor.c] 199 * Returned by netmap_get_monitor_na(). 200 * If successful, the nm_sync callbacks of the monitored adapter 201 * will be intercepted by the returned monitor. 202 * 203 * - netmap_bwrap_adapter [netmap_vale.c] 204 * Cannot be obtained in this way, see VALE_CTL below 205 * 206 * 207 * os-specific: 208 * linux: we first go through linux_netmap_ioctl() to 209 * adapt the FreeBSD interface to the linux one. 210 * 211 * 212 * > 3. on each descriptor, the process issues an mmap() request to 213 * > map the shared memory region within the process' address space. 214 * > The list of interesting queues is indicated by a location in 215 * > the shared memory region. 216 * 217 * os-specific: 218 * FreeBSD: netmap_mmap_single (netmap_freebsd.c). 219 * linux: linux_netmap_mmap (netmap_linux.c). 220 * 221 * > 4. using the functions in the netmap(4) userspace API, a process 222 * > can look up the occupation state of a queue, access memory buffers, 223 * > and retrieve received packets or enqueue packets to transmit. 224 * 225 * these actions do not involve the kernel.
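 *
 * As a purely illustrative sketch (not kernel code), a receiver can
 * drain an RX ring with the netmap_user.h helpers like this; consume()
 * is an arbitrary application routine, not part of the API:
 *
 *	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);
 *
 *	while (!nm_ring_empty(rxring)) {
 *		u_int i = rxring->cur;
 *		char *buf = NETMAP_BUF(rxring, rxring->slot[i].buf_idx);
 *
 *		consume(buf, rxring->slot[i].len);
 *		rxring->head = rxring->cur = nm_ring_next(rxring, i);
 *	}
 *	// the slots are really returned to the kernel only at the next
 *	// NIOCRXSYNC/poll(), see step 5 below.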
226 * 227 * > 5. using some ioctl()s the process can synchronize the userspace view 228 * > of the queue with the actual status in the kernel. This includes both 229 * > receiving the notification of new packets, and transmitting new 230 * > packets on the output interface. 231 * 232 * These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC 233 * cases. They invoke the nm_sync callbacks on the netmap_kring 234 * structures, as initialized in step 2 and maybe later modified 235 * by a monitor. Monitors, however, will always call the original 236 * callback before doing anything else. 237 * 238 * 239 * > 6. select() or poll() can be used to wait for events on individual 240 * > transmit or receive queues (or all queues for a given interface). 241 * 242 * Implemented in netmap_poll(). This will call the same nm_sync() 243 * callbacks as in step 5 above. 244 * 245 * os-specific: 246 * linux: we first go through linux_netmap_poll() to adapt 247 * the FreeBSD interface to the linux one. 248 * 249 * 250 * ---- VALE_CTL ----- 251 * 252 * VALE switches are controlled by issuing a NIOCREGIF with a non-null 253 * nr_cmd in the nmreq structure. These subcommands are handled by 254 * netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created 255 * and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF 256 * subcommands, respectively. 257 * 258 * Any network interface known to the system (including a persistent VALE 259 * port) can be attached to a VALE switch by issuing the 260 * NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports 261 * look exactly like ephemeral VALE ports (as created in step 2 above). The 262 * attachment of other interfaces, instead, requires the creation of a 263 * netmap_bwrap_adapter. Moreover, the attached interface must be put in 264 * netmap mode. This may require the creation of a netmap_generic_adapter if 265 * we have no native support for the interface, or if generic adapters have 266 * been forced by sysctl. 267 * 268 * Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(), 269 * called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach() 270 * callback. In the case of the bwrap, the callback creates the 271 * netmap_bwrap_adapter. The initialization of the bwrap is then 272 * completed by calling netmap_do_regif() on it, in the nm_bdg_ctl() 273 * callback (netmap_bwrap_bdg_ctl in netmap_vale.c). 274 * A generic adapter for the wrapped ifp will be created if needed, when 275 * netmap_get_bdg_na() calls netmap_get_hw_na(). 
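 *
 * For illustration only, the userspace side of NETMAP_BDG_ATTACH might
 * look as follows (assuming the usual "valeX:port" naming convention
 * and an already opened /dev/netmap descriptor fd; error handling
 * omitted):
 *
 *	struct nmreq nmr;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "vale0:em1", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;		// handled by netmap_bdg_ctl()
 *	ioctl(fd, NIOCREGIF, &nmr);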
276 * 277 * 278 * ---- DATAPATHS ----- 279 * 280 * -= SYSTEM DEVICE WITH NATIVE SUPPORT =- 281 * 282 * na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach() 283 * 284 * - tx from netmap userspace: 285 * concurrently: 286 * 1) ioctl(NIOCTXSYNC)/netmap_poll() in process context 287 * kring->nm_sync() == DEVICE_netmap_txsync() 288 * 2) device interrupt handler 289 * na->nm_notify() == netmap_notify() 290 * - rx from netmap userspace: 291 * concurrently: 292 * 1) ioctl(NIOCRXSYNC)/netmap_poll() in process context 293 * kring->nm_sync() == DEVICE_netmap_rxsync() 294 * 2) device interrupt handler 295 * na->nm_notify() == netmap_notify() 296 * - tx from host stack 297 * concurrently: 298 * 1) host stack 299 * netmap_transmit() 300 * na->nm_notify == netmap_notify() 301 * 2) ioctl(NIOCRXSYNC)/netmap_poll() in process context 302 * kring->nm_sync() == netmap_rxsync_from_host_compat 303 * netmap_rxsync_from_host(na, NULL, NULL) 304 * - tx to host stack 305 * ioctl(NIOCTXSYNC)/netmap_poll() in process context 306 * kring->nm_sync() == netmap_txsync_to_host_compat 307 * netmap_txsync_to_host(na) 308 * NM_SEND_UP() 309 * FreeBSD: na->if_input() == ?? XXX 310 * linux: netif_rx() with NM_MAGIC_PRIORITY_RX 311 * 312 * 313 * 314 * -= SYSTEM DEVICE WITH GENERIC SUPPORT =- 315 * 316 * 317 * 318 * -= VALE PORT =- 319 * 320 * 321 * 322 * -= NETMAP PIPE =- 323 * 324 * 325 * 326 * -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, NO HOST RINGS =- 327 * 328 * 329 * 330 * -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =- 331 * 332 * 333 * 334 * -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, NO HOST RINGS =- 335 * 336 * 337 * 338 * -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =- 339 * 340 * 341 * 342 */ 343 344 /* 345 * OS-specific code that is used only within this file. 
346 * Other OS-specific code that must be accessed by drivers 347 * is present in netmap_kern.h 348 */ 349 350 #if defined(__FreeBSD__) 351 #include <sys/cdefs.h> /* prerequisite */ 352 #include <sys/types.h> 353 #include <sys/errno.h> 354 #include <sys/param.h> /* defines used in kernel.h */ 355 #include <sys/kernel.h> /* types used in module initialization */ 356 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 357 #include <sys/filio.h> /* FIONBIO */ 358 #include <sys/sockio.h> 359 #include <sys/socketvar.h> /* struct socket */ 360 #include <sys/malloc.h> 361 #include <sys/poll.h> 362 #include <sys/rwlock.h> 363 #include <sys/socket.h> /* sockaddrs */ 364 #include <sys/selinfo.h> 365 #include <sys/sysctl.h> 366 #include <sys/jail.h> 367 #include <net/vnet.h> 368 #include <net/if.h> 369 #include <net/if_var.h> 370 #include <net/bpf.h> /* BIOCIMMEDIATE */ 371 #include <machine/bus.h> /* bus_dmamap_* */ 372 #include <sys/endian.h> 373 #include <sys/refcount.h> 374 375 376 /* reduce conditional code */ 377 // linux API, use for the knlist in FreeBSD 378 /* use a private mutex for the knlist */ 379 #define init_waitqueue_head(x) do { \ 380 struct mtx *m = &(x)->m; \ 381 mtx_init(m, "nm_kn_lock", NULL, MTX_DEF); \ 382 knlist_init_mtx(&(x)->si.si_note, m); \ 383 } while (0) 384 385 #define OS_selrecord(a, b) selrecord(a, &((b)->si)) 386 #define OS_selwakeup(a, b) freebsd_selwakeup(a, b) 387 388 #elif defined(linux) 389 390 #include "bsd_glue.h" 391 392 393 394 #elif defined(__APPLE__) 395 396 #warning OSX support is only partial 397 #include "osx_glue.h" 398 399 #else 400 401 #error Unsupported platform 402 403 #endif /* unsupported */ 404 405 /* 406 * common headers 407 */ 408 #include <net/netmap.h> 409 #include <dev/netmap/netmap_kern.h> 410 #include <dev/netmap/netmap_mem2.h> 411 412 413 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 414 415 /* 416 * The following variables are used by the drivers and replicate 417 * fields in the global memory pool. They only refer to buffers 418 * used by physical interfaces. 419 */ 420 u_int netmap_total_buffers; 421 u_int netmap_buf_size; 422 char *netmap_buffer_base; /* also address of an invalid buffer */ 423 424 /* user-controlled variables */ 425 int netmap_verbose; 426 427 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 428 429 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 430 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 431 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 432 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 433 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 434 int netmap_mitigate = 1; 435 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 436 int netmap_no_pendintr = 1; 437 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 438 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 439 int netmap_txsync_retry = 2; 440 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 441 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 442 443 int netmap_adaptive_io = 0; 444 SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW, 445 &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt"); 446 447 int netmap_flags = 0; /* debug flags */ 448 int netmap_fwd = 0; /* force transparent mode */ 449 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 450 451 /* 452 * netmap_admode selects the netmap mode to use. 
453 * Invalid values are reset to NETMAP_ADMODE_BEST 454 */ 455 enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ 456 NETMAP_ADMODE_NATIVE, /* either native or none */ 457 NETMAP_ADMODE_GENERIC, /* force generic */ 458 NETMAP_ADMODE_LAST }; 459 static int netmap_admode = NETMAP_ADMODE_BEST; 460 461 int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ 462 int netmap_generic_ringsize = 1024; /* Generic ringsize. */ 463 int netmap_generic_rings = 1; /* number of queues in generic. */ 464 465 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 466 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); 467 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); 468 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); 469 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); 470 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); 471 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); 472 473 NMG_LOCK_T netmap_global_lock; 474 475 476 static void 477 nm_kr_get(struct netmap_kring *kr) 478 { 479 while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) 480 tsleep(kr, 0, "NM_KR_GET", 4); 481 } 482 483 484 /* 485 * mark the ring as stopped, and run through the locks 486 * to make sure other users get to see it. 487 */ 488 static void 489 netmap_disable_ring(struct netmap_kring *kr) 490 { 491 kr->nkr_stopped = 1; 492 nm_kr_get(kr); 493 mtx_lock(&kr->q_lock); 494 mtx_unlock(&kr->q_lock); 495 nm_kr_put(kr); 496 } 497 498 /* stop or enable a single tx ring */ 499 void 500 netmap_set_txring(struct netmap_adapter *na, u_int ring_id, int stopped) 501 { 502 if (stopped) 503 netmap_disable_ring(na->tx_rings + ring_id); 504 else 505 na->tx_rings[ring_id].nkr_stopped = 0; 506 /* notify that the stopped state has changed. This is currently 507 * only used by bwrap to propagate the state to its own krings. 508 * (see netmap_bwrap_intr_notify). 509 */ 510 na->nm_notify(na, ring_id, NR_TX, NAF_DISABLE_NOTIFY); 511 } 512 513 /* stop or enable a single rx ring */ 514 void 515 netmap_set_rxring(struct netmap_adapter *na, u_int ring_id, int stopped) 516 { 517 if (stopped) 518 netmap_disable_ring(na->rx_rings + ring_id); 519 else 520 na->rx_rings[ring_id].nkr_stopped = 0; 521 /* notify that the stopped state has changed. This is currently 522 * only used by bwrap to propagate the state to its own krings. 523 * (see netmap_bwrap_intr_notify). 524 */ 525 na->nm_notify(na, ring_id, NR_RX, NAF_DISABLE_NOTIFY); 526 } 527 528 529 /* stop or enable all the rings of na */ 530 void 531 netmap_set_all_rings(struct netmap_adapter *na, int stopped) 532 { 533 int i; 534 u_int ntx, nrx; 535 536 if (!nm_netmap_on(na)) 537 return; 538 539 ntx = netmap_real_tx_rings(na); 540 nrx = netmap_real_rx_rings(na); 541 542 for (i = 0; i < ntx; i++) { 543 netmap_set_txring(na, i, stopped); 544 } 545 546 for (i = 0; i < nrx; i++) { 547 netmap_set_rxring(na, i, stopped); 548 } 549 } 550 551 /* 552 * Convenience function used in drivers. Waits for current txsync()s/rxsync()s 553 * to finish and prevents any new one from starting. Call this before turning 554 * netmap mode off, or before removing the hardware rings (e.g., on module 555 * unload). As a rule of thumb for linux drivers, this should be placed near 556 * each napi_disable().
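 * As a sketch only (the surrounding driver code is hypothetical), a
 * typical reinitialization path would look like:
 *
 *	netmap_disable_all_rings(ifp);	// wait for pending *_sync()s
 *	napi_disable(...);		// or the FreeBSD equivalent
 *	// ... reset and reprogram the hardware rings ...
 *	napi_enable(...);
 *	netmap_enable_all_rings(ifp);	// see below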
557 */ 558 void 559 netmap_disable_all_rings(struct ifnet *ifp) 560 { 561 netmap_set_all_rings(NA(ifp), 1 /* stopped */); 562 } 563 564 /* 565 * Convenience function used in drivers. Re-enables rxsync and txsync on the 566 * adapter's rings. In linux drivers, this should be placed near each 567 * napi_enable(). 568 */ 569 void 570 netmap_enable_all_rings(struct ifnet *ifp) 571 { 572 netmap_set_all_rings(NA(ifp), 0 /* enabled */); 573 } 574 575 576 /* 577 * generic bound_checking function 578 */ 579 u_int 580 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 581 { 582 u_int oldv = *v; 583 const char *op = NULL; 584 585 if (dflt < lo) 586 dflt = lo; 587 if (dflt > hi) 588 dflt = hi; 589 if (oldv < lo) { 590 *v = dflt; 591 op = "Bump"; 592 } else if (oldv > hi) { 593 *v = hi; 594 op = "Clamp"; 595 } 596 if (op && msg) 597 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 598 return *v; 599 } 600 601 602 /* 603 * packet-dump function, user-supplied or static buffer. 604 * The destination buffer must be at least 30+4*len 605 */ 606 const char * 607 nm_dump_buf(char *p, int len, int lim, char *dst) 608 { 609 static char _dst[8192]; 610 int i, j, i0; 611 static char hex[] ="0123456789abcdef"; 612 char *o; /* output position */ 613 614 #define P_HI(x) hex[((x) & 0xf0)>>4] 615 #define P_LO(x) hex[((x) & 0xf)] 616 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') 617 if (!dst) 618 dst = _dst; 619 if (lim <= 0 || lim > len) 620 lim = len; 621 o = dst; 622 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 623 o += strlen(o); 624 /* hexdump routine */ 625 for (i = 0; i < lim; ) { 626 sprintf(o, "%5d: ", i); 627 o += strlen(o); 628 memset(o, ' ', 48); 629 i0 = i; 630 for (j=0; j < 16 && i < lim; i++, j++) { 631 o[j*3] = P_HI(p[i]); 632 o[j*3+1] = P_LO(p[i]); 633 } 634 i = i0; 635 for (j=0; j < 16 && i < lim; i++, j++) 636 o[j + 48] = P_C(p[i]); 637 o[j+48] = '\n'; 638 o += j+49; 639 } 640 *o = '\0'; 641 #undef P_HI 642 #undef P_LO 643 #undef P_C 644 return dst; 645 } 646 647 648 /* 649 * Fetch configuration from the device, to cope with dynamic 650 * reconfigurations after loading the module.
651 */ 652 /* call with NMG_LOCK held */ 653 int 654 netmap_update_config(struct netmap_adapter *na) 655 { 656 u_int txr, txd, rxr, rxd; 657 658 txr = txd = rxr = rxd = 0; 659 if (na->nm_config) { 660 na->nm_config(na, &txr, &txd, &rxr, &rxd); 661 } else { 662 /* take whatever we had at init time */ 663 txr = na->num_tx_rings; 664 txd = na->num_tx_desc; 665 rxr = na->num_rx_rings; 666 rxd = na->num_rx_desc; 667 } 668 669 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 670 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 671 return 0; /* nothing changed */ 672 if (netmap_verbose || na->active_fds > 0) { 673 D("stored config %s: txring %d x %d, rxring %d x %d", 674 na->name, 675 na->num_tx_rings, na->num_tx_desc, 676 na->num_rx_rings, na->num_rx_desc); 677 D("new config %s: txring %d x %d, rxring %d x %d", 678 na->name, txr, txd, rxr, rxd); 679 } 680 if (na->active_fds == 0) { 681 D("configuration changed (but fine)"); 682 na->num_tx_rings = txr; 683 na->num_tx_desc = txd; 684 na->num_rx_rings = rxr; 685 na->num_rx_desc = rxd; 686 return 0; 687 } 688 D("configuration changed while active, this is bad..."); 689 return 1; 690 } 691 692 /* kring->nm_sync callback for the host tx ring */ 693 static int 694 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) 695 { 696 (void)flags; /* unused */ 697 netmap_txsync_to_host(kring->na); 698 return 0; 699 } 700 701 /* kring->nm_sync callback for the host rx ring */ 702 static int 703 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) 704 { 705 (void)flags; /* unused */ 706 netmap_rxsync_from_host(kring->na, NULL, NULL); 707 return 0; 708 } 709 710 711 712 /* create the krings array and initialize the fields common to all adapters. 713 * The array layout is this: 714 * 715 * +----------+ 716 * na->tx_rings ----->| | \ 717 * | | } na->num_tx_ring 718 * | | / 719 * +----------+ 720 * | | host tx kring 721 * na->rx_rings ----> +----------+ 722 * | | \ 723 * | | } na->num_rx_rings 724 * | | / 725 * +----------+ 726 * | | host rx kring 727 * +----------+ 728 * na->tailroom ----->| | \ 729 * | | } tailroom bytes 730 * | | / 731 * +----------+ 732 * 733 * Note: for compatibility, host krings are created even when not needed. 734 * The tailroom space is currently used by vale ports for allocating leases. 735 */ 736 /* call with NMG_LOCK held */ 737 int 738 netmap_krings_create(struct netmap_adapter *na, u_int tailroom) 739 { 740 u_int i, len, ndesc; 741 struct netmap_kring *kring; 742 u_int ntx, nrx; 743 744 /* account for the (possibly fake) host rings */ 745 ntx = na->num_tx_rings + 1; 746 nrx = na->num_rx_rings + 1; 747 748 len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; 749 750 na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); 751 if (na->tx_rings == NULL) { 752 D("Cannot allocate krings"); 753 return ENOMEM; 754 } 755 na->rx_rings = na->tx_rings + ntx; 756 757 /* 758 * All fields in krings are 0 except the one initialized below. 759 * but better be explicit on important kring fields. 760 */ 761 ndesc = na->num_tx_desc; 762 for (i = 0; i < ntx; i++) { /* Transmit rings */ 763 kring = &na->tx_rings[i]; 764 bzero(kring, sizeof(*kring)); 765 kring->na = na; 766 kring->ring_id = i; 767 kring->nkr_num_slots = ndesc; 768 if (i < na->num_tx_rings) { 769 kring->nm_sync = na->nm_txsync; 770 } else if (i == na->num_tx_rings) { 771 kring->nm_sync = netmap_txsync_to_host_compat; 772 } 773 /* 774 * IMPORTANT: Always keep one slot empty. 
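 * For example (illustrative numbers only): with nkr_num_slots == 512
 * a freshly created TX kring has rhead == rcur == nr_hwcur == 0 and
 * rtail == nr_hwtail == 511, so userspace sees 511 usable slots;
 * keeping one slot unused lets cur == tail unambiguously mean
 * "no space available" (nm_ring_empty()).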
775 */ 776 kring->rhead = kring->rcur = kring->nr_hwcur = 0; 777 kring->rtail = kring->nr_hwtail = ndesc - 1; 778 snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", na->name, i); 779 ND("ktx %s h %d c %d t %d", 780 kring->name, kring->rhead, kring->rcur, kring->rtail); 781 mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); 782 init_waitqueue_head(&kring->si); 783 } 784 785 ndesc = na->num_rx_desc; 786 for (i = 0; i < nrx; i++) { /* Receive rings */ 787 kring = &na->rx_rings[i]; 788 bzero(kring, sizeof(*kring)); 789 kring->na = na; 790 kring->ring_id = i; 791 kring->nkr_num_slots = ndesc; 792 if (i < na->num_rx_rings) { 793 kring->nm_sync = na->nm_rxsync; 794 } else if (i == na->num_rx_rings) { 795 kring->nm_sync = netmap_rxsync_from_host_compat; 796 } 797 kring->rhead = kring->rcur = kring->nr_hwcur = 0; 798 kring->rtail = kring->nr_hwtail = 0; 799 snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", na->name, i); 800 ND("krx %s h %d c %d t %d", 801 kring->name, kring->rhead, kring->rcur, kring->rtail); 802 mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); 803 init_waitqueue_head(&kring->si); 804 } 805 init_waitqueue_head(&na->tx_si); 806 init_waitqueue_head(&na->rx_si); 807 808 na->tailroom = na->rx_rings + nrx; 809 810 return 0; 811 } 812 813 814 #ifdef __FreeBSD__ 815 static void 816 netmap_knlist_destroy(NM_SELINFO_T *si) 817 { 818 /* XXX kqueue(9) needed; these will mirror knlist_init. */ 819 knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ ); 820 knlist_destroy(&si->si.si_note); 821 /* now we don't need the mutex anymore */ 822 mtx_destroy(&si->m); 823 } 824 #endif /* __FreeBSD__ */ 825 826 827 /* undo the actions performed by netmap_krings_create */ 828 /* call with NMG_LOCK held */ 829 void 830 netmap_krings_delete(struct netmap_adapter *na) 831 { 832 struct netmap_kring *kring = na->tx_rings; 833 834 /* we rely on the krings layout described above */ 835 for ( ; kring != na->tailroom; kring++) { 836 mtx_destroy(&kring->q_lock); 837 netmap_knlist_destroy(&kring->si); 838 } 839 free(na->tx_rings, M_DEVBUF); 840 na->tx_rings = na->rx_rings = na->tailroom = NULL; 841 } 842 843 844 /* 845 * Destructor for NIC ports. They also have an mbuf queue 846 * on the rings connected to the host so we need to purge 847 * them first. 848 */ 849 /* call with NMG_LOCK held */ 850 static void 851 netmap_hw_krings_delete(struct netmap_adapter *na) 852 { 853 struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; 854 855 ND("destroy sw mbq with len %d", mbq_len(q)); 856 mbq_purge(q); 857 mbq_safe_destroy(q); 858 netmap_krings_delete(na); 859 } 860 861 862 /* create a new netmap_if for a newly registered fd. 863 * If this is the first registration of the adapter, 864 * also create the netmap rings and their in-kernel view, 865 * the netmap krings. 866 */ 867 /* call with NMG_LOCK held */ 868 static struct netmap_if* 869 netmap_if_new(struct netmap_adapter *na) 870 { 871 struct netmap_if *nifp; 872 873 if (netmap_update_config(na)) { 874 /* configuration mismatch, report and fail */ 875 return NULL; 876 } 877 878 if (na->active_fds) /* already registered */ 879 goto final; 880 881 /* create and init the krings arrays. 
882 * Depending on the adapter, this may also create 883 * the netmap rings themselves 884 */ 885 if (na->nm_krings_create(na)) 886 return NULL; 887 888 /* create all missing netmap rings */ 889 if (netmap_mem_rings_create(na)) 890 goto cleanup; 891 892 final: 893 894 /* in all cases, create a new netmap if */ 895 nifp = netmap_mem_if_new(na); 896 if (nifp == NULL) 897 goto cleanup; 898 899 return (nifp); 900 901 cleanup: 902 903 if (na->active_fds == 0) { 904 netmap_mem_rings_delete(na); 905 na->nm_krings_delete(na); 906 } 907 908 return NULL; 909 } 910 911 912 /* grab a reference to the memory allocator, if we don't have one already. The 913 * reference is taken from the netmap_adapter registered with the priv. 914 */ 915 /* call with NMG_LOCK held */ 916 static int 917 netmap_get_memory_locked(struct netmap_priv_d* p) 918 { 919 struct netmap_mem_d *nmd; 920 int error = 0; 921 922 if (p->np_na == NULL) { 923 if (!netmap_mmap_unreg) 924 return ENODEV; 925 /* for compatibility with older versions of the API 926 * we use the global allocator when no interface has been 927 * registered 928 */ 929 nmd = &nm_mem; 930 } else { 931 nmd = p->np_na->nm_mem; 932 } 933 if (p->np_mref == NULL) { 934 error = netmap_mem_finalize(nmd, p->np_na); 935 if (!error) 936 p->np_mref = nmd; 937 } else if (p->np_mref != nmd) { 938 /* a virtual port has been registered, but previous 939 * syscalls already used the global allocator. 940 * We cannot continue 941 */ 942 error = ENODEV; 943 } 944 return error; 945 } 946 947 948 /* call with NMG_LOCK *not* held */ 949 int 950 netmap_get_memory(struct netmap_priv_d* p) 951 { 952 int error; 953 NMG_LOCK(); 954 error = netmap_get_memory_locked(p); 955 NMG_UNLOCK(); 956 return error; 957 } 958 959 960 /* call with NMG_LOCK held */ 961 static int 962 netmap_have_memory_locked(struct netmap_priv_d* p) 963 { 964 return p->np_mref != NULL; 965 } 966 967 968 /* call with NMG_LOCK held */ 969 static void 970 netmap_drop_memory_locked(struct netmap_priv_d* p) 971 { 972 if (p->np_mref) { 973 netmap_mem_deref(p->np_mref, p->np_na); 974 p->np_mref = NULL; 975 } 976 } 977 978 979 /* 980 * Call nm_register(ifp,0) to stop netmap mode on the interface and 981 * revert to normal operation. 982 * The second argument is the nifp to work on. In some cases it is 983 * not attached yet to the netmap_priv_d so we need to pass it as 984 * a separate argument. 985 */ 986 /* call with NMG_LOCK held */ 987 static void 988 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 989 { 990 struct netmap_adapter *na = priv->np_na; 991 992 NMG_LOCK_ASSERT(); 993 na->active_fds--; 994 if (na->active_fds <= 0) { /* last instance */ 995 996 if (netmap_verbose) 997 D("deleting last instance for %s", na->name); 998 /* 999 * (TO CHECK) This function is only called 1000 * when the last reference to this file descriptor goes 1001 * away. This means we cannot have any pending poll() 1002 * or interrupt routine operating on the structure. 1003 * XXX The file may be closed in a thread while 1004 * another thread is using it. 1005 * Linux keeps the file opened until the last reference 1006 * by any outstanding ioctl/poll or mmap is gone. 1007 * FreeBSD does not track mmap()s (but we do) and 1008 * wakes up any sleeping poll(). Need to check what 1009 * happens if the close() occurs while a concurrent 1010 * syscall is running. 1011 */ 1012 na->nm_register(na, 0); /* off, clear flags */ 1013 /* Wake up any sleeping threads. 
netmap_poll will 1014 * then return POLLERR 1015 * XXX The wake up now must happen during *_down(), when 1016 * we order all activities to stop. -gl 1017 */ 1018 netmap_knlist_destroy(&na->tx_si); 1019 netmap_knlist_destroy(&na->rx_si); 1020 1021 /* delete rings and buffers */ 1022 netmap_mem_rings_delete(na); 1023 na->nm_krings_delete(na); 1024 } 1025 /* delete the nifp */ 1026 netmap_mem_if_delete(na, nifp); 1027 } 1028 1029 /* call with NMG_LOCK held */ 1030 static __inline int 1031 nm_tx_si_user(struct netmap_priv_d *priv) 1032 { 1033 return (priv->np_na != NULL && 1034 (priv->np_txqlast - priv->np_txqfirst > 1)); 1035 } 1036 1037 /* call with NMG_LOCK held */ 1038 static __inline int 1039 nm_rx_si_user(struct netmap_priv_d *priv) 1040 { 1041 return (priv->np_na != NULL && 1042 (priv->np_rxqlast - priv->np_rxqfirst > 1)); 1043 } 1044 1045 1046 /* 1047 * Destructor of the netmap_priv_d, called when the fd has 1048 * no active open() and mmap(). Also called in error paths. 1049 * 1050 * returns 1 if this is the last instance and we can free priv 1051 */ 1052 /* call with NMG_LOCK held */ 1053 int 1054 netmap_dtor_locked(struct netmap_priv_d *priv) 1055 { 1056 struct netmap_adapter *na = priv->np_na; 1057 1058 #ifdef __FreeBSD__ 1059 /* 1060 * np_refcount is the number of active mmaps on 1061 * this file descriptor 1062 */ 1063 if (--priv->np_refcount > 0) { 1064 return 0; 1065 } 1066 #endif /* __FreeBSD__ */ 1067 if (!na) { 1068 return 1; //XXX is it correct? 1069 } 1070 netmap_do_unregif(priv, priv->np_nifp); 1071 priv->np_nifp = NULL; 1072 netmap_drop_memory_locked(priv); 1073 if (priv->np_na) { 1074 if (nm_tx_si_user(priv)) 1075 na->tx_si_users--; 1076 if (nm_rx_si_user(priv)) 1077 na->rx_si_users--; 1078 netmap_adapter_put(na); 1079 priv->np_na = NULL; 1080 } 1081 return 1; 1082 } 1083 1084 1085 /* call with NMG_LOCK *not* held */ 1086 void 1087 netmap_dtor(void *data) 1088 { 1089 struct netmap_priv_d *priv = data; 1090 int last_instance; 1091 1092 NMG_LOCK(); 1093 last_instance = netmap_dtor_locked(priv); 1094 NMG_UNLOCK(); 1095 if (last_instance) { 1096 bzero(priv, sizeof(*priv)); /* for safety */ 1097 free(priv, M_DEVBUF); 1098 } 1099 } 1100 1101 1102 1103 1104 /* 1105 * Handlers for synchronization of the queues from/to the host. 1106 * Netmap has two operating modes: 1107 * - in the default mode, the rings connected to the host stack are 1108 * just another ring pair managed by userspace; 1109 * - in transparent mode (XXX to be defined) incoming packets 1110 * (from the host or the NIC) are marked as NS_FORWARD upon 1111 * arrival, and the user application has a chance to reset the 1112 * flag for packets that should be dropped. 1113 * On the RXSYNC or poll(), packets in RX rings between 1114 * kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved 1115 * to the other side. 1116 * The transfer NIC --> host is relatively easy, just encapsulate 1117 * into mbufs and we are done. The host --> NIC side is slightly 1118 * harder because there might not be room in the tx ring so it 1119 * might take a while before releasing the buffer. 1120 */ 1121 1122 1123 /* 1124 * pass a chain of buffers to the host stack as coming from 'dst' 1125 * We do not need to lock because the queue is private.
1126 */ 1127 static void 1128 netmap_send_up(struct ifnet *dst, struct mbq *q) 1129 { 1130 struct mbuf *m; 1131 1132 /* send packets up, outside the lock */ 1133 while ((m = mbq_dequeue(q)) != NULL) { 1134 if (netmap_verbose & NM_VERB_HOST) 1135 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 1136 NM_SEND_UP(dst, m); 1137 } 1138 mbq_destroy(q); 1139 } 1140 1141 1142 /* 1143 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 1144 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) 1145 * and pass them up. Drop remaining packets in the unlikely event 1146 * of an mbuf shortage. 1147 */ 1148 static void 1149 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 1150 { 1151 u_int const lim = kring->nkr_num_slots - 1; 1152 u_int const head = kring->ring->head; 1153 u_int n; 1154 struct netmap_adapter *na = kring->na; 1155 1156 for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { 1157 struct mbuf *m; 1158 struct netmap_slot *slot = &kring->ring->slot[n]; 1159 1160 if ((slot->flags & NS_FORWARD) == 0 && !force) 1161 continue; 1162 if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) { 1163 RD(5, "bad pkt at %d len %d", n, slot->len); 1164 continue; 1165 } 1166 slot->flags &= ~NS_FORWARD; // XXX needed ? 1167 /* XXX TODO: adapt to the case of a multisegment packet */ 1168 m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL); 1169 1170 if (m == NULL) 1171 break; 1172 mbq_enqueue(q, m); 1173 } 1174 } 1175 1176 1177 /* 1178 * Send to the NIC rings packets marked NS_FORWARD between 1179 * kring->nr_hwcur and kring->rhead 1180 * Called under kring->rx_queue.lock on the sw rx ring, 1181 */ 1182 static u_int 1183 netmap_sw_to_nic(struct netmap_adapter *na) 1184 { 1185 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1186 struct netmap_slot *rxslot = kring->ring->slot; 1187 u_int i, rxcur = kring->nr_hwcur; 1188 u_int const head = kring->rhead; 1189 u_int const src_lim = kring->nkr_num_slots - 1; 1190 u_int sent = 0; 1191 1192 /* scan rings to find space, then fill as much as possible */ 1193 for (i = 0; i < na->num_tx_rings; i++) { 1194 struct netmap_kring *kdst = &na->tx_rings[i]; 1195 struct netmap_ring *rdst = kdst->ring; 1196 u_int const dst_lim = kdst->nkr_num_slots - 1; 1197 1198 /* XXX do we trust ring or kring->rcur,rtail ? */ 1199 for (; rxcur != head && !nm_ring_empty(rdst); 1200 rxcur = nm_next(rxcur, src_lim) ) { 1201 struct netmap_slot *src, *dst, tmp; 1202 u_int dst_cur = rdst->cur; 1203 1204 src = &rxslot[rxcur]; 1205 if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) 1206 continue; 1207 1208 sent++; 1209 1210 dst = &rdst->slot[dst_cur]; 1211 1212 tmp = *src; 1213 1214 src->buf_idx = dst->buf_idx; 1215 src->flags = NS_BUF_CHANGED; 1216 1217 dst->buf_idx = tmp.buf_idx; 1218 dst->len = tmp.len; 1219 dst->flags = NS_BUF_CHANGED; 1220 1221 rdst->cur = nm_next(dst_cur, dst_lim); 1222 } 1223 /* if (sent) XXX txsync ? */ 1224 } 1225 return sent; 1226 } 1227 1228 1229 /* 1230 * netmap_txsync_to_host() passes packets up. We are called from a 1231 * system call in user process context, and the only contention 1232 * can be among multiple user threads erroneously calling 1233 * this routine concurrently. 
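 *
 * For illustration only, userspace reaches this path by binding the
 * host rings and then issuing a txsync (error handling omitted; see
 * also NR_REG_SW in netmap_interp_ringid() below):
 *
 *	nmr.nr_ringid = NETMAP_SW_RING;	// old-style request for host rings
 *	ioctl(fd, NIOCREGIF, &nmr);
 *	// ... fill the host tx ring slots ...
 *	ioctl(fd, NIOCTXSYNC, NULL);	// ends up in netmap_txsync_to_host()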
1234 */ 1235 void 1236 netmap_txsync_to_host(struct netmap_adapter *na) 1237 { 1238 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 1239 struct netmap_ring *ring = kring->ring; 1240 u_int const lim = kring->nkr_num_slots - 1; 1241 u_int const head = kring->rhead; 1242 struct mbq q; 1243 1244 /* Take packets from hwcur to head and pass them up. 1245 * force head = cur since netmap_grab_packets() stops at head 1246 * In case of no buffers we give up. At the end of the loop, 1247 * the queue is drained in all cases. 1248 */ 1249 mbq_init(&q); 1250 ring->cur = head; 1251 netmap_grab_packets(kring, &q, 1 /* force */); 1252 ND("have %d pkts in queue", mbq_len(&q)); 1253 kring->nr_hwcur = head; 1254 kring->nr_hwtail = head + lim; 1255 if (kring->nr_hwtail > lim) 1256 kring->nr_hwtail -= lim + 1; 1257 nm_txsync_finalize(kring); 1258 1259 netmap_send_up(na->ifp, &q); 1260 } 1261 1262 1263 /* 1264 * rxsync backend for packets coming from the host stack. 1265 * They have been put in kring->rx_queue by netmap_transmit(). 1266 * We protect access to the kring using kring->rx_queue.lock 1267 * 1268 * This routine also does the selrecord if called from the poll handler 1269 * (we know because td != NULL). 1270 * 1271 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 1272 * as an additional hidden argument. 1273 * returns the number of packets delivered to tx queues in 1274 * transparent mode, or a negative value if error 1275 */ 1276 int 1277 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 1278 { 1279 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1280 struct netmap_ring *ring = kring->ring; 1281 u_int nm_i, n; 1282 u_int const lim = kring->nkr_num_slots - 1; 1283 u_int const head = kring->rhead; 1284 int ret = 0; 1285 struct mbq *q = &kring->rx_queue; 1286 1287 (void)pwait; /* disable unused warnings */ 1288 (void)td; 1289 1290 mbq_lock(q); 1291 1292 /* First part: import newly received packets */ 1293 n = mbq_len(q); 1294 if (n) { /* grab packets from the queue */ 1295 struct mbuf *m; 1296 uint32_t stop_i; 1297 1298 nm_i = kring->nr_hwtail; 1299 stop_i = nm_prev(nm_i, lim); 1300 while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { 1301 int len = MBUF_LEN(m); 1302 struct netmap_slot *slot = &ring->slot[nm_i]; 1303 1304 m_copydata(m, 0, len, NMB(na, slot)); 1305 ND("nm %d len %d", nm_i, len); 1306 if (netmap_verbose) 1307 D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL)); 1308 1309 slot->len = len; 1310 slot->flags = kring->nkr_slot_flags; 1311 nm_i = nm_next(nm_i, lim); 1312 m_freem(m); 1313 } 1314 kring->nr_hwtail = nm_i; 1315 } 1316 1317 /* 1318 * Second part: skip past packets that userspace has released. 1319 */ 1320 nm_i = kring->nr_hwcur; 1321 if (nm_i != head) { /* something was released */ 1322 if (netmap_fwd || kring->ring->flags & NR_FORWARD) 1323 ret = netmap_sw_to_nic(na); 1324 kring->nr_hwcur = head; 1325 } 1326 1327 nm_rxsync_finalize(kring); 1328 1329 /* access copies of cur,tail in the kring */ 1330 if (kring->rcur == kring->rtail && td) /* no bufs available */ 1331 OS_selrecord(td, &kring->si); 1332 1333 mbq_unlock(q); 1334 return ret; 1335 } 1336 1337 1338 /* Get a netmap adapter for the port. 1339 * 1340 * If it is possible to satisfy the request, return 0 1341 * with *na containing the netmap adapter found. 1342 * Otherwise return an error code, with *na containing NULL. 1343 * 1344 * When the port is attached to a bridge, we always return 1345 * EBUSY. 
1346 * Otherwise, if the port is already bound to a file descriptor, 1347 * then we unconditionally return the existing adapter into *na. 1348 * In all the other cases, we return (into *na) either native, 1349 * generic or NULL, according to the following table: 1350 * 1351 * native_support 1352 * active_fds dev.netmap.admode YES NO 1353 * ------------------------------------------------------- 1354 * >0 * NA(ifp) NA(ifp) 1355 * 1356 * 0 NETMAP_ADMODE_BEST NATIVE GENERIC 1357 * 0 NETMAP_ADMODE_NATIVE NATIVE NULL 1358 * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC 1359 * 1360 */ 1361 1362 int 1363 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) 1364 { 1365 /* generic support */ 1366 int i = netmap_admode; /* Take a snapshot. */ 1367 int error = 0; 1368 struct netmap_adapter *prev_na; 1369 struct netmap_generic_adapter *gna; 1370 1371 *na = NULL; /* default */ 1372 1373 /* reset in case of invalid value */ 1374 if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) 1375 i = netmap_admode = NETMAP_ADMODE_BEST; 1376 1377 if (NETMAP_CAPABLE(ifp)) { 1378 prev_na = NA(ifp); 1379 /* If an adapter already exists, return it if 1380 * there are active file descriptors or if 1381 * netmap is not forced to use generic 1382 * adapters. 1383 */ 1384 if (NETMAP_OWNED_BY_ANY(prev_na) 1385 || i != NETMAP_ADMODE_GENERIC 1386 || prev_na->na_flags & NAF_FORCE_NATIVE 1387 #ifdef WITH_PIPES 1388 /* ugly, but we cannot allow an adapter switch 1389 * if some pipe is referring to this one 1390 */ 1391 || prev_na->na_next_pipe > 0 1392 #endif 1393 ) { 1394 *na = prev_na; 1395 return 0; 1396 } 1397 } 1398 1399 /* If there isn't native support and netmap is not allowed 1400 * to use generic adapters, we cannot satisfy the request. 1401 */ 1402 if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) 1403 return EOPNOTSUPP; 1404 1405 /* Otherwise, create a generic adapter and return it, 1406 * saving the previously used netmap adapter, if any. 1407 * 1408 * Note that here 'prev_na', if not NULL, MUST be a 1409 * native adapter, and CANNOT be a generic one. This is 1410 * true because generic adapters are created on demand, and 1411 * destroyed when not used anymore. Therefore, if the adapter 1412 * currently attached to an interface 'ifp' is generic, it 1413 * must be that 1414 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). 1415 * Consequently, if NA(ifp) is generic, we will enter one of 1416 * the branches above. This ensures that we never override 1417 * a generic adapter with another generic adapter. 1418 */ 1419 prev_na = NA(ifp); 1420 error = generic_netmap_attach(ifp); 1421 if (error) 1422 return error; 1423 1424 *na = NA(ifp); 1425 gna = (struct netmap_generic_adapter*)NA(ifp); 1426 gna->prev = prev_na; /* save old na */ 1427 if (prev_na != NULL) { 1428 ifunit_ref(ifp->if_xname); 1429 // XXX add a refcount ? 1430 netmap_adapter_get(prev_na); 1431 } 1432 ND("Created generic NA %p (prev %p)", gna, gna->prev); 1433 1434 return 0; 1435 } 1436 1437 1438 /* 1439 * MUST BE CALLED UNDER NMG_LOCK() 1440 * 1441 * Get a refcounted reference to a netmap adapter attached 1442 * to the interface specified by nmr. 1443 * This is always called in the execution of an ioctl(). 1444 * 1445 * Return ENXIO if the interface specified by the request does 1446 * not exist, ENOTSUP if netmap is not supported by the interface, 1447 * EBUSY if the interface is already attached to a bridge, 1448 * EINVAL if parameters are invalid, ENOMEM if needed resources 1449 * could not be allocated. 
If successful, hold a reference to the netmap adapter. 1451 * 1452 * No reference is kept on the real interface, which may then 1453 * disappear at any time. 1454 */ 1455 int 1456 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) 1457 { 1458 struct ifnet *ifp = NULL; 1459 int error = 0; 1460 struct netmap_adapter *ret = NULL; 1461 1462 *na = NULL; /* default return value */ 1463 1464 NMG_LOCK_ASSERT(); 1465 1466 /* we cascade through all possible types of netmap adapter. 1467 * All netmap_get_*_na() functions return an error and an na, 1468 * with the following combinations: 1469 * 1470 * error na 1471 * 0 NULL type doesn't match 1472 * !0 NULL type matches, but na creation/lookup failed 1473 * 0 !NULL type matches and na created/found 1474 * !0 !NULL impossible 1475 */ 1476 1477 /* try to see if this is a monitor port */ 1478 error = netmap_get_monitor_na(nmr, na, create); 1479 if (error || *na != NULL) 1480 return error; 1481 1482 /* try to see if this is a pipe port */ 1483 error = netmap_get_pipe_na(nmr, na, create); 1484 if (error || *na != NULL) 1485 return error; 1486 1487 /* try to see if this is a bridge port */ 1488 error = netmap_get_bdg_na(nmr, na, create); 1489 if (error) 1490 return error; 1491 1492 if (*na != NULL) /* valid match in netmap_get_bdg_na() */ 1493 goto pipes; 1494 1495 /* 1496 * This must be a hardware na, lookup the name in the system. 1497 * Note that by hardware we actually mean "it shows up in ifconfig". 1498 * This may still be a tap, a veth/epair, or even a 1499 * persistent VALE port. 1500 */ 1501 ifp = ifunit_ref(nmr->nr_name); 1502 if (ifp == NULL) { 1503 return ENXIO; 1504 } 1505 1506 error = netmap_get_hw_na(ifp, &ret); 1507 if (error) 1508 goto out; 1509 1510 *na = ret; 1511 netmap_adapter_get(ret); 1512 1513 pipes: 1514 /* 1515 * If we are opening a pipe whose parent was not in netmap mode, 1516 * we have to allocate the pipe array now. 1517 * XXX get rid of this clumsiness (2014-03-15) 1518 */ 1519 error = netmap_pipe_alloc(*na, nmr); 1520 1521 out: 1522 if (error && ret != NULL) 1523 netmap_adapter_put(ret); 1524 1525 if (ifp) 1526 if_rele(ifp); /* allow live unloading of driver modules */ 1527 1528 return error; 1529 } 1530 1531 1532 /* 1533 * validate parameters on entry for *_txsync() 1534 * Returns ring->head if ok, or something >= kring->nkr_num_slots 1535 * in case of error. 1536 * 1537 * rhead, rcur and rtail=hwtail are stored from previous round. 1538 * hwcur is the next packet to send to the ring. 1539 * 1540 * We want 1541 * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail 1542 * 1543 * hwcur, rhead, rtail and hwtail are reliable 1544 */ 1545 u_int 1546 nm_txsync_prologue(struct netmap_kring *kring) 1547 { 1548 struct netmap_ring *ring = kring->ring; 1549 u_int head = ring->head; /* read only once */ 1550 u_int cur = ring->cur; /* read only once */ 1551 u_int n = kring->nkr_num_slots; 1552 1553 ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", 1554 kring->name, 1555 kring->nr_hwcur, kring->nr_hwtail, 1556 ring->head, ring->cur, ring->tail); 1557 #if 1 /* kernel sanity checks; but we can trust the kring. */ 1558 if (kring->nr_hwcur >= n || kring->rhead >= n || 1559 kring->rtail >= n || kring->nr_hwtail >= n) 1560 goto error; 1561 #endif /* kernel sanity checks */ 1562 /* 1563 * user sanity checks. We only use 'cur', 1564 * A, B, ... are possible positions for cur: 1565 * 1566 * 0 A cur B tail C n-1 1567 * 0 D tail E cur F n-1 1568 * 1569 * B, F, D are valid.
A, C, E are wrong 1570 */ 1571 if (kring->rtail >= kring->rhead) { 1572 /* want rhead <= head <= rtail */ 1573 if (head < kring->rhead || head > kring->rtail) 1574 goto error; 1575 /* and also head <= cur <= rtail */ 1576 if (cur < head || cur > kring->rtail) 1577 goto error; 1578 } else { /* here rtail < rhead */ 1579 /* we need head outside rtail .. rhead */ 1580 if (head > kring->rtail && head < kring->rhead) 1581 goto error; 1582 1583 /* two cases now: head <= rtail or head >= rhead */ 1584 if (head <= kring->rtail) { 1585 /* want head <= cur <= rtail */ 1586 if (cur < head || cur > kring->rtail) 1587 goto error; 1588 } else { /* head >= rhead */ 1589 /* cur must be outside rtail..head */ 1590 if (cur > kring->rtail && cur < head) 1591 goto error; 1592 } 1593 } 1594 if (ring->tail != kring->rtail) { 1595 RD(5, "tail overwritten was %d need %d", 1596 ring->tail, kring->rtail); 1597 ring->tail = kring->rtail; 1598 } 1599 kring->rhead = head; 1600 kring->rcur = cur; 1601 return head; 1602 1603 error: 1604 RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d", 1605 kring->name, 1606 kring->nr_hwcur, 1607 kring->rcur, kring->nr_hwtail, 1608 cur, ring->tail); 1609 return n; 1610 } 1611 1612 1613 /* 1614 * validate parameters on entry for *_rxsync() 1615 * Returns ring->head if ok, kring->nkr_num_slots on error. 1616 * 1617 * For a valid configuration, 1618 * hwcur <= head <= cur <= tail <= hwtail 1619 * 1620 * We only consider head and cur. 1621 * hwcur and hwtail are reliable. 1622 * 1623 */ 1624 u_int 1625 nm_rxsync_prologue(struct netmap_kring *kring) 1626 { 1627 struct netmap_ring *ring = kring->ring; 1628 uint32_t const n = kring->nkr_num_slots; 1629 uint32_t head, cur; 1630 1631 ND("%s kc %d kt %d h %d c %d t %d", 1632 kring->name, 1633 kring->nr_hwcur, kring->nr_hwtail, 1634 ring->head, ring->cur, ring->tail); 1635 /* 1636 * Before storing the new values, we should check they do not 1637 * move backwards. 
However: 1638 * - head is not an issue because the previous value is hwcur; 1639 * - cur could in principle go back, however it does not matter 1640 * because we are processing a brand new rxsync() 1641 */ 1642 cur = kring->rcur = ring->cur; /* read only once */ 1643 head = kring->rhead = ring->head; /* read only once */ 1644 #if 1 /* kernel sanity checks */ 1645 if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) 1646 goto error; 1647 #endif /* kernel sanity checks */ 1648 /* user sanity checks */ 1649 if (kring->nr_hwtail >= kring->nr_hwcur) { 1650 /* want hwcur <= rhead <= hwtail */ 1651 if (head < kring->nr_hwcur || head > kring->nr_hwtail) 1652 goto error; 1653 /* and also rhead <= rcur <= hwtail */ 1654 if (cur < head || cur > kring->nr_hwtail) 1655 goto error; 1656 } else { 1657 /* we need rhead outside hwtail..hwcur */ 1658 if (head < kring->nr_hwcur && head > kring->nr_hwtail) 1659 goto error; 1660 /* two cases now: head <= hwtail or head >= hwcur */ 1661 if (head <= kring->nr_hwtail) { 1662 /* want head <= cur <= hwtail */ 1663 if (cur < head || cur > kring->nr_hwtail) 1664 goto error; 1665 } else { 1666 /* cur must be outside hwtail..head */ 1667 if (cur < head && cur > kring->nr_hwtail) 1668 goto error; 1669 } 1670 } 1671 if (ring->tail != kring->rtail) { 1672 RD(5, "%s tail overwritten was %d need %d", 1673 kring->name, 1674 ring->tail, kring->rtail); 1675 ring->tail = kring->rtail; 1676 } 1677 return head; 1678 1679 error: 1680 RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", 1681 kring->nr_hwcur, 1682 kring->rcur, kring->nr_hwtail, 1683 kring->rhead, kring->rcur, ring->tail); 1684 return n; 1685 } 1686 1687 1688 /* 1689 * Error routine called when txsync/rxsync detects an error. 1690 * Can't do much more than resetting head =cur = hwcur, tail = hwtail 1691 * Return 1 on reinit. 1692 * 1693 * This routine is only called by the upper half of the kernel. 1694 * It only reads hwcur (which is changed only by the upper half, too) 1695 * and hwtail (which may be changed by the lower half, but only on 1696 * a tx ring and only to increase it, so any error will be recovered 1697 * on the next call). For the above, we don't strictly need to call 1698 * it under lock. 
1699 */ 1700 int 1701 netmap_ring_reinit(struct netmap_kring *kring) 1702 { 1703 struct netmap_ring *ring = kring->ring; 1704 u_int i, lim = kring->nkr_num_slots - 1; 1705 int errors = 0; 1706 1707 // XXX KASSERT nm_kr_tryget 1708 RD(10, "called for %s", kring->name); 1709 // XXX probably wrong to trust userspace 1710 kring->rhead = ring->head; 1711 kring->rcur = ring->cur; 1712 kring->rtail = ring->tail; 1713 1714 if (ring->cur > lim) 1715 errors++; 1716 if (ring->head > lim) 1717 errors++; 1718 if (ring->tail > lim) 1719 errors++; 1720 for (i = 0; i <= lim; i++) { 1721 u_int idx = ring->slot[i].buf_idx; 1722 u_int len = ring->slot[i].len; 1723 if (idx < 2 || idx >= netmap_total_buffers) { 1724 RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); 1725 ring->slot[i].buf_idx = 0; 1726 ring->slot[i].len = 0; 1727 } else if (len > NETMAP_BUF_SIZE(kring->na)) { 1728 ring->slot[i].len = 0; 1729 RD(5, "bad len at slot %d idx %d len %d", i, idx, len); 1730 } 1731 } 1732 if (errors) { 1733 RD(10, "total %d errors", errors); 1734 RD(10, "%s reinit, cur %d -> %d tail %d -> %d", 1735 kring->name, 1736 ring->cur, kring->nr_hwcur, 1737 ring->tail, kring->nr_hwtail); 1738 ring->head = kring->rhead = kring->nr_hwcur; 1739 ring->cur = kring->rcur = kring->nr_hwcur; 1740 ring->tail = kring->rtail = kring->nr_hwtail; 1741 } 1742 return (errors ? 1 : 0); 1743 } 1744 1745 /* interpret the ringid and flags fields of an nmreq, by translating them 1746 * into a pair of intervals of ring indices: 1747 * 1748 * [priv->np_txqfirst, priv->np_txqlast) and 1749 * [priv->np_rxqfirst, priv->np_rxqlast) 1750 * 1751 */ 1752 int 1753 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) 1754 { 1755 struct netmap_adapter *na = priv->np_na; 1756 u_int j, i = ringid & NETMAP_RING_MASK; 1757 u_int reg = flags & NR_REG_MASK; 1758 1759 if (reg == NR_REG_DEFAULT) { 1760 /* convert from old ringid to flags */ 1761 if (ringid & NETMAP_SW_RING) { 1762 reg = NR_REG_SW; 1763 } else if (ringid & NETMAP_HW_RING) { 1764 reg = NR_REG_ONE_NIC; 1765 } else { 1766 reg = NR_REG_ALL_NIC; 1767 } 1768 D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); 1769 } 1770 switch (reg) { 1771 case NR_REG_ALL_NIC: 1772 case NR_REG_PIPE_MASTER: 1773 case NR_REG_PIPE_SLAVE: 1774 priv->np_txqfirst = 0; 1775 priv->np_txqlast = na->num_tx_rings; 1776 priv->np_rxqfirst = 0; 1777 priv->np_rxqlast = na->num_rx_rings; 1778 ND("%s %d %d", "ALL/PIPE", 1779 priv->np_rxqfirst, priv->np_rxqlast); 1780 break; 1781 case NR_REG_SW: 1782 case NR_REG_NIC_SW: 1783 if (!(na->na_flags & NAF_HOST_RINGS)) { 1784 D("host rings not supported"); 1785 return EINVAL; 1786 } 1787 priv->np_txqfirst = (reg == NR_REG_SW ? 1788 na->num_tx_rings : 0); 1789 priv->np_txqlast = na->num_tx_rings + 1; 1790 priv->np_rxqfirst = (reg == NR_REG_SW ? 1791 na->num_rx_rings : 0); 1792 priv->np_rxqlast = na->num_rx_rings + 1; 1793 ND("%s %d %d", reg == NR_REG_SW ? 
"SW" : "NIC+SW", 1794 priv->np_rxqfirst, priv->np_rxqlast); 1795 break; 1796 case NR_REG_ONE_NIC: 1797 if (i >= na->num_tx_rings && i >= na->num_rx_rings) { 1798 D("invalid ring id %d", i); 1799 return EINVAL; 1800 } 1801 /* if not enough rings, use the first one */ 1802 j = i; 1803 if (j >= na->num_tx_rings) 1804 j = 0; 1805 priv->np_txqfirst = j; 1806 priv->np_txqlast = j + 1; 1807 j = i; 1808 if (j >= na->num_rx_rings) 1809 j = 0; 1810 priv->np_rxqfirst = j; 1811 priv->np_rxqlast = j + 1; 1812 break; 1813 default: 1814 D("invalid regif type %d", reg); 1815 return EINVAL; 1816 } 1817 priv->np_flags = (flags & ~NR_REG_MASK) | reg; 1818 1819 if (netmap_verbose) { 1820 D("%s: tx [%d,%d) rx [%d,%d) id %d", 1821 na->name, 1822 priv->np_txqfirst, 1823 priv->np_txqlast, 1824 priv->np_rxqfirst, 1825 priv->np_rxqlast, 1826 i); 1827 } 1828 return 0; 1829 } 1830 1831 1832 /* 1833 * Set the ring ID. For devices with a single queue, a request 1834 * for all rings is the same as a single ring. 1835 */ 1836 static int 1837 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) 1838 { 1839 struct netmap_adapter *na = priv->np_na; 1840 int error; 1841 1842 error = netmap_interp_ringid(priv, ringid, flags); 1843 if (error) { 1844 return error; 1845 } 1846 1847 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 1848 1849 /* optimization: count the users registered for more than 1850 * one ring, which are the ones sleeping on the global queue. 1851 * The default netmap_notify() callback will then 1852 * avoid signaling the global queue if nobody is using it 1853 */ 1854 if (nm_tx_si_user(priv)) 1855 na->tx_si_users++; 1856 if (nm_rx_si_user(priv)) 1857 na->rx_si_users++; 1858 return 0; 1859 } 1860 1861 /* 1862 * possibly move the interface to netmap-mode. 1863 * If success it returns a pointer to netmap_if, otherwise NULL. 1864 * This must be called with NMG_LOCK held. 1865 * 1866 * The following na callbacks are called in the process: 1867 * 1868 * na->nm_config() [by netmap_update_config] 1869 * (get current number and size of rings) 1870 * 1871 * We have a generic one for linux (netmap_linux_config). 1872 * The bwrap has to override this, since it has to forward 1873 * the request to the wrapped adapter (netmap_bwrap_config). 1874 * 1875 * XXX netmap_if_new calls this again (2014-03-15) 1876 * 1877 * na->nm_krings_create() [by netmap_if_new] 1878 * (create and init the krings array) 1879 * 1880 * One of the following: 1881 * 1882 * * netmap_hw_krings_create, (hw ports) 1883 * creates the standard layout for the krings 1884 * and adds the mbq (used for the host rings). 
1885  *
1886  *	* netmap_vp_krings_create			(VALE ports)
1887  *		add leases and scratchpads
1888  *
1889  *	* netmap_pipe_krings_create			(pipes)
1890  *		create the krings and rings of both ends and
1891  *		cross-link them
1892  *
1893  *	* netmap_monitor_krings_create			(monitors)
1894  *		avoid allocating the mbq
1895  *
1896  *	* netmap_bwrap_krings_create			(bwraps)
1897  *		create both the bwrap krings array,
1898  *		the krings array of the wrapped adapter, and
1899  *		(if needed) the fake array for the host adapter
1900  *
1901  * na->nm_register(, 1)
1902  *	(put the adapter in netmap mode)
1903  *
1904  *	This may be one of the following:
1905  *	(XXX these should be either all *_register or all *_reg 2014-03-15)
1906  *
1907  *	* netmap_hw_register				(hw ports)
1908  *		checks that the ifp is still there, then calls
1909  *		the hardware specific callback;
1910  *
1911  *	* netmap_vp_reg					(VALE ports)
1912  *		If the port is connected to a bridge,
1913  *		set the NAF_NETMAP_ON flag under the
1914  *		bridge write lock.
1915  *
1916  *	* netmap_pipe_reg				(pipes)
1917  *		inform the other pipe end that it is no
1918  *		longer responsible for the lifetime of this
1919  *		pipe end
1920  *
1921  *	* netmap_monitor_reg				(monitors)
1922  *		intercept the sync callbacks of the monitored
1923  *		rings
1924  *
1925  *	* netmap_bwrap_register				(bwraps)
1926  *		cross-link the bwrap and hwna rings,
1927  *		forward the request to the hwna, override
1928  *		the hwna notify callback (so that frames
1929  *		coming from outside go through the bridge).
1930  *
1931  * XXX maybe netmap_if_new() should be merged with this (2014-03-15).
1932  *
1933  */
1934 struct netmap_if *
1935 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1936 	uint16_t ringid, uint32_t flags, int *err)
1937 {
1938 	struct netmap_if *nifp = NULL;
1939 	int error, need_mem = 0;
1940 
1941 	NMG_LOCK_ASSERT();
1942 	/* ring configuration may have changed, fetch from the card */
1943 	netmap_update_config(na);
1944 	priv->np_na = na;	/* store the reference */
1945 	error = netmap_set_ringid(priv, ringid, flags);
1946 	if (error)
1947 		goto out;
1948 	/* ensure allocators are ready */
1949 	need_mem = !netmap_have_memory_locked(priv);
1950 	if (need_mem) {
1951 		error = netmap_get_memory_locked(priv);
1952 		ND("get_memory returned %d", error);
1953 		if (error)
1954 			goto out;
1955 	}
1956 	/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
1957 	nifp = netmap_if_new(na);
1958 	if (nifp == NULL) { /* allocation failed */
1959 		error = ENOMEM;
1960 		goto out;
1961 	}
1962 	na->active_fds++;
1963 	if (!nm_netmap_on(na)) {
1964 		/* Netmap not active, set the card in netmap mode
1965 		 * and make it use the shared buffers.
1966 		 */
1967 		/* cache the allocator info in the na */
1968 		na->na_lut = netmap_mem_get_lut(na->nm_mem);
1969 		ND("%p->na_lut == %p", na, na->na_lut);
1970 		na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem);
1971 		na->na_lut_objsize = netmap_mem_get_bufsize(na->nm_mem);
1972 		error = na->nm_register(na, 1); /* mode on */
1973 		if (error) {
1974 			netmap_do_unregif(priv, nifp);
1975 			nifp = NULL;
1976 		}
1977 	}
1978 out:
1979 	*err = error;
1980 	if (error) {
1981 		/* we should drop the allocator, but only
1982 		 * if we were the ones who grabbed it
1983 		 */
1984 		if (need_mem)
1985 			netmap_drop_memory_locked(priv);
1986 		priv->np_na = NULL;
1987 	}
1988 	if (nifp != NULL) {
1989 		/*
1990 		 * advertise that the interface is ready by setting np_nifp.
1991 * The barrier is needed because readers (poll and *SYNC) 1992 * check for priv->np_nifp != NULL without locking 1993 */ 1994 wmb(); /* make sure previous writes are visible to all CPUs */ 1995 priv->np_nifp = nifp; 1996 } 1997 return nifp; 1998 } 1999 2000 2001 2002 /* 2003 * ioctl(2) support for the "netmap" device. 2004 * 2005 * Following a list of accepted commands: 2006 * - NIOCGINFO 2007 * - SIOCGIFADDR just for convenience 2008 * - NIOCREGIF 2009 * - NIOCTXSYNC 2010 * - NIOCRXSYNC 2011 * 2012 * Return 0 on success, errno otherwise. 2013 */ 2014 int 2015 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 2016 int fflag, struct thread *td) 2017 { 2018 struct netmap_priv_d *priv = NULL; 2019 struct nmreq *nmr = (struct nmreq *) data; 2020 struct netmap_adapter *na = NULL; 2021 int error; 2022 u_int i, qfirst, qlast; 2023 struct netmap_if *nifp; 2024 struct netmap_kring *krings; 2025 2026 (void)dev; /* UNUSED */ 2027 (void)fflag; /* UNUSED */ 2028 2029 if (cmd == NIOCGINFO || cmd == NIOCREGIF) { 2030 /* truncate name */ 2031 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; 2032 if (nmr->nr_version != NETMAP_API) { 2033 D("API mismatch for %s got %d need %d", 2034 nmr->nr_name, 2035 nmr->nr_version, NETMAP_API); 2036 nmr->nr_version = NETMAP_API; 2037 } 2038 if (nmr->nr_version < NETMAP_MIN_API || 2039 nmr->nr_version > NETMAP_MAX_API) { 2040 return EINVAL; 2041 } 2042 } 2043 CURVNET_SET(TD_TO_VNET(td)); 2044 2045 error = devfs_get_cdevpriv((void **)&priv); 2046 if (error) { 2047 CURVNET_RESTORE(); 2048 /* XXX ENOENT should be impossible, since the priv 2049 * is now created in the open */ 2050 return (error == ENOENT ? ENXIO : error); 2051 } 2052 2053 switch (cmd) { 2054 case NIOCGINFO: /* return capabilities etc */ 2055 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 2056 error = netmap_bdg_ctl(nmr, NULL); 2057 break; 2058 } 2059 2060 NMG_LOCK(); 2061 do { 2062 /* memsize is always valid */ 2063 struct netmap_mem_d *nmd = &nm_mem; 2064 u_int memflags; 2065 2066 if (nmr->nr_name[0] != '\0') { 2067 /* get a refcount */ 2068 error = netmap_get_na(nmr, &na, 1 /* create */); 2069 if (error) 2070 break; 2071 nmd = na->nm_mem; /* get memory allocator */ 2072 } 2073 2074 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, 2075 &nmr->nr_arg2); 2076 if (error) 2077 break; 2078 if (na == NULL) /* only memory info */ 2079 break; 2080 nmr->nr_offset = 0; 2081 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 2082 netmap_update_config(na); 2083 nmr->nr_rx_rings = na->num_rx_rings; 2084 nmr->nr_tx_rings = na->num_tx_rings; 2085 nmr->nr_rx_slots = na->num_rx_desc; 2086 nmr->nr_tx_slots = na->num_tx_desc; 2087 netmap_adapter_put(na); 2088 } while (0); 2089 NMG_UNLOCK(); 2090 break; 2091 2092 case NIOCREGIF: 2093 /* possibly attach/detach NIC and VALE switch */ 2094 i = nmr->nr_cmd; 2095 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH 2096 || i == NETMAP_BDG_VNET_HDR 2097 || i == NETMAP_BDG_NEWIF 2098 || i == NETMAP_BDG_DELIF) { 2099 error = netmap_bdg_ctl(nmr, NULL); 2100 break; 2101 } else if (i != 0) { 2102 D("nr_cmd must be 0 not %d", i); 2103 error = EINVAL; 2104 break; 2105 } 2106 2107 /* protect access to priv from concurrent NIOCREGIF */ 2108 NMG_LOCK(); 2109 do { 2110 u_int memflags; 2111 2112 if (priv->np_na != NULL) { /* thread already registered */ 2113 error = EBUSY; 2114 break; 2115 } 2116 /* find the interface and a reference */ 2117 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 2118 if (error) 2119 break; 2120 if (NETMAP_OWNED_BY_KERN(na)) { 2121 
netmap_adapter_put(na); 2122 error = EBUSY; 2123 break; 2124 } 2125 nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error); 2126 if (!nifp) { /* reg. failed, release priv and ref */ 2127 netmap_adapter_put(na); 2128 priv->np_nifp = NULL; 2129 break; 2130 } 2131 priv->np_td = td; // XXX kqueue, debugging only 2132 2133 /* return the offset of the netmap_if object */ 2134 nmr->nr_rx_rings = na->num_rx_rings; 2135 nmr->nr_tx_rings = na->num_tx_rings; 2136 nmr->nr_rx_slots = na->num_rx_desc; 2137 nmr->nr_tx_slots = na->num_tx_desc; 2138 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, 2139 &nmr->nr_arg2); 2140 if (error) { 2141 netmap_adapter_put(na); 2142 break; 2143 } 2144 if (memflags & NETMAP_MEM_PRIVATE) { 2145 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 2146 } 2147 priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ? 2148 &na->tx_si : &na->tx_rings[priv->np_txqfirst].si; 2149 priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ? 2150 &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si; 2151 2152 if (nmr->nr_arg3) { 2153 D("requested %d extra buffers", nmr->nr_arg3); 2154 nmr->nr_arg3 = netmap_extra_alloc(na, 2155 &nifp->ni_bufs_head, nmr->nr_arg3); 2156 D("got %d extra buffers", nmr->nr_arg3); 2157 } 2158 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 2159 } while (0); 2160 NMG_UNLOCK(); 2161 break; 2162 2163 case NIOCTXSYNC: 2164 case NIOCRXSYNC: 2165 nifp = priv->np_nifp; 2166 2167 if (nifp == NULL) { 2168 error = ENXIO; 2169 break; 2170 } 2171 rmb(); /* make sure following reads are not from cache */ 2172 2173 na = priv->np_na; /* we have a reference */ 2174 2175 if (na == NULL) { 2176 D("Internal error: nifp != NULL && na == NULL"); 2177 error = ENXIO; 2178 break; 2179 } 2180 2181 if (!nm_netmap_on(na)) { 2182 error = ENXIO; 2183 break; 2184 } 2185 2186 if (cmd == NIOCTXSYNC) { 2187 krings = na->tx_rings; 2188 qfirst = priv->np_txqfirst; 2189 qlast = priv->np_txqlast; 2190 } else { 2191 krings = na->rx_rings; 2192 qfirst = priv->np_rxqfirst; 2193 qlast = priv->np_rxqlast; 2194 } 2195 2196 for (i = qfirst; i < qlast; i++) { 2197 struct netmap_kring *kring = krings + i; 2198 if (nm_kr_tryget(kring)) { 2199 error = EBUSY; 2200 goto out; 2201 } 2202 if (cmd == NIOCTXSYNC) { 2203 if (netmap_verbose & NM_VERB_TXSYNC) 2204 D("pre txsync ring %d cur %d hwcur %d", 2205 i, kring->ring->cur, 2206 kring->nr_hwcur); 2207 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 2208 netmap_ring_reinit(kring); 2209 } else { 2210 kring->nm_sync(kring, NAF_FORCE_RECLAIM); 2211 } 2212 if (netmap_verbose & NM_VERB_TXSYNC) 2213 D("post txsync ring %d cur %d hwcur %d", 2214 i, kring->ring->cur, 2215 kring->nr_hwcur); 2216 } else { 2217 kring->nm_sync(kring, NAF_FORCE_READ); 2218 microtime(&na->rx_rings[i].ring->ts); 2219 } 2220 nm_kr_put(kring); 2221 } 2222 2223 break; 2224 2225 case NIOCCONFIG: 2226 error = netmap_bdg_config(nmr); 2227 break; 2228 #ifdef __FreeBSD__ 2229 case FIONBIO: 2230 case FIOASYNC: 2231 ND("FIONBIO/FIOASYNC are no-ops"); 2232 break; 2233 2234 case BIOCIMMEDIATE: 2235 case BIOCGHDRCMPLT: 2236 case BIOCSHDRCMPLT: 2237 case BIOCSSEESENT: 2238 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 2239 break; 2240 2241 default: /* allow device-specific ioctls */ 2242 { 2243 struct ifnet *ifp = ifunit_ref(nmr->nr_name); 2244 if (ifp == NULL) { 2245 error = ENXIO; 2246 } else { 2247 struct socket so; 2248 2249 bzero(&so, sizeof(so)); 2250 so.so_vnet = ifp->if_vnet; 2251 // so->so_proto not null. 
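			/* The request is forwarded to the interface through a
			 * zeroed socket allocated on the stack; only so_vnet is
			 * filled in above, which is what the forwarding path is
			 * expected to need here (descriptive note, not an
			 * exhaustive statement about ifioctl()).
			 */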
2252 			error = ifioctl(&so, cmd, data, td);
2253 			if_rele(ifp);
2254 		}
2255 		break;
2256 	}
2257 
2258 #else /* linux */
2259 	default:
2260 		error = EOPNOTSUPP;
2261 #endif /* linux */
2262 	}
2263 out:
2264 
2265 	CURVNET_RESTORE();
2266 	return (error);
2267 }
2268 
2269 
2270 /*
2271  * select(2) and poll(2) handlers for the "netmap" device.
2272  *
2273  * Can be called for one or more queues.
2274  * Return the event mask corresponding to ready events.
2275  * If there are no ready events, do a selrecord on either individual
2276  * selinfo or on the global one.
2277  * Device-dependent parts (locking and sync of tx/rx rings)
2278  * are done through callbacks.
2279  *
2280  * On linux, arguments are really pwait, the poll table, and 'td' is a struct file *.
2281  * The first one is remapped to pwait as selrecord() uses the name as a
2282  * hidden argument.
2283  */
2284 int
2285 netmap_poll(struct cdev *dev, int events, struct thread *td)
2286 {
2287 	struct netmap_priv_d *priv = NULL;
2288 	struct netmap_adapter *na;
2289 	struct netmap_kring *kring;
2290 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
2291 	struct mbq q;	/* packets from hw queues to host stack */
2292 	void *pwait = dev;	/* linux compatibility */
2293 	int is_kevent = 0;
2294 
2295 	/*
2296 	 * In order to avoid nested locks, we need to "double check"
2297 	 * txsync and rxsync if we decide to do a selrecord().
2298 	 * retry_tx (and retry_rx, later) prevent looping forever.
2299 	 */
2300 	int retry_tx = 1, retry_rx = 1;
2301 
2302 	(void)pwait;
2303 	mbq_init(&q);
2304 
2305 	/*
2306 	 * XXX kevent has curthread->tp_fop == NULL,
2307 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
2308 	 * priv as the first argument, which is also useful to avoid
2309 	 * the selrecord() calls, which are not necessary in that case.
2310 	 */
2311 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
2312 		is_kevent = 1;
2313 		if (netmap_verbose)
2314 			D("called from kevent");
2315 		priv = (struct netmap_priv_d *)dev;
2316 	}
2317 	if (priv == NULL)
2318 		return POLLERR;
2319 
2320 	if (priv->np_nifp == NULL) {
2321 		D("No if registered");
2322 		return POLLERR;
2323 	}
2324 	rmb(); /* make sure following reads are not from cache */
2325 
2326 	na = priv->np_na;
2327 
2328 	if (!nm_netmap_on(na))
2329 		return POLLERR;
2330 
2331 	if (netmap_verbose & 0x8000)
2332 		D("device %s events 0x%x", na->name, events);
2333 	want_tx = events & (POLLOUT | POLLWRNORM);
2334 	want_rx = events & (POLLIN | POLLRDNORM);
2335 
2336 
2337 	/*
2338 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2339 	 * the file descriptor is bound to all of them. If so, we sleep on
2340 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2341 	 * (FreeBSD only allows two selinfo's per file descriptor).
2342 	 * The interrupt routine in the driver wakes one or the other
2343 	 * (or both) depending on which clients are active.
2344 	 *
2345 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2346 	 * txsync() is called if we run out of buffers on POLLOUT, or
2347 	 * there are pending packets to send. The latter can be disabled
2348 	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2349 	 */
2350 	check_all_tx = nm_tx_si_user(priv);
2351 	check_all_rx = nm_rx_si_user(priv);
2352 
2353 	/*
2354 	 * We start with a lock-free round which is cheap if we have
2355 	 * slots available. If this fails, then lock and call the sync
2356 	 * routines.
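	 *
	 * The "ready" test in this first pass is just a comparison of the
	 * user-visible ring pointers. A minimal sketch of the helper used
	 * below (assuming the usual definition shared with the userspace
	 * API in netmap_user.h):
	 *
	 *	static inline int
	 *	nm_ring_empty(struct netmap_ring *ring)
	 *	{
	 *		return ring->cur == ring->tail;	// no slots to process
	 *	}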
2357 */ 2358 for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) { 2359 kring = &na->rx_rings[i]; 2360 /* XXX compare ring->cur and kring->tail */ 2361 if (!nm_ring_empty(kring->ring)) { 2362 revents |= want_rx; 2363 want_rx = 0; /* also breaks the loop */ 2364 } 2365 } 2366 for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) { 2367 kring = &na->tx_rings[i]; 2368 /* XXX compare ring->cur and kring->tail */ 2369 if (!nm_ring_empty(kring->ring)) { 2370 revents |= want_tx; 2371 want_tx = 0; /* also breaks the loop */ 2372 } 2373 } 2374 2375 /* 2376 * If we want to push packets out (priv->np_txpoll) or 2377 * want_tx is still set, we must issue txsync calls 2378 * (on all rings, to avoid that the tx rings stall). 2379 * XXX should also check cur != hwcur on the tx rings. 2380 * Fortunately, normal tx mode has np_txpoll set. 2381 */ 2382 if (priv->np_txpoll || want_tx) { 2383 /* 2384 * The first round checks if anyone is ready, if not 2385 * do a selrecord and another round to handle races. 2386 * want_tx goes to 0 if any space is found, and is 2387 * used to skip rings with no pending transmissions. 2388 */ 2389 flush_tx: 2390 for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { 2391 int found = 0; 2392 2393 kring = &na->tx_rings[i]; 2394 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 2395 continue; 2396 /* only one thread does txsync */ 2397 if (nm_kr_tryget(kring)) { 2398 /* either busy or stopped 2399 * XXX if the ring is stopped, sleeping would 2400 * be better. In current code, however, we only 2401 * stop the rings for brief intervals (2014-03-14) 2402 */ 2403 if (netmap_verbose) 2404 RD(2, "%p lost race on txring %d, ok", 2405 priv, i); 2406 continue; 2407 } 2408 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 2409 netmap_ring_reinit(kring); 2410 revents |= POLLERR; 2411 } else { 2412 if (kring->nm_sync(kring, 0)) 2413 revents |= POLLERR; 2414 } 2415 2416 /* 2417 * If we found new slots, notify potential 2418 * listeners on the same ring. 2419 * Since we just did a txsync, look at the copies 2420 * of cur,tail in the kring. 2421 */ 2422 found = kring->rcur != kring->rtail; 2423 nm_kr_put(kring); 2424 if (found) { /* notify other listeners */ 2425 revents |= want_tx; 2426 want_tx = 0; 2427 na->nm_notify(na, i, NR_TX, 0); 2428 } 2429 } 2430 if (want_tx && retry_tx && !is_kevent) { 2431 OS_selrecord(td, check_all_tx ? 2432 &na->tx_si : &na->tx_rings[priv->np_txqfirst].si); 2433 retry_tx = 0; 2434 goto flush_tx; 2435 } 2436 } 2437 2438 /* 2439 * If want_rx is still set scan receive rings. 2440 * Do it on all rings because otherwise we starve. 2441 */ 2442 if (want_rx) { 2443 int send_down = 0; /* transparent mode */ 2444 /* two rounds here for race avoidance */ 2445 do_retry_rx: 2446 for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { 2447 int found = 0; 2448 2449 kring = &na->rx_rings[i]; 2450 2451 if (nm_kr_tryget(kring)) { 2452 if (netmap_verbose) 2453 RD(2, "%p lost race on rxring %d, ok", 2454 priv, i); 2455 continue; 2456 } 2457 2458 /* 2459 * transparent mode support: collect packets 2460 * from the rxring(s). 
2461 * XXX NR_FORWARD should only be read on 2462 * physical or NIC ports 2463 */ 2464 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 2465 ND(10, "forwarding some buffers up %d to %d", 2466 kring->nr_hwcur, kring->ring->cur); 2467 netmap_grab_packets(kring, &q, netmap_fwd); 2468 } 2469 2470 if (kring->nm_sync(kring, 0)) 2471 revents |= POLLERR; 2472 if (netmap_no_timestamp == 0 || 2473 kring->ring->flags & NR_TIMESTAMP) { 2474 microtime(&kring->ring->ts); 2475 } 2476 /* after an rxsync we can use kring->rcur, rtail */ 2477 found = kring->rcur != kring->rtail; 2478 nm_kr_put(kring); 2479 if (found) { 2480 revents |= want_rx; 2481 retry_rx = 0; 2482 na->nm_notify(na, i, NR_RX, 0); 2483 } 2484 } 2485 2486 /* transparent mode XXX only during first pass ? */ 2487 if (na->na_flags & NAF_HOST_RINGS) { 2488 kring = &na->rx_rings[na->num_rx_rings]; 2489 if (check_all_rx 2490 && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { 2491 /* XXX fix to use kring fields */ 2492 if (nm_ring_empty(kring->ring)) 2493 send_down = netmap_rxsync_from_host(na, td, dev); 2494 if (!nm_ring_empty(kring->ring)) 2495 revents |= want_rx; 2496 } 2497 } 2498 2499 if (retry_rx && !is_kevent) 2500 OS_selrecord(td, check_all_rx ? 2501 &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si); 2502 if (send_down > 0 || retry_rx) { 2503 retry_rx = 0; 2504 if (send_down) 2505 goto flush_tx; /* and retry_rx */ 2506 else 2507 goto do_retry_rx; 2508 } 2509 } 2510 2511 /* 2512 * Transparent mode: marked bufs on rx rings between 2513 * kring->nr_hwcur and ring->head 2514 * are passed to the other endpoint. 2515 * 2516 * In this mode we also scan the sw rxring, which in 2517 * turn passes packets up. 2518 * 2519 * XXX Transparent mode at the moment requires to bind all 2520 * rings to a single file descriptor. 2521 */ 2522 2523 if (q.head && na->ifp != NULL) 2524 netmap_send_up(na->ifp, &q); 2525 2526 return (revents); 2527 } 2528 2529 2530 /*-------------------- driver support routines -------------------*/ 2531 2532 static int netmap_hw_krings_create(struct netmap_adapter *); 2533 2534 /* default notify callback */ 2535 static int 2536 netmap_notify(struct netmap_adapter *na, u_int n_ring, 2537 enum txrx tx, int flags) 2538 { 2539 struct netmap_kring *kring; 2540 2541 if (tx == NR_TX) { 2542 kring = na->tx_rings + n_ring; 2543 OS_selwakeup(&kring->si, PI_NET); 2544 /* optimization: avoid a wake up on the global 2545 * queue if nobody has registered for more 2546 * than one ring 2547 */ 2548 if (na->tx_si_users > 0) 2549 OS_selwakeup(&na->tx_si, PI_NET); 2550 } else { 2551 kring = na->rx_rings + n_ring; 2552 OS_selwakeup(&kring->si, PI_NET); 2553 /* optimization: same as above */ 2554 if (na->rx_si_users > 0) 2555 OS_selwakeup(&na->rx_si, PI_NET); 2556 } 2557 return 0; 2558 } 2559 2560 2561 /* called by all routines that create netmap_adapters. 2562 * Attach na to the ifp (if any) and provide defaults 2563 * for optional callbacks. Defaults assume that we 2564 * are creating an hardware netmap_adapter. 2565 */ 2566 int 2567 netmap_attach_common(struct netmap_adapter *na) 2568 { 2569 struct ifnet *ifp = na->ifp; 2570 2571 if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { 2572 D("%s: invalid rings tx %d rx %d", 2573 na->name, na->num_tx_rings, na->num_rx_rings); 2574 return EINVAL; 2575 } 2576 /* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports, 2577 * pipes, monitors). For bwrap we actually have a non-null ifp for 2578 * use by the external modules, but that is set after this 2579 * function has been called. 
2580  * XXX this is ugly, maybe split this function in two (2014-03-14)
2581  */
2582 	if (ifp != NULL) {
2583 		WNA(ifp) = na;
2584 
2585 	/* the following is only needed for na that use the host port.
2586 	 * XXX do we have something similar for linux ?
2587 	 */
2588 #ifdef __FreeBSD__
2589 	na->if_input = ifp->if_input; /* for netmap_send_up */
2590 #endif /* __FreeBSD__ */
2591 
2592 	NETMAP_SET_CAPABLE(ifp);
2593 	}
2594 	if (na->nm_krings_create == NULL) {
2595 		/* we assume that we have been called by a driver,
2596 		 * since other port types all provide their own
2597 		 * nm_krings_create
2598 		 */
2599 		na->nm_krings_create = netmap_hw_krings_create;
2600 		na->nm_krings_delete = netmap_hw_krings_delete;
2601 	}
2602 	if (na->nm_notify == NULL)
2603 		na->nm_notify = netmap_notify;
2604 	na->active_fds = 0;
2605 
2606 	if (na->nm_mem == NULL)
2607 		/* use the global allocator */
2608 		na->nm_mem = &nm_mem;
2609 	if (na->nm_bdg_attach == NULL)
2610 		/* no special nm_bdg_attach callback. On VALE
2611 		 * attach, we need to interpose a bwrap
2612 		 */
2613 		na->nm_bdg_attach = netmap_bwrap_attach;
2614 	return 0;
2615 }
2616 
2617 
2618 /* standard cleanup, called by all destructors */
2619 void
2620 netmap_detach_common(struct netmap_adapter *na)
2621 {
2622 	if (na->ifp != NULL)
2623 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2624 
2625 	if (na->tx_rings) { /* XXX should not happen */
2626 		D("freeing leftover tx_rings");
2627 		na->nm_krings_delete(na);
2628 	}
2629 	netmap_pipe_dealloc(na);
2630 	if (na->na_flags & NAF_MEM_OWNER)
2631 		netmap_mem_private_delete(na->nm_mem);
2632 	bzero(na, sizeof(*na));
2633 	free(na, M_DEVBUF);
2634 }
2635 
2636 /* Wrapper for the register callback provided by hardware drivers.
2637  * na->ifp == NULL means that the driver module has been
2638  * unloaded, so we cannot call into it.
2639  * Note that module unloading, in our patched linux drivers,
2640  * happens under NMG_LOCK and after having stopped all the
2641  * nic rings (see netmap_detach). This provides sufficient
2642  * protection for the other driver-provided callbacks
2643  * (i.e., nm_config and nm_*xsync), which therefore don't need
2644  * to be wrapped.
2645  */
2646 static int
2647 netmap_hw_register(struct netmap_adapter *na, int onoff)
2648 {
2649 	struct netmap_hw_adapter *hwna =
2650 		(struct netmap_hw_adapter*)na;
2651 
2652 	if (na->ifp == NULL)
2653 		return onoff ? ENXIO : 0;
2654 
2655 	return hwna->nm_hw_register(na, onoff);
2656 }
2657 
2658 
2659 /*
2660  * Initialize a ``netmap_adapter`` object created by a driver on attach.
2661  * We allocate a block of memory with room for a struct netmap_adapter
2662  * plus two sets of N+2 struct netmap_kring (where N is the number
2663  * of hardware rings):
2664  * krings	0..N-1	are for the hardware queues.
2665  * kring	N	is for the host stack queue
2666  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2667  * Return 0 on success, ENOMEM otherwise.
2668  */
2669 int
2670 netmap_attach(struct netmap_adapter *arg)
2671 {
2672 	struct netmap_hw_adapter *hwna = NULL;
2673 	// XXX when is arg == NULL ?
2674 	struct ifnet *ifp = arg ?
arg->ifp : NULL; 2675 2676 if (arg == NULL || ifp == NULL) 2677 goto fail; 2678 hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); 2679 if (hwna == NULL) 2680 goto fail; 2681 hwna->up = *arg; 2682 hwna->up.na_flags |= NAF_HOST_RINGS; 2683 strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name)); 2684 hwna->nm_hw_register = hwna->up.nm_register; 2685 hwna->up.nm_register = netmap_hw_register; 2686 if (netmap_attach_common(&hwna->up)) { 2687 free(hwna, M_DEVBUF); 2688 goto fail; 2689 } 2690 netmap_adapter_get(&hwna->up); 2691 2692 #ifdef linux 2693 if (ifp->netdev_ops) { 2694 /* prepare a clone of the netdev ops */ 2695 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) 2696 hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; 2697 #else 2698 hwna->nm_ndo = *ifp->netdev_ops; 2699 #endif 2700 } 2701 hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; 2702 if (ifp->ethtool_ops) { 2703 hwna->nm_eto = *ifp->ethtool_ops; 2704 } 2705 hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam; 2706 #ifdef ETHTOOL_SCHANNELS 2707 hwna->nm_eto.set_channels = linux_netmap_set_channels; 2708 #endif 2709 if (arg->nm_config == NULL) { 2710 hwna->up.nm_config = netmap_linux_config; 2711 } 2712 #endif /* linux */ 2713 2714 D("success for %s tx %d/%d rx %d/%d queues/slots", 2715 hwna->up.name, 2716 hwna->up.num_tx_rings, hwna->up.num_tx_desc, 2717 hwna->up.num_rx_rings, hwna->up.num_rx_desc 2718 ); 2719 return 0; 2720 2721 fail: 2722 D("fail, arg %p ifp %p na %p", arg, ifp, hwna); 2723 if (ifp) 2724 netmap_detach(ifp); 2725 return (hwna ? EINVAL : ENOMEM); 2726 } 2727 2728 2729 void 2730 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) 2731 { 2732 if (!na) { 2733 return; 2734 } 2735 2736 refcount_acquire(&na->na_refcount); 2737 } 2738 2739 2740 /* returns 1 iff the netmap_adapter is destroyed */ 2741 int 2742 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) 2743 { 2744 if (!na) 2745 return 1; 2746 2747 if (!refcount_release(&na->na_refcount)) 2748 return 0; 2749 2750 if (na->nm_dtor) 2751 na->nm_dtor(na); 2752 2753 netmap_detach_common(na); 2754 2755 return 1; 2756 } 2757 2758 /* nm_krings_create callback for all hardware native adapters */ 2759 int 2760 netmap_hw_krings_create(struct netmap_adapter *na) 2761 { 2762 int ret = netmap_krings_create(na, 0); 2763 if (ret == 0) { 2764 /* initialize the mbq for the sw rx ring */ 2765 mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); 2766 ND("initialized sw rx queue %d", na->num_rx_rings); 2767 } 2768 return ret; 2769 } 2770 2771 2772 2773 /* 2774 * Called on module unload by the netmap-enabled drivers 2775 */ 2776 void 2777 netmap_detach(struct ifnet *ifp) 2778 { 2779 struct netmap_adapter *na = NA(ifp); 2780 2781 if (!na) 2782 return; 2783 2784 NMG_LOCK(); 2785 netmap_disable_all_rings(ifp); 2786 if (!netmap_adapter_put(na)) { 2787 /* someone is still using the adapter, 2788 * tell them that the interface is gone 2789 */ 2790 na->ifp = NULL; 2791 // XXX also clear NAF_NATIVE_ON ? 2792 na->na_flags &= ~NAF_NETMAP_ON; 2793 /* give them a chance to notice */ 2794 netmap_enable_all_rings(ifp); 2795 } 2796 NMG_UNLOCK(); 2797 } 2798 2799 2800 /* 2801 * Intercept packets from the network stack and pass them 2802 * to netmap as incoming packets on the 'software' ring. 2803 * 2804 * We only store packets in a bounded mbq and then copy them 2805 * in the relevant rxsync routine. 2806 * 2807 * We rely on the OS to make sure that the ifp and na do not go 2808 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 
2809 * In nm_register() or whenever there is a reinitialization, 2810 * we make sure to make the mode change visible here. 2811 */ 2812 int 2813 netmap_transmit(struct ifnet *ifp, struct mbuf *m) 2814 { 2815 struct netmap_adapter *na = NA(ifp); 2816 struct netmap_kring *kring; 2817 u_int len = MBUF_LEN(m); 2818 u_int error = ENOBUFS; 2819 struct mbq *q; 2820 int space; 2821 2822 // XXX [Linux] we do not need this lock 2823 // if we follow the down/configure/up protocol -gl 2824 // mtx_lock(&na->core_lock); 2825 2826 if (!nm_netmap_on(na)) { 2827 D("%s not in netmap mode anymore", na->name); 2828 error = ENXIO; 2829 goto done; 2830 } 2831 2832 kring = &na->rx_rings[na->num_rx_rings]; 2833 q = &kring->rx_queue; 2834 2835 // XXX reconsider long packets if we handle fragments 2836 if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */ 2837 D("%s from_host, drop packet size %d > %d", na->name, 2838 len, NETMAP_BUF_SIZE(na)); 2839 goto done; 2840 } 2841 2842 /* protect against rxsync_from_host(), netmap_sw_to_nic() 2843 * and maybe other instances of netmap_transmit (the latter 2844 * not possible on Linux). 2845 * Also avoid overflowing the queue. 2846 */ 2847 mbq_lock(q); 2848 2849 space = kring->nr_hwtail - kring->nr_hwcur; 2850 if (space < 0) 2851 space += kring->nkr_num_slots; 2852 if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX 2853 RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", 2854 na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), 2855 len, m); 2856 } else { 2857 mbq_enqueue(q, m); 2858 ND(10, "%s %d bufs in queue len %d m %p", 2859 na->name, mbq_len(q), len, m); 2860 /* notify outside the lock */ 2861 m = NULL; 2862 error = 0; 2863 } 2864 mbq_unlock(q); 2865 2866 done: 2867 if (m) 2868 m_freem(m); 2869 /* unconditionally wake up listeners */ 2870 na->nm_notify(na, na->num_rx_rings, NR_RX, 0); 2871 /* this is normally netmap_notify(), but for nics 2872 * connected to a bridge it is netmap_bwrap_intr_notify(), 2873 * that possibly forwards the frames through the switch 2874 */ 2875 2876 return (error); 2877 } 2878 2879 2880 /* 2881 * netmap_reset() is called by the driver routines when reinitializing 2882 * a ring. The driver is in charge of locking to protect the kring. 2883 * If native netmap mode is not set just return NULL. 2884 */ 2885 struct netmap_slot * 2886 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, 2887 u_int new_cur) 2888 { 2889 struct netmap_kring *kring; 2890 int new_hwofs, lim; 2891 2892 if (!nm_native_on(na)) { 2893 ND("interface not in native netmap mode"); 2894 return NULL; /* nothing to reinitialize */ 2895 } 2896 2897 /* XXX note- in the new scheme, we are not guaranteed to be 2898 * under lock (e.g. when called on a device reset). 2899 * In this case, we should set a flag and do not trust too 2900 * much the values. In practice: TODO 2901 * - set a RESET flag somewhere in the kring 2902 * - do the processing in a conservative way 2903 * - let the *sync() fixup at the end. 2904 */ 2905 if (tx == NR_TX) { 2906 if (n >= na->num_tx_rings) 2907 return NULL; 2908 kring = na->tx_rings + n; 2909 // XXX check whether we should use hwcur or rcur 2910 new_hwofs = kring->nr_hwcur - new_cur; 2911 } else { 2912 if (n >= na->num_rx_rings) 2913 return NULL; 2914 kring = na->rx_rings + n; 2915 new_hwofs = kring->nr_hwtail - new_cur; 2916 } 2917 lim = kring->nkr_num_slots - 1; 2918 if (new_hwofs > lim) 2919 new_hwofs -= lim + 1; 2920 2921 /* Always set the new offset value and realign the ring. 
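	 * Worked example (illustrative numbers only): with nkr_num_slots 512
	 * (lim 511), a TX ring that had nr_hwcur 10 and is restarted by the
	 * driver at new_cur 0 gets
	 *	new_hwofs = 10 - 0 = 10			(<= lim, no wrap)
	 *	nr_hwtail = 10 + 511 = 521 -> 521 - 512 = 9
	 * i.e. hwtail ends up just behind hwcur and the whole ring is again
	 * reported as available to userspace.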
*/ 2922 if (netmap_verbose) 2923 D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", 2924 na->name, 2925 tx == NR_TX ? "TX" : "RX", n, 2926 kring->nkr_hwofs, new_hwofs, 2927 kring->nr_hwtail, 2928 tx == NR_TX ? lim : kring->nr_hwtail); 2929 kring->nkr_hwofs = new_hwofs; 2930 if (tx == NR_TX) { 2931 kring->nr_hwtail = kring->nr_hwcur + lim; 2932 if (kring->nr_hwtail > lim) 2933 kring->nr_hwtail -= lim + 1; 2934 } 2935 2936 #if 0 // def linux 2937 /* XXX check that the mappings are correct */ 2938 /* need ring_nr, adapter->pdev, direction */ 2939 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 2940 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 2941 D("error mapping rx netmap buffer %d", i); 2942 // XXX fix error handling 2943 } 2944 2945 #endif /* linux */ 2946 /* 2947 * Wakeup on the individual and global selwait 2948 * We do the wakeup here, but the ring is not yet reconfigured. 2949 * However, we are under lock so there are no races. 2950 */ 2951 na->nm_notify(na, n, tx, 0); 2952 return kring->ring->slot; 2953 } 2954 2955 2956 /* 2957 * Dispatch rx/tx interrupts to the netmap rings. 2958 * 2959 * "work_done" is non-null on the RX path, NULL for the TX path. 2960 * We rely on the OS to make sure that there is only one active 2961 * instance per queue, and that there is appropriate locking. 2962 * 2963 * The 'notify' routine depends on what the ring is attached to. 2964 * - for a netmap file descriptor, do a selwakeup on the individual 2965 * waitqueue, plus one on the global one if needed 2966 * (see netmap_notify) 2967 * - for a nic connected to a switch, call the proper forwarding routine 2968 * (see netmap_bwrap_intr_notify) 2969 */ 2970 void 2971 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) 2972 { 2973 struct netmap_adapter *na = NA(ifp); 2974 struct netmap_kring *kring; 2975 2976 q &= NETMAP_RING_MASK; 2977 2978 if (netmap_verbose) { 2979 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); 2980 } 2981 2982 if (work_done) { /* RX path */ 2983 if (q >= na->num_rx_rings) 2984 return; // not a physical queue 2985 kring = na->rx_rings + q; 2986 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 2987 na->nm_notify(na, q, NR_RX, 0); 2988 *work_done = 1; /* do not fire napi again */ 2989 } else { /* TX path */ 2990 if (q >= na->num_tx_rings) 2991 return; // not a physical queue 2992 kring = na->tx_rings + q; 2993 na->nm_notify(na, q, NR_TX, 0); 2994 } 2995 } 2996 2997 2998 /* 2999 * Default functions to handle rx/tx interrupts from a physical device. 3000 * "work_done" is non-null on the RX path, NULL for the TX path. 3001 * 3002 * If the card is not in netmap mode, simply return 0, 3003 * so that the caller proceeds with regular processing. 3004 * Otherwise call netmap_common_irq() and return 1. 3005 * 3006 * If the card is connected to a netmap file descriptor, 3007 * do a selwakeup on the individual queue, plus one on the global one 3008 * if needed (multiqueue card _and_ there are multiqueue listeners), 3009 * and return 1. 3010 * 3011 * Finally, if called on rx from an interface connected to a switch, 3012 * calls the proper forwarding routine, and return 1. 3013 */ 3014 int 3015 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) 3016 { 3017 struct netmap_adapter *na = NA(ifp); 3018 3019 /* 3020 * XXX emulated netmap mode sets NAF_SKIP_INTR so 3021 * we still use the regular driver even though the previous 3022 * check fails. It is unclear whether we should use 3023 * nm_native_on() here. 
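 *
 * Typical use from a driver's RX interrupt / cleanup path (a sketch with
 * hypothetical driver-side names, modelled on the patched in-tree drivers):
 *
 *	u_int work_done = 0;
 *
 *	if (netmap_rx_irq(ifp, ring_nr, &work_done))
 *		return;		// netmap mode: the ring was handled here
 *	// otherwise fall through to the normal mbuf-based processing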
3024 */ 3025 if (!nm_netmap_on(na)) 3026 return 0; 3027 3028 if (na->na_flags & NAF_SKIP_INTR) { 3029 ND("use regular interrupt"); 3030 return 0; 3031 } 3032 3033 netmap_common_irq(ifp, q, work_done); 3034 return 1; 3035 } 3036 3037 3038 /* 3039 * Module loader and unloader 3040 * 3041 * netmap_init() creates the /dev/netmap device and initializes 3042 * all global variables. Returns 0 on success, errno on failure 3043 * (but there is no chance) 3044 * 3045 * netmap_fini() destroys everything. 3046 */ 3047 3048 static struct cdev *netmap_dev; /* /dev/netmap character device. */ 3049 extern struct cdevsw netmap_cdevsw; 3050 3051 3052 void 3053 netmap_fini(void) 3054 { 3055 // XXX destroy_bridges() ? 3056 if (netmap_dev) 3057 destroy_dev(netmap_dev); 3058 netmap_mem_fini(); 3059 NMG_LOCK_DESTROY(); 3060 printf("netmap: unloaded module.\n"); 3061 } 3062 3063 3064 int 3065 netmap_init(void) 3066 { 3067 int error; 3068 3069 NMG_LOCK_INIT(); 3070 3071 error = netmap_mem_init(); 3072 if (error != 0) 3073 goto fail; 3074 /* XXX could use make_dev_credv() to get error number */ 3075 #ifdef __FreeBSD__ 3076 /* support for the 'eternal' flag */ 3077 netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD, 3078 &netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0660, 3079 "netmap"); 3080 #else 3081 netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, 3082 "netmap"); 3083 #endif 3084 if (!netmap_dev) 3085 goto fail; 3086 3087 netmap_init_bridges(); 3088 #ifdef __FreeBSD__ 3089 nm_vi_init_index(); 3090 #endif 3091 printf("netmap: loaded module\n"); 3092 return (0); 3093 fail: 3094 netmap_fini(); 3095 return (EINVAL); /* may be incorrect */ 3096 } 3097
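
/*
 * How netmap_init()/netmap_fini() are reached at load/unload time is
 * OS-specific glue that lives outside this file. A minimal FreeBSD-style
 * sketch (the handler name and the module declaration below are
 * illustrative; see netmap_freebsd.c for the real one):
 *
 *	static int
 *	netmap_loader(__unused struct module *m, int event, __unused void *arg)
 *	{
 *		int error = 0;
 *
 *		switch (event) {
 *		case MOD_LOAD:
 *			error = netmap_init();
 *			break;
 *		case MOD_UNLOAD:
 *			netmap_fini();
 *			break;
 *		default:
 *			error = EOPNOTSUPP;
 *			break;
 *		}
 *		return (error);
 *	}
 *
 *	DEV_MODULE(netmap, netmap_loader, NULL);
 */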