/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * $FreeBSD$
 *
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool, allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    select()able file descriptors on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *

		SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.
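
As a concrete illustration, the six steps above map to a minimal userspace
sketch along the following lines (helper macros from net/netmap_user.h; the
interface name "em0" is only an example and all error handling is omitted,
so treat this as a sketch rather than a reference implementation):

	#include <fcntl.h>
	#include <poll.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <net/netmap_user.h>

	struct nmreq req = { .nr_version = NETMAP_API };
	strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);

	int fd = open("/dev/netmap", O_RDWR);		// step 1
	ioctl(fd, NIOCREGIF, &req);			// step 2: bind fd to em0
	void *mem = mmap(NULL, req.nr_memsize,		// step 3: map the shared pool
	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);

	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	for (;;) {
		poll(&pfd, 1, -1);			// step 6: wait for rx events
		struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
		while (!nm_ring_empty(ring)) {		// step 4: walk the rx ring
			struct netmap_slot *slot = &ring->slot[ring->cur];
			char *buf = NETMAP_BUF(ring, slot->buf_idx);
			// ... consume slot->len bytes at buf ...
			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
		}
		// step 5 is implicit in poll(); ioctl(fd, NIOCRXSYNC) does it explicitly
	}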

		LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(), this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */


/* --- internals ----
 *
 * Roadmap to the code that implements the above.
 *
 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
 * >    select()able file descriptors on which events are reported.
 *
 *      Internally, we allocate a netmap_priv_d structure, that will be
 *      initialized on ioctl(NIOCREGIF).
 *
 *      os-specific:
 *          FreeBSD: netmap_open (netmap_freebsd.c). The priv is
 *                   per-thread.
 *          linux:   linux_netmap_open (netmap_linux.c). The priv is
 *                   per-open.
 *
 * > 2. on each descriptor, the process issues an ioctl() to identify
 * >    the interface that should report events to the file descriptor.
 *
 *      Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
 *      Most important things happen in netmap_get_na() and
 *      netmap_do_regif(), called from there. Additional details can be
 *      found in the comments above those functions.
 *
 *      In all cases, this action creates/takes-a-reference-to a
 *      netmap_*_adapter describing the port, and allocates a netmap_if
 *      and all necessary netmap rings, filling them with netmap buffers.
 *
 *      In this phase, the sync callbacks for each ring are set (these are used
 *      in steps 5 and 6 below). The callbacks depend on the type of adapter.
 *      The adapter creation/initialization code puts them in the
 *      netmap_adapter (fields na->nm_txsync and na->nm_rxsync). Then, they
 *      are copied from there to the netmap_kring's during netmap_do_regif(), by
 *      the nm_krings_create() callback. All the nm_krings_create callbacks
 *      actually call netmap_krings_create() to perform this and the other
 *      common stuff. netmap_krings_create() also takes care of the host rings,
 *      if needed, by setting their sync callbacks appropriately.
 *
 *      Additional actions depend on the kind of netmap_adapter that has been
 *      registered:
 *
 *      - netmap_hw_adapter:         [netmap.c]
 *           This is a system netdev/ifp with native netmap support.
 *           The ifp is detached from the host stack by redirecting:
 *              - transmissions (from the network stack) to netmap_transmit()
 *              - receive notifications to the nm_notify() callback for
 *                this adapter. The callback is normally netmap_notify(), unless
 *                the ifp is attached to a bridge using bwrap, in which case it
 *                is netmap_bwrap_intr_notify().
 *
 *      - netmap_generic_adapter:    [netmap_generic.c]
 *           A system netdev/ifp without native netmap support.
 *
 *      (the decision about native/non-native support is taken in
 *       netmap_get_hw_na(), called by netmap_get_na())
 *
 *      - netmap_vp_adapter          [netmap_vale.c]
 *           Returned by netmap_get_bdg_na().
 *           This is a persistent or ephemeral VALE port. Ephemeral ports
 *           are created on the fly if they don't already exist, and are
 *           always attached to a bridge.
 *           Persistent VALE ports must be created separately, and then
 *           attached like normal NICs. The NIOCREGIF we are examining
 *           will find them only if they had previously been created and
 *           attached (see VALE_CTL below).
 *
 *      - netmap_pipe_adapter        [netmap_pipe.c]
 *           Returned by netmap_get_pipe_na().
 *           Both pipe ends are created, if they didn't already exist.
 *
 *      - netmap_monitor_adapter     [netmap_monitor.c]
 *           Returned by netmap_get_monitor_na().
 *           If successful, the nm_sync callbacks of the monitored adapter
 *           will be intercepted by the returned monitor.
 *
 *      - netmap_bwrap_adapter       [netmap_vale.c]
 *           Cannot be obtained in this way, see VALE_CTL below
 *
 *
 *      os-specific:
 *          linux: we first go through linux_netmap_ioctl() to
 *                 adapt the FreeBSD interface to the linux one.
 *
 *
 * > 3. on each descriptor, the process issues an mmap() request to
 * >    map the shared memory region within the process' address space.
 * >    The list of interesting queues is indicated by a location in
 * >    the shared memory region.
 *
 *      os-specific:
 *          FreeBSD: netmap_mmap_single (netmap_freebsd.c).
 *          linux:   linux_netmap_mmap (netmap_linux.c).
 *
 * > 4. using the functions in the netmap(4) userspace API, a process
 * >    can look up the occupation state of a queue, access memory buffers,
 * >    and retrieve received packets or enqueue packets to transmit.
 *
 *      these actions do not involve the kernel.
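 *
 *      As an illustration of step 4, purely in userspace, the "occupation
 *      state" of a queue is just index arithmetic on the mmapped netmap_ring
 *      (a hedged sketch; helper names are from net/netmap_user.h, nifp is the
 *      pointer obtained in step 3, and build_frame() is a hypothetical
 *      application routine):
 *
 *          struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);
 *          unsigned int avail = nm_ring_space(txr);    // free tx slots
 *          if (avail > 0) {
 *              struct netmap_slot *slot = &txr->slot[txr->cur];
 *              char *buf = NETMAP_BUF(txr, slot->buf_idx);
 *              slot->len = build_frame(buf);           // fill the buffer
 *              txr->head = txr->cur = nm_ring_next(txr, txr->cur);
 *          }
 *          // nothing reaches the NIC until a txsync (step 5) or poll (step 6)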
 *
 * > 5. using some ioctl()s the process can synchronize the userspace view
 * >    of the queue with the actual status in the kernel. This includes both
 * >    receiving the notification of new packets, and transmitting new
 * >    packets on the output interface.
 *
 *      These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
 *      cases. They invoke the nm_sync callbacks on the netmap_kring
 *      structures, as initialized in step 2 and maybe later modified
 *      by a monitor. Monitors, however, will always call the original
 *      callback before doing anything else.
 *
 *
 * > 6. select() or poll() can be used to wait for events on individual
 * >    transmit or receive queues (or all queues for a given interface).
 *
 *      Implemented in netmap_poll(). This will call the same nm_sync()
 *      callbacks as in step 5 above.
 *
 *      os-specific:
 *          linux: we first go through linux_netmap_poll() to adapt
 *                 the FreeBSD interface to the linux one.
 *
 *
 *  ---- VALE_CTL -----
 *
 *      VALE switches are controlled by issuing a NIOCREGIF with a non-null
 *      nr_cmd in the nmreq structure. These subcommands are handled by
 *      netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
 *      and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
 *      subcommands, respectively.
 *
 *      Any network interface known to the system (including a persistent VALE
 *      port) can be attached to a VALE switch by issuing the
 *      NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
 *      look exactly like ephemeral VALE ports (as created in step 2 above). The
 *      attachment of other interfaces, instead, requires the creation of a
 *      netmap_bwrap_adapter. Moreover, the attached interface must be put in
 *      netmap mode. This may require the creation of a netmap_generic_adapter if
 *      we have no native support for the interface, or if generic adapters have
 *      been forced by sysctl.
 *
 *      Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
 *      called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
 *      callback. In the case of the bwrap, the callback creates the
 *      netmap_bwrap_adapter. The initialization of the bwrap is then
 *      completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
 *      callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
 *      A generic adapter for the wrapped ifp will be created if needed, when
 *      netmap_get_bdg_na() calls netmap_get_hw_na().
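 *
 *      A hedged userspace sketch of the two VALE_CTL operations above
 *      (headers as in the sketch near the top of this file; the names "vi0"
 *      and "vale0:em1" are only examples, and all error checking is omitted):
 *
 *          int fd = open("/dev/netmap", O_RDWR);
 *          struct nmreq req = { .nr_version = NETMAP_API };
 *
 *          strncpy(req.nr_name, "vi0", sizeof(req.nr_name) - 1);
 *          req.nr_cmd = NETMAP_BDG_NEWIF;      // create persistent VALE port vi0
 *          ioctl(fd, NIOCREGIF, &req);
 *
 *          memset(&req, 0, sizeof(req));
 *          req.nr_version = NETMAP_API;
 *          strncpy(req.nr_name, "vale0:em1", sizeof(req.nr_name) - 1);
 *          req.nr_cmd = NETMAP_BDG_ATTACH;     // wrap em1 and attach it to vale0
 *          ioctl(fd, NIOCREGIF, &req);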
 *
 *
 *  ---- DATAPATHS -----
 *
 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
 *
 *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
 *
 *    - tx from netmap userspace:
 *       concurrently:
 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == DEVICE_netmap_txsync()
 *           2) device interrupt handler
 *                na->nm_notify()  == netmap_notify()
 *    - rx from netmap userspace:
 *       concurrently:
 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == DEVICE_netmap_rxsync()
 *           2) device interrupt handler
 *                na->nm_notify()  == netmap_notify()
 *    - tx from host stack
 *       concurrently:
 *           1) host stack
 *                netmap_transmit()
 *                  na->nm_notify  == netmap_notify()
 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == netmap_rxsync_from_host_compat
 *                  netmap_rxsync_from_host(na, NULL, NULL)
 *    - tx to host stack
 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *             kring->nm_sync() == netmap_txsync_to_host_compat
 *               netmap_txsync_to_host(na)
 *                 NM_SEND_UP()
 *                   FreeBSD: na->if_input() == ?? XXX
 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
 *
 *
 *
 *              -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
 *
 *
 *
 *              -= VALE PORT =-
 *
 *
 *
 *              -= NETMAP PIPE =-
 *
 *
 *
 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
 *
 *
 *
 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
 *
 *
 *
 *              -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
 *
 *
 *
 *              -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
 *
 *
 *
 */


/*
 * OS-specific code that is used only within this file.
346 * Other OS-specific code that must be accessed by drivers 347 * is present in netmap_kern.h 348 */ 349 350 #if defined(__FreeBSD__) 351 #include <sys/cdefs.h> /* prerequisite */ 352 #include <sys/types.h> 353 #include <sys/errno.h> 354 #include <sys/param.h> /* defines used in kernel.h */ 355 #include <sys/kernel.h> /* types used in module initialization */ 356 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 357 #include <sys/filio.h> /* FIONBIO */ 358 #include <sys/sockio.h> 359 #include <sys/socketvar.h> /* struct socket */ 360 #include <sys/malloc.h> 361 #include <sys/poll.h> 362 #include <sys/rwlock.h> 363 #include <sys/socket.h> /* sockaddrs */ 364 #include <sys/selinfo.h> 365 #include <sys/sysctl.h> 366 #include <sys/jail.h> 367 #include <net/vnet.h> 368 #include <net/if.h> 369 #include <net/if_var.h> 370 #include <net/bpf.h> /* BIOCIMMEDIATE */ 371 #include <machine/bus.h> /* bus_dmamap_* */ 372 #include <sys/endian.h> 373 #include <sys/refcount.h> 374 375 376 /* reduce conditional code */ 377 // linux API, use for the knlist in FreeBSD 378 #define init_waitqueue_head(x) knlist_init_mtx(&(x)->si_note, NULL) 379 380 void freebsd_selwakeup(struct selinfo *si, int pri); 381 #define OS_selwakeup(a, b) freebsd_selwakeup(a, b) 382 383 #elif defined(linux) 384 385 #include "bsd_glue.h" 386 387 388 389 #elif defined(__APPLE__) 390 391 #warning OSX support is only partial 392 #include "osx_glue.h" 393 394 #else 395 396 #error Unsupported platform 397 398 #endif /* unsupported */ 399 400 /* 401 * common headers 402 */ 403 #include <net/netmap.h> 404 #include <dev/netmap/netmap_kern.h> 405 #include <dev/netmap/netmap_mem2.h> 406 407 408 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 409 410 /* 411 * The following variables are used by the drivers and replicate 412 * fields in the global memory pool. They only refer to buffers 413 * used by physical interfaces. 414 */ 415 u_int netmap_total_buffers; 416 u_int netmap_buf_size; 417 char *netmap_buffer_base; /* also address of an invalid buffer */ 418 419 /* user-controlled variables */ 420 int netmap_verbose; 421 422 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 423 424 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 425 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 426 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 427 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 428 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 429 int netmap_mitigate = 1; 430 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 431 int netmap_no_pendintr = 1; 432 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 433 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 434 int netmap_txsync_retry = 2; 435 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 436 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 437 438 int netmap_adaptive_io = 0; 439 SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW, 440 &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt"); 441 442 int netmap_flags = 0; /* debug flags */ 443 int netmap_fwd = 0; /* force transparent mode */ 444 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 445 446 /* 447 * netmap_admode selects the netmap mode to use. 
 * Invalid values are reset to NETMAP_ADMODE_BEST
 */
enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
	NETMAP_ADMODE_NATIVE,	/* either native or none */
	NETMAP_ADMODE_GENERIC,	/* force generic */
	NETMAP_ADMODE_LAST };
static int netmap_admode = NETMAP_ADMODE_BEST;

int netmap_generic_mit = 100*1000;	/* Generic mitigation interval in nanoseconds. */
int netmap_generic_ringsize = 1024;	/* Generic ringsize. */
int netmap_generic_rings = 1;		/* number of queues in generic. */

SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");

NMG_LOCK_T	netmap_global_lock;


static void
nm_kr_get(struct netmap_kring *kr)
{
	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
		tsleep(kr, 0, "NM_KR_GET", 4);
}


/*
 * mark the ring as stopped, and run through the locks
 * to make sure other users get to see it.
 */
static void
netmap_disable_ring(struct netmap_kring *kr)
{
	kr->nkr_stopped = 1;
	nm_kr_get(kr);
	mtx_lock(&kr->q_lock);
	mtx_unlock(&kr->q_lock);
	nm_kr_put(kr);
}

/* stop or enable a single tx ring */
void
netmap_set_txring(struct netmap_adapter *na, u_int ring_id, int stopped)
{
	if (stopped)
		netmap_disable_ring(na->tx_rings + ring_id);
	else
		na->tx_rings[ring_id].nkr_stopped = 0;
	/* notify that the stopped state has changed. This is currently
	 * only used by bwrap to propagate the state to its own krings.
	 * (see netmap_bwrap_intr_notify).
	 */
	na->nm_notify(na, ring_id, NR_TX, NAF_DISABLE_NOTIFY);
}

/* stop or enable a single rx ring */
void
netmap_set_rxring(struct netmap_adapter *na, u_int ring_id, int stopped)
{
	if (stopped)
		netmap_disable_ring(na->rx_rings + ring_id);
	else
		na->rx_rings[ring_id].nkr_stopped = 0;
	/* notify that the stopped state has changed. This is currently
	 * only used by bwrap to propagate the state to its own krings.
	 * (see netmap_bwrap_intr_notify).
	 */
	na->nm_notify(na, ring_id, NR_RX, NAF_DISABLE_NOTIFY);
}


/* stop or enable all the rings of na */
void
netmap_set_all_rings(struct netmap_adapter *na, int stopped)
{
	int i;
	u_int ntx, nrx;

	if (!nm_netmap_on(na))
		return;

	ntx = netmap_real_tx_rings(na);
	nrx = netmap_real_rx_rings(na);

	for (i = 0; i < ntx; i++) {
		netmap_set_txring(na, i, stopped);
	}

	for (i = 0; i < nrx; i++) {
		netmap_set_rxring(na, i, stopped);
	}
}

/*
 * Convenience function used in drivers. Waits for current txsync()s/rxsync()s
 * to finish and prevents any new one from starting. Call this before turning
 * netmap mode off, or before removing the hardware rings (e.g., on module
 * unload). As a rule of thumb for linux drivers, this should be placed near
 * each napi_disable().
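 *
 * A hedged sketch of the intended call pattern in a (hypothetical) driver
 * reconfiguration path:
 *
 *	netmap_disable_all_rings(ifp);	// quiesce *xsync() and stop the rings
 *	// napi_disable(), device reset, ring reallocation, ...
 *	netmap_enable_all_rings(ifp);	// re-enable the rings and notify waiters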
552 */ 553 void 554 netmap_disable_all_rings(struct ifnet *ifp) 555 { 556 netmap_set_all_rings(NA(ifp), 1 /* stopped */); 557 } 558 559 /* 560 * Convenience function used in drivers. Re-enables rxsync and txsync on the 561 * adapter's rings In linux drivers, this should be placed near each 562 * napi_enable(). 563 */ 564 void 565 netmap_enable_all_rings(struct ifnet *ifp) 566 { 567 netmap_set_all_rings(NA(ifp), 0 /* enabled */); 568 } 569 570 571 /* 572 * generic bound_checking function 573 */ 574 u_int 575 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 576 { 577 u_int oldv = *v; 578 const char *op = NULL; 579 580 if (dflt < lo) 581 dflt = lo; 582 if (dflt > hi) 583 dflt = hi; 584 if (oldv < lo) { 585 *v = dflt; 586 op = "Bump"; 587 } else if (oldv > hi) { 588 *v = hi; 589 op = "Clamp"; 590 } 591 if (op && msg) 592 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 593 return *v; 594 } 595 596 597 /* 598 * packet-dump function, user-supplied or static buffer. 599 * The destination buffer must be at least 30+4*len 600 */ 601 const char * 602 nm_dump_buf(char *p, int len, int lim, char *dst) 603 { 604 static char _dst[8192]; 605 int i, j, i0; 606 static char hex[] ="0123456789abcdef"; 607 char *o; /* output position */ 608 609 #define P_HI(x) hex[((x) & 0xf0)>>4] 610 #define P_LO(x) hex[((x) & 0xf)] 611 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') 612 if (!dst) 613 dst = _dst; 614 if (lim <= 0 || lim > len) 615 lim = len; 616 o = dst; 617 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 618 o += strlen(o); 619 /* hexdump routine */ 620 for (i = 0; i < lim; ) { 621 sprintf(o, "%5d: ", i); 622 o += strlen(o); 623 memset(o, ' ', 48); 624 i0 = i; 625 for (j=0; j < 16 && i < lim; i++, j++) { 626 o[j*3] = P_HI(p[i]); 627 o[j*3+1] = P_LO(p[i]); 628 } 629 i = i0; 630 for (j=0; j < 16 && i < lim; i++, j++) 631 o[j + 48] = P_C(p[i]); 632 o[j+48] = '\n'; 633 o += j+49; 634 } 635 *o = '\0'; 636 #undef P_HI 637 #undef P_LO 638 #undef P_C 639 return dst; 640 } 641 642 643 /* 644 * Fetch configuration from the device, to cope with dynamic 645 * reconfigurations after loading the module. 
646 */ 647 /* call with NMG_LOCK held */ 648 int 649 netmap_update_config(struct netmap_adapter *na) 650 { 651 u_int txr, txd, rxr, rxd; 652 653 txr = txd = rxr = rxd = 0; 654 if (na->nm_config) { 655 na->nm_config(na, &txr, &txd, &rxr, &rxd); 656 } else { 657 /* take whatever we had at init time */ 658 txr = na->num_tx_rings; 659 txd = na->num_tx_desc; 660 rxr = na->num_rx_rings; 661 rxd = na->num_rx_desc; 662 } 663 664 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 665 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 666 return 0; /* nothing changed */ 667 if (netmap_verbose || na->active_fds > 0) { 668 D("stored config %s: txring %d x %d, rxring %d x %d", 669 na->name, 670 na->num_tx_rings, na->num_tx_desc, 671 na->num_rx_rings, na->num_rx_desc); 672 D("new config %s: txring %d x %d, rxring %d x %d", 673 na->name, txr, txd, rxr, rxd); 674 } 675 if (na->active_fds == 0) { 676 D("configuration changed (but fine)"); 677 na->num_tx_rings = txr; 678 na->num_tx_desc = txd; 679 na->num_rx_rings = rxr; 680 na->num_rx_desc = rxd; 681 return 0; 682 } 683 D("configuration changed while active, this is bad..."); 684 return 1; 685 } 686 687 /* kring->nm_sync callback for the host tx ring */ 688 static int 689 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) 690 { 691 (void)flags; /* unused */ 692 netmap_txsync_to_host(kring->na); 693 return 0; 694 } 695 696 /* kring->nm_sync callback for the host rx ring */ 697 static int 698 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) 699 { 700 (void)flags; /* unused */ 701 netmap_rxsync_from_host(kring->na, NULL, NULL); 702 return 0; 703 } 704 705 706 707 /* create the krings array and initialize the fields common to all adapters. 708 * The array layout is this: 709 * 710 * +----------+ 711 * na->tx_rings ----->| | \ 712 * | | } na->num_tx_ring 713 * | | / 714 * +----------+ 715 * | | host tx kring 716 * na->rx_rings ----> +----------+ 717 * | | \ 718 * | | } na->num_rx_rings 719 * | | / 720 * +----------+ 721 * | | host rx kring 722 * +----------+ 723 * na->tailroom ----->| | \ 724 * | | } tailroom bytes 725 * | | / 726 * +----------+ 727 * 728 * Note: for compatibility, host krings are created even when not needed. 729 * The tailroom space is currently used by vale ports for allocating leases. 730 */ 731 /* call with NMG_LOCK held */ 732 int 733 netmap_krings_create(struct netmap_adapter *na, u_int tailroom) 734 { 735 u_int i, len, ndesc; 736 struct netmap_kring *kring; 737 u_int ntx, nrx; 738 739 /* account for the (possibly fake) host rings */ 740 ntx = na->num_tx_rings + 1; 741 nrx = na->num_rx_rings + 1; 742 743 len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; 744 745 na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); 746 if (na->tx_rings == NULL) { 747 D("Cannot allocate krings"); 748 return ENOMEM; 749 } 750 na->rx_rings = na->tx_rings + ntx; 751 752 /* 753 * All fields in krings are 0 except the one initialized below. 754 * but better be explicit on important kring fields. 755 */ 756 ndesc = na->num_tx_desc; 757 for (i = 0; i < ntx; i++) { /* Transmit rings */ 758 kring = &na->tx_rings[i]; 759 bzero(kring, sizeof(*kring)); 760 kring->na = na; 761 kring->ring_id = i; 762 kring->nkr_num_slots = ndesc; 763 if (i < na->num_tx_rings) { 764 kring->nm_sync = na->nm_txsync; 765 } else if (i == na->num_tx_rings) { 766 kring->nm_sync = netmap_txsync_to_host_compat; 767 } 768 /* 769 * IMPORTANT: Always keep one slot empty. 
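		 * With nkr_num_slots == N, at most N-1 slots are ever in use, so a
		 * completely full ring cannot be confused with an empty one: this is
		 * why the TX kring starts below with hwtail = ndesc - 1 (all slots
		 * free), while the RX kring starts with hwtail = 0 (no packets yet).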
770 */ 771 kring->rhead = kring->rcur = kring->nr_hwcur = 0; 772 kring->rtail = kring->nr_hwtail = ndesc - 1; 773 snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", na->name, i); 774 ND("ktx %s h %d c %d t %d", 775 kring->name, kring->rhead, kring->rcur, kring->rtail); 776 mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); 777 init_waitqueue_head(&kring->si); 778 } 779 780 ndesc = na->num_rx_desc; 781 for (i = 0; i < nrx; i++) { /* Receive rings */ 782 kring = &na->rx_rings[i]; 783 bzero(kring, sizeof(*kring)); 784 kring->na = na; 785 kring->ring_id = i; 786 kring->nkr_num_slots = ndesc; 787 if (i < na->num_rx_rings) { 788 kring->nm_sync = na->nm_rxsync; 789 } else if (i == na->num_rx_rings) { 790 kring->nm_sync = netmap_rxsync_from_host_compat; 791 } 792 kring->rhead = kring->rcur = kring->nr_hwcur = 0; 793 kring->rtail = kring->nr_hwtail = 0; 794 snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", na->name, i); 795 ND("krx %s h %d c %d t %d", 796 kring->name, kring->rhead, kring->rcur, kring->rtail); 797 mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); 798 init_waitqueue_head(&kring->si); 799 } 800 init_waitqueue_head(&na->tx_si); 801 init_waitqueue_head(&na->rx_si); 802 803 na->tailroom = na->rx_rings + nrx; 804 805 return 0; 806 } 807 808 809 /* undo the actions performed by netmap_krings_create */ 810 /* call with NMG_LOCK held */ 811 void 812 netmap_krings_delete(struct netmap_adapter *na) 813 { 814 struct netmap_kring *kring = na->tx_rings; 815 816 /* we rely on the krings layout described above */ 817 for ( ; kring != na->tailroom; kring++) { 818 mtx_destroy(&kring->q_lock); 819 } 820 free(na->tx_rings, M_DEVBUF); 821 na->tx_rings = na->rx_rings = na->tailroom = NULL; 822 } 823 824 825 /* 826 * Destructor for NIC ports. They also have an mbuf queue 827 * on the rings connected to the host so we need to purge 828 * them first. 829 */ 830 /* call with NMG_LOCK held */ 831 static void 832 netmap_hw_krings_delete(struct netmap_adapter *na) 833 { 834 struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; 835 836 ND("destroy sw mbq with len %d", mbq_len(q)); 837 mbq_purge(q); 838 mbq_safe_destroy(q); 839 netmap_krings_delete(na); 840 } 841 842 843 /* create a new netmap_if for a newly registered fd. 844 * If this is the first registration of the adapter, 845 * also create the netmap rings and their in-kernel view, 846 * the netmap krings. 847 */ 848 /* call with NMG_LOCK held */ 849 static struct netmap_if* 850 netmap_if_new(struct netmap_adapter *na) 851 { 852 struct netmap_if *nifp; 853 854 if (netmap_update_config(na)) { 855 /* configuration mismatch, report and fail */ 856 return NULL; 857 } 858 859 if (na->active_fds) /* already registered */ 860 goto final; 861 862 /* create and init the krings arrays. 863 * Depending on the adapter, this may also create 864 * the netmap rings themselves 865 */ 866 if (na->nm_krings_create(na)) 867 return NULL; 868 869 /* create all missing netmap rings */ 870 if (netmap_mem_rings_create(na)) 871 goto cleanup; 872 873 final: 874 875 /* in all cases, create a new netmap if */ 876 nifp = netmap_mem_if_new(na); 877 if (nifp == NULL) 878 goto cleanup; 879 880 return (nifp); 881 882 cleanup: 883 884 if (na->active_fds == 0) { 885 netmap_mem_rings_delete(na); 886 na->nm_krings_delete(na); 887 } 888 889 return NULL; 890 } 891 892 893 /* grab a reference to the memory allocator, if we don't have one already. The 894 * reference is taken from the netmap_adapter registered with the priv. 
895 */ 896 /* call with NMG_LOCK held */ 897 static int 898 netmap_get_memory_locked(struct netmap_priv_d* p) 899 { 900 struct netmap_mem_d *nmd; 901 int error = 0; 902 903 if (p->np_na == NULL) { 904 if (!netmap_mmap_unreg) 905 return ENODEV; 906 /* for compatibility with older versions of the API 907 * we use the global allocator when no interface has been 908 * registered 909 */ 910 nmd = &nm_mem; 911 } else { 912 nmd = p->np_na->nm_mem; 913 } 914 if (p->np_mref == NULL) { 915 error = netmap_mem_finalize(nmd, p->np_na); 916 if (!error) 917 p->np_mref = nmd; 918 } else if (p->np_mref != nmd) { 919 /* a virtual port has been registered, but previous 920 * syscalls already used the global allocator. 921 * We cannot continue 922 */ 923 error = ENODEV; 924 } 925 return error; 926 } 927 928 929 /* call with NMG_LOCK *not* held */ 930 int 931 netmap_get_memory(struct netmap_priv_d* p) 932 { 933 int error; 934 NMG_LOCK(); 935 error = netmap_get_memory_locked(p); 936 NMG_UNLOCK(); 937 return error; 938 } 939 940 941 /* call with NMG_LOCK held */ 942 static int 943 netmap_have_memory_locked(struct netmap_priv_d* p) 944 { 945 return p->np_mref != NULL; 946 } 947 948 949 /* call with NMG_LOCK held */ 950 static void 951 netmap_drop_memory_locked(struct netmap_priv_d* p) 952 { 953 if (p->np_mref) { 954 netmap_mem_deref(p->np_mref, p->np_na); 955 p->np_mref = NULL; 956 } 957 } 958 959 960 /* 961 * Call nm_register(ifp,0) to stop netmap mode on the interface and 962 * revert to normal operation. 963 * The second argument is the nifp to work on. In some cases it is 964 * not attached yet to the netmap_priv_d so we need to pass it as 965 * a separate argument. 966 */ 967 /* call with NMG_LOCK held */ 968 static void 969 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 970 { 971 struct netmap_adapter *na = priv->np_na; 972 973 NMG_LOCK_ASSERT(); 974 na->active_fds--; 975 if (na->active_fds <= 0) { /* last instance */ 976 977 if (netmap_verbose) 978 D("deleting last instance for %s", na->name); 979 /* 980 * (TO CHECK) This function is only called 981 * when the last reference to this file descriptor goes 982 * away. This means we cannot have any pending poll() 983 * or interrupt routine operating on the structure. 984 * XXX The file may be closed in a thread while 985 * another thread is using it. 986 * Linux keeps the file opened until the last reference 987 * by any outstanding ioctl/poll or mmap is gone. 988 * FreeBSD does not track mmap()s (but we do) and 989 * wakes up any sleeping poll(). Need to check what 990 * happens if the close() occurs while a concurrent 991 * syscall is running. 992 */ 993 na->nm_register(na, 0); /* off, clear flags */ 994 /* Wake up any sleeping threads. netmap_poll will 995 * then return POLLERR 996 * XXX The wake up now must happen during *_down(), when 997 * we order all activities to stop. -gl 998 */ 999 /* XXX kqueue(9) needed; these will mirror knlist_init. 
*/ 1000 /* knlist_destroy(&na->tx_si.si_note); */ 1001 /* knlist_destroy(&na->rx_si.si_note); */ 1002 1003 /* delete rings and buffers */ 1004 netmap_mem_rings_delete(na); 1005 na->nm_krings_delete(na); 1006 } 1007 /* delete the nifp */ 1008 netmap_mem_if_delete(na, nifp); 1009 } 1010 1011 /* call with NMG_LOCK held */ 1012 static __inline int 1013 nm_tx_si_user(struct netmap_priv_d *priv) 1014 { 1015 return (priv->np_na != NULL && 1016 (priv->np_txqlast - priv->np_txqfirst > 1)); 1017 } 1018 1019 /* call with NMG_LOCK held */ 1020 static __inline int 1021 nm_rx_si_user(struct netmap_priv_d *priv) 1022 { 1023 return (priv->np_na != NULL && 1024 (priv->np_rxqlast - priv->np_rxqfirst > 1)); 1025 } 1026 1027 1028 /* 1029 * Destructor of the netmap_priv_d, called when the fd has 1030 * no active open() and mmap(). Also called in error paths. 1031 * 1032 * returns 1 if this is the last instance and we can free priv 1033 */ 1034 /* call with NMG_LOCK held */ 1035 int 1036 netmap_dtor_locked(struct netmap_priv_d *priv) 1037 { 1038 struct netmap_adapter *na = priv->np_na; 1039 1040 #ifdef __FreeBSD__ 1041 /* 1042 * np_refcount is the number of active mmaps on 1043 * this file descriptor 1044 */ 1045 if (--priv->np_refcount > 0) { 1046 return 0; 1047 } 1048 #endif /* __FreeBSD__ */ 1049 if (!na) { 1050 return 1; //XXX is it correct? 1051 } 1052 netmap_do_unregif(priv, priv->np_nifp); 1053 priv->np_nifp = NULL; 1054 netmap_drop_memory_locked(priv); 1055 if (priv->np_na) { 1056 if (nm_tx_si_user(priv)) 1057 na->tx_si_users--; 1058 if (nm_rx_si_user(priv)) 1059 na->rx_si_users--; 1060 netmap_adapter_put(na); 1061 priv->np_na = NULL; 1062 } 1063 return 1; 1064 } 1065 1066 1067 /* call with NMG_LOCK *not* held */ 1068 void 1069 netmap_dtor(void *data) 1070 { 1071 struct netmap_priv_d *priv = data; 1072 int last_instance; 1073 1074 NMG_LOCK(); 1075 last_instance = netmap_dtor_locked(priv); 1076 NMG_UNLOCK(); 1077 if (last_instance) { 1078 bzero(priv, sizeof(*priv)); /* for safety */ 1079 free(priv, M_DEVBUF); 1080 } 1081 } 1082 1083 1084 1085 1086 /* 1087 * Handlers for synchronization of the queues from/to the host. 1088 * Netmap has two operating modes: 1089 * - in the default mode, the rings connected to the host stack are 1090 * just another ring pair managed by userspace; 1091 * - in transparent mode (XXX to be defined) incoming packets 1092 * (from the host or the NIC) are marked as NS_FORWARD upon 1093 * arrival, and the user application has a chance to reset the 1094 * flag for packets that should be dropped. 1095 * On the RXSYNC or poll(), packets in RX rings between 1096 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 1097 * to the other side. 1098 * The transfer NIC --> host is relatively easy, just encapsulate 1099 * into mbufs and we are done. The host --> NIC side is slightly 1100 * harder because there might not be room in the tx ring so it 1101 * might take a while before releasing the buffer. 1102 */ 1103 1104 1105 /* 1106 * pass a chain of buffers to the host stack as coming from 'dst' 1107 * We do not need to lock because the queue is private. 
1108 */ 1109 static void 1110 netmap_send_up(struct ifnet *dst, struct mbq *q) 1111 { 1112 struct mbuf *m; 1113 1114 /* send packets up, outside the lock */ 1115 while ((m = mbq_dequeue(q)) != NULL) { 1116 if (netmap_verbose & NM_VERB_HOST) 1117 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 1118 NM_SEND_UP(dst, m); 1119 } 1120 mbq_destroy(q); 1121 } 1122 1123 1124 /* 1125 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 1126 * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) 1127 * and pass them up. Drop remaining packets in the unlikely event 1128 * of an mbuf shortage. 1129 */ 1130 static void 1131 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 1132 { 1133 u_int const lim = kring->nkr_num_slots - 1; 1134 u_int const head = kring->ring->head; 1135 u_int n; 1136 struct netmap_adapter *na = kring->na; 1137 1138 for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { 1139 struct mbuf *m; 1140 struct netmap_slot *slot = &kring->ring->slot[n]; 1141 1142 if ((slot->flags & NS_FORWARD) == 0 && !force) 1143 continue; 1144 if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) { 1145 RD(5, "bad pkt at %d len %d", n, slot->len); 1146 continue; 1147 } 1148 slot->flags &= ~NS_FORWARD; // XXX needed ? 1149 /* XXX TODO: adapt to the case of a multisegment packet */ 1150 m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL); 1151 1152 if (m == NULL) 1153 break; 1154 mbq_enqueue(q, m); 1155 } 1156 } 1157 1158 1159 /* 1160 * Send to the NIC rings packets marked NS_FORWARD between 1161 * kring->nr_hwcur and kring->rhead 1162 * Called under kring->rx_queue.lock on the sw rx ring, 1163 */ 1164 static u_int 1165 netmap_sw_to_nic(struct netmap_adapter *na) 1166 { 1167 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1168 struct netmap_slot *rxslot = kring->ring->slot; 1169 u_int i, rxcur = kring->nr_hwcur; 1170 u_int const head = kring->rhead; 1171 u_int const src_lim = kring->nkr_num_slots - 1; 1172 u_int sent = 0; 1173 1174 /* scan rings to find space, then fill as much as possible */ 1175 for (i = 0; i < na->num_tx_rings; i++) { 1176 struct netmap_kring *kdst = &na->tx_rings[i]; 1177 struct netmap_ring *rdst = kdst->ring; 1178 u_int const dst_lim = kdst->nkr_num_slots - 1; 1179 1180 /* XXX do we trust ring or kring->rcur,rtail ? */ 1181 for (; rxcur != head && !nm_ring_empty(rdst); 1182 rxcur = nm_next(rxcur, src_lim) ) { 1183 struct netmap_slot *src, *dst, tmp; 1184 u_int dst_cur = rdst->cur; 1185 1186 src = &rxslot[rxcur]; 1187 if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) 1188 continue; 1189 1190 sent++; 1191 1192 dst = &rdst->slot[dst_cur]; 1193 1194 tmp = *src; 1195 1196 src->buf_idx = dst->buf_idx; 1197 src->flags = NS_BUF_CHANGED; 1198 1199 dst->buf_idx = tmp.buf_idx; 1200 dst->len = tmp.len; 1201 dst->flags = NS_BUF_CHANGED; 1202 1203 rdst->cur = nm_next(dst_cur, dst_lim); 1204 } 1205 /* if (sent) XXX txsync ? */ 1206 } 1207 return sent; 1208 } 1209 1210 1211 /* 1212 * netmap_txsync_to_host() passes packets up. We are called from a 1213 * system call in user process context, and the only contention 1214 * can be among multiple user threads erroneously calling 1215 * this routine concurrently. 
1216 */ 1217 void 1218 netmap_txsync_to_host(struct netmap_adapter *na) 1219 { 1220 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 1221 struct netmap_ring *ring = kring->ring; 1222 u_int const lim = kring->nkr_num_slots - 1; 1223 u_int const head = kring->rhead; 1224 struct mbq q; 1225 1226 /* Take packets from hwcur to head and pass them up. 1227 * force head = cur since netmap_grab_packets() stops at head 1228 * In case of no buffers we give up. At the end of the loop, 1229 * the queue is drained in all cases. 1230 */ 1231 mbq_init(&q); 1232 ring->cur = head; 1233 netmap_grab_packets(kring, &q, 1 /* force */); 1234 ND("have %d pkts in queue", mbq_len(&q)); 1235 kring->nr_hwcur = head; 1236 kring->nr_hwtail = head + lim; 1237 if (kring->nr_hwtail > lim) 1238 kring->nr_hwtail -= lim + 1; 1239 nm_txsync_finalize(kring); 1240 1241 netmap_send_up(na->ifp, &q); 1242 } 1243 1244 1245 /* 1246 * rxsync backend for packets coming from the host stack. 1247 * They have been put in kring->rx_queue by netmap_transmit(). 1248 * We protect access to the kring using kring->rx_queue.lock 1249 * 1250 * This routine also does the selrecord if called from the poll handler 1251 * (we know because td != NULL). 1252 * 1253 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 1254 * as an additional hidden argument. 1255 * returns the number of packets delivered to tx queues in 1256 * transparent mode, or a negative value if error 1257 */ 1258 int 1259 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 1260 { 1261 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1262 struct netmap_ring *ring = kring->ring; 1263 u_int nm_i, n; 1264 u_int const lim = kring->nkr_num_slots - 1; 1265 u_int const head = kring->rhead; 1266 int ret = 0; 1267 struct mbq *q = &kring->rx_queue; 1268 1269 (void)pwait; /* disable unused warnings */ 1270 (void)td; 1271 1272 mbq_lock(q); 1273 1274 /* First part: import newly received packets */ 1275 n = mbq_len(q); 1276 if (n) { /* grab packets from the queue */ 1277 struct mbuf *m; 1278 uint32_t stop_i; 1279 1280 nm_i = kring->nr_hwtail; 1281 stop_i = nm_prev(nm_i, lim); 1282 while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { 1283 int len = MBUF_LEN(m); 1284 struct netmap_slot *slot = &ring->slot[nm_i]; 1285 1286 m_copydata(m, 0, len, NMB(na, slot)); 1287 ND("nm %d len %d", nm_i, len); 1288 if (netmap_verbose) 1289 D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL)); 1290 1291 slot->len = len; 1292 slot->flags = kring->nkr_slot_flags; 1293 nm_i = nm_next(nm_i, lim); 1294 m_freem(m); 1295 } 1296 kring->nr_hwtail = nm_i; 1297 } 1298 1299 /* 1300 * Second part: skip past packets that userspace has released. 1301 */ 1302 nm_i = kring->nr_hwcur; 1303 if (nm_i != head) { /* something was released */ 1304 if (netmap_fwd || kring->ring->flags & NR_FORWARD) 1305 ret = netmap_sw_to_nic(na); 1306 kring->nr_hwcur = head; 1307 } 1308 1309 nm_rxsync_finalize(kring); 1310 1311 /* access copies of cur,tail in the kring */ 1312 if (kring->rcur == kring->rtail && td) /* no bufs available */ 1313 selrecord(td, &kring->si); 1314 1315 mbq_unlock(q); 1316 return ret; 1317 } 1318 1319 1320 /* Get a netmap adapter for the port. 1321 * 1322 * If it is possible to satisfy the request, return 0 1323 * with *na containing the netmap adapter found. 1324 * Otherwise return an error code, with *na containing NULL. 1325 * 1326 * When the port is attached to a bridge, we always return 1327 * EBUSY. 
1328 * Otherwise, if the port is already bound to a file descriptor, 1329 * then we unconditionally return the existing adapter into *na. 1330 * In all the other cases, we return (into *na) either native, 1331 * generic or NULL, according to the following table: 1332 * 1333 * native_support 1334 * active_fds dev.netmap.admode YES NO 1335 * ------------------------------------------------------- 1336 * >0 * NA(ifp) NA(ifp) 1337 * 1338 * 0 NETMAP_ADMODE_BEST NATIVE GENERIC 1339 * 0 NETMAP_ADMODE_NATIVE NATIVE NULL 1340 * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC 1341 * 1342 */ 1343 1344 int 1345 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) 1346 { 1347 /* generic support */ 1348 int i = netmap_admode; /* Take a snapshot. */ 1349 int error = 0; 1350 struct netmap_adapter *prev_na; 1351 struct netmap_generic_adapter *gna; 1352 1353 *na = NULL; /* default */ 1354 1355 /* reset in case of invalid value */ 1356 if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) 1357 i = netmap_admode = NETMAP_ADMODE_BEST; 1358 1359 if (NETMAP_CAPABLE(ifp)) { 1360 prev_na = NA(ifp); 1361 /* If an adapter already exists, return it if 1362 * there are active file descriptors or if 1363 * netmap is not forced to use generic 1364 * adapters. 1365 */ 1366 if (NETMAP_OWNED_BY_ANY(prev_na) 1367 || i != NETMAP_ADMODE_GENERIC 1368 || prev_na->na_flags & NAF_FORCE_NATIVE 1369 #ifdef WITH_PIPES 1370 /* ugly, but we cannot allow an adapter switch 1371 * if some pipe is referring to this one 1372 */ 1373 || prev_na->na_next_pipe > 0 1374 #endif 1375 ) { 1376 *na = prev_na; 1377 return 0; 1378 } 1379 } 1380 1381 /* If there isn't native support and netmap is not allowed 1382 * to use generic adapters, we cannot satisfy the request. 1383 */ 1384 if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) 1385 return EOPNOTSUPP; 1386 1387 /* Otherwise, create a generic adapter and return it, 1388 * saving the previously used netmap adapter, if any. 1389 * 1390 * Note that here 'prev_na', if not NULL, MUST be a 1391 * native adapter, and CANNOT be a generic one. This is 1392 * true because generic adapters are created on demand, and 1393 * destroyed when not used anymore. Therefore, if the adapter 1394 * currently attached to an interface 'ifp' is generic, it 1395 * must be that 1396 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). 1397 * Consequently, if NA(ifp) is generic, we will enter one of 1398 * the branches above. This ensures that we never override 1399 * a generic adapter with another generic adapter. 1400 */ 1401 prev_na = NA(ifp); 1402 error = generic_netmap_attach(ifp); 1403 if (error) 1404 return error; 1405 1406 *na = NA(ifp); 1407 gna = (struct netmap_generic_adapter*)NA(ifp); 1408 gna->prev = prev_na; /* save old na */ 1409 if (prev_na != NULL) { 1410 ifunit_ref(ifp->if_xname); 1411 // XXX add a refcount ? 1412 netmap_adapter_get(prev_na); 1413 } 1414 ND("Created generic NA %p (prev %p)", gna, gna->prev); 1415 1416 return 0; 1417 } 1418 1419 1420 /* 1421 * MUST BE CALLED UNDER NMG_LOCK() 1422 * 1423 * Get a refcounted reference to a netmap adapter attached 1424 * to the interface specified by nmr. 1425 * This is always called in the execution of an ioctl(). 1426 * 1427 * Return ENXIO if the interface specified by the request does 1428 * not exist, ENOTSUP if netmap is not supported by the interface, 1429 * EBUSY if the interface is already attached to a bridge, 1430 * EINVAL if parameters are invalid, ENOMEM if needed resources 1431 * could not be allocated. 
1432 * If successful, hold a reference to the netmap adapter. 1433 * 1434 * No reference is kept on the real interface, which may then 1435 * disappear at any time. 1436 */ 1437 int 1438 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) 1439 { 1440 struct ifnet *ifp = NULL; 1441 int error = 0; 1442 struct netmap_adapter *ret = NULL; 1443 1444 *na = NULL; /* default return value */ 1445 1446 NMG_LOCK_ASSERT(); 1447 1448 /* we cascade through all possibile types of netmap adapter. 1449 * All netmap_get_*_na() functions return an error and an na, 1450 * with the following combinations: 1451 * 1452 * error na 1453 * 0 NULL type doesn't match 1454 * !0 NULL type matches, but na creation/lookup failed 1455 * 0 !NULL type matches and na created/found 1456 * !0 !NULL impossible 1457 */ 1458 1459 /* try to see if this is a monitor port */ 1460 error = netmap_get_monitor_na(nmr, na, create); 1461 if (error || *na != NULL) 1462 return error; 1463 1464 /* try to see if this is a pipe port */ 1465 error = netmap_get_pipe_na(nmr, na, create); 1466 if (error || *na != NULL) 1467 return error; 1468 1469 /* try to see if this is a bridge port */ 1470 error = netmap_get_bdg_na(nmr, na, create); 1471 if (error) 1472 return error; 1473 1474 if (*na != NULL) /* valid match in netmap_get_bdg_na() */ 1475 goto pipes; 1476 1477 /* 1478 * This must be a hardware na, lookup the name in the system. 1479 * Note that by hardware we actually mean "it shows up in ifconfig". 1480 * This may still be a tap, a veth/epair, or even a 1481 * persistent VALE port. 1482 */ 1483 ifp = ifunit_ref(nmr->nr_name); 1484 if (ifp == NULL) { 1485 return ENXIO; 1486 } 1487 1488 error = netmap_get_hw_na(ifp, &ret); 1489 if (error) 1490 goto out; 1491 1492 *na = ret; 1493 netmap_adapter_get(ret); 1494 1495 pipes: 1496 /* 1497 * If we are opening a pipe whose parent was not in netmap mode, 1498 * we have to allocate the pipe array now. 1499 * XXX get rid of this clumsiness (2014-03-15) 1500 */ 1501 error = netmap_pipe_alloc(*na, nmr); 1502 1503 out: 1504 if (error && ret != NULL) 1505 netmap_adapter_put(ret); 1506 1507 if (ifp) 1508 if_rele(ifp); /* allow live unloading of drivers modules */ 1509 1510 return error; 1511 } 1512 1513 1514 /* 1515 * validate parameters on entry for *_txsync() 1516 * Returns ring->cur if ok, or something >= kring->nkr_num_slots 1517 * in case of error. 1518 * 1519 * rhead, rcur and rtail=hwtail are stored from previous round. 1520 * hwcur is the next packet to send to the ring. 1521 * 1522 * We want 1523 * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail 1524 * 1525 * hwcur, rhead, rtail and hwtail are reliable 1526 */ 1527 u_int 1528 nm_txsync_prologue(struct netmap_kring *kring) 1529 { 1530 struct netmap_ring *ring = kring->ring; 1531 u_int head = ring->head; /* read only once */ 1532 u_int cur = ring->cur; /* read only once */ 1533 u_int n = kring->nkr_num_slots; 1534 1535 ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", 1536 kring->name, 1537 kring->nr_hwcur, kring->nr_hwtail, 1538 ring->head, ring->cur, ring->tail); 1539 #if 1 /* kernel sanity checks; but we can trust the kring. */ 1540 if (kring->nr_hwcur >= n || kring->rhead >= n || 1541 kring->rtail >= n || kring->nr_hwtail >= n) 1542 goto error; 1543 #endif /* kernel sanity checks */ 1544 /* 1545 * user sanity checks. We only use 'cur', 1546 * A, B, ... are possible positions for cur: 1547 * 1548 * 0 A cur B tail C n-1 1549 * 0 D tail E cur F n-1 1550 * 1551 * B, F, D are valid. 
A, C, E are wrong 1552 */ 1553 if (kring->rtail >= kring->rhead) { 1554 /* want rhead <= head <= rtail */ 1555 if (head < kring->rhead || head > kring->rtail) 1556 goto error; 1557 /* and also head <= cur <= rtail */ 1558 if (cur < head || cur > kring->rtail) 1559 goto error; 1560 } else { /* here rtail < rhead */ 1561 /* we need head outside rtail .. rhead */ 1562 if (head > kring->rtail && head < kring->rhead) 1563 goto error; 1564 1565 /* two cases now: head <= rtail or head >= rhead */ 1566 if (head <= kring->rtail) { 1567 /* want head <= cur <= rtail */ 1568 if (cur < head || cur > kring->rtail) 1569 goto error; 1570 } else { /* head >= rhead */ 1571 /* cur must be outside rtail..head */ 1572 if (cur > kring->rtail && cur < head) 1573 goto error; 1574 } 1575 } 1576 if (ring->tail != kring->rtail) { 1577 RD(5, "tail overwritten was %d need %d", 1578 ring->tail, kring->rtail); 1579 ring->tail = kring->rtail; 1580 } 1581 kring->rhead = head; 1582 kring->rcur = cur; 1583 return head; 1584 1585 error: 1586 RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d", 1587 kring->name, 1588 kring->nr_hwcur, 1589 kring->rcur, kring->nr_hwtail, 1590 cur, ring->tail); 1591 return n; 1592 } 1593 1594 1595 /* 1596 * validate parameters on entry for *_rxsync() 1597 * Returns ring->head if ok, kring->nkr_num_slots on error. 1598 * 1599 * For a valid configuration, 1600 * hwcur <= head <= cur <= tail <= hwtail 1601 * 1602 * We only consider head and cur. 1603 * hwcur and hwtail are reliable. 1604 * 1605 */ 1606 u_int 1607 nm_rxsync_prologue(struct netmap_kring *kring) 1608 { 1609 struct netmap_ring *ring = kring->ring; 1610 uint32_t const n = kring->nkr_num_slots; 1611 uint32_t head, cur; 1612 1613 ND("%s kc %d kt %d h %d c %d t %d", 1614 kring->name, 1615 kring->nr_hwcur, kring->nr_hwtail, 1616 ring->head, ring->cur, ring->tail); 1617 /* 1618 * Before storing the new values, we should check they do not 1619 * move backwards. 
However: 1620 * - head is not an issue because the previous value is hwcur; 1621 * - cur could in principle go back, however it does not matter 1622 * because we are processing a brand new rxsync() 1623 */ 1624 cur = kring->rcur = ring->cur; /* read only once */ 1625 head = kring->rhead = ring->head; /* read only once */ 1626 #if 1 /* kernel sanity checks */ 1627 if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) 1628 goto error; 1629 #endif /* kernel sanity checks */ 1630 /* user sanity checks */ 1631 if (kring->nr_hwtail >= kring->nr_hwcur) { 1632 /* want hwcur <= rhead <= hwtail */ 1633 if (head < kring->nr_hwcur || head > kring->nr_hwtail) 1634 goto error; 1635 /* and also rhead <= rcur <= hwtail */ 1636 if (cur < head || cur > kring->nr_hwtail) 1637 goto error; 1638 } else { 1639 /* we need rhead outside hwtail..hwcur */ 1640 if (head < kring->nr_hwcur && head > kring->nr_hwtail) 1641 goto error; 1642 /* two cases now: head <= hwtail or head >= hwcur */ 1643 if (head <= kring->nr_hwtail) { 1644 /* want head <= cur <= hwtail */ 1645 if (cur < head || cur > kring->nr_hwtail) 1646 goto error; 1647 } else { 1648 /* cur must be outside hwtail..head */ 1649 if (cur < head && cur > kring->nr_hwtail) 1650 goto error; 1651 } 1652 } 1653 if (ring->tail != kring->rtail) { 1654 RD(5, "%s tail overwritten was %d need %d", 1655 kring->name, 1656 ring->tail, kring->rtail); 1657 ring->tail = kring->rtail; 1658 } 1659 return head; 1660 1661 error: 1662 RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", 1663 kring->nr_hwcur, 1664 kring->rcur, kring->nr_hwtail, 1665 kring->rhead, kring->rcur, ring->tail); 1666 return n; 1667 } 1668 1669 1670 /* 1671 * Error routine called when txsync/rxsync detects an error. 1672 * Can't do much more than resetting head =cur = hwcur, tail = hwtail 1673 * Return 1 on reinit. 1674 * 1675 * This routine is only called by the upper half of the kernel. 1676 * It only reads hwcur (which is changed only by the upper half, too) 1677 * and hwtail (which may be changed by the lower half, but only on 1678 * a tx ring and only to increase it, so any error will be recovered 1679 * on the next call). For the above, we don't strictly need to call 1680 * it under lock. 
1681 */ 1682 int 1683 netmap_ring_reinit(struct netmap_kring *kring) 1684 { 1685 struct netmap_ring *ring = kring->ring; 1686 u_int i, lim = kring->nkr_num_slots - 1; 1687 int errors = 0; 1688 1689 // XXX KASSERT nm_kr_tryget 1690 RD(10, "called for %s", kring->name); 1691 // XXX probably wrong to trust userspace 1692 kring->rhead = ring->head; 1693 kring->rcur = ring->cur; 1694 kring->rtail = ring->tail; 1695 1696 if (ring->cur > lim) 1697 errors++; 1698 if (ring->head > lim) 1699 errors++; 1700 if (ring->tail > lim) 1701 errors++; 1702 for (i = 0; i <= lim; i++) { 1703 u_int idx = ring->slot[i].buf_idx; 1704 u_int len = ring->slot[i].len; 1705 if (idx < 2 || idx >= netmap_total_buffers) { 1706 RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); 1707 ring->slot[i].buf_idx = 0; 1708 ring->slot[i].len = 0; 1709 } else if (len > NETMAP_BUF_SIZE(kring->na)) { 1710 ring->slot[i].len = 0; 1711 RD(5, "bad len at slot %d idx %d len %d", i, idx, len); 1712 } 1713 } 1714 if (errors) { 1715 RD(10, "total %d errors", errors); 1716 RD(10, "%s reinit, cur %d -> %d tail %d -> %d", 1717 kring->name, 1718 ring->cur, kring->nr_hwcur, 1719 ring->tail, kring->nr_hwtail); 1720 ring->head = kring->rhead = kring->nr_hwcur; 1721 ring->cur = kring->rcur = kring->nr_hwcur; 1722 ring->tail = kring->rtail = kring->nr_hwtail; 1723 } 1724 return (errors ? 1 : 0); 1725 } 1726 1727 /* interpret the ringid and flags fields of an nmreq, by translating them 1728 * into a pair of intervals of ring indices: 1729 * 1730 * [priv->np_txqfirst, priv->np_txqlast) and 1731 * [priv->np_rxqfirst, priv->np_rxqlast) 1732 * 1733 */ 1734 int 1735 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) 1736 { 1737 struct netmap_adapter *na = priv->np_na; 1738 u_int j, i = ringid & NETMAP_RING_MASK; 1739 u_int reg = flags & NR_REG_MASK; 1740 1741 if (reg == NR_REG_DEFAULT) { 1742 /* convert from old ringid to flags */ 1743 if (ringid & NETMAP_SW_RING) { 1744 reg = NR_REG_SW; 1745 } else if (ringid & NETMAP_HW_RING) { 1746 reg = NR_REG_ONE_NIC; 1747 } else { 1748 reg = NR_REG_ALL_NIC; 1749 } 1750 D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); 1751 } 1752 switch (reg) { 1753 case NR_REG_ALL_NIC: 1754 case NR_REG_PIPE_MASTER: 1755 case NR_REG_PIPE_SLAVE: 1756 priv->np_txqfirst = 0; 1757 priv->np_txqlast = na->num_tx_rings; 1758 priv->np_rxqfirst = 0; 1759 priv->np_rxqlast = na->num_rx_rings; 1760 ND("%s %d %d", "ALL/PIPE", 1761 priv->np_rxqfirst, priv->np_rxqlast); 1762 break; 1763 case NR_REG_SW: 1764 case NR_REG_NIC_SW: 1765 if (!(na->na_flags & NAF_HOST_RINGS)) { 1766 D("host rings not supported"); 1767 return EINVAL; 1768 } 1769 priv->np_txqfirst = (reg == NR_REG_SW ? 1770 na->num_tx_rings : 0); 1771 priv->np_txqlast = na->num_tx_rings + 1; 1772 priv->np_rxqfirst = (reg == NR_REG_SW ? 1773 na->num_rx_rings : 0); 1774 priv->np_rxqlast = na->num_rx_rings + 1; 1775 ND("%s %d %d", reg == NR_REG_SW ? 
"SW" : "NIC+SW", 1776 priv->np_rxqfirst, priv->np_rxqlast); 1777 break; 1778 case NR_REG_ONE_NIC: 1779 if (i >= na->num_tx_rings && i >= na->num_rx_rings) { 1780 D("invalid ring id %d", i); 1781 return EINVAL; 1782 } 1783 /* if not enough rings, use the first one */ 1784 j = i; 1785 if (j >= na->num_tx_rings) 1786 j = 0; 1787 priv->np_txqfirst = j; 1788 priv->np_txqlast = j + 1; 1789 j = i; 1790 if (j >= na->num_rx_rings) 1791 j = 0; 1792 priv->np_rxqfirst = j; 1793 priv->np_rxqlast = j + 1; 1794 break; 1795 default: 1796 D("invalid regif type %d", reg); 1797 return EINVAL; 1798 } 1799 priv->np_flags = (flags & ~NR_REG_MASK) | reg; 1800 1801 if (netmap_verbose) { 1802 D("%s: tx [%d,%d) rx [%d,%d) id %d", 1803 na->name, 1804 priv->np_txqfirst, 1805 priv->np_txqlast, 1806 priv->np_rxqfirst, 1807 priv->np_rxqlast, 1808 i); 1809 } 1810 return 0; 1811 } 1812 1813 1814 /* 1815 * Set the ring ID. For devices with a single queue, a request 1816 * for all rings is the same as a single ring. 1817 */ 1818 static int 1819 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) 1820 { 1821 struct netmap_adapter *na = priv->np_na; 1822 int error; 1823 1824 error = netmap_interp_ringid(priv, ringid, flags); 1825 if (error) { 1826 return error; 1827 } 1828 1829 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 1830 1831 /* optimization: count the users registered for more than 1832 * one ring, which are the ones sleeping on the global queue. 1833 * The default netmap_notify() callback will then 1834 * avoid signaling the global queue if nobody is using it 1835 */ 1836 if (nm_tx_si_user(priv)) 1837 na->tx_si_users++; 1838 if (nm_rx_si_user(priv)) 1839 na->rx_si_users++; 1840 return 0; 1841 } 1842 1843 /* 1844 * possibly move the interface to netmap-mode. 1845 * If success it returns a pointer to netmap_if, otherwise NULL. 1846 * This must be called with NMG_LOCK held. 1847 * 1848 * The following na callbacks are called in the process: 1849 * 1850 * na->nm_config() [by netmap_update_config] 1851 * (get current number and size of rings) 1852 * 1853 * We have a generic one for linux (netmap_linux_config). 1854 * The bwrap has to override this, since it has to forward 1855 * the request to the wrapped adapter (netmap_bwrap_config). 1856 * 1857 * XXX netmap_if_new calls this again (2014-03-15) 1858 * 1859 * na->nm_krings_create() [by netmap_if_new] 1860 * (create and init the krings array) 1861 * 1862 * One of the following: 1863 * 1864 * * netmap_hw_krings_create, (hw ports) 1865 * creates the standard layout for the krings 1866 * and adds the mbq (used for the host rings). 
1867 * 1868 * * netmap_vp_krings_create (VALE ports) 1869 * add leases and scratchpads 1870 * 1871 * * netmap_pipe_krings_create (pipes) 1872 * create the krings and rings of both ends and 1873 * cross-link them 1874 * 1875 * * netmap_monitor_krings_create (monitors) 1876 * avoid allocating the mbq 1877 * 1878 * * netmap_bwrap_krings_create (bwraps) 1879 * create both the brap krings array, 1880 * the krings array of the wrapped adapter, and 1881 * (if needed) the fake array for the host adapter 1882 * 1883 * na->nm_register(, 1) 1884 * (put the adapter in netmap mode) 1885 * 1886 * This may be one of the following: 1887 * (XXX these should be either all *_register or all *_reg 2014-03-15) 1888 * 1889 * * netmap_hw_register (hw ports) 1890 * checks that the ifp is still there, then calls 1891 * the hardware specific callback; 1892 * 1893 * * netmap_vp_reg (VALE ports) 1894 * If the port is connected to a bridge, 1895 * set the NAF_NETMAP_ON flag under the 1896 * bridge write lock. 1897 * 1898 * * netmap_pipe_reg (pipes) 1899 * inform the other pipe end that it is no 1900 * longer responsibile for the lifetime of this 1901 * pipe end 1902 * 1903 * * netmap_monitor_reg (monitors) 1904 * intercept the sync callbacks of the monitored 1905 * rings 1906 * 1907 * * netmap_bwrap_register (bwraps) 1908 * cross-link the bwrap and hwna rings, 1909 * forward the request to the hwna, override 1910 * the hwna notify callback (to get the frames 1911 * coming from outside go through the bridge). 1912 * 1913 * XXX maybe netmap_if_new() should be merged with this (2014-03-15). 1914 * 1915 */ 1916 struct netmap_if * 1917 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, 1918 uint16_t ringid, uint32_t flags, int *err) 1919 { 1920 struct netmap_if *nifp = NULL; 1921 int error, need_mem = 0; 1922 1923 NMG_LOCK_ASSERT(); 1924 /* ring configuration may have changed, fetch from the card */ 1925 netmap_update_config(na); 1926 priv->np_na = na; /* store the reference */ 1927 error = netmap_set_ringid(priv, ringid, flags); 1928 if (error) 1929 goto out; 1930 /* ensure allocators are ready */ 1931 need_mem = !netmap_have_memory_locked(priv); 1932 if (need_mem) { 1933 error = netmap_get_memory_locked(priv); 1934 ND("get_memory returned %d", error); 1935 if (error) 1936 goto out; 1937 } 1938 /* Allocate a netmap_if and, if necessary, all the netmap_ring's */ 1939 nifp = netmap_if_new(na); 1940 if (nifp == NULL) { /* allocation failed */ 1941 error = ENOMEM; 1942 goto out; 1943 } 1944 na->active_fds++; 1945 if (!nm_netmap_on(na)) { 1946 /* Netmap not active, set the card in netmap mode 1947 * and make it use the shared buffers. 1948 */ 1949 /* cache the allocator info in the na */ 1950 na->na_lut = netmap_mem_get_lut(na->nm_mem); 1951 ND("%p->na_lut == %p", na, na->na_lut); 1952 na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem); 1953 na->na_lut_objsize = netmap_mem_get_bufsize(na->nm_mem); 1954 error = na->nm_register(na, 1); /* mode on */ 1955 if (error) { 1956 netmap_do_unregif(priv, nifp); 1957 nifp = NULL; 1958 } 1959 } 1960 out: 1961 *err = error; 1962 if (error) { 1963 /* we should drop the allocator, but only 1964 * if we were the ones who grabbed it 1965 */ 1966 if (need_mem) 1967 netmap_drop_memory_locked(priv); 1968 priv->np_na = NULL; 1969 } 1970 if (nifp != NULL) { 1971 /* 1972 * advertise that the interface is ready bt setting ni_nifp. 
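 * Readers pair the write below with an rmb(); a simplified sketch of
 * the consumer side (as in netmap_poll() and the sync ioctls):
 *
 *	if (priv->np_nifp == NULL)
 *		return ENXIO;		// POLLERR in the poll path
 *	rmb();				// paired with the wmb() below
 *	na = priv->np_na;
 *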
1973 * The barrier is needed because readers (poll and *SYNC) 1974 * check for priv->np_nifp != NULL without locking 1975 */ 1976 wmb(); /* make sure previous writes are visible to all CPUs */ 1977 priv->np_nifp = nifp; 1978 } 1979 return nifp; 1980 } 1981 1982 1983 1984 /* 1985 * ioctl(2) support for the "netmap" device. 1986 * 1987 * Following a list of accepted commands: 1988 * - NIOCGINFO 1989 * - SIOCGIFADDR just for convenience 1990 * - NIOCREGIF 1991 * - NIOCTXSYNC 1992 * - NIOCRXSYNC 1993 * 1994 * Return 0 on success, errno otherwise. 1995 */ 1996 int 1997 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 1998 int fflag, struct thread *td) 1999 { 2000 struct netmap_priv_d *priv = NULL; 2001 struct nmreq *nmr = (struct nmreq *) data; 2002 struct netmap_adapter *na = NULL; 2003 int error; 2004 u_int i, qfirst, qlast; 2005 struct netmap_if *nifp; 2006 struct netmap_kring *krings; 2007 2008 (void)dev; /* UNUSED */ 2009 (void)fflag; /* UNUSED */ 2010 2011 if (cmd == NIOCGINFO || cmd == NIOCREGIF) { 2012 /* truncate name */ 2013 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; 2014 if (nmr->nr_version != NETMAP_API) { 2015 D("API mismatch for %s got %d need %d", 2016 nmr->nr_name, 2017 nmr->nr_version, NETMAP_API); 2018 nmr->nr_version = NETMAP_API; 2019 } 2020 if (nmr->nr_version < NETMAP_MIN_API || 2021 nmr->nr_version > NETMAP_MAX_API) { 2022 return EINVAL; 2023 } 2024 } 2025 CURVNET_SET(TD_TO_VNET(td)); 2026 2027 error = devfs_get_cdevpriv((void **)&priv); 2028 if (error) { 2029 CURVNET_RESTORE(); 2030 /* XXX ENOENT should be impossible, since the priv 2031 * is now created in the open */ 2032 return (error == ENOENT ? ENXIO : error); 2033 } 2034 2035 switch (cmd) { 2036 case NIOCGINFO: /* return capabilities etc */ 2037 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 2038 error = netmap_bdg_ctl(nmr, NULL); 2039 break; 2040 } 2041 2042 NMG_LOCK(); 2043 do { 2044 /* memsize is always valid */ 2045 struct netmap_mem_d *nmd = &nm_mem; 2046 u_int memflags; 2047 2048 if (nmr->nr_name[0] != '\0') { 2049 /* get a refcount */ 2050 error = netmap_get_na(nmr, &na, 1 /* create */); 2051 if (error) 2052 break; 2053 nmd = na->nm_mem; /* get memory allocator */ 2054 } 2055 2056 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, 2057 &nmr->nr_arg2); 2058 if (error) 2059 break; 2060 if (na == NULL) /* only memory info */ 2061 break; 2062 nmr->nr_offset = 0; 2063 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 2064 netmap_update_config(na); 2065 nmr->nr_rx_rings = na->num_rx_rings; 2066 nmr->nr_tx_rings = na->num_tx_rings; 2067 nmr->nr_rx_slots = na->num_rx_desc; 2068 nmr->nr_tx_slots = na->num_tx_desc; 2069 netmap_adapter_put(na); 2070 } while (0); 2071 NMG_UNLOCK(); 2072 break; 2073 2074 case NIOCREGIF: 2075 /* possibly attach/detach NIC and VALE switch */ 2076 i = nmr->nr_cmd; 2077 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH 2078 || i == NETMAP_BDG_VNET_HDR 2079 || i == NETMAP_BDG_NEWIF 2080 || i == NETMAP_BDG_DELIF) { 2081 error = netmap_bdg_ctl(nmr, NULL); 2082 break; 2083 } else if (i != 0) { 2084 D("nr_cmd must be 0 not %d", i); 2085 error = EINVAL; 2086 break; 2087 } 2088 2089 /* protect access to priv from concurrent NIOCREGIF */ 2090 NMG_LOCK(); 2091 do { 2092 u_int memflags; 2093 2094 if (priv->np_na != NULL) { /* thread already registered */ 2095 error = EBUSY; 2096 break; 2097 } 2098 /* find the interface and a reference */ 2099 error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ 2100 if (error) 2101 break; 2102 if (NETMAP_OWNED_BY_KERN(na)) { 2103 
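			/* the adapter is currently owned by the kernel
			 * (e.g. it is attached to a VALE switch), so it
			 * cannot also be registered from userspace
			 */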
netmap_adapter_put(na); 2104 error = EBUSY; 2105 break; 2106 } 2107 nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error); 2108 if (!nifp) { /* reg. failed, release priv and ref */ 2109 netmap_adapter_put(na); 2110 priv->np_nifp = NULL; 2111 break; 2112 } 2113 priv->np_td = td; // XXX kqueue, debugging only 2114 2115 /* return the offset of the netmap_if object */ 2116 nmr->nr_rx_rings = na->num_rx_rings; 2117 nmr->nr_tx_rings = na->num_tx_rings; 2118 nmr->nr_rx_slots = na->num_rx_desc; 2119 nmr->nr_tx_slots = na->num_tx_desc; 2120 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, 2121 &nmr->nr_arg2); 2122 if (error) { 2123 netmap_adapter_put(na); 2124 break; 2125 } 2126 if (memflags & NETMAP_MEM_PRIVATE) { 2127 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 2128 } 2129 priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ? 2130 &na->tx_si : &na->tx_rings[priv->np_txqfirst].si; 2131 priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ? 2132 &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si; 2133 2134 if (nmr->nr_arg3) { 2135 D("requested %d extra buffers", nmr->nr_arg3); 2136 nmr->nr_arg3 = netmap_extra_alloc(na, 2137 &nifp->ni_bufs_head, nmr->nr_arg3); 2138 D("got %d extra buffers", nmr->nr_arg3); 2139 } 2140 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 2141 } while (0); 2142 NMG_UNLOCK(); 2143 break; 2144 2145 case NIOCTXSYNC: 2146 case NIOCRXSYNC: 2147 nifp = priv->np_nifp; 2148 2149 if (nifp == NULL) { 2150 error = ENXIO; 2151 break; 2152 } 2153 rmb(); /* make sure following reads are not from cache */ 2154 2155 na = priv->np_na; /* we have a reference */ 2156 2157 if (na == NULL) { 2158 D("Internal error: nifp != NULL && na == NULL"); 2159 error = ENXIO; 2160 break; 2161 } 2162 2163 if (!nm_netmap_on(na)) { 2164 error = ENXIO; 2165 break; 2166 } 2167 2168 if (cmd == NIOCTXSYNC) { 2169 krings = na->tx_rings; 2170 qfirst = priv->np_txqfirst; 2171 qlast = priv->np_txqlast; 2172 } else { 2173 krings = na->rx_rings; 2174 qfirst = priv->np_rxqfirst; 2175 qlast = priv->np_rxqlast; 2176 } 2177 2178 for (i = qfirst; i < qlast; i++) { 2179 struct netmap_kring *kring = krings + i; 2180 if (nm_kr_tryget(kring)) { 2181 error = EBUSY; 2182 goto out; 2183 } 2184 if (cmd == NIOCTXSYNC) { 2185 if (netmap_verbose & NM_VERB_TXSYNC) 2186 D("pre txsync ring %d cur %d hwcur %d", 2187 i, kring->ring->cur, 2188 kring->nr_hwcur); 2189 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 2190 netmap_ring_reinit(kring); 2191 } else { 2192 kring->nm_sync(kring, NAF_FORCE_RECLAIM); 2193 } 2194 if (netmap_verbose & NM_VERB_TXSYNC) 2195 D("post txsync ring %d cur %d hwcur %d", 2196 i, kring->ring->cur, 2197 kring->nr_hwcur); 2198 } else { 2199 kring->nm_sync(kring, NAF_FORCE_READ); 2200 microtime(&na->rx_rings[i].ring->ts); 2201 } 2202 nm_kr_put(kring); 2203 } 2204 2205 break; 2206 2207 case NIOCCONFIG: 2208 error = netmap_bdg_config(nmr); 2209 break; 2210 #ifdef __FreeBSD__ 2211 case FIONBIO: 2212 case FIOASYNC: 2213 ND("FIONBIO/FIOASYNC are no-ops"); 2214 break; 2215 2216 case BIOCIMMEDIATE: 2217 case BIOCGHDRCMPLT: 2218 case BIOCSHDRCMPLT: 2219 case BIOCSSEESENT: 2220 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 2221 break; 2222 2223 default: /* allow device-specific ioctls */ 2224 { 2225 struct socket so; 2226 struct ifnet *ifp; 2227 2228 bzero(&so, sizeof(so)); 2229 NMG_LOCK(); 2230 error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ 2231 if (error) { 2232 netmap_adapter_put(na); 2233 
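				/* na may still be NULL here if
				 * netmap_get_na() failed; the put above
				 * accepts that and simply returns
				 */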
NMG_UNLOCK(); 2234 break; 2235 } 2236 ifp = na->ifp; 2237 so.so_vnet = ifp->if_vnet; 2238 // so->so_proto not null. 2239 error = ifioctl(&so, cmd, data, td); 2240 netmap_adapter_put(na); 2241 NMG_UNLOCK(); 2242 break; 2243 } 2244 2245 #else /* linux */ 2246 default: 2247 error = EOPNOTSUPP; 2248 #endif /* linux */ 2249 } 2250 out: 2251 2252 CURVNET_RESTORE(); 2253 return (error); 2254 } 2255 2256 2257 /* 2258 * select(2) and poll(2) handlers for the "netmap" device. 2259 * 2260 * Can be called for one or more queues. 2261 * Return true the event mask corresponding to ready events. 2262 * If there are no ready events, do a selrecord on either individual 2263 * selinfo or on the global one. 2264 * Device-dependent parts (locking and sync of tx/rx rings) 2265 * are done through callbacks. 2266 * 2267 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 2268 * The first one is remapped to pwait as selrecord() uses the name as an 2269 * hidden argument. 2270 */ 2271 int 2272 netmap_poll(struct cdev *dev, int events, struct thread *td) 2273 { 2274 struct netmap_priv_d *priv = NULL; 2275 struct netmap_adapter *na; 2276 struct netmap_kring *kring; 2277 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 2278 struct mbq q; /* packets from hw queues to host stack */ 2279 void *pwait = dev; /* linux compatibility */ 2280 int is_kevent = 0; 2281 2282 /* 2283 * In order to avoid nested locks, we need to "double check" 2284 * txsync and rxsync if we decide to do a selrecord(). 2285 * retry_tx (and retry_rx, later) prevent looping forever. 2286 */ 2287 int retry_tx = 1, retry_rx = 1; 2288 2289 (void)pwait; 2290 mbq_init(&q); 2291 2292 /* 2293 * XXX kevent has curthread->tp_fop == NULL, 2294 * so devfs_get_cdevpriv() fails. We circumvent this by passing 2295 * priv as the first argument, which is also useful to avoid 2296 * the selrecord() which are not necessary in that case. 2297 */ 2298 if (devfs_get_cdevpriv((void **)&priv) != 0) { 2299 is_kevent = 1; 2300 if (netmap_verbose) 2301 D("called from kevent"); 2302 priv = (struct netmap_priv_d *)dev; 2303 } 2304 if (priv == NULL) 2305 return POLLERR; 2306 2307 if (priv->np_nifp == NULL) { 2308 D("No if registered"); 2309 return POLLERR; 2310 } 2311 rmb(); /* make sure following reads are not from cache */ 2312 2313 na = priv->np_na; 2314 2315 if (!nm_netmap_on(na)) 2316 return POLLERR; 2317 2318 if (netmap_verbose & 0x8000) 2319 D("device %s events 0x%x", na->name, events); 2320 want_tx = events & (POLLOUT | POLLWRNORM); 2321 want_rx = events & (POLLIN | POLLRDNORM); 2322 2323 2324 /* 2325 * check_all_{tx|rx} are set if the card has more than one queue AND 2326 * the file descriptor is bound to all of them. If so, we sleep on 2327 * the "global" selinfo, otherwise we sleep on individual selinfo 2328 * (FreeBSD only allows two selinfo's per file descriptor). 2329 * The interrupt routine in the driver wake one or the other 2330 * (or both) depending on which clients are active. 2331 * 2332 * rxsync() is only called if we run out of buffers on a POLLIN. 2333 * txsync() is called if we run out of buffers on POLLOUT, or 2334 * there are pending packets to send. The latter can be disabled 2335 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 2336 */ 2337 check_all_tx = nm_tx_si_user(priv); 2338 check_all_rx = nm_rx_si_user(priv); 2339 2340 /* 2341 * We start with a lock free round which is cheap if we have 2342 * slots available. If this fails, then lock and call the sync 2343 * routines. 
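 *
 * For reference, a minimal (hypothetical) userspace receiver served by
 * this handler could look as follows, with fd bound via NIOCREGIF and
 * rxring obtained with NETMAP_RXRING(nifp, ring_index):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	for (;;) {
 *		poll(&pfd, 1, -1);
 *		while (!nm_ring_empty(rxring)) {
 *			struct netmap_slot *slot = &rxring->slot[rxring->head];
 *			// consume NETMAP_BUF(rxring, slot->buf_idx), slot->len bytes
 *			rxring->head = rxring->cur =
 *			    nm_ring_next(rxring, rxring->head);
 *		}
 *	}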
2344 */ 2345 for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) { 2346 kring = &na->rx_rings[i]; 2347 /* XXX compare ring->cur and kring->tail */ 2348 if (!nm_ring_empty(kring->ring)) { 2349 revents |= want_rx; 2350 want_rx = 0; /* also breaks the loop */ 2351 } 2352 } 2353 for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) { 2354 kring = &na->tx_rings[i]; 2355 /* XXX compare ring->cur and kring->tail */ 2356 if (!nm_ring_empty(kring->ring)) { 2357 revents |= want_tx; 2358 want_tx = 0; /* also breaks the loop */ 2359 } 2360 } 2361 2362 /* 2363 * If we want to push packets out (priv->np_txpoll) or 2364 * want_tx is still set, we must issue txsync calls 2365 * (on all rings, to avoid that the tx rings stall). 2366 * XXX should also check cur != hwcur on the tx rings. 2367 * Fortunately, normal tx mode has np_txpoll set. 2368 */ 2369 if (priv->np_txpoll || want_tx) { 2370 /* 2371 * The first round checks if anyone is ready, if not 2372 * do a selrecord and another round to handle races. 2373 * want_tx goes to 0 if any space is found, and is 2374 * used to skip rings with no pending transmissions. 2375 */ 2376 flush_tx: 2377 for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { 2378 int found = 0; 2379 2380 kring = &na->tx_rings[i]; 2381 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 2382 continue; 2383 /* only one thread does txsync */ 2384 if (nm_kr_tryget(kring)) { 2385 /* either busy or stopped 2386 * XXX if the ring is stopped, sleeping would 2387 * be better. In current code, however, we only 2388 * stop the rings for brief intervals (2014-03-14) 2389 */ 2390 if (netmap_verbose) 2391 RD(2, "%p lost race on txring %d, ok", 2392 priv, i); 2393 continue; 2394 } 2395 if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { 2396 netmap_ring_reinit(kring); 2397 revents |= POLLERR; 2398 } else { 2399 if (kring->nm_sync(kring, 0)) 2400 revents |= POLLERR; 2401 } 2402 2403 /* 2404 * If we found new slots, notify potential 2405 * listeners on the same ring. 2406 * Since we just did a txsync, look at the copies 2407 * of cur,tail in the kring. 2408 */ 2409 found = kring->rcur != kring->rtail; 2410 nm_kr_put(kring); 2411 if (found) { /* notify other listeners */ 2412 revents |= want_tx; 2413 want_tx = 0; 2414 na->nm_notify(na, i, NR_TX, 0); 2415 } 2416 } 2417 if (want_tx && retry_tx && !is_kevent) { 2418 selrecord(td, check_all_tx ? 2419 &na->tx_si : &na->tx_rings[priv->np_txqfirst].si); 2420 retry_tx = 0; 2421 goto flush_tx; 2422 } 2423 } 2424 2425 /* 2426 * If want_rx is still set scan receive rings. 2427 * Do it on all rings because otherwise we starve. 2428 */ 2429 if (want_rx) { 2430 int send_down = 0; /* transparent mode */ 2431 /* two rounds here for race avoidance */ 2432 do_retry_rx: 2433 for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { 2434 int found = 0; 2435 2436 kring = &na->rx_rings[i]; 2437 2438 if (nm_kr_tryget(kring)) { 2439 if (netmap_verbose) 2440 RD(2, "%p lost race on rxring %d, ok", 2441 priv, i); 2442 continue; 2443 } 2444 2445 /* 2446 * transparent mode support: collect packets 2447 * from the rxring(s). 
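 * A user opts in per ring and per slot, roughly:
 *	ring->flags |= NR_FORWARD;
 *	slot->flags |= NS_FORWARD;
 * on the rx slots it releases (i.e. before moving head past them), so
 * the marked buffers are handed to the host stack at the next rxsync.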
2448 * XXX NR_FORWARD should only be read on 2449 * physical or NIC ports 2450 */ 2451 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 2452 ND(10, "forwarding some buffers up %d to %d", 2453 kring->nr_hwcur, kring->ring->cur); 2454 netmap_grab_packets(kring, &q, netmap_fwd); 2455 } 2456 2457 if (kring->nm_sync(kring, 0)) 2458 revents |= POLLERR; 2459 if (netmap_no_timestamp == 0 || 2460 kring->ring->flags & NR_TIMESTAMP) { 2461 microtime(&kring->ring->ts); 2462 } 2463 /* after an rxsync we can use kring->rcur, rtail */ 2464 found = kring->rcur != kring->rtail; 2465 nm_kr_put(kring); 2466 if (found) { 2467 revents |= want_rx; 2468 retry_rx = 0; 2469 na->nm_notify(na, i, NR_RX, 0); 2470 } 2471 } 2472 2473 /* transparent mode XXX only during first pass ? */ 2474 if (na->na_flags & NAF_HOST_RINGS) { 2475 kring = &na->rx_rings[na->num_rx_rings]; 2476 if (check_all_rx 2477 && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { 2478 /* XXX fix to use kring fields */ 2479 if (nm_ring_empty(kring->ring)) 2480 send_down = netmap_rxsync_from_host(na, td, dev); 2481 if (!nm_ring_empty(kring->ring)) 2482 revents |= want_rx; 2483 } 2484 } 2485 2486 if (retry_rx && !is_kevent) 2487 selrecord(td, check_all_rx ? 2488 &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si); 2489 if (send_down > 0 || retry_rx) { 2490 retry_rx = 0; 2491 if (send_down) 2492 goto flush_tx; /* and retry_rx */ 2493 else 2494 goto do_retry_rx; 2495 } 2496 } 2497 2498 /* 2499 * Transparent mode: marked bufs on rx rings between 2500 * kring->nr_hwcur and ring->head 2501 * are passed to the other endpoint. 2502 * 2503 * In this mode we also scan the sw rxring, which in 2504 * turn passes packets up. 2505 * 2506 * XXX Transparent mode at the moment requires to bind all 2507 * rings to a single file descriptor. 2508 */ 2509 2510 if (q.head && na->ifp != NULL) 2511 netmap_send_up(na->ifp, &q); 2512 2513 return (revents); 2514 } 2515 2516 2517 /*-------------------- driver support routines -------------------*/ 2518 2519 static int netmap_hw_krings_create(struct netmap_adapter *); 2520 2521 /* default notify callback */ 2522 static int 2523 netmap_notify(struct netmap_adapter *na, u_int n_ring, 2524 enum txrx tx, int flags) 2525 { 2526 struct netmap_kring *kring; 2527 2528 if (tx == NR_TX) { 2529 kring = na->tx_rings + n_ring; 2530 OS_selwakeup(&kring->si, PI_NET); 2531 /* optimization: avoid a wake up on the global 2532 * queue if nobody has registered for more 2533 * than one ring 2534 */ 2535 if (na->tx_si_users > 0) 2536 OS_selwakeup(&na->tx_si, PI_NET); 2537 } else { 2538 kring = na->rx_rings + n_ring; 2539 OS_selwakeup(&kring->si, PI_NET); 2540 /* optimization: same as above */ 2541 if (na->rx_si_users > 0) 2542 OS_selwakeup(&na->rx_si, PI_NET); 2543 } 2544 return 0; 2545 } 2546 2547 2548 /* called by all routines that create netmap_adapters. 2549 * Attach na to the ifp (if any) and provide defaults 2550 * for optional callbacks. Defaults assume that we 2551 * are creating an hardware netmap_adapter. 2552 */ 2553 int 2554 netmap_attach_common(struct netmap_adapter *na) 2555 { 2556 struct ifnet *ifp = na->ifp; 2557 2558 if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { 2559 D("%s: invalid rings tx %d rx %d", 2560 na->name, na->num_tx_rings, na->num_rx_rings); 2561 return EINVAL; 2562 } 2563 /* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports, 2564 * pipes, monitors). For bwrap we actually have a non-null ifp for 2565 * use by the external modules, but that is set after this 2566 * function has been called. 
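 *
 * Hardware drivers reach this function through netmap_attach(); a
 * hypothetical driver-side sketch (the adapter fields and callback
 * names are made up):
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = adapter->ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	netmap_attach(&na);
 *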
2567 * XXX this is ugly, maybe split this function in two (2014-03-14) 2568 */ 2569 if (ifp != NULL) { 2570 WNA(ifp) = na; 2571 2572 /* the following is only needed for na that use the host port. 2573 * XXX do we have something similar for linux ? 2574 */ 2575 #ifdef __FreeBSD__ 2576 na->if_input = ifp->if_input; /* for netmap_send_up */ 2577 #endif /* __FreeBSD__ */ 2578 2579 NETMAP_SET_CAPABLE(ifp); 2580 } 2581 if (na->nm_krings_create == NULL) { 2582 /* we assume that we have been called by a driver, 2583 * since other port types all provide their own 2584 * nm_krings_create 2585 */ 2586 na->nm_krings_create = netmap_hw_krings_create; 2587 na->nm_krings_delete = netmap_hw_krings_delete; 2588 } 2589 if (na->nm_notify == NULL) 2590 na->nm_notify = netmap_notify; 2591 na->active_fds = 0; 2592 2593 if (na->nm_mem == NULL) 2594 /* use the global allocator */ 2595 na->nm_mem = &nm_mem; 2596 if (na->nm_bdg_attach == NULL) 2597 /* no special nm_bdg_attach callback. On VALE 2598 * attach, we need to interpose a bwrap 2599 */ 2600 na->nm_bdg_attach = netmap_bwrap_attach; 2601 return 0; 2602 } 2603 2604 2605 /* standard cleanup, called by all destructors */ 2606 void 2607 netmap_detach_common(struct netmap_adapter *na) 2608 { 2609 if (na->ifp != NULL) 2610 WNA(na->ifp) = NULL; /* XXX do we need this? */ 2611 2612 if (na->tx_rings) { /* XXX should not happen */ 2613 D("freeing leftover tx_rings"); 2614 na->nm_krings_delete(na); 2615 } 2616 netmap_pipe_dealloc(na); 2617 if (na->na_flags & NAF_MEM_OWNER) 2618 netmap_mem_private_delete(na->nm_mem); 2619 bzero(na, sizeof(*na)); 2620 free(na, M_DEVBUF); 2621 } 2622 2623 /* Wrapper for the register callback provided hardware drivers. 2624 * na->ifp == NULL means the the driver module has been 2625 * unloaded, so we cannot call into it. 2626 * Note that module unloading, in our patched linux drivers, 2627 * happens under NMG_LOCK and after having stopped all the 2628 * nic rings (see netmap_detach). This provides sufficient 2629 * protection for the other driver-provied callbacks 2630 * (i.e., nm_config and nm_*xsync), that therefore don't need 2631 * to wrapped. 2632 */ 2633 static int 2634 netmap_hw_register(struct netmap_adapter *na, int onoff) 2635 { 2636 struct netmap_hw_adapter *hwna = 2637 (struct netmap_hw_adapter*)na; 2638 2639 if (na->ifp == NULL) 2640 return onoff ? ENXIO : 0; 2641 2642 return hwna->nm_hw_register(na, onoff); 2643 } 2644 2645 2646 /* 2647 * Initialize a ``netmap_adapter`` object created by driver on attach. 2648 * We allocate a block of memory with room for a struct netmap_adapter 2649 * plus two sets of N+2 struct netmap_kring (where N is the number 2650 * of hardware rings): 2651 * krings 0..N-1 are for the hardware queues. 2652 * kring N is for the host stack queue 2653 * kring N+1 is only used for the selinfo for all queues. // XXX still true ? 2654 * Return 0 on success, ENOMEM otherwise. 2655 */ 2656 int 2657 netmap_attach(struct netmap_adapter *arg) 2658 { 2659 struct netmap_hw_adapter *hwna = NULL; 2660 // XXX when is arg == NULL ? 2661 struct ifnet *ifp = arg ? 
arg->ifp : NULL; 2662 2663 if (arg == NULL || ifp == NULL) 2664 goto fail; 2665 hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); 2666 if (hwna == NULL) 2667 goto fail; 2668 hwna->up = *arg; 2669 hwna->up.na_flags |= NAF_HOST_RINGS; 2670 strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name)); 2671 hwna->nm_hw_register = hwna->up.nm_register; 2672 hwna->up.nm_register = netmap_hw_register; 2673 if (netmap_attach_common(&hwna->up)) { 2674 free(hwna, M_DEVBUF); 2675 goto fail; 2676 } 2677 netmap_adapter_get(&hwna->up); 2678 2679 #ifdef linux 2680 if (ifp->netdev_ops) { 2681 /* prepare a clone of the netdev ops */ 2682 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) 2683 hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; 2684 #else 2685 hwna->nm_ndo = *ifp->netdev_ops; 2686 #endif 2687 } 2688 hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; 2689 if (ifp->ethtool_ops) { 2690 hwna->nm_eto = *ifp->ethtool_ops; 2691 } 2692 hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam; 2693 #ifdef ETHTOOL_SCHANNELS 2694 hwna->nm_eto.set_channels = linux_netmap_set_channels; 2695 #endif 2696 if (arg->nm_config == NULL) { 2697 hwna->up.nm_config = netmap_linux_config; 2698 } 2699 #endif /* linux */ 2700 2701 D("success for %s tx %d/%d rx %d/%d queues/slots", 2702 hwna->up.name, 2703 hwna->up.num_tx_rings, hwna->up.num_tx_desc, 2704 hwna->up.num_rx_rings, hwna->up.num_rx_desc 2705 ); 2706 return 0; 2707 2708 fail: 2709 D("fail, arg %p ifp %p na %p", arg, ifp, hwna); 2710 if (ifp) 2711 netmap_detach(ifp); 2712 return (hwna ? EINVAL : ENOMEM); 2713 } 2714 2715 2716 void 2717 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) 2718 { 2719 if (!na) { 2720 return; 2721 } 2722 2723 refcount_acquire(&na->na_refcount); 2724 } 2725 2726 2727 /* returns 1 iff the netmap_adapter is destroyed */ 2728 int 2729 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) 2730 { 2731 if (!na) 2732 return 1; 2733 2734 if (!refcount_release(&na->na_refcount)) 2735 return 0; 2736 2737 if (na->nm_dtor) 2738 na->nm_dtor(na); 2739 2740 netmap_detach_common(na); 2741 2742 return 1; 2743 } 2744 2745 /* nm_krings_create callback for all hardware native adapters */ 2746 int 2747 netmap_hw_krings_create(struct netmap_adapter *na) 2748 { 2749 int ret = netmap_krings_create(na, 0); 2750 if (ret == 0) { 2751 /* initialize the mbq for the sw rx ring */ 2752 mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); 2753 ND("initialized sw rx queue %d", na->num_rx_rings); 2754 } 2755 return ret; 2756 } 2757 2758 2759 2760 /* 2761 * Called on module unload by the netmap-enabled drivers 2762 */ 2763 void 2764 netmap_detach(struct ifnet *ifp) 2765 { 2766 struct netmap_adapter *na = NA(ifp); 2767 2768 if (!na) 2769 return; 2770 2771 NMG_LOCK(); 2772 netmap_disable_all_rings(ifp); 2773 if (!netmap_adapter_put(na)) { 2774 /* someone is still using the adapter, 2775 * tell them that the interface is gone 2776 */ 2777 na->ifp = NULL; 2778 // XXX also clear NAF_NATIVE_ON ? 2779 na->na_flags &= ~NAF_NETMAP_ON; 2780 /* give them a chance to notice */ 2781 netmap_enable_all_rings(ifp); 2782 } 2783 NMG_UNLOCK(); 2784 } 2785 2786 2787 /* 2788 * Intercept packets from the network stack and pass them 2789 * to netmap as incoming packets on the 'software' ring. 2790 * 2791 * We only store packets in a bounded mbq and then copy them 2792 * in the relevant rxsync routine. 2793 * 2794 * We rely on the OS to make sure that the ifp and na do not go 2795 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 
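 * The queued mbufs are later drained by netmap_rxsync_from_host(), on
 * behalf of a process that bound the host rings, e.g. (hypothetical
 * sketch, usual nr_name/nr_version setup omitted):
 *
 *	nmr.nr_flags = NR_REG_SW;	// host (sw) rings only
 *	ioctl(fd, NIOCREGIF, &nmr);
 *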
2796 * In nm_register() or whenever there is a reinitialization, 2797 * we make sure to make the mode change visible here. 2798 */ 2799 int 2800 netmap_transmit(struct ifnet *ifp, struct mbuf *m) 2801 { 2802 struct netmap_adapter *na = NA(ifp); 2803 struct netmap_kring *kring; 2804 u_int len = MBUF_LEN(m); 2805 u_int error = ENOBUFS; 2806 struct mbq *q; 2807 int space; 2808 2809 // XXX [Linux] we do not need this lock 2810 // if we follow the down/configure/up protocol -gl 2811 // mtx_lock(&na->core_lock); 2812 2813 if (!nm_netmap_on(na)) { 2814 D("%s not in netmap mode anymore", na->name); 2815 error = ENXIO; 2816 goto done; 2817 } 2818 2819 kring = &na->rx_rings[na->num_rx_rings]; 2820 q = &kring->rx_queue; 2821 2822 // XXX reconsider long packets if we handle fragments 2823 if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */ 2824 D("%s from_host, drop packet size %d > %d", na->name, 2825 len, NETMAP_BUF_SIZE(na)); 2826 goto done; 2827 } 2828 2829 /* protect against rxsync_from_host(), netmap_sw_to_nic() 2830 * and maybe other instances of netmap_transmit (the latter 2831 * not possible on Linux). 2832 * Also avoid overflowing the queue. 2833 */ 2834 mbq_lock(q); 2835 2836 space = kring->nr_hwtail - kring->nr_hwcur; 2837 if (space < 0) 2838 space += kring->nkr_num_slots; 2839 if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX 2840 RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", 2841 na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), 2842 len, m); 2843 } else { 2844 mbq_enqueue(q, m); 2845 ND(10, "%s %d bufs in queue len %d m %p", 2846 na->name, mbq_len(q), len, m); 2847 /* notify outside the lock */ 2848 m = NULL; 2849 error = 0; 2850 } 2851 mbq_unlock(q); 2852 2853 done: 2854 if (m) 2855 m_freem(m); 2856 /* unconditionally wake up listeners */ 2857 na->nm_notify(na, na->num_rx_rings, NR_RX, 0); 2858 /* this is normally netmap_notify(), but for nics 2859 * connected to a bridge it is netmap_bwrap_intr_notify(), 2860 * that possibly forwards the frames through the switch 2861 */ 2862 2863 return (error); 2864 } 2865 2866 2867 /* 2868 * netmap_reset() is called by the driver routines when reinitializing 2869 * a ring. The driver is in charge of locking to protect the kring. 2870 * If native netmap mode is not set just return NULL. 2871 */ 2872 struct netmap_slot * 2873 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, 2874 u_int new_cur) 2875 { 2876 struct netmap_kring *kring; 2877 int new_hwofs, lim; 2878 2879 if (!nm_native_on(na)) { 2880 ND("interface not in native netmap mode"); 2881 return NULL; /* nothing to reinitialize */ 2882 } 2883 2884 /* XXX note- in the new scheme, we are not guaranteed to be 2885 * under lock (e.g. when called on a device reset). 2886 * In this case, we should set a flag and do not trust too 2887 * much the values. In practice: TODO 2888 * - set a RESET flag somewhere in the kring 2889 * - do the processing in a conservative way 2890 * - let the *sync() fixup at the end. 2891 */ 2892 if (tx == NR_TX) { 2893 if (n >= na->num_tx_rings) 2894 return NULL; 2895 kring = na->tx_rings + n; 2896 // XXX check whether we should use hwcur or rcur 2897 new_hwofs = kring->nr_hwcur - new_cur; 2898 } else { 2899 if (n >= na->num_rx_rings) 2900 return NULL; 2901 kring = na->rx_rings + n; 2902 new_hwofs = kring->nr_hwtail - new_cur; 2903 } 2904 lim = kring->nkr_num_slots - 1; 2905 if (new_hwofs > lim) 2906 new_hwofs -= lim + 1; 2907 2908 /* Always set the new offset value and realign the ring. 
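 * (Made-up example: if the driver restarts its TX ring from NIC slot 0
 * while the kring has nr_hwcur == 5, new_hwofs is 5; storing it in
 * nkr_hwofs keeps the netmap and NIC slot numbering consistent.)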
*/ 2909 if (netmap_verbose) 2910 D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", 2911 na->name, 2912 tx == NR_TX ? "TX" : "RX", n, 2913 kring->nkr_hwofs, new_hwofs, 2914 kring->nr_hwtail, 2915 tx == NR_TX ? lim : kring->nr_hwtail); 2916 kring->nkr_hwofs = new_hwofs; 2917 if (tx == NR_TX) { 2918 kring->nr_hwtail = kring->nr_hwcur + lim; 2919 if (kring->nr_hwtail > lim) 2920 kring->nr_hwtail -= lim + 1; 2921 } 2922 2923 #if 0 // def linux 2924 /* XXX check that the mappings are correct */ 2925 /* need ring_nr, adapter->pdev, direction */ 2926 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 2927 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 2928 D("error mapping rx netmap buffer %d", i); 2929 // XXX fix error handling 2930 } 2931 2932 #endif /* linux */ 2933 /* 2934 * Wakeup on the individual and global selwait 2935 * We do the wakeup here, but the ring is not yet reconfigured. 2936 * However, we are under lock so there are no races. 2937 */ 2938 na->nm_notify(na, n, tx, 0); 2939 return kring->ring->slot; 2940 } 2941 2942 2943 /* 2944 * Dispatch rx/tx interrupts to the netmap rings. 2945 * 2946 * "work_done" is non-null on the RX path, NULL for the TX path. 2947 * We rely on the OS to make sure that there is only one active 2948 * instance per queue, and that there is appropriate locking. 2949 * 2950 * The 'notify' routine depends on what the ring is attached to. 2951 * - for a netmap file descriptor, do a selwakeup on the individual 2952 * waitqueue, plus one on the global one if needed 2953 * (see netmap_notify) 2954 * - for a nic connected to a switch, call the proper forwarding routine 2955 * (see netmap_bwrap_intr_notify) 2956 */ 2957 void 2958 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) 2959 { 2960 struct netmap_adapter *na = NA(ifp); 2961 struct netmap_kring *kring; 2962 2963 q &= NETMAP_RING_MASK; 2964 2965 if (netmap_verbose) { 2966 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); 2967 } 2968 2969 if (work_done) { /* RX path */ 2970 if (q >= na->num_rx_rings) 2971 return; // not a physical queue 2972 kring = na->rx_rings + q; 2973 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 2974 na->nm_notify(na, q, NR_RX, 0); 2975 *work_done = 1; /* do not fire napi again */ 2976 } else { /* TX path */ 2977 if (q >= na->num_tx_rings) 2978 return; // not a physical queue 2979 kring = na->tx_rings + q; 2980 na->nm_notify(na, q, NR_TX, 0); 2981 } 2982 } 2983 2984 2985 /* 2986 * Default functions to handle rx/tx interrupts from a physical device. 2987 * "work_done" is non-null on the RX path, NULL for the TX path. 2988 * 2989 * If the card is not in netmap mode, simply return 0, 2990 * so that the caller proceeds with regular processing. 2991 * Otherwise call netmap_common_irq() and return 1. 2992 * 2993 * If the card is connected to a netmap file descriptor, 2994 * do a selwakeup on the individual queue, plus one on the global one 2995 * if needed (multiqueue card _and_ there are multiqueue listeners), 2996 * and return 1. 2997 * 2998 * Finally, if called on rx from an interface connected to a switch, 2999 * calls the proper forwarding routine, and return 1. 3000 */ 3001 int 3002 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) 3003 { 3004 struct netmap_adapter *na = NA(ifp); 3005 3006 /* 3007 * XXX emulated netmap mode sets NAF_SKIP_INTR so 3008 * we still use the regular driver even though the previous 3009 * check fails. It is unclear whether we should use 3010 * nm_native_on() here. 
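 *
 * For reference, a driver's RX interrupt handler would typically use
 * this as follows (hypothetical sketch; the real glue lives in the
 * per-driver netmap support files):
 *
 *	if (netmap_rx_irq(adapter->ifp, ring_nr, &work_done))
 *		return;		// netmap consumed the event
 *	// otherwise fall through to the regular mbuf processing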
3011 */ 3012 if (!nm_netmap_on(na)) 3013 return 0; 3014 3015 if (na->na_flags & NAF_SKIP_INTR) { 3016 ND("use regular interrupt"); 3017 return 0; 3018 } 3019 3020 netmap_common_irq(ifp, q, work_done); 3021 return 1; 3022 } 3023 3024 3025 /* 3026 * Module loader and unloader 3027 * 3028 * netmap_init() creates the /dev/netmap device and initializes 3029 * all global variables. Returns 0 on success, errno on failure 3030 * (but there is no chance) 3031 * 3032 * netmap_fini() destroys everything. 3033 */ 3034 3035 static struct cdev *netmap_dev; /* /dev/netmap character device. */ 3036 extern struct cdevsw netmap_cdevsw; 3037 3038 3039 void 3040 netmap_fini(void) 3041 { 3042 // XXX destroy_bridges() ? 3043 if (netmap_dev) 3044 destroy_dev(netmap_dev); 3045 netmap_mem_fini(); 3046 NMG_LOCK_DESTROY(); 3047 printf("netmap: unloaded module.\n"); 3048 } 3049 3050 3051 int 3052 netmap_init(void) 3053 { 3054 int error; 3055 3056 NMG_LOCK_INIT(); 3057 3058 error = netmap_mem_init(); 3059 if (error != 0) 3060 goto fail; 3061 /* XXX could use make_dev_credv() to get error number */ 3062 netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, 3063 "netmap"); 3064 if (!netmap_dev) 3065 goto fail; 3066 3067 netmap_init_bridges(); 3068 #ifdef __FreeBSD__ 3069 nm_vi_init_index(); 3070 #endif 3071 printf("netmap: loaded module\n"); 3072 return (0); 3073 fail: 3074 netmap_fini(); 3075 return (EINVAL); /* may be incorrect */ 3076 } 3077
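

/*
 * For reference, the OS glue (netmap_freebsd.c, netmap_linux.c) calls
 * the two routines above from its module load/unload hooks, roughly:
 *
 *	case MOD_LOAD:
 *		error = netmap_init();
 *		break;
 *	case MOD_UNLOAD:
 *		netmap_fini();
 *		break;
 */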