/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $FreeBSD$
 *
 * The header contains the definitions of constants and function
 * prototypes used only in kernelspace.
 */

#ifndef _NET_NETMAP_KERN_H_
#define _NET_NETMAP_KERN_H_

#define WITH_VALE	// comment out to disable VALE support

#if defined(__FreeBSD__)

#define likely(x)	__builtin_expect((long)!!(x), 1L)
#define unlikely(x)	__builtin_expect((long)!!(x), 0L)

#define	NM_LOCK_T	struct mtx
#define	NMG_LOCK_T	struct mtx
#define NMG_LOCK_INIT()	mtx_init(&netmap_global_lock, \
				"netmap global lock", NULL, MTX_DEF)
#define NMG_LOCK_DESTROY()	mtx_destroy(&netmap_global_lock)
#define NMG_LOCK()	mtx_lock(&netmap_global_lock)
#define NMG_UNLOCK()	mtx_unlock(&netmap_global_lock)
#define NMG_LOCK_ASSERT()	mtx_assert(&netmap_global_lock, MA_OWNED)

#define	NM_SELINFO_T	struct selinfo
#define	MBUF_LEN(m)	((m)->m_pkthdr.len)
#define	MBUF_IFP(m)	((m)->m_pkthdr.rcvif)
#define	NM_SEND_UP(ifp, m)	((NA(ifp))->if_input)(ifp, m)

#define NM_ATOMIC_T	volatile int	// XXX ?
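/*
 * Typical use of the global lock (an illustrative sketch, not code
 * from this file): configuration paths bracket their critical
 * sections with NMG_LOCK()/NMG_UNLOCK(), e.g.
 *
 *	NMG_LOCK();
 *	error = netmap_get_na(nmr, &na, 1);	// declared later in this header
 *	...
 *	NMG_UNLOCK();
 */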
/* atomic operations */
#include <machine/atomic.h>
#define NM_ATOMIC_TEST_AND_SET(p)	(!atomic_cmpset_acq_int((p), 0, 1))
#define NM_ATOMIC_CLEAR(p)		atomic_store_rel_int((p), 0)


MALLOC_DECLARE(M_NETMAP);

// XXX linux struct, not used in FreeBSD
struct net_device_ops {
};
struct hrtimer {
};

#elif defined (linux)

#define	NM_LOCK_T	safe_spinlock_t	// see bsd_glue.h
#define	NM_SELINFO_T	wait_queue_head_t
#define	MBUF_LEN(m)	((m)->len)
#define	MBUF_IFP(m)	((m)->dev)
#define	NM_SEND_UP(ifp, m) \
			do { \
				m->priority = NM_MAGIC_PRIORITY; \
				netif_rx(m); \
			} while (0)

#define NM_ATOMIC_T	volatile long unsigned int

// XXX a mtx would suffice here too 20130404 gl
#define NMG_LOCK_T		struct semaphore
#define NMG_LOCK_INIT()		sema_init(&netmap_global_lock, 1)
#define NMG_LOCK_DESTROY()
#define NMG_LOCK()		down(&netmap_global_lock)
#define NMG_UNLOCK()		up(&netmap_global_lock)
#define NMG_LOCK_ASSERT()	// XXX to be completed

#ifndef DEV_NETMAP
#define DEV_NETMAP
#endif /* DEV_NETMAP */

/*
 * IFCAP_NETMAP goes into net_device's priv_flags (if_capenable).
 * This was 16 bits up to linux 2.6.36, so we need a 16 bit value on older
 * platforms and tolerate the clash with IFF_DYNAMIC and IFF_BRIDGE_PORT.
 * For the 32-bit value, 0x100000 has no clashes until at least 3.5.1
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
#define IFCAP_NETMAP	0x8000
#else
#define IFCAP_NETMAP	0x200000
#endif

#elif defined (__APPLE__)

#warning apple support is incomplete.
#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)
#define	NM_LOCK_T	IOLock *
#define	NM_SELINFO_T	struct selinfo
#define	MBUF_LEN(m)	((m)->m_pkthdr.len)
#define	NM_SEND_UP(ifp, m)	((ifp)->if_input)(ifp, m)

#else

#error unsupported platform

#endif /* end - platform-specific code */

#define ND(format, ...)
#define D(format, ...)						\
	do {							\
		struct timeval __xxts;				\
		microtime(&__xxts);				\
		printf("%03d.%06d [%4d] %-25s " format "\n",	\
		(int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec,	\
		__LINE__, __FUNCTION__, ##__VA_ARGS__);		\
	} while (0)

/* rate limited, lps indicates how many per second */
#define RD(lps, format, ...)					\
	do {							\
		static int t0, __cnt;				\
		if (t0 != time_second) {			\
			t0 = time_second;			\
			__cnt = 0;				\
		}						\
		if (__cnt++ < lps)				\
			D(format, ##__VA_ARGS__);		\
	} while (0)

struct netmap_adapter;
struct nm_bdg_fwd;
struct nm_bridge;
struct netmap_priv_d;

const char *nm_dump_buf(char *p, int len, int lim, char *dst);

#include "netmap_mbq.h"

extern NMG_LOCK_T	netmap_global_lock;

/*
 * private, kernel view of a ring. Keeps track of the status of
 * a ring across system calls.
 *
 *	nr_hwcur	index of the next buffer to refill.
 *			It corresponds to ring->head
 *			at the time the system call returns.
 *
 *	nr_hwtail	index of the first buffer owned by the kernel.
 *			On RX, hwcur->hwtail are receive buffers
 *			not yet released.
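 *			(Worked example, added for illustration: in an
 *			RX ring of 8 slots with hwcur == 2 and
 *			hwtail == 5, slots 2..4 hold received packets
 *			that userspace has not yet returned, and slot 5
 *			is the first one the kernel will fill next.)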
 *			hwcur is advanced following
 *			ring->head, hwtail is advanced on incoming packets,
 *			and a wakeup is generated when hwtail passes ring->cur
 *			    On TX, hwcur->rcur have been filled by the sender
 *			but not sent yet to the NIC; rcur->hwtail are available
 *			for new transmissions, and hwtail->hwcur-1 are pending
 *			transmissions not yet acknowledged.
 *
 * The indexes in the NIC and netmap rings are offset by nkr_hwofs slots.
 * This is so that, on a reset, buffers owned by userspace are not
 * modified by the kernel. In particular:
 * RX rings: the next empty buffer (hwtail + hwofs) coincides with
 *	the next empty buffer as known by the hardware (next_to_check or so).
 * TX rings: hwcur + hwofs coincides with next_to_send
 *
 * Clients cannot issue concurrent syscalls on a ring. The system
 * detects this and reports an error using two flags,
 * NKR_WBUSY and NKR_RBUSY.
 * For received packets, slot->flags is set to nkr_slot_flags
 * so we can provide a proper initial value (e.g. set NS_FORWARD
 * when operating in 'transparent' mode).
 *
 * The following fields are used to implement lock-free copy of packets
 * from input to output ports in the VALE switch:
 *
 *	nkr_hwlease	buffer after the last one being copied.
 *			A writer in nm_bdg_flush reserves N buffers
 *			from nr_hwlease, advances it, then does the
 *			copy outside the lock.
 *			In RX rings (used for VALE ports),
 *			nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1
 *			In TX rings (used for NIC or host stack ports)
 *			nkr_hwcur <= nkr_hwlease < nkr_hwtail
 *	nkr_leases	array of nkr_num_slots where writers can report
 *			completion of their block. NR_NOSLOT (~0) indicates
 *			that the writer has not finished yet
 *	nkr_lease_idx	index of next free slot in nr_leases, to be assigned
 *
 * The kring is manipulated by txsync/rxsync and the generic netmap
 * functions.
 *
 * Concurrent rxsync or txsync on the same ring are prevented by
 * nm_kr_lock(), which in turn uses nr_busy. This is all we need
 * for NIC rings, and for TX rings attached to the host stack.
 *
 * RX rings attached to the host stack use an mbq (rx_queue) on both
 * rxsync_from_host() and netmap_transmit(). The mbq is protected
 * by its internal lock.
 *
 * RX rings attached to the VALE switch are accessed by both sender
 * and receiver. They are protected through the q_lock on the RX ring.
 */
struct netmap_kring {
	struct netmap_ring *ring;

	uint32_t nr_hwcur;
	uint32_t nr_hwtail;

	/*
	 * Copies of values in user rings, so we do not need to look
	 * at the ring (which could be modified). These are set in the
	 * *sync_prologue()/finalize() routines.
	 */
	uint32_t	rhead;
	uint32_t	rcur;
	uint32_t	rtail;

	uint32_t	nr_kflags;	/* private driver flags */
#define NKR_PENDINTR	0x1		// Pending interrupt.
	uint32_t	nkr_num_slots;

	/*
	 * On a NIC reset, the NIC ring indexes may be reset but the
	 * indexes in the netmap rings remain the same. nkr_hwofs
	 * keeps track of the offset between the two.
	 */
	int32_t		nkr_hwofs;

	uint16_t	nkr_slot_flags;	/* initial value for flags */

	/* last_reclaim is an opaque marker to help reduce the frequency
	 * of operations such as reclaiming tx buffers. A possible use
	 * is to set it to ticks and do the reclaim only once per tick.
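	 *
	 * (Illustrative sketch, assuming the FreeBSD 'ticks' counter;
	 * not code from this file:
	 *
	 *	if (kring->last_reclaim != (uint64_t)ticks) {
	 *		kring->last_reclaim = ticks;
	 *		// reclaim completed tx buffers here
	 *	}
	 * )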
	 */
	uint64_t	last_reclaim;


	NM_SELINFO_T	si;		/* poll/select wait queue */
	NM_LOCK_T	q_lock;		/* protects kring and ring. */
	NM_ATOMIC_T	nr_busy;	/* prevent concurrent syscalls */

	struct netmap_adapter *na;

	/* The following fields are for VALE switch support */
	struct nm_bdg_fwd *nkr_ft;
	uint32_t	*nkr_leases;
#define NR_NOSLOT	((uint32_t)~0)	/* used in nkr_*lease* */
	uint32_t	nkr_hwlease;
	uint32_t	nkr_lease_idx;

	volatile int nkr_stopped;	// XXX what for ?

	/* Support for adapters without native netmap support.
	 * On tx rings we preallocate an array of tx buffers
	 * (same size as the netmap ring), on rx rings we
	 * store incoming packets in a queue.
	 * XXX who writes to the rx queue ?
	 */
	struct mbuf **tx_pool;
	// u_int nr_ntc;		/* Emulation of a next-to-clean RX ring pointer. */
	struct mbq rx_queue;		/* intercepted rx mbufs. */

	uint32_t	ring_id;	/* debugging */
	char name[64];			/* diagnostic */

} __attribute__((__aligned__(64)));


/* return the next index, with wraparound */
static inline uint32_t
nm_next(uint32_t i, uint32_t lim)
{
	return unlikely(i == lim) ? 0 : i + 1;
}


/* return the previous index, with wraparound */
static inline uint32_t
nm_prev(uint32_t i, uint32_t lim)
{
	return unlikely(i == 0) ? lim : i - 1;
}


/*
 *
 * Here is the layout for the Rx and Tx rings.

       RxRING                            TxRING

      +-----------------+            +-----------------+
      |                 |            |                 |
      |XXX free slot XXX|            |XXX free slot XXX|
      +-----------------+            +-----------------+
head->| owned by user   |<-hwcur     | not sent to nic |<-hwcur
      |                 |            | yet             |
      +-----------------+            |                 |
 cur->| available to    |            |                 |
      | user, not read  |            +-----------------+
      | yet             |       cur->| (being          |
      |                 |            |  prepared)      |
      |                 |            |                 |
      +-----------------+            +     ------      +
tail->|                 |<-hwtail    |                 |<-hwlease
      | (being          | ...        |                 | ...
      |  prepared)      | ...        |                 | ...
      +-----------------+ ...        |                 | ...
      |                 |<-hwlease   +-----------------+
      |                 |      tail->|                 |<-hwtail
      |                 |            |                 |
      |                 |            |                 |
      |                 |            |                 |
      +-----------------+            +-----------------+

 * The cur/tail (user view) and hwcur/hwtail (kernel view)
 * are used in the normal operation of the card.
 *
 * When a ring is the output of a switch port (Rx ring for
 * a VALE port, Tx ring for the host stack or NIC), slots
 * are reserved in blocks through 'hwlease' which points
 * to the next unused slot.
 * On an Rx ring, hwlease is always after hwtail,
 * and completions cause hwtail to advance.
 * On a Tx ring, hwlease is always between cur and hwtail,
 * and completions cause cur to advance.
 *
 * nm_kr_space() returns the maximum number of slots that
 * can be assigned.
 * nm_kr_lease() reserves the required number of buffers,
 *    advances nkr_hwlease and also returns an entry in
 *    a circular array where completions should be reported.
 */



enum txrx { NR_RX = 0, NR_TX = 1 };

/*
 * The "struct netmap_adapter" extends the "struct adapter"
 * (or equivalent) device descriptor.
 * It contains all base fields needed to support netmap operation.
 * There are in fact different types of netmap adapters
 * (native, generic, VALE switch...) so a netmap_adapter is
 * just the first field in the derived type.
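 * (For example, struct netmap_hw_adapter below embeds a
 * struct netmap_adapter as its first member, so a pointer to
 * one can be converted to a pointer to the other by a cast.)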
 */
struct netmap_adapter {
	/*
	 * On linux we do not have a good way to tell if an interface
	 * is netmap-capable. So we always use the following trick:
	 * NA(ifp) points here, and the first entry (which hopefully
	 * always exists and is at least 32 bits) contains a magic
	 * value which we can use to detect that the interface is good.
	 */
	uint32_t magic;
	uint32_t na_flags;	/* enabled, and other flags */
#define NAF_SKIP_INTR	1	/* use the regular interrupt handler.
				 * useful during initialization
				 */
#define NAF_SW_ONLY	2	/* forward packets only to sw adapter */
#define NAF_BDG_MAYSLEEP 4	/* the bridge is allowed to sleep when
				 * forwarding packets coming from this
				 * interface
				 */
#define NAF_MEM_OWNER	8	/* the adapter is responsible for the
				 * deallocation of the memory allocator
				 */
#define NAF_NATIVE_ON	16	/* the adapter is native and the attached
				 * interface is in netmap mode
				 */
#define	NAF_NETMAP_ON	32	/* netmap is active (either native or
				 * emulated). Where possible (e.g. FreeBSD)
				 * IFCAP_NETMAP also mirrors this flag.
				 */
	int active_fds;	/* number of user-space descriptors using this
			 interface, which is equal to the number of
			 struct netmap_if objs in the mapped region. */

	u_int num_rx_rings;	/* number of adapter receive rings */
	u_int num_tx_rings;	/* number of adapter transmit rings */

	u_int num_tx_desc;	/* number of descriptors in each queue */
	u_int num_rx_desc;

	/* tx_rings and rx_rings are private but allocated
	 * as a contiguous chunk of memory. Each array has
	 * N+1 entries, for the adapter queues and for the host queue.
	 */
	struct netmap_kring *tx_rings;	/* array of TX rings. */
	struct netmap_kring *rx_rings;	/* array of RX rings. */

	void *tailroom;			/* space below the rings array */
					/* (used for leases) */


	NM_SELINFO_T tx_si, rx_si;	/* global wait queues */

	/* copy of if_qflush and if_transmit pointers, to intercept
	 * packets from the network stack when netmap is active.
	 */
	int	(*if_transmit)(struct ifnet *, struct mbuf *);

	/* copy of if_input for netmap_send_up() */
	void	(*if_input)(struct ifnet *, struct mbuf *);

	/* references to the ifnet and device routines, used by
	 * the generic netmap functions.
	 */
	struct ifnet *ifp;	/* adapter is ifp->if_softc */

	/*---- callbacks for this netmap adapter -----*/
	/*
	 * nm_dtor() is the cleanup routine called when destroying
	 *	the adapter.
	 *
	 * nm_register() is called on NIOCREGIF and close() to enter
	 *	or exit netmap mode on the NIC
	 *
	 * nm_txsync() pushes packets to the underlying hw/switch
	 *
	 * nm_rxsync() collects packets from the underlying hw/switch
	 *
	 * nm_config() returns configuration information from the OS
	 *
	 * nm_krings_create() XXX
	 *
	 * nm_krings_delete() XXX
	 *
	 * nm_notify() is used to act after data have become available.
	 *	For hw devices this is typically a selwakeup(),
	 *	but for NIC/host ports attached to a switch (or vice-versa)
	 *	we also need to invoke the 'txsync' code downstream.
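 *
 * A minimal sketch of how a native driver fills in these hooks
 * before calling netmap_attach() (declared later in this header);
 * the foo_* names and the 'adapter' softc are hypothetical:
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = adapter->ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.nm_register = foo_netmap_reg;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
 *	netmap_attach(&na);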
 */

	/* private cleanup */
	void (*nm_dtor)(struct netmap_adapter *);

	int (*nm_register)(struct netmap_adapter *, int onoff);

	int (*nm_txsync)(struct netmap_adapter *, u_int ring, int flags);
	int (*nm_rxsync)(struct netmap_adapter *, u_int ring, int flags);
#define NAF_FORCE_READ    1
#define NAF_FORCE_RECLAIM 2
	/* return configuration information */
	int (*nm_config)(struct netmap_adapter *,
		u_int *txr, u_int *txd, u_int *rxr, u_int *rxd);
	int (*nm_krings_create)(struct netmap_adapter *);
	void (*nm_krings_delete)(struct netmap_adapter *);
	int (*nm_notify)(struct netmap_adapter *,
		u_int ring, enum txrx, int flags);
#define NAF_GLOBAL_NOTIFY 4
#define NAF_DISABLE_NOTIFY 8

	/* standard refcount to control the lifetime of the adapter
	 * (it should be equal to the lifetime of the corresponding ifp)
	 */
	int na_refcount;

	/* memory allocator (opaque).
	 * We also cache a pointer to the lut_entry for translating
	 * buffer addresses, and the total number of buffers.
	 */
	struct netmap_mem_d *nm_mem;
	struct lut_entry *na_lut;
	uint32_t na_lut_objtotal;	/* max buffer index */

	/* used internally. If non-null, the interface cannot be bound
	 * from userspace
	 */
	void *na_private;
};


/*
 * If the NIC is owned by the kernel
 * (i.e., bridge), neither another bridge nor user can use it;
 * if the NIC is owned by a user, only users can share it.
 * Evaluation must be done under NMG_LOCK().
 */
#define NETMAP_OWNED_BY_KERN(na)	(na->na_private)
#define NETMAP_OWNED_BY_ANY(na) \
	(NETMAP_OWNED_BY_KERN(na) || (na->active_fds > 0))


/*
 * derived netmap adapters for various types of ports
 */
struct netmap_vp_adapter {	/* VALE software port */
	struct netmap_adapter up;

	/*
	 * Bridge support:
	 *
	 * bdg_port is the port number used in the bridge;
	 * na_bdg points to the bridge this NA is attached to.
	 */
	int bdg_port;
	struct nm_bridge *na_bdg;
	int retry;

	u_int offset;	/* Offset of ethernet header for each packet. */
};


struct netmap_hw_adapter {	/* physical device */
	struct netmap_adapter up;

	struct net_device_ops nm_ndo;	// XXX linux only
};


struct netmap_generic_adapter {	/* emulated device */
	struct netmap_hw_adapter up;

	/* Pointer to a previously used netmap adapter. */
	struct netmap_adapter *prev;

	/* generic netmap adapters support:
	 * a net_device_ops struct overrides ndo_select_queue(),
	 * save_if_input saves the if_input hook (FreeBSD),
	 * mit_timer and mit_pending implement rx interrupt mitigation.
	 */
	struct net_device_ops generic_ndo;
	void (*save_if_input)(struct ifnet *, struct mbuf *);

	struct hrtimer mit_timer;
	int mit_pending;
#ifdef linux
	netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *);
#endif
};

#ifdef WITH_VALE

/*
 * Bridge wrapper for non-VALE ports attached to a VALE switch.
 *
 * The real device must already have its own netmap adapter (hwna).
 * The bridge wrapper and the hwna adapter share the same set of
 * netmap rings and buffers, but they have two separate sets of
 * krings descriptors, with tx/rx meanings swapped:
 *
 *                                  netmap
 *           bwrap     krings       rings       krings      hwna
 *         +------+   +------+     +-----+     +------+   +------+
 *         |tx_rings->|      |\   /|     |----|      |<-tx_rings|
 *         |      |   +------+ \ / +-----+    +------+   |      |
 *         |      |             X                        |      |
 *         |      |            / \                       |      |
 *         |      |   +------+/   \+-----+    +------+   |      |
 *         |rx_rings->|      |     |     |----|      |<-rx_rings|
 *         |      |   +------+     +-----+    +------+   |      |
 *         +------+                                      +------+
 *
 * - packets coming from the bridge go to the bwrap rx rings,
 *   which are also the hwna tx rings. The bwrap notify callback
 *   will then complete the hwna tx (see netmap_bwrap_notify).
 *
 * - packets coming from the outside go to the hwna rx rings,
 *   which are also the bwrap tx rings. The (overwritten) hwna
 *   notify method will then complete the bridge tx
 *   (see netmap_bwrap_intr_notify).
 *
 * The bridge wrapper may optionally connect the hwna 'host' rings
 * to the bridge. This is done by using a second port in the
 * bridge and connecting it to the 'host' netmap_vp_adapter
 * contained in the netmap_bwrap_adapter. The bwrap host adapter
 * cross-links the hwna host rings in the same way as shown above.
 *
 * - packets coming from the bridge and directed to the host stack
 *   are handled by the bwrap host notify callback
 *   (see netmap_bwrap_host_notify)
 *
 * - packets coming from the host stack are still handled by the
 *   overwritten hwna notify callback (netmap_bwrap_intr_notify),
 *   but are diverted to the host adapter depending on the ring number.
 *
 */
struct netmap_bwrap_adapter {
	struct netmap_vp_adapter up;
	struct netmap_vp_adapter host;	/* for host rings */
	struct netmap_adapter *hwna;	/* the underlying device */

	/* backup of the hwna notify callback */
	int (*save_notify)(struct netmap_adapter *,
			u_int ring, enum txrx, int flags);

	/*
	 * When we attach a physical interface to the bridge, we
	 * allow the controlling process to terminate, so we need
	 * a place to store the netmap_priv_d data structure.
	 * This is only done when physical interfaces
	 * are attached to a bridge.
	 */
	struct netmap_priv_d *na_kpriv;
};


#endif /* WITH_VALE */


/* return slots reserved to rx clients; used in drivers */
static inline uint32_t
nm_kr_rxspace(struct netmap_kring *k)
{
	int space = k->nr_hwtail - k->nr_hwcur;
	if (space < 0)
		space += k->nkr_num_slots;
	ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail);

	return space;
}


/* True if no space in the tx ring, only valid after txsync_prologue */
static inline int
nm_kr_txempty(struct netmap_kring *kring)
{
	return kring->rcur == kring->nr_hwtail;
}


/*
 * protect against multiple threads using the same ring.
 * also check that the ring has not been stopped.
 * We only care for 0 or !=0 as a return code.
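 *
 * Typical usage (illustrative sketch):
 *
 *	if (nm_kr_tryget(kring))
 *		return;		// ring is busy or stopped
 *	... operate on the kring ...
 *	nm_kr_put(kring);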
 */
#define NM_KR_BUSY	1
#define NM_KR_STOPPED	2


static __inline void nm_kr_put(struct netmap_kring *kr)
{
	NM_ATOMIC_CLEAR(&kr->nr_busy);
}


static __inline int nm_kr_tryget(struct netmap_kring *kr)
{
	/* check a first time without taking the lock
	 * to avoid starvation for nm_kr_get()
	 */
	if (unlikely(kr->nkr_stopped)) {
		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
		return NM_KR_STOPPED;
	}
	if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)))
		return NM_KR_BUSY;
	/* check a second time with lock held */
	if (unlikely(kr->nkr_stopped)) {
		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
		nm_kr_put(kr);
		return NM_KR_STOPPED;
	}
	return 0;
}


/*
 * The following functions are used by individual drivers to
 * support netmap operation.
 *
 * netmap_attach() initializes a struct netmap_adapter, allocating the
 *	struct netmap_ring's and the struct selinfo.
 *
 * netmap_detach() frees the memory allocated by netmap_attach().
 *
 * netmap_transmit() replaces the if_transmit routine of the interface,
 *	and is used to intercept packets coming from the stack.
 *
 * netmap_load_map/netmap_reload_map are helper routines to set/reset
 *	the dmamap for a packet buffer
 *
 * netmap_reset() is a helper routine to be called in the driver
 *	when reinitializing a ring.
 */
int netmap_attach(struct netmap_adapter *);
int netmap_attach_common(struct netmap_adapter *);
void netmap_detach_common(struct netmap_adapter *na);
void netmap_detach(struct ifnet *);
int netmap_transmit(struct ifnet *, struct mbuf *);
struct netmap_slot *netmap_reset(struct netmap_adapter *na,
	enum txrx tx, u_int n, u_int new_cur);
int netmap_ring_reinit(struct netmap_kring *);

/* default functions to handle rx/tx interrupts */
int netmap_rx_irq(struct ifnet *, u_int, u_int *);
#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL)
void netmap_common_irq(struct ifnet *, u_int, u_int *work_done);

void netmap_disable_all_rings(struct ifnet *);
void netmap_enable_all_rings(struct ifnet *);
void netmap_disable_ring(struct netmap_kring *kr);


/* set/clear native flags and if_transmit/netdev_ops */
static inline void
nm_set_native_flags(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

	na->na_flags |= (NAF_NATIVE_ON | NAF_NETMAP_ON);
#ifdef IFCAP_NETMAP /* or FreeBSD ? */
	ifp->if_capenable |= IFCAP_NETMAP;
#endif
#ifdef __FreeBSD__
	na->if_transmit = ifp->if_transmit;
	ifp->if_transmit = netmap_transmit;
#else
	na->if_transmit = (void *)ifp->netdev_ops;
	ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo;
#endif
}


static inline void
nm_clear_native_flags(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

#ifdef __FreeBSD__
	ifp->if_transmit = na->if_transmit;
#else
	ifp->netdev_ops = (void *)na->if_transmit;
#endif
	na->na_flags &= ~(NAF_NATIVE_ON | NAF_NETMAP_ON);
#ifdef IFCAP_NETMAP /* or FreeBSD ? */
	ifp->if_capenable &= ~IFCAP_NETMAP;
#endif
}


/*
 * validates parameters in the ring/kring, returns a value for head.
 * If any error, returns ring_size to force a reinit.
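 *
 * Typical use in the sync paths (illustrative sketch):
 *
 *	if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
 *		netmap_ring_reinit(kring);	// ring was corrupted
 *	} else {
 *		na->nm_txsync(na, ring_id, 0);	// driver does the work
 *	}
 *
 * and the driver's txsync calls nm_txsync_finalize(kring) (below)
 * before returning.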
 */
uint32_t nm_txsync_prologue(struct netmap_kring *);


/*
 * validates parameters in the ring/kring, returns a value for head.
 * If any error, returns ring_size to force a reinit.
 */
uint32_t nm_rxsync_prologue(struct netmap_kring *);


/*
 * update kring and ring at the end of txsync.
 */
static inline void
nm_txsync_finalize(struct netmap_kring *kring)
{
	/* update ring head/tail to what the kernel knows */
	kring->ring->tail = kring->rtail = kring->nr_hwtail;
	kring->ring->head = kring->rhead = kring->nr_hwcur;

	/* note, head/rhead/hwcur might be behind cur/rcur
	 * if no carrier
	 */
	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
		kring->name, kring->nr_hwcur, kring->nr_hwtail,
		kring->rhead, kring->rcur, kring->rtail);
}


/*
 * update kring and ring at the end of rxsync
 */
static inline void
nm_rxsync_finalize(struct netmap_kring *kring)
{
	/* tell userspace that there might be new packets */
	ND("head %d cur %d tail %d -> %d", kring->ring->head,
		kring->ring->cur, kring->ring->tail, kring->nr_hwtail);
	kring->ring->tail = kring->rtail = kring->nr_hwtail;
	/* make a copy of the state for next round */
	kring->rhead = kring->ring->head;
	kring->rcur = kring->ring->cur;
}


/* check/fix address and len in tx rings */
#if 1 /* debug version */
#define	NM_CHECK_ADDR_LEN(_a, _l)	do {				\
	if (_a == netmap_buffer_base || _l > NETMAP_BUF_SIZE) {		\
		RD(5, "bad addr/len ring %d slot %d idx %d len %d",	\
			ring_nr, nm_i, slot->buf_idx, len);		\
		if (_l > NETMAP_BUF_SIZE)				\
			_l = NETMAP_BUF_SIZE;				\
	} } while (0)
#else /* no debug version */
#define	NM_CHECK_ADDR_LEN(_a, _l)	do {				\
		if (_l > NETMAP_BUF_SIZE)				\
			_l = NETMAP_BUF_SIZE;				\
	} while (0)
#endif


/*---------------------------------------------------------------*/
/*
 * Support routines to be used with the VALE switch
 */
int netmap_update_config(struct netmap_adapter *na);
int netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom);
void netmap_krings_delete(struct netmap_adapter *na);
int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);


struct netmap_if *
netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
	uint16_t ringid, int *err);



u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg);
int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na);


#ifdef WITH_VALE
/*
 * The following bridge-related functions are used by other
 * kernel modules.
 *
 * VALE only supports unicast or broadcast. The lookup
 * function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports,
 * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown.
 * XXX in practice "unknown" might be handled same as broadcast.
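 *
 * A trivial lookup function that floods every frame, as an
 * illustrative sketch ('my_lookup' is a hypothetical name):
 *
 *	static u_int
 *	my_lookup(char *buf, u_int len, uint8_t *ring_nr,
 *	    struct netmap_vp_adapter *vpna)
 *	{
 *		return NM_BDG_BROADCAST;	// defined below
 *	}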
 */
typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len,
		uint8_t *ring_nr, struct netmap_vp_adapter *);
u_int netmap_bdg_learning(char *, u_int, uint8_t *,
		struct netmap_vp_adapter *);

#define	NM_BDG_MAXPORTS		254	/* up to 254 */
#define	NM_BDG_BROADCAST	NM_BDG_MAXPORTS
#define	NM_BDG_NOPORT		(NM_BDG_MAXPORTS+1)

#define	NM_NAME			"vale"	/* prefix for bridge port name */


/* these are redefined in case of no VALE support */
int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create);
void netmap_init_bridges(void);
int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func);

#else /* !WITH_VALE */
#define	netmap_get_bdg_na(_1, _2, _3)	0
#define	netmap_init_bridges(_1)
#define	netmap_bdg_ctl(_1, _2)	EINVAL
#endif /* !WITH_VALE */

/* Various prototypes */
int netmap_poll(struct cdev *dev, int events, struct thread *td);
int netmap_init(void);
void netmap_fini(void);
int netmap_get_memory(struct netmap_priv_d *p);
void netmap_dtor(void *data);
int netmap_dtor_locked(struct netmap_priv_d *priv);

int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td);

/* netmap_adapter creation/destruction */
#define NM_IFPNAME(ifp) ((ifp) ? (ifp)->if_xname : "zombie")

// #define NM_DEBUG_PUTGET 1

#ifdef NM_DEBUG_PUTGET

#define NM_DBG(f) __##f

void __netmap_adapter_get(struct netmap_adapter *na);

#define netmap_adapter_get(na)				\
	do {						\
		struct netmap_adapter *__na = na;	\
		D("getting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \
		__netmap_adapter_get(__na);		\
	} while (0)

int __netmap_adapter_put(struct netmap_adapter *na);

#define netmap_adapter_put(na)				\
	do {						\
		struct netmap_adapter *__na = na;	\
		D("putting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \
		__netmap_adapter_put(__na);		\
	} while (0)

#else /* !NM_DEBUG_PUTGET */

#define NM_DBG(f) f
void netmap_adapter_get(struct netmap_adapter *na);
int netmap_adapter_put(struct netmap_adapter *na);

#endif /* !NM_DEBUG_PUTGET */


/*
 * module variables
 */
extern u_int netmap_buf_size;
#define NETMAP_BUF_SIZE	netmap_buf_size	// XXX remove
extern int netmap_mitigate;	// XXX not really used
extern int netmap_no_pendintr;
extern u_int netmap_total_buffers;	// global allocator
extern char *netmap_buffer_base;	// global allocator
extern int netmap_verbose;	// XXX debugging
enum {				/* verbose flags */
	NM_VERB_ON = 1,		/* generic verbose */
	NM_VERB_HOST = 0x2,	/* verbose host stack */
	NM_VERB_RXSYNC = 0x10,	/* verbose on rxsync/txsync */
	NM_VERB_TXSYNC = 0x20,
	NM_VERB_RXINTR = 0x100,	/* verbose on rx/tx intr (driver) */
	NM_VERB_TXINTR = 0x200,
	NM_VERB_NIC_RXSYNC = 0x1000,	/* verbose on rx/tx intr (driver) */
	NM_VERB_NIC_TXSYNC = 0x2000,
};

extern int netmap_txsync_retry;
extern int netmap_generic_mit;
extern int netmap_generic_ringsize;

/*
 * NA returns a pointer to the struct netmap_adapter from the ifp;
 * WNA is used to write it.
 */
#ifndef WNA
#define	WNA(_ifp)	(_ifp)->if_pspare[0]
#endif
#define	NA(_ifp)	((struct netmap_adapter *)WNA(_ifp))

/*
 * Macros to determine if an interface is netmap capable or netmap enabled.
 * See the magic field in struct netmap_adapter.
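 *
 * (Illustrative: at attach time the core stores the adapter
 * pointer and then marks the interface, roughly
 *
 *	WNA(ifp) = na;
 *	NETMAP_SET_CAPABLE(ifp);
 *
 * after which NETMAP_CAPABLE(ifp) is true on either platform.)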
 */
#ifdef __FreeBSD__

/*
 * on FreeBSD just use if_capabilities and if_capenable.
 */
#define NETMAP_CAPABLE(ifp)	(NA(ifp) && \
	((ifp)->if_capabilities & IFCAP_NETMAP))

#define	NETMAP_SET_CAPABLE(ifp)	\
	(ifp)->if_capabilities |= IFCAP_NETMAP

#else	/* linux */

/*
 * on linux:
 * we check if NA(ifp) is set and its first element has a related
 * magic value. The capenable is within the struct netmap_adapter.
 */
#define	NETMAP_MAGIC	0x52697a7a

#define NETMAP_CAPABLE(ifp)	(NA(ifp) && \
	((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC)

#define	NETMAP_SET_CAPABLE(ifp)	\
	NA(ifp)->magic = ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC

#endif	/* linux */

#ifdef __FreeBSD__

/* Callback invoked by the dma machinery after a successful dmamap_load */
static void netmap_dmamap_cb(__unused void *arg,
    __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error)
{
}

/* bus_dmamap_load wrapper: call aforementioned function if map != NULL.
 * XXX can we do it without a callback ?
 */
static inline void
netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
{
	if (map)
		bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE,
		    netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT);
}

/* update the map when a buffer changes. */
static inline void
netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf)
{
	if (map) {
		bus_dmamap_unload(tag, map);
		bus_dmamap_load(tag, map, buf, NETMAP_BUF_SIZE,
		    netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT);
	}
}

#else /* linux */

/*
 * XXX How do we redefine these functions:
 *
 * on linux we need
 *	dma_map_single(&pdev->dev, virt_addr, len, direction)
 *	dma_unmap_single(&adapter->pdev->dev, phys_addr, len, direction)
 * The len can be implicit (on netmap it is NETMAP_BUF_SIZE);
 * unfortunately the direction is not, so we need to change
 * something to have a cross API
 */
#define netmap_load_map(_t, _m, _b)
#define netmap_reload_map(_t, _m, _b)
#if 0
	struct e1000_buffer *buffer_info = &tx_ring->buffer_info[l];
	/* set time_stamp *before* dma to help avoid a possible race */
	buffer_info->time_stamp = jiffies;
	buffer_info->mapped_as_page = false;
	buffer_info->length = len;
	//buffer_info->next_to_watch = l;
	/* reload dma map */
	dma_unmap_single(&adapter->pdev->dev, buffer_info->dma,
			NETMAP_BUF_SIZE, DMA_TO_DEVICE);
	buffer_info->dma = dma_map_single(&adapter->pdev->dev,
			addr, NETMAP_BUF_SIZE, DMA_TO_DEVICE);

	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
		D("dma mapping error");
		/* goto dma_error; See e1000_put_txbuf() */
		/* XXX reset */
	}
	tx_desc->buffer_addr = htole64(buffer_info->dma); //XXX

#endif

/*
 * The bus_dmamap_sync() can be one of wmb() or rmb() depending on direction.
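 *
 * (Illustrative driver-side use of the map helpers above, on
 * FreeBSD, when a slot carries the NS_BUF_CHANGED flag; 'txr'
 * and 'txbuf' are hypothetical driver structures:
 *
 *	if (slot->flags & NS_BUF_CHANGED) {
 *		// buffer address changed, reload the dma map
 *		netmap_reload_map(txr->txtag, txbuf->map, addr);
 *		slot->flags &= ~NS_BUF_CHANGED;
 *	}
 * )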
 */
#define bus_dmamap_sync(_a, _b, _c)

#endif /* linux */


/*
 * functions to map NIC to KRING indexes (n2k) and vice versa (k2n)
 */
static inline int
netmap_idx_n2k(struct netmap_kring *kr, int idx)
{
	int n = kr->nkr_num_slots;
	idx += kr->nkr_hwofs;
	if (idx < 0)
		return idx + n;
	else if (idx < n)
		return idx;
	else
		return idx - n;
}


static inline int
netmap_idx_k2n(struct netmap_kring *kr, int idx)
{
	int n = kr->nkr_num_slots;
	idx -= kr->nkr_hwofs;
	if (idx < 0)
		return idx + n;
	else if (idx < n)
		return idx;
	else
		return idx - n;
}


/* Entries of the look-up table. */
struct lut_entry {
	void *vaddr;		/* virtual address. */
	vm_paddr_t paddr;	/* physical address. */
};

struct netmap_obj_pool;
extern struct lut_entry *netmap_buffer_lut;
#define NMB_VA(i)	(netmap_buffer_lut[i].vaddr)
#define NMB_PA(i)	(netmap_buffer_lut[i].paddr)

/*
 * NMB returns the virtual address of a buffer (buffer 0 on bad index);
 * PNMB also fills the physical address.
 */
static inline void *
NMB(struct netmap_slot *slot)
{
	uint32_t i = slot->buf_idx;
	return (unlikely(i >= netmap_total_buffers)) ? NMB_VA(0) : NMB_VA(i);
}

static inline void *
PNMB(struct netmap_slot *slot, uint64_t *pp)
{
	uint32_t i = slot->buf_idx;
	void *ret = (i >= netmap_total_buffers) ? NMB_VA(0) : NMB_VA(i);

	*pp = (i >= netmap_total_buffers) ? NMB_PA(0) : NMB_PA(i);
	return ret;
}

/* Generic version of NMB, which uses device-specific memory. */
static inline void *
BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot)
{
	struct lut_entry *lut = na->na_lut;
	uint32_t i = slot->buf_idx;
	return (unlikely(i >= na->na_lut_objtotal)) ?
		lut[0].vaddr : lut[i].vaddr;
}



void netmap_txsync_to_host(struct netmap_adapter *na);


/*
 * Structure associated with each thread that registered an interface.
 *
 * The first 4 fields of this structure are written by NIOCREGIF and
 * read by poll() and NIOC?XSYNC.
 *
 * There is low contention among writers (a correct user program
 * should have none) and among writers and readers, so we use a
 * single global lock to protect the structure initialization;
 * since initialization involves the allocation of memory,
 * we reuse the memory allocator lock.
 *
 * Read access to the structure is lock free. Readers must check that
 * np_nifp is not NULL before using the other fields.
 * If np_nifp is NULL initialization has not been performed,
 * so they should return an error to userspace.
 *
 * The ref_done field is used to regulate access to the refcount in the
 * memory allocator. The refcount must be incremented at most once for
 * each open("/dev/netmap"). The increment is performed by the first
 * function that calls netmap_get_memory() (currently called by
 * mmap(), NIOCGINFO and NIOCREGIF).
 * If the refcount is incremented, it is then decremented when the
 * private structure is destroyed.
 */
struct netmap_priv_d {
	struct netmap_if * volatile np_nifp;	/* netmap if descriptor.
						 */

	struct netmap_adapter	*np_na;
	int		np_ringid;	/* from the ioctl */
	u_int		np_qfirst, np_qlast;	/* range of rings to scan */
	uint16_t	np_txpoll;

	struct netmap_mem_d *np_mref;	/* use with NMG_LOCK held */
	/* np_refcount is only used on FreeBSD */
	int		np_refcount;	/* use with NMG_LOCK held */
};


/*
 * generic netmap emulation for devices that do not have
 * native netmap support.
 */
int generic_netmap_attach(struct ifnet *ifp);

int netmap_catch_rx(struct netmap_adapter *na, int intercept);
void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);
void netmap_catch_tx(struct netmap_generic_adapter *na, int enable);
int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr);
int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx);
void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq);

/*
 * netmap_mitigation API. This is used by the generic adapter
 * to reduce the number of interrupt requests/selwakeups
 * delivered to clients on incoming packets.
 */
void netmap_mitigation_init(struct netmap_generic_adapter *na);
void netmap_mitigation_start(struct netmap_generic_adapter *na);
void netmap_mitigation_restart(struct netmap_generic_adapter *na);
int netmap_mitigation_active(struct netmap_generic_adapter *na);
void netmap_mitigation_cleanup(struct netmap_generic_adapter *na);

#endif /* _NET_NETMAP_KERN_H_ */