/*
 * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    select()able file descriptors on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
 *

		SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.
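
A minimal sketch of the sequence above, as seen from userspace
(illustrative only: it relies on struct nmreq and the macros declared
in net/netmap.h and net/netmap_user.h, uses "em0" as a placeholder
interface name, and omits the usual includes and all error checking):

	struct nmreq req;
	struct netmap_if *nifp;
	struct netmap_ring *rxring;
	struct pollfd pfd;
	void *mem;
	int fd;
	u_int i;

	fd = open("/dev/netmap", O_RDWR);		// step 1
	bzero(&req, sizeof(req));
	req.nr_version = NETMAP_API;
	strcpy(req.nr_name, "em0");
	ioctl(fd, NIOCREGIF, &req);			// step 2
	mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);				// step 3
	nifp = NETMAP_IF(mem, req.nr_offset);
	rxring = NETMAP_RXRING(nifp, 0);		// step 4
	pfd.fd = fd;
	pfd.events = POLLIN;
	for (;;) {
		poll(&pfd, 1, -1);			// step 6
		while (rxring->avail > 0) {		// step 4
			i = rxring->cur;
			// rxring->slot[i].len bytes are available at
			// NETMAP_BUF(rxring, rxring->slot[i].buf_idx)
			rxring->cur = NETMAP_RING_NEXT(rxring, i);
			rxring->avail--;
		}
	}

When busy waiting, an explicit ioctl(fd, NIOCRXSYNC, NULL) (or
NIOCTXSYNC on the transmit side) can be used instead of poll()
to implement step 5.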

		LOCKING (INTERNAL)

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(); this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.


		--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */

/*
 * OS-specific code that is used only within this file.
127 * Other OS-specific code that must be accessed by drivers 128 * is present in netmap_kern.h 129 */ 130 131 #if defined(__FreeBSD__) 132 #include <sys/cdefs.h> /* prerequisite */ 133 __FBSDID("$FreeBSD$"); 134 135 #include <sys/types.h> 136 #include <sys/module.h> 137 #include <sys/errno.h> 138 #include <sys/param.h> /* defines used in kernel.h */ 139 #include <sys/jail.h> 140 #include <sys/kernel.h> /* types used in module initialization */ 141 #include <sys/conf.h> /* cdevsw struct */ 142 #include <sys/uio.h> /* uio struct */ 143 #include <sys/sockio.h> 144 #include <sys/socketvar.h> /* struct socket */ 145 #include <sys/malloc.h> 146 #include <sys/mman.h> /* PROT_EXEC */ 147 #include <sys/poll.h> 148 #include <sys/proc.h> 149 #include <sys/rwlock.h> 150 #include <vm/vm.h> /* vtophys */ 151 #include <vm/pmap.h> /* vtophys */ 152 #include <vm/vm_param.h> 153 #include <vm/vm_object.h> 154 #include <vm/vm_page.h> 155 #include <vm/vm_pager.h> 156 #include <vm/uma.h> 157 #include <sys/socket.h> /* sockaddrs */ 158 #include <sys/selinfo.h> 159 #include <sys/sysctl.h> 160 #include <net/if.h> 161 #include <net/if_var.h> 162 #include <net/bpf.h> /* BIOCIMMEDIATE */ 163 #include <net/vnet.h> 164 #include <machine/bus.h> /* bus_dmamap_* */ 165 #include <sys/endian.h> 166 #include <sys/refcount.h> 167 168 #define prefetch(x) __builtin_prefetch(x) 169 170 #define BDG_RWLOCK_T struct rwlock // struct rwlock 171 172 #define BDG_RWINIT(b) \ 173 rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) 174 #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) 175 #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) 176 #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) 177 #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) 178 #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) 179 #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) 180 181 182 /* netmap global lock. 
183 * normally called within the user thread (upon a system call) 184 * or when a file descriptor or process is terminated 185 * (last close or last munmap) 186 */ 187 188 #define NMG_LOCK_T struct mtx 189 #define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF) 190 #define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) 191 #define NMG_LOCK() mtx_lock(&netmap_global_lock) 192 #define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) 193 #define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) 194 195 196 /* atomic operations */ 197 #include <machine/atomic.h> 198 #define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) 199 #define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) 200 201 202 #elif defined(linux) 203 204 #include "bsd_glue.h" 205 206 static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *); 207 208 static struct device_driver* 209 linux_netmap_find_driver(struct device *dev) 210 { 211 struct device_driver *dd; 212 213 while ( (dd = dev->driver) == NULL ) { 214 if ( (dev = dev->parent) == NULL ) 215 return NULL; 216 } 217 return dd; 218 } 219 220 static struct net_device* 221 ifunit_ref(const char *name) 222 { 223 struct net_device *ifp = dev_get_by_name(&init_net, name); 224 struct device_driver *dd; 225 226 if (ifp == NULL) 227 return NULL; 228 229 if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL ) 230 goto error; 231 232 if (!try_module_get(dd->owner)) 233 goto error; 234 235 return ifp; 236 error: 237 dev_put(ifp); 238 return NULL; 239 } 240 241 static void 242 if_rele(struct net_device *ifp) 243 { 244 struct device_driver *dd; 245 dd = linux_netmap_find_driver(&ifp->dev); 246 dev_put(ifp); 247 if (dd) 248 module_put(dd->owner); 249 } 250 251 // XXX a mtx would suffice here too 20130404 gl 252 #define NMG_LOCK_T struct semaphore 253 #define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) 254 #define NMG_LOCK_DESTROY() 255 #define NMG_LOCK() down(&netmap_global_lock) 256 #define NMG_UNLOCK() up(&netmap_global_lock) 257 #define NMG_LOCK_ASSERT() // XXX to be completed 258 259 260 #elif defined(__APPLE__) 261 262 #warning OSX support is only partial 263 #include "osx_glue.h" 264 265 #else 266 267 #error Unsupported platform 268 269 #endif /* unsupported */ 270 271 /* 272 * common headers 273 */ 274 #include <net/netmap.h> 275 #include <dev/netmap/netmap_kern.h> 276 #include <dev/netmap/netmap_mem2.h> 277 278 279 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); 280 281 /* 282 * The following variables are used by the drivers and replicate 283 * fields in the global memory pool. They only refer to buffers 284 * used by physical interfaces. 
285 */ 286 u_int netmap_total_buffers; 287 u_int netmap_buf_size; 288 char *netmap_buffer_base; /* also address of an invalid buffer */ 289 290 /* user-controlled variables */ 291 int netmap_verbose; 292 293 static int netmap_no_timestamp; /* don't timestamp on rxsync */ 294 295 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args"); 296 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose, 297 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode"); 298 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp, 299 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp"); 300 int netmap_mitigate = 1; 301 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, ""); 302 int netmap_no_pendintr = 1; 303 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, 304 CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); 305 int netmap_txsync_retry = 2; 306 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, 307 &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); 308 309 int netmap_drop = 0; /* debugging */ 310 int netmap_flags = 0; /* debug flags */ 311 int netmap_fwd = 0; /* force transparent mode */ 312 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ 313 314 SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); 315 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); 316 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); 317 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); 318 319 NMG_LOCK_T netmap_global_lock; 320 321 /* 322 * protect against multiple threads using the same ring. 323 * also check that the ring has not been stopped. 324 */ 325 #define NM_KR_BUSY 1 326 #define NM_KR_STOPPED 2 327 static void nm_kr_put(struct netmap_kring *kr); 328 static __inline int nm_kr_tryget(struct netmap_kring *kr) 329 { 330 /* check a first time without taking the lock 331 * to avoid starvation for nm_kr_get() 332 */ 333 if (unlikely(kr->nkr_stopped)) { 334 ND("ring %p stopped (%d)", kr, kr->nkr_stopped); 335 return NM_KR_STOPPED; 336 } 337 if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) 338 return NM_KR_BUSY; 339 /* check a second time with lock held */ 340 if (unlikely(kr->nkr_stopped)) { 341 ND("ring %p stopped (%d)", kr, kr->nkr_stopped); 342 nm_kr_put(kr); 343 return NM_KR_STOPPED; 344 } 345 return 0; 346 } 347 348 static __inline void nm_kr_put(struct netmap_kring *kr) 349 { 350 NM_ATOMIC_CLEAR(&kr->nr_busy); 351 } 352 353 static void nm_kr_get(struct netmap_kring *kr) 354 { 355 while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) 356 tsleep(kr, 0, "NM_KR_GET", 4); 357 } 358 359 static void nm_disable_ring(struct netmap_kring *kr) 360 { 361 kr->nkr_stopped = 1; 362 nm_kr_get(kr); 363 mtx_lock(&kr->q_lock); 364 mtx_unlock(&kr->q_lock); 365 nm_kr_put(kr); 366 } 367 368 void netmap_disable_all_rings(struct ifnet *ifp) 369 { 370 struct netmap_adapter *na; 371 int i; 372 373 if (!(ifp->if_capenable & IFCAP_NETMAP)) 374 return; 375 376 na = NA(ifp); 377 378 for (i = 0; i < na->num_tx_rings + 1; i++) { 379 nm_disable_ring(na->tx_rings + i); 380 selwakeuppri(&na->tx_rings[i].si, PI_NET); 381 } 382 for (i = 0; i < na->num_rx_rings + 1; i++) { 383 nm_disable_ring(na->rx_rings + i); 384 selwakeuppri(&na->rx_rings[i].si, PI_NET); 385 } 386 selwakeuppri(&na->tx_si, PI_NET); 387 selwakeuppri(&na->rx_si, PI_NET); 388 } 389 390 void netmap_enable_all_rings(struct ifnet *ifp) 391 { 392 struct netmap_adapter *na; 393 int i; 394 395 if (!(ifp->if_capenable & 
IFCAP_NETMAP)) 396 return; 397 398 na = NA(ifp); 399 for (i = 0; i < na->num_tx_rings + 1; i++) { 400 D("enabling %p", na->tx_rings + i); 401 na->tx_rings[i].nkr_stopped = 0; 402 } 403 for (i = 0; i < na->num_rx_rings + 1; i++) { 404 D("enabling %p", na->rx_rings + i); 405 na->rx_rings[i].nkr_stopped = 0; 406 } 407 } 408 409 410 /* 411 * generic bound_checking function 412 */ 413 u_int 414 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) 415 { 416 u_int oldv = *v; 417 const char *op = NULL; 418 419 if (dflt < lo) 420 dflt = lo; 421 if (dflt > hi) 422 dflt = hi; 423 if (oldv < lo) { 424 *v = dflt; 425 op = "Bump"; 426 } else if (oldv > hi) { 427 *v = hi; 428 op = "Clamp"; 429 } 430 if (op && msg) 431 printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); 432 return *v; 433 } 434 435 /* 436 * packet-dump function, user-supplied or static buffer. 437 * The destination buffer must be at least 30+4*len 438 */ 439 const char * 440 nm_dump_buf(char *p, int len, int lim, char *dst) 441 { 442 static char _dst[8192]; 443 int i, j, i0; 444 static char hex[] ="0123456789abcdef"; 445 char *o; /* output position */ 446 447 #define P_HI(x) hex[((x) & 0xf0)>>4] 448 #define P_LO(x) hex[((x) & 0xf)] 449 #define P_C(x) ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.') 450 if (!dst) 451 dst = _dst; 452 if (lim <= 0 || lim > len) 453 lim = len; 454 o = dst; 455 sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim); 456 o += strlen(o); 457 /* hexdump routine */ 458 for (i = 0; i < lim; ) { 459 sprintf(o, "%5d: ", i); 460 o += strlen(o); 461 memset(o, ' ', 48); 462 i0 = i; 463 for (j=0; j < 16 && i < lim; i++, j++) { 464 o[j*3] = P_HI(p[i]); 465 o[j*3+1] = P_LO(p[i]); 466 } 467 i = i0; 468 for (j=0; j < 16 && i < lim; i++, j++) 469 o[j + 48] = P_C(p[i]); 470 o[j+48] = '\n'; 471 o += j+49; 472 } 473 *o = '\0'; 474 #undef P_HI 475 #undef P_LO 476 #undef P_C 477 return dst; 478 } 479 480 /* 481 * system parameters (most of them in netmap_kern.h) 482 * NM_NAME prefix for switch port names, default "vale" 483 * NM_BDG_MAXPORTS number of ports 484 * NM_BRIDGES max number of switches in the system. 485 * XXX should become a sysctl or tunable 486 * 487 * Switch ports are named valeX:Y where X is the switch name and Y 488 * is the port. If Y matches a physical interface name, the port is 489 * connected to a physical device. 490 * 491 * Unlike physical interfaces, switch ports use their own memory region 492 * for rings and buffers. 493 * The virtual interfaces use per-queue lock instead of core lock. 494 * In the tx loop, we aggregate traffic in batches to make all operations 495 * faster. The batch size is bridge_batch. 496 */ 497 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ 498 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ 499 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 500 #define NM_BDG_HASH 1024 /* forwarding table entries */ 501 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 502 #define NM_MULTISEG 64 /* max size of a chain of bufs */ 503 /* actual size of the tables */ 504 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) 505 /* NM_FT_NULL terminates a list of slots in the ft */ 506 #define NM_FT_NULL NM_BDG_BATCH_MAX 507 #define NM_BRIDGES 8 /* number of bridges */ 508 509 510 /* 511 * bridge_batch is set via sysctl to the max batch size to be 512 * used in the bridge. The actual value may be larger as the 513 * last packet in the block may overflow the size. 
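 * (For example, on FreeBSD the value can be changed at run time through
 * the sysctl defined right below, e.g. "sysctl dev.netmap.bridge_batch=256".)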
514 */ 515 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ 516 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); 517 518 519 /* 520 * These are used to handle reference counters for bridge ports. 521 */ 522 #define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount) 523 #define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount) 524 525 /* The bridge references the buffers using the device specific look up table */ 526 static inline void * 527 BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot) 528 { 529 struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut; 530 uint32_t i = slot->buf_idx; 531 return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ? lut[0].vaddr : lut[i].vaddr; 532 } 533 534 static int bdg_netmap_attach(struct netmap_adapter *); 535 static int bdg_netmap_reg(struct ifnet *ifp, int onoff); 536 int kern_netmap_regif(struct nmreq *nmr); 537 538 /* 539 * Each transmit queue accumulates a batch of packets into 540 * a structure before forwarding. Packets to the same 541 * destination are put in a list using ft_next as a link field. 542 * ft_frags and ft_next are valid only on the first fragment. 543 */ 544 struct nm_bdg_fwd { /* forwarding entry for a bridge */ 545 void *ft_buf; /* netmap or indirect buffer */ 546 uint8_t ft_frags; /* how many fragments (only on 1st frag) */ 547 uint8_t _ft_port; /* dst port (unused) */ 548 uint16_t ft_flags; /* flags, e.g. indirect */ 549 uint16_t ft_len; /* src fragment len */ 550 uint16_t ft_next; /* next packet to same destination */ 551 }; 552 553 /* 554 * For each output interface, nm_bdg_q is used to construct a list. 555 * bq_len is the number of output buffers (we can have coalescing 556 * during the copy). 557 */ 558 struct nm_bdg_q { 559 uint16_t bq_head; 560 uint16_t bq_tail; 561 uint32_t bq_len; /* number of buffers */ 562 }; 563 564 /* XXX revise this */ 565 struct nm_hash_ent { 566 uint64_t mac; /* the top 2 bytes are the epoch */ 567 uint64_t ports; 568 }; 569 570 /* 571 * nm_bridge is a descriptor for a VALE switch. 572 * Interfaces for a bridge are all in bdg_ports[]. 573 * The array has fixed size, an empty entry does not terminate 574 * the search, but lookups only occur on attach/detach so we 575 * don't mind if they are slow. 576 * 577 * The bridge is non blocking on the transmit ports: excess 578 * packets are dropped if there is no room on the output port. 579 * 580 * bdg_lock protects accesses to the bdg_ports array. 581 * This is a rw lock (or equivalent). 582 */ 583 struct nm_bridge { 584 /* XXX what is the proper alignment/layout ? */ 585 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ 586 int bdg_namelen; 587 uint32_t bdg_active_ports; /* 0 means free */ 588 char bdg_basename[IFNAMSIZ]; 589 590 /* Indexes of active ports (up to active_ports) 591 * and all other remaining ports. 592 */ 593 uint8_t bdg_port_index[NM_BDG_MAXPORTS]; 594 595 struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS]; 596 597 598 /* 599 * The function to decide the destination port. 600 * It returns either of an index of the destination port, 601 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to 602 * forward this packet. ring_nr is the source ring index, and the 603 * function may overwrite this value to forward this packet to a 604 * different ring index. 605 * This function must be set by netmap_bdgctl(). 606 */ 607 bdg_lookup_fn_t nm_bdg_lookup; 608 609 /* the forwarding table, MAC+ports. 
610 * XXX should be changed to an argument to be passed to 611 * the lookup function, and allocated on attach 612 */ 613 struct nm_hash_ent ht[NM_BDG_HASH]; 614 }; 615 616 617 /* 618 * XXX in principle nm_bridges could be created dynamically 619 * Right now we have a static array and deletions are protected 620 * by an exclusive lock. 621 */ 622 struct nm_bridge nm_bridges[NM_BRIDGES]; 623 624 625 /* 626 * A few function to tell which kind of port are we using. 627 * XXX should we hold a lock ? 628 * 629 * nma_is_vp() virtual port 630 * nma_is_host() port connected to the host stack 631 * nma_is_hw() port connected to a NIC 632 */ 633 int nma_is_vp(struct netmap_adapter *na); 634 int 635 nma_is_vp(struct netmap_adapter *na) 636 { 637 return na->nm_register == bdg_netmap_reg; 638 } 639 640 static __inline int 641 nma_is_host(struct netmap_adapter *na) 642 { 643 return na->nm_register == NULL; 644 } 645 646 static __inline int 647 nma_is_hw(struct netmap_adapter *na) 648 { 649 /* In case of sw adapter, nm_register is NULL */ 650 return !nma_is_vp(na) && !nma_is_host(na); 651 } 652 653 654 /* 655 * If the NIC is owned by the kernel 656 * (i.e., bridge), neither another bridge nor user can use it; 657 * if the NIC is owned by a user, only users can share it. 658 * Evaluation must be done under NMG_LOCK(). 659 */ 660 #define NETMAP_OWNED_BY_KERN(ifp) (!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg) 661 #define NETMAP_OWNED_BY_ANY(ifp) \ 662 (NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0)) 663 664 /* 665 * NA(ifp)->bdg_port port index 666 */ 667 668 669 /* 670 * this is a slightly optimized copy routine which rounds 671 * to multiple of 64 bytes and is often faster than dealing 672 * with other odd sizes. We assume there is enough room 673 * in the source and destination buffers. 674 * 675 * XXX only for multiples of 64 bytes, non overlapped. 676 */ 677 static inline void 678 pkt_copy(void *_src, void *_dst, int l) 679 { 680 uint64_t *src = _src; 681 uint64_t *dst = _dst; 682 if (unlikely(l >= 1024)) { 683 memcpy(dst, src, l); 684 return; 685 } 686 for (; likely(l > 0); l-=64) { 687 *dst++ = *src++; 688 *dst++ = *src++; 689 *dst++ = *src++; 690 *dst++ = *src++; 691 *dst++ = *src++; 692 *dst++ = *src++; 693 *dst++ = *src++; 694 *dst++ = *src++; 695 } 696 } 697 698 699 /* 700 * locate a bridge among the existing ones. 701 * MUST BE CALLED WITH NMG_LOCK() 702 * 703 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. 704 * We assume that this is called with a name of at least NM_NAME chars. 705 */ 706 static struct nm_bridge * 707 nm_find_bridge(const char *name, int create) 708 { 709 int i, l, namelen; 710 struct nm_bridge *b = NULL; 711 712 NMG_LOCK_ASSERT(); 713 714 namelen = strlen(NM_NAME); /* base length */ 715 l = name ? strlen(name) : 0; /* actual length */ 716 if (l < namelen) { 717 D("invalid bridge name %s", name ? 
name : NULL); 718 return NULL; 719 } 720 for (i = namelen + 1; i < l; i++) { 721 if (name[i] == ':') { 722 namelen = i; 723 break; 724 } 725 } 726 if (namelen >= IFNAMSIZ) 727 namelen = IFNAMSIZ; 728 ND("--- prefix is '%.*s' ---", namelen, name); 729 730 /* lookup the name, remember empty slot if there is one */ 731 for (i = 0; i < NM_BRIDGES; i++) { 732 struct nm_bridge *x = nm_bridges + i; 733 734 if (x->bdg_active_ports == 0) { 735 if (create && b == NULL) 736 b = x; /* record empty slot */ 737 } else if (x->bdg_namelen != namelen) { 738 continue; 739 } else if (strncmp(name, x->bdg_basename, namelen) == 0) { 740 ND("found '%.*s' at %d", namelen, name, i); 741 b = x; 742 break; 743 } 744 } 745 if (i == NM_BRIDGES && b) { /* name not found, can create entry */ 746 /* initialize the bridge */ 747 strncpy(b->bdg_basename, name, namelen); 748 ND("create new bridge %s with ports %d", b->bdg_basename, 749 b->bdg_active_ports); 750 b->bdg_namelen = namelen; 751 b->bdg_active_ports = 0; 752 for (i = 0; i < NM_BDG_MAXPORTS; i++) 753 b->bdg_port_index[i] = i; 754 /* set the default function */ 755 b->nm_bdg_lookup = netmap_bdg_learning; 756 /* reset the MAC address table */ 757 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); 758 } 759 return b; 760 } 761 762 763 /* 764 * Free the forwarding tables for rings attached to switch ports. 765 */ 766 static void 767 nm_free_bdgfwd(struct netmap_adapter *na) 768 { 769 int nrings, i; 770 struct netmap_kring *kring; 771 772 NMG_LOCK_ASSERT(); 773 nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; 774 kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; 775 for (i = 0; i < nrings; i++) { 776 if (kring[i].nkr_ft) { 777 free(kring[i].nkr_ft, M_DEVBUF); 778 kring[i].nkr_ft = NULL; /* protect from freeing twice */ 779 } 780 } 781 if (nma_is_hw(na)) 782 nm_free_bdgfwd(SWNA(na->ifp)); 783 } 784 785 786 /* 787 * Allocate the forwarding tables for the rings attached to the bridge ports. 788 */ 789 static int 790 nm_alloc_bdgfwd(struct netmap_adapter *na) 791 { 792 int nrings, l, i, num_dstq; 793 struct netmap_kring *kring; 794 795 NMG_LOCK_ASSERT(); 796 /* all port:rings + broadcast */ 797 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 798 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 799 l += sizeof(struct nm_bdg_q) * num_dstq; 800 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 801 802 nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; 803 kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; 804 for (i = 0; i < nrings; i++) { 805 struct nm_bdg_fwd *ft; 806 struct nm_bdg_q *dstq; 807 int j; 808 809 ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); 810 if (!ft) { 811 nm_free_bdgfwd(na); 812 return ENOMEM; 813 } 814 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 815 for (j = 0; j < num_dstq; j++) { 816 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 817 dstq[j].bq_len = 0; 818 } 819 kring[i].nkr_ft = ft; 820 } 821 if (nma_is_hw(na)) 822 nm_alloc_bdgfwd(SWNA(na->ifp)); 823 return 0; 824 } 825 826 827 /* 828 * Fetch configuration from the device, to cope with dynamic 829 * reconfigurations after loading the module. 
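 *
 * A driver that supports such reconfiguration exports an nm_config
 * callback in its netmap_adapter. A hypothetical sketch (the driver
 * name and softc fields are invented here for illustration; the
 * prototype is the one called below and declared in netmap_kern.h)
 * simply reports the current ring geometry:
 *
 *	static int
 *	foo_netmap_config(struct ifnet *ifp, u_int *txr, u_int *txd,
 *	    u_int *rxr, u_int *rxd)
 *	{
 *		struct foo_softc *sc = ifp->if_softc;
 *
 *		*txr = sc->num_tx_queues;
 *		*txd = sc->num_tx_desc;
 *		*rxr = sc->num_rx_queues;
 *		*rxd = sc->num_rx_desc;
 *		return (0);
 *	}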
830 */ 831 static int 832 netmap_update_config(struct netmap_adapter *na) 833 { 834 struct ifnet *ifp = na->ifp; 835 u_int txr, txd, rxr, rxd; 836 837 txr = txd = rxr = rxd = 0; 838 if (na->nm_config) { 839 na->nm_config(ifp, &txr, &txd, &rxr, &rxd); 840 } else { 841 /* take whatever we had at init time */ 842 txr = na->num_tx_rings; 843 txd = na->num_tx_desc; 844 rxr = na->num_rx_rings; 845 rxd = na->num_rx_desc; 846 } 847 848 if (na->num_tx_rings == txr && na->num_tx_desc == txd && 849 na->num_rx_rings == rxr && na->num_rx_desc == rxd) 850 return 0; /* nothing changed */ 851 if (netmap_verbose || na->refcount > 0) { 852 D("stored config %s: txring %d x %d, rxring %d x %d", 853 ifp->if_xname, 854 na->num_tx_rings, na->num_tx_desc, 855 na->num_rx_rings, na->num_rx_desc); 856 D("new config %s: txring %d x %d, rxring %d x %d", 857 ifp->if_xname, txr, txd, rxr, rxd); 858 } 859 if (na->refcount == 0) { 860 D("configuration changed (but fine)"); 861 na->num_tx_rings = txr; 862 na->num_tx_desc = txd; 863 na->num_rx_rings = rxr; 864 na->num_rx_desc = rxd; 865 return 0; 866 } 867 D("configuration changed while active, this is bad..."); 868 return 1; 869 } 870 871 static struct netmap_if * 872 netmap_if_new(const char *ifname, struct netmap_adapter *na) 873 { 874 if (netmap_update_config(na)) { 875 /* configuration mismatch, report and fail */ 876 return NULL; 877 } 878 return netmap_mem_if_new(ifname, na); 879 } 880 881 882 /* Structure associated to each thread which registered an interface. 883 * 884 * The first 4 fields of this structure are written by NIOCREGIF and 885 * read by poll() and NIOC?XSYNC. 886 * There is low contention among writers (actually, a correct user program 887 * should have no contention among writers) and among writers and readers, 888 * so we use a single global lock to protect the structure initialization. 889 * Since initialization involves the allocation of memory, we reuse the memory 890 * allocator lock. 891 * Read access to the structure is lock free. Readers must check that 892 * np_nifp is not NULL before using the other fields. 893 * If np_nifp is NULL initialization has not been performed, so they should 894 * return an error to userlevel. 895 * 896 * The ref_done field is used to regulate access to the refcount in the 897 * memory allocator. The refcount must be incremented at most once for 898 * each open("/dev/netmap"). The increment is performed by the first 899 * function that calls netmap_get_memory() (currently called by 900 * mmap(), NIOCGINFO and NIOCREGIF). 901 * If the refcount is incremented, it is then decremented when the 902 * private structure is destroyed. 903 */ 904 struct netmap_priv_d { 905 struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ 906 907 struct ifnet *np_ifp; /* device for which we hold a ref. */ 908 int np_ringid; /* from the ioctl */ 909 u_int np_qfirst, np_qlast; /* range of rings to scan */ 910 uint16_t np_txpoll; 911 912 struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ 913 #ifdef __FreeBSD__ 914 int np_refcount; /* use with NMG_LOCK held */ 915 #endif /* __FreeBSD__ */ 916 }; 917 918 /* grab a reference to the memory allocator, if we don't have one already. The 919 * reference is taken from the netmap_adapter registered with the priv. 
920 * 921 */ 922 static int 923 netmap_get_memory_locked(struct netmap_priv_d* p) 924 { 925 struct netmap_mem_d *nmd; 926 int error = 0; 927 928 if (p->np_ifp == NULL) { 929 if (!netmap_mmap_unreg) 930 return ENODEV; 931 /* for compatibility with older versions of the API 932 * we use the global allocator when no interface has been 933 * registered 934 */ 935 nmd = &nm_mem; 936 } else { 937 nmd = NA(p->np_ifp)->nm_mem; 938 } 939 if (p->np_mref == NULL) { 940 error = netmap_mem_finalize(nmd); 941 if (!error) 942 p->np_mref = nmd; 943 } else if (p->np_mref != nmd) { 944 /* a virtual port has been registered, but previous 945 * syscalls already used the global allocator. 946 * We cannot continue 947 */ 948 error = ENODEV; 949 } 950 return error; 951 } 952 953 static int 954 netmap_get_memory(struct netmap_priv_d* p) 955 { 956 int error; 957 NMG_LOCK(); 958 error = netmap_get_memory_locked(p); 959 NMG_UNLOCK(); 960 return error; 961 } 962 963 static int 964 netmap_have_memory_locked(struct netmap_priv_d* p) 965 { 966 return p->np_mref != NULL; 967 } 968 969 static void 970 netmap_drop_memory_locked(struct netmap_priv_d* p) 971 { 972 if (p->np_mref) { 973 netmap_mem_deref(p->np_mref); 974 p->np_mref = NULL; 975 } 976 } 977 978 /* 979 * File descriptor's private data destructor. 980 * 981 * Call nm_register(ifp,0) to stop netmap mode on the interface and 982 * revert to normal operation. We expect that np_ifp has not gone. 983 * The second argument is the nifp to work on. In some cases it is 984 * not attached yet to the netmap_priv_d so we need to pass it as 985 * a separate argument. 986 */ 987 /* call with NMG_LOCK held */ 988 static void 989 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) 990 { 991 struct ifnet *ifp = priv->np_ifp; 992 struct netmap_adapter *na = NA(ifp); 993 994 NMG_LOCK_ASSERT(); 995 na->refcount--; 996 if (na->refcount <= 0) { /* last instance */ 997 u_int i; 998 999 if (netmap_verbose) 1000 D("deleting last instance for %s", ifp->if_xname); 1001 /* 1002 * (TO CHECK) This function is only called 1003 * when the last reference to this file descriptor goes 1004 * away. This means we cannot have any pending poll() 1005 * or interrupt routine operating on the structure. 1006 * XXX The file may be closed in a thread while 1007 * another thread is using it. 1008 * Linux keeps the file opened until the last reference 1009 * by any outstanding ioctl/poll or mmap is gone. 1010 * FreeBSD does not track mmap()s (but we do) and 1011 * wakes up any sleeping poll(). Need to check what 1012 * happens if the close() occurs while a concurrent 1013 * syscall is running. 1014 */ 1015 na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ 1016 /* Wake up any sleeping threads. netmap_poll will 1017 * then return POLLERR 1018 * XXX The wake up now must happen during *_down(), when 1019 * we order all activities to stop. -gl 1020 */ 1021 nm_free_bdgfwd(na); 1022 for (i = 0; i < na->num_tx_rings + 1; i++) { 1023 mtx_destroy(&na->tx_rings[i].q_lock); 1024 } 1025 for (i = 0; i < na->num_rx_rings + 1; i++) { 1026 mtx_destroy(&na->rx_rings[i].q_lock); 1027 } 1028 /* XXX kqueue(9) needed; these will mirror knlist_init. */ 1029 /* knlist_destroy(&na->tx_si.si_note); */ 1030 /* knlist_destroy(&na->rx_si.si_note); */ 1031 if (nma_is_hw(na)) 1032 SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL; 1033 } 1034 /* 1035 * netmap_mem_if_delete() deletes the nifp, and if this is 1036 * the last instance also buffers, rings and krings. 
1037 */ 1038 netmap_mem_if_delete(na, nifp); 1039 } 1040 1041 1042 /* we assume netmap adapter exists 1043 * Called with NMG_LOCK held 1044 */ 1045 static void 1046 nm_if_rele(struct ifnet *ifp) 1047 { 1048 int i, is_hw, hw, sw, lim; 1049 struct nm_bridge *b; 1050 struct netmap_adapter *na; 1051 uint8_t tmp[NM_BDG_MAXPORTS]; 1052 1053 NMG_LOCK_ASSERT(); 1054 /* I can be called not only for get_ifp()-ed references where netmap's 1055 * capability is guaranteed, but also for non-netmap-capable NICs. 1056 */ 1057 if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) { 1058 if_rele(ifp); 1059 return; 1060 } 1061 na = NA(ifp); 1062 b = na->na_bdg; 1063 is_hw = nma_is_hw(na); 1064 1065 ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount); 1066 1067 if (!DROP_BDG_REF(ifp)) 1068 return; 1069 1070 /* 1071 New algorithm: 1072 make a copy of bdg_port_index; 1073 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port 1074 in the array of bdg_port_index, replacing them with 1075 entries from the bottom of the array; 1076 decrement bdg_active_ports; 1077 acquire BDG_WLOCK() and copy back the array. 1078 */ 1079 1080 hw = NA(ifp)->bdg_port; 1081 sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; 1082 lim = b->bdg_active_ports; 1083 1084 ND("detach %d and %d (lim %d)", hw, sw, lim); 1085 /* make a copy of the list of active ports, update it, 1086 * and then copy back within BDG_WLOCK(). 1087 */ 1088 memcpy(tmp, b->bdg_port_index, sizeof(tmp)); 1089 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { 1090 if (hw >= 0 && tmp[i] == hw) { 1091 ND("detach hw %d at %d", hw, i); 1092 lim--; /* point to last active port */ 1093 tmp[i] = tmp[lim]; /* swap with i */ 1094 tmp[lim] = hw; /* now this is inactive */ 1095 hw = -1; 1096 } else if (sw >= 0 && tmp[i] == sw) { 1097 ND("detach sw %d at %d", sw, i); 1098 lim--; 1099 tmp[i] = tmp[lim]; 1100 tmp[lim] = sw; 1101 sw = -1; 1102 } else { 1103 i++; 1104 } 1105 } 1106 if (hw >= 0 || sw >= 0) { 1107 D("XXX delete failed hw %d sw %d, should panic...", hw, sw); 1108 } 1109 hw = NA(ifp)->bdg_port; 1110 sw = (is_hw && SWNA(ifp)->na_bdg) ? 
SWNA(ifp)->bdg_port : -1; 1111 1112 BDG_WLOCK(b); 1113 b->bdg_ports[hw] = NULL; 1114 na->na_bdg = NULL; 1115 if (sw >= 0) { 1116 b->bdg_ports[sw] = NULL; 1117 SWNA(ifp)->na_bdg = NULL; 1118 } 1119 memcpy(b->bdg_port_index, tmp, sizeof(tmp)); 1120 b->bdg_active_ports = lim; 1121 BDG_WUNLOCK(b); 1122 1123 ND("now %d active ports", lim); 1124 if (lim == 0) { 1125 ND("marking bridge %s as free", b->bdg_basename); 1126 b->nm_bdg_lookup = NULL; 1127 } 1128 1129 if (is_hw) { 1130 if_rele(ifp); 1131 } else { 1132 if (na->na_flags & NAF_MEM_OWNER) 1133 netmap_mem_private_delete(na->nm_mem); 1134 bzero(na, sizeof(*na)); 1135 free(na, M_DEVBUF); 1136 bzero(ifp, sizeof(*ifp)); 1137 free(ifp, M_DEVBUF); 1138 } 1139 } 1140 1141 1142 /* 1143 * returns 1 if this is the last instance and we can free priv 1144 */ 1145 static int 1146 netmap_dtor_locked(struct netmap_priv_d *priv) 1147 { 1148 struct ifnet *ifp = priv->np_ifp; 1149 1150 #ifdef __FreeBSD__ 1151 /* 1152 * np_refcount is the number of active mmaps on 1153 * this file descriptor 1154 */ 1155 if (--priv->np_refcount > 0) { 1156 return 0; 1157 } 1158 #endif /* __FreeBSD__ */ 1159 if (ifp) { 1160 netmap_do_unregif(priv, priv->np_nifp); 1161 } 1162 netmap_drop_memory_locked(priv); 1163 if (ifp) { 1164 nm_if_rele(ifp); /* might also destroy *na */ 1165 } 1166 return 1; 1167 } 1168 1169 static void 1170 netmap_dtor(void *data) 1171 { 1172 struct netmap_priv_d *priv = data; 1173 int last_instance; 1174 1175 NMG_LOCK(); 1176 last_instance = netmap_dtor_locked(priv); 1177 NMG_UNLOCK(); 1178 if (last_instance) { 1179 bzero(priv, sizeof(*priv)); /* for safety */ 1180 free(priv, M_DEVBUF); 1181 } 1182 } 1183 1184 1185 #ifdef __FreeBSD__ 1186 1187 /* 1188 * In order to track whether pages are still mapped, we hook into 1189 * the standard cdev_pager and intercept the constructor and 1190 * destructor. 1191 */ 1192 1193 struct netmap_vm_handle_t { 1194 struct cdev *dev; 1195 struct netmap_priv_d *priv; 1196 }; 1197 1198 static int 1199 netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, 1200 vm_ooffset_t foff, struct ucred *cred, u_short *color) 1201 { 1202 struct netmap_vm_handle_t *vmh = handle; 1203 D("handle %p size %jd prot %d foff %jd", 1204 handle, (intmax_t)size, prot, (intmax_t)foff); 1205 dev_ref(vmh->dev); 1206 return 0; 1207 } 1208 1209 1210 static void 1211 netmap_dev_pager_dtor(void *handle) 1212 { 1213 struct netmap_vm_handle_t *vmh = handle; 1214 struct cdev *dev = vmh->dev; 1215 struct netmap_priv_d *priv = vmh->priv; 1216 D("handle %p", handle); 1217 netmap_dtor(priv); 1218 free(vmh, M_DEVBUF); 1219 dev_rel(dev); 1220 } 1221 1222 static int 1223 netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, 1224 int prot, vm_page_t *mres) 1225 { 1226 struct netmap_vm_handle_t *vmh = object->handle; 1227 struct netmap_priv_d *priv = vmh->priv; 1228 vm_paddr_t paddr; 1229 vm_page_t page; 1230 vm_memattr_t memattr; 1231 vm_pindex_t pidx; 1232 1233 ND("object %p offset %jd prot %d mres %p", 1234 object, (intmax_t)offset, prot, mres); 1235 memattr = object->memattr; 1236 pidx = OFF_TO_IDX(offset); 1237 paddr = netmap_mem_ofstophys(priv->np_mref, offset); 1238 if (paddr == 0) 1239 return VM_PAGER_FAIL; 1240 1241 if (((*mres)->flags & PG_FICTITIOUS) != 0) { 1242 /* 1243 * If the passed in result page is a fake page, update it with 1244 * the new physical address. 
1245 */ 1246 page = *mres; 1247 vm_page_updatefake(page, paddr, memattr); 1248 } else { 1249 /* 1250 * Replace the passed in reqpage page with our own fake page and 1251 * free up the all of the original pages. 1252 */ 1253 #ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ 1254 #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK 1255 #define VM_OBJECT_WLOCK VM_OBJECT_LOCK 1256 #endif /* VM_OBJECT_WUNLOCK */ 1257 1258 VM_OBJECT_WUNLOCK(object); 1259 page = vm_page_getfake(paddr, memattr); 1260 VM_OBJECT_WLOCK(object); 1261 vm_page_lock(*mres); 1262 vm_page_free(*mres); 1263 vm_page_unlock(*mres); 1264 *mres = page; 1265 vm_page_insert(page, object, pidx); 1266 } 1267 page->valid = VM_PAGE_BITS_ALL; 1268 return (VM_PAGER_OK); 1269 } 1270 1271 1272 static struct cdev_pager_ops netmap_cdev_pager_ops = { 1273 .cdev_pg_ctor = netmap_dev_pager_ctor, 1274 .cdev_pg_dtor = netmap_dev_pager_dtor, 1275 .cdev_pg_fault = netmap_dev_pager_fault, 1276 }; 1277 1278 1279 static int 1280 netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, 1281 vm_size_t objsize, vm_object_t *objp, int prot) 1282 { 1283 int error; 1284 struct netmap_vm_handle_t *vmh; 1285 struct netmap_priv_d *priv; 1286 vm_object_t obj; 1287 1288 D("cdev %p foff %jd size %jd objp %p prot %d", cdev, 1289 (intmax_t )*foff, (intmax_t )objsize, objp, prot); 1290 1291 vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, 1292 M_NOWAIT | M_ZERO); 1293 if (vmh == NULL) 1294 return ENOMEM; 1295 vmh->dev = cdev; 1296 1297 NMG_LOCK(); 1298 error = devfs_get_cdevpriv((void**)&priv); 1299 if (error) 1300 goto err_unlock; 1301 vmh->priv = priv; 1302 priv->np_refcount++; 1303 NMG_UNLOCK(); 1304 1305 error = netmap_get_memory(priv); 1306 if (error) 1307 goto err_deref; 1308 1309 obj = cdev_pager_allocate(vmh, OBJT_DEVICE, 1310 &netmap_cdev_pager_ops, objsize, prot, 1311 *foff, NULL); 1312 if (obj == NULL) { 1313 D("cdev_pager_allocate failed"); 1314 error = EINVAL; 1315 goto err_deref; 1316 } 1317 1318 *objp = obj; 1319 return 0; 1320 1321 err_deref: 1322 NMG_LOCK(); 1323 priv->np_refcount--; 1324 err_unlock: 1325 NMG_UNLOCK(); 1326 // err: 1327 free(vmh, M_DEVBUF); 1328 return error; 1329 } 1330 1331 1332 // XXX can we remove this ? 1333 static int 1334 netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) 1335 { 1336 if (netmap_verbose) 1337 D("dev %p fflag 0x%x devtype %d td %p", 1338 dev, fflag, devtype, td); 1339 return 0; 1340 } 1341 1342 1343 static int 1344 netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) 1345 { 1346 struct netmap_priv_d *priv; 1347 int error; 1348 1349 (void)dev; 1350 (void)oflags; 1351 (void)devtype; 1352 (void)td; 1353 1354 // XXX wait or nowait ? 1355 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, 1356 M_NOWAIT | M_ZERO); 1357 if (priv == NULL) 1358 return ENOMEM; 1359 1360 error = devfs_set_cdevpriv(priv, netmap_dtor); 1361 if (error) 1362 return error; 1363 1364 priv->np_refcount = 1; 1365 1366 return 0; 1367 } 1368 #endif /* __FreeBSD__ */ 1369 1370 1371 /* 1372 * Handlers for synchronization of the queues from/to the host. 1373 * Netmap has two operating modes: 1374 * - in the default mode, the rings connected to the host stack are 1375 * just another ring pair managed by userspace; 1376 * - in transparent mode (XXX to be defined) incoming packets 1377 * (from the host or the NIC) are marked as NS_FORWARD upon 1378 * arrival, and the user application has a chance to reset the 1379 * flag for packets that should be dropped. 
1380 * On the RXSYNC or poll(), packets in RX rings between 1381 * kring->nr_kcur and ring->cur with NS_FORWARD still set are moved 1382 * to the other side. 1383 * The transfer NIC --> host is relatively easy, just encapsulate 1384 * into mbufs and we are done. The host --> NIC side is slightly 1385 * harder because there might not be room in the tx ring so it 1386 * might take a while before releasing the buffer. 1387 */ 1388 1389 1390 /* 1391 * pass a chain of buffers to the host stack as coming from 'dst' 1392 */ 1393 static void 1394 netmap_send_up(struct ifnet *dst, struct mbuf *head) 1395 { 1396 struct mbuf *m; 1397 1398 /* send packets up, outside the lock */ 1399 while ((m = head) != NULL) { 1400 head = head->m_nextpkt; 1401 m->m_nextpkt = NULL; 1402 if (netmap_verbose & NM_VERB_HOST) 1403 D("sending up pkt %p size %d", m, MBUF_LEN(m)); 1404 NM_SEND_UP(dst, m); 1405 } 1406 } 1407 1408 struct mbq { 1409 struct mbuf *head; 1410 struct mbuf *tail; 1411 int count; 1412 }; 1413 1414 1415 /* 1416 * put a copy of the buffers marked NS_FORWARD into an mbuf chain. 1417 * Run from hwcur to cur - reserved 1418 */ 1419 static void 1420 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) 1421 { 1422 /* Take packets from hwcur to cur-reserved and pass them up. 1423 * In case of no buffers we give up. At the end of the loop, 1424 * the queue is drained in all cases. 1425 * XXX handle reserved 1426 */ 1427 u_int lim = kring->nkr_num_slots - 1; 1428 struct mbuf *m, *tail = q->tail; 1429 u_int k = kring->ring->cur, n = kring->ring->reserved; 1430 struct netmap_mem_d *nmd = kring->na->nm_mem; 1431 1432 /* compute the final position, ring->cur - ring->reserved */ 1433 if (n > 0) { 1434 if (k < n) 1435 k += kring->nkr_num_slots; 1436 k += n; 1437 } 1438 for (n = kring->nr_hwcur; n != k;) { 1439 struct netmap_slot *slot = &kring->ring->slot[n]; 1440 1441 n = nm_next(n, lim); 1442 if ((slot->flags & NS_FORWARD) == 0 && !force) 1443 continue; 1444 if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) { 1445 D("bad pkt at %d len %d", n, slot->len); 1446 continue; 1447 } 1448 slot->flags &= ~NS_FORWARD; // XXX needed ? 1449 /* XXX adapt to the case of a multisegment packet */ 1450 m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL); 1451 1452 if (m == NULL) 1453 break; 1454 if (tail) 1455 tail->m_nextpkt = m; 1456 else 1457 q->head = m; 1458 tail = m; 1459 q->count++; 1460 m->m_nextpkt = NULL; 1461 } 1462 q->tail = tail; 1463 } 1464 1465 1466 /* 1467 * The host ring has packets from nr_hwcur to (cur - reserved) 1468 * to be sent down to the NIC. 1469 * We need to use the queue lock on the source (host RX ring) 1470 * to protect against netmap_transmit. 1471 * If the user is well behaved we do not need to acquire locks 1472 * on the destination(s), 1473 * so we only need to make sure that there are no panics because 1474 * of user errors. 1475 * XXX verify 1476 * 1477 * We scan the tx rings, which have just been 1478 * flushed so nr_hwcur == cur. Pushing packets down means 1479 * increment cur and decrement avail. 
1480 * XXX to be verified 1481 */ 1482 static void 1483 netmap_sw_to_nic(struct netmap_adapter *na) 1484 { 1485 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1486 struct netmap_kring *k1 = &na->tx_rings[0]; 1487 u_int i, howmany, src_lim, dst_lim; 1488 1489 /* XXX we should also check that the carrier is on */ 1490 if (kring->nkr_stopped) 1491 return; 1492 1493 mtx_lock(&kring->q_lock); 1494 1495 if (kring->nkr_stopped) 1496 goto out; 1497 1498 howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ 1499 1500 src_lim = kring->nkr_num_slots - 1; 1501 for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { 1502 ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); 1503 dst_lim = k1->nkr_num_slots - 1; 1504 while (howmany > 0 && k1->ring->avail > 0) { 1505 struct netmap_slot *src, *dst, tmp; 1506 src = &kring->ring->slot[kring->nr_hwcur]; 1507 dst = &k1->ring->slot[k1->ring->cur]; 1508 tmp = *src; 1509 src->buf_idx = dst->buf_idx; 1510 src->flags = NS_BUF_CHANGED; 1511 1512 dst->buf_idx = tmp.buf_idx; 1513 dst->len = tmp.len; 1514 dst->flags = NS_BUF_CHANGED; 1515 ND("out len %d buf %d from %d to %d", 1516 dst->len, dst->buf_idx, 1517 kring->nr_hwcur, k1->ring->cur); 1518 1519 kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); 1520 howmany--; 1521 kring->nr_hwavail--; 1522 k1->ring->cur = nm_next(k1->ring->cur, dst_lim); 1523 k1->ring->avail--; 1524 } 1525 kring->ring->cur = kring->nr_hwcur; // XXX 1526 k1++; // XXX why? 1527 } 1528 out: 1529 mtx_unlock(&kring->q_lock); 1530 } 1531 1532 1533 /* 1534 * netmap_txsync_to_host() passes packets up. We are called from a 1535 * system call in user process context, and the only contention 1536 * can be among multiple user threads erroneously calling 1537 * this routine concurrently. 1538 */ 1539 static void 1540 netmap_txsync_to_host(struct netmap_adapter *na) 1541 { 1542 struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; 1543 struct netmap_ring *ring = kring->ring; 1544 u_int k, lim = kring->nkr_num_slots - 1; 1545 struct mbq q = { NULL, NULL, 0 }; 1546 1547 if (nm_kr_tryget(kring)) { 1548 D("ring %p busy (user error)", kring); 1549 return; 1550 } 1551 k = ring->cur; 1552 if (k > lim) { 1553 D("invalid ring index in stack TX kring %p", kring); 1554 netmap_ring_reinit(kring); 1555 nm_kr_put(kring); 1556 return; 1557 } 1558 1559 /* Take packets from hwcur to cur and pass them up. 1560 * In case of no buffers we give up. At the end of the loop, 1561 * the queue is drained in all cases. 1562 */ 1563 netmap_grab_packets(kring, &q, 1); 1564 kring->nr_hwcur = k; 1565 kring->nr_hwavail = ring->avail = lim; 1566 1567 nm_kr_put(kring); 1568 netmap_send_up(na->ifp, q.head); 1569 } 1570 1571 1572 /* 1573 * This is the 'txsync' handler to send from a software ring to the 1574 * host stack. 1575 */ 1576 /* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */ 1577 static int 1578 netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags) 1579 { 1580 (void)ring_nr; 1581 (void)flags; 1582 if (netmap_verbose > 255) 1583 RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr); 1584 netmap_txsync_to_host(NA(ifp)); 1585 return 0; 1586 } 1587 1588 1589 /* 1590 * rxsync backend for packets coming from the host stack. 1591 * They have been put in the queue by netmap_transmit() so we 1592 * need to protect access to the kring using a lock. 1593 * 1594 * This routine also does the selrecord if called from the poll handler 1595 * (we know because td != NULL). 
1596 * 1597 * NOTE: on linux, selrecord() is defined as a macro and uses pwait 1598 * as an additional hidden argument. 1599 */ 1600 static void 1601 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) 1602 { 1603 struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; 1604 struct netmap_ring *ring = kring->ring; 1605 u_int j, n, lim = kring->nkr_num_slots; 1606 u_int k = ring->cur, resvd = ring->reserved; 1607 1608 (void)pwait; /* disable unused warnings */ 1609 1610 if (kring->nkr_stopped) /* check a first time without lock */ 1611 return; 1612 1613 /* XXX as an optimization we could reuse na->core_lock */ 1614 mtx_lock(&kring->q_lock); 1615 1616 if (kring->nkr_stopped) /* check again with lock held */ 1617 goto unlock_out; 1618 1619 if (k >= lim) { 1620 netmap_ring_reinit(kring); 1621 goto unlock_out; 1622 } 1623 /* new packets are already set in nr_hwavail */ 1624 /* skip past packets that userspace has released */ 1625 j = kring->nr_hwcur; 1626 if (resvd > 0) { 1627 if (resvd + ring->avail >= lim + 1) { 1628 D("XXX invalid reserve/avail %d %d", resvd, ring->avail); 1629 ring->reserved = resvd = 0; // XXX panic... 1630 } 1631 k = (k >= resvd) ? k - resvd : k + lim - resvd; 1632 } 1633 if (j != k) { 1634 n = k >= j ? k - j : k + lim - j; 1635 kring->nr_hwavail -= n; 1636 kring->nr_hwcur = k; 1637 } 1638 k = ring->avail = kring->nr_hwavail - resvd; 1639 if (k == 0 && td) 1640 selrecord(td, &kring->si); 1641 if (k && (netmap_verbose & NM_VERB_HOST)) 1642 D("%d pkts from stack", k); 1643 unlock_out: 1644 1645 mtx_unlock(&kring->q_lock); 1646 } 1647 1648 1649 /* 1650 * MUST BE CALLED UNDER NMG_LOCK() 1651 * 1652 * get a refcounted reference to an interface. 1653 * This is always called in the execution of an ioctl(). 1654 * 1655 * Return ENXIO if the interface does not exist, EINVAL if netmap 1656 * is not supported by the interface. 1657 * If successful, hold a reference. 1658 * 1659 * When the NIC is attached to a bridge, reference is managed 1660 * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as 1661 * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC 1662 * is detached from the bridge, then ifp's refcount is dropped (this 1663 * is equivalent to that ifp is destroyed in case of virtual ports. 1664 * 1665 * This function uses if_rele() when we want to prevent the NIC from 1666 * being detached from the bridge in error handling. But once refcount 1667 * is acquired by this function, it must be released using nm_if_rele(). 1668 */ 1669 static int 1670 get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create) 1671 { 1672 const char *name = nmr->nr_name; 1673 int namelen = strlen(name); 1674 struct ifnet *iter = NULL; 1675 int no_prefix = 0; 1676 1677 /* first try to see if this is a bridge port. */ 1678 struct nm_bridge *b; 1679 struct netmap_adapter *na; 1680 int i, j, cand = -1, cand2 = -1; 1681 int needed; 1682 1683 NMG_LOCK_ASSERT(); 1684 *ifp = NULL; /* default */ 1685 if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { 1686 no_prefix = 1; /* no VALE prefix */ 1687 goto no_bridge_port; 1688 } 1689 1690 b = nm_find_bridge(name, create); 1691 if (b == NULL) { 1692 D("no bridges available for '%s'", name); 1693 return (ENXIO); 1694 } 1695 1696 /* Now we are sure that name starts with the bridge's name, 1697 * lookup the port in the bridge. We need to scan the entire 1698 * list. 
It is not important to hold a WLOCK on the bridge 1699 * during the search because NMG_LOCK already guarantees 1700 * that there are no other possible writers. 1701 */ 1702 1703 /* lookup in the local list of ports */ 1704 for (j = 0; j < b->bdg_active_ports; j++) { 1705 i = b->bdg_port_index[j]; 1706 na = b->bdg_ports[i]; 1707 // KASSERT(na != NULL); 1708 iter = na->ifp; 1709 /* XXX make sure the name only contains one : */ 1710 if (!strcmp(iter->if_xname, name) /* virtual port */ || 1711 (namelen > b->bdg_namelen && !strcmp(iter->if_xname, 1712 name + b->bdg_namelen + 1)) /* NIC */) { 1713 ADD_BDG_REF(iter); 1714 ND("found existing if %s refs %d", name, 1715 NA(iter)->na_bdg_refcount); 1716 *ifp = iter; 1717 /* we are done, this is surely netmap capable */ 1718 return 0; 1719 } 1720 } 1721 /* not found, should we create it? */ 1722 if (!create) 1723 return ENXIO; 1724 /* yes we should, see if we have space to attach entries */ 1725 needed = 2; /* in some cases we only need 1 */ 1726 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { 1727 D("bridge full %d, cannot create new port", b->bdg_active_ports); 1728 return EINVAL; 1729 } 1730 /* record the next two ports available, but do not allocate yet */ 1731 cand = b->bdg_port_index[b->bdg_active_ports]; 1732 cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; 1733 ND("+++ bridge %s port %s used %d avail %d %d", 1734 b->bdg_basename, name, b->bdg_active_ports, cand, cand2); 1735 1736 /* 1737 * try see if there is a matching NIC with this name 1738 * (after the bridge's name) 1739 */ 1740 iter = ifunit_ref(name + b->bdg_namelen + 1); 1741 if (!iter) { /* this is a virtual port */ 1742 /* Create a temporary NA with arguments, then 1743 * bdg_netmap_attach() will allocate the real one 1744 * and attach it to the ifp 1745 */ 1746 struct netmap_adapter tmp_na; 1747 int error; 1748 1749 if (nmr->nr_cmd) { 1750 /* nr_cmd must be 0 for a virtual port */ 1751 return EINVAL; 1752 } 1753 bzero(&tmp_na, sizeof(tmp_na)); 1754 /* bound checking */ 1755 tmp_na.num_tx_rings = nmr->nr_tx_rings; 1756 nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1757 nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back 1758 tmp_na.num_rx_rings = nmr->nr_rx_rings; 1759 nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1760 nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back 1761 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1762 1, NM_BDG_MAXSLOTS, NULL); 1763 tmp_na.num_tx_desc = nmr->nr_tx_slots; 1764 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1765 1, NM_BDG_MAXSLOTS, NULL); 1766 tmp_na.num_rx_desc = nmr->nr_rx_slots; 1767 1768 /* create a struct ifnet for the new port. 
1769 * need M_NOWAIT as we are under nma_lock 1770 */ 1771 iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); 1772 if (!iter) 1773 return ENOMEM; 1774 1775 strcpy(iter->if_xname, name); 1776 tmp_na.ifp = iter; 1777 /* bdg_netmap_attach creates a struct netmap_adapter */ 1778 error = bdg_netmap_attach(&tmp_na); 1779 if (error) { 1780 D("error %d", error); 1781 free(iter, M_DEVBUF); 1782 return error; 1783 } 1784 cand2 = -1; /* only need one port */ 1785 } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ 1786 /* make sure the NIC is not already in use */ 1787 if (NETMAP_OWNED_BY_ANY(iter)) { 1788 D("NIC %s busy, cannot attach to bridge", 1789 iter->if_xname); 1790 if_rele(iter); /* don't detach from bridge */ 1791 return EINVAL; 1792 } 1793 if (nmr->nr_arg1 != NETMAP_BDG_HOST) 1794 cand2 = -1; /* only need one port */ 1795 } else { /* not a netmap-capable NIC */ 1796 if_rele(iter); /* don't detach from bridge */ 1797 return EINVAL; 1798 } 1799 na = NA(iter); 1800 1801 BDG_WLOCK(b); 1802 na->bdg_port = cand; 1803 ND("NIC %p to bridge port %d", NA(iter), cand); 1804 /* bind the port to the bridge (virtual ports are not active) */ 1805 b->bdg_ports[cand] = na; 1806 na->na_bdg = b; 1807 b->bdg_active_ports++; 1808 if (cand2 >= 0) { 1809 /* also bind the host stack to the bridge */ 1810 b->bdg_ports[cand2] = SWNA(iter); 1811 SWNA(iter)->bdg_port = cand2; 1812 SWNA(iter)->na_bdg = b; 1813 b->bdg_active_ports++; 1814 ND("host %p to bridge port %d", SWNA(iter), cand2); 1815 } 1816 ADD_BDG_REF(iter); // XXX one or two ? 1817 ND("if %s refs %d", name, NA(iter)->na_bdg_refcount); 1818 BDG_WUNLOCK(b); 1819 *ifp = iter; 1820 return 0; 1821 1822 no_bridge_port: 1823 *ifp = iter; 1824 if (! *ifp) 1825 *ifp = ifunit_ref(name); 1826 if (*ifp == NULL) 1827 return (ENXIO); 1828 1829 if (NETMAP_CAPABLE(*ifp)) { 1830 /* Users cannot use the NIC attached to a bridge directly */ 1831 if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) { 1832 if_rele(*ifp); /* don't detach from bridge */ 1833 return EINVAL; 1834 } else 1835 return 0; /* valid pointer, we hold the refcount */ 1836 } 1837 nm_if_rele(*ifp); 1838 return EINVAL; // not NETMAP capable 1839 } 1840 1841 1842 /* 1843 * Error routine called when txsync/rxsync detects an error. 1844 * Can't do much more than resetting cur = hwcur, avail = hwavail. 1845 * Return 1 on reinit. 1846 * 1847 * This routine is only called by the upper half of the kernel. 1848 * It only reads hwcur (which is changed only by the upper half, too) 1849 * and hwavail (which may be changed by the lower half, but only on 1850 * a tx ring and only to increase it, so any error will be recovered 1851 * on the next call). For the above, we don't strictly need to call 1852 * it under lock. 
1853 */ 1854 int 1855 netmap_ring_reinit(struct netmap_kring *kring) 1856 { 1857 struct netmap_ring *ring = kring->ring; 1858 u_int i, lim = kring->nkr_num_slots - 1; 1859 int errors = 0; 1860 1861 // XXX KASSERT nm_kr_tryget 1862 RD(10, "called for %s", kring->na->ifp->if_xname); 1863 if (ring->cur > lim) 1864 errors++; 1865 for (i = 0; i <= lim; i++) { 1866 u_int idx = ring->slot[i].buf_idx; 1867 u_int len = ring->slot[i].len; 1868 if (idx < 2 || idx >= netmap_total_buffers) { 1869 if (!errors++) 1870 D("bad buffer at slot %d idx %d len %d ", i, idx, len); 1871 ring->slot[i].buf_idx = 0; 1872 ring->slot[i].len = 0; 1873 } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { 1874 ring->slot[i].len = 0; 1875 if (!errors++) 1876 D("bad len %d at slot %d idx %d", 1877 len, i, idx); 1878 } 1879 } 1880 if (errors) { 1881 int pos = kring - kring->na->tx_rings; 1882 int n = kring->na->num_tx_rings + 1; 1883 1884 RD(10, "total %d errors", errors); 1885 errors++; 1886 RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", 1887 kring->na->ifp->if_xname, 1888 pos < n ? "TX" : "RX", pos < n ? pos : pos - n, 1889 ring->cur, kring->nr_hwcur, 1890 ring->avail, kring->nr_hwavail); 1891 ring->cur = kring->nr_hwcur; 1892 ring->avail = kring->nr_hwavail; 1893 } 1894 return (errors ? 1 : 0); 1895 } 1896 1897 1898 /* 1899 * Set the ring ID. For devices with a single queue, a request 1900 * for all rings is the same as a single ring. 1901 */ 1902 static int 1903 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) 1904 { 1905 struct ifnet *ifp = priv->np_ifp; 1906 struct netmap_adapter *na = NA(ifp); 1907 u_int i = ringid & NETMAP_RING_MASK; 1908 /* initially (np_qfirst == np_qlast) we don't want to lock */ 1909 u_int lim = na->num_rx_rings; 1910 1911 if (na->num_tx_rings > lim) 1912 lim = na->num_tx_rings; 1913 if ( (ringid & NETMAP_HW_RING) && i >= lim) { 1914 D("invalid ring id %d", i); 1915 return (EINVAL); 1916 } 1917 priv->np_ringid = ringid; 1918 if (ringid & NETMAP_SW_RING) { 1919 priv->np_qfirst = NETMAP_SW_RING; 1920 priv->np_qlast = 0; 1921 } else if (ringid & NETMAP_HW_RING) { 1922 priv->np_qfirst = i; 1923 priv->np_qlast = i + 1; 1924 } else { 1925 priv->np_qfirst = 0; 1926 priv->np_qlast = NETMAP_HW_RING ; 1927 } 1928 priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; 1929 if (netmap_verbose) { 1930 if (ringid & NETMAP_SW_RING) 1931 D("ringid %s set to SW RING", ifp->if_xname); 1932 else if (ringid & NETMAP_HW_RING) 1933 D("ringid %s set to HW RING %d", ifp->if_xname, 1934 priv->np_qfirst); 1935 else 1936 D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); 1937 } 1938 return 0; 1939 } 1940 1941 1942 /* 1943 * possibly move the interface to netmap-mode. 1944 * If success it returns a pointer to netmap_if, otherwise NULL. 1945 * This must be called with NMG_LOCK held. 
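*
* Caller pattern (sketch only, trimmed from the NIOCREGIF handler and
* nm_bdg_attach() below; real callers add more error handling):
*
*	NMG_LOCK();
*	error = get_ifp(nmr, &ifp, 1);	// 1 == create if needed
*	if (error == 0) {
*		nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error);
*		if (nifp == NULL)		// registration failed
*			nm_if_rele(ifp);	// return the reference
*	}
*	NMG_UNLOCK();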
1946 */ 1947 static struct netmap_if * 1948 netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, 1949 uint16_t ringid, int *err) 1950 { 1951 struct netmap_adapter *na = NA(ifp); 1952 struct netmap_if *nifp = NULL; 1953 int error, need_mem; 1954 1955 NMG_LOCK_ASSERT(); 1956 /* ring configuration may have changed, fetch from the card */ 1957 netmap_update_config(na); 1958 priv->np_ifp = ifp; /* store the reference */ 1959 error = netmap_set_ringid(priv, ringid); 1960 if (error) 1961 goto out; 1962 /* ensure allocators are ready */ 1963 need_mem = !netmap_have_memory_locked(priv); 1964 if (need_mem) { 1965 error = netmap_get_memory_locked(priv); 1966 ND("get_memory returned %d", error); 1967 if (error) 1968 goto out; 1969 } 1970 nifp = netmap_if_new(ifp->if_xname, na); 1971 if (nifp == NULL) { /* allocation failed */ 1972 /* we should drop the allocator, but only 1973 * if we were the ones who grabbed it 1974 */ 1975 if (need_mem) 1976 netmap_drop_memory_locked(priv); 1977 error = ENOMEM; 1978 goto out; 1979 } 1980 na->refcount++; 1981 if (ifp->if_capenable & IFCAP_NETMAP) { 1982 /* was already set */ 1983 } else { 1984 u_int i; 1985 /* Otherwise set the card in netmap mode 1986 * and make it use the shared buffers. 1987 * 1988 * If the interface is attached to a bridge, lock it. 1989 */ 1990 if (NETMAP_OWNED_BY_KERN(ifp)) 1991 BDG_WLOCK(NA(ifp)->na_bdg); 1992 for (i = 0 ; i < na->num_tx_rings + 1; i++) 1993 mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", 1994 NULL, MTX_DEF); 1995 for (i = 0 ; i < na->num_rx_rings + 1; i++) { 1996 mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", 1997 NULL, MTX_DEF); 1998 } 1999 if (nma_is_hw(na)) { 2000 SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings]; 2001 SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings]; 2002 } 2003 /* 2004 * do not core lock because the race is harmless here, 2005 * there cannot be any traffic to netmap_transmit() 2006 */ 2007 error = na->nm_register(ifp, 1); /* mode on */ 2008 // XXX do we need to nm_alloc_bdgfwd() in all cases ? 2009 if (!error) 2010 error = nm_alloc_bdgfwd(na); 2011 if (error) { 2012 netmap_do_unregif(priv, nifp); 2013 nifp = NULL; 2014 } 2015 if (NETMAP_OWNED_BY_KERN(ifp)) 2016 BDG_WUNLOCK(NA(ifp)->na_bdg); 2017 2018 } 2019 out: 2020 *err = error; 2021 if (nifp != NULL) { 2022 /* 2023 * advertise that the interface is ready bt setting ni_nifp. 2024 * The barrier is needed because readers (poll and *SYNC) 2025 * check for priv->np_nifp != NULL without locking 2026 */ 2027 wmb(); /* make sure previous writes are visible to all CPUs */ 2028 priv->np_nifp = nifp; 2029 } 2030 return nifp; 2031 } 2032 2033 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ 2034 static int 2035 nm_bdg_attach(struct nmreq *nmr) 2036 { 2037 struct ifnet *ifp; 2038 struct netmap_if *nifp; 2039 struct netmap_priv_d *npriv; 2040 int error; 2041 2042 npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); 2043 if (npriv == NULL) 2044 return ENOMEM; 2045 NMG_LOCK(); 2046 error = get_ifp(nmr, &ifp, 1 /* create if not exists */); 2047 if (error) /* no device, or another bridge or user owns the device */ 2048 goto unlock_exit; 2049 /* get_ifp() sets na_bdg if this is a physical interface 2050 * that we can attach to a switch. 2051 */ 2052 if (!NETMAP_OWNED_BY_KERN(ifp)) { 2053 /* got reference to a virtual port or direct access to a NIC. 
* perhaps no bridge prefix was specified, or a wrong NIC name
2055 */
2056 error = EINVAL;
2057 goto unref_exit;
2058 }
2059
2060 if (NA(ifp)->refcount > 0) { /* already registered */
2061 error = EBUSY;
2062 DROP_BDG_REF(ifp);
2063 goto unlock_exit;
2064 }
2065
2066 nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error);
2067 if (!nifp) {
2068 goto unref_exit;
2069 }
2070
2071 NA(ifp)->na_kpriv = npriv;
2072 NMG_UNLOCK();
2073 ND("registered %s to netmap-mode", ifp->if_xname);
2074 return 0;
2075
2076 unref_exit:
2077 nm_if_rele(ifp);
2078 unlock_exit:
2079 NMG_UNLOCK();
2080 bzero(npriv, sizeof(*npriv));
2081 free(npriv, M_DEVBUF);
2082 return error;
2083 }
2084
2085 static int
2086 nm_bdg_detach(struct nmreq *nmr)
2087 {
2088 struct ifnet *ifp;
2089 int error;
2090 int last_instance;
2091
2092 NMG_LOCK();
2093 error = get_ifp(nmr, &ifp, 0 /* don't create */);
2094 if (error) { /* no device, or another bridge or user owns the device */
2095 goto unlock_exit;
2096 }
2097 /* XXX do we need to check this ? */
2098 if (!NETMAP_OWNED_BY_KERN(ifp)) {
2099 /* got reference to a virtual port or direct access to a NIC.
2100 * perhaps no bridge prefix was specified, or a wrong NIC name
2101 */
2102 error = EINVAL;
2103 goto unref_exit;
2104 }
2105
2106 if (NA(ifp)->refcount == 0) { /* not registered */
2107 error = EINVAL;
2108 goto unref_exit;
2109 }
2110
2111 DROP_BDG_REF(ifp); /* the one from get_ifp */
2112 last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */
2113 NMG_UNLOCK();
2114 if (!last_instance) {
2115 D("--- error, trying to detach an entry with active mmaps");
2116 error = EINVAL;
2117 } else {
2118 struct netmap_priv_d *npriv = NA(ifp)->na_kpriv;
2119 NA(ifp)->na_kpriv = NULL;
2120
2121 bzero(npriv, sizeof(*npriv));
2122 free(npriv, M_DEVBUF);
2123 }
2124 return error;
2125
2126 unref_exit:
2127 nm_if_rele(ifp);
2128 unlock_exit:
2129 NMG_UNLOCK();
2130 return error;
2131 }
2132
2133
2134 /* Initialize the necessary fields of the sw adapter, which sits right
2135 * after the hw adapter. The sw adapter provides the pair of sw rings of a
2136 * netmap-mode NIC. It is always activated and deactivated at the same time
2137 * as the hw adapter, so we do not need refcounting on the sw adapter.
2138 * Regardless of the NIC's features we use a separate lock, so that anybody
2139 * can lock the sw adapter independently from the hw one.
2140 * nm_register is left NULL so that nma_is_hw() returns false for this adapter.
2141 */
2142 static void
2143 netmap_attach_sw(struct ifnet *ifp)
2144 {
2145 struct netmap_adapter *hw_na = NA(ifp);
2146 struct netmap_adapter *na = SWNA(ifp);
2147
2148 na->ifp = ifp;
2149 na->num_rx_rings = na->num_tx_rings = 1;
2150 na->num_tx_desc = hw_na->num_tx_desc;
2151 na->num_rx_desc = hw_na->num_rx_desc;
2152 na->nm_txsync = netmap_bdg_to_host;
2153 /* we use the same memory allocator as
2154 * the hw adapter */
2155 na->nm_mem = hw_na->nm_mem;
2156 }
2157
2158
2159 /* exported to kernel callers, e.g. OVS ?
2160 * Entry point.
2161 * Called without NMG_LOCK.
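*
* A hedged sketch of a kernel caller registering its own lookup
* function on a bridge ("valeX:" assumes the usual NM_NAME prefix;
* my_lookup_fn is a hypothetical bdg_lookup_fn_t supplied by the caller):
*
*	struct nmreq nmr;
*
*	bzero(&nmr, sizeof(nmr));
*	strncpy(nmr.nr_name, "valeX:", sizeof(nmr.nr_name));
*	nmr.nr_cmd = NETMAP_BDG_LOOKUP_REG;
*	error = netmap_bdg_ctl(&nmr, my_lookup_fn);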
2162 */ 2163 int 2164 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) 2165 { 2166 struct nm_bridge *b; 2167 struct netmap_adapter *na; 2168 struct ifnet *iter; 2169 char *name = nmr->nr_name; 2170 int cmd = nmr->nr_cmd, namelen = strlen(name); 2171 int error = 0, i, j; 2172 2173 switch (cmd) { 2174 case NETMAP_BDG_ATTACH: 2175 error = nm_bdg_attach(nmr); 2176 break; 2177 2178 case NETMAP_BDG_DETACH: 2179 error = nm_bdg_detach(nmr); 2180 break; 2181 2182 case NETMAP_BDG_LIST: 2183 /* this is used to enumerate bridges and ports */ 2184 if (namelen) { /* look up indexes of bridge and port */ 2185 if (strncmp(name, NM_NAME, strlen(NM_NAME))) { 2186 error = EINVAL; 2187 break; 2188 } 2189 NMG_LOCK(); 2190 b = nm_find_bridge(name, 0 /* don't create */); 2191 if (!b) { 2192 error = ENOENT; 2193 NMG_UNLOCK(); 2194 break; 2195 } 2196 2197 error = ENOENT; 2198 for (j = 0; j < b->bdg_active_ports; j++) { 2199 i = b->bdg_port_index[j]; 2200 na = b->bdg_ports[i]; 2201 if (na == NULL) { 2202 D("---AAAAAAAAARGH-------"); 2203 continue; 2204 } 2205 iter = na->ifp; 2206 /* the former and the latter identify a 2207 * virtual port and a NIC, respectively 2208 */ 2209 if (!strcmp(iter->if_xname, name) || 2210 (namelen > b->bdg_namelen && 2211 !strcmp(iter->if_xname, 2212 name + b->bdg_namelen + 1))) { 2213 /* bridge index */ 2214 nmr->nr_arg1 = b - nm_bridges; 2215 nmr->nr_arg2 = i; /* port index */ 2216 error = 0; 2217 break; 2218 } 2219 } 2220 NMG_UNLOCK(); 2221 } else { 2222 /* return the first non-empty entry starting from 2223 * bridge nr_arg1 and port nr_arg2. 2224 * 2225 * Users can detect the end of the same bridge by 2226 * seeing the new and old value of nr_arg1, and can 2227 * detect the end of all the bridge by error != 0 2228 */ 2229 i = nmr->nr_arg1; 2230 j = nmr->nr_arg2; 2231 2232 NMG_LOCK(); 2233 for (error = ENOENT; i < NM_BRIDGES; i++) { 2234 b = nm_bridges + i; 2235 if (j >= b->bdg_active_ports) { 2236 j = 0; /* following bridges scan from 0 */ 2237 continue; 2238 } 2239 nmr->nr_arg1 = i; 2240 nmr->nr_arg2 = j; 2241 j = b->bdg_port_index[j]; 2242 na = b->bdg_ports[j]; 2243 iter = na->ifp; 2244 strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); 2245 error = 0; 2246 break; 2247 } 2248 NMG_UNLOCK(); 2249 } 2250 break; 2251 2252 case NETMAP_BDG_LOOKUP_REG: 2253 /* register a lookup function to the given bridge. 2254 * nmr->nr_name may be just bridge's name (including ':' 2255 * if it is not just NM_NAME). 2256 */ 2257 if (!func) { 2258 error = EINVAL; 2259 break; 2260 } 2261 NMG_LOCK(); 2262 b = nm_find_bridge(name, 0 /* don't create */); 2263 if (!b) { 2264 error = EINVAL; 2265 } else { 2266 b->nm_bdg_lookup = func; 2267 } 2268 NMG_UNLOCK(); 2269 break; 2270 2271 default: 2272 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); 2273 error = EINVAL; 2274 break; 2275 } 2276 return error; 2277 } 2278 2279 2280 /* 2281 * ioctl(2) support for the "netmap" device. 2282 * 2283 * Following a list of accepted commands: 2284 * - NIOCGINFO 2285 * - SIOCGIFADDR just for convenience 2286 * - NIOCREGIF 2287 * - NIOCUNREGIF 2288 * - NIOCTXSYNC 2289 * - NIOCRXSYNC 2290 * 2291 * Return 0 on success, errno otherwise. 
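*
* For reference, a minimal userspace sequence is roughly the following
* (sketch, error handling omitted; "em0" is just an example name):
*
*	struct nmreq req;
*	void *mem;
*	int fd = open("/dev/netmap", O_RDWR);
*
*	bzero(&req, sizeof(req));
*	req.nr_version = NETMAP_API;
*	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
*	ioctl(fd, NIOCREGIF, &req);
*	mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
*	ioctl(fd, NIOCTXSYNC, NULL);	// and NIOCRXSYNC on the rx side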
2292 */ 2293 static int 2294 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, 2295 int fflag, struct thread *td) 2296 { 2297 struct netmap_priv_d *priv = NULL; 2298 struct ifnet *ifp = NULL; 2299 struct nmreq *nmr = (struct nmreq *) data; 2300 struct netmap_adapter *na = NULL; 2301 int error; 2302 u_int i, lim; 2303 struct netmap_if *nifp; 2304 struct netmap_kring *krings; 2305 2306 (void)dev; /* UNUSED */ 2307 (void)fflag; /* UNUSED */ 2308 #ifdef linux 2309 #define devfs_get_cdevpriv(pp) \ 2310 ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ 2311 (*pp ? 0 : ENOENT); }) 2312 2313 /* devfs_set_cdevpriv cannot fail on linux */ 2314 #define devfs_set_cdevpriv(p, fn) \ 2315 ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) 2316 2317 2318 #define devfs_clear_cdevpriv() do { \ 2319 netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ 2320 } while (0) 2321 #endif /* linux */ 2322 2323 CURVNET_SET(TD_TO_VNET(td)); 2324 2325 error = devfs_get_cdevpriv((void **)&priv); 2326 if (error) { 2327 CURVNET_RESTORE(); 2328 /* XXX ENOENT should be impossible, since the priv 2329 * is now created in the open */ 2330 return (error == ENOENT ? ENXIO : error); 2331 } 2332 2333 nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ 2334 switch (cmd) { 2335 case NIOCGINFO: /* return capabilities etc */ 2336 if (nmr->nr_version != NETMAP_API) { 2337 D("API mismatch got %d have %d", 2338 nmr->nr_version, NETMAP_API); 2339 nmr->nr_version = NETMAP_API; 2340 error = EINVAL; 2341 break; 2342 } 2343 if (nmr->nr_cmd == NETMAP_BDG_LIST) { 2344 error = netmap_bdg_ctl(nmr, NULL); 2345 break; 2346 } 2347 2348 NMG_LOCK(); 2349 do { 2350 /* memsize is always valid */ 2351 struct netmap_mem_d *nmd = &nm_mem; 2352 u_int memflags; 2353 2354 if (nmr->nr_name[0] != '\0') { 2355 /* get a refcount */ 2356 error = get_ifp(nmr, &ifp, 1 /* create */); 2357 if (error) 2358 break; 2359 na = NA(ifp); /* retrieve the netmap adapter */ 2360 nmd = na->nm_mem; /* and its memory allocator */ 2361 } 2362 2363 error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); 2364 if (error) 2365 break; 2366 if (na == NULL) /* only memory info */ 2367 break; 2368 nmr->nr_offset = 0; 2369 nmr->nr_rx_slots = nmr->nr_tx_slots = 0; 2370 netmap_update_config(na); 2371 nmr->nr_rx_rings = na->num_rx_rings; 2372 nmr->nr_tx_rings = na->num_tx_rings; 2373 nmr->nr_rx_slots = na->num_rx_desc; 2374 nmr->nr_tx_slots = na->num_tx_desc; 2375 if (memflags & NETMAP_MEM_PRIVATE) 2376 nmr->nr_ringid |= NETMAP_PRIV_MEM; 2377 } while (0); 2378 if (ifp) 2379 nm_if_rele(ifp); /* return the refcount */ 2380 NMG_UNLOCK(); 2381 break; 2382 2383 case NIOCREGIF: 2384 if (nmr->nr_version != NETMAP_API) { 2385 nmr->nr_version = NETMAP_API; 2386 error = EINVAL; 2387 break; 2388 } 2389 /* possibly attach/detach NIC and VALE switch */ 2390 i = nmr->nr_cmd; 2391 if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { 2392 error = netmap_bdg_ctl(nmr, NULL); 2393 break; 2394 } else if (i != 0) { 2395 D("nr_cmd must be 0 not %d", i); 2396 error = EINVAL; 2397 break; 2398 } 2399 2400 /* protect access to priv from concurrent NIOCREGIF */ 2401 NMG_LOCK(); 2402 do { 2403 u_int memflags; 2404 2405 if (priv->np_ifp != NULL) { /* thread already registered */ 2406 error = netmap_set_ringid(priv, nmr->nr_ringid); 2407 break; 2408 } 2409 /* find the interface and a reference */ 2410 error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */ 2411 if (error) 2412 break; 2413 if (NETMAP_OWNED_BY_KERN(ifp)) { 2414 nm_if_rele(ifp); 
2415 error = EBUSY; 2416 break; 2417 } 2418 nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); 2419 if (!nifp) { /* reg. failed, release priv and ref */ 2420 nm_if_rele(ifp); /* return the refcount */ 2421 priv->np_ifp = NULL; 2422 priv->np_nifp = NULL; 2423 break; 2424 } 2425 2426 /* return the offset of the netmap_if object */ 2427 na = NA(ifp); /* retrieve netmap adapter */ 2428 nmr->nr_rx_rings = na->num_rx_rings; 2429 nmr->nr_tx_rings = na->num_tx_rings; 2430 nmr->nr_rx_slots = na->num_rx_desc; 2431 nmr->nr_tx_slots = na->num_tx_desc; 2432 error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); 2433 if (error) { 2434 nm_if_rele(ifp); 2435 break; 2436 } 2437 if (memflags & NETMAP_MEM_PRIVATE) { 2438 nmr->nr_ringid |= NETMAP_PRIV_MEM; 2439 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; 2440 } 2441 nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); 2442 } while (0); 2443 NMG_UNLOCK(); 2444 break; 2445 2446 case NIOCUNREGIF: 2447 // XXX we have no data here ? 2448 D("deprecated, data is %p", nmr); 2449 error = EINVAL; 2450 break; 2451 2452 case NIOCTXSYNC: 2453 case NIOCRXSYNC: 2454 nifp = priv->np_nifp; 2455 2456 if (nifp == NULL) { 2457 error = ENXIO; 2458 break; 2459 } 2460 rmb(); /* make sure following reads are not from cache */ 2461 2462 ifp = priv->np_ifp; /* we have a reference */ 2463 2464 if (ifp == NULL) { 2465 D("Internal error: nifp != NULL && ifp == NULL"); 2466 error = ENXIO; 2467 break; 2468 } 2469 2470 na = NA(ifp); /* retrieve netmap adapter */ 2471 if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ 2472 if (cmd == NIOCTXSYNC) 2473 netmap_txsync_to_host(na); 2474 else 2475 netmap_rxsync_from_host(na, NULL, NULL); 2476 break; 2477 } 2478 /* find the last ring to scan */ 2479 lim = priv->np_qlast; 2480 if (lim == NETMAP_HW_RING) 2481 lim = (cmd == NIOCTXSYNC) ? 2482 na->num_tx_rings : na->num_rx_rings; 2483 2484 krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings; 2485 for (i = priv->np_qfirst; i < lim; i++) { 2486 struct netmap_kring *kring = krings + i; 2487 if (nm_kr_tryget(kring)) { 2488 error = EBUSY; 2489 goto out; 2490 } 2491 if (cmd == NIOCTXSYNC) { 2492 if (netmap_verbose & NM_VERB_TXSYNC) 2493 D("pre txsync ring %d cur %d hwcur %d", 2494 i, kring->ring->cur, 2495 kring->nr_hwcur); 2496 na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM); 2497 if (netmap_verbose & NM_VERB_TXSYNC) 2498 D("post txsync ring %d cur %d hwcur %d", 2499 i, kring->ring->cur, 2500 kring->nr_hwcur); 2501 } else { 2502 na->nm_rxsync(ifp, i, NAF_FORCE_READ); 2503 microtime(&na->rx_rings[i].ring->ts); 2504 } 2505 nm_kr_put(kring); 2506 } 2507 2508 break; 2509 2510 #ifdef __FreeBSD__ 2511 case BIOCIMMEDIATE: 2512 case BIOCGHDRCMPLT: 2513 case BIOCSHDRCMPLT: 2514 case BIOCSSEESENT: 2515 D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT"); 2516 break; 2517 2518 default: /* allow device-specific ioctls */ 2519 { 2520 struct socket so; 2521 2522 bzero(&so, sizeof(so)); 2523 NMG_LOCK(); 2524 error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */ 2525 if (error) { 2526 NMG_UNLOCK(); 2527 break; 2528 } 2529 so.so_vnet = ifp->if_vnet; 2530 // so->so_proto not null. 2531 error = ifioctl(&so, cmd, data, td); 2532 nm_if_rele(ifp); 2533 NMG_UNLOCK(); 2534 break; 2535 } 2536 2537 #else /* linux */ 2538 default: 2539 error = EOPNOTSUPP; 2540 #endif /* linux */ 2541 } 2542 out: 2543 2544 CURVNET_RESTORE(); 2545 return (error); 2546 } 2547 2548 2549 /* 2550 * select(2) and poll(2) handlers for the "netmap" device. 
2551 * 2552 * Can be called for one or more queues. 2553 * Return true the event mask corresponding to ready events. 2554 * If there are no ready events, do a selrecord on either individual 2555 * selinfo or on the global one. 2556 * Device-dependent parts (locking and sync of tx/rx rings) 2557 * are done through callbacks. 2558 * 2559 * On linux, arguments are really pwait, the poll table, and 'td' is struct file * 2560 * The first one is remapped to pwait as selrecord() uses the name as an 2561 * hidden argument. 2562 */ 2563 static int 2564 netmap_poll(struct cdev *dev, int events, struct thread *td) 2565 { 2566 struct netmap_priv_d *priv = NULL; 2567 struct netmap_adapter *na; 2568 struct ifnet *ifp; 2569 struct netmap_kring *kring; 2570 u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; 2571 u_int lim_tx, lim_rx, host_forwarded = 0; 2572 struct mbq q = { NULL, NULL, 0 }; 2573 void *pwait = dev; /* linux compatibility */ 2574 2575 int retry_tx = 1; 2576 2577 (void)pwait; 2578 2579 if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) 2580 return POLLERR; 2581 2582 if (priv->np_nifp == NULL) { 2583 D("No if registered"); 2584 return POLLERR; 2585 } 2586 rmb(); /* make sure following reads are not from cache */ 2587 2588 ifp = priv->np_ifp; 2589 // XXX check for deleting() ? 2590 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) 2591 return POLLERR; 2592 2593 if (netmap_verbose & 0x8000) 2594 D("device %s events 0x%x", ifp->if_xname, events); 2595 want_tx = events & (POLLOUT | POLLWRNORM); 2596 want_rx = events & (POLLIN | POLLRDNORM); 2597 2598 na = NA(ifp); /* retrieve netmap adapter */ 2599 2600 lim_tx = na->num_tx_rings; 2601 lim_rx = na->num_rx_rings; 2602 2603 if (priv->np_qfirst == NETMAP_SW_RING) { 2604 /* handle the host stack ring */ 2605 if (priv->np_txpoll || want_tx) { 2606 /* push any packets up, then we are always ready */ 2607 netmap_txsync_to_host(na); 2608 revents |= want_tx; 2609 } 2610 if (want_rx) { 2611 kring = &na->rx_rings[lim_rx]; 2612 if (kring->ring->avail == 0) 2613 netmap_rxsync_from_host(na, td, dev); 2614 if (kring->ring->avail > 0) { 2615 revents |= want_rx; 2616 } 2617 } 2618 return (revents); 2619 } 2620 2621 /* if we are in transparent mode, check also the host rx ring */ 2622 kring = &na->rx_rings[lim_rx]; 2623 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 2624 && want_rx 2625 && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { 2626 if (kring->ring->avail == 0) 2627 netmap_rxsync_from_host(na, td, dev); 2628 if (kring->ring->avail > 0) 2629 revents |= want_rx; 2630 } 2631 2632 /* 2633 * check_all is set if the card has more than one queue AND 2634 * the client is polling all of them. If true, we sleep on 2635 * the "global" selinfo, otherwise we sleep on individual selinfo 2636 * (FreeBSD only allows two selinfo's per file descriptor). 2637 * The interrupt routine in the driver wake one or the other 2638 * (or both) depending on which clients are active. 2639 * 2640 * rxsync() is only called if we run out of buffers on a POLLIN. 2641 * txsync() is called if we run out of buffers on POLLOUT, or 2642 * there are pending packets to send. The latter can be disabled 2643 * passing NETMAP_NO_TX_POLL in the NIOCREG call. 
2644 */ 2645 check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); 2646 check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); 2647 2648 if (priv->np_qlast != NETMAP_HW_RING) { 2649 lim_tx = lim_rx = priv->np_qlast; 2650 } 2651 2652 /* 2653 * We start with a lock free round which is good if we have 2654 * data available. If this fails, then lock and call the sync 2655 * routines. 2656 */ 2657 for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { 2658 kring = &na->rx_rings[i]; 2659 if (kring->ring->avail > 0) { 2660 revents |= want_rx; 2661 want_rx = 0; /* also breaks the loop */ 2662 } 2663 } 2664 for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { 2665 kring = &na->tx_rings[i]; 2666 if (kring->ring->avail > 0) { 2667 revents |= want_tx; 2668 want_tx = 0; /* also breaks the loop */ 2669 } 2670 } 2671 2672 /* 2673 * If we to push packets out (priv->np_txpoll) or want_tx is 2674 * still set, we do need to run the txsync calls (on all rings, 2675 * to avoid that the tx rings stall). 2676 */ 2677 if (priv->np_txpoll || want_tx) { 2678 /* If we really want to be woken up (want_tx), 2679 * do a selrecord, either on the global or on 2680 * the private structure. Then issue the txsync 2681 * so there is no race in the selrecord/selwait 2682 */ 2683 flush_tx: 2684 for (i = priv->np_qfirst; i < lim_tx; i++) { 2685 kring = &na->tx_rings[i]; 2686 /* 2687 * Skip this ring if want_tx == 0 2688 * (we have already done a successful sync on 2689 * a previous ring) AND kring->cur == kring->hwcur 2690 * (there are no pending transmissions for this ring). 2691 */ 2692 if (!want_tx && kring->ring->cur == kring->nr_hwcur) 2693 continue; 2694 /* make sure only one user thread is doing this */ 2695 if (nm_kr_tryget(kring)) { 2696 ND("ring %p busy is %d", kring, (int)kring->nr_busy); 2697 revents |= POLLERR; 2698 goto out; 2699 } 2700 2701 if (netmap_verbose & NM_VERB_TXSYNC) 2702 D("send %d on %s %d", 2703 kring->ring->cur, ifp->if_xname, i); 2704 if (na->nm_txsync(ifp, i, 0)) 2705 revents |= POLLERR; 2706 2707 /* Check avail/call selrecord only if called with POLLOUT */ 2708 if (want_tx) { 2709 if (kring->ring->avail > 0) { 2710 /* stop at the first ring. We don't risk 2711 * starvation. 2712 */ 2713 revents |= want_tx; 2714 want_tx = 0; 2715 } 2716 } 2717 nm_kr_put(kring); 2718 } 2719 if (want_tx && retry_tx) { 2720 selrecord(td, check_all_tx ? 2721 &na->tx_si : &na->tx_rings[priv->np_qfirst].si); 2722 retry_tx = 0; 2723 goto flush_tx; 2724 } 2725 } 2726 2727 /* 2728 * now if want_rx is still set we need to lock and rxsync. 2729 * Do it on all rings because otherwise we starve. 2730 */ 2731 if (want_rx) { 2732 int retry_rx = 1; 2733 do_retry_rx: 2734 for (i = priv->np_qfirst; i < lim_rx; i++) { 2735 kring = &na->rx_rings[i]; 2736 2737 if (nm_kr_tryget(kring)) { 2738 revents |= POLLERR; 2739 goto out; 2740 } 2741 2742 /* XXX NR_FORWARD should only be read on 2743 * physical or NIC ports 2744 */ 2745 if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { 2746 ND(10, "forwarding some buffers up %d to %d", 2747 kring->nr_hwcur, kring->ring->cur); 2748 netmap_grab_packets(kring, &q, netmap_fwd); 2749 } 2750 2751 if (na->nm_rxsync(ifp, i, 0)) 2752 revents |= POLLERR; 2753 if (netmap_no_timestamp == 0 || 2754 kring->ring->flags & NR_TIMESTAMP) { 2755 microtime(&kring->ring->ts); 2756 } 2757 2758 if (kring->ring->avail > 0) { 2759 revents |= want_rx; 2760 retry_rx = 0; 2761 } 2762 nm_kr_put(kring); 2763 } 2764 if (retry_rx) { 2765 retry_rx = 0; 2766 selrecord(td, check_all_rx ? 
2767 &na->rx_si : &na->rx_rings[priv->np_qfirst].si); 2768 goto do_retry_rx; 2769 } 2770 } 2771 2772 /* forward host to the netmap ring. 2773 * I am accessing nr_hwavail without lock, but netmap_transmit 2774 * can only increment it, so the operation is safe. 2775 */ 2776 kring = &na->rx_rings[lim_rx]; 2777 if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all 2778 && (netmap_fwd || kring->ring->flags & NR_FORWARD) 2779 && kring->nr_hwavail > 0 && !host_forwarded) { 2780 netmap_sw_to_nic(na); 2781 host_forwarded = 1; /* prevent another pass */ 2782 want_rx = 0; 2783 goto flush_tx; 2784 } 2785 2786 if (q.head) 2787 netmap_send_up(na->ifp, q.head); 2788 2789 out: 2790 2791 return (revents); 2792 } 2793 2794 /*------- driver support routines ------*/ 2795 2796 2797 /* 2798 * Initialize a ``netmap_adapter`` object created by driver on attach. 2799 * We allocate a block of memory with room for a struct netmap_adapter 2800 * plus two sets of N+2 struct netmap_kring (where N is the number 2801 * of hardware rings): 2802 * krings 0..N-1 are for the hardware queues. 2803 * kring N is for the host stack queue 2804 * kring N+1 is only used for the selinfo for all queues. 2805 * Return 0 on success, ENOMEM otherwise. 2806 * 2807 * By default the receive and transmit adapter ring counts are both initialized 2808 * to num_queues. na->num_tx_rings can be set for cards with different tx/rx 2809 * setups. 2810 */ 2811 int 2812 netmap_attach(struct netmap_adapter *arg, u_int num_queues) 2813 { 2814 struct netmap_adapter *na = NULL; 2815 struct ifnet *ifp = arg ? arg->ifp : NULL; 2816 size_t len; 2817 2818 if (arg == NULL || ifp == NULL) 2819 goto fail; 2820 /* a VALE port uses two endpoints */ 2821 len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2; 2822 na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); 2823 if (na == NULL) 2824 goto fail; 2825 WNA(ifp) = na; 2826 *na = *arg; /* copy everything, trust the driver to not pass junk */ 2827 NETMAP_SET_CAPABLE(ifp); 2828 if (na->num_tx_rings == 0) 2829 na->num_tx_rings = num_queues; 2830 na->num_rx_rings = num_queues; 2831 na->refcount = na->na_single = na->na_multi = 0; 2832 /* Core lock initialized here, others after netmap_if_new. */ 2833 mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); 2834 #ifdef linux 2835 if (ifp->netdev_ops) { 2836 ND("netdev_ops %p", ifp->netdev_ops); 2837 /* prepare a clone of the netdev ops */ 2838 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) 2839 na->nm_ndo.ndo_start_xmit = ifp->netdev_ops; 2840 #else 2841 na->nm_ndo = *ifp->netdev_ops; 2842 #endif 2843 } 2844 na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; 2845 #endif /* linux */ 2846 na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem; 2847 if (!nma_is_vp(arg)) 2848 netmap_attach_sw(ifp); 2849 D("success for %s", ifp->if_xname); 2850 return 0; 2851 2852 fail: 2853 D("fail, arg %p ifp %p na %p", arg, ifp, na); 2854 netmap_detach(ifp); 2855 return (na ? EINVAL : ENOMEM); 2856 } 2857 2858 2859 /* 2860 * Free the allocated memory linked to the given ``netmap_adapter`` 2861 * object. 
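*
* Drivers that called netmap_attach() in their attach routine are
* expected to call this from their detach path before the ifnet is
* released (sketch; the exact placement is driver-specific):
*
*	netmap_detach(ifp);
*	if_free(ifp);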
2862 */ 2863 void 2864 netmap_detach(struct ifnet *ifp) 2865 { 2866 struct netmap_adapter *na = NA(ifp); 2867 2868 if (!na) 2869 return; 2870 2871 mtx_destroy(&na->core_lock); 2872 2873 if (na->tx_rings) { /* XXX should not happen */ 2874 D("freeing leftover tx_rings"); 2875 free(na->tx_rings, M_DEVBUF); 2876 } 2877 if (na->na_flags & NAF_MEM_OWNER) 2878 netmap_mem_private_delete(na->nm_mem); 2879 bzero(na, sizeof(*na)); 2880 WNA(ifp) = NULL; 2881 free(na, M_DEVBUF); 2882 } 2883 2884 2885 int 2886 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, 2887 struct netmap_adapter *na, u_int ring_nr); 2888 2889 2890 /* 2891 * Intercept packets from the network stack and pass them 2892 * to netmap as incoming packets on the 'software' ring. 2893 * We rely on the OS to make sure that the ifp and na do not go 2894 * away (typically the caller checks for IFF_DRV_RUNNING or the like). 2895 * In nm_register() or whenever there is a reinitialization, 2896 * we make sure to access the core lock and per-ring locks 2897 * so that IFCAP_NETMAP is visible here. 2898 */ 2899 int 2900 netmap_transmit(struct ifnet *ifp, struct mbuf *m) 2901 { 2902 struct netmap_adapter *na = NA(ifp); 2903 struct netmap_kring *kring; 2904 u_int i, len = MBUF_LEN(m); 2905 u_int error = EBUSY, lim; 2906 struct netmap_slot *slot; 2907 2908 // XXX [Linux] we do not need this lock 2909 // if we follow the down/configure/up protocol -gl 2910 // mtx_lock(&na->core_lock); 2911 if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { 2912 /* interface not in netmap mode anymore */ 2913 error = ENXIO; 2914 goto done; 2915 } 2916 2917 kring = &na->rx_rings[na->num_rx_rings]; 2918 lim = kring->nkr_num_slots - 1; 2919 if (netmap_verbose & NM_VERB_HOST) 2920 D("%s packet %d len %d from the stack", ifp->if_xname, 2921 kring->nr_hwcur + kring->nr_hwavail, len); 2922 // XXX reconsider long packets if we handle fragments 2923 if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ 2924 D("%s from_host, drop packet size %d > %d", ifp->if_xname, 2925 len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); 2926 goto done; 2927 } 2928 if (SWNA(ifp)->na_bdg) { 2929 struct nm_bdg_fwd *ft; 2930 char *dst; 2931 2932 na = SWNA(ifp); /* we operate on the host port */ 2933 ft = na->rx_rings[0].nkr_ft; 2934 dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]); 2935 2936 /* use slot 0 in the ft, there is nothing queued here */ 2937 /* XXX we can save the copy calling m_copydata in nm_bdg_flush, 2938 * need a special flag for this. 2939 */ 2940 m_copydata(m, 0, (int)len, dst); 2941 ft->ft_flags = 0; 2942 ft->ft_len = len; 2943 ft->ft_buf = dst; 2944 ft->ft_next = NM_FT_NULL; 2945 ft->ft_frags = 1; 2946 if (netmap_verbose & NM_VERB_HOST) 2947 RD(5, "pkt %p size %d to bridge port %d", 2948 dst, len, na->bdg_port); 2949 nm_bdg_flush(ft, 1, na, 0); 2950 na = NA(ifp); /* back to the regular object/lock */ 2951 error = 0; 2952 goto done; 2953 } 2954 2955 /* protect against other instances of netmap_transmit, 2956 * and userspace invocations of rxsync(). 
2957 * XXX could reuse core_lock 2958 */ 2959 // XXX [Linux] there can be no other instances of netmap_transmit 2960 // on this same ring, but we still need this lock to protect 2961 // concurrent access from netmap_sw_to_nic() -gl 2962 mtx_lock(&kring->q_lock); 2963 if (kring->nr_hwavail >= lim) { 2964 if (netmap_verbose) 2965 D("stack ring %s full\n", ifp->if_xname); 2966 } else { 2967 /* compute the insert position */ 2968 i = nm_kr_rxpos(kring); 2969 slot = &kring->ring->slot[i]; 2970 m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot)); 2971 slot->len = len; 2972 slot->flags = kring->nkr_slot_flags; 2973 kring->nr_hwavail++; 2974 if (netmap_verbose & NM_VERB_HOST) 2975 D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); 2976 selwakeuppri(&kring->si, PI_NET); 2977 error = 0; 2978 } 2979 mtx_unlock(&kring->q_lock); 2980 2981 done: 2982 // mtx_unlock(&na->core_lock); 2983 2984 /* release the mbuf in either cases of success or failure. As an 2985 * alternative, put the mbuf in a free list and free the list 2986 * only when really necessary. 2987 */ 2988 m_freem(m); 2989 2990 return (error); 2991 } 2992 2993 2994 /* 2995 * netmap_reset() is called by the driver routines when reinitializing 2996 * a ring. The driver is in charge of locking to protect the kring. 2997 * If netmap mode is not set just return NULL. 2998 */ 2999 struct netmap_slot * 3000 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, 3001 u_int new_cur) 3002 { 3003 struct netmap_kring *kring; 3004 int new_hwofs, lim; 3005 3006 if (na == NULL) { 3007 D("NULL na, should not happen"); 3008 return NULL; /* no netmap support here */ 3009 } 3010 if (!(na->ifp->if_capenable & IFCAP_NETMAP)) { 3011 ND("interface not in netmap mode"); 3012 return NULL; /* nothing to reinitialize */ 3013 } 3014 3015 /* XXX note- in the new scheme, we are not guaranteed to be 3016 * under lock (e.g. when called on a device reset). 3017 * In this case, we should set a flag and do not trust too 3018 * much the values. In practice: TODO 3019 * - set a RESET flag somewhere in the kring 3020 * - do the processing in a conservative way 3021 * - let the *sync() fixup at the end. 3022 */ 3023 if (tx == NR_TX) { 3024 if (n >= na->num_tx_rings) 3025 return NULL; 3026 kring = na->tx_rings + n; 3027 new_hwofs = kring->nr_hwcur - new_cur; 3028 } else { 3029 if (n >= na->num_rx_rings) 3030 return NULL; 3031 kring = na->rx_rings + n; 3032 new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; 3033 } 3034 lim = kring->nkr_num_slots - 1; 3035 if (new_hwofs > lim) 3036 new_hwofs -= lim + 1; 3037 3038 /* Always set the new offset value and realign the ring. */ 3039 D("%s hwofs %d -> %d, hwavail %d -> %d", 3040 tx == NR_TX ? "TX" : "RX", 3041 kring->nkr_hwofs, new_hwofs, 3042 kring->nr_hwavail, 3043 tx == NR_TX ? lim : kring->nr_hwavail); 3044 kring->nkr_hwofs = new_hwofs; 3045 if (tx == NR_TX) 3046 kring->nr_hwavail = lim; 3047 3048 #if 0 // def linux 3049 /* XXX check that the mappings are correct */ 3050 /* need ring_nr, adapter->pdev, direction */ 3051 buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); 3052 if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { 3053 D("error mapping rx netmap buffer %d", i); 3054 // XXX fix error handling 3055 } 3056 3057 #endif /* linux */ 3058 /* 3059 * Wakeup on the individual and global selwait 3060 * We do the wakeup here, but the ring is not yet reconfigured. 3061 * However, we are under lock so there are no races. 
3062 */ 3063 selwakeuppri(&kring->si, PI_NET); 3064 selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET); 3065 return kring->ring->slot; 3066 } 3067 3068 3069 /* 3070 * Grab packets from a kring, move them into the ft structure 3071 * associated to the tx (input) port. Max one instance per port, 3072 * filtered on input (ioctl, poll or XXX). 3073 * Returns the next position in the ring. 3074 */ 3075 static int 3076 nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr, 3077 struct netmap_kring *kring, u_int end) 3078 { 3079 struct netmap_ring *ring = kring->ring; 3080 struct nm_bdg_fwd *ft; 3081 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 3082 u_int ft_i = 0; /* start from 0 */ 3083 u_int frags = 1; /* how many frags ? */ 3084 struct nm_bridge *b = na->na_bdg; 3085 3086 /* To protect against modifications to the bridge we acquire a 3087 * shared lock, waiting if we can sleep (if the source port is 3088 * attached to a user process) or with a trylock otherwise (NICs). 3089 */ 3090 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 3091 if (na->na_flags & NAF_BDG_MAYSLEEP) 3092 BDG_RLOCK(b); 3093 else if (!BDG_RTRYLOCK(b)) 3094 return 0; 3095 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 3096 ft = kring->nkr_ft; 3097 3098 for (; likely(j != end); j = nm_next(j, lim)) { 3099 struct netmap_slot *slot = &ring->slot[j]; 3100 char *buf; 3101 3102 ft[ft_i].ft_len = slot->len; 3103 ft[ft_i].ft_flags = slot->flags; 3104 3105 ND("flags is 0x%x", slot->flags); 3106 /* this slot goes into a list so initialize the link field */ 3107 ft[ft_i].ft_next = NM_FT_NULL; 3108 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 3109 (void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot); 3110 prefetch(buf); 3111 ++ft_i; 3112 if (slot->flags & NS_MOREFRAG) { 3113 frags++; 3114 continue; 3115 } 3116 if (unlikely(netmap_verbose && frags > 1)) 3117 RD(5, "%d frags at %d", frags, ft_i - frags); 3118 ft[ft_i - frags].ft_frags = frags; 3119 frags = 1; 3120 if (unlikely((int)ft_i >= bridge_batch)) 3121 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 3122 } 3123 if (frags > 1) { 3124 D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); 3125 // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG 3126 ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; 3127 ft[ft_i - frags].ft_frags = frags - 1; 3128 } 3129 if (ft_i) 3130 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 3131 BDG_RUNLOCK(b); 3132 return j; 3133 } 3134 3135 3136 /* 3137 * Pass packets from nic to the bridge. 3138 * XXX TODO check locking: this is called from the interrupt 3139 * handler so we should make sure that the interface is not 3140 * disconnected while passing down an interrupt. 3141 * 3142 * Note, no user process can access this NIC so we can ignore 3143 * the info in the 'ring'. 3144 */ 3145 static void 3146 netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) 3147 { 3148 struct netmap_adapter *na = NA(ifp); 3149 struct netmap_kring *kring = &na->rx_rings[ring_nr]; 3150 struct netmap_ring *ring = kring->ring; 3151 u_int j, k; 3152 3153 /* make sure that only one thread is ever in here, 3154 * after which we can unlock. Probably unnecessary XXX. 3155 */ 3156 if (nm_kr_tryget(kring)) 3157 return; 3158 /* fetch packets that have arrived. 3159 * XXX maybe do this in a loop ? 
3160 */ 3161 if (na->nm_rxsync(ifp, ring_nr, 0)) 3162 goto put_out; 3163 if (kring->nr_hwavail == 0 && netmap_verbose) { 3164 D("how strange, interrupt with no packets on %s", 3165 ifp->if_xname); 3166 goto put_out; 3167 } 3168 k = nm_kr_rxpos(kring); 3169 3170 j = nm_bdg_preflush(na, ring_nr, kring, k); 3171 3172 /* we consume everything, but we cannot update kring directly 3173 * because the nic may have destroyed the info in the NIC ring. 3174 * So we need to call rxsync again to restore it. 3175 */ 3176 ring->cur = j; 3177 ring->avail = 0; 3178 na->nm_rxsync(ifp, ring_nr, 0); 3179 3180 put_out: 3181 nm_kr_put(kring); 3182 return; 3183 } 3184 3185 3186 /* 3187 * Default functions to handle rx/tx interrupts from a physical device. 3188 * "work_done" is non-null on the RX path, NULL for the TX path. 3189 * We rely on the OS to make sure that there is only one active 3190 * instance per queue, and that there is appropriate locking. 3191 * 3192 * If the card is not in netmap mode, simply return 0, 3193 * so that the caller proceeds with regular processing. 3194 * 3195 * If the card is connected to a netmap file descriptor, 3196 * do a selwakeup on the individual queue, plus one on the global one 3197 * if needed (multiqueue card _and_ there are multiqueue listeners), 3198 * and return 1. 3199 * 3200 * Finally, if called on rx from an interface connected to a switch, 3201 * calls the proper forwarding routine, and return 1. 3202 */ 3203 int 3204 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) 3205 { 3206 struct netmap_adapter *na; 3207 struct netmap_kring *kring; 3208 3209 if (!(ifp->if_capenable & IFCAP_NETMAP)) 3210 return 0; 3211 3212 q &= NETMAP_RING_MASK; 3213 3214 if (netmap_verbose) 3215 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); 3216 na = NA(ifp); 3217 if (na->na_flags & NAF_SKIP_INTR) { 3218 ND("use regular interrupt"); 3219 return 0; 3220 } 3221 3222 if (work_done) { /* RX path */ 3223 if (q >= na->num_rx_rings) 3224 return 0; // not a physical queue 3225 kring = na->rx_rings + q; 3226 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 3227 if (na->na_bdg != NULL) { 3228 netmap_nic_to_bdg(ifp, q); 3229 } else { 3230 selwakeuppri(&kring->si, PI_NET); 3231 if (na->num_rx_rings > 1 /* or multiple listeners */ ) 3232 selwakeuppri(&na->rx_si, PI_NET); 3233 } 3234 *work_done = 1; /* do not fire napi again */ 3235 } else { /* TX path */ 3236 if (q >= na->num_tx_rings) 3237 return 0; // not a physical queue 3238 kring = na->tx_rings + q; 3239 selwakeuppri(&kring->si, PI_NET); 3240 if (na->num_tx_rings > 1 /* or multiple listeners */ ) 3241 selwakeuppri(&na->tx_si, PI_NET); 3242 } 3243 return 1; 3244 } 3245 3246 3247 #ifdef linux /* linux-specific routines */ 3248 3249 3250 /* 3251 * Remap linux arguments into the FreeBSD call. 3252 * - pwait is the poll table, passed as 'dev'; 3253 * If pwait == NULL someone else already woke up before. We can report 3254 * events but they are filtered upstream. 3255 * If pwait != NULL, then pwait->key contains the list of events. 3256 * - events is computed from pwait as above. 3257 * - file is passed as 'td'; 3258 */ 3259 static u_int 3260 linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) 3261 { 3262 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) 3263 int events = POLLIN | POLLOUT; /* XXX maybe... */ 3264 #elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) 3265 int events = pwait ? pwait->key : POLLIN | POLLOUT; 3266 #else /* in 3.4.0 field 'key' was renamed to '_key' */ 3267 int events = pwait ? 
pwait->_key : POLLIN | POLLOUT; 3268 #endif 3269 return netmap_poll((void *)pwait, events, (void *)file); 3270 } 3271 3272 3273 static int 3274 linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) 3275 { 3276 int error = 0; 3277 unsigned long off, va; 3278 vm_ooffset_t pa; 3279 struct netmap_priv_d *priv = f->private_data; 3280 /* 3281 * vma->vm_start: start of mapping user address space 3282 * vma->vm_end: end of the mapping user address space 3283 * vma->vm_pfoff: offset of first page in the device 3284 */ 3285 3286 // XXX security checks 3287 3288 error = netmap_get_memory(priv); 3289 ND("get_memory returned %d", error); 3290 if (error) 3291 return -error; 3292 3293 if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) { 3294 ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end); 3295 return -EINVAL; 3296 } 3297 3298 for (va = vma->vm_start, off = vma->vm_pgoff; 3299 va < vma->vm_end; 3300 va += PAGE_SIZE, off++) 3301 { 3302 pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT); 3303 if (pa == 0) 3304 return -EINVAL; 3305 3306 ND("va %lx pa %p", va, pa); 3307 error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); 3308 if (error) 3309 return error; 3310 } 3311 return 0; 3312 } 3313 3314 3315 /* 3316 * This one is probably already protected by the netif lock XXX 3317 */ 3318 static netdev_tx_t 3319 linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev) 3320 { 3321 netmap_transmit(dev, skb); 3322 return (NETDEV_TX_OK); 3323 } 3324 3325 3326 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) // XXX was 37 3327 #define LIN_IOCTL_NAME .ioctl 3328 int 3329 linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) 3330 #else 3331 #define LIN_IOCTL_NAME .unlocked_ioctl 3332 long 3333 linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) 3334 #endif 3335 { 3336 int ret; 3337 struct nmreq nmr; 3338 bzero(&nmr, sizeof(nmr)); 3339 3340 if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) { 3341 data = 0; /* no argument required here */ 3342 } 3343 if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) 3344 return -EFAULT; 3345 ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); 3346 if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0) 3347 return -EFAULT; 3348 return -ret; 3349 } 3350 3351 3352 static int 3353 netmap_release(struct inode *inode, struct file *file) 3354 { 3355 (void)inode; /* UNUSED */ 3356 if (file->private_data) 3357 netmap_dtor(file->private_data); 3358 return (0); 3359 } 3360 3361 3362 static int 3363 linux_netmap_open(struct inode *inode, struct file *file) 3364 { 3365 struct netmap_priv_d *priv; 3366 (void)inode; /* UNUSED */ 3367 3368 priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, 3369 M_NOWAIT | M_ZERO); 3370 if (priv == NULL) 3371 return -ENOMEM; 3372 3373 file->private_data = priv; 3374 3375 return (0); 3376 } 3377 3378 3379 static struct file_operations netmap_fops = { 3380 .owner = THIS_MODULE, 3381 .open = linux_netmap_open, 3382 .mmap = linux_netmap_mmap, 3383 LIN_IOCTL_NAME = linux_netmap_ioctl, 3384 .poll = linux_netmap_poll, 3385 .release = netmap_release, 3386 }; 3387 3388 3389 static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */ 3390 MISC_DYNAMIC_MINOR, 3391 "netmap", 3392 &netmap_fops, 3393 }; 3394 3395 static int netmap_init(void); 3396 static void netmap_fini(void); 3397 3398 3399 /* Errors have negative values on linux */ 3400 static int linux_netmap_init(void) 3401 { 3402 return 
-netmap_init(); 3403 } 3404 3405 module_init(linux_netmap_init); 3406 module_exit(netmap_fini); 3407 /* export certain symbols to other modules */ 3408 EXPORT_SYMBOL(netmap_attach); // driver attach routines 3409 EXPORT_SYMBOL(netmap_detach); // driver detach routines 3410 EXPORT_SYMBOL(netmap_ring_reinit); // ring init on error 3411 EXPORT_SYMBOL(netmap_buffer_lut); 3412 EXPORT_SYMBOL(netmap_total_buffers); // index check 3413 EXPORT_SYMBOL(netmap_buffer_base); 3414 EXPORT_SYMBOL(netmap_reset); // ring init routines 3415 EXPORT_SYMBOL(netmap_buf_size); 3416 EXPORT_SYMBOL(netmap_rx_irq); // default irq handler 3417 EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away 3418 EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine 3419 EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function 3420 EXPORT_SYMBOL(netmap_disable_all_rings); 3421 EXPORT_SYMBOL(netmap_enable_all_rings); 3422 3423 3424 MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); 3425 MODULE_DESCRIPTION("The netmap packet I/O framework"); 3426 MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ 3427 3428 #else /* __FreeBSD__ */ 3429 3430 3431 static struct cdevsw netmap_cdevsw = { 3432 .d_version = D_VERSION, 3433 .d_name = "netmap", 3434 .d_open = netmap_open, 3435 .d_mmap_single = netmap_mmap_single, 3436 .d_ioctl = netmap_ioctl, 3437 .d_poll = netmap_poll, 3438 .d_close = netmap_close, 3439 }; 3440 #endif /* __FreeBSD__ */ 3441 3442 /* 3443 *---- support for virtual bridge ----- 3444 */ 3445 3446 /* ----- FreeBSD if_bridge hash function ------- */ 3447 3448 /* 3449 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 3450 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 3451 * 3452 * http://www.burtleburtle.net/bob/hash/spooky.html 3453 */ 3454 #define mix(a, b, c) \ 3455 do { \ 3456 a -= b; a -= c; a ^= (c >> 13); \ 3457 b -= c; b -= a; b ^= (a << 8); \ 3458 c -= a; c -= b; c ^= (b >> 13); \ 3459 a -= b; a -= c; a ^= (c >> 12); \ 3460 b -= c; b -= a; b ^= (a << 16); \ 3461 c -= a; c -= b; c ^= (b >> 5); \ 3462 a -= b; a -= c; a ^= (c >> 3); \ 3463 b -= c; b -= a; b ^= (a << 10); \ 3464 c -= a; c -= b; c ^= (b >> 15); \ 3465 } while (/*CONSTCOND*/0) 3466 3467 static __inline uint32_t 3468 nm_bridge_rthash(const uint8_t *addr) 3469 { 3470 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 3471 3472 b += addr[5] << 8; 3473 b += addr[4]; 3474 a += addr[3] << 24; 3475 a += addr[2] << 16; 3476 a += addr[1] << 8; 3477 a += addr[0]; 3478 3479 mix(a, b, c); 3480 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 3481 return (c & BRIDGE_RTHASH_MASK); 3482 } 3483 3484 #undef mix 3485 3486 3487 static int 3488 bdg_netmap_reg(struct ifnet *ifp, int onoff) 3489 { 3490 /* the interface is already attached to the bridge, 3491 * so we only need to toggle IFCAP_NETMAP. 3492 */ 3493 if (onoff) { 3494 ifp->if_capenable |= IFCAP_NETMAP; 3495 } else { 3496 ifp->if_capenable &= ~IFCAP_NETMAP; 3497 } 3498 return 0; 3499 } 3500 3501 3502 /* 3503 * Lookup function for a learning bridge. 
3504 * Update the hash table with the source address, 3505 * and then returns the destination port index, and the 3506 * ring in *dst_ring (at the moment, always use ring 0) 3507 */ 3508 u_int 3509 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, 3510 struct netmap_adapter *na) 3511 { 3512 struct nm_hash_ent *ht = na->na_bdg->ht; 3513 uint32_t sh, dh; 3514 u_int dst, mysrc = na->bdg_port; 3515 uint64_t smac, dmac; 3516 3517 if (buf_len < 14) { 3518 D("invalid buf length %d", buf_len); 3519 return NM_BDG_NOPORT; 3520 } 3521 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 3522 smac = le64toh(*(uint64_t *)(buf + 4)); 3523 smac >>= 16; 3524 3525 /* 3526 * The hash is somewhat expensive, there might be some 3527 * worthwhile optimizations here. 3528 */ 3529 if ((buf[6] & 1) == 0) { /* valid src */ 3530 uint8_t *s = buf+6; 3531 sh = nm_bridge_rthash(s); // XXX hash of source 3532 /* update source port forwarding entry */ 3533 ht[sh].mac = smac; /* XXX expire ? */ 3534 ht[sh].ports = mysrc; 3535 if (netmap_verbose) 3536 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 3537 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 3538 } 3539 dst = NM_BDG_BROADCAST; 3540 if ((buf[0] & 1) == 0) { /* unicast */ 3541 dh = nm_bridge_rthash(buf); // XXX hash of dst 3542 if (ht[dh].mac == dmac) { /* found dst */ 3543 dst = ht[dh].ports; 3544 } 3545 /* XXX otherwise return NM_BDG_UNKNOWN ? */ 3546 } 3547 *dst_ring = 0; 3548 return dst; 3549 } 3550 3551 3552 /* 3553 * This flush routine supports only unicast and broadcast but a large 3554 * number of ports, and lets us replace the learn and dispatch functions. 3555 */ 3556 int 3557 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na, 3558 u_int ring_nr) 3559 { 3560 struct nm_bdg_q *dst_ents, *brddst; 3561 uint16_t num_dsts = 0, *dsts; 3562 struct nm_bridge *b = na->na_bdg; 3563 u_int i, j, me = na->bdg_port; 3564 3565 /* 3566 * The work area (pointed by ft) is followed by an array of 3567 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS 3568 * queues per port plus one for the broadcast traffic. 3569 * Then we have an array of destination indexes. 
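*
* Layout sketch, matching the two assignments below:
*
*	ft[0 .. NM_BDG_BATCH_MAX-1]				packet descriptors
*	dst_ents[0 .. NM_BDG_MAXPORTS*NM_BDG_MAXRINGS]		per (port,ring) queues
*	dsts[0 .. num_dsts-1]					destination indexes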
3570 */ 3571 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 3572 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 3573 3574 /* first pass: find a destination for each packet in the batch */ 3575 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 3576 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 3577 uint16_t dst_port, d_i; 3578 struct nm_bdg_q *d; 3579 3580 ND("slot %d frags %d", i, ft[i].ft_frags); 3581 dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len, 3582 &dst_ring, na); 3583 if (netmap_verbose > 255) 3584 RD(5, "slot %d port %d -> %d", i, me, dst_port); 3585 if (dst_port == NM_BDG_NOPORT) 3586 continue; /* this packet is identified to be dropped */ 3587 else if (unlikely(dst_port > NM_BDG_MAXPORTS)) 3588 continue; 3589 else if (dst_port == NM_BDG_BROADCAST) 3590 dst_ring = 0; /* broadcasts always go to ring 0 */ 3591 else if (unlikely(dst_port == me || 3592 !b->bdg_ports[dst_port])) 3593 continue; 3594 3595 /* get a position in the scratch pad */ 3596 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 3597 d = dst_ents + d_i; 3598 3599 /* append the first fragment to the list */ 3600 if (d->bq_head == NM_FT_NULL) { /* new destination */ 3601 d->bq_head = d->bq_tail = i; 3602 /* remember this position to be scanned later */ 3603 if (dst_port != NM_BDG_BROADCAST) 3604 dsts[num_dsts++] = d_i; 3605 } else { 3606 ft[d->bq_tail].ft_next = i; 3607 d->bq_tail = i; 3608 } 3609 d->bq_len += ft[i].ft_frags; 3610 } 3611 3612 /* 3613 * Broadcast traffic goes to ring 0 on all destinations. 3614 * So we need to add these rings to the list of ports to scan. 3615 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is 3616 * expensive. We should keep a compact list of active destinations 3617 * so we could shorten this loop. 3618 */ 3619 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; 3620 if (brddst->bq_head != NM_FT_NULL) { 3621 for (j = 0; likely(j < b->bdg_active_ports); j++) { 3622 uint16_t d_i; 3623 i = b->bdg_port_index[j]; 3624 if (unlikely(i == me)) 3625 continue; 3626 d_i = i * NM_BDG_MAXRINGS; 3627 if (dst_ents[d_i].bq_head == NM_FT_NULL) 3628 dsts[num_dsts++] = d_i; 3629 } 3630 } 3631 3632 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); 3633 /* second pass: scan destinations (XXX will be modular somehow) */ 3634 for (i = 0; i < num_dsts; i++) { 3635 struct ifnet *dst_ifp; 3636 struct netmap_adapter *dst_na; 3637 struct netmap_kring *kring; 3638 struct netmap_ring *ring; 3639 u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next; 3640 u_int needed, howmany; 3641 int retry = netmap_txsync_retry; 3642 struct nm_bdg_q *d; 3643 uint32_t my_start = 0, lease_idx = 0; 3644 int nrings; 3645 3646 d_i = dsts[i]; 3647 ND("second pass %d port %d", i, d_i); 3648 d = dst_ents + d_i; 3649 // XXX fix the division 3650 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; 3651 /* protect from the lookup function returning an inactive 3652 * destination port 3653 */ 3654 if (unlikely(dst_na == NULL)) 3655 goto cleanup; 3656 if (dst_na->na_flags & NAF_SW_ONLY) 3657 goto cleanup; 3658 dst_ifp = dst_na->ifp; 3659 /* 3660 * The interface may be in !netmap mode in two cases: 3661 * - when na is attached but not activated yet; 3662 * - when na is being deactivated but is still attached. 
3663 */
3664 if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
3665 ND("not in netmap mode!");
3666 goto cleanup;
3667 }
3668
3669 /* there is at least one either unicast or broadcast packet */
3670 brd_next = brddst->bq_head;
3671 next = d->bq_head;
3672 /* we need to reserve this many slots. If fewer are
3673 * available, some packets will be dropped.
3674 * Packets may have multiple fragments, so there is a
3675 * chance that we do not use all of the slots we have
3676 * claimed; the leftover ones are handled when we
3677 * regain the lock.
3678 */
3679 needed = d->bq_len + brddst->bq_len;
3680
3681 is_vp = nma_is_vp(dst_na);
3682 ND(5, "pass 2 dst %d is %x %s",
3683 i, d_i, is_vp ? "virtual" : "nic/host");
3684 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
3685 if (is_vp) { /* virtual port */
3686 nrings = dst_na->num_rx_rings;
3687 } else {
3688 nrings = dst_na->num_tx_rings;
3689 }
3690 if (dst_nr >= nrings)
3691 dst_nr = dst_nr % nrings;
3692 kring = is_vp ? &dst_na->rx_rings[dst_nr] :
3693 &dst_na->tx_rings[dst_nr];
3694 ring = kring->ring;
3695 lim = kring->nkr_num_slots - 1;
3696
3697 retry:
3698
3699 /* reserve the buffers in the queue and an entry
3700 * to report completion, and drop the lock.
3701 * XXX this might become a helper function.
3702 */
3703 mtx_lock(&kring->q_lock);
3704 if (kring->nkr_stopped) {
3705 mtx_unlock(&kring->q_lock);
3706 goto cleanup;
3707 }
3708 /* on physical interfaces, do a txsync to recover
3709 * slots for packets already transmitted.
3710 * XXX maybe we could be optimistic and rely on a retry
3711 * in case of failure.
3712 */
3713 if (nma_is_hw(dst_na)) {
3714 dst_na->nm_txsync(dst_ifp, dst_nr, 0);
3715 }
3716 my_start = j = kring->nkr_hwlease;
3717 howmany = nm_kr_space(kring, is_vp);
3718 if (needed < howmany)
3719 howmany = needed;
3720 lease_idx = nm_kr_lease(kring, howmany, is_vp);
3721 mtx_unlock(&kring->q_lock);
3722
3723 /* only retry if we need more than the available slots */
3724 if (retry && needed <= howmany)
3725 retry = 0;
3726
3727 /* copy to the destination queue */
3728 while (howmany > 0) {
3729 struct netmap_slot *slot;
3730 struct nm_bdg_fwd *ft_p, *ft_end;
3731 u_int cnt;
3732
3733 /* find the queue from which we pick the next packet.
3734 * NM_FT_NULL is always higher than valid indexes
3735 * so we never dereference it if the other list
3736 * has packets (and if both are empty we never
3737 * get here).
3738 			 */
3739 			if (next < brd_next) {
3740 				ft_p = ft + next;
3741 				next = ft_p->ft_next;
3742 			} else { /* insert broadcast */
3743 				ft_p = ft + brd_next;
3744 				brd_next = ft_p->ft_next;
3745 			}
3746 			cnt = ft_p->ft_frags; // cnt > 0
3747 			if (unlikely(cnt > howmany))
3748 				break; /* no more space */
3749 			howmany -= cnt;
3750 			if (netmap_verbose && cnt > 1)
3751 				RD(5, "rx %d frags to %d", cnt, j);
3752 			ft_end = ft_p + cnt;
3753 			do {
3754 				void *dst, *src = ft_p->ft_buf;
3755 				size_t len = (ft_p->ft_len + 63) & ~63; /* round to a multiple of 64 */
3756
3757 				slot = &ring->slot[j];
3758 				dst = BDG_NMB(dst_na->nm_mem, slot);
3759
3760
3761 				ND("send %d %d bytes at %s:%d",
3762 					i, ft_p->ft_len, dst_ifp->if_xname, j);
3763 				if (ft_p->ft_flags & NS_INDIRECT) {
3764 					if (copyin(src, dst, len)) {
3765 						// invalid user pointer, pretend the packet has length 0
3766 						ft_p->ft_len = 0;
3767 					}
3768 				} else {
3769 					//memcpy(dst, src, len);
3770 					pkt_copy(src, dst, (int)len);
3771 				}
3772 				slot->len = ft_p->ft_len;
3773 				slot->flags = (cnt << 8) | NS_MOREFRAG;
3774 				j = nm_next(j, lim);
3775 				ft_p++;
3776 				sent++;
3777 			} while (ft_p != ft_end);
3778 			slot->flags = (cnt << 8); /* clear NS_MOREFRAG on the last fragment */
3779 			/* are we done ? */
3780 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
3781 				break;
3782 		}
3783 		{
3784 			/* current position */
3785 			uint32_t *p = kring->nkr_leases; /* shorthand */
3786 			uint32_t update_pos;
3787 			int still_locked = 1;
3788
3789 			mtx_lock(&kring->q_lock);
3790 			if (unlikely(howmany > 0)) {
3791 				/* we have not used all of the buffers. If I am
3792 				 * the last lease holder I can recover the slots,
3793 				 * otherwise I must fill them with 0 to mark empty packets.
3794 				 */
3795 				ND("leftover %d bufs", howmany);
3796 				if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
3797 					/* yes, I am the last one */
3798 					ND("roll back nkr_hwlease to %d", j);
3799 					kring->nkr_hwlease = j;
3800 				} else {
3801 					while (howmany-- > 0) {
3802 						ring->slot[j].len = 0;
3803 						ring->slot[j].flags = 0;
3804 						j = nm_next(j, lim);
3805 					}
3806 				}
3807 			}
3808 			p[lease_idx] = j; /* report I am done */
3809
3810 			update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur;
3811
3812 			if (my_start == update_pos) {
3813 				/* all slots before my_start have been reported,
3814 				 * so scan subsequent leases to see if other ranges
3815 				 * have been completed, and do a selwakeup or txsync.
3816 				 */
3817 				while (lease_idx != kring->nkr_lease_idx &&
3818 					p[lease_idx] != NR_NOSLOT) {
3819 					j = p[lease_idx];
3820 					p[lease_idx] = NR_NOSLOT;
3821 					lease_idx = nm_next(lease_idx, lim);
3822 				}
3823 				/* j is the new 'write' position. j != my_start
3824 				 * means there are new buffers to report
3825 				 */
3826 				if (likely(j != my_start)) {
3827 					if (is_vp) {
3828 						uint32_t old_avail = kring->nr_hwavail;
3829
3830 						kring->nr_hwavail = (j >= kring->nr_hwcur) ?
3831 							j - kring->nr_hwcur :
3832 							j + lim + 1 - kring->nr_hwcur;
3833 						if (kring->nr_hwavail < old_avail) {
3834 							D("avail shrink %d -> %d",
3835 								old_avail, kring->nr_hwavail);
3836 						}
3837 						still_locked = 0;
3838 						mtx_unlock(&kring->q_lock);
3839 						selwakeuppri(&kring->si, PI_NET);
3840 					} else {
3841 						ring->cur = j;
3842 						/* XXX should we also update ring->avail here ?
						 */
3843 						still_locked = 0;
3844 						dst_na->nm_txsync(dst_ifp, dst_nr, 0);
3845 						mtx_unlock(&kring->q_lock);
3846
3847 						/* retry to send more packets */
3848 						if (nma_is_hw(dst_na) && retry--)
3849 							goto retry;
3850 					}
3851 				}
3852 			}
3853 			if (still_locked)
3854 				mtx_unlock(&kring->q_lock);
3855 		}
3856 cleanup:
3857 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
3858 		d->bq_len = 0;
3859 	}
3860 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
3861 	brddst->bq_len = 0;
3862 	return 0;
3863 }
3864
3865
3866 /*
3867  * main dispatch routine for the bridge.
3868  * We already know that only one thread is running this.
3869  * We must run nm_bdg_preflush without lock.
3870  */
3871 static int
3872 bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags)
3873 {
3874 	struct netmap_adapter *na = NA(ifp);
3875 	struct netmap_kring *kring = &na->tx_rings[ring_nr];
3876 	struct netmap_ring *ring = kring->ring;
3877 	u_int j, k, lim = kring->nkr_num_slots - 1;
3878
3879 	k = ring->cur;
3880 	if (k > lim)
3881 		return netmap_ring_reinit(kring);
3882
3883 	if (bridge_batch <= 0) { /* testing only */
3884 		j = k; // used all
3885 		goto done;
3886 	}
3887 	if (bridge_batch > NM_BDG_BATCH)
3888 		bridge_batch = NM_BDG_BATCH;
3889
3890 	j = nm_bdg_preflush(na, ring_nr, kring, k);
3891 	if (j != k)
3892 		D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail);
3893 	/* k-j modulo ring size is the number of slots processed */
3894 	if (k < j)
3895 		k += kring->nkr_num_slots;
3896 	kring->nr_hwavail = lim - (k - j);
3897
3898 done:
3899 	kring->nr_hwcur = j;
3900 	ring->avail = kring->nr_hwavail;
3901 	if (netmap_verbose)
3902 		D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags);
3903 	return 0;
3904 }
3905
3906
3907 /*
3908  * user process reading from a VALE switch.
3909  * Already protected against concurrent calls from userspace,
3910  * but we must acquire the queue's lock to protect against
3911  * writers on the same queue.
3912  */
3913 static int
3914 bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags)
3915 {
3916 	struct netmap_adapter *na = NA(ifp);
3917 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
3918 	struct netmap_ring *ring = kring->ring;
3919 	u_int j, lim = kring->nkr_num_slots - 1;
3920 	u_int k = ring->cur, resvd = ring->reserved;
3921 	int n;
3922
3923 	mtx_lock(&kring->q_lock);
3924 	if (k > lim) {
3925 		D("ouch dangerous reset!!!");
3926 		n = netmap_ring_reinit(kring);
3927 		goto done;
3928 	}
3929
3930 	/* skip past packets that userspace has released */
3931 	j = kring->nr_hwcur; /* netmap ring index */
3932 	if (resvd > 0) {
3933 		if (resvd + ring->avail >= lim + 1) {
3934 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
3935 			ring->reserved = resvd = 0; // XXX panic...
3936 		}
3937 		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
3938 	}
3939
3940 	if (j != k) { /* userspace has released some packets.
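	 * For example (the numbers are illustrative only): with a
	 * 1024-slot ring (lim = 1023), nr_hwcur = 1020, ring->cur = 2
	 * and reserved = 3, k is pulled back to 2 + 1023 + 1 - 3 = 1023,
	 * so the loop below releases slots 1020..1022 (n = 3) while the
	 * three slots just before cur stay reserved for userspace.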
	 */
3941 		n = k - j;
3942 		if (n < 0)
3943 			n += kring->nkr_num_slots;
3944 		ND("userspace releases %d packets", n);
3945 		for (n = 0; likely(j != k); n++) {
3946 			struct netmap_slot *slot = &ring->slot[j];
3947 			void *addr = BDG_NMB(na->nm_mem, slot);
3948
3949 			if (addr == netmap_buffer_base) { /* bad buf */
3950 				D("bad buffer index %d, ignore ?",
3951 					slot->buf_idx);
3952 			}
3953 			slot->flags &= ~NS_BUF_CHANGED;
3954 			j = nm_next(j, lim);
3955 		}
3956 		kring->nr_hwavail -= n;
3957 		kring->nr_hwcur = k;
3958 	}
3959 	/* tell userspace that there are new packets */
3960 	ring->avail = kring->nr_hwavail - resvd;
3961 	n = 0;
3962 done:
3963 	mtx_unlock(&kring->q_lock);
3964 	return n;
3965 }
3966
3967
3968 static int
3969 bdg_netmap_attach(struct netmap_adapter *arg)
3970 {
3971 	struct netmap_adapter na;
3972
3973 	ND("attaching virtual bridge");
3974 	bzero(&na, sizeof(na));
3975
3976 	na.ifp = arg->ifp;
3977 	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
3978 	na.num_tx_rings = arg->num_tx_rings;
3979 	na.num_rx_rings = arg->num_rx_rings;
3980 	na.num_tx_desc = arg->num_tx_desc;
3981 	na.num_rx_desc = arg->num_rx_desc;
3982 	na.nm_txsync = bdg_netmap_txsync;
3983 	na.nm_rxsync = bdg_netmap_rxsync;
3984 	na.nm_register = bdg_netmap_reg;
3985 	na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname,
3986 			na.num_tx_rings, na.num_tx_desc,
3987 			na.num_rx_rings, na.num_rx_desc);
3988 	return netmap_attach(&na, na.num_tx_rings);
3989 }
3990
3991
3992 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3993
3994
3995 /*
3996  * Module loader.
3997  *
3998  * Create the /dev/netmap device and initialize all global
3999  * variables.
4000  *
4001  * Return 0 on success, errno on failure.
4002  */
4003 static int
4004 netmap_init(void)
4005 {
4006 	int i, error;
4007
4008 	NMG_LOCK_INIT();
4009
4010 	error = netmap_mem_init();
4011 	if (error != 0) {
4012 		printf("netmap: unable to initialize the memory allocator.\n");
4013 		return (error);
4014 	}
4015 	printf("netmap: loaded module\n");
4016 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
4017 			      "netmap");
4018
4019 	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
4020 	for (i = 0; i < NM_BRIDGES; i++)
4021 		BDG_RWINIT(&nm_bridges[i]);
4022 	return (error);
4023 }
4024
4025
4026 /*
4027  * Module unloader.
4028  *
4029  * Free all the memory, and destroy the ``/dev/netmap`` device.
4030  */
4031 static void
4032 netmap_fini(void)
4033 {
4034 	destroy_dev(netmap_dev);
4035 	netmap_mem_fini();
4036 	NMG_LOCK_DESTROY();
4037 	printf("netmap: unloaded module.\n");
4038 }
4039
4040
4041 #ifdef __FreeBSD__
4042 /*
4043  * Kernel entry point.
4044  *
4045  * Initialize/finalize the module and return.
4046  *
4047  * Return 0 on success, errno on failure.
4048  */
4049 static int
4050 netmap_loader(__unused struct module *module, int event, __unused void *arg)
4051 {
4052 	int error = 0;
4053
4054 	switch (event) {
4055 	case MOD_LOAD:
4056 		error = netmap_init();
4057 		break;
4058
4059 	case MOD_UNLOAD:
4060 		netmap_fini();
4061 		break;
4062
4063 	default:
4064 		error = EOPNOTSUPP;
4065 		break;
4066 	}
4067
4068 	return (error);
4069 }
4070
4071
4072 DEV_MODULE(netmap, netmap_loader, NULL);
4073 #endif /* __FreeBSD__ */
4074
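
/*
 * Illustrative sketch of the first-pass chaining used by nm_bdg_flush()
 * above (referenced from the "append the first fragment" comment).
 * This is NOT part of the module and is kept under #if 0: the types and
 * constants below (ex_fwd, ex_q, EX_*) are simplified stand-ins for the
 * real nm_bdg_fwd/nm_bdg_q definitions; only the indexing scheme
 * (dst_port * MAXRINGS + dst_ring) and the ft_next chaining are meant
 * to mirror the code above.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_MAXPORTS	8		/* stand-in for NM_BDG_MAXPORTS */
#define EX_MAXRINGS	4		/* stand-in for NM_BDG_MAXRINGS */
#define EX_BATCH	16		/* stand-in for NM_BDG_BATCH_MAX */
#define EX_FT_NULL	EX_BATCH	/* "null" index, higher than any valid slot */

struct ex_fwd {				/* one packet of the batch */
	uint16_t ft_next;		/* next packet for the same destination */
};

struct ex_q {				/* one destination (port, ring) queue */
	uint16_t bq_head, bq_tail;
	uint16_t bq_len;
};

int
main(void)
{
	struct ex_fwd ft[EX_BATCH];
	struct ex_q dst_ents[EX_MAXPORTS * EX_MAXRINGS];
	uint16_t batch[3] = { 0, 3, 7 };	/* batch slots that hit one destination */
	uint16_t i, d_i;
	struct ex_q *d;

	for (d_i = 0; d_i < EX_MAXPORTS * EX_MAXRINGS; d_i++) {
		dst_ents[d_i].bq_head = dst_ents[d_i].bq_tail = EX_FT_NULL;
		dst_ents[d_i].bq_len = 0;
	}

	/* pretend the lookup resolved all three packets to port 2, ring 1 */
	d_i = 2 * EX_MAXRINGS + 1;
	d = &dst_ents[d_i];
	for (i = 0; i < 3; i++) {
		uint16_t slot = batch[i];

		if (d->bq_head == EX_FT_NULL) {		/* new destination */
			d->bq_head = d->bq_tail = slot;
		} else {				/* append via ft_next */
			ft[d->bq_tail].ft_next = slot;
			d->bq_tail = slot;
		}
		d->bq_len++;
	}

	/* second-pass style walk of the queue: prints 0, 3, 7 */
	for (i = d->bq_head; ; i = ft[i].ft_next) {
		printf("slot %u\n", (unsigned)i);
		if (i == d->bq_tail)
			break;
	}
	return (0);
}
#endif	/* illustrative sketch, not compiled */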