/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When adding or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

*/

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock // struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME		prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES		max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue lock instead of core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define	NM_BRIDGES		8	/* number of bridges */


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
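 *
 * For example, given the SYSCTL_INT() declaration below, the batch
 * size can be tuned at run time with something like
 *	sysctl dev.netmap.bridge_batch=512
 * (the value shown is only an example).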
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");


static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
int kern_netmap_regif(struct nmreq *nmr);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
	 * not to forward this packet. ring_nr is the source ring index,
	 * and the function may overwrite this value to forward this
	 * packet to a different ring index.
	 * This function must be set by netmap_bdgctl().
	 */
	bdg_lookup_fn_t nm_bdg_lookup;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};


/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * this is a slightly optimized copy routine which rounds
 * to multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
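 *
 * The loop below copies eight 64-bit words (one 64-byte chunk) per
 * iteration, so a length that is not a multiple of 64 is effectively
 * rounded up; this is why callers must guarantee enough room in both
 * buffers.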
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}


/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL;

	NMG_LOCK_ASSERT();

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;	/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : NULL);
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < NM_BRIDGES; i++) {
		struct nm_bridge *x = nm_bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->nm_bdg_lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			free(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
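 * Each tx kring gets one contiguous allocation laid out as:
 *	NM_BDG_BATCH_MAX  struct nm_bdg_fwd	(the ft[] work area)
 *	num_dstq          struct nm_bdg_q	(one queue per port:ring, plus broadcast)
 *	NM_BDG_BATCH_MAX  uint16_t		(the list of active destinations)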
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_tx_rings(na);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	 */

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		b->nm_bdg_lookup = NULL;
	}
}


static void
netmap_adapter_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	bzero(ifp, sizeof(*ifp));
	free(ifp, M_DEVBUF);
	na->ifp = NULL;
}


/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
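 *
 * For example (illustrative names): "vale0:1" requests port "1" on
 * switch "vale0" and is created on demand when 'create' is set, while
 * a name without the VALE prefix, say "em0", makes this function
 * return 0 with *na left NULL.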
 */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	const char *name = nmr->nr_name;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_adapter *ret;
	struct netmap_vp_adapter *vpna;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", name);
		return (create ? ENOMEM : ENXIO);
	}

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		ifp = vpna->up.ifp;
		/* XXX make sure the name only contains one : */
		if (!strcmp(NM_IFPNAME(ifp), name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", name,
				vpna->na_bdg_refcount);
			*na = (struct netmap_adapter *)vpna;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifp = ifunit_ref(name + b->bdg_namelen + 1);
	if (!ifp) { /* this is a virtual port */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}

		/* create a struct ifnet for the new port.
		 * need M_NOWAIT as we are under nma_lock
		 */
		ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ifp)
			return ENOMEM;

		strcpy(ifp->if_xname, name);
		/* bdg_netmap_attach creates a struct netmap_adapter */
		error = bdg_netmap_attach(nmr, ifp);
		if (error) {
			D("error %d", error);
			free(ifp, M_DEVBUF);
			return error;
		}
		ret = NA(ifp);
		cand2 = -1;	/* only need one port */
	} else {	/* this is a NIC */
		struct ifnet *fake_ifp;

		error = netmap_get_hw_na(ifp, &ret);
		if (error || ret == NULL)
			goto out;

		/* make sure the NIC is not already in use */
		if (NETMAP_OWNED_BY_ANY(ret)) {
			D("NIC %s busy, cannot attach to bridge",
				NM_IFPNAME(ifp));
			error = EBUSY;
			goto out;
		}
		/* create a fake interface */
		fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!fake_ifp) {
			error = ENOMEM;
			goto out;
		}
		strcpy(fake_ifp->if_xname, name);
		error = netmap_bwrap_attach(fake_ifp, ifp);
		if (error) {
			free(fake_ifp, M_DEVBUF);
			goto out;
		}
		ret = NA(fake_ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			cand2 = -1; /* only need one port */
		if_rele(ifp);
	}
	vpna = (struct netmap_vp_adapter *)ret;

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (cand2 >= 0) {
		struct netmap_vp_adapter *hostna = vpna + 1;
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", name, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = ret;
	netmap_adapter_get(ret);
	return 0;

out:
	if_rele(ifp);

	return error;
}


/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
static int
nm_bdg_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	struct netmap_if *nifp;
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna;
	int error;

	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
	if (npriv == NULL)
		return ENOMEM;

	NMG_LOCK();

	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device, or another bridge or user owns the device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (na->active_fds > 0) { /* already registered */
		error = EBUSY;
		goto unref_exit;
	}

	nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
	if (!nifp) {
		goto unref_exit;
	}

	bna = (struct netmap_bwrap_adapter*)na;
	bna->na_kpriv = npriv;
	NMG_UNLOCK();
	ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	bzero(npriv, sizeof(*npriv));
	free(npriv, M_DEVBUF);
	return error;
}


static int
nm_bdg_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;
	struct netmap_bwrap_adapter *bna;
	int last_instance;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	bna = (struct netmap_bwrap_adapter *)na;

	if (na->active_fds == 0) { /* not registered */
		error = EINVAL;
		goto unref_exit;
	}

	last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
	if (!last_instance) {
		D("--- error, trying to detach an entry with active mmaps");
		error = EINVAL;
	} else {
		struct netmap_priv_d *npriv = bna->na_kpriv;

		bna->na_kpriv = NULL;
		D("deleting priv");

		bzero(npriv, sizeof(*npriv));
		free(npriv, M_DEVBUF);
	}

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;

}


/* exported to kernel callers, e.g. OVS ?
 * Entry point.
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
{
	struct nm_bridge *b;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	struct ifnet *iter;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;

	switch (cmd) {
	case NETMAP_BDG_ATTACH:
		error = nm_bdg_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				iter = vpna->up.ifp;
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(iter->if_xname, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - nm_bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = nm_bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				vpna = b->bdg_ports[j];
				iter = vpna->up.ifp;
				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_LOOKUP_REG:
		/* register a lookup function to the given bridge.
		 * nmr->nr_name may be just bridge's name (including ':'
		 * if it is not just NM_NAME).
		 */
		if (!func) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->nm_bdg_lookup = func;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_VNET_HDR:
		/* Valid lengths for the virtio-net header are 0 (no header),
		 * 10 and 12.
		 */
		if (nmr->nr_arg1 != 0 &&
			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
			nmr->nr_arg1 != 12) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			vpna->virt_hdr_len = nmr->nr_arg1;
			if (vpna->virt_hdr_len)
				vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem);
			D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}

static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rx_rings(na);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
	struct netmap_kring *kring, u_int end)
{
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
"INDIRECT" : "DIRECT", 965 kring->name, j, ft[ft_i].ft_len); 966 buf = ft[ft_i].ft_buf = NMB_VA(0); /* the 'null' buffer */ 967 ft[ft_i].ft_len = 0; 968 ft[ft_i].ft_flags = 0; 969 } 970 __builtin_prefetch(buf); 971 ++ft_i; 972 if (slot->flags & NS_MOREFRAG) { 973 frags++; 974 continue; 975 } 976 if (unlikely(netmap_verbose && frags > 1)) 977 RD(5, "%d frags at %d", frags, ft_i - frags); 978 ft[ft_i - frags].ft_frags = frags; 979 frags = 1; 980 if (unlikely((int)ft_i >= bridge_batch)) 981 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 982 } 983 if (frags > 1) { 984 D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); 985 // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG 986 ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; 987 ft[ft_i - frags].ft_frags = frags - 1; 988 } 989 if (ft_i) 990 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 991 BDG_RUNLOCK(b); 992 return j; 993 } 994 995 996 /* ----- FreeBSD if_bridge hash function ------- */ 997 998 /* 999 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 1000 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 1001 * 1002 * http://www.burtleburtle.net/bob/hash/spooky.html 1003 */ 1004 #define mix(a, b, c) \ 1005 do { \ 1006 a -= b; a -= c; a ^= (c >> 13); \ 1007 b -= c; b -= a; b ^= (a << 8); \ 1008 c -= a; c -= b; c ^= (b >> 13); \ 1009 a -= b; a -= c; a ^= (c >> 12); \ 1010 b -= c; b -= a; b ^= (a << 16); \ 1011 c -= a; c -= b; c ^= (b >> 5); \ 1012 a -= b; a -= c; a ^= (c >> 3); \ 1013 b -= c; b -= a; b ^= (a << 10); \ 1014 c -= a; c -= b; c ^= (b >> 15); \ 1015 } while (/*CONSTCOND*/0) 1016 1017 1018 static __inline uint32_t 1019 nm_bridge_rthash(const uint8_t *addr) 1020 { 1021 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 1022 1023 b += addr[5] << 8; 1024 b += addr[4]; 1025 a += addr[3] << 24; 1026 a += addr[2] << 16; 1027 a += addr[1] << 8; 1028 a += addr[0]; 1029 1030 mix(a, b, c); 1031 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 1032 return (c & BRIDGE_RTHASH_MASK); 1033 } 1034 1035 #undef mix 1036 1037 1038 static int 1039 bdg_netmap_reg(struct netmap_adapter *na, int onoff) 1040 { 1041 struct netmap_vp_adapter *vpna = 1042 (struct netmap_vp_adapter*)na; 1043 struct ifnet *ifp = na->ifp; 1044 1045 /* the interface is already attached to the bridge, 1046 * so we only need to toggle IFCAP_NETMAP. 1047 */ 1048 BDG_WLOCK(vpna->na_bdg); 1049 if (onoff) { 1050 ifp->if_capenable |= IFCAP_NETMAP; 1051 } else { 1052 ifp->if_capenable &= ~IFCAP_NETMAP; 1053 } 1054 BDG_WUNLOCK(vpna->na_bdg); 1055 return 0; 1056 } 1057 1058 1059 /* 1060 * Lookup function for a learning bridge. 1061 * Update the hash table with the source address, 1062 * and then returns the destination port index, and the 1063 * ring in *dst_ring (at the moment, always use ring 0) 1064 */ 1065 u_int 1066 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, 1067 struct netmap_vp_adapter *na) 1068 { 1069 struct nm_hash_ent *ht = na->na_bdg->ht; 1070 uint32_t sh, dh; 1071 u_int dst, mysrc = na->bdg_port; 1072 uint64_t smac, dmac; 1073 1074 if (buf_len < 14) { 1075 RD(5, "invalid buf length %d", buf_len); 1076 return NM_BDG_NOPORT; 1077 } 1078 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 1079 smac = le64toh(*(uint64_t *)(buf + 4)); 1080 smac >>= 16; 1081 1082 /* 1083 * The hash is somewhat expensive, there might be some 1084 * worthwhile optimizations here. 
	 */
	if ((buf[6] & 1) == 0) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
			D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
				s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	*dst_ring = 0;
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}




/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->ifp->if_xname,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}

/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
	u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
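	 *
	 * The flush works in two passes: the first pass runs the lookup
	 * function on each packet and appends it to the per-destination
	 * queue (or to the broadcast queue); the second pass scans the
	 * destinations, leases slots on each rx ring and copies the
	 * packets into them.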
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;
		uint8_t *buf = ft[i].ft_buf;
		u_int len = ft[i].ft_len;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is not in the first
		   fragment nor at the very beginning of the second. */
		if (unlikely(na->virt_hdr_len > len))
			continue;
		if (len == na->virt_hdr_len) {
			buf = ft[i+1].ft_buf;
			len = ft[i+1].ft_len;
		} else {
			buf += na->virt_hdr_len;
			len -= na->virt_hdr_len;
		}
		dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is identified to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations (XXX will be modular somehow) */
	for (i = 0; i < num_dsts; i++) {
		struct ifnet *dst_ifp;
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		dst_ifp = dst_na->up.ifp;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed,
		 * and we will need to handle the leftover ones when we
		 * regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
			RD(3, "virt_hdr_mismatch, src %d len %d", na->virt_hdr_len, dst_na->virt_hdr_len);
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
"virtual" : "nic/host"); 1348 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 1349 nrings = dst_na->up.num_rx_rings; 1350 if (dst_nr >= nrings) 1351 dst_nr = dst_nr % nrings; 1352 kring = &dst_na->up.rx_rings[dst_nr]; 1353 ring = kring->ring; 1354 lim = kring->nkr_num_slots - 1; 1355 1356 retry: 1357 1358 if (dst_na->retry && retry) { 1359 /* try to get some free slot from the previous run */ 1360 dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); 1361 } 1362 /* reserve the buffers in the queue and an entry 1363 * to report completion, and drop lock. 1364 * XXX this might become a helper function. 1365 */ 1366 mtx_lock(&kring->q_lock); 1367 if (kring->nkr_stopped) { 1368 mtx_unlock(&kring->q_lock); 1369 goto cleanup; 1370 } 1371 my_start = j = kring->nkr_hwlease; 1372 howmany = nm_kr_space(kring, 1); 1373 if (needed < howmany) 1374 howmany = needed; 1375 lease_idx = nm_kr_lease(kring, howmany, 1); 1376 mtx_unlock(&kring->q_lock); 1377 1378 /* only retry if we need more than available slots */ 1379 if (retry && needed <= howmany) 1380 retry = 0; 1381 1382 /* copy to the destination queue */ 1383 while (howmany > 0) { 1384 struct netmap_slot *slot; 1385 struct nm_bdg_fwd *ft_p, *ft_end; 1386 u_int cnt; 1387 1388 /* find the queue from which we pick next packet. 1389 * NM_FT_NULL is always higher than valid indexes 1390 * so we never dereference it if the other list 1391 * has packets (and if both are empty we never 1392 * get here). 1393 */ 1394 if (next < brd_next) { 1395 ft_p = ft + next; 1396 next = ft_p->ft_next; 1397 } else { /* insert broadcast */ 1398 ft_p = ft + brd_next; 1399 brd_next = ft_p->ft_next; 1400 } 1401 cnt = ft_p->ft_frags; // cnt > 0 1402 if (unlikely(cnt > howmany)) 1403 break; /* no more space */ 1404 if (netmap_verbose && cnt > 1) 1405 RD(5, "rx %d frags to %d", cnt, j); 1406 ft_end = ft_p + cnt; 1407 if (unlikely(virt_hdr_mismatch)) { 1408 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); 1409 } else { 1410 howmany -= cnt; 1411 do { 1412 char *dst, *src = ft_p->ft_buf; 1413 size_t copy_len = ft_p->ft_len, dst_len = copy_len; 1414 1415 slot = &ring->slot[j]; 1416 dst = BDG_NMB(&dst_na->up, slot); 1417 1418 ND("send [%d] %d(%d) bytes at %s:%d", 1419 i, (int)copy_len, (int)dst_len, 1420 NM_IFPNAME(dst_ifp), j); 1421 /* round to a multiple of 64 */ 1422 copy_len = (copy_len + 63) & ~63; 1423 1424 if (unlikely(copy_len > NETMAP_BUF_SIZE || 1425 copy_len > NETMAP_BUF_SIZE)) { 1426 RD(5, "invalid len %d, down to 64", (int)copy_len); 1427 copy_len = dst_len = 64; // XXX 1428 } 1429 if (ft_p->ft_flags & NS_INDIRECT) { 1430 if (copyin(src, dst, copy_len)) { 1431 // invalid user pointer, pretend len is 0 1432 dst_len = 0; 1433 } 1434 } else { 1435 //memcpy(dst, src, copy_len); 1436 pkt_copy(src, dst, (int)copy_len); 1437 } 1438 slot->len = dst_len; 1439 slot->flags = (cnt << 8)| NS_MOREFRAG; 1440 j = nm_next(j, lim); 1441 needed--; 1442 ft_p++; 1443 } while (ft_p != ft_end); 1444 slot->flags = (cnt << 8); /* clear flag on last entry */ 1445 } 1446 /* are we done ? */ 1447 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 1448 break; 1449 } 1450 { 1451 /* current position */ 1452 uint32_t *p = kring->nkr_leases; /* shorthand */ 1453 uint32_t update_pos; 1454 int still_locked = 1; 1455 1456 mtx_lock(&kring->q_lock); 1457 if (unlikely(howmany > 0)) { 1458 /* not used all bufs. If i am the last one 1459 * i can recover the slots, otherwise must 1460 * fill them with 0 to mark empty packets. 
				 */
				ND("leftover %d bufs", howmany);
				if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
					/* yes i am the last one */
					ND("roll back nkr_hwlease to %d", j);
					kring->nkr_hwlease = j;
				} else {
					while (howmany-- > 0) {
						ring->slot[j].len = 0;
						ring->slot[j].flags = 0;
						j = nm_next(j, lim);
					}
				}
			}
			p[lease_idx] = j; /* report I am done */

			update_pos = kring->nr_hwtail;

			if (my_start == update_pos) {
				/* all slots before my_start have been reported,
				 * so scan subsequent leases to see if other ranges
				 * have been completed, and do a selwakeup or txsync.
				 */
				while (lease_idx != kring->nkr_lease_idx &&
					p[lease_idx] != NR_NOSLOT) {
					j = p[lease_idx];
					p[lease_idx] = NR_NOSLOT;
					lease_idx = nm_next(lease_idx, lim);
				}
				/* j is the new 'write' position. j != my_start
				 * means there are new buffers to report
				 */
				if (likely(j != my_start)) {
					kring->nr_hwtail = j;
					still_locked = 0;
					mtx_unlock(&kring->q_lock);
					dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
					if (dst_na->retry && retry--)
						goto retry;
				}
			}
			if (still_locked)
				mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}


static int
netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const cur = kring->rcur;

	if (bridge_batch <= 0) { /* testing only */
		done = cur; // used all
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(na, ring_nr, kring, cur);
done:
	if (done != cur)
		D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	nm_txsync_finalize(kring);
	if (netmap_verbose)
		D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
	return 0;
}


/*
 * main dispatch routine for the bridge.
 * We already know that only one thread is running this.
 * we must run nm_bdg_preflush without lock.
 */
static int
bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	return netmap_vp_txsync(vpna, ring_nr, flags);
}

static int
netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = nm_rxsync_prologue(kring);
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released.
	 */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = BDG_NMB(na, slot);

			if (addr == netmap_buffer_base) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	/* tell userspace that there are new packets */
	nm_rxsync_finalize(kring);
	n = 0;
done:
	return n;
}

/*
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync(na, ring_nr, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}


static int
bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
{
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error;
	u_int npipes = 0;

	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;

	/* bound checking */
	na->num_tx_rings = nmr->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_tx_rings = na->num_tx_rings; // write back
	na->num_rx_rings = nmr->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_rx_rings = na->num_rx_rings; // write back
	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = nmr->nr_tx_slots;
	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	/* validate number of pipes. We want at least 1,
	 * but probably can do with some more.
	 * So let's use 2 as default (when 0 is supplied)
	 */
	npipes = nmr->nr_arg1;
	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
	nmr->nr_arg1 = npipes;	/* write back */
	/* validate extra bufs */
	nm_bound_var(&nmr->nr_arg3, 0, 0,
			128*NM_BDG_MAXSLOTS, NULL);
	na->num_rx_desc = nmr->nr_rx_slots;
	vpna->virt_hdr_len = 0;
	vpna->mfs = 1514;
	/*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
		vpna->mfs = netmap_buf_size; */
	if (netmap_verbose)
		D("max frame size %u", vpna->mfs);

	na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
	na->nm_txsync = bdg_netmap_txsync;
	na->nm_rxsync = bdg_netmap_rxsync;
	na->nm_register = bdg_netmap_reg;
	na->nm_dtor = netmap_adapter_vp_dtor;
	na->nm_krings_create = netmap_vp_krings_create;
	na->nm_krings_delete = netmap_vp_krings_delete;
	na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			nmr->nr_arg3, npipes, &error);
	if (na->nm_mem == NULL)
		goto err;
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error)
		goto err;
	return 0;

err:
	if (na->nm_mem != NULL)
		netmap_mem_private_delete(na->nm_mem);
	free(vpna, M_DEVBUF);
	return error;
}


static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct nm_bridge *b = bna->up.na_bdg,
		*bh = bna->host.na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("na %p", na);

	if (b) {
		netmap_bdg_detach_common(b, bna->up.bdg_port,
			(bh ? bna->host.bdg_port : -1));
	}

	hwna->na_private = NULL;
	netmap_adapter_put(hwna);

	bzero(ifp, sizeof(*ifp));
	free(ifp, M_DEVBUF);
	na->ifp = NULL;

}


/*
 * Intr callback for NICs connected to a bridge.
 * Simply ignore tx interrupts (maybe we could try to recover space ?)
 * and pass received packets from nic to the bridge.
 *
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC or the host stack.
 * The only part of the ring that is significant are the slots,
 * and head/cur/tail are set from the kring as needed
 * (part as a receive ring, part as a transmit ring).
 *
 * callback that overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are put on an hwna rx ring.
 * The bridge wrapper then sends the packets through the bridge.
 */
static int
netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
{
	struct ifnet *ifp = na->ifp;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_vp_adapter *hostna = &bna->host;
	struct netmap_kring *kring, *bkring;
	struct netmap_ring *ring;
	int is_host_ring = ring_nr == na->num_rx_rings;
	struct netmap_vp_adapter *vpna = &bna->up;
	int error = 0;

	if (netmap_verbose)
		D("%s %s%d 0x%x", NM_IFPNAME(ifp),
			(tx == NR_TX ? "TX" : "RX"), ring_nr, flags);

	if (flags & NAF_DISABLE_NOTIFY) {
		kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
		bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
		if (kring[ring_nr].nkr_stopped)
			netmap_disable_ring(&bkring[ring_nr]);
		else
			bkring[ring_nr].nkr_stopped = 0;
		return 0;
	}

	if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
		return 0;

	/* we only care about receive interrupts */
	if (tx == NR_TX)
		return 0;

	kring = &na->rx_rings[ring_nr];
	ring = kring->ring;

	/* make sure the ring is not disabled */
	if (nm_kr_tryget(kring))
		return 0;

	if (is_host_ring && hostna->na_bdg == NULL) {
		error = bna->save_notify(na, ring_nr, tx, flags);
		goto put_out;
	}

	/* Here we expect ring->head = ring->cur = ring->tail
	 * because everything has been released from the previous round.
	 * However the ring is shared and we might have info from
	 * the wrong side (the tx ring). Hence we overwrite with
	 * the info from the rx kring.
	 */
	if (netmap_verbose)
		D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp),
			ring->head, ring->cur, ring->tail,
			kring->rhead, kring->rcur, kring->rtail);

	ring->head = kring->rhead;
	ring->cur = kring->rcur;
	ring->tail = kring->rtail;

	if (is_host_ring) {
		vpna = hostna;
		ring_nr = 0;
	}
	/* simulate a user wakeup on the rx ring */
	/* fetch packets that have arrived.
	 * XXX maybe do this in a loop ?
	 */
	error = kring->nm_sync(kring, 0);
	if (error)
		goto put_out;
	if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
		D("how strange, interrupt with no packets on %s",
			NM_IFPNAME(ifp));
		goto put_out;
	}

	/* new packets are ring->cur to ring->tail, and the bkring
	 * had hwcur == ring->cur. So advance ring->cur to ring->tail
	 * to push all packets out.
	 */
	ring->head = ring->cur = ring->tail;

	/* also set tail to what the bwrap expects */
	bkring = &vpna->up.tx_rings[ring_nr];
	ring->tail = bkring->nr_hwtail; // rtail too ?

	/* pass packets to the switch */
	nm_txsync_prologue(bkring); // XXX error checking ?
	netmap_vp_txsync(vpna, ring_nr, flags);

	/* mark all buffers as released on this ring */
	ring->head = ring->cur = kring->nr_hwtail;
	ring->tail = kring->rtail;
	/* another call to actually release the buffers */
	if (!is_host_ring) {
		error = kring->nm_sync(kring, 0);
	} else {
		/* mark all packets as released, as in the
		 * second part of netmap_rxsync_from_host()
		 */
		kring->nr_hwcur = kring->nr_hwtail;
		nm_rxsync_finalize(kring);
	}

put_out:
	nm_kr_put(kring);
	return error;
}


static int
netmap_bwrap_register(struct netmap_adapter *na, int onoff)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_vp_adapter *hostna = &bna->host;
	int error;

	ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");

	if (onoff) {
		int i;

		hwna->na_lut = na->na_lut;
		hwna->na_lut_objtotal = na->na_lut_objtotal;

		if (hostna->na_bdg) {
			hostna->up.na_lut = na->na_lut;
			hostna->up.na_lut_objtotal = na->na_lut_objtotal;
		}

		/* cross-link the netmap rings
		 * The original number of rings comes from hwna,
		 * rx rings on one side equals tx rings on the other.
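		 * The "+ 1" in the loops below also cross-links the
		 * host rings, which sit right after the hardware rings
		 * in the tx_rings/rx_rings arrays.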
		 */
		for (i = 0; i < na->num_rx_rings + 1; i++) {
			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
		}
		for (i = 0; i < na->num_tx_rings + 1; i++) {
			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
		}
	}

	if (hwna->ifp) {
		error = hwna->nm_register(hwna, onoff);
		if (error)
			return error;
	}

	bdg_netmap_reg(na, onoff);

	if (onoff) {
		bna->save_notify = hwna->nm_notify;
		hwna->nm_notify = netmap_bwrap_intr_notify;
	} else {
		hwna->nm_notify = bna->save_notify;
		hwna->na_lut = NULL;
		hwna->na_lut_objtotal = 0;
	}

	return 0;
}


static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
	u_int *rxr, u_int *rxd)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	/* forward the request */
	netmap_update_config(hwna);
	/* swap the results */
	*txr = hwna->num_rx_rings;
	*txd = hwna->num_rx_desc;
	*rxr = hwna->num_tx_rings;
	*rxd = hwna->num_tx_desc;

	return 0;
}


static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_adapter *hostna = &bna->host.up;
	int error;

	ND("%s", NM_IFPNAME(na->ifp));

	error = netmap_vp_krings_create(na);
	if (error)
		return error;

	error = hwna->nm_krings_create(hwna);
	if (error) {
		netmap_vp_krings_delete(na);
		return error;
	}

	if (na->na_flags & NAF_HOST_RINGS) {
		hostna->tx_rings = na->tx_rings + na->num_tx_rings;
		hostna->rx_rings = na->rx_rings + na->num_rx_rings;
	}

	return 0;
}


static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("%s", NM_IFPNAME(na->ifp));

	hwna->nm_krings_delete(hwna);
	netmap_vp_krings_delete(na);
}


/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_kring *kring, *hw_kring;
	struct netmap_ring *ring;
	u_int lim;
	int error = 0;

	if (tx == NR_TX)
		return EINVAL;

	kring = &na->rx_rings[ring_n];
	hw_kring = &hwna->tx_rings[ring_n];
	ring = kring->ring;
	lim = kring->nkr_num_slots - 1;

	if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
		return 0;
	mtx_lock(&kring->q_lock);
	/* first step: simulate a user wakeup on the rx ring */
	netmap_vp_rxsync(na, ring_n, flags);
	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		NM_IFPNAME(na->ifp), ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		ring->head, ring->cur, ring->tail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	/* second step: the simulated user consumes all new packets
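	 * by moving head and cur past everything the rxsync reported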
	ring->head = ring->cur = ring->tail;

	/* third step: the new packets are sent on the tx ring
	 * (which is actually the same ring)
	 */
	/* set tail to what the hw expects */
	ring->tail = hw_kring->rtail;
	nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
	error = hw_kring->nm_sync(hw_kring, flags);

	/* fourth step: now we are back on the rx ring */
	/* claim ownership on all hw owned bufs */
	ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
	ring->tail = kring->rtail; /* restore saved value of tail, for safety */

	/* fifth step: the user goes to sleep again, causing another rxsync */
	netmap_vp_rxsync(na, ring_n, flags);
	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		NM_IFPNAME(na->ifp), ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		ring->head, ring->cur, ring->tail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	mtx_unlock(&kring->q_lock);
	return error;
}


/* notify method for the bridge-->host rings path */
static int
netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_adapter *port_na = &bna->up.up;

	if (tx == NR_TX || ring_n != 0)
		return EINVAL;
	return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
}


/* attach a bridge wrapper to the 'real' device */
static int
netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na;
	struct netmap_adapter *hwna = NA(real);
	struct netmap_adapter *hostna;
	int error;

	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (bna == NULL)
		return ENOMEM;

	na = &bna->up.up;
	na->ifp = fake;
	/* fill the ring data for the bwrap adapter with rx/tx meanings
	 * swapped. The real cross-linking will be done during register,
	 * when all the krings will have been created.
	 */
	na->num_rx_rings = hwna->num_tx_rings;
	na->num_tx_rings = hwna->num_rx_rings;
	na->num_tx_desc = hwna->num_rx_desc;
	na->num_rx_desc = hwna->num_tx_desc;
	na->nm_dtor = netmap_bwrap_dtor;
	na->nm_register = netmap_bwrap_register;
	// na->nm_txsync = netmap_bwrap_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_config = netmap_bwrap_config;
	na->nm_krings_create = netmap_bwrap_krings_create;
	na->nm_krings_delete = netmap_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	na->nm_mem = hwna->nm_mem;
	na->na_private = na; /* prevent NIOCREGIF */
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */

	bna->hwna = hwna;
	netmap_adapter_get(hwna);
	hwna->na_private = bna; /* weak reference */

	if (hwna->na_flags & NAF_HOST_RINGS) {
		na->na_flags |= NAF_HOST_RINGS;
		hostna = &bna->host.up;
		hostna->ifp = hwna->ifp;
		hostna->num_tx_rings = 1;
		hostna->num_tx_desc = hwna->num_rx_desc;
		hostna->num_rx_rings = 1;
		hostna->num_rx_desc = hwna->num_tx_desc;
		// hostna->nm_txsync = netmap_bwrap_host_txsync;
		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
		hostna->nm_notify = netmap_bwrap_host_notify;
		hostna->nm_mem = na->nm_mem;
		hostna->na_private = bna;
	}

	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
		fake->if_xname, real->if_xname,
		na->num_tx_rings, na->num_tx_desc,
		na->num_rx_rings, na->num_rx_desc);

	error = netmap_attach_common(na);
	if (error) {
		netmap_adapter_put(hwna);
		free(bna, M_DEVBUF);
		return error;
	}
	return 0;
}


void
netmap_init_bridges(void)
{
	int i;
	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
	for (i = 0; i < NM_BRIDGES; i++)
		BDG_RWINIT(&nm_bridges[i]);
}
#endif /* WITH_VALE */
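/*
 * Illustrative user-space sketch (not part of this module): a physical
 * interface is normally attached to a VALE switch from user space, and
 * that control path is what ends up calling netmap_bwrap_attach() and
 * netmap_bwrap_register() above.  The switch and interface names below
 * ("vale0", "em0") are only examples.
 *
 *	struct nmreq nmr;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;		// attach em0 to switch vale0
 *	ioctl(fd, NIOCREGIF, &nmr);		// kernel side creates the bwrap
 *	...
 *	nmr.nr_cmd = NETMAP_BDG_DETACH;		// later, detach it again
 *	ioctl(fd, NIOCREGIF, &nmr);
 */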