/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
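/*
 * Usage sketch (illustrative only, not part of the module): a VALE
 * port comes into existence simply by opening a name carrying the
 * NM_NAME ("vale") prefix; the switch name "vale0" and port name "p0"
 * below are arbitrary examples.
 *
 *	#define NETMAP_WITH_LIBS
 *	#include <net/netmap_user.h>
 *
 *	int main(void)
 *	{
 *		// creates switch vale0 (if needed) and ephemeral port p0;
 *		// rings and buffers live in the port's own memory region
 *		struct nm_desc *d = nm_open("vale0:p0", NULL, 0, NULL);
 *
 *		if (d == NULL)
 *			return 1;
 *		// ... use NETMAP_TXRING()/NETMAP_RXRING() as usual ...
 *		nm_close(d);
 *		return 0;
 *	}
 */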
/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h>		/* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>		/* defines used in kernel.h */
#include <sys/kernel.h>		/* types used in module initialization */
#include <sys/conf.h>		/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h>		/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME	prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES	max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue lock instead of core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define	NM_BRIDGES		8	/* number of bridges */


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");
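/*
 * Example (illustrative sketch): since bridge_batch is exported with
 * CTLFLAG_RW it can be tuned at runtime, e.g. from userspace with
 * sysctlbyname(); the value 256 below is arbitrary.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *
 *	static int
 *	set_bridge_batch(int batch)	// returns 0 on success
 *	{
 *		return sysctlbyname("dev.netmap.bridge_batch",
 *		    NULL, NULL, &batch, sizeof(batch));
 *	}
 */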
static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non-blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
	 * not to forward this packet. ring_nr is the source ring index,
	 * and the function may overwrite this value to forward this
	 * packet to a different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	struct netmap_bdg_ops bdg_ops;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};

const char*
netmap_bdg_name(struct netmap_vp_adapter *vp)
{
	struct nm_bridge *b = vp->na_bdg;
	if (b == NULL)
		return NULL;
	return b->bdg_basename;
}


/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * this is a slightly optimized copy routine which rounds
 * to multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}
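/*
 * Usage note (illustrative): pkt_copy() copies in 64-byte units, so
 * both buffers need room up to the next multiple of 64 even for
 * shorter frames:
 *
 *	char src[64], dst[64];
 *	pkt_copy(src, dst, 60);		// actually copies 64 bytes
 *
 * nm_bdg_flush() below rounds the length the same way before calling
 * it, and lengths >= 1024 simply fall through to memcpy().
 */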
/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL;

	NMG_LOCK_ASSERT();

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;	/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < NM_BRIDGES; i++) {
		struct nm_bridge *x = nm_bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->bdg_ops.lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
	}
	return b;
}
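/*
 * Worked example (illustrative): "vale1:eth0" selects (or, with
 * create != 0, creates) the bridge named "vale1"; everything after
 * the ':' names the port. "vale1" alone (no ':') also selects bridge
 * "vale1". Names shorter than NM_NAME are rejected above, and names
 * without the prefix are filtered out by the callers.
 */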
/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			free(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_tx_rings(na);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


/* remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed)
 */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	*/

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	if (b->bdg_ops.dtor)
		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
	}
}

/* nm_bdg_ctl callback for VALE ports */
static int
netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	if (attach)
		return 0; /* nothing to do */
	if (b) {
		netmap_set_all_rings(na, 0 /* disable */);
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
		vpna->na_bdg = NULL;
		netmap_set_all_rings(na, 1 /* enable */);
	}
	/* we took the reference just for the attach */
	netmap_adapter_put(na);
	return 0;
}

/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;

	ND("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}
}

/* nm_dtor callback for persistent VALE ports */
static void
netmap_persist_vp_dtor(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

	netmap_vp_dtor(na);
	na->ifp = NULL;
	nm_vi_detach(ifp);
}

/* remove a persistent VALE port from the system */
static int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	if (NA(ifp)->na_refcount > 1) {
		error = EBUSY;
		goto err;
	}
	NMG_UNLOCK();

	D("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires all the references are released
	 * before unregister
	 */
	if_rele(ifp);
	netmap_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
static int
nm_vi_create(struct nmreq *nmr)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	/* don't include VALE prefix */
	if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME)))
		return EINVAL;
	ifp = ifunit_ref(nmr->nr_name);
	if (ifp) { /* already exists, cannot create new one */
		if_rele(ifp);
		return EEXIST;
	}
	error = nm_vi_persist(nmr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	/* netmap_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vp_create(nmr, ifp, &vpna);
	if (error) {
		D("error %d", error);
		NMG_UNLOCK();
		nm_vi_detach(ifp);
		return error;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	vpna->up.nm_dtor = netmap_persist_vp_dtor;
	netmap_adapter_get(&vpna->up);
	NMG_UNLOCK();
	D("created %s", ifp->if_xname);
	return 0;
}

/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller only holds a reference when
 * (*na != NULL && return == 0).
 */
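/*
 * Caller pattern (illustrative sketch): since a 0 return does not by
 * itself mean that a reference was taken, callers check both the
 * error code and *na:
 *
 *	struct netmap_adapter *na;
 *	int error;
 *
 *	NMG_LOCK();
 *	error = netmap_get_bdg_na(nmr, &na, 0);	// don't create
 *	if (!error && na != NULL) {
 *		// ... use the port ...
 *		netmap_adapter_put(na);		// drop the reference
 *	}
 *	NMG_UNLOCK();
 */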
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	char *nr_name = nmr->nr_name;
	const char *ifname;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_vp_adapter *vpna, *hostna = NULL;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(nr_name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", nr_name);
		return (create ? ENOMEM : ENXIO);
	}
	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
		panic("x");

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		D("checking %s", vpna->up.name);
		if (!strcmp(vpna->up.name, nr_name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", nr_name,
				vpna->up.na_refcount);
			*na = &vpna->up;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ifname = nr_name + b->bdg_namelen + 1;
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifp = ifunit_ref(ifname);
	if (!ifp) {
		/* Create an ephemeral virtual port.
		 * This block contains all the ephemeral-specific logic.
		 */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}

		/* netmap_vp_create creates a struct netmap_adapter */
		error = netmap_vp_create(nmr, NULL, &vpna);
		if (error) {
			D("error %d", error);
			free(ifp, M_DEVBUF);
			return error;
		}
		/* shortcut - we can skip get_hw_na(),
		 * ownership check and nm_bdg_attach()
		 */
	} else {
		struct netmap_adapter *hw;

		error = netmap_get_hw_na(ifp, &hw);
		if (error || hw == NULL)
			goto out;

		/* host adapter might not be created */
		error = hw->nm_bdg_attach(nr_name, hw);
		if (error)
			goto out;
		vpna = hw->na_vp;
		hostna = hw->na_hostvp;
		if_rele(ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			hostna = NULL;
	}

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (hostna != NULL) {
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", ifname, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = &vpna->up;
	netmap_adapter_get(*na);
	return 0;

out:
	if_rele(ifp);

	return error;
}
/* Process NETMAP_BDG_ATTACH */
static int
nm_bdg_ctl_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();

	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(na, nmr, 1);
		if (error)
			goto unref_exit;
		ND("registered %s to netmap-mode", na->name);
	}
	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}


/* process NETMAP_BDG_DETACH */
static int
nm_bdg_ctl_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (na->nm_bdg_ctl) {
		/* remove the port from bridge. The bwrap
		 * also needs to put the hwna in normal mode
		 */
		error = na->nm_bdg_ctl(na, nmr, 0);
	}

	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}
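/*
 * Example (illustrative sketch): these handlers back the
 * NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH requests that a tool such
 * as vale-ctl issues through the NIOCREGIF ioctl, roughly as follows
 * (fd is an open /dev/netmap descriptor; names are examples):
 *
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *	#include <net/netmap.h>
 *
 *	static int
 *	bdg_attach(int fd)	// attach NIC em0 to switch vale0
 *	{
 *		struct nmreq nmr;
 *
 *		memset(&nmr, 0, sizeof(nmr));
 *		nmr.nr_version = NETMAP_API;
 *		strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name) - 1);
 *		nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *		nmr.nr_arg1 = NETMAP_BDG_HOST;	// also attach host rings
 *		return ioctl(fd, NIOCREGIF, &nmr);
 *	}
 */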
/* Called by either user's context (netmap_ioctl())
 * or external kernel modules (e.g., Openvswitch).
 * Operation is indicated in nmr->nr_cmd.
 * NETMAP_BDG_REGOPS, which sets the config/lookup/dtor functions of the
 * bridge, requires the bdg_ops argument; the other commands ignore it.
 *
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
{
	struct nm_bridge *b;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;

	switch (cmd) {
	case NETMAP_BDG_NEWIF:
		error = nm_vi_create(nmr);
		break;

	case NETMAP_BDG_DELIF:
		error = nm_vi_destroy(nmr->nr_name);
		break;

	case NETMAP_BDG_ATTACH:
		error = nm_bdg_ctl_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_ctl_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			name = name + b->bdg_namelen + 1;
			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(vpna->up.name, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - nm_bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = nm_bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				vpna = b->bdg_ports[j];
				strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
		/* register callbacks to the given bridge.
		 * nmr->nr_name may be just bridge's name (including ':'
		 * if it is not just NM_NAME).
		 */
		if (!bdg_ops) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->bdg_ops = *bdg_ops;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_VNET_HDR:
		/* Valid lengths for the virtio-net header are 0 (no header),
		   10 and 12. */
		if (nmr->nr_arg1 != 0 &&
			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
			nmr->nr_arg1 != 12) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			vpna->virt_hdr_len = nmr->nr_arg1;
			if (vpna->virt_hdr_len)
				vpna->mfs = NETMAP_BUF_SIZE(na);
			D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}
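/*
 * Example (illustrative sketch): userspace enumerates switches and
 * ports by iterating NETMAP_BDG_LIST with an empty name (this is what
 * vale-ctl -l does, via the NIOCGINFO ioctl), bumping nr_arg2 until
 * an error marks the end:
 *
 *	memset(&nmr, 0, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	nmr.nr_cmd = NETMAP_BDG_LIST;
 *	while (ioctl(fd, NIOCGINFO, &nmr) == 0) {
 *		printf("bridge %d port %d: %s\n",
 *		    nmr.nr_arg1, nmr.nr_arg2, nmr.nr_name);
 *		nmr.nr_name[0] = '\0';	// keep scanning by index
 *		nmr.nr_arg2++;		// next port candidate
 *	}
 */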
int
netmap_bdg_config(struct nmreq *nmr)
{
	struct nm_bridge *b;
	int error = EINVAL;

	NMG_LOCK();
	b = nm_find_bridge(nmr->nr_name, 0);
	if (!b) {
		NMG_UNLOCK();
		return error;
	}
	NMG_UNLOCK();
	/* Don't call config() with NMG_LOCK() held */
	BDG_RLOCK(b);
	if (b->bdg_ops.config != NULL)
		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
	BDG_RUNLOCK(b);
	return error;
}


/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rx_rings(na);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


/* nm_krings_delete callback for VALE ports. */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
		// ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags - 1;
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
	a -= b; a -= c; a ^= (c >> 13);                                 \
	b -= c; b -= a; b ^= (a << 8);                                  \
	c -= a; c -= b; c ^= (b >> 13);                                 \
	a -= b; a -= c; a ^= (c >> 12);                                 \
	b -= c; b -= a; b ^= (a << 16);                                 \
	c -= a; c -= b; c ^= (b >> 5);                                  \
	a -= b; a -= c; a ^= (c >> 3);                                  \
	b -= c; b -= a; b ^= (a << 10);                                 \
	c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
	return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


/* nm_register callback for VALE ports */
static int
netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;

	/* persistent ports may be put in netmap mode
	 * before being attached to a bridge
	 */
	if (vpna->na_bdg)
		BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		na->na_flags |= NAF_NETMAP_ON;
		/* XXX on FreeBSD, persistent VALE ports should also
		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
		 */
	} else {
		na->na_flags &= ~NAF_NETMAP_ON;
	}
	if (vpna->na_bdg)
		BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address, then return the
 * destination port index, and the ring in *dst_ring (at the moment,
 * always ring 0).
 */
u_int
netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		const struct netmap_vp_adapter *na)
{
	uint8_t *buf = ft->ft_buf;
	u_int buf_len = ft->ft_len;
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;

	/* safety check, unfortunately we have many cases */
	if (buf_len >= 14 + na->virt_hdr_len) {
		/* virthdr + mac_hdr in the same slot */
		buf += na->virt_hdr_len;
		buf_len -= na->virt_hdr_len;
	} else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
		/* only header in first fragment */
		ft++;
		buf = ft->ft_buf;
		buf_len = ft->ft_len;
	} else {
		RD(5, "invalid buf format, length %d", buf_len);
		return NM_BDG_NOPORT;
	}
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if ((buf[6] & 1) == 0) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
			D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
				s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) { /* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	*dst_ring = 0;
	return dst;
}
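/*
 * Extension sketch (illustrative): netmap_bdg_learning() is only the
 * default; an external module can install its own lookup through
 * netmap_bdg_ctl() with NETMAP_BDG_REGOPS. A minimal fixed mapping
 * that sends every packet to port 1 on ring 0:
 *
 *	static u_int
 *	my_lookup(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
 *		const struct netmap_vp_adapter *na)
 *	{
 *		*dst_ring = 0;
 *		return 1;	// fixed destination port
 *	}
 *
 *	struct netmap_bdg_ops ops = { .lookup = my_lookup };
 *	// nmr names the target bridge and has nr_cmd = NETMAP_BDG_REGOPS
 *	error = netmap_bdg_ctl(nmr, &ops);
 */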
/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}


/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}
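/*
 * Usage pattern (illustrative sketch of the protocol followed by
 * nm_bdg_flush() below): a writer claims slots under the queue lock,
 * copies without holding it, then reports completion under the lock
 * again.
 *
 *	mtx_lock(&kring->q_lock);
 *	my_start = j = kring->nkr_hwlease;	// where we will write
 *	howmany = nm_kr_space(kring, 1);	// slots we may claim
 *	lease_idx = nm_kr_lease(kring, howmany, 1);
 *	mtx_unlock(&kring->q_lock);
 *
 *	// ... copy packets into slots my_start .. my_start+howmany-1 ...
 *
 *	mtx_lock(&kring->q_lock);
 *	kring->nkr_leases[lease_idx] = j;	// report completion
 *	// the holder of the oldest lease then advances nr_hwtail
 *	mtx_unlock(&kring->q_lock);
 */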
/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
	u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is not in the
		 * first fragment nor at the very beginning of the second.
		 */
		if (unlikely(na->virt_hdr_len > ft[i].ft_len))
			continue;
		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
			!b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed,
		 * and we will need to handle the leftover ones when we
		 * regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
			RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len);
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * dst_na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
			/* actually useful only for bwraps, since there
			 * the notify will trigger a txsync on the hwna. VALE ports
			 * have dst_na->retry == 0
			 */
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
				break; /* no more space */
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;

					slot = &ring->slot[j];
					dst = NMB(&dst_na->up, slot);

					ND("send [%d] %d(%d) bytes at %s:%d",
						i, (int)copy_len, (int)dst_len,
						NM_IFPNAME(dst_ifp), j);
					/* round to a multiple of 64 */
					copy_len = (copy_len + 63) & ~63;

					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
						copy_len > NETMAP_BUF_SIZE(&na->up))) {
						RD(5, "invalid len %d, down to 64", (int)copy_len);
						copy_len = dst_len = 64; // XXX
					}
					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy(src, dst, (int)copy_len);
					}
					slot->len = dst_len;
					slot->flags = (cnt << 8) | NS_MOREFRAG;
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear flag on last entry */
			}
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
			/* current position */
			uint32_t *p = kring->nkr_leases; /* shorthand */
			uint32_t update_pos;
			int still_locked = 1;

			mtx_lock(&kring->q_lock);
			if (unlikely(howmany > 0)) {
				/* we have not used all the buffers. If I am the
				 * last one, I can recover the slots, otherwise
				 * I must fill them with zero-length packets to
				 * mark them as empty.
				 */
				ND("leftover %d bufs", howmany);
				if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
					/* yes i am the last one */
					ND("roll back nkr_hwlease to %d", j);
					kring->nkr_hwlease = j;
				} else {
					while (howmany-- > 0) {
						ring->slot[j].len = 0;
						ring->slot[j].flags = 0;
						j = nm_next(j, lim);
					}
				}
			}
			p[lease_idx] = j; /* report I am done */

			update_pos = kring->nr_hwtail;

			if (my_start == update_pos) {
				/* all slots before my_start have been reported,
				 * so scan subsequent leases to see if other ranges
				 * have been completed, and do a selwakeup or txsync.
				 */
				while (lease_idx != kring->nkr_lease_idx &&
					p[lease_idx] != NR_NOSLOT) {
					j = p[lease_idx];
					p[lease_idx] = NR_NOSLOT;
					lease_idx = nm_next(lease_idx, lim);
				}
				/* j is the new 'write' position. j != my_start
				 * means there are new buffers to report
				 */
				if (likely(j != my_start)) {
					kring->nr_hwtail = j;
					still_locked = 0;
					mtx_unlock(&kring->q_lock);
					dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
					/* this is netmap_notify for VALE ports and
					 * netmap_bwrap_notify for bwrap. The latter will
					 * trigger a txsync on the underlying hwna
					 */
					if (dst_na->retry && retry--) {
						/* XXX this is going to call nm_notify again.
						 * Only useful for bwrap in virtual machines
						 */
						goto retry;
					}
				}
			}
			if (still_locked)
				mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}

/* nm_txsync callback for VALE ports */
static int
netmap_vp_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter *)kring->na;
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const cur = kring->rcur;

	if (bridge_batch <= 0) { /* testing only */
		done = cur; // used all
		goto done;
	}
	if (!na->na_bdg) {
		done = cur;
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(kring, cur);
done:
	if (done != cur)
		D("early break at %d/%d, tail %d", done, cur, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	nm_txsync_finalize(kring);
	if (netmap_verbose)
		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
	return 0;
}


/* rxsync code used by the VALE ports' nm_rxsync callback and also
 * internally by the bwrap
 */
static int
netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = nm_rxsync_prologue(kring);
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = NMB(na, slot);

			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	/* tell userspace that there are new packets */
	nm_rxsync_finalize(kring);
	n = 0;
done:
	return n;
}

/*
 * nm_rxsync callback for VALE ports
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
netmap_vp_rxsync(struct netmap_kring *kring, int flags)
{
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync_locked(kring, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}
1748 */ 1749 static int 1750 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na) 1751 { 1752 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; 1753 1754 if (vpna->na_bdg) 1755 return EBUSY; 1756 na->na_vp = vpna; 1757 strncpy(na->name, name, sizeof(na->name)); 1758 na->na_hostvp = NULL; 1759 return 0; 1760 } 1761 1762 /* create a netmap_vp_adapter that describes a VALE port. 1763 * Only persistent VALE ports have a non-null ifp. 1764 */ 1765 static int 1766 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret) 1767 { 1768 struct netmap_vp_adapter *vpna; 1769 struct netmap_adapter *na; 1770 int error; 1771 u_int npipes = 0; 1772 1773 vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO); 1774 if (vpna == NULL) 1775 return ENOMEM; 1776 1777 na = &vpna->up; 1778 1779 na->ifp = ifp; 1780 strncpy(na->name, nmr->nr_name, sizeof(na->name)); 1781 1782 /* bound checking */ 1783 na->num_tx_rings = nmr->nr_tx_rings; 1784 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1785 nmr->nr_tx_rings = na->num_tx_rings; // write back 1786 na->num_rx_rings = nmr->nr_rx_rings; 1787 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1788 nmr->nr_rx_rings = na->num_rx_rings; // write back 1789 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1790 1, NM_BDG_MAXSLOTS, NULL); 1791 na->num_tx_desc = nmr->nr_tx_slots; 1792 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1793 1, NM_BDG_MAXSLOTS, NULL); 1794 /* validate number of pipes. We want at least 1, 1795 * but probably can do with some more. 1796 * So let's use 2 as default (when 0 is supplied) 1797 */ 1798 npipes = nmr->nr_arg1; 1799 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); 1800 nmr->nr_arg1 = npipes; /* write back */ 1801 /* validate extra bufs */ 1802 nm_bound_var(&nmr->nr_arg3, 0, 0, 1803 128*NM_BDG_MAXSLOTS, NULL); 1804 na->num_rx_desc = nmr->nr_rx_slots; 1805 vpna->virt_hdr_len = 0; 1806 vpna->mfs = 1514; 1807 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 1808 vpna->mfs = netmap_buf_size; */ 1809 if (netmap_verbose) 1810 D("max frame size %u", vpna->mfs); 1811 1812 na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; 1813 na->nm_txsync = netmap_vp_txsync; 1814 na->nm_rxsync = netmap_vp_rxsync; 1815 na->nm_register = netmap_vp_reg; 1816 na->nm_krings_create = netmap_vp_krings_create; 1817 na->nm_krings_delete = netmap_vp_krings_delete; 1818 na->nm_dtor = netmap_vp_dtor; 1819 na->nm_mem = netmap_mem_private_new(na->name, 1820 na->num_tx_rings, na->num_tx_desc, 1821 na->num_rx_rings, na->num_rx_desc, 1822 nmr->nr_arg3, npipes, &error); 1823 if (na->nm_mem == NULL) 1824 goto err; 1825 na->nm_bdg_attach = netmap_vp_bdg_attach; 1826 /* other nmd fields are set in the common routine */ 1827 error = netmap_attach_common(na); 1828 if (error) 1829 goto err; 1830 *ret = vpna; 1831 return 0; 1832 1833 err: 1834 if (na->nm_mem != NULL) 1835 netmap_mem_private_delete(na->nm_mem); 1836 free(vpna, M_DEVBUF); 1837 return error; 1838 } 1839 1840 /* Bridge wrapper code (bwrap). 1841 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a 1842 * VALE switch. 1843 * The main task is to swap the meaning of tx and rx rings to match the 1844 * expectations of the VALE switch code (see nm_bdg_flush). 1845 * 1846 * The bwrap works by interposing a netmap_bwrap_adapter between the 1847 * rest of the system and the hwna. 
The netmap_bwrap_adapter looks like 1848 * a netmap_vp_adapter to the rest the system, but, internally, it 1849 * translates all callbacks to what the hwna expects. 1850 * 1851 * Note that we have to intercept callbacks coming from two sides: 1852 * 1853 * - callbacks coming from the netmap module are intercepted by 1854 * passing around the netmap_bwrap_adapter instead of the hwna 1855 * 1856 * - callbacks coming from outside of the netmap module only know 1857 * about the hwna. This, however, only happens in interrupt 1858 * handlers, where only the hwna->nm_notify callback is called. 1859 * What the bwrap does is to overwrite the hwna->nm_notify callback 1860 * with its own netmap_bwrap_intr_notify. 1861 * XXX This assumes that the hwna->nm_notify callback was the 1862 * standard netmap_notify(), as it is the case for nic adapters. 1863 * Any additional action performed by hwna->nm_notify will not be 1864 * performed by netmap_bwrap_intr_notify. 1865 * 1866 * Additionally, the bwrap can optionally attach the host rings pair 1867 * of the wrapped adapter to a different port of the switch. 1868 */ 1869 1870 1871 static void 1872 netmap_bwrap_dtor(struct netmap_adapter *na) 1873 { 1874 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; 1875 struct netmap_adapter *hwna = bna->hwna; 1876 1877 ND("na %p", na); 1878 /* drop reference to hwna->ifp. 1879 * If we don't do this, netmap_detach_common(na) 1880 * will think it has set NA(na->ifp) to NULL 1881 */ 1882 na->ifp = NULL; 1883 /* for safety, also drop the possible reference 1884 * in the hostna 1885 */ 1886 bna->host.up.ifp = NULL; 1887 1888 hwna->nm_mem = bna->save_nmd; 1889 hwna->na_private = NULL; 1890 hwna->na_vp = hwna->na_hostvp = NULL; 1891 hwna->na_flags &= ~NAF_BUSY; 1892 netmap_adapter_put(hwna); 1893 1894 } 1895 1896 1897 /* 1898 * Intr callback for NICs connected to a bridge. 1899 * Simply ignore tx interrupts (maybe we could try to recover space ?) 1900 * and pass received packets from nic to the bridge. 1901 * 1902 * XXX TODO check locking: this is called from the interrupt 1903 * handler so we should make sure that the interface is not 1904 * disconnected while passing down an interrupt. 1905 * 1906 * Note, no user process can access this NIC or the host stack. 1907 * The only part of the ring that is significant are the slots, 1908 * and head/cur/tail are set from the kring as needed 1909 * (part as a receive ring, part as a transmit ring). 1910 * 1911 * callback that overwrites the hwna notify callback. 1912 * Packets come from the outside or from the host stack and are put on an hwna rx ring. 1913 * The bridge wrapper then sends the packets through the bridge. 1914 */ 1915 static int 1916 netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags) 1917 { 1918 struct netmap_bwrap_adapter *bna = na->na_private; 1919 struct netmap_vp_adapter *hostna = &bna->host; 1920 struct netmap_kring *kring, *bkring; 1921 struct netmap_ring *ring; 1922 int is_host_ring = ring_nr == na->num_rx_rings; 1923 struct netmap_vp_adapter *vpna = &bna->up; 1924 int error = 0; 1925 1926 if (netmap_verbose) 1927 D("%s %s%d 0x%x", na->name, 1928 (tx == NR_TX ? 
"TX" : "RX"), ring_nr, flags); 1929 1930 if (flags & NAF_DISABLE_NOTIFY) { 1931 /* the enabled/disabled state of the ring has changed, 1932 * propagate the info to the wrapper (with tx/rx swapped) 1933 */ 1934 if (tx == NR_TX) { 1935 netmap_set_rxring(&vpna->up, ring_nr, 1936 na->tx_rings[ring_nr].nkr_stopped); 1937 } else { 1938 netmap_set_txring(&vpna->up, ring_nr, 1939 na->rx_rings[ring_nr].nkr_stopped); 1940 } 1941 return 0; 1942 } 1943 1944 if (!nm_netmap_on(na)) 1945 return 0; 1946 1947 /* we only care about receive interrupts */ 1948 if (tx == NR_TX) 1949 return 0; 1950 1951 kring = &na->rx_rings[ring_nr]; 1952 ring = kring->ring; 1953 1954 /* make sure the ring is not disabled */ 1955 if (nm_kr_tryget(kring)) 1956 return 0; 1957 1958 if (is_host_ring && hostna->na_bdg == NULL) { 1959 error = bna->save_notify(na, ring_nr, tx, flags); 1960 goto put_out; 1961 } 1962 1963 /* Here we expect ring->head = ring->cur = ring->tail 1964 * because everything has been released from the previous round. 1965 * However the ring is shared and we might have info from 1966 * the wrong side (the tx ring). Hence we overwrite with 1967 * the info from the rx kring. 1968 */ 1969 if (netmap_verbose) 1970 D("%s head %d cur %d tail %d (kring %d %d %d)", na->name, 1971 ring->head, ring->cur, ring->tail, 1972 kring->rhead, kring->rcur, kring->rtail); 1973 1974 ring->head = kring->rhead; 1975 ring->cur = kring->rcur; 1976 ring->tail = kring->rtail; 1977 1978 if (is_host_ring) { 1979 vpna = hostna; 1980 ring_nr = 0; 1981 } 1982 /* simulate a user wakeup on the rx ring */ 1983 /* fetch packets that have arrived. 1984 * XXX maybe do this in a loop ? 1985 */ 1986 error = kring->nm_sync(kring, 0); 1987 if (error) 1988 goto put_out; 1989 if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { 1990 D("how strange, interrupt with no packets on %s", 1991 na->name); 1992 goto put_out; 1993 } 1994 1995 /* new packets are ring->cur to ring->tail, and the bkring 1996 * had hwcur == ring->cur. So advance ring->cur to ring->tail 1997 * to push all packets out. 1998 */ 1999 ring->head = ring->cur = ring->tail; 2000 2001 /* also set tail to what the bwrap expects */ 2002 bkring = &vpna->up.tx_rings[ring_nr]; 2003 ring->tail = bkring->nr_hwtail; // rtail too ? 2004 2005 /* pass packets to the switch */ 2006 nm_txsync_prologue(bkring); // XXX error checking ? 2007 netmap_vp_txsync(bkring, flags); 2008 2009 /* mark all buffers as released on this ring */ 2010 ring->head = ring->cur = kring->nr_hwtail; 2011 ring->tail = kring->rtail; 2012 /* another call to actually release the buffers */ 2013 if (!is_host_ring) { 2014 error = kring->nm_sync(kring, 0); 2015 } else { 2016 /* mark all packets as released, as in the 2017 * second part of netmap_rxsync_from_host() 2018 */ 2019 kring->nr_hwcur = kring->nr_hwtail; 2020 nm_rxsync_finalize(kring); 2021 } 2022 2023 put_out: 2024 nm_kr_put(kring); 2025 return error; 2026 } 2027 2028 2029 /* nm_register callback for bwrap */ 2030 static int 2031 netmap_bwrap_register(struct netmap_adapter *na, int onoff) 2032 { 2033 struct netmap_bwrap_adapter *bna = 2034 (struct netmap_bwrap_adapter *)na; 2035 struct netmap_adapter *hwna = bna->hwna; 2036 struct netmap_vp_adapter *hostna = &bna->host; 2037 int error; 2038 2039 ND("%s %s", na->name, onoff ? "on" : "off"); 2040 2041 if (onoff) { 2042 int i; 2043 2044 /* netmap_do_regif has been called on the bwrap na. 

/* nm_register callback for bwrap */
static int
netmap_bwrap_register(struct netmap_adapter *na, int onoff)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_vp_adapter *hostna = &bna->host;
	int error;

	ND("%s %s", na->name, onoff ? "on" : "off");

	if (onoff) {
		int i;

		/* netmap_do_regif has been called on the bwrap na.
		 * We need to pass the information about the
		 * memory allocator down to the hwna before
		 * putting it in netmap mode
		 */
		hwna->na_lut = na->na_lut;
		hwna->na_lut_objtotal = na->na_lut_objtotal;
		hwna->na_lut_objsize = na->na_lut_objsize;

		if (hostna->na_bdg) {
			/* if the host rings have been attached to the switch,
			 * we need to copy the memory allocator information
			 * in the hostna also
			 */
			hostna->up.na_lut = na->na_lut;
			hostna->up.na_lut_objtotal = na->na_lut_objtotal;
			hostna->up.na_lut_objsize = na->na_lut_objsize;
		}

		/* cross-link the netmap rings.
		 * The original number of rings comes from hwna,
		 * rx rings on one side equals tx rings on the other.
		 * We need to do this now, after the initialization
		 * of the kring->ring pointers
		 */
		for (i = 0; i < na->num_rx_rings + 1; i++) {
			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
		}
		for (i = 0; i < na->num_tx_rings + 1; i++) {
			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
		}
	}

	/* forward the request to the hwna */
	error = hwna->nm_register(hwna, onoff);
	if (error)
		return error;

	/* impersonate a netmap_vp_adapter */
	netmap_vp_reg(na, onoff);
	if (hostna->na_bdg)
		netmap_vp_reg(&hostna->up, onoff);

	if (onoff) {
		/* intercept the hwna nm_notify callback */
		bna->save_notify = hwna->nm_notify;
		hwna->nm_notify = netmap_bwrap_intr_notify;
	} else {
		hwna->nm_notify = bna->save_notify;
		hwna->na_lut = NULL;
		hwna->na_lut_objtotal = 0;
		hwna->na_lut_objsize = 0;
	}

	return 0;
}

/* nm_config callback for bwrap */
static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
				    u_int *rxr, u_int *rxd)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	/* forward the request */
	netmap_update_config(hwna);
	/* swap the results */
	*txr = hwna->num_rx_rings;
	*txd = hwna->num_rx_desc;
	*rxr = hwna->num_tx_rings;
	*rxd = hwna->num_tx_desc;

	return 0;
}
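
/*
 * Example (editor's sketch, not compiled): because of the tx/rx swap,
 * querying a bwrap reports the hwna numbers crossed over. The ring
 * counts below are hypothetical.
 */
#if 0
	u_int txr, txd, rxr, rxd;

	/* suppose hwna has 4 tx rings and 2 rx rings */
	netmap_bwrap_config(na, &txr, &txd, &rxr, &rxd);
	/* now txr == 2 (the hwna rx rings) and rxr == 4 (the hwna tx rings) */
#endif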

/* nm_krings_create callback for bwrap */
static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_adapter *hostna = &bna->host.up;
	int error;

	ND("%s", na->name);

	/* impersonate a netmap_vp_adapter */
	error = netmap_vp_krings_create(na);
	if (error)
		return error;

	/* also create the hwna krings */
	error = hwna->nm_krings_create(hwna);
	if (error) {
		netmap_vp_krings_delete(na);
		return error;
	}
	/* the connection between the bwrap krings and the hwna krings
	 * will be performed later, in the nm_register callback, since
	 * now the kring->ring pointers have not been initialized yet
	 */

	if (na->na_flags & NAF_HOST_RINGS) {
		/* the hostna rings are the host rings of the bwrap.
		 * The corresponding krings must point back to the
		 * hostna
		 */
		hostna->tx_rings = na->tx_rings + na->num_tx_rings;
		hostna->tx_rings[0].na = hostna;
		hostna->rx_rings = na->rx_rings + na->num_rx_rings;
		hostna->rx_rings[0].na = hostna;
	}

	return 0;
}


static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("%s", na->name);

	hwna->nm_krings_delete(hwna);
	netmap_vp_krings_delete(na);
}


/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_kring *kring, *hw_kring;
	struct netmap_ring *ring;
	u_int lim;
	int error = 0;

	if (tx == NR_TX)
		return EINVAL;

	kring = &na->rx_rings[ring_n];
	hw_kring = &hwna->tx_rings[ring_n];
	ring = kring->ring;
	lim = kring->nkr_num_slots - 1;

	if (!nm_netmap_on(hwna))
		return 0;
	mtx_lock(&kring->q_lock);
	/* first step: simulate a user wakeup on the rx ring */
	netmap_vp_rxsync_locked(kring, flags);
	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		ring->head, ring->cur, ring->tail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	/* second step: the simulated user consumes all new packets */
	ring->head = ring->cur = ring->tail;

	/* third step: the new packets are sent on the tx ring
	 * (which is actually the same ring)
	 */
	/* set tail to what the hw expects */
	ring->tail = hw_kring->rtail;
	nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
	error = hw_kring->nm_sync(hw_kring, flags);

	/* fourth step: now we are back on the rx ring */
	/* claim ownership on all hw owned bufs */
	ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
	ring->tail = kring->rtail; /* restore saved value of tail, for safety */

	/* fifth step: the user goes to sleep again, causing another rxsync */
	netmap_vp_rxsync_locked(kring, flags);
	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		ring->head, ring->cur, ring->tail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	mtx_unlock(&kring->q_lock);
	return error;
}


/* notify method for the bridge-->host-rings path */
static int
netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
{
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_adapter *port_na = &bna->up.up;

	if (tx == NR_TX || ring_n != 0)
		return EINVAL;
	return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
}
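
/*
 * Editor's note (illustrative sketch, not compiled): kring layout
 * established by netmap_bwrap_krings_create() above. The host krings
 * of the bwrap sit right past the hardware ones, which is why
 * netmap_bwrap_host_notify() remaps its single host ring to index
 * num_rx_rings. The fragments below are copied from those functions.
 */
#if 0
	/* bwrap rx krings: [0 .. num_rx_rings-1] are hw, [num_rx_rings] is host */
	hostna->rx_rings = na->rx_rings + na->num_rx_rings;

	/* hence a notification on host ring 0 becomes: */
	netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
#endif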

/* nm_bdg_ctl callback for the bwrap.
 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
 * On attach, it needs to provide a fake netmap_priv_d structure and
 * perform a netmap_do_regif() on the bwrap. This will put both the
 * bwrap and the hwna in netmap mode, with the netmap rings shared
 * and cross-linked. Moreover, it will start intercepting interrupts
 * directed to hwna.
 */
static int
netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_if *nifp;
	int error = 0;

	if (attach) {
		if (NETMAP_OWNED_BY_ANY(na)) {
			return EBUSY;
		}
		if (bna->na_kpriv) {
			/* nothing to do */
			return 0;
		}
		npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
		if (npriv == NULL)
			return ENOMEM;
		nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
		if (!nifp) {
			bzero(npriv, sizeof(*npriv));
			free(npriv, M_DEVBUF);
			return error;
		}
		bna->na_kpriv = npriv;
		na->na_flags |= NAF_BUSY;
	} else {
		int last_instance;

		if (na->active_fds == 0) /* not registered */
			return EINVAL;
		last_instance = netmap_dtor_locked(bna->na_kpriv);
		if (!last_instance) {
			D("--- error, trying to detach an entry with active mmaps");
			error = EINVAL;
		} else {
			struct nm_bridge *b = bna->up.na_bdg,
				*bh = bna->host.na_bdg;
			npriv = bna->na_kpriv;
			bna->na_kpriv = NULL;
			D("deleting priv");

			bzero(npriv, sizeof(*npriv));
			free(npriv, M_DEVBUF);
			if (b) {
				/* XXX the bwrap dtor should take care
				 * of this (2014-06-16)
				 */
				netmap_bdg_detach_common(b, bna->up.bdg_port,
					(bh ? bna->host.bdg_port : -1));
			}
			na->na_flags &= ~NAF_BUSY;
		}
	}
	return error;
}
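
/*
 * Example (editor's note, illustrative only): the callback above is
 * reached through the ioctl path when the user runs vale-ctl, e.g.
 *
 *	vale-ctl -a vale0:em1	attach em1 -> netmap_bwrap_bdg_ctl(na, nmr, 1)
 *	vale-ctl -d vale0:em1	detach em1 -> netmap_bwrap_bdg_ctl(na, nmr, 0)
 *
 * The interface name "em1" is hypothetical.
 */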

/* attach a bridge wrapper to the 'real' device */
int
netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na = NULL;
	struct netmap_adapter *hostna = NULL;
	int error = 0;

	/* make sure the NIC is not already in use */
	if (NETMAP_OWNED_BY_ANY(hwna)) {
		D("NIC %s busy, cannot attach to bridge", hwna->name);
		return EBUSY;
	}

	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (bna == NULL) {
		return ENOMEM;
	}

	na = &bna->up.up;
	strncpy(na->name, nr_name, sizeof(na->name));
	/* fill the ring data for the bwrap adapter with rx/tx meanings
	 * swapped. The real cross-linking will be done during register,
	 * when all the krings will have been created.
	 */
	na->num_rx_rings = hwna->num_tx_rings;
	na->num_tx_rings = hwna->num_rx_rings;
	na->num_tx_desc = hwna->num_rx_desc;
	na->num_rx_desc = hwna->num_tx_desc;
	na->nm_dtor = netmap_bwrap_dtor;
	na->nm_register = netmap_bwrap_register;
	// na->nm_txsync = netmap_bwrap_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_config = netmap_bwrap_config;
	na->nm_krings_create = netmap_bwrap_krings_create;
	na->nm_krings_delete = netmap_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
	na->pdev = hwna->pdev;
	na->nm_mem = netmap_mem_private_new(na->name,
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			0, 0, &error);
	na->na_flags |= NAF_MEM_OWNER;
	if (na->nm_mem == NULL)
		goto err_put;
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */

	bna->hwna = hwna;
	netmap_adapter_get(hwna);
	hwna->na_private = bna; /* weak reference */
	hwna->na_vp = &bna->up;

	if (hwna->na_flags & NAF_HOST_RINGS) {
		if (hwna->na_flags & NAF_SW_ONLY)
			na->na_flags |= NAF_SW_ONLY;
		na->na_flags |= NAF_HOST_RINGS;
		hostna = &bna->host.up;
		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
		hostna->ifp = hwna->ifp;
		hostna->num_tx_rings = 1;
		hostna->num_tx_desc = hwna->num_rx_desc;
		hostna->num_rx_rings = 1;
		hostna->num_rx_desc = hwna->num_tx_desc;
		// hostna->nm_txsync = netmap_bwrap_host_txsync;
		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
		hostna->nm_notify = netmap_bwrap_host_notify;
		hostna->nm_mem = na->nm_mem;
		hostna->na_private = bna;
		hostna->na_vp = &bna->up;
		na->na_hostvp = hwna->na_hostvp =
			hostna->na_hostvp = &bna->host;
		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
	}

	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
		na->name, hwna->name,
		na->num_tx_rings, na->num_tx_desc,
		na->num_rx_rings, na->num_rx_desc);

	error = netmap_attach_common(na);
	if (error) {
		goto err_free;
	}
	/* make bwrap ifp point to the real ifp.
	 * NOTE: netmap_attach_common() interprets a non-NULL na->ifp
	 * as a request to make the ifp point to the na. Since we
	 * do not want to change the na already pointed to by hwna->ifp,
	 * the following assignment has to be delayed until now
	 */
	na->ifp = hwna->ifp;
	hwna->na_flags |= NAF_BUSY;
	/* make hwna point to the allocator we are actually using,
	 * so that monitors will be able to find it
	 */
	bna->save_nmd = hwna->nm_mem;
	hwna->nm_mem = na->nm_mem;
	return 0;

err_free:
	netmap_mem_private_delete(na->nm_mem);
err_put:
	hwna->na_vp = hwna->na_hostvp = NULL;
	netmap_adapter_put(hwna);
	free(bna, M_DEVBUF);
	return error;
}


void
netmap_init_bridges(void)
{
	int i;
	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
	for (i = 0; i < NM_BRIDGES; i++)
		BDG_RWINIT(&nm_bridges[i]);
}
#endif /* WITH_VALE */