/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When adding or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
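/*
 * Illustrative sketch of the forwarding cycle described above, kept
 * under #if 0 like the sanity-check block further down in this file.
 * Names match definitions in this file; the copy step is elided. This
 * is a simplified outline, not the actual datapath: see
 * nm_bdg_preflush() and nm_bdg_flush() below for the real code.
 */
#if 0
	BDG_RLOCK(b);		/* shared, sleepable: copies below may fault */
	mtx_lock(&kring->q_lock);
	lease_idx = nm_kr_lease(kring, howmany, 1); /* reserve rx slots */
	mtx_unlock(&kring->q_lock);
	/* ... copy packets from the source ring to the reserved slots ... */
	mtx_lock(&kring->q_lock);
	/* publish: record completion and, when all previous leases are
	 * done, advance kring->nr_hwtail */
	mtx_unlock(&kring->q_lock);
	BDG_RUNLOCK(b);
#endif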
/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME		prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES		max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define	NM_BRIDGES		8	/* number of bridges */
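/*
 * Worked example (illustrative): with bridge_batch = NM_BDG_BATCH = 1024,
 * a packet whose first fragment lands in entry 1023 of the forwarding
 * table may bring along up to NM_MULTISEG - 1 more fragments before the
 * batch can be flushed, since flushing only happens at packet boundaries.
 * This is why the tables are sized NM_BDG_BATCH_MAX = NM_BDG_BATCH +
 * NM_MULTISEG rather than just NM_BDG_BATCH.
 */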
/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger, as the
 * last packet in a batch may overflow the size (hence the
 * NM_BDG_BATCH_MAX sizing above).
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");


static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size; an empty entry does not terminate
 * the search, but lookups only occur on attach/detach, so we
 * don't mind if they are slow.
 *
 * The bridge is non-blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * followed by all the remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
	 * not to forward this packet. ring_nr is the source ring index,
	 * and the function may overwrite this value to forward this
	 * packet to a different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	struct netmap_bdg_ops bdg_ops;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];

#ifdef CONFIG_NET_NS
	struct net *ns;
#endif /* CONFIG_NET_NS */
};

const char*
netmap_bdg_name(struct netmap_vp_adapter *vp)
{
	struct nm_bridge *b = vp->na_bdg;
	if (b == NULL)
		return NULL;
	return b->bdg_basename;
}


#ifndef CONFIG_NET_NS
/*
 * XXX in principle nm_bridges could be created dynamically.
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */


/*
 * this is a slightly optimized copy routine which rounds
 * to a multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non-overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}


/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL, *bridges;
	u_int num_bridges;

	NMG_LOCK_ASSERT();

	netmap_bns_getbridges(&bridges, &num_bridges);

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;	/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < num_bridges; i++) {
		struct nm_bridge *x = bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == num_bridges && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->bdg_ops.lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
		NM_BNS_GET(b);
	}
	return b;
}
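/*
 * Examples of the name parsing above (illustrative):
 *
 *   "vale0:eth0" - the ':' at position 5 stops the scan, so the bridge
 *                  name is "vale0" and the port part is "eth0";
 *   "vale1:"     - bridge name "vale1", empty port part;
 *   a name with no ':' after the NM_NAME prefix leaves namelen at
 *   strlen(NM_NAME), so only the bare prefix is matched.
 */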
/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			free(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_rings(na, NR_TX);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


/* remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed)
 */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	 * New algorithm:
	 * make a copy of bdg_port_index;
	 * lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	 * in the array of bdg_port_index, replacing them with
	 * entries from the bottom of the array;
	 * decrement bdg_active_ports;
	 * acquire BDG_WLOCK() and copy back the array.
	 */

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--;	/* point to last active port */
			tmp[i] = tmp[lim];	/* swap with i */
			tmp[lim] = hw;		/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	if (b->bdg_ops.dtor)
		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
		NM_BNS_PUT(b);
	}
}
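/*
 * Worked example of the compaction above (illustrative): with
 * bdg_port_index = { 0, 3, 1, 2 } and bdg_active_ports = 4, detaching
 * the port in slot hw = 3 swaps it with the last active entry:
 * tmp becomes { 0, 2, 1, 3 } and lim drops to 3, so index 3 is now in
 * the inactive tail of the array and can be reused by the next attach.
 */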
/* nm_bdg_ctl callback for VALE ports */
static int
netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	if (attach)
		return 0; /* nothing to do */
	if (b) {
		netmap_set_all_rings(na, 0 /* disable */);
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
		vpna->na_bdg = NULL;
		netmap_set_all_rings(na, 1 /* enable */);
	}
	/* we took the reference just for the attach */
	netmap_adapter_put(na);
	return 0;
}

/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;

	ND("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}
}

/* remove a persistent VALE port from the system */
static int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NETMAP_CAPABLE(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	if (NA(ifp)->na_refcount > 1) {
		error = EBUSY;
		goto err;
	}
	NMG_UNLOCK();

	D("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires that all references are released
	 * before unregistering.
	 */
	if_rele(ifp);
	netmap_detach(ifp);
	nm_vi_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
static int
nm_vi_create(struct nmreq *nmr)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	/* the name must not include the VALE prefix */
	if (!strncmp(nmr->nr_name, NM_NAME, strlen(NM_NAME)))
		return EINVAL;
	ifp = ifunit_ref(nmr->nr_name);
	if (ifp) { /* already exists, cannot create new one */
		if_rele(ifp);
		return EEXIST;
	}
	error = nm_vi_persist(nmr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	/* netmap_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vp_create(nmr, ifp, &vpna);
	if (error) {
		D("error %d", error);
		NMG_UNLOCK();
		nm_vi_detach(ifp);
		return error;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	netmap_adapter_get(&vpna->up);
	NMG_UNLOCK();
	D("created %s", ifp->if_xname);
	return 0;
}

/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	char *nr_name = nmr->nr_name;
	const char *ifname;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_vp_adapter *vpna, *hostna = NULL;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(nr_name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(nr_name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", nr_name);
		return (create ? ENOMEM : ENXIO);
	}
	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
		panic("x");
	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		ND("checking %s", vpna->up.name);
		if (!strcmp(vpna->up.name, nr_name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", nr_name,
				vpna->up.na_refcount);
			*na = &vpna->up;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifname = nr_name + b->bdg_namelen + 1;
	ifp = ifunit_ref(ifname);
	if (!ifp) {
		/* Create an ephemeral virtual port.
		 * This block contains all the ephemeral-specific logic.
		 */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}

		/* netmap_vp_create creates a struct netmap_vp_adapter */
		error = netmap_vp_create(nmr, NULL, &vpna);
		if (error) {
			D("error %d", error);
			return error;
		}
		/* shortcut - we can skip get_hw_na(),
		 * ownership check and nm_bdg_attach()
		 */
	} else {
		struct netmap_adapter *hw;

		error = netmap_get_hw_na(ifp, &hw);
		if (error || hw == NULL)
			goto out;

		/* host adapter might not be created */
		error = hw->nm_bdg_attach(nr_name, hw);
		if (error)
			goto out;
		vpna = hw->na_vp;
		hostna = hw->na_hostvp;
		if_rele(ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			hostna = NULL;
	}

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (hostna != NULL) {
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", ifname, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = &vpna->up;
	netmap_adapter_get(*na);
	return 0;

out:
	if_rele(ifp);

	return error;
}
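/*
 * Illustrative caller pattern for netmap_get_bdg_na() (a sketch, kept
 * under #if 0; it mirrors nm_bdg_ctl_attach()/nm_bdg_ctl_detach()
 * below). The reference is held only when both *na is non-NULL and the
 * return value is 0, so both conditions must be checked.
 */
#if 0
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error == 0 && na != NULL) {
		/* ... use the adapter ... */
		netmap_adapter_put(na);	/* drop the reference we were given */
	}
	NMG_UNLOCK();
#endif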
/* Process NETMAP_BDG_ATTACH */
static int
nm_bdg_ctl_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();

	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(na, nmr, 1);
		if (error)
			goto unref_exit;
		ND("registered %s to netmap-mode", na->name);
	}
	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}


/* process NETMAP_BDG_DETACH */
static int
nm_bdg_ctl_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (na->nm_bdg_ctl) {
		/* remove the port from bridge. The bwrap
		 * also needs to put the hwna in normal mode
		 */
		error = na->nm_bdg_ctl(na, nmr, 0);
	}

	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}
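/*
 * Hedged userspace sketch (assumption: the standard netmap ioctl usage,
 * as in the vale-ctl tool; open()/error handling elided, kept under
 * #if 0): attaching eth0 to switch vale0 and detaching it again.
 */
#if 0
	struct nmreq nmr;
	int fd = open("/dev/netmap", O_RDWR);

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "vale0:eth0", sizeof(nmr.nr_name));
	nmr.nr_cmd = NETMAP_BDG_ATTACH;
	nmr.nr_arg1 = NETMAP_BDG_HOST;	/* also attach the host stack rings */
	ioctl(fd, NIOCREGIF, &nmr);	/* reaches nm_bdg_ctl_attach() */

	nmr.nr_cmd = NETMAP_BDG_DETACH;
	ioctl(fd, NIOCREGIF, &nmr);	/* reaches nm_bdg_ctl_detach() */
#endif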
/* Called either from user context (netmap_ioctl()) or by external
 * kernel modules (e.g., Open vSwitch).
 * Operation is indicated in nmr->nr_cmd.
 * NETMAP_BDG_REGOPS, which sets the configure/lookup/dtor functions
 * of the bridge, requires the bdg_ops argument; the other commands
 * ignore this argument.
 *
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
{
	struct nm_bridge *b, *bridges;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;
	u_int num_bridges;

	netmap_bns_getbridges(&bridges, &num_bridges);

	switch (cmd) {
	case NETMAP_BDG_NEWIF:
		error = nm_vi_create(nmr);
		break;

	case NETMAP_BDG_DELIF:
		error = nm_vi_destroy(nmr->nr_name);
		break;

	case NETMAP_BDG_ATTACH:
		error = nm_bdg_ctl_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_ctl_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(vpna->up.name, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				vpna = b->bdg_ports[j];
				strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
		/* register callbacks to the given bridge.
		 * nmr->nr_name may be just the bridge's name (including the
		 * ':' if it is not just NM_NAME).
		 */
		if (!bdg_ops) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->bdg_ops = *bdg_ops;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_VNET_HDR:
		/* Valid lengths for the virtio-net header are 0 (no header),
		   10 and 12. */
		if (nmr->nr_arg1 != 0 &&
			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
				nmr->nr_arg1 != 12) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			vpna->virt_hdr_len = nmr->nr_arg1;
			if (vpna->virt_hdr_len)
				vpna->mfs = NETMAP_BUF_SIZE(na);
			D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}

int
netmap_bdg_config(struct nmreq *nmr)
{
	struct nm_bridge *b;
	int error = EINVAL;

	NMG_LOCK();
	b = nm_find_bridge(nmr->nr_name, 0);
	if (!b) {
		NMG_UNLOCK();
		return error;
	}
	NMG_UNLOCK();
	/* Don't call config() with NMG_LOCK() held */
	BDG_RLOCK(b);
	if (b->bdg_ops.config != NULL)
		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
	BDG_RUNLOCK(b);
	return error;
}


/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rings(na, NR_RX);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}
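/*
 * Illustrative memory layout for the tailroom above (not compiled):
 * netmap_krings_create() is asked for 'tailroom' extra bytes, which
 * netmap_vp_krings_create() carves into one lease array per rx ring:
 *
 *	[ krings ... ][ leases ring 0 ][ leases ring 1 ] ...
 *	               num_rx_desc      num_rx_desc       (uint32_t each)
 */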
/* nm_krings_delete callback for VALE ports. */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated with the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* we do not use the buf changed flag, but we still need to reset it */
		slot->flags &= ~NS_BUF_CHANGED;

		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
		// ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags - 1;
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}
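/*
 * Worked example of the fragment bookkeeping above (illustrative): a
 * packet made of three fragments occupies three consecutive ft[]
 * entries, and only the first one carries the fragment count:
 *
 *	ft[i].ft_frags   = 3	(set when the final fragment is seen)
 *	ft[i+1], ft[i+2]	(continuation buffers; NS_MOREFRAG was set
 *				 on slots i and i+1 of the source ring)
 *
 * nm_bdg_flush() then advances by ft[i].ft_frags at each step of its
 * first pass.
 */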
/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobb's Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)							\
do {									\
	a -= b; a -= c; a ^= (c >> 13);					\
	b -= c; b -= a; b ^= (a << 8);					\
	c -= a; c -= b; c ^= (b >> 13);					\
	a -= b; a -= c; a ^= (c >> 12);					\
	b -= c; b -= a; b ^= (a << 16);					\
	c -= a; c -= b; c ^= (b >> 5);					\
	a -= b; a -= c; a ^= (c >> 3);					\
	b -= c; b -= a; b ^= (a << 10);					\
	c -= a; c -= b; c ^= (b >> 15);					\
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
	return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


/* nm_register callback for VALE ports */
static int
netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;

	/* persistent ports may be put in netmap mode
	 * before being attached to a bridge
	 */
	if (vpna->na_bdg)
		BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		na->na_flags |= NAF_NETMAP_ON;
		/* XXX on FreeBSD, persistent VALE ports should also
		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
		 */
	} else {
		na->na_flags &= ~NAF_NETMAP_ON;
	}
	if (vpna->na_bdg)
		BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}
/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address,
 * then return the destination port index, and the
 * ring in *dst_ring (at the moment we always use ring 0).
 */
u_int
netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	uint8_t *buf = ft->ft_buf;
	u_int buf_len = ft->ft_len;
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;

	/* safety check, unfortunately we have many cases */
	if (buf_len >= 14 + na->virt_hdr_len) {
		/* virthdr + mac_hdr in the same slot */
		buf += na->virt_hdr_len;
		buf_len -= na->virt_hdr_len;
	} else if (buf_len == na->virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
		/* only header in first fragment */
		ft++;
		buf = ft->ft_buf;
		buf_len = ft->ft_len;
	} else {
		RD(5, "invalid buf format, length %d", buf_len);
		return NM_BDG_NOPORT;
	}
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
			D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
				s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}




/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}
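/*
 * Worked example of the lease mechanism (illustrative): two senders A
 * and B deliver to the same rx ring while the receiver side is at
 * nr_hwtail = 10.
 *
 *   A: nm_kr_lease(k, 3, 1) -> lease_idx 0, slots 10..12 reserved,
 *				 nkr_hwlease = 13
 *   B: nm_kr_lease(k, 2, 1) -> lease_idx 1, slots 13..14 reserved,
 *				 nkr_hwlease = 15
 *
 * If B finishes first it only records its final position in
 * nkr_leases[1]; nr_hwtail cannot move past slot 12 until A completes,
 * at which point A also rolls forward through B's completed lease (see
 * the completion code in the second half of nm_bdg_flush() below).
 */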
/*
 *
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed to by ft) is followed by an array of
	 * destination queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is not in the first
		   fragment nor at the very beginning of the second. */
		if (unlikely(na->virt_hdr_len > ft[i].ft_len))
			continue;
		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* the lookup function decided to drop this packet */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* We need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a chance
		 * that we may not use all of the slots we have claimed,
		 * and we will need to handle the leftover ones when we
		 * regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
			RD(3, "virt_hdr_mismatch, src %d dst %d", na->virt_hdr_len, dst_na->virt_hdr_len);
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * dst_na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			kring->nm_notify(kring, 0);
			/* actually useful only for bwraps, since there
			 * the notify will trigger a txsync on the hwna. VALE ports
			 * have dst_na->retry == 0
			 */
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
				break; /* no more space */
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;

					slot = &ring->slot[j];
					dst = NMB(&dst_na->up, slot);

					ND("send [%d] %d(%d) bytes at %s:%d",
						i, (int)copy_len, (int)dst_len,
						NM_IFPNAME(dst_ifp), j);
					/* round to a multiple of 64 */
					copy_len = (copy_len + 63) & ~63;

					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
						RD(5, "invalid len %d, down to 64", (int)copy_len);
						copy_len = dst_len = 64; // XXX
					}
					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy(src, dst, (int)copy_len);
					}
					slot->len = dst_len;
					slot->flags = (cnt << 8) | NS_MOREFRAG;
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear NS_MOREFRAG on the last entry */
			}
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
			/* current position */
			uint32_t *p = kring->nkr_leases; /* shorthand */
			uint32_t update_pos;
			int still_locked = 1;

			mtx_lock(&kring->q_lock);
			if (unlikely(howmany > 0)) {
				/* we have not used all the buffers. If I am the
				 * last one, I can recover the slots; otherwise I
				 * must fill them with len 0 to mark empty packets.
				 */
				ND("leftover %d bufs", howmany);
				if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
					/* yes i am the last one */
					ND("roll back nkr_hwlease to %d", j);
					kring->nkr_hwlease = j;
				} else {
					while (howmany-- > 0) {
						ring->slot[j].len = 0;
						ring->slot[j].flags = 0;
						j = nm_next(j, lim);
					}
				}
			}
			p[lease_idx] = j; /* report I am done */

			update_pos = kring->nr_hwtail;

			if (my_start == update_pos) {
				/* all slots before my_start have been reported,
				 * so scan subsequent leases to see if other ranges
				 * have been completed, and do a selwakeup or txsync.
				 */
				while (lease_idx != kring->nkr_lease_idx &&
					p[lease_idx] != NR_NOSLOT) {
					j = p[lease_idx];
					p[lease_idx] = NR_NOSLOT;
					lease_idx = nm_next(lease_idx, lim);
				}
				/* j is the new 'write' position. j != my_start
				 * means there are new buffers to report
				 */
				if (likely(j != my_start)) {
					kring->nr_hwtail = j;
					still_locked = 0;
					mtx_unlock(&kring->q_lock);
					kring->nm_notify(kring, 0);
					/* this is netmap_notify for VALE ports and
					 * netmap_bwrap_notify for bwrap. The latter will
					 * trigger a txsync on the underlying hwna
					 */
					if (dst_na->retry && retry--) {
						/* XXX this is going to call nm_notify again.
						 * Only useful for bwrap in virtual machines
						 */
						goto retry;
					}
				}
			}
			if (still_locked)
				mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}

/* nm_txsync callback for VALE ports */
static int
netmap_vp_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter *)kring->na;
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;

	if (bridge_batch <= 0) { /* testing only */
		done = head; // used all
		goto done;
	}
	if (!na->na_bdg) {
		done = head;
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(kring, head);
done:
	if (done != head)
		D("early break at %d/%d, tail %d", done, head, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'head' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	if (netmap_verbose)
		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
	return 0;
}


/* rxsync code used by the VALE ports' nm_rxsync callback and also
 * internally by the bwrap
 */
static int
netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = kring->rhead;
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = NMB(na, slot);

			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	n = 0;
done:
	return n;
}

/*
 * nm_rxsync callback for VALE ports
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
netmap_vp_rxsync(struct netmap_kring *kring, int flags)
{
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync_locked(kring, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}
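/*
 * Note (illustrative): unlike hardware rings, a VALE rx ring has
 * multiple writers (one per sending port, see nm_bdg_flush() above),
 * all advancing nr_hwtail under q_lock. The reader path above takes
 * the same q_lock so that its update of nr_hwcur cannot interleave
 * with a writer publishing a new nr_hwtail.
 */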
/* nm_bdg_attach callback for VALE ports
 * The na_vp port is this same netmap_adapter. There is no host port.
 */
static int
netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;

	if (vpna->na_bdg)
		return EBUSY;
	na->na_vp = vpna;
	strncpy(na->name, name, sizeof(na->name));
	na->na_hostvp = NULL;
	return 0;
}

/* create a netmap_vp_adapter that describes a VALE port.
 * Only persistent VALE ports have a non-null ifp.
 */
static int
netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret)
{
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error;
	u_int npipes = 0;

	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;
	strncpy(na->name, nmr->nr_name, sizeof(na->name));

	/* bounds checking */
	na->num_tx_rings = nmr->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_tx_rings = na->num_tx_rings; // write back
	na->num_rx_rings = nmr->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_rx_rings = na->num_rx_rings; // write back
	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = nmr->nr_tx_slots;
	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	/* validate number of pipes. We want at least 1,
	 * but probably can do with some more.
	 * So let's use 2 as default (when 0 is supplied)
	 */
	npipes = nmr->nr_arg1;
	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
	nmr->nr_arg1 = npipes;	/* write back */
	/* validate extra bufs */
	nm_bound_var(&nmr->nr_arg3, 0, 0,
			128*NM_BDG_MAXSLOTS, NULL);
	na->num_rx_desc = nmr->nr_rx_slots;
	vpna->virt_hdr_len = 0;
	vpna->mfs = 1514;
	vpna->last_smac = ~0llu;
	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
		vpna->mfs = netmap_buf_size; */
	if (netmap_verbose)
		D("max frame size %u", vpna->mfs);

	na->na_flags |= NAF_BDG_MAYSLEEP;
	/* persistent VALE ports look like hw devices
	 * with a native netmap adapter
	 */
	if (ifp)
		na->na_flags |= NAF_NATIVE;
	na->nm_txsync = netmap_vp_txsync;
	na->nm_rxsync = netmap_vp_rxsync;
	na->nm_register = netmap_vp_reg;
	na->nm_krings_create = netmap_vp_krings_create;
	na->nm_krings_delete = netmap_vp_krings_delete;
	na->nm_dtor = netmap_vp_dtor;
	na->nm_mem = netmap_mem_private_new(na->name,
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			nmr->nr_arg3, npipes, &error);
	if (na->nm_mem == NULL)
		goto err;
	na->nm_bdg_attach = netmap_vp_bdg_attach;
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error)
		goto err;
	*ret = vpna;
	return 0;

err:
	if (na->nm_mem != NULL)
		netmap_mem_delete(na->nm_mem);
	free(vpna, M_DEVBUF);
	return error;
}
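/*
 * Hedged userspace sketch (assumption: standard netmap usage; error
 * handling elided, kept under #if 0): opening a port name with the
 * VALE prefix reaches netmap_get_bdg_na(), which calls
 * netmap_vp_create() above to build an ephemeral port on switch vale0.
 */
#if 0
	struct nmreq nmr;
	int fd = open("/dev/netmap", O_RDWR);

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "vale0:p1", sizeof(nmr.nr_name));
	nmr.nr_tx_rings = nmr.nr_rx_rings = 1;	/* bounded to NM_BDG_MAXRINGS */
	ioctl(fd, NIOCREGIF, &nmr);	/* the port disappears on close(fd) */
#endif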
/* Bridge wrapper code (bwrap).
 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
 * VALE switch.
 * The main task is to swap the meaning of tx and rx rings to match the
 * expectations of the VALE switch code (see nm_bdg_flush).
 *
 * The bwrap works by interposing a netmap_bwrap_adapter between the
 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
 * a netmap_vp_adapter to the rest of the system, but, internally, it
 * translates all callbacks to what the hwna expects.
 *
 * Note that we have to intercept callbacks coming from two sides:
 *
 *  - callbacks coming from the netmap module are intercepted by
 *    passing around the netmap_bwrap_adapter instead of the hwna
 *
 *  - callbacks coming from outside of the netmap module only know
 *    about the hwna. This, however, only happens in interrupt
 *    handlers, where only the hwna->nm_notify callback is called.
 *    What the bwrap does is to overwrite the hwna->nm_notify callback
 *    with its own netmap_bwrap_intr_notify.
 *    XXX This assumes that the hwna->nm_notify callback was the
 *    standard netmap_notify(), as is the case for NIC adapters.
 *    Any additional action performed by hwna->nm_notify will not be
 *    performed by netmap_bwrap_intr_notify.
 *
 * Additionally, the bwrap can optionally attach the host ring pair
 * of the wrapped adapter to a different port of the switch.
 */
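/*
 * Illustrative picture of the interposition described above:
 *
 *	 VALE switch                 rest of the netmap module
 *	      |                                |
 *	      v                                v
 *	 +---------------------------------------+
 *	 |         netmap_bwrap_adapter          |  (looks like a
 *	 |  tx/rx swapped, callbacks translated  |   netmap_vp_adapter)
 *	 +---------------------------------------+
 *	                     |
 *	                     v
 *	                hwna (NIC)   <-- interrupts still land here and
 *	                                 are redirected via the overwritten
 *	                                 nm_notify (netmap_bwrap_intr_notify)
 */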
static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("na %p", na);
	/* drop reference to hwna->ifp.
	 * If we don't do this, netmap_detach_common(na)
	 * will think it has set NA(na->ifp) to NULL
	 */
	na->ifp = NULL;
	/* for safety, also drop the possible reference
	 * in the hostna
	 */
	bna->host.up.ifp = NULL;

	hwna->nm_mem = bna->save_nmd;
	hwna->na_private = NULL;
	hwna->na_vp = hwna->na_hostvp = NULL;
	hwna->na_flags &= ~NAF_BUSY;
	netmap_adapter_put(hwna);
}


/*
 * Intr callback for NICs connected to a bridge.
 * Simply ignore tx interrupts (maybe we could try to recover space ?)
 * and pass received packets from the NIC to the bridge.
 *
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC or the host stack.
 * The only significant parts of the ring are the slots;
 * head/cur/tail are set from the kring as needed
 * (part as a receive ring, part as a transmit ring).
 *
 * This is the callback that overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are put on an hwna rx ring.
 * The bridge wrapper then sends the packets through the bridge.
 */
static int
netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_kring *bkring;
	struct netmap_vp_adapter *vpna = &bna->up;
	u_int ring_nr = kring->ring_id;
	int error = 0;

	if (netmap_verbose)
		D("%s %s 0x%x", na->name, kring->name, flags);

	if (!nm_netmap_on(na))
		return 0;

	bkring = &vpna->up.tx_rings[ring_nr];

	/* make sure the ring is not disabled */
	if (nm_kr_tryget(kring))
		return 0;

	if (netmap_verbose)
		D("%s head %d cur %d tail %d", na->name,
			kring->rhead, kring->rcur, kring->rtail);

	/* simulate a user wakeup on the rx ring to
	 * fetch the packets that have arrived.
	 */
	error = kring->nm_sync(kring, 0);
	if (error)
		goto put_out;
	if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
		D("how strange, interrupt with no packets on %s",
			na->name);
		goto put_out;
	}

	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
	 * to push all packets out.
	 */
	bkring->rhead = bkring->rcur = kring->nr_hwtail;

	netmap_vp_txsync(bkring, flags);

	/* mark all buffers as released on this ring */
	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
	/* another call to actually release the buffers */
	error = kring->nm_sync(kring, 0);

put_out:
	nm_kr_put(kring);
	return error;
}
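
/* Worked example of the index update above (hypothetical numbers,
 * 1024-slot rings): suppose rhead == rcur == 10 on the hw rx kring and
 * the first nm_sync() reports nr_hwtail == 25, i.e. 15 new frames in
 * slots 10..24. Then:
 */
#if 0
	bkring->rhead = bkring->rcur = 25;	/* kring->nr_hwtail */
	netmap_vp_txsync(bkring, flags);	/* forwards slots 10..24 */
	kring->rhead = kring->rcur = kring->rtail = 25;
	kring->nm_sync(kring, 0);		/* hands the buffers back to the NIC */
#endif
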
/* nm_register callback for bwrap */
static int
netmap_bwrap_register(struct netmap_adapter *na, int onoff)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_vp_adapter *hostna = &bna->host;
	int error;
	enum txrx t;

	ND("%s %s", na->name, onoff ? "on" : "off");

	if (onoff) {
		int i;

		/* netmap_do_regif has been called on the bwrap na.
		 * We need to pass the information about the
		 * memory allocator down to the hwna before
		 * putting it in netmap mode
		 */
		hwna->na_lut = na->na_lut;

		if (hostna->na_bdg) {
			/* if the host rings have been attached to the switch,
			 * we need to copy the memory allocator information
			 * in the hostna also
			 */
			hostna->up.na_lut = na->na_lut;
		}

		/* cross-link the netmap rings.
		 * The original number of rings comes from hwna,
		 * rx rings on one side equal tx rings on the other.
		 * We need to do this now, after the initialization
		 * of the kring->ring pointers
		 */
		for_rx_tx(t) {
			enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
			for (i = 0; i < nma_get_nrings(na, r) + 1; i++) {
				NMR(hwna, t)[i].nkr_num_slots = NMR(na, r)[i].nkr_num_slots;
				NMR(hwna, t)[i].ring = NMR(na, r)[i].ring;
			}
		}
	}

	/* forward the request to the hwna */
	error = hwna->nm_register(hwna, onoff);
	if (error)
		return error;

	/* impersonate a netmap_vp_adapter */
	netmap_vp_reg(na, onoff);
	if (hostna->na_bdg)
		netmap_vp_reg(&hostna->up, onoff);

	if (onoff) {
		u_int i;
		/* intercept the hwna nm_notify callback on the hw rings */
		for (i = 0; i < hwna->num_rx_rings; i++) {
			hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
		}
		i = hwna->num_rx_rings; /* for safety */
		/* save the host ring notify unconditionally */
		hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
		if (hostna->na_bdg) {
			/* also intercept the host ring notify */
			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
		}
	} else {
		u_int i;
		/* reset all notify callbacks (including host ring) */
		for (i = 0; i <= hwna->num_rx_rings; i++) {
			hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
			hwna->rx_rings[i].save_notify = NULL;
		}
		hwna->na_lut.lut = NULL;
		hwna->na_lut.objtotal = 0;
		hwna->na_lut.objsize = 0;
	}

	return 0;
}

/* nm_config callback for bwrap */
static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
	u_int *rxr, u_int *rxd)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	/* forward the request */
	netmap_update_config(hwna);
	/* swap the results */
	*txr = hwna->num_rx_rings;
	*txd = hwna->num_rx_desc;
	*rxr = hwna->num_tx_rings;
	*rxd = hwna->num_tx_desc;

	return 0;
}
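
/* Example of the swap above (hypothetical NIC): a device with 4 tx and
 * 2 rx rings is seen through the bwrap as 2 tx and 4 rx rings, since
 * frames received by the NIC are transmitted into the switch and
 * vice versa:
 */
#if 0
	u_int txr, txd, rxr, rxd;

	netmap_bwrap_config(na, &txr, &txd, &rxr, &rxd);
	/* now txr == hwna->num_rx_rings (2) and rxr == hwna->num_tx_rings (4) */
#endif
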
/* nm_krings_create callback for bwrap */
static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_adapter *hostna = &bna->host.up;
	int error;

	ND("%s", na->name);

	/* impersonate a netmap_vp_adapter */
	error = netmap_vp_krings_create(na);
	if (error)
		return error;

	/* also create the hwna krings */
	error = hwna->nm_krings_create(hwna);
	if (error) {
		netmap_vp_krings_delete(na);
		return error;
	}
	/* the connection between the bwrap krings and the hwna krings
	 * will be performed later, in the nm_register callback, since
	 * now the kring->ring pointers have not been initialized yet
	 */

	if (na->na_flags & NAF_HOST_RINGS) {
		/* the hostna rings are the host rings of the bwrap.
		 * The corresponding krings must point back to the
		 * hostna
		 */
		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
		hostna->tx_rings[0].na = hostna;
		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
		hostna->rx_rings[0].na = hostna;
	}

	return 0;
}


static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("%s", na->name);

	hwna->nm_krings_delete(hwna);
	netmap_vp_krings_delete(na);
}
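
/* Layout behind the host-ring aliasing in netmap_bwrap_krings_create()
 * above: the krings array of an adapter holds one extra entry past the
 * hw rings, reserved for the host ring. With num_tx_rings == 2:
 *
 *	na->tx_rings[0]   na->tx_rings[1]   na->tx_rings[2]
 *	   hw ring 0         hw ring 1         host ring
 *
 * so hostna->tx_rings[0] and na->tx_rings[na->num_tx_rings] name the
 * same kring. A sketch of the resulting invariant:
 */
#if 0
	KASSERT(hostna->tx_rings == &na->tx_rings[na->num_tx_rings],
		("host kring aliasing broken"));
#endif
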
/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_adapter *hwna = bna->hwna;
	u_int ring_n = kring->ring_id;
	u_int lim = kring->nkr_num_slots - 1;
	struct netmap_kring *hw_kring;
	int error = 0;

	ND("%s: na %s hwna %s",
		(kring ? kring->name : "NULL!"),
		(na ? na->name : "NULL!"),
		(hwna ? hwna->name : "NULL!"));
	hw_kring = &hwna->tx_rings[ring_n];

	if (nm_kr_tryget(hw_kring))
		return 0;

	if (!nm_netmap_on(hwna))
		goto out;
	/* first step: simulate a user wakeup on the rx ring */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		kring->rhead, kring->rcur, kring->rtail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	/* second step: the new packets are sent on the tx ring
	 * (which is actually the same ring)
	 */
	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
	error = hw_kring->nm_sync(hw_kring, flags);
	if (error)
		goto out;

	/* third step: now we are back on the rx ring */
	/* claim ownership on all hw owned bufs */
	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */

	/* fourth step: the user goes to sleep again, causing another rxsync */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		kring->rhead, kring->rcur, kring->rtail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
out:
	nm_kr_put(hw_kring);
	return error;
}


/* nm_bdg_ctl callback for the bwrap.
 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
 * On attach, it needs to provide a fake netmap_priv_d structure and
 * perform a netmap_do_regif() on the bwrap. This will put both the
 * bwrap and the hwna in netmap mode, with the netmap rings shared
 * and cross linked. Moreover, it will start intercepting interrupts
 * directed to hwna.
 */
static int
netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	int error = 0;

	if (attach) {
		if (NETMAP_OWNED_BY_ANY(na)) {
			return EBUSY;
		}
		if (bna->na_kpriv) {
			/* nothing to do */
			return 0;
		}
		npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
		if (npriv == NULL)
			return ENOMEM;
		error = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags);
		if (error) {
			bzero(npriv, sizeof(*npriv));
			free(npriv, M_DEVBUF);
			return error;
		}
		bna->na_kpriv = npriv;
		na->na_flags |= NAF_BUSY;
	} else {
		int last_instance;

		if (na->active_fds == 0) /* not registered */
			return EINVAL;
		last_instance = netmap_dtor_locked(bna->na_kpriv);
		if (!last_instance) {
			D("--- error, trying to detach an entry with active mmaps");
			error = EINVAL;
		} else {
			struct nm_bridge *b = bna->up.na_bdg,
				*bh = bna->host.na_bdg;
			npriv = bna->na_kpriv;
			bna->na_kpriv = NULL;
			D("deleting priv");

			bzero(npriv, sizeof(*npriv));
			free(npriv, M_DEVBUF);
			if (b) {
				/* XXX the bwrap dtor should take care
				 * of this (2014-06-16)
				 */
				netmap_bdg_detach_common(b, bna->up.bdg_port,
					(bh ? bna->host.bdg_port : -1));
			}
			na->na_flags &= ~NAF_BUSY;
		}
	}
	return error;
}
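
/* Userspace side of the callback above (usage sketch): the attach and
 * detach requests are normally generated with the vale-ctl tool, e.g.
 *
 *	vale-ctl -a vale0:em0	# attach NIC em0 to switch vale0
 *	vale-ctl -h vale0:em0	# same, but also attach the host rings
 *	vale-ctl -d vale0:em0	# detach the NIC from the switch
 */
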
/* attach a bridge wrapper to the 'real' device */
int
netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na = NULL;
	struct netmap_adapter *hostna = NULL;
	int error = 0;
	enum txrx t;

	/* make sure the NIC is not already in use */
	if (NETMAP_OWNED_BY_ANY(hwna)) {
		D("NIC %s busy, cannot attach to bridge", hwna->name);
		return EBUSY;
	}

	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (bna == NULL) {
		return ENOMEM;
	}

	na = &bna->up.up;
	na->na_private = bna;
	strncpy(na->name, nr_name, sizeof(na->name));
	/* fill the ring data for the bwrap adapter with rx/tx meanings
	 * swapped. The real cross-linking will be done during register,
	 * when all the krings will have been created.
	 */
	for_rx_tx(t) {
		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
	}
	na->nm_dtor = netmap_bwrap_dtor;
	na->nm_register = netmap_bwrap_register;
	// na->nm_txsync = netmap_bwrap_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_config = netmap_bwrap_config;
	na->nm_krings_create = netmap_bwrap_krings_create;
	na->nm_krings_delete = netmap_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
	na->pdev = hwna->pdev;
	na->nm_mem = netmap_mem_private_new(na->name,
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			0, 0, &error);
	na->na_flags |= NAF_MEM_OWNER;
	if (na->nm_mem == NULL)
		goto err_put;
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */

	bna->hwna = hwna;
	netmap_adapter_get(hwna);
	hwna->na_private = bna; /* weak reference */
	hwna->na_vp = &bna->up;

	if (hwna->na_flags & NAF_HOST_RINGS) {
		if (hwna->na_flags & NAF_SW_ONLY)
			na->na_flags |= NAF_SW_ONLY;
		na->na_flags |= NAF_HOST_RINGS;
		hostna = &bna->host.up;
		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
		hostna->ifp = hwna->ifp;
		for_rx_tx(t) {
			enum txrx r = nm_txrx_swap(t);
			nma_set_nrings(hostna, t, 1);
			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
		}
		// hostna->nm_txsync = netmap_bwrap_host_txsync;
		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
		hostna->nm_notify = netmap_bwrap_notify;
		hostna->nm_mem = na->nm_mem;
		hostna->na_private = bna;
		hostna->na_vp = &bna->up;
		na->na_hostvp = hwna->na_hostvp =
			hostna->na_hostvp = &bna->host;
		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
	}

	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
		na->name, hwna->name,
		na->num_tx_rings, na->num_tx_desc,
		na->num_rx_rings, na->num_rx_desc);

	error = netmap_attach_common(na);
	if (error) {
		goto err_free;
	}
	/* make bwrap ifp point to the real ifp.
	 * NOTE: netmap_attach_common() interprets a non-NULL na->ifp
	 * as a request to make the ifp point to the na. Since we
	 * do not want to change the na already pointed to by hwna->ifp,
	 * the following assignment has to be delayed until now
	 */
	na->ifp = hwna->ifp;
	hwna->na_flags |= NAF_BUSY;
	/* make hwna point to the allocator we are actually using,
	 * so that monitors will be able to find it
	 */
	bna->save_nmd = hwna->nm_mem;
	hwna->nm_mem = na->nm_mem;
	return 0;

err_free:
	netmap_mem_delete(na->nm_mem);
err_put:
	hwna->na_vp = hwna->na_hostvp = NULL;
	netmap_adapter_put(hwna);
	free(bna, M_DEVBUF);
	return error;
}
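
/* Naming sketch for the above (em0 is a hypothetical NIC): */
#if 0
	/* creates switch port "vale0:em0" wrapping em0's native adapter;
	 * if the NIC has host rings, a second port named "vale0:em0^"
	 * (see the snprintf above) can carry the host stack traffic.
	 */
	error = netmap_bwrap_attach("vale0:em0", hwna);
#endif
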
struct nm_bridge *
netmap_init_bridges2(u_int n)
{
	u_int i;
	struct nm_bridge *b;

	b = malloc(sizeof(struct nm_bridge) * n, M_DEVBUF,
		M_NOWAIT | M_ZERO);
	if (b == NULL)
		return NULL;
	for (i = 0; i < n; i++)
		BDG_RWINIT(&b[i]);
	return b;
}

void
netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
{
	u_int i;

	if (b == NULL)
		return;

	for (i = 0; i < n; i++)
		BDG_RWDESTROY(&b[i]);
	free(b, M_DEVBUF);
}

int
netmap_init_bridges(void)
{
#ifdef CONFIG_NET_NS
	return netmap_bns_register();
#else
	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
	if (nm_bridges == NULL)
		return ENOMEM;
	return 0;
#endif
}

void
netmap_uninit_bridges(void)
{
#ifdef CONFIG_NET_NS
	netmap_bns_unregister();
#else
	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
#endif
}
#endif /* WITH_VALE */