/*
 * Copyright (C) 2013-2016 Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (an RW lock on Linux) protects the
deletion of ports. When configuring a new port or deleting an
existing one, the lock is acquired in exclusive mode (after holding
NMG_LOCK). When forwarding, the lock is acquired in shared mode
(without NMG_LOCK). The lock is held throughout the entire forwarding
cycle, during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar scheme is used on the tx ring for NIC and host stack
ports attached to the switch.)

 */

/*
 * OS-specific code that is used only within this file.
55 * Other OS-specific code that must be accessed by drivers 56 * is present in netmap_kern.h 57 */ 58 59 #if defined(__FreeBSD__) 60 #include <sys/cdefs.h> /* prerequisite */ 61 __FBSDID("$FreeBSD$"); 62 63 #include <sys/types.h> 64 #include <sys/errno.h> 65 #include <sys/param.h> /* defines used in kernel.h */ 66 #include <sys/kernel.h> /* types used in module initialization */ 67 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 68 #include <sys/sockio.h> 69 #include <sys/socketvar.h> /* struct socket */ 70 #include <sys/malloc.h> 71 #include <sys/poll.h> 72 #include <sys/rwlock.h> 73 #include <sys/socket.h> /* sockaddrs */ 74 #include <sys/selinfo.h> 75 #include <sys/sysctl.h> 76 #include <net/if.h> 77 #include <net/if_var.h> 78 #include <net/bpf.h> /* BIOCIMMEDIATE */ 79 #include <machine/bus.h> /* bus_dmamap_* */ 80 #include <sys/endian.h> 81 #include <sys/refcount.h> 82 83 84 #define BDG_RWLOCK_T struct rwlock // struct rwlock 85 86 #define BDG_RWINIT(b) \ 87 rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) 88 #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) 89 #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) 90 #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) 91 #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) 92 #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) 93 #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) 94 95 96 #elif defined(linux) 97 98 #include "bsd_glue.h" 99 100 #elif defined(__APPLE__) 101 102 #warning OSX support is only partial 103 #include "osx_glue.h" 104 105 #elif defined(_WIN32) 106 #include "win_glue.h" 107 108 #else 109 110 #error Unsupported platform 111 112 #endif /* unsupported */ 113 114 /* 115 * common headers 116 */ 117 118 #include <net/netmap.h> 119 #include <dev/netmap/netmap_kern.h> 120 #include <dev/netmap/netmap_mem2.h> 121 122 #ifdef WITH_VALE 123 124 /* 125 * system parameters (most of them in netmap_kern.h) 126 * NM_BDG_NAME prefix for switch port names, default "vale" 127 * NM_BDG_MAXPORTS number of ports 128 * NM_BRIDGES max number of switches in the system. 129 * XXX should become a sysctl or tunable 130 * 131 * Switch ports are named valeX:Y where X is the switch name and Y 132 * is the port. If Y matches a physical interface name, the port is 133 * connected to a physical device. 134 * 135 * Unlike physical interfaces, switch ports use their own memory region 136 * for rings and buffers. 137 * The virtual interfaces use per-queue lock instead of core lock. 138 * In the tx loop, we aggregate traffic in batches to make all operations 139 * faster. The batch size is bridge_batch. 140 */ 141 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ 142 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ 143 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 144 #define NM_BDG_HASH 1024 /* forwarding table entries */ 145 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 146 #define NM_MULTISEG 64 /* max size of a chain of bufs */ 147 /* actual size of the tables */ 148 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) 149 /* NM_FT_NULL terminates a list of slots in the ft */ 150 #define NM_FT_NULL NM_BDG_BATCH_MAX 151 152 153 /* 154 * bridge_batch is set via sysctl to the max batch size to be 155 * used in the bridge. The actual value may be larger as the 156 * last packet in the block may overflow the size. 
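 *
 * Illustrative example (the value is arbitrary): on FreeBSD the knob
 * declared below shows up as dev.netmap.bridge_batch and can be tuned
 * at run time with
 *
 *	sysctl dev.netmap.bridge_batch=256
 *
 * while on Linux the glue code typically exposes it as a module
 * parameter. netmap_vp_txsync() clamps any larger value back to
 * NM_BDG_BATCH, and a value <= 0 disables forwarding (testing only).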
157 */ 158 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ 159 SYSBEGIN(vars_vale); 160 SYSCTL_DECL(_dev_netmap); 161 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); 162 SYSEND; 163 164 static int netmap_vp_create(struct nmreq *, struct ifnet *, 165 struct netmap_mem_d *nmd, struct netmap_vp_adapter **); 166 static int netmap_vp_reg(struct netmap_adapter *na, int onoff); 167 static int netmap_bwrap_reg(struct netmap_adapter *, int onoff); 168 169 /* 170 * For each output interface, nm_bdg_q is used to construct a list. 171 * bq_len is the number of output buffers (we can have coalescing 172 * during the copy). 173 */ 174 struct nm_bdg_q { 175 uint16_t bq_head; 176 uint16_t bq_tail; 177 uint32_t bq_len; /* number of buffers */ 178 }; 179 180 /* XXX revise this */ 181 struct nm_hash_ent { 182 uint64_t mac; /* the top 2 bytes are the epoch */ 183 uint64_t ports; 184 }; 185 186 /* 187 * nm_bridge is a descriptor for a VALE switch. 188 * Interfaces for a bridge are all in bdg_ports[]. 189 * The array has fixed size, an empty entry does not terminate 190 * the search, but lookups only occur on attach/detach so we 191 * don't mind if they are slow. 192 * 193 * The bridge is non blocking on the transmit ports: excess 194 * packets are dropped if there is no room on the output port. 195 * 196 * bdg_lock protects accesses to the bdg_ports array. 197 * This is a rw lock (or equivalent). 198 */ 199 struct nm_bridge { 200 /* XXX what is the proper alignment/layout ? */ 201 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ 202 int bdg_namelen; 203 uint32_t bdg_active_ports; /* 0 means free */ 204 char bdg_basename[IFNAMSIZ]; 205 206 /* Indexes of active ports (up to active_ports) 207 * and all other remaining ports. 208 */ 209 uint8_t bdg_port_index[NM_BDG_MAXPORTS]; 210 211 struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS]; 212 213 214 /* 215 * The function to decide the destination port. 216 * It returns either of an index of the destination port, 217 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to 218 * forward this packet. ring_nr is the source ring index, and the 219 * function may overwrite this value to forward this packet to a 220 * different ring index. 221 * This function must be set by netmap_bdg_ctl(). 222 */ 223 struct netmap_bdg_ops bdg_ops; 224 225 /* the forwarding table, MAC+ports. 226 * XXX should be changed to an argument to be passed to 227 * the lookup function, and allocated on attach 228 */ 229 struct nm_hash_ent ht[NM_BDG_HASH]; 230 231 #ifdef CONFIG_NET_NS 232 struct net *ns; 233 #endif /* CONFIG_NET_NS */ 234 }; 235 236 const char* 237 netmap_bdg_name(struct netmap_vp_adapter *vp) 238 { 239 struct nm_bridge *b = vp->na_bdg; 240 if (b == NULL) 241 return NULL; 242 return b->bdg_basename; 243 } 244 245 246 #ifndef CONFIG_NET_NS 247 /* 248 * XXX in principle nm_bridges could be created dynamically 249 * Right now we have a static array and deletions are protected 250 * by an exclusive lock. 251 */ 252 static struct nm_bridge *nm_bridges; 253 #endif /* !CONFIG_NET_NS */ 254 255 256 /* 257 * this is a slightly optimized copy routine which rounds 258 * to multiple of 64 bytes and is often faster than dealing 259 * with other odd sizes. We assume there is enough room 260 * in the source and destination buffers. 261 * 262 * XXX only for multiples of 64 bytes, non overlapped. 
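 *
 * Typical use, as in nm_bdg_flush() below: the caller rounds the length
 * up to the next multiple of 64 before calling, relying on the netmap
 * buffers being large enough to absorb the overshoot:
 *
 *	copy_len = (copy_len + 63) & ~63;
 *	pkt_copy(src, dst, (int)copy_len);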
263 */ 264 static inline void 265 pkt_copy(void *_src, void *_dst, int l) 266 { 267 uint64_t *src = _src; 268 uint64_t *dst = _dst; 269 if (unlikely(l >= 1024)) { 270 memcpy(dst, src, l); 271 return; 272 } 273 for (; likely(l > 0); l-=64) { 274 *dst++ = *src++; 275 *dst++ = *src++; 276 *dst++ = *src++; 277 *dst++ = *src++; 278 *dst++ = *src++; 279 *dst++ = *src++; 280 *dst++ = *src++; 281 *dst++ = *src++; 282 } 283 } 284 285 286 static int 287 nm_is_id_char(const char c) 288 { 289 return (c >= 'a' && c <= 'z') || 290 (c >= 'A' && c <= 'Z') || 291 (c >= '0' && c <= '9') || 292 (c == '_'); 293 } 294 295 /* Validate the name of a VALE bridge port and return the 296 * position of the ":" character. */ 297 static int 298 nm_vale_name_validate(const char *name) 299 { 300 int colon_pos = -1; 301 int i; 302 303 if (!name || strlen(name) < strlen(NM_BDG_NAME)) { 304 return -1; 305 } 306 307 for (i = 0; name[i]; i++) { 308 if (name[i] == ':') { 309 if (colon_pos != -1) { 310 return -1; 311 } 312 colon_pos = i; 313 } else if (!nm_is_id_char(name[i])) { 314 return -1; 315 } 316 } 317 318 if (i >= IFNAMSIZ) { 319 return -1; 320 } 321 322 return colon_pos; 323 } 324 325 /* 326 * locate a bridge among the existing ones. 327 * MUST BE CALLED WITH NMG_LOCK() 328 * 329 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. 330 * We assume that this is called with a name of at least NM_NAME chars. 331 */ 332 static struct nm_bridge * 333 nm_find_bridge(const char *name, int create) 334 { 335 int i, namelen; 336 struct nm_bridge *b = NULL, *bridges; 337 u_int num_bridges; 338 339 NMG_LOCK_ASSERT(); 340 341 netmap_bns_getbridges(&bridges, &num_bridges); 342 343 namelen = nm_vale_name_validate(name); 344 if (namelen < 0) { 345 D("invalid bridge name %s", name ? name : NULL); 346 return NULL; 347 } 348 349 /* lookup the name, remember empty slot if there is one */ 350 for (i = 0; i < num_bridges; i++) { 351 struct nm_bridge *x = bridges + i; 352 353 if (x->bdg_active_ports == 0) { 354 if (create && b == NULL) 355 b = x; /* record empty slot */ 356 } else if (x->bdg_namelen != namelen) { 357 continue; 358 } else if (strncmp(name, x->bdg_basename, namelen) == 0) { 359 ND("found '%.*s' at %d", namelen, name, i); 360 b = x; 361 break; 362 } 363 } 364 if (i == num_bridges && b) { /* name not found, can create entry */ 365 /* initialize the bridge */ 366 strncpy(b->bdg_basename, name, namelen); 367 ND("create new bridge %s with ports %d", b->bdg_basename, 368 b->bdg_active_ports); 369 b->bdg_namelen = namelen; 370 b->bdg_active_ports = 0; 371 for (i = 0; i < NM_BDG_MAXPORTS; i++) 372 b->bdg_port_index[i] = i; 373 /* set the default function */ 374 b->bdg_ops.lookup = netmap_bdg_learning; 375 /* reset the MAC address table */ 376 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); 377 NM_BNS_GET(b); 378 } 379 return b; 380 } 381 382 383 /* 384 * Free the forwarding tables for rings attached to switch ports. 385 */ 386 static void 387 nm_free_bdgfwd(struct netmap_adapter *na) 388 { 389 int nrings, i; 390 struct netmap_kring *kring; 391 392 NMG_LOCK_ASSERT(); 393 nrings = na->num_tx_rings; 394 kring = na->tx_rings; 395 for (i = 0; i < nrings; i++) { 396 if (kring[i].nkr_ft) { 397 nm_os_free(kring[i].nkr_ft); 398 kring[i].nkr_ft = NULL; /* protect from freeing twice */ 399 } 400 } 401 } 402 403 404 /* 405 * Allocate the forwarding tables for the rings attached to the bridge ports. 
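 *
 * Each tx kring gets one allocation laid out as follows (matching both
 * the size computed below and the pointer arithmetic in nm_bdg_flush()):
 *
 *	ft[NM_BDG_BATCH_MAX]	struct nm_bdg_fwd, the packet batch
 *	dstq[num_dstq]		struct nm_bdg_q, one queue per (port, ring)
 *				pair plus one for broadcast traffic
 *	dsts[NM_BDG_BATCH_MAX]	uint16_t, indexes of the queues actually
 *				in use for the current batch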
406 */ 407 static int 408 nm_alloc_bdgfwd(struct netmap_adapter *na) 409 { 410 int nrings, l, i, num_dstq; 411 struct netmap_kring *kring; 412 413 NMG_LOCK_ASSERT(); 414 /* all port:rings + broadcast */ 415 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 416 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 417 l += sizeof(struct nm_bdg_q) * num_dstq; 418 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 419 420 nrings = netmap_real_rings(na, NR_TX); 421 kring = na->tx_rings; 422 for (i = 0; i < nrings; i++) { 423 struct nm_bdg_fwd *ft; 424 struct nm_bdg_q *dstq; 425 int j; 426 427 ft = nm_os_malloc(l); 428 if (!ft) { 429 nm_free_bdgfwd(na); 430 return ENOMEM; 431 } 432 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 433 for (j = 0; j < num_dstq; j++) { 434 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 435 dstq[j].bq_len = 0; 436 } 437 kring[i].nkr_ft = ft; 438 } 439 return 0; 440 } 441 442 443 /* remove from bridge b the ports in slots hw and sw 444 * (sw can be -1 if not needed) 445 */ 446 static void 447 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) 448 { 449 int s_hw = hw, s_sw = sw; 450 int i, lim =b->bdg_active_ports; 451 uint8_t tmp[NM_BDG_MAXPORTS]; 452 453 /* 454 New algorithm: 455 make a copy of bdg_port_index; 456 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port 457 in the array of bdg_port_index, replacing them with 458 entries from the bottom of the array; 459 decrement bdg_active_ports; 460 acquire BDG_WLOCK() and copy back the array. 461 */ 462 463 if (netmap_verbose) 464 D("detach %d and %d (lim %d)", hw, sw, lim); 465 /* make a copy of the list of active ports, update it, 466 * and then copy back within BDG_WLOCK(). 467 */ 468 memcpy(tmp, b->bdg_port_index, sizeof(tmp)); 469 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { 470 if (hw >= 0 && tmp[i] == hw) { 471 ND("detach hw %d at %d", hw, i); 472 lim--; /* point to last active port */ 473 tmp[i] = tmp[lim]; /* swap with i */ 474 tmp[lim] = hw; /* now this is inactive */ 475 hw = -1; 476 } else if (sw >= 0 && tmp[i] == sw) { 477 ND("detach sw %d at %d", sw, i); 478 lim--; 479 tmp[i] = tmp[lim]; 480 tmp[lim] = sw; 481 sw = -1; 482 } else { 483 i++; 484 } 485 } 486 if (hw >= 0 || sw >= 0) { 487 D("XXX delete failed hw %d sw %d, should panic...", hw, sw); 488 } 489 490 BDG_WLOCK(b); 491 if (b->bdg_ops.dtor) 492 b->bdg_ops.dtor(b->bdg_ports[s_hw]); 493 b->bdg_ports[s_hw] = NULL; 494 if (s_sw >= 0) { 495 b->bdg_ports[s_sw] = NULL; 496 } 497 memcpy(b->bdg_port_index, tmp, sizeof(tmp)); 498 b->bdg_active_ports = lim; 499 BDG_WUNLOCK(b); 500 501 ND("now %d active ports", lim); 502 if (lim == 0) { 503 ND("marking bridge %s as free", b->bdg_basename); 504 bzero(&b->bdg_ops, sizeof(b->bdg_ops)); 505 NM_BNS_PUT(b); 506 } 507 } 508 509 /* nm_bdg_ctl callback for VALE ports */ 510 static int 511 netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) 512 { 513 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; 514 struct nm_bridge *b = vpna->na_bdg; 515 516 (void)nmr; // XXX merge ? 
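	/*
	 * Nothing to do on attach. On detach: stop the rings, unlink the
	 * port from the bridge under the bridge write lock (inside
	 * netmap_bdg_detach_common()), restart the rings, and finally drop
	 * the reference that was taken at attach time.
	 */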
517 if (attach) 518 return 0; /* nothing to do */ 519 if (b) { 520 netmap_set_all_rings(na, 0 /* disable */); 521 netmap_bdg_detach_common(b, vpna->bdg_port, -1); 522 vpna->na_bdg = NULL; 523 netmap_set_all_rings(na, 1 /* enable */); 524 } 525 /* I have took reference just for attach */ 526 netmap_adapter_put(na); 527 return 0; 528 } 529 530 /* nm_dtor callback for ephemeral VALE ports */ 531 static void 532 netmap_vp_dtor(struct netmap_adapter *na) 533 { 534 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; 535 struct nm_bridge *b = vpna->na_bdg; 536 537 ND("%s has %d references", na->name, na->na_refcount); 538 539 if (b) { 540 netmap_bdg_detach_common(b, vpna->bdg_port, -1); 541 } 542 543 if (vpna->autodelete && na->ifp != NULL) { 544 ND("releasing %s", na->ifp->if_xname); 545 NMG_UNLOCK(); 546 nm_os_vi_detach(na->ifp); 547 NMG_LOCK(); 548 } 549 } 550 551 /* remove a persistent VALE port from the system */ 552 static int 553 nm_vi_destroy(const char *name) 554 { 555 struct ifnet *ifp; 556 struct netmap_vp_adapter *vpna; 557 int error; 558 559 ifp = ifunit_ref(name); 560 if (!ifp) 561 return ENXIO; 562 NMG_LOCK(); 563 /* make sure this is actually a VALE port */ 564 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { 565 error = EINVAL; 566 goto err; 567 } 568 569 vpna = (struct netmap_vp_adapter *)NA(ifp); 570 571 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */ 572 if (vpna->autodelete) { 573 error = EINVAL; 574 goto err; 575 } 576 577 /* also make sure that nobody is using the inferface */ 578 if (NETMAP_OWNED_BY_ANY(&vpna->up) || 579 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) { 580 error = EBUSY; 581 goto err; 582 } 583 584 NMG_UNLOCK(); 585 586 D("destroying a persistent vale interface %s", ifp->if_xname); 587 /* Linux requires all the references are released 588 * before unregister 589 */ 590 netmap_detach(ifp); 591 if_rele(ifp); 592 nm_os_vi_detach(ifp); 593 return 0; 594 595 err: 596 NMG_UNLOCK(); 597 if_rele(ifp); 598 return error; 599 } 600 601 static int 602 nm_update_info(struct nmreq *nmr, struct netmap_adapter *na) 603 { 604 nmr->nr_rx_rings = na->num_rx_rings; 605 nmr->nr_tx_rings = na->num_tx_rings; 606 nmr->nr_rx_slots = na->num_rx_desc; 607 nmr->nr_tx_slots = na->num_tx_desc; 608 return netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, NULL, &nmr->nr_arg2); 609 } 610 611 /* 612 * Create a virtual interface registered to the system. 613 * The interface will be attached to a bridge later. 
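 *
 * A minimal userspace sketch of the operation, roughly what
 * "vale-ctl -n myport" does with the legacy nmreq ABI (the port name and
 * the missing error handling are illustrative only; note that the name
 * must not begin with the "vale" prefix, see the check below):
 *
 *	struct nmreq nmr;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	memset(&nmr, 0, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "myport", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_cmd = NETMAP_BDG_NEWIF;
 *	ioctl(fd, NIOCREGIF, &nmr);
 *
 * The resulting port can later be attached to a switch with
 * NETMAP_BDG_ATTACH and removed with NETMAP_BDG_DELIF.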
614 */ 615 int 616 netmap_vi_create(struct nmreq *nmr, int autodelete) 617 { 618 struct ifnet *ifp; 619 struct netmap_vp_adapter *vpna; 620 struct netmap_mem_d *nmd = NULL; 621 int error; 622 623 /* don't include VALE prefix */ 624 if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) 625 return EINVAL; 626 ifp = ifunit_ref(nmr->nr_name); 627 if (ifp) { /* already exist, cannot create new one */ 628 error = EEXIST; 629 NMG_LOCK(); 630 if (NM_NA_VALID(ifp)) { 631 int update_err = nm_update_info(nmr, NA(ifp)); 632 if (update_err) 633 error = update_err; 634 } 635 NMG_UNLOCK(); 636 if_rele(ifp); 637 return error; 638 } 639 error = nm_os_vi_persist(nmr->nr_name, &ifp); 640 if (error) 641 return error; 642 643 NMG_LOCK(); 644 if (nmr->nr_arg2) { 645 nmd = netmap_mem_find(nmr->nr_arg2); 646 if (nmd == NULL) { 647 error = EINVAL; 648 goto err_1; 649 } 650 } 651 /* netmap_vp_create creates a struct netmap_vp_adapter */ 652 error = netmap_vp_create(nmr, ifp, nmd, &vpna); 653 if (error) { 654 D("error %d", error); 655 goto err_1; 656 } 657 /* persist-specific routines */ 658 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; 659 if (!autodelete) { 660 netmap_adapter_get(&vpna->up); 661 } else { 662 vpna->autodelete = 1; 663 } 664 NM_ATTACH_NA(ifp, &vpna->up); 665 /* return the updated info */ 666 error = nm_update_info(nmr, &vpna->up); 667 if (error) { 668 goto err_2; 669 } 670 D("returning nr_arg2 %d", nmr->nr_arg2); 671 if (nmd) 672 netmap_mem_put(nmd); 673 NMG_UNLOCK(); 674 D("created %s", ifp->if_xname); 675 return 0; 676 677 err_2: 678 netmap_detach(ifp); 679 err_1: 680 if (nmd) 681 netmap_mem_put(nmd); 682 NMG_UNLOCK(); 683 nm_os_vi_detach(ifp); 684 685 return error; 686 } 687 688 /* Try to get a reference to a netmap adapter attached to a VALE switch. 689 * If the adapter is found (or is created), this function returns 0, a 690 * non NULL pointer is returned into *na, and the caller holds a 691 * reference to the adapter. 692 * If an adapter is not found, then no reference is grabbed and the 693 * function returns an error code, or 0 if there is just a VALE prefix 694 * mismatch. Therefore the caller holds a reference when 695 * (*na != NULL && return == 0). 696 */ 697 int 698 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, 699 struct netmap_mem_d *nmd, int create) 700 { 701 char *nr_name = nmr->nr_name; 702 const char *ifname; 703 struct ifnet *ifp = NULL; 704 int error = 0; 705 struct netmap_vp_adapter *vpna, *hostna = NULL; 706 struct nm_bridge *b; 707 int i, j, cand = -1, cand2 = -1; 708 int needed; 709 710 *na = NULL; /* default return value */ 711 712 /* first try to see if this is a bridge port. */ 713 NMG_LOCK_ASSERT(); 714 if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) { 715 return 0; /* no error, but no VALE prefix */ 716 } 717 718 b = nm_find_bridge(nr_name, create); 719 if (b == NULL) { 720 D("no bridges available for '%s'", nr_name); 721 return (create ? ENOMEM : ENXIO); 722 } 723 if (strlen(nr_name) < b->bdg_namelen) /* impossible */ 724 panic("x"); 725 726 /* Now we are sure that name starts with the bridge's name, 727 * lookup the port in the bridge. We need to scan the entire 728 * list. It is not important to hold a WLOCK on the bridge 729 * during the search because NMG_LOCK already guarantees 730 * that there are no other possible writers. 
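	 *
	 * Examples of names that reach this point (with the default "vale"
	 * prefix): "vale0:1" names an ephemeral virtual port "1" on switch
	 * "vale0", while "vale0:em0" attaches the physical interface em0,
	 * if one exists, to the same switch (optionally together with its
	 * host stack port when NETMAP_BDG_HOST is requested).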
731 */ 732 733 /* lookup in the local list of ports */ 734 for (j = 0; j < b->bdg_active_ports; j++) { 735 i = b->bdg_port_index[j]; 736 vpna = b->bdg_ports[i]; 737 // KASSERT(na != NULL); 738 ND("checking %s", vpna->up.name); 739 if (!strcmp(vpna->up.name, nr_name)) { 740 netmap_adapter_get(&vpna->up); 741 ND("found existing if %s refs %d", nr_name) 742 *na = &vpna->up; 743 return 0; 744 } 745 } 746 /* not found, should we create it? */ 747 if (!create) 748 return ENXIO; 749 /* yes we should, see if we have space to attach entries */ 750 needed = 2; /* in some cases we only need 1 */ 751 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { 752 D("bridge full %d, cannot create new port", b->bdg_active_ports); 753 return ENOMEM; 754 } 755 /* record the next two ports available, but do not allocate yet */ 756 cand = b->bdg_port_index[b->bdg_active_ports]; 757 cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; 758 ND("+++ bridge %s port %s used %d avail %d %d", 759 b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2); 760 761 /* 762 * try see if there is a matching NIC with this name 763 * (after the bridge's name) 764 */ 765 ifname = nr_name + b->bdg_namelen + 1; 766 ifp = ifunit_ref(ifname); 767 if (!ifp) { 768 /* Create an ephemeral virtual port 769 * This block contains all the ephemeral-specific logics 770 */ 771 if (nmr->nr_cmd) { 772 /* nr_cmd must be 0 for a virtual port */ 773 error = EINVAL; 774 goto out; 775 } 776 777 /* bdg_netmap_attach creates a struct netmap_adapter */ 778 error = netmap_vp_create(nmr, NULL, nmd, &vpna); 779 if (error) { 780 D("error %d", error); 781 goto out; 782 } 783 /* shortcut - we can skip get_hw_na(), 784 * ownership check and nm_bdg_attach() 785 */ 786 } else { 787 struct netmap_adapter *hw; 788 789 error = netmap_get_hw_na(ifp, nmd, &hw); 790 if (error || hw == NULL) 791 goto out; 792 793 /* host adapter might not be created */ 794 error = hw->nm_bdg_attach(nr_name, hw); 795 if (error) 796 goto out; 797 vpna = hw->na_vp; 798 hostna = hw->na_hostvp; 799 if (nmr->nr_arg1 != NETMAP_BDG_HOST) 800 hostna = NULL; 801 } 802 803 BDG_WLOCK(b); 804 vpna->bdg_port = cand; 805 ND("NIC %p to bridge port %d", vpna, cand); 806 /* bind the port to the bridge (virtual ports are not active) */ 807 b->bdg_ports[cand] = vpna; 808 vpna->na_bdg = b; 809 b->bdg_active_ports++; 810 if (hostna != NULL) { 811 /* also bind the host stack to the bridge */ 812 b->bdg_ports[cand2] = hostna; 813 hostna->bdg_port = cand2; 814 hostna->na_bdg = b; 815 b->bdg_active_ports++; 816 ND("host %p to bridge port %d", hostna, cand2); 817 } 818 ND("if %s refs %d", ifname, vpna->up.na_refcount); 819 BDG_WUNLOCK(b); 820 *na = &vpna->up; 821 netmap_adapter_get(*na); 822 823 out: 824 if (ifp) 825 if_rele(ifp); 826 827 return error; 828 } 829 830 831 /* Process NETMAP_BDG_ATTACH */ 832 static int 833 nm_bdg_ctl_attach(struct nmreq *nmr) 834 { 835 struct netmap_adapter *na; 836 struct netmap_mem_d *nmd = NULL; 837 int error; 838 839 NMG_LOCK(); 840 841 if (nmr->nr_arg2) { 842 nmd = netmap_mem_find(nmr->nr_arg2); 843 if (nmd == NULL) { 844 error = EINVAL; 845 goto unlock_exit; 846 } 847 } 848 849 error = netmap_get_bdg_na(nmr, &na, nmd, 1 /* create if not exists */); 850 if (error) /* no device */ 851 goto unlock_exit; 852 853 if (na == NULL) { /* VALE prefix missing */ 854 error = EINVAL; 855 goto unlock_exit; 856 } 857 858 if (NETMAP_OWNED_BY_ANY(na)) { 859 error = EBUSY; 860 goto unref_exit; 861 } 862 863 if (na->nm_bdg_ctl) { 864 /* nop for VALE ports. 
The bwrap needs to put the hwna 865 * in netmap mode (see netmap_bwrap_bdg_ctl) 866 */ 867 error = na->nm_bdg_ctl(na, nmr, 1); 868 if (error) 869 goto unref_exit; 870 ND("registered %s to netmap-mode", na->name); 871 } 872 NMG_UNLOCK(); 873 return 0; 874 875 unref_exit: 876 netmap_adapter_put(na); 877 unlock_exit: 878 NMG_UNLOCK(); 879 return error; 880 } 881 882 static inline int 883 nm_is_bwrap(struct netmap_adapter *na) 884 { 885 return na->nm_register == netmap_bwrap_reg; 886 } 887 888 /* process NETMAP_BDG_DETACH */ 889 static int 890 nm_bdg_ctl_detach(struct nmreq *nmr) 891 { 892 struct netmap_adapter *na; 893 int error; 894 895 NMG_LOCK(); 896 error = netmap_get_bdg_na(nmr, &na, NULL, 0 /* don't create */); 897 if (error) { /* no device, or another bridge or user owns the device */ 898 goto unlock_exit; 899 } 900 901 if (na == NULL) { /* VALE prefix missing */ 902 error = EINVAL; 903 goto unlock_exit; 904 } else if (nm_is_bwrap(na) && 905 ((struct netmap_bwrap_adapter *)na)->na_polling_state) { 906 /* Don't detach a NIC with polling */ 907 error = EBUSY; 908 netmap_adapter_put(na); 909 goto unlock_exit; 910 } 911 if (na->nm_bdg_ctl) { 912 /* remove the port from bridge. The bwrap 913 * also needs to put the hwna in normal mode 914 */ 915 error = na->nm_bdg_ctl(na, nmr, 0); 916 } 917 918 netmap_adapter_put(na); 919 unlock_exit: 920 NMG_UNLOCK(); 921 return error; 922 923 } 924 925 struct nm_bdg_polling_state; 926 struct 927 nm_bdg_kthread { 928 struct nm_kctx *nmk; 929 u_int qfirst; 930 u_int qlast; 931 struct nm_bdg_polling_state *bps; 932 }; 933 934 struct nm_bdg_polling_state { 935 bool configured; 936 bool stopped; 937 struct netmap_bwrap_adapter *bna; 938 u_int reg; 939 u_int qfirst; 940 u_int qlast; 941 u_int cpu_from; 942 u_int ncpus; 943 struct nm_bdg_kthread *kthreads; 944 }; 945 946 static void 947 netmap_bwrap_polling(void *data, int is_kthread) 948 { 949 struct nm_bdg_kthread *nbk = data; 950 struct netmap_bwrap_adapter *bna; 951 u_int qfirst, qlast, i; 952 struct netmap_kring *kring0, *kring; 953 954 if (!nbk) 955 return; 956 qfirst = nbk->qfirst; 957 qlast = nbk->qlast; 958 bna = nbk->bps->bna; 959 kring0 = NMR(bna->hwna, NR_RX); 960 961 for (i = qfirst; i < qlast; i++) { 962 kring = kring0 + i; 963 kring->nm_notify(kring, 0); 964 } 965 } 966 967 static int 968 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps) 969 { 970 struct nm_kctx_cfg kcfg; 971 int i, j; 972 973 bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus); 974 if (bps->kthreads == NULL) 975 return ENOMEM; 976 977 bzero(&kcfg, sizeof(kcfg)); 978 kcfg.worker_fn = netmap_bwrap_polling; 979 kcfg.use_kthread = 1; 980 for (i = 0; i < bps->ncpus; i++) { 981 struct nm_bdg_kthread *t = bps->kthreads + i; 982 int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC); 983 int affinity = bps->cpu_from + i; 984 985 t->bps = bps; 986 t->qfirst = all ? bps->qfirst /* must be 0 */: affinity; 987 t->qlast = all ? 
bps->qlast : t->qfirst + 1; 988 D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst, 989 t->qlast); 990 991 kcfg.type = i; 992 kcfg.worker_private = t; 993 t->nmk = nm_os_kctx_create(&kcfg, 0, NULL); 994 if (t->nmk == NULL) { 995 goto cleanup; 996 } 997 nm_os_kctx_worker_setaff(t->nmk, affinity); 998 } 999 return 0; 1000 1001 cleanup: 1002 for (j = 0; j < i; j++) { 1003 struct nm_bdg_kthread *t = bps->kthreads + i; 1004 nm_os_kctx_destroy(t->nmk); 1005 } 1006 nm_os_free(bps->kthreads); 1007 return EFAULT; 1008 } 1009 1010 /* A variant of ptnetmap_start_kthreads() */ 1011 static int 1012 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps) 1013 { 1014 int error, i, j; 1015 1016 if (!bps) { 1017 D("polling is not configured"); 1018 return EFAULT; 1019 } 1020 bps->stopped = false; 1021 1022 for (i = 0; i < bps->ncpus; i++) { 1023 struct nm_bdg_kthread *t = bps->kthreads + i; 1024 error = nm_os_kctx_worker_start(t->nmk); 1025 if (error) { 1026 D("error in nm_kthread_start()"); 1027 goto cleanup; 1028 } 1029 } 1030 return 0; 1031 1032 cleanup: 1033 for (j = 0; j < i; j++) { 1034 struct nm_bdg_kthread *t = bps->kthreads + i; 1035 nm_os_kctx_worker_stop(t->nmk); 1036 } 1037 bps->stopped = true; 1038 return error; 1039 } 1040 1041 static void 1042 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps) 1043 { 1044 int i; 1045 1046 if (!bps) 1047 return; 1048 1049 for (i = 0; i < bps->ncpus; i++) { 1050 struct nm_bdg_kthread *t = bps->kthreads + i; 1051 nm_os_kctx_worker_stop(t->nmk); 1052 nm_os_kctx_destroy(t->nmk); 1053 } 1054 bps->stopped = true; 1055 } 1056 1057 static int 1058 get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na, 1059 struct nm_bdg_polling_state *bps) 1060 { 1061 int req_cpus, avail_cpus, core_from; 1062 u_int reg, i, qfirst, qlast; 1063 1064 avail_cpus = nm_os_ncpus(); 1065 req_cpus = nmr->nr_arg1; 1066 1067 if (req_cpus == 0) { 1068 D("req_cpus must be > 0"); 1069 return EINVAL; 1070 } else if (req_cpus >= avail_cpus) { 1071 D("for safety, we need at least one core left in the system"); 1072 return EINVAL; 1073 } 1074 reg = nmr->nr_flags & NR_REG_MASK; 1075 i = nmr->nr_ringid & NETMAP_RING_MASK; 1076 /* 1077 * ONE_NIC: dedicate one core to one ring. If multiple cores 1078 * are specified, consecutive rings are also polled. 1079 * For example, if ringid=2 and 2 cores are given, 1080 * ring 2 and 3 are polled by core 2 and 3, respectively. 1081 * ALL_NIC: poll all the rings using a core specified by ringid. 1082 * the number of cores must be 1. 1083 */ 1084 if (reg == NR_REG_ONE_NIC) { 1085 if (i + req_cpus > nma_get_nrings(na, NR_RX)) { 1086 D("only %d rings exist (ring %u-%u is given)", 1087 nma_get_nrings(na, NR_RX), i, i+req_cpus); 1088 return EINVAL; 1089 } 1090 qfirst = i; 1091 qlast = qfirst + req_cpus; 1092 core_from = qfirst; 1093 } else if (reg == NR_REG_ALL_NIC) { 1094 if (req_cpus != 1) { 1095 D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus); 1096 return EINVAL; 1097 } 1098 qfirst = 0; 1099 qlast = nma_get_nrings(na, NR_RX); 1100 core_from = i; 1101 } else { 1102 D("reg must be ALL_NIC or ONE_NIC"); 1103 return EINVAL; 1104 } 1105 1106 bps->reg = reg; 1107 bps->qfirst = qfirst; 1108 bps->qlast = qlast; 1109 bps->cpu_from = core_from; 1110 bps->ncpus = req_cpus; 1111 D("%s qfirst %u qlast %u cpu_from %u ncpus %u", 1112 reg == NR_REG_ALL_NIC ? 
"REG_ALL_NIC" : "REG_ONE_NIC", 1113 qfirst, qlast, core_from, req_cpus); 1114 return 0; 1115 } 1116 1117 static int 1118 nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na) 1119 { 1120 struct nm_bdg_polling_state *bps; 1121 struct netmap_bwrap_adapter *bna; 1122 int error; 1123 1124 bna = (struct netmap_bwrap_adapter *)na; 1125 if (bna->na_polling_state) { 1126 D("ERROR adapter already in polling mode"); 1127 return EFAULT; 1128 } 1129 1130 bps = nm_os_malloc(sizeof(*bps)); 1131 if (!bps) 1132 return ENOMEM; 1133 bps->configured = false; 1134 bps->stopped = true; 1135 1136 if (get_polling_cfg(nmr, na, bps)) { 1137 nm_os_free(bps); 1138 return EINVAL; 1139 } 1140 1141 if (nm_bdg_create_kthreads(bps)) { 1142 nm_os_free(bps); 1143 return EFAULT; 1144 } 1145 1146 bps->configured = true; 1147 bna->na_polling_state = bps; 1148 bps->bna = bna; 1149 1150 /* disable interrupt if possible */ 1151 if (bna->hwna->nm_intr) 1152 bna->hwna->nm_intr(bna->hwna, 0); 1153 /* start kthread now */ 1154 error = nm_bdg_polling_start_kthreads(bps); 1155 if (error) { 1156 D("ERROR nm_bdg_polling_start_kthread()"); 1157 nm_os_free(bps->kthreads); 1158 nm_os_free(bps); 1159 bna->na_polling_state = NULL; 1160 if (bna->hwna->nm_intr) 1161 bna->hwna->nm_intr(bna->hwna, 1); 1162 } 1163 return error; 1164 } 1165 1166 static int 1167 nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na) 1168 { 1169 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; 1170 struct nm_bdg_polling_state *bps; 1171 1172 if (!bna->na_polling_state) { 1173 D("ERROR adapter is not in polling mode"); 1174 return EFAULT; 1175 } 1176 bps = bna->na_polling_state; 1177 nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state); 1178 bps->configured = false; 1179 nm_os_free(bps); 1180 bna->na_polling_state = NULL; 1181 /* reenable interrupt */ 1182 if (bna->hwna->nm_intr) 1183 bna->hwna->nm_intr(bna->hwna, 1); 1184 return 0; 1185 } 1186 1187 /* Called by either user's context (netmap_ioctl()) 1188 * or external kernel modules (e.g., Openvswitch). 1189 * Operation is indicated in nmr->nr_cmd. 1190 * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge 1191 * requires bdg_ops argument; the other commands ignore this argument. 1192 * 1193 * Called without NMG_LOCK. 
1194 */ 1195 int 1196 netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops) 1197 { 1198 struct nm_bridge *b, *bridges; 1199 struct netmap_adapter *na; 1200 struct netmap_vp_adapter *vpna; 1201 char *name = nmr->nr_name; 1202 int cmd = nmr->nr_cmd, namelen = strlen(name); 1203 int error = 0, i, j; 1204 u_int num_bridges; 1205 1206 netmap_bns_getbridges(&bridges, &num_bridges); 1207 1208 switch (cmd) { 1209 case NETMAP_BDG_NEWIF: 1210 error = netmap_vi_create(nmr, 0 /* no autodelete */); 1211 break; 1212 1213 case NETMAP_BDG_DELIF: 1214 error = nm_vi_destroy(nmr->nr_name); 1215 break; 1216 1217 case NETMAP_BDG_ATTACH: 1218 error = nm_bdg_ctl_attach(nmr); 1219 break; 1220 1221 case NETMAP_BDG_DETACH: 1222 error = nm_bdg_ctl_detach(nmr); 1223 break; 1224 1225 case NETMAP_BDG_LIST: 1226 /* this is used to enumerate bridges and ports */ 1227 if (namelen) { /* look up indexes of bridge and port */ 1228 if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) { 1229 error = EINVAL; 1230 break; 1231 } 1232 NMG_LOCK(); 1233 b = nm_find_bridge(name, 0 /* don't create */); 1234 if (!b) { 1235 error = ENOENT; 1236 NMG_UNLOCK(); 1237 break; 1238 } 1239 1240 error = 0; 1241 nmr->nr_arg1 = b - bridges; /* bridge index */ 1242 nmr->nr_arg2 = NM_BDG_NOPORT; 1243 for (j = 0; j < b->bdg_active_ports; j++) { 1244 i = b->bdg_port_index[j]; 1245 vpna = b->bdg_ports[i]; 1246 if (vpna == NULL) { 1247 D("---AAAAAAAAARGH-------"); 1248 continue; 1249 } 1250 /* the former and the latter identify a 1251 * virtual port and a NIC, respectively 1252 */ 1253 if (!strcmp(vpna->up.name, name)) { 1254 nmr->nr_arg2 = i; /* port index */ 1255 break; 1256 } 1257 } 1258 NMG_UNLOCK(); 1259 } else { 1260 /* return the first non-empty entry starting from 1261 * bridge nr_arg1 and port nr_arg2. 1262 * 1263 * Users can detect the end of the same bridge by 1264 * seeing the new and old value of nr_arg1, and can 1265 * detect the end of all the bridge by error != 0 1266 */ 1267 i = nmr->nr_arg1; 1268 j = nmr->nr_arg2; 1269 1270 NMG_LOCK(); 1271 for (error = ENOENT; i < NM_BRIDGES; i++) { 1272 b = bridges + i; 1273 for ( ; j < NM_BDG_MAXPORTS; j++) { 1274 if (b->bdg_ports[j] == NULL) 1275 continue; 1276 vpna = b->bdg_ports[j]; 1277 strncpy(name, vpna->up.name, (size_t)IFNAMSIZ); 1278 error = 0; 1279 goto out; 1280 } 1281 j = 0; /* following bridges scan from 0 */ 1282 } 1283 out: 1284 nmr->nr_arg1 = i; 1285 nmr->nr_arg2 = j; 1286 NMG_UNLOCK(); 1287 } 1288 break; 1289 1290 case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */ 1291 /* register callbacks to the given bridge. 1292 * nmr->nr_name may be just bridge's name (including ':' 1293 * if it is not just NM_NAME). 1294 */ 1295 if (!bdg_ops) { 1296 error = EINVAL; 1297 break; 1298 } 1299 NMG_LOCK(); 1300 b = nm_find_bridge(name, 0 /* don't create */); 1301 if (!b) { 1302 error = EINVAL; 1303 } else { 1304 b->bdg_ops = *bdg_ops; 1305 } 1306 NMG_UNLOCK(); 1307 break; 1308 1309 case NETMAP_BDG_VNET_HDR: 1310 /* Valid lengths for the virtio-net header are 0 (no header), 1311 10 and 12. 
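		   These correspond to: no header at all; the 10-byte
		   struct virtio_net_hdr (the same size as the
		   struct nm_vnet_hdr checked below); and the 12-byte
		   variant used when the mergeable rx buffers feature
		   is negotiated.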
*/ 1312 if (nmr->nr_arg1 != 0 && 1313 nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) && 1314 nmr->nr_arg1 != 12) { 1315 error = EINVAL; 1316 break; 1317 } 1318 NMG_LOCK(); 1319 error = netmap_get_bdg_na(nmr, &na, NULL, 0); 1320 if (na && !error) { 1321 vpna = (struct netmap_vp_adapter *)na; 1322 na->virt_hdr_len = nmr->nr_arg1; 1323 if (na->virt_hdr_len) { 1324 vpna->mfs = NETMAP_BUF_SIZE(na); 1325 } 1326 D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na); 1327 netmap_adapter_put(na); 1328 } else if (!na) { 1329 error = ENXIO; 1330 } 1331 NMG_UNLOCK(); 1332 break; 1333 1334 case NETMAP_BDG_POLLING_ON: 1335 case NETMAP_BDG_POLLING_OFF: 1336 NMG_LOCK(); 1337 error = netmap_get_bdg_na(nmr, &na, NULL, 0); 1338 if (na && !error) { 1339 if (!nm_is_bwrap(na)) { 1340 error = EOPNOTSUPP; 1341 } else if (cmd == NETMAP_BDG_POLLING_ON) { 1342 error = nm_bdg_ctl_polling_start(nmr, na); 1343 if (!error) 1344 netmap_adapter_get(na); 1345 } else { 1346 error = nm_bdg_ctl_polling_stop(nmr, na); 1347 if (!error) 1348 netmap_adapter_put(na); 1349 } 1350 netmap_adapter_put(na); 1351 } 1352 NMG_UNLOCK(); 1353 break; 1354 1355 default: 1356 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); 1357 error = EINVAL; 1358 break; 1359 } 1360 return error; 1361 } 1362 1363 int 1364 netmap_bdg_config(struct nmreq *nmr) 1365 { 1366 struct nm_bridge *b; 1367 int error = EINVAL; 1368 1369 NMG_LOCK(); 1370 b = nm_find_bridge(nmr->nr_name, 0); 1371 if (!b) { 1372 NMG_UNLOCK(); 1373 return error; 1374 } 1375 NMG_UNLOCK(); 1376 /* Don't call config() with NMG_LOCK() held */ 1377 BDG_RLOCK(b); 1378 if (b->bdg_ops.config != NULL) 1379 error = b->bdg_ops.config((struct nm_ifreq *)nmr); 1380 BDG_RUNLOCK(b); 1381 return error; 1382 } 1383 1384 1385 /* nm_krings_create callback for VALE ports. 1386 * Calls the standard netmap_krings_create, then adds leases on rx 1387 * rings and bdgfwd on tx rings. 1388 */ 1389 static int 1390 netmap_vp_krings_create(struct netmap_adapter *na) 1391 { 1392 u_int tailroom; 1393 int error, i; 1394 uint32_t *leases; 1395 u_int nrx = netmap_real_rings(na, NR_RX); 1396 1397 /* 1398 * Leases are attached to RX rings on vale ports 1399 */ 1400 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; 1401 1402 error = netmap_krings_create(na, tailroom); 1403 if (error) 1404 return error; 1405 1406 leases = na->tailroom; 1407 1408 for (i = 0; i < nrx; i++) { /* Receive rings */ 1409 na->rx_rings[i].nkr_leases = leases; 1410 leases += na->num_rx_desc; 1411 } 1412 1413 error = nm_alloc_bdgfwd(na); 1414 if (error) { 1415 netmap_krings_delete(na); 1416 return error; 1417 } 1418 1419 return 0; 1420 } 1421 1422 1423 /* nm_krings_delete callback for VALE ports. */ 1424 static void 1425 netmap_vp_krings_delete(struct netmap_adapter *na) 1426 { 1427 nm_free_bdgfwd(na); 1428 netmap_krings_delete(na); 1429 } 1430 1431 1432 static int 1433 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, 1434 struct netmap_vp_adapter *na, u_int ring_nr); 1435 1436 1437 /* 1438 * main dispatch routine for the bridge. 1439 * Grab packets from a kring, move them into the ft structure 1440 * associated to the tx (input) port. Max one instance per port, 1441 * filtered on input (ioctl, poll or XXX). 1442 * Returns the next position in the ring. 
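 *
 * In short: acquire the bridge lock in shared mode (sleeping only if the
 * source port sets NAF_BDG_MAYSLEEP, trylock otherwise), record the slots
 * in [nr_hwcur, end) into the per-ring ft[] array keeping NS_MOREFRAG
 * chains together, and hand the batch to nm_bdg_flush() every
 * bridge_batch entries and once more before releasing the lock.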
1443 */ 1444 static int 1445 nm_bdg_preflush(struct netmap_kring *kring, u_int end) 1446 { 1447 struct netmap_vp_adapter *na = 1448 (struct netmap_vp_adapter*)kring->na; 1449 struct netmap_ring *ring = kring->ring; 1450 struct nm_bdg_fwd *ft; 1451 u_int ring_nr = kring->ring_id; 1452 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 1453 u_int ft_i = 0; /* start from 0 */ 1454 u_int frags = 1; /* how many frags ? */ 1455 struct nm_bridge *b = na->na_bdg; 1456 1457 /* To protect against modifications to the bridge we acquire a 1458 * shared lock, waiting if we can sleep (if the source port is 1459 * attached to a user process) or with a trylock otherwise (NICs). 1460 */ 1461 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 1462 if (na->up.na_flags & NAF_BDG_MAYSLEEP) 1463 BDG_RLOCK(b); 1464 else if (!BDG_RTRYLOCK(b)) 1465 return j; 1466 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 1467 ft = kring->nkr_ft; 1468 1469 for (; likely(j != end); j = nm_next(j, lim)) { 1470 struct netmap_slot *slot = &ring->slot[j]; 1471 char *buf; 1472 1473 ft[ft_i].ft_len = slot->len; 1474 ft[ft_i].ft_flags = slot->flags; 1475 1476 ND("flags is 0x%x", slot->flags); 1477 /* we do not use the buf changed flag, but we still need to reset it */ 1478 slot->flags &= ~NS_BUF_CHANGED; 1479 1480 /* this slot goes into a list so initialize the link field */ 1481 ft[ft_i].ft_next = NM_FT_NULL; 1482 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 1483 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot); 1484 if (unlikely(buf == NULL)) { 1485 RD(5, "NULL %s buffer pointer from %s slot %d len %d", 1486 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT", 1487 kring->name, j, ft[ft_i].ft_len); 1488 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up); 1489 ft[ft_i].ft_len = 0; 1490 ft[ft_i].ft_flags = 0; 1491 } 1492 __builtin_prefetch(buf); 1493 ++ft_i; 1494 if (slot->flags & NS_MOREFRAG) { 1495 frags++; 1496 continue; 1497 } 1498 if (unlikely(netmap_verbose && frags > 1)) 1499 RD(5, "%d frags at %d", frags, ft_i - frags); 1500 ft[ft_i - frags].ft_frags = frags; 1501 frags = 1; 1502 if (unlikely((int)ft_i >= bridge_batch)) 1503 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 1504 } 1505 if (frags > 1) { 1506 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we 1507 * have to fix frags count. */ 1508 frags--; 1509 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; 1510 ft[ft_i - frags].ft_frags = frags; 1511 D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags); 1512 } 1513 if (ft_i) 1514 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); 1515 BDG_RUNLOCK(b); 1516 return j; 1517 } 1518 1519 1520 /* ----- FreeBSD if_bridge hash function ------- */ 1521 1522 /* 1523 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 1524 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 
1525 * 1526 * http://www.burtleburtle.net/bob/hash/spooky.html 1527 */ 1528 #define mix(a, b, c) \ 1529 do { \ 1530 a -= b; a -= c; a ^= (c >> 13); \ 1531 b -= c; b -= a; b ^= (a << 8); \ 1532 c -= a; c -= b; c ^= (b >> 13); \ 1533 a -= b; a -= c; a ^= (c >> 12); \ 1534 b -= c; b -= a; b ^= (a << 16); \ 1535 c -= a; c -= b; c ^= (b >> 5); \ 1536 a -= b; a -= c; a ^= (c >> 3); \ 1537 b -= c; b -= a; b ^= (a << 10); \ 1538 c -= a; c -= b; c ^= (b >> 15); \ 1539 } while (/*CONSTCOND*/0) 1540 1541 1542 static __inline uint32_t 1543 nm_bridge_rthash(const uint8_t *addr) 1544 { 1545 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 1546 1547 b += addr[5] << 8; 1548 b += addr[4]; 1549 a += addr[3] << 24; 1550 a += addr[2] << 16; 1551 a += addr[1] << 8; 1552 a += addr[0]; 1553 1554 mix(a, b, c); 1555 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 1556 return (c & BRIDGE_RTHASH_MASK); 1557 } 1558 1559 #undef mix 1560 1561 1562 /* nm_register callback for VALE ports */ 1563 static int 1564 netmap_vp_reg(struct netmap_adapter *na, int onoff) 1565 { 1566 struct netmap_vp_adapter *vpna = 1567 (struct netmap_vp_adapter*)na; 1568 enum txrx t; 1569 int i; 1570 1571 /* persistent ports may be put in netmap mode 1572 * before being attached to a bridge 1573 */ 1574 if (vpna->na_bdg) 1575 BDG_WLOCK(vpna->na_bdg); 1576 if (onoff) { 1577 for_rx_tx(t) { 1578 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { 1579 struct netmap_kring *kring = &NMR(na, t)[i]; 1580 1581 if (nm_kring_pending_on(kring)) 1582 kring->nr_mode = NKR_NETMAP_ON; 1583 } 1584 } 1585 if (na->active_fds == 0) 1586 na->na_flags |= NAF_NETMAP_ON; 1587 /* XXX on FreeBSD, persistent VALE ports should also 1588 * toggle IFCAP_NETMAP in na->ifp (2014-03-16) 1589 */ 1590 } else { 1591 if (na->active_fds == 0) 1592 na->na_flags &= ~NAF_NETMAP_ON; 1593 for_rx_tx(t) { 1594 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { 1595 struct netmap_kring *kring = &NMR(na, t)[i]; 1596 1597 if (nm_kring_pending_off(kring)) 1598 kring->nr_mode = NKR_NETMAP_OFF; 1599 } 1600 } 1601 } 1602 if (vpna->na_bdg) 1603 BDG_WUNLOCK(vpna->na_bdg); 1604 return 0; 1605 } 1606 1607 1608 /* 1609 * Lookup function for a learning bridge. 
1610 * Update the hash table with the source address, 1611 * and then returns the destination port index, and the 1612 * ring in *dst_ring (at the moment, always use ring 0) 1613 */ 1614 u_int 1615 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, 1616 struct netmap_vp_adapter *na) 1617 { 1618 uint8_t *buf = ft->ft_buf; 1619 u_int buf_len = ft->ft_len; 1620 struct nm_hash_ent *ht = na->na_bdg->ht; 1621 uint32_t sh, dh; 1622 u_int dst, mysrc = na->bdg_port; 1623 uint64_t smac, dmac; 1624 uint8_t indbuf[12]; 1625 1626 /* safety check, unfortunately we have many cases */ 1627 if (buf_len >= 14 + na->up.virt_hdr_len) { 1628 /* virthdr + mac_hdr in the same slot */ 1629 buf += na->up.virt_hdr_len; 1630 buf_len -= na->up.virt_hdr_len; 1631 } else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) { 1632 /* only header in first fragment */ 1633 ft++; 1634 buf = ft->ft_buf; 1635 buf_len = ft->ft_len; 1636 } else { 1637 RD(5, "invalid buf format, length %d", buf_len); 1638 return NM_BDG_NOPORT; 1639 } 1640 1641 if (ft->ft_flags & NS_INDIRECT) { 1642 if (copyin(buf, indbuf, sizeof(indbuf))) { 1643 return NM_BDG_NOPORT; 1644 } 1645 buf = indbuf; 1646 } 1647 1648 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 1649 smac = le64toh(*(uint64_t *)(buf + 4)); 1650 smac >>= 16; 1651 1652 /* 1653 * The hash is somewhat expensive, there might be some 1654 * worthwhile optimizations here. 1655 */ 1656 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */ 1657 uint8_t *s = buf+6; 1658 sh = nm_bridge_rthash(s); // XXX hash of source 1659 /* update source port forwarding entry */ 1660 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */ 1661 ht[sh].ports = mysrc; 1662 if (netmap_verbose) 1663 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 1664 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 1665 } 1666 dst = NM_BDG_BROADCAST; 1667 if ((buf[0] & 1) == 0) { /* unicast */ 1668 dh = nm_bridge_rthash(buf); // XXX hash of dst 1669 if (ht[dh].mac == dmac) { /* found dst */ 1670 dst = ht[dh].ports; 1671 } 1672 /* XXX otherwise return NM_BDG_UNKNOWN ? */ 1673 } 1674 return dst; 1675 } 1676 1677 1678 /* 1679 * Available space in the ring. Only used in VALE code 1680 * and only with is_rx = 1 1681 */ 1682 static inline uint32_t 1683 nm_kr_space(struct netmap_kring *k, int is_rx) 1684 { 1685 int space; 1686 1687 if (is_rx) { 1688 int busy = k->nkr_hwlease - k->nr_hwcur; 1689 if (busy < 0) 1690 busy += k->nkr_num_slots; 1691 space = k->nkr_num_slots - 1 - busy; 1692 } else { 1693 /* XXX never used in this branch */ 1694 space = k->nr_hwtail - k->nkr_hwlease; 1695 if (space < 0) 1696 space += k->nkr_num_slots; 1697 } 1698 #if 0 1699 // sanity check 1700 if (k->nkr_hwlease >= k->nkr_num_slots || 1701 k->nr_hwcur >= k->nkr_num_slots || 1702 k->nr_tail >= k->nkr_num_slots || 1703 busy < 0 || 1704 busy >= k->nkr_num_slots) { 1705 D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 1706 k->nkr_lease_idx, k->nkr_num_slots); 1707 } 1708 #endif 1709 return space; 1710 } 1711 1712 1713 1714 1715 /* make a lease on the kring for N positions. 
return the 1716 * lease index 1717 * XXX only used in VALE code and with is_rx = 1 1718 */ 1719 static inline uint32_t 1720 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) 1721 { 1722 uint32_t lim = k->nkr_num_slots - 1; 1723 uint32_t lease_idx = k->nkr_lease_idx; 1724 1725 k->nkr_leases[lease_idx] = NR_NOSLOT; 1726 k->nkr_lease_idx = nm_next(lease_idx, lim); 1727 1728 if (n > nm_kr_space(k, is_rx)) { 1729 D("invalid request for %d slots", n); 1730 panic("x"); 1731 } 1732 /* XXX verify that there are n slots */ 1733 k->nkr_hwlease += n; 1734 if (k->nkr_hwlease > lim) 1735 k->nkr_hwlease -= lim + 1; 1736 1737 if (k->nkr_hwlease >= k->nkr_num_slots || 1738 k->nr_hwcur >= k->nkr_num_slots || 1739 k->nr_hwtail >= k->nkr_num_slots || 1740 k->nkr_lease_idx >= k->nkr_num_slots) { 1741 D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", 1742 k->na->name, 1743 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 1744 k->nkr_lease_idx, k->nkr_num_slots); 1745 } 1746 return lease_idx; 1747 } 1748 1749 /* 1750 * 1751 * This flush routine supports only unicast and broadcast but a large 1752 * number of ports, and lets us replace the learn and dispatch functions. 1753 */ 1754 int 1755 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, 1756 u_int ring_nr) 1757 { 1758 struct nm_bdg_q *dst_ents, *brddst; 1759 uint16_t num_dsts = 0, *dsts; 1760 struct nm_bridge *b = na->na_bdg; 1761 u_int i, me = na->bdg_port; 1762 1763 /* 1764 * The work area (pointed by ft) is followed by an array of 1765 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS 1766 * queues per port plus one for the broadcast traffic. 1767 * Then we have an array of destination indexes. 1768 */ 1769 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 1770 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 1771 1772 /* first pass: find a destination for each packet in the batch */ 1773 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 1774 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 1775 uint16_t dst_port, d_i; 1776 struct nm_bdg_q *d; 1777 1778 ND("slot %d frags %d", i, ft[i].ft_frags); 1779 /* Drop the packet if the virtio-net header is not into the first 1780 fragment nor at the very beginning of the second. */ 1781 if (unlikely(na->up.virt_hdr_len > ft[i].ft_len)) 1782 continue; 1783 dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na); 1784 if (netmap_verbose > 255) 1785 RD(5, "slot %d port %d -> %d", i, me, dst_port); 1786 if (dst_port == NM_BDG_NOPORT) 1787 continue; /* this packet is identified to be dropped */ 1788 else if (unlikely(dst_port > NM_BDG_MAXPORTS)) 1789 continue; 1790 else if (dst_port == NM_BDG_BROADCAST) 1791 dst_ring = 0; /* broadcasts always go to ring 0 */ 1792 else if (unlikely(dst_port == me || 1793 !b->bdg_ports[dst_port])) 1794 continue; 1795 1796 /* get a position in the scratch pad */ 1797 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 1798 d = dst_ents + d_i; 1799 1800 /* append the first fragment to the list */ 1801 if (d->bq_head == NM_FT_NULL) { /* new destination */ 1802 d->bq_head = d->bq_tail = i; 1803 /* remember this position to be scanned later */ 1804 if (dst_port != NM_BDG_BROADCAST) 1805 dsts[num_dsts++] = d_i; 1806 } else { 1807 ft[d->bq_tail].ft_next = i; 1808 d->bq_tail = i; 1809 } 1810 d->bq_len += ft[i].ft_frags; 1811 } 1812 1813 /* 1814 * Broadcast traffic goes to ring 0 on all destinations. 1815 * So we need to add these rings to the list of ports to scan. 
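	 * (Destination queues are indexed as d_i = port * NM_BDG_MAXRINGS +
	 * ring, so with the default NM_BDG_MAXRINGS of 16 a packet bound to
	 * ring 1 of port 3 is queued at dst_ents[49], while all broadcast
	 * packets share the single queue at
	 * dst_ents[NM_BDG_BROADCAST * NM_BDG_MAXRINGS].)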
1816 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is 1817 * expensive. We should keep a compact list of active destinations 1818 * so we could shorten this loop. 1819 */ 1820 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; 1821 if (brddst->bq_head != NM_FT_NULL) { 1822 u_int j; 1823 for (j = 0; likely(j < b->bdg_active_ports); j++) { 1824 uint16_t d_i; 1825 i = b->bdg_port_index[j]; 1826 if (unlikely(i == me)) 1827 continue; 1828 d_i = i * NM_BDG_MAXRINGS; 1829 if (dst_ents[d_i].bq_head == NM_FT_NULL) 1830 dsts[num_dsts++] = d_i; 1831 } 1832 } 1833 1834 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); 1835 /* second pass: scan destinations */ 1836 for (i = 0; i < num_dsts; i++) { 1837 struct netmap_vp_adapter *dst_na; 1838 struct netmap_kring *kring; 1839 struct netmap_ring *ring; 1840 u_int dst_nr, lim, j, d_i, next, brd_next; 1841 u_int needed, howmany; 1842 int retry = netmap_txsync_retry; 1843 struct nm_bdg_q *d; 1844 uint32_t my_start = 0, lease_idx = 0; 1845 int nrings; 1846 int virt_hdr_mismatch = 0; 1847 1848 d_i = dsts[i]; 1849 ND("second pass %d port %d", i, d_i); 1850 d = dst_ents + d_i; 1851 // XXX fix the division 1852 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; 1853 /* protect from the lookup function returning an inactive 1854 * destination port 1855 */ 1856 if (unlikely(dst_na == NULL)) 1857 goto cleanup; 1858 if (dst_na->up.na_flags & NAF_SW_ONLY) 1859 goto cleanup; 1860 /* 1861 * The interface may be in !netmap mode in two cases: 1862 * - when na is attached but not activated yet; 1863 * - when na is being deactivated but is still attached. 1864 */ 1865 if (unlikely(!nm_netmap_on(&dst_na->up))) { 1866 ND("not in netmap mode!"); 1867 goto cleanup; 1868 } 1869 1870 /* there is at least one either unicast or broadcast packet */ 1871 brd_next = brddst->bq_head; 1872 next = d->bq_head; 1873 /* we need to reserve this many slots. If fewer are 1874 * available, some packets will be dropped. 1875 * Packets may have multiple fragments, so we may not use 1876 * there is a chance that we may not use all of the slots 1877 * we have claimed, so we will need to handle the leftover 1878 * ones when we regain the lock. 1879 */ 1880 needed = d->bq_len + brddst->bq_len; 1881 1882 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) { 1883 if (netmap_verbose) { 1884 RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len, 1885 dst_na->up.virt_hdr_len); 1886 } 1887 /* There is a virtio-net header/offloadings mismatch between 1888 * source and destination. The slower mismatch datapath will 1889 * be used to cope with all the mismatches. 1890 */ 1891 virt_hdr_mismatch = 1; 1892 if (dst_na->mfs < na->mfs) { 1893 /* We may need to do segmentation offloadings, and so 1894 * we may need a number of destination slots greater 1895 * than the number of input slots ('needed'). 1896 * We look for the smallest integer 'x' which satisfies: 1897 * needed * na->mfs + x * H <= x * na->mfs 1898 * where 'H' is the length of the longest header that may 1899 * be replicated in the segmentation process (e.g. for 1900 * TCPv4 we must account for ethernet header, IP header 1901 * and TCPv4 header). 1902 */ 1903 needed = (needed * na->mfs) / 1904 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; 1905 ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); 1906 } 1907 } 1908 1909 ND(5, "pass 2 dst %d is %x %s", 1910 i, d_i, is_vp ? 
"virtual" : "nic/host"); 1911 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 1912 nrings = dst_na->up.num_rx_rings; 1913 if (dst_nr >= nrings) 1914 dst_nr = dst_nr % nrings; 1915 kring = &dst_na->up.rx_rings[dst_nr]; 1916 ring = kring->ring; 1917 lim = kring->nkr_num_slots - 1; 1918 1919 retry: 1920 1921 if (dst_na->retry && retry) { 1922 /* try to get some free slot from the previous run */ 1923 kring->nm_notify(kring, 0); 1924 /* actually useful only for bwraps, since there 1925 * the notify will trigger a txsync on the hwna. VALE ports 1926 * have dst_na->retry == 0 1927 */ 1928 } 1929 /* reserve the buffers in the queue and an entry 1930 * to report completion, and drop lock. 1931 * XXX this might become a helper function. 1932 */ 1933 mtx_lock(&kring->q_lock); 1934 if (kring->nkr_stopped) { 1935 mtx_unlock(&kring->q_lock); 1936 goto cleanup; 1937 } 1938 my_start = j = kring->nkr_hwlease; 1939 howmany = nm_kr_space(kring, 1); 1940 if (needed < howmany) 1941 howmany = needed; 1942 lease_idx = nm_kr_lease(kring, howmany, 1); 1943 mtx_unlock(&kring->q_lock); 1944 1945 /* only retry if we need more than available slots */ 1946 if (retry && needed <= howmany) 1947 retry = 0; 1948 1949 /* copy to the destination queue */ 1950 while (howmany > 0) { 1951 struct netmap_slot *slot; 1952 struct nm_bdg_fwd *ft_p, *ft_end; 1953 u_int cnt; 1954 1955 /* find the queue from which we pick next packet. 1956 * NM_FT_NULL is always higher than valid indexes 1957 * so we never dereference it if the other list 1958 * has packets (and if both are empty we never 1959 * get here). 1960 */ 1961 if (next < brd_next) { 1962 ft_p = ft + next; 1963 next = ft_p->ft_next; 1964 } else { /* insert broadcast */ 1965 ft_p = ft + brd_next; 1966 brd_next = ft_p->ft_next; 1967 } 1968 cnt = ft_p->ft_frags; // cnt > 0 1969 if (unlikely(cnt > howmany)) 1970 break; /* no more space */ 1971 if (netmap_verbose && cnt > 1) 1972 RD(5, "rx %d frags to %d", cnt, j); 1973 ft_end = ft_p + cnt; 1974 if (unlikely(virt_hdr_mismatch)) { 1975 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); 1976 } else { 1977 howmany -= cnt; 1978 do { 1979 char *dst, *src = ft_p->ft_buf; 1980 size_t copy_len = ft_p->ft_len, dst_len = copy_len; 1981 1982 slot = &ring->slot[j]; 1983 dst = NMB(&dst_na->up, slot); 1984 1985 ND("send [%d] %d(%d) bytes at %s:%d", 1986 i, (int)copy_len, (int)dst_len, 1987 NM_IFPNAME(dst_ifp), j); 1988 /* round to a multiple of 64 */ 1989 copy_len = (copy_len + 63) & ~63; 1990 1991 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) || 1992 copy_len > NETMAP_BUF_SIZE(&na->up))) { 1993 RD(5, "invalid len %d, down to 64", (int)copy_len); 1994 copy_len = dst_len = 64; // XXX 1995 } 1996 if (ft_p->ft_flags & NS_INDIRECT) { 1997 if (copyin(src, dst, copy_len)) { 1998 // invalid user pointer, pretend len is 0 1999 dst_len = 0; 2000 } 2001 } else { 2002 //memcpy(dst, src, copy_len); 2003 pkt_copy(src, dst, (int)copy_len); 2004 } 2005 slot->len = dst_len; 2006 slot->flags = (cnt << 8)| NS_MOREFRAG; 2007 j = nm_next(j, lim); 2008 needed--; 2009 ft_p++; 2010 } while (ft_p != ft_end); 2011 slot->flags = (cnt << 8); /* clear flag on last entry */ 2012 } 2013 /* are we done ? */ 2014 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 2015 break; 2016 } 2017 { 2018 /* current position */ 2019 uint32_t *p = kring->nkr_leases; /* shorthand */ 2020 uint32_t update_pos; 2021 int still_locked = 1; 2022 2023 mtx_lock(&kring->q_lock); 2024 if (unlikely(howmany > 0)) { 2025 /* not used all bufs. 
If i am the last one 2026 * i can recover the slots, otherwise must 2027 * fill them with 0 to mark empty packets. 2028 */ 2029 ND("leftover %d bufs", howmany); 2030 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { 2031 /* yes i am the last one */ 2032 ND("roll back nkr_hwlease to %d", j); 2033 kring->nkr_hwlease = j; 2034 } else { 2035 while (howmany-- > 0) { 2036 ring->slot[j].len = 0; 2037 ring->slot[j].flags = 0; 2038 j = nm_next(j, lim); 2039 } 2040 } 2041 } 2042 p[lease_idx] = j; /* report I am done */ 2043 2044 update_pos = kring->nr_hwtail; 2045 2046 if (my_start == update_pos) { 2047 /* all slots before my_start have been reported, 2048 * so scan subsequent leases to see if other ranges 2049 * have been completed, and to a selwakeup or txsync. 2050 */ 2051 while (lease_idx != kring->nkr_lease_idx && 2052 p[lease_idx] != NR_NOSLOT) { 2053 j = p[lease_idx]; 2054 p[lease_idx] = NR_NOSLOT; 2055 lease_idx = nm_next(lease_idx, lim); 2056 } 2057 /* j is the new 'write' position. j != my_start 2058 * means there are new buffers to report 2059 */ 2060 if (likely(j != my_start)) { 2061 kring->nr_hwtail = j; 2062 still_locked = 0; 2063 mtx_unlock(&kring->q_lock); 2064 kring->nm_notify(kring, 0); 2065 /* this is netmap_notify for VALE ports and 2066 * netmap_bwrap_notify for bwrap. The latter will 2067 * trigger a txsync on the underlying hwna 2068 */ 2069 if (dst_na->retry && retry--) { 2070 /* XXX this is going to call nm_notify again. 2071 * Only useful for bwrap in virtual machines 2072 */ 2073 goto retry; 2074 } 2075 } 2076 } 2077 if (still_locked) 2078 mtx_unlock(&kring->q_lock); 2079 } 2080 cleanup: 2081 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ 2082 d->bq_len = 0; 2083 } 2084 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ 2085 brddst->bq_len = 0; 2086 return 0; 2087 } 2088 2089 /* nm_txsync callback for VALE ports */ 2090 static int 2091 netmap_vp_txsync(struct netmap_kring *kring, int flags) 2092 { 2093 struct netmap_vp_adapter *na = 2094 (struct netmap_vp_adapter *)kring->na; 2095 u_int done; 2096 u_int const lim = kring->nkr_num_slots - 1; 2097 u_int const head = kring->rhead; 2098 2099 if (bridge_batch <= 0) { /* testing only */ 2100 done = head; // used all 2101 goto done; 2102 } 2103 if (!na->na_bdg) { 2104 done = head; 2105 goto done; 2106 } 2107 if (bridge_batch > NM_BDG_BATCH) 2108 bridge_batch = NM_BDG_BATCH; 2109 2110 done = nm_bdg_preflush(kring, head); 2111 done: 2112 if (done != head) 2113 D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail); 2114 /* 2115 * packets between 'done' and 'cur' are left unsent. 2116 */ 2117 kring->nr_hwcur = done; 2118 kring->nr_hwtail = nm_prev(done, lim); 2119 if (netmap_verbose) 2120 D("%s ring %d flags %d", na->up.name, kring->ring_id, flags); 2121 return 0; 2122 } 2123 2124 2125 /* rxsync code used by VALE ports nm_rxsync callback and also 2126 * internally by the brwap 2127 */ 2128 static int 2129 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags) 2130 { 2131 struct netmap_adapter *na = kring->na; 2132 struct netmap_ring *ring = kring->ring; 2133 u_int nm_i, lim = kring->nkr_num_slots - 1; 2134 u_int head = kring->rhead; 2135 int n; 2136 2137 if (head > lim) { 2138 D("ouch dangerous reset!!!"); 2139 n = netmap_ring_reinit(kring); 2140 goto done; 2141 } 2142 2143 /* First part, import newly received packets. */ 2144 /* actually nothing to do here, they are already in the kring */ 2145 2146 /* Second part, skip past packets that userspace has released. 
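 * ("Released" means the slots between nr_hwcur and rhead have already
 * been consumed by the user process. On a VALE port nr_hwtail is
 * advanced by the senders in nm_bdg_flush(), so here we only
 * sanity-check the buffer indexes, clear NS_BUF_CHANGED and move
 * nr_hwcur up to rhead.)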
*/ 2147 nm_i = kring->nr_hwcur; 2148 if (nm_i != head) { 2149 /* consistency check, but nothing really important here */ 2150 for (n = 0; likely(nm_i != head); n++) { 2151 struct netmap_slot *slot = &ring->slot[nm_i]; 2152 void *addr = NMB(na, slot); 2153 2154 if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */ 2155 D("bad buffer index %d, ignore ?", 2156 slot->buf_idx); 2157 } 2158 slot->flags &= ~NS_BUF_CHANGED; 2159 nm_i = nm_next(nm_i, lim); 2160 } 2161 kring->nr_hwcur = head; 2162 } 2163 2164 n = 0; 2165 done: 2166 return n; 2167 } 2168 2169 /* 2170 * nm_rxsync callback for VALE ports 2171 * user process reading from a VALE switch. 2172 * Already protected against concurrent calls from userspace, 2173 * but we must acquire the queue's lock to protect against 2174 * writers on the same queue. 2175 */ 2176 static int 2177 netmap_vp_rxsync(struct netmap_kring *kring, int flags) 2178 { 2179 int n; 2180 2181 mtx_lock(&kring->q_lock); 2182 n = netmap_vp_rxsync_locked(kring, flags); 2183 mtx_unlock(&kring->q_lock); 2184 return n; 2185 } 2186 2187 2188 /* nm_bdg_attach callback for VALE ports 2189 * The na_vp port is this same netmap_adapter. There is no host port. 2190 */ 2191 static int 2192 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na) 2193 { 2194 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; 2195 2196 if (vpna->na_bdg) 2197 return EBUSY; 2198 na->na_vp = vpna; 2199 strncpy(na->name, name, sizeof(na->name)); 2200 na->na_hostvp = NULL; 2201 return 0; 2202 } 2203 2204 /* create a netmap_vp_adapter that describes a VALE port. 2205 * Only persistent VALE ports have a non-null ifp. 2206 */ 2207 static int 2208 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, 2209 struct netmap_mem_d *nmd, 2210 struct netmap_vp_adapter **ret) 2211 { 2212 struct netmap_vp_adapter *vpna; 2213 struct netmap_adapter *na; 2214 int error = 0; 2215 u_int npipes = 0; 2216 2217 vpna = nm_os_malloc(sizeof(*vpna)); 2218 if (vpna == NULL) 2219 return ENOMEM; 2220 2221 na = &vpna->up; 2222 2223 na->ifp = ifp; 2224 strncpy(na->name, nmr->nr_name, sizeof(na->name)); 2225 2226 /* bound checking */ 2227 na->num_tx_rings = nmr->nr_tx_rings; 2228 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 2229 nmr->nr_tx_rings = na->num_tx_rings; // write back 2230 na->num_rx_rings = nmr->nr_rx_rings; 2231 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 2232 nmr->nr_rx_rings = na->num_rx_rings; // write back 2233 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, 2234 1, NM_BDG_MAXSLOTS, NULL); 2235 na->num_tx_desc = nmr->nr_tx_slots; 2236 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 2237 1, NM_BDG_MAXSLOTS, NULL); 2238 /* validate number of pipes. We want at least 1, 2239 * but probably can do with some more. 2240 * So let's use 2 as default (when 0 is supplied) 2241 */ 2242 npipes = nmr->nr_arg1; 2243 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); 2244 nmr->nr_arg1 = npipes; /* write back */ 2245 /* validate extra bufs */ 2246 nm_bound_var(&nmr->nr_arg3, 0, 0, 2247 128*NM_BDG_MAXSLOTS, NULL); 2248 na->num_rx_desc = nmr->nr_rx_slots; 2249 vpna->mfs = 1514; 2250 vpna->last_smac = ~0llu; 2251 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 
2252 vpna->mfs = netmap_buf_size; */ 2253 if (netmap_verbose) 2254 D("max frame size %u", vpna->mfs); 2255 2256 na->na_flags |= NAF_BDG_MAYSLEEP; 2257 /* persistent VALE ports look like hw devices 2258 * with a native netmap adapter 2259 */ 2260 if (ifp) 2261 na->na_flags |= NAF_NATIVE; 2262 na->nm_txsync = netmap_vp_txsync; 2263 na->nm_rxsync = netmap_vp_rxsync; 2264 na->nm_register = netmap_vp_reg; 2265 na->nm_krings_create = netmap_vp_krings_create; 2266 na->nm_krings_delete = netmap_vp_krings_delete; 2267 na->nm_dtor = netmap_vp_dtor; 2268 D("nr_arg2 %d", nmr->nr_arg2); 2269 na->nm_mem = nmd ? 2270 netmap_mem_get(nmd): 2271 netmap_mem_private_new( 2272 na->num_tx_rings, na->num_tx_desc, 2273 na->num_rx_rings, na->num_rx_desc, 2274 nmr->nr_arg3, npipes, &error); 2275 if (na->nm_mem == NULL) 2276 goto err; 2277 na->nm_bdg_attach = netmap_vp_bdg_attach; 2278 /* other nmd fields are set in the common routine */ 2279 error = netmap_attach_common(na); 2280 if (error) 2281 goto err; 2282 *ret = vpna; 2283 return 0; 2284 2285 err: 2286 if (na->nm_mem != NULL) 2287 netmap_mem_put(na->nm_mem); 2288 nm_os_free(vpna); 2289 return error; 2290 } 2291 2292 /* Bridge wrapper code (bwrap). 2293 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a 2294 * VALE switch. 2295 * The main task is to swap the meaning of tx and rx rings to match the 2296 * expectations of the VALE switch code (see nm_bdg_flush). 2297 * 2298 * The bwrap works by interposing a netmap_bwrap_adapter between the 2299 * rest of the system and the hwna. The netmap_bwrap_adapter looks like 2300 * a netmap_vp_adapter to the rest the system, but, internally, it 2301 * translates all callbacks to what the hwna expects. 2302 * 2303 * Note that we have to intercept callbacks coming from two sides: 2304 * 2305 * - callbacks coming from the netmap module are intercepted by 2306 * passing around the netmap_bwrap_adapter instead of the hwna 2307 * 2308 * - callbacks coming from outside of the netmap module only know 2309 * about the hwna. This, however, only happens in interrupt 2310 * handlers, where only the hwna->nm_notify callback is called. 2311 * What the bwrap does is to overwrite the hwna->nm_notify callback 2312 * with its own netmap_bwrap_intr_notify. 2313 * XXX This assumes that the hwna->nm_notify callback was the 2314 * standard netmap_notify(), as it is the case for nic adapters. 2315 * Any additional action performed by hwna->nm_notify will not be 2316 * performed by netmap_bwrap_intr_notify. 2317 * 2318 * Additionally, the bwrap can optionally attach the host rings pair 2319 * of the wrapped adapter to a different port of the switch. 2320 */ 2321 2322 2323 static void 2324 netmap_bwrap_dtor(struct netmap_adapter *na) 2325 { 2326 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; 2327 struct netmap_adapter *hwna = bna->hwna; 2328 struct nm_bridge *b = bna->up.na_bdg, 2329 *bh = bna->host.na_bdg; 2330 2331 netmap_mem_put(bna->host.up.nm_mem); 2332 2333 if (b) { 2334 netmap_bdg_detach_common(b, bna->up.bdg_port, 2335 (bh ? bna->host.bdg_port : -1)); 2336 } 2337 2338 ND("na %p", na); 2339 na->ifp = NULL; 2340 bna->host.up.ifp = NULL; 2341 hwna->na_private = NULL; 2342 hwna->na_vp = hwna->na_hostvp = NULL; 2343 hwna->na_flags &= ~NAF_BUSY; 2344 netmap_adapter_put(hwna); 2345 2346 } 2347 2348 2349 /* 2350 * Intr callback for NICs connected to a bridge. 2351 * Simply ignore tx interrupts (maybe we could try to recover space ?) 2352 * and pass received packets from nic to the bridge. 
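 * In outline (a summary of the code below, kept here for reference):
 *  1) rxsync on the hwna ring to fetch the packets that just arrived;
 *  2) advance rhead/rcur of the paired VALE tx kring to nr_hwtail and
 *     call netmap_vp_txsync(), so that nm_bdg_flush() forwards them;
 *  3) mark all buffers as released and rxsync again so the NIC can
 *     reuse them; if nr_hwtail moved further in the meantime, return
 *     NM_IRQ_RESCHED to be called again.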
2353 * 2354 * XXX TODO check locking: this is called from the interrupt 2355 * handler so we should make sure that the interface is not 2356 * disconnected while passing down an interrupt. 2357 * 2358 * Note, no user process can access this NIC or the host stack. 2359 * The only part of the ring that is significant are the slots, 2360 * and head/cur/tail are set from the kring as needed 2361 * (part as a receive ring, part as a transmit ring). 2362 * 2363 * callback that overwrites the hwna notify callback. 2364 * Packets come from the outside or from the host stack and are put on an 2365 * hwna rx ring. 2366 * The bridge wrapper then sends the packets through the bridge. 2367 */ 2368 static int 2369 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) 2370 { 2371 struct netmap_adapter *na = kring->na; 2372 struct netmap_bwrap_adapter *bna = na->na_private; 2373 struct netmap_kring *bkring; 2374 struct netmap_vp_adapter *vpna = &bna->up; 2375 u_int ring_nr = kring->ring_id; 2376 int ret = NM_IRQ_COMPLETED; 2377 int error; 2378 2379 if (netmap_verbose) 2380 D("%s %s 0x%x", na->name, kring->name, flags); 2381 2382 bkring = &vpna->up.tx_rings[ring_nr]; 2383 2384 /* make sure the ring is not disabled */ 2385 if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) { 2386 return EIO; 2387 } 2388 2389 if (netmap_verbose) 2390 D("%s head %d cur %d tail %d", na->name, 2391 kring->rhead, kring->rcur, kring->rtail); 2392 2393 /* simulate a user wakeup on the rx ring 2394 * fetch packets that have arrived. 2395 */ 2396 error = kring->nm_sync(kring, 0); 2397 if (error) 2398 goto put_out; 2399 if (kring->nr_hwcur == kring->nr_hwtail) { 2400 if (netmap_verbose) 2401 D("how strange, interrupt with no packets on %s", 2402 na->name); 2403 goto put_out; 2404 } 2405 2406 /* new packets are kring->rcur to kring->nr_hwtail, and the bkring 2407 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail 2408 * to push all packets out. 2409 */ 2410 bkring->rhead = bkring->rcur = kring->nr_hwtail; 2411 2412 netmap_vp_txsync(bkring, flags); 2413 2414 /* mark all buffers as released on this ring */ 2415 kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail; 2416 /* another call to actually release the buffers */ 2417 error = kring->nm_sync(kring, 0); 2418 2419 /* The second rxsync may have further advanced hwtail. If this happens, 2420 * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */ 2421 if (kring->rcur != kring->nr_hwtail) { 2422 ret = NM_IRQ_RESCHED; 2423 } 2424 put_out: 2425 nm_kr_put(kring); 2426 2427 return error ? error : ret; 2428 } 2429 2430 2431 /* nm_register callback for bwrap */ 2432 static int 2433 netmap_bwrap_reg(struct netmap_adapter *na, int onoff) 2434 { 2435 struct netmap_bwrap_adapter *bna = 2436 (struct netmap_bwrap_adapter *)na; 2437 struct netmap_adapter *hwna = bna->hwna; 2438 struct netmap_vp_adapter *hostna = &bna->host; 2439 int error, i; 2440 enum txrx t; 2441 2442 ND("%s %s", na->name, onoff ? "on" : "off"); 2443 2444 if (onoff) { 2445 /* netmap_do_regif has been called on the bwrap na. 
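 * (so the bwrap na already has a valid na_lut at this point)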
2446 * We need to pass the information about the 2447 * memory allocator down to the hwna before 2448 * putting it in netmap mode 2449 */ 2450 hwna->na_lut = na->na_lut; 2451 2452 if (hostna->na_bdg) { 2453 /* if the host rings have been attached to switch, 2454 * we need to copy the memory allocator information 2455 * in the hostna also 2456 */ 2457 hostna->up.na_lut = na->na_lut; 2458 } 2459 2460 /* cross-link the netmap rings 2461 * The original number of rings comes from hwna, 2462 * rx rings on one side equals tx rings on the other. 2463 */ 2464 for_rx_tx(t) { 2465 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ 2466 for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { 2467 NMR(hwna, r)[i].ring = NMR(na, t)[i].ring; 2468 } 2469 } 2470 2471 if (na->na_flags & NAF_HOST_RINGS) { 2472 struct netmap_adapter *hna = &hostna->up; 2473 /* the hostna rings are the host rings of the bwrap. 2474 * The corresponding krings must point back to the 2475 * hostna 2476 */ 2477 hna->tx_rings = &na->tx_rings[na->num_tx_rings]; 2478 hna->tx_rings[0].na = hna; 2479 hna->rx_rings = &na->rx_rings[na->num_rx_rings]; 2480 hna->rx_rings[0].na = hna; 2481 } 2482 } 2483 2484 /* pass down the pending ring state information */ 2485 for_rx_tx(t) { 2486 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) 2487 NMR(hwna, t)[i].nr_pending_mode = 2488 NMR(na, t)[i].nr_pending_mode; 2489 } 2490 2491 /* forward the request to the hwna */ 2492 error = hwna->nm_register(hwna, onoff); 2493 if (error) 2494 return error; 2495 2496 /* copy up the current ring state information */ 2497 for_rx_tx(t) { 2498 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) 2499 NMR(na, t)[i].nr_mode = 2500 NMR(hwna, t)[i].nr_mode; 2501 } 2502 2503 /* impersonate a netmap_vp_adapter */ 2504 netmap_vp_reg(na, onoff); 2505 if (hostna->na_bdg) 2506 netmap_vp_reg(&hostna->up, onoff); 2507 2508 if (onoff) { 2509 u_int i; 2510 /* intercept the hwna nm_nofify callback on the hw rings */ 2511 for (i = 0; i < hwna->num_rx_rings; i++) { 2512 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify; 2513 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify; 2514 } 2515 i = hwna->num_rx_rings; /* for safety */ 2516 /* save the host ring notify unconditionally */ 2517 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify; 2518 if (hostna->na_bdg) { 2519 /* also intercept the host ring notify */ 2520 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify; 2521 } 2522 if (na->active_fds == 0) 2523 na->na_flags |= NAF_NETMAP_ON; 2524 } else { 2525 u_int i; 2526 2527 if (na->active_fds == 0) 2528 na->na_flags &= ~NAF_NETMAP_ON; 2529 2530 /* reset all notify callbacks (including host ring) */ 2531 for (i = 0; i <= hwna->num_rx_rings; i++) { 2532 hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify; 2533 hwna->rx_rings[i].save_notify = NULL; 2534 } 2535 hwna->na_lut.lut = NULL; 2536 hwna->na_lut.objtotal = 0; 2537 hwna->na_lut.objsize = 0; 2538 } 2539 2540 return 0; 2541 } 2542 2543 /* nm_config callback for bwrap */ 2544 static int 2545 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, 2546 u_int *rxr, u_int *rxd) 2547 { 2548 struct netmap_bwrap_adapter *bna = 2549 (struct netmap_bwrap_adapter *)na; 2550 struct netmap_adapter *hwna = bna->hwna; 2551 2552 /* forward the request */ 2553 netmap_update_config(hwna); 2554 /* swap the results */ 2555 *txr = hwna->num_rx_rings; 2556 *txd = hwna->num_rx_desc; 2557 *rxr = hwna->num_tx_rings; 2558 *rxd = hwna->num_rx_desc; 2559 2560 return 0; 2561 } 2562 2563 2564 /* nm_krings_create 
callback for bwrap */ 2565 static int 2566 netmap_bwrap_krings_create(struct netmap_adapter *na) 2567 { 2568 struct netmap_bwrap_adapter *bna = 2569 (struct netmap_bwrap_adapter *)na; 2570 struct netmap_adapter *hwna = bna->hwna; 2571 int i, error = 0; 2572 enum txrx t; 2573 2574 ND("%s", na->name); 2575 2576 /* impersonate a netmap_vp_adapter */ 2577 error = netmap_vp_krings_create(na); 2578 if (error) 2579 return error; 2580 2581 /* also create the hwna krings */ 2582 error = hwna->nm_krings_create(hwna); 2583 if (error) { 2584 goto err_del_vp_rings; 2585 } 2586 2587 /* get each ring slot number from the corresponding hwna ring */ 2588 for_rx_tx(t) { 2589 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ 2590 for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { 2591 NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots; 2592 } 2593 } 2594 2595 return 0; 2596 2597 err_del_vp_rings: 2598 netmap_vp_krings_delete(na); 2599 2600 return error; 2601 } 2602 2603 2604 static void 2605 netmap_bwrap_krings_delete(struct netmap_adapter *na) 2606 { 2607 struct netmap_bwrap_adapter *bna = 2608 (struct netmap_bwrap_adapter *)na; 2609 struct netmap_adapter *hwna = bna->hwna; 2610 2611 ND("%s", na->name); 2612 2613 hwna->nm_krings_delete(hwna); 2614 netmap_vp_krings_delete(na); 2615 } 2616 2617 2618 /* notify method for the bridge-->hwna direction */ 2619 static int 2620 netmap_bwrap_notify(struct netmap_kring *kring, int flags) 2621 { 2622 struct netmap_adapter *na = kring->na; 2623 struct netmap_bwrap_adapter *bna = na->na_private; 2624 struct netmap_adapter *hwna = bna->hwna; 2625 u_int ring_n = kring->ring_id; 2626 u_int lim = kring->nkr_num_slots - 1; 2627 struct netmap_kring *hw_kring; 2628 int error; 2629 2630 ND("%s: na %s hwna %s", 2631 (kring ? kring->name : "NULL!"), 2632 (na ? na->name : "NULL!"), 2633 (hwna ? hwna->name : "NULL!")); 2634 hw_kring = &hwna->tx_rings[ring_n]; 2635 2636 if (nm_kr_tryget(hw_kring, 0, NULL)) { 2637 return ENXIO; 2638 } 2639 2640 /* first step: simulate a user wakeup on the rx ring */ 2641 netmap_vp_rxsync(kring, flags); 2642 ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", 2643 na->name, ring_n, 2644 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, 2645 ring->head, ring->cur, ring->tail, 2646 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail); 2647 /* second step: the new packets are sent on the tx ring 2648 * (which is actually the same ring) 2649 */ 2650 hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail; 2651 error = hw_kring->nm_sync(hw_kring, flags); 2652 if (error) 2653 goto put_out; 2654 2655 /* third step: now we are back the rx ring */ 2656 /* claim ownership on all hw owned bufs */ 2657 kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */ 2658 2659 /* fourth step: the user goes to sleep again, causing another rxsync */ 2660 netmap_vp_rxsync(kring, flags); 2661 ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", 2662 na->name, ring_n, 2663 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, 2664 ring->head, ring->cur, ring->tail, 2665 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); 2666 put_out: 2667 nm_kr_put(hw_kring); 2668 2669 return error ? error : NM_IRQ_COMPLETED; 2670 } 2671 2672 2673 /* nm_bdg_ctl callback for the bwrap. 2674 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd]. 2675 * On attach, it needs to provide a fake netmap_priv_d structure and 2676 * perform a netmap_do_regif() on the bwrap. 
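 * (Such requests typically come from the vale-ctl utility, e.g.
 * "vale-ctl -a vale0:em0" to attach a NIC to switch vale0 and
 * "vale-ctl -d vale0:em0" to detach it, with -h also attaching the
 * host rings; the switch and interface names are only illustrative.)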
This will put both the 2677 * bwrap and the hwna in netmap mode, with the netmap rings shared 2678 * and cross linked. Moroever, it will start intercepting interrupts 2679 * directed to hwna. 2680 */ 2681 static int 2682 netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach) 2683 { 2684 struct netmap_priv_d *npriv; 2685 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; 2686 int error = 0; 2687 2688 if (attach) { 2689 if (NETMAP_OWNED_BY_ANY(na)) { 2690 return EBUSY; 2691 } 2692 if (bna->na_kpriv) { 2693 /* nothing to do */ 2694 return 0; 2695 } 2696 npriv = netmap_priv_new(); 2697 if (npriv == NULL) 2698 return ENOMEM; 2699 npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */ 2700 error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW); 2701 if (error) { 2702 netmap_priv_delete(npriv); 2703 return error; 2704 } 2705 bna->na_kpriv = npriv; 2706 na->na_flags |= NAF_BUSY; 2707 } else { 2708 if (na->active_fds == 0) /* not registered */ 2709 return EINVAL; 2710 netmap_priv_delete(bna->na_kpriv); 2711 bna->na_kpriv = NULL; 2712 na->na_flags &= ~NAF_BUSY; 2713 } 2714 return error; 2715 2716 } 2717 2718 /* attach a bridge wrapper to the 'real' device */ 2719 int 2720 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) 2721 { 2722 struct netmap_bwrap_adapter *bna; 2723 struct netmap_adapter *na = NULL; 2724 struct netmap_adapter *hostna = NULL; 2725 int error = 0; 2726 enum txrx t; 2727 2728 /* make sure the NIC is not already in use */ 2729 if (NETMAP_OWNED_BY_ANY(hwna)) { 2730 D("NIC %s busy, cannot attach to bridge", hwna->name); 2731 return EBUSY; 2732 } 2733 2734 bna = nm_os_malloc(sizeof(*bna)); 2735 if (bna == NULL) { 2736 return ENOMEM; 2737 } 2738 2739 na = &bna->up.up; 2740 /* make bwrap ifp point to the real ifp */ 2741 na->ifp = hwna->ifp; 2742 if_ref(na->ifp); 2743 na->na_private = bna; 2744 strncpy(na->name, nr_name, sizeof(na->name)); 2745 /* fill the ring data for the bwrap adapter with rx/tx meanings 2746 * swapped. The real cross-linking will be done during register, 2747 * when all the krings will have been created. 
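 * For instance (numbers purely illustrative): a hwna with 4 tx and
 * 2 rx rings is exposed by the bwrap as a port with 2 tx and 4 rx
 * rings, so that frames the switch forwards to this port end up on
 * the hwna tx side, while frames received by the NIC enter the
 * switch as tx traffic of the bwrap port.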
2748 */ 2749 for_rx_tx(t) { 2750 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ 2751 nma_set_nrings(na, t, nma_get_nrings(hwna, r)); 2752 nma_set_ndesc(na, t, nma_get_ndesc(hwna, r)); 2753 } 2754 na->nm_dtor = netmap_bwrap_dtor; 2755 na->nm_register = netmap_bwrap_reg; 2756 // na->nm_txsync = netmap_bwrap_txsync; 2757 // na->nm_rxsync = netmap_bwrap_rxsync; 2758 na->nm_config = netmap_bwrap_config; 2759 na->nm_krings_create = netmap_bwrap_krings_create; 2760 na->nm_krings_delete = netmap_bwrap_krings_delete; 2761 na->nm_notify = netmap_bwrap_notify; 2762 na->nm_bdg_ctl = netmap_bwrap_bdg_ctl; 2763 na->pdev = hwna->pdev; 2764 na->nm_mem = netmap_mem_get(hwna->nm_mem); 2765 na->virt_hdr_len = hwna->virt_hdr_len; 2766 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ 2767 2768 bna->hwna = hwna; 2769 netmap_adapter_get(hwna); 2770 hwna->na_private = bna; /* weak reference */ 2771 hwna->na_vp = &bna->up; 2772 2773 if (hwna->na_flags & NAF_HOST_RINGS) { 2774 if (hwna->na_flags & NAF_SW_ONLY) 2775 na->na_flags |= NAF_SW_ONLY; 2776 na->na_flags |= NAF_HOST_RINGS; 2777 hostna = &bna->host.up; 2778 snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name); 2779 hostna->ifp = hwna->ifp; 2780 for_rx_tx(t) { 2781 enum txrx r = nm_txrx_swap(t); 2782 nma_set_nrings(hostna, t, 1); 2783 nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r)); 2784 } 2785 // hostna->nm_txsync = netmap_bwrap_host_txsync; 2786 // hostna->nm_rxsync = netmap_bwrap_host_rxsync; 2787 hostna->nm_notify = netmap_bwrap_notify; 2788 hostna->nm_mem = netmap_mem_get(na->nm_mem); 2789 hostna->na_private = bna; 2790 hostna->na_vp = &bna->up; 2791 na->na_hostvp = hwna->na_hostvp = 2792 hostna->na_hostvp = &bna->host; 2793 hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */ 2794 } 2795 2796 ND("%s<->%s txr %d txd %d rxr %d rxd %d", 2797 na->name, ifp->if_xname, 2798 na->num_tx_rings, na->num_tx_desc, 2799 na->num_rx_rings, na->num_rx_desc); 2800 2801 error = netmap_attach_common(na); 2802 if (error) { 2803 goto err_free; 2804 } 2805 hwna->na_flags |= NAF_BUSY; 2806 return 0; 2807 2808 err_free: 2809 hwna->na_vp = hwna->na_hostvp = NULL; 2810 netmap_adapter_put(hwna); 2811 nm_os_free(bna); 2812 return error; 2813 2814 } 2815 2816 struct nm_bridge * 2817 netmap_init_bridges2(u_int n) 2818 { 2819 int i; 2820 struct nm_bridge *b; 2821 2822 b = nm_os_malloc(sizeof(struct nm_bridge) * n); 2823 if (b == NULL) 2824 return NULL; 2825 for (i = 0; i < n; i++) 2826 BDG_RWINIT(&b[i]); 2827 return b; 2828 } 2829 2830 void 2831 netmap_uninit_bridges2(struct nm_bridge *b, u_int n) 2832 { 2833 int i; 2834 2835 if (b == NULL) 2836 return; 2837 2838 for (i = 0; i < n; i++) 2839 BDG_RWDESTROY(&b[i]); 2840 nm_os_free(b); 2841 } 2842 2843 int 2844 netmap_init_bridges(void) 2845 { 2846 #ifdef CONFIG_NET_NS 2847 return netmap_bns_register(); 2848 #else 2849 nm_bridges = netmap_init_bridges2(NM_BRIDGES); 2850 if (nm_bridges == NULL) 2851 return ENOMEM; 2852 return 0; 2853 #endif 2854 } 2855 2856 void 2857 netmap_uninit_bridges(void) 2858 { 2859 #ifdef CONFIG_NET_NS 2860 netmap_bns_unregister(); 2861 #else 2862 netmap_uninit_bridges2(nm_bridges, NM_BRIDGES); 2863 #endif 2864 } 2865 #endif /* WITH_VALE */ 2866
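
/*
 * Illustrative sketch (not part of the build): the destination-slot
 * estimate used in the virt_hdr_mismatch path of nm_bdg_flush() above,
 * written as a stand-alone helper together with a worked example.
 * The helper name and the sample numbers are made up for illustration
 * only.
 */
#if 0
static u_int
example_gso_slot_estimate(u_int needed, u_int src_mfs, u_int dst_mfs,
	u_int worst_hdr)
{
	/* In the worst case every source slot carries src_mfs bytes and
	 * every destination slot loses worst_hdr bytes to a replicated
	 * header, so we need the smallest x with
	 *	needed * src_mfs <= x * (dst_mfs - worst_hdr)
	 * The trailing "+ 1" compensates for the truncating division.
	 */
	return (needed * src_mfs) / (dst_mfs - worst_hdr) + 1;
}

/* Example: needed = 10 slots, src_mfs = 65535 (GSO-sized frames),
 * dst_mfs = 1514, worst_hdr = 54 (ethernet + IPv4 + TCP headers):
 *	(10 * 65535) / (1514 - 54) + 1 = 449 destination slots.
 */
#endif /* illustrative sketch */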