/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2013-2016 Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When a port is configured or deleted, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
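/*
 * A minimal sketch of one forwarding cycle as described above; the
 * reserve/publish helpers are hypothetical stand-ins for the lease
 * machinery implemented later in this file (nm_kr_space()/nm_kr_lease()):
 *
 *	BDG_RLOCK(b);				// shared, sleepable lock
 *	mtx_lock(&kring->q_lock);
 *	first = reserve_rx_slots(kring, n);	// grab up to n rx slots
 *	mtx_unlock(&kring->q_lock);
 *	copy_packets(src, kring, first, n);	// may page fault; rlock held
 *	mtx_lock(&kring->q_lock);
 *	publish_rx_slots(kring, first, n);	// advance nr_hwtail
 *	mtx_unlock(&kring->q_lock);
 *	BDG_RUNLOCK(b);
 */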
/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h>	/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#elif defined(_WIN32)
#include "win_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_BDG_NAME		prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES		max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
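/*
 * Illustration only (not part of this module): a userspace process
 * typically attaches to a switch port named as described above through
 * the netmap API, e.g. with the nm_open() helper from netmap_user.h.
 * A minimal sketch, assuming switch "vale0" and an ephemeral port "p0";
 * error handling mostly omitted:
 *
 *	#define NETMAP_WITH_LIBS
 *	#include <net/netmap_user.h>
 *
 *	struct nm_desc *d = nm_open("vale0:p0", NULL, 0, NULL);
 *	if (d == NULL)
 *		exit(1);
 *	// ... nm_inject()/nm_dispatch() or direct ring access ...
 *	nm_close(d);
 */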
/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSBEGIN(vars_vale);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");
SYSEND;

static int netmap_vp_create(struct nmreq *, struct ifnet *,
	struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either an index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
	 * forward this packet. ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	struct netmap_bdg_ops bdg_ops;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];

#ifdef CONFIG_NET_NS
	struct net *ns;
#endif /* CONFIG_NET_NS */
};

const char*
netmap_bdg_name(struct netmap_vp_adapter *vp)
{
	struct nm_bridge *b = vp->na_bdg;
	if (b == NULL)
		return NULL;
	return b->bdg_basename;
}


#ifndef CONFIG_NET_NS
/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
static struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */
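/*
 * Note (informative): the destination lists built with nm_bdg_q above
 * are linked through the ft_next field of the forwarding-table entries,
 * with NM_FT_NULL as terminator. Conceptually a list is walked as:
 *
 *	u_int i;
 *	for (i = d->bq_head; i != NM_FT_NULL; i = ft[i].ft_next) {
 *		// ft[i] is the first slot of one (possibly
 *		// multi-fragment) packet queued for this destination
 *	}
 */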
/*
 * this is a slightly optimized copy routine which rounds
 * to multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}


static int
nm_is_id_char(const char c)
{
	return (c >= 'a' && c <= 'z') ||
	       (c >= 'A' && c <= 'Z') ||
	       (c >= '0' && c <= '9') ||
	       (c == '_');
}

/* Validate the name of a VALE bridge port and return the
 * position of the ":" character. */
static int
nm_vale_name_validate(const char *name)
{
	int colon_pos = -1;
	int i;

	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
		return -1;
	}

	for (i = 0; name[i]; i++) {
		if (name[i] == ':') {
			if (colon_pos != -1) {
				return -1;
			}
			colon_pos = i;
		} else if (!nm_is_id_char(name[i])) {
			return -1;
		}
	}

	if (i >= IFNAMSIZ) {
		return -1;
	}

	return colon_pos;
}

/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, namelen;
	struct nm_bridge *b = NULL, *bridges;
	u_int num_bridges;

	NMG_LOCK_ASSERT();

	netmap_bns_getbridges(&bridges, &num_bridges);

	namelen = nm_vale_name_validate(name);
	if (namelen < 0) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < num_bridges; i++) {
		struct nm_bridge *x = bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x; /* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == num_bridges && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->bdg_ops.lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
		NM_BNS_GET(b);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			nm_os_free(kring[i].nkr_ft);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}
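/*
 * Layout note (informative): nm_alloc_bdgfwd() below carves three
 * arrays out of a single nm_os_malloc() per tx ring; the destination
 * queues and the destination-index array are reached by pointer
 * arithmetic from ft, as nm_bdg_flush() does:
 *
 *	struct nm_bdg_fwd ft[NM_BDG_BATCH_MAX];	// the packet batch
 *	struct nm_bdg_q   dstq[NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1];
 *	uint16_t          dsts[NM_BDG_BATCH_MAX];	// active destinations
 */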
/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_rings(na, NR_TX);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = nm_os_malloc(l);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}


/* remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed)
 */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	 */

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	if (b->bdg_ops.dtor)
		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
		NM_BNS_PUT(b);
	}
}
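/*
 * Worked example (informative): with bdg_port_index = {0, 1, 2, 3, ...}
 * and bdg_active_ports = 4, detaching hw = 1 swaps entry 1 with the last
 * active entry, yielding {0, 3, 2, 1, ...} and bdg_active_ports = 3;
 * index 1 now lives past the active region and can be reused by the
 * next attach.
 */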
/* nm_bdg_ctl callback for VALE ports */
static int
netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	(void)nmr;	// XXX merge ?
	if (attach)
		return 0; /* nothing to do */
	if (b) {
		netmap_set_all_rings(na, 0 /* disable */);
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
		vpna->na_bdg = NULL;
		netmap_set_all_rings(na, 1 /* enable */);
	}
	/* we took a reference just for the attach */
	netmap_adapter_put(na);
	return 0;
}

/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;

	ND("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	if (vpna->autodelete && na->ifp != NULL) {
		ND("releasing %s", na->ifp->if_xname);
		NMG_UNLOCK();
		nm_os_vi_detach(na->ifp);
		NMG_LOCK();
	}
}

/* remove a persistent VALE port from the system */
static int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	vpna = (struct netmap_vp_adapter *)NA(ifp);

	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
	if (vpna->autodelete) {
		error = EINVAL;
		goto err;
	}

	/* also make sure that nobody is using the interface */
	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
		error = EBUSY;
		goto err;
	}

	NMG_UNLOCK();

	D("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires all the references are released
	 * before unregister
	 */
	netmap_detach(ifp);
	if_rele(ifp);
	nm_os_vi_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

static int
nm_update_info(struct nmreq *nmr, struct netmap_adapter *na)
{
	nmr->nr_rx_rings = na->num_rx_rings;
	nmr->nr_tx_rings = na->num_tx_rings;
	nmr->nr_rx_slots = na->num_rx_desc;
	nmr->nr_tx_slots = na->num_tx_desc;
	return netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, NULL, &nmr->nr_arg2);
}
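/*
 * Illustration (sketch, userspace side): a persistent VALE port is
 * created by issuing NETMAP_BDG_NEWIF through the NIOCREGIF ioctl on
 * /dev/netmap, which lands in netmap_vi_create() below. The port name
 * "myport0" is an arbitrary example (it must not carry the "vale"
 * prefix); error handling omitted:
 *
 *	struct nmreq nmr;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "myport0", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_cmd = NETMAP_BDG_NEWIF;
 *	ioctl(fd, NIOCREGIF, &nmr);	// creates interface "myport0"
 */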
/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
int
netmap_vi_create(struct nmreq *nmr, int autodelete)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	struct netmap_mem_d *nmd = NULL;
	int error;

	/* don't include VALE prefix */
	if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
		return EINVAL;
	ifp = ifunit_ref(nmr->nr_name);
	if (ifp) { /* already exists, cannot create new one */
		error = EEXIST;
		NMG_LOCK();
		if (NM_NA_VALID(ifp)) {
			int update_err = nm_update_info(nmr, NA(ifp));
			if (update_err)
				error = update_err;
		}
		NMG_UNLOCK();
		if_rele(ifp);
		return error;
	}
	error = nm_os_vi_persist(nmr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	if (nmr->nr_arg2) {
		nmd = netmap_mem_find(nmr->nr_arg2);
		if (nmd == NULL) {
			error = EINVAL;
			goto err_1;
		}
	}
	/* netmap_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vp_create(nmr, ifp, nmd, &vpna);
	if (error) {
		D("error %d", error);
		goto err_1;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	if (!autodelete) {
		netmap_adapter_get(&vpna->up);
	} else {
		vpna->autodelete = 1;
	}
	NM_ATTACH_NA(ifp, &vpna->up);
	/* return the updated info */
	error = nm_update_info(nmr, &vpna->up);
	if (error) {
		goto err_2;
	}
	D("returning nr_arg2 %d", nmr->nr_arg2);
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	D("created %s", ifp->if_xname);
	return 0;

err_2:
	netmap_detach(ifp);
err_1:
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	nm_os_vi_detach(ifp);

	return error;
}

/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na,
	struct netmap_mem_d *nmd, int create)
{
	char *nr_name = nmr->nr_name;
	const char *ifname;
	struct ifnet *ifp = NULL;
	int error = 0;
	struct netmap_vp_adapter *vpna, *hostna = NULL;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;	/* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(nr_name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", nr_name);
		return (create ? ENOMEM : ENXIO);
	}
	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
		panic("x");

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */
	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		ND("checking %s", vpna->up.name);
		if (!strcmp(vpna->up.name, nr_name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", nr_name,
				vpna->up.na_refcount);
			*na = &vpna->up;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifname = nr_name + b->bdg_namelen + 1;
	ifp = ifunit_ref(ifname);
	if (!ifp) {
		/* Create an ephemeral virtual port.
		 * This block contains all the ephemeral-specific logic.
		 */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			error = EINVAL;
			goto out;
		}

		/* bdg_netmap_attach creates a struct netmap_adapter */
		error = netmap_vp_create(nmr, NULL, nmd, &vpna);
		if (error) {
			D("error %d", error);
			goto out;
		}
		/* shortcut - we can skip get_hw_na(),
		 * ownership check and nm_bdg_attach()
		 */
	} else {
		struct netmap_adapter *hw;

		error = netmap_get_hw_na(ifp, nmd, &hw);
		if (error || hw == NULL)
			goto out;

		/* host adapter might not be created */
		error = hw->nm_bdg_attach(nr_name, hw);
		if (error)
			goto out;
		vpna = hw->na_vp;
		hostna = hw->na_hostvp;
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			hostna = NULL;
	}

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC  %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (hostna != NULL) {
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", ifname, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = &vpna->up;
	netmap_adapter_get(*na);

out:
	if (ifp)
		if_rele(ifp);

	return error;
}


/* Process NETMAP_BDG_ATTACH */
static int
nm_bdg_ctl_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	struct netmap_mem_d *nmd = NULL;
	int error;

	NMG_LOCK();

	if (nmr->nr_arg2) {
		nmd = netmap_mem_find(nmr->nr_arg2);
		if (nmd == NULL) {
			error = EINVAL;
			goto unlock_exit;
		}
	}

	error = netmap_get_bdg_na(nmr, &na, nmd, 1 /* create if not exists */);
	if (error) /* no device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(na, nmr, 1);
		if (error)
			goto unref_exit;
		ND("registered %s to netmap-mode", na->name);
	}
	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}

static inline int
nm_is_bwrap(struct netmap_adapter *na)
{
	return na->nm_register == netmap_bwrap_reg;
}

/* process NETMAP_BDG_DETACH */
static int
nm_bdg_ctl_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, NULL, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	} else if (nm_is_bwrap(na) &&
		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
		/* Don't detach a NIC with polling */
		error = EBUSY;
		netmap_adapter_put(na);
		goto unlock_exit;
	}
	if (na->nm_bdg_ctl) {
		/* remove the port from bridge. The bwrap
		 * also needs to put the hwna in normal mode
		 */
		error = na->nm_bdg_ctl(na, nmr, 0);
	}

	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;

}

struct nm_bdg_polling_state;
struct nm_bdg_kthread {
	struct nm_kctx *nmk;
	u_int qfirst;
	u_int qlast;
	struct nm_bdg_polling_state *bps;
};

struct nm_bdg_polling_state {
	bool configured;
	bool stopped;
	struct netmap_bwrap_adapter *bna;
	u_int reg;
	u_int qfirst;
	u_int qlast;
	u_int cpu_from;
	u_int ncpus;
	struct nm_bdg_kthread *kthreads;
};

static void
netmap_bwrap_polling(void *data, int is_kthread)
{
	struct nm_bdg_kthread *nbk = data;
	struct netmap_bwrap_adapter *bna;
	u_int qfirst, qlast, i;
	struct netmap_kring *kring0, *kring;

	if (!nbk)
		return;
	qfirst = nbk->qfirst;
	qlast = nbk->qlast;
	bna = nbk->bps->bna;
	kring0 = NMR(bna->hwna, NR_RX);

	for (i = qfirst; i < qlast; i++) {
		kring = kring0 + i;
		kring->nm_notify(kring, 0);
	}
}

static int
nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
{
	struct nm_kctx_cfg kcfg;
	int i, j;

	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
	if (bps->kthreads == NULL)
		return ENOMEM;

	bzero(&kcfg, sizeof(kcfg));
	kcfg.worker_fn = netmap_bwrap_polling;
	kcfg.use_kthread = 1;
	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC);
		int affinity = bps->cpu_from + i;

		t->bps = bps;
		t->qfirst = all ? bps->qfirst /* must be 0 */ : affinity;
		t->qlast = all ? bps->qlast : t->qfirst + 1;
		D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
			t->qlast);

		kcfg.type = i;
		kcfg.worker_private = t;
		t->nmk = nm_os_kctx_create(&kcfg, 0, NULL);
		if (t->nmk == NULL) {
			goto cleanup;
		}
		nm_os_kctx_worker_setaff(t->nmk, affinity);
	}
	return 0;

cleanup:
	for (j = 0; j < i; j++) {
		struct nm_bdg_kthread *t = bps->kthreads + j;
		nm_os_kctx_destroy(t->nmk);
	}
	nm_os_free(bps->kthreads);
	return EFAULT;
}

/* A variant of ptnetmap_start_kthreads() */
static int
nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
{
	int error, i, j;

	if (!bps) {
		D("polling is not configured");
		return EFAULT;
	}
	bps->stopped = false;

	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		error = nm_os_kctx_worker_start(t->nmk);
		if (error) {
			D("error in nm_kthread_start()");
			goto cleanup;
		}
	}
	return 0;

cleanup:
	for (j = 0; j < i; j++) {
		struct nm_bdg_kthread *t = bps->kthreads + j;
		nm_os_kctx_worker_stop(t->nmk);
	}
	bps->stopped = true;
	return error;
}

static void
nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
{
	int i;

	if (!bps)
		return;

	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		nm_os_kctx_worker_stop(t->nmk);
		nm_os_kctx_destroy(t->nmk);
	}
	bps->stopped = true;
}

static int
get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na,
	struct nm_bdg_polling_state *bps)
{
	int req_cpus, avail_cpus, core_from;
	u_int reg, i, qfirst, qlast;

	avail_cpus = nm_os_ncpus();
	req_cpus = nmr->nr_arg1;

	if (req_cpus == 0) {
		D("req_cpus must be > 0");
		return EINVAL;
	} else if (req_cpus >= avail_cpus) {
		D("for safety, we need at least one core left in the system");
		return EINVAL;
	}
	reg = nmr->nr_flags & NR_REG_MASK;
	i = nmr->nr_ringid & NETMAP_RING_MASK;
	/*
	 * ONE_NIC: dedicate one core to one ring. If multiple cores
	 *          are specified, consecutive rings are also polled.
	 *          For example, if ringid=2 and 2 cores are given,
	 *          ring 2 and 3 are polled by core 2 and 3, respectively.
	 * ALL_NIC: poll all the rings using a core specified by ringid.
	 *          the number of cores must be 1.
	 */
	if (reg == NR_REG_ONE_NIC) {
		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
			D("only %d rings exist (ring %u-%u is given)",
				nma_get_nrings(na, NR_RX), i, i+req_cpus);
			return EINVAL;
		}
		qfirst = i;
		qlast = qfirst + req_cpus;
		core_from = qfirst;
	} else if (reg == NR_REG_ALL_NIC) {
		if (req_cpus != 1) {
			D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus);
			return EINVAL;
		}
		qfirst = 0;
		qlast = nma_get_nrings(na, NR_RX);
		core_from = i;
	} else {
		D("reg must be ALL_NIC or ONE_NIC");
		return EINVAL;
	}

	bps->reg = reg;
	bps->qfirst = qfirst;
	bps->qlast = qlast;
	bps->cpu_from = core_from;
	bps->ncpus = req_cpus;
	D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
		reg == NR_REG_ALL_NIC ? "REG_ALL_NIC" : "REG_ONE_NIC",
		qfirst, qlast, core_from, req_cpus);
"REG_ALL_NIC" : "REG_ONE_NIC", 1115 qfirst, qlast, core_from, req_cpus); 1116 return 0; 1117 } 1118 1119 static int 1120 nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na) 1121 { 1122 struct nm_bdg_polling_state *bps; 1123 struct netmap_bwrap_adapter *bna; 1124 int error; 1125 1126 bna = (struct netmap_bwrap_adapter *)na; 1127 if (bna->na_polling_state) { 1128 D("ERROR adapter already in polling mode"); 1129 return EFAULT; 1130 } 1131 1132 bps = nm_os_malloc(sizeof(*bps)); 1133 if (!bps) 1134 return ENOMEM; 1135 bps->configured = false; 1136 bps->stopped = true; 1137 1138 if (get_polling_cfg(nmr, na, bps)) { 1139 nm_os_free(bps); 1140 return EINVAL; 1141 } 1142 1143 if (nm_bdg_create_kthreads(bps)) { 1144 nm_os_free(bps); 1145 return EFAULT; 1146 } 1147 1148 bps->configured = true; 1149 bna->na_polling_state = bps; 1150 bps->bna = bna; 1151 1152 /* disable interrupt if possible */ 1153 if (bna->hwna->nm_intr) 1154 bna->hwna->nm_intr(bna->hwna, 0); 1155 /* start kthread now */ 1156 error = nm_bdg_polling_start_kthreads(bps); 1157 if (error) { 1158 D("ERROR nm_bdg_polling_start_kthread()"); 1159 nm_os_free(bps->kthreads); 1160 nm_os_free(bps); 1161 bna->na_polling_state = NULL; 1162 if (bna->hwna->nm_intr) 1163 bna->hwna->nm_intr(bna->hwna, 1); 1164 } 1165 return error; 1166 } 1167 1168 static int 1169 nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na) 1170 { 1171 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; 1172 struct nm_bdg_polling_state *bps; 1173 1174 if (!bna->na_polling_state) { 1175 D("ERROR adapter is not in polling mode"); 1176 return EFAULT; 1177 } 1178 bps = bna->na_polling_state; 1179 nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state); 1180 bps->configured = false; 1181 nm_os_free(bps); 1182 bna->na_polling_state = NULL; 1183 /* reenable interrupt */ 1184 if (bna->hwna->nm_intr) 1185 bna->hwna->nm_intr(bna->hwna, 1); 1186 return 0; 1187 } 1188 1189 /* Called by either user's context (netmap_ioctl()) 1190 * or external kernel modules (e.g., Openvswitch). 1191 * Operation is indicated in nmr->nr_cmd. 1192 * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge 1193 * requires bdg_ops argument; the other commands ignore this argument. 1194 * 1195 * Called without NMG_LOCK. 
/* Called by either user's context (netmap_ioctl())
 * or external kernel modules (e.g., Openvswitch).
 * Operation is indicated in nmr->nr_cmd.
 * NETMAP_BDG_OPS, which sets the configure/lookup/dtor functions of the
 * bridge, requires the bdg_ops argument; the other commands ignore it.
 *
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
{
	struct nm_bridge *b, *bridges;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;
	u_int num_bridges;

	netmap_bns_getbridges(&bridges, &num_bridges);

	switch (cmd) {
	case NETMAP_BDG_NEWIF:
		error = netmap_vi_create(nmr, 0 /* no autodelete */);
		break;

	case NETMAP_BDG_DELIF:
		error = nm_vi_destroy(nmr->nr_name);
		break;

	case NETMAP_BDG_ATTACH:
		error = nm_bdg_ctl_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_ctl_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = 0;
			nmr->nr_arg1 = b - bridges; /* bridge index */
			nmr->nr_arg2 = NM_BDG_NOPORT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				/* the former and the latter identify a
				 * virtual port and a NIC, respectively
				 */
				if (!strcmp(vpna->up.name, name)) {
					nmr->nr_arg2 = i; /* port index */
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * seeing the new and old value of nr_arg1, and can
			 * detect the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = bridges + i;
				for ( ; j < NM_BDG_MAXPORTS; j++) {
					if (b->bdg_ports[j] == NULL)
						continue;
					vpna = b->bdg_ports[j];
					strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
					error = 0;
					goto out;
				}
				j = 0; /* following bridges scan from 0 */
			}
		out:
			nmr->nr_arg1 = i;
			nmr->nr_arg2 = j;
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
		/* register callbacks to the given bridge.
		 * nmr->nr_name may be just bridge's name (including ':'
		 * if it is not just NM_NAME).
		 */
		if (!bdg_ops) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->bdg_ops = *bdg_ops;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_VNET_HDR:
		/* Valid lengths for the virtio-net header are 0 (no header),
		 * 10 and 12. */
		if (nmr->nr_arg1 != 0 &&
			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
			nmr->nr_arg1 != 12) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			na->virt_hdr_len = nmr->nr_arg1;
			if (na->virt_hdr_len) {
				vpna->mfs = NETMAP_BUF_SIZE(na);
			}
			D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
			netmap_adapter_put(na);
		} else if (!na) {
			error = ENXIO;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_POLLING_ON:
	case NETMAP_BDG_POLLING_OFF:
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
		if (na && !error) {
			if (!nm_is_bwrap(na)) {
				error = EOPNOTSUPP;
			} else if (cmd == NETMAP_BDG_POLLING_ON) {
				error = nm_bdg_ctl_polling_start(nmr, na);
				if (!error)
					netmap_adapter_get(na);
			} else {
				error = nm_bdg_ctl_polling_stop(nmr, na);
				if (!error)
					netmap_adapter_put(na);
			}
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}

int
netmap_bdg_config(struct nmreq *nmr)
{
	struct nm_bridge *b;
	int error = EINVAL;

	NMG_LOCK();
	b = nm_find_bridge(nmr->nr_name, 0);
	if (!b) {
		NMG_UNLOCK();
		return error;
	}
	NMG_UNLOCK();
	/* Don't call config() with NMG_LOCK() held */
	BDG_RLOCK(b);
	if (b->bdg_ops.config != NULL)
		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
	BDG_RUNLOCK(b);
	return error;
}


/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rings(na, NR_RX);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


/* nm_krings_delete callback for VALE ports. */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);
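/*
 * Fragment bookkeeping (informative): in the ft[] batch filled by
 * nm_bdg_preflush() below, a multi-segment packet occupies consecutive
 * entries, all but the last carrying NS_MOREFRAG in ft_flags; the total
 * count is stored in ft_frags of the *first* entry only, e.g. for a
 * 3-fragment packet starting at index k:
 *
 *	ft[k].ft_frags = 3;	// NS_MOREFRAG set in ft[k].ft_flags
 *	// ft[k+1]: NS_MOREFRAG set
 *	// ft[k+2]: last fragment, NS_MOREFRAG clear
 *
 * which is why nm_bdg_flush() advances with i += ft[i].ft_frags.
 */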
/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return j;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* we do not use the buf changed flag, but we still need to reset it */
		slot->flags &= ~NS_BUF_CHANGED;

		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
		 * have to fix frags count. */
		frags--;
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags;
		D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */
/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)						\
do {								\
	a -= b; a -= c; a ^= (c >> 13);				\
	b -= c; b -= a; b ^= (a << 8);				\
	c -= a; c -= b; c ^= (b >> 13);				\
	a -= b; a -= c; a ^= (c >> 12);				\
	b -= c; b -= a; b ^= (a << 16);				\
	c -= a; c -= b; c ^= (b >> 5);				\
	a -= b; a -= c; a ^= (c >> 3);				\
	b -= c; b -= a; b ^= (a << 10);				\
	c -= a; c -= b; c ^= (b >> 15);				\
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
	return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


/* nm_register callback for VALE ports */
static int
netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;
	enum txrx t;
	int i;

	/* persistent ports may be put in netmap mode
	 * before being attached to a bridge
	 */
	if (vpna->na_bdg)
		BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		for_rx_tx(t) {
			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
				struct netmap_kring *kring = &NMR(na, t)[i];

				if (nm_kring_pending_on(kring))
					kring->nr_mode = NKR_NETMAP_ON;
			}
		}
		if (na->active_fds == 0)
			na->na_flags |= NAF_NETMAP_ON;
		/* XXX on FreeBSD, persistent VALE ports should also
		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
		 */
	} else {
		if (na->active_fds == 0)
			na->na_flags &= ~NAF_NETMAP_ON;
		for_rx_tx(t) {
			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
				struct netmap_kring *kring = &NMR(na, t)[i];

				if (nm_kring_pending_off(kring))
					kring->nr_mode = NKR_NETMAP_OFF;
			}
		}
	}
	if (vpna->na_bdg)
		BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}
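/*
 * Note (informative): netmap_bdg_learning() below extracts the two
 * 48-bit MAC addresses with two unaligned 64-bit little-endian loads:
 *
 *	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
 *		// bytes 0..5 = destination MAC, top 2 bytes masked off
 *	smac = le64toh(*(uint64_t *)(buf + 4)) >> 16;
 *		// bytes 4..11 loaded; >> 16 drops bytes 4-5,
 *		// leaving bytes 6..11 = source MAC in the low 6 bytes
 */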
/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address,
 * then return the destination port index, and the
 * ring in *dst_ring (at the moment, always use ring 0)
 */
u_int
netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	uint8_t *buf = ft->ft_buf;
	u_int buf_len = ft->ft_len;
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;
	uint8_t indbuf[12];

	/* safety check, unfortunately we have many cases */
	if (buf_len >= 14 + na->up.virt_hdr_len) {
		/* virthdr + mac_hdr in the same slot */
		buf += na->up.virt_hdr_len;
		buf_len -= na->up.virt_hdr_len;
	} else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
		/* only header in first fragment */
		ft++;
		buf = ft->ft_buf;
		buf_len = ft->ft_len;
	} else {
		RD(5, "invalid buf format, length %d", buf_len);
		return NM_BDG_NOPORT;
	}

	if (ft->ft_flags & NS_INDIRECT) {
		if (copyin(buf, indbuf, sizeof(indbuf))) {
			return NM_BDG_NOPORT;
		}
		buf = indbuf;
	}

	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
			D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
				s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
	    k->nr_hwcur >= k->nkr_num_slots ||
	    k->nr_tail >= k->nkr_num_slots ||
	    busy < 0 ||
	    busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}
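/*
 * Worked example (informative): with nkr_num_slots = 1024,
 * nr_hwcur = 100 and nkr_hwlease = 612, the rx branch above gives
 * busy = 512 and space = 1024 - 1 - 512 = 511: at most 511 more slots
 * can be leased before the ring would overrun the slots not yet
 * released by userspace.
 */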
/* make a lease on the kring for N positions.
 * return the lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
	    k->nr_hwcur >= k->nkr_num_slots ||
	    k->nr_hwtail >= k->nkr_num_slots ||
	    k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}

/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
	u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is not into the first
		 * fragment nor at the very beginning of the second.
		 */
		if (unlikely(na->up.virt_hdr_len > ft[i].ft_len))
			continue;
		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is identified to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		u_int j;
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is
		 * a chance that we may not use all of the slots
		 * we have claimed, and we will need to handle the
		 * leftover ones when we regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
			if (netmap_verbose) {
				RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
					dst_na->up.virt_hdr_len);
			}
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * dst_na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
"virtual" : "nic/host"); 1913 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 1914 nrings = dst_na->up.num_rx_rings; 1915 if (dst_nr >= nrings) 1916 dst_nr = dst_nr % nrings; 1917 kring = &dst_na->up.rx_rings[dst_nr]; 1918 ring = kring->ring; 1919 lim = kring->nkr_num_slots - 1; 1920 1921 retry: 1922 1923 if (dst_na->retry && retry) { 1924 /* try to get some free slot from the previous run */ 1925 kring->nm_notify(kring, 0); 1926 /* actually useful only for bwraps, since there 1927 * the notify will trigger a txsync on the hwna. VALE ports 1928 * have dst_na->retry == 0 1929 */ 1930 } 1931 /* reserve the buffers in the queue and an entry 1932 * to report completion, and drop lock. 1933 * XXX this might become a helper function. 1934 */ 1935 mtx_lock(&kring->q_lock); 1936 if (kring->nkr_stopped) { 1937 mtx_unlock(&kring->q_lock); 1938 goto cleanup; 1939 } 1940 my_start = j = kring->nkr_hwlease; 1941 howmany = nm_kr_space(kring, 1); 1942 if (needed < howmany) 1943 howmany = needed; 1944 lease_idx = nm_kr_lease(kring, howmany, 1); 1945 mtx_unlock(&kring->q_lock); 1946 1947 /* only retry if we need more than available slots */ 1948 if (retry && needed <= howmany) 1949 retry = 0; 1950 1951 /* copy to the destination queue */ 1952 while (howmany > 0) { 1953 struct netmap_slot *slot; 1954 struct nm_bdg_fwd *ft_p, *ft_end; 1955 u_int cnt; 1956 1957 /* find the queue from which we pick next packet. 1958 * NM_FT_NULL is always higher than valid indexes 1959 * so we never dereference it if the other list 1960 * has packets (and if both are empty we never 1961 * get here). 1962 */ 1963 if (next < brd_next) { 1964 ft_p = ft + next; 1965 next = ft_p->ft_next; 1966 } else { /* insert broadcast */ 1967 ft_p = ft + brd_next; 1968 brd_next = ft_p->ft_next; 1969 } 1970 cnt = ft_p->ft_frags; // cnt > 0 1971 if (unlikely(cnt > howmany)) 1972 break; /* no more space */ 1973 if (netmap_verbose && cnt > 1) 1974 RD(5, "rx %d frags to %d", cnt, j); 1975 ft_end = ft_p + cnt; 1976 if (unlikely(virt_hdr_mismatch)) { 1977 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); 1978 } else { 1979 howmany -= cnt; 1980 do { 1981 char *dst, *src = ft_p->ft_buf; 1982 size_t copy_len = ft_p->ft_len, dst_len = copy_len; 1983 1984 slot = &ring->slot[j]; 1985 dst = NMB(&dst_na->up, slot); 1986 1987 ND("send [%d] %d(%d) bytes at %s:%d", 1988 i, (int)copy_len, (int)dst_len, 1989 NM_IFPNAME(dst_ifp), j); 1990 /* round to a multiple of 64 */ 1991 copy_len = (copy_len + 63) & ~63; 1992 1993 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) || 1994 copy_len > NETMAP_BUF_SIZE(&na->up))) { 1995 RD(5, "invalid len %d, down to 64", (int)copy_len); 1996 copy_len = dst_len = 64; // XXX 1997 } 1998 if (ft_p->ft_flags & NS_INDIRECT) { 1999 if (copyin(src, dst, copy_len)) { 2000 // invalid user pointer, pretend len is 0 2001 dst_len = 0; 2002 } 2003 } else { 2004 //memcpy(dst, src, copy_len); 2005 pkt_copy(src, dst, (int)copy_len); 2006 } 2007 slot->len = dst_len; 2008 slot->flags = (cnt << 8)| NS_MOREFRAG; 2009 j = nm_next(j, lim); 2010 needed--; 2011 ft_p++; 2012 } while (ft_p != ft_end); 2013 slot->flags = (cnt << 8); /* clear flag on last entry */ 2014 } 2015 /* are we done ? */ 2016 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 2017 break; 2018 } 2019 { 2020 /* current position */ 2021 uint32_t *p = kring->nkr_leases; /* shorthand */ 2022 uint32_t update_pos; 2023 int still_locked = 1; 2024 2025 mtx_lock(&kring->q_lock); 2026 if (unlikely(howmany > 0)) { 2027 /* not used all bufs. 
/* nm_txsync callback for VALE ports */
static int
netmap_vp_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter *)kring->na;
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;

	if (bridge_batch <= 0) { /* testing only */
		done = head; // used all
		goto done;
	}
	if (!na->na_bdg) {
		done = head;
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(kring, head);
done:
	if (done != head)
		D("early break at %d/%d, tail %d", done, head, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	if (netmap_verbose)
		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
	return 0;
}


/* rxsync code used by the VALE ports nm_rxsync callback and also
 * internally by the bwrap
 */
static int
netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = kring->rhead;
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = NMB(na, slot);

			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
				D("bad buffer index %d, ignore ?",
				  slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	n = 0;
done:
	return n;
}
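/*
 * Note on the index arithmetic above (illustration only): nm_next()
 * and nm_prev() are the standard netmap modular ring-index helpers.
 * Setting nr_hwtail = nm_prev(done, lim) keeps the usual "one slot
 * always left unused" circular-buffer convention, so after a txsync
 * on a VALE port userspace sees all but one slot as available again.
 * A minimal model of the two helpers, with made-up model_* names to
 * avoid clashing with the real ones:
 */
#if 0
static inline uint32_t
model_nm_next(uint32_t i, uint32_t lim)	/* lim == num_slots - 1 */
{
	return (i == lim) ? 0 : i + 1;
}

static inline uint32_t
model_nm_prev(uint32_t i, uint32_t lim)
{
	return (i == 0) ? lim : i - 1;
}
#endif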
/*
 * nm_rxsync callback for VALE ports
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
netmap_vp_rxsync(struct netmap_kring *kring, int flags)
{
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync_locked(kring, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}


/* nm_bdg_attach callback for VALE ports
 * The na_vp port is this same netmap_adapter. There is no host port.
 */
static int
netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;

	if (vpna->na_bdg)
		return EBUSY;
	na->na_vp = vpna;
	strncpy(na->name, name, sizeof(na->name));
	na->na_hostvp = NULL;
	return 0;
}

/* create a netmap_vp_adapter that describes a VALE port.
 * Only persistent VALE ports have a non-null ifp.
 */
static int
netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp,
	struct netmap_mem_d *nmd,
	struct netmap_vp_adapter **ret)
{
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error = 0;
	u_int npipes = 0;

	vpna = nm_os_malloc(sizeof(*vpna));
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;
	strncpy(na->name, nmr->nr_name, sizeof(na->name));

	/* bound checking */
	na->num_tx_rings = nmr->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_tx_rings = na->num_tx_rings; // write back
	na->num_rx_rings = nmr->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_rx_rings = na->num_rx_rings; // write back
	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
		1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = nmr->nr_tx_slots;
	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
		1, NM_BDG_MAXSLOTS, NULL);
	/* validate number of pipes. We want at least 1,
	 * but probably can do with some more.
	 * So let's use 2 as default (when 0 is supplied)
	 */
	npipes = nmr->nr_arg1;
	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
	nmr->nr_arg1 = npipes; /* write back */
	/* validate extra bufs */
	nm_bound_var(&nmr->nr_arg3, 0, 0,
		128*NM_BDG_MAXSLOTS, NULL);
	na->num_rx_desc = nmr->nr_rx_slots;
	vpna->mfs = 1514;
	vpna->last_smac = ~0llu;
	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
		vpna->mfs = netmap_buf_size; */
	if (netmap_verbose)
		D("max frame size %u", vpna->mfs);

	na->na_flags |= NAF_BDG_MAYSLEEP;
	/* persistent VALE ports look like hw devices
	 * with a native netmap adapter
	 */
	if (ifp)
		na->na_flags |= NAF_NATIVE;
	na->nm_txsync = netmap_vp_txsync;
	na->nm_rxsync = netmap_vp_rxsync;
	na->nm_register = netmap_vp_reg;
	na->nm_krings_create = netmap_vp_krings_create;
	na->nm_krings_delete = netmap_vp_krings_delete;
	na->nm_dtor = netmap_vp_dtor;
	D("nr_arg2 %d", nmr->nr_arg2);
	na->nm_mem = nmd ?
		netmap_mem_get(nmd) :
		netmap_mem_private_new(
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc,
			nmr->nr_arg3, npipes, &error);
	if (na->nm_mem == NULL)
		goto err;
	na->nm_bdg_attach = netmap_vp_bdg_attach;
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error)
		goto err;
	*ret = vpna;
	return 0;

err:
	if (na->nm_mem != NULL)
		netmap_mem_put(na->nm_mem);
	nm_os_free(vpna);
	return error;
}
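/*
 * Illustration of how the bounds applied in netmap_vp_create() look
 * from userspace. A minimal sketch, assuming the legacy nmreq API of
 * this netmap generation; error handling is omitted. The requested
 * values are only hints: the kernel bumps or clamps them (to
 * NM_BDG_MAXRINGS / NM_BDG_MAXSLOTS, with NM_BRIDGE_RINGSIZE as the
 * default ring size) and writes the actual values back.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

static int
open_vale_port(void)
{
	struct nmreq req;
	int fd = open("/dev/netmap", O_RDWR);

	memset(&req, 0, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "vale0:p0", sizeof(req.nr_name) - 1);
	req.nr_tx_rings = req.nr_rx_rings = 64;	/* clamped to NM_BDG_MAXRINGS */
	req.nr_tx_slots = req.nr_rx_slots = 0;	/* 0 -> NM_BRIDGE_RINGSIZE */
	ioctl(fd, NIOCREGIF, &req);	/* kernel writes back actual values */
	return fd;
}
#endif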
/* Bridge wrapper code (bwrap).
 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
 * VALE switch.
 * The main task is to swap the meaning of tx and rx rings to match the
 * expectations of the VALE switch code (see nm_bdg_flush).
 *
 * The bwrap works by interposing a netmap_bwrap_adapter between the
 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
 * a netmap_vp_adapter to the rest of the system, but, internally, it
 * translates all callbacks to what the hwna expects.
 *
 * Note that we have to intercept callbacks coming from two sides:
 *
 * - callbacks coming from the netmap module are intercepted by
 *   passing around the netmap_bwrap_adapter instead of the hwna
 *
 * - callbacks coming from outside of the netmap module only know
 *   about the hwna. This, however, only happens in interrupt
 *   handlers, where only the hwna->nm_notify callback is called.
 *   What the bwrap does is to overwrite the hwna->nm_notify callback
 *   with its own netmap_bwrap_intr_notify.
 *   XXX This assumes that the hwna->nm_notify callback was the
 *   standard netmap_notify(), as is the case for nic adapters.
 *   Any additional action performed by hwna->nm_notify will not be
 *   performed by netmap_bwrap_intr_notify.
 *
 * Additionally, the bwrap can optionally attach the host rings pair
 * of the wrapped adapter to a different port of the switch.
 */
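/*
 * Sketch of the interception pattern described above, reduced to its
 * essence; the real per-ring save/restore happens in netmap_bwrap_reg()
 * below, using the save_notify field already present in the kring.
 * Purely illustrative, not compiled.
 */
#if 0
/* on attach: remember the old callback, install our own */
kring->save_notify = kring->nm_notify;
kring->nm_notify = netmap_bwrap_intr_notify;

/* on detach: put the original callback back */
kring->nm_notify = kring->save_notify;
kring->save_notify = NULL;
#endif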


static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct nm_bridge *b = bna->up.na_bdg,
		*bh = bna->host.na_bdg;

	netmap_mem_put(bna->host.up.nm_mem);

	if (b) {
		netmap_bdg_detach_common(b, bna->up.bdg_port,
			(bh ? bna->host.bdg_port : -1));
	}

	ND("na %p", na);
	na->ifp = NULL;
	bna->host.up.ifp = NULL;
	hwna->na_private = NULL;
	hwna->na_vp = hwna->na_hostvp = NULL;
	hwna->na_flags &= ~NAF_BUSY;
	netmap_adapter_put(hwna);
}


/*
 * Intr callback for NICs connected to a bridge.
 * Simply ignore tx interrupts (maybe we could try to recover space ?)
 * and pass received packets from the nic to the bridge.
 *
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC or the host stack.
 * The only part of the ring that is significant are the slots,
 * and head/cur/tail are set from the kring as needed
 * (part as a receive ring, part as a transmit ring).
 *
 * This is the callback that overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are put on an
 * hwna rx ring.
 * The bridge wrapper then sends the packets through the bridge.
 */
static int
netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_kring *bkring;
	struct netmap_vp_adapter *vpna = &bna->up;
	u_int ring_nr = kring->ring_id;
	int ret = NM_IRQ_COMPLETED;
	int error;

	if (netmap_verbose)
		D("%s %s 0x%x", na->name, kring->name, flags);

	bkring = &vpna->up.tx_rings[ring_nr];

	/* make sure the ring is not disabled */
	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
		return EIO;
	}

	if (netmap_verbose)
		D("%s head %d cur %d tail %d", na->name,
		  kring->rhead, kring->rcur, kring->rtail);

	/* simulate a user wakeup on the rx ring
	 * fetch packets that have arrived.
	 */
	error = kring->nm_sync(kring, 0);
	if (error)
		goto put_out;
	if (kring->nr_hwcur == kring->nr_hwtail) {
		if (netmap_verbose)
			D("how strange, interrupt with no packets on %s",
			  na->name);
		goto put_out;
	}

	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
	 * to push all packets out.
	 */
	bkring->rhead = bkring->rcur = kring->nr_hwtail;

	netmap_vp_txsync(bkring, flags);

	/* mark all buffers as released on this ring */
	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
	/* another call to actually release the buffers */
	error = kring->nm_sync(kring, 0);

	/* The second rxsync may have further advanced hwtail. If this happens,
	 * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
	if (kring->rcur != kring->nr_hwtail) {
		ret = NM_IRQ_RESCHED;
	}
put_out:
	nm_kr_put(kring);

	return error ? error : ret;
}
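/*
 * Illustration of how a caller is expected to honor NM_IRQ_RESCHED:
 * keep re-invoking the notify until the ring drains. This is only a
 * sketch of the contract; the actual dispatch lives in the generic
 * interrupt path, not in this file.
 */
#if 0
while (netmap_bwrap_intr_notify(kring, flags) == NM_IRQ_RESCHED)
	; /* more packets arrived while we were forwarding */
#endif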
/* nm_register callback for bwrap */
static int
netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_vp_adapter *hostna = &bna->host;
	int error, i;
	enum txrx t;

	ND("%s %s", na->name, onoff ? "on" : "off");

	if (onoff) {
		/* netmap_do_regif has been called on the bwrap na.
		 * We need to pass the information about the
		 * memory allocator down to the hwna before
		 * putting it in netmap mode
		 */
		hwna->na_lut = na->na_lut;

		if (hostna->na_bdg) {
			/* if the host rings have been attached to the switch,
			 * we need to copy the memory allocator information
			 * in the hostna also
			 */
			hostna->up.na_lut = na->na_lut;
		}

		/* cross-link the netmap rings.
		 * The original number of rings comes from hwna;
		 * rx rings on one side equal tx rings on the other.
		 */
		for_rx_tx(t) {
			enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
			for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
				NMR(hwna, r)[i].ring = NMR(na, t)[i].ring;
			}
		}

		if (na->na_flags & NAF_HOST_RINGS) {
			struct netmap_adapter *hna = &hostna->up;
			/* the hostna rings are the host rings of the bwrap.
			 * The corresponding krings must point back to the
			 * hostna
			 */
			hna->tx_rings = &na->tx_rings[na->num_tx_rings];
			hna->tx_rings[0].na = hna;
			hna->rx_rings = &na->rx_rings[na->num_rx_rings];
			hna->rx_rings[0].na = hna;
		}
	}

	/* pass down the pending ring state information */
	for_rx_tx(t) {
		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
			NMR(hwna, t)[i].nr_pending_mode =
				NMR(na, t)[i].nr_pending_mode;
	}

	/* forward the request to the hwna */
	error = hwna->nm_register(hwna, onoff);
	if (error)
		return error;

	/* copy up the current ring state information */
	for_rx_tx(t) {
		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
			NMR(na, t)[i].nr_mode =
				NMR(hwna, t)[i].nr_mode;
	}

	/* impersonate a netmap_vp_adapter */
	netmap_vp_reg(na, onoff);
	if (hostna->na_bdg)
		netmap_vp_reg(&hostna->up, onoff);

	if (onoff) {
		u_int i;
		/* intercept the hwna nm_notify callback on the hw rings */
		for (i = 0; i < hwna->num_rx_rings; i++) {
			hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
		}
		i = hwna->num_rx_rings; /* for safety */
		/* save the host ring notify unconditionally */
		hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
		if (hostna->na_bdg) {
			/* also intercept the host ring notify */
			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
		}
		if (na->active_fds == 0)
			na->na_flags |= NAF_NETMAP_ON;
	} else {
		u_int i;

		if (na->active_fds == 0)
			na->na_flags &= ~NAF_NETMAP_ON;

		/* reset all notify callbacks (including host ring) */
		for (i = 0; i <= hwna->num_rx_rings; i++) {
			hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
			hwna->rx_rings[i].save_notify = NULL;
		}
		hwna->na_lut.lut = NULL;
		hwna->na_lut.objtotal = 0;
		hwna->na_lut.objsize = 0;
	}

	return 0;
}

/* nm_config callback for bwrap */
static int
netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
	u_int *rxr, u_int *rxd)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	/* forward the request */
	netmap_update_config(hwna);
	/* swap the results */
	*txr = hwna->num_rx_rings;
	*txd = hwna->num_rx_desc;
	*rxr = hwna->num_tx_rings;
	*rxd = hwna->num_tx_desc;

	return 0;
}
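/*
 * Example of the swap above (illustrative numbers): for a NIC with
 * 4 tx and 2 rx rings, the bwrap reports 2 tx and 4 rx rings, since
 * frames flowing from the switch into the NIC leave through the NIC
 * tx side, and frames arriving on the NIC rx side enter the switch.
 */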
/* nm_krings_create callback for bwrap */
static int
netmap_bwrap_krings_create(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	int i, error = 0;
	enum txrx t;

	ND("%s", na->name);

	/* impersonate a netmap_vp_adapter */
	error = netmap_vp_krings_create(na);
	if (error)
		return error;

	/* also create the hwna krings */
	error = hwna->nm_krings_create(hwna);
	if (error) {
		goto err_del_vp_rings;
	}

	/* get each ring slot number from the corresponding hwna ring */
	for_rx_tx(t) {
		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
		for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
			NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots;
		}
	}

	return 0;

err_del_vp_rings:
	netmap_vp_krings_delete(na);

	return error;
}


static void
netmap_bwrap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;

	ND("%s", na->name);

	hwna->nm_krings_delete(hwna);
	netmap_vp_krings_delete(na);
}


/* notify method for the bridge-->hwna direction */
static int
netmap_bwrap_notify(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_adapter *hwna = bna->hwna;
	u_int ring_n = kring->ring_id;
	u_int lim = kring->nkr_num_slots - 1;
	struct netmap_kring *hw_kring;
	int error;

	ND("%s: na %s hwna %s",
	   (kring ? kring->name : "NULL!"),
	   (na ? na->name : "NULL!"),
	   (hwna ? hwna->name : "NULL!"));
	hw_kring = &hwna->tx_rings[ring_n];

	if (nm_kr_tryget(hw_kring, 0, NULL)) {
		return ENXIO;
	}

	/* first step: simulate a user wakeup on the rx ring */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
	   na->name, ring_n,
	   kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
	   kring->rhead, kring->rcur, kring->rtail,
	   hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	/* second step: the new packets are sent on the tx ring
	 * (which is actually the same ring)
	 */
	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
	error = hw_kring->nm_sync(hw_kring, flags);
	if (error)
		goto put_out;

	/* third step: now we are back on the rx ring */
	/* claim ownership on all hw owned bufs */
	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */

	/* fourth step: the user goes to sleep again, causing another rxsync */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
	   na->name, ring_n,
	   kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
	   kring->rhead, kring->rcur, kring->rtail,
	   hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
put_out:
	nm_kr_put(hw_kring);

	return error ? error : NM_IRQ_COMPLETED;
}
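/*
 * Note on the four steps above: the bwrap drives the NIC by
 * impersonating a userspace client of its own VALE rx ring. Step 1
 * collects what the switch delivered, step 2 pushes it out through
 * the NIC tx kring (the netmap rings are cross-linked in
 * netmap_bwrap_reg(), so no extra copy happens here), and steps 3-4
 * reclaim the buffers once the NIC has consumed them.
 */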
/* nm_bdg_ctl callback for the bwrap.
 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
 * On attach, it needs to provide a fake netmap_priv_d structure and
 * perform a netmap_do_regif() on the bwrap. This will put both the
 * bwrap and the hwna in netmap mode, with the netmap rings shared
 * and cross linked. Moreover, it will start intercepting interrupts
 * directed to hwna.
 */
static int
netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
{
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	int error = 0;

	if (attach) {
		if (NETMAP_OWNED_BY_ANY(na)) {
			return EBUSY;
		}
		if (bna->na_kpriv) {
			/* nothing to do */
			return 0;
		}
		npriv = netmap_priv_new();
		if (npriv == NULL)
			return ENOMEM;
		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
		error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW);
		if (error) {
			netmap_priv_delete(npriv);
			return error;
		}
		bna->na_kpriv = npriv;
		na->na_flags |= NAF_BUSY;
	} else {
		if (na->active_fds == 0) /* not registered */
			return EINVAL;
		netmap_priv_delete(bna->na_kpriv);
		bna->na_kpriv = NULL;
		na->na_flags &= ~NAF_BUSY;
	}
	return error;
}
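/*
 * Illustration of the userspace side of the above: vale-ctl attaches
 * a NIC to a switch with an NIOCREGIF carrying the NETMAP_BDG_ATTACH
 * command. A minimal sketch assuming the legacy nmreq API; error
 * handling omitted.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

static int
attach_nic_to_vale(const char *port)	/* e.g. "vale0:em0" */
{
	struct nmreq req;
	int fd = open("/dev/netmap", O_RDWR);

	memset(&req, 0, sizeof(req));
	req.nr_version = NETMAP_API;
	req.nr_cmd = NETMAP_BDG_ATTACH;
	strncpy(req.nr_name, port, sizeof(req.nr_name) - 1);
	return ioctl(fd, NIOCREGIF, &req); /* NETMAP_BDG_DETACH undoes it */
}
#endif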
/* attach a bridge wrapper to the 'real' device */
int
netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *na = NULL;
	struct netmap_adapter *hostna = NULL;
	int error = 0;
	enum txrx t;

	/* make sure the NIC is not already in use */
	if (NETMAP_OWNED_BY_ANY(hwna)) {
		D("NIC %s busy, cannot attach to bridge", hwna->name);
		return EBUSY;
	}

	bna = nm_os_malloc(sizeof(*bna));
	if (bna == NULL) {
		return ENOMEM;
	}

	na = &bna->up.up;
	/* make the bwrap ifp point to the real ifp */
	na->ifp = hwna->ifp;
	if_ref(na->ifp);
	na->na_private = bna;
	strncpy(na->name, nr_name, sizeof(na->name));
	/* fill the ring data for the bwrap adapter with rx/tx meanings
	 * swapped. The real cross-linking will be done during register,
	 * when all the krings will have been created.
	 */
	for_rx_tx(t) {
		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
	}
	na->nm_dtor = netmap_bwrap_dtor;
	na->nm_register = netmap_bwrap_reg;
	// na->nm_txsync = netmap_bwrap_txsync;
	// na->nm_rxsync = netmap_bwrap_rxsync;
	na->nm_config = netmap_bwrap_config;
	na->nm_krings_create = netmap_bwrap_krings_create;
	na->nm_krings_delete = netmap_bwrap_krings_delete;
	na->nm_notify = netmap_bwrap_notify;
	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
	na->pdev = hwna->pdev;
	na->nm_mem = netmap_mem_get(hwna->nm_mem);
	na->virt_hdr_len = hwna->virt_hdr_len;
	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */

	bna->hwna = hwna;
	netmap_adapter_get(hwna);
	hwna->na_private = bna; /* weak reference */
	hwna->na_vp = &bna->up;

	if (hwna->na_flags & NAF_HOST_RINGS) {
		if (hwna->na_flags & NAF_SW_ONLY)
			na->na_flags |= NAF_SW_ONLY;
		na->na_flags |= NAF_HOST_RINGS;
		hostna = &bna->host.up;
		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
		hostna->ifp = hwna->ifp;
		for_rx_tx(t) {
			enum txrx r = nm_txrx_swap(t);
			nma_set_nrings(hostna, t, 1);
			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
		}
		// hostna->nm_txsync = netmap_bwrap_host_txsync;
		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
		hostna->nm_notify = netmap_bwrap_notify;
		hostna->nm_mem = netmap_mem_get(na->nm_mem);
		hostna->na_private = bna;
		hostna->na_vp = &bna->up;
		na->na_hostvp = hwna->na_hostvp =
			hostna->na_hostvp = &bna->host;
		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
	}

	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
	   na->name, na->ifp->if_xname,
	   na->num_tx_rings, na->num_tx_desc,
	   na->num_rx_rings, na->num_rx_desc);

	error = netmap_attach_common(na);
	if (error) {
		goto err_free;
	}
	hwna->na_flags |= NAF_BUSY;
	return 0;

err_free:
	hwna->na_vp = hwna->na_hostvp = NULL;
	netmap_adapter_put(hwna);
	nm_os_free(bna);
	return error;
}

struct nm_bridge *
netmap_init_bridges2(u_int n)
{
	int i;
	struct nm_bridge *b;

	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
	if (b == NULL)
		return NULL;
	for (i = 0; i < n; i++)
		BDG_RWINIT(&b[i]);
	return b;
}

void
netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
{
	int i;

	if (b == NULL)
		return;

	for (i = 0; i < n; i++)
		BDG_RWDESTROY(&b[i]);
	nm_os_free(b);
}

int
netmap_init_bridges(void)
{
#ifdef CONFIG_NET_NS
	return netmap_bns_register();
#else
	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
	if (nm_bridges == NULL)
		return ENOMEM;
	return 0;
#endif
}

void
netmap_uninit_bridges(void)
{
#ifdef CONFIG_NET_NS
	netmap_bns_unregister();
#else
	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
#endif
}
#endif /* WITH_VALE */