/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2013-2016 Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When creating or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h>	/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock // struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#elif defined(_WIN32)
#include "win_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_BDG_NAME		prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES		max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches a physical interface name, the port is
 * connected to a physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue lock instead of core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
/* Default size for the Maximum Frame Size. */
#define NM_BDG_MFS_DEFAULT	1514


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 */
static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSBEGIN(vars_vale);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
		"Max batch size to be used in the bridge");
SYSEND;
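
/*
 * Example (FreeBSD): the batch size above can be tuned at runtime with
 *	sysctl dev.netmap.bridge_batch=256
 * Smaller batches reduce latency, larger ones amortize the cost of the
 * per-batch bridge lock and lookups.
 */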

static int netmap_vp_create(struct nmreq_header *hdr, struct ifnet *,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/* Holds the default callbacks */
static struct netmap_bdg_ops default_bdg_ops = {netmap_bdg_learning, NULL, NULL};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non-blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
#define NM_BDG_IFNAMSIZ IFNAMSIZ
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports;
	char		bdg_basename[NM_BDG_IFNAMSIZ];

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 */
	uint32_t	bdg_port_index[NM_BDG_MAXPORTS];
	/* used by netmap_bdg_detach_common() */
	uint32_t	tmp_bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];

	/*
	 * Programmable lookup functions to figure out the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
	 * forward this packet. ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * The function is set by netmap_bdg_regops().
	 */
	struct netmap_bdg_ops *bdg_ops;

	/*
	 * Contains the data structure used by the bdg_ops.lookup function.
	 * By default it points to *ht, which is allocated on attach and used
	 * by the default lookup; otherwise it points to the data structure
	 * received by netmap_bdg_regops().
	 */
	void *private_data;
	struct nm_hash_ent *ht;

	/* Currently used to specify if the bridge is still in use while empty
	 * and if it has been put in exclusive mode by an external module, see
	 * netmap_bdg_regops() and netmap_bdg_create().
	 */
#define NM_BDG_ACTIVE		1
#define NM_BDG_EXCLUSIVE	2
	uint8_t	bdg_flags;


#ifdef CONFIG_NET_NS
	struct net *ns;
#endif /* CONFIG_NET_NS */
};

const char*
netmap_bdg_name(struct netmap_vp_adapter *vp)
{
	struct nm_bridge *b = vp->na_bdg;
	if (b == NULL)
		return NULL;
	return b->bdg_basename;
}


#ifndef CONFIG_NET_NS
/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
static struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */


/*
 * this is a slightly optimized copy routine which rounds
 * to multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non overlapped.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		memcpy(dst, src, l);
		return;
	}
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}


static int
nm_is_id_char(const char c)
{
	return (c >= 'a' && c <= 'z') ||
	       (c >= 'A' && c <= 'Z') ||
	       (c >= '0' && c <= '9') ||
	       (c == '_');
}

/* Validate the name of a VALE bridge port and return the
 * position of the ":" character. */
static int
nm_vale_name_validate(const char *name)
{
	int colon_pos = -1;
	int i;

	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
		return -1;
	}

	for (i = 0; i < NM_BDG_IFNAMSIZ && name[i]; i++) {
		if (name[i] == ':') {
			colon_pos = i;
			break;
		} else if (!nm_is_id_char(name[i])) {
			return -1;
		}
	}

	if (strlen(name) - colon_pos > IFNAMSIZ) {
		/* interface name too long */
		return -1;
	}

	return colon_pos;
}

/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, namelen;
	struct nm_bridge *b = NULL, *bridges;
	u_int num_bridges;

	NMG_LOCK_ASSERT();

	netmap_bns_getbridges(&bridges, &num_bridges);

	namelen = nm_vale_name_validate(name);
	if (namelen < 0) {
		D("invalid bridge name %s", name ?
			name : NULL);
		return NULL;
	}

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < num_bridges; i++) {
		struct nm_bridge *x = bridges + i;

		if ((x->bdg_flags & NM_BDG_ACTIVE) + x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == num_bridges && b) { /* name not found, can create entry */
		/* initialize the bridge */
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
		if (b->ht == NULL) {
			D("failed to allocate hash table");
			return NULL;
		}
		strncpy(b->bdg_basename, name, namelen);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->bdg_ops = &default_bdg_ops;
		b->private_data = b->ht;
		b->bdg_flags = 0;
		NM_BNS_GET(b);
	}
	return b;
}


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring **kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i]->nkr_ft) {
			nm_os_free(kring[i]->nkr_ft);
			kring[i]->nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}
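
/*
 * Layout of the per-ring scratch area allocated below (one block per
 * tx kring): an array of NM_BDG_BATCH_MAX nm_bdg_fwd entries (the
 * forwarding table proper), followed by one nm_bdg_q queue head per
 * destination (NM_BDG_MAXPORTS * NM_BDG_MAXRINGS rings plus one for
 * broadcast), followed by NM_BDG_BATCH_MAX uint16_t destination
 * indexes. nm_bdg_flush() recovers the same pointers from ft.
 */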

/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring **kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = netmap_real_rings(na, NR_TX);
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = nm_os_malloc(l);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i]->nkr_ft = ft;
	}
	return 0;
}

static int
netmap_bdg_free(struct nm_bridge *b)
{
	if ((b->bdg_flags & NM_BDG_ACTIVE) + b->bdg_active_ports != 0) {
		return EBUSY;
	}

	ND("marking bridge %s as free", b->bdg_basename);
	nm_os_free(b->ht);
	b->bdg_ops = NULL;
	b->bdg_flags = 0;
	NM_BNS_PUT(b);
	return 0;
}


/* remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed)
 */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint32_t *tmp = b->tmp_bdg_port_index;

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	*/

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
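	/*
	 * Worked example (illustrative): with bdg_port_index = [2 0 3 1 4]
	 * and lim = 5, detaching hw = 3 swaps it with the last active entry,
	 * giving [2 0 4 1 | 3] and lim = 4; port 3 is now in the inactive
	 * tail of the array and can be reused by the next attach.
	 */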
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(b->tmp_bdg_port_index, b->bdg_port_index, sizeof(b->tmp_bdg_port_index));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	if (b->bdg_ops->dtor)
		b->bdg_ops->dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, b->tmp_bdg_port_index, sizeof(b->tmp_bdg_port_index));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	netmap_bdg_free(b);
}

static inline void *
nm_bdg_get_auth_token(struct nm_bridge *b)
{
	return b->ht;
}

/* bridge not in exclusive mode ==> always valid
 * bridge in exclusive mode (created through netmap_bdg_create()) ==> check
 * authentication token
 */
static inline int
nm_bdg_valid_auth_token(struct nm_bridge *b, void *auth_token)
{
	return !(b->bdg_flags & NM_BDG_EXCLUSIVE) || b->ht == auth_token;
}

/* Allows external modules to create bridges in exclusive mode,
 * returns an authentication token that the external module will need
 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
 * and nm_bdg_update_private_data() operations.
 * Successfully executed if ret != NULL and *return_status == 0.
 */
void *
netmap_bdg_create(const char *bdg_name, int *return_status)
{
	struct nm_bridge *b = NULL;
	void *ret = NULL;

	NMG_LOCK();
	b = nm_find_bridge(bdg_name, 0 /* don't create */);
	if (b) {
		*return_status = EEXIST;
		goto unlock_bdg_create;
	}

	b = nm_find_bridge(bdg_name, 1 /* create */);
	if (!b) {
		*return_status = ENOMEM;
		goto unlock_bdg_create;
	}

	b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
	ret = nm_bdg_get_auth_token(b);
	*return_status = 0;

unlock_bdg_create:
	NMG_UNLOCK();
	return ret;
}
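
/*
 * Typical usage by an external module (sketch, error handling omitted;
 * my_ops and my_priv are module-supplied):
 *
 *	int err;
 *	void *auth = netmap_bdg_create("valeX:", &err);
 *	...
 *	netmap_bdg_regops("valeX:", &my_ops, my_priv, auth);
 *	...
 *	netmap_bdg_regops("valeX:", NULL, NULL, auth);	// restore defaults
 *	netmap_bdg_destroy("valeX:", auth);
 */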

/* Allows external modules to destroy a bridge created through
 * netmap_bdg_create(); the bridge must be empty.
 */
int
netmap_bdg_destroy(const char *bdg_name, void *auth_token)
{
	struct nm_bridge *b = NULL;
	int ret = 0;

	NMG_LOCK();
	b = nm_find_bridge(bdg_name, 0 /* don't create */);
	if (!b) {
		ret = ENXIO;
		goto unlock_bdg_free;
	}

	if (!nm_bdg_valid_auth_token(b, auth_token)) {
		ret = EACCES;
		goto unlock_bdg_free;
	}
	if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
		ret = EINVAL;
		goto unlock_bdg_free;
	}

	b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
	ret = netmap_bdg_free(b);
	if (ret) {
		b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
	}

unlock_bdg_free:
	NMG_UNLOCK();
	return ret;
}



/* nm_bdg_ctl callback for VALE ports */
static int
netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
		return 0; /* nothing to do */
	}
	if (b) {
		netmap_set_all_rings(na, 0 /* disable */);
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
		vpna->na_bdg = NULL;
		netmap_set_all_rings(na, 1 /* enable */);
	}
	/* we took the reference just for the attach */
	netmap_adapter_put(na);
	return 0;
}

/* nm_dtor callback for ephemeral VALE ports */
static void
netmap_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;

	ND("%s has %d references", na->name, na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	if (na->ifp != NULL && !nm_iszombie(na)) {
		WNA(na->ifp) = NULL;
		if (vpna->autodelete) {
			ND("releasing %s", na->ifp->if_xname);
			NMG_UNLOCK();
			nm_os_vi_detach(na->ifp);
			NMG_LOCK();
		}
	}
}

/* creates a persistent VALE port */
int
nm_vi_create(struct nmreq_header *hdr)
{
	struct nmreq_vale_newif *req =
		(struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
	int error = 0;
	/* Build a nmreq_register out of the nmreq_vale_newif,
	 * so that we can call netmap_get_bdg_na(). */
	struct nmreq_register regreq;
	bzero(&regreq, sizeof(regreq));
	regreq.nr_tx_slots = req->nr_tx_slots;
	regreq.nr_rx_slots = req->nr_rx_slots;
	regreq.nr_tx_rings = req->nr_tx_rings;
	regreq.nr_rx_rings = req->nr_rx_rings;
	regreq.nr_mem_id = req->nr_mem_id;
	hdr->nr_reqtype = NETMAP_REQ_REGISTER;
	hdr->nr_body = (uintptr_t)&regreq;
	error = netmap_vi_create(hdr, 0 /* no autodelete */);
	hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
	hdr->nr_body = (uintptr_t)req;
	/* Write back to the original struct.
	 */
	req->nr_tx_slots = regreq.nr_tx_slots;
	req->nr_rx_slots = regreq.nr_rx_slots;
	req->nr_tx_rings = regreq.nr_tx_rings;
	req->nr_rx_rings = regreq.nr_rx_rings;
	req->nr_mem_id = regreq.nr_mem_id;
	return error;
}

/* remove a persistent VALE port from the system */
int
nm_vi_destroy(const char *name)
{
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	int error;

	ifp = ifunit_ref(name);
	if (!ifp)
		return ENXIO;
	NMG_LOCK();
	/* make sure this is actually a VALE port */
	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
		error = EINVAL;
		goto err;
	}

	vpna = (struct netmap_vp_adapter *)NA(ifp);

	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
	if (vpna->autodelete) {
		error = EINVAL;
		goto err;
	}

	/* also make sure that nobody is using the interface */
	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
		error = EBUSY;
		goto err;
	}

	NMG_UNLOCK();

	D("destroying a persistent vale interface %s", ifp->if_xname);
	/* Linux requires all the references are released
	 * before unregister
	 */
	netmap_detach(ifp);
	if_rele(ifp);
	nm_os_vi_detach(ifp);
	return 0;

err:
	NMG_UNLOCK();
	if_rele(ifp);
	return error;
}

static int
nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
{
	req->nr_rx_rings = na->num_rx_rings;
	req->nr_tx_rings = na->num_tx_rings;
	req->nr_rx_slots = na->num_rx_desc;
	req->nr_tx_slots = na->num_tx_desc;
	return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
					&req->nr_mem_id);
}
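
/*
 * Persistent VALE ports are normally created and destroyed from
 * userspace (e.g. with the vale-ctl utility distributed with netmap,
 * roughly "vale-ctl -n foo" / "vale-ctl -r foo"), which ends up in
 * nm_vi_create()/nm_vi_destroy() above via the NETMAP_REQ_VALE_NEWIF
 * request and its delete counterpart.
 */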

/*
 * Create a virtual interface registered to the system.
 * The interface will be attached to a bridge later.
 */
int
netmap_vi_create(struct nmreq_header *hdr, int autodelete)
{
	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
	struct ifnet *ifp;
	struct netmap_vp_adapter *vpna;
	struct netmap_mem_d *nmd = NULL;
	int error;

	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
		return EINVAL;
	}

	/* don't include VALE prefix */
	if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
		return EINVAL;
	if (strlen(hdr->nr_name) >= IFNAMSIZ) {
		return EINVAL;
	}
	ifp = ifunit_ref(hdr->nr_name);
	if (ifp) { /* already exists, cannot create a new one */
		error = EEXIST;
		NMG_LOCK();
		if (NM_NA_VALID(ifp)) {
			int update_err = nm_update_info(req, NA(ifp));
			if (update_err)
				error = update_err;
		}
		NMG_UNLOCK();
		if_rele(ifp);
		return error;
	}
	error = nm_os_vi_persist(hdr->nr_name, &ifp);
	if (error)
		return error;

	NMG_LOCK();
	if (req->nr_mem_id) {
		nmd = netmap_mem_find(req->nr_mem_id);
		if (nmd == NULL) {
			error = EINVAL;
			goto err_1;
		}
	}
	/* netmap_vp_create creates a struct netmap_vp_adapter */
	error = netmap_vp_create(hdr, ifp, nmd, &vpna);
	if (error) {
		D("error %d", error);
		goto err_1;
	}
	/* persist-specific routines */
	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
	if (!autodelete) {
		netmap_adapter_get(&vpna->up);
	} else {
		vpna->autodelete = 1;
	}
	NM_ATTACH_NA(ifp, &vpna->up);
	/* return the updated info */
	error = nm_update_info(req, &vpna->up);
	if (error) {
		goto err_2;
	}
	ND("returning nr_mem_id %d", req->nr_mem_id);
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	ND("created %s", ifp->if_xname);
	return 0;

err_2:
	netmap_detach(ifp);
err_1:
	if (nmd)
		netmap_mem_put(nmd);
	NMG_UNLOCK();
	nm_os_vi_detach(ifp);

	return error;
}

/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na,
		struct netmap_mem_d *nmd, int create)
{
	char *nr_name = hdr->nr_name;
	const char *ifname;
	struct ifnet *ifp = NULL;
	int error = 0;
	struct netmap_vp_adapter *vpna, *hostna = NULL;
	struct nm_bridge *b;
	uint32_t i, j;
	uint32_t cand = NM_BDG_NOPORT, cand2 = NM_BDG_NOPORT;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(nr_name, create);
	if (b == NULL) {
		ND("no bridges available for '%s'", nr_name);
		return (create ? ENOMEM : ENXIO);
	}
	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
		panic("x");

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list.
	 * It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		ND("checking %s", vpna->up.name);
		if (!strcmp(vpna->up.name, nr_name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", nr_name);
			*na = &vpna->up;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifname = nr_name + b->bdg_namelen + 1;
	ifp = ifunit_ref(ifname);
	if (!ifp) {
		/* Create an ephemeral virtual port.
		 * This block contains all the ephemeral-specific logic.
		 */

		if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
			error = EINVAL;
			goto out;
		}

		/* bdg_netmap_attach creates a struct netmap_adapter */
		error = netmap_vp_create(hdr, NULL, nmd, &vpna);
		if (error) {
			D("error %d", error);
			goto out;
		}
		/* shortcut - we can skip get_hw_na(),
		 * ownership check and nm_bdg_attach()
		 */

	} else {
		struct netmap_adapter *hw;

		/* the vale:nic syntax is only valid for some commands */
		switch (hdr->nr_reqtype) {
		case NETMAP_REQ_VALE_ATTACH:
		case NETMAP_REQ_VALE_DETACH:
		case NETMAP_REQ_VALE_POLLING_ENABLE:
		case NETMAP_REQ_VALE_POLLING_DISABLE:
			break; /* ok */
		default:
			error = EINVAL;
			goto out;
		}

		error = netmap_get_hw_na(ifp, nmd, &hw);
		if (error || hw == NULL)
			goto out;

		/* host adapter might not be created */
		error = hw->nm_bdg_attach(nr_name, hw);
		if (error)
			goto out;
		vpna = hw->na_vp;
		hostna = hw->na_hostvp;
		if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
			/* Check if we need to skip the host rings. */
			struct nmreq_vale_attach *areq =
				(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
			if (areq->reg.nr_mode != NR_REG_NIC_SW) {
				hostna = NULL;
			}
		}
	}

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (hostna != NULL) {
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", ifname, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = &vpna->up;
	netmap_adapter_get(*na);

out:
	if (ifp)
		if_rele(ifp);

	return error;
}
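
/*
 * Example of the naming scheme handled above: a request for "vale0:em1"
 * (where em1 is an existing NIC) attaches the NIC to switch vale0 through
 * a bwrap adapter, while "vale0:p0" with no matching interface creates an
 * ephemeral virtual port p0 backed by netmap_vp_create().
 */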

/* Process NETMAP_REQ_VALE_ATTACH.
 */
int
nm_bdg_ctl_attach(struct nmreq_header *hdr, void *auth_token)
{
	struct nmreq_vale_attach *req =
		(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
	struct netmap_vp_adapter * vpna;
	struct netmap_adapter *na;
	struct netmap_mem_d *nmd = NULL;
	struct nm_bridge *b = NULL;
	int error;

	NMG_LOCK();
	/* permission check for modified bridges */
	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */);
	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_exit;
	}

	if (req->reg.nr_mem_id) {
		nmd = netmap_mem_find(req->reg.nr_mem_id);
		if (nmd == NULL) {
			error = EINVAL;
			goto unlock_exit;
		}
	}

	/* check for existing one */
	error = netmap_get_bdg_na(hdr, &na, nmd, 0);
	if (!error) {
		error = EBUSY;
		goto unref_exit;
	}
	error = netmap_get_bdg_na(hdr, &na,
				nmd, 1 /* create if not exists */);
	if (error) { /* no device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (NETMAP_OWNED_BY_ANY(na)) {
		error = EBUSY;
		goto unref_exit;
	}

	if (na->nm_bdg_ctl) {
		/* nop for VALE ports. The bwrap needs to put the hwna
		 * in netmap mode (see netmap_bwrap_bdg_ctl)
		 */
		error = na->nm_bdg_ctl(hdr, na);
		if (error)
			goto unref_exit;
		ND("registered %s to netmap-mode", na->name);
	}
	vpna = (struct netmap_vp_adapter *)na;
	req->port_index = vpna->bdg_port;
	NMG_UNLOCK();
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}

static inline int
nm_is_bwrap(struct netmap_adapter *na)
{
	return na->nm_register == netmap_bwrap_reg;
}

/* Process NETMAP_REQ_VALE_DETACH.
 */
int
nm_bdg_ctl_detach(struct nmreq_header *hdr, void *auth_token)
{
	struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	struct nm_bridge *b = NULL;
	int error;

	NMG_LOCK();
	/* permission check for modified bridges */
	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */);
	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_exit;
	}

	error = netmap_get_bdg_na(hdr, &na, NULL, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	} else if (nm_is_bwrap(na) &&
		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
		/* Don't detach a NIC with polling */
		error = EBUSY;
		goto unref_exit;
	}

	vpna = (struct netmap_vp_adapter *)na;
	if (na->na_vp != vpna) {
		/* trying to detach first attach of VALE persistent port attached
		 * to 2 bridges
		 */
		error = EBUSY;
		goto unref_exit;
	}
	nmreq_det->port_index = vpna->bdg_port;

	if (na->nm_bdg_ctl) {
		/* remove the port from bridge.
		 * The bwrap also needs to put the hwna in normal mode
		 */
		error = na->nm_bdg_ctl(hdr, na);
	}

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;

}

struct nm_bdg_polling_state;
struct nm_bdg_kthread {
	struct nm_kctx *nmk;
	u_int qfirst;
	u_int qlast;
	struct nm_bdg_polling_state *bps;
};

struct nm_bdg_polling_state {
	bool configured;
	bool stopped;
	struct netmap_bwrap_adapter *bna;
	uint32_t mode;
	u_int qfirst;
	u_int qlast;
	u_int cpu_from;
	u_int ncpus;
	struct nm_bdg_kthread *kthreads;
};

static void
netmap_bwrap_polling(void *data, int is_kthread)
{
	struct nm_bdg_kthread *nbk = data;
	struct netmap_bwrap_adapter *bna;
	u_int qfirst, qlast, i;
	struct netmap_kring **kring0, *kring;

	if (!nbk)
		return;
	qfirst = nbk->qfirst;
	qlast = nbk->qlast;
	bna = nbk->bps->bna;
	kring0 = NMR(bna->hwna, NR_RX);

	for (i = qfirst; i < qlast; i++) {
		kring = kring0[i];
		kring->nm_notify(kring, 0);
	}
}

static int
nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
{
	struct nm_kctx_cfg kcfg;
	int i, j;

	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
	if (bps->kthreads == NULL)
		return ENOMEM;

	bzero(&kcfg, sizeof(kcfg));
	kcfg.worker_fn = netmap_bwrap_polling;
	kcfg.use_kthread = 1;
	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		int all = (bps->ncpus == 1 &&
			bps->mode == NETMAP_POLLING_MODE_SINGLE_CPU);
		int affinity = bps->cpu_from + i;

		t->bps = bps;
		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
		t->qlast = all ?
			bps->qlast : t->qfirst + 1;
		D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
			t->qlast);

		kcfg.type = i;
		kcfg.worker_private = t;
		t->nmk = nm_os_kctx_create(&kcfg, NULL);
		if (t->nmk == NULL) {
			goto cleanup;
		}
		nm_os_kctx_worker_setaff(t->nmk, affinity);
	}
	return 0;

cleanup:
	for (j = 0; j < i; j++) {
		struct nm_bdg_kthread *t = bps->kthreads + j;
		nm_os_kctx_destroy(t->nmk);
	}
	nm_os_free(bps->kthreads);
	return EFAULT;
}

/* A variant of ptnetmap_start_kthreads() */
static int
nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
{
	int error, i, j;

	if (!bps) {
		D("polling is not configured");
		return EFAULT;
	}
	bps->stopped = false;

	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		error = nm_os_kctx_worker_start(t->nmk);
		if (error) {
			D("error in nm_kthread_start()");
			goto cleanup;
		}
	}
	return 0;

cleanup:
	for (j = 0; j < i; j++) {
		struct nm_bdg_kthread *t = bps->kthreads + j;
		nm_os_kctx_worker_stop(t->nmk);
	}
	bps->stopped = true;
	return error;
}

static void
nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
{
	int i;

	if (!bps)
		return;

	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		nm_os_kctx_worker_stop(t->nmk);
		nm_os_kctx_destroy(t->nmk);
	}
	bps->stopped = true;
}

static int
get_polling_cfg(struct nmreq_vale_polling *req, struct netmap_adapter *na,
		struct nm_bdg_polling_state *bps)
{
	unsigned int avail_cpus, core_from;
	unsigned int qfirst, qlast;
	uint32_t i = req->nr_first_cpu_id;
	uint32_t req_cpus = req->nr_num_polling_cpus;

	avail_cpus = nm_os_ncpus();

	if (req_cpus == 0) {
		D("req_cpus must be > 0");
		return EINVAL;
	} else if (req_cpus >= avail_cpus) {
		D("Cannot use all the CPUs in the system");
		return EINVAL;
	}

	if (req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU) {
		/* Use a separate core for each ring. If nr_num_polling_cpus>1
		 * more consecutive rings are polled.
		 * For example, if nr_first_cpu_id=2 and nr_num_polling_cpus=2,
		 * ring 2 and 3 are polled by core 2 and 3, respectively. */
		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
			D("Rings %u-%u not in range (have %d rings)",
				i, i + req_cpus, nma_get_nrings(na, NR_RX));
			return EINVAL;
		}
		qfirst = i;
		qlast = qfirst + req_cpus;
		core_from = qfirst;

	} else if (req->nr_mode == NETMAP_POLLING_MODE_SINGLE_CPU) {
		/* Poll all the rings using a core specified by nr_first_cpu_id.
		 * the number of cores must be 1. */
		if (req_cpus != 1) {
			D("ncpus must be 1 for NETMAP_POLLING_MODE_SINGLE_CPU "
				"(was %d)", req_cpus);
			return EINVAL;
		}
		qfirst = 0;
		qlast = nma_get_nrings(na, NR_RX);
		core_from = i;
	} else {
		D("Invalid polling mode");
		return EINVAL;
	}

	bps->mode = req->nr_mode;
	bps->qfirst = qfirst;
	bps->qlast = qlast;
	bps->cpu_from = core_from;
	bps->ncpus = req_cpus;
	D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
		req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU ?
1340 "MULTI" : "SINGLE", 1341 qfirst, qlast, core_from, req_cpus); 1342 return 0; 1343 } 1344 1345 static int 1346 nm_bdg_ctl_polling_start(struct nmreq_vale_polling *req, struct netmap_adapter *na) 1347 { 1348 struct nm_bdg_polling_state *bps; 1349 struct netmap_bwrap_adapter *bna; 1350 int error; 1351 1352 bna = (struct netmap_bwrap_adapter *)na; 1353 if (bna->na_polling_state) { 1354 D("ERROR adapter already in polling mode"); 1355 return EFAULT; 1356 } 1357 1358 bps = nm_os_malloc(sizeof(*bps)); 1359 if (!bps) 1360 return ENOMEM; 1361 bps->configured = false; 1362 bps->stopped = true; 1363 1364 if (get_polling_cfg(req, na, bps)) { 1365 nm_os_free(bps); 1366 return EINVAL; 1367 } 1368 1369 if (nm_bdg_create_kthreads(bps)) { 1370 nm_os_free(bps); 1371 return EFAULT; 1372 } 1373 1374 bps->configured = true; 1375 bna->na_polling_state = bps; 1376 bps->bna = bna; 1377 1378 /* disable interrupts if possible */ 1379 nma_intr_enable(bna->hwna, 0); 1380 /* start kthread now */ 1381 error = nm_bdg_polling_start_kthreads(bps); 1382 if (error) { 1383 D("ERROR nm_bdg_polling_start_kthread()"); 1384 nm_os_free(bps->kthreads); 1385 nm_os_free(bps); 1386 bna->na_polling_state = NULL; 1387 nma_intr_enable(bna->hwna, 1); 1388 } 1389 return error; 1390 } 1391 1392 static int 1393 nm_bdg_ctl_polling_stop(struct netmap_adapter *na) 1394 { 1395 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na; 1396 struct nm_bdg_polling_state *bps; 1397 1398 if (!bna->na_polling_state) { 1399 D("ERROR adapter is not in polling mode"); 1400 return EFAULT; 1401 } 1402 bps = bna->na_polling_state; 1403 nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state); 1404 bps->configured = false; 1405 nm_os_free(bps); 1406 bna->na_polling_state = NULL; 1407 /* reenable interrupts */ 1408 nma_intr_enable(bna->hwna, 1); 1409 return 0; 1410 } 1411 1412 int 1413 nm_bdg_polling(struct nmreq_header *hdr) 1414 { 1415 struct nmreq_vale_polling *req = 1416 (struct nmreq_vale_polling *)(uintptr_t)hdr->nr_body; 1417 struct netmap_adapter *na = NULL; 1418 int error = 0; 1419 1420 NMG_LOCK(); 1421 error = netmap_get_bdg_na(hdr, &na, NULL, /*create=*/0); 1422 if (na && !error) { 1423 if (!nm_is_bwrap(na)) { 1424 error = EOPNOTSUPP; 1425 } else if (hdr->nr_reqtype == NETMAP_BDG_POLLING_ON) { 1426 error = nm_bdg_ctl_polling_start(req, na); 1427 if (!error) 1428 netmap_adapter_get(na); 1429 } else { 1430 error = nm_bdg_ctl_polling_stop(na); 1431 if (!error) 1432 netmap_adapter_put(na); 1433 } 1434 netmap_adapter_put(na); 1435 } else if (!na && !error) { 1436 /* Not VALE port. */ 1437 error = EINVAL; 1438 } 1439 NMG_UNLOCK(); 1440 1441 return error; 1442 } 1443 1444 /* Process NETMAP_REQ_VALE_LIST. 
 */
int
netmap_bdg_list(struct nmreq_header *hdr)
{
	struct nmreq_vale_list *req =
		(struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
	int namelen = strlen(hdr->nr_name);
	struct nm_bridge *b, *bridges;
	struct netmap_vp_adapter *vpna;
	int error = 0, i, j;
	u_int num_bridges;

	netmap_bns_getbridges(&bridges, &num_bridges);

	/* this is used to enumerate bridges and ports */
	if (namelen) { /* look up indexes of bridge and port */
		if (strncmp(hdr->nr_name, NM_BDG_NAME,
			strlen(NM_BDG_NAME))) {
			return EINVAL;
		}
		NMG_LOCK();
		b = nm_find_bridge(hdr->nr_name, 0 /* don't create */);
		if (!b) {
			NMG_UNLOCK();
			return ENOENT;
		}

		req->nr_bridge_idx = b - bridges; /* bridge index */
		req->nr_port_idx = NM_BDG_NOPORT;
		for (j = 0; j < b->bdg_active_ports; j++) {
			i = b->bdg_port_index[j];
			vpna = b->bdg_ports[i];
			if (vpna == NULL) {
				D("This should not happen");
				continue;
			}
			/* the former and the latter identify a
			 * virtual port and a NIC, respectively
			 */
			if (!strcmp(vpna->up.name, hdr->nr_name)) {
				req->nr_port_idx = i; /* port index */
				break;
			}
		}
		NMG_UNLOCK();
	} else {
		/* return the first non-empty entry starting from
		 * bridge nr_arg1 and port nr_arg2.
		 *
		 * Users can detect the end of the same bridge by
		 * seeing the new and old value of nr_arg1, and can
		 * detect the end of all the bridges by error != 0
		 */
		i = req->nr_bridge_idx;
		j = req->nr_port_idx;

		NMG_LOCK();
		for (error = ENOENT; i < NM_BRIDGES; i++) {
			b = bridges + i;
			for ( ; j < NM_BDG_MAXPORTS; j++) {
				if (b->bdg_ports[j] == NULL)
					continue;
				vpna = b->bdg_ports[j];
				/* write back the VALE switch name */
				strncpy(hdr->nr_name, vpna->up.name,
					(size_t)IFNAMSIZ);
				error = 0;
				goto out;
			}
			j = 0; /* following bridges scan from 0 */
		}
	out:
		req->nr_bridge_idx = i;
		req->nr_port_idx = j;
		NMG_UNLOCK();
	}

	return error;
}

/* Called by external kernel modules (e.g., Openvswitch)
 * to set configure/lookup/dtor functions of a VALE instance.
 * Register callbacks to the given bridge. 'name' may be just
 * bridge's name (including ':' if it is not just NM_BDG_NAME).
 *
 * Called without NMG_LOCK.
 */

int
netmap_bdg_regops(const char *name, struct netmap_bdg_ops *bdg_ops, void *private_data, void *auth_token)
{
	struct nm_bridge *b;
	int error = 0;

	NMG_LOCK();
	b = nm_find_bridge(name, 0 /* don't create */);
	if (!b) {
		error = ENXIO;
		goto unlock_regops;
	}
	if (!nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_regops;
	}

	BDG_WLOCK(b);
	if (!bdg_ops) {
		/* resetting the bridge */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
		b->bdg_ops = &default_bdg_ops;
		b->private_data = b->ht;
	} else {
		/* modifying the bridge */
		b->private_data = private_data;
		b->bdg_ops = bdg_ops;
	}
	BDG_WUNLOCK(b);

unlock_regops:
	NMG_UNLOCK();
	return error;
}
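
/*
 * Note that both netmap_bdg_regops() above and nm_bdg_update_private_data()
 * below swap the lookup callback / private data under BDG_WLOCK(), so they
 * serialize against forwarding, which runs with the bridge lock held in
 * shared mode (see nm_bdg_preflush() below).
 */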

/* Called by external kernel modules (e.g., Openvswitch)
 * to modify the private data previously given to regops().
 * 'name' may be just bridge's name (including ':' if it
 * is not just NM_BDG_NAME).
 * Called without NMG_LOCK.
 */
int
nm_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
	void *callback_data, void *auth_token)
{
	void *private_data = NULL;
	struct nm_bridge *b;
	int error = 0;

	NMG_LOCK();
	b = nm_find_bridge(name, 0 /* don't create */);
	if (!b) {
		error = EINVAL;
		goto unlock_update_priv;
	}
	if (!nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_update_priv;
	}
	BDG_WLOCK(b);
	private_data = callback(b->private_data, callback_data, &error);
	b->private_data = private_data;
	BDG_WUNLOCK(b);

unlock_update_priv:
	NMG_UNLOCK();
	return error;
}

int
netmap_bdg_config(struct nm_ifreq *nr)
{
	struct nm_bridge *b;
	int error = EINVAL;

	NMG_LOCK();
	b = nm_find_bridge(nr->nifr_name, 0);
	if (!b) {
		NMG_UNLOCK();
		return error;
	}
	NMG_UNLOCK();
	/* Don't call config() with NMG_LOCK() held */
	BDG_RLOCK(b);
	if (b->bdg_ops->config != NULL)
		error = b->bdg_ops->config(nr);
	BDG_RUNLOCK(b);
	return error;
}


/* nm_krings_create callback for VALE ports.
 * Calls the standard netmap_krings_create, then adds leases on rx
 * rings and bdgfwd on tx rings.
 */
static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int tailroom;
	int error, i;
	uint32_t *leases;
	u_int nrx = netmap_real_rings(na, NR_RX);

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i]->nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


/* nm_krings_delete callback for VALE ports. */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * main dispatch routine for the bridge.
 * Grab packets from a kring, move them into the ft structure
 * associated to the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_kring *kring, u_int end)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter*)kring->na;
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int ring_nr = kring->ring_id;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ?
		lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return j;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;
		ft[ft_i].ft_offset = 0;

		ND("flags is 0x%x", slot->flags);
		/* we do not use the buf changed flag, but we still need to reset it */
		slot->flags &= ~NS_BUF_CHANGED;

		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
		if (unlikely(buf == NULL)) {
			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
				kring->name, j, ft[ft_i].ft_len);
			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
			ft[ft_i].ft_len = 0;
			ft[ft_i].ft_flags = 0;
		}
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
		 * have to fix frags count. */
		frags--;
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags;
		D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)						\
do {								\
	a -= b; a -= c; a ^= (c >> 13);				\
	b -= c; b -= a; b ^= (a << 8);				\
	c -= a; c -= b; c ^= (b >> 13);				\
	a -= b; a -= c; a ^= (c >> 12);				\
	b -= c; b -= a; b ^= (a << 16);				\
	c -= a; c -= b; c ^= (b >> 5);				\
	a -= b; a -= c; a ^= (c >> 3);				\
	b -= c; b -= a; b ^= (a << 10);				\
	c -= a; c -= b; c ^= (b >> 15);				\
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	b += addr[5] << 8;
	b += addr[4];
	a += addr[3] << 24;
	a += addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
	return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


/* nm_register callback for VALE ports */
static int
netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;
	enum txrx t;
	int i;

	/* persistent ports may be put in netmap mode
	 * before being attached to a bridge
	 */
	if (vpna->na_bdg)
		BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		for_rx_tx(t) {
			for (i = 0; i < netmap_real_rings(na, t); i++) {
				struct netmap_kring *kring = NMR(na, t)[i];

				if (nm_kring_pending_on(kring))
					kring->nr_mode = NKR_NETMAP_ON;
			}
		}
		if (na->active_fds == 0)
			na->na_flags |= NAF_NETMAP_ON;
		/* XXX on FreeBSD, persistent VALE ports should also
		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
		 */
	} else {
		if (na->active_fds == 0)
			na->na_flags &= ~NAF_NETMAP_ON;
		for_rx_tx(t) {
			for (i = 0; i < netmap_real_rings(na, t); i++) {
				struct netmap_kring *kring = NMR(na, t)[i];

				if (nm_kring_pending_off(kring))
					kring->nr_mode = NKR_NETMAP_OFF;
			}
		}
	}
	if (vpna->na_bdg)
		BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address,
 * and then return the destination port index, and the
 * ring in *dst_ring (at the moment, always use ring 0)
 */
uint32_t
netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		struct netmap_vp_adapter *na, void *private_data)
{
	uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
	u_int buf_len = ft->ft_len - ft->ft_offset;
	struct nm_hash_ent *ht = private_data;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;
	uint8_t indbuf[12];

	if (buf_len < 14) {
		return NM_BDG_NOPORT;
	}

	if (ft->ft_flags & NS_INDIRECT) {
		if (copyin(buf, indbuf, sizeof(indbuf))) {
			return NM_BDG_NOPORT;
		}
		buf = indbuf;
	}

	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); /* hash of source */
		/* update source port forwarding entry */
		na->last_smac = ht[sh].mac = smac; /* XXX expire ?
						    */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
			D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
				s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); /* hash of dst */
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
	}
	return dst;
}


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}




/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}

/*
 *
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
	u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
1995 */ 1996 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); 1997 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 1998 1999 /* first pass: find a destination for each packet in the batch */ 2000 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 2001 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 2002 uint16_t dst_port, d_i; 2003 struct nm_bdg_q *d; 2004 struct nm_bdg_fwd *start_ft = NULL; 2005 2006 ND("slot %d frags %d", i, ft[i].ft_frags); 2007 2008 if (na->up.virt_hdr_len < ft[i].ft_len) { 2009 ft[i].ft_offset = na->up.virt_hdr_len; 2010 start_ft = &ft[i]; 2011 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) { 2012 ft[i].ft_offset = ft[i].ft_len; 2013 start_ft = &ft[i+1]; 2014 } else { 2015 /* Drop the packet if the virtio-net header is not into the first 2016 * fragment nor at the very beginning of the second. 2017 */ 2018 continue; 2019 } 2020 dst_port = b->bdg_ops->lookup(start_ft, &dst_ring, na, b->private_data); 2021 if (netmap_verbose > 255) 2022 RD(5, "slot %d port %d -> %d", i, me, dst_port); 2023 if (dst_port >= NM_BDG_NOPORT) 2024 continue; /* this packet is identified to be dropped */ 2025 else if (dst_port == NM_BDG_BROADCAST) 2026 dst_ring = 0; /* broadcasts always go to ring 0 */ 2027 else if (unlikely(dst_port == me || 2028 !b->bdg_ports[dst_port])) 2029 continue; 2030 2031 /* get a position in the scratch pad */ 2032 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 2033 d = dst_ents + d_i; 2034 2035 /* append the first fragment to the list */ 2036 if (d->bq_head == NM_FT_NULL) { /* new destination */ 2037 d->bq_head = d->bq_tail = i; 2038 /* remember this position to be scanned later */ 2039 if (dst_port != NM_BDG_BROADCAST) 2040 dsts[num_dsts++] = d_i; 2041 } else { 2042 ft[d->bq_tail].ft_next = i; 2043 d->bq_tail = i; 2044 } 2045 d->bq_len += ft[i].ft_frags; 2046 } 2047 2048 /* 2049 * Broadcast traffic goes to ring 0 on all destinations. 2050 * So we need to add these rings to the list of ports to scan. 2051 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is 2052 * expensive. We should keep a compact list of active destinations 2053 * so we could shorten this loop. 
2054 	 */
2055 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
2056 	if (brddst->bq_head != NM_FT_NULL) {
2057 		u_int j;
2058 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
2059 			uint16_t d_i;
2060 			i = b->bdg_port_index[j];
2061 			if (unlikely(i == me))
2062 				continue;
2063 			d_i = i * NM_BDG_MAXRINGS;
2064 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
2065 				dsts[num_dsts++] = d_i;
2066 		}
2067 	}
2068 
2069 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
2070 	/* second pass: scan destinations */
2071 	for (i = 0; i < num_dsts; i++) {
2072 		struct netmap_vp_adapter *dst_na;
2073 		struct netmap_kring *kring;
2074 		struct netmap_ring *ring;
2075 		u_int dst_nr, lim, j, d_i, next, brd_next;
2076 		u_int needed, howmany;
2077 		int retry = netmap_txsync_retry;
2078 		struct nm_bdg_q *d;
2079 		uint32_t my_start = 0, lease_idx = 0;
2080 		int nrings;
2081 		int virt_hdr_mismatch = 0;
2082 
2083 		d_i = dsts[i];
2084 		ND("second pass %d port %d", i, d_i);
2085 		d = dst_ents + d_i;
2086 		// XXX fix the division
2087 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
2088 		/* protect from the lookup function returning an inactive
2089 		 * destination port
2090 		 */
2091 		if (unlikely(dst_na == NULL))
2092 			goto cleanup;
2093 		if (dst_na->up.na_flags & NAF_SW_ONLY)
2094 			goto cleanup;
2095 		/*
2096 		 * The interface may be in !netmap mode in two cases:
2097 		 * - when na is attached but not activated yet;
2098 		 * - when na is being deactivated but is still attached.
2099 		 */
2100 		if (unlikely(!nm_netmap_on(&dst_na->up))) {
2101 			ND("not in netmap mode!");
2102 			goto cleanup;
2103 		}
2104 
2105 		/* there is at least one packet, either unicast or broadcast */
2106 		brd_next = brddst->bq_head;
2107 		next = d->bq_head;
2108 		/* We need to reserve this many slots. If fewer are
2109 		 * available, some packets will be dropped.
2110 		 * Packets may have multiple fragments, so there is a chance
2111 		 * that we may not use all of the slots we have claimed,
2112 		 * and we will need to handle the leftover ones when we
2113 		 * regain the lock.
2114 		 */
2115 		needed = d->bq_len + brddst->bq_len;
2116 
2117 		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
2118 			if (netmap_verbose) {
2119 				RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
2120 						dst_na->up.virt_hdr_len);
2121 			}
2122 			/* There is a virtio-net header/offloadings mismatch between
2123 			 * source and destination. The slower mismatch datapath will
2124 			 * be used to cope with all the mismatches.
2125 			 */
2126 			virt_hdr_mismatch = 1;
2127 			if (dst_na->mfs < na->mfs) {
2128 				/* We may need to do segmentation offloadings, and so
2129 				 * we may need a number of destination slots greater
2130 				 * than the number of input slots ('needed').
2131 				 * We look for the smallest integer 'x' which satisfies:
2132 				 *	needed * na->mfs + x * H <= x * dst_na->mfs
2133 				 * where 'H' is the length of the longest header that may
2134 				 * be replicated in the segmentation process (e.g. for
2135 				 * TCPv4 we must account for ethernet header, IP header
2136 				 * and TCPv4 header).
2137 				 */
2138 				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
2139 				needed = (needed * na->mfs) /
2140 						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
2141 				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
2142 			}
2143 		}
2144 
2145 		ND(5, "pass 2 dst %d is %x %s",
2146 				i, d_i, is_vp ?
"virtual" : "nic/host"); 2147 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 2148 nrings = dst_na->up.num_rx_rings; 2149 if (dst_nr >= nrings) 2150 dst_nr = dst_nr % nrings; 2151 kring = dst_na->up.rx_rings[dst_nr]; 2152 ring = kring->ring; 2153 /* the destination ring may have not been opened for RX */ 2154 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON)) 2155 goto cleanup; 2156 lim = kring->nkr_num_slots - 1; 2157 2158 retry: 2159 2160 if (dst_na->retry && retry) { 2161 /* try to get some free slot from the previous run */ 2162 kring->nm_notify(kring, 0); 2163 /* actually useful only for bwraps, since there 2164 * the notify will trigger a txsync on the hwna. VALE ports 2165 * have dst_na->retry == 0 2166 */ 2167 } 2168 /* reserve the buffers in the queue and an entry 2169 * to report completion, and drop lock. 2170 * XXX this might become a helper function. 2171 */ 2172 mtx_lock(&kring->q_lock); 2173 if (kring->nkr_stopped) { 2174 mtx_unlock(&kring->q_lock); 2175 goto cleanup; 2176 } 2177 my_start = j = kring->nkr_hwlease; 2178 howmany = nm_kr_space(kring, 1); 2179 if (needed < howmany) 2180 howmany = needed; 2181 lease_idx = nm_kr_lease(kring, howmany, 1); 2182 mtx_unlock(&kring->q_lock); 2183 2184 /* only retry if we need more than available slots */ 2185 if (retry && needed <= howmany) 2186 retry = 0; 2187 2188 /* copy to the destination queue */ 2189 while (howmany > 0) { 2190 struct netmap_slot *slot; 2191 struct nm_bdg_fwd *ft_p, *ft_end; 2192 u_int cnt; 2193 2194 /* find the queue from which we pick next packet. 2195 * NM_FT_NULL is always higher than valid indexes 2196 * so we never dereference it if the other list 2197 * has packets (and if both are empty we never 2198 * get here). 2199 */ 2200 if (next < brd_next) { 2201 ft_p = ft + next; 2202 next = ft_p->ft_next; 2203 } else { /* insert broadcast */ 2204 ft_p = ft + brd_next; 2205 brd_next = ft_p->ft_next; 2206 } 2207 cnt = ft_p->ft_frags; // cnt > 0 2208 if (unlikely(cnt > howmany)) 2209 break; /* no more space */ 2210 if (netmap_verbose && cnt > 1) 2211 RD(5, "rx %d frags to %d", cnt, j); 2212 ft_end = ft_p + cnt; 2213 if (unlikely(virt_hdr_mismatch)) { 2214 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); 2215 } else { 2216 howmany -= cnt; 2217 do { 2218 char *dst, *src = ft_p->ft_buf; 2219 size_t copy_len = ft_p->ft_len, dst_len = copy_len; 2220 2221 slot = &ring->slot[j]; 2222 dst = NMB(&dst_na->up, slot); 2223 2224 ND("send [%d] %d(%d) bytes at %s:%d", 2225 i, (int)copy_len, (int)dst_len, 2226 NM_IFPNAME(dst_ifp), j); 2227 /* round to a multiple of 64 */ 2228 copy_len = (copy_len + 63) & ~63; 2229 2230 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) || 2231 copy_len > NETMAP_BUF_SIZE(&na->up))) { 2232 RD(5, "invalid len %d, down to 64", (int)copy_len); 2233 copy_len = dst_len = 64; // XXX 2234 } 2235 if (ft_p->ft_flags & NS_INDIRECT) { 2236 if (copyin(src, dst, copy_len)) { 2237 // invalid user pointer, pretend len is 0 2238 dst_len = 0; 2239 } 2240 } else { 2241 //memcpy(dst, src, copy_len); 2242 pkt_copy(src, dst, (int)copy_len); 2243 } 2244 slot->len = dst_len; 2245 slot->flags = (cnt << 8)| NS_MOREFRAG; 2246 j = nm_next(j, lim); 2247 needed--; 2248 ft_p++; 2249 } while (ft_p != ft_end); 2250 slot->flags = (cnt << 8); /* clear flag on last entry */ 2251 } 2252 /* are we done ? 
*/ 2253 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 2254 break; 2255 } 2256 { 2257 /* current position */ 2258 uint32_t *p = kring->nkr_leases; /* shorthand */ 2259 uint32_t update_pos; 2260 int still_locked = 1; 2261 2262 mtx_lock(&kring->q_lock); 2263 if (unlikely(howmany > 0)) { 2264 /* not used all bufs. If i am the last one 2265 * i can recover the slots, otherwise must 2266 * fill them with 0 to mark empty packets. 2267 */ 2268 ND("leftover %d bufs", howmany); 2269 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { 2270 /* yes i am the last one */ 2271 ND("roll back nkr_hwlease to %d", j); 2272 kring->nkr_hwlease = j; 2273 } else { 2274 while (howmany-- > 0) { 2275 ring->slot[j].len = 0; 2276 ring->slot[j].flags = 0; 2277 j = nm_next(j, lim); 2278 } 2279 } 2280 } 2281 p[lease_idx] = j; /* report I am done */ 2282 2283 update_pos = kring->nr_hwtail; 2284 2285 if (my_start == update_pos) { 2286 /* all slots before my_start have been reported, 2287 * so scan subsequent leases to see if other ranges 2288 * have been completed, and to a selwakeup or txsync. 2289 */ 2290 while (lease_idx != kring->nkr_lease_idx && 2291 p[lease_idx] != NR_NOSLOT) { 2292 j = p[lease_idx]; 2293 p[lease_idx] = NR_NOSLOT; 2294 lease_idx = nm_next(lease_idx, lim); 2295 } 2296 /* j is the new 'write' position. j != my_start 2297 * means there are new buffers to report 2298 */ 2299 if (likely(j != my_start)) { 2300 kring->nr_hwtail = j; 2301 still_locked = 0; 2302 mtx_unlock(&kring->q_lock); 2303 kring->nm_notify(kring, 0); 2304 /* this is netmap_notify for VALE ports and 2305 * netmap_bwrap_notify for bwrap. The latter will 2306 * trigger a txsync on the underlying hwna 2307 */ 2308 if (dst_na->retry && retry--) { 2309 /* XXX this is going to call nm_notify again. 2310 * Only useful for bwrap in virtual machines 2311 */ 2312 goto retry; 2313 } 2314 } 2315 } 2316 if (still_locked) 2317 mtx_unlock(&kring->q_lock); 2318 } 2319 cleanup: 2320 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ 2321 d->bq_len = 0; 2322 } 2323 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ 2324 brddst->bq_len = 0; 2325 return 0; 2326 } 2327 2328 /* nm_txsync callback for VALE ports */ 2329 static int 2330 netmap_vp_txsync(struct netmap_kring *kring, int flags) 2331 { 2332 struct netmap_vp_adapter *na = 2333 (struct netmap_vp_adapter *)kring->na; 2334 u_int done; 2335 u_int const lim = kring->nkr_num_slots - 1; 2336 u_int const head = kring->rhead; 2337 2338 if (bridge_batch <= 0) { /* testing only */ 2339 done = head; // used all 2340 goto done; 2341 } 2342 if (!na->na_bdg) { 2343 done = head; 2344 goto done; 2345 } 2346 if (bridge_batch > NM_BDG_BATCH) 2347 bridge_batch = NM_BDG_BATCH; 2348 2349 done = nm_bdg_preflush(kring, head); 2350 done: 2351 if (done != head) 2352 D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail); 2353 /* 2354 * packets between 'done' and 'cur' are left unsent. 
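	 * (Worked example with illustrative numbers: with 1024 slots and
	 * done == 100, nr_hwcur becomes 100 and nr_hwtail wraps back to 99,
	 * i.e. everything up to 'done' is reported as transmitted; slots
	 * between 'done' and the application's head are picked up by the
	 * next txsync.)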
2355 	 */
2356 	kring->nr_hwcur = done;
2357 	kring->nr_hwtail = nm_prev(done, lim);
2358 	if (netmap_verbose)
2359 		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
2360 	return 0;
2361 }
2362 
2363 
2364 /* rxsync code used by the VALE ports' nm_rxsync callback and also
2365  * internally by the bwrap
2366  */
2367 static int
2368 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
2369 {
2370 	struct netmap_adapter *na = kring->na;
2371 	struct netmap_ring *ring = kring->ring;
2372 	u_int nm_i, lim = kring->nkr_num_slots - 1;
2373 	u_int head = kring->rhead;
2374 	int n;
2375 
2376 	if (head > lim) {
2377 		D("ouch dangerous reset!!!");
2378 		n = netmap_ring_reinit(kring);
2379 		goto done;
2380 	}
2381 
2382 	/* First part, import newly received packets. */
2383 	/* actually nothing to do here, they are already in the kring */
2384 
2385 	/* Second part, skip past packets that userspace has released. */
2386 	nm_i = kring->nr_hwcur;
2387 	if (nm_i != head) {
2388 		/* consistency check, but nothing really important here */
2389 		for (n = 0; likely(nm_i != head); n++) {
2390 			struct netmap_slot *slot = &ring->slot[nm_i];
2391 			void *addr = NMB(na, slot);
2392 
2393 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
2394 				D("bad buffer index %d, ignore ?",
2395 					slot->buf_idx);
2396 			}
2397 			slot->flags &= ~NS_BUF_CHANGED;
2398 			nm_i = nm_next(nm_i, lim);
2399 		}
2400 		kring->nr_hwcur = head;
2401 	}
2402 
2403 	n = 0;
2404 done:
2405 	return n;
2406 }
2407 
2408 /*
2409  * nm_rxsync callback for VALE ports, i.e. a user process
2410  * reading from a VALE switch.
2411  * Already protected against concurrent calls from userspace,
2412  * but we must acquire the queue's lock to protect against
2413  * writers on the same queue.
2414  */
2415 static int
2416 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
2417 {
2418 	int n;
2419 
2420 	mtx_lock(&kring->q_lock);
2421 	n = netmap_vp_rxsync_locked(kring, flags);
2422 	mtx_unlock(&kring->q_lock);
2423 	return n;
2424 }
2425 
2426 
2427 /* nm_bdg_attach callback for VALE ports
2428  * The na_vp port is this same netmap_adapter. There is no host port.
2429  */
2430 static int
2431 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
2432 {
2433 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
2434 
2435 	if (vpna->na_bdg) {
2436 		return netmap_bwrap_attach(name, na);
2437 	}
2438 	na->na_vp = vpna;
2439 	strncpy(na->name, name, sizeof(na->name));
2440 	na->na_hostvp = NULL;
2441 	return 0;
2442 }
2443 
2444 /* create a netmap_vp_adapter that describes a VALE port.
2445  * Only persistent VALE ports have a non-null ifp.
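 * Illustrative defaults, as applied by the nm_bound_var() calls below:
 * a request for 0 tx/rx rings or 0 slots ends up with 1 ring per
 * direction and NM_BRIDGE_RINGSIZE slots per ring, while larger
 * requests are capped at NM_BDG_MAXRINGS rings and NM_BDG_MAXSLOTS
 * slots.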
2446 */ 2447 static int 2448 netmap_vp_create(struct nmreq_header *hdr, struct ifnet *ifp, 2449 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret) 2450 { 2451 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 2452 struct netmap_vp_adapter *vpna; 2453 struct netmap_adapter *na; 2454 int error = 0; 2455 u_int npipes = 0; 2456 u_int extrabufs = 0; 2457 2458 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 2459 return EINVAL; 2460 } 2461 2462 vpna = nm_os_malloc(sizeof(*vpna)); 2463 if (vpna == NULL) 2464 return ENOMEM; 2465 2466 na = &vpna->up; 2467 2468 na->ifp = ifp; 2469 strncpy(na->name, hdr->nr_name, sizeof(na->name)); 2470 2471 /* bound checking */ 2472 na->num_tx_rings = req->nr_tx_rings; 2473 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 2474 req->nr_tx_rings = na->num_tx_rings; /* write back */ 2475 na->num_rx_rings = req->nr_rx_rings; 2476 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 2477 req->nr_rx_rings = na->num_rx_rings; /* write back */ 2478 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE, 2479 1, NM_BDG_MAXSLOTS, NULL); 2480 na->num_tx_desc = req->nr_tx_slots; 2481 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE, 2482 1, NM_BDG_MAXSLOTS, NULL); 2483 /* validate number of pipes. We want at least 1, 2484 * but probably can do with some more. 2485 * So let's use 2 as default (when 0 is supplied) 2486 */ 2487 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); 2488 /* validate extra bufs */ 2489 nm_bound_var(&extrabufs, 0, 0, 2490 128*NM_BDG_MAXSLOTS, NULL); 2491 req->nr_extra_bufs = extrabufs; /* write back */ 2492 na->num_rx_desc = req->nr_rx_slots; 2493 /* Set the mfs to a default value, as it is needed on the VALE 2494 * mismatch datapath. XXX We should set it according to the MTU 2495 * known to the kernel. */ 2496 vpna->mfs = NM_BDG_MFS_DEFAULT; 2497 vpna->last_smac = ~0llu; 2498 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 2499 vpna->mfs = netmap_buf_size; */ 2500 if (netmap_verbose) 2501 D("max frame size %u", vpna->mfs); 2502 2503 na->na_flags |= NAF_BDG_MAYSLEEP; 2504 /* persistent VALE ports look like hw devices 2505 * with a native netmap adapter 2506 */ 2507 if (ifp) 2508 na->na_flags |= NAF_NATIVE; 2509 na->nm_txsync = netmap_vp_txsync; 2510 na->nm_rxsync = netmap_vp_rxsync; 2511 na->nm_register = netmap_vp_reg; 2512 na->nm_krings_create = netmap_vp_krings_create; 2513 na->nm_krings_delete = netmap_vp_krings_delete; 2514 na->nm_dtor = netmap_vp_dtor; 2515 ND("nr_mem_id %d", req->nr_mem_id); 2516 na->nm_mem = nmd ? 2517 netmap_mem_get(nmd): 2518 netmap_mem_private_new( 2519 na->num_tx_rings, na->num_tx_desc, 2520 na->num_rx_rings, na->num_rx_desc, 2521 req->nr_extra_bufs, npipes, &error); 2522 if (na->nm_mem == NULL) 2523 goto err; 2524 na->nm_bdg_attach = netmap_vp_bdg_attach; 2525 /* other nmd fields are set in the common routine */ 2526 error = netmap_attach_common(na); 2527 if (error) 2528 goto err; 2529 *ret = vpna; 2530 return 0; 2531 2532 err: 2533 if (na->nm_mem != NULL) 2534 netmap_mem_put(na->nm_mem); 2535 nm_os_free(vpna); 2536 return error; 2537 } 2538 2539 /* Bridge wrapper code (bwrap). 2540 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a 2541 * VALE switch. 2542 * The main task is to swap the meaning of tx and rx rings to match the 2543 * expectations of the VALE switch code (see nm_bdg_flush). 2544 * 2545 * The bwrap works by interposing a netmap_bwrap_adapter between the 2546 * rest of the system and the hwna. 
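 * Roughly:
 *
 *   netmap core / VALE switch  <-->  netmap_bwrap_adapter  <-->  hwna (NIC)
 *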
The netmap_bwrap_adapter looks like 2547 * a netmap_vp_adapter to the rest the system, but, internally, it 2548 * translates all callbacks to what the hwna expects. 2549 * 2550 * Note that we have to intercept callbacks coming from two sides: 2551 * 2552 * - callbacks coming from the netmap module are intercepted by 2553 * passing around the netmap_bwrap_adapter instead of the hwna 2554 * 2555 * - callbacks coming from outside of the netmap module only know 2556 * about the hwna. This, however, only happens in interrupt 2557 * handlers, where only the hwna->nm_notify callback is called. 2558 * What the bwrap does is to overwrite the hwna->nm_notify callback 2559 * with its own netmap_bwrap_intr_notify. 2560 * XXX This assumes that the hwna->nm_notify callback was the 2561 * standard netmap_notify(), as it is the case for nic adapters. 2562 * Any additional action performed by hwna->nm_notify will not be 2563 * performed by netmap_bwrap_intr_notify. 2564 * 2565 * Additionally, the bwrap can optionally attach the host rings pair 2566 * of the wrapped adapter to a different port of the switch. 2567 */ 2568 2569 2570 static void 2571 netmap_bwrap_dtor(struct netmap_adapter *na) 2572 { 2573 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; 2574 struct netmap_adapter *hwna = bna->hwna; 2575 struct nm_bridge *b = bna->up.na_bdg, 2576 *bh = bna->host.na_bdg; 2577 2578 if (bna->host.up.nm_mem) 2579 netmap_mem_put(bna->host.up.nm_mem); 2580 2581 if (b) { 2582 netmap_bdg_detach_common(b, bna->up.bdg_port, 2583 (bh ? bna->host.bdg_port : -1)); 2584 } 2585 2586 ND("na %p", na); 2587 na->ifp = NULL; 2588 bna->host.up.ifp = NULL; 2589 hwna->na_vp = bna->saved_na_vp; 2590 hwna->na_hostvp = NULL; 2591 hwna->na_private = NULL; 2592 hwna->na_flags &= ~NAF_BUSY; 2593 netmap_adapter_put(hwna); 2594 2595 } 2596 2597 2598 /* 2599 * Intr callback for NICs connected to a bridge. 2600 * Simply ignore tx interrupts (maybe we could try to recover space ?) 2601 * and pass received packets from nic to the bridge. 2602 * 2603 * XXX TODO check locking: this is called from the interrupt 2604 * handler so we should make sure that the interface is not 2605 * disconnected while passing down an interrupt. 2606 * 2607 * Note, no user process can access this NIC or the host stack. 2608 * The only part of the ring that is significant are the slots, 2609 * and head/cur/tail are set from the kring as needed 2610 * (part as a receive ring, part as a transmit ring). 2611 * 2612 * callback that overwrites the hwna notify callback. 2613 * Packets come from the outside or from the host stack and are put on an 2614 * hwna rx ring. 2615 * The bridge wrapper then sends the packets through the bridge. 
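 * In rough pseudo-code (mirroring the body of netmap_bwrap_intr_notify()
 * below):
 *
 *	kring->nm_sync(kring, 0);		// rxsync: fetch new packets
 *	bkring->rhead = kring->nr_hwtail;	// pretend the user queued them
 *	netmap_vp_txsync(bkring, flags);	// push them through the switch
 *	kring->rhead = kring->nr_hwtail;	// all buffers consumed
 *	kring->nm_sync(kring, 0);		// rxsync again to release them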
2616 */ 2617 static int 2618 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags) 2619 { 2620 struct netmap_adapter *na = kring->na; 2621 struct netmap_bwrap_adapter *bna = na->na_private; 2622 struct netmap_kring *bkring; 2623 struct netmap_vp_adapter *vpna = &bna->up; 2624 u_int ring_nr = kring->ring_id; 2625 int ret = NM_IRQ_COMPLETED; 2626 int error; 2627 2628 if (netmap_verbose) 2629 D("%s %s 0x%x", na->name, kring->name, flags); 2630 2631 bkring = vpna->up.tx_rings[ring_nr]; 2632 2633 /* make sure the ring is not disabled */ 2634 if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) { 2635 return EIO; 2636 } 2637 2638 if (netmap_verbose) 2639 D("%s head %d cur %d tail %d", na->name, 2640 kring->rhead, kring->rcur, kring->rtail); 2641 2642 /* simulate a user wakeup on the rx ring 2643 * fetch packets that have arrived. 2644 */ 2645 error = kring->nm_sync(kring, 0); 2646 if (error) 2647 goto put_out; 2648 if (kring->nr_hwcur == kring->nr_hwtail) { 2649 if (netmap_verbose) 2650 D("how strange, interrupt with no packets on %s", 2651 na->name); 2652 goto put_out; 2653 } 2654 2655 /* new packets are kring->rcur to kring->nr_hwtail, and the bkring 2656 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail 2657 * to push all packets out. 2658 */ 2659 bkring->rhead = bkring->rcur = kring->nr_hwtail; 2660 2661 netmap_vp_txsync(bkring, flags); 2662 2663 /* mark all buffers as released on this ring */ 2664 kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail; 2665 /* another call to actually release the buffers */ 2666 error = kring->nm_sync(kring, 0); 2667 2668 /* The second rxsync may have further advanced hwtail. If this happens, 2669 * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */ 2670 if (kring->rcur != kring->nr_hwtail) { 2671 ret = NM_IRQ_RESCHED; 2672 } 2673 put_out: 2674 nm_kr_put(kring); 2675 2676 return error ? error : ret; 2677 } 2678 2679 2680 /* nm_register callback for bwrap */ 2681 static int 2682 netmap_bwrap_reg(struct netmap_adapter *na, int onoff) 2683 { 2684 struct netmap_bwrap_adapter *bna = 2685 (struct netmap_bwrap_adapter *)na; 2686 struct netmap_adapter *hwna = bna->hwna; 2687 struct netmap_vp_adapter *hostna = &bna->host; 2688 int error, i; 2689 enum txrx t; 2690 2691 ND("%s %s", na->name, onoff ? "on" : "off"); 2692 2693 if (onoff) { 2694 /* netmap_do_regif has been called on the bwrap na. 
2695 * We need to pass the information about the 2696 * memory allocator down to the hwna before 2697 * putting it in netmap mode 2698 */ 2699 hwna->na_lut = na->na_lut; 2700 2701 if (hostna->na_bdg) { 2702 /* if the host rings have been attached to switch, 2703 * we need to copy the memory allocator information 2704 * in the hostna also 2705 */ 2706 hostna->up.na_lut = na->na_lut; 2707 } 2708 2709 } 2710 2711 /* pass down the pending ring state information */ 2712 for_rx_tx(t) { 2713 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) 2714 NMR(hwna, t)[i]->nr_pending_mode = 2715 NMR(na, t)[i]->nr_pending_mode; 2716 } 2717 2718 /* forward the request to the hwna */ 2719 error = hwna->nm_register(hwna, onoff); 2720 if (error) 2721 return error; 2722 2723 /* copy up the current ring state information */ 2724 for_rx_tx(t) { 2725 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { 2726 struct netmap_kring *kring = NMR(hwna, t)[i]; 2727 NMR(na, t)[i]->nr_mode = kring->nr_mode; 2728 } 2729 } 2730 2731 /* impersonate a netmap_vp_adapter */ 2732 netmap_vp_reg(na, onoff); 2733 if (hostna->na_bdg) 2734 netmap_vp_reg(&hostna->up, onoff); 2735 2736 if (onoff) { 2737 u_int i; 2738 /* intercept the hwna nm_nofify callback on the hw rings */ 2739 for (i = 0; i < hwna->num_rx_rings; i++) { 2740 hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify; 2741 hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify; 2742 } 2743 i = hwna->num_rx_rings; /* for safety */ 2744 /* save the host ring notify unconditionally */ 2745 hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify; 2746 if (hostna->na_bdg) { 2747 /* also intercept the host ring notify */ 2748 hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify; 2749 } 2750 if (na->active_fds == 0) 2751 na->na_flags |= NAF_NETMAP_ON; 2752 } else { 2753 u_int i; 2754 2755 if (na->active_fds == 0) 2756 na->na_flags &= ~NAF_NETMAP_ON; 2757 2758 /* reset all notify callbacks (including host ring) */ 2759 for (i = 0; i <= hwna->num_rx_rings; i++) { 2760 hwna->rx_rings[i]->nm_notify = hwna->rx_rings[i]->save_notify; 2761 hwna->rx_rings[i]->save_notify = NULL; 2762 } 2763 hwna->na_lut.lut = NULL; 2764 hwna->na_lut.plut = NULL; 2765 hwna->na_lut.objtotal = 0; 2766 hwna->na_lut.objsize = 0; 2767 2768 /* pass ownership of the netmap rings to the hwna */ 2769 for_rx_tx(t) { 2770 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) { 2771 NMR(na, t)[i]->ring = NULL; 2772 } 2773 } 2774 2775 } 2776 2777 return 0; 2778 } 2779 2780 /* nm_config callback for bwrap */ 2781 static int 2782 netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info) 2783 { 2784 struct netmap_bwrap_adapter *bna = 2785 (struct netmap_bwrap_adapter *)na; 2786 struct netmap_adapter *hwna = bna->hwna; 2787 int error; 2788 2789 /* Forward the request to the hwna. It may happen that nobody 2790 * registered hwna yet, so netmap_mem_get_lut() may have not 2791 * been called yet. 
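	 * The results are then reported with tx and rx swapped, since the
	 * bwrap mirrors the hwna rings: e.g. a NIC with 4 rx rings of 512
	 * slots shows up as a bwrap with 4 tx rings of 512 slots, and vice
	 * versa (illustrative numbers).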
*/ 2792 error = netmap_mem_get_lut(hwna->nm_mem, &hwna->na_lut); 2793 if (error) 2794 return error; 2795 netmap_update_config(hwna); 2796 /* swap the results and propagate */ 2797 info->num_tx_rings = hwna->num_rx_rings; 2798 info->num_tx_descs = hwna->num_rx_desc; 2799 info->num_rx_rings = hwna->num_tx_rings; 2800 info->num_rx_descs = hwna->num_tx_desc; 2801 info->rx_buf_maxsize = hwna->rx_buf_maxsize; 2802 2803 return 0; 2804 } 2805 2806 2807 /* nm_krings_create callback for bwrap */ 2808 static int 2809 netmap_bwrap_krings_create(struct netmap_adapter *na) 2810 { 2811 struct netmap_bwrap_adapter *bna = 2812 (struct netmap_bwrap_adapter *)na; 2813 struct netmap_adapter *hwna = bna->hwna; 2814 struct netmap_adapter *hostna = &bna->host.up; 2815 int i, error = 0; 2816 enum txrx t; 2817 2818 ND("%s", na->name); 2819 2820 /* impersonate a netmap_vp_adapter */ 2821 error = netmap_vp_krings_create(na); 2822 if (error) 2823 return error; 2824 2825 /* also create the hwna krings */ 2826 error = hwna->nm_krings_create(hwna); 2827 if (error) { 2828 goto err_del_vp_rings; 2829 } 2830 2831 /* increment the usage counter for all the hwna krings */ 2832 for_rx_tx(t) { 2833 for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) { 2834 NMR(hwna, t)[i]->users++; 2835 } 2836 } 2837 2838 /* now create the actual rings */ 2839 error = netmap_mem_rings_create(hwna); 2840 if (error) { 2841 goto err_dec_users; 2842 } 2843 2844 /* cross-link the netmap rings 2845 * The original number of rings comes from hwna, 2846 * rx rings on one side equals tx rings on the other. 2847 */ 2848 for_rx_tx(t) { 2849 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ 2850 for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) { 2851 NMR(na, t)[i]->nkr_num_slots = NMR(hwna, r)[i]->nkr_num_slots; 2852 NMR(na, t)[i]->ring = NMR(hwna, r)[i]->ring; 2853 } 2854 } 2855 2856 if (na->na_flags & NAF_HOST_RINGS) { 2857 /* the hostna rings are the host rings of the bwrap. 
2858 * The corresponding krings must point back to the 2859 * hostna 2860 */ 2861 hostna->tx_rings = &na->tx_rings[na->num_tx_rings]; 2862 hostna->tx_rings[0]->na = hostna; 2863 hostna->rx_rings = &na->rx_rings[na->num_rx_rings]; 2864 hostna->rx_rings[0]->na = hostna; 2865 } 2866 2867 return 0; 2868 2869 err_dec_users: 2870 for_rx_tx(t) { 2871 NMR(hwna, t)[i]->users--; 2872 } 2873 hwna->nm_krings_delete(hwna); 2874 err_del_vp_rings: 2875 netmap_vp_krings_delete(na); 2876 2877 return error; 2878 } 2879 2880 2881 static void 2882 netmap_bwrap_krings_delete(struct netmap_adapter *na) 2883 { 2884 struct netmap_bwrap_adapter *bna = 2885 (struct netmap_bwrap_adapter *)na; 2886 struct netmap_adapter *hwna = bna->hwna; 2887 enum txrx t; 2888 int i; 2889 2890 ND("%s", na->name); 2891 2892 /* decrement the usage counter for all the hwna krings */ 2893 for_rx_tx(t) { 2894 for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) { 2895 NMR(hwna, t)[i]->users--; 2896 } 2897 } 2898 2899 /* delete any netmap rings that are no longer needed */ 2900 netmap_mem_rings_delete(hwna); 2901 hwna->nm_krings_delete(hwna); 2902 netmap_vp_krings_delete(na); 2903 } 2904 2905 2906 /* notify method for the bridge-->hwna direction */ 2907 static int 2908 netmap_bwrap_notify(struct netmap_kring *kring, int flags) 2909 { 2910 struct netmap_adapter *na = kring->na; 2911 struct netmap_bwrap_adapter *bna = na->na_private; 2912 struct netmap_adapter *hwna = bna->hwna; 2913 u_int ring_n = kring->ring_id; 2914 u_int lim = kring->nkr_num_slots - 1; 2915 struct netmap_kring *hw_kring; 2916 int error; 2917 2918 ND("%s: na %s hwna %s", 2919 (kring ? kring->name : "NULL!"), 2920 (na ? na->name : "NULL!"), 2921 (hwna ? hwna->name : "NULL!")); 2922 hw_kring = hwna->tx_rings[ring_n]; 2923 2924 if (nm_kr_tryget(hw_kring, 0, NULL)) { 2925 return ENXIO; 2926 } 2927 2928 /* first step: simulate a user wakeup on the rx ring */ 2929 netmap_vp_rxsync(kring, flags); 2930 ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", 2931 na->name, ring_n, 2932 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, 2933 ring->head, ring->cur, ring->tail, 2934 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail); 2935 /* second step: the new packets are sent on the tx ring 2936 * (which is actually the same ring) 2937 */ 2938 hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail; 2939 error = hw_kring->nm_sync(hw_kring, flags); 2940 if (error) 2941 goto put_out; 2942 2943 /* third step: now we are back the rx ring */ 2944 /* claim ownership on all hw owned bufs */ 2945 kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */ 2946 2947 /* fourth step: the user goes to sleep again, causing another rxsync */ 2948 netmap_vp_rxsync(kring, flags); 2949 ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", 2950 na->name, ring_n, 2951 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, 2952 ring->head, ring->cur, ring->tail, 2953 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); 2954 put_out: 2955 nm_kr_put(hw_kring); 2956 2957 return error ? error : NM_IRQ_COMPLETED; 2958 } 2959 2960 2961 /* nm_bdg_ctl callback for the bwrap. 2962 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd]. 2963 * On attach, it needs to provide a fake netmap_priv_d structure and 2964 * perform a netmap_do_regif() on the bwrap. This will put both the 2965 * bwrap and the hwna in netmap mode, with the netmap rings shared 2966 * and cross linked. 
Moroever, it will start intercepting interrupts 2967 * directed to hwna. 2968 */ 2969 static int 2970 netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na) 2971 { 2972 struct netmap_priv_d *npriv; 2973 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; 2974 int error = 0; 2975 2976 if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) { 2977 struct nmreq_vale_attach *req = 2978 (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body; 2979 if (req->reg.nr_ringid != 0 || 2980 (req->reg.nr_mode != NR_REG_ALL_NIC && 2981 req->reg.nr_mode != NR_REG_NIC_SW)) { 2982 /* We only support attaching all the NIC rings 2983 * and/or the host stack. */ 2984 return EINVAL; 2985 } 2986 if (NETMAP_OWNED_BY_ANY(na)) { 2987 return EBUSY; 2988 } 2989 if (bna->na_kpriv) { 2990 /* nothing to do */ 2991 return 0; 2992 } 2993 npriv = netmap_priv_new(); 2994 if (npriv == NULL) 2995 return ENOMEM; 2996 npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */ 2997 error = netmap_do_regif(npriv, na, req->reg.nr_mode, 2998 req->reg.nr_ringid, req->reg.nr_flags); 2999 if (error) { 3000 netmap_priv_delete(npriv); 3001 return error; 3002 } 3003 bna->na_kpriv = npriv; 3004 na->na_flags |= NAF_BUSY; 3005 } else { 3006 if (na->active_fds == 0) /* not registered */ 3007 return EINVAL; 3008 netmap_priv_delete(bna->na_kpriv); 3009 bna->na_kpriv = NULL; 3010 na->na_flags &= ~NAF_BUSY; 3011 } 3012 3013 return error; 3014 } 3015 3016 /* attach a bridge wrapper to the 'real' device */ 3017 int 3018 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) 3019 { 3020 struct netmap_bwrap_adapter *bna; 3021 struct netmap_adapter *na = NULL; 3022 struct netmap_adapter *hostna = NULL; 3023 int error = 0; 3024 enum txrx t; 3025 3026 /* make sure the NIC is not already in use */ 3027 if (NETMAP_OWNED_BY_ANY(hwna)) { 3028 D("NIC %s busy, cannot attach to bridge", hwna->name); 3029 return EBUSY; 3030 } 3031 3032 bna = nm_os_malloc(sizeof(*bna)); 3033 if (bna == NULL) { 3034 return ENOMEM; 3035 } 3036 3037 na = &bna->up.up; 3038 /* make bwrap ifp point to the real ifp */ 3039 na->ifp = hwna->ifp; 3040 if_ref(na->ifp); 3041 na->na_private = bna; 3042 strncpy(na->name, nr_name, sizeof(na->name)); 3043 /* fill the ring data for the bwrap adapter with rx/tx meanings 3044 * swapped. The real cross-linking will be done during register, 3045 * when all the krings will have been created. 3046 */ 3047 for_rx_tx(t) { 3048 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */ 3049 nma_set_nrings(na, t, nma_get_nrings(hwna, r)); 3050 nma_set_ndesc(na, t, nma_get_ndesc(hwna, r)); 3051 } 3052 na->nm_dtor = netmap_bwrap_dtor; 3053 na->nm_register = netmap_bwrap_reg; 3054 // na->nm_txsync = netmap_bwrap_txsync; 3055 // na->nm_rxsync = netmap_bwrap_rxsync; 3056 na->nm_config = netmap_bwrap_config; 3057 na->nm_krings_create = netmap_bwrap_krings_create; 3058 na->nm_krings_delete = netmap_bwrap_krings_delete; 3059 na->nm_notify = netmap_bwrap_notify; 3060 na->nm_bdg_ctl = netmap_bwrap_bdg_ctl; 3061 na->pdev = hwna->pdev; 3062 na->nm_mem = netmap_mem_get(hwna->nm_mem); 3063 na->virt_hdr_len = hwna->virt_hdr_len; 3064 na->rx_buf_maxsize = hwna->rx_buf_maxsize; 3065 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ 3066 /* Set the mfs, needed on the VALE mismatch datapath. 
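	 * (NM_BDG_MFS_DEFAULT is a conservative, Ethernet-frame-sized guess;
	 * as the XXX note in netmap_vp_create() says, it would ideally track
	 * the MTU known to the kernel.)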
*/ 3067 bna->up.mfs = NM_BDG_MFS_DEFAULT; 3068 3069 bna->hwna = hwna; 3070 netmap_adapter_get(hwna); 3071 hwna->na_private = bna; /* weak reference */ 3072 bna->saved_na_vp = hwna->na_vp; 3073 hwna->na_vp = &bna->up; 3074 bna->up.up.na_vp = &(bna->up); 3075 3076 if (hwna->na_flags & NAF_HOST_RINGS) { 3077 if (hwna->na_flags & NAF_SW_ONLY) 3078 na->na_flags |= NAF_SW_ONLY; 3079 na->na_flags |= NAF_HOST_RINGS; 3080 hostna = &bna->host.up; 3081 snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name); 3082 hostna->ifp = hwna->ifp; 3083 for_rx_tx(t) { 3084 enum txrx r = nm_txrx_swap(t); 3085 nma_set_nrings(hostna, t, 1); 3086 nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r)); 3087 } 3088 // hostna->nm_txsync = netmap_bwrap_host_txsync; 3089 // hostna->nm_rxsync = netmap_bwrap_host_rxsync; 3090 hostna->nm_notify = netmap_bwrap_notify; 3091 hostna->nm_mem = netmap_mem_get(na->nm_mem); 3092 hostna->na_private = bna; 3093 hostna->na_vp = &bna->up; 3094 na->na_hostvp = hwna->na_hostvp = 3095 hostna->na_hostvp = &bna->host; 3096 hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */ 3097 hostna->rx_buf_maxsize = hwna->rx_buf_maxsize; 3098 bna->host.mfs = NM_BDG_MFS_DEFAULT; 3099 } 3100 3101 ND("%s<->%s txr %d txd %d rxr %d rxd %d", 3102 na->name, ifp->if_xname, 3103 na->num_tx_rings, na->num_tx_desc, 3104 na->num_rx_rings, na->num_rx_desc); 3105 3106 error = netmap_attach_common(na); 3107 if (error) { 3108 goto err_free; 3109 } 3110 hwna->na_flags |= NAF_BUSY; 3111 return 0; 3112 3113 err_free: 3114 hwna->na_vp = hwna->na_hostvp = NULL; 3115 netmap_adapter_put(hwna); 3116 nm_os_free(bna); 3117 return error; 3118 3119 } 3120 3121 struct nm_bridge * 3122 netmap_init_bridges2(u_int n) 3123 { 3124 int i; 3125 struct nm_bridge *b; 3126 3127 b = nm_os_malloc(sizeof(struct nm_bridge) * n); 3128 if (b == NULL) 3129 return NULL; 3130 for (i = 0; i < n; i++) 3131 BDG_RWINIT(&b[i]); 3132 return b; 3133 } 3134 3135 void 3136 netmap_uninit_bridges2(struct nm_bridge *b, u_int n) 3137 { 3138 int i; 3139 3140 if (b == NULL) 3141 return; 3142 3143 for (i = 0; i < n; i++) 3144 BDG_RWDESTROY(&b[i]); 3145 nm_os_free(b); 3146 } 3147 3148 int 3149 netmap_init_bridges(void) 3150 { 3151 #ifdef CONFIG_NET_NS 3152 return netmap_bns_register(); 3153 #else 3154 nm_bridges = netmap_init_bridges2(NM_BRIDGES); 3155 if (nm_bridges == NULL) 3156 return ENOMEM; 3157 return 0; 3158 #endif 3159 } 3160 3161 void 3162 netmap_uninit_bridges(void) 3163 { 3164 #ifdef CONFIG_NET_NS 3165 netmap_bns_unregister(); 3166 #else 3167 netmap_uninit_bridges2(nm_bridges, NM_BRIDGES); 3168 #endif 3169 } 3170 #endif /* WITH_VALE */ 3171
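/*
 * Example usage from userspace (illustrative only; see vale(4) and
 * netmap(4) for the authoritative documentation of the tools and API):
 *
 *	vale-ctl -a vale0:em0	# attach NIC em0 to switch vale0 (via a bwrap)
 *	vale-ctl -h vale0:em0	# attach the NIC together with its host rings
 *	vale-ctl -d vale0:em0	# detach the NIC from the switch
 *
 * A purely virtual port is created simply by opening it, e.g. with
 * nm_open("vale0:p1", NULL, 0, NULL) from netmap_user.h; ephemeral
 * ports (and the switch itself) go away when the last reference is
 * dropped, while persistent ports survive until explicitly destroyed.
 */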