1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (C) 2013-2016 Universita` di Pisa 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 30 #if defined(__FreeBSD__) 31 #include <sys/cdefs.h> /* prerequisite */ 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/types.h> 35 #include <sys/errno.h> 36 #include <sys/param.h> /* defines used in kernel.h */ 37 #include <sys/kernel.h> /* types used in module initialization */ 38 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 39 #include <sys/sockio.h> 40 #include <sys/socketvar.h> /* struct socket */ 41 #include <sys/malloc.h> 42 #include <sys/poll.h> 43 #include <sys/rwlock.h> 44 #include <sys/socket.h> /* sockaddrs */ 45 #include <sys/selinfo.h> 46 #include <sys/sysctl.h> 47 #include <net/if.h> 48 #include <net/if_var.h> 49 #include <net/bpf.h> /* BIOCIMMEDIATE */ 50 #include <machine/bus.h> /* bus_dmamap_* */ 51 #include <sys/endian.h> 52 #include <sys/refcount.h> 53 #include <sys/smp.h> 54 55 56 #elif defined(linux) 57 58 #include "bsd_glue.h" 59 60 #elif defined(__APPLE__) 61 62 #warning OSX support is only partial 63 #include "osx_glue.h" 64 65 #elif defined(_WIN32) 66 #include "win_glue.h" 67 68 #else 69 70 #error Unsupported platform 71 72 #endif /* unsupported */ 73 74 /* 75 * common headers 76 */ 77 78 #include <net/netmap.h> 79 #include <dev/netmap/netmap_kern.h> 80 #include <dev/netmap/netmap_mem2.h> 81 #include <dev/netmap/netmap_bdg.h> 82 83 #ifdef WITH_VALE 84 85 /* 86 * system parameters (most of them in netmap_kern.h) 87 * NM_BDG_NAME prefix for switch port names, default "vale" 88 * NM_BDG_MAXPORTS number of ports 89 * NM_BRIDGES max number of switches in the system. 90 * XXX should become a sysctl or tunable 91 * 92 * Switch ports are named valeX:Y where X is the switch name and Y 93 * is the port. If Y matches a physical interface name, the port is 94 * connected to a physical device. 95 * 96 * Unlike physical interfaces, switch ports use their own memory region 97 * for rings and buffers. 98 * The virtual interfaces use per-queue lock instead of core lock. 99 * In the tx loop, we aggregate traffic in batches to make all operations 100 * faster. The batch size is bridge_batch. 101 */ 102 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many (must be a pow of 2). */ 103 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ 104 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 105 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 106 /* actual size of the tables */ 107 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NETMAP_MAX_FRAGS) 108 /* NM_FT_NULL terminates a list of slots in the ft */ 109 #define NM_FT_NULL NM_BDG_BATCH_MAX 110 111 112 /* 113 * bridge_batch is set via sysctl to the max batch size to be 114 * used in the bridge. The actual value may be larger as the 115 * last packet in the block may overflow the size. 116 */ 117 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ 118 SYSBEGIN(vars_vale); 119 SYSCTL_DECL(_dev_netmap); 120 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, 121 "Max batch size to be used in the bridge"); 122 SYSEND; 123 124 static int netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *, 125 struct netmap_mem_d *nmd, struct netmap_vp_adapter **); 126 static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *, 127 struct nm_bridge *); 128 static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *); 129 130 /* 131 * For each output interface, nm_vale_q is used to construct a list. 132 * bq_len is the number of output buffers (we can have coalescing 133 * during the copy). 134 */ 135 struct nm_vale_q { 136 uint16_t bq_head; 137 uint16_t bq_tail; 138 uint32_t bq_len; /* number of buffers */ 139 }; 140 141 /* Holds the default callbacks */ 142 struct netmap_bdg_ops vale_bdg_ops = { 143 .lookup = netmap_vale_learning, 144 .config = NULL, 145 .dtor = NULL, 146 .vp_create = netmap_vale_vp_create, 147 .bwrap_attach = netmap_vale_bwrap_attach, 148 .name = NM_BDG_NAME, 149 }; 150 151 /* 152 * this is a slightly optimized copy routine which rounds 153 * to multiple of 64 bytes and is often faster than dealing 154 * with other odd sizes. We assume there is enough room 155 * in the source and destination buffers. 156 * 157 * XXX only for multiples of NM_BUF_ALIGN bytes, non overlapped. 158 */ 159 160 static inline void 161 pkt_copy(void *_src, void *_dst, int l) 162 { 163 uint64_t *src = _src; 164 uint64_t *dst = _dst; 165 if (unlikely(l >= 1024)) { 166 memcpy(dst, src, l); 167 return; 168 } 169 for (; likely(l > 0); l -= NM_BUF_ALIGN) { 170 /* XXX NM_BUF_ALIGN/sizeof(uint64_t) statements */ 171 *dst++ = *src++; 172 *dst++ = *src++; 173 *dst++ = *src++; 174 *dst++ = *src++; 175 *dst++ = *src++; 176 *dst++ = *src++; 177 *dst++ = *src++; 178 *dst++ = *src++; 179 } 180 } 181 182 183 /* 184 * Free the forwarding tables for rings attached to switch ports. 185 */ 186 static void 187 nm_free_bdgfwd(struct netmap_adapter *na) 188 { 189 int nrings, i; 190 struct netmap_kring **kring; 191 192 NMG_LOCK_ASSERT(); 193 nrings = na->num_tx_rings; 194 kring = na->tx_rings; 195 for (i = 0; i < nrings; i++) { 196 if (kring[i]->nkr_ft) { 197 nm_os_free(kring[i]->nkr_ft); 198 kring[i]->nkr_ft = NULL; /* protect from freeing twice */ 199 } 200 } 201 } 202 203 204 /* 205 * Allocate the forwarding tables for the rings attached to the bridge ports. 206 */ 207 static int 208 nm_alloc_bdgfwd(struct netmap_adapter *na) 209 { 210 int nrings, l, i, num_dstq; 211 struct netmap_kring **kring; 212 213 NMG_LOCK_ASSERT(); 214 /* all port:rings + broadcast */ 215 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 216 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 217 l += sizeof(struct nm_vale_q) * num_dstq; 218 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 219 220 nrings = netmap_real_rings(na, NR_TX); 221 kring = na->tx_rings; 222 for (i = 0; i < nrings; i++) { 223 struct nm_bdg_fwd *ft; 224 struct nm_vale_q *dstq; 225 int j; 226 227 ft = nm_os_malloc(l); 228 if (!ft) { 229 nm_free_bdgfwd(na); 230 return ENOMEM; 231 } 232 dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX); 233 for (j = 0; j < num_dstq; j++) { 234 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 235 dstq[j].bq_len = 0; 236 } 237 kring[i]->nkr_ft = ft; 238 } 239 return 0; 240 } 241 242 /* Allows external modules to create bridges in exclusive mode, 243 * returns an authentication token that the external module will need 244 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(), 245 * and nm_bdg_update_private_data() operations. 246 * Successfully executed if ret != NULL and *return_status == 0. 247 */ 248 void * 249 netmap_vale_create(const char *bdg_name, int *return_status) 250 { 251 struct nm_bridge *b = NULL; 252 void *ret = NULL; 253 254 NMG_LOCK(); 255 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 256 if (b) { 257 *return_status = EEXIST; 258 goto unlock_bdg_create; 259 } 260 261 b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops); 262 if (!b) { 263 *return_status = ENOMEM; 264 goto unlock_bdg_create; 265 } 266 267 b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE; 268 ret = nm_bdg_get_auth_token(b); 269 *return_status = 0; 270 271 unlock_bdg_create: 272 NMG_UNLOCK(); 273 return ret; 274 } 275 276 /* Allows external modules to destroy a bridge created through 277 * netmap_bdg_create(), the bridge must be empty. 278 */ 279 int 280 netmap_vale_destroy(const char *bdg_name, void *auth_token) 281 { 282 struct nm_bridge *b = NULL; 283 int ret = 0; 284 285 NMG_LOCK(); 286 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 287 if (!b) { 288 ret = ENXIO; 289 goto unlock_bdg_free; 290 } 291 292 if (!nm_bdg_valid_auth_token(b, auth_token)) { 293 ret = EACCES; 294 goto unlock_bdg_free; 295 } 296 if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) { 297 ret = EINVAL; 298 goto unlock_bdg_free; 299 } 300 301 b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE); 302 ret = netmap_bdg_free(b); 303 if (ret) { 304 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE; 305 } 306 307 unlock_bdg_free: 308 NMG_UNLOCK(); 309 return ret; 310 } 311 312 /* Process NETMAP_REQ_VALE_LIST. */ 313 int 314 netmap_vale_list(struct nmreq_header *hdr) 315 { 316 struct nmreq_vale_list *req = 317 (struct nmreq_vale_list *)(uintptr_t)hdr->nr_body; 318 int namelen = strlen(hdr->nr_name); 319 struct nm_bridge *b, *bridges; 320 struct netmap_vp_adapter *vpna; 321 int error = 0, i, j; 322 u_int num_bridges; 323 324 netmap_bns_getbridges(&bridges, &num_bridges); 325 326 /* this is used to enumerate bridges and ports */ 327 if (namelen) { /* look up indexes of bridge and port */ 328 if (strncmp(hdr->nr_name, NM_BDG_NAME, 329 strlen(NM_BDG_NAME))) { 330 return EINVAL; 331 } 332 NMG_LOCK(); 333 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL); 334 if (!b) { 335 NMG_UNLOCK(); 336 return ENOENT; 337 } 338 339 req->nr_bridge_idx = b - bridges; /* bridge index */ 340 req->nr_port_idx = NM_BDG_NOPORT; 341 for (j = 0; j < b->bdg_active_ports; j++) { 342 i = b->bdg_port_index[j]; 343 vpna = b->bdg_ports[i]; 344 if (vpna == NULL) { 345 nm_prerr("This should not happen"); 346 continue; 347 } 348 /* the former and the latter identify a 349 * virtual port and a NIC, respectively 350 */ 351 if (!strcmp(vpna->up.name, hdr->nr_name)) { 352 req->nr_port_idx = i; /* port index */ 353 break; 354 } 355 } 356 NMG_UNLOCK(); 357 } else { 358 /* return the first non-empty entry starting from 359 * bridge nr_arg1 and port nr_arg2. 360 * 361 * Users can detect the end of the same bridge by 362 * seeing the new and old value of nr_arg1, and can 363 * detect the end of all the bridge by error != 0 364 */ 365 i = req->nr_bridge_idx; 366 j = req->nr_port_idx; 367 368 NMG_LOCK(); 369 for (error = ENOENT; i < NM_BRIDGES; i++) { 370 b = bridges + i; 371 for ( ; j < NM_BDG_MAXPORTS; j++) { 372 if (b->bdg_ports[j] == NULL) 373 continue; 374 vpna = b->bdg_ports[j]; 375 /* write back the VALE switch name */ 376 strlcpy(hdr->nr_name, vpna->up.name, 377 sizeof(hdr->nr_name)); 378 error = 0; 379 goto out; 380 } 381 j = 0; /* following bridges scan from 0 */ 382 } 383 out: 384 req->nr_bridge_idx = i; 385 req->nr_port_idx = j; 386 NMG_UNLOCK(); 387 } 388 389 return error; 390 } 391 392 393 /* nm_dtor callback for ephemeral VALE ports */ 394 static void 395 netmap_vale_vp_dtor(struct netmap_adapter *na) 396 { 397 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; 398 struct nm_bridge *b = vpna->na_bdg; 399 400 nm_prdis("%s has %d references", na->name, na->na_refcount); 401 402 if (b) { 403 netmap_bdg_detach_common(b, vpna->bdg_port, -1); 404 } 405 406 if (na->ifp != NULL && !nm_iszombie(na)) { 407 NM_DETACH_NA(na->ifp); 408 if (vpna->autodelete) { 409 nm_prdis("releasing %s", na->ifp->if_xname); 410 NMG_UNLOCK(); 411 nm_os_vi_detach(na->ifp); 412 NMG_LOCK(); 413 } 414 } 415 } 416 417 418 419 /* nm_krings_create callback for VALE ports. 420 * Calls the standard netmap_krings_create, then adds leases on rx 421 * rings and bdgfwd on tx rings. 422 */ 423 static int 424 netmap_vale_vp_krings_create(struct netmap_adapter *na) 425 { 426 u_int tailroom; 427 int error, i; 428 uint32_t *leases; 429 u_int nrx = netmap_real_rings(na, NR_RX); 430 431 /* 432 * Leases are attached to RX rings on vale ports 433 */ 434 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; 435 436 error = netmap_krings_create(na, tailroom); 437 if (error) 438 return error; 439 440 leases = na->tailroom; 441 442 for (i = 0; i < nrx; i++) { /* Receive rings */ 443 na->rx_rings[i]->nkr_leases = leases; 444 leases += na->num_rx_desc; 445 } 446 447 error = nm_alloc_bdgfwd(na); 448 if (error) { 449 netmap_krings_delete(na); 450 return error; 451 } 452 453 return 0; 454 } 455 456 457 /* nm_krings_delete callback for VALE ports. */ 458 static void 459 netmap_vale_vp_krings_delete(struct netmap_adapter *na) 460 { 461 nm_free_bdgfwd(na); 462 netmap_krings_delete(na); 463 } 464 465 466 static int 467 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, 468 struct netmap_vp_adapter *na, u_int ring_nr); 469 470 471 /* 472 * main dispatch routine for the bridge. 473 * Grab packets from a kring, move them into the ft structure 474 * associated to the tx (input) port. Max one instance per port, 475 * filtered on input (ioctl, poll or XXX). 476 * Returns the next position in the ring. 477 */ 478 static int 479 nm_vale_preflush(struct netmap_kring *kring, u_int end) 480 { 481 struct netmap_vp_adapter *na = 482 (struct netmap_vp_adapter*)kring->na; 483 struct netmap_ring *ring = kring->ring; 484 struct nm_bdg_fwd *ft; 485 u_int ring_nr = kring->ring_id; 486 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 487 u_int ft_i = 0; /* start from 0 */ 488 u_int frags = 1; /* how many frags ? */ 489 struct nm_bridge *b = na->na_bdg; 490 491 /* To protect against modifications to the bridge we acquire a 492 * shared lock, waiting if we can sleep (if the source port is 493 * attached to a user process) or with a trylock otherwise (NICs). 494 */ 495 nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 496 if (na->up.na_flags & NAF_BDG_MAYSLEEP) 497 BDG_RLOCK(b); 498 else if (!BDG_RTRYLOCK(b)) 499 return j; 500 nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 501 ft = kring->nkr_ft; 502 503 for (; likely(j != end); j = nm_next(j, lim)) { 504 struct netmap_slot *slot = &ring->slot[j]; 505 char *buf; 506 507 ft[ft_i].ft_len = slot->len; 508 ft[ft_i].ft_flags = slot->flags; 509 ft[ft_i].ft_offset = 0; 510 511 nm_prdis("flags is 0x%x", slot->flags); 512 /* we do not use the buf changed flag, but we still need to reset it */ 513 slot->flags &= ~NS_BUF_CHANGED; 514 515 /* this slot goes into a list so initialize the link field */ 516 ft[ft_i].ft_next = NM_FT_NULL; 517 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 518 (void *)(uintptr_t)slot->ptr : NMB_O(kring, slot); 519 if (unlikely(buf == NULL || 520 slot->len > NETMAP_BUF_SIZE(&na->up) - nm_get_offset(kring, slot))) { 521 nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d", 522 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT", 523 kring->name, j, ft[ft_i].ft_len); 524 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up); 525 ft[ft_i].ft_len = 0; 526 ft[ft_i].ft_flags = 0; 527 } 528 __builtin_prefetch(buf); 529 ++ft_i; 530 if (slot->flags & NS_MOREFRAG) { 531 frags++; 532 continue; 533 } 534 if (unlikely(netmap_verbose && frags > 1)) 535 nm_prlim(5, "%d frags at %d", frags, ft_i - frags); 536 ft[ft_i - frags].ft_frags = frags; 537 frags = 1; 538 if (unlikely((int)ft_i >= bridge_batch)) 539 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 540 } 541 if (frags > 1) { 542 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we 543 * have to fix frags count. */ 544 frags--; 545 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; 546 ft[ft_i - frags].ft_frags = frags; 547 nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags); 548 } 549 if (ft_i) 550 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 551 BDG_RUNLOCK(b); 552 return j; 553 } 554 555 556 /* ----- FreeBSD if_bridge hash function ------- */ 557 558 /* 559 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 560 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 561 * 562 * http://www.burtleburtle.net/bob/hash/spooky.html 563 */ 564 #define mix(a, b, c) \ 565 do { \ 566 a -= b; a -= c; a ^= (c >> 13); \ 567 b -= c; b -= a; b ^= (a << 8); \ 568 c -= a; c -= b; c ^= (b >> 13); \ 569 a -= b; a -= c; a ^= (c >> 12); \ 570 b -= c; b -= a; b ^= (a << 16); \ 571 c -= a; c -= b; c ^= (b >> 5); \ 572 a -= b; a -= c; a ^= (c >> 3); \ 573 b -= c; b -= a; b ^= (a << 10); \ 574 c -= a; c -= b; c ^= (b >> 15); \ 575 } while (/*CONSTCOND*/0) 576 577 578 static __inline uint32_t 579 nm_vale_rthash(const uint8_t *addr) 580 { 581 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key 582 583 b += addr[5] << 8; 584 b += addr[4]; 585 a += addr[3] << 24; 586 a += addr[2] << 16; 587 a += addr[1] << 8; 588 a += addr[0]; 589 590 mix(a, b, c); 591 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 592 return (c & BRIDGE_RTHASH_MASK); 593 } 594 595 #undef mix 596 597 598 /* 599 * Lookup function for a learning bridge. 600 * Update the hash table with the source address, 601 * and then returns the destination port index, and the 602 * ring in *dst_ring (at the moment, always use ring 0) 603 */ 604 uint32_t 605 netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, 606 struct netmap_vp_adapter *na, void *private_data) 607 { 608 uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset; 609 u_int buf_len = ft->ft_len - ft->ft_offset; 610 struct nm_hash_ent *ht = private_data; 611 uint32_t sh, dh; 612 u_int dst, mysrc = na->bdg_port; 613 uint64_t smac, dmac; 614 uint8_t indbuf[12]; 615 616 if (buf_len < 14) { 617 return NM_BDG_NOPORT; 618 } 619 620 if (ft->ft_flags & NS_INDIRECT) { 621 if (copyin(buf, indbuf, sizeof(indbuf))) { 622 return NM_BDG_NOPORT; 623 } 624 buf = indbuf; 625 } 626 627 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 628 smac = le64toh(*(uint64_t *)(buf + 4)); 629 smac >>= 16; 630 631 /* 632 * The hash is somewhat expensive, there might be some 633 * worthwhile optimizations here. 634 */ 635 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */ 636 uint8_t *s = buf+6; 637 sh = nm_vale_rthash(s); /* hash of source */ 638 /* update source port forwarding entry */ 639 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */ 640 ht[sh].ports = mysrc; 641 if (netmap_debug & NM_DEBUG_VALE) 642 nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 643 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 644 } 645 dst = NM_BDG_BROADCAST; 646 if ((buf[0] & 1) == 0) { /* unicast */ 647 dh = nm_vale_rthash(buf); /* hash of dst */ 648 if (ht[dh].mac == dmac) { /* found dst */ 649 dst = ht[dh].ports; 650 } 651 } 652 return dst; 653 } 654 655 656 /* 657 * Available space in the ring. Only used in VALE code 658 * and only with is_rx = 1 659 */ 660 static inline uint32_t 661 nm_kr_space(struct netmap_kring *k, int is_rx) 662 { 663 int space; 664 665 if (is_rx) { 666 int busy = k->nkr_hwlease - k->nr_hwcur; 667 if (busy < 0) 668 busy += k->nkr_num_slots; 669 space = k->nkr_num_slots - 1 - busy; 670 } else { 671 /* XXX never used in this branch */ 672 space = k->nr_hwtail - k->nkr_hwlease; 673 if (space < 0) 674 space += k->nkr_num_slots; 675 } 676 #if 0 677 // sanity check 678 if (k->nkr_hwlease >= k->nkr_num_slots || 679 k->nr_hwcur >= k->nkr_num_slots || 680 k->nr_tail >= k->nkr_num_slots || 681 busy < 0 || 682 busy >= k->nkr_num_slots) { 683 nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", 684 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 685 k->nkr_lease_idx, k->nkr_num_slots); 686 } 687 #endif 688 return space; 689 } 690 691 692 693 694 /* make a lease on the kring for N positions. return the 695 * lease index 696 * XXX only used in VALE code and with is_rx = 1 697 */ 698 static inline uint32_t 699 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) 700 { 701 uint32_t lim = k->nkr_num_slots - 1; 702 uint32_t lease_idx = k->nkr_lease_idx; 703 704 k->nkr_leases[lease_idx] = NR_NOSLOT; 705 k->nkr_lease_idx = nm_next(lease_idx, lim); 706 707 #ifdef CONFIG_NETMAP_DEBUG 708 if (n > nm_kr_space(k, is_rx)) { 709 nm_prerr("invalid request for %d slots", n); 710 panic("x"); 711 } 712 #endif /* CONFIG NETMAP_DEBUG */ 713 /* XXX verify that there are n slots */ 714 k->nkr_hwlease += n; 715 if (k->nkr_hwlease > lim) 716 k->nkr_hwlease -= lim + 1; 717 718 #ifdef CONFIG_NETMAP_DEBUG 719 if (k->nkr_hwlease >= k->nkr_num_slots || 720 k->nr_hwcur >= k->nkr_num_slots || 721 k->nr_hwtail >= k->nkr_num_slots || 722 k->nkr_lease_idx >= k->nkr_num_slots) { 723 nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", 724 k->na->name, 725 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 726 k->nkr_lease_idx, k->nkr_num_slots); 727 } 728 #endif /* CONFIG_NETMAP_DEBUG */ 729 return lease_idx; 730 } 731 732 /* 733 * 734 * This flush routine supports only unicast and broadcast but a large 735 * number of ports, and lets us replace the learn and dispatch functions. 736 */ 737 int 738 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, 739 u_int ring_nr) 740 { 741 struct nm_vale_q *dst_ents, *brddst; 742 uint16_t num_dsts = 0, *dsts; 743 struct nm_bridge *b = na->na_bdg; 744 u_int i, me = na->bdg_port; 745 746 /* 747 * The work area (pointed by ft) is followed by an array of 748 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS 749 * queues per port plus one for the broadcast traffic. 750 * Then we have an array of destination indexes. 751 */ 752 dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX); 753 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 754 755 /* first pass: find a destination for each packet in the batch */ 756 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 757 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 758 uint16_t dst_port, d_i; 759 struct nm_vale_q *d; 760 struct nm_bdg_fwd *start_ft = NULL; 761 762 nm_prdis("slot %d frags %d", i, ft[i].ft_frags); 763 764 if (na->up.virt_hdr_len < ft[i].ft_len) { 765 ft[i].ft_offset = na->up.virt_hdr_len; 766 start_ft = &ft[i]; 767 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) { 768 ft[i].ft_offset = ft[i].ft_len; 769 start_ft = &ft[i+1]; 770 } else { 771 /* Drop the packet if the virtio-net header is not into the first 772 * fragment nor at the very beginning of the second. 773 */ 774 continue; 775 } 776 dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data); 777 if (netmap_verbose > 255) 778 nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port); 779 if (dst_port >= NM_BDG_NOPORT) 780 continue; /* this packet is identified to be dropped */ 781 else if (dst_port == NM_BDG_BROADCAST) 782 dst_ring = 0; /* broadcasts always go to ring 0 */ 783 else if (unlikely(dst_port == me || 784 !b->bdg_ports[dst_port])) 785 continue; 786 787 /* get a position in the scratch pad */ 788 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 789 d = dst_ents + d_i; 790 791 /* append the first fragment to the list */ 792 if (d->bq_head == NM_FT_NULL) { /* new destination */ 793 d->bq_head = d->bq_tail = i; 794 /* remember this position to be scanned later */ 795 if (dst_port != NM_BDG_BROADCAST) 796 dsts[num_dsts++] = d_i; 797 } else { 798 ft[d->bq_tail].ft_next = i; 799 d->bq_tail = i; 800 } 801 d->bq_len += ft[i].ft_frags; 802 } 803 804 /* 805 * Broadcast traffic goes to ring 0 on all destinations. 806 * So we need to add these rings to the list of ports to scan. 807 */ 808 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; 809 if (brddst->bq_head != NM_FT_NULL) { 810 u_int j; 811 for (j = 0; likely(j < b->bdg_active_ports); j++) { 812 uint16_t d_i; 813 i = b->bdg_port_index[j]; 814 if (unlikely(i == me)) 815 continue; 816 d_i = i * NM_BDG_MAXRINGS; 817 if (dst_ents[d_i].bq_head == NM_FT_NULL) 818 dsts[num_dsts++] = d_i; 819 } 820 } 821 822 nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts); 823 /* second pass: scan destinations */ 824 for (i = 0; i < num_dsts; i++) { 825 struct netmap_vp_adapter *dst_na; 826 struct netmap_kring *kring; 827 struct netmap_ring *ring; 828 u_int dst_nr, lim, j, d_i, next, brd_next; 829 u_int needed, howmany; 830 int retry = netmap_txsync_retry; 831 struct nm_vale_q *d; 832 uint32_t my_start = 0, lease_idx = 0; 833 int nrings; 834 int virt_hdr_mismatch = 0; 835 836 d_i = dsts[i]; 837 nm_prdis("second pass %d port %d", i, d_i); 838 d = dst_ents + d_i; 839 // XXX fix the division 840 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; 841 /* protect from the lookup function returning an inactive 842 * destination port 843 */ 844 if (unlikely(dst_na == NULL)) 845 goto cleanup; 846 if (dst_na->up.na_flags & NAF_SW_ONLY) 847 goto cleanup; 848 /* 849 * The interface may be in !netmap mode in two cases: 850 * - when na is attached but not activated yet; 851 * - when na is being deactivated but is still attached. 852 */ 853 if (unlikely(!nm_netmap_on(&dst_na->up))) { 854 nm_prdis("not in netmap mode!"); 855 goto cleanup; 856 } 857 858 /* there is at least one either unicast or broadcast packet */ 859 brd_next = brddst->bq_head; 860 next = d->bq_head; 861 /* we need to reserve this many slots. If fewer are 862 * available, some packets will be dropped. 863 * Packets may have multiple fragments, so 864 * there is a chance that we may not use all of the slots 865 * we have claimed, so we will need to handle the leftover 866 * ones when we regain the lock. 867 */ 868 needed = d->bq_len + brddst->bq_len; 869 870 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) { 871 if (netmap_verbose) { 872 nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len, 873 dst_na->up.virt_hdr_len); 874 } 875 /* There is a virtio-net header/offloadings mismatch between 876 * source and destination. The slower mismatch datapath will 877 * be used to cope with all the mismatches. 878 */ 879 virt_hdr_mismatch = 1; 880 if (dst_na->mfs < na->mfs) { 881 /* We may need to do segmentation offloadings, and so 882 * we may need a number of destination slots greater 883 * than the number of input slots ('needed'). 884 * We look for the smallest integer 'x' which satisfies: 885 * needed * na->mfs + x * H <= x * na->mfs 886 * where 'H' is the length of the longest header that may 887 * be replicated in the segmentation process (e.g. for 888 * TCPv4 we must account for ethernet header, IP header 889 * and TCPv4 header). 890 */ 891 KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0")); 892 needed = (needed * na->mfs) / 893 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; 894 nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); 895 } 896 } 897 898 nm_prdis(5, "pass 2 dst %d is %x %s", 899 i, d_i, nm_is_bwrap(&dst_na->up) ? "nic/host" : "virtual"); 900 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 901 nrings = dst_na->up.num_rx_rings; 902 if (dst_nr >= nrings) 903 dst_nr = dst_nr % nrings; 904 kring = dst_na->up.rx_rings[dst_nr]; 905 ring = kring->ring; 906 /* the destination ring may have not been opened for RX */ 907 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON)) 908 goto cleanup; 909 lim = kring->nkr_num_slots - 1; 910 911 retry: 912 913 if (dst_na->retry && retry) { 914 /* try to get some free slot from the previous run */ 915 kring->nm_notify(kring, NAF_FORCE_RECLAIM); 916 /* actually useful only for bwraps, since there 917 * the notify will trigger a txsync on the hwna. VALE ports 918 * have dst_na->retry == 0 919 */ 920 } 921 /* reserve the buffers in the queue and an entry 922 * to report completion, and drop lock. 923 * XXX this might become a helper function. 924 */ 925 mtx_lock(&kring->q_lock); 926 if (kring->nkr_stopped) { 927 mtx_unlock(&kring->q_lock); 928 goto cleanup; 929 } 930 my_start = j = kring->nkr_hwlease; 931 howmany = nm_kr_space(kring, 1); 932 if (needed < howmany) 933 howmany = needed; 934 lease_idx = nm_kr_lease(kring, howmany, 1); 935 mtx_unlock(&kring->q_lock); 936 937 /* only retry if we need more than available slots */ 938 if (retry && needed <= howmany) 939 retry = 0; 940 941 /* copy to the destination queue */ 942 while (howmany > 0) { 943 struct netmap_slot *slot; 944 struct nm_bdg_fwd *ft_p, *ft_end; 945 u_int cnt; 946 947 /* find the queue from which we pick next packet. 948 * NM_FT_NULL is always higher than valid indexes 949 * so we never dereference it if the other list 950 * has packets (and if both are empty we never 951 * get here). 952 */ 953 if (next < brd_next) { 954 ft_p = ft + next; 955 next = ft_p->ft_next; 956 } else { /* insert broadcast */ 957 ft_p = ft + brd_next; 958 brd_next = ft_p->ft_next; 959 } 960 cnt = ft_p->ft_frags; // cnt > 0 961 if (unlikely(cnt > howmany)) 962 break; /* no more space */ 963 if (netmap_verbose && cnt > 1) 964 nm_prlim(5, "rx %d frags to %d", cnt, j); 965 ft_end = ft_p + cnt; 966 if (unlikely(virt_hdr_mismatch)) { 967 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); 968 } else { 969 howmany -= cnt; 970 do { 971 char *dst, *src = ft_p->ft_buf; 972 size_t copy_len = ft_p->ft_len, dst_len = copy_len; 973 uintptr_t src_cb; 974 uint64_t dstoff, dstoff_cb; 975 int src_co, dst_co; 976 const uintptr_t mask = NM_BUF_ALIGN - 1; 977 978 slot = &ring->slot[j]; 979 dst = NMB(&dst_na->up, slot); 980 dstoff = nm_get_offset(kring, slot); 981 dstoff_cb = dstoff & ~mask; 982 src_cb = ((uintptr_t)src) & ~mask; 983 src_co = ((uintptr_t)src) & mask; 984 dst_co = ((uintptr_t)(dst + dstoff)) & mask; 985 if (dst_co < src_co) { 986 dstoff_cb += NM_BUF_ALIGN; 987 } 988 dstoff = dstoff_cb + src_co; 989 copy_len += src_co; 990 991 nm_prdis("send [%d] %d(%d) bytes at %s:%d", 992 i, (int)copy_len, (int)dst_len, 993 NM_IFPNAME(dst_ifp), j); 994 995 if (unlikely(dstoff > NETMAP_BUF_SIZE(&dst_na->up) || 996 dst_len > NETMAP_BUF_SIZE(&dst_na->up) - dstoff)) { 997 nm_prlim(5, "dropping packet/fragment of len %zu, dest offset %llu", 998 dst_len, (unsigned long long)dstoff); 999 copy_len = dst_len = 0; 1000 dstoff = nm_get_offset(kring, slot); 1001 } 1002 1003 if (ft_p->ft_flags & NS_INDIRECT) { 1004 if (copyin(src, dst, copy_len)) { 1005 // invalid user pointer, pretend len is 0 1006 dst_len = 0; 1007 } 1008 } else { 1009 //memcpy(dst, src, copy_len); 1010 pkt_copy((char *)src_cb, dst + dstoff_cb, (int)copy_len); 1011 } 1012 slot->len = dst_len; 1013 slot->flags = (cnt << 8)| NS_MOREFRAG; 1014 nm_write_offset(kring, slot, dstoff); 1015 j = nm_next(j, lim); 1016 needed--; 1017 ft_p++; 1018 } while (ft_p != ft_end); 1019 slot->flags = (cnt << 8); /* clear flag on last entry */ 1020 } 1021 /* are we done ? */ 1022 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 1023 break; 1024 } 1025 { 1026 /* current position */ 1027 uint32_t *p = kring->nkr_leases; /* shorthand */ 1028 uint32_t update_pos; 1029 int still_locked = 1; 1030 1031 mtx_lock(&kring->q_lock); 1032 if (unlikely(howmany > 0)) { 1033 /* not used all bufs. If i am the last one 1034 * i can recover the slots, otherwise must 1035 * fill them with 0 to mark empty packets. 1036 */ 1037 nm_prdis("leftover %d bufs", howmany); 1038 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { 1039 /* yes i am the last one */ 1040 nm_prdis("roll back nkr_hwlease to %d", j); 1041 kring->nkr_hwlease = j; 1042 } else { 1043 while (howmany-- > 0) { 1044 ring->slot[j].len = 0; 1045 ring->slot[j].flags = 0; 1046 j = nm_next(j, lim); 1047 } 1048 } 1049 } 1050 p[lease_idx] = j; /* report I am done */ 1051 1052 update_pos = kring->nr_hwtail; 1053 1054 if (my_start == update_pos) { 1055 /* all slots before my_start have been reported, 1056 * so scan subsequent leases to see if other ranges 1057 * have been completed, and to a selwakeup or txsync. 1058 */ 1059 while (lease_idx != kring->nkr_lease_idx && 1060 p[lease_idx] != NR_NOSLOT) { 1061 j = p[lease_idx]; 1062 p[lease_idx] = NR_NOSLOT; 1063 lease_idx = nm_next(lease_idx, lim); 1064 } 1065 /* j is the new 'write' position. j != my_start 1066 * means there are new buffers to report 1067 */ 1068 if (likely(j != my_start)) { 1069 kring->nr_hwtail = j; 1070 still_locked = 0; 1071 mtx_unlock(&kring->q_lock); 1072 kring->nm_notify(kring, 0); 1073 /* this is netmap_notify for VALE ports and 1074 * netmap_bwrap_notify for bwrap. The latter will 1075 * trigger a txsync on the underlying hwna 1076 */ 1077 if (dst_na->retry && retry--) { 1078 /* XXX this is going to call nm_notify again. 1079 * Only useful for bwrap in virtual machines 1080 */ 1081 goto retry; 1082 } 1083 } 1084 } 1085 if (still_locked) 1086 mtx_unlock(&kring->q_lock); 1087 } 1088 cleanup: 1089 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ 1090 d->bq_len = 0; 1091 } 1092 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ 1093 brddst->bq_len = 0; 1094 return 0; 1095 } 1096 1097 /* nm_txsync callback for VALE ports */ 1098 static int 1099 netmap_vale_vp_txsync(struct netmap_kring *kring, int flags) 1100 { 1101 struct netmap_vp_adapter *na = 1102 (struct netmap_vp_adapter *)kring->na; 1103 u_int done; 1104 u_int const lim = kring->nkr_num_slots - 1; 1105 u_int const head = kring->rhead; 1106 1107 if (bridge_batch <= 0) { /* testing only */ 1108 done = head; // used all 1109 goto done; 1110 } 1111 if (!na->na_bdg) { 1112 done = head; 1113 goto done; 1114 } 1115 if (bridge_batch > NM_BDG_BATCH) 1116 bridge_batch = NM_BDG_BATCH; 1117 1118 done = nm_vale_preflush(kring, head); 1119 done: 1120 if (done != head) 1121 nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail); 1122 /* 1123 * packets between 'done' and 'cur' are left unsent. 1124 */ 1125 kring->nr_hwcur = done; 1126 kring->nr_hwtail = nm_prev(done, lim); 1127 if (netmap_debug & NM_DEBUG_TXSYNC) 1128 nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags); 1129 return 0; 1130 } 1131 1132 1133 /* create a netmap_vp_adapter that describes a VALE port. 1134 * Only persistent VALE ports have a non-null ifp. 1135 */ 1136 static int 1137 netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *ifp, 1138 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret) 1139 { 1140 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1141 struct netmap_vp_adapter *vpna; 1142 struct netmap_adapter *na; 1143 int error = 0; 1144 u_int npipes = 0; 1145 u_int extrabufs = 0; 1146 1147 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1148 return EINVAL; 1149 } 1150 1151 vpna = nm_os_malloc(sizeof(*vpna)); 1152 if (vpna == NULL) 1153 return ENOMEM; 1154 1155 na = &vpna->up; 1156 1157 na->ifp = ifp; 1158 strlcpy(na->name, hdr->nr_name, sizeof(na->name)); 1159 1160 /* bound checking */ 1161 na->num_tx_rings = req->nr_tx_rings; 1162 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1163 req->nr_tx_rings = na->num_tx_rings; /* write back */ 1164 na->num_rx_rings = req->nr_rx_rings; 1165 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1166 req->nr_rx_rings = na->num_rx_rings; /* write back */ 1167 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1168 1, NM_BDG_MAXSLOTS, NULL); 1169 na->num_tx_desc = req->nr_tx_slots; 1170 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1171 1, NM_BDG_MAXSLOTS, NULL); 1172 /* validate number of pipes. We want at least 1, 1173 * but probably can do with some more. 1174 * So let's use 2 as default (when 0 is supplied) 1175 */ 1176 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); 1177 /* validate extra bufs */ 1178 extrabufs = req->nr_extra_bufs; 1179 nm_bound_var(&extrabufs, 0, 0, 1180 128*NM_BDG_MAXSLOTS, NULL); 1181 req->nr_extra_bufs = extrabufs; /* write back */ 1182 na->num_rx_desc = req->nr_rx_slots; 1183 /* Set the mfs to a default value, as it is needed on the VALE 1184 * mismatch datapath. XXX We should set it according to the MTU 1185 * known to the kernel. */ 1186 vpna->mfs = NM_BDG_MFS_DEFAULT; 1187 vpna->last_smac = ~0llu; 1188 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 1189 vpna->mfs = netmap_buf_size; */ 1190 if (netmap_verbose) 1191 nm_prinf("max frame size %u", vpna->mfs); 1192 1193 na->na_flags |= (NAF_BDG_MAYSLEEP | NAF_OFFSETS); 1194 /* persistent VALE ports look like hw devices 1195 * with a native netmap adapter 1196 */ 1197 if (ifp) 1198 na->na_flags |= NAF_NATIVE; 1199 na->nm_txsync = netmap_vale_vp_txsync; 1200 na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */ 1201 na->nm_register = netmap_vp_reg; /* use the one provided by bdg */ 1202 na->nm_krings_create = netmap_vale_vp_krings_create; 1203 na->nm_krings_delete = netmap_vale_vp_krings_delete; 1204 na->nm_dtor = netmap_vale_vp_dtor; 1205 nm_prdis("nr_mem_id %d", req->nr_mem_id); 1206 na->nm_mem = nmd ? 1207 netmap_mem_get(nmd): 1208 netmap_mem_private_new( 1209 na->num_tx_rings, na->num_tx_desc, 1210 na->num_rx_rings, na->num_rx_desc, 1211 req->nr_extra_bufs, npipes, &error); 1212 if (na->nm_mem == NULL) 1213 goto err; 1214 na->nm_bdg_attach = netmap_vale_vp_bdg_attach; 1215 /* other nmd fields are set in the common routine */ 1216 error = netmap_attach_common(na); 1217 if (error) 1218 goto err; 1219 *ret = vpna; 1220 return 0; 1221 1222 err: 1223 if (na->nm_mem != NULL) 1224 netmap_mem_put(na->nm_mem); 1225 nm_os_free(vpna); 1226 return error; 1227 } 1228 1229 /* nm_bdg_attach callback for VALE ports 1230 * The na_vp port is this same netmap_adapter. There is no host port. 1231 */ 1232 static int 1233 netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na, 1234 struct nm_bridge *b) 1235 { 1236 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; 1237 1238 if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) { 1239 return NM_NEED_BWRAP; 1240 } 1241 na->na_vp = vpna; 1242 strlcpy(na->name, name, sizeof(na->name)); 1243 na->na_hostvp = NULL; 1244 return 0; 1245 } 1246 1247 static int 1248 netmap_vale_bwrap_krings_create(struct netmap_adapter *na) 1249 { 1250 int error; 1251 1252 /* impersonate a netmap_vp_adapter */ 1253 error = netmap_vale_vp_krings_create(na); 1254 if (error) 1255 return error; 1256 error = netmap_bwrap_krings_create_common(na); 1257 if (error) { 1258 netmap_vale_vp_krings_delete(na); 1259 } 1260 return error; 1261 } 1262 1263 static void 1264 netmap_vale_bwrap_krings_delete(struct netmap_adapter *na) 1265 { 1266 netmap_bwrap_krings_delete_common(na); 1267 netmap_vale_vp_krings_delete(na); 1268 } 1269 1270 static int 1271 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) 1272 { 1273 struct netmap_bwrap_adapter *bna; 1274 struct netmap_adapter *na = NULL; 1275 struct netmap_adapter *hostna = NULL; 1276 int error; 1277 1278 bna = nm_os_malloc(sizeof(*bna)); 1279 if (bna == NULL) { 1280 return ENOMEM; 1281 } 1282 na = &bna->up.up; 1283 strlcpy(na->name, nr_name, sizeof(na->name)); 1284 na->nm_register = netmap_bwrap_reg; 1285 na->nm_txsync = netmap_vale_vp_txsync; 1286 // na->nm_rxsync = netmap_bwrap_rxsync; 1287 na->nm_krings_create = netmap_vale_bwrap_krings_create; 1288 na->nm_krings_delete = netmap_vale_bwrap_krings_delete; 1289 na->nm_notify = netmap_bwrap_notify; 1290 bna->nm_intr_notify = netmap_bwrap_intr_notify; 1291 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ 1292 /* Set the mfs, needed on the VALE mismatch datapath. */ 1293 bna->up.mfs = NM_BDG_MFS_DEFAULT; 1294 1295 if (hwna->na_flags & NAF_HOST_RINGS) { 1296 hostna = &bna->host.up; 1297 hostna->nm_notify = netmap_bwrap_notify; 1298 bna->host.mfs = NM_BDG_MFS_DEFAULT; 1299 } 1300 1301 error = netmap_bwrap_attach_common(na, hwna); 1302 if (error) { 1303 nm_os_free(bna); 1304 } 1305 return error; 1306 } 1307 1308 int 1309 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na, 1310 struct netmap_mem_d *nmd, int create) 1311 { 1312 return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops); 1313 } 1314 1315 1316 /* creates a persistent VALE port */ 1317 int 1318 nm_vi_create(struct nmreq_header *hdr) 1319 { 1320 struct nmreq_vale_newif *req = 1321 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body; 1322 int error = 0; 1323 /* Build a nmreq_register out of the nmreq_vale_newif, 1324 * so that we can call netmap_get_bdg_na(). */ 1325 struct nmreq_register regreq; 1326 bzero(®req, sizeof(regreq)); 1327 regreq.nr_tx_slots = req->nr_tx_slots; 1328 regreq.nr_rx_slots = req->nr_rx_slots; 1329 regreq.nr_tx_rings = req->nr_tx_rings; 1330 regreq.nr_rx_rings = req->nr_rx_rings; 1331 regreq.nr_mem_id = req->nr_mem_id; 1332 hdr->nr_reqtype = NETMAP_REQ_REGISTER; 1333 hdr->nr_body = (uintptr_t)®req; 1334 error = netmap_vi_create(hdr, 0 /* no autodelete */); 1335 hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF; 1336 hdr->nr_body = (uintptr_t)req; 1337 /* Write back to the original struct. */ 1338 req->nr_tx_slots = regreq.nr_tx_slots; 1339 req->nr_rx_slots = regreq.nr_rx_slots; 1340 req->nr_tx_rings = regreq.nr_tx_rings; 1341 req->nr_rx_rings = regreq.nr_rx_rings; 1342 req->nr_mem_id = regreq.nr_mem_id; 1343 return error; 1344 } 1345 1346 /* remove a persistent VALE port from the system */ 1347 int 1348 nm_vi_destroy(const char *name) 1349 { 1350 struct ifnet *ifp; 1351 struct netmap_vp_adapter *vpna; 1352 int error; 1353 1354 ifp = ifunit_ref(name); 1355 if (!ifp) 1356 return ENXIO; 1357 NMG_LOCK(); 1358 /* make sure this is actually a VALE port */ 1359 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { 1360 error = EINVAL; 1361 goto err; 1362 } 1363 1364 vpna = (struct netmap_vp_adapter *)NA(ifp); 1365 1366 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */ 1367 if (vpna->autodelete) { 1368 error = EINVAL; 1369 goto err; 1370 } 1371 1372 /* also make sure that nobody is using the interface */ 1373 if (NETMAP_OWNED_BY_ANY(&vpna->up) || 1374 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) { 1375 error = EBUSY; 1376 goto err; 1377 } 1378 1379 NMG_UNLOCK(); 1380 1381 if (netmap_verbose) 1382 nm_prinf("destroying a persistent vale interface %s", ifp->if_xname); 1383 /* Linux requires all the references are released 1384 * before unregister 1385 */ 1386 netmap_detach(ifp); 1387 if_rele(ifp); 1388 nm_os_vi_detach(ifp); 1389 return 0; 1390 1391 err: 1392 NMG_UNLOCK(); 1393 if_rele(ifp); 1394 return error; 1395 } 1396 1397 static int 1398 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na) 1399 { 1400 req->nr_rx_rings = na->num_rx_rings; 1401 req->nr_tx_rings = na->num_tx_rings; 1402 req->nr_rx_slots = na->num_rx_desc; 1403 req->nr_tx_slots = na->num_tx_desc; 1404 return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL, 1405 &req->nr_mem_id); 1406 } 1407 1408 1409 /* 1410 * Create a virtual interface registered to the system. 1411 * The interface will be attached to a bridge later. 1412 */ 1413 int 1414 netmap_vi_create(struct nmreq_header *hdr, int autodelete) 1415 { 1416 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1417 struct ifnet *ifp; 1418 struct netmap_vp_adapter *vpna; 1419 struct netmap_mem_d *nmd = NULL; 1420 int error; 1421 1422 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1423 return EINVAL; 1424 } 1425 1426 /* don't include VALE prefix */ 1427 if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) 1428 return EINVAL; 1429 if (strlen(hdr->nr_name) >= IFNAMSIZ) { 1430 return EINVAL; 1431 } 1432 ifp = ifunit_ref(hdr->nr_name); 1433 if (ifp) { /* already exist, cannot create new one */ 1434 error = EEXIST; 1435 NMG_LOCK(); 1436 if (NM_NA_VALID(ifp)) { 1437 int update_err = nm_update_info(req, NA(ifp)); 1438 if (update_err) 1439 error = update_err; 1440 } 1441 NMG_UNLOCK(); 1442 if_rele(ifp); 1443 return error; 1444 } 1445 error = nm_os_vi_persist(hdr->nr_name, &ifp); 1446 if (error) 1447 return error; 1448 1449 NMG_LOCK(); 1450 if (req->nr_mem_id) { 1451 nmd = netmap_mem_find(req->nr_mem_id); 1452 if (nmd == NULL) { 1453 error = EINVAL; 1454 goto err_1; 1455 } 1456 } 1457 /* netmap_vp_create creates a struct netmap_vp_adapter */ 1458 error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna); 1459 if (error) { 1460 if (netmap_debug & NM_DEBUG_VALE) 1461 nm_prerr("error %d", error); 1462 goto err_1; 1463 } 1464 /* persist-specific routines */ 1465 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; 1466 if (!autodelete) { 1467 netmap_adapter_get(&vpna->up); 1468 } else { 1469 vpna->autodelete = 1; 1470 } 1471 NM_ATTACH_NA(ifp, &vpna->up); 1472 /* return the updated info */ 1473 error = nm_update_info(req, &vpna->up); 1474 if (error) { 1475 goto err_2; 1476 } 1477 nm_prdis("returning nr_mem_id %d", req->nr_mem_id); 1478 if (nmd) 1479 netmap_mem_put(nmd); 1480 NMG_UNLOCK(); 1481 nm_prdis("created %s", ifp->if_xname); 1482 return 0; 1483 1484 err_2: 1485 netmap_detach(ifp); 1486 err_1: 1487 if (nmd) 1488 netmap_mem_put(nmd); 1489 NMG_UNLOCK(); 1490 nm_os_vi_detach(ifp); 1491 1492 return error; 1493 } 1494 1495 #endif /* WITH_VALE */ 1496