1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (C) 2013-2016 Universita` di Pisa 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
27 */ 28 29 30 #if defined(__FreeBSD__) 31 #include <sys/cdefs.h> /* prerequisite */ 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/types.h> 35 #include <sys/errno.h> 36 #include <sys/param.h> /* defines used in kernel.h */ 37 #include <sys/kernel.h> /* types used in module initialization */ 38 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 39 #include <sys/sockio.h> 40 #include <sys/socketvar.h> /* struct socket */ 41 #include <sys/malloc.h> 42 #include <sys/poll.h> 43 #include <sys/rwlock.h> 44 #include <sys/socket.h> /* sockaddrs */ 45 #include <sys/selinfo.h> 46 #include <sys/sysctl.h> 47 #include <net/if.h> 48 #include <net/if_var.h> 49 #include <net/bpf.h> /* BIOCIMMEDIATE */ 50 #include <machine/bus.h> /* bus_dmamap_* */ 51 #include <sys/endian.h> 52 #include <sys/refcount.h> 53 #include <sys/smp.h> 54 55 56 #elif defined(linux) 57 58 #include "bsd_glue.h" 59 60 #elif defined(__APPLE__) 61 62 #warning OSX support is only partial 63 #include "osx_glue.h" 64 65 #elif defined(_WIN32) 66 #include "win_glue.h" 67 68 #else 69 70 #error Unsupported platform 71 72 #endif /* unsupported */ 73 74 /* 75 * common headers 76 */ 77 78 #include <net/netmap.h> 79 #include <dev/netmap/netmap_kern.h> 80 #include <dev/netmap/netmap_mem2.h> 81 #include <dev/netmap/netmap_bdg.h> 82 83 #ifdef WITH_VALE 84 85 /* 86 * system parameters (most of them in netmap_kern.h) 87 * NM_BDG_NAME prefix for switch port names, default "vale" 88 * NM_BDG_MAXPORTS number of ports 89 * NM_BRIDGES max number of switches in the system. 90 * 91 * Switch ports are named valeX:Y where X is the switch name and Y 92 * is the port. If Y matches a physical interface name, the port is 93 * connected to a physical device. 94 * 95 * Unlike physical interfaces, switch ports use their own memory region 96 * for rings and buffers. 97 * The virtual interfaces use per-queue lock instead of core lock. 98 * In the tx loop, we aggregate traffic in batches to make all operations 99 * faster. 
 * The batch size is bridge_batch.
 */
/*
 * Sizing constants for VALE ports.
 * NM_BDG_MAXRINGS must be a power of 2: nm_vale_flush() recovers the
 * ring number from a destination index with '& (NM_BDG_MAXRINGS-1)'.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many (must be a power of 2). */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NETMAP_MAX_FRAGS)
/*
 * NM_FT_NULL terminates a list of slots in the ft. It is one past the
 * last valid ft index, so it compares higher than any valid index.
 */
#define NM_FT_NULL		NM_BDG_BATCH_MAX


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual value may be larger as the
 * last packet in the block may overflow the size.
 * netmap_vale_vp_txsync() clamps it to NM_BDG_BATCH and treats
 * values <= 0 as "consume everything without forwarding" (testing only).
 */
static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */

/* Max number of vale bridges (loader tunable). */
unsigned int vale_max_bridges = NM_BRIDGES;

/* Export the two tunables above under the dev.netmap sysctl node. */
SYSBEGIN(vars_vale);
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
		"Max batch size to be used in the bridge");
SYSCTL_UINT(_dev_netmap, OID_AUTO, max_bridges, CTLFLAG_RDTUN, &vale_max_bridges, 0,
		"Max number of vale bridges");
SYSEND;

/* Forward declarations: referenced by vale_bdg_ops before their definitions. */
static int netmap_vale_vp_create(struct nmreq_header *hdr, if_t,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *,
		struct nm_bridge *);
static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *);

/*
 * For each output interface, nm_vale_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
139 */ 140 struct nm_vale_q { 141 uint16_t bq_head; 142 uint16_t bq_tail; 143 uint32_t bq_len; /* number of buffers */ 144 }; 145 146 /* Holds the default callbacks */ 147 struct netmap_bdg_ops vale_bdg_ops = { 148 .lookup = netmap_vale_learning, 149 .config = NULL, 150 .dtor = NULL, 151 .vp_create = netmap_vale_vp_create, 152 .bwrap_attach = netmap_vale_bwrap_attach, 153 .name = NM_BDG_NAME, 154 }; 155 156 /* 157 * this is a slightly optimized copy routine which rounds 158 * to multiple of 64 bytes and is often faster than dealing 159 * with other odd sizes. We assume there is enough room 160 * in the source and destination buffers. 161 * 162 * XXX only for multiples of NM_BUF_ALIGN bytes, non overlapped. 163 */ 164 165 static inline void 166 pkt_copy(void *_src, void *_dst, int l) 167 { 168 uint64_t *src = _src; 169 uint64_t *dst = _dst; 170 if (unlikely(l >= 1024)) { 171 memcpy(dst, src, l); 172 return; 173 } 174 for (; likely(l > 0); l -= NM_BUF_ALIGN) { 175 /* XXX NM_BUF_ALIGN/sizeof(uint64_t) statements */ 176 *dst++ = *src++; 177 *dst++ = *src++; 178 *dst++ = *src++; 179 *dst++ = *src++; 180 *dst++ = *src++; 181 *dst++ = *src++; 182 *dst++ = *src++; 183 *dst++ = *src++; 184 } 185 } 186 187 188 /* 189 * Free the forwarding tables for rings attached to switch ports. 190 */ 191 static void 192 nm_free_bdgfwd(struct netmap_adapter *na) 193 { 194 int nrings, i; 195 struct netmap_kring **kring; 196 197 NMG_LOCK_ASSERT(); 198 nrings = na->num_tx_rings; 199 kring = na->tx_rings; 200 for (i = 0; i < nrings; i++) { 201 if (kring[i]->nkr_ft) { 202 nm_os_free(kring[i]->nkr_ft); 203 kring[i]->nkr_ft = NULL; /* protect from freeing twice */ 204 } 205 } 206 } 207 208 209 /* 210 * Allocate the forwarding tables for the rings attached to the bridge ports. 
211 */ 212 static int 213 nm_alloc_bdgfwd(struct netmap_adapter *na) 214 { 215 int nrings, l, i, num_dstq; 216 struct netmap_kring **kring; 217 218 NMG_LOCK_ASSERT(); 219 /* all port:rings + broadcast */ 220 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 221 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 222 l += sizeof(struct nm_vale_q) * num_dstq; 223 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 224 225 nrings = netmap_real_rings(na, NR_TX); 226 kring = na->tx_rings; 227 for (i = 0; i < nrings; i++) { 228 struct nm_bdg_fwd *ft; 229 struct nm_vale_q *dstq; 230 int j; 231 232 ft = nm_os_malloc(l); 233 if (!ft) { 234 nm_free_bdgfwd(na); 235 return ENOMEM; 236 } 237 dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX); 238 for (j = 0; j < num_dstq; j++) { 239 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 240 dstq[j].bq_len = 0; 241 } 242 kring[i]->nkr_ft = ft; 243 } 244 return 0; 245 } 246 247 /* Allows external modules to create bridges in exclusive mode, 248 * returns an authentication token that the external module will need 249 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(), 250 * and nm_bdg_update_private_data() operations. 251 * Successfully executed if ret != NULL and *return_status == 0. 252 */ 253 void * 254 netmap_vale_create(const char *bdg_name, int *return_status) 255 { 256 struct nm_bridge *b = NULL; 257 void *ret = NULL; 258 259 NMG_LOCK(); 260 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 261 if (b) { 262 *return_status = EEXIST; 263 goto unlock_bdg_create; 264 } 265 266 b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops); 267 if (!b) { 268 *return_status = ENOMEM; 269 goto unlock_bdg_create; 270 } 271 272 b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE; 273 ret = nm_bdg_get_auth_token(b); 274 *return_status = 0; 275 276 unlock_bdg_create: 277 NMG_UNLOCK(); 278 return ret; 279 } 280 281 /* Allows external modules to destroy a bridge created through 282 * netmap_bdg_create(), the bridge must be empty. 
283 */ 284 int 285 netmap_vale_destroy(const char *bdg_name, void *auth_token) 286 { 287 struct nm_bridge *b = NULL; 288 int ret = 0; 289 290 NMG_LOCK(); 291 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 292 if (!b) { 293 ret = ENXIO; 294 goto unlock_bdg_free; 295 } 296 297 if (!nm_bdg_valid_auth_token(b, auth_token)) { 298 ret = EACCES; 299 goto unlock_bdg_free; 300 } 301 if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) { 302 ret = EINVAL; 303 goto unlock_bdg_free; 304 } 305 306 b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE); 307 ret = netmap_bdg_free(b); 308 if (ret) { 309 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE; 310 } 311 312 unlock_bdg_free: 313 NMG_UNLOCK(); 314 return ret; 315 } 316 317 /* Process NETMAP_REQ_VALE_LIST. */ 318 int 319 netmap_vale_list(struct nmreq_header *hdr) 320 { 321 struct nmreq_vale_list *req = 322 (struct nmreq_vale_list *)(uintptr_t)hdr->nr_body; 323 int namelen = strlen(hdr->nr_name); 324 struct nm_bridge *b, *bridges; 325 struct netmap_vp_adapter *vpna; 326 int error = 0, i, j; 327 u_int num_bridges; 328 329 netmap_bns_getbridges(&bridges, &num_bridges); 330 331 /* this is used to enumerate bridges and ports */ 332 if (namelen) { /* look up indexes of bridge and port */ 333 if (strncmp(hdr->nr_name, NM_BDG_NAME, 334 strlen(NM_BDG_NAME))) { 335 return EINVAL; 336 } 337 NMG_LOCK(); 338 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL); 339 if (!b) { 340 NMG_UNLOCK(); 341 return ENOENT; 342 } 343 344 req->nr_bridge_idx = b - bridges; /* bridge index */ 345 req->nr_port_idx = NM_BDG_NOPORT; 346 for (j = 0; j < b->bdg_active_ports; j++) { 347 i = b->bdg_port_index[j]; 348 vpna = b->bdg_ports[i]; 349 if (vpna == NULL) { 350 nm_prerr("This should not happen"); 351 continue; 352 } 353 /* the former and the latter identify a 354 * virtual port and a NIC, respectively 355 */ 356 if (!strcmp(vpna->up.name, hdr->nr_name)) { 357 req->nr_port_idx = i; /* port index */ 358 break; 359 } 360 } 361 NMG_UNLOCK(); 362 } else { 
363 /* return the first non-empty entry starting from 364 * bridge nr_arg1 and port nr_arg2. 365 * 366 * Users can detect the end of the same bridge by 367 * seeing the new and old value of nr_arg1, and can 368 * detect the end of all the bridge by error != 0 369 */ 370 i = req->nr_bridge_idx; 371 j = req->nr_port_idx; 372 373 NMG_LOCK(); 374 for (error = ENOENT; i < vale_max_bridges; i++) { 375 b = bridges + i; 376 for ( ; j < NM_BDG_MAXPORTS; j++) { 377 if (b->bdg_ports[j] == NULL) 378 continue; 379 vpna = b->bdg_ports[j]; 380 /* write back the VALE switch name */ 381 strlcpy(hdr->nr_name, vpna->up.name, 382 sizeof(hdr->nr_name)); 383 error = 0; 384 goto out; 385 } 386 j = 0; /* following bridges scan from 0 */ 387 } 388 out: 389 req->nr_bridge_idx = i; 390 req->nr_port_idx = j; 391 NMG_UNLOCK(); 392 } 393 394 return error; 395 } 396 397 398 /* nm_dtor callback for ephemeral VALE ports */ 399 static void 400 netmap_vale_vp_dtor(struct netmap_adapter *na) 401 { 402 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; 403 struct nm_bridge *b = vpna->na_bdg; 404 405 nm_prdis("%s has %d references", na->name, na->na_refcount); 406 407 if (b) { 408 netmap_bdg_detach_common(b, vpna->bdg_port, -1); 409 } 410 411 if (na->ifp != NULL && !nm_iszombie(na)) { 412 NM_DETACH_NA(na->ifp); 413 if (vpna->autodelete) { 414 nm_prdis("releasing %s", if_name(na->ifp)); 415 NMG_UNLOCK(); 416 nm_os_vi_detach(na->ifp); 417 NMG_LOCK(); 418 } 419 } 420 } 421 422 423 424 /* nm_krings_create callback for VALE ports. 425 * Calls the standard netmap_krings_create, then adds leases on rx 426 * rings and bdgfwd on tx rings. 
427 */ 428 static int 429 netmap_vale_vp_krings_create(struct netmap_adapter *na) 430 { 431 u_int tailroom; 432 int error, i; 433 uint32_t *leases; 434 u_int nrx = netmap_real_rings(na, NR_RX); 435 436 /* 437 * Leases are attached to RX rings on vale ports 438 */ 439 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; 440 441 error = netmap_krings_create(na, tailroom); 442 if (error) 443 return error; 444 445 leases = na->tailroom; 446 447 for (i = 0; i < nrx; i++) { /* Receive rings */ 448 na->rx_rings[i]->nkr_leases = leases; 449 leases += na->num_rx_desc; 450 } 451 452 error = nm_alloc_bdgfwd(na); 453 if (error) { 454 netmap_krings_delete(na); 455 return error; 456 } 457 458 return 0; 459 } 460 461 462 /* nm_krings_delete callback for VALE ports. */ 463 static void 464 netmap_vale_vp_krings_delete(struct netmap_adapter *na) 465 { 466 nm_free_bdgfwd(na); 467 netmap_krings_delete(na); 468 } 469 470 471 static int 472 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, 473 struct netmap_vp_adapter *na, u_int ring_nr); 474 475 476 /* 477 * main dispatch routine for the bridge. 478 * Grab packets from a kring, move them into the ft structure 479 * associated to the tx (input) port. Max one instance per port, 480 * filtered on input (ioctl, poll or XXX). 481 * Returns the next position in the ring. 482 */ 483 static int 484 nm_vale_preflush(struct netmap_kring *kring, u_int end) 485 { 486 struct netmap_vp_adapter *na = 487 (struct netmap_vp_adapter*)kring->na; 488 struct netmap_ring *ring = kring->ring; 489 struct nm_bdg_fwd *ft; 490 u_int ring_nr = kring->ring_id; 491 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 492 u_int ft_i = 0; /* start from 0 */ 493 u_int frags = 1; /* how many frags ? */ 494 struct nm_bridge *b = na->na_bdg; 495 496 /* To protect against modifications to the bridge we acquire a 497 * shared lock, waiting if we can sleep (if the source port is 498 * attached to a user process) or with a trylock otherwise (NICs). 
499 */ 500 nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 501 if (na->up.na_flags & NAF_BDG_MAYSLEEP) 502 BDG_RLOCK(b); 503 else if (!BDG_RTRYLOCK(b)) 504 return j; 505 nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 506 ft = kring->nkr_ft; 507 508 for (; likely(j != end); j = nm_next(j, lim)) { 509 struct netmap_slot *slot = &ring->slot[j]; 510 char *buf; 511 512 ft[ft_i].ft_len = slot->len; 513 ft[ft_i].ft_flags = slot->flags; 514 ft[ft_i].ft_offset = 0; 515 516 nm_prdis("flags is 0x%x", slot->flags); 517 /* we do not use the buf changed flag, but we still need to reset it */ 518 slot->flags &= ~NS_BUF_CHANGED; 519 520 /* this slot goes into a list so initialize the link field */ 521 ft[ft_i].ft_next = NM_FT_NULL; 522 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 523 (void *)(uintptr_t)slot->ptr : NMB_O(kring, slot); 524 if (unlikely(buf == NULL || 525 slot->len > NETMAP_BUF_SIZE(&na->up) - nm_get_offset(kring, slot))) { 526 nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d", 527 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT", 528 kring->name, j, ft[ft_i].ft_len); 529 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up); 530 ft[ft_i].ft_len = 0; 531 ft[ft_i].ft_flags = 0; 532 } 533 __builtin_prefetch(buf); 534 ++ft_i; 535 if (slot->flags & NS_MOREFRAG) { 536 frags++; 537 continue; 538 } 539 if (unlikely(netmap_verbose && frags > 1)) 540 nm_prlim(5, "%d frags at %d", frags, ft_i - frags); 541 ft[ft_i - frags].ft_frags = frags; 542 frags = 1; 543 if (unlikely((int)ft_i >= bridge_batch)) 544 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 545 } 546 if (frags > 1) { 547 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we 548 * have to fix frags count. 
*/ 549 frags--; 550 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; 551 ft[ft_i - frags].ft_frags = frags; 552 nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags); 553 } 554 if (ft_i) 555 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 556 BDG_RUNLOCK(b); 557 return j; 558 } 559 560 561 /* ----- FreeBSD if_bridge hash function ------- */ 562 563 /* 564 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 565 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 566 * 567 * http://www.burtleburtle.net/bob/hash/spooky.html 568 */ 569 #define mix(a, b, c) \ 570 do { \ 571 a -= b; a -= c; a ^= (c >> 13); \ 572 b -= c; b -= a; b ^= (a << 8); \ 573 c -= a; c -= b; c ^= (b >> 13); \ 574 a -= b; a -= c; a ^= (c >> 12); \ 575 b -= c; b -= a; b ^= (a << 16); \ 576 c -= a; c -= b; c ^= (b >> 5); \ 577 a -= b; a -= c; a ^= (c >> 3); \ 578 b -= c; b -= a; b ^= (a << 10); \ 579 c -= a; c -= b; c ^= (b >> 15); \ 580 } while (/*CONSTCOND*/0) 581 582 583 static __inline uint32_t 584 nm_vale_rthash(const uint8_t *addr) 585 { 586 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key 587 588 b += addr[5] << 8; 589 b += addr[4]; 590 a += addr[3] << 24; 591 a += addr[2] << 16; 592 a += addr[1] << 8; 593 a += addr[0]; 594 595 mix(a, b, c); 596 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 597 return (c & BRIDGE_RTHASH_MASK); 598 } 599 600 #undef mix 601 602 603 /* 604 * Lookup function for a learning bridge. 
605 * Update the hash table with the source address, 606 * and then returns the destination port index, and the 607 * ring in *dst_ring (at the moment, always use ring 0) 608 */ 609 uint32_t 610 netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, 611 struct netmap_vp_adapter *na, void *private_data) 612 { 613 uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset; 614 u_int buf_len = ft->ft_len - ft->ft_offset; 615 struct nm_hash_ent *ht = private_data; 616 uint32_t sh, dh; 617 u_int dst, mysrc = na->bdg_port; 618 uint64_t smac, dmac; 619 uint8_t indbuf[12]; 620 621 if (buf_len < 14) { 622 return NM_BDG_NOPORT; 623 } 624 625 if (ft->ft_flags & NS_INDIRECT) { 626 if (copyin(buf, indbuf, sizeof(indbuf))) { 627 return NM_BDG_NOPORT; 628 } 629 buf = indbuf; 630 } 631 632 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 633 smac = le64toh(*(uint64_t *)(buf + 4)); 634 smac >>= 16; 635 636 /* 637 * The hash is somewhat expensive, there might be some 638 * worthwhile optimizations here. 639 */ 640 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */ 641 uint8_t *s = buf+6; 642 sh = nm_vale_rthash(s); /* hash of source */ 643 /* update source port forwarding entry */ 644 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */ 645 ht[sh].ports = mysrc; 646 if (netmap_debug & NM_DEBUG_VALE) 647 nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 648 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 649 } 650 dst = NM_BDG_BROADCAST; 651 if ((buf[0] & 1) == 0) { /* unicast */ 652 dh = nm_vale_rthash(buf); /* hash of dst */ 653 if (ht[dh].mac == dmac) { /* found dst */ 654 dst = ht[dh].ports; 655 } 656 } 657 return dst; 658 } 659 660 661 /* 662 * Available space in the ring. 
 * Only used in VALE code
 * and only with is_rx = 1
 *
 * For is_rx, the busy slots are those between nr_hwcur and
 * nkr_hwlease (modulo the ring size); one slot is always kept free.
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_tail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
		    k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
		    k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}




/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 *
 * The lease entry is marked NR_NOSLOT (not completed yet) and
 * nkr_hwlease is advanced by n, wrapping at the ring size. The caller
 * in this file invokes it with the kring's q_lock held.
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

#ifdef CONFIG_NETMAP_DEBUG
	if (n > nm_kr_space(k, is_rx)) {
		nm_prerr("invalid request for %d slots", n);
		panic("x");
	}
#endif /* CONFIG_NETMAP_DEBUG */
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

#ifdef CONFIG_NETMAP_DEBUG
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->name,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif /* CONFIG_NETMAP_DEBUG */
	return lease_idx;
}

/*
 * Forward the first n entries of the work area ft, which were read from
 * ring ring_nr of source port na.
 *
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 *
 * Pass 1 asks the bridge lookup callback for a destination for each
 * packet and links the packet into that destination's queue.
 * Pass 2 visits every destination that received something: it leases
 * slots on the destination rx kring under q_lock, copies the packets
 * with the lock dropped, then re-takes the lock to report completion
 * and, when possible, advance nr_hwtail and notify the receiver.
 * All queues are reset to empty before returning. Always returns 0.
 */
int
nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_vale_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_vale_q *d;
		struct nm_bdg_fwd *start_ft = NULL;

		nm_prdis("slot %d frags %d", i, ft[i].ft_frags);

		/* locate the fragment where the payload (after the
		 * virtio-net header) begins
		 */
		if (na->up.virt_hdr_len < ft[i].ft_len) {
			ft[i].ft_offset = na->up.virt_hdr_len;
			start_ft = &ft[i];
		} else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
			ft[i].ft_offset = ft[i].ft_len;
			start_ft = &ft[i+1];
		} else {
			/* Drop the packet if the virtio-net header is not into the first
			 * fragment nor at the very beginning of the second.
			 */
			continue;
		}
		dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data);
		if (netmap_verbose > 255)
			nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port >= NM_BDG_NOPORT)
			continue; /* this packet is identified to be dropped */
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * Rings that already have unicast traffic are in the list already.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		u_int j;
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_vale_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		nm_prdis("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			nm_prdis("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so
		 * there is a chance that we may not use all of the slots
		 * we have claimed, so we will need to handle the leftover
		 * ones when we regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
			if (netmap_verbose) {
				nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
						dst_na->up.virt_hdr_len);
			}
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
				needed = (needed * na->mfs) /
					(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		nm_prdis(5, "pass 2 dst %d is %x %s",
		    i, d_i, nm_is_bwrap(&dst_na->up) ? "nic/host" : "virtual");
		/* low bits of the destination index select the ring; fold it
		 * onto the rings the destination actually has
		 */
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		/* the destination ring may have not been opened for RX */
		if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
			goto cleanup;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			kring->nm_notify(kring, NAF_FORCE_RECLAIM);
			/* actually useful only for bwraps, since there
			 * the notify will trigger a txsync on the hwna. VALE ports
			 * have dst_na->retry == 0
			 */
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
				break; /* no more space */
			if (netmap_verbose && cnt > 1)
				nm_prlim(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;
					uintptr_t src_cb;
					uint64_t dstoff, dstoff_cb;
					int src_co, dst_co;
					const uintptr_t mask = NM_BUF_ALIGN - 1;

					slot = &ring->slot[j];
					dst = NMB(&dst_na->up, slot);
					dstoff = nm_get_offset(kring, slot);
					/* *_cb: address/offset rounded down to a
					 * NM_BUF_ALIGN boundary; *_co: position
					 * inside that aligned chunk. The destination
					 * offset is moved so that it has the same
					 * in-chunk position as the source, which lets
					 * pkt_copy() work on aligned chunks.
					 */
					dstoff_cb = dstoff & ~mask;
					src_cb = ((uintptr_t)src) & ~mask;
					src_co = ((uintptr_t)src) & mask;
					dst_co = ((uintptr_t)(dst + dstoff)) & mask;
					if (dst_co < src_co) {
						dstoff_cb += NM_BUF_ALIGN;
					}
					dstoff = dstoff_cb + src_co;
					copy_len += src_co;

					nm_prdis("send [%d] %d(%d) bytes at %s:%d",
						i, (int)copy_len, (int)dst_len,
						NM_IFPNAME(dst_ifp), j);

					/* does the fragment still fit after the
					 * offset adjustment? if not, emit an empty slot
					 */
					if (unlikely(dstoff > NETMAP_BUF_SIZE(&dst_na->up) ||
						     dst_len > NETMAP_BUF_SIZE(&dst_na->up) - dstoff)) {
						nm_prlim(5, "dropping packet/fragment of len %zu, dest offset %llu",
								dst_len, (unsigned long long)dstoff);
						copy_len = dst_len = 0;
						dstoff = nm_get_offset(kring, slot);
					}

					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy((char *)src_cb, dst + dstoff_cb, (int)copy_len);
					}
					slot->len = dst_len;
					/* fragment count goes in the bits above the low byte */
					slot->flags = (cnt << 8)| NS_MOREFRAG;
					nm_write_offset(kring, slot, dstoff);
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear flag on last entry */
			}
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
			/* current position */
			uint32_t *p = kring->nkr_leases; /* shorthand */
			uint32_t update_pos;
			int still_locked = 1;

			mtx_lock(&kring->q_lock);
			if (unlikely(howmany > 0)) {
				/* not used all bufs. If i am the last one
				 * i can recover the slots, otherwise must
				 * fill them with 0 to mark empty packets.
				 */
				nm_prdis("leftover %d bufs", howmany);
				if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
					/* yes i am the last one */
					nm_prdis("roll back nkr_hwlease to %d", j);
					kring->nkr_hwlease = j;
				} else {
					while (howmany-- > 0) {
						ring->slot[j].len = 0;
						ring->slot[j].flags = 0;
						j = nm_next(j, lim);
					}
				}
			}
			p[lease_idx] = j; /* report I am done */

			update_pos = kring->nr_hwtail;

			if (my_start == update_pos) {
				/* all slots before my_start have been reported,
				 * so scan subsequent leases to see if other ranges
				 * have been completed, and do a selwakeup or txsync.
				 */
				while (lease_idx != kring->nkr_lease_idx &&
					p[lease_idx] != NR_NOSLOT) {
					j = p[lease_idx];
					p[lease_idx] = NR_NOSLOT;
					lease_idx = nm_next(lease_idx, lim);
				}
				/* j is the new 'write' position. j != my_start
				 * means there are new buffers to report
				 */
				if (likely(j != my_start)) {
					kring->nr_hwtail = j;
					still_locked = 0;
					mtx_unlock(&kring->q_lock);
					kring->nm_notify(kring, 0);
					/* this is netmap_notify for VALE ports and
					 * netmap_bwrap_notify for bwrap. The latter will
					 * trigger a txsync on the underlying hwna
					 */
					if (dst_na->retry && retry--) {
						/* XXX this is going to call nm_notify again.
						 * Only useful for bwrap in virtual machines
						 */
						goto retry;
					}
				}
			}
			if (still_locked)
				mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}

/* nm_txsync callback for VALE ports.
 *
 * Hands the slots between nr_hwcur and rhead to nm_vale_preflush().
 * When bridge_batch <= 0 (testing only) or the port is not attached to
 * a bridge, the slots are consumed without being forwarded. As a side
 * effect the global bridge_batch is clamped to NM_BDG_BATCH.
 * Always returns 0.
 */
static int
netmap_vale_vp_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_vp_adapter *na =
		(struct netmap_vp_adapter *)kring->na;
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;

	if (bridge_batch <= 0) { /* testing only */
		done = head; // used all
		goto done;
	}
	if (!na->na_bdg) {
		done = head;
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_vale_preflush(kring, head);
done:
	if (done != head)
		nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	/* everything already processed is available again to the sender */
	kring->nr_hwtail = nm_prev(done, lim);
	if (netmap_debug & NM_DEBUG_TXSYNC)
		nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
	return 0;
}


/* create a netmap_vp_adapter that describes a VALE port.
 * Only persistent VALE ports have a non-null ifp.
1140 */ 1141 static int 1142 netmap_vale_vp_create(struct nmreq_header *hdr, if_t ifp, 1143 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret) 1144 { 1145 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1146 struct netmap_vp_adapter *vpna; 1147 struct netmap_adapter *na; 1148 int error = 0; 1149 u_int npipes = 0; 1150 u_int extrabufs = 0; 1151 1152 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1153 return EINVAL; 1154 } 1155 1156 vpna = nm_os_malloc(sizeof(*vpna)); 1157 if (vpna == NULL) 1158 return ENOMEM; 1159 1160 na = &vpna->up; 1161 1162 na->ifp = ifp; 1163 strlcpy(na->name, hdr->nr_name, sizeof(na->name)); 1164 1165 /* bound checking */ 1166 na->num_tx_rings = req->nr_tx_rings; 1167 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1168 req->nr_tx_rings = na->num_tx_rings; /* write back */ 1169 na->num_rx_rings = req->nr_rx_rings; 1170 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1171 req->nr_rx_rings = na->num_rx_rings; /* write back */ 1172 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1173 1, NM_BDG_MAXSLOTS, NULL); 1174 na->num_tx_desc = req->nr_tx_slots; 1175 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1176 1, NM_BDG_MAXSLOTS, NULL); 1177 /* validate number of pipes. We want at least 1, 1178 * but probably can do with some more. 1179 * So let's use 2 as default (when 0 is supplied) 1180 */ 1181 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); 1182 /* validate extra bufs */ 1183 extrabufs = req->nr_extra_bufs; 1184 nm_bound_var(&extrabufs, 0, 0, 1185 128*NM_BDG_MAXSLOTS, NULL); 1186 req->nr_extra_bufs = extrabufs; /* write back */ 1187 na->num_rx_desc = req->nr_rx_slots; 1188 /* Set the mfs to a default value, as it is needed on the VALE 1189 * mismatch datapath. XXX We should set it according to the MTU 1190 * known to the kernel. */ 1191 vpna->mfs = NM_BDG_MFS_DEFAULT; 1192 vpna->last_smac = ~0llu; 1193 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 
1194 vpna->mfs = netmap_buf_size; */ 1195 if (netmap_verbose) 1196 nm_prinf("max frame size %u", vpna->mfs); 1197 1198 na->na_flags |= (NAF_BDG_MAYSLEEP | NAF_OFFSETS); 1199 /* persistent VALE ports look like hw devices 1200 * with a native netmap adapter 1201 */ 1202 if (ifp) 1203 na->na_flags |= NAF_NATIVE; 1204 na->nm_txsync = netmap_vale_vp_txsync; 1205 na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */ 1206 na->nm_register = netmap_vp_reg; /* use the one provided by bdg */ 1207 na->nm_krings_create = netmap_vale_vp_krings_create; 1208 na->nm_krings_delete = netmap_vale_vp_krings_delete; 1209 na->nm_dtor = netmap_vale_vp_dtor; 1210 nm_prdis("nr_mem_id %d", req->nr_mem_id); 1211 na->nm_mem = nmd ? 1212 netmap_mem_get(nmd): 1213 netmap_mem_private_new( 1214 na->num_tx_rings, na->num_tx_desc, 1215 na->num_rx_rings, na->num_rx_desc, 1216 req->nr_extra_bufs, npipes, &error); 1217 if (na->nm_mem == NULL) 1218 goto err; 1219 na->nm_bdg_attach = netmap_vale_vp_bdg_attach; 1220 /* other nmd fields are set in the common routine */ 1221 error = netmap_attach_common(na); 1222 if (error) 1223 goto err; 1224 *ret = vpna; 1225 return 0; 1226 1227 err: 1228 if (na->nm_mem != NULL) 1229 netmap_mem_put(na->nm_mem); 1230 nm_os_free(vpna); 1231 return error; 1232 } 1233 1234 /* nm_bdg_attach callback for VALE ports 1235 * The na_vp port is this same netmap_adapter. There is no host port. 
1236 */ 1237 static int 1238 netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na, 1239 struct nm_bridge *b) 1240 { 1241 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; 1242 1243 if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) { 1244 return NM_NEED_BWRAP; 1245 } 1246 na->na_vp = vpna; 1247 strlcpy(na->name, name, sizeof(na->name)); 1248 na->na_hostvp = NULL; 1249 return 0; 1250 } 1251 1252 static int 1253 netmap_vale_bwrap_krings_create(struct netmap_adapter *na) 1254 { 1255 int error; 1256 1257 /* impersonate a netmap_vp_adapter */ 1258 error = netmap_vale_vp_krings_create(na); 1259 if (error) 1260 return error; 1261 error = netmap_bwrap_krings_create_common(na); 1262 if (error) { 1263 netmap_vale_vp_krings_delete(na); 1264 } 1265 return error; 1266 } 1267 1268 static void 1269 netmap_vale_bwrap_krings_delete(struct netmap_adapter *na) 1270 { 1271 netmap_bwrap_krings_delete_common(na); 1272 netmap_vale_vp_krings_delete(na); 1273 } 1274 1275 static int 1276 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) 1277 { 1278 struct netmap_bwrap_adapter *bna; 1279 struct netmap_adapter *na = NULL; 1280 struct netmap_adapter *hostna = NULL; 1281 int error; 1282 1283 bna = nm_os_malloc(sizeof(*bna)); 1284 if (bna == NULL) { 1285 return ENOMEM; 1286 } 1287 na = &bna->up.up; 1288 strlcpy(na->name, nr_name, sizeof(na->name)); 1289 na->nm_register = netmap_bwrap_reg; 1290 na->nm_txsync = netmap_vale_vp_txsync; 1291 // na->nm_rxsync = netmap_bwrap_rxsync; 1292 na->nm_krings_create = netmap_vale_bwrap_krings_create; 1293 na->nm_krings_delete = netmap_vale_bwrap_krings_delete; 1294 na->nm_notify = netmap_bwrap_notify; 1295 bna->nm_intr_notify = netmap_bwrap_intr_notify; 1296 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ 1297 /* Set the mfs, needed on the VALE mismatch datapath. 
*/ 1298 bna->up.mfs = NM_BDG_MFS_DEFAULT; 1299 1300 if (hwna->na_flags & NAF_HOST_RINGS) { 1301 hostna = &bna->host.up; 1302 hostna->nm_notify = netmap_bwrap_notify; 1303 bna->host.mfs = NM_BDG_MFS_DEFAULT; 1304 } 1305 1306 error = netmap_bwrap_attach_common(na, hwna); 1307 if (error) { 1308 nm_os_free(bna); 1309 } 1310 return error; 1311 } 1312 1313 int 1314 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na, 1315 struct netmap_mem_d *nmd, int create) 1316 { 1317 return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops); 1318 } 1319 1320 1321 /* creates a persistent VALE port */ 1322 int 1323 nm_vi_create(struct nmreq_header *hdr) 1324 { 1325 struct nmreq_vale_newif *req = 1326 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body; 1327 int error = 0; 1328 /* Build a nmreq_register out of the nmreq_vale_newif, 1329 * so that we can call netmap_get_bdg_na(). */ 1330 struct nmreq_register regreq; 1331 bzero(®req, sizeof(regreq)); 1332 regreq.nr_tx_slots = req->nr_tx_slots; 1333 regreq.nr_rx_slots = req->nr_rx_slots; 1334 regreq.nr_tx_rings = req->nr_tx_rings; 1335 regreq.nr_rx_rings = req->nr_rx_rings; 1336 regreq.nr_mem_id = req->nr_mem_id; 1337 hdr->nr_reqtype = NETMAP_REQ_REGISTER; 1338 hdr->nr_body = (uintptr_t)®req; 1339 error = netmap_vi_create(hdr, 0 /* no autodelete */); 1340 hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF; 1341 hdr->nr_body = (uintptr_t)req; 1342 /* Write back to the original struct. 
*/ 1343 req->nr_tx_slots = regreq.nr_tx_slots; 1344 req->nr_rx_slots = regreq.nr_rx_slots; 1345 req->nr_tx_rings = regreq.nr_tx_rings; 1346 req->nr_rx_rings = regreq.nr_rx_rings; 1347 req->nr_mem_id = regreq.nr_mem_id; 1348 return error; 1349 } 1350 1351 /* remove a persistent VALE port from the system */ 1352 int 1353 nm_vi_destroy(const char *name) 1354 { 1355 if_t ifp; 1356 struct netmap_vp_adapter *vpna; 1357 int error; 1358 1359 ifp = ifunit_ref(name); 1360 if (!ifp) 1361 return ENXIO; 1362 NMG_LOCK(); 1363 /* make sure this is actually a VALE port */ 1364 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { 1365 error = EINVAL; 1366 goto err; 1367 } 1368 1369 vpna = (struct netmap_vp_adapter *)NA(ifp); 1370 1371 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */ 1372 if (vpna->autodelete) { 1373 error = EINVAL; 1374 goto err; 1375 } 1376 1377 /* also make sure that nobody is using the interface */ 1378 if (NETMAP_OWNED_BY_ANY(&vpna->up) || 1379 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) { 1380 error = EBUSY; 1381 goto err; 1382 } 1383 1384 NMG_UNLOCK(); 1385 1386 if (netmap_verbose) 1387 nm_prinf("destroying a persistent vale interface %s", if_name(ifp)); 1388 /* Linux requires all the references are released 1389 * before unregister 1390 */ 1391 netmap_detach(ifp); 1392 if_rele(ifp); 1393 nm_os_vi_detach(ifp); 1394 return 0; 1395 1396 err: 1397 NMG_UNLOCK(); 1398 if_rele(ifp); 1399 return error; 1400 } 1401 1402 static int 1403 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na) 1404 { 1405 req->nr_rx_rings = na->num_rx_rings; 1406 req->nr_tx_rings = na->num_tx_rings; 1407 req->nr_rx_slots = na->num_rx_desc; 1408 req->nr_tx_slots = na->num_tx_desc; 1409 return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL, 1410 &req->nr_mem_id); 1411 } 1412 1413 1414 /* 1415 * Create a virtual interface registered to the system. 1416 * The interface will be attached to a bridge later. 
1417 */ 1418 int 1419 netmap_vi_create(struct nmreq_header *hdr, int autodelete) 1420 { 1421 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1422 if_t ifp; 1423 struct netmap_vp_adapter *vpna; 1424 struct netmap_mem_d *nmd = NULL; 1425 int error; 1426 1427 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1428 return EINVAL; 1429 } 1430 1431 /* don't include VALE prefix */ 1432 if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) 1433 return EINVAL; 1434 if (strlen(hdr->nr_name) >= IFNAMSIZ) { 1435 return EINVAL; 1436 } 1437 ifp = ifunit_ref(hdr->nr_name); 1438 if (ifp) { /* already exist, cannot create new one */ 1439 error = EEXIST; 1440 NMG_LOCK(); 1441 if (NM_NA_VALID(ifp)) { 1442 int update_err = nm_update_info(req, NA(ifp)); 1443 if (update_err) 1444 error = update_err; 1445 } 1446 NMG_UNLOCK(); 1447 if_rele(ifp); 1448 return error; 1449 } 1450 error = nm_os_vi_persist(hdr->nr_name, &ifp); 1451 if (error) 1452 return error; 1453 1454 NMG_LOCK(); 1455 if (req->nr_mem_id) { 1456 nmd = netmap_mem_find(req->nr_mem_id); 1457 if (nmd == NULL) { 1458 error = EINVAL; 1459 goto err_1; 1460 } 1461 } 1462 /* netmap_vp_create creates a struct netmap_vp_adapter */ 1463 error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna); 1464 if (error) { 1465 if (netmap_debug & NM_DEBUG_VALE) 1466 nm_prerr("error %d", error); 1467 goto err_1; 1468 } 1469 /* persist-specific routines */ 1470 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; 1471 if (!autodelete) { 1472 netmap_adapter_get(&vpna->up); 1473 } else { 1474 vpna->autodelete = 1; 1475 } 1476 NM_ATTACH_NA(ifp, &vpna->up); 1477 /* return the updated info */ 1478 error = nm_update_info(req, &vpna->up); 1479 if (error) { 1480 goto err_2; 1481 } 1482 nm_prdis("returning nr_mem_id %d", req->nr_mem_id); 1483 if (nmd) 1484 netmap_mem_put(nmd); 1485 NMG_UNLOCK(); 1486 nm_prdis("created %s", if_name(ifp)); 1487 return 0; 1488 1489 err_2: 1490 netmap_detach(ifp); 1491 err_1: 1492 if (nmd) 1493 
netmap_mem_put(nmd); 1494 NMG_UNLOCK(); 1495 nm_os_vi_detach(ifp); 1496 1497 return error; 1498 } 1499 1500 #endif /* WITH_VALE */ 1501