1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (C) 2013-2016 Universita` di Pisa 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 30 #if defined(__FreeBSD__) 31 #include <sys/cdefs.h> /* prerequisite */ 32 #include <sys/types.h> 33 #include <sys/errno.h> 34 #include <sys/param.h> /* defines used in kernel.h */ 35 #include <sys/kernel.h> /* types used in module initialization */ 36 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 37 #include <sys/sockio.h> 38 #include <sys/socketvar.h> /* struct socket */ 39 #include <sys/malloc.h> 40 #include <sys/poll.h> 41 #include <sys/rwlock.h> 42 #include <sys/socket.h> /* sockaddrs */ 43 #include <sys/selinfo.h> 44 #include <sys/sysctl.h> 45 #include <net/if.h> 46 #include <net/if_var.h> 47 #include <net/bpf.h> /* BIOCIMMEDIATE */ 48 #include <machine/bus.h> /* bus_dmamap_* */ 49 #include <sys/endian.h> 50 #include <sys/refcount.h> 51 #include <sys/smp.h> 52 53 54 #elif defined(linux) 55 56 #include "bsd_glue.h" 57 58 #elif defined(__APPLE__) 59 60 #warning OSX support is only partial 61 #include "osx_glue.h" 62 63 #elif defined(_WIN32) 64 #include "win_glue.h" 65 66 #else 67 68 #error Unsupported platform 69 70 #endif /* unsupported */ 71 72 /* 73 * common headers 74 */ 75 76 #include <net/netmap.h> 77 #include <dev/netmap/netmap_kern.h> 78 #include <dev/netmap/netmap_mem2.h> 79 #include <dev/netmap/netmap_bdg.h> 80 81 #ifdef WITH_VALE 82 83 /* 84 * system parameters (most of them in netmap_kern.h) 85 * NM_BDG_NAME prefix for switch port names, default "vale" 86 * NM_BDG_MAXPORTS number of ports 87 * NM_BRIDGES max number of switches in the system. 88 * 89 * Switch ports are named valeX:Y where X is the switch name and Y 90 * is the port. If Y matches a physical interface name, the port is 91 * connected to a physical device. 92 * 93 * Unlike physical interfaces, switch ports use their own memory region 94 * for rings and buffers. 95 * The virtual interfaces use per-queue lock instead of core lock. 96 * In the tx loop, we aggregate traffic in batches to make all operations 97 * faster. The batch size is bridge_batch. 98 */ 99 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many (must be a pow of 2). */ 100 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ 101 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 102 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 103 /* actual size of the tables */ 104 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NETMAP_MAX_FRAGS) 105 /* NM_FT_NULL terminates a list of slots in the ft */ 106 #define NM_FT_NULL NM_BDG_BATCH_MAX 107 108 109 /* 110 * bridge_batch is set via sysctl to the max batch size to be 111 * used in the bridge. The actual value may be larger as the 112 * last packet in the block may overflow the size. 113 */ 114 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ 115 116 /* Max number of vale bridges (loader tunable). */ 117 unsigned int vale_max_bridges = NM_BRIDGES; 118 119 SYSBEGIN(vars_vale); 120 SYSCTL_DECL(_dev_netmap); 121 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, 122 "Max batch size to be used in the bridge"); 123 SYSCTL_UINT(_dev_netmap, OID_AUTO, max_bridges, CTLFLAG_RDTUN, &vale_max_bridges, 0, 124 "Max number of vale bridges"); 125 SYSEND; 126 127 static int netmap_vale_vp_create(struct nmreq_header *hdr, if_t, 128 struct netmap_mem_d *nmd, struct netmap_vp_adapter **); 129 static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *, 130 struct nm_bridge *); 131 static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *); 132 133 /* 134 * For each output interface, nm_vale_q is used to construct a list. 135 * bq_len is the number of output buffers (we can have coalescing 136 * during the copy). 137 */ 138 struct nm_vale_q { 139 uint16_t bq_head; 140 uint16_t bq_tail; 141 uint32_t bq_len; /* number of buffers */ 142 }; 143 144 /* Holds the default callbacks */ 145 struct netmap_bdg_ops vale_bdg_ops = { 146 .lookup = netmap_vale_learning, 147 .config = NULL, 148 .dtor = NULL, 149 .vp_create = netmap_vale_vp_create, 150 .bwrap_attach = netmap_vale_bwrap_attach, 151 .name = NM_BDG_NAME, 152 }; 153 154 /* 155 * this is a slightly optimized copy routine which rounds 156 * to multiple of 64 bytes and is often faster than dealing 157 * with other odd sizes. We assume there is enough room 158 * in the source and destination buffers. 159 * 160 * XXX only for multiples of NM_BUF_ALIGN bytes, non overlapped. 161 */ 162 163 static inline void 164 pkt_copy(void *_src, void *_dst, int l) 165 { 166 uint64_t *src = _src; 167 uint64_t *dst = _dst; 168 if (unlikely(l >= 1024)) { 169 memcpy(dst, src, l); 170 return; 171 } 172 for (; likely(l > 0); l -= NM_BUF_ALIGN) { 173 /* XXX NM_BUF_ALIGN/sizeof(uint64_t) statements */ 174 *dst++ = *src++; 175 *dst++ = *src++; 176 *dst++ = *src++; 177 *dst++ = *src++; 178 *dst++ = *src++; 179 *dst++ = *src++; 180 *dst++ = *src++; 181 *dst++ = *src++; 182 } 183 } 184 185 186 /* 187 * Free the forwarding tables for rings attached to switch ports. 188 */ 189 static void 190 nm_free_bdgfwd(struct netmap_adapter *na) 191 { 192 int nrings, i; 193 struct netmap_kring **kring; 194 195 NMG_LOCK_ASSERT(); 196 nrings = na->num_tx_rings; 197 kring = na->tx_rings; 198 for (i = 0; i < nrings; i++) { 199 if (kring[i]->nkr_ft) { 200 nm_os_free(kring[i]->nkr_ft); 201 kring[i]->nkr_ft = NULL; /* protect from freeing twice */ 202 } 203 } 204 } 205 206 207 /* 208 * Allocate the forwarding tables for the rings attached to the bridge ports. 209 */ 210 static int 211 nm_alloc_bdgfwd(struct netmap_adapter *na) 212 { 213 int nrings, l, i, num_dstq; 214 struct netmap_kring **kring; 215 216 NMG_LOCK_ASSERT(); 217 /* all port:rings + broadcast */ 218 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 219 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 220 l += sizeof(struct nm_vale_q) * num_dstq; 221 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 222 223 nrings = netmap_real_rings(na, NR_TX); 224 kring = na->tx_rings; 225 for (i = 0; i < nrings; i++) { 226 struct nm_bdg_fwd *ft; 227 struct nm_vale_q *dstq; 228 int j; 229 230 ft = nm_os_malloc(l); 231 if (!ft) { 232 nm_free_bdgfwd(na); 233 return ENOMEM; 234 } 235 dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX); 236 for (j = 0; j < num_dstq; j++) { 237 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 238 dstq[j].bq_len = 0; 239 } 240 kring[i]->nkr_ft = ft; 241 } 242 return 0; 243 } 244 245 /* Allows external modules to create bridges in exclusive mode, 246 * returns an authentication token that the external module will need 247 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(), 248 * and nm_bdg_update_private_data() operations. 249 * Successfully executed if ret != NULL and *return_status == 0. 250 */ 251 void * 252 netmap_vale_create(const char *bdg_name, int *return_status) 253 { 254 struct nm_bridge *b = NULL; 255 void *ret = NULL; 256 257 NMG_LOCK(); 258 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 259 if (b) { 260 *return_status = EEXIST; 261 goto unlock_bdg_create; 262 } 263 264 b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops); 265 if (!b) { 266 *return_status = ENOMEM; 267 goto unlock_bdg_create; 268 } 269 270 b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE; 271 ret = nm_bdg_get_auth_token(b); 272 *return_status = 0; 273 274 unlock_bdg_create: 275 NMG_UNLOCK(); 276 return ret; 277 } 278 279 /* Allows external modules to destroy a bridge created through 280 * netmap_bdg_create(), the bridge must be empty. 281 */ 282 int 283 netmap_vale_destroy(const char *bdg_name, void *auth_token) 284 { 285 struct nm_bridge *b = NULL; 286 int ret = 0; 287 288 NMG_LOCK(); 289 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 290 if (!b) { 291 ret = ENXIO; 292 goto unlock_bdg_free; 293 } 294 295 if (!nm_bdg_valid_auth_token(b, auth_token)) { 296 ret = EACCES; 297 goto unlock_bdg_free; 298 } 299 if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) { 300 ret = EINVAL; 301 goto unlock_bdg_free; 302 } 303 304 b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE); 305 ret = netmap_bdg_free(b); 306 if (ret) { 307 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE; 308 } 309 310 unlock_bdg_free: 311 NMG_UNLOCK(); 312 return ret; 313 } 314 315 /* Process NETMAP_REQ_VALE_LIST. */ 316 int 317 netmap_vale_list(struct nmreq_header *hdr) 318 { 319 struct nmreq_vale_list *req = 320 (struct nmreq_vale_list *)(uintptr_t)hdr->nr_body; 321 int namelen = strlen(hdr->nr_name); 322 struct nm_bridge *b, *bridges; 323 struct netmap_vp_adapter *vpna; 324 int error = 0, i, j; 325 u_int num_bridges; 326 327 netmap_bns_getbridges(&bridges, &num_bridges); 328 329 /* this is used to enumerate bridges and ports */ 330 if (namelen) { /* look up indexes of bridge and port */ 331 if (strncmp(hdr->nr_name, NM_BDG_NAME, 332 strlen(NM_BDG_NAME))) { 333 return EINVAL; 334 } 335 NMG_LOCK(); 336 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL); 337 if (!b) { 338 NMG_UNLOCK(); 339 return ENOENT; 340 } 341 342 req->nr_bridge_idx = b - bridges; /* bridge index */ 343 req->nr_port_idx = NM_BDG_NOPORT; 344 for (j = 0; j < b->bdg_active_ports; j++) { 345 i = b->bdg_port_index[j]; 346 vpna = b->bdg_ports[i]; 347 if (vpna == NULL) { 348 nm_prerr("This should not happen"); 349 continue; 350 } 351 /* the former and the latter identify a 352 * virtual port and a NIC, respectively 353 */ 354 if (!strcmp(vpna->up.name, hdr->nr_name)) { 355 req->nr_port_idx = i; /* port index */ 356 break; 357 } 358 } 359 NMG_UNLOCK(); 360 } else { 361 /* return the first non-empty entry starting from 362 * bridge nr_arg1 and port nr_arg2. 363 * 364 * Users can detect the end of the same bridge by 365 * seeing the new and old value of nr_arg1, and can 366 * detect the end of all the bridge by error != 0 367 */ 368 i = req->nr_bridge_idx; 369 j = req->nr_port_idx; 370 371 NMG_LOCK(); 372 for (error = ENOENT; i < vale_max_bridges; i++) { 373 b = bridges + i; 374 for ( ; j < NM_BDG_MAXPORTS; j++) { 375 if (b->bdg_ports[j] == NULL) 376 continue; 377 vpna = b->bdg_ports[j]; 378 /* write back the VALE switch name */ 379 strlcpy(hdr->nr_name, vpna->up.name, 380 sizeof(hdr->nr_name)); 381 error = 0; 382 goto out; 383 } 384 j = 0; /* following bridges scan from 0 */ 385 } 386 out: 387 req->nr_bridge_idx = i; 388 req->nr_port_idx = j; 389 NMG_UNLOCK(); 390 } 391 392 return error; 393 } 394 395 396 /* nm_dtor callback for ephemeral VALE ports */ 397 static void 398 netmap_vale_vp_dtor(struct netmap_adapter *na) 399 { 400 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; 401 struct nm_bridge *b = vpna->na_bdg; 402 403 nm_prdis("%s has %d references", na->name, na->na_refcount); 404 405 if (b) { 406 netmap_bdg_detach_common(b, vpna->bdg_port, -1); 407 } 408 409 if (na->ifp != NULL && !nm_iszombie(na)) { 410 NM_DETACH_NA(na->ifp); 411 if (vpna->autodelete) { 412 nm_prdis("releasing %s", if_name(na->ifp)); 413 NMG_UNLOCK(); 414 nm_os_vi_detach(na->ifp); 415 NMG_LOCK(); 416 } 417 } 418 } 419 420 421 422 /* nm_krings_create callback for VALE ports. 423 * Calls the standard netmap_krings_create, then adds leases on rx 424 * rings and bdgfwd on tx rings. 425 */ 426 static int 427 netmap_vale_vp_krings_create(struct netmap_adapter *na) 428 { 429 u_int tailroom; 430 int error, i; 431 uint32_t *leases; 432 u_int nrx = netmap_real_rings(na, NR_RX); 433 434 /* 435 * Leases are attached to RX rings on vale ports 436 */ 437 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; 438 439 error = netmap_krings_create(na, tailroom); 440 if (error) 441 return error; 442 443 leases = na->tailroom; 444 445 for (i = 0; i < nrx; i++) { /* Receive rings */ 446 na->rx_rings[i]->nkr_leases = leases; 447 leases += na->num_rx_desc; 448 } 449 450 error = nm_alloc_bdgfwd(na); 451 if (error) { 452 netmap_krings_delete(na); 453 return error; 454 } 455 456 return 0; 457 } 458 459 460 /* nm_krings_delete callback for VALE ports. */ 461 static void 462 netmap_vale_vp_krings_delete(struct netmap_adapter *na) 463 { 464 nm_free_bdgfwd(na); 465 netmap_krings_delete(na); 466 } 467 468 469 static int 470 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, 471 struct netmap_vp_adapter *na, u_int ring_nr); 472 473 474 /* 475 * main dispatch routine for the bridge. 476 * Grab packets from a kring, move them into the ft structure 477 * associated to the tx (input) port. Max one instance per port, 478 * filtered on input (ioctl, poll or XXX). 479 * Returns the next position in the ring. 480 */ 481 static int 482 nm_vale_preflush(struct netmap_kring *kring, u_int end) 483 { 484 struct netmap_vp_adapter *na = 485 (struct netmap_vp_adapter*)kring->na; 486 struct netmap_ring *ring = kring->ring; 487 struct nm_bdg_fwd *ft; 488 u_int ring_nr = kring->ring_id; 489 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 490 u_int ft_i = 0; /* start from 0 */ 491 u_int frags = 1; /* how many frags ? */ 492 struct nm_bridge *b = na->na_bdg; 493 494 /* To protect against modifications to the bridge we acquire a 495 * shared lock, waiting if we can sleep (if the source port is 496 * attached to a user process) or with a trylock otherwise (NICs). 497 */ 498 nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 499 if (na->up.na_flags & NAF_BDG_MAYSLEEP) 500 BDG_RLOCK(b); 501 else if (!BDG_RTRYLOCK(b)) 502 return j; 503 nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 504 ft = kring->nkr_ft; 505 506 for (; likely(j != end); j = nm_next(j, lim)) { 507 struct netmap_slot *slot = &ring->slot[j]; 508 char *buf; 509 510 ft[ft_i].ft_len = slot->len; 511 ft[ft_i].ft_flags = slot->flags; 512 ft[ft_i].ft_offset = 0; 513 514 nm_prdis("flags is 0x%x", slot->flags); 515 /* we do not use the buf changed flag, but we still need to reset it */ 516 slot->flags &= ~NS_BUF_CHANGED; 517 518 /* this slot goes into a list so initialize the link field */ 519 ft[ft_i].ft_next = NM_FT_NULL; 520 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 521 (void *)(uintptr_t)slot->ptr : NMB_O(kring, slot); 522 if (unlikely(buf == NULL || 523 slot->len > NETMAP_BUF_SIZE(&na->up) - nm_get_offset(kring, slot))) { 524 nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d", 525 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT", 526 kring->name, j, ft[ft_i].ft_len); 527 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up); 528 ft[ft_i].ft_len = 0; 529 ft[ft_i].ft_flags = 0; 530 } 531 __builtin_prefetch(buf); 532 ++ft_i; 533 if (slot->flags & NS_MOREFRAG) { 534 frags++; 535 continue; 536 } 537 if (unlikely(netmap_verbose && frags > 1)) 538 nm_prlim(5, "%d frags at %d", frags, ft_i - frags); 539 ft[ft_i - frags].ft_frags = frags; 540 frags = 1; 541 if (unlikely((int)ft_i >= bridge_batch)) 542 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 543 } 544 if (frags > 1) { 545 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we 546 * have to fix frags count. */ 547 frags--; 548 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; 549 ft[ft_i - frags].ft_frags = frags; 550 nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags); 551 } 552 if (ft_i) 553 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 554 BDG_RUNLOCK(b); 555 return j; 556 } 557 558 559 /* ----- FreeBSD if_bridge hash function ------- */ 560 561 /* 562 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 563 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 564 * 565 * http://www.burtleburtle.net/bob/hash/spooky.html 566 */ 567 #define mix(a, b, c) \ 568 do { \ 569 a -= b; a -= c; a ^= (c >> 13); \ 570 b -= c; b -= a; b ^= (a << 8); \ 571 c -= a; c -= b; c ^= (b >> 13); \ 572 a -= b; a -= c; a ^= (c >> 12); \ 573 b -= c; b -= a; b ^= (a << 16); \ 574 c -= a; c -= b; c ^= (b >> 5); \ 575 a -= b; a -= c; a ^= (c >> 3); \ 576 b -= c; b -= a; b ^= (a << 10); \ 577 c -= a; c -= b; c ^= (b >> 15); \ 578 } while (/*CONSTCOND*/0) 579 580 581 static __inline uint32_t 582 nm_vale_rthash(const uint8_t *addr) 583 { 584 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key 585 586 b += addr[5] << 8; 587 b += addr[4]; 588 a += addr[3] << 24; 589 a += addr[2] << 16; 590 a += addr[1] << 8; 591 a += addr[0]; 592 593 mix(a, b, c); 594 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 595 return (c & BRIDGE_RTHASH_MASK); 596 } 597 598 #undef mix 599 600 601 /* 602 * Lookup function for a learning bridge. 603 * Update the hash table with the source address, 604 * and then returns the destination port index, and the 605 * ring in *dst_ring (at the moment, always use ring 0) 606 */ 607 uint32_t 608 netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, 609 struct netmap_vp_adapter *na, void *private_data) 610 { 611 uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset; 612 u_int buf_len = ft->ft_len - ft->ft_offset; 613 struct nm_hash_ent *ht = private_data; 614 uint32_t sh, dh; 615 u_int dst, mysrc = na->bdg_port; 616 uint64_t smac, dmac; 617 uint8_t indbuf[12]; 618 619 if (buf_len < 14) { 620 return NM_BDG_NOPORT; 621 } 622 623 if (ft->ft_flags & NS_INDIRECT) { 624 if (copyin(buf, indbuf, sizeof(indbuf))) { 625 return NM_BDG_NOPORT; 626 } 627 buf = indbuf; 628 } 629 630 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 631 smac = le64toh(*(uint64_t *)(buf + 4)); 632 smac >>= 16; 633 634 /* 635 * The hash is somewhat expensive, there might be some 636 * worthwhile optimizations here. 637 */ 638 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */ 639 uint8_t *s = buf+6; 640 sh = nm_vale_rthash(s); /* hash of source */ 641 /* update source port forwarding entry */ 642 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */ 643 ht[sh].ports = mysrc; 644 if (netmap_debug & NM_DEBUG_VALE) 645 nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 646 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 647 } 648 dst = NM_BDG_BROADCAST; 649 if ((buf[0] & 1) == 0) { /* unicast */ 650 dh = nm_vale_rthash(buf); /* hash of dst */ 651 if (ht[dh].mac == dmac) { /* found dst */ 652 dst = ht[dh].ports; 653 } 654 } 655 return dst; 656 } 657 658 659 /* 660 * Available space in the ring. Only used in VALE code 661 * and only with is_rx = 1 662 */ 663 static inline uint32_t 664 nm_kr_space(struct netmap_kring *k, int is_rx) 665 { 666 int space; 667 668 if (is_rx) { 669 int busy = k->nkr_hwlease - k->nr_hwcur; 670 if (busy < 0) 671 busy += k->nkr_num_slots; 672 space = k->nkr_num_slots - 1 - busy; 673 } else { 674 /* XXX never used in this branch */ 675 space = k->nr_hwtail - k->nkr_hwlease; 676 if (space < 0) 677 space += k->nkr_num_slots; 678 } 679 #if 0 680 // sanity check 681 if (k->nkr_hwlease >= k->nkr_num_slots || 682 k->nr_hwcur >= k->nkr_num_slots || 683 k->nr_tail >= k->nkr_num_slots || 684 busy < 0 || 685 busy >= k->nkr_num_slots) { 686 nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", 687 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 688 k->nkr_lease_idx, k->nkr_num_slots); 689 } 690 #endif 691 return space; 692 } 693 694 695 696 697 /* make a lease on the kring for N positions. return the 698 * lease index 699 * XXX only used in VALE code and with is_rx = 1 700 */ 701 static inline uint32_t 702 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) 703 { 704 uint32_t lim = k->nkr_num_slots - 1; 705 uint32_t lease_idx = k->nkr_lease_idx; 706 707 k->nkr_leases[lease_idx] = NR_NOSLOT; 708 k->nkr_lease_idx = nm_next(lease_idx, lim); 709 710 #ifdef CONFIG_NETMAP_DEBUG 711 if (n > nm_kr_space(k, is_rx)) { 712 nm_prerr("invalid request for %d slots", n); 713 panic("x"); 714 } 715 #endif /* CONFIG NETMAP_DEBUG */ 716 /* XXX verify that there are n slots */ 717 k->nkr_hwlease += n; 718 if (k->nkr_hwlease > lim) 719 k->nkr_hwlease -= lim + 1; 720 721 #ifdef CONFIG_NETMAP_DEBUG 722 if (k->nkr_hwlease >= k->nkr_num_slots || 723 k->nr_hwcur >= k->nkr_num_slots || 724 k->nr_hwtail >= k->nkr_num_slots || 725 k->nkr_lease_idx >= k->nkr_num_slots) { 726 nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", 727 k->na->name, 728 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 729 k->nkr_lease_idx, k->nkr_num_slots); 730 } 731 #endif /* CONFIG_NETMAP_DEBUG */ 732 return lease_idx; 733 } 734 735 /* 736 * 737 * This flush routine supports only unicast and broadcast but a large 738 * number of ports, and lets us replace the learn and dispatch functions. 739 */ 740 int 741 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, 742 u_int ring_nr) 743 { 744 struct nm_vale_q *dst_ents, *brddst; 745 uint16_t num_dsts = 0, *dsts; 746 struct nm_bridge *b = na->na_bdg; 747 u_int i, me = na->bdg_port; 748 749 /* 750 * The work area (pointed by ft) is followed by an array of 751 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS 752 * queues per port plus one for the broadcast traffic. 753 * Then we have an array of destination indexes. 754 */ 755 dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX); 756 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 757 758 /* first pass: find a destination for each packet in the batch */ 759 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 760 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 761 uint16_t dst_port, d_i; 762 struct nm_vale_q *d; 763 struct nm_bdg_fwd *start_ft = NULL; 764 765 nm_prdis("slot %d frags %d", i, ft[i].ft_frags); 766 767 if (na->up.virt_hdr_len < ft[i].ft_len) { 768 ft[i].ft_offset = na->up.virt_hdr_len; 769 start_ft = &ft[i]; 770 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) { 771 ft[i].ft_offset = ft[i].ft_len; 772 start_ft = &ft[i+1]; 773 } else { 774 /* Drop the packet if the virtio-net header is not into the first 775 * fragment nor at the very beginning of the second. 776 */ 777 continue; 778 } 779 dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data); 780 if (netmap_verbose > 255) 781 nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port); 782 if (dst_port >= NM_BDG_NOPORT) 783 continue; /* this packet is identified to be dropped */ 784 else if (dst_port == NM_BDG_BROADCAST) 785 dst_ring = 0; /* broadcasts always go to ring 0 */ 786 else if (unlikely(dst_port == me || 787 !b->bdg_ports[dst_port])) 788 continue; 789 790 /* get a position in the scratch pad */ 791 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 792 d = dst_ents + d_i; 793 794 /* append the first fragment to the list */ 795 if (d->bq_head == NM_FT_NULL) { /* new destination */ 796 d->bq_head = d->bq_tail = i; 797 /* remember this position to be scanned later */ 798 if (dst_port != NM_BDG_BROADCAST) 799 dsts[num_dsts++] = d_i; 800 } else { 801 ft[d->bq_tail].ft_next = i; 802 d->bq_tail = i; 803 } 804 d->bq_len += ft[i].ft_frags; 805 } 806 807 /* 808 * Broadcast traffic goes to ring 0 on all destinations. 809 * So we need to add these rings to the list of ports to scan. 810 */ 811 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; 812 if (brddst->bq_head != NM_FT_NULL) { 813 u_int j; 814 for (j = 0; likely(j < b->bdg_active_ports); j++) { 815 uint16_t d_i; 816 i = b->bdg_port_index[j]; 817 if (unlikely(i == me)) 818 continue; 819 d_i = i * NM_BDG_MAXRINGS; 820 if (dst_ents[d_i].bq_head == NM_FT_NULL) 821 dsts[num_dsts++] = d_i; 822 } 823 } 824 825 nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts); 826 /* second pass: scan destinations */ 827 for (i = 0; i < num_dsts; i++) { 828 struct netmap_vp_adapter *dst_na; 829 struct netmap_kring *kring; 830 struct netmap_ring *ring; 831 u_int dst_nr, lim, j, d_i, next, brd_next; 832 u_int needed, howmany; 833 int retry = netmap_txsync_retry; 834 struct nm_vale_q *d; 835 uint32_t my_start = 0, lease_idx = 0; 836 int nrings; 837 int virt_hdr_mismatch = 0; 838 839 d_i = dsts[i]; 840 nm_prdis("second pass %d port %d", i, d_i); 841 d = dst_ents + d_i; 842 // XXX fix the division 843 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; 844 /* protect from the lookup function returning an inactive 845 * destination port 846 */ 847 if (unlikely(dst_na == NULL)) 848 goto cleanup; 849 if (dst_na->up.na_flags & NAF_SW_ONLY) 850 goto cleanup; 851 /* 852 * The interface may be in !netmap mode in two cases: 853 * - when na is attached but not activated yet; 854 * - when na is being deactivated but is still attached. 855 */ 856 if (unlikely(!nm_netmap_on(&dst_na->up))) { 857 nm_prdis("not in netmap mode!"); 858 goto cleanup; 859 } 860 861 /* there is at least one either unicast or broadcast packet */ 862 brd_next = brddst->bq_head; 863 next = d->bq_head; 864 /* we need to reserve this many slots. If fewer are 865 * available, some packets will be dropped. 866 * Packets may have multiple fragments, so 867 * there is a chance that we may not use all of the slots 868 * we have claimed, so we will need to handle the leftover 869 * ones when we regain the lock. 870 */ 871 needed = d->bq_len + brddst->bq_len; 872 873 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) { 874 if (netmap_verbose) { 875 nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len, 876 dst_na->up.virt_hdr_len); 877 } 878 /* There is a virtio-net header/offloadings mismatch between 879 * source and destination. The slower mismatch datapath will 880 * be used to cope with all the mismatches. 881 */ 882 virt_hdr_mismatch = 1; 883 if (dst_na->mfs < na->mfs) { 884 /* We may need to do segmentation offloadings, and so 885 * we may need a number of destination slots greater 886 * than the number of input slots ('needed'). 887 * We look for the smallest integer 'x' which satisfies: 888 * needed * na->mfs + x * H <= x * na->mfs 889 * where 'H' is the length of the longest header that may 890 * be replicated in the segmentation process (e.g. for 891 * TCPv4 we must account for ethernet header, IP header 892 * and TCPv4 header). 893 */ 894 KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0")); 895 needed = (needed * na->mfs) / 896 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; 897 nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); 898 } 899 } 900 901 nm_prdis(5, "pass 2 dst %d is %x %s", 902 i, d_i, nm_is_bwrap(&dst_na->up) ? "nic/host" : "virtual"); 903 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 904 nrings = dst_na->up.num_rx_rings; 905 if (dst_nr >= nrings) 906 dst_nr = dst_nr % nrings; 907 kring = dst_na->up.rx_rings[dst_nr]; 908 ring = kring->ring; 909 /* the destination ring may have not been opened for RX */ 910 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON)) 911 goto cleanup; 912 lim = kring->nkr_num_slots - 1; 913 914 retry: 915 916 if (dst_na->retry && retry) { 917 /* try to get some free slot from the previous run */ 918 kring->nm_notify(kring, NAF_FORCE_RECLAIM); 919 /* actually useful only for bwraps, since there 920 * the notify will trigger a txsync on the hwna. VALE ports 921 * have dst_na->retry == 0 922 */ 923 } 924 /* reserve the buffers in the queue and an entry 925 * to report completion, and drop lock. 926 * XXX this might become a helper function. 927 */ 928 mtx_lock(&kring->q_lock); 929 if (kring->nkr_stopped) { 930 mtx_unlock(&kring->q_lock); 931 goto cleanup; 932 } 933 my_start = j = kring->nkr_hwlease; 934 howmany = nm_kr_space(kring, 1); 935 if (needed < howmany) 936 howmany = needed; 937 lease_idx = nm_kr_lease(kring, howmany, 1); 938 mtx_unlock(&kring->q_lock); 939 940 /* only retry if we need more than available slots */ 941 if (retry && needed <= howmany) 942 retry = 0; 943 944 /* copy to the destination queue */ 945 while (howmany > 0) { 946 struct netmap_slot *slot; 947 struct nm_bdg_fwd *ft_p, *ft_end; 948 u_int cnt; 949 950 /* find the queue from which we pick next packet. 951 * NM_FT_NULL is always higher than valid indexes 952 * so we never dereference it if the other list 953 * has packets (and if both are empty we never 954 * get here). 955 */ 956 if (next < brd_next) { 957 ft_p = ft + next; 958 next = ft_p->ft_next; 959 } else { /* insert broadcast */ 960 ft_p = ft + brd_next; 961 brd_next = ft_p->ft_next; 962 } 963 cnt = ft_p->ft_frags; // cnt > 0 964 if (unlikely(cnt > howmany)) 965 break; /* no more space */ 966 if (netmap_verbose && cnt > 1) 967 nm_prlim(5, "rx %d frags to %d", cnt, j); 968 ft_end = ft_p + cnt; 969 if (unlikely(virt_hdr_mismatch)) { 970 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); 971 } else { 972 howmany -= cnt; 973 do { 974 char *dst, *src = ft_p->ft_buf; 975 size_t copy_len = ft_p->ft_len, dst_len = copy_len; 976 uintptr_t src_cb; 977 uint64_t dstoff, dstoff_cb; 978 int src_co, dst_co; 979 const uintptr_t mask = NM_BUF_ALIGN - 1; 980 981 slot = &ring->slot[j]; 982 dst = NMB(&dst_na->up, slot); 983 dstoff = nm_get_offset(kring, slot); 984 dstoff_cb = dstoff & ~mask; 985 src_cb = ((uintptr_t)src) & ~mask; 986 src_co = ((uintptr_t)src) & mask; 987 dst_co = ((uintptr_t)(dst + dstoff)) & mask; 988 if (dst_co < src_co) { 989 dstoff_cb += NM_BUF_ALIGN; 990 } 991 dstoff = dstoff_cb + src_co; 992 copy_len += src_co; 993 994 nm_prdis("send [%d] %d(%d) bytes at %s:%d", 995 i, (int)copy_len, (int)dst_len, 996 NM_IFPNAME(dst_ifp), j); 997 998 if (unlikely(dstoff > NETMAP_BUF_SIZE(&dst_na->up) || 999 dst_len > NETMAP_BUF_SIZE(&dst_na->up) - dstoff)) { 1000 nm_prlim(5, "dropping packet/fragment of len %zu, dest offset %llu", 1001 dst_len, (unsigned long long)dstoff); 1002 copy_len = dst_len = 0; 1003 dstoff = nm_get_offset(kring, slot); 1004 } 1005 1006 if (ft_p->ft_flags & NS_INDIRECT) { 1007 if (copyin(src, dst, copy_len)) { 1008 // invalid user pointer, pretend len is 0 1009 dst_len = 0; 1010 } 1011 } else { 1012 //memcpy(dst, src, copy_len); 1013 pkt_copy((char *)src_cb, dst + dstoff_cb, (int)copy_len); 1014 } 1015 slot->len = dst_len; 1016 slot->flags = (cnt << 8)| NS_MOREFRAG; 1017 nm_write_offset(kring, slot, dstoff); 1018 j = nm_next(j, lim); 1019 needed--; 1020 ft_p++; 1021 } while (ft_p != ft_end); 1022 slot->flags = (cnt << 8); /* clear flag on last entry */ 1023 } 1024 /* are we done ? */ 1025 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 1026 break; 1027 } 1028 { 1029 /* current position */ 1030 uint32_t *p = kring->nkr_leases; /* shorthand */ 1031 uint32_t update_pos; 1032 int still_locked = 1; 1033 1034 mtx_lock(&kring->q_lock); 1035 if (unlikely(howmany > 0)) { 1036 /* not used all bufs. If i am the last one 1037 * i can recover the slots, otherwise must 1038 * fill them with 0 to mark empty packets. 1039 */ 1040 nm_prdis("leftover %d bufs", howmany); 1041 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { 1042 /* yes i am the last one */ 1043 nm_prdis("roll back nkr_hwlease to %d", j); 1044 kring->nkr_hwlease = j; 1045 } else { 1046 while (howmany-- > 0) { 1047 ring->slot[j].len = 0; 1048 ring->slot[j].flags = 0; 1049 j = nm_next(j, lim); 1050 } 1051 } 1052 } 1053 p[lease_idx] = j; /* report I am done */ 1054 1055 update_pos = kring->nr_hwtail; 1056 1057 if (my_start == update_pos) { 1058 /* all slots before my_start have been reported, 1059 * so scan subsequent leases to see if other ranges 1060 * have been completed, and to a selwakeup or txsync. 1061 */ 1062 while (lease_idx != kring->nkr_lease_idx && 1063 p[lease_idx] != NR_NOSLOT) { 1064 j = p[lease_idx]; 1065 p[lease_idx] = NR_NOSLOT; 1066 lease_idx = nm_next(lease_idx, lim); 1067 } 1068 /* j is the new 'write' position. j != my_start 1069 * means there are new buffers to report 1070 */ 1071 if (likely(j != my_start)) { 1072 kring->nr_hwtail = j; 1073 still_locked = 0; 1074 mtx_unlock(&kring->q_lock); 1075 kring->nm_notify(kring, 0); 1076 /* this is netmap_notify for VALE ports and 1077 * netmap_bwrap_notify for bwrap. The latter will 1078 * trigger a txsync on the underlying hwna 1079 */ 1080 if (dst_na->retry && retry--) { 1081 /* XXX this is going to call nm_notify again. 1082 * Only useful for bwrap in virtual machines 1083 */ 1084 goto retry; 1085 } 1086 } 1087 } 1088 if (still_locked) 1089 mtx_unlock(&kring->q_lock); 1090 } 1091 cleanup: 1092 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ 1093 d->bq_len = 0; 1094 } 1095 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ 1096 brddst->bq_len = 0; 1097 return 0; 1098 } 1099 1100 /* nm_txsync callback for VALE ports */ 1101 static int 1102 netmap_vale_vp_txsync(struct netmap_kring *kring, int flags) 1103 { 1104 struct netmap_vp_adapter *na = 1105 (struct netmap_vp_adapter *)kring->na; 1106 u_int done; 1107 u_int const lim = kring->nkr_num_slots - 1; 1108 u_int const head = kring->rhead; 1109 1110 if (bridge_batch <= 0) { /* testing only */ 1111 done = head; // used all 1112 goto done; 1113 } 1114 if (!na->na_bdg) { 1115 done = head; 1116 goto done; 1117 } 1118 if (bridge_batch > NM_BDG_BATCH) 1119 bridge_batch = NM_BDG_BATCH; 1120 1121 done = nm_vale_preflush(kring, head); 1122 done: 1123 if (done != head) 1124 nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail); 1125 /* 1126 * packets between 'done' and 'cur' are left unsent. 1127 */ 1128 kring->nr_hwcur = done; 1129 kring->nr_hwtail = nm_prev(done, lim); 1130 if (netmap_debug & NM_DEBUG_TXSYNC) 1131 nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags); 1132 return 0; 1133 } 1134 1135 1136 /* create a netmap_vp_adapter that describes a VALE port. 1137 * Only persistent VALE ports have a non-null ifp. 1138 */ 1139 static int 1140 netmap_vale_vp_create(struct nmreq_header *hdr, if_t ifp, 1141 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret) 1142 { 1143 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1144 struct netmap_vp_adapter *vpna; 1145 struct netmap_adapter *na; 1146 int error = 0; 1147 u_int npipes = 0; 1148 u_int extrabufs = 0; 1149 1150 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1151 return EINVAL; 1152 } 1153 1154 vpna = nm_os_malloc(sizeof(*vpna)); 1155 if (vpna == NULL) 1156 return ENOMEM; 1157 1158 na = &vpna->up; 1159 1160 na->ifp = ifp; 1161 strlcpy(na->name, hdr->nr_name, sizeof(na->name)); 1162 1163 /* bound checking */ 1164 na->num_tx_rings = req->nr_tx_rings; 1165 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1166 req->nr_tx_rings = na->num_tx_rings; /* write back */ 1167 na->num_rx_rings = req->nr_rx_rings; 1168 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1169 req->nr_rx_rings = na->num_rx_rings; /* write back */ 1170 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1171 1, NM_BDG_MAXSLOTS, NULL); 1172 na->num_tx_desc = req->nr_tx_slots; 1173 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1174 1, NM_BDG_MAXSLOTS, NULL); 1175 /* validate number of pipes. We want at least 1, 1176 * but probably can do with some more. 1177 * So let's use 2 as default (when 0 is supplied) 1178 */ 1179 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); 1180 /* validate extra bufs */ 1181 extrabufs = req->nr_extra_bufs; 1182 nm_bound_var(&extrabufs, 0, 0, 1183 128*NM_BDG_MAXSLOTS, NULL); 1184 req->nr_extra_bufs = extrabufs; /* write back */ 1185 na->num_rx_desc = req->nr_rx_slots; 1186 /* Set the mfs to a default value, as it is needed on the VALE 1187 * mismatch datapath. XXX We should set it according to the MTU 1188 * known to the kernel. */ 1189 vpna->mfs = NM_BDG_MFS_DEFAULT; 1190 vpna->last_smac = ~0llu; 1191 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 1192 vpna->mfs = netmap_buf_size; */ 1193 if (netmap_verbose) 1194 nm_prinf("max frame size %u", vpna->mfs); 1195 1196 na->na_flags |= (NAF_BDG_MAYSLEEP | NAF_OFFSETS); 1197 /* persistent VALE ports look like hw devices 1198 * with a native netmap adapter 1199 */ 1200 if (ifp) 1201 na->na_flags |= NAF_NATIVE; 1202 na->nm_txsync = netmap_vale_vp_txsync; 1203 na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */ 1204 na->nm_register = netmap_vp_reg; /* use the one provided by bdg */ 1205 na->nm_krings_create = netmap_vale_vp_krings_create; 1206 na->nm_krings_delete = netmap_vale_vp_krings_delete; 1207 na->nm_dtor = netmap_vale_vp_dtor; 1208 nm_prdis("nr_mem_id %d", req->nr_mem_id); 1209 na->nm_mem = nmd ? 1210 netmap_mem_get(nmd): 1211 netmap_mem_private_new( 1212 na->num_tx_rings, na->num_tx_desc, 1213 na->num_rx_rings, na->num_rx_desc, 1214 req->nr_extra_bufs, npipes, &error); 1215 if (na->nm_mem == NULL) 1216 goto err; 1217 na->nm_bdg_attach = netmap_vale_vp_bdg_attach; 1218 /* other nmd fields are set in the common routine */ 1219 error = netmap_attach_common(na); 1220 if (error) 1221 goto err; 1222 *ret = vpna; 1223 return 0; 1224 1225 err: 1226 if (na->nm_mem != NULL) 1227 netmap_mem_put(na->nm_mem); 1228 nm_os_free(vpna); 1229 return error; 1230 } 1231 1232 /* nm_bdg_attach callback for VALE ports 1233 * The na_vp port is this same netmap_adapter. There is no host port. 1234 */ 1235 static int 1236 netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na, 1237 struct nm_bridge *b) 1238 { 1239 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; 1240 1241 if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) { 1242 return NM_NEED_BWRAP; 1243 } 1244 na->na_vp = vpna; 1245 strlcpy(na->name, name, sizeof(na->name)); 1246 na->na_hostvp = NULL; 1247 return 0; 1248 } 1249 1250 static int 1251 netmap_vale_bwrap_krings_create(struct netmap_adapter *na) 1252 { 1253 int error; 1254 1255 /* impersonate a netmap_vp_adapter */ 1256 error = netmap_vale_vp_krings_create(na); 1257 if (error) 1258 return error; 1259 error = netmap_bwrap_krings_create_common(na); 1260 if (error) { 1261 netmap_vale_vp_krings_delete(na); 1262 } 1263 return error; 1264 } 1265 1266 static void 1267 netmap_vale_bwrap_krings_delete(struct netmap_adapter *na) 1268 { 1269 netmap_bwrap_krings_delete_common(na); 1270 netmap_vale_vp_krings_delete(na); 1271 } 1272 1273 static int 1274 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) 1275 { 1276 struct netmap_bwrap_adapter *bna; 1277 struct netmap_adapter *na = NULL; 1278 struct netmap_adapter *hostna = NULL; 1279 int error; 1280 1281 bna = nm_os_malloc(sizeof(*bna)); 1282 if (bna == NULL) { 1283 return ENOMEM; 1284 } 1285 na = &bna->up.up; 1286 strlcpy(na->name, nr_name, sizeof(na->name)); 1287 na->nm_register = netmap_bwrap_reg; 1288 na->nm_txsync = netmap_vale_vp_txsync; 1289 // na->nm_rxsync = netmap_bwrap_rxsync; 1290 na->nm_krings_create = netmap_vale_bwrap_krings_create; 1291 na->nm_krings_delete = netmap_vale_bwrap_krings_delete; 1292 na->nm_notify = netmap_bwrap_notify; 1293 bna->nm_intr_notify = netmap_bwrap_intr_notify; 1294 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ 1295 /* Set the mfs, needed on the VALE mismatch datapath. */ 1296 bna->up.mfs = NM_BDG_MFS_DEFAULT; 1297 1298 if (hwna->na_flags & NAF_HOST_RINGS) { 1299 hostna = &bna->host.up; 1300 hostna->nm_notify = netmap_bwrap_notify; 1301 bna->host.mfs = NM_BDG_MFS_DEFAULT; 1302 } 1303 1304 error = netmap_bwrap_attach_common(na, hwna); 1305 if (error) { 1306 nm_os_free(bna); 1307 } 1308 return error; 1309 } 1310 1311 int 1312 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na, 1313 struct netmap_mem_d *nmd, int create) 1314 { 1315 return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops); 1316 } 1317 1318 1319 /* creates a persistent VALE port */ 1320 int 1321 nm_vi_create(struct nmreq_header *hdr) 1322 { 1323 struct nmreq_vale_newif *req = 1324 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body; 1325 int error = 0; 1326 /* Build a nmreq_register out of the nmreq_vale_newif, 1327 * so that we can call netmap_get_bdg_na(). */ 1328 struct nmreq_register regreq; 1329 bzero(®req, sizeof(regreq)); 1330 regreq.nr_tx_slots = req->nr_tx_slots; 1331 regreq.nr_rx_slots = req->nr_rx_slots; 1332 regreq.nr_tx_rings = req->nr_tx_rings; 1333 regreq.nr_rx_rings = req->nr_rx_rings; 1334 regreq.nr_mem_id = req->nr_mem_id; 1335 hdr->nr_reqtype = NETMAP_REQ_REGISTER; 1336 hdr->nr_body = (uintptr_t)®req; 1337 error = netmap_vi_create(hdr, 0 /* no autodelete */); 1338 hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF; 1339 hdr->nr_body = (uintptr_t)req; 1340 /* Write back to the original struct. */ 1341 req->nr_tx_slots = regreq.nr_tx_slots; 1342 req->nr_rx_slots = regreq.nr_rx_slots; 1343 req->nr_tx_rings = regreq.nr_tx_rings; 1344 req->nr_rx_rings = regreq.nr_rx_rings; 1345 req->nr_mem_id = regreq.nr_mem_id; 1346 return error; 1347 } 1348 1349 /* remove a persistent VALE port from the system */ 1350 int 1351 nm_vi_destroy(const char *name) 1352 { 1353 if_t ifp; 1354 struct netmap_vp_adapter *vpna; 1355 int error; 1356 1357 ifp = ifunit_ref(name); 1358 if (!ifp) 1359 return ENXIO; 1360 NMG_LOCK(); 1361 /* make sure this is actually a VALE port */ 1362 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { 1363 error = EINVAL; 1364 goto err; 1365 } 1366 1367 vpna = (struct netmap_vp_adapter *)NA(ifp); 1368 1369 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */ 1370 if (vpna->autodelete) { 1371 error = EINVAL; 1372 goto err; 1373 } 1374 1375 /* also make sure that nobody is using the interface */ 1376 if (NETMAP_OWNED_BY_ANY(&vpna->up) || 1377 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) { 1378 error = EBUSY; 1379 goto err; 1380 } 1381 1382 NMG_UNLOCK(); 1383 1384 if (netmap_verbose) 1385 nm_prinf("destroying a persistent vale interface %s", if_name(ifp)); 1386 /* Linux requires all the references are released 1387 * before unregister 1388 */ 1389 netmap_detach(ifp); 1390 if_rele(ifp); 1391 nm_os_vi_detach(ifp); 1392 return 0; 1393 1394 err: 1395 NMG_UNLOCK(); 1396 if_rele(ifp); 1397 return error; 1398 } 1399 1400 static int 1401 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na) 1402 { 1403 req->nr_rx_rings = na->num_rx_rings; 1404 req->nr_tx_rings = na->num_tx_rings; 1405 req->nr_rx_slots = na->num_rx_desc; 1406 req->nr_tx_slots = na->num_tx_desc; 1407 return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL, 1408 &req->nr_mem_id); 1409 } 1410 1411 1412 /* 1413 * Create a virtual interface registered to the system. 1414 * The interface will be attached to a bridge later. 1415 */ 1416 int 1417 netmap_vi_create(struct nmreq_header *hdr, int autodelete) 1418 { 1419 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1420 if_t ifp; 1421 struct netmap_vp_adapter *vpna; 1422 struct netmap_mem_d *nmd = NULL; 1423 int error; 1424 1425 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1426 return EINVAL; 1427 } 1428 1429 /* don't include VALE prefix */ 1430 if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) 1431 return EINVAL; 1432 if (strlen(hdr->nr_name) >= IFNAMSIZ) { 1433 return EINVAL; 1434 } 1435 ifp = ifunit_ref(hdr->nr_name); 1436 if (ifp) { /* already exist, cannot create new one */ 1437 error = EEXIST; 1438 NMG_LOCK(); 1439 if (NM_NA_VALID(ifp)) { 1440 int update_err = nm_update_info(req, NA(ifp)); 1441 if (update_err) 1442 error = update_err; 1443 } 1444 NMG_UNLOCK(); 1445 if_rele(ifp); 1446 return error; 1447 } 1448 error = nm_os_vi_persist(hdr->nr_name, &ifp); 1449 if (error) 1450 return error; 1451 1452 NMG_LOCK(); 1453 if (req->nr_mem_id) { 1454 nmd = netmap_mem_find(req->nr_mem_id); 1455 if (nmd == NULL) { 1456 error = EINVAL; 1457 goto err_1; 1458 } 1459 } 1460 /* netmap_vp_create creates a struct netmap_vp_adapter */ 1461 error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna); 1462 if (error) { 1463 if (netmap_debug & NM_DEBUG_VALE) 1464 nm_prerr("error %d", error); 1465 goto err_1; 1466 } 1467 /* persist-specific routines */ 1468 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; 1469 if (!autodelete) { 1470 netmap_adapter_get(&vpna->up); 1471 } else { 1472 vpna->autodelete = 1; 1473 } 1474 NM_ATTACH_NA(ifp, &vpna->up); 1475 /* return the updated info */ 1476 error = nm_update_info(req, &vpna->up); 1477 if (error) { 1478 goto err_2; 1479 } 1480 nm_prdis("returning nr_mem_id %d", req->nr_mem_id); 1481 if (nmd) 1482 netmap_mem_put(nmd); 1483 NMG_UNLOCK(); 1484 nm_prdis("created %s", if_name(ifp)); 1485 return 0; 1486 1487 err_2: 1488 netmap_detach(ifp); 1489 err_1: 1490 if (nmd) 1491 netmap_mem_put(nmd); 1492 NMG_UNLOCK(); 1493 nm_os_vi_detach(ifp); 1494 1495 return error; 1496 } 1497 1498 #endif /* WITH_VALE */ 1499