1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (C) 2013-2016 Universita` di Pisa 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 30 #if defined(__FreeBSD__) 31 #include <sys/cdefs.h> /* prerequisite */ 32 __FBSDID("$FreeBSD$"); 33 34 #include <sys/types.h> 35 #include <sys/errno.h> 36 #include <sys/param.h> /* defines used in kernel.h */ 37 #include <sys/kernel.h> /* types used in module initialization */ 38 #include <sys/conf.h> /* cdevsw struct, UID, GID */ 39 #include <sys/sockio.h> 40 #include <sys/socketvar.h> /* struct socket */ 41 #include <sys/malloc.h> 42 #include <sys/poll.h> 43 #include <sys/rwlock.h> 44 #include <sys/socket.h> /* sockaddrs */ 45 #include <sys/selinfo.h> 46 #include <sys/sysctl.h> 47 #include <net/if.h> 48 #include <net/if_var.h> 49 #include <net/bpf.h> /* BIOCIMMEDIATE */ 50 #include <machine/bus.h> /* bus_dmamap_* */ 51 #include <sys/endian.h> 52 #include <sys/refcount.h> 53 #include <sys/smp.h> 54 55 56 #elif defined(linux) 57 58 #include "bsd_glue.h" 59 60 #elif defined(__APPLE__) 61 62 #warning OSX support is only partial 63 #include "osx_glue.h" 64 65 #elif defined(_WIN32) 66 #include "win_glue.h" 67 68 #else 69 70 #error Unsupported platform 71 72 #endif /* unsupported */ 73 74 /* 75 * common headers 76 */ 77 78 #include <net/netmap.h> 79 #include <dev/netmap/netmap_kern.h> 80 #include <dev/netmap/netmap_mem2.h> 81 #include <dev/netmap/netmap_bdg.h> 82 83 #ifdef WITH_VALE 84 85 /* 86 * system parameters (most of them in netmap_kern.h) 87 * NM_BDG_NAME prefix for switch port names, default "vale" 88 * NM_BDG_MAXPORTS number of ports 89 * NM_BRIDGES max number of switches in the system. 90 * XXX should become a sysctl or tunable 91 * 92 * Switch ports are named valeX:Y where X is the switch name and Y 93 * is the port. If Y matches a physical interface name, the port is 94 * connected to a physical device. 95 * 96 * Unlike physical interfaces, switch ports use their own memory region 97 * for rings and buffers. 98 * The virtual interfaces use per-queue lock instead of core lock. 99 * In the tx loop, we aggregate traffic in batches to make all operations 100 * faster. The batch size is bridge_batch. 101 */ 102 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ 103 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ 104 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */ 105 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ 106 /* actual size of the tables */ 107 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NETMAP_MAX_FRAGS) 108 /* NM_FT_NULL terminates a list of slots in the ft */ 109 #define NM_FT_NULL NM_BDG_BATCH_MAX 110 111 112 /* 113 * bridge_batch is set via sysctl to the max batch size to be 114 * used in the bridge. The actual value may be larger as the 115 * last packet in the block may overflow the size. 116 */ 117 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ 118 SYSBEGIN(vars_vale); 119 SYSCTL_DECL(_dev_netmap); 120 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, 121 "Max batch size to be used in the bridge"); 122 SYSEND; 123 124 static int netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *, 125 struct netmap_mem_d *nmd, struct netmap_vp_adapter **); 126 static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *, 127 struct nm_bridge *); 128 static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *); 129 130 /* 131 * For each output interface, nm_vale_q is used to construct a list. 132 * bq_len is the number of output buffers (we can have coalescing 133 * during the copy). 134 */ 135 struct nm_vale_q { 136 uint16_t bq_head; 137 uint16_t bq_tail; 138 uint32_t bq_len; /* number of buffers */ 139 }; 140 141 /* Holds the default callbacks */ 142 struct netmap_bdg_ops vale_bdg_ops = { 143 .lookup = netmap_vale_learning, 144 .config = NULL, 145 .dtor = NULL, 146 .vp_create = netmap_vale_vp_create, 147 .bwrap_attach = netmap_vale_bwrap_attach, 148 .name = NM_BDG_NAME, 149 }; 150 151 /* 152 * this is a slightly optimized copy routine which rounds 153 * to multiple of 64 bytes and is often faster than dealing 154 * with other odd sizes. We assume there is enough room 155 * in the source and destination buffers. 156 * 157 * XXX only for multiples of 64 bytes, non overlapped. 158 */ 159 static inline void 160 pkt_copy(void *_src, void *_dst, int l) 161 { 162 uint64_t *src = _src; 163 uint64_t *dst = _dst; 164 if (unlikely(l >= 1024)) { 165 memcpy(dst, src, l); 166 return; 167 } 168 for (; likely(l > 0); l-=64) { 169 *dst++ = *src++; 170 *dst++ = *src++; 171 *dst++ = *src++; 172 *dst++ = *src++; 173 *dst++ = *src++; 174 *dst++ = *src++; 175 *dst++ = *src++; 176 *dst++ = *src++; 177 } 178 } 179 180 181 /* 182 * Free the forwarding tables for rings attached to switch ports. 183 */ 184 static void 185 nm_free_bdgfwd(struct netmap_adapter *na) 186 { 187 int nrings, i; 188 struct netmap_kring **kring; 189 190 NMG_LOCK_ASSERT(); 191 nrings = na->num_tx_rings; 192 kring = na->tx_rings; 193 for (i = 0; i < nrings; i++) { 194 if (kring[i]->nkr_ft) { 195 nm_os_free(kring[i]->nkr_ft); 196 kring[i]->nkr_ft = NULL; /* protect from freeing twice */ 197 } 198 } 199 } 200 201 202 /* 203 * Allocate the forwarding tables for the rings attached to the bridge ports. 204 */ 205 static int 206 nm_alloc_bdgfwd(struct netmap_adapter *na) 207 { 208 int nrings, l, i, num_dstq; 209 struct netmap_kring **kring; 210 211 NMG_LOCK_ASSERT(); 212 /* all port:rings + broadcast */ 213 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; 214 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; 215 l += sizeof(struct nm_vale_q) * num_dstq; 216 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; 217 218 nrings = netmap_real_rings(na, NR_TX); 219 kring = na->tx_rings; 220 for (i = 0; i < nrings; i++) { 221 struct nm_bdg_fwd *ft; 222 struct nm_vale_q *dstq; 223 int j; 224 225 ft = nm_os_malloc(l); 226 if (!ft) { 227 nm_free_bdgfwd(na); 228 return ENOMEM; 229 } 230 dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX); 231 for (j = 0; j < num_dstq; j++) { 232 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; 233 dstq[j].bq_len = 0; 234 } 235 kring[i]->nkr_ft = ft; 236 } 237 return 0; 238 } 239 240 /* Allows external modules to create bridges in exclusive mode, 241 * returns an authentication token that the external module will need 242 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(), 243 * and nm_bdg_update_private_data() operations. 244 * Successfully executed if ret != NULL and *return_status == 0. 245 */ 246 void * 247 netmap_vale_create(const char *bdg_name, int *return_status) 248 { 249 struct nm_bridge *b = NULL; 250 void *ret = NULL; 251 252 NMG_LOCK(); 253 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 254 if (b) { 255 *return_status = EEXIST; 256 goto unlock_bdg_create; 257 } 258 259 b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops); 260 if (!b) { 261 *return_status = ENOMEM; 262 goto unlock_bdg_create; 263 } 264 265 b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE; 266 ret = nm_bdg_get_auth_token(b); 267 *return_status = 0; 268 269 unlock_bdg_create: 270 NMG_UNLOCK(); 271 return ret; 272 } 273 274 /* Allows external modules to destroy a bridge created through 275 * netmap_bdg_create(), the bridge must be empty. 276 */ 277 int 278 netmap_vale_destroy(const char *bdg_name, void *auth_token) 279 { 280 struct nm_bridge *b = NULL; 281 int ret = 0; 282 283 NMG_LOCK(); 284 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL); 285 if (!b) { 286 ret = ENXIO; 287 goto unlock_bdg_free; 288 } 289 290 if (!nm_bdg_valid_auth_token(b, auth_token)) { 291 ret = EACCES; 292 goto unlock_bdg_free; 293 } 294 if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) { 295 ret = EINVAL; 296 goto unlock_bdg_free; 297 } 298 299 b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE); 300 ret = netmap_bdg_free(b); 301 if (ret) { 302 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE; 303 } 304 305 unlock_bdg_free: 306 NMG_UNLOCK(); 307 return ret; 308 } 309 310 /* Process NETMAP_REQ_VALE_LIST. */ 311 int 312 netmap_vale_list(struct nmreq_header *hdr) 313 { 314 struct nmreq_vale_list *req = 315 (struct nmreq_vale_list *)(uintptr_t)hdr->nr_body; 316 int namelen = strlen(hdr->nr_name); 317 struct nm_bridge *b, *bridges; 318 struct netmap_vp_adapter *vpna; 319 int error = 0, i, j; 320 u_int num_bridges; 321 322 netmap_bns_getbridges(&bridges, &num_bridges); 323 324 /* this is used to enumerate bridges and ports */ 325 if (namelen) { /* look up indexes of bridge and port */ 326 if (strncmp(hdr->nr_name, NM_BDG_NAME, 327 strlen(NM_BDG_NAME))) { 328 return EINVAL; 329 } 330 NMG_LOCK(); 331 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL); 332 if (!b) { 333 NMG_UNLOCK(); 334 return ENOENT; 335 } 336 337 req->nr_bridge_idx = b - bridges; /* bridge index */ 338 req->nr_port_idx = NM_BDG_NOPORT; 339 for (j = 0; j < b->bdg_active_ports; j++) { 340 i = b->bdg_port_index[j]; 341 vpna = b->bdg_ports[i]; 342 if (vpna == NULL) { 343 nm_prerr("This should not happen"); 344 continue; 345 } 346 /* the former and the latter identify a 347 * virtual port and a NIC, respectively 348 */ 349 if (!strcmp(vpna->up.name, hdr->nr_name)) { 350 req->nr_port_idx = i; /* port index */ 351 break; 352 } 353 } 354 NMG_UNLOCK(); 355 } else { 356 /* return the first non-empty entry starting from 357 * bridge nr_arg1 and port nr_arg2. 358 * 359 * Users can detect the end of the same bridge by 360 * seeing the new and old value of nr_arg1, and can 361 * detect the end of all the bridge by error != 0 362 */ 363 i = req->nr_bridge_idx; 364 j = req->nr_port_idx; 365 366 NMG_LOCK(); 367 for (error = ENOENT; i < NM_BRIDGES; i++) { 368 b = bridges + i; 369 for ( ; j < NM_BDG_MAXPORTS; j++) { 370 if (b->bdg_ports[j] == NULL) 371 continue; 372 vpna = b->bdg_ports[j]; 373 /* write back the VALE switch name */ 374 strlcpy(hdr->nr_name, vpna->up.name, 375 sizeof(hdr->nr_name)); 376 error = 0; 377 goto out; 378 } 379 j = 0; /* following bridges scan from 0 */ 380 } 381 out: 382 req->nr_bridge_idx = i; 383 req->nr_port_idx = j; 384 NMG_UNLOCK(); 385 } 386 387 return error; 388 } 389 390 /* Process NETMAP_REQ_VALE_ATTACH. 391 */ 392 int 393 netmap_vale_attach(struct nmreq_header *hdr, void *auth_token) 394 { 395 struct nmreq_vale_attach *req = 396 (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body; 397 struct netmap_vp_adapter * vpna; 398 struct netmap_adapter *na = NULL; 399 struct netmap_mem_d *nmd = NULL; 400 struct nm_bridge *b = NULL; 401 int error; 402 403 NMG_LOCK(); 404 /* permission check for modified bridges */ 405 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL); 406 if (b && !nm_bdg_valid_auth_token(b, auth_token)) { 407 error = EACCES; 408 goto unlock_exit; 409 } 410 411 if (req->reg.nr_mem_id) { 412 nmd = netmap_mem_find(req->reg.nr_mem_id); 413 if (nmd == NULL) { 414 error = EINVAL; 415 goto unlock_exit; 416 } 417 } 418 419 /* check for existing one */ 420 error = netmap_get_vale_na(hdr, &na, nmd, 0); 421 if (na) { 422 error = EBUSY; 423 goto unref_exit; 424 } 425 error = netmap_get_vale_na(hdr, &na, 426 nmd, 1 /* create if not exists */); 427 if (error) { /* no device */ 428 goto unlock_exit; 429 } 430 431 if (na == NULL) { /* VALE prefix missing */ 432 error = EINVAL; 433 goto unlock_exit; 434 } 435 436 if (NETMAP_OWNED_BY_ANY(na)) { 437 error = EBUSY; 438 goto unref_exit; 439 } 440 441 if (na->nm_bdg_ctl) { 442 /* nop for VALE ports. The bwrap needs to put the hwna 443 * in netmap mode (see netmap_bwrap_bdg_ctl) 444 */ 445 error = na->nm_bdg_ctl(hdr, na); 446 if (error) 447 goto unref_exit; 448 nm_prdis("registered %s to netmap-mode", na->name); 449 } 450 vpna = (struct netmap_vp_adapter *)na; 451 req->port_index = vpna->bdg_port; 452 453 if (nmd) 454 netmap_mem_put(nmd); 455 456 NMG_UNLOCK(); 457 return 0; 458 459 unref_exit: 460 netmap_adapter_put(na); 461 unlock_exit: 462 if (nmd) 463 netmap_mem_put(nmd); 464 465 NMG_UNLOCK(); 466 return error; 467 } 468 469 /* Process NETMAP_REQ_VALE_DETACH. 470 */ 471 int 472 netmap_vale_detach(struct nmreq_header *hdr, void *auth_token) 473 { 474 struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body; 475 struct netmap_vp_adapter *vpna; 476 struct netmap_adapter *na; 477 struct nm_bridge *b = NULL; 478 int error; 479 480 NMG_LOCK(); 481 /* permission check for modified bridges */ 482 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL); 483 if (b && !nm_bdg_valid_auth_token(b, auth_token)) { 484 error = EACCES; 485 goto unlock_exit; 486 } 487 488 error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */); 489 if (error) { /* no device, or another bridge or user owns the device */ 490 goto unlock_exit; 491 } 492 493 if (na == NULL) { /* VALE prefix missing */ 494 error = EINVAL; 495 goto unlock_exit; 496 } else if (nm_is_bwrap(na) && 497 ((struct netmap_bwrap_adapter *)na)->na_polling_state) { 498 /* Don't detach a NIC with polling */ 499 error = EBUSY; 500 goto unref_exit; 501 } 502 503 vpna = (struct netmap_vp_adapter *)na; 504 if (na->na_vp != vpna) { 505 /* trying to detach first attach of VALE persistent port attached 506 * to 2 bridges 507 */ 508 error = EBUSY; 509 goto unref_exit; 510 } 511 nmreq_det->port_index = vpna->bdg_port; 512 513 if (na->nm_bdg_ctl) { 514 /* remove the port from bridge. The bwrap 515 * also needs to put the hwna in normal mode 516 */ 517 error = na->nm_bdg_ctl(hdr, na); 518 } 519 520 unref_exit: 521 netmap_adapter_put(na); 522 unlock_exit: 523 NMG_UNLOCK(); 524 return error; 525 526 } 527 528 529 /* nm_dtor callback for ephemeral VALE ports */ 530 static void 531 netmap_vale_vp_dtor(struct netmap_adapter *na) 532 { 533 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; 534 struct nm_bridge *b = vpna->na_bdg; 535 536 nm_prdis("%s has %d references", na->name, na->na_refcount); 537 538 if (b) { 539 netmap_bdg_detach_common(b, vpna->bdg_port, -1); 540 } 541 542 if (na->ifp != NULL && !nm_iszombie(na)) { 543 NM_DETACH_NA(na->ifp); 544 if (vpna->autodelete) { 545 nm_prdis("releasing %s", na->ifp->if_xname); 546 NMG_UNLOCK(); 547 nm_os_vi_detach(na->ifp); 548 NMG_LOCK(); 549 } 550 } 551 } 552 553 554 555 /* nm_krings_create callback for VALE ports. 556 * Calls the standard netmap_krings_create, then adds leases on rx 557 * rings and bdgfwd on tx rings. 558 */ 559 static int 560 netmap_vale_vp_krings_create(struct netmap_adapter *na) 561 { 562 u_int tailroom; 563 int error, i; 564 uint32_t *leases; 565 u_int nrx = netmap_real_rings(na, NR_RX); 566 567 /* 568 * Leases are attached to RX rings on vale ports 569 */ 570 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; 571 572 error = netmap_krings_create(na, tailroom); 573 if (error) 574 return error; 575 576 leases = na->tailroom; 577 578 for (i = 0; i < nrx; i++) { /* Receive rings */ 579 na->rx_rings[i]->nkr_leases = leases; 580 leases += na->num_rx_desc; 581 } 582 583 error = nm_alloc_bdgfwd(na); 584 if (error) { 585 netmap_krings_delete(na); 586 return error; 587 } 588 589 return 0; 590 } 591 592 593 /* nm_krings_delete callback for VALE ports. */ 594 static void 595 netmap_vale_vp_krings_delete(struct netmap_adapter *na) 596 { 597 nm_free_bdgfwd(na); 598 netmap_krings_delete(na); 599 } 600 601 602 static int 603 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, 604 struct netmap_vp_adapter *na, u_int ring_nr); 605 606 607 /* 608 * main dispatch routine for the bridge. 609 * Grab packets from a kring, move them into the ft structure 610 * associated to the tx (input) port. Max one instance per port, 611 * filtered on input (ioctl, poll or XXX). 612 * Returns the next position in the ring. 613 */ 614 static int 615 nm_vale_preflush(struct netmap_kring *kring, u_int end) 616 { 617 struct netmap_vp_adapter *na = 618 (struct netmap_vp_adapter*)kring->na; 619 struct netmap_ring *ring = kring->ring; 620 struct nm_bdg_fwd *ft; 621 u_int ring_nr = kring->ring_id; 622 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; 623 u_int ft_i = 0; /* start from 0 */ 624 u_int frags = 1; /* how many frags ? */ 625 struct nm_bridge *b = na->na_bdg; 626 627 /* To protect against modifications to the bridge we acquire a 628 * shared lock, waiting if we can sleep (if the source port is 629 * attached to a user process) or with a trylock otherwise (NICs). 630 */ 631 nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); 632 if (na->up.na_flags & NAF_BDG_MAYSLEEP) 633 BDG_RLOCK(b); 634 else if (!BDG_RTRYLOCK(b)) 635 return j; 636 nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); 637 ft = kring->nkr_ft; 638 639 for (; likely(j != end); j = nm_next(j, lim)) { 640 struct netmap_slot *slot = &ring->slot[j]; 641 char *buf; 642 643 ft[ft_i].ft_len = slot->len; 644 ft[ft_i].ft_flags = slot->flags; 645 ft[ft_i].ft_offset = 0; 646 647 nm_prdis("flags is 0x%x", slot->flags); 648 /* we do not use the buf changed flag, but we still need to reset it */ 649 slot->flags &= ~NS_BUF_CHANGED; 650 651 /* this slot goes into a list so initialize the link field */ 652 ft[ft_i].ft_next = NM_FT_NULL; 653 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 654 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot); 655 if (unlikely(buf == NULL)) { 656 nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d", 657 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT", 658 kring->name, j, ft[ft_i].ft_len); 659 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up); 660 ft[ft_i].ft_len = 0; 661 ft[ft_i].ft_flags = 0; 662 } 663 __builtin_prefetch(buf); 664 ++ft_i; 665 if (slot->flags & NS_MOREFRAG) { 666 frags++; 667 continue; 668 } 669 if (unlikely(netmap_verbose && frags > 1)) 670 nm_prlim(5, "%d frags at %d", frags, ft_i - frags); 671 ft[ft_i - frags].ft_frags = frags; 672 frags = 1; 673 if (unlikely((int)ft_i >= bridge_batch)) 674 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 675 } 676 if (frags > 1) { 677 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we 678 * have to fix frags count. */ 679 frags--; 680 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG; 681 ft[ft_i - frags].ft_frags = frags; 682 nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags); 683 } 684 if (ft_i) 685 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr); 686 BDG_RUNLOCK(b); 687 return j; 688 } 689 690 691 /* ----- FreeBSD if_bridge hash function ------- */ 692 693 /* 694 * The following hash function is adapted from "Hash Functions" by Bob Jenkins 695 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). 696 * 697 * http://www.burtleburtle.net/bob/hash/spooky.html 698 */ 699 #define mix(a, b, c) \ 700 do { \ 701 a -= b; a -= c; a ^= (c >> 13); \ 702 b -= c; b -= a; b ^= (a << 8); \ 703 c -= a; c -= b; c ^= (b >> 13); \ 704 a -= b; a -= c; a ^= (c >> 12); \ 705 b -= c; b -= a; b ^= (a << 16); \ 706 c -= a; c -= b; c ^= (b >> 5); \ 707 a -= b; a -= c; a ^= (c >> 3); \ 708 b -= c; b -= a; b ^= (a << 10); \ 709 c -= a; c -= b; c ^= (b >> 15); \ 710 } while (/*CONSTCOND*/0) 711 712 713 static __inline uint32_t 714 nm_vale_rthash(const uint8_t *addr) 715 { 716 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key 717 718 b += addr[5] << 8; 719 b += addr[4]; 720 a += addr[3] << 24; 721 a += addr[2] << 16; 722 a += addr[1] << 8; 723 a += addr[0]; 724 725 mix(a, b, c); 726 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) 727 return (c & BRIDGE_RTHASH_MASK); 728 } 729 730 #undef mix 731 732 733 /* 734 * Lookup function for a learning bridge. 735 * Update the hash table with the source address, 736 * and then returns the destination port index, and the 737 * ring in *dst_ring (at the moment, always use ring 0) 738 */ 739 uint32_t 740 netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring, 741 struct netmap_vp_adapter *na, void *private_data) 742 { 743 uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset; 744 u_int buf_len = ft->ft_len - ft->ft_offset; 745 struct nm_hash_ent *ht = private_data; 746 uint32_t sh, dh; 747 u_int dst, mysrc = na->bdg_port; 748 uint64_t smac, dmac; 749 uint8_t indbuf[12]; 750 751 if (buf_len < 14) { 752 return NM_BDG_NOPORT; 753 } 754 755 if (ft->ft_flags & NS_INDIRECT) { 756 if (copyin(buf, indbuf, sizeof(indbuf))) { 757 return NM_BDG_NOPORT; 758 } 759 buf = indbuf; 760 } 761 762 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; 763 smac = le64toh(*(uint64_t *)(buf + 4)); 764 smac >>= 16; 765 766 /* 767 * The hash is somewhat expensive, there might be some 768 * worthwhile optimizations here. 769 */ 770 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */ 771 uint8_t *s = buf+6; 772 sh = nm_vale_rthash(s); /* hash of source */ 773 /* update source port forwarding entry */ 774 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */ 775 ht[sh].ports = mysrc; 776 if (netmap_debug & NM_DEBUG_VALE) 777 nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", 778 s[0], s[1], s[2], s[3], s[4], s[5], mysrc); 779 } 780 dst = NM_BDG_BROADCAST; 781 if ((buf[0] & 1) == 0) { /* unicast */ 782 dh = nm_vale_rthash(buf); /* hash of dst */ 783 if (ht[dh].mac == dmac) { /* found dst */ 784 dst = ht[dh].ports; 785 } 786 } 787 return dst; 788 } 789 790 791 /* 792 * Available space in the ring. Only used in VALE code 793 * and only with is_rx = 1 794 */ 795 static inline uint32_t 796 nm_kr_space(struct netmap_kring *k, int is_rx) 797 { 798 int space; 799 800 if (is_rx) { 801 int busy = k->nkr_hwlease - k->nr_hwcur; 802 if (busy < 0) 803 busy += k->nkr_num_slots; 804 space = k->nkr_num_slots - 1 - busy; 805 } else { 806 /* XXX never used in this branch */ 807 space = k->nr_hwtail - k->nkr_hwlease; 808 if (space < 0) 809 space += k->nkr_num_slots; 810 } 811 #if 0 812 // sanity check 813 if (k->nkr_hwlease >= k->nkr_num_slots || 814 k->nr_hwcur >= k->nkr_num_slots || 815 k->nr_tail >= k->nkr_num_slots || 816 busy < 0 || 817 busy >= k->nkr_num_slots) { 818 nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", 819 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 820 k->nkr_lease_idx, k->nkr_num_slots); 821 } 822 #endif 823 return space; 824 } 825 826 827 828 829 /* make a lease on the kring for N positions. return the 830 * lease index 831 * XXX only used in VALE code and with is_rx = 1 832 */ 833 static inline uint32_t 834 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) 835 { 836 uint32_t lim = k->nkr_num_slots - 1; 837 uint32_t lease_idx = k->nkr_lease_idx; 838 839 k->nkr_leases[lease_idx] = NR_NOSLOT; 840 k->nkr_lease_idx = nm_next(lease_idx, lim); 841 842 #ifdef CONFIG_NETMAP_DEBUG 843 if (n > nm_kr_space(k, is_rx)) { 844 nm_prerr("invalid request for %d slots", n); 845 panic("x"); 846 } 847 #endif /* CONFIG NETMAP_DEBUG */ 848 /* XXX verify that there are n slots */ 849 k->nkr_hwlease += n; 850 if (k->nkr_hwlease > lim) 851 k->nkr_hwlease -= lim + 1; 852 853 #ifdef CONFIG_NETMAP_DEBUG 854 if (k->nkr_hwlease >= k->nkr_num_slots || 855 k->nr_hwcur >= k->nkr_num_slots || 856 k->nr_hwtail >= k->nkr_num_slots || 857 k->nkr_lease_idx >= k->nkr_num_slots) { 858 nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", 859 k->na->name, 860 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, 861 k->nkr_lease_idx, k->nkr_num_slots); 862 } 863 #endif /* CONFIG_NETMAP_DEBUG */ 864 return lease_idx; 865 } 866 867 /* 868 * 869 * This flush routine supports only unicast and broadcast but a large 870 * number of ports, and lets us replace the learn and dispatch functions. 871 */ 872 int 873 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, 874 u_int ring_nr) 875 { 876 struct nm_vale_q *dst_ents, *brddst; 877 uint16_t num_dsts = 0, *dsts; 878 struct nm_bridge *b = na->na_bdg; 879 u_int i, me = na->bdg_port; 880 881 /* 882 * The work area (pointed by ft) is followed by an array of 883 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS 884 * queues per port plus one for the broadcast traffic. 885 * Then we have an array of destination indexes. 886 */ 887 dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX); 888 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); 889 890 /* first pass: find a destination for each packet in the batch */ 891 for (i = 0; likely(i < n); i += ft[i].ft_frags) { 892 uint8_t dst_ring = ring_nr; /* default, same ring as origin */ 893 uint16_t dst_port, d_i; 894 struct nm_vale_q *d; 895 struct nm_bdg_fwd *start_ft = NULL; 896 897 nm_prdis("slot %d frags %d", i, ft[i].ft_frags); 898 899 if (na->up.virt_hdr_len < ft[i].ft_len) { 900 ft[i].ft_offset = na->up.virt_hdr_len; 901 start_ft = &ft[i]; 902 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) { 903 ft[i].ft_offset = ft[i].ft_len; 904 start_ft = &ft[i+1]; 905 } else { 906 /* Drop the packet if the virtio-net header is not into the first 907 * fragment nor at the very beginning of the second. 908 */ 909 continue; 910 } 911 dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data); 912 if (netmap_verbose > 255) 913 nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port); 914 if (dst_port >= NM_BDG_NOPORT) 915 continue; /* this packet is identified to be dropped */ 916 else if (dst_port == NM_BDG_BROADCAST) 917 dst_ring = 0; /* broadcasts always go to ring 0 */ 918 else if (unlikely(dst_port == me || 919 !b->bdg_ports[dst_port])) 920 continue; 921 922 /* get a position in the scratch pad */ 923 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; 924 d = dst_ents + d_i; 925 926 /* append the first fragment to the list */ 927 if (d->bq_head == NM_FT_NULL) { /* new destination */ 928 d->bq_head = d->bq_tail = i; 929 /* remember this position to be scanned later */ 930 if (dst_port != NM_BDG_BROADCAST) 931 dsts[num_dsts++] = d_i; 932 } else { 933 ft[d->bq_tail].ft_next = i; 934 d->bq_tail = i; 935 } 936 d->bq_len += ft[i].ft_frags; 937 } 938 939 /* 940 * Broadcast traffic goes to ring 0 on all destinations. 941 * So we need to add these rings to the list of ports to scan. 942 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is 943 * expensive. We should keep a compact list of active destinations 944 * so we could shorten this loop. 945 */ 946 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; 947 if (brddst->bq_head != NM_FT_NULL) { 948 u_int j; 949 for (j = 0; likely(j < b->bdg_active_ports); j++) { 950 uint16_t d_i; 951 i = b->bdg_port_index[j]; 952 if (unlikely(i == me)) 953 continue; 954 d_i = i * NM_BDG_MAXRINGS; 955 if (dst_ents[d_i].bq_head == NM_FT_NULL) 956 dsts[num_dsts++] = d_i; 957 } 958 } 959 960 nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts); 961 /* second pass: scan destinations */ 962 for (i = 0; i < num_dsts; i++) { 963 struct netmap_vp_adapter *dst_na; 964 struct netmap_kring *kring; 965 struct netmap_ring *ring; 966 u_int dst_nr, lim, j, d_i, next, brd_next; 967 u_int needed, howmany; 968 int retry = netmap_txsync_retry; 969 struct nm_vale_q *d; 970 uint32_t my_start = 0, lease_idx = 0; 971 int nrings; 972 int virt_hdr_mismatch = 0; 973 974 d_i = dsts[i]; 975 nm_prdis("second pass %d port %d", i, d_i); 976 d = dst_ents + d_i; 977 // XXX fix the division 978 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; 979 /* protect from the lookup function returning an inactive 980 * destination port 981 */ 982 if (unlikely(dst_na == NULL)) 983 goto cleanup; 984 if (dst_na->up.na_flags & NAF_SW_ONLY) 985 goto cleanup; 986 /* 987 * The interface may be in !netmap mode in two cases: 988 * - when na is attached but not activated yet; 989 * - when na is being deactivated but is still attached. 990 */ 991 if (unlikely(!nm_netmap_on(&dst_na->up))) { 992 nm_prdis("not in netmap mode!"); 993 goto cleanup; 994 } 995 996 /* there is at least one either unicast or broadcast packet */ 997 brd_next = brddst->bq_head; 998 next = d->bq_head; 999 /* we need to reserve this many slots. If fewer are 1000 * available, some packets will be dropped. 1001 * Packets may have multiple fragments, so we may not use 1002 * there is a chance that we may not use all of the slots 1003 * we have claimed, so we will need to handle the leftover 1004 * ones when we regain the lock. 1005 */ 1006 needed = d->bq_len + brddst->bq_len; 1007 1008 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) { 1009 if (netmap_verbose) { 1010 nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len, 1011 dst_na->up.virt_hdr_len); 1012 } 1013 /* There is a virtio-net header/offloadings mismatch between 1014 * source and destination. The slower mismatch datapath will 1015 * be used to cope with all the mismatches. 1016 */ 1017 virt_hdr_mismatch = 1; 1018 if (dst_na->mfs < na->mfs) { 1019 /* We may need to do segmentation offloadings, and so 1020 * we may need a number of destination slots greater 1021 * than the number of input slots ('needed'). 1022 * We look for the smallest integer 'x' which satisfies: 1023 * needed * na->mfs + x * H <= x * na->mfs 1024 * where 'H' is the length of the longest header that may 1025 * be replicated in the segmentation process (e.g. for 1026 * TCPv4 we must account for ethernet header, IP header 1027 * and TCPv4 header). 1028 */ 1029 KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0")); 1030 needed = (needed * na->mfs) / 1031 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; 1032 nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); 1033 } 1034 } 1035 1036 nm_prdis(5, "pass 2 dst %d is %x %s", 1037 i, d_i, is_vp ? "virtual" : "nic/host"); 1038 dst_nr = d_i & (NM_BDG_MAXRINGS-1); 1039 nrings = dst_na->up.num_rx_rings; 1040 if (dst_nr >= nrings) 1041 dst_nr = dst_nr % nrings; 1042 kring = dst_na->up.rx_rings[dst_nr]; 1043 ring = kring->ring; 1044 /* the destination ring may have not been opened for RX */ 1045 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON)) 1046 goto cleanup; 1047 lim = kring->nkr_num_slots - 1; 1048 1049 retry: 1050 1051 if (dst_na->retry && retry) { 1052 /* try to get some free slot from the previous run */ 1053 kring->nm_notify(kring, NAF_FORCE_RECLAIM); 1054 /* actually useful only for bwraps, since there 1055 * the notify will trigger a txsync on the hwna. VALE ports 1056 * have dst_na->retry == 0 1057 */ 1058 } 1059 /* reserve the buffers in the queue and an entry 1060 * to report completion, and drop lock. 1061 * XXX this might become a helper function. 1062 */ 1063 mtx_lock(&kring->q_lock); 1064 if (kring->nkr_stopped) { 1065 mtx_unlock(&kring->q_lock); 1066 goto cleanup; 1067 } 1068 my_start = j = kring->nkr_hwlease; 1069 howmany = nm_kr_space(kring, 1); 1070 if (needed < howmany) 1071 howmany = needed; 1072 lease_idx = nm_kr_lease(kring, howmany, 1); 1073 mtx_unlock(&kring->q_lock); 1074 1075 /* only retry if we need more than available slots */ 1076 if (retry && needed <= howmany) 1077 retry = 0; 1078 1079 /* copy to the destination queue */ 1080 while (howmany > 0) { 1081 struct netmap_slot *slot; 1082 struct nm_bdg_fwd *ft_p, *ft_end; 1083 u_int cnt; 1084 1085 /* find the queue from which we pick next packet. 1086 * NM_FT_NULL is always higher than valid indexes 1087 * so we never dereference it if the other list 1088 * has packets (and if both are empty we never 1089 * get here). 1090 */ 1091 if (next < brd_next) { 1092 ft_p = ft + next; 1093 next = ft_p->ft_next; 1094 } else { /* insert broadcast */ 1095 ft_p = ft + brd_next; 1096 brd_next = ft_p->ft_next; 1097 } 1098 cnt = ft_p->ft_frags; // cnt > 0 1099 if (unlikely(cnt > howmany)) 1100 break; /* no more space */ 1101 if (netmap_verbose && cnt > 1) 1102 nm_prlim(5, "rx %d frags to %d", cnt, j); 1103 ft_end = ft_p + cnt; 1104 if (unlikely(virt_hdr_mismatch)) { 1105 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); 1106 } else { 1107 howmany -= cnt; 1108 do { 1109 char *dst, *src = ft_p->ft_buf; 1110 size_t copy_len = ft_p->ft_len, dst_len = copy_len; 1111 1112 slot = &ring->slot[j]; 1113 dst = NMB(&dst_na->up, slot); 1114 1115 nm_prdis("send [%d] %d(%d) bytes at %s:%d", 1116 i, (int)copy_len, (int)dst_len, 1117 NM_IFPNAME(dst_ifp), j); 1118 /* round to a multiple of 64 */ 1119 copy_len = (copy_len + 63) & ~63; 1120 1121 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) || 1122 copy_len > NETMAP_BUF_SIZE(&na->up))) { 1123 nm_prlim(5, "invalid len %d, down to 64", (int)copy_len); 1124 copy_len = dst_len = 64; // XXX 1125 } 1126 if (ft_p->ft_flags & NS_INDIRECT) { 1127 if (copyin(src, dst, copy_len)) { 1128 // invalid user pointer, pretend len is 0 1129 dst_len = 0; 1130 } 1131 } else { 1132 //memcpy(dst, src, copy_len); 1133 pkt_copy(src, dst, (int)copy_len); 1134 } 1135 slot->len = dst_len; 1136 slot->flags = (cnt << 8)| NS_MOREFRAG; 1137 j = nm_next(j, lim); 1138 needed--; 1139 ft_p++; 1140 } while (ft_p != ft_end); 1141 slot->flags = (cnt << 8); /* clear flag on last entry */ 1142 } 1143 /* are we done ? */ 1144 if (next == NM_FT_NULL && brd_next == NM_FT_NULL) 1145 break; 1146 } 1147 { 1148 /* current position */ 1149 uint32_t *p = kring->nkr_leases; /* shorthand */ 1150 uint32_t update_pos; 1151 int still_locked = 1; 1152 1153 mtx_lock(&kring->q_lock); 1154 if (unlikely(howmany > 0)) { 1155 /* not used all bufs. If i am the last one 1156 * i can recover the slots, otherwise must 1157 * fill them with 0 to mark empty packets. 1158 */ 1159 nm_prdis("leftover %d bufs", howmany); 1160 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { 1161 /* yes i am the last one */ 1162 nm_prdis("roll back nkr_hwlease to %d", j); 1163 kring->nkr_hwlease = j; 1164 } else { 1165 while (howmany-- > 0) { 1166 ring->slot[j].len = 0; 1167 ring->slot[j].flags = 0; 1168 j = nm_next(j, lim); 1169 } 1170 } 1171 } 1172 p[lease_idx] = j; /* report I am done */ 1173 1174 update_pos = kring->nr_hwtail; 1175 1176 if (my_start == update_pos) { 1177 /* all slots before my_start have been reported, 1178 * so scan subsequent leases to see if other ranges 1179 * have been completed, and to a selwakeup or txsync. 1180 */ 1181 while (lease_idx != kring->nkr_lease_idx && 1182 p[lease_idx] != NR_NOSLOT) { 1183 j = p[lease_idx]; 1184 p[lease_idx] = NR_NOSLOT; 1185 lease_idx = nm_next(lease_idx, lim); 1186 } 1187 /* j is the new 'write' position. j != my_start 1188 * means there are new buffers to report 1189 */ 1190 if (likely(j != my_start)) { 1191 kring->nr_hwtail = j; 1192 still_locked = 0; 1193 mtx_unlock(&kring->q_lock); 1194 kring->nm_notify(kring, 0); 1195 /* this is netmap_notify for VALE ports and 1196 * netmap_bwrap_notify for bwrap. The latter will 1197 * trigger a txsync on the underlying hwna 1198 */ 1199 if (dst_na->retry && retry--) { 1200 /* XXX this is going to call nm_notify again. 1201 * Only useful for bwrap in virtual machines 1202 */ 1203 goto retry; 1204 } 1205 } 1206 } 1207 if (still_locked) 1208 mtx_unlock(&kring->q_lock); 1209 } 1210 cleanup: 1211 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ 1212 d->bq_len = 0; 1213 } 1214 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ 1215 brddst->bq_len = 0; 1216 return 0; 1217 } 1218 1219 /* nm_txsync callback for VALE ports */ 1220 static int 1221 netmap_vale_vp_txsync(struct netmap_kring *kring, int flags) 1222 { 1223 struct netmap_vp_adapter *na = 1224 (struct netmap_vp_adapter *)kring->na; 1225 u_int done; 1226 u_int const lim = kring->nkr_num_slots - 1; 1227 u_int const head = kring->rhead; 1228 1229 if (bridge_batch <= 0) { /* testing only */ 1230 done = head; // used all 1231 goto done; 1232 } 1233 if (!na->na_bdg) { 1234 done = head; 1235 goto done; 1236 } 1237 if (bridge_batch > NM_BDG_BATCH) 1238 bridge_batch = NM_BDG_BATCH; 1239 1240 done = nm_vale_preflush(kring, head); 1241 done: 1242 if (done != head) 1243 nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail); 1244 /* 1245 * packets between 'done' and 'cur' are left unsent. 1246 */ 1247 kring->nr_hwcur = done; 1248 kring->nr_hwtail = nm_prev(done, lim); 1249 if (netmap_debug & NM_DEBUG_TXSYNC) 1250 nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags); 1251 return 0; 1252 } 1253 1254 1255 /* create a netmap_vp_adapter that describes a VALE port. 1256 * Only persistent VALE ports have a non-null ifp. 1257 */ 1258 static int 1259 netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *ifp, 1260 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret) 1261 { 1262 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1263 struct netmap_vp_adapter *vpna; 1264 struct netmap_adapter *na; 1265 int error = 0; 1266 u_int npipes = 0; 1267 u_int extrabufs = 0; 1268 1269 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1270 return EINVAL; 1271 } 1272 1273 vpna = nm_os_malloc(sizeof(*vpna)); 1274 if (vpna == NULL) 1275 return ENOMEM; 1276 1277 na = &vpna->up; 1278 1279 na->ifp = ifp; 1280 strlcpy(na->name, hdr->nr_name, sizeof(na->name)); 1281 1282 /* bound checking */ 1283 na->num_tx_rings = req->nr_tx_rings; 1284 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1285 req->nr_tx_rings = na->num_tx_rings; /* write back */ 1286 na->num_rx_rings = req->nr_rx_rings; 1287 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); 1288 req->nr_rx_rings = na->num_rx_rings; /* write back */ 1289 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE, 1290 1, NM_BDG_MAXSLOTS, NULL); 1291 na->num_tx_desc = req->nr_tx_slots; 1292 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1293 1, NM_BDG_MAXSLOTS, NULL); 1294 /* validate number of pipes. We want at least 1, 1295 * but probably can do with some more. 1296 * So let's use 2 as default (when 0 is supplied) 1297 */ 1298 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); 1299 /* validate extra bufs */ 1300 extrabufs = req->nr_extra_bufs; 1301 nm_bound_var(&extrabufs, 0, 0, 1302 128*NM_BDG_MAXSLOTS, NULL); 1303 req->nr_extra_bufs = extrabufs; /* write back */ 1304 na->num_rx_desc = req->nr_rx_slots; 1305 /* Set the mfs to a default value, as it is needed on the VALE 1306 * mismatch datapath. XXX We should set it according to the MTU 1307 * known to the kernel. */ 1308 vpna->mfs = NM_BDG_MFS_DEFAULT; 1309 vpna->last_smac = ~0llu; 1310 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 1311 vpna->mfs = netmap_buf_size; */ 1312 if (netmap_verbose) 1313 nm_prinf("max frame size %u", vpna->mfs); 1314 1315 na->na_flags |= NAF_BDG_MAYSLEEP; 1316 /* persistent VALE ports look like hw devices 1317 * with a native netmap adapter 1318 */ 1319 if (ifp) 1320 na->na_flags |= NAF_NATIVE; 1321 na->nm_txsync = netmap_vale_vp_txsync; 1322 na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */ 1323 na->nm_register = netmap_vp_reg; /* use the one provided by bdg */ 1324 na->nm_krings_create = netmap_vale_vp_krings_create; 1325 na->nm_krings_delete = netmap_vale_vp_krings_delete; 1326 na->nm_dtor = netmap_vale_vp_dtor; 1327 nm_prdis("nr_mem_id %d", req->nr_mem_id); 1328 na->nm_mem = nmd ? 1329 netmap_mem_get(nmd): 1330 netmap_mem_private_new( 1331 na->num_tx_rings, na->num_tx_desc, 1332 na->num_rx_rings, na->num_rx_desc, 1333 req->nr_extra_bufs, npipes, &error); 1334 if (na->nm_mem == NULL) 1335 goto err; 1336 na->nm_bdg_attach = netmap_vale_vp_bdg_attach; 1337 /* other nmd fields are set in the common routine */ 1338 error = netmap_attach_common(na); 1339 if (error) 1340 goto err; 1341 *ret = vpna; 1342 return 0; 1343 1344 err: 1345 if (na->nm_mem != NULL) 1346 netmap_mem_put(na->nm_mem); 1347 nm_os_free(vpna); 1348 return error; 1349 } 1350 1351 /* nm_bdg_attach callback for VALE ports 1352 * The na_vp port is this same netmap_adapter. There is no host port. 1353 */ 1354 static int 1355 netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na, 1356 struct nm_bridge *b) 1357 { 1358 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na; 1359 1360 if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) { 1361 return NM_NEED_BWRAP; 1362 } 1363 na->na_vp = vpna; 1364 strlcpy(na->name, name, sizeof(na->name)); 1365 na->na_hostvp = NULL; 1366 return 0; 1367 } 1368 1369 static int 1370 netmap_vale_bwrap_krings_create(struct netmap_adapter *na) 1371 { 1372 int error; 1373 1374 /* impersonate a netmap_vp_adapter */ 1375 error = netmap_vale_vp_krings_create(na); 1376 if (error) 1377 return error; 1378 error = netmap_bwrap_krings_create_common(na); 1379 if (error) { 1380 netmap_vale_vp_krings_delete(na); 1381 } 1382 return error; 1383 } 1384 1385 static void 1386 netmap_vale_bwrap_krings_delete(struct netmap_adapter *na) 1387 { 1388 netmap_bwrap_krings_delete_common(na); 1389 netmap_vale_vp_krings_delete(na); 1390 } 1391 1392 static int 1393 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna) 1394 { 1395 struct netmap_bwrap_adapter *bna; 1396 struct netmap_adapter *na = NULL; 1397 struct netmap_adapter *hostna = NULL; 1398 int error; 1399 1400 bna = nm_os_malloc(sizeof(*bna)); 1401 if (bna == NULL) { 1402 return ENOMEM; 1403 } 1404 na = &bna->up.up; 1405 strlcpy(na->name, nr_name, sizeof(na->name)); 1406 na->nm_register = netmap_bwrap_reg; 1407 na->nm_txsync = netmap_vale_vp_txsync; 1408 // na->nm_rxsync = netmap_bwrap_rxsync; 1409 na->nm_krings_create = netmap_vale_bwrap_krings_create; 1410 na->nm_krings_delete = netmap_vale_bwrap_krings_delete; 1411 na->nm_notify = netmap_bwrap_notify; 1412 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ 1413 /* Set the mfs, needed on the VALE mismatch datapath. */ 1414 bna->up.mfs = NM_BDG_MFS_DEFAULT; 1415 1416 if (hwna->na_flags & NAF_HOST_RINGS) { 1417 hostna = &bna->host.up; 1418 hostna->nm_notify = netmap_bwrap_notify; 1419 bna->host.mfs = NM_BDG_MFS_DEFAULT; 1420 } 1421 1422 error = netmap_bwrap_attach_common(na, hwna); 1423 if (error) { 1424 nm_os_free(bna); 1425 } 1426 return error; 1427 } 1428 1429 int 1430 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na, 1431 struct netmap_mem_d *nmd, int create) 1432 { 1433 return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops); 1434 } 1435 1436 1437 /* creates a persistent VALE port */ 1438 int 1439 nm_vi_create(struct nmreq_header *hdr) 1440 { 1441 struct nmreq_vale_newif *req = 1442 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body; 1443 int error = 0; 1444 /* Build a nmreq_register out of the nmreq_vale_newif, 1445 * so that we can call netmap_get_bdg_na(). */ 1446 struct nmreq_register regreq; 1447 bzero(®req, sizeof(regreq)); 1448 regreq.nr_tx_slots = req->nr_tx_slots; 1449 regreq.nr_rx_slots = req->nr_rx_slots; 1450 regreq.nr_tx_rings = req->nr_tx_rings; 1451 regreq.nr_rx_rings = req->nr_rx_rings; 1452 regreq.nr_mem_id = req->nr_mem_id; 1453 hdr->nr_reqtype = NETMAP_REQ_REGISTER; 1454 hdr->nr_body = (uintptr_t)®req; 1455 error = netmap_vi_create(hdr, 0 /* no autodelete */); 1456 hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF; 1457 hdr->nr_body = (uintptr_t)req; 1458 /* Write back to the original struct. */ 1459 req->nr_tx_slots = regreq.nr_tx_slots; 1460 req->nr_rx_slots = regreq.nr_rx_slots; 1461 req->nr_tx_rings = regreq.nr_tx_rings; 1462 req->nr_rx_rings = regreq.nr_rx_rings; 1463 req->nr_mem_id = regreq.nr_mem_id; 1464 return error; 1465 } 1466 1467 /* remove a persistent VALE port from the system */ 1468 int 1469 nm_vi_destroy(const char *name) 1470 { 1471 struct ifnet *ifp; 1472 struct netmap_vp_adapter *vpna; 1473 int error; 1474 1475 ifp = ifunit_ref(name); 1476 if (!ifp) 1477 return ENXIO; 1478 NMG_LOCK(); 1479 /* make sure this is actually a VALE port */ 1480 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) { 1481 error = EINVAL; 1482 goto err; 1483 } 1484 1485 vpna = (struct netmap_vp_adapter *)NA(ifp); 1486 1487 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */ 1488 if (vpna->autodelete) { 1489 error = EINVAL; 1490 goto err; 1491 } 1492 1493 /* also make sure that nobody is using the inferface */ 1494 if (NETMAP_OWNED_BY_ANY(&vpna->up) || 1495 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) { 1496 error = EBUSY; 1497 goto err; 1498 } 1499 1500 NMG_UNLOCK(); 1501 1502 if (netmap_verbose) 1503 nm_prinf("destroying a persistent vale interface %s", ifp->if_xname); 1504 /* Linux requires all the references are released 1505 * before unregister 1506 */ 1507 netmap_detach(ifp); 1508 if_rele(ifp); 1509 nm_os_vi_detach(ifp); 1510 return 0; 1511 1512 err: 1513 NMG_UNLOCK(); 1514 if_rele(ifp); 1515 return error; 1516 } 1517 1518 static int 1519 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na) 1520 { 1521 req->nr_rx_rings = na->num_rx_rings; 1522 req->nr_tx_rings = na->num_tx_rings; 1523 req->nr_rx_slots = na->num_rx_desc; 1524 req->nr_tx_slots = na->num_tx_desc; 1525 return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL, 1526 &req->nr_mem_id); 1527 } 1528 1529 1530 /* 1531 * Create a virtual interface registered to the system. 1532 * The interface will be attached to a bridge later. 1533 */ 1534 int 1535 netmap_vi_create(struct nmreq_header *hdr, int autodelete) 1536 { 1537 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body; 1538 struct ifnet *ifp; 1539 struct netmap_vp_adapter *vpna; 1540 struct netmap_mem_d *nmd = NULL; 1541 int error; 1542 1543 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) { 1544 return EINVAL; 1545 } 1546 1547 /* don't include VALE prefix */ 1548 if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME))) 1549 return EINVAL; 1550 if (strlen(hdr->nr_name) >= IFNAMSIZ) { 1551 return EINVAL; 1552 } 1553 ifp = ifunit_ref(hdr->nr_name); 1554 if (ifp) { /* already exist, cannot create new one */ 1555 error = EEXIST; 1556 NMG_LOCK(); 1557 if (NM_NA_VALID(ifp)) { 1558 int update_err = nm_update_info(req, NA(ifp)); 1559 if (update_err) 1560 error = update_err; 1561 } 1562 NMG_UNLOCK(); 1563 if_rele(ifp); 1564 return error; 1565 } 1566 error = nm_os_vi_persist(hdr->nr_name, &ifp); 1567 if (error) 1568 return error; 1569 1570 NMG_LOCK(); 1571 if (req->nr_mem_id) { 1572 nmd = netmap_mem_find(req->nr_mem_id); 1573 if (nmd == NULL) { 1574 error = EINVAL; 1575 goto err_1; 1576 } 1577 } 1578 /* netmap_vp_create creates a struct netmap_vp_adapter */ 1579 error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna); 1580 if (error) { 1581 if (netmap_debug & NM_DEBUG_VALE) 1582 nm_prerr("error %d", error); 1583 goto err_1; 1584 } 1585 /* persist-specific routines */ 1586 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl; 1587 if (!autodelete) { 1588 netmap_adapter_get(&vpna->up); 1589 } else { 1590 vpna->autodelete = 1; 1591 } 1592 NM_ATTACH_NA(ifp, &vpna->up); 1593 /* return the updated info */ 1594 error = nm_update_info(req, &vpna->up); 1595 if (error) { 1596 goto err_2; 1597 } 1598 nm_prdis("returning nr_mem_id %d", req->nr_mem_id); 1599 if (nmd) 1600 netmap_mem_put(nmd); 1601 NMG_UNLOCK(); 1602 nm_prdis("created %s", ifp->if_xname); 1603 return 0; 1604 1605 err_2: 1606 netmap_detach(ifp); 1607 err_1: 1608 if (nmd) 1609 netmap_mem_put(nmd); 1610 NMG_UNLOCK(); 1611 nm_os_vi_detach(ifp); 1612 1613 return error; 1614 } 1615 1616 #endif /* WITH_VALE */ 1617