1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 2014, 2017 by Delphix. All rights reserved. 29 * Copyright 2020 RackTop Systems, Inc. 30 */ 31 32 /* 33 * 34 * Copyright (c) 2004 Christian Limpach. 35 * All rights reserved. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. This section intentionally left blank. 46 * 4. The name of the author may not be used to endorse or promote products 47 * derived from this software without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 50 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 51 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 52 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 53 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 54 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 55 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 56 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 57 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 58 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 59 */ 60 /* 61 * Section 3 of the above license was updated in response to bug 6379571. 62 */ 63 64 /* 65 * xnf.c - GLDv3 network driver for domU. 66 */ 67 68 /* 69 * This driver uses four per-instance locks: 70 * 71 * xnf_gref_lock: 72 * 73 * Protects access to the grant reference list stored in 74 * xnf_gref_head. Grant references should be acquired and released 75 * using gref_get() and gref_put() respectively. 76 * 77 * xnf_schedlock: 78 * 79 * Protects: 80 * xnf_need_sched - used to record that a previous transmit attempt 81 * failed (and consequently it will be necessary to call 82 * mac_tx_update() when transmit resources are available). 83 * xnf_pending_multicast - the number of multicast requests that 84 * have been submitted to the backend for which we have not 85 * processed responses. 
86 * 87 * xnf_txlock: 88 * 89 * Protects the transmit ring (xnf_tx_ring) and associated 90 * structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head). 91 * 92 * xnf_rxlock: 93 * 94 * Protects the receive ring (xnf_rx_ring) and associated 95 * structures (notably xnf_rx_pkt_info). 96 * 97 * If driver-global state that affects both the transmit and receive 98 * rings is manipulated, both xnf_txlock and xnf_rxlock should be 99 * held, in that order. 100 * 101 * xnf_schedlock is acquired both whilst holding xnf_txlock and 102 * without. It should always be acquired after xnf_txlock if both are 103 * held. 104 * 105 * Notes: 106 * - atomic_add_64() is used to manipulate counters where we require 107 * accuracy. For counters intended only for observation by humans, 108 * post increment/decrement are used instead. 109 */ 110 111 #include <sys/types.h> 112 #include <sys/errno.h> 113 #include <sys/param.h> 114 #include <sys/sysmacros.h> 115 #include <sys/systm.h> 116 #include <sys/stream.h> 117 #include <sys/strsubr.h> 118 #include <sys/strsun.h> 119 #include <sys/conf.h> 120 #include <sys/ddi.h> 121 #include <sys/devops.h> 122 #include <sys/sunddi.h> 123 #include <sys/sunndi.h> 124 #include <sys/dlpi.h> 125 #include <sys/ethernet.h> 126 #include <sys/strsun.h> 127 #include <sys/pattr.h> 128 #include <inet/ip.h> 129 #include <inet/ip_impl.h> 130 #include <inet/tcp.h> 131 #include <netinet/udp.h> 132 #include <sys/gld.h> 133 #include <sys/modctl.h> 134 #include <sys/mac_provider.h> 135 #include <sys/mac_ether.h> 136 #include <sys/bootinfo.h> 137 #include <sys/mach_mmu.h> 138 #ifdef XPV_HVM_DRIVER 139 #include <sys/xpv_support.h> 140 #include <sys/hypervisor.h> 141 #else 142 #include <sys/hypervisor.h> 143 #include <sys/evtchn_impl.h> 144 #include <sys/balloon_impl.h> 145 #endif 146 #include <xen/public/io/netif.h> 147 #include <sys/gnttab.h> 148 #include <xen/sys/xendev.h> 149 #include <sys/sdt.h> 150 #include <sys/note.h> 151 #include <sys/debug.h> 152 153 #include <io/xnf.h> 154 155 #if defined(DEBUG) || defined(__lint) 156 #define XNF_DEBUG 157 #endif 158 159 #ifdef XNF_DEBUG 160 int xnf_debug = 0; 161 xnf_t *xnf_debug_instance = NULL; 162 #endif 163 164 /* 165 * On a 32 bit PAE system physical and machine addresses are larger 166 * than 32 bits. ddi_btop() on such systems takes an unsigned long 167 * argument, and so addresses above 4G are truncated before ddi_btop() 168 * gets to see them. To avoid this, code the shift operation here. 169 */ 170 #define xnf_btop(addr) ((addr) >> PAGESHIFT) 171 172 /* 173 * The parameters below should only be changed in /etc/system, never in mdb. 174 */ 175 176 /* 177 * Should we use the multicast control feature if the backend provides 178 * it? 179 */ 180 boolean_t xnf_multicast_control = B_TRUE; 181 182 /* 183 * Should we allow scatter-gather for tx if backend allows it? 184 */ 185 boolean_t xnf_enable_tx_sg = B_TRUE; 186 187 /* 188 * Should we allow scatter-gather for rx if backend allows it? 189 */ 190 boolean_t xnf_enable_rx_sg = B_TRUE; 191 192 /* 193 * Should we allow lso for tx sends if backend allows it? 194 * Requires xnf_enable_tx_sg to be also set to TRUE. 195 */ 196 boolean_t xnf_enable_lso = B_TRUE; 197 198 /* 199 * Should we allow lro on rx if backend supports it? 200 * Requires xnf_enable_rx_sg to be also set to TRUE. 201 * 202 * !! WARNING !! 203 * LRO is not yet supported in the OS so this should be left as FALSE. 204 * !! WARNING !!
205 */ 206 boolean_t xnf_enable_lro = B_FALSE; 207 208 /* 209 * Received packets below this size are copied to a new streams buffer 210 * rather than being desballoc'ed. 211 * 212 * This value is chosen to accommodate traffic where there are a large 213 * number of small packets. For data showing a typical distribution, 214 * see: 215 * 216 * Sinha07a: 217 * Rishi Sinha, Christos Papadopoulos, and John 218 * Heidemann. Internet Packet Size Distributions: Some 219 * Observations. Technical Report ISI-TR-2007-643, 220 * USC/Information Sciences Institute, May, 2007. Originally 221 * released October 2005 as web page 222 * http://netweb.usc.edu/~sinha/pkt-sizes/. 223 * <http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>. 224 */ 225 size_t xnf_rx_copy_limit = 64; 226 227 #define INVALID_GRANT_HANDLE ((grant_handle_t)-1) 228 #define INVALID_GRANT_REF ((grant_ref_t)-1) 229 #define INVALID_TX_ID ((uint16_t)-1) 230 231 #define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)])) 232 #define TX_ID_VALID(i) \ 233 (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE)) 234 235 /* 236 * calculate how many pages are spanned by an mblk fragment 237 */ 238 #define xnf_mblk_pages(mp) (MBLKL(mp) == 0 ? 0 : \ 239 xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1) 240 241 /* Required system entry points */ 242 static int xnf_attach(dev_info_t *, ddi_attach_cmd_t); 243 static int xnf_detach(dev_info_t *, ddi_detach_cmd_t); 244 245 /* Required driver entry points for Nemo */ 246 static int xnf_start(void *); 247 static void xnf_stop(void *); 248 static int xnf_set_mac_addr(void *, const uint8_t *); 249 static int xnf_set_multicast(void *, boolean_t, const uint8_t *); 250 static int xnf_set_promiscuous(void *, boolean_t); 251 static mblk_t *xnf_send(void *, mblk_t *); 252 static uint_t xnf_intr(caddr_t); 253 static int xnf_stat(void *, uint_t, uint64_t *); 254 static boolean_t xnf_getcapab(void *, mac_capab_t, void *); 255 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 256 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t, 257 const void *); 258 static void xnf_propinfo(void *, const char *, mac_prop_id_t, 259 mac_prop_info_handle_t); 260 261 /* Driver private functions */ 262 static int xnf_alloc_dma_resources(xnf_t *); 263 static void xnf_release_dma_resources(xnf_t *); 264 static void xnf_release_mblks(xnf_t *); 265 266 static int xnf_buf_constructor(void *, void *, int); 267 static void xnf_buf_destructor(void *, void *); 268 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t); 269 #pragma inline(xnf_buf_get) 270 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t); 271 #pragma inline(xnf_buf_put) 272 static void xnf_buf_refresh(xnf_buf_t *); 273 #pragma inline(xnf_buf_refresh) 274 static void xnf_buf_recycle(xnf_buf_t *); 275 276 static int xnf_tx_buf_constructor(void *, void *, int); 277 static void xnf_tx_buf_destructor(void *, void *); 278 279 static grant_ref_t xnf_gref_get(xnf_t *); 280 #pragma inline(xnf_gref_get) 281 static void xnf_gref_put(xnf_t *, grant_ref_t); 282 #pragma inline(xnf_gref_put) 283 284 static xnf_txid_t *xnf_txid_get(xnf_t *); 285 #pragma inline(xnf_txid_get) 286 static void xnf_txid_put(xnf_t *, xnf_txid_t *); 287 #pragma inline(xnf_txid_put) 288 289 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *); 290 static int xnf_tx_clean_ring(xnf_t *); 291 static void oe_state_change(dev_info_t *, ddi_eventcookie_t, 292 void *, void *); 293 static boolean_t xnf_kstat_init(xnf_t *); 294 static void xnf_rx_collect(xnf_t *);
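/*
 * Illustrative only: the tunables declared earlier in this file
 * (xnf_multicast_control, xnf_enable_tx_sg, xnf_enable_rx_sg,
 * xnf_enable_lso, xnf_enable_lro and xnf_rx_copy_limit) are meant to be
 * set from /etc/system using the usual "set <module>:<variable> = <value>"
 * syntax with the module name "xnf", taking effect at the next reboot.
 * The values below are hypothetical examples, not recommendations:
 *
 *	set xnf:xnf_enable_lso = 0
 *	set xnf:xnf_rx_copy_limit = 128
 */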
295 296 #define XNF_CALLBACK_FLAGS (MC_GETCAPAB | MC_PROPERTIES) 297 298 static mac_callbacks_t xnf_callbacks = { 299 .mc_callbacks = XNF_CALLBACK_FLAGS, 300 .mc_getstat = xnf_stat, 301 .mc_start = xnf_start, 302 .mc_stop = xnf_stop, 303 .mc_setpromisc = xnf_set_promiscuous, 304 .mc_multicst = xnf_set_multicast, 305 .mc_unicst = xnf_set_mac_addr, 306 .mc_tx = xnf_send, 307 .mc_getcapab = xnf_getcapab, 308 .mc_setprop = xnf_setprop, 309 .mc_getprop = xnf_getprop, 310 .mc_propinfo = xnf_propinfo, 311 }; 312 313 /* DMA attributes for network ring buffer */ 314 static ddi_dma_attr_t ringbuf_dma_attr = { 315 .dma_attr_version = DMA_ATTR_V0, 316 .dma_attr_addr_lo = 0, 317 .dma_attr_addr_hi = 0xffffffffffffffffULL, 318 .dma_attr_count_max = 0x7fffffff, 319 .dma_attr_align = MMU_PAGESIZE, 320 .dma_attr_burstsizes = 0x7ff, 321 .dma_attr_minxfer = 1, 322 .dma_attr_maxxfer = 0xffffffffU, 323 .dma_attr_seg = 0xffffffffffffffffULL, 324 .dma_attr_sgllen = 1, 325 .dma_attr_granular = 1, 326 .dma_attr_flags = 0 327 }; 328 329 /* DMA attributes for receive data */ 330 static ddi_dma_attr_t rx_buf_dma_attr = { 331 .dma_attr_version = DMA_ATTR_V0, 332 .dma_attr_addr_lo = 0, 333 .dma_attr_addr_hi = 0xffffffffffffffffULL, 334 .dma_attr_count_max = MMU_PAGEOFFSET, 335 .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */ 336 .dma_attr_burstsizes = 0x7ff, 337 .dma_attr_minxfer = 1, 338 .dma_attr_maxxfer = 0xffffffffU, 339 .dma_attr_seg = 0xffffffffffffffffULL, 340 .dma_attr_sgllen = 1, 341 .dma_attr_granular = 1, 342 .dma_attr_flags = 0 343 }; 344 345 /* DMA attributes for transmit data */ 346 static ddi_dma_attr_t tx_buf_dma_attr = { 347 .dma_attr_version = DMA_ATTR_V0, 348 .dma_attr_addr_lo = 0, 349 .dma_attr_addr_hi = 0xffffffffffffffffULL, 350 .dma_attr_count_max = MMU_PAGEOFFSET, 351 .dma_attr_align = 1, 352 .dma_attr_burstsizes = 0x7ff, 353 .dma_attr_minxfer = 1, 354 .dma_attr_maxxfer = 0xffffffffU, 355 .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */ 356 .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */ 357 .dma_attr_granular = 1, 358 .dma_attr_flags = 0 359 }; 360 361 /* DMA access attributes for registers and descriptors */ 362 static ddi_device_acc_attr_t accattr = { 363 DDI_DEVICE_ATTR_V0, 364 DDI_STRUCTURE_LE_ACC, /* This is a little-endian device */ 365 DDI_STRICTORDER_ACC 366 }; 367 368 /* DMA access attributes for data: NOT to be byte swapped. */ 369 static ddi_device_acc_attr_t data_accattr = { 370 DDI_DEVICE_ATTR_V0, 371 DDI_NEVERSWAP_ACC, 372 DDI_STRICTORDER_ACC 373 }; 374 375 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach, 376 nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported); 377 378 static struct modldrv xnf_modldrv = { 379 &mod_driverops, 380 "Virtual Ethernet driver", 381 &xnf_dev_ops 382 }; 383 384 static struct modlinkage modlinkage = { 385 MODREV_1, &xnf_modldrv, NULL 386 }; 387 388 int 389 _init(void) 390 { 391 int r; 392 393 mac_init_ops(&xnf_dev_ops, "xnf"); 394 r = mod_install(&modlinkage); 395 if (r != DDI_SUCCESS) 396 mac_fini_ops(&xnf_dev_ops); 397 398 return (r); 399 } 400 401 int 402 _fini(void) 403 { 404 return (EBUSY); /* XXPV should be removable */ 405 } 406 407 int 408 _info(struct modinfo *modinfop) 409 { 410 return (mod_info(&modlinkage, modinfop)); 411 } 412 413 /* 414 * Acquire a grant reference. 
415 */ 416 static grant_ref_t 417 xnf_gref_get(xnf_t *xnfp) 418 { 419 grant_ref_t gref; 420 421 mutex_enter(&xnfp->xnf_gref_lock); 422 423 do { 424 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head); 425 426 } while ((gref == INVALID_GRANT_REF) && 427 (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0)); 428 429 mutex_exit(&xnfp->xnf_gref_lock); 430 431 if (gref == INVALID_GRANT_REF) { 432 xnfp->xnf_stat_gref_failure++; 433 } else { 434 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding); 435 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak) 436 xnfp->xnf_stat_gref_peak = 437 xnfp->xnf_stat_gref_outstanding; 438 } 439 440 return (gref); 441 } 442 443 /* 444 * Release a grant reference. 445 */ 446 static void 447 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref) 448 { 449 ASSERT(gref != INVALID_GRANT_REF); 450 451 mutex_enter(&xnfp->xnf_gref_lock); 452 gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref); 453 mutex_exit(&xnfp->xnf_gref_lock); 454 455 atomic_dec_64(&xnfp->xnf_stat_gref_outstanding); 456 } 457 458 /* 459 * Acquire a transmit id. 460 */ 461 static xnf_txid_t * 462 xnf_txid_get(xnf_t *xnfp) 463 { 464 xnf_txid_t *tidp; 465 466 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 467 468 if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID) 469 return (NULL); 470 471 ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head)); 472 473 tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head); 474 xnfp->xnf_tx_pkt_id_head = tidp->next; 475 tidp->next = INVALID_TX_ID; 476 477 ASSERT(tidp->txbuf == NULL); 478 479 return (tidp); 480 } 481 482 /* 483 * Release a transmit id. 484 */ 485 static void 486 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp) 487 { 488 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 489 ASSERT(TX_ID_VALID(tidp->id)); 490 ASSERT(tidp->next == INVALID_TX_ID); 491 492 tidp->txbuf = NULL; 493 tidp->next = xnfp->xnf_tx_pkt_id_head; 494 xnfp->xnf_tx_pkt_id_head = tidp->id; 495 } 496 497 static void 498 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp) 499 { 500 ASSERT3U(txp->tx_type, ==, TX_DATA); 501 502 /* 503 * We are either using a lookaside buffer or we are mapping existing 504 * buffers. 505 */ 506 if (txp->tx_bdesc != NULL) { 507 ASSERT(!txp->tx_handle_bound); 508 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE); 509 } else { 510 if (txp->tx_txreq.gref != INVALID_GRANT_REF) { 511 if (gnttab_query_foreign_access(txp->tx_txreq.gref) != 512 0) { 513 cmn_err(CE_PANIC, "tx grant %d still in use by " 514 "backend domain", txp->tx_txreq.gref); 515 } 516 (void) gnttab_end_foreign_access_ref( 517 txp->tx_txreq.gref, 1); 518 xnf_gref_put(xnfp, txp->tx_txreq.gref); 519 } 520 521 if (txp->tx_handle_bound) 522 (void) ddi_dma_unbind_handle(txp->tx_dma_handle); 523 } 524 525 if (txp->tx_mp != NULL) 526 freemsg(txp->tx_mp); 527 528 if (txp->tx_prev != NULL) { 529 ASSERT3P(txp->tx_prev->tx_next, ==, txp); 530 txp->tx_prev->tx_next = NULL; 531 } 532 533 if (txp->tx_txreq.id != INVALID_TX_ID) { 534 /* 535 * This should be only possible when resuming from a suspend. 536 */ 537 ASSERT(!xnfp->xnf_connected); 538 xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id)); 539 txp->tx_txreq.id = INVALID_TX_ID; 540 } 541 542 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 543 } 544 545 static void 546 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp) 547 { 548 if (txp == NULL) 549 return; 550 551 while (txp->tx_next != NULL) 552 txp = txp->tx_next; 553 554 /* 555 * We free the chain in reverse order so that grants can be released 556 * for all dma chunks before unbinding the dma handles. 
The mblk is 557 * freed last, after all its fragments' dma handles are unbound. 558 */ 559 xnf_txbuf_t *prev; 560 for (; txp != NULL; txp = prev) { 561 prev = txp->tx_prev; 562 xnf_data_txbuf_free(xnfp, txp); 563 } 564 } 565 566 static xnf_txbuf_t * 567 xnf_data_txbuf_alloc(xnf_t *xnfp) 568 { 569 xnf_txbuf_t *txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); 570 txp->tx_type = TX_DATA; 571 txp->tx_next = NULL; 572 txp->tx_prev = NULL; 573 txp->tx_head = txp; 574 txp->tx_frags_to_ack = 0; 575 txp->tx_mp = NULL; 576 txp->tx_bdesc = NULL; 577 txp->tx_handle_bound = B_FALSE; 578 txp->tx_txreq.gref = INVALID_GRANT_REF; 579 txp->tx_txreq.id = INVALID_TX_ID; 580 581 return (txp); 582 } 583 584 /* 585 * Get `wanted' slots in the transmit ring, waiting for at least that 586 * number if `wait' is B_TRUE. Force the ring to be cleaned by setting 587 * `wanted' to zero. 588 * 589 * Return the number of slots available. 590 */ 591 static int 592 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait) 593 { 594 int slotsfree; 595 boolean_t forced_clean = (wanted == 0); 596 597 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 598 599 /* LINTED: constant in conditional context */ 600 while (B_TRUE) { 601 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring); 602 603 if ((slotsfree < wanted) || forced_clean) 604 slotsfree = xnf_tx_clean_ring(xnfp); 605 606 /* 607 * If there are more than we need free, tell other 608 * people to come looking again. We hold txlock, so we 609 * are able to take our slots before anyone else runs. 610 */ 611 if (slotsfree > wanted) 612 cv_broadcast(&xnfp->xnf_cv_tx_slots); 613 614 if (slotsfree >= wanted) 615 break; 616 617 if (!wait) 618 break; 619 620 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock); 621 } 622 623 ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring))); 624 625 return (slotsfree); 626 } 627 628 static int 629 xnf_setup_rings(xnf_t *xnfp) 630 { 631 domid_t oeid; 632 struct xenbus_device *xsd; 633 RING_IDX i; 634 int err; 635 xnf_txid_t *tidp; 636 xnf_buf_t **bdescp; 637 638 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 639 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 640 641 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 642 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 643 644 err = gnttab_grant_foreign_access(oeid, 645 xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0); 646 if (err <= 0) { 647 err = -err; 648 xenbus_dev_error(xsd, err, "granting access to tx ring page"); 649 goto out; 650 } 651 xnfp->xnf_tx_ring_ref = (grant_ref_t)err; 652 653 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 654 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 655 656 err = gnttab_grant_foreign_access(oeid, 657 xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0); 658 if (err <= 0) { 659 err = -err; 660 xenbus_dev_error(xsd, err, "granting access to rx ring page"); 661 goto out; 662 } 663 xnfp->xnf_rx_ring_ref = (grant_ref_t)err; 664 665 mutex_enter(&xnfp->xnf_txlock); 666 667 /* 668 * We first clean up the TX ring in case we are doing a resume. 669 * Note that this can lose packets, but we expect to stagger on. 670 */ 671 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */ 672 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 673 i < NET_TX_RING_SIZE; 674 i++, tidp++) { 675 xnf_txbuf_t *txp = tidp->txbuf; 676 if (txp == NULL) 677 continue; 678 679 switch (txp->tx_type) { 680 case TX_DATA: 681 /* 682 * txid_put() will be called for each txbuf's txid in 683 * the chain which will result in clearing tidp->txbuf.
684 */ 685 xnf_data_txbuf_free_chain(xnfp, txp); 686 687 break; 688 689 case TX_MCAST_REQ: 690 txp->tx_type = TX_MCAST_RSP; 691 txp->tx_status = NETIF_RSP_DROPPED; 692 cv_broadcast(&xnfp->xnf_cv_multicast); 693 694 /* 695 * The request consumed two slots in the ring, 696 * yet only a single xnf_txid_t is used. Step 697 * over the empty slot. 698 */ 699 i++; 700 ASSERT3U(i, <, NET_TX_RING_SIZE); 701 break; 702 703 case TX_MCAST_RSP: 704 break; 705 } 706 } 707 708 /* 709 * Now purge old list and add each txid to the new free list. 710 */ 711 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */ 712 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 713 i < NET_TX_RING_SIZE; 714 i++, tidp++) { 715 tidp->id = i; 716 ASSERT3P(tidp->txbuf, ==, NULL); 717 tidp->next = INVALID_TX_ID; /* Appease txid_put(). */ 718 xnf_txid_put(xnfp, tidp); 719 } 720 721 /* LINTED: constant in conditional context */ 722 SHARED_RING_INIT(xnfp->xnf_tx_ring.sring); 723 /* LINTED: constant in conditional context */ 724 FRONT_RING_INIT(&xnfp->xnf_tx_ring, 725 xnfp->xnf_tx_ring.sring, PAGESIZE); 726 727 mutex_exit(&xnfp->xnf_txlock); 728 729 mutex_enter(&xnfp->xnf_rxlock); 730 731 /* 732 * Clean out any buffers currently posted to the receive ring 733 * before we reset it. 734 */ 735 for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0]; 736 i < NET_RX_RING_SIZE; 737 i++, bdescp++) { 738 if (*bdescp != NULL) { 739 xnf_buf_put(xnfp, *bdescp, B_FALSE); 740 *bdescp = NULL; 741 } 742 } 743 744 /* LINTED: constant in conditional context */ 745 SHARED_RING_INIT(xnfp->xnf_rx_ring.sring); 746 /* LINTED: constant in conditional context */ 747 FRONT_RING_INIT(&xnfp->xnf_rx_ring, 748 xnfp->xnf_rx_ring.sring, PAGESIZE); 749 750 /* 751 * Fill the ring with buffers. 752 */ 753 for (i = 0; i < NET_RX_RING_SIZE; i++) { 754 xnf_buf_t *bdesc; 755 756 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE); 757 VERIFY(bdesc != NULL); 758 xnf_rxbuf_hang(xnfp, bdesc); 759 } 760 761 /* LINTED: constant in conditional context */ 762 RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring); 763 764 mutex_exit(&xnfp->xnf_rxlock); 765 766 return (0); 767 768 out: 769 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 770 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 771 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 772 773 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 774 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 775 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 776 777 return (err); 778 } 779 780 /* 781 * Connect driver to back end, called to set up communication with 782 * back end driver both initially and on resume after restore/migrate.
783 */ 784 void 785 xnf_be_connect(xnf_t *xnfp) 786 { 787 const char *message; 788 xenbus_transaction_t xbt; 789 struct xenbus_device *xsd; 790 char *xsname; 791 int err; 792 793 ASSERT(!xnfp->xnf_connected); 794 795 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 796 xsname = xvdi_get_xsname(xnfp->xnf_devinfo); 797 798 err = xnf_setup_rings(xnfp); 799 if (err != 0) { 800 cmn_err(CE_WARN, "failed to set up tx/rx rings"); 801 xenbus_dev_error(xsd, err, "setting up ring"); 802 return; 803 } 804 805 again: 806 err = xenbus_transaction_start(&xbt); 807 if (err != 0) { 808 xenbus_dev_error(xsd, EIO, "starting transaction"); 809 return; 810 } 811 812 err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u", 813 xnfp->xnf_tx_ring_ref); 814 if (err != 0) { 815 message = "writing tx ring-ref"; 816 goto abort_transaction; 817 } 818 819 err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u", 820 xnfp->xnf_rx_ring_ref); 821 if (err != 0) { 822 message = "writing rx ring-ref"; 823 goto abort_transaction; 824 } 825 826 err = xenbus_printf(xbt, xsname, "event-channel", "%u", 827 xnfp->xnf_evtchn); 828 if (err != 0) { 829 message = "writing event-channel"; 830 goto abort_transaction; 831 } 832 833 err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1); 834 if (err != 0) { 835 message = "writing feature-rx-notify"; 836 goto abort_transaction; 837 } 838 839 err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1); 840 if (err != 0) { 841 message = "writing request-rx-copy"; 842 goto abort_transaction; 843 } 844 845 if (xnfp->xnf_be_mcast_control) { 846 err = xenbus_printf(xbt, xsname, "request-multicast-control", 847 "%d", 1); 848 if (err != 0) { 849 message = "writing request-multicast-control"; 850 goto abort_transaction; 851 } 852 } 853 854 /* 855 * Tell backend if we support scatter-gather lists on the rx side. 856 */ 857 err = xenbus_printf(xbt, xsname, "feature-sg", "%d", 858 xnf_enable_rx_sg ? 1 : 0); 859 if (err != 0) { 860 message = "writing feature-sg"; 861 goto abort_transaction; 862 } 863 864 /* 865 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is 866 * a prerequisite. 867 */ 868 err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d", 869 (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0); 870 if (err != 0) { 871 message = "writing feature-gso-tcpv4"; 872 goto abort_transaction; 873 } 874 875 err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected); 876 if (err != 0) { 877 message = "switching state to XenbusStateConnected"; 878 goto abort_transaction; 879 } 880 881 err = xenbus_transaction_end(xbt, 0); 882 if (err != 0) { 883 if (err == EAGAIN) 884 goto again; 885 xenbus_dev_error(xsd, err, "completing transaction"); 886 } 887 888 return; 889 890 abort_transaction: 891 (void) xenbus_transaction_end(xbt, 1); 892 xenbus_dev_error(xsd, err, "%s", message); 893 } 894 895 /* 896 * Read configuration information from xenstore. 897 */ 898 void 899 xnf_read_config(xnf_t *xnfp) 900 { 901 int err, be_cap; 902 char mac[ETHERADDRL * 3]; 903 char *oename = xvdi_get_oename(xnfp->xnf_devinfo); 904 905 err = xenbus_scanf(XBT_NULL, oename, "mac", 906 "%s", (char *)&mac[0]); 907 if (err != 0) { 908 /* 909 * bad: we're supposed to be set up with a proper mac 910 * addr. 
at this point 911 */ 912 cmn_err(CE_WARN, "%s%d: no mac address", 913 ddi_driver_name(xnfp->xnf_devinfo), 914 ddi_get_instance(xnfp->xnf_devinfo)); 915 return; 916 } 917 if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) { 918 err = ENOENT; 919 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT, 920 "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo)); 921 return; 922 } 923 924 err = xenbus_scanf(XBT_NULL, oename, 925 "feature-rx-copy", "%d", &be_cap); 926 /* 927 * If we fail to read the store we assume that the key is 928 * absent, implying an older domain at the far end. Older 929 * domains cannot do HV copy. 930 */ 931 if (err != 0) 932 be_cap = 0; 933 xnfp->xnf_be_rx_copy = (be_cap != 0); 934 935 err = xenbus_scanf(XBT_NULL, oename, 936 "feature-multicast-control", "%d", &be_cap); 937 /* 938 * If we fail to read the store we assume that the key is 939 * absent, implying an older domain at the far end. Older 940 * domains do not support multicast control. 941 */ 942 if (err != 0) 943 be_cap = 0; 944 xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control; 945 946 /* 947 * See if back-end supports scatter-gather for transmits. If not, 948 * we will not support LSO and limit the mtu to 1500. 949 */ 950 err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap); 951 if (err != 0) { 952 be_cap = 0; 953 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 954 "'feature-sg' from backend driver"); 955 } 956 if (be_cap == 0) { 957 dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not " 958 "supported for transmits in the backend driver. LSO is " 959 "disabled and MTU is restricted to 1500 bytes."); 960 } 961 xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg; 962 963 if (xnfp->xnf_be_tx_sg) { 964 /* 965 * Check if LSO is supported. Currently we only check for 966 * IPv4 as Illumos doesn't support LSO for IPv6. 967 */ 968 err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d", 969 &be_cap); 970 if (err != 0) { 971 be_cap = 0; 972 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 973 "'feature-gso-tcpv4' from backend driver"); 974 } 975 if (be_cap == 0) { 976 dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not " 977 "supported by the backend driver. 
Performance " 978 "will be affected."); 979 } 980 xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso; 981 } 982 } 983 984 /* 985 * attach(9E) -- Attach a device to the system 986 */ 987 static int 988 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) 989 { 990 mac_register_t *macp; 991 xnf_t *xnfp; 992 int err; 993 char cachename[32]; 994 995 #ifdef XNF_DEBUG 996 if (xnf_debug & XNF_DEBUG_DDI) 997 printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo), 998 (void *)devinfo); 999 #endif 1000 1001 switch (cmd) { 1002 case DDI_RESUME: 1003 xnfp = ddi_get_driver_private(devinfo); 1004 xnfp->xnf_gen++; 1005 1006 (void) xvdi_resume(devinfo); 1007 (void) xvdi_alloc_evtchn(devinfo); 1008 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 1009 #ifdef XPV_HVM_DRIVER 1010 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, 1011 xnfp); 1012 #else 1013 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, 1014 (caddr_t)xnfp); 1015 #endif 1016 return (DDI_SUCCESS); 1017 1018 case DDI_ATTACH: 1019 break; 1020 1021 default: 1022 return (DDI_FAILURE); 1023 } 1024 1025 /* 1026 * Allocate gld_mac_info_t and xnf_instance structures 1027 */ 1028 macp = mac_alloc(MAC_VERSION); 1029 if (macp == NULL) 1030 return (DDI_FAILURE); 1031 xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP); 1032 1033 xnfp->xnf_tx_pkt_id = 1034 kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP); 1035 1036 xnfp->xnf_rx_pkt_info = 1037 kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP); 1038 1039 macp->m_dip = devinfo; 1040 macp->m_driver = xnfp; 1041 xnfp->xnf_devinfo = devinfo; 1042 1043 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1044 macp->m_src_addr = xnfp->xnf_mac_addr; 1045 macp->m_callbacks = &xnf_callbacks; 1046 macp->m_min_sdu = 0; 1047 xnfp->xnf_mtu = ETHERMTU; 1048 macp->m_max_sdu = xnfp->xnf_mtu; 1049 1050 xnfp->xnf_running = B_FALSE; 1051 xnfp->xnf_connected = B_FALSE; 1052 xnfp->xnf_be_rx_copy = B_FALSE; 1053 xnfp->xnf_be_mcast_control = B_FALSE; 1054 xnfp->xnf_need_sched = B_FALSE; 1055 1056 xnfp->xnf_rx_head = NULL; 1057 xnfp->xnf_rx_tail = NULL; 1058 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 1059 1060 #ifdef XPV_HVM_DRIVER 1061 /* Report our version to dom0 */ 1062 (void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d", 1063 HVMPV_XNF_VERS); 1064 #endif 1065 1066 /* 1067 * Get the iblock cookie with which to initialize the mutexes. 
1068 */ 1069 if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie) 1070 != DDI_SUCCESS) 1071 goto failure; 1072 1073 mutex_init(&xnfp->xnf_txlock, 1074 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1075 mutex_init(&xnfp->xnf_rxlock, 1076 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1077 mutex_init(&xnfp->xnf_schedlock, 1078 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1079 mutex_init(&xnfp->xnf_gref_lock, 1080 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1081 1082 cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL); 1083 cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL); 1084 cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL); 1085 1086 (void) sprintf(cachename, "xnf_buf_cache_%d", 1087 ddi_get_instance(devinfo)); 1088 xnfp->xnf_buf_cache = kmem_cache_create(cachename, 1089 sizeof (xnf_buf_t), 0, 1090 xnf_buf_constructor, xnf_buf_destructor, 1091 NULL, xnfp, NULL, 0); 1092 if (xnfp->xnf_buf_cache == NULL) 1093 goto failure_0; 1094 1095 (void) sprintf(cachename, "xnf_tx_buf_cache_%d", 1096 ddi_get_instance(devinfo)); 1097 xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename, 1098 sizeof (xnf_txbuf_t), 0, 1099 xnf_tx_buf_constructor, xnf_tx_buf_destructor, 1100 NULL, xnfp, NULL, 0); 1101 if (xnfp->xnf_tx_buf_cache == NULL) 1102 goto failure_1; 1103 1104 xnfp->xnf_gref_head = INVALID_GRANT_REF; 1105 1106 if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) { 1107 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize " 1108 "driver data structures", 1109 ddi_get_instance(xnfp->xnf_devinfo)); 1110 goto failure_2; 1111 } 1112 1113 xnfp->xnf_rx_ring.sring->rsp_event = 1114 xnfp->xnf_tx_ring.sring->rsp_event = 1; 1115 1116 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 1117 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 1118 1119 /* set driver private pointer now */ 1120 ddi_set_driver_private(devinfo, xnfp); 1121 1122 if (!xnf_kstat_init(xnfp)) 1123 goto failure_3; 1124 1125 /* 1126 * Allocate an event channel, add the interrupt handler and 1127 * bind it to the event channel. 1128 */ 1129 (void) xvdi_alloc_evtchn(devinfo); 1130 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 1131 #ifdef XPV_HVM_DRIVER 1132 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp); 1133 #else 1134 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp); 1135 #endif 1136 1137 err = mac_register(macp, &xnfp->xnf_mh); 1138 mac_free(macp); 1139 macp = NULL; 1140 if (err != 0) 1141 goto failure_4; 1142 1143 if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL) 1144 != DDI_SUCCESS) 1145 goto failure_5; 1146 1147 #ifdef XPV_HVM_DRIVER 1148 /* 1149 * In the HVM case, this driver essentially replaces a driver for 1150 * a 'real' PCI NIC. Without the "model" property set to 1151 * "Ethernet controller", like the PCI code does, netbooting does 1152 * not work correctly, as strplumb_get_netdev_path() will not find 1153 * this interface. 
1154 */ 1155 (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model", 1156 "Ethernet controller"); 1157 #endif 1158 1159 #ifdef XNF_DEBUG 1160 if (xnf_debug_instance == NULL) 1161 xnf_debug_instance = xnfp; 1162 #endif 1163 1164 return (DDI_SUCCESS); 1165 1166 failure_5: 1167 (void) mac_unregister(xnfp->xnf_mh); 1168 1169 failure_4: 1170 #ifdef XPV_HVM_DRIVER 1171 ec_unbind_evtchn(xnfp->xnf_evtchn); 1172 xvdi_free_evtchn(devinfo); 1173 #else 1174 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1175 #endif 1176 xnfp->xnf_evtchn = INVALID_EVTCHN; 1177 kstat_delete(xnfp->xnf_kstat_aux); 1178 1179 failure_3: 1180 xnf_release_dma_resources(xnfp); 1181 1182 failure_2: 1183 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1184 1185 failure_1: 1186 kmem_cache_destroy(xnfp->xnf_buf_cache); 1187 1188 failure_0: 1189 cv_destroy(&xnfp->xnf_cv_tx_slots); 1190 cv_destroy(&xnfp->xnf_cv_multicast); 1191 cv_destroy(&xnfp->xnf_cv_state); 1192 1193 mutex_destroy(&xnfp->xnf_gref_lock); 1194 mutex_destroy(&xnfp->xnf_schedlock); 1195 mutex_destroy(&xnfp->xnf_rxlock); 1196 mutex_destroy(&xnfp->xnf_txlock); 1197 1198 failure: 1199 kmem_free(xnfp, sizeof (*xnfp)); 1200 if (macp != NULL) 1201 mac_free(macp); 1202 1203 return (DDI_FAILURE); 1204 } 1205 1206 /* detach(9E) -- Detach a device from the system */ 1207 static int 1208 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) 1209 { 1210 xnf_t *xnfp; /* Our private device info */ 1211 1212 #ifdef XNF_DEBUG 1213 if (xnf_debug & XNF_DEBUG_DDI) 1214 printf("xnf_detach(0x%p)\n", (void *)devinfo); 1215 #endif 1216 1217 xnfp = ddi_get_driver_private(devinfo); 1218 1219 switch (cmd) { 1220 case DDI_SUSPEND: 1221 #ifdef XPV_HVM_DRIVER 1222 ec_unbind_evtchn(xnfp->xnf_evtchn); 1223 xvdi_free_evtchn(devinfo); 1224 #else 1225 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1226 #endif 1227 1228 xvdi_suspend(devinfo); 1229 1230 mutex_enter(&xnfp->xnf_rxlock); 1231 mutex_enter(&xnfp->xnf_txlock); 1232 1233 xnfp->xnf_evtchn = INVALID_EVTCHN; 1234 xnfp->xnf_connected = B_FALSE; 1235 mutex_exit(&xnfp->xnf_txlock); 1236 mutex_exit(&xnfp->xnf_rxlock); 1237 1238 /* claim link to be down after disconnect */ 1239 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN); 1240 return (DDI_SUCCESS); 1241 1242 case DDI_DETACH: 1243 break; 1244 1245 default: 1246 return (DDI_FAILURE); 1247 } 1248 1249 if (xnfp->xnf_connected) 1250 return (DDI_FAILURE); 1251 1252 /* 1253 * Cannot detach if we have xnf_buf_t outstanding. 
1254 */ 1255 if (xnfp->xnf_stat_buf_allocated > 0) 1256 return (DDI_FAILURE); 1257 1258 if (mac_unregister(xnfp->xnf_mh) != 0) 1259 return (DDI_FAILURE); 1260 1261 kstat_delete(xnfp->xnf_kstat_aux); 1262 1263 /* Stop the receiver */ 1264 xnf_stop(xnfp); 1265 1266 xvdi_remove_event_handler(devinfo, XS_OE_STATE); 1267 1268 /* Remove the interrupt */ 1269 #ifdef XPV_HVM_DRIVER 1270 ec_unbind_evtchn(xnfp->xnf_evtchn); 1271 xvdi_free_evtchn(devinfo); 1272 #else 1273 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1274 #endif 1275 1276 /* Release any pending xmit mblks */ 1277 xnf_release_mblks(xnfp); 1278 1279 /* Release all DMA resources */ 1280 xnf_release_dma_resources(xnfp); 1281 1282 cv_destroy(&xnfp->xnf_cv_tx_slots); 1283 cv_destroy(&xnfp->xnf_cv_multicast); 1284 cv_destroy(&xnfp->xnf_cv_state); 1285 1286 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1287 kmem_cache_destroy(xnfp->xnf_buf_cache); 1288 1289 mutex_destroy(&xnfp->xnf_gref_lock); 1290 mutex_destroy(&xnfp->xnf_schedlock); 1291 mutex_destroy(&xnfp->xnf_rxlock); 1292 mutex_destroy(&xnfp->xnf_txlock); 1293 1294 kmem_free(xnfp, sizeof (*xnfp)); 1295 1296 return (DDI_SUCCESS); 1297 } 1298 1299 /* 1300 * xnf_set_mac_addr() -- set the physical network address on the board. 1301 */ 1302 static int 1303 xnf_set_mac_addr(void *arg, const uint8_t *macaddr) 1304 { 1305 _NOTE(ARGUNUSED(arg, macaddr)); 1306 1307 /* 1308 * We can't set our macaddr. 1309 */ 1310 return (ENOTSUP); 1311 } 1312 1313 /* 1314 * xnf_set_multicast() -- set (enable) or disable a multicast address. 1315 * 1316 * Program the hardware to enable/disable the multicast address 1317 * in "mca". Enable if "add" is true, disable if false. 1318 */ 1319 static int 1320 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca) 1321 { 1322 xnf_t *xnfp = arg; 1323 xnf_txbuf_t *txp; 1324 int n_slots; 1325 RING_IDX slot; 1326 xnf_txid_t *tidp; 1327 netif_tx_request_t *txrp; 1328 struct netif_extra_info *erp; 1329 boolean_t notify, result; 1330 1331 /* 1332 * If the backend does not support multicast control then we 1333 * must assume that the right packets will just arrive. 1334 */ 1335 if (!xnfp->xnf_be_mcast_control) 1336 return (0); 1337 1338 txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); 1339 1340 mutex_enter(&xnfp->xnf_txlock); 1341 1342 /* 1343 * If we're not yet connected then claim success. This is 1344 * acceptable because we refresh the entire set of multicast 1345 * addresses when we get connected. 1346 * 1347 * We can't wait around here because the MAC layer expects 1348 * this to be a non-blocking operation - waiting ends up 1349 * causing a deadlock during resume. 1350 */ 1351 if (!xnfp->xnf_connected) { 1352 mutex_exit(&xnfp->xnf_txlock); 1353 return (0); 1354 } 1355 1356 /* 1357 * 1. Acquire two slots in the ring. 1358 * 2. Fill in the slots. 1359 * 3. Request notification when the operation is done. 1360 * 4. Kick the peer. 1361 * 5. Wait for the response via xnf_tx_clean_ring(). 1362 */ 1363 1364 n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE); 1365 ASSERT(n_slots >= 2); 1366 1367 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1368 tidp = xnf_txid_get(xnfp); 1369 VERIFY(tidp != NULL); 1370 1371 txp->tx_type = TX_MCAST_REQ; 1372 txp->tx_slot = slot; 1373 1374 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1375 erp = (struct netif_extra_info *) 1376 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1); 1377 1378 txrp->gref = 0; 1379 txrp->size = 0; 1380 txrp->offset = 0; 1381 /* Set tx_txreq.id to appease xnf_tx_clean_ring(). 
*/ 1382 txrp->id = txp->tx_txreq.id = tidp->id; 1383 txrp->flags = NETTXF_extra_info; 1384 1385 erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD : 1386 XEN_NETIF_EXTRA_TYPE_MCAST_DEL; 1387 bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL); 1388 1389 tidp->txbuf = txp; 1390 1391 xnfp->xnf_tx_ring.req_prod_pvt = slot + 2; 1392 1393 mutex_enter(&xnfp->xnf_schedlock); 1394 xnfp->xnf_pending_multicast++; 1395 mutex_exit(&xnfp->xnf_schedlock); 1396 1397 /* LINTED: constant in conditional context */ 1398 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, 1399 notify); 1400 if (notify) 1401 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1402 1403 while (txp->tx_type == TX_MCAST_REQ) 1404 cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock); 1405 1406 ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP); 1407 1408 mutex_enter(&xnfp->xnf_schedlock); 1409 xnfp->xnf_pending_multicast--; 1410 mutex_exit(&xnfp->xnf_schedlock); 1411 1412 result = (txp->tx_status == NETIF_RSP_OKAY); 1413 1414 xnf_txid_put(xnfp, tidp); 1415 1416 mutex_exit(&xnfp->xnf_txlock); 1417 1418 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 1419 1420 return (result ? 0 : 1); 1421 } 1422 1423 /* 1424 * xnf_set_promiscuous() -- set or reset promiscuous mode on the board 1425 * 1426 * Program the hardware to enable/disable promiscuous mode. 1427 */ 1428 static int 1429 xnf_set_promiscuous(void *arg, boolean_t on) 1430 { 1431 _NOTE(ARGUNUSED(arg, on)); 1432 1433 /* 1434 * We can't really do this, but we pretend that we can in 1435 * order that snoop will work. 1436 */ 1437 return (0); 1438 } 1439 1440 /* 1441 * Clean buffers that we have responses for from the transmit ring. 1442 */ 1443 static int 1444 xnf_tx_clean_ring(xnf_t *xnfp) 1445 { 1446 boolean_t work_to_do; 1447 1448 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1449 1450 loop: 1451 while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) { 1452 RING_IDX cons, prod, i; 1453 1454 cons = xnfp->xnf_tx_ring.rsp_cons; 1455 prod = xnfp->xnf_tx_ring.sring->rsp_prod; 1456 membar_consumer(); 1457 /* 1458 * Clean tx requests from ring that we have responses 1459 * for. 1460 */ 1461 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod); 1462 for (i = cons; i != prod; i++) { 1463 netif_tx_response_t *trp; 1464 xnf_txid_t *tidp; 1465 xnf_txbuf_t *txp; 1466 1467 trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i); 1468 /* 1469 * if this slot was occupied by netif_extra_info_t, 1470 * then the response will be NETIF_RSP_NULL. In this 1471 * case there are no resources to clean up. 1472 */ 1473 if (trp->status == NETIF_RSP_NULL) 1474 continue; 1475 1476 ASSERT(TX_ID_VALID(trp->id)); 1477 1478 tidp = TX_ID_TO_TXID(xnfp, trp->id); 1479 ASSERT3U(tidp->id, ==, trp->id); 1480 ASSERT3U(tidp->next, ==, INVALID_TX_ID); 1481 1482 txp = tidp->txbuf; 1483 ASSERT(txp != NULL); 1484 ASSERT3U(txp->tx_txreq.id, ==, trp->id); 1485 1486 switch (txp->tx_type) { 1487 case TX_DATA: 1488 /* 1489 * We must put the txid for each response we 1490 * acknowledge to make sure that we never have 1491 * more free slots than txids. Because of this 1492 * we do it here instead of waiting for it to 1493 * be done in xnf_data_txbuf_free_chain(). 1494 */ 1495 xnf_txid_put(xnfp, tidp); 1496 txp->tx_txreq.id = INVALID_TX_ID; 1497 ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0); 1498 txp->tx_head->tx_frags_to_ack--; 1499 1500 /* 1501 * We clean the whole chain once we got a 1502 * response for each fragment. 
1503 */ 1504 if (txp->tx_head->tx_frags_to_ack == 0) 1505 xnf_data_txbuf_free_chain(xnfp, txp); 1506 1507 break; 1508 1509 case TX_MCAST_REQ: 1510 txp->tx_type = TX_MCAST_RSP; 1511 txp->tx_status = trp->status; 1512 cv_broadcast(&xnfp->xnf_cv_multicast); 1513 1514 break; 1515 1516 default: 1517 cmn_err(CE_PANIC, "xnf_tx_clean_ring: " 1518 "invalid xnf_txbuf_t type: %d", 1519 txp->tx_type); 1520 break; 1521 } 1522 } 1523 /* 1524 * Record the last response we dealt with so that we 1525 * know where to start next time around. 1526 */ 1527 xnfp->xnf_tx_ring.rsp_cons = prod; 1528 membar_enter(); 1529 } 1530 1531 /* LINTED: constant in conditional context */ 1532 RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do); 1533 if (work_to_do) 1534 goto loop; 1535 1536 return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring)); 1537 } 1538 1539 /* 1540 * Allocate and fill in a look-aside buffer for the packet `mp'. Used 1541 * to ensure that the packet is physically contiguous and contained 1542 * within a single page. 1543 */ 1544 static xnf_buf_t * 1545 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen) 1546 { 1547 xnf_buf_t *bd; 1548 caddr_t bp; 1549 1550 bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE); 1551 if (bd == NULL) 1552 return (NULL); 1553 1554 bp = bd->buf; 1555 while (mp != NULL) { 1556 size_t len = MBLKL(mp); 1557 1558 bcopy(mp->b_rptr, bp, len); 1559 bp += len; 1560 1561 mp = mp->b_cont; 1562 } 1563 1564 *plen = bp - bd->buf; 1565 ASSERT3U(*plen, <=, PAGESIZE); 1566 1567 xnfp->xnf_stat_tx_lookaside++; 1568 1569 return (bd); 1570 } 1571 1572 /* 1573 * Insert the pseudo-header checksum into the packet. 1574 * Assumes packet is IPv4, TCP/UDP since we only advertised support for 1575 * HCKSUM_INET_FULL_V4. 1576 */ 1577 int 1578 xnf_pseudo_cksum(mblk_t *mp) 1579 { 1580 struct ether_header *ehp; 1581 uint16_t sap, iplen, *stuff; 1582 uint32_t cksum; 1583 size_t len; 1584 ipha_t *ipha; 1585 ipaddr_t src, dst; 1586 uchar_t *ptr; 1587 1588 ptr = mp->b_rptr; 1589 len = MBLKL(mp); 1590 1591 /* Each header must fit completely in an mblk. */ 1592 ASSERT3U(len, >=, sizeof (*ehp)); 1593 1594 ehp = (struct ether_header *)ptr; 1595 1596 if (ntohs(ehp->ether_type) == VLAN_TPID) { 1597 struct ether_vlan_header *evhp; 1598 ASSERT3U(len, >=, sizeof (*evhp)); 1599 evhp = (struct ether_vlan_header *)ptr; 1600 sap = ntohs(evhp->ether_type); 1601 ptr += sizeof (*evhp); 1602 len -= sizeof (*evhp); 1603 } else { 1604 sap = ntohs(ehp->ether_type); 1605 ptr += sizeof (*ehp); 1606 len -= sizeof (*ehp); 1607 } 1608 1609 ASSERT3U(sap, ==, ETHERTYPE_IP); 1610 1611 /* 1612 * Ethernet and IP headers may be in different mblks. 1613 */ 1614 ASSERT3P(ptr, <=, mp->b_wptr); 1615 if (ptr == mp->b_wptr) { 1616 mp = mp->b_cont; 1617 ptr = mp->b_rptr; 1618 len = MBLKL(mp); 1619 } 1620 1621 ASSERT3U(len, >=, sizeof (ipha_t)); 1622 ipha = (ipha_t *)ptr; 1623 1624 /* 1625 * We assume the IP header has no options. (This is enforced in 1626 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM). 1627 */ 1628 ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH); 1629 iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; 1630 1631 ptr += IP_SIMPLE_HDR_LENGTH; 1632 len -= IP_SIMPLE_HDR_LENGTH; 1633 1634 /* 1635 * IP and L4 headers may be in different mblks. 
1636 */ 1637 ASSERT3P(ptr, <=, mp->b_wptr); 1638 if (ptr == mp->b_wptr) { 1639 mp = mp->b_cont; 1640 ptr = mp->b_rptr; 1641 len = MBLKL(mp); 1642 } 1643 1644 switch (ipha->ipha_protocol) { 1645 case IPPROTO_TCP: 1646 ASSERT3U(len, >=, sizeof (tcph_t)); 1647 stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET); 1648 cksum = IP_TCP_CSUM_COMP; 1649 break; 1650 case IPPROTO_UDP: 1651 ASSERT3U(len, >=, sizeof (struct udphdr)); 1652 stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET); 1653 cksum = IP_UDP_CSUM_COMP; 1654 break; 1655 default: 1656 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d", 1657 ipha->ipha_protocol); 1658 return (EINVAL); 1659 } 1660 1661 src = ipha->ipha_src; 1662 dst = ipha->ipha_dst; 1663 1664 cksum += (dst >> 16) + (dst & 0xFFFF); 1665 cksum += (src >> 16) + (src & 0xFFFF); 1666 cksum += htons(iplen); 1667 1668 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1669 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1670 1671 ASSERT(cksum <= 0xFFFF); 1672 1673 *stuff = (uint16_t)(cksum ? cksum : ~cksum); 1674 1675 return (0); 1676 } 1677 1678 /* 1679 * Push a packet into the transmit ring. 1680 * 1681 * Note: the format of a tx packet that spans multiple slots is similar to 1682 * what is described in xnf_rx_one_packet(). 1683 */ 1684 static void 1685 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head) 1686 { 1687 int nslots = 0; 1688 int extras = 0; 1689 RING_IDX slot; 1690 boolean_t notify; 1691 1692 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1693 ASSERT(xnfp->xnf_running); 1694 1695 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1696 1697 /* 1698 * The caller has already checked that we have enough slots to proceed. 1699 */ 1700 for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) { 1701 xnf_txid_t *tidp; 1702 netif_tx_request_t *txrp; 1703 1704 tidp = xnf_txid_get(xnfp); 1705 VERIFY(tidp != NULL); 1706 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1707 1708 txp->tx_slot = slot; 1709 txp->tx_txreq.id = tidp->id; 1710 *txrp = txp->tx_txreq; 1711 1712 tidp->txbuf = txp; 1713 slot++; 1714 nslots++; 1715 1716 /* 1717 * When present, LSO info is placed in a slot after the first 1718 * data segment, and doesn't require a txid. 1719 */ 1720 if (txp->tx_txreq.flags & NETTXF_extra_info) { 1721 netif_extra_info_t *extra; 1722 ASSERT3U(nslots, ==, 1); 1723 1724 extra = (netif_extra_info_t *) 1725 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1726 *extra = txp->tx_extra; 1727 slot++; 1728 nslots++; 1729 extras = 1; 1730 } 1731 } 1732 1733 ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX); 1734 1735 /* 1736 * Store the number of data fragments. 1737 */ 1738 head->tx_frags_to_ack = nslots - extras; 1739 1740 xnfp->xnf_tx_ring.req_prod_pvt = slot; 1741 1742 /* 1743 * Tell the peer that we sent something, if it cares. 
1744 */ 1745 /* LINTED: constant in conditional context */ 1746 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify); 1747 if (notify) 1748 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1749 } 1750 1751 static xnf_txbuf_t * 1752 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp) 1753 { 1754 xnf_txbuf_t *txp = xnf_data_txbuf_alloc(xnfp); 1755 size_t length; 1756 1757 txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length); 1758 if (txp->tx_bdesc == NULL) { 1759 xnf_data_txbuf_free(xnfp, txp); 1760 return (NULL); 1761 } 1762 txp->tx_mfn = txp->tx_bdesc->buf_mfn; 1763 txp->tx_txreq.gref = txp->tx_bdesc->grant_ref; 1764 txp->tx_txreq.size = length; 1765 txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET; 1766 txp->tx_txreq.flags = 0; 1767 1768 return (txp); 1769 } 1770 1771 static xnf_txbuf_t * 1772 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp) 1773 { 1774 xnf_txbuf_t *head = NULL; 1775 xnf_txbuf_t *tail = NULL; 1776 domid_t oeid; 1777 int nsegs = 0; 1778 1779 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 1780 1781 for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) { 1782 ddi_dma_handle_t dma_handle; 1783 const ddi_dma_cookie_t *dma_cookie, *dma_cookie_prev; 1784 xnf_txbuf_t *txp; 1785 1786 if (MBLKL(ml) == 0) 1787 continue; 1788 1789 txp = xnf_data_txbuf_alloc(xnfp); 1790 1791 if (head == NULL) { 1792 head = txp; 1793 } else { 1794 ASSERT(tail != NULL); 1795 TXBUF_SETNEXT(tail, txp); 1796 txp->tx_head = head; 1797 } 1798 1799 /* 1800 * The necessary segmentation rules (e.g. not crossing a page 1801 * boundary) are enforced by the dma attributes of the handle. 1802 */ 1803 dma_handle = txp->tx_dma_handle; 1804 int ret = ddi_dma_addr_bind_handle(dma_handle, 1805 NULL, (char *)ml->b_rptr, MBLKL(ml), 1806 DDI_DMA_WRITE | DDI_DMA_STREAMING, 1807 DDI_DMA_DONTWAIT, 0, NULL, NULL); 1808 if (ret != DDI_DMA_MAPPED) { 1809 if (ret != DDI_DMA_NORESOURCES) { 1810 dev_err(xnfp->xnf_devinfo, CE_WARN, 1811 "ddi_dma_addr_bind_handle() failed " 1812 "[dma_error=%d]", ret); 1813 } 1814 goto error; 1815 } 1816 txp->tx_handle_bound = B_TRUE; 1817 1818 dma_cookie_prev = NULL; 1819 while ((dma_cookie = ddi_dma_cookie_iter(dma_handle, 1820 dma_cookie_prev)) != NULL) { 1821 if (nsegs == XEN_MAX_TX_DATA_PAGES) { 1822 dev_err(xnfp->xnf_devinfo, CE_WARN, 1823 "xnf_dmamap_alloc() failed: " 1824 "too many segments"); 1825 goto error; 1826 } 1827 if (dma_cookie_prev != NULL) { 1828 txp = xnf_data_txbuf_alloc(xnfp); 1829 ASSERT(tail != NULL); 1830 TXBUF_SETNEXT(tail, txp); 1831 txp->tx_head = head; 1832 } 1833 1834 txp->tx_mfn = 1835 xnf_btop(pa_to_ma(dma_cookie->dmac_laddress)); 1836 txp->tx_txreq.gref = xnf_gref_get(xnfp); 1837 if (txp->tx_txreq.gref == INVALID_GRANT_REF) { 1838 dev_err(xnfp->xnf_devinfo, CE_WARN, 1839 "xnf_dmamap_alloc() failed: " 1840 "invalid grant ref"); 1841 goto error; 1842 } 1843 gnttab_grant_foreign_access_ref(txp->tx_txreq.gref, 1844 oeid, txp->tx_mfn, 1); 1845 txp->tx_txreq.offset = 1846 dma_cookie->dmac_laddress & PAGEOFFSET; 1847 txp->tx_txreq.size = dma_cookie->dmac_size; 1848 txp->tx_txreq.flags = 0; 1849 1850 nsegs++; 1851 1852 if (tail != NULL) 1853 tail->tx_txreq.flags = NETTXF_more_data; 1854 tail = txp; 1855 1856 dma_cookie_prev = dma_cookie; 1857 } 1858 } 1859 1860 *countp = nsegs; 1861 return (head); 1862 1863 error: 1864 xnf_data_txbuf_free_chain(xnfp, head); 1865 return (NULL); 1866 } 1867 1868 static void 1869 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head, 1870 uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss) 1871 { 1872 if (lso_flags != 0) { 1873 
ASSERT3U(lso_flags, ==, HW_LSO); 1874 ASSERT3P(head->tx_bdesc, ==, NULL); 1875 1876 head->tx_txreq.flags |= NETTXF_extra_info; 1877 netif_extra_info_t *extra = &head->tx_extra; 1878 extra->type = XEN_NETIF_EXTRA_TYPE_GSO; 1879 extra->flags = 0; 1880 extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; 1881 extra->u.gso.size = mss; 1882 extra->u.gso.features = 0; 1883 extra->u.gso.pad = 0; 1884 } else if (cksum_flags != 0) { 1885 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1886 /* 1887 * If the local protocol stack requests checksum 1888 * offload we set the 'checksum blank' flag, 1889 * indicating to the peer that we need the checksum 1890 * calculated for us. 1891 * 1892 * We _don't_ set the validated flag, because we haven't 1893 * validated that the data and the checksum match. 1894 * 1895 * Note: we already called xnf_pseudo_cksum() in 1896 * xnf_send(), so we just set the txreq flag here. 1897 */ 1898 head->tx_txreq.flags |= NETTXF_csum_blank; 1899 xnfp->xnf_stat_tx_cksum_deferred++; 1900 } 1901 } 1902 1903 /* 1904 * Send packet mp. Called by the MAC framework. 1905 */ 1906 static mblk_t * 1907 xnf_send(void *arg, mblk_t *mp) 1908 { 1909 xnf_t *xnfp = arg; 1910 xnf_txbuf_t *head; 1911 mblk_t *ml; 1912 int length; 1913 int pages, chunks, slots, slots_free; 1914 uint32_t cksum_flags, lso_flags, mss; 1915 boolean_t pulledup = B_FALSE; 1916 boolean_t force_copy = B_FALSE; 1917 1918 ASSERT3P(mp->b_next, ==, NULL); 1919 1920 mutex_enter(&xnfp->xnf_txlock); 1921 1922 /* 1923 * Wait until we are connected to the backend. 1924 */ 1925 while (!xnfp->xnf_connected) 1926 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock); 1927 1928 /* 1929 * To simplify logic and be in sync with the rescheduling mechanism, 1930 * we require the maximum amount of slots that could be used by a 1931 * transaction to be free before proceeding. The only downside of doing 1932 * this is that it slightly reduces the effective size of the ring. 1933 */ 1934 slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE); 1935 if (slots_free < XEN_MAX_SLOTS_PER_TX) { 1936 /* 1937 * We need to ask for a re-schedule later as the ring is full. 1938 */ 1939 mutex_enter(&xnfp->xnf_schedlock); 1940 xnfp->xnf_need_sched = B_TRUE; 1941 mutex_exit(&xnfp->xnf_schedlock); 1942 1943 xnfp->xnf_stat_tx_defer++; 1944 mutex_exit(&xnfp->xnf_txlock); 1945 return (mp); 1946 } 1947 1948 /* 1949 * Get hw offload parameters. 1950 * This must be done before pulling up the mp as those parameters 1951 * are not copied over. 1952 */ 1953 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags); 1954 mac_lso_get(mp, &mss, &lso_flags); 1955 1956 /* 1957 * XXX: fix MAC framework so that we can advertise support for 1958 * partial checksum for IPv4 only. This way we won't need to calculate 1959 * the pseudo header checksum ourselves. 1960 */ 1961 if (cksum_flags != 0) { 1962 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1963 (void) xnf_pseudo_cksum(mp); 1964 } 1965 1966 pulledup: 1967 for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL; 1968 ml = ml->b_cont, chunks++) { 1969 pages += xnf_mblk_pages(ml); 1970 length += MBLKL(ml); 1971 } 1972 DTRACE_PROBE3(packet, int, length, int, chunks, int, pages); 1973 DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss); 1974 1975 /* 1976 * If the ethernet header crosses a page boundary the packet 1977 * will be dropped by the backend. In practice it seems like 1978 * this happens fairly rarely so we'll do nothing unless the 1979 * packet is small enough to fit in a look-aside buffer. 
1980 */ 1981 if (((uintptr_t)mp->b_rptr & PAGEOFFSET) + 1982 sizeof (struct ether_header) > PAGESIZE) { 1983 xnfp->xnf_stat_tx_eth_hdr_split++; 1984 if (length <= PAGESIZE) 1985 force_copy = B_TRUE; 1986 } 1987 1988 if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) { 1989 /* 1990 * If the packet spans several pages and scatter-gather is not 1991 * supported then use a look-aside buffer. 1992 */ 1993 ASSERT3U(length, <=, PAGESIZE); 1994 head = xnf_mblk_copy(xnfp, mp); 1995 if (head == NULL) { 1996 dev_err(xnfp->xnf_devinfo, CE_WARN, 1997 "xnf_mblk_copy() failed"); 1998 goto drop; 1999 } 2000 } else { 2001 /* 2002 * There's a limit for how many pages can be passed to the 2003 * backend. If we pass that limit, the packet will be dropped 2004 * and some backend implementations (e.g. Linux) could even 2005 * offline the interface. 2006 */ 2007 if (pages > XEN_MAX_TX_DATA_PAGES) { 2008 if (pulledup) { 2009 dev_err(xnfp->xnf_devinfo, CE_WARN, 2010 "too many pages, even after pullup: %d.", 2011 pages); 2012 goto drop; 2013 } 2014 2015 /* 2016 * Defragment packet if it spans too many pages. 2017 */ 2018 mblk_t *newmp = msgpullup(mp, -1); 2019 freemsg(mp); 2020 mp = newmp; 2021 xnfp->xnf_stat_tx_pullup++; 2022 pulledup = B_TRUE; 2023 goto pulledup; 2024 } 2025 2026 head = xnf_mblk_map(xnfp, mp, &slots); 2027 if (head == NULL) 2028 goto drop; 2029 2030 IMPLY(slots > 1, xnfp->xnf_be_tx_sg); 2031 } 2032 2033 /* 2034 * Set tx_mp so that mblk is freed when the txbuf chain is freed. 2035 */ 2036 head->tx_mp = mp; 2037 2038 xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss); 2039 2040 /* 2041 * The first request must store the total length of the packet. 2042 */ 2043 head->tx_txreq.size = length; 2044 2045 /* 2046 * Push the packet we have prepared into the ring. 2047 */ 2048 xnf_tx_push_packet(xnfp, head); 2049 xnfp->xnf_stat_opackets++; 2050 xnfp->xnf_stat_obytes += length; 2051 2052 mutex_exit(&xnfp->xnf_txlock); 2053 return (NULL); 2054 2055 drop: 2056 freemsg(mp); 2057 xnfp->xnf_stat_tx_drop++; 2058 mutex_exit(&xnfp->xnf_txlock); 2059 return (NULL); 2060 } 2061 2062 /* 2063 * Notification of RX packets. Currently no TX-complete interrupt is 2064 * used, as we clean the TX ring lazily. 2065 */ 2066 static uint_t 2067 xnf_intr(caddr_t arg) 2068 { 2069 xnf_t *xnfp = (xnf_t *)arg; 2070 mblk_t *mp; 2071 boolean_t need_sched, clean_ring; 2072 2073 mutex_enter(&xnfp->xnf_rxlock); 2074 2075 /* 2076 * Interrupts before we are connected are spurious. 2077 */ 2078 if (!xnfp->xnf_connected) { 2079 mutex_exit(&xnfp->xnf_rxlock); 2080 xnfp->xnf_stat_unclaimed_interrupts++; 2081 return (DDI_INTR_UNCLAIMED); 2082 } 2083 2084 /* 2085 * Receive side processing. 2086 */ 2087 do { 2088 /* 2089 * Collect buffers from the ring. 2090 */ 2091 xnf_rx_collect(xnfp); 2092 2093 /* 2094 * Interrupt me when the next receive buffer is consumed. 2095 */ 2096 xnfp->xnf_rx_ring.sring->rsp_event = 2097 xnfp->xnf_rx_ring.rsp_cons + 1; 2098 xen_mb(); 2099 2100 } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)); 2101 2102 if (xnfp->xnf_rx_new_buffers_posted) { 2103 boolean_t notify; 2104 2105 /* 2106 * Indicate to the peer that we have re-filled the 2107 * receive ring, if it cares. 
2108 */ 2109 /* LINTED: constant in conditional context */ 2110 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); 2111 if (notify) 2112 ec_notify_via_evtchn(xnfp->xnf_evtchn); 2113 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 2114 } 2115 2116 mp = xnfp->xnf_rx_head; 2117 xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL; 2118 2119 xnfp->xnf_stat_interrupts++; 2120 mutex_exit(&xnfp->xnf_rxlock); 2121 2122 if (mp != NULL) 2123 mac_rx(xnfp->xnf_mh, NULL, mp); 2124 2125 /* 2126 * Transmit side processing. 2127 * 2128 * If a previous transmit attempt failed or we have pending 2129 * multicast requests, clean the ring. 2130 * 2131 * If we previously stalled transmission and cleaning produces 2132 * some free slots, tell upstream to attempt sending again. 2133 * 2134 * The odd style is to avoid acquiring xnf_txlock unless we 2135 * will actually look inside the tx machinery. 2136 */ 2137 mutex_enter(&xnfp->xnf_schedlock); 2138 need_sched = xnfp->xnf_need_sched; 2139 clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0); 2140 mutex_exit(&xnfp->xnf_schedlock); 2141 2142 if (clean_ring) { 2143 int free_slots; 2144 2145 mutex_enter(&xnfp->xnf_txlock); 2146 free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE); 2147 2148 if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) { 2149 mutex_enter(&xnfp->xnf_schedlock); 2150 xnfp->xnf_need_sched = B_FALSE; 2151 mutex_exit(&xnfp->xnf_schedlock); 2152 2153 mac_tx_update(xnfp->xnf_mh); 2154 } 2155 mutex_exit(&xnfp->xnf_txlock); 2156 } 2157 2158 return (DDI_INTR_CLAIMED); 2159 } 2160 2161 /* 2162 * xnf_start() -- start the board receiving and enable interrupts. 2163 */ 2164 static int 2165 xnf_start(void *arg) 2166 { 2167 xnf_t *xnfp = arg; 2168 2169 #ifdef XNF_DEBUG 2170 if (xnf_debug & XNF_DEBUG_TRACE) 2171 printf("xnf%d start(0x%p)\n", 2172 ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); 2173 #endif 2174 2175 mutex_enter(&xnfp->xnf_rxlock); 2176 mutex_enter(&xnfp->xnf_txlock); 2177 2178 /* Accept packets from above. */ 2179 xnfp->xnf_running = B_TRUE; 2180 2181 mutex_exit(&xnfp->xnf_txlock); 2182 mutex_exit(&xnfp->xnf_rxlock); 2183 2184 return (0); 2185 } 2186 2187 /* xnf_stop() - disable hardware */ 2188 static void 2189 xnf_stop(void *arg) 2190 { 2191 xnf_t *xnfp = arg; 2192 2193 #ifdef XNF_DEBUG 2194 if (xnf_debug & XNF_DEBUG_TRACE) 2195 printf("xnf%d stop(0x%p)\n", 2196 ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); 2197 #endif 2198 2199 mutex_enter(&xnfp->xnf_rxlock); 2200 mutex_enter(&xnfp->xnf_txlock); 2201 2202 xnfp->xnf_running = B_FALSE; 2203 2204 mutex_exit(&xnfp->xnf_txlock); 2205 mutex_exit(&xnfp->xnf_rxlock); 2206 } 2207 2208 /* 2209 * Hang buffer `bdesc' on the RX ring. 2210 */ 2211 static void 2212 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc) 2213 { 2214 netif_rx_request_t *reqp; 2215 RING_IDX hang_ix; 2216 2217 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2218 2219 reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, 2220 xnfp->xnf_rx_ring.req_prod_pvt); 2221 hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0)); 2222 ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL); 2223 2224 reqp->id = bdesc->id = hang_ix; 2225 reqp->gref = bdesc->grant_ref; 2226 2227 xnfp->xnf_rx_pkt_info[hang_ix] = bdesc; 2228 xnfp->xnf_rx_ring.req_prod_pvt++; 2229 2230 xnfp->xnf_rx_new_buffers_posted = B_TRUE; 2231 } 2232 2233 /* 2234 * Receive an entire packet from the ring, starting from slot *consp. 2235 * prod indicates the slot of the latest response. 2236 * On return, *consp will point to the head of the next packet. 
2237 * 2238 * Note: If slot prod was reached before we could gather a full packet, we will 2239 * drop the partial packet; this would most likely indicate a bug in either 2240 * the front-end or the back-end driver. 2241 * 2242 * An rx packet can consist of several fragments and thus span multiple slots. 2243 * Each fragment can contain up to 4k of data. 2244 * 2245 * A typical 9000 MTU packet with look like this: 2246 * +------+---------------------+-------------------+-----------------------+ 2247 * | SLOT | TYPE | CONTENTS | FLAGS | 2248 * +------+---------------------+-------------------+-----------------------+ 2249 * | 1 | netif_rx_response_t | 1st data fragment | more_data | 2250 * +------+---------------------+-------------------+-----------------------+ 2251 * | 2 | netif_rx_response_t | 2nd data fragment | more_data | 2252 * +------+---------------------+-------------------+-----------------------+ 2253 * | 3 | netif_rx_response_t | 3rd data fragment | [none] | 2254 * +------+---------------------+-------------------+-----------------------+ 2255 * 2256 * Fragments are chained by setting NETRXF_more_data in the previous 2257 * response's flags. If there are additional flags, such as 2258 * NETRXF_data_validated or NETRXF_extra_info, those should be set on the 2259 * first fragment. 2260 * 2261 * Sometimes extra info can be present. If so, it will follow the first 2262 * fragment, and NETRXF_extra_info flag will be set on the first response. 2263 * If LRO is set on a packet, it will be stored in the extra info. Conforming 2264 * to the spec, extra info can also be chained, but must all be present right 2265 * after the first fragment. 2266 * 2267 * Example of a packet with 2 extra infos: 2268 * +------+---------------------+-------------------+-----------------------+ 2269 * | SLOT | TYPE | CONTENTS | FLAGS | 2270 * +------+---------------------+-------------------+-----------------------+ 2271 * | 1 | netif_rx_response_t | 1st data fragment | extra_info, more_data | 2272 * +------+---------------------+-------------------+-----------------------+ 2273 * | 2 | netif_extra_info_t | 1st extra info | EXTRA_FLAG_MORE | 2274 * +------+---------------------+-------------------+-----------------------+ 2275 * | 3 | netif_extra_info_t | 2nd extra info | [none] | 2276 * +------+---------------------+-------------------+-----------------------+ 2277 * | 4 | netif_rx_response_t | 2nd data fragment | more_data | 2278 * +------+---------------------+-------------------+-----------------------+ 2279 * | 5 | netif_rx_response_t | 3rd data fragment | more_data | 2280 * +------+---------------------+-------------------+-----------------------+ 2281 * | 6 | netif_rx_response_t | 4th data fragment | [none] | 2282 * +------+---------------------+-------------------+-----------------------+ 2283 * 2284 * In practice, the only extra we expect is for LRO, but only if we advertise 2285 * that we support it to the backend (xnf_enable_lro == TRUE). 
2286 */ 2287 static int 2288 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp) 2289 { 2290 mblk_t *head = NULL; 2291 mblk_t *tail = NULL; 2292 mblk_t *mp; 2293 int error = 0; 2294 RING_IDX cons = *consp; 2295 netif_extra_info_t lro; 2296 boolean_t is_lro = B_FALSE; 2297 boolean_t is_extra = B_FALSE; 2298 2299 netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2300 2301 boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0; 2302 boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0; 2303 boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0; 2304 2305 IMPLY(more_data, xnf_enable_rx_sg); 2306 2307 while (cons != prod) { 2308 xnf_buf_t *bdesc; 2309 int len, off; 2310 int rxidx = cons & (NET_RX_RING_SIZE - 1); 2311 2312 bdesc = xnfp->xnf_rx_pkt_info[rxidx]; 2313 xnfp->xnf_rx_pkt_info[rxidx] = NULL; 2314 2315 if (is_extra) { 2316 netif_extra_info_t *extra = (netif_extra_info_t *)&rsp; 2317 /* 2318 * The only extra we expect is for LRO, and it should 2319 * only be present once. 2320 */ 2321 if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO && 2322 !is_lro) { 2323 ASSERT(xnf_enable_lro); 2324 lro = *extra; 2325 is_lro = B_TRUE; 2326 DTRACE_PROBE1(lro, netif_extra_info_t *, &lro); 2327 } else { 2328 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet " 2329 "contains unexpected extra info of type %d", 2330 extra->type); 2331 error = EINVAL; 2332 } 2333 more_extra = 2334 (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0; 2335 2336 goto hang_buf; 2337 } 2338 2339 ASSERT3U(bdesc->id, ==, rsp.id); 2340 2341 /* 2342 * status stores packet length when >= 0, or errors when < 0. 2343 */ 2344 len = rsp.status; 2345 off = rsp.offset; 2346 more_data = (rsp.flags & NETRXF_more_data) != 0; 2347 2348 /* 2349 * sanity checks. 2350 */ 2351 if (!xnfp->xnf_running) { 2352 error = EBUSY; 2353 } else if (len <= 0) { 2354 xnfp->xnf_stat_errrx++; 2355 2356 switch (len) { 2357 case 0: 2358 xnfp->xnf_stat_runt++; 2359 break; 2360 case NETIF_RSP_ERROR: 2361 xnfp->xnf_stat_mac_rcv_error++; 2362 break; 2363 case NETIF_RSP_DROPPED: 2364 xnfp->xnf_stat_norxbuf++; 2365 break; 2366 } 2367 error = EINVAL; 2368 } else if (bdesc->grant_ref == INVALID_GRANT_REF) { 2369 dev_err(xnfp->xnf_devinfo, CE_WARN, 2370 "Bad rx grant reference, rsp id %d", rsp.id); 2371 error = EINVAL; 2372 } else if ((off + len) > PAGESIZE) { 2373 dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses " 2374 "page boundary (offset %d, length %d)", off, len); 2375 error = EINVAL; 2376 } 2377 2378 if (error != 0) { 2379 /* 2380 * If an error has been detected, we do not attempt 2381 * to read the data but we still need to replace 2382 * the rx bufs. 2383 */ 2384 goto hang_buf; 2385 } 2386 2387 xnf_buf_t *nbuf = NULL; 2388 2389 /* 2390 * If the packet is below a pre-determined size we will 2391 * copy data out of the buf rather than replace it. 2392 */ 2393 if (len > xnf_rx_copy_limit) 2394 nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE); 2395 2396 if (nbuf != NULL) { 2397 mp = desballoc((unsigned char *)bdesc->buf, 2398 bdesc->len, 0, &bdesc->free_rtn); 2399 2400 if (mp == NULL) { 2401 xnfp->xnf_stat_rx_desballoc_fail++; 2402 xnfp->xnf_stat_norxbuf++; 2403 error = ENOMEM; 2404 /* 2405 * we free the buf we just allocated as we 2406 * will re-hang the old buf. 
2407 */ 2408 xnf_buf_put(xnfp, nbuf, B_FALSE); 2409 goto hang_buf; 2410 } 2411 2412 mp->b_rptr = mp->b_rptr + off; 2413 mp->b_wptr = mp->b_rptr + len; 2414 2415 /* 2416 * Release the grant as the backend doesn't need to 2417 * access this buffer anymore and grants are scarce. 2418 */ 2419 (void) gnttab_end_foreign_access_ref(bdesc->grant_ref, 2420 0); 2421 xnf_gref_put(xnfp, bdesc->grant_ref); 2422 bdesc->grant_ref = INVALID_GRANT_REF; 2423 2424 bdesc = nbuf; 2425 } else { 2426 /* 2427 * We failed to allocate a new buf or decided to reuse 2428 * the old one. In either case we copy the data off it 2429 * and put it back into the ring. 2430 */ 2431 mp = allocb(len, 0); 2432 if (mp == NULL) { 2433 xnfp->xnf_stat_rx_allocb_fail++; 2434 xnfp->xnf_stat_norxbuf++; 2435 error = ENOMEM; 2436 goto hang_buf; 2437 } 2438 bcopy(bdesc->buf + off, mp->b_wptr, len); 2439 mp->b_wptr += len; 2440 } 2441 2442 if (head == NULL) 2443 head = mp; 2444 else 2445 tail->b_cont = mp; 2446 tail = mp; 2447 2448 hang_buf: 2449 /* 2450 * No matter what happens, for each response we need to hang 2451 * a new buf on the rx ring. Put either the old one, or a new 2452 * one if the old one is borrowed by the kernel via desballoc(). 2453 */ 2454 xnf_rxbuf_hang(xnfp, bdesc); 2455 cons++; 2456 2457 /* next response is an extra */ 2458 is_extra = more_extra; 2459 2460 if (!more_data && !more_extra) 2461 break; 2462 2463 /* 2464 * Note that since requests and responses are union'd on the 2465 * same ring, we copy the response to a local variable instead 2466 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have 2467 * overwritten contents of rsp. 2468 */ 2469 rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2470 } 2471 2472 /* 2473 * Check that we do not get stuck in a loop. 2474 */ 2475 ASSERT3U(*consp, !=, cons); 2476 *consp = cons; 2477 2478 /* 2479 * We ran out of responses but the flags indicate there is more data. 2480 */ 2481 if (more_data) { 2482 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments."); 2483 error = EINVAL; 2484 } 2485 if (more_extra) { 2486 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments " 2487 "(extras)."); 2488 error = EINVAL; 2489 } 2490 2491 /* 2492 * An error means the packet must be dropped. If we have already formed 2493 * a partial packet, then discard it. 2494 */ 2495 if (error != 0) { 2496 if (head != NULL) 2497 freemsg(head); 2498 xnfp->xnf_stat_rx_drop++; 2499 return (error); 2500 } 2501 2502 ASSERT(head != NULL); 2503 2504 if (hwcsum) { 2505 /* 2506 * If the peer says that the data has been validated then we 2507 * declare that the full checksum has been verified. 2508 * 2509 * We don't look at the "checksum blank" flag, and hence could 2510 * have a packet here that we are asserting is good with 2511 * a blank checksum. 2512 */ 2513 mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 2514 xnfp->xnf_stat_rx_cksum_no_need++; 2515 } 2516 2517 /* XXX: set lro info for packet once LRO is supported in OS. */ 2518 2519 *mpp = head; 2520 2521 return (0); 2522 } 2523 2524 /* 2525 * Collect packets from the RX ring, storing them in `xnfp' for later use. 2526 */ 2527 static void 2528 xnf_rx_collect(xnf_t *xnfp) 2529 { 2530 RING_IDX prod; 2531 2532 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2533 2534 prod = xnfp->xnf_rx_ring.sring->rsp_prod; 2535 /* 2536 * Ensure we see queued responses up to 'prod'. 2537 */ 2538 membar_consumer(); 2539 2540 while (xnfp->xnf_rx_ring.rsp_cons != prod) { 2541 mblk_t *mp; 2542 2543 /* 2544 * Collect a packet. 
2545 * rsp_cons is updated inside xnf_rx_one_packet(). 2546 */ 2547 int error = xnf_rx_one_packet(xnfp, prod, 2548 &xnfp->xnf_rx_ring.rsp_cons, &mp); 2549 if (error == 0) { 2550 xnfp->xnf_stat_ipackets++; 2551 xnfp->xnf_stat_rbytes += xmsgsize(mp); 2552 2553 /* 2554 * Append the mblk to the rx list. 2555 */ 2556 if (xnfp->xnf_rx_head == NULL) { 2557 ASSERT3P(xnfp->xnf_rx_tail, ==, NULL); 2558 xnfp->xnf_rx_head = mp; 2559 } else { 2560 ASSERT(xnfp->xnf_rx_tail != NULL); 2561 xnfp->xnf_rx_tail->b_next = mp; 2562 } 2563 xnfp->xnf_rx_tail = mp; 2564 } 2565 } 2566 } 2567 2568 /* 2569 * xnf_alloc_dma_resources() -- initialize the drivers structures 2570 */ 2571 static int 2572 xnf_alloc_dma_resources(xnf_t *xnfp) 2573 { 2574 dev_info_t *devinfo = xnfp->xnf_devinfo; 2575 size_t len; 2576 ddi_dma_cookie_t dma_cookie; 2577 uint_t ncookies; 2578 int rc; 2579 caddr_t rptr; 2580 2581 /* 2582 * The code below allocates all the DMA data structures that 2583 * need to be released when the driver is detached. 2584 * 2585 * Allocate page for the transmit descriptor ring. 2586 */ 2587 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2588 DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS) 2589 goto alloc_error; 2590 2591 if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle, 2592 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2593 DDI_DMA_SLEEP, 0, &rptr, &len, 2594 &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) { 2595 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2596 xnfp->xnf_tx_ring_dma_handle = NULL; 2597 goto alloc_error; 2598 } 2599 2600 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL, 2601 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2602 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2603 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2604 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2605 xnfp->xnf_tx_ring_dma_handle = NULL; 2606 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2607 if (rc == DDI_DMA_NORESOURCES) 2608 goto alloc_error; 2609 else 2610 goto error; 2611 } 2612 2613 ASSERT(ncookies == 1); 2614 bzero(rptr, PAGESIZE); 2615 /* LINTED: constant in conditional context */ 2616 SHARED_RING_INIT((netif_tx_sring_t *)rptr); 2617 /* LINTED: constant in conditional context */ 2618 FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE); 2619 xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress; 2620 2621 /* 2622 * Allocate page for the receive descriptor ring. 
2623 */ 2624 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2625 DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS) 2626 goto alloc_error; 2627 2628 if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle, 2629 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2630 DDI_DMA_SLEEP, 0, &rptr, &len, 2631 &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) { 2632 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2633 xnfp->xnf_rx_ring_dma_handle = NULL; 2634 goto alloc_error; 2635 } 2636 2637 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL, 2638 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2639 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2640 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2641 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2642 xnfp->xnf_rx_ring_dma_handle = NULL; 2643 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2644 if (rc == DDI_DMA_NORESOURCES) 2645 goto alloc_error; 2646 else 2647 goto error; 2648 } 2649 2650 ASSERT(ncookies == 1); 2651 bzero(rptr, PAGESIZE); 2652 /* LINTED: constant in conditional context */ 2653 SHARED_RING_INIT((netif_rx_sring_t *)rptr); 2654 /* LINTED: constant in conditional context */ 2655 FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE); 2656 xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress; 2657 2658 return (DDI_SUCCESS); 2659 2660 alloc_error: 2661 cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory", 2662 ddi_get_instance(xnfp->xnf_devinfo)); 2663 error: 2664 xnf_release_dma_resources(xnfp); 2665 return (DDI_FAILURE); 2666 } 2667 2668 /* 2669 * Release all DMA resources in the opposite order from acquisition 2670 */ 2671 static void 2672 xnf_release_dma_resources(xnf_t *xnfp) 2673 { 2674 int i; 2675 2676 /* 2677 * Free receive buffers which are currently associated with 2678 * descriptors. 2679 */ 2680 mutex_enter(&xnfp->xnf_rxlock); 2681 for (i = 0; i < NET_RX_RING_SIZE; i++) { 2682 xnf_buf_t *bp; 2683 2684 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL) 2685 continue; 2686 xnfp->xnf_rx_pkt_info[i] = NULL; 2687 xnf_buf_put(xnfp, bp, B_FALSE); 2688 } 2689 mutex_exit(&xnfp->xnf_rxlock); 2690 2691 /* Free the receive ring buffer. */ 2692 if (xnfp->xnf_rx_ring_dma_acchandle != NULL) { 2693 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle); 2694 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2695 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2696 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2697 } 2698 /* Free the transmit ring buffer. */ 2699 if (xnfp->xnf_tx_ring_dma_acchandle != NULL) { 2700 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle); 2701 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2702 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2703 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2704 } 2705 2706 } 2707 2708 /* 2709 * Release any packets and associated structures used by the TX ring. 
2710 */ 2711 static void 2712 xnf_release_mblks(xnf_t *xnfp) 2713 { 2714 RING_IDX i; 2715 xnf_txid_t *tidp; 2716 2717 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 2718 i < NET_TX_RING_SIZE; 2719 i++, tidp++) { 2720 xnf_txbuf_t *txp = tidp->txbuf; 2721 2722 if (txp != NULL) { 2723 ASSERT(txp->tx_mp != NULL); 2724 freemsg(txp->tx_mp); 2725 2726 xnf_txid_put(xnfp, tidp); 2727 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 2728 } 2729 } 2730 } 2731 2732 static int 2733 xnf_buf_constructor(void *buf, void *arg, int kmflag) 2734 { 2735 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2736 xnf_buf_t *bdesc = buf; 2737 xnf_t *xnfp = arg; 2738 ddi_dma_cookie_t dma_cookie; 2739 uint_t ncookies; 2740 size_t len; 2741 2742 if (kmflag & KM_NOSLEEP) 2743 ddiflags = DDI_DMA_DONTWAIT; 2744 2745 /* Allocate a DMA access handle for the buffer. */ 2746 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr, 2747 ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS) 2748 goto failure; 2749 2750 /* Allocate DMA-able memory for buffer. */ 2751 if (ddi_dma_mem_alloc(bdesc->dma_handle, 2752 PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0, 2753 &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS) 2754 goto failure_1; 2755 2756 /* Bind to virtual address of buffer to get physical address. */ 2757 if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL, 2758 bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING, 2759 ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) 2760 goto failure_2; 2761 ASSERT(ncookies == 1); 2762 2763 bdesc->free_rtn.free_func = xnf_buf_recycle; 2764 bdesc->free_rtn.free_arg = (caddr_t)bdesc; 2765 bdesc->xnfp = xnfp; 2766 bdesc->buf_phys = dma_cookie.dmac_laddress; 2767 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2768 bdesc->len = dma_cookie.dmac_size; 2769 bdesc->grant_ref = INVALID_GRANT_REF; 2770 bdesc->gen = xnfp->xnf_gen; 2771 2772 atomic_inc_64(&xnfp->xnf_stat_buf_allocated); 2773 2774 return (0); 2775 2776 failure_2: 2777 ddi_dma_mem_free(&bdesc->acc_handle); 2778 2779 failure_1: 2780 ddi_dma_free_handle(&bdesc->dma_handle); 2781 2782 failure: 2783 2784 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */ 2785 return (-1); 2786 } 2787 2788 static void 2789 xnf_buf_destructor(void *buf, void *arg) 2790 { 2791 xnf_buf_t *bdesc = buf; 2792 xnf_t *xnfp = arg; 2793 2794 (void) ddi_dma_unbind_handle(bdesc->dma_handle); 2795 ddi_dma_mem_free(&bdesc->acc_handle); 2796 ddi_dma_free_handle(&bdesc->dma_handle); 2797 2798 atomic_dec_64(&xnfp->xnf_stat_buf_allocated); 2799 } 2800 2801 static xnf_buf_t * 2802 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly) 2803 { 2804 grant_ref_t gref; 2805 xnf_buf_t *bufp; 2806 2807 /* 2808 * Usually grant references are more scarce than memory, so we 2809 * attempt to acquire a grant reference first. 2810 */ 2811 gref = xnf_gref_get(xnfp); 2812 if (gref == INVALID_GRANT_REF) 2813 return (NULL); 2814 2815 bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags); 2816 if (bufp == NULL) { 2817 xnf_gref_put(xnfp, gref); 2818 return (NULL); 2819 } 2820 2821 ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF); 2822 2823 bufp->grant_ref = gref; 2824 2825 if (bufp->gen != xnfp->xnf_gen) 2826 xnf_buf_refresh(bufp); 2827 2828 gnttab_grant_foreign_access_ref(bufp->grant_ref, 2829 xvdi_get_oeid(bufp->xnfp->xnf_devinfo), 2830 bufp->buf_mfn, readonly ? 
1 : 0); 2831 2832 atomic_inc_64(&xnfp->xnf_stat_buf_outstanding); 2833 2834 return (bufp); 2835 } 2836 2837 static void 2838 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly) 2839 { 2840 if (bufp->grant_ref != INVALID_GRANT_REF) { 2841 (void) gnttab_end_foreign_access_ref( 2842 bufp->grant_ref, readonly ? 1 : 0); 2843 xnf_gref_put(xnfp, bufp->grant_ref); 2844 bufp->grant_ref = INVALID_GRANT_REF; 2845 } 2846 2847 kmem_cache_free(xnfp->xnf_buf_cache, bufp); 2848 2849 atomic_dec_64(&xnfp->xnf_stat_buf_outstanding); 2850 } 2851 2852 /* 2853 * Refresh any cached data about a buffer after resume. 2854 */ 2855 static void 2856 xnf_buf_refresh(xnf_buf_t *bdesc) 2857 { 2858 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2859 bdesc->gen = bdesc->xnfp->xnf_gen; 2860 } 2861 2862 /* 2863 * Streams `freeb' routine for `xnf_buf_t' when used as transmit 2864 * look-aside buffers. 2865 */ 2866 static void 2867 xnf_buf_recycle(xnf_buf_t *bdesc) 2868 { 2869 xnf_t *xnfp = bdesc->xnfp; 2870 2871 xnf_buf_put(xnfp, bdesc, B_TRUE); 2872 } 2873 2874 static int 2875 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag) 2876 { 2877 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2878 xnf_txbuf_t *txp = buf; 2879 xnf_t *xnfp = arg; 2880 2881 if (kmflag & KM_NOSLEEP) 2882 ddiflags = DDI_DMA_DONTWAIT; 2883 2884 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr, 2885 ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) { 2886 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */ 2887 return (-1); 2888 } 2889 2890 return (0); 2891 } 2892 2893 static void 2894 xnf_tx_buf_destructor(void *buf, void *arg) 2895 { 2896 _NOTE(ARGUNUSED(arg)); 2897 xnf_txbuf_t *txp = buf; 2898 2899 ddi_dma_free_handle(&txp->tx_dma_handle); 2900 } 2901 2902 /* 2903 * Statistics. 2904 */ 2905 static char *xnf_aux_statistics[] = { 2906 "tx_cksum_deferred", 2907 "rx_cksum_no_need", 2908 "interrupts", 2909 "unclaimed_interrupts", 2910 "tx_pullup", 2911 "tx_lookaside", 2912 "tx_drop", 2913 "tx_eth_hdr_split", 2914 "buf_allocated", 2915 "buf_outstanding", 2916 "gref_outstanding", 2917 "gref_failure", 2918 "gref_peak", 2919 "rx_allocb_fail", 2920 "rx_desballoc_fail", 2921 }; 2922 2923 static int 2924 xnf_kstat_aux_update(kstat_t *ksp, int flag) 2925 { 2926 xnf_t *xnfp; 2927 kstat_named_t *knp; 2928 2929 if (flag != KSTAT_READ) 2930 return (EACCES); 2931 2932 xnfp = ksp->ks_private; 2933 knp = ksp->ks_data; 2934 2935 /* 2936 * Assignment order must match that of the names in 2937 * xnf_aux_statistics. 
2938 */ 2939 (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred; 2940 (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need; 2941 2942 (knp++)->value.ui64 = xnfp->xnf_stat_interrupts; 2943 (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts; 2944 (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup; 2945 (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside; 2946 (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop; 2947 (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split; 2948 2949 (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated; 2950 (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding; 2951 (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding; 2952 (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure; 2953 (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak; 2954 (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail; 2955 (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail; 2956 2957 return (0); 2958 } 2959 2960 static boolean_t 2961 xnf_kstat_init(xnf_t *xnfp) 2962 { 2963 int nstat = sizeof (xnf_aux_statistics) / 2964 sizeof (xnf_aux_statistics[0]); 2965 char **cp = xnf_aux_statistics; 2966 kstat_named_t *knp; 2967 2968 /* 2969 * Create and initialise kstats. 2970 */ 2971 if ((xnfp->xnf_kstat_aux = kstat_create("xnf", 2972 ddi_get_instance(xnfp->xnf_devinfo), 2973 "aux_statistics", "net", KSTAT_TYPE_NAMED, 2974 nstat, 0)) == NULL) 2975 return (B_FALSE); 2976 2977 xnfp->xnf_kstat_aux->ks_private = xnfp; 2978 xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update; 2979 2980 knp = xnfp->xnf_kstat_aux->ks_data; 2981 while (nstat > 0) { 2982 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64); 2983 2984 knp++; 2985 cp++; 2986 nstat--; 2987 } 2988 2989 kstat_install(xnfp->xnf_kstat_aux); 2990 2991 return (B_TRUE); 2992 } 2993 2994 static int 2995 xnf_stat(void *arg, uint_t stat, uint64_t *val) 2996 { 2997 xnf_t *xnfp = arg; 2998 2999 mutex_enter(&xnfp->xnf_rxlock); 3000 mutex_enter(&xnfp->xnf_txlock); 3001 3002 #define mac_stat(q, r) \ 3003 case (MAC_STAT_##q): \ 3004 *val = xnfp->xnf_stat_##r; \ 3005 break 3006 3007 #define ether_stat(q, r) \ 3008 case (ETHER_STAT_##q): \ 3009 *val = xnfp->xnf_stat_##r; \ 3010 break 3011 3012 switch (stat) { 3013 3014 mac_stat(IPACKETS, ipackets); 3015 mac_stat(OPACKETS, opackets); 3016 mac_stat(RBYTES, rbytes); 3017 mac_stat(OBYTES, obytes); 3018 mac_stat(NORCVBUF, norxbuf); 3019 mac_stat(IERRORS, errrx); 3020 mac_stat(NOXMTBUF, tx_defer); 3021 3022 ether_stat(MACRCV_ERRORS, mac_rcv_error); 3023 ether_stat(TOOSHORT_ERRORS, runt); 3024 3025 /* always claim to be in full duplex mode */ 3026 case ETHER_STAT_LINK_DUPLEX: 3027 *val = LINK_DUPLEX_FULL; 3028 break; 3029 3030 /* always claim to be at 1Gb/s link speed */ 3031 case MAC_STAT_IFSPEED: 3032 *val = 1000000000ull; 3033 break; 3034 3035 default: 3036 mutex_exit(&xnfp->xnf_txlock); 3037 mutex_exit(&xnfp->xnf_rxlock); 3038 3039 return (ENOTSUP); 3040 } 3041 3042 #undef mac_stat 3043 #undef ether_stat 3044 3045 mutex_exit(&xnfp->xnf_txlock); 3046 mutex_exit(&xnfp->xnf_rxlock); 3047 3048 return (0); 3049 } 3050 3051 static int 3052 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu) 3053 { 3054 if (mtu > ETHERMTU) { 3055 if (!xnf_enable_tx_sg) { 3056 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3057 "because scatter-gather is disabled for transmit " 3058 "in driver settings", ETHERMTU); 3059 return (EINVAL); 3060 } else if (!xnf_enable_rx_sg) { 3061 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3062 "because scatter-gather is disabled for receive " 3063 "in driver settings", ETHERMTU); 3064 return (EINVAL); 3065 
} else if (!xnfp->xnf_be_tx_sg) { 3066 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3067 "because backend doesn't support scatter-gather", 3068 ETHERMTU); 3069 return (EINVAL); 3070 } 3071 if (mtu > XNF_MAXPKT) 3072 return (EINVAL); 3073 } 3074 int error = mac_maxsdu_update(xnfp->xnf_mh, mtu); 3075 if (error == 0) 3076 xnfp->xnf_mtu = mtu; 3077 3078 return (error); 3079 } 3080 3081 /*ARGSUSED*/ 3082 static int 3083 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id, 3084 uint_t prop_val_size, void *prop_val) 3085 { 3086 xnf_t *xnfp = data; 3087 3088 switch (prop_id) { 3089 case MAC_PROP_MTU: 3090 ASSERT(prop_val_size >= sizeof (uint32_t)); 3091 bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t)); 3092 break; 3093 default: 3094 return (ENOTSUP); 3095 } 3096 return (0); 3097 } 3098 3099 /*ARGSUSED*/ 3100 static int 3101 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id, 3102 uint_t prop_val_size, const void *prop_val) 3103 { 3104 xnf_t *xnfp = data; 3105 uint32_t new_mtu; 3106 int error; 3107 3108 switch (prop_id) { 3109 case MAC_PROP_MTU: 3110 ASSERT(prop_val_size >= sizeof (uint32_t)); 3111 bcopy(prop_val, &new_mtu, sizeof (new_mtu)); 3112 error = xnf_change_mtu(xnfp, new_mtu); 3113 break; 3114 default: 3115 return (ENOTSUP); 3116 } 3117 3118 return (error); 3119 } 3120 3121 /*ARGSUSED*/ 3122 static void 3123 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id, 3124 mac_prop_info_handle_t prop_handle) 3125 { 3126 switch (prop_id) { 3127 case MAC_PROP_MTU: 3128 mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT); 3129 break; 3130 default: 3131 break; 3132 } 3133 } 3134 3135 static boolean_t 3136 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data) 3137 { 3138 xnf_t *xnfp = arg; 3139 3140 switch (cap) { 3141 case MAC_CAPAB_HCKSUM: { 3142 uint32_t *capab = cap_data; 3143 3144 /* 3145 * Whilst the flag used to communicate with the IO 3146 * domain is called "NETTXF_csum_blank", the checksum 3147 * in the packet must contain the pseudo-header 3148 * checksum and not zero. 3149 * 3150 * To help out the IO domain, we might use 3151 * HCKSUM_INET_PARTIAL. Unfortunately our stack will 3152 * then use checksum offload for IPv6 packets, which 3153 * the IO domain can't handle. 3154 * 3155 * As a result, we declare outselves capable of 3156 * HCKSUM_INET_FULL_V4. This means that we receive 3157 * IPv4 packets from the stack with a blank checksum 3158 * field and must insert the pseudo-header checksum 3159 * before passing the packet to the IO domain. 3160 */ 3161 *capab = HCKSUM_INET_FULL_V4; 3162 3163 /* 3164 * TODO: query the "feature-ipv6-csum-offload" capability. 3165 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL. 3166 */ 3167 3168 break; 3169 } 3170 case MAC_CAPAB_LSO: { 3171 if (!xnfp->xnf_be_lso) 3172 return (B_FALSE); 3173 3174 mac_capab_lso_t *lso = cap_data; 3175 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 3176 lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET; 3177 break; 3178 } 3179 default: 3180 return (B_FALSE); 3181 } 3182 3183 return (B_TRUE); 3184 } 3185 3186 /* 3187 * The state of the peer has changed - react accordingly. 
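*
* In outline (a summary of the switch below, not an exhaustive state
* machine): XenbusStateInitWait causes us to read the backend
* configuration, connect and ask the MAC layer to re-query our
* capabilities; XenbusStateConnected marks the link up, wakes any threads
* blocked in xnf_send() and polls for responses that may have arrived
* before the state change was observed; all other states are ignored
* here.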
3188 */ 3189 static void 3190 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, 3191 void *arg, void *impl_data) 3192 { 3193 _NOTE(ARGUNUSED(id, arg)); 3194 xnf_t *xnfp = ddi_get_driver_private(dip); 3195 XenbusState new_state = *(XenbusState *)impl_data; 3196 3197 ASSERT(xnfp != NULL); 3198 3199 switch (new_state) { 3200 case XenbusStateUnknown: 3201 case XenbusStateInitialising: 3202 case XenbusStateInitialised: 3203 case XenbusStateClosing: 3204 case XenbusStateClosed: 3205 case XenbusStateReconfiguring: 3206 case XenbusStateReconfigured: 3207 break; 3208 3209 case XenbusStateInitWait: 3210 xnf_read_config(xnfp); 3211 3212 if (!xnfp->xnf_be_rx_copy) { 3213 cmn_err(CE_WARN, 3214 "The xnf driver requires a dom0 that " 3215 "supports 'feature-rx-copy'."); 3216 (void) xvdi_switch_state(xnfp->xnf_devinfo, 3217 XBT_NULL, XenbusStateClosed); 3218 break; 3219 } 3220 3221 /* 3222 * Connect to the backend. 3223 */ 3224 xnf_be_connect(xnfp); 3225 3226 /* 3227 * Our MAC address as discovered by xnf_read_config(). 3228 */ 3229 mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr); 3230 3231 /* 3232 * We do not know if some features such as LSO are supported 3233 * until we connect to the backend. We request the MAC layer 3234 * to poll our capabilities again. 3235 */ 3236 mac_capab_update(xnfp->xnf_mh); 3237 3238 break; 3239 3240 case XenbusStateConnected: 3241 mutex_enter(&xnfp->xnf_rxlock); 3242 mutex_enter(&xnfp->xnf_txlock); 3243 3244 xnfp->xnf_connected = B_TRUE; 3245 /* 3246 * Wake up any threads waiting to send data to 3247 * backend. 3248 */ 3249 cv_broadcast(&xnfp->xnf_cv_state); 3250 3251 mutex_exit(&xnfp->xnf_txlock); 3252 mutex_exit(&xnfp->xnf_rxlock); 3253 3254 /* 3255 * Kick the peer in case it missed any transmits 3256 * request in the TX ring. 3257 */ 3258 ec_notify_via_evtchn(xnfp->xnf_evtchn); 3259 3260 /* 3261 * There may already be completed receive requests in 3262 * the ring sent by backend after it gets connected 3263 * but before we see its state change here, so we call 3264 * xnf_intr() to handle them, if any. 3265 */ 3266 (void) xnf_intr((caddr_t)xnfp); 3267 3268 /* 3269 * Mark the link up now that we are connected. 3270 */ 3271 mac_link_update(xnfp->xnf_mh, LINK_STATE_UP); 3272 3273 /* 3274 * Tell the backend about the multicast addresses in 3275 * which we are interested. 3276 */ 3277 mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE); 3278 3279 break; 3280 3281 default: 3282 break; 3283 } 3284 } 3285
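/*
 * Observability note (illustrative, not part of the driver logic): the
 * auxiliary counters listed in xnf_aux_statistics[] can be read from
 * userland with kstat(1M), for example:
 *
 *	kstat -p xnf:0:aux_statistics
 *
 * The module name "xnf" and the kstat name "aux_statistics" come from the
 * kstat_create() call in xnf_kstat_init(); the instance number 0 is only
 * an example.
 */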