1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 2014, 2017 by Delphix. All rights reserved. 29 * Copyright 2020 RackTop Systems, Inc. 30 */ 31 32 /* 33 * 34 * Copyright (c) 2004 Christian Limpach. 35 * All rights reserved. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. This section intentionally left blank. 46 * 4. The name of the author may not be used to endorse or promote products 47 * derived from this software without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 50 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 51 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 52 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 53 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 54 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 55 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 56 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 57 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 58 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 59 */ 60 /* 61 * Section 3 of the above license was updated in response to bug 6379571. 62 */ 63 64 /* 65 * xnf.c - GLDv3 network driver for domU. 66 */ 67 68 /* 69 * This driver uses four per-instance locks: 70 * 71 * xnf_gref_lock: 72 * 73 * Protects access to the grant reference list stored in 74 * xnf_gref_head. Grant references should be acquired and released 75 * using gref_get() and gref_put() respectively. 76 * 77 * xnf_schedlock: 78 * 79 * Protects: 80 * xnf_need_sched - used to record that a previous transmit attempt 81 * failed (and consequently it will be necessary to call 82 * mac_tx_update() when transmit resources are available). 83 * xnf_pending_multicast - the number of multicast requests that 84 * have been submitted to the backend for which we have not 85 * processed responses. 
86 * 87 * xnf_txlock: 88 * 89 * Protects the transmit ring (xnf_tx_ring) and associated 90 * structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head). 91 * 92 * xnf_rxlock: 93 * 94 * Protects the receive ring (xnf_rx_ring) and associated 95 * structures (notably xnf_rx_pkt_info). 96 * 97 * If driver-global state that affects both the transmit and receive 98 * rings is manipulated, both xnf_txlock and xnf_rxlock should be 99 * held, in that order. 100 * 101 * xnf_schedlock is acquired both whilst holding xnf_txlock and 102 * without. It should always be acquired after xnf_txlock if both are 103 * held. 104 * 105 * Notes: 106 * - atomic_add_64() is used to manipulate counters where we require 107 * accuracy. For counters intended only for observation by humans, 108 * post increment/decrement are used instead. 109 */ 110 111 #include <sys/types.h> 112 #include <sys/errno.h> 113 #include <sys/param.h> 114 #include <sys/sysmacros.h> 115 #include <sys/systm.h> 116 #include <sys/stream.h> 117 #include <sys/strsubr.h> 118 #include <sys/strsun.h> 119 #include <sys/conf.h> 120 #include <sys/ddi.h> 121 #include <sys/devops.h> 122 #include <sys/sunddi.h> 123 #include <sys/sunndi.h> 124 #include <sys/dlpi.h> 125 #include <sys/ethernet.h> 126 #include <sys/strsun.h> 127 #include <sys/pattr.h> 128 #include <inet/ip.h> 129 #include <inet/ip_impl.h> 130 #include <inet/tcp.h> 131 #include <netinet/udp.h> 132 #include <sys/gld.h> 133 #include <sys/modctl.h> 134 #include <sys/mac_provider.h> 135 #include <sys/mac_ether.h> 136 #include <sys/bootinfo.h> 137 #include <sys/mach_mmu.h> 138 #ifdef XPV_HVM_DRIVER 139 #include <sys/xpv_support.h> 140 #include <sys/hypervisor.h> 141 #else 142 #include <sys/hypervisor.h> 143 #include <sys/evtchn_impl.h> 144 #include <sys/balloon_impl.h> 145 #endif 146 #include <xen/public/io/netif.h> 147 #include <sys/gnttab.h> 148 #include <xen/sys/xendev.h> 149 #include <sys/sdt.h> 150 #include <sys/note.h> 151 #include <sys/debug.h> 152 153 #include <io/xnf.h> 154 155 /* 156 * On a 32 bit PAE system physical and machine addresses are larger 157 * than 32 bits. ddi_btop() on such systems take an unsigned long 158 * argument, and so addresses above 4G are truncated before ddi_btop() 159 * gets to see them. To avoid this, code the shift operation here. 160 */ 161 #define xnf_btop(addr) ((addr) >> PAGESHIFT) 162 163 /* 164 * The parameters below should only be changed in /etc/system, never in mdb. 165 */ 166 167 /* 168 * Should we use the multicast control feature if the backend provides 169 * it? 170 */ 171 boolean_t xnf_multicast_control = B_TRUE; 172 173 /* 174 * Should we allow scatter-gather for tx if backend allows it? 175 */ 176 boolean_t xnf_enable_tx_sg = B_TRUE; 177 178 /* 179 * Should we allow scatter-gather for rx if backend allows it? 180 */ 181 boolean_t xnf_enable_rx_sg = B_TRUE; 182 183 /* 184 * Should we allow lso for tx sends if backend allows it? 185 * Requires xnf_enable_tx_sg to be also set to TRUE. 186 */ 187 boolean_t xnf_enable_lso = B_TRUE; 188 189 /* 190 * Should we allow lro on rx if backend supports it? 191 * Requires xnf_enable_rx_sg to be also set to TRUE. 192 * 193 * !! WARNING !! 194 * LRO is not yet supported in the OS so this should be left as FALSE. 195 * !! WARNING !! 196 */ 197 boolean_t xnf_enable_lro = B_FALSE; 198 199 /* 200 * Received packets below this size are copied to a new streams buffer 201 * rather than being desballoc'ed. 202 * 203 * This value is chosen to accommodate traffic where there are a large 204 * number of small packets. 
For data showing a typical distribution, 205 * see: 206 * 207 * Sinha07a: 208 * Rishi Sinha, Christos Papadopoulos, and John 209 * Heidemann. Internet Packet Size Distributions: Some 210 * Observations. Technical Report ISI-TR-2007-643, 211 * USC/Information Sciences Institute, May, 2007. Orignally 212 * released October 2005 as web page 213 * http://netweb.usc.edu/~sinha/pkt-sizes/. 214 * <http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>. 215 */ 216 size_t xnf_rx_copy_limit = 64; 217 218 #define INVALID_GRANT_HANDLE ((grant_handle_t)-1) 219 #define INVALID_GRANT_REF ((grant_ref_t)-1) 220 #define INVALID_TX_ID ((uint16_t)-1) 221 222 #define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)])) 223 #define TX_ID_VALID(i) \ 224 (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE)) 225 226 /* 227 * calculate how many pages are spanned by an mblk fragment 228 */ 229 #define xnf_mblk_pages(mp) (MBLKL(mp) == 0 ? 0 : \ 230 xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1) 231 232 /* Required system entry points */ 233 static int xnf_attach(dev_info_t *, ddi_attach_cmd_t); 234 static int xnf_detach(dev_info_t *, ddi_detach_cmd_t); 235 236 /* Required driver entry points for Nemo */ 237 static int xnf_start(void *); 238 static void xnf_stop(void *); 239 static int xnf_set_mac_addr(void *, const uint8_t *); 240 static int xnf_set_multicast(void *, boolean_t, const uint8_t *); 241 static int xnf_set_promiscuous(void *, boolean_t); 242 static mblk_t *xnf_send(void *, mblk_t *); 243 static uint_t xnf_intr(caddr_t); 244 static int xnf_stat(void *, uint_t, uint64_t *); 245 static boolean_t xnf_getcapab(void *, mac_capab_t, void *); 246 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 247 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t, 248 const void *); 249 static void xnf_propinfo(void *, const char *, mac_prop_id_t, 250 mac_prop_info_handle_t); 251 252 /* Driver private functions */ 253 static int xnf_alloc_dma_resources(xnf_t *); 254 static void xnf_release_dma_resources(xnf_t *); 255 static void xnf_release_mblks(xnf_t *); 256 257 static int xnf_buf_constructor(void *, void *, int); 258 static void xnf_buf_destructor(void *, void *); 259 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t); 260 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t); 261 static void xnf_buf_refresh(xnf_buf_t *); 262 static void xnf_buf_recycle(xnf_buf_t *); 263 264 static int xnf_tx_buf_constructor(void *, void *, int); 265 static void xnf_tx_buf_destructor(void *, void *); 266 267 static grant_ref_t xnf_gref_get(xnf_t *); 268 static void xnf_gref_put(xnf_t *, grant_ref_t); 269 270 static xnf_txid_t *xnf_txid_get(xnf_t *); 271 static void xnf_txid_put(xnf_t *, xnf_txid_t *); 272 273 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *); 274 static int xnf_tx_clean_ring(xnf_t *); 275 static void oe_state_change(dev_info_t *, ddi_eventcookie_t, 276 void *, void *); 277 static boolean_t xnf_kstat_init(xnf_t *); 278 static void xnf_rx_collect(xnf_t *); 279 280 #define XNF_CALLBACK_FLAGS (MC_GETCAPAB | MC_PROPERTIES) 281 282 static mac_callbacks_t xnf_callbacks = { 283 .mc_callbacks = XNF_CALLBACK_FLAGS, 284 .mc_getstat = xnf_stat, 285 .mc_start = xnf_start, 286 .mc_stop = xnf_stop, 287 .mc_setpromisc = xnf_set_promiscuous, 288 .mc_multicst = xnf_set_multicast, 289 .mc_unicst = xnf_set_mac_addr, 290 .mc_tx = xnf_send, 291 .mc_getcapab = xnf_getcapab, 292 .mc_setprop = xnf_setprop, 293 .mc_getprop = xnf_getprop, 294 .mc_propinfo = xnf_propinfo, 295 }; 296 
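/*
 * Note that XNF_CALLBACK_FLAGS advertises only the optional entry
 * points we implement (capability and property callbacks); the other
 * members above are the mandatory GLDv3 entry points.
 */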
297 /* DMA attributes for network ring buffer */ 298 static ddi_dma_attr_t ringbuf_dma_attr = { 299 .dma_attr_version = DMA_ATTR_V0, 300 .dma_attr_addr_lo = 0, 301 .dma_attr_addr_hi = 0xffffffffffffffffULL, 302 .dma_attr_count_max = 0x7fffffff, 303 .dma_attr_align = MMU_PAGESIZE, 304 .dma_attr_burstsizes = 0x7ff, 305 .dma_attr_minxfer = 1, 306 .dma_attr_maxxfer = 0xffffffffU, 307 .dma_attr_seg = 0xffffffffffffffffULL, 308 .dma_attr_sgllen = 1, 309 .dma_attr_granular = 1, 310 .dma_attr_flags = 0 311 }; 312 313 /* DMA attributes for receive data */ 314 static ddi_dma_attr_t rx_buf_dma_attr = { 315 .dma_attr_version = DMA_ATTR_V0, 316 .dma_attr_addr_lo = 0, 317 .dma_attr_addr_hi = 0xffffffffffffffffULL, 318 .dma_attr_count_max = MMU_PAGEOFFSET, 319 .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */ 320 .dma_attr_burstsizes = 0x7ff, 321 .dma_attr_minxfer = 1, 322 .dma_attr_maxxfer = 0xffffffffU, 323 .dma_attr_seg = 0xffffffffffffffffULL, 324 .dma_attr_sgllen = 1, 325 .dma_attr_granular = 1, 326 .dma_attr_flags = 0 327 }; 328 329 /* DMA attributes for transmit data */ 330 static ddi_dma_attr_t tx_buf_dma_attr = { 331 .dma_attr_version = DMA_ATTR_V0, 332 .dma_attr_addr_lo = 0, 333 .dma_attr_addr_hi = 0xffffffffffffffffULL, 334 .dma_attr_count_max = MMU_PAGEOFFSET, 335 .dma_attr_align = 1, 336 .dma_attr_burstsizes = 0x7ff, 337 .dma_attr_minxfer = 1, 338 .dma_attr_maxxfer = 0xffffffffU, 339 .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */ 340 .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */ 341 .dma_attr_granular = 1, 342 .dma_attr_flags = 0 343 }; 344 345 /* DMA access attributes for registers and descriptors */ 346 static ddi_device_acc_attr_t accattr = { 347 DDI_DEVICE_ATTR_V0, 348 DDI_STRUCTURE_LE_ACC, /* This is a little-endian device */ 349 DDI_STRICTORDER_ACC 350 }; 351 352 /* DMA access attributes for data: NOT to be byte swapped. */ 353 static ddi_device_acc_attr_t data_accattr = { 354 DDI_DEVICE_ATTR_V0, 355 DDI_NEVERSWAP_ACC, 356 DDI_STRICTORDER_ACC 357 }; 358 359 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach, 360 nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported); 361 362 static struct modldrv xnf_modldrv = { 363 &mod_driverops, 364 "Virtual Ethernet driver", 365 &xnf_dev_ops 366 }; 367 368 static struct modlinkage modlinkage = { 369 MODREV_1, &xnf_modldrv, NULL 370 }; 371 372 int 373 _init(void) 374 { 375 int r; 376 377 mac_init_ops(&xnf_dev_ops, "xnf"); 378 r = mod_install(&modlinkage); 379 if (r != DDI_SUCCESS) 380 mac_fini_ops(&xnf_dev_ops); 381 382 return (r); 383 } 384 385 int 386 _fini(void) 387 { 388 return (EBUSY); /* XXPV should be removable */ 389 } 390 391 int 392 _info(struct modinfo *modinfop) 393 { 394 return (mod_info(&modlinkage, modinfop)); 395 } 396 397 /* 398 * Acquire a grant reference. 
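 *
 * A sketch of the typical life-cycle, as used by xnf_mblk_map() and
 * xnf_data_txbuf_free() below:
 *
 *	gref = xnf_gref_get(xnfp);
 *	gnttab_grant_foreign_access_ref(gref, oeid, mfn, 1);
 *	...			(backend maps/reads the page)
 *	(void) gnttab_end_foreign_access_ref(gref, 1);
 *	xnf_gref_put(xnfp, gref);
 *
 * References are claimed from xnf_gref_head; when the list runs dry it
 * is topped up in blocks of 16 via gnttab_alloc_grant_references().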
399 */ 400 static grant_ref_t 401 xnf_gref_get(xnf_t *xnfp) 402 { 403 grant_ref_t gref; 404 405 mutex_enter(&xnfp->xnf_gref_lock); 406 407 do { 408 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head); 409 410 } while ((gref == INVALID_GRANT_REF) && 411 (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0)); 412 413 mutex_exit(&xnfp->xnf_gref_lock); 414 415 if (gref == INVALID_GRANT_REF) { 416 xnfp->xnf_stat_gref_failure++; 417 } else { 418 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding); 419 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak) 420 xnfp->xnf_stat_gref_peak = 421 xnfp->xnf_stat_gref_outstanding; 422 } 423 424 return (gref); 425 } 426 427 /* 428 * Release a grant reference. 429 */ 430 static void 431 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref) 432 { 433 ASSERT(gref != INVALID_GRANT_REF); 434 435 mutex_enter(&xnfp->xnf_gref_lock); 436 gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref); 437 mutex_exit(&xnfp->xnf_gref_lock); 438 439 atomic_dec_64(&xnfp->xnf_stat_gref_outstanding); 440 } 441 442 /* 443 * Acquire a transmit id. 444 */ 445 static xnf_txid_t * 446 xnf_txid_get(xnf_t *xnfp) 447 { 448 xnf_txid_t *tidp; 449 450 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 451 452 if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID) 453 return (NULL); 454 455 ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head)); 456 457 tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head); 458 xnfp->xnf_tx_pkt_id_head = tidp->next; 459 tidp->next = INVALID_TX_ID; 460 461 ASSERT(tidp->txbuf == NULL); 462 463 return (tidp); 464 } 465 466 /* 467 * Release a transmit id. 468 */ 469 static void 470 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp) 471 { 472 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 473 ASSERT(TX_ID_VALID(tidp->id)); 474 ASSERT(tidp->next == INVALID_TX_ID); 475 476 tidp->txbuf = NULL; 477 tidp->next = xnfp->xnf_tx_pkt_id_head; 478 xnfp->xnf_tx_pkt_id_head = tidp->id; 479 } 480 481 static void 482 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp) 483 { 484 ASSERT3U(txp->tx_type, ==, TX_DATA); 485 486 /* 487 * We are either using a lookaside buffer or we are mapping existing 488 * buffers. 489 */ 490 if (txp->tx_bdesc != NULL) { 491 ASSERT(!txp->tx_handle_bound); 492 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE); 493 } else { 494 if (txp->tx_txreq.gref != INVALID_GRANT_REF) { 495 if (gnttab_query_foreign_access(txp->tx_txreq.gref) != 496 0) { 497 cmn_err(CE_PANIC, "tx grant %d still in use by " 498 "backend domain", txp->tx_txreq.gref); 499 } 500 (void) gnttab_end_foreign_access_ref( 501 txp->tx_txreq.gref, 1); 502 xnf_gref_put(xnfp, txp->tx_txreq.gref); 503 } 504 505 if (txp->tx_handle_bound) 506 (void) ddi_dma_unbind_handle(txp->tx_dma_handle); 507 } 508 509 if (txp->tx_mp != NULL) 510 freemsg(txp->tx_mp); 511 512 if (txp->tx_prev != NULL) { 513 ASSERT3P(txp->tx_prev->tx_next, ==, txp); 514 txp->tx_prev->tx_next = NULL; 515 } 516 517 if (txp->tx_txreq.id != INVALID_TX_ID) { 518 /* 519 * This should be only possible when resuming from a suspend. 520 */ 521 ASSERT(!xnfp->xnf_connected); 522 xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id)); 523 txp->tx_txreq.id = INVALID_TX_ID; 524 } 525 526 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 527 } 528 529 static void 530 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp) 531 { 532 if (txp == NULL) 533 return; 534 535 while (txp->tx_next != NULL) 536 txp = txp->tx_next; 537 538 /* 539 * We free the chain in reverse order so that grants can be released 540 * for all dma chunks before unbinding the dma handles. 
The mblk is 541 * freed last, after all its fragments' dma handles are unbound. 542 */ 543 xnf_txbuf_t *prev; 544 for (; txp != NULL; txp = prev) { 545 prev = txp->tx_prev; 546 xnf_data_txbuf_free(xnfp, txp); 547 } 548 } 549 550 static xnf_txbuf_t * 551 xnf_data_txbuf_alloc(xnf_t *xnfp, int flag) 552 { 553 xnf_txbuf_t *txp; 554 555 if ((txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, flag)) == NULL) { 556 return (NULL); 557 } 558 559 txp->tx_type = TX_DATA; 560 txp->tx_next = NULL; 561 txp->tx_prev = NULL; 562 txp->tx_head = txp; 563 txp->tx_frags_to_ack = 0; 564 txp->tx_mp = NULL; 565 txp->tx_bdesc = NULL; 566 txp->tx_handle_bound = B_FALSE; 567 txp->tx_txreq.gref = INVALID_GRANT_REF; 568 txp->tx_txreq.id = INVALID_TX_ID; 569 570 return (txp); 571 } 572 573 /* 574 * Get `wanted' slots in the transmit ring, waiting for at least that 575 * number if `wait' is B_TRUE. Force the ring to be cleaned by setting 576 * `wanted' to zero. 577 * 578 * Return the number of slots available. 579 */ 580 static int 581 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait) 582 { 583 int slotsfree; 584 boolean_t forced_clean = (wanted == 0); 585 586 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 587 588 /* LINTED: constant in conditional context */ 589 while (B_TRUE) { 590 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring); 591 592 if ((slotsfree < wanted) || forced_clean) 593 slotsfree = xnf_tx_clean_ring(xnfp); 594 595 /* 596 * If there are more than we need free, tell other 597 * people to come looking again. We hold txlock, so we 598 * are able to take our slots before anyone else runs. 599 */ 600 if (slotsfree > wanted) 601 cv_broadcast(&xnfp->xnf_cv_tx_slots); 602 603 if (slotsfree >= wanted) 604 break; 605 606 if (!wait) 607 break; 608 609 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock); 610 } 611 612 ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring))); 613 614 return (slotsfree); 615 } 616 617 static int 618 xnf_setup_rings(xnf_t *xnfp) 619 { 620 domid_t oeid; 621 struct xenbus_device *xsd; 622 RING_IDX i; 623 int err; 624 xnf_txid_t *tidp; 625 xnf_buf_t **bdescp; 626 627 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 628 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 629 630 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 631 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 632 633 err = gnttab_grant_foreign_access(oeid, 634 xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0); 635 if (err <= 0) { 636 err = -err; 637 xenbus_dev_error(xsd, err, "granting access to tx ring page"); 638 goto out; 639 } 640 xnfp->xnf_tx_ring_ref = (grant_ref_t)err; 641 642 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 643 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 644 645 err = gnttab_grant_foreign_access(oeid, 646 xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0); 647 if (err <= 0) { 648 err = -err; 649 xenbus_dev_error(xsd, err, "granting access to rx ring page"); 650 goto out; 651 } 652 xnfp->xnf_rx_ring_ref = (grant_ref_t)err; 653 654 mutex_enter(&xnfp->xnf_txlock); 655 656 /* 657 * We first cleanup the TX ring in case we are doing a resume. 658 * Note that this can lose packets, but we expect to stagger on. 659 */ 660 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. 
*/ 661 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 662 i < NET_TX_RING_SIZE; 663 i++, tidp++) { 664 xnf_txbuf_t *txp = tidp->txbuf; 665 if (txp == NULL) 666 continue; 667 668 switch (txp->tx_type) { 669 case TX_DATA: 670 /* 671 * txid_put() will be called for each txbuf's txid in 672 * the chain which will result in clearing tidp->txbuf. 673 */ 674 xnf_data_txbuf_free_chain(xnfp, txp); 675 676 break; 677 678 case TX_MCAST_REQ: 679 txp->tx_type = TX_MCAST_RSP; 680 txp->tx_status = NETIF_RSP_DROPPED; 681 cv_broadcast(&xnfp->xnf_cv_multicast); 682 683 /* 684 * The request consumed two slots in the ring, 685 * yet only a single xnf_txid_t is used. Step 686 * over the empty slot. 687 */ 688 i++; 689 ASSERT3U(i, <, NET_TX_RING_SIZE); 690 break; 691 692 case TX_MCAST_RSP: 693 break; 694 } 695 } 696 697 /* 698 * Now purge old list and add each txid to the new free list. 699 */ 700 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */ 701 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 702 i < NET_TX_RING_SIZE; 703 i++, tidp++) { 704 tidp->id = i; 705 ASSERT3P(tidp->txbuf, ==, NULL); 706 tidp->next = INVALID_TX_ID; /* Appease txid_put(). */ 707 xnf_txid_put(xnfp, tidp); 708 } 709 710 /* LINTED: constant in conditional context */ 711 SHARED_RING_INIT(xnfp->xnf_tx_ring.sring); 712 /* LINTED: constant in conditional context */ 713 FRONT_RING_INIT(&xnfp->xnf_tx_ring, 714 xnfp->xnf_tx_ring.sring, PAGESIZE); 715 716 mutex_exit(&xnfp->xnf_txlock); 717 718 mutex_enter(&xnfp->xnf_rxlock); 719 720 /* 721 * Clean out any buffers currently posted to the receive ring 722 * before we reset it. 723 */ 724 for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0]; 725 i < NET_RX_RING_SIZE; 726 i++, bdescp++) { 727 if (*bdescp != NULL) { 728 xnf_buf_put(xnfp, *bdescp, B_FALSE); 729 *bdescp = NULL; 730 } 731 } 732 733 /* LINTED: constant in conditional context */ 734 SHARED_RING_INIT(xnfp->xnf_rx_ring.sring); 735 /* LINTED: constant in conditional context */ 736 FRONT_RING_INIT(&xnfp->xnf_rx_ring, 737 xnfp->xnf_rx_ring.sring, PAGESIZE); 738 739 /* 740 * Fill the ring with buffers. 741 */ 742 for (i = 0; i < NET_RX_RING_SIZE; i++) { 743 xnf_buf_t *bdesc; 744 745 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE); 746 VERIFY(bdesc != NULL); 747 xnf_rxbuf_hang(xnfp, bdesc); 748 } 749 750 /* LINTED: constant in conditional context */ 751 RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring); 752 753 mutex_exit(&xnfp->xnf_rxlock); 754 755 return (0); 756 757 out: 758 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 759 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 760 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 761 762 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 763 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 764 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 765 766 return (err); 767 } 768 769 /* 770 * Connect driver to back end, called to set up communication with 771 * back end driver both initially and on resume after restore/migrate. 
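 *
 * In outline, the handshake below is:
 *
 *   1. Grant the backend access to the tx/rx ring pages
 *	(xnf_setup_rings()).
 *   2. Within a single xenbus transaction, publish "tx-ring-ref",
 *	"rx-ring-ref", "event-channel" and the feature/request keys
 *	describing our capabilities.
 *   3. Switch to XenbusStateConnected; the transaction is retried if
 *	it completes with EAGAIN.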
772 */ 773 void 774 xnf_be_connect(xnf_t *xnfp) 775 { 776 const char *message; 777 xenbus_transaction_t xbt; 778 struct xenbus_device *xsd; 779 char *xsname; 780 int err; 781 782 ASSERT(!xnfp->xnf_connected); 783 784 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 785 xsname = xvdi_get_xsname(xnfp->xnf_devinfo); 786 787 err = xnf_setup_rings(xnfp); 788 if (err != 0) { 789 cmn_err(CE_WARN, "failed to set up tx/rx rings"); 790 xenbus_dev_error(xsd, err, "setting up ring"); 791 return; 792 } 793 794 again: 795 err = xenbus_transaction_start(&xbt); 796 if (err != 0) { 797 xenbus_dev_error(xsd, EIO, "starting transaction"); 798 return; 799 } 800 801 err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u", 802 xnfp->xnf_tx_ring_ref); 803 if (err != 0) { 804 message = "writing tx ring-ref"; 805 goto abort_transaction; 806 } 807 808 err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u", 809 xnfp->xnf_rx_ring_ref); 810 if (err != 0) { 811 message = "writing rx ring-ref"; 812 goto abort_transaction; 813 } 814 815 err = xenbus_printf(xbt, xsname, "event-channel", "%u", 816 xnfp->xnf_evtchn); 817 if (err != 0) { 818 message = "writing event-channel"; 819 goto abort_transaction; 820 } 821 822 err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1); 823 if (err != 0) { 824 message = "writing feature-rx-notify"; 825 goto abort_transaction; 826 } 827 828 err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1); 829 if (err != 0) { 830 message = "writing request-rx-copy"; 831 goto abort_transaction; 832 } 833 834 if (xnfp->xnf_be_mcast_control) { 835 err = xenbus_printf(xbt, xsname, "request-multicast-control", 836 "%d", 1); 837 if (err != 0) { 838 message = "writing request-multicast-control"; 839 goto abort_transaction; 840 } 841 } 842 843 /* 844 * Tell backend if we support scatter-gather lists on the rx side. 845 */ 846 err = xenbus_printf(xbt, xsname, "feature-sg", "%d", 847 xnf_enable_rx_sg ? 1 : 0); 848 if (err != 0) { 849 message = "writing feature-sg"; 850 goto abort_transaction; 851 } 852 853 /* 854 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is 855 * a prerequisite. 856 */ 857 err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d", 858 (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0); 859 if (err != 0) { 860 message = "writing feature-gso-tcpv4"; 861 goto abort_transaction; 862 } 863 864 err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected); 865 if (err != 0) { 866 message = "switching state to XenbusStateConnected"; 867 goto abort_transaction; 868 } 869 870 err = xenbus_transaction_end(xbt, 0); 871 if (err != 0) { 872 if (err == EAGAIN) 873 goto again; 874 xenbus_dev_error(xsd, err, "completing transaction"); 875 } 876 877 return; 878 879 abort_transaction: 880 (void) xenbus_transaction_end(xbt, 1); 881 xenbus_dev_error(xsd, err, "%s", message); 882 } 883 884 /* 885 * Read configuration information from xenstore. 886 */ 887 void 888 xnf_read_config(xnf_t *xnfp) 889 { 890 int err, be_cap; 891 char mac[ETHERADDRL * 3]; 892 char *oename = xvdi_get_oename(xnfp->xnf_devinfo); 893 894 err = xenbus_scanf(XBT_NULL, oename, "mac", 895 "%s", (char *)&mac[0]); 896 if (err != 0) { 897 /* 898 * bad: we're supposed to be set up with a proper mac 899 * addr. 
at this point 900 */ 901 cmn_err(CE_WARN, "%s%d: no mac address", 902 ddi_driver_name(xnfp->xnf_devinfo), 903 ddi_get_instance(xnfp->xnf_devinfo)); 904 return; 905 } 906 if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) { 907 err = ENOENT; 908 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT, 909 "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo)); 910 return; 911 } 912 913 err = xenbus_scanf(XBT_NULL, oename, 914 "feature-rx-copy", "%d", &be_cap); 915 /* 916 * If we fail to read the store we assume that the key is 917 * absent, implying an older domain at the far end. Older 918 * domains cannot do HV copy. 919 */ 920 if (err != 0) 921 be_cap = 0; 922 xnfp->xnf_be_rx_copy = (be_cap != 0); 923 924 err = xenbus_scanf(XBT_NULL, oename, 925 "feature-multicast-control", "%d", &be_cap); 926 /* 927 * If we fail to read the store we assume that the key is 928 * absent, implying an older domain at the far end. Older 929 * domains do not support multicast control. 930 */ 931 if (err != 0) 932 be_cap = 0; 933 xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control; 934 935 /* 936 * See if back-end supports scatter-gather for transmits. If not, 937 * we will not support LSO and limit the mtu to 1500. 938 */ 939 err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap); 940 if (err != 0) { 941 be_cap = 0; 942 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 943 "'feature-sg' from backend driver"); 944 } 945 if (be_cap == 0) { 946 dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not " 947 "supported for transmits in the backend driver. LSO is " 948 "disabled and MTU is restricted to 1500 bytes."); 949 } 950 xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg; 951 952 if (xnfp->xnf_be_tx_sg) { 953 /* 954 * Check if LSO is supported. Currently we only check for 955 * IPv4 as Illumos doesn't support LSO for IPv6. 956 */ 957 err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d", 958 &be_cap); 959 if (err != 0) { 960 be_cap = 0; 961 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 962 "'feature-gso-tcpv4' from backend driver"); 963 } 964 if (be_cap == 0) { 965 dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not " 966 "supported by the backend driver. 
Performance " 967 "will be affected."); 968 } 969 xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso; 970 } 971 } 972 973 /* 974 * attach(9E) -- Attach a device to the system 975 */ 976 static int 977 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) 978 { 979 mac_register_t *macp; 980 xnf_t *xnfp; 981 int err; 982 char cachename[32]; 983 984 switch (cmd) { 985 case DDI_RESUME: 986 xnfp = ddi_get_driver_private(devinfo); 987 xnfp->xnf_gen++; 988 989 (void) xvdi_resume(devinfo); 990 (void) xvdi_alloc_evtchn(devinfo); 991 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 992 #ifdef XPV_HVM_DRIVER 993 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, 994 xnfp); 995 #else 996 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, 997 (caddr_t)xnfp); 998 #endif 999 return (DDI_SUCCESS); 1000 1001 case DDI_ATTACH: 1002 break; 1003 1004 default: 1005 return (DDI_FAILURE); 1006 } 1007 1008 /* 1009 * Allocate gld_mac_info_t and xnf_instance structures 1010 */ 1011 macp = mac_alloc(MAC_VERSION); 1012 if (macp == NULL) 1013 return (DDI_FAILURE); 1014 xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP); 1015 1016 xnfp->xnf_tx_pkt_id = 1017 kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP); 1018 1019 xnfp->xnf_rx_pkt_info = 1020 kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP); 1021 1022 macp->m_dip = devinfo; 1023 macp->m_driver = xnfp; 1024 xnfp->xnf_devinfo = devinfo; 1025 1026 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1027 macp->m_src_addr = xnfp->xnf_mac_addr; 1028 macp->m_callbacks = &xnf_callbacks; 1029 macp->m_min_sdu = 0; 1030 xnfp->xnf_mtu = ETHERMTU; 1031 macp->m_max_sdu = xnfp->xnf_mtu; 1032 1033 xnfp->xnf_running = B_FALSE; 1034 xnfp->xnf_connected = B_FALSE; 1035 xnfp->xnf_be_rx_copy = B_FALSE; 1036 xnfp->xnf_be_mcast_control = B_FALSE; 1037 xnfp->xnf_need_sched = B_FALSE; 1038 1039 xnfp->xnf_rx_head = NULL; 1040 xnfp->xnf_rx_tail = NULL; 1041 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 1042 1043 #ifdef XPV_HVM_DRIVER 1044 /* Report our version to dom0 */ 1045 (void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d", 1046 HVMPV_XNF_VERS); 1047 #endif 1048 1049 /* 1050 * Get the iblock cookie with which to initialize the mutexes. 
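 * The cookie fixes the interrupt priority of each mutex so that the
 * locks described at the top of this file may also be taken from the
 * interrupt handler, xnf_intr().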
1051 */ 1052 if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie) 1053 != DDI_SUCCESS) 1054 goto failure; 1055 1056 mutex_init(&xnfp->xnf_txlock, 1057 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1058 mutex_init(&xnfp->xnf_rxlock, 1059 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1060 mutex_init(&xnfp->xnf_schedlock, 1061 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1062 mutex_init(&xnfp->xnf_gref_lock, 1063 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1064 1065 cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL); 1066 cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL); 1067 cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL); 1068 1069 (void) sprintf(cachename, "xnf_buf_cache_%d", 1070 ddi_get_instance(devinfo)); 1071 xnfp->xnf_buf_cache = kmem_cache_create(cachename, 1072 sizeof (xnf_buf_t), 0, 1073 xnf_buf_constructor, xnf_buf_destructor, 1074 NULL, xnfp, NULL, 0); 1075 if (xnfp->xnf_buf_cache == NULL) 1076 goto failure_0; 1077 1078 (void) sprintf(cachename, "xnf_tx_buf_cache_%d", 1079 ddi_get_instance(devinfo)); 1080 xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename, 1081 sizeof (xnf_txbuf_t), 0, 1082 xnf_tx_buf_constructor, xnf_tx_buf_destructor, 1083 NULL, xnfp, NULL, 0); 1084 if (xnfp->xnf_tx_buf_cache == NULL) 1085 goto failure_1; 1086 1087 xnfp->xnf_gref_head = INVALID_GRANT_REF; 1088 1089 if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) { 1090 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize " 1091 "driver data structures", 1092 ddi_get_instance(xnfp->xnf_devinfo)); 1093 goto failure_2; 1094 } 1095 1096 xnfp->xnf_rx_ring.sring->rsp_event = 1097 xnfp->xnf_tx_ring.sring->rsp_event = 1; 1098 1099 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 1100 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 1101 1102 /* set driver private pointer now */ 1103 ddi_set_driver_private(devinfo, xnfp); 1104 1105 if (!xnf_kstat_init(xnfp)) 1106 goto failure_3; 1107 1108 /* 1109 * Allocate an event channel, add the interrupt handler and 1110 * bind it to the event channel. 1111 */ 1112 (void) xvdi_alloc_evtchn(devinfo); 1113 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 1114 #ifdef XPV_HVM_DRIVER 1115 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp); 1116 #else 1117 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp); 1118 #endif 1119 1120 err = mac_register(macp, &xnfp->xnf_mh); 1121 mac_free(macp); 1122 macp = NULL; 1123 if (err != 0) 1124 goto failure_4; 1125 1126 if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL) 1127 != DDI_SUCCESS) 1128 goto failure_5; 1129 1130 #ifdef XPV_HVM_DRIVER 1131 /* 1132 * In the HVM case, this driver essentially replaces a driver for 1133 * a 'real' PCI NIC. Without the "model" property set to 1134 * "Ethernet controller", like the PCI code does, netbooting does 1135 * not work correctly, as strplumb_get_netdev_path() will not find 1136 * this interface. 
1137 */ 1138 (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model", 1139 "Ethernet controller"); 1140 #endif 1141 1142 return (DDI_SUCCESS); 1143 1144 failure_5: 1145 (void) mac_unregister(xnfp->xnf_mh); 1146 1147 failure_4: 1148 #ifdef XPV_HVM_DRIVER 1149 ec_unbind_evtchn(xnfp->xnf_evtchn); 1150 xvdi_free_evtchn(devinfo); 1151 #else 1152 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1153 #endif 1154 xnfp->xnf_evtchn = INVALID_EVTCHN; 1155 kstat_delete(xnfp->xnf_kstat_aux); 1156 1157 failure_3: 1158 xnf_release_dma_resources(xnfp); 1159 1160 failure_2: 1161 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1162 1163 failure_1: 1164 kmem_cache_destroy(xnfp->xnf_buf_cache); 1165 1166 failure_0: 1167 cv_destroy(&xnfp->xnf_cv_tx_slots); 1168 cv_destroy(&xnfp->xnf_cv_multicast); 1169 cv_destroy(&xnfp->xnf_cv_state); 1170 1171 mutex_destroy(&xnfp->xnf_gref_lock); 1172 mutex_destroy(&xnfp->xnf_schedlock); 1173 mutex_destroy(&xnfp->xnf_rxlock); 1174 mutex_destroy(&xnfp->xnf_txlock); 1175 1176 failure: 1177 kmem_free(xnfp, sizeof (*xnfp)); 1178 if (macp != NULL) 1179 mac_free(macp); 1180 1181 return (DDI_FAILURE); 1182 } 1183 1184 /* detach(9E) -- Detach a device from the system */ 1185 static int 1186 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) 1187 { 1188 xnf_t *xnfp; /* Our private device info */ 1189 1190 xnfp = ddi_get_driver_private(devinfo); 1191 1192 switch (cmd) { 1193 case DDI_SUSPEND: 1194 #ifdef XPV_HVM_DRIVER 1195 ec_unbind_evtchn(xnfp->xnf_evtchn); 1196 xvdi_free_evtchn(devinfo); 1197 #else 1198 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1199 #endif 1200 1201 xvdi_suspend(devinfo); 1202 1203 mutex_enter(&xnfp->xnf_rxlock); 1204 mutex_enter(&xnfp->xnf_txlock); 1205 1206 xnfp->xnf_evtchn = INVALID_EVTCHN; 1207 xnfp->xnf_connected = B_FALSE; 1208 mutex_exit(&xnfp->xnf_txlock); 1209 mutex_exit(&xnfp->xnf_rxlock); 1210 1211 /* claim link to be down after disconnect */ 1212 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN); 1213 return (DDI_SUCCESS); 1214 1215 case DDI_DETACH: 1216 break; 1217 1218 default: 1219 return (DDI_FAILURE); 1220 } 1221 1222 if (xnfp->xnf_connected) 1223 return (DDI_FAILURE); 1224 1225 /* 1226 * Cannot detach if we have xnf_buf_t outstanding. 1227 */ 1228 if (xnfp->xnf_stat_buf_allocated > 0) 1229 return (DDI_FAILURE); 1230 1231 if (mac_unregister(xnfp->xnf_mh) != 0) 1232 return (DDI_FAILURE); 1233 1234 kstat_delete(xnfp->xnf_kstat_aux); 1235 1236 /* Stop the receiver */ 1237 xnf_stop(xnfp); 1238 1239 xvdi_remove_event_handler(devinfo, XS_OE_STATE); 1240 1241 /* Remove the interrupt */ 1242 #ifdef XPV_HVM_DRIVER 1243 ec_unbind_evtchn(xnfp->xnf_evtchn); 1244 xvdi_free_evtchn(devinfo); 1245 #else 1246 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1247 #endif 1248 1249 /* Release any pending xmit mblks */ 1250 xnf_release_mblks(xnfp); 1251 1252 /* Release all DMA resources */ 1253 xnf_release_dma_resources(xnfp); 1254 1255 cv_destroy(&xnfp->xnf_cv_tx_slots); 1256 cv_destroy(&xnfp->xnf_cv_multicast); 1257 cv_destroy(&xnfp->xnf_cv_state); 1258 1259 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1260 kmem_cache_destroy(xnfp->xnf_buf_cache); 1261 1262 mutex_destroy(&xnfp->xnf_gref_lock); 1263 mutex_destroy(&xnfp->xnf_schedlock); 1264 mutex_destroy(&xnfp->xnf_rxlock); 1265 mutex_destroy(&xnfp->xnf_txlock); 1266 1267 kmem_free(xnfp, sizeof (*xnfp)); 1268 1269 return (DDI_SUCCESS); 1270 } 1271 1272 /* 1273 * xnf_set_mac_addr() -- set the physical network address on the board. 
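 *
 * The address is assigned via xenstore and read by xnf_read_config(),
 * so changing it from this end is not supported.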
1274 */ 1275 static int 1276 xnf_set_mac_addr(void *arg, const uint8_t *macaddr) 1277 { 1278 _NOTE(ARGUNUSED(arg, macaddr)); 1279 1280 /* 1281 * We can't set our macaddr. 1282 */ 1283 return (ENOTSUP); 1284 } 1285 1286 /* 1287 * xnf_set_multicast() -- set (enable) or disable a multicast address. 1288 * 1289 * Program the hardware to enable/disable the multicast address 1290 * in "mca". Enable if "add" is true, disable if false. 1291 */ 1292 static int 1293 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca) 1294 { 1295 xnf_t *xnfp = arg; 1296 xnf_txbuf_t *txp; 1297 int n_slots; 1298 RING_IDX slot; 1299 xnf_txid_t *tidp; 1300 netif_tx_request_t *txrp; 1301 struct netif_extra_info *erp; 1302 boolean_t notify, result; 1303 1304 /* 1305 * If the backend does not support multicast control then we 1306 * must assume that the right packets will just arrive. 1307 */ 1308 if (!xnfp->xnf_be_mcast_control) 1309 return (0); 1310 1311 txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); 1312 1313 mutex_enter(&xnfp->xnf_txlock); 1314 1315 /* 1316 * If we're not yet connected then claim success. This is 1317 * acceptable because we refresh the entire set of multicast 1318 * addresses when we get connected. 1319 * 1320 * We can't wait around here because the MAC layer expects 1321 * this to be a non-blocking operation - waiting ends up 1322 * causing a deadlock during resume. 1323 */ 1324 if (!xnfp->xnf_connected) { 1325 mutex_exit(&xnfp->xnf_txlock); 1326 return (0); 1327 } 1328 1329 /* 1330 * 1. Acquire two slots in the ring. 1331 * 2. Fill in the slots. 1332 * 3. Request notification when the operation is done. 1333 * 4. Kick the peer. 1334 * 5. Wait for the response via xnf_tx_clean_ring(). 1335 */ 1336 1337 n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE); 1338 ASSERT(n_slots >= 2); 1339 1340 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1341 tidp = xnf_txid_get(xnfp); 1342 VERIFY(tidp != NULL); 1343 1344 txp->tx_type = TX_MCAST_REQ; 1345 txp->tx_slot = slot; 1346 1347 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1348 erp = (struct netif_extra_info *) 1349 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1); 1350 1351 txrp->gref = 0; 1352 txrp->size = 0; 1353 txrp->offset = 0; 1354 /* Set tx_txreq.id to appease xnf_tx_clean_ring(). */ 1355 txrp->id = txp->tx_txreq.id = tidp->id; 1356 txrp->flags = NETTXF_extra_info; 1357 1358 erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD : 1359 XEN_NETIF_EXTRA_TYPE_MCAST_DEL; 1360 bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL); 1361 1362 tidp->txbuf = txp; 1363 1364 xnfp->xnf_tx_ring.req_prod_pvt = slot + 2; 1365 1366 mutex_enter(&xnfp->xnf_schedlock); 1367 xnfp->xnf_pending_multicast++; 1368 mutex_exit(&xnfp->xnf_schedlock); 1369 1370 /* LINTED: constant in conditional context */ 1371 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, 1372 notify); 1373 if (notify) 1374 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1375 1376 while (txp->tx_type == TX_MCAST_REQ) 1377 cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock); 1378 1379 ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP); 1380 1381 mutex_enter(&xnfp->xnf_schedlock); 1382 xnfp->xnf_pending_multicast--; 1383 mutex_exit(&xnfp->xnf_schedlock); 1384 1385 result = (txp->tx_status == NETIF_RSP_OKAY); 1386 1387 xnf_txid_put(xnfp, tidp); 1388 1389 mutex_exit(&xnfp->xnf_txlock); 1390 1391 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 1392 1393 return (result ? 
0 : 1); 1394 } 1395 1396 /* 1397 * xnf_set_promiscuous() -- set or reset promiscuous mode on the board 1398 * 1399 * Program the hardware to enable/disable promiscuous mode. 1400 */ 1401 static int 1402 xnf_set_promiscuous(void *arg, boolean_t on) 1403 { 1404 _NOTE(ARGUNUSED(arg, on)); 1405 1406 /* 1407 * We can't really do this, but we pretend that we can in 1408 * order that snoop will work. 1409 */ 1410 return (0); 1411 } 1412 1413 /* 1414 * Clean buffers that we have responses for from the transmit ring. 1415 */ 1416 static int 1417 xnf_tx_clean_ring(xnf_t *xnfp) 1418 { 1419 boolean_t work_to_do; 1420 1421 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1422 1423 loop: 1424 while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) { 1425 RING_IDX cons, prod, i; 1426 1427 cons = xnfp->xnf_tx_ring.rsp_cons; 1428 prod = xnfp->xnf_tx_ring.sring->rsp_prod; 1429 membar_consumer(); 1430 /* 1431 * Clean tx requests from ring that we have responses 1432 * for. 1433 */ 1434 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod); 1435 for (i = cons; i != prod; i++) { 1436 netif_tx_response_t *trp; 1437 xnf_txid_t *tidp; 1438 xnf_txbuf_t *txp; 1439 1440 trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i); 1441 /* 1442 * if this slot was occupied by netif_extra_info_t, 1443 * then the response will be NETIF_RSP_NULL. In this 1444 * case there are no resources to clean up. 1445 */ 1446 if (trp->status == NETIF_RSP_NULL) 1447 continue; 1448 1449 ASSERT(TX_ID_VALID(trp->id)); 1450 1451 tidp = TX_ID_TO_TXID(xnfp, trp->id); 1452 ASSERT3U(tidp->id, ==, trp->id); 1453 ASSERT3U(tidp->next, ==, INVALID_TX_ID); 1454 1455 txp = tidp->txbuf; 1456 ASSERT(txp != NULL); 1457 ASSERT3U(txp->tx_txreq.id, ==, trp->id); 1458 1459 switch (txp->tx_type) { 1460 case TX_DATA: 1461 /* 1462 * We must put the txid for each response we 1463 * acknowledge to make sure that we never have 1464 * more free slots than txids. Because of this 1465 * we do it here instead of waiting for it to 1466 * be done in xnf_data_txbuf_free_chain(). 1467 */ 1468 xnf_txid_put(xnfp, tidp); 1469 txp->tx_txreq.id = INVALID_TX_ID; 1470 ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0); 1471 txp->tx_head->tx_frags_to_ack--; 1472 1473 /* 1474 * We clean the whole chain once we got a 1475 * response for each fragment. 1476 */ 1477 if (txp->tx_head->tx_frags_to_ack == 0) 1478 xnf_data_txbuf_free_chain(xnfp, txp); 1479 1480 break; 1481 1482 case TX_MCAST_REQ: 1483 txp->tx_type = TX_MCAST_RSP; 1484 txp->tx_status = trp->status; 1485 cv_broadcast(&xnfp->xnf_cv_multicast); 1486 1487 break; 1488 1489 default: 1490 cmn_err(CE_PANIC, "xnf_tx_clean_ring: " 1491 "invalid xnf_txbuf_t type: %d", 1492 txp->tx_type); 1493 break; 1494 } 1495 } 1496 /* 1497 * Record the last response we dealt with so that we 1498 * know where to start next time around. 1499 */ 1500 xnfp->xnf_tx_ring.rsp_cons = prod; 1501 membar_enter(); 1502 } 1503 1504 /* LINTED: constant in conditional context */ 1505 RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do); 1506 if (work_to_do) 1507 goto loop; 1508 1509 return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring)); 1510 } 1511 1512 /* 1513 * Allocate and fill in a look-aside buffer for the packet `mp'. Used 1514 * to ensure that the packet is physically contiguous and contained 1515 * within a single page. 
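 *
 * Each fragment of `mp' is bcopy'd into the single pre-granted page of
 * an xnf_buf_t, so the resulting request consumes only one grant
 * reference and one ring slot no matter how fragmented the original
 * message was. The total number of bytes copied is returned via
 * `plen'.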
1516 */ 1517 static xnf_buf_t * 1518 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen) 1519 { 1520 xnf_buf_t *bd; 1521 caddr_t bp; 1522 1523 if ((bd = xnf_buf_get(xnfp, KM_NOSLEEP, B_TRUE)) == NULL) { 1524 return (NULL); 1525 } 1526 1527 bp = bd->buf; 1528 while (mp != NULL) { 1529 size_t len = MBLKL(mp); 1530 1531 bcopy(mp->b_rptr, bp, len); 1532 bp += len; 1533 1534 mp = mp->b_cont; 1535 } 1536 1537 *plen = bp - bd->buf; 1538 ASSERT3U(*plen, <=, PAGESIZE); 1539 1540 xnfp->xnf_stat_tx_lookaside++; 1541 1542 return (bd); 1543 } 1544 1545 /* 1546 * Insert the pseudo-header checksum into the packet. 1547 * Assumes packet is IPv4, TCP/UDP since we only advertised support for 1548 * HCKSUM_INET_FULL_V4. 1549 */ 1550 int 1551 xnf_pseudo_cksum(mblk_t *mp) 1552 { 1553 struct ether_header *ehp; 1554 uint16_t sap, iplen, *stuff; 1555 uint32_t cksum; 1556 size_t len; 1557 ipha_t *ipha; 1558 ipaddr_t src, dst; 1559 uchar_t *ptr; 1560 1561 ptr = mp->b_rptr; 1562 len = MBLKL(mp); 1563 1564 /* Each header must fit completely in an mblk. */ 1565 ASSERT3U(len, >=, sizeof (*ehp)); 1566 1567 ehp = (struct ether_header *)ptr; 1568 1569 if (ntohs(ehp->ether_type) == VLAN_TPID) { 1570 struct ether_vlan_header *evhp; 1571 ASSERT3U(len, >=, sizeof (*evhp)); 1572 evhp = (struct ether_vlan_header *)ptr; 1573 sap = ntohs(evhp->ether_type); 1574 ptr += sizeof (*evhp); 1575 len -= sizeof (*evhp); 1576 } else { 1577 sap = ntohs(ehp->ether_type); 1578 ptr += sizeof (*ehp); 1579 len -= sizeof (*ehp); 1580 } 1581 1582 ASSERT3U(sap, ==, ETHERTYPE_IP); 1583 1584 /* 1585 * Ethernet and IP headers may be in different mblks. 1586 */ 1587 ASSERT3P(ptr, <=, mp->b_wptr); 1588 if (ptr == mp->b_wptr) { 1589 mp = mp->b_cont; 1590 ptr = mp->b_rptr; 1591 len = MBLKL(mp); 1592 } 1593 1594 ASSERT3U(len, >=, sizeof (ipha_t)); 1595 ipha = (ipha_t *)ptr; 1596 1597 /* 1598 * We assume the IP header has no options. (This is enforced in 1599 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM). 1600 */ 1601 ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH); 1602 iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; 1603 1604 ptr += IP_SIMPLE_HDR_LENGTH; 1605 len -= IP_SIMPLE_HDR_LENGTH; 1606 1607 /* 1608 * IP and L4 headers may be in different mblks. 1609 */ 1610 ASSERT3P(ptr, <=, mp->b_wptr); 1611 if (ptr == mp->b_wptr) { 1612 mp = mp->b_cont; 1613 ptr = mp->b_rptr; 1614 len = MBLKL(mp); 1615 } 1616 1617 switch (ipha->ipha_protocol) { 1618 case IPPROTO_TCP: 1619 ASSERT3U(len, >=, sizeof (tcph_t)); 1620 stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET); 1621 cksum = IP_TCP_CSUM_COMP; 1622 break; 1623 case IPPROTO_UDP: 1624 ASSERT3U(len, >=, sizeof (struct udphdr)); 1625 stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET); 1626 cksum = IP_UDP_CSUM_COMP; 1627 break; 1628 default: 1629 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d", 1630 ipha->ipha_protocol); 1631 return (EINVAL); 1632 } 1633 1634 src = ipha->ipha_src; 1635 dst = ipha->ipha_dst; 1636 1637 cksum += (dst >> 16) + (dst & 0xFFFF); 1638 cksum += (src >> 16) + (src & 0xFFFF); 1639 cksum += htons(iplen); 1640 1641 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1642 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1643 1644 ASSERT(cksum <= 0xFFFF); 1645 1646 *stuff = (uint16_t)(cksum ? cksum : ~cksum); 1647 1648 return (0); 1649 } 1650 1651 /* 1652 * Push a packet into the transmit ring. 1653 * 1654 * Note: the format of a tx packet that spans multiple slots is similar to 1655 * what is described in xnf_rx_one_packet(). 
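 *
 * Briefly: the first request carries the total packet length in its
 * `size' field and, when LSO is used, NETTXF_extra_info, in which case
 * the next slot holds a netif_extra_info_t rather than data. Every
 * data fragment other than the last has NETTXF_more_data set to chain
 * it to the following slot.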
1656 */ 1657 static void 1658 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head) 1659 { 1660 int nslots = 0; 1661 int extras = 0; 1662 RING_IDX slot; 1663 boolean_t notify; 1664 1665 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1666 ASSERT(xnfp->xnf_running); 1667 1668 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1669 1670 /* 1671 * The caller has already checked that we have enough slots to proceed. 1672 */ 1673 for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) { 1674 xnf_txid_t *tidp; 1675 netif_tx_request_t *txrp; 1676 1677 tidp = xnf_txid_get(xnfp); 1678 VERIFY(tidp != NULL); 1679 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1680 1681 txp->tx_slot = slot; 1682 txp->tx_txreq.id = tidp->id; 1683 *txrp = txp->tx_txreq; 1684 1685 tidp->txbuf = txp; 1686 slot++; 1687 nslots++; 1688 1689 /* 1690 * When present, LSO info is placed in a slot after the first 1691 * data segment, and doesn't require a txid. 1692 */ 1693 if (txp->tx_txreq.flags & NETTXF_extra_info) { 1694 netif_extra_info_t *extra; 1695 ASSERT3U(nslots, ==, 1); 1696 1697 extra = (netif_extra_info_t *) 1698 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1699 *extra = txp->tx_extra; 1700 slot++; 1701 nslots++; 1702 extras = 1; 1703 } 1704 } 1705 1706 ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX); 1707 1708 /* 1709 * Store the number of data fragments. 1710 */ 1711 head->tx_frags_to_ack = nslots - extras; 1712 1713 xnfp->xnf_tx_ring.req_prod_pvt = slot; 1714 1715 /* 1716 * Tell the peer that we sent something, if it cares. 1717 */ 1718 /* LINTED: constant in conditional context */ 1719 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify); 1720 if (notify) 1721 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1722 } 1723 1724 static xnf_txbuf_t * 1725 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp) 1726 { 1727 xnf_txbuf_t *txp; 1728 size_t length; 1729 1730 if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) { 1731 return (NULL); 1732 } 1733 1734 txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length); 1735 if (txp->tx_bdesc == NULL) { 1736 xnf_data_txbuf_free(xnfp, txp); 1737 return (NULL); 1738 } 1739 txp->tx_mfn = txp->tx_bdesc->buf_mfn; 1740 txp->tx_txreq.gref = txp->tx_bdesc->grant_ref; 1741 txp->tx_txreq.size = length; 1742 txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET; 1743 txp->tx_txreq.flags = 0; 1744 1745 return (txp); 1746 } 1747 1748 static xnf_txbuf_t * 1749 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp) 1750 { 1751 xnf_txbuf_t *head = NULL; 1752 xnf_txbuf_t *tail = NULL; 1753 domid_t oeid; 1754 int nsegs = 0; 1755 1756 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 1757 1758 for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) { 1759 ddi_dma_handle_t dma_handle; 1760 const ddi_dma_cookie_t *dma_cookie, *dma_cookie_prev; 1761 xnf_txbuf_t *txp; 1762 1763 if (MBLKL(ml) == 0) 1764 continue; 1765 1766 if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) { 1767 goto error; 1768 } 1769 1770 if (head == NULL) { 1771 head = txp; 1772 } else { 1773 ASSERT(tail != NULL); 1774 TXBUF_SETNEXT(tail, txp); 1775 txp->tx_head = head; 1776 } 1777 1778 /* 1779 * The necessary segmentation rules (e.g. not crossing a page 1780 * boundary) are enforced by the dma attributes of the handle. 
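 * (See tx_buf_dma_attr above: dma_attr_seg keeps each cookie within a
 * XEN_DATA_BOUNDARY segment, and dma_attr_sgllen caps the number of
 * cookies at XEN_MAX_TX_DATA_PAGES.)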
1781 */ 1782 dma_handle = txp->tx_dma_handle; 1783 int ret = ddi_dma_addr_bind_handle(dma_handle, 1784 NULL, (char *)ml->b_rptr, MBLKL(ml), 1785 DDI_DMA_WRITE | DDI_DMA_STREAMING, 1786 DDI_DMA_DONTWAIT, 0, NULL, NULL); 1787 if (ret != DDI_DMA_MAPPED) { 1788 if (ret != DDI_DMA_NORESOURCES) { 1789 dev_err(xnfp->xnf_devinfo, CE_WARN, 1790 "ddi_dma_addr_bind_handle() failed " 1791 "[dma_error=%d]", ret); 1792 } 1793 goto error; 1794 } 1795 txp->tx_handle_bound = B_TRUE; 1796 1797 dma_cookie_prev = NULL; 1798 while ((dma_cookie = ddi_dma_cookie_iter(dma_handle, 1799 dma_cookie_prev)) != NULL) { 1800 if (nsegs == XEN_MAX_TX_DATA_PAGES) { 1801 dev_err(xnfp->xnf_devinfo, CE_WARN, 1802 "xnf_dmamap_alloc() failed: " 1803 "too many segments"); 1804 goto error; 1805 } 1806 if (dma_cookie_prev != NULL) { 1807 if ((txp = xnf_data_txbuf_alloc(xnfp, 1808 KM_NOSLEEP)) == NULL) { 1809 goto error; 1810 } 1811 ASSERT(tail != NULL); 1812 TXBUF_SETNEXT(tail, txp); 1813 txp->tx_head = head; 1814 } 1815 1816 txp->tx_mfn = 1817 xnf_btop(pa_to_ma(dma_cookie->dmac_laddress)); 1818 txp->tx_txreq.gref = xnf_gref_get(xnfp); 1819 if (txp->tx_txreq.gref == INVALID_GRANT_REF) { 1820 dev_err(xnfp->xnf_devinfo, CE_WARN, 1821 "xnf_dmamap_alloc() failed: " 1822 "invalid grant ref"); 1823 goto error; 1824 } 1825 gnttab_grant_foreign_access_ref(txp->tx_txreq.gref, 1826 oeid, txp->tx_mfn, 1); 1827 txp->tx_txreq.offset = 1828 dma_cookie->dmac_laddress & PAGEOFFSET; 1829 txp->tx_txreq.size = dma_cookie->dmac_size; 1830 txp->tx_txreq.flags = 0; 1831 1832 nsegs++; 1833 1834 if (tail != NULL) 1835 tail->tx_txreq.flags = NETTXF_more_data; 1836 tail = txp; 1837 1838 dma_cookie_prev = dma_cookie; 1839 } 1840 } 1841 1842 *countp = nsegs; 1843 return (head); 1844 1845 error: 1846 xnf_data_txbuf_free_chain(xnfp, head); 1847 return (NULL); 1848 } 1849 1850 static void 1851 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head, 1852 uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss) 1853 { 1854 if (lso_flags != 0) { 1855 ASSERT3U(lso_flags, ==, HW_LSO); 1856 ASSERT3P(head->tx_bdesc, ==, NULL); 1857 1858 head->tx_txreq.flags |= NETTXF_extra_info; 1859 netif_extra_info_t *extra = &head->tx_extra; 1860 extra->type = XEN_NETIF_EXTRA_TYPE_GSO; 1861 extra->flags = 0; 1862 extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; 1863 extra->u.gso.size = mss; 1864 extra->u.gso.features = 0; 1865 extra->u.gso.pad = 0; 1866 } else if (cksum_flags != 0) { 1867 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1868 /* 1869 * If the local protocol stack requests checksum 1870 * offload we set the 'checksum blank' flag, 1871 * indicating to the peer that we need the checksum 1872 * calculated for us. 1873 * 1874 * We _don't_ set the validated flag, because we haven't 1875 * validated that the data and the checksum match. 1876 * 1877 * Note: we already called xnf_pseudo_cksum() in 1878 * xnf_send(), so we just set the txreq flag here. 1879 */ 1880 head->tx_txreq.flags |= NETTXF_csum_blank; 1881 xnfp->xnf_stat_tx_cksum_deferred++; 1882 } 1883 } 1884 1885 /* 1886 * Send packet mp. Called by the MAC framework. 1887 */ 1888 static mblk_t * 1889 xnf_send(void *arg, mblk_t *mp) 1890 { 1891 xnf_t *xnfp = arg; 1892 xnf_txbuf_t *head; 1893 mblk_t *ml; 1894 int length; 1895 int pages, chunks, slots, slots_free; 1896 uint32_t cksum_flags, lso_flags, mss; 1897 boolean_t pulledup = B_FALSE; 1898 boolean_t force_copy = B_FALSE; 1899 1900 ASSERT3P(mp->b_next, ==, NULL); 1901 1902 mutex_enter(&xnfp->xnf_txlock); 1903 1904 /* 1905 * Wait until we are connected to the backend. 
1906 */ 1907 while (!xnfp->xnf_connected) 1908 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock); 1909 1910 /* 1911 * To simplify logic and be in sync with the rescheduling mechanism, 1912 * we require the maximum amount of slots that could be used by a 1913 * transaction to be free before proceeding. The only downside of doing 1914 * this is that it slightly reduces the effective size of the ring. 1915 */ 1916 slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE); 1917 if (slots_free < XEN_MAX_SLOTS_PER_TX) { 1918 /* 1919 * We need to ask for a re-schedule later as the ring is full. 1920 */ 1921 mutex_enter(&xnfp->xnf_schedlock); 1922 xnfp->xnf_need_sched = B_TRUE; 1923 mutex_exit(&xnfp->xnf_schedlock); 1924 1925 xnfp->xnf_stat_tx_defer++; 1926 mutex_exit(&xnfp->xnf_txlock); 1927 return (mp); 1928 } 1929 1930 /* 1931 * Get hw offload parameters. 1932 * This must be done before pulling up the mp as those parameters 1933 * are not copied over. 1934 */ 1935 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags); 1936 mac_lso_get(mp, &mss, &lso_flags); 1937 1938 /* 1939 * XXX: fix MAC framework so that we can advertise support for 1940 * partial checksum for IPv4 only. This way we won't need to calculate 1941 * the pseudo header checksum ourselves. 1942 */ 1943 if (cksum_flags != 0) { 1944 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1945 (void) xnf_pseudo_cksum(mp); 1946 } 1947 1948 pulledup: 1949 for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL; 1950 ml = ml->b_cont, chunks++) { 1951 pages += xnf_mblk_pages(ml); 1952 length += MBLKL(ml); 1953 } 1954 DTRACE_PROBE3(packet, int, length, int, chunks, int, pages); 1955 DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss); 1956 1957 /* 1958 * If the ethernet header crosses a page boundary the packet 1959 * will be dropped by the backend. In practice it seems like 1960 * this happens fairly rarely so we'll do nothing unless the 1961 * packet is small enough to fit in a look-aside buffer. 1962 */ 1963 if (((uintptr_t)mp->b_rptr & PAGEOFFSET) + 1964 sizeof (struct ether_header) > PAGESIZE) { 1965 xnfp->xnf_stat_tx_eth_hdr_split++; 1966 if (length <= PAGESIZE) 1967 force_copy = B_TRUE; 1968 } 1969 1970 if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) { 1971 /* 1972 * If the packet spans several pages and scatter-gather is not 1973 * supported then use a look-aside buffer. 1974 */ 1975 ASSERT3U(length, <=, PAGESIZE); 1976 head = xnf_mblk_copy(xnfp, mp); 1977 if (head == NULL) { 1978 dev_err(xnfp->xnf_devinfo, CE_WARN, 1979 "xnf_mblk_copy() failed"); 1980 goto drop; 1981 } 1982 } else { 1983 /* 1984 * There's a limit for how many pages can be passed to the 1985 * backend. If we pass that limit, the packet will be dropped 1986 * and some backend implementations (e.g. Linux) could even 1987 * offline the interface. 1988 */ 1989 if (pages > XEN_MAX_TX_DATA_PAGES) { 1990 if (pulledup) { 1991 dev_err(xnfp->xnf_devinfo, CE_WARN, 1992 "too many pages, even after pullup: %d.", 1993 pages); 1994 goto drop; 1995 } 1996 1997 /* 1998 * Defragment packet if it spans too many pages. 
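 * msgpullup(mp, -1) concatenates every fragment into one new
 * contiguous mblk, which should span far fewer pages; the mapping is
 * then retried exactly once (see `pulledup').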
1999 */ 2000 mblk_t *newmp = msgpullup(mp, -1); 2001 if (newmp == NULL) { 2002 dev_err(xnfp->xnf_devinfo, CE_WARN, 2003 "msgpullup() failed"); 2004 goto drop; 2005 } 2006 2007 freemsg(mp); 2008 mp = newmp; 2009 xnfp->xnf_stat_tx_pullup++; 2010 pulledup = B_TRUE; 2011 goto pulledup; 2012 } 2013 2014 head = xnf_mblk_map(xnfp, mp, &slots); 2015 if (head == NULL) 2016 goto drop; 2017 2018 IMPLY(slots > 1, xnfp->xnf_be_tx_sg); 2019 } 2020 2021 /* 2022 * Set tx_mp so that mblk is freed when the txbuf chain is freed. 2023 */ 2024 head->tx_mp = mp; 2025 2026 xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss); 2027 2028 /* 2029 * The first request must store the total length of the packet. 2030 */ 2031 head->tx_txreq.size = length; 2032 2033 /* 2034 * Push the packet we have prepared into the ring. 2035 */ 2036 xnf_tx_push_packet(xnfp, head); 2037 xnfp->xnf_stat_opackets++; 2038 xnfp->xnf_stat_obytes += length; 2039 2040 mutex_exit(&xnfp->xnf_txlock); 2041 return (NULL); 2042 2043 drop: 2044 freemsg(mp); 2045 xnfp->xnf_stat_tx_drop++; 2046 mutex_exit(&xnfp->xnf_txlock); 2047 return (NULL); 2048 } 2049 2050 /* 2051 * Notification of RX packets. Currently no TX-complete interrupt is 2052 * used, as we clean the TX ring lazily. 2053 */ 2054 static uint_t 2055 xnf_intr(caddr_t arg) 2056 { 2057 xnf_t *xnfp = (xnf_t *)arg; 2058 mblk_t *mp; 2059 boolean_t need_sched, clean_ring; 2060 2061 mutex_enter(&xnfp->xnf_rxlock); 2062 2063 /* 2064 * Interrupts before we are connected are spurious. 2065 */ 2066 if (!xnfp->xnf_connected) { 2067 mutex_exit(&xnfp->xnf_rxlock); 2068 xnfp->xnf_stat_unclaimed_interrupts++; 2069 return (DDI_INTR_UNCLAIMED); 2070 } 2071 2072 /* 2073 * Receive side processing. 2074 */ 2075 do { 2076 /* 2077 * Collect buffers from the ring. 2078 */ 2079 xnf_rx_collect(xnfp); 2080 2081 /* 2082 * Interrupt me when the next receive buffer is consumed. 2083 */ 2084 xnfp->xnf_rx_ring.sring->rsp_event = 2085 xnfp->xnf_rx_ring.rsp_cons + 1; 2086 xen_mb(); 2087 2088 } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)); 2089 2090 if (xnfp->xnf_rx_new_buffers_posted) { 2091 boolean_t notify; 2092 2093 /* 2094 * Indicate to the peer that we have re-filled the 2095 * receive ring, if it cares. 2096 */ 2097 /* LINTED: constant in conditional context */ 2098 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); 2099 if (notify) 2100 ec_notify_via_evtchn(xnfp->xnf_evtchn); 2101 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 2102 } 2103 2104 mp = xnfp->xnf_rx_head; 2105 xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL; 2106 2107 xnfp->xnf_stat_interrupts++; 2108 mutex_exit(&xnfp->xnf_rxlock); 2109 2110 if (mp != NULL) 2111 mac_rx(xnfp->xnf_mh, NULL, mp); 2112 2113 /* 2114 * Transmit side processing. 2115 * 2116 * If a previous transmit attempt failed or we have pending 2117 * multicast requests, clean the ring. 2118 * 2119 * If we previously stalled transmission and cleaning produces 2120 * some free slots, tell upstream to attempt sending again. 2121 * 2122 * The odd style is to avoid acquiring xnf_txlock unless we 2123 * will actually look inside the tx machinery. 
2124 */ 2125 mutex_enter(&xnfp->xnf_schedlock); 2126 need_sched = xnfp->xnf_need_sched; 2127 clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0); 2128 mutex_exit(&xnfp->xnf_schedlock); 2129 2130 if (clean_ring) { 2131 int free_slots; 2132 2133 mutex_enter(&xnfp->xnf_txlock); 2134 free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE); 2135 2136 if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) { 2137 mutex_enter(&xnfp->xnf_schedlock); 2138 xnfp->xnf_need_sched = B_FALSE; 2139 mutex_exit(&xnfp->xnf_schedlock); 2140 2141 mac_tx_update(xnfp->xnf_mh); 2142 } 2143 mutex_exit(&xnfp->xnf_txlock); 2144 } 2145 2146 return (DDI_INTR_CLAIMED); 2147 } 2148 2149 /* 2150 * xnf_start() -- start the board receiving and enable interrupts. 2151 */ 2152 static int 2153 xnf_start(void *arg) 2154 { 2155 xnf_t *xnfp = arg; 2156 2157 mutex_enter(&xnfp->xnf_rxlock); 2158 mutex_enter(&xnfp->xnf_txlock); 2159 2160 /* Accept packets from above. */ 2161 xnfp->xnf_running = B_TRUE; 2162 2163 mutex_exit(&xnfp->xnf_txlock); 2164 mutex_exit(&xnfp->xnf_rxlock); 2165 2166 return (0); 2167 } 2168 2169 /* xnf_stop() - disable hardware */ 2170 static void 2171 xnf_stop(void *arg) 2172 { 2173 xnf_t *xnfp = arg; 2174 2175 mutex_enter(&xnfp->xnf_rxlock); 2176 mutex_enter(&xnfp->xnf_txlock); 2177 2178 xnfp->xnf_running = B_FALSE; 2179 2180 mutex_exit(&xnfp->xnf_txlock); 2181 mutex_exit(&xnfp->xnf_rxlock); 2182 } 2183 2184 /* 2185 * Hang buffer `bdesc' on the RX ring. 2186 */ 2187 static void 2188 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc) 2189 { 2190 netif_rx_request_t *reqp; 2191 RING_IDX hang_ix; 2192 2193 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2194 2195 reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, 2196 xnfp->xnf_rx_ring.req_prod_pvt); 2197 hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0)); 2198 ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL); 2199 2200 reqp->id = bdesc->id = hang_ix; 2201 reqp->gref = bdesc->grant_ref; 2202 2203 xnfp->xnf_rx_pkt_info[hang_ix] = bdesc; 2204 xnfp->xnf_rx_ring.req_prod_pvt++; 2205 2206 xnfp->xnf_rx_new_buffers_posted = B_TRUE; 2207 } 2208 2209 /* 2210 * Receive an entire packet from the ring, starting from slot *consp. 2211 * prod indicates the slot of the latest response. 2212 * On return, *consp will point to the head of the next packet. 2213 * 2214 * Note: If slot prod was reached before we could gather a full packet, we will 2215 * drop the partial packet; this would most likely indicate a bug in either 2216 * the front-end or the back-end driver. 2217 * 2218 * An rx packet can consist of several fragments and thus span multiple slots. 2219 * Each fragment can contain up to 4k of data. 2220 * 2221 * A typical 9000 MTU packet will look like this: 2222 * +------+---------------------+-------------------+-----------------------+ 2223 * | SLOT | TYPE | CONTENTS | FLAGS | 2224 * +------+---------------------+-------------------+-----------------------+ 2225 * | 1 | netif_rx_response_t | 1st data fragment | more_data | 2226 * +------+---------------------+-------------------+-----------------------+ 2227 * | 2 | netif_rx_response_t | 2nd data fragment | more_data | 2228 * +------+---------------------+-------------------+-----------------------+ 2229 * | 3 | netif_rx_response_t | 3rd data fragment | [none] | 2230 * +------+---------------------+-------------------+-----------------------+ 2231 * 2232 * Fragments are chained by setting NETRXF_more_data in the previous 2233 * response's flags.
If there are additional flags, such as 2234 * NETRXF_data_validated or NETRXF_extra_info, those should be set on the 2235 * first fragment. 2236 * 2237 * Sometimes extra info can be present. If so, it will follow the first 2238 * fragment, and NETRXF_extra_info flag will be set on the first response. 2239 * If LRO is set on a packet, it will be stored in the extra info. Conforming 2240 * to the spec, extra info can also be chained, but must all be present right 2241 * after the first fragment. 2242 * 2243 * Example of a packet with 2 extra infos: 2244 * +------+---------------------+-------------------+-----------------------+ 2245 * | SLOT | TYPE | CONTENTS | FLAGS | 2246 * +------+---------------------+-------------------+-----------------------+ 2247 * | 1 | netif_rx_response_t | 1st data fragment | extra_info, more_data | 2248 * +------+---------------------+-------------------+-----------------------+ 2249 * | 2 | netif_extra_info_t | 1st extra info | EXTRA_FLAG_MORE | 2250 * +------+---------------------+-------------------+-----------------------+ 2251 * | 3 | netif_extra_info_t | 2nd extra info | [none] | 2252 * +------+---------------------+-------------------+-----------------------+ 2253 * | 4 | netif_rx_response_t | 2nd data fragment | more_data | 2254 * +------+---------------------+-------------------+-----------------------+ 2255 * | 5 | netif_rx_response_t | 3rd data fragment | more_data | 2256 * +------+---------------------+-------------------+-----------------------+ 2257 * | 6 | netif_rx_response_t | 4th data fragment | [none] | 2258 * +------+---------------------+-------------------+-----------------------+ 2259 * 2260 * In practice, the only extra we expect is for LRO, but only if we advertise 2261 * that we support it to the backend (xnf_enable_lro == TRUE). 2262 */ 2263 static int 2264 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp) 2265 { 2266 mblk_t *head = NULL; 2267 mblk_t *tail = NULL; 2268 mblk_t *mp; 2269 int error = 0; 2270 RING_IDX cons = *consp; 2271 netif_extra_info_t lro; 2272 boolean_t is_lro = B_FALSE; 2273 boolean_t is_extra = B_FALSE; 2274 2275 netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2276 2277 boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0; 2278 boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0; 2279 boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0; 2280 2281 IMPLY(more_data, xnf_enable_rx_sg); 2282 2283 while (cons != prod) { 2284 xnf_buf_t *bdesc; 2285 int len, off; 2286 int rxidx = cons & (NET_RX_RING_SIZE - 1); 2287 2288 bdesc = xnfp->xnf_rx_pkt_info[rxidx]; 2289 xnfp->xnf_rx_pkt_info[rxidx] = NULL; 2290 2291 if (is_extra) { 2292 netif_extra_info_t *extra = (netif_extra_info_t *)&rsp; 2293 /* 2294 * The only extra we expect is for LRO, and it should 2295 * only be present once. 2296 */ 2297 if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO && 2298 !is_lro) { 2299 ASSERT(xnf_enable_lro); 2300 lro = *extra; 2301 is_lro = B_TRUE; 2302 DTRACE_PROBE1(lro, netif_extra_info_t *, &lro); 2303 } else { 2304 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet " 2305 "contains unexpected extra info of type %d", 2306 extra->type); 2307 error = EINVAL; 2308 } 2309 more_extra = 2310 (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0; 2311 2312 goto hang_buf; 2313 } 2314 2315 ASSERT3U(bdesc->id, ==, rsp.id); 2316 2317 /* 2318 * status stores packet length when >= 0, or errors when < 0. 
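 *
 * A rough sketch of the encoding, mirroring the checks made just below:
 *
 *	rsp.status > 0			fragment length in bytes
 *	rsp.status == 0			runt packet
 *	rsp.status == NETIF_RSP_ERROR	receive error reported by the backend
 *	rsp.status == NETIF_RSP_DROPPED	the backend dropped the packet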
2319 */ 2320 len = rsp.status; 2321 off = rsp.offset; 2322 more_data = (rsp.flags & NETRXF_more_data) != 0; 2323 2324 /* 2325 * sanity checks. 2326 */ 2327 if (!xnfp->xnf_running) { 2328 error = EBUSY; 2329 } else if (len <= 0) { 2330 xnfp->xnf_stat_errrx++; 2331 2332 switch (len) { 2333 case 0: 2334 xnfp->xnf_stat_runt++; 2335 break; 2336 case NETIF_RSP_ERROR: 2337 xnfp->xnf_stat_mac_rcv_error++; 2338 break; 2339 case NETIF_RSP_DROPPED: 2340 xnfp->xnf_stat_norxbuf++; 2341 break; 2342 } 2343 error = EINVAL; 2344 } else if (bdesc->grant_ref == INVALID_GRANT_REF) { 2345 dev_err(xnfp->xnf_devinfo, CE_WARN, 2346 "Bad rx grant reference, rsp id %d", rsp.id); 2347 error = EINVAL; 2348 } else if ((off + len) > PAGESIZE) { 2349 dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses " 2350 "page boundary (offset %d, length %d)", off, len); 2351 error = EINVAL; 2352 } 2353 2354 if (error != 0) { 2355 /* 2356 * If an error has been detected, we do not attempt 2357 * to read the data but we still need to replace 2358 * the rx bufs. 2359 */ 2360 goto hang_buf; 2361 } 2362 2363 xnf_buf_t *nbuf = NULL; 2364 2365 /* 2366 * If the packet is below a pre-determined size we will 2367 * copy data out of the buf rather than replace it. 2368 */ 2369 if (len > xnf_rx_copy_limit) 2370 nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE); 2371 2372 if (nbuf != NULL) { 2373 mp = desballoc((unsigned char *)bdesc->buf, 2374 bdesc->len, 0, &bdesc->free_rtn); 2375 2376 if (mp == NULL) { 2377 xnfp->xnf_stat_rx_desballoc_fail++; 2378 xnfp->xnf_stat_norxbuf++; 2379 error = ENOMEM; 2380 /* 2381 * we free the buf we just allocated as we 2382 * will re-hang the old buf. 2383 */ 2384 xnf_buf_put(xnfp, nbuf, B_FALSE); 2385 goto hang_buf; 2386 } 2387 2388 mp->b_rptr = mp->b_rptr + off; 2389 mp->b_wptr = mp->b_rptr + len; 2390 2391 /* 2392 * Release the grant as the backend doesn't need to 2393 * access this buffer anymore and grants are scarce. 2394 */ 2395 (void) gnttab_end_foreign_access_ref(bdesc->grant_ref, 2396 0); 2397 xnf_gref_put(xnfp, bdesc->grant_ref); 2398 bdesc->grant_ref = INVALID_GRANT_REF; 2399 2400 bdesc = nbuf; 2401 } else { 2402 /* 2403 * We failed to allocate a new buf or decided to reuse 2404 * the old one. In either case we copy the data off it 2405 * and put it back into the ring. 2406 */ 2407 mp = allocb(len, 0); 2408 if (mp == NULL) { 2409 xnfp->xnf_stat_rx_allocb_fail++; 2410 xnfp->xnf_stat_norxbuf++; 2411 error = ENOMEM; 2412 goto hang_buf; 2413 } 2414 bcopy(bdesc->buf + off, mp->b_wptr, len); 2415 mp->b_wptr += len; 2416 } 2417 2418 if (head == NULL) 2419 head = mp; 2420 else 2421 tail->b_cont = mp; 2422 tail = mp; 2423 2424 hang_buf: 2425 /* 2426 * No matter what happens, for each response we need to hang 2427 * a new buf on the rx ring. Put either the old one, or a new 2428 * one if the old one is borrowed by the kernel via desballoc(). 2429 */ 2430 xnf_rxbuf_hang(xnfp, bdesc); 2431 cons++; 2432 2433 /* next response is an extra */ 2434 is_extra = more_extra; 2435 2436 if (!more_data && !more_extra) 2437 break; 2438 2439 /* 2440 * Note that since requests and responses are union'd on the 2441 * same ring, we copy the response to a local variable instead 2442 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have 2443 * overwritten contents of rsp. 2444 */ 2445 rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2446 } 2447 2448 /* 2449 * Check that we do not get stuck in a loop. 
2450 */ 2451 ASSERT3U(*consp, !=, cons); 2452 *consp = cons; 2453 2454 /* 2455 * We ran out of responses but the flags indicate there is more data. 2456 */ 2457 if (more_data) { 2458 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments."); 2459 error = EINVAL; 2460 } 2461 if (more_extra) { 2462 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments " 2463 "(extras)."); 2464 error = EINVAL; 2465 } 2466 2467 /* 2468 * An error means the packet must be dropped. If we have already formed 2469 * a partial packet, then discard it. 2470 */ 2471 if (error != 0) { 2472 if (head != NULL) 2473 freemsg(head); 2474 xnfp->xnf_stat_rx_drop++; 2475 return (error); 2476 } 2477 2478 ASSERT(head != NULL); 2479 2480 if (hwcsum) { 2481 /* 2482 * If the peer says that the data has been validated then we 2483 * declare that the full checksum has been verified. 2484 * 2485 * We don't look at the "checksum blank" flag, and hence could 2486 * have a packet here that we are asserting is good with 2487 * a blank checksum. 2488 */ 2489 mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 2490 xnfp->xnf_stat_rx_cksum_no_need++; 2491 } 2492 2493 /* XXX: set lro info for packet once LRO is supported in OS. */ 2494 2495 *mpp = head; 2496 2497 return (0); 2498 } 2499 2500 /* 2501 * Collect packets from the RX ring, storing them in `xnfp' for later use. 2502 */ 2503 static void 2504 xnf_rx_collect(xnf_t *xnfp) 2505 { 2506 RING_IDX prod; 2507 2508 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2509 2510 prod = xnfp->xnf_rx_ring.sring->rsp_prod; 2511 /* 2512 * Ensure we see queued responses up to 'prod'. 2513 */ 2514 membar_consumer(); 2515 2516 while (xnfp->xnf_rx_ring.rsp_cons != prod) { 2517 mblk_t *mp; 2518 2519 /* 2520 * Collect a packet. 2521 * rsp_cons is updated inside xnf_rx_one_packet(). 2522 */ 2523 int error = xnf_rx_one_packet(xnfp, prod, 2524 &xnfp->xnf_rx_ring.rsp_cons, &mp); 2525 if (error == 0) { 2526 xnfp->xnf_stat_ipackets++; 2527 xnfp->xnf_stat_rbytes += xmsgsize(mp); 2528 2529 /* 2530 * Append the mblk to the rx list. 2531 */ 2532 if (xnfp->xnf_rx_head == NULL) { 2533 ASSERT3P(xnfp->xnf_rx_tail, ==, NULL); 2534 xnfp->xnf_rx_head = mp; 2535 } else { 2536 ASSERT(xnfp->xnf_rx_tail != NULL); 2537 xnfp->xnf_rx_tail->b_next = mp; 2538 } 2539 xnfp->xnf_rx_tail = mp; 2540 } 2541 } 2542 } 2543 2544 /* 2545 * xnf_alloc_dma_resources() -- initialize the drivers structures 2546 */ 2547 static int 2548 xnf_alloc_dma_resources(xnf_t *xnfp) 2549 { 2550 dev_info_t *devinfo = xnfp->xnf_devinfo; 2551 size_t len; 2552 ddi_dma_cookie_t dma_cookie; 2553 uint_t ncookies; 2554 int rc; 2555 caddr_t rptr; 2556 2557 /* 2558 * The code below allocates all the DMA data structures that 2559 * need to be released when the driver is detached. 2560 * 2561 * Allocate page for the transmit descriptor ring. 
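 *
 * Each ring page goes through the usual DDI DMA sequence (a sketch of
 * the calls made below; teardown happens in the opposite order in
 * xnf_release_dma_resources()):
 *
 *	ddi_dma_alloc_handle()		allocate a DMA handle
 *	ddi_dma_mem_alloc()		allocate a page of DMA-able memory
 *	ddi_dma_addr_bind_handle()	bind it and obtain the cookie whose
 *					address is saved in
 *					xnf_{tx,rx}_ring_phys_addr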
2562 */ 2563 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2564 DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS) 2565 goto alloc_error; 2566 2567 if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle, 2568 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2569 DDI_DMA_SLEEP, 0, &rptr, &len, 2570 &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) { 2571 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2572 xnfp->xnf_tx_ring_dma_handle = NULL; 2573 goto alloc_error; 2574 } 2575 2576 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL, 2577 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2578 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2579 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2580 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2581 xnfp->xnf_tx_ring_dma_handle = NULL; 2582 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2583 if (rc == DDI_DMA_NORESOURCES) 2584 goto alloc_error; 2585 else 2586 goto error; 2587 } 2588 2589 ASSERT(ncookies == 1); 2590 bzero(rptr, PAGESIZE); 2591 /* LINTED: constant in conditional context */ 2592 SHARED_RING_INIT((netif_tx_sring_t *)rptr); 2593 /* LINTED: constant in conditional context */ 2594 FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE); 2595 xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress; 2596 2597 /* 2598 * Allocate page for the receive descriptor ring. 2599 */ 2600 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2601 DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS) 2602 goto alloc_error; 2603 2604 if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle, 2605 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2606 DDI_DMA_SLEEP, 0, &rptr, &len, 2607 &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) { 2608 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2609 xnfp->xnf_rx_ring_dma_handle = NULL; 2610 goto alloc_error; 2611 } 2612 2613 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL, 2614 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2615 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2616 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2617 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2618 xnfp->xnf_rx_ring_dma_handle = NULL; 2619 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2620 if (rc == DDI_DMA_NORESOURCES) 2621 goto alloc_error; 2622 else 2623 goto error; 2624 } 2625 2626 ASSERT(ncookies == 1); 2627 bzero(rptr, PAGESIZE); 2628 /* LINTED: constant in conditional context */ 2629 SHARED_RING_INIT((netif_rx_sring_t *)rptr); 2630 /* LINTED: constant in conditional context */ 2631 FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE); 2632 xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress; 2633 2634 return (DDI_SUCCESS); 2635 2636 alloc_error: 2637 cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory", 2638 ddi_get_instance(xnfp->xnf_devinfo)); 2639 error: 2640 xnf_release_dma_resources(xnfp); 2641 return (DDI_FAILURE); 2642 } 2643 2644 /* 2645 * Release all DMA resources in the opposite order from acquisition 2646 */ 2647 static void 2648 xnf_release_dma_resources(xnf_t *xnfp) 2649 { 2650 int i; 2651 2652 /* 2653 * Free receive buffers which are currently associated with 2654 * descriptors. 
2655 */ 2656 mutex_enter(&xnfp->xnf_rxlock); 2657 for (i = 0; i < NET_RX_RING_SIZE; i++) { 2658 xnf_buf_t *bp; 2659 2660 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL) 2661 continue; 2662 xnfp->xnf_rx_pkt_info[i] = NULL; 2663 xnf_buf_put(xnfp, bp, B_FALSE); 2664 } 2665 mutex_exit(&xnfp->xnf_rxlock); 2666 2667 /* Free the receive ring buffer. */ 2668 if (xnfp->xnf_rx_ring_dma_acchandle != NULL) { 2669 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle); 2670 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2671 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2672 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2673 } 2674 /* Free the transmit ring buffer. */ 2675 if (xnfp->xnf_tx_ring_dma_acchandle != NULL) { 2676 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle); 2677 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2678 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2679 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2680 } 2681 2682 } 2683 2684 /* 2685 * Release any packets and associated structures used by the TX ring. 2686 */ 2687 static void 2688 xnf_release_mblks(xnf_t *xnfp) 2689 { 2690 RING_IDX i; 2691 xnf_txid_t *tidp; 2692 2693 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 2694 i < NET_TX_RING_SIZE; 2695 i++, tidp++) { 2696 xnf_txbuf_t *txp = tidp->txbuf; 2697 2698 if (txp != NULL) { 2699 ASSERT(txp->tx_mp != NULL); 2700 freemsg(txp->tx_mp); 2701 2702 xnf_txid_put(xnfp, tidp); 2703 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 2704 } 2705 } 2706 } 2707 2708 static int 2709 xnf_buf_constructor(void *buf, void *arg, int kmflag) 2710 { 2711 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2712 xnf_buf_t *bdesc = buf; 2713 xnf_t *xnfp = arg; 2714 ddi_dma_cookie_t dma_cookie; 2715 uint_t ncookies; 2716 size_t len; 2717 2718 if (kmflag & KM_NOSLEEP) 2719 ddiflags = DDI_DMA_DONTWAIT; 2720 2721 /* Allocate a DMA access handle for the buffer. */ 2722 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr, 2723 ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS) 2724 goto failure; 2725 2726 /* Allocate DMA-able memory for buffer. */ 2727 if (ddi_dma_mem_alloc(bdesc->dma_handle, 2728 PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0, 2729 &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS) 2730 goto failure_1; 2731 2732 /* Bind to virtual address of buffer to get physical address. */ 2733 if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL, 2734 bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING, 2735 ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) 2736 goto failure_2; 2737 ASSERT(ncookies == 1); 2738 2739 bdesc->free_rtn.free_func = xnf_buf_recycle; 2740 bdesc->free_rtn.free_arg = (caddr_t)bdesc; 2741 bdesc->xnfp = xnfp; 2742 bdesc->buf_phys = dma_cookie.dmac_laddress; 2743 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2744 bdesc->len = dma_cookie.dmac_size; 2745 bdesc->grant_ref = INVALID_GRANT_REF; 2746 bdesc->gen = xnfp->xnf_gen; 2747 2748 atomic_inc_64(&xnfp->xnf_stat_buf_allocated); 2749 2750 return (0); 2751 2752 failure_2: 2753 ddi_dma_mem_free(&bdesc->acc_handle); 2754 2755 failure_1: 2756 ddi_dma_free_handle(&bdesc->dma_handle); 2757 2758 failure: 2759 2760 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. 
*/ 2761 return (-1); 2762 } 2763 2764 static void 2765 xnf_buf_destructor(void *buf, void *arg) 2766 { 2767 xnf_buf_t *bdesc = buf; 2768 xnf_t *xnfp = arg; 2769 2770 (void) ddi_dma_unbind_handle(bdesc->dma_handle); 2771 ddi_dma_mem_free(&bdesc->acc_handle); 2772 ddi_dma_free_handle(&bdesc->dma_handle); 2773 2774 atomic_dec_64(&xnfp->xnf_stat_buf_allocated); 2775 } 2776 2777 static xnf_buf_t * 2778 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly) 2779 { 2780 grant_ref_t gref; 2781 xnf_buf_t *bufp; 2782 2783 /* 2784 * Usually grant references are more scarce than memory, so we 2785 * attempt to acquire a grant reference first. 2786 */ 2787 gref = xnf_gref_get(xnfp); 2788 if (gref == INVALID_GRANT_REF) 2789 return (NULL); 2790 2791 bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags); 2792 if (bufp == NULL) { 2793 xnf_gref_put(xnfp, gref); 2794 return (NULL); 2795 } 2796 2797 ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF); 2798 2799 bufp->grant_ref = gref; 2800 2801 if (bufp->gen != xnfp->xnf_gen) 2802 xnf_buf_refresh(bufp); 2803 2804 gnttab_grant_foreign_access_ref(bufp->grant_ref, 2805 xvdi_get_oeid(bufp->xnfp->xnf_devinfo), 2806 bufp->buf_mfn, readonly ? 1 : 0); 2807 2808 atomic_inc_64(&xnfp->xnf_stat_buf_outstanding); 2809 2810 return (bufp); 2811 } 2812 2813 static void 2814 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly) 2815 { 2816 if (bufp->grant_ref != INVALID_GRANT_REF) { 2817 (void) gnttab_end_foreign_access_ref( 2818 bufp->grant_ref, readonly ? 1 : 0); 2819 xnf_gref_put(xnfp, bufp->grant_ref); 2820 bufp->grant_ref = INVALID_GRANT_REF; 2821 } 2822 2823 kmem_cache_free(xnfp->xnf_buf_cache, bufp); 2824 2825 atomic_dec_64(&xnfp->xnf_stat_buf_outstanding); 2826 } 2827 2828 /* 2829 * Refresh any cached data about a buffer after resume. 2830 */ 2831 static void 2832 xnf_buf_refresh(xnf_buf_t *bdesc) 2833 { 2834 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2835 bdesc->gen = bdesc->xnfp->xnf_gen; 2836 } 2837 2838 /* 2839 * Streams `freeb' routine for `xnf_buf_t' when used as transmit 2840 * look-aside buffers. 2841 */ 2842 static void 2843 xnf_buf_recycle(xnf_buf_t *bdesc) 2844 { 2845 xnf_t *xnfp = bdesc->xnfp; 2846 2847 xnf_buf_put(xnfp, bdesc, B_TRUE); 2848 } 2849 2850 static int 2851 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag) 2852 { 2853 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2854 xnf_txbuf_t *txp = buf; 2855 xnf_t *xnfp = arg; 2856 2857 if (kmflag & KM_NOSLEEP) 2858 ddiflags = DDI_DMA_DONTWAIT; 2859 2860 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr, 2861 ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) { 2862 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */ 2863 return (-1); 2864 } 2865 2866 return (0); 2867 } 2868 2869 static void 2870 xnf_tx_buf_destructor(void *buf, void *arg) 2871 { 2872 _NOTE(ARGUNUSED(arg)); 2873 xnf_txbuf_t *txp = buf; 2874 2875 ddi_dma_free_handle(&txp->tx_dma_handle); 2876 } 2877 2878 /* 2879 * Statistics. 
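 *
 * The auxiliary statistics below are exported as a named kstat and can
 * be read from userland, for example (illustrative; the instance number
 * will vary):
 *
 *	# kstat -m xnf -i 0 -n aux_statistics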
2880 */ 2881 static char *xnf_aux_statistics[] = { 2882 "tx_cksum_deferred", 2883 "rx_cksum_no_need", 2884 "interrupts", 2885 "unclaimed_interrupts", 2886 "tx_pullup", 2887 "tx_lookaside", 2888 "tx_drop", 2889 "tx_eth_hdr_split", 2890 "buf_allocated", 2891 "buf_outstanding", 2892 "gref_outstanding", 2893 "gref_failure", 2894 "gref_peak", 2895 "rx_allocb_fail", 2896 "rx_desballoc_fail", 2897 }; 2898 2899 static int 2900 xnf_kstat_aux_update(kstat_t *ksp, int flag) 2901 { 2902 xnf_t *xnfp; 2903 kstat_named_t *knp; 2904 2905 if (flag != KSTAT_READ) 2906 return (EACCES); 2907 2908 xnfp = ksp->ks_private; 2909 knp = ksp->ks_data; 2910 2911 /* 2912 * Assignment order must match that of the names in 2913 * xnf_aux_statistics. 2914 */ 2915 (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred; 2916 (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need; 2917 2918 (knp++)->value.ui64 = xnfp->xnf_stat_interrupts; 2919 (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts; 2920 (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup; 2921 (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside; 2922 (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop; 2923 (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split; 2924 2925 (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated; 2926 (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding; 2927 (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding; 2928 (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure; 2929 (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak; 2930 (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail; 2931 (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail; 2932 2933 return (0); 2934 } 2935 2936 static boolean_t 2937 xnf_kstat_init(xnf_t *xnfp) 2938 { 2939 int nstat = sizeof (xnf_aux_statistics) / 2940 sizeof (xnf_aux_statistics[0]); 2941 char **cp = xnf_aux_statistics; 2942 kstat_named_t *knp; 2943 2944 /* 2945 * Create and initialise kstats. 
2946 */ 2947 if ((xnfp->xnf_kstat_aux = kstat_create("xnf", 2948 ddi_get_instance(xnfp->xnf_devinfo), 2949 "aux_statistics", "net", KSTAT_TYPE_NAMED, 2950 nstat, 0)) == NULL) 2951 return (B_FALSE); 2952 2953 xnfp->xnf_kstat_aux->ks_private = xnfp; 2954 xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update; 2955 2956 knp = xnfp->xnf_kstat_aux->ks_data; 2957 while (nstat > 0) { 2958 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64); 2959 2960 knp++; 2961 cp++; 2962 nstat--; 2963 } 2964 2965 kstat_install(xnfp->xnf_kstat_aux); 2966 2967 return (B_TRUE); 2968 } 2969 2970 static int 2971 xnf_stat(void *arg, uint_t stat, uint64_t *val) 2972 { 2973 xnf_t *xnfp = arg; 2974 2975 mutex_enter(&xnfp->xnf_rxlock); 2976 mutex_enter(&xnfp->xnf_txlock); 2977 2978 #define mac_stat(q, r) \ 2979 case (MAC_STAT_##q): \ 2980 *val = xnfp->xnf_stat_##r; \ 2981 break 2982 2983 #define ether_stat(q, r) \ 2984 case (ETHER_STAT_##q): \ 2985 *val = xnfp->xnf_stat_##r; \ 2986 break 2987 2988 switch (stat) { 2989 2990 mac_stat(IPACKETS, ipackets); 2991 mac_stat(OPACKETS, opackets); 2992 mac_stat(RBYTES, rbytes); 2993 mac_stat(OBYTES, obytes); 2994 mac_stat(NORCVBUF, norxbuf); 2995 mac_stat(IERRORS, errrx); 2996 mac_stat(NOXMTBUF, tx_defer); 2997 2998 ether_stat(MACRCV_ERRORS, mac_rcv_error); 2999 ether_stat(TOOSHORT_ERRORS, runt); 3000 3001 /* always claim to be in full duplex mode */ 3002 case ETHER_STAT_LINK_DUPLEX: 3003 *val = LINK_DUPLEX_FULL; 3004 break; 3005 3006 /* always claim to be at 1Gb/s link speed */ 3007 case MAC_STAT_IFSPEED: 3008 *val = 1000000000ull; 3009 break; 3010 3011 default: 3012 mutex_exit(&xnfp->xnf_txlock); 3013 mutex_exit(&xnfp->xnf_rxlock); 3014 3015 return (ENOTSUP); 3016 } 3017 3018 #undef mac_stat 3019 #undef ether_stat 3020 3021 mutex_exit(&xnfp->xnf_txlock); 3022 mutex_exit(&xnfp->xnf_rxlock); 3023 3024 return (0); 3025 } 3026 3027 static int 3028 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu) 3029 { 3030 if (mtu > ETHERMTU) { 3031 if (!xnf_enable_tx_sg) { 3032 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3033 "because scatter-gather is disabled for transmit " 3034 "in driver settings", ETHERMTU); 3035 return (EINVAL); 3036 } else if (!xnf_enable_rx_sg) { 3037 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3038 "because scatter-gather is disabled for receive " 3039 "in driver settings", ETHERMTU); 3040 return (EINVAL); 3041 } else if (!xnfp->xnf_be_tx_sg) { 3042 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3043 "because backend doesn't support scatter-gather", 3044 ETHERMTU); 3045 return (EINVAL); 3046 } 3047 if (mtu > XNF_MAXPKT) 3048 return (EINVAL); 3049 } 3050 int error = mac_maxsdu_update(xnfp->xnf_mh, mtu); 3051 if (error == 0) 3052 xnfp->xnf_mtu = mtu; 3053 3054 return (error); 3055 } 3056 3057 /*ARGSUSED*/ 3058 static int 3059 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id, 3060 uint_t prop_val_size, void *prop_val) 3061 { 3062 xnf_t *xnfp = data; 3063 3064 switch (prop_id) { 3065 case MAC_PROP_MTU: 3066 ASSERT(prop_val_size >= sizeof (uint32_t)); 3067 bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t)); 3068 break; 3069 default: 3070 return (ENOTSUP); 3071 } 3072 return (0); 3073 } 3074 3075 /*ARGSUSED*/ 3076 static int 3077 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id, 3078 uint_t prop_val_size, const void *prop_val) 3079 { 3080 xnf_t *xnfp = data; 3081 uint32_t new_mtu; 3082 int error; 3083 3084 switch (prop_id) { 3085 case MAC_PROP_MTU: 3086 ASSERT(prop_val_size >= sizeof (uint32_t)); 3087 bcopy(prop_val, 
&new_mtu, sizeof (new_mtu)); 3088 error = xnf_change_mtu(xnfp, new_mtu); 3089 break; 3090 default: 3091 return (ENOTSUP); 3092 } 3093 3094 return (error); 3095 } 3096 3097 /*ARGSUSED*/ 3098 static void 3099 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id, 3100 mac_prop_info_handle_t prop_handle) 3101 { 3102 switch (prop_id) { 3103 case MAC_PROP_MTU: 3104 mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT); 3105 break; 3106 default: 3107 break; 3108 } 3109 } 3110 3111 static boolean_t 3112 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data) 3113 { 3114 xnf_t *xnfp = arg; 3115 3116 switch (cap) { 3117 case MAC_CAPAB_HCKSUM: { 3118 uint32_t *capab = cap_data; 3119 3120 /* 3121 * Whilst the flag used to communicate with the IO 3122 * domain is called "NETTXF_csum_blank", the checksum 3123 * in the packet must contain the pseudo-header 3124 * checksum and not zero. 3125 * 3126 * To help out the IO domain, we might use 3127 * HCKSUM_INET_PARTIAL. Unfortunately our stack will 3128 * then use checksum offload for IPv6 packets, which 3129 * the IO domain can't handle. 3130 * 3131 * As a result, we declare ourselves capable of 3132 * HCKSUM_INET_FULL_V4. This means that we receive 3133 * IPv4 packets from the stack with a blank checksum 3134 * field and must insert the pseudo-header checksum 3135 * before passing the packet to the IO domain. 3136 */ 3137 *capab = HCKSUM_INET_FULL_V4; 3138 3139 /* 3140 * TODO: query the "feature-ipv6-csum-offload" capability. 3141 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL. 3142 */ 3143 3144 break; 3145 } 3146 case MAC_CAPAB_LSO: { 3147 if (!xnfp->xnf_be_lso) 3148 return (B_FALSE); 3149 3150 mac_capab_lso_t *lso = cap_data; 3151 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 3152 lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET; 3153 break; 3154 } 3155 default: 3156 return (B_FALSE); 3157 } 3158 3159 return (B_TRUE); 3160 } 3161 3162 /* 3163 * The state of the peer has changed - react accordingly. 3164 */ 3165 static void 3166 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, 3167 void *arg, void *impl_data) 3168 { 3169 _NOTE(ARGUNUSED(id, arg)); 3170 xnf_t *xnfp = ddi_get_driver_private(dip); 3171 XenbusState new_state = *(XenbusState *)impl_data; 3172 3173 ASSERT(xnfp != NULL); 3174 3175 switch (new_state) { 3176 case XenbusStateUnknown: 3177 case XenbusStateInitialising: 3178 case XenbusStateInitialised: 3179 case XenbusStateClosing: 3180 case XenbusStateClosed: 3181 case XenbusStateReconfiguring: 3182 case XenbusStateReconfigured: 3183 break; 3184 3185 case XenbusStateInitWait: 3186 xnf_read_config(xnfp); 3187 3188 if (!xnfp->xnf_be_rx_copy) { 3189 cmn_err(CE_WARN, 3190 "The xnf driver requires a dom0 that " 3191 "supports 'feature-rx-copy'."); 3192 (void) xvdi_switch_state(xnfp->xnf_devinfo, 3193 XBT_NULL, XenbusStateClosed); 3194 break; 3195 } 3196 3197 /* 3198 * Connect to the backend. 3199 */ 3200 xnf_be_connect(xnfp); 3201 3202 /* 3203 * Tell the MAC layer about our MAC address as discovered by 3204 * xnf_read_config(). 3205 */ 3206 mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr); 3207 3208 /* 3209 * We do not know if some features such as LSO are supported 3210 * until we connect to the backend. We request the MAC layer 3211 * to poll our capabilities again.
3211 */ 3212 mac_capab_update(xnfp->xnf_mh); 3213 3214 break; 3215 3216 case XenbusStateConnected: 3217 mutex_enter(&xnfp->xnf_rxlock); 3218 mutex_enter(&xnfp->xnf_txlock); 3219 3220 xnfp->xnf_connected = B_TRUE; 3221 /* 3222 * Wake up any threads waiting to send data to 3223 * the backend. 3224 */ 3225 cv_broadcast(&xnfp->xnf_cv_state); 3226 3227 mutex_exit(&xnfp->xnf_txlock); 3228 mutex_exit(&xnfp->xnf_rxlock); 3229 3230 /* 3231 * Kick the peer in case it missed any transmit 3232 * requests in the TX ring. 3233 */ 3234 ec_notify_via_evtchn(xnfp->xnf_evtchn); 3235 3236 /* 3237 * There may already be completed receive requests in 3238 * the ring sent by the backend after it gets connected 3239 * but before we see its state change here, so we call 3240 * xnf_intr() to handle them, if any. 3241 */ 3242 (void) xnf_intr((caddr_t)xnfp); 3243 3244 /* 3245 * Mark the link up now that we are connected. 3246 */ 3247 mac_link_update(xnfp->xnf_mh, LINK_STATE_UP); 3248 3249 /* 3250 * Tell the backend about the multicast addresses in 3251 * which we are interested. 3252 */ 3253 mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE); 3254 3255 break; 3256 3257 default: 3258 break; 3259 } 3260 } 3261
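
/*
 * Example: the MTU is exposed via the MAC_PROP_MTU property handled by
 * xnf_setprop() and xnf_change_mtu() above, so a jumbo MTU can be set from
 * userland with dladm, e.g. (illustrative only; the link name is
 * hypothetical, and scatter-gather must be enabled in both the driver
 * settings and the backend):
 *
 *	# dladm set-linkprop -p mtu=9000 xnf0
 */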