1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 2014, 2017 by Delphix. All rights reserved. 29 * Copyright 2020 RackTop Systems, Inc. 30 */ 31 32 /* 33 * 34 * Copyright (c) 2004 Christian Limpach. 35 * All rights reserved. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. This section intentionally left blank. 46 * 4. The name of the author may not be used to endorse or promote products 47 * derived from this software without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 50 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 51 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 52 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 53 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 54 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 55 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 56 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 57 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 58 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 59 */ 60 /* 61 * Section 3 of the above license was updated in response to bug 6379571. 62 */ 63 64 /* 65 * xnf.c - GLDv3 network driver for domU. 66 */ 67 68 /* 69 * This driver uses four per-instance locks: 70 * 71 * xnf_gref_lock: 72 * 73 * Protects access to the grant reference list stored in 74 * xnf_gref_head. Grant references should be acquired and released 75 * using gref_get() and gref_put() respectively. 76 * 77 * xnf_schedlock: 78 * 79 * Protects: 80 * xnf_need_sched - used to record that a previous transmit attempt 81 * failed (and consequently it will be necessary to call 82 * mac_tx_update() when transmit resources are available). 83 * xnf_pending_multicast - the number of multicast requests that 84 * have been submitted to the backend for which we have not 85 * processed responses. 
86 * 87 * xnf_txlock: 88 * 89 * Protects the transmit ring (xnf_tx_ring) and associated 90 * structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head). 91 * 92 * xnf_rxlock: 93 * 94 * Protects the receive ring (xnf_rx_ring) and associated 95 * structures (notably xnf_rx_pkt_info). 96 * 97 * If driver-global state that affects both the transmit and receive 98 * rings is manipulated, both xnf_txlock and xnf_rxlock should be 99 * held, in that order. 100 * 101 * xnf_schedlock is acquired both whilst holding xnf_txlock and 102 * without. It should always be acquired after xnf_txlock if both are 103 * held. 104 * 105 * Notes: 106 * - atomic_add_64() is used to manipulate counters where we require 107 * accuracy. For counters intended only for observation by humans, 108 * post increment/decrement are used instead. 109 */ 110 111 #include <sys/types.h> 112 #include <sys/errno.h> 113 #include <sys/param.h> 114 #include <sys/sysmacros.h> 115 #include <sys/systm.h> 116 #include <sys/stream.h> 117 #include <sys/strsubr.h> 118 #include <sys/strsun.h> 119 #include <sys/conf.h> 120 #include <sys/ddi.h> 121 #include <sys/devops.h> 122 #include <sys/sunddi.h> 123 #include <sys/sunndi.h> 124 #include <sys/dlpi.h> 125 #include <sys/ethernet.h> 126 #include <sys/strsun.h> 127 #include <sys/pattr.h> 128 #include <inet/ip.h> 129 #include <inet/ip_impl.h> 130 #include <inet/tcp.h> 131 #include <netinet/udp.h> 132 #include <sys/gld.h> 133 #include <sys/modctl.h> 134 #include <sys/mac_provider.h> 135 #include <sys/mac_ether.h> 136 #include <sys/bootinfo.h> 137 #include <sys/mach_mmu.h> 138 #ifdef XPV_HVM_DRIVER 139 #include <sys/xpv_support.h> 140 #include <sys/hypervisor.h> 141 #else 142 #include <sys/hypervisor.h> 143 #include <sys/evtchn_impl.h> 144 #include <sys/balloon_impl.h> 145 #endif 146 #include <xen/public/io/netif.h> 147 #include <sys/gnttab.h> 148 #include <xen/sys/xendev.h> 149 #include <sys/sdt.h> 150 #include <sys/note.h> 151 #include <sys/debug.h> 152 153 #include <io/xnf.h> 154 155 /* 156 * On a 32 bit PAE system physical and machine addresses are larger 157 * than 32 bits. ddi_btop() on such systems take an unsigned long 158 * argument, and so addresses above 4G are truncated before ddi_btop() 159 * gets to see them. To avoid this, code the shift operation here. 160 */ 161 #define xnf_btop(addr) ((addr) >> PAGESHIFT) 162 163 /* 164 * The parameters below should only be changed in /etc/system, never in mdb. 165 */ 166 167 /* 168 * Should we use the multicast control feature if the backend provides 169 * it? 170 */ 171 boolean_t xnf_multicast_control = B_TRUE; 172 173 /* 174 * Should we allow scatter-gather for tx if backend allows it? 175 */ 176 boolean_t xnf_enable_tx_sg = B_TRUE; 177 178 /* 179 * Should we allow scatter-gather for rx if backend allows it? 180 */ 181 boolean_t xnf_enable_rx_sg = B_TRUE; 182 183 /* 184 * Should we allow lso for tx sends if backend allows it? 185 * Requires xnf_enable_tx_sg to be also set to TRUE. 186 */ 187 boolean_t xnf_enable_lso = B_TRUE; 188 189 /* 190 * Should we allow lro on rx if backend supports it? 191 * Requires xnf_enable_rx_sg to be also set to TRUE. 192 * 193 * !! WARNING !! 194 * LRO is not yet supported in the OS so this should be left as FALSE. 195 * !! WARNING !! 196 */ 197 boolean_t xnf_enable_lro = B_FALSE; 198 199 /* 200 * Received packets below this size are copied to a new streams buffer 201 * rather than being desballoc'ed. 202 * 203 * This value is chosen to accommodate traffic where there are a large 204 * number of small packets. 
 * For data showing a typical distribution,
 * see:
 *
 * Sinha07a:
 *	Rishi Sinha, Christos Papadopoulos, and John
 *	Heidemann. Internet Packet Size Distributions: Some
 *	Observations. Technical Report ISI-TR-2007-643,
 *	USC/Information Sciences Institute, May, 2007. Originally
 *	released October 2005 as web page
 *	http://netweb.usc.edu/~sinha/pkt-sizes/.
 *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
 */
size_t xnf_rx_copy_limit = 64;

#define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define	INVALID_GRANT_REF	((grant_ref_t)-1)
#define	INVALID_TX_ID		((uint16_t)-1)

#define	TX_ID_TO_TXID(p, id)	(&((p)->xnf_tx_pkt_id[(id)]))
#define	TX_ID_VALID(i) \
	(((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))

/*
 * calculate how many pages are spanned by an mblk fragment
 */
#define	xnf_mblk_pages(mp)	(MBLKL(mp) == 0 ? 0 : \
    xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)

/* Required system entry points */
static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/* Required driver entry points for Nemo */
static int	xnf_start(void *);
static void	xnf_stop(void *);
static int	xnf_set_mac_addr(void *, const uint8_t *);
static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int	xnf_set_promiscuous(void *, boolean_t);
static mblk_t	*xnf_send(void *, mblk_t *);
static uint_t	xnf_intr(caddr_t);
static int	xnf_stat(void *, uint_t, uint64_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
static int	xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
static int	xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void	xnf_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Driver private functions */
static int	xnf_alloc_dma_resources(xnf_t *);
static void	xnf_release_dma_resources(xnf_t *);
static void	xnf_release_mblks(xnf_t *);

static int	xnf_buf_constructor(void *, void *, int);
static void	xnf_buf_destructor(void *, void *);
static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
#pragma	inline(xnf_buf_get)
static void	xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
#pragma	inline(xnf_buf_put)
static void	xnf_buf_refresh(xnf_buf_t *);
#pragma	inline(xnf_buf_refresh)
static void	xnf_buf_recycle(xnf_buf_t *);

static int	xnf_tx_buf_constructor(void *, void *, int);
static void	xnf_tx_buf_destructor(void *, void *);

static grant_ref_t xnf_gref_get(xnf_t *);
#pragma	inline(xnf_gref_get)
static void	xnf_gref_put(xnf_t *, grant_ref_t);
#pragma	inline(xnf_gref_put)

static xnf_txid_t *xnf_txid_get(xnf_t *);
#pragma	inline(xnf_txid_get)
static void	xnf_txid_put(xnf_t *, xnf_txid_t *);
#pragma	inline(xnf_txid_put)

static void	xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
static int	xnf_tx_clean_ring(xnf_t *);
static void	oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static boolean_t xnf_kstat_init(xnf_t *);
static void	xnf_rx_collect(xnf_t *);

#define	XNF_CALLBACK_FLAGS	(MC_GETCAPAB | MC_PROPERTIES)

static mac_callbacks_t xnf_callbacks = {
	.mc_callbacks = XNF_CALLBACK_FLAGS,
	.mc_getstat = xnf_stat,
	.mc_start = xnf_start,
	.mc_stop = xnf_stop,
	.mc_setpromisc = xnf_set_promiscuous,
.mc_multicst = xnf_set_multicast, 296 .mc_unicst = xnf_set_mac_addr, 297 .mc_tx = xnf_send, 298 .mc_getcapab = xnf_getcapab, 299 .mc_setprop = xnf_setprop, 300 .mc_getprop = xnf_getprop, 301 .mc_propinfo = xnf_propinfo, 302 }; 303 304 /* DMA attributes for network ring buffer */ 305 static ddi_dma_attr_t ringbuf_dma_attr = { 306 .dma_attr_version = DMA_ATTR_V0, 307 .dma_attr_addr_lo = 0, 308 .dma_attr_addr_hi = 0xffffffffffffffffULL, 309 .dma_attr_count_max = 0x7fffffff, 310 .dma_attr_align = MMU_PAGESIZE, 311 .dma_attr_burstsizes = 0x7ff, 312 .dma_attr_minxfer = 1, 313 .dma_attr_maxxfer = 0xffffffffU, 314 .dma_attr_seg = 0xffffffffffffffffULL, 315 .dma_attr_sgllen = 1, 316 .dma_attr_granular = 1, 317 .dma_attr_flags = 0 318 }; 319 320 /* DMA attributes for receive data */ 321 static ddi_dma_attr_t rx_buf_dma_attr = { 322 .dma_attr_version = DMA_ATTR_V0, 323 .dma_attr_addr_lo = 0, 324 .dma_attr_addr_hi = 0xffffffffffffffffULL, 325 .dma_attr_count_max = MMU_PAGEOFFSET, 326 .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */ 327 .dma_attr_burstsizes = 0x7ff, 328 .dma_attr_minxfer = 1, 329 .dma_attr_maxxfer = 0xffffffffU, 330 .dma_attr_seg = 0xffffffffffffffffULL, 331 .dma_attr_sgllen = 1, 332 .dma_attr_granular = 1, 333 .dma_attr_flags = 0 334 }; 335 336 /* DMA attributes for transmit data */ 337 static ddi_dma_attr_t tx_buf_dma_attr = { 338 .dma_attr_version = DMA_ATTR_V0, 339 .dma_attr_addr_lo = 0, 340 .dma_attr_addr_hi = 0xffffffffffffffffULL, 341 .dma_attr_count_max = MMU_PAGEOFFSET, 342 .dma_attr_align = 1, 343 .dma_attr_burstsizes = 0x7ff, 344 .dma_attr_minxfer = 1, 345 .dma_attr_maxxfer = 0xffffffffU, 346 .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */ 347 .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */ 348 .dma_attr_granular = 1, 349 .dma_attr_flags = 0 350 }; 351 352 /* DMA access attributes for registers and descriptors */ 353 static ddi_device_acc_attr_t accattr = { 354 DDI_DEVICE_ATTR_V0, 355 DDI_STRUCTURE_LE_ACC, /* This is a little-endian device */ 356 DDI_STRICTORDER_ACC 357 }; 358 359 /* DMA access attributes for data: NOT to be byte swapped. */ 360 static ddi_device_acc_attr_t data_accattr = { 361 DDI_DEVICE_ATTR_V0, 362 DDI_NEVERSWAP_ACC, 363 DDI_STRICTORDER_ACC 364 }; 365 366 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach, 367 nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported); 368 369 static struct modldrv xnf_modldrv = { 370 &mod_driverops, 371 "Virtual Ethernet driver", 372 &xnf_dev_ops 373 }; 374 375 static struct modlinkage modlinkage = { 376 MODREV_1, &xnf_modldrv, NULL 377 }; 378 379 int 380 _init(void) 381 { 382 int r; 383 384 mac_init_ops(&xnf_dev_ops, "xnf"); 385 r = mod_install(&modlinkage); 386 if (r != DDI_SUCCESS) 387 mac_fini_ops(&xnf_dev_ops); 388 389 return (r); 390 } 391 392 int 393 _fini(void) 394 { 395 return (EBUSY); /* XXPV should be removable */ 396 } 397 398 int 399 _info(struct modinfo *modinfop) 400 { 401 return (mod_info(&modlinkage, modinfop)); 402 } 403 404 /* 405 * Acquire a grant reference. 
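 *
 * A reference obtained here is only a free entry claimed from the grant
 * table; it does not by itself give the backend access to any page. A
 * minimal sketch of the lifecycle as used elsewhere in this file
 * (illustrative only; `oeid' and `mfn' stand in for the backend domain
 * id and the machine frame being shared):
 *
 *	grant_ref_t gref = xnf_gref_get(xnfp);
 *	if (gref != INVALID_GRANT_REF) {
 *		gnttab_grant_foreign_access_ref(gref, oeid, mfn, 1);
 *		...				(backend reads the page)
 *		(void) gnttab_end_foreign_access_ref(gref, 1);
 *		xnf_gref_put(xnfp, gref);
 *	}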
406 */ 407 static grant_ref_t 408 xnf_gref_get(xnf_t *xnfp) 409 { 410 grant_ref_t gref; 411 412 mutex_enter(&xnfp->xnf_gref_lock); 413 414 do { 415 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head); 416 417 } while ((gref == INVALID_GRANT_REF) && 418 (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0)); 419 420 mutex_exit(&xnfp->xnf_gref_lock); 421 422 if (gref == INVALID_GRANT_REF) { 423 xnfp->xnf_stat_gref_failure++; 424 } else { 425 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding); 426 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak) 427 xnfp->xnf_stat_gref_peak = 428 xnfp->xnf_stat_gref_outstanding; 429 } 430 431 return (gref); 432 } 433 434 /* 435 * Release a grant reference. 436 */ 437 static void 438 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref) 439 { 440 ASSERT(gref != INVALID_GRANT_REF); 441 442 mutex_enter(&xnfp->xnf_gref_lock); 443 gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref); 444 mutex_exit(&xnfp->xnf_gref_lock); 445 446 atomic_dec_64(&xnfp->xnf_stat_gref_outstanding); 447 } 448 449 /* 450 * Acquire a transmit id. 451 */ 452 static xnf_txid_t * 453 xnf_txid_get(xnf_t *xnfp) 454 { 455 xnf_txid_t *tidp; 456 457 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 458 459 if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID) 460 return (NULL); 461 462 ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head)); 463 464 tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head); 465 xnfp->xnf_tx_pkt_id_head = tidp->next; 466 tidp->next = INVALID_TX_ID; 467 468 ASSERT(tidp->txbuf == NULL); 469 470 return (tidp); 471 } 472 473 /* 474 * Release a transmit id. 475 */ 476 static void 477 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp) 478 { 479 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 480 ASSERT(TX_ID_VALID(tidp->id)); 481 ASSERT(tidp->next == INVALID_TX_ID); 482 483 tidp->txbuf = NULL; 484 tidp->next = xnfp->xnf_tx_pkt_id_head; 485 xnfp->xnf_tx_pkt_id_head = tidp->id; 486 } 487 488 static void 489 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp) 490 { 491 ASSERT3U(txp->tx_type, ==, TX_DATA); 492 493 /* 494 * We are either using a lookaside buffer or we are mapping existing 495 * buffers. 496 */ 497 if (txp->tx_bdesc != NULL) { 498 ASSERT(!txp->tx_handle_bound); 499 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE); 500 } else { 501 if (txp->tx_txreq.gref != INVALID_GRANT_REF) { 502 if (gnttab_query_foreign_access(txp->tx_txreq.gref) != 503 0) { 504 cmn_err(CE_PANIC, "tx grant %d still in use by " 505 "backend domain", txp->tx_txreq.gref); 506 } 507 (void) gnttab_end_foreign_access_ref( 508 txp->tx_txreq.gref, 1); 509 xnf_gref_put(xnfp, txp->tx_txreq.gref); 510 } 511 512 if (txp->tx_handle_bound) 513 (void) ddi_dma_unbind_handle(txp->tx_dma_handle); 514 } 515 516 if (txp->tx_mp != NULL) 517 freemsg(txp->tx_mp); 518 519 if (txp->tx_prev != NULL) { 520 ASSERT3P(txp->tx_prev->tx_next, ==, txp); 521 txp->tx_prev->tx_next = NULL; 522 } 523 524 if (txp->tx_txreq.id != INVALID_TX_ID) { 525 /* 526 * This should be only possible when resuming from a suspend. 527 */ 528 ASSERT(!xnfp->xnf_connected); 529 xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id)); 530 txp->tx_txreq.id = INVALID_TX_ID; 531 } 532 533 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 534 } 535 536 static void 537 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp) 538 { 539 if (txp == NULL) 540 return; 541 542 while (txp->tx_next != NULL) 543 txp = txp->tx_next; 544 545 /* 546 * We free the chain in reverse order so that grants can be released 547 * for all dma chunks before unbinding the dma handles. 
The mblk is 548 * freed last, after all its fragments' dma handles are unbound. 549 */ 550 xnf_txbuf_t *prev; 551 for (; txp != NULL; txp = prev) { 552 prev = txp->tx_prev; 553 xnf_data_txbuf_free(xnfp, txp); 554 } 555 } 556 557 static xnf_txbuf_t * 558 xnf_data_txbuf_alloc(xnf_t *xnfp, int flag) 559 { 560 xnf_txbuf_t *txp; 561 562 if ((txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, flag)) == NULL) { 563 return (NULL); 564 } 565 566 txp->tx_type = TX_DATA; 567 txp->tx_next = NULL; 568 txp->tx_prev = NULL; 569 txp->tx_head = txp; 570 txp->tx_frags_to_ack = 0; 571 txp->tx_mp = NULL; 572 txp->tx_bdesc = NULL; 573 txp->tx_handle_bound = B_FALSE; 574 txp->tx_txreq.gref = INVALID_GRANT_REF; 575 txp->tx_txreq.id = INVALID_TX_ID; 576 577 return (txp); 578 } 579 580 /* 581 * Get `wanted' slots in the transmit ring, waiting for at least that 582 * number if `wait' is B_TRUE. Force the ring to be cleaned by setting 583 * `wanted' to zero. 584 * 585 * Return the number of slots available. 586 */ 587 static int 588 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait) 589 { 590 int slotsfree; 591 boolean_t forced_clean = (wanted == 0); 592 593 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 594 595 /* LINTED: constant in conditional context */ 596 while (B_TRUE) { 597 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring); 598 599 if ((slotsfree < wanted) || forced_clean) 600 slotsfree = xnf_tx_clean_ring(xnfp); 601 602 /* 603 * If there are more than we need free, tell other 604 * people to come looking again. We hold txlock, so we 605 * are able to take our slots before anyone else runs. 606 */ 607 if (slotsfree > wanted) 608 cv_broadcast(&xnfp->xnf_cv_tx_slots); 609 610 if (slotsfree >= wanted) 611 break; 612 613 if (!wait) 614 break; 615 616 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock); 617 } 618 619 ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring))); 620 621 return (slotsfree); 622 } 623 624 static int 625 xnf_setup_rings(xnf_t *xnfp) 626 { 627 domid_t oeid; 628 struct xenbus_device *xsd; 629 RING_IDX i; 630 int err; 631 xnf_txid_t *tidp; 632 xnf_buf_t **bdescp; 633 634 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 635 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 636 637 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 638 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 639 640 err = gnttab_grant_foreign_access(oeid, 641 xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0); 642 if (err <= 0) { 643 err = -err; 644 xenbus_dev_error(xsd, err, "granting access to tx ring page"); 645 goto out; 646 } 647 xnfp->xnf_tx_ring_ref = (grant_ref_t)err; 648 649 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 650 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 651 652 err = gnttab_grant_foreign_access(oeid, 653 xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0); 654 if (err <= 0) { 655 err = -err; 656 xenbus_dev_error(xsd, err, "granting access to rx ring page"); 657 goto out; 658 } 659 xnfp->xnf_rx_ring_ref = (grant_ref_t)err; 660 661 mutex_enter(&xnfp->xnf_txlock); 662 663 /* 664 * We first cleanup the TX ring in case we are doing a resume. 665 * Note that this can lose packets, but we expect to stagger on. 666 */ 667 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. 
*/ 668 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 669 i < NET_TX_RING_SIZE; 670 i++, tidp++) { 671 xnf_txbuf_t *txp = tidp->txbuf; 672 if (txp == NULL) 673 continue; 674 675 switch (txp->tx_type) { 676 case TX_DATA: 677 /* 678 * txid_put() will be called for each txbuf's txid in 679 * the chain which will result in clearing tidp->txbuf. 680 */ 681 xnf_data_txbuf_free_chain(xnfp, txp); 682 683 break; 684 685 case TX_MCAST_REQ: 686 txp->tx_type = TX_MCAST_RSP; 687 txp->tx_status = NETIF_RSP_DROPPED; 688 cv_broadcast(&xnfp->xnf_cv_multicast); 689 690 /* 691 * The request consumed two slots in the ring, 692 * yet only a single xnf_txid_t is used. Step 693 * over the empty slot. 694 */ 695 i++; 696 ASSERT3U(i, <, NET_TX_RING_SIZE); 697 break; 698 699 case TX_MCAST_RSP: 700 break; 701 } 702 } 703 704 /* 705 * Now purge old list and add each txid to the new free list. 706 */ 707 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */ 708 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 709 i < NET_TX_RING_SIZE; 710 i++, tidp++) { 711 tidp->id = i; 712 ASSERT3P(tidp->txbuf, ==, NULL); 713 tidp->next = INVALID_TX_ID; /* Appease txid_put(). */ 714 xnf_txid_put(xnfp, tidp); 715 } 716 717 /* LINTED: constant in conditional context */ 718 SHARED_RING_INIT(xnfp->xnf_tx_ring.sring); 719 /* LINTED: constant in conditional context */ 720 FRONT_RING_INIT(&xnfp->xnf_tx_ring, 721 xnfp->xnf_tx_ring.sring, PAGESIZE); 722 723 mutex_exit(&xnfp->xnf_txlock); 724 725 mutex_enter(&xnfp->xnf_rxlock); 726 727 /* 728 * Clean out any buffers currently posted to the receive ring 729 * before we reset it. 730 */ 731 for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0]; 732 i < NET_RX_RING_SIZE; 733 i++, bdescp++) { 734 if (*bdescp != NULL) { 735 xnf_buf_put(xnfp, *bdescp, B_FALSE); 736 *bdescp = NULL; 737 } 738 } 739 740 /* LINTED: constant in conditional context */ 741 SHARED_RING_INIT(xnfp->xnf_rx_ring.sring); 742 /* LINTED: constant in conditional context */ 743 FRONT_RING_INIT(&xnfp->xnf_rx_ring, 744 xnfp->xnf_rx_ring.sring, PAGESIZE); 745 746 /* 747 * Fill the ring with buffers. 748 */ 749 for (i = 0; i < NET_RX_RING_SIZE; i++) { 750 xnf_buf_t *bdesc; 751 752 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE); 753 VERIFY(bdesc != NULL); 754 xnf_rxbuf_hang(xnfp, bdesc); 755 } 756 757 /* LINTED: constant in conditional context */ 758 RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring); 759 760 mutex_exit(&xnfp->xnf_rxlock); 761 762 return (0); 763 764 out: 765 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 766 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 767 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 768 769 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 770 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 771 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 772 773 return (err); 774 } 775 776 /* 777 * Connect driver to back end, called to set up communication with 778 * back end driver both initially and on resume after restore/migrate. 
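 *
 * In outline (a summary of the code below, not an independent
 * specification of the protocol):
 *
 *	1. (Re)grant the tx/rx ring pages via xnf_setup_rings().
 *	2. In a single xenbus transaction, publish tx-ring-ref,
 *	   rx-ring-ref, event-channel and our feature-/request- keys.
 *	3. Switch our xenbus state to XenbusStateConnected.
 *	4. If the transaction ends with EAGAIN, retry from step 2.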
779 */ 780 void 781 xnf_be_connect(xnf_t *xnfp) 782 { 783 const char *message; 784 xenbus_transaction_t xbt; 785 struct xenbus_device *xsd; 786 char *xsname; 787 int err; 788 789 ASSERT(!xnfp->xnf_connected); 790 791 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 792 xsname = xvdi_get_xsname(xnfp->xnf_devinfo); 793 794 err = xnf_setup_rings(xnfp); 795 if (err != 0) { 796 cmn_err(CE_WARN, "failed to set up tx/rx rings"); 797 xenbus_dev_error(xsd, err, "setting up ring"); 798 return; 799 } 800 801 again: 802 err = xenbus_transaction_start(&xbt); 803 if (err != 0) { 804 xenbus_dev_error(xsd, EIO, "starting transaction"); 805 return; 806 } 807 808 err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u", 809 xnfp->xnf_tx_ring_ref); 810 if (err != 0) { 811 message = "writing tx ring-ref"; 812 goto abort_transaction; 813 } 814 815 err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u", 816 xnfp->xnf_rx_ring_ref); 817 if (err != 0) { 818 message = "writing rx ring-ref"; 819 goto abort_transaction; 820 } 821 822 err = xenbus_printf(xbt, xsname, "event-channel", "%u", 823 xnfp->xnf_evtchn); 824 if (err != 0) { 825 message = "writing event-channel"; 826 goto abort_transaction; 827 } 828 829 err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1); 830 if (err != 0) { 831 message = "writing feature-rx-notify"; 832 goto abort_transaction; 833 } 834 835 err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1); 836 if (err != 0) { 837 message = "writing request-rx-copy"; 838 goto abort_transaction; 839 } 840 841 if (xnfp->xnf_be_mcast_control) { 842 err = xenbus_printf(xbt, xsname, "request-multicast-control", 843 "%d", 1); 844 if (err != 0) { 845 message = "writing request-multicast-control"; 846 goto abort_transaction; 847 } 848 } 849 850 /* 851 * Tell backend if we support scatter-gather lists on the rx side. 852 */ 853 err = xenbus_printf(xbt, xsname, "feature-sg", "%d", 854 xnf_enable_rx_sg ? 1 : 0); 855 if (err != 0) { 856 message = "writing feature-sg"; 857 goto abort_transaction; 858 } 859 860 /* 861 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is 862 * a prerequisite. 863 */ 864 err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d", 865 (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0); 866 if (err != 0) { 867 message = "writing feature-gso-tcpv4"; 868 goto abort_transaction; 869 } 870 871 err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected); 872 if (err != 0) { 873 message = "switching state to XenbusStateConnected"; 874 goto abort_transaction; 875 } 876 877 err = xenbus_transaction_end(xbt, 0); 878 if (err != 0) { 879 if (err == EAGAIN) 880 goto again; 881 xenbus_dev_error(xsd, err, "completing transaction"); 882 } 883 884 return; 885 886 abort_transaction: 887 (void) xenbus_transaction_end(xbt, 1); 888 xenbus_dev_error(xsd, err, "%s", message); 889 } 890 891 /* 892 * Read configuration information from xenstore. 893 */ 894 void 895 xnf_read_config(xnf_t *xnfp) 896 { 897 int err, be_cap; 898 char mac[ETHERADDRL * 3]; 899 char *oename = xvdi_get_oename(xnfp->xnf_devinfo); 900 901 err = xenbus_scanf(XBT_NULL, oename, "mac", 902 "%s", (char *)&mac[0]); 903 if (err != 0) { 904 /* 905 * bad: we're supposed to be set up with a proper mac 906 * addr. 
at this point 907 */ 908 cmn_err(CE_WARN, "%s%d: no mac address", 909 ddi_driver_name(xnfp->xnf_devinfo), 910 ddi_get_instance(xnfp->xnf_devinfo)); 911 return; 912 } 913 if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) { 914 err = ENOENT; 915 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT, 916 "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo)); 917 return; 918 } 919 920 err = xenbus_scanf(XBT_NULL, oename, 921 "feature-rx-copy", "%d", &be_cap); 922 /* 923 * If we fail to read the store we assume that the key is 924 * absent, implying an older domain at the far end. Older 925 * domains cannot do HV copy. 926 */ 927 if (err != 0) 928 be_cap = 0; 929 xnfp->xnf_be_rx_copy = (be_cap != 0); 930 931 err = xenbus_scanf(XBT_NULL, oename, 932 "feature-multicast-control", "%d", &be_cap); 933 /* 934 * If we fail to read the store we assume that the key is 935 * absent, implying an older domain at the far end. Older 936 * domains do not support multicast control. 937 */ 938 if (err != 0) 939 be_cap = 0; 940 xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control; 941 942 /* 943 * See if back-end supports scatter-gather for transmits. If not, 944 * we will not support LSO and limit the mtu to 1500. 945 */ 946 err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap); 947 if (err != 0) { 948 be_cap = 0; 949 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 950 "'feature-sg' from backend driver"); 951 } 952 if (be_cap == 0) { 953 dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not " 954 "supported for transmits in the backend driver. LSO is " 955 "disabled and MTU is restricted to 1500 bytes."); 956 } 957 xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg; 958 959 if (xnfp->xnf_be_tx_sg) { 960 /* 961 * Check if LSO is supported. Currently we only check for 962 * IPv4 as Illumos doesn't support LSO for IPv6. 963 */ 964 err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d", 965 &be_cap); 966 if (err != 0) { 967 be_cap = 0; 968 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 969 "'feature-gso-tcpv4' from backend driver"); 970 } 971 if (be_cap == 0) { 972 dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not " 973 "supported by the backend driver. 
Performance " 974 "will be affected."); 975 } 976 xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso; 977 } 978 } 979 980 /* 981 * attach(9E) -- Attach a device to the system 982 */ 983 static int 984 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) 985 { 986 mac_register_t *macp; 987 xnf_t *xnfp; 988 int err; 989 char cachename[32]; 990 991 switch (cmd) { 992 case DDI_RESUME: 993 xnfp = ddi_get_driver_private(devinfo); 994 xnfp->xnf_gen++; 995 996 (void) xvdi_resume(devinfo); 997 (void) xvdi_alloc_evtchn(devinfo); 998 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 999 #ifdef XPV_HVM_DRIVER 1000 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, 1001 xnfp); 1002 #else 1003 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, 1004 (caddr_t)xnfp); 1005 #endif 1006 return (DDI_SUCCESS); 1007 1008 case DDI_ATTACH: 1009 break; 1010 1011 default: 1012 return (DDI_FAILURE); 1013 } 1014 1015 /* 1016 * Allocate gld_mac_info_t and xnf_instance structures 1017 */ 1018 macp = mac_alloc(MAC_VERSION); 1019 if (macp == NULL) 1020 return (DDI_FAILURE); 1021 xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP); 1022 1023 xnfp->xnf_tx_pkt_id = 1024 kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP); 1025 1026 xnfp->xnf_rx_pkt_info = 1027 kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP); 1028 1029 macp->m_dip = devinfo; 1030 macp->m_driver = xnfp; 1031 xnfp->xnf_devinfo = devinfo; 1032 1033 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1034 macp->m_src_addr = xnfp->xnf_mac_addr; 1035 macp->m_callbacks = &xnf_callbacks; 1036 macp->m_min_sdu = 0; 1037 xnfp->xnf_mtu = ETHERMTU; 1038 macp->m_max_sdu = xnfp->xnf_mtu; 1039 1040 xnfp->xnf_running = B_FALSE; 1041 xnfp->xnf_connected = B_FALSE; 1042 xnfp->xnf_be_rx_copy = B_FALSE; 1043 xnfp->xnf_be_mcast_control = B_FALSE; 1044 xnfp->xnf_need_sched = B_FALSE; 1045 1046 xnfp->xnf_rx_head = NULL; 1047 xnfp->xnf_rx_tail = NULL; 1048 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 1049 1050 #ifdef XPV_HVM_DRIVER 1051 /* Report our version to dom0 */ 1052 (void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d", 1053 HVMPV_XNF_VERS); 1054 #endif 1055 1056 /* 1057 * Get the iblock cookie with which to initialize the mutexes. 
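	 *
	 * Handing this cookie to mutex_init() below initializes the mutexes
	 * at the interrupt priority of our handler, so they are safe to
	 * take from xnf_intr().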
1058 */ 1059 if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie) 1060 != DDI_SUCCESS) 1061 goto failure; 1062 1063 mutex_init(&xnfp->xnf_txlock, 1064 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1065 mutex_init(&xnfp->xnf_rxlock, 1066 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1067 mutex_init(&xnfp->xnf_schedlock, 1068 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1069 mutex_init(&xnfp->xnf_gref_lock, 1070 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1071 1072 cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL); 1073 cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL); 1074 cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL); 1075 1076 (void) sprintf(cachename, "xnf_buf_cache_%d", 1077 ddi_get_instance(devinfo)); 1078 xnfp->xnf_buf_cache = kmem_cache_create(cachename, 1079 sizeof (xnf_buf_t), 0, 1080 xnf_buf_constructor, xnf_buf_destructor, 1081 NULL, xnfp, NULL, 0); 1082 if (xnfp->xnf_buf_cache == NULL) 1083 goto failure_0; 1084 1085 (void) sprintf(cachename, "xnf_tx_buf_cache_%d", 1086 ddi_get_instance(devinfo)); 1087 xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename, 1088 sizeof (xnf_txbuf_t), 0, 1089 xnf_tx_buf_constructor, xnf_tx_buf_destructor, 1090 NULL, xnfp, NULL, 0); 1091 if (xnfp->xnf_tx_buf_cache == NULL) 1092 goto failure_1; 1093 1094 xnfp->xnf_gref_head = INVALID_GRANT_REF; 1095 1096 if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) { 1097 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize " 1098 "driver data structures", 1099 ddi_get_instance(xnfp->xnf_devinfo)); 1100 goto failure_2; 1101 } 1102 1103 xnfp->xnf_rx_ring.sring->rsp_event = 1104 xnfp->xnf_tx_ring.sring->rsp_event = 1; 1105 1106 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 1107 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 1108 1109 /* set driver private pointer now */ 1110 ddi_set_driver_private(devinfo, xnfp); 1111 1112 if (!xnf_kstat_init(xnfp)) 1113 goto failure_3; 1114 1115 /* 1116 * Allocate an event channel, add the interrupt handler and 1117 * bind it to the event channel. 1118 */ 1119 (void) xvdi_alloc_evtchn(devinfo); 1120 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 1121 #ifdef XPV_HVM_DRIVER 1122 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp); 1123 #else 1124 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp); 1125 #endif 1126 1127 err = mac_register(macp, &xnfp->xnf_mh); 1128 mac_free(macp); 1129 macp = NULL; 1130 if (err != 0) 1131 goto failure_4; 1132 1133 if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL) 1134 != DDI_SUCCESS) 1135 goto failure_5; 1136 1137 #ifdef XPV_HVM_DRIVER 1138 /* 1139 * In the HVM case, this driver essentially replaces a driver for 1140 * a 'real' PCI NIC. Without the "model" property set to 1141 * "Ethernet controller", like the PCI code does, netbooting does 1142 * not work correctly, as strplumb_get_netdev_path() will not find 1143 * this interface. 
1144 */ 1145 (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model", 1146 "Ethernet controller"); 1147 #endif 1148 1149 return (DDI_SUCCESS); 1150 1151 failure_5: 1152 (void) mac_unregister(xnfp->xnf_mh); 1153 1154 failure_4: 1155 #ifdef XPV_HVM_DRIVER 1156 ec_unbind_evtchn(xnfp->xnf_evtchn); 1157 xvdi_free_evtchn(devinfo); 1158 #else 1159 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1160 #endif 1161 xnfp->xnf_evtchn = INVALID_EVTCHN; 1162 kstat_delete(xnfp->xnf_kstat_aux); 1163 1164 failure_3: 1165 xnf_release_dma_resources(xnfp); 1166 1167 failure_2: 1168 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1169 1170 failure_1: 1171 kmem_cache_destroy(xnfp->xnf_buf_cache); 1172 1173 failure_0: 1174 cv_destroy(&xnfp->xnf_cv_tx_slots); 1175 cv_destroy(&xnfp->xnf_cv_multicast); 1176 cv_destroy(&xnfp->xnf_cv_state); 1177 1178 mutex_destroy(&xnfp->xnf_gref_lock); 1179 mutex_destroy(&xnfp->xnf_schedlock); 1180 mutex_destroy(&xnfp->xnf_rxlock); 1181 mutex_destroy(&xnfp->xnf_txlock); 1182 1183 failure: 1184 kmem_free(xnfp, sizeof (*xnfp)); 1185 if (macp != NULL) 1186 mac_free(macp); 1187 1188 return (DDI_FAILURE); 1189 } 1190 1191 /* detach(9E) -- Detach a device from the system */ 1192 static int 1193 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) 1194 { 1195 xnf_t *xnfp; /* Our private device info */ 1196 1197 xnfp = ddi_get_driver_private(devinfo); 1198 1199 switch (cmd) { 1200 case DDI_SUSPEND: 1201 #ifdef XPV_HVM_DRIVER 1202 ec_unbind_evtchn(xnfp->xnf_evtchn); 1203 xvdi_free_evtchn(devinfo); 1204 #else 1205 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1206 #endif 1207 1208 xvdi_suspend(devinfo); 1209 1210 mutex_enter(&xnfp->xnf_rxlock); 1211 mutex_enter(&xnfp->xnf_txlock); 1212 1213 xnfp->xnf_evtchn = INVALID_EVTCHN; 1214 xnfp->xnf_connected = B_FALSE; 1215 mutex_exit(&xnfp->xnf_txlock); 1216 mutex_exit(&xnfp->xnf_rxlock); 1217 1218 /* claim link to be down after disconnect */ 1219 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN); 1220 return (DDI_SUCCESS); 1221 1222 case DDI_DETACH: 1223 break; 1224 1225 default: 1226 return (DDI_FAILURE); 1227 } 1228 1229 if (xnfp->xnf_connected) 1230 return (DDI_FAILURE); 1231 1232 /* 1233 * Cannot detach if we have xnf_buf_t outstanding. 1234 */ 1235 if (xnfp->xnf_stat_buf_allocated > 0) 1236 return (DDI_FAILURE); 1237 1238 if (mac_unregister(xnfp->xnf_mh) != 0) 1239 return (DDI_FAILURE); 1240 1241 kstat_delete(xnfp->xnf_kstat_aux); 1242 1243 /* Stop the receiver */ 1244 xnf_stop(xnfp); 1245 1246 xvdi_remove_event_handler(devinfo, XS_OE_STATE); 1247 1248 /* Remove the interrupt */ 1249 #ifdef XPV_HVM_DRIVER 1250 ec_unbind_evtchn(xnfp->xnf_evtchn); 1251 xvdi_free_evtchn(devinfo); 1252 #else 1253 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1254 #endif 1255 1256 /* Release any pending xmit mblks */ 1257 xnf_release_mblks(xnfp); 1258 1259 /* Release all DMA resources */ 1260 xnf_release_dma_resources(xnfp); 1261 1262 cv_destroy(&xnfp->xnf_cv_tx_slots); 1263 cv_destroy(&xnfp->xnf_cv_multicast); 1264 cv_destroy(&xnfp->xnf_cv_state); 1265 1266 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1267 kmem_cache_destroy(xnfp->xnf_buf_cache); 1268 1269 mutex_destroy(&xnfp->xnf_gref_lock); 1270 mutex_destroy(&xnfp->xnf_schedlock); 1271 mutex_destroy(&xnfp->xnf_rxlock); 1272 mutex_destroy(&xnfp->xnf_txlock); 1273 1274 kmem_free(xnfp, sizeof (*xnfp)); 1275 1276 return (DDI_SUCCESS); 1277 } 1278 1279 /* 1280 * xnf_set_mac_addr() -- set the physical network address on the board. 
1281 */ 1282 static int 1283 xnf_set_mac_addr(void *arg, const uint8_t *macaddr) 1284 { 1285 _NOTE(ARGUNUSED(arg, macaddr)); 1286 1287 /* 1288 * We can't set our macaddr. 1289 */ 1290 return (ENOTSUP); 1291 } 1292 1293 /* 1294 * xnf_set_multicast() -- set (enable) or disable a multicast address. 1295 * 1296 * Program the hardware to enable/disable the multicast address 1297 * in "mca". Enable if "add" is true, disable if false. 1298 */ 1299 static int 1300 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca) 1301 { 1302 xnf_t *xnfp = arg; 1303 xnf_txbuf_t *txp; 1304 int n_slots; 1305 RING_IDX slot; 1306 xnf_txid_t *tidp; 1307 netif_tx_request_t *txrp; 1308 struct netif_extra_info *erp; 1309 boolean_t notify, result; 1310 1311 /* 1312 * If the backend does not support multicast control then we 1313 * must assume that the right packets will just arrive. 1314 */ 1315 if (!xnfp->xnf_be_mcast_control) 1316 return (0); 1317 1318 txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); 1319 1320 mutex_enter(&xnfp->xnf_txlock); 1321 1322 /* 1323 * If we're not yet connected then claim success. This is 1324 * acceptable because we refresh the entire set of multicast 1325 * addresses when we get connected. 1326 * 1327 * We can't wait around here because the MAC layer expects 1328 * this to be a non-blocking operation - waiting ends up 1329 * causing a deadlock during resume. 1330 */ 1331 if (!xnfp->xnf_connected) { 1332 mutex_exit(&xnfp->xnf_txlock); 1333 return (0); 1334 } 1335 1336 /* 1337 * 1. Acquire two slots in the ring. 1338 * 2. Fill in the slots. 1339 * 3. Request notification when the operation is done. 1340 * 4. Kick the peer. 1341 * 5. Wait for the response via xnf_tx_clean_ring(). 1342 */ 1343 1344 n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE); 1345 ASSERT(n_slots >= 2); 1346 1347 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1348 tidp = xnf_txid_get(xnfp); 1349 VERIFY(tidp != NULL); 1350 1351 txp->tx_type = TX_MCAST_REQ; 1352 txp->tx_slot = slot; 1353 1354 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1355 erp = (struct netif_extra_info *) 1356 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1); 1357 1358 txrp->gref = 0; 1359 txrp->size = 0; 1360 txrp->offset = 0; 1361 /* Set tx_txreq.id to appease xnf_tx_clean_ring(). */ 1362 txrp->id = txp->tx_txreq.id = tidp->id; 1363 txrp->flags = NETTXF_extra_info; 1364 1365 erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD : 1366 XEN_NETIF_EXTRA_TYPE_MCAST_DEL; 1367 bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL); 1368 1369 tidp->txbuf = txp; 1370 1371 xnfp->xnf_tx_ring.req_prod_pvt = slot + 2; 1372 1373 mutex_enter(&xnfp->xnf_schedlock); 1374 xnfp->xnf_pending_multicast++; 1375 mutex_exit(&xnfp->xnf_schedlock); 1376 1377 /* LINTED: constant in conditional context */ 1378 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, 1379 notify); 1380 if (notify) 1381 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1382 1383 while (txp->tx_type == TX_MCAST_REQ) 1384 cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock); 1385 1386 ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP); 1387 1388 mutex_enter(&xnfp->xnf_schedlock); 1389 xnfp->xnf_pending_multicast--; 1390 mutex_exit(&xnfp->xnf_schedlock); 1391 1392 result = (txp->tx_status == NETIF_RSP_OKAY); 1393 1394 xnf_txid_put(xnfp, tidp); 1395 1396 mutex_exit(&xnfp->xnf_txlock); 1397 1398 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 1399 1400 return (result ? 
0 : 1); 1401 } 1402 1403 /* 1404 * xnf_set_promiscuous() -- set or reset promiscuous mode on the board 1405 * 1406 * Program the hardware to enable/disable promiscuous mode. 1407 */ 1408 static int 1409 xnf_set_promiscuous(void *arg, boolean_t on) 1410 { 1411 _NOTE(ARGUNUSED(arg, on)); 1412 1413 /* 1414 * We can't really do this, but we pretend that we can in 1415 * order that snoop will work. 1416 */ 1417 return (0); 1418 } 1419 1420 /* 1421 * Clean buffers that we have responses for from the transmit ring. 1422 */ 1423 static int 1424 xnf_tx_clean_ring(xnf_t *xnfp) 1425 { 1426 boolean_t work_to_do; 1427 1428 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1429 1430 loop: 1431 while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) { 1432 RING_IDX cons, prod, i; 1433 1434 cons = xnfp->xnf_tx_ring.rsp_cons; 1435 prod = xnfp->xnf_tx_ring.sring->rsp_prod; 1436 membar_consumer(); 1437 /* 1438 * Clean tx requests from ring that we have responses 1439 * for. 1440 */ 1441 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod); 1442 for (i = cons; i != prod; i++) { 1443 netif_tx_response_t *trp; 1444 xnf_txid_t *tidp; 1445 xnf_txbuf_t *txp; 1446 1447 trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i); 1448 /* 1449 * if this slot was occupied by netif_extra_info_t, 1450 * then the response will be NETIF_RSP_NULL. In this 1451 * case there are no resources to clean up. 1452 */ 1453 if (trp->status == NETIF_RSP_NULL) 1454 continue; 1455 1456 ASSERT(TX_ID_VALID(trp->id)); 1457 1458 tidp = TX_ID_TO_TXID(xnfp, trp->id); 1459 ASSERT3U(tidp->id, ==, trp->id); 1460 ASSERT3U(tidp->next, ==, INVALID_TX_ID); 1461 1462 txp = tidp->txbuf; 1463 ASSERT(txp != NULL); 1464 ASSERT3U(txp->tx_txreq.id, ==, trp->id); 1465 1466 switch (txp->tx_type) { 1467 case TX_DATA: 1468 /* 1469 * We must put the txid for each response we 1470 * acknowledge to make sure that we never have 1471 * more free slots than txids. Because of this 1472 * we do it here instead of waiting for it to 1473 * be done in xnf_data_txbuf_free_chain(). 1474 */ 1475 xnf_txid_put(xnfp, tidp); 1476 txp->tx_txreq.id = INVALID_TX_ID; 1477 ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0); 1478 txp->tx_head->tx_frags_to_ack--; 1479 1480 /* 1481 * We clean the whole chain once we got a 1482 * response for each fragment. 1483 */ 1484 if (txp->tx_head->tx_frags_to_ack == 0) 1485 xnf_data_txbuf_free_chain(xnfp, txp); 1486 1487 break; 1488 1489 case TX_MCAST_REQ: 1490 txp->tx_type = TX_MCAST_RSP; 1491 txp->tx_status = trp->status; 1492 cv_broadcast(&xnfp->xnf_cv_multicast); 1493 1494 break; 1495 1496 default: 1497 cmn_err(CE_PANIC, "xnf_tx_clean_ring: " 1498 "invalid xnf_txbuf_t type: %d", 1499 txp->tx_type); 1500 break; 1501 } 1502 } 1503 /* 1504 * Record the last response we dealt with so that we 1505 * know where to start next time around. 1506 */ 1507 xnfp->xnf_tx_ring.rsp_cons = prod; 1508 membar_enter(); 1509 } 1510 1511 /* LINTED: constant in conditional context */ 1512 RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do); 1513 if (work_to_do) 1514 goto loop; 1515 1516 return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring)); 1517 } 1518 1519 /* 1520 * Allocate and fill in a look-aside buffer for the packet `mp'. Used 1521 * to ensure that the packet is physically contiguous and contained 1522 * within a single page. 
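 *
 * The tx path falls back to this copy (see xnf_mblk_copy() and the
 * force_copy logic in xnf_send()) when the ethernet header would
 * straddle a page boundary, or when the packet spans multiple pages and
 * the backend lacks scatter-gather support; in both cases the caller
 * has checked, or asserts, that the whole packet fits within PAGESIZE.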
1523 */ 1524 static xnf_buf_t * 1525 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen) 1526 { 1527 xnf_buf_t *bd; 1528 caddr_t bp; 1529 1530 if ((bd = xnf_buf_get(xnfp, KM_NOSLEEP, B_TRUE)) == NULL) { 1531 return (NULL); 1532 } 1533 1534 bp = bd->buf; 1535 while (mp != NULL) { 1536 size_t len = MBLKL(mp); 1537 1538 bcopy(mp->b_rptr, bp, len); 1539 bp += len; 1540 1541 mp = mp->b_cont; 1542 } 1543 1544 *plen = bp - bd->buf; 1545 ASSERT3U(*plen, <=, PAGESIZE); 1546 1547 xnfp->xnf_stat_tx_lookaside++; 1548 1549 return (bd); 1550 } 1551 1552 /* 1553 * Insert the pseudo-header checksum into the packet. 1554 * Assumes packet is IPv4, TCP/UDP since we only advertised support for 1555 * HCKSUM_INET_FULL_V4. 1556 */ 1557 int 1558 xnf_pseudo_cksum(mblk_t *mp) 1559 { 1560 struct ether_header *ehp; 1561 uint16_t sap, iplen, *stuff; 1562 uint32_t cksum; 1563 size_t len; 1564 ipha_t *ipha; 1565 ipaddr_t src, dst; 1566 uchar_t *ptr; 1567 1568 ptr = mp->b_rptr; 1569 len = MBLKL(mp); 1570 1571 /* Each header must fit completely in an mblk. */ 1572 ASSERT3U(len, >=, sizeof (*ehp)); 1573 1574 ehp = (struct ether_header *)ptr; 1575 1576 if (ntohs(ehp->ether_type) == VLAN_TPID) { 1577 struct ether_vlan_header *evhp; 1578 ASSERT3U(len, >=, sizeof (*evhp)); 1579 evhp = (struct ether_vlan_header *)ptr; 1580 sap = ntohs(evhp->ether_type); 1581 ptr += sizeof (*evhp); 1582 len -= sizeof (*evhp); 1583 } else { 1584 sap = ntohs(ehp->ether_type); 1585 ptr += sizeof (*ehp); 1586 len -= sizeof (*ehp); 1587 } 1588 1589 ASSERT3U(sap, ==, ETHERTYPE_IP); 1590 1591 /* 1592 * Ethernet and IP headers may be in different mblks. 1593 */ 1594 ASSERT3P(ptr, <=, mp->b_wptr); 1595 if (ptr == mp->b_wptr) { 1596 mp = mp->b_cont; 1597 ptr = mp->b_rptr; 1598 len = MBLKL(mp); 1599 } 1600 1601 ASSERT3U(len, >=, sizeof (ipha_t)); 1602 ipha = (ipha_t *)ptr; 1603 1604 /* 1605 * We assume the IP header has no options. (This is enforced in 1606 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM). 1607 */ 1608 ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH); 1609 iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; 1610 1611 ptr += IP_SIMPLE_HDR_LENGTH; 1612 len -= IP_SIMPLE_HDR_LENGTH; 1613 1614 /* 1615 * IP and L4 headers may be in different mblks. 1616 */ 1617 ASSERT3P(ptr, <=, mp->b_wptr); 1618 if (ptr == mp->b_wptr) { 1619 mp = mp->b_cont; 1620 ptr = mp->b_rptr; 1621 len = MBLKL(mp); 1622 } 1623 1624 switch (ipha->ipha_protocol) { 1625 case IPPROTO_TCP: 1626 ASSERT3U(len, >=, sizeof (tcph_t)); 1627 stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET); 1628 cksum = IP_TCP_CSUM_COMP; 1629 break; 1630 case IPPROTO_UDP: 1631 ASSERT3U(len, >=, sizeof (struct udphdr)); 1632 stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET); 1633 cksum = IP_UDP_CSUM_COMP; 1634 break; 1635 default: 1636 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d", 1637 ipha->ipha_protocol); 1638 return (EINVAL); 1639 } 1640 1641 src = ipha->ipha_src; 1642 dst = ipha->ipha_dst; 1643 1644 cksum += (dst >> 16) + (dst & 0xFFFF); 1645 cksum += (src >> 16) + (src & 0xFFFF); 1646 cksum += htons(iplen); 1647 1648 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1649 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1650 1651 ASSERT(cksum <= 0xFFFF); 1652 1653 *stuff = (uint16_t)(cksum ? cksum : ~cksum); 1654 1655 return (0); 1656 } 1657 1658 /* 1659 * Push a packet into the transmit ring. 1660 * 1661 * Note: the format of a tx packet that spans multiple slots is similar to 1662 * what is described in xnf_rx_one_packet(). 
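 *
 * For illustration, a three-fragment LSO packet prepared by xnf_send()
 * occupies slots as follows (derived from the code below: the first
 * request carries the total packet length, and the extra-info slot
 * follows the first data fragment and consumes no txid):
 *
 * +------+---------------------+-------------------+-----------------------+
 * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
 * +------+---------------------+-------------------+-----------------------+
 * | 1    | netif_tx_request_t  | 1st data fragment | extra_info, more_data |
 * +------+---------------------+-------------------+-----------------------+
 * | 2    | netif_extra_info_t  | LSO (GSO) params  | [none]                |
 * +------+---------------------+-------------------+-----------------------+
 * | 3    | netif_tx_request_t  | 2nd data fragment | more_data             |
 * +------+---------------------+-------------------+-----------------------+
 * | 4    | netif_tx_request_t  | 3rd data fragment | [none]                |
 * +------+---------------------+-------------------+-----------------------+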
1663 */ 1664 static void 1665 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head) 1666 { 1667 int nslots = 0; 1668 int extras = 0; 1669 RING_IDX slot; 1670 boolean_t notify; 1671 1672 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1673 ASSERT(xnfp->xnf_running); 1674 1675 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1676 1677 /* 1678 * The caller has already checked that we have enough slots to proceed. 1679 */ 1680 for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) { 1681 xnf_txid_t *tidp; 1682 netif_tx_request_t *txrp; 1683 1684 tidp = xnf_txid_get(xnfp); 1685 VERIFY(tidp != NULL); 1686 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1687 1688 txp->tx_slot = slot; 1689 txp->tx_txreq.id = tidp->id; 1690 *txrp = txp->tx_txreq; 1691 1692 tidp->txbuf = txp; 1693 slot++; 1694 nslots++; 1695 1696 /* 1697 * When present, LSO info is placed in a slot after the first 1698 * data segment, and doesn't require a txid. 1699 */ 1700 if (txp->tx_txreq.flags & NETTXF_extra_info) { 1701 netif_extra_info_t *extra; 1702 ASSERT3U(nslots, ==, 1); 1703 1704 extra = (netif_extra_info_t *) 1705 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1706 *extra = txp->tx_extra; 1707 slot++; 1708 nslots++; 1709 extras = 1; 1710 } 1711 } 1712 1713 ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX); 1714 1715 /* 1716 * Store the number of data fragments. 1717 */ 1718 head->tx_frags_to_ack = nslots - extras; 1719 1720 xnfp->xnf_tx_ring.req_prod_pvt = slot; 1721 1722 /* 1723 * Tell the peer that we sent something, if it cares. 1724 */ 1725 /* LINTED: constant in conditional context */ 1726 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify); 1727 if (notify) 1728 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1729 } 1730 1731 static xnf_txbuf_t * 1732 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp) 1733 { 1734 xnf_txbuf_t *txp; 1735 size_t length; 1736 1737 if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) { 1738 return (NULL); 1739 } 1740 1741 txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length); 1742 if (txp->tx_bdesc == NULL) { 1743 xnf_data_txbuf_free(xnfp, txp); 1744 return (NULL); 1745 } 1746 txp->tx_mfn = txp->tx_bdesc->buf_mfn; 1747 txp->tx_txreq.gref = txp->tx_bdesc->grant_ref; 1748 txp->tx_txreq.size = length; 1749 txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET; 1750 txp->tx_txreq.flags = 0; 1751 1752 return (txp); 1753 } 1754 1755 static xnf_txbuf_t * 1756 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp) 1757 { 1758 xnf_txbuf_t *head = NULL; 1759 xnf_txbuf_t *tail = NULL; 1760 domid_t oeid; 1761 int nsegs = 0; 1762 1763 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 1764 1765 for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) { 1766 ddi_dma_handle_t dma_handle; 1767 const ddi_dma_cookie_t *dma_cookie, *dma_cookie_prev; 1768 xnf_txbuf_t *txp; 1769 1770 if (MBLKL(ml) == 0) 1771 continue; 1772 1773 if ((txp = xnf_data_txbuf_alloc(xnfp, KM_NOSLEEP)) == NULL) { 1774 goto error; 1775 } 1776 1777 if (head == NULL) { 1778 head = txp; 1779 } else { 1780 ASSERT(tail != NULL); 1781 TXBUF_SETNEXT(tail, txp); 1782 txp->tx_head = head; 1783 } 1784 1785 /* 1786 * The necessary segmentation rules (e.g. not crossing a page 1787 * boundary) are enforced by the dma attributes of the handle. 
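		 *
		 * Concretely, tx_buf_dma_attr (defined near the top of this
		 * file) sets dma_attr_seg to XEN_DATA_BOUNDARY - 1, so no
		 * cookie straddles that boundary, and dma_attr_sgllen to
		 * XEN_MAX_TX_DATA_PAGES, bounding the number of cookies; each
		 * cookie produced by ddi_dma_cookie_iter() below can therefore
		 * be granted to the backend as one page-contained fragment.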
1788 */ 1789 dma_handle = txp->tx_dma_handle; 1790 int ret = ddi_dma_addr_bind_handle(dma_handle, 1791 NULL, (char *)ml->b_rptr, MBLKL(ml), 1792 DDI_DMA_WRITE | DDI_DMA_STREAMING, 1793 DDI_DMA_DONTWAIT, 0, NULL, NULL); 1794 if (ret != DDI_DMA_MAPPED) { 1795 if (ret != DDI_DMA_NORESOURCES) { 1796 dev_err(xnfp->xnf_devinfo, CE_WARN, 1797 "ddi_dma_addr_bind_handle() failed " 1798 "[dma_error=%d]", ret); 1799 } 1800 goto error; 1801 } 1802 txp->tx_handle_bound = B_TRUE; 1803 1804 dma_cookie_prev = NULL; 1805 while ((dma_cookie = ddi_dma_cookie_iter(dma_handle, 1806 dma_cookie_prev)) != NULL) { 1807 if (nsegs == XEN_MAX_TX_DATA_PAGES) { 1808 dev_err(xnfp->xnf_devinfo, CE_WARN, 1809 "xnf_dmamap_alloc() failed: " 1810 "too many segments"); 1811 goto error; 1812 } 1813 if (dma_cookie_prev != NULL) { 1814 if ((txp = xnf_data_txbuf_alloc(xnfp, 1815 KM_NOSLEEP)) == NULL) { 1816 goto error; 1817 } 1818 ASSERT(tail != NULL); 1819 TXBUF_SETNEXT(tail, txp); 1820 txp->tx_head = head; 1821 } 1822 1823 txp->tx_mfn = 1824 xnf_btop(pa_to_ma(dma_cookie->dmac_laddress)); 1825 txp->tx_txreq.gref = xnf_gref_get(xnfp); 1826 if (txp->tx_txreq.gref == INVALID_GRANT_REF) { 1827 dev_err(xnfp->xnf_devinfo, CE_WARN, 1828 "xnf_dmamap_alloc() failed: " 1829 "invalid grant ref"); 1830 goto error; 1831 } 1832 gnttab_grant_foreign_access_ref(txp->tx_txreq.gref, 1833 oeid, txp->tx_mfn, 1); 1834 txp->tx_txreq.offset = 1835 dma_cookie->dmac_laddress & PAGEOFFSET; 1836 txp->tx_txreq.size = dma_cookie->dmac_size; 1837 txp->tx_txreq.flags = 0; 1838 1839 nsegs++; 1840 1841 if (tail != NULL) 1842 tail->tx_txreq.flags = NETTXF_more_data; 1843 tail = txp; 1844 1845 dma_cookie_prev = dma_cookie; 1846 } 1847 } 1848 1849 *countp = nsegs; 1850 return (head); 1851 1852 error: 1853 xnf_data_txbuf_free_chain(xnfp, head); 1854 return (NULL); 1855 } 1856 1857 static void 1858 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head, 1859 uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss) 1860 { 1861 if (lso_flags != 0) { 1862 ASSERT3U(lso_flags, ==, HW_LSO); 1863 ASSERT3P(head->tx_bdesc, ==, NULL); 1864 1865 head->tx_txreq.flags |= NETTXF_extra_info; 1866 netif_extra_info_t *extra = &head->tx_extra; 1867 extra->type = XEN_NETIF_EXTRA_TYPE_GSO; 1868 extra->flags = 0; 1869 extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; 1870 extra->u.gso.size = mss; 1871 extra->u.gso.features = 0; 1872 extra->u.gso.pad = 0; 1873 } else if (cksum_flags != 0) { 1874 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1875 /* 1876 * If the local protocol stack requests checksum 1877 * offload we set the 'checksum blank' flag, 1878 * indicating to the peer that we need the checksum 1879 * calculated for us. 1880 * 1881 * We _don't_ set the validated flag, because we haven't 1882 * validated that the data and the checksum match. 1883 * 1884 * Note: we already called xnf_pseudo_cksum() in 1885 * xnf_send(), so we just set the txreq flag here. 1886 */ 1887 head->tx_txreq.flags |= NETTXF_csum_blank; 1888 xnfp->xnf_stat_tx_cksum_deferred++; 1889 } 1890 } 1891 1892 /* 1893 * Send packet mp. Called by the MAC framework. 1894 */ 1895 static mblk_t * 1896 xnf_send(void *arg, mblk_t *mp) 1897 { 1898 xnf_t *xnfp = arg; 1899 xnf_txbuf_t *head; 1900 mblk_t *ml; 1901 int length; 1902 int pages, chunks, slots, slots_free; 1903 uint32_t cksum_flags, lso_flags, mss; 1904 boolean_t pulledup = B_FALSE; 1905 boolean_t force_copy = B_FALSE; 1906 1907 ASSERT3P(mp->b_next, ==, NULL); 1908 1909 mutex_enter(&xnfp->xnf_txlock); 1910 1911 /* 1912 * Wait until we are connected to the backend. 
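	 *
	 * (Sends that arrive while the device is resuming, or before the
	 * backend handshake has finished, park on xnf_cv_state rather than
	 * failing; the connection path is expected to broadcast this cv
	 * once xnf_connected is set.)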
1913 */ 1914 while (!xnfp->xnf_connected) 1915 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock); 1916 1917 /* 1918 * To simplify logic and be in sync with the rescheduling mechanism, 1919 * we require the maximum amount of slots that could be used by a 1920 * transaction to be free before proceeding. The only downside of doing 1921 * this is that it slightly reduces the effective size of the ring. 1922 */ 1923 slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE); 1924 if (slots_free < XEN_MAX_SLOTS_PER_TX) { 1925 /* 1926 * We need to ask for a re-schedule later as the ring is full. 1927 */ 1928 mutex_enter(&xnfp->xnf_schedlock); 1929 xnfp->xnf_need_sched = B_TRUE; 1930 mutex_exit(&xnfp->xnf_schedlock); 1931 1932 xnfp->xnf_stat_tx_defer++; 1933 mutex_exit(&xnfp->xnf_txlock); 1934 return (mp); 1935 } 1936 1937 /* 1938 * Get hw offload parameters. 1939 * This must be done before pulling up the mp as those parameters 1940 * are not copied over. 1941 */ 1942 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags); 1943 mac_lso_get(mp, &mss, &lso_flags); 1944 1945 /* 1946 * XXX: fix MAC framework so that we can advertise support for 1947 * partial checksum for IPv4 only. This way we won't need to calculate 1948 * the pseudo header checksum ourselves. 1949 */ 1950 if (cksum_flags != 0) { 1951 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1952 (void) xnf_pseudo_cksum(mp); 1953 } 1954 1955 pulledup: 1956 for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL; 1957 ml = ml->b_cont, chunks++) { 1958 pages += xnf_mblk_pages(ml); 1959 length += MBLKL(ml); 1960 } 1961 DTRACE_PROBE3(packet, int, length, int, chunks, int, pages); 1962 DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss); 1963 1964 /* 1965 * If the ethernet header crosses a page boundary the packet 1966 * will be dropped by the backend. In practice it seems like 1967 * this happens fairly rarely so we'll do nothing unless the 1968 * packet is small enough to fit in a look-aside buffer. 1969 */ 1970 if (((uintptr_t)mp->b_rptr & PAGEOFFSET) + 1971 sizeof (struct ether_header) > PAGESIZE) { 1972 xnfp->xnf_stat_tx_eth_hdr_split++; 1973 if (length <= PAGESIZE) 1974 force_copy = B_TRUE; 1975 } 1976 1977 if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) { 1978 /* 1979 * If the packet spans several pages and scatter-gather is not 1980 * supported then use a look-aside buffer. 1981 */ 1982 ASSERT3U(length, <=, PAGESIZE); 1983 head = xnf_mblk_copy(xnfp, mp); 1984 if (head == NULL) { 1985 dev_err(xnfp->xnf_devinfo, CE_WARN, 1986 "xnf_mblk_copy() failed"); 1987 goto drop; 1988 } 1989 } else { 1990 /* 1991 * There's a limit for how many pages can be passed to the 1992 * backend. If we pass that limit, the packet will be dropped 1993 * and some backend implementations (e.g. Linux) could even 1994 * offline the interface. 1995 */ 1996 if (pages > XEN_MAX_TX_DATA_PAGES) { 1997 if (pulledup) { 1998 dev_err(xnfp->xnf_devinfo, CE_WARN, 1999 "too many pages, even after pullup: %d.", 2000 pages); 2001 goto drop; 2002 } 2003 2004 /* 2005 * Defragment packet if it spans too many pages. 
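			 *
			 * msgpullup(mp, -1) copies the entire message into a
			 * single contiguous mblk, so after the pullup the page
			 * count depends only on the data length and no longer
			 * on how the stack chained the original mblks; if it
			 * still exceeds XEN_MAX_TX_DATA_PAGES on the retry,
			 * the packet is dropped.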
2006 */ 2007 mblk_t *newmp = msgpullup(mp, -1); 2008 if (newmp == NULL) { 2009 dev_err(xnfp->xnf_devinfo, CE_WARN, 2010 "msgpullup() failed"); 2011 goto drop; 2012 } 2013 2014 freemsg(mp); 2015 mp = newmp; 2016 xnfp->xnf_stat_tx_pullup++; 2017 pulledup = B_TRUE; 2018 goto pulledup; 2019 } 2020 2021 head = xnf_mblk_map(xnfp, mp, &slots); 2022 if (head == NULL) 2023 goto drop; 2024 2025 IMPLY(slots > 1, xnfp->xnf_be_tx_sg); 2026 } 2027 2028 /* 2029 * Set tx_mp so that mblk is freed when the txbuf chain is freed. 2030 */ 2031 head->tx_mp = mp; 2032 2033 xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss); 2034 2035 /* 2036 * The first request must store the total length of the packet. 2037 */ 2038 head->tx_txreq.size = length; 2039 2040 /* 2041 * Push the packet we have prepared into the ring. 2042 */ 2043 xnf_tx_push_packet(xnfp, head); 2044 xnfp->xnf_stat_opackets++; 2045 xnfp->xnf_stat_obytes += length; 2046 2047 mutex_exit(&xnfp->xnf_txlock); 2048 return (NULL); 2049 2050 drop: 2051 freemsg(mp); 2052 xnfp->xnf_stat_tx_drop++; 2053 mutex_exit(&xnfp->xnf_txlock); 2054 return (NULL); 2055 } 2056 2057 /* 2058 * Notification of RX packets. Currently no TX-complete interrupt is 2059 * used, as we clean the TX ring lazily. 2060 */ 2061 static uint_t 2062 xnf_intr(caddr_t arg) 2063 { 2064 xnf_t *xnfp = (xnf_t *)arg; 2065 mblk_t *mp; 2066 boolean_t need_sched, clean_ring; 2067 2068 mutex_enter(&xnfp->xnf_rxlock); 2069 2070 /* 2071 * Interrupts before we are connected are spurious. 2072 */ 2073 if (!xnfp->xnf_connected) { 2074 mutex_exit(&xnfp->xnf_rxlock); 2075 xnfp->xnf_stat_unclaimed_interrupts++; 2076 return (DDI_INTR_UNCLAIMED); 2077 } 2078 2079 /* 2080 * Receive side processing. 2081 */ 2082 do { 2083 /* 2084 * Collect buffers from the ring. 2085 */ 2086 xnf_rx_collect(xnfp); 2087 2088 /* 2089 * Interrupt me when the next receive buffer is consumed. 2090 */ 2091 xnfp->xnf_rx_ring.sring->rsp_event = 2092 xnfp->xnf_rx_ring.rsp_cons + 1; 2093 xen_mb(); 2094 2095 } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)); 2096 2097 if (xnfp->xnf_rx_new_buffers_posted) { 2098 boolean_t notify; 2099 2100 /* 2101 * Indicate to the peer that we have re-filled the 2102 * receive ring, if it cares. 2103 */ 2104 /* LINTED: constant in conditional context */ 2105 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); 2106 if (notify) 2107 ec_notify_via_evtchn(xnfp->xnf_evtchn); 2108 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 2109 } 2110 2111 mp = xnfp->xnf_rx_head; 2112 xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL; 2113 2114 xnfp->xnf_stat_interrupts++; 2115 mutex_exit(&xnfp->xnf_rxlock); 2116 2117 if (mp != NULL) 2118 mac_rx(xnfp->xnf_mh, NULL, mp); 2119 2120 /* 2121 * Transmit side processing. 2122 * 2123 * If a previous transmit attempt failed or we have pending 2124 * multicast requests, clean the ring. 2125 * 2126 * If we previously stalled transmission and cleaning produces 2127 * some free slots, tell upstream to attempt sending again. 2128 * 2129 * The odd style is to avoid acquiring xnf_txlock unless we 2130 * will actually look inside the tx machinery. 
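 *
 * In outline (a condensed sketch of the code below, not extra logic):
 * xnf_schedlock may be taken on its own just to peek at the flags,
 * but once we decide to touch the ring we take xnf_txlock first and
 * only then re-take xnf_schedlock, the same order used on the send
 * path:
 *
 *	mutex_enter(&xnfp->xnf_schedlock);
 *	need_sched = xnfp->xnf_need_sched;
 *	mutex_exit(&xnfp->xnf_schedlock);
 *
 *	if (need_sched || ...) {
 *		mutex_enter(&xnfp->xnf_txlock);
 *		mutex_enter(&xnfp->xnf_schedlock);
 *		xnfp->xnf_need_sched = B_FALSE;
 *		mutex_exit(&xnfp->xnf_schedlock);
 *		mutex_exit(&xnfp->xnf_txlock);
 *	}
 *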
2131 */ 2132 mutex_enter(&xnfp->xnf_schedlock); 2133 need_sched = xnfp->xnf_need_sched; 2134 clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0); 2135 mutex_exit(&xnfp->xnf_schedlock); 2136 2137 if (clean_ring) { 2138 int free_slots; 2139 2140 mutex_enter(&xnfp->xnf_txlock); 2141 free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE); 2142 2143 if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) { 2144 mutex_enter(&xnfp->xnf_schedlock); 2145 xnfp->xnf_need_sched = B_FALSE; 2146 mutex_exit(&xnfp->xnf_schedlock); 2147 2148 mac_tx_update(xnfp->xnf_mh); 2149 } 2150 mutex_exit(&xnfp->xnf_txlock); 2151 } 2152 2153 return (DDI_INTR_CLAIMED); 2154 } 2155 2156 /* 2157 * xnf_start() -- start the board receiving and enable interrupts. 2158 */ 2159 static int 2160 xnf_start(void *arg) 2161 { 2162 xnf_t *xnfp = arg; 2163 2164 mutex_enter(&xnfp->xnf_rxlock); 2165 mutex_enter(&xnfp->xnf_txlock); 2166 2167 /* Accept packets from above. */ 2168 xnfp->xnf_running = B_TRUE; 2169 2170 mutex_exit(&xnfp->xnf_txlock); 2171 mutex_exit(&xnfp->xnf_rxlock); 2172 2173 return (0); 2174 } 2175 2176 /* xnf_stop() - disable hardware */ 2177 static void 2178 xnf_stop(void *arg) 2179 { 2180 xnf_t *xnfp = arg; 2181 2182 mutex_enter(&xnfp->xnf_rxlock); 2183 mutex_enter(&xnfp->xnf_txlock); 2184 2185 xnfp->xnf_running = B_FALSE; 2186 2187 mutex_exit(&xnfp->xnf_txlock); 2188 mutex_exit(&xnfp->xnf_rxlock); 2189 } 2190 2191 /* 2192 * Hang buffer `bdesc' on the RX ring. 2193 */ 2194 static void 2195 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc) 2196 { 2197 netif_rx_request_t *reqp; 2198 RING_IDX hang_ix; 2199 2200 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2201 2202 reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, 2203 xnfp->xnf_rx_ring.req_prod_pvt); 2204 hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0)); 2205 ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL); 2206 2207 reqp->id = bdesc->id = hang_ix; 2208 reqp->gref = bdesc->grant_ref; 2209 2210 xnfp->xnf_rx_pkt_info[hang_ix] = bdesc; 2211 xnfp->xnf_rx_ring.req_prod_pvt++; 2212 2213 xnfp->xnf_rx_new_buffers_posted = B_TRUE; 2214 } 2215 2216 /* 2217 * Receive an entire packet from the ring, starting from slot *consp. 2218 * prod indicates the slot of the latest response. 2219 * On return, *consp will point to the head of the next packet. 2220 * 2221 * Note: If slot prod was reached before we could gather a full packet, we will 2222 * drop the partial packet; this would most likely indicate a bug in either 2223 * the front-end or the back-end driver. 2224 * 2225 * An rx packet can consist of several fragments and thus span multiple slots. 2226 * Each fragment can contain up to 4k of data. 2227 * 2228 * A typical 9000 MTU packet with look like this: 2229 * +------+---------------------+-------------------+-----------------------+ 2230 * | SLOT | TYPE | CONTENTS | FLAGS | 2231 * +------+---------------------+-------------------+-----------------------+ 2232 * | 1 | netif_rx_response_t | 1st data fragment | more_data | 2233 * +------+---------------------+-------------------+-----------------------+ 2234 * | 2 | netif_rx_response_t | 2nd data fragment | more_data | 2235 * +------+---------------------+-------------------+-----------------------+ 2236 * | 3 | netif_rx_response_t | 3rd data fragment | [none] | 2237 * +------+---------------------+-------------------+-----------------------+ 2238 * 2239 * Fragments are chained by setting NETRXF_more_data in the previous 2240 * response's flags. 
If there are additional flags, such as 2241 * NETRXF_data_validated or NETRXF_extra_info, those should be set on the 2242 * first fragment. 2243 * 2244 * Sometimes extra info can be present. If so, it will follow the first 2245 * fragment, and NETRXF_extra_info flag will be set on the first response. 2246 * If LRO is set on a packet, it will be stored in the extra info. Conforming 2247 * to the spec, extra info can also be chained, but must all be present right 2248 * after the first fragment. 2249 * 2250 * Example of a packet with 2 extra infos: 2251 * +------+---------------------+-------------------+-----------------------+ 2252 * | SLOT | TYPE | CONTENTS | FLAGS | 2253 * +------+---------------------+-------------------+-----------------------+ 2254 * | 1 | netif_rx_response_t | 1st data fragment | extra_info, more_data | 2255 * +------+---------------------+-------------------+-----------------------+ 2256 * | 2 | netif_extra_info_t | 1st extra info | EXTRA_FLAG_MORE | 2257 * +------+---------------------+-------------------+-----------------------+ 2258 * | 3 | netif_extra_info_t | 2nd extra info | [none] | 2259 * +------+---------------------+-------------------+-----------------------+ 2260 * | 4 | netif_rx_response_t | 2nd data fragment | more_data | 2261 * +------+---------------------+-------------------+-----------------------+ 2262 * | 5 | netif_rx_response_t | 3rd data fragment | more_data | 2263 * +------+---------------------+-------------------+-----------------------+ 2264 * | 6 | netif_rx_response_t | 4th data fragment | [none] | 2265 * +------+---------------------+-------------------+-----------------------+ 2266 * 2267 * In practice, the only extra we expect is for LRO, but only if we advertise 2268 * that we support it to the backend (xnf_enable_lro == TRUE). 2269 */ 2270 static int 2271 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp) 2272 { 2273 mblk_t *head = NULL; 2274 mblk_t *tail = NULL; 2275 mblk_t *mp; 2276 int error = 0; 2277 RING_IDX cons = *consp; 2278 netif_extra_info_t lro; 2279 boolean_t is_lro = B_FALSE; 2280 boolean_t is_extra = B_FALSE; 2281 2282 netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2283 2284 boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0; 2285 boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0; 2286 boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0; 2287 2288 IMPLY(more_data, xnf_enable_rx_sg); 2289 2290 while (cons != prod) { 2291 xnf_buf_t *bdesc; 2292 int len, off; 2293 int rxidx = cons & (NET_RX_RING_SIZE - 1); 2294 2295 bdesc = xnfp->xnf_rx_pkt_info[rxidx]; 2296 xnfp->xnf_rx_pkt_info[rxidx] = NULL; 2297 2298 if (is_extra) { 2299 netif_extra_info_t *extra = (netif_extra_info_t *)&rsp; 2300 /* 2301 * The only extra we expect is for LRO, and it should 2302 * only be present once. 2303 */ 2304 if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO && 2305 !is_lro) { 2306 ASSERT(xnf_enable_lro); 2307 lro = *extra; 2308 is_lro = B_TRUE; 2309 DTRACE_PROBE1(lro, netif_extra_info_t *, &lro); 2310 } else { 2311 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet " 2312 "contains unexpected extra info of type %d", 2313 extra->type); 2314 error = EINVAL; 2315 } 2316 more_extra = 2317 (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0; 2318 2319 goto hang_buf; 2320 } 2321 2322 ASSERT3U(bdesc->id, ==, rsp.id); 2323 2324 /* 2325 * status stores packet length when >= 0, or errors when < 0. 
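 *
 * The values of interest (as defined in the public Xen netif.h) are:
 *
 *	> 0			fragment length in bytes
 *	  0			runt, counted and then dropped
 *	NETIF_RSP_ERROR (-1)	the backend reported an error
 *	NETIF_RSP_DROPPED (-2)	the backend dropped the packet
 *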
2326 */ 2327 len = rsp.status; 2328 off = rsp.offset; 2329 more_data = (rsp.flags & NETRXF_more_data) != 0; 2330 2331 /* 2332 * sanity checks. 2333 */ 2334 if (!xnfp->xnf_running) { 2335 error = EBUSY; 2336 } else if (len <= 0) { 2337 xnfp->xnf_stat_errrx++; 2338 2339 switch (len) { 2340 case 0: 2341 xnfp->xnf_stat_runt++; 2342 break; 2343 case NETIF_RSP_ERROR: 2344 xnfp->xnf_stat_mac_rcv_error++; 2345 break; 2346 case NETIF_RSP_DROPPED: 2347 xnfp->xnf_stat_norxbuf++; 2348 break; 2349 } 2350 error = EINVAL; 2351 } else if (bdesc->grant_ref == INVALID_GRANT_REF) { 2352 dev_err(xnfp->xnf_devinfo, CE_WARN, 2353 "Bad rx grant reference, rsp id %d", rsp.id); 2354 error = EINVAL; 2355 } else if ((off + len) > PAGESIZE) { 2356 dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses " 2357 "page boundary (offset %d, length %d)", off, len); 2358 error = EINVAL; 2359 } 2360 2361 if (error != 0) { 2362 /* 2363 * If an error has been detected, we do not attempt 2364 * to read the data but we still need to replace 2365 * the rx bufs. 2366 */ 2367 goto hang_buf; 2368 } 2369 2370 xnf_buf_t *nbuf = NULL; 2371 2372 /* 2373 * If the packet is below a pre-determined size we will 2374 * copy data out of the buf rather than replace it. 2375 */ 2376 if (len > xnf_rx_copy_limit) 2377 nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE); 2378 2379 if (nbuf != NULL) { 2380 mp = desballoc((unsigned char *)bdesc->buf, 2381 bdesc->len, 0, &bdesc->free_rtn); 2382 2383 if (mp == NULL) { 2384 xnfp->xnf_stat_rx_desballoc_fail++; 2385 xnfp->xnf_stat_norxbuf++; 2386 error = ENOMEM; 2387 /* 2388 * we free the buf we just allocated as we 2389 * will re-hang the old buf. 2390 */ 2391 xnf_buf_put(xnfp, nbuf, B_FALSE); 2392 goto hang_buf; 2393 } 2394 2395 mp->b_rptr = mp->b_rptr + off; 2396 mp->b_wptr = mp->b_rptr + len; 2397 2398 /* 2399 * Release the grant as the backend doesn't need to 2400 * access this buffer anymore and grants are scarce. 2401 */ 2402 (void) gnttab_end_foreign_access_ref(bdesc->grant_ref, 2403 0); 2404 xnf_gref_put(xnfp, bdesc->grant_ref); 2405 bdesc->grant_ref = INVALID_GRANT_REF; 2406 2407 bdesc = nbuf; 2408 } else { 2409 /* 2410 * We failed to allocate a new buf or decided to reuse 2411 * the old one. In either case we copy the data off it 2412 * and put it back into the ring. 2413 */ 2414 mp = allocb(len, 0); 2415 if (mp == NULL) { 2416 xnfp->xnf_stat_rx_allocb_fail++; 2417 xnfp->xnf_stat_norxbuf++; 2418 error = ENOMEM; 2419 goto hang_buf; 2420 } 2421 bcopy(bdesc->buf + off, mp->b_wptr, len); 2422 mp->b_wptr += len; 2423 } 2424 2425 if (head == NULL) 2426 head = mp; 2427 else 2428 tail->b_cont = mp; 2429 tail = mp; 2430 2431 hang_buf: 2432 /* 2433 * No matter what happens, for each response we need to hang 2434 * a new buf on the rx ring. Put either the old one, or a new 2435 * one if the old one is borrowed by the kernel via desballoc(). 2436 */ 2437 xnf_rxbuf_hang(xnfp, bdesc); 2438 cons++; 2439 2440 /* next response is an extra */ 2441 is_extra = more_extra; 2442 2443 if (!more_data && !more_extra) 2444 break; 2445 2446 /* 2447 * Note that since requests and responses are union'd on the 2448 * same ring, we copy the response to a local variable instead 2449 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have 2450 * overwritten contents of rsp. 2451 */ 2452 rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2453 } 2454 2455 /* 2456 * Check that we do not get stuck in a loop. 
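 *
 * Each pass through the loop above hangs a replacement buffer and
 * advances 'cons' by one slot, and the loop runs at least once
 * because xnf_rx_collect() only calls us while responses are
 * pending, so the assertion below must hold. Buffers loaned upstream
 * via desballoc() come back through bdesc->free_rtn, i.e. (sketch of
 * the plumbing set up in xnf_buf_constructor()):
 *
 *	bdesc->free_rtn.free_func = xnf_buf_recycle;
 *	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
 *	...
 *	an upstream freemsg() of the loaned mblk eventually calls
 *	xnf_buf_recycle(bdesc), which hands it to xnf_buf_put().
 *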
2457 */ 2458 ASSERT3U(*consp, !=, cons); 2459 *consp = cons; 2460 2461 /* 2462 * We ran out of responses but the flags indicate there is more data. 2463 */ 2464 if (more_data) { 2465 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments."); 2466 error = EINVAL; 2467 } 2468 if (more_extra) { 2469 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments " 2470 "(extras)."); 2471 error = EINVAL; 2472 } 2473 2474 /* 2475 * An error means the packet must be dropped. If we have already formed 2476 * a partial packet, then discard it. 2477 */ 2478 if (error != 0) { 2479 if (head != NULL) 2480 freemsg(head); 2481 xnfp->xnf_stat_rx_drop++; 2482 return (error); 2483 } 2484 2485 ASSERT(head != NULL); 2486 2487 if (hwcsum) { 2488 /* 2489 * If the peer says that the data has been validated then we 2490 * declare that the full checksum has been verified. 2491 * 2492 * We don't look at the "checksum blank" flag, and hence could 2493 * have a packet here that we are asserting is good with 2494 * a blank checksum. 2495 */ 2496 mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 2497 xnfp->xnf_stat_rx_cksum_no_need++; 2498 } 2499 2500 /* XXX: set lro info for packet once LRO is supported in OS. */ 2501 2502 *mpp = head; 2503 2504 return (0); 2505 } 2506 2507 /* 2508 * Collect packets from the RX ring, storing them in `xnfp' for later use. 2509 */ 2510 static void 2511 xnf_rx_collect(xnf_t *xnfp) 2512 { 2513 RING_IDX prod; 2514 2515 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2516 2517 prod = xnfp->xnf_rx_ring.sring->rsp_prod; 2518 /* 2519 * Ensure we see queued responses up to 'prod'. 2520 */ 2521 membar_consumer(); 2522 2523 while (xnfp->xnf_rx_ring.rsp_cons != prod) { 2524 mblk_t *mp; 2525 2526 /* 2527 * Collect a packet. 2528 * rsp_cons is updated inside xnf_rx_one_packet(). 2529 */ 2530 int error = xnf_rx_one_packet(xnfp, prod, 2531 &xnfp->xnf_rx_ring.rsp_cons, &mp); 2532 if (error == 0) { 2533 xnfp->xnf_stat_ipackets++; 2534 xnfp->xnf_stat_rbytes += xmsgsize(mp); 2535 2536 /* 2537 * Append the mblk to the rx list. 2538 */ 2539 if (xnfp->xnf_rx_head == NULL) { 2540 ASSERT3P(xnfp->xnf_rx_tail, ==, NULL); 2541 xnfp->xnf_rx_head = mp; 2542 } else { 2543 ASSERT(xnfp->xnf_rx_tail != NULL); 2544 xnfp->xnf_rx_tail->b_next = mp; 2545 } 2546 xnfp->xnf_rx_tail = mp; 2547 } 2548 } 2549 } 2550 2551 /* 2552 * xnf_alloc_dma_resources() -- initialize the drivers structures 2553 */ 2554 static int 2555 xnf_alloc_dma_resources(xnf_t *xnfp) 2556 { 2557 dev_info_t *devinfo = xnfp->xnf_devinfo; 2558 size_t len; 2559 ddi_dma_cookie_t dma_cookie; 2560 uint_t ncookies; 2561 int rc; 2562 caddr_t rptr; 2563 2564 /* 2565 * The code below allocates all the DMA data structures that 2566 * need to be released when the driver is detached. 2567 * 2568 * Allocate page for the transmit descriptor ring. 
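 *
 * Both rings are set up with the canonical DDI DMA sequence -- handle,
 * memory, binding -- followed by the Xen shared-ring initialisation.
 * In outline (error unwinding omitted; on failure the code below
 * releases whatever was acquired so far, in reverse order):
 *
 *	ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, ..., &handle);
 *	ddi_dma_mem_alloc(handle, PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
 *	    ..., &rptr, &len, &acchandle);
 *	ddi_dma_addr_bind_handle(handle, NULL, rptr, PAGESIZE,
 *	    DDI_DMA_RDWR | DDI_DMA_CONSISTENT, ..., &cookie, &ncookies);
 *	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
 *	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr,
 *	    PAGESIZE);
 *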
2569 */ 2570 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2571 DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS) 2572 goto alloc_error; 2573 2574 if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle, 2575 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2576 DDI_DMA_SLEEP, 0, &rptr, &len, 2577 &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) { 2578 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2579 xnfp->xnf_tx_ring_dma_handle = NULL; 2580 goto alloc_error; 2581 } 2582 2583 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL, 2584 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2585 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2586 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2587 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2588 xnfp->xnf_tx_ring_dma_handle = NULL; 2589 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2590 if (rc == DDI_DMA_NORESOURCES) 2591 goto alloc_error; 2592 else 2593 goto error; 2594 } 2595 2596 ASSERT(ncookies == 1); 2597 bzero(rptr, PAGESIZE); 2598 /* LINTED: constant in conditional context */ 2599 SHARED_RING_INIT((netif_tx_sring_t *)rptr); 2600 /* LINTED: constant in conditional context */ 2601 FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE); 2602 xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress; 2603 2604 /* 2605 * Allocate page for the receive descriptor ring. 2606 */ 2607 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2608 DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS) 2609 goto alloc_error; 2610 2611 if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle, 2612 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2613 DDI_DMA_SLEEP, 0, &rptr, &len, 2614 &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) { 2615 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2616 xnfp->xnf_rx_ring_dma_handle = NULL; 2617 goto alloc_error; 2618 } 2619 2620 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL, 2621 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2622 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2623 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2624 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2625 xnfp->xnf_rx_ring_dma_handle = NULL; 2626 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2627 if (rc == DDI_DMA_NORESOURCES) 2628 goto alloc_error; 2629 else 2630 goto error; 2631 } 2632 2633 ASSERT(ncookies == 1); 2634 bzero(rptr, PAGESIZE); 2635 /* LINTED: constant in conditional context */ 2636 SHARED_RING_INIT((netif_rx_sring_t *)rptr); 2637 /* LINTED: constant in conditional context */ 2638 FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE); 2639 xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress; 2640 2641 return (DDI_SUCCESS); 2642 2643 alloc_error: 2644 cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory", 2645 ddi_get_instance(xnfp->xnf_devinfo)); 2646 error: 2647 xnf_release_dma_resources(xnfp); 2648 return (DDI_FAILURE); 2649 } 2650 2651 /* 2652 * Release all DMA resources in the opposite order from acquisition 2653 */ 2654 static void 2655 xnf_release_dma_resources(xnf_t *xnfp) 2656 { 2657 int i; 2658 2659 /* 2660 * Free receive buffers which are currently associated with 2661 * descriptors. 
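 *
 * Each ring is then torn down in the reverse of the order used in
 * xnf_alloc_dma_resources(), i.e. (sketch):
 *
 *	(void) ddi_dma_unbind_handle(dma_handle);
 *	ddi_dma_mem_free(&acchandle);
 *	ddi_dma_free_handle(&dma_handle);
 *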
2662 */ 2663 mutex_enter(&xnfp->xnf_rxlock); 2664 for (i = 0; i < NET_RX_RING_SIZE; i++) { 2665 xnf_buf_t *bp; 2666 2667 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL) 2668 continue; 2669 xnfp->xnf_rx_pkt_info[i] = NULL; 2670 xnf_buf_put(xnfp, bp, B_FALSE); 2671 } 2672 mutex_exit(&xnfp->xnf_rxlock); 2673 2674 /* Free the receive ring buffer. */ 2675 if (xnfp->xnf_rx_ring_dma_acchandle != NULL) { 2676 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle); 2677 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2678 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2679 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2680 } 2681 /* Free the transmit ring buffer. */ 2682 if (xnfp->xnf_tx_ring_dma_acchandle != NULL) { 2683 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle); 2684 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2685 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2686 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2687 } 2688 2689 } 2690 2691 /* 2692 * Release any packets and associated structures used by the TX ring. 2693 */ 2694 static void 2695 xnf_release_mblks(xnf_t *xnfp) 2696 { 2697 RING_IDX i; 2698 xnf_txid_t *tidp; 2699 2700 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 2701 i < NET_TX_RING_SIZE; 2702 i++, tidp++) { 2703 xnf_txbuf_t *txp = tidp->txbuf; 2704 2705 if (txp != NULL) { 2706 ASSERT(txp->tx_mp != NULL); 2707 freemsg(txp->tx_mp); 2708 2709 xnf_txid_put(xnfp, tidp); 2710 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 2711 } 2712 } 2713 } 2714 2715 static int 2716 xnf_buf_constructor(void *buf, void *arg, int kmflag) 2717 { 2718 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2719 xnf_buf_t *bdesc = buf; 2720 xnf_t *xnfp = arg; 2721 ddi_dma_cookie_t dma_cookie; 2722 uint_t ncookies; 2723 size_t len; 2724 2725 if (kmflag & KM_NOSLEEP) 2726 ddiflags = DDI_DMA_DONTWAIT; 2727 2728 /* Allocate a DMA access handle for the buffer. */ 2729 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr, 2730 ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS) 2731 goto failure; 2732 2733 /* Allocate DMA-able memory for buffer. */ 2734 if (ddi_dma_mem_alloc(bdesc->dma_handle, 2735 PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0, 2736 &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS) 2737 goto failure_1; 2738 2739 /* Bind to virtual address of buffer to get physical address. */ 2740 if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL, 2741 bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING, 2742 ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) 2743 goto failure_2; 2744 ASSERT(ncookies == 1); 2745 2746 bdesc->free_rtn.free_func = xnf_buf_recycle; 2747 bdesc->free_rtn.free_arg = (caddr_t)bdesc; 2748 bdesc->xnfp = xnfp; 2749 bdesc->buf_phys = dma_cookie.dmac_laddress; 2750 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2751 bdesc->len = dma_cookie.dmac_size; 2752 bdesc->grant_ref = INVALID_GRANT_REF; 2753 bdesc->gen = xnfp->xnf_gen; 2754 2755 atomic_inc_64(&xnfp->xnf_stat_buf_allocated); 2756 2757 return (0); 2758 2759 failure_2: 2760 ddi_dma_mem_free(&bdesc->acc_handle); 2761 2762 failure_1: 2763 ddi_dma_free_handle(&bdesc->dma_handle); 2764 2765 failure: 2766 2767 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. 
*/ 2768 return (-1); 2769 } 2770 2771 static void 2772 xnf_buf_destructor(void *buf, void *arg) 2773 { 2774 xnf_buf_t *bdesc = buf; 2775 xnf_t *xnfp = arg; 2776 2777 (void) ddi_dma_unbind_handle(bdesc->dma_handle); 2778 ddi_dma_mem_free(&bdesc->acc_handle); 2779 ddi_dma_free_handle(&bdesc->dma_handle); 2780 2781 atomic_dec_64(&xnfp->xnf_stat_buf_allocated); 2782 } 2783 2784 static xnf_buf_t * 2785 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly) 2786 { 2787 grant_ref_t gref; 2788 xnf_buf_t *bufp; 2789 2790 /* 2791 * Usually grant references are more scarce than memory, so we 2792 * attempt to acquire a grant reference first. 2793 */ 2794 gref = xnf_gref_get(xnfp); 2795 if (gref == INVALID_GRANT_REF) 2796 return (NULL); 2797 2798 bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags); 2799 if (bufp == NULL) { 2800 xnf_gref_put(xnfp, gref); 2801 return (NULL); 2802 } 2803 2804 ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF); 2805 2806 bufp->grant_ref = gref; 2807 2808 if (bufp->gen != xnfp->xnf_gen) 2809 xnf_buf_refresh(bufp); 2810 2811 gnttab_grant_foreign_access_ref(bufp->grant_ref, 2812 xvdi_get_oeid(bufp->xnfp->xnf_devinfo), 2813 bufp->buf_mfn, readonly ? 1 : 0); 2814 2815 atomic_inc_64(&xnfp->xnf_stat_buf_outstanding); 2816 2817 return (bufp); 2818 } 2819 2820 static void 2821 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly) 2822 { 2823 if (bufp->grant_ref != INVALID_GRANT_REF) { 2824 (void) gnttab_end_foreign_access_ref( 2825 bufp->grant_ref, readonly ? 1 : 0); 2826 xnf_gref_put(xnfp, bufp->grant_ref); 2827 bufp->grant_ref = INVALID_GRANT_REF; 2828 } 2829 2830 kmem_cache_free(xnfp->xnf_buf_cache, bufp); 2831 2832 atomic_dec_64(&xnfp->xnf_stat_buf_outstanding); 2833 } 2834 2835 /* 2836 * Refresh any cached data about a buffer after resume. 2837 */ 2838 static void 2839 xnf_buf_refresh(xnf_buf_t *bdesc) 2840 { 2841 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2842 bdesc->gen = bdesc->xnfp->xnf_gen; 2843 } 2844 2845 /* 2846 * Streams `freeb' routine for `xnf_buf_t' when used as transmit 2847 * look-aside buffers. 2848 */ 2849 static void 2850 xnf_buf_recycle(xnf_buf_t *bdesc) 2851 { 2852 xnf_t *xnfp = bdesc->xnfp; 2853 2854 xnf_buf_put(xnfp, bdesc, B_TRUE); 2855 } 2856 2857 static int 2858 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag) 2859 { 2860 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2861 xnf_txbuf_t *txp = buf; 2862 xnf_t *xnfp = arg; 2863 2864 if (kmflag & KM_NOSLEEP) 2865 ddiflags = DDI_DMA_DONTWAIT; 2866 2867 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr, 2868 ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) { 2869 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */ 2870 return (-1); 2871 } 2872 2873 return (0); 2874 } 2875 2876 static void 2877 xnf_tx_buf_destructor(void *buf, void *arg) 2878 { 2879 _NOTE(ARGUNUSED(arg)); 2880 xnf_txbuf_t *txp = buf; 2881 2882 ddi_dma_free_handle(&txp->tx_dma_handle); 2883 } 2884 2885 /* 2886 * Statistics. 
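 *
 * The aux kstat is a simple KSTAT_TYPE_NAMED kstat: the array below
 * fixes the order of the entries and xnf_kstat_aux_update() must fill
 * them in exactly the same order. Adding a counter therefore means
 * touching both places, roughly ("my_new_stat" being a hypothetical
 * example, not an existing statistic):
 *
 *	static char *xnf_aux_statistics[] = {
 *		...
 *		"my_new_stat",
 *	};
 *
 *	(knp++)->value.ui64 = xnfp->xnf_stat_my_new_stat;
 *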
2887 */ 2888 static char *xnf_aux_statistics[] = { 2889 "tx_cksum_deferred", 2890 "rx_cksum_no_need", 2891 "interrupts", 2892 "unclaimed_interrupts", 2893 "tx_pullup", 2894 "tx_lookaside", 2895 "tx_drop", 2896 "tx_eth_hdr_split", 2897 "buf_allocated", 2898 "buf_outstanding", 2899 "gref_outstanding", 2900 "gref_failure", 2901 "gref_peak", 2902 "rx_allocb_fail", 2903 "rx_desballoc_fail", 2904 }; 2905 2906 static int 2907 xnf_kstat_aux_update(kstat_t *ksp, int flag) 2908 { 2909 xnf_t *xnfp; 2910 kstat_named_t *knp; 2911 2912 if (flag != KSTAT_READ) 2913 return (EACCES); 2914 2915 xnfp = ksp->ks_private; 2916 knp = ksp->ks_data; 2917 2918 /* 2919 * Assignment order must match that of the names in 2920 * xnf_aux_statistics. 2921 */ 2922 (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred; 2923 (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need; 2924 2925 (knp++)->value.ui64 = xnfp->xnf_stat_interrupts; 2926 (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts; 2927 (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup; 2928 (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside; 2929 (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop; 2930 (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split; 2931 2932 (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated; 2933 (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding; 2934 (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding; 2935 (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure; 2936 (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak; 2937 (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail; 2938 (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail; 2939 2940 return (0); 2941 } 2942 2943 static boolean_t 2944 xnf_kstat_init(xnf_t *xnfp) 2945 { 2946 int nstat = sizeof (xnf_aux_statistics) / 2947 sizeof (xnf_aux_statistics[0]); 2948 char **cp = xnf_aux_statistics; 2949 kstat_named_t *knp; 2950 2951 /* 2952 * Create and initialise kstats. 
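 *
 * Once installed, the counters can be inspected from userland with
 * kstat(1M), e.g. (illustrative invocation):
 *
 *	# kstat -m xnf -i 0 -n aux_statistics
 *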
2953 */ 2954 if ((xnfp->xnf_kstat_aux = kstat_create("xnf", 2955 ddi_get_instance(xnfp->xnf_devinfo), 2956 "aux_statistics", "net", KSTAT_TYPE_NAMED, 2957 nstat, 0)) == NULL) 2958 return (B_FALSE); 2959 2960 xnfp->xnf_kstat_aux->ks_private = xnfp; 2961 xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update; 2962 2963 knp = xnfp->xnf_kstat_aux->ks_data; 2964 while (nstat > 0) { 2965 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64); 2966 2967 knp++; 2968 cp++; 2969 nstat--; 2970 } 2971 2972 kstat_install(xnfp->xnf_kstat_aux); 2973 2974 return (B_TRUE); 2975 } 2976 2977 static int 2978 xnf_stat(void *arg, uint_t stat, uint64_t *val) 2979 { 2980 xnf_t *xnfp = arg; 2981 2982 mutex_enter(&xnfp->xnf_rxlock); 2983 mutex_enter(&xnfp->xnf_txlock); 2984 2985 #define mac_stat(q, r) \ 2986 case (MAC_STAT_##q): \ 2987 *val = xnfp->xnf_stat_##r; \ 2988 break 2989 2990 #define ether_stat(q, r) \ 2991 case (ETHER_STAT_##q): \ 2992 *val = xnfp->xnf_stat_##r; \ 2993 break 2994 2995 switch (stat) { 2996 2997 mac_stat(IPACKETS, ipackets); 2998 mac_stat(OPACKETS, opackets); 2999 mac_stat(RBYTES, rbytes); 3000 mac_stat(OBYTES, obytes); 3001 mac_stat(NORCVBUF, norxbuf); 3002 mac_stat(IERRORS, errrx); 3003 mac_stat(NOXMTBUF, tx_defer); 3004 3005 ether_stat(MACRCV_ERRORS, mac_rcv_error); 3006 ether_stat(TOOSHORT_ERRORS, runt); 3007 3008 /* always claim to be in full duplex mode */ 3009 case ETHER_STAT_LINK_DUPLEX: 3010 *val = LINK_DUPLEX_FULL; 3011 break; 3012 3013 /* always claim to be at 1Gb/s link speed */ 3014 case MAC_STAT_IFSPEED: 3015 *val = 1000000000ull; 3016 break; 3017 3018 default: 3019 mutex_exit(&xnfp->xnf_txlock); 3020 mutex_exit(&xnfp->xnf_rxlock); 3021 3022 return (ENOTSUP); 3023 } 3024 3025 #undef mac_stat 3026 #undef ether_stat 3027 3028 mutex_exit(&xnfp->xnf_txlock); 3029 mutex_exit(&xnfp->xnf_rxlock); 3030 3031 return (0); 3032 } 3033 3034 static int 3035 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu) 3036 { 3037 if (mtu > ETHERMTU) { 3038 if (!xnf_enable_tx_sg) { 3039 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3040 "because scatter-gather is disabled for transmit " 3041 "in driver settings", ETHERMTU); 3042 return (EINVAL); 3043 } else if (!xnf_enable_rx_sg) { 3044 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3045 "because scatter-gather is disabled for receive " 3046 "in driver settings", ETHERMTU); 3047 return (EINVAL); 3048 } else if (!xnfp->xnf_be_tx_sg) { 3049 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3050 "because backend doesn't support scatter-gather", 3051 ETHERMTU); 3052 return (EINVAL); 3053 } 3054 if (mtu > XNF_MAXPKT) 3055 return (EINVAL); 3056 } 3057 int error = mac_maxsdu_update(xnfp->xnf_mh, mtu); 3058 if (error == 0) 3059 xnfp->xnf_mtu = mtu; 3060 3061 return (error); 3062 } 3063 3064 /*ARGSUSED*/ 3065 static int 3066 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id, 3067 uint_t prop_val_size, void *prop_val) 3068 { 3069 xnf_t *xnfp = data; 3070 3071 switch (prop_id) { 3072 case MAC_PROP_MTU: 3073 ASSERT(prop_val_size >= sizeof (uint32_t)); 3074 bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t)); 3075 break; 3076 default: 3077 return (ENOTSUP); 3078 } 3079 return (0); 3080 } 3081 3082 /*ARGSUSED*/ 3083 static int 3084 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id, 3085 uint_t prop_val_size, const void *prop_val) 3086 { 3087 xnf_t *xnfp = data; 3088 uint32_t new_mtu; 3089 int error; 3090 3091 switch (prop_id) { 3092 case MAC_PROP_MTU: 3093 ASSERT(prop_val_size >= sizeof (uint32_t)); 3094 bcopy(prop_val, 
&new_mtu, sizeof (new_mtu)); 3095 error = xnf_change_mtu(xnfp, new_mtu); 3096 break; 3097 default: 3098 return (ENOTSUP); 3099 } 3100 3101 return (error); 3102 } 3103 3104 /*ARGSUSED*/ 3105 static void 3106 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id, 3107 mac_prop_info_handle_t prop_handle) 3108 { 3109 switch (prop_id) { 3110 case MAC_PROP_MTU: 3111 mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT); 3112 break; 3113 default: 3114 break; 3115 } 3116 } 3117 3118 static boolean_t 3119 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data) 3120 { 3121 xnf_t *xnfp = arg; 3122 3123 switch (cap) { 3124 case MAC_CAPAB_HCKSUM: { 3125 uint32_t *capab = cap_data; 3126 3127 /* 3128 * Whilst the flag used to communicate with the IO 3129 * domain is called "NETTXF_csum_blank", the checksum 3130 * in the packet must contain the pseudo-header 3131 * checksum and not zero. 3132 * 3133 * To help out the IO domain, we might use 3134 * HCKSUM_INET_PARTIAL. Unfortunately our stack will 3135 * then use checksum offload for IPv6 packets, which 3136 * the IO domain can't handle. 3137 * 3138 * As a result, we declare outselves capable of 3139 * HCKSUM_INET_FULL_V4. This means that we receive 3140 * IPv4 packets from the stack with a blank checksum 3141 * field and must insert the pseudo-header checksum 3142 * before passing the packet to the IO domain. 3143 */ 3144 *capab = HCKSUM_INET_FULL_V4; 3145 3146 /* 3147 * TODO: query the "feature-ipv6-csum-offload" capability. 3148 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL. 3149 */ 3150 3151 break; 3152 } 3153 case MAC_CAPAB_LSO: { 3154 if (!xnfp->xnf_be_lso) 3155 return (B_FALSE); 3156 3157 mac_capab_lso_t *lso = cap_data; 3158 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 3159 lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET; 3160 break; 3161 } 3162 default: 3163 return (B_FALSE); 3164 } 3165 3166 return (B_TRUE); 3167 } 3168 3169 /* 3170 * The state of the peer has changed - react accordingly. 3171 */ 3172 static void 3173 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, 3174 void *arg, void *impl_data) 3175 { 3176 _NOTE(ARGUNUSED(id, arg)); 3177 xnf_t *xnfp = ddi_get_driver_private(dip); 3178 XenbusState new_state = *(XenbusState *)impl_data; 3179 3180 ASSERT(xnfp != NULL); 3181 3182 switch (new_state) { 3183 case XenbusStateUnknown: 3184 case XenbusStateInitialising: 3185 case XenbusStateInitialised: 3186 case XenbusStateClosing: 3187 case XenbusStateClosed: 3188 case XenbusStateReconfiguring: 3189 case XenbusStateReconfigured: 3190 break; 3191 3192 case XenbusStateInitWait: 3193 xnf_read_config(xnfp); 3194 3195 if (!xnfp->xnf_be_rx_copy) { 3196 cmn_err(CE_WARN, 3197 "The xnf driver requires a dom0 that " 3198 "supports 'feature-rx-copy'."); 3199 (void) xvdi_switch_state(xnfp->xnf_devinfo, 3200 XBT_NULL, XenbusStateClosed); 3201 break; 3202 } 3203 3204 /* 3205 * Connect to the backend. 3206 */ 3207 xnf_be_connect(xnfp); 3208 3209 /* 3210 * Our MAC address as discovered by xnf_read_config(). 3211 */ 3212 mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr); 3213 3214 /* 3215 * We do not know if some features such as LSO are supported 3216 * until we connect to the backend. We request the MAC layer 3217 * to poll our capabilities again. 
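 *
 * One consequence of advertising HCKSUM_INET_FULL_V4 in xnf_getcapab()
 * is that xnf_pseudo_cksum() has to seed the TCP/UDP checksum field
 * with the IPv4 pseudo-header sum before NETTXF_csum_blank is set.
 * A minimal user-space style sketch of that sum (not the driver's
 * actual helper; src, dst and len are taken in host byte order and
 * the result would be stored into the packet with htons()):
 *
 *	uint16_t
 *	pseudo_hdr_sum(uint32_t src, uint32_t dst, uint8_t proto,
 *	    uint16_t len)
 *	{
 *		uint32_t sum = (src >> 16) + (src & 0xffff) +
 *		    (dst >> 16) + (dst & 0xffff) + proto + len;
 *		while ((sum >> 16) != 0)
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return ((uint16_t)sum);
 *	}
 *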
3218 */ 3219 mac_capab_update(xnfp->xnf_mh); 3220 3221 break; 3222 3223 case XenbusStateConnected: 3224 mutex_enter(&xnfp->xnf_rxlock); 3225 mutex_enter(&xnfp->xnf_txlock); 3226 3227 xnfp->xnf_connected = B_TRUE; 3228 /* 3229 * Wake up any threads waiting to send data to 3230 * the backend. 3231 */ 3232 cv_broadcast(&xnfp->xnf_cv_state); 3233 3234 mutex_exit(&xnfp->xnf_txlock); 3235 mutex_exit(&xnfp->xnf_rxlock); 3236 3237 /* 3238 * Kick the peer in case it missed any transmit 3239 * requests in the TX ring. 3240 */ 3241 ec_notify_via_evtchn(xnfp->xnf_evtchn); 3242 3243 /* 3244 * There may already be completed receive requests in 3245 * the ring sent by the backend after it gets connected 3246 * but before we see its state change here, so we call 3247 * xnf_intr() to handle them, if any. 3248 */ 3249 (void) xnf_intr((caddr_t)xnfp); 3250 3251 /* 3252 * Mark the link up now that we are connected. 3253 */ 3254 mac_link_update(xnfp->xnf_mh, LINK_STATE_UP); 3255 3256 /* 3257 * Tell the backend about the multicast addresses in 3258 * which we are interested. 3259 */ 3260 mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE); 3261 3262 break; 3263 3264 default: 3265 break; 3266 } 3267 } 3268
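
/*
 * A note on the event-channel handshake used by xnf_intr() above (an
 * explanatory sketch, not additional driver logic): the backend only
 * sends an event once its response producer index passes rsp_event,
 * so the consumer re-arms and then re-checks in order to close the
 * race with a response that arrived in between:
 *
 *	do {
 *		...consume responses up to rsp_prod...
 *		ring->rsp_event = ring->rsp_cons + 1;
 *		xen_mb();
 *	} while (RING_HAS_UNCONSUMED_RESPONSES(ring));
 *
 * The transmit direction is symmetric:
 * RING_PUSH_REQUESTS_AND_CHECK_NOTIFY() only asks us to call
 * ec_notify_via_evtchn() when the peer has indicated, via its event
 * index, that it actually wants the kick.
 */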