1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 2014, 2017 by Delphix. All rights reserved. 29 */ 30 31 /* 32 * 33 * Copyright (c) 2004 Christian Limpach. 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. This section intentionally left blank. 45 * 4. The name of the author may not be used to endorse or promote products 46 * derived from this software without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 49 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 50 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 51 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 52 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 53 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 54 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 55 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 56 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 57 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 58 */ 59 /* 60 * Section 3 of the above license was updated in response to bug 6379571. 61 */ 62 63 /* 64 * xnf.c - GLDv3 network driver for domU. 65 */ 66 67 /* 68 * This driver uses four per-instance locks: 69 * 70 * xnf_gref_lock: 71 * 72 * Protects access to the grant reference list stored in 73 * xnf_gref_head. Grant references should be acquired and released 74 * using gref_get() and gref_put() respectively. 75 * 76 * xnf_schedlock: 77 * 78 * Protects: 79 * xnf_need_sched - used to record that a previous transmit attempt 80 * failed (and consequently it will be necessary to call 81 * mac_tx_update() when transmit resources are available). 82 * xnf_pending_multicast - the number of multicast requests that 83 * have been submitted to the backend for which we have not 84 * processed responses. 
85 * 86 * xnf_txlock: 87 * 88 * Protects the transmit ring (xnf_tx_ring) and associated 89 * structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head). 90 * 91 * xnf_rxlock: 92 * 93 * Protects the receive ring (xnf_rx_ring) and associated 94 * structures (notably xnf_rx_pkt_info). 95 * 96 * If driver-global state that affects both the transmit and receive 97 * rings is manipulated, both xnf_txlock and xnf_rxlock should be 98 * held, in that order. 99 * 100 * xnf_schedlock is acquired both whilst holding xnf_txlock and 101 * without. It should always be acquired after xnf_txlock if both are 102 * held. 103 * 104 * Notes: 105 * - atomic_add_64() is used to manipulate counters where we require 106 * accuracy. For counters intended only for observation by humans, 107 * post increment/decrement are used instead. 108 */ 109 110 #include <sys/types.h> 111 #include <sys/errno.h> 112 #include <sys/param.h> 113 #include <sys/sysmacros.h> 114 #include <sys/systm.h> 115 #include <sys/stream.h> 116 #include <sys/strsubr.h> 117 #include <sys/strsun.h> 118 #include <sys/conf.h> 119 #include <sys/ddi.h> 120 #include <sys/devops.h> 121 #include <sys/sunddi.h> 122 #include <sys/sunndi.h> 123 #include <sys/dlpi.h> 124 #include <sys/ethernet.h> 125 #include <sys/strsun.h> 126 #include <sys/pattr.h> 127 #include <inet/ip.h> 128 #include <inet/ip_impl.h> 129 #include <inet/tcp.h> 130 #include <netinet/udp.h> 131 #include <sys/gld.h> 132 #include <sys/modctl.h> 133 #include <sys/mac_provider.h> 134 #include <sys/mac_ether.h> 135 #include <sys/bootinfo.h> 136 #include <sys/mach_mmu.h> 137 #ifdef XPV_HVM_DRIVER 138 #include <sys/xpv_support.h> 139 #include <sys/hypervisor.h> 140 #else 141 #include <sys/hypervisor.h> 142 #include <sys/evtchn_impl.h> 143 #include <sys/balloon_impl.h> 144 #endif 145 #include <xen/public/io/netif.h> 146 #include <sys/gnttab.h> 147 #include <xen/sys/xendev.h> 148 #include <sys/sdt.h> 149 #include <sys/note.h> 150 #include <sys/debug.h> 151 152 #include <io/xnf.h> 153 154 #if defined(DEBUG) || defined(__lint) 155 #define XNF_DEBUG 156 #endif 157 158 #ifdef XNF_DEBUG 159 int xnf_debug = 0; 160 xnf_t *xnf_debug_instance = NULL; 161 #endif 162 163 /* 164 * On a 32 bit PAE system physical and machine addresses are larger 165 * than 32 bits. ddi_btop() on such systems take an unsigned long 166 * argument, and so addresses above 4G are truncated before ddi_btop() 167 * gets to see them. To avoid this, code the shift operation here. 168 */ 169 #define xnf_btop(addr) ((addr) >> PAGESHIFT) 170 171 /* 172 * The parameters below should only be changed in /etc/system, never in mdb. 173 */ 174 175 /* 176 * Should we use the multicast control feature if the backend provides 177 * it? 178 */ 179 boolean_t xnf_multicast_control = B_TRUE; 180 181 /* 182 * Should we allow scatter-gather for tx if backend allows it? 183 */ 184 boolean_t xnf_enable_tx_sg = B_TRUE; 185 186 /* 187 * Should we allow scatter-gather for rx if backend allows it? 188 */ 189 boolean_t xnf_enable_rx_sg = B_TRUE; 190 191 /* 192 * Should we allow lso for tx sends if backend allows it? 193 * Requires xnf_enable_tx_sg to be also set to TRUE. 194 */ 195 boolean_t xnf_enable_lso = B_TRUE; 196 197 /* 198 * Should we allow lro on rx if backend supports it? 199 * Requires xnf_enable_rx_sg to be also set to TRUE. 200 * 201 * !! WARNING !! 202 * LRO is not yet supported in the OS so this should be left as FALSE. 203 * !! WARNING !! 
204 */ 205 boolean_t xnf_enable_lro = B_FALSE; 206 207 /* 208 * Received packets below this size are copied to a new streams buffer 209 * rather than being desballoc'ed. 210 * 211 * This value is chosen to accommodate traffic where there are a large 212 * number of small packets. For data showing a typical distribution, 213 * see: 214 * 215 * Sinha07a: 216 * Rishi Sinha, Christos Papadopoulos, and John 217 * Heidemann. Internet Packet Size Distributions: Some 218 * Observations. Technical Report ISI-TR-2007-643, 219 * USC/Information Sciences Institute, May, 2007. Originally 220 * released October 2005 as web page 221 * http://netweb.usc.edu/~sinha/pkt-sizes/. 222 * <http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>. 223 */ 224 size_t xnf_rx_copy_limit = 64; 225 226 #define INVALID_GRANT_HANDLE ((grant_handle_t)-1) 227 #define INVALID_GRANT_REF ((grant_ref_t)-1) 228 #define INVALID_TX_ID ((uint16_t)-1) 229 230 #define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)])) 231 #define TX_ID_VALID(i) \ 232 (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE)) 233 234 /* 235 * calculate how many pages are spanned by an mblk fragment 236 */ 237 #define xnf_mblk_pages(mp) (MBLKL(mp) == 0 ? 0 : \ 238 xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1) 239 240 /* Required system entry points */ 241 static int xnf_attach(dev_info_t *, ddi_attach_cmd_t); 242 static int xnf_detach(dev_info_t *, ddi_detach_cmd_t); 243 244 /* Required driver entry points for Nemo */ 245 static int xnf_start(void *); 246 static void xnf_stop(void *); 247 static int xnf_set_mac_addr(void *, const uint8_t *); 248 static int xnf_set_multicast(void *, boolean_t, const uint8_t *); 249 static int xnf_set_promiscuous(void *, boolean_t); 250 static mblk_t *xnf_send(void *, mblk_t *); 251 static uint_t xnf_intr(caddr_t); 252 static int xnf_stat(void *, uint_t, uint64_t *); 253 static boolean_t xnf_getcapab(void *, mac_capab_t, void *); 254 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 255 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t, 256 const void *); 257 static void xnf_propinfo(void *, const char *, mac_prop_id_t, 258 mac_prop_info_handle_t); 259 260 /* Driver private functions */ 261 static int xnf_alloc_dma_resources(xnf_t *); 262 static void xnf_release_dma_resources(xnf_t *); 263 static void xnf_release_mblks(xnf_t *); 264 265 static int xnf_buf_constructor(void *, void *, int); 266 static void xnf_buf_destructor(void *, void *); 267 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t); 268 #pragma inline(xnf_buf_get) 269 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t); 270 #pragma inline(xnf_buf_put) 271 static void xnf_buf_refresh(xnf_buf_t *); 272 #pragma inline(xnf_buf_refresh) 273 static void xnf_buf_recycle(xnf_buf_t *); 274 275 static int xnf_tx_buf_constructor(void *, void *, int); 276 static void xnf_tx_buf_destructor(void *, void *); 277 278 static grant_ref_t xnf_gref_get(xnf_t *); 279 #pragma inline(xnf_gref_get) 280 static void xnf_gref_put(xnf_t *, grant_ref_t); 281 #pragma inline(xnf_gref_put) 282 283 static xnf_txid_t *xnf_txid_get(xnf_t *); 284 #pragma inline(xnf_txid_get) 285 static void xnf_txid_put(xnf_t *, xnf_txid_t *); 286 #pragma inline(xnf_txid_put) 287 288 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *); 289 static int xnf_tx_clean_ring(xnf_t *); 290 static void oe_state_change(dev_info_t *, ddi_eventcookie_t, 291 void *, void *); 292 static boolean_t xnf_kstat_init(xnf_t *); 293 static void xnf_rx_collect(xnf_t *);
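/*
 * For reference, the tunables declared above (xnf_multicast_control,
 * xnf_enable_tx_sg, xnf_enable_rx_sg, xnf_enable_lso, xnf_enable_lro and
 * xnf_rx_copy_limit) are intended to be set from /etc/system rather than
 * patched at run time. A hypothetical /etc/system entry disabling LSO
 * would look like:
 *
 *	set xnf:xnf_enable_lso = 0
 *
 * The "xnf:" prefix names this module; the change takes effect at the
 * next boot.
 */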
294 295 #define XNF_CALLBACK_FLAGS (MC_GETCAPAB | MC_PROPERTIES) 296 297 static mac_callbacks_t xnf_callbacks = { 298 .mc_callbacks = XNF_CALLBACK_FLAGS, 299 .mc_getstat = xnf_stat, 300 .mc_start = xnf_start, 301 .mc_stop = xnf_stop, 302 .mc_setpromisc = xnf_set_promiscuous, 303 .mc_multicst = xnf_set_multicast, 304 .mc_unicst = xnf_set_mac_addr, 305 .mc_tx = xnf_send, 306 .mc_getcapab = xnf_getcapab, 307 .mc_setprop = xnf_setprop, 308 .mc_getprop = xnf_getprop, 309 .mc_propinfo = xnf_propinfo, 310 }; 311 312 /* DMA attributes for network ring buffer */ 313 static ddi_dma_attr_t ringbuf_dma_attr = { 314 .dma_attr_version = DMA_ATTR_V0, 315 .dma_attr_addr_lo = 0, 316 .dma_attr_addr_hi = 0xffffffffffffffffULL, 317 .dma_attr_count_max = 0x7fffffff, 318 .dma_attr_align = MMU_PAGESIZE, 319 .dma_attr_burstsizes = 0x7ff, 320 .dma_attr_minxfer = 1, 321 .dma_attr_maxxfer = 0xffffffffU, 322 .dma_attr_seg = 0xffffffffffffffffULL, 323 .dma_attr_sgllen = 1, 324 .dma_attr_granular = 1, 325 .dma_attr_flags = 0 326 }; 327 328 /* DMA attributes for receive data */ 329 static ddi_dma_attr_t rx_buf_dma_attr = { 330 .dma_attr_version = DMA_ATTR_V0, 331 .dma_attr_addr_lo = 0, 332 .dma_attr_addr_hi = 0xffffffffffffffffULL, 333 .dma_attr_count_max = MMU_PAGEOFFSET, 334 .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */ 335 .dma_attr_burstsizes = 0x7ff, 336 .dma_attr_minxfer = 1, 337 .dma_attr_maxxfer = 0xffffffffU, 338 .dma_attr_seg = 0xffffffffffffffffULL, 339 .dma_attr_sgllen = 1, 340 .dma_attr_granular = 1, 341 .dma_attr_flags = 0 342 }; 343 344 /* DMA attributes for transmit data */ 345 static ddi_dma_attr_t tx_buf_dma_attr = { 346 .dma_attr_version = DMA_ATTR_V0, 347 .dma_attr_addr_lo = 0, 348 .dma_attr_addr_hi = 0xffffffffffffffffULL, 349 .dma_attr_count_max = MMU_PAGEOFFSET, 350 .dma_attr_align = 1, 351 .dma_attr_burstsizes = 0x7ff, 352 .dma_attr_minxfer = 1, 353 .dma_attr_maxxfer = 0xffffffffU, 354 .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */ 355 .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */ 356 .dma_attr_granular = 1, 357 .dma_attr_flags = 0 358 }; 359 360 /* DMA access attributes for registers and descriptors */ 361 static ddi_device_acc_attr_t accattr = { 362 DDI_DEVICE_ATTR_V0, 363 DDI_STRUCTURE_LE_ACC, /* This is a little-endian device */ 364 DDI_STRICTORDER_ACC 365 }; 366 367 /* DMA access attributes for data: NOT to be byte swapped. */ 368 static ddi_device_acc_attr_t data_accattr = { 369 DDI_DEVICE_ATTR_V0, 370 DDI_NEVERSWAP_ACC, 371 DDI_STRICTORDER_ACC 372 }; 373 374 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach, 375 nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported); 376 377 static struct modldrv xnf_modldrv = { 378 &mod_driverops, 379 "Virtual Ethernet driver", 380 &xnf_dev_ops 381 }; 382 383 static struct modlinkage modlinkage = { 384 MODREV_1, &xnf_modldrv, NULL 385 }; 386 387 int 388 _init(void) 389 { 390 int r; 391 392 mac_init_ops(&xnf_dev_ops, "xnf"); 393 r = mod_install(&modlinkage); 394 if (r != DDI_SUCCESS) 395 mac_fini_ops(&xnf_dev_ops); 396 397 return (r); 398 } 399 400 int 401 _fini(void) 402 { 403 return (EBUSY); /* XXPV should be removable */ 404 } 405 406 int 407 _info(struct modinfo *modinfop) 408 { 409 return (mod_info(&modlinkage, modinfop)); 410 } 411 412 /* 413 * Acquire a grant reference. 
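 * If the free list kept in xnf_gref_head is exhausted, the routine below
 * asks gnttab_alloc_grant_references() to top the list up in batches of
 * 16 before giving up; a failure is recorded in xnf_stat_gref_failure and
 * INVALID_GRANT_REF is returned to the caller.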
414 */ 415 static grant_ref_t 416 xnf_gref_get(xnf_t *xnfp) 417 { 418 grant_ref_t gref; 419 420 mutex_enter(&xnfp->xnf_gref_lock); 421 422 do { 423 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head); 424 425 } while ((gref == INVALID_GRANT_REF) && 426 (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0)); 427 428 mutex_exit(&xnfp->xnf_gref_lock); 429 430 if (gref == INVALID_GRANT_REF) { 431 xnfp->xnf_stat_gref_failure++; 432 } else { 433 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding); 434 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak) 435 xnfp->xnf_stat_gref_peak = 436 xnfp->xnf_stat_gref_outstanding; 437 } 438 439 return (gref); 440 } 441 442 /* 443 * Release a grant reference. 444 */ 445 static void 446 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref) 447 { 448 ASSERT(gref != INVALID_GRANT_REF); 449 450 mutex_enter(&xnfp->xnf_gref_lock); 451 gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref); 452 mutex_exit(&xnfp->xnf_gref_lock); 453 454 atomic_dec_64(&xnfp->xnf_stat_gref_outstanding); 455 } 456 457 /* 458 * Acquire a transmit id. 459 */ 460 static xnf_txid_t * 461 xnf_txid_get(xnf_t *xnfp) 462 { 463 xnf_txid_t *tidp; 464 465 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 466 467 if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID) 468 return (NULL); 469 470 ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head)); 471 472 tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head); 473 xnfp->xnf_tx_pkt_id_head = tidp->next; 474 tidp->next = INVALID_TX_ID; 475 476 ASSERT(tidp->txbuf == NULL); 477 478 return (tidp); 479 } 480 481 /* 482 * Release a transmit id. 483 */ 484 static void 485 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp) 486 { 487 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 488 ASSERT(TX_ID_VALID(tidp->id)); 489 ASSERT(tidp->next == INVALID_TX_ID); 490 491 tidp->txbuf = NULL; 492 tidp->next = xnfp->xnf_tx_pkt_id_head; 493 xnfp->xnf_tx_pkt_id_head = tidp->id; 494 } 495 496 static void 497 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp) 498 { 499 ASSERT3U(txp->tx_type, ==, TX_DATA); 500 501 /* 502 * We are either using a lookaside buffer or we are mapping existing 503 * buffers. 504 */ 505 if (txp->tx_bdesc != NULL) { 506 ASSERT(!txp->tx_handle_bound); 507 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE); 508 } else { 509 if (txp->tx_txreq.gref != INVALID_GRANT_REF) { 510 if (gnttab_query_foreign_access(txp->tx_txreq.gref) != 511 0) { 512 cmn_err(CE_PANIC, "tx grant %d still in use by " 513 "backend domain", txp->tx_txreq.gref); 514 } 515 (void) gnttab_end_foreign_access_ref( 516 txp->tx_txreq.gref, 1); 517 xnf_gref_put(xnfp, txp->tx_txreq.gref); 518 } 519 520 if (txp->tx_handle_bound) 521 (void) ddi_dma_unbind_handle(txp->tx_dma_handle); 522 } 523 524 if (txp->tx_mp != NULL) 525 freemsg(txp->tx_mp); 526 527 if (txp->tx_prev != NULL) { 528 ASSERT3P(txp->tx_prev->tx_next, ==, txp); 529 txp->tx_prev->tx_next = NULL; 530 } 531 532 if (txp->tx_txreq.id != INVALID_TX_ID) { 533 /* 534 * This should be only possible when resuming from a suspend. 535 */ 536 ASSERT(!xnfp->xnf_connected); 537 xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id)); 538 txp->tx_txreq.id = INVALID_TX_ID; 539 } 540 541 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 542 } 543 544 static void 545 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp) 546 { 547 if (txp == NULL) 548 return; 549 550 while (txp->tx_next != NULL) 551 txp = txp->tx_next; 552 553 /* 554 * We free the chain in reverse order so that grants can be released 555 * for all dma chunks before unbinding the dma handles. 
The mblk is 556 * freed last, after all its fragments' dma handles are unbound. 557 */ 558 xnf_txbuf_t *prev; 559 for (; txp != NULL; txp = prev) { 560 prev = txp->tx_prev; 561 xnf_data_txbuf_free(xnfp, txp); 562 } 563 } 564 565 static xnf_txbuf_t * 566 xnf_data_txbuf_alloc(xnf_t *xnfp) 567 { 568 xnf_txbuf_t *txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); 569 txp->tx_type = TX_DATA; 570 txp->tx_next = NULL; 571 txp->tx_prev = NULL; 572 txp->tx_head = txp; 573 txp->tx_frags_to_ack = 0; 574 txp->tx_mp = NULL; 575 txp->tx_bdesc = NULL; 576 txp->tx_handle_bound = B_FALSE; 577 txp->tx_txreq.gref = INVALID_GRANT_REF; 578 txp->tx_txreq.id = INVALID_TX_ID; 579 580 return (txp); 581 } 582 583 /* 584 * Get `wanted' slots in the transmit ring, waiting for at least that 585 * number if `wait' is B_TRUE. Force the ring to be cleaned by setting 586 * `wanted' to zero. 587 * 588 * Return the number of slots available. 589 */ 590 static int 591 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait) 592 { 593 int slotsfree; 594 boolean_t forced_clean = (wanted == 0); 595 596 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 597 598 /* LINTED: constant in conditional context */ 599 while (B_TRUE) { 600 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring); 601 602 if ((slotsfree < wanted) || forced_clean) 603 slotsfree = xnf_tx_clean_ring(xnfp); 604 605 /* 606 * If there are more than we need free, tell other 607 * people to come looking again. We hold txlock, so we 608 * are able to take our slots before anyone else runs. 609 */ 610 if (slotsfree > wanted) 611 cv_broadcast(&xnfp->xnf_cv_tx_slots); 612 613 if (slotsfree >= wanted) 614 break; 615 616 if (!wait) 617 break; 618 619 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock); 620 } 621 622 ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring))); 623 624 return (slotsfree); 625 } 626 627 static int 628 xnf_setup_rings(xnf_t *xnfp) 629 { 630 domid_t oeid; 631 struct xenbus_device *xsd; 632 RING_IDX i; 633 int err; 634 xnf_txid_t *tidp; 635 xnf_buf_t **bdescp; 636 637 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 638 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 639 640 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 641 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 642 643 err = gnttab_grant_foreign_access(oeid, 644 xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0); 645 if (err <= 0) { 646 err = -err; 647 xenbus_dev_error(xsd, err, "granting access to tx ring page"); 648 goto out; 649 } 650 xnfp->xnf_tx_ring_ref = (grant_ref_t)err; 651 652 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 653 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 654 655 err = gnttab_grant_foreign_access(oeid, 656 xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0); 657 if (err <= 0) { 658 err = -err; 659 xenbus_dev_error(xsd, err, "granting access to rx ring page"); 660 goto out; 661 } 662 xnfp->xnf_rx_ring_ref = (grant_ref_t)err; 663 664 mutex_enter(&xnfp->xnf_txlock); 665 666 /* 667 * We first cleanup the TX ring in case we are doing a resume. 668 * Note that this can lose packets, but we expect to stagger on. 669 */ 670 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */ 671 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 672 i < NET_TX_RING_SIZE; 673 i++, tidp++) { 674 xnf_txbuf_t *txp = tidp->txbuf; 675 if (txp == NULL) 676 continue; 677 678 switch (txp->tx_type) { 679 case TX_DATA: 680 /* 681 * txid_put() will be called for each txbuf's txid in 682 * the chain which will result in clearing tidp->txbuf. 
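 * (This is the tx_txreq.id != INVALID_TX_ID case in xnf_data_txbuf_free(),
 * which is only expected to be taken while resuming from a suspend.)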
683 */ 684 xnf_data_txbuf_free_chain(xnfp, txp); 685 686 break; 687 688 case TX_MCAST_REQ: 689 txp->tx_type = TX_MCAST_RSP; 690 txp->tx_status = NETIF_RSP_DROPPED; 691 cv_broadcast(&xnfp->xnf_cv_multicast); 692 693 /* 694 * The request consumed two slots in the ring, 695 * yet only a single xnf_txid_t is used. Step 696 * over the empty slot. 697 */ 698 i++; 699 ASSERT3U(i, <, NET_TX_RING_SIZE); 700 break; 701 702 case TX_MCAST_RSP: 703 break; 704 } 705 } 706 707 /* 708 * Now purge old list and add each txid to the new free list. 709 */ 710 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */ 711 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 712 i < NET_TX_RING_SIZE; 713 i++, tidp++) { 714 tidp->id = i; 715 ASSERT3P(tidp->txbuf, ==, NULL); 716 tidp->next = INVALID_TX_ID; /* Appease txid_put(). */ 717 xnf_txid_put(xnfp, tidp); 718 } 719 720 /* LINTED: constant in conditional context */ 721 SHARED_RING_INIT(xnfp->xnf_tx_ring.sring); 722 /* LINTED: constant in conditional context */ 723 FRONT_RING_INIT(&xnfp->xnf_tx_ring, 724 xnfp->xnf_tx_ring.sring, PAGESIZE); 725 726 mutex_exit(&xnfp->xnf_txlock); 727 728 mutex_enter(&xnfp->xnf_rxlock); 729 730 /* 731 * Clean out any buffers currently posted to the receive ring 732 * before we reset it. 733 */ 734 for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0]; 735 i < NET_RX_RING_SIZE; 736 i++, bdescp++) { 737 if (*bdescp != NULL) { 738 xnf_buf_put(xnfp, *bdescp, B_FALSE); 739 *bdescp = NULL; 740 } 741 } 742 743 /* LINTED: constant in conditional context */ 744 SHARED_RING_INIT(xnfp->xnf_rx_ring.sring); 745 /* LINTED: constant in conditional context */ 746 FRONT_RING_INIT(&xnfp->xnf_rx_ring, 747 xnfp->xnf_rx_ring.sring, PAGESIZE); 748 749 /* 750 * Fill the ring with buffers. 751 */ 752 for (i = 0; i < NET_RX_RING_SIZE; i++) { 753 xnf_buf_t *bdesc; 754 755 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE); 756 VERIFY(bdesc != NULL); 757 xnf_rxbuf_hang(xnfp, bdesc); 758 } 759 760 /* LINTED: constant in conditional context */ 761 RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring); 762 763 mutex_exit(&xnfp->xnf_rxlock); 764 765 return (0); 766 767 out: 768 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 769 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 770 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 771 772 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 773 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 774 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 775 776 return (err); 777 } 778 779 /* 780 * Connect driver to back end, called to set up communication with 781 * back end driver both initially and on resume after restore/migrate. 
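 * The handshake below runs in a single xenbus transaction (retried on
 * EAGAIN). It publishes the tx-ring-ref, rx-ring-ref and event-channel
 * values, sets feature-rx-notify and request-rx-copy, advertises
 * feature-sg and feature-gso-tcpv4 according to the tunables at the top
 * of this file, optionally requests multicast control, and finally
 * switches the device to XenbusStateConnected.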
782 */ 783 void 784 xnf_be_connect(xnf_t *xnfp) 785 { 786 const char *message; 787 xenbus_transaction_t xbt; 788 struct xenbus_device *xsd; 789 char *xsname; 790 int err; 791 792 ASSERT(!xnfp->xnf_connected); 793 794 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 795 xsname = xvdi_get_xsname(xnfp->xnf_devinfo); 796 797 err = xnf_setup_rings(xnfp); 798 if (err != 0) { 799 cmn_err(CE_WARN, "failed to set up tx/rx rings"); 800 xenbus_dev_error(xsd, err, "setting up ring"); 801 return; 802 } 803 804 again: 805 err = xenbus_transaction_start(&xbt); 806 if (err != 0) { 807 xenbus_dev_error(xsd, EIO, "starting transaction"); 808 return; 809 } 810 811 err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u", 812 xnfp->xnf_tx_ring_ref); 813 if (err != 0) { 814 message = "writing tx ring-ref"; 815 goto abort_transaction; 816 } 817 818 err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u", 819 xnfp->xnf_rx_ring_ref); 820 if (err != 0) { 821 message = "writing rx ring-ref"; 822 goto abort_transaction; 823 } 824 825 err = xenbus_printf(xbt, xsname, "event-channel", "%u", 826 xnfp->xnf_evtchn); 827 if (err != 0) { 828 message = "writing event-channel"; 829 goto abort_transaction; 830 } 831 832 err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1); 833 if (err != 0) { 834 message = "writing feature-rx-notify"; 835 goto abort_transaction; 836 } 837 838 err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1); 839 if (err != 0) { 840 message = "writing request-rx-copy"; 841 goto abort_transaction; 842 } 843 844 if (xnfp->xnf_be_mcast_control) { 845 err = xenbus_printf(xbt, xsname, "request-multicast-control", 846 "%d", 1); 847 if (err != 0) { 848 message = "writing request-multicast-control"; 849 goto abort_transaction; 850 } 851 } 852 853 /* 854 * Tell backend if we support scatter-gather lists on the rx side. 855 */ 856 err = xenbus_printf(xbt, xsname, "feature-sg", "%d", 857 xnf_enable_rx_sg ? 1 : 0); 858 if (err != 0) { 859 message = "writing feature-sg"; 860 goto abort_transaction; 861 } 862 863 /* 864 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is 865 * a prerequisite. 866 */ 867 err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d", 868 (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0); 869 if (err != 0) { 870 message = "writing feature-gso-tcpv4"; 871 goto abort_transaction; 872 } 873 874 err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected); 875 if (err != 0) { 876 message = "switching state to XenbusStateConnected"; 877 goto abort_transaction; 878 } 879 880 err = xenbus_transaction_end(xbt, 0); 881 if (err != 0) { 882 if (err == EAGAIN) 883 goto again; 884 xenbus_dev_error(xsd, err, "completing transaction"); 885 } 886 887 return; 888 889 abort_transaction: 890 (void) xenbus_transaction_end(xbt, 1); 891 xenbus_dev_error(xsd, err, "%s", message); 892 } 893 894 /* 895 * Read configuration information from xenstore. 896 */ 897 void 898 xnf_read_config(xnf_t *xnfp) 899 { 900 int err, be_cap; 901 char mac[ETHERADDRL * 3]; 902 char *oename = xvdi_get_oename(xnfp->xnf_devinfo); 903 904 err = xenbus_scanf(XBT_NULL, oename, "mac", 905 "%s", (char *)&mac[0]); 906 if (err != 0) { 907 /* 908 * bad: we're supposed to be set up with a proper mac 909 * addr. 
at this point 910 */ 911 cmn_err(CE_WARN, "%s%d: no mac address", 912 ddi_driver_name(xnfp->xnf_devinfo), 913 ddi_get_instance(xnfp->xnf_devinfo)); 914 return; 915 } 916 if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) { 917 err = ENOENT; 918 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT, 919 "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo)); 920 return; 921 } 922 923 err = xenbus_scanf(XBT_NULL, oename, 924 "feature-rx-copy", "%d", &be_cap); 925 /* 926 * If we fail to read the store we assume that the key is 927 * absent, implying an older domain at the far end. Older 928 * domains cannot do HV copy. 929 */ 930 if (err != 0) 931 be_cap = 0; 932 xnfp->xnf_be_rx_copy = (be_cap != 0); 933 934 err = xenbus_scanf(XBT_NULL, oename, 935 "feature-multicast-control", "%d", &be_cap); 936 /* 937 * If we fail to read the store we assume that the key is 938 * absent, implying an older domain at the far end. Older 939 * domains do not support multicast control. 940 */ 941 if (err != 0) 942 be_cap = 0; 943 xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control; 944 945 /* 946 * See if back-end supports scatter-gather for transmits. If not, 947 * we will not support LSO and limit the mtu to 1500. 948 */ 949 err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap); 950 if (err != 0) { 951 be_cap = 0; 952 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 953 "'feature-sg' from backend driver"); 954 } 955 if (be_cap == 0) { 956 dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not " 957 "supported for transmits in the backend driver. LSO is " 958 "disabled and MTU is restricted to 1500 bytes."); 959 } 960 xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg; 961 962 if (xnfp->xnf_be_tx_sg) { 963 /* 964 * Check if LSO is supported. Currently we only check for 965 * IPv4 as Illumos doesn't support LSO for IPv6. 966 */ 967 err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d", 968 &be_cap); 969 if (err != 0) { 970 be_cap = 0; 971 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 972 "'feature-gso-tcpv4' from backend driver"); 973 } 974 if (be_cap == 0) { 975 dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not " 976 "supported by the backend driver. 
Performance " 977 "will be affected."); 978 } 979 xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso; 980 } 981 } 982 983 /* 984 * attach(9E) -- Attach a device to the system 985 */ 986 static int 987 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) 988 { 989 mac_register_t *macp; 990 xnf_t *xnfp; 991 int err; 992 char cachename[32]; 993 994 #ifdef XNF_DEBUG 995 if (xnf_debug & XNF_DEBUG_DDI) 996 printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo), 997 (void *)devinfo); 998 #endif 999 1000 switch (cmd) { 1001 case DDI_RESUME: 1002 xnfp = ddi_get_driver_private(devinfo); 1003 xnfp->xnf_gen++; 1004 1005 (void) xvdi_resume(devinfo); 1006 (void) xvdi_alloc_evtchn(devinfo); 1007 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 1008 #ifdef XPV_HVM_DRIVER 1009 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, 1010 xnfp); 1011 #else 1012 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, 1013 (caddr_t)xnfp); 1014 #endif 1015 return (DDI_SUCCESS); 1016 1017 case DDI_ATTACH: 1018 break; 1019 1020 default: 1021 return (DDI_FAILURE); 1022 } 1023 1024 /* 1025 * Allocate gld_mac_info_t and xnf_instance structures 1026 */ 1027 macp = mac_alloc(MAC_VERSION); 1028 if (macp == NULL) 1029 return (DDI_FAILURE); 1030 xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP); 1031 1032 xnfp->xnf_tx_pkt_id = 1033 kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP); 1034 1035 xnfp->xnf_rx_pkt_info = 1036 kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP); 1037 1038 macp->m_dip = devinfo; 1039 macp->m_driver = xnfp; 1040 xnfp->xnf_devinfo = devinfo; 1041 1042 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1043 macp->m_src_addr = xnfp->xnf_mac_addr; 1044 macp->m_callbacks = &xnf_callbacks; 1045 macp->m_min_sdu = 0; 1046 xnfp->xnf_mtu = ETHERMTU; 1047 macp->m_max_sdu = xnfp->xnf_mtu; 1048 1049 xnfp->xnf_running = B_FALSE; 1050 xnfp->xnf_connected = B_FALSE; 1051 xnfp->xnf_be_rx_copy = B_FALSE; 1052 xnfp->xnf_be_mcast_control = B_FALSE; 1053 xnfp->xnf_need_sched = B_FALSE; 1054 1055 xnfp->xnf_rx_head = NULL; 1056 xnfp->xnf_rx_tail = NULL; 1057 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 1058 1059 #ifdef XPV_HVM_DRIVER 1060 /* 1061 * Report our version to dom0. 1062 */ 1063 if (xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d", 1064 HVMPV_XNF_VERS)) 1065 cmn_err(CE_WARN, "xnf: couldn't write version\n"); 1066 #endif 1067 1068 /* 1069 * Get the iblock cookie with which to initialize the mutexes. 
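 * Several of these locks are also taken from interrupt context
 * (xnf_intr()), so they are created with the interrupt block cookie to
 * ensure they operate at the correct priority.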
1070 */ 1071 if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie) 1072 != DDI_SUCCESS) 1073 goto failure; 1074 1075 mutex_init(&xnfp->xnf_txlock, 1076 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1077 mutex_init(&xnfp->xnf_rxlock, 1078 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1079 mutex_init(&xnfp->xnf_schedlock, 1080 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1081 mutex_init(&xnfp->xnf_gref_lock, 1082 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1083 1084 cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL); 1085 cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL); 1086 cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL); 1087 1088 (void) sprintf(cachename, "xnf_buf_cache_%d", 1089 ddi_get_instance(devinfo)); 1090 xnfp->xnf_buf_cache = kmem_cache_create(cachename, 1091 sizeof (xnf_buf_t), 0, 1092 xnf_buf_constructor, xnf_buf_destructor, 1093 NULL, xnfp, NULL, 0); 1094 if (xnfp->xnf_buf_cache == NULL) 1095 goto failure_0; 1096 1097 (void) sprintf(cachename, "xnf_tx_buf_cache_%d", 1098 ddi_get_instance(devinfo)); 1099 xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename, 1100 sizeof (xnf_txbuf_t), 0, 1101 xnf_tx_buf_constructor, xnf_tx_buf_destructor, 1102 NULL, xnfp, NULL, 0); 1103 if (xnfp->xnf_tx_buf_cache == NULL) 1104 goto failure_1; 1105 1106 xnfp->xnf_gref_head = INVALID_GRANT_REF; 1107 1108 if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) { 1109 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize " 1110 "driver data structures", 1111 ddi_get_instance(xnfp->xnf_devinfo)); 1112 goto failure_2; 1113 } 1114 1115 xnfp->xnf_rx_ring.sring->rsp_event = 1116 xnfp->xnf_tx_ring.sring->rsp_event = 1; 1117 1118 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 1119 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 1120 1121 /* set driver private pointer now */ 1122 ddi_set_driver_private(devinfo, xnfp); 1123 1124 if (!xnf_kstat_init(xnfp)) 1125 goto failure_3; 1126 1127 /* 1128 * Allocate an event channel, add the interrupt handler and 1129 * bind it to the event channel. 1130 */ 1131 (void) xvdi_alloc_evtchn(devinfo); 1132 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 1133 #ifdef XPV_HVM_DRIVER 1134 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp); 1135 #else 1136 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp); 1137 #endif 1138 1139 err = mac_register(macp, &xnfp->xnf_mh); 1140 mac_free(macp); 1141 macp = NULL; 1142 if (err != 0) 1143 goto failure_4; 1144 1145 if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL) 1146 != DDI_SUCCESS) 1147 goto failure_5; 1148 1149 #ifdef XPV_HVM_DRIVER 1150 /* 1151 * In the HVM case, this driver essentially replaces a driver for 1152 * a 'real' PCI NIC. Without the "model" property set to 1153 * "Ethernet controller", like the PCI code does, netbooting does 1154 * not work correctly, as strplumb_get_netdev_path() will not find 1155 * this interface. 
1156 */ 1157 (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model", 1158 "Ethernet controller"); 1159 #endif 1160 1161 #ifdef XNF_DEBUG 1162 if (xnf_debug_instance == NULL) 1163 xnf_debug_instance = xnfp; 1164 #endif 1165 1166 return (DDI_SUCCESS); 1167 1168 failure_5: 1169 (void) mac_unregister(xnfp->xnf_mh); 1170 1171 failure_4: 1172 #ifdef XPV_HVM_DRIVER 1173 ec_unbind_evtchn(xnfp->xnf_evtchn); 1174 xvdi_free_evtchn(devinfo); 1175 #else 1176 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1177 #endif 1178 xnfp->xnf_evtchn = INVALID_EVTCHN; 1179 kstat_delete(xnfp->xnf_kstat_aux); 1180 1181 failure_3: 1182 xnf_release_dma_resources(xnfp); 1183 1184 failure_2: 1185 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1186 1187 failure_1: 1188 kmem_cache_destroy(xnfp->xnf_buf_cache); 1189 1190 failure_0: 1191 cv_destroy(&xnfp->xnf_cv_tx_slots); 1192 cv_destroy(&xnfp->xnf_cv_multicast); 1193 cv_destroy(&xnfp->xnf_cv_state); 1194 1195 mutex_destroy(&xnfp->xnf_gref_lock); 1196 mutex_destroy(&xnfp->xnf_schedlock); 1197 mutex_destroy(&xnfp->xnf_rxlock); 1198 mutex_destroy(&xnfp->xnf_txlock); 1199 1200 failure: 1201 kmem_free(xnfp, sizeof (*xnfp)); 1202 if (macp != NULL) 1203 mac_free(macp); 1204 1205 return (DDI_FAILURE); 1206 } 1207 1208 /* detach(9E) -- Detach a device from the system */ 1209 static int 1210 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) 1211 { 1212 xnf_t *xnfp; /* Our private device info */ 1213 1214 #ifdef XNF_DEBUG 1215 if (xnf_debug & XNF_DEBUG_DDI) 1216 printf("xnf_detach(0x%p)\n", (void *)devinfo); 1217 #endif 1218 1219 xnfp = ddi_get_driver_private(devinfo); 1220 1221 switch (cmd) { 1222 case DDI_SUSPEND: 1223 #ifdef XPV_HVM_DRIVER 1224 ec_unbind_evtchn(xnfp->xnf_evtchn); 1225 xvdi_free_evtchn(devinfo); 1226 #else 1227 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1228 #endif 1229 1230 xvdi_suspend(devinfo); 1231 1232 mutex_enter(&xnfp->xnf_rxlock); 1233 mutex_enter(&xnfp->xnf_txlock); 1234 1235 xnfp->xnf_evtchn = INVALID_EVTCHN; 1236 xnfp->xnf_connected = B_FALSE; 1237 mutex_exit(&xnfp->xnf_txlock); 1238 mutex_exit(&xnfp->xnf_rxlock); 1239 1240 /* claim link to be down after disconnect */ 1241 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN); 1242 return (DDI_SUCCESS); 1243 1244 case DDI_DETACH: 1245 break; 1246 1247 default: 1248 return (DDI_FAILURE); 1249 } 1250 1251 if (xnfp->xnf_connected) 1252 return (DDI_FAILURE); 1253 1254 /* 1255 * Cannot detach if we have xnf_buf_t outstanding. 
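 * "Outstanding" here means xnf_stat_buf_allocated is still non-zero; such
 * buffers may still be loaned upstream or granted to the backend, so
 * tearing down the instance while any remain would be unsafe.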
1256 */ 1257 if (xnfp->xnf_stat_buf_allocated > 0) 1258 return (DDI_FAILURE); 1259 1260 if (mac_unregister(xnfp->xnf_mh) != 0) 1261 return (DDI_FAILURE); 1262 1263 kstat_delete(xnfp->xnf_kstat_aux); 1264 1265 /* Stop the receiver */ 1266 xnf_stop(xnfp); 1267 1268 xvdi_remove_event_handler(devinfo, XS_OE_STATE); 1269 1270 /* Remove the interrupt */ 1271 #ifdef XPV_HVM_DRIVER 1272 ec_unbind_evtchn(xnfp->xnf_evtchn); 1273 xvdi_free_evtchn(devinfo); 1274 #else 1275 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1276 #endif 1277 1278 /* Release any pending xmit mblks */ 1279 xnf_release_mblks(xnfp); 1280 1281 /* Release all DMA resources */ 1282 xnf_release_dma_resources(xnfp); 1283 1284 cv_destroy(&xnfp->xnf_cv_tx_slots); 1285 cv_destroy(&xnfp->xnf_cv_multicast); 1286 cv_destroy(&xnfp->xnf_cv_state); 1287 1288 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1289 kmem_cache_destroy(xnfp->xnf_buf_cache); 1290 1291 mutex_destroy(&xnfp->xnf_gref_lock); 1292 mutex_destroy(&xnfp->xnf_schedlock); 1293 mutex_destroy(&xnfp->xnf_rxlock); 1294 mutex_destroy(&xnfp->xnf_txlock); 1295 1296 kmem_free(xnfp, sizeof (*xnfp)); 1297 1298 return (DDI_SUCCESS); 1299 } 1300 1301 /* 1302 * xnf_set_mac_addr() -- set the physical network address on the board. 1303 */ 1304 static int 1305 xnf_set_mac_addr(void *arg, const uint8_t *macaddr) 1306 { 1307 _NOTE(ARGUNUSED(arg, macaddr)); 1308 1309 /* 1310 * We can't set our macaddr. 1311 */ 1312 return (ENOTSUP); 1313 } 1314 1315 /* 1316 * xnf_set_multicast() -- set (enable) or disable a multicast address. 1317 * 1318 * Program the hardware to enable/disable the multicast address 1319 * in "mca". Enable if "add" is true, disable if false. 1320 */ 1321 static int 1322 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca) 1323 { 1324 xnf_t *xnfp = arg; 1325 xnf_txbuf_t *txp; 1326 int n_slots; 1327 RING_IDX slot; 1328 xnf_txid_t *tidp; 1329 netif_tx_request_t *txrp; 1330 struct netif_extra_info *erp; 1331 boolean_t notify, result; 1332 1333 /* 1334 * If the backend does not support multicast control then we 1335 * must assume that the right packets will just arrive. 1336 */ 1337 if (!xnfp->xnf_be_mcast_control) 1338 return (0); 1339 1340 txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); 1341 1342 mutex_enter(&xnfp->xnf_txlock); 1343 1344 /* 1345 * If we're not yet connected then claim success. This is 1346 * acceptable because we refresh the entire set of multicast 1347 * addresses when we get connected. 1348 * 1349 * We can't wait around here because the MAC layer expects 1350 * this to be a non-blocking operation - waiting ends up 1351 * causing a deadlock during resume. 1352 */ 1353 if (!xnfp->xnf_connected) { 1354 mutex_exit(&xnfp->xnf_txlock); 1355 return (0); 1356 } 1357 1358 /* 1359 * 1. Acquire two slots in the ring. 1360 * 2. Fill in the slots. 1361 * 3. Request notification when the operation is done. 1362 * 4. Kick the peer. 1363 * 5. Wait for the response via xnf_tx_clean_ring(). 1364 */ 1365 1366 n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE); 1367 ASSERT(n_slots >= 2); 1368 1369 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1370 tidp = xnf_txid_get(xnfp); 1371 VERIFY(tidp != NULL); 1372 1373 txp->tx_type = TX_MCAST_REQ; 1374 txp->tx_slot = slot; 1375 1376 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1377 erp = (struct netif_extra_info *) 1378 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1); 1379 1380 txrp->gref = 0; 1381 txrp->size = 0; 1382 txrp->offset = 0; 1383 /* Set tx_txreq.id to appease xnf_tx_clean_ring(). 
*/ 1384 txrp->id = txp->tx_txreq.id = tidp->id; 1385 txrp->flags = NETTXF_extra_info; 1386 1387 erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD : 1388 XEN_NETIF_EXTRA_TYPE_MCAST_DEL; 1389 bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL); 1390 1391 tidp->txbuf = txp; 1392 1393 xnfp->xnf_tx_ring.req_prod_pvt = slot + 2; 1394 1395 mutex_enter(&xnfp->xnf_schedlock); 1396 xnfp->xnf_pending_multicast++; 1397 mutex_exit(&xnfp->xnf_schedlock); 1398 1399 /* LINTED: constant in conditional context */ 1400 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, 1401 notify); 1402 if (notify) 1403 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1404 1405 while (txp->tx_type == TX_MCAST_REQ) 1406 cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock); 1407 1408 ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP); 1409 1410 mutex_enter(&xnfp->xnf_schedlock); 1411 xnfp->xnf_pending_multicast--; 1412 mutex_exit(&xnfp->xnf_schedlock); 1413 1414 result = (txp->tx_status == NETIF_RSP_OKAY); 1415 1416 xnf_txid_put(xnfp, tidp); 1417 1418 mutex_exit(&xnfp->xnf_txlock); 1419 1420 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 1421 1422 return (result ? 0 : 1); 1423 } 1424 1425 /* 1426 * xnf_set_promiscuous() -- set or reset promiscuous mode on the board 1427 * 1428 * Program the hardware to enable/disable promiscuous mode. 1429 */ 1430 static int 1431 xnf_set_promiscuous(void *arg, boolean_t on) 1432 { 1433 _NOTE(ARGUNUSED(arg, on)); 1434 1435 /* 1436 * We can't really do this, but we pretend that we can in 1437 * order that snoop will work. 1438 */ 1439 return (0); 1440 } 1441 1442 /* 1443 * Clean buffers that we have responses for from the transmit ring. 1444 */ 1445 static int 1446 xnf_tx_clean_ring(xnf_t *xnfp) 1447 { 1448 boolean_t work_to_do; 1449 1450 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1451 1452 loop: 1453 while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) { 1454 RING_IDX cons, prod, i; 1455 1456 cons = xnfp->xnf_tx_ring.rsp_cons; 1457 prod = xnfp->xnf_tx_ring.sring->rsp_prod; 1458 membar_consumer(); 1459 /* 1460 * Clean tx requests from ring that we have responses 1461 * for. 1462 */ 1463 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod); 1464 for (i = cons; i != prod; i++) { 1465 netif_tx_response_t *trp; 1466 xnf_txid_t *tidp; 1467 xnf_txbuf_t *txp; 1468 1469 trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i); 1470 /* 1471 * if this slot was occupied by netif_extra_info_t, 1472 * then the response will be NETIF_RSP_NULL. In this 1473 * case there are no resources to clean up. 1474 */ 1475 if (trp->status == NETIF_RSP_NULL) 1476 continue; 1477 1478 ASSERT(TX_ID_VALID(trp->id)); 1479 1480 tidp = TX_ID_TO_TXID(xnfp, trp->id); 1481 ASSERT3U(tidp->id, ==, trp->id); 1482 ASSERT3U(tidp->next, ==, INVALID_TX_ID); 1483 1484 txp = tidp->txbuf; 1485 ASSERT(txp != NULL); 1486 ASSERT3U(txp->tx_txreq.id, ==, trp->id); 1487 1488 switch (txp->tx_type) { 1489 case TX_DATA: 1490 /* 1491 * We must put the txid for each response we 1492 * acknowledge to make sure that we never have 1493 * more free slots than txids. Because of this 1494 * we do it here instead of waiting for it to 1495 * be done in xnf_data_txbuf_free_chain(). 1496 */ 1497 xnf_txid_put(xnfp, tidp); 1498 txp->tx_txreq.id = INVALID_TX_ID; 1499 ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0); 1500 txp->tx_head->tx_frags_to_ack--; 1501 1502 /* 1503 * We clean the whole chain once we got a 1504 * response for each fragment. 
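 * The outstanding count is kept in tx_head->tx_frags_to_ack, which was
 * set to the number of data fragments when the packet was pushed in
 * xnf_tx_push_packet().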
1505 */ 1506 if (txp->tx_head->tx_frags_to_ack == 0) 1507 xnf_data_txbuf_free_chain(xnfp, txp); 1508 1509 break; 1510 1511 case TX_MCAST_REQ: 1512 txp->tx_type = TX_MCAST_RSP; 1513 txp->tx_status = trp->status; 1514 cv_broadcast(&xnfp->xnf_cv_multicast); 1515 1516 break; 1517 1518 default: 1519 cmn_err(CE_PANIC, "xnf_tx_clean_ring: " 1520 "invalid xnf_txbuf_t type: %d", 1521 txp->tx_type); 1522 break; 1523 } 1524 } 1525 /* 1526 * Record the last response we dealt with so that we 1527 * know where to start next time around. 1528 */ 1529 xnfp->xnf_tx_ring.rsp_cons = prod; 1530 membar_enter(); 1531 } 1532 1533 /* LINTED: constant in conditional context */ 1534 RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do); 1535 if (work_to_do) 1536 goto loop; 1537 1538 return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring)); 1539 } 1540 1541 /* 1542 * Allocate and fill in a look-aside buffer for the packet `mp'. Used 1543 * to ensure that the packet is physically contiguous and contained 1544 * within a single page. 1545 */ 1546 static xnf_buf_t * 1547 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen) 1548 { 1549 xnf_buf_t *bd; 1550 caddr_t bp; 1551 1552 bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE); 1553 if (bd == NULL) 1554 return (NULL); 1555 1556 bp = bd->buf; 1557 while (mp != NULL) { 1558 size_t len = MBLKL(mp); 1559 1560 bcopy(mp->b_rptr, bp, len); 1561 bp += len; 1562 1563 mp = mp->b_cont; 1564 } 1565 1566 *plen = bp - bd->buf; 1567 ASSERT3U(*plen, <=, PAGESIZE); 1568 1569 xnfp->xnf_stat_tx_lookaside++; 1570 1571 return (bd); 1572 } 1573 1574 /* 1575 * Insert the pseudo-header checksum into the packet. 1576 * Assumes packet is IPv4, TCP/UDP since we only advertised support for 1577 * HCKSUM_INET_FULL_V4. 1578 */ 1579 int 1580 xnf_pseudo_cksum(mblk_t *mp) 1581 { 1582 struct ether_header *ehp; 1583 uint16_t sap, iplen, *stuff; 1584 uint32_t cksum; 1585 size_t len; 1586 ipha_t *ipha; 1587 ipaddr_t src, dst; 1588 uchar_t *ptr; 1589 1590 ptr = mp->b_rptr; 1591 len = MBLKL(mp); 1592 1593 /* Each header must fit completely in an mblk. */ 1594 ASSERT3U(len, >=, sizeof (*ehp)); 1595 1596 ehp = (struct ether_header *)ptr; 1597 1598 if (ntohs(ehp->ether_type) == VLAN_TPID) { 1599 struct ether_vlan_header *evhp; 1600 ASSERT3U(len, >=, sizeof (*evhp)); 1601 evhp = (struct ether_vlan_header *)ptr; 1602 sap = ntohs(evhp->ether_type); 1603 ptr += sizeof (*evhp); 1604 len -= sizeof (*evhp); 1605 } else { 1606 sap = ntohs(ehp->ether_type); 1607 ptr += sizeof (*ehp); 1608 len -= sizeof (*ehp); 1609 } 1610 1611 ASSERT3U(sap, ==, ETHERTYPE_IP); 1612 1613 /* 1614 * Ethernet and IP headers may be in different mblks. 1615 */ 1616 ASSERT3P(ptr, <=, mp->b_wptr); 1617 if (ptr == mp->b_wptr) { 1618 mp = mp->b_cont; 1619 ptr = mp->b_rptr; 1620 len = MBLKL(mp); 1621 } 1622 1623 ASSERT3U(len, >=, sizeof (ipha_t)); 1624 ipha = (ipha_t *)ptr; 1625 1626 /* 1627 * We assume the IP header has no options. (This is enforced in 1628 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM). 1629 */ 1630 ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH); 1631 iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; 1632 1633 ptr += IP_SIMPLE_HDR_LENGTH; 1634 len -= IP_SIMPLE_HDR_LENGTH; 1635 1636 /* 1637 * IP and L4 headers may be in different mblks. 
1638 */ 1639 ASSERT3P(ptr, <=, mp->b_wptr); 1640 if (ptr == mp->b_wptr) { 1641 mp = mp->b_cont; 1642 ptr = mp->b_rptr; 1643 len = MBLKL(mp); 1644 } 1645 1646 switch (ipha->ipha_protocol) { 1647 case IPPROTO_TCP: 1648 ASSERT3U(len, >=, sizeof (tcph_t)); 1649 stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET); 1650 cksum = IP_TCP_CSUM_COMP; 1651 break; 1652 case IPPROTO_UDP: 1653 ASSERT3U(len, >=, sizeof (struct udphdr)); 1654 stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET); 1655 cksum = IP_UDP_CSUM_COMP; 1656 break; 1657 default: 1658 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d", 1659 ipha->ipha_protocol); 1660 return (EINVAL); 1661 } 1662 1663 src = ipha->ipha_src; 1664 dst = ipha->ipha_dst; 1665 1666 cksum += (dst >> 16) + (dst & 0xFFFF); 1667 cksum += (src >> 16) + (src & 0xFFFF); 1668 cksum += htons(iplen); 1669 1670 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1671 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1672 1673 ASSERT(cksum <= 0xFFFF); 1674 1675 *stuff = (uint16_t)(cksum ? cksum : ~cksum); 1676 1677 return (0); 1678 } 1679 1680 /* 1681 * Push a packet into the transmit ring. 1682 * 1683 * Note: the format of a tx packet that spans multiple slots is similar to 1684 * what is described in xnf_rx_one_packet(). 1685 */ 1686 static void 1687 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head) 1688 { 1689 int nslots = 0; 1690 int extras = 0; 1691 RING_IDX slot; 1692 boolean_t notify; 1693 1694 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1695 ASSERT(xnfp->xnf_running); 1696 1697 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1698 1699 /* 1700 * The caller has already checked that we have enough slots to proceed. 1701 */ 1702 for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) { 1703 xnf_txid_t *tidp; 1704 netif_tx_request_t *txrp; 1705 1706 tidp = xnf_txid_get(xnfp); 1707 VERIFY(tidp != NULL); 1708 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1709 1710 txp->tx_slot = slot; 1711 txp->tx_txreq.id = tidp->id; 1712 *txrp = txp->tx_txreq; 1713 1714 tidp->txbuf = txp; 1715 slot++; 1716 nslots++; 1717 1718 /* 1719 * When present, LSO info is placed in a slot after the first 1720 * data segment, and doesn't require a txid. 1721 */ 1722 if (txp->tx_txreq.flags & NETTXF_extra_info) { 1723 netif_extra_info_t *extra; 1724 ASSERT3U(nslots, ==, 1); 1725 1726 extra = (netif_extra_info_t *) 1727 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1728 *extra = txp->tx_extra; 1729 slot++; 1730 nslots++; 1731 extras = 1; 1732 } 1733 } 1734 1735 ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX); 1736 1737 /* 1738 * Store the number of data fragments. 1739 */ 1740 head->tx_frags_to_ack = nslots - extras; 1741 1742 xnfp->xnf_tx_ring.req_prod_pvt = slot; 1743 1744 /* 1745 * Tell the peer that we sent something, if it cares. 
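 * RING_PUSH_REQUESTS_AND_CHECK_NOTIFY() publishes our private producer
 * index and sets 'notify' only if the backend has asked to be woken for
 * one of the requests just pushed, so the event channel is kicked only
 * when the backend is actually waiting for work.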
1746 */ 1747 /* LINTED: constant in conditional context */ 1748 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify); 1749 if (notify) 1750 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1751 } 1752 1753 static xnf_txbuf_t * 1754 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp) 1755 { 1756 xnf_txbuf_t *txp = xnf_data_txbuf_alloc(xnfp); 1757 size_t length; 1758 1759 txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length); 1760 if (txp->tx_bdesc == NULL) { 1761 xnf_data_txbuf_free(xnfp, txp); 1762 return (NULL); 1763 } 1764 txp->tx_mfn = txp->tx_bdesc->buf_mfn; 1765 txp->tx_txreq.gref = txp->tx_bdesc->grant_ref; 1766 txp->tx_txreq.size = length; 1767 txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET; 1768 txp->tx_txreq.flags = 0; 1769 1770 return (txp); 1771 } 1772 1773 static xnf_txbuf_t * 1774 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp) 1775 { 1776 xnf_txbuf_t *head = NULL; 1777 xnf_txbuf_t *tail = NULL; 1778 domid_t oeid; 1779 int nsegs = 0; 1780 1781 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 1782 1783 for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) { 1784 ddi_dma_handle_t dma_handle; 1785 ddi_dma_cookie_t dma_cookie; 1786 uint_t ncookies; 1787 xnf_txbuf_t *txp; 1788 1789 if (MBLKL(ml) == 0) 1790 continue; 1791 1792 txp = xnf_data_txbuf_alloc(xnfp); 1793 1794 if (head == NULL) { 1795 head = txp; 1796 } else { 1797 ASSERT(tail != NULL); 1798 TXBUF_SETNEXT(tail, txp); 1799 txp->tx_head = head; 1800 } 1801 1802 /* 1803 * The necessary segmentation rules (e.g. not crossing a page 1804 * boundary) are enforced by the dma attributes of the handle. 1805 */ 1806 dma_handle = txp->tx_dma_handle; 1807 int ret = ddi_dma_addr_bind_handle(dma_handle, 1808 NULL, (char *)ml->b_rptr, MBLKL(ml), 1809 DDI_DMA_WRITE | DDI_DMA_STREAMING, 1810 DDI_DMA_DONTWAIT, 0, &dma_cookie, 1811 &ncookies); 1812 if (ret != DDI_DMA_MAPPED) { 1813 if (ret != DDI_DMA_NORESOURCES) { 1814 dev_err(xnfp->xnf_devinfo, CE_WARN, 1815 "ddi_dma_addr_bind_handle() failed " 1816 "[dma_error=%d]", ret); 1817 } 1818 goto error; 1819 } 1820 txp->tx_handle_bound = B_TRUE; 1821 1822 ASSERT(ncookies > 0); 1823 for (int i = 0; i < ncookies; i++) { 1824 if (nsegs == XEN_MAX_TX_DATA_PAGES) { 1825 dev_err(xnfp->xnf_devinfo, CE_WARN, 1826 "xnf_dmamap_alloc() failed: " 1827 "too many segments"); 1828 goto error; 1829 } 1830 if (i > 0) { 1831 txp = xnf_data_txbuf_alloc(xnfp); 1832 ASSERT(tail != NULL); 1833 TXBUF_SETNEXT(tail, txp); 1834 txp->tx_head = head; 1835 } 1836 1837 txp->tx_mfn = 1838 xnf_btop(pa_to_ma(dma_cookie.dmac_laddress)); 1839 txp->tx_txreq.gref = xnf_gref_get(xnfp); 1840 if (txp->tx_txreq.gref == INVALID_GRANT_REF) { 1841 dev_err(xnfp->xnf_devinfo, CE_WARN, 1842 "xnf_dmamap_alloc() failed: " 1843 "invalid grant ref"); 1844 goto error; 1845 } 1846 gnttab_grant_foreign_access_ref(txp->tx_txreq.gref, 1847 oeid, txp->tx_mfn, 1); 1848 txp->tx_txreq.offset = 1849 dma_cookie.dmac_laddress & PAGEOFFSET; 1850 txp->tx_txreq.size = dma_cookie.dmac_size; 1851 txp->tx_txreq.flags = 0; 1852 1853 ddi_dma_nextcookie(dma_handle, &dma_cookie); 1854 nsegs++; 1855 1856 if (tail != NULL) 1857 tail->tx_txreq.flags = NETTXF_more_data; 1858 tail = txp; 1859 } 1860 } 1861 1862 *countp = nsegs; 1863 return (head); 1864 1865 error: 1866 xnf_data_txbuf_free_chain(xnfp, head); 1867 return (NULL); 1868 } 1869 1870 static void 1871 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head, 1872 uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss) 1873 { 1874 if (lso_flags != 0) { 1875 ASSERT3U(lso_flags, ==, HW_LSO); 1876 ASSERT3P(head->tx_bdesc, 
==, NULL); 1877 1878 head->tx_txreq.flags |= NETTXF_extra_info; 1879 netif_extra_info_t *extra = &head->tx_extra; 1880 extra->type = XEN_NETIF_EXTRA_TYPE_GSO; 1881 extra->flags = 0; 1882 extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; 1883 extra->u.gso.size = mss; 1884 extra->u.gso.features = 0; 1885 extra->u.gso.pad = 0; 1886 } else if (cksum_flags != 0) { 1887 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1888 /* 1889 * If the local protocol stack requests checksum 1890 * offload we set the 'checksum blank' flag, 1891 * indicating to the peer that we need the checksum 1892 * calculated for us. 1893 * 1894 * We _don't_ set the validated flag, because we haven't 1895 * validated that the data and the checksum match. 1896 * 1897 * Note: we already called xnf_pseudo_cksum() in 1898 * xnf_send(), so we just set the txreq flag here. 1899 */ 1900 head->tx_txreq.flags |= NETTXF_csum_blank; 1901 xnfp->xnf_stat_tx_cksum_deferred++; 1902 } 1903 } 1904 1905 /* 1906 * Send packet mp. Called by the MAC framework. 1907 */ 1908 static mblk_t * 1909 xnf_send(void *arg, mblk_t *mp) 1910 { 1911 xnf_t *xnfp = arg; 1912 xnf_txbuf_t *head; 1913 mblk_t *ml; 1914 int length; 1915 int pages, chunks, slots, slots_free; 1916 uint32_t cksum_flags, lso_flags, mss; 1917 boolean_t pulledup = B_FALSE; 1918 boolean_t force_copy = B_FALSE; 1919 1920 ASSERT3P(mp->b_next, ==, NULL); 1921 1922 mutex_enter(&xnfp->xnf_txlock); 1923 1924 /* 1925 * Wait until we are connected to the backend. 1926 */ 1927 while (!xnfp->xnf_connected) 1928 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock); 1929 1930 /* 1931 * To simplify logic and be in sync with the rescheduling mechanism, 1932 * we require the maximum amount of slots that could be used by a 1933 * transaction to be free before proceeding. The only downside of doing 1934 * this is that it slightly reduces the effective size of the ring. 1935 */ 1936 slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE); 1937 if (slots_free < XEN_MAX_SLOTS_PER_TX) { 1938 /* 1939 * We need to ask for a re-schedule later as the ring is full. 1940 */ 1941 mutex_enter(&xnfp->xnf_schedlock); 1942 xnfp->xnf_need_sched = B_TRUE; 1943 mutex_exit(&xnfp->xnf_schedlock); 1944 1945 xnfp->xnf_stat_tx_defer++; 1946 mutex_exit(&xnfp->xnf_txlock); 1947 return (mp); 1948 } 1949 1950 /* 1951 * Get hw offload parameters. 1952 * This must be done before pulling up the mp as those parameters 1953 * are not copied over. 1954 */ 1955 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags); 1956 mac_lso_get(mp, &mss, &lso_flags); 1957 1958 /* 1959 * XXX: fix MAC framework so that we can advertise support for 1960 * partial checksum for IPv4 only. This way we won't need to calculate 1961 * the pseudo header checksum ourselves. 1962 */ 1963 if (cksum_flags != 0) { 1964 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1965 (void) xnf_pseudo_cksum(mp); 1966 } 1967 1968 pulledup: 1969 for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL; 1970 ml = ml->b_cont, chunks++) { 1971 pages += xnf_mblk_pages(ml); 1972 length += MBLKL(ml); 1973 } 1974 DTRACE_PROBE3(packet, int, length, int, chunks, int, pages); 1975 DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss); 1976 1977 /* 1978 * If the ethernet header crosses a page boundary the packet 1979 * will be dropped by the backend. In practice it seems like 1980 * this happens fairly rarely so we'll do nothing unless the 1981 * packet is small enough to fit in a look-aside buffer. 
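 * (A look-aside buffer is a single page, so only packets of at most
 * PAGESIZE bytes can be copied this way; see xnf_tx_get_lookaside().)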
1982 */ 1983 if (((uintptr_t)mp->b_rptr & PAGEOFFSET) + 1984 sizeof (struct ether_header) > PAGESIZE) { 1985 xnfp->xnf_stat_tx_eth_hdr_split++; 1986 if (length <= PAGESIZE) 1987 force_copy = B_TRUE; 1988 } 1989 1990 if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) { 1991 /* 1992 * If the packet spans several pages and scatter-gather is not 1993 * supported then use a look-aside buffer. 1994 */ 1995 ASSERT3U(length, <=, PAGESIZE); 1996 head = xnf_mblk_copy(xnfp, mp); 1997 if (head == NULL) { 1998 dev_err(xnfp->xnf_devinfo, CE_WARN, 1999 "xnf_mblk_copy() failed"); 2000 goto drop; 2001 } 2002 } else { 2003 /* 2004 * There's a limit for how many pages can be passed to the 2005 * backend. If we pass that limit, the packet will be dropped 2006 * and some backend implementations (e.g. Linux) could even 2007 * offline the interface. 2008 */ 2009 if (pages > XEN_MAX_TX_DATA_PAGES) { 2010 if (pulledup) { 2011 dev_err(xnfp->xnf_devinfo, CE_WARN, 2012 "too many pages, even after pullup: %d.", 2013 pages); 2014 goto drop; 2015 } 2016 2017 /* 2018 * Defragment packet if it spans too many pages. 2019 */ 2020 mblk_t *newmp = msgpullup(mp, -1); 2021 freemsg(mp); 2022 mp = newmp; 2023 xnfp->xnf_stat_tx_pullup++; 2024 pulledup = B_TRUE; 2025 goto pulledup; 2026 } 2027 2028 head = xnf_mblk_map(xnfp, mp, &slots); 2029 if (head == NULL) 2030 goto drop; 2031 2032 IMPLY(slots > 1, xnfp->xnf_be_tx_sg); 2033 } 2034 2035 /* 2036 * Set tx_mp so that mblk is freed when the txbuf chain is freed. 2037 */ 2038 head->tx_mp = mp; 2039 2040 xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss); 2041 2042 /* 2043 * The first request must store the total length of the packet. 2044 */ 2045 head->tx_txreq.size = length; 2046 2047 /* 2048 * Push the packet we have prepared into the ring. 2049 */ 2050 xnf_tx_push_packet(xnfp, head); 2051 xnfp->xnf_stat_opackets++; 2052 xnfp->xnf_stat_obytes += length; 2053 2054 mutex_exit(&xnfp->xnf_txlock); 2055 return (NULL); 2056 2057 drop: 2058 freemsg(mp); 2059 xnfp->xnf_stat_tx_drop++; 2060 mutex_exit(&xnfp->xnf_txlock); 2061 return (NULL); 2062 } 2063 2064 /* 2065 * Notification of RX packets. Currently no TX-complete interrupt is 2066 * used, as we clean the TX ring lazily. 2067 */ 2068 static uint_t 2069 xnf_intr(caddr_t arg) 2070 { 2071 xnf_t *xnfp = (xnf_t *)arg; 2072 mblk_t *mp; 2073 boolean_t need_sched, clean_ring; 2074 2075 mutex_enter(&xnfp->xnf_rxlock); 2076 2077 /* 2078 * Interrupts before we are connected are spurious. 2079 */ 2080 if (!xnfp->xnf_connected) { 2081 mutex_exit(&xnfp->xnf_rxlock); 2082 xnfp->xnf_stat_unclaimed_interrupts++; 2083 return (DDI_INTR_UNCLAIMED); 2084 } 2085 2086 /* 2087 * Receive side processing. 2088 */ 2089 do { 2090 /* 2091 * Collect buffers from the ring. 2092 */ 2093 xnf_rx_collect(xnfp); 2094 2095 /* 2096 * Interrupt me when the next receive buffer is consumed. 2097 */ 2098 xnfp->xnf_rx_ring.sring->rsp_event = 2099 xnfp->xnf_rx_ring.rsp_cons + 1; 2100 xen_mb(); 2101 2102 } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)); 2103 2104 if (xnfp->xnf_rx_new_buffers_posted) { 2105 boolean_t notify; 2106 2107 /* 2108 * Indicate to the peer that we have re-filled the 2109 * receive ring, if it cares. 
2110 */ 2111 /* LINTED: constant in conditional context */ 2112 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); 2113 if (notify) 2114 ec_notify_via_evtchn(xnfp->xnf_evtchn); 2115 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 2116 } 2117 2118 mp = xnfp->xnf_rx_head; 2119 xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL; 2120 2121 xnfp->xnf_stat_interrupts++; 2122 mutex_exit(&xnfp->xnf_rxlock); 2123 2124 if (mp != NULL) 2125 mac_rx(xnfp->xnf_mh, NULL, mp); 2126 2127 /* 2128 * Transmit side processing. 2129 * 2130 * If a previous transmit attempt failed or we have pending 2131 * multicast requests, clean the ring. 2132 * 2133 * If we previously stalled transmission and cleaning produces 2134 * some free slots, tell upstream to attempt sending again. 2135 * 2136 * The odd style is to avoid acquiring xnf_txlock unless we 2137 * will actually look inside the tx machinery. 2138 */ 2139 mutex_enter(&xnfp->xnf_schedlock); 2140 need_sched = xnfp->xnf_need_sched; 2141 clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0); 2142 mutex_exit(&xnfp->xnf_schedlock); 2143 2144 if (clean_ring) { 2145 int free_slots; 2146 2147 mutex_enter(&xnfp->xnf_txlock); 2148 free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE); 2149 2150 if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) { 2151 mutex_enter(&xnfp->xnf_schedlock); 2152 xnfp->xnf_need_sched = B_FALSE; 2153 mutex_exit(&xnfp->xnf_schedlock); 2154 2155 mac_tx_update(xnfp->xnf_mh); 2156 } 2157 mutex_exit(&xnfp->xnf_txlock); 2158 } 2159 2160 return (DDI_INTR_CLAIMED); 2161 } 2162 2163 /* 2164 * xnf_start() -- start the board receiving and enable interrupts. 2165 */ 2166 static int 2167 xnf_start(void *arg) 2168 { 2169 xnf_t *xnfp = arg; 2170 2171 #ifdef XNF_DEBUG 2172 if (xnf_debug & XNF_DEBUG_TRACE) 2173 printf("xnf%d start(0x%p)\n", 2174 ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); 2175 #endif 2176 2177 mutex_enter(&xnfp->xnf_rxlock); 2178 mutex_enter(&xnfp->xnf_txlock); 2179 2180 /* Accept packets from above. */ 2181 xnfp->xnf_running = B_TRUE; 2182 2183 mutex_exit(&xnfp->xnf_txlock); 2184 mutex_exit(&xnfp->xnf_rxlock); 2185 2186 return (0); 2187 } 2188 2189 /* xnf_stop() - disable hardware */ 2190 static void 2191 xnf_stop(void *arg) 2192 { 2193 xnf_t *xnfp = arg; 2194 2195 #ifdef XNF_DEBUG 2196 if (xnf_debug & XNF_DEBUG_TRACE) 2197 printf("xnf%d stop(0x%p)\n", 2198 ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); 2199 #endif 2200 2201 mutex_enter(&xnfp->xnf_rxlock); 2202 mutex_enter(&xnfp->xnf_txlock); 2203 2204 xnfp->xnf_running = B_FALSE; 2205 2206 mutex_exit(&xnfp->xnf_txlock); 2207 mutex_exit(&xnfp->xnf_rxlock); 2208 } 2209 2210 /* 2211 * Hang buffer `bdesc' on the RX ring. 2212 */ 2213 static void 2214 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc) 2215 { 2216 netif_rx_request_t *reqp; 2217 RING_IDX hang_ix; 2218 2219 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2220 2221 reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, 2222 xnfp->xnf_rx_ring.req_prod_pvt); 2223 hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0)); 2224 ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL); 2225 2226 reqp->id = bdesc->id = hang_ix; 2227 reqp->gref = bdesc->grant_ref; 2228 2229 xnfp->xnf_rx_pkt_info[hang_ix] = bdesc; 2230 xnfp->xnf_rx_ring.req_prod_pvt++; 2231 2232 xnfp->xnf_rx_new_buffers_posted = B_TRUE; 2233 } 2234 2235 /* 2236 * Receive an entire packet from the ring, starting from slot *consp. 2237 * prod indicates the slot of the latest response. 2238 * On return, *consp will point to the head of the next packet. 
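 * On success, *mpp is set to the assembled mblk chain.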
2239 *
2240 * Note: If slot prod was reached before we could gather a full packet, we will
2241 * drop the partial packet; this would most likely indicate a bug in either
2242 * the front-end or the back-end driver.
2243 *
2244 * An rx packet can consist of several fragments and thus span multiple slots.
2245 * Each fragment can contain up to 4k of data.
2246 *
2247 * A typical 9000 MTU packet will look like this:
2248 * +------+---------------------+-------------------+-----------------------+
2249 * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2250 * +------+---------------------+-------------------+-----------------------+
2251 * |  1   | netif_rx_response_t | 1st data fragment | more_data             |
2252 * +------+---------------------+-------------------+-----------------------+
2253 * |  2   | netif_rx_response_t | 2nd data fragment | more_data             |
2254 * +------+---------------------+-------------------+-----------------------+
2255 * |  3   | netif_rx_response_t | 3rd data fragment | [none]                |
2256 * +------+---------------------+-------------------+-----------------------+
2257 *
2258 * Fragments are chained by setting NETRXF_more_data in the previous
2259 * response's flags. If there are additional flags, such as
2260 * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
2261 * first fragment.
2262 *
2263 * Sometimes extra info can be present. If so, it will follow the first
2264 * fragment, and the NETRXF_extra_info flag will be set on the first response.
2265 * If LRO applies to a packet, the LRO information is carried in the extra
2266 * info. According to the spec, extra info can also be chained, but must all
2267 * be present right after the first fragment.
2268 *
2269 * Example of a packet with 2 extra infos:
2270 * +------+---------------------+-------------------+-----------------------+
2271 * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2272 * +------+---------------------+-------------------+-----------------------+
2273 * |  1   | netif_rx_response_t | 1st data fragment | extra_info, more_data |
2274 * +------+---------------------+-------------------+-----------------------+
2275 * |  2   | netif_extra_info_t  | 1st extra info    | EXTRA_FLAG_MORE       |
2276 * +------+---------------------+-------------------+-----------------------+
2277 * |  3   | netif_extra_info_t  | 2nd extra info    | [none]                |
2278 * +------+---------------------+-------------------+-----------------------+
2279 * |  4   | netif_rx_response_t | 2nd data fragment | more_data             |
2280 * +------+---------------------+-------------------+-----------------------+
2281 * |  5   | netif_rx_response_t | 3rd data fragment | more_data             |
2282 * +------+---------------------+-------------------+-----------------------+
2283 * |  6   | netif_rx_response_t | 4th data fragment | [none]                |
2284 * +------+---------------------+-------------------+-----------------------+
2285 *
2286 * In practice, the only extra we expect is for LRO, but only if we advertise
2287 * that we support it to the backend (xnf_enable_lro == TRUE).
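 *
 * In outline, the collection loop in xnf_rx_one_packet() below walks these
 * slots roughly as follows (a sketch only; the error handling is in the
 * code):
 *
 *	read the first response; note its more_data/extra_info flags;
 *	loop {
 *		if (this slot carries an extra)
 *			record it; more extras follow if EXTRA_FLAG_MORE;
 *		else
 *			append the data fragment; more follow if more_data;
 *		re-hang a buffer for the consumed slot;
 *		if (no more fragments and no more extras)
 *			break;
 *		read the next response;
 *	}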
2288 */ 2289 static int 2290 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp) 2291 { 2292 mblk_t *head = NULL; 2293 mblk_t *tail = NULL; 2294 mblk_t *mp; 2295 int error = 0; 2296 RING_IDX cons = *consp; 2297 netif_extra_info_t lro; 2298 boolean_t is_lro = B_FALSE; 2299 boolean_t is_extra = B_FALSE; 2300 2301 netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2302 2303 boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0; 2304 boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0; 2305 boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0; 2306 2307 IMPLY(more_data, xnf_enable_rx_sg); 2308 2309 while (cons != prod) { 2310 xnf_buf_t *bdesc; 2311 int len, off; 2312 int rxidx = cons & (NET_RX_RING_SIZE - 1); 2313 2314 bdesc = xnfp->xnf_rx_pkt_info[rxidx]; 2315 xnfp->xnf_rx_pkt_info[rxidx] = NULL; 2316 2317 if (is_extra) { 2318 netif_extra_info_t *extra = (netif_extra_info_t *)&rsp; 2319 /* 2320 * The only extra we expect is for LRO, and it should 2321 * only be present once. 2322 */ 2323 if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO && 2324 !is_lro) { 2325 ASSERT(xnf_enable_lro); 2326 lro = *extra; 2327 is_lro = B_TRUE; 2328 DTRACE_PROBE1(lro, netif_extra_info_t *, &lro); 2329 } else { 2330 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet " 2331 "contains unexpected extra info of type %d", 2332 extra->type); 2333 error = EINVAL; 2334 } 2335 more_extra = 2336 (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0; 2337 2338 goto hang_buf; 2339 } 2340 2341 ASSERT3U(bdesc->id, ==, rsp.id); 2342 2343 /* 2344 * status stores packet length when >= 0, or errors when < 0. 2345 */ 2346 len = rsp.status; 2347 off = rsp.offset; 2348 more_data = (rsp.flags & NETRXF_more_data) != 0; 2349 2350 /* 2351 * sanity checks. 2352 */ 2353 if (!xnfp->xnf_running) { 2354 error = EBUSY; 2355 } else if (len <= 0) { 2356 xnfp->xnf_stat_errrx++; 2357 2358 switch (len) { 2359 case 0: 2360 xnfp->xnf_stat_runt++; 2361 break; 2362 case NETIF_RSP_ERROR: 2363 xnfp->xnf_stat_mac_rcv_error++; 2364 break; 2365 case NETIF_RSP_DROPPED: 2366 xnfp->xnf_stat_norxbuf++; 2367 break; 2368 } 2369 error = EINVAL; 2370 } else if (bdesc->grant_ref == INVALID_GRANT_REF) { 2371 dev_err(xnfp->xnf_devinfo, CE_WARN, 2372 "Bad rx grant reference, rsp id %d", rsp.id); 2373 error = EINVAL; 2374 } else if ((off + len) > PAGESIZE) { 2375 dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses " 2376 "page boundary (offset %d, length %d)", off, len); 2377 error = EINVAL; 2378 } 2379 2380 if (error != 0) { 2381 /* 2382 * If an error has been detected, we do not attempt 2383 * to read the data but we still need to replace 2384 * the rx bufs. 2385 */ 2386 goto hang_buf; 2387 } 2388 2389 xnf_buf_t *nbuf = NULL; 2390 2391 /* 2392 * If the packet is below a pre-determined size we will 2393 * copy data out of the buf rather than replace it. 2394 */ 2395 if (len > xnf_rx_copy_limit) 2396 nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE); 2397 2398 if (nbuf != NULL) { 2399 mp = desballoc((unsigned char *)bdesc->buf, 2400 bdesc->len, 0, &bdesc->free_rtn); 2401 2402 if (mp == NULL) { 2403 xnfp->xnf_stat_rx_desballoc_fail++; 2404 xnfp->xnf_stat_norxbuf++; 2405 error = ENOMEM; 2406 /* 2407 * we free the buf we just allocated as we 2408 * will re-hang the old buf. 
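				 * xnf_buf_put() below also returns the
				 * grant reference held by the new buf.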
2409 */ 2410 xnf_buf_put(xnfp, nbuf, B_FALSE); 2411 goto hang_buf; 2412 } 2413 2414 mp->b_rptr = mp->b_rptr + off; 2415 mp->b_wptr = mp->b_rptr + len; 2416 2417 /* 2418 * Release the grant as the backend doesn't need to 2419 * access this buffer anymore and grants are scarce. 2420 */ 2421 (void) gnttab_end_foreign_access_ref(bdesc->grant_ref, 2422 0); 2423 xnf_gref_put(xnfp, bdesc->grant_ref); 2424 bdesc->grant_ref = INVALID_GRANT_REF; 2425 2426 bdesc = nbuf; 2427 } else { 2428 /* 2429 * We failed to allocate a new buf or decided to reuse 2430 * the old one. In either case we copy the data off it 2431 * and put it back into the ring. 2432 */ 2433 mp = allocb(len, 0); 2434 if (mp == NULL) { 2435 xnfp->xnf_stat_rx_allocb_fail++; 2436 xnfp->xnf_stat_norxbuf++; 2437 error = ENOMEM; 2438 goto hang_buf; 2439 } 2440 bcopy(bdesc->buf + off, mp->b_wptr, len); 2441 mp->b_wptr += len; 2442 } 2443 2444 if (head == NULL) 2445 head = mp; 2446 else 2447 tail->b_cont = mp; 2448 tail = mp; 2449 2450 hang_buf: 2451 /* 2452 * No matter what happens, for each response we need to hang 2453 * a new buf on the rx ring. Put either the old one, or a new 2454 * one if the old one is borrowed by the kernel via desballoc(). 2455 */ 2456 xnf_rxbuf_hang(xnfp, bdesc); 2457 cons++; 2458 2459 /* next response is an extra */ 2460 is_extra = more_extra; 2461 2462 if (!more_data && !more_extra) 2463 break; 2464 2465 /* 2466 * Note that since requests and responses are union'd on the 2467 * same ring, we copy the response to a local variable instead 2468 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have 2469 * overwritten contents of rsp. 2470 */ 2471 rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2472 } 2473 2474 /* 2475 * Check that we do not get stuck in a loop. 2476 */ 2477 ASSERT3U(*consp, !=, cons); 2478 *consp = cons; 2479 2480 /* 2481 * We ran out of responses but the flags indicate there is more data. 2482 */ 2483 if (more_data) { 2484 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments."); 2485 error = EINVAL; 2486 } 2487 if (more_extra) { 2488 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments " 2489 "(extras)."); 2490 error = EINVAL; 2491 } 2492 2493 /* 2494 * An error means the packet must be dropped. If we have already formed 2495 * a partial packet, then discard it. 2496 */ 2497 if (error != 0) { 2498 if (head != NULL) 2499 freemsg(head); 2500 xnfp->xnf_stat_rx_drop++; 2501 return (error); 2502 } 2503 2504 ASSERT(head != NULL); 2505 2506 if (hwcsum) { 2507 /* 2508 * If the peer says that the data has been validated then we 2509 * declare that the full checksum has been verified. 2510 * 2511 * We don't look at the "checksum blank" flag, and hence could 2512 * have a packet here that we are asserting is good with 2513 * a blank checksum. 2514 */ 2515 mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 2516 xnfp->xnf_stat_rx_cksum_no_need++; 2517 } 2518 2519 /* XXX: set lro info for packet once LRO is supported in OS. */ 2520 2521 *mpp = head; 2522 2523 return (0); 2524 } 2525 2526 /* 2527 * Collect packets from the RX ring, storing them in `xnfp' for later use. 2528 */ 2529 static void 2530 xnf_rx_collect(xnf_t *xnfp) 2531 { 2532 RING_IDX prod; 2533 2534 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2535 2536 prod = xnfp->xnf_rx_ring.sring->rsp_prod; 2537 /* 2538 * Ensure we see queued responses up to 'prod'. 2539 */ 2540 membar_consumer(); 2541 2542 while (xnfp->xnf_rx_ring.rsp_cons != prod) { 2543 mblk_t *mp; 2544 2545 /* 2546 * Collect a packet. 
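		 * A packet may span several ring slots, as described
		 * above xnf_rx_one_packet().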
2547 * rsp_cons is updated inside xnf_rx_one_packet(). 2548 */ 2549 int error = xnf_rx_one_packet(xnfp, prod, 2550 &xnfp->xnf_rx_ring.rsp_cons, &mp); 2551 if (error == 0) { 2552 xnfp->xnf_stat_ipackets++; 2553 xnfp->xnf_stat_rbytes += xmsgsize(mp); 2554 2555 /* 2556 * Append the mblk to the rx list. 2557 */ 2558 if (xnfp->xnf_rx_head == NULL) { 2559 ASSERT3P(xnfp->xnf_rx_tail, ==, NULL); 2560 xnfp->xnf_rx_head = mp; 2561 } else { 2562 ASSERT(xnfp->xnf_rx_tail != NULL); 2563 xnfp->xnf_rx_tail->b_next = mp; 2564 } 2565 xnfp->xnf_rx_tail = mp; 2566 } 2567 } 2568 } 2569 2570 /* 2571 * xnf_alloc_dma_resources() -- initialize the drivers structures 2572 */ 2573 static int 2574 xnf_alloc_dma_resources(xnf_t *xnfp) 2575 { 2576 dev_info_t *devinfo = xnfp->xnf_devinfo; 2577 size_t len; 2578 ddi_dma_cookie_t dma_cookie; 2579 uint_t ncookies; 2580 int rc; 2581 caddr_t rptr; 2582 2583 /* 2584 * The code below allocates all the DMA data structures that 2585 * need to be released when the driver is detached. 2586 * 2587 * Allocate page for the transmit descriptor ring. 2588 */ 2589 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2590 DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS) 2591 goto alloc_error; 2592 2593 if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle, 2594 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2595 DDI_DMA_SLEEP, 0, &rptr, &len, 2596 &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) { 2597 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2598 xnfp->xnf_tx_ring_dma_handle = NULL; 2599 goto alloc_error; 2600 } 2601 2602 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL, 2603 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2604 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2605 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2606 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2607 xnfp->xnf_tx_ring_dma_handle = NULL; 2608 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2609 if (rc == DDI_DMA_NORESOURCES) 2610 goto alloc_error; 2611 else 2612 goto error; 2613 } 2614 2615 ASSERT(ncookies == 1); 2616 bzero(rptr, PAGESIZE); 2617 /* LINTED: constant in conditional context */ 2618 SHARED_RING_INIT((netif_tx_sring_t *)rptr); 2619 /* LINTED: constant in conditional context */ 2620 FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE); 2621 xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress; 2622 2623 /* 2624 * Allocate page for the receive descriptor ring. 
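	 * The steps mirror the transmit ring allocation above:
	 * SHARED_RING_INIT() initialises the indices in the shared page
	 * and FRONT_RING_INIT() initialises our private view of the ring;
	 * the DMA cookie's address is recorded in xnf_rx_ring_phys_addr,
	 * presumably for use when the ring page is later made available
	 * to the backend during connection setup.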
2625 */ 2626 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2627 DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS) 2628 goto alloc_error; 2629 2630 if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle, 2631 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2632 DDI_DMA_SLEEP, 0, &rptr, &len, 2633 &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) { 2634 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2635 xnfp->xnf_rx_ring_dma_handle = NULL; 2636 goto alloc_error; 2637 } 2638 2639 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL, 2640 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2641 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2642 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2643 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2644 xnfp->xnf_rx_ring_dma_handle = NULL; 2645 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2646 if (rc == DDI_DMA_NORESOURCES) 2647 goto alloc_error; 2648 else 2649 goto error; 2650 } 2651 2652 ASSERT(ncookies == 1); 2653 bzero(rptr, PAGESIZE); 2654 /* LINTED: constant in conditional context */ 2655 SHARED_RING_INIT((netif_rx_sring_t *)rptr); 2656 /* LINTED: constant in conditional context */ 2657 FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE); 2658 xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress; 2659 2660 return (DDI_SUCCESS); 2661 2662 alloc_error: 2663 cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory", 2664 ddi_get_instance(xnfp->xnf_devinfo)); 2665 error: 2666 xnf_release_dma_resources(xnfp); 2667 return (DDI_FAILURE); 2668 } 2669 2670 /* 2671 * Release all DMA resources in the opposite order from acquisition 2672 */ 2673 static void 2674 xnf_release_dma_resources(xnf_t *xnfp) 2675 { 2676 int i; 2677 2678 /* 2679 * Free receive buffers which are currently associated with 2680 * descriptors. 2681 */ 2682 mutex_enter(&xnfp->xnf_rxlock); 2683 for (i = 0; i < NET_RX_RING_SIZE; i++) { 2684 xnf_buf_t *bp; 2685 2686 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL) 2687 continue; 2688 xnfp->xnf_rx_pkt_info[i] = NULL; 2689 xnf_buf_put(xnfp, bp, B_FALSE); 2690 } 2691 mutex_exit(&xnfp->xnf_rxlock); 2692 2693 /* Free the receive ring buffer. */ 2694 if (xnfp->xnf_rx_ring_dma_acchandle != NULL) { 2695 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle); 2696 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2697 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2698 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2699 } 2700 /* Free the transmit ring buffer. */ 2701 if (xnfp->xnf_tx_ring_dma_acchandle != NULL) { 2702 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle); 2703 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2704 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2705 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2706 } 2707 2708 } 2709 2710 /* 2711 * Release any packets and associated structures used by the TX ring. 
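 * Any mblks still held by in-flight transmit buffers are freed and their
 * txids returned to the free list via xnf_txid_put().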
2712 */ 2713 static void 2714 xnf_release_mblks(xnf_t *xnfp) 2715 { 2716 RING_IDX i; 2717 xnf_txid_t *tidp; 2718 2719 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 2720 i < NET_TX_RING_SIZE; 2721 i++, tidp++) { 2722 xnf_txbuf_t *txp = tidp->txbuf; 2723 2724 if (txp != NULL) { 2725 ASSERT(txp->tx_mp != NULL); 2726 freemsg(txp->tx_mp); 2727 2728 xnf_txid_put(xnfp, tidp); 2729 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 2730 } 2731 } 2732 } 2733 2734 static int 2735 xnf_buf_constructor(void *buf, void *arg, int kmflag) 2736 { 2737 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2738 xnf_buf_t *bdesc = buf; 2739 xnf_t *xnfp = arg; 2740 ddi_dma_cookie_t dma_cookie; 2741 uint_t ncookies; 2742 size_t len; 2743 2744 if (kmflag & KM_NOSLEEP) 2745 ddiflags = DDI_DMA_DONTWAIT; 2746 2747 /* Allocate a DMA access handle for the buffer. */ 2748 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr, 2749 ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS) 2750 goto failure; 2751 2752 /* Allocate DMA-able memory for buffer. */ 2753 if (ddi_dma_mem_alloc(bdesc->dma_handle, 2754 PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0, 2755 &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS) 2756 goto failure_1; 2757 2758 /* Bind to virtual address of buffer to get physical address. */ 2759 if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL, 2760 bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING, 2761 ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) 2762 goto failure_2; 2763 ASSERT(ncookies == 1); 2764 2765 bdesc->free_rtn.free_func = xnf_buf_recycle; 2766 bdesc->free_rtn.free_arg = (caddr_t)bdesc; 2767 bdesc->xnfp = xnfp; 2768 bdesc->buf_phys = dma_cookie.dmac_laddress; 2769 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2770 bdesc->len = dma_cookie.dmac_size; 2771 bdesc->grant_ref = INVALID_GRANT_REF; 2772 bdesc->gen = xnfp->xnf_gen; 2773 2774 atomic_inc_64(&xnfp->xnf_stat_buf_allocated); 2775 2776 return (0); 2777 2778 failure_2: 2779 ddi_dma_mem_free(&bdesc->acc_handle); 2780 2781 failure_1: 2782 ddi_dma_free_handle(&bdesc->dma_handle); 2783 2784 failure: 2785 2786 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */ 2787 return (-1); 2788 } 2789 2790 static void 2791 xnf_buf_destructor(void *buf, void *arg) 2792 { 2793 xnf_buf_t *bdesc = buf; 2794 xnf_t *xnfp = arg; 2795 2796 (void) ddi_dma_unbind_handle(bdesc->dma_handle); 2797 ddi_dma_mem_free(&bdesc->acc_handle); 2798 ddi_dma_free_handle(&bdesc->dma_handle); 2799 2800 atomic_dec_64(&xnfp->xnf_stat_buf_allocated); 2801 } 2802 2803 static xnf_buf_t * 2804 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly) 2805 { 2806 grant_ref_t gref; 2807 xnf_buf_t *bufp; 2808 2809 /* 2810 * Usually grant references are more scarce than memory, so we 2811 * attempt to acquire a grant reference first. 2812 */ 2813 gref = xnf_gref_get(xnfp); 2814 if (gref == INVALID_GRANT_REF) 2815 return (NULL); 2816 2817 bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags); 2818 if (bufp == NULL) { 2819 xnf_gref_put(xnfp, gref); 2820 return (NULL); 2821 } 2822 2823 ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF); 2824 2825 bufp->grant_ref = gref; 2826 2827 if (bufp->gen != xnfp->xnf_gen) 2828 xnf_buf_refresh(bufp); 2829 2830 gnttab_grant_foreign_access_ref(bufp->grant_ref, 2831 xvdi_get_oeid(bufp->xnfp->xnf_devinfo), 2832 bufp->buf_mfn, readonly ? 
1 : 0); 2833 2834 atomic_inc_64(&xnfp->xnf_stat_buf_outstanding); 2835 2836 return (bufp); 2837 } 2838 2839 static void 2840 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly) 2841 { 2842 if (bufp->grant_ref != INVALID_GRANT_REF) { 2843 (void) gnttab_end_foreign_access_ref( 2844 bufp->grant_ref, readonly ? 1 : 0); 2845 xnf_gref_put(xnfp, bufp->grant_ref); 2846 bufp->grant_ref = INVALID_GRANT_REF; 2847 } 2848 2849 kmem_cache_free(xnfp->xnf_buf_cache, bufp); 2850 2851 atomic_dec_64(&xnfp->xnf_stat_buf_outstanding); 2852 } 2853 2854 /* 2855 * Refresh any cached data about a buffer after resume. 2856 */ 2857 static void 2858 xnf_buf_refresh(xnf_buf_t *bdesc) 2859 { 2860 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2861 bdesc->gen = bdesc->xnfp->xnf_gen; 2862 } 2863 2864 /* 2865 * Streams `freeb' routine for `xnf_buf_t' when used as transmit 2866 * look-aside buffers. 2867 */ 2868 static void 2869 xnf_buf_recycle(xnf_buf_t *bdesc) 2870 { 2871 xnf_t *xnfp = bdesc->xnfp; 2872 2873 xnf_buf_put(xnfp, bdesc, B_TRUE); 2874 } 2875 2876 static int 2877 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag) 2878 { 2879 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2880 xnf_txbuf_t *txp = buf; 2881 xnf_t *xnfp = arg; 2882 2883 if (kmflag & KM_NOSLEEP) 2884 ddiflags = DDI_DMA_DONTWAIT; 2885 2886 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr, 2887 ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) { 2888 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */ 2889 return (-1); 2890 } 2891 2892 return (0); 2893 } 2894 2895 static void 2896 xnf_tx_buf_destructor(void *buf, void *arg) 2897 { 2898 _NOTE(ARGUNUSED(arg)); 2899 xnf_txbuf_t *txp = buf; 2900 2901 ddi_dma_free_handle(&txp->tx_dma_handle); 2902 } 2903 2904 /* 2905 * Statistics. 2906 */ 2907 static char *xnf_aux_statistics[] = { 2908 "tx_cksum_deferred", 2909 "rx_cksum_no_need", 2910 "interrupts", 2911 "unclaimed_interrupts", 2912 "tx_pullup", 2913 "tx_lookaside", 2914 "tx_drop", 2915 "tx_eth_hdr_split", 2916 "buf_allocated", 2917 "buf_outstanding", 2918 "gref_outstanding", 2919 "gref_failure", 2920 "gref_peak", 2921 "rx_allocb_fail", 2922 "rx_desballoc_fail", 2923 }; 2924 2925 static int 2926 xnf_kstat_aux_update(kstat_t *ksp, int flag) 2927 { 2928 xnf_t *xnfp; 2929 kstat_named_t *knp; 2930 2931 if (flag != KSTAT_READ) 2932 return (EACCES); 2933 2934 xnfp = ksp->ks_private; 2935 knp = ksp->ks_data; 2936 2937 /* 2938 * Assignment order must match that of the names in 2939 * xnf_aux_statistics. 
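	 * When adding a statistic, append its name to xnf_aux_statistics[]
	 * and add the corresponding assignment below at the same position.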
2940 */ 2941 (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred; 2942 (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need; 2943 2944 (knp++)->value.ui64 = xnfp->xnf_stat_interrupts; 2945 (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts; 2946 (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup; 2947 (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside; 2948 (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop; 2949 (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split; 2950 2951 (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated; 2952 (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding; 2953 (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding; 2954 (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure; 2955 (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak; 2956 (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail; 2957 (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail; 2958 2959 return (0); 2960 } 2961 2962 static boolean_t 2963 xnf_kstat_init(xnf_t *xnfp) 2964 { 2965 int nstat = sizeof (xnf_aux_statistics) / 2966 sizeof (xnf_aux_statistics[0]); 2967 char **cp = xnf_aux_statistics; 2968 kstat_named_t *knp; 2969 2970 /* 2971 * Create and initialise kstats. 2972 */ 2973 if ((xnfp->xnf_kstat_aux = kstat_create("xnf", 2974 ddi_get_instance(xnfp->xnf_devinfo), 2975 "aux_statistics", "net", KSTAT_TYPE_NAMED, 2976 nstat, 0)) == NULL) 2977 return (B_FALSE); 2978 2979 xnfp->xnf_kstat_aux->ks_private = xnfp; 2980 xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update; 2981 2982 knp = xnfp->xnf_kstat_aux->ks_data; 2983 while (nstat > 0) { 2984 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64); 2985 2986 knp++; 2987 cp++; 2988 nstat--; 2989 } 2990 2991 kstat_install(xnfp->xnf_kstat_aux); 2992 2993 return (B_TRUE); 2994 } 2995 2996 static int 2997 xnf_stat(void *arg, uint_t stat, uint64_t *val) 2998 { 2999 xnf_t *xnfp = arg; 3000 3001 mutex_enter(&xnfp->xnf_rxlock); 3002 mutex_enter(&xnfp->xnf_txlock); 3003 3004 #define mac_stat(q, r) \ 3005 case (MAC_STAT_##q): \ 3006 *val = xnfp->xnf_stat_##r; \ 3007 break 3008 3009 #define ether_stat(q, r) \ 3010 case (ETHER_STAT_##q): \ 3011 *val = xnfp->xnf_stat_##r; \ 3012 break 3013 3014 switch (stat) { 3015 3016 mac_stat(IPACKETS, ipackets); 3017 mac_stat(OPACKETS, opackets); 3018 mac_stat(RBYTES, rbytes); 3019 mac_stat(OBYTES, obytes); 3020 mac_stat(NORCVBUF, norxbuf); 3021 mac_stat(IERRORS, errrx); 3022 mac_stat(NOXMTBUF, tx_defer); 3023 3024 ether_stat(MACRCV_ERRORS, mac_rcv_error); 3025 ether_stat(TOOSHORT_ERRORS, runt); 3026 3027 /* always claim to be in full duplex mode */ 3028 case ETHER_STAT_LINK_DUPLEX: 3029 *val = LINK_DUPLEX_FULL; 3030 break; 3031 3032 /* always claim to be at 1Gb/s link speed */ 3033 case MAC_STAT_IFSPEED: 3034 *val = 1000000000ull; 3035 break; 3036 3037 default: 3038 mutex_exit(&xnfp->xnf_txlock); 3039 mutex_exit(&xnfp->xnf_rxlock); 3040 3041 return (ENOTSUP); 3042 } 3043 3044 #undef mac_stat 3045 #undef ether_stat 3046 3047 mutex_exit(&xnfp->xnf_txlock); 3048 mutex_exit(&xnfp->xnf_rxlock); 3049 3050 return (0); 3051 } 3052 3053 static int 3054 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu) 3055 { 3056 if (mtu > ETHERMTU) { 3057 if (!xnf_enable_tx_sg) { 3058 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3059 "because scatter-gather is disabled for transmit " 3060 "in driver settings", ETHERMTU); 3061 return (EINVAL); 3062 } else if (!xnf_enable_rx_sg) { 3063 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3064 "because scatter-gather is disabled for receive " 3065 "in driver settings", ETHERMTU); 3066 return (EINVAL); 3067 
} else if (!xnfp->xnf_be_tx_sg) {
3068			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3069			    "because backend doesn't support scatter-gather",
3070			    ETHERMTU);
3071			return (EINVAL);
3072		}
3073		if (mtu > XNF_MAXPKT)
3074			return (EINVAL);
3075	}
3076	int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
3077	if (error == 0)
3078		xnfp->xnf_mtu = mtu;
3079
3080	return (error);
3081 }
3082
3083 /*ARGSUSED*/
3084 static int
3085 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3086     uint_t prop_val_size, void *prop_val)
3087 {
3088	xnf_t *xnfp = data;
3089
3090	switch (prop_id) {
3091	case MAC_PROP_MTU:
3092		ASSERT(prop_val_size >= sizeof (uint32_t));
3093		bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
3094		break;
3095	default:
3096		return (ENOTSUP);
3097	}
3098	return (0);
3099 }
3100
3101 /*ARGSUSED*/
3102 static int
3103 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3104     uint_t prop_val_size, const void *prop_val)
3105 {
3106	xnf_t *xnfp = data;
3107	uint32_t new_mtu;
3108	int error;
3109
3110	switch (prop_id) {
3111	case MAC_PROP_MTU:
3112		ASSERT(prop_val_size >= sizeof (uint32_t));
3113		bcopy(prop_val, &new_mtu, sizeof (new_mtu));
3114		error = xnf_change_mtu(xnfp, new_mtu);
3115		break;
3116	default:
3117		return (ENOTSUP);
3118	}
3119
3120	return (error);
3121 }
3122
3123 /*ARGSUSED*/
3124 static void
3125 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
3126     mac_prop_info_handle_t prop_handle)
3127 {
3128	switch (prop_id) {
3129	case MAC_PROP_MTU:
3130		mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
3131		break;
3132	default:
3133		break;
3134	}
3135 }
3136
3137 static boolean_t
3138 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3139 {
3140	xnf_t *xnfp = arg;
3141
3142	switch (cap) {
3143	case MAC_CAPAB_HCKSUM: {
3144		uint32_t *capab = cap_data;
3145
3146		/*
3147		 * Whilst the flag used to communicate with the IO
3148		 * domain is called "NETTXF_csum_blank", the checksum
3149		 * in the packet must contain the pseudo-header
3150		 * checksum and not zero.
3151		 *
3152		 * To help out the IO domain, we might use
3153		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
3154		 * then use checksum offload for IPv6 packets, which
3155		 * the IO domain can't handle.
3156		 *
3157		 * As a result, we declare ourselves capable of
3158		 * HCKSUM_INET_FULL_V4. This means that we receive
3159		 * IPv4 packets from the stack with a blank checksum
3160		 * field and must insert the pseudo-header checksum
3161		 * before passing the packet to the IO domain.
3162		 */
3163		*capab = HCKSUM_INET_FULL_V4;
3164
3165		/*
3166		 * TODO: query the "feature-ipv6-csum-offload" capability.
3167		 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
3168		 */
3169
3170		break;
3171	}
3172	case MAC_CAPAB_LSO: {
3173		if (!xnfp->xnf_be_lso)
3174			return (B_FALSE);
3175
3176		mac_capab_lso_t *lso = cap_data;
3177		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3178		lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
3179		break;
3180	}
3181	default:
3182		return (B_FALSE);
3183	}
3184
3185	return (B_TRUE);
3186 }
3187
3188 /*
3189  * The state of the peer has changed - react accordingly.
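 *
 * In particular: on XenbusStateInitWait we read the backend configuration
 * and initiate the connection; on XenbusStateConnected we mark ourselves
 * connected, wake any threads waiting to transmit, kick the event channel
 * and report the link as up.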
3190  */
3191 static void
3192 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
3193     void *arg, void *impl_data)
3194 {
3195	_NOTE(ARGUNUSED(id, arg));
3196	xnf_t *xnfp = ddi_get_driver_private(dip);
3197	XenbusState new_state = *(XenbusState *)impl_data;
3198
3199	ASSERT(xnfp != NULL);
3200
3201	switch (new_state) {
3202	case XenbusStateUnknown:
3203	case XenbusStateInitialising:
3204	case XenbusStateInitialised:
3205	case XenbusStateClosing:
3206	case XenbusStateClosed:
3207	case XenbusStateReconfiguring:
3208	case XenbusStateReconfigured:
3209		break;
3210
3211	case XenbusStateInitWait:
3212		xnf_read_config(xnfp);
3213
3214		if (!xnfp->xnf_be_rx_copy) {
3215			cmn_err(CE_WARN,
3216			    "The xnf driver requires a dom0 that "
3217			    "supports 'feature-rx-copy'.");
3218			(void) xvdi_switch_state(xnfp->xnf_devinfo,
3219			    XBT_NULL, XenbusStateClosed);
3220			break;
3221		}
3222
3223		/*
3224		 * Connect to the backend.
3225		 */
3226		xnf_be_connect(xnfp);
3227
3228		/*
3229		 * Our MAC address as discovered by xnf_read_config().
3230		 */
3231		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
3232
3233		/*
3234		 * We do not know if some features such as LSO are supported
3235		 * until we connect to the backend. We request the MAC layer
3236		 * to poll our capabilities again.
3237		 */
3238		mac_capab_update(xnfp->xnf_mh);
3239
3240		break;
3241
3242	case XenbusStateConnected:
3243		mutex_enter(&xnfp->xnf_rxlock);
3244		mutex_enter(&xnfp->xnf_txlock);
3245
3246		xnfp->xnf_connected = B_TRUE;
3247		/*
3248		 * Wake up any threads waiting to send data to the
3249		 * backend.
3250		 */
3251		cv_broadcast(&xnfp->xnf_cv_state);
3252
3253		mutex_exit(&xnfp->xnf_txlock);
3254		mutex_exit(&xnfp->xnf_rxlock);
3255
3256		/*
3257		 * Kick the peer in case it missed any transmit
3258		 * requests in the TX ring.
3259		 */
3260		ec_notify_via_evtchn(xnfp->xnf_evtchn);
3261
3262		/*
3263		 * There may already be completed receive requests in
3264		 * the ring, sent by the backend after it gets connected
3265		 * but before we see its state change here, so we call
3266		 * xnf_intr() to handle them, if any.
3267		 */
3268		(void) xnf_intr((caddr_t)xnfp);
3269
3270		/*
3271		 * Mark the link up now that we are connected.
3272		 */
3273		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
3274
3275		/*
3276		 * Tell the backend about the multicast addresses in
3277		 * which we are interested.
3278		 */
3279		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
3280
3281		break;
3282
3283	default:
3284		break;
3285	}
3286 }
3287