1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright (c) 2014, 2017 by Delphix. All rights reserved. 29 */ 30 31 /* 32 * 33 * Copyright (c) 2004 Christian Limpach. 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. This section intentionally left blank. 45 * 4. The name of the author may not be used to endorse or promote products 46 * derived from this software without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 49 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 50 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 51 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 52 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 53 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 54 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 55 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 56 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 57 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 58 */ 59 /* 60 * Section 3 of the above license was updated in response to bug 6379571. 61 */ 62 63 /* 64 * xnf.c - GLDv3 network driver for domU. 65 */ 66 67 /* 68 * This driver uses four per-instance locks: 69 * 70 * xnf_gref_lock: 71 * 72 * Protects access to the grant reference list stored in 73 * xnf_gref_head. Grant references should be acquired and released 74 * using gref_get() and gref_put() respectively. 75 * 76 * xnf_schedlock: 77 * 78 * Protects: 79 * xnf_need_sched - used to record that a previous transmit attempt 80 * failed (and consequently it will be necessary to call 81 * mac_tx_update() when transmit resources are available). 82 * xnf_pending_multicast - the number of multicast requests that 83 * have been submitted to the backend for which we have not 84 * processed responses. 
85 * 86 * xnf_txlock: 87 * 88 * Protects the transmit ring (xnf_tx_ring) and associated 89 * structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head). 90 * 91 * xnf_rxlock: 92 * 93 * Protects the receive ring (xnf_rx_ring) and associated 94 * structures (notably xnf_rx_pkt_info). 95 * 96 * If driver-global state that affects both the transmit and receive 97 * rings is manipulated, both xnf_txlock and xnf_rxlock should be 98 * held, in that order. 99 * 100 * xnf_schedlock is acquired both whilst holding xnf_txlock and 101 * without. It should always be acquired after xnf_txlock if both are 102 * held. 103 * 104 * Notes: 105 * - atomic_add_64() is used to manipulate counters where we require 106 * accuracy. For counters intended only for observation by humans, 107 * post increment/decrement are used instead. 108 */ 109 110 #include <sys/types.h> 111 #include <sys/errno.h> 112 #include <sys/param.h> 113 #include <sys/sysmacros.h> 114 #include <sys/systm.h> 115 #include <sys/stream.h> 116 #include <sys/strsubr.h> 117 #include <sys/strsun.h> 118 #include <sys/conf.h> 119 #include <sys/ddi.h> 120 #include <sys/devops.h> 121 #include <sys/sunddi.h> 122 #include <sys/sunndi.h> 123 #include <sys/dlpi.h> 124 #include <sys/ethernet.h> 125 #include <sys/strsun.h> 126 #include <sys/pattr.h> 127 #include <inet/ip.h> 128 #include <inet/ip_impl.h> 129 #include <inet/tcp.h> 130 #include <netinet/udp.h> 131 #include <sys/gld.h> 132 #include <sys/modctl.h> 133 #include <sys/mac_provider.h> 134 #include <sys/mac_ether.h> 135 #include <sys/bootinfo.h> 136 #include <sys/mach_mmu.h> 137 #ifdef XPV_HVM_DRIVER 138 #include <sys/xpv_support.h> 139 #include <sys/hypervisor.h> 140 #else 141 #include <sys/hypervisor.h> 142 #include <sys/evtchn_impl.h> 143 #include <sys/balloon_impl.h> 144 #endif 145 #include <xen/public/io/netif.h> 146 #include <sys/gnttab.h> 147 #include <xen/sys/xendev.h> 148 #include <sys/sdt.h> 149 #include <sys/note.h> 150 #include <sys/debug.h> 151 152 #include <io/xnf.h> 153 154 #if defined(DEBUG) || defined(__lint) 155 #define XNF_DEBUG 156 #endif 157 158 #ifdef XNF_DEBUG 159 int xnf_debug = 0; 160 xnf_t *xnf_debug_instance = NULL; 161 #endif 162 163 /* 164 * On a 32 bit PAE system physical and machine addresses are larger 165 * than 32 bits. ddi_btop() on such systems take an unsigned long 166 * argument, and so addresses above 4G are truncated before ddi_btop() 167 * gets to see them. To avoid this, code the shift operation here. 168 */ 169 #define xnf_btop(addr) ((addr) >> PAGESHIFT) 170 171 /* 172 * The parameters below should only be changed in /etc/system, never in mdb. 173 */ 174 175 /* 176 * Should we use the multicast control feature if the backend provides 177 * it? 178 */ 179 boolean_t xnf_multicast_control = B_TRUE; 180 181 /* 182 * Should we allow scatter-gather for tx if backend allows it? 183 */ 184 boolean_t xnf_enable_tx_sg = B_TRUE; 185 186 /* 187 * Should we allow scatter-gather for rx if backend allows it? 188 */ 189 boolean_t xnf_enable_rx_sg = B_TRUE; 190 191 /* 192 * Should we allow lso for tx sends if backend allows it? 193 * Requires xnf_enable_tx_sg to be also set to TRUE. 194 */ 195 boolean_t xnf_enable_lso = B_TRUE; 196 197 /* 198 * Should we allow lro on rx if backend supports it? 199 * Requires xnf_enable_rx_sg to be also set to TRUE. 200 * 201 * !! WARNING !! 202 * LRO is not yet supported in the OS so this should be left as FALSE. 203 * !! WARNING !! 
 */
boolean_t xnf_enable_lro = B_FALSE;

/*
 * Received packets below this size are copied to a new streams buffer
 * rather than being desballoc'ed.
 *
 * This value is chosen to accommodate traffic where there are a large
 * number of small packets. For data showing a typical distribution,
 * see:
 *
 * Sinha07a:
 *	Rishi Sinha, Christos Papadopoulos, and John
 *	Heidemann. Internet Packet Size Distributions: Some
 *	Observations. Technical Report ISI-TR-2007-643,
 *	USC/Information Sciences Institute, May, 2007. Originally
 *	released October 2005 as web page
 *	http://netweb.usc.edu/~sinha/pkt-sizes/.
 *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
 */
size_t xnf_rx_copy_limit = 64;

#define INVALID_GRANT_HANDLE ((grant_handle_t)-1)
#define INVALID_GRANT_REF ((grant_ref_t)-1)
#define INVALID_TX_ID ((uint16_t)-1)

#define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
#define TX_ID_VALID(i) \
	(((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))

/*
 * calculate how many pages are spanned by an mblk fragment
 */
#define xnf_mblk_pages(mp) (MBLKL(mp) == 0 ? 0 : \
    xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)

/* Required system entry points */
static int xnf_attach(dev_info_t *, ddi_attach_cmd_t);
static int xnf_detach(dev_info_t *, ddi_detach_cmd_t);

/* Required driver entry points for Nemo */
static int xnf_start(void *);
static void xnf_stop(void *);
static int xnf_set_mac_addr(void *, const uint8_t *);
static int xnf_set_multicast(void *, boolean_t, const uint8_t *);
static int xnf_set_promiscuous(void *, boolean_t);
static mblk_t *xnf_send(void *, mblk_t *);
static uint_t xnf_intr(caddr_t);
static int xnf_stat(void *, uint_t, uint64_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void xnf_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Driver private functions */
static int xnf_alloc_dma_resources(xnf_t *);
static void xnf_release_dma_resources(xnf_t *);
static void xnf_release_mblks(xnf_t *);

static int xnf_buf_constructor(void *, void *, int);
static void xnf_buf_destructor(void *, void *);
static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
#pragma inline(xnf_buf_get)
static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
#pragma inline(xnf_buf_put)
static void xnf_buf_refresh(xnf_buf_t *);
#pragma inline(xnf_buf_refresh)
static void xnf_buf_recycle(xnf_buf_t *);

static int xnf_tx_buf_constructor(void *, void *, int);
static void xnf_tx_buf_destructor(void *, void *);

static grant_ref_t xnf_gref_get(xnf_t *);
#pragma inline(xnf_gref_get)
static void xnf_gref_put(xnf_t *, grant_ref_t);
#pragma inline(xnf_gref_put)

static xnf_txid_t *xnf_txid_get(xnf_t *);
#pragma inline(xnf_txid_get)
static void xnf_txid_put(xnf_t *, xnf_txid_t *);
#pragma inline(xnf_txid_put)

static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
static int xnf_tx_clean_ring(xnf_t *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static boolean_t xnf_kstat_init(xnf_t *);
static void xnf_rx_collect(xnf_t *);
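/*
 * Illustrative sketch only -- this helper is hypothetical and nothing in
 * the driver calls it.  It spells out the arithmetic behind the
 * xnf_mblk_pages() macro above: count the pages from the one containing
 * b_rptr through the one containing the last valid byte (b_wptr - 1).
 * Using "b_wptr - 1" keeps a fragment that ends exactly on a page
 * boundary from being counted into the following page; an empty
 * fragment spans no pages at all.
 */
static inline long
xnf_mblk_pages_sketch(mblk_t *mp)
{
        uintptr_t first_page, last_page;

        if (MBLKL(mp) == 0)
                return (0);

        first_page = (uintptr_t)mp->b_rptr & ~(uintptr_t)PAGEOFFSET;
        last_page = ((uintptr_t)mp->b_wptr - 1) & ~(uintptr_t)PAGEOFFSET;

        return ((long)((last_page - first_page) >> PAGESHIFT) + 1);
}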
294 295 #define XNF_CALLBACK_FLAGS (MC_GETCAPAB | MC_PROPERTIES) 296 297 static mac_callbacks_t xnf_callbacks = { 298 .mc_callbacks = XNF_CALLBACK_FLAGS, 299 .mc_getstat = xnf_stat, 300 .mc_start = xnf_start, 301 .mc_stop = xnf_stop, 302 .mc_setpromisc = xnf_set_promiscuous, 303 .mc_multicst = xnf_set_multicast, 304 .mc_unicst = xnf_set_mac_addr, 305 .mc_tx = xnf_send, 306 .mc_getcapab = xnf_getcapab, 307 .mc_setprop = xnf_setprop, 308 .mc_getprop = xnf_getprop, 309 .mc_propinfo = xnf_propinfo, 310 }; 311 312 /* DMA attributes for network ring buffer */ 313 static ddi_dma_attr_t ringbuf_dma_attr = { 314 .dma_attr_version = DMA_ATTR_V0, 315 .dma_attr_addr_lo = 0, 316 .dma_attr_addr_hi = 0xffffffffffffffffULL, 317 .dma_attr_count_max = 0x7fffffff, 318 .dma_attr_align = MMU_PAGESIZE, 319 .dma_attr_burstsizes = 0x7ff, 320 .dma_attr_minxfer = 1, 321 .dma_attr_maxxfer = 0xffffffffU, 322 .dma_attr_seg = 0xffffffffffffffffULL, 323 .dma_attr_sgllen = 1, 324 .dma_attr_granular = 1, 325 .dma_attr_flags = 0 326 }; 327 328 /* DMA attributes for receive data */ 329 static ddi_dma_attr_t rx_buf_dma_attr = { 330 .dma_attr_version = DMA_ATTR_V0, 331 .dma_attr_addr_lo = 0, 332 .dma_attr_addr_hi = 0xffffffffffffffffULL, 333 .dma_attr_count_max = MMU_PAGEOFFSET, 334 .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */ 335 .dma_attr_burstsizes = 0x7ff, 336 .dma_attr_minxfer = 1, 337 .dma_attr_maxxfer = 0xffffffffU, 338 .dma_attr_seg = 0xffffffffffffffffULL, 339 .dma_attr_sgllen = 1, 340 .dma_attr_granular = 1, 341 .dma_attr_flags = 0 342 }; 343 344 /* DMA attributes for transmit data */ 345 static ddi_dma_attr_t tx_buf_dma_attr = { 346 .dma_attr_version = DMA_ATTR_V0, 347 .dma_attr_addr_lo = 0, 348 .dma_attr_addr_hi = 0xffffffffffffffffULL, 349 .dma_attr_count_max = MMU_PAGEOFFSET, 350 .dma_attr_align = 1, 351 .dma_attr_burstsizes = 0x7ff, 352 .dma_attr_minxfer = 1, 353 .dma_attr_maxxfer = 0xffffffffU, 354 .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */ 355 .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */ 356 .dma_attr_granular = 1, 357 .dma_attr_flags = 0 358 }; 359 360 /* DMA access attributes for registers and descriptors */ 361 static ddi_device_acc_attr_t accattr = { 362 DDI_DEVICE_ATTR_V0, 363 DDI_STRUCTURE_LE_ACC, /* This is a little-endian device */ 364 DDI_STRICTORDER_ACC 365 }; 366 367 /* DMA access attributes for data: NOT to be byte swapped. */ 368 static ddi_device_acc_attr_t data_accattr = { 369 DDI_DEVICE_ATTR_V0, 370 DDI_NEVERSWAP_ACC, 371 DDI_STRICTORDER_ACC 372 }; 373 374 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach, 375 nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported); 376 377 static struct modldrv xnf_modldrv = { 378 &mod_driverops, 379 "Virtual Ethernet driver", 380 &xnf_dev_ops 381 }; 382 383 static struct modlinkage modlinkage = { 384 MODREV_1, &xnf_modldrv, NULL 385 }; 386 387 int 388 _init(void) 389 { 390 int r; 391 392 mac_init_ops(&xnf_dev_ops, "xnf"); 393 r = mod_install(&modlinkage); 394 if (r != DDI_SUCCESS) 395 mac_fini_ops(&xnf_dev_ops); 396 397 return (r); 398 } 399 400 int 401 _fini(void) 402 { 403 return (EBUSY); /* XXPV should be removable */ 404 } 405 406 int 407 _info(struct modinfo *modinfop) 408 { 409 return (mod_info(&modlinkage, modinfop)); 410 } 411 412 /* 413 * Acquire a grant reference. 
414 */ 415 static grant_ref_t 416 xnf_gref_get(xnf_t *xnfp) 417 { 418 grant_ref_t gref; 419 420 mutex_enter(&xnfp->xnf_gref_lock); 421 422 do { 423 gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head); 424 425 } while ((gref == INVALID_GRANT_REF) && 426 (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0)); 427 428 mutex_exit(&xnfp->xnf_gref_lock); 429 430 if (gref == INVALID_GRANT_REF) { 431 xnfp->xnf_stat_gref_failure++; 432 } else { 433 atomic_inc_64(&xnfp->xnf_stat_gref_outstanding); 434 if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak) 435 xnfp->xnf_stat_gref_peak = 436 xnfp->xnf_stat_gref_outstanding; 437 } 438 439 return (gref); 440 } 441 442 /* 443 * Release a grant reference. 444 */ 445 static void 446 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref) 447 { 448 ASSERT(gref != INVALID_GRANT_REF); 449 450 mutex_enter(&xnfp->xnf_gref_lock); 451 gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref); 452 mutex_exit(&xnfp->xnf_gref_lock); 453 454 atomic_dec_64(&xnfp->xnf_stat_gref_outstanding); 455 } 456 457 /* 458 * Acquire a transmit id. 459 */ 460 static xnf_txid_t * 461 xnf_txid_get(xnf_t *xnfp) 462 { 463 xnf_txid_t *tidp; 464 465 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 466 467 if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID) 468 return (NULL); 469 470 ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head)); 471 472 tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head); 473 xnfp->xnf_tx_pkt_id_head = tidp->next; 474 tidp->next = INVALID_TX_ID; 475 476 ASSERT(tidp->txbuf == NULL); 477 478 return (tidp); 479 } 480 481 /* 482 * Release a transmit id. 483 */ 484 static void 485 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp) 486 { 487 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 488 ASSERT(TX_ID_VALID(tidp->id)); 489 ASSERT(tidp->next == INVALID_TX_ID); 490 491 tidp->txbuf = NULL; 492 tidp->next = xnfp->xnf_tx_pkt_id_head; 493 xnfp->xnf_tx_pkt_id_head = tidp->id; 494 } 495 496 static void 497 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp) 498 { 499 ASSERT3U(txp->tx_type, ==, TX_DATA); 500 501 /* 502 * We are either using a lookaside buffer or we are mapping existing 503 * buffers. 504 */ 505 if (txp->tx_bdesc != NULL) { 506 ASSERT(!txp->tx_handle_bound); 507 xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE); 508 } else { 509 if (txp->tx_txreq.gref != INVALID_GRANT_REF) { 510 if (gnttab_query_foreign_access(txp->tx_txreq.gref) != 511 0) { 512 cmn_err(CE_PANIC, "tx grant %d still in use by " 513 "backend domain", txp->tx_txreq.gref); 514 } 515 (void) gnttab_end_foreign_access_ref( 516 txp->tx_txreq.gref, 1); 517 xnf_gref_put(xnfp, txp->tx_txreq.gref); 518 } 519 520 if (txp->tx_handle_bound) 521 (void) ddi_dma_unbind_handle(txp->tx_dma_handle); 522 } 523 524 if (txp->tx_mp != NULL) 525 freemsg(txp->tx_mp); 526 527 if (txp->tx_prev != NULL) { 528 ASSERT3P(txp->tx_prev->tx_next, ==, txp); 529 txp->tx_prev->tx_next = NULL; 530 } 531 532 if (txp->tx_txreq.id != INVALID_TX_ID) { 533 /* 534 * This should be only possible when resuming from a suspend. 535 */ 536 ASSERT(!xnfp->xnf_connected); 537 xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id)); 538 txp->tx_txreq.id = INVALID_TX_ID; 539 } 540 541 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 542 } 543 544 static void 545 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp) 546 { 547 if (txp == NULL) 548 return; 549 550 while (txp->tx_next != NULL) 551 txp = txp->tx_next; 552 553 /* 554 * We free the chain in reverse order so that grants can be released 555 * for all dma chunks before unbinding the dma handles. 
The mblk is 556 * freed last, after all its fragments' dma handles are unbound. 557 */ 558 xnf_txbuf_t *prev; 559 for (; txp != NULL; txp = prev) { 560 prev = txp->tx_prev; 561 xnf_data_txbuf_free(xnfp, txp); 562 } 563 } 564 565 static xnf_txbuf_t * 566 xnf_data_txbuf_alloc(xnf_t *xnfp) 567 { 568 xnf_txbuf_t *txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); 569 txp->tx_type = TX_DATA; 570 txp->tx_next = NULL; 571 txp->tx_prev = NULL; 572 txp->tx_head = txp; 573 txp->tx_frags_to_ack = 0; 574 txp->tx_mp = NULL; 575 txp->tx_bdesc = NULL; 576 txp->tx_handle_bound = B_FALSE; 577 txp->tx_txreq.gref = INVALID_GRANT_REF; 578 txp->tx_txreq.id = INVALID_TX_ID; 579 580 return (txp); 581 } 582 583 /* 584 * Get `wanted' slots in the transmit ring, waiting for at least that 585 * number if `wait' is B_TRUE. Force the ring to be cleaned by setting 586 * `wanted' to zero. 587 * 588 * Return the number of slots available. 589 */ 590 static int 591 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait) 592 { 593 int slotsfree; 594 boolean_t forced_clean = (wanted == 0); 595 596 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 597 598 /* LINTED: constant in conditional context */ 599 while (B_TRUE) { 600 slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring); 601 602 if ((slotsfree < wanted) || forced_clean) 603 slotsfree = xnf_tx_clean_ring(xnfp); 604 605 /* 606 * If there are more than we need free, tell other 607 * people to come looking again. We hold txlock, so we 608 * are able to take our slots before anyone else runs. 609 */ 610 if (slotsfree > wanted) 611 cv_broadcast(&xnfp->xnf_cv_tx_slots); 612 613 if (slotsfree >= wanted) 614 break; 615 616 if (!wait) 617 break; 618 619 cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock); 620 } 621 622 ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring))); 623 624 return (slotsfree); 625 } 626 627 static int 628 xnf_setup_rings(xnf_t *xnfp) 629 { 630 domid_t oeid; 631 struct xenbus_device *xsd; 632 RING_IDX i; 633 int err; 634 xnf_txid_t *tidp; 635 xnf_buf_t **bdescp; 636 637 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 638 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 639 640 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 641 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 642 643 err = gnttab_grant_foreign_access(oeid, 644 xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0); 645 if (err <= 0) { 646 err = -err; 647 xenbus_dev_error(xsd, err, "granting access to tx ring page"); 648 goto out; 649 } 650 xnfp->xnf_tx_ring_ref = (grant_ref_t)err; 651 652 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 653 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 654 655 err = gnttab_grant_foreign_access(oeid, 656 xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0); 657 if (err <= 0) { 658 err = -err; 659 xenbus_dev_error(xsd, err, "granting access to rx ring page"); 660 goto out; 661 } 662 xnfp->xnf_rx_ring_ref = (grant_ref_t)err; 663 664 mutex_enter(&xnfp->xnf_txlock); 665 666 /* 667 * We first cleanup the TX ring in case we are doing a resume. 668 * Note that this can lose packets, but we expect to stagger on. 669 */ 670 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */ 671 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 672 i < NET_TX_RING_SIZE; 673 i++, tidp++) { 674 xnf_txbuf_t *txp = tidp->txbuf; 675 if (txp == NULL) 676 continue; 677 678 switch (txp->tx_type) { 679 case TX_DATA: 680 /* 681 * txid_put() will be called for each txbuf's txid in 682 * the chain which will result in clearing tidp->txbuf. 
683 */ 684 xnf_data_txbuf_free_chain(xnfp, txp); 685 686 break; 687 688 case TX_MCAST_REQ: 689 txp->tx_type = TX_MCAST_RSP; 690 txp->tx_status = NETIF_RSP_DROPPED; 691 cv_broadcast(&xnfp->xnf_cv_multicast); 692 693 /* 694 * The request consumed two slots in the ring, 695 * yet only a single xnf_txid_t is used. Step 696 * over the empty slot. 697 */ 698 i++; 699 ASSERT3U(i, <, NET_TX_RING_SIZE); 700 break; 701 702 case TX_MCAST_RSP: 703 break; 704 } 705 } 706 707 /* 708 * Now purge old list and add each txid to the new free list. 709 */ 710 xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. emtpy list. */ 711 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 712 i < NET_TX_RING_SIZE; 713 i++, tidp++) { 714 tidp->id = i; 715 ASSERT3P(tidp->txbuf, ==, NULL); 716 tidp->next = INVALID_TX_ID; /* Appease txid_put(). */ 717 xnf_txid_put(xnfp, tidp); 718 } 719 720 /* LINTED: constant in conditional context */ 721 SHARED_RING_INIT(xnfp->xnf_tx_ring.sring); 722 /* LINTED: constant in conditional context */ 723 FRONT_RING_INIT(&xnfp->xnf_tx_ring, 724 xnfp->xnf_tx_ring.sring, PAGESIZE); 725 726 mutex_exit(&xnfp->xnf_txlock); 727 728 mutex_enter(&xnfp->xnf_rxlock); 729 730 /* 731 * Clean out any buffers currently posted to the receive ring 732 * before we reset it. 733 */ 734 for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0]; 735 i < NET_RX_RING_SIZE; 736 i++, bdescp++) { 737 if (*bdescp != NULL) { 738 xnf_buf_put(xnfp, *bdescp, B_FALSE); 739 *bdescp = NULL; 740 } 741 } 742 743 /* LINTED: constant in conditional context */ 744 SHARED_RING_INIT(xnfp->xnf_rx_ring.sring); 745 /* LINTED: constant in conditional context */ 746 FRONT_RING_INIT(&xnfp->xnf_rx_ring, 747 xnfp->xnf_rx_ring.sring, PAGESIZE); 748 749 /* 750 * Fill the ring with buffers. 751 */ 752 for (i = 0; i < NET_RX_RING_SIZE; i++) { 753 xnf_buf_t *bdesc; 754 755 bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE); 756 VERIFY(bdesc != NULL); 757 xnf_rxbuf_hang(xnfp, bdesc); 758 } 759 760 /* LINTED: constant in conditional context */ 761 RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring); 762 763 mutex_exit(&xnfp->xnf_rxlock); 764 765 return (0); 766 767 out: 768 if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF) 769 gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0); 770 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 771 772 if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF) 773 gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0); 774 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 775 776 return (err); 777 } 778 779 /* 780 * Connect driver to back end, called to set up communication with 781 * back end driver both initially and on resume after restore/migrate. 
782 */ 783 void 784 xnf_be_connect(xnf_t *xnfp) 785 { 786 const char *message; 787 xenbus_transaction_t xbt; 788 struct xenbus_device *xsd; 789 char *xsname; 790 int err; 791 792 ASSERT(!xnfp->xnf_connected); 793 794 xsd = xvdi_get_xsd(xnfp->xnf_devinfo); 795 xsname = xvdi_get_xsname(xnfp->xnf_devinfo); 796 797 err = xnf_setup_rings(xnfp); 798 if (err != 0) { 799 cmn_err(CE_WARN, "failed to set up tx/rx rings"); 800 xenbus_dev_error(xsd, err, "setting up ring"); 801 return; 802 } 803 804 again: 805 err = xenbus_transaction_start(&xbt); 806 if (err != 0) { 807 xenbus_dev_error(xsd, EIO, "starting transaction"); 808 return; 809 } 810 811 err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u", 812 xnfp->xnf_tx_ring_ref); 813 if (err != 0) { 814 message = "writing tx ring-ref"; 815 goto abort_transaction; 816 } 817 818 err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u", 819 xnfp->xnf_rx_ring_ref); 820 if (err != 0) { 821 message = "writing rx ring-ref"; 822 goto abort_transaction; 823 } 824 825 err = xenbus_printf(xbt, xsname, "event-channel", "%u", 826 xnfp->xnf_evtchn); 827 if (err != 0) { 828 message = "writing event-channel"; 829 goto abort_transaction; 830 } 831 832 err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1); 833 if (err != 0) { 834 message = "writing feature-rx-notify"; 835 goto abort_transaction; 836 } 837 838 err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1); 839 if (err != 0) { 840 message = "writing request-rx-copy"; 841 goto abort_transaction; 842 } 843 844 if (xnfp->xnf_be_mcast_control) { 845 err = xenbus_printf(xbt, xsname, "request-multicast-control", 846 "%d", 1); 847 if (err != 0) { 848 message = "writing request-multicast-control"; 849 goto abort_transaction; 850 } 851 } 852 853 /* 854 * Tell backend if we support scatter-gather lists on the rx side. 855 */ 856 err = xenbus_printf(xbt, xsname, "feature-sg", "%d", 857 xnf_enable_rx_sg ? 1 : 0); 858 if (err != 0) { 859 message = "writing feature-sg"; 860 goto abort_transaction; 861 } 862 863 /* 864 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is 865 * a prerequisite. 866 */ 867 err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d", 868 (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0); 869 if (err != 0) { 870 message = "writing feature-gso-tcpv4"; 871 goto abort_transaction; 872 } 873 874 err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected); 875 if (err != 0) { 876 message = "switching state to XenbusStateConnected"; 877 goto abort_transaction; 878 } 879 880 err = xenbus_transaction_end(xbt, 0); 881 if (err != 0) { 882 if (err == EAGAIN) 883 goto again; 884 xenbus_dev_error(xsd, err, "completing transaction"); 885 } 886 887 return; 888 889 abort_transaction: 890 (void) xenbus_transaction_end(xbt, 1); 891 xenbus_dev_error(xsd, err, "%s", message); 892 } 893 894 /* 895 * Read configuration information from xenstore. 896 */ 897 void 898 xnf_read_config(xnf_t *xnfp) 899 { 900 int err, be_cap; 901 char mac[ETHERADDRL * 3]; 902 char *oename = xvdi_get_oename(xnfp->xnf_devinfo); 903 904 err = xenbus_scanf(XBT_NULL, oename, "mac", 905 "%s", (char *)&mac[0]); 906 if (err != 0) { 907 /* 908 * bad: we're supposed to be set up with a proper mac 909 * addr. 
at this point 910 */ 911 cmn_err(CE_WARN, "%s%d: no mac address", 912 ddi_driver_name(xnfp->xnf_devinfo), 913 ddi_get_instance(xnfp->xnf_devinfo)); 914 return; 915 } 916 if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) { 917 err = ENOENT; 918 xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT, 919 "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo)); 920 return; 921 } 922 923 err = xenbus_scanf(XBT_NULL, oename, 924 "feature-rx-copy", "%d", &be_cap); 925 /* 926 * If we fail to read the store we assume that the key is 927 * absent, implying an older domain at the far end. Older 928 * domains cannot do HV copy. 929 */ 930 if (err != 0) 931 be_cap = 0; 932 xnfp->xnf_be_rx_copy = (be_cap != 0); 933 934 err = xenbus_scanf(XBT_NULL, oename, 935 "feature-multicast-control", "%d", &be_cap); 936 /* 937 * If we fail to read the store we assume that the key is 938 * absent, implying an older domain at the far end. Older 939 * domains do not support multicast control. 940 */ 941 if (err != 0) 942 be_cap = 0; 943 xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control; 944 945 /* 946 * See if back-end supports scatter-gather for transmits. If not, 947 * we will not support LSO and limit the mtu to 1500. 948 */ 949 err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap); 950 if (err != 0) { 951 be_cap = 0; 952 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 953 "'feature-sg' from backend driver"); 954 } 955 if (be_cap == 0) { 956 dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not " 957 "supported for transmits in the backend driver. LSO is " 958 "disabled and MTU is restricted to 1500 bytes."); 959 } 960 xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg; 961 962 if (xnfp->xnf_be_tx_sg) { 963 /* 964 * Check if LSO is supported. Currently we only check for 965 * IPv4 as Illumos doesn't support LSO for IPv6. 966 */ 967 err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d", 968 &be_cap); 969 if (err != 0) { 970 be_cap = 0; 971 dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading " 972 "'feature-gso-tcpv4' from backend driver"); 973 } 974 if (be_cap == 0) { 975 dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not " 976 "supported by the backend driver. 
Performance " 977 "will be affected."); 978 } 979 xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso; 980 } 981 } 982 983 /* 984 * attach(9E) -- Attach a device to the system 985 */ 986 static int 987 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) 988 { 989 mac_register_t *macp; 990 xnf_t *xnfp; 991 int err; 992 char cachename[32]; 993 994 #ifdef XNF_DEBUG 995 if (xnf_debug & XNF_DEBUG_DDI) 996 printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo), 997 (void *)devinfo); 998 #endif 999 1000 switch (cmd) { 1001 case DDI_RESUME: 1002 xnfp = ddi_get_driver_private(devinfo); 1003 xnfp->xnf_gen++; 1004 1005 (void) xvdi_resume(devinfo); 1006 (void) xvdi_alloc_evtchn(devinfo); 1007 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 1008 #ifdef XPV_HVM_DRIVER 1009 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, 1010 xnfp); 1011 #else 1012 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, 1013 (caddr_t)xnfp); 1014 #endif 1015 return (DDI_SUCCESS); 1016 1017 case DDI_ATTACH: 1018 break; 1019 1020 default: 1021 return (DDI_FAILURE); 1022 } 1023 1024 /* 1025 * Allocate gld_mac_info_t and xnf_instance structures 1026 */ 1027 macp = mac_alloc(MAC_VERSION); 1028 if (macp == NULL) 1029 return (DDI_FAILURE); 1030 xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP); 1031 1032 xnfp->xnf_tx_pkt_id = 1033 kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP); 1034 1035 xnfp->xnf_rx_pkt_info = 1036 kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP); 1037 1038 macp->m_dip = devinfo; 1039 macp->m_driver = xnfp; 1040 xnfp->xnf_devinfo = devinfo; 1041 1042 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1043 macp->m_src_addr = xnfp->xnf_mac_addr; 1044 macp->m_callbacks = &xnf_callbacks; 1045 macp->m_min_sdu = 0; 1046 xnfp->xnf_mtu = ETHERMTU; 1047 macp->m_max_sdu = xnfp->xnf_mtu; 1048 1049 xnfp->xnf_running = B_FALSE; 1050 xnfp->xnf_connected = B_FALSE; 1051 xnfp->xnf_be_rx_copy = B_FALSE; 1052 xnfp->xnf_be_mcast_control = B_FALSE; 1053 xnfp->xnf_need_sched = B_FALSE; 1054 1055 xnfp->xnf_rx_head = NULL; 1056 xnfp->xnf_rx_tail = NULL; 1057 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 1058 1059 #ifdef XPV_HVM_DRIVER 1060 /* Report our version to dom0 */ 1061 (void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d", 1062 HVMPV_XNF_VERS); 1063 #endif 1064 1065 /* 1066 * Get the iblock cookie with which to initialize the mutexes. 
1067 */ 1068 if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie) 1069 != DDI_SUCCESS) 1070 goto failure; 1071 1072 mutex_init(&xnfp->xnf_txlock, 1073 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1074 mutex_init(&xnfp->xnf_rxlock, 1075 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1076 mutex_init(&xnfp->xnf_schedlock, 1077 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1078 mutex_init(&xnfp->xnf_gref_lock, 1079 NULL, MUTEX_DRIVER, xnfp->xnf_icookie); 1080 1081 cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL); 1082 cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL); 1083 cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL); 1084 1085 (void) sprintf(cachename, "xnf_buf_cache_%d", 1086 ddi_get_instance(devinfo)); 1087 xnfp->xnf_buf_cache = kmem_cache_create(cachename, 1088 sizeof (xnf_buf_t), 0, 1089 xnf_buf_constructor, xnf_buf_destructor, 1090 NULL, xnfp, NULL, 0); 1091 if (xnfp->xnf_buf_cache == NULL) 1092 goto failure_0; 1093 1094 (void) sprintf(cachename, "xnf_tx_buf_cache_%d", 1095 ddi_get_instance(devinfo)); 1096 xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename, 1097 sizeof (xnf_txbuf_t), 0, 1098 xnf_tx_buf_constructor, xnf_tx_buf_destructor, 1099 NULL, xnfp, NULL, 0); 1100 if (xnfp->xnf_tx_buf_cache == NULL) 1101 goto failure_1; 1102 1103 xnfp->xnf_gref_head = INVALID_GRANT_REF; 1104 1105 if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) { 1106 cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize " 1107 "driver data structures", 1108 ddi_get_instance(xnfp->xnf_devinfo)); 1109 goto failure_2; 1110 } 1111 1112 xnfp->xnf_rx_ring.sring->rsp_event = 1113 xnfp->xnf_tx_ring.sring->rsp_event = 1; 1114 1115 xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF; 1116 xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF; 1117 1118 /* set driver private pointer now */ 1119 ddi_set_driver_private(devinfo, xnfp); 1120 1121 if (!xnf_kstat_init(xnfp)) 1122 goto failure_3; 1123 1124 /* 1125 * Allocate an event channel, add the interrupt handler and 1126 * bind it to the event channel. 1127 */ 1128 (void) xvdi_alloc_evtchn(devinfo); 1129 xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo); 1130 #ifdef XPV_HVM_DRIVER 1131 ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp); 1132 #else 1133 (void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp); 1134 #endif 1135 1136 err = mac_register(macp, &xnfp->xnf_mh); 1137 mac_free(macp); 1138 macp = NULL; 1139 if (err != 0) 1140 goto failure_4; 1141 1142 if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL) 1143 != DDI_SUCCESS) 1144 goto failure_5; 1145 1146 #ifdef XPV_HVM_DRIVER 1147 /* 1148 * In the HVM case, this driver essentially replaces a driver for 1149 * a 'real' PCI NIC. Without the "model" property set to 1150 * "Ethernet controller", like the PCI code does, netbooting does 1151 * not work correctly, as strplumb_get_netdev_path() will not find 1152 * this interface. 
1153 */ 1154 (void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model", 1155 "Ethernet controller"); 1156 #endif 1157 1158 #ifdef XNF_DEBUG 1159 if (xnf_debug_instance == NULL) 1160 xnf_debug_instance = xnfp; 1161 #endif 1162 1163 return (DDI_SUCCESS); 1164 1165 failure_5: 1166 (void) mac_unregister(xnfp->xnf_mh); 1167 1168 failure_4: 1169 #ifdef XPV_HVM_DRIVER 1170 ec_unbind_evtchn(xnfp->xnf_evtchn); 1171 xvdi_free_evtchn(devinfo); 1172 #else 1173 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1174 #endif 1175 xnfp->xnf_evtchn = INVALID_EVTCHN; 1176 kstat_delete(xnfp->xnf_kstat_aux); 1177 1178 failure_3: 1179 xnf_release_dma_resources(xnfp); 1180 1181 failure_2: 1182 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1183 1184 failure_1: 1185 kmem_cache_destroy(xnfp->xnf_buf_cache); 1186 1187 failure_0: 1188 cv_destroy(&xnfp->xnf_cv_tx_slots); 1189 cv_destroy(&xnfp->xnf_cv_multicast); 1190 cv_destroy(&xnfp->xnf_cv_state); 1191 1192 mutex_destroy(&xnfp->xnf_gref_lock); 1193 mutex_destroy(&xnfp->xnf_schedlock); 1194 mutex_destroy(&xnfp->xnf_rxlock); 1195 mutex_destroy(&xnfp->xnf_txlock); 1196 1197 failure: 1198 kmem_free(xnfp, sizeof (*xnfp)); 1199 if (macp != NULL) 1200 mac_free(macp); 1201 1202 return (DDI_FAILURE); 1203 } 1204 1205 /* detach(9E) -- Detach a device from the system */ 1206 static int 1207 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) 1208 { 1209 xnf_t *xnfp; /* Our private device info */ 1210 1211 #ifdef XNF_DEBUG 1212 if (xnf_debug & XNF_DEBUG_DDI) 1213 printf("xnf_detach(0x%p)\n", (void *)devinfo); 1214 #endif 1215 1216 xnfp = ddi_get_driver_private(devinfo); 1217 1218 switch (cmd) { 1219 case DDI_SUSPEND: 1220 #ifdef XPV_HVM_DRIVER 1221 ec_unbind_evtchn(xnfp->xnf_evtchn); 1222 xvdi_free_evtchn(devinfo); 1223 #else 1224 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1225 #endif 1226 1227 xvdi_suspend(devinfo); 1228 1229 mutex_enter(&xnfp->xnf_rxlock); 1230 mutex_enter(&xnfp->xnf_txlock); 1231 1232 xnfp->xnf_evtchn = INVALID_EVTCHN; 1233 xnfp->xnf_connected = B_FALSE; 1234 mutex_exit(&xnfp->xnf_txlock); 1235 mutex_exit(&xnfp->xnf_rxlock); 1236 1237 /* claim link to be down after disconnect */ 1238 mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN); 1239 return (DDI_SUCCESS); 1240 1241 case DDI_DETACH: 1242 break; 1243 1244 default: 1245 return (DDI_FAILURE); 1246 } 1247 1248 if (xnfp->xnf_connected) 1249 return (DDI_FAILURE); 1250 1251 /* 1252 * Cannot detach if we have xnf_buf_t outstanding. 
1253 */ 1254 if (xnfp->xnf_stat_buf_allocated > 0) 1255 return (DDI_FAILURE); 1256 1257 if (mac_unregister(xnfp->xnf_mh) != 0) 1258 return (DDI_FAILURE); 1259 1260 kstat_delete(xnfp->xnf_kstat_aux); 1261 1262 /* Stop the receiver */ 1263 xnf_stop(xnfp); 1264 1265 xvdi_remove_event_handler(devinfo, XS_OE_STATE); 1266 1267 /* Remove the interrupt */ 1268 #ifdef XPV_HVM_DRIVER 1269 ec_unbind_evtchn(xnfp->xnf_evtchn); 1270 xvdi_free_evtchn(devinfo); 1271 #else 1272 ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie); 1273 #endif 1274 1275 /* Release any pending xmit mblks */ 1276 xnf_release_mblks(xnfp); 1277 1278 /* Release all DMA resources */ 1279 xnf_release_dma_resources(xnfp); 1280 1281 cv_destroy(&xnfp->xnf_cv_tx_slots); 1282 cv_destroy(&xnfp->xnf_cv_multicast); 1283 cv_destroy(&xnfp->xnf_cv_state); 1284 1285 kmem_cache_destroy(xnfp->xnf_tx_buf_cache); 1286 kmem_cache_destroy(xnfp->xnf_buf_cache); 1287 1288 mutex_destroy(&xnfp->xnf_gref_lock); 1289 mutex_destroy(&xnfp->xnf_schedlock); 1290 mutex_destroy(&xnfp->xnf_rxlock); 1291 mutex_destroy(&xnfp->xnf_txlock); 1292 1293 kmem_free(xnfp, sizeof (*xnfp)); 1294 1295 return (DDI_SUCCESS); 1296 } 1297 1298 /* 1299 * xnf_set_mac_addr() -- set the physical network address on the board. 1300 */ 1301 static int 1302 xnf_set_mac_addr(void *arg, const uint8_t *macaddr) 1303 { 1304 _NOTE(ARGUNUSED(arg, macaddr)); 1305 1306 /* 1307 * We can't set our macaddr. 1308 */ 1309 return (ENOTSUP); 1310 } 1311 1312 /* 1313 * xnf_set_multicast() -- set (enable) or disable a multicast address. 1314 * 1315 * Program the hardware to enable/disable the multicast address 1316 * in "mca". Enable if "add" is true, disable if false. 1317 */ 1318 static int 1319 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca) 1320 { 1321 xnf_t *xnfp = arg; 1322 xnf_txbuf_t *txp; 1323 int n_slots; 1324 RING_IDX slot; 1325 xnf_txid_t *tidp; 1326 netif_tx_request_t *txrp; 1327 struct netif_extra_info *erp; 1328 boolean_t notify, result; 1329 1330 /* 1331 * If the backend does not support multicast control then we 1332 * must assume that the right packets will just arrive. 1333 */ 1334 if (!xnfp->xnf_be_mcast_control) 1335 return (0); 1336 1337 txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP); 1338 1339 mutex_enter(&xnfp->xnf_txlock); 1340 1341 /* 1342 * If we're not yet connected then claim success. This is 1343 * acceptable because we refresh the entire set of multicast 1344 * addresses when we get connected. 1345 * 1346 * We can't wait around here because the MAC layer expects 1347 * this to be a non-blocking operation - waiting ends up 1348 * causing a deadlock during resume. 1349 */ 1350 if (!xnfp->xnf_connected) { 1351 mutex_exit(&xnfp->xnf_txlock); 1352 return (0); 1353 } 1354 1355 /* 1356 * 1. Acquire two slots in the ring. 1357 * 2. Fill in the slots. 1358 * 3. Request notification when the operation is done. 1359 * 4. Kick the peer. 1360 * 5. Wait for the response via xnf_tx_clean_ring(). 1361 */ 1362 1363 n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE); 1364 ASSERT(n_slots >= 2); 1365 1366 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1367 tidp = xnf_txid_get(xnfp); 1368 VERIFY(tidp != NULL); 1369 1370 txp->tx_type = TX_MCAST_REQ; 1371 txp->tx_slot = slot; 1372 1373 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1374 erp = (struct netif_extra_info *) 1375 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1); 1376 1377 txrp->gref = 0; 1378 txrp->size = 0; 1379 txrp->offset = 0; 1380 /* Set tx_txreq.id to appease xnf_tx_clean_ring(). 
*/ 1381 txrp->id = txp->tx_txreq.id = tidp->id; 1382 txrp->flags = NETTXF_extra_info; 1383 1384 erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD : 1385 XEN_NETIF_EXTRA_TYPE_MCAST_DEL; 1386 bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL); 1387 1388 tidp->txbuf = txp; 1389 1390 xnfp->xnf_tx_ring.req_prod_pvt = slot + 2; 1391 1392 mutex_enter(&xnfp->xnf_schedlock); 1393 xnfp->xnf_pending_multicast++; 1394 mutex_exit(&xnfp->xnf_schedlock); 1395 1396 /* LINTED: constant in conditional context */ 1397 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, 1398 notify); 1399 if (notify) 1400 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1401 1402 while (txp->tx_type == TX_MCAST_REQ) 1403 cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock); 1404 1405 ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP); 1406 1407 mutex_enter(&xnfp->xnf_schedlock); 1408 xnfp->xnf_pending_multicast--; 1409 mutex_exit(&xnfp->xnf_schedlock); 1410 1411 result = (txp->tx_status == NETIF_RSP_OKAY); 1412 1413 xnf_txid_put(xnfp, tidp); 1414 1415 mutex_exit(&xnfp->xnf_txlock); 1416 1417 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 1418 1419 return (result ? 0 : 1); 1420 } 1421 1422 /* 1423 * xnf_set_promiscuous() -- set or reset promiscuous mode on the board 1424 * 1425 * Program the hardware to enable/disable promiscuous mode. 1426 */ 1427 static int 1428 xnf_set_promiscuous(void *arg, boolean_t on) 1429 { 1430 _NOTE(ARGUNUSED(arg, on)); 1431 1432 /* 1433 * We can't really do this, but we pretend that we can in 1434 * order that snoop will work. 1435 */ 1436 return (0); 1437 } 1438 1439 /* 1440 * Clean buffers that we have responses for from the transmit ring. 1441 */ 1442 static int 1443 xnf_tx_clean_ring(xnf_t *xnfp) 1444 { 1445 boolean_t work_to_do; 1446 1447 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1448 1449 loop: 1450 while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) { 1451 RING_IDX cons, prod, i; 1452 1453 cons = xnfp->xnf_tx_ring.rsp_cons; 1454 prod = xnfp->xnf_tx_ring.sring->rsp_prod; 1455 membar_consumer(); 1456 /* 1457 * Clean tx requests from ring that we have responses 1458 * for. 1459 */ 1460 DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod); 1461 for (i = cons; i != prod; i++) { 1462 netif_tx_response_t *trp; 1463 xnf_txid_t *tidp; 1464 xnf_txbuf_t *txp; 1465 1466 trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i); 1467 /* 1468 * if this slot was occupied by netif_extra_info_t, 1469 * then the response will be NETIF_RSP_NULL. In this 1470 * case there are no resources to clean up. 1471 */ 1472 if (trp->status == NETIF_RSP_NULL) 1473 continue; 1474 1475 ASSERT(TX_ID_VALID(trp->id)); 1476 1477 tidp = TX_ID_TO_TXID(xnfp, trp->id); 1478 ASSERT3U(tidp->id, ==, trp->id); 1479 ASSERT3U(tidp->next, ==, INVALID_TX_ID); 1480 1481 txp = tidp->txbuf; 1482 ASSERT(txp != NULL); 1483 ASSERT3U(txp->tx_txreq.id, ==, trp->id); 1484 1485 switch (txp->tx_type) { 1486 case TX_DATA: 1487 /* 1488 * We must put the txid for each response we 1489 * acknowledge to make sure that we never have 1490 * more free slots than txids. Because of this 1491 * we do it here instead of waiting for it to 1492 * be done in xnf_data_txbuf_free_chain(). 1493 */ 1494 xnf_txid_put(xnfp, tidp); 1495 txp->tx_txreq.id = INVALID_TX_ID; 1496 ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0); 1497 txp->tx_head->tx_frags_to_ack--; 1498 1499 /* 1500 * We clean the whole chain once we got a 1501 * response for each fragment. 
1502 */ 1503 if (txp->tx_head->tx_frags_to_ack == 0) 1504 xnf_data_txbuf_free_chain(xnfp, txp); 1505 1506 break; 1507 1508 case TX_MCAST_REQ: 1509 txp->tx_type = TX_MCAST_RSP; 1510 txp->tx_status = trp->status; 1511 cv_broadcast(&xnfp->xnf_cv_multicast); 1512 1513 break; 1514 1515 default: 1516 cmn_err(CE_PANIC, "xnf_tx_clean_ring: " 1517 "invalid xnf_txbuf_t type: %d", 1518 txp->tx_type); 1519 break; 1520 } 1521 } 1522 /* 1523 * Record the last response we dealt with so that we 1524 * know where to start next time around. 1525 */ 1526 xnfp->xnf_tx_ring.rsp_cons = prod; 1527 membar_enter(); 1528 } 1529 1530 /* LINTED: constant in conditional context */ 1531 RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do); 1532 if (work_to_do) 1533 goto loop; 1534 1535 return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring)); 1536 } 1537 1538 /* 1539 * Allocate and fill in a look-aside buffer for the packet `mp'. Used 1540 * to ensure that the packet is physically contiguous and contained 1541 * within a single page. 1542 */ 1543 static xnf_buf_t * 1544 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen) 1545 { 1546 xnf_buf_t *bd; 1547 caddr_t bp; 1548 1549 bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE); 1550 if (bd == NULL) 1551 return (NULL); 1552 1553 bp = bd->buf; 1554 while (mp != NULL) { 1555 size_t len = MBLKL(mp); 1556 1557 bcopy(mp->b_rptr, bp, len); 1558 bp += len; 1559 1560 mp = mp->b_cont; 1561 } 1562 1563 *plen = bp - bd->buf; 1564 ASSERT3U(*plen, <=, PAGESIZE); 1565 1566 xnfp->xnf_stat_tx_lookaside++; 1567 1568 return (bd); 1569 } 1570 1571 /* 1572 * Insert the pseudo-header checksum into the packet. 1573 * Assumes packet is IPv4, TCP/UDP since we only advertised support for 1574 * HCKSUM_INET_FULL_V4. 1575 */ 1576 int 1577 xnf_pseudo_cksum(mblk_t *mp) 1578 { 1579 struct ether_header *ehp; 1580 uint16_t sap, iplen, *stuff; 1581 uint32_t cksum; 1582 size_t len; 1583 ipha_t *ipha; 1584 ipaddr_t src, dst; 1585 uchar_t *ptr; 1586 1587 ptr = mp->b_rptr; 1588 len = MBLKL(mp); 1589 1590 /* Each header must fit completely in an mblk. */ 1591 ASSERT3U(len, >=, sizeof (*ehp)); 1592 1593 ehp = (struct ether_header *)ptr; 1594 1595 if (ntohs(ehp->ether_type) == VLAN_TPID) { 1596 struct ether_vlan_header *evhp; 1597 ASSERT3U(len, >=, sizeof (*evhp)); 1598 evhp = (struct ether_vlan_header *)ptr; 1599 sap = ntohs(evhp->ether_type); 1600 ptr += sizeof (*evhp); 1601 len -= sizeof (*evhp); 1602 } else { 1603 sap = ntohs(ehp->ether_type); 1604 ptr += sizeof (*ehp); 1605 len -= sizeof (*ehp); 1606 } 1607 1608 ASSERT3U(sap, ==, ETHERTYPE_IP); 1609 1610 /* 1611 * Ethernet and IP headers may be in different mblks. 1612 */ 1613 ASSERT3P(ptr, <=, mp->b_wptr); 1614 if (ptr == mp->b_wptr) { 1615 mp = mp->b_cont; 1616 ptr = mp->b_rptr; 1617 len = MBLKL(mp); 1618 } 1619 1620 ASSERT3U(len, >=, sizeof (ipha_t)); 1621 ipha = (ipha_t *)ptr; 1622 1623 /* 1624 * We assume the IP header has no options. (This is enforced in 1625 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM). 1626 */ 1627 ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH); 1628 iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; 1629 1630 ptr += IP_SIMPLE_HDR_LENGTH; 1631 len -= IP_SIMPLE_HDR_LENGTH; 1632 1633 /* 1634 * IP and L4 headers may be in different mblks. 
1635 */ 1636 ASSERT3P(ptr, <=, mp->b_wptr); 1637 if (ptr == mp->b_wptr) { 1638 mp = mp->b_cont; 1639 ptr = mp->b_rptr; 1640 len = MBLKL(mp); 1641 } 1642 1643 switch (ipha->ipha_protocol) { 1644 case IPPROTO_TCP: 1645 ASSERT3U(len, >=, sizeof (tcph_t)); 1646 stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET); 1647 cksum = IP_TCP_CSUM_COMP; 1648 break; 1649 case IPPROTO_UDP: 1650 ASSERT3U(len, >=, sizeof (struct udphdr)); 1651 stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET); 1652 cksum = IP_UDP_CSUM_COMP; 1653 break; 1654 default: 1655 cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d", 1656 ipha->ipha_protocol); 1657 return (EINVAL); 1658 } 1659 1660 src = ipha->ipha_src; 1661 dst = ipha->ipha_dst; 1662 1663 cksum += (dst >> 16) + (dst & 0xFFFF); 1664 cksum += (src >> 16) + (src & 0xFFFF); 1665 cksum += htons(iplen); 1666 1667 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1668 cksum = (cksum >> 16) + (cksum & 0xFFFF); 1669 1670 ASSERT(cksum <= 0xFFFF); 1671 1672 *stuff = (uint16_t)(cksum ? cksum : ~cksum); 1673 1674 return (0); 1675 } 1676 1677 /* 1678 * Push a packet into the transmit ring. 1679 * 1680 * Note: the format of a tx packet that spans multiple slots is similar to 1681 * what is described in xnf_rx_one_packet(). 1682 */ 1683 static void 1684 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head) 1685 { 1686 int nslots = 0; 1687 int extras = 0; 1688 RING_IDX slot; 1689 boolean_t notify; 1690 1691 ASSERT(MUTEX_HELD(&xnfp->xnf_txlock)); 1692 ASSERT(xnfp->xnf_running); 1693 1694 slot = xnfp->xnf_tx_ring.req_prod_pvt; 1695 1696 /* 1697 * The caller has already checked that we have enough slots to proceed. 1698 */ 1699 for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) { 1700 xnf_txid_t *tidp; 1701 netif_tx_request_t *txrp; 1702 1703 tidp = xnf_txid_get(xnfp); 1704 VERIFY(tidp != NULL); 1705 txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1706 1707 txp->tx_slot = slot; 1708 txp->tx_txreq.id = tidp->id; 1709 *txrp = txp->tx_txreq; 1710 1711 tidp->txbuf = txp; 1712 slot++; 1713 nslots++; 1714 1715 /* 1716 * When present, LSO info is placed in a slot after the first 1717 * data segment, and doesn't require a txid. 1718 */ 1719 if (txp->tx_txreq.flags & NETTXF_extra_info) { 1720 netif_extra_info_t *extra; 1721 ASSERT3U(nslots, ==, 1); 1722 1723 extra = (netif_extra_info_t *) 1724 RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot); 1725 *extra = txp->tx_extra; 1726 slot++; 1727 nslots++; 1728 extras = 1; 1729 } 1730 } 1731 1732 ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX); 1733 1734 /* 1735 * Store the number of data fragments. 1736 */ 1737 head->tx_frags_to_ack = nslots - extras; 1738 1739 xnfp->xnf_tx_ring.req_prod_pvt = slot; 1740 1741 /* 1742 * Tell the peer that we sent something, if it cares. 
1743 */ 1744 /* LINTED: constant in conditional context */ 1745 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify); 1746 if (notify) 1747 ec_notify_via_evtchn(xnfp->xnf_evtchn); 1748 } 1749 1750 static xnf_txbuf_t * 1751 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp) 1752 { 1753 xnf_txbuf_t *txp = xnf_data_txbuf_alloc(xnfp); 1754 size_t length; 1755 1756 txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length); 1757 if (txp->tx_bdesc == NULL) { 1758 xnf_data_txbuf_free(xnfp, txp); 1759 return (NULL); 1760 } 1761 txp->tx_mfn = txp->tx_bdesc->buf_mfn; 1762 txp->tx_txreq.gref = txp->tx_bdesc->grant_ref; 1763 txp->tx_txreq.size = length; 1764 txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET; 1765 txp->tx_txreq.flags = 0; 1766 1767 return (txp); 1768 } 1769 1770 static xnf_txbuf_t * 1771 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp) 1772 { 1773 xnf_txbuf_t *head = NULL; 1774 xnf_txbuf_t *tail = NULL; 1775 domid_t oeid; 1776 int nsegs = 0; 1777 1778 oeid = xvdi_get_oeid(xnfp->xnf_devinfo); 1779 1780 for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) { 1781 ddi_dma_handle_t dma_handle; 1782 ddi_dma_cookie_t dma_cookie; 1783 uint_t ncookies; 1784 xnf_txbuf_t *txp; 1785 1786 if (MBLKL(ml) == 0) 1787 continue; 1788 1789 txp = xnf_data_txbuf_alloc(xnfp); 1790 1791 if (head == NULL) { 1792 head = txp; 1793 } else { 1794 ASSERT(tail != NULL); 1795 TXBUF_SETNEXT(tail, txp); 1796 txp->tx_head = head; 1797 } 1798 1799 /* 1800 * The necessary segmentation rules (e.g. not crossing a page 1801 * boundary) are enforced by the dma attributes of the handle. 1802 */ 1803 dma_handle = txp->tx_dma_handle; 1804 int ret = ddi_dma_addr_bind_handle(dma_handle, 1805 NULL, (char *)ml->b_rptr, MBLKL(ml), 1806 DDI_DMA_WRITE | DDI_DMA_STREAMING, 1807 DDI_DMA_DONTWAIT, 0, &dma_cookie, 1808 &ncookies); 1809 if (ret != DDI_DMA_MAPPED) { 1810 if (ret != DDI_DMA_NORESOURCES) { 1811 dev_err(xnfp->xnf_devinfo, CE_WARN, 1812 "ddi_dma_addr_bind_handle() failed " 1813 "[dma_error=%d]", ret); 1814 } 1815 goto error; 1816 } 1817 txp->tx_handle_bound = B_TRUE; 1818 1819 ASSERT(ncookies > 0); 1820 for (int i = 0; i < ncookies; i++) { 1821 if (nsegs == XEN_MAX_TX_DATA_PAGES) { 1822 dev_err(xnfp->xnf_devinfo, CE_WARN, 1823 "xnf_dmamap_alloc() failed: " 1824 "too many segments"); 1825 goto error; 1826 } 1827 if (i > 0) { 1828 txp = xnf_data_txbuf_alloc(xnfp); 1829 ASSERT(tail != NULL); 1830 TXBUF_SETNEXT(tail, txp); 1831 txp->tx_head = head; 1832 } 1833 1834 txp->tx_mfn = 1835 xnf_btop(pa_to_ma(dma_cookie.dmac_laddress)); 1836 txp->tx_txreq.gref = xnf_gref_get(xnfp); 1837 if (txp->tx_txreq.gref == INVALID_GRANT_REF) { 1838 dev_err(xnfp->xnf_devinfo, CE_WARN, 1839 "xnf_dmamap_alloc() failed: " 1840 "invalid grant ref"); 1841 goto error; 1842 } 1843 gnttab_grant_foreign_access_ref(txp->tx_txreq.gref, 1844 oeid, txp->tx_mfn, 1); 1845 txp->tx_txreq.offset = 1846 dma_cookie.dmac_laddress & PAGEOFFSET; 1847 txp->tx_txreq.size = dma_cookie.dmac_size; 1848 txp->tx_txreq.flags = 0; 1849 1850 ddi_dma_nextcookie(dma_handle, &dma_cookie); 1851 nsegs++; 1852 1853 if (tail != NULL) 1854 tail->tx_txreq.flags = NETTXF_more_data; 1855 tail = txp; 1856 } 1857 } 1858 1859 *countp = nsegs; 1860 return (head); 1861 1862 error: 1863 xnf_data_txbuf_free_chain(xnfp, head); 1864 return (NULL); 1865 } 1866 1867 static void 1868 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head, 1869 uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss) 1870 { 1871 if (lso_flags != 0) { 1872 ASSERT3U(lso_flags, ==, HW_LSO); 1873 ASSERT3P(head->tx_bdesc, 
==, NULL); 1874 1875 head->tx_txreq.flags |= NETTXF_extra_info; 1876 netif_extra_info_t *extra = &head->tx_extra; 1877 extra->type = XEN_NETIF_EXTRA_TYPE_GSO; 1878 extra->flags = 0; 1879 extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; 1880 extra->u.gso.size = mss; 1881 extra->u.gso.features = 0; 1882 extra->u.gso.pad = 0; 1883 } else if (cksum_flags != 0) { 1884 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1885 /* 1886 * If the local protocol stack requests checksum 1887 * offload we set the 'checksum blank' flag, 1888 * indicating to the peer that we need the checksum 1889 * calculated for us. 1890 * 1891 * We _don't_ set the validated flag, because we haven't 1892 * validated that the data and the checksum match. 1893 * 1894 * Note: we already called xnf_pseudo_cksum() in 1895 * xnf_send(), so we just set the txreq flag here. 1896 */ 1897 head->tx_txreq.flags |= NETTXF_csum_blank; 1898 xnfp->xnf_stat_tx_cksum_deferred++; 1899 } 1900 } 1901 1902 /* 1903 * Send packet mp. Called by the MAC framework. 1904 */ 1905 static mblk_t * 1906 xnf_send(void *arg, mblk_t *mp) 1907 { 1908 xnf_t *xnfp = arg; 1909 xnf_txbuf_t *head; 1910 mblk_t *ml; 1911 int length; 1912 int pages, chunks, slots, slots_free; 1913 uint32_t cksum_flags, lso_flags, mss; 1914 boolean_t pulledup = B_FALSE; 1915 boolean_t force_copy = B_FALSE; 1916 1917 ASSERT3P(mp->b_next, ==, NULL); 1918 1919 mutex_enter(&xnfp->xnf_txlock); 1920 1921 /* 1922 * Wait until we are connected to the backend. 1923 */ 1924 while (!xnfp->xnf_connected) 1925 cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock); 1926 1927 /* 1928 * To simplify logic and be in sync with the rescheduling mechanism, 1929 * we require the maximum amount of slots that could be used by a 1930 * transaction to be free before proceeding. The only downside of doing 1931 * this is that it slightly reduces the effective size of the ring. 1932 */ 1933 slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE); 1934 if (slots_free < XEN_MAX_SLOTS_PER_TX) { 1935 /* 1936 * We need to ask for a re-schedule later as the ring is full. 1937 */ 1938 mutex_enter(&xnfp->xnf_schedlock); 1939 xnfp->xnf_need_sched = B_TRUE; 1940 mutex_exit(&xnfp->xnf_schedlock); 1941 1942 xnfp->xnf_stat_tx_defer++; 1943 mutex_exit(&xnfp->xnf_txlock); 1944 return (mp); 1945 } 1946 1947 /* 1948 * Get hw offload parameters. 1949 * This must be done before pulling up the mp as those parameters 1950 * are not copied over. 1951 */ 1952 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags); 1953 mac_lso_get(mp, &mss, &lso_flags); 1954 1955 /* 1956 * XXX: fix MAC framework so that we can advertise support for 1957 * partial checksum for IPv4 only. This way we won't need to calculate 1958 * the pseudo header checksum ourselves. 1959 */ 1960 if (cksum_flags != 0) { 1961 ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM); 1962 (void) xnf_pseudo_cksum(mp); 1963 } 1964 1965 pulledup: 1966 for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL; 1967 ml = ml->b_cont, chunks++) { 1968 pages += xnf_mblk_pages(ml); 1969 length += MBLKL(ml); 1970 } 1971 DTRACE_PROBE3(packet, int, length, int, chunks, int, pages); 1972 DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss); 1973 1974 /* 1975 * If the ethernet header crosses a page boundary the packet 1976 * will be dropped by the backend. In practice it seems like 1977 * this happens fairly rarely so we'll do nothing unless the 1978 * packet is small enough to fit in a look-aside buffer. 
1979 */ 1980 if (((uintptr_t)mp->b_rptr & PAGEOFFSET) + 1981 sizeof (struct ether_header) > PAGESIZE) { 1982 xnfp->xnf_stat_tx_eth_hdr_split++; 1983 if (length <= PAGESIZE) 1984 force_copy = B_TRUE; 1985 } 1986 1987 if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) { 1988 /* 1989 * If the packet spans several pages and scatter-gather is not 1990 * supported then use a look-aside buffer. 1991 */ 1992 ASSERT3U(length, <=, PAGESIZE); 1993 head = xnf_mblk_copy(xnfp, mp); 1994 if (head == NULL) { 1995 dev_err(xnfp->xnf_devinfo, CE_WARN, 1996 "xnf_mblk_copy() failed"); 1997 goto drop; 1998 } 1999 } else { 2000 /* 2001 * There's a limit for how many pages can be passed to the 2002 * backend. If we pass that limit, the packet will be dropped 2003 * and some backend implementations (e.g. Linux) could even 2004 * offline the interface. 2005 */ 2006 if (pages > XEN_MAX_TX_DATA_PAGES) { 2007 if (pulledup) { 2008 dev_err(xnfp->xnf_devinfo, CE_WARN, 2009 "too many pages, even after pullup: %d.", 2010 pages); 2011 goto drop; 2012 } 2013 2014 /* 2015 * Defragment packet if it spans too many pages. 2016 */ 2017 mblk_t *newmp = msgpullup(mp, -1); 2018 freemsg(mp); 2019 mp = newmp; 2020 xnfp->xnf_stat_tx_pullup++; 2021 pulledup = B_TRUE; 2022 goto pulledup; 2023 } 2024 2025 head = xnf_mblk_map(xnfp, mp, &slots); 2026 if (head == NULL) 2027 goto drop; 2028 2029 IMPLY(slots > 1, xnfp->xnf_be_tx_sg); 2030 } 2031 2032 /* 2033 * Set tx_mp so that mblk is freed when the txbuf chain is freed. 2034 */ 2035 head->tx_mp = mp; 2036 2037 xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss); 2038 2039 /* 2040 * The first request must store the total length of the packet. 2041 */ 2042 head->tx_txreq.size = length; 2043 2044 /* 2045 * Push the packet we have prepared into the ring. 2046 */ 2047 xnf_tx_push_packet(xnfp, head); 2048 xnfp->xnf_stat_opackets++; 2049 xnfp->xnf_stat_obytes += length; 2050 2051 mutex_exit(&xnfp->xnf_txlock); 2052 return (NULL); 2053 2054 drop: 2055 freemsg(mp); 2056 xnfp->xnf_stat_tx_drop++; 2057 mutex_exit(&xnfp->xnf_txlock); 2058 return (NULL); 2059 } 2060 2061 /* 2062 * Notification of RX packets. Currently no TX-complete interrupt is 2063 * used, as we clean the TX ring lazily. 2064 */ 2065 static uint_t 2066 xnf_intr(caddr_t arg) 2067 { 2068 xnf_t *xnfp = (xnf_t *)arg; 2069 mblk_t *mp; 2070 boolean_t need_sched, clean_ring; 2071 2072 mutex_enter(&xnfp->xnf_rxlock); 2073 2074 /* 2075 * Interrupts before we are connected are spurious. 2076 */ 2077 if (!xnfp->xnf_connected) { 2078 mutex_exit(&xnfp->xnf_rxlock); 2079 xnfp->xnf_stat_unclaimed_interrupts++; 2080 return (DDI_INTR_UNCLAIMED); 2081 } 2082 2083 /* 2084 * Receive side processing. 2085 */ 2086 do { 2087 /* 2088 * Collect buffers from the ring. 2089 */ 2090 xnf_rx_collect(xnfp); 2091 2092 /* 2093 * Interrupt me when the next receive buffer is consumed. 2094 */ 2095 xnfp->xnf_rx_ring.sring->rsp_event = 2096 xnfp->xnf_rx_ring.rsp_cons + 1; 2097 xen_mb(); 2098 2099 } while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)); 2100 2101 if (xnfp->xnf_rx_new_buffers_posted) { 2102 boolean_t notify; 2103 2104 /* 2105 * Indicate to the peer that we have re-filled the 2106 * receive ring, if it cares. 
2107 */ 2108 /* LINTED: constant in conditional context */ 2109 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify); 2110 if (notify) 2111 ec_notify_via_evtchn(xnfp->xnf_evtchn); 2112 xnfp->xnf_rx_new_buffers_posted = B_FALSE; 2113 } 2114 2115 mp = xnfp->xnf_rx_head; 2116 xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL; 2117 2118 xnfp->xnf_stat_interrupts++; 2119 mutex_exit(&xnfp->xnf_rxlock); 2120 2121 if (mp != NULL) 2122 mac_rx(xnfp->xnf_mh, NULL, mp); 2123 2124 /* 2125 * Transmit side processing. 2126 * 2127 * If a previous transmit attempt failed or we have pending 2128 * multicast requests, clean the ring. 2129 * 2130 * If we previously stalled transmission and cleaning produces 2131 * some free slots, tell upstream to attempt sending again. 2132 * 2133 * The odd style is to avoid acquiring xnf_txlock unless we 2134 * will actually look inside the tx machinery. 2135 */ 2136 mutex_enter(&xnfp->xnf_schedlock); 2137 need_sched = xnfp->xnf_need_sched; 2138 clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0); 2139 mutex_exit(&xnfp->xnf_schedlock); 2140 2141 if (clean_ring) { 2142 int free_slots; 2143 2144 mutex_enter(&xnfp->xnf_txlock); 2145 free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE); 2146 2147 if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) { 2148 mutex_enter(&xnfp->xnf_schedlock); 2149 xnfp->xnf_need_sched = B_FALSE; 2150 mutex_exit(&xnfp->xnf_schedlock); 2151 2152 mac_tx_update(xnfp->xnf_mh); 2153 } 2154 mutex_exit(&xnfp->xnf_txlock); 2155 } 2156 2157 return (DDI_INTR_CLAIMED); 2158 } 2159 2160 /* 2161 * xnf_start() -- start the board receiving and enable interrupts. 2162 */ 2163 static int 2164 xnf_start(void *arg) 2165 { 2166 xnf_t *xnfp = arg; 2167 2168 #ifdef XNF_DEBUG 2169 if (xnf_debug & XNF_DEBUG_TRACE) 2170 printf("xnf%d start(0x%p)\n", 2171 ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); 2172 #endif 2173 2174 mutex_enter(&xnfp->xnf_rxlock); 2175 mutex_enter(&xnfp->xnf_txlock); 2176 2177 /* Accept packets from above. */ 2178 xnfp->xnf_running = B_TRUE; 2179 2180 mutex_exit(&xnfp->xnf_txlock); 2181 mutex_exit(&xnfp->xnf_rxlock); 2182 2183 return (0); 2184 } 2185 2186 /* xnf_stop() - disable hardware */ 2187 static void 2188 xnf_stop(void *arg) 2189 { 2190 xnf_t *xnfp = arg; 2191 2192 #ifdef XNF_DEBUG 2193 if (xnf_debug & XNF_DEBUG_TRACE) 2194 printf("xnf%d stop(0x%p)\n", 2195 ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp); 2196 #endif 2197 2198 mutex_enter(&xnfp->xnf_rxlock); 2199 mutex_enter(&xnfp->xnf_txlock); 2200 2201 xnfp->xnf_running = B_FALSE; 2202 2203 mutex_exit(&xnfp->xnf_txlock); 2204 mutex_exit(&xnfp->xnf_rxlock); 2205 } 2206 2207 /* 2208 * Hang buffer `bdesc' on the RX ring. 2209 */ 2210 static void 2211 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc) 2212 { 2213 netif_rx_request_t *reqp; 2214 RING_IDX hang_ix; 2215 2216 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2217 2218 reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring, 2219 xnfp->xnf_rx_ring.req_prod_pvt); 2220 hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0)); 2221 ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL); 2222 2223 reqp->id = bdesc->id = hang_ix; 2224 reqp->gref = bdesc->grant_ref; 2225 2226 xnfp->xnf_rx_pkt_info[hang_ix] = bdesc; 2227 xnfp->xnf_rx_ring.req_prod_pvt++; 2228 2229 xnfp->xnf_rx_new_buffers_posted = B_TRUE; 2230 } 2231 2232 /* 2233 * Receive an entire packet from the ring, starting from slot *consp. 2234 * prod indicates the slot of the latest response. 2235 * On return, *consp will point to the head of the next packet. 
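* Returns 0 and stores the assembled mblk chain in *mpp on success; otherwise an errno value is returned, the partial packet is dropped and *mpp is left untouched.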
2236 * 2237 * Note: If slot prod was reached before we could gather a full packet, we will 2238 * drop the partial packet; this would most likely indicate a bug in either 2239 * the front-end or the back-end driver. 2240 * 2241 * An rx packet can consist of several fragments and thus span multiple slots. 2242 * Each fragment can contain up to 4k of data. 2243 * 2244 * A typical 9000 MTU packet will look like this: 2245 * +------+---------------------+-------------------+-----------------------+ 2246 * | SLOT | TYPE | CONTENTS | FLAGS | 2247 * +------+---------------------+-------------------+-----------------------+ 2248 * | 1 | netif_rx_response_t | 1st data fragment | more_data | 2249 * +------+---------------------+-------------------+-----------------------+ 2250 * | 2 | netif_rx_response_t | 2nd data fragment | more_data | 2251 * +------+---------------------+-------------------+-----------------------+ 2252 * | 3 | netif_rx_response_t | 3rd data fragment | [none] | 2253 * +------+---------------------+-------------------+-----------------------+ 2254 * 2255 * Fragments are chained by setting NETRXF_more_data in the previous 2256 * response's flags. If there are additional flags, such as 2257 * NETRXF_data_validated or NETRXF_extra_info, those should be set on the 2258 * first fragment. 2259 * 2260 * Sometimes extra info can be present. If so, it will follow the first 2261 * fragment, and the NETRXF_extra_info flag will be set on the first response. 2262 * If LRO is in use for a packet, the LRO parameters are carried in the extra 2263 * info. Per the spec, extra info can also be chained, but it must all be 2264 * present right after the first fragment. 2265 * 2266 * Example of a packet with 2 extra infos: 2267 * +------+---------------------+-------------------+-----------------------+ 2268 * | SLOT | TYPE | CONTENTS | FLAGS | 2269 * +------+---------------------+-------------------+-----------------------+ 2270 * | 1 | netif_rx_response_t | 1st data fragment | extra_info, more_data | 2271 * +------+---------------------+-------------------+-----------------------+ 2272 * | 2 | netif_extra_info_t | 1st extra info | EXTRA_FLAG_MORE | 2273 * +------+---------------------+-------------------+-----------------------+ 2274 * | 3 | netif_extra_info_t | 2nd extra info | [none] | 2275 * +------+---------------------+-------------------+-----------------------+ 2276 * | 4 | netif_rx_response_t | 2nd data fragment | more_data | 2277 * +------+---------------------+-------------------+-----------------------+ 2278 * | 5 | netif_rx_response_t | 3rd data fragment | more_data | 2279 * +------+---------------------+-------------------+-----------------------+ 2280 * | 6 | netif_rx_response_t | 4th data fragment | [none] | 2281 * +------+---------------------+-------------------+-----------------------+ 2282 * 2283 * In practice, the only extra we expect is for LRO, but only if we advertise 2284 * that we support it to the backend (xnf_enable_lro == TRUE). 
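* In outline, the loop below handles one response slot per iteration: it reads the response, records any extra info, validates the data fragment, then either loans the receive buffer upstream via desballoc() or copies the data into a freshly allocated mblk, and finally re-hangs a buffer on the ring before advancing to the next slot.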
2285 */ 2286 static int 2287 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp) 2288 { 2289 mblk_t *head = NULL; 2290 mblk_t *tail = NULL; 2291 mblk_t *mp; 2292 int error = 0; 2293 RING_IDX cons = *consp; 2294 netif_extra_info_t lro; 2295 boolean_t is_lro = B_FALSE; 2296 boolean_t is_extra = B_FALSE; 2297 2298 netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2299 2300 boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0; 2301 boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0; 2302 boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0; 2303 2304 IMPLY(more_data, xnf_enable_rx_sg); 2305 2306 while (cons != prod) { 2307 xnf_buf_t *bdesc; 2308 int len, off; 2309 int rxidx = cons & (NET_RX_RING_SIZE - 1); 2310 2311 bdesc = xnfp->xnf_rx_pkt_info[rxidx]; 2312 xnfp->xnf_rx_pkt_info[rxidx] = NULL; 2313 2314 if (is_extra) { 2315 netif_extra_info_t *extra = (netif_extra_info_t *)&rsp; 2316 /* 2317 * The only extra we expect is for LRO, and it should 2318 * only be present once. 2319 */ 2320 if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO && 2321 !is_lro) { 2322 ASSERT(xnf_enable_lro); 2323 lro = *extra; 2324 is_lro = B_TRUE; 2325 DTRACE_PROBE1(lro, netif_extra_info_t *, &lro); 2326 } else { 2327 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet " 2328 "contains unexpected extra info of type %d", 2329 extra->type); 2330 error = EINVAL; 2331 } 2332 more_extra = 2333 (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0; 2334 2335 goto hang_buf; 2336 } 2337 2338 ASSERT3U(bdesc->id, ==, rsp.id); 2339 2340 /* 2341 * status stores packet length when >= 0, or errors when < 0. 2342 */ 2343 len = rsp.status; 2344 off = rsp.offset; 2345 more_data = (rsp.flags & NETRXF_more_data) != 0; 2346 2347 /* 2348 * sanity checks. 2349 */ 2350 if (!xnfp->xnf_running) { 2351 error = EBUSY; 2352 } else if (len <= 0) { 2353 xnfp->xnf_stat_errrx++; 2354 2355 switch (len) { 2356 case 0: 2357 xnfp->xnf_stat_runt++; 2358 break; 2359 case NETIF_RSP_ERROR: 2360 xnfp->xnf_stat_mac_rcv_error++; 2361 break; 2362 case NETIF_RSP_DROPPED: 2363 xnfp->xnf_stat_norxbuf++; 2364 break; 2365 } 2366 error = EINVAL; 2367 } else if (bdesc->grant_ref == INVALID_GRANT_REF) { 2368 dev_err(xnfp->xnf_devinfo, CE_WARN, 2369 "Bad rx grant reference, rsp id %d", rsp.id); 2370 error = EINVAL; 2371 } else if ((off + len) > PAGESIZE) { 2372 dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses " 2373 "page boundary (offset %d, length %d)", off, len); 2374 error = EINVAL; 2375 } 2376 2377 if (error != 0) { 2378 /* 2379 * If an error has been detected, we do not attempt 2380 * to read the data but we still need to replace 2381 * the rx bufs. 2382 */ 2383 goto hang_buf; 2384 } 2385 2386 xnf_buf_t *nbuf = NULL; 2387 2388 /* 2389 * If the packet is below a pre-determined size we will 2390 * copy data out of the buf rather than replace it. 2391 */ 2392 if (len > xnf_rx_copy_limit) 2393 nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE); 2394 2395 if (nbuf != NULL) { 2396 mp = desballoc((unsigned char *)bdesc->buf, 2397 bdesc->len, 0, &bdesc->free_rtn); 2398 2399 if (mp == NULL) { 2400 xnfp->xnf_stat_rx_desballoc_fail++; 2401 xnfp->xnf_stat_norxbuf++; 2402 error = ENOMEM; 2403 /* 2404 * we free the buf we just allocated as we 2405 * will re-hang the old buf. 
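* The ENOMEM set above is sticky: any remaining fragments of this packet only have their buffers re-hung, and the whole packet is dropped once the loop finishes.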
2406 */ 2407 xnf_buf_put(xnfp, nbuf, B_FALSE); 2408 goto hang_buf; 2409 } 2410 2411 mp->b_rptr = mp->b_rptr + off; 2412 mp->b_wptr = mp->b_rptr + len; 2413 2414 /* 2415 * Release the grant as the backend doesn't need to 2416 * access this buffer anymore and grants are scarce. 2417 */ 2418 (void) gnttab_end_foreign_access_ref(bdesc->grant_ref, 2419 0); 2420 xnf_gref_put(xnfp, bdesc->grant_ref); 2421 bdesc->grant_ref = INVALID_GRANT_REF; 2422 2423 bdesc = nbuf; 2424 } else { 2425 /* 2426 * We failed to allocate a new buf or decided to reuse 2427 * the old one. In either case we copy the data off it 2428 * and put it back into the ring. 2429 */ 2430 mp = allocb(len, 0); 2431 if (mp == NULL) { 2432 xnfp->xnf_stat_rx_allocb_fail++; 2433 xnfp->xnf_stat_norxbuf++; 2434 error = ENOMEM; 2435 goto hang_buf; 2436 } 2437 bcopy(bdesc->buf + off, mp->b_wptr, len); 2438 mp->b_wptr += len; 2439 } 2440 2441 if (head == NULL) 2442 head = mp; 2443 else 2444 tail->b_cont = mp; 2445 tail = mp; 2446 2447 hang_buf: 2448 /* 2449 * No matter what happens, for each response we need to hang 2450 * a new buf on the rx ring. Put either the old one, or a new 2451 * one if the old one is borrowed by the kernel via desballoc(). 2452 */ 2453 xnf_rxbuf_hang(xnfp, bdesc); 2454 cons++; 2455 2456 /* next response is an extra */ 2457 is_extra = more_extra; 2458 2459 if (!more_data && !more_extra) 2460 break; 2461 2462 /* 2463 * Note that since requests and responses are union'd on the 2464 * same ring, we copy the response to a local variable instead 2465 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have 2466 * overwritten contents of rsp. 2467 */ 2468 rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons); 2469 } 2470 2471 /* 2472 * Check that we do not get stuck in a loop. 2473 */ 2474 ASSERT3U(*consp, !=, cons); 2475 *consp = cons; 2476 2477 /* 2478 * We ran out of responses but the flags indicate there is more data. 2479 */ 2480 if (more_data) { 2481 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments."); 2482 error = EINVAL; 2483 } 2484 if (more_extra) { 2485 dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments " 2486 "(extras)."); 2487 error = EINVAL; 2488 } 2489 2490 /* 2491 * An error means the packet must be dropped. If we have already formed 2492 * a partial packet, then discard it. 2493 */ 2494 if (error != 0) { 2495 if (head != NULL) 2496 freemsg(head); 2497 xnfp->xnf_stat_rx_drop++; 2498 return (error); 2499 } 2500 2501 ASSERT(head != NULL); 2502 2503 if (hwcsum) { 2504 /* 2505 * If the peer says that the data has been validated then we 2506 * declare that the full checksum has been verified. 2507 * 2508 * We don't look at the "checksum blank" flag, and hence could 2509 * have a packet here that we are asserting is good with 2510 * a blank checksum. 2511 */ 2512 mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 2513 xnfp->xnf_stat_rx_cksum_no_need++; 2514 } 2515 2516 /* XXX: set lro info for packet once LRO is supported in OS. */ 2517 2518 *mpp = head; 2519 2520 return (0); 2521 } 2522 2523 /* 2524 * Collect packets from the RX ring, storing them in `xnfp' for later use. 2525 */ 2526 static void 2527 xnf_rx_collect(xnf_t *xnfp) 2528 { 2529 RING_IDX prod; 2530 2531 ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock)); 2532 2533 prod = xnfp->xnf_rx_ring.sring->rsp_prod; 2534 /* 2535 * Ensure we see queued responses up to 'prod'. 2536 */ 2537 membar_consumer(); 2538 2539 while (xnfp->xnf_rx_ring.rsp_cons != prod) { 2540 mblk_t *mp; 2541 2542 /* 2543 * Collect a packet. 
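* A non-zero return means the packet was dropped and accounted for inside xnf_rx_one_packet(), so no further action is needed here.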
2544 * rsp_cons is updated inside xnf_rx_one_packet(). 2545 */ 2546 int error = xnf_rx_one_packet(xnfp, prod, 2547 &xnfp->xnf_rx_ring.rsp_cons, &mp); 2548 if (error == 0) { 2549 xnfp->xnf_stat_ipackets++; 2550 xnfp->xnf_stat_rbytes += xmsgsize(mp); 2551 2552 /* 2553 * Append the mblk to the rx list. 2554 */ 2555 if (xnfp->xnf_rx_head == NULL) { 2556 ASSERT3P(xnfp->xnf_rx_tail, ==, NULL); 2557 xnfp->xnf_rx_head = mp; 2558 } else { 2559 ASSERT(xnfp->xnf_rx_tail != NULL); 2560 xnfp->xnf_rx_tail->b_next = mp; 2561 } 2562 xnfp->xnf_rx_tail = mp; 2563 } 2564 } 2565 } 2566 2567 /* 2568 * xnf_alloc_dma_resources() -- allocate and initialize the driver's descriptor rings 2569 */ 2570 static int 2571 xnf_alloc_dma_resources(xnf_t *xnfp) 2572 { 2573 dev_info_t *devinfo = xnfp->xnf_devinfo; 2574 size_t len; 2575 ddi_dma_cookie_t dma_cookie; 2576 uint_t ncookies; 2577 int rc; 2578 caddr_t rptr; 2579 2580 /* 2581 * The code below allocates all the DMA data structures that 2582 * need to be released when the driver is detached. 2583 * 2584 * Allocate page for the transmit descriptor ring. 2585 */ 2586 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2587 DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS) 2588 goto alloc_error; 2589 2590 if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle, 2591 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2592 DDI_DMA_SLEEP, 0, &rptr, &len, 2593 &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) { 2594 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2595 xnfp->xnf_tx_ring_dma_handle = NULL; 2596 goto alloc_error; 2597 } 2598 2599 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL, 2600 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2601 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2602 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2603 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2604 xnfp->xnf_tx_ring_dma_handle = NULL; 2605 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2606 if (rc == DDI_DMA_NORESOURCES) 2607 goto alloc_error; 2608 else 2609 goto error; 2610 } 2611 2612 ASSERT(ncookies == 1); 2613 bzero(rptr, PAGESIZE); 2614 /* LINTED: constant in conditional context */ 2615 SHARED_RING_INIT((netif_tx_sring_t *)rptr); 2616 /* LINTED: constant in conditional context */ 2617 FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE); 2618 xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress; 2619 2620 /* 2621 * Allocate page for the receive descriptor ring. 
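* The steps mirror those for the transmit ring above: allocate a DMA handle and a page of DMA memory, bind it to obtain the address shared with the backend, then initialise the shared and front rings over that page.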
2622 */ 2623 if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr, 2624 DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS) 2625 goto alloc_error; 2626 2627 if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle, 2628 PAGESIZE, &accattr, DDI_DMA_CONSISTENT, 2629 DDI_DMA_SLEEP, 0, &rptr, &len, 2630 &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) { 2631 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2632 xnfp->xnf_rx_ring_dma_handle = NULL; 2633 goto alloc_error; 2634 } 2635 2636 if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL, 2637 rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, 2638 DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) { 2639 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2640 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2641 xnfp->xnf_rx_ring_dma_handle = NULL; 2642 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2643 if (rc == DDI_DMA_NORESOURCES) 2644 goto alloc_error; 2645 else 2646 goto error; 2647 } 2648 2649 ASSERT(ncookies == 1); 2650 bzero(rptr, PAGESIZE); 2651 /* LINTED: constant in conditional context */ 2652 SHARED_RING_INIT((netif_rx_sring_t *)rptr); 2653 /* LINTED: constant in conditional context */ 2654 FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE); 2655 xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress; 2656 2657 return (DDI_SUCCESS); 2658 2659 alloc_error: 2660 cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory", 2661 ddi_get_instance(xnfp->xnf_devinfo)); 2662 error: 2663 xnf_release_dma_resources(xnfp); 2664 return (DDI_FAILURE); 2665 } 2666 2667 /* 2668 * Release all DMA resources in the opposite order from acquisition 2669 */ 2670 static void 2671 xnf_release_dma_resources(xnf_t *xnfp) 2672 { 2673 int i; 2674 2675 /* 2676 * Free receive buffers which are currently associated with 2677 * descriptors. 2678 */ 2679 mutex_enter(&xnfp->xnf_rxlock); 2680 for (i = 0; i < NET_RX_RING_SIZE; i++) { 2681 xnf_buf_t *bp; 2682 2683 if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL) 2684 continue; 2685 xnfp->xnf_rx_pkt_info[i] = NULL; 2686 xnf_buf_put(xnfp, bp, B_FALSE); 2687 } 2688 mutex_exit(&xnfp->xnf_rxlock); 2689 2690 /* Free the receive ring buffer. */ 2691 if (xnfp->xnf_rx_ring_dma_acchandle != NULL) { 2692 (void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle); 2693 ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle); 2694 ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle); 2695 xnfp->xnf_rx_ring_dma_acchandle = NULL; 2696 } 2697 /* Free the transmit ring buffer. */ 2698 if (xnfp->xnf_tx_ring_dma_acchandle != NULL) { 2699 (void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle); 2700 ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle); 2701 ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle); 2702 xnfp->xnf_tx_ring_dma_acchandle = NULL; 2703 } 2704 2705 } 2706 2707 /* 2708 * Release any packets and associated structures used by the TX ring. 
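* Any transmit buffer still attached to a packet id has its mblk freed, the id is returned to the free list and the buffer is handed back to its kmem cache.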
2709 */ 2710 static void 2711 xnf_release_mblks(xnf_t *xnfp) 2712 { 2713 RING_IDX i; 2714 xnf_txid_t *tidp; 2715 2716 for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0]; 2717 i < NET_TX_RING_SIZE; 2718 i++, tidp++) { 2719 xnf_txbuf_t *txp = tidp->txbuf; 2720 2721 if (txp != NULL) { 2722 ASSERT(txp->tx_mp != NULL); 2723 freemsg(txp->tx_mp); 2724 2725 xnf_txid_put(xnfp, tidp); 2726 kmem_cache_free(xnfp->xnf_tx_buf_cache, txp); 2727 } 2728 } 2729 } 2730 2731 static int 2732 xnf_buf_constructor(void *buf, void *arg, int kmflag) 2733 { 2734 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2735 xnf_buf_t *bdesc = buf; 2736 xnf_t *xnfp = arg; 2737 ddi_dma_cookie_t dma_cookie; 2738 uint_t ncookies; 2739 size_t len; 2740 2741 if (kmflag & KM_NOSLEEP) 2742 ddiflags = DDI_DMA_DONTWAIT; 2743 2744 /* Allocate a DMA access handle for the buffer. */ 2745 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr, 2746 ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS) 2747 goto failure; 2748 2749 /* Allocate DMA-able memory for buffer. */ 2750 if (ddi_dma_mem_alloc(bdesc->dma_handle, 2751 PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0, 2752 &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS) 2753 goto failure_1; 2754 2755 /* Bind to virtual address of buffer to get physical address. */ 2756 if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL, 2757 bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING, 2758 ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED) 2759 goto failure_2; 2760 ASSERT(ncookies == 1); 2761 2762 bdesc->free_rtn.free_func = xnf_buf_recycle; 2763 bdesc->free_rtn.free_arg = (caddr_t)bdesc; 2764 bdesc->xnfp = xnfp; 2765 bdesc->buf_phys = dma_cookie.dmac_laddress; 2766 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2767 bdesc->len = dma_cookie.dmac_size; 2768 bdesc->grant_ref = INVALID_GRANT_REF; 2769 bdesc->gen = xnfp->xnf_gen; 2770 2771 atomic_inc_64(&xnfp->xnf_stat_buf_allocated); 2772 2773 return (0); 2774 2775 failure_2: 2776 ddi_dma_mem_free(&bdesc->acc_handle); 2777 2778 failure_1: 2779 ddi_dma_free_handle(&bdesc->dma_handle); 2780 2781 failure: 2782 2783 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */ 2784 return (-1); 2785 } 2786 2787 static void 2788 xnf_buf_destructor(void *buf, void *arg) 2789 { 2790 xnf_buf_t *bdesc = buf; 2791 xnf_t *xnfp = arg; 2792 2793 (void) ddi_dma_unbind_handle(bdesc->dma_handle); 2794 ddi_dma_mem_free(&bdesc->acc_handle); 2795 ddi_dma_free_handle(&bdesc->dma_handle); 2796 2797 atomic_dec_64(&xnfp->xnf_stat_buf_allocated); 2798 } 2799 2800 static xnf_buf_t * 2801 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly) 2802 { 2803 grant_ref_t gref; 2804 xnf_buf_t *bufp; 2805 2806 /* 2807 * Usually grant references are more scarce than memory, so we 2808 * attempt to acquire a grant reference first. 2809 */ 2810 gref = xnf_gref_get(xnfp); 2811 if (gref == INVALID_GRANT_REF) 2812 return (NULL); 2813 2814 bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags); 2815 if (bufp == NULL) { 2816 xnf_gref_put(xnfp, gref); 2817 return (NULL); 2818 } 2819 2820 ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF); 2821 2822 bufp->grant_ref = gref; 2823 2824 if (bufp->gen != xnfp->xnf_gen) 2825 xnf_buf_refresh(bufp); 2826 2827 gnttab_grant_foreign_access_ref(bufp->grant_ref, 2828 xvdi_get_oeid(bufp->xnfp->xnf_devinfo), 2829 bufp->buf_mfn, readonly ? 
1 : 0); 2830 2831 atomic_inc_64(&xnfp->xnf_stat_buf_outstanding); 2832 2833 return (bufp); 2834 } 2835 2836 static void 2837 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly) 2838 { 2839 if (bufp->grant_ref != INVALID_GRANT_REF) { 2840 (void) gnttab_end_foreign_access_ref( 2841 bufp->grant_ref, readonly ? 1 : 0); 2842 xnf_gref_put(xnfp, bufp->grant_ref); 2843 bufp->grant_ref = INVALID_GRANT_REF; 2844 } 2845 2846 kmem_cache_free(xnfp->xnf_buf_cache, bufp); 2847 2848 atomic_dec_64(&xnfp->xnf_stat_buf_outstanding); 2849 } 2850 2851 /* 2852 * Refresh any cached data about a buffer after resume. 2853 */ 2854 static void 2855 xnf_buf_refresh(xnf_buf_t *bdesc) 2856 { 2857 bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys)); 2858 bdesc->gen = bdesc->xnfp->xnf_gen; 2859 } 2860 2861 /* 2862 * Streams `freeb' routine for `xnf_buf_t' when used as transmit 2863 * look-aside buffers. 2864 */ 2865 static void 2866 xnf_buf_recycle(xnf_buf_t *bdesc) 2867 { 2868 xnf_t *xnfp = bdesc->xnfp; 2869 2870 xnf_buf_put(xnfp, bdesc, B_TRUE); 2871 } 2872 2873 static int 2874 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag) 2875 { 2876 int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP; 2877 xnf_txbuf_t *txp = buf; 2878 xnf_t *xnfp = arg; 2879 2880 if (kmflag & KM_NOSLEEP) 2881 ddiflags = DDI_DMA_DONTWAIT; 2882 2883 if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr, 2884 ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) { 2885 ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */ 2886 return (-1); 2887 } 2888 2889 return (0); 2890 } 2891 2892 static void 2893 xnf_tx_buf_destructor(void *buf, void *arg) 2894 { 2895 _NOTE(ARGUNUSED(arg)); 2896 xnf_txbuf_t *txp = buf; 2897 2898 ddi_dma_free_handle(&txp->tx_dma_handle); 2899 } 2900 2901 /* 2902 * Statistics. 2903 */ 2904 static char *xnf_aux_statistics[] = { 2905 "tx_cksum_deferred", 2906 "rx_cksum_no_need", 2907 "interrupts", 2908 "unclaimed_interrupts", 2909 "tx_pullup", 2910 "tx_lookaside", 2911 "tx_drop", 2912 "tx_eth_hdr_split", 2913 "buf_allocated", 2914 "buf_outstanding", 2915 "gref_outstanding", 2916 "gref_failure", 2917 "gref_peak", 2918 "rx_allocb_fail", 2919 "rx_desballoc_fail", 2920 }; 2921 2922 static int 2923 xnf_kstat_aux_update(kstat_t *ksp, int flag) 2924 { 2925 xnf_t *xnfp; 2926 kstat_named_t *knp; 2927 2928 if (flag != KSTAT_READ) 2929 return (EACCES); 2930 2931 xnfp = ksp->ks_private; 2932 knp = ksp->ks_data; 2933 2934 /* 2935 * Assignment order must match that of the names in 2936 * xnf_aux_statistics. 
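* When a statistic is added it must be added to both this function and the xnf_aux_statistics array, in the same relative position.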
2937 */ 2938 (knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred; 2939 (knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need; 2940 2941 (knp++)->value.ui64 = xnfp->xnf_stat_interrupts; 2942 (knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts; 2943 (knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup; 2944 (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside; 2945 (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop; 2946 (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split; 2947 2948 (knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated; 2949 (knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding; 2950 (knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding; 2951 (knp++)->value.ui64 = xnfp->xnf_stat_gref_failure; 2952 (knp++)->value.ui64 = xnfp->xnf_stat_gref_peak; 2953 (knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail; 2954 (knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail; 2955 2956 return (0); 2957 } 2958 2959 static boolean_t 2960 xnf_kstat_init(xnf_t *xnfp) 2961 { 2962 int nstat = sizeof (xnf_aux_statistics) / 2963 sizeof (xnf_aux_statistics[0]); 2964 char **cp = xnf_aux_statistics; 2965 kstat_named_t *knp; 2966 2967 /* 2968 * Create and initialise kstats. 2969 */ 2970 if ((xnfp->xnf_kstat_aux = kstat_create("xnf", 2971 ddi_get_instance(xnfp->xnf_devinfo), 2972 "aux_statistics", "net", KSTAT_TYPE_NAMED, 2973 nstat, 0)) == NULL) 2974 return (B_FALSE); 2975 2976 xnfp->xnf_kstat_aux->ks_private = xnfp; 2977 xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update; 2978 2979 knp = xnfp->xnf_kstat_aux->ks_data; 2980 while (nstat > 0) { 2981 kstat_named_init(knp, *cp, KSTAT_DATA_UINT64); 2982 2983 knp++; 2984 cp++; 2985 nstat--; 2986 } 2987 2988 kstat_install(xnfp->xnf_kstat_aux); 2989 2990 return (B_TRUE); 2991 } 2992 2993 static int 2994 xnf_stat(void *arg, uint_t stat, uint64_t *val) 2995 { 2996 xnf_t *xnfp = arg; 2997 2998 mutex_enter(&xnfp->xnf_rxlock); 2999 mutex_enter(&xnfp->xnf_txlock); 3000 3001 #define mac_stat(q, r) \ 3002 case (MAC_STAT_##q): \ 3003 *val = xnfp->xnf_stat_##r; \ 3004 break 3005 3006 #define ether_stat(q, r) \ 3007 case (ETHER_STAT_##q): \ 3008 *val = xnfp->xnf_stat_##r; \ 3009 break 3010 3011 switch (stat) { 3012 3013 mac_stat(IPACKETS, ipackets); 3014 mac_stat(OPACKETS, opackets); 3015 mac_stat(RBYTES, rbytes); 3016 mac_stat(OBYTES, obytes); 3017 mac_stat(NORCVBUF, norxbuf); 3018 mac_stat(IERRORS, errrx); 3019 mac_stat(NOXMTBUF, tx_defer); 3020 3021 ether_stat(MACRCV_ERRORS, mac_rcv_error); 3022 ether_stat(TOOSHORT_ERRORS, runt); 3023 3024 /* always claim to be in full duplex mode */ 3025 case ETHER_STAT_LINK_DUPLEX: 3026 *val = LINK_DUPLEX_FULL; 3027 break; 3028 3029 /* always claim to be at 1Gb/s link speed */ 3030 case MAC_STAT_IFSPEED: 3031 *val = 1000000000ull; 3032 break; 3033 3034 default: 3035 mutex_exit(&xnfp->xnf_txlock); 3036 mutex_exit(&xnfp->xnf_rxlock); 3037 3038 return (ENOTSUP); 3039 } 3040 3041 #undef mac_stat 3042 #undef ether_stat 3043 3044 mutex_exit(&xnfp->xnf_txlock); 3045 mutex_exit(&xnfp->xnf_rxlock); 3046 3047 return (0); 3048 } 3049 3050 static int 3051 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu) 3052 { 3053 if (mtu > ETHERMTU) { 3054 if (!xnf_enable_tx_sg) { 3055 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3056 "because scatter-gather is disabled for transmit " 3057 "in driver settings", ETHERMTU); 3058 return (EINVAL); 3059 } else if (!xnf_enable_rx_sg) { 3060 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3061 "because scatter-gather is disabled for receive " 3062 "in driver settings", ETHERMTU); 3063 return (EINVAL); 3064 
} else if (!xnfp->xnf_be_tx_sg) { 3065 dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d " 3066 "because backend doesn't support scatter-gather", 3067 ETHERMTU); 3068 return (EINVAL); 3069 } 3070 if (mtu > XNF_MAXPKT) 3071 return (EINVAL); 3072 } 3073 int error = mac_maxsdu_update(xnfp->xnf_mh, mtu); 3074 if (error == 0) 3075 xnfp->xnf_mtu = mtu; 3076 3077 return (error); 3078 } 3079 3080 /*ARGSUSED*/ 3081 static int 3082 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id, 3083 uint_t prop_val_size, void *prop_val) 3084 { 3085 xnf_t *xnfp = data; 3086 3087 switch (prop_id) { 3088 case MAC_PROP_MTU: 3089 ASSERT(prop_val_size >= sizeof (uint32_t)); 3090 bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t)); 3091 break; 3092 default: 3093 return (ENOTSUP); 3094 } 3095 return (0); 3096 } 3097 3098 /*ARGSUSED*/ 3099 static int 3100 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id, 3101 uint_t prop_val_size, const void *prop_val) 3102 { 3103 xnf_t *xnfp = data; 3104 uint32_t new_mtu; 3105 int error; 3106 3107 switch (prop_id) { 3108 case MAC_PROP_MTU: 3109 ASSERT(prop_val_size >= sizeof (uint32_t)); 3110 bcopy(prop_val, &new_mtu, sizeof (new_mtu)); 3111 error = xnf_change_mtu(xnfp, new_mtu); 3112 break; 3113 default: 3114 return (ENOTSUP); 3115 } 3116 3117 return (error); 3118 } 3119 3120 /*ARGSUSED*/ 3121 static void 3122 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id, 3123 mac_prop_info_handle_t prop_handle) 3124 { 3125 switch (prop_id) { 3126 case MAC_PROP_MTU: 3127 mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT); 3128 break; 3129 default: 3130 break; 3131 } 3132 } 3133 3134 static boolean_t 3135 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data) 3136 { 3137 xnf_t *xnfp = arg; 3138 3139 switch (cap) { 3140 case MAC_CAPAB_HCKSUM: { 3141 uint32_t *capab = cap_data; 3142 3143 /* 3144 * Whilst the flag used to communicate with the IO 3145 * domain is called "NETTXF_csum_blank", the checksum 3146 * in the packet must contain the pseudo-header 3147 * checksum and not zero. 3148 * 3149 * To help out the IO domain, we might use 3150 * HCKSUM_INET_PARTIAL. Unfortunately our stack will 3151 * then use checksum offload for IPv6 packets, which 3152 * the IO domain can't handle. 3153 * 3154 * As a result, we declare ourselves capable of 3155 * HCKSUM_INET_FULL_V4. This means that we receive 3156 * IPv4 packets from the stack with a blank checksum 3157 * field and must insert the pseudo-header checksum 3158 * before passing the packet to the IO domain. 3159 */ 3160 *capab = HCKSUM_INET_FULL_V4; 3161 3162 /* 3163 * TODO: query the "feature-ipv6-csum-offload" capability. 3164 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL. 3165 */ 3166 3167 break; 3168 } 3169 case MAC_CAPAB_LSO: { 3170 if (!xnfp->xnf_be_lso) 3171 return (B_FALSE); 3172 3173 mac_capab_lso_t *lso = cap_data; 3174 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 3175 lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET; 3176 break; 3177 } 3178 default: 3179 return (B_FALSE); 3180 } 3181 3182 return (B_TRUE); 3183 } 3184 3185 /* 3186 * The state of the peer has changed - react accordingly. 
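* The new XenbusState arrives via impl_data. The transitions acted upon are InitWait, where we read the backend configuration and connect, and Connected, where we mark ourselves connected, wake any waiting transmitters and report the link as up.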
3187 */ 3188 static void 3189 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id, 3190 void *arg, void *impl_data) 3191 { 3192 _NOTE(ARGUNUSED(id, arg)); 3193 xnf_t *xnfp = ddi_get_driver_private(dip); 3194 XenbusState new_state = *(XenbusState *)impl_data; 3195 3196 ASSERT(xnfp != NULL); 3197 3198 switch (new_state) { 3199 case XenbusStateUnknown: 3200 case XenbusStateInitialising: 3201 case XenbusStateInitialised: 3202 case XenbusStateClosing: 3203 case XenbusStateClosed: 3204 case XenbusStateReconfiguring: 3205 case XenbusStateReconfigured: 3206 break; 3207 3208 case XenbusStateInitWait: 3209 xnf_read_config(xnfp); 3210 3211 if (!xnfp->xnf_be_rx_copy) { 3212 cmn_err(CE_WARN, 3213 "The xnf driver requires a dom0 that " 3214 "supports 'feature-rx-copy'."); 3215 (void) xvdi_switch_state(xnfp->xnf_devinfo, 3216 XBT_NULL, XenbusStateClosed); 3217 break; 3218 } 3219 3220 /* 3221 * Connect to the backend. 3222 */ 3223 xnf_be_connect(xnfp); 3224 3225 /* 3226 * Our MAC address as discovered by xnf_read_config(). 3227 */ 3228 mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr); 3229 3230 /* 3231 * We do not know if some features such as LSO are supported 3232 * until we connect to the backend. We request the MAC layer 3233 * to poll our capabilities again. 3234 */ 3235 mac_capab_update(xnfp->xnf_mh); 3236 3237 break; 3238 3239 case XenbusStateConnected: 3240 mutex_enter(&xnfp->xnf_rxlock); 3241 mutex_enter(&xnfp->xnf_txlock); 3242 3243 xnfp->xnf_connected = B_TRUE; 3244 /* 3245 * Wake up any threads waiting to send data to 3246 * the backend. 3247 */ 3248 cv_broadcast(&xnfp->xnf_cv_state); 3249 3250 mutex_exit(&xnfp->xnf_txlock); 3251 mutex_exit(&xnfp->xnf_rxlock); 3252 3253 /* 3254 * Kick the peer in case it missed any transmit 3255 * requests in the TX ring. 3256 */ 3257 ec_notify_via_evtchn(xnfp->xnf_evtchn); 3258 3259 /* 3260 * There may already be completed receive requests in 3261 * the ring sent by the backend after it gets connected 3262 * but before we see its state change here, so we call 3263 * xnf_intr() to handle them, if any. 3264 */ 3265 (void) xnf_intr((caddr_t)xnfp); 3266 3267 /* 3268 * Mark the link up now that we are connected. 3269 */ 3270 mac_link_update(xnfp->xnf_mh, LINK_STATE_UP); 3271 3272 /* 3273 * Tell the backend about the multicast addresses in 3274 * which we are interested. 3275 */ 3276 mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE); 3277 3278 break; 3279 3280 default: 3281 break; 3282 } 3283 } 3284