/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifdef DEBUG
#define XNB_DEBUG 1
#endif /* DEBUG */

#include "xnb.h"

#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/mac.h>
#include <sys/dlpi.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/types.h>
#include <sys/pattr.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/sys/xendev.h>
#include <sys/balloon_impl.h>
#include <sys/evtchn_impl.h>
#include <sys/gnttab.h>
#include <vm/vm_dep.h>

#include <sys/gld.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <sys/vnic_impl.h> /* blech. */

/*
 * The terms "transmit" and "receive" are used in their traditional
 * sense here - packets from other parts of this system are
 * "transmitted" to the peer domain and those originating from the
 * peer are "received".
 *
 * In some cases this can be confusing, because various data
 * structures are shared with the domU driver, which has the opposite
 * view of what constitutes "transmit" and "receive".  In naming the
 * shared structures the domU driver always wins.
 */

/*
 * XXPV dme: things to do, as well as various things indicated
 * throughout the source:
 * - copy avoidance outbound.
 * - copy avoidance inbound.
 * - transfer credit limiting.
 * - MAC address based filtering.
 */

/*
 * Linux expects to have some headroom in received buffers.  The Linux
 * frontend driver (netfront) checks to see if the headroom is
 * available and will re-allocate the buffer to make room if
 * necessary.  To avoid this we add TX_BUFFER_HEADROOM bytes of
 * headroom to each packet we pass to the peer.
 */
#define TX_BUFFER_HEADROOM	16

/*
 * Should we attempt to defer checksum calculation?
 */
static boolean_t xnb_cksum_offload = B_TRUE;
/*
 * When receiving packets from a guest, should they be copied
 * or used as-is (esballoc)?
 */
static boolean_t xnb_rx_always_copy = B_TRUE;
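/*
 * Both of the tunables above are latched per-instance in xnb_attach()
 * (into the xnb_cksum_offload and xnb_rx_always_copy fields of the
 * xnb_t), so a change made after a device has attached does not
 * affect that device.
 */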
static boolean_t xnb_connect_rings(dev_info_t *);
static void xnb_disconnect_rings(dev_info_t *);
static void xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static void xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);

static int xnb_rxbuf_constructor(void *, void *, int);
static void xnb_rxbuf_destructor(void *, void *);
static xnb_rxbuf_t *xnb_rxbuf_get(xnb_t *, int);
static void xnb_rxbuf_put(xnb_t *, xnb_rxbuf_t *);
static void xnb_rx_notify_peer(xnb_t *);
static void xnb_rx_complete(xnb_rxbuf_t *);
static void xnb_rx_mark_complete(xnb_t *, RING_IDX, int16_t);
static void xnb_rx_schedule_unmop(xnb_t *, gnttab_map_grant_ref_t *,
    xnb_rxbuf_t *);
static void xnb_rx_perform_pending_unmop(xnb_t *);
mblk_t *xnb_copy_to_peer(xnb_t *, mblk_t *);

int xnb_unmop_lowwat = NET_TX_RING_SIZE >> 2;
int xnb_unmop_hiwat = NET_TX_RING_SIZE - (NET_TX_RING_SIZE >> 2);


boolean_t xnb_hv_copy = B_TRUE;
boolean_t xnb_explicit_pageflip_set = B_FALSE;

/* XXPV dme: are these really invalid? */
#define INVALID_GRANT_HANDLE	((grant_handle_t)-1)
#define INVALID_GRANT_REF	((grant_ref_t)-1)

static kmem_cache_t *xnb_rxbuf_cachep;
static kmutex_t xnb_alloc_page_lock;

/*
 * Statistics.
 */
static char *aux_statistics[] = {
        "tx_cksum_deferred",
        "rx_cksum_no_need",
        "tx_rsp_notok",
        "tx_notify_deferred",
        "tx_notify_sent",
        "rx_notify_deferred",
        "rx_notify_sent",
        "tx_too_early",
        "rx_too_early",
        "rx_allocb_failed",
        "tx_allocb_failed",
        "tx_foreign_page",
        "mac_full",
        "spurious_intr",
        "allocation_success",
        "allocation_failure",
        "small_allocation_success",
        "small_allocation_failure",
        "other_allocation_failure",
        "tx_pageboundary_crossed",
        "tx_cpoparea_grown",
        "csum_hardware",
        "csum_software",
};

static int
xnb_ks_aux_update(kstat_t *ksp, int flag)
{
        xnb_t *xnbp;
        kstat_named_t *knp;

        if (flag != KSTAT_READ)
                return (EACCES);

        xnbp = ksp->ks_private;
        knp = ksp->ks_data;

        /*
         * Assignment order should match that of the names in
         * aux_statistics.
         */
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_deferred;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_no_need;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_rsp_notok;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_foreign_page;
        (knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
        (knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
        (knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
        (knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
        (knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
        (knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
        (knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_pagebndry_crossed;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_cpoparea_grown;
        (knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
        (knp++)->value.ui64 = xnbp->xnb_stat_csum_software;

        return (0);
}

static boolean_t
xnb_ks_init(xnb_t *xnbp)
{
        int nstat = sizeof (aux_statistics) /
            sizeof (aux_statistics[0]);
        char **cp = aux_statistics;
        kstat_named_t *knp;

        /*
         * Create and initialise kstats.
         */
        xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
            ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
            KSTAT_TYPE_NAMED, nstat, 0);
        if (xnbp->xnb_kstat_aux == NULL)
                return (B_FALSE);

        xnbp->xnb_kstat_aux->ks_private = xnbp;
        xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;

        knp = xnbp->xnb_kstat_aux->ks_data;
        while (nstat > 0) {
                kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);

                knp++;
                cp++;
                nstat--;
        }

        kstat_install(xnbp->xnb_kstat_aux);

        return (B_TRUE);
}

static void
xnb_ks_free(xnb_t *xnbp)
{
        kstat_delete(xnbp->xnb_kstat_aux);
}

/*
 * Software checksum calculation and insertion for an arbitrary packet.
 */
/*ARGSUSED*/
static mblk_t *
xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
{
        /*
         * XXPV dme: shouldn't rely on vnic_fix_cksum(), not least
         * because it doesn't cover all of the interesting cases :-(
         */
        (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
            HCK_FULLCKSUM, KM_NOSLEEP);

        return (vnic_fix_cksum(mp));
}

mblk_t *
xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
{
        struct ether_header *ehp;
        uint16_t sap;
        uint32_t offset;
        ipha_t *ipha;

        ASSERT(mp->b_next == NULL);

        /*
         * Check that the packet is contained in a single mblk.  In
         * the "from peer" path this is true today, but will change
         * when scatter gather support is added.  In the "to peer"
         * path we cannot be sure, but in most cases it will be true
         * (in the xnbo case the packet has come from a MAC device
         * which is unlikely to split packets).
         */
        if (mp->b_cont != NULL)
                goto software;

        /*
         * If the MAC has no hardware capability don't do any further
         * checking.
         */
        if (capab == 0)
                goto software;

        ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
        ehp = (struct ether_header *)mp->b_rptr;

        if (ntohs(ehp->ether_type) == VLAN_TPID) {
                struct ether_vlan_header *evhp;

                ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
                evhp = (struct ether_vlan_header *)mp->b_rptr;
                sap = ntohs(evhp->ether_type);
                offset = sizeof (struct ether_vlan_header);
        } else {
                sap = ntohs(ehp->ether_type);
                offset = sizeof (struct ether_header);
        }

        /*
         * We only attempt to do IPv4 packets in hardware.
         */
        if (sap != ETHERTYPE_IP)
                goto software;

        /*
         * We know that this is an IPv4 packet.
         */
        ipha = (ipha_t *)(mp->b_rptr + offset);

        switch (ipha->ipha_protocol) {
        case IPPROTO_TCP:
        case IPPROTO_UDP: {
                uint32_t start, length, stuff, cksum;
                uint16_t *stuffp;

                /*
                 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
                 * can use full IPv4 and partial checksum offload.
                 */
                if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
                        break;

                start = IP_SIMPLE_HDR_LENGTH;
                length = ntohs(ipha->ipha_length);
                if (ipha->ipha_protocol == IPPROTO_TCP) {
                        stuff = start + TCP_CHECKSUM_OFFSET;
                        cksum = IP_TCP_CSUM_COMP;
                } else {
                        stuff = start + UDP_CHECKSUM_OFFSET;
                        cksum = IP_UDP_CSUM_COMP;
                }
                stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);

                if (capab & HCKSUM_INET_FULL_V4) {
                        /*
                         * Some devices require that the checksum
                         * field of the packet is zero for full
                         * offload.
                         */
                        *stuffp = 0;

                        (void) hcksum_assoc(mp, NULL, NULL,
                            0, 0, 0, 0,
                            HCK_FULLCKSUM, KM_NOSLEEP);

                        xnbp->xnb_stat_csum_hardware++;

                        return (mp);
                }

                if (capab & HCKSUM_INET_PARTIAL) {
                        if (*stuffp == 0) {
                                ipaddr_t src, dst;

                                /*
                                 * Older Solaris guests don't insert
                                 * the pseudo-header checksum, so we
                                 * calculate it here.
                                 */
                                src = ipha->ipha_src;
                                dst = ipha->ipha_dst;

                                cksum += (dst >> 16) + (dst & 0xFFFF);
                                cksum += (src >> 16) + (src & 0xFFFF);
                                cksum += length - IP_SIMPLE_HDR_LENGTH;

                                cksum = (cksum >> 16) + (cksum & 0xFFFF);
                                cksum = (cksum >> 16) + (cksum & 0xFFFF);

                                ASSERT(cksum <= 0xFFFF);

                                *stuffp = (uint16_t)(cksum ? cksum : ~cksum);
                        }

                        (void) hcksum_assoc(mp, NULL, NULL,
                            start, stuff, length, 0,
                            HCK_PARTIALCKSUM, KM_NOSLEEP);

                        xnbp->xnb_stat_csum_hardware++;

                        return (mp);
                }

                /* NOTREACHED */
                break;
        }

        default:
                /* Use software. */
                break;
        }

software:
        /*
         * We are not able to use any offload so do the whole thing in
         * software.
         */
        xnbp->xnb_stat_csum_software++;

        return (xnb_software_csum(xnbp, mp));
}
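/*
 * xnb_attach() is called by the flavour-specific half of the driver
 * (e.g. xnbo or xnbu) from its attach(9E) routine.  It allocates and
 * initialises the common xnb_t state, registers for xenbus and
 * hotplug state-change callbacks, publishes our feature-* properties
 * in the store so the peer can see them before connecting and then
 * switches the device to XenbusStateInitWait.
 */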
int
xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
{
        xnb_t *xnbp;
        char *xsname, mac[ETHERADDRL * 3];

        xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);

        xnbp->xnb_flavour = flavour;
        xnbp->xnb_flavour_data = flavour_data;
        xnbp->xnb_devinfo = dip;
        xnbp->xnb_evtchn = INVALID_EVTCHN;
        xnbp->xnb_irq = B_FALSE;
        xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
        xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
        xnbp->xnb_cksum_offload = xnb_cksum_offload;
        xnbp->xnb_connected = B_FALSE;
        xnbp->xnb_hotplugged = B_FALSE;
        xnbp->xnb_detachable = B_FALSE;
        xnbp->xnb_peer = xvdi_get_oeid(dip);
        xnbp->xnb_rx_pages_writable = B_FALSE;
        xnbp->xnb_rx_always_copy = xnb_rx_always_copy;

        xnbp->xnb_rx_buf_count = 0;
        xnbp->xnb_rx_unmop_count = 0;

        xnbp->xnb_hv_copy = B_FALSE;

        xnbp->xnb_tx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
        ASSERT(xnbp->xnb_tx_va != NULL);

        if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
            != DDI_SUCCESS)
                goto failure;

        /* allocated on demand, when/if we enter xnb_copy_to_peer() */
        xnbp->xnb_tx_cpop = NULL;
        xnbp->xnb_cpop_sz = 0;

        mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
            xnbp->xnb_icookie);
        mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
            xnbp->xnb_icookie);

        /* set driver private pointer now */
        ddi_set_driver_private(dip, xnbp);

        if (!xnb_ks_init(xnbp))
                goto failure_1;

        /*
         * Receive notification of changes in the state of the
         * driver in the guest domain.
         */
        if (xvdi_add_event_handler(dip, XS_OE_STATE,
            xnb_oe_state_change) != DDI_SUCCESS)
                goto failure_2;

        /*
         * Receive notification of hotplug events.
         */
        if (xvdi_add_event_handler(dip, XS_HP_STATE,
            xnb_hp_state_change) != DDI_SUCCESS)
                goto failure_2;

        xsname = xvdi_get_xsname(dip);

        if (xenbus_printf(XBT_NULL, xsname,
            "feature-no-csum-offload", "%d",
            xnbp->xnb_cksum_offload ? 0 : 1) != 0)
                goto failure_3;

        /*
         * Use global xnb_hv_copy to export this feature.  This means that
         * we have to decide what to do before starting up a guest domain.
         */
        if (xenbus_printf(XBT_NULL, xsname,
            "feature-rx-copy", "%d", xnb_hv_copy ? 1 : 0) != 0)
                goto failure_3;
        /*
         * Linux domUs seem to depend on "feature-rx-flip" being 0
         * in addition to "feature-rx-copy" being 1.  It seems strange
         * to use four possible states to describe a binary decision,
         * but we might as well play nice.
         */
        if (xenbus_printf(XBT_NULL, xsname,
            "feature-rx-flip", "%d", xnb_explicit_pageflip_set ? 1 : 0) != 0)
                goto failure_3;
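        /*
         * The frontend is expected to read the feature-* properties
         * published above while deciding how to drive the rings; the
         * choices it makes in turn (e.g. "request-rx-copy" and its own
         * "feature-no-csum-offload") are read back from the other end
         * in xnb_connect_rings() once it announces Connected.
         */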
        if (xenbus_scanf(XBT_NULL, xsname,
            "mac", "%s", mac) != 0) {
                cmn_err(CE_WARN, "xnb_attach: "
                    "cannot read mac address from %s",
                    xsname);
                goto failure_3;
        }

        if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
                cmn_err(CE_WARN,
                    "xnb_attach: cannot parse mac address %s",
                    mac);
                goto failure_3;
        }

        (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
        (void) xvdi_post_event(dip, XEN_HP_ADD);

        return (DDI_SUCCESS);

failure_3:
        xvdi_remove_event_handler(dip, NULL);

failure_2:
        xnb_ks_free(xnbp);

failure_1:
        mutex_destroy(&xnbp->xnb_rx_lock);
        mutex_destroy(&xnbp->xnb_tx_lock);

failure:
        vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);
        kmem_free(xnbp, sizeof (*xnbp));
        return (DDI_FAILURE);
}

/*ARGSUSED*/
void
xnb_detach(dev_info_t *dip)
{
        xnb_t *xnbp = ddi_get_driver_private(dip);

        ASSERT(xnbp != NULL);
        ASSERT(!xnbp->xnb_connected);
        ASSERT(xnbp->xnb_rx_buf_count == 0);

        xnb_disconnect_rings(dip);

        xvdi_remove_event_handler(dip, NULL);

        xnb_ks_free(xnbp);

        ddi_set_driver_private(dip, NULL);

        mutex_destroy(&xnbp->xnb_tx_lock);
        mutex_destroy(&xnbp->xnb_rx_lock);

        if (xnbp->xnb_cpop_sz > 0)
                kmem_free(xnbp->xnb_tx_cpop, sizeof (*xnbp->xnb_tx_cpop)
                    * xnbp->xnb_cpop_sz);

        ASSERT(xnbp->xnb_tx_va != NULL);
        vmem_free(heap_arena, xnbp->xnb_tx_va, PAGESIZE);

        kmem_free(xnbp, sizeof (*xnbp));
}


static mfn_t
xnb_alloc_page(xnb_t *xnbp)
{
#define WARNING_RATE_LIMIT 100
#define BATCH_SIZE 256
        static mfn_t mfns[BATCH_SIZE];	/* common across all instances */
        static int nth = BATCH_SIZE;
        mfn_t mfn;

        mutex_enter(&xnb_alloc_page_lock);
        if (nth == BATCH_SIZE) {
                if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
                        xnbp->xnb_stat_allocation_failure++;
                        mutex_exit(&xnb_alloc_page_lock);

                        /*
                         * Try for a single page in low memory situations.
                         */
                        if (balloon_alloc_pages(1, &mfn) != 1) {
                                if ((xnbp->xnb_stat_small_allocation_failure++
                                    % WARNING_RATE_LIMIT) == 0)
                                        cmn_err(CE_WARN, "xnb_alloc_page: "
                                            "Cannot allocate memory to "
                                            "transfer packets to peer.");
                                return (0);
                        } else {
                                xnbp->xnb_stat_small_allocation_success++;
                                return (mfn);
                        }
                }

                nth = 0;
                xnbp->xnb_stat_allocation_success++;
        }

        mfn = mfns[nth++];
        mutex_exit(&xnb_alloc_page_lock);

        ASSERT(mfn != 0);

        return (mfn);
#undef BATCH_SIZE
#undef WARNING_RATE_LIMIT
}

/*ARGSUSED*/
static void
xnb_free_page(xnb_t *xnbp, mfn_t mfn)
{
        int r;
        pfn_t pfn;

        pfn = xen_assign_pfn(mfn);
        pfnzero(pfn, 0, PAGESIZE);
        xen_release_pfn(pfn);

        /*
         * This happens only in the error path, so batching is
         * not worth the complication.
         */
        if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
                cmn_err(CE_WARN, "free_page: cannot decrease memory "
                    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
                    r, mfn);
        }
}

/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->rx_ring) but
 * using local variables.
 */
#define XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)		\
        ((((_r)->sring->req_prod - loop) <		\
            (RING_SIZE(_r) - (loop - prod))) ?		\
            ((_r)->sring->req_prod - loop) :		\
            (RING_SIZE(_r) - (loop - prod)))
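/*
 * xnb_to_peer() is the page-flipping transmit path: each packet is
 * copied into a freshly ballooned page which is then handed to the
 * peer with GNTTABOP_transfer.  (The macro above evaluates, using the
 * caller's local 'loop' and 'prod' cursors, to the number of request
 * slots that may still be consumed, i.e. the smaller of the peer's
 * outstanding requests and the space left for our responses.)
 */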

mblk_t *
xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
{
        mblk_t *free = mp, *prev = NULL;
        size_t len;
        gnttab_transfer_t *gop;
        boolean_t notify;
        RING_IDX loop, prod, end;

        /*
         * For each packet the sequence of operations is:
         *
         * 1. get a new page from the hypervisor.
         * 2. get a request slot from the ring.
         * 3. copy the data into the new page.
         * 4. transfer the page to the peer.
         * 5. update the request slot.
         * 6. kick the peer.
         * 7. free mp.
         *
         * In order to reduce the number of hypercalls, we prepare
         * several packets for the peer and perform a single hypercall
         * to transfer them.
         */

        mutex_enter(&xnbp->xnb_tx_lock);

        /*
         * If we are not connected to the peer or have not yet
         * finished hotplug it is too early to pass packets to the
         * peer.
         */
        if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
                mutex_exit(&xnbp->xnb_tx_lock);
                DTRACE_PROBE(flip_tx_too_early);
                xnbp->xnb_stat_tx_too_early++;
                return (mp);
        }

        loop = xnbp->xnb_rx_ring.req_cons;
        prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
        gop = xnbp->xnb_tx_top;

        while ((mp != NULL) &&
            XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {

                mfn_t mfn;
                pfn_t pfn;
                netif_rx_request_t *rxreq;
                netif_rx_response_t *rxresp;
                char *valoop;
                size_t offset;
                mblk_t *ml;
                uint16_t cksum_flags;

                /* 1 */
                if ((mfn = xnb_alloc_page(xnbp)) == 0) {
                        xnbp->xnb_stat_xmit_defer++;
                        break;
                }

                /* 2 */
                rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);

#ifdef XNB_DEBUG
                if (!(rxreq->id < NET_RX_RING_SIZE))
                        cmn_err(CE_PANIC, "xnb_to_peer: "
                            "id %d out of range in request 0x%p",
                            rxreq->id, (void *)rxreq);
#endif /* XNB_DEBUG */

                /* Assign a pfn and map the new page at the allocated va. */
                pfn = xen_assign_pfn(mfn);
                hat_devload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
                    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);

                offset = TX_BUFFER_HEADROOM;

                /* 3 */
                len = 0;
                valoop = xnbp->xnb_tx_va + offset;
                for (ml = mp; ml != NULL; ml = ml->b_cont) {
                        size_t chunk = ml->b_wptr - ml->b_rptr;

                        bcopy(ml->b_rptr, valoop, chunk);
                        valoop += chunk;
                        len += chunk;
                }

                ASSERT(len + offset < PAGESIZE);

                /* Release the pfn. */
                hat_unload(kas.a_hat, xnbp->xnb_tx_va, PAGESIZE,
                    HAT_UNLOAD_UNMAP);
                xen_release_pfn(pfn);

                /* 4 */
                gop->mfn = mfn;
                gop->domid = xnbp->xnb_peer;
                gop->ref = rxreq->gref;

                /* 5.1 */
                rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
                rxresp->offset = offset;
                rxresp->flags = 0;

                cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
                if (cksum_flags != 0)
                        xnbp->xnb_stat_tx_cksum_deferred++;
                rxresp->flags |= cksum_flags;

                rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
                rxresp->status = len;

                loop++;
                prod++;
                gop++;
                prev = mp;
                mp = mp->b_next;
        }

        /*
         * Did we actually do anything?
         */
        if (loop == xnbp->xnb_rx_ring.req_cons) {
                mutex_exit(&xnbp->xnb_tx_lock);
                return (mp);
        }

        end = loop;

        /*
         * Unlink the end of the 'done' list from the remainder.
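         * 'free' still points at the head of the chain (everything up
         * to, but not including, 'mp' has been consumed), while 'mp'
         * is the first message we did not manage to queue.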
         */
        ASSERT(prev != NULL);
        prev->b_next = NULL;

        if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_tx_top,
            loop - xnbp->xnb_rx_ring.req_cons) != 0) {
                cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
        }

        loop = xnbp->xnb_rx_ring.req_cons;
        prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
        gop = xnbp->xnb_tx_top;

        while (loop < end) {
                int16_t status = NETIF_RSP_OKAY;

                if (gop->status != 0) {
                        status = NETIF_RSP_ERROR;

                        /*
                         * If the status is anything other than
                         * GNTST_bad_page then we don't own the page
                         * any more, so don't try to give it back.
                         */
                        if (gop->status != GNTST_bad_page)
                                gop->mfn = 0;
                } else {
                        /* The page is no longer ours. */
                        gop->mfn = 0;
                }

                if (gop->mfn != 0)
                        /*
                         * Give back the page, as we won't be using
                         * it.
                         */
                        xnb_free_page(xnbp, gop->mfn);
                else
                        /*
                         * We gave away a page, update our accounting
                         * now.
                         */
                        balloon_drv_subtracted(1);

                /* 5.2 */
                if (status != NETIF_RSP_OKAY) {
                        RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
                            status;
                } else {
                        xnbp->xnb_stat_opackets++;
                        xnbp->xnb_stat_obytes += len;
                }

                loop++;
                prod++;
                gop++;
        }

        xnbp->xnb_rx_ring.req_cons = loop;
        xnbp->xnb_rx_ring.rsp_prod_pvt = prod;

        /* 6 */
        /* LINTED: constant in conditional context */
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
        if (notify) {
                ec_notify_via_evtchn(xnbp->xnb_evtchn);
                xnbp->xnb_stat_tx_notify_sent++;
        } else {
                xnbp->xnb_stat_tx_notify_deferred++;
        }

        if (mp != NULL)
                xnbp->xnb_stat_xmit_defer++;

        mutex_exit(&xnbp->xnb_tx_lock);

        /* Free mblk_t's that we consumed. */
        freemsgchain(free);

        return (mp);
}

/* helper functions for xnb_copy_to_peer */

/*
 * Grow the array of copy operation descriptors.
 * Returns a pointer to the next available entry.
 */
gnttab_copy_t *
grow_cpop_area(xnb_t *xnbp, gnttab_copy_t *o_cpop)
{
        /*
         * o_cpop (arg.1) is a ptr to the area we would like to copy
         * something into but cannot, because we haven't alloc'ed it
         * yet, or NULL.
         * old_cpop and new_cpop (local) are pointers to old/new
         * versions of xnbp->xnb_tx_cpop.
         */
        gnttab_copy_t *new_cpop, *old_cpop, *ret_cpop;
        size_t newcount;

        ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));

        old_cpop = xnbp->xnb_tx_cpop;
        /*
         * o_cpop is a pointer into the array pointed to by old_cpop;
         * it would be an error for exactly one of these pointers to be NULL.
         * We shouldn't call this function if xnb_tx_cpop has already
         * been allocated, but we're starting to fill it from the beginning
         * again.
         */
        ASSERT((o_cpop == NULL && old_cpop == NULL) ||
            (o_cpop != NULL && old_cpop != NULL && o_cpop != old_cpop));

        newcount = xnbp->xnb_cpop_sz + CPOP_DEFCNT;

        new_cpop = kmem_alloc(sizeof (*new_cpop) * newcount, KM_NOSLEEP);
        if (new_cpop == NULL) {
                xnbp->xnb_stat_other_allocation_failure++;
                return (NULL);
        }

        if (o_cpop != NULL) {
                size_t offset = (o_cpop - old_cpop);

                /* we only need to move the parts in use ... */
                (void) memmove(new_cpop, old_cpop, xnbp->xnb_cpop_sz *
                    (sizeof (*old_cpop)));

                kmem_free(old_cpop, xnbp->xnb_cpop_sz * sizeof (*old_cpop));

                ret_cpop = new_cpop + offset;
        } else {
                ret_cpop = new_cpop;
        }

        xnbp->xnb_tx_cpop = new_cpop;
        xnbp->xnb_cpop_sz = newcount;

        xnbp->xnb_stat_tx_cpoparea_grown++;

        return (ret_cpop);
}

/*
 * Check whether an address is on a page that's foreign to this domain.
 */
static boolean_t
is_foreign(void *addr)
{
        pfn_t pfn = hat_getpfnum(kas.a_hat, addr);

        return (pfn & PFN_IS_FOREIGN_MFN ? B_TRUE : B_FALSE);
}

/*
 * Insert a newly allocated mblk into a chain, replacing the old one.
 */
static mblk_t *
replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
{
        uint32_t start, stuff, end, value, flags;
        mblk_t *new_mp;

        new_mp = copyb(mp);
        if (new_mp == NULL)
                cmn_err(CE_PANIC, "replace_msg: cannot alloc new message "
                    "for %p, len %lu", (void *) mp, len);

        hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
        (void) hcksum_assoc(new_mp, NULL, NULL, start, stuff, end, value,
            flags, KM_NOSLEEP);

        new_mp->b_next = mp->b_next;
        new_mp->b_prev = mp->b_prev;
        new_mp->b_cont = mp->b_cont;

        /* Make sure we only overwrite pointers to the mblk being replaced. */
        if (mp_prev != NULL && mp_prev->b_next == mp)
                mp_prev->b_next = new_mp;

        if (ml_prev != NULL && ml_prev->b_cont == mp)
                ml_prev->b_cont = new_mp;

        mp->b_next = mp->b_prev = mp->b_cont = NULL;
        freemsg(mp);

        return (new_mp);
}

/*
 * Set all the fields in a gnttab_copy_t.
 */
static void
setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
    size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
{
        ASSERT(xnbp != NULL && gp != NULL);

        gp->source.offset = s_off;
        gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
        gp->source.domid = DOMID_SELF;

        gp->len = (uint16_t)len;
        gp->flags = GNTCOPY_dest_gref;
        gp->status = 0;

        gp->dest.u.ref = d_ref;
        gp->dest.offset = d_off;
        gp->dest.domid = xnbp->xnb_peer;
}

mblk_t *
xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
{
        mblk_t *free = mp, *mp_prev = NULL, *saved_mp = mp;
        mblk_t *ml, *ml_prev;
        gnttab_copy_t *gop_cp;
        boolean_t notify;
        RING_IDX loop, prod;
        int i;

        if (!xnbp->xnb_hv_copy)
                return (xnb_to_peer(xnbp, mp));

        /*
         * For each packet the sequence of operations is:
         *
         *  1. get a request slot from the ring.
         *  2. set up data for hypercall (see NOTE below)
         *  3. have the hypervisor copy the data
         *  4. update the request slot.
         *  5. kick the peer.
         *
         * NOTE ad 2.
         *  In order to reduce the number of hypercalls, we prepare
         *  several packets (mp->b_cont != NULL) for the peer and
         *  perform a single hypercall to transfer them.
         *  We also have to set up a separate copy operation for
         *  every page.
         *
         *  If we have more than one message (mp->b_next != NULL),
         *  we do this whole dance repeatedly.
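         *
         *  (This path is taken only when the peer advertised
         *  "request-rx-copy" and xnb_hv_copy is set; otherwise we fall
         *  back to the page-flipping xnb_to_peer() above.)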
         */

        mutex_enter(&xnbp->xnb_tx_lock);

        if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
                mutex_exit(&xnbp->xnb_tx_lock);
                DTRACE_PROBE(copy_tx_too_early);
                xnbp->xnb_stat_tx_too_early++;
                return (mp);
        }

        loop = xnbp->xnb_rx_ring.req_cons;
        prod = xnbp->xnb_rx_ring.rsp_prod_pvt;

        while ((mp != NULL) &&
            XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
                netif_rx_request_t *rxreq;
                netif_rx_response_t *rxresp;
                size_t offset, d_offset;
                size_t len;
                uint16_t cksum_flags;
                int16_t status = NETIF_RSP_OKAY;
                int item_count;

                /* 1 */
                rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);

#ifdef XNB_DEBUG
                if (!(rxreq->id < NET_RX_RING_SIZE))
                        cmn_err(CE_PANIC, "xnb_copy_to_peer: "
                            "id %d out of range in request 0x%p",
                            rxreq->id, (void *)rxreq);
#endif /* XNB_DEBUG */

                /* 2 */
                d_offset = offset = TX_BUFFER_HEADROOM;
                len = 0;
                item_count = 0;

                gop_cp = xnbp->xnb_tx_cpop;

                /*
                 * We walk the b_cont pointers and set up a gop_cp
                 * structure for every page in every data block we have.
                 */
                /* 2a */
                for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
                        size_t chunk = ml->b_wptr - ml->b_rptr;
                        uchar_t *r_tmp, *rpt_align;
                        size_t r_offset;

                        /*
                         * If we get an mblk on a page that doesn't belong to
                         * this domain, get a new mblk to replace the old one.
                         */
                        if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
                                mblk_t *ml_new = replace_msg(ml, chunk,
                                    mp_prev, ml_prev);

                                /* We can still use old ml, but not *ml! */
                                if (free == ml)
                                        free = ml_new;
                                if (mp == ml)
                                        mp = ml_new;
                                ml = ml_new;

                                xnbp->xnb_stat_tx_foreign_page++;
                        }

                        rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
                        r_offset = (uint16_t)(ml->b_rptr - rpt_align);
                        r_tmp = ml->b_rptr;

                        if (d_offset + chunk > PAGESIZE)
                                cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
                                    "(svd: %p), ml %p, rpt_alg. %p, d_offset "
                                    "(%lu) + chunk (%lu) > PAGESIZE %d!",
                                    (void *)mp, (void *)saved_mp, (void *)ml,
                                    (void *)rpt_align,
                                    d_offset, chunk, (int)PAGESIZE);

                        while (chunk > 0) {
                                size_t part_len;

                                item_count++;
                                if (item_count > xnbp->xnb_cpop_sz) {
                                        gop_cp = grow_cpop_area(xnbp, gop_cp);
                                        if (gop_cp == NULL)
                                                goto failure;
                                }
                                /*
                                 * If our mblk crosses a page boundary, we need
                                 * to do a separate copy for every page.
                                 */
                                if (r_offset + chunk > PAGESIZE) {
                                        part_len = PAGESIZE - r_offset;

                                        DTRACE_PROBE3(mblk_page_crossed,
                                            (mblk_t *), ml, int, chunk, int,
                                            (int)r_offset);

                                        xnbp->xnb_stat_tx_pagebndry_crossed++;
                                } else {
                                        part_len = chunk;
                                }

                                setup_gop(xnbp, gop_cp, r_tmp, r_offset,
                                    d_offset, part_len, rxreq->gref);

                                chunk -= part_len;

                                len += part_len;
                                d_offset += part_len;
                                r_tmp += part_len;
                                /*
                                 * The 2nd, 3rd ... last copies will always
                                 * start at r_tmp, therefore r_offset is 0.
                                 */
                                r_offset = 0;
                                gop_cp++;
                        }
                        ml_prev = ml;
                        DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
                            chunk, int, len, int, item_count);
                }
                /* 3 */
                if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_tx_cpop,
                    item_count) != 0) {
                        cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
failed"); 1145 DTRACE_PROBE(HV_granttableopfailed); 1146 } 1147 1148 /* 4 */ 1149 rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod); 1150 rxresp->offset = offset; 1151 1152 rxresp->flags = 0; 1153 1154 DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int, 1155 (int)rxresp->offset, int, (int)rxresp->flags, int, 1156 (int)rxresp->status); 1157 1158 cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp); 1159 if (cksum_flags != 0) 1160 xnbp->xnb_stat_tx_cksum_deferred++; 1161 rxresp->flags |= cksum_flags; 1162 1163 rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id; 1164 rxresp->status = len; 1165 1166 DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int, 1167 (int)rxresp->offset, int, (int)rxresp->flags, int, 1168 (int)rxresp->status); 1169 1170 for (i = 0; i < item_count; i++) { 1171 if (xnbp->xnb_tx_cpop[i].status != 0) { 1172 DTRACE_PROBE2(cpop__status__nonnull, int, 1173 (int)xnbp->xnb_tx_cpop[i].status, 1174 int, i); 1175 status = NETIF_RSP_ERROR; 1176 } 1177 } 1178 1179 /* 5.2 */ 1180 if (status != NETIF_RSP_OKAY) { 1181 RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status = 1182 status; 1183 xnbp->xnb_stat_tx_rsp_notok++; 1184 } else { 1185 xnbp->xnb_stat_opackets++; 1186 xnbp->xnb_stat_obytes += len; 1187 } 1188 1189 loop++; 1190 prod++; 1191 mp_prev = mp; 1192 mp = mp->b_next; 1193 } 1194 failure: 1195 /* 1196 * Did we actually do anything? 1197 */ 1198 if (loop == xnbp->xnb_rx_ring.req_cons) { 1199 mutex_exit(&xnbp->xnb_tx_lock); 1200 return (mp); 1201 } 1202 1203 /* 1204 * Unlink the end of the 'done' list from the remainder. 1205 */ 1206 ASSERT(mp_prev != NULL); 1207 mp_prev->b_next = NULL; 1208 1209 xnbp->xnb_rx_ring.req_cons = loop; 1210 xnbp->xnb_rx_ring.rsp_prod_pvt = prod; 1211 1212 /* 6 */ 1213 /* LINTED: constant in conditional context */ 1214 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify); 1215 if (notify) { 1216 ec_notify_via_evtchn(xnbp->xnb_evtchn); 1217 xnbp->xnb_stat_tx_notify_sent++; 1218 } else { 1219 xnbp->xnb_stat_tx_notify_deferred++; 1220 } 1221 1222 if (mp != NULL) 1223 xnbp->xnb_stat_xmit_defer++; 1224 1225 mutex_exit(&xnbp->xnb_tx_lock); 1226 1227 /* Free mblk_t structs we have consumed. */ 1228 freemsgchain(free); 1229 1230 return (mp); 1231 } 1232 1233 /*ARGSUSED*/ 1234 static int 1235 xnb_rxbuf_constructor(void *buf, void *arg, int kmflag) 1236 { 1237 xnb_rxbuf_t *rxp = buf; 1238 1239 bzero(rxp, sizeof (*rxp)); 1240 1241 rxp->xr_free_rtn.free_func = xnb_rx_complete; 1242 rxp->xr_free_rtn.free_arg = (caddr_t)rxp; 1243 1244 rxp->xr_mop.host_addr = 1245 (uint64_t)(uintptr_t)vmem_alloc(heap_arena, PAGESIZE, 1246 ((kmflag & KM_NOSLEEP) == KM_NOSLEEP) ? 1247 VM_NOSLEEP : VM_SLEEP); 1248 1249 if (rxp->xr_mop.host_addr == NULL) { 1250 cmn_err(CE_WARN, "xnb_rxbuf_constructor: " 1251 "cannot get address space"); 1252 return (-1); 1253 } 1254 1255 /* 1256 * Have the hat ensure that page table exists for the VA. 
         */
        hat_prepare_mapping(kas.a_hat,
            (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);

        return (0);
}

/*ARGSUSED*/
static void
xnb_rxbuf_destructor(void *buf, void *arg)
{
        xnb_rxbuf_t *rxp = buf;

        ASSERT(rxp->xr_mop.host_addr != NULL);
        ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);

        hat_release_mapping(kas.a_hat,
            (caddr_t)(uintptr_t)rxp->xr_mop.host_addr);
        vmem_free(heap_arena,
            (caddr_t)(uintptr_t)rxp->xr_mop.host_addr, PAGESIZE);
}

static void
xnb_rx_notify_peer(xnb_t *xnbp)
{
        boolean_t notify;

        ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));

        /* LINTED: constant in conditional context */
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
        if (notify) {
                ec_notify_via_evtchn(xnbp->xnb_evtchn);
                xnbp->xnb_stat_rx_notify_sent++;
        } else {
                xnbp->xnb_stat_rx_notify_deferred++;
        }
}

static void
xnb_rx_complete(xnb_rxbuf_t *rxp)
{
        xnb_t *xnbp = rxp->xr_xnbp;

        ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);

        mutex_enter(&xnbp->xnb_rx_lock);
        xnb_rx_schedule_unmop(xnbp, &rxp->xr_mop, rxp);
        mutex_exit(&xnbp->xnb_rx_lock);
}

static void
xnb_rx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
{
        RING_IDX i;
        netif_tx_response_t *txresp;

        ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));

        i = xnbp->xnb_tx_ring.rsp_prod_pvt;

        txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
        txresp->id = id;
        txresp->status = status;

        xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;

        /*
         * Note that we don't push the change to the peer here - that
         * is the caller's responsibility.
         */
}

static void
xnb_rx_schedule_unmop(xnb_t *xnbp, gnttab_map_grant_ref_t *mop,
    xnb_rxbuf_t *rxp)
{
        gnttab_unmap_grant_ref_t *unmop;
        int u_count;
        int reqs_on_ring;

        ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
        ASSERT(xnbp->xnb_rx_unmop_count < NET_TX_RING_SIZE);

        u_count = xnbp->xnb_rx_unmop_count++;

        /* Cache data for the time when we actually unmap grant refs */
        xnbp->xnb_rx_unmop_rxp[u_count] = rxp;

        unmop = &xnbp->xnb_rx_unmop[u_count];
        unmop->host_addr = mop->host_addr;
        unmop->dev_bus_addr = mop->dev_bus_addr;
        unmop->handle = mop->handle;

        /*
         * We cannot check the ring once we're disconnected from it.  Batching
         * doesn't seem to be a useful optimisation in this case either,
         * so we directly call into the actual unmap function.
         */
        if (xnbp->xnb_connected) {
                reqs_on_ring = RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_tx_ring);

                /*
                 * By tuning xnb_unmop_hiwat to N, we can emulate "N per batch"
                 * or (with N == 1) "immediate unmop" behaviour.
                 * The "> xnb_unmop_lowwat" is a guard against ring exhaustion.
                 */
                if (xnbp->xnb_rx_unmop_count < xnb_unmop_hiwat &&
                    reqs_on_ring > xnb_unmop_lowwat)
                        return;
        }

        xnb_rx_perform_pending_unmop(xnbp);
}

/*
 * Here we perform the actual unmapping of the data that was
 * accumulated in xnb_rx_schedule_unmop().
 * Note that it is the caller's responsibility to make sure that
 * there's actually something there to unmop.
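 * (Both callers honour this: xnb_rx_schedule_unmop() has just
 * incremented xnb_rx_unmop_count, and xnb_disconnect_rings() checks
 * that the count is non-zero before calling.)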
 */
static void
xnb_rx_perform_pending_unmop(xnb_t *xnbp)
{
        RING_IDX loop;
#ifdef XNB_DEBUG
        gnttab_unmap_grant_ref_t *unmop;
#endif /* XNB_DEBUG */

        ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
        ASSERT(xnbp->xnb_rx_unmop_count > 0);

        if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
            xnbp->xnb_rx_unmop, xnbp->xnb_rx_unmop_count) < 0) {
                cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
                    "unmap grant operation failed, "
                    "%d pages lost", xnbp->xnb_rx_unmop_count);
        }

#ifdef XNB_DEBUG
        for (loop = 0, unmop = xnbp->xnb_rx_unmop;
            loop < xnbp->xnb_rx_unmop_count;
            loop++, unmop++) {
                if (unmop->status != 0) {
                        cmn_err(CE_WARN, "xnb_rx_perform_pending_unmop: "
                            "unmap grant reference failed (%d)",
                            unmop->status);
                }
        }
#endif /* XNB_DEBUG */

        for (loop = 0; loop < xnbp->xnb_rx_unmop_count; loop++) {
                xnb_rxbuf_t *rxp = xnbp->xnb_rx_unmop_rxp[loop];

                if (rxp == NULL)
                        cmn_err(CE_PANIC,
                            "xnb_rx_perform_pending_unmop: "
                            "unexpected NULL rxp (loop %d; count %d)!",
                            loop, xnbp->xnb_rx_unmop_count);

                if (xnbp->xnb_connected)
                        xnb_rx_mark_complete(xnbp, rxp->xr_id, rxp->xr_status);
                xnb_rxbuf_put(xnbp, rxp);
        }
        if (xnbp->xnb_connected)
                xnb_rx_notify_peer(xnbp);

        xnbp->xnb_rx_unmop_count = 0;

#ifdef XNB_DEBUG
        bzero(xnbp->xnb_rx_unmop, sizeof (xnbp->xnb_rx_unmop));
        bzero(xnbp->xnb_rx_unmop_rxp, sizeof (xnbp->xnb_rx_unmop_rxp));
#endif /* XNB_DEBUG */
}

static xnb_rxbuf_t *
xnb_rxbuf_get(xnb_t *xnbp, int flags)
{
        xnb_rxbuf_t *rxp;

        ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));

        rxp = kmem_cache_alloc(xnb_rxbuf_cachep, flags);
        if (rxp != NULL) {
                ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == 0);
                rxp->xr_flags |= XNB_RXBUF_INUSE;

                rxp->xr_xnbp = xnbp;
                rxp->xr_mop.dom = xnbp->xnb_peer;

                rxp->xr_mop.flags = GNTMAP_host_map;
                if (!xnbp->xnb_rx_pages_writable)
                        rxp->xr_mop.flags |= GNTMAP_readonly;

                xnbp->xnb_rx_buf_count++;
        }

        return (rxp);
}

static void
xnb_rxbuf_put(xnb_t *xnbp, xnb_rxbuf_t *rxp)
{
        ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));
        ASSERT((rxp->xr_flags & XNB_RXBUF_INUSE) == XNB_RXBUF_INUSE);

        rxp->xr_flags &= ~XNB_RXBUF_INUSE;
        xnbp->xnb_rx_buf_count--;

        kmem_cache_free(xnb_rxbuf_cachep, rxp);
}

static mblk_t *
xnb_recv(xnb_t *xnbp)
{
        RING_IDX start, end, loop;
        gnttab_map_grant_ref_t *mop;
        xnb_rxbuf_t **rxpp;
        netif_tx_request_t *txreq;
        boolean_t work_to_do;
        mblk_t *head, *tail;
        /*
         * If the peer granted a read-only mapping to the page then we
         * must copy the data, as the local protocol stack (should the
         * packet be destined for this host) will modify the packet
         * 'in place'.
         */
        boolean_t copy = xnbp->xnb_rx_always_copy ||
            !xnbp->xnb_rx_pages_writable;

        /*
         * For each individual request, the sequence of actions is:
         *
         * 1. get the request.
         * 2. map the page based on the grant ref.
         * 3. allocate an mblk, copy the data to it.
         * 4. release the grant.
         * 5. update the ring.
         * 6. pass the packet upward.
         * 7. kick the peer.
         *
         * In fact, we try to perform the grant operations in batches,
         * so there are two loops.
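         *
         * The first loop pulls requests off the ring and builds up a
         * batch of grant-map operations (and the xnb_rxbuf_t's that go
         * with them); the second walks the results of the single
         * GNTTABOP_map_grant_ref hypercall and turns each mapped page
         * into an mblk, or schedules it straight for unmap on error.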
         */

        head = tail = NULL;
around:
        ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));

        /* LINTED: constant in conditional context */
        RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
        if (!work_to_do) {
finished:
                return (head);
        }

        start = xnbp->xnb_tx_ring.req_cons;
        end = xnbp->xnb_tx_ring.sring->req_prod;

        for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
            loop != end;
            loop++, mop++, rxpp++) {
                xnb_rxbuf_t *rxp;

                rxp = xnb_rxbuf_get(xnbp, KM_NOSLEEP);
                if (rxp == NULL)
                        break;

                ASSERT(xnbp->xnb_rx_pages_writable ||
                    ((rxp->xr_mop.flags & GNTMAP_readonly)
                    == GNTMAP_readonly));

                rxp->xr_mop.ref =
                    RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop)->gref;

                *mop = rxp->xr_mop;
                *rxpp = rxp;
        }

        if ((loop - start) == 0)
                goto finished;

        end = loop;

        if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
            xnbp->xnb_rx_mop, end - start) != 0) {

                cmn_err(CE_WARN, "xnb_recv: map grant operation failed");

                loop = start;
                rxpp = xnbp->xnb_rx_bufp;

                while (loop != end) {
                        xnb_rxbuf_put(xnbp, *rxpp);

                        loop++;
                        rxpp++;
                }

                goto finished;
        }

        for (loop = start, mop = xnbp->xnb_rx_mop, rxpp = xnbp->xnb_rx_bufp;
            loop != end;
            loop++, mop++, rxpp++) {
                mblk_t *mp = NULL;
                int16_t status = NETIF_RSP_OKAY;
                xnb_rxbuf_t *rxp = *rxpp;

                if (mop->status != 0) {
                        cmn_err(CE_WARN, "xnb_recv: "
                            "failed to map buffer: %d",
                            mop->status);
                        status = NETIF_RSP_ERROR;
                }

                txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);

                if (status == NETIF_RSP_OKAY) {
                        if (copy) {
                                mp = allocb(txreq->size, BPRI_MED);
                                if (mp == NULL) {
                                        status = NETIF_RSP_ERROR;
                                        xnbp->xnb_stat_rx_allocb_failed++;
                                } else {
                                        bcopy((caddr_t)(uintptr_t)
                                            mop->host_addr + txreq->offset,
                                            mp->b_wptr, txreq->size);
                                        mp->b_wptr += txreq->size;
                                }
                        } else {
                                mp = desballoc((uchar_t *)(uintptr_t)
                                    mop->host_addr + txreq->offset,
                                    txreq->size, 0, &rxp->xr_free_rtn);
                                if (mp == NULL) {
                                        status = NETIF_RSP_ERROR;
                                        xnbp->xnb_stat_rx_allocb_failed++;
                                } else {
                                        rxp->xr_id = txreq->id;
                                        rxp->xr_status = status;
                                        rxp->xr_mop = *mop;

                                        mp->b_wptr += txreq->size;
                                }
                        }

                        /*
                         * If we have a buffer and there are checksum
                         * flags, process them appropriately.
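                         * (In the Xen netif protocol NETTXF_csum_blank
                         * indicates that the checksum has not yet been
                         * computed, while NETTXF_data_validated means
                         * the peer asserts the data is already good;
                         * the flavour's xf_cksum_from_peer() decides
                         * what the host side needs.)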
                         */
                        if ((mp != NULL) &&
                            ((txreq->flags &
                            (NETTXF_csum_blank | NETTXF_data_validated))
                            != 0)) {
                                mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
                                    mp, txreq->flags);
                                xnbp->xnb_stat_rx_cksum_no_need++;
                        }
                }

                if (copy || (mp == NULL)) {
                        rxp->xr_status = status;
                        rxp->xr_id = txreq->id;
                        xnb_rx_schedule_unmop(xnbp, mop, rxp);
                }

                if (mp != NULL) {
                        xnbp->xnb_stat_ipackets++;
                        xnbp->xnb_stat_rbytes += txreq->size;

                        mp->b_next = NULL;
                        if (head == NULL) {
                                ASSERT(tail == NULL);
                                head = mp;
                        } else {
                                ASSERT(tail != NULL);
                                tail->b_next = mp;
                        }
                        tail = mp;
                }
        }

        xnbp->xnb_tx_ring.req_cons = loop;

        goto around;
        /* NOTREACHED */
}

/*
 *  intr() -- ring interrupt service routine
 */
static uint_t
xnb_intr(caddr_t arg)
{
        xnb_t *xnbp = (xnb_t *)arg;
        mblk_t *mp;

        xnbp->xnb_stat_intr++;

        mutex_enter(&xnbp->xnb_rx_lock);

        ASSERT(xnbp->xnb_connected);

        mp = xnb_recv(xnbp);

        mutex_exit(&xnbp->xnb_rx_lock);

        if (!xnbp->xnb_hotplugged) {
                xnbp->xnb_stat_rx_too_early++;
                goto fail;
        }
        if (mp == NULL) {
                xnbp->xnb_stat_spurious_intr++;
                goto fail;
        }

        xnbp->xnb_flavour->xf_recv(xnbp, mp);

        return (DDI_INTR_CLAIMED);

fail:
        freemsgchain(mp);
        return (DDI_INTR_CLAIMED);
}

static boolean_t
xnb_connect_rings(dev_info_t *dip)
{
        xnb_t *xnbp = ddi_get_driver_private(dip);
        char *oename;
        struct gnttab_map_grant_ref map_op;
        evtchn_port_t evtchn;
        int i;

        /*
         * Cannot attempt to connect the rings if already connected.
         */
        ASSERT(!xnbp->xnb_connected);

        oename = xvdi_get_oename(dip);

        if (xenbus_gather(XBT_NULL, oename,
            "event-channel", "%u", &evtchn,
            "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
            "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
            NULL) != 0) {
                cmn_err(CE_WARN, "xnb_connect_rings: "
                    "cannot read other-end details from %s",
                    oename);
                goto fail;
        }

        if (xenbus_scanf(XBT_NULL, oename,
            "feature-tx-writable", "%d", &i) != 0)
                i = 0;
        if (i != 0)
                xnbp->xnb_rx_pages_writable = B_TRUE;

        if (xenbus_scanf(XBT_NULL, oename,
            "feature-no-csum-offload", "%d", &i) != 0)
                i = 0;
        if ((i == 1) || !xnbp->xnb_cksum_offload)
                xnbp->xnb_cksum_offload = B_FALSE;

        /* Check whether our peer knows and requests hypervisor copy */
        if (xenbus_scanf(XBT_NULL, oename, "request-rx-copy", "%d", &i)
            != 0)
                i = 0;
        if (i != 0)
                xnbp->xnb_hv_copy = B_TRUE;

        /*
         * 1. allocate a vaddr for the tx page, one for the rx page.
         * 2. call GNTTABOP_map_grant_ref to map the relevant pages
         *    into the allocated vaddr (one for tx, one for rx).
         * 3. call EVTCHNOP_bind_interdomain to have the event channel
         *    bound to this domain.
         * 4. associate the event channel with an interrupt.
         * 5. declare ourselves connected.
         * 6. enable the interrupt.
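         *
         * Note that xnb_connected is set (step 5) before the interrupt
         * is added (steps 4 and 6), since xnb_intr() asserts that the
         * device is connected; on any failure we branch to 'fail' and
         * clear xnb_connected again under both locks.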
         */

        /* 1.tx */
        xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
            0, 0, 0, 0, VM_SLEEP);
        ASSERT(xnbp->xnb_tx_ring_addr != NULL);

        /* 2.tx */
        map_op.host_addr = (uint64_t)((long)xnbp->xnb_tx_ring_addr);
        map_op.flags = GNTMAP_host_map;
        map_op.ref = xnbp->xnb_tx_ring_ref;
        map_op.dom = xnbp->xnb_peer;
        hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
        if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
            &map_op, 1) != 0 || map_op.status != 0) {
                cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
                goto fail;
        }
        xnbp->xnb_tx_ring_handle = map_op.handle;

        /* LINTED: constant in conditional context */
        BACK_RING_INIT(&xnbp->xnb_tx_ring,
            (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);

        /* 1.rx */
        xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
            0, 0, 0, 0, VM_SLEEP);
        ASSERT(xnbp->xnb_rx_ring_addr != NULL);

        /* 2.rx */
        map_op.host_addr = (uint64_t)((long)xnbp->xnb_rx_ring_addr);
        map_op.flags = GNTMAP_host_map;
        map_op.ref = xnbp->xnb_rx_ring_ref;
        map_op.dom = xnbp->xnb_peer;
        hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
        if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
            &map_op, 1) != 0 || map_op.status != 0) {
                cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
                goto fail;
        }
        xnbp->xnb_rx_ring_handle = map_op.handle;

        /* LINTED: constant in conditional context */
        BACK_RING_INIT(&xnbp->xnb_rx_ring,
            (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);

        /* 3 */
        if (xvdi_bind_evtchn(dip, evtchn) != DDI_SUCCESS) {
                cmn_err(CE_WARN, "xnb_connect_rings: "
                    "cannot bind event channel %d", evtchn);
                xnbp->xnb_evtchn = INVALID_EVTCHN;
                goto fail;
        }
        xnbp->xnb_evtchn = xvdi_get_evtchn(dip);

        /*
         * It would be good to set the state to XenbusStateConnected
         * here as well, but then what if ddi_add_intr() failed?
         * Changing the state in the store will be noticed by the peer
         * and cannot be "taken back".
         */
        mutex_enter(&xnbp->xnb_tx_lock);
        mutex_enter(&xnbp->xnb_rx_lock);

        /* 5.1 */
        xnbp->xnb_connected = B_TRUE;

        mutex_exit(&xnbp->xnb_rx_lock);
        mutex_exit(&xnbp->xnb_tx_lock);

        /* 4, 6 */
        if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
            != DDI_SUCCESS) {
                cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
                goto fail;
        }
        xnbp->xnb_irq = B_TRUE;

        /* 5.2 */
        (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);

        return (B_TRUE);

fail:
        mutex_enter(&xnbp->xnb_tx_lock);
        mutex_enter(&xnbp->xnb_rx_lock);

        xnbp->xnb_connected = B_FALSE;
        mutex_exit(&xnbp->xnb_rx_lock);
        mutex_exit(&xnbp->xnb_tx_lock);

        return (B_FALSE);
}

static void
xnb_disconnect_rings(dev_info_t *dip)
{
        xnb_t *xnbp = ddi_get_driver_private(dip);

        if (xnbp->xnb_irq) {
                ddi_remove_intr(dip, 0, NULL);
                xnbp->xnb_irq = B_FALSE;
        }

        if (xnbp->xnb_rx_unmop_count > 0)
                xnb_rx_perform_pending_unmop(xnbp);

        if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
                xvdi_free_evtchn(dip);
                xnbp->xnb_evtchn = INVALID_EVTCHN;
        }

        if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
                struct gnttab_unmap_grant_ref unmap_op;

                unmap_op.host_addr = (uint64_t)(uintptr_t)
                    xnbp->xnb_rx_ring_addr;
                unmap_op.dev_bus_addr = 0;
                unmap_op.handle = xnbp->xnb_rx_ring_handle;
                if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
                    &unmap_op, 1) != 0)
                        cmn_err(CE_WARN, "xnb_disconnect_rings: "
                            "cannot unmap rx-ring page (%d)",
                            unmap_op.status);

                xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
        }

        if (xnbp->xnb_rx_ring_addr != NULL) {
                hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
                vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
                xnbp->xnb_rx_ring_addr = NULL;
        }

        if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
                struct gnttab_unmap_grant_ref unmap_op;

                unmap_op.host_addr = (uint64_t)(uintptr_t)
                    xnbp->xnb_tx_ring_addr;
                unmap_op.dev_bus_addr = 0;
                unmap_op.handle = xnbp->xnb_tx_ring_handle;
                if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
                    &unmap_op, 1) != 0)
                        cmn_err(CE_WARN, "xnb_disconnect_rings: "
                            "cannot unmap tx-ring page (%d)",
                            unmap_op.status);

                xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
        }

        if (xnbp->xnb_tx_ring_addr != NULL) {
                hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
                vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
                xnbp->xnb_tx_ring_addr = NULL;
        }
}

/*ARGSUSED*/
static void
xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
    void *arg, void *impl_data)
{
        xnb_t *xnbp = ddi_get_driver_private(dip);
        XenbusState new_state = *(XenbusState *)impl_data;

        ASSERT(xnbp != NULL);

        switch (new_state) {
        case XenbusStateConnected:
                /* spurious state change */
                if (xnbp->xnb_connected)
                        return;

                if (xnb_connect_rings(dip)) {
                        xnbp->xnb_flavour->xf_peer_connected(xnbp);
                } else {
                        xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
                        xnb_disconnect_rings(dip);
                        (void) xvdi_switch_state(dip, XBT_NULL,
                            XenbusStateClosed);
                        (void) xvdi_post_event(dip, XEN_HP_REMOVE);
                }

                /*
                 * Now that we've attempted to connect it's reasonable
                 * to allow an attempt to detach.
                 */
                xnbp->xnb_detachable = B_TRUE;

                break;

        case XenbusStateClosing:
                (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);

                break;

        case XenbusStateClosed:
                xnbp->xnb_flavour->xf_peer_disconnected(xnbp);

                mutex_enter(&xnbp->xnb_tx_lock);
                mutex_enter(&xnbp->xnb_rx_lock);

                xnb_disconnect_rings(dip);
                xnbp->xnb_connected = B_FALSE;

                mutex_exit(&xnbp->xnb_rx_lock);
                mutex_exit(&xnbp->xnb_tx_lock);

                (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
                (void) xvdi_post_event(dip, XEN_HP_REMOVE);
                /*
                 * In all likelihood this is already set (in the above
                 * case), but if the peer never attempted to connect
                 * and the domain is destroyed we get here without
                 * having been through the case above, so we set it to
                 * be sure.
                 */
                xnbp->xnb_detachable = B_TRUE;

                break;

        default:
                break;
        }
}

/*ARGSUSED*/
static void
xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
    void *arg, void *impl_data)
{
        xnb_t *xnbp = ddi_get_driver_private(dip);
        xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;
        boolean_t success;

        ASSERT(xnbp != NULL);

        switch (state) {
        case Connected:

                /* spurious hotplug event */
                if (xnbp->xnb_hotplugged)
                        return;

                success = xnbp->xnb_flavour->xf_hotplug_connected(xnbp);

                mutex_enter(&xnbp->xnb_tx_lock);
                mutex_enter(&xnbp->xnb_rx_lock);

                xnbp->xnb_hotplugged = success;

                mutex_exit(&xnbp->xnb_rx_lock);
                mutex_exit(&xnbp->xnb_tx_lock);
                break;

        default:
                break;
        }
}

static struct modldrv modldrv = {
        &mod_miscops, "xnb",
};

static struct modlinkage modlinkage = {
        MODREV_1, &modldrv, NULL
};

int
_init(void)
{
        int i;

        mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);

        xnb_rxbuf_cachep = kmem_cache_create("xnb_rxbuf_cachep",
            sizeof (xnb_rxbuf_t), 0, xnb_rxbuf_constructor,
            xnb_rxbuf_destructor, NULL, NULL, NULL, 0);
        ASSERT(xnb_rxbuf_cachep != NULL);

        i = mod_install(&modlinkage);
        if (i != DDI_SUCCESS) {
                kmem_cache_destroy(xnb_rxbuf_cachep);
                mutex_destroy(&xnb_alloc_page_lock);
        }
        return (i);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        int i;

        i = mod_remove(&modlinkage);
        if (i == DDI_SUCCESS) {
                kmem_cache_destroy(xnb_rxbuf_cachep);
                mutex_destroy(&xnb_alloc_page_lock);
        }
        return (i);
}