/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/machsystm.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/kmem.h>
#include <sys/strsun.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <net/if.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vio_common.h>
#include <sys/vnet_common.h>
#include <sys/vnet_mailbox.h>
#include <sys/vio_util.h>

/*
 * This file contains the implementation of the TxDring data transfer mode of
 * the VIO Protocol in vsw. The functions in this file are invoked from
 * vsw_ldc.c after TxDring mode is negotiated with the peer during the
 * attribute phase of the handshake. This file contains the functions that set
 * up the transmit and receive descriptor rings and associated resources in
 * TxDring mode. It also contains the transmit and receive data processing
 * functions that are invoked in TxDring mode.
 */

/* Functions exported to vsw_ldc.c */
vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
int vsw_dringsend(vsw_ldc_t *, mblk_t *);
void vsw_ldc_msg_worker(void *arg);
void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
void vsw_process_dringdata(void *, void *);
int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
int vsw_reclaim_dring(dring_info_t *dp, int start);
int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **, int *);

/* Internal functions */
static int vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp);
static dring_info_t *vsw_create_tx_dring(vsw_ldc_t *);

/* Functions imported from vsw_ldc.c */
extern void vsw_process_pkt(void *);
extern void vsw_destroy_rxpools(void *);
extern dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
    vio_dring_reg_msg_t *dring_pkt);
extern void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);
extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);

/* Tunables */
extern int vsw_wretries;
extern int vsw_recv_delay;
extern int vsw_recv_retries;
extern boolean_t vsw_jumbo_rxpools;
extern uint32_t vsw_chain_len;
extern uint32_t vsw_num_descriptors;
extern uint32_t vsw_mblk_size1;
extern uint32_t vsw_mblk_size2;
extern uint32_t vsw_mblk_size3;
extern uint32_t vsw_mblk_size4;
extern uint32_t vsw_num_mblks1;
extern uint32_t vsw_num_mblks2;
extern uint32_t vsw_num_mblks3;
extern uint32_t vsw_num_mblks4;

#define	VSW_NUM_VMPOOLS		3	/* number of vio mblk pools */

#define	SND_DRING_NACK(ldcp, pkt) \
	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
	pkt->tag.vio_sid = ldcp->local_session; \
	(void) vsw_send_msg(ldcp, (void *)pkt, \
	    sizeof (vio_dring_msg_t), B_TRUE);

vio_dring_reg_msg_t *
vsw_create_tx_dring_info(vsw_ldc_t *ldcp)
{
	vio_dring_reg_msg_t	*mp;
	dring_info_t		*dp;
	vsw_t			*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s enter\n", __func__);

	/*
	 * If we can't create a dring, obviously no point sending
	 * a message.
	 */
	if ((dp = vsw_create_tx_dring(ldcp)) == NULL)
		return (NULL);

	mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);

	mp->tag.vio_msgtype = VIO_TYPE_CTRL;
	mp->tag.vio_subtype = VIO_SUBTYPE_INFO;
	mp->tag.vio_subtype_env = VIO_DRING_REG;
	mp->tag.vio_sid = ldcp->local_session;

	/* payload */
	mp->num_descriptors = dp->num_descriptors;
	mp->descriptor_size = dp->descriptor_size;
	mp->options = dp->options;
	mp->ncookies = dp->dring_ncookies;
	bcopy(&dp->dring_cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t));

	mp->dring_ident = 0;

	D1(vswp, "%s exit\n", __func__);

	return (mp);
}

/*
 * Allocate transmit resources for the channel. The resources consist of a
 * transmit descriptor ring and an associated transmit buffer area.
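 *
 * Descriptive note: the ring has a public section and a private section. The
 * public section (vnet_public_desc_t entries created via ldc_mem_dring_create()
 * and bound to the channel) is what the peer maps and reads over LDC. The
 * private section (vsw_private_desc_t entries) is local bookkeeping only: for
 * each public descriptor it tracks the memory handle, LDC cookies and the data
 * buffer that back the exported entry.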
 */
static dring_info_t *
vsw_create_tx_dring(vsw_ldc_t *ldcp)
{
	vsw_t			*vswp = ldcp->ldc_vswp;
	ldc_mem_info_t		minfo;
	dring_info_t		*dp;

	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
	ldcp->lane_out.dringp = dp;

	/* create public section of ring */
	if ((ldc_mem_dring_create(vsw_num_descriptors,
	    sizeof (vnet_public_desc_t), &dp->dring_handle)) != 0) {

		DERR(vswp, "vsw_create_tx_dring(%lld): ldc dring create "
		    "failed", ldcp->ldc_id);
		goto fail;
	}
	ASSERT(dp->dring_handle != NULL);

	/*
	 * Get the base address of the public section of the ring.
	 */
	if ((ldc_mem_dring_info(dp->dring_handle, &minfo)) != 0) {
		DERR(vswp, "vsw_create_tx_dring(%lld): dring info failed\n",
		    ldcp->ldc_id);
		goto fail;
	} else {
		ASSERT(minfo.vaddr != 0);
		dp->pub_addr = minfo.vaddr;
	}

	dp->num_descriptors = vsw_num_descriptors;
	dp->descriptor_size = sizeof (vnet_public_desc_t);
	dp->options = VIO_TX_DRING;
	dp->dring_ncookies = 1;	/* guaranteed by ldc */

	/*
	 * create private portion of ring
	 */
	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
	    (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);

	if (vsw_setup_tx_dring(ldcp, dp)) {
		DERR(vswp, "%s: unable to setup ring", __func__);
		goto fail;
	}

	/* bind dring to the channel */
	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->dring_handle,
	    LDC_DIRECT_MAP | LDC_SHADOW_MAP, LDC_MEM_RW,
	    &dp->dring_cookie[0], &dp->dring_ncookies)) != 0) {
		DERR(vswp, "vsw_create_tx_dring: unable to bind to channel "
		    "%lld", ldcp->ldc_id);
		goto fail;
	}

	/* haven't used any descriptors yet */
	dp->end_idx = 0;
	dp->last_ack_recv = -1;
	dp->restart_reqd = B_TRUE;

	return (dp);

fail:
	vsw_destroy_tx_dring(ldcp);
	return (NULL);
}

/*
 * Set up the descriptors in the tx dring.
 * Returns 0 on success, 1 on failure.
 */
int
vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp)
{
	vnet_public_desc_t	*pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	uint64_t		*tmpp;
	uint64_t		offset = 0;
	uint32_t		ncookies = 0;
	static char		*name = "vsw_setup_ring";
	int			i, j, nc, rv;
	size_t			data_sz;
	void			*data_addr;

	priv_addr = dp->priv_addr;
	pub_addr = dp->pub_addr;

	/* public section may be null but private should never be */
	ASSERT(priv_addr != NULL);

	/*
	 * Allocate the region of memory which will be used to hold
	 * the data the descriptors will refer to.
	 */
	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;

	/*
	 * In order to ensure that the number of ldc cookies per descriptor is
	 * limited to be within the default MAX_COOKIES (2), we take the steps
	 * outlined below:
	 *
	 * Align the entire data buffer area to 8K and carve out per descriptor
	 * data buffers starting from this 8K aligned base address.
	 *
	 * We round up the mtu specified to be a multiple of 2K or 4K.
	 * For sizes up to 12K we round up the size to the next 2K.
	 * For sizes > 12K we round up to the next 4K (otherwise sizes such as
	 * 14K could end up needing 3 cookies, with the buffer spread across
	 * 3 8K pages: 8K+6K, 2K+8K+2K, 6K+8K, ...).
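	 *
	 * For example, with the default 1500 byte MTU the per-descriptor
	 * buffer size works out to roughly 1.5K and rounds up to 2K; since
	 * buffers are carved out of the 8K aligned area in 2K units, such a
	 * buffer never crosses an 8K boundary and needs only a single cookie.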
	 */
	if (data_sz <= VNET_12K) {
		data_sz = VNET_ROUNDUP_2K(data_sz);
	} else {
		data_sz = VNET_ROUNDUP_4K(data_sz);
	}

	dp->desc_data_sz = data_sz;

	/* allocate extra 8K bytes for alignment */
	dp->data_sz = (vsw_num_descriptors * data_sz) + VNET_8K;
	data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
	dp->data_addr = data_addr;

	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
	    dp->data_sz, dp->data_addr);

	/* align the starting address of the data area to 8K */
	data_addr = (void *)VNET_ROUNDUP_8K((uintptr_t)data_addr);

	tmpp = (uint64_t *)data_addr;
	/*
	 * Per-descriptor buffer size expressed in 8-byte units, so that
	 * "tmpp += offset" below advances by one data buffer.
	 */
	offset = dp->desc_data_sz/sizeof (tmpp);

	/*
	 * Initialise some of the private and public (if they exist)
	 * descriptor fields.
	 */
	for (i = 0; i < vsw_num_descriptors; i++) {
		mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);

		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
		    &priv_addr->memhandle)) != 0) {
			DERR(vswp, "%s: alloc mem handle failed", name);
			goto fail;
		}

		priv_addr->datap = (void *)tmpp;

		rv = ldc_mem_bind_handle(priv_addr->memhandle,
		    (caddr_t)priv_addr->datap, dp->desc_data_sz,
		    LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
		    &(priv_addr->memcookie[0]), &ncookies);
		if (rv != 0) {
			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
			    "(rv %d)", name, ldcp->ldc_id, rv);
			goto fail;
		}
		priv_addr->bound = 1;

		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
		    name, i, priv_addr->memcookie[0].addr,
		    priv_addr->memcookie[0].size);

		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
			    "invalid num of cookies (%d) for size 0x%llx",
			    name, ldcp->ldc_id, ncookies, dp->desc_data_sz);

			goto fail;
		} else {
			for (j = 1; j < ncookies; j++) {
				rv = ldc_mem_nextcookie(priv_addr->memhandle,
				    &(priv_addr->memcookie[j]));
				if (rv != 0) {
					DERR(vswp, "%s: ldc_mem_nextcookie "
					    "failed rv (%d)", name, rv);
					goto fail;
				}
				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
				    "size 0x%llx", name, j,
				    priv_addr->memcookie[j].addr,
				    priv_addr->memcookie[j].size);
			}

		}
		priv_addr->ncookies = ncookies;
		priv_addr->dstate = VIO_DESC_FREE;

		if (pub_addr != NULL) {

			/* link pub and private sides */
			priv_addr->descp = pub_addr;

			pub_addr->ncookies = priv_addr->ncookies;

			for (nc = 0; nc < pub_addr->ncookies; nc++) {
				bcopy(&priv_addr->memcookie[nc],
				    &pub_addr->memcookie[nc],
				    sizeof (ldc_mem_cookie_t));
			}

			pub_addr->hdr.dstate = VIO_DESC_FREE;
			pub_addr++;
		}

		/*
		 * move to next element in the dring and the next
		 * position in the data buffer.
		 */
		priv_addr++;
		tmpp += offset;
	}

	return (0);

fail:
	/* return failure; caller will cleanup */
	return (1);
}

/*
 * Free transmit resources for the channel.
 */
void
vsw_destroy_tx_dring(vsw_ldc_t *ldcp)
{
	vsw_private_desc_t	*paddr = NULL;
	int			i;
	lane_t			*lp = &ldcp->lane_out;
	dring_info_t		*dp;

	dp = lp->dringp;
	if (dp == NULL) {
		return;
	}

	mutex_enter(&dp->dlock);

	if (dp->priv_addr != NULL) {
		/*
		 * First unbind and free the memory handles
		 * stored in each descriptor within the ring.
		 */
		for (i = 0; i < vsw_num_descriptors; i++) {
			paddr = (vsw_private_desc_t *)dp->priv_addr + i;
			if (paddr->memhandle != 0) {
				if (paddr->bound == 1) {
					if (ldc_mem_unbind_handle(
					    paddr->memhandle) != 0) {
						DERR(NULL, "error "
						    "unbinding handle for "
						    "ring 0x%llx at pos %d",
						    dp, i);
						continue;
					}
					paddr->bound = 0;
				}

				if (ldc_mem_free_handle(
				    paddr->memhandle) != 0) {
					DERR(NULL, "error freeing "
					    "handle for ring 0x%llx "
					    "at pos %d", dp, i);
					continue;
				}
				paddr->memhandle = 0;
			}
			mutex_destroy(&paddr->dstate_lock);
		}
		kmem_free(dp->priv_addr,
		    (sizeof (vsw_private_desc_t) * vsw_num_descriptors));
	}

	/*
	 * Now unbind and destroy the ring itself.
	 */
	if (dp->dring_handle != 0) {
		(void) ldc_mem_dring_unbind(dp->dring_handle);
		(void) ldc_mem_dring_destroy(dp->dring_handle);
	}

	if (dp->data_addr != NULL) {
		kmem_free(dp->data_addr, dp->data_sz);
	}

	mutex_exit(&dp->dlock);
	mutex_destroy(&dp->dlock);
	mutex_destroy(&dp->restart_lock);
	kmem_free(dp, sizeof (dring_info_t));
	lp->dringp = NULL;
}

/*
 * Map the transmit descriptor ring exported by the peer
 * as our receive descriptor ring.
 */
dring_info_t *
vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt)
{
	int			rv;
	dring_info_t		*dp;
	vio_dring_reg_msg_t	*dring_pkt = pkt;
	vsw_t			*vswp = ldcp->ldc_vswp;

	dp = vsw_map_dring_cmn(ldcp, dring_pkt);
	if (dp == NULL) {
		return (NULL);
	}

	/* TxDring mode specific initializations */
	dp->end_idx = 0;
	ldcp->lane_in.dringp = dp;

	/* Allocate pools of receive mblks */
	rv = vsw_init_multipools(ldcp, vswp);
	if (rv != 0) {
		/*
		 * We do not return failure if receive mblk pools can't
		 * be allocated; instead allocb(9F) will be used to
		 * dynamically allocate buffers during receive.
		 */
		DWARN(vswp, "%s: unable to create free mblk pools for"
		    " channel %ld (rv %d)", __func__, ldcp->ldc_id, rv);
	}

	return (dp);
}

/*
 * Unmap the receive descriptor ring.
 */
void
vsw_unmap_rx_dring(vsw_ldc_t *ldcp)
{
	vio_mblk_pool_t	*fvmp = NULL;
	vsw_t		*vswp = ldcp->ldc_vswp;
	lane_t		*lp = &ldcp->lane_in;
	dring_info_t	*dp;

	if ((dp = lp->dringp) == NULL) {
		return;
	}

	/*
	 * If we can't destroy all the rx pools for this channel,
	 * dispatch a task to retry and clean up those rx pools. Note
	 * that we don't need to wait for the task to complete. If the
	 * vsw device itself gets detached (vsw_detach()), it will wait
	 * for the task to complete implicitly in ddi_taskq_destroy().
	 */
	vio_destroy_multipools(&ldcp->vmp, &fvmp);
	if (fvmp != NULL) {
		(void) ddi_taskq_dispatch(vswp->rxp_taskq,
		    vsw_destroy_rxpools, fvmp, DDI_SLEEP);
	}

	if (dp->dring_handle != 0) {
		(void) ldc_mem_dring_unmap(dp->dring_handle);
	}
	kmem_free(dp, sizeof (dring_info_t));
	lp->dringp = NULL;
}

static int
vsw_init_multipools(vsw_ldc_t *ldcp, vsw_t *vswp)
{
	size_t		data_sz;
	int		rv;
	uint32_t	sz1 = 0;
	uint32_t	sz2 = 0;
	uint32_t	sz3 = 0;
	uint32_t	sz4 = 0;

	/*
	 * We round up the mtu specified to be a multiple of 2K to limit the
	 * number of rx buffer pools created for a given mtu.
	 */
	data_sz = vswp->max_frame_size + VNET_IPALIGN + VNET_LDCALIGN;
	data_sz = VNET_ROUNDUP_2K(data_sz);

	/*
	 * If pool sizes are specified, use them. Note that the presence of
	 * the first tunable will be used as a hint.
	 */
	if (vsw_mblk_size1 != 0) {
		sz1 = vsw_mblk_size1;
		sz2 = vsw_mblk_size2;
		sz3 = vsw_mblk_size3;
		sz4 = vsw_mblk_size4;

		if (sz4 == 0) {	/* need 3 pools */

			ldcp->max_rxpool_size = sz3;
			rv = vio_init_multipools(&ldcp->vmp,
			    VSW_NUM_VMPOOLS, sz1, sz2, sz3,
			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);

		} else {

			ldcp->max_rxpool_size = sz4;
			rv = vio_init_multipools(&ldcp->vmp,
			    VSW_NUM_VMPOOLS + 1, sz1, sz2, sz3, sz4,
			    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
			    vsw_num_mblks4);

		}

		return (rv);
	}

	/*
	 * Pool sizes are not specified. We select the pool sizes based on the
	 * mtu if vsw_jumbo_rxpools is enabled.
	 */
	if (vsw_jumbo_rxpools == B_FALSE || data_sz == VNET_2K) {
		/*
		 * Receive buffer pool allocation based on mtu is disabled.
		 * Use the default mechanism of standard size pool allocation.
		 */
		sz1 = VSW_MBLK_SZ_128;
		sz2 = VSW_MBLK_SZ_256;
		sz3 = VSW_MBLK_SZ_2048;
		ldcp->max_rxpool_size = sz3;

		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS,
		    sz1, sz2, sz3,
		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3);

		return (rv);
	}

	switch (data_sz) {

	case VNET_4K:

		sz1 = VSW_MBLK_SZ_128;
		sz2 = VSW_MBLK_SZ_256;
		sz3 = VSW_MBLK_SZ_2048;
		sz4 = sz3 << 1;	/* 4K */
		ldcp->max_rxpool_size = sz4;

		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
		    sz1, sz2, sz3, sz4,
		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
		    vsw_num_mblks4);
		break;

	default:	/* data_sz: 4K+ to 16K */

		sz1 = VSW_MBLK_SZ_256;
		sz2 = VSW_MBLK_SZ_2048;
		sz3 = data_sz >> 1;	/* Jumbo-size/2 */
		sz4 = data_sz;		/* Jumbo-size */
		ldcp->max_rxpool_size = sz4;

		rv = vio_init_multipools(&ldcp->vmp, VSW_NUM_VMPOOLS + 1,
		    sz1, sz2, sz3, sz4,
		    vsw_num_mblks1, vsw_num_mblks2, vsw_num_mblks3,
		    vsw_num_mblks4);
		break;
	}

	return (rv);

}

/*
 * Generic routine to send message out over ldc channel.
 *
 * It is possible that when we attempt to write over the ldc channel
 * that we get notified that it has been reset. Depending on the value
 * of the handle_reset flag we either handle that event here or simply
 * notify the caller that the channel was reset.
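 *
 * The write is retried up to vsw_wretries times while ldc_write() returns
 * EWOULDBLOCK. For data messages (dring data, raw data and in-band descriptor
 * messages) the outbound sequence number is stamped into the message before
 * the write and advanced only if the write succeeds.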
 */
int
vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size, boolean_t handle_reset)
{
	int			rv;
	size_t			msglen = size;
	vio_msg_tag_t		*tag = (vio_msg_tag_t *)msgp;
	vsw_t			*vswp = ldcp->ldc_vswp;
	vio_dring_msg_t		*dmsg;
	vio_raw_data_msg_t	*rmsg;
	vnet_ibnd_desc_t	*imsg;
	boolean_t		data_msg = B_FALSE;
	int			retries = vsw_wretries;

	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
	    ldcp->ldc_id, size);

	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);

	mutex_enter(&ldcp->ldc_txlock);

	if (tag->vio_subtype == VIO_SUBTYPE_INFO) {
		if (tag->vio_subtype_env == VIO_DRING_DATA) {
			dmsg = (vio_dring_msg_t *)tag;
			dmsg->seq_num = ldcp->lane_out.seq_num;
			data_msg = B_TRUE;
		} else if (tag->vio_subtype_env == VIO_PKT_DATA) {
			rmsg = (vio_raw_data_msg_t *)tag;
			rmsg->seq_num = ldcp->lane_out.seq_num;
			data_msg = B_TRUE;
		} else if (tag->vio_subtype_env == VIO_DESC_DATA) {
			imsg = (vnet_ibnd_desc_t *)tag;
			imsg->hdr.seq_num = ldcp->lane_out.seq_num;
			data_msg = B_TRUE;
		}
	}

	do {
		msglen = size;
		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
	} while (rv == EWOULDBLOCK && --retries > 0);

	if (rv == 0 && data_msg == B_TRUE) {
		ldcp->lane_out.seq_num++;
	}

	if ((rv != 0) || (msglen != size)) {
		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) rv(%d) "
		    "size (%d) msglen(%d)\n", ldcp->ldc_id, rv, size, msglen);
		ldcp->ldc_stats.oerrors++;
	}

	mutex_exit(&ldcp->ldc_txlock);

	/*
	 * If channel has been reset we either handle it here or
	 * simply report back that it has been reset and let caller
	 * decide what to do.
	 */
	if (rv == ECONNRESET) {
		DWARN(vswp, "%s (%lld) channel reset", __func__, ldcp->ldc_id);

		if (handle_reset) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
		}
	}

	return (rv);
}

/*
 * A per LDC worker thread to process ldc messages. This thread is woken up by
 * the LDC interrupt handler to process LDC packets and receive data.
 */
void
vsw_ldc_msg_worker(void *arg)
{
	callb_cpr_t	cprinfo;
	vsw_ldc_t	*ldcp = (vsw_ldc_t *)arg;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	CALLB_CPR_INIT(&cprinfo, &ldcp->msg_thr_lock, callb_generic_cpr,
	    "vsw_msg_thread");
	mutex_enter(&ldcp->msg_thr_lock);
	while (!(ldcp->msg_thr_flags & VSW_WTHR_STOP)) {

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * Wait until the data is received or a stop
		 * request is received.
		 */
		while (!(ldcp->msg_thr_flags &
		    (VSW_WTHR_DATARCVD | VSW_WTHR_STOP))) {
			cv_wait(&ldcp->msg_thr_cv, &ldcp->msg_thr_lock);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &ldcp->msg_thr_lock)

		/*
		 * First process the stop request.
		 */
		if (ldcp->msg_thr_flags & VSW_WTHR_STOP) {
			D2(vswp, "%s(%lld):Rx thread stopped\n",
			    __func__, ldcp->ldc_id);
			break;
		}
		ldcp->msg_thr_flags &= ~VSW_WTHR_DATARCVD;
		mutex_exit(&ldcp->msg_thr_lock);
		D1(vswp, "%s(%lld):calling vsw_process_pkt\n",
		    __func__, ldcp->ldc_id);
		mutex_enter(&ldcp->ldc_cblock);
		vsw_process_pkt(ldcp);
		mutex_exit(&ldcp->ldc_cblock);
		mutex_enter(&ldcp->msg_thr_lock);
	}

	/*
	 * Update the run status and wakeup the thread that
	 * has sent the stop request.
	 */
	ldcp->msg_thr_flags &= ~VSW_WTHR_STOP;
	ldcp->msg_thread = NULL;
	CALLB_CPR_EXIT(&cprinfo);
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
	thread_exit();
}

/* Co-ordinate with msg processing thread to stop it */
void
vsw_stop_msg_thread(vsw_ldc_t *ldcp)
{
	kt_did_t	tid = 0;
	vsw_t		*vswp = ldcp->ldc_vswp;

	D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
	/*
	 * Send a stop request by setting the stop flag and
	 * wait until the msg process thread stops.
	 */
	mutex_enter(&ldcp->msg_thr_lock);
	if (ldcp->msg_thread != NULL) {
		tid = ldcp->msg_thread->t_did;
		ldcp->msg_thr_flags |= VSW_WTHR_STOP;
		cv_signal(&ldcp->msg_thr_cv);
	}
	mutex_exit(&ldcp->msg_thr_lock);

	if (tid != 0) {
		thread_join(tid);
	}
	D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
}

/*
 * Send packet out via descriptor ring to a logical device.
 */
int
vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
	vio_dring_msg_t		dring_pkt;
	dring_info_t		*dp = NULL;
	vsw_private_desc_t	*priv_desc = NULL;
	vnet_public_desc_t	*pub = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*bp;
	size_t			n, size;
	caddr_t			bufp;
	int			idx;
	int			status = LDC_TX_SUCCESS;
	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);

	/* TODO: make test a macro */
	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
	    (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == 0)) {
		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
		    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
		    ldcp->lane_out.lstate);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	if ((dp = ldcp->lane_out.dringp) == NULL) {
		DERR(vswp, "%s(%lld): no dring for outbound lane on"
		    " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	size = msgsize(mp);
	if (size > (size_t)lp->mtu) {
		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
		    ldcp->ldc_id, size);
		ldcp->ldc_stats.oerrors++;
		return (LDC_TX_FAILURE);
	}

	/*
	 * Find a free descriptor
	 *
	 * Note: for the moment we are assuming that we will only
	 * have one dring going from the switch to each of its
	 * peers. This may change in the future.
	 */
	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
		D2(vswp, "%s(%lld): no descriptor available for ring "
		    "at 0x%llx", __func__, ldcp->ldc_id, dp);

		/* nothing more we can do */
		status = LDC_TX_NORESOURCES;
		ldcp->ldc_stats.tx_no_desc++;
		goto vsw_dringsend_free_exit;
	} else {
		D2(vswp, "%s(%lld): free private descriptor found at pos %ld "
		    "addr 0x%llx\n", __func__, ldcp->ldc_id, idx, priv_desc);
	}

	/* copy data into the descriptor */
	bufp = priv_desc->datap;
	bufp += VNET_IPALIGN;
	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
		n = MBLKL(bp);
		bcopy(bp->b_rptr, bufp, n);
		bufp += n;
	}

	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

	pub = priv_desc->descp;
	pub->nbytes = priv_desc->datalen;

	/* update statistics */
	if (IS_BROADCAST(ehp))
		ldcp->ldc_stats.brdcstxmt++;
	else if (IS_MULTICAST(ehp))
		ldcp->ldc_stats.multixmt++;
	ldcp->ldc_stats.opackets++;
	ldcp->ldc_stats.obytes += priv_desc->datalen;

	mutex_enter(&priv_desc->dstate_lock);
	pub->hdr.dstate = VIO_DESC_READY;
	mutex_exit(&priv_desc->dstate_lock);

	/*
	 * Determine whether or not we need to send a message to our
	 * peer prompting them to read our newly updated descriptor(s).
	 */
	mutex_enter(&dp->restart_lock);
	if (dp->restart_reqd) {
		dp->restart_reqd = B_FALSE;
		ldcp->ldc_stats.dring_data_msgs_sent++;
		mutex_exit(&dp->restart_lock);

		/*
		 * Send a vio_dring_msg to peer to prompt them to read
		 * the updated descriptor ring.
		 */
		dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
		dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
		dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
		dring_pkt.tag.vio_sid = ldcp->local_session;

		/* Note - for now using first ring */
		dring_pkt.dring_ident = dp->ident;

		/*
		 * If last_ack_recv is -1 then we know we've not
		 * received any ack's yet, so this must be the first
		 * msg sent, so set the start to the beginning of the ring.
		 */
		mutex_enter(&dp->dlock);
		if (dp->last_ack_recv == -1) {
			dring_pkt.start_idx = 0;
		} else {
			dring_pkt.start_idx =
			    (dp->last_ack_recv + 1) % dp->num_descriptors;
		}
		dring_pkt.end_idx = -1;
		mutex_exit(&dp->dlock);

		D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
		    ldcp->ldc_id, dp, dring_pkt.dring_ident);
		D3(vswp, "%s(%lld): start %lld : end %lld :\n",
		    __func__, ldcp->ldc_id, dring_pkt.start_idx,
		    dring_pkt.end_idx);

		(void) vsw_send_msg(ldcp, (void *)&dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);

		return (status);

	} else {
		mutex_exit(&dp->restart_lock);
		D2(vswp, "%s(%lld): updating descp %d", __func__,
		    ldcp->ldc_id, idx);
	}

vsw_dringsend_free_exit:

	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
	return (status);
}

/*
 * Searches the private section of a ring for a free descriptor,
 * starting at the location of the last free descriptor found
 * previously.
 *
 * Returns 0 if free descriptor is available, and updates state
 * of private descriptor to VIO_DESC_READY, otherwise returns 1.
 *
 * FUTURE: might need to return contiguous range of descriptors
 * as dring info msg assumes all will be contiguous.
 */
int
vsw_dring_find_free_desc(dring_info_t *dringp,
    vsw_private_desc_t **priv_p, int *idx)
{
	vsw_private_desc_t	*addr = NULL;
	int			num = vsw_num_descriptors;
	int			ret = 1;

	D1(NULL, "%s enter\n", __func__);

	ASSERT(dringp->priv_addr != NULL);

	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
	    __func__, dringp, dringp->end_idx);

	addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;

	mutex_enter(&addr->dstate_lock);
	if (addr->dstate == VIO_DESC_FREE) {
		addr->dstate = VIO_DESC_READY;
		*priv_p = addr;
		*idx = dringp->end_idx;
		dringp->end_idx = (dringp->end_idx + 1) % num;
		ret = 0;

	}
	mutex_exit(&addr->dstate_lock);

	/* ring full */
	if (ret == 1) {
		D2(NULL, "%s: no desc free: started at %d", __func__,
		    dringp->end_idx);
	}

	D1(NULL, "%s: exit\n", __func__);

	return (ret);
}

/* vsw_reclaim_dring -- reclaim descriptors */
int
vsw_reclaim_dring(dring_info_t *dp, int start)
{
	int			i, j, len;
	vsw_private_desc_t	*priv_addr;
	vnet_public_desc_t	*pub_addr;

	pub_addr = (vnet_public_desc_t *)dp->pub_addr;
	priv_addr = (vsw_private_desc_t *)dp->priv_addr;
	len = dp->num_descriptors;

	D2(NULL, "%s: start index %ld\n", __func__, start);

	j = 0;
	for (i = start; j < len; i = (i + 1) % len, j++) {
		pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
		priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;

		mutex_enter(&priv_addr->dstate_lock);
		if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
			mutex_exit(&priv_addr->dstate_lock);
			break;
		}
		pub_addr->hdr.dstate = VIO_DESC_FREE;
		priv_addr->dstate = VIO_DESC_FREE;
		/* clear all the fields */
		priv_addr->datalen = 0;
		pub_addr->hdr.ack = 0;
		mutex_exit(&priv_addr->dstate_lock);

		D3(NULL, "claiming descp:%d pub state:0x%llx priv state 0x%llx",
		    i, pub_addr->hdr.dstate, priv_addr->dstate);
	}
	return (j);
}

void
vsw_process_dringdata(void *arg, void *dpkt)
{
	vsw_ldc_t		*ldcp = arg;
	vio_dring_msg_t		*dring_pkt;
	vnet_public_desc_t	desc, *pub_addr = NULL;
	vsw_private_desc_t	*priv_addr = NULL;
	dring_info_t		*dp = NULL;
	vsw_t			*vswp = ldcp->ldc_vswp;
	mblk_t			*mp = NULL;
	vio_mblk_t		*vmp = NULL;
	mblk_t			*bp = NULL;
	mblk_t			*bpt = NULL;
	size_t			nbytes = 0;
	uint64_t		chain = 0;
	uint64_t		len;
	uint32_t		pos, start;
	uint32_t		range_start, range_end;
	int32_t			end, num, cnt = 0;
	int			i, rv, rng_rv = 0, msg_rv = 0;
	boolean_t		prev_desc_ack = B_FALSE;
	int			read_attempts = 0;
	struct ether_header	*ehp;
	lane_t			*lp = &ldcp->lane_out;

	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

	/*
	 * We know this is a data/dring packet so
	 * cast it into the correct structure.
	 */
	dring_pkt = (vio_dring_msg_t *)dpkt;

	/*
	 * Switch on the vio_subtype. If it's INFO then we need to
	 * process the data. If it's an ACK we need to make sure
	 * it makes sense (i.e. did we send an earlier data/info),
	 * and if it's a NACK then we maybe attempt a retry.
	 */
	switch (dring_pkt->tag.vio_subtype) {
	case VIO_SUBTYPE_INFO:
		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);

		dp = ldcp->lane_in.dringp;
		if (dp->ident != dring_pkt->dring_ident) {
			DERR(vswp, "%s(%lld): unable to find dring from "
			    "ident 0x%llx", __func__, ldcp->ldc_id,
			    dring_pkt->dring_ident);

			SND_DRING_NACK(ldcp, dring_pkt);
			return;
		}

		ldcp->ldc_stats.dring_data_msgs_rcvd++;

		start = pos = dring_pkt->start_idx;
		end = dring_pkt->end_idx;
		len = dp->num_descriptors;

		range_start = range_end = pos;

		D2(vswp, "%s(%lld): start index %ld : end %ld\n",
		    __func__, ldcp->ldc_id, start, end);

		if (end == -1) {
			num = -1;
		} else if (end >= 0) {
			num = end >= pos ? end - pos + 1 :
			    (len - pos + 1) + end;

			/* basic sanity check */
			if (end > len) {
				DERR(vswp, "%s(%lld): endpoint %lld outside "
				    "ring length %lld", __func__,
				    ldcp->ldc_id, end, len);

				SND_DRING_NACK(ldcp, dring_pkt);
				return;
			}
		} else {
			DERR(vswp, "%s(%lld): invalid endpoint %lld",
			    __func__, ldcp->ldc_id, end);
			SND_DRING_NACK(ldcp, dring_pkt);
			return;
		}

		while (cnt != num) {
vsw_recheck_desc:
			pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;

			if ((rng_rv = vnet_dring_entry_copy(pub_addr,
			    &desc, dp->dring_mtype, dp->dring_handle,
			    pos, pos)) != 0) {
				DERR(vswp, "%s(%lld): unable to copy "
				    "descriptor at pos %d: err %d",
				    __func__, ldcp->ldc_id, pos, rng_rv);
				ldcp->ldc_stats.ierrors++;
				break;
			}

			/*
			 * When given a bounded range of descriptors
			 * to process, it's an error to hit a descriptor
			 * which is not ready. In the non-bounded case
			 * (end_idx == -1) this simply indicates we have
			 * reached the end of the current active range.
			 */
			if (desc.hdr.dstate != VIO_DESC_READY) {
				/* unbound - no error */
				if (end == -1) {
					if (read_attempts == vsw_recv_retries)
						break;

					delay(drv_usectohz(vsw_recv_delay));
					read_attempts++;
					goto vsw_recheck_desc;
				}

				/* bounded - error - so NACK back */
				DERR(vswp, "%s(%lld): descriptor not READY "
				    "(%d)", __func__, ldcp->ldc_id,
				    desc.hdr.dstate);
				SND_DRING_NACK(ldcp, dring_pkt);
				return;
			}

			DTRACE_PROBE1(read_attempts, int, read_attempts);

			range_end = pos;

			/*
			 * If we ACK'd the previous descriptor then now
			 * record the new range start position for later
			 * ACK's.
			 */
			if (prev_desc_ack) {
				range_start = pos;

				D2(vswp, "%s(%lld): updating range start to "
				    "be %d", __func__, ldcp->ldc_id,
				    range_start);

				prev_desc_ack = B_FALSE;
			}

			D2(vswp, "%s(%lld): processing desc %lld at pos"
			    " 0x%llx : dstate 0x%lx : datalen 0x%lx",
			    __func__, ldcp->ldc_id, pos, &desc,
			    desc.hdr.dstate, desc.nbytes);

			if ((desc.nbytes < ETHERMIN) ||
			    (desc.nbytes > lp->mtu)) {
				/* invalid size; drop the packet */
				ldcp->ldc_stats.ierrors++;
				goto vsw_process_desc_done;
			}

			/*
			 * Ensure that we ask ldc for an aligned
			 * number of bytes. Data is padded to align on 8
			 * byte boundary, desc.nbytes is actual data length,
			 * i.e. minus that padding.
			 */
			nbytes = (desc.nbytes + VNET_IPALIGN + 7) & ~7;
			if (nbytes > ldcp->max_rxpool_size) {
				mp = allocb(desc.nbytes + VNET_IPALIGN + 8,
				    BPRI_MED);
				vmp = NULL;
			} else {
				vmp = vio_multipool_allocb(&ldcp->vmp, nbytes);
				if (vmp == NULL) {
					ldcp->ldc_stats.rx_vio_allocb_fail++;
					/*
					 * No free receive buffers available,
					 * so fallback onto allocb(9F). Make
					 * sure that we get a data buffer which
					 * is a multiple of 8 as this is
					 * required by ldc_mem_copy.
					 */
					DTRACE_PROBE(allocb);
					mp = allocb(desc.nbytes +
					    VNET_IPALIGN + 8, BPRI_MED);
				} else {
					mp = vmp->mp;
				}
			}
			if (mp == NULL) {
				DERR(vswp, "%s(%ld): allocb failed",
				    __func__, ldcp->ldc_id);
				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
				    dp->dring_mtype, dp->dring_handle, pos, pos,
				    VIO_DESC_DONE);
				ldcp->ldc_stats.ierrors++;
				ldcp->ldc_stats.rx_allocb_fail++;
				break;
			}

			rv = ldc_mem_copy(ldcp->ldc_handle,
			    (caddr_t)mp->b_rptr, 0, &nbytes,
			    desc.memcookie, desc.ncookies, LDC_COPY_IN);
			if (rv != 0) {
				DERR(vswp, "%s(%d): unable to copy in data "
				    "from %d cookies in desc %d (rv %d)",
				    __func__, ldcp->ldc_id, desc.ncookies,
				    pos, rv);
				freemsg(mp);

				rng_rv = vnet_dring_entry_set_dstate(pub_addr,
				    dp->dring_mtype, dp->dring_handle, pos, pos,
				    VIO_DESC_DONE);
				ldcp->ldc_stats.ierrors++;
				break;
			} else {
				D2(vswp, "%s(%d): copied in %ld bytes"
				    " using %d cookies", __func__,
				    ldcp->ldc_id, nbytes, desc.ncookies);
			}

			/* adjust the read pointer to skip over the padding */
			mp->b_rptr += VNET_IPALIGN;

			/* point to the actual end of data */
			mp->b_wptr = mp->b_rptr + desc.nbytes;

			if (vmp != NULL) {
				vmp->state = VIO_MBLK_HAS_DATA;
			}

			/* update statistics */
			ehp = (struct ether_header *)mp->b_rptr;
			if (IS_BROADCAST(ehp))
				ldcp->ldc_stats.brdcstrcv++;
			else if (IS_MULTICAST(ehp))
				ldcp->ldc_stats.multircv++;

			ldcp->ldc_stats.ipackets++;
			ldcp->ldc_stats.rbytes += desc.nbytes;

			/*
			 * IPALIGN space can be used for VLAN_TAG
			 */
			(void) vsw_vlan_frame_pretag(ldcp->ldc_port,
			    VSW_VNETPORT, mp);

			/* build a chain of received packets */
			if (bp == NULL) {
				/* first pkt */
				bp = mp;
				bp->b_next = bp->b_prev = NULL;
				bpt = bp;
				chain = 1;
			} else {
				mp->b_next = mp->b_prev = NULL;
				bpt->b_next = mp;
				bpt = mp;
				chain++;
			}

vsw_process_desc_done:
			/* mark we are finished with this descriptor */
			if ((rng_rv = vnet_dring_entry_set_dstate(pub_addr,
			    dp->dring_mtype, dp->dring_handle, pos, pos,
			    VIO_DESC_DONE)) != 0) {
				DERR(vswp, "%s(%lld): unable to update "
				    "dstate at pos %d: err %d",
				    __func__, ldcp->ldc_id, pos, rng_rv);
				ldcp->ldc_stats.ierrors++;
				break;
			}

			/*
			 * Send an ACK back to peer if requested.
			 */
			if (desc.hdr.ack) {
				dring_pkt->start_idx = range_start;
				dring_pkt->end_idx = range_end;

				DERR(vswp, "%s(%lld): processed %d %d, ACK"
				    " requested", __func__, ldcp->ldc_id,
				    dring_pkt->start_idx, dring_pkt->end_idx);

				dring_pkt->dring_process_state = VIO_DP_ACTIVE;
				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
				dring_pkt->tag.vio_sid = ldcp->local_session;

				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
				    sizeof (vio_dring_msg_t), B_FALSE);

				/*
				 * Check if ACK was successfully sent. If not
				 * we break and deal with that below.
				 */
				if (msg_rv != 0)
					break;

				prev_desc_ack = B_TRUE;
				range_start = pos;
			}

			/* next descriptor */
			pos = (pos + 1) % len;
			cnt++;

			/*
			 * Break out of loop here and stop processing to
			 * allow some other network device (or disk) to
			 * get access to the cpu.
			 */
			if (chain > vsw_chain_len) {
				D3(vswp, "%s(%lld): switching chain of %d "
				    "msgs", __func__, ldcp->ldc_id, chain);
				break;
			}
		}

		/* send the chain of packets to be switched */
		if (bp != NULL) {
			DTRACE_PROBE1(vsw_rcv_msgs, int, chain);
			D3(vswp, "%s(%lld): switching chain of %d msgs",
			    __func__, ldcp->ldc_id, chain);
			vswp->vsw_switch_frame(vswp, bp, VSW_VNETPORT,
			    ldcp->ldc_port, NULL);
		}

		/*
		 * If we encountered an error when attempting to
		 * access an imported dring, initiate a connection reset.
		 */
		if (rng_rv != 0) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
			break;
		}

		/*
		 * If, when we attempted to send the ACK, we found that the
		 * channel had been reset, handle it now.
		 */
		if (msg_rv == ECONNRESET) {
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
			break;
		}

		DTRACE_PROBE1(msg_cnt, int, cnt);

		/*
		 * We are now finished so ACK back with the state
		 * set to STOPPING so our peer knows we are finished.
		 */
		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
		dring_pkt->tag.vio_sid = ldcp->local_session;

		dring_pkt->dring_process_state = VIO_DP_STOPPED;

		DTRACE_PROBE(stop_process_sent);

		/*
		 * We have not processed any more descriptors beyond
		 * the last one we ACK'd.
		 */
		if (prev_desc_ack)
			range_start = range_end;

		dring_pkt->start_idx = range_start;
		dring_pkt->end_idx = range_end;

		D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
		    __func__, ldcp->ldc_id, dring_pkt->start_idx,
		    dring_pkt->end_idx);

		(void) vsw_send_msg(ldcp, (void *)dring_pkt,
		    sizeof (vio_dring_msg_t), B_TRUE);
		ldcp->ldc_stats.dring_data_acks_sent++;
		ldcp->ldc_stats.dring_stopped_acks_sent++;
		break;

	case VIO_SUBTYPE_ACK:
		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
		/*
		 * Verify that the relevant descriptors are all
		 * marked as DONE
		 */
		dp = ldcp->lane_out.dringp;
		if (dp->ident != dring_pkt->dring_ident) {
			DERR(vswp, "%s: unknown ident in ACK", __func__);
			return;
		}

		start = end = 0;
		start = dring_pkt->start_idx;
		end = dring_pkt->end_idx;
		len = dp->num_descriptors;

		mutex_enter(&dp->dlock);
		dp->last_ack_recv = end;
		ldcp->ldc_stats.dring_data_acks_rcvd++;
		mutex_exit(&dp->dlock);

		(void) vsw_reclaim_dring(dp, start);

		/*
		 * If our peer is stopping processing descriptors then
		 * we check to make sure it has processed all the descriptors
		 * we have updated. If not then we send it a new message
		 * to prompt it to restart.
		 */
		if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
			DTRACE_PROBE(stop_process_recv);
			D2(vswp, "%s(%lld): got stopping msg : %d : %d",
			    __func__, ldcp->ldc_id, dring_pkt->start_idx,
			    dring_pkt->end_idx);

			/*
			 * Check next descriptor in public section of ring.
			 * If it's marked as READY then we need to prompt our
			 * peer to start processing the ring again.
			 */
			i = (end + 1) % len;
			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;

			/*
			 * Hold the restart lock across all of this to
			 * make sure that it's not possible for us to
			 * decide that a msg needs to be sent in the future
			 * while the sending code, having already checked,
			 * is about to exit.
			 */
			mutex_enter(&dp->restart_lock);
			ldcp->ldc_stats.dring_stopped_acks_rcvd++;
			mutex_enter(&priv_addr->dstate_lock);
			if (pub_addr->hdr.dstate == VIO_DESC_READY) {

				mutex_exit(&priv_addr->dstate_lock);

				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
				dring_pkt->tag.vio_sid = ldcp->local_session;

				dring_pkt->start_idx = (end + 1) % len;
				dring_pkt->end_idx = -1;

				D2(vswp, "%s(%lld) : sending restart msg:"
				    " %d : %d", __func__, ldcp->ldc_id,
				    dring_pkt->start_idx, dring_pkt->end_idx);

				msg_rv = vsw_send_msg(ldcp, (void *)dring_pkt,
				    sizeof (vio_dring_msg_t), B_FALSE);
				ldcp->ldc_stats.dring_data_msgs_sent++;

			} else {
				mutex_exit(&priv_addr->dstate_lock);
				dp->restart_reqd = B_TRUE;
			}
			mutex_exit(&dp->restart_lock);
		}

		if (msg_rv == ECONNRESET)
			vsw_process_conn_evt(ldcp, VSW_CONN_RESET);

		break;

	case VIO_SUBTYPE_NACK:
		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
		    __func__, ldcp->ldc_id);
		/*
		 * Something is badly wrong if we are getting NACK's
		 * for our data pkts. So reset the channel.
		 */
		vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

		break;

	default:
		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
		    ldcp->ldc_id, dring_pkt->tag.vio_subtype);
	}

	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}