1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/errno.h> 31 #include <sys/debug.h> 32 #include <sys/time.h> 33 #include <sys/sysmacros.h> 34 #include <sys/systm.h> 35 #include <sys/user.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strlog.h> 39 #include <sys/strsubr.h> 40 #include <sys/cmn_err.h> 41 #include <sys/cpu.h> 42 #include <sys/kmem.h> 43 #include <sys/conf.h> 44 #include <sys/ddi.h> 45 #include <sys/sunddi.h> 46 #include <sys/ksynch.h> 47 #include <sys/stat.h> 48 #include <sys/kstat.h> 49 #include <sys/vtrace.h> 50 #include <sys/strsun.h> 51 #include <sys/dlpi.h> 52 #include <sys/ethernet.h> 53 #include <net/if.h> 54 #include <sys/varargs.h> 55 #include <sys/machsystm.h> 56 #include <sys/modctl.h> 57 #include <sys/modhash.h> 58 #include <sys/mac.h> 59 #include <sys/mac_ether.h> 60 #include <sys/taskq.h> 61 #include <sys/note.h> 62 #include <sys/mach_descrip.h> 63 #include <sys/mac.h> 64 #include <sys/mdeg.h> 65 #include <sys/ldc.h> 66 #include <sys/vsw_fdb.h> 67 #include <sys/vsw.h> 68 #include <sys/vio_mailbox.h> 69 #include <sys/vnet_mailbox.h> 70 #include <sys/vnet_common.h> 71 #include <sys/vio_util.h> 72 #include <sys/sdt.h> 73 74 /* 75 * Function prototypes. 76 */ 77 static int vsw_attach(dev_info_t *, ddi_attach_cmd_t); 78 static int vsw_detach(dev_info_t *, ddi_detach_cmd_t); 79 static int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); 80 static void vsw_get_md_properties(vsw_t *vswp); 81 static int vsw_get_physaddr(vsw_t *); 82 static int vsw_setup_layer2(vsw_t *); 83 static int vsw_setup_layer3(vsw_t *); 84 85 /* MAC layer routines */ 86 static int vsw_mac_attach(vsw_t *vswp); 87 static void vsw_mac_detach(vsw_t *vswp); 88 static int vsw_get_hw_maddr(vsw_t *); 89 static int vsw_set_hw(vsw_t *, vsw_port_t *); 90 static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *); 91 static int vsw_unset_hw(vsw_t *, vsw_port_t *); 92 static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *); 93 static int vsw_reconfig_hw(vsw_t *); 94 static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *); 95 static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); 96 static int vsw_mac_register(vsw_t *); 97 static int vsw_mac_unregister(vsw_t *); 98 static int vsw_m_stat(void *, uint_t, uint64_t *); 99 static void vsw_m_stop(void *arg); 100 static int vsw_m_start(void *arg); 101 static int vsw_m_unicst(void *arg, const uint8_t *); 102 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *); 103 static int vsw_m_promisc(void *arg, boolean_t); 104 static mblk_t *vsw_m_tx(void *arg, mblk_t *); 105 106 /* MDEG routines */ 107 static void vsw_mdeg_register(vsw_t *vswp); 108 static void vsw_mdeg_unregister(vsw_t *vswp); 109 static int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *); 110 111 /* Port add/deletion routines */ 112 static int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node); 113 static int vsw_port_attach(vsw_t *vswp, int p_instance, 114 uint64_t *ldcids, int nids, struct ether_addr *macaddr); 115 static int vsw_detach_ports(vsw_t *vswp); 116 static int vsw_port_detach(vsw_t *vswp, int p_instance); 117 static int vsw_port_delete(vsw_port_t *port); 118 static int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id); 119 static int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id); 120 static int vsw_init_ldcs(vsw_port_t *port); 121 static int vsw_uninit_ldcs(vsw_port_t *port); 122 static int vsw_ldc_init(vsw_ldc_t *ldcp); 123 static int vsw_ldc_uninit(vsw_ldc_t *ldcp); 124 static int vsw_drain_ldcs(vsw_port_t *port); 125 static int vsw_drain_port_taskq(vsw_port_t *port); 126 static void vsw_marker_task(void *); 127 static vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance); 128 static int vsw_plist_del_node(vsw_t *, vsw_port_t *port); 129 130 /* Interrupt routines */ 131 static uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg); 132 133 /* Handshake routines */ 134 static void vsw_restart_handshake(vsw_ldc_t *); 135 static int vsw_check_flag(vsw_ldc_t *, int, uint64_t); 136 static void vsw_next_milestone(vsw_ldc_t *); 137 static int vsw_supported_version(vio_ver_msg_t *); 138 139 /* Data processing routines */ 140 static void vsw_process_pkt(void *); 141 static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t); 142 static void vsw_process_ctrl_pkt(void *); 143 static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *); 144 static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *); 145 static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *); 146 static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *); 147 static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *); 148 static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *); 149 static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t); 150 static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *); 151 static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *); 152 static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *); 153 static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t); 154 155 /* Switching/data transmit routines */ 156 static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 157 vsw_port_t *port, mac_resource_handle_t); 158 static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 159 vsw_port_t *port, mac_resource_handle_t); 160 static int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, 161 vsw_port_t *port); 162 static int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, 163 vsw_port_t *port); 164 static int vsw_portsend(vsw_port_t *, mblk_t *); 165 static int vsw_dringsend(vsw_ldc_t *, mblk_t *); 166 static int vsw_descrsend(vsw_ldc_t *, mblk_t *); 167 168 /* Packet creation routines */ 169 static void vsw_send_ver(vsw_ldc_t *); 170 static void vsw_send_attr(vsw_ldc_t *); 171 static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *); 172 static void vsw_send_dring_info(vsw_ldc_t *); 173 static void vsw_send_rdx(vsw_ldc_t *); 174 175 static void vsw_send_msg(vsw_ldc_t *, void *, int); 176 177 /* Forwarding database (FDB) routines */ 178 static int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port); 179 static int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port); 180 static vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *); 181 static int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *); 182 static int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *); 183 static int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *); 184 static void vsw_del_addr(uint8_t, void *, uint64_t); 185 static void vsw_del_mcst_port(vsw_port_t *); 186 static void vsw_del_mcst_vsw(vsw_t *); 187 188 /* Dring routines */ 189 static dring_info_t *vsw_create_dring(vsw_ldc_t *); 190 static void vsw_create_privring(vsw_ldc_t *); 191 static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp); 192 static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **, 193 int *); 194 static dring_info_t *vsw_ident2dring(lane_t *, uint64_t); 195 196 static void vsw_set_lane_attr(vsw_t *, lane_t *); 197 static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *); 198 static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg); 199 static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *); 200 static int vsw_check_dring_info(vio_dring_reg_msg_t *); 201 202 /* Misc support routines */ 203 static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf); 204 static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t); 205 static int vsw_free_ring(dring_info_t *); 206 207 208 /* Debugging routines */ 209 static void dump_flags(uint64_t); 210 static void display_state(void); 211 static void display_lane(lane_t *); 212 static void display_ring(dring_info_t *); 213 214 int vsw_num_handshakes = 3; /* # of handshake attempts */ 215 int vsw_wretries = 100; /* # of write attempts */ 216 int vsw_chain_len = 150; /* max # of mblks in msg chain */ 217 int vsw_desc_delay = 0; /* delay in us */ 218 int vsw_read_attempts = 5; /* # of reads of descriptor */ 219 220 uint32_t vsw_mblk_size = VSW_MBLK_SIZE; 221 uint32_t vsw_num_mblks = VSW_NUM_MBLKS; 222 223 224 /* 225 * mode specific frame switching function 226 */ 227 void (*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *, 228 mac_resource_handle_t); 229 230 static mac_callbacks_t vsw_m_callbacks = { 231 0, 232 vsw_m_stat, 233 vsw_m_start, 234 vsw_m_stop, 235 vsw_m_promisc, 236 vsw_m_multicst, 237 vsw_m_unicst, 238 vsw_m_tx, 239 NULL, 240 NULL, 241 NULL 242 }; 243 244 static struct cb_ops vsw_cb_ops = { 245 nulldev, /* cb_open */ 246 nulldev, /* cb_close */ 247 nodev, /* cb_strategy */ 248 nodev, /* cb_print */ 249 nodev, /* cb_dump */ 250 nodev, /* cb_read */ 251 nodev, /* cb_write */ 252 nodev, /* cb_ioctl */ 253 nodev, /* cb_devmap */ 254 nodev, /* cb_mmap */ 255 nodev, /* cb_segmap */ 256 nochpoll, /* cb_chpoll */ 257 ddi_prop_op, /* cb_prop_op */ 258 NULL, /* cb_stream */ 259 D_MP, /* cb_flag */ 260 CB_REV, /* rev */ 261 nodev, /* int (*cb_aread)() */ 262 nodev /* int (*cb_awrite)() */ 263 }; 264 265 static struct dev_ops vsw_ops = { 266 DEVO_REV, /* devo_rev */ 267 0, /* devo_refcnt */ 268 vsw_getinfo, /* devo_getinfo */ 269 nulldev, /* devo_identify */ 270 nulldev, /* devo_probe */ 271 vsw_attach, /* devo_attach */ 272 vsw_detach, /* devo_detach */ 273 nodev, /* devo_reset */ 274 &vsw_cb_ops, /* devo_cb_ops */ 275 (struct bus_ops *)NULL, /* devo_bus_ops */ 276 ddi_power /* devo_power */ 277 }; 278 279 extern struct mod_ops mod_driverops; 280 static struct modldrv vswmodldrv = { 281 &mod_driverops, 282 "sun4v Virtual Switch Driver %I%", 283 &vsw_ops, 284 }; 285 286 #define LDC_ENTER_LOCK(ldcp) \ 287 mutex_enter(&((ldcp)->ldc_cblock));\ 288 mutex_enter(&((ldcp)->ldc_txlock)); 289 #define LDC_EXIT_LOCK(ldcp) \ 290 mutex_exit(&((ldcp)->ldc_txlock));\ 291 mutex_exit(&((ldcp)->ldc_cblock)); 292 293 /* Driver soft state ptr */ 294 static void *vsw_state; 295 296 /* 297 * Linked list of "vsw_t" structures - one per instance. 298 */ 299 vsw_t *vsw_head = NULL; 300 krwlock_t vsw_rw; 301 302 /* 303 * Property names 304 */ 305 static char vdev_propname[] = "virtual-device"; 306 static char vsw_propname[] = "virtual-network-switch"; 307 static char physdev_propname[] = "vsw-phys-dev"; 308 static char smode_propname[] = "vsw-switch-mode"; 309 static char macaddr_propname[] = "local-mac-address"; 310 static char remaddr_propname[] = "remote-mac-address"; 311 static char ldcids_propname[] = "ldc-ids"; 312 static char chan_propname[] = "channel-endpoint"; 313 static char id_propname[] = "id"; 314 static char reg_propname[] = "reg"; 315 316 /* supported versions */ 317 static ver_sup_t vsw_versions[] = { {1, 0} }; 318 319 /* 320 * Matching criteria passed to the MDEG to register interest 321 * in changes to 'virtual-device-port' nodes identified by their 322 * 'id' property. 323 */ 324 static md_prop_match_t vport_prop_match[] = { 325 { MDET_PROP_VAL, "id" }, 326 { MDET_LIST_END, NULL } 327 }; 328 329 static mdeg_node_match_t vport_match = { "virtual-device-port", 330 vport_prop_match }; 331 332 /* 333 * Specification of an MD node passed to the MDEG to filter any 334 * 'vport' nodes that do not belong to the specified node. This 335 * template is copied for each vsw instance and filled in with 336 * the appropriate 'cfg-handle' value before being passed to the MDEG. 337 */ 338 static mdeg_prop_spec_t vsw_prop_template[] = { 339 { MDET_PROP_STR, "name", vsw_propname }, 340 { MDET_PROP_VAL, "cfg-handle", NULL }, 341 { MDET_LIST_END, NULL, NULL } 342 }; 343 344 #define VSW_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val); 345 346 /* 347 * Print debug messages - set to 0x1f to enable all msgs 348 * or 0x0 to turn all off. 349 */ 350 int vswdbg = 0x0; 351 352 /* 353 * debug levels: 354 * 0x01: Function entry/exit tracing 355 * 0x02: Internal function messages 356 * 0x04: Verbose internal messages 357 * 0x08: Warning messages 358 * 0x10: Error messages 359 */ 360 361 static void 362 vswdebug(vsw_t *vswp, const char *fmt, ...) 363 { 364 char buf[512]; 365 va_list ap; 366 367 va_start(ap, fmt); 368 (void) vsprintf(buf, fmt, ap); 369 va_end(ap); 370 371 if (vswp == NULL) 372 cmn_err(CE_CONT, "%s\n", buf); 373 else 374 cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf); 375 } 376 377 /* 378 * For the moment the state dump routines have their own 379 * private flag. 380 */ 381 #define DUMP_STATE 0 382 383 #if DUMP_STATE 384 385 #define DUMP_TAG(tag) \ 386 { \ 387 D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \ 388 D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype); \ 389 D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env); \ 390 } 391 392 #define DUMP_TAG_PTR(tag) \ 393 { \ 394 D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \ 395 D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \ 396 D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env); \ 397 } 398 399 #define DUMP_FLAGS(flags) dump_flags(flags); 400 #define DISPLAY_STATE() display_state() 401 402 #else 403 404 #define DUMP_TAG(tag) 405 #define DUMP_TAG_PTR(tag) 406 #define DUMP_FLAGS(state) 407 #define DISPLAY_STATE() 408 409 #endif /* DUMP_STATE */ 410 411 #ifdef DEBUG 412 413 #define D1 \ 414 if (vswdbg & 0x01) \ 415 vswdebug 416 417 #define D2 \ 418 if (vswdbg & 0x02) \ 419 vswdebug 420 421 #define D3 \ 422 if (vswdbg & 0x04) \ 423 vswdebug 424 425 #define DWARN \ 426 if (vswdbg & 0x08) \ 427 vswdebug 428 429 #define DERR \ 430 if (vswdbg & 0x10) \ 431 vswdebug 432 433 #else 434 435 #define DERR if (0) vswdebug 436 #define DWARN if (0) vswdebug 437 #define D1 if (0) vswdebug 438 #define D2 if (0) vswdebug 439 #define D3 if (0) vswdebug 440 441 #endif /* DEBUG */ 442 443 static struct modlinkage modlinkage = { 444 MODREV_1, 445 &vswmodldrv, 446 NULL 447 }; 448 449 int 450 _init(void) 451 { 452 int status; 453 454 rw_init(&vsw_rw, NULL, RW_DRIVER, NULL); 455 456 status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1); 457 if (status != 0) { 458 return (status); 459 } 460 461 mac_init_ops(&vsw_ops, "vsw"); 462 status = mod_install(&modlinkage); 463 if (status != 0) { 464 ddi_soft_state_fini(&vsw_state); 465 } 466 return (status); 467 } 468 469 int 470 _fini(void) 471 { 472 int status; 473 474 status = mod_remove(&modlinkage); 475 if (status != 0) 476 return (status); 477 mac_fini_ops(&vsw_ops); 478 ddi_soft_state_fini(&vsw_state); 479 480 rw_destroy(&vsw_rw); 481 482 return (status); 483 } 484 485 int 486 _info(struct modinfo *modinfop) 487 { 488 return (mod_info(&modlinkage, modinfop)); 489 } 490 491 static int 492 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 493 { 494 vsw_t *vswp; 495 int instance, i; 496 char hashname[MAXNAMELEN]; 497 char qname[TASKQ_NAMELEN]; 498 int rv = 1; 499 enum { PROG_init = 0x0, PROG_if_lock = 0x1, 500 PROG_fdb = 0x2, PROG_mfdb = 0x4, 501 PROG_report_dev = 0x8, PROG_plist = 0x10, 502 PROG_taskq = 0x20} 503 progress; 504 505 progress = PROG_init; 506 507 switch (cmd) { 508 case DDI_ATTACH: 509 break; 510 case DDI_RESUME: 511 /* nothing to do for this non-device */ 512 return (DDI_SUCCESS); 513 case DDI_PM_RESUME: 514 default: 515 return (DDI_FAILURE); 516 } 517 518 instance = ddi_get_instance(dip); 519 if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) { 520 DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance); 521 return (DDI_FAILURE); 522 } 523 vswp = ddi_get_soft_state(vsw_state, instance); 524 525 if (vswp == NULL) { 526 DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance); 527 goto vsw_attach_fail; 528 } 529 530 vswp->dip = dip; 531 vswp->instance = instance; 532 ddi_set_driver_private(dip, (caddr_t)vswp); 533 534 rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL); 535 536 progress |= PROG_if_lock; 537 538 /* 539 * Get the various properties such as physical device name 540 * (vsw-phys-dev), switch mode etc from the MD. 541 */ 542 vsw_get_md_properties(vswp); 543 544 /* setup the unicast forwarding database */ 545 (void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d", 546 vswp->instance); 547 D2(vswp, "creating unicast hash table (%s)...", hashname); 548 vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS, 549 mod_hash_null_valdtor, sizeof (void *)); 550 551 progress |= PROG_fdb; 552 553 /* setup the multicast fowarding database */ 554 (void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d", 555 vswp->instance); 556 D2(vswp, "creating multicast hash table %s)...", hashname); 557 rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL); 558 vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS, 559 mod_hash_null_valdtor, sizeof (void *)); 560 561 progress |= PROG_mfdb; 562 563 /* 564 * create lock protecting list of multicast addresses 565 * which could come via m_multicst() entry point when plumbed. 566 */ 567 mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL); 568 vswp->mcap = NULL; 569 570 ddi_report_dev(vswp->dip); 571 572 progress |= PROG_report_dev; 573 574 WRITE_ENTER(&vsw_rw); 575 vswp->next = vsw_head; 576 vsw_head = vswp; 577 RW_EXIT(&vsw_rw); 578 579 /* setup the port list */ 580 rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL); 581 vswp->plist.head = NULL; 582 583 progress |= PROG_plist; 584 585 /* 586 * Create the taskq which will process all the VIO 587 * control messages. 588 */ 589 (void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance); 590 if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1, 591 TASKQ_DEFAULTPRI, 0)) == NULL) { 592 cmn_err(CE_WARN, "Unable to create task queue"); 593 goto vsw_attach_fail; 594 } 595 596 progress |= PROG_taskq; 597 598 /* select best switching mode */ 599 for (i = 0; i < vswp->smode_num; i++) { 600 vswp->smode_idx = i; 601 switch (vswp->smode[i]) { 602 case VSW_LAYER2: 603 case VSW_LAYER2_PROMISC: 604 rv = vsw_setup_layer2(vswp); 605 break; 606 607 case VSW_LAYER3: 608 rv = vsw_setup_layer3(vswp); 609 break; 610 611 default: 612 DERR(vswp, "unknown switch mode"); 613 rv = 1; 614 break; 615 } 616 617 if (rv == 0) 618 break; 619 } 620 621 if (rv == 1) { 622 cmn_err(CE_WARN, "Unable to setup switching mode"); 623 goto vsw_attach_fail; 624 } 625 626 D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]); 627 628 /* 629 * Register with the MAC layer as a network device so 630 * we can be plumbed if desired. 631 * 632 * Do this in both layer 2 and layer 3 mode. 633 */ 634 vswp->if_state &= ~VSW_IF_UP; 635 if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) { 636 if (vsw_mac_register(vswp) != 0) { 637 cmn_err(CE_WARN, "Unable to register as provider " 638 " with MAC layer, continuing with attach"); 639 } 640 } 641 642 /* prevent auto-detaching */ 643 if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip, 644 DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) { 645 cmn_err(CE_NOTE, "Unable to set \"%s\" property for " 646 "instance %u", DDI_NO_AUTODETACH, instance); 647 } 648 649 /* 650 * Now we have everything setup, register for MD change 651 * events. 652 */ 653 vsw_mdeg_register(vswp); 654 655 return (DDI_SUCCESS); 656 657 vsw_attach_fail: 658 DERR(NULL, "vsw_attach: failed"); 659 660 if (progress & PROG_taskq) 661 ddi_taskq_destroy(vswp->taskq_p); 662 663 if (progress & PROG_plist) 664 rw_destroy(&vswp->plist.lockrw); 665 666 if (progress & PROG_report_dev) { 667 ddi_remove_minor_node(dip, NULL); 668 mutex_destroy(&vswp->mca_lock); 669 } 670 671 if (progress & PROG_mfdb) { 672 mod_hash_destroy_hash(vswp->mfdb); 673 vswp->mfdb = NULL; 674 rw_destroy(&vswp->mfdbrw); 675 } 676 677 if (progress & PROG_fdb) { 678 mod_hash_destroy_hash(vswp->fdb); 679 vswp->fdb = NULL; 680 } 681 682 if (progress & PROG_if_lock) 683 rw_destroy(&vswp->if_lockrw); 684 685 ddi_soft_state_free(vsw_state, instance); 686 return (DDI_FAILURE); 687 } 688 689 static int 690 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 691 { 692 vio_mblk_pool_t *poolp, *npoolp; 693 vsw_t **vswpp, *vswp; 694 int instance; 695 696 instance = ddi_get_instance(dip); 697 vswp = ddi_get_soft_state(vsw_state, instance); 698 699 if (vswp == NULL) { 700 return (DDI_FAILURE); 701 } 702 703 switch (cmd) { 704 case DDI_DETACH: 705 break; 706 case DDI_SUSPEND: 707 case DDI_PM_SUSPEND: 708 default: 709 return (DDI_FAILURE); 710 } 711 712 D2(vswp, "detaching instance %d", instance); 713 714 if (vswp->mdprops & (VSW_MD_MACADDR | VSW_DEV_MACADDR)) { 715 if (vsw_mac_unregister(vswp) != 0) { 716 cmn_err(CE_WARN, "Unable to detach from MAC layer"); 717 return (DDI_FAILURE); 718 } 719 rw_destroy(&vswp->if_lockrw); 720 } 721 722 vsw_mdeg_unregister(vswp); 723 724 /* remove mac layer callback */ 725 if ((vswp->mh != NULL) && (vswp->mrh != NULL)) { 726 mac_rx_remove(vswp->mh, vswp->mrh); 727 vswp->mrh = NULL; 728 } 729 730 if (vsw_detach_ports(vswp) != 0) { 731 cmn_err(CE_WARN, "Unable to detach ports"); 732 return (DDI_FAILURE); 733 } 734 735 /* 736 * Now that the ports have been deleted, stop and close 737 * the physical device. 738 */ 739 if (vswp->mh != NULL) { 740 mac_stop(vswp->mh); 741 mac_close(vswp->mh); 742 743 vswp->mh = NULL; 744 vswp->txinfo = NULL; 745 } 746 747 /* 748 * Destroy any free pools that may still exist. 749 */ 750 poolp = vswp->rxh; 751 while (poolp != NULL) { 752 npoolp = vswp->rxh = poolp->nextp; 753 if (vio_destroy_mblks(poolp) != 0) { 754 vswp->rxh = poolp; 755 return (DDI_FAILURE); 756 } 757 poolp = npoolp; 758 } 759 760 /* 761 * Remove this instance from any entries it may be on in 762 * the hash table by using the list of addresses maintained 763 * in the vsw_t structure. 764 */ 765 vsw_del_mcst_vsw(vswp); 766 767 vswp->mcap = NULL; 768 mutex_destroy(&vswp->mca_lock); 769 770 /* 771 * By now any pending tasks have finished and the underlying 772 * ldc's have been destroyed, so its safe to delete the control 773 * message taskq. 774 */ 775 if (vswp->taskq_p != NULL) 776 ddi_taskq_destroy(vswp->taskq_p); 777 778 /* 779 * At this stage all the data pointers in the hash table 780 * should be NULL, as all the ports have been removed and will 781 * have deleted themselves from the port lists which the data 782 * pointers point to. Hence we can destroy the table using the 783 * default destructors. 784 */ 785 D2(vswp, "vsw_detach: destroying hash tables.."); 786 mod_hash_destroy_hash(vswp->fdb); 787 vswp->fdb = NULL; 788 789 WRITE_ENTER(&vswp->mfdbrw); 790 mod_hash_destroy_hash(vswp->mfdb); 791 vswp->mfdb = NULL; 792 RW_EXIT(&vswp->mfdbrw); 793 rw_destroy(&vswp->mfdbrw); 794 795 ddi_remove_minor_node(dip, NULL); 796 797 rw_destroy(&vswp->plist.lockrw); 798 WRITE_ENTER(&vsw_rw); 799 for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) { 800 if (*vswpp == vswp) { 801 *vswpp = vswp->next; 802 break; 803 } 804 } 805 RW_EXIT(&vsw_rw); 806 ddi_soft_state_free(vsw_state, instance); 807 808 return (DDI_SUCCESS); 809 } 810 811 static int 812 vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 813 { 814 _NOTE(ARGUNUSED(dip)) 815 816 vsw_t *vswp = NULL; 817 dev_t dev = (dev_t)arg; 818 int instance; 819 820 instance = getminor(dev); 821 822 switch (infocmd) { 823 case DDI_INFO_DEVT2DEVINFO: 824 if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) { 825 *result = NULL; 826 return (DDI_FAILURE); 827 } 828 *result = vswp->dip; 829 return (DDI_SUCCESS); 830 831 case DDI_INFO_DEVT2INSTANCE: 832 *result = (void *)(uintptr_t)instance; 833 return (DDI_SUCCESS); 834 835 default: 836 *result = NULL; 837 return (DDI_FAILURE); 838 } 839 } 840 841 /* 842 * Get the properties from our MD node. 843 */ 844 static void 845 vsw_get_md_properties(vsw_t *vswp) 846 { 847 md_t *mdp = NULL; 848 int num_nodes = 0; 849 int len = 0, listsz = 0; 850 int num_vdev = 0; 851 int i, idx; 852 boolean_t found_node = B_FALSE; 853 char *smode = NULL; 854 char *curr_mode = NULL; 855 char *physname = NULL; 856 char *node_name = NULL; 857 char *dev; 858 uint64_t macaddr = 0; 859 uint64_t md_inst, obp_inst; 860 mde_cookie_t *listp = NULL; 861 mde_cookie_t rootnode; 862 863 D1(vswp, "%s: enter", __func__); 864 865 /* 866 * Further down we compare the obp 'reg' property to the 867 * 'cfg-handle' property in the vsw MD node to determine 868 * if the node refers to this particular instance. So if 869 * we can't read the obp value then there is no point 870 * in proceeding further. 871 */ 872 if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip, 873 DDI_PROP_DONTPASS, reg_propname) != 1) { 874 cmn_err(CE_WARN, "Unable to read %s property " 875 "from OBP device node", reg_propname); 876 return; 877 } 878 879 obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, 880 DDI_PROP_DONTPASS, reg_propname, 0); 881 882 D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst); 883 884 if ((mdp = md_get_handle()) == NULL) { 885 DERR(vswp, "%s: unable to init MD", __func__); 886 return; 887 } 888 889 if ((num_nodes = md_node_count(mdp)) <= 0) { 890 DERR(vswp, "%s: invalid number of nodes found %d", 891 __func__, num_nodes); 892 (void) md_fini_handle(mdp); 893 return; 894 } 895 896 D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes); 897 898 /* allocate enough space for node list */ 899 listsz = num_nodes * sizeof (mde_cookie_t); 900 listp = kmem_zalloc(listsz, KM_SLEEP); 901 902 rootnode = md_root_node(mdp); 903 904 /* Get the list of virtual devices */ 905 num_vdev = md_scan_dag(mdp, rootnode, 906 md_find_name(mdp, vdev_propname), 907 md_find_name(mdp, "fwd"), listp); 908 909 if (num_vdev <= 0) { 910 DERR(vswp, "%s: didn't find any virtual-device nodes in MD", 911 __func__); 912 goto md_prop_exit; 913 } 914 915 D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev); 916 917 /* Look for the virtual switch nodes in the list */ 918 for (idx = 0; idx < num_vdev; idx++) { 919 if (md_get_prop_str(mdp, listp[idx], 920 "name", &node_name) != 0) { 921 DERR(vswp, "%s: unable to get node name", __func__); 922 continue; 923 924 } 925 926 if (strcmp(node_name, vsw_propname) == 0) { 927 /* Virtual switch node */ 928 if (md_get_prop_val(mdp, listp[idx], 929 "cfg-handle", &md_inst) != 0) { 930 DERR(vswp, "%s: unable to get cfg-handle from" 931 " node %d", __func__, idx); 932 goto md_prop_exit; 933 } else if (md_inst == obp_inst) { 934 D2(vswp, "%s: found matching node (%d)" 935 " 0x%llx == 0x%llx", __func__, idx, 936 md_inst, obp_inst); 937 found_node = B_TRUE; 938 break; 939 } 940 } 941 } 942 943 if (!found_node) { 944 DWARN(vswp, "%s: couldn't find correct vsw node", __func__); 945 goto md_prop_exit; 946 } 947 948 /* 949 * Now, having found the correct node, get the various properties. 950 */ 951 952 if (md_get_prop_data(mdp, listp[idx], physdev_propname, 953 (uint8_t **)(&physname), &len) != 0) { 954 cmn_err(CE_WARN, "%s: unable to get name(s) of physical " 955 "device(s) from MD", __func__); 956 } else if ((strlen(physname) + 1) > LIFNAMSIZ) { 957 cmn_err(CE_WARN, "%s is too long a device name", physname); 958 } else { 959 (void) strncpy(vswp->physname, physname, strlen(physname) + 1); 960 vswp->mdprops |= VSW_MD_PHYSNAME; 961 D2(vswp, "%s: using first device specified (%s)", 962 __func__, vswp->physname); 963 } 964 965 #ifdef DEBUG 966 /* 967 * As a temporary measure to aid testing we check to see if there 968 * is a vsw.conf file present. If there is we use the value of the 969 * vsw_physname property in the file as the name of the physical 970 * device, overriding the value from the MD. 971 * 972 * There may be multiple devices listed, but for the moment 973 * we just use the first one. 974 */ 975 if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0, 976 "vsw_physname", &dev) == DDI_PROP_SUCCESS) { 977 if ((strlen(dev) + 1) > LIFNAMSIZ) { 978 cmn_err(CE_WARN, "%s is too long a device name", dev); 979 } else { 980 cmn_err(CE_NOTE, "%s: using device name (%s) from " 981 "config file", __func__, dev); 982 983 (void) strncpy(vswp->physname, dev, strlen(dev) + 1); 984 vswp->mdprops |= VSW_MD_PHYSNAME; 985 } 986 987 ddi_prop_free(dev); 988 989 } 990 #endif 991 992 /* mac address for vswitch device itself */ 993 if (md_get_prop_val(mdp, listp[idx], 994 macaddr_propname, &macaddr) != 0) { 995 cmn_err(CE_WARN, "!Unable to get MAC address from MD"); 996 997 /* 998 * Fallback to using the mac address of the physical 999 * device. 1000 */ 1001 if (vsw_get_physaddr(vswp) == 0) { 1002 cmn_err(CE_NOTE, "!Using MAC address from physical " 1003 "device (%s)", vswp->physname); 1004 } 1005 } else { 1006 READ_ENTER(&vswp->if_lockrw); 1007 for (i = ETHERADDRL - 1; i >= 0; i--) { 1008 vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF; 1009 macaddr >>= 8; 1010 } 1011 RW_EXIT(&vswp->if_lockrw); 1012 vswp->mdprops |= VSW_MD_MACADDR; 1013 } 1014 1015 /* 1016 * Get the switch-mode property. The modes are listed in 1017 * decreasing order of preference, i.e. prefered mode is 1018 * first item in list. 1019 */ 1020 len = 0; 1021 vswp->smode_num = 0; 1022 if (md_get_prop_data(mdp, listp[idx], smode_propname, 1023 (uint8_t **)(&smode), &len) != 0) { 1024 /* 1025 * Unable to get switch-mode property from MD, nothing 1026 * more we can do. 1027 */ 1028 cmn_err(CE_WARN, "!unable to get switch mode property"); 1029 goto md_prop_exit; 1030 } 1031 1032 curr_mode = smode; 1033 /* 1034 * Modes of operation: 1035 * 'switched' - layer 2 switching, underlying HW in 1036 * programmed mode. 1037 * 'promiscuous' - layer 2 switching, underlying HW in 1038 * promiscuous mode. 1039 * 'routed' - layer 3 (i.e. IP) routing, underlying HW 1040 * in non-promiscuous mode. 1041 */ 1042 while ((curr_mode < (smode + len)) && (vswp->smode_num < NUM_SMODES)) { 1043 D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode); 1044 if (strcmp(curr_mode, "switched") == 0) { 1045 vswp->smode[vswp->smode_num++] = VSW_LAYER2; 1046 } else if (strcmp(curr_mode, "promiscuous") == 0) { 1047 vswp->smode[vswp->smode_num++] = VSW_LAYER2_PROMISC; 1048 } else if (strcmp(curr_mode, "routed") == 0) { 1049 vswp->smode[vswp->smode_num++] = VSW_LAYER3; 1050 } else { 1051 cmn_err(CE_WARN, "Unknown switch mode %s, setting to" 1052 " default switched mode", curr_mode); 1053 vswp->smode[vswp->smode_num++] = VSW_LAYER2; 1054 } 1055 curr_mode += strlen(curr_mode) + 1; 1056 } 1057 1058 D2(vswp, "%d switching modes specified", vswp->smode_num); 1059 1060 if (vswp->smode_num > 0) 1061 vswp->mdprops |= VSW_MD_SMODE; 1062 1063 md_prop_exit: 1064 (void) md_fini_handle(mdp); 1065 1066 kmem_free(listp, listsz); 1067 1068 D1(vswp, "%s: exit", __func__); 1069 } 1070 1071 /* 1072 * Get the mac address of the physical device. 1073 * 1074 * Returns 0 on success, 1 on failure. 1075 */ 1076 static int 1077 vsw_get_physaddr(vsw_t *vswp) 1078 { 1079 mac_handle_t mh; 1080 char drv[LIFNAMSIZ]; 1081 uint_t ddi_instance; 1082 1083 D1(vswp, "%s: enter", __func__); 1084 1085 if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) 1086 return (1); 1087 1088 if (mac_open(vswp->physname, ddi_instance, &mh) != 0) { 1089 cmn_err(CE_WARN, "!mac_open %s failed", vswp->physname); 1090 return (1); 1091 } 1092 1093 READ_ENTER(&vswp->if_lockrw); 1094 mac_unicst_get(mh, vswp->if_addr.ether_addr_octet); 1095 RW_EXIT(&vswp->if_lockrw); 1096 1097 mac_close(mh); 1098 1099 vswp->mdprops |= VSW_DEV_MACADDR; 1100 1101 D1(vswp, "%s: exit", __func__); 1102 1103 return (0); 1104 } 1105 1106 /* 1107 * Check to see if the card supports the setting of multiple unicst 1108 * addresses. 1109 * 1110 * Returns 0 if card supports the programming of multiple unicast addresses 1111 * and there are free address slots available, otherwise returns 1. 1112 */ 1113 static int 1114 vsw_get_hw_maddr(vsw_t *vswp) 1115 { 1116 D1(vswp, "%s: enter", __func__); 1117 1118 if (vswp->mh == NULL) { 1119 return (1); 1120 } 1121 1122 if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) { 1123 DWARN(vswp, "Unable to get capabilities of" 1124 " underlying device (%s)", vswp->physname); 1125 return (1); 1126 } 1127 1128 if (vswp->maddr.maddr_naddrfree == 0) { 1129 cmn_err(CE_WARN, "!device %s has no free unicast address slots", 1130 vswp->physname); 1131 return (1); 1132 } 1133 1134 D2(vswp, "%s: %d addrs : %d free", __func__, 1135 vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree); 1136 1137 D1(vswp, "%s: exit", __func__); 1138 1139 return (0); 1140 } 1141 1142 /* 1143 * Setup for layer 2 switching. 1144 * 1145 * Returns 0 on success, 1 on failure. 1146 */ 1147 static int 1148 vsw_setup_layer2(vsw_t *vswp) 1149 { 1150 D1(vswp, "%s: enter", __func__); 1151 1152 vsw_switch_frame = vsw_switch_l2_frame; 1153 1154 /* 1155 * Attempt to link into the MAC layer so we can get 1156 * and send packets out over the physical adapter. 1157 */ 1158 if (vswp->mdprops & VSW_MD_PHYSNAME) { 1159 if (vsw_mac_attach(vswp) != 0) { 1160 /* 1161 * Registration with the MAC layer has failed, 1162 * so return 1 so that can fall back to next 1163 * prefered switching method. 1164 */ 1165 cmn_err(CE_WARN, "!Unable to join as MAC layer " 1166 "client"); 1167 return (1); 1168 } 1169 1170 if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) { 1171 /* 1172 * Verify that underlying device can support multiple 1173 * unicast mac addresses, and has free capacity. 1174 */ 1175 if (vsw_get_hw_maddr(vswp) != 0) { 1176 cmn_err(CE_WARN, "!unable to setup switching"); 1177 vsw_mac_detach(vswp); 1178 return (1); 1179 } 1180 } 1181 1182 } else { 1183 /* 1184 * No physical device name found in MD which is 1185 * required for layer 2. 1186 */ 1187 cmn_err(CE_WARN, "!no physical device name specified"); 1188 return (1); 1189 } 1190 1191 D1(vswp, "%s: exit", __func__); 1192 1193 return (0); 1194 } 1195 1196 static int 1197 vsw_setup_layer3(vsw_t *vswp) 1198 { 1199 D1(vswp, "%s: enter", __func__); 1200 1201 D2(vswp, "%s: operating in layer 3 mode", __func__); 1202 vsw_switch_frame = vsw_switch_l3_frame; 1203 1204 D1(vswp, "%s: exit", __func__); 1205 1206 return (0); 1207 } 1208 1209 /* 1210 * Link into the MAC layer to gain access to the services provided by 1211 * the underlying physical device driver (which should also have 1212 * registered with the MAC layer). 1213 * 1214 * Only when in layer 2 mode. 1215 */ 1216 static int 1217 vsw_mac_attach(vsw_t *vswp) 1218 { 1219 char drv[LIFNAMSIZ]; 1220 uint_t ddi_instance; 1221 1222 D1(vswp, "vsw_mac_attach: enter"); 1223 1224 vswp->mh = NULL; 1225 vswp->mrh = NULL; 1226 1227 ASSERT(vswp->mdprops & VSW_MD_PHYSNAME); 1228 1229 if (ddi_parse(vswp->physname, drv, &ddi_instance) != DDI_SUCCESS) { 1230 cmn_err(CE_WARN, "invalid device name: %s", vswp->physname); 1231 goto mac_fail_exit; 1232 } 1233 if ((mac_open(vswp->physname, ddi_instance, &vswp->mh)) != 0) { 1234 cmn_err(CE_WARN, "mac_open %s failed", vswp->physname); 1235 goto mac_fail_exit; 1236 } 1237 1238 D2(vswp, "vsw_mac_attach: using device %s", vswp->physname); 1239 1240 /* register our rx callback function */ 1241 vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp); 1242 1243 /* get the MAC tx fn */ 1244 vswp->txinfo = mac_tx_get(vswp->mh); 1245 1246 /* start the interface */ 1247 if (mac_start(vswp->mh) != 0) { 1248 cmn_err(CE_WARN, "could not start mac interface"); 1249 goto mac_fail_exit; 1250 } 1251 1252 D1(vswp, "vsw_mac_attach: exit"); 1253 return (0); 1254 1255 mac_fail_exit: 1256 if (vswp->mh != NULL) { 1257 if (vswp->mrh != NULL) 1258 mac_rx_remove(vswp->mh, vswp->mrh); 1259 1260 mac_close(vswp->mh); 1261 } 1262 1263 vswp->mrh = NULL; 1264 vswp->mh = NULL; 1265 vswp->txinfo = NULL; 1266 1267 D1(vswp, "vsw_mac_attach: fail exit"); 1268 return (1); 1269 } 1270 1271 static void 1272 vsw_mac_detach(vsw_t *vswp) 1273 { 1274 D1(vswp, "vsw_mac_detach: enter"); 1275 1276 if (vswp->mh != NULL) { 1277 if (vswp->mrh != NULL) 1278 mac_rx_remove(vswp->mh, vswp->mrh); 1279 1280 mac_stop(vswp->mh); 1281 mac_close(vswp->mh); 1282 } 1283 1284 vswp->mrh = NULL; 1285 vswp->mh = NULL; 1286 vswp->txinfo = NULL; 1287 1288 D1(vswp, "vsw_mac_detach: exit"); 1289 } 1290 1291 /* 1292 * Depending on the mode specified, the capabilites and capacity 1293 * of the underlying device setup the physical device. 1294 * 1295 * If in layer 3 mode, then do nothing. 1296 * 1297 * If in layer 2 programmed mode attempt to program the unicast address 1298 * associated with the port into the physical device. If this is not 1299 * possible due to resource exhaustion or simply because the device does 1300 * not support multiple unicast addresses then if required fallback onto 1301 * putting the card into promisc mode. 1302 * 1303 * If in promisc mode then simply set the card into promisc mode. 1304 * 1305 * Returns 0 success, 1 on failure. 1306 */ 1307 static int 1308 vsw_set_hw(vsw_t *vswp, vsw_port_t *port) 1309 { 1310 mac_multi_addr_t mac_addr; 1311 void *mah; 1312 int err; 1313 1314 D1(vswp, "%s: enter", __func__); 1315 1316 if (vswp->smode[vswp->smode_idx] == VSW_LAYER3) 1317 return (0); 1318 1319 if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) { 1320 return (vsw_set_hw_promisc(vswp, port)); 1321 } 1322 1323 if (vswp->maddr.maddr_handle == NULL) 1324 return (1); 1325 1326 mah = vswp->maddr.maddr_handle; 1327 1328 /* 1329 * Attempt to program the unicast address into the HW. 1330 */ 1331 mac_addr.mma_addrlen = ETHERADDRL; 1332 ether_copy(&port->p_macaddr, &mac_addr.mma_addr); 1333 1334 err = vswp->maddr.maddr_add(mah, &mac_addr); 1335 if (err != 0) { 1336 cmn_err(CE_WARN, "!failed to program addr " 1337 "%x:%x:%x:%x:%x:%x for port %d into device %s " 1338 ": err %d", port->p_macaddr.ether_addr_octet[0], 1339 port->p_macaddr.ether_addr_octet[1], 1340 port->p_macaddr.ether_addr_octet[2], 1341 port->p_macaddr.ether_addr_octet[3], 1342 port->p_macaddr.ether_addr_octet[4], 1343 port->p_macaddr.ether_addr_octet[5], 1344 port->p_instance, vswp->physname, err); 1345 1346 /* 1347 * Mark that attempt should be made to re-config sometime 1348 * in future if a port is deleted. 1349 */ 1350 vswp->recfg_reqd = B_TRUE; 1351 1352 /* 1353 * Only 1 mode specified, nothing more to do. 1354 */ 1355 if (vswp->smode_num == 1) 1356 return (err); 1357 1358 /* 1359 * If promiscuous was next mode specified try to 1360 * set the card into that mode. 1361 */ 1362 if ((vswp->smode_idx <= (vswp->smode_num - 2)) && 1363 (vswp->smode[vswp->smode_idx + 1] 1364 == VSW_LAYER2_PROMISC)) { 1365 vswp->smode_idx += 1; 1366 return (vsw_set_hw_promisc(vswp, port)); 1367 } 1368 return (err); 1369 } 1370 1371 port->addr_slot = mac_addr.mma_slot; 1372 port->addr_set = VSW_ADDR_HW; 1373 1374 D2(vswp, "programmed addr %x:%x:%x:%x:%x:%x for port %d " 1375 "into slot %d of device %s", 1376 port->p_macaddr.ether_addr_octet[0], 1377 port->p_macaddr.ether_addr_octet[1], 1378 port->p_macaddr.ether_addr_octet[2], 1379 port->p_macaddr.ether_addr_octet[3], 1380 port->p_macaddr.ether_addr_octet[4], 1381 port->p_macaddr.ether_addr_octet[5], 1382 port->p_instance, port->addr_slot, vswp->physname); 1383 1384 D1(vswp, "%s: exit", __func__); 1385 1386 return (0); 1387 } 1388 1389 /* 1390 * If in layer 3 mode do nothing. 1391 * 1392 * If in layer 2 switched mode remove the address from the physical 1393 * device. 1394 * 1395 * If in layer 2 promiscuous mode disable promisc mode. 1396 * 1397 * Returns 0 on success. 1398 */ 1399 static int 1400 vsw_unset_hw(vsw_t *vswp, vsw_port_t *port) 1401 { 1402 int err; 1403 void *mah; 1404 1405 D1(vswp, "%s: enter", __func__); 1406 1407 if (vswp->smode[vswp->smode_idx] == VSW_LAYER3) 1408 return (0); 1409 1410 if (port->addr_set == VSW_ADDR_PROMISC) { 1411 return (vsw_unset_hw_promisc(vswp, port)); 1412 } 1413 1414 if (port->addr_set == VSW_ADDR_HW) { 1415 if (vswp->mh == NULL) 1416 return (1); 1417 1418 if (vswp->maddr.maddr_handle == NULL) 1419 return (1); 1420 1421 mah = vswp->maddr.maddr_handle; 1422 1423 err = vswp->maddr.maddr_remove(mah, port->addr_slot); 1424 if (err != 0) { 1425 cmn_err(CE_WARN, "!Unable to remove addr " 1426 "%x:%x:%x:%x:%x:%x for port %d from device %s" 1427 " : (err %d)", 1428 port->p_macaddr.ether_addr_octet[0], 1429 port->p_macaddr.ether_addr_octet[1], 1430 port->p_macaddr.ether_addr_octet[2], 1431 port->p_macaddr.ether_addr_octet[3], 1432 port->p_macaddr.ether_addr_octet[4], 1433 port->p_macaddr.ether_addr_octet[5], 1434 port->p_instance, vswp->physname, err); 1435 return (err); 1436 } 1437 1438 port->addr_set = VSW_ADDR_UNSET; 1439 1440 D2(vswp, "removed addr %x:%x:%x:%x:%x:%x for " 1441 "port %d from device %s", 1442 port->p_macaddr.ether_addr_octet[0], 1443 port->p_macaddr.ether_addr_octet[1], 1444 port->p_macaddr.ether_addr_octet[2], 1445 port->p_macaddr.ether_addr_octet[3], 1446 port->p_macaddr.ether_addr_octet[4], 1447 port->p_macaddr.ether_addr_octet[5], 1448 port->p_instance, vswp->physname); 1449 } 1450 1451 D1(vswp, "%s: exit", __func__); 1452 return (0); 1453 } 1454 1455 /* 1456 * Set network card into promisc mode. 1457 * 1458 * Returns 0 on success, 1 on failure. 1459 */ 1460 static int 1461 vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port) 1462 { 1463 D1(vswp, "%s: enter", __func__); 1464 1465 if (vswp->mh == NULL) 1466 return (1); 1467 1468 if (vswp->promisc_cnt++ == 0) { 1469 if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) { 1470 vswp->promisc_cnt--; 1471 return (1); 1472 } 1473 cmn_err(CE_NOTE, "!switching device %s into promiscuous mode", 1474 vswp->physname); 1475 } 1476 port->addr_set = VSW_ADDR_PROMISC; 1477 1478 D1(vswp, "%s: exit", __func__); 1479 1480 return (0); 1481 } 1482 1483 /* 1484 * Turn off promiscuous mode on network card. 1485 * 1486 * Returns 0 on success, 1 on failure. 1487 */ 1488 static int 1489 vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port) 1490 { 1491 vsw_port_list_t *plist = &vswp->plist; 1492 1493 D1(vswp, "%s: enter", __func__); 1494 1495 if (vswp->mh == NULL) 1496 return (1); 1497 1498 ASSERT(port->addr_set == VSW_ADDR_PROMISC); 1499 1500 if (--vswp->promisc_cnt == 0) { 1501 if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) { 1502 vswp->promisc_cnt++; 1503 return (1); 1504 } 1505 1506 /* 1507 * We are exiting promisc mode either because we were 1508 * only in promisc mode because we had failed over from 1509 * switched mode due to HW resource issues, or the user 1510 * wanted the card in promisc mode for all the ports and 1511 * the last port is now being deleted. Tweak the message 1512 * accordingly. 1513 */ 1514 if (plist->num_ports != 0) { 1515 cmn_err(CE_NOTE, "!switching device %s back to " 1516 "programmed mode", vswp->physname); 1517 } else { 1518 cmn_err(CE_NOTE, "!switching device %s out of " 1519 "promiscuous mode", vswp->physname); 1520 } 1521 } 1522 port->addr_set = VSW_ADDR_UNSET; 1523 1524 D1(vswp, "%s: exit", __func__); 1525 return (0); 1526 } 1527 1528 /* 1529 * Determine whether or not we are operating in our prefered 1530 * mode and if not whether the physical resources now allow us 1531 * to operate in it. 1532 * 1533 * Should only be invoked after port which is being deleted has been 1534 * removed from the port list. 1535 */ 1536 static int 1537 vsw_reconfig_hw(vsw_t *vswp) 1538 { 1539 vsw_port_list_t *plist = &vswp->plist; 1540 mac_multi_addr_t mac_addr; 1541 vsw_port_t *tp; 1542 void *mah; 1543 int rv = 0; 1544 int s_idx; 1545 1546 D1(vswp, "%s: enter", __func__); 1547 1548 if (vswp->maddr.maddr_handle == NULL) 1549 return (1); 1550 1551 /* 1552 * Check if there are now sufficient HW resources to 1553 * attempt a re-config. 1554 */ 1555 if (plist->num_ports > vswp->maddr.maddr_naddrfree) 1556 return (1); 1557 1558 /* 1559 * If we are in layer 2 (i.e. switched) or would like to be 1560 * in layer 2 then check if any ports need to be programmed 1561 * into the HW. 1562 * 1563 * This can happen in two cases - switched was specified as 1564 * the prefered mode of operation but we exhausted the HW 1565 * resources and so failed over to the next specifed mode, 1566 * or switched was the only mode specified so after HW 1567 * resources were exhausted there was nothing more we 1568 * could do. 1569 */ 1570 if (vswp->smode_idx > 0) 1571 s_idx = vswp->smode_idx - 1; 1572 else 1573 s_idx = vswp->smode_idx; 1574 1575 if (vswp->smode[s_idx] == VSW_LAYER2) { 1576 mah = vswp->maddr.maddr_handle; 1577 1578 D2(vswp, "%s: attempting reconfig..", __func__); 1579 1580 /* 1581 * Scan the port list for any port whose address has not 1582 * be programmed in HW - there should be a max of one. 1583 */ 1584 for (tp = plist->head; tp != NULL; tp = tp->p_next) { 1585 if (tp->addr_set != VSW_ADDR_HW) { 1586 mac_addr.mma_addrlen = ETHERADDRL; 1587 ether_copy(&tp->p_macaddr, &mac_addr.mma_addr); 1588 1589 rv = vswp->maddr.maddr_add(mah, &mac_addr); 1590 if (rv != 0) { 1591 DWARN(vswp, "Error setting addr in " 1592 "HW for port %d err %d", 1593 tp->p_instance, rv); 1594 goto reconfig_err_exit; 1595 } 1596 tp->addr_slot = mac_addr.mma_slot; 1597 1598 D2(vswp, "re-programmed port %d " 1599 "addr %x:%x:%x:%x:%x:%x into slot %d" 1600 " of device %s", tp->p_instance, 1601 tp->p_macaddr.ether_addr_octet[0], 1602 tp->p_macaddr.ether_addr_octet[1], 1603 tp->p_macaddr.ether_addr_octet[2], 1604 tp->p_macaddr.ether_addr_octet[3], 1605 tp->p_macaddr.ether_addr_octet[4], 1606 tp->p_macaddr.ether_addr_octet[5], 1607 tp->addr_slot, vswp->physname); 1608 1609 /* 1610 * If up to now we had to put the card into 1611 * promisc mode to see this address, we 1612 * can now safely disable promisc mode. 1613 */ 1614 if (tp->addr_set == VSW_ADDR_PROMISC) 1615 (void) vsw_unset_hw_promisc(vswp, tp); 1616 1617 tp->addr_set = VSW_ADDR_HW; 1618 } 1619 } 1620 1621 /* no further re-config needed */ 1622 vswp->recfg_reqd = B_FALSE; 1623 1624 vswp->smode_idx = s_idx; 1625 1626 return (0); 1627 } 1628 1629 reconfig_err_exit: 1630 return (rv); 1631 } 1632 1633 /* 1634 * receive callback routine. Invoked by MAC layer when there 1635 * are pkts being passed up from physical device. 1636 * 1637 * PERF: It may be more efficient when the card is in promisc 1638 * mode to check the dest address of the pkts here (against 1639 * the FDB) rather than checking later. Needs to be investigated. 1640 */ 1641 static void 1642 vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) 1643 { 1644 _NOTE(ARGUNUSED(mrh)) 1645 1646 vsw_t *vswp = (vsw_t *)arg; 1647 1648 ASSERT(vswp != NULL); 1649 1650 D1(vswp, "vsw_rx_cb: enter"); 1651 1652 /* switch the chain of packets received */ 1653 vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); 1654 1655 D1(vswp, "vsw_rx_cb: exit"); 1656 } 1657 1658 /* 1659 * Send a message out over the physical device via the MAC layer. 1660 * 1661 * Returns any mblks that it was unable to transmit. 1662 */ 1663 static mblk_t * 1664 vsw_tx_msg(vsw_t *vswp, mblk_t *mp) 1665 { 1666 const mac_txinfo_t *mtp; 1667 mblk_t *nextp; 1668 1669 if (vswp->mh == NULL) { 1670 DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail"); 1671 return (mp); 1672 } else { 1673 for (;;) { 1674 nextp = mp->b_next; 1675 mp->b_next = NULL; 1676 1677 mtp = vswp->txinfo; 1678 if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) { 1679 mp->b_next = nextp; 1680 break; 1681 } 1682 1683 if ((mp = nextp) == NULL) 1684 break; 1685 1686 } 1687 1688 } 1689 1690 return (mp); 1691 } 1692 1693 /* 1694 * Register with the MAC layer as a network device, so we 1695 * can be plumbed if necessary. 1696 */ 1697 static int 1698 vsw_mac_register(vsw_t *vswp) 1699 { 1700 mac_register_t *macp; 1701 int rv; 1702 1703 D1(vswp, "%s: enter", __func__); 1704 1705 if ((macp = mac_alloc(MAC_VERSION)) == NULL) 1706 return (EINVAL); 1707 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1708 macp->m_driver = vswp; 1709 macp->m_dip = vswp->dip; 1710 macp->m_src_addr = (uint8_t *)&vswp->if_addr; 1711 macp->m_callbacks = &vsw_m_callbacks; 1712 macp->m_min_sdu = 0; 1713 macp->m_max_sdu = ETHERMTU; 1714 rv = mac_register(macp, &vswp->if_mh); 1715 mac_free(macp); 1716 if (rv == 0) 1717 vswp->if_state |= VSW_IF_REG; 1718 1719 D1(vswp, "%s: exit", __func__); 1720 1721 return (rv); 1722 } 1723 1724 static int 1725 vsw_mac_unregister(vsw_t *vswp) 1726 { 1727 int rv = 0; 1728 1729 D1(vswp, "%s: enter", __func__); 1730 1731 WRITE_ENTER(&vswp->if_lockrw); 1732 1733 if (vswp->if_state & VSW_IF_REG) { 1734 rv = mac_unregister(vswp->if_mh); 1735 if (rv != 0) { 1736 DWARN(vswp, "%s: unable to unregister from MAC " 1737 "framework", __func__); 1738 1739 RW_EXIT(&vswp->if_lockrw); 1740 D1(vswp, "%s: fail exit", __func__); 1741 return (rv); 1742 } 1743 1744 /* mark i/f as down and unregistered */ 1745 vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG); 1746 } 1747 RW_EXIT(&vswp->if_lockrw); 1748 1749 vswp->mdprops &= ~(VSW_MD_MACADDR | VSW_DEV_MACADDR); 1750 1751 D1(vswp, "%s: exit", __func__); 1752 1753 return (rv); 1754 } 1755 1756 static int 1757 vsw_m_stat(void *arg, uint_t stat, uint64_t *val) 1758 { 1759 vsw_t *vswp = (vsw_t *)arg; 1760 1761 D1(vswp, "%s: enter", __func__); 1762 1763 if (vswp->mh == NULL) 1764 return (EINVAL); 1765 1766 /* return stats from underlying device */ 1767 *val = mac_stat_get(vswp->mh, stat); 1768 return (0); 1769 } 1770 1771 static void 1772 vsw_m_stop(void *arg) 1773 { 1774 vsw_t *vswp = (vsw_t *)arg; 1775 1776 D1(vswp, "%s: enter", __func__); 1777 1778 WRITE_ENTER(&vswp->if_lockrw); 1779 vswp->if_state &= ~VSW_IF_UP; 1780 RW_EXIT(&vswp->if_lockrw); 1781 1782 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 1783 } 1784 1785 static int 1786 vsw_m_start(void *arg) 1787 { 1788 vsw_t *vswp = (vsw_t *)arg; 1789 1790 D1(vswp, "%s: enter", __func__); 1791 1792 WRITE_ENTER(&vswp->if_lockrw); 1793 vswp->if_state |= VSW_IF_UP; 1794 RW_EXIT(&vswp->if_lockrw); 1795 1796 D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); 1797 return (0); 1798 } 1799 1800 /* 1801 * Change the local interface address. 1802 */ 1803 static int 1804 vsw_m_unicst(void *arg, const uint8_t *macaddr) 1805 { 1806 vsw_t *vswp = (vsw_t *)arg; 1807 1808 D1(vswp, "%s: enter", __func__); 1809 1810 WRITE_ENTER(&vswp->if_lockrw); 1811 ether_copy(macaddr, &vswp->if_addr); 1812 RW_EXIT(&vswp->if_lockrw); 1813 1814 D1(vswp, "%s: exit", __func__); 1815 1816 return (0); 1817 } 1818 1819 static int 1820 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) 1821 { 1822 vsw_t *vswp = (vsw_t *)arg; 1823 mcst_addr_t *mcst_p = NULL; 1824 uint64_t addr = 0x0; 1825 int i, ret = 0; 1826 1827 D1(vswp, "%s: enter", __func__); 1828 1829 /* 1830 * Convert address into form that can be used 1831 * as hash table key. 1832 */ 1833 for (i = 0; i < ETHERADDRL; i++) { 1834 addr = (addr << 8) | mca[i]; 1835 } 1836 1837 D2(vswp, "%s: addr = 0x%llx", __func__, addr); 1838 1839 if (add) { 1840 D2(vswp, "%s: adding multicast", __func__); 1841 if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 1842 /* 1843 * Update the list of multicast addresses 1844 * contained within the vsw_t structure to 1845 * include this new one. 1846 */ 1847 mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP); 1848 if (mcst_p == NULL) { 1849 DERR(vswp, "%s unable to alloc mem", __func__); 1850 return (1); 1851 } 1852 mcst_p->addr = addr; 1853 1854 mutex_enter(&vswp->mca_lock); 1855 mcst_p->nextp = vswp->mcap; 1856 vswp->mcap = mcst_p; 1857 mutex_exit(&vswp->mca_lock); 1858 1859 /* 1860 * Call into the underlying driver to program the 1861 * address into HW. 1862 */ 1863 if (vswp->mh != NULL) { 1864 ret = mac_multicst_add(vswp->mh, mca); 1865 if (ret != 0) { 1866 cmn_err(CE_WARN, "!unable to add " 1867 "multicast address"); 1868 goto vsw_remove_addr; 1869 } 1870 } 1871 } else { 1872 cmn_err(CE_WARN, "!unable to add multicast address"); 1873 } 1874 return (ret); 1875 } 1876 1877 vsw_remove_addr: 1878 1879 D2(vswp, "%s: removing multicast", __func__); 1880 /* 1881 * Remove the address from the hash table.. 1882 */ 1883 if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) { 1884 1885 /* 1886 * ..and then from the list maintained in the 1887 * vsw_t structure. 1888 */ 1889 vsw_del_addr(VSW_LOCALDEV, vswp, addr); 1890 1891 if (vswp->mh != NULL) 1892 (void) mac_multicst_remove(vswp->mh, mca); 1893 } 1894 1895 D1(vswp, "%s: exit", __func__); 1896 1897 return (0); 1898 } 1899 1900 static int 1901 vsw_m_promisc(void *arg, boolean_t on) 1902 { 1903 vsw_t *vswp = (vsw_t *)arg; 1904 1905 D1(vswp, "%s: enter", __func__); 1906 1907 WRITE_ENTER(&vswp->if_lockrw); 1908 if (on) 1909 vswp->if_state |= VSW_IF_PROMISC; 1910 else 1911 vswp->if_state &= ~VSW_IF_PROMISC; 1912 RW_EXIT(&vswp->if_lockrw); 1913 1914 D1(vswp, "%s: exit", __func__); 1915 1916 return (0); 1917 } 1918 1919 static mblk_t * 1920 vsw_m_tx(void *arg, mblk_t *mp) 1921 { 1922 vsw_t *vswp = (vsw_t *)arg; 1923 1924 D1(vswp, "%s: enter", __func__); 1925 1926 vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL); 1927 1928 D1(vswp, "%s: exit", __func__); 1929 1930 return (NULL); 1931 } 1932 1933 /* 1934 * Register for machine description (MD) updates. 1935 */ 1936 static void 1937 vsw_mdeg_register(vsw_t *vswp) 1938 { 1939 mdeg_prop_spec_t *pspecp; 1940 mdeg_node_spec_t *inst_specp; 1941 mdeg_handle_t mdeg_hdl; 1942 size_t templatesz; 1943 int inst, rv; 1944 1945 D1(vswp, "%s: enter", __func__); 1946 1947 inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip, 1948 DDI_PROP_DONTPASS, reg_propname, -1); 1949 if (inst == -1) { 1950 DERR(vswp, "%s: unable to get %s property", 1951 __func__, reg_propname); 1952 return; 1953 } 1954 1955 D2(vswp, "%s: instance %d registering with mdeg", __func__, inst); 1956 1957 /* 1958 * Allocate and initialize a per-instance copy 1959 * of the global property spec array that will 1960 * uniquely identify this vsw instance. 1961 */ 1962 templatesz = sizeof (vsw_prop_template); 1963 pspecp = kmem_zalloc(templatesz, KM_SLEEP); 1964 1965 bcopy(vsw_prop_template, pspecp, templatesz); 1966 1967 VSW_SET_MDEG_PROP_INST(pspecp, inst); 1968 1969 /* initialize the complete prop spec structure */ 1970 inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP); 1971 inst_specp->namep = "virtual-device"; 1972 inst_specp->specp = pspecp; 1973 1974 /* perform the registration */ 1975 rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb, 1976 (void *)vswp, &mdeg_hdl); 1977 1978 if (rv != MDEG_SUCCESS) { 1979 DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv); 1980 kmem_free(inst_specp, sizeof (mdeg_node_spec_t)); 1981 kmem_free(pspecp, templatesz); 1982 return; 1983 } 1984 1985 /* save off data that will be needed later */ 1986 vswp->inst_spec = inst_specp; 1987 vswp->mdeg_hdl = mdeg_hdl; 1988 1989 D1(vswp, "%s: exit", __func__); 1990 } 1991 1992 static void 1993 vsw_mdeg_unregister(vsw_t *vswp) 1994 { 1995 D1(vswp, "vsw_mdeg_unregister: enter"); 1996 1997 (void) mdeg_unregister(vswp->mdeg_hdl); 1998 1999 if (vswp->inst_spec->specp != NULL) { 2000 (void) kmem_free(vswp->inst_spec->specp, 2001 sizeof (vsw_prop_template)); 2002 vswp->inst_spec->specp = NULL; 2003 } 2004 2005 if (vswp->inst_spec != NULL) { 2006 (void) kmem_free(vswp->inst_spec, 2007 sizeof (mdeg_node_spec_t)); 2008 vswp->inst_spec = NULL; 2009 } 2010 2011 D1(vswp, "vsw_mdeg_unregister: exit"); 2012 } 2013 2014 static int 2015 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp) 2016 { 2017 vsw_t *vswp; 2018 int idx; 2019 md_t *mdp; 2020 mde_cookie_t node; 2021 uint64_t inst; 2022 2023 if (resp == NULL) 2024 return (MDEG_FAILURE); 2025 2026 vswp = (vsw_t *)cb_argp; 2027 2028 D1(vswp, "%s: added %d : removed %d : matched %d", 2029 __func__, resp->added.nelem, resp->removed.nelem, 2030 resp->match_prev.nelem); 2031 2032 /* process added ports */ 2033 for (idx = 0; idx < resp->added.nelem; idx++) { 2034 mdp = resp->added.mdp; 2035 node = resp->added.mdep[idx]; 2036 2037 D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node); 2038 2039 if (vsw_port_add(vswp, mdp, &node) != 0) { 2040 cmn_err(CE_WARN, "Unable to add new port (0x%lx)", 2041 node); 2042 } 2043 } 2044 2045 /* process removed ports */ 2046 for (idx = 0; idx < resp->removed.nelem; idx++) { 2047 mdp = resp->removed.mdp; 2048 node = resp->removed.mdep[idx]; 2049 2050 if (md_get_prop_val(mdp, node, id_propname, &inst)) { 2051 DERR(vswp, "%s: prop(%s) not found port(%d)", 2052 __func__, id_propname, idx); 2053 continue; 2054 } 2055 2056 D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node); 2057 2058 if (vsw_port_detach(vswp, inst) != 0) { 2059 cmn_err(CE_WARN, "Unable to remove port %ld", inst); 2060 } 2061 } 2062 2063 /* 2064 * Currently no support for updating already active ports. 2065 * So, ignore the match_curr and match_priv arrays for now. 2066 */ 2067 2068 D1(vswp, "%s: exit", __func__); 2069 2070 return (MDEG_SUCCESS); 2071 } 2072 2073 /* 2074 * Add a new port to the system. 2075 * 2076 * Returns 0 on success, 1 on failure. 2077 */ 2078 int 2079 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node) 2080 { 2081 uint64_t ldc_id; 2082 uint8_t *addrp; 2083 int i, addrsz; 2084 int num_nodes = 0, nchan = 0; 2085 int listsz = 0; 2086 mde_cookie_t *listp = NULL; 2087 struct ether_addr ea; 2088 uint64_t macaddr; 2089 uint64_t inst = 0; 2090 vsw_port_t *port; 2091 2092 if (md_get_prop_val(mdp, *node, id_propname, &inst)) { 2093 DWARN(vswp, "%s: prop(%s) not found", __func__, 2094 id_propname); 2095 return (1); 2096 } 2097 2098 /* 2099 * Find the channel endpoint node(s) (which should be under this 2100 * port node) which contain the channel id(s). 2101 */ 2102 if ((num_nodes = md_node_count(mdp)) <= 0) { 2103 DERR(vswp, "%s: invalid number of nodes found (%d)", 2104 __func__, num_nodes); 2105 return (1); 2106 } 2107 2108 /* allocate enough space for node list */ 2109 listsz = num_nodes * sizeof (mde_cookie_t); 2110 listp = kmem_zalloc(listsz, KM_SLEEP); 2111 2112 nchan = md_scan_dag(mdp, *node, 2113 md_find_name(mdp, chan_propname), 2114 md_find_name(mdp, "fwd"), listp); 2115 2116 if (nchan <= 0) { 2117 DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname); 2118 kmem_free(listp, listsz); 2119 return (1); 2120 } 2121 2122 D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname); 2123 2124 /* use property from first node found */ 2125 if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) { 2126 DWARN(vswp, "%s: prop(%s) not found\n", __func__, 2127 id_propname); 2128 kmem_free(listp, listsz); 2129 return (1); 2130 } 2131 2132 /* don't need list any more */ 2133 kmem_free(listp, listsz); 2134 2135 D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id); 2136 2137 /* read mac-address property */ 2138 if (md_get_prop_data(mdp, *node, remaddr_propname, 2139 &addrp, &addrsz)) { 2140 DWARN(vswp, "%s: prop(%s) not found", 2141 __func__, remaddr_propname); 2142 return (1); 2143 } 2144 2145 if (addrsz < ETHERADDRL) { 2146 DWARN(vswp, "%s: invalid address size", __func__); 2147 return (1); 2148 } 2149 2150 macaddr = *((uint64_t *)addrp); 2151 D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr); 2152 2153 for (i = ETHERADDRL - 1; i >= 0; i--) { 2154 ea.ether_addr_octet[i] = macaddr & 0xFF; 2155 macaddr >>= 8; 2156 } 2157 2158 if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) { 2159 DERR(vswp, "%s: failed to attach port", __func__); 2160 return (1); 2161 } 2162 2163 port = vsw_lookup_port(vswp, (int)inst); 2164 2165 /* just successfuly created the port, so it should exist */ 2166 ASSERT(port != NULL); 2167 2168 return (0); 2169 } 2170 2171 /* 2172 * Attach the specified port. 2173 * 2174 * Returns 0 on success, 1 on failure. 2175 */ 2176 static int 2177 vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids, 2178 struct ether_addr *macaddr) 2179 { 2180 vsw_port_list_t *plist = &vswp->plist; 2181 vsw_port_t *port, **prev_port; 2182 int i; 2183 2184 D1(vswp, "%s: enter : port %d", __func__, p_instance); 2185 2186 /* port already exists? */ 2187 READ_ENTER(&plist->lockrw); 2188 for (port = plist->head; port != NULL; port = port->p_next) { 2189 if (port->p_instance == p_instance) { 2190 DWARN(vswp, "%s: port instance %d already attached", 2191 __func__, p_instance); 2192 RW_EXIT(&plist->lockrw); 2193 return (1); 2194 } 2195 } 2196 RW_EXIT(&plist->lockrw); 2197 2198 port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP); 2199 port->p_vswp = vswp; 2200 port->p_instance = p_instance; 2201 port->p_ldclist.num_ldcs = 0; 2202 port->p_ldclist.head = NULL; 2203 port->addr_set = VSW_ADDR_UNSET; 2204 2205 rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL); 2206 2207 mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); 2208 mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); 2209 2210 mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL); 2211 cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL); 2212 2213 mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); 2214 cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); 2215 port->state = VSW_PORT_INIT; 2216 2217 if (nids > VSW_PORT_MAX_LDCS) { 2218 D2(vswp, "%s: using first of %d ldc ids", 2219 __func__, nids); 2220 nids = VSW_PORT_MAX_LDCS; 2221 } 2222 2223 D2(vswp, "%s: %d nids", __func__, nids); 2224 for (i = 0; i < nids; i++) { 2225 D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); 2226 if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { 2227 DERR(vswp, "%s: ldc_attach failed", __func__); 2228 2229 rw_destroy(&port->p_ldclist.lockrw); 2230 2231 cv_destroy(&port->ref_cv); 2232 mutex_destroy(&port->ref_lock); 2233 2234 cv_destroy(&port->state_cv); 2235 mutex_destroy(&port->state_lock); 2236 2237 mutex_destroy(&port->tx_lock); 2238 mutex_destroy(&port->mca_lock); 2239 kmem_free(port, sizeof (vsw_port_t)); 2240 return (1); 2241 } 2242 } 2243 2244 ether_copy(macaddr, &port->p_macaddr); 2245 2246 WRITE_ENTER(&plist->lockrw); 2247 2248 /* create the fdb entry for this port/mac address */ 2249 (void) vsw_add_fdb(vswp, port); 2250 2251 (void) vsw_set_hw(vswp, port); 2252 2253 /* link it into the list of ports for this vsw instance */ 2254 prev_port = (vsw_port_t **)(&plist->head); 2255 port->p_next = *prev_port; 2256 *prev_port = port; 2257 plist->num_ports++; 2258 RW_EXIT(&plist->lockrw); 2259 2260 /* 2261 * Initialise the port and any ldc's under it. 2262 */ 2263 (void) vsw_init_ldcs(port); 2264 2265 D1(vswp, "%s: exit", __func__); 2266 return (0); 2267 } 2268 2269 /* 2270 * Detach the specified port. 2271 * 2272 * Returns 0 on success, 1 on failure. 2273 */ 2274 static int 2275 vsw_port_detach(vsw_t *vswp, int p_instance) 2276 { 2277 vsw_port_t *port = NULL; 2278 vsw_port_list_t *plist = &vswp->plist; 2279 2280 D1(vswp, "%s: enter: port id %d", __func__, p_instance); 2281 2282 WRITE_ENTER(&plist->lockrw); 2283 2284 if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) { 2285 RW_EXIT(&plist->lockrw); 2286 return (1); 2287 } 2288 2289 if (vsw_plist_del_node(vswp, port)) { 2290 RW_EXIT(&plist->lockrw); 2291 return (1); 2292 } 2293 2294 /* Remove address if was programmed into HW. */ 2295 (void) vsw_unset_hw(vswp, port); 2296 2297 /* Remove the fdb entry for this port/mac address */ 2298 (void) vsw_del_fdb(vswp, port); 2299 2300 /* Remove any multicast addresses.. */ 2301 vsw_del_mcst_port(port); 2302 2303 /* 2304 * No longer need to hold writer lock on port list now 2305 * that we have unlinked the target port from the list. 2306 */ 2307 RW_EXIT(&plist->lockrw); 2308 2309 READ_ENTER(&plist->lockrw); 2310 2311 if (vswp->recfg_reqd) 2312 (void) vsw_reconfig_hw(vswp); 2313 2314 RW_EXIT(&plist->lockrw); 2315 2316 if (vsw_port_delete(port)) { 2317 return (1); 2318 } 2319 2320 D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance); 2321 return (0); 2322 } 2323 2324 /* 2325 * Detach all active ports. 2326 * 2327 * Returns 0 on success, 1 on failure. 2328 */ 2329 static int 2330 vsw_detach_ports(vsw_t *vswp) 2331 { 2332 vsw_port_list_t *plist = &vswp->plist; 2333 vsw_port_t *port = NULL; 2334 2335 D1(vswp, "%s: enter", __func__); 2336 2337 WRITE_ENTER(&plist->lockrw); 2338 2339 while ((port = plist->head) != NULL) { 2340 if (vsw_plist_del_node(vswp, port)) { 2341 DERR(vswp, "%s: Error deleting port %d" 2342 " from port list", __func__, 2343 port->p_instance); 2344 RW_EXIT(&plist->lockrw); 2345 return (1); 2346 } 2347 2348 /* Remove address if was programmed into HW. */ 2349 (void) vsw_unset_hw(vswp, port); 2350 2351 /* Remove the fdb entry for this port/mac address */ 2352 (void) vsw_del_fdb(vswp, port); 2353 2354 /* Remove any multicast addresses.. */ 2355 vsw_del_mcst_port(port); 2356 2357 /* 2358 * No longer need to hold the lock on the port list 2359 * now that we have unlinked the target port from the 2360 * list. 2361 */ 2362 RW_EXIT(&plist->lockrw); 2363 if (vsw_port_delete(port)) { 2364 DERR(vswp, "%s: Error deleting port %d", 2365 __func__, port->p_instance); 2366 return (1); 2367 } 2368 WRITE_ENTER(&plist->lockrw); 2369 } 2370 RW_EXIT(&plist->lockrw); 2371 2372 D1(vswp, "%s: exit", __func__); 2373 2374 return (0); 2375 } 2376 2377 /* 2378 * Delete the specified port. 2379 * 2380 * Returns 0 on success, 1 on failure. 2381 */ 2382 static int 2383 vsw_port_delete(vsw_port_t *port) 2384 { 2385 vsw_ldc_list_t *ldcl; 2386 vsw_t *vswp = port->p_vswp; 2387 2388 D1(vswp, "%s: enter : port id %d", __func__, port->p_instance); 2389 2390 (void) vsw_uninit_ldcs(port); 2391 2392 /* 2393 * Wait for any pending ctrl msg tasks which reference this 2394 * port to finish. 2395 */ 2396 if (vsw_drain_port_taskq(port)) 2397 return (1); 2398 2399 /* 2400 * Wait for port reference count to hit zero. 2401 */ 2402 mutex_enter(&port->ref_lock); 2403 while (port->ref_cnt != 0) 2404 cv_wait(&port->ref_cv, &port->ref_lock); 2405 mutex_exit(&port->ref_lock); 2406 2407 /* 2408 * Wait for any active callbacks to finish 2409 */ 2410 if (vsw_drain_ldcs(port)) 2411 return (1); 2412 2413 ldcl = &port->p_ldclist; 2414 WRITE_ENTER(&ldcl->lockrw); 2415 while (ldcl->num_ldcs > 0) { 2416 if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {; 2417 cmn_err(CE_WARN, "unable to detach ldc %ld", 2418 ldcl->head->ldc_id); 2419 RW_EXIT(&ldcl->lockrw); 2420 return (1); 2421 } 2422 } 2423 RW_EXIT(&ldcl->lockrw); 2424 2425 rw_destroy(&port->p_ldclist.lockrw); 2426 2427 mutex_destroy(&port->mca_lock); 2428 mutex_destroy(&port->tx_lock); 2429 cv_destroy(&port->ref_cv); 2430 mutex_destroy(&port->ref_lock); 2431 2432 cv_destroy(&port->state_cv); 2433 mutex_destroy(&port->state_lock); 2434 2435 kmem_free(port, sizeof (vsw_port_t)); 2436 2437 D1(vswp, "%s: exit", __func__); 2438 2439 return (0); 2440 } 2441 2442 /* 2443 * Attach a logical domain channel (ldc) under a specified port. 2444 * 2445 * Returns 0 on success, 1 on failure. 2446 */ 2447 static int 2448 vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) 2449 { 2450 vsw_t *vswp = port->p_vswp; 2451 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2452 vsw_ldc_t *ldcp = NULL; 2453 ldc_attr_t attr; 2454 ldc_status_t istatus; 2455 int status = DDI_FAILURE; 2456 int rv; 2457 2458 D1(vswp, "%s: enter", __func__); 2459 2460 ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP); 2461 if (ldcp == NULL) { 2462 DERR(vswp, "%s: kmem_zalloc failed", __func__); 2463 return (1); 2464 } 2465 ldcp->ldc_id = ldc_id; 2466 2467 /* allocate pool of receive mblks */ 2468 rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh)); 2469 if (rv) { 2470 DWARN(vswp, "%s: unable to create free mblk pool for" 2471 " channel %ld (rv %d)", __func__, ldc_id, rv); 2472 kmem_free(ldcp, sizeof (vsw_ldc_t)); 2473 return (1); 2474 } 2475 2476 mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); 2477 mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); 2478 mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); 2479 cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL); 2480 2481 /* required for handshake with peer */ 2482 ldcp->local_session = (uint64_t)ddi_get_lbolt(); 2483 ldcp->peer_session = 0; 2484 ldcp->session_status = 0; 2485 2486 mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL); 2487 ldcp->hss_id = 1; /* Initial handshake session id */ 2488 2489 /* only set for outbound lane, inbound set by peer */ 2490 mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL); 2491 mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL); 2492 vsw_set_lane_attr(vswp, &ldcp->lane_out); 2493 2494 attr.devclass = LDC_DEV_NT_SVC; 2495 attr.instance = ddi_get_instance(vswp->dip); 2496 attr.mode = LDC_MODE_UNRELIABLE; 2497 attr.mtu = VSW_LDC_MTU; 2498 status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle); 2499 if (status != 0) { 2500 DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", 2501 __func__, ldc_id, status); 2502 goto ldc_attach_fail; 2503 } 2504 2505 status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); 2506 if (status != 0) { 2507 DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)", 2508 __func__, ldc_id, status); 2509 (void) ldc_fini(ldcp->ldc_handle); 2510 goto ldc_attach_fail; 2511 } 2512 2513 2514 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 2515 DERR(vswp, "%s: ldc_status failed", __func__); 2516 return (1); 2517 } 2518 2519 ldcp->ldc_status = istatus; 2520 ldcp->ldc_port = port; 2521 ldcp->ldc_vswp = vswp; 2522 2523 /* link it into the list of channels for this port */ 2524 WRITE_ENTER(&ldcl->lockrw); 2525 ldcp->ldc_next = ldcl->head; 2526 ldcl->head = ldcp; 2527 ldcl->num_ldcs++; 2528 RW_EXIT(&ldcl->lockrw); 2529 2530 D1(vswp, "%s: exit", __func__); 2531 return (0); 2532 2533 ldc_attach_fail: 2534 mutex_destroy(&ldcp->ldc_txlock); 2535 mutex_destroy(&ldcp->ldc_cblock); 2536 2537 cv_destroy(&ldcp->drain_cv); 2538 2539 if (ldcp->rxh != NULL) { 2540 if (vio_destroy_mblks(ldcp->rxh) != 0) { 2541 /* 2542 * Something odd has happened, as the destroy 2543 * will only fail if some mblks have been allocated 2544 * from the pool already (which shouldn't happen) 2545 * and have not been returned. 2546 * 2547 * Add the pool pointer to a list maintained in 2548 * the device instance. Another attempt will be made 2549 * to free the pool when the device itself detaches. 2550 */ 2551 cmn_err(CE_WARN, "Creation of ldc channel %ld failed" 2552 " and cannot destroy associated mblk pool", 2553 ldc_id); 2554 ldcp->rxh->nextp = vswp->rxh; 2555 vswp->rxh = ldcp->rxh; 2556 } 2557 } 2558 mutex_destroy(&ldcp->drain_cv_lock); 2559 mutex_destroy(&ldcp->hss_lock); 2560 2561 mutex_destroy(&ldcp->lane_in.seq_lock); 2562 mutex_destroy(&ldcp->lane_out.seq_lock); 2563 kmem_free(ldcp, sizeof (vsw_ldc_t)); 2564 2565 return (1); 2566 } 2567 2568 /* 2569 * Detach a logical domain channel (ldc) belonging to a 2570 * particular port. 2571 * 2572 * Returns 0 on success, 1 on failure. 2573 */ 2574 static int 2575 vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id) 2576 { 2577 vsw_t *vswp = port->p_vswp; 2578 vsw_ldc_t *ldcp, *prev_ldcp; 2579 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2580 int rv; 2581 2582 prev_ldcp = ldcl->head; 2583 for (; (ldcp = prev_ldcp) != NULL; prev_ldcp = ldcp->ldc_next) { 2584 if (ldcp->ldc_id == ldc_id) { 2585 break; 2586 } 2587 } 2588 2589 /* specified ldc id not found */ 2590 if (ldcp == NULL) { 2591 DERR(vswp, "%s: ldcp = NULL", __func__); 2592 return (1); 2593 } 2594 2595 D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id); 2596 2597 /* 2598 * Before we can close the channel we must release any mapped 2599 * resources (e.g. drings). 2600 */ 2601 vsw_free_lane_resources(ldcp, INBOUND); 2602 vsw_free_lane_resources(ldcp, OUTBOUND); 2603 2604 /* 2605 * If the close fails we are in serious trouble, as won't 2606 * be able to delete the parent port. 2607 */ 2608 if ((rv = ldc_close(ldcp->ldc_handle)) != 0) { 2609 DERR(vswp, "%s: error %d closing channel %lld", 2610 __func__, rv, ldcp->ldc_id); 2611 return (1); 2612 } 2613 2614 (void) ldc_fini(ldcp->ldc_handle); 2615 2616 ldcp->ldc_status = LDC_INIT; 2617 ldcp->ldc_handle = NULL; 2618 ldcp->ldc_vswp = NULL; 2619 2620 if (ldcp->rxh != NULL) { 2621 if (vio_destroy_mblks(ldcp->rxh)) { 2622 /* 2623 * Mostly likely some mblks are still in use and 2624 * have not been returned to the pool. Add the pool 2625 * to the list maintained in the device instance. 2626 * Another attempt will be made to destroy the pool 2627 * when the device detaches. 2628 */ 2629 ldcp->rxh->nextp = vswp->rxh; 2630 vswp->rxh = ldcp->rxh; 2631 } 2632 } 2633 2634 mutex_destroy(&ldcp->ldc_txlock); 2635 mutex_destroy(&ldcp->ldc_cblock); 2636 cv_destroy(&ldcp->drain_cv); 2637 mutex_destroy(&ldcp->drain_cv_lock); 2638 mutex_destroy(&ldcp->hss_lock); 2639 mutex_destroy(&ldcp->lane_in.seq_lock); 2640 mutex_destroy(&ldcp->lane_out.seq_lock); 2641 2642 /* unlink it from the list */ 2643 prev_ldcp = ldcp->ldc_next; 2644 ldcl->num_ldcs--; 2645 kmem_free(ldcp, sizeof (vsw_ldc_t)); 2646 2647 return (0); 2648 } 2649 2650 /* 2651 * Open and attempt to bring up the channel. Note that channel 2652 * can only be brought up if peer has also opened channel. 2653 * 2654 * Returns 0 if can open and bring up channel, otherwise 2655 * returns 1. 2656 */ 2657 static int 2658 vsw_ldc_init(vsw_ldc_t *ldcp) 2659 { 2660 vsw_t *vswp = ldcp->ldc_vswp; 2661 ldc_status_t istatus = 0; 2662 int rv; 2663 2664 D1(vswp, "%s: enter", __func__); 2665 2666 LDC_ENTER_LOCK(ldcp); 2667 2668 /* don't start at 0 in case clients don't like that */ 2669 ldcp->next_ident = 1; 2670 2671 rv = ldc_open(ldcp->ldc_handle); 2672 if (rv != 0) { 2673 DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)", 2674 __func__, ldcp->ldc_id, rv); 2675 LDC_EXIT_LOCK(ldcp); 2676 return (1); 2677 } 2678 2679 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 2680 DERR(vswp, "%s: unable to get status", __func__); 2681 LDC_EXIT_LOCK(ldcp); 2682 return (1); 2683 2684 } else if (istatus != LDC_OPEN && istatus != LDC_READY) { 2685 DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY", 2686 __func__, ldcp->ldc_id, istatus); 2687 LDC_EXIT_LOCK(ldcp); 2688 return (1); 2689 } 2690 2691 ldcp->ldc_status = istatus; 2692 rv = ldc_up(ldcp->ldc_handle); 2693 if (rv != 0) { 2694 /* 2695 * Not a fatal error for ldc_up() to fail, as peer 2696 * end point may simply not be ready yet. 2697 */ 2698 D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__, 2699 ldcp->ldc_id, rv); 2700 LDC_EXIT_LOCK(ldcp); 2701 return (1); 2702 } 2703 2704 /* 2705 * ldc_up() call is non-blocking so need to explicitly 2706 * check channel status to see if in fact the channel 2707 * is UP. 2708 */ 2709 if (ldc_status(ldcp->ldc_handle, &istatus) != 0) { 2710 DERR(vswp, "%s: unable to get status", __func__); 2711 LDC_EXIT_LOCK(ldcp); 2712 return (1); 2713 2714 } else if (istatus != LDC_UP) { 2715 DERR(vswp, "%s: id(%lld) status(%d) is not UP", 2716 __func__, ldcp->ldc_id, istatus); 2717 } else { 2718 ldcp->ldc_status = istatus; 2719 } 2720 2721 LDC_EXIT_LOCK(ldcp); 2722 2723 D1(vswp, "%s: exit", __func__); 2724 return (0); 2725 } 2726 2727 /* disable callbacks on the channel */ 2728 static int 2729 vsw_ldc_uninit(vsw_ldc_t *ldcp) 2730 { 2731 vsw_t *vswp = ldcp->ldc_vswp; 2732 int rv; 2733 2734 D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id); 2735 2736 LDC_ENTER_LOCK(ldcp); 2737 2738 rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE); 2739 if (rv != 0) { 2740 DERR(vswp, "vsw_ldc_uninit(%lld): error disabling " 2741 "interrupts (rv = %d)\n", ldcp->ldc_id, rv); 2742 LDC_EXIT_LOCK(ldcp); 2743 return (1); 2744 } 2745 2746 ldcp->ldc_status = LDC_INIT; 2747 2748 LDC_EXIT_LOCK(ldcp); 2749 2750 D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id); 2751 2752 return (0); 2753 } 2754 2755 static int 2756 vsw_init_ldcs(vsw_port_t *port) 2757 { 2758 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2759 vsw_ldc_t *ldcp; 2760 2761 READ_ENTER(&ldcl->lockrw); 2762 ldcp = ldcl->head; 2763 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 2764 (void) vsw_ldc_init(ldcp); 2765 } 2766 RW_EXIT(&ldcl->lockrw); 2767 2768 return (0); 2769 } 2770 2771 static int 2772 vsw_uninit_ldcs(vsw_port_t *port) 2773 { 2774 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2775 vsw_ldc_t *ldcp; 2776 2777 D1(NULL, "vsw_uninit_ldcs: enter\n"); 2778 2779 READ_ENTER(&ldcl->lockrw); 2780 ldcp = ldcl->head; 2781 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 2782 (void) vsw_ldc_uninit(ldcp); 2783 } 2784 RW_EXIT(&ldcl->lockrw); 2785 2786 D1(NULL, "vsw_uninit_ldcs: exit\n"); 2787 2788 return (0); 2789 } 2790 2791 /* 2792 * Wait until the callback(s) associated with the ldcs under the specified 2793 * port have completed. 2794 * 2795 * Prior to this function being invoked each channel under this port 2796 * should have been quiesced via ldc_set_cb_mode(DISABLE). 2797 * 2798 * A short explaination of what we are doing below.. 2799 * 2800 * The simplest approach would be to have a reference counter in 2801 * the ldc structure which is increment/decremented by the callbacks as 2802 * they use the channel. The drain function could then simply disable any 2803 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately 2804 * there is a tiny window here - before the callback is able to get the lock 2805 * on the channel it is interrupted and this function gets to execute. It 2806 * sees that the ref count is zero and believes its free to delete the 2807 * associated data structures. 2808 * 2809 * We get around this by taking advantage of the fact that before the ldc 2810 * framework invokes a callback it sets a flag to indicate that there is a 2811 * callback active (or about to become active). If when we attempt to 2812 * unregister a callback when this active flag is set then the unregister 2813 * will fail with EWOULDBLOCK. 2814 * 2815 * If the unregister fails we do a cv_timedwait. We will either be signaled 2816 * by the callback as it is exiting (note we have to wait a short period to 2817 * allow the callback to return fully to the ldc framework and it to clear 2818 * the active flag), or by the timer expiring. In either case we again attempt 2819 * the unregister. We repeat this until we can succesfully unregister the 2820 * callback. 2821 * 2822 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch 2823 * the case where the callback has finished but the ldc framework has not yet 2824 * cleared the active flag. In this case we would never get a cv_signal. 2825 */ 2826 static int 2827 vsw_drain_ldcs(vsw_port_t *port) 2828 { 2829 vsw_ldc_list_t *ldcl = &port->p_ldclist; 2830 vsw_ldc_t *ldcp; 2831 vsw_t *vswp = port->p_vswp; 2832 2833 D1(vswp, "%s: enter", __func__); 2834 2835 READ_ENTER(&ldcl->lockrw); 2836 2837 ldcp = ldcl->head; 2838 2839 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 2840 /* 2841 * If we can unregister the channel callback then we 2842 * know that there is no callback either running or 2843 * scheduled to run for this channel so move on to next 2844 * channel in the list. 2845 */ 2846 mutex_enter(&ldcp->drain_cv_lock); 2847 2848 /* prompt active callbacks to quit */ 2849 ldcp->drain_state = VSW_LDC_DRAINING; 2850 2851 if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) { 2852 D2(vswp, "%s: unreg callback for chan %ld", __func__, 2853 ldcp->ldc_id); 2854 mutex_exit(&ldcp->drain_cv_lock); 2855 continue; 2856 } else { 2857 /* 2858 * If we end up here we know that either 1) a callback 2859 * is currently executing, 2) is about to start (i.e. 2860 * the ldc framework has set the active flag but 2861 * has not actually invoked the callback yet, or 3) 2862 * has finished and has returned to the ldc framework 2863 * but the ldc framework has not yet cleared the 2864 * active bit. 2865 * 2866 * Wait for it to finish. 2867 */ 2868 while (ldc_unreg_callback(ldcp->ldc_handle) 2869 == EWOULDBLOCK) 2870 (void) cv_timedwait(&ldcp->drain_cv, 2871 &ldcp->drain_cv_lock, lbolt + hz); 2872 2873 mutex_exit(&ldcp->drain_cv_lock); 2874 D2(vswp, "%s: unreg callback for chan %ld after " 2875 "timeout", __func__, ldcp->ldc_id); 2876 } 2877 } 2878 RW_EXIT(&ldcl->lockrw); 2879 2880 D1(vswp, "%s: exit", __func__); 2881 return (0); 2882 } 2883 2884 /* 2885 * Wait until all tasks which reference this port have completed. 2886 * 2887 * Prior to this function being invoked each channel under this port 2888 * should have been quiesced via ldc_set_cb_mode(DISABLE). 2889 */ 2890 static int 2891 vsw_drain_port_taskq(vsw_port_t *port) 2892 { 2893 vsw_t *vswp = port->p_vswp; 2894 2895 D1(vswp, "%s: enter", __func__); 2896 2897 /* 2898 * Mark the port as in the process of being detached, and 2899 * dispatch a marker task to the queue so we know when all 2900 * relevant tasks have completed. 2901 */ 2902 mutex_enter(&port->state_lock); 2903 port->state = VSW_PORT_DETACHING; 2904 2905 if ((vswp->taskq_p == NULL) || 2906 (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task, 2907 port, DDI_NOSLEEP) != DDI_SUCCESS)) { 2908 DERR(vswp, "%s: unable to dispatch marker task", 2909 __func__); 2910 mutex_exit(&port->state_lock); 2911 return (1); 2912 } 2913 2914 /* 2915 * Wait for the marker task to finish. 2916 */ 2917 while (port->state != VSW_PORT_DETACHABLE) 2918 cv_wait(&port->state_cv, &port->state_lock); 2919 2920 mutex_exit(&port->state_lock); 2921 2922 D1(vswp, "%s: exit", __func__); 2923 2924 return (0); 2925 } 2926 2927 static void 2928 vsw_marker_task(void *arg) 2929 { 2930 vsw_port_t *port = arg; 2931 vsw_t *vswp = port->p_vswp; 2932 2933 D1(vswp, "%s: enter", __func__); 2934 2935 mutex_enter(&port->state_lock); 2936 2937 /* 2938 * No further tasks should be dispatched which reference 2939 * this port so ok to mark it as safe to detach. 2940 */ 2941 port->state = VSW_PORT_DETACHABLE; 2942 2943 cv_signal(&port->state_cv); 2944 2945 mutex_exit(&port->state_lock); 2946 2947 D1(vswp, "%s: exit", __func__); 2948 } 2949 2950 static vsw_port_t * 2951 vsw_lookup_port(vsw_t *vswp, int p_instance) 2952 { 2953 vsw_port_list_t *plist = &vswp->plist; 2954 vsw_port_t *port; 2955 2956 for (port = plist->head; port != NULL; port = port->p_next) { 2957 if (port->p_instance == p_instance) { 2958 D2(vswp, "vsw_lookup_port: found p_instance\n"); 2959 return (port); 2960 } 2961 } 2962 2963 return (NULL); 2964 } 2965 2966 /* 2967 * Search for and remove the specified port from the port 2968 * list. Returns 0 if able to locate and remove port, otherwise 2969 * returns 1. 2970 */ 2971 static int 2972 vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port) 2973 { 2974 vsw_port_list_t *plist = &vswp->plist; 2975 vsw_port_t *curr_p, *prev_p; 2976 2977 if (plist->head == NULL) 2978 return (1); 2979 2980 curr_p = prev_p = plist->head; 2981 2982 while (curr_p != NULL) { 2983 if (curr_p == port) { 2984 if (prev_p == curr_p) { 2985 plist->head = curr_p->p_next; 2986 } else { 2987 prev_p->p_next = curr_p->p_next; 2988 } 2989 plist->num_ports--; 2990 break; 2991 } else { 2992 prev_p = curr_p; 2993 curr_p = curr_p->p_next; 2994 } 2995 } 2996 return (0); 2997 } 2998 2999 /* 3000 * Interrupt handler for ldc messages. 3001 */ 3002 static uint_t 3003 vsw_ldc_cb(uint64_t event, caddr_t arg) 3004 { 3005 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 3006 vsw_t *vswp = ldcp->ldc_vswp; 3007 ldc_status_t lstatus; 3008 int rv; 3009 3010 D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 3011 3012 mutex_enter(&ldcp->ldc_cblock); 3013 3014 if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) { 3015 mutex_exit(&ldcp->ldc_cblock); 3016 return (LDC_SUCCESS); 3017 } 3018 3019 if (event & LDC_EVT_UP) { 3020 /* 3021 * Channel has come up, get the state and then start 3022 * the handshake. 3023 */ 3024 rv = ldc_status(ldcp->ldc_handle, &lstatus); 3025 if (rv != 0) { 3026 cmn_err(CE_WARN, "Unable to read channel state"); 3027 } 3028 ldcp->ldc_status = lstatus; 3029 3030 D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)", 3031 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3032 3033 vsw_restart_handshake(ldcp); 3034 3035 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 3036 } 3037 3038 if (event & LDC_EVT_READ) { 3039 /* 3040 * Data available for reading. 3041 */ 3042 D2(vswp, "%s: id(ld) event(%llx) data READ", 3043 __func__, ldcp->ldc_id, event); 3044 3045 vsw_process_pkt(ldcp); 3046 3047 ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0); 3048 3049 goto vsw_cb_exit; 3050 } 3051 3052 if (event & LDC_EVT_RESET) { 3053 rv = ldc_status(ldcp->ldc_handle, &lstatus); 3054 if (rv != 0) { 3055 cmn_err(CE_WARN, "Unable to read channel state"); 3056 } else { 3057 ldcp->ldc_status = lstatus; 3058 } 3059 D2(vswp, "%s: id(%ld) event(%llx) RESET: status (%ld)", 3060 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3061 } 3062 3063 if (event & LDC_EVT_DOWN) { 3064 rv = ldc_status(ldcp->ldc_handle, &lstatus); 3065 if (rv != 0) { 3066 cmn_err(CE_WARN, "Unable to read channel state"); 3067 } else { 3068 ldcp->ldc_status = lstatus; 3069 } 3070 3071 D2(vswp, "%s: id(%ld) event(%llx) DOWN: status (%ld)", 3072 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3073 3074 } 3075 3076 /* 3077 * Catch either LDC_EVT_WRITE which we don't support or any 3078 * unknown event. 3079 */ 3080 if (event & ~(LDC_EVT_UP | LDC_EVT_RESET 3081 | LDC_EVT_DOWN | LDC_EVT_READ)) { 3082 3083 DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)", 3084 __func__, ldcp->ldc_id, event, ldcp->ldc_status); 3085 } 3086 3087 vsw_cb_exit: 3088 mutex_exit(&ldcp->ldc_cblock); 3089 3090 /* 3091 * Let the drain function know we are finishing if it 3092 * is waiting. 3093 */ 3094 mutex_enter(&ldcp->drain_cv_lock); 3095 if (ldcp->drain_state == VSW_LDC_DRAINING) 3096 cv_signal(&ldcp->drain_cv); 3097 mutex_exit(&ldcp->drain_cv_lock); 3098 3099 return (LDC_SUCCESS); 3100 } 3101 3102 /* 3103 * (Re)start a handshake with our peer by sending them 3104 * our version info. 3105 */ 3106 static void 3107 vsw_restart_handshake(vsw_ldc_t *ldcp) 3108 { 3109 vsw_t *vswp = ldcp->ldc_vswp; 3110 vsw_port_t *port; 3111 vsw_ldc_list_t *ldcl; 3112 3113 D1(vswp, "vsw_restart_handshake: enter"); 3114 3115 port = ldcp->ldc_port; 3116 ldcl = &port->p_ldclist; 3117 3118 WRITE_ENTER(&ldcl->lockrw); 3119 3120 D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__, 3121 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 3122 3123 vsw_free_lane_resources(ldcp, INBOUND); 3124 vsw_free_lane_resources(ldcp, OUTBOUND); 3125 RW_EXIT(&ldcl->lockrw); 3126 3127 ldcp->lane_in.lstate = 0; 3128 ldcp->lane_out.lstate = 0; 3129 3130 /* 3131 * Remove parent port from any multicast groups 3132 * it may have registered with. Client must resend 3133 * multicast add command after handshake completes. 3134 */ 3135 (void) vsw_del_fdb(vswp, port); 3136 3137 vsw_del_mcst_port(port); 3138 3139 ldcp->hphase = VSW_MILESTONE0; 3140 3141 ldcp->peer_session = 0; 3142 ldcp->session_status = 0; 3143 3144 /* 3145 * We now increment the transaction group id. This allows 3146 * us to identify and disard any tasks which are still pending 3147 * on the taskq and refer to the handshake session we are about 3148 * to restart. These stale messages no longer have any real 3149 * meaning. 3150 */ 3151 mutex_enter(&ldcp->hss_lock); 3152 ldcp->hss_id++; 3153 mutex_exit(&ldcp->hss_lock); 3154 3155 if (ldcp->hcnt++ > vsw_num_handshakes) { 3156 cmn_err(CE_WARN, "exceeded number of permitted " 3157 "handshake attempts (%d) on channel %ld", 3158 ldcp->hcnt, ldcp->ldc_id); 3159 return; 3160 } 3161 3162 vsw_send_ver(ldcp); 3163 3164 D1(vswp, "vsw_restart_handshake: exit"); 3165 } 3166 3167 /* 3168 * returns 0 if legal for event signified by flag to have 3169 * occured at the time it did. Otherwise returns 1. 3170 */ 3171 int 3172 vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag) 3173 { 3174 vsw_t *vswp = ldcp->ldc_vswp; 3175 uint64_t state; 3176 uint64_t phase; 3177 3178 if (dir == INBOUND) 3179 state = ldcp->lane_in.lstate; 3180 else 3181 state = ldcp->lane_out.lstate; 3182 3183 phase = ldcp->hphase; 3184 3185 switch (flag) { 3186 case VSW_VER_INFO_RECV: 3187 if (phase > VSW_MILESTONE0) { 3188 DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV" 3189 " when in state %d\n", ldcp->ldc_id, phase); 3190 vsw_restart_handshake(ldcp); 3191 return (1); 3192 } 3193 break; 3194 3195 case VSW_VER_ACK_RECV: 3196 case VSW_VER_NACK_RECV: 3197 if (!(state & VSW_VER_INFO_SENT)) { 3198 DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK" 3199 " or VER_NACK when in state %d\n", 3200 ldcp->ldc_id, phase); 3201 vsw_restart_handshake(ldcp); 3202 return (1); 3203 } else 3204 state &= ~VSW_VER_INFO_SENT; 3205 break; 3206 3207 case VSW_ATTR_INFO_RECV: 3208 if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) { 3209 DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV" 3210 " when in state %d\n", ldcp->ldc_id, phase); 3211 vsw_restart_handshake(ldcp); 3212 return (1); 3213 } 3214 break; 3215 3216 case VSW_ATTR_ACK_RECV: 3217 case VSW_ATTR_NACK_RECV: 3218 if (!(state & VSW_ATTR_INFO_SENT)) { 3219 DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK" 3220 " or ATTR_NACK when in state %d\n", 3221 ldcp->ldc_id, phase); 3222 vsw_restart_handshake(ldcp); 3223 return (1); 3224 } else 3225 state &= ~VSW_ATTR_INFO_SENT; 3226 break; 3227 3228 case VSW_DRING_INFO_RECV: 3229 if (phase < VSW_MILESTONE1) { 3230 DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV" 3231 " when in state %d\n", ldcp->ldc_id, phase); 3232 vsw_restart_handshake(ldcp); 3233 return (1); 3234 } 3235 break; 3236 3237 case VSW_DRING_ACK_RECV: 3238 case VSW_DRING_NACK_RECV: 3239 if (!(state & VSW_DRING_INFO_SENT)) { 3240 DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK" 3241 " or DRING_NACK when in state %d\n", 3242 ldcp->ldc_id, phase); 3243 vsw_restart_handshake(ldcp); 3244 return (1); 3245 } else 3246 state &= ~VSW_DRING_INFO_SENT; 3247 break; 3248 3249 case VSW_RDX_INFO_RECV: 3250 if (phase < VSW_MILESTONE3) { 3251 DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV" 3252 " when in state %d\n", ldcp->ldc_id, phase); 3253 vsw_restart_handshake(ldcp); 3254 return (1); 3255 } 3256 break; 3257 3258 case VSW_RDX_ACK_RECV: 3259 case VSW_RDX_NACK_RECV: 3260 if (!(state & VSW_RDX_INFO_SENT)) { 3261 DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK" 3262 " or RDX_NACK when in state %d\n", 3263 ldcp->ldc_id, phase); 3264 vsw_restart_handshake(ldcp); 3265 return (1); 3266 } else 3267 state &= ~VSW_RDX_INFO_SENT; 3268 break; 3269 3270 case VSW_MCST_INFO_RECV: 3271 if (phase < VSW_MILESTONE3) { 3272 DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV" 3273 " when in state %d\n", ldcp->ldc_id, phase); 3274 vsw_restart_handshake(ldcp); 3275 return (1); 3276 } 3277 break; 3278 3279 default: 3280 DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)", 3281 ldcp->ldc_id, flag); 3282 return (1); 3283 } 3284 3285 if (dir == INBOUND) 3286 ldcp->lane_in.lstate = state; 3287 else 3288 ldcp->lane_out.lstate = state; 3289 3290 D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id); 3291 3292 return (0); 3293 } 3294 3295 void 3296 vsw_next_milestone(vsw_ldc_t *ldcp) 3297 { 3298 vsw_t *vswp = ldcp->ldc_vswp; 3299 3300 D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__, 3301 ldcp->ldc_id, ldcp->hphase); 3302 3303 DUMP_FLAGS(ldcp->lane_in.lstate); 3304 DUMP_FLAGS(ldcp->lane_out.lstate); 3305 3306 switch (ldcp->hphase) { 3307 3308 case VSW_MILESTONE0: 3309 /* 3310 * If we haven't started to handshake with our peer, 3311 * start to do so now. 3312 */ 3313 if (ldcp->lane_out.lstate == 0) { 3314 D2(vswp, "%s: (chan %lld) starting handshake " 3315 "with peer", __func__, ldcp->ldc_id); 3316 vsw_restart_handshake(ldcp); 3317 } 3318 3319 /* 3320 * Only way to pass this milestone is to have successfully 3321 * negotiated version info. 3322 */ 3323 if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) && 3324 (ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) { 3325 3326 D2(vswp, "%s: (chan %lld) leaving milestone 0", 3327 __func__, ldcp->ldc_id); 3328 3329 /* 3330 * Next milestone is passed when attribute 3331 * information has been successfully exchanged. 3332 */ 3333 ldcp->hphase = VSW_MILESTONE1; 3334 vsw_send_attr(ldcp); 3335 3336 } 3337 break; 3338 3339 case VSW_MILESTONE1: 3340 /* 3341 * Only way to pass this milestone is to have successfully 3342 * negotiated attribute information. 3343 */ 3344 if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) { 3345 3346 ldcp->hphase = VSW_MILESTONE2; 3347 3348 /* 3349 * If the peer device has said it wishes to 3350 * use descriptor rings then we send it our ring 3351 * info, otherwise we just set up a private ring 3352 * which we use an internal buffer 3353 */ 3354 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) 3355 vsw_send_dring_info(ldcp); 3356 } 3357 break; 3358 3359 3360 case VSW_MILESTONE2: 3361 /* 3362 * If peer has indicated in its attribute message that 3363 * it wishes to use descriptor rings then the only way 3364 * to pass this milestone is for us to have received 3365 * valid dring info. 3366 * 3367 * If peer is not using descriptor rings then just fall 3368 * through. 3369 */ 3370 if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) && 3371 (!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT))) 3372 break; 3373 3374 D2(vswp, "%s: (chan %lld) leaving milestone 2", 3375 __func__, ldcp->ldc_id); 3376 3377 ldcp->hphase = VSW_MILESTONE3; 3378 vsw_send_rdx(ldcp); 3379 break; 3380 3381 case VSW_MILESTONE3: 3382 /* 3383 * Pass this milestone when all paramaters have been 3384 * successfully exchanged and RDX sent in both directions. 3385 * 3386 * Mark outbound lane as available to transmit data. 3387 */ 3388 if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) && 3389 (ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) { 3390 3391 D2(vswp, "%s: (chan %lld) leaving milestone 3", 3392 __func__, ldcp->ldc_id); 3393 D2(vswp, "%s: ** handshake complete **", __func__); 3394 ldcp->lane_out.lstate |= VSW_LANE_ACTIVE; 3395 ldcp->hphase = VSW_MILESTONE4; 3396 ldcp->hcnt = 0; 3397 DISPLAY_STATE(); 3398 } 3399 break; 3400 3401 case VSW_MILESTONE4: 3402 D2(vswp, "%s: (chan %lld) in milestone 4", __func__, 3403 ldcp->ldc_id); 3404 break; 3405 3406 default: 3407 DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__, 3408 ldcp->ldc_id, ldcp->hphase); 3409 } 3410 3411 D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id, 3412 ldcp->hphase); 3413 } 3414 3415 /* 3416 * Check if major version is supported. 3417 * 3418 * Returns 0 if finds supported major number, and if necessary 3419 * adjusts the minor field. 3420 * 3421 * Returns 1 if can't match major number exactly. Sets mjor/minor 3422 * to next lowest support values, or to zero if no other values possible. 3423 */ 3424 static int 3425 vsw_supported_version(vio_ver_msg_t *vp) 3426 { 3427 int i; 3428 3429 D1(NULL, "vsw_supported_version: enter"); 3430 3431 for (i = 0; i < VSW_NUM_VER; i++) { 3432 if (vsw_versions[i].ver_major == vp->ver_major) { 3433 /* 3434 * Matching or lower major version found. Update 3435 * minor number if necessary. 3436 */ 3437 if (vp->ver_minor > vsw_versions[i].ver_minor) { 3438 D2(NULL, "%s: adjusting minor value" 3439 " from %d to %d", __func__, 3440 vp->ver_minor, 3441 vsw_versions[i].ver_minor); 3442 vp->ver_minor = vsw_versions[i].ver_minor; 3443 } 3444 3445 return (0); 3446 } 3447 3448 if (vsw_versions[i].ver_major < vp->ver_major) { 3449 if (vp->ver_minor > vsw_versions[i].ver_minor) { 3450 D2(NULL, "%s: adjusting minor value" 3451 " from %d to %d", __func__, 3452 vp->ver_minor, 3453 vsw_versions[i].ver_minor); 3454 vp->ver_minor = vsw_versions[i].ver_minor; 3455 } 3456 return (1); 3457 } 3458 } 3459 3460 /* No match was possible, zero out fields */ 3461 vp->ver_major = 0; 3462 vp->ver_minor = 0; 3463 3464 D1(NULL, "vsw_supported_version: exit"); 3465 3466 return (1); 3467 } 3468 3469 /* 3470 * Main routine for processing messages received over LDC. 3471 */ 3472 static void 3473 vsw_process_pkt(void *arg) 3474 { 3475 vsw_ldc_t *ldcp = (vsw_ldc_t *)arg; 3476 vsw_t *vswp = ldcp->ldc_vswp; 3477 size_t msglen; 3478 vio_msg_tag_t tag; 3479 def_msg_t dmsg; 3480 int rv = 0; 3481 3482 D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id); 3483 3484 /* 3485 * If channel is up read messages until channel is empty. 3486 */ 3487 do { 3488 msglen = sizeof (dmsg); 3489 rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen); 3490 3491 if (rv != 0) { 3492 DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) " 3493 "len(%d)\n", __func__, ldcp->ldc_id, 3494 rv, msglen); 3495 break; 3496 } 3497 3498 if (msglen == 0) { 3499 D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__, 3500 ldcp->ldc_id); 3501 break; 3502 } 3503 3504 D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__, 3505 ldcp->ldc_id, msglen); 3506 3507 /* 3508 * Figure out what sort of packet we have gotten by 3509 * examining the msg tag, and then switch it appropriately. 3510 */ 3511 bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t)); 3512 3513 switch (tag.vio_msgtype) { 3514 case VIO_TYPE_CTRL: 3515 vsw_dispatch_ctrl_task(ldcp, &dmsg, tag); 3516 break; 3517 case VIO_TYPE_DATA: 3518 vsw_process_data_pkt(ldcp, &dmsg, tag); 3519 break; 3520 case VIO_TYPE_ERR: 3521 vsw_process_err_pkt(ldcp, &dmsg, tag); 3522 break; 3523 default: 3524 DERR(vswp, "%s: Unknown tag(%lx) ", __func__, 3525 "id(%lx)\n", tag.vio_msgtype, ldcp->ldc_id); 3526 break; 3527 } 3528 } while (msglen); 3529 3530 D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id); 3531 } 3532 3533 /* 3534 * Dispatch a task to process a VIO control message. 3535 */ 3536 static void 3537 vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag) 3538 { 3539 vsw_ctrl_task_t *ctaskp = NULL; 3540 vsw_port_t *port = ldcp->ldc_port; 3541 vsw_t *vswp = port->p_vswp; 3542 3543 D1(vswp, "%s: enter", __func__); 3544 3545 /* 3546 * We need to handle RDX ACK messages in-band as once they 3547 * are exchanged it is possible that we will get an 3548 * immediate (legitimate) data packet. 3549 */ 3550 if ((tag.vio_subtype_env == VIO_RDX) && 3551 (tag.vio_subtype == VIO_SUBTYPE_ACK)) { 3552 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV)) 3553 return; 3554 3555 ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV; 3556 vsw_next_milestone(ldcp); 3557 D2(vswp, "%s (%ld) handling RDX_ACK in place", __func__, 3558 ldcp->ldc_id); 3559 return; 3560 } 3561 3562 ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP); 3563 3564 if (ctaskp == NULL) { 3565 DERR(vswp, "%s: unable to alloc space for ctrl" 3566 " msg", __func__); 3567 vsw_restart_handshake(ldcp); 3568 return; 3569 } 3570 3571 ctaskp->ldcp = ldcp; 3572 bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t)); 3573 mutex_enter(&ldcp->hss_lock); 3574 ctaskp->hss_id = ldcp->hss_id; 3575 mutex_exit(&ldcp->hss_lock); 3576 3577 /* 3578 * Dispatch task to processing taskq if port is not in 3579 * the process of being detached. 3580 */ 3581 mutex_enter(&port->state_lock); 3582 if (port->state == VSW_PORT_INIT) { 3583 if ((vswp->taskq_p == NULL) || 3584 (ddi_taskq_dispatch(vswp->taskq_p, 3585 vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP) 3586 != DDI_SUCCESS)) { 3587 DERR(vswp, "%s: unable to dispatch task to taskq", 3588 __func__); 3589 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 3590 mutex_exit(&port->state_lock); 3591 vsw_restart_handshake(ldcp); 3592 return; 3593 } 3594 } else { 3595 DWARN(vswp, "%s: port %d detaching, not dispatching " 3596 "task", __func__, port->p_instance); 3597 } 3598 3599 mutex_exit(&port->state_lock); 3600 3601 D2(vswp, "%s: dispatched task to taskq for chan %d", __func__, 3602 ldcp->ldc_id); 3603 D1(vswp, "%s: exit", __func__); 3604 } 3605 3606 /* 3607 * Process a VIO ctrl message. Invoked from taskq. 3608 */ 3609 static void 3610 vsw_process_ctrl_pkt(void *arg) 3611 { 3612 vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg; 3613 vsw_ldc_t *ldcp = ctaskp->ldcp; 3614 vsw_t *vswp = ldcp->ldc_vswp; 3615 vio_msg_tag_t tag; 3616 uint16_t env; 3617 3618 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 3619 3620 bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t)); 3621 env = tag.vio_subtype_env; 3622 3623 /* stale pkt check */ 3624 mutex_enter(&ldcp->hss_lock); 3625 if (ctaskp->hss_id < ldcp->hss_id) { 3626 DWARN(vswp, "%s: discarding stale packet belonging to" 3627 " earlier (%ld) handshake session", __func__, 3628 ctaskp->hss_id); 3629 mutex_exit(&ldcp->hss_lock); 3630 return; 3631 } 3632 mutex_exit(&ldcp->hss_lock); 3633 3634 /* session id check */ 3635 if (ldcp->session_status & VSW_PEER_SESSION) { 3636 if (ldcp->peer_session != tag.vio_sid) { 3637 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 3638 __func__, ldcp->ldc_id, tag.vio_sid); 3639 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 3640 vsw_restart_handshake(ldcp); 3641 return; 3642 } 3643 } 3644 3645 /* 3646 * Switch on vio_subtype envelope, then let lower routines 3647 * decide if its an INFO, ACK or NACK packet. 3648 */ 3649 switch (env) { 3650 case VIO_VER_INFO: 3651 vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp); 3652 break; 3653 case VIO_DRING_REG: 3654 vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp); 3655 break; 3656 case VIO_DRING_UNREG: 3657 vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp); 3658 break; 3659 case VIO_ATTR_INFO: 3660 vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp); 3661 break; 3662 case VNET_MCAST_INFO: 3663 vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp); 3664 break; 3665 case VIO_RDX: 3666 vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp); 3667 break; 3668 default: 3669 DERR(vswp, "%s : unknown vio_subtype_env (%x)\n", 3670 __func__, env); 3671 } 3672 3673 kmem_free(ctaskp, sizeof (vsw_ctrl_task_t)); 3674 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 3675 } 3676 3677 /* 3678 * Version negotiation. We can end up here either because our peer 3679 * has responded to a handshake message we have sent it, or our peer 3680 * has initiated a handshake with us. If its the former then can only 3681 * be ACK or NACK, if its the later can only be INFO. 3682 * 3683 * If its an ACK we move to the next stage of the handshake, namely 3684 * attribute exchange. If its a NACK we see if we can specify another 3685 * version, if we can't we stop. 3686 * 3687 * If it is an INFO we reset all params associated with communication 3688 * in that direction over this channel (remember connection is 3689 * essentially 2 independent simplex channels). 3690 */ 3691 void 3692 vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt) 3693 { 3694 vio_ver_msg_t *ver_pkt; 3695 vsw_t *vswp = ldcp->ldc_vswp; 3696 3697 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 3698 3699 /* 3700 * We know this is a ctrl/version packet so 3701 * cast it into the correct structure. 3702 */ 3703 ver_pkt = (vio_ver_msg_t *)pkt; 3704 3705 switch (ver_pkt->tag.vio_subtype) { 3706 case VIO_SUBTYPE_INFO: 3707 D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n"); 3708 3709 /* 3710 * Record the session id, which we will use from now 3711 * until we see another VER_INFO msg. Even then the 3712 * session id in most cases will be unchanged, execpt 3713 * if channel was reset. 3714 */ 3715 if ((ldcp->session_status & VSW_PEER_SESSION) && 3716 (ldcp->peer_session != ver_pkt->tag.vio_sid)) { 3717 DERR(vswp, "%s: updating session id for chan %lld " 3718 "from %llx to %llx", __func__, ldcp->ldc_id, 3719 ldcp->peer_session, ver_pkt->tag.vio_sid); 3720 } 3721 3722 ldcp->peer_session = ver_pkt->tag.vio_sid; 3723 ldcp->session_status |= VSW_PEER_SESSION; 3724 3725 /* Legal message at this time ? */ 3726 if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV)) 3727 return; 3728 3729 /* 3730 * First check the device class. Currently only expect 3731 * to be talking to a network device. In the future may 3732 * also talk to another switch. 3733 */ 3734 if (ver_pkt->dev_class != VDEV_NETWORK) { 3735 DERR(vswp, "%s: illegal device class %d", __func__, 3736 ver_pkt->dev_class); 3737 3738 ver_pkt->tag.vio_sid = ldcp->local_session; 3739 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 3740 3741 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 3742 3743 vsw_send_msg(ldcp, (void *)ver_pkt, 3744 sizeof (vio_ver_msg_t)); 3745 3746 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; 3747 vsw_next_milestone(ldcp); 3748 return; 3749 } else { 3750 ldcp->dev_class = ver_pkt->dev_class; 3751 } 3752 3753 /* 3754 * Now check the version. 3755 */ 3756 if (vsw_supported_version(ver_pkt) == 0) { 3757 /* 3758 * Support this major version and possibly 3759 * adjusted minor version. 3760 */ 3761 3762 D2(vswp, "%s: accepted ver %d:%d", __func__, 3763 ver_pkt->ver_major, ver_pkt->ver_minor); 3764 3765 /* Store accepted values */ 3766 ldcp->lane_in.ver_major = ver_pkt->ver_major; 3767 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 3768 3769 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 3770 3771 ldcp->lane_in.lstate |= VSW_VER_ACK_SENT; 3772 } else { 3773 /* 3774 * NACK back with the next lower major/minor 3775 * pairing we support (if don't suuport any more 3776 * versions then they will be set to zero. 3777 */ 3778 3779 D2(vswp, "%s: replying with ver %d:%d", __func__, 3780 ver_pkt->ver_major, ver_pkt->ver_minor); 3781 3782 /* Store updated values */ 3783 ldcp->lane_in.ver_major = ver_pkt->ver_major; 3784 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 3785 3786 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 3787 3788 ldcp->lane_in.lstate |= VSW_VER_NACK_SENT; 3789 } 3790 3791 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 3792 ver_pkt->tag.vio_sid = ldcp->local_session; 3793 vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t)); 3794 3795 vsw_next_milestone(ldcp); 3796 break; 3797 3798 case VIO_SUBTYPE_ACK: 3799 D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__); 3800 3801 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV)) 3802 return; 3803 3804 /* Store updated values */ 3805 ldcp->lane_in.ver_major = ver_pkt->ver_major; 3806 ldcp->lane_in.ver_minor = ver_pkt->ver_minor; 3807 3808 3809 ldcp->lane_out.lstate |= VSW_VER_ACK_RECV; 3810 vsw_next_milestone(ldcp); 3811 3812 break; 3813 3814 case VIO_SUBTYPE_NACK: 3815 D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__); 3816 3817 if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV)) 3818 return; 3819 3820 /* 3821 * If our peer sent us a NACK with the ver fields set to 3822 * zero then there is nothing more we can do. Otherwise see 3823 * if we support either the version suggested, or a lesser 3824 * one. 3825 */ 3826 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { 3827 DERR(vswp, "%s: peer unable to negotiate any " 3828 "further.", __func__); 3829 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; 3830 vsw_next_milestone(ldcp); 3831 return; 3832 } 3833 3834 /* 3835 * Check to see if we support this major version or 3836 * a lower one. If we don't then maj/min will be set 3837 * to zero. 3838 */ 3839 (void) vsw_supported_version(ver_pkt); 3840 if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) { 3841 /* Nothing more we can do */ 3842 DERR(vswp, "%s: version negotiation failed.\n", 3843 __func__); 3844 ldcp->lane_out.lstate |= VSW_VER_NACK_RECV; 3845 vsw_next_milestone(ldcp); 3846 } else { 3847 /* found a supported major version */ 3848 ldcp->lane_out.ver_major = ver_pkt->ver_major; 3849 ldcp->lane_out.ver_minor = ver_pkt->ver_minor; 3850 3851 D2(vswp, "%s: resending with updated values (%x, %x)", 3852 __func__, ver_pkt->ver_major, 3853 ver_pkt->ver_minor); 3854 3855 ldcp->lane_out.lstate |= VSW_VER_INFO_SENT; 3856 ver_pkt->tag.vio_sid = ldcp->local_session; 3857 ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 3858 3859 DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt); 3860 3861 vsw_send_msg(ldcp, (void *)ver_pkt, 3862 sizeof (vio_ver_msg_t)); 3863 3864 vsw_next_milestone(ldcp); 3865 3866 } 3867 break; 3868 3869 default: 3870 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 3871 ver_pkt->tag.vio_subtype); 3872 } 3873 3874 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 3875 } 3876 3877 /* 3878 * Process an attribute packet. We can end up here either because our peer 3879 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our 3880 * peer has sent us an attribute INFO message 3881 * 3882 * If its an ACK we then move to the next stage of the handshake which 3883 * is to send our descriptor ring info to our peer. If its a NACK then 3884 * there is nothing more we can (currently) do. 3885 * 3886 * If we get a valid/acceptable INFO packet (and we have already negotiated 3887 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we 3888 * NACK back and reset channel state to INACTIV. 3889 * 3890 * FUTURE: in time we will probably negotiate over attributes, but for 3891 * the moment unacceptable attributes are regarded as a fatal error. 3892 * 3893 */ 3894 void 3895 vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt) 3896 { 3897 vnet_attr_msg_t *attr_pkt; 3898 vsw_t *vswp = ldcp->ldc_vswp; 3899 vsw_port_t *port = ldcp->ldc_port; 3900 uint64_t macaddr = 0; 3901 int i; 3902 3903 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 3904 3905 /* 3906 * We know this is a ctrl/attr packet so 3907 * cast it into the correct structure. 3908 */ 3909 attr_pkt = (vnet_attr_msg_t *)pkt; 3910 3911 switch (attr_pkt->tag.vio_subtype) { 3912 case VIO_SUBTYPE_INFO: 3913 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 3914 3915 if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) 3916 return; 3917 3918 /* 3919 * If the attributes are unacceptable then we NACK back. 3920 */ 3921 if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) { 3922 3923 DERR(vswp, "%s (chan %d): invalid attributes", 3924 __func__, ldcp->ldc_id); 3925 3926 vsw_free_lane_resources(ldcp, INBOUND); 3927 3928 attr_pkt->tag.vio_sid = ldcp->local_session; 3929 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 3930 3931 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 3932 ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT; 3933 vsw_send_msg(ldcp, (void *)attr_pkt, 3934 sizeof (vnet_attr_msg_t)); 3935 3936 vsw_next_milestone(ldcp); 3937 return; 3938 } 3939 3940 /* 3941 * Otherwise store attributes for this lane and update 3942 * lane state. 3943 */ 3944 ldcp->lane_in.mtu = attr_pkt->mtu; 3945 ldcp->lane_in.addr = attr_pkt->addr; 3946 ldcp->lane_in.addr_type = attr_pkt->addr_type; 3947 ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode; 3948 ldcp->lane_in.ack_freq = attr_pkt->ack_freq; 3949 3950 macaddr = ldcp->lane_in.addr; 3951 for (i = ETHERADDRL - 1; i >= 0; i--) { 3952 port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF; 3953 macaddr >>= 8; 3954 } 3955 3956 /* create the fdb entry for this port/mac address */ 3957 (void) vsw_add_fdb(vswp, port); 3958 3959 /* setup device specifc xmit routines */ 3960 mutex_enter(&port->tx_lock); 3961 if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) { 3962 D2(vswp, "%s: mode = VIO_DRING_MODE", __func__); 3963 port->transmit = vsw_dringsend; 3964 } else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) { 3965 D2(vswp, "%s: mode = VIO_DESC_MODE", __func__); 3966 vsw_create_privring(ldcp); 3967 port->transmit = vsw_descrsend; 3968 } 3969 mutex_exit(&port->tx_lock); 3970 3971 attr_pkt->tag.vio_sid = ldcp->local_session; 3972 attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 3973 3974 DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt); 3975 3976 ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT; 3977 3978 vsw_send_msg(ldcp, (void *)attr_pkt, 3979 sizeof (vnet_attr_msg_t)); 3980 3981 vsw_next_milestone(ldcp); 3982 break; 3983 3984 case VIO_SUBTYPE_ACK: 3985 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 3986 3987 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) 3988 return; 3989 3990 ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV; 3991 vsw_next_milestone(ldcp); 3992 break; 3993 3994 case VIO_SUBTYPE_NACK: 3995 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 3996 3997 if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV)) 3998 return; 3999 4000 ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV; 4001 vsw_next_milestone(ldcp); 4002 break; 4003 4004 default: 4005 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 4006 attr_pkt->tag.vio_subtype); 4007 } 4008 4009 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 4010 } 4011 4012 /* 4013 * Process a dring info packet. We can end up here either because our peer 4014 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our 4015 * peer has sent us a dring INFO message. 4016 * 4017 * If we get a valid/acceptable INFO packet (and we have already negotiated 4018 * a version) we ACK back and update the lane state, otherwise we NACK back. 4019 * 4020 * FUTURE: nothing to stop client from sending us info on multiple dring's 4021 * but for the moment we will just use the first one we are given. 4022 * 4023 */ 4024 void 4025 vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt) 4026 { 4027 vio_dring_reg_msg_t *dring_pkt; 4028 vsw_t *vswp = ldcp->ldc_vswp; 4029 ldc_mem_info_t minfo; 4030 dring_info_t *dp, *dbp; 4031 int dring_found = 0; 4032 4033 /* 4034 * We know this is a ctrl/dring packet so 4035 * cast it into the correct structure. 4036 */ 4037 dring_pkt = (vio_dring_reg_msg_t *)pkt; 4038 4039 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 4040 4041 switch (dring_pkt->tag.vio_subtype) { 4042 case VIO_SUBTYPE_INFO: 4043 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4044 4045 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 4046 return; 4047 4048 /* 4049 * If the dring params are unacceptable then we NACK back. 4050 */ 4051 if (vsw_check_dring_info(dring_pkt)) { 4052 4053 DERR(vswp, "%s (%lld): invalid dring info", 4054 __func__, ldcp->ldc_id); 4055 4056 vsw_free_lane_resources(ldcp, INBOUND); 4057 4058 dring_pkt->tag.vio_sid = ldcp->local_session; 4059 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4060 4061 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 4062 4063 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 4064 4065 vsw_send_msg(ldcp, (void *)dring_pkt, 4066 sizeof (vio_dring_reg_msg_t)); 4067 4068 vsw_next_milestone(ldcp); 4069 return; 4070 } 4071 4072 /* 4073 * Otherwise, attempt to map in the dring using the 4074 * cookie. If that succeeds we send back a unique dring 4075 * identifier that the sending side will use in future 4076 * to refer to this descriptor ring. 4077 */ 4078 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 4079 4080 dp->num_descriptors = dring_pkt->num_descriptors; 4081 dp->descriptor_size = dring_pkt->descriptor_size; 4082 dp->options = dring_pkt->options; 4083 dp->ncookies = dring_pkt->ncookies; 4084 4085 /* 4086 * Note: should only get one cookie. Enforced in 4087 * the ldc layer. 4088 */ 4089 bcopy(&dring_pkt->cookie[0], &dp->cookie[0], 4090 sizeof (ldc_mem_cookie_t)); 4091 4092 D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__, 4093 dp->num_descriptors, dp->descriptor_size); 4094 D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__, 4095 dp->options, dp->ncookies); 4096 4097 if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0], 4098 dp->ncookies, dp->num_descriptors, 4099 dp->descriptor_size, LDC_SHADOW_MAP, 4100 &(dp->handle))) != 0) { 4101 4102 DERR(vswp, "%s: dring_map failed\n", __func__); 4103 4104 kmem_free(dp, sizeof (dring_info_t)); 4105 vsw_free_lane_resources(ldcp, INBOUND); 4106 4107 dring_pkt->tag.vio_sid = ldcp->local_session; 4108 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4109 4110 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 4111 4112 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 4113 vsw_send_msg(ldcp, (void *)dring_pkt, 4114 sizeof (vio_dring_reg_msg_t)); 4115 4116 vsw_next_milestone(ldcp); 4117 return; 4118 } 4119 4120 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 4121 4122 DERR(vswp, "%s: dring_addr failed\n", __func__); 4123 4124 kmem_free(dp, sizeof (dring_info_t)); 4125 vsw_free_lane_resources(ldcp, INBOUND); 4126 4127 dring_pkt->tag.vio_sid = ldcp->local_session; 4128 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; 4129 4130 DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt); 4131 4132 ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT; 4133 vsw_send_msg(ldcp, (void *)dring_pkt, 4134 sizeof (vio_dring_reg_msg_t)); 4135 4136 vsw_next_milestone(ldcp); 4137 return; 4138 } else { 4139 /* store the address of the pub part of ring */ 4140 dp->pub_addr = minfo.vaddr; 4141 } 4142 4143 /* no private section as we are importing */ 4144 dp->priv_addr = NULL; 4145 4146 /* 4147 * Using simple mono increasing int for ident at 4148 * the moment. 4149 */ 4150 dp->ident = ldcp->next_ident; 4151 ldcp->next_ident++; 4152 4153 dp->end_idx = 0; 4154 dp->next = NULL; 4155 4156 /* 4157 * Link it onto the end of the list of drings 4158 * for this lane. 4159 */ 4160 if (ldcp->lane_in.dringp == NULL) { 4161 D2(vswp, "%s: adding first INBOUND dring", __func__); 4162 ldcp->lane_in.dringp = dp; 4163 } else { 4164 dbp = ldcp->lane_in.dringp; 4165 4166 while (dbp->next != NULL) 4167 dbp = dbp->next; 4168 4169 dbp->next = dp; 4170 } 4171 4172 /* acknowledge it */ 4173 dring_pkt->tag.vio_sid = ldcp->local_session; 4174 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4175 dring_pkt->dring_ident = dp->ident; 4176 4177 vsw_send_msg(ldcp, (void *)dring_pkt, 4178 sizeof (vio_dring_reg_msg_t)); 4179 4180 ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT; 4181 vsw_next_milestone(ldcp); 4182 break; 4183 4184 case VIO_SUBTYPE_ACK: 4185 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4186 4187 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV)) 4188 return; 4189 4190 /* 4191 * Peer is acknowledging our dring info and will have 4192 * sent us a dring identifier which we will use to 4193 * refer to this ring w.r.t. our peer. 4194 */ 4195 dp = ldcp->lane_out.dringp; 4196 if (dp != NULL) { 4197 /* 4198 * Find the ring this ident should be associated 4199 * with. 4200 */ 4201 if (vsw_dring_match(dp, dring_pkt)) { 4202 dring_found = 1; 4203 4204 } else while (dp != NULL) { 4205 if (vsw_dring_match(dp, dring_pkt)) { 4206 dring_found = 1; 4207 break; 4208 } 4209 dp = dp->next; 4210 } 4211 4212 if (dring_found == 0) { 4213 DERR(NULL, "%s: unrecognised ring cookie", 4214 __func__); 4215 vsw_restart_handshake(ldcp); 4216 return; 4217 } 4218 4219 } else { 4220 DERR(vswp, "%s: DRING ACK received but no drings " 4221 "allocated", __func__); 4222 vsw_restart_handshake(ldcp); 4223 return; 4224 } 4225 4226 /* store ident */ 4227 dp->ident = dring_pkt->dring_ident; 4228 ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV; 4229 vsw_next_milestone(ldcp); 4230 break; 4231 4232 case VIO_SUBTYPE_NACK: 4233 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4234 4235 if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV)) 4236 return; 4237 4238 ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV; 4239 vsw_next_milestone(ldcp); 4240 break; 4241 4242 default: 4243 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 4244 dring_pkt->tag.vio_subtype); 4245 } 4246 4247 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 4248 } 4249 4250 /* 4251 * Process a request from peer to unregister a dring. 4252 * 4253 * For the moment we just restart the handshake if our 4254 * peer endpoint attempts to unregister a dring. 4255 */ 4256 void 4257 vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt) 4258 { 4259 vsw_t *vswp = ldcp->ldc_vswp; 4260 vio_dring_unreg_msg_t *dring_pkt; 4261 4262 /* 4263 * We know this is a ctrl/dring packet so 4264 * cast it into the correct structure. 4265 */ 4266 dring_pkt = (vio_dring_unreg_msg_t *)pkt; 4267 4268 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4269 4270 switch (dring_pkt->tag.vio_subtype) { 4271 case VIO_SUBTYPE_INFO: 4272 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4273 4274 DWARN(vswp, "%s: restarting handshake..", __func__); 4275 vsw_restart_handshake(ldcp); 4276 break; 4277 4278 case VIO_SUBTYPE_ACK: 4279 D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4280 4281 DWARN(vswp, "%s: restarting handshake..", __func__); 4282 vsw_restart_handshake(ldcp); 4283 break; 4284 4285 case VIO_SUBTYPE_NACK: 4286 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4287 4288 DWARN(vswp, "%s: restarting handshake..", __func__); 4289 vsw_restart_handshake(ldcp); 4290 break; 4291 4292 default: 4293 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 4294 dring_pkt->tag.vio_subtype); 4295 vsw_restart_handshake(ldcp); 4296 } 4297 4298 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4299 } 4300 4301 #define SND_MCST_NACK(ldcp, pkt) \ 4302 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 4303 pkt->tag.vio_sid = ldcp->local_session; \ 4304 vsw_send_msg(ldcp, (void *)pkt, sizeof (vnet_mcast_msg_t)); 4305 4306 /* 4307 * Process a multicast request from a vnet. 4308 * 4309 * Vnet's specify a multicast address that they are interested in. This 4310 * address is used as a key into the hash table which forms the multicast 4311 * forwarding database (mFDB). 4312 * 4313 * The table keys are the multicast addresses, while the table entries 4314 * are pointers to lists of ports which wish to receive packets for the 4315 * specified multicast address. 4316 * 4317 * When a multicast packet is being switched we use the address as a key 4318 * into the hash table, and then walk the appropriate port list forwarding 4319 * the pkt to each port in turn. 4320 * 4321 * If a vnet is no longer interested in a particular multicast grouping 4322 * we simply find the correct location in the hash table and then delete 4323 * the relevant port from the port list. 4324 * 4325 * To deal with the case whereby a port is being deleted without first 4326 * removing itself from the lists in the hash table, we maintain a list 4327 * of multicast addresses the port has registered an interest in, within 4328 * the port structure itself. We then simply walk that list of addresses 4329 * using them as keys into the hash table and remove the port from the 4330 * appropriate lists. 4331 */ 4332 static void 4333 vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt) 4334 { 4335 vnet_mcast_msg_t *mcst_pkt; 4336 vsw_port_t *port = ldcp->ldc_port; 4337 vsw_t *vswp = ldcp->ldc_vswp; 4338 int i; 4339 4340 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4341 4342 /* 4343 * We know this is a ctrl/mcast packet so 4344 * cast it into the correct structure. 4345 */ 4346 mcst_pkt = (vnet_mcast_msg_t *)pkt; 4347 4348 switch (mcst_pkt->tag.vio_subtype) { 4349 case VIO_SUBTYPE_INFO: 4350 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4351 4352 /* 4353 * Check if in correct state to receive a multicast 4354 * message (i.e. handshake complete). If not reset 4355 * the handshake. 4356 */ 4357 if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV)) 4358 return; 4359 4360 /* 4361 * Before attempting to add or remove address check 4362 * that they are valid multicast addresses. 4363 * If not, then NACK back. 4364 */ 4365 for (i = 0; i < mcst_pkt->count; i++) { 4366 if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) { 4367 DERR(vswp, "%s: invalid multicast address", 4368 __func__); 4369 SND_MCST_NACK(ldcp, mcst_pkt); 4370 return; 4371 } 4372 } 4373 4374 /* 4375 * Now add/remove the addresses. If this fails we 4376 * NACK back. 4377 */ 4378 if (vsw_add_rem_mcst(mcst_pkt, port) != 0) { 4379 SND_MCST_NACK(ldcp, mcst_pkt); 4380 return; 4381 } 4382 4383 mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4384 mcst_pkt->tag.vio_sid = ldcp->local_session; 4385 4386 DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt); 4387 4388 vsw_send_msg(ldcp, (void *)mcst_pkt, 4389 sizeof (vnet_mcast_msg_t)); 4390 break; 4391 4392 case VIO_SUBTYPE_ACK: 4393 DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 4394 4395 /* 4396 * We shouldn't ever get a multicast ACK message as 4397 * at the moment we never request multicast addresses 4398 * to be set on some other device. This may change in 4399 * the future if we have cascading switches. 4400 */ 4401 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV)) 4402 return; 4403 4404 /* Do nothing */ 4405 break; 4406 4407 case VIO_SUBTYPE_NACK: 4408 DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4409 4410 /* 4411 * We shouldn't get a multicast NACK packet for the 4412 * same reasons as we shouldn't get a ACK packet. 4413 */ 4414 if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV)) 4415 return; 4416 4417 /* Do nothing */ 4418 break; 4419 4420 default: 4421 DERR(vswp, "%s: unknown vio_subtype %x\n", __func__, 4422 mcst_pkt->tag.vio_subtype); 4423 } 4424 4425 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4426 } 4427 4428 static void 4429 vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt) 4430 { 4431 vio_rdx_msg_t *rdx_pkt; 4432 vsw_t *vswp = ldcp->ldc_vswp; 4433 4434 /* 4435 * We know this is a ctrl/rdx packet so 4436 * cast it into the correct structure. 4437 */ 4438 rdx_pkt = (vio_rdx_msg_t *)pkt; 4439 4440 D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id); 4441 4442 switch (rdx_pkt->tag.vio_subtype) { 4443 case VIO_SUBTYPE_INFO: 4444 D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 4445 4446 if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV)) 4447 return; 4448 4449 rdx_pkt->tag.vio_sid = ldcp->local_session; 4450 rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4451 4452 DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt); 4453 4454 ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT; 4455 4456 vsw_send_msg(ldcp, (void *)rdx_pkt, 4457 sizeof (vio_rdx_msg_t)); 4458 4459 vsw_next_milestone(ldcp); 4460 break; 4461 4462 case VIO_SUBTYPE_ACK: 4463 /* 4464 * Should be handled in-band by callback handler. 4465 */ 4466 DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__); 4467 vsw_restart_handshake(ldcp); 4468 break; 4469 4470 case VIO_SUBTYPE_NACK: 4471 D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 4472 4473 if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV)) 4474 return; 4475 4476 ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV; 4477 vsw_next_milestone(ldcp); 4478 break; 4479 4480 default: 4481 DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__, 4482 rdx_pkt->tag.vio_subtype); 4483 } 4484 4485 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4486 } 4487 4488 static void 4489 vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag) 4490 { 4491 uint16_t env = tag.vio_subtype_env; 4492 vsw_t *vswp = ldcp->ldc_vswp; 4493 4494 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4495 4496 /* session id check */ 4497 if (ldcp->session_status & VSW_PEER_SESSION) { 4498 if (ldcp->peer_session != tag.vio_sid) { 4499 DERR(vswp, "%s (chan %d): invalid session id (%llx)", 4500 __func__, ldcp->ldc_id, tag.vio_sid); 4501 vsw_restart_handshake(ldcp); 4502 return; 4503 } 4504 } 4505 4506 /* 4507 * It is an error for us to be getting data packets 4508 * before the handshake has completed. 4509 */ 4510 if (ldcp->hphase != VSW_MILESTONE4) { 4511 DERR(vswp, "%s: got data packet before handshake complete " 4512 "hphase %d (%x: %x)", __func__, ldcp->hphase, 4513 ldcp->lane_in.lstate, ldcp->lane_out.lstate); 4514 DUMP_FLAGS(ldcp->lane_in.lstate); 4515 DUMP_FLAGS(ldcp->lane_out.lstate); 4516 vsw_restart_handshake(ldcp); 4517 return; 4518 } 4519 4520 /* 4521 * Switch on vio_subtype envelope, then let lower routines 4522 * decide if its an INFO, ACK or NACK packet. 4523 */ 4524 if (env == VIO_DRING_DATA) { 4525 vsw_process_data_dring_pkt(ldcp, dpkt); 4526 } else if (env == VIO_PKT_DATA) { 4527 vsw_process_data_raw_pkt(ldcp, dpkt); 4528 } else if (env == VIO_DESC_DATA) { 4529 vsw_process_data_ibnd_pkt(ldcp, dpkt); 4530 } else { 4531 DERR(vswp, "%s : unknown vio_subtype_env (%x)\n", 4532 __func__, env); 4533 } 4534 4535 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 4536 } 4537 4538 #define SND_DRING_NACK(ldcp, pkt) \ 4539 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 4540 pkt->tag.vio_sid = ldcp->local_session; \ 4541 vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_dring_msg_t)); 4542 4543 static void 4544 vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) 4545 { 4546 vio_dring_msg_t *dring_pkt; 4547 vnet_public_desc_t *pub_addr = NULL; 4548 vsw_private_desc_t *priv_addr = NULL; 4549 dring_info_t *dp = NULL; 4550 vsw_t *vswp = ldcp->ldc_vswp; 4551 mblk_t *mp = NULL; 4552 mblk_t *bp = NULL; 4553 mblk_t *bpt = NULL; 4554 size_t nbytes = 0; 4555 size_t off = 0; 4556 uint64_t ncookies = 0; 4557 uint64_t chain = 0; 4558 uint64_t j, len; 4559 uint32_t pos, start, datalen; 4560 uint32_t range_start, range_end; 4561 int32_t end, num, cnt = 0; 4562 int i, rv; 4563 boolean_t ack_needed = B_FALSE; 4564 boolean_t prev_desc_ack = B_FALSE; 4565 int read_attempts = 0; 4566 4567 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 4568 4569 /* 4570 * We know this is a data/dring packet so 4571 * cast it into the correct structure. 4572 */ 4573 dring_pkt = (vio_dring_msg_t *)dpkt; 4574 4575 /* 4576 * Switch on the vio_subtype. If its INFO then we need to 4577 * process the data. If its an ACK we need to make sure 4578 * it makes sense (i.e did we send an earlier data/info), 4579 * and if its a NACK then we maybe attempt a retry. 4580 */ 4581 switch (dring_pkt->tag.vio_subtype) { 4582 case VIO_SUBTYPE_INFO: 4583 D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id); 4584 4585 if ((dp = vsw_ident2dring(&ldcp->lane_in, 4586 dring_pkt->dring_ident)) == NULL) { 4587 4588 DERR(vswp, "%s(%lld): unable to find dring from " 4589 "ident 0x%llx", __func__, ldcp->ldc_id, 4590 dring_pkt->dring_ident); 4591 4592 SND_DRING_NACK(ldcp, dring_pkt); 4593 return; 4594 } 4595 4596 start = pos = dring_pkt->start_idx; 4597 end = dring_pkt->end_idx; 4598 len = dp->num_descriptors; 4599 4600 range_start = range_end = pos; 4601 4602 D2(vswp, "%s(%lld): start index %ld : end %ld\n", 4603 __func__, ldcp->ldc_id, start, end); 4604 4605 if (end == -1) { 4606 num = -1; 4607 } else if (end >= 0) { 4608 num = end >= pos ? 4609 end - pos + 1: (len - pos + 1) + end; 4610 4611 /* basic sanity check */ 4612 if (end > len) { 4613 DERR(vswp, "%s(%lld): endpoint %lld outside " 4614 "ring length %lld", __func__, 4615 ldcp->ldc_id, end, len); 4616 4617 SND_DRING_NACK(ldcp, dring_pkt); 4618 return; 4619 } 4620 } else { 4621 DERR(vswp, "%s(%lld): invalid endpoint %lld", 4622 __func__, ldcp->ldc_id, end); 4623 SND_DRING_NACK(ldcp, dring_pkt); 4624 return; 4625 } 4626 4627 while (cnt != num) { 4628 vsw_recheck_desc: 4629 if ((rv = ldc_mem_dring_acquire(dp->handle, 4630 pos, pos)) != 0) { 4631 DERR(vswp, "%s(%lld): unable to acquire " 4632 "descriptor at pos %d: err %d", 4633 __func__, pos, ldcp->ldc_id, rv); 4634 SND_DRING_NACK(ldcp, dring_pkt); 4635 return; 4636 } 4637 4638 pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos; 4639 4640 /* 4641 * When given a bounded range of descriptors 4642 * to process, its an error to hit a descriptor 4643 * which is not ready. In the non-bounded case 4644 * (end_idx == -1) this simply indicates we have 4645 * reached the end of the current active range. 4646 */ 4647 if (pub_addr->hdr.dstate != VIO_DESC_READY) { 4648 /* unbound - no error */ 4649 if (end == -1) { 4650 if (read_attempts == vsw_read_attempts) 4651 break; 4652 4653 delay(drv_usectohz(vsw_desc_delay)); 4654 read_attempts++; 4655 goto vsw_recheck_desc; 4656 } 4657 4658 /* bounded - error - so NACK back */ 4659 DERR(vswp, "%s(%lld): descriptor not READY " 4660 "(%d)", __func__, ldcp->ldc_id, 4661 pub_addr->hdr.dstate); 4662 SND_DRING_NACK(ldcp, dring_pkt); 4663 return; 4664 } 4665 4666 DTRACE_PROBE1(read_attempts, int, read_attempts); 4667 4668 range_end = pos; 4669 4670 /* 4671 * If we ACK'd the previous descriptor then now 4672 * record the new range start position for later 4673 * ACK's. 4674 */ 4675 if (prev_desc_ack) { 4676 range_start = pos; 4677 4678 D2(vswp, "%s(%lld): updating range start " 4679 "to be %d", __func__, ldcp->ldc_id, 4680 range_start); 4681 4682 prev_desc_ack = B_FALSE; 4683 } 4684 4685 /* 4686 * Data is padded to align on 8 byte boundary, 4687 * datalen is actual data length, i.e. minus that 4688 * padding. 4689 */ 4690 datalen = pub_addr->nbytes; 4691 4692 /* 4693 * Does peer wish us to ACK when we have finished 4694 * with this descriptor ? 4695 */ 4696 if (pub_addr->hdr.ack) 4697 ack_needed = B_TRUE; 4698 4699 D2(vswp, "%s(%lld): processing desc %lld at pos" 4700 " 0x%llx : dstate 0x%lx : datalen 0x%lx", 4701 __func__, ldcp->ldc_id, pos, pub_addr, 4702 pub_addr->hdr.dstate, datalen); 4703 4704 /* 4705 * Mark that we are starting to process descriptor. 4706 */ 4707 pub_addr->hdr.dstate = VIO_DESC_ACCEPTED; 4708 4709 mp = vio_allocb(ldcp->rxh); 4710 if (mp == NULL) { 4711 /* 4712 * No free receive buffers available, so 4713 * fallback onto allocb(9F). Make sure that 4714 * we get a data buffer which is a multiple 4715 * of 8 as this is required by ldc_mem_copy. 4716 */ 4717 DTRACE_PROBE(allocb); 4718 mp = allocb(datalen + VNET_IPALIGN + 8, 4719 BPRI_MED); 4720 } 4721 4722 /* 4723 * Ensure that we ask ldc for an aligned 4724 * number of bytes. 4725 */ 4726 nbytes = datalen + VNET_IPALIGN; 4727 if (nbytes & 0x7) { 4728 off = 8 - (nbytes & 0x7); 4729 nbytes += off; 4730 } 4731 4732 ncookies = pub_addr->ncookies; 4733 rv = ldc_mem_copy(ldcp->ldc_handle, 4734 (caddr_t)mp->b_rptr, 0, &nbytes, 4735 pub_addr->memcookie, ncookies, 4736 LDC_COPY_IN); 4737 4738 if (rv != 0) { 4739 DERR(vswp, "%s(%d): unable to copy in " 4740 "data from %d cookies in desc %d" 4741 " (rv %d)", __func__, ldcp->ldc_id, 4742 ncookies, pos, rv); 4743 freemsg(mp); 4744 4745 pub_addr->hdr.dstate = VIO_DESC_DONE; 4746 (void) ldc_mem_dring_release(dp->handle, 4747 pos, pos); 4748 break; 4749 } else { 4750 D2(vswp, "%s(%d): copied in %ld bytes" 4751 " using %d cookies", __func__, 4752 ldcp->ldc_id, nbytes, ncookies); 4753 } 4754 4755 /* adjust the read pointer to skip over the padding */ 4756 mp->b_rptr += VNET_IPALIGN; 4757 4758 /* point to the actual end of data */ 4759 mp->b_wptr = mp->b_rptr + datalen; 4760 4761 /* build a chain of received packets */ 4762 if (bp == NULL) { 4763 /* first pkt */ 4764 bp = mp; 4765 bp->b_next = bp->b_prev = NULL; 4766 bpt = bp; 4767 chain = 1; 4768 } else { 4769 mp->b_next = NULL; 4770 mp->b_prev = bpt; 4771 bpt->b_next = mp; 4772 bpt = mp; 4773 chain++; 4774 } 4775 4776 /* mark we are finished with this descriptor */ 4777 pub_addr->hdr.dstate = VIO_DESC_DONE; 4778 4779 (void) ldc_mem_dring_release(dp->handle, pos, pos); 4780 4781 /* 4782 * Send an ACK back to peer if requested. 4783 */ 4784 if (ack_needed) { 4785 ack_needed = B_FALSE; 4786 4787 dring_pkt->start_idx = range_start; 4788 dring_pkt->end_idx = range_end; 4789 4790 DERR(vswp, "%s(%lld): processed %d %d, ACK" 4791 " requested", __func__, ldcp->ldc_id, 4792 dring_pkt->start_idx, 4793 dring_pkt->end_idx); 4794 4795 dring_pkt->dring_process_state = VIO_DP_ACTIVE; 4796 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4797 dring_pkt->tag.vio_sid = ldcp->local_session; 4798 vsw_send_msg(ldcp, (void *)dring_pkt, 4799 sizeof (vio_dring_msg_t)); 4800 4801 prev_desc_ack = B_TRUE; 4802 range_start = pos; 4803 } 4804 4805 /* next descriptor */ 4806 pos = (pos + 1) % len; 4807 cnt++; 4808 4809 /* 4810 * Break out of loop here and stop processing to 4811 * allow some other network device (or disk) to 4812 * get access to the cpu. 4813 */ 4814 /* send the chain of packets to be switched */ 4815 if (chain > vsw_chain_len) { 4816 D3(vswp, "%s(%lld): switching chain of %d " 4817 "msgs", __func__, ldcp->ldc_id, chain); 4818 vsw_switch_frame(vswp, bp, VSW_VNETPORT, 4819 ldcp->ldc_port, NULL); 4820 bp = NULL; 4821 break; 4822 } 4823 } 4824 4825 /* send the chain of packets to be switched */ 4826 if (bp != NULL) { 4827 D3(vswp, "%s(%lld): switching chain of %d msgs", 4828 __func__, ldcp->ldc_id, chain); 4829 vsw_switch_frame(vswp, bp, VSW_VNETPORT, 4830 ldcp->ldc_port, NULL); 4831 } 4832 4833 DTRACE_PROBE1(msg_cnt, int, cnt); 4834 4835 /* 4836 * We are now finished so ACK back with the state 4837 * set to STOPPING so our peer knows we are finished 4838 */ 4839 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; 4840 dring_pkt->tag.vio_sid = ldcp->local_session; 4841 4842 dring_pkt->dring_process_state = VIO_DP_STOPPED; 4843 4844 DTRACE_PROBE(stop_process_sent); 4845 4846 /* 4847 * We have not processed any more descriptors beyond 4848 * the last one we ACK'd. 4849 */ 4850 if (prev_desc_ack) 4851 range_start = range_end; 4852 4853 dring_pkt->start_idx = range_start; 4854 dring_pkt->end_idx = range_end; 4855 4856 D2(vswp, "%s(%lld) processed : %d : %d, now stopping", 4857 __func__, ldcp->ldc_id, dring_pkt->start_idx, 4858 dring_pkt->end_idx); 4859 4860 vsw_send_msg(ldcp, (void *)dring_pkt, 4861 sizeof (vio_dring_msg_t)); 4862 break; 4863 4864 case VIO_SUBTYPE_ACK: 4865 D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id); 4866 /* 4867 * Verify that the relevant descriptors are all 4868 * marked as DONE 4869 */ 4870 if ((dp = vsw_ident2dring(&ldcp->lane_out, 4871 dring_pkt->dring_ident)) == NULL) { 4872 DERR(vswp, "%s: unknown ident in ACK", __func__); 4873 return; 4874 } 4875 4876 pub_addr = (vnet_public_desc_t *)dp->pub_addr; 4877 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 4878 4879 start = end = 0; 4880 start = dring_pkt->start_idx; 4881 end = dring_pkt->end_idx; 4882 len = dp->num_descriptors; 4883 4884 j = num = 0; 4885 /* calculate # descriptors taking into a/c wrap around */ 4886 num = end >= start ? end - start + 1: (len - start + 1) + end; 4887 4888 D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n", 4889 __func__, ldcp->ldc_id, start, end, num); 4890 4891 mutex_enter(&dp->dlock); 4892 dp->last_ack_recv = end; 4893 mutex_exit(&dp->dlock); 4894 4895 for (i = start; j < num; i = (i + 1) % len, j++) { 4896 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 4897 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 4898 4899 /* 4900 * If the last descriptor in a range has the ACK 4901 * bit set then we will get two messages from our 4902 * peer relating to it. The normal ACK msg and then 4903 * a subsequent STOP msg. The first message will have 4904 * resulted in the descriptor being reclaimed and 4905 * its state set to FREE so when we encounter a non 4906 * DONE descriptor we need to check to see if its 4907 * because we have just reclaimed it. 4908 */ 4909 mutex_enter(&priv_addr->dstate_lock); 4910 if (pub_addr->hdr.dstate == VIO_DESC_DONE) { 4911 /* clear all the fields */ 4912 bzero(priv_addr->datap, priv_addr->datalen); 4913 priv_addr->datalen = 0; 4914 4915 pub_addr->hdr.dstate = VIO_DESC_FREE; 4916 pub_addr->hdr.ack = 0; 4917 4918 priv_addr->dstate = VIO_DESC_FREE; 4919 mutex_exit(&priv_addr->dstate_lock); 4920 4921 D3(vswp, "clearing descp %d : pub state " 4922 "0x%llx : priv state 0x%llx", i, 4923 pub_addr->hdr.dstate, 4924 priv_addr->dstate); 4925 4926 } else { 4927 mutex_exit(&priv_addr->dstate_lock); 4928 4929 if (dring_pkt->dring_process_state != 4930 VIO_DP_STOPPED) { 4931 DERR(vswp, "%s: descriptor %lld at pos " 4932 " 0x%llx not DONE (0x%lx)\n", 4933 __func__, i, pub_addr, 4934 pub_addr->hdr.dstate); 4935 return; 4936 } 4937 } 4938 } 4939 4940 /* 4941 * If our peer is stopping processing descriptors then 4942 * we check to make sure it has processed all the descriptors 4943 * we have updated. If not then we send it a new message 4944 * to prompt it to restart. 4945 */ 4946 if (dring_pkt->dring_process_state == VIO_DP_STOPPED) { 4947 DTRACE_PROBE(stop_process_recv); 4948 D2(vswp, "%s(%lld): got stopping msg : %d : %d", 4949 __func__, ldcp->ldc_id, dring_pkt->start_idx, 4950 dring_pkt->end_idx); 4951 4952 /* 4953 * Check next descriptor in public section of ring. 4954 * If its marked as READY then we need to prompt our 4955 * peer to start processing the ring again. 4956 */ 4957 i = (end + 1) % len; 4958 pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; 4959 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 4960 4961 /* 4962 * Hold the restart lock across all of this to 4963 * make sure that its not possible for us to 4964 * decide that a msg needs to be sent in the future 4965 * but the sending code having already checked is 4966 * about to exit. 4967 */ 4968 mutex_enter(&dp->restart_lock); 4969 mutex_enter(&priv_addr->dstate_lock); 4970 if (pub_addr->hdr.dstate == VIO_DESC_READY) { 4971 4972 mutex_exit(&priv_addr->dstate_lock); 4973 4974 dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; 4975 dring_pkt->tag.vio_sid = ldcp->local_session; 4976 4977 mutex_enter(&ldcp->lane_out.seq_lock); 4978 dring_pkt->seq_num = ldcp->lane_out.seq_num++; 4979 mutex_exit(&ldcp->lane_out.seq_lock); 4980 4981 dring_pkt->start_idx = (end + 1) % len; 4982 dring_pkt->end_idx = -1; 4983 4984 D2(vswp, "%s(%lld) : sending restart msg:" 4985 " %d : %d", __func__, ldcp->ldc_id, 4986 dring_pkt->start_idx, 4987 dring_pkt->end_idx); 4988 4989 vsw_send_msg(ldcp, (void *)dring_pkt, 4990 sizeof (vio_dring_msg_t)); 4991 } else { 4992 mutex_exit(&priv_addr->dstate_lock); 4993 dp->restart_reqd = B_TRUE; 4994 } 4995 mutex_exit(&dp->restart_lock); 4996 } 4997 break; 4998 4999 case VIO_SUBTYPE_NACK: 5000 DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK", 5001 __func__, ldcp->ldc_id); 5002 /* 5003 * Something is badly wrong if we are getting NACK's 5004 * for our data pkts. So reset the channel. 5005 */ 5006 vsw_restart_handshake(ldcp); 5007 5008 break; 5009 5010 default: 5011 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 5012 ldcp->ldc_id, dring_pkt->tag.vio_subtype); 5013 } 5014 5015 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5016 } 5017 5018 /* 5019 * VIO_PKT_DATA (a.k.a raw data mode ) 5020 * 5021 * Note - currently not supported. Do nothing. 5022 */ 5023 static void 5024 vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt) 5025 { 5026 _NOTE(ARGUNUSED(dpkt)) 5027 5028 D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 5029 5030 DERR(NULL, "%s (%lld): currently not supported", 5031 __func__, ldcp->ldc_id); 5032 5033 D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 5034 } 5035 5036 #define SND_IBND_DESC_NACK(ldcp, pkt) \ 5037 pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \ 5038 pkt->tag.vio_sid = ldcp->local_session; \ 5039 vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_ibnd_desc_t)); 5040 5041 /* 5042 * Process an in-band descriptor message (most likely from 5043 * OBP). 5044 */ 5045 static void 5046 vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) 5047 { 5048 vio_ibnd_desc_t *ibnd_desc; 5049 dring_info_t *dp = NULL; 5050 vsw_private_desc_t *priv_addr = NULL; 5051 vsw_t *vswp = ldcp->ldc_vswp; 5052 mblk_t *mp = NULL; 5053 size_t nbytes = 0; 5054 size_t off = 0; 5055 uint64_t idx = 0; 5056 uint32_t num = 1, len, datalen = 0; 5057 uint64_t ncookies = 0; 5058 int i, rv; 5059 int j = 0; 5060 5061 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5062 5063 ibnd_desc = (vio_ibnd_desc_t *)pkt; 5064 5065 switch (ibnd_desc->hdr.tag.vio_subtype) { 5066 case VIO_SUBTYPE_INFO: 5067 D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__); 5068 5069 if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV)) 5070 return; 5071 5072 /* 5073 * Data is padded to align on a 8 byte boundary, 5074 * nbytes is actual data length, i.e. minus that 5075 * padding. 5076 */ 5077 datalen = ibnd_desc->nbytes; 5078 5079 D2(vswp, "%s(%lld): processing inband desc : " 5080 ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen); 5081 5082 ncookies = ibnd_desc->ncookies; 5083 5084 /* 5085 * allocb(9F) returns an aligned data block. We 5086 * need to ensure that we ask ldc for an aligned 5087 * number of bytes also. 5088 */ 5089 nbytes = datalen; 5090 if (nbytes & 0x7) { 5091 off = 8 - (nbytes & 0x7); 5092 nbytes += off; 5093 } 5094 5095 mp = allocb(datalen, BPRI_MED); 5096 if (mp == NULL) { 5097 DERR(vswp, "%s(%lld): allocb failed", 5098 __func__, ldcp->ldc_id); 5099 return; 5100 } 5101 5102 rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr, 5103 0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies, 5104 LDC_COPY_IN); 5105 5106 if (rv != 0) { 5107 DERR(vswp, "%s(%d): unable to copy in data from " 5108 "%d cookie(s)", __func__, 5109 ldcp->ldc_id, ncookies); 5110 freemsg(mp); 5111 return; 5112 } else { 5113 D2(vswp, "%s(%d): copied in %ld bytes using %d " 5114 "cookies", __func__, ldcp->ldc_id, nbytes, 5115 ncookies); 5116 } 5117 5118 /* point to the actual end of data */ 5119 mp->b_wptr = mp->b_rptr + datalen; 5120 5121 /* 5122 * We ACK back every in-band descriptor message we process 5123 */ 5124 ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK; 5125 ibnd_desc->hdr.tag.vio_sid = ldcp->local_session; 5126 vsw_send_msg(ldcp, (void *)ibnd_desc, 5127 sizeof (vio_ibnd_desc_t)); 5128 5129 /* send the packet to be switched */ 5130 vsw_switch_frame(vswp, mp, VSW_VNETPORT, 5131 ldcp->ldc_port, NULL); 5132 5133 break; 5134 5135 case VIO_SUBTYPE_ACK: 5136 D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__); 5137 5138 /* Verify the ACK is valid */ 5139 idx = ibnd_desc->hdr.desc_handle; 5140 5141 if (idx >= VSW_RING_NUM_EL) { 5142 cmn_err(CE_WARN, "%s: corrupted ACK received " 5143 "(idx %ld)", __func__, idx); 5144 return; 5145 } 5146 5147 if ((dp = ldcp->lane_out.dringp) == NULL) { 5148 DERR(vswp, "%s: no dring found", __func__); 5149 return; 5150 } 5151 5152 len = dp->num_descriptors; 5153 /* 5154 * If the descriptor we are being ACK'ed for is not the 5155 * one we expected, then pkts were lost somwhere, either 5156 * when we tried to send a msg, or a previous ACK msg from 5157 * our peer. In either case we now reclaim the descriptors 5158 * in the range from the last ACK we received up to the 5159 * current ACK. 5160 */ 5161 if (idx != dp->last_ack_recv) { 5162 DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)", 5163 __func__, dp->last_ack_recv, idx); 5164 num = idx >= dp->last_ack_recv ? 5165 idx - dp->last_ack_recv + 1: 5166 (len - dp->last_ack_recv + 1) + idx; 5167 } 5168 5169 /* 5170 * When we sent the in-band message to our peer we 5171 * marked the copy in our private ring as READY. We now 5172 * check that the descriptor we are being ACK'ed for is in 5173 * fact READY, i.e. it is one we have shared with our peer. 5174 * 5175 * If its not we flag an error, but still reset the descr 5176 * back to FREE. 5177 */ 5178 for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) { 5179 priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; 5180 mutex_enter(&priv_addr->dstate_lock); 5181 if (priv_addr->dstate != VIO_DESC_READY) { 5182 DERR(vswp, "%s: (%ld) desc at index %ld not " 5183 "READY (0x%lx)", __func__, 5184 ldcp->ldc_id, idx, priv_addr->dstate); 5185 DERR(vswp, "%s: bound %d: ncookies %ld : " 5186 "datalen %ld", __func__, 5187 priv_addr->bound, priv_addr->ncookies, 5188 priv_addr->datalen); 5189 } 5190 D2(vswp, "%s: (%lld) freeing descp at %lld", __func__, 5191 ldcp->ldc_id, idx); 5192 /* release resources associated with sent msg */ 5193 bzero(priv_addr->datap, priv_addr->datalen); 5194 priv_addr->datalen = 0; 5195 priv_addr->dstate = VIO_DESC_FREE; 5196 mutex_exit(&priv_addr->dstate_lock); 5197 } 5198 /* update to next expected value */ 5199 dp->last_ack_recv = (idx + 1) % dp->num_descriptors; 5200 5201 break; 5202 5203 case VIO_SUBTYPE_NACK: 5204 DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__); 5205 5206 /* 5207 * We should only get a NACK if our peer doesn't like 5208 * something about a message we have sent it. If this 5209 * happens we just release the resources associated with 5210 * the message. (We are relying on higher layers to decide 5211 * whether or not to resend. 5212 */ 5213 5214 /* limit check */ 5215 idx = ibnd_desc->hdr.desc_handle; 5216 5217 if (idx >= VSW_RING_NUM_EL) { 5218 DERR(vswp, "%s: corrupted NACK received (idx %lld)", 5219 __func__, idx); 5220 return; 5221 } 5222 5223 if ((dp = ldcp->lane_out.dringp) == NULL) { 5224 DERR(vswp, "%s: no dring found", __func__); 5225 return; 5226 } 5227 5228 priv_addr = (vsw_private_desc_t *)dp->priv_addr; 5229 5230 /* move to correct location in ring */ 5231 priv_addr += idx; 5232 5233 /* release resources associated with sent msg */ 5234 mutex_enter(&priv_addr->dstate_lock); 5235 bzero(priv_addr->datap, priv_addr->datalen); 5236 priv_addr->datalen = 0; 5237 priv_addr->dstate = VIO_DESC_FREE; 5238 mutex_exit(&priv_addr->dstate_lock); 5239 5240 break; 5241 5242 default: 5243 DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__, 5244 ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype); 5245 } 5246 5247 D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id); 5248 } 5249 5250 static void 5251 vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag) 5252 { 5253 _NOTE(ARGUNUSED(epkt)) 5254 5255 vsw_t *vswp = ldcp->ldc_vswp; 5256 uint16_t env = tag.vio_subtype_env; 5257 5258 D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id); 5259 5260 /* 5261 * Error vio_subtypes have yet to be defined. So for 5262 * the moment we can't do anything. 5263 */ 5264 D2(vswp, "%s: (%x) vio_subtype env", __func__, env); 5265 5266 D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id); 5267 } 5268 5269 /* 5270 * Switch the given ethernet frame when operating in layer 2 mode. 5271 * 5272 * vswp: pointer to the vsw instance 5273 * mp: pointer to chain of ethernet frame(s) to be switched 5274 * caller: identifies the source of this frame as: 5275 * 1. VSW_VNETPORT - a vsw port (connected to a vnet). 5276 * 2. VSW_PHYSDEV - the physical ethernet device 5277 * 3. VSW_LOCALDEV - vsw configured as a virtual interface 5278 * arg: argument provided by the caller. 5279 * 1. for VNETPORT - pointer to the corresponding vsw_port_t. 5280 * 2. for PHYSDEV - NULL 5281 * 3. for LOCALDEV - pointer to to this vsw_t(self) 5282 */ 5283 void 5284 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, 5285 vsw_port_t *arg, mac_resource_handle_t mrh) 5286 { 5287 struct ether_header *ehp; 5288 vsw_port_t *port = NULL; 5289 mblk_t *bp, *ret_m; 5290 mblk_t *nmp = NULL; 5291 vsw_port_list_t *plist = &vswp->plist; 5292 5293 D1(vswp, "%s: enter (caller %d)", __func__, caller); 5294 5295 /* 5296 * PERF: rather than breaking up the chain here, scan it 5297 * to find all mblks heading to same destination and then 5298 * pass that sub-chain to the lower transmit functions. 5299 */ 5300 5301 /* process the chain of packets */ 5302 bp = mp; 5303 while (bp) { 5304 mp = bp; 5305 bp = bp->b_next; 5306 mp->b_next = mp->b_prev = NULL; 5307 ehp = (struct ether_header *)mp->b_rptr; 5308 5309 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 5310 __func__, MBLKSIZE(mp), MBLKL(mp)); 5311 5312 READ_ENTER(&vswp->if_lockrw); 5313 if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) { 5314 /* 5315 * If destination is VSW_LOCALDEV (vsw as an eth 5316 * interface) and if the device is up & running, 5317 * send the packet up the stack on this host. 5318 * If the virtual interface is down, drop the packet. 5319 */ 5320 if (caller != VSW_LOCALDEV) { 5321 if (vswp->if_state & VSW_IF_UP) { 5322 RW_EXIT(&vswp->if_lockrw); 5323 mac_rx(vswp->if_mh, mrh, mp); 5324 } else { 5325 RW_EXIT(&vswp->if_lockrw); 5326 /* Interface down, drop pkt */ 5327 freemsg(mp); 5328 } 5329 } else { 5330 RW_EXIT(&vswp->if_lockrw); 5331 freemsg(mp); 5332 } 5333 continue; 5334 } 5335 RW_EXIT(&vswp->if_lockrw); 5336 5337 READ_ENTER(&plist->lockrw); 5338 port = vsw_lookup_fdb(vswp, ehp); 5339 if (port) { 5340 /* 5341 * Mark the port as in-use. 5342 */ 5343 mutex_enter(&port->ref_lock); 5344 port->ref_cnt++; 5345 mutex_exit(&port->ref_lock); 5346 RW_EXIT(&plist->lockrw); 5347 5348 /* 5349 * If plumbed and in promisc mode then copy msg 5350 * and send up the stack. 5351 */ 5352 READ_ENTER(&vswp->if_lockrw); 5353 if (VSW_U_P(vswp->if_state)) { 5354 RW_EXIT(&vswp->if_lockrw); 5355 nmp = copymsg(mp); 5356 if (nmp) 5357 mac_rx(vswp->if_mh, mrh, nmp); 5358 } else { 5359 RW_EXIT(&vswp->if_lockrw); 5360 } 5361 5362 /* 5363 * If the destination is in FDB, the packet 5364 * should be forwarded to the correponding 5365 * vsw_port (connected to a vnet device - 5366 * VSW_VNETPORT) 5367 */ 5368 (void) vsw_portsend(port, mp); 5369 5370 /* 5371 * Decrement use count in port and check if 5372 * should wake delete thread. 5373 */ 5374 mutex_enter(&port->ref_lock); 5375 port->ref_cnt--; 5376 if (port->ref_cnt == 0) 5377 cv_signal(&port->ref_cv); 5378 mutex_exit(&port->ref_lock); 5379 } else { 5380 RW_EXIT(&plist->lockrw); 5381 /* 5382 * Destination not in FDB. 5383 * 5384 * If the destination is broadcast or 5385 * multicast forward the packet to all 5386 * (VNETPORTs, PHYSDEV, LOCALDEV), 5387 * except the caller. 5388 */ 5389 if (IS_BROADCAST(ehp)) { 5390 D3(vswp, "%s: BROADCAST pkt", __func__); 5391 (void) vsw_forward_all(vswp, mp, 5392 caller, arg); 5393 } else if (IS_MULTICAST(ehp)) { 5394 D3(vswp, "%s: MULTICAST pkt", __func__); 5395 (void) vsw_forward_grp(vswp, mp, 5396 caller, arg); 5397 } else { 5398 /* 5399 * If the destination is unicast, and came 5400 * from either a logical network device or 5401 * the switch itself when it is plumbed, then 5402 * send it out on the physical device and also 5403 * up the stack if the logical interface is 5404 * in promiscious mode. 5405 * 5406 * NOTE: The assumption here is that if we 5407 * cannot find the destination in our fdb, its 5408 * a unicast address, and came from either a 5409 * vnet or down the stack (when plumbed) it 5410 * must be destinded for an ethernet device 5411 * outside our ldoms. 5412 */ 5413 if (caller == VSW_VNETPORT) { 5414 READ_ENTER(&vswp->if_lockrw); 5415 if (VSW_U_P(vswp->if_state)) { 5416 RW_EXIT(&vswp->if_lockrw); 5417 nmp = copymsg(mp); 5418 if (nmp) 5419 mac_rx(vswp->if_mh, 5420 mrh, nmp); 5421 } else { 5422 RW_EXIT(&vswp->if_lockrw); 5423 } 5424 if ((ret_m = vsw_tx_msg(vswp, mp)) 5425 != NULL) { 5426 DERR(vswp, "%s: drop mblks to " 5427 "phys dev", __func__); 5428 freemsg(ret_m); 5429 } 5430 5431 } else if (caller == VSW_PHYSDEV) { 5432 /* 5433 * Pkt seen because card in promisc 5434 * mode. Send up stack if plumbed in 5435 * promisc mode, else drop it. 5436 */ 5437 READ_ENTER(&vswp->if_lockrw); 5438 if (VSW_U_P(vswp->if_state)) { 5439 RW_EXIT(&vswp->if_lockrw); 5440 mac_rx(vswp->if_mh, mrh, mp); 5441 } else { 5442 RW_EXIT(&vswp->if_lockrw); 5443 freemsg(mp); 5444 } 5445 5446 } else if (caller == VSW_LOCALDEV) { 5447 /* 5448 * Pkt came down the stack, send out 5449 * over physical device. 5450 */ 5451 if ((ret_m = vsw_tx_msg(vswp, mp)) 5452 != NULL) { 5453 DERR(vswp, "%s: drop mblks to " 5454 "phys dev", __func__); 5455 freemsg(ret_m); 5456 } 5457 } 5458 } 5459 } 5460 } 5461 D1(vswp, "%s: exit\n", __func__); 5462 } 5463 5464 /* 5465 * Switch ethernet frame when in layer 3 mode (i.e. using IP 5466 * layer to do the routing). 5467 * 5468 * There is a large amount of overlap between this function and 5469 * vsw_switch_l2_frame. At some stage we need to revisit and refactor 5470 * both these functions. 5471 */ 5472 void 5473 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, 5474 vsw_port_t *arg, mac_resource_handle_t mrh) 5475 { 5476 struct ether_header *ehp; 5477 vsw_port_t *port = NULL; 5478 mblk_t *bp = NULL; 5479 vsw_port_list_t *plist = &vswp->plist; 5480 5481 D1(vswp, "%s: enter (caller %d)", __func__, caller); 5482 5483 /* 5484 * In layer 3 mode should only ever be switching packets 5485 * between IP layer and vnet devices. So make sure thats 5486 * who is invoking us. 5487 */ 5488 if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) { 5489 DERR(vswp, "%s: unexpected caller (%d)", __func__, caller); 5490 freemsgchain(mp); 5491 return; 5492 } 5493 5494 /* process the chain of packets */ 5495 bp = mp; 5496 while (bp) { 5497 mp = bp; 5498 bp = bp->b_next; 5499 mp->b_next = mp->b_prev = NULL; 5500 ehp = (struct ether_header *)mp->b_rptr; 5501 5502 D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", 5503 __func__, MBLKSIZE(mp), MBLKL(mp)); 5504 5505 READ_ENTER(&plist->lockrw); 5506 port = vsw_lookup_fdb(vswp, ehp); 5507 if (port) { 5508 /* 5509 * Mark port as in-use. 5510 */ 5511 mutex_enter(&port->ref_lock); 5512 port->ref_cnt++; 5513 mutex_exit(&port->ref_lock); 5514 RW_EXIT(&plist->lockrw); 5515 5516 D2(vswp, "%s: sending to target port", __func__); 5517 (void) vsw_portsend(port, mp); 5518 5519 /* 5520 * Finished with port so decrement ref count and 5521 * check if should wake delete thread. 5522 */ 5523 mutex_enter(&port->ref_lock); 5524 port->ref_cnt--; 5525 if (port->ref_cnt == 0) 5526 cv_signal(&port->ref_cv); 5527 mutex_exit(&port->ref_lock); 5528 } else { 5529 RW_EXIT(&plist->lockrw); 5530 /* 5531 * Destination not in FDB 5532 * 5533 * If the destination is broadcast or 5534 * multicast forward the packet to all 5535 * (VNETPORTs, PHYSDEV, LOCALDEV), 5536 * except the caller. 5537 */ 5538 if (IS_BROADCAST(ehp)) { 5539 D2(vswp, "%s: BROADCAST pkt", __func__); 5540 (void) vsw_forward_all(vswp, mp, 5541 caller, arg); 5542 } else if (IS_MULTICAST(ehp)) { 5543 D2(vswp, "%s: MULTICAST pkt", __func__); 5544 (void) vsw_forward_grp(vswp, mp, 5545 caller, arg); 5546 } else { 5547 /* 5548 * Unicast pkt from vnet that we don't have 5549 * an FDB entry for, so must be destinded for 5550 * the outside world. Attempt to send up to the 5551 * IP layer to allow it to deal with it. 5552 */ 5553 if (caller == VSW_VNETPORT) { 5554 READ_ENTER(&vswp->if_lockrw); 5555 if (vswp->if_state & VSW_IF_UP) { 5556 RW_EXIT(&vswp->if_lockrw); 5557 D2(vswp, "%s: sending up", 5558 __func__); 5559 mac_rx(vswp->if_mh, mrh, mp); 5560 } else { 5561 RW_EXIT(&vswp->if_lockrw); 5562 /* Interface down, drop pkt */ 5563 D2(vswp, "%s I/F down", 5564 __func__); 5565 freemsg(mp); 5566 } 5567 } 5568 } 5569 } 5570 } 5571 5572 D1(vswp, "%s: exit", __func__); 5573 } 5574 5575 /* 5576 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV), 5577 * except the caller (port on which frame arrived). 5578 */ 5579 static int 5580 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 5581 { 5582 vsw_port_list_t *plist = &vswp->plist; 5583 vsw_port_t *portp; 5584 mblk_t *nmp = NULL; 5585 mblk_t *ret_m = NULL; 5586 int skip_port = 0; 5587 5588 D1(vswp, "vsw_forward_all: enter\n"); 5589 5590 /* 5591 * Broadcast message from inside ldoms so send to outside 5592 * world if in either of layer 2 modes. 5593 */ 5594 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 5595 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 5596 ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { 5597 5598 nmp = dupmsg(mp); 5599 if (nmp) { 5600 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 5601 DERR(vswp, "%s: dropping pkt(s) " 5602 "consisting of %ld bytes of data for" 5603 " physical device", __func__, MBLKL(ret_m)); 5604 freemsg(ret_m); 5605 } 5606 } 5607 } 5608 5609 if (caller == VSW_VNETPORT) 5610 skip_port = 1; 5611 5612 /* 5613 * Broadcast message from other vnet (layer 2 or 3) or outside 5614 * world (layer 2 only), send up stack if plumbed. 5615 */ 5616 if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) { 5617 READ_ENTER(&vswp->if_lockrw); 5618 if (vswp->if_state & VSW_IF_UP) { 5619 RW_EXIT(&vswp->if_lockrw); 5620 nmp = copymsg(mp); 5621 if (nmp) 5622 mac_rx(vswp->if_mh, NULL, nmp); 5623 } else { 5624 RW_EXIT(&vswp->if_lockrw); 5625 } 5626 } 5627 5628 /* send it to all VNETPORTs */ 5629 READ_ENTER(&plist->lockrw); 5630 for (portp = plist->head; portp != NULL; portp = portp->p_next) { 5631 D2(vswp, "vsw_forward_all: port %d", portp->p_instance); 5632 /* 5633 * Caution ! - don't reorder these two checks as arg 5634 * will be NULL if the caller is PHYSDEV. skip_port is 5635 * only set if caller is VNETPORT. 5636 */ 5637 if ((skip_port) && (portp == arg)) 5638 continue; 5639 else { 5640 nmp = dupmsg(mp); 5641 if (nmp) { 5642 (void) vsw_portsend(portp, nmp); 5643 } else { 5644 DERR(vswp, "vsw_forward_all: nmp NULL"); 5645 } 5646 } 5647 } 5648 RW_EXIT(&plist->lockrw); 5649 5650 freemsg(mp); 5651 5652 D1(vswp, "vsw_forward_all: exit\n"); 5653 return (0); 5654 } 5655 5656 /* 5657 * Forward pkts to any devices or interfaces which have registered 5658 * an interest in them (i.e. multicast groups). 5659 */ 5660 static int 5661 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) 5662 { 5663 struct ether_header *ehp = (struct ether_header *)mp->b_rptr; 5664 mfdb_ent_t *entp = NULL; 5665 mfdb_ent_t *tpp = NULL; 5666 vsw_port_t *port; 5667 uint64_t key = 0; 5668 mblk_t *nmp = NULL; 5669 mblk_t *ret_m = NULL; 5670 boolean_t check_if = B_TRUE; 5671 5672 /* 5673 * Convert address to hash table key 5674 */ 5675 KEY_HASH(key, ehp->ether_dhost); 5676 5677 D1(vswp, "%s: key 0x%llx", __func__, key); 5678 5679 /* 5680 * If pkt came from either a vnet or down the stack (if we are 5681 * plumbed) and we are in layer 2 mode, then we send the pkt out 5682 * over the physical adapter, and then check to see if any other 5683 * vnets are interested in it. 5684 */ 5685 if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || 5686 (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && 5687 ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { 5688 nmp = dupmsg(mp); 5689 if (nmp) { 5690 if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { 5691 DERR(vswp, "%s: dropping pkt(s) " 5692 "consisting of %ld bytes of " 5693 "data for physical device", 5694 __func__, MBLKL(ret_m)); 5695 freemsg(ret_m); 5696 } 5697 } 5698 } 5699 5700 READ_ENTER(&vswp->mfdbrw); 5701 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key, 5702 (mod_hash_val_t *)&entp) != 0) { 5703 D3(vswp, "%s: no table entry found for addr 0x%llx", 5704 __func__, key); 5705 } else { 5706 /* 5707 * Send to list of devices associated with this address... 5708 */ 5709 for (tpp = entp; tpp != NULL; tpp = tpp->nextp) { 5710 5711 /* dont send to ourselves */ 5712 if ((caller == VSW_VNETPORT) && 5713 (tpp->d_addr == (void *)arg)) { 5714 port = (vsw_port_t *)tpp->d_addr; 5715 D3(vswp, "%s: not sending to ourselves" 5716 " : port %d", __func__, 5717 port->p_instance); 5718 continue; 5719 5720 } else if ((caller == VSW_LOCALDEV) && 5721 (tpp->d_type == VSW_LOCALDEV)) { 5722 D3(vswp, "%s: not sending back up stack", 5723 __func__); 5724 continue; 5725 } 5726 5727 if (tpp->d_type == VSW_VNETPORT) { 5728 port = (vsw_port_t *)tpp->d_addr; 5729 D3(vswp, "%s: sending to port %ld for " 5730 " addr 0x%llx", __func__, 5731 port->p_instance, key); 5732 5733 nmp = dupmsg(mp); 5734 if (nmp) 5735 (void) vsw_portsend(port, nmp); 5736 } else { 5737 if (vswp->if_state & VSW_IF_UP) { 5738 nmp = copymsg(mp); 5739 if (nmp) 5740 mac_rx(vswp->if_mh, NULL, nmp); 5741 check_if = B_FALSE; 5742 D3(vswp, "%s: sending up stack" 5743 " for addr 0x%llx", __func__, 5744 key); 5745 } 5746 } 5747 } 5748 } 5749 5750 RW_EXIT(&vswp->mfdbrw); 5751 5752 /* 5753 * If the pkt came from either a vnet or from physical device, 5754 * and if we havent already sent the pkt up the stack then we 5755 * check now if we can/should (i.e. the interface is plumbed 5756 * and in promisc mode). 5757 */ 5758 if ((check_if) && 5759 ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) { 5760 READ_ENTER(&vswp->if_lockrw); 5761 if (VSW_U_P(vswp->if_state)) { 5762 RW_EXIT(&vswp->if_lockrw); 5763 D3(vswp, "%s: (caller %d) finally sending up stack" 5764 " for addr 0x%llx", __func__, caller, key); 5765 nmp = copymsg(mp); 5766 if (nmp) 5767 mac_rx(vswp->if_mh, NULL, nmp); 5768 } else { 5769 RW_EXIT(&vswp->if_lockrw); 5770 } 5771 } 5772 5773 freemsg(mp); 5774 5775 D1(vswp, "%s: exit", __func__); 5776 5777 return (0); 5778 } 5779 5780 /* transmit the packet over the given port */ 5781 static int 5782 vsw_portsend(vsw_port_t *port, mblk_t *mp) 5783 { 5784 vsw_ldc_list_t *ldcl = &port->p_ldclist; 5785 vsw_ldc_t *ldcp; 5786 int status = 0; 5787 5788 5789 READ_ENTER(&ldcl->lockrw); 5790 /* 5791 * Note for now, we have a single channel. 5792 */ 5793 ldcp = ldcl->head; 5794 if (ldcp == NULL) { 5795 DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n"); 5796 freemsg(mp); 5797 RW_EXIT(&ldcl->lockrw); 5798 return (1); 5799 } 5800 5801 /* 5802 * Send the message out using the appropriate 5803 * transmit function which will free mblock when it 5804 * is finished with it. 5805 */ 5806 mutex_enter(&port->tx_lock); 5807 if (port->transmit != NULL) 5808 status = (*port->transmit)(ldcp, mp); 5809 else { 5810 freemsg(mp); 5811 } 5812 mutex_exit(&port->tx_lock); 5813 5814 RW_EXIT(&ldcl->lockrw); 5815 5816 return (status); 5817 } 5818 5819 /* 5820 * Send packet out via descriptor ring to a logical device. 5821 */ 5822 static int 5823 vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) 5824 { 5825 vio_dring_msg_t dring_pkt; 5826 dring_info_t *dp = NULL; 5827 vsw_private_desc_t *priv_desc = NULL; 5828 vnet_public_desc_t *pub = NULL; 5829 vsw_t *vswp = ldcp->ldc_vswp; 5830 mblk_t *bp; 5831 size_t n, size; 5832 caddr_t bufp; 5833 int idx; 5834 int status = LDC_TX_SUCCESS; 5835 5836 D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id); 5837 5838 /* TODO: make test a macro */ 5839 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 5840 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 5841 DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping " 5842 "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status, 5843 ldcp->lane_out.lstate); 5844 freemsg(mp); 5845 return (LDC_TX_FAILURE); 5846 } 5847 5848 /* 5849 * Note - using first ring only, this may change 5850 * in the future. 5851 */ 5852 if ((dp = ldcp->lane_out.dringp) == NULL) { 5853 DERR(vswp, "%s(%lld): no dring for outbound lane on" 5854 " channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id); 5855 freemsg(mp); 5856 return (LDC_TX_FAILURE); 5857 } 5858 5859 size = msgsize(mp); 5860 if (size > (size_t)ETHERMAX) { 5861 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 5862 ldcp->ldc_id, size); 5863 freemsg(mp); 5864 return (LDC_TX_FAILURE); 5865 } 5866 5867 /* 5868 * Find a free descriptor 5869 * 5870 * Note: for the moment we are assuming that we will only 5871 * have one dring going from the switch to each of its 5872 * peers. This may change in the future. 5873 */ 5874 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 5875 D2(vswp, "%s(%lld): no descriptor available for ring " 5876 "at 0x%llx", __func__, ldcp->ldc_id, dp); 5877 5878 /* nothing more we can do */ 5879 status = LDC_TX_NORESOURCES; 5880 goto vsw_dringsend_free_exit; 5881 } else { 5882 D2(vswp, "%s(%lld): free private descriptor found at pos " 5883 "%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx, 5884 priv_desc); 5885 } 5886 5887 /* copy data into the descriptor */ 5888 bufp = priv_desc->datap; 5889 bufp += VNET_IPALIGN; 5890 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 5891 n = MBLKL(bp); 5892 bcopy(bp->b_rptr, bufp, n); 5893 bufp += n; 5894 } 5895 5896 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 5897 5898 pub = priv_desc->descp; 5899 pub->nbytes = priv_desc->datalen; 5900 5901 mutex_enter(&priv_desc->dstate_lock); 5902 pub->hdr.dstate = VIO_DESC_READY; 5903 mutex_exit(&priv_desc->dstate_lock); 5904 5905 /* 5906 * Determine whether or not we need to send a message to our 5907 * peer prompting them to read our newly updated descriptor(s). 5908 */ 5909 mutex_enter(&dp->restart_lock); 5910 if (dp->restart_reqd) { 5911 dp->restart_reqd = B_FALSE; 5912 mutex_exit(&dp->restart_lock); 5913 5914 /* 5915 * Send a vio_dring_msg to peer to prompt them to read 5916 * the updated descriptor ring. 5917 */ 5918 dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; 5919 dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; 5920 dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; 5921 dring_pkt.tag.vio_sid = ldcp->local_session; 5922 5923 /* Note - for now using first ring */ 5924 dring_pkt.dring_ident = dp->ident; 5925 5926 mutex_enter(&ldcp->lane_out.seq_lock); 5927 dring_pkt.seq_num = ldcp->lane_out.seq_num++; 5928 mutex_exit(&ldcp->lane_out.seq_lock); 5929 5930 /* 5931 * If last_ack_recv is -1 then we know we've not 5932 * received any ack's yet, so this must be the first 5933 * msg sent, so set the start to the begining of the ring. 5934 */ 5935 mutex_enter(&dp->dlock); 5936 if (dp->last_ack_recv == -1) { 5937 dring_pkt.start_idx = 0; 5938 } else { 5939 dring_pkt.start_idx = (dp->last_ack_recv + 1) % 5940 dp->num_descriptors; 5941 } 5942 dring_pkt.end_idx = -1; 5943 mutex_exit(&dp->dlock); 5944 5945 D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__, 5946 ldcp->ldc_id, dp, dring_pkt.dring_ident); 5947 D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", 5948 __func__, ldcp->ldc_id, dring_pkt.start_idx, 5949 dring_pkt.end_idx, dring_pkt.seq_num); 5950 5951 vsw_send_msg(ldcp, (void *)&dring_pkt, 5952 sizeof (vio_dring_msg_t)); 5953 } else { 5954 mutex_exit(&dp->restart_lock); 5955 D2(vswp, "%s(%lld): updating descp %d", __func__, 5956 ldcp->ldc_id, idx); 5957 } 5958 5959 vsw_dringsend_free_exit: 5960 5961 /* free the message block */ 5962 freemsg(mp); 5963 5964 D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id); 5965 return (status); 5966 } 5967 5968 /* 5969 * Send an in-band descriptor message over ldc. 5970 */ 5971 static int 5972 vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp) 5973 { 5974 vsw_t *vswp = ldcp->ldc_vswp; 5975 vio_ibnd_desc_t ibnd_msg; 5976 vsw_private_desc_t *priv_desc = NULL; 5977 dring_info_t *dp = NULL; 5978 size_t n, size = 0; 5979 caddr_t bufp; 5980 mblk_t *bp; 5981 int idx, i; 5982 int status = LDC_TX_SUCCESS; 5983 static int warn_msg = 1; 5984 5985 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 5986 5987 ASSERT(mp != NULL); 5988 5989 if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) || 5990 (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) { 5991 DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt", 5992 __func__, ldcp->ldc_id, ldcp->ldc_status, 5993 ldcp->lane_out.lstate); 5994 freemsg(mp); 5995 return (LDC_TX_FAILURE); 5996 } 5997 5998 /* 5999 * only expect single dring to exist, which we use 6000 * as an internal buffer, rather than a transfer channel. 6001 */ 6002 if ((dp = ldcp->lane_out.dringp) == NULL) { 6003 DERR(vswp, "%s(%lld): no dring for outbound lane", 6004 __func__, ldcp->ldc_id); 6005 DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", 6006 __func__, ldcp->ldc_id, ldcp->ldc_status, 6007 ldcp->lane_out.lstate); 6008 freemsg(mp); 6009 return (LDC_TX_FAILURE); 6010 } 6011 6012 size = msgsize(mp); 6013 if (size > (size_t)ETHERMAX) { 6014 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, 6015 ldcp->ldc_id, size); 6016 freemsg(mp); 6017 return (LDC_TX_FAILURE); 6018 } 6019 6020 /* 6021 * Find a free descriptor in our buffer ring 6022 */ 6023 if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { 6024 if (warn_msg) { 6025 DERR(vswp, "%s(%lld): no descriptor available for ring " 6026 "at 0x%llx", __func__, ldcp->ldc_id, dp); 6027 warn_msg = 0; 6028 } 6029 6030 /* nothing more we can do */ 6031 status = LDC_TX_NORESOURCES; 6032 goto vsw_descrsend_free_exit; 6033 } else { 6034 D2(vswp, "%s(%lld): free private descriptor found at pos " 6035 "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, 6036 priv_desc); 6037 warn_msg = 1; 6038 } 6039 6040 /* copy data into the descriptor */ 6041 bufp = priv_desc->datap; 6042 for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { 6043 n = MBLKL(bp); 6044 bcopy(bp->b_rptr, bufp, n); 6045 bufp += n; 6046 } 6047 6048 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; 6049 6050 /* create and send the in-band descp msg */ 6051 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA; 6052 ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO; 6053 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA; 6054 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session; 6055 6056 mutex_enter(&ldcp->lane_out.seq_lock); 6057 ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++; 6058 mutex_exit(&ldcp->lane_out.seq_lock); 6059 6060 /* 6061 * Copy the mem cookies describing the data from the 6062 * private region of the descriptor ring into the inband 6063 * descriptor. 6064 */ 6065 for (i = 0; i < priv_desc->ncookies; i++) { 6066 bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i], 6067 sizeof (ldc_mem_cookie_t)); 6068 } 6069 6070 ibnd_msg.hdr.desc_handle = idx; 6071 ibnd_msg.ncookies = priv_desc->ncookies; 6072 ibnd_msg.nbytes = size; 6073 6074 vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t)); 6075 6076 vsw_descrsend_free_exit: 6077 6078 /* free the allocated message blocks */ 6079 freemsg(mp); 6080 6081 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 6082 return (status); 6083 } 6084 6085 static void 6086 vsw_send_ver(vsw_ldc_t *ldcp) 6087 { 6088 vsw_t *vswp = ldcp->ldc_vswp; 6089 lane_t *lp = &ldcp->lane_out; 6090 vio_ver_msg_t ver_msg; 6091 6092 D1(vswp, "%s enter", __func__); 6093 6094 ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6095 ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6096 ver_msg.tag.vio_subtype_env = VIO_VER_INFO; 6097 ver_msg.tag.vio_sid = ldcp->local_session; 6098 6099 ver_msg.ver_major = vsw_versions[0].ver_major; 6100 ver_msg.ver_minor = vsw_versions[0].ver_minor; 6101 ver_msg.dev_class = VDEV_NETWORK_SWITCH; 6102 6103 lp->lstate |= VSW_VER_INFO_SENT; 6104 lp->ver_major = ver_msg.ver_major; 6105 lp->ver_minor = ver_msg.ver_minor; 6106 6107 DUMP_TAG(ver_msg.tag); 6108 6109 vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t)); 6110 6111 D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id); 6112 } 6113 6114 static void 6115 vsw_send_attr(vsw_ldc_t *ldcp) 6116 { 6117 vsw_t *vswp = ldcp->ldc_vswp; 6118 lane_t *lp = &ldcp->lane_out; 6119 vnet_attr_msg_t attr_msg; 6120 6121 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 6122 6123 /* 6124 * Subtype is set to INFO by default 6125 */ 6126 attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6127 attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6128 attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO; 6129 attr_msg.tag.vio_sid = ldcp->local_session; 6130 6131 /* payload copied from default settings for lane */ 6132 attr_msg.mtu = lp->mtu; 6133 attr_msg.addr_type = lp->addr_type; 6134 attr_msg.xfer_mode = lp->xfer_mode; 6135 attr_msg.ack_freq = lp->xfer_mode; 6136 6137 READ_ENTER(&vswp->if_lockrw); 6138 bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL); 6139 RW_EXIT(&vswp->if_lockrw); 6140 6141 ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT; 6142 6143 DUMP_TAG(attr_msg.tag); 6144 6145 vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t)); 6146 6147 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 6148 } 6149 6150 /* 6151 * Create dring info msg (which also results in the creation of 6152 * a dring). 6153 */ 6154 static vio_dring_reg_msg_t * 6155 vsw_create_dring_info_pkt(vsw_ldc_t *ldcp) 6156 { 6157 vio_dring_reg_msg_t *mp; 6158 dring_info_t *dp; 6159 vsw_t *vswp = ldcp->ldc_vswp; 6160 6161 D1(vswp, "vsw_create_dring_info_pkt enter\n"); 6162 6163 /* 6164 * If we can't create a dring, obviously no point sending 6165 * a message. 6166 */ 6167 if ((dp = vsw_create_dring(ldcp)) == NULL) 6168 return (NULL); 6169 6170 mp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP); 6171 6172 mp->tag.vio_msgtype = VIO_TYPE_CTRL; 6173 mp->tag.vio_subtype = VIO_SUBTYPE_INFO; 6174 mp->tag.vio_subtype_env = VIO_DRING_REG; 6175 mp->tag.vio_sid = ldcp->local_session; 6176 6177 /* payload */ 6178 mp->num_descriptors = dp->num_descriptors; 6179 mp->descriptor_size = dp->descriptor_size; 6180 mp->options = dp->options; 6181 mp->ncookies = dp->ncookies; 6182 bcopy(&dp->cookie[0], &mp->cookie[0], sizeof (ldc_mem_cookie_t)); 6183 6184 mp->dring_ident = 0; 6185 6186 D1(vswp, "vsw_create_dring_info_pkt exit\n"); 6187 6188 return (mp); 6189 } 6190 6191 static void 6192 vsw_send_dring_info(vsw_ldc_t *ldcp) 6193 { 6194 vio_dring_reg_msg_t *dring_msg; 6195 vsw_t *vswp = ldcp->ldc_vswp; 6196 6197 D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id); 6198 6199 dring_msg = vsw_create_dring_info_pkt(ldcp); 6200 if (dring_msg == NULL) { 6201 cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg"); 6202 return; 6203 } 6204 6205 ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT; 6206 6207 DUMP_TAG_PTR((vio_msg_tag_t *)dring_msg); 6208 6209 vsw_send_msg(ldcp, dring_msg, 6210 sizeof (vio_dring_reg_msg_t)); 6211 6212 kmem_free(dring_msg, sizeof (vio_dring_reg_msg_t)); 6213 6214 D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id); 6215 } 6216 6217 static void 6218 vsw_send_rdx(vsw_ldc_t *ldcp) 6219 { 6220 vsw_t *vswp = ldcp->ldc_vswp; 6221 vio_rdx_msg_t rdx_msg; 6222 6223 D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id); 6224 6225 rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL; 6226 rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO; 6227 rdx_msg.tag.vio_subtype_env = VIO_RDX; 6228 rdx_msg.tag.vio_sid = ldcp->local_session; 6229 6230 ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT; 6231 6232 DUMP_TAG(rdx_msg.tag); 6233 6234 vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t)); 6235 6236 D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id); 6237 } 6238 6239 /* 6240 * Generic routine to send message out over ldc channel. 6241 */ 6242 static void 6243 vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size) 6244 { 6245 int rv; 6246 size_t msglen = size; 6247 vio_msg_tag_t *tag = (vio_msg_tag_t *)msgp; 6248 vsw_t *vswp = ldcp->ldc_vswp; 6249 6250 D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes", 6251 ldcp->ldc_id, size); 6252 6253 D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype); 6254 D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype); 6255 D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env); 6256 6257 mutex_enter(&ldcp->ldc_txlock); 6258 do { 6259 msglen = size; 6260 rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen); 6261 } while (rv == EWOULDBLOCK && --vsw_wretries > 0); 6262 6263 mutex_exit(&ldcp->ldc_txlock); 6264 6265 if ((rv != 0) || (msglen != size)) { 6266 DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) " 6267 "rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id, 6268 rv, size, msglen); 6269 } 6270 6271 D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes", 6272 ldcp->ldc_id, msglen); 6273 } 6274 6275 /* 6276 * Add an entry into FDB, for the given mac address and port_id. 6277 * Returns 0 on success, 1 on failure. 6278 * 6279 * Lock protecting FDB must be held by calling process. 6280 */ 6281 static int 6282 vsw_add_fdb(vsw_t *vswp, vsw_port_t *port) 6283 { 6284 uint64_t addr = 0; 6285 6286 D1(vswp, "%s: enter", __func__); 6287 6288 KEY_HASH(addr, port->p_macaddr); 6289 6290 D2(vswp, "%s: key = 0x%llx", __func__, addr); 6291 6292 /* 6293 * Note: duplicate keys will be rejected by mod_hash. 6294 */ 6295 if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)addr, 6296 (mod_hash_val_t)port) != 0) { 6297 DERR(vswp, "%s: unable to add entry into fdb.", __func__); 6298 return (1); 6299 } 6300 6301 D1(vswp, "%s: exit", __func__); 6302 return (0); 6303 } 6304 6305 /* 6306 * Remove an entry from FDB. 6307 * Returns 0 on success, 1 on failure. 6308 */ 6309 static int 6310 vsw_del_fdb(vsw_t *vswp, vsw_port_t *port) 6311 { 6312 uint64_t addr = 0; 6313 6314 D1(vswp, "%s: enter", __func__); 6315 6316 KEY_HASH(addr, port->p_macaddr); 6317 6318 D2(vswp, "%s: key = 0x%llx", __func__, addr); 6319 6320 (void) mod_hash_destroy(vswp->fdb, (mod_hash_val_t)addr); 6321 6322 D1(vswp, "%s: enter", __func__); 6323 6324 return (0); 6325 } 6326 6327 /* 6328 * Search fdb for a given mac address. 6329 * Returns pointer to the entry if found, else returns NULL. 6330 */ 6331 static vsw_port_t * 6332 vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp) 6333 { 6334 uint64_t key = 0; 6335 vsw_port_t *port = NULL; 6336 6337 D1(vswp, "%s: enter", __func__); 6338 6339 KEY_HASH(key, ehp->ether_dhost); 6340 6341 D2(vswp, "%s: key = 0x%llx", __func__, key); 6342 6343 if (mod_hash_find(vswp->fdb, (mod_hash_key_t)key, 6344 (mod_hash_val_t *)&port) != 0) { 6345 return (NULL); 6346 } 6347 6348 D1(vswp, "%s: exit", __func__); 6349 6350 return (port); 6351 } 6352 6353 /* 6354 * Add or remove multicast address(es). 6355 * 6356 * Returns 0 on success, 1 on failure. 6357 */ 6358 static int 6359 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) 6360 { 6361 mcst_addr_t *mcst_p = NULL; 6362 vsw_t *vswp = port->p_vswp; 6363 uint64_t addr = 0x0; 6364 int i, ret; 6365 6366 D1(vswp, "%s: enter", __func__); 6367 6368 D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count); 6369 6370 if (vswp->mh == NULL) 6371 return (1); 6372 6373 for (i = 0; i < mcst_pkt->count; i++) { 6374 /* 6375 * Convert address into form that can be used 6376 * as hash table key. 6377 */ 6378 KEY_HASH(addr, mcst_pkt->mca[i]); 6379 6380 /* 6381 * Add or delete the specified address/port combination. 6382 */ 6383 if (mcst_pkt->set == 0x1) { 6384 D3(vswp, "%s: adding multicast address 0x%llx for " 6385 "port %ld", __func__, addr, port->p_instance); 6386 if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 6387 /* 6388 * Update the list of multicast 6389 * addresses contained within the 6390 * port structure to include this new 6391 * one. 6392 */ 6393 mcst_p = kmem_alloc(sizeof (mcst_addr_t), 6394 KM_NOSLEEP); 6395 if (mcst_p == NULL) { 6396 DERR(vswp, "%s: unable to alloc mem", 6397 __func__); 6398 return (1); 6399 } 6400 6401 mcst_p->nextp = NULL; 6402 mcst_p->addr = addr; 6403 6404 mutex_enter(&port->mca_lock); 6405 mcst_p->nextp = port->mcap; 6406 port->mcap = mcst_p; 6407 mutex_exit(&port->mca_lock); 6408 6409 /* 6410 * Program the address into HW. If the addr 6411 * has already been programmed then the MAC 6412 * just increments a ref counter (which is 6413 * used when the address is being deleted) 6414 */ 6415 ret = mac_multicst_add(vswp->mh, 6416 (uchar_t *)&mcst_pkt->mca[i]); 6417 if (ret) { 6418 cmn_err(CE_WARN, "!unable to add " 6419 "multicast address"); 6420 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 6421 addr, port); 6422 vsw_del_addr(VSW_VNETPORT, port, addr); 6423 return (ret); 6424 } 6425 6426 } else { 6427 DERR(vswp, "%s: error adding multicast " 6428 "address 0x%llx for port %ld", 6429 __func__, addr, port->p_instance); 6430 return (1); 6431 } 6432 } else { 6433 /* 6434 * Delete an entry from the multicast hash 6435 * table and update the address list 6436 * appropriately. 6437 */ 6438 if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) { 6439 D3(vswp, "%s: deleting multicast address " 6440 "0x%llx for port %ld", __func__, addr, 6441 port->p_instance); 6442 6443 vsw_del_addr(VSW_VNETPORT, port, addr); 6444 6445 /* 6446 * Remove the address from HW. The address 6447 * will actually only be removed once the ref 6448 * count within the MAC layer has dropped to 6449 * zero. I.e. we can safely call this fn even 6450 * if other ports are interested in this 6451 * address. 6452 */ 6453 (void) mac_multicst_remove(vswp->mh, 6454 (uchar_t *)&mcst_pkt->mca[i]); 6455 6456 } else { 6457 DERR(vswp, "%s: error deleting multicast " 6458 "addr 0x%llx for port %ld", 6459 __func__, addr, port->p_instance); 6460 return (1); 6461 } 6462 } 6463 } 6464 D1(vswp, "%s: exit", __func__); 6465 return (0); 6466 } 6467 6468 /* 6469 * Add a new multicast entry. 6470 * 6471 * Search hash table based on address. If match found then 6472 * update associated val (which is chain of ports), otherwise 6473 * create new key/val (addr/port) pair and insert into table. 6474 */ 6475 static int 6476 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 6477 { 6478 int dup = 0; 6479 int rv = 0; 6480 mfdb_ent_t *ment = NULL; 6481 mfdb_ent_t *tmp_ent = NULL; 6482 mfdb_ent_t *new_ent = NULL; 6483 void *tgt = NULL; 6484 6485 if (devtype == VSW_VNETPORT) { 6486 /* 6487 * Being invoked from a vnet. 6488 */ 6489 ASSERT(arg != NULL); 6490 tgt = arg; 6491 D2(NULL, "%s: port %d : address 0x%llx", __func__, 6492 ((vsw_port_t *)arg)->p_instance, addr); 6493 } else { 6494 /* 6495 * We are being invoked via the m_multicst mac entry 6496 * point. 6497 */ 6498 D2(NULL, "%s: address 0x%llx", __func__, addr); 6499 tgt = (void *)vswp; 6500 } 6501 6502 WRITE_ENTER(&vswp->mfdbrw); 6503 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 6504 (mod_hash_val_t *)&ment) != 0) { 6505 6506 /* address not currently in table */ 6507 ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 6508 ment->d_addr = (void *)tgt; 6509 ment->d_type = devtype; 6510 ment->nextp = NULL; 6511 6512 if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr, 6513 (mod_hash_val_t)ment) != 0) { 6514 DERR(vswp, "%s: hash table insertion failed", __func__); 6515 kmem_free(ment, sizeof (mfdb_ent_t)); 6516 rv = 1; 6517 } else { 6518 D2(vswp, "%s: added initial entry for 0x%llx to " 6519 "table", __func__, addr); 6520 } 6521 } else { 6522 /* 6523 * Address in table. Check to see if specified port 6524 * is already associated with the address. If not add 6525 * it now. 6526 */ 6527 tmp_ent = ment; 6528 while (tmp_ent != NULL) { 6529 if (tmp_ent->d_addr == (void *)tgt) { 6530 if (devtype == VSW_VNETPORT) { 6531 DERR(vswp, "%s: duplicate port entry " 6532 "found for portid %ld and key " 6533 "0x%llx", __func__, 6534 ((vsw_port_t *)arg)->p_instance, 6535 addr); 6536 } else { 6537 DERR(vswp, "%s: duplicate entry found" 6538 "for key 0x%llx", 6539 __func__, addr); 6540 } 6541 rv = 1; 6542 dup = 1; 6543 break; 6544 } 6545 tmp_ent = tmp_ent->nextp; 6546 } 6547 6548 /* 6549 * Port not on list so add it to end now. 6550 */ 6551 if (0 == dup) { 6552 D2(vswp, "%s: added entry for 0x%llx to table", 6553 __func__, addr); 6554 new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP); 6555 new_ent->d_addr = (void *)tgt; 6556 new_ent->d_type = devtype; 6557 new_ent->nextp = NULL; 6558 6559 tmp_ent = ment; 6560 while (tmp_ent->nextp != NULL) 6561 tmp_ent = tmp_ent->nextp; 6562 6563 tmp_ent->nextp = new_ent; 6564 } 6565 } 6566 6567 RW_EXIT(&vswp->mfdbrw); 6568 return (rv); 6569 } 6570 6571 /* 6572 * Remove a multicast entry from the hashtable. 6573 * 6574 * Search hash table based on address. If match found, scan 6575 * list of ports associated with address. If specified port 6576 * found remove it from list. 6577 */ 6578 static int 6579 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg) 6580 { 6581 mfdb_ent_t *ment = NULL; 6582 mfdb_ent_t *curr_p, *prev_p; 6583 void *tgt = NULL; 6584 6585 D1(vswp, "%s: enter", __func__); 6586 6587 if (devtype == VSW_VNETPORT) { 6588 tgt = (vsw_port_t *)arg; 6589 D2(vswp, "%s: removing port %d from mFDB for address" 6590 " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, 6591 addr); 6592 } else { 6593 D2(vswp, "%s: removing entry", __func__); 6594 tgt = (void *)vswp; 6595 } 6596 6597 WRITE_ENTER(&vswp->mfdbrw); 6598 if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr, 6599 (mod_hash_val_t *)&ment) != 0) { 6600 D2(vswp, "%s: address 0x%llx not in table", __func__, addr); 6601 RW_EXIT(&vswp->mfdbrw); 6602 return (1); 6603 } 6604 6605 prev_p = curr_p = ment; 6606 6607 while (curr_p != NULL) { 6608 if (curr_p->d_addr == (void *)tgt) { 6609 if (devtype == VSW_VNETPORT) { 6610 D2(vswp, "%s: port %d found", __func__, 6611 ((vsw_port_t *)tgt)->p_instance); 6612 } else { 6613 D2(vswp, "%s: instance found", __func__); 6614 } 6615 6616 if (prev_p == curr_p) { 6617 /* 6618 * head of list, if no other element is in 6619 * list then destroy this entry, otherwise 6620 * just replace it with updated value. 6621 */ 6622 ment = curr_p->nextp; 6623 kmem_free(curr_p, sizeof (mfdb_ent_t)); 6624 if (ment == NULL) { 6625 (void) mod_hash_destroy(vswp->mfdb, 6626 (mod_hash_val_t)addr); 6627 } else { 6628 (void) mod_hash_replace(vswp->mfdb, 6629 (mod_hash_key_t)addr, 6630 (mod_hash_val_t)ment); 6631 } 6632 } else { 6633 /* 6634 * Not head of list, no need to do 6635 * replacement, just adjust list pointers. 6636 */ 6637 prev_p->nextp = curr_p->nextp; 6638 kmem_free(curr_p, sizeof (mfdb_ent_t)); 6639 } 6640 break; 6641 } 6642 6643 prev_p = curr_p; 6644 curr_p = curr_p->nextp; 6645 } 6646 6647 RW_EXIT(&vswp->mfdbrw); 6648 6649 D1(vswp, "%s: exit", __func__); 6650 6651 return (0); 6652 } 6653 6654 /* 6655 * Port is being deleted, but has registered an interest in one 6656 * or more multicast groups. Using the list of addresses maintained 6657 * within the port structure find the appropriate entry in the hash 6658 * table and remove this port from the list of interested ports. 6659 */ 6660 static void 6661 vsw_del_mcst_port(vsw_port_t *port) 6662 { 6663 mcst_addr_t *mcst_p = NULL; 6664 vsw_t *vswp = port->p_vswp; 6665 6666 D1(vswp, "%s: enter", __func__); 6667 6668 mutex_enter(&port->mca_lock); 6669 while (port->mcap != NULL) { 6670 (void) vsw_del_mcst(vswp, VSW_VNETPORT, 6671 port->mcap->addr, port); 6672 6673 mcst_p = port->mcap->nextp; 6674 kmem_free(port->mcap, sizeof (mcst_addr_t)); 6675 port->mcap = mcst_p; 6676 } 6677 mutex_exit(&port->mca_lock); 6678 6679 D1(vswp, "%s: exit", __func__); 6680 } 6681 6682 /* 6683 * This vsw instance is detaching, but has registered an interest in one 6684 * or more multicast groups. Using the list of addresses maintained 6685 * within the vsw structure find the appropriate entry in the hash 6686 * table and remove this instance from the list of interested ports. 6687 */ 6688 static void 6689 vsw_del_mcst_vsw(vsw_t *vswp) 6690 { 6691 mcst_addr_t *next_p = NULL; 6692 6693 D1(vswp, "%s: enter", __func__); 6694 6695 mutex_enter(&vswp->mca_lock); 6696 6697 while (vswp->mcap != NULL) { 6698 DERR(vswp, "%s: deleting addr 0x%llx", 6699 __func__, vswp->mcap->addr); 6700 (void) vsw_del_mcst(vswp, VSW_LOCALDEV, 6701 vswp->mcap->addr, NULL); 6702 6703 next_p = vswp->mcap->nextp; 6704 kmem_free(vswp->mcap, sizeof (mcst_addr_t)); 6705 vswp->mcap = next_p; 6706 } 6707 6708 vswp->mcap = NULL; 6709 mutex_exit(&vswp->mca_lock); 6710 6711 D1(vswp, "%s: exit", __func__); 6712 } 6713 6714 6715 /* 6716 * Remove the specified address from the list of address maintained 6717 * in this port node. 6718 */ 6719 static void 6720 vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr) 6721 { 6722 vsw_t *vswp = NULL; 6723 vsw_port_t *port = NULL; 6724 mcst_addr_t *prev_p = NULL; 6725 mcst_addr_t *curr_p = NULL; 6726 6727 D1(NULL, "%s: enter : devtype %d : addr 0x%llx", 6728 __func__, devtype, addr); 6729 6730 if (devtype == VSW_VNETPORT) { 6731 port = (vsw_port_t *)arg; 6732 mutex_enter(&port->mca_lock); 6733 prev_p = curr_p = port->mcap; 6734 } else { 6735 vswp = (vsw_t *)arg; 6736 mutex_enter(&vswp->mca_lock); 6737 prev_p = curr_p = vswp->mcap; 6738 } 6739 6740 while (curr_p != NULL) { 6741 if (curr_p->addr == addr) { 6742 D2(NULL, "%s: address found", __func__); 6743 /* match found */ 6744 if (prev_p == curr_p) { 6745 /* list head */ 6746 if (devtype == VSW_VNETPORT) 6747 port->mcap = curr_p->nextp; 6748 else 6749 vswp->mcap = curr_p->nextp; 6750 } else { 6751 prev_p->nextp = curr_p->nextp; 6752 } 6753 kmem_free(curr_p, sizeof (mcst_addr_t)); 6754 break; 6755 } else { 6756 prev_p = curr_p; 6757 curr_p = curr_p->nextp; 6758 } 6759 } 6760 6761 if (devtype == VSW_VNETPORT) 6762 mutex_exit(&port->mca_lock); 6763 else 6764 mutex_exit(&vswp->mca_lock); 6765 6766 D1(NULL, "%s: exit", __func__); 6767 } 6768 6769 /* 6770 * Creates a descriptor ring (dring) and links it into the 6771 * link of outbound drings for this channel. 6772 * 6773 * Returns NULL if creation failed. 6774 */ 6775 static dring_info_t * 6776 vsw_create_dring(vsw_ldc_t *ldcp) 6777 { 6778 vsw_private_desc_t *priv_addr = NULL; 6779 vsw_t *vswp = ldcp->ldc_vswp; 6780 ldc_mem_info_t minfo; 6781 dring_info_t *dp, *tp; 6782 int i; 6783 6784 dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 6785 6786 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 6787 6788 /* create public section of ring */ 6789 if ((ldc_mem_dring_create(VSW_RING_NUM_EL, 6790 VSW_PUB_SIZE, &dp->handle)) != 0) { 6791 6792 DERR(vswp, "vsw_create_dring(%lld): ldc dring create " 6793 "failed", ldcp->ldc_id); 6794 goto create_fail_exit; 6795 } 6796 6797 ASSERT(dp->handle != NULL); 6798 6799 /* 6800 * Get the base address of the public section of the ring. 6801 */ 6802 if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) { 6803 DERR(vswp, "vsw_create_dring(%lld): dring info failed\n", 6804 ldcp->ldc_id); 6805 goto dring_fail_exit; 6806 } else { 6807 ASSERT(minfo.vaddr != 0); 6808 dp->pub_addr = minfo.vaddr; 6809 } 6810 6811 dp->num_descriptors = VSW_RING_NUM_EL; 6812 dp->descriptor_size = VSW_PUB_SIZE; 6813 dp->options = VIO_TX_DRING; 6814 dp->ncookies = 1; /* guaranteed by ldc */ 6815 6816 /* 6817 * create private portion of ring 6818 */ 6819 dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc( 6820 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP); 6821 6822 if (vsw_setup_ring(ldcp, dp)) { 6823 DERR(vswp, "%s: unable to setup ring", __func__); 6824 goto dring_fail_exit; 6825 } 6826 6827 /* haven't used any descriptors yet */ 6828 dp->end_idx = 0; 6829 dp->last_ack_recv = -1; 6830 6831 /* bind dring to the channel */ 6832 if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle, 6833 LDC_SHADOW_MAP, LDC_MEM_RW, 6834 &dp->cookie[0], &dp->ncookies)) != 0) { 6835 DERR(vswp, "vsw_create_dring: unable to bind to channel " 6836 "%lld", ldcp->ldc_id); 6837 goto dring_fail_exit; 6838 } 6839 6840 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 6841 dp->restart_reqd = B_TRUE; 6842 6843 /* 6844 * Only ever create rings for outgoing lane. Link it onto 6845 * end of list. 6846 */ 6847 if (ldcp->lane_out.dringp == NULL) { 6848 D2(vswp, "vsw_create_dring: adding first outbound ring"); 6849 ldcp->lane_out.dringp = dp; 6850 } else { 6851 tp = ldcp->lane_out.dringp; 6852 while (tp->next != NULL) 6853 tp = tp->next; 6854 6855 tp->next = dp; 6856 } 6857 6858 return (dp); 6859 6860 dring_fail_exit: 6861 (void) ldc_mem_dring_destroy(dp->handle); 6862 6863 create_fail_exit: 6864 if (dp->priv_addr != NULL) { 6865 priv_addr = dp->priv_addr; 6866 for (i = 0; i < VSW_RING_NUM_EL; i++) { 6867 if (priv_addr->memhandle != NULL) 6868 (void) ldc_mem_free_handle( 6869 priv_addr->memhandle); 6870 priv_addr++; 6871 } 6872 kmem_free(dp->priv_addr, 6873 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 6874 } 6875 mutex_destroy(&dp->dlock); 6876 6877 kmem_free(dp, sizeof (dring_info_t)); 6878 return (NULL); 6879 } 6880 6881 /* 6882 * Create a ring consisting of just a private portion and link 6883 * it into the list of rings for the outbound lane. 6884 * 6885 * These type of rings are used primarily for temporary data 6886 * storage (i.e. as data buffers). 6887 */ 6888 void 6889 vsw_create_privring(vsw_ldc_t *ldcp) 6890 { 6891 dring_info_t *dp, *tp; 6892 vsw_t *vswp = ldcp->ldc_vswp; 6893 6894 D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); 6895 6896 dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP); 6897 6898 mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL); 6899 6900 /* no public section */ 6901 dp->pub_addr = NULL; 6902 6903 dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) * 6904 VSW_RING_NUM_EL), KM_SLEEP); 6905 6906 dp->num_descriptors = VSW_RING_NUM_EL; 6907 6908 if (vsw_setup_ring(ldcp, dp)) { 6909 DERR(vswp, "%s: setup of ring failed", __func__); 6910 kmem_free(dp->priv_addr, 6911 (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); 6912 mutex_destroy(&dp->dlock); 6913 kmem_free(dp, sizeof (dring_info_t)); 6914 return; 6915 } 6916 6917 /* haven't used any descriptors yet */ 6918 dp->end_idx = 0; 6919 6920 mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL); 6921 dp->restart_reqd = B_TRUE; 6922 6923 /* 6924 * Only ever create rings for outgoing lane. Link it onto 6925 * end of list. 6926 */ 6927 if (ldcp->lane_out.dringp == NULL) { 6928 D2(vswp, "%s: adding first outbound privring", __func__); 6929 ldcp->lane_out.dringp = dp; 6930 } else { 6931 tp = ldcp->lane_out.dringp; 6932 while (tp->next != NULL) 6933 tp = tp->next; 6934 6935 tp->next = dp; 6936 } 6937 6938 D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id); 6939 } 6940 6941 /* 6942 * Setup the descriptors in the dring. Returns 0 on success, 1 on 6943 * failure. 6944 */ 6945 int 6946 vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp) 6947 { 6948 vnet_public_desc_t *pub_addr = NULL; 6949 vsw_private_desc_t *priv_addr = NULL; 6950 vsw_t *vswp = ldcp->ldc_vswp; 6951 uint64_t *tmpp; 6952 uint64_t offset = 0; 6953 uint32_t ncookies = 0; 6954 static char *name = "vsw_setup_ring"; 6955 int i, j, nc, rv; 6956 6957 priv_addr = dp->priv_addr; 6958 pub_addr = dp->pub_addr; 6959 6960 /* public section may be null but private should never be */ 6961 ASSERT(priv_addr != NULL); 6962 6963 /* 6964 * Allocate the region of memory which will be used to hold 6965 * the data the descriptors will refer to. 6966 */ 6967 dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ); 6968 dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP); 6969 6970 D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name, 6971 dp->data_sz, dp->data_addr); 6972 6973 tmpp = (uint64_t *)dp->data_addr; 6974 offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp); 6975 6976 /* 6977 * Initialise some of the private and public (if they exist) 6978 * descriptor fields. 6979 */ 6980 for (i = 0; i < VSW_RING_NUM_EL; i++) { 6981 mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL); 6982 6983 if ((ldc_mem_alloc_handle(ldcp->ldc_handle, 6984 &priv_addr->memhandle)) != 0) { 6985 DERR(vswp, "%s: alloc mem handle failed", name); 6986 goto setup_ring_cleanup; 6987 } 6988 6989 priv_addr->datap = (void *)tmpp; 6990 6991 rv = ldc_mem_bind_handle(priv_addr->memhandle, 6992 (caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ, 6993 LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W, 6994 &(priv_addr->memcookie[0]), &ncookies); 6995 if (rv != 0) { 6996 DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed " 6997 "(rv %d)", name, ldcp->ldc_id, rv); 6998 goto setup_ring_cleanup; 6999 } 7000 priv_addr->bound = 1; 7001 7002 D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx", 7003 name, i, priv_addr->memcookie[0].addr, 7004 priv_addr->memcookie[0].size); 7005 7006 if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) { 7007 DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned " 7008 "invalid num of cookies (%d) for size 0x%llx", 7009 name, ldcp->ldc_id, ncookies, 7010 VSW_RING_EL_DATA_SZ); 7011 7012 goto setup_ring_cleanup; 7013 } else { 7014 for (j = 1; j < ncookies; j++) { 7015 rv = ldc_mem_nextcookie(priv_addr->memhandle, 7016 &(priv_addr->memcookie[j])); 7017 if (rv != 0) { 7018 DERR(vswp, "%s: ldc_mem_nextcookie " 7019 "failed rv (%d)", name, rv); 7020 goto setup_ring_cleanup; 7021 } 7022 D3(vswp, "%s: memcookie %d : addr 0x%llx : " 7023 "size 0x%llx", name, j, 7024 priv_addr->memcookie[j].addr, 7025 priv_addr->memcookie[j].size); 7026 } 7027 7028 } 7029 priv_addr->ncookies = ncookies; 7030 priv_addr->dstate = VIO_DESC_FREE; 7031 7032 if (pub_addr != NULL) { 7033 7034 /* link pub and private sides */ 7035 priv_addr->descp = pub_addr; 7036 7037 pub_addr->ncookies = priv_addr->ncookies; 7038 7039 for (nc = 0; nc < pub_addr->ncookies; nc++) { 7040 bcopy(&priv_addr->memcookie[nc], 7041 &pub_addr->memcookie[nc], 7042 sizeof (ldc_mem_cookie_t)); 7043 } 7044 7045 pub_addr->hdr.dstate = VIO_DESC_FREE; 7046 pub_addr++; 7047 } 7048 7049 /* 7050 * move to next element in the dring and the next 7051 * position in the data buffer. 7052 */ 7053 priv_addr++; 7054 tmpp += offset; 7055 } 7056 7057 return (0); 7058 7059 setup_ring_cleanup: 7060 priv_addr = dp->priv_addr; 7061 7062 for (j = 0; j < i; j++) { 7063 (void) ldc_mem_unbind_handle(priv_addr->memhandle); 7064 (void) ldc_mem_free_handle(priv_addr->memhandle); 7065 7066 mutex_destroy(&priv_addr->dstate_lock); 7067 7068 priv_addr++; 7069 } 7070 kmem_free(dp->data_addr, dp->data_sz); 7071 7072 return (1); 7073 } 7074 7075 /* 7076 * Searches the private section of a ring for a free descriptor, 7077 * starting at the location of the last free descriptor found 7078 * previously. 7079 * 7080 * Returns 0 if free descriptor is available, and updates state 7081 * of private descriptor to VIO_DESC_READY, otherwise returns 1. 7082 * 7083 * FUTURE: might need to return contiguous range of descriptors 7084 * as dring info msg assumes all will be contiguous. 7085 */ 7086 static int 7087 vsw_dring_find_free_desc(dring_info_t *dringp, 7088 vsw_private_desc_t **priv_p, int *idx) 7089 { 7090 vsw_private_desc_t *addr = NULL; 7091 int num = VSW_RING_NUM_EL; 7092 int ret = 1; 7093 7094 D1(NULL, "%s enter\n", __func__); 7095 7096 ASSERT(dringp->priv_addr != NULL); 7097 7098 D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld", 7099 __func__, dringp, dringp->end_idx); 7100 7101 addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx; 7102 7103 mutex_enter(&addr->dstate_lock); 7104 if (addr->dstate == VIO_DESC_FREE) { 7105 addr->dstate = VIO_DESC_READY; 7106 *priv_p = addr; 7107 *idx = dringp->end_idx; 7108 dringp->end_idx = (dringp->end_idx + 1) % num; 7109 ret = 0; 7110 7111 } 7112 mutex_exit(&addr->dstate_lock); 7113 7114 /* ring full */ 7115 if (ret == 1) { 7116 D2(NULL, "%s: no desp free: started at %d", __func__, 7117 dringp->end_idx); 7118 } 7119 7120 D1(NULL, "%s: exit\n", __func__); 7121 7122 return (ret); 7123 } 7124 7125 /* 7126 * Map from a dring identifier to the ring itself. Returns 7127 * pointer to ring or NULL if no match found. 7128 */ 7129 static dring_info_t * 7130 vsw_ident2dring(lane_t *lane, uint64_t ident) 7131 { 7132 dring_info_t *dp = NULL; 7133 7134 if ((dp = lane->dringp) == NULL) { 7135 return (NULL); 7136 } else { 7137 if (dp->ident == ident) 7138 return (dp); 7139 7140 while (dp != NULL) { 7141 if (dp->ident == ident) 7142 break; 7143 dp = dp->next; 7144 } 7145 } 7146 7147 return (dp); 7148 } 7149 7150 /* 7151 * Set the default lane attributes. These are copied into 7152 * the attr msg we send to our peer. If they are not acceptable 7153 * then (currently) the handshake ends. 7154 */ 7155 static void 7156 vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) 7157 { 7158 bzero(lp, sizeof (lane_t)); 7159 7160 READ_ENTER(&vswp->if_lockrw); 7161 ether_copy(&(vswp->if_addr), &(lp->addr)); 7162 RW_EXIT(&vswp->if_lockrw); 7163 7164 lp->mtu = VSW_MTU; 7165 lp->addr_type = ADDR_TYPE_MAC; 7166 lp->xfer_mode = VIO_DRING_MODE; 7167 lp->ack_freq = 0; /* for shared mode */ 7168 7169 mutex_enter(&lp->seq_lock); 7170 lp->seq_num = VNET_ISS; 7171 mutex_exit(&lp->seq_lock); 7172 } 7173 7174 /* 7175 * Verify that the attributes are acceptable. 7176 * 7177 * FUTURE: If some attributes are not acceptable, change them 7178 * our desired values. 7179 */ 7180 static int 7181 vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port) 7182 { 7183 int ret = 0; 7184 7185 D1(NULL, "vsw_check_attr enter\n"); 7186 7187 /* 7188 * Note we currently only support in-band descriptors 7189 * and descriptor rings, not packet based transfer (VIO_PKT_MODE) 7190 */ 7191 if ((pkt->xfer_mode != VIO_DESC_MODE) && 7192 (pkt->xfer_mode != VIO_DRING_MODE)) { 7193 D2(NULL, "vsw_check_attr: unknown mode %x\n", 7194 pkt->xfer_mode); 7195 ret = 1; 7196 } 7197 7198 /* Only support MAC addresses at moment. */ 7199 if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) { 7200 D2(NULL, "vsw_check_attr: invalid addr_type %x, " 7201 "or address 0x%llx\n", pkt->addr_type, 7202 pkt->addr); 7203 ret = 1; 7204 } 7205 7206 /* 7207 * MAC address supplied by device should match that stored 7208 * in the vsw-port OBP node. Need to decide what to do if they 7209 * don't match, for the moment just warn but don't fail. 7210 */ 7211 if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) { 7212 DERR(NULL, "vsw_check_attr: device supplied address " 7213 "0x%llx doesn't match node address 0x%llx\n", 7214 pkt->addr, port->p_macaddr); 7215 } 7216 7217 /* 7218 * Ack freq only makes sense in pkt mode, in shared 7219 * mode the ring descriptors say whether or not to 7220 * send back an ACK. 7221 */ 7222 if ((pkt->xfer_mode == VIO_DRING_MODE) && 7223 (pkt->ack_freq > 0)) { 7224 D2(NULL, "vsw_check_attr: non zero ack freq " 7225 " in SHM mode\n"); 7226 ret = 1; 7227 } 7228 7229 /* 7230 * Note: for the moment we only support ETHER 7231 * frames. This may change in the future. 7232 */ 7233 if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) { 7234 D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n", 7235 pkt->mtu); 7236 ret = 1; 7237 } 7238 7239 D1(NULL, "vsw_check_attr exit\n"); 7240 7241 return (ret); 7242 } 7243 7244 /* 7245 * Returns 1 if there is a problem, 0 otherwise. 7246 */ 7247 static int 7248 vsw_check_dring_info(vio_dring_reg_msg_t *pkt) 7249 { 7250 _NOTE(ARGUNUSED(pkt)) 7251 7252 int ret = 0; 7253 7254 D1(NULL, "vsw_check_dring_info enter\n"); 7255 7256 if ((pkt->num_descriptors == 0) || 7257 (pkt->descriptor_size == 0) || 7258 (pkt->ncookies != 1)) { 7259 DERR(NULL, "vsw_check_dring_info: invalid dring msg"); 7260 ret = 1; 7261 } 7262 7263 D1(NULL, "vsw_check_dring_info exit\n"); 7264 7265 return (ret); 7266 } 7267 7268 /* 7269 * Returns 1 if two memory cookies match. Otherwise returns 0. 7270 */ 7271 static int 7272 vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2) 7273 { 7274 if ((m1->addr != m2->addr) || 7275 (m2->size != m2->size)) { 7276 return (0); 7277 } else { 7278 return (1); 7279 } 7280 } 7281 7282 /* 7283 * Returns 1 if ring described in reg message matches that 7284 * described by dring_info structure. Otherwise returns 0. 7285 */ 7286 static int 7287 vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg) 7288 { 7289 if ((msg->descriptor_size != dp->descriptor_size) || 7290 (msg->num_descriptors != dp->num_descriptors) || 7291 (msg->ncookies != dp->ncookies) || 7292 !(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) { 7293 return (0); 7294 } else { 7295 return (1); 7296 } 7297 7298 } 7299 7300 static caddr_t 7301 vsw_print_ethaddr(uint8_t *a, char *ebuf) 7302 { 7303 (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", 7304 a[0], a[1], a[2], a[3], a[4], a[5]); 7305 return (ebuf); 7306 } 7307 7308 /* 7309 * Reset and free all the resources associated with 7310 * the channel. 7311 */ 7312 static void 7313 vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir) 7314 { 7315 dring_info_t *dp, *dpp; 7316 lane_t *lp = NULL; 7317 int rv = 0; 7318 7319 ASSERT(ldcp != NULL); 7320 7321 D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id); 7322 7323 if (dir == INBOUND) { 7324 D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane" 7325 " of channel %lld", __func__, ldcp->ldc_id); 7326 lp = &ldcp->lane_in; 7327 } else { 7328 D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane" 7329 " of channel %lld", __func__, ldcp->ldc_id); 7330 lp = &ldcp->lane_out; 7331 } 7332 7333 lp->lstate = VSW_LANE_INACTIV; 7334 mutex_enter(&lp->seq_lock); 7335 lp->seq_num = VNET_ISS; 7336 mutex_exit(&lp->seq_lock); 7337 if (lp->dringp) { 7338 if (dir == INBOUND) { 7339 dp = lp->dringp; 7340 while (dp != NULL) { 7341 dpp = dp->next; 7342 if (dp->handle != NULL) 7343 (void) ldc_mem_dring_unmap(dp->handle); 7344 kmem_free(dp, sizeof (dring_info_t)); 7345 dp = dpp; 7346 } 7347 } else { 7348 /* 7349 * unbind, destroy exported dring, free dring struct 7350 */ 7351 dp = lp->dringp; 7352 rv = vsw_free_ring(dp); 7353 } 7354 if (rv == 0) { 7355 lp->dringp = NULL; 7356 } 7357 } 7358 7359 D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id); 7360 } 7361 7362 /* 7363 * Free ring and all associated resources. 7364 */ 7365 static int 7366 vsw_free_ring(dring_info_t *dp) 7367 { 7368 vsw_private_desc_t *paddr = NULL; 7369 dring_info_t *dpp; 7370 int i, rv = 1; 7371 7372 while (dp != NULL) { 7373 mutex_enter(&dp->dlock); 7374 dpp = dp->next; 7375 if (dp->priv_addr != NULL) { 7376 /* 7377 * First unbind and free the memory handles 7378 * stored in each descriptor within the ring. 7379 */ 7380 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7381 paddr = (vsw_private_desc_t *) 7382 dp->priv_addr + i; 7383 if (paddr->memhandle != NULL) { 7384 if (paddr->bound == 1) { 7385 rv = ldc_mem_unbind_handle( 7386 paddr->memhandle); 7387 7388 if (rv != 0) { 7389 DERR(NULL, "error " 7390 "unbinding handle for " 7391 "ring 0x%llx at pos %d", 7392 dp, i); 7393 mutex_exit(&dp->dlock); 7394 return (rv); 7395 } 7396 paddr->bound = 0; 7397 } 7398 7399 rv = ldc_mem_free_handle( 7400 paddr->memhandle); 7401 if (rv != 0) { 7402 DERR(NULL, "error freeing " 7403 "handle for ring " 7404 "0x%llx at pos %d", 7405 dp, i); 7406 mutex_exit(&dp->dlock); 7407 return (rv); 7408 } 7409 paddr->memhandle = NULL; 7410 } 7411 mutex_destroy(&paddr->dstate_lock); 7412 } 7413 kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t) 7414 * VSW_RING_NUM_EL)); 7415 } 7416 7417 /* 7418 * Now unbind and destroy the ring itself. 7419 */ 7420 if (dp->handle != NULL) { 7421 (void) ldc_mem_dring_unbind(dp->handle); 7422 (void) ldc_mem_dring_destroy(dp->handle); 7423 } 7424 7425 if (dp->data_addr != NULL) { 7426 kmem_free(dp->data_addr, dp->data_sz); 7427 } 7428 7429 mutex_exit(&dp->dlock); 7430 mutex_destroy(&dp->dlock); 7431 mutex_destroy(&dp->restart_lock); 7432 kmem_free(dp, sizeof (dring_info_t)); 7433 7434 dp = dpp; 7435 } 7436 return (0); 7437 } 7438 7439 /* 7440 * Debugging routines 7441 */ 7442 static void 7443 display_state(void) 7444 { 7445 vsw_t *vswp; 7446 vsw_port_list_t *plist; 7447 vsw_port_t *port; 7448 vsw_ldc_list_t *ldcl; 7449 vsw_ldc_t *ldcp; 7450 7451 cmn_err(CE_NOTE, "***** system state *****"); 7452 7453 for (vswp = vsw_head; vswp; vswp = vswp->next) { 7454 plist = &vswp->plist; 7455 READ_ENTER(&plist->lockrw); 7456 cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n", 7457 vswp->instance, plist->num_ports); 7458 7459 for (port = plist->head; port != NULL; port = port->p_next) { 7460 ldcl = &port->p_ldclist; 7461 cmn_err(CE_CONT, "port %d : %d ldcs attached\n", 7462 port->p_instance, ldcl->num_ldcs); 7463 READ_ENTER(&ldcl->lockrw); 7464 ldcp = ldcl->head; 7465 for (; ldcp != NULL; ldcp = ldcp->ldc_next) { 7466 cmn_err(CE_CONT, "chan %lu : dev %d : " 7467 "status %d : phase %u\n", 7468 ldcp->ldc_id, ldcp->dev_class, 7469 ldcp->ldc_status, ldcp->hphase); 7470 cmn_err(CE_CONT, "chan %lu : lsession %lu : " 7471 "psession %lu\n", 7472 ldcp->ldc_id, 7473 ldcp->local_session, 7474 ldcp->peer_session); 7475 7476 cmn_err(CE_CONT, "Inbound lane:\n"); 7477 display_lane(&ldcp->lane_in); 7478 cmn_err(CE_CONT, "Outbound lane:\n"); 7479 display_lane(&ldcp->lane_out); 7480 } 7481 RW_EXIT(&ldcl->lockrw); 7482 } 7483 RW_EXIT(&plist->lockrw); 7484 } 7485 cmn_err(CE_NOTE, "***** system state *****"); 7486 } 7487 7488 static void 7489 display_lane(lane_t *lp) 7490 { 7491 dring_info_t *drp; 7492 7493 cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n", 7494 lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu); 7495 cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n", 7496 lp->addr_type, lp->addr, lp->xfer_mode); 7497 cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp); 7498 7499 cmn_err(CE_CONT, "Dring info:\n"); 7500 for (drp = lp->dringp; drp != NULL; drp = drp->next) { 7501 cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n", 7502 drp->num_descriptors, drp->descriptor_size); 7503 cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle); 7504 cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n", 7505 (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr); 7506 cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n", 7507 drp->ident, drp->end_idx); 7508 display_ring(drp); 7509 } 7510 } 7511 7512 static void 7513 display_ring(dring_info_t *dringp) 7514 { 7515 uint64_t i; 7516 uint64_t priv_count = 0; 7517 uint64_t pub_count = 0; 7518 vnet_public_desc_t *pub_addr = NULL; 7519 vsw_private_desc_t *priv_addr = NULL; 7520 7521 for (i = 0; i < VSW_RING_NUM_EL; i++) { 7522 if (dringp->pub_addr != NULL) { 7523 pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i; 7524 7525 if (pub_addr->hdr.dstate == VIO_DESC_FREE) 7526 pub_count++; 7527 } 7528 7529 if (dringp->priv_addr != NULL) { 7530 priv_addr = 7531 (vsw_private_desc_t *)dringp->priv_addr + i; 7532 7533 if (priv_addr->dstate == VIO_DESC_FREE) 7534 priv_count++; 7535 } 7536 } 7537 cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n", 7538 i, priv_count, pub_count); 7539 } 7540 7541 static void 7542 dump_flags(uint64_t state) 7543 { 7544 int i; 7545 7546 typedef struct flag_name { 7547 int flag_val; 7548 char *flag_name; 7549 } flag_name_t; 7550 7551 flag_name_t flags[] = { 7552 VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT", 7553 VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV", 7554 VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV", 7555 VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT", 7556 VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV", 7557 VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT", 7558 VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT", 7559 VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV", 7560 VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT", 7561 VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV", 7562 VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT", 7563 VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV", 7564 VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT", 7565 VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV", 7566 VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT", 7567 VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV", 7568 VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT", 7569 VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV", 7570 VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT", 7571 VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV", 7572 VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT", 7573 VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV", 7574 VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT", 7575 VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV", 7576 VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT", 7577 VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV", 7578 VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT", 7579 VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV", 7580 VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT", 7581 VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV", 7582 VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"}; 7583 7584 DERR(NULL, "DUMP_FLAGS: %llx\n", state); 7585 for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) { 7586 if (state & flags[i].flag_val) 7587 DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name); 7588 } 7589 } 7590